diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile index 9511f7869..bb640dfde 100644 --- a/software/apps/baremetal/Makefile +++ b/software/apps/baremetal/Makefile @@ -20,18 +20,13 @@ APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c BINARIES := $(addprefix $(BIN_DIR)/,$(APPS)) ALL := $(APPS) -FP_APPS := axpy_f16 axpy_f32 -FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16 -FP_APPS += cmatmul_f16 matmul_f16 matmul_f32 -FP_APPS += dotp_f16 dotp_f32 -FP_APPS += mimo_mmse_f32 mimo_mmse_f16 mimo_mmse_f8 ofdm_f16 - -I_APPS := synth_i32 -I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32 -I_APPS += cmatmul_q16 mimo_mmse_q16 - -ALL_GCC := $(filter-out $(FP_APPS), $(ALL)) -ALL_LLVM := $(filter-out $(I_APPS), $(ALL)) +FP_SUFFIXES := f16 f32 f8 +I_SUFFIXES := q16 q32 i16 i32 i8 +I_APPS := $(foreach suf, $(I_SUFFIXES), $(filter %_$(suf), $(ALL))) +FP_APPS := $(foreach suf, $(FP_SUFFIXES), $(filter %_$(suf), $(ALL))) +# ALL_GCC drops the floating-point apps, ALL_LLVM drops the integer/fixed-point apps +ALL_GCC := $(filter-out $(FP_APPS), $(ALL)) +ALL_LLVM := $(filter-out $(I_APPS), $(ALL)) # Make all applications all: $(ALL_GCC) diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c index 1795e9059..8bcb38296 100644 --- a/software/apps/baremetal/axpy_f16/main.c +++ b/software/apps/baremetal/axpy_f16/main.c @@ -15,7 +15,6 @@ #include "synchronization.h" #include "data_axpy_f16.h" -#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) // Vectors for kernel computation __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c index 34ead109c..cb3f1d8a9 100644 --- a/software/apps/baremetal/axpy_f32/main.c +++ b/software/apps/baremetal/axpy_f32/main.c @@ -15,7 +15,6 @@ #include "synchronization.h" #include "data_axpy_f32.h" -#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) // Vectors for kernel computation 
float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); diff --git a/software/apps/baremetal/cfft_radix2_q16/main.c b/software/apps/baremetal/cfft_radix2_q16/main.c index e23fb929e..36646f877 100644 --- a/software/apps/baremetal/cfft_radix2_q16/main.c +++ b/software/apps/baremetal/cfft_radix2_q16/main.c @@ -19,7 +19,7 @@ #include "synchronization.h" #include "data_cfft_radix2_q16.h" -#define N_BANKS (NUM_CORES * BANKING_FACTOR) +#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) /* CFFT mempool libraries */ #include "baremetal/mempool_cfft_q16_bitreversal.h" diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c index b06ae3189..ced0ffc6a 100644 --- a/software/apps/baremetal/cfft_radix4_f16/main.c +++ b/software/apps/baremetal/cfft_radix4_f16/main.c @@ -19,25 +19,31 @@ /* CFFT data libraries */ #include "data_cfft_radix4_f16.h" -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) +#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) +#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4)) -/* CHOOSE ONE */ -#define PARALLEL // Parallel FFT not "memory-aware". -// #define FOLDED // Parallel FFT with "memory-aware" load/store. -//#define SCHEDULED // Folded FFTs arranged in rows and cols.''' +/* +====================== +Parameters and defines -// Bitreversal index from table. +PARALLEL: When defined runs parallel FFT. +FOLDED: When defined runs parallel FFT with folded inputs in memory. +SCHEDULED: When defined runs multiple parallel folded-inputs FFTs. +N_FFTs_ROW: When the FFT is scheduled defines the number of FFTs run sequentially +by each core N_FFTs_COL: + +BITREVERSETABLE: When defined bitreversal indices are fetched from a table, else +they are computed by cores. FOLDED_TWIDDLES: When FOLDED is defined it can be +defined to also fold the twiddle factors in memory. +*/ + +#define PARALLEL #define BITREVERSETABLE -// Also the twiddles have "memory-aware" load/stores. 
-// #define FOLDED_TWIDDLES -// Independent FFTs scheduled on one row (default 1). -#define N_FFTs_ROW 1 -// Independent FFTs scheduled on columns (default 1). -#define N_FFTs_COL 1 +#define N_FFTs_ROW (1) +#define N_FFTs_COL (1) #if (N_FFTs_COL > MAX_COL) -#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)] +#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)] #endif #include "baremetal/mempool_cfft_q16_bitreversal.h" @@ -59,16 +65,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] #endif #if (defined(SCHEDULED) || defined(FOLDED)) -__fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_twiddleCoef_f16_src[8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +__fp16 l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_src[8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_dst[8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); #endif int main() { @@ -96,7 +102,7 @@ int main() { if (core_id == 0) { for (uint32_t j = 0; j < N_FFTs_ROW; j++) { for (uint32_t i = 0; i < N_FFTs_COL; i++) { - dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS), + dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS), l2_pSrc, N_CSAMPLES * sizeof(int32_t)); } } @@ 
-113,9 +119,11 @@ int main() { for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) { *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] = *(v2h *)&l2_twiddleCoef_f16[2 * i]; - *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] = + *(v2h *)&l1_twiddleCoef_f16_src[2 * + (i + j * N_WORDS_COL + 1 * NUM_BANKS)] = *(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)]; - *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] = + *(v2h *)&l1_twiddleCoef_f16_src[2 * + (i + j * N_WORDS_COL + 2 * NUM_BANKS)] = *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)]; } } diff --git a/software/apps/baremetal/cfft_radix4_q16/main.c b/software/apps/baremetal/cfft_radix4_q16/main.c index 08ed80e9b..29b84c950 100644 --- a/software/apps/baremetal/cfft_radix4_q16/main.c +++ b/software/apps/baremetal/cfft_radix4_q16/main.c @@ -19,23 +19,31 @@ /* CFFT data libraries */ #include "data_cfft_radix4_q16.h" -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) +#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) +#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4)) -/* CHOOSE ONE */ -//#define SINGLE // Single core FFT. -//#define PARALLEL // Parallel FFT not "memory-aware". -//#define FOLDED // Parallel FFT with "memory-aware" load/store. -#define SCHEDULED // Folded FFTs arranged in rows and cols.''' +/* +====================== +Parameters and defines + +PARALLEL: When defined runs parallel FFT. +FOLDED: When defined runs parallel FFT with folded inputs in memory. +SCHEDULED: When defined runs multiple parallel folded-inputs FFTs. +N_FFTs_ROW: When the FFT is scheduled defines the number of FFTs run sequentially +by each core N_FFTs_COL: + +BITREVERSETABLE: When defined bitreversal indices are fetched from a table, else +they are computed by cores. FOLDED_TWIDDLES: When FOLDED is defined it can be +defined to also fold the twiddle factors in memory. +*/ -// Bitreversal index from table. 
+#define PARALLEL #define BITREVERSETABLE -// Independent FFTs scheduled on one row (default 1). -#define N_FFTs_ROW 2 -// Independent FFTs scheduled on columns (default 1). -#define N_FFTs_COL 2 + +#define N_FFTs_ROW (1) +#define N_FFTs_COL (1) #if (N_FFTs_COL > MAX_COL) -#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)] +#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)] #endif // Also the twiddles have "memory-aware" load/stores. #define FOLDED_TWIDDLES @@ -60,16 +68,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] #endif #if (defined(SCHEDULED) || defined(FOLDED)) -int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -int16_t l1_twiddleCoef_q16_src[8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +int16_t l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +int16_t l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +int16_t l1_twiddleCoef_q16_src[8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +int16_t l1_twiddleCoef_q16_dst[8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); #endif int main() { @@ -97,7 +105,7 @@ int main() { if (core_id == 0) { for (uint32_t j = 0; j < N_FFTs_ROW; j++) { for (uint32_t i = 0; i < N_FFTs_COL; i++) { - dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS), + dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS), 
l2_pSrc, N_CSAMPLES * sizeof(int32_t)); } } @@ -112,9 +120,11 @@ int main() { for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) { *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL)] = *(v2s *)&l2_twiddleCoef_q16[2 * i]; - *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] = + *(v2s *)&l1_twiddleCoef_q16_src[2 * + (i + j * N_WORDS_COL + 1 * NUM_BANKS)] = *(v2s *)&l2_twiddleCoef_q16[2 * (i * 2U)]; - *(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] = + *(v2s *)&l1_twiddleCoef_q16_src[2 * + (i + j * N_WORDS_COL + 2 * NUM_BANKS)] = *(v2s *)&l2_twiddleCoef_q16[2 * (i * 3U)]; } } diff --git a/software/apps/baremetal/chest_f16/main.c b/software/apps/baremetal/chest_f16/main.c index e0feb90c7..304313788 100644 --- a/software/apps/baremetal/chest_f16/main.c +++ b/software/apps/baremetal/chest_f16/main.c @@ -19,6 +19,14 @@ #include "baremetal/mempool_chest_f16.h" #include "data_chest_f16.h" +/* +====================== +Parameters and defines + +SINGLE: When defined runs single-core Channel Estimation. +PARALLEL: When defined runs parallel Channel Estimation. +*/ + //#define SINGLE #define PARALLEL diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c index 572b12de0..6f7a73938 100644 --- a/software/apps/baremetal/chest_q16/main.c +++ b/software/apps/baremetal/chest_q16/main.c @@ -19,6 +19,14 @@ #include "baremetal/mempool_checks.h" #include "baremetal/mempool_chest_q16.h" +/* +====================== +Parameters and defines + +SINGLE: When defined runs single-core Channel Estimation. +PARALLEL: When defined runs parallel Channel Estimation. 
+*/ + #define PARALLEL int16_t l1_PilotTX[2 * N_TX * N_SAMPLES] diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c index 6d1c26ff2..10baa6a81 100644 --- a/software/apps/baremetal/cholesky_f16/main.c +++ b/software/apps/baremetal/cholesky_f16/main.c @@ -17,6 +17,15 @@ #include "baremetal/mempool_checks.h" #include "baremetal/mempool_cholesky_f16s.h" +/* +====================== +Parameters and defines + +SINGLE: When defined runs single-core Cholesky Decomposition. +PARALLEL: When defined runs parallel Cholesky Decomposition. +FOLDED: When defined 1 intermediate results are folded in memory. +*/ + #define SINGLE #define FOLDED (0) diff --git a/software/apps/baremetal/cholesky_q32/main.c b/software/apps/baremetal/cholesky_q32/main.c index 64fbf3b2f..e0daba421 100644 --- a/software/apps/baremetal/cholesky_q32/main.c +++ b/software/apps/baremetal/cholesky_q32/main.c @@ -11,7 +11,7 @@ #include "synchronization.h" #define HALF (1023) -#define N_BANKS (NUM_CORES * BANKING_FACTOR) +#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) #define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b)) #define FIX_MUL(a, b) ((int32_t)((a * b + HALF) >> FIXED_POINT)) #define ABS(a) (a > 0 ? 
a : -a) @@ -31,18 +31,19 @@ #define N_COL 1 #define N_ROW 1 int32_t l1_A[matrix_N * matrix_N] - __attribute__((aligned(N_BANKS), section(".l1"))); + __attribute__((aligned(NUM_BANKS), section(".l1"))); int32_t l1_L[matrix_N * matrix_N] - __attribute__((aligned(N_BANKS), section(".l1"))); -int32_t l1_y[matrix_N] __attribute__((aligned(N_BANKS), section(".l1"))); + __attribute__((aligned(NUM_BANKS), section(".l1"))); +int32_t l1_y[matrix_N] __attribute__((aligned(NUM_BANKS), section(".l1"))); #else -int32_t l1_AA[matrix_N * N_BANKS] - __attribute__((aligned(N_BANKS), section(".l1_prio"))); -int32_t l1_LL[N_ROW * matrix_N * N_BANKS] - __attribute__((aligned(N_BANKS), section(".l1_prio"))); -int32_t l1_LR[N_ROW * matrix_N * N_BANKS] - __attribute__((aligned(N_BANKS), section(".l1_prio"))); -int32_t l1_yy[N_BANKS] __attribute__((aligned(N_BANKS), section(".l1_prio"))); +int32_t l1_AA[matrix_N * NUM_BANKS] + __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +int32_t l1_LL[N_ROW * matrix_N * NUM_BANKS] + __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +int32_t l1_LR[N_ROW * matrix_N * NUM_BANKS] + __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +int32_t l1_yy[NUM_BANKS] + __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); #endif int main() { @@ -58,11 +59,12 @@ int main() { for (uint32_t idx_col = 0; idx_col < N_COL; idx_col++) { l1_yy[idx_col * matrix_N + i] = l2_y[i]; for (uint32_t j = 0; j < matrix_N; j++) { - l1_AA[idx_col * matrix_N + i * N_BANKS + j] = l2_A[i * matrix_N + j]; + l1_AA[idx_col * matrix_N + i * NUM_BANKS + j] = + l2_A[i * matrix_N + j]; } } } - for (uint32_t i = 0; i < N_ROW * matrix_N * N_BANKS; i++) { + for (uint32_t i = 0; i < N_ROW * matrix_N * NUM_BANKS; i++) { l1_LL[i] = 0; l1_LR[i] = 0; } diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c index aa2ed55a6..727dba7ca 100644 --- a/software/apps/baremetal/cmatmul_f16/main.c +++ 
b/software/apps/baremetal/cmatmul_f16/main.c @@ -19,7 +19,18 @@ #include "baremetal/mempool_checks.h" #include "baremetal/mempool_cmatmul_f16.h" -#define PARALLEL_4x4 + +/* +====================== +Parameters and defines + +SINGLE_2x2: Single-core matmul on 2x2 tiles. +PARALLEL_2x2: Parallel matmul on 2x2 C-tiles. +PARALLEL_2x4: Parallel matmul on 2x4 C-tiles. +PARALLEL_4x4: Parallel matmul on 4x4 C-tiles. +PARALLEL_4x4_COPIES_A: Parallel matmul on 4x4 C-tiles, copies of A in memory to +avoid banking conflicts. +*/ #if defined(PARALLEL_4x4_COPIES_A) __fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)] @@ -51,7 +62,7 @@ int main() { // Wait at barrier until everyone is ready mempool_barrier(num_cores); -#if defined(SINGLE_CORE) +#if defined(SINGLE_2x2) // Execute function to test. if (core_id == 0) { mempool_start_benchmark(); diff --git a/software/apps/baremetal/cmatmul_q16/main.c b/software/apps/baremetal/cmatmul_q16/main.c index 0dcffbfc7..37089fd5b 100644 --- a/software/apps/baremetal/cmatmul_q16/main.c +++ b/software/apps/baremetal/cmatmul_q16/main.c @@ -16,6 +16,14 @@ #include "baremetal/mempool_cmatmul_q16.h" #include "data_cmatmul_q16.h" +/* +====================== +Parameters and defines + +SINGLE: When defined runs single-core matmul. +PARALLEL: When defined runs parallel matmul. 
+*/ + #define PARALLEL #define dim_M (matrix_M) #define dim_N (matrix_N) diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c index 2091f0336..3b8b272b9 100644 --- a/software/apps/baremetal/dotp_f16/main.c +++ b/software/apps/baremetal/dotp_f16/main.c @@ -14,9 +14,6 @@ #include "synchronization.h" #include "data_dotp_f16.h" -#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) -// #define SINGLE_CORE_REDUCTION -#define BINARY_REDUCTION // Vectors for kernel computation __fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); @@ -47,18 +44,6 @@ int main() { } mempool_barrier(num_cores); - // // SINGLE-CORE - // time_init = mempool_get_timer(); - // dotp_f16s(l1_X, l1_Y, sum, array_N); - // // dotp_f16s_unrolled4(l1_X, l1_Y, sum, array_N); - // time_end = mempool_get_timer(); - - // // PARALLEL - // time_init = mempool_get_timer(); - // dotp_f16vecp_unrolled4(l1_X, l1_Y, sum, array_N, num_cores); - // // dotp_f16p(l1_X, l1_Y, sum, array_N, num_cores); - // time_end = mempool_get_timer(); - // PARALLEL, LOCAL ACCESSES time_init = mempool_get_timer(); dotp_f16vecp_local_unrolled4(l1_X, l1_Y, sum, array_N); diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c index 3507795b1..e1a87b6b8 100644 --- a/software/apps/baremetal/dotp_f32/main.c +++ b/software/apps/baremetal/dotp_f32/main.c @@ -15,9 +15,6 @@ #include "synchronization.h" #include "data_dotp_f32.h" -#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) -// #define SINGLE_CORE_REDUCTION -#define BINARY_REDUCTION // Vectors for kernel computation float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); @@ -47,16 +44,6 @@ int main() { } mempool_barrier(num_cores); - // // SINGLE-CORE - // time_init = mempool_get_timer(); - // dotp_f32s_unrolled4(l1_A, l1_B, sum, array_N); - // time_end = mempool_get_timer(); - - // // PARALLEL - // time_init = mempool_get_timer(); - // dotp_f32p(l1_A, l1_B, sum, array_N, 
num_cores); - // time_end = mempool_get_timer(); - // PARALLEL, LOCAL ACCESSES time_init = mempool_get_timer(); dotp_f32p_local_unrolled4(l1_X, l1_Y, sum, array_N); diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c index ee2e2ea52..8f6490ee2 100644 --- a/software/apps/baremetal/dotp_i32/main.c +++ b/software/apps/baremetal/dotp_i32/main.c @@ -15,11 +15,6 @@ #include "synchronization.h" #include "data_dotp_i32.h" -#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) -#define LOG_BARRIERS -// #define ATOMIC_REDUCTION -// #define SINGLE_CORE_REDUCTION -#define BINARY_REDUCTION // Vectors for kernel computation int32_t l1_X[array_N] __attribute__((aligned(array_N), section(".l1_prio"))); @@ -49,16 +44,6 @@ int main() { } mempool_barrier(num_cores); - // // SINGLE-CORE - // time_init = mempool_get_timer(); - // dotp_i32s_unrolled4(l1_A, l1_B, sum, array_N); - // time_end = mempool_get_timer(); - - // // PARALLEL - // time_init = mempool_get_timer(); - // dotp_i32p(l1_A, l1_B, sum, array_N, num_cores); - // time_end = mempool_get_timer(); - // PARALLEL, LOCAL ACCESSES time_init = mempool_get_timer(); dotp_i32p_local_unrolled4(l1_X, l1_Y, sum, array_N); diff --git a/software/apps/baremetal/matmul_f16/main.c b/software/apps/baremetal/matmul_f16/main.c index 99a0269cc..9964257ca 100644 --- a/software/apps/baremetal/matmul_f16/main.c +++ b/software/apps/baremetal/matmul_f16/main.c @@ -17,7 +17,13 @@ #include "baremetal/mempool_checks.h" #include "baremetal/mempool_matmul_f16.h" -#define PARALLEL +/* +====================== +Parameters and defines + +SINGLE: When defined runs single-core matmul. +PARALLEL: When defined runs parallel matmul. 
+*/ __fp16 matrix_a[matrix_M * matrix_N] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); diff --git a/software/apps/baremetal/matmul_f32/main.c b/software/apps/baremetal/matmul_f32/main.c index d3d7622db..ba9165ed1 100644 --- a/software/apps/baremetal/matmul_f32/main.c +++ b/software/apps/baremetal/matmul_f32/main.c @@ -17,8 +17,13 @@ #include "baremetal/mempool_checks.h" #include "baremetal/mempool_matmul_f32.h" -#define PARALLEL -#define ASM +/* +====================== +Parameters and defines + +SINGLE: When defined runs single-core matmul. +PARALLEL: When defined runs parallel matmul. +*/ float matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); float matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c index 80309a1e0..b1ef24451 100644 --- a/software/apps/baremetal/mimo_mmse_f16/main.c +++ b/software/apps/baremetal/mimo_mmse_f16/main.c @@ -18,25 +18,45 @@ #include "baremetal/mempool_mimo_mmse_f16s.h" #include "data_mimo_mmse_f16.h" -#define ZF (0) // When asserted use zero-forcing -#define FOLD (1) // When asserted fold matrices in memory -#define NUM_BANKS (BANKING_FACTOR * NUM_CORES) + +/* +====================== +Parameters and defines + +DOUBLE_BUFFERING: When defined benchmark double buffered MIMO-MMSE, including +L2-L1 transfers. + +For MIMO-MMSE without L2-L1 transfers: +PARALLEL: When defined benchmark parallel MIMO-MMSE. +SINGLE: When defined benchmark single-core MIMO-MMSE. +VEC: When defined benchmark SIMD-vectorized kernels. +ZF: When defined 1 use zero forcing detector. +FOLD: When defined 1 fold matrices in memory. + +For MIMO-MMSE with L2-L1 transfers: +DMA_TRANSFER1: When defined transfer inputs for next round at the beginning of +computation. DMA_TRANSFER2: When defined transfer inputs for next round after +Hermitian computation. N_ROUNDS: Define number of rounds of Double-Buffering. 
+*/ + +#define ZF (0) +#define FOLD (1) #define PARALLEL #define VEC +#ifndef DOUBLE_BUFFERING + /********************************************************** ********************************************************** - _ _ ___ _ _ _____ __ - | \ | |/ _ \ | | / |_ _| __ __ _ _ __ ___ / _| - | \| | | | |_____| | | | | || '__/ _` | '_ \/ __| |_ - | |\ | |_| |_____| |___| | | || | | (_| | | | \__ \ _| - |_| \_|\___/ |_____|_| |_||_| \__,_|_| |_|___/_|(_) + _ _ ___ _____ __ + | \ | |/ _ \ |_ _| __ __ _ _ __ ___ / _| + | \| | | | |_____ | || '__/ _` | '_ \/ __| |_ + | |\ | |_| |_____ | || | | (_| | | | \__ \ _| + |_| \_|\___/ |_||_| \__,_|_| |_|___/_|(_) *********************************************************** ***********************************************************/ -#ifndef DOUBLE_BUFFERING - #if FOLD #define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS)) #define NUM_COL (NUM_BANKS / N_TX) @@ -193,6 +213,8 @@ int main() { return 0; } +#else + /********************************************************** ********************************************************** ____ __ __ _ _____ __ @@ -204,10 +226,6 @@ int main() { *********************************************************** ***********************************************************/ -#else -#define N_ROUNDS (1) -#define DMA_TRANSFER1 - // Inputs-Outputs even double-buffering rounds __fp16 l1A_H[2 * N_TX * N_RX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); diff --git a/software/apps/baremetal/mimo_mmse_f32/main.c b/software/apps/baremetal/mimo_mmse_f32/main.c index d243754fc..fb054e4e0 100644 --- a/software/apps/baremetal/mimo_mmse_f32/main.c +++ b/software/apps/baremetal/mimo_mmse_f32/main.c @@ -21,6 +21,17 @@ #include "data_mimo_mmse_f32.h" +/* +====================== +Parameters and defines + +PARALLEL: When defined benchmark parallel MIMO-MMSE. +SINGLE: When defined benchmark single-core MIMO-MMSE. 
+PARALLEL_HERMITIAN: When defined the Hermitian is finely-grained parallelized +over a group of cores. ZF: When defined 1 use zero forcing detector. FOLD: When +defined 1 fold matrices in memory. +*/ + #define SINGLE #define ZF (0) #define FOLD (0) diff --git a/software/apps/baremetal/mimo_mmse_f8/main.c b/software/apps/baremetal/mimo_mmse_f8/main.c index c5d1cd77e..006dbf83b 100644 --- a/software/apps/baremetal/mimo_mmse_f8/main.c +++ b/software/apps/baremetal/mimo_mmse_f8/main.c @@ -18,9 +18,20 @@ #include "baremetal/mempool_mimo_mmse_f8s.h" #include "data_mimo_mmse_f8.h" + +/* +====================== +Parameters and defines + +PARALLEL: When defined benchmark parallel MIMO-MMSE. +SINGLE: When defined benchmark single-core MIMO-MMSE. +VEC: When defined benchmark SIMD-vectorized kernels. +ZF: When defined 1 use zero forcing detector. +FOLD: When defined 1 fold matrices in memory. +*/ + #define ZF (0) // When asserted use zero-forcing #define FOLD (0) // When asserted fold matrixes in memory -#define NUM_BANKS (BANKING_FACTOR * NUM_CORES) #define PARALLEL #define VEC diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c index 9bcb5e9db..8e2b557a4 100644 --- a/software/apps/baremetal/mimo_mmse_q16/main.c +++ b/software/apps/baremetal/mimo_mmse_q16/main.c @@ -16,7 +16,13 @@ #include "baremetal/mempool_linearsolver_q16s.h" #include "baremetal/mempool_mimo_mmse_q16s.h" -#define PARALLEL +/* +====================== +Parameters and defines + +PARALLEL: When defined benchmark parallel MIMO-MMSE. +SINGLE: When defined benchmark single-core MIMO-MMSE. 
+*/ int16_t l1_H[2 * N_TX * N_RX * N_ITR] __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), diff --git a/software/apps/baremetal/ofdm_f16/main.c b/software/apps/baremetal/ofdm_f16/main.c index 264768199..3cf04dbed 100644 --- a/software/apps/baremetal/ofdm_f16/main.c +++ b/software/apps/baremetal/ofdm_f16/main.c @@ -18,7 +18,6 @@ #include "synchronization.h" #include "data_ofdm_f16.h" -#define N_BANKS (NUM_CORES * BANKING_FACTOR) // CFFT Parameters #define SCHEDULED @@ -28,7 +27,7 @@ #define N_FFTs_COL 4 #define N_FFTs_ROW (N_RX / N_FFTs_COL) // CMATMUL Parameters -#define NUM_COPIES (N_BANKS / (N_BEAMS * N_RX)) +#define NUM_COPIES (NUM_BANKS / (N_BEAMS * N_RX)) #define dim_M (N_BEAMS) #define dim_N (N_RX) #define dim_P (N_SC) @@ -43,18 +42,18 @@ dump(checkpoint, 1); uint32_t arrival_index __attribute__((section(".l1_prio"))); __fp16 l1_pBF_Coef_folded[2 * BANKING_FACTOR * NUM_CORES] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); -__fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_pFFT_Dst[N_FFTs_ROW * 8 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_twiddleCoef_f16_src[6 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); -__fp16 l1_twiddleCoef_f16_dst[6 * N_BANKS] - __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +__fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_pFFT_Dst[N_FFTs_ROW * 8 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_src[6 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_dst[6 * NUM_BANKS] + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] - __attribute__((aligned(4 * 
N_BANKS), section(".l1_prio"))); + __attribute__((aligned(4 * NUM_BANKS), section(".l1_prio"))); /////////////////////////////////////////////////////////////////////////////////////////////////// /* MAIN */ @@ -67,7 +66,7 @@ int main() { mempool_start_benchmark(); if (core_id == 0) { // Each FFT is folded over 4 memory rows - // Each memory row is 2 * N_BANKS samples + // Each memory row is 2 * NUM_BANKS samples __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED); dma_memcpy_blocking(l1_pFFT_Src, l2_pFFT_Src, (N_RX * N_SC) * sizeof(int32_t)); @@ -78,7 +77,7 @@ int main() { dim_M * dim_N * sizeof(int32_t)); } for (uint32_t i = 0; i < N_FFTs_COL; i++) { - dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS), + dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * NUM_BANKS), l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t)); } } @@ -114,7 +113,7 @@ int main() { dma_memcpy_blocking(l2_pBF_Dst, l1_pFFT_Dst, (N_RX * N_SC) * sizeof(int32_t)); for (uint32_t i = 0; i < N_FFTs_COL; i++) { - dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS), + dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * NUM_BANKS), l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t)); } __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED); diff --git a/software/kernels/baremetal/mempool_chest_q16.h b/software/kernels/baremetal/mempool_chest_q16.h index b4d90adff..6e735bbe7 100644 --- a/software/kernels/baremetal/mempool_chest_q16.h +++ b/software/kernels/baremetal/mempool_chest_q16.h @@ -6,7 +6,7 @@ #pragma once #include "builtins_v2.h" -#define __MUL +#define __MUL // Multiplication by pilot instead of division. 
/* a[i] = ar[i] + i * ai[j] out[i][j] = a[i] / c[j] diff --git a/software/kernels/baremetal/mempool_cmatmul_f16.h b/software/kernels/baremetal/mempool_cmatmul_f16.h index 12645c454..f37c197d8 100644 --- a/software/kernels/baremetal/mempool_cmatmul_f16.h +++ b/software/kernels/baremetal/mempool_cmatmul_f16.h @@ -13,10 +13,8 @@ #pragma once #include "builtins_v2.h" -// Use complex dotp in a single offload -#define __CDOTP -// Shift cores startpoint over rows of matrix A -#define __SHIFT_A +#define __CDOTP // Use complex dotp in a single offload +#define __SHIFT_A // Shift cores startpoint over rows of matrix A /****************************************************************************** __ ___ _ _ ____ _ diff --git a/software/kernels/baremetal/mempool_cmatmul_q16.h b/software/kernels/baremetal/mempool_cmatmul_q16.h index aa6a71b6c..78ed04b31 100644 --- a/software/kernels/baremetal/mempool_cmatmul_q16.h +++ b/software/kernels/baremetal/mempool_cmatmul_q16.h @@ -13,8 +13,7 @@ #pragma once #include "builtins_v2.h" -// Shift cores startpoint over rows of matrix A -#define __SHIFT_A +#define __SHIFT_A // Shift cores startpoint over rows of matrix A #define CMATMUL_1x1_LOOP \ v2s sum = {0, 0}; \ diff --git a/software/kernels/baremetal/mempool_dotp_f16.h b/software/kernels/baremetal/mempool_dotp_f16.h index 791b7c68e..e8083cfcf 100644 --- a/software/kernels/baremetal/mempool_dotp_f16.h +++ b/software/kernels/baremetal/mempool_dotp_f16.h @@ -7,6 +7,16 @@ #pragma once #include "builtins_v2.h" +/* +====================== +Parameters and defines + +SINGLE_CORE_REDUCTION: Reduction with a single-core. +BINARY_REDUCTION: Reduction with binary tree. 
+*/ + +#define SINGLE_CORE_REDUCTION + #define DOTPF16VEC_UNROLLED4_LOOP \ { \ a01 = (*(v2h *)&in_a[i]); \ diff --git a/software/kernels/baremetal/mempool_dotp_f32.h b/software/kernels/baremetal/mempool_dotp_f32.h index 290b96d59..13e4ff9e5 100644 --- a/software/kernels/baremetal/mempool_dotp_f32.h +++ b/software/kernels/baremetal/mempool_dotp_f32.h @@ -4,6 +4,16 @@ // Author: Marco Bertuletti, ETH Zurich +/* +====================== +Parameters and defines + +SINGLE_CORE_REDUCTION: Reduction with a single-core. +BINARY_REDUCTION: Reduction with binary tree. +*/ + +#define SINGLE_CORE_REDUCTION + #define DOTPF32_UNROLLED4_LOOP \ { \ a0 = in_a[i]; \ diff --git a/software/kernels/baremetal/mempool_dotp_i32.h b/software/kernels/baremetal/mempool_dotp_i32.h index 4b80e92ed..3f8320b91 100644 --- a/software/kernels/baremetal/mempool_dotp_i32.h +++ b/software/kernels/baremetal/mempool_dotp_i32.h @@ -4,6 +4,18 @@ // Author: Marco Bertuletti, ETH Zurich +/* +====================== +Parameters and defines + +SINGLE_CORE_REDUCTION: Reduction with a single-core. +BINARY_REDUCTION: Reduction with binary tree. +ATOMIC_REDUCTION: Reduction with atomics. 
+LOG_BARRIERS: Use binary reduction +*/ + +#define SINGLE_CORE_REDUCTION + #define DOTPI32_UNROLLED4_LOOP \ { \ a0 = in_a[i]; \ diff --git a/software/kernels/baremetal/mempool_matmul_f32.h b/software/kernels/baremetal/mempool_matmul_f32.h index 8879fa52d..c6b669bcc 100644 --- a/software/kernels/baremetal/mempool_matmul_f32.h +++ b/software/kernels/baremetal/mempool_matmul_f32.h @@ -13,6 +13,8 @@ #pragma once #include "builtins_v2.h" +// The 4x4 matmul is executed with asm_volatile statement +#define ASM void matmul_2x2_single_f32(float const *__restrict__ A, float const *__restrict__ B, float *__restrict__ C, diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h index 3ce36f3b6..12b2320e5 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h @@ -48,7 +48,7 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut, // STORE INDEXES #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; - i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS; + i0_store = (i0 % n2_store) + (i0 / n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -158,9 +158,9 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + * 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + @@ -173,7 +173,7 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; i0_store = - (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) 
/ n2_store) * N_BANKS; + (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -278,9 +278,9 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h index a0c1a7791..5db6be1cf 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_q16.h @@ -50,7 +50,7 @@ static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut, // STORE INDEXES #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; - i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS; + i0_store = (i0 % n2_store) + (i0 / n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -227,9 +227,9 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + * 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + @@ -242,7 +242,7 @@ static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, #if defined(FOLDED) || defined(SCHEDULED) uint32_t n2_store = n2 >> 2U; i0_store = - (i0 % n2_store) + (i0 / n2) * n2 + ((i0 
% n2) / n2_store) * N_BANKS; + (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * NUM_BANKS; i1_store = i0_store + n2_store; i2_store = i1_store + n2_store; i3_store = i2_store + n2_store; @@ -403,9 +403,9 @@ static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut, /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; + i1 = i0 + NUM_BANKS; + i2 = i1 + NUM_BANKS; + i3 = i2 + NUM_BANKS; #else /* index calculation for the input as, */ /* pIn[i0 + 0], pIn[i0 + fftLen/4], diff --git a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h index c6b4acf6b..90a3fd093 100644 --- a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h @@ -5,7 +5,6 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#define BITREVERSETABLE #include "builtins_v2.h" #define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) @@ -33,19 +32,19 @@ #ifdef FOLDED_TWIDDLES #define LOAD_STORE_TWIDDLEFACT \ CoSi1 = *(v2h *)&pCoef_src[2U * ic]; \ - CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \ - CoSi3 = *(v2h *)&pCoef_src[2U * (ic + 2 * N_BANKS)]; \ + CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * NUM_BANKS)]; \ + CoSi3 = *(v2h *)&pCoef_src[2U * (ic + 2 * NUM_BANKS)]; \ if (ic % 4 == 0) { \ *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi1; \ *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1; \ *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1; \ *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi2; \ *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2; \ *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2; \ *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi3; \ *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3; \ *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3; \ @@ -315,8 +314,8 @@ void mempool_radix4_cfft_f16p_scheduler( LOAD_STORE_TWIDDLEFACT; SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen; - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * fftLen; + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -345,8 +344,8 @@ void mempool_radix4_cfft_f16p_scheduler( SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * 
col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -370,8 +369,8 @@ void mempool_radix4_cfft_f16p_scheduler( uint32_t col_shift = fftLen / 4; #endif - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * col_shift; + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * col_shift; radix4_butterfly_last(pIn, pOut, i0); } } @@ -416,7 +415,7 @@ void mempool_radix4_cfft_f16p_scheduler( : [s2] "r"(s2) :); for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - uint16_t *ptr = (uint16_t *)(pIn + idx_row * (N_BANKS * 8)); + uint16_t *ptr = (uint16_t *)(pIn + idx_row * (NUM_BANKS * 8)); // Load at address a tmpa1 = *(uint32_t *)&ptr[a1]; tmpa2 = *(uint32_t *)&ptr[a2]; @@ -465,12 +464,12 @@ void mempool_radix4_cfft_f16p_scheduler( idx3 = idx3 >> 1U; } idx0 = ic / 4; - idx1 = ic / 4 + N_BANKS; - idx2 = ic / 4 + 2 * N_BANKS; - idx3 = ic / 4 + 3 * N_BANKS; + idx1 = ic / 4 + NUM_BANKS; + idx2 = ic / 4 + 2 * NUM_BANKS; + idx3 = ic / 4 + 3 * NUM_BANKS; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (N_BANKS * 8); - ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (N_BANKS * 8); + ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (NUM_BANKS * 8); + ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (NUM_BANKS * 8); *((uint32_t *)&ptr2[2 * idx_result0]) = *((uint32_t *)&ptr1[2 * idx0]); *((uint32_t *)&ptr2[2 * idx_result1]) = *((uint32_t *)&ptr1[2 * idx1]); *((uint32_t *)&ptr2[2 * idx_result2]) = *((uint32_t *)&ptr1[2 * idx2]); diff --git a/software/kernels/baremetal/mempool_radix4_cfft_q16p.h b/software/kernels/baremetal/mempool_radix4_cfft_q16p.h index e91602866..ef6122a00 100644 --- 
a/software/kernels/baremetal/mempool_radix4_cfft_q16p.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_q16p.h @@ -45,19 +45,19 @@ #define LOAD_STORE_TWIDDLEFACT \ CoSi1 = *(v2s *)&pCoef_src[2U * ic]; \ - CoSi2 = *(v2s *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \ - CoSi3 = *(v2s *)&pCoef_src[2U * (ic + 2 * N_BANKS)]; \ + CoSi2 = *(v2s *)&pCoef_src[2U * (ic + 1 * NUM_BANKS)]; \ + CoSi3 = *(v2s *)&pCoef_src[2U * (ic + 2 * NUM_BANKS)]; \ if (ic % 4 == 0) { \ *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi1; \ *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1; \ *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1; \ *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi2; \ *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2; \ *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2; \ *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2; \ - ic_store += N_BANKS; \ + ic_store += NUM_BANKS; \ *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi3; \ *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3; \ *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3; \ @@ -226,16 +226,16 @@ static inline void fold_radix4(int16_t *pSrc16, uint32_t fftLen, i1 = i0 + n2; i2 = i1 + n2; i3 = i2 + n2; - i1_store = i0 + N_BANKS; - i2_store = i1_store + N_BANKS; - i3_store = i2_store + N_BANKS; + i1_store = i0 + NUM_BANKS; + i2_store = i1_store + NUM_BANKS; + i3_store = i2_store + NUM_BANKS; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - A = *(v2s *)&pSrc16[i1 * 2U + idx_row * (8 * N_BANKS)]; - B = *(v2s *)&pSrc16[i2 * 2U + idx_row * (8 * N_BANKS)]; - C = *(v2s *)&pSrc16[i3 * 2U + idx_row * (8 * N_BANKS)]; - *(v2s *)&pSrc16[i1_store * 2U + idx_row * (8 * N_BANKS)] = A; - *(v2s *)&pSrc16[i2_store * 2U + idx_row * (8 * N_BANKS)] = B; - *(v2s *)&pSrc16[i3_store * 2U + idx_row * (8 * N_BANKS)] = C; + A = *(v2s *)&pSrc16[i1 * 
2U + idx_row * (8 * NUM_BANKS)]; + B = *(v2s *)&pSrc16[i2 * 2U + idx_row * (8 * NUM_BANKS)]; + C = *(v2s *)&pSrc16[i3 * 2U + idx_row * (8 * NUM_BANKS)]; + *(v2s *)&pSrc16[i1_store * 2U + idx_row * (8 * NUM_BANKS)] = A; + *(v2s *)&pSrc16[i2_store * 2U + idx_row * (8 * NUM_BANKS)] = B; + *(v2s *)&pSrc16[i3_store * 2U + idx_row * (8 * NUM_BANKS)] = C; } } mempool_log_partial_barrier(2, absolute_core_id, nPE); @@ -426,8 +426,8 @@ void mempool_radix4_cfft_q16p_scheduler( LOAD_STORE_TWIDDLEFACT; SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen; - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * fftLen; + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -460,8 +460,8 @@ void mempool_radix4_cfft_q16p_scheduler( SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -489,8 +489,8 @@ void mempool_radix4_cfft_q16p_scheduler( uint32_t col_shift = fftLen / 4; #endif - pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * col_shift; + pIn = pSrc16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (NUM_BANKS * 8) + 2 * col_id * col_shift; radix4_butterfly_last(pIn, pOut, i0); } } @@ -535,7 +535,7 @@ void mempool_radix4_cfft_q16p_scheduler( : [s2] "r"(s2) :); for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; 
idx_row++) { - uint16_t *ptr = (uint16_t *)(pIn + idx_row * (N_BANKS * 8)); + uint16_t *ptr = (uint16_t *)(pIn + idx_row * (NUM_BANKS * 8)); // Load at address a tmpa1 = *(uint32_t *)&ptr[a1]; tmpa2 = *(uint32_t *)&ptr[a2]; @@ -584,12 +584,12 @@ void mempool_radix4_cfft_q16p_scheduler( idx3 = idx3 >> 1U; } idx0 = ic / 4; - idx1 = ic / 4 + N_BANKS; - idx2 = ic / 4 + 2 * N_BANKS; - idx3 = ic / 4 + 3 * N_BANKS; + idx1 = ic / 4 + NUM_BANKS; + idx2 = ic / 4 + 2 * NUM_BANKS; + idx3 = ic / 4 + 3 * NUM_BANKS; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (N_BANKS * 8); - ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (N_BANKS * 8); + ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (NUM_BANKS * 8); + ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (NUM_BANKS * 8); *((uint32_t *)&ptr2[2 * idx_result0]) = *((uint32_t *)&ptr1[2 * idx0]); *((uint32_t *)&ptr2[2 * idx_result1]) = *((uint32_t *)&ptr1[2 * idx1]); *((uint32_t *)&ptr2[2 * idx_result2]) = *((uint32_t *)&ptr1[2 * idx2]); diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 2602804cc..94f822ddc 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -27,7 +27,7 @@ DATA_DIR ?= $(abspath $(ROOT_DIR)/../data) COMPILER ?= gcc XPULPIMG ?= $(xpulpimg) ZFINX ?= $(zfinx) -XDIVSQRT ?= $(xDivSqrt) +XDIVSQRT ?= $(xDivSqrt) RISCV_XLEN ?= 32 @@ -92,6 +92,7 @@ DEFINES += -DNUM_CORES=$(num_cores) DEFINES += -DNUM_GROUPS=$(num_groups) DEFINES += -DNUM_CORES_PER_TILE=$(num_cores_per_tile) DEFINES += -DBANKING_FACTOR=$(banking_factor) +DEFINES += -DNUM_BANKS=$(shell awk 'BEGIN{print $(banking_factor)*$(num_cores)}') DEFINES += -DNUM_CORES_PER_GROUP=$(shell awk 'BEGIN{print $(num_cores)/$(num_groups)}') DEFINES += -DNUM_TILES_PER_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)}') DEFINES += -DLOG2_NUM_CORES_PER_TILE=$(shell awk 'BEGIN{print log($(num_cores_per_tile))/log(2)}')