Skip to content

Commit

Permalink
[software] Add explanation for the use of defines
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Dec 19, 2024
1 parent 0f1de6f commit 5dd55e0
Show file tree
Hide file tree
Showing 34 changed files with 325 additions and 224 deletions.
19 changes: 7 additions & 12 deletions software/apps/baremetal/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,13 @@ APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c
BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
ALL := $(APPS)

FP_APPS := axpy_f16 axpy_f32
FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16
FP_APPS += cmatmul_f16 matmul_f16 matmul_f32
FP_APPS += dotp_f16 dotp_f32
FP_APPS += mimo_mmse_f32 mimo_mmse_f16 mimo_mmse_f8 ofdm_f16

I_APPS := synth_i32
I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
I_APPS += cmatmul_q16 mimo_mmse_q16

ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
# Name suffixes used to classify the applications listed in $(ALL): an app
# belongs to a group when its name ends in _<suffix>.
FP_SUFFIXES := f16 f32 f8
I_SUFFIXES := q16 q32 i16 i32 i8
# NOTE(review): the two names below look swapped -- I_APPS collects the apps
# whose suffix is in FP_SUFFIXES, and FP_APPS collects the apps whose suffix is
# in I_SUFFIXES. The filter-out lines further down rely on this naming, so
# rename both the definitions and their uses together, or not at all.
I_APPS := $(foreach suf, $(FP_SUFFIXES), $(filter %_$(suf), $(ALL)))
FP_APPS := $(foreach suf, $(I_SUFFIXES), $(filter %_$(suf), $(ALL)))
# Filter out applications
# ALL_GCC keeps every app except those matching FP_SUFFIXES;
# ALL_LLVM keeps every app except those matching I_SUFFIXES.
ALL_GCC := $(filter-out $(I_APPS), $(ALL))
ALL_LLVM := $(filter-out $(FP_APPS), $(ALL))

# Make all applications
all: $(ALL_GCC)
Expand Down
1 change: 0 additions & 1 deletion software/apps/baremetal/axpy_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#include "synchronization.h"

#include "data_axpy_f16.h"
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)

// Vectors for kernel computation
__fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
Expand Down
1 change: 0 additions & 1 deletion software/apps/baremetal/axpy_f32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#include "synchronization.h"

#include "data_axpy_f32.h"
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)

// Vectors for kernel computation
float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
Expand Down
2 changes: 1 addition & 1 deletion software/apps/baremetal/cfft_radix2_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include "synchronization.h"

#include "data_cfft_radix2_q16.h"
#define N_BANKS (NUM_CORES * BANKING_FACTOR)
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)

/* CFFT mempool libraries */
#include "baremetal/mempool_cfft_q16_bitreversal.h"
Expand Down
60 changes: 34 additions & 26 deletions software/apps/baremetal/cfft_radix4_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,31 @@

/* CFFT data libraries */
#include "data_cfft_radix4_f16.h"
#define N_BANKS (NUM_CORES * BANKING_FACTOR)
#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4))

/* CHOOSE ONE */
#define PARALLEL // Parallel FFT not "memory-aware".
// #define FOLDED // Parallel FFT with "memory-aware" load/store.
//#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
/*
======================
Parameters and defines
// Bitreversal index from table.
PARALLEL: When defined runs parallel FFT.
FOLDED: When defined runs parallel FFT with folded inputs in memory.
SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
N_FFTs_ROW: When the FFT is scheduled defines the number of FFTs run
sequentially by each core.
N_FFTs_COL: When the FFT is scheduled defines the number of independent FFTs
scheduled on columns.
BITREVERSETABLE: When defined bitreversal indices are fetched from a table, else
they are computed by cores.
FOLDED_TWIDDLES: When FOLDED is defined it can be defined to also fold the
twiddle factors in memory.
*/

#define PARALLEL
#define BITREVERSETABLE
// Also the twiddles have "memory-aware" load/stores.
// #define FOLDED_TWIDDLES

// Independent FFTs scheduled on one row (default 1).
#define N_FFTs_ROW 1
// Independent FFTs scheduled on columns (default 1).
#define N_FFTs_COL 1
#define N_FFTs_ROW (1)
#define N_FFTs_COL (1)
#if (N_FFTs_COL > MAX_COL)
#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)]
#endif

#include "baremetal/mempool_cfft_q16_bitreversal.h"
Expand All @@ -59,16 +65,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
#endif

#if (defined(SCHEDULED) || defined(FOLDED))
__fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
__fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
__fp16 l1_twiddleCoef_f16_src[8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
__fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
__fp16 l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS]
__attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
__fp16 l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS]
__attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
__fp16 l1_twiddleCoef_f16_src[8 * NUM_BANKS]
__attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
__fp16 l1_twiddleCoef_f16_dst[8 * NUM_BANKS]
__attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
__attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
#endif

int main() {
Expand Down Expand Up @@ -96,7 +102,7 @@ int main() {
if (core_id == 0) {
for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
for (uint32_t i = 0; i < N_FFTs_COL; i++) {
dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS),
l2_pSrc, N_CSAMPLES * sizeof(int32_t));
}
}
Expand All @@ -113,9 +119,11 @@ int main() {
for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
*(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] =
*(v2h *)&l2_twiddleCoef_f16[2 * i];
*(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
*(v2h *)&l1_twiddleCoef_f16_src[2 *
(i + j * N_WORDS_COL + 1 * NUM_BANKS)] =
*(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)];
*(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
*(v2h *)&l1_twiddleCoef_f16_src[2 *
(i + j * N_WORDS_COL + 2 * NUM_BANKS)] =
*(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
}
}
Expand Down
60 changes: 35 additions & 25 deletions software/apps/baremetal/cfft_radix4_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,31 @@

/* CFFT data libraries */
#include "data_cfft_radix4_q16.h"
#define N_BANKS (NUM_CORES * BANKING_FACTOR)
#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
#define MAX_COL (NUM_BANKS / (N_CSAMPLES / 4))

/* CHOOSE ONE */
//#define SINGLE // Single core FFT.
//#define PARALLEL // Parallel FFT not "memory-aware".
//#define FOLDED // Parallel FFT with "memory-aware" load/store.
#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
/*
======================
Parameters and defines
PARALLEL: When defined runs parallel FFT.
FOLDED: When defined runs parallel FFT with folded inputs in memory.
SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
N_FFTs_ROW: When the FFT is scheduled defines the number of FFTs run
sequentially by each core.
N_FFTs_COL: When the FFT is scheduled defines the number of independent FFTs
scheduled on columns.
BITREVERSETABLE: When defined bitreversal indices are fetched from a table, else
they are computed by cores.
FOLDED_TWIDDLES: When FOLDED is defined it can be defined to also fold the
twiddle factors in memory.
*/

// Bitreversal index from table.
#define PARALLEL
#define BITREVERSETABLE
// Independent FFTs scheduled on one row (default 1).
#define N_FFTs_ROW 2
// Independent FFTs scheduled on columns (default 1).
#define N_FFTs_COL 2

#define N_FFTs_ROW (1)
#define N_FFTs_COL (1)
#if (N_FFTs_COL > MAX_COL)
#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
#error Parallelization not supporting N_FFTs_COL > [NUM_BANKS / (N_CSAMPLES / 4)]
#endif
// Also the twiddles have "memory-aware" load/stores.
#define FOLDED_TWIDDLES
Expand All @@ -60,16 +68,16 @@ uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
#endif

#if (defined(SCHEDULED) || defined(FOLDED))
int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
int16_t l1_twiddleCoef_q16_src[8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
int16_t l1_pSrc[N_FFTs_ROW * 8 * NUM_BANKS]
__attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
int16_t l1_pDst[N_FFTs_ROW * 8 * NUM_BANKS]
__attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
int16_t l1_twiddleCoef_q16_src[8 * NUM_BANKS]
__attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
int16_t l1_twiddleCoef_q16_dst[8 * NUM_BANKS]
__attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
__attribute__((aligned(4 * NUM_BANKS), section(".l1_prio")));
#endif

int main() {
Expand Down Expand Up @@ -97,7 +105,7 @@ int main() {
if (core_id == 0) {
for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
for (uint32_t i = 0; i < N_FFTs_COL; i++) {
dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * NUM_BANKS),
l2_pSrc, N_CSAMPLES * sizeof(int32_t));
}
}
Expand All @@ -112,9 +120,11 @@ int main() {
for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
*(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL)] =
*(v2s *)&l2_twiddleCoef_q16[2 * i];
*(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
*(v2s *)&l1_twiddleCoef_q16_src[2 *
(i + j * N_WORDS_COL + 1 * NUM_BANKS)] =
*(v2s *)&l2_twiddleCoef_q16[2 * (i * 2U)];
*(v2s *)&l1_twiddleCoef_q16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
*(v2s *)&l1_twiddleCoef_q16_src[2 *
(i + j * N_WORDS_COL + 2 * NUM_BANKS)] =
*(v2s *)&l2_twiddleCoef_q16[2 * (i * 3U)];
}
}
Expand Down
8 changes: 8 additions & 0 deletions software/apps/baremetal/chest_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@
#include "baremetal/mempool_chest_f16.h"
#include "data_chest_f16.h"

/*
======================
Parameters and defines
SINGLE: When defined runs single-core Channel Estimation.
PARALLEL: When defined runs parallel Channel Estimation.
*/

//#define SINGLE
#define PARALLEL

Expand Down
8 changes: 8 additions & 0 deletions software/apps/baremetal/chest_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@
#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_chest_q16.h"

/*
======================
Parameters and defines
SINGLE: When defined runs single-core Channel Estimation.
PARALLEL: When defined runs parallel Channel Estimation.
*/

#define PARALLEL

int16_t l1_PilotTX[2 * N_TX * N_SAMPLES]
Expand Down
9 changes: 9 additions & 0 deletions software/apps/baremetal/cholesky_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@
#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_cholesky_f16s.h"

/*
======================
Parameters and defines
SINGLE: When defined runs single-core Cholesky Decomposition.
PARALLEL: When defined runs parallel Cholesky Decomposition.
FOLDED: When defined as 1, intermediate results are folded in memory.
*/

#define SINGLE
#define FOLDED (0)

Expand Down
28 changes: 15 additions & 13 deletions software/apps/baremetal/cholesky_q32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include "synchronization.h"

#define HALF (1023)
#define N_BANKS (NUM_CORES * BANKING_FACTOR)
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
#define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b))
#define FIX_MUL(a, b) ((int32_t)((a * b + HALF) >> FIXED_POINT))
#define ABS(a) (a > 0 ? a : -a)
Expand All @@ -31,18 +31,19 @@
#define N_COL 1
#define N_ROW 1
int32_t l1_A[matrix_N * matrix_N]
__attribute__((aligned(N_BANKS), section(".l1")));
__attribute__((aligned(NUM_BANKS), section(".l1")));
int32_t l1_L[matrix_N * matrix_N]
__attribute__((aligned(N_BANKS), section(".l1")));
int32_t l1_y[matrix_N] __attribute__((aligned(N_BANKS), section(".l1")));
__attribute__((aligned(NUM_BANKS), section(".l1")));
int32_t l1_y[matrix_N] __attribute__((aligned(NUM_BANKS), section(".l1")));
#else
int32_t l1_AA[matrix_N * N_BANKS]
__attribute__((aligned(N_BANKS), section(".l1_prio")));
int32_t l1_LL[N_ROW * matrix_N * N_BANKS]
__attribute__((aligned(N_BANKS), section(".l1_prio")));
int32_t l1_LR[N_ROW * matrix_N * N_BANKS]
__attribute__((aligned(N_BANKS), section(".l1_prio")));
int32_t l1_yy[N_BANKS] __attribute__((aligned(N_BANKS), section(".l1_prio")));
int32_t l1_AA[matrix_N * NUM_BANKS]
__attribute__((aligned(NUM_BANKS), section(".l1_prio")));
int32_t l1_LL[N_ROW * matrix_N * NUM_BANKS]
__attribute__((aligned(NUM_BANKS), section(".l1_prio")));
int32_t l1_LR[N_ROW * matrix_N * NUM_BANKS]
__attribute__((aligned(NUM_BANKS), section(".l1_prio")));
int32_t l1_yy[NUM_BANKS]
__attribute__((aligned(NUM_BANKS), section(".l1_prio")));
#endif

int main() {
Expand All @@ -58,11 +59,12 @@ int main() {
for (uint32_t idx_col = 0; idx_col < N_COL; idx_col++) {
l1_yy[idx_col * matrix_N + i] = l2_y[i];
for (uint32_t j = 0; j < matrix_N; j++) {
l1_AA[idx_col * matrix_N + i * N_BANKS + j] = l2_A[i * matrix_N + j];
l1_AA[idx_col * matrix_N + i * NUM_BANKS + j] =
l2_A[i * matrix_N + j];
}
}
}
for (uint32_t i = 0; i < N_ROW * matrix_N * N_BANKS; i++) {
for (uint32_t i = 0; i < N_ROW * matrix_N * NUM_BANKS; i++) {
l1_LL[i] = 0;
l1_LR[i] = 0;
}
Expand Down
15 changes: 13 additions & 2 deletions software/apps/baremetal/cmatmul_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,18 @@

#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_cmatmul_f16.h"
#define PARALLEL_4x4

/*
======================
Parameters and defines
SINGLE_2x2: Single-core matmul on 2x2 tiles.
PARALLEL_2x2: Parallel matmul on 2x2 C-tiles.
PARALLEL_2x4: Parallel matmul on 2x4 C-tiles.
PARALLEL_4x4: Parallel matmul on 4x4 C-tiles.
PARALLEL_4x4_COPIES_A: Parallel matmul on 4x4 C-tiles, copies of A in memory to
avoid banking conflicts.
*/

#if defined(PARALLEL_4x4_COPIES_A)
__fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)]
Expand Down Expand Up @@ -51,7 +62,7 @@ int main() {
// Wait at barrier until everyone is ready
mempool_barrier(num_cores);

#if defined(SINGLE_CORE)
#if defined(SINGLE_2x2)
// Execute function to test.
if (core_id == 0) {
mempool_start_benchmark();
Expand Down
8 changes: 8 additions & 0 deletions software/apps/baremetal/cmatmul_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
#include "baremetal/mempool_cmatmul_q16.h"
#include "data_cmatmul_q16.h"

/*
======================
Parameters and defines
SINGLE: When defined runs single-core matmul.
PARALLEL: When defined runs parallel matmul.
*/

#define PARALLEL
#define dim_M (matrix_M)
#define dim_N (matrix_N)
Expand Down
Loading

0 comments on commit 5dd55e0

Please sign in to comment.