Skip to content

Commit

Permalink
[software] Add explanation for the use of defines
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Dec 19, 2024
1 parent 0f1de6f commit ef75e96
Show file tree
Hide file tree
Showing 31 changed files with 237 additions and 142 deletions.
19 changes: 7 additions & 12 deletions software/apps/baremetal/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,13 @@ APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c
BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
ALL := $(APPS)

FP_APPS := axpy_f16 axpy_f32
FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16
FP_APPS += cmatmul_f16 matmul_f16 matmul_f32
FP_APPS += dotp_f16 dotp_f32
FP_APPS += mimo_mmse_f32 mimo_mmse_f16 mimo_mmse_f8 ofdm_f16

I_APPS := synth_i32
I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
I_APPS += cmatmul_q16 mimo_mmse_q16

ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
# Classify applications by the data-type suffix of their directory name.
FP_SUFFIXES := f16 f32 f8
I_SUFFIXES := q16 q32 i16 i32 i8
# Floating-point applications: names ending in one of FP_SUFFIXES.
FP_APPS := $(foreach suf,$(FP_SUFFIXES),$(filter %_$(suf),$(ALL)))
# Integer/fixed-point applications: names ending in one of I_SUFFIXES.
I_APPS := $(foreach suf,$(I_SUFFIXES),$(filter %_$(suf),$(ALL)))
# Filter out applications: the GCC list drops the floating-point applications,
# the LLVM list drops the integer/fixed-point ones.
ALL_GCC := $(filter-out $(FP_APPS),$(ALL))
ALL_LLVM := $(filter-out $(I_APPS),$(ALL))

# Make all applications
all: $(ALL_GCC)
Expand Down
1 change: 0 additions & 1 deletion software/apps/baremetal/axpy_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#include "synchronization.h"

#include "data_axpy_f16.h"
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)

// Vectors for kernel computation
__fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
Expand Down
1 change: 0 additions & 1 deletion software/apps/baremetal/axpy_f32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#include "synchronization.h"

#include "data_axpy_f32.h"
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)

// Vectors for kernel computation
float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
Expand Down
30 changes: 18 additions & 12 deletions software/apps/baremetal/cfft_radix4_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,26 @@
#define N_BANKS (NUM_CORES * BANKING_FACTOR)
#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))

/* CHOOSE ONE */
#define PARALLEL // Parallel FFT not "memory-aware".
// #define FOLDED // Parallel FFT with "memory-aware" load/store.
//#define SCHEDULED // Folded FFTs arranged in rows and cols.'''

// Bitreversal index from table.
/*
======================
Parameters and defines
PARALLEL: When defined runs parallel FFT.
FOLDED: When defined runs parallel FFT with folded inputs in memory.
SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
N_FFTs_ROW: When the FFT is scheduled, defines the number of FFTs run
sequentially by each core.
N_FFTs_COL: When the FFT is scheduled, defines the number of independent FFTs
scheduled on columns.
BITREVERSETABLE: When defined, bit-reversal indices are fetched from a table;
otherwise they are computed by the cores.
FOLDED_TWIDDLES: When FOLDED is defined, it can be defined to also fold the
twiddle factors in memory.
*/

#define PARALLEL
#define BITREVERSETABLE
// Also the twiddles have "memory-aware" load/stores.
// #define FOLDED_TWIDDLES

// Independent FFTs scheduled on one row (default 1).
#define N_FFTs_ROW 1
// Independent FFTs scheduled on columns (default 1).
#define N_FFTs_COL 1
#define N_FFTs_ROW (1)
#define N_FFTs_COL (1)
#if (N_FFTs_COL > MAX_COL)
#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
#endif
Expand Down
28 changes: 18 additions & 10 deletions software/apps/baremetal/cfft_radix4_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,26 @@
#define N_BANKS (NUM_CORES * BANKING_FACTOR)
#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))

/* CHOOSE ONE */
//#define SINGLE // Single core FFT.
//#define PARALLEL // Parallel FFT not "memory-aware".
//#define FOLDED // Parallel FFT with "memory-aware" load/store.
#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
/*
======================
Parameters and defines
PARALLEL: When defined runs parallel FFT.
FOLDED: When defined runs parallel FFT with folded inputs in memory.
SCHEDULED: When defined runs multiple parallel folded-inputs FFTs.
N_FFTs_ROW: When the FFT is scheduled, defines the number of FFTs run
sequentially by each core.
N_FFTs_COL: When the FFT is scheduled, defines the number of independent FFTs
scheduled on columns.
BITREVERSETABLE: When defined, bit-reversal indices are fetched from a table;
otherwise they are computed by the cores.
FOLDED_TWIDDLES: When FOLDED is defined, it can be defined to also fold the
twiddle factors in memory.
*/

// Bitreversal index from table.
#define PARALLEL
#define BITREVERSETABLE
// Independent FFTs scheduled on one row (default 1).
#define N_FFTs_ROW 2
// Independent FFTs scheduled on columns (default 1).
#define N_FFTs_COL 2

#define N_FFTs_ROW (1)
#define N_FFTs_COL (1)
#if (N_FFTs_COL > MAX_COL)
#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
#endif
Expand Down
8 changes: 8 additions & 0 deletions software/apps/baremetal/chest_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@
#include "baremetal/mempool_chest_f16.h"
#include "data_chest_f16.h"

/*
======================
Parameters and defines
SINGLE: When defined runs single-core Channel Estimation.
PARALLEL: When defined runs parallel Channel Estimation.
*/

//#define SINGLE
#define PARALLEL

Expand Down
8 changes: 8 additions & 0 deletions software/apps/baremetal/chest_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@
#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_chest_q16.h"

/*
======================
Parameters and defines
SINGLE: When defined runs single-core Channel Estimation.
PARALLEL: When defined runs parallel Channel Estimation.
*/

#define PARALLEL

int16_t l1_PilotTX[2 * N_TX * N_SAMPLES]
Expand Down
9 changes: 9 additions & 0 deletions software/apps/baremetal/cholesky_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@
#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_cholesky_f16s.h"

/*
======================
Parameters and defines
SINGLE: When defined runs single-core Cholesky Decomposition.
PARALLEL: When defined runs parallel Cholesky Decomposition.
FOLDED: When defined as 1, intermediate results are folded in memory.
*/

#define SINGLE
#define FOLDED (0)

Expand Down
15 changes: 13 additions & 2 deletions software/apps/baremetal/cmatmul_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,18 @@

#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_cmatmul_f16.h"
#define PARALLEL_4x4

/*
======================
Parameters and defines
SINGLE_2x2: Single-core matmul on 2x2 tiles.
PARALLEL_2x2: Parallel matmul on 2x2 C-tiles.
PARALLEL_2x4: Parallel matmul on 2x4 C-tiles.
PARALLEL_4x4: Parallel matmul on 4x4 C-tiles.
PARALLEL_4x4_COPIES_A: Parallel matmul on 4x4 C-tiles, with copies of A in
memory to avoid banking conflicts.
*/

#if defined(PARALLEL_4x4_COPIES_A)
__fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)]
Expand Down Expand Up @@ -51,7 +62,7 @@ int main() {
// Wait at barrier until everyone is ready
mempool_barrier(num_cores);

#if defined(SINGLE_CORE)
#if defined(SINGLE_2x2)
// Execute function to test.
if (core_id == 0) {
mempool_start_benchmark();
Expand Down
8 changes: 8 additions & 0 deletions software/apps/baremetal/cmatmul_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
#include "baremetal/mempool_cmatmul_q16.h"
#include "data_cmatmul_q16.h"

/*
======================
Parameters and defines
SINGLE: When defined runs single-core matmul.
PARALLEL: When defined runs parallel matmul.
*/

#define PARALLEL
#define dim_M (matrix_M)
#define dim_N (matrix_N)
Expand Down
15 changes: 0 additions & 15 deletions software/apps/baremetal/dotp_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@
#include "synchronization.h"

#include "data_dotp_f16.h"
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
// #define SINGLE_CORE_REDUCTION
#define BINARY_REDUCTION

// Vectors for kernel computation
__fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
Expand Down Expand Up @@ -47,18 +44,6 @@ int main() {
}
mempool_barrier(num_cores);

// // SINGLE-CORE
// time_init = mempool_get_timer();
// dotp_f16s(l1_X, l1_Y, sum, array_N);
// // dotp_f16s_unrolled4(l1_X, l1_Y, sum, array_N);
// time_end = mempool_get_timer();

// // PARALLEL
// time_init = mempool_get_timer();
// dotp_f16vecp_unrolled4(l1_X, l1_Y, sum, array_N, num_cores);
// // dotp_f16p(l1_X, l1_Y, sum, array_N, num_cores);
// time_end = mempool_get_timer();

// PARALLEL, LOCAL ACCESSES
time_init = mempool_get_timer();
dotp_f16vecp_local_unrolled4(l1_X, l1_Y, sum, array_N);
Expand Down
13 changes: 0 additions & 13 deletions software/apps/baremetal/dotp_f32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
#include "synchronization.h"

#include "data_dotp_f32.h"
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
// #define SINGLE_CORE_REDUCTION
#define BINARY_REDUCTION

// Vectors for kernel computation
float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
Expand Down Expand Up @@ -47,16 +44,6 @@ int main() {
}
mempool_barrier(num_cores);

// // SINGLE-CORE
// time_init = mempool_get_timer();
// dotp_f32s_unrolled4(l1_A, l1_B, sum, array_N);
// time_end = mempool_get_timer();

// // PARALLEL
// time_init = mempool_get_timer();
// dotp_f32p(l1_A, l1_B, sum, array_N, num_cores);
// time_end = mempool_get_timer();

// PARALLEL, LOCAL ACCESSES
time_init = mempool_get_timer();
dotp_f32p_local_unrolled4(l1_X, l1_Y, sum, array_N);
Expand Down
15 changes: 0 additions & 15 deletions software/apps/baremetal/dotp_i32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@
#include "synchronization.h"

#include "data_dotp_i32.h"
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
#define LOG_BARRIERS
// #define ATOMIC_REDUCTION
// #define SINGLE_CORE_REDUCTION
#define BINARY_REDUCTION

// Vectors for kernel computation
int32_t l1_X[array_N] __attribute__((aligned(array_N), section(".l1_prio")));
Expand Down Expand Up @@ -49,16 +44,6 @@ int main() {
}
mempool_barrier(num_cores);

// // SINGLE-CORE
// time_init = mempool_get_timer();
// dotp_i32s_unrolled4(l1_A, l1_B, sum, array_N);
// time_end = mempool_get_timer();

// // PARALLEL
// time_init = mempool_get_timer();
// dotp_i32p(l1_A, l1_B, sum, array_N, num_cores);
// time_end = mempool_get_timer();

// PARALLEL, LOCAL ACCESSES
time_init = mempool_get_timer();
dotp_i32p_local_unrolled4(l1_X, l1_Y, sum, array_N);
Expand Down
8 changes: 7 additions & 1 deletion software/apps/baremetal/matmul_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@
#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_matmul_f16.h"

#define PARALLEL
/*
======================
Parameters and defines
SINGLE: When defined runs single-core matmul.
PARALLEL: When defined runs parallel matmul.
*/

__fp16 matrix_a[matrix_M * matrix_N]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
Expand Down
9 changes: 7 additions & 2 deletions software/apps/baremetal/matmul_f32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,13 @@
#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_matmul_f32.h"

#define PARALLEL
#define ASM
/*
======================
Parameters and defines
SINGLE: When defined runs single-core matmul.
PARALLEL: When defined runs parallel matmul.
*/

float matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio")));
float matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio")));
Expand Down
46 changes: 32 additions & 14 deletions software/apps/baremetal/mimo_mmse_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,45 @@
#include "baremetal/mempool_mimo_mmse_f16s.h"

#include "data_mimo_mmse_f16.h"
#define ZF (0) // When asserted use zero-forcing
#define FOLD (1) // When asserted fold matrices in memory
#define NUM_BANKS (BANKING_FACTOR * NUM_CORES)

/*
======================
Parameters and defines
DOUBLE_BUFFERING: When defined benchmark double buffered MIMO-MMSE, including
L2-L1 transfers.
For MIMO-MMSE without L2-L1 transfers:
PARALLEL: When defined benchmark parallel MIMO-MMSE.
SINGLE: When defined benchmark single-core MIMO-MMSE.
VEC: When defined benchmark SIMD-vectorized kernels.
ZF: When defined as 1, use the zero-forcing detector.
FOLD: When defined as 1, fold matrices in memory.
For MIMO-MMSE with L2-L1 transfers:
DMA_TRANSFER1: When defined, transfer inputs for the next round at the
beginning of computation.
DMA_TRANSFER2: When defined, transfer inputs for the next round after the
Hermitian computation.
N_ROUNDS: Defines the number of double-buffering rounds.
*/

#define ZF (0)
#define FOLD (1)
#define PARALLEL
#define VEC

#ifndef DOUBLE_BUFFERING

/**********************************************************
**********************************************************
_ _ ___ _ _ _____ __
| \ | |/ _ \ | | / |_ _| __ __ _ _ __ ___ / _|
| \| | | | |_____| | | | | || '__/ _` | '_ \/ __| |_
| |\ | |_| |_____| |___| | | || | | (_| | | | \__ \ _|
|_| \_|\___/ |_____|_| |_||_| \__,_|_| |_|___/_|(_)
_ _ ___ _____ __
| \ | |/ _ \ |_ _| __ __ _ _ __ ___ / _|
| \| | | | |_____ | || '__/ _` | '_ \/ __| |_
| |\ | |_| |_____ | || | | (_| | | | \__ \ _|
|_| \_|\___/ |_||_| \__,_|_| |_|___/_|(_)
***********************************************************
***********************************************************/

#ifndef DOUBLE_BUFFERING

#if FOLD
#define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
#define NUM_COL (NUM_BANKS / N_TX)
Expand Down Expand Up @@ -193,6 +213,8 @@ int main() {
return 0;
}

#else

/**********************************************************
**********************************************************
____ __ __ _ _____ __
Expand All @@ -204,10 +226,6 @@ int main() {
***********************************************************
***********************************************************/

#else
#define N_ROUNDS (1)
#define DMA_TRANSFER1

// Inputs-Outputs even double-buffering rounds
__fp16 l1A_H[2 * N_TX * N_RX * N_ITR]
__attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
Expand Down
Loading

0 comments on commit ef75e96

Please sign in to comment.