diff --git a/Makefile b/Makefile index 47522c107..bcd4d7138 100644 --- a/Makefile +++ b/Makefile @@ -218,6 +218,7 @@ toolchain/riscv-opcodes/*: format: $(ROOT_DIR)/scripts/run_clang_format.py --clang-format-executable=$(LLVM_INSTALL_DIR)/bin/clang-format -i -r $(ROOT_DIR) + find $(ROOT_DIR)/software/data -name '*.py' -exec autopep8 --in-place --aggressive {} + clean: clean-riscv-tests rm -rf $(INSTALL_DIR) diff --git a/python-requirements.txt b/python-requirements.txt index 09e19ccd7..d0e903cda 100644 --- a/python-requirements.txt +++ b/python-requirements.txt @@ -14,3 +14,4 @@ pandas progressbar2 tabulate sympy +scipy diff --git a/software/.gitignore b/software/.gitignore index 49abad0af..35dccde4a 100644 --- a/software/.gitignore +++ b/software/.gitignore @@ -27,3 +27,4 @@ runtime/arch.ld # Generated data files data.h data/data*.h +data/__pyc* diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile index cc9e2db7a..c4a2a40a3 100644 --- a/software/apps/baremetal/Makefile +++ b/software/apps/baremetal/Makefile @@ -17,8 +17,6 @@ RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime) include $(RUNTIME_DIR)/runtime.mk APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c")) -DATA := $(patsubst %.args,%.h,$(shell find $(APPS_DIR) -name "data.args")) -ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py)) BINARIES := $(addprefix $(BIN_DIR)/,$(APPS)) ALL := $(APPS) @@ -34,7 +32,7 @@ all_llvm: $(ALL_LLVM) $(APPS): % : $(BIN_DIR)/% $(APPS_DIR)/Makefile $(shell find $(RUNTIME_DIR)/**.{S,c,h,ld} -type f) .PHONY: $(BINARIES) -$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) $(DATA) $(ALLPYS) update_opcodes +$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) data_%.h update_opcodes mkdir -p $(dir $@) $(RISCV_CC) -Iinclude -o $@ $< $(RUNTIME) $(RISCV_LDFLAGS) -T$(RUNTIME_DIR)/link.ld $(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) -D $@ > $@.dump @@ -50,5 +48,6 @@ clean: rm -vf $(addsuffix /main.c.o,$(APPS)) rm
-vf $(RUNTIME) rm -vf $(LINKER_SCRIPT) + rm -vf $(wildcard $(DATA_DIR)/data_*.h) .INTERMEDIATE: $(addsuffix /main.c.o,$(APPS)) diff --git a/software/apps/baremetal/axpy_i32/main.c b/software/apps/baremetal/axpy_i32/main.c index a9354796e..c391ba040 100644 --- a/software/apps/baremetal/axpy_i32/main.c +++ b/software/apps/baremetal/axpy_i32/main.c @@ -5,125 +5,50 @@ // Author: Yichao Zhang, ETH Zurich #include +#include #include -#include "baremetal/mempool_axpy_i32p.h" +/* Mempool runtime libraries */ +#include "builtins_v2.h" +#include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include - -#if NUM_CORES > 32 -#define size_M 64 -#define size_N 64 -#else -#define size_M (NUM_CORES) -#define size_N (NUM_CORES) -#endif - -#define ALPHA 2 -#if NUM_CORES > 32 -int32_t data_x[size_M * size_N] - __attribute__((aligned(64 * 1024), section(".l1"))); -int32_t data_y[size_M * size_N] - __attribute__((aligned(64 * 1024), section(".l1"))); -int32_t data_y_copy[size_M * size_N] - __attribute__((aligned(64 * 1024), section(".l1"))); -#else -int32_t data_x[size_M * size_N] __attribute__((aligned(32), section(".l1"))); -int32_t data_y[size_M * size_N] __attribute__((aligned(32), section(".l1"))); -int32_t data_y_copy[size_M * size_N] - __attribute__((aligned(32), section(".l1"))); -#endif +#include "baremetal/mempool_axpy_i32p.h" +#include "baremetal/mempool_checks.h" +#include "data_axpy_i32.h" +int32_t l1_X[array_N] + __attribute__((aligned(NUM_CORES * sizeof(uint32_t)), section(".l1"))); +int32_t l1_Y[array_N] + __attribute__((aligned(NUM_CORES * sizeof(uint32_t)), section(".l1"))); int volatile error __attribute__((section(".l1"))); -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id, - uint32_t num_cores) { - // How many rows/columns to split the matrix into - uint32_t const split = 8; - if (num_columns > num_rows) { - // Parallelize over columns 
- uint32_t const c_start = (num_rows / split) * (core_id % split); - uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1); - for (uint32_t j = (core_id / split); j < num_columns; - j += (num_cores / split)) { - for (uint32_t i = c_start; i < c_end; ++i) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } else { - // Parallelize over rows - uint32_t const c_start = (num_columns / split) * (core_id % split); - uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1); - for (uint32_t i = (core_id / split); i < num_rows; - i += (num_cores / split)) { - for (uint32_t j = c_start; j < c_end; ++j) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } -} - -int verify_axpy(int32_t *matrix_X, int32_t *matrix_Y, int32_t *matrix_Y_COPY, - int32_t alpha, uint32_t elements) { - for (uint32_t i = 0; i < elements; i++) { - if (matrix_Y[i] != matrix_X[i] * alpha + matrix_Y_COPY[i]) { - return 1; - } - } - return 0; -} - int main() { uint32_t const core_id = mempool_get_core_id(); uint32_t const num_cores = mempool_get_core_count(); - uint32_t const total_elements = size_M * size_N; - - // Seed for create element matrix - int32_t const A_a = 1; - int32_t const A_b = 1; - int32_t const A_c = -32; - int32_t const B_a = 2; - int32_t const B_b = 1; - int32_t const B_c = 16; - - // Initialize synchronization variables mempool_barrier_init(core_id); + + // Initialize data if (core_id == 0) { - printf("Initialize %3d cores\n", num_cores); + dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t)); + dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t)); error = 0; } - - // init_elements; - init_matrix(data_x, size_M, size_N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(data_y, size_M, size_N, B_a, B_b, B_c, core_id, num_cores); - init_matrix(data_y_copy, size_M, size_N, B_a, B_b, B_c, core_id, num_cores); mempool_barrier(num_cores); - // start kernel testing + // Benchmark 
mempool_start_benchmark(); - calc_axpy_unloop_x4_localbank(data_x, data_y, ALPHA, total_elements, core_id, - num_cores); + calc_axpy_unloop_x4_localbank(l1_X, l1_Y, ALPHA, array_N, core_id, num_cores); mempool_barrier(num_cores); mempool_stop_benchmark(); - // end kernel testing // Verify results - if (core_id == 0) { - printf("START CHECKING RESULTS\n"); - if (verify_axpy(data_x, data_y, data_y_copy, ALPHA, total_elements)) { - printf("RESULTS ERROR\n"); - error = 1; - } else { - printf("RESULTS CORRECT\n"); - } - } + mempool_check_i32(l1_Y, l2_Z, array_N, 0, 0); mempool_barrier(num_cores); - return error; + return 0; } diff --git a/software/apps/baremetal/cfft_radix2_q16/main.c b/software/apps/baremetal/cfft_radix2_q16/main.c index 105cf6370..e23fb929e 100644 --- a/software/apps/baremetal/cfft_radix2_q16/main.c +++ b/software/apps/baremetal/cfft_radix2_q16/main.c @@ -19,6 +19,7 @@ #include "synchronization.h" #include "data_cfft_radix2_q16.h" +#define N_BANKS (NUM_CORES * BANKING_FACTOR) /* CFFT mempool libraries */ #include "baremetal/mempool_cfft_q16_bitreversal.h" @@ -69,7 +70,7 @@ int main() { mempool_stop_benchmark(); #endif - mempool_check_q16(l1_pSrc, l2_pRes, 2 * N_CSAMPLES, TOLERANCE, 0); + mempool_check_i16(l1_pSrc, l2_pRes, 2 * N_CSAMPLES, TOLERANCE, 0); mempool_barrier(num_cores); return 0; } diff --git a/software/apps/baremetal/cfft_radix4_q16/main.c b/software/apps/baremetal/cfft_radix4_q16/main.c index 88d7182fa..08ed80e9b 100644 --- a/software/apps/baremetal/cfft_radix4_q16/main.c +++ b/software/apps/baremetal/cfft_radix4_q16/main.c @@ -19,6 +19,8 @@ /* CFFT data libraries */ #include "data_cfft_radix4_q16.h" +#define N_BANKS (NUM_CORES * BANKING_FACTOR) +#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) /* CHOOSE ONE */ //#define SINGLE // Single core FFT. 
@@ -225,7 +227,7 @@ int main() { printf("02: END COMPUTATION\n"); } - mempool_check_q16(pRes, l2_pRes, 2 * N_CSAMPLES, TOLERANCE, 0); + mempool_check_i16(pRes, l2_pRes, 2 * N_CSAMPLES, TOLERANCE, 0); mempool_barrier(num_cores); return 0; } diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c index eecac204a..498c60260 100644 --- a/software/apps/baremetal/chest_q16/main.c +++ b/software/apps/baremetal/chest_q16/main.c @@ -62,7 +62,7 @@ int main() { #endif /* Check */ - mempool_check_q16(l1_HEST, l2_HEST, 2 * N_TX * N_RX, 0, 0); + mempool_check_i16(l1_HEST, l2_HEST, 2 * N_TX * N_RX, 0, 0); mempool_barrier(num_cores); return 0; } diff --git a/software/apps/baremetal/cholesky_q32/initialization.h b/software/apps/baremetal/cholesky_q32/initialization.h deleted file mode 100644 index 79993afa8..000000000 --- a/software/apps/baremetal/cholesky_q32/initialization.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#define FIXED_POINT 10 -#define HALF 1023 -#define FIX_DIV(a, b) ((int32_t)((a << FIXED_POINT) / b)) -#define FIX_MUL(a, b) ((int32_t)((a * b + HALF) >> FIXED_POINT)) -#define ABS(a) (a > 0 ? 
a : -a) - -void transpose(int32_t *matrix, int32_t *t_matrix, int32_t n); -void matrixmult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, - int32_t n); -void display(int32_t *matrix, uint32_t num_rows, uint32_t num_columns); -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id); -void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t core_id); - -void transpose(int32_t *matrix, int32_t *t_matrix, int32_t n) { - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { - t_matrix[j * n + i] = matrix[i * n + j]; - } - } -} - -void matrixmult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product, - int32_t n) { - int k; - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { // not j < M - matrix_product[i * n + j] = 0; - for (k = 0; k < n; k++) { - matrix_product[i * n + j] += - FIX_MUL(matrix_1[i * n + k], matrix_2[k * n + j]); - } - } - } -} - -void display(int32_t *matrix, uint32_t num_rows, uint32_t num_columns) { -#if defined(FOLDED) - uint32_t i, j; - for (i = 0; i < num_rows; i++) { - for (j = 0; j < num_columns; j++) { - printf("%8d", matrix[i * N_BANKS + j]); - } - printf("\n"); - } -#else - uint32_t i, j; - for (i = 0; i < num_rows; i++) { - for (j = 0; j < num_columns; j++) { - printf("%8d ", matrix[i * num_columns + j]); - } - printf("\n"); - } -#endif -} - -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id) { - if (core_id == 0) { - for (uint32_t j = 0; j < num_rows; j++) { - for (uint32_t i = 0; i < num_columns; i++) { - matrix[j * num_columns + i] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } -} - -void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t core_id) { - if (core_id == 0) { - for (uint32_t i = 0; i < num_columns; i++) { - for (uint32_t j = 0; j < num_rows; j++) { - matrix[j * 
num_columns + i] = 0; - } - } - } -} diff --git a/software/apps/baremetal/cholesky_q32/main.c b/software/apps/baremetal/cholesky_q32/main.c index a670da805..64fbf3b2f 100644 --- a/software/apps/baremetal/cholesky_q32/main.c +++ b/software/apps/baremetal/cholesky_q32/main.c @@ -4,180 +4,126 @@ // Author: Marco Bertuletti, ETH Zurich +#include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#define N_BANKS (NUM_CORES * 4) -/* Matrix dimension */ -#define N 4 - +#define HALF (1023) +#define N_BANKS (NUM_CORES * BANKING_FACTOR) +#define FIX_DIV(a, b) ((int32_t)(((a) << FIXED_POINT) / (b))) +#define FIX_MUL(a, b) ((int32_t)(((a) * (b) + HALF) >> FIXED_POINT)) +#define ABS(a) ((a) > 0 ? (a) : -(a)) #define SINGLE //#define PARALLEL //#define SCHEDULING -//#define LINSOLVE4 +//#define LINSOLVER -#define N_COL 1 -#define N_ROW 1 -#define N_ITR 1 +#include "data_cholesky_q32.h" + +#include "baremetal/mempool_cholesky_q32p.h" +#include "baremetal/mempool_cholesky_q32s.h" +#include "baremetal/mempool_linearsolver_q32p.h" +#include "baremetal/mempool_linearsolver_q32s.h" -int32_t A_matrix[N * N] __attribute__((aligned(N_BANKS), section(".l1"))); -int32_t AT_matrix[N * N] __attribute__((aligned(N_BANKS), section(".l1"))); -int32_t M_matrix[N * N] __attribute__((aligned(N_BANKS), section(".l1"))); +#define N_COL 1 +#define N_ROW 1 #ifndef SCHEDULING -int32_t L_matrix[N * N] __attribute__((aligned(N_BANKS), section(".l1"))); -int32_t In[N] __attribute__((aligned(N_BANKS), section(".l1"))); -#else -// Matrices to generate the hermitian -int32_t In_matrix[N * N_BANKS] - __attribute__((aligned(N_BANKS), section(".l1"))); -// Outputs and input vector for linear system solution -int32_t LL_matrix[N_ROW * N * N_BANKS] +int32_t l1_A[matrix_N * matrix_N] __attribute__((aligned(N_BANKS), section(".l1"))); -int32_t LR_matrix[N_ROW * N * N_BANKS] +int32_t l1_L[matrix_N * matrix_N] __attribute__((aligned(N_BANKS), section(".l1"))); -int32_t In[N_BANKS]
__attribute__((aligned(N_BANKS), section(".l1"))); +int32_t l1_y[matrix_N] __attribute__((aligned(N_BANKS), section(".l1"))); +#else +int32_t l1_AA[matrix_N * N_BANKS] + __attribute__((aligned(N_BANKS), section(".l1_prio"))); +int32_t l1_LL[N_ROW * matrix_N * N_BANKS] + __attribute__((aligned(N_BANKS), section(".l1_prio"))); +int32_t l1_LR[N_ROW * matrix_N * N_BANKS] + __attribute__((aligned(N_BANKS), section(".l1_prio"))); +int32_t l1_yy[N_BANKS] __attribute__((aligned(N_BANKS), section(".l1_prio"))); #endif -#include "initialization.h" - -#include "baremetal/mempool_cholesky_q32s.h" -#include "baremetal/mempool_linearsolver_q32s.h" - -#include "baremetal/mempool_cholesky_q32p.h" -#include "baremetal/mempool_linearsolver_q32p.h" - -void initialize() { +int main() { uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); - /* Initialize input matrices */ - init_matrix(A_matrix, N, N, -156, 427, -219, core_id); - init_matrix_zeros(AT_matrix, N, N, core_id); - init_matrix_zeros(M_matrix, N, N, core_id); - -#ifndef SCHEDULING - - init_matrix_zeros(L_matrix, N, N, core_id); - mempool_barrier(num_cores); - /* Create positive definite matrix */ - if (core_id == 0) { - transpose(A_matrix, AT_matrix, N); - matrixmult(AT_matrix, A_matrix, M_matrix, N); - printf("Done initialization.\n"); - } - mempool_barrier(num_cores); -#ifdef LINEARSOLVER - init_matrix(In, 1, N, -156, 427, -219, core_id); - mempool_barrier(num_cores); -#endif - -#else - - init_matrix_zeros(In_matrix, N, N_BANKS, core_id); - init_matrix_zeros(LL_matrix, N_ROW * N, N_BANKS, core_id); - init_matrix_zeros(LR_matrix, N_ROW * N, N_BANKS, core_id); - mempool_barrier(num_cores); - /* Create positive definite matrix */ +// Initialize +#if defined(SCHEDULING) if (core_id == 0) { - transpose(A_matrix, AT_matrix, N); - matrixmult(AT_matrix, A_matrix, M_matrix, N); - for (uint32_t idx_col = 0; idx_col < N_COL; idx_col++) { - for (uint32_t i = 0; i < N; 
i++) { - for (uint32_t j = 0; j < N; j++) { - In_matrix[idx_col * N + i * N_BANKS + j] = M_matrix[i * N + j]; + for (uint32_t i = 0; i < matrix_N; i++) { + for (uint32_t idx_col = 0; idx_col < N_COL; idx_col++) { + l1_yy[idx_col * matrix_N + i] = l2_y[i]; + for (uint32_t j = 0; j < matrix_N; j++) { + l1_AA[idx_col * matrix_N + i * N_BANKS + j] = l2_A[i * matrix_N + j]; } } } - printf("Done initialization.\n"); + for (uint32_t i = 0; i < N_ROW * matrix_N * N_BANKS; i++) { + l1_LL[i] = 0; + l1_LR[i] = 0; + } + } +#else + if (core_id == 0) { + dma_memcpy_blocking(l1_A, l2_A, matrix_N * matrix_N * sizeof(int32_t)); + dma_memcpy_blocking(l1_L, l2_L, matrix_N * matrix_N * sizeof(int32_t)); + dma_memcpy_blocking(l1_y, l2_y, matrix_N * sizeof(int32_t)); } - mempool_barrier(num_cores); -#ifdef LINEARSOLVER - init_matrix(In, 1, N_BANKS, -156, 427, -219, core_id); - mempool_barrier(num_cores); -#endif - #endif - return; -} - -/* BENCHMARK */ - -int main() { - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - mempool_barrier_init(core_id); - - initialize(); + mempool_barrier(num_cores); + // Benchmark #if defined(SINGLE) if (core_id == 0) { mempool_start_benchmark(); - for (uint32_t i = 0; i < N_ITR; i++) { -#ifndef LINEARSOLVER - // TEST #1 SINGLE-CORE CHOLESKY DECOMPOSITION - mempool_cholesky_crout_q32s(M_matrix, L_matrix, N); -#else - // TEST #2 SINGLE-CORE LINEAR-SYSTEM SOLUTION - mempool_linearsolver_q32s(M_matrix, L_matrix, In, N); - mempool_uprtrisolver_q32s(L_matrix, In, N); -#endif - } + // TEST #1 SINGLE-CORE CHOLESKY DECOMPOSITION + mempool_cholesky_crout_q32s(l1_A, l1_L, matrix_N); + // // TEST #2 SINGLE-CORE LINEAR-SYSTEM SOLUTION + // mempool_linearsolver_q32s(l1_A, l1_L, l1_y, matrix_N); + // mempool_uprtrisolver_q32s(l1_L, l1_y, matrix_N); mempool_stop_benchmark(); } mempool_barrier(num_cores); #endif -#if defined(PARALLEL) && !defined(SCHEDULING) -#ifndef LINEARSOLVER +#if defined(PARALLEL) // TEST #3 PARALLEL 
CHOLESKY DECOMPOSITION + // No trivial parallelization of linearsolver kernels mempool_start_benchmark(); - mempool_cholesky_q32p(M_matrix, L_matrix, N); + mempool_cholesky_q32p(l1_A, l1_L, matrix_N); mempool_stop_benchmark(); mempool_barrier(num_cores); -#else -// No trivial parallelization of linearsolver kernels -#endif - mempool_barrier(num_cores); #endif -#if defined(PARALLEL) && defined(SCHEDULING) - uint32_t nPE = (N / 4); - if (nPE > 1) { - /* Each decomposition is finely-grained parallelized over multiple cores */ - if (core_id < N_COL * nPE) { - mempool_start_benchmark(); -#ifndef LINEARSOLVER - // TEST #4 FINE-GRAINED PARALLEL CHOLESKY DECOMPOSITION x N_ROW x N_COL - mempool_cholesky_fold_schedule_q32p(In_matrix, In_matrix, LL_matrix, - LR_matrix, N, N_ROW, N_COL); -#else - // TEST #5 FINE-GRAINED PARALLEL LINEAR-SYSTEM SOLUTION x N_ROW x N_COL - mempool_linearsolver_fold_q32p(In_matrix, In_matrix, LL_matrix, LR_matrix, - In, N, N_ROW, N_COL); -#endif - mempool_stop_benchmark(); - } - } - if (nPE == 1) { - /* The decomposition is executed with a single-core. Each core gets a - * different input problem. This is the specific case of the 4x4 matrix. 
*/ - if (core_id < N_COL * nPE) { - mempool_start_benchmark(); -#ifndef LINEARSOLVER - // TEST #6 SINGLE-CORE CHOLESKY DECOMPOSITION x N_ROW x N_COL - mempool_cholesky_schedule_q32s(In_matrix, LL_matrix, N, N_ROW, N_COL); -#else - // TEST #7 SINGLE-CORE LINEAR-SYSTEM SOLUTION x N_ROW x N_COL - mempool_linearsolver_q32s(In_matrix, LL_matrix, In, N, N_ROW, N_COL); -#endif - mempool_stop_benchmark(); - } +#if defined(SCHEDULING) + /* Each decomposition is finely-grained parallelized over multiple cores */ + uint32_t nPE = (matrix_N / 4); + if ((nPE > 1) && (core_id < N_COL * nPE)) { + mempool_start_benchmark(); + // TEST #4 FINE-GRAINED PARALLEL CHOLESKY DECOMPOSITION x N_ROW x N_COL + mempool_cholesky_fold_schedule_q32p(l1_AA, l1_AA, l1_LL, l1_LR, matrix_N, + N_ROW, N_COL); + // // TEST #5 FINE-GRAINED PARALLEL LINEAR-SYSTEM SOLUTION x N_ROW x N_COL + // mempool_linearsolver_fold_q32p(l1_AA, l1_AA, l1_LL, l1_LR, l1_yy, + // matrix_N, N_ROW, N_COL); + mempool_stop_benchmark(); } mempool_barrier(num_cores); + + /* The decomposition is executed with a single-core. Each core gets a + * different input problem. This is the specific case of the 4x4 matrix. */ + if ((nPE == 1) && (core_id < N_COL * nPE)) { + mempool_start_benchmark(); + // TEST #6 SINGLE-CORE CHOLESKY DECOMPOSITION x N_ROW x N_COL + mempool_cholesky_schedule_q32s(l1_AA, l1_LL, matrix_N, N_ROW, N_COL); + // // TEST #7 SINGLE-CORE LINEAR-SYSTEM SOLUTION x N_ROW x N_COL + // mempool_linearsolver_q32s(l1_AA, l1_LL, l1_yy, matrix_N, N_ROW, N_COL); + mempool_stop_benchmark(); + } #endif return 0; diff --git a/software/apps/baremetal/dotp_i32/define.h b/software/apps/baremetal/dotp_i32/define.h deleted file mode 100644 index d2b069d21..000000000 --- a/software/apps/baremetal/dotp_i32/define.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#define LEN (1024) -#define N_PE (NUM_CORES) -#define N_BANK (NUM_CORES * 4) -#define N_BANK_PE (N_PE * 4) - -/* Enable log barriers */ -#define LOG_BARRIERS - -/* STEP core 0 reduction */ -#define STEP (256) -#define STEP_CORES (STEP / 4) - -////////////////////////////////// -/* SELECT ONE */ - -// #define SINGLE -// #define SINGLE_UNROLLED - -// #define PARALLEL -// #define PARALLEL_UNROLLED - -// #define PARALLEL_LOCAL -// #define LOCAL_UNROLLED - -// #define PARALLEL_RED0 -// #define PARALLEL_UNROLLED_RED0 - -// #define PARALLEL_REDTREE -// #define PARALLEL_UNROLLED_REDTREE - -////////////////////////////////// - -// Vectors for kernel computation -int32_t vector_a[LEN] __attribute__((aligned(LEN), section(".l1"))); -int32_t vector_b[LEN] __attribute__((aligned(LEN), section(".l1"))); - -#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \ - defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE) -int32_t sum[N_BANK] __attribute__((aligned(N_BANK), section(".l1"))); -#else -int32_t sum __attribute__((section(".l1"))); -#endif - -// Vectors for performance metrics -uint32_t volatile red_barrier[NUM_CORES * 4] - __attribute__((aligned(NUM_CORES * 4), section(".l1"))); -int32_t result __attribute__((section(".l1"))); -int32_t check __attribute__((section(".l1"))); -int volatile error __attribute__((section(".l1"))); diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel.h b/software/apps/baremetal/dotp_i32/dotp_parallel.h deleted file mode 100644 index b765f6987..000000000 --- a/software/apps/baremetal/dotp_i32/dotp_parallel.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* Parallel dot-product */ -void dotp_parallel(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, - uint32_t nPE) { - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t step = Len / nPE; - - register int32_t local_sum = 0; - register int32_t a, b; - for (uint32_t i = core_id * step; i < core_id * step + step; i++) { - a = in_a[i]; - b = in_b[i]; - local_sum += a * b; - } - mempool_stop_benchmark(); - mempool_start_benchmark(); - __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED); -#ifdef LOG_BARRIERS - mempool_log_barrier(2, core_id); - (void)num_cores; -#else - mempool_barrier(num_cores); -#endif -} - -/* Parallel dot-product */ -void dotp_parallel_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len, uint32_t nPE) { - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t step = Len / nPE; - uint32_t reminder = step % 4; - uint32_t i; - - register int32_t a0 = 0, b0 = 0, a1 = 0, b1 = 0, a2 = 0, b2 = 0, a3 = 0, - b3 = 0; - register int32_t local_sum0 = 0; - register int32_t local_sum1 = 0; - register int32_t local_sum2 = 0; - register int32_t local_sum3 = 0; - for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) { - a0 = in_a[i]; - b0 = in_b[i]; - a1 = in_a[i + 1]; - b1 = in_b[i + 1]; - a2 = in_a[i + 2]; - b2 = in_b[i + 2]; - a3 = in_a[i + 3]; - b3 = in_b[i + 3]; - local_sum0 += a0 * b0; - local_sum1 += a1 * b1; - local_sum2 += a2 * b2; - local_sum3 += a3 * b3; - } - i = core_id * step + step - reminder; - while (i < step) { - a0 = in_a[i]; - b0 = in_b[i]; - local_sum0 += a0 * b0; - i++; - } - local_sum0 += local_sum1; - local_sum2 += local_sum3; - local_sum0 += local_sum2; - mempool_barrier(num_cores); - - mempool_stop_benchmark(); - mempool_start_benchmark(); - __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED); -#ifdef 
LOG_BARRIERS - mempool_log_barrier(2, core_id); -#else - mempool_barrier(num_cores); -#endif -} diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_local.h b/software/apps/baremetal/dotp_i32/dotp_parallel_local.h deleted file mode 100644 index 950955832..000000000 --- a/software/apps/baremetal/dotp_i32/dotp_parallel_local.h +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* - Parallel dot-product with final reduction performed by multiple cores - using atomic-fetch and adds to a single memory location. - A) Parallelized workload - B) Atomic fetch and add to a single memory location - C) Barrier */ - -/*******************************************************/ -/** MULTI-CORE **/ -/*******************************************************/ - -/* Parallel dot-product */ -void dotp_parallel_local(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, - uint32_t nPE) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - - if (nPE == num_cores) { - register int32_t local_sum = 0; - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - local_sum += in_a[idx] * in_b[idx]; - local_sum += in_a[idx + 1] * in_b[idx + 1]; - local_sum += in_a[idx + 2] * in_b[idx + 2]; - local_sum += in_a[idx + 3] * in_b[idx + 3]; - idx += N_BANK; - } - if (core_id == (Len % N_BANK) / 4) { - while (idx < Len) { - local_sum += in_a[idx] * in_b[idx]; - idx++; - } - } - mempool_stop_benchmark(); - mempool_start_benchmark(); - __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED); -#ifdef LOG_BARRIERS - mempool_log_barrier(2, core_id); -#else - mempool_barrier(num_cores); -#endif - } else { - register int32_t local_sum = 0; - uint32_t idx = core_id * 4; - while 
(idx < idx_stop) { - local_sum += in_a[idx] * in_b[idx]; - local_sum += in_a[idx + 1] * in_b[idx + 1]; - local_sum += in_a[idx + 2] * in_b[idx + 2]; - local_sum += in_a[idx + 3] * in_b[idx + 3]; - idx += N_BANK_PE; - } - if (core_id == (Len % N_BANK_PE) / 4) { - while (idx < Len) { - local_sum += in_a[idx] * in_b[idx]; - idx++; - } - } - if (core_id < nPE) { - mempool_stop_benchmark(); - mempool_start_benchmark(); - } - __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED); -#ifdef LOG_BARRIERS - mempool_log_partial_barrier(2, core_id, nPE); -#else - mempool_barrier(num_cores); -#endif - } -} - -/* Parallel dot-product with loop unrolling */ -void dotp_parallel_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len, uint32_t nPE) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - register int32_t local_sum_1 = 0; - register int32_t local_sum_2 = 0; - register int32_t local_sum_3 = 0; - register int32_t local_sum_4 = 0; - - if (nPE == num_cores) { - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - int32_t in_a1 = in_a[idx]; - int32_t in_b1 = in_b[idx]; - int32_t in_a2 = in_a[idx + 1]; - int32_t in_b2 = in_b[idx + 1]; - int32_t in_a3 = in_a[idx + 2]; - int32_t in_b3 = in_b[idx + 2]; - int32_t in_a4 = in_a[idx + 3]; - int32_t in_b4 = in_b[idx + 3]; - local_sum_1 += in_a1 * in_b1; - local_sum_2 += in_a2 * in_b2; - local_sum_3 += in_a3 * in_b3; - local_sum_4 += in_a4 * in_b4; - idx += N_BANK; - } - if (core_id == ((Len % N_BANK) / 4)) { - while (idx < Len) { - local_sum_1 += in_a[idx] * in_b[idx]; - idx++; - } - } - local_sum_1 += local_sum_2; - local_sum_3 += local_sum_4; - local_sum_1 += local_sum_3; - mempool_stop_benchmark(); - mempool_start_benchmark(); - __atomic_fetch_add(&s[0], local_sum_1, __ATOMIC_RELAXED); -#ifdef LOG_BARRIERS - mempool_log_barrier(2, core_id); -#else - mempool_barrier(num_cores); 
-#endif - } else { - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - int32_t in_a1 = in_a[idx]; - int32_t in_b1 = in_b[idx]; - int32_t in_a2 = in_a[idx + 1]; - int32_t in_b2 = in_b[idx + 1]; - int32_t in_a3 = in_a[idx + 2]; - int32_t in_b3 = in_b[idx + 2]; - int32_t in_a4 = in_a[idx + 3]; - int32_t in_b4 = in_b[idx + 3]; - local_sum_1 += in_a1 * in_b1; - local_sum_2 += in_a2 * in_b2; - local_sum_3 += in_a3 * in_b3; - local_sum_4 += in_a4 * in_b4; - idx += N_BANK_PE; - } - if (core_id == ((Len % N_BANK_PE) / 4)) { - while (idx < Len) { - local_sum_1 += in_a[idx] * in_b[idx]; - idx++; - } - } - local_sum_1 += local_sum_2; - local_sum_3 += local_sum_4; - local_sum_1 += local_sum_3; - mempool_stop_benchmark(); - mempool_start_benchmark(); - __atomic_fetch_add(&s[0], local_sum_1, __ATOMIC_RELAXED); -#ifdef LOG_BARRIERS - mempool_log_partial_barrier(2, core_id, nPE); -#else - mempool_barrier(num_cores); -#endif - } -} diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h b/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h deleted file mode 100644 index 0ad166d41..000000000 --- a/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* - Parallel dot-product with atomic fetch and add towards local memory - locations and final reduction by a single core. The cores write in - memory banks separated by a "step". 
- A) Parallelized workload - B) Atomic fetch and add to local memory banks - C) Barrier - D) Final reduction by core 0 incorporated in a barrier */ - -/*******************************************************/ -/** MULTI-CORE **/ -/*******************************************************/ - -/* Parallel dot-product */ -void dotp_parallel_red0(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - int32_t local_sum = 0; - - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - local_sum += in_a[idx] * in_b[idx]; - local_sum += in_a[idx + 1] * in_b[idx + 1]; - local_sum += in_a[idx + 2] * in_b[idx + 2]; - local_sum += in_a[idx + 3] * in_b[idx + 3]; - idx += N_BANK; - } - if (core_id == (Len % N_BANK) / 4) { - while (idx < Len) { - local_sum += in_a[idx] * in_b[idx]; - idx++; - } - } - __atomic_fetch_add(&s[(core_id / STEP_CORES) * STEP], local_sum, - __ATOMIC_RELAXED); - mempool_stop_benchmark(); - - mempool_start_benchmark(); - if ((num_cores - 1) == - __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) { - __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED); - __sync_synchronize(); // Full memory barrier - uint32_t idx_red = 0; - local_sum = 0; - while (idx_red < N_BANK) { - local_sum += s[idx_red]; - idx_red += STEP; - } - s[0] = local_sum; - wake_up_all(); - } - mempool_wfi(); -} - -/* Parallel dot-product with loop unrolling */ -void dotp_parallel_unrolled4_red0(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - int32_t local_sum_1 = 0; - int32_t local_sum_2 = 0; - int32_t local_sum_3 = 0; - int32_t local_sum_4 = 0; - - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - int32_t 
in_a1 = in_a[idx]; - int32_t in_b1 = in_b[idx]; - int32_t in_a2 = in_a[idx + 1]; - int32_t in_b2 = in_b[idx + 1]; - int32_t in_a3 = in_a[idx + 2]; - int32_t in_b3 = in_b[idx + 2]; - int32_t in_a4 = in_a[idx + 3]; - int32_t in_b4 = in_b[idx + 3]; - local_sum_1 += in_a1 * in_b1; - local_sum_2 += in_a2 * in_b2; - local_sum_3 += in_a3 * in_b3; - local_sum_4 += in_a4 * in_b4; - idx += N_BANK; - } - if (core_id == ((Len % N_BANK) / 4)) { - while (idx < Len) { - local_sum_1 += in_a[idx] * in_b[idx]; - idx++; - } - } - local_sum_1 += local_sum_2; - local_sum_3 += local_sum_4; - local_sum_1 += local_sum_3; - __atomic_fetch_add(&s[(core_id / STEP_CORES) * STEP], local_sum_1, - __ATOMIC_RELAXED); - mempool_stop_benchmark(); - - mempool_start_benchmark(); - if ((num_cores - 1) == - __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) { - __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED); - __sync_synchronize(); // Full memory barrier - uint32_t idx_red = 0; - local_sum_1 = 0; - while (idx_red < N_BANK) { - local_sum_1 += s[idx_red]; - idx_red += STEP; - } - s[0] = local_sum_1; - wake_up_all(); - } - mempool_wfi(); -} diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h b/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h deleted file mode 100644 index 3659de0a3..000000000 --- a/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* - Parallel dot-product with atomic fetch and add towards local memory - locations and final reduction by a single core. The cores write in - memory banks separated by a "step". 
- A) Parallelized workload - B) Atomic fetch and add to local memory banks - C) Barrier - D) Final reduction by core 0 incorporated in a barrier */ - -/*******************************************************/ -/** MULTI-CORE **/ -/*******************************************************/ - -void mempool_log_reduction(int32_t *sum, uint32_t volatile step, - uint32_t core_id); - -/* Parallel dot-product */ -void dotp_parallel_redtree(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - - register int32_t local_sum = 0; - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - local_sum += in_a[idx] * in_b[idx]; - local_sum += in_a[idx + 1] * in_b[idx + 1]; - local_sum += in_a[idx + 2] * in_b[idx + 2]; - local_sum += in_a[idx + 3] * in_b[idx + 3]; - idx += N_BANK; - } - if (core_id == (Len % N_BANK) / 4) { - while (idx < Len) { - local_sum += in_a[idx] * in_b[idx]; - idx++; - } - } - s[core_id * 4] = local_sum; // Each core is storing locally - mempool_stop_benchmark(); - mempool_start_benchmark(); - mempool_log_reduction(s, 2, core_id); -} - -void dotp_parallel_redtree_unrolled(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - register int32_t local_sum_1 = 0; - register int32_t local_sum_2 = 0; - register int32_t local_sum_3 = 0; - register int32_t local_sum_4 = 0; - - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - int32_t in_a1 = in_a[idx]; - int32_t in_b1 = in_b[idx]; - int32_t in_a2 = in_a[idx + 1]; - int32_t in_b2 = in_b[idx + 1]; - int32_t in_a3 = in_a[idx + 2]; - int32_t in_b3 = in_b[idx + 2]; - int32_t in_a4 = in_a[idx + 3]; - int32_t in_b4 = in_b[idx + 3]; - local_sum_1 += in_a1 * in_b1; - local_sum_2 += in_a2 * in_b2; - local_sum_3 += in_a3 * in_b3; - local_sum_4 += in_a4 * 
in_b4; - idx += N_BANK; - } - if (core_id == ((Len % N_BANK) / 4)) { - while (idx < Len) { - local_sum_1 += in_a[idx] * in_b[idx]; - idx++; - } - } - local_sum_1 += local_sum_2; - local_sum_3 += local_sum_4; - local_sum_1 += local_sum_3; - s[core_id * 4] = local_sum_1; // Each core is storing locally - mempool_stop_benchmark(); - mempool_start_benchmark(); - mempool_log_reduction(s, 2, core_id); -} - -void mempool_log_reduction(int32_t *sum, uint32_t volatile step, - uint32_t core_id) { - - uint32_t idx_sum, idx = (step * (core_id / step)) * 4; - uint32_t next_step, previous_step; - register int32_t local_sum; - uint32_t num_cores = mempool_get_core_count(); - - previous_step = step >> 1; - if ((step - previous_step) == - __atomic_fetch_add(&red_barrier[idx + previous_step - 1], previous_step, - __ATOMIC_RELAXED)) { - - local_sum = 0; - idx_sum = idx; - while (idx_sum < idx + step * 4) { - local_sum += sum[idx_sum]; - idx_sum += previous_step * 4; - } - sum[idx] = local_sum; - - next_step = step << 1; - __atomic_store_n(&red_barrier[idx + previous_step - 1], 0, - __ATOMIC_RELAXED); - if (num_cores == step) { - sum[0] = sum[idx]; - __sync_synchronize(); // Full memory barrier - wake_up_all(); - mempool_wfi(); - } else { - mempool_log_reduction(sum, next_step, core_id); - } - - } else - mempool_wfi(); -} diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c index f7cf7508f..ddc1ef141 100644 --- a/software/apps/baremetal/dotp_i32/main.c +++ b/software/apps/baremetal/dotp_i32/main.c @@ -8,132 +8,72 @@ #include #include +#include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include "define.h" +#include "data_dotp_i32.h" +#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) +#define LOG_BARRIERS +// #define ATOMIC_REDUCTION +// #define SINGLE_CORE_REDUCTION +#define BINARY_REDUCTION -#include "dotp_parallel.h" -#include "dotp_parallel_local.h" -#include "dotp_parallel_red0.h" 
-#include "dotp_parallel_redtree.h" -#include "dotp_single.h" +// Vectors for kernel computation +int32_t l1_X[array_N] __attribute__((aligned(array_N), section(".l1_prio"))); +int32_t l1_Y[array_N] __attribute__((aligned(array_N), section(".l1_prio"))); +uint32_t red_barrier[NUM_BANKS] + __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +int32_t sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); -void init_vectors(int32_t *in_a, int32_t *in_b, int32_t *s, int32_t *p_result, - int32_t *p_check, uint32_t Len) { - *p_result = 0; - *p_check = 0; - uint32_t j = 0; - uint32_t num_cores = mempool_get_core_count(); - while (j < Len) { - int32_t a = (int32_t)(j % num_cores); - int32_t b = (int32_t)(j % 4 + 3); - in_a[j] = a; - in_b[j] = b; - *p_check = *p_check + (int32_t)(a * b); - j++; - } -#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \ - defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE) - for (uint32_t k = 0; k < N_BANK; k++) { - s[k] = 0; - red_barrier[k] = 0; - } -#else - *s = 0; -#endif -} +#include "baremetal/mempool_dotp_i32p.h" +#include "baremetal/mempool_dotp_i32s.h" int main() { uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); uint32_t time_init, time_end; - // initialize synchronization variables mempool_barrier_init(core_id); + time_init = 0; + time_end = 0; if (core_id == 0) { - error = 0; - time_init = 0; - time_end = 0; -#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \ - defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE) - init_vectors(vector_a, vector_b, sum, &result, &check, LEN); -#else - init_vectors(vector_a, vector_b, &sum, &result, &check, LEN); -#endif + dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t)); + dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t)); + } + for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) { + sum[k] = 0; + red_barrier[k] = 0; } - mempool_barrier(num_cores); // wait 
until all cores have finished + mempool_barrier(num_cores); - // Kernel execution + // // SINGLE-CORE + // time_init = mempool_get_timer(); + // dotp_i32s_unrolled4(l1_A, l1_B, sum, array_N); + // time_end = mempool_get_timer(); - time_init = mempool_get_timer(); -#ifdef SINGLE - dotp_single(vector_a, vector_b, &sum, LEN); -#elif defined(SINGLE_UNROLLED) - dotp_single_unrolled4(vector_a, vector_b, &sum, LEN); -#endif - time_end = mempool_get_timer(); + // // PARALLEL + // time_init = mempool_get_timer(); + // dotp_i32p(l1_A, l1_B, sum, array_N, num_cores); + // time_end = mempool_get_timer(); + // PARALLEL, LOCAL ACCESSES time_init = mempool_get_timer(); - mempool_start_benchmark(); -/* A) Parallelized workload - B) Atomic fetch and add to a single memory location - C) Barrier */ -#ifdef PARALLEL - dotp_parallel(vector_a, vector_b, &sum, LEN, N_PE); -#elif defined(PARALLEL_UNROLLED) - dotp_parallel_unrolled4(vector_a, vector_b, &sum, LEN, N_PE); -/* A) Parallelized workload - B) Atomic fetch and add to local memory banks - C) Barrier - D) Final reduction by core 0 incorporated in a barrier */ -#elif defined(PARALLEL_RED0) - dotp_parallel_red0(vector_a, vector_b, sum, LEN, N_PE); -#elif defined(PARALLEL_UNROLLED_RED0) - dotp_parallel_unrolled4_red0(vector_a, vector_b, sum, LEN, N_PE); -/* A) Parallelized workload - B) Nested set of barriers: reduction is performed in a logarithmic tree. 
*/ -#elif defined(PARALLEL_REDTREE) - dotp_parallel_redtree(vector_a, vector_b, sum, LEN, N_PE); -#elif defined(PARALLEL_UNROLLED_REDTREE) - dotp_parallel_redtree_unrolled(vector_a, vector_b, sum, LEN, N_PE); -#endif - mempool_stop_benchmark(); + dotp_i32p_local_unrolled4(l1_X, l1_Y, sum, array_N); time_end = mempool_get_timer(); - /* A) Parallelized workload - B) Atomic fetch and add to a single memory location - C) Barrier */ - if (core_id < N_PE) { - time_init = mempool_get_timer(); - mempool_start_benchmark(); -#ifdef PARALLEL_LOCAL - dotp_parallel_local(vector_a, vector_b, &sum, LEN, N_PE); -#elif defined(LOCAL_UNROLLED) - dotp_parallel_local_unrolled4(vector_a, vector_b, &sum, LEN, N_PE); -#endif - mempool_stop_benchmark(); - time_end = mempool_get_timer(); - } - - mempool_barrier(num_cores); // Check results + mempool_barrier(num_cores); if (core_id == 0) { uint32_t clock_cycles = (time_end - time_init); -#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \ - defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE) - result = sum[0]; -#else - result = sum; -#endif printf("\nKernel execution takes %d clock cycles\n", clock_cycles); - printf("Result ==> %d\n", result); - printf("Check ==> %d\n\n", check); + printf("Result ==> %d\n", sum[0]); + printf("Check ==> %d\n\n", l2_Z); } mempool_barrier(num_cores); - return error; + return 0; } diff --git a/software/apps/baremetal/matmul_f16/main.c b/software/apps/baremetal/matmul_f16/main.c index b3b474b1d..99a0269cc 100644 --- a/software/apps/baremetal/matmul_f16/main.c +++ b/software/apps/baremetal/matmul_f16/main.c @@ -34,8 +34,10 @@ int main() { // Initialize Matrices 1 if (core_id == 0) { - dma_memcpy_blocking(matrix_a, A, (matrix_M * matrix_N) * sizeof(int16_t)); - dma_memcpy_blocking(matrix_b, B, (matrix_N * matrix_P) * sizeof(int16_t)); + dma_memcpy_blocking(matrix_a, l2_A, + (matrix_M * matrix_N) * sizeof(int16_t)); + dma_memcpy_blocking(matrix_b, l2_B, + (matrix_N * matrix_P) * 
sizeof(int16_t)); } mempool_barrier(num_cores); @@ -59,7 +61,7 @@ int main() { mempool_stop_benchmark(); #endif - mempool_check_f16(matrix_c, C, matrix_M * matrix_P, 0.5f, 0); + mempool_check_f16(matrix_c, l2_C, matrix_M * matrix_P, 0.5f, 0); mempool_barrier(num_cores); return 0; } diff --git a/software/apps/baremetal/matmul_f32/main.c b/software/apps/baremetal/matmul_f32/main.c index bc391200f..d3d7622db 100644 --- a/software/apps/baremetal/matmul_f32/main.c +++ b/software/apps/baremetal/matmul_f32/main.c @@ -30,13 +30,14 @@ int main() { uint32_t num_cores = mempool_get_core_count(); mempool_barrier_init(core_id); - // Initialize Matrices + // Initialize data if (core_id == 0) { - dma_memcpy_blocking(matrix_a, A, matrix_M * matrix_N * sizeof(int32_t)); - dma_memcpy_blocking(matrix_b, B, matrix_N * matrix_P * sizeof(int32_t)); + dma_memcpy_blocking(matrix_a, l2_A, matrix_M * matrix_N * sizeof(int32_t)); + dma_memcpy_blocking(matrix_b, l2_B, matrix_N * matrix_P * sizeof(int32_t)); } mempool_barrier(num_cores); + // Benchmark #if defined(SINGLE) if (core_id == 0) { // Execute function to test. 
@@ -57,7 +58,7 @@ int main() { mempool_stop_benchmark(); #endif - mempool_check_f32(matrix_c, C, matrix_M * matrix_P, 0.01f, 0); + mempool_check_f32(matrix_c, l2_C, matrix_M * matrix_P, 0.01f, 0); mempool_barrier(num_cores); return 0; } diff --git a/software/apps/baremetal/matmul_i16/main.c b/software/apps/baremetal/matmul_i16/main.c index 5fe981858..a2b554dfa 100644 --- a/software/apps/baremetal/matmul_i16/main.c +++ b/software/apps/baremetal/matmul_i16/main.c @@ -7,135 +7,46 @@ #include #include -#include "baremetal/mempool_matmul_i16p.h" +#include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -// Define Matrix dimensions: -// C = AB with A=[MxN], B=[NxP], C=[MxP] -#define matrix_M 64 -#define matrix_N 64 -#define matrix_P 64 - -int16_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); -int16_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); -int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); +#include "baremetal/mempool_checks.h" +#include "baremetal/mempool_matmul_i16p.h" +#include "data_matmul_i16.h" -int volatile error __attribute__((section(".l1"))); +int16_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); +int16_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); +int32_t l1_C[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); -void init_matrix(int16_t *matrix, uint32_t num_rows, uint32_t num_columns, - int16_t a, int16_t b, int16_t c, uint32_t core_id, - uint32_t num_cores) { - uint32_t const split = 8; // How many rows/columns to split the matrix into - if (num_columns > num_rows) { - // Parallelize over columns - uint32_t const c_start = (num_rows / split) * (core_id % split); - uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1); - for (uint32_t j = (core_id / split); j < num_columns; - j += (num_cores / split)) { - for (uint32_t i = c_start; i < c_end; ++i) { - matrix[i * num_columns 
+ j] = - (int16_t)(a * (int16_t)i + b * (int16_t)j + c); - } - } - } else { - // Parallelize over rows - uint32_t const c_start = (num_columns / split) * (core_id % split); - uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1); - for (uint32_t i = (core_id / split); i < num_rows; - i += (num_cores / split)) { - for (uint32_t j = c_start; j < c_end; ++j) { - matrix[i * num_columns + j] = - (int16_t)(a * (int16_t)i + b * (int16_t)j + c); - } - } - } -} +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); -// Initialize the matrices in parallel -int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t inner_dim, int16_t aa, int16_t ab, int16_t ac, - int16_t ba, int16_t bb, int16_t bc, uint32_t core_id, - uint32_t num_cores) { - // Convert to signed - int32_t n = (int32_t)inner_dim; - // Parallelize over rows - for (uint32_t i = core_id; i < num_rows; i += num_cores) { - for (uint32_t j = 0; j < num_columns; ++j) { - int32_t ii = (int32_t)i; - int32_t jj = (int32_t)j; - int32_t lin = ((int32_t)aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + - (int32_t)ac * bc) * - n; - int32_t qua = - (((int32_t)aa * ba * ii + ab * bb * jj + ab * bc + (int32_t)ba * ac) * - (n * (n - 1))) / - 2; - int32_t cub = (((int32_t)ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6; - int32_t golden = lin + qua + cub; - if (matrix[i * num_columns + j] != golden) { - return (i + j) == 0 ? 
-1 : (int)(i * num_columns + j); - } - matrix[i * num_columns + j] = 0; - } + // Initialize data + if (core_id == 0) { + dma_memcpy_blocking(l1_A, l2_A, matrix_M * matrix_N * sizeof(int16_t)); + dma_memcpy_blocking(l1_B, l2_B, matrix_N * matrix_P * sizeof(int16_t)); } - return 0; -} - -int test_matrix_multiplication(int16_t *__restrict__ A, int16_t *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, uint32_t N, - uint32_t P, uint32_t core_id, - uint32_t num_cores) { - int16_t const A_a = 1; - int16_t const A_b = 1; - int16_t const A_c = -40; - int16_t const B_a = 0; - int16_t const B_b = 1; - int16_t const B_c = 19; - - // Initialize Matrices - init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores); - // Wait at barrier until everyone is ready mempool_barrier(num_cores); - // Execute function to test. - mempool_start_benchmark(); + // Benchmark + mempool_start_benchmark(); #ifdef __XPULPIMG - matmul_unrolled_4x2_pincr_asm_parallel_i16_xpulpv2(A, B, C, M, N, P, core_id, - num_cores); + matmul_unrolled_4x2_pincr_asm_parallel_i16_xpulpv2( + l1_A, l1_B, l1_C, matrix_M, matrix_N, matrix_P, core_id, num_cores); #else - matmul_unrolled_2x2_parallel_i16_rv32im(A, B, C, M, N, P, core_id, num_cores); + matmul_unrolled_2x2_parallel_i16_rv32im(l1_A, l1_B, l1_C, matrix_M, matrix_N, + matrix_P, core_id, num_cores); #endif - mempool_stop_benchmark(); - // Wait at barrier befor checking mempool_barrier(num_cores); - if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id, - num_cores)) { - error = 1; - return -1; - } - return 0; -} -int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - if (core_id == 0) { - error = 0; - } - - // Test the Matrix multiplication - test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N, - matrix_P, core_id, num_cores); - // wait 
until all cores have finished + // Verify results + mempool_check_i32(l1_C, l2_C, matrix_M * matrix_P, 0, 0); mempool_barrier(num_cores); - - return error; + return 0; } diff --git a/software/apps/baremetal/matmul_i32/main.c b/software/apps/baremetal/matmul_i32/main.c index 65e2b82f1..3713dcabe 100644 --- a/software/apps/baremetal/matmul_i32/main.c +++ b/software/apps/baremetal/matmul_i32/main.c @@ -7,131 +7,46 @@ #include #include -#include "baremetal/mempool_matmul_i32p.h" +#include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -// Define Matrix dimensions: -// C = AB with A=[MxN], B=[NxP], C=[MxP] -#define matrix_M 64 -#define matrix_N 32 -#define matrix_P 64 - -int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); -int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); -int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); +#include "baremetal/mempool_checks.h" +#include "baremetal/mempool_matmul_i32p.h" +#include "data_matmul_i32.h" -int volatile error __attribute__((section(".l1"))); +int32_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); +int32_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); +int32_t l1_C[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id, - uint32_t num_cores) { - uint32_t const split = 8; // How many rows/columns to split the matrix into - if (num_columns > num_rows) { - // Parallelize over columns - uint32_t const c_start = (num_rows / split) * (core_id % split); - uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1); - for (uint32_t j = (core_id / split); j < num_columns; - j += (num_cores / split)) { - for (uint32_t i = c_start; i < c_end; ++i) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } else { - // 
Parallelize over rows - uint32_t const c_start = (num_columns / split) * (core_id % split); - uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1); - for (uint32_t i = (core_id / split); i < num_rows; - i += (num_cores / split)) { - for (uint32_t j = c_start; j < c_end; ++j) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } -} +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); -// Initialize the matrices in parallel -int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t inner_dim, int32_t aa, int32_t ab, int32_t ac, - int32_t ba, int32_t bb, int32_t bc, uint32_t core_id, - uint32_t num_cores) { - // Convert to signed - int32_t n = (int32_t)inner_dim; - // Parallelize over rows - for (uint32_t i = core_id; i < num_rows; i += num_cores) { - for (uint32_t j = 0; j < num_columns; ++j) { - int32_t ii = (int32_t)i; - int32_t jj = (int32_t)j; - int32_t lin = - (aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + ac * bc) * n; - int32_t qua = - ((aa * ba * ii + ab * bb * jj + ab * bc + ba * ac) * (n * (n - 1))) / - 2; - int32_t cub = ((ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6; - int32_t golden = lin + qua + cub; - if (matrix[i * num_columns + j] != golden) { - return (i + j) == 0 ? 
-1 : (int)(i * num_columns + j); - } - matrix[i * num_columns + j] = 0; - } + // Initialize data + if (core_id == 0) { + dma_memcpy_blocking(l1_A, l2_A, matrix_M * matrix_N * sizeof(int32_t)); + dma_memcpy_blocking(l1_B, l2_B, matrix_N * matrix_P * sizeof(int32_t)); } - return 0; -} - -int test_matrix_multiplication(int32_t *__restrict__ A, int32_t *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, uint32_t N, - uint32_t P, uint32_t core_id, - uint32_t num_cores) { - int32_t const A_a = 1; - int32_t const A_b = 1; - int32_t const A_c = -32; - int32_t const B_a = 2; - int32_t const B_b = 1; - int32_t const B_c = 16; - - // Initialize Matrices - init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores); - // Wait at barrier until everyone is ready mempool_barrier(num_cores); - // Execute function to test. - mempool_start_benchmark(); + // Benchmark + mempool_start_benchmark(); #ifdef __XPULPIMG - matmul_unrolled_2x2_parallel_i32_xpulpv2(A, B, C, M, N, P, core_id, - num_cores); + matmul_unrolled_2x2_parallel_i32_xpulpv2(l1_A, l1_B, l1_C, matrix_M, matrix_N, + matrix_P, core_id, num_cores); #else - matmul_unrolled_2x2_parallel_i32_rv32im(A, B, C, M, N, P, core_id, num_cores); + matmul_unrolled_2x2_parallel_i32_rv32im(l1_A, l1_B, l1_C, matrix_M, matrix_N, + matrix_P, core_id, num_cores); #endif - mempool_stop_benchmark(); - // Wait at barrier befor checking mempool_barrier(num_cores); - if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id, - num_cores)) { - error = 1; - return -1; - } - return 0; -} -int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - if (core_id == 0) { - error = 0; - } - - // Test the Matrix multiplication - test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N, - matrix_P, core_id, num_cores); - // wait until all cores have 
finished + // Verify results + mempool_check_i32(l1_C, l2_C, matrix_M * matrix_P, 0, 0); mempool_barrier(num_cores); - - return error; + return 0; } diff --git a/software/apps/baremetal/matmul_i8/main.c b/software/apps/baremetal/matmul_i8/main.c index 4fb557f2c..3aa99a4e6 100644 --- a/software/apps/baremetal/matmul_i8/main.c +++ b/software/apps/baremetal/matmul_i8/main.c @@ -7,137 +7,46 @@ #include #include -#include "baremetal/mempool_matmul_i8p.h" +#include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -// Define Matrix dimensions: -// C = AB with A=[MxN], B=[NxP], C=[MxP] -#define matrix_M 64 -#define matrix_N 64 -#define matrix_P 64 - -int8_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); -int8_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); -int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); +#include "baremetal/mempool_checks.h" +#include "baremetal/mempool_matmul_i8p.h" +#include "data_matmul_i8.h" -int volatile error __attribute__((section(".l1"))); +int8_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); +int8_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); +int32_t l1_C[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); -void init_matrix(int8_t *matrix, uint32_t num_rows, uint32_t num_columns, - int8_t a, int8_t b, int8_t c, uint32_t core_id, - uint32_t num_cores) { - uint32_t const split = 8; // How many rows/columns to split the matrix into - if (num_columns > num_rows) { - // Parallelize over columns - uint32_t const c_start = (num_rows / split) * (core_id % split); - uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1); - for (uint32_t j = (core_id / split); j < num_columns; - j += (num_cores / split)) { - for (uint32_t i = c_start; i < c_end; ++i) { - matrix[i * num_columns + j] = - (int8_t)(a * (int8_t)i + b * (int8_t)j + c); - } - } - } else { - // Parallelize over rows - 
uint32_t const c_start = (num_columns / split) * (core_id % split); - uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1); - for (uint32_t i = (core_id / split); i < num_rows; - i += (num_cores / split)) { - for (uint32_t j = c_start; j < c_end; ++j) { - matrix[i * num_columns + j] = - (int8_t)(a * (int8_t)i + b * (int8_t)j + c); - } - } - } -} +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); -// Initialize the matrices in parallel -int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t inner_dim, int8_t aa, int8_t ab, int8_t ac, - int8_t ba, int8_t bb, int8_t bc, uint32_t core_id, - uint32_t num_cores) { - // Convert to signed - int32_t n = (int32_t)inner_dim; - // Parallelize over rows - for (uint32_t i = core_id; i < num_rows; i += num_cores) { - for (uint32_t j = 0; j < num_columns; ++j) { - int32_t ii = (int32_t)i; - int32_t jj = (int32_t)j; - int32_t lin = ((int32_t)aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + - (int32_t)ac * bc) * - n; - int32_t qua = - (((int32_t)aa * ba * ii + ab * bb * jj + ab * bc + (int32_t)ba * ac) * - (n * (n - 1))) / - 2; - int32_t cub = (((int32_t)ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6; - int32_t golden = lin + qua + cub; - if (matrix[i * num_columns + j] != golden) { - return (i + j) == 0 ? 
-1 : (int)(i * num_columns + j); - } - matrix[i * num_columns + j] = 0; - } + // Initialize data + if (core_id == 0) { + dma_memcpy_blocking(l1_A, l2_A, matrix_M * matrix_N * sizeof(int8_t)); + dma_memcpy_blocking(l1_B, l2_B, matrix_N * matrix_P * sizeof(int8_t)); } - return 0; -} - -int test_matrix_multiplication(int8_t *__restrict__ A, int8_t *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, uint32_t N, - uint32_t P, uint32_t core_id, - uint32_t num_cores) { - int8_t const A_a = 1; - int8_t const A_b = 1; - int8_t const A_c = -40; - int8_t const B_a = 0; - int8_t const B_b = 1; - int8_t const B_c = 19; - - // Initialize Matrices - init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores); - // Wait at barrier until everyone is ready mempool_barrier(num_cores); - // Execute function to test. - mempool_start_benchmark(); + // Benchmark + mempool_start_benchmark(); #ifdef __XPULPIMG - matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2(A, B, C, M, N, P, core_id, - num_cores); - // matmul_unrolled_2x4_parallel_i8_xpulpv2(A, B, C, M, N, P, core_id, - // num_cores); + matmul_unrolled_2x4_pincr_asm_parallel_i8_xpulpv2( + l1_A, l1_B, l1_C, matrix_M, matrix_N, matrix_P, core_id, num_cores); #else - matmul_unrolled_2x2_parallel_i8_rv32im(A, B, C, M, N, P, core_id, num_cores); + matmul_unrolled_2x2_parallel_i8_rv32im(l1_A, l1_B, l1_C, matrix_M, matrix_N, + matrix_P, core_id, num_cores); #endif - mempool_stop_benchmark(); - // Wait at barrier befor checking mempool_barrier(num_cores); - if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id, - num_cores)) { - error = 1; - return -1; - } - return 0; -} -int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - if (core_id == 0) { - error = 0; - } - - // Test the Matrix multiplication - test_matrix_multiplication(matrix_a, 
matrix_b, matrix_c, matrix_M, matrix_N, - matrix_P, core_id, num_cores); - // wait until all cores have finished + // Verify results + mempool_check_i32(l1_C, l2_C, matrix_M * matrix_P, 0, 0); mempool_barrier(num_cores); - - return error; + return 0; } diff --git a/software/apps/systolic/Makefile b/software/apps/systolic/Makefile index 525b4b017..93e960434 100644 --- a/software/apps/systolic/Makefile +++ b/software/apps/systolic/Makefile @@ -14,8 +14,6 @@ RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime) include $(RUNTIME_DIR)/runtime.mk APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find $(APPS_DIR) -name "main.c")) -DATA := $(patsubst %.args,%.h,$(shell find $(APPS_DIR) -name "data.args")) -ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py)) BINARIES := $(addprefix $(BIN_DIR)/,$(APPS)) # Define the rule to build all applications @@ -26,7 +24,7 @@ $(APPS): % : $(BIN_DIR)/% $(APPS_DIR)/Makefile $(shell find $(RUNTIME_DIR)/**.{S # Check if the config is set to systolic ifeq ($(config),systolic) .PHONY: $(BINARIES) -$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) $(DATA) $(ALLPYS) update_opcodes +$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) data_%.h update_opcodes mkdir -p $(dir $@) $(RISCV_CC) -Iinclude $(RISCV_LDFLAGS) -o $@ $< $(RUNTIME) -T$(RUNTIME_DIR)/link.ld $(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) -D $@ > $@.dump diff --git a/software/data/README.md b/software/data/README.md new file mode 100644 index 000000000..9fdab87cf --- /dev/null +++ b/software/data/README.md @@ -0,0 +1,29 @@ +# Data Generation + +Data for MemPool applications is generated with the `gendata_header.py` script. +The `gendatalib.py` library generates random inputs and a reference golden model for the applications under test. +The application parameters are passed to the script with the `gendata_params.hjson` file.
+ +An example entry follows: `matmul_f32` is the name of the MemPool application under test; `type` refers to the NumPy precision; `defines` are application parameters, turned into C constant declarations of the form `#define matrix_M (16)`; `arrays` encode the C type and name of the input vectors for the application under test. + +``` + "matmul_f32": { + "type": "float32", + "defines": [ + ("matrix_M", 16) + ("matrix_N", 16) + ("matrix_P", 16) + ] + "arrays": [ + ("float", "l2_A") + ("float", "l2_B") + ("float", "l2_C") + ] + } +``` + +## To test a new application: +If a new application needs to be tested with data generated from a reference golden model: +- Add a new golden model to the existing library `gendatalib.py`. +- Add a golden model function call to `gendata_header.py`. +- Add a new item in `gendata_params.hjson` to make the function parameters configurable. diff --git a/software/data/data_cfft_radix2_q16.h.tpl b/software/data/data_cfft_radix2_q16.h.tpl deleted file mode 100644 index 6044e424d..000000000 --- a/software/data/data_cfft_radix2_q16.h.tpl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0 - -// Automatically generated by: -// data/data_cfft_radix2_q16.py - -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(int16_t) 0X{:04X}, '.format(a&0xffff) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -<% def array_to_str(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}, '.format(a) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LOG2 (${Log2Len}) -#define N_CSAMPLES (${Len}) -#define N_TWIDDLES (3 * N_CSAMPLES / 4) -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) - -// Tolerance for correctness check -#define TOLERANCE (${tolerance}) - -% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']): - -// Data arrays for matrix ${m_str} -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) ${m_str}[${2*Len}] = ${array_to_cstr(m)}; - -% endfor \ - -// Twiddles -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)}; - -// Bitreversal -uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)}; diff --git a/software/data/data_cfft_radix2_q16.py b/software/data/data_cfft_radix2_q16.py deleted file mode 100644 index e1615e53e..000000000 --- a/software/data/data_cfft_radix2_q16.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the cfft kernel. 
-# Author: Marco Bertuletti - -import numpy as np -import math as M -import argparse -import pathlib -from mako.template import Template -from sympy.combinatorics import Permutation - - -################## -# compute_result # -################## - - -def compute_result(inp, len): - """ - Funciton to generate the expected result of the testcase. - - Arguments - --------- - input: numpy array of inputs - env: Length of the input transform. - """ - - # Q16: - # len=16: Q1.15 -> Q5.11 - # len=32: Q1.15 -> Q6.10 - # len=64: Q1.15 -> Q7.9 - # len=128: Q1.15 -> Q8.8 - # len=256: Q1.15 -> Q9.7 - # len=512: Q1.15 -> Q10.6 - # len=1024: Q1.15 -> Q11.5 - # len=2048: Q1.15 -> Q12.4 - # len=4096: Q1.15 -> Q13.3 - bit_shift_dict_q16 = { - 16: 11, - 32: 10, - 64: 9, - 128: 8, - 256: 7, - 512: 6, - 1024: 5, - 2048: 4, - 4096: 3} - my_type = np.int16 - my_fixpoint = 15 - bit_shift_dict = bit_shift_dict_q16 - a = inp.astype(my_type) - result = np.zeros(a.size, dtype=my_type) - complex_a = np.zeros(int(a.size / 2), dtype=np.csingle) - complex_result = np.zeros(a.size >> 1, dtype=np.csingle) - for i in range(a.size >> 1): - complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + ( - a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j - complex_result = np.fft.fft(complex_a) - for i in range(int(a.size / 2)): - result[2 * i] = (np.real(complex_result[i]) * - (2**(bit_shift_dict[int(a.size / 2)])) - ).astype(my_type) - result[2 * i + 1] = (np.imag(complex_result[i]) * - (2**(bit_shift_dict[int(a.size / 2)])) - ).astype(my_type) - - return result - - -def compute_twiddles(length): - PI = 3.14159265358979 - N = length - twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) - for i in range(0, (int)(3 * N / 4)): - twiddleCoefq15_cos = M.cos(i * 2 * PI / N) - twiddleCoefq15_sin = M.sin(i * 2 * PI / N) - twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) - twiddleCoefq15[2 * i + - 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) - return twiddleCoefq15 
- - -def compute_bitreversal(N, R): - - # Decompose - logR2 = [] - idx = N - while (idx >= R): - logR2.append(int(M.log2(R))) - idx = idx // R - if (idx > 1): - logR2.append(int(M.log2(idx))) - - # Bitreversal - indexes = [] - for x in range(N): - result = 0 - for bits in logR2: - mask = (0xffffffff >> (32 - bits)) - result = (result << bits) | (x & mask) - x = x >> bits - indexes.append(result) - - # Create transpositions table - tps = [] - for c in Permutation.from_sequence(indexes).cyclic_form: - for i in range(len(c) - 1): - tps.append([c[i] * 8, c[-1] * 8]) - - return tps - - -def gen_data_header_file( - outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), - **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_cfft_radix2_q16.h.tpl", - help='Path to mako template') - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-d", - "--dimension", - type=int, - required=False, - default=64, - help='Input dimension' - ) - - args = parser.parse_args() - - # Create sparse matrix - Len = args.dimension - Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16) - Result = compute_result(Input, Len) - Twiddles = compute_twiddles(Len) - Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2))) - - tolerance = { - 16: 16, - 32: 20, - 64: 24, - 128: 28, - 256: 32, - 512: 48, - 1024: 64, - 2048: 96, - 4096: 128} - 
- kwargs = {'name': 'data_cfft_radix2_q16', - 'vector_inp': Input, - 'vector_res': Result, - 'vector_twi': Twiddles, - 'vector_bitrev': Bitreversal, - 'Len': Len, - 'Log2Len': int(np.log2(Len)), - 'BitrevLen': int(2 * len(Bitreversal)), - 'tolerance': tolerance[int(Len)]} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_cfft_radix4_q16.h.tpl b/software/data/data_cfft_radix4_q16.h.tpl deleted file mode 100644 index 3af1b764d..000000000 --- a/software/data/data_cfft_radix4_q16.h.tpl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Automatically generated by: -// data/data_cfft_radix4_q16.py - -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(int16_t) 0X{:04X}, '.format(a&0xffff) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -<% def array_to_str(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}, '.format(a) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LOG2 (${Log2Len}) -#define N_CSAMPLES (${Len}) -#define N_TWIDDLES (3 * N_CSAMPLES / 4) -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) - -// Maximum number of independent FFT columns allowed -#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) -// Tolerance for correctness check -#define TOLERANCE (${tolerance}) - -% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']): - -// Data arrays for matrix ${m_str} -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) ${m_str}[${2*Len}] = ${array_to_cstr(m)}; - -% endfor \ - -// Twiddles -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_q16[${int(6*Len/4)}] = 
${array_to_cstr(vector_twi)}; - -// Bitreversal -uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)}; diff --git a/software/data/data_cfft_radix4_q16.py b/software/data/data_cfft_radix4_q16.py deleted file mode 100755 index b394a2884..000000000 --- a/software/data/data_cfft_radix4_q16.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the cfft kernel. -# Author: Marco Bertuletti - -import numpy as np -import math as M -import argparse -import pathlib -from mako.template import Template -from sympy.combinatorics import Permutation - - -################## -# compute_result # -################## - - -def compute_result(inp, len): - """ - Funciton to generate the expected result of the testcase. - - Arguments - --------- - input: numpy array of inputs - env: Length of the input transform. 
- """ - - # Q16: - # len=16: Q1.15 -> Q5.11 - # len=32: Q1.15 -> Q6.10 - # len=64: Q1.15 -> Q7.9 - # len=128: Q1.15 -> Q8.8 - # len=256: Q1.15 -> Q9.7 - # len=512: Q1.15 -> Q10.6 - # len=1024: Q1.15 -> Q11.5 - # len=2048: Q1.15 -> Q12.4 - # len=4096: Q1.15 -> Q13.3 - bit_shift_dict_q16 = { - 16: 11, - 32: 10, - 64: 9, - 128: 8, - 256: 7, - 512: 6, - 1024: 5, - 2048: 4, - 4096: 3} - my_type = np.int16 - my_fixpoint = 15 - bit_shift_dict = bit_shift_dict_q16 - a = inp.astype(my_type) - result = np.zeros(a.size, dtype=my_type) - complex_a = np.zeros(int(a.size / 2), dtype=np.csingle) - complex_result = np.zeros(a.size >> 1, dtype=np.csingle) - for i in range(a.size >> 1): - complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + ( - a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j - complex_result = np.fft.fft(complex_a) - for i in range(int(a.size / 2)): - result[2 * i] = (np.real(complex_result[i]) * - (2**(bit_shift_dict[int(a.size / 2)])) - ).astype(my_type) - result[2 * i + 1] = (np.imag(complex_result[i]) * - (2**(bit_shift_dict[int(a.size / 2)])) - ).astype(my_type) - - return result - - -def compute_twiddles(length): - PI = 3.14159265358979 - N = length - twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) - for i in range(0, (int)(3 * N / 4)): - twiddleCoefq15_cos = M.cos(i * 2 * PI / N) - twiddleCoefq15_sin = M.sin(i * 2 * PI / N) - twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) - twiddleCoefq15[2 * i + - 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) - return twiddleCoefq15 - - -def compute_bitreversal(N, R): - - # Decompose - logR2 = [] - idx = N - while (idx >= R): - logR2.append(int(M.log2(R))) - idx = idx // R - if (idx > 1): - logR2.append(int(M.log2(idx))) - - # Bitreversal - indexes = [] - for x in range(N): - result = 0 - for bits in logR2: - mask = (0xffffffff >> (32 - bits)) - result = (result << bits) | (x & mask) - x = x >> bits - indexes.append(result) - - # Create transpositions table - tps = 
[] - for c in Permutation.from_sequence(indexes).cyclic_form: - for i in range(len(c) - 1): - tps.append([c[i] * 8, c[-1] * 8]) - - return tps - - -def gen_data_header_file( - outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), - **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_cfft_radix4_q16.h.tpl", - help='Path to mako template') - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-d", - "--dimension", - type=int, - required=False, - default=64, - help='Input dimension' - ) - - args = parser.parse_args() - - # Create sparse matrix - Len = args.dimension - Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16) - Result = compute_result(Input, Len) - Twiddles = compute_twiddles(Len) - Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2))) - - tolerance = { - 16: 16, - 32: 20, - 64: 24, - 128: 28, - 256: 32, - 512: 48, - 1024: 64, - 2048: 96, - 4096: 128} - - kwargs = {'name': 'data_cfft_radix4_q16', - 'vector_inp': Input, - 'vector_res': Result, - 'vector_twi': Twiddles, - 'vector_bitrev': Bitreversal, - 'Len': Len, - 'Log2Len': int(np.log2(Len)), - 'BitrevLen': len(Bitreversal), - 'tolerance': tolerance[int(Len)]} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_chest_q16.h.tpl 
b/software/data/data_chest_q16.h.tpl deleted file mode 100644 index 2e11a26e3..000000000 --- a/software/data/data_chest_q16.h.tpl +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Automatically generated by: -// data/data_chest_q16.py - -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(int16_t) 0X{:04X}, '.format(a&0xffff) - i += 1 - if i % 32 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define N_TX (${nb_tx}) -#define N_RX (${nb_rx}) -#define N_SAMPLES (${nb_samples}) - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_PilotRX[${2*nb_rx*nb_samples}] = ${array_to_cstr(pilot_rx)}; - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_PilotTX[${2*nb_tx*nb_samples}] = ${array_to_cstr(pilot_tx)}; - -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_HEST[${2*nb_rx*nb_tx*nb_samples}] = ${array_to_cstr(Hest)}; diff --git a/software/data/data_chest_q16.py b/software/data/data_chest_q16.py deleted file mode 100755 index e1fca8649..000000000 --- a/software/data/data_chest_q16.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the Channel estimation. 
-# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib - -from mako.template import Template - -################## -# write_result # -################## - - -def gen_data_header_file( - outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), - **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - -###################### -# Fixpoint Functions # -###################### - - -def q_sat(x): - if x > 2**15 - 1: - return x - 2**16 - elif x < -2**15: - return x + 2**16 - else: - return x - - -def compute_chest_q16(in_rx, in_tx, p): - n_rx = in_rx.size - n_tx = in_tx.size - result = np.zeros(2 * (n_tx * n_rx), dtype=np.int16) - for i in range(n_rx): - a_r = in_rx[i].real - a_i = in_rx[i].imag - for j in range(n_tx): - b_r = in_tx[j].real - b_i = in_tx[j].imag - -# # Compute data division -# den = (2**16) // (b_r * b_r + b_i * b_i) -# num_r = (a_r * b_r) + (a_i * b_i) -# num_i = (a_i * b_r) - (a_r * b_i) -# result[2 * (i * n_tx + j)] = q_sat((num_r * den) // 2**p) -# result[2 * (i * n_tx + j) + 1] = q_sat((num_i * den) // 2**p) - - # Compute data multiplication - num_r = (a_r * b_r) - (a_i * b_i) - num_i = (a_i * b_r) + (a_r * b_i) - result[2 * (i * n_tx + j)] = q_sat(num_r // 2**p) - result[2 * (i * n_tx + j) + 1] = q_sat(num_i // 2**p) - return result - - -def generate_chest_q16(nb_tx, nb_rx, nb_samples): - FIXED_POINT = 8 - MAX = 2**7 - - qvector_pilot_tx = [] - qvector_pilot_rx = [] - qvector_Hest = [] - for k in range(nb_samples): - # Create pilots - pilot_rx = np.random.randint(-MAX, MAX - 1, size=nb_rx) + 1j * \ - np.random.randint(-MAX, MAX - 1, size=nb_rx) - pilot_tx = np.random.randint(-MAX, MAX - 1, size=nb_tx) + 1j * \ - np.random.randint(-MAX, MAX - 1, size=nb_tx) - # Compute Hest - Hest = compute_chest_q16(pilot_rx, pilot_tx, FIXED_POINT) - - pilot_tx = np.column_stack( - (pilot_tx.imag, 
pilot_tx.real)).astype( - np.int16).flatten() - pilot_rx = np.column_stack( - (pilot_rx.imag, pilot_rx.real)).astype( - np.int16).flatten() - qvector_pilot_tx.append(pilot_tx) - qvector_pilot_rx.append(pilot_rx) - qvector_Hest.append(Hest) - - qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * nb_tx * nb_samples]) - qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * nb_rx * nb_samples]) - qvector_Hest = np.reshape(qvector_Hest, [2 * nb_tx * nb_rx * nb_samples]) - return qvector_pilot_tx, qvector_pilot_rx, qvector_Hest - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-b", - "--num_rx", - type=int, - required=False, - default=32, - help='Number beams' - ) - parser.add_argument( - "-l", - "--num_tx", - type=int, - required=False, - default=4, - help='Number layers' - ) - parser.add_argument( - "-s", - "--num_samples", - type=int, - required=False, - default=32, - help='Number samples' - ) - - args = parser.parse_args() - nb_tx = args.num_tx - nb_rx = args.num_rx - nb_samples = args.num_samples - - pilot_tx, pilot_rx, Hest = generate_chest_q16(nb_tx, nb_rx, nb_samples) - tpl = pathlib.Path(__file__).parent.absolute() / "data_chest_q16.h.tpl" - kwargs = {'name': 'data_chest_q16', - 'pilot_tx': pilot_tx, - 'pilot_rx': pilot_rx, - 'Hest': Hest, - 'nb_tx': nb_tx, - 'nb_rx': nb_rx, - 'nb_samples': nb_samples} - gen_data_header_file(args.outdir, tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_matmul_f16.h.tpl b/software/data/data_matmul_f16.h.tpl deleted file mode 100644 index 96aa738a3..000000000 --- a/software/data/data_matmul_f16.h.tpl +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. 
-// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:.4f}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define matrix_M (${matrix_M}) -#define matrix_N (${matrix_N}) -#define matrix_P (${matrix_P}) - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${matrix_M * matrix_N}] = ${array_to_cstr(A)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${matrix_N * matrix_P}] = ${array_to_cstr(B)}; - -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${matrix_M * matrix_P}] = ${array_to_cstr(C)}; diff --git a/software/data/data_matmul_f32.h.tpl b/software/data/data_matmul_f32.h.tpl deleted file mode 100644 index 4e9e6a4d6..000000000 --- a/software/data/data_matmul_f32.h.tpl +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define matrix_M (${matrix_M}) -#define matrix_N (${matrix_N}) -#define matrix_P (${matrix_P}) - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${matrix_M * matrix_N}] = ${array_to_cstr(A)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${matrix_N * matrix_P}] = ${array_to_cstr(B)}; - -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${matrix_M * matrix_P}] = ${array_to_cstr(C)}; diff --git a/software/data/data_matmulf16.py b/software/data/data_matmulf16.py deleted file mode 100644 index 2c362208b..000000000 --- a/software/data/data_matmulf16.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 matmul. 
-# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_matmul_f16.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-m", - "--dim_m", - type=int, - required=False, - default=16, - help='First dimension.' - ) - parser.add_argument( - "-n", - "--dim_n", - type=int, - required=False, - default=16, - help='Second dimension.' - ) - parser.add_argument( - "-p", - "--dim_p", - type=int, - required=False, - default=16, - help='Third dimension.' 
- ) - - args = parser.parse_args() - - matrix_M = args.dim_m - matrix_N = args.dim_n - matrix_P = args.dim_p - - # Create matrix - A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(np.float16) - B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(np.float16) - C = np.matmul(A, B) - - A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float16) - B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float16) - C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float16) - - kwargs = { - 'name': 'data_matmul_f16', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_matmulf32.py b/software/data/data_matmulf32.py deleted file mode 100644 index 15086d0fc..000000000 --- a/software/data/data_matmulf32.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp32 matmul. 
-# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_matmul_f32.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - - parser.add_argument( - "-m", - "--dim_m", - type=int, - required=False, - default=16, - help='First dimension.' - ) - parser.add_argument( - "-n", - "--dim_n", - type=int, - required=False, - default=16, - help='Second dimension.' - ) - parser.add_argument( - "-p", - "--dim_p", - type=int, - required=False, - default=16, - help='Third dimension.' 
- ) - - args = parser.parse_args() - - matrix_M = args.dim_m - matrix_N = args.dim_n - matrix_P = args.dim_p - - # Create matrix - A = np.random.rand(matrix_M, matrix_N) - B = np.random.rand(matrix_N, matrix_P) - C = np.matmul(A, B) - - A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float32) - B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float32) - C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float32) - - kwargs = { - 'name': 'data_matmul_f32', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/gendata_header.py b/software/data/gendata_header.py new file mode 100644 index 000000000..44749a4a0 --- /dev/null +++ b/software/data/gendata_header.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 + +# Copyright 2022 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 + +# This script generates data.h files. +# Author: Marco Bertuletti + +import argparse +import os +import hjson +import ast +import numpy + +import gendatalib as datalib + + +header = """\ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// File generated with .data/print_header.py +// Author: Marco Bertuletti\n\n +""" + + +def format_type(typ, value): + """ + formats the type for printing in .h file. 
+ :param typ: Input type + :param value: Input_value + """ + typ_i32b = ["int32_t", "uint32_t"] + typ_i16b = ["int16_t", "uint16_t"] + typ_i8b = ["int8_t", "uint8_t"] + + if typ in typ_i32b: + stringyfied_val = '({}) 0X{:08X}'.format(typ, value & 0xffffffff) + elif typ in typ_i16b: + stringyfied_val = '({}) 0X{:04X}'.format(typ, value & 0x0000ffff) + elif typ in typ_i8b: + stringyfied_val = '({}) 0X{:02X}'.format(typ, value & 0x000000ff) + elif typ == 'float': + stringyfied_val = '({}) {:+.8f}'.format(typ, value) + elif typ == '__fp16': + stringyfied_val = '({}) {:+.4f}'.format(typ, value) + else: + raise Exception("ERROR: Unsupported data type!!!") + + return stringyfied_val + + +def print_array(arr, typ, name): + """ + Converts arrays to a string. + + :param arr: Input array + :param typ: Type of the array. + :param name: Name of the array. + """ + + output_string = typ + attr = " __attribute__((aligned(sizeof(int32_t)), section(\".l2\"))) " + if (arr.size > 1): + output_string += attr + output_string += name + '[{}] = {{\n'.format(arr.size) + for (value, count) in zip(arr, range(arr.size)): + output_string += (format_type(typ, value) + ', ') + count += 1 + if count % 4 == 0: + output_string += '\n' + output_string = output_string[:-3] + output_string += "};\n\n" + else: + output_string += attr + output_string += (name + ' = ' + format_type(typ, arr)) + output_string += ";\n\n" + + return output_string + + +def print_file(header, defines, arrays, filename): + """ + Writes defines and arrays to a file. + + :param header: Header of the printed file + :param defines: A tuple of (define_name, define_value). + :param arrays: A tuple of (array_name, array_type, array_values). + :param filename: The output file to write to. 
+ """ + + # Initialize the output string + output_string = header + + # Write the defines + for def_key, def_value in defines.items(): + output_string += "#define {} ({})\n".format(def_key, def_value) + output_string += "\n" # Add space between defines and arrays + + # Write the arrays using print_array + for array_values, array_type, array_name in arrays: + output_string += print_array(array_values, array_type, array_name) + + # Write everything to the file + with open(filename, "w") as file: + file.write(output_string) + + print("Generate {}".format(filename)) + + +def get_type(type_string): + """ + Gets the numpy type from the type specifyied in the json + :param type_string: type from json file. + """ + if type_string == "int8": + return numpy.int8 + elif type_string == "int16": + return numpy.int16 + elif type_string == "int32": + return numpy.int32 + elif type_string == "float32": + return numpy.float32 + elif type_string == "float16": + return numpy.float16 + else: + raise Exception("Input type is not valid") + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description='Generate data.h header files.') + parser.add_argument('--app_name', type=str, help='Name of the app') + parser.add_argument('--params', type=str, help='Name of the app') + + # Parse the command-line arguments + args = parser.parse_args() + app_name = args.app_name + with open(args.params, 'r') as hjson_file: + config_data = hjson.load(hjson_file) + data_args = config_data.get(app_name) + + if data_args is not None: + my_type = get_type(data_args.get("type")) + defnes = dict([ast.literal_eval(defne) + for defne in data_args.get("defines")]) + arrays = [ast.literal_eval(array) for array in data_args.get("arrays")] + + # Determine output file name + filename = os.path.dirname(os.path.abspath(__file__)) + filename = os.path.join(filename, "data_{}.h".format(app_name)) + + # Define function mappings for each app_name + function_map = { + "axpy_i32": {"func": 
datalib.generate_iaxpy}, + "cfft_radix4_q16": {"func": datalib.generate_cfft_q16}, + "cfft_radix2_q16": {"func": datalib.generate_cfft_q16}, + "chest_q16": {"func": datalib.generate_qchest}, + "cholesky_q32": {"func": datalib.generate_qcholesky}, + "dotp_i32": {"func": datalib.generate_idotp}, + "matmul_f16": {"func": datalib.generate_fmatmul}, + "matmul_f32": {"func": datalib.generate_fmatmul}, + "matmul_i32": {"func": datalib.generate_imatmul}, + "matmul_i16": {"func": datalib.generate_imatmul}, + "matmul_i8": {"func": datalib.generate_imatmul}, + "fence": {"func": datalib.generate_iarray}, + "memcpy": {"func": datalib.generate_iarray}, + } + + # Check if app_name exists in the function map + if app_name in function_map: + func_info = function_map[app_name] + func = func_info["func"] + # Call the function + # The defnes dictionary is a function argument in case the generate + # function adds new definitions. + result, defnes = func(defines=defnes, my_type=my_type) + # Print result to data header + if len(arrays) == 1: + arrays = [(result, *arrays[0])] + else: + arrays = [(result[i], *arrays[i]) for i in range(len(arrays))] + print_file(header, defnes, arrays, filename) + + else: + print("Data generation is not defined.") diff --git a/software/data/gendata_params.hjson b/software/data/gendata_params.hjson new file mode 100644 index 000000000..3a1de010e --- /dev/null +++ b/software/data/gendata_params.hjson @@ -0,0 +1,177 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// This script generates data.h files. 
+// Author: Marco Bertuletti + +{ + "axpy_i32": { + "type": "int32", + "defines": [ + ("ALPHA", 6) + ("array_N", 1024) + ] + "arrays": [ + ("int32_t", "l2_X") + ("int32_t", "l2_Y") + ("int32_t", "l2_Z") + ] + }, + + "dotp_i32": { + "type": "int32", + "defines": [ + ("array_N", 1024) + ] + "arrays": [ + ("int32_t", "l2_X") + ("int32_t", "l2_Y") + ("int32_t", "l2_Z") + ] + }, + + "cfft_radix4_q16": { + "type": "int16", + "defines": [ + ("N_CSAMPLES", 64) + ] + "arrays": [ + ("int16_t", "l2_pSrc") + ("int16_t", "l2_pRes") + ("int16_t", "l2_twiddleCoef_q16") + ("int16_t", "l2_BitRevIndexTable") + ] + }, + + "cfft_radix2_q16": { + "type": "int16", + "defines": [ + ("N_CSAMPLES", 256) + ] + "arrays": [ + ("int16_t", "l2_pSrc") + ("int16_t", "l2_pRes") + ("int16_t", "l2_twiddleCoef_q16") + ("int16_t", "l2_BitRevIndexTable") + ] + }, + + "chest_q16": { + "type": "int32", + "defines": [ + ("N_TX", 4) + ("N_RX", 4) + ("N_SAMPLES", 512) + ] + "arrays": [ + ("int16_t", "l2_PilotTX") + ("int16_t", "l2_PilotRX") + ("int16_t", "l2_HEST") + ] + }, + + "cholesky_q32": { + "type": "int32", + "defines": [ + ("matrix_N", 32) + ("FIXED_POINT", 10) + ] + "arrays": [ + ("int32_t", "l2_A") + ("int32_t", "l2_L") + ("int32_t", "l2_y") + ] + }, + + "matmul_f16": { + "type": "float16", + "defines": [ + ("matrix_M", 32) + ("matrix_N", 32) + ("matrix_P", 32) + ] + "arrays": [ + ("__fp16", "l2_A") + ("__fp16", "l2_B") + ("__fp16", "l2_C") + ] + }, + + "matmul_f32": { + "type": "float32", + "defines": [ + ("matrix_M", 16) + ("matrix_N", 16) + ("matrix_P", 16) + ] + "arrays": [ + ("float", "l2_A") + ("float", "l2_B") + ("float", "l2_C") + ] + } + + "matmul_i32": { + "type": "int32", + "defines": [ + ("matrix_M", 32) + ("matrix_N", 32) + ("matrix_P", 32) + ] + "arrays": [ + ("int32_t", "l2_A") + ("int32_t", "l2_B") + ("int32_t", "l2_C") + ] + } + + "matmul_i16": { + "type": "int16", + "defines": [ + ("matrix_M", 64) + ("matrix_N", 64) + ("matrix_P", 64) + ] + "arrays": [ + ("int16_t", "l2_A") + 
("int16_t", "l2_B") + ("int32_t", "l2_C") + ] + } + + "matmul_i8": { + "type": "int8", + "defines": [ + ("matrix_M", 64) + ("matrix_N", 64) + ("matrix_P", 64) + ] + "arrays": [ + ("int8_t", "l2_A") + ("int8_t", "l2_B") + ("int32_t", "l2_C") + ] + } + + "fence": { + "type": "int32", + "defines": [ + ("array_N", 12288) + ] + "arrays": [ + ("int32_t", "l2_data") + ] + }, + + "memcpy": { + "type": "int32", + "defines": [ + ("array_N", 2048) + ] + "arrays": [ + ("int32_t", "l2_data") + ] + }, + +} diff --git a/software/data/gendatalib.py b/software/data/gendatalib.py new file mode 100644 index 000000000..c017415bf --- /dev/null +++ b/software/data/gendatalib.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 + +# Copyright 2022 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 + +# This script generates data for the fp16 matmul. +# Author: Marco Bertuletti + +# The script generates random inputs for the C functions. The inputs are +# propagated though a python golden model. Golden models are from the +# numpy library or the qmath bit-true library. + +import numpy as np +import math +import qmath +from scipy import signal + + +def select_maxval(my_type=np.int32): + size = 8 * np.dtype(my_type).itemsize + MAX = 2**(size - 2) - 1 + return MAX + + +def irandom(size, MAX, my_type=np.int16): + """Generate random numbers. + size (int or tuple): Size of the array to generate. + mytype (np.dtype): Data type for the fixed-point representation. + Defaults to np.int16. + + Returns: + np.ndarray: Array of random fixed-point numbers. + """ + return np.random.randint(-MAX, MAX - 1, size=size, dtype=my_type) + + +def icrandom(size, MAX, my_type=np.int16): + """Generate random complex numbers. + size (int or tuple): Size of the array to generate. + mytype (np.dtype): Data type for the fixed-point representation. + Defaults to np.int16. 
+ + Returns: + np.ndarray: Array of random complex fixed-point numbers. + """ + real_part = np.random.randint(-MAX, MAX - 1, size=size, dtype=my_type) + imag_part = np.random.randint(-MAX, MAX - 1, size=size, dtype=my_type) + return real_part + 1j * imag_part + + +def generate_iarray(my_type=np.float32, defines={}): + + # Create random array of integers + array_N = defines['array_N'] + MAX = select_maxval(my_type) + A = irandom(MAX=MAX, size=(array_N), my_type=my_type) + return A, defines + + +def generate_fmatmul(my_type=np.float32, defines={}): + + # Create matrix + matrix_M = defines['matrix_M'] + matrix_N = defines['matrix_N'] + matrix_P = defines['matrix_P'] + A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(my_type) + B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(my_type) + C = np.matmul(A, B) + + A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type) + B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type) + C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(my_type) + + return [A, B, C], defines + + +def generate_imatmul(my_type=np.int32, defines={}): + + # Create matrix + matrix_M = defines['matrix_M'] + matrix_N = defines['matrix_N'] + matrix_P = defines['matrix_P'] + MAX = select_maxval(my_type) + A = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type) + B = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type) + C = np.matmul(A, B) + + A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type) + B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type) + C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.int32) + + return [A, B, C], defines + + +def generate_iaxpy(my_type=np.int32, defines={}): + + # Create matrix + ALPHA = defines['ALPHA'] + array_N = defines['array_N'] + MAX = select_maxval(my_type) + X = irandom(MAX=MAX, size=(array_N), my_type=my_type) + Y = irandom(MAX=MAX, size=(array_N), my_type=my_type) + Z = (Y + X * ALPHA).astype(my_type) + + return 
[X, Y, Z], defines + + +def generate_idotp(my_type=np.int32, defines={}): + + # Create matrix + array_N = defines['array_N'] + MAX = select_maxval(my_type) + X = irandom(MAX=MAX, size=(array_N), my_type=my_type) + Y = irandom(MAX=MAX, size=(array_N), my_type=my_type) + Z = np.array((np.dot(X, Y))).astype(my_type) + + return [X, Y, Z], defines + + +def generate_iconv(my_type=np.int32, defines={}): + + # Create matrix + matrix_M = defines['matrix_M'] + matrix_N = defines['matrix_N'] + kernel_N = defines['kernel_N'] + MAX = select_maxval(my_type) + X = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type) + K = irandom(MAX=MAX, size=(kernel_N, kernel_N), my_type=my_type) + Y = signal.convolve2d(X, K, mode="same", boundary='fill') + + X = X.flatten().astype(my_type) + K = K.flatten().astype(my_type) + Y = Y.flatten().astype(my_type) + + return [X, K, Y], defines + + +def generate_qchest(defines={}, fixed_point=15, my_type=np.int16): + + N_TX = defines['N_TX'] + N_RX = defines['N_RX'] + N_SAMPLES = defines['N_SAMPLES'] + + qvector_pilot_tx = [] + qvector_pilot_rx = [] + qvector_Hest = [] + for k in range(N_SAMPLES): + # Create pilots + pilot_rx = icrandom(size=N_RX, MAX=2**7, my_type=np.int32) + pilot_tx = icrandom(size=N_TX, MAX=2**7, my_type=np.int32) + # Compute Hest + Hest = qmath.qchest(pilot_rx, pilot_tx, fixed_point=8) + + pilot_tx = np.column_stack((pilot_tx.imag, pilot_tx.real)) + pilot_rx = np.column_stack((pilot_rx.imag, pilot_rx.real)) + qvector_pilot_tx.append(pilot_tx.astype(np.int16).flatten()) + qvector_pilot_rx.append(pilot_rx.astype(np.int16).flatten()) + qvector_Hest.append(Hest) + + qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * N_TX * N_SAMPLES]) + qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * N_RX * N_SAMPLES]) + qvector_Hest = np.reshape(qvector_Hest, [2 * N_TX * N_RX * N_SAMPLES]) + return [qvector_pilot_tx, qvector_pilot_rx, qvector_Hest], defines + + +def generate_qcholesky(defines={}, fixed_point=15, my_type=np.int32): + + 
matrix_N = defines['matrix_N'] + FIXED_POINT = defines['FIXED_POINT'] + + A = irandom(size=(matrix_N, matrix_N), MAX=2**14, my_type=my_type) + y = irandom(size=matrix_N, MAX=2**14, my_type=my_type) + A = qmath.qmatmul(A.T, A, FIXED_POINT, my_type) + L = qmath.qcholesky(A, fixed_point=FIXED_POINT, mytype=my_type) + + A = np.reshape(A, (matrix_N * matrix_N), order='C').astype(my_type) + L = np.reshape(L, (matrix_N * matrix_N), order='C').astype(my_type) + return [A, L, y], defines + + +def generate_cfft_q16(defines={}, fixed_point=15, my_type=np.int16): + + N_CSAMPLES = defines['N_CSAMPLES'] + src = icrandom(size=N_CSAMPLES, MAX=2**fixed_point, my_type=my_type) + tolerance = { + 16: 16, + 32: 20, + 64: 24, + 128: 28, + 256: 32, + 512: 48, + 1024: 64, + 2048: 96, + 4096: 128} + bit_shift_dict_q16 = { + 16: 11, + 32: 10, + 64: 9, + 128: 8, + 256: 7, + 512: 6, + 1024: 5, + 2048: 4, + 4096: 3} + + dst = np.fft.fft(src.astype(np.csingle) / (2**fixed_point)) + dst = dst * 2**(bit_shift_dict_q16[N_CSAMPLES]) + + dst = (np.column_stack((dst.real, dst.imag))).flatten() + src = (np.column_stack((src.real, src.imag))).flatten() + dst = dst.astype(np.int16) + src = src.astype(np.int16) + + twiddles = qmath.qtwiddleCoef(N_CSAMPLES) + bitrever = qmath.bitreversal(N_CSAMPLES, 2) + + defines['LOG2'] = int(math.log2(N_CSAMPLES)) + defines['N_TWIDDLES'] = 3 * N_CSAMPLES // 4 + defines['BITREVINDEXTABLE_LENGTH'] = len(bitrever) + defines['TOLERANCE'] = tolerance[N_CSAMPLES] + + return [src, dst, twiddles, bitrever], defines diff --git a/software/data/qmath.py b/software/data/qmath.py new file mode 100644 index 000000000..404d7b407 --- /dev/null +++ b/software/data/qmath.py @@ -0,0 +1,446 @@ +#!/usr/bin/env python3 + +# Copyright 2022 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 + +# This script generates data for the fp16 mmse. 
+# Author: Marco Bertuletti + +import numpy as np +import math +from sympy.combinatorics import Permutation + + +def to_fixed_point(matrix, fixed_point=15, mytype=np.int16): + """Convert a complex matrix to a fixed-point matrix. + matrix (np.ndarray): Input complex matrix. + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + tuple: Real and imaginary parts of the fixed-point matrix. + """ + SCALE_FACTOR = 2**fixed_point + real_part = np.round(matrix.real * SCALE_FACTOR).astype(mytype) + imag_part = np.round(matrix.imag * SCALE_FACTOR).astype(mytype) + if (np.abs(real_part.any()) > 2**(fixed_point - 1)): + raise ValueError("Overflow") + if (np.abs(imag_part.any()) > 2**(fixed_point - 1)): + raise ValueError("Overflow") + return real_part, imag_part + + +def from_fixed_point(real_part, imag_part, fixed_point=15, mytype=np.int16): + """Convert a fixed-point matrix back to a floating-point complex matrix. + real_part (np.ndarray): Real part of the fixed-point matrix. + imag_part (np.ndarray): Imaginary part of the fixed-point matrix. + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + np.ndarray: Reconstructed complex matrix. + """ + SCALE_FACTOR = 2**fixed_point + return (real_part / SCALE_FACTOR) + 1j * (imag_part / SCALE_FACTOR) + + +def qmatmul(A, B, fixed_point=15, mytype=np.int16): + """Perform fixed-point matrix multiplication. + A (np.ndarray): First matrix. + B (np.ndarray): Second matrix. + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + np.ndarray: Fixed-point result of the matrix multiplication. 
+ """ + SCALE_FACTOR = 2**fixed_point + rows_A, cols_A = A.shape + cols_B = B.shape[1] + C = np.zeros((rows_A, cols_B), dtype=mytype) + + for i in range(rows_A): + for j in range(cols_B): + for k in range(cols_A): + C[i, j] += A[i, k] * B[k, j] // SCALE_FACTOR + return C + + +def qcmatmul(A_real, A_imag, B_real, B_imag, fixed_point=15, mytype=np.int16): + """Perform fixed-point complex matrix multiplication. + A_real (np.ndarray): Real part of the first matrix. + A_imag (np.ndarray): Imaginary part of the first matrix. + B_real (np.ndarray): Real part of the second matrix. + B_imag (np.ndarray): Imaginary part of the second matrix. + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + tuple: Real and imaginary parts of the result matrix. + """ + SCALE_FACTOR = 2**fixed_point + rows_A, cols_A = A_real.shape + cols_B = B_real.shape[1] + + C_real = np.zeros((rows_A, cols_B), dtype=mytype) + C_imag = np.zeros((rows_A, cols_B), dtype=mytype) + + for i in range(rows_A): + for j in range(cols_B): + for k in range(cols_A): + real_product = A_real[i, k] * \ + B_real[k, j] - A_imag[i, k] * B_imag[k, j] + imag_product = A_real[i, k] * \ + B_imag[k, j] + A_imag[i, k] * B_real[k, j] + + C_real[i, j] += real_product // SCALE_FACTOR + C_imag[i, j] += imag_product // SCALE_FACTOR + + return C_real, C_imag + + +def qcmvmul(A_real, A_imag, B_real, B_imag, fixed_point=15, mytype=np.int16): + """Perform fixed-point complex matrix-vector multiplication. + A_real (np.ndarray): Real part of the matrix. + A_imag (np.ndarray): Imaginary part of the matrix. + B_real (np.ndarray): Real part of the vector. + B_imag (np.ndarray): Imaginary part of the vector. + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + tuple: Real and imaginary parts of the result vector. 
+ """ + SCALE_FACTOR = 2**fixed_point + rows_A, cols_A = A_real.shape + + C_real = np.zeros(rows_A, dtype=mytype) + C_imag = np.zeros(rows_A, dtype=mytype) + + for i in range(rows_A): + for k in range(cols_A): + real_product = A_real[i, k] * B_real[k] - A_imag[i, k] * B_imag[k] + imag_product = A_real[i, k] * B_imag[k] + A_imag[i, k] * B_real[k] + + C_real[i] += real_product // SCALE_FACTOR + C_imag[i] += imag_product // SCALE_FACTOR + + return C_real, C_imag + + +def qsqrt(n, fixed_point=15, mytype=np.int16): + """Compute the square root of a number in fixed-point representation using + Newton-Raphson method. + n (np.ndarray): Input value(s) in fixed-point representation. + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + np.ndarray: Square root of the input in fixed-point representation. + """ + SCALE_FACTOR = 2**fixed_point + x = np.ones_like(n, dtype=mytype) * SCALE_FACTOR + n_one = n * SCALE_FACTOR + + itr = 0 + while True: + x_old = x + x = (x + n_one // x) // 2 + if np.array_equal( + x, x_old) or itr == 10: # Convergence or max iterations + break + itr += 1 + return x + + +def qcholesky(A, fixed_point=15, mytype=np.int16): + """Perform fixed-point Cholesky decomposition of a symmetric + positive-definite matrix. + A (np.ndarray): Input matrix (must be square and symmetric). + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + tuple: Flattened input matrix, flattened lower triangular matrix, and + result vector. 
+ """ + SCALE_FACTOR = 2**fixed_point + rows, columns = A.shape + if rows != columns: + raise ValueError("Matrix must be square for Cholesky decomposition.") + + L = np.zeros((rows, columns), dtype=mytype) + + for row in range(rows): + for column in range(columns): + if row == column: + pivot = A[row, column] + for k in range(column): + Ljk = L[row, k] + pivot -= (Ljk**2) // SCALE_FACTOR + if pivot < 0: + # raise ValueError("Negative value encountered in diagonal + # element.") + pivot = 0 + L[row, column] = qsqrt(pivot, fixed_point, mytype) + elif row > column: + pivot = A[row, column] + for k in range(column): + Lik = L[row, k] + Ljk = L[column, k] + pivot -= (Lik * Ljk) // SCALE_FACTOR + diag = L[column, column] + L[row, column] = (pivot * SCALE_FACTOR) // diag + else: + L[row, column] = 0 + + return L + + +def qccholesky(M_real, M_imag, fixed_point=15, mytype=np.int16): + """Perform fixed-point Cholesky decomposition of a symmetric + positive-definite matrix. + A (np.ndarray): Input matrix (must be square and symmetric). + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + tuple: Flattened input matrix, flattened lower triangular matrix, + and result vector. 
+ """ + + SCALE_FACTOR = 2**fixed_point + NEGATIVE = fixed_point**2 + 1 + + rows, columns = M_real.shape + L_real = np.zeros_like(M_real, dtype=mytype) # Initialize dest with zeros + L_imag = np.zeros_like(M_imag, dtype=mytype) # Initialize dest with zeros + + # Check for dimensional errors + if rows != columns: + raise ValueError("Matrix must be square for Cholesky decomposition.") + + for row in range(rows): + for column in range(columns): + + if row == column: + # Diagonal element + real_pivot = M_real[row, column] + for k in range(column): + real_Ljk = L_real[row, k] + imag_Ljk = L_imag[row, k] + product = (real_Ljk**2 + imag_Ljk**2) // SCALE_FACTOR + real_pivot = real_pivot - product + + # Handle negative values for square root + if real_pivot < 0: + if real_pivot < NEGATIVE: + raise ValueError("Negative value encountered.") + real_pivot = 0 + L_real[row, column] = qsqrt(real_pivot, fixed_point, mytype) + + elif row > column: + # Off-diagonal element (below the diagonal) + real_pivot = M_real[row, column] + imag_pivot = M_imag[row, column] + + for k in range(column): + real_Lik = L_real[row, k] + imag_Lik = L_imag[row, k] + real_Ljk = L_real[column, k] + imag_Ljk = L_imag[column, k] + real_product = (real_Lik * real_Ljk - imag_Lik * imag_Ljk) + imag_product = (real_Lik * imag_Ljk + imag_Lik * real_Ljk) + real_product = real_product // SCALE_FACTOR + imag_product = imag_product // SCALE_FACTOR + real_pivot = real_pivot - real_product + imag_pivot = imag_pivot - imag_product + + diag = L_real[column, column] + L_real[row, column] = (real_pivot * SCALE_FACTOR) // diag + L_imag[row, column] = (imag_pivot * SCALE_FACTOR) // diag + + else: + # Above diagonal, set to zero + L_real[row, column] = 0 + L_imag[row, column] = 0 + + return L_real, L_imag + + +def qinvertLt(M_real, M_imag, y_real, y_imag, fixed_point=15, mytype=np.int16): + """Invert a lower triangular complex matrix using fixed-point arithmetic. 
+ M_real (np.ndarray): Real part of the lower triangular matrix. + M_imag (np.ndarray): Imaginary part of the lower triangular matrix. + y_real (np.ndarray): Real part of the vector. + y_imag (np.ndarray): Imaginary part of the vector. + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + tuple: Real and imaginary parts of the result vector. + """ + SCALE_FACTOR = 2**fixed_point + n = M_real.shape[0] + x_real = np.zeros_like(y_real, dtype=mytype) + x_imag = np.zeros_like(y_imag, dtype=mytype) + + for i in range(n): + sum_real = y_real[i] + sum_imag = y_imag[i] + for j in range(i): + sum_real -= (M_real[i, j] * x_real[j] - + M_imag[i, j] * x_imag[j]) // SCALE_FACTOR + sum_imag -= (M_real[i, j] * x_imag[j] + + M_imag[i, j] * x_real[j]) // SCALE_FACTOR + + x_real[i] = (sum_real * SCALE_FACTOR) // M_real[i, i] + x_imag[i] = (sum_imag * SCALE_FACTOR) // M_real[i, i] + + return x_real, x_imag + + +def qinvertUt(M_real, M_imag, y_real, y_imag, fixed_point=15, mytype=np.int16): + """Invert an upper triangular complex matrix using fixed-point arithmetic. + M_real (np.ndarray): Real part of the upper triangular matrix. + M_imag (np.ndarray): Imaginary part of the upper triangular matrix. + y_real (np.ndarray): Real part of the vector. + y_imag (np.ndarray): Imaginary part of the vector. + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + tuple: Real and imaginary parts of the result vector. 
+ """ + SCALE_FACTOR = 2**fixed_point + n = M_real.shape[0] + x_real = np.zeros_like(y_real, dtype=mytype) + x_imag = np.zeros_like(y_imag, dtype=mytype) + + for i in range(n - 1, -1, -1): + sum_real = y_real[i] + sum_imag = y_imag[i] + + for j in range(i + 1, n): + sum_real -= (M_real[i, j] * x_real[j] - + M_imag[i, j] * x_imag[j]) // SCALE_FACTOR + sum_imag -= (M_real[i, j] * x_imag[j] + + M_imag[i, j] * x_real[j]) // SCALE_FACTOR + + x_real[i] = (sum_real * SCALE_FACTOR) // M_real[i, i] + x_imag[i] = (sum_imag * SCALE_FACTOR) // M_real[i, i] + + return x_real, x_imag + + +def qtwiddleCoef(N, fixed_point=15, mytype=np.int16): + """Generate fixed-point twiddle coefficients for FFT. + N (int): Number of points in FFT. + fixed_point (int): Number of bits for the fractional part. + mytype (np.dtype): Data type for the fixed-point representation. + + Returns: + np.ndarray: Twiddle coefficients in fixed-point representation. + """ + PI = 3.14159265358979 + twiddleCoefq15 = np.zeros((int(2 * 3 * N / 4)), dtype=mytype) + for i in range(int(3 * N / 4)): + twiddleCoefq15_cos = math.cos(i * 2 * PI / N) + twiddleCoefq15_sin = math.sin(i * 2 * PI / N) + twiddleCoefq15[2 * i] = \ + int(round(twiddleCoefq15_cos * (2**fixed_point - 1))) + twiddleCoefq15[2 * i + 1] = \ + int(round(twiddleCoefq15_sin * (2**fixed_point - 1))) + return twiddleCoefq15 + + +def bitreversal(N, R): + """Perform bit-reversal for FFT with radix-R decomposition. + + Args: + N (int): Number of points in FFT. + R (int): Radix for FFT decomposition. + + Returns: + np.ndarray: Flattened bit-reversal transposition table. 
+ """ + # Decompose + logR2 = [] + idx = N + while (idx >= R): + logR2.append(int(math.log2(R))) + idx = idx // R + if (idx > 1): + logR2.append(int(math.log2(idx))) + # Bitreversal + indexes = [] + for x in range(N): + result = 0 + for bits in logR2: + mask = (0xffffffff >> (32 - bits)) + result = (result << bits) | (x & mask) + x = x >> bits + indexes.append(result) + # Create transpositions table + tps = [] + for c in Permutation.from_sequence(indexes).cyclic_form: + for i in range(len(c) - 1): + tps.append([c[i] * 8, c[-1] * 8]) + return np.ndarray.flatten(np.array(tps)) + + +def q_sat(x): + if x > 2**15 - 1: + return x - 2**16 + elif x < -2**15: + return x + 2**16 + else: + return x + + +def qchest(in_rx, in_tx, division=False, fixed_point=8, mytype=np.int16): + """Perform fixed-point complex channel estimation (CHEST). + in_rx (np.ndarray): Received signal array (complex numbers). + in_tx (np.ndarray): Transmitted signal array (complex numbers). + division (bool): Whether to perform division or multiplication. + Defaults to False. + fixed_point (int): Number of bits for the fractional part. Defaults to 8. + mytype (np.dtype): Data type for fixed-point representation. + Defaults to np.int16. + + Returns: + np.ndarray: Resulting array in fixed-point representation. 
+ """ + SCALE_FACTOR = 2**fixed_point + n_rx = in_rx.size + n_tx = in_tx.size + + # Resulting array (real and imaginary interleaved) + result = np.zeros(2 * (n_tx * n_rx), dtype=mytype) + + for i in range(n_rx): + a_r = in_rx[i].real + a_i = in_rx[i].imag + for j in range(n_tx): + b_r = in_tx[j].real + b_i = in_tx[j].imag + + if division: + # Compute data division + den = (2**16) // (b_r * b_r + b_i * b_i) + if den == 0: + raise ZeroDivisionError( + "Division by zero encountered in CHEST.") + num_r = (a_r * b_r + a_i * b_i) + num_i = (a_i * b_r - a_r * b_i) + result[2 * (i * n_tx + j)] = (num_r // den) * SCALE_FACTOR + result[2 * (i * n_tx + j) + 1] = (num_i // den) * SCALE_FACTOR + else: + # Compute data multiplication + num_r = (a_r * b_r - a_i * b_i) + num_i = (a_i * b_r + a_r * b_i) + result[2 * (i * n_tx + j)] = q_sat(num_r // SCALE_FACTOR) + result[2 * (i * n_tx + j) + 1] = q_sat(num_i // SCALE_FACTOR) + + return result diff --git a/software/kernels/baremetal/mempool_checks.h b/software/kernels/baremetal/mempool_checks.h index d680764c1..110acec90 100644 --- a/software/kernels/baremetal/mempool_checks.h +++ b/software/kernels/baremetal/mempool_checks.h @@ -12,7 +12,7 @@ @param[in] TOL floating point tolerance @return none */ -void mempool_check_q32(int32_t *__restrict__ pRes, int32_t *__restrict__ pExp, +void mempool_check_i32(int32_t *__restrict__ pRes, int32_t *__restrict__ pExp, uint32_t NEL, int32_t TOL, bool verbose) { uint32_t core_id = mempool_get_core_id(); int32_t error; @@ -41,7 +41,7 @@ void mempool_check_q32(int32_t *__restrict__ pRes, int32_t *__restrict__ pExp, @param[in] TOL floating point tolerance @return none */ -void mempool_check_q16(int16_t *__restrict__ pRes, int16_t *__restrict__ pExp, +void mempool_check_i16(int16_t *__restrict__ pRes, int16_t *__restrict__ pExp, uint32_t NEL, int16_t TOL, bool verbose) { uint32_t core_id = mempool_get_core_id(); int16_t error; @@ -53,7 +53,36 @@ void mempool_check_q16(int16_t *__restrict__ pRes, 
int16_t *__restrict__ pExp, error = (int16_t)(exp - res); bool print = ((error > TOL) || (error < (-TOL))) | verbose; if (print) { - printf("CHECK(%d): EXP = %08X - RESP = %08X\n", i, exp, res); + printf("CHECK(%d): EXP = %04X - RESP = %04X\n", i, exp, res); + ERRORS++; + } + } + printf("%d ERRORS out of %d CHECKS\n", ERRORS, NEL); + } + return; +} + +/** + @brief Element-wise check for i8 kernels (performed by core 0 only). + @param[in] pRes points to the result + @param[in] pExp points to the expected result + @param[in] NEL number of elements to check + @param[in] TOL maximum absolute difference tolerated per element + @return none +*/ +void mempool_check_i8(int8_t *__restrict__ pRes, int8_t *__restrict__ pExp, + uint32_t NEL, int16_t TOL, bool verbose) { + uint32_t core_id = mempool_get_core_id(); + int16_t error; + if (core_id == 0) { + uint32_t ERRORS = 0; + for (uint32_t i = 0; i < NEL; i++) { + int16_t exp = (int8_t)pExp[i]; + int16_t res = (int8_t)pRes[i]; + error = (int16_t)(exp - res); + bool print = ((error > TOL) || (error < (-TOL))) | verbose; + if (print) { + printf("CHECK(%d): EXP = %02X - RESP = %02X\n", i, exp, res); ERRORS++; } } diff --git a/software/kernels/baremetal/mempool_dotp_i32p.h b/software/kernels/baremetal/mempool_dotp_i32p.h new file mode 100644 index 000000000..26fbe03e9 --- /dev/null +++ b/software/kernels/baremetal/mempool_dotp_i32p.h @@ -0,0 +1,196 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +/* Parallel dot-product: each core reduces its chunk of Len / nPE elements */ +void dotp_i32p(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, + uint32_t nPE) { + + uint32_t core_id = mempool_get_core_id(); + uint32_t step = Len / nPE; + register int32_t local_sum = 0; + register int32_t a, b; + for (uint32_t i = core_id * step; i < core_id * step + step; i++) { + a = in_a[i]; + b = in_b[i]; + local_sum += a * b; + } + __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED); +#ifdef LOG_BARRIERS + mempool_log_barrier(2, core_id); +#else + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier(num_cores); +#endif + return; +} + +/* Parallel dot-product, unrolled by 4 with a scalar tail loop per core */ +void dotp_i32p_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, + uint32_t nPE) { + + uint32_t core_id = mempool_get_core_id(); + uint32_t step = Len / nPE; + uint32_t reminder = step % 4; + uint32_t i; + + register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0; + register int32_t b2 = 0, b1 = 0, b0 = 0, b3 = 0; + register int32_t local_sum0 = 0; + register int32_t local_sum1 = 0; + register int32_t local_sum2 = 0; + register int32_t local_sum3 = 0; + for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) { + a0 = in_a[i]; + b0 = in_b[i]; + a1 = in_a[i + 1]; + b1 = in_b[i + 1]; + a2 = in_a[i + 2]; + b2 = in_b[i + 2]; + a3 = in_a[i + 3]; + b3 = in_b[i + 3]; + local_sum0 += a0 * b0; + local_sum1 += a1 * b1; + local_sum2 += a2 * b2; + local_sum3 += a3 * b3; + } + i = core_id * step + step - reminder; + while (i < core_id * step + step) { + a0 = in_a[i]; + b0 = in_b[i]; + local_sum0 += a0 * b0; + i++; + } + local_sum0 += local_sum1; + local_sum2 += local_sum3; + local_sum0 += local_sum2; + __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED); +#ifdef LOG_BARRIERS + mempool_log_barrier(2, core_id); +#else + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier(num_cores); +#endif + return; +} + +/* Binary tree reduction */
+void mempool_binary_reduction(int32_t *sum, uint32_t core_id, + uint32_t num_cores) { + + uint32_t idx, step = 2, previous_step = 1; + while (num_cores > 1) { + idx = (step * (core_id / step)) * BANKING_FACTOR; + // dump_prova(idx); + if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1, + __ATOMIC_RELAXED)) { + + // Reduction + sum[idx] += sum[idx + previous_step * BANKING_FACTOR]; + + // Next level of binary tree + __atomic_store_n(&red_barrier[idx + previous_step - 1], 0, + __ATOMIC_RELAXED); + num_cores = num_cores / 2; + previous_step = step; + step = step * 2; + + } else { + // Goes to sleep + break; + } + } + + // Last core wakes everyone + if (num_cores == 1) { + __sync_synchronize(); + wake_up_all(); + } + mempool_wfi(); + + return; +} + +/* Parallel dot-product with loop unrolling */ +/* Load and stores only in local memory */ +#define NUM_CORES_RED (16) +void dotp_i32p_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, + uint32_t Len) { + + uint32_t core_id = mempool_get_core_id(); + uint32_t const remainder = Len % 4; + uint32_t const idx_stop = Len - remainder; + + register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0; + register int32_t b2 = 0, b1 = 0, b0 = 0, b3 = 0; + register int32_t local_sum0 = 0; + register int32_t local_sum1 = 0; + register int32_t local_sum2 = 0; + register int32_t local_sum3 = 0; + + for (uint32_t i = core_id * 4; i < idx_stop; i += NUM_BANKS) { + a0 = in_a[i]; + b0 = in_b[i]; + a1 = in_a[i + 1]; + b1 = in_b[i + 1]; + a2 = in_a[i + 2]; + b2 = in_b[i + 2]; + a3 = in_a[i + 3]; + b3 = in_b[i + 3]; + local_sum0 += a0 * b0; + local_sum1 += a1 * b1; + local_sum2 += a2 * b2; + local_sum3 += a3 * b3; + } + if (core_id == ((Len % NUM_BANKS) / 4)) { + for (uint32_t i = Len - remainder; i < Len; i++) { + a0 = in_a[i]; + b0 = in_b[i]; + local_sum0 += a0 * b0; + } + } + local_sum0 += local_sum1; + local_sum2 += local_sum3; + local_sum0 += local_sum2; + +// A) Cores atomically fetch and add in sum variable +// B) A global 
barrier synchronizes all of them +#if defined(ATOMIC_REDUCTION) + __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED); + mempool_log_barrier(2, core_id); + +// A) Groups of NUM_CORES_RED cores atomically fetch and add in sum array +// B) The last core to the reduction barrier sums the partial reductions +#elif defined(SINGLE_CORE_REDUCTION) + uint32_t num_cores = mempool_get_core_count(); + __atomic_fetch_add( + &s[BANKING_FACTOR * NUM_CORES_RED * (core_id / NUM_CORES_RED)], + local_sum0, __ATOMIC_RELAXED); + if ((num_cores - 1) == + __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + uint32_t idx_red = 0; + local_sum0 = 0; + while (idx_red < NUM_BANKS) { + local_sum0 += s[idx_red]; + idx_red += BANKING_FACTOR * NUM_CORES_RED; + } + s[0] = local_sum0; + wake_up_all(); + } + mempool_wfi(); + +// A) Cores store locally in sum array +// B) Partial sums are reduced logarithmically +#elif defined(BINARY_REDUCTION) + uint32_t num_cores = mempool_get_core_count(); + s[core_id * 4] = local_sum0; + mempool_binary_reduction(s, core_id, num_cores); + +#endif + + return; +} diff --git a/software/apps/baremetal/dotp_i32/dotp_single.h b/software/kernels/baremetal/mempool_dotp_i32s.h similarity index 88% rename from software/apps/baremetal/dotp_i32/dotp_single.h rename to software/kernels/baremetal/mempool_dotp_i32s.h index 58797ee80..dd562debb 100644 --- a/software/apps/baremetal/dotp_i32/dotp_single.h +++ b/software/kernels/baremetal/mempool_dotp_i32s.h @@ -5,12 +5,11 @@ // Author: Marco Bertuletti, ETH Zurich /* Single-core dot-product */ -void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { +void dotp_i32s(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); if (core_id == 0) { - mempool_start_benchmark(); // Kernel execution register int32_t 
local_sum = 0; @@ -18,7 +17,6 @@ void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { do { local_sum += ((*in_a++) * (*in_b++)); } while (in_a < end); - *s = local_sum; mempool_stop_benchmark(); } @@ -26,17 +24,15 @@ void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { } /* Single-core dot-product unrolled4 */ -void dotp_single_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { +void dotp_i32s_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, + uint32_t Len) { uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); if (core_id == 0) { - mempool_start_benchmark(); uint32_t reminder = Len % 4; uint32_t i = 0; - int32_t a0 = 0, b0 = 0, a1 = 0, b1 = 0, a2 = 0, b2 = 0, a3 = 0, b3 = 0; register int32_t local_sum_1 = 0; register int32_t local_sum_2 = 0; @@ -70,5 +66,4 @@ void dotp_single_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, mempool_stop_benchmark(); } mempool_barrier(num_cores); - // mempool_log_barrier(2, core_id); } diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 69d309158..52d86c6d1 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -172,11 +172,8 @@ OMP_RUNTIME := $(addsuffix .o,$(shell find $(OMP_DIR) -name "*.c")) %.ld: %.ld.c $(RISCV_CC) -P -E $(DEFINES) $< -o $@ -%.h: %.args - cat $< | xargs $(python) $(MEMPOOL_DIR)/scripts/gen_data.py --clangformat=$(LLVM_INSTALL_DIR)/bin/clang-format -o $@ - -%.h: %.py - $(python) $< +data_%.h: $(DATA_DIR)/gendata_params.hjson $(DATA_DIR)/gendata_header.py + $(python) $(DATA_DIR)/gendata_header.py --app_name $* --params $< # Bootrom %.elf: %.S $(ROOT_DIR)/bootrom.ld $(LINKER_SCRIPT) diff --git a/software/tests/baremetal/Makefile b/software/tests/baremetal/Makefile index 5efba8e1b..71dac7ce9 100644 --- a/software/tests/baremetal/Makefile +++ b/software/tests/baremetal/Makefile @@ -16,8 +16,6 @@ RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime) include
$(RUNTIME_DIR)/runtime.mk TESTS := $(patsubst $(TESTS_DIR)/%/main.c,%,$(shell find $(TESTS_DIR) -name "main.c")) -DATA := $(patsubst %.args,%.h,$(shell find $(TESTS_DIR) -name "data.args")) -ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py)) BINARIES := $(addprefix $(BIN_DIR)/,$(TESTS)) # Make all applications @@ -26,7 +24,7 @@ all: $(TESTS) $(TESTS): % : $(BIN_DIR)/% $(TESTS_DIR)/Makefile $(shell find $(RUNTIME_DIR)/**.{S,c,h,ld} -type f) .PHONY: $(BINARIES) -$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) $(DATA) $(ALLPYS) update_opcodes +$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) data_%.h update_opcodes mkdir -p $(dir $@) $(RISCV_CC) -Iinclude $(RISCV_LDFLAGS) -o $@ $< $(RUNTIME) -T$(RUNTIME_DIR)/link.ld $(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) -D $@ > $@.dump diff --git a/software/tests/baremetal/fence/data.args b/software/tests/baremetal/fence/data.args deleted file mode 100644 index f52fe46db..000000000 --- a/software/tests/baremetal/fence/data.args +++ /dev/null @@ -1 +0,0 @@ ---variable=l2_data --size=12288 diff --git a/software/tests/baremetal/fence/main.c b/software/tests/baremetal/fence/main.c index 82f493c12..934cddfea 100644 --- a/software/tests/baremetal/fence/main.c +++ b/software/tests/baremetal/fence/main.c @@ -7,7 +7,7 @@ #include #include -#include "data.h" +#include "data_fence.h" #include "dma.h" #include "encoding.h" #include "mempool_dma_frontend.h" diff --git a/software/tests/baremetal/memcpy/data.args b/software/tests/baremetal/memcpy/data.args deleted file mode 100644 index 21fbc935a..000000000 --- a/software/tests/baremetal/memcpy/data.args +++ /dev/null @@ -1 +0,0 @@ ---variable=l2_data --size=2048 \ No newline at end of file diff --git a/software/tests/baremetal/memcpy/main.c b/software/tests/baremetal/memcpy/main.c index 4e07a9a30..6ca336cbf 100644 --- a/software/tests/baremetal/memcpy/main.c +++ b/software/tests/baremetal/memcpy/main.c @@ -7,7 +7,7 @@ #include #include -#include 
"data.h" +#include "data_memcpy.h" #include "dma.h" #include "encoding.h" #include "mempool_dma_frontend.h"