From e77933cf499ae9d35a6a866e9cf622a8422ddad0 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Mon, 26 Aug 2024 16:30:21 +0200 Subject: [PATCH] [software] Clean-up dotp app and align to other kernels --- software/apps/baremetal/dotp_i32/define.h | 55 ----- .../apps/baremetal/dotp_i32/dotp_parallel.h | 83 -------- .../baremetal/dotp_i32/dotp_parallel_local.h | 162 --------------- .../baremetal/dotp_i32/dotp_parallel_red0.h | 122 ----------- .../dotp_i32/dotp_parallel_redtree.h | 129 ------------ software/apps/baremetal/dotp_i32/main.c | 136 ++++-------- software/data/data_dotp_i32.h.tpl | 24 +++ software/data/generate_dotp.py | 77 +++++++ .../kernels/baremetal/mempool_dotp_i32p.h | 196 ++++++++++++++++++ .../baremetal/mempool_dotp_i32s.h} | 11 +- 10 files changed, 338 insertions(+), 657 deletions(-) delete mode 100644 software/apps/baremetal/dotp_i32/define.h delete mode 100644 software/apps/baremetal/dotp_i32/dotp_parallel.h delete mode 100644 software/apps/baremetal/dotp_i32/dotp_parallel_local.h delete mode 100644 software/apps/baremetal/dotp_i32/dotp_parallel_red0.h delete mode 100644 software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h create mode 100644 software/data/data_dotp_i32.h.tpl create mode 100644 software/data/generate_dotp.py create mode 100644 software/kernels/baremetal/mempool_dotp_i32p.h rename software/{apps/baremetal/dotp_i32/dotp_single.h => kernels/baremetal/mempool_dotp_i32s.h} (88%) diff --git a/software/apps/baremetal/dotp_i32/define.h b/software/apps/baremetal/dotp_i32/define.h deleted file mode 100644 index d2b069d21..000000000 --- a/software/apps/baremetal/dotp_i32/define.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#define LEN (1024) -#define N_PE (NUM_CORES) -#define N_BANK (NUM_CORES * 4) -#define N_BANK_PE (N_PE * 4) - -/* Enable log barriers */ -#define LOG_BARRIERS - -/* STEP core 0 reduction */ -#define STEP (256) -#define STEP_CORES (STEP / 4) - -////////////////////////////////// -/* SELECT ONE */ - -// #define SINGLE -// #define SINGLE_UNROLLED - -// #define PARALLEL -// #define PARALLEL_UNROLLED - -// #define PARALLEL_LOCAL -// #define LOCAL_UNROLLED - -// #define PARALLEL_RED0 -// #define PARALLEL_UNROLLED_RED0 - -// #define PARALLEL_REDTREE -// #define PARALLEL_UNROLLED_REDTREE - -////////////////////////////////// - -// Vectors for kernel computation -int32_t vector_a[LEN] __attribute__((aligned(LEN), section(".l1"))); -int32_t vector_b[LEN] __attribute__((aligned(LEN), section(".l1"))); - -#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \ - defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE) -int32_t sum[N_BANK] __attribute__((aligned(N_BANK), section(".l1"))); -#else -int32_t sum __attribute__((section(".l1"))); -#endif - -// Vectors for performance metrics -uint32_t volatile red_barrier[NUM_CORES * 4] - __attribute__((aligned(NUM_CORES * 4), section(".l1"))); -int32_t result __attribute__((section(".l1"))); -int32_t check __attribute__((section(".l1"))); -int volatile error __attribute__((section(".l1"))); diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel.h b/software/apps/baremetal/dotp_i32/dotp_parallel.h deleted file mode 100644 index b765f6987..000000000 --- a/software/apps/baremetal/dotp_i32/dotp_parallel.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* Parallel dot-product */ -void dotp_parallel(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, - uint32_t nPE) { - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t step = Len / nPE; - - register int32_t local_sum = 0; - register int32_t a, b; - for (uint32_t i = core_id * step; i < core_id * step + step; i++) { - a = in_a[i]; - b = in_b[i]; - local_sum += a * b; - } - mempool_stop_benchmark(); - mempool_start_benchmark(); - __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED); -#ifdef LOG_BARRIERS - mempool_log_barrier(2, core_id); - (void)num_cores; -#else - mempool_barrier(num_cores); -#endif -} - -/* Parallel dot-product */ -void dotp_parallel_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len, uint32_t nPE) { - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t step = Len / nPE; - uint32_t reminder = step % 4; - uint32_t i; - - register int32_t a0 = 0, b0 = 0, a1 = 0, b1 = 0, a2 = 0, b2 = 0, a3 = 0, - b3 = 0; - register int32_t local_sum0 = 0; - register int32_t local_sum1 = 0; - register int32_t local_sum2 = 0; - register int32_t local_sum3 = 0; - for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) { - a0 = in_a[i]; - b0 = in_b[i]; - a1 = in_a[i + 1]; - b1 = in_b[i + 1]; - a2 = in_a[i + 2]; - b2 = in_b[i + 2]; - a3 = in_a[i + 3]; - b3 = in_b[i + 3]; - local_sum0 += a0 * b0; - local_sum1 += a1 * b1; - local_sum2 += a2 * b2; - local_sum3 += a3 * b3; - } - i = core_id * step + step - reminder; - while (i < step) { - a0 = in_a[i]; - b0 = in_b[i]; - local_sum0 += a0 * b0; - i++; - } - local_sum0 += local_sum1; - local_sum2 += local_sum3; - local_sum0 += local_sum2; - mempool_barrier(num_cores); - - mempool_stop_benchmark(); - mempool_start_benchmark(); - __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED); -#ifdef 
LOG_BARRIERS - mempool_log_barrier(2, core_id); -#else - mempool_barrier(num_cores); -#endif -} diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_local.h b/software/apps/baremetal/dotp_i32/dotp_parallel_local.h deleted file mode 100644 index 950955832..000000000 --- a/software/apps/baremetal/dotp_i32/dotp_parallel_local.h +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* - Parallel dot-product with final reduction performed by multiple cores - using atomic-fetch and adds to a single memory location. - A) Parallelized workload - B) Atomic fetch and add to a single memory location - C) Barrier */ - -/*******************************************************/ -/** MULTI-CORE **/ -/*******************************************************/ - -/* Parallel dot-product */ -void dotp_parallel_local(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, - uint32_t nPE) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - - if (nPE == num_cores) { - register int32_t local_sum = 0; - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - local_sum += in_a[idx] * in_b[idx]; - local_sum += in_a[idx + 1] * in_b[idx + 1]; - local_sum += in_a[idx + 2] * in_b[idx + 2]; - local_sum += in_a[idx + 3] * in_b[idx + 3]; - idx += N_BANK; - } - if (core_id == (Len % N_BANK) / 4) { - while (idx < Len) { - local_sum += in_a[idx] * in_b[idx]; - idx++; - } - } - mempool_stop_benchmark(); - mempool_start_benchmark(); - __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED); -#ifdef LOG_BARRIERS - mempool_log_barrier(2, core_id); -#else - mempool_barrier(num_cores); -#endif - } else { - register int32_t local_sum = 0; - uint32_t idx = core_id * 4; - while 
(idx < idx_stop) { - local_sum += in_a[idx] * in_b[idx]; - local_sum += in_a[idx + 1] * in_b[idx + 1]; - local_sum += in_a[idx + 2] * in_b[idx + 2]; - local_sum += in_a[idx + 3] * in_b[idx + 3]; - idx += N_BANK_PE; - } - if (core_id == (Len % N_BANK_PE) / 4) { - while (idx < Len) { - local_sum += in_a[idx] * in_b[idx]; - idx++; - } - } - if (core_id < nPE) { - mempool_stop_benchmark(); - mempool_start_benchmark(); - } - __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED); -#ifdef LOG_BARRIERS - mempool_log_partial_barrier(2, core_id, nPE); -#else - mempool_barrier(num_cores); -#endif - } -} - -/* Parallel dot-product with loop unrolling */ -void dotp_parallel_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len, uint32_t nPE) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - register int32_t local_sum_1 = 0; - register int32_t local_sum_2 = 0; - register int32_t local_sum_3 = 0; - register int32_t local_sum_4 = 0; - - if (nPE == num_cores) { - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - int32_t in_a1 = in_a[idx]; - int32_t in_b1 = in_b[idx]; - int32_t in_a2 = in_a[idx + 1]; - int32_t in_b2 = in_b[idx + 1]; - int32_t in_a3 = in_a[idx + 2]; - int32_t in_b3 = in_b[idx + 2]; - int32_t in_a4 = in_a[idx + 3]; - int32_t in_b4 = in_b[idx + 3]; - local_sum_1 += in_a1 * in_b1; - local_sum_2 += in_a2 * in_b2; - local_sum_3 += in_a3 * in_b3; - local_sum_4 += in_a4 * in_b4; - idx += N_BANK; - } - if (core_id == ((Len % N_BANK) / 4)) { - while (idx < Len) { - local_sum_1 += in_a[idx] * in_b[idx]; - idx++; - } - } - local_sum_1 += local_sum_2; - local_sum_3 += local_sum_4; - local_sum_1 += local_sum_3; - mempool_stop_benchmark(); - mempool_start_benchmark(); - __atomic_fetch_add(&s[0], local_sum_1, __ATOMIC_RELAXED); -#ifdef LOG_BARRIERS - mempool_log_barrier(2, core_id); -#else - mempool_barrier(num_cores); 
-#endif - } else { - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - int32_t in_a1 = in_a[idx]; - int32_t in_b1 = in_b[idx]; - int32_t in_a2 = in_a[idx + 1]; - int32_t in_b2 = in_b[idx + 1]; - int32_t in_a3 = in_a[idx + 2]; - int32_t in_b3 = in_b[idx + 2]; - int32_t in_a4 = in_a[idx + 3]; - int32_t in_b4 = in_b[idx + 3]; - local_sum_1 += in_a1 * in_b1; - local_sum_2 += in_a2 * in_b2; - local_sum_3 += in_a3 * in_b3; - local_sum_4 += in_a4 * in_b4; - idx += N_BANK_PE; - } - if (core_id == ((Len % N_BANK_PE) / 4)) { - while (idx < Len) { - local_sum_1 += in_a[idx] * in_b[idx]; - idx++; - } - } - local_sum_1 += local_sum_2; - local_sum_3 += local_sum_4; - local_sum_1 += local_sum_3; - mempool_stop_benchmark(); - mempool_start_benchmark(); - __atomic_fetch_add(&s[0], local_sum_1, __ATOMIC_RELAXED); -#ifdef LOG_BARRIERS - mempool_log_partial_barrier(2, core_id, nPE); -#else - mempool_barrier(num_cores); -#endif - } -} diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h b/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h deleted file mode 100644 index 0ad166d41..000000000 --- a/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* - Parallel dot-product with atomic fetch and add towards local memory - locations and final reduction by a single core. The cores write in - memory banks separated by a "step". 
- A) Parallelized workload - B) Atomic fetch and add to local memory banks - C) Barrier - D) Final reduction by core 0 incorporated in a barrier */ - -/*******************************************************/ -/** MULTI-CORE **/ -/*******************************************************/ - -/* Parallel dot-product */ -void dotp_parallel_red0(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - int32_t local_sum = 0; - - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - local_sum += in_a[idx] * in_b[idx]; - local_sum += in_a[idx + 1] * in_b[idx + 1]; - local_sum += in_a[idx + 2] * in_b[idx + 2]; - local_sum += in_a[idx + 3] * in_b[idx + 3]; - idx += N_BANK; - } - if (core_id == (Len % N_BANK) / 4) { - while (idx < Len) { - local_sum += in_a[idx] * in_b[idx]; - idx++; - } - } - __atomic_fetch_add(&s[(core_id / STEP_CORES) * STEP], local_sum, - __ATOMIC_RELAXED); - mempool_stop_benchmark(); - - mempool_start_benchmark(); - if ((num_cores - 1) == - __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) { - __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED); - __sync_synchronize(); // Full memory barrier - uint32_t idx_red = 0; - local_sum = 0; - while (idx_red < N_BANK) { - local_sum += s[idx_red]; - idx_red += STEP; - } - s[0] = local_sum; - wake_up_all(); - } - mempool_wfi(); -} - -/* Parallel dot-product with loop unrolling */ -void dotp_parallel_unrolled4_red0(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - int32_t local_sum_1 = 0; - int32_t local_sum_2 = 0; - int32_t local_sum_3 = 0; - int32_t local_sum_4 = 0; - - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - int32_t 
in_a1 = in_a[idx]; - int32_t in_b1 = in_b[idx]; - int32_t in_a2 = in_a[idx + 1]; - int32_t in_b2 = in_b[idx + 1]; - int32_t in_a3 = in_a[idx + 2]; - int32_t in_b3 = in_b[idx + 2]; - int32_t in_a4 = in_a[idx + 3]; - int32_t in_b4 = in_b[idx + 3]; - local_sum_1 += in_a1 * in_b1; - local_sum_2 += in_a2 * in_b2; - local_sum_3 += in_a3 * in_b3; - local_sum_4 += in_a4 * in_b4; - idx += N_BANK; - } - if (core_id == ((Len % N_BANK) / 4)) { - while (idx < Len) { - local_sum_1 += in_a[idx] * in_b[idx]; - idx++; - } - } - local_sum_1 += local_sum_2; - local_sum_3 += local_sum_4; - local_sum_1 += local_sum_3; - __atomic_fetch_add(&s[(core_id / STEP_CORES) * STEP], local_sum_1, - __ATOMIC_RELAXED); - mempool_stop_benchmark(); - - mempool_start_benchmark(); - if ((num_cores - 1) == - __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) { - __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED); - __sync_synchronize(); // Full memory barrier - uint32_t idx_red = 0; - local_sum_1 = 0; - while (idx_red < N_BANK) { - local_sum_1 += s[idx_red]; - idx_red += STEP; - } - s[0] = local_sum_1; - wake_up_all(); - } - mempool_wfi(); -} diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h b/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h deleted file mode 100644 index 3659de0a3..000000000 --- a/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* - Parallel dot-product with atomic fetch and add towards local memory - locations and final reduction by a single core. The cores write in - memory banks separated by a "step". 
- A) Parallelized workload - B) Atomic fetch and add to local memory banks - C) Barrier - D) Final reduction by core 0 incorporated in a barrier */ - -/*******************************************************/ -/** MULTI-CORE **/ -/*******************************************************/ - -void mempool_log_reduction(int32_t *sum, uint32_t volatile step, - uint32_t core_id); - -/* Parallel dot-product */ -void dotp_parallel_redtree(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - - register int32_t local_sum = 0; - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - local_sum += in_a[idx] * in_b[idx]; - local_sum += in_a[idx + 1] * in_b[idx + 1]; - local_sum += in_a[idx + 2] * in_b[idx + 2]; - local_sum += in_a[idx + 3] * in_b[idx + 3]; - idx += N_BANK; - } - if (core_id == (Len % N_BANK) / 4) { - while (idx < Len) { - local_sum += in_a[idx] * in_b[idx]; - idx++; - } - } - s[core_id * 4] = local_sum; // Each core is storing locally - mempool_stop_benchmark(); - mempool_start_benchmark(); - mempool_log_reduction(s, 2, core_id); -} - -void dotp_parallel_redtree_unrolled(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { - - uint32_t const remainder = Len % 4; - uint32_t const idx_stop = Len - remainder; - uint32_t core_id = mempool_get_core_id(); - register int32_t local_sum_1 = 0; - register int32_t local_sum_2 = 0; - register int32_t local_sum_3 = 0; - register int32_t local_sum_4 = 0; - - uint32_t idx = core_id * 4; - while (idx < idx_stop) { - int32_t in_a1 = in_a[idx]; - int32_t in_b1 = in_b[idx]; - int32_t in_a2 = in_a[idx + 1]; - int32_t in_b2 = in_b[idx + 1]; - int32_t in_a3 = in_a[idx + 2]; - int32_t in_b3 = in_b[idx + 2]; - int32_t in_a4 = in_a[idx + 3]; - int32_t in_b4 = in_b[idx + 3]; - local_sum_1 += in_a1 * in_b1; - local_sum_2 += in_a2 * in_b2; - local_sum_3 += in_a3 * in_b3; - local_sum_4 += in_a4 * 
in_b4; - idx += N_BANK; - } - if (core_id == ((Len % N_BANK) / 4)) { - while (idx < Len) { - local_sum_1 += in_a[idx] * in_b[idx]; - idx++; - } - } - local_sum_1 += local_sum_2; - local_sum_3 += local_sum_4; - local_sum_1 += local_sum_3; - s[core_id * 4] = local_sum_1; // Each core is storing locally - mempool_stop_benchmark(); - mempool_start_benchmark(); - mempool_log_reduction(s, 2, core_id); -} - -void mempool_log_reduction(int32_t *sum, uint32_t volatile step, - uint32_t core_id) { - - uint32_t idx_sum, idx = (step * (core_id / step)) * 4; - uint32_t next_step, previous_step; - register int32_t local_sum; - uint32_t num_cores = mempool_get_core_count(); - - previous_step = step >> 1; - if ((step - previous_step) == - __atomic_fetch_add(&red_barrier[idx + previous_step - 1], previous_step, - __ATOMIC_RELAXED)) { - - local_sum = 0; - idx_sum = idx; - while (idx_sum < idx + step * 4) { - local_sum += sum[idx_sum]; - idx_sum += previous_step * 4; - } - sum[idx] = local_sum; - - next_step = step << 1; - __atomic_store_n(&red_barrier[idx + previous_step - 1], 0, - __ATOMIC_RELAXED); - if (num_cores == step) { - sum[0] = sum[idx]; - __sync_synchronize(); // Full memory barrier - wake_up_all(); - mempool_wfi(); - } else { - mempool_log_reduction(sum, next_step, core_id); - } - - } else - mempool_wfi(); -} diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c index f7cf7508f..da00a937e 100644 --- a/software/apps/baremetal/dotp_i32/main.c +++ b/software/apps/baremetal/dotp_i32/main.c @@ -8,132 +8,72 @@ #include #include +#include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include "define.h" +#include "data_dotp_i32.h" +#define NUM_BANKS (NUM_CORES * BANKING_FACTOR) +#define LOG_BARRIERS +// #define ATOMIC_REDUCTION +// #define SINGLE_CORE_REDUCTION +#define BINARY_REDUCTION -#include "dotp_parallel.h" -#include "dotp_parallel_local.h" -#include "dotp_parallel_red0.h" 
-#include "dotp_parallel_redtree.h" -#include "dotp_single.h" +// Vectors for kernel computation +int32_t l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); +int32_t l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); +uint32_t red_barrier[NUM_BANKS] + __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +int32_t sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); -void init_vectors(int32_t *in_a, int32_t *in_b, int32_t *s, int32_t *p_result, - int32_t *p_check, uint32_t Len) { - *p_result = 0; - *p_check = 0; - uint32_t j = 0; - uint32_t num_cores = mempool_get_core_count(); - while (j < Len) { - int32_t a = (int32_t)(j % num_cores); - int32_t b = (int32_t)(j % 4 + 3); - in_a[j] = a; - in_b[j] = b; - *p_check = *p_check + (int32_t)(a * b); - j++; - } -#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \ - defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE) - for (uint32_t k = 0; k < N_BANK; k++) { - s[k] = 0; - red_barrier[k] = 0; - } -#else - *s = 0; -#endif -} +#include "baremetal/mempool_dotp_i32p.h" +#include "baremetal/mempool_dotp_i32s.h" int main() { uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); uint32_t time_init, time_end; - // initialize synchronization variables mempool_barrier_init(core_id); + time_init = 0; + time_end = 0; if (core_id == 0) { - error = 0; - time_init = 0; - time_end = 0; -#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \ - defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE) - init_vectors(vector_a, vector_b, sum, &result, &check, LEN); -#else - init_vectors(vector_a, vector_b, &sum, &result, &check, LEN); -#endif + dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int32_t)); + dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int32_t)); + } + for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) { + sum[k] = 0; + red_barrier[k] = 0; } - mempool_barrier(num_cores); // wait until all cores have 
finished + mempool_barrier(num_cores); - // Kernel execution + // // SINGLE-CORE + // time_init = mempool_get_timer(); + // dotp_i32s_unrolled4(l1_A, l1_B, sum, LEN); + // time_end = mempool_get_timer(); - time_init = mempool_get_timer(); -#ifdef SINGLE - dotp_single(vector_a, vector_b, &sum, LEN); -#elif defined(SINGLE_UNROLLED) - dotp_single_unrolled4(vector_a, vector_b, &sum, LEN); -#endif - time_end = mempool_get_timer(); + // // PARALLEL + // time_init = mempool_get_timer(); + // dotp_i32p(l1_A, l1_B, sum, LEN, num_cores); + // time_end = mempool_get_timer(); + // PARALLEL, LOCAL ACCESSES time_init = mempool_get_timer(); - mempool_start_benchmark(); -/* A) Parallelized workload - B) Atomic fetch and add to a single memory location - C) Barrier */ -#ifdef PARALLEL - dotp_parallel(vector_a, vector_b, &sum, LEN, N_PE); -#elif defined(PARALLEL_UNROLLED) - dotp_parallel_unrolled4(vector_a, vector_b, &sum, LEN, N_PE); -/* A) Parallelized workload - B) Atomic fetch and add to local memory banks - C) Barrier - D) Final reduction by core 0 incorporated in a barrier */ -#elif defined(PARALLEL_RED0) - dotp_parallel_red0(vector_a, vector_b, sum, LEN, N_PE); -#elif defined(PARALLEL_UNROLLED_RED0) - dotp_parallel_unrolled4_red0(vector_a, vector_b, sum, LEN, N_PE); -/* A) Parallelized workload - B) Nested set of barriers: reduction is performed in a logarithmic tree. 
*/ -#elif defined(PARALLEL_REDTREE) - dotp_parallel_redtree(vector_a, vector_b, sum, LEN, N_PE); -#elif defined(PARALLEL_UNROLLED_REDTREE) - dotp_parallel_redtree_unrolled(vector_a, vector_b, sum, LEN, N_PE); -#endif - mempool_stop_benchmark(); + dotp_i32p_local_unrolled4(l1_A, l1_B, sum, LEN); time_end = mempool_get_timer(); - /* A) Parallelized workload - B) Atomic fetch and add to a single memory location - C) Barrier */ - if (core_id < N_PE) { - time_init = mempool_get_timer(); - mempool_start_benchmark(); -#ifdef PARALLEL_LOCAL - dotp_parallel_local(vector_a, vector_b, &sum, LEN, N_PE); -#elif defined(LOCAL_UNROLLED) - dotp_parallel_local_unrolled4(vector_a, vector_b, &sum, LEN, N_PE); -#endif - mempool_stop_benchmark(); - time_end = mempool_get_timer(); - } - - mempool_barrier(num_cores); // Check results + mempool_barrier(num_cores); if (core_id == 0) { uint32_t clock_cycles = (time_end - time_init); -#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \ - defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE) - result = sum[0]; -#else - result = sum; -#endif printf("\nKernel execution takes %d clock cycles\n", clock_cycles); - printf("Result ==> %d\n", result); - printf("Check ==> %d\n\n", check); + printf("Result ==> %d\n", sum[0]); + printf("Check ==> %d\n\n", l2_C); } mempool_barrier(num_cores); - return error; + return 0; } diff --git a/software/data/data_dotp_i32.h.tpl b/software/data/data_dotp_i32.h.tpl new file mode 100644 index 000000000..d76d92a24 --- /dev/null +++ b/software/data/data_dotp_i32.h.tpl @@ -0,0 +1,24 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +\ +<% def array_to_cstr(array): + out = '{' + i = 0 + out += '\n' + for a in array: + out += '{}, '.format(a) + i += 1 + if i % 8 == 0: + out += '\n' + out = out[:-2] + '}' + return out +%> \ + +#define LEN (${Len}) + +int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)}; + +int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)}; + +int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = ${C}; diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py new file mode 100644 index 000000000..6bacf2488 --- /dev/null +++ b/software/data/generate_dotp.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +# Copyright 2022 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 + +# This script generates data for the int32 dot-product (dotp_i32) kernel. +# Author: Marco Bertuletti + +import numpy as np +import argparse +import pathlib +from mako.template import Template + + +def generate_dotp_i32(Len): + + # Create input vectors + MAX = 2**7 - 1 + A = np.random.randint(-MAX, MAX - 1, size=Len) + B = np.random.randint(-MAX, MAX - 1, size=Len) + C = np.dot(A, B) + return A, B, C + +################## +# compute_result # +################## + + +def gen_data_header_file(outdir: pathlib.Path.cwd(), + tpl: pathlib.Path.cwd(), **kwargs): + + file = outdir / f"{kwargs['name']}.h" + + print(tpl, outdir, kwargs['name']) + + template = Template(filename=str(tpl)) + with file.open('w') as f: + f.write(template.render(**kwargs)) + + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for kernels') + parser.add_argument( + "-o", + "--outdir", + type=pathlib.Path, + default=pathlib.Path(__file__).parent.absolute(), + required=False, + help='Select out directory of generated data files' + ) + parser.add_argument( + "-n", + "--length", + type=int, 
+ required=False, + default=4096, + help='First dimension.' + ) + + args = parser.parse_args() + Len = args.length + + A, B, C = generate_dotp_i32(Len) + tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_i32.h.tpl" + kwargs = { + 'name': 'data_dotp_i32', + 'A': A, + 'B': B, + 'C': C, + 'Len': Len} + gen_data_header_file(args.outdir, tpl, **kwargs) + + +if __name__ == "__main__": + main() diff --git a/software/kernels/baremetal/mempool_dotp_i32p.h b/software/kernels/baremetal/mempool_dotp_i32p.h new file mode 100644 index 000000000..26fbe03e9 --- /dev/null +++ b/software/kernels/baremetal/mempool_dotp_i32p.h @@ -0,0 +1,196 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +/* Parallel dot-product */ +void dotp_i32p(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, + uint32_t nPE) { + + uint32_t core_id = mempool_get_core_id(); + uint32_t step = Len / nPE; + register int32_t local_sum = 0; + register int32_t a, b; + for (uint32_t i = core_id * step; i < core_id * step + step; i++) { + a = in_a[i]; + b = in_b[i]; + local_sum += a * b; + } + __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED); +#ifdef LOG_BARRIERS + mempool_log_barrier(2, core_id); +#else + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier(num_cores); +#endif + return; +} + +/* Parallel dot-product with loop unrolling*/ +void dotp_i32p_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, + uint32_t nPE) { + + uint32_t core_id = mempool_get_core_id(); + uint32_t step = Len / nPE; + uint32_t reminder = step % 4; + uint32_t i; + + register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0; + register int32_t b2 = 0, b1 = 0, b0 = 0, b3 = 0; + register int32_t local_sum0 = 0; + register int32_t local_sum1 = 0; + register int32_t local_sum2 = 0; + register int32_t local_sum3 = 0; + for (i = core_id * 
step; i < (core_id * step + step) - reminder; i += 4) { + a0 = in_a[i]; + b0 = in_b[i]; + a1 = in_a[i + 1]; + b1 = in_b[i + 1]; + a2 = in_a[i + 2]; + b2 = in_b[i + 2]; + a3 = in_a[i + 3]; + b3 = in_b[i + 3]; + local_sum0 += a0 * b0; + local_sum1 += a1 * b1; + local_sum2 += a2 * b2; + local_sum3 += a3 * b3; + } + i = core_id * step + step - reminder; + while (i < step) { + a0 = in_a[i]; + b0 = in_b[i]; + local_sum0 += a0 * b0; + i++; + } + local_sum0 += local_sum1; + local_sum2 += local_sum3; + local_sum0 += local_sum2; + __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED); +#ifdef LOG_BARRIERS + mempool_log_barrier(2, core_id); +#else + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier(num_cores); +#endif + return; +} + +/* Binary tree reduction */ +void mempool_binary_reduction(int32_t *sum, uint32_t core_id, + uint32_t num_cores) { + + uint32_t idx, step = 2, previous_step = 1; + while (num_cores > 1) { + idx = (step * (core_id / step)) * BANKING_FACTOR; + // dump_prova(idx); + if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1, + __ATOMIC_RELAXED)) { + + // Reduction + sum[idx] += sum[idx + previous_step * BANKING_FACTOR]; + + // Next level of binary tree + __atomic_store_n(&red_barrier[idx + previous_step - 1], 0, + __ATOMIC_RELAXED); + num_cores = num_cores / 2; + previous_step = step; + step = step * 2; + + } else { + // Goes to sleep + break; + } + } + + // Last core wakes everyone + if (num_cores == 1) { + __sync_synchronize(); + wake_up_all(); + } + mempool_wfi(); + + return; +} + +/* Parallel dot-product with loop unrolling */ +/* Loads and stores only in local memory */ +#define NUM_CORES_RED (16) +void dotp_i32p_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, + uint32_t Len) { + + uint32_t core_id = mempool_get_core_id(); + uint32_t const remainder = Len % 4; + uint32_t const idx_stop = Len - remainder; + + register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0; + register int32_t b2 = 0, b1 = 0, b0 = 0, b3 = 0; + 
register int32_t local_sum0 = 0; + register int32_t local_sum1 = 0; + register int32_t local_sum2 = 0; + register int32_t local_sum3 = 0; + + for (uint32_t i = core_id * 4; i < idx_stop; i += NUM_BANKS) { + a0 = in_a[i]; + b0 = in_b[i]; + a1 = in_a[i + 1]; + b1 = in_b[i + 1]; + a2 = in_a[i + 2]; + b2 = in_b[i + 2]; + a3 = in_a[i + 3]; + b3 = in_b[i + 3]; + local_sum0 += a0 * b0; + local_sum1 += a1 * b1; + local_sum2 += a2 * b2; + local_sum3 += a3 * b3; + } + if (core_id == ((Len % NUM_BANKS) / 4)) { + for (uint32_t i = Len - remainder; i < Len; i++) { + a0 = in_a[i]; + b0 = in_b[i]; + local_sum0 += a0 * b0; + } + } + local_sum0 += local_sum1; + local_sum2 += local_sum3; + local_sum0 += local_sum2; + +// A) Cores atomically fetch and add in sum variable +// B) A global barrier synchronizes all of them +#if defined(ATOMIC_REDUCTION) + __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED); + mempool_log_barrier(2, core_id); + +// A) Groups of NUM_CORES_RED cores atomically fetch and add in sum array +// B) The last core to the reduction barrier sums the partial reductions +#elif defined(SINGLE_CORE_REDUCTION) + uint32_t num_cores = mempool_get_core_count(); + __atomic_fetch_add( + &s[BANKING_FACTOR * NUM_CORES_RED * (core_id / NUM_CORES_RED)], + local_sum0, __ATOMIC_RELAXED); + if ((num_cores - 1) == + __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) { + __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + uint32_t idx_red = 0; + local_sum0 = 0; + while (idx_red < NUM_BANKS) { + local_sum0 += s[idx_red]; + idx_red += BANKING_FACTOR * NUM_CORES_RED; + } + s[0] = local_sum0; + wake_up_all(); + } + mempool_wfi(); + +// A) Cores store locally in sum array +// B) Partial sums are reduced logarithmically +#elif defined(BINARY_REDUCTION) + uint32_t num_cores = mempool_get_core_count(); + s[core_id * 4] = local_sum0; + mempool_binary_reduction(s, core_id, num_cores); + +#endif + + return; +} diff --git 
a/software/apps/baremetal/dotp_i32/dotp_single.h b/software/kernels/baremetal/mempool_dotp_i32s.h similarity index 88% rename from software/apps/baremetal/dotp_i32/dotp_single.h rename to software/kernels/baremetal/mempool_dotp_i32s.h index 58797ee80..dd562debb 100644 --- a/software/apps/baremetal/dotp_i32/dotp_single.h +++ b/software/kernels/baremetal/mempool_dotp_i32s.h @@ -5,12 +5,11 @@ // Author: Marco Bertuletti, ETH Zurich /* Single-core dot-product */ -void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { +void dotp_i32s(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); if (core_id == 0) { - mempool_start_benchmark(); // Kernel execution register int32_t local_sum = 0; @@ -18,7 +17,6 @@ void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { do { local_sum += ((*in_a++) * (*in_b++)); } while (in_a < end); - *s = local_sum; mempool_stop_benchmark(); } @@ -26,17 +24,15 @@ void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { } /* Single-core dot-product unrolled4 */ -void dotp_single_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { +void dotp_i32s_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, + uint32_t Len) { uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); if (core_id == 0) { - mempool_start_benchmark(); uint32_t reminder = Len % 4; uint32_t i = 0; - int32_t a0 = 0, b0 = 0, a1 = 0, b1 = 0, a2 = 0, b2 = 0, a3 = 0, b3 = 0; register int32_t local_sum_1 = 0; register int32_t local_sum_2 = 0; @@ -70,5 +66,4 @@ void dotp_single_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, mempool_stop_benchmark(); } mempool_barrier(num_cores); - // mempool_log_barrier(2, core_id); }