From c346c1c800e91afe9dbca06db776bcd82e52d265 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Fri, 30 Aug 2024 12:25:17 +0200 Subject: [PATCH] [software] Add f32 and f16 dotp app --- software/apps/baremetal/dotp_f16/main.c | 78 +++++ software/apps/baremetal/dotp_f32/main.c | 76 +++++ software/apps/baremetal/dotp_i32/main.c | 3 +- software/data/data_dotp_f16.h.tpl | 24 ++ software/data/data_dotp_f32.h.tpl | 24 ++ software/data/generate_dotp.py | 38 +++ software/kernels/baremetal/mempool_dotp_f16.h | 222 +++++++++++++++ software/kernels/baremetal/mempool_dotp_f32.h | 267 ++++++++++++++++++ ...mempool_dotp_i32p.h => mempool_dotp_i32.h} | 170 +++++++---- .../kernels/baremetal/mempool_dotp_i32s.h | 69 ----- 10 files changed, 848 insertions(+), 123 deletions(-) create mode 100644 software/apps/baremetal/dotp_f16/main.c create mode 100644 software/apps/baremetal/dotp_f32/main.c create mode 100644 software/data/data_dotp_f16.h.tpl create mode 100644 software/data/data_dotp_f32.h.tpl create mode 100644 software/kernels/baremetal/mempool_dotp_f16.h create mode 100644 software/kernels/baremetal/mempool_dotp_f32.h rename software/kernels/baremetal/{mempool_dotp_i32p.h => mempool_dotp_i32.h} (63%) delete mode 100644 software/kernels/baremetal/mempool_dotp_i32s.h diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c new file mode 100644 index 000000000..36a7f8f99 --- /dev/null +++ b/software/apps/baremetal/dotp_f16/main.c @@ -0,0 +1,78 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "data_dotp_f16.h"
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+// #define SINGLE_CORE_REDUCTION
+#define BINARY_REDUCTION
+
+// L1 operand vectors, reduction barrier flags and partial-sum buffer
+__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+uint32_t red_barrier[NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 sum[2 * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+
+#include "baremetal/mempool_dotp_f16.h"
+
+int main() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t time_init, time_end;
+  mempool_barrier_init(core_id);
+
+  time_init = 0;
+  time_end = 0;
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int16_t));
+  }
+  for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) {
+    sum[k] = 0;
+    red_barrier[k] = 0;
+  }
+  mempool_barrier(num_cores);
+
+  // // SINGLE-CORE
+  // time_init = mempool_get_timer();
+  // dotp_f16s(l1_A, l1_B, sum, LEN);
+  // // dotp_f16s_unrolled4(l1_A, l1_B, sum, LEN);
+  // time_end = mempool_get_timer();
+
+  // // PARALLEL
+  // time_init = mempool_get_timer();
+  // dotp_f16vecp_unrolled4(l1_A, l1_B, sum, LEN, num_cores);
+  // // dotp_f16p(l1_A, l1_B, sum, LEN, num_cores);
+  // time_end = mempool_get_timer();
+
+  // PARALLEL, LOCAL ACCESSES
+  time_init = mempool_get_timer();
+  dotp_f16vecp_local_unrolled4(l1_A, l1_B, sum, LEN);
+  time_end = mempool_get_timer();
+
+  // Print cycle count and the raw 16-bit patterns of result and reference
+  mempool_barrier(num_cores);
+  if (core_id == 0) {
+    uint32_t clock_cycles = (time_end - time_init);
+    printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
+    printf("Result ==> %x\n", (uint32_t)(*(uint16_t *)&sum[0]));
+    printf("Check ==> %x\n\n", (uint32_t)(*(uint16_t *)&l2_C));
+  }
+  mempool_barrier(num_cores);
+
+  return 0;
+}
diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c
new file mode 100644
index 000000000..8c3c7e8cd
--- /dev/null
+++ b/software/apps/baremetal/dotp_f32/main.c
@@ -0,0 +1,76 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <limits.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "data_dotp_f32.h"
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+// #define SINGLE_CORE_REDUCTION
+#define BINARY_REDUCTION
+
+// L1 operand vectors, reduction barrier flags and partial-sum buffer
+float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+uint32_t red_barrier[NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+
+#include "baremetal/mempool_dotp_f32.h"
+
+int main() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t time_init, time_end;
+  mempool_barrier_init(core_id);
+
+  time_init = 0;
+  time_end = 0;
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int32_t));
+  }
+  for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) {
+    sum[k] = 0;
+    red_barrier[k] = 0;
+  }
+  mempool_barrier(num_cores);
+
+  // // SINGLE-CORE
+  // time_init = mempool_get_timer();
+  // dotp_f32s_unrolled4(l1_A, l1_B, sum, LEN);
+  // time_end = mempool_get_timer();
+
+  // // PARALLEL
+  // time_init = mempool_get_timer();
+  // dotp_f32p(l1_A, l1_B, sum, LEN, num_cores);
+  // time_end = mempool_get_timer();
+
+  // PARALLEL, LOCAL ACCESSES
+  time_init = mempool_get_timer();
+  dotp_f32p_local_unrolled4(l1_A, l1_B, sum, LEN);
+  time_end = mempool_get_timer();
+
+  // Print cycle count and the raw 32-bit patterns of result and reference
+  mempool_barrier(num_cores);
+  if (core_id == 0) {
+    uint32_t clock_cycles = (time_end - time_init);
+    printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
+    printf("Result ==> %x\n", *(uint32_t *)&sum[0]);
+    printf("Check ==> %x\n\n", *(uint32_t *)&l2_C);
+  }
+  mempool_barrier(num_cores);
+
+  return 0;
+}
diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c
index da00a937e..441c98355 100644
--- a/software/apps/baremetal/dotp_i32/main.c
+++ b/software/apps/baremetal/dotp_i32/main.c
@@ -28,8 +28,7 @@ uint32_t red_barrier[NUM_BANKS]
     __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 int32_t sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
-#include "baremetal/mempool_dotp_i32p.h"
-#include "baremetal/mempool_dotp_i32s.h"
+#include "baremetal/mempool_dotp_i32.h"
 
 int main() {
 
diff --git a/software/data/data_dotp_f16.h.tpl b/software/data/data_dotp_f16.h.tpl
new file mode 100644
index 000000000..f7cacaed3
--- /dev/null
+++ b/software/data/data_dotp_f16.h.tpl
@@ -0,0 +1,24 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 +\ +<% def array_to_cstr(array): + out = '{' + i = 0 + out += '\n' + for a in array: + out += '(__fp16){:.4f}, '.format(a) + i += 1 + if i % 8 == 0: + out += '\n' + out = out[:-2] + '}' + return out +%> \ + +#define LEN (${Len}) + +__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)}; + +__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)}; + +__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = (__fp16)${C}f; diff --git a/software/data/data_dotp_f32.h.tpl b/software/data/data_dotp_f32.h.tpl new file mode 100644 index 000000000..3af0fbe66 --- /dev/null +++ b/software/data/data_dotp_f32.h.tpl @@ -0,0 +1,24 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +\ +<% def array_to_cstr(array): + out = '{' + i = 0 + out += '\n' + for a in array: + out += '{}f, '.format(a) + i += 1 + if i % 8 == 0: + out += '\n' + out = out[:-2] + '}' + return out +%> \ + +#define LEN (${Len}) + +float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)}; + +float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)}; + +float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = ${C}f; diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py index 6bacf2488..66fc95bcc 100644 --- a/software/data/generate_dotp.py +++ b/software/data/generate_dotp.py @@ -22,6 +22,24 @@ def generate_dotp_i32(Len): C = np.dot(A, B) return A, B, C + +def generate_dotp_f32(Len): + + # Create matrix + A = np.random.rand(Len).astype(np.float32) + B = np.random.rand(Len).astype(np.float32) + C = (np.dot(A, B)).astype(np.float32) + return A, B, C + + +def generate_dotp_f16(Len): + + # Create matrix + A = 
np.random.rand(Len).astype(np.float16) + B = np.random.rand(Len).astype(np.float16) + C = (np.dot(A, B)).astype(np.float16) + return A, B, C + ################## # compute_result # ################## @@ -72,6 +90,26 @@ def main(): 'Len': Len} gen_data_header_file(args.outdir, tpl, **kwargs) + A, B, C = generate_dotp_f32(Len) + tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_f32.h.tpl" + kwargs = { + 'name': 'data_dotp_f32', + 'A': A, + 'B': B, + 'C': C, + 'Len': Len} + gen_data_header_file(args.outdir, tpl, **kwargs) + + A, B, C = generate_dotp_f16(Len) + tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_f16.h.tpl" + kwargs = { + 'name': 'data_dotp_f16', + 'A': A, + 'B': B, + 'C': C, + 'Len': Len} + gen_data_header_file(args.outdir, tpl, **kwargs) + if __name__ == "__main__": main() diff --git a/software/kernels/baremetal/mempool_dotp_f16.h b/software/kernels/baremetal/mempool_dotp_f16.h new file mode 100644 index 000000000..17d13df24 --- /dev/null +++ b/software/kernels/baremetal/mempool_dotp_f16.h @@ -0,0 +1,222 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#pragma once
+#include "builtins_v2.h"
+
+#define DOTPF16VEC_UNROLLED4_LOOP                                              \
+  {                                                                            \
+    a01 = (*(v2h *)&in_a[i]);                                                  \
+    a23 = (*(v2h *)&in_a[i + 2]);                                              \
+    b01 = (*(v2h *)&in_b[i]);                                                  \
+    b23 = (*(v2h *)&in_b[i + 2]);                                              \
+    asm volatile(                                                              \
+        "vfdotpex.s.h %[local_sum0], %[a01], %[b01];"                          \
+        "vfdotpex.s.h %[local_sum1], %[a23], %[b23];"                          \
+        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)       \
+        : [a01] "r"(a01), [a23] "r"(a23), [b01] "r"(b01), [b23] "r"(b23));     \
+  }
+
+/* Single-core reduction: last core to arrive adds up all per-core partials */
+void mempool_reduction_f16(__fp16 *sum, uint32_t num_cores) {
+
+  // The last core to the reduction barrier sums the partial reductions
+  if ((num_cores - 1) ==
+      __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
+
+    // Partials live at sum[2 * core * BANKING_FACTOR]: scan all 2 * NUM_BANKS
+    uint32_t idx_red = 0;
+    __fp16 local_sum = (__fp16)0.0f;
+    while (idx_red < 2 * NUM_BANKS) {
+      asm volatile("fadd.h %0, %0, %1;" : "+&r"(local_sum) : "r"(sum[idx_red]));
+      idx_red += 2 * BANKING_FACTOR;
+    }
+    sum[0] = local_sum;
+
+    __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Binary-tree reduction of the per-core partials; the total ends in sum[0] */
+void mempool_binary_reduction_f16(__fp16 *sum, uint32_t core_id,
+                                  uint32_t num_cores) {
+
+  uint32_t idx, step = 2, previous_step = 1;
+  while (num_cores > 1) {
+    idx = (step * (core_id / step)) * BANKING_FACTOR;
+    // dump_prova(idx);
+    if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1,
+                           __ATOMIC_RELAXED)) {
+
+      // Second core of the pair to arrive folds the upper partial into the lower
+      __fp16 add = sum[2 * (idx + previous_step * BANKING_FACTOR)];
+      asm volatile("fadd.h %0, %0, %1;" : "+&r"(sum[2 * idx]) : "r"(add));
+
+      // Next level of binary tree
+      __atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
+                       __ATOMIC_RELAXED);
+      num_cores = num_cores / 2;
+      previous_step = step;
+      step = step * 2;
+
+    } else {
+      // Goes to sleep
+      break;
+    }
+  }
+
+  // Last core wakes everyone
+  if (num_cores == 1) {
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Single-core dot-product, run by core 0 only; Len must be at least 1 */
+void dotp_f16s(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    // Kernel execution
+    __fp16 local_sum = (__fp16)0.0f;
+    __fp16 *end = in_a + Len;
+    do {
+      asm volatile("fmadd.h %0, %1, %2, %0;"
+                   : "+&r"(local_sum)
+                   : "r"(*in_a), "r"(*in_b));
+      in_a++;
+      in_b++;
+    } while (in_a < end);
+    s[0] = local_sum;
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+
+  return;
+}
+
+/* Single-core dot-product, 4 elements per step; Len must be a multiple of 4 */
+void dotp_f16s_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    uint32_t i = 0;
+
+    v2h a01, a23;
+    v2h b01, b23;
+    float local_sum0 = 0.0f;
+    float local_sum1 = 0.0f;
+
+    for (i = 0; i < Len; i += 4) {
+      DOTPF16VEC_UNROLLED4_LOOP;
+    }
+    // Add the two f32 accumulators, then convert the total to f16
+    asm volatile(
+        "fadd.s %[local_sum0], %[local_sum0], %[local_sum1];"
+        "fcvt.h.s %[local_sum0], %[local_sum0];"
+        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)
+        :);
+    s[0] = *(__fp16 *)&local_sum0;
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+
+  return;
+}
+
+/* Parallel dot-product: each core handles Len / nPE contiguous elements */
+void dotp_f16p(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len,
+               uint32_t nPE) {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  __fp16 local_sum = (__fp16)0.0f;
+  __fp16 a, b;
+  for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
+    a = in_a[i];
+    b = in_b[i];
+    asm volatile("fmadd.h %0, %1, %2, %0;" : "+&r"(local_sum) : "r"(a), "r"(b));
+  }
+  s[2 * core_id * BANKING_FACTOR] = local_sum;
+
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f16(s, num_cores);
+
+  return;
+}
+
+/* Parallel dot-product, unrolled by 4; Len / nPE must be a multiple of 4 */
+void dotp_f16vecp_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len,
+                            uint32_t nPE) {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  uint32_t i;
+
+  v2h a01, a23;
+  v2h b01, b23;
+  float local_sum0 = 0.0f;
+  float local_sum1 = 0.0f;
+
+  for (i = core_id * step; i < core_id * step + step; i += 4) {
+    DOTPF16VEC_UNROLLED4_LOOP;
+  }
+  asm volatile("fadd.s %[local_sum0], %[local_sum0], %[local_sum1];"
+               "fcvt.h.s %[local_sum0], %[local_sum0];"
+               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)
+               :);
+  s[2 * core_id * BANKING_FACTOR] = *(__fp16 *)&local_sum0;
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f16(s, num_cores);
+
+  return;
+}
+
+/* Parallel dot-product with loop unrolling */
+/* Loads and stores only in local memory */
+void dotp_f16vecp_local_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s,
+                                  uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+
+  v2h a01, a23;
+  v2h b01, b23;
+  float local_sum0 = 0.0f;
+  float local_sum1 = 0.0f;
+  for (uint32_t i = core_id * BANKING_FACTOR; i < Len; i += NUM_BANKS) {
+    DOTPF16VEC_UNROLLED4_LOOP;
+  }
+  asm volatile("fadd.s %[local_sum0], %[local_sum0], %[local_sum1];"
+               "fcvt.h.s %[local_sum0], %[local_sum0];"
+               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)
+               :);
+  s[2 * core_id * BANKING_FACTOR] = *(__fp16 *)&local_sum0;
+
+// The last core to the reduction barrier sums the partial reductions
+#if defined(SINGLE_CORE_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f16(s, num_cores);
+// A) Cores store locally in sum array
+// B) Partial sums are reduced logarithmically
+#elif defined(BINARY_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_binary_reduction_f16(s, core_id, num_cores);
+#endif
+
+  return;
+}
diff --git a/software/kernels/baremetal/mempool_dotp_f32.h b/software/kernels/baremetal/mempool_dotp_f32.h
new file mode
100644
index 000000000..58fa0e9d5
--- /dev/null
+++ b/software/kernels/baremetal/mempool_dotp_f32.h
@@ -0,0 +1,267 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#define DOTPF32_UNROLLED4_LOOP                                                 \
+  {                                                                            \
+    a0 = in_a[i];                                                              \
+    b0 = in_b[i];                                                              \
+    a1 = in_a[i + 1];                                                          \
+    b1 = in_b[i + 1];                                                          \
+    a2 = in_a[i + 2];                                                          \
+    b2 = in_b[i + 2];                                                          \
+    a3 = in_a[i + 3];                                                          \
+    b3 = in_b[i + 3];                                                          \
+    asm volatile(                                                              \
+        "fmadd.s %[local_sum0], %[a0], %[b0], %[local_sum0];"                  \
+        "fmadd.s %[local_sum1], %[a1], %[b1], %[local_sum1];"                  \
+        "fmadd.s %[local_sum2], %[a2], %[b2], %[local_sum2];"                  \
+        "fmadd.s %[local_sum3], %[a3], %[b3], %[local_sum3];"                  \
+        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),      \
+          [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)       \
+        : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3),              \
+          [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3));             \
+  }
+
+/* Single-core reduction: last core to arrive adds up all per-core partials */
+void mempool_reduction_f32(float *sum, uint32_t num_cores) {
+
+  // The last core to the reduction barrier sums the partial reductions
+  if ((num_cores - 1) ==
+      __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
+
+    // One partial per core, read at every BANKING_FACTOR-th entry of sum
+    uint32_t idx_red = 0;
+    float local_sum = 0.0f;
+    while (idx_red < NUM_BANKS) {
+      asm volatile("fadd.s %0, %0, %1;" : "+&r"(local_sum) : "r"(sum[idx_red]));
+      idx_red += BANKING_FACTOR;
+    }
+    sum[0] = local_sum;
+
+    __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Binary-tree reduction of the per-core partials; the total ends in sum[0] */
+void mempool_binary_reduction_f32(float *sum, uint32_t core_id,
+                                  uint32_t num_cores) {
+
+  uint32_t idx, step = 2, previous_step = 1;
+  while (num_cores > 1) {
+    idx = (step * (core_id / step)) * BANKING_FACTOR;
+    // dump_prova(idx);
+    if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1,
+                           __ATOMIC_RELAXED)) {
+
+      // Second core of the pair to arrive folds the upper partial into the lower
+      float add = sum[idx + previous_step * BANKING_FACTOR];
+      asm volatile("fadd.s %0, %0, %1;" : "+&r"(sum[idx]) : "r"(add));
+
+      // Next level of binary tree
+      __atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
+                       __ATOMIC_RELAXED);
+      num_cores = num_cores / 2;
+      previous_step = step;
+      step = step * 2;
+
+    } else {
+      // Goes to sleep
+      break;
+    }
+  }
+
+  // Last core wakes everyone
+  if (num_cores == 1) {
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Single-core dot-product, run by core 0 only; Len must be at least 1 */
+void dotp_f32s(float *in_a, float *in_b, float *s, uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    // Kernel execution
+    register float local_sum = 0;
+    float *end = in_a + Len;
+    do {
+      asm volatile("fmadd.s %0, %1, %2, %0;"
+                   : "+&r"(local_sum)
+                   : "r"(*in_a), "r"(*in_b));
+      in_a++;
+      in_b++;
+    } while (in_a < end);
+    *s = local_sum;
+    mempool_stop_benchmark();
+  }
+
+  return;
+}
+
+/* Single-core dot-product, 4 elements per step plus a scalar tail loop */
+void dotp_f32s_unrolled4(float *in_a, float *in_b, float *s, uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    uint32_t reminder = Len % 4;
+    uint32_t i = 0;
+
+    register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+    register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+    register float local_sum0 = 0.0f;
+    register float local_sum1 = 0.0f;
+    register float local_sum2 = 0.0f;
+    register float local_sum3 = 0.0f;
+
+    for (i = 0; i < (Len - reminder); i += 4) {
+      DOTPF32_UNROLLED4_LOOP;
+    }
+    while (i < Len) {
+      a0 = in_a[i];
+      b0 = in_b[i];
+      asm volatile("fmadd.s %0, %1, %2, %0;"
+                   : "+&r"(local_sum0)
+                   : "r"(a0), "r"(b0));
+      i++;
+    }
+    // Fold the four accumulators into local_sum0
+    asm volatile(
+        "fadd.s %[local_sum0], %[local_sum0], %[local_sum1];"
+        "fadd.s %[local_sum2], %[local_sum2], %[local_sum3];"
+        "fadd.s %[local_sum0], %[local_sum0], %[local_sum2];"
+        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),
+          [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)
+        :);
+    *s = local_sum0;
+    mempool_stop_benchmark();
+  }
+
+  return;
+}
+
+/* Parallel dot-product: each core handles Len / nPE contiguous elements */
+void dotp_f32p(float *in_a, float *in_b, float *s, uint32_t Len, uint32_t nPE) {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  register float local_sum = 0;
+  register float a, b;
+  for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
+    a = in_a[i];
+    b = in_b[i];
+    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(local_sum) : "r"(a), "r"(b));
+  }
+  s[core_id * BANKING_FACTOR] = local_sum;
+
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f32(s, num_cores);
+
+  return;
+}
+
+/* Parallel dot-product, unrolled by 4, with a scalar tail per core chunk */
+void dotp_f32p_unrolled4(float *in_a, float *in_b, float *s, uint32_t Len,
+                         uint32_t nPE) {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  uint32_t reminder = step % 4;
+  uint32_t i;
+
+  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+  register float local_sum0 = 0.0f;
+  register float local_sum1 = 0.0f;
+  register float local_sum2 = 0.0f;
+  register float local_sum3 = 0.0f;
+
+  for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) {
+    DOTPF32_UNROLLED4_LOOP;
+  }
+  i = core_id * step + step - reminder;
+  while (i < core_id * step + step) {
+    a0 = in_a[i];
+    b0 = in_b[i];
+    asm volatile("fmadd.s %0, %1, %2, %0;"
+                 : "+&r"(local_sum0)
+                 : "r"(a0), "r"(b0));
+    i++;
+  }
+  asm volatile("fadd.s %[local_sum0], %[local_sum0], %[local_sum1];"
+               "fadd.s %[local_sum2], %[local_sum2], %[local_sum3];"
+               "fadd.s %[local_sum0], %[local_sum0], %[local_sum2];"
+               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),
+                 [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)
+               :);
+  s[core_id * BANKING_FACTOR] = local_sum0;
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f32(s, num_cores);
+
+  return;
+}
+
+/* Parallel dot-product with loop unrolling */
+/* Loads and stores only in local memory */
+void dotp_f32p_local_unrolled4(float *in_a, float *in_b, float *s,
+                               uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t const remainder = Len % BANKING_FACTOR;
+  uint32_t const idx_stop = Len - remainder;
+
+  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+  register float local_sum0 = 0.0f;
+  register float local_sum1 = 0.0f;
+  register float local_sum2 = 0.0f;
+  register float local_sum3 = 0.0f;
+
+  for (uint32_t i = core_id * BANKING_FACTOR; i < idx_stop; i += NUM_BANKS) {
+    DOTPF32_UNROLLED4_LOOP;
+  }
+  if (core_id == ((Len % NUM_BANKS) / BANKING_FACTOR)) {
+    for (uint32_t i = Len - remainder; i < Len; i++) {
+      a0 = in_a[i];
+      b0 = in_b[i];
+      asm volatile("fmadd.s %0, %1, %2, %0;"
+                   : "+&r"(local_sum0)
+                   : "r"(a0), "r"(b0));
+    }
+  }
+  asm volatile("fadd.s %[local_sum0], %[local_sum0], %[local_sum1];"
+               "fadd.s %[local_sum2], %[local_sum2], %[local_sum3];"
+               "fadd.s %[local_sum0], %[local_sum0], %[local_sum2];"
+               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),
+                 [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)
+               :);
+  s[core_id * BANKING_FACTOR] = local_sum0;
+
+// The last core to the reduction barrier sums the partial reductions
+#if defined(SINGLE_CORE_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f32(s, num_cores);
+
+// A) Cores store locally in sum array
+// B) Partial sums are reduced logarithmically
+#elif defined(BINARY_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_binary_reduction_f32(s, core_id, num_cores);
+
+#endif
+
+  return;
+}
diff --git a/software/kernels/baremetal/mempool_dotp_i32p.h b/software/kernels/baremetal/mempool_dotp_i32.h
similarity index 63%
rename from
software/kernels/baremetal/mempool_dotp_i32p.h rename to software/kernels/baremetal/mempool_dotp_i32.h index 26fbe03e9..4b80e92ed 100644 --- a/software/kernels/baremetal/mempool_dotp_i32p.h +++ b/software/kernels/baremetal/mempool_dotp_i32.h @@ -4,6 +4,115 @@ // Author: Marco Bertuletti, ETH Zurich +#define DOTPI32_UNROLLED4_LOOP \ + { \ + a0 = in_a[i]; \ + b0 = in_b[i]; \ + a1 = in_a[i + 1]; \ + b1 = in_b[i + 1]; \ + a2 = in_a[i + 2]; \ + b2 = in_b[i + 2]; \ + a3 = in_a[i + 3]; \ + b3 = in_b[i + 3]; \ + local_sum0 += a0 * b0; \ + local_sum1 += a1 * b1; \ + local_sum2 += a2 * b2; \ + local_sum3 += a3 * b3; \ + } + +/* Bynary tree reduction */ +void mempool_binary_reduction_i32(int32_t *sum, uint32_t core_id, + uint32_t num_cores) { + + uint32_t idx, step = 2, previous_step = 1; + while (num_cores > 1) { + idx = (step * (core_id / step)) * BANKING_FACTOR; + // dump_prova(idx); + if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1, + __ATOMIC_RELAXED)) { + + // Reduction + sum[idx] += sum[idx + previous_step * BANKING_FACTOR]; + + // Next level of binary tree + __atomic_store_n(&red_barrier[idx + previous_step - 1], 0, + __ATOMIC_RELAXED); + num_cores = num_cores / 2; + previous_step = step; + step = step * 2; + + } else { + // Goes to sleep + break; + } + } + + // Last core wakes everyone + if (num_cores == 1) { + __sync_synchronize(); + wake_up_all(); + } + mempool_wfi(); + + return; +} + +/* Single-core dot-product */ +void dotp_i32s(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { + + uint32_t core_id = mempool_get_core_id(); + if (core_id == 0) { + mempool_start_benchmark(); + // Kernel execution + register int32_t local_sum = 0; + int32_t *end = in_a + Len; + do { + local_sum += ((*in_a++) * (*in_b++)); + } while (in_a < end); + *s = local_sum; + mempool_stop_benchmark(); + } + + return; +} + +/* Single-core dot-product unrolled4 */ +void dotp_i32s_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, + uint32_t Len) { + + uint32_t core_id = 
mempool_get_core_id(); + if (core_id == 0) { + mempool_start_benchmark(); + uint32_t reminder = Len % 4; + uint32_t i = 0; + + register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0; + register int32_t b2 = 0, b1 = 0, b0 = 0, b3 = 0; + register int32_t local_sum0 = 0; + register int32_t local_sum1 = 0; + register int32_t local_sum2 = 0; + register int32_t local_sum3 = 0; + + for (i = 0; i < (Len - reminder); i += 4) { + DOTPI32_UNROLLED4_LOOP; + } + while (i < Len) { + a0 = in_a[i]; + b0 = in_b[i]; + local_sum0 += a0 * b0; + i++; + } + // Reduction + local_sum0 += local_sum1; + local_sum2 += local_sum3; + local_sum0 += local_sum2; + *s = local_sum0; + mempool_stop_benchmark(); + } + + return; +} + /* Parallel dot-product */ void dotp_i32p(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, uint32_t nPE) { @@ -18,12 +127,14 @@ void dotp_i32p(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, local_sum += a * b; } __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED); + #ifdef LOG_BARRIERS mempool_log_barrier(2, core_id); #else uint32_t num_cores = mempool_get_core_count(); mempool_barrier(num_cores); #endif + return; } @@ -42,19 +153,9 @@ void dotp_i32p_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, register int32_t local_sum1 = 0; register int32_t local_sum2 = 0; register int32_t local_sum3 = 0; + for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) { - a0 = in_a[i]; - b0 = in_b[i]; - a1 = in_a[i + 1]; - b1 = in_b[i + 1]; - a2 = in_a[i + 2]; - b2 = in_b[i + 2]; - a3 = in_a[i + 3]; - b3 = in_b[i + 3]; - local_sum0 += a0 * b0; - local_sum1 += a1 * b1; - local_sum2 += a2 * b2; - local_sum3 += a3 * b3; + DOTPI32_UNROLLED4_LOOP; } i = core_id * step + step - reminder; while (i < step) { @@ -67,48 +168,13 @@ void dotp_i32p_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len, local_sum2 += local_sum3; local_sum0 += local_sum2; __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED); + #ifdef LOG_BARRIERS 
mempool_log_barrier(2, core_id); #else uint32_t num_cores = mempool_get_core_count(); mempool_barrier(num_cores); #endif - return; -} - -/* Bynary tree reduction */ -void mempool_binary_reduction(int32_t *sum, uint32_t core_id, - uint32_t num_cores) { - - uint32_t idx, step = 2, previous_step = 1; - while (num_cores > 1) { - idx = (step * (core_id / step)) * BANKING_FACTOR; - // dump_prova(idx); - if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1, - __ATOMIC_RELAXED)) { - - // Reduction - sum[idx] += sum[idx + previous_step * BANKING_FACTOR]; - - // Next level of binary tree - __atomic_store_n(&red_barrier[idx + previous_step - 1], 0, - __ATOMIC_RELAXED); - num_cores = num_cores / 2; - previous_step = step; - step = step * 2; - - } else { - // Goes to sleep - break; - } - } - - // Last core wakes everyone - if (num_cores == 1) { - __sync_synchronize(); - wake_up_all(); - } - mempool_wfi(); return; } @@ -120,7 +186,7 @@ void dotp_i32p_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { uint32_t core_id = mempool_get_core_id(); - uint32_t const remainder = Len % 4; + uint32_t const remainder = Len % BANKING_FACTOR; uint32_t const idx_stop = Len - remainder; register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0; @@ -130,7 +196,7 @@ void dotp_i32p_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, register int32_t local_sum2 = 0; register int32_t local_sum3 = 0; - for (uint32_t i = core_id * 4; i < idx_stop; i += NUM_BANKS) { + for (uint32_t i = core_id * BANKING_FACTOR; i < idx_stop; i += NUM_BANKS) { a0 = in_a[i]; b0 = in_b[i]; a1 = in_a[i + 1]; @@ -187,8 +253,8 @@ void dotp_i32p_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, // B) Partial sums are reduced logarithmically #elif defined(BINARY_REDUCTION) uint32_t num_cores = mempool_get_core_count(); - s[core_id * 4] = local_sum0; - mempool_binary_reduction(s, core_id, num_cores); + s[core_id * BANKING_FACTOR] = local_sum0; + mempool_binary_reduction_i32(s, core_id, 
num_cores); #endif diff --git a/software/kernels/baremetal/mempool_dotp_i32s.h b/software/kernels/baremetal/mempool_dotp_i32s.h deleted file mode 100644 index dd562debb..000000000 --- a/software/kernels/baremetal/mempool_dotp_i32s.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -/* Single-core dot-product */ -void dotp_i32s(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) { - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - if (core_id == 0) { - mempool_start_benchmark(); - // Kernel execution - register int32_t local_sum = 0; - int32_t *end = in_a + Len; - do { - local_sum += ((*in_a++) * (*in_b++)); - } while (in_a < end); - *s = local_sum; - mempool_stop_benchmark(); - } - mempool_barrier(num_cores); -} - -/* Single-core dot-product unrolled4 */ -void dotp_i32s_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, - uint32_t Len) { - - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - if (core_id == 0) { - mempool_start_benchmark(); - uint32_t reminder = Len % 4; - uint32_t i = 0; - int32_t a0 = 0, b0 = 0, a1 = 0, b1 = 0, a2 = 0, b2 = 0, a3 = 0, b3 = 0; - register int32_t local_sum_1 = 0; - register int32_t local_sum_2 = 0; - register int32_t local_sum_3 = 0; - register int32_t local_sum_4 = 0; - for (i = 0; i < (Len - reminder); i += 4) { - a0 = in_a[i]; - b0 = in_b[i]; - a1 = in_a[i + 1]; - b1 = in_b[i + 1]; - a2 = in_a[i + 2]; - b2 = in_b[i + 2]; - a3 = in_a[i + 3]; - b3 = in_b[i + 3]; - local_sum_1 += a0 * b0; - local_sum_2 += a1 * b1; - local_sum_3 += a2 * b2; - local_sum_4 += a3 * b3; - } - while (i < Len) { - a0 = in_a[i]; - b0 = in_b[i]; - local_sum_1 += a0 * b0; - i++; - } - // Reduction - local_sum_1 += local_sum_2; - local_sum_3 += local_sum_4; - local_sum_1 
+= local_sum_3; - *s = local_sum_1; - mempool_stop_benchmark(); - } - mempool_barrier(num_cores); -}