From 198e9908c7b405ccc2bed64836bf413974f642ad Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 23 Aug 2024 15:05:24 +0200 Subject: [PATCH] snRuntime: Add `alloc_v2` functions and minor changes (#186) * snRuntime: Add `snrt_inter_cluster_barrier()` function * snRuntime: Add private L1 alloc pointers and heap bound checks * snRuntime: Add `snrt_compute_core_local_ptr()` function * sw: Move floating-point types to snRuntime * snRuntime: Add `snrt_l1_update_next_v2()` used to free memory * snRuntime: Add `snrt_cluster_is_last_compute_core()` function --- sw/blas/gemm/src/gemm.h | 11 --- sw/dnn/fusedconv/src/main.c | 2 +- sw/dnn/src/dnn.h | 7 +- sw/snRuntime/api/alloc_decls.h | 8 +- sw/snRuntime/api/start_decls.h | 6 +- sw/snRuntime/api/sync_decls.h | 2 + sw/snRuntime/src/alloc.c | 4 +- sw/snRuntime/src/alloc.h | 8 +- sw/snRuntime/src/alloc_v2.c | 15 ++++ sw/snRuntime/src/alloc_v2.h | 83 +++++++++++++++++++ sw/snRuntime/src/dm.h | 2 +- sw/snRuntime/src/omp/eu.h | 2 +- sw/snRuntime/src/omp/omp.c | 12 +-- sw/snRuntime/src/start.c | 32 ++----- sw/snRuntime/src/start.h | 35 ++++++++ sw/snRuntime/src/sync.h | 32 ++++--- sw/snRuntime/src/team.c | 2 + sw/snRuntime/src/team.h | 4 + sw/snRuntime/src/types.h | 9 ++ sw/tests/atomics.c | 4 +- sw/tests/data_mover.c | 14 ++-- sw/tests/openmp_double_buffering.c | 4 +- sw/tests/openmp_for_static_schedule.c | 4 +- .../banshee/src/snitch_cluster_start.c | 14 +--- .../banshee/src/snitch_cluster_start.h | 20 +++++ .../sw/runtime/banshee/src/snrt.c | 1 + .../sw/runtime/banshee/src/snrt.h | 3 + .../sw/runtime/rtl/src/snitch_cluster_start.c | 16 +--- .../sw/runtime/rtl/src/snitch_cluster_start.h | 24 ++++++ .../snitch_cluster/sw/runtime/rtl/src/snrt.c | 1 + .../snitch_cluster/sw/runtime/rtl/src/snrt.h | 3 + 31 files changed, 267 insertions(+), 117 deletions(-) create mode 100644 sw/snRuntime/src/alloc_v2.c create mode 100644 sw/snRuntime/src/alloc_v2.h create mode 100644 sw/snRuntime/src/start.h create mode 100644 
sw/snRuntime/src/types.h create mode 100644 target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.h create mode 100644 target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.h diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h index 43a974556..a480379a9 100644 --- a/sw/blas/gemm/src/gemm.h +++ b/sw/blas/gemm/src/gemm.h @@ -13,17 +13,6 @@ #pragma once -// Guard to avoid conflict with DNN header file -// TODO: move this definition to Snitch math library to solve problem -#ifndef PRECISION_T -#define PRECISION_T -typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t; -#endif - -typedef float v2f32 __attribute__((vector_size(8))); -typedef __fp16 v4f16 __attribute__((vector_size(8))); -typedef char v8f8 __attribute__((vector_size(8))); - // Floating-point multiplications by zero cannot be optimized as in some // edge cases they do not yield zero: // - 0f * NaN = NaN diff --git a/sw/dnn/fusedconv/src/main.c b/sw/dnn/fusedconv/src/main.c index 8d25be366..c59354da5 100644 --- a/sw/dnn/fusedconv/src/main.c +++ b/sw/dnn/fusedconv/src/main.c @@ -24,7 +24,7 @@ int main() { float *ptr; if (snrt_is_dm_core() == 0) { - ptr = snrt_l1alloc(total_size * sizeof(float)); + ptr = snrt_l1_alloc(total_size * sizeof(float)); share_ptr = ptr; } diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h index 2e0d3f8eb..a60b7ad7c 100644 --- a/sw/dnn/src/dnn.h +++ b/sw/dnn/src/dnn.h @@ -6,12 +6,7 @@ #include -// Guard to avoid conflict with BLAS header file -// TODO: move this definition to Snitch math library to solve problem -#ifndef PRECISION_T -#define PRECISION_T -typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t; -#endif +#include "snrt.h" #ifndef IMPLEMENTATION_T #define IMPLEMENTATION_T diff --git a/sw/snRuntime/api/alloc_decls.h b/sw/snRuntime/api/alloc_decls.h index a276472f3..bf4c5615a 100644 --- a/sw/snRuntime/api/alloc_decls.h +++ b/sw/snRuntime/api/alloc_decls.h @@ -10,8 +10,8 @@ typedef struct { // Base address from where 
allocation starts uint32_t base; - // Number of bytes alloctable - uint32_t size; + // End address up to which allocation is allowed + uint32_t end; // Address of the next allocated block uint32_t next; } snrt_allocator_t; @@ -20,10 +20,10 @@ inline void *snrt_l1_next(); inline void *snrt_l3_next(); -inline void *snrt_l1alloc(size_t size); +inline void *snrt_l1_alloc(size_t size); inline void snrt_l1_update_next(void *next); -inline void *snrt_l3alloc(size_t size); +inline void *snrt_l3_alloc(size_t size); inline void snrt_alloc_init(); diff --git a/sw/snRuntime/api/start_decls.h b/sw/snRuntime/api/start_decls.h index 28a1d942c..8bf535073 100644 --- a/sw/snRuntime/api/start_decls.h +++ b/sw/snRuntime/api/start_decls.h @@ -1,7 +1,9 @@ -// Copyright 2023 ETH Zurich and University of Bologna. +// Copyright 2024 ETH Zurich and University of Bologna. // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 #pragma once -static inline void snrt_exit(int exit_code); +inline void snrt_exit(int exit_code); + +inline uint32_t snrt_cls_base_addr(); diff --git a/sw/snRuntime/api/sync_decls.h b/sw/snRuntime/api/sync_decls.h index 05db1bad1..0e18f943a 100644 --- a/sw/snRuntime/api/sync_decls.h +++ b/sw/snRuntime/api/sync_decls.h @@ -26,3 +26,5 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx); inline void snrt_cluster_hw_barrier(); inline void snrt_global_barrier(); + +inline uint32_t snrt_global_all_to_all_reduction(uint32_t value); diff --git a/sw/snRuntime/src/alloc.c b/sw/snRuntime/src/alloc.c index 72c12048e..044b7944c 100644 --- a/sw/snRuntime/src/alloc.c +++ b/sw/snRuntime/src/alloc.c @@ -12,8 +12,8 @@ extern void *snrt_l3_next(); extern uint32_t snrt_l1_start_addr(); extern uint32_t snrt_l1_end_addr(); -extern void *snrt_l1alloc(size_t size); -extern void *snrt_l3alloc(size_t size); +extern void *snrt_l1_alloc(size_t size); +extern void *snrt_l3_alloc(size_t size); extern snrt_allocator_t 
*snrt_l1_allocator(); extern snrt_allocator_t *snrt_l3_allocator(); diff --git a/sw/snRuntime/src/alloc.h b/sw/snRuntime/src/alloc.h index 6d5f250bb..33852064e 100644 --- a/sw/snRuntime/src/alloc.h +++ b/sw/snRuntime/src/alloc.h @@ -28,7 +28,7 @@ inline void *snrt_l3_next() { return (void *)snrt_l3_allocator()->next; } * @param size number of bytes to allocate * @return pointer to the allocated memory */ -inline void *snrt_l1alloc(size_t size) { +inline void *snrt_l1_alloc(size_t size) { snrt_allocator_t *alloc = snrt_l1_allocator(); // TODO colluca: do we need this? What does it imply? @@ -64,7 +64,7 @@ inline void snrt_l1_update_next(void *next) { * @param size number of bytes to allocate * @return pointer to the allocated memory */ -inline void *snrt_l3alloc(size_t size) { +inline void *snrt_l3_alloc(size_t size) { snrt_allocator_t *alloc = snrt_l3_allocator(); // TODO: L3 alloc size check @@ -83,12 +83,12 @@ inline void snrt_alloc_init() { // occupy a possibly significant portion. snrt_l1_allocator()->base = ALIGN_UP(snrt_l1_start_addr(), MIN_CHUNK_SIZE); - snrt_l1_allocator()->size = snrt_l1_end_addr() - snrt_l1_start_addr(); + snrt_l1_allocator()->end = snrt_l1_end_addr(); snrt_l1_allocator()->next = snrt_l1_allocator()->base; // Initialize L3 allocator extern uint32_t _edram; snrt_l3_allocator()->base = ALIGN_UP((uint32_t)&_edram, MIN_CHUNK_SIZE); - snrt_l3_allocator()->size = 0; + snrt_l3_allocator()->end = snrt_l3_allocator()->base; snrt_l3_allocator()->next = snrt_l3_allocator()->base; } // Synchronize with other cores diff --git a/sw/snRuntime/src/alloc_v2.c b/sw/snRuntime/src/alloc_v2.c new file mode 100644 index 000000000..cd9d0a853 --- /dev/null +++ b/sw/snRuntime/src/alloc_v2.c @@ -0,0 +1,15 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +__thread snrt_allocator_t l1_allocator_v2; + +extern void *snrt_l1_next_v2(); + +extern void *snrt_l1_alloc_cluster_local(size_t size, size_t alignment); +extern void *snrt_l1_alloc_compute_core_local(size_t size, size_t alignment); + +extern void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx, + uint32_t dst_cluster_idx); + +extern void snrt_alloc_init_v2(); diff --git a/sw/snRuntime/src/alloc_v2.h b/sw/snRuntime/src/alloc_v2.h new file mode 100644 index 000000000..29ffb81e5 --- /dev/null +++ b/sw/snRuntime/src/alloc_v2.h @@ -0,0 +1,83 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +extern __thread snrt_allocator_t l1_allocator_v2; + +inline snrt_allocator_t *snrt_l1_allocator_v2() { return &l1_allocator_v2; } + +inline void *snrt_l1_next_v2() { return (void *)snrt_l1_allocator_v2()->next; } + +/** + * @brief Override the L1 allocator next pointer + */ +inline void snrt_l1_update_next_v2(void *next) { + snrt_l1_allocator_v2()->next = (uint32_t)next; +} + +// Check that allocation doesn't exceed allocator bounds, and raise an +// exception otherwise +inline void snrt_l1_alloc_check_bounds() { + if (snrt_l1_allocator_v2()->next > snrt_l1_allocator_v2()->end) + asm volatile("ecall \n"); +} + +// Dynamically allocate space for a variable of size `size` in the cluster's L1 +// memory. This function should be invoked by every core in a cluster. Every +// core receives a pointer to the allocated variable. 
+inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) { + snrt_l1_allocator_v2()->next = + ALIGN_UP(snrt_l1_allocator_v2()->next, alignment); + void *retval = snrt_l1_next_v2(); + snrt_l1_allocator_v2()->next += size; + snrt_l1_alloc_check_bounds(); + return retval; +} + +// Dynamically allocate space for N variables of size `size` in the cluster's +// L1 memory, N being the number of compute cores in the cluster. This function +// should be invoked by every core in a cluster. Every compute core receives a +// pointer to a unique variable among the N which have been allocated. The +// return value for the DM core is undefined. +inline void *snrt_l1_alloc_compute_core_local(size_t size, + const size_t alignment) { + snrt_l1_allocator_v2()->next = + ALIGN_UP(snrt_l1_allocator_v2()->next, alignment); + void *retval = snrt_l1_next_v2() + size * snrt_cluster_core_idx(); + snrt_l1_allocator_v2()->next += size * snrt_cluster_compute_core_num(); + snrt_l1_alloc_check_bounds(); + return retval; +} + +// Takes a pointer to a variable allocated using +// `snrt_l1_alloc_compute_core_local` and returns a pointer to the same +// variable allocated by another core, as specified by `core_idx`. +// The `size` argument should be the same used during allocation. +inline void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx, + size_t size) { + size_t offset = (core_idx - snrt_cluster_core_idx()) * size; + return (void *)((uintptr_t)ptr + offset); +} + +// Takes a pointer to a variable in the source cluster's L1 memory and returns +// a pointer to the same offset in the destination cluster's L1 memory. +inline void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx, + uint32_t dst_cluster_idx) { + return (void *)((uintptr_t)ptr + + (dst_cluster_idx - src_cluster_idx) * SNRT_CLUSTER_OFFSET); +} + +inline void snrt_alloc_init_v2() { + // Calculate end address of the heap. 
The top of the TCDM address space is + // reserved for the cluster-local storage (CLS) and the stack of every + // core. We further provision a safety margin of 128B. The rest of the + // TCDM is reserved for the heap. + uint32_t heap_end_addr = snrt_cls_base_addr(); + heap_end_addr -= (1 << SNRT_LOG2_STACK_SIZE) * snrt_cluster_core_num(); + heap_end_addr -= 128; + // Initialize L1 allocator + snrt_l1_allocator_v2()->base = + ALIGN_UP(snrt_l1_start_addr(), MIN_CHUNK_SIZE); + snrt_l1_allocator_v2()->end = heap_end_addr; + snrt_l1_allocator_v2()->next = snrt_l1_allocator_v2()->base; +} diff --git a/sw/snRuntime/src/dm.h b/sw/snRuntime/src/dm.h index b5de96763..3df2e9268 100644 --- a/sw/snRuntime/src/dm.h +++ b/sw/snRuntime/src/dm.h @@ -157,7 +157,7 @@ inline void dm_init(void) { #else snrt_interrupt_enable(IRQ_M_CLUSTER); #endif - dm_p = (dm_t *)snrt_l1alloc(sizeof(dm_t)); + dm_p = (dm_t *)snrt_l1_alloc(sizeof(dm_t)); snrt_memset((void *)dm_p, 0, sizeof(dm_t)); dm_p_global = dm_p; } else { diff --git a/sw/snRuntime/src/omp/eu.h b/sw/snRuntime/src/omp/eu.h index 1bbed0c89..7b587240f 100644 --- a/sw/snRuntime/src/omp/eu.h +++ b/sw/snRuntime/src/omp/eu.h @@ -165,7 +165,7 @@ inline uint32_t eu_get_workers_in_wfi() { inline void eu_init(void) { if (snrt_cluster_core_idx() == 0) { // Allocate the eu struct in L1 for fast access - eu_p = snrt_l1alloc(sizeof(eu_t)); + eu_p = snrt_l1_alloc(sizeof(eu_t)); snrt_memset((void *)eu_p, 0, sizeof(eu_t)); // store copy of eu_p on shared memory eu_p_global = eu_p; diff --git a/sw/snRuntime/src/omp/omp.c b/sw/snRuntime/src/omp/omp.c index 5ebc91ded..6fdeec2e5 100644 --- a/sw/snRuntime/src/omp/omp.c +++ b/sw/snRuntime/src/omp/omp.c @@ -48,10 +48,10 @@ static inline void initTeam(omp_t *_this, omp_team_t *team) { void omp_init(void) { if (snrt_cluster_core_idx() == 0) { // allocate space for kmp arguments - kmpc_args = - (_kmp_ptr32 *)snrt_l1alloc(sizeof(_kmp_ptr32) * KMP_FORK_MAX_NARGS); + kmpc_args = (_kmp_ptr32 
*)snrt_l1_alloc(sizeof(_kmp_ptr32) * + KMP_FORK_MAX_NARGS); #ifndef OMPSTATIC_NUMTHREADS - omp_p = (omp_t *)snrt_l1alloc(sizeof(omp_t)); + omp_p = (omp_t *)snrt_l1_alloc(sizeof(omp_t)); unsigned int nbCores = snrt_cluster_compute_core_num(); omp_p->numThreads = nbCores; omp_p->maxThreads = nbCores; @@ -67,20 +67,20 @@ void omp_init(void) { initTeam((omp_t *)omp_p, (omp_team_t *)&omp_p->plainTeam); omp_p->kmpc_barrier = - (snrt_barrier_t *)snrt_l1alloc(sizeof(snrt_barrier_t)); + (snrt_barrier_t *)snrt_l1_alloc(sizeof(snrt_barrier_t)); snrt_memset(omp_p->kmpc_barrier, 0, sizeof(snrt_barrier_t)); // Exchange omp pointer with other cluster cores omp_p_global = omp_p; #else omp_p.kmpc_barrier = - (snrt_barrier_t *)snrt_l1alloc(sizeof(snrt_barrier_t)); + (snrt_barrier_t *)snrt_l1_alloc(sizeof(snrt_barrier_t)); snrt_memset(omp_p.kmpc_barrier, 0, sizeof(snrt_barrier_t)); // Exchange omp pointer with other cluster cores omp_p_global = &omp_p; #endif #ifdef OPENMP_PROFILE - omp_prof = (omp_prof_t *)snrt_l1alloc(sizeof(omp_prof_t)); + omp_prof = (omp_prof_t *)snrt_l1_alloc(sizeof(omp_prof_t)); #endif } else { diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c index c91f4c25c..824c8231f 100644 --- a/sw/snRuntime/src/start.c +++ b/sw/snRuntime/src/start.c @@ -2,21 +2,8 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. 
// SPDX-License-Identifier: Apache-2.0 -#ifdef OPENOCD_SEMIHOSTING -#include "openocd.h" -#endif - #ifdef SNRT_INIT_CLS -static inline uint32_t snrt_cls_base_addr() { - extern volatile uint32_t __cdata_start, __cdata_end; - extern volatile uint32_t __cbss_start, __cbss_end; - uint32_t cdata_size = ((uint32_t)&__cdata_end) - ((uint32_t)&__cdata_start); - uint32_t cbss_size = ((uint32_t)&__cbss_end) - ((uint32_t)&__cbss_start); - uint32_t l1_end_addr = SNRT_TCDM_START_ADDR + - snrt_cluster_idx() * SNRT_CLUSTER_OFFSET + - SNRT_TCDM_SIZE; - return l1_end_addr - cdata_size - cbss_size; -} +extern uint32_t snrt_cls_base_addr(); #endif #ifdef SNRT_INIT_TLS @@ -98,21 +85,16 @@ static inline void snrt_init_cls() { #endif #ifdef SNRT_INIT_LIBS -static inline void snrt_init_libs() { snrt_alloc_init(); } +static inline void snrt_init_libs() { + snrt_alloc_init(); + snrt_alloc_init_v2(); +} #endif #ifdef SNRT_CRT0_EXIT -static inline void snrt_exit_default(int exit_code) { - exit_code = snrt_global_all_to_all_reduction(exit_code); -#ifdef OPENOCD_SEMIHOSTING - if (snrt_global_core_idx() == 0) __ocd_semihost_exit(exit_code); -#else - if (snrt_global_core_idx() == 0) - *(snrt_exit_code_destination()) = (exit_code << 1) | 1; -#endif -} +extern void snrt_exit_default(int exit_code); #ifndef SNRT_CRT0_ALTERNATE_EXIT -static inline void snrt_exit(int exit_code) { snrt_exit_default(exit_code); } +extern void snrt_exit(int exit_code); #endif #endif diff --git a/sw/snRuntime/src/start.h b/sw/snRuntime/src/start.h new file mode 100644 index 000000000..2ef7abbf4 --- /dev/null +++ b/sw/snRuntime/src/start.h @@ -0,0 +1,35 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifdef OPENOCD_SEMIHOSTING +#include "openocd.h" +#endif + +#ifdef SNRT_CRT0_EXIT +inline void snrt_exit_default(int exit_code) { + exit_code = snrt_global_all_to_all_reduction(exit_code); +#ifdef OPENOCD_SEMIHOSTING + if (snrt_global_core_idx() == 0) __ocd_semihost_exit(exit_code); +#else + if (snrt_global_core_idx() == 0) + *(snrt_exit_code_destination()) = (exit_code << 1) | 1; +#endif +} +#ifndef SNRT_CRT0_ALTERNATE_EXIT +inline void snrt_exit(int exit_code) { snrt_exit_default(exit_code); } +#endif +#endif + +#ifdef SNRT_INIT_CLS +inline uint32_t snrt_cls_base_addr() { + extern volatile uint32_t __cdata_start, __cdata_end; + extern volatile uint32_t __cbss_start, __cbss_end; + uint32_t cdata_size = ((uint32_t)&__cdata_end) - ((uint32_t)&__cdata_start); + uint32_t cbss_size = ((uint32_t)&__cbss_end) - ((uint32_t)&__cbss_start); + uint32_t l1_end_addr = SNRT_TCDM_START_ADDR + + snrt_cluster_idx() * SNRT_CLUSTER_OFFSET + + SNRT_TCDM_SIZE; + return l1_end_addr - cdata_size - cbss_size; +} +#endif diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h index c6c012ca9..fa4b75b24 100644 --- a/sw/snRuntime/src/sync.h +++ b/sw/snRuntime/src/sync.h @@ -66,25 +66,31 @@ inline void snrt_cluster_hw_barrier() { asm volatile("csrr x0, 0x7C2" ::: "memory"); } +// Synchronizes one core from every cluster with the others. +// One core per cluster is expected to invoke this function. 
+inline void snrt_inter_cluster_barrier() { + // Remember previous iteration + uint32_t prev_barrier_iteration = _snrt_barrier.iteration; + uint32_t cnt = + __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED); + + // Increment the barrier counter + if (cnt == snrt_cluster_num()) { + _snrt_barrier.cnt = 0; + __atomic_add_fetch(&(_snrt_barrier.iteration), 1, __ATOMIC_RELAXED); + } else { + while (prev_barrier_iteration == _snrt_barrier.iteration) + ; + } +} + /// Synchronize clusters globally with a global software barrier inline void snrt_global_barrier() { snrt_cluster_hw_barrier(); // Synchronize all DM cores in software if (snrt_is_dm_core()) { - // Remember previous iteration - uint32_t prev_barrier_iteration = _snrt_barrier.iteration; - uint32_t cnt = - __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED); - - // Increment the barrier counter - if (cnt == snrt_cluster_num()) { - _snrt_barrier.cnt = 0; - __atomic_add_fetch(&(_snrt_barrier.iteration), 1, __ATOMIC_RELAXED); - } else { - while (prev_barrier_iteration == _snrt_barrier.iteration) - ; - } + snrt_inter_cluster_barrier(); } // Synchronize cores in a cluster with the HW barrier snrt_cluster_hw_barrier(); diff --git a/sw/snRuntime/src/team.c b/sw/snRuntime/src/team.c index 5290e1d28..e5b83e61c 100644 --- a/sw/snRuntime/src/team.c +++ b/sw/snRuntime/src/team.c @@ -26,6 +26,8 @@ extern uint32_t snrt_cluster_compute_core_num(); extern int snrt_is_compute_core(); +extern int snrt_cluster_is_last_compute_core(); + extern int snrt_is_dm_core(); extern uint32_t snrt_cluster_dm_core_num(); diff --git a/sw/snRuntime/src/team.h b/sw/snRuntime/src/team.h index 1b49bfaad..eb06a4488 100644 --- a/sw/snRuntime/src/team.h +++ b/sw/snRuntime/src/team.h @@ -59,6 +59,10 @@ inline int __attribute__((const)) snrt_is_compute_core() { return snrt_cluster_core_idx() < snrt_cluster_compute_core_num(); } +inline int __attribute__((const)) snrt_cluster_is_last_compute_core() { + return snrt_cluster_core_idx() == 
(snrt_cluster_compute_core_num() - 1); +} + inline int __attribute__((const)) snrt_is_dm_core() { return !snrt_is_compute_core(); } diff --git a/sw/snRuntime/src/types.h b/sw/snRuntime/src/types.h new file mode 100644 index 000000000..235a0ecbb --- /dev/null +++ b/sw/snRuntime/src/types.h @@ -0,0 +1,9 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t; + +typedef float v2f32 __attribute__((vector_size(8))); +typedef __fp16 v4f16 __attribute__((vector_size(8))); +typedef char v8f8 __attribute__((vector_size(8))); diff --git a/sw/tests/atomics.c b/sw/tests/atomics.c index 69dfb0000..f989d0c6b 100644 --- a/sw/tests/atomics.c +++ b/sw/tests/atomics.c @@ -183,9 +183,9 @@ int main() { if (core_id == 0) { volatile uint32_t* l1_a = - snrt_l1alloc(NUM_TCDM_LOCATIONS * sizeof(uint32_t)); + snrt_l1_alloc(NUM_TCDM_LOCATIONS * sizeof(uint32_t)); volatile uint32_t* l3_a = - snrt_l3alloc(NUM_SPM_LOCATIONS * sizeof(uint32_t)); + snrt_l3_alloc(NUM_SPM_LOCATIONS * sizeof(uint32_t)); // In TCDM uint32_t tcdm_atomics[NUM_TCDM_LOCATIONS]; diff --git a/sw/tests/data_mover.c b/sw/tests/data_mover.c index 388262bb7..05eea2a60 100644 --- a/sw/tests/data_mover.c +++ b/sw/tests/data_mover.c @@ -39,14 +39,14 @@ int main() { // Prepare data buffers const uint32_t n_elem = 128, n_rep = 4; uint32_t *l1_a, *l1_b, *l1_c, *l1_d, *l1_2d_a; - l1_a = snrt_l1alloc(n_elem * sizeof(uint32_t)); - l1_b = snrt_l1alloc(n_elem * sizeof(uint32_t)); - l1_c = snrt_l1alloc(n_elem * sizeof(uint32_t)); - l1_d = snrt_l1alloc(n_elem * sizeof(uint32_t)); - l1_2d_a = snrt_l1alloc(n_elem * n_rep * sizeof(uint32_t)); + l1_a = snrt_l1_alloc(n_elem * sizeof(uint32_t)); + l1_b = snrt_l1_alloc(n_elem * sizeof(uint32_t)); + l1_c = snrt_l1_alloc(n_elem * sizeof(uint32_t)); + l1_d = snrt_l1_alloc(n_elem * sizeof(uint32_t)); + l1_2d_a = 
snrt_l1_alloc(n_elem * n_rep * sizeof(uint32_t)); uint32_t *l3_a, *l3_2d_a; - l3_a = snrt_l3alloc(n_elem * sizeof(uint32_t)); - l3_2d_a = snrt_l3alloc(n_elem * n_rep * sizeof(uint32_t)); + l3_a = snrt_l3_alloc(n_elem * sizeof(uint32_t)); + l3_2d_a = snrt_l3_alloc(n_elem * n_rep * sizeof(uint32_t)); printf("-- Test 1: L1 -> L1\n"); for (uint32_t i = 0; i < n_elem; ++i) l1_a[i] = i; diff --git a/sw/tests/openmp_double_buffering.c b/sw/tests/openmp_double_buffering.c index 128fd7d71..f00ef3086 100644 --- a/sw/tests/openmp_double_buffering.c +++ b/sw/tests/openmp_double_buffering.c @@ -13,8 +13,8 @@ unsigned __attribute__((noinline)) double_buffering(void) { static double *bufx, *bufy, *x, *y; static double a; - bufx = snrt_l1alloc(sizeof(double) * 2 * TILESIZE); - bufy = snrt_l1alloc(sizeof(double) * 2 * TILESIZE); + bufx = snrt_l1_alloc(sizeof(double) * 2 * TILESIZE); + bufy = snrt_l1_alloc(sizeof(double) * 2 * TILESIZE); x = axpy_4096_x; y = axpy_4096_y; a = axpy_4096_a; diff --git a/sw/tests/openmp_for_static_schedule.c b/sw/tests/openmp_for_static_schedule.c index 6c00bd2cb..af16091f9 100644 --- a/sw/tests/openmp_for_static_schedule.c +++ b/sw/tests/openmp_for_static_schedule.c @@ -10,8 +10,8 @@ unsigned __attribute__((noinline)) static_schedule(void) { static double *data_x, *data_y, data_a; // Allocate AXPY input vectors - data_x = snrt_l1alloc(sizeof(double) * AXPY_N); - data_y = snrt_l1alloc(sizeof(double) * AXPY_N); + data_x = snrt_l1_alloc(sizeof(double) * AXPY_N); + data_y = snrt_l1_alloc(sizeof(double) * AXPY_N); // Initialize AXPY input vectors data_a = 10.0; diff --git a/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.c b/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.c index 6c079c12b..b2061c22e 100644 --- a/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.c +++ b/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.c @@ -2,17 +2,5 @@ // Licensed under the Apache License, Version 2.0, 
see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -#define SNRT_INIT_TLS -#define SNRT_INIT_BSS -#define SNRT_INIT_CLS -#define SNRT_INIT_LIBS -#define SNRT_CRT0_PRE_BARRIER -#define SNRT_INVOKE_MAIN -#define SNRT_CRT0_POST_BARRIER -#define SNRT_CRT0_EXIT - -static inline volatile uint32_t* snrt_exit_code_destination() { - return (volatile uint32_t*)0x02000014; -} - +#include "snitch_cluster_start.h" #include "start.c" diff --git a/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.h b/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.h new file mode 100644 index 000000000..dc25d264e --- /dev/null +++ b/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.h @@ -0,0 +1,20 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define SNRT_INIT_TLS +#define SNRT_INIT_BSS +#define SNRT_INIT_CLS +#define SNRT_INIT_LIBS +#define SNRT_CRT0_PRE_BARRIER +#define SNRT_INVOKE_MAIN +#define SNRT_CRT0_POST_BARRIER +#define SNRT_CRT0_EXIT + +static inline volatile uint32_t* snrt_exit_code_destination() { + return (volatile uint32_t*)0x02000014; +} + +#include "start.h" diff --git a/target/snitch_cluster/sw/runtime/banshee/src/snrt.c b/target/snitch_cluster/sw/runtime/banshee/src/snrt.c index fcd4ba4de..ebf23ce55 100644 --- a/target/snitch_cluster/sw/runtime/banshee/src/snrt.c +++ b/target/snitch_cluster/sw/runtime/banshee/src/snrt.c @@ -5,6 +5,7 @@ #include "snrt.h" #include "alloc.c" +#include "alloc_v2.c" #include "cls.c" #include "cluster_interrupts.c" #include "dm.c" diff --git a/target/snitch_cluster/sw/runtime/banshee/src/snrt.h b/target/snitch_cluster/sw/runtime/banshee/src/snrt.h index 1f510f4e5..a9857e850 100644 --- a/target/snitch_cluster/sw/runtime/banshee/src/snrt.h +++ b/target/snitch_cluster/sw/runtime/banshee/src/snrt.h @@ -21,6 +21,7 @@ // Implementation #include 
"alloc.h" +#include "alloc_v2.h" #include "cls.h" #include "cluster_interrupts.h" #include "dm.h" @@ -33,6 +34,8 @@ #include "printf.h" #include "riscv.h" #include "snitch_cluster_global_interrupts.h" +#include "snitch_cluster_start.h" #include "ssr.h" #include "sync.h" #include "team.h" +#include "types.h" diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.c b/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.c index 3a4ab9b7c..b2061c22e 100644 --- a/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.c +++ b/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.c @@ -2,19 +2,5 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -#define SNRT_INIT_TLS -#define SNRT_INIT_BSS -#define SNRT_INIT_CLS -#define SNRT_INIT_LIBS -#define SNRT_CRT0_PRE_BARRIER -#define SNRT_INVOKE_MAIN -#define SNRT_CRT0_POST_BARRIER -#define SNRT_CRT0_EXIT - -#ifndef OPENOCD_SEMIHOSTING -static inline volatile uint32_t* snrt_exit_code_destination() { - return (volatile uint32_t*)&tohost; -} -#endif - +#include "snitch_cluster_start.h" #include "start.c" diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.h b/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.h new file mode 100644 index 000000000..a413dc3d1 --- /dev/null +++ b/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.h @@ -0,0 +1,24 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define SNRT_INIT_TLS +#define SNRT_INIT_BSS +#define SNRT_INIT_CLS +#define SNRT_INIT_LIBS +#define SNRT_CRT0_PRE_BARRIER +#define SNRT_INVOKE_MAIN +#define SNRT_CRT0_POST_BARRIER +#define SNRT_CRT0_EXIT + +extern volatile uint32_t tohost; + +#ifndef OPENOCD_SEMIHOSTING +static inline volatile uint32_t* snrt_exit_code_destination() { + return (volatile uint32_t*)&tohost; +} +#endif + +#include "start.h" diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snrt.c b/target/snitch_cluster/sw/runtime/rtl/src/snrt.c index fcd4ba4de..ebf23ce55 100644 --- a/target/snitch_cluster/sw/runtime/rtl/src/snrt.c +++ b/target/snitch_cluster/sw/runtime/rtl/src/snrt.c @@ -5,6 +5,7 @@ #include "snrt.h" #include "alloc.c" +#include "alloc_v2.c" #include "cls.c" #include "cluster_interrupts.c" #include "dm.c" diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snrt.h b/target/snitch_cluster/sw/runtime/rtl/src/snrt.h index a5a471e63..426b623ed 100644 --- a/target/snitch_cluster/sw/runtime/rtl/src/snrt.h +++ b/target/snitch_cluster/sw/runtime/rtl/src/snrt.h @@ -21,6 +21,7 @@ // Implementation #include "alloc.h" +#include "alloc_v2.h" #include "cls.h" #include "cluster_interrupts.h" #include "dm.h" @@ -33,6 +34,8 @@ #include "printf.h" #include "riscv.h" #include "snitch_cluster_global_interrupts.h" +#include "snitch_cluster_start.h" #include "ssr.h" #include "sync.h" #include "team.h" +#include "types.h"