From 198e9908c7b405ccc2bed64836bf413974f642ad Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 23 Aug 2024 15:05:24 +0200
Subject: [PATCH] snRuntime: Add `alloc_v2` functions and minor changes (#186)

* snRuntime: Add `snrt_inter_cluster_barrier()` function

* snRuntime: Add private L1 alloc pointers and heap bound checks

* snRuntime: Add `snrt_compute_core_local_ptr()` function

* sw: Move floating-point types to snRuntime

* snRuntime: Add `snrt_l1_update_next_v2()` used to free memory

* snRuntime: Add `snrt_cluster_is_last_compute_core()` function
---
 sw/blas/gemm/src/gemm.h                       | 11 ---
 sw/dnn/fusedconv/src/main.c                   |  2 +-
 sw/dnn/src/dnn.h                              |  7 +-
 sw/snRuntime/api/alloc_decls.h                |  8 +-
 sw/snRuntime/api/start_decls.h                |  6 +-
 sw/snRuntime/api/sync_decls.h                 |  2 +
 sw/snRuntime/src/alloc.c                      |  4 +-
 sw/snRuntime/src/alloc.h                      |  8 +-
 sw/snRuntime/src/alloc_v2.c                   | 15 ++++
 sw/snRuntime/src/alloc_v2.h                   | 83 +++++++++++++++++++
 sw/snRuntime/src/dm.h                         |  2 +-
 sw/snRuntime/src/omp/eu.h                     |  2 +-
 sw/snRuntime/src/omp/omp.c                    | 12 +--
 sw/snRuntime/src/start.c                      | 32 ++-----
 sw/snRuntime/src/start.h                      | 35 ++++++++
 sw/snRuntime/src/sync.h                       | 32 ++++---
 sw/snRuntime/src/team.c                       |  2 +
 sw/snRuntime/src/team.h                       |  4 +
 sw/snRuntime/src/types.h                      |  9 ++
 sw/tests/atomics.c                            |  4 +-
 sw/tests/data_mover.c                         | 14 ++--
 sw/tests/openmp_double_buffering.c            |  4 +-
 sw/tests/openmp_for_static_schedule.c         |  4 +-
 .../banshee/src/snitch_cluster_start.c        | 14 +---
 .../banshee/src/snitch_cluster_start.h        | 20 +++++
 .../sw/runtime/banshee/src/snrt.c             |  1 +
 .../sw/runtime/banshee/src/snrt.h             |  3 +
 .../sw/runtime/rtl/src/snitch_cluster_start.c | 16 +---
 .../sw/runtime/rtl/src/snitch_cluster_start.h | 24 ++++++
 .../snitch_cluster/sw/runtime/rtl/src/snrt.c  |  1 +
 .../snitch_cluster/sw/runtime/rtl/src/snrt.h  |  3 +
 31 files changed, 267 insertions(+), 117 deletions(-)
 create mode 100644 sw/snRuntime/src/alloc_v2.c
 create mode 100644 sw/snRuntime/src/alloc_v2.h
 create mode 100644 sw/snRuntime/src/start.h
 create mode 100644 sw/snRuntime/src/types.h
 create mode 100644 target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.h
 create mode 100644 target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.h

diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h
index 43a974556..a480379a9 100644
--- a/sw/blas/gemm/src/gemm.h
+++ b/sw/blas/gemm/src/gemm.h
@@ -13,17 +13,6 @@
 
 #pragma once
 
-// Guard to avoid conflict with DNN header file
-// TODO: move this definition to Snitch math library to solve problem
-#ifndef PRECISION_T
-#define PRECISION_T
-typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t;
-#endif
-
-typedef float v2f32 __attribute__((vector_size(8)));
-typedef __fp16 v4f16 __attribute__((vector_size(8)));
-typedef char v8f8 __attribute__((vector_size(8)));
-
 // Floating-point multiplications by zero cannot be optimized as in some
 // edge cases they do not yield zero:
 // - 0f * NaN = NaN
diff --git a/sw/dnn/fusedconv/src/main.c b/sw/dnn/fusedconv/src/main.c
index 8d25be366..c59354da5 100644
--- a/sw/dnn/fusedconv/src/main.c
+++ b/sw/dnn/fusedconv/src/main.c
@@ -24,7 +24,7 @@ int main() {
     float *ptr;
 
     if (snrt_is_dm_core() == 0) {
-        ptr = snrt_l1alloc(total_size * sizeof(float));
+        ptr = snrt_l1_alloc(total_size * sizeof(float));
         share_ptr = ptr;
     }
 
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index 2e0d3f8eb..a60b7ad7c 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -6,12 +6,7 @@
 
 #include <stdint.h>
 
-// Guard to avoid conflict with BLAS header file
-// TODO: move this definition to Snitch math library to solve problem
-#ifndef PRECISION_T
-#define PRECISION_T
-typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t;
-#endif
+#include "snrt.h"
 
 #ifndef IMPLEMENTATION_T
 #define IMPLEMENTATION_T
diff --git a/sw/snRuntime/api/alloc_decls.h b/sw/snRuntime/api/alloc_decls.h
index a276472f3..bf4c5615a 100644
--- a/sw/snRuntime/api/alloc_decls.h
+++ b/sw/snRuntime/api/alloc_decls.h
@@ -10,8 +10,8 @@
 typedef struct {
     // Base address from where allocation starts
     uint32_t base;
-    // Number of bytes alloctable
-    uint32_t size;
+    // End address up to which allocation is allowed
+    uint32_t end;
     // Address of the next allocated block
     uint32_t next;
 } snrt_allocator_t;
@@ -20,10 +20,10 @@ inline void *snrt_l1_next();
 
 inline void *snrt_l3_next();
 
-inline void *snrt_l1alloc(size_t size);
+inline void *snrt_l1_alloc(size_t size);
 
 inline void snrt_l1_update_next(void *next);
 
-inline void *snrt_l3alloc(size_t size);
+inline void *snrt_l3_alloc(size_t size);
 
 inline void snrt_alloc_init();
diff --git a/sw/snRuntime/api/start_decls.h b/sw/snRuntime/api/start_decls.h
index 28a1d942c..8bf535073 100644
--- a/sw/snRuntime/api/start_decls.h
+++ b/sw/snRuntime/api/start_decls.h
@@ -1,7 +1,9 @@
-// Copyright 2023 ETH Zurich and University of Bologna.
+// Copyright 2024 ETH Zurich and University of Bologna.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
 
-static inline void snrt_exit(int exit_code);
+inline void snrt_exit(int exit_code);
+
+inline uint32_t snrt_cls_base_addr();
diff --git a/sw/snRuntime/api/sync_decls.h b/sw/snRuntime/api/sync_decls.h
index 05db1bad1..0e18f943a 100644
--- a/sw/snRuntime/api/sync_decls.h
+++ b/sw/snRuntime/api/sync_decls.h
@@ -26,3 +26,5 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx);
 inline void snrt_cluster_hw_barrier();
 
 inline void snrt_global_barrier();
+
+inline uint32_t snrt_global_all_to_all_reduction(uint32_t value);
diff --git a/sw/snRuntime/src/alloc.c b/sw/snRuntime/src/alloc.c
index 72c12048e..044b7944c 100644
--- a/sw/snRuntime/src/alloc.c
+++ b/sw/snRuntime/src/alloc.c
@@ -12,8 +12,8 @@ extern void *snrt_l3_next();
 extern uint32_t snrt_l1_start_addr();
 extern uint32_t snrt_l1_end_addr();
 
-extern void *snrt_l1alloc(size_t size);
-extern void *snrt_l3alloc(size_t size);
+extern void *snrt_l1_alloc(size_t size);
+extern void *snrt_l3_alloc(size_t size);
 
 extern snrt_allocator_t *snrt_l1_allocator();
 extern snrt_allocator_t *snrt_l3_allocator();
diff --git a/sw/snRuntime/src/alloc.h b/sw/snRuntime/src/alloc.h
index 6d5f250bb..33852064e 100644
--- a/sw/snRuntime/src/alloc.h
+++ b/sw/snRuntime/src/alloc.h
@@ -28,7 +28,7 @@ inline void *snrt_l3_next() { return (void *)snrt_l3_allocator()->next; }
  * @param size number of bytes to allocate
  * @return pointer to the allocated memory
  */
-inline void *snrt_l1alloc(size_t size) {
+inline void *snrt_l1_alloc(size_t size) {
     snrt_allocator_t *alloc = snrt_l1_allocator();
 
     // TODO colluca: do we need this? What does it imply?
@@ -64,7 +64,7 @@ inline void snrt_l1_update_next(void *next) {
  * @param size number of bytes to allocate
  * @return pointer to the allocated memory
  */
-inline void *snrt_l3alloc(size_t size) {
+inline void *snrt_l3_alloc(size_t size) {
     snrt_allocator_t *alloc = snrt_l3_allocator();
 
     // TODO: L3 alloc size check
@@ -83,12 +83,12 @@ inline void snrt_alloc_init() {
         // occupy a possibly significant portion.
         snrt_l1_allocator()->base =
             ALIGN_UP(snrt_l1_start_addr(), MIN_CHUNK_SIZE);
-        snrt_l1_allocator()->size = snrt_l1_end_addr() - snrt_l1_start_addr();
+        snrt_l1_allocator()->end = snrt_l1_end_addr();
         snrt_l1_allocator()->next = snrt_l1_allocator()->base;
         // Initialize L3 allocator
         extern uint32_t _edram;
         snrt_l3_allocator()->base = ALIGN_UP((uint32_t)&_edram, MIN_CHUNK_SIZE);
-        snrt_l3_allocator()->size = 0;
+        snrt_l3_allocator()->end = snrt_l3_allocator()->base;
         snrt_l3_allocator()->next = snrt_l3_allocator()->base;
     }
     // Synchronize with other cores
diff --git a/sw/snRuntime/src/alloc_v2.c b/sw/snRuntime/src/alloc_v2.c
new file mode 100644
index 000000000..cd9d0a853
--- /dev/null
+++ b/sw/snRuntime/src/alloc_v2.c
@@ -0,0 +1,22 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+__thread snrt_allocator_t l1_allocator_v2;
+
+// External definitions for the inline functions defined in alloc_v2.h, so
+// that calls which the compiler does not inline can still be linked
+extern snrt_allocator_t *snrt_l1_allocator_v2();
+extern void *snrt_l1_next_v2();
+extern void snrt_l1_update_next_v2(void *next);
+extern void snrt_l1_alloc_check_bounds();
+
+extern void *snrt_l1_alloc_cluster_local(size_t size, size_t alignment);
+extern void *snrt_l1_alloc_compute_core_local(size_t size, size_t alignment);
+extern void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx,
+                                         size_t size);
+
+extern void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx,
+                                uint32_t dst_cluster_idx);
+
+extern void snrt_alloc_init_v2();
diff --git a/sw/snRuntime/src/alloc_v2.h b/sw/snRuntime/src/alloc_v2.h
new file mode 100644
index 000000000..29ffb81e5
--- /dev/null
+++ b/sw/snRuntime/src/alloc_v2.h
@@ -0,0 +1,83 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+extern __thread snrt_allocator_t l1_allocator_v2;
+
+// Returns a pointer to the thread-local (`__thread`) v2 L1 allocator state
+inline snrt_allocator_t *snrt_l1_allocator_v2() { return &l1_allocator_v2; }
+
+// Returns the v2 L1 allocator's `next` address, converted to a pointer
+inline void *snrt_l1_next_v2() { return (void *)snrt_l1_allocator_v2()->next; }
+
+// Overrides the v2 L1 allocator's `next` address, e.g. to free memory
+inline void snrt_l1_update_next_v2(void *next) {
+    snrt_l1_allocator_v2()->next = (uint32_t)next;
+}
+
+// Raises an exception (via `ecall`) if the allocator's next pointer has moved
+// past its end address; a next pointer equal to the end is still accepted.
+inline void snrt_l1_alloc_check_bounds() {
+    snrt_allocator_t *alloc = snrt_l1_allocator_v2();
+    if (alloc->next > alloc->end) asm volatile("ecall \n");
+}
+
+// Reserves `size` bytes of the cluster's L1 memory, starting from the current
+// allocation pointer rounded with ALIGN_UP to `alignment`. To be invoked by
+// every core in a cluster; every core receives a pointer to the allocation.
+inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) {
+    snrt_allocator_t *alloc = snrt_l1_allocator_v2();
+    alloc->next = ALIGN_UP(alloc->next, alignment);
+    void *retval = (void *)alloc->next;
+    alloc->next += size;
+    snrt_l1_alloc_check_bounds();
+    return retval;
+}
+
+// Dynamically allocate space for N variables of size `size` in the cluster's
+// L1 memory, N being the number of compute cores in the cluster. To be invoked
+// by every core in a cluster. Every compute core receives a pointer to a
+// unique variable among the N allocated (offset by `size` bytes per core
+// index). The return value for the DM core is undefined.
+inline void *snrt_l1_alloc_compute_core_local(size_t size,
+                                              const size_t alignment) {
+    snrt_l1_allocator_v2()->next =
+        ALIGN_UP(snrt_l1_allocator_v2()->next, alignment);
+    void *retval = (char *)snrt_l1_next_v2() + size * snrt_cluster_core_idx();
+    snrt_l1_allocator_v2()->next += size * snrt_cluster_compute_core_num();
+    snrt_l1_alloc_check_bounds();
+    return retval;
+}
+
+// Takes this core's pointer to a variable allocated using
+// `snrt_l1_alloc_compute_core_local` and returns a pointer to the copy of that
+// variable belonging to core `core_idx`, i.e. `ptr` offset by `size` times the
+// core index difference. `size` must match the one used at allocation.
+inline void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx,
+                                         size_t size) {
+    size_t offset = (core_idx - snrt_cluster_core_idx()) * size;
+    return (void *)((uintptr_t)ptr + offset);
+}
+
+// Takes a pointer into the source cluster's L1 memory and returns the pointer
+// at the same offset in the destination cluster (SNRT_CLUSTER_OFFSET stride).
+inline void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx,
+                                uint32_t dst_cluster_idx) {
+    return (void *)((uintptr_t)ptr +
+                    (dst_cluster_idx - src_cluster_idx) * SNRT_CLUSTER_OFFSET);
+}
+
+inline void snrt_alloc_init_v2() {
+    snrt_allocator_t *alloc = snrt_l1_allocator_v2();
+    // The heap ends below the top of the TCDM address space, which is
+    // reserved for the cluster-local storage (CLS) and the stack of every
+    // core. On top of that, a safety margin of 128B is left unused.
+    uint32_t heap_end_addr = snrt_cls_base_addr();
+    heap_end_addr -= (1 << SNRT_LOG2_STACK_SIZE) * snrt_cluster_core_num();
+    heap_end_addr -= 128;
+    // The heap starts at the L1 start address, aligned with ALIGN_UP to
+    // MIN_CHUNK_SIZE, and is initially empty
+    alloc->base = ALIGN_UP(snrt_l1_start_addr(), MIN_CHUNK_SIZE);
+    alloc->end = heap_end_addr;
+    alloc->next = alloc->base;
+}
diff --git a/sw/snRuntime/src/dm.h b/sw/snRuntime/src/dm.h
index b5de96763..3df2e9268 100644
--- a/sw/snRuntime/src/dm.h
+++ b/sw/snRuntime/src/dm.h
@@ -157,7 +157,7 @@ inline void dm_init(void) {
 #else
         snrt_interrupt_enable(IRQ_M_CLUSTER);
 #endif
-        dm_p = (dm_t *)snrt_l1alloc(sizeof(dm_t));
+        dm_p = (dm_t *)snrt_l1_alloc(sizeof(dm_t));
         snrt_memset((void *)dm_p, 0, sizeof(dm_t));
         dm_p_global = dm_p;
     } else {
diff --git a/sw/snRuntime/src/omp/eu.h b/sw/snRuntime/src/omp/eu.h
index 1bbed0c89..7b587240f 100644
--- a/sw/snRuntime/src/omp/eu.h
+++ b/sw/snRuntime/src/omp/eu.h
@@ -165,7 +165,7 @@ inline uint32_t eu_get_workers_in_wfi() {
 inline void eu_init(void) {
     if (snrt_cluster_core_idx() == 0) {
         // Allocate the eu struct in L1 for fast access
-        eu_p = snrt_l1alloc(sizeof(eu_t));
+        eu_p = snrt_l1_alloc(sizeof(eu_t));
         snrt_memset((void *)eu_p, 0, sizeof(eu_t));
         // store copy of eu_p on shared memory
         eu_p_global = eu_p;
diff --git a/sw/snRuntime/src/omp/omp.c b/sw/snRuntime/src/omp/omp.c
index 5ebc91ded..6fdeec2e5 100644
--- a/sw/snRuntime/src/omp/omp.c
+++ b/sw/snRuntime/src/omp/omp.c
@@ -48,10 +48,10 @@ static inline void initTeam(omp_t *_this, omp_team_t *team) {
 void omp_init(void) {
     if (snrt_cluster_core_idx() == 0) {
         // allocate space for kmp arguments
-        kmpc_args =
-            (_kmp_ptr32 *)snrt_l1alloc(sizeof(_kmp_ptr32) * KMP_FORK_MAX_NARGS);
+        kmpc_args = (_kmp_ptr32 *)snrt_l1_alloc(sizeof(_kmp_ptr32) *
+                                                KMP_FORK_MAX_NARGS);
 #ifndef OMPSTATIC_NUMTHREADS
-        omp_p = (omp_t *)snrt_l1alloc(sizeof(omp_t));
+        omp_p = (omp_t *)snrt_l1_alloc(sizeof(omp_t));
         unsigned int nbCores = snrt_cluster_compute_core_num();
         omp_p->numThreads = nbCores;
         omp_p->maxThreads = nbCores;
@@ -67,20 +67,20 @@ void omp_init(void) {
 
         initTeam((omp_t *)omp_p, (omp_team_t *)&omp_p->plainTeam);
         omp_p->kmpc_barrier =
-            (snrt_barrier_t *)snrt_l1alloc(sizeof(snrt_barrier_t));
+            (snrt_barrier_t *)snrt_l1_alloc(sizeof(snrt_barrier_t));
         snrt_memset(omp_p->kmpc_barrier, 0, sizeof(snrt_barrier_t));
         // Exchange omp pointer with other cluster cores
         omp_p_global = omp_p;
 #else
         omp_p.kmpc_barrier =
-            (snrt_barrier_t *)snrt_l1alloc(sizeof(snrt_barrier_t));
+            (snrt_barrier_t *)snrt_l1_alloc(sizeof(snrt_barrier_t));
         snrt_memset(omp_p.kmpc_barrier, 0, sizeof(snrt_barrier_t));
         // Exchange omp pointer with other cluster cores
         omp_p_global = &omp_p;
 #endif
 
 #ifdef OPENMP_PROFILE
-        omp_prof = (omp_prof_t *)snrt_l1alloc(sizeof(omp_prof_t));
+        omp_prof = (omp_prof_t *)snrt_l1_alloc(sizeof(omp_prof_t));
 #endif
 
     } else {
diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
index c91f4c25c..824c8231f 100644
--- a/sw/snRuntime/src/start.c
+++ b/sw/snRuntime/src/start.c
@@ -2,21 +2,8 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-#ifdef OPENOCD_SEMIHOSTING
-#include "openocd.h"
-#endif
-
 #ifdef SNRT_INIT_CLS
-static inline uint32_t snrt_cls_base_addr() {
-    extern volatile uint32_t __cdata_start, __cdata_end;
-    extern volatile uint32_t __cbss_start, __cbss_end;
-    uint32_t cdata_size = ((uint32_t)&__cdata_end) - ((uint32_t)&__cdata_start);
-    uint32_t cbss_size = ((uint32_t)&__cbss_end) - ((uint32_t)&__cbss_start);
-    uint32_t l1_end_addr = SNRT_TCDM_START_ADDR +
-                           snrt_cluster_idx() * SNRT_CLUSTER_OFFSET +
-                           SNRT_TCDM_SIZE;
-    return l1_end_addr - cdata_size - cbss_size;
-}
+extern uint32_t snrt_cls_base_addr();
 #endif
 
 #ifdef SNRT_INIT_TLS
@@ -98,21 +85,16 @@ static inline void snrt_init_cls() {
 #endif
 
 #ifdef SNRT_INIT_LIBS
-static inline void snrt_init_libs() { snrt_alloc_init(); }
+static inline void snrt_init_libs() {
+    snrt_alloc_init();
+    snrt_alloc_init_v2();
+}
 #endif
 
 #ifdef SNRT_CRT0_EXIT
-static inline void snrt_exit_default(int exit_code) {
-    exit_code = snrt_global_all_to_all_reduction(exit_code);
-#ifdef OPENOCD_SEMIHOSTING
-    if (snrt_global_core_idx() == 0) __ocd_semihost_exit(exit_code);
-#else
-    if (snrt_global_core_idx() == 0)
-        *(snrt_exit_code_destination()) = (exit_code << 1) | 1;
-#endif
-}
+extern void snrt_exit_default(int exit_code);
 #ifndef SNRT_CRT0_ALTERNATE_EXIT
-static inline void snrt_exit(int exit_code) { snrt_exit_default(exit_code); }
+extern void snrt_exit(int exit_code);
 #endif
 #endif
 
diff --git a/sw/snRuntime/src/start.h b/sw/snRuntime/src/start.h
new file mode 100644
index 000000000..2ef7abbf4
--- /dev/null
+++ b/sw/snRuntime/src/start.h
@@ -0,0 +1,35 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifdef OPENOCD_SEMIHOSTING
+#include "openocd.h"
+#endif
+
+#ifdef SNRT_CRT0_EXIT
+inline void snrt_exit_default(int exit_code) {
+    exit_code = snrt_global_all_to_all_reduction(exit_code);
+#ifdef OPENOCD_SEMIHOSTING
+    if (snrt_global_core_idx() == 0) __ocd_semihost_exit(exit_code);
+#else
+    if (snrt_global_core_idx() == 0)  // unsigned shift: no UB if negative
+        *(snrt_exit_code_destination()) = ((uint32_t)exit_code << 1) | 1;
+#endif  // OPENOCD_SEMIHOSTING
+}
+#ifndef SNRT_CRT0_ALTERNATE_EXIT
+inline void snrt_exit(int exit_code) { snrt_exit_default(exit_code); }
+#endif  // SNRT_CRT0_ALTERNATE_EXIT
+#endif  // SNRT_CRT0_EXIT
+
+#ifdef SNRT_INIT_CLS
+inline uint32_t snrt_cls_base_addr() {
+    extern volatile uint32_t __cdata_start, __cdata_end;
+    extern volatile uint32_t __cbss_start, __cbss_end;
+    uint32_t cdata_size = ((uint32_t)&__cdata_end) - ((uint32_t)&__cdata_start);
+    uint32_t cbss_size = ((uint32_t)&__cbss_end) - ((uint32_t)&__cbss_start);
+    uint32_t l1_end_addr = SNRT_TCDM_START_ADDR +
+                           snrt_cluster_idx() * SNRT_CLUSTER_OFFSET +
+                           SNRT_TCDM_SIZE;
+    return l1_end_addr - cdata_size - cbss_size;  // TCDM end minus both sizes
+}
+#endif  // SNRT_INIT_CLS
diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h
index c6c012ca9..fa4b75b24 100644
--- a/sw/snRuntime/src/sync.h
+++ b/sw/snRuntime/src/sync.h
@@ -66,25 +66,31 @@ inline void snrt_cluster_hw_barrier() {
     asm volatile("csrr x0, 0x7C2" ::: "memory");
 }
 
+// Software barrier across clusters, built on the shared `_snrt_barrier`.
+// Exactly one core per cluster is expected to invoke this function.
+inline void snrt_inter_cluster_barrier() {
+    // Snapshot the iteration, then atomically increment the arrival counter
+    uint32_t prev_barrier_iteration = _snrt_barrier.iteration;
+    uint32_t cnt =
+        __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);
+
+    // Last arrival resets the counter and bumps the iteration; others spin
+    if (cnt == snrt_cluster_num()) {
+        _snrt_barrier.cnt = 0;
+        __atomic_add_fetch(&(_snrt_barrier.iteration), 1, __ATOMIC_RELAXED);
+    } else {
+        while (prev_barrier_iteration == _snrt_barrier.iteration)
+            ;
+    }
+}
+
 /// Synchronize clusters globally with a global software barrier
 inline void snrt_global_barrier() {
     snrt_cluster_hw_barrier();
 
     // Synchronize all DM cores in software
     if (snrt_is_dm_core()) {
-        // Remember previous iteration
-        uint32_t prev_barrier_iteration = _snrt_barrier.iteration;
-        uint32_t cnt =
-            __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);
-
-        // Increment the barrier counter
-        if (cnt == snrt_cluster_num()) {
-            _snrt_barrier.cnt = 0;
-            __atomic_add_fetch(&(_snrt_barrier.iteration), 1, __ATOMIC_RELAXED);
-        } else {
-            while (prev_barrier_iteration == _snrt_barrier.iteration)
-                ;
-        }
+        snrt_inter_cluster_barrier();
     }
     // Synchronize cores in a cluster with the HW barrier
     snrt_cluster_hw_barrier();
diff --git a/sw/snRuntime/src/team.c b/sw/snRuntime/src/team.c
index 5290e1d28..e5b83e61c 100644
--- a/sw/snRuntime/src/team.c
+++ b/sw/snRuntime/src/team.c
@@ -26,6 +26,8 @@ extern uint32_t snrt_cluster_compute_core_num();
 
 extern int snrt_is_compute_core();
 
+extern int snrt_cluster_is_last_compute_core();
+
 extern int snrt_is_dm_core();
 
 extern uint32_t snrt_cluster_dm_core_num();
diff --git a/sw/snRuntime/src/team.h b/sw/snRuntime/src/team.h
index 1b49bfaad..eb06a4488 100644
--- a/sw/snRuntime/src/team.h
+++ b/sw/snRuntime/src/team.h
@@ -59,6 +59,10 @@ inline int __attribute__((const)) snrt_is_compute_core() {
     return snrt_cluster_core_idx() < snrt_cluster_compute_core_num();
 }
 
+inline int __attribute__((const)) snrt_cluster_is_last_compute_core() {
+    return snrt_cluster_core_idx() == (snrt_cluster_compute_core_num() - 1);
+}
+
 inline int __attribute__((const)) snrt_is_dm_core() {
     return !snrt_is_compute_core();
 }
diff --git a/sw/snRuntime/src/types.h b/sw/snRuntime/src/types.h
new file mode 100644
index 000000000..235a0ecbb
--- /dev/null
+++ b/sw/snRuntime/src/types.h
@@ -0,0 +1,9 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t;
+
+typedef float v2f32 __attribute__((vector_size(8)));   // 8-byte float vector
+typedef __fp16 v4f16 __attribute__((vector_size(8)));  // 8-byte __fp16 vector
+typedef char v8f8 __attribute__((vector_size(8)));     // 8-byte char vector
diff --git a/sw/tests/atomics.c b/sw/tests/atomics.c
index 69dfb0000..f989d0c6b 100644
--- a/sw/tests/atomics.c
+++ b/sw/tests/atomics.c
@@ -183,9 +183,9 @@ int main() {
 
     if (core_id == 0) {
         volatile uint32_t* l1_a =
-            snrt_l1alloc(NUM_TCDM_LOCATIONS * sizeof(uint32_t));
+            snrt_l1_alloc(NUM_TCDM_LOCATIONS * sizeof(uint32_t));
         volatile uint32_t* l3_a =
-            snrt_l3alloc(NUM_SPM_LOCATIONS * sizeof(uint32_t));
+            snrt_l3_alloc(NUM_SPM_LOCATIONS * sizeof(uint32_t));
 
         // In TCDM
         uint32_t tcdm_atomics[NUM_TCDM_LOCATIONS];
diff --git a/sw/tests/data_mover.c b/sw/tests/data_mover.c
index 388262bb7..05eea2a60 100644
--- a/sw/tests/data_mover.c
+++ b/sw/tests/data_mover.c
@@ -39,14 +39,14 @@ int main() {
     // Prepare data buffers
     const uint32_t n_elem = 128, n_rep = 4;
     uint32_t *l1_a, *l1_b, *l1_c, *l1_d, *l1_2d_a;
-    l1_a = snrt_l1alloc(n_elem * sizeof(uint32_t));
-    l1_b = snrt_l1alloc(n_elem * sizeof(uint32_t));
-    l1_c = snrt_l1alloc(n_elem * sizeof(uint32_t));
-    l1_d = snrt_l1alloc(n_elem * sizeof(uint32_t));
-    l1_2d_a = snrt_l1alloc(n_elem * n_rep * sizeof(uint32_t));
+    l1_a = snrt_l1_alloc(n_elem * sizeof(uint32_t));
+    l1_b = snrt_l1_alloc(n_elem * sizeof(uint32_t));
+    l1_c = snrt_l1_alloc(n_elem * sizeof(uint32_t));
+    l1_d = snrt_l1_alloc(n_elem * sizeof(uint32_t));
+    l1_2d_a = snrt_l1_alloc(n_elem * n_rep * sizeof(uint32_t));
     uint32_t *l3_a, *l3_2d_a;
-    l3_a = snrt_l3alloc(n_elem * sizeof(uint32_t));
-    l3_2d_a = snrt_l3alloc(n_elem * n_rep * sizeof(uint32_t));
+    l3_a = snrt_l3_alloc(n_elem * sizeof(uint32_t));
+    l3_2d_a = snrt_l3_alloc(n_elem * n_rep * sizeof(uint32_t));
 
     printf("-- Test 1: L1 -> L1\n");
     for (uint32_t i = 0; i < n_elem; ++i) l1_a[i] = i;
diff --git a/sw/tests/openmp_double_buffering.c b/sw/tests/openmp_double_buffering.c
index 128fd7d71..f00ef3086 100644
--- a/sw/tests/openmp_double_buffering.c
+++ b/sw/tests/openmp_double_buffering.c
@@ -13,8 +13,8 @@ unsigned __attribute__((noinline)) double_buffering(void) {
     static double *bufx, *bufy, *x, *y;
     static double a;
 
-    bufx = snrt_l1alloc(sizeof(double) * 2 * TILESIZE);
-    bufy = snrt_l1alloc(sizeof(double) * 2 * TILESIZE);
+    bufx = snrt_l1_alloc(sizeof(double) * 2 * TILESIZE);
+    bufy = snrt_l1_alloc(sizeof(double) * 2 * TILESIZE);
     x = axpy_4096_x;
     y = axpy_4096_y;
     a = axpy_4096_a;
diff --git a/sw/tests/openmp_for_static_schedule.c b/sw/tests/openmp_for_static_schedule.c
index 6c00bd2cb..af16091f9 100644
--- a/sw/tests/openmp_for_static_schedule.c
+++ b/sw/tests/openmp_for_static_schedule.c
@@ -10,8 +10,8 @@ unsigned __attribute__((noinline)) static_schedule(void) {
     static double *data_x, *data_y, data_a;
 
     // Allocate AXPY input vectors
-    data_x = snrt_l1alloc(sizeof(double) * AXPY_N);
-    data_y = snrt_l1alloc(sizeof(double) * AXPY_N);
+    data_x = snrt_l1_alloc(sizeof(double) * AXPY_N);
+    data_y = snrt_l1_alloc(sizeof(double) * AXPY_N);
 
     // Initialize AXPY input vectors
     data_a = 10.0;
diff --git a/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.c b/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.c
index 6c079c12b..b2061c22e 100644
--- a/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.c
+++ b/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.c
@@ -2,17 +2,5 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-#define SNRT_INIT_TLS
-#define SNRT_INIT_BSS
-#define SNRT_INIT_CLS
-#define SNRT_INIT_LIBS
-#define SNRT_CRT0_PRE_BARRIER
-#define SNRT_INVOKE_MAIN
-#define SNRT_CRT0_POST_BARRIER
-#define SNRT_CRT0_EXIT
-
-static inline volatile uint32_t* snrt_exit_code_destination() {
-    return (volatile uint32_t*)0x02000014;
-}
-
+#include "snitch_cluster_start.h"
 #include "start.c"
diff --git a/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.h b/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.h
new file mode 100644
index 000000000..dc25d264e
--- /dev/null
+++ b/target/snitch_cluster/sw/runtime/banshee/src/snitch_cluster_start.h
@@ -0,0 +1,20 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define SNRT_INIT_TLS
+#define SNRT_INIT_BSS
+#define SNRT_INIT_CLS
+#define SNRT_INIT_LIBS
+#define SNRT_CRT0_PRE_BARRIER
+#define SNRT_INVOKE_MAIN
+#define SNRT_CRT0_POST_BARRIER
+#define SNRT_CRT0_EXIT
+
+static inline volatile uint32_t* snrt_exit_code_destination() {
+    return (volatile uint32_t*)0x02000014;
+}
+
+#include "start.h"
diff --git a/target/snitch_cluster/sw/runtime/banshee/src/snrt.c b/target/snitch_cluster/sw/runtime/banshee/src/snrt.c
index fcd4ba4de..ebf23ce55 100644
--- a/target/snitch_cluster/sw/runtime/banshee/src/snrt.c
+++ b/target/snitch_cluster/sw/runtime/banshee/src/snrt.c
@@ -5,6 +5,7 @@
 #include "snrt.h"
 
 #include "alloc.c"
+#include "alloc_v2.c"
 #include "cls.c"
 #include "cluster_interrupts.c"
 #include "dm.c"
diff --git a/target/snitch_cluster/sw/runtime/banshee/src/snrt.h b/target/snitch_cluster/sw/runtime/banshee/src/snrt.h
index 1f510f4e5..a9857e850 100644
--- a/target/snitch_cluster/sw/runtime/banshee/src/snrt.h
+++ b/target/snitch_cluster/sw/runtime/banshee/src/snrt.h
@@ -21,6 +21,7 @@
 
 // Implementation
 #include "alloc.h"
+#include "alloc_v2.h"
 #include "cls.h"
 #include "cluster_interrupts.h"
 #include "dm.h"
@@ -33,6 +34,8 @@
 #include "printf.h"
 #include "riscv.h"
 #include "snitch_cluster_global_interrupts.h"
+#include "snitch_cluster_start.h"
 #include "ssr.h"
 #include "sync.h"
 #include "team.h"
+#include "types.h"
diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.c b/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.c
index 3a4ab9b7c..b2061c22e 100644
--- a/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.c
+++ b/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.c
@@ -2,19 +2,5 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-#define SNRT_INIT_TLS
-#define SNRT_INIT_BSS
-#define SNRT_INIT_CLS
-#define SNRT_INIT_LIBS
-#define SNRT_CRT0_PRE_BARRIER
-#define SNRT_INVOKE_MAIN
-#define SNRT_CRT0_POST_BARRIER
-#define SNRT_CRT0_EXIT
-
-#ifndef OPENOCD_SEMIHOSTING
-static inline volatile uint32_t* snrt_exit_code_destination() {
-    return (volatile uint32_t*)&tohost;
-}
-#endif
-
+#include "snitch_cluster_start.h"
 #include "start.c"
diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.h b/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.h
new file mode 100644
index 000000000..a413dc3d1
--- /dev/null
+++ b/target/snitch_cluster/sw/runtime/rtl/src/snitch_cluster_start.h
@@ -0,0 +1,24 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define SNRT_INIT_TLS
+#define SNRT_INIT_BSS
+#define SNRT_INIT_CLS
+#define SNRT_INIT_LIBS
+#define SNRT_CRT0_PRE_BARRIER
+#define SNRT_INVOKE_MAIN
+#define SNRT_CRT0_POST_BARRIER
+#define SNRT_CRT0_EXIT
+
+extern volatile uint32_t tohost;
+
+#ifndef OPENOCD_SEMIHOSTING
+static inline volatile uint32_t* snrt_exit_code_destination() {
+    return (volatile uint32_t*)&tohost;
+}
+#endif
+
+#include "start.h"
diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snrt.c b/target/snitch_cluster/sw/runtime/rtl/src/snrt.c
index fcd4ba4de..ebf23ce55 100644
--- a/target/snitch_cluster/sw/runtime/rtl/src/snrt.c
+++ b/target/snitch_cluster/sw/runtime/rtl/src/snrt.c
@@ -5,6 +5,7 @@
 #include "snrt.h"
 
 #include "alloc.c"
+#include "alloc_v2.c"
 #include "cls.c"
 #include "cluster_interrupts.c"
 #include "dm.c"
diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snrt.h b/target/snitch_cluster/sw/runtime/rtl/src/snrt.h
index a5a471e63..426b623ed 100644
--- a/target/snitch_cluster/sw/runtime/rtl/src/snrt.h
+++ b/target/snitch_cluster/sw/runtime/rtl/src/snrt.h
@@ -21,6 +21,7 @@
 
 // Implementation
 #include "alloc.h"
+#include "alloc_v2.h"
 #include "cls.h"
 #include "cluster_interrupts.h"
 #include "dm.h"
@@ -33,6 +34,8 @@
 #include "printf.h"
 #include "riscv.h"
 #include "snitch_cluster_global_interrupts.h"
+#include "snitch_cluster_start.h"
 #include "ssr.h"
 #include "sync.h"
 #include "team.h"
+#include "types.h"