snRuntime: Add alloc_v2 functions and minor changes (#186)

* snRuntime: Add `snrt_inter_cluster_barrier()` function
* snRuntime: Add private L1 alloc pointers and heap bound checks
* snRuntime: Add `snrt_compute_core_local_ptr()` function
* sw: Move floating-point types to snRuntime
* snRuntime: Add `snrt_l1_update_next_v2()` used to free memory
* snRuntime: Add `snrt_cluster_is_last_compute_core()` function
pulp-platform · Aug 23, 2024 · 198e990 · 198e990
1 parent b9f8903
commit 198e990
Show file tree

Hide file tree

Showing 31 changed files with 267 additions and 117 deletions.
diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h
@@ -13,17 +13,6 @@
 
 #pragma once
 
-// Guard to avoid conflict with DNN header file
-// TODO: move this definition to Snitch math library to solve problem
-#ifndef PRECISION_T
-#define PRECISION_T
-typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t;
-#endif
-
-typedef float v2f32 __attribute__((vector_size(8)));
-typedef __fp16 v4f16 __attribute__((vector_size(8)));
-typedef char v8f8 __attribute__((vector_size(8)));
-
 // Floating-point multiplications by zero cannot be optimized as in some
 // edge cases they do not yield zero:
 // - 0f * NaN = NaN

diff --git a/sw/dnn/fusedconv/src/main.c b/sw/dnn/fusedconv/src/main.c
@@ -24,7 +24,7 @@ int main() {
     float *ptr;
 
     if (snrt_is_dm_core() == 0) {
-        ptr = snrt_l1alloc(total_size * sizeof(float));
+        ptr = snrt_l1_alloc(total_size * sizeof(float));
         share_ptr = ptr;
     }
 

diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
@@ -6,12 +6,7 @@
 
 #include <stdint.h>
 
-// Guard to avoid conflict with BLAS header file
-// TODO: move this definition to Snitch math library to solve problem
-#ifndef PRECISION_T
-#define PRECISION_T
-typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t;
-#endif
+#include "snrt.h"
 
 #ifndef IMPLEMENTATION_T
 #define IMPLEMENTATION_T

diff --git a/sw/snRuntime/api/alloc_decls.h b/sw/snRuntime/api/alloc_decls.h
@@ -10,8 +10,8 @@
 typedef struct {
     // Base address from where allocation starts
     uint32_t base;
-    // Number of bytes alloctable
-    uint32_t size;
+    // End address up to which allocation is allowed
+    uint32_t end;
     // Address of the next allocated block
     uint32_t next;
 } snrt_allocator_t;
@@ -20,10 +20,10 @@ inline void *snrt_l1_next();
 
 inline void *snrt_l3_next();
 
-inline void *snrt_l1alloc(size_t size);
+inline void *snrt_l1_alloc(size_t size);
 
 inline void snrt_l1_update_next(void *next);
 
-inline void *snrt_l3alloc(size_t size);
+inline void *snrt_l3_alloc(size_t size);
 
 inline void snrt_alloc_init();
diff --git a/sw/snRuntime/api/start_decls.h b/sw/snRuntime/api/start_decls.h
@@ -1,7 +1,9 @@
-// Copyright 2023 ETH Zurich and University of Bologna.
+// Copyright 2024 ETH Zurich and University of Bologna.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
 
-static inline void snrt_exit(int exit_code);
+inline void snrt_exit(int exit_code);
+
+inline uint32_t snrt_cls_base_addr();
diff --git a/sw/snRuntime/api/sync_decls.h b/sw/snRuntime/api/sync_decls.h
@@ -26,3 +26,5 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx);
 inline void snrt_cluster_hw_barrier();
 
 inline void snrt_global_barrier();
+
+inline uint32_t snrt_global_all_to_all_reduction(uint32_t value);
diff --git a/sw/snRuntime/src/alloc.c b/sw/snRuntime/src/alloc.c
@@ -12,8 +12,8 @@ extern void *snrt_l3_next();
 extern uint32_t snrt_l1_start_addr();
 extern uint32_t snrt_l1_end_addr();
 
-extern void *snrt_l1alloc(size_t size);
-extern void *snrt_l3alloc(size_t size);
+extern void *snrt_l1_alloc(size_t size);
+extern void *snrt_l3_alloc(size_t size);
 
 extern snrt_allocator_t *snrt_l1_allocator();
 extern snrt_allocator_t *snrt_l3_allocator();

diff --git a/sw/snRuntime/src/alloc.h b/sw/snRuntime/src/alloc.h
@@ -28,7 +28,7 @@ inline void *snrt_l3_next() { return (void *)snrt_l3_allocator()->next; }
  * @param size number of bytes to allocate
  * @return pointer to the allocated memory
  */
-inline void *snrt_l1alloc(size_t size) {
+inline void *snrt_l1_alloc(size_t size) {
     snrt_allocator_t *alloc = snrt_l1_allocator();
 
     // TODO colluca: do we need this? What does it imply?
@@ -64,7 +64,7 @@ inline void snrt_l1_update_next(void *next) {
  * @param size number of bytes to allocate
  * @return pointer to the allocated memory
  */
-inline void *snrt_l3alloc(size_t size) {
+inline void *snrt_l3_alloc(size_t size) {
     snrt_allocator_t *alloc = snrt_l3_allocator();
 
     // TODO: L3 alloc size check
@@ -83,12 +83,12 @@ inline void snrt_alloc_init() {
         // occupy a possibly significant portion.
         snrt_l1_allocator()->base =
             ALIGN_UP(snrt_l1_start_addr(), MIN_CHUNK_SIZE);
-        snrt_l1_allocator()->size = snrt_l1_end_addr() - snrt_l1_start_addr();
+        snrt_l1_allocator()->end = snrt_l1_end_addr();
         snrt_l1_allocator()->next = snrt_l1_allocator()->base;
         // Initialize L3 allocator
         extern uint32_t _edram;
         snrt_l3_allocator()->base = ALIGN_UP((uint32_t)&_edram, MIN_CHUNK_SIZE);
-        snrt_l3_allocator()->size = 0;
+        snrt_l3_allocator()->end = snrt_l3_allocator()->base;
         snrt_l3_allocator()->next = snrt_l3_allocator()->base;
     }
     // Synchronize with other cores

diff --git a/sw/snRuntime/src/alloc_v2.c b/sw/snRuntime/src/alloc_v2.c
@@ -0,0 +1,15 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Thread-local state of the v2 L1 allocator (declared in alloc_v2.h).
+__thread snrt_allocator_t l1_allocator_v2;
+
+// The functions below are defined `inline` in alloc_v2.h. Under C99 inline
+// semantics an inline definition does not provide an external definition, so
+// each of them is re-declared `extern` here to emit a linkable symbol for
+// call sites that the compiler decides not to inline.
+extern snrt_allocator_t *snrt_l1_allocator_v2();
+extern void *snrt_l1_next_v2();
+extern void snrt_l1_update_next_v2(void *next);
+extern void snrt_l1_alloc_check_bounds();
+
+extern void *snrt_l1_alloc_cluster_local(size_t size, size_t alignment);
+extern void *snrt_l1_alloc_compute_core_local(size_t size, size_t alignment);
+
+extern void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx,
+                                         size_t size);
+extern void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx,
+                                uint32_t dst_cluster_idx);
+
+extern void snrt_alloc_init_v2();
diff --git a/sw/snRuntime/src/alloc_v2.h b/sw/snRuntime/src/alloc_v2.h
@@ -0,0 +1,83 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Thread-local state of the v2 L1 allocator; the object itself is defined in
+// alloc_v2.c.
+extern __thread snrt_allocator_t l1_allocator_v2;
+
+/**
+ * @brief Get the v2 L1 allocator state of the calling thread
+ * @return pointer to the thread-local `l1_allocator_v2` object
+ */
+inline snrt_allocator_t *snrt_l1_allocator_v2() {
+    snrt_allocator_t *alloc = &l1_allocator_v2;
+    return alloc;
+}
+
+/**
+ * @brief Read the next pointer of the v2 L1 allocator
+ * @return the allocator's `next` field, converted to a pointer
+ */
+inline void *snrt_l1_next_v2() {
+    snrt_allocator_t *alloc = snrt_l1_allocator_v2();
+    return (void *)alloc->next;
+}
+
+/**
+ * @brief Overwrite the next pointer of the v2 L1 allocator
+ * @param next address stored as the new `next` value; it is not checked
+ *             against the allocator bounds here
+ */
+inline void snrt_l1_update_next_v2(void *next) {
+    snrt_allocator_t *alloc = snrt_l1_allocator_v2();
+    alloc->next = (uint32_t)next;
+}
+
+/**
+ * @brief Verify that the v2 L1 allocator has not run past its end address
+ *
+ * Raises an exception by executing an `ecall` instruction when `next` is
+ * strictly greater than `end`; `next == end` is still accepted.
+ */
+inline void snrt_l1_alloc_check_bounds() {
+    snrt_allocator_t *alloc = snrt_l1_allocator_v2();
+    // Nothing to do while the allocation pointer is within bounds
+    if (alloc->next <= alloc->end) return;
+    asm volatile("ecall \n");
+}
+
+/**
+ * @brief Allocate one variable of `size` bytes in the cluster's L1 memory
+ *
+ * Meant to be invoked by every core in a cluster; every core receives a
+ * pointer to the allocated variable.
+ *
+ * @param size number of bytes to allocate
+ * @param alignment alignment applied to the allocation pointer via ALIGN_UP
+ * @return pointer to the allocated variable
+ */
+inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) {
+    snrt_allocator_t *alloc = snrt_l1_allocator_v2();
+    // Round the allocation pointer up to the requested alignment
+    alloc->next = ALIGN_UP(alloc->next, alignment);
+    void *block = (void *)alloc->next;
+    // Reserve the block, then trap if the heap end has been exceeded
+    alloc->next += size;
+    snrt_l1_alloc_check_bounds();
+    return block;
+}
+
+/**
+ * @brief Allocate one variable of `size` bytes per compute core in the
+ *        cluster's L1 memory
+ *
+ * Space for N variables is reserved, N being the number of compute cores in
+ * the cluster. This function should be invoked by every core in a cluster.
+ * Every compute core receives a pointer to a unique variable among the N
+ * which have been allocated. The return value for the DM core is undefined.
+ *
+ * @param size number of bytes of a single variable
+ * @param alignment alignment applied to the allocation pointer via ALIGN_UP
+ *                  before the N variables are laid out back to back
+ * @return pointer to the calling core's variable
+ */
+inline void *snrt_l1_alloc_compute_core_local(size_t size,
+                                              const size_t alignment) {
+    snrt_l1_allocator_v2()->next =
+        ALIGN_UP(snrt_l1_allocator_v2()->next, alignment);
+    // Byte offsets are applied through uintptr_t: arithmetic directly on a
+    // `void *` is a GNU extension, not ISO C.
+    void *retval = (void *)((uintptr_t)snrt_l1_next_v2() +
+                            size * snrt_cluster_core_idx());
+    snrt_l1_allocator_v2()->next += size * snrt_cluster_compute_core_num();
+    snrt_l1_alloc_check_bounds();
+    return retval;
+}
+
+/**
+ * @brief Get another core's instance of a per-core variable
+ *
+ * @param ptr pointer to the calling core's variable, as returned by
+ *            `snrt_l1_alloc_compute_core_local`
+ * @param core_idx index of the core whose variable is requested
+ * @param size size of a single variable; should be the same value used
+ *             during allocation
+ * @return pointer to the variable belonging to core `core_idx`
+ */
+inline void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx,
+                                         size_t size) {
+    uintptr_t own_addr = (uintptr_t)ptr;
+    // Distance between the two cores' variables, in bytes
+    size_t delta = (core_idx - snrt_cluster_core_idx()) * size;
+    return (void *)(own_addr + delta);
+}
+
+/**
+ * @brief Translate an L1 pointer from one cluster to another
+ *
+ * @param ptr pointer to a variable in the source cluster's L1 memory
+ * @param src_cluster_idx index of the cluster `ptr` belongs to
+ * @param dst_cluster_idx index of the cluster to translate the pointer to
+ * @return pointer to the same offset in the destination cluster's L1 memory,
+ *         obtained by moving `ptr` by a multiple of SNRT_CLUSTER_OFFSET
+ */
+inline void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx,
+                                uint32_t dst_cluster_idx) {
+    uintptr_t local_addr = (uintptr_t)ptr;
+    return (void *)(local_addr + (dst_cluster_idx - src_cluster_idx) *
+                                     SNRT_CLUSTER_OFFSET);
+}
+
+/**
+ * @brief Initialize the v2 L1 allocator of the calling thread
+ *
+ * The heap starts at the L1 start address, aligned up to MIN_CHUNK_SIZE. It
+ * ends below the region at the top of the TCDM which is reserved for the
+ * cluster-local storage (CLS) and the stack of every core, minus a further
+ * safety margin of 128B.
+ */
+inline void snrt_alloc_init_v2() {
+    // NOTE(review): snrt_cls_base_addr() is only defined when SNRT_INIT_CLS
+    // is set (see start.h) - confirm this is never built without it.
+    uint32_t heap_top = snrt_cls_base_addr();
+    // Step over the stacks of all cores in the cluster
+    heap_top -= (1 << SNRT_LOG2_STACK_SIZE) * snrt_cluster_core_num();
+    // Safety margin
+    heap_top -= 128;
+    // Initialize L1 allocator: the heap is empty, so next starts at base
+    snrt_allocator_t *alloc = snrt_l1_allocator_v2();
+    uint32_t heap_base = ALIGN_UP(snrt_l1_start_addr(), MIN_CHUNK_SIZE);
+    alloc->base = heap_base;
+    alloc->end = heap_top;
+    alloc->next = heap_base;
+}
diff --git a/sw/snRuntime/src/dm.h b/sw/snRuntime/src/dm.h
@@ -157,7 +157,7 @@ inline void dm_init(void) {
 #else
         snrt_interrupt_enable(IRQ_M_CLUSTER);
 #endif
-        dm_p = (dm_t *)snrt_l1alloc(sizeof(dm_t));
+        dm_p = (dm_t *)snrt_l1_alloc(sizeof(dm_t));
         snrt_memset((void *)dm_p, 0, sizeof(dm_t));
         dm_p_global = dm_p;
     } else {

diff --git a/sw/snRuntime/src/omp/eu.h b/sw/snRuntime/src/omp/eu.h
@@ -165,7 +165,7 @@ inline uint32_t eu_get_workers_in_wfi() {
 inline void eu_init(void) {
     if (snrt_cluster_core_idx() == 0) {
         // Allocate the eu struct in L1 for fast access
-        eu_p = snrt_l1alloc(sizeof(eu_t));
+        eu_p = snrt_l1_alloc(sizeof(eu_t));
         snrt_memset((void *)eu_p, 0, sizeof(eu_t));
         // store copy of eu_p on shared memory
         eu_p_global = eu_p;

diff --git a/sw/snRuntime/src/omp/omp.c b/sw/snRuntime/src/omp/omp.c
@@ -48,10 +48,10 @@ static inline void initTeam(omp_t *_this, omp_team_t *team) {
 void omp_init(void) {
     if (snrt_cluster_core_idx() == 0) {
         // allocate space for kmp arguments
-        kmpc_args =
-            (_kmp_ptr32 *)snrt_l1alloc(sizeof(_kmp_ptr32) * KMP_FORK_MAX_NARGS);
+        kmpc_args = (_kmp_ptr32 *)snrt_l1_alloc(sizeof(_kmp_ptr32) *
+                                                KMP_FORK_MAX_NARGS);
 #ifndef OMPSTATIC_NUMTHREADS
-        omp_p = (omp_t *)snrt_l1alloc(sizeof(omp_t));
+        omp_p = (omp_t *)snrt_l1_alloc(sizeof(omp_t));
         unsigned int nbCores = snrt_cluster_compute_core_num();
         omp_p->numThreads = nbCores;
         omp_p->maxThreads = nbCores;
@@ -67,20 +67,20 @@ void omp_init(void) {
 
         initTeam((omp_t *)omp_p, (omp_team_t *)&omp_p->plainTeam);
         omp_p->kmpc_barrier =
-            (snrt_barrier_t *)snrt_l1alloc(sizeof(snrt_barrier_t));
+            (snrt_barrier_t *)snrt_l1_alloc(sizeof(snrt_barrier_t));
         snrt_memset(omp_p->kmpc_barrier, 0, sizeof(snrt_barrier_t));
         // Exchange omp pointer with other cluster cores
         omp_p_global = omp_p;
 #else
         omp_p.kmpc_barrier =
-            (snrt_barrier_t *)snrt_l1alloc(sizeof(snrt_barrier_t));
+            (snrt_barrier_t *)snrt_l1_alloc(sizeof(snrt_barrier_t));
         snrt_memset(omp_p.kmpc_barrier, 0, sizeof(snrt_barrier_t));
         // Exchange omp pointer with other cluster cores
         omp_p_global = &omp_p;
 #endif
 
 #ifdef OPENMP_PROFILE
-        omp_prof = (omp_prof_t *)snrt_l1alloc(sizeof(omp_prof_t));
+        omp_prof = (omp_prof_t *)snrt_l1_alloc(sizeof(omp_prof_t));
 #endif
 
     } else {

diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
@@ -2,21 +2,8 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-#ifdef OPENOCD_SEMIHOSTING
-#include "openocd.h"
-#endif
-
 #ifdef SNRT_INIT_CLS
-static inline uint32_t snrt_cls_base_addr() {
-    extern volatile uint32_t __cdata_start, __cdata_end;
-    extern volatile uint32_t __cbss_start, __cbss_end;
-    uint32_t cdata_size = ((uint32_t)&__cdata_end) - ((uint32_t)&__cdata_start);
-    uint32_t cbss_size = ((uint32_t)&__cbss_end) - ((uint32_t)&__cbss_start);
-    uint32_t l1_end_addr = SNRT_TCDM_START_ADDR +
-                           snrt_cluster_idx() * SNRT_CLUSTER_OFFSET +
-                           SNRT_TCDM_SIZE;
-    return l1_end_addr - cdata_size - cbss_size;
-}
+extern uint32_t snrt_cls_base_addr();
 #endif
 
 #ifdef SNRT_INIT_TLS
@@ -98,21 +85,16 @@ static inline void snrt_init_cls() {
 #endif
 
 #ifdef SNRT_INIT_LIBS
-static inline void snrt_init_libs() { snrt_alloc_init(); }
+// Initialize the runtime libraries: the original allocators first, then the
+// v2 L1 allocator.
+static inline void snrt_init_libs() {
+    snrt_alloc_init();
+    snrt_alloc_init_v2();
+}
 #endif
 
 #ifdef SNRT_CRT0_EXIT
-static inline void snrt_exit_default(int exit_code) {
-    exit_code = snrt_global_all_to_all_reduction(exit_code);
-#ifdef OPENOCD_SEMIHOSTING
-    if (snrt_global_core_idx() == 0) __ocd_semihost_exit(exit_code);
-#else
-    if (snrt_global_core_idx() == 0)
-        *(snrt_exit_code_destination()) = (exit_code << 1) | 1;
-#endif
-}
+extern void snrt_exit_default(int exit_code);
 #ifndef SNRT_CRT0_ALTERNATE_EXIT
-static inline void snrt_exit(int exit_code) { snrt_exit_default(exit_code); }
+extern void snrt_exit(int exit_code);
 #endif
 #endif
 

diff --git a/sw/snRuntime/src/start.h b/sw/snRuntime/src/start.h
@@ -0,0 +1,35 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifdef OPENOCD_SEMIHOSTING
+#include "openocd.h"
+#endif
+
+#ifdef SNRT_CRT0_EXIT
+/**
+ * @brief Default exit routine
+ *
+ * The local exit code is first replaced by the result of
+ * `snrt_global_all_to_all_reduction()`. Global core 0 then reports it: with
+ * OPENOCD_SEMIHOSTING defined it is handed to `__ocd_semihost_exit()`,
+ * otherwise `(exit_code << 1) | 1` is written to the location returned by
+ * `snrt_exit_code_destination()`.
+ *
+ * @param exit_code exit code of the calling core
+ */
+inline void snrt_exit_default(int exit_code) {
+    exit_code = snrt_global_all_to_all_reduction(exit_code);
+#ifdef OPENOCD_SEMIHOSTING
+    if (snrt_global_core_idx() == 0) __ocd_semihost_exit(exit_code);
+#else
+    // Shift as unsigned: left-shifting a negative (or too large) signed int
+    // is undefined behavior, whereas the unsigned shift is always defined.
+    if (snrt_global_core_idx() == 0)
+        *(snrt_exit_code_destination()) = ((uint32_t)exit_code << 1) | 1;
+#endif
+}
+#ifndef SNRT_CRT0_ALTERNATE_EXIT
+// Exit routine used unless SNRT_CRT0_ALTERNATE_EXIT is defined; it simply
+// forwards the exit code to snrt_exit_default().
+inline void snrt_exit(int exit_code) {
+    snrt_exit_default(exit_code);
+}
+#endif
+#endif
+
+#ifdef SNRT_INIT_CLS
+/**
+ * @brief Compute the base address of the cluster-local storage (CLS)
+ *
+ * The CLS is placed at the very end of the calling cluster's TCDM; its size
+ * is the combined extent of the regions delimited by the `__cdata_*` and
+ * `__cbss_*` symbols (presumably provided by the linker script).
+ *
+ * @return end address of the cluster's TCDM minus the CLS size
+ */
+inline uint32_t snrt_cls_base_addr() {
+    extern volatile uint32_t __cdata_start, __cdata_end;
+    extern volatile uint32_t __cbss_start, __cbss_end;
+    // Total CLS footprint: initialized data plus zero-initialized data
+    uint32_t cls_size = ((uint32_t)&__cdata_end - (uint32_t)&__cdata_start) +
+                        ((uint32_t)&__cbss_end - (uint32_t)&__cbss_start);
+    // One-past-the-end address of this cluster's TCDM
+    uint32_t tcdm_end = SNRT_TCDM_START_ADDR +
+                        snrt_cluster_idx() * SNRT_CLUSTER_OFFSET +
+                        SNRT_TCDM_SIZE;
+    return tcdm_end - cls_size;
+}
+#endif
Original file line number	Diff line number	Diff line change
Expand Up		@@ -26,3 +26,5 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx);
		inline void snrt_cluster_hw_barrier();

		inline void snrt_global_barrier();

		inline uint32_t snrt_global_all_to_all_reduction(uint32_t value);