Skip to content

Commit

Permalink
snRuntime: Add alloc_v2 functions and minor changes (#186)
Browse files Browse the repository at this point in the history
* snRuntime: Add `snrt_inter_cluster_barrier()` function

* snRuntime: Add private L1 alloc pointers and heap bound checks

* snRuntime: Add `snrt_compute_core_local_ptr()` function

* sw: Move floating-point types to snRuntime

* snRuntime: Add `snrt_l1_update_next_v2()` used to free memory

* snRuntime: Add `snrt_cluster_is_last_compute_core()` function
  • Loading branch information
colluca authored Aug 23, 2024
1 parent b9f8903 commit 198e990
Show file tree
Hide file tree
Showing 31 changed files with 267 additions and 117 deletions.
11 changes: 0 additions & 11 deletions sw/blas/gemm/src/gemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,6 @@

#pragma once

// Guard to avoid conflict with DNN header file
// TODO: move this definition to Snitch math library to solve problem
#ifndef PRECISION_T
#define PRECISION_T
typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t;
#endif

typedef float v2f32 __attribute__((vector_size(8)));
typedef __fp16 v4f16 __attribute__((vector_size(8)));
typedef char v8f8 __attribute__((vector_size(8)));

// Floating-point multiplications by zero cannot be optimized as in some
// edge cases they do not yield zero:
// - 0f * NaN = NaN
Expand Down
2 changes: 1 addition & 1 deletion sw/dnn/fusedconv/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ int main() {
float *ptr;

if (snrt_is_dm_core() == 0) {
ptr = snrt_l1alloc(total_size * sizeof(float));
ptr = snrt_l1_alloc(total_size * sizeof(float));
share_ptr = ptr;
}

Expand Down
7 changes: 1 addition & 6 deletions sw/dnn/src/dnn.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,7 @@

#include <stdint.h>

// Guard to avoid conflict with BLAS header file
// TODO: move this definition to Snitch math library to solve problem
#ifndef PRECISION_T
#define PRECISION_T
typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t;
#endif
#include "snrt.h"

#ifndef IMPLEMENTATION_T
#define IMPLEMENTATION_T
Expand Down
8 changes: 4 additions & 4 deletions sw/snRuntime/api/alloc_decls.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
typedef struct {
// Base address from where allocation starts
uint32_t base;
// Number of bytes alloctable
uint32_t size;
// End address up to which allocation is allowed
uint32_t end;
// Address of the next allocated block
uint32_t next;
} snrt_allocator_t;
Expand All @@ -20,10 +20,10 @@ inline void *snrt_l1_next();

inline void *snrt_l3_next();

inline void *snrt_l1alloc(size_t size);
inline void *snrt_l1_alloc(size_t size);

inline void snrt_l1_update_next(void *next);

inline void *snrt_l3alloc(size_t size);
inline void *snrt_l3_alloc(size_t size);

inline void snrt_alloc_init();
6 changes: 4 additions & 2 deletions sw/snRuntime/api/start_decls.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Copyright 2024 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#pragma once

static inline void snrt_exit(int exit_code);
inline void snrt_exit(int exit_code);

inline uint32_t snrt_cls_base_addr();
2 changes: 2 additions & 0 deletions sw/snRuntime/api/sync_decls.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx);
inline void snrt_cluster_hw_barrier();

inline void snrt_global_barrier();

inline uint32_t snrt_global_all_to_all_reduction(uint32_t value);
4 changes: 2 additions & 2 deletions sw/snRuntime/src/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ extern void *snrt_l3_next();
extern uint32_t snrt_l1_start_addr();
extern uint32_t snrt_l1_end_addr();

extern void *snrt_l1alloc(size_t size);
extern void *snrt_l3alloc(size_t size);
extern void *snrt_l1_alloc(size_t size);
extern void *snrt_l3_alloc(size_t size);

extern snrt_allocator_t *snrt_l1_allocator();
extern snrt_allocator_t *snrt_l3_allocator();
Expand Down
8 changes: 4 additions & 4 deletions sw/snRuntime/src/alloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ inline void *snrt_l3_next() { return (void *)snrt_l3_allocator()->next; }
* @param size number of bytes to allocate
* @return pointer to the allocated memory
*/
inline void *snrt_l1alloc(size_t size) {
inline void *snrt_l1_alloc(size_t size) {
snrt_allocator_t *alloc = snrt_l1_allocator();

// TODO colluca: do we need this? What does it imply?
Expand Down Expand Up @@ -64,7 +64,7 @@ inline void snrt_l1_update_next(void *next) {
* @param size number of bytes to allocate
* @return pointer to the allocated memory
*/
inline void *snrt_l3alloc(size_t size) {
inline void *snrt_l3_alloc(size_t size) {
snrt_allocator_t *alloc = snrt_l3_allocator();

// TODO: L3 alloc size check
Expand All @@ -83,12 +83,12 @@ inline void snrt_alloc_init() {
// occupy a possibly significant portion.
snrt_l1_allocator()->base =
ALIGN_UP(snrt_l1_start_addr(), MIN_CHUNK_SIZE);
snrt_l1_allocator()->size = snrt_l1_end_addr() - snrt_l1_start_addr();
snrt_l1_allocator()->end = snrt_l1_end_addr();
snrt_l1_allocator()->next = snrt_l1_allocator()->base;
// Initialize L3 allocator
extern uint32_t _edram;
snrt_l3_allocator()->base = ALIGN_UP((uint32_t)&_edram, MIN_CHUNK_SIZE);
snrt_l3_allocator()->size = 0;
snrt_l3_allocator()->end = snrt_l3_allocator()->base;
snrt_l3_allocator()->next = snrt_l3_allocator()->base;
}
// Synchronize with other cores
Expand Down
15 changes: 15 additions & 0 deletions sw/snRuntime/src/alloc_v2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Definition of the thread-local state of the v2 L1 allocator.
__thread snrt_allocator_t l1_allocator_v2;

// The functions below are defined `inline` in the corresponding header. An
// `extern` declaration in exactly one translation unit is what makes the
// compiler emit an external definition, which is required whenever a call is
// not actually inlined. Every inline function of the v2 allocator is listed,
// mirroring what is done for the v1 allocator.
extern snrt_allocator_t *snrt_l1_allocator_v2();

extern void *snrt_l1_next_v2();
extern void snrt_l1_update_next_v2(void *next);

extern void snrt_l1_alloc_check_bounds();

extern void *snrt_l1_alloc_cluster_local(size_t size, size_t alignment);
extern void *snrt_l1_alloc_compute_core_local(size_t size, size_t alignment);

extern void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx,
                                         size_t size);
extern void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx,
                                uint32_t dst_cluster_idx);

extern void snrt_alloc_init_v2();
83 changes: 83 additions & 0 deletions sw/snRuntime/src/alloc_v2.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// State of the v2 L1 allocator. Declared with `__thread` storage, so every
// thread sees its own instance. This is only a declaration; the object itself
// must be defined in a source file.
extern __thread snrt_allocator_t l1_allocator_v2;

/**
 * @brief Get the v2 L1 allocator instance visible to the caller.
 * @return Address of the thread-local `l1_allocator_v2` object.
 */
inline snrt_allocator_t *snrt_l1_allocator_v2() {
    snrt_allocator_t *const allocator = &l1_allocator_v2;
    return allocator;
}

inline void *snrt_l1_next_v2() { return (void *)snrt_l1_allocator_v2()->next; }

/**
* @brief Override the L1 allocator next pointer
*/
inline void snrt_l1_update_next_v2(void *next) {
snrt_l1_allocator_v2()->next = (uint32_t)next;
}

// Check that allocation doesn't exceed allocator bounds, and raise an
// exception otherwise
inline void snrt_l1_alloc_check_bounds() {
if (snrt_l1_allocator_v2()->next > snrt_l1_allocator_v2()->end)
asm volatile("ecall \n");
}

// Dynamically allocate space for a variable of size `size` in the cluster's L1
// memory. This function should be invoked by every core in a cluster. Every
// core receives a pointer to the allocated variable.
inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) {
snrt_l1_allocator_v2()->next =
ALIGN_UP(snrt_l1_allocator_v2()->next, alignment);
void *retval = snrt_l1_next_v2();
snrt_l1_allocator_v2()->next += size;
snrt_l1_alloc_check_bounds();
return retval;
}

// Dynamically allocate space for N variables of size `size` in the cluster's
// L1 memory, N being the number of compute cores in the cluster. This function
// should be invoked by every core in a cluster. Every compute core receives a
// pointer to a unique variable among the N which have been allocated. The
// return value for the DM core is undefined.
//
// The start of the N-element region is aligned with
// `ALIGN_UP(..., alignment)`; the individual elements are laid out back to
// back, `size` bytes apart, indexed by `snrt_cluster_core_idx()`. The
// allocator bounds are checked after the allocation.
inline void *snrt_l1_alloc_compute_core_local(size_t size,
                                              const size_t alignment) {
    snrt_l1_allocator_v2()->next =
        ALIGN_UP(snrt_l1_allocator_v2()->next, alignment);
    // Arithmetic on `void *` is a GNU extension and not valid ISO C, so the
    // per-core offset is applied on an integer representation of the pointer.
    uintptr_t core_offset = (uintptr_t)size * snrt_cluster_core_idx();
    void *retval = (void *)((uintptr_t)snrt_l1_next_v2() + core_offset);
    snrt_l1_allocator_v2()->next += size * snrt_cluster_compute_core_num();
    snrt_l1_alloc_check_bounds();
    return retval;
}

// Takes a pointer to a variable allocated using
// `snrt_l1_alloc_compute_core_local` and returns a pointer to the same
// variable allocated by another core, as specified by `core_idx`.
// The `size` argument should be the same used during allocation.
inline void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx,
                                         size_t size) {
    // The distance between the two cores may be "negative". Unsigned
    // arithmetic wraps modulo 2^N, which yields the correct address only if N
    // matches the pointer width, hence all operands are widened to uintptr_t
    // before subtracting and multiplying.
    uintptr_t core_distance =
        (uintptr_t)core_idx - (uintptr_t)snrt_cluster_core_idx();
    uintptr_t offset = core_distance * (uintptr_t)size;
    return (void *)((uintptr_t)ptr + offset);
}

// Takes a pointer to a variable in the source cluster's L1 memory and returns
// a pointer to the same offset in the destination cluster's L1 memory. The
// address is moved by `SNRT_CLUSTER_OFFSET` for every cluster between source
// and destination.
inline void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx,
                                uint32_t dst_cluster_idx) {
    // The cluster distance may be "negative". All operands are widened to
    // uintptr_t so that the modulo-2^N wrap-around of unsigned arithmetic
    // happens at pointer width and produces the correct address.
    uintptr_t cluster_distance =
        (uintptr_t)dst_cluster_idx - (uintptr_t)src_cluster_idx;
    uintptr_t offset = cluster_distance * (uintptr_t)SNRT_CLUSTER_OFFSET;
    return (void *)((uintptr_t)ptr + offset);
}

inline void snrt_alloc_init_v2() {
// Calculate end address of the heap. The top of the TCDM address space is
// reserved for the cluster-local storage (CLS) and the stack of every
// core. We further provision a safety margin of 128B. The rest of the
// TCDM is reserved for the heap.
uint32_t heap_end_addr = snrt_cls_base_addr();
heap_end_addr -= (1 << SNRT_LOG2_STACK_SIZE) * snrt_cluster_core_num();
heap_end_addr -= 128;
// Initialize L1 allocator
snrt_l1_allocator_v2()->base =
ALIGN_UP(snrt_l1_start_addr(), MIN_CHUNK_SIZE);
snrt_l1_allocator_v2()->end = heap_end_addr;
snrt_l1_allocator_v2()->next = snrt_l1_allocator_v2()->base;
}
2 changes: 1 addition & 1 deletion sw/snRuntime/src/dm.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ inline void dm_init(void) {
#else
snrt_interrupt_enable(IRQ_M_CLUSTER);
#endif
dm_p = (dm_t *)snrt_l1alloc(sizeof(dm_t));
dm_p = (dm_t *)snrt_l1_alloc(sizeof(dm_t));
snrt_memset((void *)dm_p, 0, sizeof(dm_t));
dm_p_global = dm_p;
} else {
Expand Down
2 changes: 1 addition & 1 deletion sw/snRuntime/src/omp/eu.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ inline uint32_t eu_get_workers_in_wfi() {
inline void eu_init(void) {
if (snrt_cluster_core_idx() == 0) {
// Allocate the eu struct in L1 for fast access
eu_p = snrt_l1alloc(sizeof(eu_t));
eu_p = snrt_l1_alloc(sizeof(eu_t));
snrt_memset((void *)eu_p, 0, sizeof(eu_t));
// store copy of eu_p on shared memory
eu_p_global = eu_p;
Expand Down
12 changes: 6 additions & 6 deletions sw/snRuntime/src/omp/omp.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ static inline void initTeam(omp_t *_this, omp_team_t *team) {
void omp_init(void) {
if (snrt_cluster_core_idx() == 0) {
// allocate space for kmp arguments
kmpc_args =
(_kmp_ptr32 *)snrt_l1alloc(sizeof(_kmp_ptr32) * KMP_FORK_MAX_NARGS);
kmpc_args = (_kmp_ptr32 *)snrt_l1_alloc(sizeof(_kmp_ptr32) *
KMP_FORK_MAX_NARGS);
#ifndef OMPSTATIC_NUMTHREADS
omp_p = (omp_t *)snrt_l1alloc(sizeof(omp_t));
omp_p = (omp_t *)snrt_l1_alloc(sizeof(omp_t));
unsigned int nbCores = snrt_cluster_compute_core_num();
omp_p->numThreads = nbCores;
omp_p->maxThreads = nbCores;
Expand All @@ -67,20 +67,20 @@ void omp_init(void) {

initTeam((omp_t *)omp_p, (omp_team_t *)&omp_p->plainTeam);
omp_p->kmpc_barrier =
(snrt_barrier_t *)snrt_l1alloc(sizeof(snrt_barrier_t));
(snrt_barrier_t *)snrt_l1_alloc(sizeof(snrt_barrier_t));
snrt_memset(omp_p->kmpc_barrier, 0, sizeof(snrt_barrier_t));
// Exchange omp pointer with other cluster cores
omp_p_global = omp_p;
#else
omp_p.kmpc_barrier =
(snrt_barrier_t *)snrt_l1alloc(sizeof(snrt_barrier_t));
(snrt_barrier_t *)snrt_l1_alloc(sizeof(snrt_barrier_t));
snrt_memset(omp_p.kmpc_barrier, 0, sizeof(snrt_barrier_t));
// Exchange omp pointer with other cluster cores
omp_p_global = &omp_p;
#endif

#ifdef OPENMP_PROFILE
omp_prof = (omp_prof_t *)snrt_l1alloc(sizeof(omp_prof_t));
omp_prof = (omp_prof_t *)snrt_l1_alloc(sizeof(omp_prof_t));
#endif

} else {
Expand Down
32 changes: 7 additions & 25 deletions sw/snRuntime/src/start.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,8 @@
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#ifdef OPENOCD_SEMIHOSTING
#include "openocd.h"
#endif

#ifdef SNRT_INIT_CLS
static inline uint32_t snrt_cls_base_addr() {
extern volatile uint32_t __cdata_start, __cdata_end;
extern volatile uint32_t __cbss_start, __cbss_end;
uint32_t cdata_size = ((uint32_t)&__cdata_end) - ((uint32_t)&__cdata_start);
uint32_t cbss_size = ((uint32_t)&__cbss_end) - ((uint32_t)&__cbss_start);
uint32_t l1_end_addr = SNRT_TCDM_START_ADDR +
snrt_cluster_idx() * SNRT_CLUSTER_OFFSET +
SNRT_TCDM_SIZE;
return l1_end_addr - cdata_size - cbss_size;
}
extern uint32_t snrt_cls_base_addr();
#endif

#ifdef SNRT_INIT_TLS
Expand Down Expand Up @@ -98,21 +85,16 @@ static inline void snrt_init_cls() {
#endif

#ifdef SNRT_INIT_LIBS
static inline void snrt_init_libs() { snrt_alloc_init(); }
static inline void snrt_init_libs() {
snrt_alloc_init();
snrt_alloc_init_v2();
}
#endif

#ifdef SNRT_CRT0_EXIT
static inline void snrt_exit_default(int exit_code) {
exit_code = snrt_global_all_to_all_reduction(exit_code);
#ifdef OPENOCD_SEMIHOSTING
if (snrt_global_core_idx() == 0) __ocd_semihost_exit(exit_code);
#else
if (snrt_global_core_idx() == 0)
*(snrt_exit_code_destination()) = (exit_code << 1) | 1;
#endif
}
extern void snrt_exit_default(int exit_code);
#ifndef SNRT_CRT0_ALTERNATE_EXIT
static inline void snrt_exit(int exit_code) { snrt_exit_default(exit_code); }
extern void snrt_exit(int exit_code);
#endif
#endif

Expand Down
35 changes: 35 additions & 0 deletions sw/snRuntime/src/start.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright 2024 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#ifdef OPENOCD_SEMIHOSTING
#include "openocd.h"
#endif

#ifdef SNRT_CRT0_EXIT
// Default exit routine. The exit code of the calling core is first passed
// through `snrt_global_all_to_all_reduction()`; the core with global index 0
// then reports the result, either through OpenOCD semihosting (when
// `OPENOCD_SEMIHOSTING` is defined) or by writing `(exit_code << 1) | 1` to
// the location returned by `snrt_exit_code_destination()`. All other cores
// return after the reduction without reporting anything.
inline void snrt_exit_default(int exit_code) {
    exit_code = snrt_global_all_to_all_reduction(exit_code);
    if (snrt_global_core_idx() != 0) return;
#ifdef OPENOCD_SEMIHOSTING
    __ocd_semihost_exit(exit_code);
#else
    // Shift in the unsigned domain: left-shifting a negative `int` is
    // undefined behavior, while the unsigned shift produces the same bit
    // pattern for all exit codes.
    // NOTE(review): assumes the destination is a 32-bit word - confirm.
    *(snrt_exit_code_destination()) = ((uint32_t)exit_code << 1) | 1;
#endif
}
#ifndef SNRT_CRT0_ALTERNATE_EXIT
// Exit routine used when `SNRT_CRT0_ALTERNATE_EXIT` is not defined: it simply
// forwards the exit code to `snrt_exit_default()`.
inline void snrt_exit(int exit_code) {
    snrt_exit_default(exit_code);
}
#endif
#endif

#ifdef SNRT_INIT_CLS
// Compute the base address of the calling cluster's cluster-local storage
// (CLS). The CLS is placed at the very end of the cluster's TCDM and spans
// the combined size of the regions delimited by `__cdata_start/__cdata_end`
// and `__cbss_start/__cbss_end` (presumably linker-provided symbols).
inline uint32_t snrt_cls_base_addr() {
    extern volatile uint32_t __cdata_start, __cdata_end;
    extern volatile uint32_t __cbss_start, __cbss_end;

    // Only the addresses of the symbols are used, never their contents
    const uint32_t cls_size =
        (((uint32_t)&__cdata_end) - ((uint32_t)&__cdata_start)) +
        (((uint32_t)&__cbss_end) - ((uint32_t)&__cbss_start));

    // End address of this cluster's TCDM
    const uint32_t tcdm_end_addr = SNRT_TCDM_START_ADDR +
                                   snrt_cluster_idx() * SNRT_CLUSTER_OFFSET +
                                   SNRT_TCDM_SIZE;

    return tcdm_end_addr - cls_size;
}
#endif
Loading

0 comments on commit 198e990

Please sign in to comment.