diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h index bab6c339cc..db2870eb91 100644 --- a/sw/blas/gemm/src/gemm.h +++ b/sw/blas/gemm/src/gemm.h @@ -1361,7 +1361,7 @@ int gemm(precision_t prec, uint32_t expand, uint32_t setup_ssr, frac_m, frac_n, n, prec); } else if (k_tile == 0) { snrt_dma_start_1d(local_c_partial, - (void*)snrt_zero_memory_ptr(), + (void*)snrt_zero_memory_ptr(snrt_cluster_idx()), frac_m * frac_n * prec); } } diff --git a/sw/dnn/batchnorm/src/batchnorm.h b/sw/dnn/batchnorm/src/batchnorm.h index 4c8b5adc10..a31429bbb6 100644 --- a/sw/dnn/batchnorm/src/batchnorm.h +++ b/sw/dnn/batchnorm/src/batchnorm.h @@ -78,7 +78,7 @@ static inline void batchnorm_layer(const batchnorm_layer_t *l) { uint32_t weights_size = l->CI; uint32_t ofmap_size = 2 * l->IW * l->TILE_CI; - double *ptr = (double *)snrt_l1_start_addr(); + double *ptr = (double *)snrt_l1_start_addr(cluster_id); double *ifmap = ptr; ptr += ifmap_size; double *gamma = ptr; diff --git a/sw/snRuntime/src/alloc.h b/sw/snRuntime/src/alloc.h index 653137da14..aee76815d3 100644 --- a/sw/snRuntime/src/alloc.h +++ b/sw/snRuntime/src/alloc.h @@ -75,13 +75,14 @@ inline void *snrt_l3alloc(size_t size) { inline void snrt_alloc_init() { // Only one core per cluster has to initialize the L1 allocator if (snrt_is_dm_core()) { + const uint32_t cluster_idx = snrt_cluster_idx(); // Initialize L1 allocator // Note: at the moment the allocator assumes all of the TCDM is // available for allocation. However, the CLS, TLS and stack already // occupy a possibly significant portion. snrt_l1_allocator()->base = - ALIGN_UP(snrt_l1_start_addr(), MIN_CHUNK_SIZE); - snrt_l1_allocator()->size = snrt_l1_end_addr() - snrt_l1_start_addr(); + ALIGN_UP(snrt_l1_start_addr(cluster_idx), MIN_CHUNK_SIZE); + snrt_l1_allocator()->size = snrt_l1_end_addr(cluster_idx) - snrt_l1_start_addr(cluster_idx); snrt_l1_allocator()->next = snrt_l1_allocator()->base; // Initialize L3 allocator extern uint32_t _edram; diff --git a/sw/snRuntime/src/cluster_interrupts.h b/sw/snRuntime/src/cluster_interrupts.h index ee2a36f87b..1c908d21d0 100644 --- a/sw/snRuntime/src/cluster_interrupts.h +++ b/sw/snRuntime/src/cluster_interrupts.h @@ -9,7 +9,7 @@ * @param mask set bit at X sets the interrupt of hart X */ inline void snrt_int_cluster_set(uint32_t mask) { - *(snrt_cluster_clint_set_ptr()) = mask; + *(snrt_cluster_clint_set_ptr(snrt_cluster_idx())) = mask; } /** @@ -17,7 +17,7 @@ inline void snrt_int_cluster_set(uint32_t mask) { * @param mask set bit at X clears the interrupt of hart X */ inline void snrt_int_cluster_clr(uint32_t mask) { - *(snrt_cluster_clint_clr_ptr()) = mask; + *(snrt_cluster_clint_clr_ptr(snrt_cluster_idx())) = mask; } /** diff --git a/sw/snRuntime/src/perf_cnt.h b/sw/snRuntime/src/perf_cnt.h index 11157c08bf..c282656ffc 100644 --- a/sw/snRuntime/src/perf_cnt.h +++ b/sw/snRuntime/src/perf_cnt.h @@ -70,7 +70,7 @@ typedef struct { } perf_regs_t; inline perf_regs_t* snrt_perf_counters() { - return (perf_regs_t*)snrt_cluster_perf_counters_addr(); + return (perf_regs_t*)snrt_cluster_perf_counters_addr(snrt_cluster_idx()); } // Enable a specific perf_counter diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c index 582e93b8e7..37bb7fc76c 100644 --- a/sw/snRuntime/src/start.c +++ b/sw/snRuntime/src/start.c @@ -47,7 +47,7 @@ static inline void snrt_init_tls() { size = (size_t)(&__tbss_end) - (size_t)(&__tbss_start); for (int i = 0; i < snrt_cluster_core_num(); i++) { snrt_dma_start_1d((void*)(tls_ptr + i * tls_offset), - (void*)(snrt_zero_memory_ptr()), size); + (void*)(snrt_zero_memory_ptr(snrt_cluster_idx())), size); } } @@ -63,7 +63,7 @@ static inline void snrt_init_bss() { if (snrt_cluster_idx() == 0 && snrt_is_dm_core()) { size_t size = (size_t)(&__bss_end) - (size_t)(&__bss_start); snrt_dma_start_1d_wideptr((uint64_t)(&__bss_start), - (uint64_t)(snrt_zero_memory_ptr()), size); + (uint64_t)(snrt_zero_memory_ptr(snrt_cluster_idx())), size); } } #endif @@ -87,7 +87,7 @@ static inline void snrt_init_cls() { // Clear cbss section ptr = (void*)((uint32_t)ptr + size); size = (size_t)(&__cbss_end) - (size_t)(&__cbss_start); - snrt_dma_start_1d(ptr, (void*)(snrt_zero_memory_ptr()), size); + snrt_dma_start_1d(ptr, (void*)(snrt_zero_memory_ptr(snrt_cluster_idx())), size); } } #endif diff --git a/sw/tests/alias.c b/sw/tests/alias.c index d27c96ce35..54b33e17a5 100644 --- a/sw/tests/alias.c +++ b/sw/tests/alias.c @@ -5,7 +5,7 @@ #include "snrt.h" uint32_t cluster_global_to_local_address(uint32_t global_addr) { - return global_addr - snrt_l1_start_addr() + ALIAS_TCDM_BASE_ADDR; + return global_addr - snrt_l1_start_addr(snrt_cluster_idx()) + ALIAS_TCDM_BASE_ADDR; } const uint32_t n_inputs = 16; diff --git a/sw/tests/zero_mem.c b/sw/tests/zero_mem.c index 53a333712e..3ac4b47627 100644 --- a/sw/tests/zero_mem.c +++ b/sw/tests/zero_mem.c @@ -10,7 +10,7 @@ int main() { uint32_t n_inputs = 4; // Get memory locations - uint32_t *zero_mem = (uint32_t *)snrt_zero_memory_ptr(); + uint32_t *zero_mem = (uint32_t *)snrt_zero_memory_ptr(snrt_cluster_idx()); uint32_t *buffer_tcdm = snrt_l1_next(); uint32_t *buffer_golden = (snrt_l1_next() + 128); diff --git a/target/snitch_cluster/sw/runtime/common/snitch_cluster_memory.h b/target/snitch_cluster/sw/runtime/common/snitch_cluster_memory.h index cb30390cc8..243ca6f4d1 100644 --- a/target/snitch_cluster/sw/runtime/common/snitch_cluster_memory.h +++ b/target/snitch_cluster/sw/runtime/common/snitch_cluster_memory.h @@ -30,30 +30,35 @@ // snRuntime interface functions //=============================================================== -inline uint32_t __attribute__((const)) snrt_l1_start_addr() { - return CLUSTER_TCDM_START_ADDR; + +inline uint32_t cluster_base_offset(uint32_t cluster_idx) { + return cluster_idx * SNRT_CLUSTER_OFFSET; +} + +inline uint32_t snrt_l1_start_addr(uint32_t cluster_idx) { + return CLUSTER_TCDM_BASE_ADDR + cluster_base_offset(cluster_idx); } -inline uint32_t __attribute__((const)) snrt_l1_end_addr() { - return CLUSTER_TCDM_END_ADDR; +inline uint32_t snrt_l1_end_addr(uint32_t cluster_idx) { + return CLUSTER_PERIPH_BASE_ADDR + cluster_base_offset(cluster_idx); } -inline volatile uint32_t* __attribute__((const)) snrt_cluster_clint_set_ptr() { - return (uint32_t*)CLUSTER_CLINT_SET_ADDR; +inline volatile uint32_t* snrt_cluster_clint_set_ptr(uint32_t cluster_idx) { + return (uint32_t*)(CLUSTER_CLINT_SET_ADDR + cluster_base_offset(cluster_idx)); } -inline volatile uint32_t* __attribute__((const)) snrt_cluster_clint_clr_ptr() { - return (uint32_t*)CLUSTER_CLINT_CLR_ADDR; +inline volatile uint32_t* snrt_cluster_clint_clr_ptr(uint32_t cluster_idx) { + return (uint32_t*)(CLUSTER_CLINT_CLR_ADDR + cluster_base_offset(cluster_idx)); } -inline uint32_t __attribute__((const)) snrt_cluster_hw_barrier_addr() { - return CLUSTER_HW_BARRIER_ADDR; +inline uint32_t snrt_cluster_hw_barrier_addr(uint32_t cluster_idx) { + return CLUSTER_HW_BARRIER_ADDR + cluster_base_offset(cluster_idx); } -inline uint32_t __attribute__((const)) snrt_cluster_perf_counters_addr() { - return CLUSTER_PERF_COUNTER_ADDR; +inline uint32_t snrt_cluster_perf_counters_addr(uint32_t cluster_idx) { + return CLUSTER_PERF_COUNTER_ADDR + cluster_base_offset(cluster_idx); } -inline volatile uint32_t* __attribute__((const)) snrt_zero_memory_ptr() { - return (uint32_t*)CLUSTER_ZERO_MEM_START_ADDR; +inline volatile uint32_t* snrt_zero_memory_ptr(uint32_t cluster_idx) { + return (uint32_t*)(CLUSTER_ZERO_MEM_START_ADDR + cluster_base_offset(cluster_idx)); }