Skip to content

Commit

Permalink
snRuntime: Improve all-to-all reduction routine
Browse files Browse the repository at this point in the history
  • Loading branch information
colluca committed Oct 2, 2024
1 parent 243c7bc commit a90b4f5
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 3 deletions.
1 change: 1 addition & 0 deletions sw/snRuntime/api/cls_decls.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

// Per-cluster runtime state record. Elsewhere in the runtime the `reduction`
// field is reached through `cls()->reduction`, so cls() presumably returns a
// pointer to one of these -- confirm against the cls() definition.
typedef struct {
// NOTE(review): usage not visible here; presumably state for the cluster
// hardware barrier -- confirm.
uint32_t hw_barrier;
// Cluster-level accumulator used by snrt_global_all_to_all_reduction():
// cores atomically add their contribution here.
uint32_t reduction;
// Allocator state; presumably for the cluster's L1 memory -- confirm.
snrt_allocator_t l1_allocator;
} cls_t;

Expand Down
2 changes: 2 additions & 0 deletions sw/snRuntime/api/sync_decls.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,5 @@ inline void snrt_cluster_hw_barrier();
// Barrier across the whole system (definition not shown here; presumably
// synchronizes all cores of all clusters -- confirm in sync.h).
inline void snrt_global_barrier();

// Sum-reduction in which every caller contributes `value` and receives the
// accumulated result as the return value.
inline uint32_t snrt_global_all_to_all_reduction(uint32_t value);

// Issues an instruction that consumes `val`; used after an atomic operation
// so that its result has been written back before execution continues.
inline void snrt_wait_writeback(uint32_t val);
2 changes: 2 additions & 0 deletions sw/snRuntime/src/sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,5 @@ extern void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer,
size_t len);

// `extern` redeclarations of functions that sync.h defines as `inline`.
// Under C99 inline semantics such a declaration makes this translation unit
// emit the out-of-line definition (assuming sync.h is included above --
// TODO confirm).
extern uint32_t snrt_global_all_to_all_reduction(uint32_t value);

// Same pattern for the writeback-wait helper.
extern void snrt_wait_writeback(uint32_t val);
28 changes: 25 additions & 3 deletions sw/snRuntime/src/sync.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,23 @@ inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) {
* will stall indefinitely.
*/
inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) {
    // Two-stage sum: cores first reduce into a cluster-level accumulator,
    // then only the DM cores combine the cluster sums into the global
    // accumulator `_reduction_result`. Every caller returns the same value.
    //
    // The previous flat implementation (every core adding straight into
    // `_reduction_result`, followed by a global barrier and an early return)
    // has been removed: it made the hierarchical path below unreachable.
    //
    // NOTE(review): nothing in this function resets `cls()->reduction` or
    // `_reduction_result`; confirm whether repeated invocations are meant to
    // be supported and where the accumulators are initialized.

    // Stage 1: reduce cores within the cluster in TCDM.
    uint32_t *cluster_result = &(cls()->reduction);
    uint32_t tmp = __atomic_fetch_add(cluster_result, value, __ATOMIC_RELAXED);

    // Consume the AMO's return value so the operation has completed before
    // this core enters the barrier; otherwise other cores could read the
    // accumulator after the barrier without seeing this contribution.
    snrt_wait_writeback(tmp);
    snrt_cluster_hw_barrier();

    // Stage 2: reduce across clusters in global memory. Only cores for which
    // snrt_is_dm_core() holds participate.
    if (snrt_is_dm_core()) {
        __atomic_add_fetch(&_reduction_result, *cluster_result,
                           __ATOMIC_RELAXED);
        // All participating clusters must have added their partial sum
        // before the global total is read back.
        snrt_inter_cluster_barrier();
        // Publish the global total to the rest of the cluster by
        // overwriting the cluster-level accumulator.
        *cluster_result = _reduction_result;
    }

    // Hold the remaining cores until the global total has been published.
    snrt_cluster_hw_barrier();
    return *cluster_result;
}

/**
Expand Down Expand Up @@ -236,3 +250,11 @@ inline void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer,
}
}
}

//================================================================================
// Memory consistency
//================================================================================

// Emits a single `mv` of `val` onto itself as a volatile asm statement.
// The "+r" constraint marks `val` as both read and written in a register, so
// the instruction has a data dependency on whatever produced `val`.
// snrt_global_all_to_all_reduction() passes the result of an atomic
// fetch-add here to wait for that operation's writeback before a barrier.
// NOTE(review): presumably the core stalls on this move until the pending
// result has arrived in the register file -- confirm against the core's
// pipeline documentation.
inline void snrt_wait_writeback(uint32_t val) {
asm volatile("mv %0, %0" : "+r"(val)::);
}

0 comments on commit a90b4f5

Please sign in to comment.