diff --git a/sw/apps/atax/src/atax.h b/sw/apps/atax/src/atax.h
index f2509f6b5a..3dabb4d8cb 100644
--- a/sw/apps/atax/src/atax.h
+++ b/sw/apps/atax/src/atax.h
@@ -10,8 +10,8 @@
 #include "blas.h"
 #include "snrt.h"
 
-static inline void atax(uint32_t M, uint32_t N, double *A, double *x,
-                        double *y, double *tmp) {
+static inline void atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
+                        double *tmp) {
     double tmp_fs;
     int core_range, core_offset, cluster_core_offset;
 
@@ -56,7 +56,8 @@ void atax_job(void *args) {
 
 #ifndef JOB_ARGS_PRELOADED
     // Allocate space for job arguments in TCDM
-    local_args = (atax_args_t *)snrt_l1_alloc_cluster_local(sizeof(atax_args_t), sizeof(double));
+    local_args = (atax_args_t *)snrt_l1_alloc_cluster_local(sizeof(atax_args_t),
+                                                            sizeof(double));
 
     // Copy job arguments to TCDM
     if (snrt_is_dm_core()) {
@@ -102,7 +103,8 @@ void atax_job(void *args) {
 
     // Writeback results
     if (snrt_is_dm_core()) {
-        snrt_dma_store_1d_tile(y, local_y, snrt_cluster_idx(), N / snrt_cluster_num(), sizeof(double));
+        snrt_dma_store_1d_tile(y, local_y, snrt_cluster_idx(),
+                               N / snrt_cluster_num(), sizeof(double));
         snrt_dma_wait_all();
         snrt_mcycle();
     }
diff --git a/sw/apps/correlation/src/correlation.h b/sw/apps/correlation/src/correlation.h
index 2a7d3d5af5..0ace1b29fe 100644
--- a/sw/apps/correlation/src/correlation.h
+++ b/sw/apps/correlation/src/correlation.h
@@ -18,7 +18,6 @@ static inline void correlation_step1(uint32_t N, uint32_t M, double *data,
 
     // Compute deviations
     if (snrt_is_compute_core()) {
-
         snrt_mcycle();
 
         // Distribute different attributes to the different cores
@@ -61,7 +60,6 @@ static inline void correlation_step2(uint32_t N, uint32_t M, double *data,
 
     // Compute correlation
     if (snrt_is_compute_core()) {
-
         snrt_mcycle();
 
         // Distribute different attributes to the different cores
@@ -125,15 +123,14 @@ void correlation_job(void *args) {
 
     // Load input matrix tile
     if (snrt_is_dm_core()) {
-        snrt_dma_load_2d_tile(
-            local_data,          // dst
-            data,                // src
-            0,                   // tile_x1_idx
-            snrt_cluster_idx(),  // tile_x0_idx
-            N,                   // tile_x1_size
-            tile_M,              // tile_x0_size
-            M,                   // full_x0_size
-            sizeof(double)       // prec
+        snrt_dma_load_2d_tile(local_data,          // dst
+                              data,                // src
+                              0,                   // tile_x1_idx
+                              snrt_cluster_idx(),  // tile_x0_idx
+                              N,                   // tile_x1_size
+                              tile_M,              // tile_x0_size
+                              M,                   // full_x0_size
+                              sizeof(double)       // prec
         );
         snrt_dma_wait_all();
     }
@@ -146,36 +143,34 @@ void correlation_job(void *args) {
 
     // The rest of the computation is done only on cluster 0
     if (snrt_cluster_idx() == 0) {
-
         // Aggregate data in cluster 0
-        if (snrt_is_dm_core() ) {
-
+        if (snrt_is_dm_core()) {
             snrt_mcycle();
-            
+
             // Theoretically speaking, moving the data in cluster 0's TCDM
             // is not required. However we need to reshape it because
             // `correlation_step1` is currently implemented in a way such
             // that it stores the output tile as contiguous data, not with
             // the proper stride it would have in the full matrix.
             for (unsigned int i = 0; i < snrt_cluster_num(); i++) {
-                double *remote_data = snrt_remote_l1_ptr(local_data, snrt_cluster_idx(), i);
-                snrt_dma_store_2d_tile(
-                    local_data,      // dst
-                    remote_data,     // src
-                    0,               // tile_x1_idx
-                    i,               // tile_x0_idx
-                    N,               // tile_x1_size
-                    tile_M,          // tile_x0_size
-                    M,               // full_x0_size
-                    sizeof(double)   // prec
+                double *remote_data =
+                    snrt_remote_l1_ptr(local_data, snrt_cluster_idx(), i);
+                snrt_dma_store_2d_tile(local_data,      // dst
+                                       remote_data,     // src
+                                       0,               // tile_x1_idx
+                                       i,               // tile_x0_idx
+                                       N,               // tile_x1_size
+                                       tile_M,          // tile_x0_size
+                                       M,               // full_x0_size
+                                       sizeof(double)   // prec
                 );
-                double *remote_stddev = snrt_remote_l1_ptr(local_stddev, snrt_cluster_idx(), i);
-                snrt_dma_store_1d_tile(
-                    local_stddev,    // dst
-                    remote_stddev,   // src
-                    i,               // tile_idx
-                    tile_M,          // tile_size
-                    sizeof(double)   // prec
+                double *remote_stddev =
+                    snrt_remote_l1_ptr(local_stddev, snrt_cluster_idx(), i);
+                snrt_dma_store_1d_tile(local_stddev,    // dst
+                                       remote_stddev,   // src
+                                       i,               // tile_idx
+                                       tile_M,          // tile_size
+                                       sizeof(double)   // prec
                 );
             }
             snrt_dma_wait_all();
diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h
index bbd55877ea..7778afed0c 100644
--- a/sw/apps/covariance/src/covariance.h
+++ b/sw/apps/covariance/src/covariance.h
@@ -244,8 +244,9 @@ void covariance_job(covariance_args_t *args) {
 
 #ifndef JOB_ARGS_PRELOADED
     // Allocate space for job arguments in TCDM
-    covariance_args_t *local_args = (covariance_args_t *)snrt_l1_alloc_cluster_local(
-        sizeof(covariance_args_t), sizeof(double));
+    covariance_args_t *local_args =
+        (covariance_args_t *)snrt_l1_alloc_cluster_local(
+            sizeof(covariance_args_t), sizeof(double));
 
     // Copy job arguments to TCDM
     if (snrt_is_dm_core()) {
diff --git a/sw/apps/kmeans/scripts/verify.py b/sw/apps/kmeans/scripts/verify.py
index 3edfe19ca5..6cbc0730c4 100755
--- a/sw/apps/kmeans/scripts/verify.py
+++ b/sw/apps/kmeans/scripts/verify.py
@@ -42,7 +42,8 @@ def get_expected_results(self):
         self.initial_centroids = self.initial_centroids.reshape((self.n_clusters, self.n_features))
         self.samples = self.samples.reshape((n_samples, self.n_features))
         # Calculate expected results
-        final_centroids, _ = KmeansDataGen().golden_model(self.samples, self.n_clusters, self.initial_centroids, max_iter)
+        final_centroids, _ = KmeansDataGen().golden_model(self.samples, self.n_clusters,
+                                                          self.initial_centroids, max_iter)
         return final_centroids.flatten()
 
     def check_results(self, *args):
@@ -55,9 +56,12 @@ def main(self):
         expected_centroids = self.get_expected_results().reshape((self.n_clusters, self.n_features))
         actual_centroids = self.get_actual_results().reshape((self.n_clusters, self.n_features))
         if self.n_features == 2 and not self.args.no_gui:
-            KmeansDataGen().visualize_clusters(self.samples, self.initial_centroids, "Initial centroids")
-            KmeansDataGen().visualize_clusters(self.samples, expected_centroids, "Expected centroids")
-            KmeansDataGen().visualize_clusters(self.samples, actual_centroids, "Actual centroids")
+            KmeansDataGen().visualize_clusters(self.samples, self.initial_centroids,
+                                               "Initial centroids")
+            KmeansDataGen().visualize_clusters(self.samples, expected_centroids,
+                                               "Expected centroids")
+            KmeansDataGen().visualize_clusters(self.samples, actual_centroids,
+                                               "Actual centroids")
         return retcode
 
 
diff --git a/sw/apps/kmeans/src/kmeans.h b/sw/apps/kmeans/src/kmeans.h
index 0a44f9810b..0a4916b74f 100644
--- a/sw/apps/kmeans/src/kmeans.h
+++ b/sw/apps/kmeans/src/kmeans.h
@@ -24,7 +24,12 @@ double euclidean_distance_squared(uint32_t n_features, double* point1,
     return sum;
 }
 
-static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clusters, uint32_t n_features, double *samples, uint32_t *membership, uint32_t *partial_membership_cnt, double *initial_centroids, double *partial_centroids) {
+static inline void kmeans_iteration(uint32_t n_samples_per_core,
+                                    uint32_t n_clusters, uint32_t n_features,
+                                    double* samples, uint32_t* membership,
+                                    uint32_t* partial_membership_cnt,
+                                    double* initial_centroids,
+                                    double* partial_centroids) {
     // Distribute work
     uint32_t start_sample_idx;
     uint32_t end_sample_idx;
@@ -36,17 +41,17 @@ static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clus
 
         // Assignment step
         for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
-            centroid_idx++) {
+             centroid_idx++) {
             partial_membership_cnt[centroid_idx] = 0;
         }
         snrt_fpu_fence();
 
         for (uint32_t sample_idx = start_sample_idx;
-            sample_idx < end_sample_idx; sample_idx++) {
+             sample_idx < end_sample_idx; sample_idx++) {
             double min_dist = inf;
             membership[sample_idx] = 0;
             for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
-                centroid_idx++) {
+                 centroid_idx++) {
                 double dist = euclidean_distance_squared(
                     n_features, &samples[sample_idx * n_features],
                     &initial_centroids[centroid_idx * n_features]);
@@ -68,22 +73,21 @@ static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clus
     if (snrt_is_compute_core()) {
         // Update step
         for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
-                centroid_idx++) {
+             centroid_idx++) {
             for (uint32_t feature_idx = 0; feature_idx < n_features;
-                    feature_idx++) {
+                 feature_idx++) {
                 // Initialize centroids to zero
                 // TODO: Can be optimized w/ DMA
-                partial_centroids[centroid_idx * n_features + feature_idx] =
-                    0;
+                partial_centroids[centroid_idx * n_features + feature_idx] = 0;
             }
         }
         snrt_fpu_fence();
         for (uint32_t sample_idx = start_sample_idx;
-                sample_idx < end_sample_idx; sample_idx++) {
+             sample_idx < end_sample_idx; sample_idx++) {
            for (uint32_t feature_idx = 0; feature_idx < n_features;
-                    feature_idx++) {
+                 feature_idx++) {
                 partial_centroids[membership[sample_idx] * n_features +
-                        feature_idx] +=
+                                  feature_idx] +=
                     samples[sample_idx * n_features + feature_idx];
             }
         }
@@ -97,36 +101,29 @@ static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clus
 
     if (snrt_is_compute_core()) {
         if (snrt_cluster_core_idx() == 0) {
-
             // Intra-cluster reduction
             for (uint32_t core_idx = 1;
-                core_idx < snrt_cluster_compute_core_num(); core_idx++) {
+                 core_idx < snrt_cluster_compute_core_num(); core_idx++) {
                 // Pointers to variables of the other core
-                uint32_t* remote_partial_membership_cnt =
-                    snrt_compute_core_local_ptr(
-                        partial_membership_cnt,
-                        core_idx,
-                        n_clusters * sizeof(uint32_t)
-                    );
-                double* remote_partial_centroids =
-                    snrt_compute_core_local_ptr(
-                        partial_centroids,
-                        core_idx,
-                        n_clusters * n_features * sizeof(double)
-                    );
+                uint32_t* remote_partial_membership_cnt =
+                    snrt_compute_core_local_ptr(partial_membership_cnt,
+                                                core_idx,
+                                                n_clusters * sizeof(uint32_t));
+                double* remote_partial_centroids = snrt_compute_core_local_ptr(
+                    partial_centroids, core_idx,
+                    n_clusters * n_features * sizeof(double));
                 for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
-                    centroid_idx++) {
+                     centroid_idx++) {
                     // Accumulate membership counters
                     partial_membership_cnt[centroid_idx] +=
                         remote_partial_membership_cnt[centroid_idx];
                     // Accumulate centroid features
                     for (uint32_t feature_idx = 0; feature_idx < n_features;
-                        feature_idx++) {
+                         feature_idx++) {
                         partial_centroids[centroid_idx * n_features +
-                            feature_idx] +=
-                            remote_partial_centroids[centroid_idx *
-                                n_features +
-                                feature_idx];
+                                          feature_idx] +=
+                            remote_partial_centroids[centroid_idx * n_features +
+                                                     feature_idx];
                     }
                 }
             }
@@ -137,32 +134,31 @@ static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clus
     snrt_inter_cluster_barrier();
 
     if (snrt_cluster_idx() == 0) {
-
         snrt_mcycle();
 
         // Inter-cluster reduction
-        for (uint32_t cluster_idx = 1;
-            cluster_idx < snrt_cluster_num(); cluster_idx++) {
+        for (uint32_t cluster_idx = 1; cluster_idx < snrt_cluster_num();
+             cluster_idx++) {
             // Pointers to variables of remote clusters
             uint32_t* remote_partial_membership_cnt =
-                (uint32_t*)snrt_remote_l1_ptr(
-                    partial_membership_cnt, 0, cluster_idx);
+                (uint32_t*)snrt_remote_l1_ptr(partial_membership_cnt, 0,
+                                              cluster_idx);
             double* remote_partial_centroids =
                 (double*)snrt_remote_l1_ptr(partial_centroids, 0,
                                             cluster_idx);
-            for (uint32_t centroid_idx = 0;
-                centroid_idx < n_clusters; centroid_idx++) {
+            for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                 centroid_idx++) {
                 // Accumulate membership counters
                 partial_membership_cnt[centroid_idx] +=
                     remote_partial_membership_cnt[centroid_idx];
                 // Accumulate centroid features
-                for (uint32_t feature_idx = 0;
-                    feature_idx < n_features; feature_idx++) {
+                for (uint32_t feature_idx = 0; feature_idx < n_features;
+                     feature_idx++) {
                     partial_centroids[centroid_idx * n_features +
-                        feature_idx] +=
+                                      feature_idx] +=
                         remote_partial_centroids[centroid_idx *
-                            n_features +
-                            feature_idx];
+                                                 n_features +
+                                                 feature_idx];
                 }
             }
         }
@@ -171,11 +167,11 @@ static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clus
 
         // Normalize
         for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
-            centroid_idx++) {
+             centroid_idx++) {
            for (uint32_t feature_idx = 0; feature_idx < n_features;
-                feature_idx++) {
+                 feature_idx++) {
                 partial_centroids[centroid_idx * n_features +
-                    feature_idx] /=
+                                  feature_idx] /=
                     partial_membership_cnt[centroid_idx];
             }
         }
@@ -195,12 +191,13 @@ void kmeans_job(kmeans_args_t* args) {
     uint32_t n_features = args->n_features;
     uint32_t n_clusters = args->n_clusters;
     uint32_t n_iter = args->n_iter;
-    void *samples = (void *)(args->samples_addr);
-    void *centroids = (void *)(args->centroids_addr);
+    void* samples = (void*)(args->samples_addr);
+    void* centroids = (void*)(args->centroids_addr);
 
     // Distribute work
     uint32_t n_samples_per_cluster = n_samples / snrt_cluster_num();
-    uint32_t n_samples_per_core = n_samples_per_cluster / snrt_cluster_compute_core_num();
+    uint32_t n_samples_per_core =
+        n_samples_per_cluster / snrt_cluster_compute_core_num();
 
     // Dynamically allocate space in TCDM
     double* local_samples = snrt_l1_alloc_cluster_local(
@@ -214,12 +211,10 @@ void kmeans_job(kmeans_args_t* args) {
     // First core's partial centroids will store final centroids
     double* partial_centroids = snrt_l1_alloc_compute_core_local(
         n_clusters * n_features * sizeof(double), sizeof(double));
-    double *final_centroids = snrt_compute_core_local_ptr(
-        partial_centroids,
-        0,
-        n_clusters * n_features * sizeof(double)
-    );
-    final_centroids = snrt_remote_l1_ptr(final_centroids, snrt_cluster_idx(), 0);
+    double* final_centroids = snrt_compute_core_local_ptr(
+        partial_centroids, 0, n_clusters * n_features * sizeof(double));
+    final_centroids =
+        snrt_remote_l1_ptr(final_centroids, snrt_cluster_idx(), 0);
 
     snrt_mcycle();
 
@@ -242,8 +237,8 @@ void kmeans_job(kmeans_args_t* args) {
     // Iterations of Lloyd's K-means algorithm
     for (uint32_t iter_idx = 0; iter_idx < n_iter; iter_idx++) {
         kmeans_iteration(n_samples_per_core, n_clusters, n_features,
-            local_samples, membership, partial_membership_cnt, local_centroids,
-            partial_centroids);
+                         local_samples, membership, partial_membership_cnt,
+                         local_centroids, partial_centroids);
         snrt_global_barrier();
         local_centroids = final_centroids;
         snrt_mcycle();
diff --git a/sw/apps/kmeans/src/main.c b/sw/apps/kmeans/src/main.c
index 52f8e0fdbd..cc27c9604e 100644
--- a/sw/apps/kmeans/src/main.c
+++ b/sw/apps/kmeans/src/main.c
@@ -10,7 +10,8 @@
 #include "kmeans.h"
 
 int main() {
-    kmeans_args_t args = {n_samples, n_features, n_clusters, n_iter, (uint64_t)samples, (uint64_t)centroids};
+    kmeans_args_t args = {n_samples, n_features, n_clusters,
+                          n_iter, (uint64_t)samples, (uint64_t)centroids};
     kmeans_job(&args);
     return 0;
 }
diff --git a/sw/blas/gemv/src/gemv.h b/sw/blas/gemv/src/gemv.h
index ef2d7d1707..2f13f94cfd 100644
--- a/sw/blas/gemv/src/gemv.h
+++ b/sw/blas/gemv/src/gemv.h
@@ -15,27 +15,29 @@ typedef struct {
     uint32_t trans;
     uint32_t m;
     uint32_t n;
-    double* a;
-    double* x;
-    double* y;
+    double *a;
+    double *x;
+    double *y;
 } gemv_args_t;
 
 static inline void single_core_gemv(uint32_t trans, uint32_t m, uint32_t n,
-    double alpha, double *a, uint32_t lda, double *x, uint32_t incx, double *y) {
-
+                                    double alpha, double *a, uint32_t lda,
+                                    double *x, uint32_t incx, double *y) {
     // Configure SSR 0 to stream a
     uint32_t ssr0_b[2] = {n, m};
     if (trans) {
-        uint32_t ssr0_i[2] = {lda*8, 8};
-        snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_i[0], ssr0_i[1]);
+        uint32_t ssr0_i[2] = {lda * 8, 8};
+        snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_i[0],
+                         ssr0_i[1]);
     } else {
-        uint32_t ssr0_i[2] = {8, lda*8};
-        snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_i[0], ssr0_i[1]);
+        uint32_t ssr0_i[2] = {8, lda * 8};
+        snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_i[0],
+                         ssr0_i[1]);
     }
 
     // Configure SSR 1 to stream x
     uint32_t ssr1_b[2] = {n, m};
-    uint32_t ssr1_i[2] = {8*incx, 0};
+    uint32_t ssr1_i[2] = {8 * incx, 0};
     snrt_ssr_loop_2d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_i[0], ssr1_i[1]);
 
     // Enable SSRs
@@ -63,9 +65,8 @@ static inline void single_core_gemv(uint32_t trans, uint32_t m, uint32_t n,
 
 // In contrast with BLAS we accept incx==0, as could be used e.g.
 // to compress vectors with a single value.
-static inline void gemv(uint32_t trans, uint32_t m, uint32_t n,
-    double alpha, double *a, double *x, uint32_t incx, double *y) {
-
+static inline void gemv(uint32_t trans, uint32_t m, uint32_t n, double alpha,
+                        double *a, double *x, uint32_t incx, double *y) {
     uint32_t frac_m, rem_m, start_m, core_m, lda;
     double *core_a;
 
@@ -74,9 +75,9 @@ static inline void gemv(uint32_t trans, uint32_t m, uint32_t n,
     frac_m = m / snrt_cluster_compute_core_num();
     rem_m = m % snrt_cluster_compute_core_num();
     start_m = snrt_cluster_core_idx() * frac_m;
-    core_m =
-        snrt_cluster_core_idx() == (snrt_cluster_compute_core_num() - 1) ?
-        frac_m + rem_m : frac_m;
+    core_m = snrt_cluster_core_idx() == (snrt_cluster_compute_core_num() - 1)
+                 ? frac_m + rem_m
+                 : frac_m;
     if (trans) {
         lda = m;
         core_a = &a[start_m];
@@ -87,5 +88,6 @@ static inline void gemv(uint32_t trans, uint32_t m, uint32_t n,
 
     // Every core computes its portion of rows
     if (core_m > 0)
-        single_core_gemv(trans, core_m, n, alpha, core_a, lda, x, incx, &y[start_m]);
+        single_core_gemv(trans, core_m, n, alpha, core_a, lda, x, incx,
+                         &y[start_m]);
 }
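
Note on the row partitioning that the gemv hunks reformat (but do not change): each compute core takes m / snrt_cluster_compute_core_num() rows, and the last core additionally absorbs the remainder. A minimal standalone sketch of that split in plain C, where m and num_cores are hypothetical stand-ins for the actual problem size and snrt core-count query:

    #include <stdint.h>
    #include <stdio.h>

    // Sketch of the gemv row split: every core gets frac_m rows, and the
    // last core also takes the rem_m remainder rows, as in the ternary above.
    int main(void) {
        uint32_t m = 10, num_cores = 4;  // hypothetical sizes
        uint32_t frac_m = m / num_cores;
        uint32_t rem_m = m % num_cores;
        for (uint32_t core = 0; core < num_cores; core++) {
            uint32_t start_m = core * frac_m;
            uint32_t core_m = (core == num_cores - 1) ? frac_m + rem_m : frac_m;
            printf("core %u: rows [%u, %u)\n", core, start_m, start_m + core_m);
        }
        return 0;
    }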
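Likewise, the comment in correlation_job explains why tiles are re-stored through snrt_dma_store_2d_tile: each cluster's result is contiguous and must be scattered into the full matrix with the proper stride. A plain-C sketch of the index arithmetic involved, assuming a row-major N x M matrix tiled along the M dimension (the function name and loop-based copy are illustrative only; the real code performs this transfer with the cluster DMA):

    #include <stdint.h>
    #include <string.h>

    // Sketch: scatter a contiguous (N x tile_M) tile into an (N x M)
    // row-major matrix at column offset tile_idx * tile_M, i.e. re-insert
    // the full-matrix stride M that the contiguous tile lacks.
    static void store_2d_tile_sketch(double *dst, const double *src,
                                     uint32_t tile_idx, uint32_t N,
                                     uint32_t tile_M, uint32_t M) {
        for (uint32_t row = 0; row < N; row++) {
            memcpy(&dst[row * M + tile_idx * tile_M], &src[row * tile_M],
                   tile_M * sizeof(double));
        }
    }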