Skip to content

Commit

Permalink
Correct linting
Browse files (browse the repository at this point in the history)
  • Loading branch information
colluca committed Aug 26, 2024
1 parent 07bba8f commit 009a3ec
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 85 deletions.
10 changes: 6 additions & 4 deletions sw/apps/atax/src/atax.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
#include "blas.h"
#include "snrt.h"

static inline void atax(uint32_t M, uint32_t N, double *A, double *x,
double *y, double *tmp) {
static inline void atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
double *tmp) {
double tmp_fs;
int core_range, core_offset, cluster_core_offset;

Expand Down Expand Up @@ -56,7 +56,8 @@ void atax_job(void *args) {

#ifndef JOB_ARGS_PRELOADED
// Allocate space for job arguments in TCDM
local_args = (atax_args_t *)snrt_l1_alloc_cluster_local(sizeof(atax_args_t), sizeof(double));
local_args = (atax_args_t *)snrt_l1_alloc_cluster_local(sizeof(atax_args_t),
sizeof(double));

// Copy job arguments to TCDM
if (snrt_is_dm_core()) {
Expand Down Expand Up @@ -102,7 +103,8 @@ void atax_job(void *args) {

// Writeback results
if (snrt_is_dm_core()) {
snrt_dma_store_1d_tile(y, local_y, snrt_cluster_idx(), N / snrt_cluster_num(), sizeof(double));
snrt_dma_store_1d_tile(y, local_y, snrt_cluster_idx(),
N / snrt_cluster_num(), sizeof(double));
snrt_dma_wait_all();
snrt_mcycle();
}
Expand Down
5 changes: 3 additions & 2 deletions sw/apps/covariance/src/covariance.h
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,9 @@ void covariance_job(covariance_args_t *args) {

#ifndef JOB_ARGS_PRELOADED
// Allocate space for job arguments in TCDM
covariance_args_t *local_args = (covariance_args_t *)snrt_l1_alloc_cluster_local(
sizeof(covariance_args_t), sizeof(double));
covariance_args_t *local_args =
(covariance_args_t *)snrt_l1_alloc_cluster_local(
sizeof(covariance_args_t), sizeof(double));

// Copy job arguments to TCDM
if (snrt_is_dm_core()) {
Expand Down
12 changes: 8 additions & 4 deletions sw/apps/kmeans/scripts/verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def get_expected_results(self):
self.initial_centroids = self.initial_centroids.reshape((self.n_clusters, self.n_features))
self.samples = self.samples.reshape((n_samples, self.n_features))
# Calculate expected results
final_centroids, _ = KmeansDataGen().golden_model(self.samples, self.n_clusters, self.initial_centroids, max_iter)
final_centroids, _ = KmeansDataGen().golden_model(self.samples, self.n_clusters,
self.initial_centroids, max_iter)
return final_centroids.flatten()

def check_results(self, *args):
Expand All @@ -55,9 +56,12 @@ def main(self):
expected_centroids = self.get_expected_results().reshape((self.n_clusters, self.n_features))
actual_centroids = self.get_actual_results().reshape((self.n_clusters, self.n_features))
if self.n_features == 2 and not self.args.no_gui:
KmeansDataGen().visualize_clusters(self.samples, self.initial_centroids, "Initial centroids")
KmeansDataGen().visualize_clusters(self.samples, expected_centroids, "Expected centroids")
KmeansDataGen().visualize_clusters(self.samples, actual_centroids, "Actual centroids")
KmeansDataGen().visualize_clusters(self.samples, self.initial_centroids,
"Initial centroids")
KmeansDataGen().visualize_clusters(self.samples, expected_centroids,
"Expected centroids")
KmeansDataGen().visualize_clusters(self.samples, actual_centroids,
"Actual centroids")

return retcode

Expand Down
109 changes: 52 additions & 57 deletions sw/apps/kmeans/src/kmeans.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,12 @@ double euclidean_distance_squared(uint32_t n_features, double* point1,
return sum;
}

static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clusters, uint32_t n_features, double *samples, uint32_t *membership, uint32_t *partial_membership_cnt, double *initial_centroids, double *partial_centroids) {
static inline void kmeans_iteration(uint32_t n_samples_per_core,
uint32_t n_clusters, uint32_t n_features,
double* samples, uint32_t* membership,
uint32_t* partial_membership_cnt,
double* initial_centroids,
double* partial_centroids) {
// Distribute work
uint32_t start_sample_idx;
uint32_t end_sample_idx;
Expand All @@ -36,17 +41,17 @@ static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clus

// Assignment step
for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
centroid_idx++) {
centroid_idx++) {
partial_membership_cnt[centroid_idx] = 0;
}
snrt_fpu_fence();
for (uint32_t sample_idx = start_sample_idx;
sample_idx < end_sample_idx; sample_idx++) {
sample_idx < end_sample_idx; sample_idx++) {
double min_dist = inf;
membership[sample_idx] = 0;

for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
centroid_idx++) {
centroid_idx++) {
double dist = euclidean_distance_squared(
n_features, &samples[sample_idx * n_features],
&initial_centroids[centroid_idx * n_features]);
Expand All @@ -68,22 +73,21 @@ static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clus
if (snrt_is_compute_core()) {
// Update step
for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
centroid_idx++) {
centroid_idx++) {
for (uint32_t feature_idx = 0; feature_idx < n_features;
feature_idx++) {
feature_idx++) {
// Initialize centroids to zero
// TODO: Can be optimized w/ DMA
partial_centroids[centroid_idx * n_features + feature_idx] =
0;
partial_centroids[centroid_idx * n_features + feature_idx] = 0;
}
}
snrt_fpu_fence();
for (uint32_t sample_idx = start_sample_idx;
sample_idx < end_sample_idx; sample_idx++) {
sample_idx < end_sample_idx; sample_idx++) {
for (uint32_t feature_idx = 0; feature_idx < n_features;
feature_idx++) {
feature_idx++) {
partial_centroids[membership[sample_idx] * n_features +
feature_idx] +=
feature_idx] +=
samples[sample_idx * n_features + feature_idx];
}
}
Expand All @@ -97,36 +101,29 @@ static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clus

if (snrt_is_compute_core()) {
if (snrt_cluster_core_idx() == 0) {

// Intra-cluster reduction
for (uint32_t core_idx = 1;
core_idx < snrt_cluster_compute_core_num(); core_idx++) {
core_idx < snrt_cluster_compute_core_num(); core_idx++) {
// Pointers to variables of the other core
uint32_t* remote_partial_membership_cnt =
snrt_compute_core_local_ptr(
partial_membership_cnt,
core_idx,
n_clusters * sizeof(uint32_t)
);
double* remote_partial_centroids =
snrt_compute_core_local_ptr(
partial_centroids,
core_idx,
n_clusters * n_features * sizeof(double)
);
uint32_t* remote_partial_membership_cnt =
snrt_compute_core_local_ptr(partial_membership_cnt,
core_idx,
n_clusters * sizeof(uint32_t));
double* remote_partial_centroids = snrt_compute_core_local_ptr(
partial_centroids, core_idx,
n_clusters * n_features * sizeof(double));
for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
centroid_idx++) {
centroid_idx++) {
// Accumulate membership counters
partial_membership_cnt[centroid_idx] +=
remote_partial_membership_cnt[centroid_idx];
// Accumulate centroid features
for (uint32_t feature_idx = 0; feature_idx < n_features;
feature_idx++) {
feature_idx++) {
partial_centroids[centroid_idx * n_features +
feature_idx] +=
remote_partial_centroids[centroid_idx *
n_features +
feature_idx];
feature_idx] +=
remote_partial_centroids[centroid_idx * n_features +
feature_idx];
}
}
}
Expand All @@ -137,32 +134,31 @@ static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clus
snrt_inter_cluster_barrier();

if (snrt_cluster_idx() == 0) {

snrt_mcycle();

// Inter-cluster reduction
for (uint32_t cluster_idx = 1;
cluster_idx < snrt_cluster_num(); cluster_idx++) {
for (uint32_t cluster_idx = 1; cluster_idx < snrt_cluster_num();
cluster_idx++) {
// Pointers to variables of remote clusters
uint32_t* remote_partial_membership_cnt =
(uint32_t*)snrt_remote_l1_ptr(
partial_membership_cnt, 0, cluster_idx);
(uint32_t*)snrt_remote_l1_ptr(partial_membership_cnt, 0,
cluster_idx);
double* remote_partial_centroids =
(double*)snrt_remote_l1_ptr(partial_centroids, 0,
cluster_idx);
for (uint32_t centroid_idx = 0;
centroid_idx < n_clusters; centroid_idx++) {
for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
centroid_idx++) {
// Accumulate membership counters
partial_membership_cnt[centroid_idx] +=
remote_partial_membership_cnt[centroid_idx];
// Accumulate centroid features
for (uint32_t feature_idx = 0;
feature_idx < n_features; feature_idx++) {
for (uint32_t feature_idx = 0; feature_idx < n_features;
feature_idx++) {
partial_centroids[centroid_idx * n_features +
feature_idx] +=
feature_idx] +=
remote_partial_centroids[centroid_idx *
n_features +
feature_idx];
n_features +
feature_idx];
}
}
}
Expand All @@ -171,11 +167,11 @@ static inline void kmeans_iteration(uint32_t n_samples_per_core, uint32_t n_clus

// Normalize
for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
centroid_idx++) {
centroid_idx++) {
for (uint32_t feature_idx = 0; feature_idx < n_features;
feature_idx++) {
feature_idx++) {
partial_centroids[centroid_idx * n_features +
feature_idx] /=
feature_idx] /=
partial_membership_cnt[centroid_idx];
}
}
Expand All @@ -195,12 +191,13 @@ void kmeans_job(kmeans_args_t* args) {
uint32_t n_features = args->n_features;
uint32_t n_clusters = args->n_clusters;
uint32_t n_iter = args->n_iter;
void *samples = (void *)(args->samples_addr);
void *centroids = (void *)(args->centroids_addr);
void* samples = (void*)(args->samples_addr);
void* centroids = (void*)(args->centroids_addr);

// Distribute work
uint32_t n_samples_per_cluster = n_samples / snrt_cluster_num();
uint32_t n_samples_per_core = n_samples_per_cluster / snrt_cluster_compute_core_num();
uint32_t n_samples_per_core =
n_samples_per_cluster / snrt_cluster_compute_core_num();

// Dynamically allocate space in TCDM
double* local_samples = snrt_l1_alloc_cluster_local(
Expand All @@ -214,12 +211,10 @@ void kmeans_job(kmeans_args_t* args) {
// First core's partial centroids will store final centroids
double* partial_centroids = snrt_l1_alloc_compute_core_local(
n_clusters * n_features * sizeof(double), sizeof(double));
double *final_centroids = snrt_compute_core_local_ptr(
partial_centroids,
0,
n_clusters * n_features * sizeof(double)
);
final_centroids = snrt_remote_l1_ptr(final_centroids, snrt_cluster_idx(), 0);
double* final_centroids = snrt_compute_core_local_ptr(
partial_centroids, 0, n_clusters * n_features * sizeof(double));
final_centroids =
snrt_remote_l1_ptr(final_centroids, snrt_cluster_idx(), 0);

snrt_mcycle();

Expand All @@ -242,8 +237,8 @@ void kmeans_job(kmeans_args_t* args) {
// Iterations of Lloyd's K-means algorithm
for (uint32_t iter_idx = 0; iter_idx < n_iter; iter_idx++) {
kmeans_iteration(n_samples_per_core, n_clusters, n_features,
local_samples, membership, partial_membership_cnt, local_centroids,
partial_centroids);
local_samples, membership, partial_membership_cnt,
local_centroids, partial_centroids);
snrt_global_barrier();
local_centroids = final_centroids;
snrt_mcycle();
Expand Down
3 changes: 2 additions & 1 deletion sw/apps/kmeans/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
#include "kmeans.h"

int main() {
kmeans_args_t args = {n_samples, n_features, n_clusters, n_iter, (uint64_t)samples, (uint64_t)centroids};
kmeans_args_t args = {n_samples, n_features, n_clusters,
n_iter, (uint64_t)samples, (uint64_t)centroids};
kmeans_job(&args);
return 0;
}
36 changes: 19 additions & 17 deletions sw/blas/gemv/src/gemv.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,27 +15,29 @@ typedef struct {
uint32_t trans;
uint32_t m;
uint32_t n;
double* a;
double* x;
double* y;
double *a;
double *x;
double *y;
} gemv_args_t;

static inline void single_core_gemv(uint32_t trans, uint32_t m, uint32_t n,
double alpha, double *a, uint32_t lda, double *x, uint32_t incx, double *y) {

double alpha, double *a, uint32_t lda,
double *x, uint32_t incx, double *y) {
// Configure SSR 0 to stream a
uint32_t ssr0_b[2] = {n, m};
if (trans) {
uint32_t ssr0_i[2] = {lda*8, 8};
snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_i[0], ssr0_i[1]);
uint32_t ssr0_i[2] = {lda * 8, 8};
snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_i[0],
ssr0_i[1]);
} else {
uint32_t ssr0_i[2] = {8, lda*8};
snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_i[0], ssr0_i[1]);
uint32_t ssr0_i[2] = {8, lda * 8};
snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_i[0],
ssr0_i[1]);
}

// Configure SSR 1 to stream x
uint32_t ssr1_b[2] = {n, m};
uint32_t ssr1_i[2] = {8*incx, 0};
uint32_t ssr1_i[2] = {8 * incx, 0};
snrt_ssr_loop_2d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_i[0], ssr1_i[1]);

// Enable SSRs
Expand Down Expand Up @@ -63,9 +65,8 @@ static inline void single_core_gemv(uint32_t trans, uint32_t m, uint32_t n,

// In contrast with BLAS, we accept incx == 0, as this could be used, e.g.,
// to compress vectors with a single value.
static inline void gemv(uint32_t trans, uint32_t m, uint32_t n,
double alpha, double *a, double *x, uint32_t incx, double *y) {

static inline void gemv(uint32_t trans, uint32_t m, uint32_t n, double alpha,
double *a, double *x, uint32_t incx, double *y) {
uint32_t frac_m, rem_m, start_m, core_m, lda;
double *core_a;

Expand All @@ -74,9 +75,9 @@ static inline void gemv(uint32_t trans, uint32_t m, uint32_t n,
frac_m = m / snrt_cluster_compute_core_num();
rem_m = m % snrt_cluster_compute_core_num();
start_m = snrt_cluster_core_idx() * frac_m;
core_m =
snrt_cluster_core_idx() == (snrt_cluster_compute_core_num() - 1) ?
frac_m + rem_m : frac_m;
core_m = snrt_cluster_core_idx() == (snrt_cluster_compute_core_num() - 1)
? frac_m + rem_m
: frac_m;
if (trans) {
lda = m;
core_a = &a[start_m];
Expand All @@ -87,5 +88,6 @@ static inline void gemv(uint32_t trans, uint32_t m, uint32_t n,

// Every core computes its portion of rows
if (core_m > 0)
single_core_gemv(trans, core_m, n, alpha, core_a, lda, x, incx, &y[start_m]);
single_core_gemv(trans, core_m, n, alpha, core_a, lda, x, incx,
&y[start_m]);
}

0 comments on commit 009a3ec

Please sign in to comment.