diff --git a/python-requirements.txt b/python-requirements.txt
index 72c35460f..d9a7d89a9 100644
--- a/python-requirements.txt
+++ b/python-requirements.txt
@@ -25,6 +25,7 @@ pyflexfloat
 pytablewriter
 pytest
 pyyaml
+scikit-learn
 tabulate
 termcolor
 yamllint
diff --git a/sw/apps/atax/scripts/datagen.py b/sw/apps/atax/scripts/datagen.py
index 0008bea26..ffa633ff3 100755
--- a/sw/apps/atax/scripts/datagen.py
+++ b/sw/apps/atax/scripts/datagen.py
@@ -3,7 +3,7 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
-# Author: Jose Pedro Castro Fonseca
+# Author: Jose Pedro Castro Fonseca
+#         Luca Colagrande
 
 import numpy as np
@@ -21,17 +21,31 @@ class AtaxDataGen(du.DataGen):
     def golden_model(self, A, x):
         return np.matmul(A.transpose(), np.matmul(A, x))
 
+    def validate(self, M, N, **kwargs):
+        assert (N % 8) == 0, "N must be an integer multiple of the number of cores"
+
+        # Calculate total TCDM occupation
+        a_size = M * N * 8
+        x_size = N * 8
+        y_size = N * 8
+        tmp_size = M * 8
+        total_size = a_size
+        total_size += x_size
+        total_size += y_size
+        total_size += tmp_size
+        du.validate_tcdm_footprint(total_size)
+
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
+        # Validate parameters
+        self.validate(**kwargs)
+
         M, N = kwargs['M'], kwargs['N']
         A = du.generate_random_array((M, N))
         x = du.generate_random_array((N, 1))
         y = self.golden_model(A, x)
 
-        assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
-        assert (N % 8) == 0, "N must be an integer multiple of the number of cores"
-
         A = A.flatten()
         x = x.flatten()
         y = y.flatten()
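The new `validate()` hook rejects problem sizes whose working set does not fit in the cluster's TCDM, before any data is generated. A minimal sketch of the arithmetic it performs, assuming the default 128 KiB snitch_cluster TCDM (the actual limit is enforced inside `du.validate_tcdm_footprint`, which also reserves some space for the stack):

```python
# Rough model of the TCDM footprint check in AtaxDataGen.validate().
# The 128 KiB capacity is an assumption for illustration; the real check
# lives in snitch.util.sim.data_utils.validate_tcdm_footprint().
TCDM_BYTES = 128 * 1024

def atax_tcdm_footprint(M, N, prec=8):
    # A (MxN), x (N), y (N) and tmp (M), all in double precision
    return (M * N + N + N + M) * prec

for M, N in [(16, 32), (64, 64), (128, 128)]:
    total = atax_tcdm_footprint(M, N)
    print(M, N, total, total <= TCDM_BYTES)
```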
diff --git a/sw/apps/atax/src/args.h b/sw/apps/atax/src/args.h
new file mode 100644
index 000000000..c02b2da4b
--- /dev/null
+++ b/sw/apps/atax/src/args.h
@@ -0,0 +1,16 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#pragma once
+#include <stdint.h>
+
+typedef struct {
+    uint32_t M;
+    uint32_t N;
+    uint64_t A_addr;
+    uint64_t x_addr;
+    uint64_t y_addr;
+} atax_args_t;
diff --git a/sw/apps/atax/src/atax.h b/sw/apps/atax/src/atax.h
index 012b330b0..3dabb4d8c 100644
--- a/sw/apps/atax/src/atax.h
+++ b/sw/apps/atax/src/atax.h
@@ -6,35 +6,33 @@
 //         Luca Colagrande
 
 #include <stdint.h>
+#include "args.h"
+#include "blas.h"
 #include "snrt.h"
 
-void kernel_atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
-                 double *tmp) {
+static inline void atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
+                        double *tmp) {
     double tmp_fs;
-    int core_range, core_offset;
+    int core_range, core_offset, cluster_core_offset;
 
     // tmp = A * x
     if (snrt_is_compute_core()) {
-        core_range = M / snrt_cluster_compute_core_num();
-        core_offset = snrt_cluster_core_idx() * core_range;
-        for (int i1 = 0; i1 < core_range; i1++) {
-            int i = core_offset + i1;
-            tmp_fs = 0.0;
-            for (int j = 0; j < N; j++) {
-                tmp_fs += A[i * N + j] * x[j];
-            }
-            tmp[i] = tmp_fs;
-        }
+        snrt_mcycle();
+        gemv(0, M, N, 1, A, x, 1, tmp);
+        snrt_mcycle();
     }
 
     snrt_cluster_hw_barrier();
 
     // y = At * tmp
     if (snrt_is_compute_core()) {
-        core_range = N / snrt_cluster_compute_core_num();
-        core_offset = snrt_cluster_core_idx() * core_range;
+        snrt_mcycle();
+        core_range = N / snrt_global_compute_core_num();
+        core_offset = snrt_global_compute_core_idx() * core_range;
+        cluster_core_offset = snrt_cluster_core_idx() * core_range;
         for (int j1 = 0; j1 < core_range; j1++) {
             int j = core_offset + j1;
+            int cluster_j = cluster_core_offset + j1;
             tmp_fs = 0.0;
             for (int i = 0; i < M; i++) {
                 // The order of the for loops was exchanged, so that each loop
@@ -42,7 +40,80 @@ void kernel_atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
                 // positions.
                 tmp_fs += A[i * N + j] * tmp[i];
             }
-            y[j] = tmp_fs;
+            y[cluster_j] = tmp_fs;
         }
+        snrt_fpu_fence();
+        snrt_mcycle();
+    }
+}
+
+void atax_job(void *args) {
+    double *local_A;
+    double *local_x;
+    double *local_y;
+    double *local_tmp;
+    atax_args_t *local_args;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    local_args = (atax_args_t *)snrt_l1_alloc_cluster_local(sizeof(atax_args_t),
+                                                            sizeof(double));
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(atax_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+#else
+    local_args = (atax_args_t *)args;
+#endif
+
+    // Aliases
+    uint32_t M = local_args->M;
+    uint32_t N = local_args->N;
+    double *A = (double *)(local_args->A_addr);
+    double *x = (double *)(local_args->x_addr);
+    double *y = (double *)(local_args->y_addr);
+
+    // Allocate local variables
+    size_t size_A = M * N * sizeof(double);
+    size_t size_x = N * sizeof(double);
+    size_t size_y = N * sizeof(double);
+    size_t size_tmp = M * sizeof(double);
+    size_t size_y_tile = size_y / snrt_cluster_num();
+    local_A = snrt_l1_alloc_cluster_local(size_A, sizeof(double));
+    local_x = snrt_l1_alloc_cluster_local(size_x, sizeof(double));
+    local_y = snrt_l1_alloc_cluster_local(size_y_tile, sizeof(double));
+    local_tmp = snrt_l1_alloc_cluster_local(size_tmp, sizeof(double));
+
+    // Initialize input matrices
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_A, A, size_A);
+        snrt_dma_start_1d(local_x, x, size_x);
+        snrt_dma_wait_all();
+    }
+    snrt_mcycle();
+    snrt_cluster_hw_barrier();
+
+    // Compute
+    atax(M, N, local_A, local_x, local_y, local_tmp);
+    snrt_cluster_hw_barrier();
+    snrt_mcycle();
+
+    // Writeback results
+    if (snrt_is_dm_core()) {
+        snrt_dma_store_1d_tile(y, local_y, snrt_cluster_idx(),
+                               N / snrt_cluster_num(), sizeof(double));
+        snrt_dma_wait_all();
+        snrt_mcycle();
     }
+    snrt_cluster_hw_barrier();
+
+    // Free memory
+#ifndef JOB_ARGS_PRELOADED
+    snrt_l1_update_next_v2(local_args);
+#else
+    snrt_l1_update_next_v2(local_A);
+#endif
 }
diff --git a/sw/apps/atax/src/main.c b/sw/apps/atax/src/main.c
index 98454d875..1f691dcc9 100644
--- a/sw/apps/atax/src/main.c
+++ b/sw/apps/atax/src/main.c
@@ -12,46 +12,16 @@ int main() {
     uint32_t nerr = 0;
-    double *local_A;
-    double *local_x;
-    double *local_y;
-    double *local_tmp;
-
-    // Allocate local variables
-    local_A = snrt_l1_next();
-    local_x = local_A + M * N;
-    local_y = local_x + N;
-    local_tmp = local_y + N;
-
-    // Initialize input matrices
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(local_A, A, sizeof(double) * M * N);
-        snrt_dma_start_1d(local_x, x, sizeof(double) * N);
-        snrt_dma_start_1d(local_y, (void *)snrt_zero_memory_ptr(),
-                          sizeof(double) * N);
-        snrt_dma_start_1d(local_tmp, (void *)snrt_zero_memory_ptr(),
-                          sizeof(double) * M);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
-
-    // Compute
-    kernel_atax(M, N, local_A, local_x, local_y, local_tmp);
-    snrt_cluster_hw_barrier();
-
-    // Writeback results
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(y, local_y, sizeof(double) * N);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
+    atax_args_t args = {M, N, (uint64_t)A, (uint64_t)x, (uint64_t)y};
+    atax_job(&args);
 
     // Check computation is correct
 #ifdef BIST
     if (snrt_cluster_core_idx() == 0) {
         // Check y
         for (int i = 0; i < N; i++) {
-            double diff = fabs(golden[i] - local_y[i]);
+            double diff = fabs(golden[i] - y[i]);
             if (diff > MAX_ERROR) {
                 nerr++;
             }
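The second GEMV is now parallelized across clusters: each cluster computes a contiguous `N / snrt_cluster_num()` slice of `y` into a local tile (indexed from zero via `cluster_j`), and its DM core scatters the tile back with `snrt_dma_store_1d_tile`. A NumPy sketch of that tiling with illustrative sizes, to check the index math:

```python
import numpy as np

# Sketch of the multi-cluster y-tiling in atax_job(): each cluster computes
# a contiguous N/num_clusters slice of y = A^T (A x) and writes it back at
# tile_idx * tile_size, mirroring snrt_dma_store_1d_tile. Sizes are made up.
M, N, num_clusters = 16, 16, 4
A = np.random.rand(M, N)
x = np.random.rand(N)
y_ref = A.T @ (A @ x)

tile = N // num_clusters
y = np.empty(N)
for cluster_idx in range(num_clusters):
    cols = slice(cluster_idx * tile, (cluster_idx + 1) * tile)
    # local_y holds only this cluster's tile, indexed from 0 (cluster_j)
    local_y = A[:, cols].T @ (A @ x)
    y[cols] = local_y  # the 1D-tile writeback

assert np.allclose(y, y_ref)
```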
diff --git a/sw/apps/common.mk b/sw/apps/common.mk
index 6bdc85984..e45da98ea 100644
--- a/sw/apps/common.mk
+++ b/sw/apps/common.mk
@@ -12,12 +12,14 @@ SECTION ?=
 DATA_H := $($(APP)_BUILD_DIR)/data.h
 DATAGEN_PY = $(SCRIPTS_DIR)/datagen.py
 
-$(APP)_HEADERS := $(DATA_H)
-$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR)
+$(APP)_HEADERS := $(DATA_H)
+$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR)
+$(APP)_DATAGEN_ARGS += -c $($(APP)_DATA_CFG)
+$(APP)_DATAGEN_ARGS += --section="$(SECTION)"
 
 $(dir $(DATA_H)):
	mkdir -p $@
 
-$(DATA_H): DATA_CFG := $($(APP)_DATA_CFG)
+$(DATA_H): DATAGEN_ARGS := $($(APP)_DATAGEN_ARGS)
 $(DATA_H): $(DATAGEN_PY) $($(APP)_DATA_CFG) | $(dir $(DATA_H))
-	$< -c $(DATA_CFG) --section="$(SECTION)" $@
+	$< $(DATAGEN_ARGS) $@
diff --git a/sw/apps/correlation/scripts/datagen.py b/sw/apps/correlation/scripts/datagen.py
index d60f527d1..57d18fac8 100755
--- a/sw/apps/correlation/scripts/datagen.py
+++ b/sw/apps/correlation/scripts/datagen.py
@@ -3,7 +3,7 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
-# Author: Jose Pedro Castro Fonseca
+# Author: Jose Pedro Castro Fonseca
+#         Luca Colagrande
 
 import numpy as np
@@ -21,9 +21,24 @@ class CorrelationDataGen(du.DataGen):
     def golden_model(self, data):
         return np.corrcoef(data, rowvar=False)
 
+    def validate(self, M, N, **kwargs):
+        assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
+
+        # Calculate total TCDM occupation
+        data_size = N * M * 8
+        corr_size = M * M * 8
+        stddev_size = M * 8
+        total_size = data_size
+        total_size += corr_size
+        total_size += stddev_size
+        du.validate_tcdm_footprint(total_size)
+
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
+        # Validate parameters
+        self.validate(**kwargs)
+
         M, N = kwargs['M'], kwargs['N']
         data = du.generate_random_array((N, M))
         corr = self.golden_model(data)
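The device kernel splits the correlation into two steps (centering the data, then forming the normalized cross-products), and both together must reproduce the `np.corrcoef(data, rowvar=False)` golden model above. A hedged NumPy sketch of that equivalence, assuming the kernel normalizes the standard deviation by N as PolyBench does (the normalization constant cancels against `corrcoef`'s N-1 convention):

```python
import numpy as np

# Two-step correlation as the device kernel computes it, checked against
# the np.corrcoef golden model. Sizes are illustrative.
N, M = 32, 8
data = np.random.rand(N, M)

# Step 1: center each attribute (column) and compute its stddev
mean = data.mean(axis=0)
dev = data - mean
stddev = np.sqrt((dev ** 2).mean(axis=0))

# Step 2: correlation from the centered data
corr = (dev.T @ dev) / (N * np.outer(stddev, stddev))

assert np.allclose(corr, np.corrcoef(data, rowvar=False))
```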
diff --git a/sw/apps/correlation/src/args.h b/sw/apps/correlation/src/args.h
new file mode 100644
index 000000000..c8f660dbf
--- /dev/null
+++ b/sw/apps/correlation/src/args.h
@@ -0,0 +1,15 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#pragma once
+#include <stdint.h>
+
+typedef struct {
+    uint32_t N;
+    uint32_t M;
+    uint64_t data_addr;
+    uint64_t corr_addr;
+} correlation_args_t;
diff --git a/sw/apps/correlation/src/correlation.h b/sw/apps/correlation/src/correlation.h
index 62796e1d5..0ace1b29f 100644
--- a/sw/apps/correlation/src/correlation.h
+++ b/sw/apps/correlation/src/correlation.h
@@ -7,18 +7,23 @@
 #include <math.h>
 #include <stdint.h>
+#include "args.h"
 #include "snrt.h"
 
-void kernel_correlation(uint32_t N, uint32_t M, double *data, double *stddev,
-                        double *corr) {
+// Single-cluster computation of the first step in the correlation kernel
+static inline void correlation_step1(uint32_t N, uint32_t M, double *data,
+                                     double *stddev) {
     int i1, i, j, k;
     int core_range, core_offset;
 
     // Compute deviations
     if (snrt_is_compute_core()) {
+        snrt_mcycle();
+
         // Distribute different attributes to the different cores
         core_range = M / snrt_cluster_compute_core_num();
         core_offset = snrt_cluster_core_idx() * core_range;
+
         for (i1 = 0; i1 < core_range; i1++) {
             i = core_offset + i1;
 
@@ -41,12 +46,26 @@ void kernel_correlation(uint32_t N, uint32_t M, double *data, double *stddev,
                 data[k * M + i] -= mean;
             }
         }
+        snrt_fpu_fence();
+
+        snrt_mcycle();
     }
+}
 
-    snrt_cluster_hw_barrier();
+// Single-cluster computation of the second step in the correlation kernel
+static inline void correlation_step2(uint32_t N, uint32_t M, double *data,
+                                     double *stddev, double *corr) {
+    int i1, i, j, k;
+    int core_range, core_offset;
 
     // Compute correlation
     if (snrt_is_compute_core()) {
+        snrt_mcycle();
+
+        // Distribute different attributes to the different cores
+        core_range = M / snrt_cluster_compute_core_num();
+        core_offset = snrt_cluster_core_idx() * core_range;
+
         for (i1 = 0; i1 < core_range; i1++) {
             i = core_offset + i1;
             for (j = 0; j <= i; j++) {
@@ -58,5 +77,128 @@ void kernel_correlation(uint32_t N, uint32_t M, double *data, double *stddev,
                 corr[j * M + i] = corr[i * M + j];
             }
         }
+        snrt_fpu_fence();
+
+        snrt_mcycle();
+    }
+}
+
+void correlation_job(void *args) {
+    double *local_data;
+    double *local_stddev;
+    double *local_corr;
+    correlation_args_t *local_args;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    local_args = (correlation_args_t *)snrt_l1_alloc_cluster_local(
+        sizeof(correlation_args_t), sizeof(double));
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(correlation_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+#else
+    local_args = (correlation_args_t *)args;
+#endif
+
+    // Aliases
+    uint32_t M = local_args->M;
+    uint32_t N = local_args->N;
+    double *data = (double *)(local_args->data_addr);
+    double *corr = (double *)(local_args->corr_addr);
+
+    // Allocate local variables
+    size_t size_data = N * M * sizeof(double);
+    size_t size_corr = M * M * sizeof(double);
+    size_t size_stddev = M * sizeof(double);
+    local_data = snrt_l1_alloc_cluster_local(size_data, sizeof(double));
+    local_corr = snrt_l1_alloc_cluster_local(size_corr, sizeof(double));
+    local_stddev = snrt_l1_alloc_cluster_local(size_stddev, sizeof(double));
+
+    // Parallelize step 1 across clusters, distributing the M columns
+    size_t tile_M = M / snrt_cluster_num();
+
+    // Load input matrix tile
+    if (snrt_is_dm_core()) {
+        snrt_dma_load_2d_tile(local_data,          // dst
+                              data,                // src
+                              0,                   // tile_x1_idx
+                              snrt_cluster_idx(),  // tile_x0_idx
+                              N,                   // tile_x1_size
+                              tile_M,              // tile_x0_size
+                              M,                   // full_x0_size
+                              sizeof(double)       // prec
+        );
+        snrt_dma_wait_all();
+    }
+    snrt_mcycle();
+    snrt_cluster_hw_barrier();
+
+    // Perform step 1 of the correlation
+    correlation_step1(N, tile_M, local_data, local_stddev);
+    snrt_global_barrier();
+
+    // The rest of the computation is done only on cluster 0
+    if (snrt_cluster_idx() == 0) {
+        // Aggregate data in cluster 0
+        if (snrt_is_dm_core()) {
+            snrt_mcycle();
+
+            // Theoretically speaking, moving the data into cluster 0's TCDM
+            // is not required. However, we need to reshape it, because
+            // `correlation_step1` is currently implemented in a way such
+            // that it stores the output tile as contiguous data, not with
+            // the proper stride it would have in the full matrix.
+            for (unsigned int i = 0; i < snrt_cluster_num(); i++) {
+                double *remote_data =
+                    snrt_remote_l1_ptr(local_data, snrt_cluster_idx(), i);
+                snrt_dma_store_2d_tile(local_data,     // dst
+                                       remote_data,    // src
+                                       0,              // tile_x1_idx
+                                       i,              // tile_x0_idx
+                                       N,              // tile_x1_size
+                                       tile_M,         // tile_x0_size
+                                       M,              // full_x0_size
+                                       sizeof(double)  // prec
+                );
+                double *remote_stddev =
+                    snrt_remote_l1_ptr(local_stddev, snrt_cluster_idx(), i);
+                snrt_dma_store_1d_tile(local_stddev,   // dst
+                                       remote_stddev,  // src
+                                       i,              // tile_idx
+                                       tile_M,         // tile_size
+                                       sizeof(double)  // prec
+                );
+            }
+            snrt_dma_wait_all();
+
+            snrt_mcycle();
+        }
+        snrt_cluster_hw_barrier();
+
+        // Perform step 2 of the correlation
+        correlation_step2(N, M, local_data, local_stddev, local_corr);
+        snrt_cluster_hw_barrier();
+        snrt_mcycle();
+
+        // Cluster 0 writes back output matrix
+        if (snrt_is_dm_core()) {
+            snrt_dma_start_1d(corr, local_corr, size_corr);
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+        snrt_cluster_hw_barrier();
+    } else {
+        snrt_mcycle();
     }
+
+    // Free memory
+#ifndef JOB_ARGS_PRELOADED
+    snrt_l1_update_next_v2(local_args);
+#else
+    snrt_l1_update_next_v2(local_data);
+#endif
 }
diff --git a/sw/apps/correlation/src/main.c b/sw/apps/correlation/src/main.c
index 54c2723d8..4881e0a95 100644
--- a/sw/apps/correlation/src/main.c
+++ b/sw/apps/correlation/src/main.c
@@ -12,40 +12,16 @@ int main() {
     uint32_t nerr = 0;
-    double *local_mean;
-    double *local_corr;
-    double *local_stddev;
-    double *local_data;
-    double diff;
-
-    local_data = snrt_l1_next();
-    local_corr = local_data + N * M;
-    local_stddev = local_corr + M * M;
-
-    // Initialize input matrix
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(local_data, data, sizeof(double) * N * M);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
-
-    // Perform Computations
-    kernel_correlation(N, M, local_data, local_stddev, local_corr);
-    snrt_cluster_hw_barrier();
-
-    // Writeback outputs
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(corr, local_corr, sizeof(double) * M * M);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
+    correlation_args_t args = {N, M, (uint64_t)data, (uint64_t)corr};
+    correlation_job(&args);
 
 #ifdef BIST
     // Check computation is correct
     if (snrt_cluster_core_idx() == 0) {
         for (int i = 0; i < M; i++) {
             for (int j = 0; j < M; j++) {
-                diff = fabs(golden[i * M + j] - local_corr[i * M + j]);
+                double diff = fabs(golden[i * M + j] - corr[i * M + j]);
                 if (diff > MAX_ERROR) {
                     nerr++;
                 }
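Step 1 leaves each cluster's output tile contiguous in its own TCDM, so the gather on cluster 0 has to re-stride the tiles back into the full (N x M) layout; that is what the `snrt_dma_store_2d_tile` calls above do. A NumPy model of that reshape, with illustrative sizes:

```python
import numpy as np

# Model of the gather in correlation_job(): each cluster's step-1 output is
# a contiguous (N x tile_M) block, but in the full (N x M) matrix those
# columns live at row stride M. The 2D-tile DMA re-strides them.
N, M, num_clusters = 8, 8, 4
tile_M = M // num_clusters
full = np.random.rand(N, M)

# What each cluster holds after step 1: its column tile, stored contiguously
tiles = [full[:, i * tile_M:(i + 1) * tile_M].copy() for i in range(num_clusters)]

# Cluster 0's gather: place tile i at column offset i * tile_M of the full
# matrix, i.e. a strided 2D copy per tile
gathered = np.empty((N, M))
for i, tile in enumerate(tiles):
    gathered[:, i * tile_M:(i + 1) * tile_M] = tile

assert np.array_equal(gathered, full)
```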
diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h
index cdeb427bf..7778afed0 100644
--- a/sw/apps/covariance/src/covariance.h
+++ b/sw/apps/covariance/src/covariance.h
@@ -244,7 +244,9 @@ void covariance_job(covariance_args_t *args) {
 #ifndef JOB_ARGS_PRELOADED
     // Allocate space for job arguments in TCDM
-    covariance_args_t *local_args = (covariance_args_t *)snrt_l1_next();
+    covariance_args_t *local_args =
+        (covariance_args_t *)snrt_l1_alloc_cluster_local(
+            sizeof(covariance_args_t), sizeof(double));
 
     // Copy job arguments to TCDM
     if (snrt_is_dm_core()) {
@@ -263,19 +265,13 @@ void covariance_job(covariance_args_t *args) {
     b_tile_bytes = b_tile_size * sizeof(double);
 
     // Allocate space for job operands in TCDM
-    local_a0_addr = (uint64_t)args + sizeof(covariance_args_t);
-    local_at0_addr = local_a0_addr + a_tile_bytes;
-    local_b0_addr = local_at0_addr + a_tile_bytes;
-    local_a[0] = (double *)local_a0_addr;
-    local_at[0] = (double *)local_at0_addr;
-    local_b[0] = (double *)local_b0_addr;
+    local_a[0] = snrt_l1_alloc_cluster_local(a_tile_bytes, sizeof(double));
+    local_at[0] = snrt_l1_alloc_cluster_local(a_tile_bytes, sizeof(double));
+    local_b[0] = snrt_l1_alloc_cluster_local(b_tile_bytes, sizeof(double));
     if (DOUBLE_BUFFER) {
-        local_a1_addr = local_b0_addr + b_tile_bytes;
-        local_at1_addr = local_a1_addr + a_tile_bytes;
-        local_b1_addr = local_at1_addr + a_tile_bytes;
-        local_a[1] = (double *)local_a1_addr;
-        local_at[1] = (double *)local_at1_addr;
-        local_b[1] = (double *)local_b1_addr;
+        local_a[1] = snrt_l1_alloc_cluster_local(a_tile_bytes, sizeof(double));
+        local_at[1] = snrt_l1_alloc_cluster_local(a_tile_bytes, sizeof(double));
+        local_b[1] = snrt_l1_alloc_cluster_local(b_tile_bytes, sizeof(double));
     }
 
     // Calculate number of iterations
@@ -364,4 +360,11 @@ void covariance_job(covariance_args_t *args) {
         // Synchronize cores after every iteration
         snrt_cluster_hw_barrier();
     }
+
+    // Free memory
+#ifndef JOB_ARGS_PRELOADED
+    snrt_l1_update_next_v2(args);
+#else
+    snrt_l1_update_next_v2(local_a[0]);
+#endif
 }
diff --git a/sw/apps/kmeans/data/params.json b/sw/apps/kmeans/data/params.json
new file mode 100644
index 000000000..b2f50d01d
--- /dev/null
+++ b/sw/apps/kmeans/data/params.json
@@ -0,0 +1,11 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    n_clusters: 3,
+    n_features: 2,
+    n_samples: 128,
+    max_iter: 3,
+    seed: 42
+}
diff --git a/sw/apps/kmeans/scripts/datagen.py b/sw/apps/kmeans/scripts/datagen.py
new file mode 100755
index 000000000..f99f043c0
--- /dev/null
+++ b/sw/apps/kmeans/scripts/datagen.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Authors: Luca Colagrande
+
+import matplotlib.pyplot as plt
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans
+import numpy as np
+
+import snitch.util.sim.data_utils as du
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits, the data should be aligned to 4KB.
+BURST_ALIGNMENT = 4096
+
+
+class KmeansDataGen(du.DataGen):
+
+    def parser(self):
+        p = super().parser()
+        p.add_argument(
+            '--no-gui',
+            action='store_true',
+            help='Run without visualization')
+        return p
+
+    def main(self):
+        super().main()
+
+        # Visualize centroids
+        args = self.parse_args()
+        gui = not args.no_gui
+        if gui:
+            self.visualize_clusters(self.X, self.initial_centroids)
+            self.visualize_clusters(self.X, self.expected_centroids)
+
+    def golden_model(self, samples, n_clusters, initial_centroids, max_iter):
+        # Apply k-means clustering
+        kmeans = KMeans(
+            n_clusters=n_clusters,
+            init=initial_centroids,
+            max_iter=max_iter
+        )
+        kmeans.fit(samples)
+        return kmeans.cluster_centers_, kmeans.n_iter_
+
+    def visualize_clusters(self, samples, centroids, title=None):
+        plt.scatter(samples[:, 0], samples[:, 1], s=30)
+        plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200, linewidths=3,
+                    color='red')
+        if not title:
+            title = "K-means clusters"
+        plt.title(title)
+        plt.xlabel("Feature 1")
+        plt.ylabel("Feature 2")
+        plt.show()
+
+    def validate(self, **kwargs):
+        assert (kwargs['n_samples'] % 8) == 0, 'Number of samples must be a multiple of the' \
+            ' number of cores'
+
+    def emit_header(self, **kwargs):
+        header = [super().emit_header()]
+
+        # Validate parameters
+        self.validate(**kwargs)
+
+        # Aliases
+        n_samples = kwargs['n_samples']
+        n_features = kwargs['n_features']
+        n_clusters = kwargs['n_clusters']
+        seed = kwargs['seed']
+        max_iter = kwargs['max_iter']
+
+        # Generate random samples
+        X, _ = make_blobs(
+            n_samples=n_samples,
+            n_features=n_features,
+            centers=n_clusters,
+            random_state=seed
+        )
+
+        # Generate initial centroids randomly
+        rng = np.random.default_rng(seed=seed)
+        initial_centroids = rng.uniform(low=X.min(axis=0), high=X.max(axis=0),
+                                        size=(n_clusters, n_features))
+
+        # Calculate expected centroids and number of iterations
+        expected_centroids, n_iter = self.golden_model(X, n_clusters, initial_centroids,
+                                                       max_iter)
+
+        # Generate header
+        header += [du.format_scalar_definition('uint32_t', 'n_samples', n_samples)]
+        header += [du.format_scalar_definition('uint32_t', 'n_features', n_features)]
+        header += [du.format_scalar_definition('uint32_t', 'n_clusters', n_clusters)]
+        header += [du.format_scalar_definition('uint32_t', 'n_iter', n_iter)]
+        header += [du.format_array_definition('double', 'centroids', initial_centroids.flatten(),
+                                              alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+        header += [du.format_array_definition('double', 'samples', X.flatten(),
+                                              alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+        header = '\n\n'.join(header)
+
+        # Internalize variables to use in other functions
+        self.X = X
+        self.initial_centroids = initial_centroids
+        self.expected_centroids = expected_centroids
+
+        return header
+
+
+if __name__ == '__main__':
+    KmeansDataGen().main()
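The golden model delegates to sklearn's `KMeans` with a fixed `init` array and `max_iter`, and records `n_iter_` so the device runs exactly as many iterations as the reference did (sklearn may stop early on its convergence tolerance). For reference, a hand-rolled sketch of the single Lloyd iteration the device kernel implements, under the assumption that no cluster ends up empty:

```python
import numpy as np

# One Lloyd iteration: assignment step (nearest centroid by squared
# Euclidean distance) and update step (mean of the members). This is the
# computation kmeans.h distributes across cores; sklearn's KMeans adds
# tolerance-based early stopping on top.
def lloyd_iteration(samples, centroids):
    # Assignment step
    dists = ((samples[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
    membership = dists.argmin(axis=1)
    # Update step (assumes every cluster has at least one member)
    new_centroids = np.stack([
        samples[membership == k].mean(axis=0) for k in range(len(centroids))
    ])
    return membership, new_centroids

samples = np.random.rand(16, 2)
centroids = samples[:3].copy()  # seeding from samples avoids empty clusters
membership, centroids = lloyd_iteration(samples, centroids)
```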
diff --git a/sw/apps/kmeans/scripts/verify.py b/sw/apps/kmeans/scripts/verify.py
new file mode 100755
index 000000000..6cbc0730c
--- /dev/null
+++ b/sw/apps/kmeans/scripts/verify.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import sys
+from datagen import KmeansDataGen
+
+from snitch.util.sim.verif_utils import Verifier
+
+
+class KmeansVerifier(Verifier):
+
+    OUTPUT_UIDS = ['centroids']
+
+    def parser(self):
+        p = super().parser()
+        p.add_argument(
+            '--no-gui',
+            action='store_true',
+            help='Run without GUI')
+        return p
+
+    def __init__(self):
+        super().__init__()
+        # Get inputs
+        self.n_clusters = self.get_input_from_symbol('n_clusters', 'uint32_t')[0]
+        self.n_features = self.get_input_from_symbol('n_features', 'uint32_t')[0]
+
+    def get_actual_results(self):
+        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double')
+
+    def get_expected_results(self):
+        # Get inputs
+        max_iter = self.get_input_from_symbol('n_iter', 'uint32_t')[0]
+        n_samples = self.get_input_from_symbol('n_samples', 'uint32_t')[0]
+        self.initial_centroids = self.get_input_from_symbol('centroids', 'double')
+        self.samples = self.get_input_from_symbol('samples', 'double')
+        # Reshape arrays
+        self.initial_centroids = self.initial_centroids.reshape((self.n_clusters, self.n_features))
+        self.samples = self.samples.reshape((n_samples, self.n_features))
+        # Calculate expected results
+        final_centroids, _ = KmeansDataGen().golden_model(self.samples, self.n_clusters,
+                                                          self.initial_centroids, max_iter)
+        return final_centroids.flatten()
+
+    def check_results(self, *args):
+        return super().check_results(*args, rtol=1e-10)
+
+    def main(self):
+        retcode = super().main()
+
+        # Visualize results
+        expected_centroids = self.get_expected_results().reshape((self.n_clusters, self.n_features))
+        actual_centroids = self.get_actual_results().reshape((self.n_clusters, self.n_features))
+        if self.n_features == 2 and not self.args.no_gui:
+            KmeansDataGen().visualize_clusters(self.samples, self.initial_centroids,
+                                               "Initial centroids")
+            KmeansDataGen().visualize_clusters(self.samples, expected_centroids,
+                                               "Expected centroids")
+            KmeansDataGen().visualize_clusters(self.samples, actual_centroids,
+                                               "Actual centroids")
+
+        return retcode
+
+
+if __name__ == "__main__":
+    sys.exit(KmeansVerifier().main())
diff --git a/sw/apps/kmeans/src/args.h b/sw/apps/kmeans/src/args.h
new file mode 100644
index 000000000..4596d241e
--- /dev/null
+++ b/sw/apps/kmeans/src/args.h
@@ -0,0 +1,17 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#pragma once
+#include <stdint.h>
+
+typedef struct {
+    uint32_t n_samples;
+    uint32_t n_features;
+    uint32_t n_clusters;
+    uint32_t n_iter;
+    uint64_t samples_addr;
+    uint64_t centroids_addr;
+} kmeans_args_t;
diff --git a/sw/apps/kmeans/src/kmeans.h b/sw/apps/kmeans/src/kmeans.h
new file mode 100644
index 000000000..0a4916b74
--- /dev/null
+++ b/sw/apps/kmeans/src/kmeans.h
@@ -0,0 +1,252 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include <stdint.h>
+
+#include "args.h"
+#include "math.h"
+#include "snrt.h"
+
+// Explicitly place in TCDM, through thread-local storage.
+// Otherwise every core loads it from DRAM.
+__thread double inf = INFINITY;
+
+double euclidean_distance_squared(uint32_t n_features, double* point1,
+                                  double* point2) {
+    double sum = 0;
+    for (uint32_t i = 0; i < n_features; i++) {
+        double diff = point1[i] - point2[i];
+        sum += diff * diff;
+    }
+    return sum;
+}
+
+static inline void kmeans_iteration(uint32_t n_samples_per_core,
+                                    uint32_t n_clusters, uint32_t n_features,
+                                    double* samples, uint32_t* membership,
+                                    uint32_t* partial_membership_cnt,
+                                    double* initial_centroids,
+                                    double* partial_centroids) {
+    // Distribute work
+    uint32_t start_sample_idx;
+    uint32_t end_sample_idx;
+    if (snrt_is_compute_core()) {
+        start_sample_idx = snrt_cluster_core_idx() * n_samples_per_core;
+        end_sample_idx = start_sample_idx + n_samples_per_core;
+
+        snrt_mcycle();
+
+        // Assignment step
+        for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+             centroid_idx++) {
+            partial_membership_cnt[centroid_idx] = 0;
+        }
+        snrt_fpu_fence();
+        for (uint32_t sample_idx = start_sample_idx;
+             sample_idx < end_sample_idx; sample_idx++) {
+            double min_dist = inf;
+            membership[sample_idx] = 0;
+
+            for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                 centroid_idx++) {
+                double dist = euclidean_distance_squared(
+                    n_features, &samples[sample_idx * n_features],
+                    &initial_centroids[centroid_idx * n_features]);
+                if (dist < min_dist) {
+                    min_dist = dist;
+                    membership[sample_idx] = centroid_idx;
+                }
+            }
+            partial_membership_cnt[membership[sample_idx]]++;
+        }
+    }
+
+    snrt_mcycle();
+
+    // snrt_global_barrier();
+
+    snrt_mcycle();
+
+    if (snrt_is_compute_core()) {
+        // Update step
+        for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+             centroid_idx++) {
+            for (uint32_t feature_idx = 0; feature_idx < n_features;
+                 feature_idx++) {
+                // Initialize centroids to zero
+                // TODO: Can be optimized w/ DMA
+                partial_centroids[centroid_idx * n_features + feature_idx] = 0;
+            }
+        }
+        snrt_fpu_fence();
+        for (uint32_t sample_idx = start_sample_idx;
+             sample_idx < end_sample_idx; sample_idx++) {
+            for (uint32_t feature_idx = 0; feature_idx < n_features;
+                 feature_idx++) {
+                partial_centroids[membership[sample_idx] * n_features +
+                                  feature_idx] +=
+                    samples[sample_idx * n_features + feature_idx];
+            }
+        }
+    }
+
+    snrt_mcycle();
+
+    snrt_cluster_hw_barrier();
+
+    snrt_mcycle();
+
+    if (snrt_is_compute_core()) {
+        if (snrt_cluster_core_idx() == 0) {
+            // Intra-cluster reduction
+            for (uint32_t core_idx = 1;
+                 core_idx < snrt_cluster_compute_core_num(); core_idx++) {
+                // Pointers to variables of the other core
+                uint32_t* remote_partial_membership_cnt =
+                    snrt_compute_core_local_ptr(partial_membership_cnt,
+                                                core_idx,
+                                                n_clusters * sizeof(uint32_t));
+                double* remote_partial_centroids = snrt_compute_core_local_ptr(
+                    partial_centroids, core_idx,
+                    n_clusters * n_features * sizeof(double));
+                for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                     centroid_idx++) {
+                    // Accumulate membership counters
+                    partial_membership_cnt[centroid_idx] +=
+                        remote_partial_membership_cnt[centroid_idx];
+                    // Accumulate centroid features
+                    for (uint32_t feature_idx = 0; feature_idx < n_features;
+                         feature_idx++) {
+                        partial_centroids[centroid_idx * n_features +
+                                          feature_idx] +=
+                            remote_partial_centroids[centroid_idx * n_features +
+                                                     feature_idx];
+                    }
+                }
+            }
+
+            snrt_mcycle();
+
+#if !defined(KMEANS_REDUCTION_ON_HOST)
+            snrt_inter_cluster_barrier();
+
+            if (snrt_cluster_idx() == 0) {
+                snrt_mcycle();
+
+                // Inter-cluster reduction
+                for (uint32_t cluster_idx = 1; cluster_idx < snrt_cluster_num();
+                     cluster_idx++) {
+                    // Pointers to variables of remote clusters
+                    uint32_t* remote_partial_membership_cnt =
+                        (uint32_t*)snrt_remote_l1_ptr(partial_membership_cnt, 0,
+                                                      cluster_idx);
+                    double* remote_partial_centroids =
+                        (double*)snrt_remote_l1_ptr(partial_centroids, 0,
+                                                    cluster_idx);
+                    for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                         centroid_idx++) {
+                        // Accumulate membership counters
+                        partial_membership_cnt[centroid_idx] +=
+                            remote_partial_membership_cnt[centroid_idx];
+                        // Accumulate centroid features
+                        for (uint32_t feature_idx = 0; feature_idx < n_features;
+                             feature_idx++) {
+                            partial_centroids[centroid_idx * n_features +
+                                              feature_idx] +=
+                                remote_partial_centroids[centroid_idx *
+                                                             n_features +
+                                                         feature_idx];
+                        }
+                    }
+                }
+
+                snrt_mcycle();
+
+                // Normalize
+                for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                     centroid_idx++) {
+                    for (uint32_t feature_idx = 0; feature_idx < n_features;
+                         feature_idx++) {
+                        partial_centroids[centroid_idx * n_features +
+                                          feature_idx] /=
+                            partial_membership_cnt[centroid_idx];
+                    }
+                }
+            }
+
+            snrt_mcycle();
+#endif
+        }
+    }
+}
+
+void kmeans_job(kmeans_args_t* args) {
+    snrt_mcycle();
+
+    // Aliases
+    uint32_t n_samples = args->n_samples;
+    uint32_t n_features = args->n_features;
+    uint32_t n_clusters = args->n_clusters;
+    uint32_t n_iter = args->n_iter;
+    void* samples = (void*)(args->samples_addr);
+    void* centroids = (void*)(args->centroids_addr);
+
+    // Distribute work
+    uint32_t n_samples_per_cluster = n_samples / snrt_cluster_num();
+    uint32_t n_samples_per_core =
+        n_samples_per_cluster / snrt_cluster_compute_core_num();
+
+    // Dynamically allocate space in TCDM
+    double* local_samples = snrt_l1_alloc_cluster_local(
+        n_samples_per_cluster * n_features * sizeof(double), sizeof(double));
+    double* local_centroids = snrt_l1_alloc_cluster_local(
+        n_clusters * n_features * sizeof(double), sizeof(double));
+    uint32_t* membership = snrt_l1_alloc_cluster_local(
+        n_samples_per_cluster * sizeof(uint32_t), sizeof(uint32_t));
+    uint32_t* partial_membership_cnt = snrt_l1_alloc_compute_core_local(
+        n_clusters * sizeof(uint32_t), sizeof(uint32_t));
+    // First core's partial centroids will store final centroids
+    double* partial_centroids = snrt_l1_alloc_compute_core_local(
+        n_clusters * n_features * sizeof(double), sizeof(double));
+    double* final_centroids = snrt_compute_core_local_ptr(
+        partial_centroids, 0, n_clusters * n_features * sizeof(double));
+    final_centroids =
+        snrt_remote_l1_ptr(final_centroids, snrt_cluster_idx(), 0);
+
+    snrt_mcycle();
+
+    // Transfer samples and initial centroids with DMA
+    size_t size;
+    size_t offset;
+    if (snrt_is_dm_core()) {
+        size = n_samples_per_cluster * n_features * sizeof(double);
+        offset = snrt_cluster_idx() * size;
+        snrt_dma_start_1d((void*)local_samples, samples + offset, size);
+        size = n_clusters * n_features * sizeof(double);
+        snrt_dma_start_1d((void*)local_centroids, centroids, size);
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    snrt_mcycle();
+
+    // Iterations of Lloyd's K-means algorithm
+    for (uint32_t iter_idx = 0; iter_idx < n_iter; iter_idx++) {
+        kmeans_iteration(n_samples_per_core, n_clusters, n_features,
+                         local_samples, membership, partial_membership_cnt,
+                         local_centroids, partial_centroids);
+        snrt_global_barrier();
+        local_centroids = final_centroids;
+        snrt_mcycle();
+    }
+
+    // Transfer final centroids with DMA
+    if (snrt_is_dm_core() && snrt_cluster_idx() == 0) {
+        snrt_dma_start_1d((void*)centroids, (void*)final_centroids, size);
+        snrt_dma_wait_all();
+    }
+}
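The partial sums and membership counters are combined in a two-level tree: core 0 of each cluster reduces over its compute cores, then cluster 0 reduces over clusters and normalizes by the accumulated counts. A NumPy model of that reduction, with illustrative shapes:

```python
import numpy as np

# Model of the two-level reduction in kmeans_iteration(): per-core partial
# feature sums and member counts are reduced within a cluster first, then
# cluster 0 reduces across clusters and divides sums by counts.
n_clusters, n_features = 3, 2
num_clusters, cores_per_cluster = 4, 8

partial_centroids = np.random.rand(num_clusters, cores_per_cluster,
                                   n_clusters, n_features)
partial_cnt = np.random.randint(1, 10,
                                (num_clusters, cores_per_cluster, n_clusters))

# Intra-cluster reduction (done by core 0 of each cluster)
cluster_sums = partial_centroids.sum(axis=1)
cluster_cnts = partial_cnt.sum(axis=1)

# Inter-cluster reduction and normalization (done by cluster 0)
centroids = cluster_sums.sum(axis=0) / cluster_cnts.sum(axis=0)[:, None]
```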
diff --git a/sw/apps/kmeans/src/main.c b/sw/apps/kmeans/src/main.c
new file mode 100644
index 000000000..cc27c9604
--- /dev/null
+++ b/sw/apps/kmeans/src/main.c
@@ -0,0 +1,17 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include <stdint.h>
+
+#include "data.h"
+#include "kmeans.h"
+
+int main() {
+    kmeans_args_t args = {n_samples, n_features, n_clusters,
+                          n_iter, (uint64_t)samples, (uint64_t)centroids};
+    kmeans_job(&args);
+    return 0;
+}
diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py
index 38634dd5e..30b179fec 100755
--- a/sw/blas/axpy/scripts/datagen.py
+++ b/sw/blas/axpy/scripts/datagen.py
@@ -21,7 +21,7 @@ class AxpyDataGen(du.DataGen):
     def golden_model(self, a, x, y):
         return a*x + y
 
-    def validate_config(self, **kwargs):
+    def validate(self, **kwargs):
         assert kwargs['n'] % kwargs['n_tiles'] == 0, "n must be an integer multiple of n_tiles"
         n_per_tile = kwargs['n'] // kwargs['n_tiles']
         assert (n_per_tile % 8) == 0, "n must be an integer multiple of the number of cores"
@@ -36,7 +36,7 @@ def validate_config(self, **kwargs):
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
-        self.validate_config(**kwargs)
+        self.validate(**kwargs)
 
         a = du.generate_random_array(1)[0]
         x = du.generate_random_array(kwargs['n'])
diff --git a/sw/blas/blas.h b/sw/blas/blas.h
index 69005ccb7..20bc44292 100644
--- a/sw/blas/blas.h
+++ b/sw/blas/blas.h
@@ -20,4 +20,5 @@ static inline double multiply_opt(double multiplicand, double multiplier) {
 #include "axpy/src/axpy.h"
 #include "dot/src/dot.h"
 #include "gemm/src/gemm.h"
+#include "gemv/src/gemv.h"
 #include "syrk/src/syrk.h"
diff --git a/sw/blas/gemm/scripts/datagen.py b/sw/blas/gemm/scripts/datagen.py
index 2eb6e2f4d..f4e22ddb1 100755
--- a/sw/blas/gemm/scripts/datagen.py
+++ b/sw/blas/gemm/scripts/datagen.py
@@ -42,9 +42,9 @@ def infer_implementation(self, gemm_fp):
         prec, impl = re.search(r'gemm_fp(\d+)_(\w+)', gemm_fp).group(1, 2)
         return (int(prec) / 8), impl
 
-    def validate_config(self, gemm_fp, parallelize_m,
-                        parallelize_k, m_tiles, n_tiles, k_tiles, transa,
-                        transb, M, N, K, beta, **kwargs):
+    def validate(self, gemm_fp, parallelize_m,
+                 parallelize_k, m_tiles, n_tiles, k_tiles, transa,
+                 transb, M, N, K, beta, **kwargs):
         frac_m = M / m_tiles
         frac_n = N / n_tiles
         frac_k = K / k_tiles
@@ -90,7 +90,7 @@ def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         # Validate parameters
-        self.validate_config(**kwargs)
+        self.validate(**kwargs)
 
         M, N, K = kwargs['M'], kwargs['N'], kwargs['K']
 
@@ -98,9 +98,9 @@ def emit_header(self, **kwargs):
 
         ctype = du.ctype_from_precision_t(prec)
 
-        a = du.generate_random_array((M, K), prec)
-        b = du.generate_random_array((K, N), prec)
-        c = du.generate_random_array((M, N), prec)
+        a = du.generate_random_array((M, K), prec, seed=42)
+        b = du.generate_random_array((K, N), prec, seed=42)
+        c = du.generate_random_array((M, N), prec, seed=42)
         result = self.exact_golden_model(1, a, b, kwargs['beta'], c)
 
         # Store matrices in transposed form if requested
diff --git a/sw/blas/gemm/scripts/verify.py b/sw/blas/gemm/scripts/verify.py
index 353ea1328..a89b4dbc3 100755
--- a/sw/blas/gemm/scripts/verify.py
+++ b/sw/blas/gemm/scripts/verify.py
@@ -18,7 +18,7 @@ class GemmVerifier(Verifier):
     OUTPUT_UIDS = ['c']
     ERR_THRESHOLD = {
         1: 1e-4,
-        2: 8e-2,
+        2: 5e-1,
         4: 1e-3,
         8: 1e-3
     }
diff --git a/sw/blas/gemv/data/params.json b/sw/blas/gemv/data/params.json
new file mode 100644
index 000000000..85bf17a44
--- /dev/null
+++ b/sw/blas/gemv/data/params.json
@@ -0,0 +1,10 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    alpha: 2,
+    trans: false,
+    m: 13,
+    n: 16
+}
diff --git a/sw/blas/gemv/scripts/datagen.py b/sw/blas/gemv/scripts/datagen.py
new file mode 100755
index 000000000..7c5c060a0
--- /dev/null
+++ b/sw/blas/gemv/scripts/datagen.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Authors: Luca Colagrande
+
+import numpy as np
+import sys
+
+import snitch.util.sim.data_utils as du
+
+
+np.random.seed(42)
+
+
+class GemvDataGen(du.DataGen):
+
+    # AXI splits bursts crossing 4KB address boundaries. To minimize
+    # the occurrence of these splits, the data should be aligned to 4KB.
+    BURST_ALIGNMENT = 4096
+
+    def golden_model(self, alpha, a, x):
+        return alpha * np.matmul(a, x).flatten()
+
+    def emit_header(self, **kwargs):
+        header = [super().emit_header()]
+
+        m, n, alpha = kwargs['m'], kwargs['n'], kwargs['alpha']
+
+        a = du.generate_random_array((m, n))
+        x = du.generate_random_array((n, 1))
+        y = self.golden_model(alpha, a, x)
+
+        # Store matrix in transposed form if requested
+        a = a.T if kwargs['trans'] else a
+
+        a_uid = 'a'
+        x_uid = 'x'
+        y_uid = 'y'
+
+        cfg = {
+            **kwargs,
+            'a': a_uid,
+            'x': x_uid,
+            'y': y_uid,
+        }
+
+        a = a.flatten()
+        x = x.flatten()
+
+        header += [du.format_array_declaration('double', a_uid, a.shape)]
+        header += [du.format_array_declaration('double', x_uid, x.shape)]
+        header += [du.format_array_declaration('double', y_uid, y.shape)]
+        header += [du.format_struct_definition('gemv_args_t', 'args', cfg)]
+        header += [du.format_array_definition('double', a_uid, a,
+                                              section=kwargs['section'])]
+        header += [du.format_array_definition('double', x_uid, x,
+                                              section=kwargs['section'])]
+        result_def = du.format_array_definition('double', 'result', y)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
+        header = '\n\n'.join(header)
+
+        return header
+
+
+if __name__ == "__main__":
+    sys.exit(GemvDataGen().main())
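The golden model is simply y = alpha * A @ x; the only subtlety is the `trans` flag, under which A is stored column-major (A.T flattened) and the kernel recovers the row-major view through its leading dimension. A NumPy sketch of that indexing, mirroring the kernel's `lda = m` stride in the transposed case:

```python
import numpy as np

# What the trans flag changes: the datagen stores A.T flattened, so element
# A[i, j] sits at offset j * m + i, which is exactly the strided access the
# kernel performs with lda = m. Sizes follow params.json.
m, n, alpha = 13, 16, 2.0
a = np.random.rand(m, n)
x = np.random.rand(n)
y_ref = alpha * (a @ x)

a_flat = a.T.flatten()  # trans == 1 storage
y = np.array([alpha * sum(a_flat[j * m + i] * x[j] for j in range(n))
              for i in range(m)])

assert np.allclose(y, y_ref)
```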
diff --git a/sw/blas/gemv/scripts/verify.py b/sw/blas/gemv/scripts/verify.py
new file mode 100755
index 000000000..0a42c69a1
--- /dev/null
+++ b/sw/blas/gemv/scripts/verify.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import numpy as np
+import sys
+from datagen import GemvDataGen
+
+from snitch.util.sim.verif_utils import Verifier
+
+
+class GemvVerifier(Verifier):
+
+    OUTPUT_UIDS = ['y']
+
+    def __init__(self):
+        super().__init__()
+        self.func_args = {
+            'alpha': 'd',
+            'trans': 'I',
+            'm': 'I',
+            'n': 'I',
+            'a': 'I',
+            'x': 'I',
+            'y': 'I'
+        }
+        self.func_args = self.get_input_from_symbol('args', self.func_args)
+
+    def get_actual_results(self):
+        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double')
+
+    def get_expected_results(self):
+        a = self.get_input_from_symbol('a', 'double')
+        x = self.get_input_from_symbol('x', 'double')
+        trans = self.func_args['trans']
+        m = self.func_args['m']
+        n = self.func_args['n']
+        alpha = self.func_args['alpha']
+
+        if trans:
+            a = np.reshape(a, (n, m))
+            a = a.transpose()
+        else:
+            a = np.reshape(a, (m, n))
+        return GemvDataGen().golden_model(alpha, a, x)
+
+    def check_results(self, *args):
+        return super().check_results(*args, rtol=1e-9)
+
+
+if __name__ == "__main__":
+    sys.exit(GemvVerifier().main())
diff --git a/sw/blas/gemv/src/gemv.h b/sw/blas/gemv/src/gemv.h
new file mode 100644
index 000000000..2f13f94cf
--- /dev/null
+++ b/sw/blas/gemv/src/gemv.h
@@ -0,0 +1,93 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include <stdint.h>
+
+#include "snrt.h"
+
+#pragma once
+
+typedef struct {
+    double alpha;
+    uint32_t trans;
+    uint32_t m;
+    uint32_t n;
+    double *a;
+    double *x;
+    double *y;
+} gemv_args_t;
+
+static inline void single_core_gemv(uint32_t trans, uint32_t m, uint32_t n,
+                                    double alpha, double *a, uint32_t lda,
+                                    double *x, uint32_t incx, double *y) {
+    // Configure SSR 0 to stream a
+    uint32_t ssr0_b[2] = {n, m};
+    if (trans) {
+        uint32_t ssr0_i[2] = {lda * 8, 8};
+        snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_i[0],
+                         ssr0_i[1]);
+    } else {
+        uint32_t ssr0_i[2] = {8, lda * 8};
+        snrt_ssr_loop_2d(SNRT_SSR_DM0, ssr0_b[0], ssr0_b[1], ssr0_i[0],
+                         ssr0_i[1]);
+    }
+
+    // Configure SSR 1 to stream x
+    uint32_t ssr1_b[2] = {n, m};
+    uint32_t ssr1_i[2] = {8 * incx, 0};
+    snrt_ssr_loop_2d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_i[0], ssr1_i[1]);
+
+    // Enable SSRs
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_2D, a);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_2D, x);
+    snrt_ssr_enable();
+
+    for (uint32_t i = 0; i < m; i++) {
+        double acc = 0.0;
+
+        asm volatile(
+            "fld ft3, 0(%[alpha]) \n"
+            "frep.o %[n_frep], 1, 0, 0 \n"
+            "fmadd.d %[acc], ft0, ft1, %[acc] \n"
+            "fmul.d %[acc], %[acc], ft3 \n"
+            : [ acc ] "+f"(acc)
+            : [ n_frep ] "r"(n - 1), [ alpha ] "r"(&alpha)
+            : "ft0", "ft1", "ft2", "ft3", "memory");
+
+        y[i] = acc;
+    }
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+}
+
+// In contrast with BLAS we accept incx==0, as could be used e.g.
+// to compress vectors with a single value.
+static inline void gemv(uint32_t trans, uint32_t m, uint32_t n, double alpha,
+                        double *a, double *x, uint32_t incx, double *y) {
+    uint32_t frac_m, rem_m, start_m, core_m, lda;
+    double *core_a;
+
+    // Distribute rows to cores in cluster.
+    // Last core computes remainder rows
+    frac_m = m / snrt_cluster_compute_core_num();
+    rem_m = m % snrt_cluster_compute_core_num();
+    start_m = snrt_cluster_core_idx() * frac_m;
+    core_m = snrt_cluster_core_idx() == (snrt_cluster_compute_core_num() - 1)
+                 ? frac_m + rem_m
+                 : frac_m;
+    if (trans) {
+        lda = m;
+        core_a = &a[start_m];
+    } else {
+        lda = n;
+        core_a = &a[start_m * lda];
+    }
+
+    // Every core computes its portion of rows
+    if (core_m > 0)
+        single_core_gemv(trans, core_m, n, alpha, core_a, lda, x, incx,
+                         &y[start_m]);
+}
diff --git a/sw/blas/gemv/src/main.c b/sw/blas/gemv/src/main.c
new file mode 100644
index 000000000..ce0694db5
--- /dev/null
+++ b/sw/blas/gemv/src/main.c
@@ -0,0 +1,47 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include "gemv.h"
+
+#include "data.h"
+#include "snrt.h"
+
+int main() {
+    uint32_t trans = args.trans;
+    uint32_t m = args.m;
+    uint32_t n = args.n;
+    double alpha = args.alpha;
+    double *a = args.a;
+    double *x = args.x;
+    double *y = args.y;
+
+    uint32_t size_a = m * n * sizeof(double);
+    uint32_t size_x = n * sizeof(double);
+    uint32_t size_y = m * sizeof(double);
+
+    double *local_a = snrt_l1_alloc_cluster_local(size_a, sizeof(double));
+    double *local_x = snrt_l1_alloc_cluster_local(size_x, sizeof(double));
+    double *local_y = snrt_l1_alloc_cluster_local(size_y, sizeof(double));
+
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_a, a, size_a);
+        snrt_dma_start_1d(local_x, x, size_x);
+    }
+
+    snrt_cluster_hw_barrier();
+
+    if (snrt_is_compute_core()) {
+        gemv(trans, m, n, alpha, local_a, local_x, 1, local_y);
+    }
+
+    snrt_cluster_hw_barrier();
+
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(y, local_y, size_y);
+    }
+
+    return 0;
+}
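Unlike the older kernels, `gemv()` does not require m to divide evenly among the cores: every core gets floor(m / num_cores) rows and the last core additionally takes the remainder, which is why the atax datagen could drop its M-divisibility assert. A quick model of that distribution, with m = 13 as in params.json:

```python
# Model of gemv()'s row distribution across the 8 compute cores of a
# cluster: each core gets frac_m rows, the last core also takes rem_m,
# so all m rows are covered exactly once.
num_cores = 8
m = 13
frac_m, rem_m = divmod(m, num_cores)

rows = []
for core_idx in range(num_cores):
    start_m = core_idx * frac_m
    core_m = frac_m + rem_m if core_idx == num_cores - 1 else frac_m
    rows += list(range(start_m, start_m + core_m))

assert rows == list(range(m))
```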
diff --git a/sw/dnn/flashattention_2/scripts/datagen.py b/sw/dnn/flashattention_2/scripts/datagen.py
index f54ff7a08..109f10e82 100755
--- a/sw/dnn/flashattention_2/scripts/datagen.py
+++ b/sw/dnn/flashattention_2/scripts/datagen.py
@@ -138,7 +138,7 @@ def exact_flexfloat_golden_model(Q, K, V, B_r, B_c, desc):
 
 # Verify layer parameters are valid
-def validate_config(L, S, d, B_r, B_c, dtype, baseline, gemm_impl):
+def validate(L, S, d, B_r, B_c, dtype, baseline, gemm_impl):
     assert (L % B_r) == 0, 'L is not an integer multiple of B_r'
     assert (S % B_c) == 0, 'S is not an integer multiple of B_c'
     assert dtype != 'FP64', 'FP64 precision is not supported yet'
@@ -164,20 +164,20 @@ def validate(L, S, d, B_r, B_c, dtype, baseline, gemm_impl):
     data_utils.validate_tcdm_footprint(total_size)
 
     # Q*K^t
-    gemm.GemmDataGen().validate_config(
+    gemm.GemmDataGen().validate(
         gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1, n_tiles=1,
         k_tiles=1, transa=0, transb=1, M=B_r, N=B_c, K=d, beta=0
     )
 
     # P*V
     if baseline:
-        gemm.GemmDataGen().validate_config(
+        gemm.GemmDataGen().validate(
             gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1,
             n_tiles=1, k_tiles=1, transa=0, transb=0, M=B_r, N=d, K=B_c, beta=1
         )
     else:
         # P*(V^t)^t
-        gemm.GemmDataGen().validate_config(
+        gemm.GemmDataGen().validate(
             gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1,
             n_tiles=1, k_tiles=1, transa=0, transb=1, M=B_r, N=d, K=B_c, beta=1
         )
@@ -204,7 +204,7 @@ def emit_header(section, params):
     prec = params['dtype']
     gemm_impl = get_gemm_implementation(params)
 
-    validate_config(gemm_impl=gemm_impl, **params)
+    validate(gemm_impl=gemm_impl, **params)
 
     # torch_type = data_utils.torch_type_from_precision_t(prec)
     ff_desc = data_utils.ff_desc_from_precision_t(prec)
diff --git a/sw/dnn/layernorm/scripts/datagen.py b/sw/dnn/layernorm/scripts/datagen.py
index 37f1fb3b5..4868546d1 100755
--- a/sw/dnn/layernorm/scripts/datagen.py
+++ b/sw/dnn/layernorm/scripts/datagen.py
@@ -46,7 +46,7 @@ def golden_model_torch(ifmap, eps, shape):
     return ln(ifmap)
 
 
-def validate_config(**kwargs):
+def validate(**kwargs):
     # Aliases
     batch_size = kwargs['input_dim']['batch_size']
     seq_len = kwargs['input_dim']['seq_len']
@@ -74,7 +74,7 @@ def validate(**kwargs):
 
 def emit_header(**kwargs):
     # Validate parameters
-    validate_config(**kwargs)
+    validate(**kwargs)
 
     batch_size = kwargs['input_dim']['batch_size']
     seq_len = kwargs['input_dim']['seq_len']
diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk
index e4456fdfc..6083a8ee1 100644
--- a/target/snitch_cluster/sw.mk
+++ b/target/snitch_cluster/sw.mk
@@ -50,6 +50,7 @@ include sw/tests/tests.mk
 APPS  = sw/apps/nop
 APPS += sw/apps/blas/axpy
 APPS += sw/apps/blas/gemm
+APPS += sw/apps/blas/gemv
 APPS += sw/apps/blas/dot
 APPS += sw/apps/blas/syrk
 APPS += sw/apps/dnn/batchnorm
@@ -68,6 +69,7 @@ APPS += sw/apps/atax
 APPS += sw/apps/correlation
 APPS += sw/apps/covariance
 APPS += sw/apps/doitgen
+APPS += sw/apps/kmeans
 
 # Include Makefile from each app subdirectory
 $(foreach app,$(APPS), \
diff --git a/target/snitch_cluster/sw/apps/atax/app.mk b/target/snitch_cluster/sw/apps/atax/app.mk
index 87e94ca8a..5c37ae044 100644
--- a/target/snitch_cluster/sw/apps/atax/app.mk
+++ b/target/snitch_cluster/sw/apps/atax/app.mk
@@ -8,6 +8,7 @@ APP := atax
 $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build
 SRC_DIR := $(ROOT)/sw/apps/$(APP)/src
 SRCS := $(SRC_DIR)/main.c
+$(APP)_INCDIRS := $(ROOT)/sw/blas
 
 include $(ROOT)/sw/apps/common.mk
 include $(ROOT)/target/snitch_cluster/sw/apps/common.mk
diff --git a/target/snitch_cluster/sw/apps/blas/gemv/app.mk b/target/snitch_cluster/sw/apps/blas/gemv/app.mk
new file mode 100644
index 000000000..5e9aa7c06
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/blas/gemv/app.mk
@@ -0,0 +1,13 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+APP := gemv
+$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build
+SRC_DIR := $(ROOT)/sw/blas/$(APP)/src
+SRCS := $(SRC_DIR)/main.c
+
+include $(ROOT)/sw/apps/common.mk
+include $(ROOT)/target/snitch_cluster/sw/apps/common.mk
diff --git a/target/snitch_cluster/sw/apps/kmeans/app.mk b/target/snitch_cluster/sw/apps/kmeans/app.mk
new file mode 100644
index 000000000..0ae7646b4
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/kmeans/app.mk
@@ -0,0 +1,14 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+APP := kmeans
+$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build
+SRC_DIR := $(ROOT)/sw/apps/$(APP)/src
+SRCS := $(SRC_DIR)/main.c
+$(APP)_DATAGEN_ARGS = --no-gui
+
+include $(ROOT)/sw/apps/common.mk
+include $(ROOT)/target/snitch_cluster/sw/apps/common.mk
diff --git a/target/snitch_cluster/sw/fdiv.yaml b/target/snitch_cluster/sw/fdiv.yaml
index d6b7aea3b..473876b6e 100644
--- a/target/snitch_cluster/sw/fdiv.yaml
+++ b/target/snitch_cluster/sw/fdiv.yaml
@@ -13,3 +13,5 @@ runs:
       cmd: [../../../sw/dnn/flashattention_2/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/correlation/build/correlation.elf
     cmd: [../../../sw/apps/correlation/scripts/verify.py, "${sim_bin}", "${elf}"]
+  - elf: apps/kmeans/build/kmeans.elf
+    cmd: [../../../sw/apps/kmeans/scripts/verify.py, "${sim_bin}", "${elf}"]
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index d9e2f8c2f..65a2f8067 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -95,9 +95,11 @@ runs:
   - elf: apps/dnn/transpose/build/transpose.elf
     cmd: [../../../sw/dnn/transpose/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf
-  # - elf: apps/atax/build/atax.elf
-  #   cmd: [../../../sw/apps/atax/scripts/verify.py, "${sim_bin}", "${elf}"]
+  - elf: apps/atax/build/atax.elf
+    cmd: [../../../sw/apps/atax/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/covariance/build/covariance.elf
     cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/doitgen/build/doitgen.elf
     cmd: [../../../sw/apps/doitgen/scripts/verify.py, "${sim_bin}", "${elf}"]
+  - elf: apps/blas/gemv/build/gemv.elf
+    cmd: [../../../sw/blas/gemv/scripts/verify.py, "${sim_bin}", "${elf}"]
diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py
index 3b732c5cc..fce39a9b5 100644
--- a/util/sim/data_utils.py
+++ b/util/sim/data_utils.py
@@ -118,7 +118,7 @@ def ctype_from_precision_t(prec):
     return precision_t_to_ctype_map[_integer_precision_t(prec)]
 
 
-def generate_random_array(size, prec='FP64'):
+def generate_random_array(size, prec='FP64', seed=None):
     """Consistent random array generation for Snitch experiments.
 
     Samples values between -1 and 1 from a uniform distribution and
@@ -133,7 +133,7 @@ def generate_random_array(size, prec='FP64'):
         (e.g. "FP64") and integer enumeration values (e.g. 8).
     """
     # Generate in 64b precision and then cast down
-    rand = np.random.default_rng().random(size=size, dtype=np.float64) * 2 - 1
+    rand = np.random.default_rng(seed=seed).random(size=size, dtype=np.float64) * 2 - 1
     # Generate FlexFloat array for 8b floats, casted from 16b Numpy array
     if _integer_precision_t(prec) == 1:
         return ff.array(rand.astype(np.float16), ff_desc_from_precision_t(prec))
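The new `seed` parameter makes header generation reproducible: with `seed=None` every call draws fresh OS entropy, so two datagen runs produce different operands. A minimal sketch of the difference (mirroring `generate_random_array`, minus the precision casting):

```python
import numpy as np

# Seeded generation pins the random stream; unseeded does not.
def rand_array(size, seed=None):
    return np.random.default_rng(seed=seed).random(size=size) * 2 - 1

assert np.array_equal(rand_array(8, seed=42), rand_array(8, seed=42))
```

Note that passing the same seed to consecutive calls, as the GEMM datagen now does for a, b and c, makes those three operands identical; presumably this is acceptable for verification purposes, but it is worth being aware of.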
diff --git a/util/sim/verif_utils.py b/util/sim/verif_utils.py
index 90f4c6129..ef0caf8c7 100644
--- a/util/sim/verif_utils.py
+++ b/util/sim/verif_utils.py
@@ -24,6 +24,8 @@ def dump_results_to_csv(expected_results, actual_results, error, max_error, path):
     size), flattens them, and dumps them to a CSV file. Each array is
     mapped to a different column of the CSV, in the same order as they
     appear as arguments in the function signature.
+    Also dumps an additional column, indicating pass (or failure) status,
+    calculated as `error <= max_error`.
 
     Args:
         expected_results: Array of expected results.
@@ -38,11 +40,12 @@ def dump_results_to_csv(expected_results, actual_results, error, max_error, path):
     # Flatten and zip arrays
     arrays = (expected_results, actual_results, error, max_error)
     flattened = [flatten(arr) for arr in arrays]
+    flattened.append((flattened[2] <= flattened[3]).astype(bool).astype(str))
    zipped = np.column_stack(flattened)
 
     # Write row-by-row to CSV file
     with open(path, 'w') as csv_file:
         csv_writer = csv.writer(csv_file)
-        csv_writer.writerow(['expected', 'actual', 'error', 'max_error'])
+        csv_writer.writerow(['expected', 'actual', 'error', 'max_error', 'pass'])
         for row in zipped:
             csv_writer.writerow(row)
     # Print path where results were written
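Since the pass column is stored as the strings 'True'/'False' (numpy bools cast via `astype(str)`), failing elements can be filtered from the dump without recomputing the error. A hypothetical post-processing snippet, assuming the CSV was written to a path like `results.csv`:

```python
import csv

# Filter failing elements from the CSV emitted by dump_results_to_csv().
# 'results.csv' is an assumed output path for illustration.
with open('results.csv') as f:
    failures = [row for row in csv.DictReader(f) if row['pass'] != 'True']

for row in failures:
    print(f"expected {row['expected']}, got {row['actual']} "
          f"(error {row['error']} > {row['max_error']})")
```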