From eb38f12606436d549e7dbb628cb078ee58fb1e76 Mon Sep 17 00:00:00 2001
From: Luca Colagrande
Date: Fri, 26 Jan 2024 09:37:17 +0100
Subject: [PATCH] sw: Add K-means app

---
 python-requirements.txt                       |   3 +
 sw/apps/kmeans/.gitignore                     |   1 +
 sw/apps/kmeans/Makefile                       |  30 +++
 sw/apps/kmeans/data/datagen.py                | 139 ++++++++++++
 sw/apps/kmeans/data/params.json               |  11 +
 sw/apps/kmeans/src/kmeans.h                   | 199 ++++++++++++++++++
 sw/apps/kmeans/src/main.c                     |  15 ++
 sw/apps/kmeans/verify.py                      |  69 ++++++
 target/snitch_cluster/sw/apps/Makefile        |   1 +
 target/snitch_cluster/sw/apps/kmeans/Makefile |  10 +
 target/snitch_cluster/sw/fdiv.yaml            |   2 +
 11 files changed, 480 insertions(+)
 create mode 100644 sw/apps/kmeans/.gitignore
 create mode 100644 sw/apps/kmeans/Makefile
 create mode 100755 sw/apps/kmeans/data/datagen.py
 create mode 100644 sw/apps/kmeans/data/params.json
 create mode 100644 sw/apps/kmeans/src/kmeans.h
 create mode 100644 sw/apps/kmeans/src/main.c
 create mode 100755 sw/apps/kmeans/verify.py
 create mode 100644 target/snitch_cluster/sw/apps/kmeans/Makefile

diff --git a/python-requirements.txt b/python-requirements.txt
index 6db0bf03f6..6117b40343 100644
--- a/python-requirements.txt
+++ b/python-requirements.txt
@@ -8,14 +8,17 @@ editorconfig-checker==2.3.51
 flake8
 gitpython
 hjson
+json5
 jsonref
 jsonschema
 mako
+matplotlib
 progressbar2
 tabulate
 yamllint
 pyyaml
 pytablewriter
+scikit-learn
 termcolor
 pandas
 pyelftools
diff --git a/sw/apps/kmeans/.gitignore b/sw/apps/kmeans/.gitignore
new file mode 100644
index 0000000000..8485f615ee
--- /dev/null
+++ b/sw/apps/kmeans/.gitignore
@@ -0,0 +1 @@
+data/data.h
\ No newline at end of file
diff --git a/sw/apps/kmeans/Makefile b/sw/apps/kmeans/Makefile
new file mode 100644
index 0000000000..cf379ddcdb
--- /dev/null
+++ b/sw/apps/kmeans/Makefile
@@ -0,0 +1,30 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+# Usage of absolute paths is required to externally include this Makefile
+MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+DATA_DIR := $(realpath $(MK_DIR)/data)
+SRC_DIR := $(realpath $(MK_DIR)/src)
+
+DATA_CFG ?= $(DATA_DIR)/params.json
+SECTION ?=
+
+APP ?= kmeans
+SRCS ?= $(realpath $(SRC_DIR)/main.c)
+INCDIRS ?= $(DATA_DIR) $(SRC_DIR)
+
+DATAGEN_PY = $(DATA_DIR)/datagen.py
+DATA_H = $(DATA_DIR)/data.h
+
+$(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
+	$< -c $(DATA_CFG) --no-gui --section="$(SECTION)" > $@
+
+.PHONY: clean-data clean
+
+clean-data:
+	rm -f $(DATA_H)
+
+clean: clean-data
diff --git a/sw/apps/kmeans/data/datagen.py b/sw/apps/kmeans/data/datagen.py
new file mode 100755
index 0000000000..b57fdb5360
--- /dev/null
+++ b/sw/apps/kmeans/data/datagen.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Authors: Luca Colagrande
+
+import argparse
+import hjson
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import pathlib
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans
+import sys
+
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+from data_utils import emit_license, format_scalar_definition, \
+    format_vector_definition  # noqa: E402
+
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+
+def golden_model(samples, n_clusters, initial_centroids, max_iter):
+    """Run scikit-learn K-means starting from the given initial centroids.
+
+    Returns the fitted cluster centers and the number of iterations run.
+    """
+    # Apply k-means clustering
+    kmeans = KMeans(
+        n_clusters=n_clusters,
+        init=initial_centroids,
+        max_iter=max_iter
+    )
+    kmeans.fit(samples)
+    return kmeans.cluster_centers_, kmeans.n_iter_
+
+
+def visualize_clusters(samples, centroids, title=None):
+    """Scatter-plot the first two features of samples and centroids."""
+    plt.scatter(samples[:, 0], samples[:, 1], s=30)
+    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200, linewidths=3, color='red')
+    if not title:
+        title = "K-means clusters"
+    plt.title(title)
+    plt.xlabel("Feature 1")
+    plt.ylabel("Feature 2")
+    plt.show()
+
+
+def emit_header(**kwargs):
+    """Generate the dataset and return the contents of the C data header.
+
+    Reads n_samples, n_features, n_clusters, max_iter, seed, no_gui and
+    section from kwargs.
+    """
+
+    # Aliases
+    n_samples = kwargs['n_samples']
+    n_features = kwargs['n_features']
+    n_clusters = kwargs['n_clusters']
+    max_iter = kwargs['max_iter']
+    seed = kwargs['seed']
+    gui = not kwargs['no_gui']
+
+    # Generate random samples
+    X, _ = make_blobs(
+        n_samples=n_samples,
+        n_features=n_features,
+        centers=n_clusters,
+        random_state=seed
+    )
+
+    # Generate initial centroids randomly
+    rng = np.random.default_rng(seed=seed)
+    initial_centroids = rng.uniform(low=X.min(axis=0), high=X.max(axis=0),
+                                    size=(n_clusters, n_features))
+
+    # Visualize the generated samples
+    if gui:
+        visualize_clusters(X, initial_centroids)
+
+    # Apply k-means clustering
+    centers, n_iter = golden_model(X, n_clusters, initial_centroids, max_iter)
+
+    # Visualize the clusters
+    if gui:
+        visualize_clusters(X, centers)
+
+    # Generate header
+    data_str = [emit_license()]
+    data_str += [format_scalar_definition('uint32_t', 'n_samples', n_samples)]
+    data_str += [format_scalar_definition('uint32_t', 'n_features', n_features)]
+    data_str += [format_scalar_definition('uint32_t', 'n_clusters', n_clusters)]
+    data_str += [format_scalar_definition('uint32_t', 'n_iter', n_iter)]
+    data_str += [format_vector_definition('double', 'centroids', initial_centroids.flatten(),
+                                          alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+    data_str += [format_vector_definition('double', 'samples', X.flatten(),
+                                          alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+    data_str = '\n\n'.join(data_str)
+    return data_str
+
+
+def main():
+    """Parse the command line and print the generated header."""
+
+    parser = argparse.ArgumentParser(description='Generate data for kernels')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file kernel')
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        '--no-gui',
+        action='store_true',
+        help='Run without visualization')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+    param['no_gui'] = args.no_gui
+
+    # Emit header file
+    print(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sw/apps/kmeans/data/params.json b/sw/apps/kmeans/data/params.json
new file mode 100644
index 0000000000..b2f50d01d0
--- /dev/null
+++ b/sw/apps/kmeans/data/params.json
@@ -0,0 +1,11 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    n_clusters: 3,
+    n_features: 2,
+    n_samples: 128,
+    max_iter: 3,
+    seed: 42
+}
diff --git a/sw/apps/kmeans/src/kmeans.h b/sw/apps/kmeans/src/kmeans.h
new file mode 100644
index 0000000000..ba4aacf91a
--- /dev/null
+++ b/sw/apps/kmeans/src/kmeans.h
@@ -0,0 +1,199 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include <stdint.h>
+
+#include "math.h"
+#include "snrt.h"
+
+// Returns the squared Euclidean distance between two points with
+// `n_features` coordinates each.
+double euclidean_distance_squared(uint32_t n_features, double* point1,
+                                  double* point2) {
+    double sum = 0;
+    for (uint32_t i = 0; i < n_features; i++) {
+        double diff = point1[i] - point2[i];
+        sum += diff * diff;
+    }
+    return sum;
+}
+
+// Runs `n_iter` iterations of Lloyd's K-means algorithm. `samples` holds
+// n_samples x n_features values and `centroids` n_clusters x n_features
+// values; `centroids` provides the initial centroids and is overwritten
+// with the final ones. Samples are split evenly with integer division, so
+// n_samples should be a multiple of the total number of compute cores.
+void kmeans(uint32_t n_samples, uint32_t n_features, uint32_t n_clusters,
+            uint32_t n_iter, double* samples, double* centroids) {
+    // Distribute work across clusters
+    uint32_t n_samples_per_cluster = n_samples / snrt_cluster_num();
+
+    // Dynamically allocate space in TCDM
+    double* local_samples = snrt_l1_alloc_cluster_local(
+        n_samples_per_cluster * n_features * sizeof(double), sizeof(double));
+    double* local_centroids = snrt_l1_alloc_cluster_local(
+        n_clusters * n_features * sizeof(double), sizeof(double));
+    uint32_t* membership = snrt_l1_alloc_cluster_local(
+        n_samples_per_cluster * sizeof(uint32_t), sizeof(uint32_t));
+    uint32_t* partial_membership_cnt = snrt_l1_alloc_compute_core_local(
+        n_clusters * sizeof(uint32_t), sizeof(uint32_t));
+    // First core's partial centroids will store final centroids
+    double* final_centroids = snrt_l1_next();
+    double* partial_centroids = snrt_l1_alloc_compute_core_local(
+        n_clusters * n_features * sizeof(double), sizeof(double));
+
+    // Transfer samples and initial centroids with DMA
+    size_t size;
+    size_t offset;
+    if (snrt_is_dm_core()) {
+        size = n_samples_per_cluster * n_features * sizeof(double);
+        offset = snrt_cluster_idx() * size;
+        snrt_dma_start_1d((void*)local_samples, (void*)samples + offset, size);
+        size = n_clusters * n_features * sizeof(double);
+        snrt_dma_start_1d((void*)local_centroids, (void*)centroids, size);
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Iterations of Lloyd's K-means algorithm
+    for (uint32_t iter_idx = 0; iter_idx < n_iter; iter_idx++) {
+        // Distribute work across compute cores in a cluster
+        uint32_t n_samples_per_core;
+        uint32_t start_sample_idx;
+        uint32_t end_sample_idx;
+        if (snrt_is_compute_core()) {
+            n_samples_per_core =
+                n_samples_per_cluster / snrt_cluster_compute_core_num();
+            start_sample_idx = snrt_cluster_core_idx() * n_samples_per_core;
+            end_sample_idx = start_sample_idx + n_samples_per_core;
+
+            // Assignment step
+            for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                 centroid_idx++) {
+                partial_membership_cnt[centroid_idx] = 0;
+            }
+            snrt_fpu_fence();
+            for (uint32_t sample_idx = start_sample_idx;
+                 sample_idx < end_sample_idx; sample_idx++) {
+                double min_dist = INFINITY;
+                membership[sample_idx] = 0;
+
+                for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                     centroid_idx++) {
+                    double dist = euclidean_distance_squared(
+                        n_features, &local_samples[sample_idx * n_features],
+                        &local_centroids[centroid_idx * n_features]);
+                    if (dist < min_dist) {
+                        min_dist = dist;
+                        membership[sample_idx] = centroid_idx;
+                    }
+                }
+                partial_membership_cnt[membership[sample_idx]]++;
+            }
+        }
+
+        snrt_global_barrier();
+
+        if (snrt_is_compute_core()) {
+            // Update step
+            for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                 centroid_idx++) {
+                for (uint32_t feature_idx = 0; feature_idx < n_features;
+                     feature_idx++) {
+                    partial_centroids[centroid_idx * n_features + feature_idx] =
+                        0;
+                }
+            }
+            snrt_fpu_fence();
+            for (uint32_t sample_idx = start_sample_idx;
+                 sample_idx < end_sample_idx; sample_idx++) {
+                for (uint32_t feature_idx = 0; feature_idx < n_features;
+                     feature_idx++) {
+                    partial_centroids[membership[sample_idx] * n_features +
+                                      feature_idx] +=
+                        local_samples[sample_idx * n_features + feature_idx];
+                }
+            }
+            if (snrt_cluster_core_idx() == 0) {
+                // Intra-cluster reduction
+                for (uint32_t core_idx = 1;
+                     core_idx < snrt_cluster_compute_core_num(); core_idx++) {
+                    // Pointers to variables of the other core
+                    uint32_t* remote_partial_membership_cnt =
+                        partial_membership_cnt + core_idx * n_clusters;
+                    double* remote_partial_centroids =
+                        partial_centroids + core_idx * n_clusters * n_features;
+                    for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                         centroid_idx++) {
+                        // Accumulate membership counters
+                        partial_membership_cnt[centroid_idx] +=
+                            remote_partial_membership_cnt[centroid_idx];
+                        // Accumulate centroid features
+                        for (uint32_t feature_idx = 0; feature_idx < n_features;
+                             feature_idx++) {
+                            partial_centroids[centroid_idx * n_features +
+                                              feature_idx] +=
+                                remote_partial_centroids[centroid_idx *
+                                                             n_features +
+                                                         feature_idx];
+                        }
+                    }
+                }
+                snrt_inter_cluster_barrier();
+                if (snrt_cluster_idx() == 0) {
+                    // Inter-cluster reduction
+                    for (uint32_t cluster_idx = 1;
+                         cluster_idx < snrt_cluster_num(); cluster_idx++) {
+                        // Pointers to variables of remote clusters
+                        uint32_t* remote_partial_membership_cnt =
+                            (uint32_t*)snrt_remote_l1_ptr(
+                                partial_membership_cnt, 0, cluster_idx);
+                        double* remote_partial_centroids =
+                            (double*)snrt_remote_l1_ptr(partial_centroids,
+                                                        0, cluster_idx);
+                        for (uint32_t centroid_idx = 0;
+                             centroid_idx < n_clusters; centroid_idx++) {
+                            // Accumulate membership counters
+                            partial_membership_cnt[centroid_idx] +=
+                                remote_partial_membership_cnt[centroid_idx];
+                            // Accumulate centroid features
+                            for (uint32_t feature_idx = 0;
+                                 feature_idx < n_features; feature_idx++) {
+                                final_centroids[centroid_idx * n_features +
+                                                feature_idx] +=
+                                    remote_partial_centroids[centroid_idx *
+                                                                 n_features +
+                                                             feature_idx];
+                            }
+                        }
+                    }
+                    // Normalize
+                    for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                         centroid_idx++) {
+                        for (uint32_t feature_idx = 0; feature_idx < n_features;
+                             feature_idx++) {
+                            final_centroids[centroid_idx * n_features +
+                                            feature_idx] /=
+                                partial_membership_cnt[centroid_idx];
+                        }
+                    }
+                }
+            }
+        }
+
+        snrt_global_barrier();
+        local_centroids = final_centroids;
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Transfer final centroids with DMA
+    if (snrt_is_dm_core() && snrt_cluster_idx() == 0) {
+        snrt_dma_start_1d((void*)centroids, (void*)final_centroids, size);
+        snrt_dma_wait_all();
+    }
+}
diff --git a/sw/apps/kmeans/src/main.c b/sw/apps/kmeans/src/main.c
new file mode 100644
index 0000000000..ba09ef3d21
--- /dev/null
+++ b/sw/apps/kmeans/src/main.c
@@ -0,0 +1,15 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include <stdint.h>
+
+#include "data.h"
+#include "kmeans.h"
+
+int main() {
+    kmeans(n_samples, n_features, n_clusters, n_iter, samples, centroids);
+    return 0;
+}
diff --git a/sw/apps/kmeans/verify.py b/sw/apps/kmeans/verify.py
new file mode 100755
index 0000000000..8feb40d639
--- /dev/null
+++ b/sw/apps/kmeans/verify.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import sys
+from pathlib import Path
+import numpy as np
+from data.datagen import golden_model, visualize_clusters
+
+sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
+import verification  # noqa: E402
+from elf import Elf  # noqa: E402
+from data_utils import bytes_to_doubles, bytes_to_uint32s  # noqa: E402
+
+
+ERR_THRESHOLD = 1E-10
+
+
+def main():
+    """Simulate the app and check the centroids against the golden model.
+
+    Returns 1 if any relative error exceeds ERR_THRESHOLD, 0 otherwise.
+    """
+    # Run simulation and get outputs
+    args = verification.parse_args()
+    raw_results = verification.simulate(sim_bin=args.sim_bin,
+                                        snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
+                                        log=args.log,
+                                        output_uids=['centroids'])
+    centroids_actual = np.array(bytes_to_doubles(raw_results['centroids']))
+
+    # Extract input operands from ELF file
+    if args.symbols_bin:
+        elf = Elf(args.symbols_bin)
+    else:
+        elf = Elf(args.snitch_bin)
+    max_iter = bytes_to_uint32s(elf.get_symbol_contents('n_iter'))[0]
+    n_clusters = bytes_to_uint32s(elf.get_symbol_contents('n_clusters'))[0]
+    n_features = bytes_to_uint32s(elf.get_symbol_contents('n_features'))[0]
+    n_samples = bytes_to_uint32s(elf.get_symbol_contents('n_samples'))[0]
+    initial_centroids = np.array(bytes_to_doubles(elf.get_symbol_contents('centroids')))
+    samples = np.array(bytes_to_doubles(elf.get_symbol_contents('samples')))
+
+    # Reshape
+    samples = samples.reshape((n_samples, n_features))
+    initial_centroids = initial_centroids.reshape((n_clusters, n_features))
+    centroids_actual = centroids_actual.reshape((n_clusters, n_features))
+
+    # Visualize centroids computed in simulation
+    visualize_clusters(samples, initial_centroids, "Initial centroids")
+    visualize_clusters(samples, centroids_actual, "Actual centroids")
+
+    # Verify results
+    centroids_golden, _ = golden_model(samples, n_clusters, initial_centroids, max_iter)
+    visualize_clusters(samples, centroids_golden, "Golden centroids")
+    relative_err = np.absolute((centroids_golden - centroids_actual) / centroids_golden)
+    fail = np.any(relative_err > ERR_THRESHOLD)
+    if (fail):
+        verification.dump_results_to_csv([centroids_golden, centroids_actual, relative_err],
+                                         Path.cwd() / 'kmeans_results.csv')
+    return int(fail)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile
index cc02ccc732..3f111682cf 100644
--- a/target/snitch_cluster/sw/apps/Makefile
+++ b/target/snitch_cluster/sw/apps/Makefile
@@ -16,6 +16,7 @@ SUBDIRS += dnn/layernorm
 SUBDIRS += dnn/linear
 SUBDIRS += dnn/maxpool
 SUBDIRS += dnn/softmax
+SUBDIRS += kmeans
 SUBDIRS += montecarlo/pi_estimation
 
 .PHONY: all clean $(SUBDIRS)
diff --git a/target/snitch_cluster/sw/apps/kmeans/Makefile b/target/snitch_cluster/sw/apps/kmeans/Makefile
new file mode 100644
index 0000000000..b7a2ca723b
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/kmeans/Makefile
@@ -0,0 +1,10 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+include ../../../../../sw/apps/kmeans/Makefile
+include ../common.mk
+
+$(DEP): $(DATA_H)
diff --git a/target/snitch_cluster/sw/fdiv.yaml b/target/snitch_cluster/sw/fdiv.yaml
index 7d6e91080f..fc28ed52be 100644
--- a/target/snitch_cluster/sw/fdiv.yaml
+++ b/target/snitch_cluster/sw/fdiv.yaml
@@ -3,3 +3,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
 runs:
+  - elf: apps/kmeans/build/kmeans.elf
+    cmd: [../../../sw/apps/kmeans/verify.py, "${sim_bin}", "${elf}"]