From d22f384bb047690c0daafa1e84938503d4a1ba63 Mon Sep 17 00:00:00 2001
From: Luca Colagrande
Date: Tue, 23 Jan 2024 18:28:01 +0100
Subject: [PATCH] sw: Implement K-means kernel

---
 .github/workflows/ci.yml                      |  22 +++
 .gitlab-ci.yml                                |   8 +
 python-requirements.txt                       |   1 +
 sw/apps/kmeans/.gitignore                     |   1 +
 sw/apps/kmeans/Makefile                       |  30 +++
 sw/apps/kmeans/data/datagen.py                | 127 ++++++++++++
 sw/apps/kmeans/data/params.json               |  11 ++
 sw/apps/kmeans/src/kmeans.h                   | 182 ++++++++++++++++++
 sw/apps/kmeans/src/main.c                     |  15 ++
 sw/apps/kmeans/verify.py                      |  65 +++++++
 target/snitch_cluster/sw/apps/Makefile        |   1 +
 target/snitch_cluster/sw/apps/kmeans/Makefile |  10 +
 target/snitch_cluster/sw/fdiv.yaml            |   7 +
 13 files changed, 480 insertions(+)
 create mode 100644 sw/apps/kmeans/.gitignore
 create mode 100644 sw/apps/kmeans/Makefile
 create mode 100755 sw/apps/kmeans/data/datagen.py
 create mode 100644 sw/apps/kmeans/data/params.json
 create mode 100644 sw/apps/kmeans/src/kmeans.h
 create mode 100644 sw/apps/kmeans/src/main.c
 create mode 100755 sw/apps/kmeans/verify.py
 create mode 100644 target/snitch_cluster/sw/apps/kmeans/Makefile
 create mode 100644 target/snitch_cluster/sw/fdiv.yaml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f8f87b3f86..c731e5e635 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -46,6 +46,28 @@ jobs:
         run: |
           ./run.py sw/run.yaml --simulator verilator -j
 
+  # Tests requiring hardware FDIV unit
+  sw-snitch-cluster-fdiv-vlt:
+    name: Simulate FDIV SW on Snitch Cluster w/ Verilator
+    runs-on: ubuntu-22.04
+    container:
+      image: ghcr.io/pulp-platform/snitch_cluster:tracer-dma
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          submodules: 'recursive'
+      - name: Build Software
+        run: |
+          bender vendor init
+          make -C target/snitch_cluster CFG_OVERRIDE=target/snitch_cluster/cfg/fdiv.hjson sw
+      - name: Build Hardware
+        run: |
+          make -C target/snitch_cluster CFG_OVERRIDE=target/snitch_cluster/cfg/fdiv.hjson bin/snitch_cluster.vlt
+      - name: Run Tests
+        working-directory: target/snitch_cluster
+        run: |
+          ./run.py sw/fdiv.yaml --simulator verilator -j
+
   #########################################
   # Build SW on Snitch Cluster w/ Banshee #
   #########################################
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 18cd5d4aaa..19af5c09a4 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -134,3 +134,11 @@ snitch-cluster-banshee:
     - cargo install --debug --path .
     - cd ../target/snitch_cluster
     - ./run.py sw/run.yaml --simulator banshee -j --run-dir runs/banshee
+
+# Tests requiring hardware FDIV unit
+snitch-cluster-fdiv-vsim:
+  script:
+    - cd target/snitch_cluster
+    - make CFG_OVERRIDE=cfg/fdiv.hjson sw
+    - make bin/snitch_cluster.vsim
+    - ./run.py sw/fdiv.yaml --simulator vsim -j --run-dir runs/vsim
diff --git a/python-requirements.txt b/python-requirements.txt
index 6db0bf03f6..f535755deb 100644
--- a/python-requirements.txt
+++ b/python-requirements.txt
@@ -16,6 +16,7 @@ tabulate
 yamllint
 pyyaml
 pytablewriter
+scikit-learn
 termcolor
 pandas
 pyelftools
diff --git a/sw/apps/kmeans/.gitignore b/sw/apps/kmeans/.gitignore
new file mode 100644
index 0000000000..8485f615ee
--- /dev/null
+++ b/sw/apps/kmeans/.gitignore
@@ -0,0 +1 @@
+data/data.h
\ No newline at end of file
diff --git a/sw/apps/kmeans/Makefile b/sw/apps/kmeans/Makefile
new file mode 100644
index 0000000000..cf379ddcdb
--- /dev/null
+++ b/sw/apps/kmeans/Makefile
@@ -0,0 +1,30 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+# Usage of absolute paths is required to externally include this Makefile
+MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+DATA_DIR := $(realpath $(MK_DIR)/data)
+SRC_DIR := $(realpath $(MK_DIR)/src)
+
+DATA_CFG ?= $(DATA_DIR)/params.json
+SECTION ?=
+
+APP ?= kmeans
+SRCS ?= $(realpath $(SRC_DIR)/main.c)
+INCDIRS ?= $(DATA_DIR) $(SRC_DIR)
+
+DATAGEN_PY = $(DATA_DIR)/datagen.py
+DATA_H = $(DATA_DIR)/data.h
+
+$(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
+	$< -c $(DATA_CFG) --no-gui --section="$(SECTION)" > $@
+
+.PHONY: clean-data clean
+
+clean-data:
+	rm -f $(DATA_H)
+
+clean: clean-data
diff --git a/sw/apps/kmeans/data/datagen.py b/sw/apps/kmeans/data/datagen.py
new file mode 100755
index 0000000000..b353d1208a
--- /dev/null
+++ b/sw/apps/kmeans/data/datagen.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Authors: Luca Colagrande
+
+import argparse
+import json5
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import pathlib
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans
+import sys
+
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+from data_utils import emit_license, format_scalar_definition, \
+    format_vector_definition, format_ifdef_wrapper  # noqa: E402
+
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+
+def golden_model(samples, n_clusters, initial_centroids, max_iter):
+    # Fit scikit-learn's KMeans from the given centroids; returns (cluster_centers_, n_iter_)
+    kmeans = KMeans(
+        n_clusters=n_clusters,
+        init=initial_centroids,
+        max_iter=max_iter
+    )
+    kmeans.fit(samples)
+    return kmeans.cluster_centers_, kmeans.n_iter_
+
+
+def visualize_clusters(samples, centroids, title=None):
+    plt.scatter(samples[:, 0], samples[:, 1], s=30)  # only the first two features are plotted
+    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200, linewidths=3, color='red')
+    if not title:
+        title = "K-means clusters"
+    plt.title(title)
+    plt.xlabel("Feature 1")
+    plt.ylabel("Feature 2")
+    plt.show()  # NOTE(review): presumably blocks on interactive backends - confirm
+
+
+def emit_header(**kwargs):
+
+    # Aliases for the configuration entries used below
+    n_samples = kwargs['n_samples']
+    n_features = kwargs['n_features']
+    n_clusters = kwargs['n_clusters']
+    max_iter = kwargs['max_iter']
+    seed = kwargs['seed']
+    gui = not kwargs['no_gui']
+
+    # Generate random samples (one blob center per cluster, seeded for reproducibility)
+    X, _ = make_blobs(
+        n_samples=n_samples,
+        n_features=n_features,
+        centers=n_clusters,
+        random_state=seed
+    )
+
+    # Draw the initial centroids uniformly within the per-feature [min, max] range of the samples
+    rng = np.random.default_rng(seed=seed)
+    initial_centroids = rng.uniform(low=X.min(axis=0), high=X.max(axis=0), size=(n_clusters, n_features))
+
+    # Visualize the generated samples
+    if gui:
+        visualize_clusters(X, initial_centroids)
+
+    # Apply k-means clustering
+    centers, n_iter = golden_model(X, n_clusters, initial_centroids, max_iter)
+
+    # Visualize the clusters
+    if gui:
+        visualize_clusters(X, centers)
+
+    # Generate header: 'centroids' holds the initial centroids, 'n_iter' the golden model's count
+    data_str = [emit_license()]
+    data_str += [format_scalar_definition('uint32_t', 'n_samples', n_samples)]
+    data_str += [format_scalar_definition('uint32_t', 'n_features', n_features)]
+    data_str += [format_scalar_definition('uint32_t', 'n_clusters', n_clusters)]
+    data_str += [format_scalar_definition('uint32_t', 'n_iter', n_iter)]
+    data_str += [format_vector_definition('double', 'centroids', initial_centroids.flatten(),
+                                          alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+    data_str += [format_vector_definition('double', 'samples', X.flatten(),
+                                          alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+    data_str = '\n\n'.join(data_str)
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for kernels')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file kernel')
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        '--no-gui',
+        action='store_true',
+        help='Run without visualization')
+    args = parser.parse_args()
+
+    # Load param config file and merge the command-line options into it
+    with args.cfg.open() as f:
+        param = json5.loads(f.read())
+    param['section'] = args.section
+    param['no_gui'] = args.no_gui
+
+    # Emit header file on stdout
+    print(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sw/apps/kmeans/data/params.json b/sw/apps/kmeans/data/params.json
new file mode 100644
index 0000000000..b2f50d01d0
--- /dev/null
+++ b/sw/apps/kmeans/data/params.json
@@ -0,0 +1,11 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    n_clusters: 3,
+    n_features: 2,
+    n_samples: 128,
+    max_iter: 3,
+    seed: 42
+}
diff --git a/sw/apps/kmeans/src/kmeans.h b/sw/apps/kmeans/src/kmeans.h
new file mode 100644
index 0000000000..6404e3cf55
--- /dev/null
+++ b/sw/apps/kmeans/src/kmeans.h
@@ -0,0 +1,182 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include <stdint.h>  // NOTE(review): header name was lost in extraction; inferred from the uint32_t usage - confirm
+
+#include "math.h"
+#include "snrt.h"
+
+double euclidean_distance_squared(uint32_t n_features, double* point1, double* point2) {
+    double sum = 0;  // Running sum of the squared per-feature differences
+    for (uint32_t i = 0; i < n_features; i++) {
+        double diff = point1[i] - point2[i];
+        sum += diff * diff;
+    }
+    return sum;  // Squared distance: no square root is taken
+}
+
+// Allocate `size` bytes in L1 at `base`: every core in the cluster gets the same pointer (`base`) and *new_base
+// is advanced past the allocation. Different clusters get different pointers, to the same offset within their TCDM.
+static inline void* snrt_l1_alloc_cluster_private(void* base, size_t size, void** new_base) {  // static: a plain C99 `inline` definition provides no external symbol to link against
+    *new_base = base + size;
+    return base;
+}
+
+// Allocate space in L1, each compute core gets unique space: `size` bytes per compute core are reserved and the slice at index snrt_cluster_core_idx() is returned.
+static inline void* snrt_l1_alloc_compute_core_private(void* base, size_t size, void** new_base) {
+    *new_base = base + size * snrt_cluster_compute_core_num();
+    return base + size * snrt_cluster_core_idx();
+}
+
+// Allocate space in L1, all clusters get pointer to cluster 0's allocation.
+static inline void* snrt_l1_alloc_common(void* base, size_t size, void** new_base) {  // static: a plain C99 `inline` definition provides no external symbol to link against
+    *new_base = base + size;
+    return (void*)((uintptr_t)base - snrt_cluster_idx() * SNRT_CLUSTER_OFFSET);  // Rebase by subtracting this cluster's offset
+}
+
+// Takes the pointer to a variable in one cluster's TCDM (src), and returns the pointer
+// to the variable at the same offset in another cluster's TCDM (dst)
+static inline void* snrt_remote_cluster_ptr(void* src, uint32_t src_cluster_idx, uint32_t dst_cluster_idx) {
+    return (void *)((uintptr_t)src + (dst_cluster_idx - src_cluster_idx) * SNRT_CLUSTER_OFFSET);
+}
+
+void kmeans(uint32_t n_samples, uint32_t n_features, uint32_t n_clusters, uint32_t n_iter, double* samples, double* centroids) {
+    // Distribute work across clusters (integer division: a remainder of n_samples is not processed)
+    uint32_t n_samples_per_cluster = n_samples / snrt_cluster_num();
+
+    // Allocate space for operands in TCDM
+    void *l1_base, *prev_l1_base;
+    l1_base = snrt_l1_next();
+    double *local_samples = snrt_l1_alloc_cluster_private(l1_base, n_samples_per_cluster * n_features * sizeof(double), &l1_base);
+    double *local_centroids = snrt_l1_alloc_cluster_private(l1_base, n_clusters * n_features * sizeof(double), &l1_base);
+    // Allocate space for intermediate variables in TCDM
+    uint32_t *membership = snrt_l1_alloc_cluster_private(l1_base, n_samples_per_cluster * sizeof(uint32_t), &l1_base);
+    // Alias first core's partial membership counters with final membership counters (both start at prev_l1_base)
+    prev_l1_base = l1_base;
+    uint32_t *final_membership_cnt = snrt_l1_alloc_common(l1_base, n_clusters * sizeof(uint32_t), &l1_base);
+    uint32_t *partial_membership_cnt = snrt_l1_alloc_compute_core_private(prev_l1_base, n_clusters * sizeof(uint32_t), &l1_base);
+    // Alias first core's partial centroids with final centroids (both start at prev_l1_base)
+    prev_l1_base = l1_base;
+    double *final_centroids = snrt_l1_alloc_common(l1_base, n_clusters * n_features * sizeof(double), &l1_base);
+    double *partial_centroids = snrt_l1_alloc_compute_core_private(prev_l1_base, n_clusters * n_features * sizeof(double), &l1_base);
+
+    // Transfer this cluster's slice of the samples and the initial centroids with DMA
+    size_t size;
+    size_t offset;
+    if (snrt_is_dm_core()) {
+        size = n_samples_per_cluster * n_features * sizeof(double);
+        offset = snrt_cluster_idx() * size;
+        snrt_dma_start_1d((void *)local_samples, (void *)samples + offset, size);
+        size = n_clusters * n_features * sizeof(double);
+        snrt_dma_start_1d((void *)local_centroids, (void *)centroids, size);
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Iterations of Lloyd's K-means algorithm
+    for (uint32_t iter_idx = 0; iter_idx < n_iter; iter_idx++) {
+        // Distribute work across compute cores in a cluster (integer division: a remainder is not processed)
+        uint32_t n_samples_per_core;
+        uint32_t start_sample_idx;
+        uint32_t end_sample_idx;
+        if (snrt_is_compute_core()) {
+            n_samples_per_core = n_samples_per_cluster / snrt_cluster_compute_core_num();
+            start_sample_idx = snrt_cluster_core_idx() * n_samples_per_core;
+            end_sample_idx = start_sample_idx + n_samples_per_core;
+
+            // Assignment step: label each sample with its nearest centroid and count members per centroid
+            for (uint32_t centroid_idx = 0; centroid_idx < n_clusters; centroid_idx++) {
+                partial_membership_cnt[centroid_idx] = 0;
+            }
+            snrt_fpu_fence();
+            for (uint32_t sample_idx = start_sample_idx; sample_idx < end_sample_idx; sample_idx++) {
+                double min_dist = INFINITY;
+                membership[sample_idx] = 0;
+                for (uint32_t centroid_idx = 0; centroid_idx < n_clusters; centroid_idx++) {
+                    double dist = euclidean_distance_squared(
+                        n_features, &local_samples[sample_idx * n_features], &local_centroids[centroid_idx * n_features]);
+                    if (dist < min_dist) {
+                        min_dist = dist;
+                        membership[sample_idx] = centroid_idx;
+                    }
+                }
+                partial_membership_cnt[membership[sample_idx]]++;
+            }
+        }
+
+        snrt_global_barrier();
+
+        if (snrt_is_compute_core()) {
+            // Update step: each core accumulates per-centroid feature sums over its own samples
+            for (uint32_t centroid_idx = 0; centroid_idx < n_clusters; centroid_idx++) {
+                for (uint32_t feature_idx = 0; feature_idx < n_features; feature_idx++) {
+                    partial_centroids[centroid_idx * n_features + feature_idx] = 0;
+                }
+            }
+            snrt_fpu_fence();
+            for (uint32_t sample_idx = start_sample_idx; sample_idx < end_sample_idx; sample_idx++) {
+                for (uint32_t feature_idx = 0; feature_idx < n_features; feature_idx++) {
+                    partial_centroids[membership[sample_idx] * n_features + feature_idx] += local_samples[sample_idx * n_features + feature_idx];
+                }
+            }
+            snrt_fpu_fence();  // NOTE(review): assumed to order the FP accumulations above before the barrier - confirm
+        }
+        snrt_cluster_hw_barrier();  // Core 0 must not reduce the partial sums before every core has finished accumulating
+        if (snrt_is_compute_core()) {
+            if (snrt_cluster_core_idx() == 0) {
+                // Intra-cluster reduction
+                for (uint32_t core_idx = 1; core_idx < snrt_cluster_compute_core_num(); core_idx++) {
+                    // Pointers to variables of the other core
+                    uint32_t* remote_partial_membership_cnt = partial_membership_cnt + core_idx * n_clusters;
+                    double* remote_partial_centroids = partial_centroids + core_idx * n_clusters * n_features;
+                    for (uint32_t centroid_idx = 0; centroid_idx < n_clusters; centroid_idx++) {
+                        // Accumulate membership counters
+                        partial_membership_cnt[centroid_idx] += remote_partial_membership_cnt[centroid_idx];
+                        // Accumulate centroid features
+                        for (uint32_t feature_idx = 0; feature_idx < n_features; feature_idx++) {
+                            partial_centroids[centroid_idx * n_features + feature_idx] += remote_partial_centroids[centroid_idx * n_features + feature_idx];
+                        }
+                    }
+                }
+                snrt_inter_cluster_barrier();
+                if (snrt_cluster_idx() == 0) {
+                    // Inter-cluster reduction
+                    for (uint32_t cluster_idx = 1; cluster_idx < snrt_cluster_num(); cluster_idx++) {
+                        // Pointers to variables of remote clusters
+                        uint32_t* remote_partial_membership_cnt = (uint32_t *)snrt_remote_cluster_ptr(partial_membership_cnt, 0, cluster_idx);
+                        double* remote_partial_centroids = (double *)snrt_remote_cluster_ptr(partial_centroids, 0, cluster_idx);
+                        for (uint32_t centroid_idx = 0; centroid_idx < n_clusters; centroid_idx++) {
+                            // Accumulate membership counters
+                            final_membership_cnt[centroid_idx] += remote_partial_membership_cnt[centroid_idx];
+                            // Accumulate centroid features
+                            for (uint32_t feature_idx = 0; feature_idx < n_features; feature_idx++) {
+                                final_centroids[centroid_idx * n_features + feature_idx] += remote_partial_centroids[centroid_idx * n_features + feature_idx];
+                            }
+                        }
+                    }
+                    // Normalize: divide feature sums by member counts (NOTE(review): a centroid with zero members gives 0/0 - confirm this cannot occur)
+                    for (uint32_t centroid_idx = 0; centroid_idx < n_clusters; centroid_idx++) {
+                        for (uint32_t feature_idx = 0; feature_idx < n_features; feature_idx++) {
+                            final_centroids[centroid_idx * n_features + feature_idx] /= final_membership_cnt[centroid_idx];
+                        }
+                    }
+                }
+            }
+        }
+
+        snrt_global_barrier();
+        local_centroids = final_centroids;
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Transfer final centroids with DMA (size still holds the centroid array size set by the DM core above)
+    if (snrt_is_dm_core() && snrt_cluster_idx() == 0) {
+        snrt_dma_start_1d((void *)centroids, (void *)final_centroids, size);
+        snrt_dma_wait_all();
+    }
+}
diff --git a/sw/apps/kmeans/src/main.c b/sw/apps/kmeans/src/main.c
new file mode 100644
index 0000000000..ba09ef3d21
--- /dev/null
+++ b/sw/apps/kmeans/src/main.c
@@ -0,0 +1,15 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include <stdint.h>  // NOTE(review): header name was lost in extraction; inferred from the uint32_t usage - confirm
+
+#include "data.h"
+#include "kmeans.h"
+
+int main() {
+    kmeans(n_samples, n_features, n_clusters, n_iter, samples, centroids);
+    return 0;
+}
diff --git a/sw/apps/kmeans/verify.py b/sw/apps/kmeans/verify.py
new file mode 100755
index 0000000000..8feb40d639
--- /dev/null
+++ b/sw/apps/kmeans/verify.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import sys
+from pathlib import Path
+import numpy as np
+from data.datagen import golden_model, visualize_clusters
+
+sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
+import verification  # noqa: E402
+from elf import Elf  # noqa: E402
+from data_utils import bytes_to_doubles, bytes_to_uint32s  # noqa: E402
+
+
+ERR_THRESHOLD = 1E-10
+
+
+def main():
+    # Run simulation and get outputs
+    args = verification.parse_args()
+    raw_results = verification.simulate(sim_bin=args.sim_bin,
+                                        snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
+                                        log=args.log,
+                                        output_uids=['centroids'])
+    centroids_actual = np.array(bytes_to_doubles(raw_results['centroids']))
+
+    # Extract input operands from ELF file (n_iter is reused as the golden model's max_iter)
+    if args.symbols_bin:
+        elf = Elf(args.symbols_bin)
+    else:
+        elf = Elf(args.snitch_bin)
+    max_iter = bytes_to_uint32s(elf.get_symbol_contents('n_iter'))[0]
+    n_clusters = bytes_to_uint32s(elf.get_symbol_contents('n_clusters'))[0]
+    n_features = bytes_to_uint32s(elf.get_symbol_contents('n_features'))[0]
+    n_samples = bytes_to_uint32s(elf.get_symbol_contents('n_samples'))[0]
+    initial_centroids = np.array(bytes_to_doubles(elf.get_symbol_contents('centroids')))
+    samples = np.array(bytes_to_doubles(elf.get_symbol_contents('samples')))
+
+    # Reshape the flat arrays into (rows, n_features) matrices
+    samples = samples.reshape((n_samples, n_features))
+    initial_centroids = initial_centroids.reshape((n_clusters, n_features))
+    centroids_actual = centroids_actual.reshape((n_clusters, n_features))
+
+    # Visualize centroids (NOTE(review): plt.show() presumably blocks on interactive backends)
+    visualize_clusters(samples, initial_centroids, "Initial centroids")
+    visualize_clusters(samples, centroids_actual, "Actual centroids")
+
+    # Verify results: a NaN error never satisfies `<=`, so NaN centroids are reported as failures
+    centroids_golden, _ = golden_model(samples, n_clusters, initial_centroids, max_iter)
+    visualize_clusters(samples, centroids_golden, "Golden centroids")
+    relative_err = np.absolute((centroids_golden - centroids_actual) / centroids_golden)
+    fail = not np.all(relative_err <= ERR_THRESHOLD)
+    if fail:
+        verification.dump_results_to_csv([centroids_golden, centroids_actual, relative_err],
+                                         Path.cwd() / 'kmeans_results.csv')
+    return int(fail)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile
index d86145c858..15b7b076df 100644
--- a/target/snitch_cluster/sw/apps/Makefile
+++ b/target/snitch_cluster/sw/apps/Makefile
@@ -17,6 +17,7 @@ SUBDIRS += dnn/layernorm
 SUBDIRS += dnn/linear
 SUBDIRS += dnn/maxpool
 SUBDIRS += dnn/softmax
+SUBDIRS += kmeans
 SUBDIRS += montecarlo/pi_estimation
 
 .PHONY: all clean $(SUBDIRS)
diff --git a/target/snitch_cluster/sw/apps/kmeans/Makefile b/target/snitch_cluster/sw/apps/kmeans/Makefile
new file mode 100644
index 0000000000..b7a2ca723b
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/kmeans/Makefile
@@ -0,0 +1,10 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+include ../../../../../sw/apps/kmeans/Makefile
+include ../common.mk
+
+$(DEP): $(DATA_H)
diff --git a/target/snitch_cluster/sw/fdiv.yaml b/target/snitch_cluster/sw/fdiv.yaml
new file mode 100644
index 0000000000..fc28ed52be
--- /dev/null
+++ b/target/snitch_cluster/sw/fdiv.yaml
@@ -0,0 +1,7 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+runs:
+  - elf: apps/kmeans/build/kmeans.elf
+    cmd: [../../../sw/apps/kmeans/verify.py, "${sim_bin}", "${elf}"]