From eb38f12606436d549e7dbb628cb078ee58fb1e76 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 26 Jan 2024 09:37:17 +0100
Subject: [PATCH] sw: Add K-means app

---
 python-requirements.txt                       |   3 +
 sw/apps/kmeans/.gitignore                     |   1 +
 sw/apps/kmeans/Makefile                       |  30 +++
 sw/apps/kmeans/data/datagen.py                | 128 ++++++++++++
 sw/apps/kmeans/data/params.json               |  11 +
 sw/apps/kmeans/src/kmeans.h                   | 192 ++++++++++++++++++
 sw/apps/kmeans/src/main.c                     |  15 ++
 sw/apps/kmeans/verify.py                      |  65 ++++++
 target/snitch_cluster/sw/apps/Makefile        |   1 +
 target/snitch_cluster/sw/apps/kmeans/Makefile |  10 +
 target/snitch_cluster/sw/fdiv.yaml            |   2 +
 11 files changed, 458 insertions(+)
 create mode 100644 sw/apps/kmeans/.gitignore
 create mode 100644 sw/apps/kmeans/Makefile
 create mode 100755 sw/apps/kmeans/data/datagen.py
 create mode 100644 sw/apps/kmeans/data/params.json
 create mode 100644 sw/apps/kmeans/src/kmeans.h
 create mode 100644 sw/apps/kmeans/src/main.c
 create mode 100755 sw/apps/kmeans/verify.py
 create mode 100644 target/snitch_cluster/sw/apps/kmeans/Makefile

diff --git a/python-requirements.txt b/python-requirements.txt
index 6db0bf03f6..6117b40343 100644
--- a/python-requirements.txt
+++ b/python-requirements.txt
@@ -8,14 +8,17 @@ editorconfig-checker==2.3.51
 flake8
 gitpython
 hjson
+json5
 jsonref
 jsonschema
 mako
+matplotlib
 progressbar2
 tabulate
 yamllint
 pyyaml
 pytablewriter
+scikit-learn
 termcolor
 pandas
 pyelftools
diff --git a/sw/apps/kmeans/.gitignore b/sw/apps/kmeans/.gitignore
new file mode 100644
index 0000000000..8485f615ee
--- /dev/null
+++ b/sw/apps/kmeans/.gitignore
@@ -0,0 +1 @@
+data/data.h
\ No newline at end of file
diff --git a/sw/apps/kmeans/Makefile b/sw/apps/kmeans/Makefile
new file mode 100644
index 0000000000..cf379ddcdb
--- /dev/null
+++ b/sw/apps/kmeans/Makefile
@@ -0,0 +1,30 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+# Usage of absolute paths is required to externally include this Makefile
+MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+DATA_DIR := $(realpath $(MK_DIR)/data)
+SRC_DIR  := $(realpath $(MK_DIR)/src)
+
+DATA_CFG ?= $(DATA_DIR)/params.json
+SECTION  ?=
+
+APP     ?= kmeans
+SRCS    ?= $(realpath $(SRC_DIR)/main.c)
+INCDIRS ?= $(DATA_DIR) $(SRC_DIR)
+
+DATAGEN_PY = $(DATA_DIR)/datagen.py
+DATA_H     = $(DATA_DIR)/data.h
+# Remove a partial header if datagen fails, so the next make run retries
+$(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
+	$< -c $(DATA_CFG) --no-gui --section="$(SECTION)" > $@ || { rm -f $@; exit 1; }
+
+.PHONY: clean-data clean
+
+clean-data:
+	rm -f $(DATA_H)
+
+clean: clean-data
diff --git a/sw/apps/kmeans/data/datagen.py b/sw/apps/kmeans/data/datagen.py
new file mode 100755
index 0000000000..b57fdb5360
--- /dev/null
+++ b/sw/apps/kmeans/data/datagen.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Authors: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import argparse
+import hjson
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import pathlib
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans
+import sys
+
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+from data_utils import emit_license, format_scalar_definition, \
+                       format_vector_definition  # noqa: E402
+
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+
+def golden_model(samples, n_clusters, initial_centroids, max_iter):
+    """Cluster `samples` with scikit-learn's K-means as a reference.
+
+    Returns the fitted cluster centers and the number of iterations run.
+    """
+    # The initial centroids are explicit, so a single initialization suffices
+    kmeans = KMeans(n_clusters=n_clusters, init=initial_centroids,
+                    n_init=1, max_iter=max_iter).fit(samples)
+    return kmeans.cluster_centers_, kmeans.n_iter_
+
+
+def visualize_clusters(samples, centroids, title=None):
+    """Plot the first two features of the samples, with centroids in red."""
+    plt.scatter(samples[:, 0], samples[:, 1], s=30)
+    plt.scatter(centroids[:, 0], centroids[:, 1],
+                marker='x', s=200, linewidths=3, color='red')
+    plt.title(title or "K-means clusters")
+    plt.xlabel("Feature 1")
+    plt.ylabel("Feature 2")
+    plt.show()
+
+
+def emit_header(**kwargs):
+    """Generate the K-means dataset and return it as C header source text."""
+    # Aliases
+    n_samples = kwargs['n_samples']
+    n_features = kwargs['n_features']
+    n_clusters = kwargs['n_clusters']
+    max_iter = kwargs['max_iter']
+    seed = kwargs['seed']
+    gui = not kwargs['no_gui']
+
+    # Generate random samples
+    X, _ = make_blobs(
+        n_samples=n_samples,
+        n_features=n_features,
+        centers=n_clusters,
+        random_state=seed
+    )
+
+    # Draw the initial centroids uniformly within the samples' bounding box
+    rng = np.random.default_rng(seed=seed)
+    initial_centroids = rng.uniform(low=X.min(axis=0), high=X.max(axis=0),
+                                    size=(n_clusters, n_features))
+
+    # Visualize the generated samples
+    if gui:
+        visualize_clusters(X, initial_centroids)
+
+    # Run the golden model; only its iteration count goes into the header
+    centers, n_iter = golden_model(X, n_clusters, initial_centroids, max_iter)
+
+    # Visualize the clusters
+    if gui:
+        visualize_clusters(X, centers)
+
+    # Generate header; 'centroids' holds the initial, not the final, centroids
+    data_str = [emit_license()]
+    data_str += [format_scalar_definition('uint32_t', 'n_samples', n_samples)]
+    data_str += [format_scalar_definition('uint32_t', 'n_features', n_features)]
+    data_str += [format_scalar_definition('uint32_t', 'n_clusters', n_clusters)]
+    data_str += [format_scalar_definition('uint32_t', 'n_iter', n_iter)]
+    data_str += [format_vector_definition('double', 'centroids', initial_centroids.flatten(),
+                 alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+    data_str += [format_vector_definition('double', 'samples', X.flatten(),
+                 alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+    data_str = '\n\n'.join(data_str)
+    return data_str
+
+
+def main():
+    """Parse the command line and print the generated header to stdout."""
+
+    # Command-line interface
+    cli = argparse.ArgumentParser(description='Generate data for kernels')
+    cli.add_argument(
+        "-c", "--cfg", type=pathlib.Path, required=True,
+        help='Select param config file kernel')
+    cli.add_argument(
+        '--section', type=str,
+        help='Section to store matrices in')
+    cli.add_argument(
+        '--no-gui', action='store_true',
+        help='Run without visualization')
+    opts = cli.parse_args()
+
+    # Read the parameters from the config file
+    param = hjson.loads(opts.cfg.read_text())
+
+    # Forward the command-line options along with the parameters
+    param['section'] = opts.section
+    param['no_gui'] = opts.no_gui
+
+    # Build the header and write it to stdout
+    header = emit_header(**param)
+    print(header)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sw/apps/kmeans/data/params.json b/sw/apps/kmeans/data/params.json
new file mode 100644
index 0000000000..b2f50d01d0
--- /dev/null
+++ b/sw/apps/kmeans/data/params.json
@@ -0,0 +1,11 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    n_clusters: 3,
+    n_features: 2,
+    n_samples: 128,
+    max_iter: 3,
+    seed: 42
+}
diff --git a/sw/apps/kmeans/src/kmeans.h b/sw/apps/kmeans/src/kmeans.h
new file mode 100644
index 0000000000..ba4aacf91a
--- /dev/null
+++ b/sw/apps/kmeans/src/kmeans.h
@@ -0,0 +1,217 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#include <stdint.h>
+
+#include "math.h"
+#include "snrt.h"
+
+// Returns the squared Euclidean distance between `point1` and `point2`,
+// each an array of `n_features` coordinates.
+double euclidean_distance_squared(uint32_t n_features, double* point1,
+                                  double* point2) {
+    double sum = 0;
+    for (uint32_t i = 0; i < n_features; i++) {
+        double diff = point1[i] - point2[i];
+        sum += diff * diff;
+    }
+    return sum;
+}
+
+// Runs `n_iter` iterations of Lloyd's K-means algorithm on `samples`
+// (n_samples x n_features, row-major), starting from the initial values in
+// `centroids` (n_clusters x n_features), which cluster 0 overwrites with the
+// final centroids. The barriers inside are executed unconditionally.
+//
+// NOTE(review): the integer divisions below drop any samples that do not
+// divide evenly among the clusters and compute cores - confirm that callers
+// always pass a suitable n_samples.
+void kmeans(uint32_t n_samples, uint32_t n_features, uint32_t n_clusters,
+            uint32_t n_iter, double* samples, double* centroids) {
+    // Distribute work across clusters
+    uint32_t n_samples_per_cluster = n_samples / snrt_cluster_num();
+
+    // Dynamically allocate space in TCDM
+    double* local_samples = snrt_l1_alloc_cluster_local(
+        n_samples_per_cluster * n_features * sizeof(double), sizeof(double));
+    double* local_centroids = snrt_l1_alloc_cluster_local(
+        n_clusters * n_features * sizeof(double), sizeof(double));
+    uint32_t* membership = snrt_l1_alloc_cluster_local(
+        n_samples_per_cluster * sizeof(uint32_t), sizeof(uint32_t));
+    uint32_t* partial_membership_cnt = snrt_l1_alloc_compute_core_local(
+        n_clusters * sizeof(uint32_t), sizeof(uint32_t));
+    // First core's partial centroids will store final centroids
+    double* final_centroids = snrt_l1_next();
+    double* partial_centroids = snrt_l1_alloc_compute_core_local(
+        n_clusters * n_features * sizeof(double), sizeof(double));
+
+    // Transfer samples and initial centroids with DMA
+    size_t size;
+    size_t offset;
+    if (snrt_is_dm_core()) {
+        size = n_samples_per_cluster * n_features * sizeof(double);
+        offset = snrt_cluster_idx() * size;
+        snrt_dma_start_1d((void*)local_samples, (void*)samples + offset, size);
+        size = n_clusters * n_features * sizeof(double);
+        snrt_dma_start_1d((void*)local_centroids, (void*)centroids, size);
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Iterations of Lloyd's K-means algorithm
+    for (uint32_t iter_idx = 0; iter_idx < n_iter; iter_idx++) {
+        // Distribute work across compute cores in a cluster
+        uint32_t n_samples_per_core;
+        uint32_t start_sample_idx;
+        uint32_t end_sample_idx;
+        if (snrt_is_compute_core()) {
+            n_samples_per_core =
+                n_samples_per_cluster / snrt_cluster_compute_core_num();
+            start_sample_idx = snrt_cluster_core_idx() * n_samples_per_core;
+            end_sample_idx = start_sample_idx + n_samples_per_core;
+
+            // Assignment step
+            for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                 centroid_idx++) {
+                partial_membership_cnt[centroid_idx] = 0;
+            }
+            snrt_fpu_fence();
+            for (uint32_t sample_idx = start_sample_idx;
+                 sample_idx < end_sample_idx; sample_idx++) {
+                double min_dist = INFINITY;
+                membership[sample_idx] = 0;
+
+                for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                     centroid_idx++) {
+                    double dist = euclidean_distance_squared(
+                        n_features, &local_samples[sample_idx * n_features],
+                        &local_centroids[centroid_idx * n_features]);
+                    if (dist < min_dist) {
+                        min_dist = dist;
+                        membership[sample_idx] = centroid_idx;
+                    }
+                }
+                partial_membership_cnt[membership[sample_idx]]++;
+            }
+        }
+
+        snrt_global_barrier();
+
+        if (snrt_is_compute_core()) {
+            // Update step
+            for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                 centroid_idx++) {
+                for (uint32_t feature_idx = 0; feature_idx < n_features;
+                     feature_idx++) {
+                    partial_centroids[centroid_idx * n_features + feature_idx] =
+                        0;
+                }
+            }
+            snrt_fpu_fence();
+            for (uint32_t sample_idx = start_sample_idx;
+                 sample_idx < end_sample_idx; sample_idx++) {
+                for (uint32_t feature_idx = 0; feature_idx < n_features;
+                     feature_idx++) {
+                    partial_centroids[membership[sample_idx] * n_features +
+                                      feature_idx] +=
+                        local_samples[sample_idx * n_features + feature_idx];
+                }
+            }
+            // Presumably required so that the accumulations above have
+            // completed before the barrier below - TODO confirm
+            snrt_fpu_fence();
+        }
+
+        // Core 0 reads every core's partial centroids in the reduction
+        // below, so all cores must have finished accumulating them first
+        snrt_cluster_hw_barrier();
+
+        if (snrt_is_compute_core()) {
+            if (snrt_cluster_core_idx() == 0) {
+                // Intra-cluster reduction
+                for (uint32_t core_idx = 1;
+                     core_idx < snrt_cluster_compute_core_num(); core_idx++) {
+                    // Pointers to variables of the other core
+                    uint32_t* remote_partial_membership_cnt =
+                        partial_membership_cnt + core_idx * n_clusters;
+                    double* remote_partial_centroids =
+                        partial_centroids + core_idx * n_clusters * n_features;
+                    for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                         centroid_idx++) {
+                        // Accumulate membership counters
+                        partial_membership_cnt[centroid_idx] +=
+                            remote_partial_membership_cnt[centroid_idx];
+                        // Accumulate centroid features
+                        for (uint32_t feature_idx = 0; feature_idx < n_features;
+                             feature_idx++) {
+                            partial_centroids[centroid_idx * n_features +
+                                              feature_idx] +=
+                                remote_partial_centroids[centroid_idx *
+                                                             n_features +
+                                                         feature_idx];
+                        }
+                    }
+                }
+                snrt_inter_cluster_barrier();
+                if (snrt_cluster_idx() == 0) {
+                    // Inter-cluster reduction
+                    for (uint32_t cluster_idx = 1;
+                         cluster_idx < snrt_cluster_num(); cluster_idx++) {
+                        // Pointers to variables of remote clusters
+                        uint32_t* remote_partial_membership_cnt =
+                            (uint32_t*)snrt_remote_l1_ptr(
+                                partial_membership_cnt, 0, cluster_idx);
+                        double* remote_partial_centroids =
+                            (double*)snrt_remote_l1_ptr(partial_centroids,
+                                                             0, cluster_idx);
+                        for (uint32_t centroid_idx = 0;
+                             centroid_idx < n_clusters; centroid_idx++) {
+                            // Accumulate membership counters
+                            partial_membership_cnt[centroid_idx] +=
+                                remote_partial_membership_cnt[centroid_idx];
+                            // Accumulate centroid features
+                            for (uint32_t feature_idx = 0;
+                                 feature_idx < n_features; feature_idx++) {
+                                final_centroids[centroid_idx * n_features +
+                                                feature_idx] +=
+                                    remote_partial_centroids[centroid_idx *
+                                                                 n_features +
+                                                             feature_idx];
+                            }
+                        }
+                    }
+                    // Normalize
+                    // NOTE(review): a centroid with no assigned samples
+                    // divides by zero here - confirm this cannot occur
+                    for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                         centroid_idx++) {
+                        for (uint32_t feature_idx = 0; feature_idx < n_features;
+                             feature_idx++) {
+                            final_centroids[centroid_idx * n_features +
+                                            feature_idx] /=
+                                partial_membership_cnt[centroid_idx];
+                        }
+                    }
+                }
+            }
+        }
+
+        snrt_global_barrier();
+        // NOTE(review): only cluster 0 normalizes its final_centroids; on
+        // other clusters this pointer presumably refers to their own
+        // partial sums - confirm behavior with more than one cluster
+        local_centroids = final_centroids;
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Transfer final centroids with DMA
+    if (snrt_is_dm_core() && snrt_cluster_idx() == 0) {
+        snrt_dma_start_1d((void*)centroids, (void*)final_centroids, size);
+        snrt_dma_wait_all();
+    }
+}
diff --git a/sw/apps/kmeans/src/main.c b/sw/apps/kmeans/src/main.c
new file mode 100644
index 0000000000..ba09ef3d21
--- /dev/null
+++ b/sw/apps/kmeans/src/main.c
@@ -0,0 +1,15 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#include <stdint.h>
+
+#include "data.h"
+#include "kmeans.h"
+// Entry point: run K-means on the inputs defined in the generated data.h
+int main() {
+    kmeans(n_samples, n_features, n_clusters, n_iter, samples, centroids);
+    return 0;
+}
diff --git a/sw/apps/kmeans/verify.py b/sw/apps/kmeans/verify.py
new file mode 100755
index 0000000000..8feb40d639
--- /dev/null
+++ b/sw/apps/kmeans/verify.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import sys
+from pathlib import Path
+import numpy as np
+from data.datagen import golden_model, visualize_clusters
+
+sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
+import verification  # noqa: E402
+from elf import Elf  # noqa: E402
+from data_utils import bytes_to_doubles, bytes_to_uint32s  # noqa: E402
+
+
+ERR_THRESHOLD = 1E-10
+
+
+def main():
+    """Run the simulation and verify its centroids; return 0 on success."""
+    # Run simulation and get outputs
+    args = verification.parse_args()
+    raw_results = verification.simulate(sim_bin=args.sim_bin,
+                                        snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
+                                        log=args.log,
+                                        output_uids=['centroids'])
+    centroids_actual = np.array(bytes_to_doubles(raw_results['centroids']))
+
+    # Extract input operands from ELF file (the symbols binary, if provided)
+    elf = Elf(args.symbols_bin or args.snitch_bin)
+    max_iter = bytes_to_uint32s(elf.get_symbol_contents('n_iter'))[0]
+    n_clusters = bytes_to_uint32s(elf.get_symbol_contents('n_clusters'))[0]
+    n_features = bytes_to_uint32s(elf.get_symbol_contents('n_features'))[0]
+    n_samples = bytes_to_uint32s(elf.get_symbol_contents('n_samples'))[0]
+    initial_centroids = np.array(bytes_to_doubles(elf.get_symbol_contents('centroids')))
+    samples = np.array(bytes_to_doubles(elf.get_symbol_contents('samples')))
+
+    # Reshape
+    samples = samples.reshape((n_samples, n_features))
+    initial_centroids = initial_centroids.reshape((n_clusters, n_features))
+    centroids_actual = centroids_actual.reshape((n_clusters, n_features))
+
+    # Visualize centroids computed in simulation
+    visualize_clusters(samples, initial_centroids, "Initial centroids")
+    visualize_clusters(samples, centroids_actual, "Actual centroids")
+
+    # Verify results
+    centroids_golden, _ = golden_model(samples, n_clusters, initial_centroids, max_iter)
+    visualize_clusters(samples, centroids_golden, "Golden centroids")
+    relative_err = np.absolute((centroids_golden - centroids_actual) / centroids_golden)
+    # Use a pass criterion rather than a fail one: a NaN in the results
+    # compares false either way, and must be reported as a mismatch
+    fail = not np.allclose(centroids_actual, centroids_golden, rtol=ERR_THRESHOLD, atol=0)
+    if fail:
+        verification.dump_results_to_csv([centroids_golden, centroids_actual, relative_err],
+                                         Path.cwd() / 'kmeans_results.csv')
+    return int(fail)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile
index cc02ccc732..3f111682cf 100644
--- a/target/snitch_cluster/sw/apps/Makefile
+++ b/target/snitch_cluster/sw/apps/Makefile
@@ -16,6 +16,7 @@ SUBDIRS += dnn/layernorm
 SUBDIRS += dnn/linear
 SUBDIRS += dnn/maxpool
 SUBDIRS += dnn/softmax
+SUBDIRS += kmeans
 SUBDIRS += montecarlo/pi_estimation
 
 .PHONY: all clean $(SUBDIRS)
diff --git a/target/snitch_cluster/sw/apps/kmeans/Makefile b/target/snitch_cluster/sw/apps/kmeans/Makefile
new file mode 100644
index 0000000000..b7a2ca723b
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/kmeans/Makefile
@@ -0,0 +1,10 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+include ../../../../../sw/apps/kmeans/Makefile
+include ../common.mk
+
+$(DEP): $(DATA_H)
diff --git a/target/snitch_cluster/sw/fdiv.yaml b/target/snitch_cluster/sw/fdiv.yaml
index 7d6e91080f..fc28ed52be 100644
--- a/target/snitch_cluster/sw/fdiv.yaml
+++ b/target/snitch_cluster/sw/fdiv.yaml
@@ -3,3 +3,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
 runs:
+  - elf: apps/kmeans/build/kmeans.elf
+    cmd: [../../../sw/apps/kmeans/verify.py, "${sim_bin}", "${elf}"]