sw: Add K-means app

pulp-platform · Jan 26, 2024 · eb38f12 · eb38f12
1 parent 100e127
commit eb38f12
Show file tree

Hide file tree

Showing 11 changed files with 458 additions and 0 deletions.
diff --git a/python-requirements.txt b/python-requirements.txt
@@ -8,14 +8,17 @@ editorconfig-checker==2.3.51
 flake8
 gitpython
 hjson
+json5
 jsonref
 jsonschema
 mako
+matplotlib
 progressbar2
 tabulate
 yamllint
 pyyaml
 pytablewriter
+scikit-learn
 termcolor
 pandas
 pyelftools

diff --git a/sw/apps/kmeans/.gitignore b/sw/apps/kmeans/.gitignore
@@ -0,0 +1 @@
+data/data.h
diff --git a/sw/apps/kmeans/Makefile b/sw/apps/kmeans/Makefile
@@ -0,0 +1,30 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <[email protected]>
+
+# Build fragment for the K-means app: locates the app's sources and generates
+# the data header consumed by the application.
+#
+# Usage of absolute paths is required to externally include this Makefile
+MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+DATA_DIR := $(realpath $(MK_DIR)/data)
+SRC_DIR  := $(realpath $(MK_DIR)/src)
+
+# User-tunable knobs: data generator configuration and the section name
+# forwarded to the data generator's --section option
+DATA_CFG ?= $(DATA_DIR)/params.json
+SECTION  ?=
+
+APP     ?= kmeans
+SRCS    ?= $(realpath $(SRC_DIR)/main.c)
+INCDIRS ?= $(DATA_DIR) $(SRC_DIR)
+
+DATAGEN_PY = $(DATA_DIR)/datagen.py
+DATA_H     = $(DATA_DIR)/data.h
+
+# The header is the generator's stdout. The redirection creates the target
+# even when the generator fails, so remove it on failure: a truncated header
+# with a fresh timestamp would otherwise be considered up to date.
+$(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
+	$< -c $(DATA_CFG) --no-gui --section="$(SECTION)" > $@ || { rm -f $@; exit 1; }
+
+.PHONY: clean-data clean
+
+# Remove the generated data header
+clean-data:
+	rm -f $(DATA_H)
+
+clean: clean-data
diff --git a/sw/apps/kmeans/data/datagen.py b/sw/apps/kmeans/data/datagen.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Authors: Luca Colagrande <[email protected]>
+
+import argparse
+import hjson
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import pathlib
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans
+import sys
+
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+from data_utils import emit_license, format_scalar_definition, \
+                       format_vector_definition  # noqa: E402
+
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits the data should be aligned to 4KB.
+# This value is passed as the `alignment` of the emitted vector definitions.
+BURST_ALIGNMENT = 4096
+
+
+def golden_model(samples, n_clusters, initial_centroids, max_iter):
+    """Cluster the samples with scikit-learn's KMeans as a reference.
+
+    Args:
+        samples: Data handed to ``KMeans.fit``.
+        n_clusters: Number of clusters to form.
+        initial_centroids: Starting centroids, passed as ``init``.
+        max_iter: Upper bound on the number of iterations.
+
+    Returns:
+        The tuple ``(cluster_centers_, n_iter_)`` of the fitted estimator.
+    """
+    estimator = KMeans(n_clusters=n_clusters, init=initial_centroids, max_iter=max_iter)
+    estimator.fit(samples)
+    final_centers = estimator.cluster_centers_
+    iterations_run = estimator.n_iter_
+    return final_centers, iterations_run
+
+
+def visualize_clusters(samples, centroids, title=None):
+    """Plot the first two features of the samples and of the centroids.
+
+    Samples are drawn as regular markers, centroids as large red crosses.
+    The call blocks on ``plt.show()``.
+
+    Args:
+        samples: 2D array; columns 0 and 1 are plotted.
+        centroids: 2D array; columns 0 and 1 are plotted.
+        title: Plot title; a default title is used when empty or None.
+    """
+    sample_x, sample_y = samples[:, 0], samples[:, 1]
+    plt.scatter(sample_x, sample_y, s=30)
+    centroid_x, centroid_y = centroids[:, 0], centroids[:, 1]
+    plt.scatter(centroid_x, centroid_y, marker='x', s=200, linewidths=3, color='red')
+    plt.title(title if title else "K-means clusters")
+    plt.xlabel("Feature 1")
+    plt.ylabel("Feature 2")
+    plt.show()
+
+
+def emit_header(**kwargs):
+    """Generate the contents of the data header for the K-means app.
+
+    Expected keyword arguments: ``n_samples``, ``n_features``, ``n_clusters``,
+    ``max_iter``, ``seed``, ``no_gui`` and ``section``.
+
+    The emitted header defines the problem dimensions, the number of
+    iterations run by the golden model, the initial centroids and the
+    samples. The centroids computed by the golden model are only used for
+    visualization.
+
+    Returns:
+        The header contents as a single string.
+    """
+    # Unpack the configuration
+    n_samples = kwargs['n_samples']
+    n_features = kwargs['n_features']
+    n_clusters = kwargs['n_clusters']
+    max_iter = kwargs['max_iter']
+    seed = kwargs['seed']
+    show_plots = not kwargs['no_gui']
+
+    # Draw the random samples (the labels returned by make_blobs are unused)
+    samples = make_blobs(n_samples=n_samples, n_features=n_features,
+                         centers=n_clusters, random_state=seed)[0]
+
+    # Draw the initial centroids uniformly within the bounding box of the samples
+    lower_bound = samples.min(axis=0)
+    upper_bound = samples.max(axis=0)
+    generator = np.random.default_rng(seed=seed)
+    start_centroids = generator.uniform(low=lower_bound, high=upper_bound,
+                                        size=(n_clusters, n_features))
+
+    # Show the samples together with the initial centroids
+    if show_plots:
+        visualize_clusters(samples, start_centroids)
+
+    # Run the reference K-means implementation
+    final_centers, n_iter = golden_model(samples, n_clusters, start_centroids, max_iter)
+
+    # Show the samples together with the final centroids
+    if show_plots:
+        visualize_clusters(samples, final_centers)
+
+    # Assemble the header: license first, then scalars, then arrays
+    scalars = (
+        ('n_samples', n_samples),
+        ('n_features', n_features),
+        ('n_clusters', n_clusters),
+        ('n_iter', n_iter),
+    )
+    arrays = (
+        ('centroids', start_centroids),
+        ('samples', samples),
+    )
+    definitions = [emit_license()]
+    for name, value in scalars:
+        definitions.append(format_scalar_definition('uint32_t', name, value))
+    for name, array in arrays:
+        definitions.append(
+            format_vector_definition('double', name, array.flatten(),
+                                     alignment=BURST_ALIGNMENT,
+                                     section=kwargs['section']))
+    return '\n\n'.join(definitions)
+
+
+def main():
+    """Parse the command line, load the parameter file and print the header.
+
+    The parameter file is parsed with hjson; the ``--section`` and
+    ``--no-gui`` options are added to the loaded parameters before they are
+    forwarded to ``emit_header``. The header is written to stdout.
+    """
+    cli = argparse.ArgumentParser(description='Generate data for kernels')
+    cli.add_argument("-c", "--cfg", type=pathlib.Path, required=True,
+                     help='Select param config file kernel')
+    cli.add_argument('--section', type=str,
+                     help='Section to store matrices in')
+    cli.add_argument('--no-gui', action='store_true',
+                     help='Run without visualization')
+    options = cli.parse_args()
+
+    # Load the parameters and extend them with the command-line options
+    config = hjson.loads(options.cfg.read_text())
+    config['section'] = options.section
+    config['no_gui'] = options.no_gui
+
+    # Write the generated header to stdout
+    header = emit_header(**config)
+    print(header)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sw/apps/kmeans/data/params.json b/sw/apps/kmeans/data/params.json
@@ -0,0 +1,11 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    n_clusters: 3,
+    n_features: 2,
+    n_samples: 128,
+    max_iter: 3,
+    seed: 42
+}
diff --git a/sw/apps/kmeans/src/kmeans.h b/sw/apps/kmeans/src/kmeans.h
@@ -0,0 +1,192 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <[email protected]>
+
+#include <stdint.h>
+
+#include "math.h"
+#include "snrt.h"
+
+// Returns the squared Euclidean distance between two points, i.e. the sum
+// over the first n_features coordinates of the squared coordinate
+// differences. Returns 0 when n_features is 0.
+double euclidean_distance_squared(uint32_t n_features, double* point1,
+                                  double* point2) {
+    double acc = 0;
+    uint32_t k = 0;
+    while (k < n_features) {
+        double delta = point1[k] - point2[k];
+        acc += delta * delta;
+        k++;
+    }
+    return acc;
+}
+
+// Multi-cluster implementation of Lloyd's K-means algorithm.
+//
+// Parameters:
+//   n_samples   number of samples; split across clusters and then across the
+//               compute cores of a cluster with integer divisions, so any
+//               remainder samples are not processed
+//   n_features  number of features per sample
+//   n_clusters  number of centroids
+//   n_iter      number of iterations to run
+//   samples     n_samples x n_features array, one sample after the other
+//   centroids   n_clusters x n_features array with the initial centroids;
+//               overwritten with the final centroids by the DM core of
+//               cluster 0
+//
+// Must be called by all cores, since it contains cluster-level and global
+// barriers.
+void kmeans(uint32_t n_samples, uint32_t n_features, uint32_t n_clusters,
+            uint32_t n_iter, double* samples, double* centroids) {
+    // Distribute work across clusters
+    uint32_t n_samples_per_cluster = n_samples / snrt_cluster_num();
+
+    // Dynamically allocate space in TCDM
+    double* local_samples = snrt_l1_alloc_cluster_local(
+        n_samples_per_cluster * n_features * sizeof(double), sizeof(double));
+    double* local_centroids = snrt_l1_alloc_cluster_local(
+        n_clusters * n_features * sizeof(double), sizeof(double));
+    uint32_t* membership = snrt_l1_alloc_cluster_local(
+        n_samples_per_cluster * sizeof(uint32_t), sizeof(uint32_t));
+    uint32_t* partial_membership_cnt = snrt_l1_alloc_compute_core_local(
+        n_clusters * sizeof(uint32_t), sizeof(uint32_t));
+    // First core's partial centroids will store final centroids
+    double* final_centroids = snrt_l1_next();
+    double* partial_centroids = snrt_l1_alloc_compute_core_local(
+        n_clusters * n_features * sizeof(double), sizeof(double));
+
+    // Transfer samples and initial centroids with DMA
+    size_t size;
+    size_t offset;
+    if (snrt_is_dm_core()) {
+        size = n_samples_per_cluster * n_features * sizeof(double);
+        offset = snrt_cluster_idx() * size;
+        snrt_dma_start_1d((void*)local_samples, (void*)samples + offset, size);
+        size = n_clusters * n_features * sizeof(double);
+        snrt_dma_start_1d((void*)local_centroids, (void*)centroids, size);
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Iterations of Lloyd's K-means algorithm
+    for (uint32_t iter_idx = 0; iter_idx < n_iter; iter_idx++) {
+        // Distribute work across compute cores in a cluster
+        uint32_t n_samples_per_core;
+        uint32_t start_sample_idx;
+        uint32_t end_sample_idx;
+        if (snrt_is_compute_core()) {
+            n_samples_per_core =
+                n_samples_per_cluster / snrt_cluster_compute_core_num();
+            start_sample_idx = snrt_cluster_core_idx() * n_samples_per_core;
+            end_sample_idx = start_sample_idx + n_samples_per_core;
+
+            // Assignment step: assign every sample to its closest centroid
+            // and count the samples assigned to each centroid
+            for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                 centroid_idx++) {
+                partial_membership_cnt[centroid_idx] = 0;
+            }
+            snrt_fpu_fence();
+            for (uint32_t sample_idx = start_sample_idx;
+                 sample_idx < end_sample_idx; sample_idx++) {
+                double min_dist = INFINITY;
+                membership[sample_idx] = 0;
+
+                for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                     centroid_idx++) {
+                    double dist = euclidean_distance_squared(
+                        n_features, &local_samples[sample_idx * n_features],
+                        &local_centroids[centroid_idx * n_features]);
+                    if (dist < min_dist) {
+                        min_dist = dist;
+                        membership[sample_idx] = centroid_idx;
+                    }
+                }
+                partial_membership_cnt[membership[sample_idx]]++;
+            }
+        }
+
+        // The centroids read in the assignment step may live in the buffer
+        // which is cleared in the update step, so all cores must be done
+        // reading them first
+        snrt_global_barrier();
+
+        if (snrt_is_compute_core()) {
+            // Update step: every core sums up the samples assigned to each
+            // centroid in its own partial centroids
+            for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                 centroid_idx++) {
+                for (uint32_t feature_idx = 0; feature_idx < n_features;
+                     feature_idx++) {
+                    partial_centroids[centroid_idx * n_features + feature_idx] =
+                        0;
+                }
+            }
+            snrt_fpu_fence();
+            for (uint32_t sample_idx = start_sample_idx;
+                 sample_idx < end_sample_idx; sample_idx++) {
+                for (uint32_t feature_idx = 0; feature_idx < n_features;
+                     feature_idx++) {
+                    partial_centroids[membership[sample_idx] * n_features +
+                                      feature_idx] +=
+                        local_samples[sample_idx * n_features + feature_idx];
+                }
+            }
+            // NOTE(review): presumably required for the floating-point
+            // accumulations above to have completed before the barrier is
+            // reached - confirm against the runtime documentation
+            snrt_fpu_fence();
+        }
+
+        // All cores of the cluster must have finished accumulating their
+        // partial centroids before core 0 reads them in the reduction below
+        snrt_cluster_hw_barrier();
+
+        if (snrt_is_compute_core() && snrt_cluster_core_idx() == 0) {
+            // Intra-cluster reduction
+            for (uint32_t core_idx = 1;
+                 core_idx < snrt_cluster_compute_core_num(); core_idx++) {
+                // Pointers to variables of the other core
+                uint32_t* remote_partial_membership_cnt =
+                    partial_membership_cnt + core_idx * n_clusters;
+                double* remote_partial_centroids =
+                    partial_centroids + core_idx * n_clusters * n_features;
+                for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                     centroid_idx++) {
+                    // Accumulate membership counters
+                    partial_membership_cnt[centroid_idx] +=
+                        remote_partial_membership_cnt[centroid_idx];
+                    // Accumulate centroid features
+                    for (uint32_t feature_idx = 0; feature_idx < n_features;
+                         feature_idx++) {
+                        partial_centroids[centroid_idx * n_features +
+                                          feature_idx] +=
+                            remote_partial_centroids[centroid_idx * n_features +
+                                                     feature_idx];
+                    }
+                }
+            }
+            // Wait for the intra-cluster reductions of all clusters
+            snrt_inter_cluster_barrier();
+            if (snrt_cluster_idx() == 0) {
+                // Inter-cluster reduction
+                for (uint32_t cluster_idx = 1; cluster_idx < snrt_cluster_num();
+                     cluster_idx++) {
+                    // Pointers to variables of remote clusters
+                    uint32_t* remote_partial_membership_cnt =
+                        (uint32_t*)snrt_remote_l1_ptr(partial_membership_cnt, 0,
+                                                      cluster_idx);
+                    double* remote_partial_centroids =
+                        (double*)snrt_remote_l1_ptr(partial_centroids, 0,
+                                                    cluster_idx);
+                    for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                         centroid_idx++) {
+                        // Accumulate membership counters
+                        partial_membership_cnt[centroid_idx] +=
+                            remote_partial_membership_cnt[centroid_idx];
+                        // Accumulate centroid features
+                        for (uint32_t feature_idx = 0; feature_idx < n_features;
+                             feature_idx++) {
+                            final_centroids[centroid_idx * n_features +
+                                            feature_idx] +=
+                                remote_partial_centroids[centroid_idx *
+                                                             n_features +
+                                                         feature_idx];
+                        }
+                    }
+                }
+                // Normalize: divide the feature sums by the number of samples
+                // assigned to the centroid. A centroid without any assigned
+                // sample has a zero count and yields a non-finite result.
+                for (uint32_t centroid_idx = 0; centroid_idx < n_clusters;
+                     centroid_idx++) {
+                    for (uint32_t feature_idx = 0; feature_idx < n_features;
+                         feature_idx++) {
+                        final_centroids[centroid_idx * n_features +
+                                        feature_idx] /=
+                            partial_membership_cnt[centroid_idx];
+                    }
+                }
+            }
+        }
+
+        snrt_global_barrier();
+        // Only cluster 0 holds the fully reduced and normalized centroids, so
+        // every cluster reads them from cluster 0's TCDM in the next
+        // iteration (on cluster 0 this is expected to equal final_centroids)
+        local_centroids = (double*)snrt_remote_l1_ptr(final_centroids,
+                                                      snrt_cluster_idx(), 0);
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Transfer final centroids with DMA
+    if (snrt_is_dm_core() && snrt_cluster_idx() == 0) {
+        size = n_clusters * n_features * sizeof(double);
+        snrt_dma_start_1d((void*)centroids, (void*)final_centroids, size);
+        snrt_dma_wait_all();
+    }
+}