pulp-platform · colluca · Aug 29, 2024 · Aug 22, 2024 · Aug 22, 2024 · Aug 22, 2024
@@ -25,6 +25,7 @@ pyflexfloat
 pytablewriter
 pytest
 pyyaml
+scikit-learn
 tabulate
 termcolor
 yamllint

@@ -3,7 +3,7 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
-# Author: Jose Pedro Castro Fonseca <jose.pc.fonseca@gmail, [email protected]>
+# Author: Jose Pedro Castro Fonseca <[email protected]>
 #         Luca Colagrande <[email protected]>
 
 import numpy as np
@@ -21,17 +21,31 @@ class AtaxDataGen(du.DataGen):
     def golden_model(self, A, x):
         return np.matmul(A.transpose(), np.matmul(A, x))
 
+    def validate(self, M, N, **kwargs):  # called by emit_header before any data is generated
+        assert (N % 8) == 0, "N must be an integer multiple of the number of cores"
+
+        # Calculate total TCDM occupation (sum of all buffers, 8 bytes per element)
+        a_size = M * N * 8  # A is generated as an M x N array
+        x_size = N * 8  # x is generated with N elements
+        y_size = N * 8  # y = A^T (A x) has N elements
+        tmp_size = M * 8  # the intermediate product A x has M elements
+        total_size = a_size
+        total_size += x_size
+        total_size += y_size
+        total_size += tmp_size
+        du.validate_tcdm_footprint(total_size)  # presumably rejects sizes exceeding the TCDM -- TODO confirm
+
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
+        # Validate parameters
+        self.validate(**kwargs)
+
         M, N = kwargs['M'], kwargs['N']
         A = du.generate_random_array((M, N))
         x = du.generate_random_array((N, 1))
         y = self.golden_model(A, x)
 
-        assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
-        assert (N % 8) == 0, "N must be an integer multiple of the number of cores"
-
         A = A.flatten()
         x = x.flatten()
         y = y.flatten()

@@ -0,0 +1,16 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <[email protected]>
+
+#pragma once
+#include <stdint.h>
+
+typedef struct {  // job arguments consumed by atax_job()
+    uint32_t M;  // number of rows of A
+    uint32_t N;  // number of columns of A; also the length of x and y
+    uint64_t A_addr;  // address of A; atax_job() casts it to double *
+    uint64_t x_addr;  // address of the input vector x (N doubles)
+    uint64_t y_addr;  // address of the output vector y (N doubles)
+} atax_args_t;
@@ -6,43 +6,114 @@
 //         Luca Colagrande <[email protected]>
 
 #include <stdint.h>
+#include "args.h"
+#include "blas.h"
 #include "snrt.h"
 
-void kernel_atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
-                 double *tmp) {
+static inline void atax(uint32_t M, uint32_t N, double *A, double *x, double *y,
+                        double *tmp) {  // y = A^T * (A * x); A is M x N, row-major
     double tmp_fs;
-    int core_range, core_offset;
+    int core_range, core_offset, cluster_core_offset;  // last: this core's start index in y
 
     // tmp = A * x
     if (snrt_is_compute_core()) {
-        core_range = M / snrt_cluster_compute_core_num();
-        core_offset = snrt_cluster_core_idx() * core_range;
-        for (int i1 = 0; i1 < core_range; i1++) {
-            int i = core_offset + i1;
-            tmp_fs = 0.0;
-            for (int j = 0; j < N; j++) {
-                tmp_fs += A[i * N + j] * x[j];
-            }
-            tmp[i] = tmp_fs;
-        }
+        snrt_mcycle();  // presumably marks a profiling region boundary -- TODO confirm
+        gemv(0, M, N, 1, A, x, 1, tmp);  // roles of the 0/1/1 arguments: see gemv's declaration
+        snrt_mcycle();
     }
 
     snrt_cluster_hw_barrier();
 
     // y = At * tmp
     if (snrt_is_compute_core()) {
-        core_range = N / snrt_cluster_compute_core_num();
-        core_offset = snrt_cluster_core_idx() * core_range;
+        snrt_mcycle();
+        core_range = N / snrt_global_compute_core_num();  // columns per core; division truncates
+        core_offset = snrt_global_compute_core_idx() * core_range;  // first column of A for this core
+        cluster_core_offset = snrt_cluster_core_idx() * core_range;  // same start, relative to this cluster
         for (int j1 = 0; j1 < core_range; j1++) {
             int j = core_offset + j1;
+            int cluster_j = cluster_core_offset + j1;  // write position in y
             tmp_fs = 0.0;
             for (int i = 0; i < M; i++) {
                 // The order of the for loops was exchanged, so that each loop
                 // reduces in y at position j, iterating through the i
                 // positions.
                 tmp_fs += A[i * N + j] * tmp[i];
             }
-            y[j] = tmp_fs;
+            y[cluster_j] = tmp_fs;  // y is indexed per cluster, while A uses the global column j
         }
+        snrt_fpu_fence();  // presumably waits for pending FP results before the timestamp -- TODO confirm
+        snrt_mcycle();
+    }
+}
+
+void atax_job(void *args) {  // stage A and x locally, run atax(), write this cluster's y tile back
+    double *local_A;
+    double *local_x;
+    double *local_y;
+    double *local_tmp;
+    atax_args_t *local_args;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    local_args = (atax_args_t *)snrt_l1_alloc_cluster_local(sizeof(atax_args_t),
+                                                            sizeof(double));
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {  // only the DM core issues the DMA transfer
+        snrt_dma_start_1d(local_args, args, sizeof(atax_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();  // placed before any core dereferences local_args
+#else
+    local_args = (atax_args_t *)args;  // arguments are used in place, no copy
+#endif
+
+    // Aliases
+    uint32_t M = local_args->M;
+    uint32_t N = local_args->N;
+    double *A = (double *)(local_args->A_addr);
+    double *x = (double *)(local_args->x_addr);
+    double *y = (double *)(local_args->y_addr);
+
+    // Allocate local variables
+    size_t size_A = M * N * sizeof(double);
+    size_t size_x = N * sizeof(double);
+    size_t size_y = N * sizeof(double);
+    size_t size_tmp = M * sizeof(double);
+    size_t size_y_tile = size_y / snrt_cluster_num();  // y is split evenly across clusters
+    local_A = snrt_l1_alloc_cluster_local(size_A, sizeof(double));
+    local_x = snrt_l1_alloc_cluster_local(size_x, sizeof(double));
+    local_y = snrt_l1_alloc_cluster_local(size_y_tile, sizeof(double));  // one tile only, not all of y
+    local_tmp = snrt_l1_alloc_cluster_local(size_tmp, sizeof(double));
+
+    // Initialize input matrices
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_A, A, size_A);  // the whole of A is copied, not a tile
+        snrt_dma_start_1d(local_x, x, size_x);
+        snrt_dma_wait_all();
+    }
+    snrt_mcycle();  // presumably marks a profiling region boundary -- TODO confirm
+    snrt_cluster_hw_barrier();
+
+    // Compute
+    atax(M, N, local_A, local_x, local_y, local_tmp);
+    snrt_cluster_hw_barrier();
+    snrt_mcycle();
+
+    // Writeback results
+    if (snrt_is_dm_core()) {
+        snrt_dma_store_1d_tile(y, local_y, snrt_cluster_idx(),
+                               N / snrt_cluster_num(), sizeof(double));  // args after local_y look like tile index, tile length, element size -- TODO confirm
+        snrt_dma_wait_all();
+        snrt_mcycle();
     }
+    snrt_cluster_hw_barrier();
+
+    // Free memory (presumably rewinds the L1 allocator to this job's first allocation -- TODO confirm)
+#ifndef JOB_ARGS_PRELOADED
+    snrt_l1_update_next_v2(local_args);
+#else
+    snrt_l1_update_next_v2(local_A);
+#endif
 }
@@ -12,46 +12,16 @@
 
 int main() {
     uint32_t nerr = 0;
-    double *local_A;
-    double *local_x;
-    double *local_y;
-    double *local_tmp;
 
-    // Allocate local variables
-    local_A = snrt_l1_next();
-    local_x = local_A + M * N;
-    local_y = local_x + N;
-    local_tmp = local_y + N;
-
-    // Initialize input matrices
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(local_A, A, sizeof(double) * M * N);
-        snrt_dma_start_1d(local_x, x, sizeof(double) * N);
-        snrt_dma_start_1d(local_y, (void *)snrt_zero_memory_ptr(),
-                          sizeof(double) * N);
-        snrt_dma_start_1d(local_tmp, (void *)snrt_zero_memory_ptr(),
-                          sizeof(double) * M);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
-
-    // Compute
-    kernel_atax(M, N, local_A, local_x, local_y, local_tmp);
-    snrt_cluster_hw_barrier();
-
-    // Writeback results
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(y, local_y, sizeof(double) * N);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
+    atax_args_t args = {M, N, (uint64_t)A, (uint64_t)x, (uint64_t)y};
+    atax_job(&args);
 
 // Check computation is correct
 #ifdef BIST
     if (snrt_cluster_core_idx() == 0) {
         // Check y
         for (int i = 0; i < N; i++) {
-            double diff = fabs(golden[i] - local_y[i]);
+            double diff = fabs(golden[i] - y[i]);
             if (diff > MAX_ERROR) {
                 nerr++;
             }

@@ -12,12 +12,14 @@ SECTION         ?=
 DATA_H          := $($(APP)_BUILD_DIR)/data.h
 DATAGEN_PY       = $(SCRIPTS_DIR)/datagen.py
 
-$(APP)_HEADERS := $(DATA_H)
-$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR)
+$(APP)_HEADERS      := $(DATA_H)
+$(APP)_INCDIRS      += $(dir $(DATA_H)) $(SRC_DIR)
+$(APP)_DATAGEN_ARGS += -c $($(APP)_DATA_CFG)
+$(APP)_DATAGEN_ARGS += --section="$(SECTION)"
 
 $(dir $(DATA_H)):
 	mkdir -p $@
 
-$(DATA_H): DATA_CFG := $($(APP)_DATA_CFG)
+$(DATA_H): DATAGEN_ARGS := $($(APP)_DATAGEN_ARGS)
 $(DATA_H): $(DATAGEN_PY) $($(APP)_DATA_CFG) | $(dir $(DATA_H))
-	$< -c $(DATA_CFG) --section="$(SECTION)" $@
+	$< $(DATAGEN_ARGS) $@
@@ -3,7 +3,7 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
-# Author: Jose Pedro Castro Fonseca <jose.pc.fonseca@gmail, [email protected]>
+# Author: Jose Pedro Castro Fonseca <[email protected]>
 #         Luca Colagrande <[email protected]>
 
 import numpy as np
@@ -21,9 +21,24 @@ class CorrelationDataGen(du.DataGen):
     def golden_model(self, data):
         return np.corrcoef(data, rowvar=False)
 
+    def validate(self, M, N, **kwargs):  # called by emit_header before any data is generated
+        assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
+
+        # Calculate total TCDM occupation (sum of all buffers, 8 bytes per element)
+        data_size = N * M * 8  # data is generated as an N x M array
+        corr_size = M * M * 8  # corrcoef with rowvar=False yields an M x M matrix
+        stddev_size = M * 8  # presumably one standard deviation per data column -- TODO confirm
+        total_size = data_size
+        total_size += corr_size
+        total_size += stddev_size
+        du.validate_tcdm_footprint(total_size)  # presumably rejects sizes exceeding the TCDM -- TODO confirm
+
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
+        # Validate parameters
+        self.validate(**kwargs)
+
         M, N = kwargs['M'], kwargs['N']
         data = du.generate_random_array((N, M))
         corr = self.golden_model(data)

@@ -0,0 +1,15 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <[email protected]>
+
+#pragma once
+#include <stdint.h>
+
+typedef struct {  // presumably the job arguments of the correlation kernel -- TODO confirm
+    uint32_t N;  // presumably the number of data rows (datagen builds data as N x M) -- TODO confirm
+    uint32_t M;  // presumably the number of data columns -- TODO confirm
+    uint64_t data_addr;  // presumably the address of the input data array -- TODO confirm
+    uint64_t corr_addr;  // presumably the address of the output correlation matrix -- TODO confirm
+} correlation_args_t;