From 22f5c13946dffb72f86796c90a8d51cf59c6655f Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Wed, 5 Jun 2024 23:50:01 +0200
Subject: [PATCH] sw: Add DOTP benchmark

---
 sw/blas/blas.h                                |   3 +-
 sw/blas/dotp/Makefile                         |  31 +++++
 sw/blas/dotp/data/params.json                 |   7 ++
 sw/blas/dotp/scripts/datagen.py               |  51 +++++++++
 sw/blas/dotp/scripts/verify.py                |  32 ++++++
 sw/blas/dotp/src/dotp.h                       |  88 ++++++++++++++
 sw/blas/dotp/src/main.c                       | 107 ++++++++++++++++++
 target/snitch_cluster/sw.mk                   |   1 +
 .../snitch_cluster/sw/apps/blas/dotp/Makefile |  10 ++
 9 files changed, 329 insertions(+), 1 deletion(-)
 create mode 100644 sw/blas/dotp/Makefile
 create mode 100644 sw/blas/dotp/data/params.json
 create mode 100755 sw/blas/dotp/scripts/datagen.py
 create mode 100755 sw/blas/dotp/scripts/verify.py
 create mode 100644 sw/blas/dotp/src/dotp.h
 create mode 100644 sw/blas/dotp/src/main.c
 create mode 100644 target/snitch_cluster/sw/apps/blas/dotp/Makefile

diff --git a/sw/blas/blas.h b/sw/blas/blas.h
index a7910d25e2..9207bf6f74 100644
--- a/sw/blas/blas.h
+++ b/sw/blas/blas.h
@@ -5,4 +5,5 @@
 #pragma once
 
 #include "axpy/src/axpy.h"
-#include "gemm/src/gemm.h"
\ No newline at end of file
+#include "gemm/src/gemm.h"
+#include "dotp/src/dotp.h"
diff --git a/sw/blas/dotp/Makefile b/sw/blas/dotp/Makefile
new file mode 100644
index 0000000000..49ff75b883
--- /dev/null
+++ b/sw/blas/dotp/Makefile
@@ -0,0 +1,31 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+# Usage of absolute paths is required to externally include this Makefile
+MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+DATA_DIR := $(realpath $(MK_DIR)/data)
+SRC_DIR  := $(realpath $(MK_DIR)/src)
+# Data generation knobs; `?=` keeps any value already set by an including Makefile or the environment
+DATA_CFG ?= $(DATA_DIR)/params.json
+SECTION  ?=
+# Application name, sources and include dirs (INCDIRS expands lazily, so DATA_H may be set further down)
+APP     ?= dotp
+SRCS    ?= $(realpath $(SRC_DIR)/main.c)
+INCDIRS ?= $(dir $(DATA_H)) $(SRC_DIR)
+# Generator script and the header it writes
+DATAGEN_PY = $(MK_DIR)/scripts/datagen.py
+DATA_H    ?= $(DATA_DIR)/data.h
+
+$(dir $(DATA_H)):
+	mkdir -p $@
+# Generate the data header; remove the truncated file if the generator fails so it never looks up to date
+$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) | $(dir $(DATA_H))
+	$< -c $(DATA_CFG) --section="$(SECTION)" > $@ || { rm -f $@; exit 1; }
+
+.PHONY: clean-data clean
+# Remove only the generated header
+clean-data:
+	rm -f $(DATA_H)
+# Hook the data cleanup into the generic `clean` target
+clean: clean-data
diff --git a/sw/blas/dotp/data/params.json b/sw/blas/dotp/data/params.json
new file mode 100644
index 0000000000..66dfcf770f
--- /dev/null
+++ b/sw/blas/dotp/data/params.json
@@ -0,0 +1,7 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    n: 4096
+}
diff --git a/sw/blas/dotp/scripts/datagen.py b/sw/blas/dotp/scripts/datagen.py
new file mode 100755
index 0000000000..94a5e1be1c
--- /dev/null
+++ b/sw/blas/dotp/scripts/datagen.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import os
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+from data_utils import format_scalar_definition, format_array_definition, \
+                       format_array_declaration, format_ifdef_wrapper, DataGen  # noqa: E402
+
+
+class AxpyDataGen(DataGen):
+    """Data generator for the DOTP kernel; verify.py imports it under this name."""
+
+    MIN = -1000
+    MAX = +1000
+    # AXI splits bursts crossing 4KB address boundaries. To minimize
+    # the occurrence of these splits the data should be aligned to 4KB
+    BURST_ALIGNMENT = 4096
+
+    def golden_model(self, x, y):
+        """Return the reference dot product of x and y."""
+        return np.dot(x, y)
+
+    def emit_header(self, **kwargs):
+        """Build the header text for n, x, y, z and the BIST-wrapped golden result g."""
+        parts = [super().emit_header()]
+        n = kwargs['n']
+        # Random operands between MIN and MAX; x is drawn before y
+        x = np.random.uniform(self.MIN, self.MAX, n)
+        y = np.random.uniform(self.MIN, self.MAX, n)
+        g = self.golden_model(x, y)
+
+        assert (n % 8) == 0, "n must be an integer multiple of the number of cores"
+
+        placement = dict(alignment=self.BURST_ALIGNMENT, section=kwargs['section'])
+        parts.append(format_scalar_definition('const uint32_t', 'n', n))
+        parts.append(format_array_definition('double', 'x', x, **placement))
+        parts.append(format_array_definition('double', 'y', y, **placement))
+        parts.append(format_array_declaration('double', 'z', [n], **placement))
+        # g is wrapped with the 'BIST' macro name; main.c only reads it under #ifdef BIST
+        parts.append(format_ifdef_wrapper('BIST', format_scalar_definition('double', 'g', g)))
+
+        return '\n\n'.join(parts)
+
+
+if __name__ == '__main__':
+    sys.exit(AxpyDataGen().main())
diff --git a/sw/blas/dotp/scripts/verify.py b/sw/blas/dotp/scripts/verify.py
new file mode 100755
index 0000000000..5ea42423e0
--- /dev/null
+++ b/sw/blas/dotp/scripts/verify.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+from pathlib import Path
+from datagen import AxpyDataGen
+
+sys.path.append(str(Path(__file__).parent / '../../../../util/sim/'))
+from verif_utils import Verifier  # noqa: E402
+
+
+class AxpyVerifier(Verifier):  # verifier for the DOTP kernel
+    # NOTE(review): 'z' is declared with n entries but golden_model returns a scalar - confirm.
+    OUTPUT_UIDS = ['z']
+
+    def get_actual_results(self):
+        return self.get_output_from_symbol('z', 'double')
+
+    def get_expected_results(self):
+        # DOTP has no scalar 'a' input: the golden model is golden_model(x, y).
+        x = self.get_input_from_symbol('x', 'double')
+        y = self.get_input_from_symbol('y', 'double')
+        return AxpyDataGen().golden_model(x, y)
+
+    def check_results(self, *args):
+        return super().check_results(*args, rtol=1e-10)
+
+
+if __name__ == "__main__":
+    sys.exit(AxpyVerifier().main())
diff --git a/sw/blas/dotp/src/dotp.h b/sw/blas/dotp/src/dotp.h
new file mode 100644
index 0000000000..f2052e7397
--- /dev/null
+++ b/sw/blas/dotp/src/dotp.h
@@ -0,0 +1,88 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "snrt.h"
+
+// Sequential dot product: stores the sum of input_A[i] * input_B[i], i < N, to output[0].
+// input_A and input_B are set up as 1D SSR read streams; the FREP'd fmadd.d consumes ft0/ft1.
+// N must be at least 1: N - 1 is passed as the frep count, so N == 0 would underflow.
+static inline void dotp_seq(uint32_t N, double *input_A, double *input_B, double *output) {
+    // Start of SSR region.
+    register volatile double ft0 asm("ft0");
+    register volatile double ft1 asm("ft1");
+    asm volatile("" : "=f"(ft0), "=f"(ft1));
+
+    snrt_ssr_loop_1d(SNRT_SSR_DM0, N, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM1, N, sizeof(double));
+
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, input_A);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, input_B);
+
+    register volatile double res_ssr asm("fs0") = 0;
+
+    snrt_ssr_enable();
+
+    const register uint32_t Nm1 asm("t0") = N - 1;
+    asm volatile(
+        "frep.o %[n_frep], 1, 0, 0 \n"
+        "fmadd.d %0, ft0, ft1, %0"
+        : "=f"(res_ssr)                                      /* output operands */
+        : "f"(ft0), "f"(ft1), "0"(res_ssr), [n_frep]"r"(Nm1) /* input operands */
+        :);
+
+    // End of SSR region.
+    snrt_fpu_fence();
+    snrt_ssr_disable();
+    asm volatile("" : : "f"(ft0), "f"(ft1));
+    output[0] = res_ssr;
+}
+
+// Dot product with four independent accumulators (fs0-fs3) in one FREP body, reduced at the end.
+// input_A / input_B are set up as 1D SSR read streams of N doubles; result goes to output[0].
+// N must be a non-zero multiple of 4: the frep count is (N >> 2) - 1, with 4 fmadd.d per pass.
+static inline void dotp_seq_4_acc(uint32_t N, double *input_A, double *input_B, double *output) {
+    // Start of SSR region.
+    register volatile double ft0 asm("ft0");
+    register volatile double ft1 asm("ft1");
+    asm volatile("" : "=f"(ft0), "=f"(ft1));
+
+    snrt_ssr_loop_1d(SNRT_SSR_DM0, N, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM1, N, sizeof(double));
+
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, input_A);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, input_B);
+
+    register volatile double res_ssr_0 asm("fs0") = 0;
+    register volatile double res_ssr_1 asm("fs1") = 0;
+    register volatile double res_ssr_2 asm("fs2") = 0;
+    register volatile double res_ssr_3 asm("fs3") = 0;
+
+    snrt_ssr_enable();
+
+    const register uint32_t Nm1 asm("t0") = (N >> 2) - 1;
+    asm volatile(
+        "frep.o %[n_frep], 4, 0, 0 \n"
+        "fmadd.d %0, ft0, ft1, %0 \n"
+        "fmadd.d %1, ft0, ft1, %1 \n"
+        "fmadd.d %2, ft0, ft1, %2 \n"
+        "fmadd.d %3, ft0, ft1, %3"
+        : "=f"(res_ssr_0), "=f"(res_ssr_1), "=f"(res_ssr_2), "=f"(res_ssr_3) /* output operands */
+        : "f"(ft0), "f"(ft1), "0"(res_ssr_0), "1"(res_ssr_1), "2"(res_ssr_2), "3"(res_ssr_3), [n_frep]"r"(Nm1) /* input operands */
+        :);
+
+    // End of SSR region.
+    snrt_fpu_fence();
+    snrt_ssr_disable();
+    // Tree-reduce the four partial sums; res_ssr_0/res_ssr_2 are read and written, hence "+f".
+    asm volatile(
+        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_1] \n"
+        "fadd.d %[res_ssr_2], %[res_ssr_2], %[res_ssr_3] \n"
+        "fadd.d %[res_ssr_0], %[res_ssr_0], %[res_ssr_2]"
+        : [res_ssr_0]"+f"(res_ssr_0), [res_ssr_2]"+f"(res_ssr_2) /* read-write operands */
+        : [res_ssr_1]"f"(res_ssr_1), [res_ssr_3]"f"(res_ssr_3)   /* input operands */
+        :);
+
+    asm volatile("" : : "f"(ft0), "f"(ft1));
+    output[0] = res_ssr_0;
+}
diff --git a/sw/blas/dotp/src/main.c b/sw/blas/dotp/src/main.c
new file mode 100644
index 0000000000..c289952efa
--- /dev/null
+++ b/sw/blas/dotp/src/main.c
@@ -0,0 +1,107 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "snrt.h"
+
+#include "printf.h"
+
+#define XSSR
+#include "dotp.h"
+#include "data.h"
+
+int main() {
+    // DOTP benchmark: the DM core copies x and y into TCDM, each compute core reduces its
+    // slice with dotp_seq_4_acc, then core 0 adds the per-core partial sums and prints timing.
+    double *local_x, *local_y, *local_z;
+    double *remote_x, *remote_y, *remote_z;
+    volatile double sum = 0;  // zero-init: BIST reads it even if the final sum is compiled out
+    uint32_t start_cycle, end_cycle;
+
+    // Calculate size and pointers for each cluster
+    uint32_t frac = n / snrt_cluster_num();
+    uint32_t offset = frac * snrt_cluster_idx();
+    remote_x = x + offset;
+    remote_y = y + offset;
+    remote_z = z + snrt_cluster_idx();
+
+    // Allocate space in TCDM
+    local_x = (double *)snrt_l1_next();
+    local_y = local_x + frac;
+    local_z = local_y + frac;
+
+    // Copy data in TCDM
+    if (snrt_is_dm_core()) {
+        size_t size = frac * sizeof(double);
+        snrt_dma_start_1d(local_x, remote_x, size);
+        snrt_dma_start_1d(local_y, remote_y, size);
+        snrt_dma_wait_all();
+    }
+
+    // Calculate TCDM size and pointers for each core (each gets a slice of this cluster's frac)
+    int core_idx = snrt_cluster_core_idx();
+    int frac_core = frac / snrt_cluster_compute_core_num();
+    int offset_core = core_idx * frac_core;
+    local_x += offset_core;
+    local_y += offset_core;
+    local_z += core_idx;
+
+    snrt_cluster_hw_barrier();
+
+    // Compute
+    if (!snrt_is_dm_core()) {
+        start_cycle = snrt_mcycle();
+        dotp_seq_4_acc(frac_core, local_x, local_y, local_z);
+        snrt_cluster_hw_barrier();
+#ifndef _DOTP_EXCLUDE_FINAL_SYNC_
+        if (!snrt_cluster_core_idx()) {
+            sum = 0;
+            for (uint32_t i = 0; i < snrt_cluster_compute_core_num(); ++i) {
+                sum += local_z[i];
+            }
+        }
+        snrt_fpu_fence();
+#endif
+        end_cycle = snrt_mcycle();
+    } else {
+        snrt_cluster_hw_barrier();  // matches the compute cores' barrier above
+    }
+
+    snrt_cluster_hw_barrier();
+
+    if (!snrt_cluster_core_idx()) {
+        unsigned int runtime = end_cycle - start_cycle;
+        double performance = (double)(2 * n - 1) / runtime;
+        double util = 100 * (performance / (2 * snrt_cluster_compute_core_num()));
+
+        printf("Core %d execution time: %u cycles\nPerformance: %f DP-FLOP/Cycle\nUtilization: %f%%\n",
+               snrt_cluster_core_idx(), runtime, performance, util);
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Copy data out of TCDM. NOTE(review): local_z was offset by core_idx above - confirm
+    if (snrt_is_dm_core()) {
+        size_t size = frac_core * sizeof(double);
+        snrt_dma_start_1d(remote_z, local_z, size);
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+// TODO: currently only works for single cluster otherwise need to
+//       synchronize all cores here
+#ifdef BIST
+    uint32_t nerr = 1;
+
+    // Check computation is correct
+    if (snrt_global_core_idx() == 0) {
+        if (sum == g) nerr--;
+        printf("%f %f\n", sum, g);
+    }
+
+    return nerr;
+#endif
+
+    return 0;
+}
diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk
index 1415fcb4e4..329b606a57 100644
--- a/target/snitch_cluster/sw.mk
+++ b/target/snitch_cluster/sw.mk
@@ -42,6 +42,7 @@ APPS  = sw/apps/lto
 APPS += sw/apps/nop
 APPS += sw/apps/blas/axpy
 APPS += sw/apps/blas/gemm
+APPS += sw/apps/blas/dotp
 APPS += sw/apps/dnn/batchnorm
 APPS += sw/apps/dnn/conv2d
 APPS += sw/apps/dnn/fusedconv
diff --git a/target/snitch_cluster/sw/apps/blas/dotp/Makefile b/target/snitch_cluster/sw/apps/blas/dotp/Makefile
new file mode 100644
index 0000000000..63f748994d
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/blas/dotp/Makefile
@@ -0,0 +1,10 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Matteo Perotti <mperotti@iis.ee.ethz.ch>
+
+include ../../../../../../sw/blas/dotp/Makefile
+include ../../common.mk
+# DATA_H comes from the dotp Makefile above; DEP is presumably defined by common.mk (not shown here)
+$(DEP): $(DATA_H)