From 6d6a94af3fef8b5704e878de14577495e5fc6fb1 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 12 Aug 2024 10:35:40 +0200
Subject: [PATCH 01/19] sw/blas/axpy: Add multiple impls and optimize TCDM
 placement

---
 sw/blas/axpy/data/params.json   |   3 +-
 sw/blas/axpy/scripts/datagen.py |  52 +++++++++++-----
 sw/blas/axpy/src/args.h         |  17 +++++
 sw/blas/axpy/src/axpy.h         | 107 +++++++++++++++++++++++++++-----
 sw/blas/axpy/src/main.c         |  49 ++-------------
 5 files changed, 155 insertions(+), 73 deletions(-)
 create mode 100644 sw/blas/axpy/src/args.h

diff --git a/sw/blas/axpy/data/params.json b/sw/blas/axpy/data/params.json
index 2f8f5871c..ba0e9b476 100644
--- a/sw/blas/axpy/data/params.json
+++ b/sw/blas/axpy/data/params.json
@@ -3,5 +3,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
 {
-    n: 384
+    "n": 384,
+    "funcptr": "axpy_opt"
 }
diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py
index 117495391..48d84bce3 100755
--- a/sw/blas/axpy/scripts/datagen.py
+++ b/sw/blas/axpy/scripts/datagen.py
@@ -9,7 +9,7 @@
 import sys
 
 from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
+    format_array_declaration, format_ifdef_wrapper, format_struct_definition, DataGen
 
 
 class AxpyDataGen(DataGen):
@@ -19,29 +19,53 @@ class AxpyDataGen(DataGen):
     # AXI splits bursts crossing 4KB address boundaries. To minimize
     # the occurrence of these splits the data should be aligned to 4KB
     BURST_ALIGNMENT = 4096
+    # Function pointers to alternative implementations
+    FUNCPTRS = ["axpy_naive", "axpy_fma", "axpy_opt"]
 
     def golden_model(self, a, x, y):
         return a*x + y
 
+    def validate_config(self, **kwargs):
+        assert (kwargs['n'] % 8) == 0, "n must be an integer multiple of the number of cores"
+        assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
+
+        # Calculate total TCDM occupation
+        # Note: doesn't account for double buffering
+        vec_size = kwargs['n'] * 8
+        total_size = 3 * vec_size
+        data_utils.validate_tcdm_footprint(total_size)
+
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
-        n = kwargs['n']
-        a = np.random.uniform(self.MIN, self.MAX, 1)
-        x = np.random.uniform(self.MIN, self.MAX, n)
-        y = np.random.uniform(self.MIN, self.MAX, n)
+        self.validate_config(**kwargs)
+
+        a = np.random.uniform(self.MIN, self.MAX, 1)[0]
+        x = np.random.uniform(self.MIN, self.MAX, kwargs['n'])
+        y = np.random.uniform(self.MIN, self.MAX, kwargs['n'])
         g = self.golden_model(a, x, y)
 
-        assert (n % 8) == 0, "n must be an integer multiple of the number of cores"
+        x_uid = 'x'
+        y_uid = 'y'
+        z_uid = 'z'
+
+        cfg = {
+            'n': kwargs['n'],
+            'a': a,
+            'x': x_uid,
+            'y': y_uid,
+            'z': z_uid,
+            'funcptr': kwargs['funcptr']
+        }
 
-        header += [format_scalar_definition('const uint32_t', 'n', n)]
-        header += [format_scalar_definition('const double', 'a', a[0])]
-        header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT,
-                                           section=kwargs['section'])]
-        header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT,
-                                           section=kwargs['section'])]
-        header += [format_array_declaration('double', 'z', [n], alignment=self.BURST_ALIGNMENT,
-                                            section=kwargs['section'])]
+        header += [format_scalar_definition('const double', 'a', a)]
+        header += [format_array_definition('double', x_uid, x,
+                   alignment=self.BURST_ALIGNMENT, section=kwargs['section'])]
+        header += [format_array_definition('double', y_uid, y,
+                   alignment=self.BURST_ALIGNMENT, section=kwargs['section'])]
+        header += [format_array_declaration('double', z_uid, x.shape,
+                   alignment=self.BURST_ALIGNMENT, section=kwargs['section'])]
+        header += [format_struct_definition('axpy_args_t', 'args', cfg)]
         result_def = format_array_definition('double', 'g', g)
         header += [format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
diff --git a/sw/blas/axpy/src/args.h b/sw/blas/axpy/src/args.h
new file mode 100644
index 000000000..aeaa76745
--- /dev/null
+++ b/sw/blas/axpy/src/args.h
@@ -0,0 +1,17 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <stdint.h>
+
+typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y, double* z);
+
+typedef struct {
+    uint32_t n;
+    double a;
+    double *x;
+    double *y;
+    double *z;
+    axpy_fp_t funcptr;
+} axpy_args_t;
diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h
index e8f5ae6c0..d5ded81af 100644
--- a/sw/blas/axpy/src/axpy.h
+++ b/sw/blas/axpy/src/axpy.h
@@ -2,28 +2,47 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+#include "args.h"
 #include "snrt.h"
 
-inline void axpy(uint32_t n, double a, double* x, double* y, double* z) {
+#define BANK_ALIGNMENT 8
+#define TCDM_ALIGNMENT (32 * BANK_ALIGNMENT)
+#define ALIGN_UP_TCDM(addr) ALIGN_UP(addr, TCDM_ALIGNMENT)
+
+static inline void axpy_naive(uint32_t n, double a, double* x, double* y, double* z) {
     int core_idx = snrt_cluster_core_idx();
     int frac = n / snrt_cluster_compute_core_num();
-    int offset = core_idx * frac;
+    int offset = core_idx;
+
+    for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) {
+        z[i] = a * x[i] + y[i];
+    }
+    snrt_fpu_fence();
+}
 
-#ifndef XSSR
+static inline void axpy_fma(uint32_t n, double a, double* x, double* y, double* z) {
+    int core_idx = snrt_cluster_core_idx();
+    int frac = n / snrt_cluster_compute_core_num();
+    int offset = core_idx;
 
-    for (int i = 0; i < frac; i++) {
-        z[offset] = a * x[offset] + y[offset];
-        offset++;
+    for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) {
+        asm volatile (
+            "fmadd.d %[z], %[a], %[x], %[y] \n"
+            : [ z ]"=f"(z[i])
+            : [ a ]"f"(a), [ x ]"f"(x[i]), [ y ]"f"(y[i])
+        );
     }
     snrt_fpu_fence();
+}
 
-#else
+static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double* z) {
+    int core_idx = snrt_cluster_core_idx();
+    int frac = n / snrt_cluster_compute_core_num();
+    int offset = core_idx;
 
-    // TODO(colluca): revert once Banshee supports SNRT_SSR_DM_ALL
-    // snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, frac, sizeof(double));
-    snrt_ssr_loop_1d(SNRT_SSR_DM0, frac, sizeof(double));
-    snrt_ssr_loop_1d(SNRT_SSR_DM1, frac, sizeof(double));
-    snrt_ssr_loop_1d(SNRT_SSR_DM2, frac, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM_ALL,
+                     frac,
+                     snrt_cluster_compute_core_num() * sizeof(double));
 
     snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x + offset);
     snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, y + offset);
@@ -36,10 +55,70 @@ inline void axpy(uint32_t n, double a, double* x, double* y, double* z) {
         "fmadd.d ft2, %[a], ft0, ft1\n"
         :
         : [ n_frep ] "r"(frac - 1), [ a ] "f"(a)
-        : "ft0", "ft1", "ft2", "memory");
-
+        : "ft0", "ft1", "ft2", "memory"
+    );
+    
     snrt_fpu_fence();
     snrt_ssr_disable();
+}
 
+static inline void axpy_job(axpy_args_t *args) {
+    uint64_t local_x_addr, local_y_addr, local_z_addr;
+    double *local_x, *local_y, *local_z;
+    double *remote_x, *remote_y, *remote_z;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    axpy_args_t *local_args = (axpy_args_t *)snrt_l1_next();
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(axpy_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+    args = local_args;
 #endif
+
+    // Calculate size and pointers for each cluster
+    uint32_t frac = args->n / snrt_cluster_num();
+    uint32_t offset = frac * snrt_cluster_idx();
+    remote_x = args->x + offset;
+    remote_y = args->y + offset;
+    remote_z = args->z + offset;
+
+    // Allocate space for job operands in TCDM
+    // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th.
+    local_x_addr = ALIGN_UP_TCDM((uint64_t)args + sizeof(axpy_args_t));
+    local_y_addr = ALIGN_UP_TCDM(local_x_addr + frac * sizeof(double)) + 8 * BANK_ALIGNMENT;
+    local_z_addr = ALIGN_UP_TCDM(local_y_addr + frac * sizeof(double)) + 16 * BANK_ALIGNMENT;
+    local_x = (double *)local_x_addr;
+    local_y = (double *)local_y_addr;
+    local_z = (double *)local_z_addr;
+
+    // Copy job operands in TCDM
+    if (snrt_is_dm_core()) {
+        size_t size = frac * sizeof(double);
+        snrt_dma_start_1d(local_x, remote_x, size);
+        snrt_dma_start_1d(local_y, remote_y, size);
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+
+    // Compute
+    if (!snrt_is_dm_core()) {
+        axpy_fp_t fp = args->funcptr;
+        uint32_t start_cycle = snrt_mcycle();
+        fp(frac, args->a, local_x, local_y, local_z);
+        uint32_t end_cycle = snrt_mcycle();
+    }
+    snrt_cluster_hw_barrier();
+
+    // Copy data out of TCDM
+    if (snrt_is_dm_core()) {
+        size_t size = frac * sizeof(double);
+        snrt_dma_start_1d(remote_z, local_z, size);
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
 }
diff --git a/sw/blas/axpy/src/main.c b/sw/blas/axpy/src/main.c
index 22f3dd129..83cb58ae8 100644
--- a/sw/blas/axpy/src/main.c
+++ b/sw/blas/axpy/src/main.c
@@ -4,64 +4,25 @@
 
 #include "snrt.h"
 
-#define XSSR
 #include "axpy.h"
 #include "data.h"
 
 int main() {
-    double *local_x, *local_y, *local_z;
-    double *remote_x, *remote_y, *remote_z;
 
-    // Calculate size and pointers for each cluster
-    uint32_t frac = n / snrt_cluster_num();
-    uint32_t offset = frac * snrt_cluster_idx();
-    remote_x = x + offset;
-    remote_y = y + offset;
-    remote_z = z + offset;
-
-    // Allocate space in TCDM
-    local_x = (double *)snrt_l1_next();
-    local_y = local_x + frac;
-    local_z = local_y + frac;
-
-    // Copy data in TCDM
-    if (snrt_is_dm_core()) {
-        size_t size = frac * sizeof(double);
-        snrt_dma_start_1d(local_x, remote_x, size);
-        snrt_dma_start_1d(local_y, remote_y, size);
-        snrt_dma_wait_all();
-    }
-
-    snrt_cluster_hw_barrier();
-
-    // Compute
-    if (!snrt_is_dm_core()) {
-        uint32_t start_cycle = snrt_mcycle();
-        axpy(frac, a, local_x, local_y, local_z);
-        uint32_t end_cycle = snrt_mcycle();
-    }
-
-    snrt_cluster_hw_barrier();
-
-    // Copy data out of TCDM
-    if (snrt_is_dm_core()) {
-        size_t size = frac * sizeof(double);
-        snrt_dma_start_1d(remote_z, local_z, size);
-        snrt_dma_wait_all();
-    }
-
-    snrt_cluster_hw_barrier();
+    axpy_job(&args);
 
 // TODO: currently only works for single cluster otherwise need to
 //       synchronize all cores here
 #ifdef BIST
+    uint32_t n = args.n;
+    double* z = args.z;
     uint32_t nerr = n;
 
     // Check computation is correct
     if (snrt_global_core_idx() == 0) {
         for (int i = 0; i < n; i++) {
-            if (local_z[i] == g[i]) nerr--;
-            printf("%d %d\n", local_z[i], g[i]);
+            if (z[i] == g[i]) nerr--;
+            printf("%f %f\n", z[i], g[i]);
         }
     }
 

From ca26488027d90e83416a7a1c1a8a10189c50f260 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 12 Aug 2024 11:03:58 +0200
Subject: [PATCH 02/19] sw/blas/axpy: Support multiple tiles

---
 sw/blas/axpy/data/params.json   |  1 +
 sw/blas/axpy/scripts/datagen.py |  7 +++-
 sw/blas/axpy/src/args.h         |  1 +
 sw/blas/axpy/src/axpy.h         | 72 +++++++++++++++++++--------------
 4 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/sw/blas/axpy/data/params.json b/sw/blas/axpy/data/params.json
index ba0e9b476..a4fa15275 100644
--- a/sw/blas/axpy/data/params.json
+++ b/sw/blas/axpy/data/params.json
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 {
+    "n_tiles": 3,
     "n": 384,
     "funcptr": "axpy_opt"
 }
diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py
index 48d84bce3..af91d886d 100755
--- a/sw/blas/axpy/scripts/datagen.py
+++ b/sw/blas/axpy/scripts/datagen.py
@@ -26,12 +26,14 @@ def golden_model(self, a, x, y):
         return a*x + y
 
     def validate_config(self, **kwargs):
-        assert (kwargs['n'] % 8) == 0, "n must be an integer multiple of the number of cores"
+        assert kwargs['n'] % kwargs['n_tiles'] == 0, "n must be an integer multiple of n_tiles"
+        n_per_tile = kwargs['n'] // kwargs['n_tiles']
+        assert (n_per_tile % 8) == 0, "n_per_tile must be a multiple of the number of cores"
         assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
 
         # Calculate total TCDM occupation
         # Note: doesn't account for double buffering
-        vec_size = kwargs['n'] * 8
+        vec_size = n_per_tile * 8
         total_size = 3 * vec_size
         data_utils.validate_tcdm_footprint(total_size)
 
@@ -55,6 +57,7 @@ def emit_header(self, **kwargs):
             'x': x_uid,
             'y': y_uid,
             'z': z_uid,
+            'n_tiles': kwargs['n_tiles'],
             'funcptr': kwargs['funcptr']
         }
 
diff --git a/sw/blas/axpy/src/args.h b/sw/blas/axpy/src/args.h
index aeaa76745..0efe3a2b4 100644
--- a/sw/blas/axpy/src/args.h
+++ b/sw/blas/axpy/src/args.h
@@ -13,5 +13,6 @@ typedef struct {
     double *x;
     double *y;
     double *z;
+    uint32_t n_tiles;
     axpy_fp_t funcptr;
 } axpy_args_t;
diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h
index d5ded81af..a1fc7cda6 100644
--- a/sw/blas/axpy/src/axpy.h
+++ b/sw/blas/axpy/src/axpy.h
@@ -63,6 +63,7 @@ static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double*
 }
 
 static inline void axpy_job(axpy_args_t *args) {
+    uint32_t frac, offset, size;
     uint64_t local_x_addr, local_y_addr, local_z_addr;
     double *local_x, *local_y, *local_z;
     double *remote_x, *remote_y, *remote_z;
@@ -80,12 +81,8 @@ static inline void axpy_job(axpy_args_t *args) {
     args = local_args;
 #endif
 
-    // Calculate size and pointers for each cluster
-    uint32_t frac = args->n / snrt_cluster_num();
-    uint32_t offset = frac * snrt_cluster_idx();
-    remote_x = args->x + offset;
-    remote_y = args->y + offset;
-    remote_z = args->z + offset;
+    // Calculate size of each tile
+    frac = args->n / args->n_tiles;
 
     // Allocate space for job operands in TCDM
     // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th.
@@ -96,29 +93,44 @@ static inline void axpy_job(axpy_args_t *args) {
     local_y = (double *)local_y_addr;
     local_z = (double *)local_z_addr;
 
-    // Copy job operands in TCDM
-    if (snrt_is_dm_core()) {
-        size_t size = frac * sizeof(double);
-        snrt_dma_start_1d(local_x, remote_x, size);
-        snrt_dma_start_1d(local_y, remote_y, size);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
-
-    // Compute
-    if (!snrt_is_dm_core()) {
-        axpy_fp_t fp = args->funcptr;
-        uint32_t start_cycle = snrt_mcycle();
-        fp(frac, args->a, local_x, local_y, local_z);
-        uint32_t end_cycle = snrt_mcycle();
+    // Iterate over multiple tiles
+    for (int i = 0; i < args->n_tiles; i++) {
+
+        // DMA in
+        if (snrt_is_dm_core()) {
+
+            // Calculate size and pointers to current tile
+            size = frac * sizeof(double);
+            offset = i * frac;
+            remote_x = args->x + offset;
+            remote_y = args->y + offset;
+
+            // Copy job operands in TCDM
+            snrt_dma_start_1d(local_x, remote_x, size);
+            snrt_dma_start_1d(local_y, remote_y, size);
+            snrt_dma_wait_all();
+        }
+        snrt_cluster_hw_barrier();
+
+        // Compute
+        if (!snrt_is_dm_core()) {
+            axpy_fp_t fp = args->funcptr;
+            uint32_t start_cycle = snrt_mcycle();
+            fp(frac, args->a, local_x, local_y, local_z);
+            uint32_t end_cycle = snrt_mcycle();
+        }
+        snrt_cluster_hw_barrier();
+
+        // DMA out
+        if (snrt_is_dm_core()) {
+
+            // Calculate pointers to current tile
+            remote_z = args->z + offset;
+            
+            // Copy job outputs from TCDM
+            snrt_dma_start_1d(remote_z, local_z, size);
+            snrt_dma_wait_all();
+        }
+        snrt_cluster_hw_barrier();
     }
-    snrt_cluster_hw_barrier();
-
-    // Copy data out of TCDM
-    if (snrt_is_dm_core()) {
-        size_t size = frac * sizeof(double);
-        snrt_dma_start_1d(remote_z, local_z, size);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
 }

From 7810bf786ec637f32af1768902a70edb6f8bcb62 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 12 Aug 2024 17:58:51 +0200
Subject: [PATCH 03/19] sw/blas/axpy: Add double buffering

---
 sw/blas/axpy/scripts/datagen.py |   5 +-
 sw/blas/axpy/src/axpy.h         | 127 +++++++++++++++++++++++---------
 2 files changed, 94 insertions(+), 38 deletions(-)

diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py
index af91d886d..cf6795667 100755
--- a/sw/blas/axpy/scripts/datagen.py
+++ b/sw/blas/axpy/scripts/datagen.py
@@ -8,6 +8,7 @@
 import numpy as np
 import sys
 
+from snitch.util.sim import data_utils
 from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
     format_array_declaration, format_ifdef_wrapper, format_struct_definition, DataGen
 
@@ -32,9 +33,9 @@ def validate_config(self, **kwargs):
         assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
 
         # Calculate total TCDM occupation
-        # Note: doesn't account for double buffering
+        # Note: doesn't account for gaps created by data alignment
         vec_size = n_per_tile * 8
-        total_size = 3 * vec_size
+        total_size = 2 * 3 * vec_size
         data_utils.validate_tcdm_footprint(total_size)
 
     def emit_header(self, **kwargs):
diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h
index a1fc7cda6..c5df546ab 100644
--- a/sw/blas/axpy/src/axpy.h
+++ b/sw/blas/axpy/src/axpy.h
@@ -5,6 +5,8 @@
 #include "args.h"
 #include "snrt.h"
 
+#define DOUBLE_BUFFER 1
+
 #define BANK_ALIGNMENT 8
 #define TCDM_ALIGNMENT (32 * BANK_ALIGNMENT)
 #define ALIGN_UP_TCDM(addr) ALIGN_UP(addr, TCDM_ALIGNMENT)
@@ -64,9 +66,14 @@ static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double*
 
 static inline void axpy_job(axpy_args_t *args) {
     uint32_t frac, offset, size;
-    uint64_t local_x_addr, local_y_addr, local_z_addr;
-    double *local_x, *local_y, *local_z;
+    uint64_t local_x0_addr, local_y0_addr, local_z0_addr,
+             local_x1_addr, local_y1_addr, local_z1_addr;
+    double *local_x[2];
+    double *local_y[2];
+    double *local_z[2];
     double *remote_x, *remote_y, *remote_z;
+    uint32_t iterations, i, i_dma_in, i_compute, i_dma_out, buff_idx;
+
 
 #ifndef JOB_ARGS_PRELOADED
     // Allocate space for job arguments in TCDM
@@ -83,54 +90,102 @@ static inline void axpy_job(axpy_args_t *args) {
 
     // Calculate size of each tile
     frac = args->n / args->n_tiles;
+    size = frac * sizeof(double);
 
     // Allocate space for job operands in TCDM
     // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th.
-    local_x_addr = ALIGN_UP_TCDM((uint64_t)args + sizeof(axpy_args_t));
-    local_y_addr = ALIGN_UP_TCDM(local_x_addr + frac * sizeof(double)) + 8 * BANK_ALIGNMENT;
-    local_z_addr = ALIGN_UP_TCDM(local_y_addr + frac * sizeof(double)) + 16 * BANK_ALIGNMENT;
-    local_x = (double *)local_x_addr;
-    local_y = (double *)local_y_addr;
-    local_z = (double *)local_z_addr;
+    local_x0_addr = ALIGN_UP_TCDM((uint64_t)args + sizeof(axpy_args_t));
+    local_y0_addr = ALIGN_UP_TCDM(local_x0_addr + size) + 8 * BANK_ALIGNMENT;
+    local_z0_addr = ALIGN_UP_TCDM(local_y0_addr + size) + 16 * BANK_ALIGNMENT;
+    local_x[0] = (double *)local_x0_addr;
+    local_y[0] = (double *)local_y0_addr;
+    local_z[0] = (double *)local_z0_addr;
+    if (DOUBLE_BUFFER) {
+        local_x1_addr = ALIGN_UP_TCDM(local_z0_addr + size);
+        local_y1_addr = ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT;
+        local_z1_addr = ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT;
+        local_x[1] = (double *)local_x1_addr;
+        local_y[1] = (double *)local_y1_addr;
+        local_z[1] = (double *)local_z1_addr;
+    }
 
-    // Iterate over multiple tiles
-    for (int i = 0; i < args->n_tiles; i++) {
+    // Calculate number of iterations
+    iterations = args->n_tiles;
+    if (DOUBLE_BUFFER) iterations += 2;
+
+    // Iterate over all tiles
+    for (i = 0; i < iterations; i++) {
 
-        // DMA in
         if (snrt_is_dm_core()) {
+            // DMA in
+            if (!DOUBLE_BUFFER || (i < args->n_tiles)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_in = i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+
+                // Calculate size and pointers to current tile
+                offset = i_dma_in * frac;
+                remote_x = args->x + offset;
+                remote_y = args->y + offset;
+
+                // Copy job operands in TCDM
+                snrt_dma_start_1d(local_x[buff_idx], remote_x, size);
+                snrt_dma_start_1d(local_y[buff_idx], remote_y, size);
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+
+            // Additional barriers required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // DMA out
+            if (!DOUBLE_BUFFER || (i > 1)) {
+                snrt_mcycle();
 
-            // Calculate size and pointers to current tile
-            size = frac * sizeof(double);
-            offset = i * frac;
-            remote_x = args->x + offset;
-            remote_y = args->y + offset;
+                // Compute tile and buffer indices
+                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
 
-            // Copy job operands in TCDM
-            snrt_dma_start_1d(local_x, remote_x, size);
-            snrt_dma_start_1d(local_y, remote_y, size);
-            snrt_dma_wait_all();
+                // Calculate pointers to current tile
+                offset = i_dma_out * frac;
+                remote_z = args->z + offset;
+
+                // Copy job outputs from TCDM
+                snrt_dma_start_1d(remote_z, local_z[buff_idx], size);
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
         }
-        snrt_cluster_hw_barrier();
 
         // Compute
-        if (!snrt_is_dm_core()) {
-            axpy_fp_t fp = args->funcptr;
-            uint32_t start_cycle = snrt_mcycle();
-            fp(frac, args->a, local_x, local_y, local_z);
-            uint32_t end_cycle = snrt_mcycle();
-        }
-        snrt_cluster_hw_barrier();
+        if (snrt_is_compute_core()) {
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
 
-        // DMA out
-        if (snrt_is_dm_core()) {
+            if (!DOUBLE_BUFFER || (i > 0 && i < (args->n_tiles + 1))) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_compute = DOUBLE_BUFFER ? i - 1 : i;
+                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+
+                // Perform tile computation
+                axpy_fp_t fp = args->funcptr;
+                fp(frac, args->a, local_x[buff_idx], local_y[buff_idx], local_z[buff_idx]);
 
-            // Calculate pointers to current tile
-            remote_z = args->z + offset;
-            
-            // Copy job outputs from TCDM
-            snrt_dma_start_1d(remote_z, local_z, size);
-            snrt_dma_wait_all();
+                snrt_mcycle();
+            }
+
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
         }
+
+        // Synchronize cores after every iteration
         snrt_cluster_hw_barrier();
     }
 }

From c91cb215efc8c0a997d7b34c99d6d0f70a56e862 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Tue, 13 Aug 2024 18:17:25 +0200
Subject: [PATCH 04/19] sw: Add `ata` kernel

---
 sw/apps/ata/.gitignore                   |   1 +
 sw/apps/ata/data/params.json             |  10 +
 sw/apps/ata/scripts/datagen.py           |  72 ++++++
 sw/apps/ata/scripts/verify.py            |  44 ++++
 sw/apps/ata/src/args.h                   |  19 ++
 sw/apps/ata/src/ata.h                    | 277 +++++++++++++++++++++++
 sw/apps/ata/src/main.c                   |  17 ++
 target/snitch_cluster/sw.mk              |   1 +
 target/snitch_cluster/sw/apps/ata/app.mk |  13 ++
 9 files changed, 454 insertions(+)
 create mode 100644 sw/apps/ata/.gitignore
 create mode 100644 sw/apps/ata/data/params.json
 create mode 100755 sw/apps/ata/scripts/datagen.py
 create mode 100755 sw/apps/ata/scripts/verify.py
 create mode 100644 sw/apps/ata/src/args.h
 create mode 100644 sw/apps/ata/src/ata.h
 create mode 100644 sw/apps/ata/src/main.c
 create mode 100644 target/snitch_cluster/sw/apps/ata/app.mk

diff --git a/sw/apps/ata/.gitignore b/sw/apps/ata/.gitignore
new file mode 100644
index 000000000..8485f615e
--- /dev/null
+++ b/sw/apps/ata/.gitignore
@@ -0,0 +1 @@
+data/data.h
\ No newline at end of file
diff --git a/sw/apps/ata/data/params.json b/sw/apps/ata/data/params.json
new file mode 100644
index 000000000..1db35db08
--- /dev/null
+++ b/sw/apps/ata/data/params.json
@@ -0,0 +1,10 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    "m": 16,
+    "n": 4,
+    "m_tiles": 2,
+    "funcptr": "ata_opt"
+}
diff --git a/sw/apps/ata/scripts/datagen.py b/sw/apps/ata/scripts/datagen.py
new file mode 100755
index 000000000..11978b918
--- /dev/null
+++ b/sw/apps/ata/scripts/datagen.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import numpy as np
+
+from snitch.util.sim import data_utils
+from snitch.util.sim.data_utils import format_array_definition, format_array_declaration, \
+    format_struct_definition, DataGen
+
+
+DOUBLE_BUFFER = True
+
+class AtaDataGen(DataGen):
+
+    # Function pointers to alternative implementations
+    FUNCPTRS = ["ata_baseline", "ata_opt"]
+
+    def golden_model(self, A):
+        return np.matmul(A, A.transpose())
+
+    def validate(self, **kwargs):
+        assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles"
+        m_frac = kwargs['m'] // kwargs['m_tiles']  # exact: divisibility asserted above
+        assert (m_frac % 8) == 0, "m_frac must be an integer multiple of the number of cores"
+        assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4"
+        assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
+
+        # Calculate total TCDM occupation in bytes (8 bytes per double)
+        a_tile_size = m_frac * kwargs['n'] * 8
+        b_tile_size = m_frac * m_frac * 8
+        total_size = 2 * a_tile_size + b_tile_size
+        if DOUBLE_BUFFER:
+            total_size *= 2
+        data_utils.validate_tcdm_footprint(total_size)
+
+    def emit_header(self, **kwargs):
+        header = [super().emit_header()]
+
+        self.validate(**kwargs)
+
+        A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100
+        B = self.golden_model(A)
+
+        A = A.flatten()
+        B = B.flatten()
+
+        A_uid = 'A'
+        B_uid = 'B'
+
+        cfg = {
+            'm': kwargs['m'],
+            'n': kwargs['n'],
+            'a': A_uid,
+            'b': B_uid,
+            'm_tiles': kwargs['m_tiles'],
+            'funcptr': kwargs['funcptr']
+        }
+
+        header += [format_array_definition('double', A_uid, A)]
+        header += [format_array_declaration('double', B_uid, B.shape)]
+        header += [format_struct_definition('ata_args_t', 'args', cfg)]
+        header = '\n\n'.join(header)
+
+        return header
+
+
+if __name__ == '__main__':
+    AtaDataGen().main()
diff --git a/sw/apps/ata/scripts/verify.py b/sw/apps/ata/scripts/verify.py
new file mode 100755
index 000000000..1c6b50747
--- /dev/null
+++ b/sw/apps/ata/scripts/verify.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import numpy as np
+import sys
+from datagen import AtaDataGen
+
+from snitch.util.sim.verif_utils import Verifier
+
+
+class AtaVerifier(Verifier):
+
+    OUTPUT_UIDS = ['B']
+
+    def __init__(self):
+        super().__init__()
+        self.func_args = {
+            'm': 'I',
+            'n': 'I',
+            'A': 'I',
+            'B': 'I',
+            'm_tiles': 'I',
+            'funcptr': 'I'
+        }
+        self.func_args = self.get_input_from_symbol('args', self.func_args)
+
+    def get_actual_results(self):
+        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double')
+
+    def get_expected_results(self):
+        A = self.get_input_from_symbol('A', 'double')
+        A = np.reshape(A, (self.func_args['m'], self.func_args['n']))
+        return AtaDataGen().golden_model(A).flatten()
+
+    def check_results(self, *args):
+        return super().check_results(*args, rtol=1e-10)
+
+
+if __name__ == "__main__":
+    sys.exit(AtaVerifier().main())
diff --git a/sw/apps/ata/src/args.h b/sw/apps/ata/src/args.h
new file mode 100644
index 000000000..520693e22
--- /dev/null
+++ b/sw/apps/ata/src/args.h
@@ -0,0 +1,19 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#pragma once
+#include <stdint.h>
+
+typedef void (*ata_fp_t)(uint32_t m, uint32_t n, double *a, double *at,double *b);
+
+typedef struct {
+    uint32_t m;
+    uint32_t n;
+    double *a;
+    double *b;
+    uint32_t m_tiles;
+    ata_fp_t funcptr;
+} ata_args_t;
diff --git a/sw/apps/ata/src/ata.h b/sw/apps/ata/src/ata.h
new file mode 100644
index 000000000..0e33ea5ff
--- /dev/null
+++ b/sw/apps/ata/src/ata.h
@@ -0,0 +1,277 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#include "args.h"
+#include "snrt.h"
+
+#define DOUBLE_BUFFER 1
+
+__thread int setup_ssr = 1;
+
+void ata_naive(uint32_t m, uint32_t n, double *a, double *at, double *b) {  // b[i][j] = sum_k a[i][k] * at[j][k]
+    uint32_t offset = snrt_cluster_core_idx();           // first row handled by this core
+    uint32_t stride = snrt_cluster_compute_core_num();   // rows are interleaved across the compute cores
+
+    for (uint32_t i = offset; i < m; i += stride) {      // rows of b assigned to this core
+        for (uint32_t j = 0; j < m; j++) {
+            b[i * m + j] = 0;
+            for (uint32_t k = 0; k < n; k++) {
+                b[i * m + j] += a[i * n + k] * at[j * n + k];  // row i of a times row j of at
+            }
+        }
+    }
+}
+
+void ata_baseline(uint32_t m, uint32_t n, double *a, double *at, double *b) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    //       and datagen script
+    const uint32_t unroll = 8;
+
+    for (uint32_t i = offset; i < m; i += stride) {
+        for (uint32_t j = 0; j < m; j++) {
+
+            double acc = 0;
+
+            for (uint32_t k = 0; k < n; k += unroll) {
+                asm volatile(
+                    "fmadd.d %[acc], %[a0], %[at0], %[acc] \n"
+                    "fmadd.d %[acc], %[a1], %[at1], %[acc] \n"
+                    "fmadd.d %[acc], %[a2], %[at2], %[acc] \n"
+                    "fmadd.d %[acc], %[a3], %[at3], %[acc] \n"
+                    "fmadd.d %[acc], %[a4], %[at4], %[acc] \n"
+                    "fmadd.d %[acc], %[a5], %[at5], %[acc] \n"
+                    "fmadd.d %[acc], %[a6], %[at6], %[acc] \n"
+                    "fmadd.d %[acc], %[a7], %[at7], %[acc] \n"
+                    : [ acc ] "+f"(acc)
+                    : [ a0 ] "f"(a[i * n + k + 0]), [ a1 ] "f"(a[i * n + k + 1]),
+                      [ a2 ] "f"(a[i * n + k + 2]), [ a3 ] "f"(a[i * n + k + 3]),
+                      [ a4 ] "f"(a[i * n + k + 4]), [ a5 ] "f"(a[i * n + k + 5]),
+                      [ a6 ] "f"(a[i * n + k + 6]), [ a7 ] "f"(a[i * n + k + 7]),
+                      [ at0 ] "f"(at[j * n + k + 0]), [ at1 ] "f"(at[j * n + k + 1]),
+                      [ at2 ] "f"(at[j * n + k + 2]), [ at3 ] "f"(at[j * n + k + 3]),
+                      [ at4 ] "f"(at[j * n + k + 4]), [ at5 ] "f"(at[j * n + k + 5]),
+                      [ at6 ] "f"(at[j * n + k + 6]), [ at7 ] "f"(at[j * n + k + 7])
+                    :
+                );
+            }
+
+            b[i * m + j] = acc;
+        }
+    }
+}
+
+void ata_opt(uint32_t m, uint32_t n, double *a, double *at, double *b) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    //       and datagen script
+    const uint32_t unroll = 4;
+
+    if (setup_ssr) {
+        // Configure ft0 and ft1 to load A and At
+        // for (i = offset; i < m; i += stride)
+        //     for (j1 = 0; j1 < m; j1 += unroll)
+        //         for (k = 0; k < n; k++)
+        //             for (j0 = 0; j0 < unroll; j0++)
+        //                 j = j1 + j0
+        //                 ft0.push(a[i * n + k])
+        //                 ft1.push(at[j * n + k])
+        const uint32_t ssr0_b[4] = {unroll, n, m / unroll, m / stride};
+        const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)};
+        snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3],
+                         ssr0_i[1], ssr0_i[2], ssr0_i[3]);
+        snrt_ssr_repeat(SNRT_SSR_DM0, unroll);
+        const uint32_t ssr1_b[4] = {unroll, n, m / unroll, m / stride};
+        const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll * n * sizeof(double), 0};
+        snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2],
+                         ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2],
+                         ssr1_i[3]);
+        setup_ssr = 0;
+    }
+
+    // SSR start address needs to be configured each time
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, a + offset * n);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, at);
+    snrt_ssr_enable();
+
+    for (uint32_t i = offset; i < m; i += stride) {
+        for (uint32_t j = 0; j < m; j += unroll) {
+
+            double acc[unroll];
+            acc[0] = 0;
+            acc[1] = 0;
+            acc[2] = 0;
+            acc[3] = 0;
+
+            asm volatile(
+                "frep.o %[n_frep], %[unroll], 0, 0 \n"
+                "fmadd.d %[b0], ft0, ft1, %[b0] \n"
+                "fmadd.d %[b1], ft0, ft1, %[b1] \n"
+                "fmadd.d %[b2], ft0, ft1, %[b2] \n"
+                "fmadd.d %[b3], ft0, ft1, %[b3] \n"
+                : [ b0 ] "+f"(acc[0]), [ b1 ] "+f"(acc[1]),
+                  [ b2 ] "+f"(acc[2]), [ b3 ] "+f"(acc[3])
+                : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll)
+                : "ft0", "ft1", "ft2");
+
+            b[i * m + j + 0] = acc[0];
+            b[i * m + j + 1] = acc[1];
+            b[i * m + j + 2] = acc[2];
+            b[i * m + j + 3] = acc[3];
+        }
+    }
+
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+}
+
+void ata_job(ata_args_t *args) {
+    uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes;
+    uint64_t local_a0_addr, local_at0_addr, local_b0_addr,
+             local_a1_addr, local_at1_addr, local_b1_addr;
+    double *local_a[2];
+    double *local_at[2];
+    double *local_b[2];
+    uint32_t iterations, sb_iterations;
+    uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    ata_args_t *local_args = (ata_args_t *)snrt_l1_next();
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(ata_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+    args = local_args;
+#endif
+
+    // Calculate size of each tile
+    m_frac = args->m / args->m_tiles;
+    a_tile_size = args->n * m_frac;
+    b_tile_size = m_frac * m_frac;
+    a_tile_bytes = a_tile_size * sizeof(double);
+    b_tile_bytes = b_tile_size * sizeof(double);
+
+    // Allocate space for job operands in TCDM
+    // Place the A, At and B tiles back to back, right after the job arguments.
+    local_a0_addr = (uint64_t)args + sizeof(ata_args_t);
+    local_at0_addr = local_a0_addr + a_tile_bytes;
+    local_b0_addr = local_at0_addr + a_tile_bytes;
+    local_a[0] = (double *)local_a0_addr;
+    local_at[0] = (double *)local_at0_addr;
+    local_b[0] = (double *)local_b0_addr;
+    if (DOUBLE_BUFFER) {
+        local_a1_addr = local_b0_addr + b_tile_bytes;
+        local_at1_addr = local_a1_addr + a_tile_bytes;
+        local_b1_addr = local_at1_addr + a_tile_bytes;
+        local_a[1] = (double *)local_a1_addr;
+        local_at[1] = (double *)local_at1_addr;
+        local_b[1] = (double *)local_b1_addr;
+    }
+
+    // Calculate number of iterations
+    sb_iterations = args->m_tiles * args->m_tiles;
+    if (DOUBLE_BUFFER) iterations = sb_iterations + 2;
+    else iterations = sb_iterations;
+
+    // Iterate over all tiles
+    for (i = 0; i < iterations; i++) {
+        
+        if (snrt_is_dm_core()) {
+            // DMA in
+            if (!DOUBLE_BUFFER || (i < sb_iterations)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_in = i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+                i_row = i_dma_in / args->m_tiles;
+                i_col = i_dma_in % args->m_tiles;
+
+                // Copy job operands in TCDM
+                snrt_dma_load_1d_tile(
+                    local_a[buff_idx],
+                    args->a,
+                    i_row,
+                    a_tile_size,
+                    sizeof(double));
+                snrt_dma_load_1d_tile(
+                    local_at[buff_idx],
+                    args->a,
+                    i_col,
+                    a_tile_size,
+                    sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+
+            // Additional barriers required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // DMA out
+            if (!DOUBLE_BUFFER || (i > 1)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
+                i_row = i_dma_out / args->m_tiles;
+                i_col = i_dma_out % args->m_tiles;
+
+                // Copy job outputs from TCDM
+                snrt_dma_store_2d_tile(
+                    args->b,
+                    local_b[buff_idx],
+                    i_row,
+                    i_col,
+                    m_frac,
+                    m_frac,
+                    args->m,
+                    sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+        }
+
+        // Compute
+        if (snrt_is_compute_core()) {
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_compute = DOUBLE_BUFFER ? i - 1 : i;
+                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+
+                // Perform tile computation
+                ata_fp_t fp = args->funcptr;
+                fp(m_frac, args->n, local_a[buff_idx], 
+                   local_at[buff_idx], local_b[buff_idx]);
+
+                snrt_mcycle();
+            }
+
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+        }
+        // Synchronize cores after every iteration
+        snrt_cluster_hw_barrier();
+    }
+}
diff --git a/sw/apps/ata/src/main.c b/sw/apps/ata/src/main.c
new file mode 100644
index 000000000..c8df4bea9
--- /dev/null
+++ b/sw/apps/ata/src/main.c
@@ -0,0 +1,17 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#include "snrt.h"
+
+#include "ata.h"
+#include "data.h"
+
+int main() {
+
+    ata_job(&args);
+
+    return 0;
+}
diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk
index ca8246124..0a1e4c00c 100644
--- a/target/snitch_cluster/sw.mk
+++ b/target/snitch_cluster/sw.mk
@@ -63,6 +63,7 @@ APPS += sw/apps/dnn/concat
 APPS += sw/apps/dnn/fused_concat_linear
 APPS += sw/apps/dnn/transpose
 APPS += sw/apps/montecarlo/pi_estimation
+APPS += sw/apps/ata
 APPS += sw/apps/atax
 APPS += sw/apps/correlation
 APPS += sw/apps/covariance
diff --git a/target/snitch_cluster/sw/apps/ata/app.mk b/target/snitch_cluster/sw/apps/ata/app.mk
new file mode 100644
index 000000000..af63400b4
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/ata/app.mk
@@ -0,0 +1,13 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+APP              := ata
+$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build
+SRC_DIR          := $(ROOT)/sw/apps/$(APP)/src
+SRCS             := $(SRC_DIR)/main.c
+
+include $(ROOT)/sw/apps/common.mk
+include $(ROOT)/target/snitch_cluster/sw/apps/common.mk

From f63c6c9f41ee7fd02230880f3965d26dca4c1617 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Wed, 14 Aug 2024 00:25:29 +0200
Subject: [PATCH 05/19] sw: Add optimized covariance kernel

---
 sw/apps/covariance/data/params.json   |   6 +-
 sw/apps/covariance/roi.json           |  36 +++
 sw/apps/covariance/scripts/datagen.py |  61 +++--
 sw/apps/covariance/scripts/verify.py  |  20 +-
 sw/apps/covariance/src/args.h         |  22 ++
 sw/apps/covariance/src/covariance.h   | 375 +++++++++++++++++++++++---
 sw/apps/covariance/src/main.c         |  51 +---
 7 files changed, 471 insertions(+), 100 deletions(-)
 create mode 100644 sw/apps/covariance/roi.json
 create mode 100644 sw/apps/covariance/src/args.h

diff --git a/sw/apps/covariance/data/params.json b/sw/apps/covariance/data/params.json
index 9e89d9f85..5ae088d97 100644
--- a/sw/apps/covariance/data/params.json
+++ b/sw/apps/covariance/data/params.json
@@ -3,6 +3,8 @@
 // SPDX-License-Identifier: Apache-2.0
 
 {
-    M: 16,
-    N: 8
+    "m": 32,
+    "n": 2,
+    "m_tiles": 2,
+    "funcptr": "covariance_opt"
 }
diff --git a/sw/apps/covariance/roi.json b/sw/apps/covariance/roi.json
new file mode 100644
index 000000000..757a2ce6d
--- /dev/null
+++ b/sw/apps/covariance/roi.json
@@ -0,0 +1,36 @@
+[
+    <% DOUBLE_BUFFER = 1 %>
+    <% N_TILES = 4 %>
+
+    // Compute cores
+    % for j in range(0, 8):
+    {
+        "thread": "${f'hart_{j}'}",
+        "roi": [
+        % for i in range(0, N_TILES):
+            {"idx": ${2 * i + 1}, "label": "${f'tile_{i}'}"},
+        % endfor
+        ]
+    },
+    % endfor
+
+    // DMA core
+    {
+        "thread": "hart_8",
+        "roi": [
+    % if not DOUBLE_BUFFER:
+        % for i in range(0, N_TILES):
+            {"idx": ${4 * i + 1}, "label": "${f'tile_{i}_in'}"},
+            {"idx": ${4 * i + 3}, "label": "${f'tile_{i}_out'}"},
+        % endfor
+    % else:
+        ## Iteration t of the job loads tile t (while t < N_TILES) and then
+        ## stores tile t - 2 (once t > 1); label the regions in that order.
+        <% events = [e for t in range(N_TILES + 2) for e in ([f'tile_{t}_in'] * (t < N_TILES) + [f'tile_{t - 2}_out'] * (t > 1))] %>
+        % for k, label in enumerate(events):
+            {"idx": ${2 * k + 1}, "label": "${label}"},
+        % endfor
+    % endif
+        ]
+    }
+]
\ No newline at end of file
diff --git a/sw/apps/covariance/scripts/datagen.py b/sw/apps/covariance/scripts/datagen.py
index 44e20d55e..165fc30a5 100755
--- a/sw/apps/covariance/scripts/datagen.py
+++ b/sw/apps/covariance/scripts/datagen.py
@@ -8,38 +8,67 @@
 
 import numpy as np
 
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
+from snitch.util.sim import data_utils
+from snitch.util.sim.data_utils import format_array_definition, \
+    format_array_declaration, format_struct_definition, DataGen
 
 
-# AXI splits bursts crossing 4KB address boundaries. To minimize
-# the occurrence of these splits the data should be aligned to 4KB
-BURST_ALIGNMENT = 4096
+DOUBLE_BUFFER = True
 
 
 class CovarianceDataGen(DataGen):
 
+    # Function pointers to alternative implementations
+    FUNCPTRS = ["covariance_naive", "covariance_opt"]
+
     def golden_model(self, data):
         return np.cov(data, rowvar=False)
 
+    def validate(self, **kwargs):  # check kernel constraints and the TCDM footprint of a config
+        assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles"
+        m_per_tile = kwargs['m'] // kwargs['m_tiles']  # exact: divisibility asserted above
+        assert (m_per_tile % 8) == 0, "m_per_tile must be an integer multiple of the number of cores"
+        assert (m_per_tile % 4) == 0, "m_per_tile must be an integer multiple of unroll1 = 4"
+        m_per_core = m_per_tile // 8  # exact: divisibility asserted above
+        assert (m_per_core % 2) == 0, "m_per_core must be an integer multiple of the unroll0 = 2"
+        assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
+        assert kwargs['n'] > 1, "n must be at least 2, the covariance is normalized by n - 1"
+        # TCDM occupation in bytes (8 per double): two input tiles and one output tile per buffer
+        a_tile_size = m_per_tile * kwargs['n'] * 8
+        b_tile_size = m_per_tile * m_per_tile * 8
+        total_size = 2 * a_tile_size + b_tile_size
+        if DOUBLE_BUFFER:
+            total_size *= 2
+        data_utils.validate_tcdm_footprint(total_size)
+
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
-        M, N = kwargs['M'], kwargs['N']
-        data = np.random.randint(-200, 100, size=(N, M))
-        cov = self.golden_model(data)
+        self.validate(**kwargs)
 
-        assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
+        data = np.random.randint(-200, 100, size=(kwargs['n'], kwargs['m']))
+        cov = self.golden_model(data)
 
-        data = data.flatten()
+        data = data.transpose().flatten()
         cov = cov.flatten()
 
-        header += [format_scalar_definition('uint32_t', 'M', M)]
-        header += [format_scalar_definition('uint32_t', 'N', N)]
-        header += [format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)]
-        header += [format_array_declaration('double', 'cov', cov.shape, alignment=BURST_ALIGNMENT)]
-        result_def = format_array_definition('double', 'golden', cov, alignment=BURST_ALIGNMENT)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        data_uid = 'data'
+        cov_uid = 'cov'
+
+        cfg = {
+            'm': kwargs['m'],
+            'n': kwargs['n'],
+            'inv_n': 1 / kwargs['n'],
+            'inv_n_m1': 1 / (kwargs['n'] - 1),
+            'data': data_uid,
+            'cov': cov_uid,
+            'm_tiles': kwargs['m_tiles'],
+            'funcptr': kwargs['funcptr']
+        }
+
+        header += [format_array_definition('double', data_uid, data)]
+        header += [format_array_declaration('double', cov_uid, cov.shape)]
+        header += [format_struct_definition('covariance_args_t', 'args', cfg)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/apps/covariance/scripts/verify.py b/sw/apps/covariance/scripts/verify.py
index 4c5b0cdd1..a390d83d1 100755
--- a/sw/apps/covariance/scripts/verify.py
+++ b/sw/apps/covariance/scripts/verify.py
@@ -16,14 +16,26 @@ class CovarianceVerifier(Verifier):
 
     OUTPUT_UIDS = ['cov']
 
+    def __init__(self):
+        super().__init__()
+        self.func_args = {
+            'm': 'I',
+            'n': 'I',
+            'inv_n': 'd',
+            'inv_n_m1': 'd',
+            'data': 'I',
+            'cov': 'I',
+            'm_tiles': 'I',
+            'funcptr': 'I'
+        }
+        self.func_args = self.get_input_from_symbol('args', self.func_args)
+
     def get_actual_results(self):
-        return self.get_output_from_symbol('cov', 'double')
+        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double')
 
     def get_expected_results(self):
-        M = self.get_input_from_symbol('M', 'uint32_t')[0]
-        N = self.get_input_from_symbol('N', 'uint32_t')[0]
         data = self.get_input_from_symbol('data', 'double')
-        data = np.reshape(data, (N, M))
+        data = np.reshape(data, (self.func_args['m'], self.func_args['n'])).transpose()
         return CovarianceDataGen().golden_model(data).flatten()
 
     def check_results(self, *args):
diff --git a/sw/apps/covariance/src/args.h b/sw/apps/covariance/src/args.h
new file mode 100644
index 000000000..f88768dd5
--- /dev/null
+++ b/sw/apps/covariance/src/args.h
@@ -0,0 +1,22 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#pragma once
+#include <stdint.h>
+
+typedef void (*covariance_fp_t)(uint32_t m, uint32_t n, double inv_n,
+    double inv_n_m1, double *data, double *datat,double *cov);
+
+typedef struct {  // Arguments of covariance_job()
+    uint32_t m;               // number of rows of data; cov has m x m elements
+    uint32_t n;               // number of elements per row of data
+    double inv_n;             // 1 / n, used to compute the row means
+    double inv_n_m1;          // 1 / (n - 1), used to normalize the covariance
+    double *data;             // input matrix, m x n, row-major
+    double *cov;              // output covariance matrix, m x m, row-major
+    uint32_t m_tiles;         // number of row tiles data is split into (m / m_tiles rows each)
+    covariance_fp_t funcptr;  // kernel invoked by the compute cores on every tile
+} covariance_args_t;
diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h
index fec79d195..41c33a93b 100644
--- a/sw/apps/covariance/src/covariance.h
+++ b/sw/apps/covariance/src/covariance.h
@@ -5,50 +5,359 @@
 // Author: Jose Pedro Castro Fonseca <jcastro@ethz.ch>
 //         Luca Colagrande <colluca@iis.ee.ethz.ch>
 
-#include <stdint.h>
+#include "args.h"
 #include "snrt.h"
 
-void kernel_covariance(uint32_t N, uint32_t M, double *data, double *cov) {
-    int i1, i, j, k;
-    int core_range, core_offset;
-
-    // Compute deviations
-    if (snrt_is_compute_core()) {
-        // Distribute different attributes to the different cores
-        core_range = M / snrt_cluster_compute_core_num();
-        core_offset = snrt_cluster_core_idx() * core_range;
-        for (i1 = 0; i1 < core_range; i1++) {
-            i = core_offset + i1;
-
-            // Calculate mean vector
-            double mean = 0.0;
-            for (k = 0; k < N; k++) {
-                mean += data[k * M + i];
-            }
-            mean = mean / N;
+#define DOUBLE_BUFFER 1
+
+void covariance_naive(uint32_t m, uint32_t n, double inv_n,
+                      double inv_n_m1, double *data, double *datat,
+                      double *cov) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Center data
+    for (uint32_t i = offset; i < m; i += stride) {
+
+        // Calculate row mean
+        double data_mean = 0.0;
+        double datat_mean = 0.0;
+        for (uint32_t j = 0; j < n; j++) {
+            data_mean += data[i * n + j];
+            datat_mean += datat[i * n + j];
+        }
+        data_mean = data_mean * inv_n;
+        datat_mean = datat_mean * inv_n;
+
+        // Center row around zero
+        for (uint32_t j = 0; j < n; j++) {
+            data[i * n + j] -= data_mean;
+            datat[i * n + j] -= datat_mean;
+        }
+    }
 
-            // Standardize data to zero mean
-            for (k = 0; k < N; k++) {
-                data[k * M + i] -= mean;
+    snrt_fpu_fence();
+    snrt_cluster_hw_barrier();
+
+    // Compute covariance matrix
+    for (uint32_t i = offset; i < m; i += stride) {
+        for (uint32_t j = 0; j < m; j++) {
+            cov[i * m + j] = 0.0;
+            for (uint32_t k = 0; k < n; k++) {
+                cov[i * m + j] += data[i * n + k] * datat[j * n + k];
             }
+            cov[i * m + j] *= inv_n_m1;
         }
+    }
+}
+
+void covariance_opt(uint32_t m, uint32_t n, double inv_n,
+                    double inv_n_m1, double *data, double *datat,
+                    double *cov) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    //       and datagen script
+    const uint32_t unroll0 = 2;
+
+    // Configure ft0 and ft1 to load data and datat elements
+    // for (k = 0; k < 2; k++)
+    //     for (i1 = offset; i1 < m; i1 += stride * unroll0)
+    //         for (j = 0; j < n; j++)
+    //             for (i0 = 0; i0 < unroll0; i0++)
+    //                 i = i1 + i0 * stride
+    //                 ft0.push(data[i * n + j])
+    //                 ft1.push(datat[i * n + j])
+    const uint32_t ssr01_b[4] = {unroll0, n, 2, m / (stride * unroll0)};
+    const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double),
+                                 0, sizeof(double) * n * stride * unroll0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM0,
+                     ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3],
+                     ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]);
+    snrt_ssr_loop_4d(SNRT_SSR_DM1,
+                     ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3],
+                     ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]);
+    // Configure ft2 to store data and datat elements
+    // for (i1 = offset; i1 < m; i1 += stride * unroll0)
+    //     for (j = 0; j < n; j++)
+    //         for (i0 = 0; i0 < unroll0; i0++)
+    //             i = i1 + i0 * stride
+    //             data[i * n + j] = ft2.pop()
+    //             datat[i * n + j] = ft2.pop()
+    const uint32_t ssr2_b[4] = {2, unroll0, n, m / (stride * unroll0)};
+    const uint32_t ssr2_i[4] = {(uint32_t)datat - (uint32_t)data,
+                                sizeof(double) * n * stride,
+                                sizeof(double),
+                                sizeof(double) * n * stride * unroll0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM2,
+                     ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3],
+                     ssr2_i[0], ssr2_i[1], ssr2_i[2], ssr2_i[3]);
+
+    // SSR start address needs to be configured each time
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, data + offset * n);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, datat + offset * n);
+    snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_4D, data + offset * n);
+    snrt_ssr_enable();
+
+    // Center data
+    for (uint32_t i = offset; i < m; i += stride * unroll0) {
+
+        // Calculate row means
+        double m[2 * unroll0];
+        m[0] = 0.0; // mean(data[i])
+        m[1] = 0.0; // mean(datat[i])
+        m[2] = 0.0; // mean(data[i + stride])
+        m[3] = 0.0; // mean(datat[i + stride])
+        asm volatile(
+            "frep.o %[n_frep], %[n_insn], 0, 0 \n"
+            "fadd.d %[m0], ft0, %[m0] \n"
+            "fadd.d %[m1], ft1, %[m1] \n"
+            "fadd.d %[m2], ft0, %[m2] \n"
+            "fadd.d %[m3], ft1, %[m3] \n"
+            : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]),
+              [ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3])
+            : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0)
+            : "ft0", "ft1", "ft2");
+        m[0] *= inv_n;
+        m[1] *= inv_n;
+        m[2] *= inv_n;
+        m[3] *= inv_n;
+
         snrt_fpu_fence();
+
+        // Center row around zero
+        asm volatile(
+            "frep.o %[n_frep], %[n_insn], 0, 0 \n"
+            "fsub.d ft2, ft0, %[m0] \n"
+            "fsub.d ft2, ft1, %[m1] \n"
+            "fsub.d ft2, ft0, %[m2] \n"
+            "fsub.d ft2, ft1, %[m3] \n"
+            : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]),
+              [ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3])
+            : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0)
+            : "ft0", "ft1", "ft2");
     }
 
+    snrt_ssr_disable();
+
+    snrt_fpu_fence();
     snrt_cluster_hw_barrier();
 
-    // Compute covariance
-    if (snrt_is_compute_core()) {
-        for (i1 = 0; i1 < core_range; i1++) {
-            i = core_offset + i1;
-            for (j = 0; j <= i; j++) {
-                double tmp = 0.0;
-                for (k = 0; k < N; k++) {
-                    tmp += data[k * M + i] * data[k * M + j];
-                }
-                cov[i * M + j] = tmp / (N - 1);
-                cov[j * M + i] = cov[i * M + j];
+    // The following is taken from the AtA kernel, apart from the normalization
+    // by 1/(n - 1).
+    // Here data stands for A and datat for At.
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    //       and datagen script
+    const uint32_t unroll1 = 4;
+
+    // Configure ft0 and ft1 to load A and At
+    // for (i = offset; i < m; i += stride)
+    //     for (j1 = 0; j1 < m; j1 += unroll1)
+    //         for (k = 0; k < n; k++)
+    //             for (j0 = 0; j0 < unroll1; j0++)
+    //                 j = j1 + j0
+    //                 ft0.push(a[i * n + k])
+    //                 ft1.push(at[j * n + k])
+    const uint32_t ssr0_b[4] = {unroll1, n, m / unroll1, m / stride};
+    const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)};
+    snrt_ssr_loop_3d(SNRT_SSR_DM0,
+        ssr0_b[1], ssr0_b[2], ssr0_b[3],
+        ssr0_i[1], ssr0_i[2], ssr0_i[3]);
+    snrt_ssr_repeat(SNRT_SSR_DM0, unroll1);
+    const uint32_t ssr1_b[4] = {unroll1, n, m / unroll1, m / stride};
+    const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll1 * n * sizeof(double), 0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM1,
+        ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3],
+        ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
+
+    // SSR start address needs to be configured each time
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, data + offset * n);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, datat);
+    snrt_ssr_enable();
+
+    for (uint32_t i = offset; i < m; i += stride) {
+        for (uint32_t j = 0; j < m; j += unroll1) {
+
+            double acc[unroll1];
+            acc[0] = 0;
+            acc[1] = 0;
+            acc[2] = 0;
+            acc[3] = 0;
+
+            asm volatile(
+                "frep.o %[n_frep], %[unroll1], 0, 0 \n"
+                "fmadd.d %[b0], ft0, ft1, %[b0] \n"
+                "fmadd.d %[b1], ft0, ft1, %[b1] \n"
+                "fmadd.d %[b2], ft0, ft1, %[b2] \n"
+                "fmadd.d %[b3], ft0, ft1, %[b3] \n"
+                : [ b0 ] "+f"(acc[0]), [ b1 ] "+f"(acc[1]),
+                  [ b2 ] "+f"(acc[2]), [ b3 ] "+f"(acc[3])
+                : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1)
+                : "ft0", "ft1", "ft2");
+
+            snrt_ssr_disable();
+
+            cov[i * m + j + 0] = acc[0] * inv_n_m1;
+            cov[i * m + j + 1] = acc[1] * inv_n_m1;
+            cov[i * m + j + 2] = acc[2] * inv_n_m1;
+            cov[i * m + j + 3] = acc[3] * inv_n_m1;
+
+            snrt_ssr_enable();
+        }
+    }
+
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+}
+
+void covariance_job(covariance_args_t *args) {
+    uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes;
+    uint64_t local_a0_addr, local_at0_addr, local_b0_addr,
+             local_a1_addr, local_at1_addr, local_b1_addr;
+    double *local_a[2];
+    double *local_at[2];
+    double *local_b[2];
+    uint32_t iterations, sb_iterations;
+    uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    covariance_args_t *local_args = (covariance_args_t *)snrt_l1_next();
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(covariance_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+    args = local_args;
+#endif
+
+    // Calculate size of each tile
+    m_frac = args->m / args->m_tiles;
+    a_tile_size = args->n * m_frac;
+    b_tile_size = m_frac * m_frac;
+    a_tile_bytes = a_tile_size * sizeof(double);
+    b_tile_bytes = b_tile_size * sizeof(double);
+
+    // Allocate space for job operands in TCDM
+    // Place the A, At and B tiles back to back, right after the job arguments.
+    local_a0_addr = (uint64_t)args + sizeof(covariance_args_t);
+    local_at0_addr = local_a0_addr + a_tile_bytes;
+    local_b0_addr = local_at0_addr + a_tile_bytes;
+    local_a[0] = (double *)local_a0_addr;
+    local_at[0] = (double *)local_at0_addr;
+    local_b[0] = (double *)local_b0_addr;
+    if (DOUBLE_BUFFER) {
+        local_a1_addr = local_b0_addr + b_tile_bytes;
+        local_at1_addr = local_a1_addr + a_tile_bytes;
+        local_b1_addr = local_at1_addr + a_tile_bytes;
+        local_a[1] = (double *)local_a1_addr;
+        local_at[1] = (double *)local_at1_addr;
+        local_b[1] = (double *)local_b1_addr;
+    }
+
+    // Calculate number of iterations
+    sb_iterations = args->m_tiles * args->m_tiles;
+    if (DOUBLE_BUFFER) iterations = sb_iterations + 2;
+    else iterations = sb_iterations;
+
+    // Iterate over all tiles
+    for (i = 0; i < iterations; i++) {
+        
+        if (snrt_is_dm_core()) {
+            // DMA in
+            if (!DOUBLE_BUFFER || (i < sb_iterations)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_in = i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+                i_row = i_dma_in / args->m_tiles;
+                i_col = i_dma_in % args->m_tiles;
+
+                // Copy job operands in TCDM
+                snrt_dma_load_1d_tile(
+                    local_a[buff_idx],
+                    args->data,
+                    i_row,
+                    a_tile_size,
+                    sizeof(double));
+                snrt_dma_load_1d_tile(
+                    local_at[buff_idx],
+                    args->data,
+                    i_col,
+                    a_tile_size,
+                    sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
             }
+
+            // Additional barriers required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // Additional barrier required to synchronize the compute cores
+            // among them after the data centering phase
+            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1)))
+                snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // DMA out
+            if (!DOUBLE_BUFFER || (i > 1)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
+                i_row = i_dma_out / args->m_tiles;
+                i_col = i_dma_out % args->m_tiles;
+
+                // Copy job outputs from TCDM
+                snrt_dma_store_2d_tile(
+                    args->cov,
+                    local_b[buff_idx],
+                    i_row,
+                    i_col,
+                    m_frac,
+                    m_frac,
+                    args->m,
+                    sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+        }
+
+        // Compute
+        if (snrt_is_compute_core()) {
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_compute = DOUBLE_BUFFER ? i - 1 : i;
+                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+
+                // Perform tile computation
+                covariance_fp_t fp = args->funcptr;
+                fp(m_frac, args->n, args->inv_n, args->inv_n_m1,
+                   local_a[buff_idx], local_at[buff_idx], local_b[buff_idx]);
+
+                snrt_mcycle();
+            }
+
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
         }
+        // Synchronize cores after every iteration
+        snrt_cluster_hw_barrier();
     }
 }
diff --git a/sw/apps/covariance/src/main.c b/sw/apps/covariance/src/main.c
index 26b151393..3c9d225a8 100644
--- a/sw/apps/covariance/src/main.c
+++ b/sw/apps/covariance/src/main.c
@@ -1,56 +1,17 @@
-// Copyright 2023 ETH Zurich and University of Bologna.
+// Copyright 2024 ETH Zurich and University of Bologna.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
-// Author: Jose Pedro Castro Fonseca <jcastro@ethz.ch>
-//         Luca Colagrande <colluca@iis.ee.ethz.ch>
+// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#include "snrt.h"
 
 #include "covariance.h"
 #include "data.h"
 
-#define MAX_ERROR 1e-10
-
 int main() {
-    uint32_t nerr = 0;
-    double *local_mean;
-    double *local_cov;
-    double *local_data;
-    double diff;
-
-    local_data = snrt_l1_next();
-    local_cov = local_data + N * M;
-
-    // Initialize input matrix
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(local_data, data, sizeof(double) * N * M);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
-
-    // Perform Computations
-    kernel_covariance(N, M, local_data, local_cov);
-    snrt_cluster_hw_barrier();
-
-    // Writeback outputs
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(cov, local_cov, sizeof(double) * M * M);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
 
-#ifdef BIST
-    // Check computation is correct
-    if (snrt_cluster_core_idx() == 0) {
-        for (int i = 0; i < M; i++) {
-            for (int j = 0; j < M; j++) {
-                diff = fabs(golden[i * M + j] - local_cov[i * M + j]);
-                if (diff > MAX_ERROR) {
-                    nerr++;
-                }
-            }
-        }
-    }
-#endif
+    covariance_job(&args);
 
-    return nerr;
+    return 0;
 }

From 601e3958e33d470804e3455ccd4f27984f9ecda2 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 19 Aug 2024 09:35:23 +0200
Subject: [PATCH 06/19] gen_trace.py: Do not return on exception

Ensures that performance metrics are dumped even if the
simulation didn't terminate successfully.
---
 util/trace/gen_trace.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py
index db094ad7e..0fab642e0 100755
--- a/util/trace/gen_trace.py
+++ b/util/trace/gen_trace.py
@@ -1145,7 +1145,6 @@ def main():
                         message += 'line {lineno}.'
                     print(traceback.format_exc(), file=sys.stderr)
                     print(message, file=sys.stderr)
-                    return 1
             else:
                 break  # Nothing more in pipe, EOF
         perf_metrics[-1]['tend'] = time_info[0] / 1000

From 7de5657efa8a300e1dbeb12169c53d4c81c57f85 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 19 Aug 2024 09:36:50 +0200
Subject: [PATCH 07/19] target: Delete performance dumps on `clean-traces`

---
 target/common/common.mk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/target/common/common.mk b/target/common/common.mk
index 70afd80c2..995e80ba0 100644
--- a/target/common/common.mk
+++ b/target/common/common.mk
@@ -203,6 +203,7 @@ SNITCH_DASM_TRACES      = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null)
 SNITCH_TXT_TRACES       = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/\.dasm/\.txt/g'))
 SNITCH_ANNOTATED_TRACES = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/\.dasm/\.s/g'))
 SNITCH_PERF_DUMPS       = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g'))
+DMA_PERF_DUMPS          = $(LOGS_DIR)/dma_*_perf.json
 
 TXT_TRACES       += $(SNITCH_TXT_TRACES)
 ANNOTATED_TRACES += $(SNITCH_ANNOTATED_TRACES)
@@ -219,7 +220,7 @@ annotate: $(ANNOTATED_TRACES)
 perf: $(JOINT_PERF_DUMP)
 visual-trace: $(VISUAL_TRACE)
 clean-traces:
-	rm -f $(TXT_TRACES)
+	rm -f $(TXT_TRACES) $(SNITCH_PERF_DUMPS) $(DMA_PERF_DUMPS)
 clean-annotate:
 	rm -f $(ANNOTATED_TRACES)
 clean-perf:

From ff3c3e3cfbebbdcd059cd4f21b2214a2f199f6c1 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 19 Aug 2024 14:33:54 +0200
Subject: [PATCH 08/19] ata: Generalize and optimize

---
 sw/apps/ata/scripts/datagen.py |  10 +--
 sw/apps/ata/scripts/verify.py  |   3 +-
 sw/apps/ata/src/args.h         |   4 +-
 sw/apps/ata/src/ata.h          | 110 +++++++++++++++++++++------------
 4 files changed, 82 insertions(+), 45 deletions(-)

diff --git a/sw/apps/ata/scripts/datagen.py b/sw/apps/ata/scripts/datagen.py
index 11978b918..f6474f2e6 100755
--- a/sw/apps/ata/scripts/datagen.py
+++ b/sw/apps/ata/scripts/datagen.py
@@ -17,10 +17,10 @@
 class AtaDataGen(DataGen):
 
     # Function pointers to alternative implementations
-    FUNCPTRS = ["ata_baseline", "ata_opt"]
+    FUNCPTRS = ["ata_naive", "ata_baseline", "ata_opt"]
 
-    def golden_model(self, A):
-        return np.matmul(A, A.transpose())
+    def golden_model(self, alpha, A):
+        return alpha * np.matmul(A, A.transpose())
 
     def validate(self, **kwargs):
         assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles"
@@ -43,7 +43,8 @@ def emit_header(self, **kwargs):
         self.validate(**kwargs)
 
         A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100
-        B = self.golden_model(A)
+        alpha = np.random.randint(-200, 100)/100
+        B = self.golden_model(alpha, A)
 
         A = A.flatten()
         B = B.flatten()
@@ -52,6 +53,7 @@ def emit_header(self, **kwargs):
         B_uid = 'B'
 
         cfg = {
+            'alpha': alpha,
             'm': kwargs['m'],
             'n': kwargs['n'],
             'a': A_uid,
diff --git a/sw/apps/ata/scripts/verify.py b/sw/apps/ata/scripts/verify.py
index 1c6b50747..206af870a 100755
--- a/sw/apps/ata/scripts/verify.py
+++ b/sw/apps/ata/scripts/verify.py
@@ -19,6 +19,7 @@ class AtaVerifier(Verifier):
     def __init__(self):
         super().__init__()
         self.func_args = {
+            'alpha': 'd',
             'm': 'I',
             'n': 'I',
             'A': 'I',
@@ -34,7 +35,7 @@ def get_actual_results(self):
     def get_expected_results(self):
         A = self.get_input_from_symbol('A', 'double')
         A = np.reshape(A, (self.func_args['m'], self.func_args['n']))
-        return AtaDataGen().golden_model(A).flatten()
+        return AtaDataGen().golden_model(self.func_args['alpha'], A).flatten()
 
     def check_results(self, *args):
         return super().check_results(*args, rtol=1e-10)
diff --git a/sw/apps/ata/src/args.h b/sw/apps/ata/src/args.h
index 520693e22..f65a6a13f 100644
--- a/sw/apps/ata/src/args.h
+++ b/sw/apps/ata/src/args.h
@@ -7,9 +7,11 @@
 #pragma once
 #include <stdint.h>
 
-typedef void (*ata_fp_t)(uint32_t m, uint32_t n, double *a, double *at,double *b);
+typedef void (*ata_fp_t)(double alpha, uint32_t m, uint32_t n, double *a,
+    double *at, double *b);
 
 typedef struct {
+    double alpha;
     uint32_t m;
     uint32_t n;
     double *a;
diff --git a/sw/apps/ata/src/ata.h b/sw/apps/ata/src/ata.h
index 0e33ea5ff..8673353a4 100644
--- a/sw/apps/ata/src/ata.h
+++ b/sw/apps/ata/src/ata.h
@@ -11,7 +11,7 @@
 
 __thread int setup_ssr = 1;
 
-void ata_naive(uint32_t m, uint32_t n, double *a, double *at, double *b) {
+void ata_naive(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
@@ -21,53 +21,83 @@ void ata_naive(uint32_t m, uint32_t n, double *a, double *at, double *b) {
             for (uint32_t k = 0; k < n; k++) {
                 b[i * m + j] += a[i * n + k] * at[j * n + k];
             }
+            b[i * m + j] *= alpha;
         }
     }
 }
 
-void ata_baseline(uint32_t m, uint32_t n, double *a, double *at, double *b) {
+void ata_baseline(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
-    // Unrolling factor of innermost loop
+    // Unrolling factors
     // Note: changes must be reflected in the inline assembly code
     //       and datagen script
-    const uint32_t unroll = 8;
+    const uint32_t unroll1 = 4;
+    const uint32_t unroll0 = 4;
 
     for (uint32_t i = offset; i < m; i += stride) {
-        for (uint32_t j = 0; j < m; j++) {
+        for (uint32_t j = 0; j < m; j += unroll1) {
 
-            double acc = 0;
+            double acc[4];
+            acc[0] = 0;
+            acc[1] = 0;
+            acc[2] = 0;
+            acc[3] = 0;
 
-            for (uint32_t k = 0; k < n; k += unroll) {
+            for (uint32_t k = 0; k < n; k += unroll0) {
                 asm volatile(
-                    "fmadd.d %[acc], %[a0], %[at0], %[acc] \n"
-                    "fmadd.d %[acc], %[a1], %[at1], %[acc] \n"
-                    "fmadd.d %[acc], %[a2], %[at2], %[acc] \n"
-                    "fmadd.d %[acc], %[a3], %[at3], %[acc] \n"
-                    "fmadd.d %[acc], %[a4], %[at4], %[acc] \n"
-                    "fmadd.d %[acc], %[a5], %[at5], %[acc] \n"
-                    "fmadd.d %[acc], %[a6], %[at6], %[acc] \n"
-                    "fmadd.d %[acc], %[a7], %[at7], %[acc] \n"
-                    : [ acc ] "+f"(acc)
-                    : [ a0 ] "f"(a[i * n + k + 0]), [ a1 ] "f"(a[i * n + k + 1]),
-                      [ a2 ] "f"(a[i * n + k + 2]), [ a3 ] "f"(a[i * n + k + 3]),
-                      [ a4 ] "f"(a[i * n + k + 4]), [ a5 ] "f"(a[i * n + k + 5]),
-                      [ a6 ] "f"(a[i * n + k + 6]), [ a7 ] "f"(a[i * n + k + 7]),
-                      [ at0 ] "f"(at[j * n + k + 0]), [ at1 ] "f"(at[j * n + k + 1]),
-                      [ at2 ] "f"(at[j * n + k + 2]), [ at3 ] "f"(at[j * n + k + 3]),
-                      [ at4 ] "f"(at[j * n + k + 4]), [ at5 ] "f"(at[j * n + k + 5]),
-                      [ at6 ] "f"(at[j * n + k + 6]), [ at7 ] "f"(at[j * n + k + 7])
+                    "fmadd.d %[acc0], %[a0], %[at0], %[acc0] \n"
+                    "fmadd.d %[acc1], %[a0], %[at1], %[acc1] \n"
+                    "fmadd.d %[acc2], %[a0], %[at2], %[acc2] \n"
+                    "fmadd.d %[acc3], %[a0], %[at3], %[acc3] \n"
+                    "fmadd.d %[acc0], %[a1], %[at4], %[acc0] \n"
+                    "fmadd.d %[acc1], %[a1], %[at5], %[acc1] \n"
+                    "fmadd.d %[acc2], %[a1], %[at6], %[acc2] \n"
+                    "fmadd.d %[acc3], %[a1], %[at7], %[acc3] \n"
+                    "fmadd.d %[acc0], %[a2], %[at8], %[acc0] \n"
+                    "fmadd.d %[acc1], %[a2], %[at9], %[acc1] \n"
+                    "fmadd.d %[acc2], %[a2], %[at10], %[acc2] \n"
+                    "fmadd.d %[acc3], %[a2], %[at11], %[acc3] \n"
+                    "fmadd.d %[acc0], %[a3], %[at12], %[acc0] \n"
+                    "fmadd.d %[acc1], %[a3], %[at13], %[acc1] \n"
+                    "fmadd.d %[acc2], %[a3], %[at14], %[acc2] \n"
+                    "fmadd.d %[acc3], %[a3], %[at15], %[acc3] \n"
+                    : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                      [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3])
+                    : [ a0 ] "f"(a[i * n + k + 0]),
+                      [ a1 ] "f"(a[i * n + k + 1]),
+                      [ a2 ] "f"(a[i * n + k + 2]),
+                      [ a3 ] "f"(a[i * n + k + 3]),
+                      [ at0 ] "f"(at[(j + 0) * n + k]),
+                      [ at1 ] "f"(at[(j + 1) * n + k]),
+                      [ at2 ] "f"(at[(j + 2) * n + k]),
+                      [ at3 ] "f"(at[(j + 3) * n + k]),
+                      [ at4 ] "f"(at[(j + 0) * n + k + 1]),
+                      [ at5 ] "f"(at[(j + 1) * n + k + 1]),
+                      [ at6 ] "f"(at[(j + 2) * n + k + 1]),
+                      [ at7 ] "f"(at[(j + 3) * n + k + 1]),
+                      [ at8 ] "f"(at[(j + 0) * n + k + 2]),
+                      [ at9 ] "f"(at[(j + 1) * n + k + 2]),
+                      [ at10 ] "f"(at[(j + 2) * n + k + 2]),
+                      [ at11 ] "f"(at[(j + 3) * n + k + 2]),
+                      [ at12 ] "f"(at[(j + 0) * n + k + 3]),
+                      [ at13 ] "f"(at[(j + 1) * n + k + 3]),
+                      [ at14 ] "f"(at[(j + 2) * n + k + 3]),
+                      [ at15 ] "f"(at[(j + 3) * n + k + 3])
                     :
                 );
             }
 
-            b[i * m + j] = acc;
+            b[i * m + j + 0] = alpha * acc[0];
+            b[i * m + j + 1] = alpha * acc[1];
+            b[i * m + j + 2] = alpha * acc[2];
+            b[i * m + j + 3] = alpha * acc[3];
         }
     }
 }
 
-void ata_opt(uint32_t m, uint32_t n, double *a, double *at, double *b) {
+void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
@@ -114,19 +144,21 @@ void ata_opt(uint32_t m, uint32_t n, double *a, double *at, double *b) {
 
             asm volatile(
                 "frep.o %[n_frep], %[unroll], 0, 0 \n"
-                "fmadd.d %[b0], ft0, ft1, %[b0] \n"
-                "fmadd.d %[b1], ft0, ft1, %[b1] \n"
-                "fmadd.d %[b2], ft0, ft1, %[b2] \n"
-                "fmadd.d %[b3], ft0, ft1, %[b3] \n"
-                : [ b0 ] "+f"(acc[0]), [ b1 ] "+f"(acc[1]),
-                  [ b2 ] "+f"(acc[2]), [ b3 ] "+f"(acc[3])
-                : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll)
+                "fmadd.d %[acc0], ft0, ft1, %[acc0] \n"
+                "fmadd.d %[acc1], ft0, ft1, %[acc1] \n"
+                "fmadd.d %[acc2], ft0, ft1, %[acc2] \n"
+                "fmadd.d %[acc3], ft0, ft1, %[acc3] \n"
+                "fmul.d %[b0], %[acc0], %[alpha] \n"
+                "fmul.d %[b1], %[acc1], %[alpha] \n"
+                "fmul.d %[b2], %[acc2], %[alpha] \n"
+                "fmul.d %[b3], %[acc3], %[alpha] \n"
+                : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                  [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]),
+                  [ b0 ] "=f"(b[i * m + j + 0]), [ b1 ] "=f"(b[i * m + j + 1]),
+                  [ b2 ] "=f"(b[i * m + j + 2]), [ b3 ] "=f"(b[i * m + j + 3])
+                : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll),
+                  [ alpha ] "f"(alpha)
                 : "ft0", "ft1", "ft2");
-
-            b[i * m + j + 0] = acc[0];
-            b[i * m + j + 1] = acc[1];
-            b[i * m + j + 2] = acc[2];
-            b[i * m + j + 3] = acc[3];
         }
     }
 
@@ -262,7 +294,7 @@ void ata_job(ata_args_t *args) {
 
                 // Perform tile computation
                 ata_fp_t fp = args->funcptr;
-                fp(m_frac, args->n, local_a[buff_idx], 
+                fp(args->alpha, m_frac, args->n, local_a[buff_idx],
                    local_at[buff_idx], local_b[buff_idx]);
 
                 snrt_mcycle();

From 018f77aa6766750e003fc661c55005d55f820354 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 19 Aug 2024 14:34:53 +0200
Subject: [PATCH 09/19] sw: Allow apps to extend `INCDIRS`

---
 sw/apps/common.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sw/apps/common.mk b/sw/apps/common.mk
index 89f5da9f6..6bdc85984 100644
--- a/sw/apps/common.mk
+++ b/sw/apps/common.mk
@@ -13,7 +13,7 @@ DATA_H          := $($(APP)_BUILD_DIR)/data.h
 DATAGEN_PY       = $(SCRIPTS_DIR)/datagen.py
 
 $(APP)_HEADERS := $(DATA_H)
-$(APP)_INCDIRS := $(dir $(DATA_H)) $(SRC_DIR)
+$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR)
 
 $(dir $(DATA_H)):
 	mkdir -p $@

From 3e975e9dbc75d3e76b95c8698644057aa7371c3f Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 19 Aug 2024 14:36:32 +0200
Subject: [PATCH 10/19] covariance: Fix bug and optimize baseline

---
 sw/apps/covariance/roi.json                   | 36 ---------
 sw/apps/covariance/scripts/datagen.py         |  9 ++-
 sw/apps/covariance/src/covariance.h           | 76 ++++++++++++-------
 .../snitch_cluster/sw/apps/covariance/app.mk  |  1 +
 4 files changed, 57 insertions(+), 65 deletions(-)
 delete mode 100644 sw/apps/covariance/roi.json

diff --git a/sw/apps/covariance/roi.json b/sw/apps/covariance/roi.json
deleted file mode 100644
index 757a2ce6d..000000000
--- a/sw/apps/covariance/roi.json
+++ /dev/null
@@ -1,36 +0,0 @@
-[
-    <% DOUBLE_BUFFER = 1 %>
-    <% N_TILES = 4 %>
-
-    // Compute cores
-    % for j in range(0, 8):
-    {
-        "thread": "${f'hart_{j}'}",
-        "roi": [
-        % for i in range(0, N_TILES):
-            {"idx": ${2 * i + 1}, "label": "${f'tile_{i}'}"},
-        % endfor
-        ]
-    },
-    % endfor
-
-    // DMA core
-    {
-        "thread": "hart_8",
-        "roi": [
-    % if not DOUBLE_BUFFER:
-        % for i in range(0, N_TILES):
-            {"idx": ${4 * i + 1}, "label": "${f'tile_{i}_in'}"},
-            {"idx": ${4 * i + 3}, "label": "${f'tile_{i}_out'}"},
-        % endfor
-    % else:
-            {"idx": 1, "label": "tile_0_in"},
-        % for i in range(1, N_TILES):
-            {"idx": ${4 * (i - 1) + 3}, "label": "${f'tile_{i}_in'}"},
-            {"idx": ${4 * (i - 1) + 5}, "label": "${f'tile_{i-1}_out'}"},
-        % endfor
-            {"idx": ${4 * (i - 1) + 7}, "label": "tile_15_out"},
-    % endif
-        ]
-    }
-]
\ No newline at end of file
diff --git a/sw/apps/covariance/scripts/datagen.py b/sw/apps/covariance/scripts/datagen.py
index 165fc30a5..c3b7cd8b3 100755
--- a/sw/apps/covariance/scripts/datagen.py
+++ b/sw/apps/covariance/scripts/datagen.py
@@ -12,6 +12,7 @@
 from snitch.util.sim.data_utils import format_array_definition, \
     format_array_declaration, format_struct_definition, DataGen
 
+np.random.seed(42)
 
 DOUBLE_BUFFER = True
 
@@ -19,17 +20,19 @@
 class CovarianceDataGen(DataGen):
 
     # Function pointers to alternative implementations
-    FUNCPTRS = ["covariance_naive", "covariance_opt"]
+    FUNCPTRS = ["covariance_naive", "covariance_baseline", "covariance_opt"]
 
     def golden_model(self, data):
         return np.cov(data, rowvar=False)
 
     def validate(self, **kwargs):
+        n_cores = 8
         assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles"
         m_per_tile = kwargs['m'] / kwargs['m_tiles']
-        assert (m_per_tile % 8) == 0, "m_per_tile must be an integer multiple of the number of cores"
+        assert (m_per_tile % n_cores) == 0, \
+            "m_per_tile must be an integer multiple of the number of cores"
         assert (m_per_tile % 4) == 0, "m_per_tile must be an integer multiple of unroll1 = 4"
-        m_per_core = m_per_tile / 8
+        m_per_core = m_per_tile / n_cores
         assert (m_per_core % 2) == 0, "m_per_core must be an integer multiple of the unroll0 = 2"
         assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
 
diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h
index 41c33a93b..29c6aec69 100644
--- a/sw/apps/covariance/src/covariance.h
+++ b/sw/apps/covariance/src/covariance.h
@@ -7,6 +7,7 @@
 
 #include "args.h"
 #include "snrt.h"
+#include "ata.h"
 
 #define DOUBLE_BUFFER 1
 
@@ -40,15 +41,40 @@ void covariance_naive(uint32_t m, uint32_t n, double inv_n,
     snrt_cluster_hw_barrier();
 
     // Compute covariance matrix
+    ata_naive(inv_n_m1, m, n, data, datat, cov);
+}
+
+void covariance_baseline(uint32_t m, uint32_t n, double inv_n,
+                         double inv_n_m1, double *data, double *datat,
+                         double *cov) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Center data
     for (uint32_t i = offset; i < m; i += stride) {
-        for (uint32_t j = 0; j < m; j++) {
-            cov[i * m + j] = 0.0;
-            for (uint32_t k = 0; k < n; k++) {
-                cov[i * m + j] += data[i * n + k] * datat[j * n + k];
-            }
-            cov[i * m + j] *= inv_n_m1;
+
+        // Calculate row mean
+        double data_mean = 0.0;
+        double datat_mean = 0.0;
+        for (uint32_t j = 0; j < n; j++) {
+            data_mean += data[i * n + j];
+            datat_mean += datat[i * n + j];
+        }
+        data_mean = data_mean * inv_n;
+        datat_mean = datat_mean * inv_n;
+
+        // Center row around zero
+        for (uint32_t j = 0; j < n; j++) {
+            data[i * n + j] -= data_mean;
+            datat[i * n + j] -= datat_mean;
         }
     }
+
+    snrt_fpu_fence();
+    snrt_cluster_hw_barrier();
+
+    // Compute covariance matrix
+    ata_baseline(inv_n_m1, m, n, data, datat, cov);
 }
 
 void covariance_opt(uint32_t m, uint32_t n, double inv_n,
@@ -79,6 +105,7 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
     snrt_ssr_loop_4d(SNRT_SSR_DM1,
                      ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3],
                      ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]);
+    snrt_ssr_repeat(SNRT_SSR_DM0, 1);
     // Configure ft2 to store data and datat elements
     // for (i1 = offset; i1 < m; i1 += stride * unroll0)
     //     for (j = 0; j < n; j++)
@@ -145,8 +172,8 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
     snrt_fpu_fence();
     snrt_cluster_hw_barrier();
 
-    // The following is taken from the AtA kernel, apart from the normalization
-    // by 1/(n - 1).
+    // The following is taken from the AtA kernel, where alpha is set to
+    // the factor 1/(n - 1).
     // Here data stands for A and datat for At.
 
     // Unrolling factor of innermost loop
@@ -175,7 +202,7 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
         ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
 
     // SSR start address need to be configured each time
-    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, data + offset * n);
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, data + offset * n);
     snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, datat);
     snrt_ssr_enable();
 
@@ -190,23 +217,21 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
 
             asm volatile(
                 "frep.o %[n_frep], %[unroll1], 0, 0 \n"
-                "fmadd.d %[b0], ft0, ft1, %[b0] \n"
-                "fmadd.d %[b1], ft0, ft1, %[b1] \n"
-                "fmadd.d %[b2], ft0, ft1, %[b2] \n"
-                "fmadd.d %[b3], ft0, ft1, %[b3] \n"
-                : [ b0 ] "+f"(acc[0]), [ b1 ] "+f"(acc[1]),
-                  [ b2 ] "+f"(acc[2]), [ b3 ] "+f"(acc[3])
-                : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1)
+                "fmadd.d %[acc0], ft0, ft1, %[acc0] \n"
+                "fmadd.d %[acc1], ft0, ft1, %[acc1] \n"
+                "fmadd.d %[acc2], ft0, ft1, %[acc2] \n"
+                "fmadd.d %[acc3], ft0, ft1, %[acc3] \n"
+                "fmul.d %[b0], %[acc0], %[alpha] \n"
+                "fmul.d %[b1], %[acc1], %[alpha] \n"
+                "fmul.d %[b2], %[acc2], %[alpha] \n"
+                "fmul.d %[b3], %[acc3], %[alpha] \n"
+                : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                  [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]),
+                  [ b0 ] "=&f"(cov[i * m + j + 0]), [ b1 ] "=&f"(cov[i * m + j + 1]),
+                  [ b2 ] "=&f"(cov[i * m + j + 2]), [ b3 ] "=&f"(cov[i * m + j + 3])
+                : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1),
+                  [ alpha ] "f"(inv_n_m1)  // live past b0-b2 writes => "=&f"
                 : "ft0", "ft1", "ft2");
-
-            snrt_ssr_disable();
-
-            cov[i * m + j + 0] = acc[0] * inv_n_m1;
-            cov[i * m + j + 1] = acc[1] * inv_n_m1;
-            cov[i * m + j + 2] = acc[2] * inv_n_m1;
-            cov[i * m + j + 3] = acc[3] * inv_n_m1;
-
-            snrt_ssr_enable();
         }
     }
 
@@ -245,7 +270,6 @@ void covariance_job(covariance_args_t *args) {
     b_tile_bytes = b_tile_size * sizeof(double);
 
     // Allocate space for job operands in TCDM
-    // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th.
     local_a0_addr = (uint64_t)args + sizeof(covariance_args_t);
     local_at0_addr = local_a0_addr + a_tile_bytes;
     local_b0_addr = local_at0_addr + a_tile_bytes;
diff --git a/target/snitch_cluster/sw/apps/covariance/app.mk b/target/snitch_cluster/sw/apps/covariance/app.mk
index c177a9d61..005791c79 100644
--- a/target/snitch_cluster/sw/apps/covariance/app.mk
+++ b/target/snitch_cluster/sw/apps/covariance/app.mk
@@ -8,6 +8,7 @@ APP              := covariance
 $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build
 SRC_DIR          := $(ROOT)/sw/apps/$(APP)/src
 SRCS             := $(SRC_DIR)/main.c
+$(APP)_INCDIRS   := $(ROOT)/sw/apps/ata/src/
 
 include $(ROOT)/sw/apps/common.mk
 include $(ROOT)/target/snitch_cluster/sw/apps/common.mk

From 5e663d7e25cc410083ecb4c0b540328eae1e6f04 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 19 Aug 2024 18:10:00 +0200
Subject: [PATCH 11/19] sw: Replace AtA kernel with syrk

---
 sw/apps/covariance/src/covariance.h           |   6 +-
 sw/blas/blas.h                                |  14 ++
 sw/blas/gemm/src/gemm.h                       |  13 --
 sw/blas/gemm/src/main.c                       |   2 +-
 sw/{apps/ata => blas/syrk}/.gitignore         |   0
 sw/{apps/ata => blas/syrk}/data/params.json   |  10 +-
 sw/{apps/ata => blas/syrk}/scripts/datagen.py |  46 +++--
 sw/{apps/ata => blas/syrk}/scripts/verify.py  |  20 +-
 sw/{apps/ata => blas/syrk}/src/args.h         |  13 +-
 sw/{apps/ata => blas/syrk}/src/main.c         |   4 +-
 .../ata/src/ata.h => blas/syrk/src/syrk.h}    | 186 +++++++++---------
 target/snitch_cluster/sw.mk                   |   2 +-
 .../snitch_cluster/sw/apps/blas/gemm/app.mk   |   1 +
 .../sw/apps/{ata => blas/syrk}/app.mk         |   7 +-
 .../snitch_cluster/sw/apps/covariance/app.mk  |   2 +-
 15 files changed, 180 insertions(+), 146 deletions(-)
 rename sw/{apps/ata => blas/syrk}/.gitignore (100%)
 rename sw/{apps/ata => blas/syrk}/data/params.json (63%)
 rename sw/{apps/ata => blas/syrk}/scripts/datagen.py (56%)
 rename sw/{apps/ata => blas/syrk}/scripts/verify.py (70%)
 rename sw/{apps/ata => blas/syrk}/src/args.h (66%)
 rename sw/{apps/ata => blas/syrk}/src/main.c (88%)
 rename sw/{apps/ata/src/ata.h => blas/syrk/src/syrk.h} (69%)
 rename target/snitch_cluster/sw/apps/{ata => blas/syrk}/app.mk (65%)

diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h
index 29c6aec69..53944e6ca 100644
--- a/sw/apps/covariance/src/covariance.h
+++ b/sw/apps/covariance/src/covariance.h
@@ -6,8 +6,8 @@
 //         Luca Colagrande <colluca@iis.ee.ethz.ch>
 
 #include "args.h"
+#include "blas.h"
 #include "snrt.h"
-#include "ata.h"
 
 #define DOUBLE_BUFFER 1
 
@@ -41,7 +41,7 @@ void covariance_naive(uint32_t m, uint32_t n, double inv_n,
     snrt_cluster_hw_barrier();
 
     // Compute covariance matrix
-    ata_naive(inv_n_m1, m, n, data, datat, cov);
+    syrk_naive(m, n, inv_n_m1, data, datat, 0, cov);
 }
 
 void covariance_baseline(uint32_t m, uint32_t n, double inv_n,
@@ -74,7 +74,7 @@ void covariance_baseline(uint32_t m, uint32_t n, double inv_n,
     snrt_cluster_hw_barrier();
 
     // Compute covariance matrix
-    ata_baseline(inv_n_m1, m, n, data, datat, cov);
+    syrk_baseline(m, n, inv_n_m1, data, datat, 0, cov);
 }
 
 void covariance_opt(uint32_t m, uint32_t n, double inv_n,
diff --git a/sw/blas/blas.h b/sw/blas/blas.h
index 33c29e175..69005ccb7 100644
--- a/sw/blas/blas.h
+++ b/sw/blas/blas.h
@@ -4,6 +4,20 @@
 
 #pragma once
 
+// Floating-point multiplications by zero cannot be optimized as in some
+// edge cases they do not yield zero:
+// - 0f * NaN = NaN
+// - 0f * INFINITY == NaN
+// Thus in order to optimize it, we need to test for zero. You can use this
+// function for free when `multiplier` is a constant.
+static inline double multiply_opt(double multiplicand, double multiplier) {
+    if (multiplier)
+        return multiplicand * multiplier;
+    else
+        return 0;
+}
+
 #include "axpy/src/axpy.h"
 #include "dot/src/dot.h"
 #include "gemm/src/gemm.h"
+#include "syrk/src/syrk.h"
diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h
index a480379a9..1a73aedf8 100644
--- a/sw/blas/gemm/src/gemm.h
+++ b/sw/blas/gemm/src/gemm.h
@@ -13,19 +13,6 @@
 
 #pragma once
 
-// Floating-point multiplications by zero cannot be optimized as in some
-// edge cases they do not yield zero:
-// - 0f * NaN = NaN
-// - 0f * INFINITY == NaN
-// Thus in order to optimize it, we need to test for zero. You can use this
-// function for free when `multiplier` is a constant.
-static inline double multiply_opt(double multiplicand, double multiplier) {
-    if (multiplier)
-        return multiplicand * multiplier;
-    else
-        return 0;
-}
-
 #include "gemm_fp16.h"
 #include "gemm_fp32.h"
 #include "gemm_fp64.h"
diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c
index 17f3936b0..9760000c6 100644
--- a/sw/blas/gemm/src/main.c
+++ b/sw/blas/gemm/src/main.c
@@ -9,7 +9,7 @@
 #include <math.h>
 #include <stdint.h>
 
-#include "gemm.h"
+#include "blas.h"
 
 #include "data.h"
 #include "snrt.h"
diff --git a/sw/apps/ata/.gitignore b/sw/blas/syrk/.gitignore
similarity index 100%
rename from sw/apps/ata/.gitignore
rename to sw/blas/syrk/.gitignore
diff --git a/sw/apps/ata/data/params.json b/sw/blas/syrk/data/params.json
similarity index 63%
rename from sw/apps/ata/data/params.json
rename to sw/blas/syrk/data/params.json
index 1db35db08..492d8e0cc 100644
--- a/sw/apps/ata/data/params.json
+++ b/sw/blas/syrk/data/params.json
@@ -3,8 +3,10 @@
 // SPDX-License-Identifier: Apache-2.0
 
 {
-    "m": 16,
-    "n": 4,
-    "m_tiles": 2,
-    "funcptr": "ata_opt"
+    "m": 8,
+    "n": 2,
+    "alpha": 1.5,
+    "beta": 3.2,
+    "m_tiles": 1,
+    "funcptr": "syrk_opt"
 }
diff --git a/sw/apps/ata/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py
similarity index 56%
rename from sw/apps/ata/scripts/datagen.py
rename to sw/blas/syrk/scripts/datagen.py
index f6474f2e6..05cd2f038 100755
--- a/sw/apps/ata/scripts/datagen.py
+++ b/sw/blas/syrk/scripts/datagen.py
@@ -14,25 +14,27 @@
 
 DOUBLE_BUFFER = True
 
-class AtaDataGen(DataGen):
+class SyrkDataGen(DataGen):
 
     # Function pointers to alternative implementations
-    FUNCPTRS = ["ata_naive", "ata_baseline", "ata_opt"]
+    FUNCPTRS = ["syrk_naive", "syrk_baseline", "syrk_opt"]
 
-    def golden_model(self, alpha, A):
-        return alpha * np.matmul(A, A.transpose())
+    def golden_model(self, alpha, A, beta, C):
+        return alpha * np.matmul(A, A.transpose()) + beta * C
 
     def validate(self, **kwargs):
+        n_cores = 8
         assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles"
         m_frac = kwargs['m'] / kwargs['m_tiles']
-        assert (m_frac % 8) == 0, "m_frac must be an integer multiple of the number of cores"
-        assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4"
+        assert (m_frac % n_cores) == 0, "m_frac must be an integer multiple of the number of cores"
+        if kwargs['funcptr'] != "syrk_naive":
+            assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4"
         assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
 
         # Calculate total TCDM occupation
         a_tile_size = m_frac * kwargs['n'] * 8
-        b_tile_size = m_frac * m_frac * 8
-        total_size = 2 * a_tile_size + b_tile_size
+        c_tile_size = m_frac * m_frac * 8
+        total_size = 2 * a_tile_size + c_tile_size
         if DOUBLE_BUFFER:
             total_size *= 2
         data_utils.validate_tcdm_footprint(total_size)
@@ -42,33 +44,43 @@ def emit_header(self, **kwargs):
 
         self.validate(**kwargs)
 
+        if 'alpha' in kwargs:
+            alpha = kwargs['alpha']
+        else:
+            alpha = np.random.randint(-200, 100)/100
+        if 'beta' in kwargs:
+            beta = kwargs['beta']
+        else:
+            beta = np.random.randint(-200, 100)/100
+
         A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100
-        alpha = np.random.randint(-200, 100)/100
-        B = self.golden_model(alpha, A)
+        C_in = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['m']))/100
+        C_out = self.golden_model(alpha, A, beta, C_in)
 
         A = A.flatten()
-        B = B.flatten()
+        C_in = C_in.flatten()
 
         A_uid = 'A'
-        B_uid = 'B'
+        C_uid = 'C'
 
         cfg = {
-            'alpha': alpha,
             'm': kwargs['m'],
             'n': kwargs['n'],
+            'alpha': alpha,
+            'beta': beta,
             'a': A_uid,
-            'b': B_uid,
+            'c': C_uid,
             'm_tiles': kwargs['m_tiles'],
             'funcptr': kwargs['funcptr']
         }
 
         header += [format_array_definition('double', A_uid, A)]
-        header += [format_array_declaration('double', B_uid, B.shape)]
-        header += [format_struct_definition('ata_args_t', 'args', cfg)]
+        header += [format_array_definition('double', C_uid, C_in)]
+        header += [format_struct_definition('syrk_args_t', 'args', cfg)]
         header = '\n\n'.join(header)
 
         return header
 
 
 if __name__ == '__main__':
-    AtaDataGen().main()
+    SyrkDataGen().main()
diff --git a/sw/apps/ata/scripts/verify.py b/sw/blas/syrk/scripts/verify.py
similarity index 70%
rename from sw/apps/ata/scripts/verify.py
rename to sw/blas/syrk/scripts/verify.py
index 206af870a..0624156cb 100755
--- a/sw/apps/ata/scripts/verify.py
+++ b/sw/blas/syrk/scripts/verify.py
@@ -7,23 +7,24 @@
 
 import numpy as np
 import sys
-from datagen import AtaDataGen
+from datagen import SyrkDataGen
 
 from snitch.util.sim.verif_utils import Verifier
 
 
-class AtaVerifier(Verifier):
+class SyrkVerifier(Verifier):
 
-    OUTPUT_UIDS = ['B']
+    OUTPUT_UIDS = ['C']
 
     def __init__(self):
         super().__init__()
         self.func_args = {
-            'alpha': 'd',
             'm': 'I',
             'n': 'I',
+            'alpha': 'd',
+            'beta': 'd',
             'A': 'I',
-            'B': 'I',
+            'C': 'I',
             'm_tiles': 'I',
             'funcptr': 'I'
         }
@@ -34,12 +35,17 @@ def get_actual_results(self):
 
     def get_expected_results(self):
         A = self.get_input_from_symbol('A', 'double')
+        C = self.get_input_from_symbol('C', 'double')
         A = np.reshape(A, (self.func_args['m'], self.func_args['n']))
-        return AtaDataGen().golden_model(self.func_args['alpha'], A).flatten()
+        C = np.reshape(C, (self.func_args['m'], self.func_args['m']))
+        return SyrkDataGen().golden_model(
+            self.func_args['alpha'], A,
+            self.func_args['beta'], C
+        ).flatten()
 
     def check_results(self, *args):
         return super().check_results(*args, rtol=1e-10)
 
 
 if __name__ == "__main__":
-    sys.exit(AtaVerifier().main())
+    sys.exit(SyrkVerifier().main())
diff --git a/sw/apps/ata/src/args.h b/sw/blas/syrk/src/args.h
similarity index 66%
rename from sw/apps/ata/src/args.h
rename to sw/blas/syrk/src/args.h
index f65a6a13f..6bb58e00e 100644
--- a/sw/apps/ata/src/args.h
+++ b/sw/blas/syrk/src/args.h
@@ -7,15 +7,16 @@
 #pragma once
 #include <stdint.h>
 
-typedef void (*ata_fp_t)(double alpha, uint32_t m, uint32_t n, double *a,
-    double *at, double *b);
+typedef void (*syrk_fp_t)(uint32_t m, uint32_t n, double alpha, double *a,
+    double *at, double beta, double *b);
 
 typedef struct {
-    double alpha;
     uint32_t m;
     uint32_t n;
+    double alpha;
+    double beta;
     double *a;
-    double *b;
+    double *c;
     uint32_t m_tiles;
-    ata_fp_t funcptr;
-} ata_args_t;
+    syrk_fp_t funcptr;
+} syrk_args_t;
diff --git a/sw/apps/ata/src/main.c b/sw/blas/syrk/src/main.c
similarity index 88%
rename from sw/apps/ata/src/main.c
rename to sw/blas/syrk/src/main.c
index c8df4bea9..9f1ad7163 100644
--- a/sw/apps/ata/src/main.c
+++ b/sw/blas/syrk/src/main.c
@@ -6,12 +6,12 @@
 
 #include "snrt.h"
 
-#include "ata.h"
+#include "blas.h"
 #include "data.h"
 
 int main() {
 
-    ata_job(&args);
+    syrk_job(&args);
 
     return 0;
 }
diff --git a/sw/apps/ata/src/ata.h b/sw/blas/syrk/src/syrk.h
similarity index 69%
rename from sw/apps/ata/src/ata.h
rename to sw/blas/syrk/src/syrk.h
index 8673353a4..9494f2777 100644
--- a/sw/apps/ata/src/ata.h
+++ b/sw/blas/syrk/src/syrk.h
@@ -7,26 +7,27 @@
 #include "args.h"
 #include "snrt.h"
 
-#define DOUBLE_BUFFER 1
-
 __thread int setup_ssr = 1;
 
-void ata_naive(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) {
+void syrk_naive(uint32_t m, uint32_t n, double alpha, double *a, double *at,
+                double beta, double *c) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
     for (uint32_t i = offset; i < m; i += stride) {
         for (uint32_t j = 0; j < m; j++) {
-            b[i * m + j] = 0;
+            double acc = 0;
             for (uint32_t k = 0; k < n; k++) {
-                b[i * m + j] += a[i * n + k] * at[j * n + k];
+                acc += a[i * n + k] * at[j * n + k];
             }
-            b[i * m + j] *= alpha;
+            c[i * m + j] = multiply_opt(c[i * m + j], beta);
+            c[i * m + j] += alpha * acc;
         }
     }
 }
 
-void ata_baseline(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) {
+void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at,
+                   double beta, double *c) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
@@ -89,15 +90,20 @@ void ata_baseline(double alpha, uint32_t m, uint32_t n, double *a, double *at, d
                 );
             }
 
-            b[i * m + j + 0] = alpha * acc[0];
-            b[i * m + j + 1] = alpha * acc[1];
-            b[i * m + j + 2] = alpha * acc[2];
-            b[i * m + j + 3] = alpha * acc[3];
+            c[i * m + j + 0] = multiply_opt(c[i * m + j + 0], beta);
+            c[i * m + j + 1] = multiply_opt(c[i * m + j + 1], beta);
+            c[i * m + j + 2] = multiply_opt(c[i * m + j + 2], beta);
+            c[i * m + j + 3] = multiply_opt(c[i * m + j + 3], beta);
+            c[i * m + j + 0] += alpha * acc[0];
+            c[i * m + j + 1] += alpha * acc[1];
+            c[i * m + j + 2] += alpha * acc[2];
+            c[i * m + j + 3] += alpha * acc[3];
         }
     }
 }
 
-void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) {
+void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at,
+              double beta, double *c) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
@@ -148,16 +154,20 @@ void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double
                 "fmadd.d %[acc1], ft0, ft1, %[acc1] \n"
                 "fmadd.d %[acc2], ft0, ft1, %[acc2] \n"
                 "fmadd.d %[acc3], ft0, ft1, %[acc3] \n"
-                "fmul.d %[b0], %[acc0], %[alpha] \n"
-                "fmul.d %[b1], %[acc1], %[alpha] \n"
-                "fmul.d %[b2], %[acc2], %[alpha] \n"
-                "fmul.d %[b3], %[acc3], %[alpha] \n"
-                : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
-                  [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]),
-                  [ b0 ] "=f"(b[i * m + j + 0]), [ b1 ] "=f"(b[i * m + j + 1]),
-                  [ b2 ] "=f"(b[i * m + j + 2]), [ b3 ] "=f"(b[i * m + j + 3])
+                "fmul.d %[acc0], %[acc0], %[alpha] \n"
+                "fmul.d %[acc1], %[acc1], %[alpha] \n"
+                "fmul.d %[acc2], %[acc2], %[alpha] \n"
+                "fmul.d %[acc3], %[acc3], %[alpha] \n"
+                "fmadd.d %[c0], %[c0], %[beta], %[acc0] \n"
+                "fmadd.d %[c1], %[c1], %[beta], %[acc1] \n"
+                "fmadd.d %[c2], %[c2], %[beta], %[acc2] \n"
+                "fmadd.d %[c3], %[c3], %[beta], %[acc3] \n"
+                : [ c0 ] "+f"(c[i * m + j + 0]), [ c1 ] "+f"(c[i * m + j + 1]),
+                  [ c2 ] "+f"(c[i * m + j + 2]), [ c3 ] "+f"(c[i * m + j + 3]),
+                  [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                  [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3])
                 : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll),
-                  [ alpha ] "f"(alpha)
+                  [ alpha ] "f"(alpha), [ beta ] "f"(beta)
                 : "ft0", "ft1", "ft2");
         }
     }
@@ -166,23 +176,23 @@ void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double
     snrt_fpu_fence();
 }
 
-void ata_job(ata_args_t *args) {
-    uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes;
-    uint64_t local_a0_addr, local_at0_addr, local_b0_addr,
-             local_a1_addr, local_at1_addr, local_b1_addr;
+void syrk_job(syrk_args_t *args) {
+    uint32_t m_frac, a_tile_size, a_tile_bytes, c_tile_size, c_tile_bytes;
+    uint64_t local_a0_addr, local_at0_addr, local_c0_addr,
+             local_a1_addr, local_at1_addr, local_c1_addr;
     double *local_a[2];
     double *local_at[2];
-    double *local_b[2];
-    uint32_t iterations, sb_iterations;
+    double *local_c[2];
+    uint32_t n_tiles, iterations;
     uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx;
 
 #ifndef JOB_ARGS_PRELOADED
     // Allocate space for job arguments in TCDM
-    ata_args_t *local_args = (ata_args_t *)snrt_l1_next();
+    syrk_args_t *local_args = (syrk_args_t *)snrt_l1_next();
 
     // Copy job arguments to TCDM
     if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(local_args, args, sizeof(ata_args_t));
+        snrt_dma_start_1d(local_args, args, sizeof(syrk_args_t));
         snrt_dma_wait_all();
     }
     snrt_cluster_hw_barrier();
@@ -192,43 +202,66 @@ void ata_job(ata_args_t *args) {
     // Calculate size of each tile
     m_frac = args->m / args->m_tiles;
     a_tile_size = args->n * m_frac;
-    b_tile_size = m_frac * m_frac;
+    c_tile_size = m_frac * m_frac;
     a_tile_bytes = a_tile_size * sizeof(double);
-    b_tile_bytes = b_tile_size * sizeof(double);
+    c_tile_bytes = c_tile_size * sizeof(double);
 
     // Allocate space for job operands in TCDM
     // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th.
-    local_a0_addr = (uint64_t)args + sizeof(ata_args_t);
+    local_a0_addr = (uint64_t)args + sizeof(syrk_args_t);
     local_at0_addr = local_a0_addr + a_tile_bytes;
-    local_b0_addr = local_at0_addr + a_tile_bytes;
+    local_c0_addr = local_at0_addr + a_tile_bytes;
     local_a[0] = (double *)local_a0_addr;
     local_at[0] = (double *)local_at0_addr;
-    local_b[0] = (double *)local_b0_addr;
-    if (DOUBLE_BUFFER) {
-        local_a1_addr = local_b0_addr + b_tile_bytes;
-        local_at1_addr = local_a1_addr + a_tile_bytes;
-        local_b1_addr = local_at1_addr + a_tile_bytes;
-        local_a[1] = (double *)local_a1_addr;
-        local_at[1] = (double *)local_at1_addr;
-        local_b[1] = (double *)local_b1_addr;
-    }
+    local_c[0] = (double *)local_c0_addr;
+    local_a1_addr = local_c0_addr + c_tile_bytes;
+    local_at1_addr = local_a1_addr + a_tile_bytes;
+    local_c1_addr = local_at1_addr + a_tile_bytes;
+    local_a[1] = (double *)local_a1_addr;
+    local_at[1] = (double *)local_at1_addr;
+    local_c[1] = (double *)local_c1_addr;
 
     // Calculate number of iterations
-    sb_iterations = args->m_tiles * args->m_tiles;
-    if (DOUBLE_BUFFER) iterations = sb_iterations + 2;
-    else iterations = sb_iterations;
+    n_tiles = args->m_tiles * args->m_tiles;
+    iterations = n_tiles + 2;
 
     // Iterate over all tiles
     for (i = 0; i < iterations; i++) {
         
         if (snrt_is_dm_core()) {
+            // DMA out
+            // (out before in to avoid overwriting data)
+            if (i > 1) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = i - 2;
+                buff_idx = i_dma_out % 2;
+                i_row = i_dma_out / args->m_tiles;
+                i_col = i_dma_out % args->m_tiles;
+
+                // Copy job outputs from TCDM
+                snrt_dma_store_2d_tile(
+                    args->c,
+                    local_c[buff_idx],
+                    i_row,
+                    i_col,
+                    m_frac,
+                    m_frac,
+                    args->m,
+                    sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+
             // DMA in
-            if (!DOUBLE_BUFFER || (i < sb_iterations)) {
+            if (i < n_tiles) {
                 snrt_mcycle();
 
                 // Compute tile and buffer indices
                 i_dma_in = i;
-                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+                buff_idx = i_dma_in % 2;
                 i_row = i_dma_in / args->m_tiles;
                 i_col = i_dma_in % args->m_tiles;
 
@@ -245,35 +278,17 @@ void ata_job(ata_args_t *args) {
                     i_col,
                     a_tile_size,
                     sizeof(double));
-                snrt_dma_wait_all();
-
-                snrt_mcycle();
-            }
-
-            // Additional barriers required when not double buffering
-            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
-            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
-
-            // DMA out
-            if (!DOUBLE_BUFFER || (i > 1)) {
-                snrt_mcycle();
-
-                // Compute tile and buffer indices
-                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
-                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
-                i_row = i_dma_out / args->m_tiles;
-                i_col = i_dma_out % args->m_tiles;
-
-                // Copy job outputs from TCDM
-                snrt_dma_store_2d_tile(
-                    args->b,
-                    local_b[buff_idx],
-                    i_row,
-                    i_col,
-                    m_frac,
-                    m_frac,
-                    args->m,
-                    sizeof(double));
+                if (args->funcptr == syrk_opt || args->beta != 0) {
+                    snrt_dma_load_2d_tile(
+                        local_c[buff_idx],
+                        args->c,
+                        i_row,
+                        i_col,
+                        m_frac,
+                        m_frac,
+                        args->m,
+                        sizeof(double));
+                }
                 snrt_dma_wait_all();
 
                 snrt_mcycle();
@@ -282,27 +297,22 @@ void ata_job(ata_args_t *args) {
 
         // Compute
         if (snrt_is_compute_core()) {
-            // Additional barrier required when not double buffering
-            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
-
-            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) {
+            if (i > 0 && i < (n_tiles + 1)) {
                 snrt_mcycle();
 
                 // Compute tile and buffer indices
-                i_compute = DOUBLE_BUFFER ? i - 1 : i;
-                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+                i_compute = i - 1;
+                buff_idx = i_compute % 2;
 
                 // Perform tile computation
-                ata_fp_t fp = args->funcptr;
-                fp(args->alpha, m_frac, args->n, local_a[buff_idx],
-                   local_at[buff_idx], local_b[buff_idx]);
+                syrk_fp_t fp = args->funcptr;
+                fp(m_frac, args->n, args->alpha, local_a[buff_idx],
+                   local_at[buff_idx], args->beta, local_c[buff_idx]);
 
                 snrt_mcycle();
             }
-
-            // Additional barrier required when not double buffering
-            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
         }
+
         // Synchronize cores after every iteration
         snrt_cluster_hw_barrier();
     }
diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk
index 0a1e4c00c..674ea2cad 100644
--- a/target/snitch_cluster/sw.mk
+++ b/target/snitch_cluster/sw.mk
@@ -51,6 +51,7 @@ APPS  = sw/apps/nop
 APPS += sw/apps/blas/axpy
 APPS += sw/apps/blas/gemm
 APPS += sw/apps/blas/dot
+APPS += sw/apps/blas/syrk
 APPS += sw/apps/dnn/batchnorm
 APPS += sw/apps/dnn/conv2d
 APPS += sw/apps/dnn/fusedconv
@@ -63,7 +64,6 @@ APPS += sw/apps/dnn/concat
 APPS += sw/apps/dnn/fused_concat_linear
 APPS += sw/apps/dnn/transpose
 APPS += sw/apps/montecarlo/pi_estimation
-APPS += sw/apps/ata
 APPS += sw/apps/atax
 APPS += sw/apps/correlation
 APPS += sw/apps/covariance
diff --git a/target/snitch_cluster/sw/apps/blas/gemm/app.mk b/target/snitch_cluster/sw/apps/blas/gemm/app.mk
index 5d2b54068..f50f6d21c 100644
--- a/target/snitch_cluster/sw/apps/blas/gemm/app.mk
+++ b/target/snitch_cluster/sw/apps/blas/gemm/app.mk
@@ -8,6 +8,7 @@ APP              := gemm
 $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build
 SRC_DIR          := $(ROOT)/sw/blas/$(APP)/src
 SRCS             := $(SRC_DIR)/main.c
+$(APP)_INCDIRS   := $(ROOT)/sw/blas
 
 include $(ROOT)/sw/apps/common.mk
 include $(ROOT)/target/snitch_cluster/sw/apps/common.mk
diff --git a/target/snitch_cluster/sw/apps/ata/app.mk b/target/snitch_cluster/sw/apps/blas/syrk/app.mk
similarity index 65%
rename from target/snitch_cluster/sw/apps/ata/app.mk
rename to target/snitch_cluster/sw/apps/blas/syrk/app.mk
index af63400b4..c0fd05044 100644
--- a/target/snitch_cluster/sw/apps/ata/app.mk
+++ b/target/snitch_cluster/sw/apps/blas/syrk/app.mk
@@ -4,10 +4,11 @@
 #
 # Luca Colagrande <colluca@iis.ee.ethz.ch>
 
-APP              := ata
-$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build
-SRC_DIR          := $(ROOT)/sw/apps/$(APP)/src
+APP              := syrk
+$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build
+SRC_DIR          := $(ROOT)/sw/blas/$(APP)/src
 SRCS             := $(SRC_DIR)/main.c
+$(APP)_INCDIRS   := $(ROOT)/sw/blas
 
 include $(ROOT)/sw/apps/common.mk
 include $(ROOT)/target/snitch_cluster/sw/apps/common.mk
diff --git a/target/snitch_cluster/sw/apps/covariance/app.mk b/target/snitch_cluster/sw/apps/covariance/app.mk
index 005791c79..e985e671e 100644
--- a/target/snitch_cluster/sw/apps/covariance/app.mk
+++ b/target/snitch_cluster/sw/apps/covariance/app.mk
@@ -8,7 +8,7 @@ APP              := covariance
 $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build
 SRC_DIR          := $(ROOT)/sw/apps/$(APP)/src
 SRCS             := $(SRC_DIR)/main.c
-$(APP)_INCDIRS   := $(ROOT)/sw/apps/ata/src/
+$(APP)_INCDIRS   := $(ROOT)/sw/blas/
 
 include $(ROOT)/sw/apps/common.mk
 include $(ROOT)/target/snitch_cluster/sw/apps/common.mk

From 33f49db515ba6be55e5e4851808350f679ebe309 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 19 Aug 2024 18:13:21 +0200
Subject: [PATCH 12/19] ci: Add covariance and syrk

---
 target/snitch_cluster/sw/fdiv.yaml | 2 --
 target/snitch_cluster/sw/run.yaml  | 4 ++++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/target/snitch_cluster/sw/fdiv.yaml b/target/snitch_cluster/sw/fdiv.yaml
index a8b5f3930..d6b7aea3b 100644
--- a/target/snitch_cluster/sw/fdiv.yaml
+++ b/target/snitch_cluster/sw/fdiv.yaml
@@ -13,5 +13,3 @@ runs:
     cmd: [../../../sw/dnn/flashattention_2/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/correlation/build/correlation.elf
     cmd: [../../../sw/apps/correlation/scripts/verify.py, "${sim_bin}", "${elf}"]
-  - elf: apps/covariance/build/covariance.elf
-    cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"]
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index 7a5a55a4c..ab302f7c3 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -80,6 +80,8 @@ runs:
     cmd: [../../../sw/blas/gemm/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/blas/dot/build/dot.elf
     cmd: [../../../sw/blas/dot/scripts/verify.py, "${sim_bin}", "${elf}"]
+  - elf: apps/blas/syrk/build/syrk.elf
+    cmd: [../../../sw/blas/syrk/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/dnn/batchnorm/build/batchnorm.elf
   - elf: apps/dnn/maxpool/build/maxpool.elf
   # - elf: apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results
@@ -95,3 +97,5 @@ runs:
   - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf
   # - elf: apps/atax/build/atax.elf
   #   cmd: [../../../sw/apps/atax/scripts/verify.py, "${sim_bin}", "${elf}"]
+  - elf: apps/covariance/build/covariance.elf
+    cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"]

From 55848dc4d25c4922fa10c785b02a877eade1a0db Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 19 Aug 2024 18:27:40 +0200
Subject: [PATCH 13/19] ci: Correct linting

---
 sw/apps/covariance/src/args.h       |   3 +-
 sw/apps/covariance/src/covariance.h | 118 ++++++++++++----------------
 sw/apps/covariance/src/main.c       |   1 -
 sw/blas/axpy/src/args.h             |   9 ++-
 sw/blas/axpy/src/axpy.h             |  40 +++++-----
 sw/blas/axpy/src/main.c             |   1 -
 sw/blas/syrk/scripts/datagen.py     |   5 +-
 sw/blas/syrk/src/args.h             |   2 +-
 sw/blas/syrk/src/main.c             |   1 -
 sw/blas/syrk/src/syrk.h             |  95 +++++++++-------------
 10 files changed, 114 insertions(+), 161 deletions(-)

diff --git a/sw/apps/covariance/src/args.h b/sw/apps/covariance/src/args.h
index f88768dd5..cd15bc852 100644
--- a/sw/apps/covariance/src/args.h
+++ b/sw/apps/covariance/src/args.h
@@ -8,7 +8,8 @@
 #include <stdint.h>
 
 typedef void (*covariance_fp_t)(uint32_t m, uint32_t n, double inv_n,
-    double inv_n_m1, double *data, double *datat,double *cov);
+                                double inv_n_m1, double *data, double *datat,
+                                double *cov);
 
 typedef struct {
     uint32_t m;
diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h
index 53944e6ca..cdeb427bf 100644
--- a/sw/apps/covariance/src/covariance.h
+++ b/sw/apps/covariance/src/covariance.h
@@ -11,15 +11,13 @@
 
 #define DOUBLE_BUFFER 1
 
-void covariance_naive(uint32_t m, uint32_t n, double inv_n,
-                      double inv_n_m1, double *data, double *datat,
-                      double *cov) {
+void covariance_naive(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
+                      double *data, double *datat, double *cov) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
     // Center data
     for (uint32_t i = offset; i < m; i += stride) {
-
         // Calculate row mean
         double data_mean = 0.0;
         double datat_mean = 0.0;
@@ -44,15 +42,13 @@ void covariance_naive(uint32_t m, uint32_t n, double inv_n,
     syrk_naive(m, n, inv_n_m1, data, datat, 0, cov);
 }
 
-void covariance_baseline(uint32_t m, uint32_t n, double inv_n,
-                         double inv_n_m1, double *data, double *datat,
-                         double *cov) {
+void covariance_baseline(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
+                         double *data, double *datat, double *cov) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
     // Center data
     for (uint32_t i = offset; i < m; i += stride) {
-
         // Calculate row mean
         double data_mean = 0.0;
         double datat_mean = 0.0;
@@ -77,9 +73,8 @@ void covariance_baseline(uint32_t m, uint32_t n, double inv_n,
     syrk_baseline(m, n, inv_n_m1, data, datat, 0, cov);
 }
 
-void covariance_opt(uint32_t m, uint32_t n, double inv_n,
-                    double inv_n_m1, double *data, double *datat,
-                    double *cov) {
+void covariance_opt(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
+                    double *data, double *datat, double *cov) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
@@ -97,14 +92,14 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
     //                 ft0.push(data[i * n + j])
     //                 ft1.push(datat[i * n + j])
     const uint32_t ssr01_b[4] = {unroll0, n, 2, m / (stride * unroll0)};
-    const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double),
-                                 0, sizeof(double) * n * stride * unroll0};
-    snrt_ssr_loop_4d(SNRT_SSR_DM0,
-                     ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3],
-                     ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]);
-    snrt_ssr_loop_4d(SNRT_SSR_DM1,
-                     ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3],
-                     ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]);
+    const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double), 0,
+                                 sizeof(double) * n * stride * unroll0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM0, ssr01_b[0], ssr01_b[1], ssr01_b[2],
+                     ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2],
+                     ssr01_i[3]);
+    snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr01_b[0], ssr01_b[1], ssr01_b[2],
+                     ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2],
+                     ssr01_i[3]);
     snrt_ssr_repeat(SNRT_SSR_DM0, 1);
     // Configure ft2 to store data and datat elements
     // for (i1 = offset; i1 < m; i1 += stride * unroll0)
@@ -115,11 +110,9 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
     //             datat[i * n + j] = ft2.pop()
     const uint32_t ssr2_b[4] = {2, unroll0, n, m / (stride * unroll0)};
     const uint32_t ssr2_i[4] = {(uint32_t)datat - (uint32_t)data,
-                                sizeof(double) * n * stride,
-                                sizeof(double),
+                                sizeof(double) * n * stride, sizeof(double),
                                 sizeof(double) * n * stride * unroll0};
-    snrt_ssr_loop_4d(SNRT_SSR_DM2,
-                     ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3],
+    snrt_ssr_loop_4d(SNRT_SSR_DM2, ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3],
                      ssr2_i[0], ssr2_i[1], ssr2_i[2], ssr2_i[3]);
 
     // SSR start address need to be configured each time
@@ -130,21 +123,20 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
 
     // Center data
     for (uint32_t i = offset; i < m; i += stride * unroll0) {
-
         // Calculate row means
         double m[2 * unroll0];
-        m[0] = 0.0; // mean(data[i])
-        m[1] = 0.0; // mean(datat[i])
-        m[2] = 0.0; // mean(data[i + stride])
-        m[3] = 0.0; // mean(datat[i + stride])
+        m[0] = 0.0;  // mean(data[i])
+        m[1] = 0.0;  // mean(datat[i])
+        m[2] = 0.0;  // mean(data[i + stride])
+        m[3] = 0.0;  // mean(datat[i + stride])
         asm volatile(
             "frep.o %[n_frep], %[n_insn], 0, 0 \n"
             "fadd.d %[m0], ft0, %[m0] \n"
             "fadd.d %[m1], ft1, %[m1] \n"
             "fadd.d %[m2], ft0, %[m2] \n"
             "fadd.d %[m3], ft1, %[m3] \n"
-            : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]),
-              [ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3])
+            : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]),
+              [ m3 ] "+f"(m[3])
             : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0)
             : "ft0", "ft1", "ft2");
         m[0] *= inv_n;
@@ -161,8 +153,8 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
             "fsub.d ft2, ft1, %[m1] \n"
             "fsub.d ft2, ft0, %[m2] \n"
             "fsub.d ft2, ft1, %[m3] \n"
-            : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]),
-              [ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3])
+            : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]),
+              [ m3 ] "+f"(m[3])
             : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0)
             : "ft0", "ft1", "ft2");
     }
@@ -190,16 +182,16 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
     //                 ft0.push(a[i * n + k])
     //                 ft1.push(at[j * n + k])
     const uint32_t ssr0_b[4] = {unroll1, n, m / unroll1, m / stride};
-    const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)};
-    snrt_ssr_loop_3d(SNRT_SSR_DM0,
-        ssr0_b[1], ssr0_b[2], ssr0_b[3],
-        ssr0_i[1], ssr0_i[2], ssr0_i[3]);
+    const uint32_t ssr0_i[4] = {0, sizeof(double), 0,
+                                stride * n * sizeof(double)};
+    snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1],
+                     ssr0_i[2], ssr0_i[3]);
     snrt_ssr_repeat(SNRT_SSR_DM0, unroll1);
     const uint32_t ssr1_b[4] = {unroll1, n, m / unroll1, m / stride};
-    const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll1 * n * sizeof(double), 0};
-    snrt_ssr_loop_4d(SNRT_SSR_DM1,
-        ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3],
-        ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
+    const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double),
+                                unroll1 * n * sizeof(double), 0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3],
+                     ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
 
     // SSR start address need to be configured each time
     snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, data + offset * n);
@@ -208,7 +200,6 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
 
     for (uint32_t i = offset; i < m; i += stride) {
         for (uint32_t j = 0; j < m; j += unroll1) {
-
             double acc[unroll1];
             acc[0] = 0;
             acc[1] = 0;
@@ -227,8 +218,10 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
                 "fmul.d %[b3], %[acc3], %[alpha] \n"
                 : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
                   [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]),
-                  [ b0 ] "=f"(cov[i * m + j + 0]), [ b1 ] "=f"(cov[i * m + j + 1]),
-                  [ b2 ] "=f"(cov[i * m + j + 2]), [ b3 ] "=f"(cov[i * m + j + 3])
+                  [ b0 ] "=f"(cov[i * m + j + 0]),
+                  [ b1 ] "=f"(cov[i * m + j + 1]),
+                  [ b2 ] "=f"(cov[i * m + j + 2]),
+                  [ b3 ] "=f"(cov[i * m + j + 3])
                 : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1),
                   [ alpha ] "f"(inv_n_m1)
                 : "ft0", "ft1", "ft2");
@@ -241,8 +234,8 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n,
 
 void covariance_job(covariance_args_t *args) {
     uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes;
-    uint64_t local_a0_addr, local_at0_addr, local_b0_addr,
-             local_a1_addr, local_at1_addr, local_b1_addr;
+    uint64_t local_a0_addr, local_at0_addr, local_b0_addr, local_a1_addr,
+        local_at1_addr, local_b1_addr;
     double *local_a[2];
     double *local_at[2];
     double *local_b[2];
@@ -287,12 +280,13 @@ void covariance_job(covariance_args_t *args) {
 
     // Calculate number of iterations
     sb_iterations = args->m_tiles * args->m_tiles;
-    if (DOUBLE_BUFFER) iterations = sb_iterations + 2;
-    else iterations = sb_iterations;
+    if (DOUBLE_BUFFER)
+        iterations = sb_iterations + 2;
+    else
+        iterations = sb_iterations;
 
     // Iterate over all tiles
     for (i = 0; i < iterations; i++) {
-        
         if (snrt_is_dm_core()) {
             // DMA in
             if (!DOUBLE_BUFFER || (i < sb_iterations)) {
@@ -305,18 +299,10 @@ void covariance_job(covariance_args_t *args) {
                 i_col = i_dma_in % args->m_tiles;
 
                 // Copy job operands in TCDM
-                snrt_dma_load_1d_tile(
-                    local_a[buff_idx],
-                    args->data,
-                    i_row,
-                    a_tile_size,
-                    sizeof(double));
-                snrt_dma_load_1d_tile(
-                    local_at[buff_idx],
-                    args->data,
-                    i_col,
-                    a_tile_size,
-                    sizeof(double));
+                snrt_dma_load_1d_tile(local_a[buff_idx], args->data, i_row,
+                                      a_tile_size, sizeof(double));
+                snrt_dma_load_1d_tile(local_at[buff_idx], args->data, i_col,
+                                      a_tile_size, sizeof(double));
                 snrt_dma_wait_all();
 
                 snrt_mcycle();
@@ -343,15 +329,9 @@ void covariance_job(covariance_args_t *args) {
                 i_col = i_dma_out % args->m_tiles;
 
                 // Copy job outputs from TCDM
-                snrt_dma_store_2d_tile(
-                    args->cov,
-                    local_b[buff_idx],
-                    i_row,
-                    i_col,
-                    m_frac,
-                    m_frac,
-                    args->m,
-                    sizeof(double));
+                snrt_dma_store_2d_tile(args->cov, local_b[buff_idx], i_row,
+                                       i_col, m_frac, m_frac, args->m,
+                                       sizeof(double));
                 snrt_dma_wait_all();
 
                 snrt_mcycle();
diff --git a/sw/apps/covariance/src/main.c b/sw/apps/covariance/src/main.c
index 3c9d225a8..112ead333 100644
--- a/sw/apps/covariance/src/main.c
+++ b/sw/apps/covariance/src/main.c
@@ -10,7 +10,6 @@
 #include "data.h"
 
 int main() {
-
     covariance_job(&args);
 
     return 0;
diff --git a/sw/blas/axpy/src/args.h b/sw/blas/axpy/src/args.h
index 0efe3a2b4..c5d542852 100644
--- a/sw/blas/axpy/src/args.h
+++ b/sw/blas/axpy/src/args.h
@@ -5,14 +5,15 @@
 #pragma once
 #include <stdint.h>
 
-typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y, double* z);
+typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y,
+                          double* z);
 
 typedef struct {
     uint32_t n;
     double a;
-    double *x;
-    double *y;
-    double *z;
+    double* x;
+    double* y;
+    double* z;
     uint32_t n_tiles;
     axpy_fp_t funcptr;
 } axpy_args_t;
diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h
index c5df546ab..8ded48167 100644
--- a/sw/blas/axpy/src/axpy.h
+++ b/sw/blas/axpy/src/axpy.h
@@ -11,7 +11,8 @@
 #define TCDM_ALIGNMENT (32 * BANK_ALIGNMENT)
 #define ALIGN_UP_TCDM(addr) ALIGN_UP(addr, TCDM_ALIGNMENT)
 
-static inline void axpy_naive(uint32_t n, double a, double* x, double* y, double* z) {
+static inline void axpy_naive(uint32_t n, double a, double *x, double *y,
+                              double *z) {
     int core_idx = snrt_cluster_core_idx();
     int frac = n / snrt_cluster_compute_core_num();
     int offset = core_idx;
@@ -22,28 +23,27 @@ static inline void axpy_naive(uint32_t n, double a, double* x, double* y, double
     snrt_fpu_fence();
 }
 
-static inline void axpy_fma(uint32_t n, double a, double* x, double* y, double* z) {
+static inline void axpy_fma(uint32_t n, double a, double *x, double *y,
+                            double *z) {
     int core_idx = snrt_cluster_core_idx();
     int frac = n / snrt_cluster_compute_core_num();
     int offset = core_idx;
 
     for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) {
-        asm volatile (
-            "fmadd.d %[z], %[a], %[x], %[y] \n"
-            : [ z ]"=f"(z[i])
-            : [ a ]"f"(a), [ x ]"f"(x[i]), [ y ]"f"(y[i])
-        );
+        asm volatile("fmadd.d %[z], %[a], %[x], %[y] \n"
+                     : [ z ] "=f"(z[i])
+                     : [ a ] "f"(a), [ x ] "f"(x[i]), [ y ] "f"(y[i]));
     }
     snrt_fpu_fence();
 }
 
-static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double* z) {
+static inline void axpy_opt(uint32_t n, double a, double *x, double *y,
+                            double *z) {
     int core_idx = snrt_cluster_core_idx();
     int frac = n / snrt_cluster_compute_core_num();
     int offset = core_idx;
 
-    snrt_ssr_loop_1d(SNRT_SSR_DM_ALL,
-                     frac,
+    snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, frac,
                      snrt_cluster_compute_core_num() * sizeof(double));
 
     snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x + offset);
@@ -57,24 +57,22 @@ static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double*
         "fmadd.d ft2, %[a], ft0, ft1\n"
         :
         : [ n_frep ] "r"(frac - 1), [ a ] "f"(a)
-        : "ft0", "ft1", "ft2", "memory"
-    );
-    
+        : "ft0", "ft1", "ft2", "memory");
+
     snrt_fpu_fence();
     snrt_ssr_disable();
 }
 
 static inline void axpy_job(axpy_args_t *args) {
     uint32_t frac, offset, size;
-    uint64_t local_x0_addr, local_y0_addr, local_z0_addr,
-             local_x1_addr, local_y1_addr, local_z1_addr;
+    uint64_t local_x0_addr, local_y0_addr, local_z0_addr, local_x1_addr,
+        local_y1_addr, local_z1_addr;
     double *local_x[2];
     double *local_y[2];
     double *local_z[2];
     double *remote_x, *remote_y, *remote_z;
     uint32_t iterations, i, i_dma_in, i_compute, i_dma_out, buff_idx;
 
-
 #ifndef JOB_ARGS_PRELOADED
     // Allocate space for job arguments in TCDM
     axpy_args_t *local_args = (axpy_args_t *)snrt_l1_next();
@@ -102,8 +100,10 @@ static inline void axpy_job(axpy_args_t *args) {
     local_z[0] = (double *)local_z0_addr;
     if (DOUBLE_BUFFER) {
         local_x1_addr = ALIGN_UP_TCDM(local_z0_addr + size);
-        local_y1_addr = ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT;
-        local_z1_addr = ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT;
+        local_y1_addr =
+            ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT;
+        local_z1_addr =
+            ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT;
         local_x[1] = (double *)local_x1_addr;
         local_y[1] = (double *)local_y1_addr;
         local_z[1] = (double *)local_z1_addr;
@@ -115,7 +115,6 @@ static inline void axpy_job(axpy_args_t *args) {
 
     // Iterate over all tiles
     for (i = 0; i < iterations; i++) {
-
         if (snrt_is_dm_core()) {
             // DMA in
             if (!DOUBLE_BUFFER || (i < args->n_tiles)) {
@@ -176,7 +175,8 @@ static inline void axpy_job(axpy_args_t *args) {
 
                 // Perform tile computation
                 axpy_fp_t fp = args->funcptr;
-                fp(frac, args->a, local_x[buff_idx], local_y[buff_idx], local_z[buff_idx]);
+                fp(frac, args->a, local_x[buff_idx], local_y[buff_idx],
+                   local_z[buff_idx]);
 
                 snrt_mcycle();
             }
diff --git a/sw/blas/axpy/src/main.c b/sw/blas/axpy/src/main.c
index 83cb58ae8..e0389d25d 100644
--- a/sw/blas/axpy/src/main.c
+++ b/sw/blas/axpy/src/main.c
@@ -8,7 +8,6 @@
 #include "data.h"
 
 int main() {
-
     axpy_job(&args);
 
 // TODO: currently only works for single cluster otherwise need to
diff --git a/sw/blas/syrk/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py
index 05cd2f038..9b4959fca 100755
--- a/sw/blas/syrk/scripts/datagen.py
+++ b/sw/blas/syrk/scripts/datagen.py
@@ -8,12 +8,12 @@
 import numpy as np
 
 from snitch.util.sim import data_utils
-from snitch.util.sim.data_utils import format_array_definition, format_array_declaration, \
-    format_struct_definition, DataGen
+from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen
 
 
 DOUBLE_BUFFER = True
 
+
 class SyrkDataGen(DataGen):
 
     # Function pointers to alternative implementations
@@ -55,7 +55,6 @@ def emit_header(self, **kwargs):
 
         A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100
         C_in = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['m']))/100
-        C_out = self.golden_model(alpha, A, beta, C_in)
 
         A = A.flatten()
         C_in = C_in.flatten()
diff --git a/sw/blas/syrk/src/args.h b/sw/blas/syrk/src/args.h
index 6bb58e00e..24342d3e3 100644
--- a/sw/blas/syrk/src/args.h
+++ b/sw/blas/syrk/src/args.h
@@ -8,7 +8,7 @@
 #include <stdint.h>
 
 typedef void (*syrk_fp_t)(uint32_t m, uint32_t n, double alpha, double *a,
-    double *at, double beta, double *b);
+                          double *at, double beta, double *b);
 
 typedef struct {
     uint32_t m;
diff --git a/sw/blas/syrk/src/main.c b/sw/blas/syrk/src/main.c
index 9f1ad7163..f8c09ae4f 100644
--- a/sw/blas/syrk/src/main.c
+++ b/sw/blas/syrk/src/main.c
@@ -10,7 +10,6 @@
 #include "data.h"
 
 int main() {
-
     syrk_job(&args);
 
     return 0;
diff --git a/sw/blas/syrk/src/syrk.h b/sw/blas/syrk/src/syrk.h
index 9494f2777..718ad7fe9 100644
--- a/sw/blas/syrk/src/syrk.h
+++ b/sw/blas/syrk/src/syrk.h
@@ -39,7 +39,6 @@ void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at,
 
     for (uint32_t i = offset; i < m; i += stride) {
         for (uint32_t j = 0; j < m; j += unroll1) {
-
             double acc[4];
             acc[0] = 0;
             acc[1] = 0;
@@ -66,28 +65,26 @@ void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at,
                     "fmadd.d %[acc3], %[a3], %[at15], %[acc3] \n"
                     : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
                       [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3])
-                    : [ a0 ] "f"(a[i * n + k + 0]),
-                      [ a1 ] "f"(a[i * n + k + 1]),
-                      [ a2 ] "f"(a[i * n + k + 2]),
-                      [ a3 ] "f"(a[i * n + k + 3]),
-                      [ at0 ] "f"(at[(j + 0) * n + k]),
-                      [ at1 ] "f"(at[(j + 1) * n + k]),
-                      [ at2 ] "f"(at[(j + 2) * n + k]),
-                      [ at3 ] "f"(at[(j + 3) * n + k]),
-                      [ at4 ] "f"(at[(j + 0) * n + k + 1]),
-                      [ at5 ] "f"(at[(j + 1) * n + k + 1]),
-                      [ at6 ] "f"(at[(j + 2) * n + k + 1]),
-                      [ at7 ] "f"(at[(j + 3) * n + k + 1]),
-                      [ at8 ] "f"(at[(j + 0) * n + k + 2]),
-                      [ at9 ] "f"(at[(j + 1) * n + k + 2]),
-                      [ at10 ] "f"(at[(j + 2) * n + k + 2]),
-                      [ at11 ] "f"(at[(j + 3) * n + k + 2]),
-                      [ at12 ] "f"(at[(j + 0) * n + k + 3]),
-                      [ at13 ] "f"(at[(j + 1) * n + k + 3]),
-                      [ at14 ] "f"(at[(j + 2) * n + k + 3]),
-                      [ at15 ] "f"(at[(j + 3) * n + k + 3])
                     :
-                );
+                    [ a0 ] "f"(a[i * n + k + 0]), [ a1 ] "f"(a[i * n + k + 1]),
+                    [ a2 ] "f"(a[i * n + k + 2]), [ a3 ] "f"(a[i * n + k + 3]),
+                    [ at0 ] "f"(at[(j + 0) * n + k]),
+                    [ at1 ] "f"(at[(j + 1) * n + k]),
+                    [ at2 ] "f"(at[(j + 2) * n + k]),
+                    [ at3 ] "f"(at[(j + 3) * n + k]),
+                    [ at4 ] "f"(at[(j + 0) * n + k + 1]),
+                    [ at5 ] "f"(at[(j + 1) * n + k + 1]),
+                    [ at6 ] "f"(at[(j + 2) * n + k + 1]),
+                    [ at7 ] "f"(at[(j + 3) * n + k + 1]),
+                    [ at8 ] "f"(at[(j + 0) * n + k + 2]),
+                    [ at9 ] "f"(at[(j + 1) * n + k + 2]),
+                    [ at10 ] "f"(at[(j + 2) * n + k + 2]),
+                    [ at11 ] "f"(at[(j + 3) * n + k + 2]),
+                    [ at12 ] "f"(at[(j + 0) * n + k + 3]),
+                    [ at13 ] "f"(at[(j + 1) * n + k + 3]),
+                    [ at14 ] "f"(at[(j + 2) * n + k + 3]),
+                    [ at15 ] "f"(at[(j + 3) * n + k + 3])
+                    :);
             }
 
             c[i * m + j + 0] = multiply_opt(c[i * m + j + 0], beta);
@@ -122,15 +119,16 @@ void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at,
         //                 ft0.push(a[i * n + k])
         //                 ft1.push(at[j * n + k])
         const uint32_t ssr0_b[4] = {unroll, n, m / unroll, m / stride};
-        const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)};
+        const uint32_t ssr0_i[4] = {0, sizeof(double), 0,
+                                    stride * n * sizeof(double)};
         snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3],
                          ssr0_i[1], ssr0_i[2], ssr0_i[3]);
         snrt_ssr_repeat(SNRT_SSR_DM0, unroll);
         const uint32_t ssr1_b[4] = {unroll, n, m / unroll, m / stride};
-        const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll * n * sizeof(double), 0};
+        const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double),
+                                    unroll * n * sizeof(double), 0};
         snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2],
-                         ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2],
-                         ssr1_i[3]);
+                         ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
         setup_ssr = 0;
     }
 
@@ -141,7 +139,6 @@ void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at,
 
     for (uint32_t i = offset; i < m; i += stride) {
         for (uint32_t j = 0; j < m; j += unroll) {
-
             double acc[unroll];
             acc[0] = 0;
             acc[1] = 0;
@@ -178,8 +175,8 @@ void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at,
 
 void syrk_job(syrk_args_t *args) {
     uint32_t m_frac, a_tile_size, a_tile_bytes, c_tile_size, c_tile_bytes;
-    uint64_t local_a0_addr, local_at0_addr, local_c0_addr,
-             local_a1_addr, local_at1_addr, local_c1_addr;
+    uint64_t local_a0_addr, local_at0_addr, local_c0_addr, local_a1_addr,
+        local_at1_addr, local_c1_addr;
     double *local_a[2];
     double *local_at[2];
     double *local_c[2];
@@ -227,7 +224,6 @@ void syrk_job(syrk_args_t *args) {
 
     // Iterate over all tiles
     for (i = 0; i < iterations; i++) {
-        
         if (snrt_is_dm_core()) {
             // DMA out
             // (out before in to avoid overwriting data)
@@ -241,15 +237,8 @@ void syrk_job(syrk_args_t *args) {
                 i_col = i_dma_out % args->m_tiles;
 
                 // Copy job outputs from TCDM
-                snrt_dma_store_2d_tile(
-                    args->c,
-                    local_c[buff_idx],
-                    i_row,
-                    i_col,
-                    m_frac,
-                    m_frac,
-                    args->m,
-                    sizeof(double));
+                snrt_dma_store_2d_tile(args->c, local_c[buff_idx], i_row, i_col,
+                                       m_frac, m_frac, args->m, sizeof(double));
                 snrt_dma_wait_all();
 
                 snrt_mcycle();
@@ -266,28 +255,14 @@ void syrk_job(syrk_args_t *args) {
                 i_col = i_dma_in % args->m_tiles;
 
                 // Copy job operands in TCDM
-                snrt_dma_load_1d_tile(
-                    local_a[buff_idx],
-                    args->a,
-                    i_row,
-                    a_tile_size,
-                    sizeof(double));
-                snrt_dma_load_1d_tile(
-                    local_at[buff_idx],
-                    args->a,
-                    i_col,
-                    a_tile_size,
-                    sizeof(double));
+                snrt_dma_load_1d_tile(local_a[buff_idx], args->a, i_row,
+                                      a_tile_size, sizeof(double));
+                snrt_dma_load_1d_tile(local_at[buff_idx], args->a, i_col,
+                                      a_tile_size, sizeof(double));
                 if (args->funcptr == syrk_opt || args->beta != 0) {
-                    snrt_dma_load_2d_tile(
-                        local_c[buff_idx],
-                        args->c,
-                        i_row,
-                        i_col,
-                        m_frac,
-                        m_frac,
-                        args->m,
-                        sizeof(double));
+                    snrt_dma_load_2d_tile(local_c[buff_idx], args->c, i_row,
+                                          i_col, m_frac, m_frac, args->m,
+                                          sizeof(double));
                 }
                 snrt_dma_wait_all();
 

From a8cafa96a6b4c8be52b441be48142711b6967cb4 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Tue, 20 Aug 2024 15:31:27 +0200
Subject: [PATCH 14/19] sw: Add doitgen kernel

---
 sw/apps/doitgen/.gitignore                   |   1 +
 sw/apps/doitgen/data/params.json             |  12 +
 sw/apps/doitgen/scripts/datagen.py           |  90 ++++++
 sw/apps/doitgen/scripts/verify.py            |  48 +++
 sw/apps/doitgen/src/args.h                   |  22 ++
 sw/apps/doitgen/src/doitgen.h                | 303 +++++++++++++++++++
 sw/apps/doitgen/src/main.c                   |  17 ++
 target/snitch_cluster/sw.mk                  |   1 +
 target/snitch_cluster/sw/apps/doitgen/app.mk |  14 +
 target/snitch_cluster/sw/run.yaml            |   2 +
 10 files changed, 510 insertions(+)
 create mode 100644 sw/apps/doitgen/.gitignore
 create mode 100644 sw/apps/doitgen/data/params.json
 create mode 100755 sw/apps/doitgen/scripts/datagen.py
 create mode 100755 sw/apps/doitgen/scripts/verify.py
 create mode 100644 sw/apps/doitgen/src/args.h
 create mode 100644 sw/apps/doitgen/src/doitgen.h
 create mode 100644 sw/apps/doitgen/src/main.c
 create mode 100644 target/snitch_cluster/sw/apps/doitgen/app.mk

diff --git a/sw/apps/doitgen/.gitignore b/sw/apps/doitgen/.gitignore
new file mode 100644
index 000000000..8485f615e
--- /dev/null
+++ b/sw/apps/doitgen/.gitignore
@@ -0,0 +1 @@
+data/data.h
\ No newline at end of file
diff --git a/sw/apps/doitgen/data/params.json b/sw/apps/doitgen/data/params.json
new file mode 100644
index 000000000..4417f0c35
--- /dev/null
+++ b/sw/apps/doitgen/data/params.json
@@ -0,0 +1,12 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    "r": 16,
+    "q": 16,
+    "s": 32,
+    "r_tiles": 2,
+    "q_tiles": 2,
+    "funcptr": "doitgen_baseline"
+}
diff --git a/sw/apps/doitgen/scripts/datagen.py b/sw/apps/doitgen/scripts/datagen.py
new file mode 100755
index 000000000..d0dddf6f5
--- /dev/null
+++ b/sw/apps/doitgen/scripts/datagen.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import numpy as np
+
+from snitch.util.sim import data_utils
+from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen
+
+np.random.seed(42)
+
+DOUBLE_BUFFER = True
+
+
+class DoitgenDataGen(DataGen):
+
+    # Function pointers to alternative implementations
+    FUNCPTRS = ["doitgen_naive", "doitgen_baseline", "doitgen_opt"]
+
+    def golden_model(self, A, x):
+        R, Q, S = A.shape
+        P, _ = x.shape
+        Aout = np.ndarray((R, Q, P))
+        for r in range(R):
+            for q in range(Q):
+                for p in range(P):
+                    Aout[r, q, p] = 0
+                    for s in range(S):
+                        Aout[r, q, p] += A[r, q, s] * x[p, s]
+        return Aout
+
+    def validate(self, **kwargs):
+        n_cores = 8
+        assert (kwargs['r'] % kwargs['r_tiles']) == 0, "r must be an integer multiple of r_tiles"
+        assert (kwargs['q'] % kwargs['q_tiles']) == 0, "q must be an integer multiple of q_tiles"
+        if kwargs['funcptr'] != 'doitgen_naive':
+            assert (kwargs['s'] % 4) == 0, "s must be an integer multiple of unrolling factor"
+        r_per_tile = kwargs['r'] / kwargs['r_tiles']
+        q_per_tile = kwargs['q'] / kwargs['q_tiles']
+        assert (r_per_tile % n_cores) == 0, "r_per_tile must be an integer multiple of n_cores"
+        assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
+
+        # Calculate total TCDM occupation
+        a_tile_size = r_per_tile * q_per_tile * kwargs['s'] * 8
+        x_size = kwargs['s'] * kwargs['s'] * 8
+        total_size = 2 * a_tile_size + x_size
+        if DOUBLE_BUFFER:
+            total_size *= 2
+        data_utils.validate_tcdm_footprint(total_size)
+
+    def emit_header(self, **kwargs):
+        header = [super().emit_header()]
+
+        self.validate(**kwargs)
+
+        A = np.random.randint(-100, 100, size=(kwargs['r'], kwargs['q'], kwargs['s']))
+        x = np.random.randint(-100, 100, size=(kwargs['s'], kwargs['s']))
+
+        _ = self.golden_model(A, x)
+
+        A = A.flatten()
+        x = x.flatten()
+
+        A_uid = 'A'
+        x_uid = 'x'
+
+        cfg = {
+            'r': kwargs['r'],
+            'q': kwargs['q'],
+            's': kwargs['s'],
+            'A': A_uid,
+            'x': x_uid,
+            'r_tiles': kwargs['r_tiles'],
+            'q_tiles': kwargs['q_tiles'],
+            'funcptr': kwargs['funcptr']
+        }
+
+        header += [format_array_definition('double', A_uid, A)]
+        header += [format_array_definition('double', x_uid, x)]
+        header += [format_struct_definition('doitgen_args_t', 'args', cfg)]
+        header = '\n\n'.join(header)
+
+        return header
+
+
+if __name__ == '__main__':
+    DoitgenDataGen().main()
diff --git a/sw/apps/doitgen/scripts/verify.py b/sw/apps/doitgen/scripts/verify.py
new file mode 100755
index 000000000..8f72b0415
--- /dev/null
+++ b/sw/apps/doitgen/scripts/verify.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import numpy as np
+import sys
+from datagen import DoitgenDataGen
+
+from snitch.util.sim.verif_utils import Verifier
+
+
+class DoitgenVerifier(Verifier):
+
+    OUTPUT_UIDS = ['A']
+
+    def __init__(self):
+        super().__init__()
+        self.func_args = {
+            'r': 'I',
+            'q': 'I',
+            's': 'I',
+            'A': 'I',
+            'x': 'I',
+            'r_tiles': 'I',
+            'q_tiles': 'I',
+            'funcptr': 'I'
+        }
+        self.func_args = self.get_input_from_symbol('args', self.func_args)
+
+    def get_actual_results(self):
+        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double')
+
+    def get_expected_results(self):
+        A = self.get_input_from_symbol('A', 'double')
+        A = np.reshape(A, (self.func_args['r'], self.func_args['q'], self.func_args['s']))
+        x = self.get_input_from_symbol('x', 'double')
+        x = np.reshape(x, (self.func_args['s'], self.func_args['s']))
+        return DoitgenDataGen().golden_model(A, x).flatten()
+
+    def check_results(self, *args):
+        return super().check_results(*args, rtol=1e-10)
+
+
+if __name__ == "__main__":
+    sys.exit(DoitgenVerifier().main())
diff --git a/sw/apps/doitgen/src/args.h b/sw/apps/doitgen/src/args.h
new file mode 100644
index 000000000..5d3f56ce4
--- /dev/null
+++ b/sw/apps/doitgen/src/args.h
@@ -0,0 +1,22 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#pragma once
+#include <stdint.h>
+
+typedef void (*doitgen_fp_t)(uint32_t r, uint32_t q, uint32_t s, double *A,
+                             double *x, double *Aout);
+
+typedef struct {
+    uint32_t r;
+    uint32_t q;
+    uint32_t s;
+    double *A;
+    double *x;
+    uint32_t r_tiles;
+    uint32_t q_tiles;
+    doitgen_fp_t funcptr;
+} doitgen_args_t;
diff --git a/sw/apps/doitgen/src/doitgen.h b/sw/apps/doitgen/src/doitgen.h
new file mode 100644
index 000000000..2f7bc6128
--- /dev/null
+++ b/sw/apps/doitgen/src/doitgen.h
@@ -0,0 +1,303 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#include "args.h"
+#include "snrt.h"
+
+#define DOUBLE_BUFFER 1
+
+__thread int setup_ssr = 1;
+
+void doitgen_naive(uint32_t r, uint32_t q, uint32_t s, double *A, double *x,
+                   double *Aout) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    for (uint32_t i = offset; i < r; i += stride) {
+        for (uint32_t j = 0; j < q; j++) {
+            for (uint32_t k = 0; k < s; k++) {
+                Aout[i * q * s + j * s + k] = 0.0;
+                for (uint32_t l = 0; l < s; l++) {
+                    Aout[i * q * s + j * s + k] +=
+                        A[i * q * s + j * s + l] * x[k * s + l];
+                }
+            }
+        }
+    }
+
+    snrt_fpu_fence();
+}
+
+void doitgen_baseline(uint32_t r, uint32_t q, uint32_t s, double *A, double *x,
+                      double *Aout) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Unrolling factors
+    // Note: changes must be reflected in the inline assembly code
+    //       and datagen script
+    const uint32_t unroll1 = 4;
+    const uint32_t unroll0 = 4;
+
+    for (uint32_t i = offset; i < r; i += stride) {
+        for (uint32_t j = 0; j < q; j++) {
+            for (uint32_t k = 0; k < s; k += unroll1) {
+                double acc[4];
+                acc[0] = 0;
+                acc[1] = 0;
+                acc[2] = 0;
+                acc[3] = 0;
+
+                for (uint32_t l = 0; l < s; l += unroll0) {
+                    asm volatile(
+                        "fmadd.d %[acc0], %[a0], %[x0], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a0], %[x1], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a0], %[x2], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a0], %[x3], %[acc3] \n"
+                        "fmadd.d %[acc0], %[a1], %[x4], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a1], %[x5], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a1], %[x6], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a1], %[x7], %[acc3] \n"
+                        "fmadd.d %[acc0], %[a2], %[x8], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a2], %[x9], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a2], %[x10], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a2], %[x11], %[acc3] \n"
+                        "fmadd.d %[acc0], %[a3], %[x12], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a3], %[x13], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a3], %[x14], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a3], %[x15], %[acc3] \n"
+                        : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                          [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3])
+                        : [ a0 ] "f"(A[i * q * s + j * s + l + 0]),
+                          [ a1 ] "f"(A[i * q * s + j * s + l + 1]),
+                          [ a2 ] "f"(A[i * q * s + j * s + l + 2]),
+                          [ a3 ] "f"(A[i * q * s + j * s + l + 3]),
+                          [ x0 ] "f"(x[(k + 0) * s + l + 0]),
+                          [ x1 ] "f"(x[(k + 1) * s + l + 0]),
+                          [ x2 ] "f"(x[(k + 2) * s + l + 0]),
+                          [ x3 ] "f"(x[(k + 3) * s + l + 0]),
+                          [ x4 ] "f"(x[(k + 0) * s + l + 1]),
+                          [ x5 ] "f"(x[(k + 1) * s + l + 1]),
+                          [ x6 ] "f"(x[(k + 2) * s + l + 1]),
+                          [ x7 ] "f"(x[(k + 3) * s + l + 1]),
+                          [ x8 ] "f"(x[(k + 0) * s + l + 2]),
+                          [ x9 ] "f"(x[(k + 1) * s + l + 2]),
+                          [ x10 ] "f"(x[(k + 2) * s + l + 2]),
+                          [ x11 ] "f"(x[(k + 3) * s + l + 2]),
+                          [ x12 ] "f"(x[(k + 0) * s + l + 3]),
+                          [ x13 ] "f"(x[(k + 1) * s + l + 3]),
+                          [ x14 ] "f"(x[(k + 2) * s + l + 3]),
+                          [ x15 ] "f"(x[(k + 3) * s + l + 3])
+                        :);
+                }
+
+                Aout[i * q * s + j * s + k + 0] = acc[0];
+                Aout[i * q * s + j * s + k + 1] = acc[1];
+                Aout[i * q * s + j * s + k + 2] = acc[2];
+                Aout[i * q * s + j * s + k + 3] = acc[3];
+            }
+        }
+    }
+
+    snrt_fpu_fence();
+}
+
+void doitgen_opt(uint32_t r, uint32_t q, uint32_t s, double *A, double *x,
+                 double *Aout) {
+    uint32_t bound = r / snrt_cluster_compute_core_num();
+    uint32_t offset = bound * snrt_cluster_core_idx();
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    //       and datagen script
+    const uint32_t unroll = 4;
+
+    if (setup_ssr) {
+        // Configure ft0 and ft1 to load A and x
+        // for (i = offset; i < offset + bound; i++)
+        //     for (j = 0; j < q; j++)
+        //         for (k1 = 0; k1 < s; k1 += unroll)
+        //             for (l = 0; l < s; l++)
+        //                 for (k0 = 0; k0 < unroll; k0++)
+        //                     k = k1 + k0
+        //                     ft0.push(A[i * q * s + j * s + l])
+        //                     ft1.push(x[k * s + l])
+        const uint32_t ssr0_b[4] = {unroll, s, s / unroll, q * bound};
+        const uint32_t ssr0_i[4] = {0, sizeof(double), 0, s * sizeof(double)};
+        snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3],
+                         ssr0_i[1], ssr0_i[2], ssr0_i[3]);
+        snrt_ssr_repeat(SNRT_SSR_DM0, unroll);
+        const uint32_t ssr1_b[4] = {unroll, s, s / unroll, q * bound};
+        const uint32_t ssr1_i[4] = {s * sizeof(double), sizeof(double),
+                                    unroll * s * sizeof(double), 0};
+        snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2],
+                         ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
+        setup_ssr = 0;
+    }
+
+    // SSR start addresses need to be configured each time
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, A + offset * q * s);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, x);
+    snrt_ssr_enable();
+
+    for (uint32_t i = offset; i < (offset + bound); i++) {
+        for (uint32_t j = 0; j < q; j++) {
+            for (uint32_t k = 0; k < s; k += unroll) {
+                double acc[unroll];
+                acc[0] = 0;
+                acc[1] = 0;
+                acc[2] = 0;
+                acc[3] = 0;
+
+                asm volatile(
+                    "frep.o %[n_frep], %[unroll], 0, 0 \n"
+                    "fmadd.d %[acc0], ft0, ft1, %[acc0] \n"
+                    "fmadd.d %[acc1], ft0, ft1, %[acc1] \n"
+                    "fmadd.d %[acc2], ft0, ft1, %[acc2] \n"
+                    "fmadd.d %[acc3], ft0, ft1, %[acc3] \n"
+                    : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                      [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3])
+                    : [ n_frep ] "r"(s - 1), [ unroll ] "i"(unroll)
+                    : "ft0", "ft1", "ft2");
+
+                Aout[i * q * s + j * s + k + 0] = acc[0];
+                Aout[i * q * s + j * s + k + 1] = acc[1];
+                Aout[i * q * s + j * s + k + 2] = acc[2];
+                Aout[i * q * s + j * s + k + 3] = acc[3];
+            }
+        }
+    }
+
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+}
+
+void doitgen_job(doitgen_args_t *args) {
+    uint32_t r_frac, q_frac, a_tile_size, a_tile_bytes, x_size, x_bytes;
+    uint64_t local_a0_addr, local_aout0_addr, local_x0_addr, local_a1_addr,
+        local_aout1_addr;
+    double *local_a[2];
+    double *local_aout[2];
+    double *local_x;
+    uint32_t iterations, sb_iterations;
+    uint32_t i, i_dma_in, i_compute, i_dma_out, i_r, i_q, buff_idx;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    doitgen_args_t *local_args = (doitgen_args_t *)snrt_l1_next();
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(doitgen_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+    args = local_args;
+#endif
+
+    // Calculate size of each tile
+    r_frac = args->r / args->r_tiles;
+    q_frac = args->q / args->q_tiles;
+    a_tile_size = r_frac * q_frac * args->s;
+    x_size = args->s * args->s;
+    a_tile_bytes = a_tile_size * sizeof(double);
+    x_bytes = x_size * sizeof(double);
+
+    // Allocate space for job operands in TCDM
+    local_x0_addr = (uint64_t)args + sizeof(doitgen_args_t);
+    local_a0_addr = local_x0_addr + x_bytes;
+    local_aout0_addr = local_a0_addr + a_tile_bytes;
+    local_x = (double *)local_x0_addr;
+    local_a[0] = (double *)local_a0_addr;
+    local_aout[0] = (double *)local_aout0_addr;
+    if (DOUBLE_BUFFER) {
+        local_a1_addr = local_aout0_addr + a_tile_bytes;
+        local_aout1_addr = local_a1_addr + a_tile_bytes;
+        local_a[1] = (double *)local_a1_addr;
+        local_aout[1] = (double *)local_aout1_addr;
+    }
+
+    // Calculate number of iterations
+    sb_iterations = args->r_tiles * args->q_tiles;
+    if (DOUBLE_BUFFER)
+        iterations = sb_iterations + 2;
+    else
+        iterations = sb_iterations;
+
+    // Iterate over all tiles
+    for (i = 0; i < iterations; i++) {
+        if (snrt_is_dm_core()) {
+            // DMA in
+            if (!DOUBLE_BUFFER || (i < sb_iterations)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_in = i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+                i_r = i_dma_in / args->q_tiles;
+                i_q = i_dma_in % args->q_tiles;
+
+                // Copy job operands in TCDM
+                snrt_dma_load_2d_tile(local_a[buff_idx], args->A, i_r, i_q,
+                                      r_frac, q_frac * args->s,
+                                      args->q * args->s, sizeof(double));
+                if (i_dma_in == 0) snrt_dma_start_1d(local_x, args->x, x_bytes);
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+
+            // Additional barriers required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // DMA out
+            if (!DOUBLE_BUFFER || (i > 1)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
+                i_r = i_dma_out / args->q_tiles;
+                i_q = i_dma_out % args->q_tiles;
+
+                // Copy job outputs from TCDM
+                snrt_dma_store_2d_tile(args->A, local_aout[buff_idx], i_r, i_q,
+                                       r_frac, q_frac * args->s,
+                                       args->q * args->s, sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+        }
+
+        // Compute
+        if (snrt_is_compute_core()) {
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_compute = DOUBLE_BUFFER ? i - 1 : i;
+                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+
+                // Perform tile computation
+                doitgen_fp_t fp = args->funcptr;
+                fp(r_frac, q_frac, args->s, local_a[buff_idx], local_x,
+                   local_aout[buff_idx]);
+
+                snrt_mcycle();
+            }
+
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+        }
+        // Synchronize cores after every iteration
+        snrt_cluster_hw_barrier();
+    }
+}
diff --git a/sw/apps/doitgen/src/main.c b/sw/apps/doitgen/src/main.c
new file mode 100644
index 000000000..64c9571f8
--- /dev/null
+++ b/sw/apps/doitgen/src/main.c
@@ -0,0 +1,17 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#include "snrt.h"
+
+#include "doitgen.h"
+
+#include "data.h"
+
+int main() {
+    doitgen_job(&args);
+
+    return 0;
+}
diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk
index 674ea2cad..e4456fdfc 100644
--- a/target/snitch_cluster/sw.mk
+++ b/target/snitch_cluster/sw.mk
@@ -67,6 +67,7 @@ APPS += sw/apps/montecarlo/pi_estimation
 APPS += sw/apps/atax
 APPS += sw/apps/correlation
 APPS += sw/apps/covariance
+APPS += sw/apps/doitgen
 
 # Include Makefile from each app subdirectory
 $(foreach app,$(APPS), \
diff --git a/target/snitch_cluster/sw/apps/doitgen/app.mk b/target/snitch_cluster/sw/apps/doitgen/app.mk
new file mode 100644
index 000000000..ebef550d3
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/doitgen/app.mk
@@ -0,0 +1,14 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+APP              := doitgen
+$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build
+SRC_DIR          := $(ROOT)/sw/apps/$(APP)/src
+SRCS             := $(SRC_DIR)/main.c
+$(APP)_INCDIRS   := $(ROOT)/sw/blas/
+
+include $(ROOT)/sw/apps/common.mk
+include $(ROOT)/target/snitch_cluster/sw/apps/common.mk
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index ab302f7c3..d9e2f8c2f 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -99,3 +99,5 @@ runs:
   #   cmd: [../../../sw/apps/atax/scripts/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/covariance/build/covariance.elf
     cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"]
+  - elf: apps/doitgen/build/doitgen.elf
+    cmd: [../../../sw/apps/doitgen/scripts/verify.py, "${sim_bin}", "${elf}"]

From 77f8792cc5fcff8198affbd57f45c3cd97b31195 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Thu, 22 Aug 2024 11:32:58 +0200
Subject: [PATCH 15/19] sw: Remove spurious files after #171

---
 sw/apps/atax/.gitignore        |  1 -
 sw/apps/correlation/.gitignore |  1 -
 sw/apps/covariance/.gitignore  |  1 -
 sw/apps/doitgen/.gitignore     |  1 -
 sw/blas/.gitignore             |  1 -
 sw/blas/dot/Makefile           | 31 -------------------------------
 sw/blas/syrk/.gitignore        |  1 -
 sw/dnn/.gitignore              |  1 -
 8 files changed, 38 deletions(-)
 delete mode 100644 sw/apps/atax/.gitignore
 delete mode 100644 sw/apps/correlation/.gitignore
 delete mode 100644 sw/apps/covariance/.gitignore
 delete mode 100644 sw/apps/doitgen/.gitignore
 delete mode 100644 sw/blas/.gitignore
 delete mode 100644 sw/blas/dot/Makefile
 delete mode 100644 sw/blas/syrk/.gitignore
 delete mode 100644 sw/dnn/.gitignore

diff --git a/sw/apps/atax/.gitignore b/sw/apps/atax/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/apps/atax/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/apps/correlation/.gitignore b/sw/apps/correlation/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/apps/correlation/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/apps/covariance/.gitignore b/sw/apps/covariance/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/apps/covariance/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/apps/doitgen/.gitignore b/sw/apps/doitgen/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/apps/doitgen/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/blas/.gitignore b/sw/blas/.gitignore
deleted file mode 100644
index 2ff975f29..000000000
--- a/sw/blas/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-**/data/data.h
\ No newline at end of file
diff --git a/sw/blas/dot/Makefile b/sw/blas/dot/Makefile
deleted file mode 100644
index 077b84e5a..000000000
--- a/sw/blas/dot/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2024 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-
-# Usage of absolute paths is required to externally include this Makefile
-MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
-DATA_DIR := $(realpath $(MK_DIR)/data)
-SRC_DIR  := $(realpath $(MK_DIR)/src)
-
-DATA_CFG ?= $(DATA_DIR)/params.json
-SECTION  ?=
-
-APP     ?= dot
-SRCS    ?= $(realpath $(SRC_DIR)/main.c)
-INCDIRS ?= $(dir $(DATA_H)) $(SRC_DIR)
-
-DATAGEN_PY = $(MK_DIR)/scripts/datagen.py
-DATA_H    ?= $(DATA_DIR)/data.h
-
-$(dir $(DATA_H)):
-	mkdir -p $@
-
-$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) | $(dir $(DATA_H))
-	$< -c $(DATA_CFG) --section="$(SECTION)" $@
-
-.PHONY: clean-data clean
-
-clean-data:
-	rm -f $(DATA_H)
-
-clean: clean-data
diff --git a/sw/blas/syrk/.gitignore b/sw/blas/syrk/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/blas/syrk/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/dnn/.gitignore b/sw/dnn/.gitignore
deleted file mode 100644
index aed262ca8..000000000
--- a/sw/dnn/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*/data/data.h

From bdfa4bedcc82afc96f0e56f72a14dd5f44a58979 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 23 Aug 2024 13:01:03 +0200
Subject: [PATCH 16/19] sw: Uniformize random data generation

---
 sw/apps/atax/scripts/datagen.py        |  5 ++--
 sw/apps/correlation/scripts/datagen.py |  3 +-
 sw/apps/covariance/scripts/datagen.py  |  6 ++--
 sw/apps/doitgen/scripts/datagen.py     |  8 ++---
 sw/blas/axpy/scripts/datagen.py        | 12 ++++----
 sw/blas/dot/scripts/datagen.py         |  7 ++---
 sw/blas/gemm/scripts/datagen.py        | 16 +++++-----
 sw/blas/syrk/scripts/datagen.py        | 12 ++++----
 util/sim/data_utils.py                 | 41 ++++++++++++++++++++++++++
 9 files changed, 74 insertions(+), 36 deletions(-)

diff --git a/sw/apps/atax/scripts/datagen.py b/sw/apps/atax/scripts/datagen.py
index 51317c70e..c73ae70a2 100755
--- a/sw/apps/atax/scripts/datagen.py
+++ b/sw/apps/atax/scripts/datagen.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 
+import snitch.util.sim.data_utils as du
 from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
     format_array_declaration, format_ifdef_wrapper, DataGen
 
@@ -26,8 +27,8 @@ def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         M, N = kwargs['M'], kwargs['N']
-        A = np.random.randint(-200, 100, size=(M, N))/100
-        x = np.random.randint(-200, 100, size=(N, 1))/100
+        A = du.generate_random_array((M, N))
+        x = du.generate_random_array((N, 1))
         y = self.golden_model(A, x)
 
         assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
diff --git a/sw/apps/correlation/scripts/datagen.py b/sw/apps/correlation/scripts/datagen.py
index b2047d5eb..7880c1693 100755
--- a/sw/apps/correlation/scripts/datagen.py
+++ b/sw/apps/correlation/scripts/datagen.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 
+import snitch.util.sim.data_utils as du
 from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
     format_array_declaration, format_ifdef_wrapper, DataGen
 
@@ -26,7 +27,7 @@ def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         M, N = kwargs['M'], kwargs['N']
-        data = np.random.randint(-200, 100, size=(N, M))/100
+        data = du.generate_random_array((N, M))
         corr = self.golden_model(data)
 
         data = data.flatten()
diff --git a/sw/apps/covariance/scripts/datagen.py b/sw/apps/covariance/scripts/datagen.py
index c3b7cd8b3..07bb92d0a 100755
--- a/sw/apps/covariance/scripts/datagen.py
+++ b/sw/apps/covariance/scripts/datagen.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 
-from snitch.util.sim import data_utils
+import snitch.util.sim.data_utils as du
 from snitch.util.sim.data_utils import format_array_definition, \
     format_array_declaration, format_struct_definition, DataGen
 
@@ -42,14 +42,14 @@ def validate(self, **kwargs):
         total_size = 2 * a_tile_size + b_tile_size
         if DOUBLE_BUFFER:
             total_size *= 2
-        data_utils.validate_tcdm_footprint(total_size)
+        du.validate_tcdm_footprint(total_size)
 
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         self.validate(**kwargs)
 
-        data = np.random.randint(-200, 100, size=(kwargs['n'], kwargs['m']))
+        data = du.generate_random_array((kwargs['n'], kwargs['m']))
         cov = self.golden_model(data)
 
         data = data.transpose().flatten()
diff --git a/sw/apps/doitgen/scripts/datagen.py b/sw/apps/doitgen/scripts/datagen.py
index d0dddf6f5..5f14ec86d 100755
--- a/sw/apps/doitgen/scripts/datagen.py
+++ b/sw/apps/doitgen/scripts/datagen.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 
-from snitch.util.sim import data_utils
+import snitch.util.sim.data_utils as du
 from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen
 
 np.random.seed(42)
@@ -49,15 +49,15 @@ def validate(self, **kwargs):
         total_size = 2 * a_tile_size + x_size
         if DOUBLE_BUFFER:
             total_size *= 2
-        data_utils.validate_tcdm_footprint(total_size)
+        du.validate_tcdm_footprint(total_size)
 
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         self.validate(**kwargs)
 
-        A = np.random.randint(-100, 100, size=(kwargs['r'], kwargs['q'], kwargs['s']))
-        x = np.random.randint(-100, 100, size=(kwargs['s'], kwargs['s']))
+        A = du.generate_random_array((kwargs['r'], kwargs['q'], kwargs['s']))
+        x = du.generate_random_array((kwargs['s'], kwargs['s']))
 
         _ = self.golden_model(A, x)
 
diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py
index cf6795667..ec00a4c88 100755
--- a/sw/blas/axpy/scripts/datagen.py
+++ b/sw/blas/axpy/scripts/datagen.py
@@ -8,15 +8,13 @@
 import numpy as np
 import sys
 
-from snitch.util.sim import data_utils
+import snitch.util.sim.data_utils as du
 from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
     format_array_declaration, format_ifdef_wrapper, format_struct_definition, DataGen
 
 
 class AxpyDataGen(DataGen):
 
-    MIN = -1000
-    MAX = +1000
     # AXI splits bursts crossing 4KB address boundaries. To minimize
     # the occurrence of these splits the data should be aligned to 4KB
     BURST_ALIGNMENT = 4096
@@ -36,16 +34,16 @@ def validate_config(self, **kwargs):
         # Note: doesn't account for gaps created by data alignment
         vec_size = n_per_tile * 8
         total_size = 2 * 3 * vec_size
-        data_utils.validate_tcdm_footprint(total_size)
+        du.validate_tcdm_footprint(total_size)
 
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         self.validate_config(**kwargs)
 
-        a = np.random.uniform(self.MIN, self.MAX, 1)[0]
-        x = np.random.uniform(self.MIN, self.MAX, kwargs['n'])
-        y = np.random.uniform(self.MIN, self.MAX, kwargs['n'])
+        a = du.generate_random_array(1)[0]
+        x = du.generate_random_array(kwargs['n'])
+        y = du.generate_random_array(kwargs['n'])
         g = self.golden_model(a, x, y)
 
         x_uid = 'x'
diff --git a/sw/blas/dot/scripts/datagen.py b/sw/blas/dot/scripts/datagen.py
index 01560c51f..d11b53ff8 100755
--- a/sw/blas/dot/scripts/datagen.py
+++ b/sw/blas/dot/scripts/datagen.py
@@ -6,14 +6,13 @@
 import numpy as np
 import sys
 
+import snitch.util.sim.data_utils as du
 from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
     format_scalar_declaration, format_ifdef_wrapper, DataGen
 
 
 class DotDataGen(DataGen):
 
-    MIN = -1000
-    MAX = +1000
     # AXI splits bursts crossing 4KB address boundaries. To minimize
     # the occurrence of these splits the data should be aligned to 4KB
     BURST_ALIGNMENT = 4096
@@ -25,8 +24,8 @@ def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         n = kwargs['n']
-        x = np.random.uniform(self.MIN, self.MAX, n)
-        y = np.random.uniform(self.MIN, self.MAX, n)
+        x = du.generate_random_array(n)
+        y = du.generate_random_array(n)
         g = self.golden_model(x, y)
 
         assert (n % (8 * 4)) == 0, "n must be an integer multiple of the number of cores times " \
diff --git a/sw/blas/gemm/scripts/datagen.py b/sw/blas/gemm/scripts/datagen.py
index da7f8ba57..636175604 100755
--- a/sw/blas/gemm/scripts/datagen.py
+++ b/sw/blas/gemm/scripts/datagen.py
@@ -10,10 +10,9 @@
 
 import numpy as np
 import re
-import pyflexfloat as ff
 import sys
 
-from snitch.util.sim import data_utils
+import snitch.util.sim.data_utils as du
 from snitch.util.sim.data_utils import DataGen, format_array_declaration, \
     format_struct_definition, format_array_definition, format_ifdef_wrapper
 
@@ -56,14 +55,14 @@ def validate_config(self, gemm_fp, parallelize_m,
 
         # Calculate total TCDM occupation
         # Note: doesn't account for double buffering
-        prec = data_utils.size_from_precision_t(dtype)
+        prec = du.size_from_precision_t(dtype)
         a_size = frac_m * frac_k * prec
         b_size = frac_k * frac_n * prec
         c_size = frac_m * frac_n * prec
         total_size = a_size
         total_size += b_size
         total_size += c_size
-        data_utils.validate_tcdm_footprint(total_size)
+        du.validate_tcdm_footprint(total_size)
 
         assert (M % m_tiles) == 0, 'M is not an integer multiple of tile size'
         assert (N % n_tiles) == 0, 'N is not an integer multiple of tile size'
@@ -99,12 +98,11 @@ def emit_header(self, **kwargs):
 
         prec, _ = self.infer_implementation(kwargs['gemm_fp'])
 
-        ff_desc = data_utils.ff_desc_from_precision_t(prec)
-        ctype = data_utils.ctype_from_precision_t(prec)
+        ctype = du.ctype_from_precision_t(prec)
 
-        a = ff.array(np.random.rand(M, K), ff_desc)
-        b = ff.array(np.random.rand(K, N), ff_desc)
-        c = ff.array(np.random.rand(M, N), ff_desc)
+        a = du.generate_random_array((M, K), prec)
+        b = du.generate_random_array((K, N), prec)
+        c = du.generate_random_array((M, N), prec)
         result = self.exact_golden_model(1, a, b, kwargs['beta'], c)
 
         # Store matrices in transposed form if requested
diff --git a/sw/blas/syrk/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py
index 9b4959fca..ad15222f3 100755
--- a/sw/blas/syrk/scripts/datagen.py
+++ b/sw/blas/syrk/scripts/datagen.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 
-from snitch.util.sim import data_utils
+import snitch.util.sim.data_utils as du
 from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen
 
 
@@ -37,7 +37,7 @@ def validate(self, **kwargs):
         total_size = 2 * a_tile_size + c_tile_size
         if DOUBLE_BUFFER:
             total_size *= 2
-        data_utils.validate_tcdm_footprint(total_size)
+        du.validate_tcdm_footprint(total_size)
 
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
@@ -47,14 +47,14 @@ def emit_header(self, **kwargs):
         if 'alpha' in kwargs:
             alpha = kwargs['alpha']
         else:
-            alpha = np.random.randint(-200, 100)/100
+            alpha = du.generate_random_array(1)[0]
         if 'beta' in kwargs:
             beta = kwargs['beta']
         else:
-            beta = np.random.randint(-200, 100)/100
+            beta = du.generate_random_array(1)[0]
 
-        A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100
-        C_in = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['m']))/100
+        A = du.generate_random_array((kwargs['m'], kwargs['n']))
+        C_in = du.generate_random_array((kwargs['m'], kwargs['m']))
 
         A = A.flatten()
         C_in = C_in.flatten()
diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py
index e6f48acce..3b732c5cc 100644
--- a/util/sim/data_utils.py
+++ b/util/sim/data_utils.py
@@ -83,6 +83,24 @@ def torch_type_from_precision_t(prec):
     return precision_t_to_torch_type_map[_integer_precision_t(prec)]
 
 
+def numpy_type_from_precision_t(prec):
+    """Convert `precision_t` type to Numpy type.
+
+    Args:
+        prec: A value of type `precision_t`. Accepts both enum strings
+            (e.g. "FP64") and integer enumeration values (e.g. 8).
+    """
+    # Types which have a direct correspondence in Numpy
+    precision_t_to_numpy_type_map = {
+        8: np.float64,
+        4: np.float32,
+        2: np.float16
+    }
+    prec = _integer_precision_t(prec)
+    assert prec != 1, "No direct correspondence between FP8 and Numpy"
+    return precision_t_to_numpy_type_map[prec]
+
+
 # Returns the C type representing a floating-point value of the specified precision
 def ctype_from_precision_t(prec):
     """Convert `precision_t` type to a C type string.
@@ -100,6 +118,29 @@ def ctype_from_precision_t(prec):
     return precision_t_to_ctype_map[_integer_precision_t(prec)]
 
 
+def generate_random_array(size, prec='FP64'):
+    """Consistent random array generation for Snitch experiments.
+
+    Samples values between -1 and 1 from a uniform distribution and
+    returns them in the exact specified type, e.g. actual 64-bit doubles.
+
+    This function ensures that e.g. power measurements are not skewed
+    by using integer values in the FPU.
+
+    Args:
+        size: Array dimensions, as an int or a tuple of ints.
+        prec: A value of type `precision_t`. Accepts both enum strings
+            (e.g. "FP64") and integer enumeration values (e.g. 8).
+    """
+    # Generate in 64b precision and then cast down
+    rand = np.random.default_rng().random(size=size, dtype=np.float64) * 2 - 1
+    # Generate FlexFloat array for 8b floats, cast from 16b Numpy array
+    if _integer_precision_t(prec) == 1:
+        return ff.array(rand.astype(np.float16), ff_desc_from_precision_t(prec))
+    else:
+        return rand.astype(numpy_type_from_precision_t(prec))
+
+
 def flatten(array):
     """Flatten various array types with a homogeneous API.
 

From 8ce7b9119c73765322bae9ddd6c1aa90226f3850 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 23 Aug 2024 13:07:01 +0200
Subject: [PATCH 17/19] sw: Uniformize `data_utils` import

---
 sw/apps/atax/scripts/datagen.py        | 18 ++++++++---------
 sw/apps/correlation/scripts/datagen.py | 19 +++++++++--------
 sw/apps/covariance/scripts/datagen.py  | 10 ++++-----
 sw/apps/doitgen/scripts/datagen.py     |  9 ++++-----
 sw/blas/axpy/scripts/datagen.py        | 19 ++++++++---------
 sw/blas/dot/scripts/datagen.py         | 22 +++++++++-----------
 sw/blas/gemm/scripts/datagen.py        | 28 ++++++++++++--------------
 sw/blas/syrk/scripts/datagen.py        |  9 ++++-----
 8 files changed, 60 insertions(+), 74 deletions(-)

diff --git a/sw/apps/atax/scripts/datagen.py b/sw/apps/atax/scripts/datagen.py
index c73ae70a2..0008bea26 100755
--- a/sw/apps/atax/scripts/datagen.py
+++ b/sw/apps/atax/scripts/datagen.py
@@ -9,8 +9,6 @@
 import numpy as np
 
 import snitch.util.sim.data_utils as du
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
 
 
 # AXI splits bursts crossing 4KB address boundaries. To minimize
@@ -18,7 +16,7 @@
 BURST_ALIGNMENT = 4096
 
 
-class AtaxDataGen(DataGen):
+class AtaxDataGen(du.DataGen):
 
     def golden_model(self, A, x):
         return np.matmul(A.transpose(), np.matmul(A, x))
@@ -38,13 +36,13 @@ def emit_header(self, **kwargs):
         x = x.flatten()
         y = y.flatten()
 
-        header += [format_scalar_definition('uint32_t', 'M', M)]
-        header += [format_scalar_definition('uint32_t', 'N', N)]
-        header += [format_array_definition('double', 'A', A, alignment=BURST_ALIGNMENT)]
-        header += [format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT)]
-        header += [format_array_declaration('double', 'y', y.shape, alignment=BURST_ALIGNMENT)]
-        result_def = format_array_definition('double', 'golden', y, alignment=BURST_ALIGNMENT)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        header += [du.format_scalar_definition('uint32_t', 'M', M)]
+        header += [du.format_scalar_definition('uint32_t', 'N', N)]
+        header += [du.format_array_definition('double', 'A', A, alignment=BURST_ALIGNMENT)]
+        header += [du.format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT)]
+        header += [du.format_array_declaration('double', 'y', y.shape, alignment=BURST_ALIGNMENT)]
+        result_def = du.format_array_definition('double', 'golden', y, alignment=BURST_ALIGNMENT)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/apps/correlation/scripts/datagen.py b/sw/apps/correlation/scripts/datagen.py
index 7880c1693..d60f527d1 100755
--- a/sw/apps/correlation/scripts/datagen.py
+++ b/sw/apps/correlation/scripts/datagen.py
@@ -9,8 +9,6 @@
 import numpy as np
 
 import snitch.util.sim.data_utils as du
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
 
 
 # AXI splits bursts crossing 4KB address boundaries. To minimize
@@ -18,7 +16,7 @@
 BURST_ALIGNMENT = 4096
 
 
-class CorrelationDataGen(DataGen):
+class CorrelationDataGen(du.DataGen):
 
     def golden_model(self, data):
         return np.corrcoef(data, rowvar=False)
@@ -33,13 +31,14 @@ def emit_header(self, **kwargs):
         data = data.flatten()
         corr = corr.flatten()
 
-        header += [format_scalar_definition('uint32_t', 'M', M)]
-        header += [format_scalar_definition('uint32_t', 'N', N)]
-        header += [format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)]
-        header += [format_array_declaration('double', 'corr', corr.shape,
-                                            alignment=BURST_ALIGNMENT)]
-        result_def = format_array_definition('double', 'golden', corr, alignment=BURST_ALIGNMENT)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        header += [du.format_scalar_definition('uint32_t', 'M', M)]
+        header += [du.format_scalar_definition('uint32_t', 'N', N)]
+        header += [du.format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)]
+        header += [du.format_array_declaration('double', 'corr', corr.shape,
+                                               alignment=BURST_ALIGNMENT)]
+        result_def = du.format_array_definition('double', 'golden', corr,
+                                                alignment=BURST_ALIGNMENT)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/apps/covariance/scripts/datagen.py b/sw/apps/covariance/scripts/datagen.py
index 07bb92d0a..7beb2c671 100755
--- a/sw/apps/covariance/scripts/datagen.py
+++ b/sw/apps/covariance/scripts/datagen.py
@@ -9,15 +9,13 @@
 import numpy as np
 
 import snitch.util.sim.data_utils as du
-from snitch.util.sim.data_utils import format_array_definition, \
-    format_array_declaration, format_struct_definition, DataGen
 
 np.random.seed(42)
 
 DOUBLE_BUFFER = True
 
 
-class CovarianceDataGen(DataGen):
+class CovarianceDataGen(du.DataGen):
 
     # Function pointers to alternative implementations
     FUNCPTRS = ["covariance_naive", "covariance_baseline", "covariance_opt"]
@@ -69,9 +67,9 @@ def emit_header(self, **kwargs):
             'funcptr': kwargs['funcptr']
         }
 
-        header += [format_array_definition('double', data_uid, data)]
-        header += [format_array_declaration('double', cov_uid, cov.shape)]
-        header += [format_struct_definition('covariance_args_t', 'args', cfg)]
+        header += [du.format_array_definition('double', data_uid, data)]
+        header += [du.format_array_declaration('double', cov_uid, cov.shape)]
+        header += [du.format_struct_definition('covariance_args_t', 'args', cfg)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/apps/doitgen/scripts/datagen.py b/sw/apps/doitgen/scripts/datagen.py
index 5f14ec86d..d1a9c3b46 100755
--- a/sw/apps/doitgen/scripts/datagen.py
+++ b/sw/apps/doitgen/scripts/datagen.py
@@ -8,14 +8,13 @@
 import numpy as np
 
 import snitch.util.sim.data_utils as du
-from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen
 
 np.random.seed(42)
 
 DOUBLE_BUFFER = True
 
 
-class DoitgenDataGen(DataGen):
+class DoitgenDataGen(du.DataGen):
 
     # Function pointers to alternative implementations
     FUNCPTRS = ["doitgen_naive", "doitgen_baseline", "doitgen_opt"]
@@ -78,9 +77,9 @@ def emit_header(self, **kwargs):
             'funcptr': kwargs['funcptr']
         }
 
-        header += [format_array_definition('double', A_uid, A)]
-        header += [format_array_definition('double', x_uid, x)]
-        header += [format_struct_definition('doitgen_args_t', 'args', cfg)]
+        header += [du.format_array_definition('double', A_uid, A)]
+        header += [du.format_array_definition('double', x_uid, x)]
+        header += [du.format_struct_definition('doitgen_args_t', 'args', cfg)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py
index ec00a4c88..38634dd5e 100755
--- a/sw/blas/axpy/scripts/datagen.py
+++ b/sw/blas/axpy/scripts/datagen.py
@@ -5,15 +5,12 @@
 #
 # Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
 
-import numpy as np
 import sys
 
 import snitch.util.sim.data_utils as du
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, format_struct_definition, DataGen
 
 
-class AxpyDataGen(DataGen):
+class AxpyDataGen(du.DataGen):
 
     # AXI splits bursts crossing 4KB address boundaries. To minimize
     # the occurrence of these splits the data should be aligned to 4KB
@@ -60,16 +57,16 @@ def emit_header(self, **kwargs):
             'funcptr': kwargs['funcptr']
         }
 
-        header += [format_scalar_definition('const double', 'a', a)]
-        header += [format_array_definition('double', x_uid, x,
+        header += [du.format_scalar_definition('const double', 'a', a)]
+        header += [du.format_array_definition('double', x_uid, x,
                    alignment=self.BURST_ALIGNMENT, section=kwargs['section'])]
-        header += [format_array_definition('double', y_uid, y,
+        header += [du.format_array_definition('double', y_uid, y,
                    alignment=self.BURST_ALIGNMENT, section=kwargs['section'])]
-        header += [format_array_declaration('double', z_uid, x.shape,
+        header += [du.format_array_declaration('double', z_uid, x.shape,
                    alignment=self.BURST_ALIGNMENT, section=kwargs['section'])]
-        header += [format_struct_definition('axpy_args_t', 'args', cfg)]
-        result_def = format_array_definition('double', 'g', g)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        header += [du.format_struct_definition('axpy_args_t', 'args', cfg)]
+        result_def = du.format_array_definition('double', 'g', g)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/blas/dot/scripts/datagen.py b/sw/blas/dot/scripts/datagen.py
index d11b53ff8..8a8631a6a 100755
--- a/sw/blas/dot/scripts/datagen.py
+++ b/sw/blas/dot/scripts/datagen.py
@@ -7,11 +7,9 @@
 import sys
 
 import snitch.util.sim.data_utils as du
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_scalar_declaration, format_ifdef_wrapper, DataGen
 
 
-class DotDataGen(DataGen):
+class DotDataGen(du.DataGen):
 
     # AXI splits bursts crossing 4KB address boundaries. To minimize
     # the occurrence of these splits the data should be aligned to 4KB
@@ -31,15 +29,15 @@ def emit_header(self, **kwargs):
         assert (n % (8 * 4)) == 0, "n must be an integer multiple of the number of cores times " \
                                    "the unrolling factor"
 
-        header += [format_scalar_definition('const uint32_t', 'n', n)]
-        header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT,
-                                           section=kwargs['section'])]
-        header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT,
-                                           section=kwargs['section'])]
-        header += [format_scalar_declaration('double', 'result', alignment=self.BURST_ALIGNMENT,
-                                             section=kwargs['section'])]
-        result_def = format_scalar_definition('double', 'g', g)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        header += [du.format_scalar_definition('const uint32_t', 'n', n)]
+        header += [du.format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT,
+                                              section=kwargs['section'])]
+        header += [du.format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT,
+                                              section=kwargs['section'])]
+        header += [du.format_scalar_declaration('double', 'result', alignment=self.BURST_ALIGNMENT,
+                                                section=kwargs['section'])]
+        result_def = du.format_scalar_definition('double', 'g', g)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/blas/gemm/scripts/datagen.py b/sw/blas/gemm/scripts/datagen.py
index 636175604..2eb6e2f4d 100755
--- a/sw/blas/gemm/scripts/datagen.py
+++ b/sw/blas/gemm/scripts/datagen.py
@@ -13,14 +13,12 @@
 import sys
 
 import snitch.util.sim.data_utils as du
-from snitch.util.sim.data_utils import DataGen, format_array_declaration, \
-    format_struct_definition, format_array_definition, format_ifdef_wrapper
 
 
 np.random.seed(42)
 
 
-class GemmDataGen(DataGen):
+class GemmDataGen(du.DataGen):
 
     # AXI splits bursts crossing 4KB address boundaries. To minimize
     # the occurrence of these splits the data should be aligned to 4KB
@@ -125,18 +123,18 @@ def emit_header(self, **kwargs):
         b = b.flatten()
         c = c.flatten()
 
-        header += [format_array_declaration(ctype, a_uid, a.shape)]
-        header += [format_array_declaration(ctype, b_uid, b.shape)]
-        header += [format_array_declaration(ctype, c_uid, c.shape)]
-        header += [format_struct_definition('gemm_args_t', 'args', cfg)]
-        header += [format_array_definition(ctype, a_uid, a,
-                                           section=kwargs['section'])]
-        header += [format_array_definition(ctype, b_uid, b,
-                                           section=kwargs['section'])]
-        header += [format_array_definition(ctype, c_uid, c,
-                                           section=kwargs['section'])]
-        result_def = format_array_definition(ctype, 'result', result.flatten())
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        header += [du.format_array_declaration(ctype, a_uid, a.shape)]
+        header += [du.format_array_declaration(ctype, b_uid, b.shape)]
+        header += [du.format_array_declaration(ctype, c_uid, c.shape)]
+        header += [du.format_struct_definition('gemm_args_t', 'args', cfg)]
+        header += [du.format_array_definition(ctype, a_uid, a,
+                                              section=kwargs['section'])]
+        header += [du.format_array_definition(ctype, b_uid, b,
+                                              section=kwargs['section'])]
+        header += [du.format_array_definition(ctype, c_uid, c,
+                                              section=kwargs['section'])]
+        result_def = du.format_array_definition(ctype, 'result', result.flatten())
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/blas/syrk/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py
index ad15222f3..3fb86644f 100755
--- a/sw/blas/syrk/scripts/datagen.py
+++ b/sw/blas/syrk/scripts/datagen.py
@@ -8,13 +8,12 @@
 import numpy as np
 
 import snitch.util.sim.data_utils as du
-from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen
 
 
 DOUBLE_BUFFER = True
 
 
-class SyrkDataGen(DataGen):
+class SyrkDataGen(du.DataGen):
 
     # Function pointers to alternative implementations
     FUNCPTRS = ["syrk_naive", "syrk_baseline", "syrk_opt"]
@@ -73,9 +72,9 @@ def emit_header(self, **kwargs):
             'funcptr': kwargs['funcptr']
         }
 
-        header += [format_array_definition('double', A_uid, A)]
-        header += [format_array_definition('double', C_uid, C_in)]
-        header += [format_struct_definition('syrk_args_t', 'args', cfg)]
+        header += [du.format_array_definition('double', A_uid, A)]
+        header += [du.format_array_definition('double', C_uid, C_in)]
+        header += [du.format_struct_definition('syrk_args_t', 'args', cfg)]
         header = '\n\n'.join(header)
 
         return header

From ad76f7807474ce3713c81e40ca20be2718790f47 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 23 Aug 2024 14:58:31 +0200
Subject: [PATCH 18/19] gemm: Relax error thresholds as cancellations can now
 take place

---
 sw/blas/gemm/scripts/verify.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sw/blas/gemm/scripts/verify.py b/sw/blas/gemm/scripts/verify.py
index 40840b327..353ea1328 100755
--- a/sw/blas/gemm/scripts/verify.py
+++ b/sw/blas/gemm/scripts/verify.py
@@ -18,9 +18,9 @@ class GemmVerifier(Verifier):
     OUTPUT_UIDS = ['c']
     ERR_THRESHOLD = {
         1: 1e-4,
-        2: 1e-2,
-        4: 1e-6,
-        8: 1e-6
+        2: 8e-2,
+        4: 1e-3,
+        8: 1e-3
     }
 
     def __init__(self):

From 10c545fd3c86c48ce2c47d0bbdde6b82b864d68d Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Wed, 28 Aug 2024 17:08:48 +0200
Subject: [PATCH 19/19] ci: Fix Dockerfile

---
 util/container/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/util/container/Dockerfile b/util/container/Dockerfile
index 9cdc7d9aa..bfef21266 100644
--- a/util/container/Dockerfile
+++ b/util/container/Dockerfile
@@ -94,6 +94,7 @@ RUN tar xzf snitch-spike-dasm-${SPIKE_DASM_VERSION}-x86_64-linux-gnu-ubuntu18.04
 # Install Doxygen
 RUN wget https://www.doxygen.nl/files/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz
 RUN tar xzf doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz
+RUN mv doxygen-${DOXYGEN_VERSION} doxygen
 
 # 2. Stage
 FROM ubuntu:22.04 AS snitch_cluster
@@ -154,7 +155,7 @@ COPY --from=builder /tools/spike-dasm bin/
 COPY --from=builder /root/.cargo/bin/banshee bin/
 COPY --from=builder /opt/python /opt/python
 COPY --from=builder /tools/verilator /tools/verilator/
-COPY --from=builder /tools/doxygen-${DOXYGEN_VERSION}/bin/doxygen bin/
+COPY --from=builder /tools/doxygen/bin/doxygen bin/
 
 # Create and activate virtual environment
 ENV VIRTUAL_ENV "/root/.venvs/snitch_cluster"