diff --git a/sw/apps/atax/.gitignore b/sw/apps/atax/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/apps/atax/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/apps/atax/scripts/datagen.py b/sw/apps/atax/scripts/datagen.py
index 51317c70e..0008bea26 100755
--- a/sw/apps/atax/scripts/datagen.py
+++ b/sw/apps/atax/scripts/datagen.py
@@ -8,8 +8,7 @@
 import numpy as np
 
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
+import snitch.util.sim.data_utils as du
 
 
 # AXI splits bursts crossing 4KB address boundaries. To minimize
@@ -17,7 +16,7 @@
 BURST_ALIGNMENT = 4096
 
 
-class AtaxDataGen(DataGen):
+class AtaxDataGen(du.DataGen):
 
     def golden_model(self, A, x):
         return np.matmul(A.transpose(), np.matmul(A, x))
@@ -26,8 +25,8 @@
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         M, N = kwargs['M'], kwargs['N']
-        A = np.random.randint(-200, 100, size=(M, N))/100
-        x = np.random.randint(-200, 100, size=(N, 1))/100
+        A = du.generate_random_array((M, N))
+        x = du.generate_random_array((N, 1))
         y = self.golden_model(A, x)
 
         assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
@@ -37,13 +36,13 @@ def emit_header(self, **kwargs):
         x = x.flatten()
         y = y.flatten()
 
-        header += [format_scalar_definition('uint32_t', 'M', M)]
-        header += [format_scalar_definition('uint32_t', 'N', N)]
-        header += [format_array_definition('double', 'A', A, alignment=BURST_ALIGNMENT)]
-        header += [format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT)]
-        header += [format_array_declaration('double', 'y', y.shape, alignment=BURST_ALIGNMENT)]
-        result_def = format_array_definition('double', 'golden', y, alignment=BURST_ALIGNMENT)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        header += [du.format_scalar_definition('uint32_t', 'M', M)]
+        header += [du.format_scalar_definition('uint32_t', 'N', N)]
+        header += [du.format_array_definition('double', 'A', A, alignment=BURST_ALIGNMENT)]
+        header += [du.format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT)]
+        header += [du.format_array_declaration('double', 'y', y.shape, alignment=BURST_ALIGNMENT)]
+        result_def = du.format_array_definition('double', 'golden', y, alignment=BURST_ALIGNMENT)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/apps/common.mk b/sw/apps/common.mk
index 89f5da9f6..6bdc85984 100644
--- a/sw/apps/common.mk
+++ b/sw/apps/common.mk
@@ -13,7 +13,7 @@
 DATA_H := $($(APP)_BUILD_DIR)/data.h
 DATAGEN_PY = $(SCRIPTS_DIR)/datagen.py
 
 $(APP)_HEADERS := $(DATA_H)
-$(APP)_INCDIRS := $(dir $(DATA_H)) $(SRC_DIR)
+$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR)
 
 $(dir $(DATA_H)):
	mkdir -p $@
diff --git a/sw/apps/correlation/.gitignore b/sw/apps/correlation/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/apps/correlation/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/apps/correlation/scripts/datagen.py b/sw/apps/correlation/scripts/datagen.py
index b2047d5eb..d60f527d1 100755
--- a/sw/apps/correlation/scripts/datagen.py
+++ b/sw/apps/correlation/scripts/datagen.py
@@ -8,8 +8,7 @@
 import numpy as np
 
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
+import snitch.util.sim.data_utils as du
 
 
 # AXI splits bursts crossing 4KB address boundaries. To minimize
@@ -17,7 +16,7 @@
 BURST_ALIGNMENT = 4096
 
 
-class CorrelationDataGen(DataGen):
+class CorrelationDataGen(du.DataGen):
 
     def golden_model(self, data):
         return np.corrcoef(data, rowvar=False)
@@ -26,19 +25,20 @@
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         M, N = kwargs['M'], kwargs['N']
-        data = np.random.randint(-200, 100, size=(N, M))/100
+        data = du.generate_random_array((N, M))
         corr = self.golden_model(data)
 
         data = data.flatten()
         corr = corr.flatten()
 
-        header += [format_scalar_definition('uint32_t', 'M', M)]
-        header += [format_scalar_definition('uint32_t', 'N', N)]
-        header += [format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)]
-        header += [format_array_declaration('double', 'corr', corr.shape,
-                                            alignment=BURST_ALIGNMENT)]
-        result_def = format_array_definition('double', 'golden', corr, alignment=BURST_ALIGNMENT)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        header += [du.format_scalar_definition('uint32_t', 'M', M)]
+        header += [du.format_scalar_definition('uint32_t', 'N', N)]
+        header += [du.format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)]
+        header += [du.format_array_declaration('double', 'corr', corr.shape,
+                                               alignment=BURST_ALIGNMENT)]
+        result_def = du.format_array_definition('double', 'golden', corr,
+                                                alignment=BURST_ALIGNMENT)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/apps/covariance/.gitignore b/sw/apps/covariance/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/apps/covariance/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/apps/covariance/data/params.json b/sw/apps/covariance/data/params.json
index 9e89d9f85..5ae088d97 100644
--- a/sw/apps/covariance/data/params.json
+++ b/sw/apps/covariance/data/params.json
@@ -3,6 +3,8 @@
 // SPDX-License-Identifier: Apache-2.0
 
 {
-    M: 16,
-    N: 8
+    "m": 32,
+    "n": 2,
+    "m_tiles": 2,
+    "funcptr": "covariance_opt"
 }
diff --git a/sw/apps/covariance/scripts/datagen.py b/sw/apps/covariance/scripts/datagen.py
index 44e20d55e..7beb2c671 100755
--- a/sw/apps/covariance/scripts/datagen.py
+++ b/sw/apps/covariance/scripts/datagen.py
@@ -8,38 +8,68 @@
 import numpy as np
 
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
+import snitch.util.sim.data_utils as du
 
+np.random.seed(42)
 
-# AXI splits bursts crossing 4KB address boundaries. To minimize
-# the occurrence of these splits the data should be aligned to 4KB
-BURST_ALIGNMENT = 4096
+DOUBLE_BUFFER = True
 
 
-class CovarianceDataGen(DataGen):
+class CovarianceDataGen(du.DataGen):
+
+    # Function pointers to alternative implementations
+    FUNCPTRS = ["covariance_naive", "covariance_baseline", "covariance_opt"]
 
     def golden_model(self, data):
         return np.cov(data, rowvar=False)
 
+    def validate(self, **kwargs):
+        n_cores = 8
+        assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles"
+        m_per_tile = kwargs['m'] / kwargs['m_tiles']
+        assert (m_per_tile % n_cores) == 0, \
+            "m_per_tile must be an integer multiple of the number of cores"
+        assert (m_per_tile % 4) == 0, "m_per_tile must be an integer multiple of unroll1 = 4"
+        m_per_core = m_per_tile / n_cores
+        assert (m_per_core % 2) == 0, "m_per_core must be an integer multiple of unroll0 = 2"
+        assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
+
+        # Calculate total TCDM occupation
+        a_tile_size = m_per_tile * kwargs['n'] * 8
+        b_tile_size = m_per_tile * m_per_tile * 8
+        total_size = 2 * a_tile_size + b_tile_size
+        if DOUBLE_BUFFER:
+            total_size *= 2
+        du.validate_tcdm_footprint(total_size)
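+        # For example (illustrative numbers, not part of the original
+        # script): with the default params.json (m=32, n=2, m_tiles=2),
+        # each A tile is 16*2*8 = 256 B and the C tile 16*16*8 = 2048 B,
+        # so the double-buffered footprint is 2*(2*256 + 2048) = 5120 B.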
+
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
-        M, N = kwargs['M'], kwargs['N']
-        data = np.random.randint(-200, 100, size=(N, M))
-        cov = self.golden_model(data)
+        self.validate(**kwargs)
 
-        assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
+        data = du.generate_random_array((kwargs['n'], kwargs['m']))
+        cov = self.golden_model(data)
 
-        data = data.flatten()
+        data = data.transpose().flatten()
         cov = cov.flatten()
 
-        header += [format_scalar_definition('uint32_t', 'M', M)]
-        header += [format_scalar_definition('uint32_t', 'N', N)]
-        header += [format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)]
-        header += [format_array_declaration('double', 'cov', cov.shape, alignment=BURST_ALIGNMENT)]
-        result_def = format_array_definition('double', 'golden', cov, alignment=BURST_ALIGNMENT)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        data_uid = 'data'
+        cov_uid = 'cov'
+
+        cfg = {
+            'm': kwargs['m'],
+            'n': kwargs['n'],
+            'inv_n': 1 / kwargs['n'],
+            'inv_n_m1': 1 / (kwargs['n'] - 1),
+            'data': data_uid,
+            'cov': cov_uid,
+            'm_tiles': kwargs['m_tiles'],
+            'funcptr': kwargs['funcptr']
+        }
+
+        header += [du.format_array_definition('double', data_uid, data)]
+        header += [du.format_array_declaration('double', cov_uid, cov.shape)]
+        header += [du.format_struct_definition('covariance_args_t', 'args', cfg)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/apps/covariance/scripts/verify.py b/sw/apps/covariance/scripts/verify.py
index 4c5b0cdd1..a390d83d1 100755
--- a/sw/apps/covariance/scripts/verify.py
+++ b/sw/apps/covariance/scripts/verify.py
@@ -16,14 +16,26 @@
 class CovarianceVerifier(Verifier):
 
     OUTPUT_UIDS = ['cov']
 
+    def __init__(self):
+        super().__init__()
+        self.func_args = {
+            'm': 'I',
+            'n': 'I',
+            'inv_n': 'd',
+            'inv_n_m1': 'd',
+            'data': 'I',
+            'cov': 'I',
+            'm_tiles': 'I',
+            'funcptr': 'I'
+        }
+        self.func_args = self.get_input_from_symbol('args', self.func_args)
+
     def get_actual_results(self):
-        return self.get_output_from_symbol('cov', 'double')
+        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double')
 
     def get_expected_results(self):
-        M = self.get_input_from_symbol('M', 'uint32_t')[0]
-        N = self.get_input_from_symbol('N', 'uint32_t')[0]
         data = self.get_input_from_symbol('data', 'double')
-        data = np.reshape(data, (N, M))
+        data = np.reshape(data, (self.func_args['m'], self.func_args['n'])).transpose()
         return CovarianceDataGen().golden_model(data).flatten()
 
     def check_results(self, *args):
diff --git a/sw/apps/covariance/src/args.h b/sw/apps/covariance/src/args.h
new file mode 100644
index 000000000..cd15bc852
--- /dev/null
+++ b/sw/apps/covariance/src/args.h
@@ -0,0 +1,23 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#pragma once
+#include <stdint.h>
+
+typedef void (*covariance_fp_t)(uint32_t m, uint32_t n, double inv_n,
+                                double inv_n_m1, double *data, double *datat,
+                                double *cov);
+
+typedef struct {
+    uint32_t m;
+    uint32_t n;
+    double inv_n;
+    double inv_n_m1;
+    double *data;
+    double *cov;
+    uint32_t m_tiles;
+    covariance_fp_t funcptr;
+} covariance_args_t;
diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h
index fec79d195..cdeb427bf 100644
--- a/sw/apps/covariance/src/covariance.h
+++ b/sw/apps/covariance/src/covariance.h
@@ -5,50 +5,363 @@
 // Author: Jose Pedro Castro Fonseca
 //         Luca Colagrande
 
-#include <math.h>
+#include "args.h"
+#include "blas.h"
 #include "snrt.h"
 
-void kernel_covariance(uint32_t N, uint32_t M, double *data, double *cov) {
-    int i1, i, j, k;
-    int core_range, core_offset;
-
-    // Compute deviations
-    if (snrt_is_compute_core()) {
-        // Distribute different attributes to the different cores
-        core_range = M / snrt_cluster_compute_core_num();
-        core_offset = snrt_cluster_core_idx() * core_range;
-        for (i1 = 0; i1 < core_range; i1++) {
-            i = core_offset + i1;
-
-            // Calculate mean vector
-            double mean = 0.0;
-            for (k = 0; k < N; k++) {
-                mean += data[k * M + i];
-            }
-            mean = mean / N;
+#define DOUBLE_BUFFER 1
 
-            // Standardize data to zero mean
-            for (k = 0; k < N; k++) {
-                data[k * M + i] -= mean;
-            }
+void covariance_naive(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
+                      double *data, double *datat, double *cov) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Center data
+    for (uint32_t i = offset; i < m; i += stride) {
+        // Calculate row mean
+        double data_mean = 0.0;
+        double datat_mean = 0.0;
+        for (uint32_t j = 0; j < n; j++) {
+            data_mean += data[i * n + j];
+            datat_mean += datat[i * n + j];
+        }
+        data_mean = data_mean * inv_n;
+        datat_mean = datat_mean * inv_n;
+
+        // Center row around zero
+        for (uint32_t j = 0; j < n; j++) {
+            data[i * n + j] -= data_mean;
+            datat[i * n + j] -= datat_mean;
         }
     }
+
+    snrt_fpu_fence();
+    snrt_cluster_hw_barrier();
+
+    // Compute covariance matrix
+    syrk_naive(m, n, inv_n_m1, data, datat, 0, cov);
+}
+
+void covariance_baseline(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
+                         double *data, double *datat, double *cov) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Center data
+    for (uint32_t i = offset; i < m; i += stride) {
+        // Calculate row mean
+        double data_mean = 0.0;
+        double datat_mean = 0.0;
+        for (uint32_t j = 0; j < n; j++) {
+            data_mean += data[i * n + j];
+            datat_mean += datat[i * n + j];
+        }
+        data_mean = data_mean * inv_n;
+        datat_mean = datat_mean * inv_n;
+
+        // Center row around zero
+        for (uint32_t j = 0; j < n; j++) {
+            data[i * n + j] -= data_mean;
+            datat[i * n + j] -= datat_mean;
+        }
+    }
+
+    snrt_fpu_fence();
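+    // Note: the FPU fence makes sure the centered values have actually
+    // been written back to TCDM before the barrier releases the other
+    // cores to read them in the following SYRK step.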
+    snrt_cluster_hw_barrier();
+
+    // Compute covariance matrix
+    syrk_baseline(m, n, inv_n_m1, data, datat, 0, cov);
+}
+
+void covariance_opt(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
+                    double *data, double *datat, double *cov) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    // and datagen script
+    const uint32_t unroll0 = 2;
+
+    // Configure ft0 and ft1 to load data and datat elements
+    // for (k = 0; k < 2; k++)
+    //     for (i1 = offset; i1 < m; i1 += stride * unroll0)
+    //         for (j = 0; j < n; j++)
+    //             for (i0 = 0; i0 < unroll0; i0++)
+    //                 i = i1 + i0 * stride
+    //                 ft0.push(data[i * n + j])
+    //                 ft1.push(datat[i * n + j])
+    const uint32_t ssr01_b[4] = {unroll0, n, 2, m / (stride * unroll0)};
+    const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double), 0,
+                                 sizeof(double) * n * stride * unroll0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM0, ssr01_b[0], ssr01_b[1], ssr01_b[2],
+                     ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2],
+                     ssr01_i[3]);
+    snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr01_b[0], ssr01_b[1], ssr01_b[2],
+                     ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2],
+                     ssr01_i[3]);
+    snrt_ssr_repeat(SNRT_SSR_DM0, 1);
+    // Configure ft2 to store data and datat elements
+    // for (i1 = offset; i1 < m; i1 += stride * unroll0)
+    //     for (j = 0; j < n; j++)
+    //         for (i0 = 0; i0 < unroll0; i0++)
+    //             i = i1 + i0 * stride
+    //             data[i * n + j] = ft2.pop()
+    //             datat[i * n + j] = ft2.pop()
+    const uint32_t ssr2_b[4] = {2, unroll0, n, m / (stride * unroll0)};
+    const uint32_t ssr2_i[4] = {(uint32_t)datat - (uint32_t)data,
+                                sizeof(double) * n * stride, sizeof(double),
+                                sizeof(double) * n * stride * unroll0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM2, ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3],
+                     ssr2_i[0], ssr2_i[1], ssr2_i[2], ssr2_i[3]);
+
+    // SSR start addresses need to be configured each time
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, data + offset * n);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, datat + offset * n);
+    snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_4D, data + offset * n);
+    snrt_ssr_enable();
+
+    // Center data
+    for (uint32_t i = offset; i < m; i += stride * unroll0) {
+        // Calculate row means
+        double m[2 * unroll0];
+        m[0] = 0.0;  // mean(data[i])
+        m[1] = 0.0;  // mean(datat[i])
+        m[2] = 0.0;  // mean(data[i + stride])
+        m[3] = 0.0;  // mean(datat[i + stride])
+        asm volatile(
+            "frep.o %[n_frep], %[n_insn], 0, 0 \n"
+            "fadd.d %[m0], ft0, %[m0] \n"
+            "fadd.d %[m1], ft1, %[m1] \n"
+            "fadd.d %[m2], ft0, %[m2] \n"
+            "fadd.d %[m3], ft1, %[m3] \n"
+            : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]),
+              [ m3 ] "+f"(m[3])
+            : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0)
+            : "ft0", "ft1", "ft2");
+        m[0] *= inv_n;
+        m[1] *= inv_n;
+        m[2] *= inv_n;
+        m[3] *= inv_n;
+        snrt_fpu_fence();
+
+        // Center row around zero
+        asm volatile(
+            "frep.o %[n_frep], %[n_insn], 0, 0 \n"
+            "fsub.d ft2, ft0, %[m0] \n"
+            "fsub.d ft2, ft1, %[m1] \n"
+            "fsub.d ft2, ft0, %[m2] \n"
+            "fsub.d ft2, ft1, %[m3] \n"
+            : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]),
+              [ m3 ] "+f"(m[3])
+            : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0)
+            : "ft0", "ft1", "ft2");
+    }
+
+    snrt_ssr_disable();
+
+    snrt_fpu_fence();
+    snrt_cluster_hw_barrier();
+
+    // The following is taken from the AtA kernel, where alpha is set to
+    // the factor 1/(n - 1).
+    // Here data stands for A and datat for At.
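+    // In effect, each core computes cov[i][j] = inv_n_m1 * dot(data[i],
+    // datat[j]) for its subset of rows i: ft0 replays each element of
+    // data[i] four times while ft1 streams datat[j..j+3], so every FREP
+    // iteration advances four accumulators at once (a sketch of the
+    // intent; the exact traversal order is set by the SSR config below).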
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    // and datagen script
+    const uint32_t unroll1 = 4;
+
+    // Configure ft0 and ft1 to load A and At
+    // for (i = offset; i < m; i += stride)
+    //     for (j1 = 0; j1 < m; j1 += unroll1)
+    //         for (k = 0; k < n; k++)
+    //             for (j0 = 0; j0 < unroll1; j0++)
+    //                 j = j1 + j0
+    //                 ft0.push(a[i * n + k])
+    //                 ft1.push(at[j * n + k])
+    const uint32_t ssr0_b[4] = {unroll1, n, m / unroll1, m / stride};
+    const uint32_t ssr0_i[4] = {0, sizeof(double), 0,
+                                stride * n * sizeof(double)};
+    snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1],
+                     ssr0_i[2], ssr0_i[3]);
+    snrt_ssr_repeat(SNRT_SSR_DM0, unroll1);
+    const uint32_t ssr1_b[4] = {unroll1, n, m / unroll1, m / stride};
+    const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double),
+                                unroll1 * n * sizeof(double), 0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3],
+                     ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
+
+    // SSR start addresses need to be configured each time
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, data + offset * n);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, datat);
+    snrt_ssr_enable();
+
+    for (uint32_t i = offset; i < m; i += stride) {
+        for (uint32_t j = 0; j < m; j += unroll1) {
+            double acc[unroll1];
+            acc[0] = 0;
+            acc[1] = 0;
+            acc[2] = 0;
+            acc[3] = 0;
+
+            asm volatile(
+                "frep.o %[n_frep], %[unroll1], 0, 0 \n"
+                "fmadd.d %[acc0], ft0, ft1, %[acc0] \n"
+                "fmadd.d %[acc1], ft0, ft1, %[acc1] \n"
+                "fmadd.d %[acc2], ft0, ft1, %[acc2] \n"
+                "fmadd.d %[acc3], ft0, ft1, %[acc3] \n"
+                "fmul.d %[b0], %[acc0], %[alpha] \n"
+                "fmul.d %[b1], %[acc1], %[alpha] \n"
+                "fmul.d %[b2], %[acc2], %[alpha] \n"
+                "fmul.d %[b3], %[acc3], %[alpha] \n"
+                : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                  [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]),
+                  [ b0 ] "=f"(cov[i * m + j + 0]),
+                  [ b1 ] "=f"(cov[i * m + j + 1]),
+                  [ b2 ] "=f"(cov[i * m + j + 2]),
+                  [ b3 ] "=f"(cov[i * m + j + 3])
+                : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1),
+                  [ alpha ] "f"(inv_n_m1)
+                : "ft0", "ft1", "ft2");
+        }
+    }
+
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+}
+
+void covariance_job(covariance_args_t *args) {
+    uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes;
+    uint64_t local_a0_addr, local_at0_addr, local_b0_addr, local_a1_addr,
+        local_at1_addr, local_b1_addr;
+    double *local_a[2];
+    double *local_at[2];
+    double *local_b[2];
+    uint32_t iterations, sb_iterations;
+    uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    covariance_args_t *local_args = (covariance_args_t *)snrt_l1_next();
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(covariance_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+    args = local_args;
+#endif
+
+    // Calculate size of each tile
+    m_frac = args->m / args->m_tiles;
+    a_tile_size = args->n * m_frac;
+    b_tile_size = m_frac * m_frac;
+    a_tile_bytes = a_tile_size * sizeof(double);
+    b_tile_bytes = b_tile_size * sizeof(double);
+
+    // Allocate space for job operands in TCDM
+    local_a0_addr = (uint64_t)args + sizeof(covariance_args_t);
+    local_at0_addr = local_a0_addr + a_tile_bytes;
+    local_b0_addr = local_at0_addr + a_tile_bytes;
+    local_a[0] = (double *)local_a0_addr;
+    local_at[0] = (double *)local_at0_addr;
+    local_b[0] = (double *)local_b0_addr;
+    if (DOUBLE_BUFFER) {
+        local_a1_addr = local_b0_addr + b_tile_bytes;
+        local_at1_addr = local_a1_addr + a_tile_bytes;
+        local_b1_addr = local_at1_addr + a_tile_bytes;
+        local_a[1] = (double *)local_a1_addr;
+        local_at[1] = (double *)local_at1_addr;
+        local_b[1] = (double *)local_b1_addr;
+    }
+
+    // Calculate number of iterations
+    sb_iterations = args->m_tiles * args->m_tiles;
+    if (DOUBLE_BUFFER)
+        iterations = sb_iterations + 2;
+    else
+        iterations = sb_iterations;
+
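+    // With double buffering the loop below is a three-stage software
+    // pipeline: in iteration i the DM core fetches tile i and writes back
+    // tile i - 2 while the compute cores work on tile i - 1, hence the
+    // two extra fill/drain iterations.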
+    // Iterate over all tiles
+    for (i = 0; i < iterations; i++) {
+        if (snrt_is_dm_core()) {
+            // DMA in
+            if (!DOUBLE_BUFFER || (i < sb_iterations)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_in = i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+                i_row = i_dma_in / args->m_tiles;
+                i_col = i_dma_in % args->m_tiles;
+
+                // Copy job operands in TCDM
+                snrt_dma_load_1d_tile(local_a[buff_idx], args->data, i_row,
+                                      a_tile_size, sizeof(double));
+                snrt_dma_load_1d_tile(local_at[buff_idx], args->data, i_col,
+                                      a_tile_size, sizeof(double));
+                snrt_dma_wait_all();
 
-    // Compute covariance
-    if (snrt_is_compute_core()) {
-        for (i1 = 0; i1 < core_range; i1++) {
-            i = core_offset + i1;
-            for (j = 0; j <= i; j++) {
-                double tmp = 0.0;
-                for (k = 0; k < N; k++) {
-                    tmp += data[k * M + i] * data[k * M + j];
-                }
-                cov[i * M + j] = tmp / (N - 1);
-                cov[j * M + i] = cov[i * M + j];
+                snrt_mcycle();
             }
+
+            // Additional barriers required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // Additional barrier required to synchronize the compute cores
+            // among them after the data centering phase
+            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1)))
+                snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // DMA out
+            if (!DOUBLE_BUFFER || (i > 1)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
+                i_row = i_dma_out / args->m_tiles;
+                i_col = i_dma_out % args->m_tiles;
+
+                // Copy job outputs from TCDM
+                snrt_dma_store_2d_tile(args->cov, local_b[buff_idx], i_row,
+                                       i_col, m_frac, m_frac, args->m,
+                                       sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+        }
+
+        // Compute
+        if (snrt_is_compute_core()) {
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_compute = DOUBLE_BUFFER ? i - 1 : i;
+                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+
+                // Perform tile computation
+                covariance_fp_t fp = args->funcptr;
+                fp(m_frac, args->n, args->inv_n, args->inv_n_m1,
+                   local_a[buff_idx], local_at[buff_idx], local_b[buff_idx]);
+
+                snrt_mcycle();
+            }
+
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
         }
+        // Synchronize cores after every iteration
+        snrt_cluster_hw_barrier();
     }
 }
diff --git a/sw/apps/covariance/src/main.c b/sw/apps/covariance/src/main.c
index 26b151393..112ead333 100644
--- a/sw/apps/covariance/src/main.c
+++ b/sw/apps/covariance/src/main.c
@@ -1,56 +1,16 @@
-// Copyright 2023 ETH Zurich and University of Bologna.
+// Copyright 2024 ETH Zurich and University of Bologna.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
-// Author: Jose Pedro Castro Fonseca
-//         Luca Colagrande
+// Author: Luca Colagrande
+
+#include "snrt.h"
 
 #include "covariance.h"
 #include "data.h"
 
-#define MAX_ERROR 1e-10
-
 int main() {
-    uint32_t nerr = 0;
-    double *local_mean;
-    double *local_cov;
-    double *local_data;
-    double diff;
-
-    local_data = snrt_l1_next();
-    local_cov = local_data + N * M;
-
-    // Initialize input matrix
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(local_data, data, sizeof(double) * N * M);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
-
-    // Perform Computations
-    kernel_covariance(N, M, local_data, local_cov);
-    snrt_cluster_hw_barrier();
-
-    // Writeback outputs
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(cov, local_cov, sizeof(double) * M * M);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
-
-#ifdef BIST
-    // Check computation is correct
-    if (snrt_cluster_core_idx() == 0) {
-        for (int i = 0; i < M; i++) {
-            for (int j = 0; j < M; j++) {
-                diff = fabs(golden[i * M + j] - local_cov[i * M + j]);
-                if (diff > MAX_ERROR) {
-                    nerr++;
-                }
-            }
-        }
-    }
-#endif
+    covariance_job(&args);
 
-    return nerr;
+    return 0;
 }
diff --git a/sw/apps/doitgen/data/params.json b/sw/apps/doitgen/data/params.json
new file mode 100644
index 000000000..4417f0c35
--- /dev/null
+++ b/sw/apps/doitgen/data/params.json
@@ -0,0 +1,12 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    "r": 16,
+    "q": 16,
+    "s": 32,
+    "r_tiles": 2,
+    "q_tiles": 2,
+    "funcptr": "doitgen_baseline"
+}
diff --git a/sw/apps/doitgen/scripts/datagen.py b/sw/apps/doitgen/scripts/datagen.py
new file mode 100755
index 000000000..d1a9c3b46
--- /dev/null
+++ b/sw/apps/doitgen/scripts/datagen.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Author: Luca Colagrande
+
+import numpy as np
+
+import snitch.util.sim.data_utils as du
+
+np.random.seed(42)
+
+DOUBLE_BUFFER = True
+
+
+class DoitgenDataGen(du.DataGen):
+
+    # Function pointers to alternative implementations
+    FUNCPTRS = ["doitgen_naive", "doitgen_baseline", "doitgen_opt"]
+
+    def golden_model(self, A, x):
+        R, Q, S = A.shape
+        P, _ = x.shape
+        Aout = np.ndarray((R, Q, P))
+        for r in range(R):
+            for q in range(Q):
+                for p in range(P):
+                    Aout[r, q, p] = 0
+                    for s in range(S):
+                        Aout[r, q, p] += A[r, q, s] * x[p, s]
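+        # Equivalent one-liner (sketch): np.einsum('rqs,ps->rqp', A, x),
+        # i.e. a batch of R*Q dot products between A[r, q, :] and rows of x.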
+        return Aout
+
+    def validate(self, **kwargs):
+        n_cores = 8
+        assert (kwargs['r'] % kwargs['r_tiles']) == 0, "r must be an integer multiple of r_tiles"
+        assert (kwargs['q'] % kwargs['q_tiles']) == 0, "q must be an integer multiple of q_tiles"
+        if kwargs['funcptr'] != 'doitgen_naive':
+            assert (kwargs['s'] % 4) == 0, "s must be an integer multiple of unrolling factor"
+        r_per_tile = kwargs['r'] / kwargs['r_tiles']
+        q_per_tile = kwargs['q'] / kwargs['q_tiles']
+        assert (r_per_tile % n_cores) == 0, "r_per_tile must be an integer multiple of n_cores"
+        assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
+
+        # Calculate total TCDM occupation
+        a_tile_size = r_per_tile * q_per_tile * kwargs['s'] * 8
+        x_size = kwargs['s'] * kwargs['s'] * 8
+        total_size = 2 * a_tile_size + x_size
+        if DOUBLE_BUFFER:
+            total_size *= 2
+        du.validate_tcdm_footprint(total_size)
+
+    def emit_header(self, **kwargs):
+        header = [super().emit_header()]
+
+        self.validate(**kwargs)
+
+        A = du.generate_random_array((kwargs['r'], kwargs['q'], kwargs['s']))
+        x = du.generate_random_array((kwargs['s'], kwargs['s']))
+
+        _ = self.golden_model(A, x)
+
+        A = A.flatten()
+        x = x.flatten()
+
+        A_uid = 'A'
+        x_uid = 'x'
+
+        cfg = {
+            'r': kwargs['r'],
+            'q': kwargs['q'],
+            's': kwargs['s'],
+            'A': A_uid,
+            'x': x_uid,
+            'r_tiles': kwargs['r_tiles'],
+            'q_tiles': kwargs['q_tiles'],
+            'funcptr': kwargs['funcptr']
+        }
+
+        header += [du.format_array_definition('double', A_uid, A)]
+        header += [du.format_array_definition('double', x_uid, x)]
+        header += [du.format_struct_definition('doitgen_args_t', 'args', cfg)]
+        header = '\n\n'.join(header)
+
+        return header
+
+
+if __name__ == '__main__':
+    DoitgenDataGen().main()
diff --git a/sw/apps/doitgen/scripts/verify.py b/sw/apps/doitgen/scripts/verify.py
new file mode 100755
index 000000000..8f72b0415
--- /dev/null
+++ b/sw/apps/doitgen/scripts/verify.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import numpy as np
+import sys
+from datagen import DoitgenDataGen
+
+from snitch.util.sim.verif_utils import Verifier
+
+
+class DoitgenVerifier(Verifier):
+
+    OUTPUT_UIDS = ['A']
+
+    def __init__(self):
+        super().__init__()
+        self.func_args = {
+            'r': 'I',
+            'q': 'I',
+            's': 'I',
+            'A': 'I',
+            'x': 'I',
+            'r_tiles': 'I',
+            'q_tiles': 'I',
+            'funcptr': 'I'
+        }
+        self.func_args = self.get_input_from_symbol('args', self.func_args)
+
+    def get_actual_results(self):
+        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double')
+
+    def get_expected_results(self):
+        A = self.get_input_from_symbol('A', 'double')
+        A = np.reshape(A, (self.func_args['r'], self.func_args['q'], self.func_args['s']))
+        x = self.get_input_from_symbol('x', 'double')
+        x = np.reshape(x, (self.func_args['s'], self.func_args['s']))
+        return DoitgenDataGen().golden_model(A, x).flatten()
+
+    def check_results(self, *args):
+        return super().check_results(*args, rtol=1e-10)
+
+
+if __name__ == "__main__":
+    sys.exit(DoitgenVerifier().main())
diff --git a/sw/apps/doitgen/src/args.h b/sw/apps/doitgen/src/args.h
new file mode 100644
index 000000000..5d3f56ce4
--- /dev/null
+++ b/sw/apps/doitgen/src/args.h
@@ -0,0 +1,22 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#pragma once
+#include <stdint.h>
+
+typedef void (*doitgen_fp_t)(uint32_t r, uint32_t q, uint32_t s, double *A,
+                             double *x, double *Aout);
+
+typedef struct {
+    uint32_t r;
+    uint32_t q;
+    uint32_t s;
+    double *A;
+    double *x;
+    uint32_t r_tiles;
+    uint32_t q_tiles;
+    doitgen_fp_t funcptr;
+} doitgen_args_t;
diff --git a/sw/apps/doitgen/src/doitgen.h b/sw/apps/doitgen/src/doitgen.h
new file mode 100644
index 000000000..2f7bc6128
--- /dev/null
+++ b/sw/apps/doitgen/src/doitgen.h
@@ -0,0 +1,303 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include "args.h"
+#include "snrt.h"
+
+#define DOUBLE_BUFFER 1
+
+__thread int setup_ssr = 1;
+
+void doitgen_naive(uint32_t r, uint32_t q, uint32_t s, double *A, double *x,
+                   double *Aout) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    for (uint32_t i = offset; i < r; i += stride) {
+        for (uint32_t j = 0; j < q; j++) {
+            for (uint32_t k = 0; k < s; k++) {
+                Aout[i * q * s + j * s + k] = 0.0;
+                for (uint32_t l = 0; l < s; l++) {
+                    Aout[i * q * s + j * s + k] +=
+                        A[i * q * s + j * s + l] * x[k * s + l];
+                }
+            }
+        }
+    }
+
+    snrt_fpu_fence();
+}
+
+void doitgen_baseline(uint32_t r, uint32_t q, uint32_t s, double *A, double *x,
+                      double *Aout) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Unrolling factors
+    // Note: changes must be reflected in the inline assembly code
+    // and datagen script
+    const uint32_t unroll1 = 4;
+    const uint32_t unroll0 = 4;
+
+    for (uint32_t i = offset; i < r; i += stride) {
+        for (uint32_t j = 0; j < q; j++) {
+            for (uint32_t k = 0; k < s; k += unroll1) {
+                double acc[4];
+                acc[0] = 0;
+                acc[1] = 0;
+                acc[2] = 0;
+                acc[3] = 0;
+
+                for (uint32_t l = 0; l < s; l += unroll0) {
+                    asm volatile(
+                        "fmadd.d %[acc0], %[a0], %[x0], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a0], %[x1], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a0], %[x2], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a0], %[x3], %[acc3] \n"
+                        "fmadd.d %[acc0], %[a1], %[x4], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a1], %[x5], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a1], %[x6], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a1], %[x7], %[acc3] \n"
+                        "fmadd.d %[acc0], %[a2], %[x8], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a2], %[x9], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a2], %[x10], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a2], %[x11], %[acc3] \n"
+                        "fmadd.d %[acc0], %[a3], %[x12], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a3], %[x13], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a3], %[x14], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a3], %[x15], %[acc3] \n"
+                        : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                          [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3])
+                        : [ a0 ] "f"(A[i * q * s + j * s + l + 0]),
+                          [ a1 ] "f"(A[i * q * s + j * s + l + 1]),
+                          [ a2 ] "f"(A[i * q * s + j * s + l + 2]),
+                          [ a3 ] "f"(A[i * q * s + j * s + l + 3]),
+                          [ x0 ] "f"(x[(k + 0) * s + l + 0]),
+                          [ x1 ] "f"(x[(k + 1) * s + l + 0]),
+                          [ x2 ] "f"(x[(k + 2) * s + l + 0]),
+                          [ x3 ] "f"(x[(k + 3) * s + l + 0]),
+                          [ x4 ] "f"(x[(k + 0) * s + l + 1]),
+                          [ x5 ] "f"(x[(k + 1) * s + l + 1]),
+                          [ x6 ] "f"(x[(k + 2) * s + l + 1]),
+                          [ x7 ] "f"(x[(k + 3) * s + l + 1]),
+                          [ x8 ] "f"(x[(k + 0) * s + l + 2]),
+                          [ x9 ] "f"(x[(k + 1) * s + l + 2]),
+                          [ x10 ] "f"(x[(k + 2) * s + l + 2]),
+                          [ x11 ] "f"(x[(k + 3) * s + l + 2]),
+                          [ x12 ] "f"(x[(k + 0) * s + l + 3]),
+                          [ x13 ] "f"(x[(k + 1) * s + l + 3]),
+                          [ x14 ] "f"(x[(k + 2) * s + l + 3]),
+                          [ x15 ] "f"(x[(k + 3) * s + l + 3])
+                        :);
+                }
+
+                Aout[i * q * s + j * s + k + 0] = acc[0];
+                Aout[i * q * s + j * s + k + 1] = acc[1];
+                Aout[i * q * s + j * s + k + 2] = acc[2];
+                Aout[i * q * s + j * s + k + 3] = acc[3];
+            }
+        }
+    }
+
+    snrt_fpu_fence();
+}
+
+void doitgen_opt(uint32_t r, uint32_t q, uint32_t s, double *A, double *x,
+                 double *Aout) {
+    uint32_t bound = r / snrt_cluster_compute_core_num();
+    uint32_t offset = bound * snrt_cluster_core_idx();
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    // and datagen script
+    const uint32_t unroll = 4;
+
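+    // Note: the loop bounds and strides below depend only on the tile
+    // shape, which is identical for every tile, so each core configures
+    // the SSR loop nest once and caches that in the thread-local
+    // setup_ssr flag; only the start addresses change per invocation.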
+    if (setup_ssr) {
+        // Configure ft0 and ft1 to load A and x
+        // for (i = offset; i < bound; i++)
+        //     for (j = 0; j < q; j++)
+        //         for (k1 = 0; k1 < s; k1 += unroll)
+        //             for (l = 0; l < s; l++)
+        //                 for (k0 = 0; k0 < unroll; k0++)
+        //                     k = k1 + k0
+        //                     ft0.push(A[i * q * s + j * s + l])
+        //                     ft1.push(x[k * s + l])
+        const uint32_t ssr0_b[4] = {unroll, s, s / unroll, q * bound};
+        const uint32_t ssr0_i[4] = {0, sizeof(double), 0, s * sizeof(double)};
+        snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3],
+                         ssr0_i[1], ssr0_i[2], ssr0_i[3]);
+        snrt_ssr_repeat(SNRT_SSR_DM0, unroll);
+        const uint32_t ssr1_b[4] = {unroll, s, s / unroll, q * bound};
+        const uint32_t ssr1_i[4] = {s * sizeof(double), sizeof(double),
+                                    unroll * s * sizeof(double), 0};
+        snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2],
+                         ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
+        setup_ssr = 0;
+    }
+
+    // SSR start addresses need to be configured each time
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, A + offset * q * s);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, x);
+    snrt_ssr_enable();
+
+    for (uint32_t i = offset; i < (offset + bound); i++) {
+        for (uint32_t j = 0; j < q; j++) {
+            for (uint32_t k = 0; k < s; k += unroll) {
+                double acc[unroll];
+                acc[0] = 0;
+                acc[1] = 0;
+                acc[2] = 0;
+                acc[3] = 0;
+
+                asm volatile(
+                    "frep.o %[n_frep], %[unroll], 0, 0 \n"
+                    "fmadd.d %[acc0], ft0, ft1, %[acc0] \n"
+                    "fmadd.d %[acc1], ft0, ft1, %[acc1] \n"
+                    "fmadd.d %[acc2], ft0, ft1, %[acc2] \n"
+                    "fmadd.d %[acc3], ft0, ft1, %[acc3] \n"
+                    : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                      [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3])
+                    : [ n_frep ] "r"(s - 1), [ unroll ] "i"(unroll)
+                    : "ft0", "ft1", "ft2");
+
+                Aout[i * q * s + j * s + k + 0] = acc[0];
+                Aout[i * q * s + j * s + k + 1] = acc[1];
+                Aout[i * q * s + j * s + k + 2] = acc[2];
+                Aout[i * q * s + j * s + k + 3] = acc[3];
+            }
+        }
+    }
+
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+}
+
+void doitgen_job(doitgen_args_t *args) {
+    uint32_t r_frac, q_frac, a_tile_size, a_tile_bytes, x_size, x_bytes;
+    uint64_t local_a0_addr, local_aout0_addr, local_x0_addr, local_a1_addr,
+        local_aout1_addr;
+    double *local_a[2];
+    double *local_aout[2];
+    double *local_x;
+    uint32_t iterations, sb_iterations;
+    uint32_t i, i_dma_in, i_compute, i_dma_out, i_r, i_q, buff_idx;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    doitgen_args_t *local_args = (doitgen_args_t *)snrt_l1_next();
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(doitgen_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+    args = local_args;
+#endif
+
+    // Calculate size of each tile
+    r_frac = args->r / args->r_tiles;
+    q_frac = args->q / args->q_tiles;
+    a_tile_size = r_frac * q_frac * args->s;
+    x_size = args->s * args->s;
+    a_tile_bytes = a_tile_size * sizeof(double);
+    x_bytes = x_size * sizeof(double);
+
+    // Allocate space for job operands in TCDM
+    local_x0_addr = (uint64_t)args + sizeof(doitgen_args_t);
+    local_a0_addr = local_x0_addr + x_bytes;
+    local_aout0_addr = local_a0_addr + a_tile_bytes;
+    local_x = (double *)local_x0_addr;
+    local_a[0] = (double *)local_a0_addr;
+    local_aout[0] = (double *)local_aout0_addr;
+    if (DOUBLE_BUFFER) {
+        local_a1_addr = local_aout0_addr + a_tile_bytes;
+        local_aout1_addr = local_a1_addr + a_tile_bytes;
+        local_a[1] = (double *)local_a1_addr;
+        local_aout[1] = (double *)local_aout1_addr;
+    }
+
+    // Calculate number of iterations
+    sb_iterations = args->r_tiles * args->q_tiles;
+    if (DOUBLE_BUFFER)
+        iterations = sb_iterations + 2;
+    else
+        iterations = sb_iterations;
+
+    // Iterate over all tiles
+    for (i = 0; i < iterations; i++) {
+        if (snrt_is_dm_core()) {
+            // DMA in
+            if (!DOUBLE_BUFFER || (i < sb_iterations)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_in = i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+                i_r = i_dma_in / args->q_tiles;
+                i_q = i_dma_in % args->q_tiles;
+
+                // Copy job operands in TCDM
+                snrt_dma_load_2d_tile(local_a[buff_idx], args->A, i_r, i_q,
+                                      r_frac, q_frac * args->s,
+                                      args->q * args->s, sizeof(double));
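+                // x is shared by all tiles and resides in TCDM for the
+                // whole job, so it only needs to be copied in once, on
+                // the first DMA-in iteration.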
+                if (i_dma_in == 0) snrt_dma_start_1d(local_x, args->x, x_bytes);
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+
+            // Additional barriers required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // DMA out
+            if (!DOUBLE_BUFFER || (i > 1)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
+                i_r = i_dma_out / args->q_tiles;
+                i_q = i_dma_out % args->q_tiles;
+
+                // Copy job outputs from TCDM
+                snrt_dma_store_2d_tile(args->A, local_aout[buff_idx], i_r, i_q,
+                                       r_frac, q_frac * args->s,
+                                       args->q * args->s, sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+        }
+
+        // Compute
+        if (snrt_is_compute_core()) {
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_compute = DOUBLE_BUFFER ? i - 1 : i;
+                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+
+                // Perform tile computation
+                doitgen_fp_t fp = args->funcptr;
+                fp(r_frac, q_frac, args->s, local_a[buff_idx], local_x,
+                   local_aout[buff_idx]);
+
+                snrt_mcycle();
+            }
+
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+        }
+        // Synchronize cores after every iteration
+        snrt_cluster_hw_barrier();
+    }
+}
diff --git a/sw/apps/doitgen/src/main.c b/sw/apps/doitgen/src/main.c
new file mode 100644
index 000000000..64c9571f8
--- /dev/null
+++ b/sw/apps/doitgen/src/main.c
@@ -0,0 +1,17 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include "snrt.h"
+
+#include "doitgen.h"
+
+#include "data.h"
+
+int main() {
+    doitgen_job(&args);
+
+    return 0;
+}
diff --git a/sw/blas/.gitignore b/sw/blas/.gitignore
deleted file mode 100644
index 2ff975f29..000000000
--- a/sw/blas/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-**/data/data.h
\ No newline at end of file
diff --git a/sw/blas/axpy/data/params.json b/sw/blas/axpy/data/params.json
index 2f8f5871c..a4fa15275 100644
--- a/sw/blas/axpy/data/params.json
+++ b/sw/blas/axpy/data/params.json
@@ -3,5 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 {
-    n: 384
+    "n_tiles": 3,
+    "n": 384,
+    "funcptr": "axpy_opt"
 }
diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py
index 117495391..38634dd5e 100755
--- a/sw/blas/axpy/scripts/datagen.py
+++ b/sw/blas/axpy/scripts/datagen.py
@@ -5,45 +5,68 @@
 #
 # Author: Luca Colagrande
 
-import numpy as np
 import sys
 
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
+import snitch.util.sim.data_utils as du
 
 
-class AxpyDataGen(DataGen):
+class AxpyDataGen(du.DataGen):
 
-    MIN = -1000
-    MAX = +1000
     # AXI splits bursts crossing 4KB address boundaries. To minimize
    # the occurrence of these splits the data should be aligned to 4KB
     BURST_ALIGNMENT = 4096
+    # Function pointers to alternative implementations
+    FUNCPTRS = ["axpy_naive", "axpy_fma", "axpy_opt"]
 
     def golden_model(self, a, x, y):
         return a*x + y
 
+    def validate_config(self, **kwargs):
+        assert kwargs['n'] % kwargs['n_tiles'] == 0, "n must be an integer multiple of n_tiles"
+        n_per_tile = kwargs['n'] // kwargs['n_tiles']
+        assert (n_per_tile % 8) == 0, "n must be an integer multiple of the number of cores"
+        assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
+
+        # Calculate total TCDM occupation
+        # Note: doesn't account for gaps created by data alignment
+        vec_size = n_per_tile * 8
+        total_size = 2 * 3 * vec_size
+        du.validate_tcdm_footprint(total_size)
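+        # e.g. (illustrative): with the default n=384 and n_tiles=3,
+        # n_per_tile=128, so the double-buffered x/y/z working set is
+        # 2*3*128*8 = 6144 B.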
+
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
-        n = kwargs['n']
-        a = np.random.uniform(self.MIN, self.MAX, 1)
-        x = np.random.uniform(self.MIN, self.MAX, n)
-        y = np.random.uniform(self.MIN, self.MAX, n)
+        self.validate_config(**kwargs)
+
+        a = du.generate_random_array(1)[0]
+        x = du.generate_random_array(kwargs['n'])
+        y = du.generate_random_array(kwargs['n'])
         g = self.golden_model(a, x, y)
 
-        assert (n % 8) == 0, "n must be an integer multiple of the number of cores"
-
-        header += [format_scalar_definition('const uint32_t', 'n', n)]
-        header += [format_scalar_definition('const double', 'a', a[0])]
-        header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT,
-                                           section=kwargs['section'])]
-        header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT,
-                                           section=kwargs['section'])]
-        header += [format_array_declaration('double', 'z', [n], alignment=self.BURST_ALIGNMENT,
-                                            section=kwargs['section'])]
-        result_def = format_array_definition('double', 'g', g)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        x_uid = 'x'
+        y_uid = 'y'
+        z_uid = 'z'
+
+        cfg = {
+            'n': kwargs['n'],
+            'a': a,
+            'x': x_uid,
+            'y': y_uid,
+            'z': z_uid,
+            'n_tiles': kwargs['n_tiles'],
+            'funcptr': kwargs['funcptr']
+        }
+
+        header += [du.format_scalar_definition('const double', 'a', a)]
+        header += [du.format_array_definition('double', x_uid, x,
+                                              alignment=self.BURST_ALIGNMENT,
+                                              section=kwargs['section'])]
+        header += [du.format_array_definition('double', y_uid, y,
+                                              alignment=self.BURST_ALIGNMENT,
+                                              section=kwargs['section'])]
+        header += [du.format_array_declaration('double', z_uid, x.shape,
+                                               alignment=self.BURST_ALIGNMENT,
+                                               section=kwargs['section'])]
+        header += [du.format_struct_definition('axpy_args_t', 'args', cfg)]
+        result_def = du.format_array_definition('double', 'g', g)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/blas/axpy/src/args.h b/sw/blas/axpy/src/args.h
new file mode 100644
index 000000000..c5d542852
--- /dev/null
+++ b/sw/blas/axpy/src/args.h
@@ -0,0 +1,19 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <stdint.h>
+
+typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y,
+                          double* z);
+
+typedef struct {
+    uint32_t n;
+    double a;
+    double* x;
+    double* y;
+    double* z;
+    uint32_t n_tiles;
+    axpy_fp_t funcptr;
+} axpy_args_t;
diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h
index e8f5ae6c0..8ded48167 100644
--- a/sw/blas/axpy/src/axpy.h
+++ b/sw/blas/axpy/src/axpy.h
@@ -2,28 +2,49 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+#include "args.h"
 #include "snrt.h"
 
-inline void axpy(uint32_t n, double a, double* x, double* y, double* z) {
+#define DOUBLE_BUFFER 1
+
+#define BANK_ALIGNMENT 8
+#define TCDM_ALIGNMENT (32 * BANK_ALIGNMENT)
+#define ALIGN_UP_TCDM(addr) ALIGN_UP(addr, TCDM_ALIGNMENT)
+
+static inline void axpy_naive(uint32_t n, double a, double *x, double *y,
+                              double *z) {
     int core_idx = snrt_cluster_core_idx();
     int frac = n / snrt_cluster_compute_core_num();
-    int offset = core_idx * frac;
+    int offset = core_idx;
+
+    for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) {
+        z[i] = a * x[i] + y[i];
+    }
+    snrt_fpu_fence();
+}
 
-#ifndef XSSR
+static inline void axpy_fma(uint32_t n, double a, double *x, double *y,
+                            double *z) {
+    int core_idx = snrt_cluster_core_idx();
+    int frac = n / snrt_cluster_compute_core_num();
+    int offset = core_idx;
 
-    for (int i = 0; i < frac; i++) {
-        z[offset] = a * x[offset] + y[offset];
-        offset++;
+    for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) {
+        asm volatile("fmadd.d %[z], %[a], %[x], %[y] \n"
+                     : [ z ] "=f"(z[i])
+                     : [ a ] "f"(a), [ x ] "f"(x[i]), [ y ] "f"(y[i]));
     }
     snrt_fpu_fence();
+}
 
-#else
+static inline void axpy_opt(uint32_t n, double a, double *x, double *y,
+                            double *z) {
+    int core_idx = snrt_cluster_core_idx();
+    int frac = n / snrt_cluster_compute_core_num();
+    int offset = core_idx;
 
-    // TODO(colluca): revert once Banshee supports SNRT_SSR_DM_ALL
-    // snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, frac, sizeof(double));
-    snrt_ssr_loop_1d(SNRT_SSR_DM0, frac, sizeof(double));
-    snrt_ssr_loop_1d(SNRT_SSR_DM1, frac, sizeof(double));
-    snrt_ssr_loop_1d(SNRT_SSR_DM2, frac, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, frac,
+                     snrt_cluster_compute_core_num() * sizeof(double));
 
     snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x + offset);
     snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, y + offset);
@@ -40,6 +61,131 @@ inline void axpy(uint32_t n, double a, double* x, double* y, double* z) {
 
     snrt_fpu_fence();
     snrt_ssr_disable();
+}
+
+static inline void axpy_job(axpy_args_t *args) {
+    uint32_t frac, offset, size;
+    uint64_t local_x0_addr, local_y0_addr, local_z0_addr, local_x1_addr,
+        local_y1_addr, local_z1_addr;
+    double *local_x[2];
+    double *local_y[2];
+    double *local_z[2];
+    double *remote_x, *remote_y, *remote_z;
+    uint32_t iterations, i, i_dma_in, i_compute, i_dma_out, buff_idx;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    axpy_args_t *local_args = (axpy_args_t *)snrt_l1_next();
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(axpy_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+    args = local_args;
 #endif
+
+    // Calculate size of each tile
+    frac = args->n / args->n_tiles;
+    size = frac * sizeof(double);
+
+    // Allocate space for job operands in TCDM
+    // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th.
+    local_x0_addr = ALIGN_UP_TCDM((uint64_t)args + sizeof(axpy_args_t));
+    local_y0_addr = ALIGN_UP_TCDM(local_x0_addr + size) + 8 * BANK_ALIGNMENT;
+    local_z0_addr = ALIGN_UP_TCDM(local_y0_addr + size) + 16 * BANK_ALIGNMENT;
+    local_x[0] = (double *)local_x0_addr;
+    local_y[0] = (double *)local_y0_addr;
+    local_z[0] = (double *)local_z0_addr;
+    if (DOUBLE_BUFFER) {
+        local_x1_addr = ALIGN_UP_TCDM(local_z0_addr + size);
+        local_y1_addr =
+            ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT;
+        local_z1_addr =
+            ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT;
+        local_x[1] = (double *)local_x1_addr;
+        local_y[1] = (double *)local_y1_addr;
+        local_z[1] = (double *)local_z1_addr;
+    }
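+    // Rationale (sketch): with the offsets above, a core's x, y and z
+    // accesses in any given cycle fall in banks that differ by 8 and 16
+    // (mod 32), so the two SSR reads and the SSR write never collide on
+    // the same TCDM bank.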
+
+    // Calculate number of iterations
+    iterations = args->n_tiles;
+    if (DOUBLE_BUFFER) iterations += 2;
+
+    // Iterate over all tiles
+    for (i = 0; i < iterations; i++) {
+        if (snrt_is_dm_core()) {
+            // DMA in
+            if (!DOUBLE_BUFFER || (i < args->n_tiles)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_in = i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+
+                // Calculate size and pointers to current tile
+                offset = i_dma_in * frac;
+                remote_x = args->x + offset;
+                remote_y = args->y + offset;
+
+                // Copy job operands in TCDM
+                snrt_dma_start_1d(local_x[buff_idx], remote_x, size);
+                snrt_dma_start_1d(local_y[buff_idx], remote_y, size);
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+
+            // Additional barriers required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // DMA out
+            if (!DOUBLE_BUFFER || (i > 1)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
+
+                // Calculate pointers to current tile
+                offset = i_dma_out * frac;
+                remote_z = args->z + offset;
+
+                // Copy job outputs from TCDM
+                snrt_dma_start_1d(remote_z, local_z[buff_idx], size);
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+        }
+
+        // Compute
+        if (snrt_is_compute_core()) {
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER || (i > 0 && i < (args->n_tiles + 1))) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_compute = DOUBLE_BUFFER ? i - 1 : i;
+                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+
+                // Perform tile computation
+                axpy_fp_t fp = args->funcptr;
+                fp(frac, args->a, local_x[buff_idx], local_y[buff_idx],
+                   local_z[buff_idx]);
+
+                snrt_mcycle();
+            }
+
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+        }
+
+        // Synchronize cores after every iteration
+        snrt_cluster_hw_barrier();
+    }
 }
diff --git a/sw/blas/axpy/src/main.c b/sw/blas/axpy/src/main.c
index 22f3dd129..e0389d25d 100644
--- a/sw/blas/axpy/src/main.c
+++ b/sw/blas/axpy/src/main.c
@@ -4,64 +4,24 @@
 
 #include "snrt.h"
 
-#define XSSR
 #include "axpy.h"
 #include "data.h"
 
 int main() {
-    double *local_x, *local_y, *local_z;
-    double *remote_x, *remote_y, *remote_z;
-
-    // Calculate size and pointers for each cluster
-    uint32_t frac = n / snrt_cluster_num();
-    uint32_t offset = frac * snrt_cluster_idx();
-    remote_x = x + offset;
-    remote_y = y + offset;
-    remote_z = z + offset;
-
-    // Allocate space in TCDM
-    local_x = (double *)snrt_l1_next();
-    local_y = local_x + frac;
-    local_z = local_y + frac;
-
-    // Copy data in TCDM
-    if (snrt_is_dm_core()) {
-        size_t size = frac * sizeof(double);
-        snrt_dma_start_1d(local_x, remote_x, size);
-        snrt_dma_start_1d(local_y, remote_y, size);
-        snrt_dma_wait_all();
-    }
-
-    snrt_cluster_hw_barrier();
-
-    // Compute
-    if (!snrt_is_dm_core()) {
-        uint32_t start_cycle = snrt_mcycle();
-        axpy(frac, a, local_x, local_y, local_z);
-        uint32_t end_cycle = snrt_mcycle();
-    }
-
-    snrt_cluster_hw_barrier();
-
-    // Copy data out of TCDM
-    if (snrt_is_dm_core()) {
-        size_t size = frac * sizeof(double);
-        snrt_dma_start_1d(remote_z, local_z, size);
-        snrt_dma_wait_all();
-    }
-
-    snrt_cluster_hw_barrier();
+    axpy_job(&args);
 
     // TODO: currently only works for single cluster otherwise need to
     // synchronize all cores here
 #ifdef BIST
+    uint32_t n = args.n;
+    double* z = args.z;
     uint32_t nerr = n;
 
     // Check computation is correct
     if (snrt_global_core_idx() == 0) {
         for (int i = 0; i < n; i++) {
-            if (local_z[i] == g[i]) nerr--;
-            printf("%d %d\n", local_z[i], g[i]);
+            if (z[i] == g[i]) nerr--;
+            printf("%d %d\n", z[i], g[i]);
         }
     }
diff --git a/sw/blas/blas.h b/sw/blas/blas.h
index 33c29e175..69005ccb7 100644
--- a/sw/blas/blas.h
+++ b/sw/blas/blas.h
@@ -4,6 +4,20 @@
 
 #pragma once
 
+// Floating-point multiplications by zero cannot be optimized as in some
+// edge cases they do not yield zero:
+// - 0f * NaN = NaN
+// - 0f * INFINITY = NaN
+// Thus in order to optimize it, we need to test for zero. You can use this
+// function for free when `multiplier` is a constant.
+static inline double multiply_opt(double multiplicand, double multiplier) {
+    if (multiplier)
+        return multiplicand * multiplier;
+    else
+        return 0;
+}
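+// Illustrative use (not from this patch): an epilogue such as
+//     c[i] = multiply_opt(c[i], beta) + alpha * ab;
+// stays exact when beta == 0 and c[i] holds NaN or infinity, yet costs
+// nothing when beta is a compile-time constant, as the branch folds away.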
-# SPDX-License-Identifier: Apache-2.0 - -# Usage of absolute paths is required to externally include this Makefile -MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -DATA_DIR := $(realpath $(MK_DIR)/data) -SRC_DIR := $(realpath $(MK_DIR)/src) - -DATA_CFG ?= $(DATA_DIR)/params.json -SECTION ?= - -APP ?= dot -SRCS ?= $(realpath $(SRC_DIR)/main.c) -INCDIRS ?= $(dir $(DATA_H)) $(SRC_DIR) - -DATAGEN_PY = $(MK_DIR)/scripts/datagen.py -DATA_H ?= $(DATA_DIR)/data.h - -$(dir $(DATA_H)): - mkdir -p $@ - -$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) | $(dir $(DATA_H)) - $< -c $(DATA_CFG) --section="$(SECTION)" $@ - -.PHONY: clean-data clean - -clean-data: - rm -f $(DATA_H) - -clean: clean-data diff --git a/sw/blas/dot/scripts/datagen.py b/sw/blas/dot/scripts/datagen.py index 01560c51f..8a8631a6a 100755 --- a/sw/blas/dot/scripts/datagen.py +++ b/sw/blas/dot/scripts/datagen.py @@ -6,14 +6,11 @@ import numpy as np import sys -from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ - format_scalar_declaration, format_ifdef_wrapper, DataGen +import snitch.util.sim.data_utils as du -class DotDataGen(DataGen): +class DotDataGen(du.DataGen): - MIN = -1000 - MAX = +1000 # AXI splits bursts crossing 4KB address boundaries. To minimize # the occurrence of these splits the data should be aligned to 4KB BURST_ALIGNMENT = 4096 @@ -25,22 +22,22 @@ def emit_header(self, **kwargs): header = [super().emit_header()] n = kwargs['n'] - x = np.random.uniform(self.MIN, self.MAX, n) - y = np.random.uniform(self.MIN, self.MAX, n) + x = du.generate_random_array(n) + y = du.generate_random_array(n) g = self.golden_model(x, y) assert (n % (8 * 4)) == 0, "n must be an integer multiple of the number of cores times " \ "the unrolling factor" - header += [format_scalar_definition('const uint32_t', 'n', n)] - header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - header += [format_scalar_declaration('double', 'result', alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - result_def = format_scalar_definition('double', 'g', g) - header += [format_ifdef_wrapper('BIST', result_def)] + header += [du.format_scalar_definition('const uint32_t', 'n', n)] + header += [du.format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT, + section=kwargs['section'])] + header += [du.format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT, + section=kwargs['section'])] + header += [du.format_scalar_declaration('double', 'result', alignment=self.BURST_ALIGNMENT, + section=kwargs['section'])] + result_def = du.format_scalar_definition('double', 'g', g) + header += [du.format_ifdef_wrapper('BIST', result_def)] header = '\n\n'.join(header) return header diff --git a/sw/blas/gemm/scripts/datagen.py b/sw/blas/gemm/scripts/datagen.py index da7f8ba57..2eb6e2f4d 100755 --- a/sw/blas/gemm/scripts/datagen.py +++ b/sw/blas/gemm/scripts/datagen.py @@ -10,18 +10,15 @@ import numpy as np import re -import pyflexfloat as ff import sys -from snitch.util.sim import data_utils -from snitch.util.sim.data_utils import DataGen, format_array_declaration, \ - format_struct_definition, format_array_definition, format_ifdef_wrapper +import snitch.util.sim.data_utils as du np.random.seed(42) -class GemmDataGen(DataGen): +class GemmDataGen(du.DataGen): # AXI splits bursts crossing 4KB address boundaries. 
To minimize # the occurrence of these splits the data should be aligned to 4KB @@ -56,14 +53,14 @@ def validate_config(self, gemm_fp, parallelize_m, # Calculate total TCDM occupation # Note: doesn't account for double buffering - prec = data_utils.size_from_precision_t(dtype) + prec = du.size_from_precision_t(dtype) a_size = frac_m * frac_k * prec b_size = frac_k * frac_n * prec c_size = frac_m * frac_n * prec total_size = a_size total_size += b_size total_size += c_size - data_utils.validate_tcdm_footprint(total_size) + du.validate_tcdm_footprint(total_size) assert (M % m_tiles) == 0, 'M is not an integer multiple of tile size' assert (N % n_tiles) == 0, 'N is not an integer multiple of tile size' @@ -99,12 +96,11 @@ def emit_header(self, **kwargs): prec, _ = self.infer_implementation(kwargs['gemm_fp']) - ff_desc = data_utils.ff_desc_from_precision_t(prec) - ctype = data_utils.ctype_from_precision_t(prec) + ctype = du.ctype_from_precision_t(prec) - a = ff.array(np.random.rand(M, K), ff_desc) - b = ff.array(np.random.rand(K, N), ff_desc) - c = ff.array(np.random.rand(M, N), ff_desc) + a = du.generate_random_array((M, K), prec) + b = du.generate_random_array((K, N), prec) + c = du.generate_random_array((M, N), prec) result = self.exact_golden_model(1, a, b, kwargs['beta'], c) # Store matrices in transposed form if requested @@ -127,18 +123,18 @@ def emit_header(self, **kwargs): b = b.flatten() c = c.flatten() - header += [format_array_declaration(ctype, a_uid, a.shape)] - header += [format_array_declaration(ctype, b_uid, b.shape)] - header += [format_array_declaration(ctype, c_uid, c.shape)] - header += [format_struct_definition('gemm_args_t', 'args', cfg)] - header += [format_array_definition(ctype, a_uid, a, - section=kwargs['section'])] - header += [format_array_definition(ctype, b_uid, b, - section=kwargs['section'])] - header += [format_array_definition(ctype, c_uid, c, - section=kwargs['section'])] - result_def = format_array_definition(ctype, 'result', result.flatten()) - header += [format_ifdef_wrapper('BIST', result_def)] + header += [du.format_array_declaration(ctype, a_uid, a.shape)] + header += [du.format_array_declaration(ctype, b_uid, b.shape)] + header += [du.format_array_declaration(ctype, c_uid, c.shape)] + header += [du.format_struct_definition('gemm_args_t', 'args', cfg)] + header += [du.format_array_definition(ctype, a_uid, a, + section=kwargs['section'])] + header += [du.format_array_definition(ctype, b_uid, b, + section=kwargs['section'])] + header += [du.format_array_definition(ctype, c_uid, c, + section=kwargs['section'])] + result_def = du.format_array_definition(ctype, 'result', result.flatten()) + header += [du.format_ifdef_wrapper('BIST', result_def)] header = '\n\n'.join(header) return header diff --git a/sw/blas/gemm/scripts/verify.py b/sw/blas/gemm/scripts/verify.py index 40840b327..353ea1328 100755 --- a/sw/blas/gemm/scripts/verify.py +++ b/sw/blas/gemm/scripts/verify.py @@ -18,9 +18,9 @@ class GemmVerifier(Verifier): OUTPUT_UIDS = ['c'] ERR_THRESHOLD = { 1: 1e-4, - 2: 1e-2, - 4: 1e-6, - 8: 1e-6 + 2: 8e-2, + 4: 1e-3, + 8: 1e-3 } def __init__(self): diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h index a480379a9..1a73aedf8 100644 --- a/sw/blas/gemm/src/gemm.h +++ b/sw/blas/gemm/src/gemm.h @@ -13,19 +13,6 @@ #pragma once -// Floating-point multiplications by zero cannot be optimized as in some -// edge cases they do not yield zero: -// - 0f * NaN = NaN -// - 0f * INFINITY == NaN -// Thus in order to optimize it, we need to test for zero. 
You can use this -// function for free when `multiplier` is a constant. -static inline double multiply_opt(double multiplicand, double multiplier) { - if (multiplier) - return multiplicand * multiplier; - else - return 0; -} - #include "gemm_fp16.h" #include "gemm_fp32.h" #include "gemm_fp64.h" diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c index 17f3936b0..9760000c6 100644 --- a/sw/blas/gemm/src/main.c +++ b/sw/blas/gemm/src/main.c @@ -9,7 +9,7 @@ #include #include -#include "gemm.h" +#include "blas.h" #include "data.h" #include "snrt.h" diff --git a/sw/blas/syrk/data/params.json b/sw/blas/syrk/data/params.json new file mode 100644 index 000000000..492d8e0cc --- /dev/null +++ b/sw/blas/syrk/data/params.json @@ -0,0 +1,12 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + "m": 8, + "n": 2, + "alpha": 1.5, + "beta": 3.2, + "m_tiles": 1, + "funcptr": "syrk_opt" +} diff --git a/sw/blas/syrk/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py new file mode 100755 index 000000000..3fb86644f --- /dev/null +++ b/sw/blas/syrk/scripts/datagen.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande + +import numpy as np + +import snitch.util.sim.data_utils as du + + +DOUBLE_BUFFER = True + + +class SyrkDataGen(du.DataGen): + + # Function pointers to alternative implementations + FUNCPTRS = ["syrk_naive", "syrk_baseline", "syrk_opt"] + + def golden_model(self, alpha, A, beta, C): + return alpha * np.matmul(A, A.transpose()) + beta * C + + def validate(self, **kwargs): + n_cores = 8 + assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles" + m_frac = kwargs['m'] / kwargs['m_tiles'] + assert (m_frac % n_cores) == 0, "m_frac must be an integer multiple of the number of cores" + if kwargs['funcptr'] != "syrk_naive": + assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4" + assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" + + # Calculate total TCDM occupation + a_tile_size = m_frac * kwargs['n'] * 8 + c_tile_size = m_frac * m_frac * 8 + total_size = 2 * a_tile_size + c_tile_size + if DOUBLE_BUFFER: + total_size *= 2 + du.validate_tcdm_footprint(total_size) + + def emit_header(self, **kwargs): + header = [super().emit_header()] + + self.validate(**kwargs) + + if 'alpha' in kwargs: + alpha = kwargs['alpha'] + else: + alpha = du.generate_random_array(1)[0] + if 'beta' in kwargs: + beta = kwargs['beta'] + else: + beta = du.generate_random_array(1)[0] + + A = du.generate_random_array((kwargs['m'], kwargs['n'])) + C_in = du.generate_random_array((kwargs['m'], kwargs['m'])) + + A = A.flatten() + C_in = C_in.flatten() + + A_uid = 'A' + C_uid = 'C' + + cfg = { + 'm': kwargs['m'], + 'n': kwargs['n'], + 'alpha': alpha, + 'beta': beta, + 'a': A_uid, + 'c': C_uid, + 'm_tiles': kwargs['m_tiles'], + 'funcptr': kwargs['funcptr'] + } + + header += [du.format_array_definition('double', A_uid, A)] + header += [du.format_array_definition('double', C_uid, C_in)] + header += [du.format_struct_definition('syrk_args_t', 'args', cfg)] + header = '\n\n'.join(header) + + return header + + +if __name__ == '__main__': + SyrkDataGen().main() diff --git a/sw/blas/syrk/scripts/verify.py 
b/sw/blas/syrk/scripts/verify.py new file mode 100755 index 000000000..0624156cb --- /dev/null +++ b/sw/blas/syrk/scripts/verify.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande + +import numpy as np +import sys +from datagen import SyrkDataGen + +from snitch.util.sim.verif_utils import Verifier + + +class SyrkVerifier(Verifier): + +    OUTPUT_UIDS = ['C'] + +    def __init__(self): +        super().__init__() +        self.func_args = { +            'm': 'I', +            'n': 'I', +            'alpha': 'd', +            'beta': 'd', +            'A': 'I', +            'C': 'I', +            'm_tiles': 'I', +            'funcptr': 'I' +        } +        self.func_args = self.get_input_from_symbol('args', self.func_args) + +    def get_actual_results(self): +        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double') + +    def get_expected_results(self): +        A = self.get_input_from_symbol('A', 'double') +        C = self.get_input_from_symbol('C', 'double') +        A = np.reshape(A, (self.func_args['m'], self.func_args['n'])) +        C = np.reshape(C, (self.func_args['m'], self.func_args['m'])) +        return SyrkDataGen().golden_model( +            self.func_args['alpha'], A, +            self.func_args['beta'], C +        ).flatten() + +    def check_results(self, *args): +        return super().check_results(*args, rtol=1e-10) + + +if __name__ == "__main__": +    sys.exit(SyrkVerifier().main()) diff --git a/sw/blas/syrk/src/args.h b/sw/blas/syrk/src/args.h new file mode 100644 index 000000000..24342d3e3 --- /dev/null +++ b/sw/blas/syrk/src/args.h @@ -0,0 +1,22 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#pragma once +#include <stdint.h> + +typedef void (*syrk_fp_t)(uint32_t m, uint32_t n, double alpha, double *a, +                          double *at, double beta, double *c); + +typedef struct { +    uint32_t m; +    uint32_t n; +    double alpha; +    double beta; +    double *a; +    double *c; +    uint32_t m_tiles; +    syrk_fp_t funcptr; +} syrk_args_t; diff --git a/sw/blas/syrk/src/main.c b/sw/blas/syrk/src/main.c new file mode 100644 index 000000000..f8c09ae4f --- /dev/null +++ b/sw/blas/syrk/src/main.c @@ -0,0 +1,16 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "snrt.h" + +#include "blas.h" +#include "data.h" + +int main() { +    syrk_job(&args); + +    return 0; +} diff --git a/sw/blas/syrk/src/syrk.h b/sw/blas/syrk/src/syrk.h new file mode 100644 index 000000000..718ad7fe9 --- /dev/null +++ b/sw/blas/syrk/src/syrk.h @@ -0,0 +1,294 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "args.h" +#include "snrt.h" + +__thread int setup_ssr = 1; + +void syrk_naive(uint32_t m, uint32_t n, double alpha, double *a, double *at, + double beta, double *c) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j++) { + double acc = 0; + for (uint32_t k = 0; k < n; k++) { + acc += a[i * n + k] * at[j * n + k]; + } + c[i * m + j] = multiply_opt(c[i * m + j], beta); + c[i * m + j] += alpha * acc; + } + } +} + +void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at, + double beta, double *c) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factors + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll1 = 4; + const uint32_t unroll0 = 4; + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j += unroll1) { + double acc[4]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + for (uint32_t k = 0; k < n; k += unroll0) { + asm volatile( + "fmadd.d %[acc0], %[a0], %[at0], %[acc0] \n" + "fmadd.d %[acc1], %[a0], %[at1], %[acc1] \n" + "fmadd.d %[acc2], %[a0], %[at2], %[acc2] \n" + "fmadd.d %[acc3], %[a0], %[at3], %[acc3] \n" + "fmadd.d %[acc0], %[a1], %[at4], %[acc0] \n" + "fmadd.d %[acc1], %[a1], %[at5], %[acc1] \n" + "fmadd.d %[acc2], %[a1], %[at6], %[acc2] \n" + "fmadd.d %[acc3], %[a1], %[at7], %[acc3] \n" + "fmadd.d %[acc0], %[a2], %[at8], %[acc0] \n" + "fmadd.d %[acc1], %[a2], %[at9], %[acc1] \n" + "fmadd.d %[acc2], %[a2], %[at10], %[acc2] \n" + "fmadd.d %[acc3], %[a2], %[at11], %[acc3] \n" + "fmadd.d %[acc0], %[a3], %[at12], %[acc0] \n" + "fmadd.d %[acc1], %[a3], %[at13], %[acc1] \n" + "fmadd.d %[acc2], %[a3], %[at14], %[acc2] \n" + "fmadd.d %[acc3], %[a3], %[at15], %[acc3] \n" + : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) + : + [ a0 ] "f"(a[i * n + k + 0]), [ a1 ] "f"(a[i * n + k + 1]), + [ a2 ] "f"(a[i * n + k + 2]), [ a3 ] "f"(a[i * n + k + 3]), + [ at0 ] "f"(at[(j + 0) * n + k]), + [ at1 ] "f"(at[(j + 1) * n + k]), + [ at2 ] "f"(at[(j + 2) * n + k]), + [ at3 ] "f"(at[(j + 3) * n + k]), + [ at4 ] "f"(at[(j + 0) * n + k + 1]), + [ at5 ] "f"(at[(j + 1) * n + k + 1]), + [ at6 ] "f"(at[(j + 2) * n + k + 1]), + [ at7 ] "f"(at[(j + 3) * n + k + 1]), + [ at8 ] "f"(at[(j + 0) * n + k + 2]), + [ at9 ] "f"(at[(j + 1) * n + k + 2]), + [ at10 ] "f"(at[(j + 2) * n + k + 2]), + [ at11 ] "f"(at[(j + 3) * n + k + 2]), + [ at12 ] "f"(at[(j + 0) * n + k + 3]), + [ at13 ] "f"(at[(j + 1) * n + k + 3]), + [ at14 ] "f"(at[(j + 2) * n + k + 3]), + [ at15 ] "f"(at[(j + 3) * n + k + 3]) + :); + } + + c[i * m + j + 0] = multiply_opt(c[i * m + j + 0], beta); + c[i * m + j + 1] = multiply_opt(c[i * m + j + 1], beta); + c[i * m + j + 2] = multiply_opt(c[i * m + j + 2], beta); + c[i * m + j + 3] = multiply_opt(c[i * m + j + 3], beta); + c[i * m + j + 0] += alpha * acc[0]; + c[i * m + j + 1] += alpha * acc[1]; + c[i * m + j + 2] += alpha * acc[2]; + c[i * m + j + 3] += alpha * acc[3]; + } + } +} + +void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at, + double beta, double *c) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factor of innermost loop + // Note: changes must be reflected 
in the inline assembly code + // and datagen script + const uint32_t unroll = 4; + + if (setup_ssr) { + // Configure ft0 and ft1 to load A and At + // for (i = offset; i < m; i += stride) + // for (j1 = 0; j1 < m; j1 += unroll) + // for (k = 0; k < n; k++) + // for (j0 = 0; j0 < unroll; j0++) + // j = j1 + j0 + // ft0.push(a[i * n + k]) + // ft1.push(at[j * n + k]) + const uint32_t ssr0_b[4] = {unroll, n, m / unroll, m / stride}; + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, + stride * n * sizeof(double)}; + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], + ssr0_i[1], ssr0_i[2], ssr0_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, unroll); + const uint32_t ssr1_b[4] = {unroll, n, m / unroll, m / stride}; + const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), + unroll * n * sizeof(double), 0}; + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); + setup_ssr = 0; + } + + // SSR start addresses need to be configured each time + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, a + offset * n); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, at); + snrt_ssr_enable(); + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j += unroll) { + double acc[unroll]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + asm volatile( + "frep.o %[n_frep], %[unroll], 0, 0 \n" + "fmadd.d %[acc0], ft0, ft1, %[acc0] \n" + "fmadd.d %[acc1], ft0, ft1, %[acc1] \n" + "fmadd.d %[acc2], ft0, ft1, %[acc2] \n" + "fmadd.d %[acc3], ft0, ft1, %[acc3] \n" + "fmul.d %[acc0], %[acc0], %[alpha] \n" + "fmul.d %[acc1], %[acc1], %[alpha] \n" + "fmul.d %[acc2], %[acc2], %[alpha] \n" + "fmul.d %[acc3], %[acc3], %[alpha] \n" + "fmadd.d %[c0], %[c0], %[beta], %[acc0] \n" + "fmadd.d %[c1], %[c1], %[beta], %[acc1] \n" + "fmadd.d %[c2], %[c2], %[beta], %[acc2] \n" + "fmadd.d %[c3], %[c3], %[beta], %[acc3] \n" + : [ c0 ] "+f"(c[i * m + j + 0]), [ c1 ] "+f"(c[i * m + j + 1]), + [ c2 ] "+f"(c[i * m + j + 2]), [ c3 ] "+f"(c[i * m + j + 3]), + [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) + : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll), + [ alpha ] "f"(alpha), [ beta ] "f"(beta) + : "ft0", "ft1", "ft2"); + } + } + + snrt_ssr_disable(); + snrt_fpu_fence(); +} + +void syrk_job(syrk_args_t *args) { + uint32_t m_frac, a_tile_size, a_tile_bytes, c_tile_size, c_tile_bytes; + uint64_t local_a0_addr, local_at0_addr, local_c0_addr, local_a1_addr, + local_at1_addr, local_c1_addr; + double *local_a[2]; + double *local_at[2]; + double *local_c[2]; + uint32_t n_tiles, iterations; + uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx; + +#ifndef JOB_ARGS_PRELOADED + // Allocate space for job arguments in TCDM + syrk_args_t *local_args = (syrk_args_t *)snrt_l1_next(); + + // Copy job arguments to TCDM + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_args, args, sizeof(syrk_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + args = local_args; +#endif + + // Calculate size of each tile + m_frac = args->m / args->m_tiles; + a_tile_size = args->n * m_frac; + c_tile_size = m_frac * m_frac; + a_tile_bytes = a_tile_size * sizeof(double); + c_tile_bytes = c_tile_size * sizeof(double); + + // Allocate space for job operands in TCDM
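+ // Each of the two buffers holds an A tile, an At tile and a C tile, + // laid out back to back right after the job arguments. The tile loop + // below runs a three-stage software pipeline (DMA in at iteration i, + // compute at i - 1, DMA out at i - 2), hence the n_tiles + 2 iterations.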
+ local_a0_addr = (uint64_t)args + sizeof(syrk_args_t); + local_at0_addr = local_a0_addr + a_tile_bytes; + local_c0_addr = local_at0_addr + a_tile_bytes; + local_a[0] = (double *)local_a0_addr; + local_at[0] = (double *)local_at0_addr; + local_c[0] = (double *)local_c0_addr; + local_a1_addr = local_c0_addr + c_tile_bytes; + local_at1_addr = local_a1_addr + a_tile_bytes; + local_c1_addr = local_at1_addr + a_tile_bytes; + local_a[1] = (double *)local_a1_addr; + local_at[1] = (double *)local_at1_addr; + local_c[1] = (double *)local_c1_addr; + + // Calculate number of iterations + n_tiles = args->m_tiles * args->m_tiles; + iterations = n_tiles + 2; + + // Iterate over all tiles + for (i = 0; i < iterations; i++) { + if (snrt_is_dm_core()) { + // DMA out + // (out before in to avoid overwriting data) + if (i > 1) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_out = i - 2; + buff_idx = i_dma_out % 2; + i_row = i_dma_out / args->m_tiles; + i_col = i_dma_out % args->m_tiles; + + // Copy job outputs from TCDM + snrt_dma_store_2d_tile(args->c, local_c[buff_idx], i_row, i_col, + m_frac, m_frac, args->m, sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + + // DMA in + if (i < n_tiles) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_in = i; + buff_idx = i_dma_in % 2; + i_row = i_dma_in / args->m_tiles; + i_col = i_dma_in % args->m_tiles; + + // Copy job operands in TCDM + snrt_dma_load_1d_tile(local_a[buff_idx], args->a, i_row, + a_tile_size, sizeof(double)); + snrt_dma_load_1d_tile(local_at[buff_idx], args->a, i_col, + a_tile_size, sizeof(double)); + if (args->funcptr == syrk_opt || args->beta != 0) { + snrt_dma_load_2d_tile(local_c[buff_idx], args->c, i_row, + i_col, m_frac, m_frac, args->m, + sizeof(double)); + } + snrt_dma_wait_all(); + + snrt_mcycle(); + } + } + + // Compute + if (snrt_is_compute_core()) { + if (i > 0 && i < (n_tiles + 1)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_compute = i - 1; + buff_idx = i_compute % 2; + + // Perform tile computation + syrk_fp_t fp = args->funcptr; + fp(m_frac, args->n, args->alpha, local_a[buff_idx], + local_at[buff_idx], args->beta, local_c[buff_idx]); + + snrt_mcycle(); + } + } + + // Synchronize cores after every iteration + snrt_cluster_hw_barrier(); + } +} diff --git a/sw/dnn/.gitignore b/sw/dnn/.gitignore deleted file mode 100644 index aed262ca8..000000000 --- a/sw/dnn/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*/data/data.h diff --git a/target/common/common.mk b/target/common/common.mk index 70afd80c2..995e80ba0 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -203,6 +203,7 @@ SNITCH_DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null) SNITCH_TXT_TRACES = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/\.dasm/\.txt/g')) SNITCH_ANNOTATED_TRACES = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/\.dasm/\.s/g')) SNITCH_PERF_DUMPS = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g')) +DMA_PERF_DUMPS = $(LOGS_DIR)/dma_*_perf.json TXT_TRACES += $(SNITCH_TXT_TRACES) ANNOTATED_TRACES += $(SNITCH_ANNOTATED_TRACES) @@ -219,7 +220,7 @@ annotate: $(ANNOTATED_TRACES) perf: $(JOINT_PERF_DUMP) visual-trace: $(VISUAL_TRACE) clean-traces: - rm -f $(TXT_TRACES) + rm -f $(TXT_TRACES) $(SNITCH_PERF_DUMPS) $(DMA_PERF_DUMPS) clean-annotate: rm -f $(ANNOTATED_TRACES) clean-perf: diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk index ca8246124..e4456fdfc 100644 --- a/target/snitch_cluster/sw.mk +++ 
b/target/snitch_cluster/sw.mk @@ -51,6 +51,7 @@ APPS = sw/apps/nop APPS += sw/apps/blas/axpy APPS += sw/apps/blas/gemm APPS += sw/apps/blas/dot +APPS += sw/apps/blas/syrk APPS += sw/apps/dnn/batchnorm APPS += sw/apps/dnn/conv2d APPS += sw/apps/dnn/fusedconv @@ -66,6 +67,7 @@ APPS += sw/apps/montecarlo/pi_estimation APPS += sw/apps/atax APPS += sw/apps/correlation APPS += sw/apps/covariance +APPS += sw/apps/doitgen # Include Makefile from each app subdirectory $(foreach app,$(APPS), \ diff --git a/target/snitch_cluster/sw/apps/blas/gemm/app.mk b/target/snitch_cluster/sw/apps/blas/gemm/app.mk index 5d2b54068..f50f6d21c 100644 --- a/target/snitch_cluster/sw/apps/blas/gemm/app.mk +++ b/target/snitch_cluster/sw/apps/blas/gemm/app.mk @@ -8,6 +8,7 @@ APP := gemm $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build SRC_DIR := $(ROOT)/sw/blas/$(APP)/src SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas include $(ROOT)/sw/apps/common.mk include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/blas/syrk/app.mk b/target/snitch_cluster/sw/apps/blas/syrk/app.mk new file mode 100644 index 000000000..c0fd05044 --- /dev/null +++ b/target/snitch_cluster/sw/apps/blas/syrk/app.mk @@ -0,0 +1,14 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := syrk +$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build +SRC_DIR := $(ROOT)/sw/blas/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas + +include $(ROOT)/sw/apps/common.mk +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/covariance/app.mk b/target/snitch_cluster/sw/apps/covariance/app.mk index c177a9d61..e985e671e 100644 --- a/target/snitch_cluster/sw/apps/covariance/app.mk +++ b/target/snitch_cluster/sw/apps/covariance/app.mk @@ -8,6 +8,7 @@ APP := covariance $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build SRC_DIR := $(ROOT)/sw/apps/$(APP)/src SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas/ include $(ROOT)/sw/apps/common.mk include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/doitgen/app.mk b/target/snitch_cluster/sw/apps/doitgen/app.mk new file mode 100644 index 000000000..ebef550d3 --- /dev/null +++ b/target/snitch_cluster/sw/apps/doitgen/app.mk @@ -0,0 +1,14 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := doitgen +$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build +SRC_DIR := $(ROOT)/sw/apps/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas/ + +include $(ROOT)/sw/apps/common.mk +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/fdiv.yaml b/target/snitch_cluster/sw/fdiv.yaml index a8b5f3930..d6b7aea3b 100644 --- a/target/snitch_cluster/sw/fdiv.yaml +++ b/target/snitch_cluster/sw/fdiv.yaml @@ -13,5 +13,3 @@ runs: cmd: [../../../sw/dnn/flashattention_2/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/correlation/build/correlation.elf cmd: [../../../sw/apps/correlation/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/covariance/build/covariance.elf - cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"] diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index 7a5a55a4c..d9e2f8c2f 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -80,6 +80,8 @@ runs: cmd: [../../../sw/blas/gemm/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/blas/dot/build/dot.elf cmd: [../../../sw/blas/dot/scripts/verify.py, "${sim_bin}", "${elf}"] + - elf: apps/blas/syrk/build/syrk.elf + cmd: [../../../sw/blas/syrk/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/dnn/batchnorm/build/batchnorm.elf - elf: apps/dnn/maxpool/build/maxpool.elf # - elf: apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results @@ -95,3 +97,7 @@ runs: - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf # - elf: apps/atax/build/atax.elf # cmd: [../../../sw/apps/atax/scripts/verify.py, "${sim_bin}", "${elf}"] + - elf: apps/covariance/build/covariance.elf + cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"] + - elf: apps/doitgen/build/doitgen.elf + cmd: [../../../sw/apps/doitgen/scripts/verify.py, "${sim_bin}", "${elf}"] diff --git a/util/container/Dockerfile b/util/container/Dockerfile index 9cdc7d9aa..bfef21266 100644 --- a/util/container/Dockerfile +++ b/util/container/Dockerfile @@ -94,6 +94,7 @@ RUN tar xzf snitch-spike-dasm-${SPIKE_DASM_VERSION}-x86_64-linux-gnu-ubuntu18.04 # Install Doxygen RUN wget https://www.doxygen.nl/files/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz RUN tar xzf doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz +RUN mv doxygen-${DOXYGEN_VERSION} doxygen # 2. Stage FROM ubuntu:22.04 AS snitch_cluster @@ -154,7 +155,7 @@ COPY --from=builder /tools/spike-dasm bin/ COPY --from=builder /root/.cargo/bin/banshee bin/ COPY --from=builder /opt/python /opt/python COPY --from=builder /tools/verilator /tools/verilator/ -COPY --from=builder /tools/doxygen-${DOXYGEN_VERSION}/bin/doxygen bin/ +COPY --from=builder /tools/doxygen/bin/doxygen bin/ # Create and activate virtual environment ENV VIRTUAL_ENV "/root/.venvs/snitch_cluster" diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py index e6f48acce..3b732c5cc 100644 --- a/util/sim/data_utils.py +++ b/util/sim/data_utils.py @@ -83,6 +83,24 @@ def torch_type_from_precision_t(prec): return precision_t_to_torch_type_map[_integer_precision_t(prec)] +def numpy_type_from_precision_t(prec): + """Convert `precision_t` type to Numpy type. + + Args: + prec: A value of type `precision_t`. Accepts both enum strings + (e.g. "FP64") and integer enumeration values (e.g. 8).
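+ + For example, `numpy_type_from_precision_t('FP32')` and + `numpy_type_from_precision_t(4)` both return `np.float32`.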
+ """ + # Types which have a direct correspondence in Numpy + precision_t_to_numpy_type_map = { + 8: np.float64, + 4: np.float32, + 2: np.float16 + } + prec = _integer_precision_t(prec) + assert prec != 1, "No direct correspondence between FP8 and Numpy" + return precision_t_to_numpy_type_map[prec] + + # Returns the C type representing a floating-point value of the specified precision def ctype_from_precision_t(prec): """Convert `precision_t` type to a C type string. @@ -100,6 +118,29 @@ def ctype_from_precision_t(prec): return precision_t_to_ctype_map[_integer_precision_t(prec)] +def generate_random_array(size, prec='FP64'): + """Consistent random array generation for Snitch experiments. + + Samples values between -1 and 1 from a uniform distribution and + of the exact specified type, e.g. actual 64-bit doubles. + + This function ensures that e.g. power measurements are not skewed + by using integer values in the FPU. + + Args: + size: Tuple of array dimensions. + prec: A value of type `precision_t`. Accepts both enum strings + (e.g. "FP64") and integer enumeration values (e.g. 8). + """ + # Generate in 64b precision and then cast down + rand = np.random.default_rng().random(size=size, dtype=np.float64) * 2 - 1 + # Generate FlexFloat array for 8b floats, casted from 16b Numpy array + if _integer_precision_t(prec) == 1: + return ff.array(rand.astype(np.float16), ff_desc_from_precision_t(prec)) + else: + return rand.astype(numpy_type_from_precision_t(prec)) + + def flatten(array): """Flatten various array types with a homogeneous API. diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py index db094ad7e..0fab642e0 100755 --- a/util/trace/gen_trace.py +++ b/util/trace/gen_trace.py @@ -1145,7 +1145,6 @@ def main(): message += 'line {lineno}.' print(traceback.format_exc(), file=sys.stderr) print(message, file=sys.stderr) - return 1 else: break # Nothing more in pipe, EOF perf_metrics[-1]['tend'] = time_info[0] / 1000