diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h
index e1df4b426c..0af8014758 100644
--- a/sw/apps/covariance/src/covariance.h
+++ b/sw/apps/covariance/src/covariance.h
@@ -6,8 +6,8 @@
 //         Luca Colagrande <colluca@iis.ee.ethz.ch>
 
 #include "args.h"
+#include "blas.h"
 #include "snrt.h"
-#include "ata.h"
 
 #define DOUBLE_BUFFER 1
 
@@ -41,7 +41,7 @@ void covariance_naive(uint32_t m, uint32_t n, double inv_n,
     snrt_cluster_hw_barrier();
 
     // Compute covariance matrix
-    ata_naive(inv_n_m1, m, n, data, datat, cov);
+    syrk_naive(m, n, inv_n_m1, data, datat, 0, cov);
 }
 
 void covariance_baseline(uint32_t m, uint32_t n, double inv_n,
@@ -74,7 +74,7 @@ void covariance_baseline(uint32_t m, uint32_t n, double inv_n,
     snrt_cluster_hw_barrier();
 
     // Compute covariance matrix
-    ata_baseline(inv_n_m1, m, n, data, datat, cov);
+    syrk_baseline(m, n, inv_n_m1, data, datat, 0, cov);
 }
 
 void covariance_opt(uint32_t m, uint32_t n, double inv_n,
diff --git a/sw/blas/blas.h b/sw/blas/blas.h
index 33c29e1753..69005ccb75 100644
--- a/sw/blas/blas.h
+++ b/sw/blas/blas.h
@@ -4,6 +4,20 @@
 
 #pragma once
 
+// Floating-point multiplications by zero cannot be optimized away, as in
+// some edge cases they do not yield zero:
+// - 0.0 * NaN = NaN
+// - 0.0 * INFINITY = NaN
+// This helper tests `multiplier` for zero and returns 0 in that case, even
+// for a NaN or infinite `multiplicand`. Free if `multiplier` is a constant.
+static inline double multiply_opt(double multiplicand, double multiplier) {
+    if (multiplier)
+        return multiplicand * multiplier;
+    else
+        return 0;
+}
+
 #include "axpy/src/axpy.h"
 #include "dot/src/dot.h"
 #include "gemm/src/gemm.h"
+#include "syrk/src/syrk.h"
diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h
index 43a9745564..29de895807 100644
--- a/sw/blas/gemm/src/gemm.h
+++ b/sw/blas/gemm/src/gemm.h
@@ -24,19 +24,6 @@ typedef float v2f32 __attribute__((vector_size(8)));
 typedef __fp16 v4f16 __attribute__((vector_size(8)));
 typedef char v8f8 __attribute__((vector_size(8)));
 
-// Floating-point multiplications by zero cannot be optimized as in some
-// edge cases they do not yield zero:
-// - 0f * NaN = NaN
-// - 0f * INFINITY == NaN
-// Thus in order to optimize it, we need to test for zero. You can use this
-// function for free when `multiplier` is a constant.
-static inline double multiply_opt(double multiplicand, double multiplier) {
-    if (multiplier)
-        return multiplicand * multiplier;
-    else
-        return 0;
-}
-
 #include "gemm_fp16.h"
 #include "gemm_fp32.h"
 #include "gemm_fp64.h"
diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c
index 17f3936b09..9760000c6b 100644
--- a/sw/blas/gemm/src/main.c
+++ b/sw/blas/gemm/src/main.c
@@ -9,7 +9,7 @@
 #include <math.h>
 #include <stdint.h>
 
-#include "gemm.h"
+#include "blas.h"
 
 #include "data.h"
 #include "snrt.h"
diff --git a/sw/apps/ata/.gitignore b/sw/blas/syrk/.gitignore
similarity index 100%
rename from sw/apps/ata/.gitignore
rename to sw/blas/syrk/.gitignore
diff --git a/sw/apps/ata/data/params.json b/sw/blas/syrk/data/params.json
similarity index 63%
rename from sw/apps/ata/data/params.json
rename to sw/blas/syrk/data/params.json
index 1db35db089..492d8e0ccf 100644
--- a/sw/apps/ata/data/params.json
+++ b/sw/blas/syrk/data/params.json
@@ -3,8 +3,10 @@
 // SPDX-License-Identifier: Apache-2.0
 
 {
-    "m": 16,
-    "n": 4,
-    "m_tiles": 2,
-    "funcptr": "ata_opt"
+    "m": 8,
+    "n": 2,
+    "alpha": 1.5,
+    "beta": 3.2,
+    "m_tiles": 1,
+    "funcptr": "syrk_opt"
 }
diff --git a/sw/apps/ata/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py
similarity index 56%
rename from sw/apps/ata/scripts/datagen.py
rename to sw/blas/syrk/scripts/datagen.py
index f6474f2e67..05cd2f0381 100755
--- a/sw/apps/ata/scripts/datagen.py
+++ b/sw/blas/syrk/scripts/datagen.py
@@ -14,25 +14,27 @@
 
 DOUBLE_BUFFER = True
 
-class AtaDataGen(DataGen):
+class SyrkDataGen(DataGen):
 
     # Function pointers to alternative implementations
-    FUNCPTRS = ["ata_naive", "ata_baseline", "ata_opt"]
+    FUNCPTRS = ["syrk_naive", "syrk_baseline", "syrk_opt"]
 
-    def golden_model(self, alpha, A):
-        return alpha * np.matmul(A, A.transpose())
+    def golden_model(self, alpha, A, beta, C):
+        return alpha * np.matmul(A, A.transpose()) + beta * C
 
     def validate(self, **kwargs):
+        n_cores = 8
         assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles"
         m_frac = kwargs['m'] / kwargs['m_tiles']
-        assert (m_frac % 8) == 0, "m_frac must be an integer multiple of the number of cores"
-        assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4"
+        assert (m_frac % n_cores) == 0, "m_frac must be an integer multiple of the number of cores"
+        if kwargs['funcptr'] != "syrk_naive":
+            assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4"
         assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
 
         # Calculate total TCDM occupation
         a_tile_size = m_frac * kwargs['n'] * 8
-        b_tile_size = m_frac * m_frac * 8
-        total_size = 2 * a_tile_size + b_tile_size
+        c_tile_size = m_frac * m_frac * 8
+        total_size = 2 * a_tile_size + c_tile_size
         if DOUBLE_BUFFER:
             total_size *= 2
         data_utils.validate_tcdm_footprint(total_size)
@@ -42,33 +44,43 @@ def emit_header(self, **kwargs):
 
         self.validate(**kwargs)
 
+        if 'alpha' in kwargs:
+            alpha = kwargs['alpha']
+        else:
+            alpha = np.random.randint(-200, 100)/100
+        if 'beta' in kwargs:
+            beta = kwargs['beta']
+        else:
+            beta = np.random.randint(-200, 100)/100
+
         A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100
-        alpha = np.random.randint(-200, 100)/100
-        B = self.golden_model(alpha, A)
+        C_in = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['m']))/100
+        C_out = self.golden_model(alpha, A, beta, C_in)
 
         A = A.flatten()
-        B = B.flatten()
+        C_in = C_in.flatten()
 
         A_uid = 'A'
-        B_uid = 'B'
+        C_uid = 'C'
 
         cfg = {
-            'alpha': alpha,
             'm': kwargs['m'],
             'n': kwargs['n'],
+            'alpha': alpha,
+            'beta': beta,
             'a': A_uid,
-            'b': B_uid,
+            'c': C_uid,
             'm_tiles': kwargs['m_tiles'],
             'funcptr': kwargs['funcptr']
         }
 
         header += [format_array_definition('double', A_uid, A)]
-        header += [format_array_declaration('double', B_uid, B.shape)]
-        header += [format_struct_definition('ata_args_t', 'args', cfg)]
+        header += [format_array_definition('double', C_uid, C_in)]
+        header += [format_struct_definition('syrk_args_t', 'args', cfg)]
         header = '\n\n'.join(header)
 
         return header
 
 
 if __name__ == '__main__':
-    AtaDataGen().main()
+    SyrkDataGen().main()
diff --git a/sw/apps/ata/scripts/verify.py b/sw/blas/syrk/scripts/verify.py
similarity index 70%
rename from sw/apps/ata/scripts/verify.py
rename to sw/blas/syrk/scripts/verify.py
index 206af870a3..0624156cb2 100755
--- a/sw/apps/ata/scripts/verify.py
+++ b/sw/blas/syrk/scripts/verify.py
@@ -7,23 +7,24 @@
 
 import numpy as np
 import sys
-from datagen import AtaDataGen
+from datagen import SyrkDataGen
 
 from snitch.util.sim.verif_utils import Verifier
 
 
-class AtaVerifier(Verifier):
+class SyrkVerifier(Verifier):
 
-    OUTPUT_UIDS = ['B']
+    OUTPUT_UIDS = ['C']
 
     def __init__(self):
         super().__init__()
         self.func_args = {
-            'alpha': 'd',
             'm': 'I',
             'n': 'I',
+            'alpha': 'd',
+            'beta': 'd',
             'A': 'I',
-            'B': 'I',
+            'C': 'I',
             'm_tiles': 'I',
             'funcptr': 'I'
         }
@@ -34,12 +35,17 @@ def get_actual_results(self):
 
     def get_expected_results(self):
         A = self.get_input_from_symbol('A', 'double')
+        C = self.get_input_from_symbol('C', 'double')
         A = np.reshape(A, (self.func_args['m'], self.func_args['n']))
-        return AtaDataGen().golden_model(self.func_args['alpha'], A).flatten()
+        C = np.reshape(C, (self.func_args['m'], self.func_args['m']))
+        return SyrkDataGen().golden_model(
+            self.func_args['alpha'], A,
+            self.func_args['beta'], C
+        ).flatten()
 
     def check_results(self, *args):
         return super().check_results(*args, rtol=1e-10)
 
 
 if __name__ == "__main__":
-    sys.exit(AtaVerifier().main())
+    sys.exit(SyrkVerifier().main())
diff --git a/sw/apps/ata/src/args.h b/sw/blas/syrk/src/args.h
similarity index 66%
rename from sw/apps/ata/src/args.h
rename to sw/blas/syrk/src/args.h
index f65a6a13fe..6bb58e00ec 100644
--- a/sw/apps/ata/src/args.h
+++ b/sw/blas/syrk/src/args.h
@@ -7,15 +7,16 @@
 #pragma once
 #include <stdint.h>
 
-typedef void (*ata_fp_t)(double alpha, uint32_t m, uint32_t n, double *a,
-    double *at, double *b);
+typedef void (*syrk_fp_t)(uint32_t m, uint32_t n, double alpha, double *a,
+    double *at, double beta, double *c);
 
 typedef struct {
-    double alpha;
     uint32_t m;
     uint32_t n;
+    double alpha;
+    double beta;
     double *a;
-    double *b;
+    double *c;
     uint32_t m_tiles;
-    ata_fp_t funcptr;
-} ata_args_t;
+    syrk_fp_t funcptr;
+} syrk_args_t;
diff --git a/sw/apps/ata/src/main.c b/sw/blas/syrk/src/main.c
similarity index 88%
rename from sw/apps/ata/src/main.c
rename to sw/blas/syrk/src/main.c
index c8df4bea90..9f1ad7163d 100644
--- a/sw/apps/ata/src/main.c
+++ b/sw/blas/syrk/src/main.c
@@ -6,12 +6,12 @@
 
 #include "snrt.h"
 
-#include "ata.h"
+#include "blas.h"
 #include "data.h"
 
 int main() {
 
-    ata_job(&args);
+    syrk_job(&args);
 
     return 0;
 }
diff --git a/sw/apps/ata/src/ata.h b/sw/blas/syrk/src/syrk.h
similarity index 69%
rename from sw/apps/ata/src/ata.h
rename to sw/blas/syrk/src/syrk.h
index 8673353a46..9494f2777c 100644
--- a/sw/apps/ata/src/ata.h
+++ b/sw/blas/syrk/src/syrk.h
@@ -7,26 +7,27 @@
 #include "args.h"
 #include "snrt.h"
 
-#define DOUBLE_BUFFER 1
-
 __thread int setup_ssr = 1;
 
-void ata_naive(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) {
+void syrk_naive(uint32_t m, uint32_t n, double alpha, double *a, double *at,
+                double beta, double *c) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
     for (uint32_t i = offset; i < m; i += stride) {
         for (uint32_t j = 0; j < m; j++) {
-            b[i * m + j] = 0;
+            double acc = 0;
             for (uint32_t k = 0; k < n; k++) {
-                b[i * m + j] += a[i * n + k] * at[j * n + k];
+                acc += a[i * n + k] * at[j * n + k];
             }
-            b[i * m + j] *= alpha;
+            c[i * m + j] = multiply_opt(c[i * m + j], beta);
+            c[i * m + j] += alpha * acc;
         }
     }
 }
 
-void ata_baseline(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) {
+void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at,
+                   double beta, double *c) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
@@ -89,15 +90,20 @@ void ata_baseline(double alpha, uint32_t m, uint32_t n, double *a, double *at, d
                 );
             }
 
-            b[i * m + j + 0] = alpha * acc[0];
-            b[i * m + j + 1] = alpha * acc[1];
-            b[i * m + j + 2] = alpha * acc[2];
-            b[i * m + j + 3] = alpha * acc[3];
+            c[i * m + j + 0] = multiply_opt(c[i * m + j + 0], beta);
+            c[i * m + j + 1] = multiply_opt(c[i * m + j + 1], beta);
+            c[i * m + j + 2] = multiply_opt(c[i * m + j + 2], beta);
+            c[i * m + j + 3] = multiply_opt(c[i * m + j + 3], beta);
+            c[i * m + j + 0] += alpha * acc[0];
+            c[i * m + j + 1] += alpha * acc[1];
+            c[i * m + j + 2] += alpha * acc[2];
+            c[i * m + j + 3] += alpha * acc[3];
         }
     }
 }
 
-void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) {
+void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at,
+              double beta, double *c) {
     uint32_t offset = snrt_cluster_core_idx();
     uint32_t stride = snrt_cluster_compute_core_num();
 
@@ -148,16 +154,20 @@ void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double
                 "fmadd.d %[acc1], ft0, ft1, %[acc1] \n"
                 "fmadd.d %[acc2], ft0, ft1, %[acc2] \n"
                 "fmadd.d %[acc3], ft0, ft1, %[acc3] \n"
-                "fmul.d %[b0], %[acc0], %[alpha] \n"
-                "fmul.d %[b1], %[acc1], %[alpha] \n"
-                "fmul.d %[b2], %[acc2], %[alpha] \n"
-                "fmul.d %[b3], %[acc3], %[alpha] \n"
-                : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
-                  [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]),
-                  [ b0 ] "=f"(b[i * m + j + 0]), [ b1 ] "=f"(b[i * m + j + 1]),
-                  [ b2 ] "=f"(b[i * m + j + 2]), [ b3 ] "=f"(b[i * m + j + 3])
+                "fmul.d %[acc0], %[acc0], %[alpha] \n"
+                "fmul.d %[acc1], %[acc1], %[alpha] \n"
+                "fmul.d %[acc2], %[acc2], %[alpha] \n"
+                "fmul.d %[acc3], %[acc3], %[alpha] \n"
+                "fmadd.d %[c0], %[c0], %[beta], %[acc0] \n"
+                "fmadd.d %[c1], %[c1], %[beta], %[acc1] \n"
+                "fmadd.d %[c2], %[c2], %[beta], %[acc2] \n"
+                "fmadd.d %[c3], %[c3], %[beta], %[acc3] \n"
+                : [ c0 ] "+f"(c[i * m + j + 0]), [ c1 ] "+f"(c[i * m + j + 1]),
+                  [ c2 ] "+f"(c[i * m + j + 2]), [ c3 ] "+f"(c[i * m + j + 3]),
+                  [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                  [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3])
                 : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll),
-                  [ alpha ] "f"(alpha)
+                  [ alpha ] "f"(alpha), [ beta ] "f"(beta)
                 : "ft0", "ft1", "ft2");
         }
     }
@@ -166,23 +176,23 @@ void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double
     snrt_fpu_fence();
 }
 
-void ata_job(ata_args_t *args) {
-    uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes;
-    uint64_t local_a0_addr, local_at0_addr, local_b0_addr,
-             local_a1_addr, local_at1_addr, local_b1_addr;
+void syrk_job(syrk_args_t *args) {
+    uint32_t m_frac, a_tile_size, a_tile_bytes, c_tile_size, c_tile_bytes;
+    uint64_t local_a0_addr, local_at0_addr, local_c0_addr,
+             local_a1_addr, local_at1_addr, local_c1_addr;
     double *local_a[2];
     double *local_at[2];
-    double *local_b[2];
-    uint32_t iterations, sb_iterations;
+    double *local_c[2];
+    uint32_t n_tiles, iterations;
     uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx;
 
 #ifndef JOB_ARGS_PRELOADED
     // Allocate space for job arguments in TCDM
-    ata_args_t *local_args = (ata_args_t *)snrt_l1_next();
+    syrk_args_t *local_args = (syrk_args_t *)snrt_l1_next();
 
     // Copy job arguments to TCDM
     if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(local_args, args, sizeof(ata_args_t));
+        snrt_dma_start_1d(local_args, args, sizeof(syrk_args_t));
         snrt_dma_wait_all();
     }
     snrt_cluster_hw_barrier();
@@ -192,43 +202,66 @@ void ata_job(ata_args_t *args) {
     // Calculate size of each tile
     m_frac = args->m / args->m_tiles;
     a_tile_size = args->n * m_frac;
-    b_tile_size = m_frac * m_frac;
+    c_tile_size = m_frac * m_frac;
     a_tile_bytes = a_tile_size * sizeof(double);
-    b_tile_bytes = b_tile_size * sizeof(double);
+    c_tile_bytes = c_tile_size * sizeof(double);
 
     // Allocate space for job operands in TCDM
     // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th.
-    local_a0_addr = (uint64_t)args + sizeof(ata_args_t);
+    local_a0_addr = (uint64_t)args + sizeof(syrk_args_t);
     local_at0_addr = local_a0_addr + a_tile_bytes;
-    local_b0_addr = local_at0_addr + a_tile_bytes;
+    local_c0_addr = local_at0_addr + a_tile_bytes;
     local_a[0] = (double *)local_a0_addr;
     local_at[0] = (double *)local_at0_addr;
-    local_b[0] = (double *)local_b0_addr;
-    if (DOUBLE_BUFFER) {
-        local_a1_addr = local_b0_addr + b_tile_bytes;
-        local_at1_addr = local_a1_addr + a_tile_bytes;
-        local_b1_addr = local_at1_addr + a_tile_bytes;
-        local_a[1] = (double *)local_a1_addr;
-        local_at[1] = (double *)local_at1_addr;
-        local_b[1] = (double *)local_b1_addr;
-    }
+    local_c[0] = (double *)local_c0_addr;
+    local_a1_addr = local_c0_addr + c_tile_bytes;
+    local_at1_addr = local_a1_addr + a_tile_bytes;
+    local_c1_addr = local_at1_addr + a_tile_bytes;
+    local_a[1] = (double *)local_a1_addr;
+    local_at[1] = (double *)local_at1_addr;
+    local_c[1] = (double *)local_c1_addr;
 
     // Calculate number of iterations
-    sb_iterations = args->m_tiles * args->m_tiles;
-    if (DOUBLE_BUFFER) iterations = sb_iterations + 2;
-    else iterations = sb_iterations;
+    n_tiles = args->m_tiles * args->m_tiles;
+    iterations = n_tiles + 2;
 
     // Iterate over all tiles
     for (i = 0; i < iterations; i++) {
         
         if (snrt_is_dm_core()) {
+            // DMA out
+            // (out before in, since both use the same buffer index)
+            if (i > 1) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = i - 2;
+                buff_idx = i_dma_out % 2;
+                i_row = i_dma_out / args->m_tiles;
+                i_col = i_dma_out % args->m_tiles;
+
+                // Copy job outputs from TCDM
+                snrt_dma_store_2d_tile(
+                    args->c,
+                    local_c[buff_idx],
+                    i_row,
+                    i_col,
+                    m_frac,
+                    m_frac,
+                    args->m,
+                    sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+
             // DMA in
-            if (!DOUBLE_BUFFER || (i < sb_iterations)) {
+            if (i < n_tiles) {
                 snrt_mcycle();
 
                 // Compute tile and buffer indices
                 i_dma_in = i;
-                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+                buff_idx = i_dma_in % 2;
                 i_row = i_dma_in / args->m_tiles;
                 i_col = i_dma_in % args->m_tiles;
 
@@ -245,35 +278,17 @@ void ata_job(ata_args_t *args) {
                     i_col,
                     a_tile_size,
                     sizeof(double));
-                snrt_dma_wait_all();
-
-                snrt_mcycle();
-            }
-
-            // Additional barriers required when not double buffering
-            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
-            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
-
-            // DMA out
-            if (!DOUBLE_BUFFER || (i > 1)) {
-                snrt_mcycle();
-
-                // Compute tile and buffer indices
-                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
-                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
-                i_row = i_dma_out / args->m_tiles;
-                i_col = i_dma_out % args->m_tiles;
-
-                // Copy job outputs from TCDM
-                snrt_dma_store_2d_tile(
-                    args->b,
-                    local_b[buff_idx],
-                    i_row,
-                    i_col,
-                    m_frac,
-                    m_frac,
-                    args->m,
-                    sizeof(double));
+                if (args->funcptr == syrk_opt || args->beta != 0) {
+                    snrt_dma_load_2d_tile(
+                        local_c[buff_idx],
+                        args->c,
+                        i_row,
+                        i_col,
+                        m_frac,
+                        m_frac,
+                        args->m,
+                        sizeof(double));
+                }
                 snrt_dma_wait_all();
 
                 snrt_mcycle();
@@ -282,27 +297,22 @@ void ata_job(ata_args_t *args) {
 
         // Compute
         if (snrt_is_compute_core()) {
-            // Additional barrier required when not double buffering
-            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
-
-            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) {
+            if (i > 0 && i < (n_tiles + 1)) {
                 snrt_mcycle();
 
                 // Compute tile and buffer indices
-                i_compute = DOUBLE_BUFFER ? i - 1 : i;
-                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+                i_compute = i - 1;
+                buff_idx = i_compute % 2;
 
                 // Perform tile computation
-                ata_fp_t fp = args->funcptr;
-                fp(args->alpha, m_frac, args->n, local_a[buff_idx],
-                   local_at[buff_idx], local_b[buff_idx]);
+                syrk_fp_t fp = args->funcptr;
+                fp(m_frac, args->n, args->alpha, local_a[buff_idx],
+                   local_at[buff_idx], args->beta, local_c[buff_idx]);
 
                 snrt_mcycle();
             }
-
-            // Additional barrier required when not double buffering
-            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
         }
+
         // Synchronize cores after every iteration
         snrt_cluster_hw_barrier();
     }
diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk
index 0a1e4c00c3..674ea2cadb 100644
--- a/target/snitch_cluster/sw.mk
+++ b/target/snitch_cluster/sw.mk
@@ -51,6 +51,7 @@ APPS  = sw/apps/nop
 APPS += sw/apps/blas/axpy
 APPS += sw/apps/blas/gemm
 APPS += sw/apps/blas/dot
+APPS += sw/apps/blas/syrk
 APPS += sw/apps/dnn/batchnorm
 APPS += sw/apps/dnn/conv2d
 APPS += sw/apps/dnn/fusedconv
@@ -63,7 +64,6 @@ APPS += sw/apps/dnn/concat
 APPS += sw/apps/dnn/fused_concat_linear
 APPS += sw/apps/dnn/transpose
 APPS += sw/apps/montecarlo/pi_estimation
-APPS += sw/apps/ata
 APPS += sw/apps/atax
 APPS += sw/apps/correlation
 APPS += sw/apps/covariance
diff --git a/target/snitch_cluster/sw/apps/blas/gemm/app.mk b/target/snitch_cluster/sw/apps/blas/gemm/app.mk
index 5d2b540687..f50f6d21c1 100644
--- a/target/snitch_cluster/sw/apps/blas/gemm/app.mk
+++ b/target/snitch_cluster/sw/apps/blas/gemm/app.mk
@@ -8,6 +8,7 @@ APP              := gemm
 $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build
 SRC_DIR          := $(ROOT)/sw/blas/$(APP)/src
 SRCS             := $(SRC_DIR)/main.c
+$(APP)_INCDIRS   := $(ROOT)/sw/blas
 
 include $(ROOT)/sw/apps/common.mk
 include $(ROOT)/target/snitch_cluster/sw/apps/common.mk
diff --git a/target/snitch_cluster/sw/apps/ata/app.mk b/target/snitch_cluster/sw/apps/blas/syrk/app.mk
similarity index 65%
rename from target/snitch_cluster/sw/apps/ata/app.mk
rename to target/snitch_cluster/sw/apps/blas/syrk/app.mk
index af63400b4a..c0fd050442 100644
--- a/target/snitch_cluster/sw/apps/ata/app.mk
+++ b/target/snitch_cluster/sw/apps/blas/syrk/app.mk
@@ -4,10 +4,11 @@
 #
 # Luca Colagrande <colluca@iis.ee.ethz.ch>
 
-APP              := ata
-$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build
-SRC_DIR          := $(ROOT)/sw/apps/$(APP)/src
+APP              := syrk
+$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build
+SRC_DIR          := $(ROOT)/sw/blas/$(APP)/src
 SRCS             := $(SRC_DIR)/main.c
+$(APP)_INCDIRS   := $(ROOT)/sw/blas
 
 include $(ROOT)/sw/apps/common.mk
 include $(ROOT)/target/snitch_cluster/sw/apps/common.mk
diff --git a/target/snitch_cluster/sw/apps/covariance/app.mk b/target/snitch_cluster/sw/apps/covariance/app.mk
index 005791c791..e985e671e7 100644
--- a/target/snitch_cluster/sw/apps/covariance/app.mk
+++ b/target/snitch_cluster/sw/apps/covariance/app.mk
@@ -8,7 +8,7 @@ APP              := covariance
 $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build
 SRC_DIR          := $(ROOT)/sw/apps/$(APP)/src
 SRCS             := $(SRC_DIR)/main.c
-$(APP)_INCDIRS   := $(ROOT)/sw/apps/ata/src/
+$(APP)_INCDIRS   := $(ROOT)/sw/blas/
 
 include $(ROOT)/sw/apps/common.mk
 include $(ROOT)/target/snitch_cluster/sw/apps/common.mk