From 3cb1bff5efd0dcfab340746bcc1460c4d8f0843d Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 26 Oct 2023 17:08:24 +0200 Subject: [PATCH] dnn: Refactor and verify layernorm --- .clang-format-ignore | 3 + .github/workflows/lint.yml | 1 + sw/blas/axpy/data/datagen.py | 14 +- sw/blas/gemm/data/datagen.py | 16 +- sw/blas/gemm/src/gemm.h | 36 ++- sw/dnn/.gitignore | 1 + sw/dnn/batchnorm/data/datagen.py | 136 ++++++++++ .../dnn/batchnorm/data}/params.hjson | 9 +- sw/dnn/{ => batchnorm}/src/batchnorm.h | 46 ++-- sw/dnn/batchnorm/src/main.c | 15 ++ sw/dnn/common.mk | 31 +++ sw/dnn/{src => conv2d}/conv2d.h | 0 sw/dnn/gelu/data/datagen.py | 113 +++++++++ .../src => sw/dnn/gelu/data}/params.hjson | 5 +- sw/dnn/{ => gelu}/src/gelu.h | 40 ++- sw/dnn/gelu/src/main.c | 9 + sw/dnn/gemm/data/datagen.py | 163 ++++++++++++ .../src => sw/dnn/gemm/data}/params.hjson | 3 - sw/dnn/{ => gemm}/src/gemm.h | 8 +- .../gemm/src/gemm.c => sw/dnn/gemm/src/main.c | 26 +- sw/dnn/layernorm/data/datagen.py | 117 +++++++++ .../dnn/layernorm/data}/params.hjson | 10 +- sw/dnn/layernorm/layout.csv | 3 + sw/dnn/layernorm/src/layernorm.h | 177 +++++++++++++ sw/dnn/layernorm/src/main.c | 14 ++ sw/dnn/layernorm/verify.py | 86 +++++++ sw/dnn/linear/data/datagen.py | 124 +++++++++ .../src => sw/dnn/linear/data}/params.hjson | 3 - sw/dnn/{ => linear}/src/linear.h | 30 --- sw/dnn/linear/src/main.c | 39 +++ sw/dnn/maxpool/data/datagen.py | 127 ++++++++++ .../src => sw/dnn/maxpool/data}/params.hjson | 4 +- sw/dnn/maxpool/src/main.c | 17 ++ sw/dnn/{ => maxpool}/src/maxpool.h | 74 ++++-- sw/dnn/{src => softmax}/softmax.h | 0 sw/dnn/src/dnn.h | 23 +- sw/dnn/src/layernorm.h | 238 ------------------ target/snitch_cluster/Makefile | 2 +- target/snitch_cluster/cfg/divsqrt.hjson | 127 ++++++++++ target/snitch_cluster/sw/apps/Makefile | 8 +- target/snitch_cluster/sw/apps/common.mk | 1 + .../sw/apps/dnn/batchnorm/Makefile | 8 +- .../sw/apps/dnn/batchnorm/src/batchnorm.c | 33 --- .../snitch_cluster/sw/apps/dnn/gelu/Makefile | 8 +- .../sw/apps/dnn/gelu/src/gelu.c | 26 -- .../snitch_cluster/sw/apps/dnn/gemm/Makefile | 8 +- .../sw/apps/dnn/layernorm/Makefile | 6 +- .../sw/apps/dnn/layernorm/src/layernorm.c | 29 --- .../sw/apps/dnn/linear/Makefile | 8 +- .../sw/apps/dnn/linear/src/linear.c | 26 -- .../sw/apps/dnn/maxpool/Makefile | 8 +- .../sw/apps/dnn/maxpool/src/maxpool.c | 32 --- target/snitch_cluster/sw/run.yaml | 3 +- util/sim/data_utils.py | 132 ++++++++-- 54 files changed, 1614 insertions(+), 612 deletions(-) create mode 100644 sw/dnn/.gitignore create mode 100755 sw/dnn/batchnorm/data/datagen.py rename {target/snitch_cluster/sw/apps/dnn/batchnorm/src => sw/dnn/batchnorm/data}/params.hjson (66%) rename sw/dnn/{ => batchnorm}/src/batchnorm.h (82%) create mode 100644 sw/dnn/batchnorm/src/main.c create mode 100644 sw/dnn/common.mk rename sw/dnn/{src => conv2d}/conv2d.h (100%) create mode 100755 sw/dnn/gelu/data/datagen.py rename {target/snitch_cluster/sw/apps/dnn/gelu/src => sw/dnn/gelu/data}/params.hjson (76%) rename sw/dnn/{ => gelu}/src/gelu.h (77%) create mode 100644 sw/dnn/gelu/src/main.c create mode 100755 sw/dnn/gemm/data/datagen.py rename {target/snitch_cluster/sw/apps/dnn/gemm/src => sw/dnn/gemm/data}/params.hjson (86%) rename sw/dnn/{ => gemm}/src/gemm.h (96%) rename target/snitch_cluster/sw/apps/dnn/gemm/src/gemm.c => sw/dnn/gemm/src/main.c (93%) create mode 100755 sw/dnn/layernorm/data/datagen.py rename {target/snitch_cluster/sw/apps/dnn/layernorm/src => sw/dnn/layernorm/data}/params.hjson (52%) create mode 100644 
sw/dnn/layernorm/layout.csv create mode 100644 sw/dnn/layernorm/src/layernorm.h create mode 100644 sw/dnn/layernorm/src/main.c create mode 100755 sw/dnn/layernorm/verify.py create mode 100755 sw/dnn/linear/data/datagen.py rename {target/snitch_cluster/sw/apps/dnn/linear/src => sw/dnn/linear/data}/params.hjson (81%) rename sw/dnn/{ => linear}/src/linear.h (80%) create mode 100644 sw/dnn/linear/src/main.c create mode 100755 sw/dnn/maxpool/data/datagen.py rename {target/snitch_cluster/sw/apps/dnn/maxpool/src => sw/dnn/maxpool/data}/params.hjson (83%) create mode 100644 sw/dnn/maxpool/src/main.c rename sw/dnn/{ => maxpool}/src/maxpool.h (68%) rename sw/dnn/{src => softmax}/softmax.h (100%) delete mode 100644 sw/dnn/src/layernorm.h create mode 100644 target/snitch_cluster/cfg/divsqrt.hjson delete mode 100644 target/snitch_cluster/sw/apps/dnn/batchnorm/src/batchnorm.c delete mode 100644 target/snitch_cluster/sw/apps/dnn/gelu/src/gelu.c delete mode 100644 target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c delete mode 100644 target/snitch_cluster/sw/apps/dnn/linear/src/linear.c delete mode 100644 target/snitch_cluster/sw/apps/dnn/maxpool/src/maxpool.c diff --git a/.clang-format-ignore b/.clang-format-ignore index c78f2e5fc2..ce95be05f0 100644 --- a/.clang-format-ignore +++ b/.clang-format-ignore @@ -4,3 +4,6 @@ # Ignore vendored third-party code ./sw/math/* +./target/snitch_cluster/sw/apps/transformer/src/transformer.c +./target/snitch_cluster/sw/apps/transformer/src/data.h +./sw/apps/transformer/src/transformer.h \ No newline at end of file diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 6c4f91184b..c90fd5b6f4 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -117,6 +117,7 @@ jobs: with: flake8-version: "6.0.0" max-line-length: "100" + exclude: "target/snitch_cluster/sw/apps/dnn/datagen.py" ###################### # Clang-Format Check # diff --git a/sw/blas/axpy/data/datagen.py b/sw/blas/axpy/data/datagen.py index f7ae7a6488..3f48e348d7 100755 --- a/sw/blas/axpy/data/datagen.py +++ b/sw/blas/axpy/data/datagen.py @@ -11,8 +11,8 @@ import os sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) -from data_utils import format_scalar_definition, format_vector_definition, \ - format_vector_declaration, format_ifdef_wrapper # noqa: E402 +from data_utils import format_scalar_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 MIN = -1000 MAX = +1000 @@ -47,16 +47,16 @@ def main(): a = np.random.uniform(MIN, MAX, 1) x = np.random.uniform(MIN, MAX, length) y = np.random.uniform(MIN, MAX, length) - z = np.zeros(length) g = golden_model(a, x, y) # Format header file l_str = format_scalar_definition('const uint32_t', 'l', length) a_str = format_scalar_definition('const double', 'a', a[0]) - x_str = format_vector_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section) - y_str = format_vector_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section) - z_str = format_vector_declaration('double', 'z', z, alignment=BURST_ALIGNMENT, section=section) - g_str = format_vector_definition('double', 'g', g) + x_str = format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section) + y_str = format_array_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section) + z_str = format_array_declaration('double', 'z', [length], + alignment=BURST_ALIGNMENT, section=section) + g_str = format_array_definition('double', 'g', g) 
g_str = format_ifdef_wrapper('BIST', g_str) f_str = '\n\n'.join([l_str, a_str, x_str, y_str, z_str, g_str]) f_str += '\n' diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py index 0ccab83817..45a008eff2 100755 --- a/sw/blas/gemm/data/datagen.py +++ b/sw/blas/gemm/data/datagen.py @@ -15,7 +15,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) from data_utils import emit_license, format_scalar_definition, \ - format_vector_definition, format_ifdef_wrapper # noqa: E402 + format_array_definition, format_ifdef_wrapper # noqa: E402 np.random.seed(42) @@ -100,18 +100,18 @@ def emit_header(**kwargs): data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])] data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)] data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] if kwargs['prec'] == 8: - result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten()) + result_def = format_array_definition(C_TYPES['64'], 'result', result.flatten()) else: - result_def = format_vector_definition(C_TYPES[str(kwargs['prec'])], - 'result', - result.flatten()) + result_def = format_array_definition(C_TYPES[str(kwargs['prec'])], + 'result', + result.flatten()) data_str += [format_ifdef_wrapper('BIST', result_def)] data_str = '\n\n'.join(data_str) diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h index baab570478..ea2c865636 100644 --- a/sw/blas/gemm/src/gemm.h +++ b/sw/blas/gemm/src/gemm.h @@ -24,7 +24,6 @@ typedef char v8f8 __attribute__((vector_size(8))); dump_float(gemm, 8); dump_uint(index, 9); - void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A, uint32_t ldA, uint32_t ta, double* B, uint32_t ldB, uint32_t tb, double* C, uint32_t ldC, double BETA) { @@ -74,24 +73,23 @@ void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A, } /* params: -* M: number of rows of A and C -* N: number of columns of B and C -* K: number of columns of A and rows of B -* A: pointer to matrix A -* ldA: row stride of A -* ta: transpose A -* B: pointer to matrix B -* ldB: row stride of B -* tb: transpose B -* C: pointer to matrix C -* ldC: row stride of C -* ALPHA: scalar alpha -* A is MxK, B is KxN, C is MxN -*/ + * M: number of rows of A and C + * N: number of columns of B and C + * K: number of columns of A and rows of B + * A: pointer to matrix A + * ldA: row stride of A + * ta: transpose A + * B: pointer to matrix B + * ldB: row stride of B + * tb: transpose B + * C: pointer to matrix C + * ldC: row stride of C + * ALPHA: scalar alpha + * A is MxK, B is KxN, C is MxN + */ void gemm_fp32_baseline(uint32_t M, uint32_t N, uint32_t K, float* A, uint32_t ldA, uint32_t ta, float* B, uint32_t ldB, uint32_t tb, float* C, uint32_t ldC, float ALPHA) { - // float c0, c1, c2, c3 = 0; 
float c0 = 0.0f; float c1 = 0.0f; @@ -110,7 +108,7 @@ void gemm_fp32_baseline(uint32_t M, uint32_t N, uint32_t K, float* A, c1 = 0.0f; c2 = 0.0f; c3 = 0.0f; - for (uint32_t k = 0; k < K; k+=4) { + for (uint32_t k = 0; k < K; k += 4) { c0 += A[(k + 0) + m * ldA] * B[(k + 0) * ldB + n]; c1 += A[(k + 1) + m * ldA] * B[(k + 1) * ldB + n]; c2 += A[(k + 2) + m * ldA] * B[(k + 2) * ldB + n]; @@ -131,7 +129,7 @@ void gemm_fp32_baseline(uint32_t M, uint32_t N, uint32_t K, float* A, c1 = 0.0f; c2 = 0.0f; c3 = 0.0f; - for (uint32_t k = 0; k < K; k+=4) { + for (uint32_t k = 0; k < K; k += 4) { c0 += A[(k + 0) * M * ldA + m * ldA] * B[(k + 0) * ldB + n]; c1 += A[(k + 1) * M * ldA + m * ldA] * B[(k + 1) * ldB + n]; c2 += A[(k + 2) * M * ldA + m * ldA] * B[(k + 2) * ldB + n]; @@ -152,7 +150,7 @@ void gemm_fp32_baseline(uint32_t M, uint32_t N, uint32_t K, float* A, c1 = 0.0f; c2 = 0.0f; c3 = 0.0f; - for (uint32_t k = 0; k < K; k+=4) { + for (uint32_t k = 0; k < K; k += 4) { // c0 += A[k + m * ldA] * B[k + n * ldB]; c0 += A[(k + 0) + m * ldA] * B[(k + 0) + n * ldB]; c1 += A[(k + 1) + m * ldA] * B[(k + 1) + n * ldB]; diff --git a/sw/dnn/.gitignore b/sw/dnn/.gitignore new file mode 100644 index 0000000000..aed262ca8f --- /dev/null +++ b/sw/dnn/.gitignore @@ -0,0 +1 @@ +*/data/data.h diff --git a/sw/dnn/batchnorm/data/datagen.py b/sw/dnn/batchnorm/data/datagen.py new file mode 100755 index 0000000000..8dd0b1de73 --- /dev/null +++ b/sw/dnn/batchnorm/data/datagen.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Tim Fischer +# Viviane Potocnik +# Luca Colagrande + +import argparse +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import emit_license, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. 
To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+PRECISION_T = {
+    '64': 'FP64',
+    '32': 'FP32',
+    '16': 'FP16',
+    '8': 'FP8'
+}
+
+
+def golden_model(ifmap):
+    n, ci, ih, iw = ifmap.shape
+    bn = torch.nn.BatchNorm2d(ci)
+    bn.weight.requires_grad = False
+    bn.bias.requires_grad = False
+    running_mean = torch.randn_like(bn.running_mean, requires_grad=False)
+    running_var = torch.rand_like(bn.running_var, requires_grad=False)
+    gamma = bn.weight / torch.sqrt(running_var + bn.eps)
+    beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps)
+    ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1)
+    return ofmap, gamma, beta
+
+
+def emit_header(**kwargs):
+
+    in_channels = kwargs['input_dim']['channels']
+    in_height = kwargs['input_dim']['height']
+    in_width = kwargs['input_dim']['width']
+    tile_ci = kwargs['tile_ci']
+    prec = str(kwargs['prec'])
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap = torch.randn(1, in_channels, in_height, in_width, requires_grad=False, dtype=torch_type)
+    ofmap, gamma, beta = golden_model(ifmap)
+
+    # convert from CHW to HWC format
+    ifmap = ifmap.permute(0, 2, 3, 1)
+    ofmap = ofmap.permute(0, 2, 3, 1)
+
+    n, ih, iw, ci = ifmap.shape
+
+    ifmap_uid = 'ifmap'
+    ofmap_uid = 'ofmap'
+    beta_uid = 'beta'
+    gamma_uid = 'gamma'
+
+    layer_cfg = {
+        'CI': ci,
+        'IH': ih,
+        'IW': iw,
+        'TILE_CI': tile_ci,
+        'ifmap': ifmap_uid,
+        'ofmap': ofmap_uid,
+        'beta': beta_uid,
+        'gamma': gamma_uid
+    }
+
+    data_str = [emit_license()]
+    # Array forward declarations
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape)]
+    data_str += [format_array_declaration(ctype, beta_uid, beta.shape)]
+    data_str += [format_array_declaration(ctype, gamma_uid, gamma.shape)]
+    # Layer struct
+    data_str += [format_struct_definition('batchnorm_layer_t', 'layer', layer_cfg)]
+    # Array definitions
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap)]
+    data_str += [format_array_definition(ctype, beta_uid, beta)]
+    data_str += [format_array_definition(ctype, gamma_uid, gamma)]
+    # Golden results for BIST
+    result_def = format_array_definition(ctype, 'golden', ofmap)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for batchnorm kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/batchnorm/src/params.hjson b/sw/dnn/batchnorm/data/params.hjson
similarity index 66%
rename from target/snitch_cluster/sw/apps/dnn/batchnorm/src/params.hjson
rename to sw/dnn/batchnorm/data/params.hjson
index b8d774d0b8..34645f93e4 100644
--- a/target/snitch_cluster/sw/apps/dnn/batchnorm/src/params.hjson
+++ 
b/sw/dnn/batchnorm/data/params.hjson @@ -2,17 +2,12 @@ // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 -// Parameters for a single BatchNorm layer - { - kernel: "BatchNorm" - channels: { - out: 32, - in: 32 - } input_dim: { + channels: 32 height: 8, width: 8 } + tile_ci: 32 prec: 64 } diff --git a/sw/dnn/src/batchnorm.h b/sw/dnn/batchnorm/src/batchnorm.h similarity index 82% rename from sw/dnn/src/batchnorm.h rename to sw/dnn/batchnorm/src/batchnorm.h index f1e8460646..4c8b5adc10 100644 --- a/sw/dnn/src/batchnorm.h +++ b/sw/dnn/batchnorm/src/batchnorm.h @@ -4,6 +4,18 @@ #include "snrt.h" +typedef struct { + uint32_t CI; + uint32_t IH; + uint32_t IW; + uint32_t TILE_CI; + double *ifmap; + double *ofmap; + double *gamma; + double *beta; + precision_t dtype; +} batchnorm_layer_t; + /** * @brief implementation of a FP64 batchnorm as a linear combination * y = gamma * x + beta @@ -50,12 +62,17 @@ static inline void batchnorm_fp64(double *ifmap, double *gamma, double *beta, snrt_ssr_disable(); } -static inline void batchnorm_layer(const conv_layer *l) { +static inline void batchnorm_layer(const batchnorm_layer_t *l) { const uint32_t cluster_num = snrt_cluster_num(); const uint32_t cluster_id = snrt_cluster_idx(); const uint32_t compute_num = snrt_cluster_compute_core_num(); const uint32_t compute_id = snrt_cluster_core_idx(); + // Calculate output dimensions + uint32_t OH = l->IH; + uint32_t OW = l->IW; + uint32_t CO = l->CI; + // Each cluster loads one tile of a row uint32_t ifmap_size = 2 * l->IW * l->TILE_CI; uint32_t weights_size = l->CI; @@ -78,7 +95,7 @@ static inline void batchnorm_layer(const conv_layer *l) { uint32_t prev_ow; uint32_t prev_ci; - for (uint32_t oh = cluster_id; oh < l->OH; oh += cluster_num) { + for (uint32_t oh = cluster_id; oh < OH; oh += cluster_num) { for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) { if (snrt_is_dm_core()) { // Load weights once in the beginning @@ -112,16 +129,15 @@ static inline void batchnorm_layer(const conv_layer *l) { if (!(oh == cluster_id && ci == 0)) { if (l->TILE_CI == l->CI) { // data is stored consecutively - snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI], + snrt_dma_start_1d(&l->ofmap[prev_oh * OW * l->CI], &ofmap[!read_buf * (ofmap_size / 2)], sizeof(double) * l->IW * l->CI); } else { // data is stored in interleaved layout snrt_dma_start_2d( - &l->ofmap[prev_oh * l->OW * l->CI + - prev_ci], /* dst */ - &ofmap[!read_buf * (ofmap_size / 2)], /* src */ - sizeof(double) * l->TILE_CI, /* size */ + &l->ofmap[prev_oh * OW * l->CI + prev_ci], /* dst */ + &ofmap[!read_buf * (ofmap_size / 2)], /* src */ + sizeof(double) * l->TILE_CI, /* size */ sizeof(double) * l->CI, /* dst_stride */ sizeof(double) * l->TILE_CI, /* src_stride */ l->IW); /* repetitions */ @@ -146,7 +162,7 @@ static inline void batchnorm_layer(const conv_layer *l) { batchnorm_fp64(&ifmap[read_buf * ofmap_size / 2 + compute_id], &gamma[ci + compute_id], &beta[ci + compute_id], &ofmap[write_buf * ofmap_size / 2 + compute_id], - l->OW, l->TILE_CI, compute_num, setup_SSR); + OW, l->TILE_CI, compute_num, setup_SSR); write_buf = !write_buf; read_buf = !read_buf; @@ -160,18 +176,18 @@ static inline void batchnorm_layer(const conv_layer *l) { if (snrt_is_dm_core()) { if (l->TILE_CI == l->CI) { // data is stored consecutively - snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI], + snrt_dma_start_1d(&l->ofmap[prev_oh * OW * l->CI], &ofmap[!read_buf * (ofmap_size / 2)], sizeof(double) * l->IW * l->CI); } else { // data is 
stored in interleaved layout snrt_dma_start_2d( - &l->ofmap[prev_oh * l->OW * l->CI + prev_ci], /* dst */ - &ofmap[!read_buf * (ofmap_size / 2)], /* src */ - sizeof(double) * l->TILE_CI, /* size */ - sizeof(double) * l->CI, /* dst_stride */ - sizeof(double) * l->TILE_CI, /* src_stride */ - l->IW); /* repetitions */ + &l->ofmap[prev_oh * OW * l->CI + prev_ci], /* dst */ + &ofmap[!read_buf * (ofmap_size / 2)], /* src */ + sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->CI, /* dst_stride */ + sizeof(double) * l->TILE_CI, /* src_stride */ + l->IW); /* repetitions */ } snrt_dma_wait_all(); diff --git a/sw/dnn/batchnorm/src/main.c b/sw/dnn/batchnorm/src/main.c new file mode 100644 index 0000000000..789d3dd547 --- /dev/null +++ b/sw/dnn/batchnorm/src/main.c @@ -0,0 +1,15 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "dnn.h" + +#include "data.h" + +int main() { + batchnorm_layer(&layer); + + snrt_global_barrier(); + + return 0; +} diff --git a/sw/dnn/common.mk b/sw/dnn/common.mk new file mode 100644 index 0000000000..c6630d7652 --- /dev/null +++ b/sw/dnn/common.mk @@ -0,0 +1,31 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +# Usage of absolute paths is required to externally include this Makefile +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) + +DATA_DIR := $(realpath $(MK_DIR)/$(APP)/data) +SRC_DIR := $(realpath $(MK_DIR)/$(APP)/src) +COMMON_SRC_DIR := $(realpath $(MK_DIR)/src) + +DATA_CFG ?= $(DATA_DIR)/params.hjson +SECTION ?= + +SRCS ?= $(realpath $(SRC_DIR)/main.c) +INCDIRS ?= $(DATA_DIR) $(SRC_DIR) $(COMMON_SRC_DIR) + +DATAGEN_PY := $(DATA_DIR)/datagen.py +DATA_H := $(DATA_DIR)/data.h + +$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) + $< -c $(DATA_CFG) --section="$(SECTION)" $@ + +.PHONY: clean-data clean + +clean-data: + rm -f $(DATA_H) + +clean: clean-data diff --git a/sw/dnn/src/conv2d.h b/sw/dnn/conv2d/conv2d.h similarity index 100% rename from sw/dnn/src/conv2d.h rename to sw/dnn/conv2d/conv2d.h diff --git a/sw/dnn/gelu/data/datagen.py b/sw/dnn/gelu/data/datagen.py new file mode 100755 index 0000000000..25d72b1055 --- /dev/null +++ b/sw/dnn/gelu/data/datagen.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Tim Fischer +# Viviane Potocnik +# Luca Colagrande + +import argparse +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import emit_license, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. 
To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+PRECISION_T = {
+    '64': 'FP64',
+    '32': 'FP32',
+    '16': 'FP16',
+    '8': 'FP8'
+}
+
+
+def golden_model(ifmap):
+    gelu = torch.nn.GELU()
+    return gelu(ifmap)
+
+
+def emit_header(**kwargs):
+
+    batch_size = kwargs['input_dim']['batch_size']
+    seq_len = kwargs['input_dim']['seq_len']
+    hidden_nodes = kwargs['input_dim']['hidden_nodes']
+    prec = str(kwargs['prec'])
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap = torch.randn(batch_size, seq_len, hidden_nodes, requires_grad=False, dtype=torch_type)
+    ofmap = golden_model(ifmap)
+
+    ifmap_uid = 'ifmap'
+    ofmap_uid = 'ofmap'
+
+    layer_cfg = {
+        'batch_size': batch_size,
+        'seq_len': seq_len,
+        'hidden_nodes': hidden_nodes,
+        'ifmap': ifmap_uid,
+        'ofmap': ofmap_uid,
+        'dtype': PRECISION_T[prec]
+    }
+
+    data_str = [emit_license()]
+    # Array forward declarations
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape)]
+    # Layer struct
+    data_str += [format_struct_definition('gelu_layer_t', 'layer', layer_cfg)]
+    # Array definitions
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap)]
+    # Golden results for BIST
+    result_def = format_array_definition(ctype, 'golden', ofmap)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for gelu kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/gelu/src/params.hjson b/sw/dnn/gelu/data/params.hjson
similarity index 76%
rename from target/snitch_cluster/sw/apps/dnn/gelu/src/params.hjson
rename to sw/dnn/gelu/data/params.hjson
index 6d4c2fe7c8..b290c78e5d 100644
--- a/target/snitch_cluster/sw/apps/dnn/gelu/src/params.hjson
+++ b/sw/dnn/gelu/data/params.hjson
@@ -2,13 +2,10 @@
 // Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51 -// Parameters for a single GELU layer - { - kernel: "GELU" input_dim: { batch_size: 3, - seq_len:8, + seq_len: 8, hidden_nodes: 4 } prec: 32 diff --git a/sw/dnn/src/gelu.h b/sw/dnn/gelu/src/gelu.h similarity index 77% rename from sw/dnn/src/gelu.h rename to sw/dnn/gelu/src/gelu.h index cb903a64ae..83e4516a34 100644 --- a/sw/dnn/src/gelu.h +++ b/sw/dnn/gelu/src/gelu.h @@ -6,34 +6,28 @@ #include "math.h" #include "snrt.h" -#include "utils.h" /** * @struct gelu_layer_struct * @brief This structure contains all parameters necessary * for computing the GELU activation function - * @var gelu_layer_struct::BATCH_SIZE + * @var gelu_layer_struct::batch_size * Size of each input sample - * @var gelu_layer_struct::SEQ_LEN + * @var gelu_layer_struct::seq_len * Size of each output sample - * @var gelu_layer_struct::HIDDEN_NODES + * @var gelu_layer_struct::hidden_nodes * Number of hidden dimensions * @var gelu_layer_struct::ifmap * Pointer to input feature map * @var gelu_layer_struct::ofmap * Pointer to output feature map - * @var gelu_layer_struct::result - * Pointer to the golden model output */ typedef struct gelu_layer_struct { - uint32_t BATCH_SIZE; - uint32_t SEQ_LEN; - uint32_t HIDDEN_NODES; - + uint32_t batch_size; + uint32_t seq_len; + uint32_t hidden_nodes; float *ifmap; float *ofmap; - float *result; - precision_t dtype; } gelu_layer_t; @@ -70,7 +64,7 @@ static inline void gelu_fp32(float *input, float *output, int32_t ldI, /** * @brief GELU layer * - * @param l gelu_layer struct that holds addresses and parameters + * @param l gelu_layer_t struct that holds addresses and parameters * */ static inline void gelu_layer(const gelu_layer_t *l) { @@ -80,7 +74,7 @@ static inline void gelu_layer(const gelu_layer_t *l) { uint32_t compute_id = snrt_cluster_compute_core_num(); uint32_t ifmap_size = - l->BATCH_SIZE * l->SEQ_LEN * l->HIDDEN_NODES * sizeof(float); + l->batch_size * l->seq_len * l->hidden_nodes * sizeof(float); uint32_t ofmap_size = ifmap_size; void *ptr = (float *)snrt_l1_next(); @@ -92,9 +86,9 @@ static inline void gelu_layer(const gelu_layer_t *l) { // DMA transfer the ifmap into the cluster TCDM if (snrt_is_dm_core()) { snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( - ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float), - l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float), - l->SEQ_LEN * l->HIDDEN_NODES * sizeof(float)); + ifmap, l->ifmap, l->batch_size * sizeof(float), + l->batch_size * sizeof(float), l->batch_size * sizeof(float), + l->seq_len * l->hidden_nodes * sizeof(float)); snrt_dma_wait_all(); } @@ -103,23 +97,23 @@ static inline void gelu_layer(const gelu_layer_t *l) { if (snrt_is_compute_core()) { // determine the row offset for each core - int32_t row_offset = compute_id * l->HIDDEN_NODES; + int32_t row_offset = compute_id * l->hidden_nodes; // determine the row stride of each matrix - int32_t ldI = compute_num * l->HIDDEN_NODES; + int32_t ldI = compute_num * l->hidden_nodes; // determine the batch offset for each core - int32_t batch_offset = l->SEQ_LEN * l->HIDDEN_NODES; + int32_t batch_offset = l->seq_len * l->hidden_nodes; // printf("row_offset: %d, ldI: %d\n", row_offset, ldI); - for (int b = 0; b < l->BATCH_SIZE; b++) { + for (int b = 0; b < l->batch_size; b++) { // if (compute_id == 1) { // printf("BATCH: %d\n", b); // } gelu_fp32(&ifmap[row_offset + b * batch_offset], - &ofmap[row_offset + b * batch_offset], ldI, l->BATCH_SIZE, - l->SEQ_LEN / 8, l->HIDDEN_NODES); + &ofmap[row_offset + b * batch_offset], ldI, l->batch_size, 
+ l->seq_len / 8, l->hidden_nodes); } snrt_cluster_hw_barrier(); diff --git a/sw/dnn/gelu/src/main.c b/sw/dnn/gelu/src/main.c new file mode 100644 index 0000000000..3e8c742cf6 --- /dev/null +++ b/sw/dnn/gelu/src/main.c @@ -0,0 +1,9 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "dnn.h" + +#include "data.h" + +int main() { gelu_layer(&layer); } \ No newline at end of file diff --git a/sw/dnn/gemm/data/datagen.py b/sw/dnn/gemm/data/datagen.py new file mode 100755 index 0000000000..e4338e72c1 --- /dev/null +++ b/sw/dnn/gemm/data/datagen.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Tim Fischer +# Viviane Potocnik +# Luca Colagrande + +import argparse +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import emit_license, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. To minimize +# the occurrence of these splits the data should be aligned to 4KB +BURST_ALIGNMENT = 4096 + +PRECISION_T = { + '64': 'FP64', + '32': 'FP32', + '16': 'FP16', + '8': 'FP8' +} + + +def rand_data_generator(shape, prec, alt=False): + if prec == '64': + return torch.randn(shape, requires_grad=False, dtype=torch.float64), {} + elif prec == '32': + return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} + elif prec == '16': + if alt: + return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + else: + return torch.randn(shape, requires_grad=False, dtype=torch.float16), {} + elif prec == '8': + sign = torch.randint(0, 2, shape, + requires_grad=False, dtype=torch.uint8) # -1 or 1 + exponent = torch.randint(0, 16, shape, + requires_grad=False, dtype=torch.uint8) # < 0b01111 + mantissa = torch.randint(0, 4, shape, + requires_grad=False, dtype=torch.uint8) # can be arbitrary + bits = {'sign': sign, 'exponent': exponent, 'mantissa': mantissa} + # TODO: not actually correct + sign_val = (-1.0)**sign.double() + exp_val = (2.0**(exponent.double()-15.0)) + man_val = (1.0 + mantissa.double() / (2**2)) + val = sign_val*exp_val*man_val + return val, bits + + +def golden_model(alpha, A, B, C): + return alpha * C + torch.matmul(A, B) + + +def emit_header(**kwargs): + + M = kwargs['M'] + N = kwargs['N'] + K = kwargs['K'] + alpha = kwargs['alpha'] + expand = kwargs['expand'] + transpose_A = kwargs['transpose_A'] + transpose_B = kwargs['transpose_B'] + prec = str(kwargs['prec']) + + mat_A, bits_A = rand_data_generator((M, K), prec) + mat_B, bits_B = rand_data_generator((K, N), prec) + mat_C, bits_C = rand_data_generator((M, N), prec) + + result = golden_model(alpha, mat_A, mat_B, mat_C) + + if transpose_A: + mat_A = mat_A.T + if transpose_B: + mat_B = mat_B.T + + ctype = data_utils.floating_point_ctype(prec) + + A_uid = 'A' + B_uid = 'B' + C_uid = 'C' + + layer_cfg = { + 'M': M, + 'N': N, + 'K': K, + 'TA': int(transpose_A), + 'TB': int(transpose_B), + 'ALPHA': alpha, + 'expand': expand, + 'dtype': PRECISION_T[prec], + 'A': A_uid, + 'B': B_uid, + 'C': C_uid + } + + data_str = [emit_license()] + # Array 
forward declarations
+    data_str += [format_array_declaration(ctype, A_uid, mat_A.shape)]
+    data_str += [format_array_declaration(ctype, B_uid, mat_B.shape)]
+    data_str += [format_array_declaration(ctype, C_uid, mat_C.shape)]
+    # Layer struct
+    data_str += [format_struct_definition('gemm_layer_t', 'layer', layer_cfg)]
+    # Array definitions
+    if prec == 'FP8':
+        data_str += [format_array_definition(ctype, A_uid, bits_A)]
+        data_str += [format_array_definition(ctype, B_uid, bits_B)]
+        data_str += [format_array_definition(ctype, C_uid, bits_C)]
+    else:
+        data_str += [format_array_definition(ctype, A_uid, mat_A)]
+        data_str += [format_array_definition(ctype, B_uid, mat_B)]
+        data_str += [format_array_definition(ctype, C_uid, mat_C)]
+    # Golden results for BIST
+    result_def = format_array_definition(ctype, 'checksum', torch.sum(result, dim=-1))
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for gemm kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/gemm/src/params.hjson b/sw/dnn/gemm/data/params.hjson
similarity index 86%
rename from target/snitch_cluster/sw/apps/dnn/gemm/src/params.hjson
rename to sw/dnn/gemm/data/params.hjson
index e3b54c274a..ce1506ae66 100644
--- a/target/snitch_cluster/sw/apps/dnn/gemm/src/params.hjson
+++ b/sw/dnn/gemm/data/params.hjson
@@ -2,10 +2,7 @@
 // Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51 -// Parameters for a GEMM - { - kernel: "GEMM" M: 16, N: 16, K: 16, diff --git a/sw/dnn/src/gemm.h b/sw/dnn/gemm/src/gemm.h similarity index 96% rename from sw/dnn/src/gemm.h rename to sw/dnn/gemm/src/gemm.h index dd71c7dafd..cf2b2949e0 100644 --- a/sw/dnn/src/gemm.h +++ b/sw/dnn/gemm/src/gemm.h @@ -54,12 +54,12 @@ typedef struct gemm_layer_struct { uint32_t TILE_N; uint32_t TILE_K; - double *A; - double *B; - double *C; + void *A; + void *B; + void *C; uint32_t ALPHA; precision_t dtype; uint32_t expand; -} gemm_layer; +} gemm_layer_t; diff --git a/target/snitch_cluster/sw/apps/dnn/gemm/src/gemm.c b/sw/dnn/gemm/src/main.c similarity index 93% rename from target/snitch_cluster/sw/apps/dnn/gemm/src/gemm.c rename to sw/dnn/gemm/src/main.c index 5a709613eb..a94b0247ed 100644 --- a/target/snitch_cluster/sw/apps/dnn/gemm/src/gemm.c +++ b/sw/dnn/gemm/src/main.c @@ -5,7 +5,9 @@ // SW testbench for profiling GEMM kernels in different // floating point precisions (fp64, fp32, fp16), as well as // different memory layouts for matrices (transposed/not-transposed) -// Correctness of results are checked automatically + +// TODO(colluca): Add IPC verification and remove +#define BIST #include "dnn.h" #include "snrt.h" @@ -21,16 +23,10 @@ // banking conflicts in the beginning #define MAT_PADDING 0 -#define CHECK_RESULT - void *share_ptr; int main() { - gemm_l.A = (void *)gemm_A_dram; - gemm_l.B = (void *)gemm_B_dram; - gemm_l.C = (void *)gemm_C_dram; - - const gemm_layer l1_gemm_l = gemm_l; + const gemm_layer_t l1_gemm_l = layer; const uint32_t cluster_num = snrt_cluster_num(); const uint32_t cluster_id = snrt_cluster_idx(); @@ -163,39 +159,39 @@ int main() { } snrt_cluster_hw_barrier(); -#ifdef CHECK_RESULT +#ifdef BIST if (compute_id == 0) { if (l1_gemm_l.dtype == FP64) { for (uint32_t m = 0; m < l1_gemm_l.M; m++) { - double checksum = gemm_checksum[m]; + double check = checksum[m]; double sum = 0.0; for (uint32_t n = 0; n < l1_gemm_l.N; n++) { sum += ((double *)mat_C)[m * l1_gemm_l.N + n]; } - if (fabs(sum - checksum) > 0.001) { + if (fabs(sum - check) > 0.001) { errors += l1_gemm_l.N; } } } else if (l1_gemm_l.dtype == FP32) { for (uint32_t m = 0; m < l1_gemm_l.M; m++) { - float checksum = gemm_checksum[m]; + float check = checksum[m]; float sum = 0.0; for (uint32_t n = 0; n < l1_gemm_l.N; n++) { sum += ((float *)mat_C)[m * l1_gemm_l.N + n]; } - if (fabs(sum - checksum) > 0.001) { + if (fabs(sum - check) > 0.001) { errors += l1_gemm_l.N; } } } else if (l1_gemm_l.dtype == FP16) { for (uint32_t m = 0; m < l1_gemm_l.M; m++) { - __fp16 checksum = gemm_checksum[m]; + __fp16 check = checksum[m]; float sum = 0.0; for (uint32_t n = 0; n < l1_gemm_l.N; n++) { sum += ((__fp16 *)mat_C)[m * l1_gemm_l.N + n]; } - if (fabs(sum - checksum) > 0.05) { + if (fabs(sum - check) > 0.05) { errors += l1_gemm_l.N; } } diff --git a/sw/dnn/layernorm/data/datagen.py b/sw/dnn/layernorm/data/datagen.py new file mode 100755 index 0000000000..d1af8ea364 --- /dev/null +++ b/sw/dnn/layernorm/data/datagen.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Tim Fischer +# Viviane Potocnik +# Luca Colagrande + +import argparse +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import emit_license, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. To minimize +# the occurrence of these splits the data should be aligned to 4KB +BURST_ALIGNMENT = 4096 + +PRECISION_T = { + '64': 'FP64', + '32': 'FP32', + '16': 'FP16', + '8': 'FP8' +} + + +def golden_model(ifmap, eps, shape, prec): + ln = torch.nn.LayerNorm(shape, eps=eps) + return ln(ifmap) + + +def emit_header(**kwargs): + batch_size = kwargs['input_dim']['batch_size'] + seq_len = kwargs['input_dim']['seq_len'] + embeddings = kwargs['input_dim']['embeddings'] + eps = kwargs['eps'] + prec = str(kwargs['prec']) + n_tiles = kwargs['n_tiles'] + + assert (seq_len % n_tiles) == 0, 'Input dimension is not an integer multiple of tile size' + + torch_type = data_utils.floating_point_torch_type(prec) + ifmap = torch.randn(batch_size, seq_len, embeddings, requires_grad=False, dtype=torch_type) + + ofmap = golden_model(ifmap, eps, embeddings, prec) + ofmap = ofmap.detach().numpy() + + ctype = data_utils.floating_point_ctype(prec) + + ifmap_uid = 'ifmap' + ofmap_uid = 'ofmap' + + layer_cfg = { + **kwargs['input_dim'], + 'n_tiles': n_tiles, + 'ifmap': ifmap_uid, + 'ofmap': ofmap_uid, + 'eps': eps, + 'dtype': PRECISION_T[prec] + } + + data_str = [emit_license()] + data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape, + alignment=BURST_ALIGNMENT)] + data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape, + alignment=BURST_ALIGNMENT)] + data_str += [format_struct_definition('layernorm_layer_t', 'layer', layer_cfg)] + data_str += [format_array_definition(ctype, ifmap_uid, ifmap, + alignment=BURST_ALIGNMENT)] + result_def = format_array_definition(ctype, 'golden', ofmap, alignment=BURST_ALIGNMENT) + data_str += [format_ifdef_wrapper('BIST', result_def)] + data_str = '\n\n'.join(data_str) + + return data_str + + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for layernorm kernel') + parser.add_argument( + "-c", "--cfg", + type=pathlib.Path, + required=True, + help='Select param config file kernel' + ) + parser.add_argument( + '--section', + type=str, + help='Section to store matrices in') + parser.add_argument( + 'output', + type=pathlib.Path, + help='Path of the output header file') + args = parser.parse_args() + + # Load param config file + with args.cfg.open() as f: + param = hjson.loads(f.read()) + param['section'] = args.section + + # Emit header file + with open(args.output, 'w') as f: + f.write(emit_header(**param)) + + +if __name__ == '__main__': + main() diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson b/sw/dnn/layernorm/data/params.hjson similarity index 52% rename from target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson rename to sw/dnn/layernorm/data/params.hjson index a9e3fca54a..7b60b21349 100644 --- a/target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson +++ b/sw/dnn/layernorm/data/params.hjson @@ -1,16 +1,14 @@ -// Copyright 2020 ETH Zurich and University of Bologna. +// Copyright 2023 ETH Zurich and University of Bologna. 
 // Solderpad Hardware License, Version 0.51, see LICENSE for details.
 // SPDX-License-Identifier: SHL-0.51
 
-// Parameters for a single LayerNorm layer
-
 {
-    kernel: "LayerNorm"
     input_dim: {
-        batch_size: 1,
-        seq_len: 32,
+        batch_size: 2,
+        seq_len: 64,
         embeddings: 32
     }
     eps: 1e-5
     prec: 32
+    n_tiles: 2
 }
\ No newline at end of file
diff --git a/sw/dnn/layernorm/layout.csv b/sw/dnn/layernorm/layout.csv
new file mode 100644
index 0000000000..9fd0970bec
--- /dev/null
+++ b/sw/dnn/layernorm/layout.csv
@@ -0,0 +1,3 @@
+ , setup, dma in, compute tile, dma out, dma in, compute tile, dma out
+"range(0,8)", 1, , 3, , , 5,
+8 , 1, 2, , 4, 5, , 7
diff --git a/sw/dnn/layernorm/src/layernorm.h b/sw/dnn/layernorm/src/layernorm.h
new file mode 100644
index 0000000000..f5af22f457
--- /dev/null
+++ b/sw/dnn/layernorm/src/layernorm.h
@@ -0,0 +1,177 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+#include "snrt.h"
+#include "dnn.h"
+
+/**
+ * @struct layernorm_layer_struct
+ * @brief This structure contains all parameters necessary
+ *        for computing a LayerNorm layer
+ * @var layernorm_layer_struct::batch_size
+ * Number of samples in the batch
+ * @var layernorm_layer_struct::seq_len
+ * Sequence length of each sample
+ * @var layernorm_layer_struct::embeddings
+ * Number of embedding dimensions
+ * @var layernorm_layer_struct::n_tiles
+ * Number of tiles to split the data into
+ * @var layernorm_layer_struct::ifmap
+ * Pointer to input feature map
+ * @var layernorm_layer_struct::ofmap
+ * Pointer to output feature map
+ */
+typedef struct layernorm_layer_struct {
+    uint32_t batch_size;
+    uint32_t seq_len;
+    uint32_t embeddings;
+    uint32_t n_tiles;
+    float eps;
+    void *ifmap;
+    void *ofmap;
+    precision_t dtype;
+} layernorm_layer_t;
+
+/**
+ * Single-cluster implementation of a layernorm tile (data assumed in TCDM)
+ */
+static inline void layernorm_fp32(float *input, float *output,
+                                  int32_t batch_size, int32_t seq_len,
+                                  int32_t embeddings, float eps) {
+    if (snrt_is_compute_core()) {
+        // Get parameters for every core's tile
+        // offset: offset between data accessed by every core (for
+        //         corresponding iterations)
+        // stride: offset between data accessed by the same core in
+        //         consecutive iterations
+        // tile_seq_len: fraction of the sequence assigned to each core
+        uint32_t offset = snrt_cluster_core_idx() * embeddings;
+        uint32_t stride = snrt_cluster_compute_core_num() * embeddings;
+        uint32_t tile_seq_len = seq_len / snrt_cluster_compute_core_num();
+        float *core_itile = input + offset;
+        float *core_otile = output + offset;
+
+        // get derived layernorm quantities
+        uint32_t batch_offset = seq_len * embeddings;
+
+        // compute the mean and variance along the last dimension
+        float mean = 0.0;  // mean of the current row
+        float var = 0.0;   // variance of the current row
+        for (int32_t b = 0; b < batch_size; b++) {
+            for (int32_t s = 0; s < tile_seq_len; s++) {
+                mean = 0.0;
+                var = 0.0;
+
+                for (int32_t i = 0; i < embeddings; i++) {
+                    mean += core_itile[b * batch_offset + s * stride + i];
+                }
+                mean /= embeddings;
+
+                for (int32_t i = 0; i < embeddings; i++) {
+                    var +=
+                        (core_itile[b * batch_offset + s * stride + i] - mean) *
+                        (core_itile[b * batch_offset + s * stride + i] - mean);
+                }
+                var /= embeddings;
+
+                // compute the shifted value of the current row
+                for (int32_t i = 0; i < 
embeddings; i++) { + core_otile[b * batch_offset + s * stride + i] = + (core_itile[b * batch_offset + s * stride + i] - mean) / + sqrtf(var + eps); + } + } + } + + snrt_fpu_fence(); + } +} + +// /** +// * Implementation of the LayerNorm layer for the Transformer model for FP64. +// */ +// static inline void transformer_layernorm_fp64(double *input, int32_t ldI, +// int32_t seq_len, int32_t +// embeddings, int32_t eps) { +// layernorm_fp64(input, input, ldI, 0, 1, seq_len, embeddings, eps); +// } + +// /** +// * Implementation of the LayerNorm layer for the Transformer model for FP32. +// */ +// static inline void transformer_layernorm_fp32(float *input, int32_t ldI, +// int32_t seq_len, int32_t +// embeddings, int32_t eps) { +// layernorm_fp32(input, input, ldI, 0, 1, seq_len, embeddings, eps); +// } + +// Tiles the seq_len axis +static inline void layernorm_layer(layernorm_layer_t l) { + snrt_mcycle(); + + // Compute the tiling parameters + uint32_t n_tiles = l.n_tiles; + uint32_t tile_seq_len = l.seq_len / n_tiles; + uint32_t tile_size = l.batch_size * tile_seq_len * l.embeddings; + uint32_t tile_offset = tile_seq_len * l.embeddings; + + // Allocate space for arrays in TCDM + float *local_itile = (float *)snrt_l1_next(); + float *local_otile = local_itile + tile_size; + + // Get pointers to arrays in DRAM + float *remote_ifmap = (float *)l.ifmap; + float *remote_ofmap = (float *)l.ofmap; + + // Iterate tiles + snrt_mcycle(); + for (int tile_idx = 0; tile_idx < n_tiles; tile_idx++) { + // Copy input tile + if (snrt_is_dm_core()) { + float *remote_itile = remote_ifmap + tile_idx * tile_offset; + snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( + local_itile, /* dst */ + remote_itile, /* src */ + tile_seq_len * l.embeddings * sizeof(float), /* size */ + tile_seq_len * l.embeddings * sizeof(float), /* dst_stride */ + l.seq_len * l.embeddings * sizeof(float), /* src_stride */ + l.batch_size /* repetitions */ + ); + snrt_dma_wait_all(); + snrt_mcycle(); + } + + snrt_cluster_hw_barrier(); + + // Compute layernorm tile + if (snrt_is_compute_core()) snrt_mcycle(); + layernorm_fp32(local_itile, local_otile, l.batch_size, tile_seq_len, + l.embeddings, l.eps); + if (snrt_is_compute_core()) snrt_mcycle(); + + snrt_cluster_hw_barrier(); + + // DMA transfer the ofmap to DRAM + if (snrt_is_dm_core()) { + snrt_mcycle(); + float *remote_otile = remote_ofmap + tile_idx * tile_offset; + snrt_dma_txid_t txid_ofmap = snrt_dma_start_2d( + remote_otile, /* dst */ + local_otile, /* src */ + tile_seq_len * l.embeddings * sizeof(float), /* size */ + l.seq_len * l.embeddings * sizeof(float), /* dst_stride */ + tile_seq_len * l.embeddings * sizeof(float), /* src_stride */ + l.batch_size /* repetitions */ + ); + snrt_dma_wait_all(); + snrt_mcycle(); + } + } + + snrt_global_barrier(); +} diff --git a/sw/dnn/layernorm/src/main.c b/sw/dnn/layernorm/src/main.c new file mode 100644 index 0000000000..21fea7cf56 --- /dev/null +++ b/sw/dnn/layernorm/src/main.c @@ -0,0 +1,14 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#include "dnn.h" + +#include "data.h" + +int main() { + layernorm_layer(layer); + return 0; +} diff --git a/sw/dnn/layernorm/verify.py b/sw/dnn/layernorm/verify.py new file mode 100755 index 0000000000..f7ac49e30e --- /dev/null +++ b/sw/dnn/layernorm/verify.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. 
+# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import sys +from pathlib import Path +import numpy as np +import torch +from data.datagen import golden_model + +sys.path.append(str(Path(__file__).parent / '../../../util/sim/')) +import verification # noqa: E402 +from elf import Elf # noqa: E402 +from data_utils import bytes_to_float, bytes_to_struct # noqa: E402 + + +ERR_THRESHOLD = 0.001 + +PRECISION_T = { + 8: '64', + 4: '32', + 2: '16', + 1: '8' +} + +NUMPY_T = { + '64': np.float64, + '32': np.float32, + '16': np.float16 +} + + +def main(): + # Run simulation and get outputs + args = verification.parse_args() + raw_results = verification.simulate(sim_bin=args.sim_bin, + snitch_bin=args.snitch_bin, + symbols_bin=args.symbols_bin, + log=args.log, + output_uids=['ofmap']) + + # Extract input operands from ELF file + if args.symbols_bin: + elf = Elf(args.symbols_bin) + else: + elf = Elf(args.snitch_bin) + + layer_struct = { + 'batch_size': 'I', + 'seq_len': 'I', + 'embeddings': 'I', + 'n_tiles': 'I', + 'eps': 'f', + 'ifmap_ptr': 'I', + 'ofmap_ptr': 'I', + 'dtype': 'I' + } + layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct) + batch_size = layer['batch_size'] + seq_len = layer['seq_len'] + embeddings = layer['embeddings'] + eps = layer['eps'] + prec = PRECISION_T[layer['dtype']] + + ifmap = np.array(bytes_to_float(elf.get_symbol_contents('ifmap'), prec), dtype=NUMPY_T[prec]) + ifmap = ifmap.reshape(batch_size, seq_len, embeddings) + ifmap = torch.from_numpy(ifmap) + + # Verify results + ofmap_actual = np.array(bytes_to_float(raw_results['ofmap'], prec), dtype=NUMPY_T[prec]) + ofmap_golden = golden_model(ifmap, eps, embeddings, prec).detach().numpy().flatten() + + absolute_err = np.absolute(ofmap_golden - ofmap_actual) + fail = np.any(absolute_err > ERR_THRESHOLD) + if (fail): + verification.dump_results_to_csv([ofmap_golden, ofmap_actual, absolute_err], + Path.cwd() / 'layernorm_results.csv') + + return int(fail) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/sw/dnn/linear/data/datagen.py b/sw/dnn/linear/data/datagen.py new file mode 100755 index 0000000000..67c7934a7e --- /dev/null +++ b/sw/dnn/linear/data/datagen.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Tim Fischer +# Viviane Potocnik +# Luca Colagrande + +import argparse +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import emit_license, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. 
To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+PRECISION_T = {
+    '64': 'FP64',
+    '32': 'FP32',
+    '16': 'FP16',
+    '8': 'FP8'
+}
+
+
+def golden_model(ifmap, weights, bias):
+    ifmap = ifmap.flatten(1)
+    return torch.matmul(ifmap, weights.T) + bias
+
+
+def emit_header(**kwargs):
+
+    out_channels = kwargs['channels']['out']
+    in_height = kwargs['input_dim']['height']
+    in_width = kwargs['input_dim']['width']
+    prec = str(kwargs['prec'])
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap = torch.randn(in_height, in_width, requires_grad=False, dtype=torch_type)
+    weights = torch.randn(out_channels, in_width, requires_grad=False, dtype=torch_type)
+    bias = torch.randn(out_channels, requires_grad=False, dtype=torch_type)
+    ofmap = golden_model(ifmap, weights, bias)
+
+    ch, ci = ifmap.shape
+    _, co = ofmap.shape
+
+    ifmap_uid = 'ifmap'
+    weights_uid = 'weights'
+    bias_uid = 'bias'
+    ofmap_uid = 'ofmap'
+
+    layer_cfg = {
+        'CO': co,
+        'CI': ci,
+        'CH': ch,
+        'CW': ci,
+        'ifmap': ifmap_uid,
+        'weights': weights_uid,
+        'bias': bias_uid,
+        'ofmap': ofmap_uid
+    }
+
+    data_str = [emit_license()]
+    # Array forward declarations
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape)]
+    data_str += [format_array_declaration(ctype, weights_uid, weights.shape)]
+    data_str += [format_array_declaration(ctype, bias_uid, bias.shape)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape)]
+    # Layer struct
+    data_str += [format_struct_definition('linear_layer_t', 'layer', layer_cfg)]
+    # Array definitions
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap)]
+    data_str += [format_array_definition(ctype, weights_uid, weights)]
+    data_str += [format_array_definition(ctype, bias_uid, bias)]
+    # Golden results for BIST
+    result_def = format_array_definition(ctype, 'golden', ofmap)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for linear kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/linear/src/params.hjson b/sw/dnn/linear/data/params.hjson
similarity index 81%
rename from target/snitch_cluster/sw/apps/dnn/linear/src/params.hjson
rename to sw/dnn/linear/data/params.hjson
index 00b5bda648..8b52bfdbfa 100644
--- a/target/snitch_cluster/sw/apps/dnn/linear/src/params.hjson
+++ b/sw/dnn/linear/data/params.hjson
@@ -2,10 +2,7 @@
 // Solderpad Hardware License, Version 0.51, see LICENSE for details.
 // SPDX-License-Identifier: SHL-0.51
 
-// Parameters for a single linear layer
-
 {
-    kernel: "Linear"
     channels: {
         out: 16,
     }
diff --git a/sw/dnn/src/linear.h b/sw/dnn/linear/src/linear.h
similarity index 80%
rename from sw/dnn/src/linear.h
rename to sw/dnn/linear/src/linear.h
index 3b1b7c3f76..82d174fcc9 100644
--- a/sw/dnn/src/linear.h
+++ b/sw/dnn/linear/src/linear.h
@@ -36,7 +36,6 @@ typedef struct linear_layer_struct {
     float *weights;
     float *bias;
     float *ofmap;
-    float *result;
     precision_t dtype;
 } linear_layer_t;
 
@@ -127,33 +126,4 @@ static inline void linear_layer(const linear_layer_t *l) {
     }
 
     snrt_cluster_hw_barrier();
-
-    if (snrt_is_dm_core()) {
-        snrt_dma_txid_t txid_result = snrt_dma_start_2d(
-            result, l->result, l->CH * sizeof(float), l->CH * sizeof(float),
-            l->CH * sizeof(float), l->CO * sizeof(float));
-        snrt_dma_wait_all();
-    }
-
-    snrt_cluster_hw_barrier();
-
-    // TODO: fix this, wrong values for ofmap printed
-    if (compute_id == 0) {
-        // compare result with ofmap
-        float tolerance = 1e-6;
-        int error = 0;
-        for (int i = 0; i < l->CH; i++) {
-            for (int j = 0; j < l->CO; j++) {
-                if (result[i * l->CO + j] - ofmap[i * l->CO + j] > tolerance) {
-                    printf(
-                        "MISMATCH: result[%d][%d] = %f, ofmap[%d][%d] = %f\n",
-                        i, j, result[i * l->CO + j], i, j,
-                        ofmap[i * l->CO + j]);
-                    error += 1;
-                }
-            }
-        }
-
-        printf("[%d/%d] mismatches\n", error, l->CH * l->CO);
-    }
 }
diff --git a/sw/dnn/linear/src/main.c b/sw/dnn/linear/src/main.c
new file mode 100644
index 0000000000..8a10396a1f
--- /dev/null
+++ b/sw/dnn/linear/src/main.c
@@ -0,0 +1,39 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// TODO(colluca): add IPC test and remove this flag
+#define BIST
+
+#include "dnn.h"
+
+#include "data.h"
+
+int main() {
+    linear_layer(&layer);
+
+#ifdef BIST
+    // TODO: fix this, wrong values for ofmap printed
+    if (snrt_global_core_idx() == 0) {
+        // compare result with ofmap
+        uint32_t n_results = layer.CH * layer.CO;
+        uint32_t n_errors = n_results;
+        float tolerance = 1e-6;
+        for (int i = 0; i < layer.CH; i++) {
+            for (int j = 0; j < layer.CO; j++) {
+                if (fabsf(golden[i * layer.CO + j] -
+                          ofmap[i * layer.CO + j]) > tolerance) {
+                    printf(
+                        "MISMATCH: golden[%d][%d] = %f, ofmap[%d][%d] = %f\n",
+                        i, j, golden[i * layer.CO + j], i, j,
+                        ofmap[i * layer.CO + j]);
+                } else {
+                    n_errors--;
+                }
+            }
+        }
+        printf("[%d/%d] mismatches\n", n_errors, n_results);
+        return n_errors;
+    }
+#endif
+}
\ No newline at end of file
diff --git a/sw/dnn/maxpool/data/datagen.py b/sw/dnn/maxpool/data/datagen.py
new file mode 100755
index 0000000000..818930090d
--- /dev/null
+++ b/sw/dnn/maxpool/data/datagen.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Tim Fischer
+# Viviane Potocnik
+# Luca Colagrande
+
+import argparse
+import pathlib
+import hjson
+import sys
+import os
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+import data_utils  # noqa: E402
+from data_utils import emit_license, \
+    format_struct_definition, format_array_definition, \
+    format_array_declaration, format_ifdef_wrapper  # noqa: E402
+
+torch.manual_seed(42)
+
+# AXI splits bursts crossing 4KB address boundaries. 
+BURST_ALIGNMENT = 4096
+
+PRECISION_T = {
+    '64': 'FP64',
+    '32': 'FP32',
+    '16': 'FP16',
+    '8': 'FP8'
+}
+
+
+def golden_model(ifmap, kernel):
+    max_pool = torch.nn.MaxPool2d(kernel_size=kernel)
+    return max_pool(ifmap)
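+
+
+# Note: torch.nn.MaxPool2d defaults to stride == kernel_size, so the
+# pooling windows do not overlap and the output is IH // kernel_size by
+# IW // kernel_size per channel.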
+
+
+def emit_header(**kwargs):
+
+    in_channels = kwargs['channels']['in']
+    in_height = kwargs['input_dim']['height']
+    in_width = kwargs['input_dim']['width']
+    kernel_size = kwargs['kernel_size']
+    tile_ci = kwargs['tile_ci']
+    prec = str(kwargs['prec'])
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap = torch.randn(1, in_channels, in_height, in_width, requires_grad=False, dtype=torch_type)
+    ofmap = golden_model(ifmap, kernel_size)
+
+    # convert from CHW to HWC format
+    ifmap = ifmap.permute(0, 2, 3, 1)
+    ofmap = ofmap.permute(0, 2, 3, 1)
+
+    n, ih, iw, ci = ifmap.shape
+    _, oh, ow, co = ofmap.shape
+
+    ifmap_uid = 'ifmap'
+    ofmap_uid = 'ofmap'
+
+    layer_cfg = {
+        'CO': co,
+        'CI': ci,
+        'IH': ih,
+        'IW': iw,
+        'OH': oh,
+        'OW': ow,
+        'FH': kernel_size,
+        'FW': kernel_size,
+        'tile_ci': tile_ci,
+        'ifmap': ifmap_uid,
+        'ofmap': ofmap_uid
+    }
+
+    data_str = [emit_license()]
+    # Array forward declarations
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape)]
+    # Layer struct
+    data_str += [format_struct_definition('maxpool_layer_t', 'layer', layer_cfg)]
+    # Array definitions
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap)]
+    # Golden results for BIST
+    result_def = format_array_definition(ctype, 'golden', ofmap)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for maxpool kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Path to the hjson file with the kernel parameters'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/maxpool/src/params.hjson b/sw/dnn/maxpool/data/params.hjson
similarity index 83%
rename from target/snitch_cluster/sw/apps/dnn/maxpool/src/params.hjson
rename to sw/dnn/maxpool/data/params.hjson
index 1826a9f571..c81bdb0a53 100644
--- a/target/snitch_cluster/sw/apps/dnn/maxpool/src/params.hjson
+++ b/sw/dnn/maxpool/data/params.hjson
@@ -2,10 +2,7 @@
 // Solderpad Hardware License, Version 0.51, see LICENSE for details.
 // SPDX-License-Identifier: SHL-0.51

-// Parameters for a single Conv2d layer
-
 {
-    kernel: "MaxPool"
     channels: {
         out: 32,
         in: 32
     }
@@ -15,5 +12,6 @@
         width: 8
     }
     kernel_size: 2
+    tile_ci: 32
     prec: 64
 }
diff --git a/sw/dnn/maxpool/src/main.c b/sw/dnn/maxpool/src/main.c
new file mode 100644
index 0000000000..0205c8b13b
--- /dev/null
+++ b/sw/dnn/maxpool/src/main.c
@@ -0,0 +1,17 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// SW testbench for profiling the MaxPool layer.
+// NOTE: results are not checked yet; the BIST golden output is unused.
+
+#include "dnn.h"
+
+#include "data.h"
+
+int main() {
+    maxpool_layer(&layer);
+
+    snrt_global_barrier();
+    return 0;
+}
diff --git a/sw/dnn/src/maxpool.h b/sw/dnn/maxpool/src/maxpool.h
similarity index 68%
rename from sw/dnn/src/maxpool.h
rename to sw/dnn/maxpool/src/maxpool.h
index ad86782f5c..afba2a074a 100644
--- a/sw/dnn/src/maxpool.h
+++ b/sw/dnn/maxpool/src/maxpool.h
@@ -4,6 +4,50 @@

 #include "snrt.h"

+/**
+ * @struct maxpool_layer_struct
+ * @brief This structure contains all parameters necessary for the
+ * MaxPool layer
+ * @var maxpool_layer_struct::CO
+ * Number of output channels
+ * @var maxpool_layer_struct::CI
+ * Number of input channels
+ * @var maxpool_layer_struct::IH
+ * Height of input feature map
+ * @var maxpool_layer_struct::IW
+ * Width of input feature map
+ * @var maxpool_layer_struct::OH
+ * Height of output feature map
+ * @var maxpool_layer_struct::OW
+ * Width of output feature map
+ * @var maxpool_layer_struct::FH
+ * Height of filter
+ * @var maxpool_layer_struct::FW
+ * Width of filter
+ * @var maxpool_layer_struct::ifmap
+ * Pointer to input feature map
+ * @var maxpool_layer_struct::ofmap
+ * Pointer to output feature map
+ * @var maxpool_layer_struct::tile_ci
+ * Tiling factor of input channel
+ * @var maxpool_layer_struct::dtype
+ * Precision of MaxPool layer
+ */
+typedef struct maxpool_layer_struct {
+    uint32_t CO;
+    uint32_t CI;
+    uint32_t IH;
+    uint32_t IW;
+    uint32_t OH;
+    uint32_t OW;
+    uint32_t FH;
+    uint32_t FW;
+    uint32_t tile_ci;
+    double *ifmap;
+    double *ofmap;
+    precision_t dtype;
+} maxpool_layer_t;
+
 /**
  * @brief implementation of FP64 maxpooling
  *
@@ -30,15 +74,15 @@ static inline void maxpool_fp64(double *ifmap, double *ofmap, uint32_t CI,
         }
     }
 }

-static inline void maxpool_layer(const conv_layer *l) {
+static inline void maxpool_layer(const maxpool_layer_t *l) {
     uint32_t cluster_num = snrt_cluster_num();
     uint32_t cluster_id = snrt_cluster_idx();
     uint32_t compute_num = snrt_cluster_compute_core_num();
     uint32_t compute_id = snrt_global_core_idx();

     // Each cluster loads one tile of kernel size
-    uint32_t ifmap_size = 2 * l->FH * l->FW * l->TILE_CI;
-    uint32_t ofmap_size = 2 * l->TILE_CI;
+    uint32_t ifmap_size = 2 * l->FH * l->FW * l->tile_ci;
+    uint32_t ofmap_size = 2 * l->tile_ci;

     double *ptr = (double *)snrt_l1_next();
     double *ifmap = ptr;
@@ -56,29 +100,29 @@
     // tiles are distributed across clusters
     for (uint32_t tile = cluster_id; tile < l->OH * l->OW;
          tile += cluster_num) {
-        for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) {
+        for (uint32_t ci = 0; ci < l->CI; ci += l->tile_ci) {
             uint32_t oh = tile / l->OW;
             uint32_t ow = tile % l->OW;

             if (snrt_is_dm_core()) {
                 for (uint32_t fh = 0; fh < l->FH; fh++) {
-                    if (l->TILE_CI == l->CI) {
+                    if (l->tile_ci == l->CI) {
                         snrt_dma_start_1d(
                             &ifmap[write_buf * (ifmap_size / 2) +
-                                   fh * l->FW * l->TILE_CI], /* dst */
+                                   fh * l->FW * l->tile_ci], /* dst */
                             &l->ifmap[((oh * l->FH + fh) * l->IW + ow * l->FW) *
                                       l->CI], /* src */
-                            sizeof(double) * l->TILE_CI * l->FW /* size */);
+                            sizeof(double) * l->tile_ci * l->FW /* size */);
                     } else {
                         // printf("bubu\n");
                         snrt_dma_start_2d(
                             &ifmap[write_buf * (ifmap_size / 2) +
-                                   fh * l->FW * l->TILE_CI], /* dst */
+                                   fh * l->FW * l->tile_ci], /* dst */
                             &l->ifmap[((oh * l->FH + fh) * l->IW + ow * l->FW) *
                                           l->CI +
                                       ci], /* src */
-                            sizeof(double) * l->TILE_CI, /* size */
-                            sizeof(double) * l->TILE_CI,
/* dst_stride */ + sizeof(double) * l->tile_ci, /* size */ + sizeof(double) * l->tile_ci, /* dst_stride */ sizeof(double) * l->CI, /* src_stride */ l->FW /* repetitions */); } @@ -93,9 +137,9 @@ static inline void maxpool_layer(const conv_layer *l) { &l->ofmap[(prev_oh * l->OW + prev_ow) * l->CI + prev_ci], /* dst */ &ofmap[!read_buf * (ofmap_size / 2)], /* src */ - sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->tile_ci, /* size */ sizeof(double) * l->CI, /* dst_stride */ - sizeof(double) * l->TILE_CI, /* src_stride */ + sizeof(double) * l->tile_ci, /* src_stride */ 1 /* repetitions */); } @@ -113,7 +157,7 @@ static inline void maxpool_layer(const conv_layer *l) { maxpool_fp64(&ifmap[read_buf * ifmap_size / 2 + compute_id], &ofmap[write_buf * ofmap_size / 2 + compute_id], - l->TILE_CI, l->FH, l->FW, compute_num); + l->tile_ci, l->FH, l->FW, compute_num); write_buf = !write_buf; read_buf = !read_buf; @@ -127,9 +171,9 @@ static inline void maxpool_layer(const conv_layer *l) { snrt_dma_start_2d( &l->ofmap[(prev_oh * l->OW + prev_ow) * l->CI + prev_ci], /* dst */ &ofmap[!read_buf * (ofmap_size / 2)], /* src */ - sizeof(double) * l->TILE_CI, /* size */ + sizeof(double) * l->tile_ci, /* size */ sizeof(double) * l->CI, /* dst_stride */ - sizeof(double) * l->TILE_CI, /* src_stride */ + sizeof(double) * l->tile_ci, /* src_stride */ 1 /* repetitions */); snrt_dma_wait_all(); } diff --git a/sw/dnn/src/softmax.h b/sw/dnn/softmax/softmax.h similarity index 100% rename from sw/dnn/src/softmax.h rename to sw/dnn/softmax/softmax.h diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h index 537f488cd9..d1d190a968 100644 --- a/sw/dnn/src/dnn.h +++ b/sw/dnn/src/dnn.h @@ -194,16 +194,13 @@ typedef struct network_single_cluster_t_ { } network_single_cluster_t; -// Must be included before batchnorm since the batchnorm layer -// uses the conv_layer struct. This is bad design. -// TODO Fix this, union types should be preferred -#include "conv2d.h" - -#include "batchnorm.h" -#include "gelu.h" -#include "gemm.h" -#include "layernorm.h" -#include "linear.h" -#include "maxpool.h" -#include "softmax.h" -#include "utils.h" +// #include "conv2d.h" + +#include "../batchnorm/src/batchnorm.h" +#include "../gelu/src/gelu.h" +#include "../gemm/src/gemm.h" +#include "../layernorm/src/layernorm.h" +#include "../linear/src/linear.h" +#include "../maxpool/src/maxpool.h" +// #include "softmax.h" +// #include "utils.h" diff --git a/sw/dnn/src/layernorm.h b/sw/dnn/src/layernorm.h deleted file mode 100644 index b875303146..0000000000 --- a/sw/dnn/src/layernorm.h +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "math.h" -#include "snrt.h" -// #include "printf.h" -#include "utils.h" - -// add dump function for layernorm -dump_float(ln, 5); - -/** - * @struct layernorm_layer_struct - * @brief This structure contains all parameters necessary - * for computing the LayerNorm activation function - * @var layernorm_layer_struct::BATCH_SIZE - * Size of each input sample - * @var layernorm_layer_struct::SEQ_LEN - * Size of each output sample - * @var layernorm_layer_struct::EMBEDDINGS - * Number of hidden dimensions - * @var layernorm_layer_struct::ifmap - * Pointer to input feature map - * @var layernorm_layer_struct::ofmap - * Pointer to output feature map - * @var layernorm_layer_struct::result - * Pointer to the golden model output - */ -typedef struct layernorm_layer_struct { - uint32_t BATCH_SIZE; - uint32_t SEQ_LEN; - uint32_t EMBEDDINGS; - uint32_t EPS; - - float *ifmap; - float *ofmap; - float *result; - - precision_t dtype; -} layernorm_layer_t; - -/** - * Implementation of the LayerNorm layer. - */ -static inline void layernorm_fp32(float *input, float *output, int32_t ldI, - int32_t batch_offset, int32_t batch_size, - int32_t seq_len, int32_t embeddings, - int32_t eps) { - float mean = 0.0; // max value of the current core - float var = 0.0; // sum of the exp values of the current core - - uint32_t compute_id = snrt_global_core_idx(); - uint32_t num_cores = snrt_cluster_compute_core_num(); - - // compute the mean and variance along the last dimension - for (int32_t b = 0; b < batch_size; b++) { - for (int32_t s = 0; s < seq_len; s++) { - mean = 0.0; - var = 0.0; - - for (int32_t i = 0; i < embeddings; i++) { - mean += input[b * batch_offset + s * ldI + i]; - } - mean /= embeddings; - - // printf("mean[%d] = %f\n", b, mean); - - for (int32_t i = 0; i < embeddings; i++) { - var += (input[b * batch_offset + s * ldI + i] - mean) * - (input[b * batch_offset + s * ldI + i] - mean); - } - var /= embeddings; - - // printf("var[%d] = %f\n", b, var); - - // compute the shifted value of the current row - for (int32_t i = 0; i < embeddings; i++) { - output[b * batch_offset + s * ldI + i] = - (input[b * batch_offset + s * ldI + i] - mean) / - sqrtf(var + eps); - // printf("output[%d][%d][%d] = %f\n", b, s + compute_id, i, - // output[b * batch_offset + s * ldI + i]); - } - } - } - - snrt_cluster_hw_barrier(); -} - -/** - * Implementation of the LayerNorm layer for the Transformer model for FP64. 
- */ -static inline void transformer_layernorm_fp64(double *input, int32_t ldI, - int32_t seq_len, int32_t embeddings, - int32_t eps) { - double mean = 0.0; // max value of the current core - double var = 0.0; // sum of the exp values of the current core - - uint32_t compute_id = snrt_global_core_idx(); - uint32_t num_cores = snrt_cluster_compute_core_num(); - - for (int32_t s = 0; s < seq_len; s++) { - mean = 0.0; - var = 0.0; - - for (int32_t i = 0; i < embeddings; i++) { - mean += input[s * ldI + i]; - } - mean /= embeddings; - - // printf("mean[%d] = %f\n", b, mean); - - for (int32_t i = 0; i < embeddings; i++) { - var += (input[s * ldI + i] - mean) * - (input[s * ldI + i] - mean); - } - var /= embeddings; - - // printf("var[%d] = %f\n", b, var); - - // compute the shifted value of the current row - for (int32_t i = 0; i < embeddings; i++) { - input[s * ldI + i] = - (input[s * ldI + i] - mean) / - sqrtf(var + eps); - // printf("output[%d][%d][%d] = %f\n", b, s + compute_id, i, - // output[s * ldI + i]); - // dump_ln(input[s * ldI + i]); - } - } - - snrt_cluster_hw_barrier(); -} - - -/** - * Implementation of the LayerNorm layer for the Transformer model for FP32. - */ -static inline void transformer_layernorm_fp32(float *input, int32_t ldI, - int32_t seq_len, int32_t embeddings, - int32_t eps) { - float mean = 0.0; // max value of the current core - float var = 0.0; // sum of the exp values of the current core - - uint32_t compute_id = snrt_global_core_idx(); - uint32_t num_cores = snrt_cluster_compute_core_num(); - - for (int32_t s = 0; s < seq_len; s++) { - mean = 0.0; - var = 0.0; - - for (int32_t i = 0; i < embeddings; i++) { - mean += input[s * ldI + i]; - } - mean /= embeddings; - - // printf("mean[%d] = %f\n", b, mean); - - for (int32_t i = 0; i < embeddings; i++) { - var += (input[s * ldI + i] - mean) * - (input[s * ldI + i] - mean); - } - var /= embeddings; - - // printf("var[%d] = %f\n", b, var); - - // compute the shifted value of the current row - for (int32_t i = 0; i < embeddings; i++) { - input[s * ldI + i] = - (input[s * ldI + i] - mean) / - sqrtf(var + eps); - // printf("output[%d][%d][%d] = %f\n", b, s + compute_id, i, - // output[s * ldI + i]); - // dump_ln(input[s * ldI + i]); - } - } - - snrt_cluster_hw_barrier(); -} - -/** - * @brief layernorm layer - * - * @param l layernorm_layer struct that holds addresses and parameters - * - */ -static inline void layernorm_layer(const layernorm_layer_t *l) { - uint32_t cluster_num = snrt_cluster_num(); - uint32_t cluster_id = snrt_cluster_idx(); - uint32_t compute_num = snrt_cluster_compute_core_num(); - uint32_t compute_id = snrt_global_core_idx(); - - uint32_t ifmap_size = - l->BATCH_SIZE * l->SEQ_LEN * l->EMBEDDINGS * sizeof(float); - uint32_t ofmap_size = ifmap_size; - - void *ptr = (float *)snrt_l1_next(); - float *ifmap = ptr; - ptr += ifmap_size; - float *ofmap = ptr; - ptr += ofmap_size; - - // DMA transfer the ifmap into the cluster TCDM - if (snrt_is_dm_core()) { - snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( - ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float), - l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float), - l->SEQ_LEN * l->EMBEDDINGS * sizeof(float)); - - snrt_dma_wait_all(); - } - - snrt_cluster_hw_barrier(); - - if (snrt_is_compute_core()) { - // determine the row offset for each core - int32_t row_offset = compute_id * l->EMBEDDINGS; - - // determine the row stride of each matrix - int32_t ldI = compute_num * l->EMBEDDINGS; - - // determine the batch offset for each core - int32_t batch_offset 
= l->SEQ_LEN * l->EMBEDDINGS; - - // printf("row_offset: %d, ldI: %d\n", row_offset, ldI); - layernorm_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, - batch_offset, l->BATCH_SIZE, l->SEQ_LEN / 8, - l->EMBEDDINGS, l->EPS); - - } else { - snrt_cluster_hw_barrier(); - } - - snrt_global_barrier(); -} \ No newline at end of file diff --git a/target/snitch_cluster/Makefile b/target/snitch_cluster/Makefile index 7b38bbad6a..44bdc2b6dc 100644 --- a/target/snitch_cluster/Makefile +++ b/target/snitch_cluster/Makefile @@ -56,7 +56,7 @@ VLT_AR = ${VLT_BUILDDIR}/Vtestharness__ALL.a # (LRU) config, all targets depending on the configuration file have # to be rebuilt. This file is used to express this condition as a # prerequisite for other rules. -DEFAULT_CFG = cfg/default.hjson +DEFAULT_CFG = cfg/divsqrt.hjson CFG = cfg/lru.hjson ##################### diff --git a/target/snitch_cluster/cfg/divsqrt.hjson b/target/snitch_cluster/cfg/divsqrt.hjson new file mode 100644 index 0000000000..f4b63104bf --- /dev/null +++ b/target/snitch_cluster/cfg/divsqrt.hjson @@ -0,0 +1,127 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Cluster configuration for a simple testbench system. +{ + nr_s1_quadrant: 1, + s1_quadrant: { + nr_clusters: 1, + }, + + cluster: { + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x1000_0000 + cluster_base_offset: 0, // 0x0 + cluster_base_hartid: 0, + addr_width: 48, + data_width: 64, + tcdm: { + size: 128, + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + dma_data_width: 512, + dma_axi_req_fifo_depth: 3, + dma_req_fifo_depth: 3, + // Timing parameters + timing: { + lat_comp_fp32: 3, + lat_comp_fp64: 3, + lat_comp_fp16: 2, + lat_comp_fp16_alt: 2, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 1, + lat_sdotp: 2, + fpu_pipe_config: "BEFORE" + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. + register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ] + }, + dram: { + // 0x8000_0000 + address: 2147483648, + // 0x8000_0000 + length: 2147483648 + }, + peripherals: { + clint: { + // 0xffff_0000 + address: 4294901760, + // 0x0000_1000 + length: 4096 + }, + }, + // Templates. 
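+    // These templates are instantiated in the hives section above via
+    // "$ref" pointers (eight compute cores and one DMA core per hive),
+    // so editing a template updates every core that references it.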
+ compute_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // Enable division/square root unit + Xdiv_sqrt: true, + }, + dma_core_template: { + isa: "rv32imafd", + Xdiv_sqrt: true, + # isa: "rv32ema", + xdma: true + xssr: false + xfrep: false + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } +} diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile index 0410fb1cb4..e5d8c8be5e 100644 --- a/target/snitch_cluster/sw/apps/Makefile +++ b/target/snitch_cluster/sw/apps/Makefile @@ -6,18 +6,18 @@ SUBDIRS = lto SUBDIRS += nop -SUBDIRS += transformer +# SUBDIRS += transformer SUBDIRS += blas/axpy SUBDIRS += blas/gemm SUBDIRS += dnn/batchnorm -SUBDIRS += dnn/conv2d -SUBDIRS += dnn/fusedconv +# SUBDIRS += dnn/conv2d +# SUBDIRS += dnn/fusedconv SUBDIRS += dnn/gelu SUBDIRS += dnn/gemm SUBDIRS += dnn/layernorm SUBDIRS += dnn/linear SUBDIRS += dnn/maxpool -SUBDIRS += dnn/softmax +# SUBDIRS += dnn/softmax SUBDIRS += montecarlo/pi_estimation .PHONY: all clean $(SUBDIRS) diff --git a/target/snitch_cluster/sw/apps/common.mk b/target/snitch_cluster/sw/apps/common.mk index 8e1950860e..94eb35236a 100644 --- a/target/snitch_cluster/sw/apps/common.mk +++ b/target/snitch_cluster/sw/apps/common.mk @@ -37,6 +37,7 @@ INCDIRS += $(SNRT_DIR)/api INCDIRS += $(SNRT_DIR)/api/omp INCDIRS += $(SNRT_DIR)/src INCDIRS += $(SNRT_DIR)/src/omp +INCDIRS += $(ROOT)/sw/blas INCDIRS += $(ROOT)/sw/deps/riscv-opcodes INCDIRS += $(ROOT)/sw/math/include diff --git a/target/snitch_cluster/sw/apps/dnn/batchnorm/Makefile b/target/snitch_cluster/sw/apps/dnn/batchnorm/Makefile index e5521ec799..f84fccea61 100644 --- a/target/snitch_cluster/sw/apps/dnn/batchnorm/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/batchnorm/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = batchnorm +APP ?= batchnorm -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk -$(DEP): $(DATA_H) +$(DEP): $(DATA_H) \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/batchnorm/src/batchnorm.c b/target/snitch_cluster/sw/apps/dnn/batchnorm/src/batchnorm.c deleted file mode 100644 index 07eb0502b2..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/batchnorm/src/batchnorm.c +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// SW testbench for profiling BatchNorm Layer -// Automatically checks the correctness of the results - -#include "dnn.h" -#include "snrt.h" - -#include "data.h" - -int main() { - batchnorm_l.ifmap = (double *)batchnorm_ifmap_dram; - batchnorm_l.ofmap = (double *)batchnorm_ofmap_dram; - batchnorm_l.gamma = (double *)batchnorm_gamma_dram; - batchnorm_l.beta = (double *)batchnorm_beta_dram; - batchnorm_l.TILE_CI = 32; - - batchnorm_layer(&batchnorm_l); - - snrt_global_barrier(); - - // TODO: fix check layer implementation to avoid DRAM overriding by other - // cores uint32_t errors = check_layer(&batchnorm_l, (double - // *)batchnorm_checksum); - - snrt_global_barrier(); - - return 0; - - // return errors; -} diff --git a/target/snitch_cluster/sw/apps/dnn/gelu/Makefile b/target/snitch_cluster/sw/apps/dnn/gelu/Makefile index fae8b36817..4225a8cd89 100644 --- a/target/snitch_cluster/sw/apps/dnn/gelu/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/gelu/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = gelu +APP ?= gelu -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk -$(DEP): $(DATA_H) +$(DEP): $(DATA_H) \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/gelu/src/gelu.c b/target/snitch_cluster/sw/apps/dnn/gelu/src/gelu.c deleted file mode 100644 index e2e4471920..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/gelu/src/gelu.c +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// SW testbench for profiling linear kernels in different -// floating point precisions (fp64, fp32, fp16), as well as -// different memory layouts for matrices (transposed/not-transposed) -// Correctness of results are checked automatically - -#include "dnn.h" -#include "snrt.h" - -#include "data.h" - -int main() { - gelu_l.ifmap = (float*)gelu_ifmap_dram; - // gelu_l.result = (float*)gelu_ofmap_dram; - - // checksum = (float*)gelu_checksum; - - gelu_layer(&gelu_l); - - // uint32_t error = check_gelu_layer(&linear_l, (float*)linear_checksum); - - return 0; -} \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/gemm/Makefile b/target/snitch_cluster/sw/apps/dnn/gemm/Makefile index 0a821adefc..48a31215e5 100644 --- a/target/snitch_cluster/sw/apps/dnn/gemm/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/gemm/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = gemm +APP ?= gemm -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk -$(DEP): $(DATA_H) +$(DEP): $(DATA_H) \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile b/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile index 87fa026c70..f8df5a08ac 100644 --- a/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = layernorm +APP ?= layernorm -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk $(DEP): $(DATA_H) diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c b/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c deleted file mode 100644 index fa776940f6..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// SW testbench for profiling linear kernels in different -// floating point precisions (fp64, fp32, fp16), as well as -// different memory layouts for matrices (transposed/not-transposed) -// Correctness of results are checked automatically - -#include "dnn.h" -#include "snrt.h" - -#include "data.h" - -int main() { - layernorm_l.ifmap = (float*)layernorm_ifmap_dram; - layernorm_l.result = (float*)layernorm_ofmap_dram; - - // checksum = (float*)layernorm_checksum; - - // printf("Starting layernorm layer\n"); - - layernorm_layer(&layernorm_l); - - // uint32_t error = check_layernorm_layer(&linear_l, - // (float*)linear_checksum); - - return 0; -} \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/linear/Makefile b/target/snitch_cluster/sw/apps/dnn/linear/Makefile index 2e14e33dfe..7b43893846 100644 --- a/target/snitch_cluster/sw/apps/dnn/linear/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/linear/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = linear +APP ?= linear -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk -$(DEP): $(DATA_H) +$(DEP): $(DATA_H) \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/linear/src/linear.c b/target/snitch_cluster/sw/apps/dnn/linear/src/linear.c deleted file mode 100644 index 7135d30ff3..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/linear/src/linear.c +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// SW testbench for profiling linear kernels in different -// floating point precisions (fp64, fp32, fp16), as well as -// different memory layouts for matrices (transposed/not-transposed) -// Correctness of results are checked automatically - -#include "dnn.h" -#include "snrt.h" - -#include "data.h" - -int main() { - linear_l.ifmap = (float*)linear_ifmap_dram; - linear_l.weights = (float*)linear_weights_dram; - linear_l.bias = (float*)linear_bias_dram; - linear_l.result = (float*)linear_ofmap_dram; - - linear_layer(&linear_l); - - // uint32_t error = check_linear_layer(&linear_l, (float*)linear_checksum); - - return 0; -} \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/dnn/maxpool/Makefile b/target/snitch_cluster/sw/apps/dnn/maxpool/Makefile index fd01d270b4..e83838ca4e 100644 --- a/target/snitch_cluster/sw/apps/dnn/maxpool/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/maxpool/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
 # SPDX-License-Identifier: Apache-2.0
 #
-# Gianna Paulin
+# Luca Colagrande

-APP = maxpool
+APP ?= maxpool

-include ../Makefile
+include ../../../../../../sw/dnn/common.mk
 include ../../common.mk

-$(DEP): $(DATA_H)
+$(DEP): $(DATA_H)
\ No newline at end of file
diff --git a/target/snitch_cluster/sw/apps/dnn/maxpool/src/maxpool.c b/target/snitch_cluster/sw/apps/dnn/maxpool/src/maxpool.c
deleted file mode 100644
index c3b91394b7..0000000000
--- a/target/snitch_cluster/sw/apps/dnn/maxpool/src/maxpool.c
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// SW testbench for profiling MaxPool Layer
-// Automatically checks the correctness of the results
-
-#include "dnn.h"
-#include "snrt.h"
-
-#include "data.h"
-
-int main() {
-    maxpool_l.ifmap = (double*)maxpool_ifmap_dram;
-    maxpool_l.ofmap = (double*)maxpool_ofmap_dram;
-    maxpool_l.TILE_CI = 32;
-
-    maxpool_layer(&maxpool_l);
-
-    snrt_global_barrier();
-
-    // FIXME: The checksum is overwritten in DRAM by the
-    // output of the cores. This is a bug.
-
-    // uint32_t error = check_layer(&maxpool_l, (double*)maxpool_checksum);
-
-    // snrt_global_barrier();
-
-    // return error;
-
-    return 0;
-}
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index f25ea76418..0c712fa552 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -77,11 +77,10 @@ runs:
       - elf: apps/dnn/linear/build/linear.elf
       - elf: apps/dnn/maxpool/build/maxpool.elf
       - elf: apps/dnn/gemm/build/gemm.elf
+      - elf: apps/dnn/layernorm/build/layernorm.elf
       # Illegal FDIV without FDIV unit
       # - elf: apps/dnn/gelu/build/gelu.elf
       # seems like it stalls
       # - elf: apps/dnn/conv2d/build/conv2d.elf
       # fails with exit code 32
       # - elf: apps/dnn/fusedconv/build/fusedconv.elf
       # fails newly
-      # - elf: apps/dnn/layernorm/build/layernorm.elf
-      # throws illegal instruction on FDIV in simulation
       # - elf: apps/dnn/softmax/build/softmax.elf
       # throws illegal instruction on FDIV in simulation
       - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf
diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py
index 2ed260d3f1..ab28fdb1dc 100644
--- a/util/sim/data_utils.py
+++ b/util/sim/data_utils.py
@@ -6,15 +6,45 @@
 import struct
 from datetime import datetime
+import torch
+import numpy as np


 def emit_license():
     s = (f"// Copyright {datetime.now().year} ETH Zurich and University of Bologna.\n"
          f"// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n"
-         f"// SPDX-License-Identifier: Apache-2.0\n\n")
+         f"// SPDX-License-Identifier: Apache-2.0\n")
     return s


+def floating_point_torch_type(precision):
+    prec_to_torch_type_map = {
+        '64': torch.float64,
+        '32': torch.float32,
+        '16': torch.float16,
+        '8': None  # torch has no 8-bit floating-point type
+    }
+    return prec_to_torch_type_map[precision]
+
+
+# Returns the C type representing a floating-point value of the specified precision
+def floating_point_ctype(precision):
+    prec_to_fp_type_map = {
+        '64': 'double',
+        '32': 'float',
+        '16': '__fp16',
+        '8': '__fp8'
+    }
+    return prec_to_fp_type_map[precision]
+
+
+def flatten(array):
+    if isinstance(array, np.ndarray):
+        return array.flatten()
+    if isinstance(array, torch.Tensor):
+        return array.numpy().flatten()
+    raise TypeError(f'flatten() expects a numpy array or torch tensor, got {type(array)}')
+
+
 def variable_attributes(alignment=None, section=None):
     attributes = ''
     if alignment:
@@ -24,27 +54,60 @@
-def format_vector_definition(type, uid, vector, alignment=None, section=None):
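+# __fp8 has no native C type, so alias_dtype() below maps it to 'char';
+# format_array_definition() then emits each FP8 value as one packed byte.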
+def alias_dtype(dtype):
+    if dtype == '__fp8':
+        return 'char'
+    else:
+        return dtype
+
+
+def format_array_declaration(dtype, uid, shape, alignment=None, section=None):
     attributes = variable_attributes(alignment, section)
-    s = f'{type} {uid}[{len(vector)}] {attributes} = ' + '{\n'
-    for el in vector:
-        if type != 'char':
-            el_str = f'{el}'
-        else:
+    s = f'{alias_dtype(dtype)} {uid}'
+    for dim in shape:
+        s += f'[{dim}]'
+    if attributes:
+        s += f' {attributes};'
+    else:
+        s += ';'
+    return s
+
+
+# In the case of dtype __fp8, the array argument is expected to be a
+# dictionary of sign, exponent and mantissa arrays
+def format_array_definition(dtype, uid, array, alignment=None, section=None):
+    # __fp8 arrays are passed as dicts, so take the shape from the sign component
+    shape = array['sign'].shape if dtype == '__fp8' else array.shape
+    # Definition starts with the declaration stripped of the terminating semicolon
+    s = format_array_declaration(dtype, uid, shape, alignment, section)[:-1]
+    s += ' = {\n'
+    # Flatten array
+    if dtype == '__fp8':
+        array = zip(flatten(array['sign']),
+                    flatten(array['exponent']),
+                    flatten(array['mantissa']))
+    else:
+        array = flatten(array)
+    # Format array elements
+    for el in array:
+        if dtype == '__fp8':
+            # Pack as sign (bit 7), exponent (bits 6:2), mantissa (bits 1:0)
+            sign, exp, mant = el
+            el = sign * 2**7 + exp * 2**2 + mant
             el_str = f'0x{el:02x}'
+        else:
+            el_str = f'{el}'
         s += f'\t{el_str},\n'
     s += '};'
     return s


-def format_vector_declaration(type, uid, vector, alignment=None, section=None):
-    attributes = variable_attributes(alignment, section)
-    s = f'{type} {uid}[{len(vector)}] {attributes};'
+def format_scalar_definition(dtype, uid, scalar):
+    s = f'{alias_dtype(dtype)} {uid} = {scalar};'
     return s


-def format_scalar_definition(type, uid, scalar):
-    s = f'{type} {uid} = {scalar};'
+def format_struct_definition(dtype, uid, cfg):
+    s = f'{alias_dtype(dtype)} {uid} = {{\n'
+    s += ',\n'.join([f'\t.{key} = {value}' for (key, value) in cfg.items()])
+    s += '\n};'
     return s
@@ -56,20 +119,43 @@ def format_ifdef_wrapper(macro, body):
 # bytearray assumed little-endian
-def bytes_to_doubles(byte_array):
-    double_size = struct.calcsize('d')  # Size of a double in bytes
-    num_doubles = len(byte_array) // double_size
+def bytes_to_struct(byte_array, struct_map):
+    struct_fields = struct_map.keys()
+    fmt_specifiers = struct_map.values()
+    fmt_string = ''.join(fmt_specifiers)
+    field_values = struct.unpack(f'<{fmt_string}', byte_array)
+    return dict(zip(struct_fields, field_values))
+
-    # Unpack the byte array into a list of doubles
-    doubles = []
-    for i in range(num_doubles):
-        double_bytes = byte_array[i * double_size:(i + 1) * double_size]
-        double = struct.unpack('