From 1eb0e45e7f9bbfa4016b5184968778787e99ed49 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 25 Oct 2023 17:58:07 +0200 Subject: [PATCH] Temporary DNN refactoring --- sw/blas/axpy/data/datagen.py | 13 +- sw/blas/gemm/data/datagen.py | 12 +- sw/dnn/{src => batchnorm}/batchnorm.h | 0 sw/dnn/common.mk | 31 +++++ sw/dnn/{src => conv2d}/conv2d.h | 0 sw/dnn/{src => gelu}/gelu.h | 0 sw/dnn/{src => gemm}/gemm.h | 0 sw/dnn/layernorm/.gitignore | 1 + sw/dnn/layernorm/data/datagen.py | 99 +++++++++++++++ .../dnn/layernorm/data}/params.hjson | 5 +- sw/dnn/{ => layernorm}/src/layernorm.h | 85 ++++--------- sw/dnn/layernorm/src/main.c | 116 ++++++++++++++++++ sw/dnn/layernorm/verify.py | 0 sw/dnn/{src => linear}/linear.h | 0 sw/dnn/{src => maxpool}/maxpool.h | 0 sw/dnn/{src => softmax}/softmax.h | 0 sw/dnn/src/dnn.h | 20 +-- target/snitch_cluster/sw/apps/Makefile | 18 +-- target/snitch_cluster/sw/apps/common.mk | 1 + .../sw/apps/dnn/layernorm/Makefile | 6 +- .../sw/apps/dnn/layernorm/src/layernorm.c | 29 ----- util/sim/data_utils.py | 60 +++++++-- 22 files changed, 355 insertions(+), 141 deletions(-) rename sw/dnn/{src => batchnorm}/batchnorm.h (100%) create mode 100644 sw/dnn/common.mk rename sw/dnn/{src => conv2d}/conv2d.h (100%) rename sw/dnn/{src => gelu}/gelu.h (100%) rename sw/dnn/{src => gemm}/gemm.h (100%) create mode 100644 sw/dnn/layernorm/.gitignore create mode 100755 sw/dnn/layernorm/data/datagen.py rename {target/snitch_cluster/sw/apps/dnn/layernorm/src => sw/dnn/layernorm/data}/params.hjson (64%) rename sw/dnn/{ => layernorm}/src/layernorm.h (71%) create mode 100644 sw/dnn/layernorm/src/main.c create mode 100644 sw/dnn/layernorm/verify.py rename sw/dnn/{src => linear}/linear.h (100%) rename sw/dnn/{src => maxpool}/maxpool.h (100%) rename sw/dnn/{src => softmax}/softmax.h (100%) delete mode 100644 target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c diff --git a/sw/blas/axpy/data/datagen.py b/sw/blas/axpy/data/datagen.py index f7ae7a6488..fc25cd40af 100755 --- a/sw/blas/axpy/data/datagen.py +++ b/sw/blas/axpy/data/datagen.py @@ -11,8 +11,8 @@ import os sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) -from data_utils import format_scalar_definition, format_vector_definition, \ - format_vector_declaration, format_ifdef_wrapper # noqa: E402 +from data_utils import format_scalar_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 MIN = -1000 MAX = +1000 @@ -47,16 +47,15 @@ def main(): a = np.random.uniform(MIN, MAX, 1) x = np.random.uniform(MIN, MAX, length) y = np.random.uniform(MIN, MAX, length) - z = np.zeros(length) g = golden_model(a, x, y) # Format header file l_str = format_scalar_definition('const uint32_t', 'l', length) a_str = format_scalar_definition('const double', 'a', a[0]) - x_str = format_vector_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section) - y_str = format_vector_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section) - z_str = format_vector_declaration('double', 'z', z, alignment=BURST_ALIGNMENT, section=section) - g_str = format_vector_definition('double', 'g', g) + x_str = format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section) + y_str = format_array_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section) + z_str = format_array_declaration('double', 'z', [length], alignment=BURST_ALIGNMENT, section=section) + g_str = format_array_definition('double', 'g', g) g_str = 
format_ifdef_wrapper('BIST', g_str) f_str = '\n\n'.join([l_str, a_str, x_str, y_str, z_str, g_str]) f_str += '\n' diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py index 0ccab83817..a3e25539fd 100755 --- a/sw/blas/gemm/data/datagen.py +++ b/sw/blas/gemm/data/datagen.py @@ -15,7 +15,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) from data_utils import emit_license, format_scalar_definition, \ - format_vector_definition, format_ifdef_wrapper # noqa: E402 + format_array_definition, format_ifdef_wrapper # noqa: E402 np.random.seed(42) @@ -100,16 +100,16 @@ def emit_header(**kwargs): data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])] data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)] data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(), + data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(), alignment=BURST_ALIGNMENT, section=kwargs['section'])] if kwargs['prec'] == 8: - result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten()) + result_def = format_array_definition(C_TYPES['64'], 'result', result.flatten()) else: - result_def = format_vector_definition(C_TYPES[str(kwargs['prec'])], + result_def = format_array_definition(C_TYPES[str(kwargs['prec'])], 'result', result.flatten()) data_str += [format_ifdef_wrapper('BIST', result_def)] diff --git a/sw/dnn/src/batchnorm.h b/sw/dnn/batchnorm/batchnorm.h similarity index 100% rename from sw/dnn/src/batchnorm.h rename to sw/dnn/batchnorm/batchnorm.h diff --git a/sw/dnn/common.mk b/sw/dnn/common.mk new file mode 100644 index 0000000000..0d933719de --- /dev/null +++ b/sw/dnn/common.mk @@ -0,0 +1,31 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +# Usage of absolute paths is required to externally include this Makefile +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) + +DATA_DIR := $(realpath $(MK_DIR)/$(APP)/data) +SRC_DIR := $(realpath $(MK_DIR)/$(APP)/src) +COMMON_SRC_DIR := $(realpath $(MK_DIR)/src) + +DATA_CFG ?= $(DATA_DIR)/params.hjson +SECTION ?= + +SRCS ?= $(realpath $(SRC_DIR)/main.c) +INCDIRS ?= $(DATA_DIR) $(SRC_DIR) $(COMMON_SRC_DIR) + +DATAGEN_PY := $(DATA_DIR)/datagen.py +DATA_H := $(DATA_DIR)/data.h + +$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) + $< -c $(DATA_CFG) --section="$(SECTION)" > $@ + +.PHONY: clean-data clean + +clean-data: + rm -f $(DATA_H) + +clean: clean-data diff --git a/sw/dnn/src/conv2d.h b/sw/dnn/conv2d/conv2d.h similarity index 100% rename from sw/dnn/src/conv2d.h rename to sw/dnn/conv2d/conv2d.h diff --git a/sw/dnn/src/gelu.h b/sw/dnn/gelu/gelu.h similarity index 100% rename from sw/dnn/src/gelu.h rename to sw/dnn/gelu/gelu.h diff --git a/sw/dnn/src/gemm.h b/sw/dnn/gemm/gemm.h similarity index 100% rename from sw/dnn/src/gemm.h rename to sw/dnn/gemm/gemm.h diff --git a/sw/dnn/layernorm/.gitignore b/sw/dnn/layernorm/.gitignore new file mode 100644 index 0000000000..f5ac16baa2 --- /dev/null +++ b/sw/dnn/layernorm/.gitignore @@ -0,0 +1 @@ +data/data.h diff --git a/sw/dnn/layernorm/data/datagen.py b/sw/dnn/layernorm/data/datagen.py new file mode 100755 index 0000000000..2be1c989a1 --- /dev/null +++ b/sw/dnn/layernorm/data/datagen.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import argparse +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +from data_utils import emit_license, format_scalar_definition, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 +import data_utils + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. 
To minimize
+# the occurrence of these splits, the data should be aligned to 4KB.
+BURST_ALIGNMENT = 4096
+
+
+def golden_model(ifmap, eps, shape, prec):
+    dtype = data_utils.floating_point_torch_type(prec)
+    ln = torch.nn.LayerNorm(shape, eps=eps, dtype=dtype)
+    return ln(ifmap)
+
+
+def emit_header(**kwargs):
+    batch_size = kwargs['input_dim']['batch_size']
+    seq_len = kwargs['input_dim']['seq_len']
+    embeddings = kwargs['input_dim']['embeddings']
+    eps = kwargs['eps']
+    prec = str(kwargs['prec'])
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ifmap = torch.randn(batch_size, seq_len, embeddings, requires_grad=False, dtype=torch_type)
+
+    ofmap = golden_model(ifmap, eps, embeddings, prec)
+    ofmap = ofmap.detach().numpy()
+
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap_uid = 'ifmap'
+    ofmap_uid = 'ofmap'
+
+    layer_cfg = {
+        **kwargs['input_dim'],
+        'eps': eps,
+        'ifmap': ifmap_uid,
+        'ofmap': ofmap_uid
+    }
+
+    data_str = [emit_license()]
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape,
+                                          alignment=BURST_ALIGNMENT)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape,
+                                          alignment=BURST_ALIGNMENT)]
+    data_str += [format_struct_definition('layernorm_layer_t', 'layer', layer_cfg)]
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap,
+                                         alignment=BURST_ALIGNMENT)]
+    result_def = format_array_definition(ctype, 'golden', ofmap, alignment=BURST_ALIGNMENT)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for layernorm kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file for the kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    print(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson b/sw/dnn/layernorm/data/params.hjson
similarity index 64%
rename from target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson
rename to sw/dnn/layernorm/data/params.hjson
index a9e3fca54a..a12036b254 100644
--- a/target/snitch_cluster/sw/apps/dnn/layernorm/src/params.hjson
+++ b/sw/dnn/layernorm/data/params.hjson
@@ -1,11 +1,8 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
+// Copyright 2023 ETH Zurich and University of Bologna.
 // Solderpad Hardware License, Version 0.51, see LICENSE for details. 
 // SPDX-License-Identifier: SHL-0.51
 
-// Parameters for a single LayerNorm layer
-
 {
-    kernel: "LayerNorm"
     input_dim: {
         batch_size: 1,
         seq_len: 32,
diff --git a/sw/dnn/src/layernorm.h b/sw/dnn/layernorm/src/layernorm.h
similarity index 71%
rename from sw/dnn/src/layernorm.h
rename to sw/dnn/layernorm/src/layernorm.h
index b875303146..3ce0c0c27b 100644
--- a/sw/dnn/src/layernorm.h
+++ b/sw/dnn/layernorm/src/layernorm.h
@@ -7,7 +7,7 @@
 #include "math.h"
 #include "snrt.h"
 // #include "printf.h"
-#include "utils.h"
+#include "dnn.h"
 
 // add dump function for layernorm
 dump_float(ln, 5);
@@ -16,29 +16,24 @@ dump_float(ln, 5);
  * @struct layernorm_layer_struct
  * @brief This structure contains all parameters necessary
  * for computing the LayerNorm activation function
- * @var layernorm_layer_struct::BATCH_SIZE
+ * @var layernorm_layer_struct::batch_size
  * Size of each input sample
- * @var layernorm_layer_struct::SEQ_LEN
+ * @var layernorm_layer_struct::seq_len
  * Size of each output sample
- * @var layernorm_layer_struct::EMBEDDINGS
+ * @var layernorm_layer_struct::embeddings
  * Number of hidden dimensions
  * @var layernorm_layer_struct::ifmap
  * Pointer to input feature map
  * @var layernorm_layer_struct::ofmap
  * Pointer to output feature map
- * @var layernorm_layer_struct::result
- * Pointer to the golden model output
  */
 typedef struct layernorm_layer_struct {
-    uint32_t BATCH_SIZE;
-    uint32_t SEQ_LEN;
-    uint32_t EMBEDDINGS;
-    uint32_t EPS;
-
-    float *ifmap;
-    float *ofmap;
-    float *result;
-
+    uint32_t batch_size;
+    uint32_t seq_len;
+    uint32_t embeddings;
+    float eps;
+    void *ifmap;
+    void *ofmap;
     precision_t dtype;
 } layernorm_layer_t;
@@ -95,44 +90,8 @@ static inline void layernorm_fp32(float *input, float *output, int32_t ldI,
  */
 static inline void transformer_layernorm_fp64(double *input, int32_t ldI,
                                               int32_t seq_len, int32_t embeddings,
-                                              int32_t eps) {
-    double mean = 0.0;  // max value of the current core
-    double var = 0.0;   // sum of the exp values of the current core
-
-    uint32_t compute_id = snrt_global_core_idx();
-    uint32_t num_cores = snrt_cluster_compute_core_num();
-
-    for (int32_t s = 0; s < seq_len; s++) {
-        mean = 0.0;
-        var = 0.0;
-
-        for (int32_t i = 0; i < embeddings; i++) {
-            mean += input[s * ldI + i];
-        }
-        mean /= embeddings;
-
-        // printf("mean[%d] = %f\n", b, mean);
-
-        for (int32_t i = 0; i < embeddings; i++) {
-            var += (input[s * ldI + i] - mean) *
-                   (input[s * ldI + i] - mean);
-        }
-        var /= embeddings;
-
-        // printf("var[%d] = %f\n", b, var);
-
-        // compute the shifted value of the current row
-        for (int32_t i = 0; i < embeddings; i++) {
-            input[s * ldI + i] =
-                (input[s * ldI + i] - mean) /
-                sqrtf(var + eps);
-            // printf("output[%d][%d][%d] = %f\n", b, s + compute_id, i,
-            //        output[s * ldI + i]);
-            // dump_ln(input[s * ldI + i]);
-        }
-    }
-
-    snrt_cluster_hw_barrier();
+                                              int32_t eps) {
+    layernorm_fp64(input, input, ldI, 0, 1, seq_len, embeddings, eps);
 }
@@ -187,14 +146,14 @@ static inline void transformer_layernorm_fp32(float *input, int32_t ldI,
  * @param l layernorm_layer struct that holds addresses and parameters
  *
  */
-static inline void layernorm_layer(const layernorm_layer_t *l) {
+static inline void layernorm_layer(layernorm_layer_t l) {
     uint32_t cluster_num = snrt_cluster_num();
     uint32_t cluster_id = snrt_cluster_idx();
     uint32_t compute_num = snrt_cluster_compute_core_num();
     uint32_t compute_id = snrt_global_core_idx();
 
     uint32_t ifmap_size =
-        l->BATCH_SIZE * l->SEQ_LEN * l->EMBEDDINGS * sizeof(float);
+        l.batch_size * l.seq_len * l.embeddings * 
sizeof(float); uint32_t ofmap_size = ifmap_size; void *ptr = (float *)snrt_l1_next(); @@ -206,9 +165,9 @@ static inline void layernorm_layer(const layernorm_layer_t *l) { // DMA transfer the ifmap into the cluster TCDM if (snrt_is_dm_core()) { snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d( - ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float), - l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float), - l->SEQ_LEN * l->EMBEDDINGS * sizeof(float)); + ifmap, l.ifmap, l.batch_size * sizeof(float), + l.batch_size * sizeof(float), l.batch_size * sizeof(float), + l.seq_len * l.embeddings * sizeof(float)); snrt_dma_wait_all(); } @@ -217,18 +176,18 @@ static inline void layernorm_layer(const layernorm_layer_t *l) { if (snrt_is_compute_core()) { // determine the row offset for each core - int32_t row_offset = compute_id * l->EMBEDDINGS; + int32_t row_offset = compute_id * l.embeddings; // determine the row stride of each matrix - int32_t ldI = compute_num * l->EMBEDDINGS; + int32_t ldI = compute_num * l.embeddings; // determine the batch offset for each core - int32_t batch_offset = l->SEQ_LEN * l->EMBEDDINGS; + int32_t batch_offset = l.seq_len * l.embeddings; // printf("row_offset: %d, ldI: %d\n", row_offset, ldI); layernorm_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, - batch_offset, l->BATCH_SIZE, l->SEQ_LEN / 8, - l->EMBEDDINGS, l->EPS); + batch_offset, l.batch_size, l.seq_len / 8, + l.embeddings, l.eps); } else { snrt_cluster_hw_barrier(); diff --git a/sw/dnn/layernorm/src/main.c b/sw/dnn/layernorm/src/main.c new file mode 100644 index 0000000000..badc8aefdb --- /dev/null +++ b/sw/dnn/layernorm/src/main.c @@ -0,0 +1,116 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande
+
+// #include "snrt.h"
+#include "layernorm.h"
+
+#include "data.h"
+
+int main() {
+    layernorm_layer(layer);
+
+
+// void *local_a, *local_b, *local_c;
+// void *remote_a, *remote_b, *remote_c;
+
+// // Calculate size and pointers for each cluster
+// uint32_t frac_m = M / snrt_cluster_num();
+// uint32_t frac_a = frac_m * K;
+// uint32_t frac_c = frac_m * N;
+// uint32_t size_frac_a = frac_a * dtype_size;
+// uint32_t size_b = K * N * dtype_size;
+// uint32_t size_frac_c = frac_c * dtype_size;
+// uint32_t offset_a = frac_a * snrt_cluster_idx();
+// uint32_t offset_c = frac_c * snrt_cluster_idx();
+// remote_a = a + offset_a;
+// remote_b = b;
+// remote_c = c + offset_c;
+
+// // Allocate space in TCDM
+// local_a = (void *)snrt_l1_next();
+// local_b = local_a + size_frac_a;
+// local_c = local_b + size_b;
+
+// // Copy data in TCDM
+// if (snrt_is_dm_core()) {
+// snrt_dma_start_1d(local_a, remote_a, size_frac_a);
+// snrt_dma_start_1d(local_b, remote_b, size_b);
+// snrt_dma_start_1d(local_c, remote_c, size_frac_c);
+// snrt_dma_wait_all();
+// }
+
+// snrt_cluster_hw_barrier();
+
+// // Compute
+// if (!snrt_is_dm_core()) {
+// const uint32_t setup_ssr = 1;
+// uint32_t start_cycle = snrt_mcycle();
+
+// volatile uint32_t lda = K;
+// volatile uint32_t ldb = N;
+// volatile uint32_t ldc = N;
+
+// // Transpose of A unsupported
+// if (TA) return -1;
+// if (TB) {
+// // Transpose of B supported only in FP64
+// if (dtype_size != FP64) return -1;
+// ldb = K;
+// }
+
+// gemm(dtype_size, expand, setup_ssr, TA, TB, frac_m, N, K, 1, local_a,
+// lda, local_b, ldb, BETA, local_c, ldc);
+
+// uint32_t end_cycle = snrt_mcycle();
+// }
+
+// snrt_cluster_hw_barrier();
+
+// // Copy data out of TCDM
+// if (snrt_is_dm_core()) {
+// snrt_dma_start_1d(remote_c, local_c, size_frac_c);
+// snrt_dma_wait_all();
+// }
+
+// // TODO: currently only works for single cluster, otherwise we need to
+// // synchronize all cores here
+
+// #ifdef BIST
+// uint32_t errors = M * N;
+
+// if (snrt_cluster_core_idx() == 0) {
+// for (uint32_t m = 0; m < M; m++) {
+// for (uint32_t n = 0; n < N; n++) {
+// uint32_t idx = m * N + n;
+// switch (dtype_size) {
+// case FP64:
+// if (fabs(result[idx] - ((double *)local_c)[idx]) >
+// 0.001)
+// errors--;
+// break;
+// case FP32:
+// if (fabs(result[idx] - ((float *)local_c)[idx]) > 0.001)
+// errors--;
+// break;
+// case FP16:
+// if (fabs(result[idx] - ((__fp16 *)local_c)[idx]) >
+// 0.001)
+// errors--;
+// break;
+// case FP8:
+// printf("No golden model yet for fp8!\n");
+// return -1;
+// break;
+// }
+// }
+// }
+// printf("%d/%d Errors\n", errors, M * N);
+// }
+
+// return errors;
+// #endif
+
+// return 0;
+}
diff --git a/sw/dnn/layernorm/verify.py b/sw/dnn/layernorm/verify.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/sw/dnn/src/linear.h b/sw/dnn/linear/linear.h
similarity index 100%
rename from sw/dnn/src/linear.h
rename to sw/dnn/linear/linear.h
diff --git a/sw/dnn/src/maxpool.h b/sw/dnn/maxpool/maxpool.h
similarity index 100%
rename from sw/dnn/src/maxpool.h
rename to sw/dnn/maxpool/maxpool.h
diff --git a/sw/dnn/src/softmax.h b/sw/dnn/softmax/softmax.h
similarity index 100%
rename from sw/dnn/src/softmax.h
rename to sw/dnn/softmax/softmax.h
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index 537f488cd9..e228de21d1 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -197,13 +197,13 @@ typedef struct network_single_cluster_t_ {
 
 // Must be included before batchnorm 
since the batchnorm layer // uses the conv_layer struct. This is bad design. // TODO Fix this, union types should be preferred -#include "conv2d.h" - -#include "batchnorm.h" -#include "gelu.h" -#include "gemm.h" -#include "layernorm.h" -#include "linear.h" -#include "maxpool.h" -#include "softmax.h" -#include "utils.h" +// #include "conv2d.h" + +// #include "batchnorm.h" +// #include "gelu.h" +// #include "gemm.h" +// #include "layernorm.h" +// #include "linear.h" +// #include "maxpool.h" +// #include "softmax.h" +// #include "utils.h" diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile index 0410fb1cb4..219887ee6a 100644 --- a/target/snitch_cluster/sw/apps/Makefile +++ b/target/snitch_cluster/sw/apps/Makefile @@ -6,18 +6,18 @@ SUBDIRS = lto SUBDIRS += nop -SUBDIRS += transformer +# SUBDIRS += transformer SUBDIRS += blas/axpy SUBDIRS += blas/gemm -SUBDIRS += dnn/batchnorm -SUBDIRS += dnn/conv2d -SUBDIRS += dnn/fusedconv -SUBDIRS += dnn/gelu -SUBDIRS += dnn/gemm +# SUBDIRS += dnn/batchnorm +# SUBDIRS += dnn/conv2d +# SUBDIRS += dnn/fusedconv +# SUBDIRS += dnn/gelu +# SUBDIRS += dnn/gemm SUBDIRS += dnn/layernorm -SUBDIRS += dnn/linear -SUBDIRS += dnn/maxpool -SUBDIRS += dnn/softmax +# SUBDIRS += dnn/linear +# SUBDIRS += dnn/maxpool +# SUBDIRS += dnn/softmax SUBDIRS += montecarlo/pi_estimation .PHONY: all clean $(SUBDIRS) diff --git a/target/snitch_cluster/sw/apps/common.mk b/target/snitch_cluster/sw/apps/common.mk index 8e1950860e..94eb35236a 100644 --- a/target/snitch_cluster/sw/apps/common.mk +++ b/target/snitch_cluster/sw/apps/common.mk @@ -37,6 +37,7 @@ INCDIRS += $(SNRT_DIR)/api INCDIRS += $(SNRT_DIR)/api/omp INCDIRS += $(SNRT_DIR)/src INCDIRS += $(SNRT_DIR)/src/omp +INCDIRS += $(ROOT)/sw/blas INCDIRS += $(ROOT)/sw/deps/riscv-opcodes INCDIRS += $(ROOT)/sw/math/include diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile b/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile index 87fa026c70..f8df5a08ac 100644 --- a/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile +++ b/target/snitch_cluster/sw/apps/dnn/layernorm/Makefile @@ -2,11 +2,11 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# Gianna Paulin +# Luca Colagrande -APP = layernorm +APP ?= layernorm -include ../Makefile +include ../../../../../../sw/dnn/common.mk include ../../common.mk $(DEP): $(DATA_H) diff --git a/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c b/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c deleted file mode 100644 index fa776940f6..0000000000 --- a/target/snitch_cluster/sw/apps/dnn/layernorm/src/layernorm.c +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0
-
-// SW testbench for profiling linear kernels in different
-// floating point precisions (fp64, fp32, fp16), as well as
-// different memory layouts for matrices (transposed/not-transposed)
-// Correctness of results are checked automatically
-
-#include "dnn.h"
-#include "snrt.h"
-
-#include "data.h"
-
-int main() {
-    layernorm_l.ifmap = (float*)layernorm_ifmap_dram;
-    layernorm_l.result = (float*)layernorm_ofmap_dram;
-
-    // checksum = (float*)layernorm_checksum;
-
-    // printf("Starting layernorm layer\n");
-
-    layernorm_layer(&layernorm_l);
-
-    // uint32_t error = check_layernorm_layer(&linear_l,
-    //                                        (float*)linear_checksum);
-
-    return 0;
-}
\ No newline at end of file
diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py
index 2ed260d3f1..609cc708f8 100644
--- a/util/sim/data_utils.py
+++ b/util/sim/data_utils.py
@@ -6,15 +6,38 @@
 
 import struct
 from datetime import datetime
+import torch
+import numpy as np
 
 
 def emit_license():
     s = (f"// Copyright {datetime.now().year} ETH Zurich and University of Bologna.\n"
          f"// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n"
-         f"// SPDX-License-Identifier: Apache-2.0\n\n")
+         f"// SPDX-License-Identifier: Apache-2.0\n")
     return s
 
 
+# Returns the torch type representing a floating-point value of the specified precision
+def floating_point_torch_type(precision):
+    prec_to_torch_type_map = {
+        '64': torch.float64,
+        '32': torch.float32,
+        '16': torch.float16,
+        '8': None
+    }
+    return prec_to_torch_type_map[precision]
+
+
+# Returns the C type representing a floating-point value of the specified precision
+def floating_point_ctype(precision):
+    prec_to_fp_type_map = {
+        '64': 'double',
+        '32': 'float',
+        '16': '__fp16',
+        '8': 'char'
+    }
+    return prec_to_fp_type_map[precision]
+
+
 def variable_attributes(alignment=None, section=None):
     attributes = ''
     if alignment:
@@ -24,11 +47,27 @@ def variable_attributes(alignment=None, section=None):
     return attributes
 
 
-def format_vector_definition(type, uid, vector, alignment=None, section=None):
+def format_array_declaration(dtype, uid, shape, alignment=None, section=None):
     attributes = variable_attributes(alignment, section)
-    s = f'{type} {uid}[{len(vector)}] {attributes} = ' + '{\n'
-    for el in vector:
-        if type != 'char':
+    s = f'{dtype} {uid}'
+    for dim in shape:
+        s += f'[{dim}]'
+    s += f' {attributes};'
+    return s
+
+
+def format_array_definition(dtype, uid, array, alignment=None, section=None):
+    # Definition starts with the declaration stripped of the terminating semicolon
+    s = format_array_declaration(dtype, uid, array.shape, alignment, section)[:-1]
+    s += ' = {\n'
+    # Flatten array
+    if isinstance(array, np.ndarray):
+        array = array.flat
+    if isinstance(array, torch.Tensor):
+        array = array.numpy().flat
+    # Format array elements
+    for el in array:
+        if dtype != 'char':
             el_str = f'{el}'
         else:
             el_str = f'0x{el:02x}'
@@ -37,14 +76,15 @@
     return s
 
 
-def format_vector_declaration(type, uid, vector, alignment=None, section=None):
-    attributes = variable_attributes(alignment, section)
-    s = f'{type} {uid}[{len(vector)}] {attributes};'
+def format_scalar_definition(dtype, uid, scalar):
+    s = f'{dtype} {uid} = {scalar};'
     return s
 
 
-def format_scalar_definition(type, uid, scalar):
-    s = f'{type} {uid} = {scalar};'
+def format_struct_definition(dtype, uid, struct_map):
+    s = f'{dtype} {uid} = {{\n'
+    s += ',\n'.join([f'\t.{key} = {value}' for (key, value) in struct_map.items()])
+    s += '\n};'
    return s
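
---

For reference, a minimal sketch of how the reworked data_utils helpers compose
in a downstream datagen script. The "foo" kernel, its struct type and field
names are hypothetical; only the helper signatures introduced by this patch are
taken as given, and util/sim is assumed to be on the Python path.

#!/usr/bin/env python3
# Sketch only: exercises the new data_utils API for a hypothetical "foo" kernel.
import os
import sys
import numpy as np

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import emit_license, format_array_declaration, \
    format_array_definition, format_struct_definition, \
    format_ifdef_wrapper  # noqa: E402

ifmap = np.random.rand(4, 16)

# Declarations take an explicit shape; definitions derive it from the array itself.
decl = format_array_declaration('double', 'ifmap', ifmap.shape, alignment=4096)
defn = format_array_definition('double', 'ifmap', ifmap, alignment=4096)
# Struct fields are emitted as C99 designated initializers.
layer = format_struct_definition('foo_layer_t', 'layer',
                                 {'batch_size': 1, 'ifmap': 'ifmap'})
# Golden output is only compiled into built-in self-test (BIST) binaries.
golden = format_ifdef_wrapper('BIST', format_array_definition('double', 'g', ifmap))

print('\n\n'.join([emit_license(), decl, defn, layer, golden]))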
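sw/dnn/layernorm/verify.py is added empty in this commit. One possible shape
for it, purely as a sketch: regenerate the golden model with the same seed as
datagen.py and compare it against the ofmap written back by the simulation.
The --dump interface, the raw-FP64 dump format and the tolerance below are
assumptions, not part of this patch.

#!/usr/bin/env python3
# Hypothetical verify.py sketch (FP64 only); the dump format is an assumption.
import argparse
import pathlib
import sys
import hjson
import numpy as np
import torch


def main():
    parser = argparse.ArgumentParser(description='Verify layernorm results')
    parser.add_argument('-c', '--cfg', type=pathlib.Path, required=True,
                        help='Param config file used for data generation')
    parser.add_argument('--dump', type=pathlib.Path, required=True,
                        help='Raw binary ofmap dumped by the simulation')
    args = parser.parse_args()

    param = hjson.loads(args.cfg.read_text())
    dim = param['input_dim']

    # Same seed as datagen.py, so the same ifmap is regenerated
    torch.manual_seed(42)
    ifmap = torch.randn(dim['batch_size'], dim['seq_len'], dim['embeddings'],
                        dtype=torch.float64)
    ln = torch.nn.LayerNorm(dim['embeddings'], eps=param['eps'],
                            dtype=torch.float64)
    golden = ln(ifmap).detach().numpy()

    actual = np.fromfile(args.dump, dtype=np.float64).reshape(golden.shape)
    errors = int(np.sum(~np.isclose(actual, golden, atol=1e-6)))
    print(f'{errors} mismatches')
    return int(errors != 0)


if __name__ == '__main__':
    sys.exit(main())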