diff --git a/sw/dnn/softmax/data/datagen.py b/sw/dnn/softmax/data/datagen.py
new file mode 100755
index 0000000000..6c645f5fae
--- /dev/null
+++ b/sw/dnn/softmax/data/datagen.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Tim Fischer
+# Viviane Potocnik
+# Luca Colagrande
+
+import argparse
+import pathlib
+import hjson
+import sys
+import os
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+import data_utils  # noqa: E402
+from data_utils import emit_license, \
+    format_struct_definition, format_array_definition, \
+    format_array_declaration, format_ifdef_wrapper  # noqa: E402
+
+torch.manual_seed(42)
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits, the data should be aligned to 4KB.
+BURST_ALIGNMENT = 4096
+
+PRECISION_T = {
+    '64': 'FP64',
+    '32': 'FP32',
+    '16': 'FP16',
+    '8': 'FP8'
+}
+
+
+def golden_model(ifmap, axis):
+    softmax = torch.nn.Softmax(dim=axis)
+    return softmax(ifmap)
+
+
+def emit_header(**kwargs):
+    batch_size = kwargs['input_dim']['batch_size']
+    seq_len = kwargs['input_dim']['seq_len']
+    input_samples = kwargs['input_dim']['input_samples']
+    reduce_dim = kwargs['reduce_dim']
+    prec = str(kwargs['prec'])
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ifmap = torch.randn(batch_size, seq_len, input_samples, requires_grad=False, dtype=torch_type)
+
+    ofmap = golden_model(ifmap, reduce_dim)
+    ofmap = ofmap.detach().numpy()
+
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap_uid = 'ifmap'
+    ofmap_uid = 'ofmap'
+
+    layer_cfg = {
+        **kwargs['input_dim'],
+        'reduce_dim': reduce_dim,
+        'ifmap': ifmap_uid,
+        'ofmap': ofmap_uid,
+        'dtype': PRECISION_T[prec]
+    }
+
+    data_str = [emit_license()]
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape,
+                 alignment=BURST_ALIGNMENT)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape,
+                 alignment=BURST_ALIGNMENT)]
+    data_str += [format_struct_definition('softmax_layer_t', 'layer', layer_cfg)]
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap,
+                 alignment=BURST_ALIGNMENT)]
+    result_def = format_array_definition(ctype, 'golden', ofmap, alignment=BURST_ALIGNMENT)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for softmax kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file for the kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
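Note: golden_model above just wraps torch.nn.Softmax, so it can be exercised standalone when sanity-checking generated data. A minimal sketch; the batch_size=3 and seq_len=16 shape follows params.hjson below, while the sample count and the reduction axis are illustrative values, since those fields are truncated here:

    import torch
    from datagen import golden_model

    x = torch.randn(3, 16, 256)   # 256 input samples is a made-up value
    y = golden_model(x, -1)       # reduce along the innermost axis
    # softmax output sums to one along the reduced axis
    assert torch.allclose(y.sum(dim=-1), torch.ones(3, 16))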
diff --git a/target/snitch_cluster/sw/apps/dnn/softmax/src/params.hjson b/sw/dnn/softmax/data/params.hjson
similarity index 81%
rename from target/snitch_cluster/sw/apps/dnn/softmax/src/params.hjson
rename to sw/dnn/softmax/data/params.hjson
index a2724a0924..3ee58efb96 100644
--- a/target/snitch_cluster/sw/apps/dnn/softmax/src/params.hjson
+++ b/sw/dnn/softmax/data/params.hjson
@@ -2,10 +2,7 @@
 // Solderpad Hardware License, Version 0.51, see LICENSE for details.
 // SPDX-License-Identifier: SHL-0.51
 
-// Parameters for a single SoftMax layer
-
 {
-    kernel: "SoftMax"
     input_dim: {
         batch_size: 3,
         seq_len: 16,
diff --git a/sw/dnn/softmax/src/main.c b/sw/dnn/softmax/src/main.c
new file mode 100644
index 0000000000..7178c2b195
--- /dev/null
+++ b/sw/dnn/softmax/src/main.c
@@ -0,0 +1,14 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande
+
+#include "dnn.h"
+
+#include "data.h"
+
+int main() {
+    softmax_layer(layer);
+    return 0;
+}
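Note: main.c deliberately contains no setup code. The generated data.h defines the layer struct together with the ifmap/ofmap buffers it references, so the kernel call needs no pointer patching. A rough restatement of how datagen.py's main() produces that header (the config path assumes the new layout):

    import hjson
    from datagen import emit_header

    # the hjson parameters become keyword arguments to emit_header()
    with open('sw/dnn/softmax/data/params.hjson') as f:
        param = hjson.loads(f.read())
    param['section'] = None  # optional linker section, as in main()
    print(emit_header(**param))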
diff --git a/sw/dnn/softmax/softmax.h b/sw/dnn/softmax/src/softmax.h
similarity index 64%
rename from sw/dnn/softmax/softmax.h
rename to sw/dnn/softmax/src/softmax.h
index f58d48fe63..a677766239 100644
--- a/sw/dnn/softmax/softmax.h
+++ b/sw/dnn/softmax/src/softmax.h
@@ -6,38 +6,31 @@
 
 #include "math.h"
 #include "snrt.h"
-// #include "printf.h"
-#include "utils.h"
 
 /**
  * @struct softmax_layer_struct
  * @brief This structure contains all parameters necessary
  * for computing the Softmax activation function
- * @var softmax_layer_struct::BATCH_SIZE
+ * @var softmax_layer_struct::batch_size
  * Size of each input sample
- * @var softmax_layer_struct::SEQ_LEN
+ * @var softmax_layer_struct::seq_len
 * Size of each output sample
- * @var softmax_layer_struct::INPUT_SAMPLES
+ * @var softmax_layer_struct::input_samples
 * Number of input samples
- * @var softmax_layer_struct::REDUCE_DIM
+ * @var softmax_layer_struct::reduce_dim
 * Along which dimension to reduce
 * @var softmax_layer_struct::ifmap
 * Pointer to input feature map
 * @var softmax_layer_struct::ofmap
 * Pointer to output feature map
- * @var softmax_layer_struct::result
- * Pointer to the golden model output
 */
 typedef struct softmax_layer_struct {
-    uint32_t BATCH_SIZE;
-    uint32_t SEQ_LEN;
-    uint32_t INPUT_SAMPLES;
-    uint32_t REDUCE_DIM;
-
+    uint32_t batch_size;
+    uint32_t seq_len;
+    uint32_t input_samples;
+    int32_t reduce_dim;
     float *ifmap;
     float *ofmap;
-    float *result;
-
     precision_t dtype;
 } softmax_layer_t;
 
@@ -50,9 +43,6 @@ static inline void softmax_fp32(float *input, float *output, int32_t ldI,
     float max_core = 0.0;  // max value of the current core
     float sum = 0.0;       // sum of the exp values of the current core
 
-    // uint32_t compute_id = snrt_global_core_idx();
-    // uint32_t num_cores = snrt_cluster_compute_core_num();
-
     for (int32_t b = 0; b < batch_size; b++) {
         for (int32_t s = 0; s < seq_len; s++) {
             max_core = -INFINITY;
@@ -67,23 +57,13 @@
             // compute the shifted value of the current row
             for (int32_t i = 0; i < input_samples; i++) {
                 output[b * batch_offset + s * ldI + i] =
-                    // FIXME: Below code is erroring due to the standard math
-                    // lib conflict
-                    // TODO: Try out with musl lib
-                    // expf(input[b * batch_offset + s * ldI + i] - max_core);
-                    // FIXME: actually there should be an exponentiation
-                    input[b * batch_offset + s * ldI + i] - max_core;
+                    expf(input[b * batch_offset + s * ldI + i] - max_core);
                 sum += output[b * batch_offset + s * ldI + i];
             }
 
             // compute the softmax value of the current row
             for (int32_t i = 0; i < input_samples; i++) {
-                // INFO: DIVSQRT unit MUST be activated in the cluster
-                // configuration
                 output[b * batch_offset + s * ldI + i] /= sum;
-                // printf("output[%d] = %f\n", compute_id * input_samples + b *
-                //        batch_offset + s * ldI + i,
-                //        output[b * batch_offset + s * ldI + i]);
             }
         }
     }
@@ -97,14 +77,14 @@
 * @param l softmax_layer struct that holds addresses and parameters
 *
 */
-static inline void softmax_layer(softmax_layer_t *const l) {
+static inline void softmax_layer(softmax_layer_t const l) {
     uint32_t cluster_num = snrt_cluster_num();
     uint32_t cluster_id = snrt_cluster_idx();
     uint32_t compute_num = snrt_cluster_compute_core_num();
     uint32_t compute_id = snrt_global_core_idx();
 
     uint32_t ifmap_size =
-        l->BATCH_SIZE * l->SEQ_LEN * l->INPUT_SAMPLES * sizeof(float);
+        l.batch_size * l.seq_len * l.input_samples * sizeof(float);
     uint32_t ofmap_size = ifmap_size;
 
     void *ptr = (float *)snrt_l1_next();
@@ -116,9 +96,9 @@
     // DMA transfer the ifmap into the cluster TCDM
     if (snrt_is_dm_core()) {
         snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
-            ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float),
-            l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float),
-            l->SEQ_LEN * l->INPUT_SAMPLES * sizeof(float));
+            ifmap, l.ifmap, l.batch_size * sizeof(float),
+            l.batch_size * sizeof(float), l.batch_size * sizeof(float),
+            l.seq_len * l.input_samples * sizeof(float));
 
         snrt_dma_wait_all();
     }
@@ -127,21 +107,31 @@
 
     if (snrt_is_compute_core()) {
         // determine the row offset for each core
-        int32_t row_offset = compute_id * l->INPUT_SAMPLES;
+        int32_t row_offset = compute_id * l.input_samples;
 
         // determine the row stride of each matrix
-        int32_t ldI = compute_num * l->INPUT_SAMPLES;
+        int32_t ldI = compute_num * l.input_samples;
 
         // determine the batch offset for each core
-        int32_t batch_offset = l->SEQ_LEN * l->INPUT_SAMPLES;
+        int32_t batch_offset = l.seq_len * l.input_samples;
 
         // printf("row_offset: %d, ldI: %d\n", row_offset, ldI);
         softmax_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, batch_offset,
-                     l->BATCH_SIZE, l->SEQ_LEN / 8, l->INPUT_SAMPLES);
+                     l.batch_size, l.seq_len / 8, l.input_samples);
 
     } else {
         snrt_cluster_hw_barrier();
     }
 
+    // DMA transfer the ofmap to DRAM
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_ofmap = snrt_dma_start_2d(
+            l.ofmap, ofmap, l.batch_size * sizeof(float),
+            l.batch_size * sizeof(float), l.batch_size * sizeof(float),
+            l.seq_len * l.input_samples * sizeof(float));
+
+        snrt_dma_wait_all();
+    }
+
     snrt_global_barrier();
 }
\ No newline at end of file
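Note: the functional fix in this file is the restored expf() call, so softmax_fp32 now computes the standard numerically stable softmax: find the row maximum, exponentiate the shifted values, normalize by their sum. For reference, the same math in NumPy; a sketch of the algorithm, not part of the patch:

    import numpy as np

    def softmax_rowwise(x):
        # subtracting the row max keeps exp() in range without changing
        # the result, the same role max_core plays in softmax_fp32
        shifted = np.exp(x - x.max(axis=-1, keepdims=True))
        return shifted / shifted.sum(axis=-1, keepdims=True)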
diff --git a/sw/dnn/softmax/verify.py b/sw/dnn/softmax/verify.py
new file mode 100755
index 0000000000..312bd6bb84
--- /dev/null
+++ b/sw/dnn/softmax/verify.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import sys
+from pathlib import Path
+import numpy as np
+import torch
+from data.datagen import golden_model
+
+sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
+import verification  # noqa: E402
+from elf import Elf  # noqa: E402
+from data_utils import bytes_to_float, bytes_to_struct  # noqa: E402
+
+
+ERR_THRESHOLD = 0.003
+
+PRECISION_T = {
+    8: '64',
+    4: '32',
+    2: '16',
+    1: '8'
+}
+
+NUMPY_T = {
+    '64': np.float64,
+    '32': np.float32,
+    '16': np.float16
+}
+
+
+def main():
+    # Run simulation and get outputs
+    args = verification.parse_args()
+    raw_results = verification.simulate(sim_bin=args.sim_bin,
+                                        snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
+                                        log=args.log,
+                                        output_uids=['ofmap'])
+
+    # Extract input operands from ELF file
+    if args.symbols_bin:
+        elf = Elf(args.symbols_bin)
+    else:
+        elf = Elf(args.snitch_bin)
+
+    layer_struct = {
+        'batch_size': 'I',
+        'seq_len': 'I',
+        'input_samples': 'I',
+        'reduce_dim': 'i',
+        'ifmap_ptr': 'I',
+        'ofmap_ptr': 'I',
+        'dtype': 'I'
+    }
+    layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct)
+    batch_size = layer['batch_size']
+    seq_len = layer['seq_len']
+    input_samples = layer['input_samples']
+    reduce_dim = layer['reduce_dim']
+    prec = PRECISION_T[layer['dtype']]
+
+    ifmap = np.array(bytes_to_float(elf.get_symbol_contents('ifmap'), prec), dtype=NUMPY_T[prec])
+    ifmap = ifmap.reshape(batch_size, seq_len, input_samples)
+    ifmap = torch.from_numpy(ifmap)
+
+    # Verify results
+    ofmap_actual = np.array(bytes_to_float(raw_results['ofmap'], prec), dtype=NUMPY_T[prec])
+    ofmap_golden = golden_model(ifmap, reduce_dim).detach().numpy().flatten()
+
+    absolute_err = np.absolute(ofmap_golden - ofmap_actual)
+    fail = np.any(absolute_err > ERR_THRESHOLD)
+    if fail:
+        verification.dump_results_to_csv([ofmap_golden, ofmap_actual, absolute_err],
+                                         Path.cwd() / 'softmax_results.csv')
+
+    return int(fail)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
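Note: the layer_struct format characters mirror softmax_layer_t field for field, 'I' for the uint32_t fields and the 32-bit pointers, 'i' for the int32_t reduce_dim. A rough equivalent of what the bytes_to_struct call decodes, assuming a packed little-endian layout (an illustration, not the helper's actual implementation):

    import struct

    def decode_layer(raw):
        # seven 4-byte fields, in declaration order of softmax_layer_t
        names = ['batch_size', 'seq_len', 'input_samples', 'reduce_dim',
                 'ifmap_ptr', 'ofmap_ptr', 'dtype']
        return dict(zip(names, struct.unpack('<IIIiIII', raw[:28])))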
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index d1d190a968..a38cc9c76e 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -202,5 +202,5 @@ typedef struct network_single_cluster_t_ {
 #include "../layernorm/src/layernorm.h"
 #include "../linear/src/linear.h"
 #include "../maxpool/src/maxpool.h"
-// #include "softmax.h"
+#include "../softmax/src/softmax.h"
 // #include "utils.h"
diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile
index e5d8c8be5e..1a8ce51ae2 100644
--- a/target/snitch_cluster/sw/apps/Makefile
+++ b/target/snitch_cluster/sw/apps/Makefile
@@ -17,7 +17,7 @@ SUBDIRS += dnn/gemm
 SUBDIRS += dnn/layernorm
 SUBDIRS += dnn/linear
 SUBDIRS += dnn/maxpool
-# SUBDIRS += dnn/softmax
+SUBDIRS += dnn/softmax
 SUBDIRS += montecarlo/pi_estimation
 
 .PHONY: all clean $(SUBDIRS)
diff --git a/target/snitch_cluster/sw/apps/dnn/softmax/Makefile b/target/snitch_cluster/sw/apps/dnn/softmax/Makefile
index 8f2209c298..d4f685c7d0 100644
--- a/target/snitch_cluster/sw/apps/dnn/softmax/Makefile
+++ b/target/snitch_cluster/sw/apps/dnn/softmax/Makefile
@@ -2,11 +2,11 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
-# Gianna Paulin
+# Luca Colagrande
 
-APP = softmax
+APP ?= softmax
 
-include ../Makefile
+include ../../../../../../sw/dnn/common.mk
 include ../../common.mk
 
 $(DEP): $(DATA_H)
diff --git a/target/snitch_cluster/sw/apps/dnn/softmax/src/softmax.c b/target/snitch_cluster/sw/apps/dnn/softmax/src/softmax.c
deleted file mode 100644
index d648b5548f..0000000000
--- a/target/snitch_cluster/sw/apps/dnn/softmax/src/softmax.c
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// SW testbench for profiling linear kernels in different
-// floating point precisions (fp64, fp32, fp16), as well as
-// different memory layouts for matrices (transposed/not-transposed)
-// Correctness of results are checked automatically
-
-#include "dnn.h"
-#include "snrt.h"
-
-#include "data.h"
-
-int main() {
-    softmax_l.ifmap = (float*)softmax_ifmap_dram;
-    // softmax_l.result = (float*)softmax_ofmap_dram;
-
-    // checksum = (float*)softmax_checksum;
-
-    softmax_layer(&softmax_l);
-
-    snrt_global_barrier();
-
-    // uint32_t error = check_softmax_layer(&linear_l, (float*)linear_checksum);
-
-    return 0;
-}
\ No newline at end of file
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index 0c712fa552..85afc9ed59 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -78,9 +78,10 @@ runs:
   - elf: apps/dnn/maxpool/build/maxpool.elf
   - elf: apps/dnn/gemm/build/gemm.elf
   - elf: apps/dnn/layernorm/build/layernorm.elf # Illegal FDIV without FDIV unit
+    cmd: ../../sw/dnn/layernorm/verify.py {sim_bin} {elf}
   # - elf: apps/dnn/gelu/build/gelu.elf # seems like it stalls
   # - elf: apps/dnn/conv2d/build/conv2d.elf # fails with exit code 32
   # - elf: apps/dnn/fusedconv/build/fusedconv.elf # fails newly
-  # - elf: apps/dnn/softmax/build/softmax.elf
-  #   throws illegal instruction on FDIV in simulation
+  - elf: apps/dnn/softmax/build/softmax.elf # Illegal FDIV without FDIV unit
+    cmd: ../../sw/dnn/softmax/verify.py {sim_bin} {elf}
   - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf