diff --git a/sw/dnn/concat/data/datagen.py b/sw/dnn/concat/data/datagen.py
new file mode 100755
index 0000000000..0cc39bc2e9
--- /dev/null
+++ b/sw/dnn/concat/data/datagen.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import argparse
+import numpy as np
+import pathlib
+import hjson
+import sys
+import os
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+import data_utils  # noqa: E402
+from data_utils import emit_license, \
+                       format_struct_definition, format_array_definition, \
+                       format_array_declaration, format_ifdef_wrapper  # noqa: E402
+
+torch.manual_seed(42)
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+PRECISION = {
+    'FP64': '64',
+    'FP32': '32',
+    'FP16': '16',
+    'FP8': '8'
+}
+
+
+def golden_model(inputs):
+    """Concatenate the input tensors along their innermost dimension.
+
+    Args:
+        inputs: Sequence of torch tensors to join; the rank is read from
+            the first tensor.
+
+    Returns:
+        The result of `torch.cat` over the last axis of the inputs.
+    """
+    innermost_dim = len(inputs[0].shape) - 1
+    return torch.cat(inputs, dim=innermost_dim)
+
+
+def emit_header(section, params):
+    """Generate the C header holding the concat layer test data.
+
+    Random input tensors are drawn, the golden output is computed with
+    `golden_model` and everything is formatted through `data_utils`.
+
+    Args:
+        section: Accepted for interface compatibility; not used by the
+            current implementation.
+        params: Mapping with the keys `num_inputs`, `input_shape` and
+            `dtype` (a key of `PRECISION`). All entries are forwarded
+            into the emitted `layer` struct.
+
+    Returns:
+        The header contents as a single string.
+    """
+    num_inputs = params['num_inputs']
+    input_shape = params['input_shape']
+    prec = PRECISION[params['dtype']]
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+
+    inputs = [torch.rand(*input_shape, requires_grad=False, dtype=torch_type)
+              for _ in range(num_inputs)]
+    output = golden_model(inputs)
+
+    ctype = data_utils.floating_point_ctype(prec)
+
+    # The struct refers to the `inputs` and `output` arrays by symbol name
+    layer_cfg = {
+        **params,
+        'inputs': 'inputs',
+        'output': 'output'
+    }
+
+    input_names = [f'input_{i}' for i in range(num_inputs)]
+
+    data_str = [emit_license()]
+    data_str += [format_array_declaration(ctype, name, input_shape)
+                 for name in input_names]
+    data_str += [format_array_declaration('void*', 'inputs', [num_inputs])]
+    data_str += [format_array_declaration(ctype, 'output', output.shape)]
+    data_str += [format_struct_definition('concat_layer_t', 'layer', layer_cfg)]
+    data_str += [format_array_definition(ctype, name, t)
+                 for name, t in zip(input_names, inputs)]
+    data_str += [format_array_definition('void*', 'inputs', np.array(input_names))]
+    # The golden result is emitted under the 'BIST' guard only
+    result_def = format_array_definition(ctype, 'golden', output)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+
+    return '\n\n'.join(data_str)
+
+
+def main():
+    """Parse the command line and write the generated header to disk."""
+    parser = argparse.ArgumentParser(description='Generate data for concat kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(args.section, param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sw/dnn/concat/data/params.hjson b/sw/dnn/concat/data/params.hjson
new file mode 100644
index 0000000000..8f02b5f479
--- /dev/null
+++ b/sw/dnn/concat/data/params.hjson
@@ -0,0 +1,9 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+{
+    num_inputs: 1
+    input_shape: [32, 4]
+    dtype: FP64
+}
\ No newline at end of file
diff --git a/sw/dnn/concat/src/concat.h b/sw/dnn/concat/src/concat.h
new file mode 100644
index 0000000000..98464a1b87
--- /dev/null
+++ b/sw/dnn/concat/src/concat.h
@@ -0,0 +1,71 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#pragma once
+
+#include "snrt.h"
+
+/**
+ * @struct concat_layer_t
+ * @brief This structure contains all parameters necessary
+ *        for computing a Concat layer.
+ * @var concat_layer_t::num_inputs
+ * Number of input tensors to concatenate
+ * @var concat_layer_t::input_shape
+ * Shape of the input tensors
+ * @var concat_layer_t::inputs
+ * Pointer to an array of pointers to the individual tensors to concatenate
+ * @var concat_layer_t::output
+ * Pointer to the concatenated output tensor
+ * @var concat_layer_t::dtype
+ * Precision of the tensor elements
+ */
+typedef struct {
+    uint32_t num_inputs;
+    uint32_t input_shape[2];
+    void **inputs;
+    void *output;
+    precision_t dtype;
+} concat_layer_t;
+
+// Concatenates a series of input tensors along the innermost axis.
+// Every cluster stores one of the input tensors in the output tensor, all
+// clusters operate in parallel.
+// Note: currently requires that the number of inputs does not exceed the
+// number of clusters in the system.
+// Returns 0 on success, 1 if there are more inputs than clusters.
+static inline int concat_layer(concat_layer_t l) {
+    // Return error if number of input tensors is greater than number of
+    // clusters
+    if (l.num_inputs > snrt_cluster_num()) return 1;
+
+    // Perform the concatenation
+    if (snrt_is_dm_core()) {
+        uint32_t cluster_idx = snrt_cluster_idx();
+        if (cluster_idx < l.num_inputs) {
+            // NOTE(review): assumes the precision_t enumerators equal the
+            // element size in bytes, as verify.py does (8 -> 64 bit, ...).
+            size_t row_size = l.input_shape[1] * (size_t)l.dtype;
+            size_t concatenated_row_size = row_size * l.num_inputs;
+            void *input = l.inputs[cluster_idx];
+            // Byte-wise arithmetic: arithmetic on void* is a GNU extension
+            void *output = (char *)l.output + cluster_idx * row_size;
+            // Each input row lands at this cluster's column offset within
+            // the corresponding (wider) output row
+            snrt_dma_start_2d(output,                 // dst
+                              input,                  // src
+                              row_size,               // size
+                              concatenated_row_size,  // dst_stride
+                              row_size,               // src_stride
+                              l.input_shape[0]        // repeat
+            );
+            snrt_dma_wait_all();
+        }
+    }
+
+    snrt_global_barrier();
+    return 0;
+}
diff --git a/sw/dnn/concat/src/main.c b/sw/dnn/concat/src/main.c
new file mode 100644
index 0000000000..8e9e434c4e
--- /dev/null
+++ b/sw/dnn/concat/src/main.c
@@ -0,0 +1,15 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#include "dnn.h"
+
+#include "data.h"
+
+// Runs the concat layer on `layer` (presumably provided by data.h) and uses
+// the status returned by the kernel as the exit code.
+int main() {
+    return concat_layer(layer);
+}
diff --git a/sw/dnn/concat/verify.py b/sw/dnn/concat/verify.py
new file mode 100755
index 0000000000..60196384d6
--- /dev/null
+++ b/sw/dnn/concat/verify.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import sys
+from pathlib import Path
+import numpy as np
+import torch
+from data.datagen import golden_model
+
+sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
+import verification  # noqa: E402
+from elf import Elf  # noqa: E402
+from data_utils import bytes_to_float, bytes_to_struct  # noqa: E402
+
+
+ERR_THRESHOLD = 1E-6
+
+# Maps the value of the layer's `dtype` field to the element width in bits
+PRECISION_T = {
+    8: '64',
+    4: '32',
+    2: '16',
+    1: '8'
+}
+
+# NumPy type per element width; there is no entry for 8-bit floats
+NUMPY_T = {
+    '64': np.float64,
+    '32': np.float32,
+    '16': np.float16
+}
+
+
+def main():
+    """Simulate the concat kernel and compare its output to the golden model.
+
+    Returns:
+        0 if every output element is within `ERR_THRESHOLD` of the golden
+        value, 1 otherwise.
+    """
+    # Run simulation and get outputs
+    args = verification.parse_args()
+    raw_results = verification.simulate(sim_bin=args.sim_bin,
+                                        snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
+                                        log=args.log,
+                                        output_uids=['output'])
+
+    # Extract input operands from ELF file
+    if args.symbols_bin:
+        elf = Elf(args.symbols_bin)
+    else:
+        elf = Elf(args.snitch_bin)
+
+    # Field layout mirrors concat_layer_t (input_shape[2] is split in two)
+    layer_struct = {
+        'num_inputs': 'I',
+        'height': 'I',
+        'width': 'I',
+        'inputs': 'I',
+        'output': 'I',
+        'dtype': 'I'
+    }
+    layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct)
+    num_inputs = layer['num_inputs']
+    input_shape = [layer['height'], layer['width']]
+    prec = PRECISION_T[layer['dtype']]
+    np_type = NUMPY_T[prec]
+
+    # The input tensors are read back from the `input_<i>` symbols
+    inputs = []
+    for i in range(num_inputs):
+        raw = elf.get_symbol_contents(f'input_{i}')
+        tensor = np.array(bytes_to_float(raw, prec), dtype=np_type)
+        inputs.append(torch.from_numpy(tensor.reshape(input_shape)))
+
+    # Verify results
+    output_actual = np.array(bytes_to_float(raw_results['output'], prec),
+                             dtype=np_type)
+    output_golden = golden_model(inputs).detach().numpy().flatten()
+
+    # The golden data can contain exact zeros: use the absolute error for
+    # those elements rather than dividing by zero
+    absolute_err = np.absolute(output_golden - output_actual)
+    scale = np.where(output_golden == 0, 1, np.absolute(output_golden))
+    relative_err = absolute_err / scale
+    fail = np.any(relative_err > ERR_THRESHOLD)
+    if fail:
+        verification.dump_results_to_csv([output_golden, output_actual, relative_err],
+                                         Path.cwd() / 'concat_results.csv')
+        print('Maximum relative error:', np.max(relative_err))
+
+    return int(fail)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index 5c41041c83..313220493a 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -204,4 +204,5 @@ typedef struct network_single_cluster_t_ {
 #include "../linear/src/linear.h"
 #include "../maxpool/src/maxpool.h"
 #include "../softmax/src/softmax.h"
+#include "../concat/src/concat.h"
 // #include "utils.h"
diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile
index 596b37e4ff..222f75cd48 100644
--- a/target/snitch_cluster/sw/apps/Makefile
+++ b/target/snitch_cluster/sw/apps/Makefile
@@ -19,6 +19,7 @@ SUBDIRS += dnn/linear
 SUBDIRS += dnn/maxpool
 SUBDIRS += dnn/softmax
 SUBDIRS += dnn/flashattention_2
+SUBDIRS += dnn/concat
 SUBDIRS += montecarlo/pi_estimation
 
 .PHONY: all clean $(SUBDIRS)
diff --git a/target/snitch_cluster/sw/apps/dnn/concat/Makefile b/target/snitch_cluster/sw/apps/dnn/concat/Makefile
new file mode 100644
index 0000000000..088d29d663
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/dnn/concat/Makefile
@@ -0,0 +1,12 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+APP ?= concat
+
+include ../../../../../../sw/dnn/common.mk
+include ../../common.mk
+
+$(DEP): $(DATA_H)
\ No newline at end of file
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index 8e50eea97b..4a8499a636 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -88,4 +88,6 @@ runs:
   # Illegal FDIV without FDIV unit
   - elf: apps/dnn/flashattention_2/build/flashattention_2.elf
     cmd: ../../sw/dnn/flashattention_2/verify.py {sim_bin} {elf}
+  - elf: apps/dnn/concat/build/concat.elf
+    cmd: ../../sw/dnn/concat/verify.py {sim_bin} {elf}
   - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf