From a17d2cbaae8604021ef788f25ba7cf24157abc6a Mon Sep 17 00:00:00 2001
From: Luca Colagrande
Date: Sun, 12 Nov 2023 20:55:21 +0100
Subject: [PATCH] dnn: Add FusedConcatLinear layer

---
 sw/dnn/fused_concat_linear/data/datagen.py   | 113 ++++++++++++++++++
 sw/dnn/fused_concat_linear/data/params.hjson |  10 ++
 .../src/fused_concat_linear.h                |  53 ++++++++
 sw/dnn/fused_concat_linear/src/main.c        |  14 +++
 sw/dnn/fused_concat_linear/verify.py         |  90 ++++++++++++++
 sw/dnn/src/dnn.h                             |   1 +
 sw/snRuntime/src/team.h                      |   4 +
 target/snitch_cluster/sw/apps/Makefile       |   1 +
 .../sw/apps/dnn/fused_concat_linear/Makefile |  12 ++
 target/snitch_cluster/sw/run.yaml            |   2 +
 10 files changed, 300 insertions(+)
 create mode 100755 sw/dnn/fused_concat_linear/data/datagen.py
 create mode 100644 sw/dnn/fused_concat_linear/data/params.hjson
 create mode 100644 sw/dnn/fused_concat_linear/src/fused_concat_linear.h
 create mode 100644 sw/dnn/fused_concat_linear/src/main.c
 create mode 100755 sw/dnn/fused_concat_linear/verify.py
 create mode 100644 target/snitch_cluster/sw/apps/dnn/fused_concat_linear/Makefile

diff --git a/sw/dnn/fused_concat_linear/data/datagen.py b/sw/dnn/fused_concat_linear/data/datagen.py
new file mode 100755
index 0000000000..8d139feb96
--- /dev/null
+++ b/sw/dnn/fused_concat_linear/data/datagen.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import argparse
+import numpy as np
+import pathlib
+import hjson
+import sys
+import os
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+import data_utils  # noqa: E402
+from data_utils import emit_license, \
+    format_struct_definition, format_array_definition, \
+    format_array_declaration, format_ifdef_wrapper  # noqa: E402
+
+torch.manual_seed(42)
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits, the data should be aligned to 4KB.
+BURST_ALIGNMENT = 4096
+
+PRECISION = {
+    'FP64': '64',
+    'FP32': '32',
+    'FP16': '16',
+    'FP8': '8'
+}
+
+
+def golden_model(inputs, weights):
+    innermost_dim = len(inputs[0].shape) - 1
+    concat_output = torch.cat(inputs, dim=innermost_dim)
+    linear_output = torch.matmul(concat_output, weights)
+    return linear_output, concat_output
+
+
+def emit_header(section, params):
+    num_inputs = params['num_inputs']
+    input_shape = params['input_shape']
+    output_shape = params['output_shape']
+    prec = PRECISION[params['dtype']]
+
+    assert input_shape[0] == output_shape[0], 'Inconsistent input and output shapes'
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+
+    inputs = [torch.rand(*input_shape, requires_grad=False, dtype=torch_type) for _ in range(num_inputs)]
+    weights = torch.rand([input_shape[1]*num_inputs, output_shape[1]], requires_grad=False, dtype=torch_type)
+    linear_output, concat_output = golden_model(inputs, weights)
+
+    ctype = data_utils.floating_point_ctype(prec)
+
+    layer_cfg = {
+        **params,
+        'inputs': 'inputs',
+        'weights': 'weights',
+        'concat_output': 'concat_output',
+        'linear_output': 'linear_output'
+    }
+
+    data_str = [emit_license()]
+    data_str += [format_array_declaration(ctype, f'input_{i}', input_shape) for i in range(num_inputs)]
+    data_str += [format_array_declaration('void*', 'inputs', [num_inputs])]
+    data_str += [format_array_declaration(ctype, 'concat_output', concat_output.shape)]
+    data_str += [format_array_declaration(ctype, 'linear_output', linear_output.shape)]
+    data_str += [format_array_declaration(ctype, 'weights', weights.shape)]
+    data_str += [format_struct_definition('fused_concat_linear_layer_t', 'layer', layer_cfg)]
+    data_str += [format_array_definition(ctype, f'input_{i}', t) for i, t in enumerate(inputs)]
+    data_str += [format_array_definition('void*', 'inputs', np.array([f'input_{i}' for i in range(num_inputs)]))]
+    data_str += [format_array_definition(ctype, 'weights', weights)]
+    result_def = format_array_definition(ctype, 'golden', linear_output)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for fused_concat_linear kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(args.section, param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sw/dnn/fused_concat_linear/data/params.hjson b/sw/dnn/fused_concat_linear/data/params.hjson
new file mode 100644
index 0000000000..2274863f41
--- /dev/null
+++ b/sw/dnn/fused_concat_linear/data/params.hjson
@@ -0,0 +1,10 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+{
+    num_inputs: 1
+    input_shape: [32, 4]
+    output_shape: [32, 16]
+    dtype: FP64
+}
\ No newline at end of file
diff --git a/sw/dnn/fused_concat_linear/src/fused_concat_linear.h b/sw/dnn/fused_concat_linear/src/fused_concat_linear.h
new file mode 100644
index 0000000000..09a7a9b7bb
--- /dev/null
+++ b/sw/dnn/fused_concat_linear/src/fused_concat_linear.h
@@ -0,0 +1,53 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande
+
+#include "snrt.h"
+
+/**
+ * @struct fused_concat_linear_layer_t
+ * @brief This structure contains all parameters necessary
+ * for computing a FusedConcatLinear layer.
+ * @var fused_concat_linear_layer_t::num_inputs
+ * Number of input tensors to concatenate
+ * @var fused_concat_linear_layer_t::input_shape
+ * Shape of the input tensors
+ * @var fused_concat_linear_layer_t::inputs
+ * Pointer to an array of pointers to the individual tensors to concatenate
+ * @var fused_concat_linear_layer_t::concat_output
+ * Pointer to the concatenated intermediate output tensor
+ */
+typedef struct {
+    uint32_t num_inputs;
+    uint32_t input_shape[2];
+    uint32_t output_shape[2];
+    void **inputs;
+    void *weights;
+    void *concat_output;
+    void *linear_output;
+    precision_t dtype;
+} fused_concat_linear_layer_t;
+
+static inline int fused_concat_linear_layer(fused_concat_linear_layer_t l) {
+    // Concat layer
+    concat_layer_t concat_layer_cfg = {
+        .num_inputs = l.num_inputs,
+        .input_shape = {l.input_shape[0], l.input_shape[1]},
+        .inputs = l.inputs,
+        .output = l.concat_output,
+        .dtype = l.dtype
+    };
+    int nerr = concat_layer(concat_layer_cfg);
+
+    // Linear layer
+    uint32_t m = l.input_shape[0];
+    uint32_t k = l.input_shape[1] * l.num_inputs;
+    uint32_t n = l.output_shape[1];
+    gemm(l.dtype, 0, 0, 0, 0, m, n, k, 1.0, l.concat_output, k, l.weights, n, 0.0, l.linear_output, n);
+
+    snrt_global_barrier();
+
+    return nerr;
+}
diff --git a/sw/dnn/fused_concat_linear/src/main.c b/sw/dnn/fused_concat_linear/src/main.c
new file mode 100644
index 0000000000..6d5f7adf03
--- /dev/null
+++ b/sw/dnn/fused_concat_linear/src/main.c
@@ -0,0 +1,14 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande
+
+#include "dnn.h"
+
+#include "data.h"
+
+int main() {
+    uint32_t nerr = fused_concat_linear_layer(layer);
+    return nerr;
+}
diff --git a/sw/dnn/fused_concat_linear/verify.py b/sw/dnn/fused_concat_linear/verify.py
new file mode 100755
index 0000000000..fedf63a5cf
--- /dev/null
+++ b/sw/dnn/fused_concat_linear/verify.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import sys
+from pathlib import Path
+import numpy as np
+import torch
+from data.datagen import golden_model
+
+sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
+import verification  # noqa: E402
+from elf import Elf  # noqa: E402
+from data_utils import bytes_to_float, bytes_to_struct  # noqa: E402
+
+
+ERR_THRESHOLD = 1E-6
+
+PRECISION_T = {
+    8: '64',
+    4: '32',
+    2: '16',
+    1: '8'
+}
+
+NUMPY_T = {
+    '64': np.float64,
+    '32': np.float32,
+    '16': np.float16
+}
+
+
+def main():
+    # Run simulation and get outputs
+    args = verification.parse_args()
+    raw_results = verification.simulate(sim_bin=args.sim_bin,
+                                        snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
+                                        log=args.log,
+                                        output_uids=['linear_output'])
+
+    # Extract input operands from ELF file
+    if args.symbols_bin:
+        elf = Elf(args.symbols_bin)
+    else:
+        elf = Elf(args.snitch_bin)
+
+    layer_struct = {
+        'num_inputs': 'I',
+        'in_height': 'I',
+        'in_width': 'I',
+        'out_height': 'I',
+        'out_width': 'I',
+        'inputs': 'I',
+        'weights': 'I',
+        'concat_output': 'I',
+        'linear_output': 'I',
+        'dtype': 'I'
+    }
+    layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct)
+    num_inputs = layer['num_inputs']
+    input_shape = [layer['in_height'], layer['in_width']]
+    weights_shape = [layer['in_width']*num_inputs, layer['out_width']]
+    prec = PRECISION_T[layer['dtype']]
+
+    inputs = [np.array(bytes_to_float(elf.get_symbol_contents(f'input_{i}'), prec), dtype=NUMPY_T[prec]) for i in range(num_inputs)]
+    inputs = [torch.from_numpy(tensor.reshape(input_shape)) for tensor in inputs]
+    weights = np.array(bytes_to_float(elf.get_symbol_contents('weights'), prec), dtype=NUMPY_T[prec])
+    weights = torch.from_numpy(weights.reshape(weights_shape))
+
+    # Verify results
+    output_actual = np.array(bytes_to_float(raw_results['linear_output'], prec), dtype=NUMPY_T[prec])
+    output_golden, _ = golden_model(inputs, weights)
+    output_golden = output_golden.detach().numpy().flatten()
+
+    relative_err = np.absolute((output_golden - output_actual) / output_golden)
+    fail = np.any(relative_err > ERR_THRESHOLD)
+    if fail:
+        verification.dump_results_to_csv([output_golden, output_actual, relative_err],
+                                         Path.cwd() / 'results.csv')
+        print('Maximum relative error:', np.max(relative_err))
+
+    return int(fail)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index 313220493a..56e62e6d44 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -205,4 +205,5 @@ typedef struct network_single_cluster_t_ {
 #include "../maxpool/src/maxpool.h"
 #include "../softmax/src/softmax.h"
 #include "../concat/src/concat.h"
+#include "../fused_concat_linear/src/fused_concat_linear.h"
 // #include "utils.h"
diff --git a/sw/snRuntime/src/team.h b/sw/snRuntime/src/team.h
index 918037e646..033b0bf781 100644
--- a/sw/snRuntime/src/team.h
+++ b/sw/snRuntime/src/team.h
@@ -24,6 +24,10 @@ inline uint32_t __attribute__((const)) snrt_global_core_num() {
     return snrt_cluster_num() * snrt_cluster_core_num();
 }
 
+inline uint32_t __attribute__((const)) snrt_global_compute_core_num() {
+    return snrt_cluster_num() * snrt_cluster_compute_core_num();
+}
+
 inline uint32_t __attribute__((const)) snrt_global_core_idx() {
     return snrt_hartid() - snrt_global_core_base_hartid();
 }
diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile
index 222f75cd48..c1faab9fc3 100644
--- a/target/snitch_cluster/sw/apps/Makefile
+++ b/target/snitch_cluster/sw/apps/Makefile
@@ -20,6 +20,7 @@ SUBDIRS += dnn/maxpool
 SUBDIRS += dnn/softmax
 SUBDIRS += dnn/flashattention_2
 SUBDIRS += dnn/concat
+SUBDIRS += dnn/fused_concat_linear
 SUBDIRS += montecarlo/pi_estimation
 
 .PHONY: all clean $(SUBDIRS)
diff --git a/target/snitch_cluster/sw/apps/dnn/fused_concat_linear/Makefile b/target/snitch_cluster/sw/apps/dnn/fused_concat_linear/Makefile
new file mode 100644
index 0000000000..827014a08f
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/dnn/fused_concat_linear/Makefile
@@ -0,0 +1,12 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+APP ?= fused_concat_linear
+
+include ../../../../../../sw/dnn/common.mk
+include ../../common.mk
+
+$(DEP): $(DATA_H)
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index 4a8499a636..2746aea862 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -90,4 +90,6 @@ runs:
     cmd: ../../sw/dnn/flashattention_2/verify.py {sim_bin} {elf}
   - elf: apps/dnn/concat/build/concat.elf
     cmd: ../../sw/dnn/concat/verify.py {sim_bin} {elf}
+  - elf: apps/dnn/fused_concat_linear/build/fused_concat_linear.elf
+    cmd: ../../sw/dnn/fused_concat_linear/verify.py {sim_bin} {elf}
   - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf
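
For reference, the computation the new layer implements, and against which verify.py checks it, can be reproduced in plain PyTorch by mirroring golden_model() in datagen.py above. The sketch below is not part of the patch: the shapes follow data/params.hjson, except that num_inputs is raised to 2 purely to make the concatenation visible.

    import torch

    num_inputs = 2
    # Each input is [32, 4]; concatenating along the innermost dimension
    # yields a [32, 8] tensor, which a [8, 16] weight matrix maps to [32, 16].
    inputs = [torch.rand(32, 4, dtype=torch.float64) for _ in range(num_inputs)]
    weights = torch.rand(4 * num_inputs, 16, dtype=torch.float64)

    concat_output = torch.cat(inputs, dim=-1)              # concat stage
    linear_output = torch.matmul(concat_output, weights)   # linear (GEMM) stage
    assert linear_output.shape == (32, 16)

The data header is generated with data/datagen.py -c data/params.hjson <output.h>, and the kernel is checked against this golden model via verify.py {sim_bin} {elf}, as registered in target/snitch_cluster/sw/run.yaml above.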