From 2b80dbb1b74e542a4b8a46d80e55eeac76e5a9d9 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 18 Aug 2023 10:47:51 +0200
Subject: [PATCH 01/13] snRuntime: Correct undefined symbol errors

---
 sw/snRuntime/src/start.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
index 77d5a0326..8a692e921 100644
--- a/sw/snRuntime/src/start.c
+++ b/sw/snRuntime/src/start.c
@@ -2,6 +2,14 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+static inline void snrt_crt0_cluster_hw_barrier() {
+    uint32_t register r;
+    uint32_t hw_barrier =
+        SNRT_CLUSTER_HW_BARRIER_ADDR + snrt_cluster_idx() * SNRT_CLUSTER_OFFSET;
+    asm volatile("lw %0, 0(%1)" : "=r"(r) : "r"(hw_barrier) : "memory");
+}
+
+#ifdef SNRT_INIT_CLS
 static inline uint32_t snrt_cls_base_addr() {
     extern volatile uint32_t __cdata_start, __cdata_end;
     extern volatile uint32_t __cbss_start, __cbss_end;
@@ -12,14 +20,9 @@ static inline uint32_t snrt_cls_base_addr() {
                            SNRT_TCDM_SIZE;
     return l1_end_addr - cdata_size - cbss_size;
 }
+#endif
 
-static inline void snrt_crt0_cluster_hw_barrier() {
-    uint32_t register r;
-    uint32_t hw_barrier =
-        SNRT_CLUSTER_HW_BARRIER_ADDR + snrt_cluster_idx() * SNRT_CLUSTER_OFFSET;
-    asm volatile("lw %0, 0(%1)" : "=r"(r) : "r"(hw_barrier) : "memory");
-}
-
+#ifdef SNRT_INIT_TLS
 static inline void snrt_init_tls() {
     extern volatile uint32_t __tdata_start, __tdata_end;
     extern volatile uint32_t __tbss_start, __tbss_end;
@@ -41,7 +44,9 @@ static inline void snrt_init_tls() {
         tls_ptr++;
     }
 }
+#endif
 
+#ifdef SNRT_INIT_BSS
 static inline void snrt_init_bss() {
     extern volatile uint32_t __bss_start, __bss_end;
 
@@ -54,7 +59,9 @@ static inline void snrt_init_bss() {
         }
     }
 }
+#endif
 
+#ifdef SNRT_INIT_CLS
 static inline void snrt_init_cls() {
     extern volatile uint32_t __cdata_start, __cdata_end;
     extern volatile uint32_t __cbss_start, __cbss_end;
@@ -81,13 +88,18 @@ static inline void snrt_init_cls() {
         }
     }
 }
+#endif
 
+#ifdef SNRT_INIT_LIBS
 static inline void snrt_init_libs() { snrt_alloc_init(); }
+#endif
 
+#ifdef SNRT_CRT0_EXIT
 static inline void snrt_exit(int exit_code) {
     if (snrt_global_core_idx() == 0)
         *(snrt_exit_code_destination()) = (exit_code << 1) | 1;
 }
+#endif
 
 void snrt_main() {
     int exit_code = 0;

From d25cd9e2f25768163b499505f38512bae064014a Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Tue, 30 May 2023 12:21:25 +0200
Subject: [PATCH 02/13] util: Streamline addr2line functionality and extend
 eventvis.py

---
 util/trace/a2l.py           | 109 +++++++++
 util/trace/annotate.py      |  90 ++-----
 util/trace/eventvis.py      |  32 ++-
 util/trace/layout_events.py | 119 +++++----
 util/trace/tracevis.py      | 471 ++++++++++++++++++++----------------
 5 files changed, 495 insertions(+), 326 deletions(-)
 create mode 100644 util/trace/a2l.py

diff --git a/util/trace/a2l.py b/util/trace/a2l.py
new file mode 100644
index 000000000..c62633739
--- /dev/null
+++ b/util/trace/a2l.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+
+# Copyright 2021 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
+#
+# Utilities for common tasks involving addr2line
+
+import os
+from pathlib import Path
+from functools import lru_cache
+from operator import itemgetter
+
+
+def unzip(ls):  # Inverse of zip(): yields one tuple per position in ls' items
+    return zip(*ls)
+
+
+def format_function_name(name):
+    # Replaces the '??' placeholder with a readable label,
+    # any other name is returned unchanged
+    return 'unknown function' if name == '??' else name
+
+
+def format_line(num):
+    # Drop a trailing annotation such as ' (discriminator 3)', if present
+    num = num.split(' ')[0]
+    return -1 if num == '?' else int(num)
+
+
+class Addr2LineOutput:
+    """Parsed view of the raw text printed by `addr2line -f -i`."""
+
+    indent_unit = '  '
+
+    def __init__(self, raw):
+        self.raw = raw
+
+    # Returns the function stack of the current line, in addr2line order,
+    # as a list of {'func': ..., 'file': ..., 'line': ...} dictionaries.
+    # If there was no function inlining, then the function stack
+    # includes only the function the line belongs to.
+    # If there was inlining, it includes all functions the line
+    # belonged to after inlining the previous, up to (and including)
+    # the last function which was not inlined.
+    # Raises ValueError if the addr2line output cannot be parsed.
+    def function_stack(self):
+        # The output alternates function names and 'file:line' pairs;
+        # zip() drops the unpaired empty string after the last newline
+        output = self.raw.split('\n')
+        stack = []
+        for func, location in zip(output[::2], output[1::2]):
+            # Split on the last colon only, the path may contain colons
+            file, _, line = location.rpartition(':')
+            stack.append({'func': format_function_name(func),
+                          'file': file, 'line': format_line(line)})
+        if not stack:
+            raise ValueError(f'Empty addr2line output: {self.raw!r}')
+        return stack
+
+    # Formats the function stack as one "func (file:line)" row per level,
+    # in reverse stack order, indenting each row one unit more than the last.
+    # With `short`, only the file name is printed instead of the full path.
+    def function_stack_string(self, short=True):
+        rows = []
+        for i, level in enumerate(reversed(self.function_stack())):
+            func, file, line = itemgetter('func', 'file', 'line')(level)
+            if short:
+                file = Path(file).name
+            rows.append(f'{self.indent_unit * i}{func} ({file}:{line})\n')
+        return ''.join(rows)
+
+    # Returns the (stripped) source line of the first stack entry, or an
+    # empty string if the file cannot be read or the line number is invalid.
+    def line(self):
+        file, line = itemgetter('file', 'line')(self.function_stack()[0])
+        try:
+            with open(file, 'r') as f:
+                src = [x.strip() for x in f.readlines()]
+        except OSError:
+            return ''
+        # Line numbers are 1-based; format_line() yields -1 if unknown
+        return src[line - 1] if 1 <= line <= len(src) else ''
+
+    def __str__(self):
+        s = self.function_stack_string()
+        src_line = self.line()  # Read the source file only once
+        if src_line:
+            indent = self.indent_unit * len(s.strip().split('\n'))
+            s += f'{indent}{src_line}'
+        return s
+
+
+class Elf:
+    """ELF binary whose addresses are resolved by invoking addr2line."""
+    def __init__(self, elf, a2l_binary='addr2line'):
+        self.elf = Path(elf)
+        self.a2l = a2l_binary
+        assert self.elf.exists(), f'File not found {self.elf}'
+
+    # `addr` is an integer, or a string holding a hexadecimal number
+    @lru_cache(maxsize=1024)
+    def addr2line(self, addr):
+        if isinstance(addr, str):
+            addr = int(addr, 16)
+        cmd = f'{self.a2l} -e {self.elf} -f -i {addr:x}'
+        return Addr2LineOutput(os.popen(cmd).read())
diff --git a/util/trace/annotate.py b/util/trace/annotate.py
index a88664544..512556190 100755
--- a/util/trace/annotate.py
+++ b/util/trace/annotate.py
@@ -22,9 +22,8 @@
 import sys
 import os
 import re
-from functools import lru_cache
 import argparse
-from termcolor import colored
+import a2l
 
 # Argument parsing
 parser = argparse.ArgumentParser('annotate', allow_abbrev=True)
@@ -84,7 +83,7 @@
 
 args = parser.parse_args()
 
-elf = args.elf
+elf_file = args.elf
 trace = args.trace
 output = args.output
 diff = args.diff
@@ -93,7 +92,7 @@
 keep_time = args.keep_time
 
 if not quiet:
-    print('elf:', elf, file=sys.stderr)
+    print('elf:', elf_file, file=sys.stderr)
     print('trace:', trace, file=sys.stderr)
     print('output:', output, file=sys.stderr)
     print('diff:', diff, file=sys.stderr)
@@ -110,34 +109,9 @@
 trace_start_col = -1
 
 
-@lru_cache(maxsize=1024)
-def adr2line(addr):
-    cmd = f'{addr2line} -e {elf} -f -i {addr:x}'
-    return os.popen(cmd).read().split('\n')
-
-
-# helper functions to parse addr2line output
-def a2l_file_path(a2l_file_str):
-    return a2l_file_str.split(':')[0]
-
-
-def a2l_file_name(a2l_file_str):
-    return a2l_file_str.split('/')[-1].split(':')[0]
-
-
-def a2l_file_line(a2l_file_str):
-    return int(a2l_file_str.split(':')[-1].split(' ')[0])
-
-
-def format_a2l_funcname(a2l_func_name):
-    if a2l_func_name == '??':
-        return 'unknown function'
-    return a2l_func_name
-
-
 # helper functions to assemble diff output
 def format_call(level, call):
-    funcname = format_a2l_funcname(call[0])
+    funcname = a2l.format_function_name(call[0])
     if level == 0:
         return f'{funcname} ({call[1]})\n'
     else:
@@ -189,6 +163,9 @@ def dump_hunk(hunk_tstart, hunk_sstart, hunk_trace, hunk_source):
     of.write(f'{hunk_header}{hunk_trace}{hunk_source}')
 
 
+# Open ELF file for addr2line processing
+elf = a2l.Elf(elf_file)
+
 # core functionality
 with open(trace, 'r') as f:
 
@@ -223,12 +200,16 @@ def dump_hunk(hunk_tstart, hunk_sstart, hunk_trace, hunk_source):
         # RTL traces might not contain a PC on each line
         try:
             # Get address from PC column
-            addr_str = cols[3]
-            addr = int(addr_str, base=16)
+            addr = cols[3]
             # Find index of first character in PC
             if trace_start_col < 0:
-                trace_start_col = line.find(addr_str)
+                trace_start_col = line.find(addr)
+            # Get addr2line information and format it as an assembly comment
+            a2l_output = elf.addr2line(addr)
+            annot = '\n'.join([f'#; {line}' for line in str(a2l_output).split('\n')])
         except (ValueError, IndexError):
+            a2l_output = None
+            annot = ''
             if keep_time:
                 filtered_line = f'{time:>12}    {line[trace_start_col:]}'
             else:
@@ -245,41 +226,14 @@ def dump_hunk(hunk_tstart, hunk_sstart, hunk_trace, hunk_source):
         else:
             filtered_line = f'{line[trace_start_col:]}'
 
-        addr_hex = f'{addr:x}'
-        ret = adr2line(addr)
-
-        funs = ret[::2]
-        file_paths = [a2l_file_path(x) for x in ret[1::2]]
-        file_names = [a2l_file_name(x) for x in ret[1::2]]
-        file_lines = [a2l_file_line(x) for x in ret[1::2]]
-        # Assemble annotation string
-        if len(funs):
-            annot = f'#; {funs[0]} ({file_names[0]}:{file_lines[0]})'
-            for fun, file_name, file_line in zip(funs[1:], file_names[1:], file_lines[1:]):
-                annot = f'{annot}\n#;  in {fun} ({file_name}:{file_line})'
-
-        # Get source of last file and print the line
-        src_fname = file_paths[0]
-        if src_fname not in src_files.keys():
-            try:
-                # Issue warning if source was modified after trace
-                src_timestamp = os.path.getmtime(src_fname)
-                if src_timestamp >= trace_timestamp:
-                    print(colored('Warning:', 'yellow'),
-                          f'{src_fname} has been edited since the trace was generated')
-
-                with open(src_fname, 'r') as src_f:
-                    src_files[src_fname] = [x.strip() for x in src_f.readlines()]
-            except OSError:
-                src_files[src_fname] = None
-        if src_files[src_fname] is not None:
-            src_line = src_files[src_fname][file_lines[0]-1]
-            annot = f'{annot}\n#;  {src_line}'
-
         # Print diff
         if diff:
             # Compare current and previous call stacks
-            next_call_stack = assemble_call_stack(funs, file_paths, file_lines)
+            if a2l_output:
+                funs, files, lines = zip(*[level.values() for level in a2l_output.function_stack()])
+            else:
+                funs = files = lines = []
+            next_call_stack = assemble_call_stack(funs, files, lines)
             matching_cstack_levels = matching_call_stack_levels(next_call_stack, call_stack)
             matching_src_line = matching_source_line(next_call_stack, call_stack)
 
@@ -297,13 +251,14 @@ def dump_hunk(hunk_tstart, hunk_sstart, hunk_trace, hunk_source):
             call_stack = next_call_stack
 
             # Assemble source part of hunk
-            if len(funs) and src_files[src_fname]:
+            src_line = a2l_output.line()
+            if len(funs) and src_line:
                 for i, call in enumerate(call_stack):
                     if i >= matching_cstack_levels:
                         hunk_source += f'+{format_call(i, call)}'
                 if not matching_src_line:
                     indentation = '  ' * (len(call_stack) - 1)
-                    hunk_source += f'+{indentation}{file_lines[0]}: {src_line}\n'
+                    hunk_source += f'+{indentation}{lines[0]}: {src_line}\n'
 
             # Assemble trace part of hunk
             hunk_trace += f'-{filtered_line}'
@@ -329,4 +284,3 @@ def dump_hunk(hunk_tstart, hunk_sstart, hunk_trace, hunk_source):
 
 if not quiet:
     print(' done')
-    print(adr2line.cache_info())
diff --git a/util/trace/eventvis.py b/util/trace/eventvis.py
index 2d81ef8fb..4d0fdfdc7 100755
--- a/util/trace/eventvis.py
+++ b/util/trace/eventvis.py
@@ -31,6 +31,7 @@
 import argparse
 import csv
 import json
+import tracevis
 
 
 def pairwise(iterable):
@@ -51,6 +52,15 @@ def main():
         'csv',
         metavar='<csv>',
         help='Input CSV file')
+    parser.add_argument(
+        '--traces',
+        metavar='<trace>',
+        nargs='*',
+        help='Simulation traces to process')
+    parser.add_argument(
+        '--elf',
+        nargs='?',
+        help='ELF from which the traces were generated')
     parser.add_argument(
         '-o',
         '--output',
@@ -60,8 +70,21 @@ def main():
         help='Output JSON file')
     args = parser.parse_args()
 
-    # Read CSV to collect TraceViewer events
+    # TraceViewer events
     events = []
+
+    # Add a dummy instant event to mark time 0.
+    # Without it, all events would be shifted away from
+    # their actual start times, so as to align the first
+    # event at time 0.
+    event = {'name': 'zero',
+             'ph':   'I',  # Instant event type
+             'ts':   0,
+             's':    'g'  # Global scope
+             }
+    events.append(event)
+
+    # Read CSV to collect TraceViewer events
     with open(args.csv) as f:
         reader = csv.reader(f, delimiter=',')
 
@@ -92,6 +115,13 @@ def main():
                              }
                     events.append(event)
 
+    # Optionally extract also instruction-level events
+    # from the simulation traces
+    if args.traces and args.elf:
+        events += tracevis.parse_traces(args.traces, start=0, end=-1, fmt='snitch',
+                                        addr2line='addr2line', use_time=True, pid=1,
+                                        cache=True, elf=args.elf, collapse_call_stack=True)
+
     # Create TraceViewer JSON object
     tvobj = {}
     tvobj['traceEvents'] = events
diff --git a/util/trace/layout_events.py b/util/trace/layout_events.py
index a17fa504d..ea877c53c 100755
--- a/util/trace/layout_events.py
+++ b/util/trace/layout_events.py
@@ -40,6 +40,7 @@
 import argparse
 import csv
 import pandas as pd
+from math import isnan
 
 
 def main():
@@ -53,6 +54,11 @@ def main():
         'layout',
         metavar='<layout>',
         help='Layout CSV file')
+    parser.add_argument(
+        '--num-clusters',
+        type=int,
+        default=1,
+        help='Number of clusters')
     parser.add_argument(
         '-o',
         '--output',
@@ -65,58 +71,67 @@ def main():
     # Read input CSV
     df = pd.read_csv(args.csv)
 
-    # Open output CSV for writing
-    with open(args.output, mode='w') as out_f:
-        writer = csv.writer(out_f, delimiter=',', quotechar='"')
-
-        # Open layout CSV
-        with open(args.layout) as layout_f:
-            layout_reader = csv.reader(layout_f, delimiter=',')
-
-            # Get region labels from layout header
-            regions = [label for label in next(layout_reader) if label and not label.isspace()]
-
-            # Generate output header: appropriately spaced region labels
-            header = [''] + [val for label in regions for val in [label, '']]
-            writer.writerow(header)
-
-            # Iterate layout rows
-            for row in layout_reader:
-
-                # First entry in row is a hart ID or a Python expression
-                # which generates a list of hart IDs
-                expr = row[0]
-                code = compile(expr, "<string>", "eval")
-                tids = eval(code)
-                if isinstance(tids, int):
-                    tids = [tids]
-
-                # Iterate hart IDs
-                for tid in tids:
-
-                    # Start output row with hart ID
-                    orow = [tid]
-
-                    # Iterate all other cells in layout row (indices of regions to take)
-                    for cell in row[1:]:
-
-                        # If the cell is not empty, get start and end times
-                        # of the region from the input CSV and append them to the
-                        # output row. Otherwise, leave cells empty.
-                        if cell and not cell.isspace():
-                            reg_idx = int(cell)
-                            row_idx = tid
-                            col_idx = 1 + reg_idx * 2
-                            assert row_idx < df.shape[0], f'Hart ID {row_idx} out of bounds'
-                            assert (col_idx + 1) < df.shape[1], \
-                                f'Region index {reg_idx} out of bounds'
-                            orow.append(int(df.iat[row_idx, col_idx]))
-                            orow.append(int(df.iat[row_idx, col_idx + 1]))
-                        else:
-                            orow.append('')
-                            orow.append('')
-
-                    writer.writerow(orow)
+    # Output CSV data
+    data = []
+    columns = []
+
+    # Open layout CSV
+    with open(args.layout) as layout_f:
+        layout_reader = csv.reader(layout_f, delimiter=',')
+
+        # Get region labels from layout header
+        regions = [label for label in next(layout_reader) if label and not label.isspace()]
+
+        # Generate output columns: appropriately spaced region labels
+        columns = ['hartid'] + [val for label in regions for val in [label, '']]
+
+        # Iterate layout rows
+        for row in layout_reader:
+
+            # First entry in row is a hart ID or a Python expression (which
+            # may use `num_clusters`) generating a list of hart IDs
+            expr = row[0]
+            code = compile(expr, "<string>", "eval")
+            tids = eval(code, {}, {'num_clusters': args.num_clusters})
+            if isinstance(tids, int):
+                tids = [tids]
+
+            # Iterate hart IDs
+            for tid in tids:
+
+                # Start output row with hart ID
+                orow = [tid]
+
+                # Iterate all other cells in layout row (indices of regions to take)
+                for cell in row[1:]:
+
+                    # If the cell is not empty, get start and end times
+                    # of the region from the input CSV and append them to the
+                    # output row. Otherwise, leave cells empty.
+                    if cell and not cell.isspace():
+                        reg_idx = int(cell)
+                        row_idx = tid
+                        col_idx = 1 + reg_idx * 2
+                        assert row_idx < df.shape[0], f'Hart ID {row_idx} out of bounds'
+                        assert (col_idx + 1) < df.shape[1],\
+                            f'Region index {reg_idx} out of bounds for hart {tid}'
+                        assert not isnan(df.iat[row_idx, col_idx]),\
+                            (f'Region {reg_idx} looks empty for hart {tid}, '
+                             f'check whether it was simulated')
+                        orow.append(int(df.iat[row_idx, col_idx]))
+                        orow.append(int(df.iat[row_idx, col_idx + 1]))
+                    else:
+                        orow.append('')
+                        orow.append('')
+
+                data.append(orow)
+
+    # Create output dataframe and write to CSV
+    df = pd.DataFrame(data, columns=columns)
+    df.set_index('hartid', inplace=True)
+    df.sort_index(axis='index', inplace=True)
+    df.index.name = None
+    df.to_csv(args.output)
 
 
 if __name__ == '__main__':
diff --git a/util/trace/tracevis.py b/util/trace/tracevis.py
index ecc344253..599c82bd6 100755
--- a/util/trace/tracevis.py
+++ b/util/trace/tracevis.py
@@ -12,12 +12,13 @@
 # This script is inspired by https://github.com/SalvatoreDiGirolamo/tracevis
 # Author: Noah Huetter <huettern@student.ethz.ch>
 #         Samuel Riedel <sriedel@iis.ee.ethz.ch>
+#         Luca Colagrande <colluca@iis.ee.ethz.ch>
 
 import re
-import os
 import sys
-from functools import lru_cache
+import json
 import argparse
+from a2l import Elf
 
 has_progressbar = True
 try:
@@ -31,13 +32,16 @@
 # line format:
 # Snitch RTL simulation:
 # 101000 82      M         0x00001000 csrr    a0, mhartid     #; comment
-# time   cycle   priv_lvl  pc         insn
+# CVA6 RTL simulation:
+# 101ns  82      M         0000000000001000 0      301022f3     csrr   t0, misa  ...
+# time   cycle   priv_lvl  pc               branch machine_insn insn
 # MemPool RTL simulation:
 # 101000 82      0x00001000 csrr    a0, mhartid     #; comment
 # time   cycle   pc         insn
 # Banshee traces:
 # 00000432 00000206 0005     800101e0  x15:00000064 x15=00000065 # addi    a5, a5, 1
 # cycle    instret  hard_id  pc        register                    insn
+FORMATS = ['cva6', 'snitch', 'banshee']
 
 # regex matches to groups
 # 0 -> time
@@ -45,10 +49,11 @@
 # 2 -> privilege level (RTL) / hartid (banshee)
 # 3 -> pc (hex with 0x prefix)
 # 4 -> instruction
-# 5 -> args (RTL) / empty (banshee)
-# 6 -> comment (RTL) / instruction arguments (banshee)
-RTL_REGEX = r' *(\d+) +(\d+) +([3M1S0U]?) *(0x[0-9a-f]+) ([.\w]+) +(.+)#; (.*)'
-BANSHEE_REGEX = r' *(\d+) (\d+) (\d+) ([0-9a-f]+) *.+ +.+# ([\w\.]*)( +)(.*)'
+# 5 -> args (RTL) / empty (cva6, banshee)
+# 6 -> comment (RTL) / instruction arguments (banshee) / empty (cva6)
+REGEX = {'snitch': r' *(\d+) +(\d+) +([3M1S0U]?) *(0x[0-9a-f]+) ([.\w]+) +(.+)#; (.*)',
+         'cva6': r' *(\d+)ns +(\d+) +([3M1S0U]?) *([0-9a-f]+) +[01]+ +[0-9a-f]+ +([.\w]+)',
+         'banshee': r' *(\d+) (\d+) (\d+) ([0-9a-f]+) *.+ +.+# ([\w\.]*)( +)(.*)'}
 
 # regex matches a line of instruction retired by the accelerator
 # 0 -> time
@@ -57,29 +62,20 @@
 # 3 -> comment
 ACC_LINE_REGEX = r' *(\d+) +(\d+) +([3M1S0U]?) *#; (.*)'
 
-buf = []
 
+# Parses the output of the `parse_line()` function into a TraceViewer
+# event, formatted as a dictionary. It operates on multiple of these
+# outputs, collected in a buffer `buf`.
+def flush(lah, buf, **kwargs):
+    elf = kwargs['elf']
+    fmt = kwargs['fmt']
+    use_time = kwargs['use_time']
+    collapse_call_stack = kwargs['collapse_call_stack']
 
-@lru_cache(maxsize=1024)
-def addr2line_cache(addr):
-    cmd = f'{addr2line} -e {elf} -f -a -i {addr:x}'
-    return os.popen(cmd).read().split('\n')
-
-
-def flush(buf, hartid):
-    global output_file
-    # get function names
-    pcs = [x[3] for x in buf]
-    a2ls = []
-
-    if cache:
-        for addr in pcs:
-            a2ls += addr2line_cache(int(addr, base=16))[:-1]
-    else:
-        a2ls = os.popen(
-            f'{addr2line} -e {elf} -f -a -i {" ".join(pcs)}').read().split('\n')[:-1]
-
+    # Iterate buffer entries
+    events = []
     for i in range(len(buf)-1):
+
         (time, cyc, priv, pc, instr, args, cmt) = buf.pop(0)
 
         if use_time:
@@ -91,158 +87,86 @@ def flush(buf, hartid):
 
         # Have lookahead time to this instruction?
         next_time = lah[time] if time in lah else next_time
+        duration = next_time - time
 
-        # print(f'time "{time}", cyc "{cyc}", priv "{priv}", pc "{pc}"'
-        #       f', instr "{instr}", args "{args}"', file=sys.stderr)
-
-        [pc, func, file] = a2ls.pop(0), a2ls.pop(0), a2ls.pop(0)
-
-        # check for more output of a2l
-        inlined = ''
-        while not a2ls[0].startswith('0x'):
-            inlined += '(inlined by) ' + a2ls.pop(0)
-        # print(f'pc "{pc}", func "{func}", file "{file}"')
+        # Get information on current instruction from addr2line
+        a2l_info = elf.addr2line(pc)
 
-        # assemble values for json
+        # Assemble TraceViewer event
         # Doc: https://docs.google.com/document/d/
-        # 1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
+        #      1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
+        event = {}
         # The name of the event, as displayed in Trace Viewer
-        name = instr
+        event['name'] = instr
+        # The event type, 'X' indicates a "complete event"
+        event['ph'] = 'X'
         # The event categories. This is a comma separated list of categories for the event.
         # The categories can be used to hide events in the Trace Viewer UI.
-        cat = 'instr'
-        # The tracing clock timestamp of the event.
-        # The timestamps are provided at microsecond granularity.
-        ts = time
-        # There is an extra parameter dur to specify the tracing clock duration
-        # of complete events in microseconds.
-        duration = next_time - time
-
-        if banshee:
+        event['cat'] = 'instr'
+        # The tracing clock timestamp of the event. The timestamps are provided at microsecond
+        # granularity.
+        if use_time:
+            time = time / 1000 if fmt == 'cva6' else time / 1000000
+        event['ts'] = time
+        # There is an extra parameter dur to specify the tracing clock duration of complete
+        # events in microseconds. In Banshee, each instruction takes one cycle
+        if use_time:
+            duration = duration / 1000 if fmt == 'cva6' else duration / 1000000
+        event['dur'] = 1 if fmt == 'banshee' else duration
+        # The thread ID is used to group events in a single TraceViewer row
+        if not collapse_call_stack:
+            event['tid'] = a2l_info.function_stack()[0]['func']
+        if fmt == 'banshee':
             # Banshee stores all traces in a single file
-            hartid = priv
-            # In Banshee, each instruction takes one cycle
-            duration = 1
-
-        pid = elf+':hartid'+str(hartid)
-        funcname = func
-
-        # args
-        arg_pc = pc
-        arg_instr = instr
-        arg_args = args
-        arg_cycles = cyc
-        arg_coords = file
-        arg_inlined = inlined
-
-        output_file.write((
-            f'{{"name": "{name}", "cat": "{cat}", "ph": "X", '
-            f'"ts": {ts}, "dur": {duration}, "pid": "{pid}", '
-            f'"tid": "{funcname}", "args": {{"pc": "{arg_pc}", '
-            f'"instr": "{arg_instr} {arg_args}", "time": "{arg_cycles}", '
-            f'"Origin": "{arg_coords}", "inline": "{arg_inlined}"'
-            f'}}}},\n'))
-
-
-def parse_line(line, hartid):
-    global last_time, last_cyc
+            event['tid'] = priv
+        # Additional event args
+        event['args'] = {}
+        event['args']['pc'] = pc
+        event['args']['instr'] = f'{instr} {args}'
+        if cmt:
+            event['args']['comment'] = cmt
+        event['args']['cycle'] = cyc
+        event['args']['stack'] = a2l_info.function_stack_string(short=True)
+        event['args']['line'] = a2l_info.line()
+
+        events.append(event)
+    return events
+
+
+# Parses a trace line and returns an array of values extracted from the line
+def parse_line(line, **kwargs):
+    fmt = kwargs['fmt']
+
+    # Compile regex
+    re_line = re.compile(REGEX[fmt])
+
     # print(line)
     match = re_line.match(line)
     if match:
-        (time, cyc, priv, pc, instr, args, cmt) = tuple(
-            [match.group(i+1).strip() for i in range(re_line.groups)])
-        buf.append((time, cyc, priv, pc, instr, args, cmt))
-        last_time, last_cyc = time, cyc
-
-    if len(buf) > 10:
-        flush(buf, hartid)
-    return 0
-
-
-# Argument parsing
-parser = argparse.ArgumentParser('tracevis', allow_abbrev=True)
-parser.add_argument(
-    'elf',
-    metavar='<elf>',
-    help='The binary executed to generate the traces',
-
-
-)
-parser.add_argument(
-    'traces',
-    metavar='<trace>',
-    nargs='+',
-    help='Snitch traces to visualize')
-parser.add_argument(
-    '-o',
-    '--output',
-    metavar='<json>',
-    nargs='?',
-    default='chrome.json',
-    help='Output JSON file')
-parser.add_argument(
-    '--addr2line',
-    metavar='<path>',
-    nargs='?',
-    default='addr2line',
-    help='`addr2line` binary to use for parsing')
-parser.add_argument(
-    '-t',
-    '--time',
-    action='store_true',
-    help='Use the traces time instead of cycles')
-parser.add_argument(
-    '-b',
-    '--banshee',
-    action='store_true',
-    help='Parse Banshee traces')
-parser.add_argument(
-    '--no-cache',
-    action='store_true',
-    help='Disable addr2line caching (slow but might give better traces in some cases)')
-parser.add_argument(
-    '-s',
-    '--start',
-    metavar='<line>',
-    nargs='?',
-    type=int,
-    default=0,
-    help='First line to parse')
-parser.add_argument(
-    '-e',
-    '--end',
-    metavar='<line>',
-    nargs='?',
-    type=int,
-    default=-1,
-    help='Last line to parse')
-
-args = parser.parse_args()
-
-elf = args.elf
-traces = args.traces
-output = args.output
-use_time = args.time
-banshee = args.banshee
-addr2line = args.addr2line
-cache = not args.no_cache
-
-print('elf:', elf, file=sys.stderr)
-print('traces:', traces, file=sys.stderr)
-print('output:', output, file=sys.stderr)
-print('addr2line:', addr2line, file=sys.stderr)
-print('cache:', cache, file=sys.stderr)
-
-# Compile regex
-if banshee:
-    re_line = re.compile(BANSHEE_REGEX)
-else:
-    re_line = re.compile(RTL_REGEX)
-
-re_acc_line = re.compile(ACC_LINE_REGEX)
-
-
-def offload_lookahead(lines):
+        # TODO extend CVA6 regex to extract instruction args
+        if fmt == 'cva6':
+            (time, cyc, priv, pc, instr) = tuple(
+                [match.group(i+1).strip() for i in range(re_line.groups)])
+            args = cmt = ''
+        else:
+            (time, cyc, priv, pc, instr, args, cmt) = tuple(
+                [match.group(i+1).strip() for i in range(re_line.groups)])
+        return (time, cyc, priv, pc, instr, args, cmt)
+
+    return None
+
+
+# Parses a trace file and returns a dictionary mapping the time stamp
+# when every instruction is issued, to the time stamp when the instruction
+# writes back.
+def offload_lookahead(lines, **kwargs):
+    fmt = kwargs['fmt']
+    use_time = kwargs['use_time']
+
+    # Compile regex
+    re_line = re.compile(REGEX[fmt])
+    re_acc_line = re.compile(ACC_LINE_REGEX)
+
     # dict mapping time stamp of retired instruction to time stamp of
     # accelerator complete
     lah = {}
@@ -287,40 +211,177 @@ def offload_lookahead(lines):
     return lah
 
 
-lah = {}
-
-with open(output, 'w') as output_file:
-    # JSON header
-    output_file.write('{"traceEvents": [\n')
-
-    for filename in traces:
-        hartid = 0
-        parsed_nums = re.findall(r'\d+', filename)
-        hartid = int(parsed_nums[-1]) if len(parsed_nums) else hartid+1
-        fails = lines = 0
-        last_time = last_cyc = 0
-
-        print(
-            f'parsing hartid {hartid} with trace {filename}', file=sys.stderr)
-        tot_lines = len(open(filename).readlines())
-        with open(filename) as f:
-            all_lines = f.readlines()[args.start:args.end]
-            # offload lookahead
-            if not banshee:
-                lah = offload_lookahead(all_lines)
-            if has_progressbar:
-                for lino, line in progressbar.progressbar(
-                        enumerate(all_lines),
-                        max_value=tot_lines):
-                    fails += parse_line(line, hartid)
-                    lines += 1
+# Parses a trace file and returns a list of TraceViewer events.
+# Each event is formatted as a dictionary.
+def parse_trace(filename, **kwargs):
+
+    start = kwargs['start']
+    end = kwargs['end']
+    fmt = kwargs['fmt']
+
+    # Open trace
+    print(f'parsing trace {filename}', file=sys.stderr)
+    lah = {}
+    buf = []
+    fails = lines = 0
+    with open(filename) as f:
+
+        # Read lines
+        all_lines = f.readlines()
+        if end < 0:
+            end = len(all_lines) + end + 1
+        all_lines = all_lines[start:end]
+
+        # offload lookahead
+        if fmt == 'snitch':
+            lah = offload_lookahead(all_lines, **kwargs)
+
+        # Use a progress bar iterator if the package is installed
+        if has_progressbar:
+            iterations = progressbar.progressbar(
+                    enumerate(all_lines),
+                    max_value=len(all_lines))
+        else:
+            iterations = enumerate(all_lines)
+
+        # Iterate lines
+        events = []
+        for lino, line in iterations:
+            # Parse line
+            parsed_line = parse_line(line, **kwargs)
+            if parsed_line:
+                buf.append(parsed_line)
+            else:
+                fails += 1
+            lines += 1
+
+            # Flush buffer when it contains enough lines
+            if len(buf) > 10:
+                events += flush(lah, buf, **kwargs)
+        events += flush(lah, buf, **kwargs)
+
+        print(f' parsed {lines-fails} of {lines} lines', file=sys.stderr)
+        return events
+
+
+def parse_traces(traces, **kwargs):
+
+    # Open ELF file
+    elf_path = kwargs['elf']
+    kwargs['elf'] = Elf(elf_path, a2l_binary=kwargs['addr2line'])
+
+    # Iterate traces
+    events = []
+    for i, filename in enumerate(traces):
+
+        # Extract hartid from filename or use current index
+        # TODO doesn't work with hex numbers
+        # parsed_nums = re.findall(r'\d+', filename)
+        # hartid = int(parsed_nums[-1]) if len(parsed_nums) else i
+        hartid = i
+
+        # Extract TraceViewer events from trace
+        trace_events = parse_trace(filename, **kwargs)
+
+        # Assign a per-trace unique TID or PID to all events
+        pid = elf_path if 'pid' not in kwargs else kwargs['pid']
+        for event in trace_events:
+            if kwargs['collapse_call_stack']:
+                event['pid'] = pid
+                event['tid'] = hartid
             else:
-                for lino, line in enumerate(
-                        all_lines):
-                    fails += parse_line(line, hartid)
-                    lines += 1
-            flush(buf, hartid)
-            print(f' parsed {lines-fails} of {lines} lines', file=sys.stderr)
-
-    # JSON footer
-    output_file.write(r'{}]}''\n')
+                event['pid'] = pid+':hartid'+str(hartid)
+
+        # Add to events from previous traces
+        events += trace_events
+
+    return events
+
+
+def main(**kwargs):
+    elf = kwargs['elf']
+    traces = kwargs['traces']
+    output = kwargs['output']
+    addr2line = kwargs['addr2line']
+
+    print('elf:', elf, file=sys.stderr)
+    print('traces:', traces, file=sys.stderr)
+    print('output:', output, file=sys.stderr)
+    print('addr2line:', addr2line, file=sys.stderr)
+
+    # Parse traces and create TraceViewer JSON object
+    events = parse_traces(**kwargs)
+    tvobj = {'traceEvents': events, 'displayTimeUnit': 'ns'}
+
+    # Dump JSON object to file
+    with open(output, 'w') as output_file:
+        json.dump(tvobj, output_file, indent=4)
+
+
+# Parse command-line args
+def parse_args():
+    # Argument parsing
+    parser = argparse.ArgumentParser('tracevis', allow_abbrev=True)
+    parser.add_argument(
+        'elf',
+        metavar='<elf>',
+        help='The binary executed to generate the traces',
+    )
+    parser.add_argument(
+        'traces',
+        metavar='<trace>',
+        nargs='+',
+        help='Traces to visualize')
+    parser.add_argument(
+        '-o',
+        '--output',
+        metavar='<json>',
+        nargs='?',
+        default='chrome.json',
+        help='Output JSON file')
+    parser.add_argument(
+        '--addr2line',
+        metavar='<path>',
+        nargs='?',
+        default='addr2line',
+        help='`addr2line` binary to use for parsing')
+    parser.add_argument(
+        '-t',
+        '--time',
+        dest='use_time',
+        action='store_true',
+        help='Use the traces time instead of cycles')
+    parser.add_argument(
+        '-f',
+        '--format',
+        dest='fmt',
+        type=str,
+        default='snitch',
+        choices=FORMATS,
+        help='Trace format')
+    parser.add_argument(
+        '--collapse-call-stack',
+        action='store_true',
+        help='Visualize all instructions of a core in a single TraceViewer thread')
+    parser.add_argument(
+        '-s',
+        '--start',
+        metavar='<line>',
+        nargs='?',
+        type=int,
+        default=0,
+        help='First line to parse')
+    parser.add_argument(
+        '-e',
+        '--end',
+        metavar='<line>',
+        nargs='?',
+        type=int,
+        default=-1,
+        help='Last line to parse (inclusive)')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = vars(parse_args())
+    main(**args)

From 675bff7d666f83b5c8a03b34211baa5e7133455f Mon Sep 17 00:00:00 2001
From: Luca Colagrande <bigcola.96@gmail.com>
Date: Sat, 19 Aug 2023 16:01:20 +0200
Subject: [PATCH 03/13] docs: Make agnostic of repo to reuse snippets in occamy

---
 docs/publications.md       |  4 ++++
 docs/ug/getting_started.md | 17 ++++++++++++-----
 mkdocs.yml                 |  1 +
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/docs/publications.md b/docs/publications.md
index dd0ebd23f..6f14daa64 100644
--- a/docs/publications.md
+++ b/docs/publications.md
@@ -2,6 +2,8 @@
 
 If you use the Snitch cluster or its extensions in your work, you can cite us:
 
+<!--start-publications-->
+
 <details>
 <summary><b>Snitch: A tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads</b></summary>
 <p>
@@ -95,3 +97,5 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:
 ```
 
 </p>
+
+<!--end-publications-->
diff --git a/docs/ug/getting_started.md b/docs/ug/getting_started.md
index de2487ae5..82f60ff70 100644
--- a/docs/ug/getting_started.md
+++ b/docs/ug/getting_started.md
@@ -1,10 +1,12 @@
+<!--start-section-1-->
+
 # Getting Started
 
 ## Installation
 
 Clone the repository:
 ```shell
-git clone https://github.com/pulp-platform/snitch_cluster.git --recurse-submodules
+git clone https://github.com/pulp-platform/{{ repo }}.git --recurse-submodules
 ```
 
 If you had already cloned the repository without the `--recurse-submodules` flag, clone its submodules:
@@ -18,15 +20,17 @@ This repository requires several tools to be installed on your machine. Some of
 
 Note that installing all tools, in appropriate versions, may be non-trivial. For this purpose, we provide a Docker container with all free tools installed.
 
-The [following section](#docker-container) provides instructions to install the Docker container.
+The [following section](https://pulp-platform.github.io/{{ repo }}/ug/getting_started.html#docker-container) provides instructions to install the Docker container.
 
-Users with access to ETH Zurich IIS machines can find all tools already installed on these machines. To complete the setup, skip to the [IIS environment setup](#iis-environment-setup) section.
+Users with access to ETH Zurich IIS machines can find all tools already installed on these machines. To complete the setup, skip to the [IIS environment setup](https://pulp-platform.github.io/{{ repo }}/ug/getting_started.html#iis-environment-setup) section.
 
-If you do choose to setup a custom development environment on your own machine, we strongly recommend you take example from our [Docker file](https://github.com/pulp-platform/snitch_cluster/blob/{{ branch }}/util/container/README.md).
+If you do choose to set up a custom development environment on your own machine, we strongly recommend you take example from our [Docker file](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/util/container/README.md).
 
 ## Docker container
 
-The following instructions are extracted from the Docker container [README.md](https://github.com/pulp-platform/snitch_cluster/blob/{{ branch }}/util/container/README.md). For additional information on the Docker container refer to that document.
+<!--end-section-1-->
+
+The following instructions are extracted from the Docker container [README.md](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/util/container/README.md). For additional information on the Docker container refer to that document.
 
 ### Installation
 
@@ -38,6 +42,8 @@ The following instructions are extracted from the Docker container [README.md](h
    heading-offset=1
 %}
 
+<!--start-section-2-->
+
 ## IIS environment setup
 
 To make sure the right versions of each tool are picked up, set the following environment variables, e.g. in a bash shell:
@@ -74,3 +80,4 @@ Install the required packages in the currently active virtual environment:
 ```shell
 pip install -r python-requirements.txt
 ```
+<!--end-section-2-->
diff --git a/mkdocs.yml b/mkdocs.yml
index b817649fd..3f9595b0a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -26,6 +26,7 @@ plugins:
       on_error_fail: true
 use_directory_urls: false
 extra:
+  repo: snitch_cluster
   branch: main
 nav:
   - Home: index.md

From b5d7fbd9adf4f030383b0d0acf57ccc71e4495d3 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 25 Aug 2023 11:28:25 +0200
Subject: [PATCH 04/13] snRuntime: Various improvements from offload study

---
 sw/snRuntime/api/cluster_interrupt_decls.h    |  2 +
 sw/snRuntime/src/cluster_interrupts.h         | 27 +++++++++++-
 sw/snRuntime/src/riscv.h                      |  2 +
 sw/snRuntime/src/start.c                      | 44 +++++++------------
 sw/snRuntime/src/team.h                       |  5 +++
 target/common/common.mk                       | 18 +++++---
 target/snitch_cluster/Makefile                |  8 ++--
 .../sw/runtime/common/snitch_cluster_memory.c |  2 +
 8 files changed, 69 insertions(+), 39 deletions(-)

diff --git a/sw/snRuntime/api/cluster_interrupt_decls.h b/sw/snRuntime/api/cluster_interrupt_decls.h
index aa18db210..00a6f0a43 100644
--- a/sw/snRuntime/api/cluster_interrupt_decls.h
+++ b/sw/snRuntime/api/cluster_interrupt_decls.h
@@ -6,6 +6,8 @@ inline void snrt_int_cluster_set(uint32_t mask);
 
 inline void snrt_int_cluster_clr(uint32_t mask);
 
+inline void snrt_int_clr_mcip_unsafe();
+
 inline void snrt_int_clr_mcip();
 
 inline void snrt_int_set_mcip();
diff --git a/sw/snRuntime/src/cluster_interrupts.h b/sw/snRuntime/src/cluster_interrupts.h
index a3b15e8e0..ee2a36f87 100644
--- a/sw/snRuntime/src/cluster_interrupts.h
+++ b/sw/snRuntime/src/cluster_interrupts.h
@@ -2,6 +2,8 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+#include "../../deps/riscv-opcodes/encoding.h"
+
 /**
  * @brief Write mask to the cluster-local interrupt set register
  * @param mask set bit at X sets the interrupt of hart X
@@ -18,10 +20,33 @@ inline void snrt_int_cluster_clr(uint32_t mask) {
     *(snrt_cluster_clint_clr_ptr()) = mask;
 }
 
-inline void snrt_int_clr_mcip() {
+/**
+ * @brief Clear MCIP interrupt
+ * @details The interrupt is cleared asynchronously, i.e. it may not be cleared
+ *         yet when the function returns. Use `snrt_int_clr_mcip()` or
+ *         `snrt_int_wait_mcip_clr` if you need to block until the interrupt is
+ *         cleared.
+ */
+inline void snrt_int_clr_mcip_unsafe() {
     snrt_int_cluster_clr(1 << snrt_cluster_core_idx());
 }
 
+/**
+ * @brief Wait for MCIP interrupt to be cleared
+ */
+inline void snrt_int_wait_mcip_clr() {
+    while (read_csr(mip) & MIP_MCIP)
+        ;
+}
+
+/**
+ * @brief Clear MCIP interrupt and wait for the write to have effect
+ */
+inline void snrt_int_clr_mcip() {
+    snrt_int_clr_mcip_unsafe();
+    snrt_int_wait_mcip_clr();
+}
+
 inline void snrt_int_set_mcip() {
     snrt_int_cluster_set(1 << snrt_cluster_core_idx());
 }
diff --git a/sw/snRuntime/src/riscv.h b/sw/snRuntime/src/riscv.h
index 47542daa3..faaf888b8 100644
--- a/sw/snRuntime/src/riscv.h
+++ b/sw/snRuntime/src/riscv.h
@@ -10,6 +10,8 @@
  */
 static inline void snrt_wfi() { asm volatile("wfi"); }
 
+static inline void snrt_nop() { asm volatile("nop" : : :); }
+
 static inline uint32_t snrt_mcycle() {
     uint32_t register r;
     asm volatile("csrr %0, mcycle" : "=r"(r) : : "memory");
diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
index 8a692e921..3fb338f4a 100644
--- a/sw/snRuntime/src/start.c
+++ b/sw/snRuntime/src/start.c
@@ -2,13 +2,6 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
-static inline void snrt_crt0_cluster_hw_barrier() {
-    uint32_t register r;
-    uint32_t hw_barrier =
-        SNRT_CLUSTER_HW_BARRIER_ADDR + snrt_cluster_idx() * SNRT_CLUSTER_OFFSET;
-    asm volatile("lw %0, 0(%1)" : "=r"(r) : "r"(hw_barrier) : "memory");
-}
-
 #ifdef SNRT_INIT_CLS
 static inline uint32_t snrt_cls_base_addr() {
     extern volatile uint32_t __cdata_start, __cdata_end;
@@ -52,11 +45,9 @@ static inline void snrt_init_bss() {
 
     // Only one core needs to perform the initialization
     if (snrt_cluster_idx() == 0 && snrt_is_dm_core()) {
-        volatile uint32_t* p;
-
-        for (p = (uint32_t*)(&__bss_start); p < (uint32_t*)(&__bss_end); p++) {
-            *p = 0;
-        }
+        size_t size = (size_t)(&__bss_end) - (size_t)(&__bss_start);
+        snrt_dma_start_1d_wideptr((uint64_t)(&__bss_start),
+                                  (uint64_t)(snrt_zero_memory_ptr()), size);
     }
 }
 #endif
@@ -70,22 +61,17 @@ static inline void snrt_init_cls() {
 
     // Only one core per cluster has to do this
     if (snrt_is_dm_core()) {
-        volatile uint32_t* p;
-        volatile uint32_t* cls_ptr = (volatile uint32_t*)snrt_cls_base_addr();
+        void* ptr = (void*)snrt_cls_base_addr();
+        size_t size;
 
         // Copy cdata section to base of the TCDM
-        for (p = (uint32_t*)(&__cdata_start); p < (uint32_t*)(&__cdata_end);
-             p++) {
-            *cls_ptr = *p;
-            cls_ptr++;
-        }
+        size = (size_t)(&__cdata_end) - (size_t)(&__cdata_start);
+        if (size > 0) snrt_dma_start_1d(ptr, (void*)(&__cdata_start), size);
 
         // Clear cbss section
-        for (p = (uint32_t*)(&__cbss_start); p < (uint32_t*)(&__cbss_end);
-             p++) {
-            *cls_ptr = 0;
-            cls_ptr++;
-        }
+        ptr = (void*)((uint32_t)ptr + size);
+        size = (size_t)(&__cbss_end) - (size_t)(&__cbss_start);
+        snrt_dma_start_1d(ptr, (void*)(snrt_zero_memory_ptr()), size);
     }
 }
 #endif
@@ -105,7 +91,6 @@ void snrt_main() {
     int exit_code = 0;
 
 #ifdef SNRT_CRT0_CALLBACK0
-
     snrt_crt0_callback0();
 #endif
 
@@ -129,6 +114,11 @@ void snrt_main() {
     snrt_init_cls();
 #endif
 
+#if defined(SNRT_INIT_BSS) || defined(SNRT_INIT_CLS)
+    // Single DMA wait call for both snrt_init_bss() and snrt_init_cls()
+    if (snrt_is_dm_core()) snrt_dma_wait_all();
+#endif
+
 #ifdef SNRT_CRT0_CALLBACK3
     snrt_crt0_callback3();
 #endif
@@ -142,7 +132,7 @@ void snrt_main() {
 #endif
 
 #ifdef SNRT_CRT0_PRE_BARRIER
-    snrt_crt0_cluster_hw_barrier();
+    snrt_cluster_hw_barrier();
 #endif
 
 #ifdef SNRT_CRT0_CALLBACK5
@@ -159,7 +149,7 @@ void snrt_main() {
 #endif
 
 #ifdef SNRT_CRT0_POST_BARRIER
-    snrt_crt0_cluster_hw_barrier();
+    snrt_cluster_hw_barrier();
 #endif
 
 #ifdef SNRT_CRT0_CALLBACK7
diff --git a/sw/snRuntime/src/team.h b/sw/snRuntime/src/team.h
index 54a3b0aa2..918037e64 100644
--- a/sw/snRuntime/src/team.h
+++ b/sw/snRuntime/src/team.h
@@ -28,6 +28,11 @@ inline uint32_t __attribute__((const)) snrt_global_core_idx() {
     return snrt_hartid() - snrt_global_core_base_hartid();
 }
 
+inline uint32_t __attribute__((const)) snrt_global_compute_core_idx() {
+    return snrt_cluster_idx() * snrt_cluster_compute_core_num() +
+           snrt_cluster_core_idx();
+}
+
 inline uint32_t __attribute__((const)) snrt_cluster_idx() {
     return snrt_global_core_idx() / snrt_cluster_core_num();
 }
diff --git a/target/common/common.mk b/target/common/common.mk
index 3535fb156..9c469f5a6 100644
--- a/target/common/common.mk
+++ b/target/common/common.mk
@@ -6,15 +6,19 @@ LOGS_DIR       ?= logs
 TB_DIR         ?= $(SNITCH_ROOT)/target/common/test
 UTIL_DIR       ?= $(SNITCH_ROOT)/util
 
-# Support for local override
+# External executables
 BENDER		   ?= bender
 DASM 	       ?= spike-dasm
 VLT			   ?= verilator
 VERIBLE_FMT    ?= verible-verilog-format
+CLANG_FORMAT   ?= clang-format
+
+# Internal executables
 BIN2JTAG       ?= $(UTIL_DIR)/bin2jtag.py
-ANNOTATE	   ?= $(UTIL_DIR)/trace/annotate.py
 GENTRACE	   ?= $(UTIL_DIR)/trace/gen_trace.py
-CLANG_FORMAT   ?= clang-format
+ANNOTATE_PY	   ?= $(UTIL_DIR)/trace/annotate.py
+EVENTS_PY	   ?= $(UTIL_DIR)/trace/events.py
+PERF_CSV_PY	   ?= $(UTIL_DIR)/trace/perf_csv.py
 
 VERILATOR_ROOT ?= $(dir $(shell which $(VLT)))/../share/verilator
 VLT_ROOT	   ?= ${VERILATOR_ROOT}
@@ -194,10 +198,10 @@ traces: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.
 # make annotate
 # Generate source-code interleaved traces for all harts. Reads the binary from
 # the logs/.rtlbinary file that is written at start of simulation in the vsim script
-$(LOGS_DIR)/trace_hart_%.s: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE}
-	$(PYTHON) ${ANNOTATE} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $<
-$(LOGS_DIR)/trace_hart_%.diff: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE}
-	$(PYTHON) ${ANNOTATE} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -d
+$(LOGS_DIR)/trace_hart_%.s: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY}
+	$(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $<
+$(LOGS_DIR)/trace_hart_%.diff: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY}
+	$(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -d
 BINARY ?= $(shell cat $(LOGS_DIR)/.rtlbinary)
 annotate: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.s/') || echo "") \
           $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.diff/') || echo "")
diff --git a/target/snitch_cluster/Makefile b/target/snitch_cluster/Makefile
index 9b346ba1a..f464697d1 100644
--- a/target/snitch_cluster/Makefile
+++ b/target/snitch_cluster/Makefile
@@ -266,12 +266,12 @@ bin/snitch_cluster.vcs: ${VCS_SOURCES} ${TB_SRCS} $(TB_CC_SOURCES) $(TB_ASM_SOUR
 ##########
 
 $(LOGS_DIR)/perf.csv: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/')) \
-		$(ROOT)/util/trace/perf_csv.py
-	$(PYTHON) $(ROOT)/util/trace/perf_csv.py -o $@ -i $(LOGS_DIR)/hart_*_perf.json
+		$(PERF_CSV_PY)
+	$(PYTHON) $(PERF_CSV_PY) -o $@ -i $(LOGS_DIR)/hart_*_perf.json
 
 $(LOGS_DIR)/event.csv: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/')) \
-		$(ROOT)/util/trace/perf_csv.py
-	$(PYTHON) $(ROOT)/util/trace/perf_csv.py -o $@ -i $(LOGS_DIR)/hart_*_perf.json --filter tstart tend
+		$(PERF_CSV_PY)
+	$(PYTHON) $(PERF_CSV_PY) -o $@ -i $(LOGS_DIR)/hart_*_perf.json --filter tstart tend
 
 ########
 # Util #
diff --git a/target/snitch_cluster/sw/runtime/common/snitch_cluster_memory.c b/target/snitch_cluster/sw/runtime/common/snitch_cluster_memory.c
index f76f16508..48c08faa3 100644
--- a/target/snitch_cluster/sw/runtime/common/snitch_cluster_memory.c
+++ b/target/snitch_cluster/sw/runtime/common/snitch_cluster_memory.c
@@ -11,3 +11,5 @@ extern volatile uint32_t* snrt_cluster_clint_set_ptr();
 extern volatile uint32_t* snrt_cluster_clint_clr_ptr();
 
 extern uint32_t snrt_cluster_hw_barrier_addr();
+
+extern volatile uint32_t* snrt_zero_memory_ptr();

From 84fc3d6f5df43096e46883b6a2b127873326be92 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Mon, 28 Aug 2023 11:00:19 +0200
Subject: [PATCH 05/13] util/sim: Extend simulation utilities for Occamy

---
 .github/workflows/ci.yml                      |   6 +-
 .gitlab-ci.yml                                |  12 +-
 sw/blas/axpy/verify.py                        |   6 +-
 sw/blas/gemm/Makefile                         |  10 +-
 .../{interrupt-local.c => interrupt_local.c}  |   0
 target/snitch_cluster/sw/apps/run.py          |  21 ---
 target/snitch_cluster/sw/apps/run.yaml        |  17 --
 target/snitch_cluster/sw/run.yaml             |  85 ++++++++++
 target/snitch_cluster/sw/tests/Makefile       |   2 +-
 target/snitch_cluster/sw/tests/run.py         |  21 ---
 target/snitch_cluster/sw/tests/run.yaml       |  71 ---------
 util/sim/elf.py                               |   7 +-
 util/sim/list_apps.py                         |  13 +-
 util/sim/{sim_utils.py => simulate.py}        | 145 ++++++++++--------
 util/sim/verification.py                      |   9 +-
 15 files changed, 209 insertions(+), 216 deletions(-)
 rename sw/tests/{interrupt-local.c => interrupt_local.c} (100%)
 delete mode 100755 target/snitch_cluster/sw/apps/run.py
 delete mode 100644 target/snitch_cluster/sw/apps/run.yaml
 create mode 100644 target/snitch_cluster/sw/run.yaml
 delete mode 100755 target/snitch_cluster/sw/tests/run.py
 delete mode 100644 target/snitch_cluster/sw/tests/run.yaml
 rename util/sim/{sim_utils.py => simulate.py} (53%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 71fdc8eaa..84faeac81 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -43,8 +43,7 @@ jobs:
       - name: Run Tests
         working-directory: target/snitch_cluster
         run: |-
-          ./sw/tests/run.py sw/tests/run.yaml --simulator verilator
-          ./sw/apps/run.py sw/apps/run.yaml --simulator verilator
+          ../../util/sim/simulate.py sw/run.yaml --simulator verilator
 
   ############################################
   # Build SW on Snitch Cluster w/ Banshee #
@@ -67,5 +66,4 @@ jobs:
           SNITCH_LOG: info
         working-directory: target/snitch_cluster
         run: |-
-          ./sw/tests/run.py sw/tests/run.yaml --simulator banshee
-          ./sw/apps/run.py sw/apps/run.yaml --simulator banshee
+          ../../util/sim/simulate.py sw/run.yaml --simulator banshee
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 811c82856..18adcf22e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -93,8 +93,7 @@ snitch-cluster-vlt:
   script:
     - cd target/snitch_cluster
     - $VERILATOR make bin/snitch_cluster.vlt
-    - $VERILATOR ./sw/tests/run.py sw/tests/run.yaml --simulator verilator
-    - $VERILATOR ./sw/apps/run.py sw/apps/run.yaml --simulator verilator
+    - $VERILATOR ../../util/sim/simulate.py sw/run.yaml --simulator verilator
   # yamllint enable rule:line-length
 
 # VCS
@@ -103,8 +102,7 @@ snitch-cluster-vcs:
   script:
     - cd target/snitch_cluster
     - $VCS make bin/snitch_cluster.vcs
-    - $VCS ./sw/tests/run.py sw/tests/run.yaml --simulator vcs
-    - $VCS ./sw/apps/run.py sw/apps/run.yaml --simulator vcs
+    - $VCS ../../util/sim/simulate.py sw/run.yaml --simulator vcs
 
 # Questa
 snitch-cluster-vsim:
@@ -112,8 +110,7 @@ snitch-cluster-vsim:
   script:
     - cd target/snitch_cluster
     - $QUESTA make bin/snitch_cluster.vsim
-    - $QUESTA ./sw/tests/run.py sw/tests/run.yaml --simulator vsim
-    - $QUESTA ./sw/apps/run.py sw/apps/run.yaml --simulator vsim
+    - $QUESTA ../../util/sim/simulate.py sw/run.yaml --simulator vsim
 
 # Banshee
 snitch-cluster-banshee:
@@ -129,5 +126,4 @@ snitch-cluster-banshee:
     - cd banshee
     - cargo install --debug --path .
     - cd ../target/snitch_cluster
-    - ./sw/tests/run.py sw/tests/run.yaml --simulator banshee
-    - ./sw/apps/run.py sw/apps/run.yaml --simulator banshee
+    - ../../util/sim/simulate.py sw/run.yaml --simulator banshee
diff --git a/sw/blas/axpy/verify.py b/sw/blas/axpy/verify.py
index 5838c68df..02cb15975 100755
--- a/sw/blas/axpy/verify.py
+++ b/sw/blas/axpy/verify.py
@@ -24,12 +24,16 @@ def main():
     args = verification.parse_args()
     raw_results = verification.simulate(sim_bin=args.sim_bin,
                                         snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
                                         log=args.log,
                                         output_uids=['z'])
     z_actual = np.array(bytes_to_doubles(raw_results['z']))
 
     # Extract input operands from ELF file
-    elf = Elf(args.snitch_bin)
+    if args.symbols_bin:
+        elf = Elf(args.symbols_bin)
+    else:
+        elf = Elf(args.snitch_bin)
     a = np.array(bytes_to_doubles(elf.get_symbol_contents('a')))
     x = np.array(bytes_to_doubles(elf.get_symbol_contents('x')))
     y = np.array(bytes_to_doubles(elf.get_symbol_contents('y')))
diff --git a/sw/blas/gemm/Makefile b/sw/blas/gemm/Makefile
index c390c3667..604556ed1 100644
--- a/sw/blas/gemm/Makefile
+++ b/sw/blas/gemm/Makefile
@@ -9,18 +9,20 @@ MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
 DATA_DIR := $(realpath $(MK_DIR)/data)
 SRC_DIR  := $(realpath $(MK_DIR)/src)
 
-DATA_CFG ?= $(DATA_DIR)/params.hjson
-
 APP     ?= gemm
 SRCS    ?= $(realpath $(SRC_DIR)/main.c)
 INCDIRS ?= $(DATA_DIR) $(SRC_DIR)
 
-$(DATA_DIR)/data.h: $(DATA_DIR)/datagen.py $(DATA_CFG)
+DATA_CFG  ?= $(DATA_DIR)/params.hjson
+DATAGEN_PY = $(DATA_DIR)/datagen.py
+DATA_H     = $(DATA_DIR)/data.h
+
+$(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
 	$< -c $(DATA_CFG) > $@
 
 .PHONY: clean-data clean
 
 clean-data:
-	rm -f $(DATA_DIR)/data.h
+	rm -f $(DATA_H)
 
 clean: clean-data
diff --git a/sw/tests/interrupt-local.c b/sw/tests/interrupt_local.c
similarity index 100%
rename from sw/tests/interrupt-local.c
rename to sw/tests/interrupt_local.c
diff --git a/target/snitch_cluster/sw/apps/run.py b/target/snitch_cluster/sw/apps/run.py
deleted file mode 100755
index 86b9422eb..000000000
--- a/target/snitch_cluster/sw/apps/run.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright 2023 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Luca Colagrande <colluca@iis.ee.ethz.ch>
-
-import sys
-from pathlib import Path
-
-sys.path.append(str(Path(__file__).parent / '../../../../util/sim'))
-import sim_utils # noqa: E402,E261
-
-
-def main():
-    sim_utils.main(lambda test: Path(__file__).parent / f'{test}/build/{Path(test).name}.elf')
-
-
-if __name__ == '__main__':
-    sys.exit(main())
diff --git a/target/snitch_cluster/sw/apps/run.yaml b/target/snitch_cluster/sw/apps/run.yaml
deleted file mode 100644
index 93bd32d8b..000000000
--- a/target/snitch_cluster/sw/apps/run.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright 2023 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-
-runs:
-  - app: blas/axpy
-    cmd: ../../sw/blas/axpy/verify.py {sim_bin} {elf}
-  - app: blas/gemm
-  - app: dnn/batchnorm
-  - app: dnn/linear
-  - app: dnn/maxpool
-  - app: dnn/gemm
-# dnn/gelu # seems like it stalls
-# dnn/conv2d # fails with exit code 32
-# dnn/fusedconv # fails newly
-# dnn/layernorm # throws illegal instruction on FDIV in simulation
-# dnn/softmax # throws illegal instruction on FDIV in simulation
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
new file mode 100644
index 000000000..4c7ff7b1e
--- /dev/null
+++ b/target/snitch_cluster/sw/run.yaml
@@ -0,0 +1,85 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+runs:
+  - elf: tests/build/atomics.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x4
+  - elf: tests/build/barrier.elf
+  - elf: tests/build/data_mover.elf
+  - elf: tests/build/dma_simple.elf
+  - elf: tests/build/event_unit.elf
+  - elf: tests/build/fence_i.elf
+  - elf: tests/build/fp8_comparison_scalar.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with segfault
+  - elf: tests/build/fp8_comparison_vector.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with segfault
+  - elf: tests/build/fp8_computation_scalar.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with JIT issue
+  - elf: tests/build/fp8_computation_vector.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x6
+  - elf: tests/build/fp8alt_comparison_scalar.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with segfault
+  - elf: tests/build/fp8alt_comparison_vector.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10
+  - elf: tests/build/fp8alt_computation_scalar.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with JIT issue
+  - elf: tests/build/fp8alt_computation_vector.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x12
+  - elf: tests/build/fp16_comparison_scalar.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10
+  - elf: tests/build/fp16_comparison_vector.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10
+  - elf: tests/build/fp16_computation_scalar.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with JIT issue
+  - elf: tests/build/fp16_computation_vector.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x6
+  - elf: tests/build/fp16alt_comparison_scalar.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10
+  - elf: tests/build/fp16alt_comparison_vector.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10
+  - elf: tests/build/fp16alt_computation_scalar.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with JIT issue
+  - elf: tests/build/fp16alt_computation_vector.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x16
+  - elf: tests/build/fp32_comparison_scalar.elf
+  - elf: tests/build/fp32_comparison_vector.elf
+  - elf: tests/build/fp32_computation_scalar.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2
+  - elf: tests/build/fp32_computation_vector.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2
+  - elf: tests/build/fp32_conversions_scalar.elf
+    simulators: [vsim, vcs, verilator] # banshee fails with illegal instruction
+  - elf: tests/build/fp64_conversions_scalar.elf
+    simulators: [vsim, vcs, verilator]
+  # - elf: tests/build/interrupt.elf
+  - elf: tests/build/interrupt_local.elf
+  - elf: tests/build/multi_cluster.elf
+  - elf: tests/build/openmp_parallel.elf
+  - elf: tests/build/openmp_for_static_schedule.elf
+  - elf: tests/build/openmp_double_buffering.elf
+  - elf: tests/build/perf_cnt.elf
+  - elf: tests/build/printf_simple.elf
+  - elf: tests/build/printf_fmtint.elf
+  - elf: tests/build/simple.elf
+  - elf: tests/build/team_global.elf
+  - elf: tests/build/tls.elf
+  - elf: tests/build/varargs_1.elf
+  - elf: tests/build/varargs_2.elf
+  - elf: tests/build/zero_mem.elf
+  - elf: tests/build/non_null_exitcode.elf
+    exit_code: 14
+  - elf: apps/blas/axpy/build/axpy.elf
+    cmd: ../../sw/blas/axpy/verify.py {sim_bin} {elf}
+  - elf: apps/blas/gemm/build/gemm.elf
+  - elf: apps/dnn/batchnorm/build/batchnorm.elf
+  - elf: apps/dnn/linear/build/linear.elf
+  - elf: apps/dnn/maxpool/build/maxpool.elf
+  - elf: apps/dnn/gemm/build/gemm.elf
+  # - elf: apps/dnn/gelu/build/gelu.elf # seems like it stalls
+  # - elf: apps/dnn/conv2d/build/conv2d.elf # fails with exit code 32
+  # - elf: apps/dnn/fusedconv/build/fusedconv.elf # fails newly
+  # - elf: apps/dnn/layernorm/build/layernorm.elf
+  #   throws illegal instruction on FDIV in simulation
+  # - elf: apps/dnn/softmax/build/softmax.elf
+  #   throws illegal instruction on FDIV in simulation
diff --git a/target/snitch_cluster/sw/tests/Makefile b/target/snitch_cluster/sw/tests/Makefile
index c3b204b55..57b26d9a0 100644
--- a/target/snitch_cluster/sw/tests/Makefile
+++ b/target/snitch_cluster/sw/tests/Makefile
@@ -44,7 +44,7 @@ RISCV_LDFLAGS += -lsnRuntime
 # Outputs #
 ###########
 
-APPS        = $(shell $(MK_DIR)/../../../../util/sim/list_apps.py run.yaml)
+APPS        = $(shell $(MK_DIR)/../../../../util/sim/list_apps.py --in-dir tests/ ../run.yaml)
 ELFS        = $(abspath $(addprefix $(BUILDDIR)/,$(addsuffix .elf,$(APPS))))
 DEPS        = $(abspath $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(APPS))))
 DUMPS       = $(abspath $(addprefix $(BUILDDIR)/,$(addsuffix .dump,$(APPS))))
diff --git a/target/snitch_cluster/sw/tests/run.py b/target/snitch_cluster/sw/tests/run.py
deleted file mode 100755
index 3fe6a6e51..000000000
--- a/target/snitch_cluster/sw/tests/run.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright 2023 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Luca Colagrande <colluca@iis.ee.ethz.ch>
-
-import sys
-from pathlib import Path
-
-sys.path.append(str(Path(__file__).parent / '../../../../util/sim'))
-import sim_utils # noqa: E402,E261
-
-
-def main():
-    sim_utils.main(lambda test: Path(__file__).parent / f'build/{test}.elf')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/target/snitch_cluster/sw/tests/run.yaml b/target/snitch_cluster/sw/tests/run.yaml
deleted file mode 100644
index 2f86d1a70..000000000
--- a/target/snitch_cluster/sw/tests/run.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright 2023 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-
-runs:
-  - app: atomics
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x4
-  - app: barrier
-  - app: data_mover
-  - app: dma_simple
-  - app: event_unit
-  - app: fence_i
-  - app: fp8_comparison_scalar
-    simulators: [vsim, vcs, verilator] # banshee fails with segfault
-  - app: fp8_comparison_vector
-    simulators: [vsim, vcs, verilator] # banshee fails with segfault
-  - app: fp8_computation_scalar
-    simulators: [vsim, vcs, verilator] # banshee fails with JIT issue
-  - app: fp8_computation_vector
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x6
-  - app: fp8alt_comparison_scalar
-    simulators: [vsim, vcs, verilator] # banshee fails with segfault
-  - app: fp8alt_comparison_vector
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10
-  - app: fp8alt_computation_scalar
-    simulators: [vsim, vcs, verilator] # banshee fails with JIT issue
-  - app: fp8alt_computation_vector
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x12
-  - app: fp16_comparison_scalar
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10
-  - app: fp16_comparison_vector
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10
-  - app: fp16_computation_scalar
-    simulators: [vsim, vcs, verilator] # banshee fails with JIT issue
-  - app: fp16_computation_vector
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x6
-  - app: fp16alt_comparison_scalar
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10
-  - app: fp16alt_comparison_vector
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10
-  - app: fp16alt_computation_scalar
-    simulators: [vsim, vcs, verilator] # banshee fails with JIT issue
-  - app: fp16alt_computation_vector
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x16
-  - app: fp32_comparison_scalar
-  - app: fp32_comparison_vector
-  - app: fp32_computation_scalar
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2
-  - app: fp32_computation_vector
-    simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2
-  - app: fp32_conversions_scalar
-    simulators: [vsim, vcs, verilator] # banshee fails with illegal instruction
-  - app: fp64_conversions_scalar
-    simulators: [vsim, vcs, verilator]
-  # - app: interrupt
-  - app: interrupt-local
-  - app: multi_cluster
-  - app: openmp_parallel
-  - app: openmp_for_static_schedule
-  - app: openmp_double_buffering
-  - app: perf_cnt
-  - app: printf_simple
-  - app: printf_fmtint
-  - app: simple
-  - app: team_global
-  - app: tls
-  - app: varargs_1
-  - app: varargs_2
-  - app: zero_mem
-  - app: non_null_exitcode
-    exit_code: 14
diff --git a/util/sim/elf.py b/util/sim/elf.py
index db1721160..a46a6764d 100644
--- a/util/sim/elf.py
+++ b/util/sim/elf.py
@@ -1,7 +1,6 @@
-#!/usr/bin/env python3
-# Copyright 2020 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
 #
 # Luca Colagrande <colluca@iis.ee.ethz.ch>
 #
diff --git a/util/sim/list_apps.py b/util/sim/list_apps.py
index 608f8e335..baefdf7eb 100755
--- a/util/sim/list_apps.py
+++ b/util/sim/list_apps.py
@@ -7,22 +7,29 @@
 
 import argparse
 import yaml
+from pathlib import Path
 
 
 def main():
     # Argument parsing
     parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--in-dir',
+        type=Path,
+        help='Only apps below this directory (at any level) will be listed')
     parser.add_argument(
         'input',
-        help='The YAML file containing run information',
-    )
+        help='The YAML file containing run information')
     args = parser.parse_args()
 
     with open(args.input, 'r') as file:
         tests = yaml.safe_load(file)['runs']
 
     for test in tests:
-        print(test['app'])
+        elf = Path(test['elf'])
+        match_parts = args.in_dir.parts if args.in_dir else ()  # no filter
+        if elf.parts[:len(match_parts)] == match_parts:
+            print(elf.stem)
 
 
 if __name__ == '__main__':
diff --git a/util/sim/sim_utils.py b/util/sim/simulate.py
similarity index 53%
rename from util/sim/sim_utils.py
rename to util/sim/simulate.py
index bdd01615a..a1466dc16 100755
--- a/util/sim/sim_utils.py
+++ b/util/sim/simulate.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright 2023 ETH Zurich and University of Bologna.
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
@@ -18,7 +19,7 @@
 BANSHEE_CFG = 'src/banshee.yaml'
 
 # Tool settings
-SIMULATORS = ['vsim', 'banshee', 'verilator', 'vcs']
+SIMULATORS = ['vsim', 'banshee', 'verilator', 'vcs', 'other']
 DEFAULT_SIMULATOR = SIMULATORS[0]
 SIMULATOR_BINS = {
     'vsim': 'bin/snitch_cluster.vsim',
@@ -48,6 +49,11 @@ def parse_args():
         default=DEFAULT_SIMULATOR,
         choices=SIMULATORS,
         help='Choose a simulator to run the test with')
+    parser.add_argument(
+        '--sim-bin',
+        action='store',
+        nargs='?',
+        help='Override default path to simulator binary')
     parser.add_argument(
         '--dry-run',
         action='store_true',
@@ -75,73 +81,95 @@ def check_exit_code(test, exit_code):
         return exit_code
 
 
-def run_test(test, format_elf_path, simulator, dry_run=False):
-    # Get test parameters
-    app = test['app']
+def run_simulation(cmd, simulator, test):
+    # Defaults
+    result = 1
+
+    # Spawn simulation subprocess
+    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
+
+    # Log the subprocess output until EOF: stopping as soon as the process
+    # has exited could drop its last lines (e.g. the exit code report)
+    for line in p.stdout:
+        print(line, end='', flush=True)
+
+        # When simulating with vsim or vcs, we need to parse the simulation
+        # log to catch the application's return code
+        if simulator in ['vsim', 'vcs']:
+            # Capture success
+            regex_success = r'\[SUCCESS\] Program finished successfully'
+            match_success = re.search(regex_success, line)
+            if match_success:
+                result = 0
+            else:
+                regex_fail = r'\[FAILURE\] Finished with exit code\s+(\d+)'
+                match = re.search(regex_fail, line)
+                if match:
+                    exit_code = match.group(1)
+                    result = check_exit_code(test, exit_code)
+
+    # Check if the subprocess terminated correctly
+    exit_code = p.wait()
+    # In Banshee and Verilator the exit code of the Snitch binary is returned
+    # through the exit code of the simulation command
+    if simulator in ['banshee', 'verilator']:
+        result = check_exit_code(test, exit_code)
+    # For custom commands the return code is that of the command
+    elif simulator == 'other':
+        result = exit_code
+    # For standard simulation commands the simulated Snitch binary exit
+    # code is overridden only if the simulator failed
+    else:
+        if exit_code != 0:
+            result = exit_code
+
+    return result
+
+
+def run_test(test, args):
+    # Extract args
+    simulator = args.simulator
+    sim_bin = args.sim_bin if args.sim_bin else SIMULATOR_BINS[simulator]
+    dry_run = args.dry_run
+    testlist = args.testlist
+
+    # Check if simulator is supported for this test
     if 'simulators' in test:
         if simulator not in test['simulators']:
             return 0
 
     # Construct path to executable
-    elf = format_elf_path(app)
+    elf = Path(test['elf'])
+    if testlist:
+        elf = Path(testlist).absolute().parent / elf
     cprint(f'Run test {colored(elf, "cyan")}', attrs=["bold"])
 
     # Construct simulation command (override only supported for RTL)
     if 'cmd' in test and simulator != 'banshee':
         cmd = test['cmd']
+        cmd = cmd.format(sim_bin=sim_bin, elf=elf, simulator=simulator)
+        simulator = 'other'
     else:
         cmd = SIMULATOR_CMDS[simulator]
-    cmd = cmd.format(sim_bin=SIMULATOR_BINS[simulator], elf=elf)
+        cmd = cmd.format(sim_bin=sim_bin, elf=elf)
     print(f'$ {cmd}', flush=True)
 
-    # Run test
+    # Run simulation
     result = 0
     if not dry_run:
-        result = 1
-
-        # When simulating with vsim or vcs, we need to parse the simulation
-        # log to catch the application's return code
-        if simulator in ['vsim', 'vcs']:
-            p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
-                                 text=True)
-
-            while p.poll() is None:
-                line = p.stdout.readline()
-                print(line, end='', flush=True)
-
-                # Capture success
-                regex_success = r'\[SUCCESS\] Program finished successfully'
-                match_success = re.search(regex_success, line)
-                if match_success:
-                    result = 0
-                else:
-                    regex_fail = r'\[FAILURE\] Finished with exit code\s+(\d+)'
-                    match = re.search(regex_fail, line)
-                    if match:
-                        exit_code = match.group(1)
-                        result = check_exit_code(test, exit_code)
-
-            # Check if the subprocess terminated correctly
-            if p.poll() != 0:
-                result = p.poll()
-
-        else:
-            p = subprocess.Popen(cmd, shell=True)
-            p.wait()
-            exit_code = p.returncode
-            result = check_exit_code(test, exit_code)
+        result = run_simulation(cmd, simulator, test)
 
-        # Report failure or success
-        if result != 0:
-            cprint(f'{app} test failed', 'red', attrs=['bold'], flush=True)
-        else:
-            cprint(f'{app} test passed', 'green', attrs=['bold'], flush=True)
+    # Report failure or success
+    if result != 0:
+        cprint(f'{elf} test failed', 'red', attrs=['bold'], flush=True)
+    else:
+        cprint(f'{elf} test passed', 'green', attrs=['bold'], flush=True)
 
     return result
 
 
 def print_failed_test(test):
-    print(f'{colored(test["app"], "cyan")} test {colored("failed", "red")}')
+    print(f'{colored(test["elf"], "cyan")} test {colored("failed", "red")}')
 
 
 def print_test_summary(failed_tests, dry_run=False):
@@ -157,28 +185,25 @@ def print_test_summary(failed_tests, dry_run=False):
     return 0
 
 
-def run_tests(testlist, format_elf_path, simulator, dry_run=False, early_exit=False):
+def run_tests(args):
     # Iterate tests
-    tests = get_tests(testlist)
+    tests = get_tests(args.testlist)
     failed_tests = []
     for test in tests:
         # Run test
-        result = run_test(test, format_elf_path, simulator, dry_run)
+        result = run_test(test, args)
         if result != 0:
             failed_tests.append(test)
             # End program if requested on first test failure
-            if early_exit:
+            if args.early_exit:
                 break
+    return print_test_summary(failed_tests, args.dry_run)
 
-    return print_test_summary(failed_tests, dry_run)
 
-
-# format_elf_path: function which constructs the path to an ELF binary
-#                  from a test name as listed in the test list file
-def main(format_elf_path):
+def main():
     args = parse_args()
-    sys.exit(run_tests(args.testlist,
-                       format_elf_path,
-                       args.simulator,
-                       args.dry_run,
-                       args.early_exit))
+    sys.exit(run_tests(args))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/util/sim/verification.py b/util/sim/verification.py
index ed3f0f670..9878ef62a 100644
--- a/util/sim/verification.py
+++ b/util/sim/verification.py
@@ -24,13 +24,18 @@ def parse_args():
     parser.add_argument(
         'snitch_bin',
         help='The Snitch binary to be executed by the simulated Snitch hardware')
+    parser.add_argument(
+        '--symbols-bin',
+        help='An optional binary containing the I/O symbols. By default, '
+             'these are searched for in snitch_bin. This argument serves as an '
+             'alternative.')
     parser.add_argument(
         '--log',
         help='Redirect simulation output to this log file')
     return parser.parse_args()
 
 
-def simulate(sim_bin, snitch_bin, log, output_uids):
+def simulate(sim_bin, snitch_bin, log, output_uids, symbols_bin=None):
     # Open ELF file for processing
     elf = Elf(snitch_bin)
 
@@ -43,6 +48,8 @@ def simulate(sim_bin, snitch_bin, log, output_uids):
     sim.poll(tohost, 1, 0)
 
     # Read out results from memory
+    if symbols_bin:
+        elf = Elf(symbols_bin)
     raw_outputs = {}
     for uid in output_uids:
         address = elf.get_symbol_address(uid)

From 7d4d2316203eac32a88fc5cf9ffee5099dff1c21 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 25 Aug 2023 13:20:44 +0200
Subject: [PATCH 06/13] sw: Update AXPY and GEMM for (trivial) multi-cluster

---
 sw/blas/axpy/src/main.c           |  31 +++--
 sw/blas/gemm/data/datagen.py      |  80 +++++-------
 sw/blas/gemm/data/params.hjson    |   4 +-
 sw/blas/gemm/src/gemm.h           |  61 ++++++++++
 sw/blas/gemm/src/main.c           | 194 ++++++++++++------------------
 sw/blas/gemm/verify.py            |  61 ++++++++++
 sw/dnn/src/dnn.h                  |   5 +
 target/snitch_cluster/sw/run.yaml |   1 +
 util/sim/data_utils.py            |  27 ++++-
 9 files changed, 285 insertions(+), 179 deletions(-)
 create mode 100755 sw/blas/gemm/verify.py

diff --git a/sw/blas/axpy/src/main.c b/sw/blas/axpy/src/main.c
index 7fe9d3f83..1b379c811 100644
--- a/sw/blas/axpy/src/main.c
+++ b/sw/blas/axpy/src/main.c
@@ -10,17 +10,26 @@
 
 int main() {
     double *local_x, *local_y, *local_z;
+    double *remote_x, *remote_y, *remote_z;
+
+    // Calculate size and pointers for each cluster
+    uint32_t frac = l / snrt_cluster_num();
+    uint32_t offset = frac * snrt_cluster_idx();
+    remote_x = x + offset;
+    remote_y = y + offset;
+    remote_z = z + offset;
 
     // Allocate space in TCDM
     local_x = (double *)snrt_l1_next();
-    local_y = local_x + l;
-    local_z = local_y + l;
+    local_y = local_x + frac;
+    local_z = local_y + frac;
 
     // Copy data in TCDM
     if (snrt_is_dm_core()) {
-        size_t size = l * sizeof(double);
-        snrt_dma_start_1d(local_x, x, size);
-        snrt_dma_start_1d(local_y, y, size);
+        size_t size = frac * sizeof(double);
+        snrt_dma_start_1d(local_x, remote_x, size);
+        snrt_dma_start_1d(local_y, remote_y, size);
+        snrt_dma_wait_all();
     }
 
     snrt_cluster_hw_barrier();
@@ -28,7 +37,7 @@ int main() {
     // Compute
     if (!snrt_is_dm_core()) {
         uint32_t start_cycle = snrt_mcycle();
-        axpy(l, a, local_x, local_y, local_z);
+        axpy(frac, a, local_x, local_y, local_z);
         uint32_t end_cycle = snrt_mcycle();
     }
 
@@ -36,10 +45,15 @@ int main() {
 
     // Copy data out of TCDM
     if (snrt_is_dm_core()) {
-        size_t size = l * sizeof(double);
-        snrt_dma_start_1d(z, local_z, size);
+        size_t size = frac * sizeof(double);
+        snrt_dma_start_1d(remote_z, local_z, size);
+        snrt_dma_wait_all();
     }
 
+    snrt_cluster_hw_barrier();
+
+// TODO: currently only works for single cluster otherwise need to
+//       synchronize all cores here
 #ifdef BIST
     uint32_t nerr = l;
 
@@ -47,6 +61,7 @@ int main() {
     if (snrt_global_core_idx() == 0) {
         for (int i = 0; i < l; i++) {
             if (local_z[i] == g[i]) nerr--;
+            printf("%f %f\n", local_z[i], g[i]);
         }
     }
 
diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py
index c6faa092f..b33eb4afc 100755
--- a/sw/blas/gemm/data/datagen.py
+++ b/sw/blas/gemm/data/datagen.py
@@ -10,6 +10,13 @@
 import argparse
 import pathlib
 import hjson
+import sys
+import os
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+from data_utils import emit_license, format_scalar_definition, \
+                       format_vector_definition, format_ifdef_wrapper  # noqa: E402
+
 
 np.random.seed(42)
 
@@ -33,41 +40,11 @@
 }
 
 
-def format_vector_definition(id, vector, typ):
-    s = f'{typ} {id}[{len(vector)}] = ' + '{\n'
-    for i, el in enumerate(vector):
-        if typ != 'char':
-            s += f'\t{el},'
-        else:
-            if type(el) == float:
-                print(el)
-            s += f'0x{el:02x},'
-        if i % 8 == 7:
-            s += '\n'
-    s += '};'
-    return s
-
-
-def format_vector_declaration(id, vector, typ):
-    s = f'{typ} {id}[{len(vector)}];'
-    return s
-
-
-def format_scalar_definition(id, scalar, typ):
-    s = f'{typ} {id} = {scalar};'
-    return s
-
-
-def emit_header_file(**kwargs):
-
-    emit_str = "// Copyright 2023 ETH Zurich and University of Bologna.\n" + \
-               "// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n" + \
-               "// SPDX-License-Identifier: Apache-2.0\n\n"
-    emit_str += emit_gemm_data(**kwargs)
-    return emit_str
+def golden_model(a, b, alpha, c):
+    return np.matmul(a, b) + alpha * c
 
 
-def emit_gemm_data(**kwargs):
+def emit_header(**kwargs):
 
     # Generate random input matrices
     dtype = NUMPY_TYPES[str(kwargs['prec'])]
@@ -104,30 +81,31 @@ def emit_gemm_data(**kwargs):
         a = np.random.rand(kwargs['M'], kwargs['K']).astype(dtype)
         b = np.random.rand(kwargs['K'], kwargs['N']).astype(dtype)
         c = np.random.rand(kwargs['M'], kwargs['N']).astype(dtype)
-        result = np.matmul(a, b) + kwargs['alpha'] * c
+        result = golden_model(a, b, kwargs['alpha'], c)
 
     # Store matrices in transposed form if requested
     a = a.T if kwargs['ta'] else a
     b = b.T if kwargs['tb'] else b
 
-    data_str = []
-    data_str += [format_scalar_definition('M', kwargs['M'], 'uint32_t')]
-    data_str += [format_scalar_definition('N', kwargs['N'], 'uint32_t')]
-    data_str += [format_scalar_definition('K', kwargs['K'], 'uint32_t')]
-    data_str += [format_scalar_definition('TA', int(kwargs['ta']), 'uint32_t')]
-    data_str += [format_scalar_definition('TB', int(kwargs['tb']), 'uint32_t')]
-    data_str += [format_scalar_definition('ALPHA', kwargs['alpha'], 'uint32_t')]
-    data_str += [format_scalar_definition('dtype_size', kwargs['prec']//8, 'uint32_t')]
-    data_str += [format_scalar_definition('expand', kwargs['expand'], 'uint32_t')]
-    data_str += [format_vector_definition('a', a.flatten(), C_TYPES[str(kwargs['prec'])])]
-    data_str += [format_vector_definition('b', b.flatten(), C_TYPES[str(kwargs['prec'])])]
-    data_str += [format_vector_definition('c', c.flatten(), C_TYPES[str(kwargs['prec'])])]
+    data_str = [emit_license()]
+    data_str += [format_scalar_definition('uint32_t', 'M', kwargs['M'])]
+    data_str += [format_scalar_definition('uint32_t', 'N', kwargs['N'])]
+    data_str += [format_scalar_definition('uint32_t', 'K', kwargs['K'])]
+    data_str += [format_scalar_definition('uint32_t', 'TA', int(kwargs['ta']))]
+    data_str += [format_scalar_definition('uint32_t', 'TB', int(kwargs['tb']))]
+    data_str += [format_scalar_definition('uint32_t', 'ALPHA', kwargs['alpha'])]
+    data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)]
+    data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])]
+    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten())]
+    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten())]
+    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten())]
     if kwargs['prec'] == 8:
-        data_str += [format_vector_definition('result', result.flatten(), C_TYPES['64'])]
+        result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten())
     else:
-        data_str += [format_vector_definition('result',
-                                              result.flatten(),
-                                              C_TYPES[str(kwargs['prec'])])]
+        result_def = format_vector_definition(C_TYPES[str(kwargs['prec'])],
+                                              'result',
+                                              result.flatten())
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
     data_str = '\n\n'.join(data_str)
 
     return data_str
@@ -149,7 +127,7 @@ def main():
         param = hjson.loads(f.read())
 
     # Emit header file
-    print(emit_header_file(**param))
+    print(emit_header(**param))
 
 
 if __name__ == '__main__':
diff --git a/sw/blas/gemm/data/params.hjson b/sw/blas/gemm/data/params.hjson
index e079a52e6..23a4100cf 100644
--- a/sw/blas/gemm/data/params.hjson
+++ b/sw/blas/gemm/data/params.hjson
@@ -5,12 +5,12 @@
 // Parameters for a GEMM
 
 {
-    M: 16,
+    M: 192,
     N: 16,
     K: 16,
     alpha: 0,
     ta: false,
     tb: true, // must be true for SIMD
-    prec: 32,
+    prec: 64,
     expand: 0
 }
diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h
index da967e698..86ec17ede 100644
--- a/sw/blas/gemm/src/gemm.h
+++ b/sw/blas/gemm/src/gemm.h
@@ -4,14 +4,22 @@
 //
 // Author: Tim Fischer <fischeti@iis.ee.ethz.ch>
 //         Luca Bertaccini <lbertaccini@iis.ee.ethz.ch>
+//         Luca Colagrande <colluca@iis.ee.ethz.ch>
 
 #include <stdint.h>
 
 #include "snrt.h"
 
+// Guard to avoid conflict with DNN header file
+// TODO: move this definition to Snitch math library to solve problem
+#ifndef PRECISION_T
+#define PRECISION_T
+typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t;
+
 typedef float v2f32 __attribute__((vector_size(8)));
 typedef __fp16 v4f16 __attribute__((vector_size(8)));
 typedef char v8f8 __attribute__((vector_size(8)));
+#endif
 
 void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A,
                         uint32_t ldA, uint32_t ta, double* B, uint32_t ldB,
@@ -874,3 +882,56 @@ void gemm_fp8_ex_opt(uint32_t M, uint32_t N, uint32_t K, char* A, uint32_t ldA,
 
     snrt_ssr_disable();
 }
+
+// BLAS compliant GEMM kernel, with some additional arguments at the beginning
+// to specify Snitch implementation details. Matrix sizes and pointers are for
+// the whole cluster computation
+// TODO: alpha (and beta) should be of floating-point type (same precision as
+// operands)
+void gemm(precision_t prec, uint32_t expand, uint32_t setup_ssr,
+          uint32_t transa, uint32_t transb, uint32_t m, uint32_t n, uint32_t k,
+          uint32_t alpha, void* a, uint32_t lda, void* b, uint32_t ldb,
+          double beta, void* c, uint32_t ldc) {
+    const uint32_t compute_num = snrt_cluster_compute_core_num();
+    const uint32_t compute_id = snrt_cluster_core_idx();
+
+    // Compute cores work not on contiguous blocks but on strided rows
+    uint32_t lda_strided = compute_num * lda;
+    uint32_t ldc_strided = compute_num * ldc;
+
+    // Compute cores access A and C at offsets of one row from each other
+    uint32_t offsetA = compute_id * lda;
+    uint32_t offsetC = compute_id * ldc;
+
+    // Compute fraction of C rows every core computes
+    uint32_t frac_m = m / compute_num;
+
+    switch (prec) {
+        case FP64:
+            gemm_fp64_opt(frac_m, n, k, (double*)a + offsetA, lda_strided,
+                          transa, (double*)b, ldb, transb, (double*)c + offsetC,
+                          ldc_strided, &alpha, setup_ssr);
+            break;
+        case FP32:
+            gemm_fp32_opt(frac_m, n, k, (float*)a + offsetA, lda_strided,
+                          (float*)b, ldb, (float*)c + offsetC, ldc_strided,
+                          &alpha, setup_ssr);
+            break;
+        case FP16:
+            if (expand) {
+                gemm_fp16_ex_opt(
+                    frac_m, n, k, (__fp16*)a + offsetA, lda_strided, (__fp16*)b,
+                    ldb, (__fp16*)c + offsetC, ldc_strided, &alpha, setup_ssr);
+            } else {
+                gemm_fp16_opt(frac_m, n, k, (__fp16*)a + offsetA, lda_strided,
+                              (__fp16*)b, ldb, (__fp16*)c + offsetC,
+                              ldc_strided, &alpha, setup_ssr);
+            }
+            break;
+        case FP8:
+            gemm_fp8_ex_opt(frac_m, n, k, (char*)a + offsetA, lda_strided,
+                            (char*)b, ldb, (char*)c + offsetC, ldc_strided,
+                            &alpha, setup_ssr);
+            break;
+    }
+}
diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c
index bde009c95..3da55c2ab 100644
--- a/sw/blas/gemm/src/main.c
+++ b/sw/blas/gemm/src/main.c
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 // Author: Tim Fischer <fischeti@iis.ee.ethz.ch>
+//         Luca Colagrande <colluca@iis.ee.ethz.ch>
 
 #include <math.h>
 #include <stdint.h>
@@ -11,40 +12,33 @@
 #include "gemm.h"
 #include "snrt.h"
 
-// Padding of innermost dimension of a Matrix
-// Useful for preventing banking conflicts between cores
-// that are accessing different rows of the matrix
-#define MAT_ROW_PADDING 0
-
-// Padding in between matrices A, B for preventing
-// banking conflicts in the beginning
-#define MAT_PADDING 0
-
-#define CHECK_RESULT
-
-typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t;
-
-void *l1_a, *l1_b, *l1_c;
-
 int main() {
-    const uint32_t compute_num = snrt_cluster_compute_core_num();
-    const uint32_t compute_id = snrt_cluster_core_idx();
-
-    uint32_t a_size = (M * (K + MAT_ROW_PADDING) + MAT_PADDING) * dtype_size;
-    uint32_t b_size = (K + MAT_ROW_PADDING) * N * dtype_size;
-    uint32_t c_size = M * N * dtype_size;
-
+    void *local_a, *local_b, *local_c;
+    void *remote_a, *remote_b, *remote_c;
+
+    // Calculate size and pointers for each cluster
+    uint32_t frac_m = M / snrt_cluster_num();
+    uint32_t frac_a = frac_m * K;
+    uint32_t frac_c = frac_m * N;
+    uint32_t size_frac_a = frac_a * dtype_size;
+    uint32_t size_b = K * N * dtype_size;
+    uint32_t size_frac_c = frac_c * dtype_size;
+    uint32_t offset_a = frac_a * snrt_cluster_idx();
+    uint32_t offset_c = frac_c * snrt_cluster_idx();
+    remote_a = a + offset_a;
+    remote_b = b;
+    remote_c = c + offset_c;
+
+    // Allocate space in TCDM
+    local_a = (void *)snrt_l1_next();
+    local_b = local_a + size_frac_a;
+    local_c = local_b + size_b;
+
+    // Copy data in TCDM
     if (snrt_is_dm_core()) {
-        l1_a = snrt_l1alloc(a_size);
-        l1_b = snrt_l1alloc(b_size);
-        l1_c = snrt_l1alloc(c_size);
-        snrt_dma_start_2d(l1_a, a, dtype_size * K,
-                          dtype_size * (K + MAT_ROW_PADDING), dtype_size * K,
-                          M);
-        snrt_dma_start_2d(l1_b, b, dtype_size * K,
-                          dtype_size * (K + MAT_ROW_PADDING), dtype_size * K,
-                          N);
-        snrt_dma_start_1d(l1_c, c, dtype_size * M * N);
+        snrt_dma_start_1d(local_a, remote_a, size_frac_a);
+        snrt_dma_start_1d(local_b, remote_b, size_b);
+        snrt_dma_start_1d(local_c, remote_c, size_frac_c);
         snrt_dma_wait_all();
     }
 
@@ -52,104 +46,70 @@ int main() {
 
     // Compute
     if (!snrt_is_dm_core()) {
-        const uint32_t setup_SSR = 1;
+        const uint32_t setup_ssr = 1;
         uint32_t start_cycle = snrt_mcycle();
 
-        if (!TA && !TB) {
-            volatile uint32_t A_offset =
-                compute_id * (K + MAT_ROW_PADDING) * dtype_size;
-            volatile uint32_t C_offset = compute_id * N * dtype_size;
-            volatile uint32_t ldA = compute_num * (K + MAT_ROW_PADDING);
-            volatile uint32_t ldB = N + MAT_ROW_PADDING;
-            volatile uint32_t ldC = N * compute_num;
-
-            gemm_fp64_opt(M / compute_num, N, K, &l1_a[A_offset], ldA, TA, l1_b,
-                          ldB, TB, &l1_c[C_offset], ldC, &ALPHA, setup_SSR);
-        } else if (!TA && TB) {
-            volatile uint32_t A_offset =
-                compute_id * (K + MAT_ROW_PADDING) * dtype_size;
-            volatile uint32_t C_offset = compute_id * N * dtype_size;
-            volatile uint32_t ldA = compute_num * (K + MAT_ROW_PADDING);
-            volatile uint32_t ldB = K + MAT_ROW_PADDING;
-            volatile uint32_t ldC = N * compute_num;
-
-            switch (dtype_size) {
-                case FP64:
-                    gemm_fp64_opt(M / compute_num, N, K, &l1_a[A_offset], ldA,
-                                  TA, l1_b, ldB, TB, &l1_c[C_offset], ldC,
-                                  &ALPHA, setup_SSR);
-                    break;
-                case FP32:
-                    gemm_fp32_opt(M / compute_num, N, K, &l1_a[A_offset], ldA,
-                                  l1_b, ldB, &l1_c[C_offset], ldC, &ALPHA,
-                                  setup_SSR);
-                    break;
-                case FP16:
-                    if (expand) {
-                        gemm_fp16_ex_opt(M / compute_num, N, K, &l1_a[A_offset],
-                                         ldA, l1_b, ldB, &l1_c[C_offset], ldC,
-                                         &ALPHA, setup_SSR);
-                    } else {
-                        gemm_fp16_opt(M / compute_num, N, K, &l1_a[A_offset],
-                                      ldA, l1_b, ldB, &l1_c[C_offset], ldC,
-                                      &ALPHA, setup_SSR);
-                    }
-                    break;
-                case FP8:
-                    gemm_fp8_ex_opt(M / compute_num, N, K, &l1_a[A_offset], ldA,
-                                    l1_b, ldB, &l1_c[C_offset], ldC, &ALPHA,
-                                    setup_SSR);
-                    break;
-            }
-        } else if (TA) {
-            printf("transpose TA not supported\n");
+        volatile uint32_t lda = K;
+        volatile uint32_t ldb = N;
+        volatile uint32_t ldc = N;
+
+        // Transpose of A unsupported
+        if (TA) return -1;
+        if (TB) {
+            // Transpose of B supported only in FP64
+            if (dtype_size != FP64) return -1;
+            ldb = K;
         }
+
+        gemm(dtype_size, expand, setup_ssr, TA, TB, frac_m, N, K, ALPHA,
+             local_a, lda, local_b, ldb, 1, local_c, ldc);
+
         uint32_t end_cycle = snrt_mcycle();
     }
 
     snrt_cluster_hw_barrier();
 
-#ifdef CHECK_RESULT
-
-    uint32_t errors = 0;
-    if (compute_id == 0) {
-        switch (dtype_size) {
-            case FP64:
-                for (uint32_t m = 0; m < M; m++) {
-                    for (uint32_t n = 0; n < N; n++) {
-                        uint32_t idx = m * N + n;
-                        if (fabs(result[idx] - ((double *)l1_c)[idx]) > 0.001)
-                            errors++;
-                    }
-                }
-                break;
-            case FP32:
-                for (uint32_t m = 0; m < M; m++) {
-                    for (uint32_t n = 0; n < N; n++) {
-                        uint32_t idx = m * N + n;
-                        if (fabs(result[idx] - ((float *)l1_c)[idx]) > 0.001)
-                            errors++;
-                    }
-                }
-                break;
-            case FP16:
-                for (uint32_t m = 0; m < M; m++) {
-                    for (uint32_t n = 0; n < N; n++) {
-                        uint32_t idx = m * N + n;
-                        if (fabs(result[idx] - ((__fp16 *)l1_c)[idx]) > 0.001)
-                            errors++;
-                    }
+    // Copy data out of TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(remote_c, local_c, size_frac_c);
+        snrt_dma_wait_all();
+    }
+
+// TODO: currently only works for single cluster otherwise need to
+//       synchronize all cores here
+#ifdef BIST
+    uint32_t errors = M * N;
+
+    if (snrt_cluster_core_idx() == 0) {
+        for (uint32_t m = 0; m < M; m++) {
+            for (uint32_t n = 0; n < N; n++) {
+                uint32_t idx = m * N + n;
+                switch (dtype_size) {
+                    case FP64:
+                        if (fabs(result[idx] - ((double *)local_c)[idx]) <
+                            0.001)
+                            errors--;  // within tolerance
+                        break;
+                    case FP32:
+                        if (fabs(result[idx] - ((float *)local_c)[idx]) < 0.001)
+                            errors--;  // within tolerance
+                        break;
+                    case FP16:
+                        if (fabs(result[idx] - ((__fp16 *)local_c)[idx]) <
+                            0.001)
+                            errors--;  // within tolerance
+                        break;
+                    case FP8:
+                        printf("No golden model yet for fp8!\n");
+                        return -1;
+                        break;
                 }
-                break;
-            case FP8:
-                printf("No golden model yet for fp8!\n");
-                return -1;
-                break;
+            }
         }
         printf("%d/%d Errors\n", errors, M * N);
     }
-    return errors;
 
+    return errors;
 #endif
 
     return 0;
diff --git a/sw/blas/gemm/verify.py b/sw/blas/gemm/verify.py
new file mode 100755
index 000000000..3bae7f801
--- /dev/null
+++ b/sw/blas/gemm/verify.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import sys
+from pathlib import Path
+import numpy as np
+from data.datagen import golden_model
+
+sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
+import verification  # noqa: E402
+from elf import Elf  # noqa: E402
+from data_utils import bytes_to_doubles, bytes_to_uint32s  # noqa: E402
+
+
+ERR_THRESHOLD = 0.001
+
+
+def main():
+    # Run simulation and get outputs
+    args = verification.parse_args()
+    raw_results = verification.simulate(sim_bin=args.sim_bin,
+                                        snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
+                                        log=args.log,
+                                        output_uids=['c'])
+    c_actual = np.array(bytes_to_doubles(raw_results['c']))
+
+    # Extract input operands from ELF file
+    if args.symbols_bin:
+        elf = Elf(args.symbols_bin)
+    else:
+        elf = Elf(args.snitch_bin)
+    a = np.array(bytes_to_doubles(elf.get_symbol_contents('a')))
+    b = np.array(bytes_to_doubles(elf.get_symbol_contents('b')))
+    c = np.array(bytes_to_doubles(elf.get_symbol_contents('c')))
+    alpha = bytes_to_uint32s(elf.get_symbol_contents('ALPHA'))[0]
+    m = bytes_to_uint32s(elf.get_symbol_contents('M'))[0]
+    n = bytes_to_uint32s(elf.get_symbol_contents('N'))[0]
+    k = bytes_to_uint32s(elf.get_symbol_contents('K'))[0]
+    tb = bytes_to_uint32s(elf.get_symbol_contents('TB'))[0]
+    a = np.reshape(a, (m, k))
+    b = np.reshape(b, (k, n))
+    if tb:
+        b = b.transpose()
+    c = np.reshape(c, (m, n))
+
+    # Verify results
+    c_golden = golden_model(a, b, alpha, c).flatten()
+
+    absolute_err = np.absolute(c_golden - c_actual)
+    fail = np.any(absolute_err > ERR_THRESHOLD)
+
+    return int(fail)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index b345ce874..537f488cd 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -6,11 +6,16 @@
 
 #include <stdint.h>
 
+// Guard to avoid conflict with BLAS header file
+// TODO: move this definition to Snitch math library to solve problem
+#ifndef PRECISION_T
+#define PRECISION_T
 typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t;
 
 typedef float v2f32 __attribute__((vector_size(8)));
 typedef __fp16 v4f16 __attribute__((vector_size(8)));
 typedef char v8f8 __attribute__((vector_size(8)));
+#endif
 
 typedef union {
     double f64;
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index 4c7ff7b1e..8e80cf35c 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -72,6 +72,7 @@ runs:
   - elf: apps/blas/axpy/build/axpy.elf
     cmd: ../../sw/blas/axpy/verify.py {sim_bin} {elf}
   - elf: apps/blas/gemm/build/gemm.elf
+    cmd: ../../sw/blas/gemm/verify.py {sim_bin} {elf}
   - elf: apps/dnn/batchnorm/build/batchnorm.elf
   - elf: apps/dnn/linear/build/linear.elf
   - elf: apps/dnn/maxpool/build/maxpool.elf
diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py
index cea0721f4..664e2624b 100644
--- a/util/sim/data_utils.py
+++ b/util/sim/data_utils.py
@@ -5,6 +5,14 @@
 # Author: Luca Colagrande <colluca@iis.ee.ethz.ch>
 
 import struct
+from datetime import datetime
+
+
+def emit_license():
+    """Return the three-line '//' license header, stamped with the current year."""
+    return (f"// Copyright {datetime.now().year} ETH Zurich and University of Bologna.\n"
+            f"// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n"
+            f"// SPDX-License-Identifier: Apache-2.0\n\n")
 
 
 def variable_attributes(alignment=None, section=None):
@@ -20,7 +28,11 @@ def format_vector_definition(type, uid, vector, alignment=None, section=None):
     attributes = variable_attributes(alignment, section)
     s = f'{type} {uid}[{len(vector)}] {attributes} = ' + '{\n'
     for el in vector:
-        s += f'\t{el},\n'
+        if type != 'char':
+            el_str = f'{el}'
+        else:
+            el_str = f'0x{el:02x}'
+        s += f'\t{el_str},\n'
     s += '};'
     return s
 
@@ -55,3 +67,16 @@ def bytes_to_doubles(byte_array):
         double = struct.unpack('<d', double_bytes)[0]
         doubles.append(double)
     return doubles
+
+
+def bytes_to_uint32s(byte_array):
+    uint32_size = struct.calcsize('I')  # Size of a uint32 in bytes
+    num_uints = len(byte_array) // uint32_size
+
+    # Unpack the byte array into a list of uints
+    uints = []
+    for i in range(num_uints):
+        uint32_bytes = byte_array[i * uint32_size:(i + 1) * uint32_size]
+        uint = struct.unpack('<I', uint32_bytes)[0]
+        uints.append(uint)
+    return uints

From f8b1fcf4ad8549befa342f20ecc440a22164e6bb Mon Sep 17 00:00:00 2001
From: Paul Scheffler <paulsc@iis.ee.ethz.ch>
Date: Tue, 12 Sep 2023 20:53:51 +0200
Subject: [PATCH 07/13] sw: Make compatible with non-12 LLVM versions (#46)

Co-authored-by: Luca Colagrande <luca.colagrande3@gmail.com>
---
 sw/blas/gemm/src/gemm.h               | 16 ++++++++--------
 target/snitch_cluster/sw/toolchain.mk |  3 ++-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h
index 86ec17ede..ab0f17285 100644
--- a/sw/blas/gemm/src/gemm.h
+++ b/sw/blas/gemm/src/gemm.h
@@ -121,7 +121,7 @@ void gemm_fp64_opt(uint32_t M, uint32_t N, uint32_t K, double* A, uint32_t ldA,
     for (uint32_t m = 0; m < M; m++) {
         uint32_t n = 0;
         for (uint32_t n0 = 0; n0 < N / unroll; n0++) {
-            register double c[unroll];
+            double c[unroll];
 
             // Load intermediate result
             if (*ALPHA) {
@@ -234,7 +234,7 @@ void gemm_fp32_opt(const uint32_t M, const uint32_t N, const uint32_t K,
         for (uint32_t n0 = 0; n0 < N / unroll; n0++) {
             float* _C = &C[m * ldC + n / 2];
             const register float zero = 0.0;
-            register v2f32 c[unroll], reduce_reg[unroll];
+            v2f32 c[unroll], reduce_reg[unroll];
 
             asm volatile(
                 "lw      t0, 0(%[ALPHA]) \n"
@@ -384,8 +384,8 @@ void gemm_fp16_opt(uint32_t M, uint32_t N, uint32_t K, __fp16* A, uint32_t ldA,
         for (uint32_t n0 = 0; n0 < N / unroll; n0++) {
             __fp16* _C = &C[m * ldC + n];
             const register float zero = 0.0;
-            register v4f16 c[unroll];
-            register v2f32 reduce_reg[unroll];
+            v4f16 c[unroll];
+            v2f32 reduce_reg[unroll];
             uint32_t alpha;
 
             asm volatile(
@@ -568,8 +568,8 @@ void gemm_fp16_ex_opt(uint32_t M, uint32_t N, uint32_t K, __fp16* A,
         for (uint32_t n0 = 0; n0 < N / unroll; n0++) {
             __fp16* _C = &C[m * ldC + n];
             const register float zero = 0.0;
-            register v4f16 c[unroll];
-            register v2f32 reduce_reg[unroll];
+            v4f16 c[unroll];
+            v2f32 reduce_reg[unroll];
             uint32_t alpha;
 
             asm volatile(
@@ -735,8 +735,8 @@ void gemm_fp8_ex_opt(uint32_t M, uint32_t N, uint32_t K, char* A, uint32_t ldA,
         for (uint32_t n0 = 0; n0 < N / unroll; n0++) {
             char* _C = &C[m * ldC + n];
             const register float zero = 0.0;
-            register v8f8 c[unroll];
-            register v4f16 reduce_reg[unroll];
+            v8f8 c[unroll];
+            v4f16 reduce_reg[unroll];
             uint32_t alpha;
 
             asm volatile(
diff --git a/target/snitch_cluster/sw/toolchain.mk b/target/snitch_cluster/sw/toolchain.mk
index e8fb7b46b..4fa0fc5af 100644
--- a/target/snitch_cluster/sw/toolchain.mk
+++ b/target/snitch_cluster/sw/toolchain.mk
@@ -17,6 +17,7 @@ DEBUG ?= OFF # ON to turn on debugging symbols
 
 # Compiler toolchain
 LLVM_BINROOT    ?= $(dir $(shell which riscv32-unknown-elf-clang))
+LLVM_VER        ?= $(shell $(LLVM_BINROOT)/llvm-config --version | grep -Eo '[0-9]+\.[0-9]+\.[0-9]+')
 RISCV_CC        ?= $(LLVM_BINROOT)/clang
 RISCV_LD        ?= $(LLVM_BINROOT)/ld.lld
 RISCV_AR        ?= $(LLVM_BINROOT)/llvm-ar
@@ -48,7 +49,7 @@ RISCV_LDFLAGS += -fuse-ld=$(RISCV_LD)
 RISCV_LDFLAGS += -nostartfiles
 RISCV_LDFLAGS += -nostdlib
 RISCV_LDFLAGS += -lc
-RISCV_LDFLAGS += -L$(LLVM_BINROOT)/../lib/clang/12.0.1/lib/
+RISCV_LDFLAGS += -L$(LLVM_BINROOT)/../lib/clang/$(LLVM_VER)/lib/
 RISCV_LDFLAGS += -lclang_rt.builtins-riscv32
 
 # Archiver flags

From 5c470c12a1f451509bea8344a50f1abd872e3be9 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 15 Sep 2023 10:26:06 +0200
Subject: [PATCH 08/13] axpy: Allow storing data to custom sections

---
 sw/blas/axpy/Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sw/blas/axpy/Makefile b/sw/blas/axpy/Makefile
index 40c3af1f6..bed4edaa8 100644
--- a/sw/blas/axpy/Makefile
+++ b/sw/blas/axpy/Makefile
@@ -9,7 +9,8 @@ MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
 DATA_DIR := $(realpath $(MK_DIR)/data)
 SRC_DIR  := $(realpath $(MK_DIR)/src)
 
-LENGTH ?= 24
+LENGTH  ?= 24
+SECTION ?=
 
 APP     ?= axpy
 SRCS    ?= $(SRC_DIR)/main.c
@@ -19,7 +20,7 @@ DATAGEN_PY = $(DATA_DIR)/datagen.py
 DATA_H     = $(DATA_DIR)/data.h
 
 $(DATA_H): $(DATAGEN_PY)
-	$< $(LENGTH) > $@
+	$< $(LENGTH) --section="$(SECTION)" > $@
 
 .PHONY: clean-data clean
 

From 4b9f7f1e0bcaddd9706c051a08d1934d7769445d Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 15 Sep 2023 10:27:39 +0200
Subject: [PATCH 09/13] util/perf_csv.py: Allow >10 hexadecimal-indexed traces

---
 util/trace/perf_csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/trace/perf_csv.py b/util/trace/perf_csv.py
index d2286aae5..450758c70 100755
--- a/util/trace/perf_csv.py
+++ b/util/trace/perf_csv.py
@@ -51,7 +51,7 @@ def main():
     for dump in dumps:
 
         # Get hart id from filename and append to index
-        hartid = int(re.search(HARTID_REGEX, dump).group(1))
+        hartid = int(re.search(HARTID_REGEX, dump).group(1), base=16)
         index.append(hartid)
 
         # Populate dictionary of metrics for the current hart

From e57335773da2899bae86410760b6e8b0c1a0de51 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Fri, 15 Sep 2023 17:27:20 +0200
Subject: [PATCH 10/13] util/sim: Log errors caught by IPC verification
 framework

---
 sw/blas/axpy/verify.py   |  3 +++
 util/sim/verification.py | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/sw/blas/axpy/verify.py b/sw/blas/axpy/verify.py
index 02cb15975..80b195ff9 100755
--- a/sw/blas/axpy/verify.py
+++ b/sw/blas/axpy/verify.py
@@ -42,6 +42,9 @@ def main():
     z_golden = golden_model(a, x, y)
     relative_err = np.absolute((z_golden - z_actual) / z_golden)
     fail = np.any(relative_err > ERR_THRESHOLD)
+    if (fail):
+        verification.dump_results_to_csv([z_golden, z_actual, relative_err],
+                                         Path.cwd() / 'axpy_results.csv')
 
     return int(fail)
 
diff --git a/util/sim/verification.py b/util/sim/verification.py
index 9878ef62a..04594a51c 100644
--- a/util/sim/verification.py
+++ b/util/sim/verification.py
@@ -7,6 +7,8 @@
 
 import sys
 import argparse
+import numpy as np
+import csv
 from elf import Elf
 from pathlib import Path
 
@@ -60,3 +62,19 @@ def simulate(sim_bin, snitch_bin, log, output_uids, symbols_bin=None):
     sim.finish(wait_for_sim=True)
 
     return raw_outputs
+
+
+# Takes a set of Numpy arrays (of the same shape), flattens them, zips them
+# and dumps them to a CSV file. Arrays may for example be: golden results, actual
+# results, absolute errors and relative errors.
+def dump_results_to_csv(results, path):
+    # Flatten each array and place them side by side, one column per array
+    flattened = [arr.flatten() for arr in results]
+    zipped = np.column_stack(flattened)
+    # Write row-by-row; newline='' lets the csv module control line endings
+    with open(path, 'w', newline='') as csv_file:
+        csv_writer = csv.writer(csv_file)
+        for row in zipped:
+            csv_writer.writerow(row)
+    # Print path where results were written
+    print(f"Wrote results to {path}")

From 4581ed9d3f5cbf2d298a43c8c60c844fdf4dfb4e Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Tue, 5 Sep 2023 11:28:39 +0200
Subject: [PATCH 11/13] test: Fix bandwidth loss in `tb_memory_axi` IP

---
 target/common/test/tb_memory_axi.sv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/common/test/tb_memory_axi.sv b/target/common/test/tb_memory_axi.sv
index cddbc4239..2c8e28a6e 100644
--- a/target/common/test/tb_memory_axi.sv
+++ b/target/common/test/tb_memory_axi.sv
@@ -95,6 +95,7 @@ module tb_memory_axi #(
     .ID_WIDTH   ( AxiIdWidth   ),
     .USER_WIDTH ( AxiUserWidth ),
     .DECOUPLE_W ( 1            ),
+    .FULL_BW    ( 1            ),
     .AXI_MAX_WRITE_TXNS ( 32'd128 ),
     .AXI_MAX_READ_TXNS  ( 32'd128 )
   ) i_axi_to_reg (

From 1273cc22baa86bea5c48c6e7f8b8316430d3284a Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Wed, 20 Sep 2023 16:47:50 +0200
Subject: [PATCH 12/13] simulate.py: Run tests in parallel

---
 .github/workflows/ci.yml |   6 ++-
 .gitlab-ci.yml           |   9 ++--
 target/common/common.mk  |   2 +-
 util/sim/simulate.py     | 111 ++++++++++++++++++++++++++++++---------
 4 files changed, 97 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 84faeac81..f2c3e692a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -43,7 +43,8 @@ jobs:
       - name: Run Tests
         working-directory: target/snitch_cluster
         run: |-
-          ../../util/sim/simulate.py sw/run.yaml --simulator verilator
+          ../../util/sim/simulate.py sw/run.yaml --simulator verilator -j \
+          --verbose
 
   ############################################
   # Build SW on Snitch Cluster w/ Banshee #
@@ -66,4 +67,5 @@ jobs:
           SNITCH_LOG: info
         working-directory: target/snitch_cluster
         run: |-
-          ../../util/sim/simulate.py sw/run.yaml --simulator banshee
+          ../../util/sim/simulate.py sw/run.yaml --simulator banshee -j \
+          --verbose
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 18adcf22e..610c271ea 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -93,7 +93,7 @@ snitch-cluster-vlt:
   script:
     - cd target/snitch_cluster
     - $VERILATOR make bin/snitch_cluster.vlt
-    - $VERILATOR ../../util/sim/simulate.py sw/run.yaml --simulator verilator
+    - $VERILATOR ../../util/sim/simulate.py sw/run.yaml --simulator verilator -j --verbose
   # yamllint enable rule:line-length
 
 # VCS
@@ -102,7 +102,7 @@ snitch-cluster-vcs:
   script:
     - cd target/snitch_cluster
     - $VCS make bin/snitch_cluster.vcs
-    - $VCS ../../util/sim/simulate.py sw/run.yaml --simulator vcs
+    - $VCS ../../util/sim/simulate.py sw/run.yaml --simulator vcs -j --verbose
 
 # Questa
 snitch-cluster-vsim:
@@ -110,7 +110,8 @@ snitch-cluster-vsim:
   script:
     - cd target/snitch_cluster
     - $QUESTA make bin/snitch_cluster.vsim
-    - $QUESTA ../../util/sim/simulate.py sw/run.yaml --simulator vsim
+    - $QUESTA ../../util/sim/simulate.py sw/run.yaml --simulator vsim -j
+      --verbose
 
 # Banshee
 snitch-cluster-banshee:
@@ -126,4 +127,4 @@ snitch-cluster-banshee:
     - cd banshee
     - cargo install --debug --path .
     - cd ../target/snitch_cluster
-    - ../../util/sim/simulate.py sw/run.yaml --simulator banshee
+    - ../../util/sim/simulate.py sw/run.yaml --simulator banshee -j --verbose
diff --git a/target/common/common.mk b/target/common/common.mk
index 9c469f5a6..6b9c679d0 100644
--- a/target/common/common.mk
+++ b/target/common/common.mk
@@ -154,7 +154,7 @@ define QUESTASIM
 	@echo 'binary=$$(realpath --relative-to=${MKFILE_DIR} $$1)' >> $@
 	@echo 'cd ${MKFILE_DIR}' >> $@
 	@echo 'echo $$binary > $(LOGS_DIR)/.rtlbinary' >> $@
-	@echo '${VSIM} +permissive ${VSIM_FLAGS} -work ${MKFILE_DIR}/${VSIM_BUILDDIR} -c \
+	@echo '${VSIM} +permissive ${VSIM_FLAGS} $$3 -work ${MKFILE_DIR}/${VSIM_BUILDDIR} -c \
 				-ldflags "-Wl,-rpath,${FESVR}/lib -L${FESVR}/lib -lfesvr -lutil" \
 				$1 +permissive-off ++$$binary ++$$2' >> $@
 	@chmod +x $@
diff --git a/util/sim/simulate.py b/util/sim/simulate.py
index a1466dc16..db00292af 100755
--- a/util/sim/simulate.py
+++ b/util/sim/simulate.py
@@ -8,11 +8,14 @@
 # TODO colluca: timeout feature
 
 import argparse
+import multiprocessing
 from pathlib import Path
 import subprocess
 from termcolor import colored, cprint
+import os
 import re
 import sys
+import time
 import yaml
 
 
@@ -28,7 +31,7 @@
     'vcs': 'bin/snitch_cluster.vcs'
 }
 SIMULATOR_CMDS = {
-    'vsim': '{sim_bin} {elf}',
+    'vsim': '{sim_bin} {elf} "" -batch',
     'banshee': ('{{sim_bin}} --no-opt-llvm --no-opt-jit --configuration {cfg}'
                 ' --trace {{elf}} > /dev/null').format(cfg=BANSHEE_CFG),
     'verilator': '{sim_bin} {elf}',
@@ -62,6 +65,22 @@ def parse_args():
         '--early-exit',
         action='store_true',
         help='Exit as soon as any test fails')
+    parser.add_argument(
+        '-j',
+        action='store',
+        dest='n_procs',
+        nargs='?',
+        type=int,
+        default=1,
+        const=os.cpu_count(),
+        help=('Maximum number of tests to run in parallel. '
+              'One if the option is not present. Equal to the number of CPU cores '
+              'if the option is present but not followed by an argument.'))
+    parser.add_argument(
+        '--verbose',
+        action='store_true',
+        help=('Option to print simulation logs when multiple tests are run in parallel. '
+              'Logs are always printed when n_procs == 1'))
     args = parser.parse_args()
     return args
 
@@ -81,17 +100,25 @@ def check_exit_code(test, exit_code):
         return exit_code
 
 
-def run_simulation(cmd, simulator, test):
+def multiple_processes(args):
+    return args.n_procs != 1
+
+
+def run_simulation(cmd, simulator, test, quiet=False):
     # Defaults
     result = 1
+    log = ''
 
     # Spawn simulation subprocess
-    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
+    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                         universal_newlines=True)
 
     # Poll simulation subprocess and log its output
     while p.poll() is None:
         line = p.stdout.readline()
-        print(line, end='', flush=True)
+        log += line
+        if not quiet:
+            print(line, end='', flush=True)
 
         # When simulating with vsim or vcs, we need to parse the simulation
         # log to catch the application's return code
@@ -123,7 +150,7 @@ def run_simulation(cmd, simulator, test):
         if exit_code != 0:
             result = exit_code
 
-    return result
+    return result, log
 
 
 def run_test(test, args):
@@ -132,11 +159,12 @@ def run_test(test, args):
     sim_bin = args.sim_bin if args.sim_bin else SIMULATOR_BINS[simulator]
     dry_run = args.dry_run
     testlist = args.testlist
+    quiet = multiple_processes(args)
 
     # Check if simulator is supported for this test
     if 'simulators' in test:
         if simulator not in test['simulators']:
-            return 0
+            return (0, '')
 
     # Construct path to executable
     elf = Path(test['elf'])
@@ -152,12 +180,14 @@ def run_test(test, args):
     else:
         cmd = SIMULATOR_CMDS[simulator]
         cmd = cmd.format(sim_bin=sim_bin, elf=elf)
-    print(f'$ {cmd}', flush=True)
+    if not quiet:
+        print(f'$ {cmd}', flush=True)
 
     # Run simulation
     result = 0
+    log = ''
     if not dry_run:
-        result = run_simulation(cmd, simulator, test)
+        result, log = run_simulation(cmd, simulator, test, quiet)
 
     # Report failure or success
     if result != 0:
@@ -165,39 +195,72 @@ def run_test(test, args):
     else:
         cprint(f'{elf} test passed', 'green', attrs=['bold'], flush=True)
 
-    return result
+    return (result, log)
 
 
 def print_failed_test(test):
     print(f'{colored(test["elf"], "cyan")} test {colored("failed", "red")}')
 
 
-def print_test_summary(failed_tests, dry_run=False):
-    if not dry_run:
-        print('\n==== Test summary ====')
+def print_test_summary(failed_tests, args):
+    if not args.dry_run:
+        header = f'\n==== Test summary {"(early exit)" if args.early_exit else ""} ===='
+        cprint(header, attrs=['bold'])
         if failed_tests:
             for failed_test in failed_tests:
                 print_failed_test(failed_test)
-            return 1
         else:
             print(f'{colored("All tests passed!", "green")}')
-            return 0
-    return 0
 
 
 def run_tests(args):
-    # Iterate tests
+
+    # Get tests from testlist
     tests = get_tests(args.testlist)
+
+    # Create a process Pool
+    with multiprocessing.Pool(args.n_procs) as pool:
+
+        # Create a shared object which parent and child processes can access
+        # concurrently to terminate the pool early as soon as one process fails
+        exit_early = multiprocessing.Value('B')
+        exit_early.value = 0
+
+        # Define callback for early exit
+        def completion_callback(return_value):
+            result = return_value[0]
+            log = return_value[1]
+            if args.early_exit and result != 0:
+                exit_early.value = 1
+            # Printing the log all at once here, rather than line-by-line
+            # in run_simulation, ensures that the logs of different processes
+            # are not interleaved in stdout.
+            # However, as we prefer line-by-line printing when a single process
+            # is used, we have to make sure we don't print twice.
+            if args.verbose and multiple_processes(args):
+                print(log)
+
+        # Queue tests to process pool
+        results = []
+        for test in tests:
+            result = pool.apply_async(run_test, args=(test, args), callback=completion_callback)
+            results.append(result)
+
+        # Wait for all tests to complete
+        running = range(len(tests))
+        while len(running) != 0 and not exit_early.value:
+            time.sleep(1)
+            running = [i for i in running if not results[i].ready()]
+
+    # Query test results
     failed_tests = []
-    for test in tests:
-        # Run test
-        result = run_test(test, args)
-        if result != 0:
+    for test, result in zip(tests, results):
+        if result.ready() and result.get()[0] != 0:
             failed_tests.append(test)
-            # End program if requested on first test failure
-            if args.early_exit:
-                break
-    return print_test_summary(failed_tests, args.dry_run)
+
+    print_test_summary(failed_tests, args)
+
+    return len(failed_tests)
 
 
 def main():

From cd78ab6b9e1ea047fa44dc1eded86136d3777d09 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Thu, 21 Sep 2023 16:23:42 +0200
Subject: [PATCH 13/13] util/sim: Minor improvements

---
 util/sim/simulate.py     | 10 ++++------
 util/sim/verification.py |  4 ++--
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/util/sim/simulate.py b/util/sim/simulate.py
index db00292af..4e36cc1e1 100755
--- a/util/sim/simulate.py
+++ b/util/sim/simulate.py
@@ -213,10 +213,7 @@ def print_test_summary(failed_tests, args):
             print(f'{colored("All tests passed!", "green")}')
 
 
-def run_tests(args):
-
-    # Get tests from testlist
-    tests = get_tests(args.testlist)
+def run_tests(tests, args):
 
     # Create a process Pool
     with multiprocessing.Pool(args.n_procs) as pool:
@@ -265,8 +262,9 @@ def completion_callback(return_value):
 
 def main():
     args = parse_args()
-    sys.exit(run_tests(args))
+    tests = get_tests(args.testlist)
+    return run_tests(tests, args)
 
 
 if __name__ == '__main__':
-    main()
+    sys.exit(main())
diff --git a/util/sim/verification.py b/util/sim/verification.py
index 04594a51c..9dd3428e4 100644
--- a/util/sim/verification.py
+++ b/util/sim/verification.py
@@ -28,8 +28,8 @@ def parse_args():
         help='The Snitch binary to be executed by the simulated Snitch hardware')
     parser.add_argument(
         '--symbols-bin',
-        help='An optional binary containing the I/O symbols. By default,'
-             'these are searched for in snitch_bin. This argument serves as an'
+        help='An optional binary containing the I/O symbols. By default, '
+             'these are searched for in snitch_bin. This argument serves as an '
              'alternative.')
     parser.add_argument(
         '--log',