From de323667c2b96cc3d0cf3dac0ecb628fedc95adb Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Jan 2024 15:03:21 +0100 Subject: [PATCH 1/8] target: Fix bug in tracer targets --- target/common/common.mk | 59 ++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/target/common/common.mk b/target/common/common.mk index 6460962aa..9efad2e95 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -199,25 +199,31 @@ endef # Traces # ########## -DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null)) -TXT_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.txt/g')) -PERF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g')) -ANNOTATED_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.s/g')) -DIFF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.diff/g')) - -GENTRACE_OUTPUTS = $(TXT_TRACES) $(PERF_TRACES) -ANNOTATE_OUTPUTS = $(ANNOTATED_TRACES) -PERF_CSV = $(LOGS_DIR)/perf.csv -EVENT_CSV = $(LOGS_DIR)/event.csv -TRACE_CSV = $(LOGS_DIR)/trace.csv -TRACE_JSON = $(LOGS_DIR)/trace.json - -.PHONY: traces annotate perf-csv event-csv layout -traces: $(GENTRACE_OUTPUTS) -annotate: $(ANNOTATE_OUTPUTS) -perf-csv: $(PERF_CSV) -event-csv: $(EVENT_CSV) -layout: $(TRACE_CSV) $(TRACE_JSON) +SNITCH_DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null)) +SNITCH_TXT_TRACES = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/\.dasm/\.txt/g')) +SNITCH_ANNOTATED_TRACES = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/\.dasm/\.s/g')) +SNITCH_PERF_DUMPS = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g')) + +TXT_TRACES += $(SNITCH_TXT_TRACES) +ANNOTATED_TRACES += $(SNITCH_ANNOTATED_TRACES) +PERF_DUMPS += $(SNITCH_PERF_DUMPS) +JOINT_PERF_DUMP = $(LOGS_DIR)/perf.json +ROI_DUMP = $(LOGS_DIR)/roi.json +VISUAL_TRACE = $(LOGS_DIR)/trace.json + +.PHONY: traces annotate visual-trace clean-traces clean-annotate clean-perf clean-visual-trace +traces: $(TXT_TRACES) +annotate: $(ANNOTATED_TRACES) +perf: $(JOINT_PERF_DUMP) +visual-trace: $(VISUAL_TRACE) +clean-traces: + rm -f $(TXT_TRACES) +clean-annotate: + rm -f $(ANNOTATED_TRACES) +clean-perf: + rm -f $(PERF_DUMPS) $(JOINT_PERF_DUMP) +clean-visual-trace: + rm -f $(VISUAL_TRACE) $(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) $(DASM) < $< | $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt @@ -230,14 +236,11 @@ $(LOGS_DIR)/trace_hart_%.s: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY} $(LOGS_DIR)/trace_hart_%.diff: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY} ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -d -$(PERF_CSV): $(PERF_TRACES) $(PERF_CSV_PY) - $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES) +$(JOINT_PERF_DUMP): $(PERF_DUMPS) $(JOIN_PY) + $(JOIN_PY) -i $(shell ls $(LOGS_DIR)/*_perf.json) -o $@ -$(EVENT_CSV): $(PERF_TRACES) $(PERF_CSV_PY) - $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES) --filter tstart tend +$(ROI_DUMP): $(JOINT_PERF_DUMP) $(ROI_SPEC) $(ROI_PY) + $(ROI_PY) $(JOINT_PERF_DUMP) $(ROI_SPEC) --cfg $(CFG) -o $@ -$(TRACE_CSV): $(EVENT_CSV) $(LAYOUT_FILE) $(LAYOUT_EVENTS_PY) - $(LAYOUT_EVENTS_PY) $(LAYOUT_EVENTS_FLAGS) $(EVENT_CSV) $(LAYOUT_FILE) -o $@ - -$(TRACE_JSON): $(TRACE_CSV) $(EVENTVIS_PY) - $(EVENTVIS_PY) -o $@ $(TRACE_CSV) +$(VISUAL_TRACE): $(ROI_DUMP) $(VISUALIZE_PY) + $(VISUALIZE_PY) $(ROI_DUMP) --traces $(SNITCH_TXT_TRACES) --elf $(BINARY) -o $@ From 
d48a60c66bd718c985af1a7e5a76ab5aff5d8943 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 3 Apr 2024 13:42:46 +0200 Subject: [PATCH 2/8] trace: Refactor and prepare for DMA trace generation * bench: Refactor benchmarking utils to use JSON * docs: Add trace and benchmarking utilities --- .github/workflows/ci.yml | 14 +++ .gitlab-ci.yml | 8 ++ docs/rm/bench/join.md | 1 + docs/rm/bench/roi.md | 1 + docs/rm/bench/visualize.md | 1 + docs/rm/trace/annotate.md | 1 + docs/rm/trace/events.md | 1 + docs/rm/trace/gen_trace.md | 4 + mkdocs.yml | 7 ++ python-requirements.txt | 17 ++-- target/common/common.mk | 10 +- util/bench/__init__.py | 0 util/bench/join.py | 62 ++++++++++++ util/bench/roi.py | 123 +++++++++++++++++++++++ util/bench/tests/__init__.py | 0 util/bench/tests/test_data/data.json | 46 +++++++++ util/bench/tests/test_data/roi.json | 33 ++++++ util/bench/tests/test_data/spec.json | 16 +++ util/bench/tests/test_roi.py | 62 ++++++++++++ util/bench/visualize.py | 117 +++++++++++++++++++++ util/trace/annotate.py | 39 ++++--- util/trace/events.py | 9 +- util/trace/eventvis.py | 136 ------------------------- util/trace/gen_trace.py | 80 ++++++++------- util/trace/layout_events.py | 145 --------------------------- util/trace/perf_csv.py | 83 --------------- 26 files changed, 584 insertions(+), 432 deletions(-) create mode 100644 docs/rm/bench/join.md create mode 100644 docs/rm/bench/roi.md create mode 100644 docs/rm/bench/visualize.md create mode 100644 docs/rm/trace/annotate.md create mode 100644 docs/rm/trace/events.md create mode 100644 util/bench/__init__.py create mode 100755 util/bench/join.py create mode 100755 util/bench/roi.py create mode 100644 util/bench/tests/__init__.py create mode 100644 util/bench/tests/test_data/data.json create mode 100644 util/bench/tests/test_data/roi.json create mode 100644 util/bench/tests/test_data/spec.json create mode 100644 util/bench/tests/test_roi.py create mode 100755 util/bench/visualize.py delete mode 100755 util/trace/eventvis.py delete mode 100755 util/trace/layout_events.py delete mode 100755 util/trace/perf_csv.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9d73348e8..4613be05d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,6 +21,20 @@ jobs: - name: Build docs run: make docs + ##################### + # Python unit tests # + ##################### + + pytest: + name: Python unit tests + runs-on: ubuntu-22.04 + container: + image: ghcr.io/pulp-platform/snitch_cluster:main + steps: + - uses: actions/checkout@v2 + - name: Run pytest + run: pytest + ############################################## # Simulate SW on Snitch Cluster w/ Verilator # ############################################## diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6e8f45cb1..656558761 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -24,6 +24,14 @@ docs: script: - make docs +##################### +# Python unit tests # +##################### + +pytest: + script: + - pytest + ################################# # Build Snitch cluster software # ################################# diff --git a/docs/rm/bench/join.md b/docs/rm/bench/join.md new file mode 100644 index 000000000..ee9aa8221 --- /dev/null +++ b/docs/rm/bench/join.md @@ -0,0 +1 @@ +::: join \ No newline at end of file diff --git a/docs/rm/bench/roi.md b/docs/rm/bench/roi.md new file mode 100644 index 000000000..239fedf30 --- /dev/null +++ b/docs/rm/bench/roi.md @@ -0,0 +1 @@ +::: roi \ No newline at end of file diff --git a/docs/rm/bench/visualize.md 
b/docs/rm/bench/visualize.md new file mode 100644 index 000000000..b2c2bed8b --- /dev/null +++ b/docs/rm/bench/visualize.md @@ -0,0 +1 @@ +::: visualize \ No newline at end of file diff --git a/docs/rm/trace/annotate.md b/docs/rm/trace/annotate.md new file mode 100644 index 000000000..b70b1a847 --- /dev/null +++ b/docs/rm/trace/annotate.md @@ -0,0 +1 @@ +::: annotate \ No newline at end of file diff --git a/docs/rm/trace/events.md b/docs/rm/trace/events.md new file mode 100644 index 000000000..5b9cca4ae --- /dev/null +++ b/docs/rm/trace/events.md @@ -0,0 +1 @@ +::: events \ No newline at end of file diff --git a/docs/rm/trace/gen_trace.md b/docs/rm/trace/gen_trace.md index c15c297be..8eef24668 100644 --- a/docs/rm/trace/gen_trace.md +++ b/docs/rm/trace/gen_trace.md @@ -1 +1,5 @@ +<<<<<<< HEAD ::: gen_trace +======= +::: gen_trace +>>>>>>> c744477... trace: Refactor and prepare for DMA trace generation diff --git a/mkdocs.yml b/mkdocs.yml index 61e4494a9..2ccad29a7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -28,6 +28,7 @@ plugins: paths: - util/sim - util/trace + - util/bench - target/snitch_cluster/util - macros: on_error_fail: true @@ -67,6 +68,12 @@ nav: - rm/sim/Elf.md - Trace Utilities: - gen_trace.py: rm/trace/gen_trace.md + - annotate.py: rm/trace/annotate.md + - events.py: rm/trace/events.md + - Benchmarking Utilities: + - join.py: rm/bench/join.md + - roi.py: rm/bench/roi.md + - visualize.py: rm/bench/visualize.md - Snitch Target Utilities: - run.py: rm/snitch_target_utils/run.md - build.py: rm/snitch_target_utils/build.md diff --git a/python-requirements.txt b/python-requirements.txt index 583de9499..72c35460f 100644 --- a/python-requirements.txt +++ b/python-requirements.txt @@ -2,6 +2,7 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 +# Keep sorted. 
bin2coe dataclasses editorconfig-checker==2.3.51 @@ -13,18 +14,20 @@ json5 jsonref jsonschema mako +matplotlib mkdocs-material -progressbar2 -tabulate -yamllint -pyyaml -pytablewriter -termcolor pandas prettytable -pyelftools +progressbar2 psutil +pyelftools pyflexfloat +pytablewriter +pytest +pyyaml +tabulate +termcolor +yamllint -r docs/requirements.txt -r sw/dnn/requirements.txt diff --git a/target/common/common.mk b/target/common/common.mk index 9efad2e95..18f40ac45 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -36,9 +36,9 @@ ADDR2LINE ?= $(LLVM_BINROOT)/llvm-addr2line GENTRACE_PY ?= $(UTIL_DIR)/trace/gen_trace.py ANNOTATE_PY ?= $(UTIL_DIR)/trace/annotate.py EVENTS_PY ?= $(UTIL_DIR)/trace/events.py -PERF_CSV_PY ?= $(UTIL_DIR)/trace/perf_csv.py -LAYOUT_EVENTS_PY ?= $(UTIL_DIR)/trace/layout_events.py -EVENTVIS_PY ?= $(UTIL_DIR)/trace/eventvis.py +JOIN_PY ?= $(UTIL_DIR)/bench/join.py +ROI_PY ?= $(UTIL_DIR)/bench/roi.py +VISUALIZE_PY ?= $(UTIL_DIR)/bench/visualize.py # For some reason `$(VERILATOR_SEPP) which verilator` returns a # a two-liner with the OS on the first line, hence the tail -n1 @@ -225,8 +225,8 @@ clean-perf: clean-visual-trace: rm -f $(VISUAL_TRACE) -$(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) - $(DASM) < $< | $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt +$(addprefix $(LOGS_DIR)/,trace_hart_%.txt hart_%_perf.json): $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) + $(DASM) < $< | $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt # Generate source-code interleaved traces for all harts. Reads the binary from # the logs/.rtlbinary file that is written at start of simulation in the vsim script diff --git a/util/bench/__init__.py b/util/bench/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/util/bench/join.py b/util/bench/join.py new file mode 100755 index 000000000..56c0defe0 --- /dev/null +++ b/util/bench/join.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande +"""Combines performance metrics from all threads into one JSON file. + +This script takes the performance metrics from multiple cores or DMA +engines, in JSON format as dumped by the [`events.py`][events] or +[`gen_trace.py`][gen_trace] scripts, and merges them into a single +JSON file for global inspection and further processing. 
+""" + +import sys +import argparse +import re +import json + + +FILENAME_REGEX = r'([a-z]+)_([0-9a-f]+)_perf.json' + + +def main(): + # Argument parsing + parser = argparse.ArgumentParser() + parser.add_argument( + '-i', + '--inputs', + metavar='', + nargs='+', + help='Input performance metric dumps') + parser.add_argument( + '-o', + '--output', + metavar='', + nargs='?', + default='perf.json', + help='Output JSON file') + args = parser.parse_args() + + # Populate a list (one entry per hart) of dictionaries + # enumerating all the performance metrics for each hart + data = {} + for filename in sorted(args.inputs): + + # Get thread ID and type (DMA or hart) from filename + match = re.search(FILENAME_REGEX, filename) + typ = match.group(1) + idx = int(match.group(2), base=16) + + # Populate dictionary of metrics for the current hart + with open(filename, 'r') as f: + data[f'{typ}_{idx}'] = json.load(f) + + # Export data + with open(args.output, 'w') as f: + json.dump(data, f, indent=4) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/util/bench/roi.py b/util/bench/roi.py new file mode 100755 index 000000000..7e4b18723 --- /dev/null +++ b/util/bench/roi.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande +"""Filters and labels execution regions for visualization. + +This script takes a JSON file of performance metrics, as output by +[`join.py`][join], and generates another JSON, where the execution +regions are filtered and labeled for visualization, according to an +auxiliary region-of-interest (ROI) specification file (JSON format). +The specification file can be a Mako template to parameterize +certain parameters, such as the number of clusters in the system. +The output JSON can be passed to the [`visualize.py`][visualize] +script for visualization. + +Check out `test_data/data.json` and `test_data/spec.json` for an +example input and specification file which can be fed as input to the +tool respectively. The corresponding output is contained in +`test_data/roi.json`. 
+""" + +import argparse +import json +import json5 +from mako.template import Template +import sys + + +def format_roi(roi, label): + return { + "label": label, + "tstart": roi["tstart"], + "tend": roi["tend"], + "attrs": {key: value for key, value in roi.items() if key not in ["tstart", "tend"]} + } + + +def get_roi(data, thread, idx): + thread_type, thread_idx = thread.split('_') + thread_idx = int(thread_idx) + try: + thread_data = data[thread] + except KeyError: + raise KeyError(f"Nonexistent thread {thread}") + if thread_type in ["hart", "dma"]: + try: + if thread_type == "hart": + return thread_data[idx] + elif thread_type == "dma": + return thread_data["transfers"][idx] + except IndexError: + raise IndexError(f"Thread {thread} does not contain region {idx}") + else: + raise ValueError(f"Unsupported thread type {thread_type}") + + +def filter_and_label_rois(data, spec): + output = {} + # Iterate all threads in the rendered specification + for thread_spec in spec: + thread = thread_spec['thread'] + output_rois = [] + # Iterate all ROIs to keep for the current thread + for roi in thread_spec['roi']: + output_roi = format_roi(get_roi(data, thread, roi['idx']), roi['label']) + output_rois.append(output_roi) + # Add ROIs for current thread to output, if any + if output_rois: + output[thread] = output_rois + return output + + +def load_json_inputs(input_path, spec_path, **kwargs): + # Read input JSON + with open(input_path, 'r') as f: + data = json5.load(f) + # Read and render specification template JSON + with open(spec_path, 'r') as f: + spec_template = Template(f.read()) + rendered_spec = spec_template.render(**kwargs) + spec = json5.loads(rendered_spec) + return data, spec + + +def main(): + # Argument parsing + parser = argparse.ArgumentParser() + parser.add_argument( + 'input', + help='Input JSON file') + parser.add_argument( + 'spec', + help='ROI specification file (JSON format)') + parser.add_argument( + '--cfg', + help='Hardware configuration file used to render the specification file') + parser.add_argument( + '-o', + '--output', + nargs='?', + default='roi.json', + help='Output JSON file') + args = parser.parse_args() + + # Load hardware configuration + with open(args.cfg, 'r') as f: + cfg = json5.load(f) + + # Read and render input files + data, spec = load_json_inputs(args.input, args.spec, cfg=cfg) + + # Process inputs and generate output JSON + output = filter_and_label_rois(data, spec) + + # Write output to file + with open(args.output, 'w') as f: + json.dump(output, f, indent=4) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/util/bench/tests/__init__.py b/util/bench/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/util/bench/tests/test_data/data.json b/util/bench/tests/test_data/data.json new file mode 100644 index 000000000..77ca26416 --- /dev/null +++ b/util/bench/tests/test_data/data.json @@ -0,0 +1,46 @@ +{ + "hart_0": [ + { + "tstart": 1759.0, + "tend": 6802.0, + "fpss_fpu_occupancy": 0.006345429307951616, + "total_ipc": 0.04501288915328178 + }, + { + "tstart": 6802.0, + "tend": 12647.0, + "fpss_fpu_occupancy": 0.013860369609856264, + "total_ipc": 0.20756331279945245 + } + ], + "dma_9": { + "aggregate_bw": 11.829313543599257, + "transfers": [ + { + "tstart": 3512, + "tend": 3526, + "bw": 1.1428571428571428 + }, + { + "tstart": 3564, + "tend": 3578, + "bw": 1.1428571428571428 + } + ] + }, + "dma_18": { + "aggregate_bw": 16.633245382585752, + "transfers": [ + { + "tstart": 3608, + "tend": 3622, + "bw": 1.1428571428571428 + 
}, + { + "tstart": 3660, + "tend": 3674, + "bw": 1.1428571428571428 + } + ] + } +} diff --git a/util/bench/tests/test_data/roi.json b/util/bench/tests/test_data/roi.json new file mode 100644 index 000000000..a6efe3773 --- /dev/null +++ b/util/bench/tests/test_data/roi.json @@ -0,0 +1,33 @@ +{ + "hart_0": [ + { + "label": "compute", + "tstart": 6802.0, + "tend": 12647.0, + "attrs": { + "fpss_fpu_occupancy": 0.013860369609856264, + "total_ipc": 0.20756331279945245 + } + } + ], + "dma_9": [ + { + "label": "dma_in", + "tstart": 3512, + "tend": 3526, + "attrs": { + "bw": 1.1428571428571428 + } + } + ], + "dma_18": [ + { + "label": "dma_in", + "tstart": 3608, + "tend": 3622, + "attrs": { + "bw": 1.1428571428571428 + } + } + ] +} diff --git a/util/bench/tests/test_data/spec.json b/util/bench/tests/test_data/spec.json new file mode 100644 index 000000000..ae58303c0 --- /dev/null +++ b/util/bench/tests/test_data/spec.json @@ -0,0 +1,16 @@ +[ + { + "thread": "hart_0", + "roi": [ + {"idx": 1, "label": "compute"} + ] + }, +% for i in range(0, num_clusters): + { + "thread": "${f'dma_{9*(i+1)}'}", + "roi": [ + {"idx": 0, "label": "dma_in"} + ] + }, +% endfor +] diff --git a/util/bench/tests/test_roi.py b/util/bench/tests/test_roi.py new file mode 100644 index 000000000..ffb567816 --- /dev/null +++ b/util/bench/tests/test_roi.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande + +import json +from pathlib import Path +import pytest +from bench.roi import get_roi, format_roi, load_json_inputs, filter_and_label_rois + +TEST_DATA_DIR = Path(__file__).resolve().parent / 'test_data' +INPUT_JSON = TEST_DATA_DIR / 'data.json' +SPEC_JSON = TEST_DATA_DIR / 'spec.json' +OUTPUT_JSON = TEST_DATA_DIR / 'roi.json' + + +def test_format_roi(): + label = "compute" + roi = { + "tstart": 1759.0, + "tend": 6802.0, + "fpss_fpu_occupancy": 0.006345429307951616, + "total_ipc": 0.04501288915328178 + } + formatted_roi = { + "label": "compute", + "tstart": 1759.0, + "tend": 6802.0, + "attrs": { + "fpss_fpu_occupancy": 0.006345429307951616, + "total_ipc": 0.04501288915328178 + }, + } + assert format_roi(roi, label) == formatted_roi + + +@pytest.mark.parametrize("thread, idx, roi", [ + ('hart_0', 0, { + "tstart": 1759.0, + "tend": 6802.0, + "fpss_fpu_occupancy": 0.006345429307951616, + "total_ipc": 0.04501288915328178 + }), + ('dma_9', 1, { + "tstart": 3564, + "tend": 3578, + "bw": 1.1428571428571428 + }) +]) +def test_get_roi(thread, idx, roi): + with open(INPUT_JSON, 'r') as f: + data = json.load(f) + assert get_roi(data, thread, idx) == roi + + +def test_filter_and_label_rois(): + data, spec = load_json_inputs(INPUT_JSON, SPEC_JSON, num_clusters=2) + with open(OUTPUT_JSON, 'r') as f: + output = json.load(f) + assert filter_and_label_rois(data, spec) == output diff --git a/util/bench/visualize.py b/util/bench/visualize.py new file mode 100755 index 000000000..087d8b86e --- /dev/null +++ b/util/bench/visualize.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande +"""Translates a ROI JSON for visualization in Chrome. 
+ +This script translates a JSON file, in the format produced by +[`roi.py`][roi], to a JSON file adhering to the syntax required by +Chrome's +[Trace-Viewer](https://github.com/catapult-project/catapult/tree/master/tracing). + +The output can be visualized in a Chrome browser: go to the +`about:tracing` URL and load the JSON file. + +This script can be compared to `trace/tracevis.py`, but instead of +visualizing individual instructions, it represents entire execution +regions as a whole. +""" + +import argparse +import json +from pathlib import Path +import sys + +sys.path.append(str(Path(__file__).parent / '../trace')) +import tracevis # noqa: E402 + + +# Converts nanoseconds to microseconds +def us(ns): + return ns / 1000 + + +def main(): + # Argument parsing + parser = argparse.ArgumentParser() + parser.add_argument( + 'input', + metavar='', + help='Input JSON file') + parser.add_argument( + '--traces', + metavar='', + nargs='*', + help='Simulation traces to process') + parser.add_argument( + '--elf', + nargs='?', + help='ELF from which the traces were generated') + parser.add_argument( + '-o', + '--output', + metavar='', + nargs='?', + default='trace.json', + help='Output JSON file') + args = parser.parse_args() + + # TraceViewer events + events = [] + + # Add a dummy instant event to mark time 0. + # This is to avoid that the events are shifted from + # their actual start times, as done to align the first event + # to time 0. + event = {'name': 'zero', + 'ph': 'I', # Instant event type + 'ts': 0, + 's': 'g' # Global scope + } + events.append(event) + + # Read JSON contents + with open(args.input) as f: + data = json.load(f) + + # Iterate threads + for thread, regions in data.items(): + + # Iterate execution regions for current thread + for region in regions: + + # Create TraceViewer event + ts = int(region['tstart']) + dur = int(region['tend']) - ts + event = { + 'name': region['label'], + 'ph': "X", # Complete event type + 'ts': us(ts), + 'dur': us(dur), + 'pid': 0, + 'tid': thread, + 'args': region['attrs'] + } + events.append(event) + + # Optionally extract also instruction-level events + # from the simulation traces + if args.traces and args.elf: + events += tracevis.parse_traces(args.traces, start=0, end=-1, fmt='snitch', + addr2line='addr2line', use_time=True, pid=1, + cache=True, elf=args.elf, collapse_call_stack=True) + + # Create TraceViewer JSON object + tvobj = {} + tvobj['traceEvents'] = events + tvobj['displayTimeUnit'] = "ns" + + # Dump TraceViewer events to JSON file + with open(args.output, 'w') as f: + json.dump(tvobj, f, indent=4) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/util/trace/annotate.py b/util/trace/annotate.py index e14853bdb..243784a07 100755 --- a/util/trace/annotate.py +++ b/util/trace/annotate.py @@ -1,23 +1,30 @@ #!/usr/bin/env python3 - # Copyright 2021 ETH Zurich and University of Bologna. # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 - -# This script parses the traces generated by Snitch and creates an annotated -# trace that includes code sources -# Example output: -# ; snrt_hartid (team.c:14) -# ; in snrt_cluster_core_idx (team.c:47) -# ; in main (event_unit.c:21) -# ; asm("csrr %0, mhartid" : "=r"(hartid)); -# 80000048 x13=0000000a # csrr a3, mhartid -# -# If the -d/--diff option is specified, it instead outputs a (fictitious) diff -# file which allows to visualize the trace-source correlation side-by-side -# instead of interleaved. 
-# For neater visualization, feed the diff file into a diff visualization tool e.g.: -# kompare -o +"""Annotates an instruction trace with source-code information. + +This script parses a human-readable trace, as generated by CVA6 or +Snitch's [`gen_trace.py`][gen_trace] script, and annotates every +instruction in the trace with information on its originating +source-code. + +Example output: +``` + ; snrt_hartid (team.c:14) + ; in snrt_cluster_core_idx (team.c:47) + ; in main (event_unit.c:21) + ; asm("csrr %0, mhartid" : "=r"(hartid)); + 80000048 x13=0000000a # csrr a3, mhartid +``` + +By default, the source-code information is interleaved in the same +file with the instruction trace. If you prefer to have a +side-by-side view, the -d/--diff option can be used. In this case, +the tool outputs a (fictitious) diff file which can be fed into a +diff visualization tool for side-by-side visualization in a GUI, +e.g. `kompare -o `. +""" import sys import os diff --git a/util/trace/events.py b/util/trace/events.py index a655be033..c5442ee2a 100755 --- a/util/trace/events.py +++ b/util/trace/events.py @@ -3,12 +3,13 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# This script takes a CVA6 or Snitch trace and it exports the simulation time -# of all mcycle CSR reads in a format compatible with the gen_trace.py -# script's JSON output. -# # Author: Luca Colagrande +"""Export `mcycle` CSR read events from a Snitch or CVA6 trace. +This script takes a CVA6 or Snitch trace and it exports the +simulation time of all `mcycle` CSR reads to a JSON file in a format +compatible with [`gen_trace.py`][gen_trace]'s output. +""" import sys import argparse diff --git a/util/trace/eventvis.py b/util/trace/eventvis.py deleted file mode 100755 index 4d0fdfdc7..000000000 --- a/util/trace/eventvis.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2020 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# This script takes a CSV of events, compatible with the CSV format produced by -# `perf_csv.py`, and creates a JSON file that can be visualized by -# [Trace-Viewer](https://github.com/catapult-project/catapult/tree/master/tracing) -# In Chrome, open `about:tracing` and load the JSON file to view it. -# -# Following is an example CSV containing two regions (as would be defined by the -# presence of one mcycle CSR read in the traces): -# -# , prepare data, , send interrupt, -# 0, 32906, 32911, 32911, 33662 -# -# The first line is used to assign a name to each region. -# Each of the following lines starts with the hartid, followed by the start and -# end timestamps of each region. -# While the alignment of the region names in the first line w.r.t. the following -# lines does not matter, we suggest to align them with the columns containing the -# start times of the respective regions (as in the example above). -# -# This script can be compared to `tracevis.py`, but instead of visualizing individual -# instructions, it visualizes coarser grained regions as delimited by events -# in the traces. -# -# Author: Luca Colagrande - -import sys -import argparse -import csv -import json -import tracevis - - -def pairwise(iterable): - "s -> (s0, s1), (s2, s3), (s4, s5), ..." 
- a = iter(iterable) - return zip(a, a) - - -# Converts nanoseconds to microseconds -def us(ns): - return ns / 1000 - - -def main(): - # Argument parsing - parser = argparse.ArgumentParser() - parser.add_argument( - 'csv', - metavar='', - help='Input CSV file') - parser.add_argument( - '--traces', - metavar='', - nargs='*', - help='Simulation traces to process') - parser.add_argument( - '--elf', - nargs='?', - help='ELF from which the traces were generated') - parser.add_argument( - '-o', - '--output', - metavar='', - nargs='?', - default='events.json', - help='Output JSON file') - args = parser.parse_args() - - # TraceViewer events - events = [] - - # Add a dummy instant event to mark time 0. - # This is to avoid that the events are shifted from - # their actual start times to align the first event - # at time 0. - event = {'name': 'zero', - 'ph': 'I', # Instant event type - 'ts': 0, - 's': 'g' # Global scope - } - events.append(event) - - # Read CSV to collect TraceViewer events - with open(args.csv) as f: - reader = csv.reader(f, delimiter=',') - - # Get region names - regions = [name for name in next(reader) if name] - - # Process lines - for row in reader: - - # First entry in row is the hart ID - tid = row[0] - - # Start and end times of each region follow - for i, (start, end) in enumerate(pairwise(row[1:])): - - # Filter regions this hart does not take part in - if start: - - # Create TraceViewer event - ts = int(start) - dur = int(end) - ts - event = {'name': regions[i], - 'ph': "X", # Complete event type - 'ts': us(ts), - 'dur': us(dur), - 'pid': 0, - 'tid': tid - } - events.append(event) - - # Optionally extract also instruction-level events - # from the simulation traces - if args.traces and args.elf: - events += tracevis.parse_traces(args.traces, start=0, end=-1, fmt='snitch', - addr2line='addr2line', use_time=True, pid=1, - cache=True, elf=args.elf, collapse_call_stack=True) - - # Create TraceViewer JSON object - tvobj = {} - tvobj['traceEvents'] = events - tvobj['displayTimeUnit'] = "ns" - - # Dump TraceViewer events to JSON file - with open(args.output, 'w') as f: - json.dump(tvobj, f, indent=4) - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py index fb3351ed2..145272382 100755 --- a/util/trace/gen_trace.py +++ b/util/trace/gen_trace.py @@ -924,6 +924,12 @@ def main(): ) parser.add_argument( '-o', + '--output', + required=True, + type=argparse.FileType('w'), + help='Path to the output file' + ) + parser.add_argument( '--offl', action='store_true', help='Annotate FPSS and sequencer offloads when they happen in core') @@ -951,42 +957,44 @@ def main(): args = parser.parse_args() line_iter = iter(args.infile.readline, b'') - # Prepare stateful data structures - time_info = None - gpr_wb_info = defaultdict(deque) - fpr_wb_info = defaultdict(deque) - fseq_info = { - 'curr_sec': 0, - 'fpss_pcs': deque(), - 'fseq_pcs': deque(), - 'cfg_buf': deque(), - 'curr_cfg': None - } - perf_metrics = [ - defaultdict(int) - ] # all values initially 0, also 'start' time of measurement 0 - perf_metrics[0]['start'] = None - # Parse input line by line - for line in line_iter: - if line: - ann_insn, time_info, empty = annotate_insn( - line, gpr_wb_info, fpr_wb_info, fseq_info, perf_metrics, False, - time_info, args.offl, not args.saddr, args.permissive) - if perf_metrics[0]['start'] is None: - perf_metrics[0]['tstart'] = time_info[0] / 1000 - perf_metrics[0]['start'] = time_info[1] - if not empty: - print(ann_insn) - else: - break # 
Nothing more in pipe, EOF - perf_metrics[-1]['tend'] = time_info[0] / 1000 - perf_metrics[-1]['end'] = time_info[1] - # Compute metrics - eval_perf_metrics(perf_metrics) - # Emit metrics - print('\n## Performance metrics') - for idx in range(len(perf_metrics)): - print('\n' + fmt_perf_metrics(perf_metrics, idx, not args.allkeys)) + + with args.output as file: + # Prepare stateful data structures + time_info = None + gpr_wb_info = defaultdict(deque) + fpr_wb_info = defaultdict(deque) + fseq_info = { + 'curr_sec': 0, + 'fpss_pcs': deque(), + 'fseq_pcs': deque(), + 'cfg_buf': deque(), + 'curr_cfg': None + } + perf_metrics = [ + defaultdict(int) + ] # all values initially 0, also 'start' time of measurement 0 + perf_metrics[0]['start'] = None + # Parse input line by line + for line in line_iter: + if line: + ann_insn, time_info, empty = annotate_insn( + line, gpr_wb_info, fpr_wb_info, fseq_info, perf_metrics, False, + time_info, args.offl, not args.saddr, args.permissive) + if perf_metrics[0]['start'] is None: + perf_metrics[0]['tstart'] = time_info[0] / 1000 + perf_metrics[0]['start'] = time_info[1] + if not empty: + print(ann_insn, file=file) + else: + break # Nothing more in pipe, EOF + perf_metrics[-1]['tend'] = time_info[0] / 1000 + perf_metrics[-1]['end'] = time_info[1] + # Compute metrics + eval_perf_metrics(perf_metrics) + # Emit metrics + print('\n## Performance metrics', file=file) + for idx in range(len(perf_metrics)): + print('\n' + fmt_perf_metrics(perf_metrics, idx, not args.allkeys), file=file) if args.dump_perf: with args.dump_perf as file: diff --git a/util/trace/layout_events.py b/util/trace/layout_events.py deleted file mode 100755 index 0d0e91435..000000000 --- a/util/trace/layout_events.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2020 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# This script takes a CSV of events, compatible with the CSV format produced by -# `perf_csv.py`, and creates another CSV of events, where the events are reordered based -# on a layout CSV file and labeled for viewing with the `eventvis.py` script. -# -# Following is an example CSV of events as output by `perf_csv.py`, -# which could be fed as input to this tool: -# -# , 0_tstart, 0_tend, 1_tstart, 1_tend, 2_tstart, 2_tend -# 0, 334, 10940, 10940, 10945, 10945, 10995 -# 1, 2654, 11061, 11061, 11172, 11172, 11189 -# 2, 2654, 11061, 11061, 11172, 11172, 11190 -# 3, 2654, 11061, 11061, 11172, 11172, 11191 -# -# This is an example layout CSV, which could be fed to the tool -# together with the previous CSV: -# -# , dma-in, compute, dma-out -# 0, 0, , -# "range(1,3)", , 1, -# 9, , , 2 -# -# To produce the following output: -# -# , dma_in, , compute, , dma_out, -# 0, 334, 10940, , , , -# 1, , , 11061, 11172, , -# 2, , , 11061, 11172, , -# 3, , , , , 11172, 11191 -# -# The output CSV can be fed directly to `eventvis.py`. 
-# -# Author: Luca Colagrande - -import sys -import argparse -import csv -import pandas as pd -from math import isnan -import hjson - - -def main(): - # Argument parsing - parser = argparse.ArgumentParser() - parser.add_argument( - 'csv', - metavar='', - help='Input CSV file') - parser.add_argument( - 'layout', - metavar='', - help='Layout CSV file') - parser.add_argument( - '--cfg', - type=str, - help='System configuration .hjson file') - parser.add_argument( - '-o', - '--output', - metavar='', - nargs='?', - default='trace.csv', - help='Output CSV file') - args = parser.parse_args() - - # Read input CSV - df = pd.read_csv(args.csv) - - # Read system configuration .hjson file - cfg = None - with open(args.cfg) as cfg_file: - cfg = hjson.load(cfg_file) - - # Output CSV data - data = [] - columns = [] - - # Open layout CSV - with open(args.layout) as layout_f: - layout_reader = csv.reader(layout_f, delimiter=',') - - # Get region labels from layout header - regions = [label for label in next(layout_reader) if label and not label.isspace()] - - # Generate output columns: appropriately spaced region labels - columns = ['hartid'] + [val for label in regions for val in [label, '']] - - # Iterate layout rows - for row in layout_reader: - - # First entry in row is a hart ID or a Python expression - # which generates a list of hart IDs - expr = row[0] - code = compile(expr, "", "eval") - # Symbols must be added to globals to be used in list comprehensions - # see https://bugs.python.org/issue36300 - tids = eval(code, {'cfg': cfg}, {'cfg': cfg}) - if type(tids) == int: - tids = [tids] - - # Iterate hart IDs - for tid in tids: - - # Start output row with hart ID - orow = [tid] - - # Iterate all other cells in layout row (indices of regions to take) - for cell in row[1:]: - - # If the cell is not empty, get start and end times - # of the region from the input CSV and append them to the - # output row. Otherwise, leave cells empty. - if cell and not cell.isspace(): - reg_idx = int(cell) - row_idx = tid - col_idx = 1 + reg_idx * 2 - assert row_idx < df.shape[0], f'Hart ID {row_idx} out of bounds' - assert (col_idx + 1) < df.shape[1],\ - f'Region index {reg_idx} out of bounds for hart {tid}' - assert not isnan(df.iat[row_idx, col_idx]),\ - (f'Region {reg_idx} looks empty for hart {tid},' - f'check whether it was simulated') - orow.append(int(df.iat[row_idx, col_idx])) - orow.append(int(df.iat[row_idx, col_idx + 1])) - else: - orow.append('') - orow.append('') - - data.append(orow) - - # Create output dataframe and write to CSV - df = pd.DataFrame(data, columns=columns) - df.set_index('hartid', inplace=True) - df.sort_index(axis='index', inplace=True) - df.index.name = None - df.to_csv(args.output) - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/util/trace/perf_csv.py b/util/trace/perf_csv.py deleted file mode 100755 index f26e242e2..000000000 --- a/util/trace/perf_csv.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2020 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# This script takes the performance metrics from all cores, in JSON format -# as dumped by the `events.py` or `gen_trace.py` scripts, and merges them -# into a single CSV file for global inspection. 
-# -# Author: Luca Colagrande - - -import sys -import argparse -import re -import json -import pandas as pd - - -HARTID_REGEX = r'hart_([0-9a-f]+)_perf.json' - - -def main(): - # Argument parsing - parser = argparse.ArgumentParser() - parser.add_argument( - '-i', - '--inputs', - metavar='', - nargs='+', - help='Input performance metric dumps') - parser.add_argument( - '-o', - '--output', - metavar='', - nargs='?', - default='perf.csv', - help='Output CSV file') - parser.add_argument( - '--filter', - nargs='*', - help='All and only performance metrics to include in the CSV') - args = parser.parse_args() - - dumps = sorted(args.inputs) - - # Populate a list (one entry per hart) of dictionaries - # enumerating all the performance metrics for each hart - data = [] - index = [] - for dump in dumps: - - # Get hart id from filename and append to index - hartid = int(re.search(HARTID_REGEX, dump).group(1), base=16) - index.append(hartid) - - # Populate dictionary of metrics for the current hart - hart_metrics = {} - with open(dump, 'r') as f: - hart_data = json.load(f) - - # Uniquefy names of performance metrics in each trace - # region by prepending the region index, and merge - # all region metrics in a single dictionary - for i, region in enumerate(hart_data): - - # If filter was provided on the command-line then filter out all - # perf metrics which were not listed - if args.filter: - region = {key: val for (key, val) in region.items() if key in args.filter} - - region_metrics = {f'{i}_{key}': val for (key, val) in region.items()} - hart_metrics.update(region_metrics) - - data.append(hart_metrics) - - # Export data - df = pd.DataFrame.from_records(data, index) - df.to_csv(args.output) - - -if __name__ == '__main__': - sys.exit(main()) From 1039d2056147e9ca4d7320e95d45adecca8afca8 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 3 Apr 2024 13:52:18 +0200 Subject: [PATCH 3/8] trace: Add functionality to parse DMA trace --- target/common/common.mk | 4 +- util/trace/gen_trace.py | 177 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 161 insertions(+), 20 deletions(-) diff --git a/target/common/common.mk b/target/common/common.mk index 18f40ac45..e2c19eeb3 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -225,8 +225,8 @@ clean-perf: clean-visual-trace: rm -f $(VISUAL_TRACE) -$(addprefix $(LOGS_DIR)/,trace_hart_%.txt hart_%_perf.json): $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) - $(DASM) < $< | $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt +$(addprefix $(LOGS_DIR)/,trace_hart_%.txt hart_%_perf.json dma_%_perf.json): $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) + $(DASM) < $< | $(GENTRACE_PY) --permissive --dma-trace $(SIM_DIR)/dma_trace_$*.log --dump-hart-perf $(LOGS_DIR)/hart_$*_perf.json --dump-dma-perf $(LOGS_DIR)/dma_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt # Generate source-code interleaved traces for all harts. Reads the binary from # the logs/.rtlbinary file that is written at start of simulation in the vsim script diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py index 145272382..ad97359ef 100755 --- a/util/trace/gen_trace.py +++ b/util/trace/gen_trace.py @@ -3,13 +3,31 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 # -# Author: Paul Scheffler -# Luca Colagrande -"""Human-readable Snitch trace generator script. 
- -This script takes a trace generated for a Snitch hart and transforms the -additional decode stage info into meaningful annotation. It also counts -and computes various performance metrics up to each mcycle CSR read. +# Authors: Paul Scheffler +# Luca Colagrande +"""Script to generate human-readable instruction traces for Snitch. + +This script takes a trace generated by a Snitch hart +(see `snitch_cc.sv`) and transforms the additional decode stage info +into meaningful annotation. + +It also counts and computes various performance metrics for every +execution region. An execution region is a sequence of instructions. +Every `mcycle` CSR read instruction in your trace implicitly defines +two execution regions, comprising respectively: + +- all instructions executed before the read, up to the previous read +or the first executed instruction +- all instructions executed after the read, up to the next read or +the last executed instruction + +Performance metrics are appended at the end of the generated trace +and can optionally be dumped to a separate JSON file. + +It also computes various performance metrics for every DMA transfer, +provided that the Snitch core is equipped with a tightly-coupled DMA +engine, and the DMA trace logged during simulation is fed to the tool. +DMA performance metrics are dumped to a separate JSON file. """ # TODO: OPER_TYPES and FPU_OPER_TYPES could break: optimization might alter enum mapping @@ -19,9 +37,10 @@ import re import argparse import json +import ast from ctypes import c_int32, c_uint32 from collections import deque, defaultdict -import pathlib +from pathlib import Path EXTRA_WB_WARN = 'WARNING: {} transactions still in flight for {}.' @@ -319,7 +338,7 @@ def load_opcodes(): global _cached_opcodes opcode_file_name = 'opcodes-flt-occamy_CUSTOM.csv' - opcode_file_path = pathlib.Path(__file__).parent.absolute() / opcode_file_name + opcode_file_path = Path(__file__).parent.absolute() / opcode_file_name _cached_opcodes = {} with open(opcode_file_path, 'r') as f: @@ -497,6 +516,109 @@ def flt_lit(num: int, fmt: int, width: int = 6, vlen: int = 1) -> str: return floats[0] +# -------------------- DMA -------------------- + + +# We always assume dma_trans contains at least one incomplete placeholder DMA transaction. +# This incomplete transaction contains default settings. Only upon a DMCPY* instruction +# is the size of the transaction known, completing the transaction. At that point, a new +# incomplete transaction is created, inheriting the configuration settings from the previous +# transaction, which may or may not be overriden before the next DMCPY*. 
+def update_dma(insn, extras, dma_trans): + # Extract instruction mnemonic from full instruction decoding (includes operand registers) + MNEMONIC_REGEX = r'^([\w.]+)\s' + match = re.match(MNEMONIC_REGEX, insn) + if match: + mnemonic = match.group(1) + # Process DMA instruction + if mnemonic in ['dmsrc', 'dmdst', 'dmstr']: + pass + elif mnemonic == 'dmrep': + dma_trans[-1]['rep'] = extras['opa'] + elif mnemonic in ['dmcpy', 'dmcpyi']: + # Create new placeholder transaction to inherit current DMA settings + dma_trans.append(dma_trans[-1].copy()) + # Set size of the transaction + dma_trans[-2]['size'] = extras['opa'] + # Override repetition count if the transaction is configured to be 1D + config = extras['rs2'] + enable_2d = (config & 2) >> 1 + if not enable_2d: + dma_trans[-2]['rep'] = 1 + + +def eval_dma_metrics(dma_trans, dma_trace): + dma_trace = Path(dma_trace) + if dma_trace.exists(): + with open(dma_trace, 'r') as f: + # Initialize variables + compl_transfers = [] + outst_transfers = [] + req_transfer_idx = 0 + req_bytes = 0 + # Iterate lines in DMA trace + for line in f.readlines(): + dma = ast.literal_eval(line) + if 'backend_burst_req_valid' in dma: + # When the first burst in a transfer is granted, we record a new transfer in + # the outstanding transfers queue, with the information obtained from the core + # trace. We record the number of bytes moved by each burst in a transfer, and + # compare the total to the number of bytes moved by the transfer, to count how + # many bursts belong to the current DMA transfer (a number which is difficult + # to pre-compute from the core trace as it depends on address alignments, etc.) + if dma['backend_burst_req_valid'] and dma['backend_burst_req_ready']: + if req_bytes == 0: + n_bytes = dma_trans[req_transfer_idx]['rep'] * \ + dma_trans[req_transfer_idx]['size'] + outst_transfers.append({'tstart': dma['time'], + 'exp_bursts': 0, + 'rec_bursts': 0, + 'bytes': n_bytes}) + req_bytes += dma['backend_burst_req_num_bytes'] + outst_transfers[-1]['exp_bursts'] += 1 + # We move on to the next transfer when the bytes requested by the previous + # bursts match the current transfer size. + if req_bytes == outst_transfers[-1]['bytes']: + req_bytes = 0 + req_transfer_idx += 1 + # Upon a burst completion, we increment the received bursts count. When this + # count matches the expected bursts count of the current transfer we record the + # end time of the transfer and promote the transfer from the outstanding to the + # completed transfers' queue. + if dma['transfer_completed']: + outst_transfers[0]['rec_bursts'] += 1 + if outst_transfers[0]['rec_bursts'] == outst_transfers[0]['exp_bursts']: + outst_transfers[0]['tend'] = dma['time'] + compl_transfer = outst_transfers.pop(0) + compl_transfer.pop('exp_bursts') + compl_transfer.pop('rec_bursts') + compl_transfers.append(compl_transfer) + # Calculate bandwidth of individual transfers + for transfer in compl_transfers: + transfer['cycles'] = transfer['tend'] - transfer['tstart'] + transfer['bw'] = transfer['bytes'] / transfer['cycles'] + # Calculate aggregate bandwidth: total number of bytes transferred while any transfer is + # active (accounts for overlaps between transfers). 
+ prev_trans_end = 0 + active_cycles = 0 + n_bytes = 0 + for transfer in compl_transfers: + # Calculate active cycles, without double-counting overlaps + curr_trans_start, curr_trans_end = transfer['tstart'], transfer['tend'] + if curr_trans_start > prev_trans_end: + active_cycles += curr_trans_end - curr_trans_start + else: + active_cycles += curr_trans_end - prev_trans_end + prev_trans_end = curr_trans_end + # Calculate total number of bytes + n_bytes += transfer['bytes'] + dma_metrics = {} + if active_cycles != 0: + dma_metrics['aggregate_bw'] = n_bytes / active_cycles + dma_metrics['transfers'] = compl_transfers + return dma_metrics + + # -------------------- FPU Sequencer -------------------- @@ -759,7 +881,8 @@ def annotate_insn( annot_fseq_offl: bool = False, # Annotate whenever core offloads to CPU on own line force_hex_addr: bool = True, - permissive: bool = True + permissive: bool = True, + dma_trans: list = [] ) -> (str, tuple, bool ): # Return time info, whether trace line contains no info, and fseq_len match = re.search(TRACE_IN_REGEX, line.strip('\n')) @@ -788,6 +911,7 @@ def annotate_insn( insn, pc_str = ('', '') else: perf_metrics[-1]['snitch_issues'] += 1 + update_dma(insn, extras, dma_trans) # Annotate sequencer elif extras['source'] == TRACE_SRCES['sequencer']: if extras['cbuf_push']: @@ -948,12 +1072,20 @@ def main(): '--permissive', action='store_true', help='Ignore some state-related issues when they occur') - parser.add_argument('-d', - '--dump-perf', - nargs='?', - metavar='file', - type=argparse.FileType('w'), - help='Dump performance metrics as json text.') + parser.add_argument( + '--dma-trace', + help='Path to a DMA trace file' + ) + parser.add_argument( + '--dump-hart-perf', + nargs='?', + type=argparse.FileType('w'), + help='Dump hart performance metrics as json text.' + ) + parser.add_argument( + '--dump-dma-perf', + help='Dump DMA performance metrics as json text.' 
+ ) args = parser.parse_args() line_iter = iter(args.infile.readline, b'') @@ -970,6 +1102,7 @@ def main(): 'cfg_buf': deque(), 'curr_cfg': None } + dma_trans = [{'rep': 1}] perf_metrics = [ defaultdict(int) ] # all values initially 0, also 'start' time of measurement 0 @@ -995,10 +1128,18 @@ def main(): print('\n## Performance metrics', file=file) for idx in range(len(perf_metrics)): print('\n' + fmt_perf_metrics(perf_metrics, idx, not args.allkeys), file=file) + # Emit DMA metrics + if args.dma_trace: + dma_metrics = eval_dma_metrics(dma_trans, args.dma_trace) - if args.dump_perf: - with args.dump_perf as file: + # Dump hart performance metrics to JSON file + if args.dump_hart_perf: + with args.dump_hart_perf as file: file.write(json.dumps(perf_metrics, indent=4)) + # Dump DMA performance metrics to JSON file + if args.dump_dma_perf and dma_metrics is not None: + with open(args.dump_dma_perf, 'w') as file: + file.write(json.dumps(dma_metrics, indent=4)) # Check for any loose ends and warn before exiting seq_isns = len(fseq_info['fseq_pcs']) + len(fseq_info['cfg_buf']) From f4894ec6f9523972ae5037fed51476a88c4fc6f4 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 12 Jul 2024 20:54:09 +0200 Subject: [PATCH 4/8] hw: Enable DMA tracing again after iDMA introduction --- hw/snitch_cluster/src/snitch_cc.sv | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 1bb836e58..76a35113d 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -387,6 +387,7 @@ module snitch_cc #( .NumAxInFlight (DMANumAxInFlight), .DMAReqFifoDepth (DMAReqFifoDepth), .NumChannels (DMANumChannels), + .DMATracing (1), .axi_ar_chan_t (axi_ar_chan_t), .axi_aw_chan_t (axi_aw_chan_t), .axi_req_t (axi_req_t), From ddcdd43af92f3b1ac98e2a0e9bfa2ab55b46d7ab Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Jul 2024 11:49:22 +0200 Subject: [PATCH 5/8] trace: Adapt DMA trace parsing after iDMA introduction --- target/common/common.mk | 2 +- util/trace/gen_trace.py | 79 ++++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/target/common/common.mk b/target/common/common.mk index e2c19eeb3..da1e0423e 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -226,7 +226,7 @@ clean-visual-trace: rm -f $(VISUAL_TRACE) $(addprefix $(LOGS_DIR)/,trace_hart_%.txt hart_%_perf.json dma_%_perf.json): $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) - $(DASM) < $< | $(GENTRACE_PY) --permissive --dma-trace $(SIM_DIR)/dma_trace_$*.log --dump-hart-perf $(LOGS_DIR)/hart_$*_perf.json --dump-dma-perf $(LOGS_DIR)/dma_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt + $(DASM) < $< | $(GENTRACE_PY) --permissive --dma-trace $(SIM_DIR)/dma_trace_$*_00000.log --dump-hart-perf $(LOGS_DIR)/hart_$*_perf.json --dump-dma-perf $(LOGS_DIR)/dma_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt # Generate source-code interleaved traces for all harts. 
Reads the binary from # the logs/.rtlbinary file that is written at start of simulation in the vsim script diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py index ad97359ef..62b9737d9 100755 --- a/util/trace/gen_trace.py +++ b/util/trace/gen_trace.py @@ -554,49 +554,56 @@ def eval_dma_metrics(dma_trans, dma_trace): # Initialize variables compl_transfers = [] outst_transfers = [] - req_transfer_idx = 0 + transfer_idx = 0 + exp_bytes = 0 req_bytes = 0 + bursts_in_transfer = 0 + rec_bursts = 0 # Iterate lines in DMA trace for line in f.readlines(): dma = ast.literal_eval(line) - if 'backend_burst_req_valid' in dma: - # When the first burst in a transfer is granted, we record a new transfer in - # the outstanding transfers queue, with the information obtained from the core - # trace. We record the number of bytes moved by each burst in a transfer, and - # compare the total to the number of bytes moved by the transfer, to count how - # many bursts belong to the current DMA transfer (a number which is difficult - # to pre-compute from the core trace as it depends on address alignments, etc.) - if dma['backend_burst_req_valid'] and dma['backend_burst_req_ready']: - if req_bytes == 0: - n_bytes = dma_trans[req_transfer_idx]['rep'] * \ - dma_trans[req_transfer_idx]['size'] - outst_transfers.append({'tstart': dma['time'], - 'exp_bursts': 0, - 'rec_bursts': 0, - 'bytes': n_bytes}) - req_bytes += dma['backend_burst_req_num_bytes'] - outst_transfers[-1]['exp_bursts'] += 1 - # We move on to the next transfer when the bytes requested by the previous - # bursts match the current transfer size. - if req_bytes == outst_transfers[-1]['bytes']: + time = dma['meta']['time'] + # When the first burst in a transfer is granted, we record a new transfer in + # the outstanding transfers queue, with the information obtained from the core + # trace. We record the number of bytes moved by each burst in a transfer, and + # compare the total to the number of bytes moved by the transfer, to count how + # many bursts belong to the current DMA transfer (a number which is difficult + # to pre-compute from the core trace as it depends on address alignments, etc.) + if dma['backend']['req_valid'] and dma['backend']['req_ready']: + if req_bytes == 0: + exp_bytes = dma_trans[transfer_idx]['rep'] * \ + dma_trans[transfer_idx]['size'] + outst_transfers.append({'tstart': time, + 'bytes': exp_bytes}) + req_bytes += dma['backend']['req_length'] + bursts_in_transfer += 1 + # When the aggregate size of the issued bursts matches the size of the current + # transfer, we record the number of bursts in the transfer. This info is later + # needed to count responses and determine the end time of the transfer. + if req_bytes == exp_bytes: + outst_transfers[-1]['bursts'] = bursts_in_transfer + # Reset the state for the next transfer. req_bytes = 0 - req_transfer_idx += 1 - # Upon a burst completion, we increment the received bursts count. When this - # count matches the expected bursts count of the current transfer we record the - # end time of the transfer and promote the transfer from the outstanding to the - # completed transfers' queue. 
- if dma['transfer_completed']: - outst_transfers[0]['rec_bursts'] += 1 - if outst_transfers[0]['rec_bursts'] == outst_transfers[0]['exp_bursts']: - outst_transfers[0]['tend'] = dma['time'] - compl_transfer = outst_transfers.pop(0) - compl_transfer.pop('exp_bursts') - compl_transfer.pop('rec_bursts') - compl_transfers.append(compl_transfer) + bursts_in_transfer = 0 + transfer_idx += 1 + # Upon a burst completion, we increment the received bursts count. + if dma['backend']['rsp_valid'] and dma['backend']['rsp_ready']: + rec_bursts += 1 + # When the received bursts count matches the expected bursts for the current + # transfer we record the end time of the transfer and promote the transfer + # from the outstanding to the completed transfers' queue. The first response + # may arrive before the last request is issued. To allow for this condition + # we default to -1. + if rec_bursts == outst_transfers[0].get('bursts', -1): + outst_transfers[0]['tend'] = time + compl_transfers.append(outst_transfers.pop(0)) + # Reset the state for the next transfer. + rec_bursts = 0 # Calculate bandwidth of individual transfers for transfer in compl_transfers: transfer['cycles'] = transfer['tend'] - transfer['tstart'] - transfer['bw'] = transfer['bytes'] / transfer['cycles'] + if transfer['bytes'] > 0: + transfer['bw'] = transfer['bytes'] / transfer['cycles'] # Calculate aggregate bandwidth: total number of bytes transferred while any transfer is # active (accounts for overlaps between transfers). prev_trans_end = 0 @@ -1112,7 +1119,7 @@ def main(): if line: ann_insn, time_info, empty = annotate_insn( line, gpr_wb_info, fpr_wb_info, fseq_info, perf_metrics, False, - time_info, args.offl, not args.saddr, args.permissive) + time_info, args.offl, not args.saddr, args.permissive, dma_trans) if perf_metrics[0]['start'] is None: perf_metrics[0]['tstart'] = time_info[0] / 1000 perf_metrics[0]['start'] = time_info[1] From 10628d7c9aef6a1a53593d9efd2e71577f5033cb Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 23 Jul 2024 23:40:02 +0200 Subject: [PATCH 6/8] target: Align Verilator timescale with Questa --- target/common/common.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/target/common/common.mk b/target/common/common.mk index da1e0423e..aac1e5d59 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -79,6 +79,7 @@ VLT_SOURCES = $(shell ${BENDER} script flist ${VLT_BENDER} | ${SED_SRCS}) VLT_BUILDDIR := $(abspath work-vlt) VLT_FESVR = $(VLT_BUILDDIR)/riscv-isa-sim VLT_FLAGS += --timing +VLT_FLAGS += --timescale 1ns/1ps VLT_FLAGS += -Wno-BLKANDNBLK VLT_FLAGS += -Wno-LITENDIAN VLT_FLAGS += -Wno-CASEINCOMPLETE From 0a12023950e97c27c19250b5ddc89f438ba9a6d4 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Jul 2024 14:17:40 +0200 Subject: [PATCH 7/8] treewide: Update iDMA to include tracer --- Bender.lock | 6 +++--- Bender.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Bender.lock b/Bender.lock index f7b6b0080..e6b40378a 100644 --- a/Bender.lock +++ b/Bender.lock @@ -7,7 +7,7 @@ packages: dependencies: - common_cells axi: - revision: 41859549968db0147062b6c3cdbe4af00080cc09 + revision: 4e54ac6766b160217a83a74d5a23af9bbf59e6ee version: null source: Git: https://github.com/pulp-platform/axi @@ -71,8 +71,8 @@ packages: dependencies: - common_cells idma: - revision: c12caf59bb482fe44b27361f6924ad346b2d22fe - version: 0.6.3 + revision: 0a0c113434aee743923d9e85631de009b4f00847 + version: null source: Git: https://github.com/pulp-platform/iDMA dependencies: diff 
--git a/Bender.yml b/Bender.yml index 660175daa..350f78c5a 100644 --- a/Bender.yml +++ b/Bender.yml @@ -27,7 +27,7 @@ dependencies: tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.11 } riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 } cluster_icache: { git: https://github.com/pulp-platform/cluster_icache.git, version: 0.1.0 } - idma: { git: https://github.com/pulp-platform/iDMA, version: 0.6.3 } + idma: { git: https://github.com/pulp-platform/iDMA, rev: snitch-tracing } export_include_dirs: - hw/reqrsp_interface/include From eafe83e15b37f3605d41ca8f7f5d5e214d4d33b1 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 9 Aug 2024 12:57:42 +0200 Subject: [PATCH 8/8] target: Generate iDMA dependencies --- .github/workflows/ci.yml | 1 + .github/workflows/lint.yml | 1 + .gitlab-ci.yml | 9 +++++---- Makefile | 2 +- iis-setup.sh | 4 +--- target/common/common.mk | 5 ++--- target/snitch_cluster/Makefile | 10 ++++++++++ 7 files changed, 21 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4613be05d..6f5efe205 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,6 +56,7 @@ jobs: - name: Build Hardware working-directory: target/snitch_cluster run: | + pip install -r $(bender path idma)/requirements.txt make CFG_OVERRIDE=cfg/github-ci.hjson VLT_JOBS=1 bin/snitch_cluster.vlt - name: Run Tests working-directory: target/snitch_cluster diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 023c20fd8..cbe6372b7 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -54,6 +54,7 @@ jobs: - name: Generate RTL sources working-directory: target/snitch_cluster run: | + pip install -r $(bender path idma)/requirements.txt make rtl # For some reason, the checkout is done by a different user, # than that running `git diff` (root, possibly due to Docker). diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 656558761..adf33c858 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,6 +15,7 @@ variables: before_script: source iis-setup.sh + pip install $($BENDER path idma)/requirements.txt ############## # Build docs # @@ -158,7 +159,7 @@ snitch-cluster-mchan-vsim: # Non-free # ############ -nonfree: - script: - - make nonfree - - make elab +# nonfree: +# script: +# - make nonfree +# - make elab diff --git a/Makefile b/Makefile index dcd0d6ef1..2d89360e3 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ ROOT = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) ############ NONFREE_REMOTE ?= git@iis-git.ee.ethz.ch:pulp-restricted/snitch-cluster-nonfree.git -NONFREE_COMMIT ?= e30961e20a23a76442da27d2ba07c9fe20f3b575 +NONFREE_COMMIT ?= 214380a8c6af4cdd3d53a5b74cfaf462da877be2 NONFREE_DIR = $(ROOT)/nonfree all: nonfree diff --git a/iis-setup.sh b/iis-setup.sh index d56886c72..cf376b1a1 100755 --- a/iis-setup.sh +++ b/iis-setup.sh @@ -20,11 +20,9 @@ source .venv/bin/activate # occurring when the /tmp folder is filled by other processes. 
mkdir tmp TMPDIR=tmp pip install -r python-requirements.txt +TMPDIR=tmp pip install -r $($BENDER path idma)/requirements.txt rm -rf tmp -# Bender initialization -$BENDER vendor init - # Install spike-dasm mkdir tools/ cd tools/ diff --git a/target/common/common.mk b/target/common/common.mk index aac1e5d59..4c2b2c95b 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -47,9 +47,8 @@ VLT_ROOT ?= ${VERILATOR_ROOT} VLT_JOBS ?= $(shell nproc) VLT_NUM_THREADS ?= 1 -MATCH_END := '/+incdir+/ s/$$/\/*\/*/' -MATCH_BGN := 's/+incdir+//g' -SED_SRCS := sed -e ${MATCH_END} -e ${MATCH_BGN} +MATCH_REMOVE := 's/+incdir+\/[^ ]*//g' +SED_SRCS := sed -e ${MATCH_REMOVE} COMMON_BENDER_FLAGS += -t rtl diff --git a/target/snitch_cluster/Makefile b/target/snitch_cluster/Makefile index 60c49d8dc..552e169b5 100644 --- a/target/snitch_cluster/Makefile +++ b/target/snitch_cluster/Makefile @@ -151,9 +151,19 @@ include $(ROOT)/target/snitch_cluster/sw.mk # RTL # ####### +# Include iDMA Makefile to generate prerequisite iDMA sources +include $(shell $(BENDER) path idma)/idma.mk + GENERATED_RTL_SOURCES = $(PERIPH_DIR)/snitch_cluster_peripheral_reg_top.sv GENERATED_RTL_SOURCES += $(PERIPH_DIR)/snitch_cluster_peripheral_reg_pkg.sv GENERATED_RTL_SOURCES += $(GENERATED_DIR)/snitch_cluster_wrapper.sv +GENERATED_RTL_SOURCES += $(IDMA_ROOT)/target/rtl/idma_inst64_top.sv +GENERATED_RTL_SOURCES += $(IDMA_ROOT)/target/rtl/include/idma/tracer.svh + +# Add dependency on DMA header files +VSIM_SOURCES += $(IDMA_ROOT)/target/rtl/include/idma/tracer.svh +VLT_SOURCES += $(IDMA_ROOT)/target/rtl/include/idma/tracer.svh +VCS_SOURCES += $(IDMA_ROOT)/target/rtl/include/idma/tracer.svh .PHONY: rtl clean-rtl
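
For reference, a minimal sketch of how the reworked trace and benchmarking flow introduced by this series is meant to be driven from target/snitch_cluster, using the Make targets defined in target/common/common.mk above. It assumes a simulation has already been run, so that logs/trace_hart_*.dasm and the iDMA trace logs exist; the ROI_SPEC and CFG variables are not defined anywhere in this series and are treated as placeholders here.

    # Per-hart text traces plus hart/DMA performance dumps
    # (runs gen_trace.py on every logs/trace_hart_*.dasm file).
    make traces

    # Source-code-interleaved traces (annotate.py).
    make annotate

    # Merge all hart_*_perf.json and dma_*_perf.json dumps into logs/perf.json (join.py).
    make perf

    # Filter and label regions of interest and emit a Chrome Trace-Viewer JSON
    # (roi.py followed by visualize.py); ROI_SPEC is assumed to point to a
    # specification such as util/bench/tests/test_data/spec.json.
    make visual-trace ROI_SPEC=<roi_spec.json>
    # Open about:tracing in Chrome and load logs/trace.json to inspect the result.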
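
The same steps can also be reproduced by invoking the utilities directly, mirroring the recipes in common.mk. The concrete paths below (hart 00000, the DMA trace log name, the spec/config files and the ELF) are illustrative placeholders only, and spike-dasm is assumed to be what $(DASM) resolves to.

    # Human-readable trace plus hart/DMA performance dumps for one hart:
    spike-dasm < logs/trace_hart_00000.dasm | \
        ../../util/trace/gen_trace.py --permissive \
            --dma-trace dma_trace_00000_00000.log \
            --dump-hart-perf logs/hart_00000_perf.json \
            --dump-dma-perf logs/dma_00000_perf.json \
            -o logs/trace_hart_00000.txt

    # Join all per-thread performance dumps into a single JSON:
    ../../util/bench/join.py -i logs/*_perf.json -o logs/perf.json

    # Filter and label regions of interest according to a (possibly templated) spec,
    # rendered with the hardware configuration file:
    ../../util/bench/roi.py logs/perf.json <spec.json> --cfg <cfg.json> -o logs/roi.json

    # Emit the Trace-Viewer JSON, optionally embedding instruction-level events:
    ../../util/bench/visualize.py logs/roi.json --traces logs/trace_hart_*.txt \
        --elf <binary.elf> -o logs/trace.json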