diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-docker.yml index a4760147d..38d296200 100644 --- a/.github/workflows/build-docker.yml +++ b/.github/workflows/build-docker.yml @@ -6,7 +6,7 @@ name: build-docker on: push: - branches: [main] + branches: [tracer/dma] workflow_dispatch: jobs: build-docker: @@ -27,6 +27,6 @@ jobs: context: . file: util/container/Dockerfile push: true - tags: ghcr.io/pulp-platform/snitch_cluster:${{ github.ref_name }} + tags: ghcr.io/pulp-platform/snitch_cluster:tracer-dma build-args: |- SNITCH_LLVM_VERSION=latest diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f8f87b3f8..c2088aac5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,12 +15,26 @@ jobs: name: Build documentation runs-on: ubuntu-22.04 container: - image: ghcr.io/pulp-platform/snitch_cluster:main + image: ghcr.io/pulp-platform/snitch_cluster:tracer-dma steps: - uses: actions/checkout@v2 - name: Build docs run: make docs + ##################### + # Python unit tests # + ##################### + + pytest: + name: Python unit tests + runs-on: ubuntu-22.04 + container: + image: ghcr.io/pulp-platform/snitch_cluster:tracer-dma + steps: + - uses: actions/checkout@v2 + - name: Run pytest + run: pytest + ############################################## # Simulate SW on Snitch Cluster w/ Verilator # ############################################## @@ -29,7 +43,7 @@ jobs: name: Simulate SW on Snitch Cluster w/ Verilator runs-on: ubuntu-22.04 container: - image: ghcr.io/pulp-platform/snitch_cluster:main + image: ghcr.io/pulp-platform/snitch_cluster:tracer-dma steps: - uses: actions/checkout@v2 with: @@ -54,7 +68,7 @@ jobs: name: Simulate SW on Snitch Cluster w/ Banshee runs-on: ubuntu-22.04 container: - image: ghcr.io/pulp-platform/snitch_cluster:main + image: ghcr.io/pulp-platform/snitch_cluster:tracer-dma steps: - uses: actions/checkout@v2 with: diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 18cd5d4aa..784183381 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -40,6 +40,14 @@ docs: script: - make docs +##################### +# Python unit tests # +##################### + +pytest: + script: + - pytest + ################################# # Build Snitch cluster software # ################################# diff --git a/docs/rm/bench/join.md b/docs/rm/bench/join.md new file mode 100644 index 000000000..ee9aa8221 --- /dev/null +++ b/docs/rm/bench/join.md @@ -0,0 +1 @@ +::: join \ No newline at end of file diff --git a/docs/rm/bench/roi.md b/docs/rm/bench/roi.md new file mode 100644 index 000000000..239fedf30 --- /dev/null +++ b/docs/rm/bench/roi.md @@ -0,0 +1 @@ +::: roi \ No newline at end of file diff --git a/docs/rm/bench/visualize.md b/docs/rm/bench/visualize.md new file mode 100644 index 000000000..b2c2bed8b --- /dev/null +++ b/docs/rm/bench/visualize.md @@ -0,0 +1 @@ +::: visualize \ No newline at end of file diff --git a/docs/rm/trace/annotate.md b/docs/rm/trace/annotate.md new file mode 100644 index 000000000..b70b1a847 --- /dev/null +++ b/docs/rm/trace/annotate.md @@ -0,0 +1 @@ +::: annotate \ No newline at end of file diff --git a/docs/rm/trace/events.md b/docs/rm/trace/events.md new file mode 100644 index 000000000..5b9cca4ae --- /dev/null +++ b/docs/rm/trace/events.md @@ -0,0 +1 @@ +::: events \ No newline at end of file diff --git a/docs/rm/trace/gen_trace.md b/docs/rm/trace/gen_trace.md new file mode 100644 index 000000000..3ba7b50eb --- /dev/null +++ b/docs/rm/trace/gen_trace.md @@ -0,0 +1 @@ +::: gen_trace \ No newline at end 
of file diff --git a/hw/future/src/dma/axi_dma_backend.sv b/hw/future/src/dma/axi_dma_backend.sv index b8cfa81dc..09a27f0a9 100644 --- a/hw/future/src/dma/axi_dma_backend.sv +++ b/hw/future/src/dma/axi_dma_backend.sv @@ -288,7 +288,6 @@ module axi_dma_backend #( //-------------------------------------- //pragma translate_off `ifndef SYNTHESYS -`ifndef VERILATOR generate if (DmaTracing) begin : gen_dma_tracer string fn; @@ -595,7 +594,6 @@ module axi_dma_backend #( end end endgenerate -`endif `endif //pragma translate_on endmodule : axi_dma_backend diff --git a/mkdocs.yml b/mkdocs.yml index 70d213601..158e453b6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,7 +25,7 @@ plugins: - mkdocstrings: handlers: python: - paths: [util/sim] + paths: [util/sim, util/trace, util/bench] - macros: on_error_fail: true use_directory_urls: false @@ -57,6 +57,14 @@ nav: - sim_utils: rm/sim/sim_utils.md - rm/sim/Simulation.md - rm/sim/Simulator.md + - Trace Utilities: + - gen_trace.py: rm/trace/gen_trace.md + - annotate.py: rm/trace/annotate.md + - events.py: rm/trace/events.md + - Benchmarking Utilities: + - join.py: rm/bench/join.md + - roi.py: rm/bench/roi.md + - visualize.py: rm/bench/visualize.md - Snitch Runtime: - Pages: runtime/Pages/index.md - Files: runtime/Files/index.md diff --git a/python-requirements.txt b/python-requirements.txt index 6db0bf03f..2c28c2a25 100644 --- a/python-requirements.txt +++ b/python-requirements.txt @@ -2,24 +2,28 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 +# Keep sorted. bin2coe dataclasses editorconfig-checker==2.3.51 flake8 gitpython hjson +json5 jsonref jsonschema mako +matplotlib +pandas progressbar2 -tabulate -yamllint -pyyaml +psutil +pyelftools pytablewriter +pytest +pyyaml +tabulate termcolor -pandas -pyelftools -psutil +yamllint -r docs/requirements.txt -r sw/dnn/requirements.txt diff --git a/sw/snRuntime/api/sync_decls.h b/sw/snRuntime/api/sync_decls.h index 2ece472ed..9b8ea6beb 100644 --- a/sw/snRuntime/api/sync_decls.h +++ b/sw/snRuntime/api/sync_decls.h @@ -9,6 +9,7 @@ typedef struct { extern volatile uint32_t _snrt_mutex; extern volatile snrt_barrier_t _snrt_barrier; +extern volatile uint32_t _reduction_result; inline volatile uint32_t *snrt_mutex(); diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c index 4e4cd2152..582e93b8e 100644 --- a/sw/snRuntime/src/start.c +++ b/sw/snRuntime/src/start.c @@ -97,10 +97,14 @@ static inline void snrt_init_libs() { snrt_alloc_init(); } #endif #ifdef SNRT_CRT0_EXIT -static inline void snrt_exit(int exit_code) { +static inline void snrt_exit_default(int exit_code) { + exit_code = snrt_global_all_to_all_reduction(exit_code); if (snrt_global_core_idx() == 0) *(snrt_exit_code_destination()) = (exit_code << 1) | 1; } +#ifndef SNRT_CRT0_ALTERNATE_EXIT +static inline void snrt_exit(int exit_code) { snrt_exit_default(exit_code); } +#endif #endif void snrt_main() { diff --git a/sw/snRuntime/src/sync.c b/sw/snRuntime/src/sync.c index 58b079268..5d7173a9f 100644 --- a/sw/snRuntime/src/sync.c +++ b/sw/snRuntime/src/sync.c @@ -8,6 +8,7 @@ volatile uint32_t _snrt_mutex; volatile snrt_barrier_t _snrt_barrier; +volatile uint32_t _reduction_result; //================================================================================ // Functions diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h index 07eea700f..7557a57e9 100644 --- a/sw/snRuntime/src/sync.h +++ b/sw/snRuntime/src/sync.h @@ -81,6 +81,12 @@ inline void snrt_global_barrier() { 
snrt_cluster_hw_barrier(); } +inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) { + __atomic_add_fetch(&_reduction_result, value, __ATOMIC_RELAXED); + snrt_global_barrier(); + return _reduction_result; +} + /** * @brief Generic barrier * diff --git a/sw/tests/event_unit.c b/sw/tests/event_unit.c index 4e6695414..18586b3b7 100644 --- a/sw/tests/event_unit.c +++ b/sw/tests/event_unit.c @@ -9,7 +9,6 @@ volatile static uint32_t sum = 0; static void task(void *arg, uint32_t argc) { uint32_t arg0 = ((uint32_t *)arg)[0]; __atomic_add_fetch(&sum, arg0, __ATOMIC_RELAXED); - printf("work arg[0] = %d argc = %d\n", arg0, argc); } uint32_t run_and_verify_task(uint32_t *arg, uint32_t n_workers) { diff --git a/sw/tests/fp16_comparison_scalar.c b/sw/tests/fp16_comparison_scalar.c index 925cf11d7..3cd6f437a 100644 --- a/sw/tests/fp16_comparison_scalar.c +++ b/sw/tests/fp16_comparison_scalar.c @@ -6,9 +6,9 @@ #include "printf.h" int main() { - int errs = 40; - if (snrt_is_compute_core()) { + int errs = 40; + uint32_t i8a = 0xFFFF4248; // 3.14 uint32_t i8an = 0xFFFFC248; // -3.14 uint32_t i8b = 0xFFFF3E79; // 1.618 @@ -210,7 +210,8 @@ int main() { errs -= (cmp0 == 0x1); errs -= (cmp1 == 0x1); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp16_comparison_vector.c b/sw/tests/fp16_comparison_vector.c index 2bb25993b..565957f23 100644 --- a/sw/tests/fp16_comparison_vector.c +++ b/sw/tests/fp16_comparison_vector.c @@ -6,9 +6,9 @@ #include "printf.h" int main() { - int errs = 64; - if (snrt_is_compute_core()) { + int errs = 64; + uint32_t fa16 = 0x4048F5C3; // 0x4248 3.14 uint32_t fa16n = 0xC048F5C3; // 0xC248 -3.14 uint32_t fb16 = 0x3FCF1AA0; // 0x3E79 1.618 @@ -287,7 +287,8 @@ int main() { "vfeq.h %1, ft8, ft0\n" : "+r"(cmp0)); errs -= (cmp0 == 0xf); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp16_computation_scalar.c b/sw/tests/fp16_computation_scalar.c index f5eeab74e..2107e5474 100644 --- a/sw/tests/fp16_computation_scalar.c +++ b/sw/tests/fp16_computation_scalar.c @@ -4,9 +4,9 @@ #include int main() { - int errs = 33; - if (snrt_is_compute_core()) { + int errs = 33; + uint32_t i_a = 0xFFFF4248; // 3.14 uint32_t i_an = 0xFFFFC248; // -3.14 uint32_t i_b = 0xFFFF3E79; // 1.618 @@ -317,7 +317,8 @@ int main() { "feq.h %0, ft3, ft0\n" : "+r"(res0)); errs -= (res0 == 0x1); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp16_computation_vector.c b/sw/tests/fp16_computation_vector.c index 20ba1c54c..ae1f6c29b 100644 --- a/sw/tests/fp16_computation_vector.c +++ b/sw/tests/fp16_computation_vector.c @@ -4,9 +4,9 @@ #include "snrt.h" int main() { - int errs = 46; - if (snrt_is_compute_core()) { + int errs = 46; + uint32_t i_a = 0x4048F5C3; // 3.14 0 uint32_t i_an = 0xC048F5C3; // -3.14 uint32_t i_b = 0x3FCF1AA0; // 1.618 2 @@ -563,7 +563,8 @@ int main() { "vfeq.h %0, ft7, ft0\n" : "+r"(res0)); errs -= (res0 == 0xf); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp16alt_comparison_scalar.c b/sw/tests/fp16alt_comparison_scalar.c index 9985f0c24..e3496cf64 100644 --- a/sw/tests/fp16alt_comparison_scalar.c +++ b/sw/tests/fp16alt_comparison_scalar.c @@ -6,9 +6,9 @@ #include "printf.h" int main() { - int errs = 40; - if (snrt_is_compute_core()) { + int errs = 40; + uint32_t i8a = 0xFFFF4049; // 3.14 uint32_t i8an = 0xFFFFC049; // -3.14 uint32_t i8b = 0xFFFF3FCF; // 1.618 @@ -213,7 +213,8 @@ int main() { errs -= (cmp0 == 0x1); errs -= (cmp1 == 0x1); - } - return errs; + return errs; + } + return 0; } 
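All of the floating-point test rewrites in this patch follow the same pattern, sketched schematically below (a minimal sketch, not one of the actual tests): `errs` now lives inside the compute-core branch, and non-compute cores return 0 explicitly. This matters because `snrt_exit_default` above now reduces the exit codes of all cores with `snrt_global_all_to_all_reduction`, so a DMA core falling through with the initial, nonzero `errs` value would corrupt the summed exit code.

```c
#include "snrt.h"

int main() {
    if (snrt_is_compute_core()) {
        int errs = 33;  // one decrement per expected-pass check
        // ... run the FP checks, decrementing errs for each correct result ...
        return errs;    // 0 if and only if all checks passed
    }
    // Non-compute (e.g. DMA) cores must not pollute the reduced exit code.
    return 0;
}
```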
diff --git a/sw/tests/fp16alt_comparison_vector.c b/sw/tests/fp16alt_comparison_vector.c index ad504cae7..a612382fb 100644 --- a/sw/tests/fp16alt_comparison_vector.c +++ b/sw/tests/fp16alt_comparison_vector.c @@ -6,9 +6,9 @@ #include "printf.h" int main() { - int errs = 64; - if (snrt_is_compute_core()) { + int errs = 64; + uint32_t fa16 = 0x4048F5C3; // 0x4248 3.14 uint32_t fa16n = 0xC048F5C3; // 0xC248 -3.14 uint32_t fb16 = 0x3FCF1AA0; // 0x3E79 1.618 @@ -289,7 +289,8 @@ int main() { "vfeq.ah %1, ft8, ft0\n" : "+r"(cmp0)); errs -= (cmp0 == 0xf); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp16alt_computation_scalar.c b/sw/tests/fp16alt_computation_scalar.c index 323bfd9d7..49e6130fe 100644 --- a/sw/tests/fp16alt_computation_scalar.c +++ b/sw/tests/fp16alt_computation_scalar.c @@ -4,9 +4,9 @@ #include int main() { - int errs = 33; - if (snrt_is_compute_core()) { + int errs = 33; + uint32_t i_a = 0xFFFF4049; // 3.14 uint32_t i_an = 0xFFFFC049; // -3.14 uint32_t i_b = 0xFFFF3FCF; // 1.618 @@ -319,7 +319,8 @@ int main() { "feq.ah %0, ft3, ft0\n" : "+r"(res0)); errs -= (res0 == 0x1); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp16alt_computation_vector.c b/sw/tests/fp16alt_computation_vector.c index 3e1740a28..6ddfed0d6 100644 --- a/sw/tests/fp16alt_computation_vector.c +++ b/sw/tests/fp16alt_computation_vector.c @@ -4,9 +4,9 @@ #include int main() { - int errs = 46; - if (snrt_is_compute_core()) { + int errs = 46; + uint32_t i_a = 0x4048F5C3; // 3.14 0 uint32_t i_an = 0xC048F5C3; // -3.14 uint32_t i_b = 0x3FCF1AA0; // 1.618 2 @@ -566,7 +566,8 @@ int main() { "vfeq.ah %0, ft7, ft0\n" : "+r"(res0)); errs -= (res0 == 0xf); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp32_comparison_scalar.c b/sw/tests/fp32_comparison_scalar.c index d06cdf51a..5560f3d33 100644 --- a/sw/tests/fp32_comparison_scalar.c +++ b/sw/tests/fp32_comparison_scalar.c @@ -6,9 +6,9 @@ #include "printf.h" int main() { - int errs = 40; - if (snrt_is_compute_core()) { + int errs = 40; + uint32_t i8a = 0x4048F5C3; // 3.14 uint32_t i8an = 0xC048F5C3; // -3.14 uint32_t i8b = 0x3FCF1AA0; // 1.618 @@ -210,7 +210,8 @@ int main() { errs -= (cmp0 == 0x1); errs -= (cmp1 == 0x1); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp32_comparison_vector.c b/sw/tests/fp32_comparison_vector.c index d04c04a65..431f1b3ef 100644 --- a/sw/tests/fp32_comparison_vector.c +++ b/sw/tests/fp32_comparison_vector.c @@ -6,9 +6,9 @@ #include "printf.h" int main() { - int errs = 64; - if (snrt_is_compute_core()) { + int errs = 64; + uint32_t fa32 = 0x4048F5C3; // 0x4248 3.14 uint32_t fa32n = 0xC048F5C3; // 0xC248 -3.14 uint32_t fb32 = 0x3FCF1AA0; // 0x3E79 1.618 @@ -283,7 +283,8 @@ int main() { "vfeq.s %1, ft8, ft0\n" : "+r"(cmp0)); errs -= (cmp0 == 3); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp32_computation_scalar.c b/sw/tests/fp32_computation_scalar.c index b0f3267a3..231f8a2f8 100644 --- a/sw/tests/fp32_computation_scalar.c +++ b/sw/tests/fp32_computation_scalar.c @@ -4,9 +4,9 @@ #include int main() { - int errs = 33; - if (snrt_is_compute_core()) { + int errs = 33; + uint32_t i_a = 0x4048F5C3; // 3.14 uint32_t i_an = 0xC048F5C3; // -3.14 uint32_t i_b = 0x3FCF1AA0; // 1.618 @@ -317,7 +317,8 @@ int main() { "feq.s %0, ft3, ft0\n" : "+r"(res0)); errs -= (res0 == 0x1); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp32_computation_vector.c b/sw/tests/fp32_computation_vector.c index 
b717ee287..da699747a 100644 --- a/sw/tests/fp32_computation_vector.c +++ b/sw/tests/fp32_computation_vector.c @@ -4,9 +4,9 @@ #include int main() { - int errs = 46; - if (snrt_is_compute_core()) { + int errs = 46; + uint32_t i_a = 0x4048F5C3; // 3.14 0 uint32_t i_an = 0xC048F5C3; // -3.14 uint32_t i_b = 0x3FCF1AA0; // 1.618 2 @@ -531,7 +531,8 @@ int main() { "vfeq.s %0, ft7, ft0\n" : "+r"(res0)); errs -= (res0 == 0x3); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp32_conversions_scalar.c b/sw/tests/fp32_conversions_scalar.c index 14c52f1f7..ca5783095 100644 --- a/sw/tests/fp32_conversions_scalar.c +++ b/sw/tests/fp32_conversions_scalar.c @@ -8,9 +8,9 @@ typedef float v2s __attribute__((vector_size(8))); int main() { - int errs = 48; - if (snrt_is_compute_core()) { + int errs = 48; + unsigned int res_cvt0 = 0; unsigned int res_cvt1 = 0; @@ -480,7 +480,8 @@ int main() { : "+r"(res_cvt0), "+r"(res_cvt1)); errs -= (res_cvt0 == 0x1); errs -= (res_cvt1 == 0x1); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp64_conversions_scalar.c b/sw/tests/fp64_conversions_scalar.c index 49ec4c64d..44a95c82d 100644 --- a/sw/tests/fp64_conversions_scalar.c +++ b/sw/tests/fp64_conversions_scalar.c @@ -8,9 +8,9 @@ typedef float v2s __attribute__((vector_size(8))); int main() { - int errs = 48; - if (snrt_is_compute_core()) { + int errs = 48; + unsigned int res_cvt0 = 0; unsigned int res_cvt1 = 0; @@ -495,7 +495,8 @@ int main() { "+f"(fvalue_negative)); errs -= (res_cvt0 == 0x1); errs -= (res_cvt1 == 0x1); - } + return errs; + } return 0; } diff --git a/sw/tests/fp8_comparison_scalar.c b/sw/tests/fp8_comparison_scalar.c index 60caca400..bcabab874 100644 --- a/sw/tests/fp8_comparison_scalar.c +++ b/sw/tests/fp8_comparison_scalar.c @@ -5,9 +5,9 @@ #include "snrt.h" int main() { - int errs = 40; - if (snrt_is_compute_core()) { + int errs = 40; + uint32_t i8a = 0xFFFFFF42; // 3.14 uint32_t i8an = 0xFFFFFFC2; // -3.14 uint32_t i8b = 0xFFFFFF3E; // 1.618 @@ -209,7 +209,8 @@ int main() { errs -= (cmp0 == 0x1); errs -= (cmp1 == 0x1); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp8_comparison_vector.c b/sw/tests/fp8_comparison_vector.c index 156d9bb78..9dc77404d 100644 --- a/sw/tests/fp8_comparison_vector.c +++ b/sw/tests/fp8_comparison_vector.c @@ -6,9 +6,9 @@ #include "printf.h" int main() { - int errs = 64; - if (snrt_is_compute_core()) { + int errs = 64; + uint32_t fa8 = 0x4048F5C3; // 0x4248 3.14 uint32_t fa8n = 0xC048F5C3; // 0xC248 -3.14 uint32_t fb8 = 0x3FCF1AA0; // 0x3E79 1.618 @@ -295,7 +295,8 @@ int main() { "vfeq.b %1, ft8, ft0\n" : "+r"(cmp0)); errs -= (cmp0 == 0xff); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp8_computation_scalar.c b/sw/tests/fp8_computation_scalar.c index a82c4103a..de2b2afc0 100644 --- a/sw/tests/fp8_computation_scalar.c +++ b/sw/tests/fp8_computation_scalar.c @@ -4,9 +4,9 @@ #include int main() { - int errs = 33; - if (snrt_is_compute_core()) { + int errs = 33; + uint32_t i_a = 0xFFFFFF42; // 3.14 uint32_t i_an = 0xFFFFFFC2; // -3.14 uint32_t i_b = 0xFFFFFF3E; // 1.618 @@ -317,7 +317,8 @@ int main() { "feq.b %0, ft3, ft0\n" : "+r"(res0)); errs -= (res0 == 0x1); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp8_computation_vector.c b/sw/tests/fp8_computation_vector.c index 4a2f602d0..d62edced3 100644 --- a/sw/tests/fp8_computation_vector.c +++ b/sw/tests/fp8_computation_vector.c @@ -4,9 +4,9 @@ #include int main() { - int errs = 46; - if 
(snrt_is_compute_core()) { + int errs = 46; + uint32_t i_a = 0x4048F5C3; // 3.14 0 uint32_t i_an = 0xC048F5C3; // -3.14 uint32_t i_b = 0x3FCF1AA0; // 1.618 2 @@ -631,7 +631,8 @@ int main() { "vfeq.b %0, ft7, ft0\n" : "+r"(res0)); errs -= (res0 == 0xff); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp8alt_comparison_scalar.c b/sw/tests/fp8alt_comparison_scalar.c index 6f367a7d9..d4c29b992 100644 --- a/sw/tests/fp8alt_comparison_scalar.c +++ b/sw/tests/fp8alt_comparison_scalar.c @@ -6,9 +6,9 @@ #include "printf.h" int main() { - int errs = 40; - if (snrt_is_compute_core()) { + int errs = 40; + uint32_t i8a = 0xFFFFFF45; // 3.14 uint32_t i8an = 0xFFFFFFC5; // -3.14 uint32_t i8b = 0xFFFFFF3D; // 1.618 @@ -212,7 +212,8 @@ int main() { errs -= (cmp0 == 0x1); errs -= (cmp1 == 0x1); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp8alt_comparison_vector.c b/sw/tests/fp8alt_comparison_vector.c index 25e27bfb9..ee27dddfd 100644 --- a/sw/tests/fp8alt_comparison_vector.c +++ b/sw/tests/fp8alt_comparison_vector.c @@ -6,9 +6,9 @@ #include "printf.h" int main() { - int errs = 64; - if (snrt_is_compute_core()) { + int errs = 64; + uint32_t fa8 = 0x4048F5C3; // 0x4248 3.14 uint32_t fa8n = 0xC048F5C3; // 0xC248 -3.14 uint32_t fb8 = 0x3FCF1AA0; // 0x3E79 1.618 @@ -297,7 +297,8 @@ int main() { "vfeq.ab %1, ft8, ft0\n" : "+r"(cmp0)); errs -= (cmp0 == 0xff); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp8alt_computation_scalar.c b/sw/tests/fp8alt_computation_scalar.c index 4b79aac72..7bc93ae62 100644 --- a/sw/tests/fp8alt_computation_scalar.c +++ b/sw/tests/fp8alt_computation_scalar.c @@ -4,9 +4,9 @@ #include int main() { - int errs = 33; - if (snrt_is_compute_core()) { + int errs = 33; + uint32_t i_a = 0xFFFFFF45; // 3.14 uint32_t i_an = 0xFFFFFFC5; // -3.14 uint32_t i_b = 0xFFFFFF3D; // 1.618 @@ -319,7 +319,8 @@ int main() { "feq.ab %0, ft3, ft0\n" : "+r"(res0)); errs -= (res0 == 0x1); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/fp8alt_computation_vector.c b/sw/tests/fp8alt_computation_vector.c index 15da6ec04..1dea86586 100644 --- a/sw/tests/fp8alt_computation_vector.c +++ b/sw/tests/fp8alt_computation_vector.c @@ -4,9 +4,9 @@ #include int main() { - int errs = 46; - if (snrt_is_compute_core()) { + int errs = 46; + uint32_t i_a = 0x4048F5C3; // 3.14 0 uint32_t i_an = 0xC048F5C3; // -3.14 uint32_t i_b = 0x3FCF1AA0; // 1.618 2 @@ -635,7 +635,8 @@ int main() { "vfeq.ab %0, ft7, ft0\n" : "+r"(res0)); errs -= (res0 == 0xff); - } - return errs; + return errs; + } + return 0; } diff --git a/sw/tests/team_global.c b/sw/tests/team_global.c index a03fc8de1..cf9da6b89 100644 --- a/sw/tests/team_global.c +++ b/sw/tests/team_global.c @@ -15,7 +15,7 @@ int main() { uint32_t errors = 0; errors += (snrt_global_core_idx() != i); errors += (snrt_global_core_num() != 9); - errors += (snrt_cluster_idx() != i / 1); + errors += (snrt_cluster_idx() != i / 9); errors += (snrt_cluster_num() != 1); errors += (snrt_cluster_core_idx() != i % 9); errors += (snrt_cluster_core_num() != 9); diff --git a/target/common/common.mk b/target/common/common.mk index 0cf03c463..143f9b9a8 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -31,9 +31,9 @@ VLIB ?= $(QUESTA_SEPP) vlib GENTRACE_PY ?= $(UTIL_DIR)/trace/gen_trace.py ANNOTATE_PY ?= $(UTIL_DIR)/trace/annotate.py EVENTS_PY ?= $(UTIL_DIR)/trace/events.py -PERF_CSV_PY ?= $(UTIL_DIR)/trace/perf_csv.py -LAYOUT_EVENTS_PY ?= $(UTIL_DIR)/trace/layout_events.py 
-EVENTVIS_PY ?= $(UTIL_DIR)/trace/eventvis.py +JOIN_PY ?= $(UTIL_DIR)/bench/join.py +ROI_PY ?= $(UTIL_DIR)/bench/roi.py +VISUALIZE_PY ?= $(UTIL_DIR)/bench/visualize.py VERILATOR_ROOT ?= $(dir $(shell $(VERILATOR_SEPP) which verilator)).. VLT_ROOT ?= ${VERILATOR_ROOT} @@ -77,6 +77,7 @@ VLT_FLAGS += -Wno-UNSIGNED VLT_FLAGS += -Wno-UNOPTFLAT VLT_FLAGS += -Wno-fatal VLT_FLAGS += --unroll-count 1024 +VLT_FLAGS += --timescale 1ns/1ps VLT_CFLAGS += -std=c++14 -pthread VLT_CFLAGS +=-I ${VLT_BUILDDIR} -I $(VLT_ROOT)/include -I $(VLT_ROOT)/include/vltstd -I $(VLT_FESVR)/include -I $(TB_DIR) -I ${MKFILE_DIR}/test @@ -232,26 +233,27 @@ endef DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null)) TXT_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.txt/g')) -PERF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g')) +PERF_DUMPS = $(shell (echo $(DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g')) ANNOTATED_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.s/g')) DIFF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.diff/g')) -GENTRACE_OUTPUTS = $(TXT_TRACES) $(PERF_TRACES) +GENTRACE_OUTPUTS = $(TXT_TRACES) $(PERF_DUMPS) ANNOTATE_OUTPUTS = $(ANNOTATED_TRACES) -PERF_CSV = $(LOGS_DIR)/perf.csv -EVENT_CSV = $(LOGS_DIR)/event.csv -TRACE_CSV = $(LOGS_DIR)/trace.csv +PERF_DUMP = $(LOGS_DIR)/perf.json +ROI_DUMP = $(LOGS_DIR)/roi.json TRACE_JSON = $(LOGS_DIR)/trace.json -.PHONY: traces annotate perf-csv event-csv layout +.PHONY: traces annotate trace-view clean-traces clean-annotate traces: $(GENTRACE_OUTPUTS) annotate: $(ANNOTATE_OUTPUTS) -perf-csv: $(PERF_CSV) -event-csv: $(EVENT_CSV) -layout: $(TRACE_CSV) $(TRACE_JSON) +trace-view: $(TRACE_JSON) +clean-traces: + rm -f $(GENTRACE_OUTPUTS) +clean-annotate: + rm -f $(ANNOTATE_OUTPUTS) -$(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) - $(DASM) < $< | $(PYTHON) $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt +$(addprefix $(LOGS_DIR)/,trace_hart_%.txt hart_%_perf.json): $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) + $(DASM) < $< | $(PYTHON) $(GENTRACE_PY) --permissive --dma-trace $(LOGS_DIR)/dma_trace_$*.log --dump-hart-perf $(LOGS_DIR)/hart_$*_perf.json --dump-dma-perf $(LOGS_DIR)/dma_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt # Generate source-code interleaved traces for all harts. 
Reads the binary from # the logs/.rtlbinary file that is written at start of simulation in the vsim script @@ -261,14 +263,11 @@ $(LOGS_DIR)/trace_hart_%.s: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY} $(LOGS_DIR)/trace_hart_%.diff: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY} $(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -d -$(PERF_CSV): $(PERF_TRACES) $(PERF_CSV_PY) - $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES) +$(PERF_DUMP): $(PERF_DUMPS) $(JOIN_PY) + $(PYTHON) $(JOIN_PY) -i $(shell ls $(LOGS_DIR)/*_perf.json) -o $@ -$(EVENT_CSV): $(PERF_TRACES) $(PERF_CSV_PY) - $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES) --filter tstart tend +$(ROI_DUMP): $(PERF_DUMP) $(ROI_SPEC) $(ROI_PY) + $(PYTHON) $(ROI_PY) $(PERF_DUMP) $(ROI_SPEC) --cfg $(CFG) -o $@ -$(TRACE_CSV): $(EVENT_CSV) $(LAYOUT_FILE) $(LAYOUT_EVENTS_PY) - $(PYTHON) $(LAYOUT_EVENTS_PY) $(LAYOUT_EVENTS_FLAGS) $(EVENT_CSV) $(LAYOUT_FILE) -o $@ - -$(TRACE_JSON): $(TRACE_CSV) $(EVENTVIS_PY) - $(PYTHON) $(EVENTVIS_PY) -o $@ $(TRACE_CSV) +$(TRACE_JSON): $(ROI_DUMP) $(VISUALIZE_PY) + $(PYTHON) $(VISUALIZE_PY) $(ROI_DUMP) --traces $(TXT_TRACES) --elf $(BINARY) -o $@ diff --git a/target/snitch_cluster/cfg/default.hjson b/target/snitch_cluster/cfg/default.hjson index 7f28a1073..48c98d234 100644 --- a/target/snitch_cluster/cfg/default.hjson +++ b/target/snitch_cluster/cfg/default.hjson @@ -36,7 +36,7 @@ lat_noncomp: 1, lat_conv: 2, lat_sdotp: 3, - fpu_pipe_config: "BEFORE" + fpu_pipe_config: "BEFORE", narrow_xbar_latency: "CUT_ALL_PORTS", wide_xbar_latency: "CUT_ALL_PORTS", // Isolate the core. @@ -106,10 +106,10 @@ dma_core_template: { isa: "rv32imafd", // Xdiv_sqrt: true, - # isa: "rv32ema", - xdma: true - xssr: false - xfrep: false + // isa: "rv32ema", + xdma: true, + xssr: false, + xfrep: false, xf16: false, xf16alt: false, xf8: false, diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index ce241a8d4..1e9d25e89 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -50,8 +50,8 @@ runs: simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2 - elf: tests/build/fp32_conversions_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with illegal instruction - - elf: tests/build/fp64_conversions_scalar.elf - simulators: [vsim, vcs, verilator] + # - elf: tests/build/fp64_conversions_scalar.elf + # simulators: [vsim, vcs, verilator] # - elf: tests/build/interrupt.elf - elf: tests/build/interrupt_local.elf - elf: tests/build/multi_cluster.elf @@ -68,7 +68,7 @@ runs: - elf: tests/build/varargs_2.elf - elf: tests/build/zero_mem.elf - elf: tests/build/non_null_exitcode.elf - retcode: 14 + retcode: 126 - elf: apps/blas/axpy/build/axpy.elf cmd: [../../../sw/blas/axpy/verify.py, "${sim_bin}", "${elf}"] - elf: apps/blas/gemm/build/gemm.elf diff --git a/util/bench/__init__.py b/util/bench/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/util/bench/join.py b/util/bench/join.py new file mode 100755 index 000000000..56c0defe0 --- /dev/null +++ b/util/bench/join.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande +"""Combines performance metrics from all threads into one JSON file. 
+
+This script takes the performance metrics from multiple cores or DMA
+engines, in JSON format as dumped by the [`events.py`][events] or
+[`gen_trace.py`][gen_trace] scripts, and merges them into a single
+JSON file for global inspection and further processing.
+"""
+
+import sys
+import argparse
+import re
+import json
+
+
+FILENAME_REGEX = r'([a-z]+)_([0-9a-f]+)_perf.json'
+
+
+def main():
+    # Argument parsing
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-i',
+        '--inputs',
+        metavar='',
+        nargs='+',
+        help='Input performance metric dumps')
+    parser.add_argument(
+        '-o',
+        '--output',
+        metavar='',
+        nargs='?',
+        default='perf.json',
+        help='Output JSON file')
+    args = parser.parse_args()
+
+    # Populate a dictionary (one entry per thread, i.e. per hart
+    # or DMA engine) enumerating all of its performance metrics
+    data = {}
+    for filename in sorted(args.inputs):
+
+        # Get thread ID and type (DMA or hart) from filename
+        match = re.search(FILENAME_REGEX, filename)
+        typ = match.group(1)
+        idx = int(match.group(2), base=16)
+
+        # Populate dictionary of metrics for the current thread
+        with open(filename, 'r') as f:
+            data[f'{typ}_{idx}'] = json.load(f)
+
+    # Export data
+    with open(args.output, 'w') as f:
+        json.dump(data, f, indent=4)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/util/bench/roi.py b/util/bench/roi.py
new file mode 100755
index 000000000..4671bff95
--- /dev/null
+++ b/util/bench/roi.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Author: Luca Colagrande
+"""Filters and labels execution regions for visualization.
+
+This script takes a JSON file of performance metrics, as output by
+[`join.py`][join], and generates another JSON file in which the
+execution regions are filtered and labeled for visualization,
+according to an auxiliary region-of-interest (ROI) specification
+file (JSON format). The specification file can be a Mako template,
+parameterized e.g. by the number of clusters in the system. The
+output JSON can be passed to the [`visualize.py`][visualize] script
+for visualization.
+
+Check out `test_data/data.json` and `test_data/spec.json` for example
+input and specification files, respectively. The corresponding output
+is contained in `test_data/roi.json`.
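+
+For illustration, a minimal specification for a two-cluster system
+could look as follows (the thread names, region indices and labels
+are examples, mirroring `test_data/spec.json`):
+
+```
+[
+    {"thread": "hart_0", "roi": [{"idx": 1, "label": "compute"}]},
+% for i in range(0, num_clusters):
+    {"thread": "${f'dma_{9*(i+1)}'}", "roi": [{"idx": 0, "label": "dma_in"}]},
+% endfor
+]
+```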
+""" + +import argparse +import json +import json5 +from mako.template import Template +import sys + + +def format_roi(roi, label): + return { + "label": label, + "tstart": roi["tstart"], + "tend": roi["tend"], + "attrs": {key: value for key, value in roi.items() if key not in ["tstart", "tend"]} + } + + +def get_roi(data, thread, idx): + thread_type, thread_idx = thread.split('_') + thread_idx = int(thread_idx) + thread_data = data[thread] + if thread_type == "hart": + return thread_data[idx] + elif thread_type == "dma": + return thread_data["transfers"][idx] + else: + raise ValueError(f"Unsupported thread type {thread_type}") + + +def filter_and_label_rois(data, spec): + output = {} + # Iterate all threads in the rendered specification + for thread_spec in spec: + thread = thread_spec['thread'] + output_rois = [] + # Iterate all ROIs to keep for the current thread + for roi in thread_spec['roi']: + output_roi = format_roi(get_roi(data, thread, roi['idx']), roi['label']) + output_rois.append(output_roi) + # Add ROIs for current thread to output, if any + if output_rois: + output[thread] = output_rois + return output + + +def load_json_inputs(input_path, spec_path, **kwargs): + # Read input JSON + with open(input_path, 'r') as f: + data = json5.load(f) + # Read and render specification template JSON + with open(spec_path, 'r') as f: + spec_template = Template(f.read()) + rendered_spec = spec_template.render(**kwargs) + spec = json5.loads(rendered_spec) + return data, spec + + +def main(): + # Argument parsing + parser = argparse.ArgumentParser() + parser.add_argument( + 'input', + help='Input JSON file') + parser.add_argument( + 'spec', + help='ROI specification file (JSON format)') + parser.add_argument( + '--cfg', + help='Hardware configuration file used to render the specification file') + parser.add_argument( + '-o', + '--output', + nargs='?', + default='roi.json', + help='Output JSON file') + args = parser.parse_args() + + # Load hardware configuration + with open(args.cfg, 'r') as f: + cfg = json5.load(f) + + # Read and render input files + data, spec = load_json_inputs(args.input, args.spec, cfg=cfg) + + # Process inputs and generate output JSON + output = filter_and_label_rois(data, spec) + + # Write output to file + with open(args.output, 'w') as f: + json.dump(output, f, indent=4) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/util/bench/tests/__init__.py b/util/bench/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/util/bench/tests/test_data/data.json b/util/bench/tests/test_data/data.json new file mode 100644 index 000000000..77ca26416 --- /dev/null +++ b/util/bench/tests/test_data/data.json @@ -0,0 +1,46 @@ +{ + "hart_0": [ + { + "tstart": 1759.0, + "tend": 6802.0, + "fpss_fpu_occupancy": 0.006345429307951616, + "total_ipc": 0.04501288915328178 + }, + { + "tstart": 6802.0, + "tend": 12647.0, + "fpss_fpu_occupancy": 0.013860369609856264, + "total_ipc": 0.20756331279945245 + } + ], + "dma_9": { + "aggregate_bw": 11.829313543599257, + "transfers": [ + { + "tstart": 3512, + "tend": 3526, + "bw": 1.1428571428571428 + }, + { + "tstart": 3564, + "tend": 3578, + "bw": 1.1428571428571428 + } + ] + }, + "dma_18": { + "aggregate_bw": 16.633245382585752, + "transfers": [ + { + "tstart": 3608, + "tend": 3622, + "bw": 1.1428571428571428 + }, + { + "tstart": 3660, + "tend": 3674, + "bw": 1.1428571428571428 + } + ] + } +} diff --git a/util/bench/tests/test_data/roi.json b/util/bench/tests/test_data/roi.json new file mode 100644 index 
000000000..a6efe3773 --- /dev/null +++ b/util/bench/tests/test_data/roi.json @@ -0,0 +1,33 @@ +{ + "hart_0": [ + { + "label": "compute", + "tstart": 6802.0, + "tend": 12647.0, + "attrs": { + "fpss_fpu_occupancy": 0.013860369609856264, + "total_ipc": 0.20756331279945245 + } + } + ], + "dma_9": [ + { + "label": "dma_in", + "tstart": 3512, + "tend": 3526, + "attrs": { + "bw": 1.1428571428571428 + } + } + ], + "dma_18": [ + { + "label": "dma_in", + "tstart": 3608, + "tend": 3622, + "attrs": { + "bw": 1.1428571428571428 + } + } + ] +} diff --git a/util/bench/tests/test_data/spec.json b/util/bench/tests/test_data/spec.json new file mode 100644 index 000000000..ae58303c0 --- /dev/null +++ b/util/bench/tests/test_data/spec.json @@ -0,0 +1,16 @@ +[ + { + "thread": "hart_0", + "roi": [ + {"idx": 1, "label": "compute"} + ] + }, +% for i in range(0, num_clusters): + { + "thread": "${f'dma_{9*(i+1)}'}", + "roi": [ + {"idx": 0, "label": "dma_in"} + ] + }, +% endfor +] diff --git a/util/bench/tests/test_roi.py b/util/bench/tests/test_roi.py new file mode 100644 index 000000000..ffb567816 --- /dev/null +++ b/util/bench/tests/test_roi.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande + +import json +from pathlib import Path +import pytest +from bench.roi import get_roi, format_roi, load_json_inputs, filter_and_label_rois + +TEST_DATA_DIR = Path(__file__).resolve().parent / 'test_data' +INPUT_JSON = TEST_DATA_DIR / 'data.json' +SPEC_JSON = TEST_DATA_DIR / 'spec.json' +OUTPUT_JSON = TEST_DATA_DIR / 'roi.json' + + +def test_format_roi(): + label = "compute" + roi = { + "tstart": 1759.0, + "tend": 6802.0, + "fpss_fpu_occupancy": 0.006345429307951616, + "total_ipc": 0.04501288915328178 + } + formatted_roi = { + "label": "compute", + "tstart": 1759.0, + "tend": 6802.0, + "attrs": { + "fpss_fpu_occupancy": 0.006345429307951616, + "total_ipc": 0.04501288915328178 + }, + } + assert format_roi(roi, label) == formatted_roi + + +@pytest.mark.parametrize("thread, idx, roi", [ + ('hart_0', 0, { + "tstart": 1759.0, + "tend": 6802.0, + "fpss_fpu_occupancy": 0.006345429307951616, + "total_ipc": 0.04501288915328178 + }), + ('dma_9', 1, { + "tstart": 3564, + "tend": 3578, + "bw": 1.1428571428571428 + }) +]) +def test_get_roi(thread, idx, roi): + with open(INPUT_JSON, 'r') as f: + data = json.load(f) + assert get_roi(data, thread, idx) == roi + + +def test_filter_and_label_rois(): + data, spec = load_json_inputs(INPUT_JSON, SPEC_JSON, num_clusters=2) + with open(OUTPUT_JSON, 'r') as f: + output = json.load(f) + assert filter_and_label_rois(data, spec) == output diff --git a/util/bench/visualize.py b/util/bench/visualize.py new file mode 100755 index 000000000..087d8b86e --- /dev/null +++ b/util/bench/visualize.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande +"""Translates a ROI JSON for visualization in Chrome. + +This script translates a JSON file, in the format produced by +[`roi.py`][roi], to a JSON file adhering to the syntax required by +Chrome's +[Trace-Viewer](https://github.com/catapult-project/catapult/tree/master/tracing). + +The output can be visualized in a Chrome browser: go to the +`about:tracing` URL and load the JSON file. 
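+
+For reference, every labeled region is translated to a TraceViewer
+"complete" event of the following shape (values are illustrative,
+derived from `tests/test_data/roi.json`):
+
+```
+{"name": "compute", "ph": "X", "ts": 6.802, "dur": 5.845,
+ "pid": 0, "tid": "hart_0", "args": {"total_ipc": 0.208}}
+```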
+
+This script can be compared to `trace/tracevis.py`, but instead of
+visualizing individual instructions, it visualizes entire execution
+regions as single blocks.
+"""
+
+import argparse
+import json
+from pathlib import Path
+import sys
+
+sys.path.append(str(Path(__file__).parent / '../trace'))
+import tracevis  # noqa: E402
+
+
+# Converts nanoseconds to microseconds
+def us(ns):
+    return ns / 1000
+
+
+def main():
+    # Argument parsing
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        'input',
+        metavar='',
+        help='Input JSON file')
+    parser.add_argument(
+        '--traces',
+        metavar='',
+        nargs='*',
+        help='Simulation traces to process')
+    parser.add_argument(
+        '--elf',
+        nargs='?',
+        help='ELF from which the traces were generated')
+    parser.add_argument(
+        '-o',
+        '--output',
+        metavar='',
+        nargs='?',
+        default='trace.json',
+        help='Output JSON file')
+    args = parser.parse_args()
+
+    # TraceViewer events
+    events = []
+
+    # Add a dummy instant event to mark time 0.
+    # This prevents the events from being shifted from their
+    # actual start times in order to align the first event
+    # to time 0.
+    event = {'name': 'zero',
+             'ph': 'I',  # Instant event type
+             'ts': 0,
+             's': 'g'  # Global scope
+             }
+    events.append(event)
+
+    # Read JSON contents
+    with open(args.input) as f:
+        data = json.load(f)
+
+    # Iterate threads
+    for thread, regions in data.items():
+
+        # Iterate execution regions for current thread
+        for region in regions:
+
+            # Create TraceViewer event
+            ts = int(region['tstart'])
+            dur = int(region['tend']) - ts
+            event = {
+                'name': region['label'],
+                'ph': "X",  # Complete event type
+                'ts': us(ts),
+                'dur': us(dur),
+                'pid': 0,
+                'tid': thread,
+                'args': region['attrs']
+            }
+            events.append(event)
+
+    # Optionally extract also instruction-level events
+    # from the simulation traces
+    if args.traces and args.elf:
+        events += tracevis.parse_traces(args.traces, start=0, end=-1, fmt='snitch',
+                                        addr2line='addr2line', use_time=True, pid=1,
+                                        cache=True, elf=args.elf, collapse_call_stack=True)
+
+    # Create TraceViewer JSON object
+    tvobj = {}
+    tvobj['traceEvents'] = events
+    tvobj['displayTimeUnit'] = "ns"
+
+    # Dump TraceViewer events to JSON file
+    with open(args.output, 'w') as f:
+        json.dump(tvobj, f, indent=4)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/util/container/Dockerfile b/util/container/Dockerfile
index d917a6790..8fb4de146 100644
--- a/util/container/Dockerfile
+++ b/util/container/Dockerfile
@@ -78,7 +78,7 @@ RUN cargo install --path /tmp/banshee
 FROM ubuntu:18.04 AS snitch_cluster
 ARG SNITCH_LLVM_VERSION=latest
 ARG VERIBLE_VERSION=0.0-776-g09e0b87
-ARG VERILATOR_VERSION=4.100
+ARG VERILATOR_VERSION=4.110
 LABEL version="0.1"
 LABEL description="Snitch container for hardware and software development."
diff --git a/util/trace/annotate.py b/util/trace/annotate.py
index 4d2c94862..683091b31 100755
--- a/util/trace/annotate.py
+++ b/util/trace/annotate.py
@@ -1,23 +1,30 @@
 #!/usr/bin/env python3
-
 # Copyright 2021 ETH Zurich and University of Bologna.
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
-
-# This script parses the traces generated by Snitch and creates an annotated
-# trace that includes code sources
-# Example output:
-# ; snrt_hartid (team.c:14)
-# ;   in snrt_cluster_core_idx (team.c:47)
-# ;   in main (event_unit.c:21)
-# ;         asm("csrr %0, mhartid" : "=r"(hartid));
-#  80000048  x13=0000000a # csrr    a3, mhartid
-#
-# If the -d/--diff option is specified, it instead outputs a (fictitious) diff
-# file which allows to visualize the trace-source correlation side-by-side
-# instead of interleaved.
-# For neater visualization, feed the diff file into a diff visualization tool e.g.:
-# kompare -o <diff_file>
+"""Annotates an instruction trace with source-code information.
+
+This script parses a human-readable trace, as generated by CVA6 or
+Snitch's [`gen_trace.py`][gen_trace] script, and annotates every
+instruction in the trace with information about its originating
+source code.
+
+Example output:
+```
+    ; snrt_hartid (team.c:14)
+    ;   in snrt_cluster_core_idx (team.c:47)
+    ;   in main (event_unit.c:21)
+    ;         asm("csrr %0, mhartid" : "=r"(hartid));
+     80000048  x13=0000000a # csrr    a3, mhartid
+```
+
+By default, the source-code information is interleaved in the same
+file with the instruction trace. If you prefer a side-by-side view,
+use the -d/--diff option. In this case, the tool outputs a
+(fictitious) diff file which can be fed into a diff visualization
+tool for viewing in a GUI, e.g. `kompare -o <diff_file>`.
+"""

 import sys
 import os
diff --git a/util/trace/events.py b/util/trace/events.py
index a655be033..c5442ee2a 100755
--- a/util/trace/events.py
+++ b/util/trace/events.py
@@ -3,12 +3,13 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
-# This script takes a CVA6 or Snitch trace and it exports the simulation time
-# of all mcycle CSR reads in a format compatible with the gen_trace.py
-# script's JSON output.
-#
 # Author: Luca Colagrande
+"""Export `mcycle` CSR read events from a Snitch or CVA6 trace.
+This script takes a CVA6 or Snitch trace and exports the
+simulation time of all `mcycle` CSR reads to a JSON file in a format
+compatible with [`gen_trace.py`][gen_trace]'s output.
+"""

 import sys
 import argparse
diff --git a/util/trace/eventvis.py b/util/trace/eventvis.py
deleted file mode 100755
index 4d0fdfdc7..000000000
--- a/util/trace/eventvis.py
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2020 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-#
-# This script takes a CSV of events, compatible with the CSV format produced by
-# `perf_csv.py`, and creates a JSON file that can be visualized by
-# [Trace-Viewer](https://github.com/catapult-project/catapult/tree/master/tracing)
-# In Chrome, open `about:tracing` and load the JSON file to view it.
-#
-# Following is an example CSV containing two regions (as would be defined by the
-# presence of one mcycle CSR read in the traces):
-#
-#     , prepare data,      , send interrupt,
-#    0,        32906, 32911,          32911, 33662
-#
-# The first line is used to assign a name to each region.
-# Each of the following lines starts with the hartid, followed by the start and
-# end timestamps of each region.
-# While the alignment of the region names in the first line w.r.t.
the following -# lines does not matter, we suggest to align them with the columns containing the -# start times of the respective regions (as in the example above). -# -# This script can be compared to `tracevis.py`, but instead of visualizing individual -# instructions, it visualizes coarser grained regions as delimited by events -# in the traces. -# -# Author: Luca Colagrande - -import sys -import argparse -import csv -import json -import tracevis - - -def pairwise(iterable): - "s -> (s0, s1), (s2, s3), (s4, s5), ..." - a = iter(iterable) - return zip(a, a) - - -# Converts nanoseconds to microseconds -def us(ns): - return ns / 1000 - - -def main(): - # Argument parsing - parser = argparse.ArgumentParser() - parser.add_argument( - 'csv', - metavar='', - help='Input CSV file') - parser.add_argument( - '--traces', - metavar='', - nargs='*', - help='Simulation traces to process') - parser.add_argument( - '--elf', - nargs='?', - help='ELF from which the traces were generated') - parser.add_argument( - '-o', - '--output', - metavar='', - nargs='?', - default='events.json', - help='Output JSON file') - args = parser.parse_args() - - # TraceViewer events - events = [] - - # Add a dummy instant event to mark time 0. - # This is to avoid that the events are shifted from - # their actual start times to align the first event - # at time 0. - event = {'name': 'zero', - 'ph': 'I', # Instant event type - 'ts': 0, - 's': 'g' # Global scope - } - events.append(event) - - # Read CSV to collect TraceViewer events - with open(args.csv) as f: - reader = csv.reader(f, delimiter=',') - - # Get region names - regions = [name for name in next(reader) if name] - - # Process lines - for row in reader: - - # First entry in row is the hart ID - tid = row[0] - - # Start and end times of each region follow - for i, (start, end) in enumerate(pairwise(row[1:])): - - # Filter regions this hart does not take part in - if start: - - # Create TraceViewer event - ts = int(start) - dur = int(end) - ts - event = {'name': regions[i], - 'ph': "X", # Complete event type - 'ts': us(ts), - 'dur': us(dur), - 'pid': 0, - 'tid': tid - } - events.append(event) - - # Optionally extract also instruction-level events - # from the simulation traces - if args.traces and args.elf: - events += tracevis.parse_traces(args.traces, start=0, end=-1, fmt='snitch', - addr2line='addr2line', use_time=True, pid=1, - cache=True, elf=args.elf, collapse_call_stack=True) - - # Create TraceViewer JSON object - tvobj = {} - tvobj['traceEvents'] = events - tvobj['displayTimeUnit'] = "ns" - - # Dump TraceViewer events to JSON file - with open(args.output, 'w') as f: - json.dump(tvobj, f, indent=4) - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py index fd91ffbf9..f1f94d80d 100755 --- a/util/trace/gen_trace.py +++ b/util/trace/gen_trace.py @@ -2,11 +2,34 @@ # Copyright 2020 ETH Zurich and University of Bologna. # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 -# This script takes a trace generated for a Snitch hart and transforms the -# additional decode stage info into meaningful annotation. It also counts -# and computes various performance metrics up to each mcycle CSR read. - -# Author: Paul Scheffler +# +# Authors: Paul Scheffler +# Luca Colagrande +"""Script to generate human-readable instruction traces for Snitch. 
+
+This script takes a trace generated by a Snitch hart
+(see `snitch_cc.sv`) and transforms the additional decode stage info
+into meaningful annotation.
+
+It also counts and computes various performance metrics for every
+execution region. An execution region is a sequence of instructions.
+Every `mcycle` CSR read instruction in your trace implicitly defines
+two execution regions, comprising respectively:
+
+- all instructions executed before the read, up to the previous read
+  or the first executed instruction
+- all instructions executed after the read, up to the next read or
+  the last executed instruction
+
+Performance metrics are appended at the end of the generated trace
+and can optionally be dumped to a separate JSON file.
+
+It also computes various performance metrics for every DMA transfer,
+provided that the Snitch core is equipped with a tightly-coupled DMA
+engine, and the DMA trace logged during simulation
+(see `axi_dma_backend.sv`) is fed to the tool. DMA performance
+metrics are dumped to a separate JSON file.
+"""

 # TODO: OPER_TYPES and FPU_OPER_TYPES could break: optimization might alter enum mapping
 # TODO: We annotate all FP16 LSU values as IEEE, not FP16ALT... can we do better?
@@ -16,8 +39,10 @@
 import math
 import argparse
 import json
+import ast
 from ctypes import c_int32, c_uint32
 from collections import deque, defaultdict
+from pathlib import Path

 EXTRA_WB_WARN = 'WARNING: {} transactions still in flight for {}.'
@@ -381,6 +406,109 @@ def flt_lit(num: int, fmt: int, width: int = 7) -> str:
     return flt_fmt(flt_decode(num, fmt), width)


+# -------------------- DMA --------------------
+
+
+# We always assume dma_trans contains at least one incomplete placeholder DMA transaction.
+# This incomplete transaction contains default settings. Only upon a DMCPY* instruction
+# is the size of the transaction known, completing the transaction. At that point, a new
+# incomplete transaction is created, inheriting the configuration settings from the previous
+# transaction, which may or may not be overridden before the next DMCPY*.
+def update_dma(insn, extras, dma_trans):
+    # Extract instruction mnemonic from full instruction decoding (includes operand registers)
+    MNEMONIC_REGEX = r'^([\w.]+)\s'
+    match = re.match(MNEMONIC_REGEX, insn)
+    if match:
+        mnemonic = match.group(1)
+        # Process DMA instruction
+        if mnemonic in ['dmsrc', 'dmdst', 'dmstr']:
+            pass
+        elif mnemonic == 'dmrep':
+            dma_trans[-1]['rep'] = extras['opa']
+        elif mnemonic in ['dmcpy', 'dmcpyi']:
+            # Create new placeholder transaction to inherit current DMA settings
+            dma_trans.append(dma_trans[-1].copy())
+            # Set size of the transaction
+            dma_trans[-2]['size'] = extras['opa']
+            # Override repetition count if the transaction is configured to be 1D
+            config = extras['rs2']
+            enable_2d = (config & 2) >> 1
+            if not enable_2d:
+                dma_trans[-2]['rep'] = 1
+
+
+def eval_dma_metrics(dma_trans, dma_trace):
+    dma_trace = Path(dma_trace)
+    if dma_trace.exists():
+        with open(dma_trace, 'r') as f:
+            # Initialize variables
+            compl_transfers = []
+            outst_transfers = []
+            req_transfer_idx = 0
+            req_bytes = 0
+            # Iterate lines in DMA trace
+            for line in f.readlines():
+                dma = ast.literal_eval(line)
+                if 'backend_burst_req_valid' in dma:
+                    # When the first burst in a transfer is granted, we record a new transfer in
+                    # the outstanding transfers queue, with the information obtained from the core
+                    # trace.
We record the number of bytes moved by each burst in a transfer, and + # compare the total to the number of bytes moved by the transfer, to count how + # many bursts belong to the current DMA transfer (a number which is difficult + # to pre-compute from the core trace as it depends on address alignments, etc.) + if dma['backend_burst_req_valid'] and dma['backend_burst_req_ready']: + if req_bytes == 0: + n_bytes = dma_trans[req_transfer_idx]['rep'] * \ + dma_trans[req_transfer_idx]['size'] + outst_transfers.append({'tstart': dma['time'], + 'exp_bursts': 0, + 'rec_bursts': 0, + 'bytes': n_bytes}) + req_bytes += dma['backend_burst_req_num_bytes'] + outst_transfers[-1]['exp_bursts'] += 1 + # We move on to the next transfer when the bytes requested by the previous + # bursts match the current transfer size. + if req_bytes == outst_transfers[-1]['bytes']: + req_bytes = 0 + req_transfer_idx += 1 + # Upon a burst completion, we increment the received bursts count. When this + # count matches the expected bursts count of the current transfer we record the + # end time of the transfer and promote the transfer from the outstanding to the + # completed transfers' queue. + if dma['transfer_completed']: + outst_transfers[0]['rec_bursts'] += 1 + if outst_transfers[0]['rec_bursts'] == outst_transfers[0]['exp_bursts']: + outst_transfers[0]['tend'] = dma['time'] + compl_transfer = outst_transfers.pop(0) + compl_transfer.pop('exp_bursts') + compl_transfer.pop('rec_bursts') + compl_transfers.append(compl_transfer) + # Calculate bandwidth of individual transfers + for transfer in compl_transfers: + transfer['cycles'] = transfer['tend'] - transfer['tstart'] + transfer['bw'] = transfer['bytes'] / transfer['cycles'] + # Calculate aggregate bandwidth: total number of bytes transferred while any transfer is + # active (accounts for overlaps between transfers). 
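+        # Worked example (illustrative numbers): transfers spanning cycles
+        # [100, 120] and [110, 140] overlap for 10 cycles, so only 40 of
+        # their 50 combined cycles count as active, and the aggregate
+        # bandwidth is (bytes_0 + bytes_1) / 40.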
+ prev_trans_end = 0 + active_cycles = 0 + n_bytes = 0 + for transfer in compl_transfers: + # Calculate active cycles, without double-counting overlaps + curr_trans_start, curr_trans_end = transfer['tstart'], transfer['tend'] + if curr_trans_start > prev_trans_end: + active_cycles += curr_trans_end - curr_trans_start + else: + active_cycles += curr_trans_end - prev_trans_end + prev_trans_end = curr_trans_end + # Calculate total number of bytes + n_bytes += transfer['bytes'] + dma_metrics = {} + if active_cycles != 0: + dma_metrics['aggregate_bw'] = n_bytes / active_cycles + dma_metrics['transfers'] = compl_transfers + return dma_metrics + + # -------------------- FPU Sequencer -------------------- @@ -638,7 +766,8 @@ def annotate_insn( annot_fseq_offl: bool = False, # Annotate whenever core offloads to CPU on own line force_hex_addr: bool = True, - permissive: bool = True + permissive: bool = True, + dma_trans: list = [] ) -> (str, tuple, bool ): # Return time info, whether trace line contains no info, and fseq_len match = re.search(TRACE_IN_REGEX, line.strip('\n')) @@ -667,6 +796,7 @@ def annotate_insn( insn, pc_str = ('', '') else: perf_metrics[-1]['snitch_issues'] += 1 + update_dma(insn, extras, dma_trans) # Annotate sequencer elif extras['source'] == TRACE_SRCES['sequencer']: if extras['cbuf_push']: @@ -803,6 +933,12 @@ def main(): ) parser.add_argument( '-o', + '--output', + required=True, + type=argparse.FileType('w'), + help='Path to the output file' + ) + parser.add_argument( '--offl', action='store_true', help='Annotate FPSS and sequencer offloads when they happen in core') @@ -821,55 +957,74 @@ def main(): '--permissive', action='store_true', help='Ignore some state-related issues when they occur') - parser.add_argument('-d', - '--dump-perf', - nargs='?', - metavar='file', - type=argparse.FileType('w'), - help='Dump performance metrics as json text.') + parser.add_argument( + '--dma-trace', + help='Path to a DMA trace file' + ) + parser.add_argument( + '--dump-hart-perf', + nargs='?', + type=argparse.FileType('w'), + help='Dump hart performance metrics as json text.' + ) + parser.add_argument( + '--dump-dma-perf', + help='Dump DMA performance metrics as json text.' 
+    )
     args = parser.parse_args()

     line_iter = iter(args.infile.readline, b'')
-    # Prepare stateful data structures
-    time_info = None
-    gpr_wb_info = defaultdict(deque)
-    fpr_wb_info = defaultdict(deque)
-    fseq_info = {
-        'curr_sec': 0,
-        'fpss_pcs': deque(),
-        'fseq_pcs': deque(),
-        'cfg_buf': deque(),
-        'curr_cfg': None
-    }
-    perf_metrics = [
-        defaultdict(int)
-    ]  # all values initially 0, also 'start' time of measurement 0
-    perf_metrics[0]['start'] = None
-    # Parse input line by line
-    for line in line_iter:
-        if line:
-            ann_insn, time_info, empty = annotate_insn(
-                line, gpr_wb_info, fpr_wb_info, fseq_info, perf_metrics, False,
-                time_info, args.offl, not args.saddr, args.permissive)
-            if perf_metrics[0]['start'] is None:
-                perf_metrics[0]['tstart'] = time_info[0] / 1000
-                perf_metrics[0]['start'] = time_info[1]
-            if not empty:
-                print(ann_insn)
-        else:
-            break  # Nothing more in pipe, EOF
-    perf_metrics[-1]['tend'] = time_info[0] / 1000
-    perf_metrics[-1]['end'] = time_info[1]
-    # Compute metrics
-    eval_perf_metrics(perf_metrics)
-    # Emit metrics
-    print('\n## Performance metrics')
-    for idx in range(len(perf_metrics)):
-        print('\n' + fmt_perf_metrics(perf_metrics, idx, not args.allkeys))
-
-    if args.dump_perf:
-        with args.dump_perf as file:
+
+    with args.output as file:
+        # Prepare stateful data structures
+        time_info = None
+        gpr_wb_info = defaultdict(deque)
+        fpr_wb_info = defaultdict(deque)
+        fseq_info = {
+            'curr_sec': 0,
+            'fpss_pcs': deque(),
+            'fseq_pcs': deque(),
+            'cfg_buf': deque(),
+            'curr_cfg': None
+        }
+        dma_trans = [{'rep': 1}]
+        perf_metrics = [
+            defaultdict(int)
+        ]  # all values initially 0, also 'start' time of measurement 0
+        perf_metrics[0]['start'] = None
+        # Parse input line by line
+        for line in line_iter:
+            if line:
+                ann_insn, time_info, empty = annotate_insn(
+                    line, gpr_wb_info, fpr_wb_info, fseq_info, perf_metrics, False,
+                    time_info, args.offl, not args.saddr, args.permissive, dma_trans)
+                if perf_metrics[0]['start'] is None:
+                    perf_metrics[0]['tstart'] = time_info[0] / 1000
+                    perf_metrics[0]['start'] = time_info[1]
+                if not empty:
+                    print(ann_insn, file=file)
+            else:
+                break  # Nothing more in pipe, EOF
+        perf_metrics[-1]['tend'] = time_info[0] / 1000
+        perf_metrics[-1]['end'] = time_info[1]
+        # Compute metrics
+        eval_perf_metrics(perf_metrics)
+        # Emit metrics
+        print('\n## Performance metrics', file=file)
+        for idx in range(len(perf_metrics)):
+            print('\n' + fmt_perf_metrics(perf_metrics, idx, not args.allkeys), file=file)
+    # Emit DMA metrics
+    dma_metrics = None  # stays None if no DMA trace is provided
+    if args.dma_trace:
+        dma_metrics = eval_dma_metrics(dma_trans, args.dma_trace)
+
+    # Dump hart performance metrics to JSON file
+    if args.dump_hart_perf:
+        with args.dump_hart_perf as file:
             file.write(json.dumps(perf_metrics, indent=4))
+    # Dump DMA performance metrics to JSON file
+    if args.dump_dma_perf and dma_metrics is not None:
+        with open(args.dump_dma_perf, 'w') as file:
+            file.write(json.dumps(dma_metrics, indent=4))

     # Check for any loose ends and warn before exiting
     seq_isns = len(fseq_info['fseq_pcs']) + len(fseq_info['cfg_buf'])
diff --git a/util/trace/layout_events.py b/util/trace/layout_events.py
deleted file mode 100755
index ea877c53c..000000000
--- a/util/trace/layout_events.py
+++ /dev/null
@@ -1,138 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2020 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0 -# -# This script takes a CSV of events, compatible with the CSV format produced by -# `perf_csv.py`, and creates another CSV of events, where the events are reordered based -# on a layout CSV file and labeled for viewing with the `eventvis.py` script. -# -# Following is an example CSV of events as output by `perf_csv.py`, -# which could be fed as input to this tool: -# -# , 0_tstart, 0_tend, 1_tstart, 1_tend, 2_tstart, 2_tend -# 0, 334, 10940, 10940, 10945, 10945, 10995 -# 1, 2654, 11061, 11061, 11172, 11172, 11189 -# 2, 2654, 11061, 11061, 11172, 11172, 11190 -# 3, 2654, 11061, 11061, 11172, 11172, 11191 -# -# This is an example layout CSV, which could be fed to the tool -# together with the previous CSV: -# -# , dma-in, compute, dma-out -# 0, 0, , -# "range(1,3)", , 1, -# 9, , , 2 -# -# To produce the following output: -# -# , dma_in, , compute, , dma_out, -# 0, 334, 10940, , , , -# 1, , , 11061, 11172, , -# 2, , , 11061, 11172, , -# 3, , , , , 11172, 11191 -# -# The output CSV can be fed directly to `eventvis.py`. -# -# Author: Luca Colagrande - -import sys -import argparse -import csv -import pandas as pd -from math import isnan - - -def main(): - # Argument parsing - parser = argparse.ArgumentParser() - parser.add_argument( - 'csv', - metavar='', - help='Input CSV file') - parser.add_argument( - 'layout', - metavar='', - help='Layout CSV file') - parser.add_argument( - '--num-clusters', - type=int, - default=1, - help='Number of clusters') - parser.add_argument( - '-o', - '--output', - metavar='', - nargs='?', - default='trace.csv', - help='Output CSV file') - args = parser.parse_args() - - # Read input CSV - df = pd.read_csv(args.csv) - - # Output CSV data - data = [] - columns = [] - - # Open layout CSV - with open(args.layout) as layout_f: - layout_reader = csv.reader(layout_f, delimiter=',') - - # Get region labels from layout header - regions = [label for label in next(layout_reader) if label and not label.isspace()] - - # Generate output columns: appropriately spaced region labels - columns = ['hartid'] + [val for label in regions for val in [label, '']] - - # Iterate layout rows - for row in layout_reader: - - # First entry in row is a hart ID or a Python expression - # which generates a list of hart IDs - expr = row[0] - code = compile(expr, "", "eval") - tids = eval(code, {}, {'num_clusters': args.num_clusters}) - if type(tids) == int: - tids = [tids] - - # Iterate hart IDs - for tid in tids: - - # Start output row with hart ID - orow = [tid] - - # Iterate all other cells in layout row (indices of regions to take) - for cell in row[1:]: - - # If the cell is not empty, get start and end times - # of the region from the input CSV and append them to the - # output row. Otherwise, leave cells empty. 
- if cell and not cell.isspace(): - reg_idx = int(cell) - row_idx = tid - col_idx = 1 + reg_idx * 2 - assert row_idx < df.shape[0], f'Hart ID {row_idx} out of bounds' - assert (col_idx + 1) < df.shape[1],\ - f'Region index {reg_idx} out of bounds for hart {tid}' - assert not isnan(df.iat[row_idx, col_idx]),\ - (f'Region {reg_idx} looks empty for hart {tid},' - f'check whether it was simulated') - orow.append(int(df.iat[row_idx, col_idx])) - orow.append(int(df.iat[row_idx, col_idx + 1])) - else: - orow.append('') - orow.append('') - - data.append(orow) - - # Create output dataframe and write to CSV - df = pd.DataFrame(data, columns=columns) - df.set_index('hartid', inplace=True) - df.sort_index(axis='index', inplace=True) - df.index.name = None - df.to_csv(args.output) - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/util/trace/perf_csv.py b/util/trace/perf_csv.py deleted file mode 100755 index f26e242e2..000000000 --- a/util/trace/perf_csv.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2020 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# This script takes the performance metrics from all cores, in JSON format -# as dumped by the `events.py` or `gen_trace.py` scripts, and merges them -# into a single CSV file for global inspection. -# -# Author: Luca Colagrande - - -import sys -import argparse -import re -import json -import pandas as pd - - -HARTID_REGEX = r'hart_([0-9a-f]+)_perf.json' - - -def main(): - # Argument parsing - parser = argparse.ArgumentParser() - parser.add_argument( - '-i', - '--inputs', - metavar='', - nargs='+', - help='Input performance metric dumps') - parser.add_argument( - '-o', - '--output', - metavar='', - nargs='?', - default='perf.csv', - help='Output CSV file') - parser.add_argument( - '--filter', - nargs='*', - help='All and only performance metrics to include in the CSV') - args = parser.parse_args() - - dumps = sorted(args.inputs) - - # Populate a list (one entry per hart) of dictionaries - # enumerating all the performance metrics for each hart - data = [] - index = [] - for dump in dumps: - - # Get hart id from filename and append to index - hartid = int(re.search(HARTID_REGEX, dump).group(1), base=16) - index.append(hartid) - - # Populate dictionary of metrics for the current hart - hart_metrics = {} - with open(dump, 'r') as f: - hart_data = json.load(f) - - # Uniquefy names of performance metrics in each trace - # region by prepending the region index, and merge - # all region metrics in a single dictionary - for i, region in enumerate(hart_data): - - # If filter was provided on the command-line then filter out all - # perf metrics which were not listed - if args.filter: - region = {key: val for (key, val) in region.items() if key in args.filter} - - region_metrics = {f'{i}_{key}': val for (key, val) in region.items()} - hart_metrics.update(region_metrics) - - data.append(hart_metrics) - - # Export data - df = pd.DataFrame.from_records(data, index) - df.to_csv(args.output) - - -if __name__ == '__main__': - sys.exit(main())
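The new benchmarking utilities introduced above can also be driven directly from Python, without the Makefile plumbing. A minimal sketch follows (the file paths and the `num_clusters` value are placeholders; it assumes `util/bench` is on the `PYTHONPATH`, as in the repository's pytest setup):

```python
import json

from bench.roi import load_json_inputs, filter_and_label_rois

# Load the joined metrics (as produced by join.py) and render the
# Mako spec template with the desired parameters.
data, spec = load_json_inputs('logs/perf.json', 'spec.json', num_clusters=2)

# Filter and label the regions of interest.
rois = filter_and_label_rois(data, spec)

# Equivalent to what roi.py writes out, ready to be fed to visualize.py.
with open('logs/roi.json', 'w') as f:
    json.dump(rois, f, indent=4)
```

This mirrors what the new `trace-view` Make target does via `roi.py`, short of the final `visualize.py` step.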