diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 01fff63fb..26e4ee4ff 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,7 +2,7 @@
 # Unless a later match takes precedence, global owners below will be
 # requested for review when someone opens a pull request.
 
-* @paulsc96 @colluca
+* @paulsc96 @colluca @fischeti
 
 hw/snitch_cluster @paulsc96 @lucabertaccini
 hw/snitch_dma @paulsc96 @thommythomaso
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f2c3e692a..f8f87b3f8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -36,19 +36,19 @@ jobs:
           submodules: 'recursive'
       - name: Build Software
         run: |
+          bender vendor init
           make -C target/snitch_cluster sw
       - name: Build Hardware
         run: |
           make -C target/snitch_cluster bin/snitch_cluster.vlt
       - name: Run Tests
         working-directory: target/snitch_cluster
-        run: |-
-          ../../util/sim/simulate.py sw/run.yaml --simulator verilator -j \
-          --verbose
+        run: |
+          ./run.py sw/run.yaml --simulator verilator -j
 
-  ############################################
+  #########################################
   # Build SW on Snitch Cluster w/ Banshee #
-  ############################################
+  #########################################
 
   sw-snitch-cluster-banshee:
     name: Simulate SW on Snitch Cluster w/ Banshee
@@ -61,11 +61,11 @@ jobs:
           submodules: 'recursive'
       - name: Build Software
         run: |
+          bender vendor init
           make -C target/snitch_cluster SELECT_RUNTIME=banshee sw
       - name: Run Tests
         env:
           SNITCH_LOG: info
         working-directory: target/snitch_cluster
-        run: |-
-          ../../util/sim/simulate.py sw/run.yaml --simulator banshee -j \
-          --verbose
+        run: |
+          ./run.py sw/run.yaml --simulator banshee -j
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 610c271ea..18cd5d4aa 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -5,15 +5,18 @@
 variables:
   GIT_STRATEGY: clone
   GIT_SUBMODULE_STRATEGY: recursive
+  # Enable colors in CI terminal
+  TERM: ansi
+  FORCE_COLOR: "1"
+  # Configure environment
   PYTHON: /usr/local/anaconda3-2022.05/bin/python3
   BENDER: bender-0.27.1
   CC: gcc-9.2.0
   CXX: g++-9.2.0
-  VCS: vcs-2020.12
-  VERILATOR: verilator-4.110
-  QUESTA: questa-2022.3
+  VCS_SEPP: vcs-2020.12
+  VERILATOR_SEPP: verilator-4.110
+  QUESTA_SEPP: questa-2022.3
   LLVM_BINROOT: /usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin
-  CLANG: /usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/clang
   CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER: /usr/pack/gcc-9.2.0-af/linux-x64/bin/gcc
   LLVM_SYS_120_PREFIX: /usr/pack/llvm-12.0.1-af
   CMAKE: cmake-3.18.1
@@ -21,7 +24,13 @@ variables:
 before_script:
   - $PYTHON -m venv .venv
   - source .venv/bin/activate
-  - pip install -r python-requirements.txt
+  # Unpack packages in a job-local temporary directory that is removed after
+  # installation; protects against "No space left on device" errors when /tmp
+  # is full. The path is absolute so pip subprocesses with another cwd use it.
+  - mkdir -p tmp
+  - TMPDIR=$PWD/tmp pip install -r python-requirements.txt
+  - rm -rf tmp
+  - $BENDER vendor init
 
 ##############
 # Build docs #
@@ -79,8 +88,8 @@ snitch-ip-tests:
           - tcdm_interface
   script:
     - cd hw/$IP
-    - $QUESTA ./util/compile.sh
-    - $QUESTA ./util/run_vsim.sh
+    - ./util/compile.sh
+    - ./util/run_vsim.sh
 
 ########################
 # Snitch cluster tests #
@@ -89,29 +98,26 @@ snitch-ip-tests:
 # Verilator
 snitch-cluster-vlt:
   needs: [snitch-cluster-sw]
-  # yamllint disable rule:line-length
   script:
     - cd target/snitch_cluster
-    - $VERILATOR make bin/snitch_cluster.vlt
-    - $VERILATOR ../../util/sim/simulate.py sw/run.yaml --simulator verilator -j --verbose
-  # yamllint enable rule:line-length
+    - make bin/snitch_cluster.vlt
+    - ./run.py sw/run.yaml --simulator verilator -j --run-dir runs/vlt
 
 # VCS
 snitch-cluster-vcs:
   needs: [snitch-cluster-sw]
   script:
     - cd target/snitch_cluster
-    - $VCS make bin/snitch_cluster.vcs
-    - $VCS ../../util/sim/simulate.py sw/run.yaml --simulator vcs -j --verbose
+    - make bin/snitch_cluster.vcs
+    - ./run.py sw/run.yaml --simulator vcs -j --run-dir runs/vcs
 
 # Questa
 snitch-cluster-vsim:
   needs: [snitch-cluster-sw]
   script:
     - cd target/snitch_cluster
-    - $QUESTA make bin/snitch_cluster.vsim
-    - $QUESTA ../../util/sim/simulate.py sw/run.yaml --simulator vsim -j
-      --verbose
+    - make bin/snitch_cluster.vsim
+    - ./run.py sw/run.yaml --simulator vsim -j --run-dir runs/vsim
 
 # Banshee
 snitch-cluster-banshee:
@@ -127,4 +133,4 @@ snitch-cluster-banshee:
     - cd banshee
     - cargo install --debug --path .
     - cd ../target/snitch_cluster
-    - ../../util/sim/simulate.py sw/run.yaml --simulator banshee -j --verbose
+    - ./run.py sw/run.yaml --simulator banshee -j --run-dir runs/banshee
diff --git a/Bender.yml b/Bender.yml
index 732788d0d..84dad47e8 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -22,7 +22,7 @@ dependencies:
   axi:                { git: https://github.com/pulp-platform/axi,                version:  0.39.0  }
   axi_riscv_atomics:  { git: https://github.com/pulp-platform/axi_riscv_atomics,  version:  0.6.0   }
   common_cells:       { git: https://github.com/pulp-platform/common_cells,       version:  1.28.0  }
-  FPnew:              { git: https://github.com/openhwgroup/cvfpu,                rev:      1202ca3 }  # TODO: feature branch `feature/expanding_sdotp`; get merged!
+  FPnew:              { git: "https://github.com/pulp-platform/cvfpu.git",        rev:      pulp-v0.1.3 }
   register_interface: { git: https://github.com/pulp-platform/register_interface, version:  0.4.2   }
   tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version:  0.2.11  }
   riscv-dbg:          { git: https://github.com/pulp-platform/riscv-dbg,          version:  0.8.0   }
@@ -37,13 +37,40 @@ vendor_package:
       - "Makefile"
       - ".gitignore"
       - "README"
-      - "src/math/tanh.c"
+      - "src/math/ceil.c"
+      - "src/math/ceilf.c"
+      - "src/math/ceill.c"
       - "src/math/expm1.c"
+      - "src/math/expf.c"
+      - "src/math/exp2f_data.c"
+      - "src/math/exp2f_data.h"
+      - "src/math/log2.c"
+      - "src/math/log2_data.c"
+      - "src/math/log2_data.h"
+      - "src/math/log2f.c"
+      - "src/math/log2f_data.c"
+      - "src/math/log2f_data.h"
+      - "src/math/__math_divzero.c"
+      - "src/math/__math_invalid.c"
+      - "src/math/__math_invalidf.c"
+      - "src/math/__math_invalidl.c"
+      - "src/math/__math_oflow.c"
+      - "src/math/__math_oflowf.c"
+      - "src/math/__math_uflow.c"
+      - "src/math/__math_uflowf.c"
+      - "src/math/__math_xflow.c"
+      - "src/math/__math_xflowf.c"
+      - "src/math/sqrt.c"
+      - "src/math/sqrtf.c"
+      - "src/math/sqrt_data.c"
+      - "src/math/sqrt_data.h"
+      - "src/math/tanh.c"
       - "src/internal/libm.h"
       - "src/include/features.h"
       - "include/endian.h"
       - "include/math.h"
       - "include/features.h"
+      - "include/float.h"
       - "include/alltypes.h.in"
       - "arch/riscv64/bits/alltypes.h.in"
       - "arch/riscv64/bits/float.h"
diff --git a/Makefile b/Makefile
index 06a1c662f..087204634 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,8 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 
-REGGEN = $(shell bender path register_interface)/vendor/lowrisc_opentitan/util/regtool.py
+BENDER ?= bender
+REGGEN  = $(shell $(BENDER) path register_interface)/vendor/lowrisc_opentitan/util/regtool.py
 
 GENERATED_DOCS_DIR = docs/generated
 GENERATED_DOC_SRCS = $(GENERATED_DOCS_DIR)/peripherals.md
@@ -16,9 +17,7 @@ clean: clean-docs
 doc-srcs: $(GENERATED_DOC_SRCS)
 
 docs: doc-srcs
-	@if mkdocs build | grep -q "ERROR"; then \
-		exit 1; \
-	fi
+	mkdocs build
 
 clean-docs:
 	rm -rf $(GENERATED_DOCS_DIR)
diff --git a/README.md b/README.md
index d5fe00f57..1f7b6459c 100644
--- a/README.md
+++ b/README.md
@@ -86,19 +86,39 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:
 </details>
 
 <details>
-<summary><b>Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra</b></summary>
+<summary><b>Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra</b></summary>
 <p>
 
 ```
-@inproceedings{scheffler2021indirect,
+@article{scheffler2023sparsessr,
   author={Scheffler, Paul and Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca},
-  booktitle={2021 Design, Automation & Test in Europe Conference & Exhibition (DATE)},
-  title={Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra},
-  year={2021},
+  journal={IEEE Transactions on Parallel and Distributed Systems},
+  title={Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra},
+  year={2023},
+  volume={34},
+  number={12},
+  pages={3147-3161},
+  doi={10.1109/TPDS.2023.3322029}
+}
+```
+
+</p>
+</details>
+
+<details>
+<summary><b>A High-performance, Energy-efficient Modular DMA Engine Architecture</b></summary>
+<p>
+
+```
+@ARTICLE{benz2023idma,
+  author={Benz, Thomas and Rogenmoser, Michael and Scheffler, Paul and Riedel, Samuel and Ottaviano, Alessandro and Kurth, Andreas and Hoefler, Torsten and Benini, Luca},
+  journal={IEEE Transactions on Computers},
+  title={A High-performance, Energy-efficient Modular DMA Engine Architecture},
+  year={2023},
   volume={},
   number={},
-  pages={1787-1792}
-}
+  pages={1-14},
+  doi={10.1109/TC.2023.3329930}}
 ```
 
 </p>
diff --git a/apt-requirements.txt b/apt-requirements.txt
index 5bb7b560d..15f12e8b7 100644
--- a/apt-requirements.txt
+++ b/apt-requirements.txt
@@ -6,8 +6,4 @@
 clang-format
 device-tree-compiler
 graphviz
-python3
-python3-pip
-python3-setuptools
-python3-wheel
 tar
diff --git a/docs/publications.md b/docs/publications.md
index 6f14daa64..e4c86b4c6 100644
--- a/docs/publications.md
+++ b/docs/publications.md
@@ -42,19 +42,39 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:
 </details>
 
 <details>
-<summary><b>Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra</b></summary>
+<summary><b>Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra</b></summary>
 <p>
 
 ```
-@inproceedings{scheffler2021indirect,
+@article{scheffler2023sparsessr,
   author={Scheffler, Paul and Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca},
-  booktitle={2021 Design, Automation & Test in Europe Conference & Exhibition (DATE)},
-  title={Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra},
-  year={2021},
+  journal={IEEE Transactions on Parallel and Distributed Systems},
+  title={Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra},
+  year={2023},
+  volume={34},
+  number={12},
+  pages={3147-3161},
+  doi={10.1109/TPDS.2023.3322029}
+}
+```
+
+</p>
+</details>
+
+<details>
+<summary><b>A High-performance, Energy-efficient Modular DMA Engine Architecture</b></summary>
+<p>
+
+```
+@ARTICLE{benz2023idma,
+  author={Benz, Thomas and Rogenmoser, Michael and Scheffler, Paul and Riedel, Samuel and Ottaviano, Alessandro and Kurth, Andreas and Hoefler, Torsten and Benini, Luca},
+  journal={IEEE Transactions on Computers},
+  title={A High-performance, Energy-efficient Modular DMA Engine Architecture},
+  year={2023},
   volume={},
   number={},
-  pages={1787-1792}
-}
+  pages={1-14},
+  doi={10.1109/TC.2023.3329930}}
 ```
 
 </p>
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 6a766858d..e913931e3 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -4,7 +4,8 @@
 
 # Keep sorted.
 mkdocs
-# Last version compatible with python-3.6 (default on Ubuntu 18.04)
-mkdocs-material <= 8.2.11
+mkdocs-material
 mkdocs-include-markdown-plugin
-mkdocs-macros-plugin
\ No newline at end of file
+mkdocs-macros-plugin
+mkdocstrings
+mkdocstrings-python
diff --git a/docs/rm/custom_instructions.md b/docs/rm/custom_instructions.md
index 2a79b757a..f7fcfbd0d 100644
--- a/docs/rm/custom_instructions.md
+++ b/docs/rm/custom_instructions.md
@@ -37,7 +37,7 @@ The FREP instruction has the following signature:
 | max_inst | max_rpt | stagger_max | stagger_mask | 0        | OP-CUSTOM1 | FREP.I    |
 | max_inst | max_rpt | stagger_max | stagger_mask | 1        | OP-CUSTOM1 | FREP.O    |
 
-FREP.I and FREP.O repeat the *max_inst* instructions following the FREP instruction for *max_rpt + 1* times. The FREP.I instruction (*I* stands for inner) repeats every instruction the specified number of times and moves on to executing and repeating the next. The FREP.O instruction (*O* stands for outer) repeats the whole sequence of instructions *max_rpt + 1* times. Register staggering can be enabled and configured via the *stagger_mask* and *stagger_max* immediates. A detailed explanation of their use can be found in the Snitch [paper](/publications).
+FREP.I and FREP.O repeat the *max_inst + 1* instructions following the FREP instruction for *max_rpt + 1* times. The FREP.I instruction (*I* stands for inner) repeats every instruction the specified number of times and moves on to executing and repeating the next. The FREP.O instruction (*O* stands for outer) repeats the whole sequence of instructions *max_rpt + 1* times. Register staggering can be enabled and configured via the *stagger_mask* and *stagger_max* immediates. A detailed explanation of their use can be found in the Snitch [paper](/publications).
 
 The assembly instruction signature follows:
 
diff --git a/docs/rm/sim/Simulation.md b/docs/rm/sim/Simulation.md
new file mode 100644
index 000000000..6671fb590
--- /dev/null
+++ b/docs/rm/sim/Simulation.md
@@ -0,0 +1 @@
+::: Simulation
diff --git a/docs/rm/sim/Simulator.md b/docs/rm/sim/Simulator.md
new file mode 100644
index 000000000..56f03482d
--- /dev/null
+++ b/docs/rm/sim/Simulator.md
@@ -0,0 +1 @@
+::: Simulator
diff --git a/docs/rm/sim/sim_utils.md b/docs/rm/sim/sim_utils.md
new file mode 100644
index 000000000..876e5fac4
--- /dev/null
+++ b/docs/rm/sim/sim_utils.md
@@ -0,0 +1 @@
+::: sim_utils
diff --git a/hw/mem_interface/util/compile.sh b/hw/mem_interface/util/compile.sh
index 73ccc7fca..1a3678cfa 100755
--- a/hw/mem_interface/util/compile.sh
+++ b/hw/mem_interface/util/compile.sh
@@ -10,11 +10,11 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$BENDER script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
     --vlog-arg="+cover=sbecft" \
     > compile.tcl
 echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/mem_interface/util/run_vsim.sh b/hw/mem_interface/util/run_vsim.sh
index e30929642..45a6b77e1 100755
--- a/hw/mem_interface/util/run_vsim.sh
+++ b/hw/mem_interface/util/run_vsim.sh
@@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
 [ ! -z "$VSIM" ] || VSIM=vsim
 
 call_vsim() {
-    echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
+    echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
     grep "Errors: 0," vsim.log
 }
 
diff --git a/hw/reqrsp_interface/util/compile.sh b/hw/reqrsp_interface/util/compile.sh
index 73ccc7fca..1a3678cfa 100755
--- a/hw/reqrsp_interface/util/compile.sh
+++ b/hw/reqrsp_interface/util/compile.sh
@@ -10,11 +10,11 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$BENDER script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
     --vlog-arg="+cover=sbecft" \
     > compile.tcl
 echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/reqrsp_interface/util/run_vsim.sh b/hw/reqrsp_interface/util/run_vsim.sh
index e7fe59fb9..9eeee2e14 100755
--- a/hw/reqrsp_interface/util/run_vsim.sh
+++ b/hw/reqrsp_interface/util/run_vsim.sh
@@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
 [ ! -z "$VSIM" ] || VSIM=vsim
 
 call_vsim() {
-    echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
+    echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
     grep "Errors: 0," vsim.log
 }
 
diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv
index 2d38c63b3..5bb3b1b48 100644
--- a/hw/snitch_cluster/src/snitch_cc.sv
+++ b/hw/snitch_cluster/src/snitch_cc.sv
@@ -487,6 +487,7 @@ module snitch_cc #(
       .trace_port_o            ( fpu_trace           ),
       .sequencer_tracer_port_o ( fpu_sequencer_trace ),
       // pragma translate_on
+      .hart_id_i        ( hart_id_i      ),
       .acc_req_i        ( acc_snitch_req ),
       .acc_req_valid_i  ( acc_qvalid     ),
       .acc_req_ready_o  ( acc_qready     ),
diff --git a/hw/snitch_cluster/src/snitch_fp_ss.sv b/hw/snitch_cluster/src/snitch_fp_ss.sv
index 0b994e19e..fc75a386c 100644
--- a/hw/snitch_cluster/src/snitch_fp_ss.sv
+++ b/hw/snitch_cluster/src/snitch_fp_ss.sv
@@ -42,6 +42,7 @@ module snitch_fp_ss import snitch_pkg::*; #(
   output fpu_trace_port_t  trace_port_o,
   output fpu_sequencer_trace_port_t sequencer_tracer_port_o,
   // pragma translate_on
+  input  logic [31:0]      hart_id_i,
   // Accelerator Interface - Slave
   input  acc_req_t         acc_req_i,
   input  logic             acc_req_valid_i,
@@ -2509,6 +2510,7 @@ module snitch_fp_ss import snitch_pkg::*; #(
   ) i_fpu (
     .clk_i                           ,
     .rst_ni         ( ~rst_i        ),
+    .hart_id_i      ( hart_id_i     ),
     .operands_i     ( op            ),
     .rnd_mode_i     ( fpu_rnd_mode  ),
     .op_i           ( fpu_op        ),
diff --git a/hw/snitch_cluster/src/snitch_fpu.sv b/hw/snitch_cluster/src/snitch_fpu.sv
index 44d28df45..ed7958edc 100644
--- a/hw/snitch_cluster/src/snitch_fpu.sv
+++ b/hw/snitch_cluster/src/snitch_fpu.sv
@@ -19,6 +19,7 @@ module snitch_fpu import snitch_pkg::*; #(
   input logic                               clk_i,
   input logic                               rst_ni,
   // Input signals
+  input logic [31:0]                        hart_id_i,
   input logic [2:0][FLEN-1:0]               operands_i,
   input fpnew_pkg::roundmode_e              rnd_mode_i,
   input fpnew_pkg::operation_e              op_i,
@@ -99,12 +100,15 @@ module snitch_fpu import snitch_pkg::*; #(
 
   fpnew_top #(
     // FPU configuration
-    .Features       ( FPUFeatures ),
-    .Implementation ( FPUImplementation ),
-    .TagType        ( logic[6:0]        )
+    .Features                    ( FPUFeatures            ),
+    .Implementation              ( FPUImplementation      ),
+    .TagType                     ( logic[6:0]             ),
+    .CompressedVecCmpResult      ( 1                      ),
+    .StochasticRndImplementation ( fpnew_pkg::DEFAULT_RSR )
   ) i_fpu (
     .clk_i                                    ,
     .rst_ni                                   ,
+    .hart_id_i       ( hart_id_i             ),
     .operands_i      ( fpu_in_q.operands     ),
     .rnd_mode_i      ( fpu_in_q.rnd_mode     ),
     .op_i            ( fpu_in_q.op           ),
@@ -114,6 +118,7 @@ module snitch_fpu import snitch_pkg::*; #(
     .int_fmt_i       ( fpu_in_q.int_fmt      ),
     .vectorial_op_i  ( fpu_in_q.vectorial_op ),
     .tag_i           ( fpu_in_q.tag          ),
+    .simd_mask_i     ( '1                    ),
     .in_valid_i      ( in_valid_q            ),
     .in_ready_o      ( in_ready_q            ),
     .flush_i         ( 1'b0                  ),
diff --git a/hw/snitch_cluster/util/compile.sh b/hw/snitch_cluster/util/compile.sh
index 73ccc7fca..1a3678cfa 100755
--- a/hw/snitch_cluster/util/compile.sh
+++ b/hw/snitch_cluster/util/compile.sh
@@ -10,11 +10,11 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$BENDER script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
     --vlog-arg="+cover=sbecft" \
     > compile.tcl
 echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/snitch_cluster/util/run_vsim.sh b/hw/snitch_cluster/util/run_vsim.sh
index e9298efed..00d08aee3 100755
--- a/hw/snitch_cluster/util/run_vsim.sh
+++ b/hw/snitch_cluster/util/run_vsim.sh
@@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
 [ ! -z "$VSIM" ] || VSIM=vsim
 
 call_vsim() {
-    echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
+    echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
     grep "Errors: 0," vsim.log
 }
 
diff --git a/hw/snitch_icache/util/compile.sh b/hw/snitch_icache/util/compile.sh
index 73ccc7fca..1a3678cfa 100755
--- a/hw/snitch_icache/util/compile.sh
+++ b/hw/snitch_icache/util/compile.sh
@@ -10,11 +10,11 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$BENDER script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
     --vlog-arg="+cover=sbecft" \
     > compile.tcl
 echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/snitch_icache/util/run_vsim.sh b/hw/snitch_icache/util/run_vsim.sh
index 94671daf5..42cc47f94 100755
--- a/hw/snitch_icache/util/run_vsim.sh
+++ b/hw/snitch_icache/util/run_vsim.sh
@@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
 [ ! -z "$VSIM" ] || VSIM=vsim
 
 call_vsim() {
-    echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
+    echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
     grep "Errors: 0," vsim.log
 }
 
diff --git a/hw/snitch_ssr/util/compile.sh b/hw/snitch_ssr/util/compile.sh
index 73ccc7fca..1a3678cfa 100755
--- a/hw/snitch_ssr/util/compile.sh
+++ b/hw/snitch_ssr/util/compile.sh
@@ -10,11 +10,11 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$BENDER script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
     --vlog-arg="+cover=sbecft" \
     > compile.tcl
 echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/tcdm_interface/util/compile.sh b/hw/tcdm_interface/util/compile.sh
index 73ccc7fca..1a3678cfa 100755
--- a/hw/tcdm_interface/util/compile.sh
+++ b/hw/tcdm_interface/util/compile.sh
@@ -10,11 +10,11 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$BENDER script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
     --vlog-arg="+cover=sbecft" \
     > compile.tcl
 echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/tcdm_interface/util/run_vsim.sh b/hw/tcdm_interface/util/run_vsim.sh
index 078ae72a8..6f10155d0 100755
--- a/hw/tcdm_interface/util/run_vsim.sh
+++ b/hw/tcdm_interface/util/run_vsim.sh
@@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
 [ ! -z "$VSIM" ] || VSIM=vsim
 
 call_vsim() {
-    echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
+    echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
     grep "Errors: 0," vsim.log
 }
 
diff --git a/mkdocs.yml b/mkdocs.yml
index 3f9595b0a..70d213601 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -22,6 +22,10 @@ markdown_extensions:
       emoji_generator: !!python/name:materialx.emoji.to_svg
 plugins:
   - include-markdown
+  - mkdocstrings:
+      handlers:
+        python:
+          paths: [util/sim]
   - macros:
       on_error_fail: true
 use_directory_urls: false
@@ -49,10 +53,15 @@ nav:
           - Custom Instructions: rm/custom_instructions.md
           # - Solder: rm/solder.md
       - Software:
-          - Pages: runtime/Pages/index.md
-          - Files: runtime/Files/index.md
-          - Classes: runtime/Classes/index.md
-          - Examples: runtime/Examples/index.md
-          - Modules: runtime/Modules/index.md
-          - Namespaces: runtime/Namespaces/index.md
+          - Simulation Utilities:
+              - sim_utils: rm/sim/sim_utils.md
+              - rm/sim/Simulation.md
+              - rm/sim/Simulator.md
+          - Snitch Runtime:
+              - Pages: runtime/Pages/index.md
+              - Files: runtime/Files/index.md
+              - Classes: runtime/Classes/index.md
+              - Examples: runtime/Examples/index.md
+              - Modules: runtime/Modules/index.md
+              - Namespaces: runtime/Namespaces/index.md
   - Publications: publications.md
diff --git a/python-requirements.txt b/python-requirements.txt
index d426cf140..6db0bf03f 100644
--- a/python-requirements.txt
+++ b/python-requirements.txt
@@ -19,6 +19,7 @@ pytablewriter
 termcolor
 pandas
 pyelftools
+psutil
 
 -r docs/requirements.txt
 -r sw/dnn/requirements.txt
diff --git a/sw/blas/gemm/Makefile b/sw/blas/gemm/Makefile
index 604556ed1..9605f07d7 100644
--- a/sw/blas/gemm/Makefile
+++ b/sw/blas/gemm/Makefile
@@ -9,16 +9,18 @@ MK_DIR   := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
 DATA_DIR := $(realpath $(MK_DIR)/data)
 SRC_DIR  := $(realpath $(MK_DIR)/src)
 
+DATA_CFG ?= $(DATA_DIR)/params.hjson
+SECTION  ?=
+
 APP     ?= gemm
 SRCS    ?= $(realpath $(SRC_DIR)/main.c)
 INCDIRS ?= $(DATA_DIR) $(SRC_DIR)
 
-DATA_CFG  ?= $(DATA_DIR)/params.hjson
 DATAGEN_PY = $(DATA_DIR)/datagen.py
 DATA_H     = $(DATA_DIR)/data.h
 
 $(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
-	$< -c $(DATA_CFG) > $@
+	$< -c $(DATA_CFG) --section="$(SECTION)" > $@
 
 .PHONY: clean-data clean
 
diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py
index c7c3fb9e0..25e2dca57 100755
--- a/sw/blas/gemm/data/datagen.py
+++ b/sw/blas/gemm/data/datagen.py
@@ -39,9 +39,13 @@
     'fp8alt': {'exp': 4, 'mant': 3}
 }
 
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
 
-def golden_model(a, b, alpha, c):
-    return np.matmul(a, b) + alpha * c
+
+def golden_model(alpha, a, b, beta, c):
+    return alpha * np.matmul(a, b) + beta * c
 
 
 def emit_header(**kwargs):
@@ -73,11 +77,14 @@ def emit_header(**kwargs):
             * (1.0 + mantissa_b.astype(np.double) / (2**2))
         _c = ((-1.0)**sign_c.astype(np.double))*(2.0**(exponent_c.astype(np.double)-15.0)) \
             * (1.0 + mantissa_c.astype(np.double) / (2**2))
-        result = np.matmul(_a, _b) + kwargs['alpha'] * _c
+        result = golden_model(1, _a, _b, kwargs['beta'], _c)
         a = sign_a << 7 | exponent_a << FP8_FORMATS['fp8']['mant'] | mantissa_a
         b = sign_b << 7 | exponent_b << FP8_FORMATS['fp8']['mant'] | mantissa_b
         c = sign_c << 7 | exponent_c << FP8_FORMATS['fp8']['mant'] | mantissa_c
     else:
+        a = np.random.rand(kwargs['M'], kwargs['K']).astype(dtype)
+        b = np.random.rand(kwargs['K'], kwargs['N']).astype(dtype)
+        c = np.random.rand(kwargs['M'], kwargs['N']).astype(dtype)
         if kwargs['linspace']:
             a = np.linspace(0.1, kwargs['M'] * kwargs['K'] + 0.1 -1, num=kwargs['M'] * kwargs['K']).reshape((kwargs['M'], kwargs['K'])).astype(dtype)
             b = np.linspace(0.2, kwargs['K'] * kwargs['N'] + 0.2 -1, num=kwargs['K'] * kwargs['N']).reshape((kwargs['K'], kwargs['N'])).astype(dtype)
@@ -86,7 +93,7 @@ def emit_header(**kwargs):
             a = np.random.rand(kwargs['M'], kwargs['K']).astype(dtype)
             b = np.random.rand(kwargs['K'], kwargs['N']).astype(dtype)
             c = np.random.rand(kwargs['M'], kwargs['N']).astype(dtype)
-        result = golden_model(a, b, kwargs['alpha'], c)
+        result = golden_model(1, a, b, kwargs['beta'], c)
 
     # Store matrices in transposed form if requested
     a = a.T if kwargs['ta'] else a
@@ -98,12 +105,15 @@ def emit_header(**kwargs):
     data_str += [format_scalar_definition('uint32_t', 'K', kwargs['K'])]
     data_str += [format_scalar_definition('uint32_t', 'TA', int(kwargs['ta']))]
     data_str += [format_scalar_definition('uint32_t', 'TB', int(kwargs['tb']))]
-    data_str += [format_scalar_definition('uint32_t', 'ALPHA', kwargs['alpha'])]
+    data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])]
     data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)]
     data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])]
-    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten())]
-    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten())]
-    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten())]
+    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
+                 alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
+                 alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(),
+                 alignment=BURST_ALIGNMENT, section=kwargs['section'])]
     if kwargs['prec'] == 8:
         result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten())
     else:
@@ -125,11 +135,16 @@ def main():
         required=True,
         help='Select param config file kernel'
     )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
     args = parser.parse_args()
 
     # Load param config file
     with args.cfg.open() as f:
         param = hjson.loads(f.read())
+    param['section'] = args.section
 
     # Emit header file
     print(emit_header(**param))
diff --git a/sw/blas/gemm/data/params.hjson b/sw/blas/gemm/data/params.hjson
index 63cdefd29..1428d1c99 100644
--- a/sw/blas/gemm/data/params.hjson
+++ b/sw/blas/gemm/data/params.hjson
@@ -8,7 +8,7 @@
     M: 192,
     N: 16,
     K: 16,
-    alpha: 0,
+    beta: 0,
     ta: false,
     tb: true, // must be true for SIMD
     prec: 64,
diff --git a/sw/blas/gemm/verify.py b/sw/blas/gemm/verify.py
index 3bae7f801..b6f886b7b 100755
--- a/sw/blas/gemm/verify.py
+++ b/sw/blas/gemm/verify.py
@@ -37,22 +37,27 @@ def main():
     a = np.array(bytes_to_doubles(elf.get_symbol_contents('a')))
     b = np.array(bytes_to_doubles(elf.get_symbol_contents('b')))
     c = np.array(bytes_to_doubles(elf.get_symbol_contents('c')))
-    alpha = bytes_to_uint32s(elf.get_symbol_contents('ALPHA'))[0]
+    beta = bytes_to_uint32s(elf.get_symbol_contents('BETA'))[0]
     m = bytes_to_uint32s(elf.get_symbol_contents('M'))[0]
     n = bytes_to_uint32s(elf.get_symbol_contents('N'))[0]
     k = bytes_to_uint32s(elf.get_symbol_contents('K'))[0]
     tb = bytes_to_uint32s(elf.get_symbol_contents('TB'))[0]
     a = np.reshape(a, (m, k))
-    b = np.reshape(b, (k, n))
     if tb:
+        b = np.reshape(b, (n, k))
         b = b.transpose()
+    else:
+        b = np.reshape(b, (k, n))
     c = np.reshape(c, (m, n))
 
     # Verify results
-    c_golden = golden_model(a, b, alpha, c).flatten()
+    c_golden = golden_model(1, a, b, beta, c).flatten()
 
     absolute_err = np.absolute(c_golden - c_actual)
     fail = np.any(absolute_err > ERR_THRESHOLD)
+    if (fail):
+        verification.dump_results_to_csv([c_golden, c_actual, absolute_err],
+                                         Path.cwd() / 'gemm_results.csv')
 
     return int(fail)
 
diff --git a/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch b/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch
new file mode 100644
index 000000000..068851f3c
--- /dev/null
+++ b/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch
@@ -0,0 +1,125 @@
+From 91c1b48e44629a80bdc1832111707c051ab0b3b2 Mon Sep 17 00:00:00 2001
+From: Luca Colagrande <luca.colagrande3@gmail.com>
+Date: Mon, 23 Oct 2023 14:30:18 +0200
+Subject: [PATCH] sw/math: Refactor to proper library
+
+The previous header-only library style led to conflicts on certain
+defines (for instance `N`) defined in both math library sources and
+application sources.
+---
+ Makefile       | 77 +++++++++++++++++++++++++++++++++++++++++++++++---
+ include/math.h |  3 --
+ 2 files changed, 73 insertions(+), 7 deletions(-)
+
+diff --git a/Makefile b/Makefile
+index 1327953..a6f7a1a 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,17 +1,86 @@
+-BITS_DIR = include/bits
++# Copyright 2023 ETH Zurich and University of Bologna.
++# Licensed under the Apache License, Version 2.0, see LICENSE for details.
++# SPDX-License-Identifier: Apache-2.0
++#
++# Luca Colagrande <colluca@iis.ee.ethz.ch>
++# Viviane Potocnik, ETH Zurich <vivianep@iis.ee.ethz.ch>
++
++# Usage of absolute paths is required to externally include
++# this Makefile from multiple different locations
++MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
++
++###############
++# Directories #
++###############
++
++BUILDDIR ?= $(abspath build)
++SRC_DIR   = $(MK_DIR)/src/math
++BITS_DIR  = $(MK_DIR)/include/bits
++
++###################
++# Build variables #
++###################
++
++INCDIRS += $(MK_DIR)/arch/riscv64/
++INCDIRS += $(MK_DIR)/arch/generic
++INCDIRS += $(MK_DIR)/src/include
++INCDIRS += $(MK_DIR)/src/internal
++INCDIRS += $(MK_DIR)/include/bits
++INCDIRS += $(MK_DIR)/include
++
++SRCS = $(abspath $(wildcard $(SRC_DIR)/*.c))
++
++###########
++# Outputs #
++###########
++
+ ALLTYPES_H = $(BITS_DIR)/alltypes.h
+ 
++OBJS        = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS)))))
++DEPS        = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS)))))
++LIB         = $(BUILDDIR)/libmath.a
++DUMP        = $(BUILDDIR)/libmath.dump
++ALL_OUTPUTS = $(LIB) $(DUMP)
+ 
+-.PHONY: all clean
++#########
++# Rules #
++#########
+ 
+-all: $(ALLTYPES_H)
++.PHONY: all
++all: $(ALL_OUTPUTS)
+ 
++.PHONY: clean
+ clean:
+ 	rm -rf $(BITS_DIR)
+ 	rm -f $(ALLTYPES_H)
++	rm -rf $(BUILDDIR)
+ 
+ $(BITS_DIR):
+ 	mkdir -p $@
+ 
+ $(ALLTYPES_H): | $(BITS_DIR)
+-	sed -f tools/mkalltypes.sed arch/riscv64/bits/alltypes.h.in include/alltypes.h.in > $@
++	sed -f $(MK_DIR)/tools/mkalltypes.sed $(MK_DIR)/arch/riscv64/bits/alltypes.h.in $(MK_DIR)/include/alltypes.h.in > $@
++
++$(DEPS): $(ALLTYPES_H)
++
++$(BUILDDIR):
++	mkdir -p $@
++
++$(BUILDDIR)/%.o: $(SRC_DIR)/%.S | $(BUILDDIR)
++	$(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
++
++$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR)
++	$(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
++
++$(BUILDDIR)/%.d: $(SRC_DIR)/%.c | $(BUILDDIR)
++	$(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@
++
++$(LIB): $(OBJS) | $(BUILDDIR)
++	$(RISCV_AR) $(RISCV_ARFLAGS) $@ $^
++
++$(DUMP): $(LIB) | $(BUILDDIR)
++	$(RISCV_OBJDUMP) -D $< > $@
++
++ifneq ($(MAKECMDGOALS),clean)
++-include $(DEPS)
++endif
+diff --git a/include/math.h b/include/math.h
+index 6dad71c..14f28ec 100644
+--- a/include/math.h
++++ b/include/math.h
+@@ -435,9 +435,6 @@ float       pow10f(float);
+ long double pow10l(long double);
+ #endif
+ 
+-#include "../src/math/expm1.c"
+-#include "../src/math/tanh.c"
+-
+ #ifdef __cplusplus
+ }
+ #endif
+-- 
+2.28.0
+
diff --git a/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch b/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch
new file mode 100644
index 000000000..050af9d33
--- /dev/null
+++ b/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch
@@ -0,0 +1,81 @@
+From eb96f4d7454a07498f571eb1ed18aa1db2413551 Mon Sep 17 00:00:00 2001
+From: Luca Colagrande <luca.colagrande3@gmail.com>
+Date: Mon, 23 Oct 2023 16:45:17 +0200
+Subject: [PATCH] `sw/math`: Add safe FP <--> INT conversions
+
+---
+ src/internal/libm.h | 51 +++++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 47 insertions(+), 4 deletions(-)
+
+diff --git a/src/internal/libm.h b/src/internal/libm.h
+index 72ad17d..60b9866 100644
+--- a/src/internal/libm.h
++++ b/src/internal/libm.h
+@@ -96,6 +96,47 @@ static int32_t converttoint(double_t);
+ #define predict_false(x) (x)
+ #endif
+ 
++/* FPU fence to synchronize the FPU and integer core in Snitch. */
++inline void snrt_fpu_fence() {
++    unsigned tmp;
++    __asm__ volatile(
++        "fmv.x.w %0, fa0\n"
++        "mv      %0, %0\n"
++        : "+r"(tmp)::"memory");
++}
++
++/* Synch-secure double to uint64 conversion functions. */
++static inline uint64_t asuint64(double f) {
++    uint64_t result;
++    snrt_fpu_fence();
++    result = *(uint64_t *)&f;
++    return result;
++}
++
++/* Synch-secure float to uint32 conversion: bit pattern of f as a uint32_t. */
++static inline uint32_t asuint(float f) {
++    uint32_t result;
++    snrt_fpu_fence();  /* fence first, then read the bits on the integer side */
++    result = *(uint32_t *)&f;
++    return result;  /* 32-bit, like the macro this function replaces */
++}
++
++/* Synch-secure uint64 to double conversion functions. */
++static inline double asdouble(uint64_t i) {
++    double result;
++    snrt_fpu_fence();
++    result = *(double *)&i;
++    return result;
++}
++
++/* Synch-secure uint to float conversion functions. */
++static inline float asfloat(uint32_t i) {
++	float result;
++	snrt_fpu_fence();
++	result = *(float *)&i;
++	return result;
++}
++
+ /* Evaluate an expression as the specified type. With standard excess
+    precision handling a type cast or assignment is enough (with
+    -ffloat-store an assignment is required, in old compilers argument
+@@ -187,10 +228,12 @@ static inline void fp_force_evall(long double x)
+ 	}                                         \
+ } while(0)
+ 
+-#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i
+-#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
+-#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i
+-#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
++// Unsafe in Snitch due to the decoupled FPU and integer
++// arithmetic units. Use at your own risk.
++#define asuint_unsafe(f) ((union{float _f; uint32_t _i;}){f})._i
++#define asfloat_unsafe(i) ((union{uint32_t _i; float _f;}){i})._f
++#define asuint64_unsafe(f) ((union{double _f; uint64_t _i;}){f})._i
++#define asdouble_unsafe(i) ((union{uint64_t _i; double _f;}){i})._f
+ 
+ #define EXTRACT_WORDS(hi,lo,d)                    \
+ do {                                              \
+-- 
+2.28.0
+
diff --git a/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch b/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch
new file mode 100644
index 000000000..cffc3c407
--- /dev/null
+++ b/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch
@@ -0,0 +1,149 @@
+From b419b07facc9591ba0d8683f53c9adefb8a9b0c6 Mon Sep 17 00:00:00 2001
+From: Luca Colagrande <luca.colagrande3@gmail.com>
+Date: Wed, 8 Nov 2023 09:35:17 +0100
+Subject: [PATCH] `sw/math`: Implement safe `tanh` function
+
+---
+ src/internal/libm.h | 31 +++++++++++++++++++++++++++++++
+ src/math/expm1.c    | 34 ++++++++++++++++++++++++++--------
+ src/math/tanh.c     | 17 ++++++++++++-----
+ 3 files changed, 69 insertions(+), 13 deletions(-)
+
+diff --git a/src/internal/libm.h b/src/internal/libm.h
+index 60b9866..c96c0ec 100644
+--- a/src/internal/libm.h
++++ b/src/internal/libm.h
+@@ -96,6 +96,37 @@ static int32_t converttoint(double_t);
+ #define predict_false(x) (x)
+ #endif
+ 
++/* Memory-consistent functions to manipulate the upper word of a
++   double-precision floating-point number in the integer core.
++   Since there is no dedicated instruction to move the upper 32-bits
++   of a double-precision floating point register to an integer register
++   the compiler resorts to moving the value through the memory. However in
++   Snitch neither the program ordering between floating-point and integer
++   instructions is guaranteed, nor is memory consistency between the integer
++   and floating-point threads.  */
++
++static inline uint32_t safe_extract_upper_32b_from_double(double x) {
++	double f;
++	uint32_t result;
++	asm volatile("fsd %[x], 0(%[ptr]) \n"
++	             "fld ft3, 0(%[ptr]) \n"
++				 "fmv.x.w t0, ft3 \n"
++				 "mv      t0, t0 \n"
++				 "lw %[result], 4(%[ptr]) \n"
++	 : [result]"=r"(result) : [x]"f"(x), [ptr]"r"(&f): "ft3", "t0", "memory");
++	return result;
++}
++
++static inline void safe_inject_into_upper_32b_double(uint32_t x, double *f) {
++	asm volatile("sw %[x], 4(%[ptr]) \n"  /* overwrite bytes 4..7 of *f; bytes 0..3 stay as they were */
++                 "lw %[x], 4(%[ptr]) \n"  /* read the word back into x, hence the read-write operand */
++                 "fmv.w.x ft3, %[x] \n"   /* NOTE(review): presumably stalls the FPU on the integer side - confirm */
++	 : [x]"+r"(x) : [ptr]"r"(f): "ft3", "memory");
++}
++
++/* TODO: the following functions are not really safe, compare previous two
++   functions */
++
+ /* FPU fence to synchronize the FPU and integer core in Snitch. */
+ inline void snrt_fpu_fence() {
+     unsigned tmp;
+diff --git a/src/math/expm1.c b/src/math/expm1.c
+index ac1e61e..d94f57f 100644
+--- a/src/math/expm1.c
++++ b/src/math/expm1.c
+@@ -121,9 +121,14 @@ Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
+ double expm1(double x)
+ {
+ 	double_t y,hi,lo,c,t,e,hxs,hfx,r1,twopk;
+-	union {double f; uint64_t i;} u = {x};
+-	uint32_t hx = u.i>>32 & 0x7fffffff;
+-	int k, sign = u.i>>63;
++	/// Original implementation
++	// union {double f; uint64_t i;} u = {x};
++	// uint32_t hx = u.i>>32 & 0x7fffffff;
++	// int k, sign = u.i>>63;
++	/// Safe implementation in Snitch
++	uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
++	uint32_t hx = upper_32b_x & 0x7fffffff;
++	int k, sign = upper_32b_x>>31;
+ 
+ 	/* filter out huge and non-finite argument */
+ 	if (hx >= 0x4043687A) {  /* if |x|>=56*ln2 */
+@@ -182,8 +187,12 @@ double expm1(double x)
+ 			return -2.0*(e-(x+0.5));
+ 		return 1.0+2.0*(x-e);
+ 	}
+-	u.i = (uint64_t)(0x3ff + k)<<52;  /* 2^k */
+-	twopk = u.f;
++	/// Original implementation: u.i = (uint64_t)(0x3ff + k)<<52; twopk = u.f;  /* 2^k */
++	/// Safe implementation in Snitch: only the upper word of twopk is injected,
++	/// so clear it first to give the lower word a defined (zero) value.
++	uint32_t u_i = (uint32_t)(0x3ff + k)<<20;
++	twopk = 0;
++	safe_inject_into_upper_32b_double(u_i, &twopk);
+ 	if (k < 0 || k > 56) {  /* suffice to return exp(x)-1 */
+ 		y = x - e + 1.0;
+ 		if (k == 1024)
+@@ -192,10 +201,19 @@ double expm1(double x)
+ 			y = y*twopk;
+ 		return y - 1.0;
+ 	}
+-	u.i = (uint64_t)(0x3ff - k)<<52;  /* 2^-k */
++	/// Original implementation
++	// u.i = (uint64_t)(0x3ff - k)<<52;  /* 2^-k */
++	// if (k < 20)
++	// 	y = (x-e+(1-u.f))*twopk;
++	// else
++	// 	y = (x-(e+u.f)+1)*twopk;
++	/// Safe implementation in Snitch
++	u_i = (uint32_t)(0x3ff - k)<<20;
++	double u_f = 0;
++	safe_inject_into_upper_32b_double(u_i, &u_f);
+ 	if (k < 20)
+-		y = (x-e+(1-u.f))*twopk;
++		y = (x-e+(1-u_f))*twopk;
+ 	else
+-		y = (x-(e+u.f)+1)*twopk;
++		y = (x-(e+u_f)+1)*twopk;
+ 	return y;
+ }
+diff --git a/src/math/tanh.c b/src/math/tanh.c
+index 20d6dbc..2481db1 100644
+--- a/src/math/tanh.c
++++ b/src/math/tanh.c
+@@ -6,16 +6,23 @@
+  */
+ double tanh(double x)
+ {
+-	union {double f; uint64_t i;} u = {.f = x};
+ 	uint32_t w;
+ 	int sign;
+ 	double_t t;
+ 
+ 	/* x = |x| */
+-	sign = u.i >> 63;
+-	u.i &= (uint64_t)-1/2;
+-	x = u.f;
+-	w = u.i >> 32;
++	/// Original implementation
++	// union {double f; uint64_t i;} u = {.f = x};
++	// sign = u.i >> 63;
++	// u.i &= (uint64_t)-1/2;
++	// x = u.f;
++	// w = u.i >> 32;
++	/// Safe implementation in Snitch
++	uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
++	sign = upper_32b_x >> 31;
++	uint32_t sign_mask = (~(1 << 31));
++	w = upper_32b_x & sign_mask;
++	safe_inject_into_upper_32b_double(w, &x);
+ 
+ 	if (w > 0x3fe193ea) {
+ 		/* |x| > log(3)/2 ~= 0.5493 or nan */
+-- 
+2.28.0
+
diff --git a/sw/math/Makefile b/sw/math/Makefile
index 132795388..afb3192d1 100644
--- a/sw/math/Makefile
+++ b/sw/math/Makefile
@@ -1,17 +1,86 @@
-BITS_DIR = include/bits
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+# Viviane Potocnik, ETH Zurich <vivianep@iis.ee.ethz.ch>
+
+# Usage of absolute paths is required to externally include
+# this Makefile from multiple different locations
+MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+
+###############
+# Directories #
+###############
+
+BUILDDIR ?= $(abspath build)
+SRC_DIR   = $(MK_DIR)/src/math
+BITS_DIR  = $(MK_DIR)/include/bits
+
+###################
+# Build variables #
+###################
+
+INCDIRS += $(MK_DIR)/arch/riscv64/
+INCDIRS += $(MK_DIR)/arch/generic
+INCDIRS += $(MK_DIR)/src/include
+INCDIRS += $(MK_DIR)/src/internal
+INCDIRS += $(MK_DIR)/include/bits
+INCDIRS += $(MK_DIR)/include
+
+SRCS = $(abspath $(wildcard $(SRC_DIR)/*.c))
+
+###########
+# Outputs #
+###########
+
 ALLTYPES_H = $(BITS_DIR)/alltypes.h
 
+OBJS        = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS)))))
+DEPS        = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS)))))
+LIB         = $(BUILDDIR)/libmath.a
+DUMP        = $(BUILDDIR)/libmath.dump
+ALL_OUTPUTS = $(LIB) $(DUMP)
 
-.PHONY: all clean
+#########
+# Rules #
+#########
 
-all: $(ALLTYPES_H)
+.PHONY: all
+all: $(ALL_OUTPUTS)
 
+.PHONY: clean
 clean:
 	rm -rf $(BITS_DIR)
 	rm -f $(ALLTYPES_H)
+	rm -rf $(BUILDDIR)
 
 $(BITS_DIR):
 	mkdir -p $@
 
 $(ALLTYPES_H): | $(BITS_DIR)
-	sed -f tools/mkalltypes.sed arch/riscv64/bits/alltypes.h.in include/alltypes.h.in > $@
+	sed -f $(MK_DIR)/tools/mkalltypes.sed $(MK_DIR)/arch/riscv64/bits/alltypes.h.in $(MK_DIR)/include/alltypes.h.in > $@
+
+$(DEPS): $(ALLTYPES_H)
+
+$(BUILDDIR):
+	mkdir -p $@
+
+$(BUILDDIR)/%.o: $(SRC_DIR)/%.S | $(BUILDDIR)
+	$(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
+
+$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR)
+	$(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
+
+$(BUILDDIR)/%.d: $(SRC_DIR)/%.c | $(BUILDDIR)
+	$(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@
+
+$(LIB): $(OBJS) | $(BUILDDIR)
+	$(RISCV_AR) $(RISCV_ARFLAGS) $@ $^
+
+$(DUMP): $(LIB) | $(BUILDDIR)
+	$(RISCV_OBJDUMP) -D $< > $@
+
+ifneq ($(MAKECMDGOALS),clean)
+-include $(DEPS)
+endif
diff --git a/sw/math/include/float.h b/sw/math/include/float.h
new file mode 100644
index 000000000..713aadb90
--- /dev/null
+++ b/sw/math/include/float.h
@@ -0,0 +1,52 @@
+#ifndef _FLOAT_H
+#define _FLOAT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int __flt_rounds(void);
+#define FLT_ROUNDS (__flt_rounds())
+
+#define FLT_RADIX 2
+
+#define FLT_TRUE_MIN 1.40129846432481707092e-45F
+#define FLT_MIN 1.17549435082228750797e-38F
+#define FLT_MAX 3.40282346638528859812e+38F
+#define FLT_EPSILON 1.1920928955078125e-07F
+
+#define FLT_MANT_DIG 24
+#define FLT_MIN_EXP (-125)
+#define FLT_MAX_EXP 128
+#define FLT_HAS_SUBNORM 1
+
+#define FLT_DIG 6
+#define FLT_DECIMAL_DIG 9
+#define FLT_MIN_10_EXP (-37)
+#define FLT_MAX_10_EXP 38
+
+#define DBL_TRUE_MIN 4.94065645841246544177e-324
+#define DBL_MIN 2.22507385850720138309e-308
+#define DBL_MAX 1.79769313486231570815e+308
+#define DBL_EPSILON 2.22044604925031308085e-16
+
+#define DBL_MANT_DIG 53
+#define DBL_MIN_EXP (-1021)
+#define DBL_MAX_EXP 1024
+#define DBL_HAS_SUBNORM 1
+
+#define DBL_DIG 15
+#define DBL_DECIMAL_DIG 17
+#define DBL_MIN_10_EXP (-307)
+#define DBL_MAX_10_EXP 308
+
+#define LDBL_HAS_SUBNORM 1
+#define LDBL_DECIMAL_DIG DECIMAL_DIG
+
+#include <bits/float.h>
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sw/math/include/math.h b/sw/math/include/math.h
index 6dad71c1e..14f28ec8c 100644
--- a/sw/math/include/math.h
+++ b/sw/math/include/math.h
@@ -435,9 +435,6 @@ float       pow10f(float);
 long double pow10l(long double);
 #endif
 
-#include "../src/math/expm1.c"
-#include "../src/math/tanh.c"
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/sw/math/src/internal/libm.h b/sw/math/src/internal/libm.h
index 72ad17d8e..c96c0eced 100644
--- a/sw/math/src/internal/libm.h
+++ b/sw/math/src/internal/libm.h
@@ -96,6 +96,78 @@ static int32_t converttoint(double_t);
 #define predict_false(x) (x)
 #endif
 
+/* Memory-consistent functions to manipulate the upper word of a
+   double-precision floating-point number in the integer core.
+   Since there is no dedicated instruction to move the upper 32-bits
+   of a double-precision floating point register to an integer register
+   the compiler resorts to moving the value through the memory. However in
+   Snitch neither the program ordering between floating-point and integer
+   instructions is guaranteed, nor is memory consistency between the integer
+   and floating-point threads.  */
+
+static inline uint32_t safe_extract_upper_32b_from_double(double x) {
+	double f;
+	uint32_t result;
+	asm volatile("fsd %[x], 0(%[ptr]) \n"
+	             "fld ft3, 0(%[ptr]) \n"
+				 "fmv.x.w t0, ft3 \n"
+				 "mv      t0, t0 \n"
+				 "lw %[result], 4(%[ptr]) \n"
+	 : [result]"=r"(result) : [x]"f"(x), [ptr]"r"(&f): "ft3", "t0", "memory");
+	return result;
+}
+
+static inline void safe_inject_into_upper_32b_double(uint32_t x, double *f) {
+	asm volatile("sw %[x], 4(%[ptr]) \n"  /* overwrite bytes 4..7 of *f; bytes 0..3 stay as they were */
+                 "lw %[x], 4(%[ptr]) \n"  /* read the word back into x, hence the read-write operand */
+                 "fmv.w.x ft3, %[x] \n"   /* NOTE(review): presumably stalls the FPU on the integer side - confirm */
+	 : [x]"+r"(x) : [ptr]"r"(f): "ft3", "memory");
+}
+
+/* TODO: the following functions are not really safe, compare previous two
+   functions */
+
+/* FPU fence to synchronize the FPU and integer core in Snitch. */
+inline void snrt_fpu_fence() {
+    unsigned tmp;
+    __asm__ volatile(
+        "fmv.x.w %0, fa0\n"
+        "mv      %0, %0\n"
+        : "+r"(tmp)::"memory");
+}
+
+/* Synch-secure double to uint64 conversion functions. */
+static inline uint64_t asuint64(double f) {
+    uint64_t result;
+    snrt_fpu_fence();
+    result = *(uint64_t *)&f;
+    return result;
+}
+
+/* Synch-secure float to uint32 conversion: bit pattern of f as a uint32_t. */
+static inline uint32_t asuint(float f) {
+    uint32_t result;
+    snrt_fpu_fence();  /* fence first, then read the bits on the integer side */
+    result = *(uint32_t *)&f;
+    return result;  /* 32-bit, like the macro this function replaces */
+}
+
+/* Synch-secure uint64 to double conversion functions. */
+static inline double asdouble(uint64_t i) {
+    double result;
+    snrt_fpu_fence();
+    result = *(double *)&i;
+    return result;
+}
+
+/* Synch-secure uint to float conversion functions. */
+static inline float asfloat(uint32_t i) {
+	float result;
+	snrt_fpu_fence();
+	result = *(float *)&i;
+	return result;
+}
+
 /* Evaluate an expression as the specified type. With standard excess
    precision handling a type cast or assignment is enough (with
    -ffloat-store an assignment is required, in old compilers argument
@@ -187,10 +259,12 @@ static inline void fp_force_evall(long double x)
 	}                                         \
 } while(0)
 
-#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i
-#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
-#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i
-#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
+// Unsafe in Snitch due to the decoupled FPU and integer
+// arithmetic units. Use at your own risk.
+#define asuint_unsafe(f) ((union{float _f; uint32_t _i;}){f})._i
+#define asfloat_unsafe(i) ((union{uint32_t _i; float _f;}){i})._f
+#define asuint64_unsafe(f) ((union{double _f; uint64_t _i;}){f})._i
+#define asdouble_unsafe(i) ((union{uint64_t _i; double _f;}){i})._f
 
 #define EXTRACT_WORDS(hi,lo,d)                    \
 do {                                              \
diff --git a/sw/math/src/math/__math_divzero.c b/sw/math/src/math/__math_divzero.c
new file mode 100644
index 000000000..59d213500
--- /dev/null
+++ b/sw/math/src/math/__math_divzero.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+double __math_divzero(uint32_t sign)
+{
+	return fp_barrier(sign ? -1.0 : 1.0) / 0.0;
+}
diff --git a/sw/math/src/math/__math_invalid.c b/sw/math/src/math/__math_invalid.c
new file mode 100644
index 000000000..177404900
--- /dev/null
+++ b/sw/math/src/math/__math_invalid.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+double __math_invalid(double x)
+{
+	return (x - x) / (x - x);
+}
diff --git a/sw/math/src/math/__math_invalidf.c b/sw/math/src/math/__math_invalidf.c
new file mode 100644
index 000000000..357d4b121
--- /dev/null
+++ b/sw/math/src/math/__math_invalidf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+float __math_invalidf(float x)
+{
+	return (x - x) / (x - x);
+}
diff --git a/sw/math/src/math/__math_invalidl.c b/sw/math/src/math/__math_invalidl.c
new file mode 100644
index 000000000..1fca99de4
--- /dev/null
+++ b/sw/math/src/math/__math_invalidl.c
@@ -0,0 +1,9 @@
+#include <float.h>
+#include "libm.h"
+
+#if LDBL_MANT_DIG != DBL_MANT_DIG
+long double __math_invalidl(long double x)
+{
+	return (x - x) / (x - x);
+}
+#endif
diff --git a/sw/math/src/math/__math_oflow.c b/sw/math/src/math/__math_oflow.c
new file mode 100644
index 000000000..c85dbf982
--- /dev/null
+++ b/sw/math/src/math/__math_oflow.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+double __math_oflow(uint32_t sign)
+{
+	return __math_xflow(sign, 0x1p769);
+}
diff --git a/sw/math/src/math/__math_oflowf.c b/sw/math/src/math/__math_oflowf.c
new file mode 100644
index 000000000..fa7d06208
--- /dev/null
+++ b/sw/math/src/math/__math_oflowf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+float __math_oflowf(uint32_t sign)
+{
+	return __math_xflowf(sign, 0x1p97f);
+}
diff --git a/sw/math/src/math/__math_uflow.c b/sw/math/src/math/__math_uflow.c
new file mode 100644
index 000000000..b90594aee
--- /dev/null
+++ b/sw/math/src/math/__math_uflow.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+double __math_uflow(uint32_t sign)
+{
+	return __math_xflow(sign, 0x1p-767);
+}
diff --git a/sw/math/src/math/__math_uflowf.c b/sw/math/src/math/__math_uflowf.c
new file mode 100644
index 000000000..94d50f2bf
--- /dev/null
+++ b/sw/math/src/math/__math_uflowf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+float __math_uflowf(uint32_t sign)
+{
+	return __math_xflowf(sign, 0x1p-95f);
+}
diff --git a/sw/math/src/math/__math_xflow.c b/sw/math/src/math/__math_xflow.c
new file mode 100644
index 000000000..744203c4c
--- /dev/null
+++ b/sw/math/src/math/__math_xflow.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+double __math_xflow(uint32_t sign, double y)
+{
+	return eval_as_double(fp_barrier(sign ? -y : y) * y);
+}
diff --git a/sw/math/src/math/__math_xflowf.c b/sw/math/src/math/__math_xflowf.c
new file mode 100644
index 000000000..f2c84784f
--- /dev/null
+++ b/sw/math/src/math/__math_xflowf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+float __math_xflowf(uint32_t sign, float y)
+{
+	return eval_as_float(fp_barrierf(sign ? -y : y) * y);
+}
diff --git a/sw/math/src/math/ceil.c b/sw/math/src/math/ceil.c
new file mode 100644
index 000000000..b13e6f2d6
--- /dev/null
+++ b/sw/math/src/math/ceil.c
@@ -0,0 +1,31 @@
+#include "libm.h"
+
+#if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1
+#define EPS DBL_EPSILON
+#elif FLT_EVAL_METHOD==2
+#define EPS LDBL_EPSILON
+#endif
+static const double_t toint = 1/EPS;
+
+double ceil(double x)
+{
+	union {double f; uint64_t i;} u = {x};
+	int e = u.i >> 52 & 0x7ff;
+	double_t y;
+
+	if (e >= 0x3ff+52 || x == 0)
+		return x;
+	/* y = int(x) - x, where int(x) is an integer neighbor of x */
+	if (u.i >> 63)
+		y = x - toint + toint - x;
+	else
+		y = x + toint - toint - x;
+	/* special case because of non-nearest rounding modes */
+	if (e <= 0x3ff-1) {
+		FORCE_EVAL(y);
+		return u.i >> 63 ? -0.0 : 1;
+	}
+	if (y < 0)
+		return x + y + 1;
+	return x + y;
+}
diff --git a/sw/math/src/math/ceilf.c b/sw/math/src/math/ceilf.c
new file mode 100644
index 000000000..869835f39
--- /dev/null
+++ b/sw/math/src/math/ceilf.c
@@ -0,0 +1,27 @@
+#include "libm.h"
+
+float ceilf(float x)
+{
+	union {float f; uint32_t i;} u = {x};
+	int e = (int)(u.i >> 23 & 0xff) - 0x7f;
+	uint32_t m;
+
+	if (e >= 23)
+		return x;
+	if (e >= 0) {
+		m = 0x007fffff >> e;
+		if ((u.i & m) == 0)
+			return x;
+		FORCE_EVAL(x + 0x1p120f);
+		if (u.i >> 31 == 0)
+			u.i += m;
+		u.i &= ~m;
+	} else {
+		FORCE_EVAL(x + 0x1p120f);
+		if (u.i >> 31)
+			u.f = -0.0;
+		else if (u.i << 1)
+			u.f = 1.0;
+	}
+	return u.f;
+}
diff --git a/sw/math/src/math/ceill.c b/sw/math/src/math/ceill.c
new file mode 100644
index 000000000..60a83020d
--- /dev/null
+++ b/sw/math/src/math/ceill.c
@@ -0,0 +1,34 @@
+#include "libm.h"
+
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
+long double ceill(long double x)
+{
+	return ceil(x);
+}
+#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384
+
+static const long double toint = 1/LDBL_EPSILON;
+
+long double ceill(long double x)
+{
+	union ldshape u = {x};
+	int e = u.i.se & 0x7fff;
+	long double y;
+
+	if (e >= 0x3fff+LDBL_MANT_DIG-1 || x == 0)
+		return x;
+	/* y = int(x) - x, where int(x) is an integer neighbor of x */
+	if (u.i.se >> 15)
+		y = x - toint + toint - x;
+	else
+		y = x + toint - toint - x;
+	/* special case because of non-nearest rounding modes */
+	if (e <= 0x3fff-1) {
+		FORCE_EVAL(y);
+		return u.i.se >> 15 ? -0.0 : 1;
+	}
+	if (y < 0)
+		return x + y + 1;
+	return x + y;
+}
+#endif
diff --git a/sw/math/src/math/exp2f_data.c b/sw/math/src/math/exp2f_data.c
new file mode 100644
index 000000000..be324727f
--- /dev/null
+++ b/sw/math/src/math/exp2f_data.c
@@ -0,0 +1,35 @@
+/*
+ * Shared data between expf, exp2f and powf.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "exp2f_data.h"
+
+#define N (1 << EXP2F_TABLE_BITS)
+
+const struct exp2f_data __exp2f_data = {
+  /* tab[i] = uint(2^(i/N)) - (i << 52-BITS)
+     used for computing 2^(k/N) for an int |k| < 150 N as
+     double(tab[k%N] + (k << 52-BITS)) */
+  .tab = {
+0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
+0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
+0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
+0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
+0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
+0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
+  },
+  .shift_scaled = 0x1.8p+52 / N,
+  .poly = {
+  0x1.c6af84b912394p-5, 0x1.ebfce50fac4f3p-3, 0x1.62e42ff0c52d6p-1,
+  },
+  .shift = 0x1.8p+52,
+  .invln2_scaled = 0x1.71547652b82fep+0 * N,
+  .poly_scaled = {
+  0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N,
+  },
+};
diff --git a/sw/math/src/math/exp2f_data.h b/sw/math/src/math/exp2f_data.h
new file mode 100644
index 000000000..fe744f15b
--- /dev/null
+++ b/sw/math/src/math/exp2f_data.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef _EXP2F_DATA_H
+#define _EXP2F_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* Shared between expf, exp2f and powf.  */
+#define EXP2F_TABLE_BITS 5
+#define EXP2F_POLY_ORDER 3
+extern hidden const struct exp2f_data {
+	uint64_t tab[1 << EXP2F_TABLE_BITS];
+	double shift_scaled;
+	double poly[EXP2F_POLY_ORDER];
+	double shift;
+	double invln2_scaled;
+	double poly_scaled[EXP2F_POLY_ORDER];
+} __exp2f_data;
+
+#endif
diff --git a/sw/math/src/math/expf.c b/sw/math/src/math/expf.c
new file mode 100644
index 000000000..f9fbf8e72
--- /dev/null
+++ b/sw/math/src/math/expf.c
@@ -0,0 +1,80 @@
+/*
+ * Single-precision e^x function.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "libm.h"
+#include "exp2f_data.h"
+
+/*
+EXP2F_TABLE_BITS = 5
+EXP2F_POLY_ORDER = 3
+
+ULP error: 0.502 (nearest rounding.)
+Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.)
+Wrong count: 170635 (all nearest rounding wrong results with fma.)
+Non-nearest ULP error: 1 (rounded ULP error)
+*/
+
+#define N (1 << EXP2F_TABLE_BITS)
+#define InvLn2N __exp2f_data.invln2_scaled
+#define T __exp2f_data.tab
+#define C __exp2f_data.poly_scaled
+
+static inline uint32_t top12(float x)
+{
+	return asuint(x) >> 20;
+}
+
+float expf(float x)
+{
+	uint32_t abstop;
+	uint64_t ki, t;
+	double_t kd, xd, z, r, r2, y, s;
+
+	xd = (double_t)x;
+	abstop = top12(x) & 0x7ff;
+	if (predict_false(abstop >= top12(88.0f))) {
+		/* |x| >= 88 or x is nan.  */
+		if (asuint(x) == asuint(-INFINITY))
+			return 0.0f;
+		if (abstop >= top12(INFINITY))
+			return x + x;
+		if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */
+			return __math_oflowf(0);
+		if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */
+			return __math_uflowf(0);
+	}
+
+	/* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k.  */
+	z = InvLn2N * xd;
+
+	/* Round and convert z to int, the result is in [-150*N, 128*N] and
+	   ideally ties-to-even rule is used, otherwise the magnitude of r
+	   can be bigger which gives larger approximation error.  */
+#if TOINT_INTRINSICS
+	kd = roundtoint(z);
+	ki = converttoint(z);
+#else
+# define SHIFT __exp2f_data.shift
+	kd = eval_as_double(z + SHIFT);
+	ki = asuint64(kd);
+	kd -= SHIFT;
+#endif
+	r = z - kd;
+
+	/* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
+	t = T[ki % N];
+	t += ki << (52 - EXP2F_TABLE_BITS);
+	s = asdouble(t);
+	z = C[0] * r + C[1];
+	r2 = r * r;
+	y = C[2] * r + 1;
+	y = z * r2 + y;
+	y = y * s;
+	return eval_as_float(y);
+}
diff --git a/sw/math/src/math/expm1.c b/sw/math/src/math/expm1.c
index ac1e61e4f..d94f57fe5 100644
--- a/sw/math/src/math/expm1.c
+++ b/sw/math/src/math/expm1.c
@@ -121,9 +121,14 @@ Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
 double expm1(double x)
 {
 	double_t y,hi,lo,c,t,e,hxs,hfx,r1,twopk;
-	union {double f; uint64_t i;} u = {x};
-	uint32_t hx = u.i>>32 & 0x7fffffff;
-	int k, sign = u.i>>63;
+	/// Original implementation
+	// union {double f; uint64_t i;} u = {x};
+	// uint32_t hx = u.i>>32 & 0x7fffffff;
+	// int k, sign = u.i>>63;
+	/// Safe implementation in Snitch
+	uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
+	uint32_t hx = upper_32b_x & 0x7fffffff;
+	int k, sign = upper_32b_x>>31;
 
 	/* filter out huge and non-finite argument */
 	if (hx >= 0x4043687A) {  /* if |x|>=56*ln2 */
@@ -182,8 +187,12 @@ double expm1(double x)
 			return -2.0*(e-(x+0.5));
 		return 1.0+2.0*(x-e);
 	}
-	u.i = (uint64_t)(0x3ff + k)<<52;  /* 2^k */
-	twopk = u.f;
+	/// Original implementation: u.i = (uint64_t)(0x3ff + k)<<52; twopk = u.f;  /* 2^k */
+	/// Safe implementation in Snitch: only the upper word of twopk is injected,
+	/// so clear it first to give the lower word a defined (zero) value.
+	uint32_t u_i = (uint32_t)(0x3ff + k)<<20;
+	twopk = 0;
+	safe_inject_into_upper_32b_double(u_i, &twopk);
 	if (k < 0 || k > 56) {  /* suffice to return exp(x)-1 */
 		y = x - e + 1.0;
 		if (k == 1024)
@@ -192,10 +201,19 @@ double expm1(double x)
 			y = y*twopk;
 		return y - 1.0;
 	}
-	u.i = (uint64_t)(0x3ff - k)<<52;  /* 2^-k */
+	/// Original implementation
+	// u.i = (uint64_t)(0x3ff - k)<<52;  /* 2^-k */
+	// if (k < 20)
+	// 	y = (x-e+(1-u.f))*twopk;
+	// else
+	// 	y = (x-(e+u.f)+1)*twopk;
+	/// Safe implementation in Snitch
+	u_i = (uint32_t)(0x3ff - k)<<20;
+	double u_f = 0;
+	safe_inject_into_upper_32b_double(u_i, &u_f);
 	if (k < 20)
-		y = (x-e+(1-u.f))*twopk;
+		y = (x-e+(1-u_f))*twopk;
 	else
-		y = (x-(e+u.f)+1)*twopk;
+		y = (x-(e+u_f)+1)*twopk;
 	return y;
 }
diff --git a/sw/math/src/math/log2.c b/sw/math/src/math/log2.c
new file mode 100644
index 000000000..1276ed4e3
--- /dev/null
+++ b/sw/math/src/math/log2.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision log2(x) function.
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "libm.h"
+#include "log2_data.h"
+
+#define T __log2_data.tab
+#define T2 __log2_data.tab2
+#define B __log2_data.poly1
+#define A __log2_data.poly
+#define InvLn2hi __log2_data.invln2hi
+#define InvLn2lo __log2_data.invln2lo
+#define N (1 << LOG2_TABLE_BITS)
+#define OFF 0x3fe6000000000000
+
+/* Top 16 bits of a double.  */
+static inline uint32_t top16(double x)
+{
+	return asuint64(x) >> 48;
+}
+
+double log2(double x)
+{
+	double_t z, r, r2, r4, y, invc, logc, kd, hi, lo, t1, t2, t3, p;
+	uint64_t ix, iz, tmp;
+	uint32_t top;
+	int k, i;
+
+	ix = asuint64(x);
+	top = top16(x);
+#define LO asuint64(1.0 - 0x1.5b51p-5)
+#define HI asuint64(1.0 + 0x1.6ab2p-5)
+	if (predict_false(ix - LO < HI - LO)) {
+		/* Handle close to 1.0 inputs separately.  */
+		/* Fix sign of zero with downward rounding when x==1.  */
+		if (WANT_ROUNDING && predict_false(ix == asuint64(1.0)))
+			return 0;
+		r = x - 1.0;
+#if __FP_FAST_FMA
+		hi = r * InvLn2hi;
+		lo = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -hi);
+#else
+		double_t rhi, rlo;
+		rhi = asdouble(asuint64(r) & -1ULL << 32);
+		rlo = r - rhi;
+		hi = rhi * InvLn2hi;
+		lo = rlo * InvLn2hi + r * InvLn2lo;
+#endif
+		r2 = r * r; /* rounding error: 0x1p-62.  */
+		r4 = r2 * r2;
+		/* Worst-case error is less than 0.54 ULP (0.55 ULP without fma).  */
+		p = r2 * (B[0] + r * B[1]);
+		y = hi + p;
+		lo += hi - y + p;
+		lo += r4 * (B[2] + r * B[3] + r2 * (B[4] + r * B[5]) +
+			    r4 * (B[6] + r * B[7] + r2 * (B[8] + r * B[9])));
+		y += lo;
+		return eval_as_double(y);
+	}
+	if (predict_false(top - 0x0010 >= 0x7ff0 - 0x0010)) {
+		/* x < 0x1p-1022 or inf or nan.  */
+		if (ix * 2 == 0)
+			return __math_divzero(1);
+		if (ix == asuint64(INFINITY)) /* log(inf) == inf.  */
+			return x;
+		if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0)
+			return __math_invalid(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint64(x * 0x1p52);
+		ix -= 52ULL << 52;
+	}
+
+	/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+	   The range is split into N subintervals.
+	   The ith subinterval contains z and c is near its center.  */
+	tmp = ix - OFF;
+	i = (tmp >> (52 - LOG2_TABLE_BITS)) % N;
+	k = (int64_t)tmp >> 52; /* arithmetic shift */
+	iz = ix - (tmp & 0xfffULL << 52);
+	invc = T[i].invc;
+	logc = T[i].logc;
+	z = asdouble(iz);
+	kd = (double_t)k;
+
+	/* log2(x) = log2(z/c) + log2(c) + k.  */
+	/* r ~= z/c - 1, |r| < 1/(2*N).  */
+#if __FP_FAST_FMA
+	/* rounding error: 0x1p-55/N.  */
+	r = __builtin_fma(z, invc, -1.0);
+	t1 = r * InvLn2hi;
+	t2 = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -t1);
+#else
+	double_t rhi, rlo;
+	/* rounding error: 0x1p-55/N + 0x1p-65.  */
+	r = (z - T2[i].chi - T2[i].clo) * invc;
+	rhi = asdouble(asuint64(r) & -1ULL << 32);
+	rlo = r - rhi;
+	t1 = rhi * InvLn2hi;
+	t2 = rlo * InvLn2hi + r * InvLn2lo;
+#endif
+
+	/* hi + lo = r/ln2 + log2(c) + k.  */
+	t3 = kd + logc;
+	hi = t3 + t1;
+	lo = t3 - hi + t1 + t2;
+
+	/* log2(r+1) = r/ln2 + r^2*poly(r).  */
+	/* Evaluation is optimized assuming superscalar pipelined execution.  */
+	r2 = r * r; /* rounding error: 0x1p-54/N^2.  */
+	r4 = r2 * r2;
+	/* Worst-case error if |y| > 0x1p-4: 0.547 ULP (0.550 ULP without fma).
+	   ~ 0.5 + 2/N/ln2 + abs-poly-error*0x1p56 ULP (+ 0.003 ULP without fma).  */
+	p = A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + r4 * (A[4] + r * A[5]);
+	y = lo + r2 * p + hi;
+	return eval_as_double(y);
+}
diff --git a/sw/math/src/math/log2_data.c b/sw/math/src/math/log2_data.c
new file mode 100644
index 000000000..3dd1ca514
--- /dev/null
+++ b/sw/math/src/math/log2_data.c
@@ -0,0 +1,201 @@
+/*
+ * Data for log2.
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "log2_data.h"
+
+#define N (1 << LOG2_TABLE_BITS)
+
+const struct log2_data __log2_data = {
+// First coefficient: 0x1.71547652b82fe1777d0ffda0d24p0
+.invln2hi = 0x1.7154765200000p+0,
+.invln2lo = 0x1.705fc2eefa200p-33,
+.poly1 = {
+// relative error: 0x1.2fad8188p-63
+// in -0x1.5b51p-5 0x1.6ab2p-5
+-0x1.71547652b82fep-1,
+0x1.ec709dc3a03f7p-2,
+-0x1.71547652b7c3fp-2,
+0x1.2776c50f05be4p-2,
+-0x1.ec709dd768fe5p-3,
+0x1.a61761ec4e736p-3,
+-0x1.7153fbc64a79bp-3,
+0x1.484d154f01b4ap-3,
+-0x1.289e4a72c383cp-3,
+0x1.0b32f285aee66p-3,
+},
+.poly = {
+// relative error: 0x1.a72c2bf8p-58
+// abs error: 0x1.67a552c8p-66
+// in -0x1.f45p-8 0x1.f45p-8
+-0x1.71547652b8339p-1,
+0x1.ec709dc3a04bep-2,
+-0x1.7154764702ffbp-2,
+0x1.2776c50034c48p-2,
+-0x1.ec7b328ea92bcp-3,
+0x1.a6225e117f92ep-3,
+},
+/* Algorithm:
+
+	x = 2^k z
+	log2(x) = k + log2(c) + log2(z/c)
+	log2(z/c) = poly(z/c - 1)
+
+where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls
+into the ith one, then table entries are computed as
+
+	tab[i].invc = 1/c
+	tab[i].logc = (double)log2(c)
+	tab2[i].chi = (double)c
+	tab2[i].clo = (double)(c - (double)c)
+
+where c is near the center of the subinterval and is chosen by trying +-2^29
+floating point invc candidates around 1/center and selecting one for which
+
+	1) the rounding error in 0x1.8p10 + logc is 0,
+	2) the rounding error in z - chi - clo is < 0x1p-64 and
+	3) the rounding error in (double)log2(c) is minimized (< 0x1p-68).
+
+Note: 1) ensures that k + logc can be computed without rounding error, 2)
+ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a
+single rounding error when there is no fast fma for z*invc - 1, 3) ensures
+that logc + poly(z/c - 1) has small error, however near x == 1 when
+|log2(x)| < 0x1p-4, this is not enough so that is special cased.  */
+.tab = {
+{0x1.724286bb1acf8p+0, -0x1.1095feecdb000p-1},
+{0x1.6e1f766d2cca1p+0, -0x1.08494bd76d000p-1},
+{0x1.6a13d0e30d48ap+0, -0x1.00143aee8f800p-1},
+{0x1.661ec32d06c85p+0, -0x1.efec5360b4000p-2},
+{0x1.623fa951198f8p+0, -0x1.dfdd91ab7e000p-2},
+{0x1.5e75ba4cf026cp+0, -0x1.cffae0cc79000p-2},
+{0x1.5ac055a214fb8p+0, -0x1.c043811fda000p-2},
+{0x1.571ed0f166e1ep+0, -0x1.b0b67323ae000p-2},
+{0x1.53909590bf835p+0, -0x1.a152f5a2db000p-2},
+{0x1.5014fed61adddp+0, -0x1.9217f5af86000p-2},
+{0x1.4cab88e487bd0p+0, -0x1.8304db0719000p-2},
+{0x1.49539b4334feep+0, -0x1.74189f9a9e000p-2},
+{0x1.460cbdfafd569p+0, -0x1.6552bb5199000p-2},
+{0x1.42d664ee4b953p+0, -0x1.56b23a29b1000p-2},
+{0x1.3fb01111dd8a6p+0, -0x1.483650f5fa000p-2},
+{0x1.3c995b70c5836p+0, -0x1.39de937f6a000p-2},
+{0x1.3991c4ab6fd4ap+0, -0x1.2baa1538d6000p-2},
+{0x1.3698e0ce099b5p+0, -0x1.1d98340ca4000p-2},
+{0x1.33ae48213e7b2p+0, -0x1.0fa853a40e000p-2},
+{0x1.30d191985bdb1p+0, -0x1.01d9c32e73000p-2},
+{0x1.2e025cab271d7p+0, -0x1.e857da2fa6000p-3},
+{0x1.2b404cf13cd82p+0, -0x1.cd3c8633d8000p-3},
+{0x1.288b02c7ccb50p+0, -0x1.b26034c14a000p-3},
+{0x1.25e2263944de5p+0, -0x1.97c1c2f4fe000p-3},
+{0x1.234563d8615b1p+0, -0x1.7d6023f800000p-3},
+{0x1.20b46e33eaf38p+0, -0x1.633a71a05e000p-3},
+{0x1.1e2eefdcda3ddp+0, -0x1.494f5e9570000p-3},
+{0x1.1bb4a580b3930p+0, -0x1.2f9e424e0a000p-3},
+{0x1.19453847f2200p+0, -0x1.162595afdc000p-3},
+{0x1.16e06c0d5d73cp+0, -0x1.f9c9a75bd8000p-4},
+{0x1.1485f47b7e4c2p+0, -0x1.c7b575bf9c000p-4},
+{0x1.12358ad0085d1p+0, -0x1.960c60ff48000p-4},
+{0x1.0fef00f532227p+0, -0x1.64ce247b60000p-4},
+{0x1.0db2077d03a8fp+0, -0x1.33f78b2014000p-4},
+{0x1.0b7e6d65980d9p+0, -0x1.0387d1a42c000p-4},
+{0x1.0953efe7b408dp+0, -0x1.a6f9208b50000p-5},
+{0x1.07325cac53b83p+0, -0x1.47a954f770000p-5},
+{0x1.05197e40d1b5cp+0, -0x1.d23a8c50c0000p-6},
+{0x1.03091c1208ea2p+0, -0x1.16a2629780000p-6},
+{0x1.0101025b37e21p+0, -0x1.720f8d8e80000p-8},
+{0x1.fc07ef9caa76bp-1, 0x1.6fe53b1500000p-7},
+{0x1.f4465d3f6f184p-1, 0x1.11ccce10f8000p-5},
+{0x1.ecc079f84107fp-1, 0x1.c4dfc8c8b8000p-5},
+{0x1.e573a99975ae8p-1, 0x1.3aa321e574000p-4},
+{0x1.de5d6f0bd3de6p-1, 0x1.918a0d08b8000p-4},
+{0x1.d77b681ff38b3p-1, 0x1.e72e9da044000p-4},
+{0x1.d0cb5724de943p-1, 0x1.1dcd2507f6000p-3},
+{0x1.ca4b2dc0e7563p-1, 0x1.476ab03dea000p-3},
+{0x1.c3f8ee8d6cb51p-1, 0x1.7074377e22000p-3},
+{0x1.bdd2b4f020c4cp-1, 0x1.98ede8ba94000p-3},
+{0x1.b7d6c006015cap-1, 0x1.c0db86ad2e000p-3},
+{0x1.b20366e2e338fp-1, 0x1.e840aafcee000p-3},
+{0x1.ac57026295039p-1, 0x1.0790ab4678000p-2},
+{0x1.a6d01bc2731ddp-1, 0x1.1ac056801c000p-2},
+{0x1.a16d3bc3ff18bp-1, 0x1.2db11d4fee000p-2},
+{0x1.9c2d14967feadp-1, 0x1.406464ec58000p-2},
+{0x1.970e4f47c9902p-1, 0x1.52dbe093af000p-2},
+{0x1.920fb3982bcf2p-1, 0x1.651902050d000p-2},
+{0x1.8d30187f759f1p-1, 0x1.771d2cdeaf000p-2},
+{0x1.886e5ebb9f66dp-1, 0x1.88e9c857d9000p-2},
+{0x1.83c97b658b994p-1, 0x1.9a80155e16000p-2},
+{0x1.7f405ffc61022p-1, 0x1.abe186ed3d000p-2},
+{0x1.7ad22181415cap-1, 0x1.bd0f2aea0e000p-2},
+{0x1.767dcf99eff8cp-1, 0x1.ce0a43dbf4000p-2},
+},
+#if !__FP_FAST_FMA
+.tab2 = {
+{0x1.6200012b90a8ep-1, 0x1.904ab0644b605p-55},
+{0x1.66000045734a6p-1, 0x1.1ff9bea62f7a9p-57},
+{0x1.69fffc325f2c5p-1, 0x1.27ecfcb3c90bap-55},
+{0x1.6e00038b95a04p-1, 0x1.8ff8856739326p-55},
+{0x1.71fffe09994e3p-1, 0x1.afd40275f82b1p-55},
+{0x1.7600015590e1p-1, -0x1.2fd75b4238341p-56},
+{0x1.7a00012655bd5p-1, 0x1.808e67c242b76p-56},
+{0x1.7e0003259e9a6p-1, -0x1.208e426f622b7p-57},
+{0x1.81fffedb4b2d2p-1, -0x1.402461ea5c92fp-55},
+{0x1.860002dfafcc3p-1, 0x1.df7f4a2f29a1fp-57},
+{0x1.89ffff78c6b5p-1, -0x1.e0453094995fdp-55},
+{0x1.8e00039671566p-1, -0x1.a04f3bec77b45p-55},
+{0x1.91fffe2bf1745p-1, -0x1.7fa34400e203cp-56},
+{0x1.95fffcc5c9fd1p-1, -0x1.6ff8005a0695dp-56},
+{0x1.9a0003bba4767p-1, 0x1.0f8c4c4ec7e03p-56},
+{0x1.9dfffe7b92da5p-1, 0x1.e7fd9478c4602p-55},
+{0x1.a1fffd72efdafp-1, -0x1.a0c554dcdae7ep-57},
+{0x1.a5fffde04ff95p-1, 0x1.67da98ce9b26bp-55},
+{0x1.a9fffca5e8d2bp-1, -0x1.284c9b54c13dep-55},
+{0x1.adfffddad03eap-1, 0x1.812c8ea602e3cp-58},
+{0x1.b1ffff10d3d4dp-1, -0x1.efaddad27789cp-55},
+{0x1.b5fffce21165ap-1, 0x1.3cb1719c61237p-58},
+{0x1.b9fffd950e674p-1, 0x1.3f7d94194cep-56},
+{0x1.be000139ca8afp-1, 0x1.50ac4215d9bcp-56},
+{0x1.c20005b46df99p-1, 0x1.beea653e9c1c9p-57},
+{0x1.c600040b9f7aep-1, -0x1.c079f274a70d6p-56},
+{0x1.ca0006255fd8ap-1, -0x1.a0b4076e84c1fp-56},
+{0x1.cdfffd94c095dp-1, 0x1.8f933f99ab5d7p-55},
+{0x1.d1ffff975d6cfp-1, -0x1.82c08665fe1bep-58},
+{0x1.d5fffa2561c93p-1, -0x1.b04289bd295f3p-56},
+{0x1.d9fff9d228b0cp-1, 0x1.70251340fa236p-55},
+{0x1.de00065bc7e16p-1, -0x1.5011e16a4d80cp-56},
+{0x1.e200002f64791p-1, 0x1.9802f09ef62ep-55},
+{0x1.e600057d7a6d8p-1, -0x1.e0b75580cf7fap-56},
+{0x1.ea00027edc00cp-1, -0x1.c848309459811p-55},
+{0x1.ee0006cf5cb7cp-1, -0x1.f8027951576f4p-55},
+{0x1.f2000782b7dccp-1, -0x1.f81d97274538fp-55},
+{0x1.f6000260c450ap-1, -0x1.071002727ffdcp-59},
+{0x1.f9fffe88cd533p-1, -0x1.81bdce1fda8bp-58},
+{0x1.fdfffd50f8689p-1, 0x1.7f91acb918e6ep-55},
+{0x1.0200004292367p+0, 0x1.b7ff365324681p-54},
+{0x1.05fffe3e3d668p+0, 0x1.6fa08ddae957bp-55},
+{0x1.0a0000a85a757p+0, -0x1.7e2de80d3fb91p-58},
+{0x1.0e0001a5f3fccp+0, -0x1.1823305c5f014p-54},
+{0x1.11ffff8afbaf5p+0, -0x1.bfabb6680bac2p-55},
+{0x1.15fffe54d91adp+0, -0x1.d7f121737e7efp-54},
+{0x1.1a00011ac36e1p+0, 0x1.c000a0516f5ffp-54},
+{0x1.1e00019c84248p+0, -0x1.082fbe4da5dap-54},
+{0x1.220000ffe5e6ep+0, -0x1.8fdd04c9cfb43p-55},
+{0x1.26000269fd891p+0, 0x1.cfe2a7994d182p-55},
+{0x1.2a00029a6e6dap+0, -0x1.00273715e8bc5p-56},
+{0x1.2dfffe0293e39p+0, 0x1.b7c39dab2a6f9p-54},
+{0x1.31ffff7dcf082p+0, 0x1.df1336edc5254p-56},
+{0x1.35ffff05a8b6p+0, -0x1.e03564ccd31ebp-54},
+{0x1.3a0002e0eaeccp+0, 0x1.5f0e74bd3a477p-56},
+{0x1.3e000043bb236p+0, 0x1.c7dcb149d8833p-54},
+{0x1.4200002d187ffp+0, 0x1.e08afcf2d3d28p-56},
+{0x1.460000d387cb1p+0, 0x1.20837856599a6p-55},
+{0x1.4a00004569f89p+0, -0x1.9fa5c904fbcd2p-55},
+{0x1.4e000043543f3p+0, -0x1.81125ed175329p-56},
+{0x1.51fffcc027f0fp+0, 0x1.883d8847754dcp-54},
+{0x1.55ffffd87b36fp+0, -0x1.709e731d02807p-55},
+{0x1.59ffff21df7bap+0, 0x1.7f79f68727b02p-55},
+{0x1.5dfffebfc3481p+0, -0x1.180902e30e93ep-54},
+},
+#endif
+};
diff --git a/sw/math/src/math/log2_data.h b/sw/math/src/math/log2_data.h
new file mode 100644
index 000000000..276a786d1
--- /dev/null
+++ b/sw/math/src/math/log2_data.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef _LOG2_DATA_H
+#define _LOG2_DATA_H
+
+#include <features.h>
+
+#define LOG2_TABLE_BITS 6
+#define LOG2_POLY_ORDER 7
+#define LOG2_POLY1_ORDER 11
+extern hidden const struct log2_data {
+	double invln2hi;
+	double invln2lo;
+	double poly[LOG2_POLY_ORDER - 1];
+	double poly1[LOG2_POLY1_ORDER - 1];
+	struct {
+		double invc, logc;
+	} tab[1 << LOG2_TABLE_BITS];
+#if !__FP_FAST_FMA
+	struct {
+		double chi, clo;
+	} tab2[1 << LOG2_TABLE_BITS];
+#endif
+} __log2_data;
+
+#endif
diff --git a/sw/math/src/math/log2f.c b/sw/math/src/math/log2f.c
new file mode 100644
index 000000000..c368f88f3
--- /dev/null
+++ b/sw/math/src/math/log2f.c
@@ -0,0 +1,72 @@
+/*
+ * Single-precision log2 function.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "libm.h"
+#include "log2f_data.h"
+
+/*
+LOG2F_TABLE_BITS = 4
+LOG2F_POLY_ORDER = 4
+
+ULP error: 0.752 (nearest rounding.)
+Relative error: 1.9 * 2^-26 (before rounding.)
+*/
+
+#define N (1 << LOG2F_TABLE_BITS)
+#define T __log2f_data.tab
+#define A __log2f_data.poly
+#define OFF 0x3f330000
+
+float log2f(float x)
+{
+	double_t z, r, r2, p, y, y0, invc, logc;
+	uint32_t ix, iz, top, tmp;
+	int k, i;
+
+	ix = asuint(x);
+	/* Fix sign of zero with downward rounding when x==1.  */
+	if (WANT_ROUNDING && predict_false(ix == 0x3f800000))
+		return 0;
+	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+		/* x < 0x1p-126 or inf or nan.  */
+		if (ix * 2 == 0)
+			return __math_divzerof(1);
+		if (ix == 0x7f800000) /* log2(inf) == inf.  */
+			return x;
+		if ((ix & 0x80000000) || ix * 2 >= 0xff000000)
+			return __math_invalidf(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint(x * 0x1p23f);
+		ix -= 23 << 23;
+	}
+
+	/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+	   The range is split into N subintervals.
+	   The ith subinterval contains z and c is near its center.  */
+	tmp = ix - OFF;
+	i = (tmp >> (23 - LOG2F_TABLE_BITS)) % N;
+	top = tmp & 0xff800000;
+	iz = ix - top;
+	k = (int32_t)tmp >> 23; /* arithmetic shift */
+	invc = T[i].invc;
+	logc = T[i].logc;
+	z = (double_t)asfloat(iz);
+
+	/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
+	r = z * invc - 1;
+	y0 = logc + (double_t)k;
+
+	/* Pipelined polynomial evaluation to approximate log1p(r)/ln2.  */
+	r2 = r * r;
+	y = A[1] * r + A[2];
+	y = A[0] * r2 + y;
+	p = A[3] * r + y0;
+	y = y * r2 + p;
+	return eval_as_float(y);
+}
diff --git a/sw/math/src/math/log2f_data.c b/sw/math/src/math/log2f_data.c
new file mode 100644
index 000000000..24e450f1e
--- /dev/null
+++ b/sw/math/src/math/log2f_data.c
@@ -0,0 +1,33 @@
+/*
+ * Data definition for log2f.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "log2f_data.h"
+
+const struct log2f_data __log2f_data = {
+  .tab = {
+  { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 },
+  { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 },
+  { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 },
+  { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 },
+  { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 },
+  { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 },
+  { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 },
+  { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 },
+  { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 },
+  { 0x1p+0, 0x0p+0 },
+  { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 },
+  { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 },
+  { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 },
+  { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 },
+  { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 },
+  { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 },
+  },
+  .poly = {
+  -0x1.712b6f70a7e4dp-2, 0x1.ecabf496832ep-2, -0x1.715479ffae3dep-1,
+  0x1.715475f35c8b8p0,
+  }
+};
diff --git a/sw/math/src/math/log2f_data.h b/sw/math/src/math/log2f_data.h
new file mode 100644
index 000000000..4fa489560
--- /dev/null
+++ b/sw/math/src/math/log2f_data.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef _LOG2F_DATA_H
+#define _LOG2F_DATA_H
+
+#include <features.h>
+
+#define LOG2F_TABLE_BITS 4
+#define LOG2F_POLY_ORDER 4
+extern hidden const struct log2f_data {
+	struct {
+		double invc, logc;
+	} tab[1 << LOG2F_TABLE_BITS];
+	double poly[LOG2F_POLY_ORDER];
+} __log2f_data;
+
+#endif
diff --git a/sw/math/src/math/sqrt.c b/sw/math/src/math/sqrt.c
new file mode 100644
index 000000000..5ba265596
--- /dev/null
+++ b/sw/math/src/math/sqrt.c
@@ -0,0 +1,158 @@
+#include <stdint.h>
+#include <math.h>
+#include "libm.h"
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+	return (uint64_t)a*b >> 32;
+}
+
+/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
+
+double sqrt(double x)
+{
+	uint64_t ix, top, m;
+
+	/* special case handling.  */
+	ix = asuint64(x);
+	top = ix >> 52;
+	if (predict_false(top - 0x001 >= 0x7ff - 0x001)) {
+		/* x < 0x1p-1022 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7ff0000000000000)
+			return x;
+		if (ix > 0x7ff0000000000000)
+			return __math_invalid(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint64(x * 0x1p52);
+		top = ix >> 52;
+		top -= 52;
+	}
+
+	/* argument reduction:
+	   x = 4^e m; with integer e, and m in [1, 4)
+	   m: fixed point representation [2.62]
+	   2^e is the exponent part of the result.  */
+	int even = top & 1;
+	m = (ix << 11) | 0x8000000000000000;
+	if (even) m >>= 1;
+	top = (top + 0x3ff) >> 1;
+
+	/* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4)
+
+	   initial estimate:
+	   7bit table lookup (1bit exponent and 6bit significand).
+
+	   iterative approximation:
+	   using 2 goldschmidt iterations with 32bit int arithmetics
+	   and a final iteration with 64bit int arithmetics.
+
+	   details:
+
+	   the relative error (e = r0 sqrt(m)-1) of a linear estimate
+	   (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
+	   a table lookup is faster and needs one less iteration
+	   6 bit lookup table (128b) gives |e| < 0x1.f9p-8
+	   7 bit lookup table (256b) gives |e| < 0x1.fdp-9
+	   for single and double prec 6bit is enough but for quad
+	   prec 7bit is needed (or modified iterations). to avoid
+	   one more iteration >=13bit table would be needed (16k).
+
+	   a newton-raphson iteration for r is
+	     w = r*r
+	     u = 3 - m*w
+	     r = r*u/2
+	   can use a goldschmidt iteration for s at the end or
+	     s = m*r
+
+	   first goldschmidt iteration is
+	     s = m*r
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   next goldschmidt iteration is
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   and at the end r is not computed only s.
+
+	   they use the same amount of operations and converge at the
+	   same quadratic rate, i.e. if
+	     r1 sqrt(m) - 1 = e, then
+	     r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3
+	   the advantage of goldschmidt is that the mul for s and r
+	   are independent (computed in parallel), however it is not
+	   "self synchronizing": it only uses the input m in the
+	   first iteration so rounding errors accumulate. at the end
+	   or when switching to larger precision arithmetics rounding
+	   errors dominate so the first iteration should be used.
+
+	   the fixed point representations are
+	     m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30
+	   and after switching to 64 bit
+	     m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62  */
+
+	static const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+
+	i = (ix >> 46) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1.fdp-9 */
+	s = mul32(m>>32, r);
+	/* |s/sqrt(m) - 1| < 0x1.fdp-9 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */
+	r = r << 32;
+	s = mul64(m, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	s = mul64(s, u);  /* repr: 3.61 */
+	/* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */
+	s = (s - 2) >> 9; /* repr: 12.52 */
+	/* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */
+
+	/* s < sqrt(m) < s + 0x1.09p-52,
+	   compute nearest rounded result:
+	   the nearest result to 52 bits is either s or s+0x1p-52,
+	   we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m.  */
+	uint64_t d0, d1, d2;
+	double y, t;
+	d0 = (m << 42) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 63;
+	s &= 0x000fffffffffffff;
+	s |= top << 52;
+	y = asdouble(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception:
+		   only (s+1)^2 == 2^42 m case is exact otherwise
+		   add a tiny value to cause the fenv effects.  */
+		uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000;
+		tiny |= (d1^d2) & 0x8000000000000000;
+		t = asdouble(tiny);
+		y = eval_as_double(y + t);
+	}
+	return y;
+}
diff --git a/sw/math/src/math/sqrt_data.c b/sw/math/src/math/sqrt_data.c
new file mode 100644
index 000000000..61bc22f43
--- /dev/null
+++ b/sw/math/src/math/sqrt_data.c
@@ -0,0 +1,19 @@
+#include "sqrt_data.h"
+const uint16_t __rsqrt_tab[128] = {
+0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43,
+0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b,
+0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1,
+0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430,
+0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59,
+0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925,
+0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479,
+0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040,
+0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234,
+0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2,
+0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1,
+0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192,
+0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f,
+0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4,
+0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59,
+0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560,
+};
diff --git a/sw/math/src/math/sqrt_data.h b/sw/math/src/math/sqrt_data.h
new file mode 100644
index 000000000..260c7f9c2
--- /dev/null
+++ b/sw/math/src/math/sqrt_data.h
@@ -0,0 +1,13 @@
+#ifndef _SQRT_DATA_H
+#define _SQRT_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* if x in [1,2): i = (int)(64*x);
+   if x in [2,4): i = (int)(32*x-64);
+   __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error:
+   |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < 0x1.fdp-9 < 2^-8 */
+extern hidden const uint16_t __rsqrt_tab[128];
+
+#endif
diff --git a/sw/math/src/math/sqrtf.c b/sw/math/src/math/sqrtf.c
new file mode 100644
index 000000000..740d81cba
--- /dev/null
+++ b/sw/math/src/math/sqrtf.c
@@ -0,0 +1,83 @@
+#include <stdint.h>
+#include <math.h>
+#include "libm.h"
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+	return (uint64_t)a*b >> 32;
+}
+
+/* see sqrt.c for more detailed comments.  */
+
+float sqrtf(float x)
+{
+	uint32_t ix, m, m1, m0, even, ey;
+
+	ix = asuint(x);
+	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+		/* x < 0x1p-126 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7f800000)
+			return x;
+		if (ix > 0x7f800000)
+			return __math_invalidf(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint(x * 0x1p23f);
+		ix -= 23 << 23;
+	}
+
+	/* x = 4^e m; with int e and m in [1, 4).  */
+	even = ix & 0x00800000;
+	m1 = (ix << 8) | 0x80000000;
+	m0 = (ix << 7) & 0x7fffffff;
+	m = even ? m0 : m1;
+
+	/* 2^e is the exponent part of the return value.  */
+	ey = ix >> 1;
+	ey += 0x3f800000 >> 1;
+	ey &= 0x7f800000;
+
+	/* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations.  */
+	static const uint32_t three = 0xc0000000;
+	uint32_t r, s, d, u, i;
+	i = (ix >> 17) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r*sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(m, r);
+	/* |s/sqrt(m) - 1| < 0x1p-8 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r*sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	s = mul32(s, u);
+	/* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */
+	s = (s - 1)>>6;
+	/* s < sqrt(m) < s + 0x1.08p-23 */
+
+	/* compute nearest rounded result.  */
+	uint32_t d0, d1, d2;
+	float y, t;
+	d0 = (m << 16) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 31;
+	s &= 0x007fffff;
+	s |= ey;
+	y = asfloat(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding and inexact exception. */
+		uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000;
+		tiny |= (d1^d2) & 0x80000000;
+		t = asfloat(tiny);
+		y = eval_as_float(y + t);
+	}
+	return y;
+}
diff --git a/sw/math/src/math/tanh.c b/sw/math/src/math/tanh.c
index 20d6dbcf4..2481db1dc 100644
--- a/sw/math/src/math/tanh.c
+++ b/sw/math/src/math/tanh.c
@@ -6,16 +6,23 @@
  */
 double tanh(double x)
 {
-	union {double f; uint64_t i;} u = {.f = x};
 	uint32_t w;
 	int sign;
 	double_t t;
 
 	/* x = |x| */
-	sign = u.i >> 63;
-	u.i &= (uint64_t)-1/2;
-	x = u.f;
-	w = u.i >> 32;
+	/// Original implementation
+	// union {double f; uint64_t i;} u = {.f = x};
+	// sign = u.i >> 63;
+	// u.i &= (uint64_t)-1/2;
+	// x = u.f;
+	// w = u.i >> 32;
+	/// Safe implementation in Snitch: operate on the upper 32 bits of x only
+	uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
+	sign = upper_32b_x >> 31;
+	uint32_t sign_mask = 0x7fffffffu;  /* not ~(1 << 31): that shift is UB on int */
+	w = upper_32b_x & sign_mask;       /* high word of |x| */
+	safe_inject_into_upper_32b_double(w, &x);
 
 	if (w > 0x3fe193ea) {
 		/* |x| > log(3)/2 ~= 0.5493 or nan */
diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h
index 7c94acdd9..169e54d7b 100644
--- a/sw/snRuntime/src/dma.h
+++ b/sw/snRuntime/src/dma.h
@@ -8,43 +8,49 @@ typedef uint32_t snrt_dma_txid_t;
 /// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers.
 inline snrt_dma_txid_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src,
                                                  size_t size) {
-    register uint32_t reg_dst_low asm("a0") = dst >> 0;    // 10
-    register uint32_t reg_dst_high asm("a1") = dst >> 32;  // 11
-    register uint32_t reg_src_low asm("a2") = src >> 0;    // 12
-    register uint32_t reg_src_high asm("a3") = src >> 32;  // 13
-    register uint32_t reg_size asm("a4") = size;           // 14
-
-    // dmsrc a2, a3
-    asm volatile(
-        ".word (0b0000000 << 25) | \
-               (     (13) << 20) | \
-               (     (12) << 15) | \
-               (    0b000 << 12) | \
-               (0b0101011 <<  0)   \n" ::"r"(reg_src_high),
-        "r"(reg_src_low));
-
-    // dmdst a0, a1
-    asm volatile(
-        ".word (0b0000001 << 25) | \
-               (     (11) << 20) | \
-               (     (10) << 15) | \
-               (    0b000 << 12) | \
-               (0b0101011 <<  0)   \n" ::"r"(reg_dst_high),
-        "r"(reg_dst_low));
-
-    // dmcpyi a0, a4, 0b00
-    register uint32_t reg_txid asm("a0");  // 10
-    asm volatile(
-        ".word (0b0000010 << 25) | \
-               (  0b00000 << 20) | \
-               (     (14) << 15) | \
-               (    0b000 << 12) | \
-               (     (10) <<  7) | \
-               (0b0101011 <<  0)   \n"
-        : "=r"(reg_txid)
-        : "r"(reg_size));
-
-    return reg_txid;
+    // Current DMA does not allow transfers with size == 0 (it would block)
+    // TODO(colluca) remove this check once new DMA is integrated
+    if (size > 0) {
+        register uint32_t reg_dst_low asm("a0") = dst >> 0;    // 10
+        register uint32_t reg_dst_high asm("a1") = dst >> 32;  // 11
+        register uint32_t reg_src_low asm("a2") = src >> 0;    // 12
+        register uint32_t reg_src_high asm("a3") = src >> 32;  // 13
+        register uint32_t reg_size asm("a4") = size;           // 14
+
+        // dmsrc a2, a3
+        asm volatile(
+            ".word (0b0000000 << 25) | \
+                (     (13) << 20) | \
+                (     (12) << 15) | \
+                (    0b000 << 12) | \
+                (0b0101011 <<  0)   \n" ::"r"(reg_src_high),
+            "r"(reg_src_low));
+
+        // dmdst a0, a1
+        asm volatile(
+            ".word (0b0000001 << 25) | \
+                (     (11) << 20) | \
+                (     (10) << 15) | \
+                (    0b000 << 12) | \
+                (0b0101011 <<  0)   \n" ::"r"(reg_dst_high),
+            "r"(reg_dst_low));
+
+        // dmcpyi a0, a4, 0b00
+        register uint32_t reg_txid asm("a0");  // 10
+        asm volatile(
+            ".word (0b0000010 << 25) | \
+                (  0b00000 << 20) | \
+                (     (14) << 15) | \
+                (    0b000 << 12) | \
+                (     (10) <<  7) | \
+                (0b0101011 <<  0)   \n"
+            : "=r"(reg_txid)
+            : "r"(reg_size));
+
+        return reg_txid;
+    } else {
+        return -1;
+    }
 }
 
 /// Initiate an asynchronous 1D DMA transfer.
@@ -58,65 +64,71 @@ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src,
                                                  size_t size, size_t dst_stride,
                                                  size_t src_stride,
                                                  size_t repeat) {
-    register uint32_t reg_dst_low asm("a0") = dst >> 0;       // 10
-    register uint32_t reg_dst_high asm("a1") = dst >> 32;     // 11
-    register uint32_t reg_src_low asm("a2") = src >> 0;       // 12
-    register uint32_t reg_src_high asm("a3") = src >> 32;     // 13
-    register uint32_t reg_size asm("a4") = size;              // 14
-    register uint32_t reg_dst_stride asm("a5") = dst_stride;  // 15
-    register uint32_t reg_src_stride asm("a6") = src_stride;  // 16
-    register uint32_t reg_repeat asm("a7") = repeat;          // 17
-
-    // dmsrc a0, a1
-    asm volatile(
-        ".word (0b0000000 << 25) | \
-               (     (13) << 20) | \
-               (     (12) << 15) | \
-               (    0b000 << 12) | \
-               (0b0101011 <<  0)   \n" ::"r"(reg_src_high),
-        "r"(reg_src_low));
-
-    // dmdst a0, a1
-    asm volatile(
-        ".word (0b0000001 << 25) | \
-               (     (11) << 20) | \
-               (     (10) << 15) | \
-               (    0b000 << 12) | \
-               (0b0101011 <<  0)   \n" ::"r"(reg_dst_high),
-        "r"(reg_dst_low));
-
-    // dmstr a5, a6
-    asm volatile(
-        ".word (0b0000110 << 25) | \
-               (     (15) << 20) | \
-               (     (16) << 15) | \
-               (    0b000 << 12) | \
-               (0b0101011 <<  0)   \n"
-        :
-        : "r"(reg_dst_stride), "r"(reg_src_stride));
-
-    // dmrep a7
-    asm volatile(
-        ".word (0b0000111 << 25) | \
-               (     (17) << 15) | \
-               (    0b000 << 12) | \
-               (0b0101011 <<  0)   \n"
-        :
-        : "r"(reg_repeat));
-
-    // dmcpyi a0, a4, 0b10
-    register uint32_t reg_txid asm("a0");  // 10
-    asm volatile(
-        ".word (0b0000010 << 25) | \
-               (  0b00010 << 20) | \
-               (     (14) << 15) | \
-               (    0b000 << 12) | \
-               (     (10) <<  7) | \
-               (0b0101011 <<  0)   \n"
-        : "=r"(reg_txid)
-        : "r"(reg_size));
-
-    return reg_txid;
+    // Current DMA does not allow transfers with size == 0 (blocks)
+    // TODO(colluca) remove this check once new DMA is integrated
+    if (size > 0) {
+        register uint32_t reg_dst_low asm("a0") = dst >> 0;       // 10
+        register uint32_t reg_dst_high asm("a1") = dst >> 32;     // 11
+        register uint32_t reg_src_low asm("a2") = src >> 0;       // 12
+        register uint32_t reg_src_high asm("a3") = src >> 32;     // 13
+        register uint32_t reg_size asm("a4") = size;              // 14
+        register uint32_t reg_dst_stride asm("a5") = dst_stride;  // 15
+        register uint32_t reg_src_stride asm("a6") = src_stride;  // 16
+        register uint32_t reg_repeat asm("a7") = repeat;          // 17
+
+        // dmsrc a2, a3
+        asm volatile(
+            ".word (0b0000000 << 25) | \
+                (     (13) << 20) | \
+                (     (12) << 15) | \
+                (    0b000 << 12) | \
+                (0b0101011 <<  0)   \n" ::"r"(reg_src_high),
+            "r"(reg_src_low));
+
+        // dmdst a0, a1
+        asm volatile(
+            ".word (0b0000001 << 25) | \
+                (     (11) << 20) | \
+                (     (10) << 15) | \
+                (    0b000 << 12) | \
+                (0b0101011 <<  0)   \n" ::"r"(reg_dst_high),
+            "r"(reg_dst_low));
+
+        // dmstr a5, a6
+        asm volatile(
+            ".word (0b0000110 << 25) | \
+                (     (15) << 20) | \
+                (     (16) << 15) | \
+                (    0b000 << 12) | \
+                (0b0101011 <<  0)   \n"
+            :
+            : "r"(reg_dst_stride), "r"(reg_src_stride));
+
+        // dmrep a7
+        asm volatile(
+            ".word (0b0000111 << 25) | \
+                (     (17) << 15) | \
+                (    0b000 << 12) | \
+                (0b0101011 <<  0)   \n"
+            :
+            : "r"(reg_repeat));
+
+        // dmcpyi a0, a4, 0b10
+        register uint32_t reg_txid asm("a0");  // 10
+        asm volatile(
+            ".word (0b0000010 << 25) | \
+                (  0b00010 << 20) | \
+                (     (14) << 15) | \
+                (    0b000 << 12) | \
+                (     (10) <<  7) | \
+                (0b0101011 <<  0)   \n"
+            : "=r"(reg_txid)
+            : "r"(reg_size));
+
+        return reg_txid;
+    } else {
+        return -1;
+    }
 }
 
 /// Initiate an asynchronous 2D DMA transfer.
diff --git a/sw/snRuntime/src/dump.h b/sw/snRuntime/src/dump.h
index 8f24cc1b9..1d65395b5 100644
--- a/sw/snRuntime/src/dump.h
+++ b/sw/snRuntime/src/dump.h
@@ -4,6 +4,7 @@
 //
 // Authors: Samuel Riedel, ETH Zurich <sriedel@iis.ee.ethz.ch>
 //          Viviane Potocnik, ETH Zurich <vivianep@iis.ee.ethz.ch>
+//          Luca Colagrande, ETH Zurich <colluca@iis.ee.ethz.ch>
 
 // Dump a value via CSR
 // !!! Careful: This is only supported in simulation and an experimental
@@ -11,18 +12,14 @@
 // This can be exploited to quickly print measurement values from all cores
 // simultaneously without the hassle of printf. To specify multiple metrics,
 // different CSRs can be used. The macro will define a function that will then
-// always print via the same CSR. E.g., `dump(errors, 8)` will define a function
-// with the following signature: `dump_errors(uint32_t val)`, which will print
-// the given value via the 8th register. Alternatively, the `write_csr(reg,
-// val)` macro can be used directly.
+// always print via the same CSR. E.g., `NAMED_DUMP(uint32_t, errors, 8)` will
+// define a function with the following signature: `dump_errors(uint32_t val)`,
+// which will print the given value via the 8th register. Alternatively, the
+// `write_csr(reg, val)` macro can be used directly.
 
-#define dump_float(name, reg)                                                  \
-    static __attribute__((always_inline)) inline void dump_##name(float val) { \
-        asm volatile("csrw " #reg ", %0" ::"rK"(val));                         \
+#define NAMED_DUMP(type, name, reg)                                           \
+    static __attribute__((always_inline)) inline void dump_##name(type val) { \
+        asm volatile("csrw " #reg ", %0" ::"rK"(val));                        \
     }
 
-#define dump_uint(name, reg)                                                   \
-    static                                                                     \
-        __attribute__((always_inline)) inline void dump_##name(uint32_t val) { \
-        asm volatile("csrw " #reg ", %0" ::"rK"(val));                         \
-    }
\ No newline at end of file
+#define DUMP(val) ({ asm volatile("csrw 0x7C3, %0" ::"rK"(val)); })
diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
index 3fb338f4a..4e4cd2152 100644
--- a/sw/snRuntime/src/start.c
+++ b/sw/snRuntime/src/start.c
@@ -20,22 +20,38 @@ static inline void snrt_init_tls() {
     extern volatile uint32_t __tdata_start, __tdata_end;
     extern volatile uint32_t __tbss_start, __tbss_end;
 
-    volatile uint32_t* p;
-    volatile uint32_t* tls_ptr;
+    size_t size;
+    volatile uint32_t tls_ptr;
 
-    asm volatile("mv %0, tp" : "=r"(tls_ptr) : :);
-
-    // Copy tdata section
-    for (p = (uint32_t*)(&__tdata_start); p < (uint32_t*)(&__tdata_end); p++) {
-        *tls_ptr = *p;
-        tls_ptr++;
+    // To avoid contentions in main memory, and take advantage of the
+    // bandwidth of the DMA, the DM core initializes the TLS section
+    // for every core in a cluster.
+    if (snrt_is_dm_core()) {
+        size = (size_t)(&__tdata_end) - (size_t)(&__tdata_start);
+
+        // First initialize the DM core's .tdata section from main memory
+        asm volatile("mv %0, tp" : "=r"(tls_ptr) : :);
+        snrt_dma_start_1d((void*)tls_ptr, (void*)(&__tdata_start), size);
+
+        // Then initialize all other cores' .tdata sections from the DM
+        // core's. The offset between the TLS section of successive cores
+        // is defined in start.S
+        size_t tls_offset = (1 << SNRT_LOG2_STACK_SIZE) + 8;
+        for (int i = 1; i < snrt_cluster_core_num(); i++) {
+            snrt_dma_start_1d((void*)(tls_ptr + i * tls_offset), (void*)tls_ptr,
+                              size);
+        }
+
+        // Initialize all cores' .tbss sections
+        tls_ptr += size;
+        size = (size_t)(&__tbss_end) - (size_t)(&__tbss_start);
+        for (int i = 0; i < snrt_cluster_core_num(); i++) {
+            snrt_dma_start_1d((void*)(tls_ptr + i * tls_offset),
+                              (void*)(snrt_zero_memory_ptr()), size);
+        }
     }
 
-    // Clear tbss section
-    for (p = (uint32_t*)(&__tbss_start); p < (uint32_t*)(&__tbss_end); p++) {
-        *tls_ptr = 0;
-        tls_ptr++;
-    }
+    snrt_cluster_hw_barrier();
 }
 #endif
 
@@ -66,7 +82,7 @@ static inline void snrt_init_cls() {
 
         // Copy cdata section to base of the TCDM
         size = (size_t)(&__cdata_end) - (size_t)(&__cdata_start);
-        if (size > 0) snrt_dma_start_1d(ptr, (void*)(&__cdata_start), size);
+        snrt_dma_start_1d(ptr, (void*)(&__cdata_start), size);
 
         // Clear cbss section
         ptr = (void*)((uint32_t)ptr + size);
diff --git a/target/common/common.mk b/target/common/common.mk
index 6b9c679d0..0cf03c463 100644
--- a/target/common/common.mk
+++ b/target/common/common.mk
@@ -2,26 +2,41 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 
-LOGS_DIR       ?= logs
-TB_DIR         ?= $(SNITCH_ROOT)/target/common/test
-UTIL_DIR       ?= $(SNITCH_ROOT)/util
+# Makefile invocation
+DEBUG ?= OFF  # ON to turn on wave logging
+
+# Directories
+LOGS_DIR ?= logs
+TB_DIR   ?= $(SNITCH_ROOT)/target/common/test
+UTIL_DIR ?= $(SNITCH_ROOT)/util
+
+# SEPP packages
+QUESTA_SEPP    ?=
+VCS_SEPP       ?=
+VERILATOR_SEPP ?=
 
 # External executables
-BENDER		   ?= bender
-DASM 	       ?= spike-dasm
-VLT			   ?= verilator
-VERIBLE_FMT    ?= verible-verilog-format
-CLANG_FORMAT   ?= clang-format
+BENDER       ?= bender
+DASM         ?= spike-dasm
+VLT          ?= $(VERILATOR_SEPP) verilator
+VCS          ?= $(VCS_SEPP) vcs
+VERIBLE_FMT  ?= verible-verilog-format
+CLANG_FORMAT ?= clang-format
+VSIM         ?= $(QUESTA_SEPP) vsim
+VOPT         ?= $(QUESTA_SEPP) vopt
+VLOG         ?= $(QUESTA_SEPP) vlog
+VLIB         ?= $(QUESTA_SEPP) vlib
 
 # Internal executables
-BIN2JTAG       ?= $(UTIL_DIR)/bin2jtag.py
-GENTRACE	   ?= $(UTIL_DIR)/trace/gen_trace.py
-ANNOTATE_PY	   ?= $(UTIL_DIR)/trace/annotate.py
-EVENTS_PY	   ?= $(UTIL_DIR)/trace/events.py
-PERF_CSV_PY	   ?= $(UTIL_DIR)/trace/perf_csv.py
+GENTRACE_PY      ?= $(UTIL_DIR)/trace/gen_trace.py
+ANNOTATE_PY      ?= $(UTIL_DIR)/trace/annotate.py
+EVENTS_PY        ?= $(UTIL_DIR)/trace/events.py
+PERF_CSV_PY      ?= $(UTIL_DIR)/trace/perf_csv.py
+LAYOUT_EVENTS_PY ?= $(UTIL_DIR)/trace/layout_events.py
+EVENTVIS_PY      ?= $(UTIL_DIR)/trace/eventvis.py
 
-VERILATOR_ROOT ?= $(dir $(shell which $(VLT)))/../share/verilator
-VLT_ROOT	   ?= ${VERILATOR_ROOT}
+VERILATOR_ROOT ?= $(dir $(shell $(VERILATOR_SEPP) which verilator))..
+VLT_ROOT       ?= ${VERILATOR_ROOT}
 
 MATCH_END := '/+incdir+/ s/$$/\/*\/*/'
 MATCH_BGN := 's/+incdir+//g'
@@ -29,7 +44,14 @@ SED_SRCS  := sed -e ${MATCH_END} -e ${MATCH_BGN}
 
 VSIM_BENDER   += -t test -t rtl -t simulation -t vsim
 VSIM_SOURCES   = $(shell ${BENDER} script flist ${VSIM_BENDER} | ${SED_SRCS})
-VSIM_BUILDDIR := work-vsim
+VSIM_BUILDDIR ?= work-vsim
+VSIM_FLAGS    += -t 1ps
+ifeq ($(DEBUG), ON)
+VSIM_FLAGS    += -do "log -r /*; run -a"
+VOPT_FLAGS     = +acc
+else
+VSIM_FLAGS    += -do "run -a"
+endif
 
 # VCS_BUILDDIR should to be the same as the `DEFAULT : ./work-vcs`
 # in target/snitch_cluster/synopsys_sim.setup
@@ -38,8 +60,8 @@ VCS_SOURCES   = $(shell ${BENDER} script flist ${VCS_BENDER} | ${SED_SRCS})
 VCS_BUILDDIR := work-vcs
 
 # fesvr is being installed here
-FESVR          ?= ${MKFILE_DIR}work
-FESVR_VERSION  ?= 35d50bc40e59ea1d5566fbd3d9226023821b1bb6
+FESVR         ?= ${MKFILE_DIR}work
+FESVR_VERSION ?= 35d50bc40e59ea1d5566fbd3d9226023821b1bb6
 
 VLT_BENDER   += -t rtl
 VLT_SOURCES   = $(shell ${BENDER} script flist ${VLT_BENDER} | ${SED_SRCS})
@@ -146,25 +168,33 @@ endef
 # Modelsim #
 ############
 
+$(VSIM_BUILDDIR):
+	mkdir -p $@
+
+# Expects vlog/vcom script in $< (e.g. as output by bender)
+# Expects the top module name in $1
+# Produces a binary used to run the simulation at the path specified by $@
 define QUESTASIM
-	${VSIM} -c -do "source $<; quit" | tee $(dir $<)vsim.log
-	@! grep -P "Errors: [1-9]*," $(dir $<)vsim.log
-	@mkdir -p bin
+	${VSIM} -c -do "source $<; quit" | tee $(dir $<)vlog.log
+	@! grep -P "Errors: [1-9][0-9]*," $(dir $<)vlog.log
+	$(VOPT) $(VOPT_FLAGS) -work $(VSIM_BUILDDIR) $1 -o $(1)_opt | tee $(dir $<)vopt.log
+	@! grep -P "Errors: [1-9][0-9]*," $(dir $<)vopt.log
+	@mkdir -p $(dir $@)
 	@echo "#!/bin/bash" > $@
-	@echo 'binary=$$(realpath --relative-to=${MKFILE_DIR} $$1)' >> $@
-	@echo 'cd ${MKFILE_DIR}' >> $@
+	@echo 'binary=$$(realpath $$1)' >> $@
+	@echo 'mkdir -p $(LOGS_DIR)' >> $@
 	@echo 'echo $$binary > $(LOGS_DIR)/.rtlbinary' >> $@
 	@echo '${VSIM} +permissive ${VSIM_FLAGS} $$3 -work ${MKFILE_DIR}/${VSIM_BUILDDIR} -c \
 				-ldflags "-Wl,-rpath,${FESVR}/lib -L${FESVR}/lib -lfesvr -lutil" \
-				$1 +permissive-off ++$$binary ++$$2' >> $@
+				$(1)_opt +permissive-off ++$$binary ++$$2' >> $@
 	@chmod +x $@
 	@echo "#!/bin/bash" > $@.gui
-	@echo 'binary=$$(pwd)/$$1' >> $@.gui
-	@echo 'cd ${MKFILE_DIR}' >> $@.gui
+	@echo 'binary=$$(realpath $$1)' >> $@.gui
+	@echo 'mkdir -p $(LOGS_DIR)' >> $@.gui
 	@echo 'echo $$binary > $(LOGS_DIR)/.rtlbinary' >> $@.gui
 	@echo '${VSIM} +permissive ${VSIM_FLAGS} -work ${MKFILE_DIR}/${VSIM_BUILDDIR} \
 				-ldflags "-Wl,-rpath,${FESVR}/lib -L${FESVR}/lib -lfesvr -lutil" \
-				$1 +permissive-off ++$$binary ++$$2' >> $@.gui
+				$(1)_opt +permissive-off ++$$binary ++$$2' >> $@.gui
 	@chmod +x $@.gui
 endef
 
@@ -175,7 +205,7 @@ $(VCS_BUILDDIR)/compile.sh:
 	mkdir -p $(VCS_BUILDDIR)
 	${BENDER} script vcs ${VCS_BENDER} --vlog-arg="${VLOGAN_FLAGS}" --vcom-arg="${VHDLAN_FLAGS}" > $@
 	chmod +x $@
-	$@ > $(VCS_BUILDDIR)/compile.log
+	$(VCS_SEPP) $@ > $(VCS_BUILDDIR)/compile.log
 
 ########
 # Util #
@@ -189,26 +219,56 @@ define reggen_generate_header
 	@$(CLANG_FORMAT) -i $1
 endef
 
-$(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE)
-	$(DASM) < $< | $(PYTHON) $(GENTRACE) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt
+# Arg 1: binary
+# Arg 2: max size in bytes
+define BINARY_SIZE_CHECK
+  echo "Binary size: $$(stat -c %s $(1))B"
+  @[ "$$(stat -c %s $(1))" -lt "$(2)" ] || (echo "Binary exceeds specified size of $(2)B"; exit 1)
+endef
+
+##########
+# Traces #
+##########
+
+DASM_TRACES      = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null))
+TXT_TRACES       = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.txt/g'))
+PERF_TRACES      = $(shell (echo $(DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g'))
+ANNOTATED_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.s/g'))
+DIFF_TRACES      = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.diff/g'))
 
-traces: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.txt/') || echo "") \
-        $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/') || echo "")
+GENTRACE_OUTPUTS = $(TXT_TRACES) $(PERF_TRACES)
+ANNOTATE_OUTPUTS = $(ANNOTATED_TRACES)
+PERF_CSV         = $(LOGS_DIR)/perf.csv
+EVENT_CSV        = $(LOGS_DIR)/event.csv
+TRACE_CSV        = $(LOGS_DIR)/trace.csv
+TRACE_JSON       = $(LOGS_DIR)/trace.json
+
+.PHONY: traces annotate perf-csv event-csv layout
+traces: $(GENTRACE_OUTPUTS)
+annotate: $(ANNOTATE_OUTPUTS)
+perf-csv: $(PERF_CSV)
+event-csv: $(EVENT_CSV)
+layout: $(TRACE_CSV) $(TRACE_JSON)
+
+$(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY)
+	$(DASM) < $< | $(PYTHON) $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt
 
-# make annotate
 # Generate source-code interleaved traces for all harts. Reads the binary from
 # the logs/.rtlbinary file that is written at start of simulation in the vsim script
+BINARY ?= $(shell cat $(LOGS_DIR)/.rtlbinary)
 $(LOGS_DIR)/trace_hart_%.s: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY}
 	$(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $<
 $(LOGS_DIR)/trace_hart_%.diff: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY}
 	$(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -d
-BINARY ?= $(shell cat $(LOGS_DIR)/.rtlbinary)
-annotate: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.s/') || echo "") \
-          $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.diff/') || echo "")
 
-# Arg 1: binary
-# Arg 2: max size in bytes
-define BINRAY_SIZE_CHECK
-  echo "Binary size: $$(stat -c %s $(1))B"
-  @[ "$$(stat -c %s $(1))" -lt "$(2)" ] || (echo "Binary exceeds specified size of $(2)B"; exit 1)
-endef
+$(PERF_CSV): $(PERF_TRACES) $(PERF_CSV_PY)
+	$(PYTHON) $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES)
+
+$(EVENT_CSV): $(PERF_TRACES) $(PERF_CSV_PY)
+	$(PYTHON) $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES) --filter tstart tend
+
+$(TRACE_CSV): $(EVENT_CSV) $(LAYOUT_FILE) $(LAYOUT_EVENTS_PY)
+	$(PYTHON) $(LAYOUT_EVENTS_PY) $(LAYOUT_EVENTS_FLAGS) $(EVENT_CSV) $(LAYOUT_FILE) -o $@
+
+$(TRACE_JSON): $(TRACE_CSV) $(EVENTVIS_PY)
+	$(PYTHON) $(EVENTVIS_PY) -o $@ $(TRACE_CSV)
diff --git a/target/common/test/ipc.cc b/target/common/test/ipc.cc
index 5eaffcf85..09188a7b3 100644
--- a/target/common/test/ipc.cc
+++ b/target/common/test/ipc.cc
@@ -19,60 +19,67 @@ void* IpcIface::ipc_thread_handle(void* in) {
     // Handle commands
     ipc_op_t op;
 
-    while (!feof(tx)) {
-        uint8_t ret_value = fread(&op, sizeof(ipc_op_t), 1, tx);
-        if (ret_value != 1) {
-            if (ferror(tx)) {
-                continue;  // jumps to while() again
-            }
-        }
-        switch (op.opcode) {
-            case Read:
-                // Read full blocks until one full block or less left
-                printf("[IPC] Read from 0x%x len 0x%x ...\n", op.addr, op.len);
-                for (uint64_t i = op.len; i > IPC_BUF_SIZE; i -= IPC_BUF_SIZE) {
-                    sim::MEM.read(op.addr, IPC_BUF_SIZE, buf_data);
-                    fwrite(buf_data, IPC_BUF_SIZE, 1, rx);
-                    op.addr += IPC_BUF_SIZE;
-                    op.len -= IPC_BUF_SIZE;
-                }
-                sim::MEM.read(op.addr, op.len, buf_data);
-                fwrite(buf_data, op.len, 1, rx);
-                fflush(rx);
-                break;
-            case Write:
-                // Write full blocks until one full block or less left
-                printf("[IPC] Write to 0x%x len %d ...\n", op.addr, op.len);
-                for (uint64_t i = op.len; i > IPC_BUF_SIZE; i -= IPC_BUF_SIZE) {
-                    fread(buf_data, IPC_BUF_SIZE, 1, tx);
-                    sim::MEM.write(op.addr, IPC_BUF_SIZE, buf_data, buf_strb);
-                    op.addr += IPC_BUF_SIZE;
-                    op.len -= IPC_BUF_SIZE;
-                }
-                fread(buf_data, op.len, 1, tx);
-                sim::MEM.write(op.addr, op.len, buf_data, buf_strb);
-                break;
-            case Poll:
-                // Unpack 32b checking mask and expected value from length
-                uint32_t mask = op.len & 0xFFFFFFFF;
-                uint32_t expected = (op.len >> 32) & 0xFFFFFFFF;
-                printf("[IPC] Poll on 0x%x mask 0x%x expected 0x%x ...\n",
-                       op.addr, mask, expected);
-                uint32_t read;
-                do {
-                    sim::MEM.read(op.addr, sizeof(uint32_t),
-                                  (uint8_t*)(void*)&read);
-                    nanosleep(
-                        (const struct timespec[]){{0, IPC_POLL_PERIOD_NS}},
-                        NULL);
-                } while ((read & mask) == (expected & mask));
-                // Send back read 32b word
-                fwrite(&read, sizeof(uint32_t), 1, rx);
-                fflush(rx);
+    while (1) {
+        if (!fread(&op, sizeof(ipc_op_t), 1, tx)) {
+            if (feof(tx)) {
+                printf(
+                    "[IPC] All messages read. Closing FIFOs and joining main "
+                    "thread.\n");
                 break;
+            }
+        } else {
+            switch (op.opcode) {
+                case Read:
+                    // Read full blocks until one full block or less left
+                    printf("[IPC] Read from 0x%x len 0x%x ...\n", op.addr,
+                           op.len);
+                    for (uint64_t i = op.len; i > IPC_BUF_SIZE;
+                         i -= IPC_BUF_SIZE) {
+                        sim::MEM.read(op.addr, IPC_BUF_SIZE, buf_data);
+                        fwrite(buf_data, IPC_BUF_SIZE, 1, rx);
+                        op.addr += IPC_BUF_SIZE;
+                        op.len -= IPC_BUF_SIZE;
+                    }
+                    sim::MEM.read(op.addr, op.len, buf_data);
+                    fwrite(buf_data, op.len, 1, rx);
+                    fflush(rx);
+                    break;
+                case Write:
+                    // Write full blocks until one full block or less left
+                    printf("[IPC] Write to 0x%x len %d ...\n", op.addr, op.len);
+                    for (uint64_t i = op.len; i > IPC_BUF_SIZE;
+                         i -= IPC_BUF_SIZE) {
+                        fread(buf_data, IPC_BUF_SIZE, 1, tx);
+                        sim::MEM.write(op.addr, IPC_BUF_SIZE, buf_data,
+                                       buf_strb);
+                        op.addr += IPC_BUF_SIZE;
+                        op.len -= IPC_BUF_SIZE;
+                    }
+                    fread(buf_data, op.len, 1, tx);
+                    sim::MEM.write(op.addr, op.len, buf_data, buf_strb);
+                    break;
+                case Poll:
+                    // Unpack 32b checking mask and expected value from length
+                    uint32_t mask = op.len & 0xFFFFFFFF;
+                    uint32_t expected = (op.len >> 32) & 0xFFFFFFFF;
+                    printf("[IPC] Poll on 0x%x mask 0x%x expected 0x%x ...\n",
+                           op.addr, mask, expected);
+                    uint32_t read;
+                    do {
+                        sim::MEM.read(op.addr, sizeof(uint32_t),
+                                      (uint8_t*)(void*)&read);
+                        nanosleep(
+                            (const struct timespec[]){{0, IPC_POLL_PERIOD_NS}},
+                            NULL);
+                    } while ((read & mask) == (expected & mask));
+                    // Send back read 32b word
+                    fwrite(&read, sizeof(uint32_t), 1, rx);
+                    fflush(rx);
+                    break;
+            }
         }
-        printf("[IPC] ... done\n");
     }
+
     // TX FIFO closed at other end: close both FIFOs and join main thread
     fclose(tx);
     fclose(rx);
diff --git a/target/common/test/verilator_lib.cc b/target/common/test/verilator_lib.cc
index 63ac66d5b..3e1ae89e1 100644
--- a/target/common/test/verilator_lib.cc
+++ b/target/common/test/verilator_lib.cc
@@ -14,10 +14,15 @@ namespace sim {
 
 // Number of cycles between HTIF checks.
 const int HTIFTimeInterval = 200;
+
+// We want to return timestamp in picosecond accuracy, assuming that one cycle
+// takes 1ns. Since 1 cycle takes 2 sim::TIME increments, scale by 500 to get
+// time = cycle * 1000 + <some constant>
+const int TIME_CYCLES_TO_TIMESTAMP = 500;
 void sim_thread_main(void *arg) { ((Sim *)arg)->main(); }
 
 // Sim time.
-int TIME = 0;
+vluint64_t TIME = 0;
 
 Sim::Sim(int argc, char **argv) : htif_t(argc, argv), ipc(argc, argv) {
     // Search arguments for `--vcd` flag and enable waves if requested
@@ -78,7 +83,7 @@ void Sim::main() {
 }  // namespace sim
 
 // Verilator callback to get the current time.
-double sc_time_stamp() { return sim::TIME * 1e-9; }
+double sc_time_stamp() { return sim::TIME * sim::TIME_CYCLES_TO_TIMESTAMP; }
 
 // DPI calls.
 void tb_memory_read(long long addr, int len, const svOpenArrayHandle data) {
diff --git a/target/snitch_cluster/.gitignore b/target/snitch_cluster/.gitignore
index b7f1de414..f74d9fde4 100644
--- a/target/snitch_cluster/.gitignore
+++ b/target/snitch_cluster/.gitignore
@@ -6,4 +6,5 @@
 /work-vsim/
 /work-vlt/
 /work-vcs/
-/*.log
\ No newline at end of file
+/*.log
+/runs/
\ No newline at end of file
diff --git a/target/snitch_cluster/Makefile b/target/snitch_cluster/Makefile
index 7b38bbad6..037621213 100644
--- a/target/snitch_cluster/Makefile
+++ b/target/snitch_cluster/Makefile
@@ -9,7 +9,7 @@
 # Makefile invocation #
 #######################
 
-DEBUG          ?= OFF  # ON to turn on debugging symbols
+DEBUG          ?= OFF  # ON to turn on debugging symbols and wave logging
 CFG_OVERRIDE   ?=      # Override default config file
 SELECT_RUNTIME ?=      # Select snRuntime implementation: "banshee" or "rtl" (default)
 
@@ -37,9 +37,6 @@ REGGEN          ?= $(shell $(BENDER) path register_interface)/vendor/lowrisc_ope
 CLUSTER_GEN     ?= $(ROOT)/util/clustergen.py
 CLUSTER_GEN_SRC ?= $(wildcard $(ROOT)/util/clustergen/*.py)
 
-VSIM      	?= vsim
-VLOG      	?= vlog
-
 #########################
 # Files and directories #
 #########################
@@ -71,9 +68,6 @@ QUESTA_64BIT = -64
 VLOG_64BIT   = -64
 
 VSIM_FLAGS += ${QUESTA_64BIT}
-VSIM_FLAGS += -t 1ps
-VSIM_FLAGS += -voptargs=+acc
-VSIM_FLAGS += -do "log -r /*; run -a"
 
 VLOG_FLAGS += -svinputport=compat
 VLOG_FLAGS += -override_timescale 1ns/1ps
@@ -245,7 +239,7 @@ clean-vsim: clean-work
 	rm -rf bin/snitch_cluster.vsim bin/snitch_cluster.vsim.gui $(VSIM_BUILDDIR) vsim.wlf
 
 ${VSIM_BUILDDIR}/compile.vsim.tcl:
-	vlib $(dir $@)
+	$(VLIB) $(dir $@)
 	${BENDER} script vsim ${VSIM_BENDER} --vlog-arg="${VLOG_FLAGS} -work $(dir $@) " > $@
 	echo '${VLOG} -work $(dir $@) ${TB_CC_SOURCES} ${TB_ASM_SOURCES} -vv -ccflags "$(TB_CC_FLAGS)"' >> $@
 	echo 'return 0' >> $@
@@ -267,22 +261,10 @@ clean-vcs: clean-work
 # Build compilation script and compile all sources for VCS simulation
 bin/snitch_cluster.vcs: ${VCS_SOURCES} ${TB_SRCS} $(TB_CC_SOURCES) $(TB_ASM_SOURCES) $(VCS_BUILDDIR)/compile.sh work/lib/libfesvr.a
 	mkdir -p bin
-	vcs -Mlib=$(VCS_BUILDDIR) -Mdir=$(VCS_BUILDDIR) -o bin/snitch_cluster.vcs -cc $(CC) -cpp $(CXX) \
+	$(VCS) -Mlib=$(VCS_BUILDDIR) -Mdir=$(VCS_BUILDDIR) -o bin/snitch_cluster.vcs -cc $(CC) -cpp $(CXX) \
 		-assert disable_cover -override_timescale=1ns/1ps -full64 tb_bin $(TB_CC_SOURCES) $(TB_ASM_SOURCES) \
 		-CFLAGS "$(TB_CC_FLAGS)" -LDFLAGS "-L${FESVR}/lib" -lfesvr
 
-##########
-# Traces #
-##########
-
-$(LOGS_DIR)/perf.csv: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/')) \
-		$(PERF_CSV_PY)
-	$(PYTHON) $(PERF_CSV_PY) -o $@ -i $(LOGS_DIR)/hart_*_perf.json
-
-$(LOGS_DIR)/event.csv: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/')) \
-		$(PERF_CSV_PY)
-	$(PYTHON) $(PERF_CSV_PY) -o $@ -i $(LOGS_DIR)/hart_*_perf.json --filter tstart tend
-
 ########
 # Util #
 ########
diff --git a/target/snitch_cluster/cfg/default.hjson b/target/snitch_cluster/cfg/default.hjson
index c39c2a490..7f28a1073 100644
--- a/target/snitch_cluster/cfg/default.hjson
+++ b/target/snitch_cluster/cfg/default.hjson
@@ -34,8 +34,8 @@
             lat_comp_fp8: 1,
             lat_comp_fp8_alt: 1,
             lat_noncomp: 1,
-            lat_conv: 1,
-            lat_sdotp: 2,
+            lat_conv: 2,
+            lat_sdotp: 3,
             fpu_pipe_config: "BEFORE"
             narrow_xbar_latency: "CUT_ALL_PORTS",
             wide_xbar_latency: "CUT_ALL_PORTS",
diff --git a/target/snitch_cluster/run.py b/target/snitch_cluster/run.py
new file mode 100755
index 000000000..bef478ef7
--- /dev/null
+++ b/target/snitch_cluster/run.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).parent / '../../util/sim'))
+from sim_utils import parser, get_simulations, run_simulations  # noqa: E402
+from Simulator import QuestaSimulator, VCSSimulator, VerilatorSimulator, \
+                      BansheeSimulator  # noqa: E402
+
+
+SIMULATORS = {
+    'vsim': QuestaSimulator(Path(__file__).parent.resolve() / 'bin/snitch_cluster.vsim'),
+    'vcs': VCSSimulator(Path(__file__).parent.resolve() / 'bin/snitch_cluster.vcs'),
+    'verilator': VerilatorSimulator(Path(__file__).parent.resolve() / 'bin/snitch_cluster.vlt'),
+    'banshee': BansheeSimulator(Path(__file__).parent.resolve() / 'src/banshee.yaml')
+}
+
+
+def main():
+    args = parser('vsim', SIMULATORS.keys()).parse_args()
+    simulations = get_simulations(args.testlist, SIMULATORS[args.simulator], args.run_dir)
+    return run_simulations(simulations,
+                           n_procs=args.n_procs,
+                           dry_run=args.dry_run,
+                           early_exit=args.early_exit,
+                           verbose=args.verbose)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/target/snitch_cluster/sw/Makefile b/target/snitch_cluster/sw/Makefile
index 9badf70ea..a0115d00a 100644
--- a/target/snitch_cluster/sw/Makefile
+++ b/target/snitch_cluster/sw/Makefile
@@ -13,21 +13,19 @@ else
 RUNTIME = runtime/rtl
 endif
 
-MATH = ../../../sw/math
-
-SUBDIRS = runtime/banshee runtime/rtl $(MATH) apps tests
+SUBDIRS = runtime/banshee runtime/rtl math apps tests
 
 .PHONY: all $(SUBDIRS)
 
 all: $(SUBDIRS)
 
 # Explicit dependency of apps on runtime
-apps: $(RUNTIME) $(MATH)
+apps: $(RUNTIME) math
 	$(MAKE) -C $@ TARGET=$(TARGET)
 
 # Explicit dependency of tests on runtime
-tests: $(RUNTIME) $(MATH)
+tests: $(RUNTIME) math
 	$(MAKE) -C $@ $(TARGET)
 
-runtime/rtl runtime/banshee $(MATH):
+runtime/rtl runtime/banshee math:
 	$(MAKE) -C $@ $(TARGET)
diff --git a/target/snitch_cluster/sw/apps/common.mk b/target/snitch_cluster/sw/apps/common.mk
index d8b0659a4..e27a19cfd 100644
--- a/target/snitch_cluster/sw/apps/common.mk
+++ b/target/snitch_cluster/sw/apps/common.mk
@@ -22,6 +22,7 @@ RISCV_CFLAGS += -DBIST
 else
 RUNTIME_DIR := $(ROOT)/target/snitch_cluster/sw/runtime/rtl
 endif
+MATH_DIR := $(ROOT)/target/snitch_cluster/sw/math
 
 # Paths relative to the app including this Makefile
 BUILDDIR = $(abspath build)
@@ -37,19 +38,18 @@ INCDIRS += $(SNRT_DIR)/api/omp
 INCDIRS += $(SNRT_DIR)/src
 INCDIRS += $(SNRT_DIR)/src/omp
 INCDIRS += $(ROOT)/sw/deps/riscv-opcodes
-
-# Math library override
-INCDIRS += $(ROOT)/sw/math/arch/riscv64/bits/
-INCDIRS += $(ROOT)/sw/math/arch/generic
-INCDIRS += $(ROOT)/sw/math/src/include
-INCDIRS += $(ROOT)/sw/math/src/internal
-INCDIRS += $(ROOT)/sw/math/include/bits
 INCDIRS += $(ROOT)/sw/math/include
 
+LIBS  = $(MATH_DIR)/build/libmath.a
+LIBS += $(RUNTIME_DIR)/build/libsnRuntime.a
+
+LIBDIRS  = $(dir $(LIBS))
+LIBNAMES = $(patsubst lib%,%,$(notdir $(basename $(LIBS))))
+
 RISCV_LDFLAGS += -L$(abspath $(RUNTIME_DIR))
 RISCV_LDFLAGS += -T$(abspath $(SNRT_DIR)/base.ld)
-RISCV_LDFLAGS += -L$(abspath $(RUNTIME_DIR)/build/)
-RISCV_LDFLAGS += -lsnRuntime
+RISCV_LDFLAGS += $(addprefix -L,$(LIBDIRS))
+RISCV_LDFLAGS += $(addprefix -l,$(LIBNAMES))
 
 ###########
 # Outputs #
@@ -78,11 +78,11 @@ $(BUILDDIR):
 $(DEP): $(SRCS) | $(BUILDDIR)
 	$(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(ELF)' $< > $@
 
-$(ELF): $(SRCS) $(DEP) | $(BUILDDIR)
+$(ELF): $(SRCS) $(DEP) $(LIBS) | $(BUILDDIR)
 	$(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@
 
 $(DUMP): $(ELF) | $(BUILDDIR)
-	$(RISCV_OBJDUMP) -D $< > $@
+	$(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) $< > $@
 
 $(DWARF): $(ELF) | $(BUILDDIR)
 	$(RISCV_DWARFDUMP) $< > $@
diff --git a/target/snitch_cluster/sw/math/Makefile b/target/snitch_cluster/sw/math/Makefile
new file mode 100644
index 000000000..d0a83e86a
--- /dev/null
+++ b/target/snitch_cluster/sw/math/Makefile
@@ -0,0 +1,8 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+include ../toolchain.mk
+include ../../../../sw/math/Makefile
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index f25ea7641..ce241a8d4 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -68,11 +68,11 @@ runs:
   - elf: tests/build/varargs_2.elf
   - elf: tests/build/zero_mem.elf
   - elf: tests/build/non_null_exitcode.elf
-    exit_code: 14
+    retcode: 14
   - elf: apps/blas/axpy/build/axpy.elf
-    cmd: ../../sw/blas/axpy/verify.py {sim_bin} {elf}
+    cmd: [../../../sw/blas/axpy/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/blas/gemm/build/gemm.elf
-    cmd: ../../sw/blas/gemm/verify.py {sim_bin} {elf}
+    cmd: [../../../sw/blas/gemm/verify.py, "${sim_bin}", "${elf}"]
   - elf: apps/dnn/batchnorm/build/batchnorm.elf
   - elf: apps/dnn/linear/build/linear.elf
   - elf: apps/dnn/maxpool/build/maxpool.elf
diff --git a/target/snitch_cluster/sw/toolchain.mk b/target/snitch_cluster/sw/toolchain.mk
index 4fa0fc5af..3d50974b8 100644
--- a/target/snitch_cluster/sw/toolchain.mk
+++ b/target/snitch_cluster/sw/toolchain.mk
@@ -34,6 +34,7 @@ RISCV_CFLAGS += -mcmodel=medany
 # RISCV_CFLAGS += -mno-fdiv # Not supported by Clang
 RISCV_CFLAGS += -ffast-math
 RISCV_CFLAGS += -fno-builtin-printf
+RISCV_CFLAGS += -fno-builtin-sqrtf
 RISCV_CFLAGS += -fno-common
 RISCV_CFLAGS += -fopenmp
 RISCV_CFLAGS += -ftls-model=local-exec
@@ -54,3 +55,7 @@ RISCV_LDFLAGS += -lclang_rt.builtins-riscv32
 
 # Archiver flags
 RISCV_ARFLAGS = rcs
+
+# Objdump flags
+RISCV_OBJDUMP_FLAGS += --mcpu=snitch
+RISCV_OBJDUMP_FLAGS += -D
diff --git a/util/container/Dockerfile b/util/container/Dockerfile
index ea320f325..d917a6790 100644
--- a/util/container/Dockerfile
+++ b/util/container/Dockerfile
@@ -7,7 +7,11 @@
 # 1. Stage
 FROM ubuntu:18.04 AS builder
 ARG CMAKE_VERSION=3.19.4
+ARG PYTHON_VERSION=3.9.12
+# Run dpkg without interactive dialogue
+ARG DEBIAN_FRONTEND=noninteractive
 
+# Install APT requirements
 COPY apt-requirements.txt /tmp/apt-requirements.txt
 RUN apt-get update && \
     sed 's/#.*//' /tmp/apt-requirements.txt \
@@ -20,8 +24,26 @@ RUN apt-get update && \
         lsb-release \
         software-properties-common \
         unzip \
-        wget \
-        zlib1g-dev
+        wget
+# Required to install Python
+RUN apt-get update && apt-get install -y \
+        zlib1g-dev \
+        libreadline-gplv2-dev \
+        libncursesw5-dev \
+        libssl-dev \
+        libsqlite3-dev \
+        tk-dev \
+        libgdbm-dev \
+        libc6-dev \
+        libbz2-dev \
+        libffi-dev
+
+# Install Python
+RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
+RUN tar xzf Python-${PYTHON_VERSION}.tgz
+# Bound the number of parallel make jobs to the available cores: a bare
+# `-j` lets make spawn an unlimited number of jobs.
+RUN cd Python-${PYTHON_VERSION} && \
+    ./configure --enable-optimizations --prefix=/opt/python/ && \
+    make install -j"$(nproc)"
 
 # Build Rust tools
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
@@ -37,6 +59,7 @@ RUN wget https://apt.llvm.org/llvm.sh
 RUN chmod +x llvm.sh
 RUN ./llvm.sh 12
 
+# Change working directory
 WORKDIR /tools
 
 # Install a newer version of cmake (we need this for banshee)
@@ -73,9 +96,11 @@ RUN apt-get update && \
     sed 's/#.*//' /tmp/apt-requirements.txt \
         | xargs apt-get install -y && \
     apt-get install -y --no-install-recommends \
+        ca-certificates \
         gnupg2 \
         curl \
         wget \
+        build-essential \
         git && \
     apt-get clean ; \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/doc/*
@@ -86,12 +111,7 @@ RUN echo 'deb http://download.opensuse.org/repositories/home:/phiwag:/edatools/x
     apt-get update && apt-get install -y verilator-${VERILATOR_VERSION} && \
     apt-get clean ; \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/doc/*
-
-# Install Python requirements
-COPY python-requirements.txt /tmp/python-requirements.txt
-COPY docs/requirements.txt /tmp/docs/requirements.txt
-COPY sw/dnn/requirements.txt /tmp/sw/dnn/requirements.txt
-RUN pip3 install -r /tmp/python-requirements.txt
+ENV VLT_ROOT "/usr/share/verilator"
 
 # Get the precompiled LLVM toolchain
 RUN latest_tag=`curl -s -H "Accept: application/vnd.github.v3+json" https://api.github.com/repos/pulp-platform/llvm-project/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/'` && \
@@ -119,6 +139,17 @@ RUN apt-get update && apt-get install software-properties-common -y && \
 # Copy artifacts from stage 1.
 COPY --from=builder /root/.cargo/bin/bender bin/
 COPY --from=builder /root/.cargo/bin/banshee bin/
+COPY --from=builder /opt/python /opt/python
+
+# Create and activate virtual environment
+ENV VIRTUAL_ENV "/root/.venvs/snitch_cluster"
+RUN /opt/python/bin/python3 -m venv ${VIRTUAL_ENV}
+ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
+# Install Python requirements
+COPY python-requirements.txt /tmp/python-requirements.txt
+COPY docs/requirements.txt /tmp/docs/requirements.txt
+COPY sw/dnn/requirements.txt /tmp/sw/dnn/requirements.txt
+RUN pip install -r /tmp/python-requirements.txt
 
 # Set locale to UTF-8, required because Python 3.6 defaults on ASCII encoding.
 # See https://click.palletsprojects.com/en/8.1.x/unicode-support/
diff --git a/util/sim/Simulation.py b/util/sim/Simulation.py
new file mode 100644
index 000000000..3cc219389
--- /dev/null
+++ b/util/sim/Simulation.py
@@ -0,0 +1,242 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from termcolor import colored, cprint
+from pathlib import Path
+import subprocess
+import re
+import os
+from mako.template import Template
+
+
+class Simulation(object):
+    """Provides a common interface to manage simulations.
+
+    A simulation wraps a single command, stored as a list of arguments
+    in the `cmd` attribute, which is launched as a subprocess in the
+    run directory. The combined stdout and stderr of the subprocess
+    are redirected to a log file in that same directory.
+    """
+
+    # Name of the log file created in the run directory
+    LOG_FILE = 'sim.txt'
+
+    def __init__(self, elf=None, dry_run=False, retcode=0, run_dir=None):
+        """Constructor for the Simulation class.
+
+        A Simulation object is defined at a minimum by a software
+        binary to be simulated on the desired hardware. The hardware is
+        implicitly determined by the simulation command.
+
+        Arguments:
+            elf: The software binary to simulate. Although it defaults
+                to None, a path-like value is required in practice, as
+                the test name is derived from it.
+            dry_run: A preview of the simulation command will be
+                displayed without actually launching the simulation.
+            retcode: The return code the simulation must terminate
+                with to be considered successful.
+            run_dir: The directory where to launch the simulation
+                command. If none is passed, the current working
+                directory is assumed.
+        """
+        self.elf = elf
+        self.dry_run = dry_run
+        self.run_dir = run_dir if run_dir is not None else Path.cwd()
+        self.testname = Path(self.elf).stem
+        self.cmd = []
+        self.log = None
+        self.process = None
+        self.expected_retcode = int(retcode)
+
+    def launch(self, dry_run=None):
+        """Launch the simulation.
+
+        Launch the simulation by invoking the command stored in the
+        `cmd` attribute of the class. Subclasses are required to define
+        a non-empty `cmd` attribute prior to invoking this method.
+
+        Arguments:
+            dry_run: A preview of the simulation command is displayed
+                without actually launching the simulation. If None,
+                the setting passed to the constructor is retained.
+        """
+        # Override dry_run setting at launch time
+        if dry_run is not None:
+            self.dry_run = dry_run
+
+        # Print launch message and simulation command
+        cprint(f'Run test {colored(self.elf, "cyan")}', attrs=["bold"])
+        cmd_string = ' '.join(self.cmd)
+        print(f'[{self.run_dir}]$ {cmd_string}', flush=True)
+
+        # Launch simulation if not doing a dry run
+        if not self.dry_run:
+            # Create run directory and log file
+            os.makedirs(self.run_dir, exist_ok=True)
+            self.log = self.run_dir / self.LOG_FILE
+            # Launch simulation subprocess, without waiting for it to terminate
+            with open(self.log, 'w') as f:
+                self.process = subprocess.Popen(self.cmd, stdout=f, stderr=subprocess.STDOUT,
+                                                cwd=self.run_dir, universal_newlines=True)
+
+    def completed(self):
+        """Return whether the simulation completed.
+
+        A dry run is always considered completed. A simulation which
+        was never launched is not.
+        """
+        if self.dry_run:
+            return True
+        elif self.process:
+            return self.process.poll() is not None
+        else:
+            return False
+
+    def get_retcode(self):
+        """Get the return code of the simulation.
+
+        Returns:
+            Zero for a dry run. Otherwise the return code of the
+                simulation subprocess, or None if the simulation was
+                not launched or did not terminate yet.
+        """
+        if self.dry_run:
+            return 0
+        # The return code is only available once the subprocess terminated
+        if self.process is None or self.process.returncode is None:
+            return None
+        return int(self.process.returncode)
+
+    def successful(self):
+        """Return whether the simulation was successful.
+
+        A simulation is successful if its return code is available and
+        matches the expected return code.
+        """
+        actual_retcode = self.get_retcode()
+        if actual_retcode is not None:
+            return int(actual_retcode) == int(self.expected_retcode)
+        else:
+            return False
+
+    def print_log(self):
+        """Print a log of the simulation to stdout.
+
+        Nothing is printed if no log file exists, i.e. if the
+        simulation was never launched or was launched as a dry run.
+        """
+        if self.log is None:
+            return
+        with open(self.log, 'r') as f:
+            print(f.read())
+
+    def print_status(self):
+        """Print a status message to stdout.
+
+        The status message reports whether the test is still running
+        or, if it completed, whether it was successful or failed.
+        """
+        if self.completed():
+            if self.successful():
+                cprint(f'{self.elf} test passed', 'green', attrs=['bold'], flush=True)
+            else:
+                cprint(f'{self.elf} test failed', 'red', attrs=['bold'], flush=True)
+        else:
+            cprint(f'{self.elf} test running', 'black', flush=True)
+
+
+class RTLSimulation(Simulation):
+    """A simulation launched through an RTL simulation binary.
+
+    The simulation binary is built in advance from some RTL design
+    and is invoked with the software binary as its only argument.
+    """
+
+    def __init__(self, sim_bin=None, **kwargs):
+        """Constructor for the RTLSimulation class.
+
+        Arguments:
+            sim_bin: The simulation binary.
+            kwargs: Arguments passed to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        # The simulation command is simply `<sim_bin> <elf>`
+        binary_arg = str(sim_bin)
+        elf_arg = str(self.elf)
+        self.cmd = [binary_arg, elf_arg]
+
+
+class VerilatorSimulation(RTLSimulation):
+    """An RTL simulation running on Verilator.
+
+    The return code of the simulation is returned directly as the
+    return code of the command launching the simulation.
+    """
+
+    def get_retcode(self):
+        """Get the return code of the simulation.
+
+        Returns:
+            Zero for a dry run. Otherwise the return code of the
+                simulation subprocess, or None if the simulation was
+                not launched or did not terminate yet.
+        """
+        # No subprocess exists in a dry run: do not dereference `self.process`
+        if self.dry_run:
+            return 0
+        if self.process is None:
+            return None
+        return self.process.returncode
+
+
+class QuestaVCSSimulation(RTLSimulation):
+    """An RTL simulation running on QuestaSim or VCS.
+
+    QuestaSim and VCS print out the simulation return code in the
+    simulation log. This is parsed to extract the return code.
+    """
+
+    def get_retcode(self):
+        """Extract the application's return code from the simulation log.
+
+        Returns:
+            Zero for a dry run. Otherwise the return code reported in
+                the log, or None if there is no log yet or the log
+                contains neither a success nor a failure message.
+        """
+        # No log file exists in a dry run or before the simulation is launched
+        if self.dry_run:
+            return 0
+        if self.log is None:
+            return None
+        regex_success = r'\[SUCCESS\] Program finished successfully'
+        regex_fail = r'\[FAILURE\] Finished with exit code\s+(\d+)'
+        with open(self.log, 'r') as f:
+            # The first line matching either message determines the return code
+            for line in f:
+                if re.search(regex_success, line):
+                    return 0
+                match = re.search(regex_fail, line)
+                if match:
+                    return int(match.group(1))
+        return None
+
+    def successful(self):
+        """Return whether the simulation was successful.
+
+        In addition to the check performed by the base class on the
+        application's return code, the simulation process itself must
+        have terminated with a null return code.
+        """
+        # No simulation process exists in a dry run
+        if self.dry_run:
+            return super().successful()
+        # Check that the simulation process terminated correctly
+        if self.process is None or self.process.returncode != 0:
+            return False
+        # Check that simulation return code matches expected value (in super class)
+        return super().successful()
+
+
+class QuestaSimulation(QuestaVCSSimulation):
+    """An RTL simulation running on QuestaSim."""
+
+    def __init__(self, **kwargs):
+        """Constructor for the QuestaSimulation class.
+
+        Arguments:
+            kwargs: Arguments passed to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        # Append an empty argument and the `-batch` flag to the base command.
+        # NOTE(review): presumably the empty string fills a positional
+        # argument of the simulation binary - confirm.
+        self.cmd.extend(['', '-batch'])
+
+
+class VCSSimulation(QuestaVCSSimulation):
+    """An RTL simulation running on VCS.
+
+    Inherits all of its behaviour from the common QuestaSim/VCS base
+    class, without any customization.
+    """
+
+
+class BansheeSimulation(Simulation):
+    """A simulation running on Banshee.
+
+    The return code of the simulation is returned directly as the
+    return code of the command launching the simulation.
+    """
+
+    def __init__(self, banshee_cfg=None, **kwargs):
+        """Constructor for the BansheeSimulation class.
+
+        Arguments:
+            banshee_cfg: A Banshee config file.
+            kwargs: Arguments passed to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        # Options passed to the `banshee` executable ahead of the software binary
+        banshee_opts = ['--no-opt-llvm', '--no-opt-jit']
+        banshee_opts += ['--configuration', str(banshee_cfg)]
+        banshee_opts += ['--trace']
+        self.cmd = ['banshee', *banshee_opts, str(self.elf)]
+
+
+class CustomSimulation(Simulation):
+    """A simulation which is run through a custom command.
+
+    The custom command generally invokes an RTL simulator binary behind
+    the scenes and executes some additional verification logic after
+    the end of the simulation.
+
+    Custom simulations are considered unsuccessful if the return code
+    of the custom command is non-null. As a custom command can
+    implement any verification logic, there is no reason to implement
+    any additional logic here.
+
+    Every argument of the custom command is treated as a Mako template
+    in which the `sim_bin`, `elf` and `run_dir` variables can be used.
+    """
+
+    def __init__(self, sim_bin=None, cmd=None, **kwargs):
+        """Constructor for the CustomSimulation class.
+
+        Arguments:
+            sim_bin: The simulation binary.
+            cmd: The custom command used to launch the simulation, as
+                a list of arguments.
+            kwargs: Arguments passed to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        # Variables made available to the templated command arguments
+        self.dynamic_args = {
+            'sim_bin': str(sim_bin),
+            'elf': str(self.elf),
+            'run_dir': str(self.run_dir)
+        }
+        self.cmd = cmd
+
+    def launch(self, **kwargs):
+        """Render the templated command arguments and launch the simulation.
+
+        Arguments:
+            kwargs: Arguments passed to the base class method.
+        """
+        # The run directory may be reassigned after construction, so
+        # refresh it to render the directory the command is really run in
+        self.dynamic_args['run_dir'] = str(self.run_dir)
+        self.cmd = [Template(arg).render(**self.dynamic_args) for arg in self.cmd]
+        super().launch(**kwargs)
diff --git a/util/sim/Simulator.py b/util/sim/Simulator.py
new file mode 100644
index 000000000..3d3090573
--- /dev/null
+++ b/util/sim/Simulator.py
@@ -0,0 +1,187 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from Simulation import QuestaSimulation, VCSSimulation, VerilatorSimulation, BansheeSimulation, \
+                       CustomSimulation
+
+
+class Simulator(object):
+    """A factory of Simulation objects.
+
+    Given a test object, as defined e.g. in a test suite specification
+    file, a simulator constructs the
+    [Simulation][Simulation.Simulation] object which runs that test.
+
+    A test is currently represented by a dictionary. At minimum it
+    carries the binary to be simulated under the `elf` key. It may
+    list the simulators it can be run on under the `simulators` key.
+    """
+
+    def __init__(self, name, simulation_cls):
+        """Constructor for the Simulator class.
+
+        Arguments:
+            name: The unique identifier string of the simulator.
+            simulation_cls: The type of
+                [Simulation][Simulation.Simulation] object the
+                simulator constructs by default.
+        """
+        self.simulation_cls = simulation_cls
+        self.name = name
+
+    def supports(self, test):
+        """Check whether a certain test is supported by the simulator.
+
+        A test which does not list any simulators is supported by all
+        simulators.
+
+        Arguments:
+            test: The test to check.
+        """
+        if 'simulators' in test:
+            return self.name in test['simulators']
+        return True
+
+    def get_simulation(self, test, simulation_cls=None, **kwargs):
+        """Construct a Simulation object from the specified test.
+
+        Arguments:
+            test: The test for which a Simulation object must be
+                constructed.
+            simulation_cls: Create a simulation instance of this
+                Simulation subclass. Use `self.simulation_cls` by
+                default.
+            kwargs: Additional arguments passed to the constructor of
+                the Simulation subclass.
+        """
+        # Forward the test attributes which are present, overriding same-named kwargs
+        for key in ('elf', 'run_dir', 'retcode'):
+            if key in test:
+                kwargs[key] = test[key]
+        cls = self.simulation_cls if simulation_cls is None else simulation_cls
+        return cls(**kwargs)
+
+
+class RTLSimulator(Simulator):
+    """Base class for RTL simulators.
+
+    An RTL simulator launches simulations through a simulation binary
+    built from an RTL design.
+
+    A test may carry a custom command (a list of args) under the `cmd`
+    key, e.g. to run some verification logic around the simulation.
+    For such a test the RTL simulator constructs a
+    [CustomSimulation][Simulation.CustomSimulation] object, to which
+    it passes both the custom command and the simulation binary.
+    """
+
+    def __init__(self, binary, **kwargs):
+        """Constructor for the RTLSimulator class.
+
+        Arguments:
+            binary: The simulation binary.
+            kwargs: Arguments passed to the base class constructor.
+        """
+        self.binary = binary
+        super().__init__(**kwargs)
+
+    def get_simulation(self, test):
+        """See base class.
+
+        Tests carrying a custom command are turned into custom
+        simulations, all others into simulations of the default type.
+        """
+        extra_args = {'sim_bin': self.binary}
+        if 'cmd' in test:
+            extra_args['simulation_cls'] = CustomSimulation
+            extra_args['cmd'] = test['cmd']
+        return super().get_simulation(test, **extra_args)
+
+
+class VCSSimulator(RTLSimulator):
+    """VCS simulator
+
+    The [RTL simulator][Simulator.RTLSimulator] named `vcs`, which
+    creates [VCS simulations][Simulation.VCSSimulation].
+    """
+
+    def __init__(self, binary):
+        """Constructor for the VCSSimulator class.
+
+        Arguments:
+            binary: The VCS simulation binary.
+        """
+        super().__init__(binary=binary, name='vcs', simulation_cls=VCSSimulation)
+
+
+class QuestaSimulator(RTLSimulator):
+    """QuestaSim simulator
+
+    The [RTL simulator][Simulator.RTLSimulator] named `vsim`, which
+    creates [QuestaSim simulations][Simulation.QuestaSimulation].
+    """
+
+    def __init__(self, binary):
+        """Constructor for the QuestaSimulator class.
+
+        Arguments:
+            binary: The QuestaSim simulation binary.
+        """
+        super().__init__(binary=binary, name='vsim', simulation_cls=QuestaSimulation)
+
+
+class VerilatorSimulator(RTLSimulator):
+    """Verilator simulator
+
+    The [RTL simulator][Simulator.RTLSimulator] named `verilator`,
+    which creates
+    [Verilator simulations][Simulation.VerilatorSimulation].
+    """
+
+    def __init__(self, binary):
+        """Constructor for the VerilatorSimulator class.
+
+        Arguments:
+            binary: The Verilator simulation binary.
+        """
+        super().__init__(binary=binary, name='verilator', simulation_cls=VerilatorSimulation)
+
+
+class BansheeSimulator(Simulator):
+    """Banshee simulator
+
+    The simulator named `banshee`, which creates
+    [Banshee simulations][Simulation.BansheeSimulation].
+    """
+
+    def __init__(self, cfg):
+        """Constructor for the BansheeSimulator class.
+
+        Arguments:
+            cfg: A Banshee config file.
+        """
+        self.cfg = cfg
+        super().__init__(name='banshee', simulation_cls=BansheeSimulation)
+
+    def supports(self, test):
+        """See base class.
+
+        The Banshee simulator does not support tests carrying a custom
+        command.
+        """
+        return 'cmd' not in test and super().supports(test)
+
+    def get_simulation(self, test):
+        """See base class.
+
+        The Banshee config file is passed on to the simulation.
+        """
+        return super().get_simulation(test, banshee_cfg=self.cfg)
diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py
index 664e2624b..2ed260d3f 100644
--- a/util/sim/data_utils.py
+++ b/util/sim/data_utils.py
@@ -9,7 +9,7 @@
 
 
 def emit_license():
-    s = (f"// Copyright {datetime.now().year} ETH Zurich and University of Bologna."
+    s = (f"// Copyright {datetime.now().year} ETH Zurich and University of Bologna.\n"
          f"// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n"
          f"// SPDX-License-Identifier: Apache-2.0\n\n")
     return s
diff --git a/util/sim/elf.py b/util/sim/elf.py
index a46a6764d..27ab5b3e7 100644
--- a/util/sim/elf.py
+++ b/util/sim/elf.py
@@ -36,6 +36,15 @@ def get_symbol_size(self, uid):
     def get_symbol_contents(self, uid):
         addr = self.get_symbol_address(uid)
         size = self.get_symbol_size(uid)
-        fpos = list(self.elf.address_offsets(addr, size))[0]
-        self.elf.stream.seek(fpos)
-        return self.elf.stream.read(size)
+        try:
+            fpos = list(self.elf.address_offsets(addr, size))[0]
+            self.elf.stream.seek(fpos)
+            contents = self.elf.stream.read(size)
+        except IndexError:
+            # We assume all segments in our ELF are of type PT_LOAD and
+            # that the only section whose contents are not stored in
+            # the ELF file is the .bss section. Therefore, whenever
+            # `address_offsets()` fails to return a valid offset into the
+            # file we assume that the address falls in the .bss section.
+            contents = bytearray([0] * size)
+        return contents
diff --git a/util/sim/sim_utils.py b/util/sim/sim_utils.py
new file mode 100755
index 000000000..371d56b81
--- /dev/null
+++ b/util/sim/sim_utils.py
@@ -0,0 +1,288 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+"""Convenience functions to set up a Python simulation framework.
+
+Such a framework enables you to transparently run a software test suite
+on any simulator of choice, provided that the latter is supported by
+the framework. It can be used in CIs, regression testing or to conduct
+systematic evaluation experiments.
+
+Three interfaces are required to implement a common framework:
+
+1. a test suite specification interface to specify the software tests
+2. a command-line interface used to launch the simulations
+3. an interface to the simulators supported by the framework
+
+The framework can be divided into three components each managing one of
+the defined interfaces:
+
+1. a test suite frontend
+2. a command-line frontend
+3. a simulation backend
+
+A fourth component, the core, serves to glue all other components
+together.
+
+The [parser()][sim_utils.parser] function provides a minimum
+command-line interface to control the tool.
+
+The [get_simulations()][sim_utils.get_simulations] function
+provides a common means to implement the test suite frontend. At the
+input interface it assumes a test suite specification file in YAML
+syntax, and returns a list of simulation objects which implement a
+common interface to the simulation backend. This interface is defined
+by the [Simulation][Simulation.Simulation] class.
+
+The core logic of the framework is implemented in the
+[run_simulations()][sim_utils.run_simulations] function. It takes
+the output from [get_simulations()][sim_utils.get_simulations] and
+launches the simulations through the interface to the simulation
+backend.
+
+The simulation backend is implemented by the
+[Simulation][Simulation.Simulation] and
+[Simulator][Simulator.Simulator] classes and their subclasses.
+"""
+
+import argparse
+from termcolor import colored, cprint
+from pathlib import Path
+import os
+import time
+import yaml
+import signal
+import psutil
+
+POLL_PERIOD = 0.2
+
+
+def parser(default_simulator='vsim', simulator_choices=['vsim']):
+    """Build the default command-line parser for simulation frameworks.
+
+    The returned Python `argparse` parser provides the common options
+    used to simulate one or multiple binaries on an RTL design. Further
+    arguments can be added to it by the caller.
+
+    Args:
+        default_simulator: The simulator to be used when none is
+            specified on the command-line.
+        simulator_choices: All simulator choices which can be passed on
+            the command-line.
+    """
+    arg_parser = argparse.ArgumentParser()
+
+    # Test suite specification file
+    arg_parser.add_argument('testlist', help='File specifying a list of apps to run')
+
+    # Options taking a value
+    arg_parser.add_argument(
+        '--simulator', nargs='?', choices=simulator_choices, default=default_simulator,
+        help='Choose a simulator to run the test with')
+    arg_parser.add_argument(
+        '--run-dir', nargs='?', default='runs',
+        help='Parent directory of each test run directory')
+
+    # Boolean switches
+    switches = (
+        ('--dry-run', 'Preview the simulation commands which will be run'),
+        ('--early-exit', 'Exit as soon as any test fails'),
+        ('--verbose', 'Activate verbose printing'),
+    )
+    for flag, description in switches:
+        arg_parser.add_argument(flag, action='store_true', help=description)
+
+    # Degree of parallelism
+    arg_parser.add_argument(
+        '-j', dest='n_procs', nargs='?', type=int, default=1, const=os.cpu_count(),
+        help=('Maximum number of tests to run in parallel. '
+              'One if the option is not present. Equal to the number of CPU cores '
+              'if the option is present but not followed by an argument.'))
+    return arg_parser
+
+
+def _resolve_relative_path(base_path, s):
+    """Resolve a relative path string w.r.t. a certain base.
+
+    A string is treated as a relative path only if it starts with
+    `./` or `../`. In that case it is resolved against the base path
+    and returned as an absolute path string. Any other input is
+    returned unchanged, including absolute paths and inputs which
+    cannot be interpreted as a path at all.
+
+    Args:
+        base_path: The base path
+        s: The input string
+    """
+    try:
+        # Get the absolute path of the base directory
+        base_dir = Path(base_path).resolve()
+        candidate = Path(s)
+        is_relative = not candidate.is_absolute() and s.startswith(("./", "../"))
+        if is_relative:
+            # Resolve the path against the base directory
+            return str((base_dir / candidate).resolve())
+        return s
+    except (TypeError, ValueError):
+        # Handle invalid base_path or s
+        return s
+    except Exception as e:
+        # Handle other exceptions like permission errors, etc.
+        print(f"An error occurred: {str(e)}")
+        return s
+
+
+def get_simulations(testlist, simulator, run_dir=None):
+    """Create simulation objects from a test list file.
+
+    Args:
+        testlist: Path to a test list file. A test list file is a YAML
+            file describing a set of tests, listed under the `runs`
+            key.
+        simulator: The simulator to use to run the tests. A test run on
+            a specific simulator defines a simulation.
+        run_dir: A directory under which all tests should be run. If
+            provided, a unique subdirectory for each test will be
+            created under this directory, based on the test name.
+
+    Returns:
+        A list of `Simulation` objects. The list contains a
+            `Simulation` object for every test which supports the given
+            `simulator`. This object defines a simulation of the test on
+            that particular `simulator`.
+    """
+    # Load the tests from the test list file
+    testlist_path = Path(testlist).absolute()
+    testlist_dir = testlist_path.parent
+    with open(testlist_path, 'r') as f:
+        tests = yaml.safe_load(f)['runs']
+
+    # Paths in the test list file are relative to the file itself: make them absolute
+    for test in tests:
+        test['elf'] = testlist_dir / test['elf']
+        if 'cmd' in test:
+            test['cmd'] = [_resolve_relative_path(testlist_dir, arg) for arg in test['cmd']]
+
+    # Only tests supported by the specified simulator result in a simulation
+    simulations = []
+    for test in tests:
+        if simulator.supports(test):
+            simulations.append(simulator.get_simulation(test))
+
+    # Assign every simulation its own subdirectory under the run directory
+    if run_dir is not None:
+        parent_dir = Path(run_dir)
+        for sim in simulations:
+            sim.run_dir = parent_dir / sim.testname
+    return simulations
+
+
+def print_summary(failed_sims, early_exit=False, dry_run=False):
+    """Print a summary of the simulation suite's exit status.
+
+    The summary lists the status of every failed simulation, or a
+    success message if there is none. Nothing is printed in dry run
+    mode.
+
+    Args:
+        failed_sims: A list of failed simulations from the simulation
+            suite.
+        early_exit: Whether the simulation suite was configured to
+            terminate upon the first failing simulation.
+        dry_run: Whether the simulation suite was launched in dry run
+            mode.
+    """
+    if dry_run:
+        return
+    header = f'==== Test summary {"(early exit)" if early_exit else ""} ===='
+    cprint(header, attrs=['bold'])
+    if not failed_sims:
+        print(colored("All tests passed!", "green"))
+        return
+    for sim in failed_sims:
+        sim.print_status()
+
+
+def terminate_processes():
+    """Kill all other processes in the process group of this script.
+
+    Every process which belongs to the same process group as the
+    current Python process is sent a SIGKILL, except the current
+    process itself.
+    """
+    print('Terminate processes')
+    # Get PID and PGID of parent process (current Python script)
+    ppid = os.getpid()
+    pgid = os.getpgid(0)
+    # Kill processes in current process group, except parent process
+    for proc in psutil.process_iter(['pid', 'name']):
+        pid = proc.info['pid']
+        if pid == ppid:
+            continue
+        try:
+            if os.getpgid(pid) == pgid:
+                os.kill(pid, signal.SIGKILL)
+        except (ProcessLookupError, PermissionError):
+            # The process terminated in the meantime or is not ours to
+            # inspect: skip it rather than aborting the whole cleanup
+            continue
+
+
+def get_unique_run_dir(sim, prefix=None):
+    """Get unique run directory for a simulation.
+
+    If the simulation was already assigned a run directory at creation
+    time, None is returned. Otherwise, return a unique run directory
+    based on the testname under an optional prefix directory.
+
+    Args:
+        sim: The simulation for which the run directory is
+            requested.
+        prefix: Get a unique run directory under a directory which
+            could be common to multiple simulations. We call this
+            a prefix. Can be a string or a `Path`. By default the
+            current working directory is assumed as the prefix.
+
+    Returns:
+        The run directory as a `Path`, or None if the simulation
+            already has a run directory.
+    """
+    if sim.run_dir is not None:
+        return None
+    if prefix is None:
+        prefix = Path.cwd()
+    # Accept plain strings too, e.g. as parsed from the command line
+    return Path(prefix) / sim.testname
+
+
+def run_simulations(simulations, n_procs=1, dry_run=None, early_exit=False,
+                    verbose=False):
+    """Run simulations defined by a list of `Simulation` objects.
+
+    Simulations are launched in list order, with at most `n_procs`
+    simulations running at any time. Running simulations are polled
+    periodically and their status is reported as they complete.
+
+    Args:
+        simulations: A list of `Simulation` objects as returned e.g. by
+            [sim_utils.get_simulations][]. Simulations are removed
+            from this list as they are launched.
+        n_procs: Maximum number of simulations to run in parallel.
+        dry_run: Passed on to every simulation at launch time. If
+            None, every simulation retains its own dry run setting.
+        early_exit: Stop as soon as any simulation fails, killing all
+            simulations which are still running.
+        verbose: Print the log of every failed simulation.
+
+    Returns:
+        The number of failed simulations.
+    """
+    # Register SIGTERM handler, used to gracefully terminate all simulation subprocesses
+    signal.signal(signal.SIGTERM, lambda _, __: terminate_processes())
+
+    # Spawn a process for every test, wait for all running tests to terminate and check results
+    running_sims = []
+    failed_sims = []
+    early_exit_requested = False
+    try:
+        while (len(simulations) or len(running_sims)) and not early_exit_requested:
+            # If there are still simulations to run and there are less running simulations than
+            # the maximum number of processes allowed in parallel, spawn new simulation
+            if len(simulations) and len(running_sims) < n_procs:
+                running_sims.append(simulations.pop(0))
+                running_sims[-1].launch(dry_run=dry_run)
+            # Remove completed sims from running sims list. Pop from the highest index
+            # down, so that the remaining indices are not invalidated
+            idcs = [i for i, sim in enumerate(running_sims) if sim.completed()]
+            completed_sims = [running_sims.pop(i) for i in sorted(idcs, reverse=True)]
+            # Check completed sims and report status
+            for sim in completed_sims:
+                if sim.successful():
+                    sim.print_status()
+                else:
+                    failed_sims.append(sim)
+                    if verbose:
+                        sim.print_log()
+                    sim.print_status()
+                    # If in early-exit mode, terminate as soon as any simulation fails
+                    if early_exit:
+                        early_exit_requested = True
+                        break
+            time.sleep(POLL_PERIOD)
+    except KeyboardInterrupt:
+        # An interrupt from the user is handled as an early exit
+        early_exit_requested = True
+
+    # Clean up after early exit
+    if early_exit_requested:
+        terminate_processes()
+
+    # Print summary, which is omitted in dry run mode
+    print_summary(failed_sims, early_exit_requested, dry_run=bool(dry_run))
+    return len(failed_sims)
diff --git a/util/sim/simulate.py b/util/sim/simulate.py
deleted file mode 100755
index 4e36cc1e1..000000000
--- a/util/sim/simulate.py
+++ /dev/null
@@ -1,270 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2023 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Luca Colagrande <colluca@iis.ee.ethz.ch>
-
-# TODO colluca: timeout feature
-
-import argparse
-import multiprocessing
-from pathlib import Path
-import subprocess
-from termcolor import colored, cprint
-import os
-import re
-import sys
-import time
-import yaml
-
-
-BANSHEE_CFG = 'src/banshee.yaml'
-
-# Tool settings
-SIMULATORS = ['vsim', 'banshee', 'verilator', 'vcs', 'other']
-DEFAULT_SIMULATOR = SIMULATORS[0]
-SIMULATOR_BINS = {
-    'vsim': 'bin/snitch_cluster.vsim',
-    'banshee': 'banshee',
-    'verilator': 'bin/snitch_cluster.vlt',
-    'vcs': 'bin/snitch_cluster.vcs'
-}
-SIMULATOR_CMDS = {
-    'vsim': '{sim_bin} {elf} "" -batch',
-    'banshee': ('{{sim_bin}} --no-opt-llvm --no-opt-jit --configuration {cfg}'
-                ' --trace {{elf}} > /dev/null').format(cfg=BANSHEE_CFG),
-    'verilator': '{sim_bin} {elf}',
-    'vcs': '{sim_bin} {elf}'
-}
-
-
-def parse_args():
-    # Argument parsing
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        'testlist',
-        help='File specifying a list of apps to run')
-    parser.add_argument(
-        '--simulator',
-        action='store',
-        nargs='?',
-        default=DEFAULT_SIMULATOR,
-        choices=SIMULATORS,
-        help='Choose a simulator to run the test with')
-    parser.add_argument(
-        '--sim-bin',
-        action='store',
-        nargs='?',
-        help='Override default path to simulator binary')
-    parser.add_argument(
-        '--dry-run',
-        action='store_true',
-        help='Preview the simulation commands which will be run')
-    parser.add_argument(
-        '--early-exit',
-        action='store_true',
-        help='Exit as soon as any test fails')
-    parser.add_argument(
-        '-j',
-        action='store',
-        dest='n_procs',
-        nargs='?',
-        type=int,
-        default=1,
-        const=os.cpu_count(),
-        help=('Maximum number of tests to run in parallel. '
-              'One if the option is not present. Equal to the number of CPU cores '
-              'if the option is present but not followed by an argument.'))
-    parser.add_argument(
-        '--verbose',
-        action='store_true',
-        help=('Option to print simulation logs when multiple tests are run in parallel.'
-              'Logs are always printed when n_procs == 1'))
-    args = parser.parse_args()
-    return args
-
-
-# Get tests from a test list file
-def get_tests(testlist_path):
-    testlist_path = Path(testlist_path).absolute()
-    with open(testlist_path, 'r') as file:
-        tests = yaml.safe_load(file)['runs']
-    return tests
-
-
-def check_exit_code(test, exit_code):
-    if 'exit_code' in test:
-        return not (int(test['exit_code']) == int(exit_code))
-    else:
-        return exit_code
-
-
-def multiple_processes(args):
-    return args.n_procs != 1
-
-
-def run_simulation(cmd, simulator, test, quiet=False):
-    # Defaults
-    result = 1
-    log = ''
-
-    # Spawn simulation subprocess
-    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-                         universal_newlines=True)
-
-    # Poll simulation subprocess and log its output
-    while p.poll() is None:
-        line = p.stdout.readline()
-        log += line
-        if not quiet:
-            print(line, end='', flush=True)
-
-        # When simulating with vsim or vcs, we need to parse the simulation
-        # log to catch the application's return code
-        if simulator in ['vsim', 'vcs']:
-            # Capture success
-            regex_success = r'\[SUCCESS\] Program finished successfully'
-            match_success = re.search(regex_success, line)
-            if match_success:
-                result = 0
-            else:
-                regex_fail = r'\[FAILURE\] Finished with exit code\s+(\d+)'
-                match = re.search(regex_fail, line)
-                if match:
-                    exit_code = match.group(1)
-                    result = check_exit_code(test, exit_code)
-
-    # Check if the subprocess terminated correctly
-    exit_code = p.poll()
-    # In Banshee and Verilator the exit code of the Snitch binary is returned
-    # through the exit code of the simulation command
-    if simulator in ['banshee', 'verilator']:
-        result = check_exit_code(test, exit_code)
-    # For custom commands the return code is that of the command
-    elif simulator == 'other':
-        result = exit_code
-    # For standard simulation commands the simulated Snitch binary exit
-    # code is overriden only if the simulator failed
-    else:
-        if exit_code != 0:
-            result = exit_code
-
-    return result, log
-
-
-def run_test(test, args):
-    # Extract args
-    simulator = args.simulator
-    sim_bin = args.sim_bin if args.sim_bin else SIMULATOR_BINS[simulator]
-    dry_run = args.dry_run
-    testlist = args.testlist
-    quiet = multiple_processes(args)
-
-    # Check if simulator is supported for this test
-    if 'simulators' in test:
-        if simulator not in test['simulators']:
-            return (0, '')
-
-    # Construct path to executable
-    elf = Path(test['elf'])
-    if testlist:
-        elf = Path(testlist).absolute().parent / elf
-    cprint(f'Run test {colored(elf, "cyan")}', attrs=["bold"])
-
-    # Construct simulation command (override only supported for RTL)
-    if 'cmd' in test and simulator != 'banshee':
-        cmd = test['cmd']
-        cmd = cmd.format(sim_bin=sim_bin, elf=elf, simulator=simulator)
-        simulator = 'other'
-    else:
-        cmd = SIMULATOR_CMDS[simulator]
-        cmd = cmd.format(sim_bin=sim_bin, elf=elf)
-    if not quiet:
-        print(f'$ {cmd}', flush=True)
-
-    # Run simulation
-    result = 0
-    log = ''
-    if not dry_run:
-        result, log = run_simulation(cmd, simulator, test, quiet)
-
-    # Report failure or success
-    if result != 0:
-        cprint(f'{elf} test failed', 'red', attrs=['bold'], flush=True)
-    else:
-        cprint(f'{elf} test passed', 'green', attrs=['bold'], flush=True)
-
-    return (result, log)
-
-
-def print_failed_test(test):
-    print(f'{colored(test["elf"], "cyan")} test {colored("failed", "red")}')
-
-
-def print_test_summary(failed_tests, args):
-    if not args.dry_run:
-        header = f'\n==== Test summary {"(early exit)" if args.early_exit else ""} ===='
-        cprint(header, attrs=['bold'])
-        if failed_tests:
-            for failed_test in failed_tests:
-                print_failed_test(failed_test)
-        else:
-            print(f'{colored("All tests passed!", "green")}')
-
-
-def run_tests(tests, args):
-
-    # Create a process Pool
-    with multiprocessing.Pool(args.n_procs) as pool:
-
-        # Create a shared object which parent and child processes can access
-        # concurrently to terminate the pool early as soon as one process fails
-        exit_early = multiprocessing.Value('B')
-        exit_early.value = 0
-
-        # Define callback for early exit
-        def completion_callback(return_value):
-            result = return_value[0]
-            log = return_value[1]
-            if args.early_exit and result != 0:
-                exit_early.value = 1
-            # Printing the log all at once here, rather than line-by-line
-            # in run_simulation, ensures that the logs of different processes
-            # are not interleaved in stdout.
-            # However, as we prefer line-by-line printing when a single process
-            # is used, we have to make sure we don't print twice.
-            if args.verbose and multiple_processes(args):
-                print(log)
-
-        # Queue tests to process pool
-        results = []
-        for test in tests:
-            result = pool.apply_async(run_test, args=(test, args), callback=completion_callback)
-            results.append(result)
-
-        # Wait for all tests to complete
-        running = range(len(tests))
-        while len(running) != 0 and not exit_early.value:
-            time.sleep(1)
-            running = [i for i in running if not results[i].ready()]
-
-    # Query test results
-    failed_tests = []
-    for test, result in zip(tests, results):
-        if result.ready() and result.get()[0] != 0:
-            failed_tests.append(test)
-
-    print_test_summary(failed_tests, args)
-
-    return len(failed_tests)
-
-
-def main():
-    args = parse_args()
-    tests = get_tests(args.testlist)
-    return run_tests(tests, args)
-
-
-if __name__ == '__main__':
-    sys.exit(main())
diff --git a/util/trace/perf_csv.py b/util/trace/perf_csv.py
index 450758c70..f26e242e2 100755
--- a/util/trace/perf_csv.py
+++ b/util/trace/perf_csv.py
@@ -17,7 +17,7 @@
 import pandas as pd
 
 
-HARTID_REGEX = r'\D*(\d*)\D*'
+HARTID_REGEX = r'hart_([0-9a-f]+)_perf.json'
 
 
 def main():