diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 01fff63fb..26e4ee4ff 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,7 +2,7 @@
# Unless a later match takes precedence, global owners below will be
# requested for review when someone opens a pull request.
-* @paulsc96 @colluca
+* @paulsc96 @colluca @fischeti
hw/snitch_cluster @paulsc96 @lucabertaccini
hw/snitch_dma @paulsc96 @thommythomaso
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f2c3e692a..f8f87b3f8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -36,19 +36,19 @@ jobs:
submodules: 'recursive'
- name: Build Software
run: |
+ bender vendor init
make -C target/snitch_cluster sw
- name: Build Hardware
run: |
make -C target/snitch_cluster bin/snitch_cluster.vlt
- name: Run Tests
working-directory: target/snitch_cluster
- run: |-
- ../../util/sim/simulate.py sw/run.yaml --simulator verilator -j \
- --verbose
+ run: |
+ ./run.py sw/run.yaml --simulator verilator -j
- ############################################
+ #########################################
# Build SW on Snitch Cluster w/ Banshee #
- ############################################
+ #########################################
sw-snitch-cluster-banshee:
name: Simulate SW on Snitch Cluster w/ Banshee
@@ -61,11 +61,11 @@ jobs:
submodules: 'recursive'
- name: Build Software
run: |
+ bender vendor init
make -C target/snitch_cluster SELECT_RUNTIME=banshee sw
- name: Run Tests
env:
SNITCH_LOG: info
working-directory: target/snitch_cluster
- run: |-
- ../../util/sim/simulate.py sw/run.yaml --simulator banshee -j \
- --verbose
+ run: |
+ ./run.py sw/run.yaml --simulator banshee -j
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 610c271ea..18cd5d4aa 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -5,15 +5,18 @@
variables:
GIT_STRATEGY: clone
GIT_SUBMODULE_STRATEGY: recursive
+ # Enable colors in CI terminal
+ TERM: ansi
+ FORCE_COLOR: 1
+ # Configure environment
PYTHON: /usr/local/anaconda3-2022.05/bin/python3
BENDER: bender-0.27.1
CC: gcc-9.2.0
CXX: g++-9.2.0
- VCS: vcs-2020.12
- VERILATOR: verilator-4.110
- QUESTA: questa-2022.3
+ VCS_SEPP: vcs-2020.12
+ VERILATOR_SEPP: verilator-4.110
+ QUESTA_SEPP: questa-2022.3
LLVM_BINROOT: /usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin
- CLANG: /usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/clang
CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER: /usr/pack/gcc-9.2.0-af/linux-x64/bin/gcc
LLVM_SYS_120_PREFIX: /usr/pack/llvm-12.0.1-af
CMAKE: cmake-3.18.1
@@ -21,7 +24,13 @@ variables:
before_script:
- $PYTHON -m venv .venv
- source .venv/bin/activate
- - pip install -r python-requirements.txt
+ # Unpack packages in a local temporary directory which can be safely cleaned
+ # after installation. Also protects against "No space left on device" errors
+ # occurring when the /tmp folder is filled by other processes.
+ - mkdir tmp
+ - TMPDIR=tmp pip install -r python-requirements.txt
+ - rm -rf tmp
+ - $BENDER vendor init
##############
# Build docs #
@@ -79,8 +88,8 @@ snitch-ip-tests:
- tcdm_interface
script:
- cd hw/$IP
- - $QUESTA ./util/compile.sh
- - $QUESTA ./util/run_vsim.sh
+ - ./util/compile.sh
+ - ./util/run_vsim.sh
########################
# Snitch cluster tests #
@@ -89,29 +98,26 @@ snitch-ip-tests:
# Verilator
snitch-cluster-vlt:
needs: [snitch-cluster-sw]
- # yamllint disable rule:line-length
script:
- cd target/snitch_cluster
- - $VERILATOR make bin/snitch_cluster.vlt
- - $VERILATOR ../../util/sim/simulate.py sw/run.yaml --simulator verilator -j --verbose
- # yamllint enable rule:line-length
+ - make bin/snitch_cluster.vlt
+ - ./run.py sw/run.yaml --simulator verilator -j --run-dir runs/vlt
# VCS
snitch-cluster-vcs:
needs: [snitch-cluster-sw]
script:
- cd target/snitch_cluster
- - $VCS make bin/snitch_cluster.vcs
- - $VCS ../../util/sim/simulate.py sw/run.yaml --simulator vcs -j --verbose
+ - make bin/snitch_cluster.vcs
+ - ./run.py sw/run.yaml --simulator vcs -j --run-dir runs/vcs
# Questa
snitch-cluster-vsim:
needs: [snitch-cluster-sw]
script:
- cd target/snitch_cluster
- - $QUESTA make bin/snitch_cluster.vsim
- - $QUESTA ../../util/sim/simulate.py sw/run.yaml --simulator vsim -j
- --verbose
+ - make bin/snitch_cluster.vsim
+ - ./run.py sw/run.yaml --simulator vsim -j --run-dir runs/vsim
# Banshee
snitch-cluster-banshee:
@@ -127,4 +133,4 @@ snitch-cluster-banshee:
- cd banshee
- cargo install --debug --path .
- cd ../target/snitch_cluster
- - ../../util/sim/simulate.py sw/run.yaml --simulator banshee -j --verbose
+ - ./run.py sw/run.yaml --simulator banshee -j --run-dir runs/banshee
diff --git a/Bender.yml b/Bender.yml
index 732788d0d..84dad47e8 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -22,7 +22,7 @@ dependencies:
axi: { git: https://github.com/pulp-platform/axi, version: 0.39.0 }
axi_riscv_atomics: { git: https://github.com/pulp-platform/axi_riscv_atomics, version: 0.6.0 }
common_cells: { git: https://github.com/pulp-platform/common_cells, version: 1.28.0 }
- FPnew: { git: https://github.com/openhwgroup/cvfpu, rev: 1202ca3 } # TODO: feature branch `feature/expanding_sdotp`; get merged!
+ FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 }
register_interface: { git: https://github.com/pulp-platform/register_interface, version: 0.4.2 }
tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.11 }
riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 }
@@ -37,13 +37,40 @@ vendor_package:
- "Makefile"
- ".gitignore"
- "README"
- - "src/math/tanh.c"
+ - "src/math/ceil.c"
+ - "src/math/ceilf.c"
+ - "src/math/ceill.c"
- "src/math/expm1.c"
+ - "src/math/expf.c"
+ - "src/math/exp2f_data.c"
+ - "src/math/exp2f_data.h"
+ - "src/math/log2.c"
+ - "src/math/log2_data.c"
+ - "src/math/log2_data.h"
+ - "src/math/log2f.c"
+ - "src/math/log2f_data.c"
+ - "src/math/log2f_data.h"
+ - "src/math/__math_divzero.c"
+ - "src/math/__math_invalid.c"
+ - "src/math/__math_invalidf.c"
+ - "src/math/__math_invalidl.c"
+ - "src/math/__math_oflow.c"
+ - "src/math/__math_oflowf.c"
+ - "src/math/__math_uflow.c"
+ - "src/math/__math_uflowf.c"
+ - "src/math/__math_xflow.c"
+ - "src/math/__math_xflowf.c"
+ - "src/math/sqrt.c"
+ - "src/math/sqrtf.c"
+ - "src/math/sqrt_data.c"
+ - "src/math/sqrt_data.h"
+ - "src/math/tanh.c"
- "src/internal/libm.h"
- "src/include/features.h"
- "include/endian.h"
- "include/math.h"
- "include/features.h"
+ - "include/float.h"
- "include/alltypes.h.in"
- "arch/riscv64/bits/alltypes.h.in"
- "arch/riscv64/bits/float.h"
diff --git a/Makefile b/Makefile
index 06a1c662f..087204634 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,8 @@
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
-REGGEN = $(shell bender path register_interface)/vendor/lowrisc_opentitan/util/regtool.py
+BENDER ?= bender
+REGGEN = $(shell $(BENDER) path register_interface)/vendor/lowrisc_opentitan/util/regtool.py
GENERATED_DOCS_DIR = docs/generated
GENERATED_DOC_SRCS = $(GENERATED_DOCS_DIR)/peripherals.md
@@ -16,9 +17,7 @@ clean: clean-docs
doc-srcs: $(GENERATED_DOC_SRCS)
docs: doc-srcs
- @if mkdocs build | grep -q "ERROR"; then \
- exit 1; \
- fi
+ mkdocs build
clean-docs:
rm -rf $(GENERATED_DOCS_DIR)
diff --git a/README.md b/README.md
index d5fe00f57..1f7b6459c 100644
--- a/README.md
+++ b/README.md
@@ -86,19 +86,39 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:
-Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra
+Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra
```
-@inproceedings{scheffler2021indirect,
+@article{scheffler2023sparsessr,
author={Scheffler, Paul and Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca},
- booktitle={2021 Design, Automation & Test in Europe Conference & Exhibition (DATE)},
- title={Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra},
- year={2021},
+ journal={IEEE Transactions on Parallel and Distributed Systems},
+ title={Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra},
+ year={2023},
+ volume={34},
+ number={12},
+ pages={3147-3161},
+ doi={10.1109/TPDS.2023.3322029}
+}
+```
+
+
+
+
+
+A High-performance, Energy-efficient Modular DMA Engine Architecture
+
+
+```
+@ARTICLE{benz2023idma,
+ author={Benz, Thomas and Rogenmoser, Michael and Scheffler, Paul and Riedel, Samuel and Ottaviano, Alessandro and Kurth, Andreas and Hoefler, Torsten and Benini, Luca},
+ journal={IEEE Transactions on Computers},
+ title={A High-performance, Energy-efficient Modular DMA Engine Architecture},
+ year={2023},
volume={},
number={},
- pages={1787-1792}
-}
+ pages={1-14},
+ doi={10.1109/TC.2023.3329930}}
```
diff --git a/apt-requirements.txt b/apt-requirements.txt
index 5bb7b560d..15f12e8b7 100644
--- a/apt-requirements.txt
+++ b/apt-requirements.txt
@@ -6,8 +6,4 @@
clang-format
device-tree-compiler
graphviz
-python3
-python3-pip
-python3-setuptools
-python3-wheel
tar
diff --git a/docs/publications.md b/docs/publications.md
index 6f14daa64..e4c86b4c6 100644
--- a/docs/publications.md
+++ b/docs/publications.md
@@ -42,19 +42,39 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:
-Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra
+Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra
```
-@inproceedings{scheffler2021indirect,
+@article{scheffler2023sparsessr,
author={Scheffler, Paul and Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca},
- booktitle={2021 Design, Automation & Test in Europe Conference & Exhibition (DATE)},
- title={Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra},
- year={2021},
+ journal={IEEE Transactions on Parallel and Distributed Systems},
+ title={Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra},
+ year={2023},
+ volume={34},
+ number={12},
+ pages={3147-3161},
+ doi={10.1109/TPDS.2023.3322029}
+}
+```
+
+
+
+
+
+A High-performance, Energy-efficient Modular DMA Engine Architecture
+
+
+```
+@ARTICLE{benz2023idma,
+ author={Benz, Thomas and Rogenmoser, Michael and Scheffler, Paul and Riedel, Samuel and Ottaviano, Alessandro and Kurth, Andreas and Hoefler, Torsten and Benini, Luca},
+ journal={IEEE Transactions on Computers},
+ title={A High-performance, Energy-efficient Modular DMA Engine Architecture},
+ year={2023},
volume={},
number={},
- pages={1787-1792}
-}
+ pages={1-14},
+ doi={10.1109/TC.2023.3329930}}
```
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 6a766858d..e913931e3 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -4,7 +4,8 @@
# Keep sorted.
mkdocs
-# Last version compatible with python-3.6 (default on Ubuntu 18.04)
-mkdocs-material <= 8.2.11
+mkdocs-material
mkdocs-include-markdown-plugin
-mkdocs-macros-plugin
\ No newline at end of file
+mkdocs-macros-plugin
+mkdocstrings
+mkdocstrings-python
diff --git a/docs/rm/custom_instructions.md b/docs/rm/custom_instructions.md
index 2a79b757a..f7fcfbd0d 100644
--- a/docs/rm/custom_instructions.md
+++ b/docs/rm/custom_instructions.md
@@ -37,7 +37,7 @@ The FREP instruction has the following signature:
| max_inst | max_rpt | stagger_max | stagger_mask | 0 | OP-CUSTOM1 | FREP.I |
| max_inst | max_rpt | stagger_max | stagger_mask | 1 | OP-CUSTOM1 | FREP.O |
-FREP.I and FREP.O repeat the *max_inst* instructions following the FREP instruction for *max_rpt + 1* times. The FREP.I instruction (*I* stands for inner) repeats every instruction the specified number of times and moves on to executing and repeating the next. The FREP.O instruction (*O* stands for outer) repeats the whole sequence of instructions *max_rpt + 1* times. Register staggering can be enabled and configured via the *stagger_mask* and *stagger_max* immediates. A detailed explanation of their use can be found in the Snitch [paper](/publications).
+FREP.I and FREP.O repeat the *max_inst + 1* instructions following the FREP instruction for *max_rpt + 1* times. The FREP.I instruction (*I* stands for inner) repeats every instruction the specified number of times and moves on to executing and repeating the next. The FREP.O instruction (*O* stands for outer) repeats the whole sequence of instructions *max_rpt + 1* times. Register staggering can be enabled and configured via the *stagger_mask* and *stagger_max* immediates. A detailed explanation of their use can be found in the Snitch [paper](/publications).
The assembly instruction signature follows:
diff --git a/docs/rm/sim/Simulation.md b/docs/rm/sim/Simulation.md
new file mode 100644
index 000000000..6671fb590
--- /dev/null
+++ b/docs/rm/sim/Simulation.md
@@ -0,0 +1 @@
+::: Simulation
diff --git a/docs/rm/sim/Simulator.md b/docs/rm/sim/Simulator.md
new file mode 100644
index 000000000..56f03482d
--- /dev/null
+++ b/docs/rm/sim/Simulator.md
@@ -0,0 +1 @@
+::: Simulator
diff --git a/docs/rm/sim/sim_utils.md b/docs/rm/sim/sim_utils.md
new file mode 100644
index 000000000..876e5fac4
--- /dev/null
+++ b/docs/rm/sim/sim_utils.md
@@ -0,0 +1 @@
+::: sim_utils
\ No newline at end of file
diff --git a/hw/mem_interface/util/compile.sh b/hw/mem_interface/util/compile.sh
index 73ccc7fca..1a3678cfa 100755
--- a/hw/mem_interface/util/compile.sh
+++ b/hw/mem_interface/util/compile.sh
@@ -10,11 +10,11 @@ set -e
[ ! -z "$VSIM" ] || VSIM=vsim
-bender script vsim -t test \
+${BENDER:-bender} script vsim -t test \
--vlog-arg="-svinputport=compat" \
--vlog-arg="-override_timescale 1ns/1ps" \
--vlog-arg="-suppress 2583" \
--vlog-arg="+cover=sbecft" \
> compile.tcl
echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/mem_interface/util/run_vsim.sh b/hw/mem_interface/util/run_vsim.sh
index e30929642..45a6b77e1 100755
--- a/hw/mem_interface/util/run_vsim.sh
+++ b/hw/mem_interface/util/run_vsim.sh
@@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
[ ! -z "$VSIM" ] || VSIM=vsim
call_vsim() {
- echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
+ echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
grep "Errors: 0," vsim.log
}
diff --git a/hw/reqrsp_interface/util/compile.sh b/hw/reqrsp_interface/util/compile.sh
index 73ccc7fca..af966e202 100755
--- a/hw/reqrsp_interface/util/compile.sh
+++ b/hw/reqrsp_interface/util/compile.sh
@@ -10,11 +10,11 @@ set -e
[ ! -z "$VSIM" ] || VSIM=vsim
-bender script vsim -t test \
+${BENDER:-bender} script vsim -t test \
--vlog-arg="-svinputport=compat" \
--vlog-arg="-override_timescale 1ns/1ps" \
--vlog-arg="-suppress 2583" \
--vlog-arg="+cover=sbecft" \
> compile.tcl
echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/reqrsp_interface/util/run_vsim.sh b/hw/reqrsp_interface/util/run_vsim.sh
index e7fe59fb9..9eeee2e14 100755
--- a/hw/reqrsp_interface/util/run_vsim.sh
+++ b/hw/reqrsp_interface/util/run_vsim.sh
@@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
[ ! -z "$VSIM" ] || VSIM=vsim
call_vsim() {
- echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
+ echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
grep "Errors: 0," vsim.log
}
diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv
index 2d38c63b3..5bb3b1b48 100644
--- a/hw/snitch_cluster/src/snitch_cc.sv
+++ b/hw/snitch_cluster/src/snitch_cc.sv
@@ -487,6 +487,7 @@ module snitch_cc #(
.trace_port_o ( fpu_trace ),
.sequencer_tracer_port_o ( fpu_sequencer_trace ),
// pragma translate_on
+ .hart_id_i ( hart_id_i ),
.acc_req_i ( acc_snitch_req ),
.acc_req_valid_i ( acc_qvalid ),
.acc_req_ready_o ( acc_qready ),
diff --git a/hw/snitch_cluster/src/snitch_fp_ss.sv b/hw/snitch_cluster/src/snitch_fp_ss.sv
index 0b994e19e..fc75a386c 100644
--- a/hw/snitch_cluster/src/snitch_fp_ss.sv
+++ b/hw/snitch_cluster/src/snitch_fp_ss.sv
@@ -42,6 +42,7 @@ module snitch_fp_ss import snitch_pkg::*; #(
output fpu_trace_port_t trace_port_o,
output fpu_sequencer_trace_port_t sequencer_tracer_port_o,
// pragma translate_on
+ input logic [31:0] hart_id_i,
// Accelerator Interface - Slave
input acc_req_t acc_req_i,
input logic acc_req_valid_i,
@@ -2509,6 +2510,7 @@ module snitch_fp_ss import snitch_pkg::*; #(
) i_fpu (
.clk_i ,
.rst_ni ( ~rst_i ),
+ .hart_id_i ( hart_id_i ),
.operands_i ( op ),
.rnd_mode_i ( fpu_rnd_mode ),
.op_i ( fpu_op ),
diff --git a/hw/snitch_cluster/src/snitch_fpu.sv b/hw/snitch_cluster/src/snitch_fpu.sv
index 44d28df45..ed7958edc 100644
--- a/hw/snitch_cluster/src/snitch_fpu.sv
+++ b/hw/snitch_cluster/src/snitch_fpu.sv
@@ -19,6 +19,7 @@ module snitch_fpu import snitch_pkg::*; #(
input logic clk_i,
input logic rst_ni,
// Input signals
+ input logic [31:0] hart_id_i,
input logic [2:0][FLEN-1:0] operands_i,
input fpnew_pkg::roundmode_e rnd_mode_i,
input fpnew_pkg::operation_e op_i,
@@ -99,12 +100,15 @@ module snitch_fpu import snitch_pkg::*; #(
fpnew_top #(
// FPU configuration
- .Features ( FPUFeatures ),
- .Implementation ( FPUImplementation ),
- .TagType ( logic[6:0] )
+ .Features ( FPUFeatures ),
+ .Implementation ( FPUImplementation ),
+ .TagType ( logic[6:0] ),
+ .CompressedVecCmpResult ( 1 ),
+ .StochasticRndImplementation ( fpnew_pkg::DEFAULT_RSR )
) i_fpu (
.clk_i ,
.rst_ni ,
+ .hart_id_i ( hart_id_i ),
.operands_i ( fpu_in_q.operands ),
.rnd_mode_i ( fpu_in_q.rnd_mode ),
.op_i ( fpu_in_q.op ),
@@ -114,6 +118,7 @@ module snitch_fpu import snitch_pkg::*; #(
.int_fmt_i ( fpu_in_q.int_fmt ),
.vectorial_op_i ( fpu_in_q.vectorial_op ),
.tag_i ( fpu_in_q.tag ),
+ .simd_mask_i ( '1 ),
.in_valid_i ( in_valid_q ),
.in_ready_o ( in_ready_q ),
.flush_i ( 1'b0 ),
diff --git a/hw/snitch_cluster/util/compile.sh b/hw/snitch_cluster/util/compile.sh
index 73ccc7fca..1a3678cfa 100755
--- a/hw/snitch_cluster/util/compile.sh
+++ b/hw/snitch_cluster/util/compile.sh
@@ -10,11 +10,11 @@ set -e
[ ! -z "$VSIM" ] || VSIM=vsim
-bender script vsim -t test \
+${BENDER:-bender} script vsim -t test \
--vlog-arg="-svinputport=compat" \
--vlog-arg="-override_timescale 1ns/1ps" \
--vlog-arg="-suppress 2583" \
--vlog-arg="+cover=sbecft" \
> compile.tcl
echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/snitch_cluster/util/run_vsim.sh b/hw/snitch_cluster/util/run_vsim.sh
index e9298efed..00d08aee3 100755
--- a/hw/snitch_cluster/util/run_vsim.sh
+++ b/hw/snitch_cluster/util/run_vsim.sh
@@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
[ ! -z "$VSIM" ] || VSIM=vsim
call_vsim() {
- echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
+ echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
grep "Errors: 0," vsim.log
}
diff --git a/hw/snitch_icache/util/compile.sh b/hw/snitch_icache/util/compile.sh
index 73ccc7fca..1a3678cfa 100755
--- a/hw/snitch_icache/util/compile.sh
+++ b/hw/snitch_icache/util/compile.sh
@@ -10,11 +10,11 @@ set -e
[ ! -z "$VSIM" ] || VSIM=vsim
-bender script vsim -t test \
+${BENDER:-bender} script vsim -t test \
--vlog-arg="-svinputport=compat" \
--vlog-arg="-override_timescale 1ns/1ps" \
--vlog-arg="-suppress 2583" \
--vlog-arg="+cover=sbecft" \
> compile.tcl
echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/snitch_icache/util/run_vsim.sh b/hw/snitch_icache/util/run_vsim.sh
index 94671daf5..42cc47f94 100755
--- a/hw/snitch_icache/util/run_vsim.sh
+++ b/hw/snitch_icache/util/run_vsim.sh
@@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
[ ! -z "$VSIM" ] || VSIM=vsim
call_vsim() {
- echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
+ echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
grep "Errors: 0," vsim.log
}
diff --git a/hw/snitch_ssr/util/compile.sh b/hw/snitch_ssr/util/compile.sh
index 73ccc7fca..af966e202 100755
--- a/hw/snitch_ssr/util/compile.sh
+++ b/hw/snitch_ssr/util/compile.sh
@@ -10,11 +10,11 @@ set -e
[ ! -z "$VSIM" ] || VSIM=vsim
-bender script vsim -t test \
+${BENDER:-bender} script vsim -t test \
--vlog-arg="-svinputport=compat" \
--vlog-arg="-override_timescale 1ns/1ps" \
--vlog-arg="-suppress 2583" \
--vlog-arg="+cover=sbecft" \
> compile.tcl
echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/tcdm_interface/util/compile.sh b/hw/tcdm_interface/util/compile.sh
index 73ccc7fca..1a3678cfa 100755
--- a/hw/tcdm_interface/util/compile.sh
+++ b/hw/tcdm_interface/util/compile.sh
@@ -10,11 +10,11 @@ set -e
[ ! -z "$VSIM" ] || VSIM=vsim
-bender script vsim -t test \
+${BENDER:-bender} script vsim -t test \
--vlog-arg="-svinputport=compat" \
--vlog-arg="-override_timescale 1ns/1ps" \
--vlog-arg="-suppress 2583" \
--vlog-arg="+cover=sbecft" \
> compile.tcl
echo 'return 0' >> compile.tcl
-$VSIM -c -do 'exit -code [source compile.tcl]'
+$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]'
diff --git a/hw/tcdm_interface/util/run_vsim.sh b/hw/tcdm_interface/util/run_vsim.sh
index 078ae72a8..6f10155d0 100755
--- a/hw/tcdm_interface/util/run_vsim.sh
+++ b/hw/tcdm_interface/util/run_vsim.sh
@@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
[ ! -z "$VSIM" ] || VSIM=vsim
call_vsim() {
- echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
+ echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1
grep "Errors: 0," vsim.log
}
diff --git a/mkdocs.yml b/mkdocs.yml
index 3f9595b0a..70d213601 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -22,6 +22,10 @@ markdown_extensions:
emoji_generator: !!python/name:materialx.emoji.to_svg
plugins:
- include-markdown
+ - mkdocstrings:
+ handlers:
+ python:
+ paths: [util/sim]
- macros:
on_error_fail: true
use_directory_urls: false
@@ -49,10 +53,15 @@ nav:
- Custom Instructions: rm/custom_instructions.md
# - Solder: rm/solder.md
- Software:
- - Pages: runtime/Pages/index.md
- - Files: runtime/Files/index.md
- - Classes: runtime/Classes/index.md
- - Examples: runtime/Examples/index.md
- - Modules: runtime/Modules/index.md
- - Namespaces: runtime/Namespaces/index.md
+ - Simulation Utilities:
+ - sim_utils: rm/sim/sim_utils.md
+ - rm/sim/Simulation.md
+ - rm/sim/Simulator.md
+ - Snitch Runtime:
+ - Pages: runtime/Pages/index.md
+ - Files: runtime/Files/index.md
+ - Classes: runtime/Classes/index.md
+ - Examples: runtime/Examples/index.md
+ - Modules: runtime/Modules/index.md
+ - Namespaces: runtime/Namespaces/index.md
- Publications: publications.md
diff --git a/python-requirements.txt b/python-requirements.txt
index d426cf140..6db0bf03f 100644
--- a/python-requirements.txt
+++ b/python-requirements.txt
@@ -19,6 +19,7 @@ pytablewriter
termcolor
pandas
pyelftools
+psutil
-r docs/requirements.txt
-r sw/dnn/requirements.txt
diff --git a/sw/blas/gemm/Makefile b/sw/blas/gemm/Makefile
index 604556ed1..9605f07d7 100644
--- a/sw/blas/gemm/Makefile
+++ b/sw/blas/gemm/Makefile
@@ -9,16 +9,18 @@ MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
DATA_DIR := $(realpath $(MK_DIR)/data)
SRC_DIR := $(realpath $(MK_DIR)/src)
+DATA_CFG ?= $(DATA_DIR)/params.hjson
+SECTION ?=
+
APP ?= gemm
SRCS ?= $(realpath $(SRC_DIR)/main.c)
INCDIRS ?= $(DATA_DIR) $(SRC_DIR)
-DATA_CFG ?= $(DATA_DIR)/params.hjson
DATAGEN_PY = $(DATA_DIR)/datagen.py
DATA_H = $(DATA_DIR)/data.h
$(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
- $< -c $(DATA_CFG) > $@
+ $< -c $(DATA_CFG) --section="$(SECTION)" > $@
.PHONY: clean-data clean
diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py
index c7c3fb9e0..25e2dca57 100755
--- a/sw/blas/gemm/data/datagen.py
+++ b/sw/blas/gemm/data/datagen.py
@@ -39,9 +39,13 @@
'fp8alt': {'exp': 4, 'mant': 3}
}
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
-def golden_model(a, b, alpha, c):
- return np.matmul(a, b) + alpha * c
+
+def golden_model(alpha, a, b, beta, c):
+ return alpha * np.matmul(a, b) + beta * c
def emit_header(**kwargs):
@@ -73,11 +77,14 @@ def emit_header(**kwargs):
* (1.0 + mantissa_b.astype(np.double) / (2**2))
_c = ((-1.0)**sign_c.astype(np.double))*(2.0**(exponent_c.astype(np.double)-15.0)) \
* (1.0 + mantissa_c.astype(np.double) / (2**2))
- result = np.matmul(_a, _b) + kwargs['alpha'] * _c
+ result = golden_model(1, _a, _b, kwargs['beta'], _c)
a = sign_a << 7 | exponent_a << FP8_FORMATS['fp8']['mant'] | mantissa_a
b = sign_b << 7 | exponent_b << FP8_FORMATS['fp8']['mant'] | mantissa_b
c = sign_c << 7 | exponent_c << FP8_FORMATS['fp8']['mant'] | mantissa_c
else:
+ a = np.random.rand(kwargs['M'], kwargs['K']).astype(dtype)
+ b = np.random.rand(kwargs['K'], kwargs['N']).astype(dtype)
+ c = np.random.rand(kwargs['M'], kwargs['N']).astype(dtype)
if kwargs['linspace']:
a = np.linspace(0.1, kwargs['M'] * kwargs['K'] + 0.1 -1, num=kwargs['M'] * kwargs['K']).reshape((kwargs['M'], kwargs['K'])).astype(dtype)
b = np.linspace(0.2, kwargs['K'] * kwargs['N'] + 0.2 -1, num=kwargs['K'] * kwargs['N']).reshape((kwargs['K'], kwargs['N'])).astype(dtype)
@@ -86,7 +93,7 @@ def emit_header(**kwargs):
a = np.random.rand(kwargs['M'], kwargs['K']).astype(dtype)
b = np.random.rand(kwargs['K'], kwargs['N']).astype(dtype)
c = np.random.rand(kwargs['M'], kwargs['N']).astype(dtype)
- result = golden_model(a, b, kwargs['alpha'], c)
+ result = golden_model(1, a, b, kwargs['beta'], c)
# Store matrices in transposed form if requested
a = a.T if kwargs['ta'] else a
@@ -98,12 +105,15 @@ def emit_header(**kwargs):
data_str += [format_scalar_definition('uint32_t', 'K', kwargs['K'])]
data_str += [format_scalar_definition('uint32_t', 'TA', int(kwargs['ta']))]
data_str += [format_scalar_definition('uint32_t', 'TB', int(kwargs['tb']))]
- data_str += [format_scalar_definition('uint32_t', 'ALPHA', kwargs['alpha'])]
+ data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])]
data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)]
data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])]
- data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten())]
- data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten())]
- data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten())]
+ data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
+ alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+ data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
+ alignment=BURST_ALIGNMENT, section=kwargs['section'])]
+ data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(),
+ alignment=BURST_ALIGNMENT, section=kwargs['section'])]
if kwargs['prec'] == 8:
result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten())
else:
@@ -125,11 +135,16 @@ def main():
required=True,
help='Select param config file kernel'
)
+ parser.add_argument(
+ '--section',
+ type=str,
+ help='Section to store matrices in')
args = parser.parse_args()
# Load param config file
with args.cfg.open() as f:
param = hjson.loads(f.read())
+ param['section'] = args.section
# Emit header file
print(emit_header(**param))
diff --git a/sw/blas/gemm/data/params.hjson b/sw/blas/gemm/data/params.hjson
index 63cdefd29..1428d1c99 100644
--- a/sw/blas/gemm/data/params.hjson
+++ b/sw/blas/gemm/data/params.hjson
@@ -8,7 +8,7 @@
M: 192,
N: 16,
K: 16,
- alpha: 0,
+ beta: 0,
ta: false,
tb: true, // must be true for SIMD
prec: 64,
diff --git a/sw/blas/gemm/verify.py b/sw/blas/gemm/verify.py
index 3bae7f801..b6f886b7b 100755
--- a/sw/blas/gemm/verify.py
+++ b/sw/blas/gemm/verify.py
@@ -37,22 +37,27 @@ def main():
a = np.array(bytes_to_doubles(elf.get_symbol_contents('a')))
b = np.array(bytes_to_doubles(elf.get_symbol_contents('b')))
c = np.array(bytes_to_doubles(elf.get_symbol_contents('c')))
- alpha = bytes_to_uint32s(elf.get_symbol_contents('ALPHA'))[0]
+ beta = bytes_to_uint32s(elf.get_symbol_contents('BETA'))[0]
m = bytes_to_uint32s(elf.get_symbol_contents('M'))[0]
n = bytes_to_uint32s(elf.get_symbol_contents('N'))[0]
k = bytes_to_uint32s(elf.get_symbol_contents('K'))[0]
tb = bytes_to_uint32s(elf.get_symbol_contents('TB'))[0]
a = np.reshape(a, (m, k))
- b = np.reshape(b, (k, n))
if tb:
+ b = np.reshape(b, (n, k))
b = b.transpose()
+ else:
+ b = np.reshape(b, (k, n))
c = np.reshape(c, (m, n))
# Verify results
- c_golden = golden_model(a, b, alpha, c).flatten()
+ c_golden = golden_model(1, a, b, beta, c).flatten()
absolute_err = np.absolute(c_golden - c_actual)
fail = np.any(absolute_err > ERR_THRESHOLD)
+ if (fail):
+ verification.dump_results_to_csv([c_golden, c_actual, absolute_err],
+ Path.cwd() / 'gemm_results.csv')
return int(fail)
diff --git a/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch b/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch
new file mode 100644
index 000000000..068851f3c
--- /dev/null
+++ b/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch
@@ -0,0 +1,125 @@
+From 91c1b48e44629a80bdc1832111707c051ab0b3b2 Mon Sep 17 00:00:00 2001
+From: Luca Colagrande
+Date: Mon, 23 Oct 2023 14:30:18 +0200
+Subject: [PATCH] sw/math: Refactor to proper library
+
+The previous header-only library style led to conflicts on certain
+defines (for instance `N`) defined in both math library sources and
+application sources.
+---
+ Makefile | 77 +++++++++++++++++++++++++++++++++++++++++++++++---
+ include/math.h | 3 --
+ 2 files changed, 73 insertions(+), 7 deletions(-)
+
+diff --git a/Makefile b/Makefile
+index 1327953..a6f7a1a 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,17 +1,86 @@
+-BITS_DIR = include/bits
++# Copyright 2023 ETH Zurich and University of Bologna.
++# Licensed under the Apache License, Version 2.0, see LICENSE for details.
++# SPDX-License-Identifier: Apache-2.0
++#
++# Luca Colagrande
++# Viviane Potocnik, ETH Zurich
++
++# Usage of absolute paths is required to externally include
++# this Makefile from multiple different locations
++MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
++
++###############
++# Directories #
++###############
++
++BUILDDIR ?= $(abspath build)
++SRC_DIR = $(MK_DIR)/src/math
++BITS_DIR = $(MK_DIR)/include/bits
++
++###################
++# Build variables #
++###################
++
++INCDIRS += $(MK_DIR)/arch/riscv64/
++INCDIRS += $(MK_DIR)/arch/generic
++INCDIRS += $(MK_DIR)/src/include
++INCDIRS += $(MK_DIR)/src/internal
++INCDIRS += $(MK_DIR)/include/bits
++INCDIRS += $(MK_DIR)/include
++
++SRCS = $(abspath $(wildcard $(SRC_DIR)/*.c))
++
++###########
++# Outputs #
++###########
++
+ ALLTYPES_H = $(BITS_DIR)/alltypes.h
+
++OBJS = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS)))))
++DEPS = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS)))))
++LIB = $(BUILDDIR)/libmath.a
++DUMP = $(BUILDDIR)/libmath.dump
++ALL_OUTPUTS = $(LIB) $(DUMP)
+
+-.PHONY: all clean
++#########
++# Rules #
++#########
+
+-all: $(ALLTYPES_H)
++.PHONY: all
++all: $(ALL_OUTPUTS)
+
++.PHONY: clean
+ clean:
+ rm -rf $(BITS_DIR)
+ rm -f $(ALLTYPES_H)
++ rm -rf $(BUILDDIR)
+
+ $(BITS_DIR):
+ mkdir -p $@
+
+ $(ALLTYPES_H): | $(BITS_DIR)
+- sed -f tools/mkalltypes.sed arch/riscv64/bits/alltypes.h.in include/alltypes.h.in > $@
++ sed -f $(MK_DIR)/tools/mkalltypes.sed $(MK_DIR)/arch/riscv64/bits/alltypes.h.in $(MK_DIR)/include/alltypes.h.in > $@
++
++$(DEPS): $(ALLTYPES_H)
++
++$(BUILDDIR):
++ mkdir -p $@
++
++$(BUILDDIR)/%.o: $(SRC_DIR)/%.S | $(BUILDDIR)
++ $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
++
++$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR)
++ $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
++
++$(BUILDDIR)/%.d: $(SRC_DIR)/%.c | $(BUILDDIR)
++ $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@
++
++$(LIB): $(OBJS) | $(BUILDDIR)
++ $(RISCV_AR) $(RISCV_ARFLAGS) $@ $^
++
++$(DUMP): $(LIB) | $(BUILDDIR)
++ $(RISCV_OBJDUMP) -D $< > $@
++
++ifneq ($(MAKECMDGOALS),clean)
++-include $(DEPS)
++endif
+diff --git a/include/math.h b/include/math.h
+index 6dad71c..14f28ec 100644
+--- a/include/math.h
++++ b/include/math.h
+@@ -435,9 +435,6 @@ float pow10f(float);
+ long double pow10l(long double);
+ #endif
+
+-#include "../src/math/expm1.c"
+-#include "../src/math/tanh.c"
+-
+ #ifdef __cplusplus
+ }
+ #endif
+--
+2.28.0
+
diff --git a/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch b/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch
new file mode 100644
index 000000000..050af9d33
--- /dev/null
+++ b/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch
@@ -0,0 +1,81 @@
+From eb96f4d7454a07498f571eb1ed18aa1db2413551 Mon Sep 17 00:00:00 2001
+From: Luca Colagrande
+Date: Mon, 23 Oct 2023 16:45:17 +0200
+Subject: [PATCH] `sw/math`: Add safe FP <--> INT conversions
+
+---
+ src/internal/libm.h | 51 +++++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 47 insertions(+), 4 deletions(-)
+
+diff --git a/src/internal/libm.h b/src/internal/libm.h
+index 72ad17d..60b9866 100644
+--- a/src/internal/libm.h
++++ b/src/internal/libm.h
+@@ -96,6 +96,47 @@ static int32_t converttoint(double_t);
+ #define predict_false(x) (x)
+ #endif
+
++/* FPU fence to synchronize the FPU and integer core in Snitch. */
++inline void snrt_fpu_fence() {
++ unsigned tmp;
++ __asm__ volatile(
++ "fmv.x.w %0, fa0\n"
++ "mv %0, %0\n"
++ : "+r"(tmp)::"memory");
++}
++
++/* Synch-secure double to uint64 conversion functions. */
++static inline uint64_t asuint64(double f) {
++ uint64_t result;
++ snrt_fpu_fence();
++ result = *(uint64_t *)&f;
++ return result;
++}
++
++/* Synch-secure float to uint32 conversion: returns the 32-bit pattern of f. */
++static inline uint32_t asuint(float f) {
++ uint32_t result; /* same width as the asuint() macro this function replaces */
++ snrt_fpu_fence(); /* synchronize FPU and integer core before reading f */
++ result = *(uint32_t *)&f; /* reinterpret the bits, no value conversion */
++ return result;
++}
++
++/* Synch-secure uint64 to double conversion functions. */
++static inline double asdouble(uint64_t i) {
++ double result;
++ snrt_fpu_fence();
++ result = *(double *)&i;
++ return result;
++}
++
++/* Synch-secure uint to float conversion functions. */
++static inline float asfloat(uint32_t i) {
++ float result;
++ snrt_fpu_fence();
++ result = *(float *)&i;
++ return result;
++}
++
+ /* Evaluate an expression as the specified type. With standard excess
+ precision handling a type cast or assignment is enough (with
+ -ffloat-store an assignment is required, in old compilers argument
+@@ -187,10 +228,12 @@ static inline void fp_force_evall(long double x)
+ } \
+ } while(0)
+
+-#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i
+-#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
+-#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i
+-#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
++// Unsafe in Snitch due to the decoupled FPU and integer
++// arithmetic units. Use at your own risk.
++#define asuint_unsafe(f) ((union{float _f; uint32_t _i;}){f})._i
++#define asfloat_unsafe(i) ((union{uint32_t _i; float _f;}){i})._f
++#define asuint64_unsafe(f) ((union{double _f; uint64_t _i;}){f})._i
++#define asdouble_unsafe(i) ((union{uint64_t _i; double _f;}){i})._f
+
+ #define EXTRACT_WORDS(hi,lo,d) \
+ do { \
+--
+2.28.0
+
diff --git a/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch b/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch
new file mode 100644
index 000000000..cffc3c407
--- /dev/null
+++ b/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch
@@ -0,0 +1,149 @@
+From b419b07facc9591ba0d8683f53c9adefb8a9b0c6 Mon Sep 17 00:00:00 2001
+From: Luca Colagrande
+Date: Wed, 8 Nov 2023 09:35:17 +0100
+Subject: [PATCH] `sw/math`: Implement safe `tanh` function
+
+---
+ src/internal/libm.h | 31 +++++++++++++++++++++++++++++++
+ src/math/expm1.c | 34 ++++++++++++++++++++++++++--------
+ src/math/tanh.c | 17 ++++++++++++-----
+ 3 files changed, 69 insertions(+), 13 deletions(-)
+
+diff --git a/src/internal/libm.h b/src/internal/libm.h
+index 60b9866..c96c0ec 100644
+--- a/src/internal/libm.h
++++ b/src/internal/libm.h
+@@ -96,6 +96,37 @@ static int32_t converttoint(double_t);
+ #define predict_false(x) (x)
+ #endif
+
++/* Memory-consistent functions to manipulate the upper word of a
++ double-precision floating-point number in the integer core.
++ Since there is no dedicated instruction to move the upper 32-bits
++ of a double-precision floating point register to an integer register
++ the compiler resorts to moving the value through the memory. However in
++ Snitch neither the program ordering between floating-point and integer
++ instructions is guaranteed, nor is memory consistency between the integer
++ and floating-point threads. */
++
++static inline uint32_t safe_extract_upper_32b_from_double(double x) {
++ double f;
++ uint32_t result;
++ asm volatile("fsd %[x], 0(%[ptr]) \n"
++ "fld ft3, 0(%[ptr]) \n"
++ "fmv.x.w t0, ft3 \n"
++ "mv t0, t0 \n"
++ "lw %[result], 4(%[ptr]) \n"
++ : [result]"=r"(result) : [x]"f"(x), [ptr]"r"(&f): "ft3", "t0", "memory");
++ return result;
++}
++
++static inline void safe_inject_into_upper_32b_double(uint32_t x, double *f) { /* overwrites the word at byte offset 4 of *f with x */
++ asm volatile("sw %[x], 4(%[ptr]) \n" /* integer store of x into the upper word */
++ "lw %[x], 4(%[ptr]) \n" /* read it back into the same register */
++ "fmv.w.x ft3, %[x] \n" /* integer -> FP move; presumably orders later FP accesses after the store -- TODO confirm */
++ : [x]"+r"(x) : [ptr]"r"(f): "ft3", "memory"); /* "+r": the lw above writes %[x], so it cannot be input-only */
++}
++
++/* TODO: the following functions are not really safe, compare previous two
++ functions */
++
+ /* FPU fence to synchronize the FPU and integer core in Snitch. */
+ inline void snrt_fpu_fence() {
+ unsigned tmp;
+diff --git a/src/math/expm1.c b/src/math/expm1.c
+index ac1e61e..d94f57f 100644
+--- a/src/math/expm1.c
++++ b/src/math/expm1.c
+@@ -121,9 +121,14 @@ Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
+ double expm1(double x)
+ {
+ double_t y,hi,lo,c,t,e,hxs,hfx,r1,twopk;
+- union {double f; uint64_t i;} u = {x};
+- uint32_t hx = u.i>>32 & 0x7fffffff;
+- int k, sign = u.i>>63;
++ /// Original implementation
++ // union {double f; uint64_t i;} u = {x};
++ // uint32_t hx = u.i>>32 & 0x7fffffff;
++ // int k, sign = u.i>>63;
++ /// Safe implementation in Snitch
++ uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
++ uint32_t hx = upper_32b_x & 0x7fffffff;
++ int k, sign = upper_32b_x>>31;
+
+ /* filter out huge and non-finite argument */
+ if (hx >= 0x4043687A) { /* if |x|>=56*ln2 */
+@@ -182,8 +187,12 @@ double expm1(double x)
+ return -2.0*(e-(x+0.5));
+ return 1.0+2.0*(x-e);
+ }
+- u.i = (uint64_t)(0x3ff + k)<<52; /* 2^k */
+- twopk = u.f;
++ /// Original implementation
++ // u.i = (uint64_t)(0x3ff + k)<<52; twopk = u.f; /* 2^k */
++ /// Safe implementation in Snitch: clear twopk first, the helper only writes its upper word
++ twopk = 0;
++ uint32_t u_i = (uint32_t)(0x3ff + k)<<20;
++ safe_inject_into_upper_32b_double(u_i, &twopk);
+ if (k < 0 || k > 56) { /* suffice to return exp(x)-1 */
+ y = x - e + 1.0;
+ if (k == 1024)
+@@ -192,10 +201,19 @@ double expm1(double x)
+ y = y*twopk;
+ return y - 1.0;
+ }
+- u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */
++ /// Original implementation
++ // u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */
++ // if (k < 20)
++ // y = (x-e+(1-u.f))*twopk;
++ // else
++ // y = (x-(e+u.f)+1)*twopk;
++ /// Safe implementation in Snitch
++ u_i = (uint32_t)(0x3ff - k)<<20;
++ double u_f = 0;
++ safe_inject_into_upper_32b_double(u_i, &u_f);
+ if (k < 20)
+- y = (x-e+(1-u.f))*twopk;
++ y = (x-e+(1-u_f))*twopk;
+ else
+- y = (x-(e+u.f)+1)*twopk;
++ y = (x-(e+u_f)+1)*twopk;
+ return y;
+ }
+diff --git a/src/math/tanh.c b/src/math/tanh.c
+index 20d6dbc..2481db1 100644
+--- a/src/math/tanh.c
++++ b/src/math/tanh.c
+@@ -6,16 +6,23 @@
+ */
+ double tanh(double x)
+ {
+- union {double f; uint64_t i;} u = {.f = x};
+ uint32_t w;
+ int sign;
+ double_t t;
+
+ /* x = |x| */
+- sign = u.i >> 63;
+- u.i &= (uint64_t)-1/2;
+- x = u.f;
+- w = u.i >> 32;
++ /// Original implementation
++ // union {double f; uint64_t i;} u = {.f = x};
++ // sign = u.i >> 63;
++ // u.i &= (uint64_t)-1/2;
++ // x = u.f;
++ // w = u.i >> 32;
++ /// Safe implementation in Snitch
++ uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
++ sign = upper_32b_x >> 31; /* top bit of the upper word */
++ uint32_t sign_mask = 0x7fffffff; /* all bits but the sign; avoids the signed overflow of (1 << 31) */
++ w = upper_32b_x & sign_mask;
++ safe_inject_into_upper_32b_double(w, &x); /* x = |x|: write the upper word back with the sign cleared */
+
+ if (w > 0x3fe193ea) {
+ /* |x| > log(3)/2 ~= 0.5493 or nan */
+--
+2.28.0
+
diff --git a/sw/math/Makefile b/sw/math/Makefile
index 132795388..afb3192d1 100644
--- a/sw/math/Makefile
+++ b/sw/math/Makefile
@@ -1,17 +1,86 @@
-BITS_DIR = include/bits
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+# Viviane Potocnik, ETH Zurich
+
+# Absolute paths are required so that this Makefile can be included from
+# other locations. Note: $(dir ...) leaves a trailing '/' on MK_DIR.
+MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+
+###############
+# Directories #
+###############
+
+BUILDDIR ?= $(abspath build)
+SRC_DIR = $(MK_DIR)/src/math
+BITS_DIR = $(MK_DIR)/include/bits
+
+###################
+# Build variables #
+###################
+
+INCDIRS += $(MK_DIR)/arch/riscv64/
+INCDIRS += $(MK_DIR)/arch/generic
+INCDIRS += $(MK_DIR)/src/include
+INCDIRS += $(MK_DIR)/src/internal
+INCDIRS += $(MK_DIR)/include/bits
+INCDIRS += $(MK_DIR)/include
+
+SRCS = $(abspath $(wildcard $(SRC_DIR)/*.c))
+
+###########
+# Outputs #
+###########
+
ALLTYPES_H = $(BITS_DIR)/alltypes.h
+OBJS = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS)))))
+DEPS = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS)))))
+LIB = $(BUILDDIR)/libmath.a
+DUMP = $(BUILDDIR)/libmath.dump
+ALL_OUTPUTS = $(LIB) $(DUMP)
-.PHONY: all clean
+#########
+# Rules #
+#########
-all: $(ALLTYPES_H)
+.PHONY: all
+all: $(ALL_OUTPUTS)
+.PHONY: clean
clean:
rm -rf $(BITS_DIR)
rm -f $(ALLTYPES_H)
+ rm -rf $(BUILDDIR)
$(BITS_DIR):
mkdir -p $@
$(ALLTYPES_H): | $(BITS_DIR)
- sed -f tools/mkalltypes.sed arch/riscv64/bits/alltypes.h.in include/alltypes.h.in > $@
+ sed -f $(MK_DIR)/tools/mkalltypes.sed $(MK_DIR)/arch/riscv64/bits/alltypes.h.in $(MK_DIR)/include/alltypes.h.in > $@
+
+$(DEPS): $(ALLTYPES_H)
+
+$(BUILDDIR):
+ mkdir -p $@
+
+$(BUILDDIR)/%.o: $(SRC_DIR)/%.S | $(BUILDDIR)
+ $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
+
+$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR)
+ $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
+
+$(BUILDDIR)/%.d: $(SRC_DIR)/%.c | $(BUILDDIR)
+ $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@
+
+$(LIB): $(OBJS) | $(BUILDDIR)
+ $(RISCV_AR) $(RISCV_ARFLAGS) $@ $^
+
+$(DUMP): $(LIB) | $(BUILDDIR)
+ $(RISCV_OBJDUMP) -D $< > $@
+
+ifneq ($(MAKECMDGOALS),clean)
+-include $(DEPS)
+endif
diff --git a/sw/math/include/float.h b/sw/math/include/float.h
new file mode 100644
index 000000000..713aadb90
--- /dev/null
+++ b/sw/math/include/float.h
@@ -0,0 +1,52 @@
+#ifndef _FLOAT_H
+#define _FLOAT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int __flt_rounds(void);
+#define FLT_ROUNDS (__flt_rounds())
+
+#define FLT_RADIX 2
+
+#define FLT_TRUE_MIN 1.40129846432481707092e-45F
+#define FLT_MIN 1.17549435082228750797e-38F
+#define FLT_MAX 3.40282346638528859812e+38F
+#define FLT_EPSILON 1.1920928955078125e-07F
+
+#define FLT_MANT_DIG 24
+#define FLT_MIN_EXP (-125)
+#define FLT_MAX_EXP 128
+#define FLT_HAS_SUBNORM 1
+
+#define FLT_DIG 6
+#define FLT_DECIMAL_DIG 9
+#define FLT_MIN_10_EXP (-37)
+#define FLT_MAX_10_EXP 38
+
+#define DBL_TRUE_MIN 4.94065645841246544177e-324
+#define DBL_MIN 2.22507385850720138309e-308
+#define DBL_MAX 1.79769313486231570815e+308
+#define DBL_EPSILON 2.22044604925031308085e-16
+
+#define DBL_MANT_DIG 53
+#define DBL_MIN_EXP (-1021)
+#define DBL_MAX_EXP 1024
+#define DBL_HAS_SUBNORM 1
+
+#define DBL_DIG 15
+#define DBL_DECIMAL_DIG 17
+#define DBL_MIN_10_EXP (-307)
+#define DBL_MAX_10_EXP 308
+
+#define LDBL_HAS_SUBNORM 1
+#define LDBL_DECIMAL_DIG DECIMAL_DIG
+
+#include <bits/float.h>
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sw/math/include/math.h b/sw/math/include/math.h
index 6dad71c1e..14f28ec8c 100644
--- a/sw/math/include/math.h
+++ b/sw/math/include/math.h
@@ -435,9 +435,6 @@ float pow10f(float);
long double pow10l(long double);
#endif
-#include "../src/math/expm1.c"
-#include "../src/math/tanh.c"
-
#ifdef __cplusplus
}
#endif
diff --git a/sw/math/src/internal/libm.h b/sw/math/src/internal/libm.h
index 72ad17d8e..c96c0eced 100644
--- a/sw/math/src/internal/libm.h
+++ b/sw/math/src/internal/libm.h
@@ -96,6 +96,78 @@ static int32_t converttoint(double_t);
#define predict_false(x) (x)
#endif
+/* Memory-consistent functions to manipulate the upper word of a
+ double-precision floating-point number in the integer core.
+ Since there is no dedicated instruction to move the upper 32-bits
+ of a double-precision floating point register to an integer register
+ the compiler resorts to moving the value through the memory. However in
+ Snitch neither the program ordering between floating-point and integer
+ instructions is guaranteed, nor is memory consistency between the integer
+ and floating-point threads. */
+
+static inline uint32_t safe_extract_upper_32b_from_double(double x) { /* returns the word stored at byte offset 4 of x */
+ double f; /* scratch slot that x is bounced through */
+ uint32_t result;
+ asm volatile("fsd %[x], 0(%[ptr]) \n" /* FP store of x into the scratch slot */
+ "fld ft3, 0(%[ptr]) \n" /* FP load of the slot back into ft3 */
+ "fmv.x.w t0, ft3 \n" /* FP -> integer move of ft3 into t0 */
+ "mv t0, t0 \n" /* integer instruction consuming t0; presumably stalls until the FP side is done -- TODO confirm */
+ "lw %[result], 4(%[ptr]) \n" /* integer load of the upper word (assumes little-endian layout) */
+ : [result]"=r"(result) : [x]"f"(x), [ptr]"r"(&f): "ft3", "t0", "memory");
+ return result;
+}
+
+static inline void safe_inject_into_upper_32b_double(uint32_t x, double *f) { /* overwrites the word at byte offset 4 of *f with x */
+ asm volatile("sw %[x], 4(%[ptr]) \n" /* integer store of x into the upper word */
+ "lw %[x], 4(%[ptr]) \n" /* read it back into the same register */
+ "fmv.w.x ft3, %[x] \n" /* integer -> FP move; presumably orders later FP accesses after the store -- TODO confirm */
+ : [x]"+r"(x) : [ptr]"r"(f): "ft3", "memory"); /* "+r": the lw above writes %[x], so it cannot be input-only */
+}
+
+/* TODO: the following functions are not really safe, compare previous two
+ functions */
+
+/* FPU fence to synchronize the FPU and integer core in Snitch. */
+static inline void snrt_fpu_fence(void) { /* static: defined in a header, so it needs internal linkage */
+ unsigned tmp; /* scratch register, write-only from the compiler's point of view */
+ __asm__ volatile(
+ "fmv.x.w %0, fa0\n" /* FP -> integer move of fa0 into tmp */
+ "mv %0, %0\n" /* integer instruction consuming tmp */
+ : "=r"(tmp)::"memory"); /* "=r": tmp is written before it is read, so no uninitialised input */
+}
+
+/* Synch-secure double to uint64 conversion functions. */
+static inline uint64_t asuint64(double f) {
+ uint64_t result;
+ snrt_fpu_fence(); /* FPU/integer-core fence (see snrt_fpu_fence) before reinterpreting */
+ result = *(uint64_t *)&f; /* bit pattern of f read back as an integer; NOTE(review): pointer-cast punning, unlike the union in asuint64_unsafe -- confirm the build's aliasing settings */
+ return result;
+}
+
+/* Synch-secure float to uint32 conversion: returns the 32-bit pattern of f. */
+static inline uint32_t asuint(float f) {
+ uint32_t result; /* same width as the asuint() macro this function replaces */
+ snrt_fpu_fence(); /* synchronize FPU and integer core before reading f */
+ result = *(uint32_t *)&f; /* reinterpret the bits, no value conversion */
+ return result;
+}
+
+/* Synch-secure uint64 to double conversion functions. */
+static inline double asdouble(uint64_t i) {
+ double result;
+ snrt_fpu_fence(); /* FPU/integer-core fence (see snrt_fpu_fence) before reinterpreting */
+ result = *(double *)&i; /* bit pattern of i read back as a double; NOTE(review): pointer-cast punning, unlike the union in asdouble_unsafe -- confirm the build's aliasing settings */
+ return result;
+}
+
+/* Synch-secure uint to float conversion functions. */
+static inline float asfloat(uint32_t i) {
+ float result;
+ snrt_fpu_fence(); /* FPU/integer-core fence (see snrt_fpu_fence) before reinterpreting */
+ result = *(float *)&i; /* bit pattern of i read back as a float; NOTE(review): pointer-cast punning, unlike the union in asfloat_unsafe -- confirm the build's aliasing settings */
+ return result;
+}
+
/* Evaluate an expression as the specified type. With standard excess
precision handling a type cast or assignment is enough (with
-ffloat-store an assignment is required, in old compilers argument
@@ -187,10 +259,12 @@ static inline void fp_force_evall(long double x)
} \
} while(0)
-#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i
-#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
-#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i
-#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
+// Unsafe in Snitch due to the decoupled FPU and integer
+// arithmetic units. Use at your own risk.
+#define asuint_unsafe(f) ((union{float _f; uint32_t _i;}){f})._i
+#define asfloat_unsafe(i) ((union{uint32_t _i; float _f;}){i})._f
+#define asuint64_unsafe(f) ((union{double _f; uint64_t _i;}){f})._i
+#define asdouble_unsafe(i) ((union{uint64_t _i; double _f;}){i})._f
#define EXTRACT_WORDS(hi,lo,d) \
do { \
diff --git a/sw/math/src/math/__math_divzero.c b/sw/math/src/math/__math_divzero.c
new file mode 100644
index 000000000..59d213500
--- /dev/null
+++ b/sw/math/src/math/__math_divzero.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+/* Returns (sign ? -1.0 : 1.0) / 0.0; the numerator is passed through fp_barrier() first. */
+double __math_divzero(uint32_t sign)
+{
+ return fp_barrier(sign ? -1.0 : 1.0) / 0.0;
+}
diff --git a/sw/math/src/math/__math_invalid.c b/sw/math/src/math/__math_invalid.c
new file mode 100644
index 000000000..177404900
--- /dev/null
+++ b/sw/math/src/math/__math_invalid.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+/* Returns (x - x) / (x - x), which evaluates to NaN under IEEE 754 arithmetic. */
+double __math_invalid(double x)
+{
+ return (x - x) / (x - x);
+}
diff --git a/sw/math/src/math/__math_invalidf.c b/sw/math/src/math/__math_invalidf.c
new file mode 100644
index 000000000..357d4b121
--- /dev/null
+++ b/sw/math/src/math/__math_invalidf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+/* Single-precision variant: returns (x - x) / (x - x), NaN under IEEE 754 arithmetic. */
+float __math_invalidf(float x)
+{
+ return (x - x) / (x - x);
+}
diff --git a/sw/math/src/math/__math_invalidl.c b/sw/math/src/math/__math_invalidl.c
new file mode 100644
index 000000000..1fca99de4
--- /dev/null
+++ b/sw/math/src/math/__math_invalidl.c
@@ -0,0 +1,9 @@
+#include <float.h>
+#include "libm.h"
+/* long double variant of (x - x) / (x - x); only compiled when long double differs from double. */
+#if LDBL_MANT_DIG != DBL_MANT_DIG
+long double __math_invalidl(long double x)
+{
+ return (x - x) / (x - x);
+}
+#endif
diff --git a/sw/math/src/math/__math_oflow.c b/sw/math/src/math/__math_oflow.c
new file mode 100644
index 000000000..c85dbf982
--- /dev/null
+++ b/sw/math/src/math/__math_oflow.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+/* Returns __math_xflow(sign, 0x1p769), i.e. a signed product of two 2^769 factors. */
+double __math_oflow(uint32_t sign)
+{
+ return __math_xflow(sign, 0x1p769);
+}
diff --git a/sw/math/src/math/__math_oflowf.c b/sw/math/src/math/__math_oflowf.c
new file mode 100644
index 000000000..fa7d06208
--- /dev/null
+++ b/sw/math/src/math/__math_oflowf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+/* Returns __math_xflowf(sign, 0x1p97f), i.e. a signed product of two 2^97 factors. */
+float __math_oflowf(uint32_t sign)
+{
+ return __math_xflowf(sign, 0x1p97f);
+}
diff --git a/sw/math/src/math/__math_uflow.c b/sw/math/src/math/__math_uflow.c
new file mode 100644
index 000000000..b90594aee
--- /dev/null
+++ b/sw/math/src/math/__math_uflow.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+/* Returns __math_xflow(sign, 0x1p-767), i.e. a signed product of two 2^-767 factors. */
+double __math_uflow(uint32_t sign)
+{
+ return __math_xflow(sign, 0x1p-767);
+}
diff --git a/sw/math/src/math/__math_uflowf.c b/sw/math/src/math/__math_uflowf.c
new file mode 100644
index 000000000..94d50f2bf
--- /dev/null
+++ b/sw/math/src/math/__math_uflowf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+/* Returns __math_xflowf(sign, 0x1p-95f), i.e. a signed product of two 2^-95 factors. */
+float __math_uflowf(uint32_t sign)
+{
+ return __math_xflowf(sign, 0x1p-95f);
+}
diff --git a/sw/math/src/math/__math_xflow.c b/sw/math/src/math/__math_xflow.c
new file mode 100644
index 000000000..744203c4c
--- /dev/null
+++ b/sw/math/src/math/__math_xflow.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+/* Returns (sign ? -y : y) * y evaluated as a double; the first factor goes through fp_barrier(). */
+double __math_xflow(uint32_t sign, double y)
+{
+ return eval_as_double(fp_barrier(sign ? -y : y) * y);
+}
diff --git a/sw/math/src/math/__math_xflowf.c b/sw/math/src/math/__math_xflowf.c
new file mode 100644
index 000000000..f2c84784f
--- /dev/null
+++ b/sw/math/src/math/__math_xflowf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+/* Returns (sign ? -y : y) * y evaluated as a float; the first factor goes through fp_barrierf(). */
+float __math_xflowf(uint32_t sign, float y)
+{
+ return eval_as_float(fp_barrierf(sign ? -y : y) * y);
+}
diff --git a/sw/math/src/math/ceil.c b/sw/math/src/math/ceil.c
new file mode 100644
index 000000000..b13e6f2d6
--- /dev/null
+++ b/sw/math/src/math/ceil.c
@@ -0,0 +1,31 @@
+#include "libm.h"
+
+#if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1
+#define EPS DBL_EPSILON
+#elif FLT_EVAL_METHOD==2
+#define EPS LDBL_EPSILON
+#endif
+static const double_t toint = 1/EPS;
+/* ceil(x) for double, computed by adding and subtracting toint (1/EPS); see the inline comments. */
+double ceil(double x)
+{
+ union {double f; uint64_t i;} u = {x}; /* NOTE(review): raw union read without the FPU fence that libm.h adds to asuint64() -- confirm this is safe on Snitch */
+ int e = u.i >> 52 & 0x7ff; /* 11-bit exponent field of x */
+ double_t y;
+
+ if (e >= 0x3ff+52 || x == 0)
+ return x;
+ /* y = int(x) - x, where int(x) is an integer neighbor of x */
+ if (u.i >> 63)
+ y = x - toint + toint - x;
+ else
+ y = x + toint - toint - x;
+ /* special case because of non-nearest rounding modes */
+ if (e <= 0x3ff-1) {
+ FORCE_EVAL(y);
+ return u.i >> 63 ? -0.0 : 1;
+ }
+ if (y < 0)
+ return x + y + 1;
+ return x + y;
+}
diff --git a/sw/math/src/math/ceilf.c b/sw/math/src/math/ceilf.c
new file mode 100644
index 000000000..869835f39
--- /dev/null
+++ b/sw/math/src/math/ceilf.c
@@ -0,0 +1,27 @@
+#include "libm.h"
+/* ceilf(x) for float, computed by masking fraction bits of the raw 32-bit pattern. */
+float ceilf(float x)
+{
+ union {float f; uint32_t i;} u = {x}; /* NOTE(review): raw union access without the FPU fence that libm.h adds to asuint() -- confirm this is safe on Snitch */
+ int e = (int)(u.i >> 23 & 0xff) - 0x7f; /* exponent field minus the 0x7f bias */
+ uint32_t m;
+
+ if (e >= 23)
+ return x;
+ if (e >= 0) {
+ m = 0x007fffff >> e; /* mask of the bits below the integer part */
+ if ((u.i & m) == 0)
+ return x;
+ FORCE_EVAL(x + 0x1p120f);
+ if (u.i >> 31 == 0)
+ u.i += m;
+ u.i &= ~m;
+ } else {
+ FORCE_EVAL(x + 0x1p120f);
+ if (u.i >> 31)
+ u.f = -0.0;
+ else if (u.i << 1)
+ u.f = 1.0;
+ }
+ return u.f;
+}
diff --git a/sw/math/src/math/ceill.c b/sw/math/src/math/ceill.c
new file mode 100644
index 000000000..60a83020d
--- /dev/null
+++ b/sw/math/src/math/ceill.c
@@ -0,0 +1,34 @@
+#include "libm.h"
+/* ceill(x): forwards to ceil() when long double has the double format, else uses the toint trick. */
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
+long double ceill(long double x)
+{
+ return ceil(x);
+}
+#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384
+
+static const long double toint = 1/LDBL_EPSILON;
+
+long double ceill(long double x)
+{
+ union ldshape u = {x};
+ int e = u.i.se & 0x7fff; /* low 15 bits of the se field */
+ long double y;
+
+ if (e >= 0x3fff+LDBL_MANT_DIG-1 || x == 0)
+ return x;
+ /* y = int(x) - x, where int(x) is an integer neighbor of x */
+ if (u.i.se >> 15)
+ y = x - toint + toint - x;
+ else
+ y = x + toint - toint - x;
+ /* special case because of non-nearest rounding modes */
+ if (e <= 0x3fff-1) {
+ FORCE_EVAL(y);
+ return u.i.se >> 15 ? -0.0 : 1;
+ }
+ if (y < 0)
+ return x + y + 1;
+ return x + y;
+}
+#endif
diff --git a/sw/math/src/math/exp2f_data.c b/sw/math/src/math/exp2f_data.c
new file mode 100644
index 000000000..be324727f
--- /dev/null
+++ b/sw/math/src/math/exp2f_data.c
@@ -0,0 +1,35 @@
+/*
+ * Shared data between expf, exp2f and powf.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "exp2f_data.h"
+
+#define N (1 << EXP2F_TABLE_BITS)
+
+const struct exp2f_data __exp2f_data = {
+ /* tab[i] = uint(2^(i/N)) - (i << 52-BITS)
+ used for computing 2^(k/N) for an int |k| < 150 N as
+ double(tab[k%N] + (k << 52-BITS)) */
+ .tab = {
+0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
+0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
+0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
+0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
+0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
+0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
+ },
+ .shift_scaled = 0x1.8p+52 / N,
+ .poly = {
+ 0x1.c6af84b912394p-5, 0x1.ebfce50fac4f3p-3, 0x1.62e42ff0c52d6p-1,
+ },
+ .shift = 0x1.8p+52,
+ .invln2_scaled = 0x1.71547652b82fep+0 * N,
+ .poly_scaled = {
+ 0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N,
+ },
+};
diff --git a/sw/math/src/math/exp2f_data.h b/sw/math/src/math/exp2f_data.h
new file mode 100644
index 000000000..fe744f15b
--- /dev/null
+++ b/sw/math/src/math/exp2f_data.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef _EXP2F_DATA_H
+#define _EXP2F_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* Shared between expf, exp2f and powf. */
+#define EXP2F_TABLE_BITS 5
+#define EXP2F_POLY_ORDER 3
+extern hidden const struct exp2f_data {
+ uint64_t tab[1 << EXP2F_TABLE_BITS];
+ double shift_scaled;
+ double poly[EXP2F_POLY_ORDER];
+ double shift;
+ double invln2_scaled;
+ double poly_scaled[EXP2F_POLY_ORDER];
+} __exp2f_data;
+
+#endif
diff --git a/sw/math/src/math/expf.c b/sw/math/src/math/expf.c
new file mode 100644
index 000000000..f9fbf8e72
--- /dev/null
+++ b/sw/math/src/math/expf.c
@@ -0,0 +1,80 @@
+/*
+ * Single-precision e^x function.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "libm.h"
+#include "exp2f_data.h"
+
+/*
+EXP2F_TABLE_BITS = 5
+EXP2F_POLY_ORDER = 3
+
+ULP error: 0.502 (nearest rounding.)
+Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.)
+Wrong count: 170635 (all nearest rounding wrong results with fma.)
+Non-nearest ULP error: 1 (rounded ULP error)
+*/
+
+#define N (1 << EXP2F_TABLE_BITS)
+#define InvLn2N __exp2f_data.invln2_scaled
+#define T __exp2f_data.tab
+#define C __exp2f_data.poly_scaled
+/* Top 12 bits of a float: asuint(x) >> 20. */
+static inline uint32_t top12(float x)
+{
+ return asuint(x) >> 20;
+}
+/* Single-precision e^x: special cases first, then table lookup (T) times a polynomial in r (C). */
+float expf(float x)
+{
+ uint32_t abstop;
+ uint64_t ki, t;
+ double_t kd, xd, z, r, r2, y, s;
+
+ xd = (double_t)x;
+ abstop = top12(x) & 0x7ff;
+ if (predict_false(abstop >= top12(88.0f))) {
+ /* |x| >= 88 or x is nan. */
+ if (asuint(x) == asuint(-INFINITY))
+ return 0.0f;
+ if (abstop >= top12(INFINITY))
+ return x + x;
+ if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */
+ return __math_oflowf(0);
+ if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */
+ return __math_uflowf(0);
+ }
+
+ /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. */
+ z = InvLn2N * xd;
+
+ /* Round and convert z to int, the result is in [-150*N, 128*N] and
+ ideally ties-to-even rule is used, otherwise the magnitude of r
+ can be bigger which gives larger approximation error. */
+#if TOINT_INTRINSICS
+ kd = roundtoint(z);
+ ki = converttoint(z);
+#else
+# define SHIFT __exp2f_data.shift
+ kd = eval_as_double(z + SHIFT);
+ ki = asuint64(kd);
+ kd -= SHIFT;
+#endif
+ r = z - kd;
+
+ /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
+ t = T[ki % N];
+ t += ki << (52 - EXP2F_TABLE_BITS);
+ s = asdouble(t);
+ z = C[0] * r + C[1];
+ r2 = r * r;
+ y = C[2] * r + 1;
+ y = z * r2 + y;
+ y = y * s;
+ return eval_as_float(y);
+}
diff --git a/sw/math/src/math/expm1.c b/sw/math/src/math/expm1.c
index ac1e61e4f..d94f57fe5 100644
--- a/sw/math/src/math/expm1.c
+++ b/sw/math/src/math/expm1.c
@@ -121,9 +121,14 @@ Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
double expm1(double x)
{
double_t y,hi,lo,c,t,e,hxs,hfx,r1,twopk;
- union {double f; uint64_t i;} u = {x};
- uint32_t hx = u.i>>32 & 0x7fffffff;
- int k, sign = u.i>>63;
+ /// Original implementation
+ // union {double f; uint64_t i;} u = {x};
+ // uint32_t hx = u.i>>32 & 0x7fffffff;
+ // int k, sign = u.i>>63;
+ /// Safe implementation in Snitch
+ uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
+ uint32_t hx = upper_32b_x & 0x7fffffff;
+ int k, sign = upper_32b_x>>31;
/* filter out huge and non-finite argument */
if (hx >= 0x4043687A) { /* if |x|>=56*ln2 */
@@ -182,8 +187,12 @@ double expm1(double x)
return -2.0*(e-(x+0.5));
return 1.0+2.0*(x-e);
}
- u.i = (uint64_t)(0x3ff + k)<<52; /* 2^k */
- twopk = u.f;
+ /// Original implementation
+ // u.i = (uint64_t)(0x3ff + k)<<52; twopk = u.f; /* 2^k */
+ /// Safe implementation in Snitch: clear twopk first, the helper only writes its upper word
+ twopk = 0;
+ uint32_t u_i = (uint32_t)(0x3ff + k)<<20;
+ safe_inject_into_upper_32b_double(u_i, &twopk);
if (k < 0 || k > 56) { /* suffice to return exp(x)-1 */
y = x - e + 1.0;
if (k == 1024)
@@ -192,10 +201,19 @@ double expm1(double x)
y = y*twopk;
return y - 1.0;
}
- u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */
+ /// Original implementation
+ // u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */
+ // if (k < 20)
+ // y = (x-e+(1-u.f))*twopk;
+ // else
+ // y = (x-(e+u.f)+1)*twopk;
+ /// Safe implementation in Snitch
+ u_i = (uint32_t)(0x3ff - k)<<20;
+ double u_f = 0;
+ safe_inject_into_upper_32b_double(u_i, &u_f);
if (k < 20)
- y = (x-e+(1-u.f))*twopk;
+ y = (x-e+(1-u_f))*twopk;
else
- y = (x-(e+u.f)+1)*twopk;
+ y = (x-(e+u_f)+1)*twopk;
return y;
}
diff --git a/sw/math/src/math/log2.c b/sw/math/src/math/log2.c
new file mode 100644
index 000000000..1276ed4e3
--- /dev/null
+++ b/sw/math/src/math/log2.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision log2(x) function.
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "libm.h"
+#include "log2_data.h"
+
+#define T __log2_data.tab
+#define T2 __log2_data.tab2
+#define B __log2_data.poly1
+#define A __log2_data.poly
+#define InvLn2hi __log2_data.invln2hi
+#define InvLn2lo __log2_data.invln2lo
+#define N (1 << LOG2_TABLE_BITS)
+#define OFF 0x3fe6000000000000
+
+/* Top 16 bits of a double: asuint64(x) >> 48. */
+static inline uint32_t top16(double x)
+{
+ return asuint64(x) >> 48;
+}
+/* Double-precision log2(x): near-1.0 and special inputs first, then table lookup (T, T2) plus polynomial (A). */
+double log2(double x)
+{
+ double_t z, r, r2, r4, y, invc, logc, kd, hi, lo, t1, t2, t3, p;
+ uint64_t ix, iz, tmp;
+ uint32_t top;
+ int k, i;
+
+ ix = asuint64(x);
+ top = top16(x);
+#define LO asuint64(1.0 - 0x1.5b51p-5)
+#define HI asuint64(1.0 + 0x1.6ab2p-5)
+ if (predict_false(ix - LO < HI - LO)) {
+ /* Handle close to 1.0 inputs separately. */
+ /* Fix sign of zero with downward rounding when x==1. */
+ if (WANT_ROUNDING && predict_false(ix == asuint64(1.0)))
+ return 0;
+ r = x - 1.0;
+#if __FP_FAST_FMA
+ hi = r * InvLn2hi;
+ lo = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -hi);
+#else
+ double_t rhi, rlo;
+ rhi = asdouble(asuint64(r) & -1ULL << 32);
+ rlo = r - rhi;
+ hi = rhi * InvLn2hi;
+ lo = rlo * InvLn2hi + r * InvLn2lo;
+#endif
+ r2 = r * r; /* rounding error: 0x1p-62. */
+ r4 = r2 * r2;
+ /* Worst-case error is less than 0.54 ULP (0.55 ULP without fma). */
+ p = r2 * (B[0] + r * B[1]);
+ y = hi + p;
+ lo += hi - y + p;
+ lo += r4 * (B[2] + r * B[3] + r2 * (B[4] + r * B[5]) +
+ r4 * (B[6] + r * B[7] + r2 * (B[8] + r * B[9])));
+ y += lo;
+ return eval_as_double(y);
+ }
+ if (predict_false(top - 0x0010 >= 0x7ff0 - 0x0010)) {
+ /* x < 0x1p-1022 or inf or nan. */
+ if (ix * 2 == 0)
+ return __math_divzero(1);
+ if (ix == asuint64(INFINITY)) /* log(inf) == inf. */
+ return x;
+ if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0)
+ return __math_invalid(x);
+ /* x is subnormal, normalize it. */
+ ix = asuint64(x * 0x1p52);
+ ix -= 52ULL << 52;
+ }
+
+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ tmp = ix - OFF;
+ i = (tmp >> (52 - LOG2_TABLE_BITS)) % N;
+ k = (int64_t)tmp >> 52; /* arithmetic shift */
+ iz = ix - (tmp & 0xfffULL << 52);
+ invc = T[i].invc;
+ logc = T[i].logc;
+ z = asdouble(iz);
+ kd = (double_t)k;
+
+ /* log2(x) = log2(z/c) + log2(c) + k. */
+ /* r ~= z/c - 1, |r| < 1/(2*N). */
+#if __FP_FAST_FMA
+ /* rounding error: 0x1p-55/N. */
+ r = __builtin_fma(z, invc, -1.0);
+ t1 = r * InvLn2hi;
+ t2 = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -t1);
+#else
+ double_t rhi, rlo;
+ /* rounding error: 0x1p-55/N + 0x1p-65. */
+ r = (z - T2[i].chi - T2[i].clo) * invc;
+ rhi = asdouble(asuint64(r) & -1ULL << 32);
+ rlo = r - rhi;
+ t1 = rhi * InvLn2hi;
+ t2 = rlo * InvLn2hi + r * InvLn2lo;
+#endif
+
+ /* hi + lo = r/ln2 + log2(c) + k. */
+ t3 = kd + logc;
+ hi = t3 + t1;
+ lo = t3 - hi + t1 + t2;
+
+ /* log2(r+1) = r/ln2 + r^2*poly(r). */
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ r2 = r * r; /* rounding error: 0x1p-54/N^2. */
+ r4 = r2 * r2;
+ /* Worst-case error if |y| > 0x1p-4: 0.547 ULP (0.550 ULP without fma).
+ ~ 0.5 + 2/N/ln2 + abs-poly-error*0x1p56 ULP (+ 0.003 ULP without fma). */
+ p = A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + r4 * (A[4] + r * A[5]);
+ y = lo + r2 * p + hi;
+ return eval_as_double(y);
+}
diff --git a/sw/math/src/math/log2_data.c b/sw/math/src/math/log2_data.c
new file mode 100644
index 000000000..3dd1ca514
--- /dev/null
+++ b/sw/math/src/math/log2_data.c
@@ -0,0 +1,201 @@
+/*
+ * Data for log2.
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "log2_data.h"
+
+#define N (1 << LOG2_TABLE_BITS)
+
+const struct log2_data __log2_data = {
+// First coefficient: 0x1.71547652b82fe1777d0ffda0d24p0
+.invln2hi = 0x1.7154765200000p+0,
+.invln2lo = 0x1.705fc2eefa200p-33,
+.poly1 = {
+// relative error: 0x1.2fad8188p-63
+// in -0x1.5b51p-5 0x1.6ab2p-5
+-0x1.71547652b82fep-1,
+0x1.ec709dc3a03f7p-2,
+-0x1.71547652b7c3fp-2,
+0x1.2776c50f05be4p-2,
+-0x1.ec709dd768fe5p-3,
+0x1.a61761ec4e736p-3,
+-0x1.7153fbc64a79bp-3,
+0x1.484d154f01b4ap-3,
+-0x1.289e4a72c383cp-3,
+0x1.0b32f285aee66p-3,
+},
+.poly = {
+// relative error: 0x1.a72c2bf8p-58
+// abs error: 0x1.67a552c8p-66
+// in -0x1.f45p-8 0x1.f45p-8
+-0x1.71547652b8339p-1,
+0x1.ec709dc3a04bep-2,
+-0x1.7154764702ffbp-2,
+0x1.2776c50034c48p-2,
+-0x1.ec7b328ea92bcp-3,
+0x1.a6225e117f92ep-3,
+},
+/* Algorithm:
+
+ x = 2^k z
+ log2(x) = k + log2(c) + log2(z/c)
+ log2(z/c) = poly(z/c - 1)
+
+where z is in [0x1.6p-1; 0x1.6p0) which is split into N subintervals and z
+falls into the ith one, then table entries are computed as
+
+ tab[i].invc = 1/c
+ tab[i].logc = (double)log2(c)
+ tab2[i].chi = (double)c
+ tab2[i].clo = (double)(c - (double)c)
+
+where c is near the center of the subinterval and is chosen by trying +-2^29
+floating point invc candidates around 1/center and selecting one for which
+
+ 1) the rounding error in 0x1.8p10 + logc is 0,
+ 2) the rounding error in z - chi - clo is < 0x1p-64 and
+ 3) the rounding error in (double)log2(c) is minimized (< 0x1p-68).
+
+Note: 1) ensures that k + logc can be computed without rounding error, 2)
+ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a
+single rounding error when there is no fast fma for z*invc - 1, 3) ensures
+that logc + poly(z/c - 1) has small error, however near x == 1 when
+|log2(x)| < 0x1p-4, this is not enough so that is special cased. */
+.tab = {
+{0x1.724286bb1acf8p+0, -0x1.1095feecdb000p-1},
+{0x1.6e1f766d2cca1p+0, -0x1.08494bd76d000p-1},
+{0x1.6a13d0e30d48ap+0, -0x1.00143aee8f800p-1},
+{0x1.661ec32d06c85p+0, -0x1.efec5360b4000p-2},
+{0x1.623fa951198f8p+0, -0x1.dfdd91ab7e000p-2},
+{0x1.5e75ba4cf026cp+0, -0x1.cffae0cc79000p-2},
+{0x1.5ac055a214fb8p+0, -0x1.c043811fda000p-2},
+{0x1.571ed0f166e1ep+0, -0x1.b0b67323ae000p-2},
+{0x1.53909590bf835p+0, -0x1.a152f5a2db000p-2},
+{0x1.5014fed61adddp+0, -0x1.9217f5af86000p-2},
+{0x1.4cab88e487bd0p+0, -0x1.8304db0719000p-2},
+{0x1.49539b4334feep+0, -0x1.74189f9a9e000p-2},
+{0x1.460cbdfafd569p+0, -0x1.6552bb5199000p-2},
+{0x1.42d664ee4b953p+0, -0x1.56b23a29b1000p-2},
+{0x1.3fb01111dd8a6p+0, -0x1.483650f5fa000p-2},
+{0x1.3c995b70c5836p+0, -0x1.39de937f6a000p-2},
+{0x1.3991c4ab6fd4ap+0, -0x1.2baa1538d6000p-2},
+{0x1.3698e0ce099b5p+0, -0x1.1d98340ca4000p-2},
+{0x1.33ae48213e7b2p+0, -0x1.0fa853a40e000p-2},
+{0x1.30d191985bdb1p+0, -0x1.01d9c32e73000p-2},
+{0x1.2e025cab271d7p+0, -0x1.e857da2fa6000p-3},
+{0x1.2b404cf13cd82p+0, -0x1.cd3c8633d8000p-3},
+{0x1.288b02c7ccb50p+0, -0x1.b26034c14a000p-3},
+{0x1.25e2263944de5p+0, -0x1.97c1c2f4fe000p-3},
+{0x1.234563d8615b1p+0, -0x1.7d6023f800000p-3},
+{0x1.20b46e33eaf38p+0, -0x1.633a71a05e000p-3},
+{0x1.1e2eefdcda3ddp+0, -0x1.494f5e9570000p-3},
+{0x1.1bb4a580b3930p+0, -0x1.2f9e424e0a000p-3},
+{0x1.19453847f2200p+0, -0x1.162595afdc000p-3},
+{0x1.16e06c0d5d73cp+0, -0x1.f9c9a75bd8000p-4},
+{0x1.1485f47b7e4c2p+0, -0x1.c7b575bf9c000p-4},
+{0x1.12358ad0085d1p+0, -0x1.960c60ff48000p-4},
+{0x1.0fef00f532227p+0, -0x1.64ce247b60000p-4},
+{0x1.0db2077d03a8fp+0, -0x1.33f78b2014000p-4},
+{0x1.0b7e6d65980d9p+0, -0x1.0387d1a42c000p-4},
+{0x1.0953efe7b408dp+0, -0x1.a6f9208b50000p-5},
+{0x1.07325cac53b83p+0, -0x1.47a954f770000p-5},
+{0x1.05197e40d1b5cp+0, -0x1.d23a8c50c0000p-6},
+{0x1.03091c1208ea2p+0, -0x1.16a2629780000p-6},
+{0x1.0101025b37e21p+0, -0x1.720f8d8e80000p-8},
+{0x1.fc07ef9caa76bp-1, 0x1.6fe53b1500000p-7},
+{0x1.f4465d3f6f184p-1, 0x1.11ccce10f8000p-5},
+{0x1.ecc079f84107fp-1, 0x1.c4dfc8c8b8000p-5},
+{0x1.e573a99975ae8p-1, 0x1.3aa321e574000p-4},
+{0x1.de5d6f0bd3de6p-1, 0x1.918a0d08b8000p-4},
+{0x1.d77b681ff38b3p-1, 0x1.e72e9da044000p-4},
+{0x1.d0cb5724de943p-1, 0x1.1dcd2507f6000p-3},
+{0x1.ca4b2dc0e7563p-1, 0x1.476ab03dea000p-3},
+{0x1.c3f8ee8d6cb51p-1, 0x1.7074377e22000p-3},
+{0x1.bdd2b4f020c4cp-1, 0x1.98ede8ba94000p-3},
+{0x1.b7d6c006015cap-1, 0x1.c0db86ad2e000p-3},
+{0x1.b20366e2e338fp-1, 0x1.e840aafcee000p-3},
+{0x1.ac57026295039p-1, 0x1.0790ab4678000p-2},
+{0x1.a6d01bc2731ddp-1, 0x1.1ac056801c000p-2},
+{0x1.a16d3bc3ff18bp-1, 0x1.2db11d4fee000p-2},
+{0x1.9c2d14967feadp-1, 0x1.406464ec58000p-2},
+{0x1.970e4f47c9902p-1, 0x1.52dbe093af000p-2},
+{0x1.920fb3982bcf2p-1, 0x1.651902050d000p-2},
+{0x1.8d30187f759f1p-1, 0x1.771d2cdeaf000p-2},
+{0x1.886e5ebb9f66dp-1, 0x1.88e9c857d9000p-2},
+{0x1.83c97b658b994p-1, 0x1.9a80155e16000p-2},
+{0x1.7f405ffc61022p-1, 0x1.abe186ed3d000p-2},
+{0x1.7ad22181415cap-1, 0x1.bd0f2aea0e000p-2},
+{0x1.767dcf99eff8cp-1, 0x1.ce0a43dbf4000p-2},
+},
+#if !__FP_FAST_FMA
+.tab2 = {
+{0x1.6200012b90a8ep-1, 0x1.904ab0644b605p-55},
+{0x1.66000045734a6p-1, 0x1.1ff9bea62f7a9p-57},
+{0x1.69fffc325f2c5p-1, 0x1.27ecfcb3c90bap-55},
+{0x1.6e00038b95a04p-1, 0x1.8ff8856739326p-55},
+{0x1.71fffe09994e3p-1, 0x1.afd40275f82b1p-55},
+{0x1.7600015590e1p-1, -0x1.2fd75b4238341p-56},
+{0x1.7a00012655bd5p-1, 0x1.808e67c242b76p-56},
+{0x1.7e0003259e9a6p-1, -0x1.208e426f622b7p-57},
+{0x1.81fffedb4b2d2p-1, -0x1.402461ea5c92fp-55},
+{0x1.860002dfafcc3p-1, 0x1.df7f4a2f29a1fp-57},
+{0x1.89ffff78c6b5p-1, -0x1.e0453094995fdp-55},
+{0x1.8e00039671566p-1, -0x1.a04f3bec77b45p-55},
+{0x1.91fffe2bf1745p-1, -0x1.7fa34400e203cp-56},
+{0x1.95fffcc5c9fd1p-1, -0x1.6ff8005a0695dp-56},
+{0x1.9a0003bba4767p-1, 0x1.0f8c4c4ec7e03p-56},
+{0x1.9dfffe7b92da5p-1, 0x1.e7fd9478c4602p-55},
+{0x1.a1fffd72efdafp-1, -0x1.a0c554dcdae7ep-57},
+{0x1.a5fffde04ff95p-1, 0x1.67da98ce9b26bp-55},
+{0x1.a9fffca5e8d2bp-1, -0x1.284c9b54c13dep-55},
+{0x1.adfffddad03eap-1, 0x1.812c8ea602e3cp-58},
+{0x1.b1ffff10d3d4dp-1, -0x1.efaddad27789cp-55},
+{0x1.b5fffce21165ap-1, 0x1.3cb1719c61237p-58},
+{0x1.b9fffd950e674p-1, 0x1.3f7d94194cep-56},
+{0x1.be000139ca8afp-1, 0x1.50ac4215d9bcp-56},
+{0x1.c20005b46df99p-1, 0x1.beea653e9c1c9p-57},
+{0x1.c600040b9f7aep-1, -0x1.c079f274a70d6p-56},
+{0x1.ca0006255fd8ap-1, -0x1.a0b4076e84c1fp-56},
+{0x1.cdfffd94c095dp-1, 0x1.8f933f99ab5d7p-55},
+{0x1.d1ffff975d6cfp-1, -0x1.82c08665fe1bep-58},
+{0x1.d5fffa2561c93p-1, -0x1.b04289bd295f3p-56},
+{0x1.d9fff9d228b0cp-1, 0x1.70251340fa236p-55},
+{0x1.de00065bc7e16p-1, -0x1.5011e16a4d80cp-56},
+{0x1.e200002f64791p-1, 0x1.9802f09ef62ep-55},
+{0x1.e600057d7a6d8p-1, -0x1.e0b75580cf7fap-56},
+{0x1.ea00027edc00cp-1, -0x1.c848309459811p-55},
+{0x1.ee0006cf5cb7cp-1, -0x1.f8027951576f4p-55},
+{0x1.f2000782b7dccp-1, -0x1.f81d97274538fp-55},
+{0x1.f6000260c450ap-1, -0x1.071002727ffdcp-59},
+{0x1.f9fffe88cd533p-1, -0x1.81bdce1fda8bp-58},
+{0x1.fdfffd50f8689p-1, 0x1.7f91acb918e6ep-55},
+{0x1.0200004292367p+0, 0x1.b7ff365324681p-54},
+{0x1.05fffe3e3d668p+0, 0x1.6fa08ddae957bp-55},
+{0x1.0a0000a85a757p+0, -0x1.7e2de80d3fb91p-58},
+{0x1.0e0001a5f3fccp+0, -0x1.1823305c5f014p-54},
+{0x1.11ffff8afbaf5p+0, -0x1.bfabb6680bac2p-55},
+{0x1.15fffe54d91adp+0, -0x1.d7f121737e7efp-54},
+{0x1.1a00011ac36e1p+0, 0x1.c000a0516f5ffp-54},
+{0x1.1e00019c84248p+0, -0x1.082fbe4da5dap-54},
+{0x1.220000ffe5e6ep+0, -0x1.8fdd04c9cfb43p-55},
+{0x1.26000269fd891p+0, 0x1.cfe2a7994d182p-55},
+{0x1.2a00029a6e6dap+0, -0x1.00273715e8bc5p-56},
+{0x1.2dfffe0293e39p+0, 0x1.b7c39dab2a6f9p-54},
+{0x1.31ffff7dcf082p+0, 0x1.df1336edc5254p-56},
+{0x1.35ffff05a8b6p+0, -0x1.e03564ccd31ebp-54},
+{0x1.3a0002e0eaeccp+0, 0x1.5f0e74bd3a477p-56},
+{0x1.3e000043bb236p+0, 0x1.c7dcb149d8833p-54},
+{0x1.4200002d187ffp+0, 0x1.e08afcf2d3d28p-56},
+{0x1.460000d387cb1p+0, 0x1.20837856599a6p-55},
+{0x1.4a00004569f89p+0, -0x1.9fa5c904fbcd2p-55},
+{0x1.4e000043543f3p+0, -0x1.81125ed175329p-56},
+{0x1.51fffcc027f0fp+0, 0x1.883d8847754dcp-54},
+{0x1.55ffffd87b36fp+0, -0x1.709e731d02807p-55},
+{0x1.59ffff21df7bap+0, 0x1.7f79f68727b02p-55},
+{0x1.5dfffebfc3481p+0, -0x1.180902e30e93ep-54},
+},
+#endif
+};
diff --git a/sw/math/src/math/log2_data.h b/sw/math/src/math/log2_data.h
new file mode 100644
index 000000000..276a786d1
--- /dev/null
+++ b/sw/math/src/math/log2_data.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef _LOG2_DATA_H
+#define _LOG2_DATA_H
+
+#include <features.h>
+
+#define LOG2_TABLE_BITS 6
+#define LOG2_POLY_ORDER 7
+#define LOG2_POLY1_ORDER 11
+extern hidden const struct log2_data {
+ double invln2hi;
+ double invln2lo;
+ double poly[LOG2_POLY_ORDER - 1];
+ double poly1[LOG2_POLY1_ORDER - 1];
+ struct {
+ double invc, logc;
+ } tab[1 << LOG2_TABLE_BITS];
+#if !__FP_FAST_FMA
+ struct {
+ double chi, clo;
+ } tab2[1 << LOG2_TABLE_BITS];
+#endif
+} __log2_data;
+
+#endif
diff --git a/sw/math/src/math/log2f.c b/sw/math/src/math/log2f.c
new file mode 100644
index 000000000..c368f88f3
--- /dev/null
+++ b/sw/math/src/math/log2f.c
@@ -0,0 +1,72 @@
+/*
+ * Single-precision log2 function.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "libm.h"
+#include "log2f_data.h"
+
+/*
+LOG2F_TABLE_BITS = 4
+LOG2F_POLY_ORDER = 4
+
+ULP error: 0.752 (nearest rounding.)
+Relative error: 1.9 * 2^-26 (before rounding.)
+*/
+
+#define N (1 << LOG2F_TABLE_BITS)
+#define T __log2f_data.tab
+#define A __log2f_data.poly
+#define OFF 0x3f330000
+
+float log2f(float x)
+{
+ double_t z, r, r2, p, y, y0, invc, logc;
+ uint32_t ix, iz, top, tmp;
+ int k, i;
+
+ ix = asuint(x);
+ /* Fix sign of zero with downward rounding when x==1. */
+ if (WANT_ROUNDING && predict_false(ix == 0x3f800000))
+ return 0;
+ if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+ /* x < 0x1p-126 or inf or nan. */
+ if (ix * 2 == 0)
+ return __math_divzerof(1);
+ if (ix == 0x7f800000) /* log2(inf) == inf. */
+ return x;
+ if ((ix & 0x80000000) || ix * 2 >= 0xff000000)
+ return __math_invalidf(x);
+ /* x is subnormal, normalize it. */
+ ix = asuint(x * 0x1p23f);
+ ix -= 23 << 23;
+ }
+
+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ tmp = ix - OFF;
+ i = (tmp >> (23 - LOG2F_TABLE_BITS)) % N;
+ top = tmp & 0xff800000;
+ iz = ix - top;
+ k = (int32_t)tmp >> 23; /* arithmetic shift */
+ invc = T[i].invc;
+ logc = T[i].logc;
+ z = (double_t)asfloat(iz);
+
+ /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
+ r = z * invc - 1;
+ y0 = logc + (double_t)k;
+
+ /* Pipelined polynomial evaluation to approximate log1p(r)/ln2. */
+ r2 = r * r;
+ y = A[1] * r + A[2];
+ y = A[0] * r2 + y;
+ p = A[3] * r + y0;
+ y = y * r2 + p;
+ return eval_as_float(y);
+}
diff --git a/sw/math/src/math/log2f_data.c b/sw/math/src/math/log2f_data.c
new file mode 100644
index 000000000..24e450f1e
--- /dev/null
+++ b/sw/math/src/math/log2f_data.c
@@ -0,0 +1,33 @@
+/*
+ * Data definition for log2f.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "log2f_data.h"
+
+const struct log2f_data __log2f_data = {
+ .tab = {
+ { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 },
+ { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 },
+ { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 },
+ { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 },
+ { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 },
+ { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 },
+ { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 },
+ { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 },
+ { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 },
+ { 0x1p+0, 0x0p+0 },
+ { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 },
+ { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 },
+ { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 },
+ { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 },
+ { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 },
+ { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 },
+ },
+ .poly = {
+ -0x1.712b6f70a7e4dp-2, 0x1.ecabf496832ep-2, -0x1.715479ffae3dep-1,
+ 0x1.715475f35c8b8p0,
+ }
+};
diff --git a/sw/math/src/math/log2f_data.h b/sw/math/src/math/log2f_data.h
new file mode 100644
index 000000000..4fa489560
--- /dev/null
+++ b/sw/math/src/math/log2f_data.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef _LOG2F_DATA_H
+#define _LOG2F_DATA_H
+
+#include <features.h>
+
+#define LOG2F_TABLE_BITS 4
+#define LOG2F_POLY_ORDER 4
+extern hidden const struct log2f_data {
+ struct {
+ double invc, logc;
+ } tab[1 << LOG2F_TABLE_BITS];
+ double poly[LOG2F_POLY_ORDER];
+} __log2f_data;
+
+#endif
diff --git a/sw/math/src/math/sqrt.c b/sw/math/src/math/sqrt.c
new file mode 100644
index 000000000..5ba265596
--- /dev/null
+++ b/sw/math/src/math/sqrt.c
@@ -0,0 +1,158 @@
+#include <stdint.h>
+#include <math.h>
+#include "libm.h"
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+/* returns a*b*2^-32 - e, with error 0 <= e < 1. */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+ return (uint64_t)a*b >> 32;
+}
+
+/* returns a*b*2^-64 - e, with error 0 <= e < 3. */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+ uint64_t ahi = a>>32;
+ uint64_t alo = a&0xffffffff;
+ uint64_t bhi = b>>32;
+ uint64_t blo = b&0xffffffff;
+ return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
+
+double sqrt(double x)
+{
+ uint64_t ix, top, m;
+
+ /* special case handling. */
+ ix = asuint64(x);
+ top = ix >> 52;
+ if (predict_false(top - 0x001 >= 0x7ff - 0x001)) {
+ /* x < 0x1p-1022 or inf or nan. */
+ if (ix * 2 == 0)
+ return x;
+ if (ix == 0x7ff0000000000000)
+ return x;
+ if (ix > 0x7ff0000000000000)
+ return __math_invalid(x);
+ /* x is subnormal, normalize it. */
+ ix = asuint64(x * 0x1p52);
+ top = ix >> 52;
+ top -= 52;
+ }
+
+ /* argument reduction:
+ x = 4^e m; with integer e, and m in [1, 4)
+ m: fixed point representation [2.62]
+ 2^e is the exponent part of the result. */
+ int even = top & 1;
+ m = (ix << 11) | 0x8000000000000000;
+ if (even) m >>= 1;
+ top = (top + 0x3ff) >> 1;
+
+ /* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4)
+
+ initial estimate:
+ 7bit table lookup (1bit exponent and 6bit significand).
+
+ iterative approximation:
+ using 2 goldschmidt iterations with 32bit int arithmetics
+ and a final iteration with 64bit int arithmetics.
+
+ details:
+
+ the relative error (e = r0 sqrt(m)-1) of a linear estimate
+ (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
+ a table lookup is faster and needs one less iteration
+ 6 bit lookup table (128b) gives |e| < 0x1.f9p-8
+ 7 bit lookup table (256b) gives |e| < 0x1.fdp-9
+ for single and double prec 6bit is enough but for quad
+ prec 7bit is needed (or modified iterations). to avoid
+ one more iteration >=13bit table would be needed (16k).
+
+ a newton-raphson iteration for r is
+ w = r*r
+ u = 3 - m*w
+ r = r*u/2
+ can use a goldschmidt iteration for s at the end or
+ s = m*r
+
+ first goldschmidt iteration is
+ s = m*r
+ u = 3 - s*r
+ r = r*u/2
+ s = s*u/2
+ next goldschmidt iteration is
+ u = 3 - s*r
+ r = r*u/2
+ s = s*u/2
+ and at the end r is not computed only s.
+
+ they use the same amount of operations and converge at the
+ same quadratic rate, i.e. if
+ r1 sqrt(m) - 1 = e, then
+ r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3
+ the advantage of goldschmidt is that the mul for s and r
+ are independent (computed in parallel), however it is not
+ "self synchronizing": it only uses the input m in the
+ first iteration so rounding errors accumulate. at the end
+ or when switching to larger precision arithmetics rounding
+ errors dominate so the first iteration should be used.
+
+ the fixed point representations are
+ m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30
+ and after switching to 64 bit
+ m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62 */
+
+ static const uint64_t three = 0xc0000000;
+ uint64_t r, s, d, u, i;
+
+ i = (ix >> 46) % 128;
+ r = (uint32_t)__rsqrt_tab[i] << 16;
+ /* |r sqrt(m) - 1| < 0x1.fdp-9 */
+ s = mul32(m>>32, r);
+ /* |s/sqrt(m) - 1| < 0x1.fdp-9 */
+ d = mul32(s, r);
+ u = three - d;
+ r = mul32(r, u) << 1;
+ /* |r sqrt(m) - 1| < 0x1.7bp-16 */
+ s = mul32(s, u) << 1;
+ /* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+ d = mul32(s, r);
+ u = three - d;
+ r = mul32(r, u) << 1;
+ /* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */
+ r = r << 32;
+ s = mul64(m, r);
+ d = mul64(s, r);
+ u = (three<<32) - d;
+ s = mul64(s, u); /* repr: 3.61 */
+ /* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */
+ s = (s - 2) >> 9; /* repr: 12.52 */
+ /* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */
+
+ /* s < sqrt(m) < s + 0x1.09p-52,
+ compute nearest rounded result:
+ the nearest result to 52 bits is either s or s+0x1p-52,
+ we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m. */
+ uint64_t d0, d1, d2;
+ double y, t;
+ d0 = (m << 42) - s*s;
+ d1 = s - d0;
+ d2 = d1 + s + 1;
+ s += d1 >> 63;
+ s &= 0x000fffffffffffff;
+ s |= top << 52;
+ y = asdouble(s);
+ if (FENV_SUPPORT) {
+ /* handle rounding modes and inexact exception:
+ only (s+1)^2 == 2^42 m case is exact otherwise
+ add a tiny value to cause the fenv effects. */
+ uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000;
+ tiny |= (d1^d2) & 0x8000000000000000;
+ t = asdouble(tiny);
+ y = eval_as_double(y + t);
+ }
+ return y;
+}
diff --git a/sw/math/src/math/sqrt_data.c b/sw/math/src/math/sqrt_data.c
new file mode 100644
index 000000000..61bc22f43
--- /dev/null
+++ b/sw/math/src/math/sqrt_data.c
@@ -0,0 +1,19 @@
+#include "sqrt_data.h"
+const uint16_t __rsqrt_tab[128] = {
+0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43,
+0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b,
+0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1,
+0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430,
+0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59,
+0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925,
+0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479,
+0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040,
+0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234,
+0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2,
+0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1,
+0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192,
+0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f,
+0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4,
+0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59,
+0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560,
+};
diff --git a/sw/math/src/math/sqrt_data.h b/sw/math/src/math/sqrt_data.h
new file mode 100644
index 000000000..260c7f9c2
--- /dev/null
+++ b/sw/math/src/math/sqrt_data.h
@@ -0,0 +1,13 @@
+#ifndef _SQRT_DATA_H
+#define _SQRT_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* if x in [1,2): i = (int)(64*x);
+ if x in [2,4): i = (int)(32*x-64);
+ __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error:
+ |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < 0x1.fdp-9 < 2^-8 */
+extern hidden const uint16_t __rsqrt_tab[128];
+
+#endif
diff --git a/sw/math/src/math/sqrtf.c b/sw/math/src/math/sqrtf.c
new file mode 100644
index 000000000..740d81cba
--- /dev/null
+++ b/sw/math/src/math/sqrtf.c
@@ -0,0 +1,83 @@
+#include <stdint.h>
+#include <math.h>
+#include "libm.h"
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+ return (uint64_t)a*b >> 32;
+}
+
+/* see sqrt.c for more detailed comments. */
+
+float sqrtf(float x)
+{
+ uint32_t ix, m, m1, m0, even, ey;
+
+ ix = asuint(x);
+ if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+ /* x < 0x1p-126 or inf or nan. */
+ if (ix * 2 == 0)
+ return x;
+ if (ix == 0x7f800000)
+ return x;
+ if (ix > 0x7f800000)
+ return __math_invalidf(x);
+ /* x is subnormal, normalize it. */
+ ix = asuint(x * 0x1p23f);
+ ix -= 23 << 23;
+ }
+
+ /* x = 4^e m; with int e and m in [1, 4). */
+ even = ix & 0x00800000;
+ m1 = (ix << 8) | 0x80000000;
+ m0 = (ix << 7) & 0x7fffffff;
+ m = even ? m0 : m1;
+
+ /* 2^e is the exponent part of the return value. */
+ ey = ix >> 1;
+ ey += 0x3f800000 >> 1;
+ ey &= 0x7f800000;
+
+ /* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations. */
+ static const uint32_t three = 0xc0000000;
+ uint32_t r, s, d, u, i;
+ i = (ix >> 17) % 128;
+ r = (uint32_t)__rsqrt_tab[i] << 16;
+ /* |r*sqrt(m) - 1| < 0x1p-8 */
+ s = mul32(m, r);
+ /* |s/sqrt(m) - 1| < 0x1p-8 */
+ d = mul32(s, r);
+ u = three - d;
+ r = mul32(r, u) << 1;
+ /* |r*sqrt(m) - 1| < 0x1.7bp-16 */
+ s = mul32(s, u) << 1;
+ /* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+ d = mul32(s, r);
+ u = three - d;
+ s = mul32(s, u);
+ /* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */
+ s = (s - 1)>>6;
+ /* s < sqrt(m) < s + 0x1.08p-23 */
+
+ /* compute nearest rounded result. */
+ uint32_t d0, d1, d2;
+ float y, t;
+ d0 = (m << 16) - s*s;
+ d1 = s - d0;
+ d2 = d1 + s + 1;
+ s += d1 >> 31;
+ s &= 0x007fffff;
+ s |= ey;
+ y = asfloat(s);
+ if (FENV_SUPPORT) {
+ /* handle rounding and inexact exception. */
+ uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000;
+ tiny |= (d1^d2) & 0x80000000;
+ t = asfloat(tiny);
+ y = eval_as_float(y + t);
+ }
+ return y;
+}
diff --git a/sw/math/src/math/tanh.c b/sw/math/src/math/tanh.c
index 20d6dbcf4..2481db1dc 100644
--- a/sw/math/src/math/tanh.c
+++ b/sw/math/src/math/tanh.c
@@ -6,16 +6,23 @@
*/
double tanh(double x)
{
- union {double f; uint64_t i;} u = {.f = x};
uint32_t w;
int sign;
double_t t;
/* x = |x| */
- sign = u.i >> 63;
- u.i &= (uint64_t)-1/2;
- x = u.f;
- w = u.i >> 32;
+ /// Original implementation
+ // union {double f; uint64_t i;} u = {.f = x};
+ // sign = u.i >> 63;
+ // u.i &= (uint64_t)-1/2;
+ // x = u.f;
+ // w = u.i >> 32;
+ /// Safe implementation in Snitch (mask built unsigned: 1 << 31 overflows int)
+ uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
+ sign = upper_32b_x >> 31;
+ uint32_t sign_mask = ~((uint32_t)1 << 31);
+ w = upper_32b_x & sign_mask;
+ safe_inject_into_upper_32b_double(w, &x);
if (w > 0x3fe193ea) {
/* |x| > log(3)/2 ~= 0.5493 or nan */
diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h
index 7c94acdd9..169e54d7b 100644
--- a/sw/snRuntime/src/dma.h
+++ b/sw/snRuntime/src/dma.h
@@ -8,43 +8,49 @@ typedef uint32_t snrt_dma_txid_t;
/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers.
inline snrt_dma_txid_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src,
size_t size) {
- register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10
- register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11
- register uint32_t reg_src_low asm("a2") = src >> 0; // 12
- register uint32_t reg_src_high asm("a3") = src >> 32; // 13
- register uint32_t reg_size asm("a4") = size; // 14
-
- // dmsrc a2, a3
- asm volatile(
- ".word (0b0000000 << 25) | \
- ( (13) << 20) | \
- ( (12) << 15) | \
- ( 0b000 << 12) | \
- (0b0101011 << 0) \n" ::"r"(reg_src_high),
- "r"(reg_src_low));
-
- // dmdst a0, a1
- asm volatile(
- ".word (0b0000001 << 25) | \
- ( (11) << 20) | \
- ( (10) << 15) | \
- ( 0b000 << 12) | \
- (0b0101011 << 0) \n" ::"r"(reg_dst_high),
- "r"(reg_dst_low));
-
- // dmcpyi a0, a4, 0b00
- register uint32_t reg_txid asm("a0"); // 10
- asm volatile(
- ".word (0b0000010 << 25) | \
- ( 0b00000 << 20) | \
- ( (14) << 15) | \
- ( 0b000 << 12) | \
- ( (10) << 7) | \
- (0b0101011 << 0) \n"
- : "=r"(reg_txid)
- : "r"(reg_size));
-
- return reg_txid;
+ // Current DMA does not allow transfers with size == 0 (blocks)
+ // TODO(colluca) remove this check once new DMA is integrated
+ if (size > 0) {
+ register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10
+ register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11
+ register uint32_t reg_src_low asm("a2") = src >> 0; // 12
+ register uint32_t reg_src_high asm("a3") = src >> 32; // 13
+ register uint32_t reg_size asm("a4") = size; // 14
+
+ // dmsrc a2, a3
+ asm volatile(
+ ".word (0b0000000 << 25) | \
+ ( (13) << 20) | \
+ ( (12) << 15) | \
+ ( 0b000 << 12) | \
+ (0b0101011 << 0) \n" ::"r"(reg_src_high),
+ "r"(reg_src_low));
+
+ // dmdst a0, a1
+ asm volatile(
+ ".word (0b0000001 << 25) | \
+ ( (11) << 20) | \
+ ( (10) << 15) | \
+ ( 0b000 << 12) | \
+ (0b0101011 << 0) \n" ::"r"(reg_dst_high),
+ "r"(reg_dst_low));
+
+ // dmcpyi a0, a4, 0b00
+ register uint32_t reg_txid asm("a0"); // 10
+ asm volatile(
+ ".word (0b0000010 << 25) | \
+ ( 0b00000 << 20) | \
+ ( (14) << 15) | \
+ ( 0b000 << 12) | \
+ ( (10) << 7) | \
+ (0b0101011 << 0) \n"
+ : "=r"(reg_txid)
+ : "r"(reg_size));
+
+ return reg_txid;
+ } else {
+ return -1;
+ }
}
/// Initiate an asynchronous 1D DMA transfer.
@@ -58,65 +64,71 @@ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src,
size_t size, size_t dst_stride,
size_t src_stride,
size_t repeat) {
- register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10
- register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11
- register uint32_t reg_src_low asm("a2") = src >> 0; // 12
- register uint32_t reg_src_high asm("a3") = src >> 32; // 13
- register uint32_t reg_size asm("a4") = size; // 14
- register uint32_t reg_dst_stride asm("a5") = dst_stride; // 15
- register uint32_t reg_src_stride asm("a6") = src_stride; // 16
- register uint32_t reg_repeat asm("a7") = repeat; // 17
-
- // dmsrc a0, a1
- asm volatile(
- ".word (0b0000000 << 25) | \
- ( (13) << 20) | \
- ( (12) << 15) | \
- ( 0b000 << 12) | \
- (0b0101011 << 0) \n" ::"r"(reg_src_high),
- "r"(reg_src_low));
-
- // dmdst a0, a1
- asm volatile(
- ".word (0b0000001 << 25) | \
- ( (11) << 20) | \
- ( (10) << 15) | \
- ( 0b000 << 12) | \
- (0b0101011 << 0) \n" ::"r"(reg_dst_high),
- "r"(reg_dst_low));
-
- // dmstr a5, a6
- asm volatile(
- ".word (0b0000110 << 25) | \
- ( (15) << 20) | \
- ( (16) << 15) | \
- ( 0b000 << 12) | \
- (0b0101011 << 0) \n"
- :
- : "r"(reg_dst_stride), "r"(reg_src_stride));
-
- // dmrep a7
- asm volatile(
- ".word (0b0000111 << 25) | \
- ( (17) << 15) | \
- ( 0b000 << 12) | \
- (0b0101011 << 0) \n"
- :
- : "r"(reg_repeat));
-
- // dmcpyi a0, a4, 0b10
- register uint32_t reg_txid asm("a0"); // 10
- asm volatile(
- ".word (0b0000010 << 25) | \
- ( 0b00010 << 20) | \
- ( (14) << 15) | \
- ( 0b000 << 12) | \
- ( (10) << 7) | \
- (0b0101011 << 0) \n"
- : "=r"(reg_txid)
- : "r"(reg_size));
-
- return reg_txid;
+ // Current DMA does not allow transfers with size == 0 (blocks)
+ // TODO(colluca) remove this check once new DMA is integrated
+ if (size > 0) {
+ register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10
+ register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11
+ register uint32_t reg_src_low asm("a2") = src >> 0; // 12
+ register uint32_t reg_src_high asm("a3") = src >> 32; // 13
+ register uint32_t reg_size asm("a4") = size; // 14
+ register uint32_t reg_dst_stride asm("a5") = dst_stride; // 15
+ register uint32_t reg_src_stride asm("a6") = src_stride; // 16
+ register uint32_t reg_repeat asm("a7") = repeat; // 17
+
+ // dmsrc a2, a3
+ asm volatile(
+ ".word (0b0000000 << 25) | \
+ ( (13) << 20) | \
+ ( (12) << 15) | \
+ ( 0b000 << 12) | \
+ (0b0101011 << 0) \n" ::"r"(reg_src_high),
+ "r"(reg_src_low));
+
+ // dmdst a0, a1
+ asm volatile(
+ ".word (0b0000001 << 25) | \
+ ( (11) << 20) | \
+ ( (10) << 15) | \
+ ( 0b000 << 12) | \
+ (0b0101011 << 0) \n" ::"r"(reg_dst_high),
+ "r"(reg_dst_low));
+
+ // dmstr a5, a6
+ asm volatile(
+ ".word (0b0000110 << 25) | \
+ ( (15) << 20) | \
+ ( (16) << 15) | \
+ ( 0b000 << 12) | \
+ (0b0101011 << 0) \n"
+ :
+ : "r"(reg_dst_stride), "r"(reg_src_stride));
+
+ // dmrep a7
+ asm volatile(
+ ".word (0b0000111 << 25) | \
+ ( (17) << 15) | \
+ ( 0b000 << 12) | \
+ (0b0101011 << 0) \n"
+ :
+ : "r"(reg_repeat));
+
+ // dmcpyi a0, a4, 0b10
+ register uint32_t reg_txid asm("a0"); // 10
+ asm volatile(
+ ".word (0b0000010 << 25) | \
+ ( 0b00010 << 20) | \
+ ( (14) << 15) | \
+ ( 0b000 << 12) | \
+ ( (10) << 7) | \
+ (0b0101011 << 0) \n"
+ : "=r"(reg_txid)
+ : "r"(reg_size));
+
+ return reg_txid;
+ } else {
+ return -1;
+ }
}
/// Initiate an asynchronous 2D DMA transfer.
diff --git a/sw/snRuntime/src/dump.h b/sw/snRuntime/src/dump.h
index 8f24cc1b9..1d65395b5 100644
--- a/sw/snRuntime/src/dump.h
+++ b/sw/snRuntime/src/dump.h
@@ -4,6 +4,7 @@
//
// Authors: Samuel Riedel, ETH Zurich
// Viviane Potocnik, ETH Zurich
+// Luca Colagrande, ETH Zurich
// Dump a value via CSR
// !!! Careful: This is only supported in simulation and an experimental
@@ -11,18 +12,14 @@
// This can be exploited to quickly print measurement values from all cores
// simultaneously without the hassle of printf. To specify multiple metrics,
// different CSRs can be used. The macro will define a function that will then
-// always print via the same CSR. E.g., `dump(errors, 8)` will define a function
-// with the following signature: `dump_errors(uint32_t val)`, which will print
-// the given value via the 8th register. Alternatively, the `write_csr(reg,
-// val)` macro can be used directly.
+// always print via the same CSR. E.g., `NAMED_DUMP(uint32_t, errors, 8)` will
+// define a function with the following signature: `dump_errors(uint32_t val)`,
+// which will print the given value via the 8th register. Alternatively, the
+// `write_csr(reg, val)` macro can be used directly.
-#define dump_float(name, reg) \
- static __attribute__((always_inline)) inline void dump_##name(float val) { \
- asm volatile("csrw " #reg ", %0" ::"rK"(val)); \
+#define NAMED_DUMP(type, name, reg) \
+ static __attribute__((always_inline)) inline void dump_##name(type val) { \
+ asm volatile("csrw " #reg ", %0" ::"rK"(val)); \
}
-#define dump_uint(name, reg) \
- static \
- __attribute__((always_inline)) inline void dump_##name(uint32_t val) { \
- asm volatile("csrw " #reg ", %0" ::"rK"(val)); \
- }
\ No newline at end of file
+#define DUMP(val) ({ asm volatile("csrw 0x7C3, %0" ::"rK"(val)); })
diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
index 3fb338f4a..4e4cd2152 100644
--- a/sw/snRuntime/src/start.c
+++ b/sw/snRuntime/src/start.c
@@ -20,22 +20,38 @@ static inline void snrt_init_tls() {
extern volatile uint32_t __tdata_start, __tdata_end;
extern volatile uint32_t __tbss_start, __tbss_end;
- volatile uint32_t* p;
- volatile uint32_t* tls_ptr;
+ size_t size;
+ volatile uint32_t tls_ptr;
- asm volatile("mv %0, tp" : "=r"(tls_ptr) : :);
-
- // Copy tdata section
- for (p = (uint32_t*)(&__tdata_start); p < (uint32_t*)(&__tdata_end); p++) {
- *tls_ptr = *p;
- tls_ptr++;
+ // To avoid contentions in main memory, and take advantage of the
+ // bandwidth of the DMA, the DM core initializes the TLS section
+ // for every core in a cluster.
+ if (snrt_is_dm_core()) {
+ size = (size_t)(&__tdata_end) - (size_t)(&__tdata_start);
+
+ // First initialize the DM core's .tdata section from main memory
+ asm volatile("mv %0, tp" : "=r"(tls_ptr) : :);
+ snrt_dma_start_1d((void*)tls_ptr, (void*)(&__tdata_start), size);
+
+ // Then initialize all other cores' .tdata sections from the DM
+ // core's. The offset between the TLS section of successive cores
+ // is defined in start.S
+ size_t tls_offset = (1 << SNRT_LOG2_STACK_SIZE) + 8;
+ for (int i = 1; i < snrt_cluster_core_num(); i++) {
+ snrt_dma_start_1d((void*)(tls_ptr + i * tls_offset), (void*)tls_ptr,
+ size);
+ }
+
+ // Initialize all cores' .tbss sections
+ tls_ptr += size;
+ size = (size_t)(&__tbss_end) - (size_t)(&__tbss_start);
+ for (int i = 0; i < snrt_cluster_core_num(); i++) {
+ snrt_dma_start_1d((void*)(tls_ptr + i * tls_offset),
+ (void*)(snrt_zero_memory_ptr()), size);
+ }
}
- // Clear tbss section
- for (p = (uint32_t*)(&__tbss_start); p < (uint32_t*)(&__tbss_end); p++) {
- *tls_ptr = 0;
- tls_ptr++;
- }
+ snrt_cluster_hw_barrier();
}
#endif
@@ -66,7 +82,7 @@ static inline void snrt_init_cls() {
// Copy cdata section to base of the TCDM
size = (size_t)(&__cdata_end) - (size_t)(&__cdata_start);
- if (size > 0) snrt_dma_start_1d(ptr, (void*)(&__cdata_start), size);
+ snrt_dma_start_1d(ptr, (void*)(&__cdata_start), size);
// Clear cbss section
ptr = (void*)((uint32_t)ptr + size);
diff --git a/target/common/common.mk b/target/common/common.mk
index 6b9c679d0..0cf03c463 100644
--- a/target/common/common.mk
+++ b/target/common/common.mk
@@ -2,26 +2,41 @@
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
-LOGS_DIR ?= logs
-TB_DIR ?= $(SNITCH_ROOT)/target/common/test
-UTIL_DIR ?= $(SNITCH_ROOT)/util
+# Makefile invocation
+DEBUG ?= OFF # ON to turn on wave logging
+
+# Directories
+LOGS_DIR ?= logs
+TB_DIR ?= $(SNITCH_ROOT)/target/common/test
+UTIL_DIR ?= $(SNITCH_ROOT)/util
+
+# SEPP packages
+QUESTA_SEPP ?=
+VCS_SEPP ?=
+VERILATOR_SEPP ?=
# External executables
-BENDER ?= bender
-DASM ?= spike-dasm
-VLT ?= verilator
-VERIBLE_FMT ?= verible-verilog-format
-CLANG_FORMAT ?= clang-format
+BENDER ?= bender
+DASM ?= spike-dasm
+VLT ?= $(VERILATOR_SEPP) verilator
+VCS ?= $(VCS_SEPP) vcs
+VERIBLE_FMT ?= verible-verilog-format
+CLANG_FORMAT ?= clang-format
+VSIM ?= $(QUESTA_SEPP) vsim
+VOPT ?= $(QUESTA_SEPP) vopt
+VLOG ?= $(QUESTA_SEPP) vlog
+VLIB ?= $(QUESTA_SEPP) vlib
# Internal executables
-BIN2JTAG ?= $(UTIL_DIR)/bin2jtag.py
-GENTRACE ?= $(UTIL_DIR)/trace/gen_trace.py
-ANNOTATE_PY ?= $(UTIL_DIR)/trace/annotate.py
-EVENTS_PY ?= $(UTIL_DIR)/trace/events.py
-PERF_CSV_PY ?= $(UTIL_DIR)/trace/perf_csv.py
+GENTRACE_PY ?= $(UTIL_DIR)/trace/gen_trace.py
+ANNOTATE_PY ?= $(UTIL_DIR)/trace/annotate.py
+EVENTS_PY ?= $(UTIL_DIR)/trace/events.py
+PERF_CSV_PY ?= $(UTIL_DIR)/trace/perf_csv.py
+LAYOUT_EVENTS_PY ?= $(UTIL_DIR)/trace/layout_events.py
+EVENTVIS_PY ?= $(UTIL_DIR)/trace/eventvis.py
-VERILATOR_ROOT ?= $(dir $(shell which $(VLT)))/../share/verilator
-VLT_ROOT ?= ${VERILATOR_ROOT}
+VERILATOR_ROOT ?= $(dir $(shell $(VERILATOR_SEPP) which verilator))..
+VLT_ROOT ?= ${VERILATOR_ROOT}
MATCH_END := '/+incdir+/ s/$$/\/*\/*/'
MATCH_BGN := 's/+incdir+//g'
@@ -29,7 +44,14 @@ SED_SRCS := sed -e ${MATCH_END} -e ${MATCH_BGN}
VSIM_BENDER += -t test -t rtl -t simulation -t vsim
VSIM_SOURCES = $(shell ${BENDER} script flist ${VSIM_BENDER} | ${SED_SRCS})
-VSIM_BUILDDIR := work-vsim
+VSIM_BUILDDIR ?= work-vsim
+VSIM_FLAGS += -t 1ps
+ifeq ($(DEBUG), ON)
+VSIM_FLAGS += -do "log -r /*; run -a"
+VOPT_FLAGS = +acc
+else
+VSIM_FLAGS += -do "run -a"
+endif
# VCS_BUILDDIR should to be the same as the `DEFAULT : ./work-vcs`
# in target/snitch_cluster/synopsys_sim.setup
@@ -38,8 +60,8 @@ VCS_SOURCES = $(shell ${BENDER} script flist ${VCS_BENDER} | ${SED_SRCS})
VCS_BUILDDIR := work-vcs
# fesvr is being installed here
-FESVR ?= ${MKFILE_DIR}work
-FESVR_VERSION ?= 35d50bc40e59ea1d5566fbd3d9226023821b1bb6
+FESVR ?= ${MKFILE_DIR}work
+FESVR_VERSION ?= 35d50bc40e59ea1d5566fbd3d9226023821b1bb6
VLT_BENDER += -t rtl
VLT_SOURCES = $(shell ${BENDER} script flist ${VLT_BENDER} | ${SED_SRCS})
@@ -146,25 +168,33 @@ endef
# Modelsim #
############
+$(VSIM_BUILDDIR):
+ mkdir -p $@
+
+# Expects vlog/vcom script in $< (e.g. as output by bender)
+# Expects the top module name in $1
+# Produces a binary used to run the simulation at the path specified by $@
+# Fails if the compilation or optimization log reports a non-zero error
+# count. The count is matched as [1-9][0-9]* so that counts containing a
+# zero digit (10, 20, 100, ...) are detected as well.
define QUESTASIM
- ${VSIM} -c -do "source $<; quit" | tee $(dir $<)vsim.log
- @! grep -P "Errors: [1-9]*," $(dir $<)vsim.log
- @mkdir -p bin
+ ${VSIM} -c -do "source $<; quit" | tee $(dir $<)vlog.log
+ @! grep -P "Errors: [1-9][0-9]*," $(dir $<)vlog.log
+ $(VOPT) $(VOPT_FLAGS) -work $(VSIM_BUILDDIR) $1 -o $(1)_opt | tee $(dir $<)vopt.log
+ @! grep -P "Errors: [1-9][0-9]*," $(dir $<)vopt.log
+ @mkdir -p $(dir $@)
@echo "#!/bin/bash" > $@
- @echo 'binary=$$(realpath --relative-to=${MKFILE_DIR} $$1)' >> $@
- @echo 'cd ${MKFILE_DIR}' >> $@
+ @echo 'binary=$$(realpath $$1)' >> $@
+ @echo 'mkdir -p $(LOGS_DIR)' >> $@
@echo 'echo $$binary > $(LOGS_DIR)/.rtlbinary' >> $@
@echo '${VSIM} +permissive ${VSIM_FLAGS} $$3 -work ${MKFILE_DIR}/${VSIM_BUILDDIR} -c \
-ldflags "-Wl,-rpath,${FESVR}/lib -L${FESVR}/lib -lfesvr -lutil" \
- $1 +permissive-off ++$$binary ++$$2' >> $@
+ $(1)_opt +permissive-off ++$$binary ++$$2' >> $@
@chmod +x $@
@echo "#!/bin/bash" > $@.gui
- @echo 'binary=$$(pwd)/$$1' >> $@.gui
- @echo 'cd ${MKFILE_DIR}' >> $@.gui
+ @echo 'binary=$$(realpath $$1)' >> $@.gui
+ @echo 'mkdir -p $(LOGS_DIR)' >> $@.gui
@echo 'echo $$binary > $(LOGS_DIR)/.rtlbinary' >> $@.gui
@echo '${VSIM} +permissive ${VSIM_FLAGS} -work ${MKFILE_DIR}/${VSIM_BUILDDIR} \
-ldflags "-Wl,-rpath,${FESVR}/lib -L${FESVR}/lib -lfesvr -lutil" \
- $1 +permissive-off ++$$binary ++$$2' >> $@.gui
+ $(1)_opt +permissive-off ++$$binary ++$$2' >> $@.gui
@chmod +x $@.gui
endef
@@ -175,7 +205,7 @@ $(VCS_BUILDDIR)/compile.sh:
mkdir -p $(VCS_BUILDDIR)
${BENDER} script vcs ${VCS_BENDER} --vlog-arg="${VLOGAN_FLAGS}" --vcom-arg="${VHDLAN_FLAGS}" > $@
chmod +x $@
- $@ > $(VCS_BUILDDIR)/compile.log
+ $(VCS_SEPP) $@ > $(VCS_BUILDDIR)/compile.log
########
# Util #
@@ -189,26 +219,56 @@ define reggen_generate_header
@$(CLANG_FORMAT) -i $1
endef
-$(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE)
- $(DASM) < $< | $(PYTHON) $(GENTRACE) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt
+# Arg 1: binary
+# Arg 2: max size in bytes
+define BINARY_SIZE_CHECK
+ echo "Binary size: $$(stat -c %s $(1))B"
+ @[ "$$(stat -c %s $(1))" -lt "$(2)" ] || (echo "Binary exceeds specified size of $(2)B"; exit 1)
+endef
+
+##########
+# Traces #
+##########
+
+# File lists derived from the disassembled traces (trace_hart_*.dasm)
+# currently present in LOGS_DIR. Every other list is obtained by rewriting
+# the .dasm file names; the dot is escaped in all sed patterns so that only
+# the literal ".dasm" extension is replaced.
+DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null))
+TXT_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.txt/g'))
+PERF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/\.dasm/_perf.json/g'))
+ANNOTATED_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.s/g'))
+DIFF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.diff/g'))
-traces: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.txt/') || echo "") \
- $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/') || echo "")
+GENTRACE_OUTPUTS = $(TXT_TRACES) $(PERF_TRACES)
+ANNOTATE_OUTPUTS = $(ANNOTATED_TRACES)
+PERF_CSV = $(LOGS_DIR)/perf.csv
+EVENT_CSV = $(LOGS_DIR)/event.csv
+TRACE_CSV = $(LOGS_DIR)/trace.csv
+TRACE_JSON = $(LOGS_DIR)/trace.json
+
+.PHONY: traces annotate perf-csv event-csv layout
+traces: $(GENTRACE_OUTPUTS)
+annotate: $(ANNOTATE_OUTPUTS)
+perf-csv: $(PERF_CSV)
+event-csv: $(EVENT_CSV)
+layout: $(TRACE_CSV) $(TRACE_JSON)
+
+$(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY)
+ $(DASM) < $< | $(PYTHON) $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt
-# make annotate
# Generate source-code interleaved traces for all harts. Reads the binary from
# the logs/.rtlbinary file that is written at start of simulation in the vsim script
+BINARY ?= $(shell cat $(LOGS_DIR)/.rtlbinary)
$(LOGS_DIR)/trace_hart_%.s: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY}
$(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $<
$(LOGS_DIR)/trace_hart_%.diff: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY}
$(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -d
-BINARY ?= $(shell cat $(LOGS_DIR)/.rtlbinary)
-annotate: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.s/') || echo "") \
- $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.diff/') || echo "")
-# Arg 1: binary
-# Arg 2: max size in bytes
-define BINRAY_SIZE_CHECK
- echo "Binary size: $$(stat -c %s $(1))B"
- @[ "$$(stat -c %s $(1))" -lt "$(2)" ] || (echo "Binary exceeds specified size of $(2)B"; exit 1)
-endef
+$(PERF_CSV): $(PERF_TRACES) $(PERF_CSV_PY)
+ $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES)
+
+$(EVENT_CSV): $(PERF_TRACES) $(PERF_CSV_PY)
+ $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES) --filter tstart tend
+
+$(TRACE_CSV): $(EVENT_CSV) $(LAYOUT_FILE) $(LAYOUT_EVENTS_PY)
+ $(PYTHON) $(LAYOUT_EVENTS_PY) $(LAYOUT_EVENTS_FLAGS) $(EVENT_CSV) $(LAYOUT_FILE) -o $@
+
+$(TRACE_JSON): $(TRACE_CSV) $(EVENTVIS_PY)
+ $(PYTHON) $(EVENTVIS_PY) -o $@ $(TRACE_CSV)
diff --git a/target/common/test/ipc.cc b/target/common/test/ipc.cc
index 5eaffcf85..09188a7b3 100644
--- a/target/common/test/ipc.cc
+++ b/target/common/test/ipc.cc
@@ -19,60 +19,67 @@ void* IpcIface::ipc_thread_handle(void* in) {
// Handle commands
ipc_op_t op;
- while (!feof(tx)) {
- uint8_t ret_value = fread(&op, sizeof(ipc_op_t), 1, tx);
- if (ret_value != 1) {
- if (ferror(tx)) {
- continue; // jumps to while() again
- }
- }
- switch (op.opcode) {
- case Read:
- // Read full blocks until one full block or less left
- printf("[IPC] Read from 0x%x len 0x%x ...\n", op.addr, op.len);
- for (uint64_t i = op.len; i > IPC_BUF_SIZE; i -= IPC_BUF_SIZE) {
- sim::MEM.read(op.addr, IPC_BUF_SIZE, buf_data);
- fwrite(buf_data, IPC_BUF_SIZE, 1, rx);
- op.addr += IPC_BUF_SIZE;
- op.len -= IPC_BUF_SIZE;
- }
- sim::MEM.read(op.addr, op.len, buf_data);
- fwrite(buf_data, op.len, 1, rx);
- fflush(rx);
- break;
- case Write:
- // Write full blocks until one full block or less left
- printf("[IPC] Write to 0x%x len %d ...\n", op.addr, op.len);
- for (uint64_t i = op.len; i > IPC_BUF_SIZE; i -= IPC_BUF_SIZE) {
- fread(buf_data, IPC_BUF_SIZE, 1, tx);
- sim::MEM.write(op.addr, IPC_BUF_SIZE, buf_data, buf_strb);
- op.addr += IPC_BUF_SIZE;
- op.len -= IPC_BUF_SIZE;
- }
- fread(buf_data, op.len, 1, tx);
- sim::MEM.write(op.addr, op.len, buf_data, buf_strb);
- break;
- case Poll:
- // Unpack 32b checking mask and expected value from length
- uint32_t mask = op.len & 0xFFFFFFFF;
- uint32_t expected = (op.len >> 32) & 0xFFFFFFFF;
- printf("[IPC] Poll on 0x%x mask 0x%x expected 0x%x ...\n",
- op.addr, mask, expected);
- uint32_t read;
- do {
- sim::MEM.read(op.addr, sizeof(uint32_t),
- (uint8_t*)(void*)&read);
- nanosleep(
- (const struct timespec[]){{0, IPC_POLL_PERIOD_NS}},
- NULL);
- } while ((read & mask) == (expected & mask));
- // Send back read 32b word
- fwrite(&read, sizeof(uint32_t), 1, rx);
- fflush(rx);
+ while (1) {
+ if (!fread(&op, sizeof(ipc_op_t), 1, tx)) {
+ if (feof(tx)) {
+ printf(
+ "[IPC] All messages read. Closing FIFOs and joining main "
+ "thread.\n");
break;
+ }
+ } else {
+ switch (op.opcode) {
+ case Read:
+ // Read full blocks until one full block or less left
+ printf("[IPC] Read from 0x%x len 0x%x ...\n", op.addr,
+ op.len);
+ for (uint64_t i = op.len; i > IPC_BUF_SIZE;
+ i -= IPC_BUF_SIZE) {
+ sim::MEM.read(op.addr, IPC_BUF_SIZE, buf_data);
+ fwrite(buf_data, IPC_BUF_SIZE, 1, rx);
+ op.addr += IPC_BUF_SIZE;
+ op.len -= IPC_BUF_SIZE;
+ }
+ sim::MEM.read(op.addr, op.len, buf_data);
+ fwrite(buf_data, op.len, 1, rx);
+ fflush(rx);
+ break;
+ case Write:
+ // Write full blocks until one full block or less left
+ printf("[IPC] Write to 0x%x len %d ...\n", op.addr, op.len);
+ for (uint64_t i = op.len; i > IPC_BUF_SIZE;
+ i -= IPC_BUF_SIZE) {
+ fread(buf_data, IPC_BUF_SIZE, 1, tx);
+ sim::MEM.write(op.addr, IPC_BUF_SIZE, buf_data,
+ buf_strb);
+ op.addr += IPC_BUF_SIZE;
+ op.len -= IPC_BUF_SIZE;
+ }
+ fread(buf_data, op.len, 1, tx);
+ sim::MEM.write(op.addr, op.len, buf_data, buf_strb);
+ break;
+ case Poll:
+ // Unpack 32b checking mask and expected value from length
+ uint32_t mask = op.len & 0xFFFFFFFF;
+ uint32_t expected = (op.len >> 32) & 0xFFFFFFFF;
+ printf("[IPC] Poll on 0x%x mask 0x%x expected 0x%x ...\n",
+ op.addr, mask, expected);
+ uint32_t read;
+ do {
+ sim::MEM.read(op.addr, sizeof(uint32_t),
+ (uint8_t*)(void*)&read);
+ nanosleep(
+ (const struct timespec[]){{0, IPC_POLL_PERIOD_NS}},
+ NULL);
+ } while ((read & mask) == (expected & mask));
+ // Send back read 32b word
+ fwrite(&read, sizeof(uint32_t), 1, rx);
+ fflush(rx);
+ break;
+ }
}
- printf("[IPC] ... done\n");
}
+
// TX FIFO closed at other end: close both FIFOs and join main thread
fclose(tx);
fclose(rx);
diff --git a/target/common/test/verilator_lib.cc b/target/common/test/verilator_lib.cc
index 63ac66d5b..3e1ae89e1 100644
--- a/target/common/test/verilator_lib.cc
+++ b/target/common/test/verilator_lib.cc
@@ -14,10 +14,15 @@ namespace sim {
// Number of cycles between HTIF checks.
const int HTIFTimeInterval = 200;
+
+// We want to return timestamp in picosecond accuracy, assuming that one cycle
+// takes 1ns. Since 1 cycle takes 2 sim::TIME increments, scale by 500 to get
+// time = cycle * 1000 +
+const int TIME_CYCLES_TO_TIMESTAMP = 500;
void sim_thread_main(void *arg) { ((Sim *)arg)->main(); }
// Sim time.
-int TIME = 0;
+vluint64_t TIME = 0;
Sim::Sim(int argc, char **argv) : htif_t(argc, argv), ipc(argc, argv) {
// Search arguments for `--vcd` flag and enable waves if requested
@@ -78,7 +83,7 @@ void Sim::main() {
} // namespace sim
// Verilator callback to get the current time.
-double sc_time_stamp() { return sim::TIME * 1e-9; }
+double sc_time_stamp() { return sim::TIME * sim::TIME_CYCLES_TO_TIMESTAMP; }
// DPI calls.
void tb_memory_read(long long addr, int len, const svOpenArrayHandle data) {
diff --git a/target/snitch_cluster/.gitignore b/target/snitch_cluster/.gitignore
index b7f1de414..f74d9fde4 100644
--- a/target/snitch_cluster/.gitignore
+++ b/target/snitch_cluster/.gitignore
@@ -6,4 +6,5 @@
/work-vsim/
/work-vlt/
/work-vcs/
-/*.log
\ No newline at end of file
+/*.log
+/runs/
\ No newline at end of file
diff --git a/target/snitch_cluster/Makefile b/target/snitch_cluster/Makefile
index 7b38bbad6..037621213 100644
--- a/target/snitch_cluster/Makefile
+++ b/target/snitch_cluster/Makefile
@@ -9,7 +9,7 @@
# Makefile invocation #
#######################
-DEBUG ?= OFF # ON to turn on debugging symbols
+DEBUG ?= OFF # ON to turn on debugging symbols and wave logging
CFG_OVERRIDE ?= # Override default config file
SELECT_RUNTIME ?= # Select snRuntime implementation: "banshee" or "rtl" (default)
@@ -37,9 +37,6 @@ REGGEN ?= $(shell $(BENDER) path register_interface)/vendor/lowrisc_ope
CLUSTER_GEN ?= $(ROOT)/util/clustergen.py
CLUSTER_GEN_SRC ?= $(wildcard $(ROOT)/util/clustergen/*.py)
-VSIM ?= vsim
-VLOG ?= vlog
-
#########################
# Files and directories #
#########################
@@ -71,9 +68,6 @@ QUESTA_64BIT = -64
VLOG_64BIT = -64
VSIM_FLAGS += ${QUESTA_64BIT}
-VSIM_FLAGS += -t 1ps
-VSIM_FLAGS += -voptargs=+acc
-VSIM_FLAGS += -do "log -r /*; run -a"
VLOG_FLAGS += -svinputport=compat
VLOG_FLAGS += -override_timescale 1ns/1ps
@@ -245,7 +239,7 @@ clean-vsim: clean-work
rm -rf bin/snitch_cluster.vsim bin/snitch_cluster.vsim.gui $(VSIM_BUILDDIR) vsim.wlf
${VSIM_BUILDDIR}/compile.vsim.tcl:
- vlib $(dir $@)
+ $(VLIB) $(dir $@)
${BENDER} script vsim ${VSIM_BENDER} --vlog-arg="${VLOG_FLAGS} -work $(dir $@) " > $@
echo '${VLOG} -work $(dir $@) ${TB_CC_SOURCES} ${TB_ASM_SOURCES} -vv -ccflags "$(TB_CC_FLAGS)"' >> $@
echo 'return 0' >> $@
@@ -267,22 +261,10 @@ clean-vcs: clean-work
# Build compilation script and compile all sources for VCS simulation
bin/snitch_cluster.vcs: ${VCS_SOURCES} ${TB_SRCS} $(TB_CC_SOURCES) $(TB_ASM_SOURCES) $(VCS_BUILDDIR)/compile.sh work/lib/libfesvr.a
mkdir -p bin
- vcs -Mlib=$(VCS_BUILDDIR) -Mdir=$(VCS_BUILDDIR) -o bin/snitch_cluster.vcs -cc $(CC) -cpp $(CXX) \
+ $(VCS) -Mlib=$(VCS_BUILDDIR) -Mdir=$(VCS_BUILDDIR) -o bin/snitch_cluster.vcs -cc $(CC) -cpp $(CXX) \
-assert disable_cover -override_timescale=1ns/1ps -full64 tb_bin $(TB_CC_SOURCES) $(TB_ASM_SOURCES) \
-CFLAGS "$(TB_CC_FLAGS)" -LDFLAGS "-L${FESVR}/lib" -lfesvr
-##########
-# Traces #
-##########
-
-$(LOGS_DIR)/perf.csv: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/')) \
- $(PERF_CSV_PY)
- $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(LOGS_DIR)/hart_*_perf.json
-
-$(LOGS_DIR)/event.csv: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/')) \
- $(PERF_CSV_PY)
- $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(LOGS_DIR)/hart_*_perf.json --filter tstart tend
-
########
# Util #
########
diff --git a/target/snitch_cluster/cfg/default.hjson b/target/snitch_cluster/cfg/default.hjson
index c39c2a490..7f28a1073 100644
--- a/target/snitch_cluster/cfg/default.hjson
+++ b/target/snitch_cluster/cfg/default.hjson
@@ -34,8 +34,8 @@
lat_comp_fp8: 1,
lat_comp_fp8_alt: 1,
lat_noncomp: 1,
- lat_conv: 1,
- lat_sdotp: 2,
+ lat_conv: 2,
+ lat_sdotp: 3,
fpu_pipe_config: "BEFORE"
narrow_xbar_latency: "CUT_ALL_PORTS",
wide_xbar_latency: "CUT_ALL_PORTS",
diff --git a/target/snitch_cluster/run.py b/target/snitch_cluster/run.py
new file mode 100755
index 000000000..bef478ef7
--- /dev/null
+++ b/target/snitch_cluster/run.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).parent / '../../util/sim'))
+from sim_utils import parser, get_simulations, run_simulations # noqa: E402
+from Simulator import QuestaSimulator, VCSSimulator, VerilatorSimulator, \
+ BansheeSimulator # noqa: E402
+
+
+# Simulators selectable through the `--simulator` command-line option,
+# indexed by the name expected on the command line. Each one is built
+# from a path resolved relative to this script's directory: a simulation
+# binary under `bin/` for the RTL simulators (QuestaSim, VCS, Verilator)
+# and the `src/banshee.yaml` configuration file for Banshee.
+SIMULATORS = {
+ 'vsim': QuestaSimulator(Path(__file__).parent.resolve() / 'bin/snitch_cluster.vsim'),
+ 'vcs': VCSSimulator(Path(__file__).parent.resolve() / 'bin/snitch_cluster.vcs'),
+ 'verilator': VerilatorSimulator(Path(__file__).parent.resolve() / 'bin/snitch_cluster.vlt'),
+ 'banshee': BansheeSimulator(Path(__file__).parent.resolve() / 'src/banshee.yaml')
+}
+
+
+def main():
+    """Run the tests of a test list on the simulator chosen by the user.
+
+    The command line is parsed with the parser provided by `sim_utils`,
+    restricted to the simulators registered in `SIMULATORS`. The value
+    returned by `run_simulations` is handed back to the caller, which
+    uses it as the exit status of the script.
+    """
+    cli = parser('vsim', SIMULATORS.keys())
+    args = cli.parse_args()
+
+    # Build one simulation per test, targeting the selected simulator
+    simulator = SIMULATORS[args.simulator]
+    simulations = get_simulations(args.testlist, simulator, args.run_dir)
+
+    # Forward the remaining command-line options to the runner
+    run_options = {
+        'n_procs': args.n_procs,
+        'dry_run': args.dry_run,
+        'early_exit': args.early_exit,
+        'verbose': args.verbose,
+    }
+    return run_simulations(simulations, **run_options)
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/target/snitch_cluster/sw/Makefile b/target/snitch_cluster/sw/Makefile
index 9badf70ea..a0115d00a 100644
--- a/target/snitch_cluster/sw/Makefile
+++ b/target/snitch_cluster/sw/Makefile
@@ -13,21 +13,19 @@ else
RUNTIME = runtime/rtl
endif
-MATH = ../../../sw/math
-
-SUBDIRS = runtime/banshee runtime/rtl $(MATH) apps tests
+SUBDIRS = runtime/banshee runtime/rtl math apps tests
.PHONY: all $(SUBDIRS)
all: $(SUBDIRS)
# Explicit dependency of apps on runtime
-apps: $(RUNTIME) $(MATH)
+apps: $(RUNTIME) math
$(MAKE) -C $@ TARGET=$(TARGET)
# Explicit dependency of tests on runtime
-tests: $(RUNTIME) $(MATH)
+tests: $(RUNTIME) math
$(MAKE) -C $@ $(TARGET)
-runtime/rtl runtime/banshee $(MATH):
+runtime/rtl runtime/banshee math:
$(MAKE) -C $@ $(TARGET)
diff --git a/target/snitch_cluster/sw/apps/common.mk b/target/snitch_cluster/sw/apps/common.mk
index d8b0659a4..e27a19cfd 100644
--- a/target/snitch_cluster/sw/apps/common.mk
+++ b/target/snitch_cluster/sw/apps/common.mk
@@ -22,6 +22,7 @@ RISCV_CFLAGS += -DBIST
else
RUNTIME_DIR := $(ROOT)/target/snitch_cluster/sw/runtime/rtl
endif
+MATH_DIR := $(ROOT)/target/snitch_cluster/sw/math
# Paths relative to the app including this Makefile
BUILDDIR = $(abspath build)
@@ -37,19 +38,18 @@ INCDIRS += $(SNRT_DIR)/api/omp
INCDIRS += $(SNRT_DIR)/src
INCDIRS += $(SNRT_DIR)/src/omp
INCDIRS += $(ROOT)/sw/deps/riscv-opcodes
-
-# Math library override
-INCDIRS += $(ROOT)/sw/math/arch/riscv64/bits/
-INCDIRS += $(ROOT)/sw/math/arch/generic
-INCDIRS += $(ROOT)/sw/math/src/include
-INCDIRS += $(ROOT)/sw/math/src/internal
-INCDIRS += $(ROOT)/sw/math/include/bits
INCDIRS += $(ROOT)/sw/math/include
+LIBS = $(MATH_DIR)/build/libmath.a
+LIBS += $(RUNTIME_DIR)/build/libsnRuntime.a
+
+LIBDIRS = $(dir $(LIBS))
+LIBNAMES = $(patsubst lib%,%,$(notdir $(basename $(LIBS))))
+
RISCV_LDFLAGS += -L$(abspath $(RUNTIME_DIR))
RISCV_LDFLAGS += -T$(abspath $(SNRT_DIR)/base.ld)
-RISCV_LDFLAGS += -L$(abspath $(RUNTIME_DIR)/build/)
-RISCV_LDFLAGS += -lsnRuntime
+RISCV_LDFLAGS += $(addprefix -L,$(LIBDIRS))
+RISCV_LDFLAGS += $(addprefix -l,$(LIBNAMES))
###########
# Outputs #
@@ -78,11 +78,11 @@ $(BUILDDIR):
$(DEP): $(SRCS) | $(BUILDDIR)
$(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(ELF)' $< > $@
-$(ELF): $(SRCS) $(DEP) | $(BUILDDIR)
+$(ELF): $(SRCS) $(DEP) $(LIBS) | $(BUILDDIR)
$(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@
$(DUMP): $(ELF) | $(BUILDDIR)
- $(RISCV_OBJDUMP) -D $< > $@
+ $(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) $< > $@
$(DWARF): $(ELF) | $(BUILDDIR)
$(RISCV_DWARFDUMP) $< > $@
diff --git a/target/snitch_cluster/sw/math/Makefile b/target/snitch_cluster/sw/math/Makefile
new file mode 100644
index 000000000..d0a83e86a
--- /dev/null
+++ b/target/snitch_cluster/sw/math/Makefile
@@ -0,0 +1,8 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+include ../toolchain.mk
+include ../../../../sw/math/Makefile
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index f25ea7641..ce241a8d4 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -68,11 +68,11 @@ runs:
- elf: tests/build/varargs_2.elf
- elf: tests/build/zero_mem.elf
- elf: tests/build/non_null_exitcode.elf
- exit_code: 14
+ retcode: 14
- elf: apps/blas/axpy/build/axpy.elf
- cmd: ../../sw/blas/axpy/verify.py {sim_bin} {elf}
+ cmd: [../../../sw/blas/axpy/verify.py, "${sim_bin}", "${elf}"]
- elf: apps/blas/gemm/build/gemm.elf
- cmd: ../../sw/blas/gemm/verify.py {sim_bin} {elf}
+ cmd: [../../../sw/blas/gemm/verify.py, "${sim_bin}", "${elf}"]
- elf: apps/dnn/batchnorm/build/batchnorm.elf
- elf: apps/dnn/linear/build/linear.elf
- elf: apps/dnn/maxpool/build/maxpool.elf
diff --git a/target/snitch_cluster/sw/toolchain.mk b/target/snitch_cluster/sw/toolchain.mk
index 4fa0fc5af..3d50974b8 100644
--- a/target/snitch_cluster/sw/toolchain.mk
+++ b/target/snitch_cluster/sw/toolchain.mk
@@ -34,6 +34,7 @@ RISCV_CFLAGS += -mcmodel=medany
# RISCV_CFLAGS += -mno-fdiv # Not supported by Clang
RISCV_CFLAGS += -ffast-math
RISCV_CFLAGS += -fno-builtin-printf
+RISCV_CFLAGS += -fno-builtin-sqrtf
RISCV_CFLAGS += -fno-common
RISCV_CFLAGS += -fopenmp
RISCV_CFLAGS += -ftls-model=local-exec
@@ -54,3 +55,7 @@ RISCV_LDFLAGS += -lclang_rt.builtins-riscv32
# Archiver flags
RISCV_ARFLAGS = rcs
+
+# Objdump flags
+RISCV_OBJDUMP_FLAGS += --mcpu=snitch
+RISCV_OBJDUMP_FLAGS += -D
diff --git a/util/container/Dockerfile b/util/container/Dockerfile
index ea320f325..d917a6790 100644
--- a/util/container/Dockerfile
+++ b/util/container/Dockerfile
@@ -7,7 +7,11 @@
# 1. Stage
FROM ubuntu:18.04 AS builder
ARG CMAKE_VERSION=3.19.4
+ARG PYTHON_VERSION=3.9.12
+# Run dpkg without interactive dialogue
+ARG DEBIAN_FRONTEND=noninteractive
+# Install APT requirements
COPY apt-requirements.txt /tmp/apt-requirements.txt
RUN apt-get update && \
sed 's/#.*//' /tmp/apt-requirements.txt \
@@ -20,8 +24,26 @@ RUN apt-get update && \
lsb-release \
software-properties-common \
unzip \
- wget \
- zlib1g-dev
+ wget
+# Required to install Python
+RUN apt-get update && apt-get install -y \
+ zlib1g-dev \
+ libreadline-gplv2-dev \
+ libncursesw5-dev \
+ libssl-dev \
+ libsqlite3-dev \
+ tk-dev \
+ libgdbm-dev \
+ libc6-dev \
+ libbz2-dev \
+ libffi-dev
+
+# Install Python
+RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
+RUN tar xzf Python-${PYTHON_VERSION}.tgz
+RUN cd Python-${PYTHON_VERSION} && \
+ ./configure --enable-optimizations --prefix=/opt/python/ && \
+ make install -j
# Build Rust tools
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
@@ -37,6 +59,7 @@ RUN wget https://apt.llvm.org/llvm.sh
RUN chmod +x llvm.sh
RUN ./llvm.sh 12
+# Change working directory
WORKDIR /tools
# Install a newer version of cmake (we need this for banshee)
@@ -73,9 +96,11 @@ RUN apt-get update && \
sed 's/#.*//' /tmp/apt-requirements.txt \
| xargs apt-get install -y && \
apt-get install -y --no-install-recommends \
+ ca-certificates \
gnupg2 \
curl \
wget \
+ build-essential \
git && \
apt-get clean ; \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/doc/*
@@ -86,12 +111,7 @@ RUN echo 'deb http://download.opensuse.org/repositories/home:/phiwag:/edatools/x
apt-get update && apt-get install -y verilator-${VERILATOR_VERSION} && \
apt-get clean ; \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/doc/*
-
-# Install Python requirements
-COPY python-requirements.txt /tmp/python-requirements.txt
-COPY docs/requirements.txt /tmp/docs/requirements.txt
-COPY sw/dnn/requirements.txt /tmp/sw/dnn/requirements.txt
-RUN pip3 install -r /tmp/python-requirements.txt
+ENV VLT_ROOT "/usr/share/verilator"
# Get the precompiled LLVM toolchain
RUN latest_tag=`curl -s -H "Accept: application/vnd.github.v3+json" https://api.github.com/repos/pulp-platform/llvm-project/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/'` && \
@@ -119,6 +139,17 @@ RUN apt-get update && apt-get install software-properties-common -y && \
# Copy artifacts from stage 1.
COPY --from=builder /root/.cargo/bin/bender bin/
COPY --from=builder /root/.cargo/bin/banshee bin/
+COPY --from=builder /opt/python /opt/python
+
+# Create and activate virtual environment
+ENV VIRTUAL_ENV "/root/.venvs/snitch_cluster"
+RUN /opt/python/bin/python3 -m venv ${VIRTUAL_ENV}
+ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
+# Install Python requirements
+COPY python-requirements.txt /tmp/python-requirements.txt
+COPY docs/requirements.txt /tmp/docs/requirements.txt
+COPY sw/dnn/requirements.txt /tmp/sw/dnn/requirements.txt
+RUN pip install -r /tmp/python-requirements.txt
# Set locale to UTF-8, required because Python 3.6 defaults on ASCII encoding.
# See https://click.palletsprojects.com/en/8.1.x/unicode-support/
diff --git a/util/sim/Simulation.py b/util/sim/Simulation.py
new file mode 100644
index 000000000..3cc219389
--- /dev/null
+++ b/util/sim/Simulation.py
@@ -0,0 +1,242 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+from termcolor import colored, cprint
+from pathlib import Path
+import subprocess
+import re
+import os
+from mako.template import Template
+
+
+class Simulation(object):
+    """Provides a common interface to manage simulations."""
+
+    # Name of the file, inside the run directory, collecting the
+    # simulation's stdout and stderr
+    LOG_FILE = 'sim.txt'
+
+    def __init__(self, elf=None, dry_run=False, retcode=0, run_dir=None):
+        """Constructor for the Simulation class.
+
+        A Simulation object is defined at a minimum by a software
+        binary to be simulated on the desired hardware. The hardware is
+        implicitly determined by the simulation command.
+
+        Arguments:
+            elf: The software binary to simulate.
+            dry_run: A preview of the simulation command will be
+                displayed without actually launching the simulation.
+            retcode: The return code the simulation is expected to
+                produce in order to be considered successful. Any
+                value accepted by `int()` is allowed.
+            run_dir: The directory where to launch the simulation
+                command. If none is passed, the current working
+                directory is assumed.
+        """
+        self.elf = elf
+        self.dry_run = dry_run
+        self.run_dir = run_dir if run_dir is not None else Path.cwd()
+        # `elf` defaults to None: do not crash in Path() in that case
+        self.testname = Path(self.elf).stem if self.elf is not None else None
+        self.cmd = []
+        self.log = None
+        self.process = None
+        self.expected_retcode = int(retcode)
+
+    def launch(self, dry_run=None):
+        """Launch the simulation.
+
+        Launch the simulation by invoking the command stored in the
+        `cmd` attribute of the class. Subclasses are required to define
+        a non-empty `cmd` attribute prior to invoking this method.
+
+        Arguments:
+            dry_run: A preview of the simulation command is displayed
+                without actually launching the simulation.
+        """
+        # Override dry_run setting at launch time
+        if dry_run is not None:
+            self.dry_run = dry_run
+
+        # Print launch message and simulation command
+        cprint(f'Run test {colored(self.elf, "cyan")}', attrs=["bold"])
+        cmd_string = ' '.join(self.cmd)
+        print(f'[{self.run_dir}]$ {cmd_string}', flush=True)
+
+        # Launch simulation if not doing a dry run
+        if not self.dry_run:
+            # Create run directory and log file. The run directory may
+            # have been passed as a plain string, so convert it before
+            # joining it with the log file name.
+            os.makedirs(self.run_dir, exist_ok=True)
+            self.log = Path(self.run_dir) / self.LOG_FILE
+            # Launch simulation subprocess, redirecting both stdout and
+            # stderr to the log file
+            with open(self.log, 'w') as f:
+                self.process = subprocess.Popen(self.cmd, stdout=f, stderr=subprocess.STDOUT,
+                                                cwd=self.run_dir, universal_newlines=True)
+
+    def completed(self):
+        """Return whether the simulation completed.
+
+        A dry run is always considered complete. A simulation which
+        was not launched yet is not.
+        """
+        if self.dry_run:
+            return True
+        elif self.process:
+            return self.process.poll() is not None
+        else:
+            return False
+
+    def get_retcode(self):
+        """Get the return code of the simulation.
+
+        Returns:
+            0 for a dry run, the return code of the simulation process
+            once it has terminated, or None if the simulation was not
+            launched or its return code is not available yet.
+        """
+        if self.dry_run:
+            return 0
+        # `returncode` stays None until the process has terminated and
+        # has been polled: report "no return code" instead of failing
+        # on int(None)
+        if self.process is None or self.process.returncode is None:
+            return None
+        return int(self.process.returncode)
+
+    def successful(self):
+        """Return whether the simulation was successful.
+
+        The simulation is successful if its return code is available
+        and matches the expected return code.
+        """
+        actual_retcode = self.get_retcode()
+        if actual_retcode is not None:
+            return int(actual_retcode) == int(self.expected_retcode)
+        else:
+            return False
+
+    def print_log(self):
+        """Print a log of the simulation to stdout.
+
+        Nothing is printed if no log exists, i.e. in a dry run or if
+        the simulation was not launched yet.
+        """
+        if self.log is None:
+            return
+        with open(self.log, 'r') as f:
+            print(f.read())
+
+    def print_status(self):
+        """Print a status message to stdout.
+
+        The status message reports whether the test is still running
+        or, if it completed, whether it was successful or failed.
+        """
+        if self.completed():
+            if self.successful():
+                cprint(f'{self.elf} test passed', 'green', attrs=['bold'], flush=True)
+            else:
+                cprint(f'{self.elf} test failed', 'red', attrs=['bold'], flush=True)
+        else:
+            cprint(f'{self.elf} test running', 'black', flush=True)
+
+
+class RTLSimulation(Simulation):
+    """A simulation executed by an RTL simulator.
+
+    The simulation is started through a simulation binary which must
+    have been built beforehand from an RTL design.
+    """
+
+    def __init__(self, sim_bin=None, **kwargs):
+        """Constructor for the RTLSimulation class.
+
+        Arguments:
+            sim_bin: The simulation binary.
+            kwargs: Arguments passed to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        # The simulation binary receives the software binary as its
+        # first argument
+        simulator, binary = str(sim_bin), str(self.elf)
+        self.cmd = [simulator, binary]
+
+
+class VerilatorSimulation(RTLSimulation):
+    """An RTL simulation running on Verilator.
+
+    The return code of the simulation is returned directly as the
+    return code of the command launching the simulation.
+    """
+
+    def get_retcode(self):
+        """Get the return code of the simulation.
+
+        Returns:
+            0 for a dry run, None if the simulation was not launched,
+            the return code of the simulation process otherwise (None
+            as long as the process has not terminated).
+        """
+        # No process exists in a dry run or before the simulation is
+        # launched: do not dereference `self.process` in these cases
+        if self.dry_run:
+            return 0
+        if self.process is None:
+            return None
+        return self.process.returncode
+
+
+class QuestaVCSSimulation(RTLSimulation):
+    """An RTL simulation running on QuestaSim or VCS.
+
+    QuestaSim and VCS print out the simulation return code in the
+    simulation log. This is parsed to extract the return code.
+    """
+
+    def get_retcode(self):
+        """Get the application's return code from the simulation log.
+
+        Returns:
+            0 for a dry run or if the log reports a successful run,
+            the exit code reported in the log in case of failure, or
+            None if no log exists yet or the log contains neither
+            message.
+        """
+        # Nothing is simulated, hence nothing is logged, in a dry run
+        if self.dry_run:
+            return 0
+        # The log is created only when the simulation is launched
+        if self.log is None:
+            return None
+
+        regex_success = r'\[SUCCESS\] Program finished successfully'
+        regex_fail = r'\[FAILURE\] Finished with exit code\s+(\d+)'
+        # The first line matching either message determines the result
+        with open(self.log, 'r') as f:
+            for line in f:
+                if re.search(regex_success, line):
+                    return 0
+                match = re.search(regex_fail, line)
+                if match:
+                    return int(match.group(1))
+        return None
+
+    def successful(self):
+        """Return whether the simulation was successful.
+
+        On top of the check performed by the base class, i.e. that the
+        simulation return code matches the expected value, the
+        simulator process itself must have terminated correctly.
+        """
+        # There is no simulator process to check in a dry run
+        if self.dry_run:
+            return super().successful()
+        # Not launched yet, or simulator process terminated abnormally
+        if self.process is None or self.process.returncode != 0:
+            return False
+        return super().successful()
+
+
+class QuestaSimulation(QuestaVCSSimulation):
+    """An RTL simulation running on QuestaSim."""
+
+    def __init__(self, **kwargs):
+        """Constructor for the QuestaSimulation class.
+
+        Arguments:
+            kwargs: Arguments passed to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        # Append an empty argument followed by the `-batch` flag to the
+        # simulation command built by the base class
+        self.cmd.extend(['', '-batch'])
+
+
+class VCSSimulation(QuestaVCSSimulation):
+    """An RTL simulation running on VCS.
+
+    Nothing is added to the behaviour inherited from the base class.
+    """
+
+
+class BansheeSimulation(Simulation):
+    """A simulation executed by Banshee.
+
+    The return code of the simulation is returned directly as the
+    return code of the command launching the simulation.
+    """
+
+    def __init__(self, banshee_cfg=None, **kwargs):
+        """Constructor for the BansheeSimulation class.
+
+        Arguments:
+            banshee_cfg: A Banshee config file.
+            kwargs: Arguments passed to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        # Assemble the command piece by piece: executable, fixed flags,
+        # configuration file and, finally, the binary to simulate
+        fixed_flags = ['--no-opt-llvm', '--no-opt-jit']
+        config_args = ['--configuration', str(banshee_cfg)]
+        binary_args = ['--trace', str(self.elf)]
+        self.cmd = ['banshee'] + fixed_flags + config_args + binary_args
+
+
+class CustomSimulation(Simulation):
+    """A simulation which is run through a custom command.
+
+    The custom command generally invokes an RTL simulator binary behind
+    the scenes and executes some additional verification logic after
+    the end of the simulation.
+
+    Custom simulations are considered unsuccessful if the return code
+    of the custom command is non-null. As a custom command can
+    implement any verification logic, there is no reason to implement
+    any additional logic here.
+    """
+
+    def __init__(self, sim_bin=None, cmd=None, **kwargs):
+        """Constructor for the CustomSimulation class.
+
+        Arguments:
+            sim_bin: The simulation binary.
+            cmd: The custom command used to launch the simulation, as
+                a list of arguments. Every argument is a Mako template
+                in which `${sim_bin}`, `${elf}` and `${run_dir}` are
+                substituted at launch time.
+            kwargs: Arguments passed to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        # Values available for substitution in the command template
+        self.dynamic_args = {
+            'sim_bin': str(sim_bin),
+            'elf': str(self.elf),
+            'run_dir': str(self.run_dir)
+        }
+        self.cmd = cmd
+
+    def launch(self, dry_run=None):
+        """Render the custom command and launch the simulation.
+
+        Arguments:
+            dry_run: A preview of the simulation command is displayed
+                without actually launching the simulation. Same
+                meaning, and same position, as in the base class.
+        """
+        # Arguments are converted to strings before rendering, so that
+        # non-string items (e.g. numbers read from a test list) are
+        # accepted as well
+        self.cmd = [Template(str(arg)).render(**self.dynamic_args) for arg in self.cmd]
+        super().launch(dry_run=dry_run)
diff --git a/util/sim/Simulator.py b/util/sim/Simulator.py
new file mode 100644
index 000000000..3d3090573
--- /dev/null
+++ b/util/sim/Simulator.py
@@ -0,0 +1,187 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+from Simulation import QuestaSimulation, VCSSimulation, VerilatorSimulation, BansheeSimulation, \
+ CustomSimulation
+
+
+class Simulator(object):
+    """A factory for Simulation objects.
+
+    Given a test object, as defined e.g. in a test suite specification
+    file, a simulator builds the matching
+    [Simulation][Simulation.Simulation] object.
+
+    A test is described, at minimum, by the binary (`elf`) to simulate
+    and, optionally, by the set of `simulators` it can be run on. For
+    now, rather than by a class of its own, a test is represented by
+    a dictionary with these keys.
+    """
+
+    def __init__(self, name, simulation_cls):
+        """Constructor for the Simulator class.
+
+        Every simulator carries a unique identifier string and is able
+        to construct at least one type of
+        [Simulation][Simulation.Simulation] object.
+
+        Arguments:
+            name: The unique identifier of the simulator.
+            simulation_cls: The [Simulation][Simulation.Simulation]
+                subclass instantiated by default.
+        """
+        self.name, self.simulation_cls = name, simulation_cls
+
+    def supports(self, test):
+        """Check whether the simulator can run a certain test.
+
+        Arguments:
+            test: The test to check. Supported by every simulator
+                if it has no `simulators` key.
+        """
+        if 'simulators' in test:
+            return self.name in test['simulators']
+        return True
+
+    def get_simulation(self, test, simulation_cls=None, **kwargs):
+        """Construct a Simulation object from the specified test.
+
+        Arguments:
+            test: The test for which a Simulation object must be
+                constructed.
+            simulation_cls: The Simulation subclass to instantiate.
+                Falls back to `self.simulation_cls` when omitted.
+        """
+        # Entries of the test take precedence over same-named kwargs
+        for key in ('elf', 'run_dir', 'retcode'):
+            if key in test:
+                kwargs[key] = test[key]
+        cls = self.simulation_cls if simulation_cls is None else simulation_cls
+        return cls(**kwargs)
+
+
+class RTLSimulator(Simulator):
+    """Base class for RTL simulators.
+
+    Launching a simulation on an RTL simulator requires a simulation
+    binary built from an RTL design.
+
+    Some tests, e.g. for verification purposes, must be run through a
+    custom command, which invokes the simulation binary behind the
+    scenes. Such tests carry the custom command (a list of args) under
+    the `cmd` key. For these tests the RTL simulator constructs a
+    [CustomSimulation][Simulation.CustomSimulation] object, passing it
+    the custom command and the simulation binary.
+    """
+
+    def __init__(self, binary, **kwargs):
+        """Constructor for the RTLSimulator class.
+
+        Arguments:
+            binary: The simulation binary.
+            kwargs: Arguments passed to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        self.binary = binary
+
+    def get_simulation(self, test):
+        """Construct the Simulation object for the specified test.
+
+        Arguments:
+            test: The test to simulate.
+        """
+        options = {'sim_bin': self.binary}
+        # Tests with a custom command get a CustomSimulation instead
+        # of the simulator's default Simulation type
+        if 'cmd' in test:
+            options.update(simulation_cls=CustomSimulation, cmd=test['cmd'])
+        return super().get_simulation(test, **options)
+
+
+class VCSSimulator(RTLSimulator):
+    """VCS simulator
+
+    The [RTL simulator][Simulator.RTLSimulator] named `vcs`, which
+    by default produces
+    [VCS simulations][Simulation.VCSSimulation].
+    """
+
+    def __init__(self, binary):
+        """Constructor for the VCSSimulator class.
+
+        Arguments:
+            binary: The VCS simulation binary.
+        """
+        super().__init__(binary=binary, name='vcs', simulation_cls=VCSSimulation)
+
+
+class QuestaSimulator(RTLSimulator):
+    """QuestaSim simulator
+
+    The [RTL simulator][Simulator.RTLSimulator] named `vsim`, which
+    by default produces
+    [QuestaSim simulations][Simulation.QuestaSimulation].
+    """
+
+    def __init__(self, binary):
+        """Constructor for the QuestaSimulator class.
+
+        Arguments:
+            binary: The QuestaSim simulation binary.
+        """
+        super().__init__(binary=binary, name='vsim', simulation_cls=QuestaSimulation)
+
+
+class VerilatorSimulator(RTLSimulator):
+    """Verilator simulator
+
+    The [RTL simulator][Simulator.RTLSimulator] named `verilator`,
+    which by default produces
+    [Verilator simulations][Simulation.VerilatorSimulation].
+    """
+
+    def __init__(self, binary):
+        """Constructor for the VerilatorSimulator class.
+
+        Arguments:
+            binary: The Verilator simulation binary.
+        """
+        super().__init__(binary=binary, name='verilator', simulation_cls=VerilatorSimulation)
+
+
+class BansheeSimulator(Simulator):
+    """Banshee simulator
+
+    The simulator named `banshee`, which produces
+    [Banshee simulations][Simulation.BansheeSimulation].
+    """
+
+    def __init__(self, cfg):
+        """Constructor for the BansheeSimulator class.
+
+        Arguments:
+            cfg: A Banshee config file.
+        """
+        super().__init__(name='banshee', simulation_cls=BansheeSimulation)
+        self.cfg = cfg
+
+    def supports(self, test):
+        """See base class.
+
+        Tests carrying a custom command (`cmd` key) are never
+        supported by the Banshee simulator.
+        """
+        # The base class check is always evaluated first
+        return super().supports(test) and 'cmd' not in test
+
+    def get_simulation(self, test):
+        """Construct a Banshee simulation from the specified test.
+
+        Arguments:
+            test: The test to simulate. The simulator's Banshee config
+                file is handed over to the simulation.
+        """
+        return super().get_simulation(test, banshee_cfg=self.cfg)
diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py
index 664e2624b..2ed260d3f 100644
--- a/util/sim/data_utils.py
+++ b/util/sim/data_utils.py
@@ -9,7 +9,7 @@
def emit_license():
- s = (f"// Copyright {datetime.now().year} ETH Zurich and University of Bologna."
+ s = (f"// Copyright {datetime.now().year} ETH Zurich and University of Bologna.\n"
f"// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n"
f"// SPDX-License-Identifier: Apache-2.0\n\n")
return s
diff --git a/util/sim/elf.py b/util/sim/elf.py
index a46a6764d..27ab5b3e7 100644
--- a/util/sim/elf.py
+++ b/util/sim/elf.py
@@ -36,6 +36,15 @@ def get_symbol_size(self, uid):
def get_symbol_contents(self, uid):
addr = self.get_symbol_address(uid)
size = self.get_symbol_size(uid)
- fpos = list(self.elf.address_offsets(addr, size))[0]
- self.elf.stream.seek(fpos)
- return self.elf.stream.read(size)
+ try:
+ fpos = list(self.elf.address_offsets(addr, size))[0]
+ self.elf.stream.seek(fpos)
+ contents = self.elf.stream.read(size)
+ except IndexError:
+ # We assume all segments in our ELF are of type PT_LOAD and
+ # that the only section whose contents are not stored in
+ # the ELF file is the .bss section. Therefore, whenever
+ # `address_offsets()` fails to return a valid offset into the
+ # file we assume that the address falls in the .bss section.
+ contents = bytearray([0] * size)
+ return contents
diff --git a/util/sim/sim_utils.py b/util/sim/sim_utils.py
new file mode 100755
index 000000000..371d56b81
--- /dev/null
+++ b/util/sim/sim_utils.py
@@ -0,0 +1,288 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+"""Convenience functions to set up a Python simulation framework.
+
+Such a framework enables you to transparently run a software test suite
+on any simulator of choice, provided that the latter is supported by
+the framework. It can be used in CIs, regression testing or to conduct
+systematic evaluation experiments.
+
+Three interfaces are required to implement a common framework:
+
+1. a test suite specification interface to specify the software tests
+2. a command-line interface used to launch the simulations
+3. an interface to the simulators supported by the framework
+
+The framework can be divided into three components each managing one of
+the defined interfaces:
+
+1. a test suite frontend
+2. a command-line frontend
+3. a simulation backend
+
+A fourth component, the core, serves to glue all other components
+together.
+
+The [parser()][sim_utils.parser] function provides a minimum
+command-line interface to control the tool.
+
+The [get_simulations()][sim_utils.get_simulations] function
+provides a common means to implement the test suite frontend. At the
+input interface it assumes a test suite specification file in YAML
+syntax, and returns a list of simulation objects which implement a
+common interface to the simulation backend. This interface is defined
+by the [Simulation][Simulation.Simulation] class.
+
+The core logic of the framework is implemented in the
+[run_simulations()][sim_utils.run_simulations] function. It takes
+the output from [get_simulations()][sim_utils.get_simulations] and
+launches the simulations through the interface to the simulation
+backend.
+
+The simulation backend is implemented by the
+[Simulation][Simulation.Simulation] and
+[Simulator][Simulator.Simulator] classes and their subclasses.
+"""
+
+import argparse
+from termcolor import colored, cprint
+from pathlib import Path
+import os
+import time
+import yaml
+import signal
+import psutil
+
+POLL_PERIOD = 0.2
+
+
+def parser(default_simulator='vsim', simulator_choices=['vsim']):
+    """Default command-line parser for Python simulation frameworks.
+
+    Returns a Python `argparse` parser with common options used to
+    simulate one or multiple binaries on an RTL design. Can be extended
+    by adding arguments to it.
+
+    Args:
+        default_simulator: The simulator to be used when none is
+            specified, or when `--simulator` is passed without a value.
+        simulator_choices: All simulator choices which can be passed on
+            the command-line.
+    """
+    # Argument parsing
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        'testlist',
+        help='File specifying a list of apps to run')
+    parser.add_argument(
+        '--simulator',
+        nargs='?',
+        const=default_simulator,  # a bare `--simulator` must not yield None
+        default=default_simulator,
+        choices=simulator_choices,
+        help='Choose a simulator to run the test with')
+    parser.add_argument(
+        '--run-dir',
+        action='store',
+        default='runs',
+        nargs='?',
+        help='Parent directory of each test run directory')
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        help='Preview the simulation commands which will be run')
+    parser.add_argument(
+        '--early-exit',
+        action='store_true',
+        help='Exit as soon as any test fails')
+    parser.add_argument(
+        '--verbose',
+        action='store_true',
+        help='Activate verbose printing')
+    parser.add_argument(
+        '-j',
+        action='store',
+        dest='n_procs',
+        nargs='?',
+        type=int,
+        default=1,
+        const=os.cpu_count(),
+        help=('Maximum number of tests to run in parallel. '
+              'One if the option is not present. Equal to the number of CPU cores '
+              'if the option is present but not followed by an argument.'))
+    return parser
+
+
+def _resolve_relative_path(base_path, s):
+    """Resolve a relative path string w.r.t. a certain base.
+
+    A string starting with `./` or `../` is interpreted as a path
+    relative to the base path and converted to an absolute path,
+    regardless of whether that path exists. Any other input is
+    returned unchanged.
+
+    Args:
+        base_path: The base path
+        s: The input string
+    """
+    try:
+        anchor = Path(base_path).resolve()  # Absolute path of the base directory
+        candidate = Path(s)
+        # Only explicitly relative strings are treated as paths: bare words
+        # and absolute paths are passed through as they are
+        if not candidate.is_absolute() and s.startswith(("./", "../")):
+            return str((anchor / candidate).resolve())
+        return s
+    except (TypeError, ValueError):
+        # Invalid base_path or s, e.g. not path-like
+        return s
+    except Exception as e:
+        # Any other error is reported, and the input left untouched
+        print(f"An error occurred: {str(e)}")
+        return s
+
+
+def get_simulations(testlist, simulator, run_dir=None):
+    """Create simulation objects from a test list file.
+
+    Args:
+        testlist: Path to a test list file, i.e. a YAML file whose
+            `runs` entry lists the tests.
+        simulator: The simulator to run the tests on. Running a test
+            on a specific simulator defines a simulation.
+        run_dir: A directory under which all tests should be run. If
+            provided, every simulation is assigned a subdirectory of
+            this directory, named after the test.
+
+    Returns:
+        A list with one `Simulation` object for every test which
+        supports the given `simulator`. Each object defines the
+        simulation of a test on that particular `simulator`.
+    """
+    # Load the tests from the test list file
+    testlist_path = Path(testlist).absolute()
+    with open(testlist_path, 'r') as f:
+        tests = yaml.safe_load(f)['runs']
+    # Paths in the test list file are relative to the file's directory
+    testlist_dir = testlist_path.parent
+    for test in tests:
+        test['elf'] = testlist_dir / test['elf']
+        if 'cmd' in test:
+            test['cmd'] = [_resolve_relative_path(testlist_dir, arg) for arg in test['cmd']]
+    # Keep only the tests which the simulator supports
+    simulations = list(map(simulator.get_simulation, filter(simulator.supports, tests)))
+    # Override the run directory of the simulations, if requested
+    if run_dir is not None:
+        for sim in simulations:
+            sim.run_dir = Path(run_dir) / sim.testname
+    return simulations
+
+
+def print_summary(failed_sims, early_exit=False, dry_run=False):
+    """Print a summary of the simulation suite's exit status.
+
+    Args:
+        failed_sims: The failed simulations of the suite.
+        early_exit: Whether the simulation suite terminated early.
+        dry_run: Whether the suite was launched in dry run mode, in
+            which case nothing is printed.
+    """
+    if dry_run:
+        return
+    # Attach the suffix with its own space: no double space without it
+    suffix = ' (early exit)' if early_exit else ''
+    cprint(f'==== Test summary{suffix} ====', attrs=['bold'])
+    for sim in failed_sims:
+        sim.print_status()
+    if not failed_sims:
+        print(f'{colored("All tests passed!", "green")}')
+
+
+def terminate_processes():
+    """Kill every other process in the caller's process group (best effort)."""
+    print('Terminate processes')
+    own_pid, own_pgid = os.getpid(), os.getpgid(0)
+    for proc in psutil.process_iter(['pid']):
+        try:
+            if proc.pid != own_pid and os.getpgid(proc.pid) == own_pgid:
+                os.kill(proc.pid, signal.SIGKILL)
+        except (ProcessLookupError, PermissionError):
+            continue  # process already exited, or is not ours to inspect
+
+
+def get_unique_run_dir(sim, prefix=None):
+    """Get unique run directory for a simulation.
+
+    If the simulation was already assigned a run directory at creation
+    time, None is returned. Otherwise, return a unique run directory
+    based on the testname under an optional prefix directory.
+
+    Args:
+        sim: The simulation for which the run directory is
+            requested.
+        prefix: Directory, given as a `str` or `Path`, under which the
+            run directory is located. It may be common to multiple
+            simulations. Defaults to the current working directory.
+    """
+    if sim.run_dir is not None:
+        return None
+    # Wrap the prefix in a Path, so that plain strings are accepted too
+    parent = Path.cwd() if prefix is None else Path(prefix)
+    return parent / sim.testname
+
+
+def run_simulations(simulations, n_procs=1, dry_run=None, early_exit=False,
+                    verbose=False):
+    """Run simulations defined by a list of `Simulation` objects.
+
+    Args:
+        simulations: A list of `Simulation` objects as returned e.g. by
+            [sim_utils.get_simulations][]. It is left unmodified.
+        n_procs: Maximum number of simulations to run in parallel.
+        dry_run: Passed to `launch()`; if true, no summary is printed.
+        early_exit: Stop as soon as any simulation fails.
+        verbose: Print the log of every failed simulation.
+
+    Returns:
+        The number of failed simulations.
+    """
+    # Register SIGTERM handler, used to terminate all simulation subprocesses
+    signal.signal(signal.SIGTERM, lambda _, __: terminate_processes())
+    # Work on a copy, not to empty the list owned by the caller
+    pending_sims = list(simulations)
+    running_sims = []
+    failed_sims = []
+    early_exit_requested = False
+    try:
+        while (pending_sims or running_sims) and not early_exit_requested:
+            # Spawn a new simulation, if any is left and a process slot is free
+            if pending_sims and len(running_sims) < n_procs:
+                running_sims.append(pending_sims.pop(0))
+                running_sims[-1].launch(dry_run=dry_run)
+            # Remove completed sims from running sims list
+            idcs = [i for i, sim in enumerate(running_sims) if sim.completed()]
+            completed_sims = [running_sims.pop(i) for i in sorted(idcs, reverse=True)]
+            for sim in completed_sims:
+                if sim.successful():
+                    sim.print_status()
+                else:
+                    failed_sims.append(sim)
+                    if verbose:
+                        sim.print_log()
+                    sim.print_status()
+                    if early_exit:
+                        early_exit_requested = True
+                        break
+            time.sleep(POLL_PERIOD)
+    except KeyboardInterrupt:
+        early_exit_requested = True
+
+    # Clean up after early exit
+    if early_exit_requested:
+        terminate_processes()
+    # Print summary (suppressed in dry run mode)
+    print_summary(failed_sims, early_exit_requested, dry_run=dry_run)
+    return len(failed_sims)
diff --git a/util/sim/simulate.py b/util/sim/simulate.py
deleted file mode 100755
index 4e36cc1e1..000000000
--- a/util/sim/simulate.py
+++ /dev/null
@@ -1,270 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2023 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Luca Colagrande
-
-# TODO colluca: timeout feature
-
-import argparse
-import multiprocessing
-from pathlib import Path
-import subprocess
-from termcolor import colored, cprint
-import os
-import re
-import sys
-import time
-import yaml
-
-
-BANSHEE_CFG = 'src/banshee.yaml'
-
-# Tool settings
-SIMULATORS = ['vsim', 'banshee', 'verilator', 'vcs', 'other']
-DEFAULT_SIMULATOR = SIMULATORS[0]
-SIMULATOR_BINS = {
- 'vsim': 'bin/snitch_cluster.vsim',
- 'banshee': 'banshee',
- 'verilator': 'bin/snitch_cluster.vlt',
- 'vcs': 'bin/snitch_cluster.vcs'
-}
-SIMULATOR_CMDS = {
- 'vsim': '{sim_bin} {elf} "" -batch',
- 'banshee': ('{{sim_bin}} --no-opt-llvm --no-opt-jit --configuration {cfg}'
- ' --trace {{elf}} > /dev/null').format(cfg=BANSHEE_CFG),
- 'verilator': '{sim_bin} {elf}',
- 'vcs': '{sim_bin} {elf}'
-}
-
-
-def parse_args():
- # Argument parsing
- parser = argparse.ArgumentParser()
- parser.add_argument(
- 'testlist',
- help='File specifying a list of apps to run')
- parser.add_argument(
- '--simulator',
- action='store',
- nargs='?',
- default=DEFAULT_SIMULATOR,
- choices=SIMULATORS,
- help='Choose a simulator to run the test with')
- parser.add_argument(
- '--sim-bin',
- action='store',
- nargs='?',
- help='Override default path to simulator binary')
- parser.add_argument(
- '--dry-run',
- action='store_true',
- help='Preview the simulation commands which will be run')
- parser.add_argument(
- '--early-exit',
- action='store_true',
- help='Exit as soon as any test fails')
- parser.add_argument(
- '-j',
- action='store',
- dest='n_procs',
- nargs='?',
- type=int,
- default=1,
- const=os.cpu_count(),
- help=('Maximum number of tests to run in parallel. '
- 'One if the option is not present. Equal to the number of CPU cores '
- 'if the option is present but not followed by an argument.'))
- parser.add_argument(
- '--verbose',
- action='store_true',
- help=('Option to print simulation logs when multiple tests are run in parallel.'
- 'Logs are always printed when n_procs == 1'))
- args = parser.parse_args()
- return args
-
-
-# Get tests from a test list file
-def get_tests(testlist_path):
- testlist_path = Path(testlist_path).absolute()
- with open(testlist_path, 'r') as file:
- tests = yaml.safe_load(file)['runs']
- return tests
-
-
-def check_exit_code(test, exit_code):
- if 'exit_code' in test:
- return not (int(test['exit_code']) == int(exit_code))
- else:
- return exit_code
-
-
-def multiple_processes(args):
- return args.n_procs != 1
-
-
-def run_simulation(cmd, simulator, test, quiet=False):
- # Defaults
- result = 1
- log = ''
-
- # Spawn simulation subprocess
- p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
- universal_newlines=True)
-
- # Poll simulation subprocess and log its output
- while p.poll() is None:
- line = p.stdout.readline()
- log += line
- if not quiet:
- print(line, end='', flush=True)
-
- # When simulating with vsim or vcs, we need to parse the simulation
- # log to catch the application's return code
- if simulator in ['vsim', 'vcs']:
- # Capture success
- regex_success = r'\[SUCCESS\] Program finished successfully'
- match_success = re.search(regex_success, line)
- if match_success:
- result = 0
- else:
- regex_fail = r'\[FAILURE\] Finished with exit code\s+(\d+)'
- match = re.search(regex_fail, line)
- if match:
- exit_code = match.group(1)
- result = check_exit_code(test, exit_code)
-
- # Check if the subprocess terminated correctly
- exit_code = p.poll()
- # In Banshee and Verilator the exit code of the Snitch binary is returned
- # through the exit code of the simulation command
- if simulator in ['banshee', 'verilator']:
- result = check_exit_code(test, exit_code)
- # For custom commands the return code is that of the command
- elif simulator == 'other':
- result = exit_code
- # For standard simulation commands the simulated Snitch binary exit
- # code is overriden only if the simulator failed
- else:
- if exit_code != 0:
- result = exit_code
-
- return result, log
-
-
-def run_test(test, args):
- # Extract args
- simulator = args.simulator
- sim_bin = args.sim_bin if args.sim_bin else SIMULATOR_BINS[simulator]
- dry_run = args.dry_run
- testlist = args.testlist
- quiet = multiple_processes(args)
-
- # Check if simulator is supported for this test
- if 'simulators' in test:
- if simulator not in test['simulators']:
- return (0, '')
-
- # Construct path to executable
- elf = Path(test['elf'])
- if testlist:
- elf = Path(testlist).absolute().parent / elf
- cprint(f'Run test {colored(elf, "cyan")}', attrs=["bold"])
-
- # Construct simulation command (override only supported for RTL)
- if 'cmd' in test and simulator != 'banshee':
- cmd = test['cmd']
- cmd = cmd.format(sim_bin=sim_bin, elf=elf, simulator=simulator)
- simulator = 'other'
- else:
- cmd = SIMULATOR_CMDS[simulator]
- cmd = cmd.format(sim_bin=sim_bin, elf=elf)
- if not quiet:
- print(f'$ {cmd}', flush=True)
-
- # Run simulation
- result = 0
- log = ''
- if not dry_run:
- result, log = run_simulation(cmd, simulator, test, quiet)
-
- # Report failure or success
- if result != 0:
- cprint(f'{elf} test failed', 'red', attrs=['bold'], flush=True)
- else:
- cprint(f'{elf} test passed', 'green', attrs=['bold'], flush=True)
-
- return (result, log)
-
-
-def print_failed_test(test):
- print(f'{colored(test["elf"], "cyan")} test {colored("failed", "red")}')
-
-
-def print_test_summary(failed_tests, args):
- if not args.dry_run:
- header = f'\n==== Test summary {"(early exit)" if args.early_exit else ""} ===='
- cprint(header, attrs=['bold'])
- if failed_tests:
- for failed_test in failed_tests:
- print_failed_test(failed_test)
- else:
- print(f'{colored("All tests passed!", "green")}')
-
-
-def run_tests(tests, args):
-
- # Create a process Pool
- with multiprocessing.Pool(args.n_procs) as pool:
-
- # Create a shared object which parent and child processes can access
- # concurrently to terminate the pool early as soon as one process fails
- exit_early = multiprocessing.Value('B')
- exit_early.value = 0
-
- # Define callback for early exit
- def completion_callback(return_value):
- result = return_value[0]
- log = return_value[1]
- if args.early_exit and result != 0:
- exit_early.value = 1
- # Printing the log all at once here, rather than line-by-line
- # in run_simulation, ensures that the logs of different processes
- # are not interleaved in stdout.
- # However, as we prefer line-by-line printing when a single process
- # is used, we have to make sure we don't print twice.
- if args.verbose and multiple_processes(args):
- print(log)
-
- # Queue tests to process pool
- results = []
- for test in tests:
- result = pool.apply_async(run_test, args=(test, args), callback=completion_callback)
- results.append(result)
-
- # Wait for all tests to complete
- running = range(len(tests))
- while len(running) != 0 and not exit_early.value:
- time.sleep(1)
- running = [i for i in running if not results[i].ready()]
-
- # Query test results
- failed_tests = []
- for test, result in zip(tests, results):
- if result.ready() and result.get()[0] != 0:
- failed_tests.append(test)
-
- print_test_summary(failed_tests, args)
-
- return len(failed_tests)
-
-
-def main():
- args = parse_args()
- tests = get_tests(args.testlist)
- return run_tests(tests, args)
-
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/util/trace/perf_csv.py b/util/trace/perf_csv.py
index 450758c70..f26e242e2 100755
--- a/util/trace/perf_csv.py
+++ b/util/trace/perf_csv.py
@@ -17,7 +17,7 @@
import pandas as pd
-HARTID_REGEX = r'\D*(\d*)\D*'
+HARTID_REGEX = r'hart_([0-9a-f]+)_perf\.json'  # group 1 captures the hart ID digits
def main():