diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 01fff63fb..26e4ee4ff 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,7 +2,7 @@ # Unless a later match takes precedence, global owners below will be # requested for review when someone opens a pull request. -* @paulsc96 @colluca +* @paulsc96 @colluca @fischeti hw/snitch_cluster @paulsc96 @lucabertaccini hw/snitch_dma @paulsc96 @thommythomaso diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f2c3e692a..f8f87b3f8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,19 +36,19 @@ jobs: submodules: 'recursive' - name: Build Software run: | + bender vendor init make -C target/snitch_cluster sw - name: Build Hardware run: | make -C target/snitch_cluster bin/snitch_cluster.vlt - name: Run Tests working-directory: target/snitch_cluster - run: |- - ../../util/sim/simulate.py sw/run.yaml --simulator verilator -j \ - --verbose + run: | + ./run.py sw/run.yaml --simulator verilator -j - ############################################ + ######################################### # Build SW on Snitch Cluster w/ Banshee # - ############################################ + ######################################### sw-snitch-cluster-banshee: name: Simulate SW on Snitch Cluster w/ Banshee @@ -61,11 +61,11 @@ jobs: submodules: 'recursive' - name: Build Software run: | + bender vendor init make -C target/snitch_cluster SELECT_RUNTIME=banshee sw - name: Run Tests env: SNITCH_LOG: info working-directory: target/snitch_cluster - run: |- - ../../util/sim/simulate.py sw/run.yaml --simulator banshee -j \ - --verbose + run: | + ./run.py sw/run.yaml --simulator banshee -j diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 610c271ea..18cd5d4aa 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -5,15 +5,18 @@ variables: GIT_STRATEGY: clone GIT_SUBMODULE_STRATEGY: recursive + # Enable colors in CI terminal + TERM: ansi + FORCE_COLOR: 1 + # Configure environment PYTHON: 
/usr/local/anaconda3-2022.05/bin/python3 BENDER: bender-0.27.1 CC: gcc-9.2.0 CXX: g++-9.2.0 - VCS: vcs-2020.12 - VERILATOR: verilator-4.110 - QUESTA: questa-2022.3 + VCS_SEPP: vcs-2020.12 + VERILATOR_SEPP: verilator-4.110 + QUESTA_SEPP: questa-2022.3 LLVM_BINROOT: /usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin - CLANG: /usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/clang CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER: /usr/pack/gcc-9.2.0-af/linux-x64/bin/gcc LLVM_SYS_120_PREFIX: /usr/pack/llvm-12.0.1-af CMAKE: cmake-3.18.1 @@ -21,7 +24,13 @@ variables: before_script: - $PYTHON -m venv .venv - source .venv/bin/activate - - pip install -r python-requirements.txt + # Unpack packages in a local temporary directory which can be safely cleaned + # after installation. Also protects against "No space left on device" errors + # occurring when the /tmp folder is filled by other processes. + - mkdir tmp + - TMPDIR=tmp pip install -r python-requirements.txt + - rm -rf tmp + - $BENDER vendor init ############## # Build docs # @@ -79,8 +88,8 @@ snitch-ip-tests: - tcdm_interface script: - cd hw/$IP - - $QUESTA ./util/compile.sh - - $QUESTA ./util/run_vsim.sh + - ./util/compile.sh + - ./util/run_vsim.sh ######################## # Snitch cluster tests # @@ -89,29 +98,26 @@ snitch-ip-tests: # Verilator snitch-cluster-vlt: needs: [snitch-cluster-sw] - # yamllint disable rule:line-length script: - cd target/snitch_cluster - - $VERILATOR make bin/snitch_cluster.vlt - - $VERILATOR ../../util/sim/simulate.py sw/run.yaml --simulator verilator -j --verbose - # yamllint enable rule:line-length + - make bin/snitch_cluster.vlt + - ./run.py sw/run.yaml --simulator verilator -j --run-dir runs/vlt # VCS snitch-cluster-vcs: needs: [snitch-cluster-sw] script: - cd target/snitch_cluster - - $VCS make bin/snitch_cluster.vcs - - $VCS ../../util/sim/simulate.py sw/run.yaml --simulator vcs -j --verbose + - make bin/snitch_cluster.vcs + - ./run.py sw/run.yaml --simulator vcs -j --run-dir runs/vcs # Questa 
snitch-cluster-vsim: needs: [snitch-cluster-sw] script: - cd target/snitch_cluster - - $QUESTA make bin/snitch_cluster.vsim - - $QUESTA ../../util/sim/simulate.py sw/run.yaml --simulator vsim -j - --verbose + - make bin/snitch_cluster.vsim + - ./run.py sw/run.yaml --simulator vsim -j --run-dir runs/vsim # Banshee snitch-cluster-banshee: @@ -127,4 +133,4 @@ snitch-cluster-banshee: - cd banshee - cargo install --debug --path . - cd ../target/snitch_cluster - - ../../util/sim/simulate.py sw/run.yaml --simulator banshee -j --verbose + - ./run.py sw/run.yaml --simulator banshee -j --run-dir runs/banshee diff --git a/Bender.yml b/Bender.yml index 732788d0d..84dad47e8 100644 --- a/Bender.yml +++ b/Bender.yml @@ -22,7 +22,7 @@ dependencies: axi: { git: https://github.com/pulp-platform/axi, version: 0.39.0 } axi_riscv_atomics: { git: https://github.com/pulp-platform/axi_riscv_atomics, version: 0.6.0 } common_cells: { git: https://github.com/pulp-platform/common_cells, version: 1.28.0 } - FPnew: { git: https://github.com/openhwgroup/cvfpu, rev: 1202ca3 } # TODO: feature branch `feature/expanding_sdotp`; get merged! 
+ FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 } register_interface: { git: https://github.com/pulp-platform/register_interface, version: 0.4.2 } tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.11 } riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 } @@ -37,13 +37,40 @@ vendor_package: - "Makefile" - ".gitignore" - "README" - - "src/math/tanh.c" + - "src/math/ceil.c" + - "src/math/ceilf.c" + - "src/math/ceill.c" - "src/math/expm1.c" + - "src/math/expf.c" + - "src/math/exp2f_data.c" + - "src/math/exp2f_data.h" + - "src/math/log2.c" + - "src/math/log2_data.c" + - "src/math/log2_data.h" + - "src/math/log2f.c" + - "src/math/log2f_data.c" + - "src/math/log2f_data.h" + - "src/math/__math_divzero.c" + - "src/math/__math_invalid.c" + - "src/math/__math_invalidf.c" + - "src/math/__math_invalidl.c" + - "src/math/__math_oflow.c" + - "src/math/__math_oflowf.c" + - "src/math/__math_uflow.c" + - "src/math/__math_uflowf.c" + - "src/math/__math_xflow.c" + - "src/math/__math_xflowf.c" + - "src/math/sqrt.c" + - "src/math/sqrtf.c" + - "src/math/sqrt_data.c" + - "src/math/sqrt_data.h" + - "src/math/tanh.c" - "src/internal/libm.h" - "src/include/features.h" - "include/endian.h" - "include/math.h" - "include/features.h" + - "include/float.h" - "include/alltypes.h.in" - "arch/riscv64/bits/alltypes.h.in" - "arch/riscv64/bits/float.h" diff --git a/Makefile b/Makefile index 06a1c662f..087204634 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,8 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 -REGGEN = $(shell bender path register_interface)/vendor/lowrisc_opentitan/util/regtool.py +BENDER ?= bender +REGGEN = $(shell $(BENDER) path register_interface)/vendor/lowrisc_opentitan/util/regtool.py GENERATED_DOCS_DIR = docs/generated GENERATED_DOC_SRCS = $(GENERATED_DOCS_DIR)/peripherals.md @@ -16,9 +17,7 @@ clean: clean-docs doc-srcs: $(GENERATED_DOC_SRCS) docs: doc-srcs - @if mkdocs build | grep -q "ERROR"; then \ - exit 1; \ - fi + mkdocs build clean-docs: rm -rf $(GENERATED_DOCS_DIR) diff --git a/README.md b/README.md index d5fe00f57..1f7b6459c 100644 --- a/README.md +++ b/README.md @@ -86,19 +86,39 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:
-Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra +Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra

``` -@inproceedings{scheffler2021indirect, +@article{scheffler2023sparsessr, author={Scheffler, Paul and Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca}, - booktitle={2021 Design, Automation & Test in Europe Conference & Exhibition (DATE)}, - title={Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra}, - year={2021}, + journal={IEEE Transactions on Parallel and Distributed Systems}, + title={Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra}, + year={2023}, + volume={34}, + number={12}, + pages={3147-3161}, + doi={10.1109/TPDS.2023.3322029} +} +``` + +

+
+ +
+A High-performance, Energy-efficient Modular DMA Engine Architecture +

+ +``` +@ARTICLE{benz2023idma, + author={Benz, Thomas and Rogenmoser, Michael and Scheffler, Paul and Riedel, Samuel and Ottaviano, Alessandro and Kurth, Andreas and Hoefler, Torsten and Benini, Luca}, + journal={IEEE Transactions on Computers}, + title={A High-performance, Energy-efficient Modular DMA Engine Architecture}, + year={2023}, volume={}, number={}, - pages={1787-1792} -} + pages={1-14}, + doi={10.1109/TC.2023.3329930}} ```

diff --git a/apt-requirements.txt b/apt-requirements.txt index 5bb7b560d..15f12e8b7 100644 --- a/apt-requirements.txt +++ b/apt-requirements.txt @@ -6,8 +6,4 @@ clang-format device-tree-compiler graphviz -python3 -python3-pip -python3-setuptools -python3-wheel tar diff --git a/docs/publications.md b/docs/publications.md index 6f14daa64..e4c86b4c6 100644 --- a/docs/publications.md +++ b/docs/publications.md @@ -42,19 +42,39 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:
-Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra +Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra

``` -@inproceedings{scheffler2021indirect, +@article{scheffler2023sparsessr, author={Scheffler, Paul and Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca}, - booktitle={2021 Design, Automation & Test in Europe Conference & Exhibition (DATE)}, - title={Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra}, - year={2021}, + journal={IEEE Transactions on Parallel and Distributed Systems}, + title={Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra}, + year={2023}, + volume={34}, + number={12}, + pages={3147-3161}, + doi={10.1109/TPDS.2023.3322029} +} +``` + +

+
+ +
+A High-performance, Energy-efficient Modular DMA Engine Architecture +

+ +``` +@ARTICLE{benz2023idma, + author={Benz, Thomas and Rogenmoser, Michael and Scheffler, Paul and Riedel, Samuel and Ottaviano, Alessandro and Kurth, Andreas and Hoefler, Torsten and Benini, Luca}, + journal={IEEE Transactions on Computers}, + title={A High-performance, Energy-efficient Modular DMA Engine Architecture}, + year={2023}, volume={}, number={}, - pages={1787-1792} -} + pages={1-14}, + doi={10.1109/TC.2023.3329930}} ```

diff --git a/docs/requirements.txt b/docs/requirements.txt index 6a766858d..e913931e3 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,7 +4,8 @@ # Keep sorted. mkdocs -# Last version compatible with python-3.6 (default on Ubuntu 18.04) -mkdocs-material <= 8.2.11 +mkdocs-material mkdocs-include-markdown-plugin -mkdocs-macros-plugin \ No newline at end of file +mkdocs-macros-plugin +mkdocstrings +mkdocstrings-python diff --git a/docs/rm/custom_instructions.md b/docs/rm/custom_instructions.md index 2a79b757a..f7fcfbd0d 100644 --- a/docs/rm/custom_instructions.md +++ b/docs/rm/custom_instructions.md @@ -37,7 +37,7 @@ The FREP instruction has the following signature: | max_inst | max_rpt | stagger_max | stagger_mask | 0 | OP-CUSTOM1 | FREP.I | | max_inst | max_rpt | stagger_max | stagger_mask | 1 | OP-CUSTOM1 | FREP.O | -FREP.I and FREP.O repeat the *max_inst* instructions following the FREP instruction for *max_rpt + 1* times. The FREP.I instruction (*I* stands for inner) repeats every instruction the specified number of times and moves on to executing and repeating the next. The FREP.O instruction (*O* stands for outer) repeats the whole sequence of instructions *max_rpt + 1* times. Register staggering can be enabled and configured via the *stagger_mask* and *stagger_max* immediates. A detailed explanation of their use can be found in the Snitch [paper](/publications). +FREP.I and FREP.O repeat the *max_inst + 1* instructions following the FREP instruction for *max_rpt + 1* times. The FREP.I instruction (*I* stands for inner) repeats every instruction the specified number of times and moves on to executing and repeating the next. The FREP.O instruction (*O* stands for outer) repeats the whole sequence of instructions *max_rpt + 1* times. Register staggering can be enabled and configured via the *stagger_mask* and *stagger_max* immediates. A detailed explanation of their use can be found in the Snitch [paper](/publications). 
The assembly instruction signature follows: diff --git a/docs/rm/sim/Simulation.md b/docs/rm/sim/Simulation.md new file mode 100644 index 000000000..6671fb590 --- /dev/null +++ b/docs/rm/sim/Simulation.md @@ -0,0 +1 @@ +::: Simulation diff --git a/docs/rm/sim/Simulator.md b/docs/rm/sim/Simulator.md new file mode 100644 index 000000000..56f03482d --- /dev/null +++ b/docs/rm/sim/Simulator.md @@ -0,0 +1 @@ +::: Simulator diff --git a/docs/rm/sim/sim_utils.md b/docs/rm/sim/sim_utils.md new file mode 100644 index 000000000..876e5fac4 --- /dev/null +++ b/docs/rm/sim/sim_utils.md @@ -0,0 +1 @@ +::: sim_utils \ No newline at end of file diff --git a/hw/mem_interface/util/compile.sh b/hw/mem_interface/util/compile.sh index 73ccc7fca..1a3678cfa 100755 --- a/hw/mem_interface/util/compile.sh +++ b/hw/mem_interface/util/compile.sh @@ -10,11 +10,11 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$BENDER script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ --vlog-arg="+cover=sbecft" \ > compile.tcl echo 'return 0' >> compile.tcl -$VSIM -c -do 'exit -code [source compile.tcl]' +$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]' diff --git a/hw/mem_interface/util/run_vsim.sh b/hw/mem_interface/util/run_vsim.sh index e30929642..45a6b77e1 100755 --- a/hw/mem_interface/util/run_vsim.sh +++ b/hw/mem_interface/util/run_vsim.sh @@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) [ ! 
-z "$VSIM" ] || VSIM=vsim call_vsim() { - echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1 + echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1 grep "Errors: 0," vsim.log } diff --git a/hw/reqrsp_interface/util/compile.sh b/hw/reqrsp_interface/util/compile.sh index 73ccc7fca..1a3678cfa 100755 --- a/hw/reqrsp_interface/util/compile.sh +++ b/hw/reqrsp_interface/util/compile.sh @@ -10,11 +10,11 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$BENDER script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ --vlog-arg="+cover=sbecft" \ > compile.tcl echo 'return 0' >> compile.tcl -$VSIM -c -do 'exit -code [source compile.tcl]' +$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]' diff --git a/hw/reqrsp_interface/util/run_vsim.sh b/hw/reqrsp_interface/util/run_vsim.sh index e7fe59fb9..9eeee2e14 100755 --- a/hw/reqrsp_interface/util/run_vsim.sh +++ b/hw/reqrsp_interface/util/run_vsim.sh @@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) [ ! 
-z "$VSIM" ] || VSIM=vsim call_vsim() { - echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1 + echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1 grep "Errors: 0," vsim.log } diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 2d38c63b3..5bb3b1b48 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -487,6 +487,7 @@ module snitch_cc #( .trace_port_o ( fpu_trace ), .sequencer_tracer_port_o ( fpu_sequencer_trace ), // pragma translate_on + .hart_id_i ( hart_id_i ), .acc_req_i ( acc_snitch_req ), .acc_req_valid_i ( acc_qvalid ), .acc_req_ready_o ( acc_qready ), diff --git a/hw/snitch_cluster/src/snitch_fp_ss.sv b/hw/snitch_cluster/src/snitch_fp_ss.sv index 0b994e19e..fc75a386c 100644 --- a/hw/snitch_cluster/src/snitch_fp_ss.sv +++ b/hw/snitch_cluster/src/snitch_fp_ss.sv @@ -42,6 +42,7 @@ module snitch_fp_ss import snitch_pkg::*; #( output fpu_trace_port_t trace_port_o, output fpu_sequencer_trace_port_t sequencer_tracer_port_o, // pragma translate_on + input logic [31:0] hart_id_i, // Accelerator Interface - Slave input acc_req_t acc_req_i, input logic acc_req_valid_i, @@ -2509,6 +2510,7 @@ module snitch_fp_ss import snitch_pkg::*; #( ) i_fpu ( .clk_i , .rst_ni ( ~rst_i ), + .hart_id_i ( hart_id_i ), .operands_i ( op ), .rnd_mode_i ( fpu_rnd_mode ), .op_i ( fpu_op ), diff --git a/hw/snitch_cluster/src/snitch_fpu.sv b/hw/snitch_cluster/src/snitch_fpu.sv index 44d28df45..ed7958edc 100644 --- a/hw/snitch_cluster/src/snitch_fpu.sv +++ b/hw/snitch_cluster/src/snitch_fpu.sv @@ -19,6 +19,7 @@ module snitch_fpu import snitch_pkg::*; #( input logic clk_i, input logic rst_ni, // Input signals + input logic [31:0] hart_id_i, input logic [2:0][FLEN-1:0] operands_i, input fpnew_pkg::roundmode_e rnd_mode_i, input fpnew_pkg::operation_e op_i, @@ -99,12 +100,15 @@ module snitch_fpu import 
snitch_pkg::*; #( fpnew_top #( // FPU configuration - .Features ( FPUFeatures ), - .Implementation ( FPUImplementation ), - .TagType ( logic[6:0] ) + .Features ( FPUFeatures ), + .Implementation ( FPUImplementation ), + .TagType ( logic[6:0] ), + .CompressedVecCmpResult ( 1 ), + .StochasticRndImplementation ( fpnew_pkg::DEFAULT_RSR ) ) i_fpu ( .clk_i , .rst_ni , + .hart_id_i ( hart_id_i ), .operands_i ( fpu_in_q.operands ), .rnd_mode_i ( fpu_in_q.rnd_mode ), .op_i ( fpu_in_q.op ), @@ -114,6 +118,7 @@ module snitch_fpu import snitch_pkg::*; #( .int_fmt_i ( fpu_in_q.int_fmt ), .vectorial_op_i ( fpu_in_q.vectorial_op ), .tag_i ( fpu_in_q.tag ), + .simd_mask_i ( '1 ), .in_valid_i ( in_valid_q ), .in_ready_o ( in_ready_q ), .flush_i ( 1'b0 ), diff --git a/hw/snitch_cluster/util/compile.sh b/hw/snitch_cluster/util/compile.sh index 73ccc7fca..1a3678cfa 100755 --- a/hw/snitch_cluster/util/compile.sh +++ b/hw/snitch_cluster/util/compile.sh @@ -10,11 +10,11 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$BENDER script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ --vlog-arg="+cover=sbecft" \ > compile.tcl echo 'return 0' >> compile.tcl -$VSIM -c -do 'exit -code [source compile.tcl]' +$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]' diff --git a/hw/snitch_cluster/util/run_vsim.sh b/hw/snitch_cluster/util/run_vsim.sh index e9298efed..00d08aee3 100755 --- a/hw/snitch_cluster/util/run_vsim.sh +++ b/hw/snitch_cluster/util/run_vsim.sh @@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) [ ! 
-z "$VSIM" ] || VSIM=vsim call_vsim() { - echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1 + echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1 grep "Errors: 0," vsim.log } diff --git a/hw/snitch_icache/util/compile.sh b/hw/snitch_icache/util/compile.sh index 73ccc7fca..1a3678cfa 100755 --- a/hw/snitch_icache/util/compile.sh +++ b/hw/snitch_icache/util/compile.sh @@ -10,11 +10,11 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$BENDER script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ --vlog-arg="+cover=sbecft" \ > compile.tcl echo 'return 0' >> compile.tcl -$VSIM -c -do 'exit -code [source compile.tcl]' +$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]' diff --git a/hw/snitch_icache/util/run_vsim.sh b/hw/snitch_icache/util/run_vsim.sh index 94671daf5..42cc47f94 100755 --- a/hw/snitch_icache/util/run_vsim.sh +++ b/hw/snitch_icache/util/run_vsim.sh @@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) [ ! -z "$VSIM" ] || VSIM=vsim call_vsim() { - echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1 + echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1 grep "Errors: 0," vsim.log } diff --git a/hw/snitch_ssr/util/compile.sh b/hw/snitch_ssr/util/compile.sh index 73ccc7fca..1a3678cfa 100755 --- a/hw/snitch_ssr/util/compile.sh +++ b/hw/snitch_ssr/util/compile.sh @@ -10,11 +10,11 @@ set -e [ ! 
-z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$BENDER script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ --vlog-arg="+cover=sbecft" \ > compile.tcl echo 'return 0' >> compile.tcl -$VSIM -c -do 'exit -code [source compile.tcl]' +$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]' diff --git a/hw/tcdm_interface/util/compile.sh b/hw/tcdm_interface/util/compile.sh index 73ccc7fca..1a3678cfa 100755 --- a/hw/tcdm_interface/util/compile.sh +++ b/hw/tcdm_interface/util/compile.sh @@ -10,11 +10,11 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$BENDER script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ --vlog-arg="+cover=sbecft" \ > compile.tcl echo 'return 0' >> compile.tcl -$VSIM -c -do 'exit -code [source compile.tcl]' +$QUESTA_SEPP $VSIM -c -do 'exit -code [source compile.tcl]' diff --git a/hw/tcdm_interface/util/run_vsim.sh b/hw/tcdm_interface/util/run_vsim.sh index 078ae72a8..6f10155d0 100755 --- a/hw/tcdm_interface/util/run_vsim.sh +++ b/hw/tcdm_interface/util/run_vsim.sh @@ -12,7 +12,7 @@ ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) [ ! 
-z "$VSIM" ] || VSIM=vsim call_vsim() { - echo "log -r /*; run -all" | $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1 + echo "log -r /*; run -all" | $QUESTA_SEPP $VSIM -c -coverage -voptargs='+acc +cover=sbecft' "$@" | tee vsim.log 2>&1 grep "Errors: 0," vsim.log } diff --git a/mkdocs.yml b/mkdocs.yml index 3f9595b0a..70d213601 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -22,6 +22,10 @@ markdown_extensions: emoji_generator: !!python/name:materialx.emoji.to_svg plugins: - include-markdown + - mkdocstrings: + handlers: + python: + paths: [util/sim] - macros: on_error_fail: true use_directory_urls: false @@ -49,10 +53,15 @@ nav: - Custom Instructions: rm/custom_instructions.md # - Solder: rm/solder.md - Software: - - Pages: runtime/Pages/index.md - - Files: runtime/Files/index.md - - Classes: runtime/Classes/index.md - - Examples: runtime/Examples/index.md - - Modules: runtime/Modules/index.md - - Namespaces: runtime/Namespaces/index.md + - Simulation Utilities: + - sim_utils: rm/sim/sim_utils.md + - rm/sim/Simulation.md + - rm/sim/Simulator.md + - Snitch Runtime: + - Pages: runtime/Pages/index.md + - Files: runtime/Files/index.md + - Classes: runtime/Classes/index.md + - Examples: runtime/Examples/index.md + - Modules: runtime/Modules/index.md + - Namespaces: runtime/Namespaces/index.md - Publications: publications.md diff --git a/python-requirements.txt b/python-requirements.txt index d426cf140..6db0bf03f 100644 --- a/python-requirements.txt +++ b/python-requirements.txt @@ -19,6 +19,7 @@ pytablewriter termcolor pandas pyelftools +psutil -r docs/requirements.txt -r sw/dnn/requirements.txt diff --git a/sw/blas/gemm/Makefile b/sw/blas/gemm/Makefile index 604556ed1..9605f07d7 100644 --- a/sw/blas/gemm/Makefile +++ b/sw/blas/gemm/Makefile @@ -9,16 +9,18 @@ MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) DATA_DIR := $(realpath $(MK_DIR)/data) SRC_DIR := $(realpath $(MK_DIR)/src) +DATA_CFG ?= $(DATA_DIR)/params.hjson +SECTION ?= + APP 
?= gemm SRCS ?= $(realpath $(SRC_DIR)/main.c) INCDIRS ?= $(DATA_DIR) $(SRC_DIR) -DATA_CFG ?= $(DATA_DIR)/params.hjson DATAGEN_PY = $(DATA_DIR)/datagen.py DATA_H = $(DATA_DIR)/data.h $(DATA_H): $(DATAGEN_PY) $(DATA_CFG) - $< -c $(DATA_CFG) > $@ + $< -c $(DATA_CFG) --section="$(SECTION)" > $@ .PHONY: clean-data clean diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py index c7c3fb9e0..25e2dca57 100755 --- a/sw/blas/gemm/data/datagen.py +++ b/sw/blas/gemm/data/datagen.py @@ -39,9 +39,13 @@ 'fp8alt': {'exp': 4, 'mant': 3} } +# AXI splits bursts crossing 4KB address boundaries. To minimize +# the occurrence of these splits the data should be aligned to 4KB +BURST_ALIGNMENT = 4096 -def golden_model(a, b, alpha, c): - return np.matmul(a, b) + alpha * c + +def golden_model(alpha, a, b, beta, c): + return alpha * np.matmul(a, b) + beta * c def emit_header(**kwargs): @@ -73,11 +77,14 @@ def emit_header(**kwargs): * (1.0 + mantissa_b.astype(np.double) / (2**2)) _c = ((-1.0)**sign_c.astype(np.double))*(2.0**(exponent_c.astype(np.double)-15.0)) \ * (1.0 + mantissa_c.astype(np.double) / (2**2)) - result = np.matmul(_a, _b) + kwargs['alpha'] * _c + result = golden_model(1, _a, _b, kwargs['beta'], _c) a = sign_a << 7 | exponent_a << FP8_FORMATS['fp8']['mant'] | mantissa_a b = sign_b << 7 | exponent_b << FP8_FORMATS['fp8']['mant'] | mantissa_b c = sign_c << 7 | exponent_c << FP8_FORMATS['fp8']['mant'] | mantissa_c else: + a = np.random.rand(kwargs['M'], kwargs['K']).astype(dtype) + b = np.random.rand(kwargs['K'], kwargs['N']).astype(dtype) + c = np.random.rand(kwargs['M'], kwargs['N']).astype(dtype) if kwargs['linspace']: a = np.linspace(0.1, kwargs['M'] * kwargs['K'] + 0.1 -1, num=kwargs['M'] * kwargs['K']).reshape((kwargs['M'], kwargs['K'])).astype(dtype) b = np.linspace(0.2, kwargs['K'] * kwargs['N'] + 0.2 -1, num=kwargs['K'] * kwargs['N']).reshape((kwargs['K'], kwargs['N'])).astype(dtype) @@ -86,7 +93,7 @@ def emit_header(**kwargs): a = 
np.random.rand(kwargs['M'], kwargs['K']).astype(dtype) b = np.random.rand(kwargs['K'], kwargs['N']).astype(dtype) c = np.random.rand(kwargs['M'], kwargs['N']).astype(dtype) - result = golden_model(a, b, kwargs['alpha'], c) + result = golden_model(1, a, b, kwargs['beta'], c) # Store matrices in transposed form if requested a = a.T if kwargs['ta'] else a @@ -98,12 +105,15 @@ def emit_header(**kwargs): data_str += [format_scalar_definition('uint32_t', 'K', kwargs['K'])] data_str += [format_scalar_definition('uint32_t', 'TA', int(kwargs['ta']))] data_str += [format_scalar_definition('uint32_t', 'TB', int(kwargs['tb']))] - data_str += [format_scalar_definition('uint32_t', 'ALPHA', kwargs['alpha'])] + data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])] data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)] data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten())] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten())] - data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten())] + data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(), + alignment=BURST_ALIGNMENT, section=kwargs['section'])] + data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(), + alignment=BURST_ALIGNMENT, section=kwargs['section'])] + data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(), + alignment=BURST_ALIGNMENT, section=kwargs['section'])] if kwargs['prec'] == 8: result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten()) else: @@ -125,11 +135,16 @@ def main(): required=True, help='Select param config file kernel' ) + parser.add_argument( + '--section', + type=str, + help='Section to store matrices in') args = parser.parse_args() # Load param config file with 
args.cfg.open() as f: param = hjson.loads(f.read()) + param['section'] = args.section # Emit header file print(emit_header(**param)) diff --git a/sw/blas/gemm/data/params.hjson b/sw/blas/gemm/data/params.hjson index 63cdefd29..1428d1c99 100644 --- a/sw/blas/gemm/data/params.hjson +++ b/sw/blas/gemm/data/params.hjson @@ -8,7 +8,7 @@ M: 192, N: 16, K: 16, - alpha: 0, + beta: 0, ta: false, tb: true, // must be true for SIMD prec: 64, diff --git a/sw/blas/gemm/verify.py b/sw/blas/gemm/verify.py index 3bae7f801..b6f886b7b 100755 --- a/sw/blas/gemm/verify.py +++ b/sw/blas/gemm/verify.py @@ -37,22 +37,27 @@ def main(): a = np.array(bytes_to_doubles(elf.get_symbol_contents('a'))) b = np.array(bytes_to_doubles(elf.get_symbol_contents('b'))) c = np.array(bytes_to_doubles(elf.get_symbol_contents('c'))) - alpha = bytes_to_uint32s(elf.get_symbol_contents('ALPHA'))[0] + beta = bytes_to_uint32s(elf.get_symbol_contents('BETA'))[0] m = bytes_to_uint32s(elf.get_symbol_contents('M'))[0] n = bytes_to_uint32s(elf.get_symbol_contents('N'))[0] k = bytes_to_uint32s(elf.get_symbol_contents('K'))[0] tb = bytes_to_uint32s(elf.get_symbol_contents('TB'))[0] a = np.reshape(a, (m, k)) - b = np.reshape(b, (k, n)) if tb: + b = np.reshape(b, (n, k)) b = b.transpose() + else: + b = np.reshape(b, (k, n)) c = np.reshape(c, (m, n)) # Verify results - c_golden = golden_model(a, b, alpha, c).flatten() + c_golden = golden_model(1, a, b, beta, c).flatten() absolute_err = np.absolute(c_golden - c_actual) fail = np.any(absolute_err > ERR_THRESHOLD) + if (fail): + verification.dump_results_to_csv([c_golden, c_actual, absolute_err], + Path.cwd() / 'gemm_results.csv') return int(fail) diff --git a/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch b/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch new file mode 100644 index 000000000..068851f3c --- /dev/null +++ b/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch @@ -0,0 +1,125 @@ +From 
91c1b48e44629a80bdc1832111707c051ab0b3b2 Mon Sep 17 00:00:00 2001 +From: Luca Colagrande +Date: Mon, 23 Oct 2023 14:30:18 +0200 +Subject: [PATCH] sw/math: Refactor to proper library + +The previous header-only library style led to conflicts on certain +defines (for instance `N`) defined in both math library sources and +application sources. +--- + Makefile | 77 +++++++++++++++++++++++++++++++++++++++++++++++--- + include/math.h | 3 -- + 2 files changed, 73 insertions(+), 7 deletions(-) + +diff --git a/Makefile b/Makefile +index 1327953..a6f7a1a 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,17 +1,86 @@ +-BITS_DIR = include/bits ++# Copyright 2023 ETH Zurich and University of Bologna. ++# Licensed under the Apache License, Version 2.0, see LICENSE for details. ++# SPDX-License-Identifier: Apache-2.0 ++# ++# Luca Colagrande ++# Viviane Potocnik, ETH Zurich ++ ++# Usage of absolute paths is required to externally include ++# this Makefile from multiple different locations ++MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) ++ ++############### ++# Directories # ++############### ++ ++BUILDDIR ?= $(abspath build) ++SRC_DIR = $(MK_DIR)/src/math ++BITS_DIR = $(MK_DIR)/include/bits ++ ++################### ++# Build variables # ++################### ++ ++INCDIRS += $(MK_DIR)/arch/riscv64/ ++INCDIRS += $(MK_DIR)/arch/generic ++INCDIRS += $(MK_DIR)/src/include ++INCDIRS += $(MK_DIR)/src/internal ++INCDIRS += $(MK_DIR)/include/bits ++INCDIRS += $(MK_DIR)/include ++ ++SRCS = $(abspath $(wildcard $(SRC_DIR)/*.c)) ++ ++########### ++# Outputs # ++########### ++ + ALLTYPES_H = $(BITS_DIR)/alltypes.h + ++OBJS = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS))))) ++DEPS = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS))))) ++LIB = $(BUILDDIR)/libmath.a ++DUMP = $(BUILDDIR)/libmath.dump ++ALL_OUTPUTS = $(LIB) $(DUMP) + +-.PHONY: all clean ++######### ++# Rules # ++######### + +-all: $(ALLTYPES_H) ++.PHONY: all ++all: $(ALL_OUTPUTS) + 
++.PHONY: clean + clean: + rm -rf $(BITS_DIR) + rm -f $(ALLTYPES_H) ++ rm -rf $(BUILDDIR) + + $(BITS_DIR): + mkdir -p $@ + + $(ALLTYPES_H): | $(BITS_DIR) +- sed -f tools/mkalltypes.sed arch/riscv64/bits/alltypes.h.in include/alltypes.h.in > $@ ++ sed -f $(MK_DIR)/tools/mkalltypes.sed $(MK_DIR)/arch/riscv64/bits/alltypes.h.in $(MK_DIR)/include/alltypes.h.in > $@ ++ ++$(DEPS): $(ALLTYPES_H) ++ ++$(BUILDDIR): ++ mkdir -p $@ ++ ++$(BUILDDIR)/%.o: $(SRC_DIR)/%.S | $(BUILDDIR) ++ $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ ++ ++$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR) ++ $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ ++ ++$(BUILDDIR)/%.d: $(SRC_DIR)/%.c | $(BUILDDIR) ++ $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@ ++ ++$(LIB): $(OBJS) | $(BUILDDIR) ++ $(RISCV_AR) $(RISCV_ARFLAGS) $@ $^ ++ ++$(DUMP): $(LIB) | $(BUILDDIR) ++ $(RISCV_OBJDUMP) -D $< > $@ ++ ++ifneq ($(MAKECMDGOALS),clean) ++-include $(DEPS) ++endif +diff --git a/include/math.h b/include/math.h +index 6dad71c..14f28ec 100644 +--- a/include/math.h ++++ b/include/math.h +@@ -435,9 +435,6 @@ float pow10f(float); + long double pow10l(long double); + #endif + +-#include "../src/math/expm1.c" +-#include "../src/math/tanh.c" +- + #ifdef __cplusplus + } + #endif +-- +2.28.0 + diff --git a/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch b/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch new file mode 100644 index 000000000..050af9d33 --- /dev/null +++ b/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch @@ -0,0 +1,81 @@ +From eb96f4d7454a07498f571eb1ed18aa1db2413551 Mon Sep 17 00:00:00 2001 +From: Luca Colagrande +Date: Mon, 23 Oct 2023 16:45:17 +0200 +Subject: [PATCH] `sw/math`: Add safe FP <--> INT conversions + +--- + src/internal/libm.h | 51 +++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 47 insertions(+), 4 deletions(-) + +diff --git a/src/internal/libm.h b/src/internal/libm.h +index 72ad17d..60b9866 100644 +--- a/src/internal/libm.h 
++++ b/src/internal/libm.h +@@ -96,6 +96,47 @@ static int32_t converttoint(double_t); + #define predict_false(x) (x) + #endif + ++/* FPU fence to synchronize the FPU and integer core in Snitch. */ ++inline void snrt_fpu_fence() { ++ unsigned tmp; ++ __asm__ volatile( ++ "fmv.x.w %0, fa0\n" ++ "mv %0, %0\n" ++ : "+r"(tmp)::"memory"); ++} ++ ++/* Synch-secure double to uint64 conversion functions. */ ++static inline uint64_t asuint64(double f) { ++ uint64_t result; ++ snrt_fpu_fence(); ++ result = *(uint64_t *)&f; ++ return result; ++} ++ ++/* Synch-secure float to uint conversion functions. */ ++static inline uint64_t asuint(float f) { ++ uint32_t result; ++ snrt_fpu_fence(); ++ result = *(uint32_t *)&f; ++ return result; ++} ++ ++/* Synch-secure uint64 to double conversion functions. */ ++static inline double asdouble(uint64_t i) { ++ double result; ++ snrt_fpu_fence(); ++ result = *(double *)&i; ++ return result; ++} ++ ++/* Synch-secure uint to float conversion functions. */ ++static inline float asfloat(uint32_t i) { ++ float result; ++ snrt_fpu_fence(); ++ result = *(float *)&i; ++ return result; ++} ++ + /* Evaluate an expression as the specified type. With standard excess + precision handling a type cast or assignment is enough (with + -ffloat-store an assignment is required, in old compilers argument +@@ -187,10 +228,12 @@ static inline void fp_force_evall(long double x) + } \ + } while(0) + +-#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i +-#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f +-#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i +-#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f ++// Unsafe in Snitch due to the decoupled FPU and integer ++// arithmetic units. Use at your own risk. 
++#define asuint_unsafe(f) ((union{float _f; uint32_t _i;}){f})._i ++#define asfloat_unsafe(i) ((union{uint32_t _i; float _f;}){i})._f ++#define asuint64_unsafe(f) ((union{double _f; uint64_t _i;}){f})._i ++#define asdouble_unsafe(i) ((union{uint64_t _i; double _f;}){i})._f + + #define EXTRACT_WORDS(hi,lo,d) \ + do { \ +-- +2.28.0 + diff --git a/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch b/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch new file mode 100644 index 000000000..cffc3c407 --- /dev/null +++ b/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch @@ -0,0 +1,149 @@ +From b419b07facc9591ba0d8683f53c9adefb8a9b0c6 Mon Sep 17 00:00:00 2001 +From: Luca Colagrande +Date: Wed, 8 Nov 2023 09:35:17 +0100 +Subject: [PATCH] `sw/math`: Implement safe `tanh` function + +--- + src/internal/libm.h | 31 +++++++++++++++++++++++++++++++ + src/math/expm1.c | 34 ++++++++++++++++++++++++++-------- + src/math/tanh.c | 17 ++++++++++++----- + 3 files changed, 69 insertions(+), 13 deletions(-) + +diff --git a/src/internal/libm.h b/src/internal/libm.h +index 60b9866..c96c0ec 100644 +--- a/src/internal/libm.h ++++ b/src/internal/libm.h +@@ -96,6 +96,37 @@ static int32_t converttoint(double_t); + #define predict_false(x) (x) + #endif + ++/* Memory-consistent functions to manipulate the upper word of a ++ double-precision floating-point number in the integer core. ++ Since there is no dedicated instruction to move the upper 32-bits ++ of a double-precision floating point register to an integer register ++ the compiler resorts to moving the value through the memory. However in ++ Snitch neither the program ordering between floating-point and integer ++ instructions is guaranteed, nor is memory consistency between the integer ++ and floating-point threads. 
*/ ++ ++static inline uint32_t safe_extract_upper_32b_from_double(double x) { ++ double f; ++ uint32_t result; ++ asm volatile("fsd %[x], 0(%[ptr]) \n" ++ "fld ft3, 0(%[ptr]) \n" ++ "fmv.x.w t0, ft3 \n" ++ "mv t0, t0 \n" ++ "lw %[result], 4(%[ptr]) \n" ++ : [result]"=r"(result) : [x]"f"(x), [ptr]"r"(&f): "ft3", "t0", "memory"); ++ return result; ++} ++ ++static inline void safe_inject_into_upper_32b_double(uint32_t x, double *f) { ++ asm volatile("sw %[x], 4(%[ptr]) \n" ++ "lw %[x], 4(%[ptr]) \n" ++ "fmv.w.x ft3, %[x] \n" ++ : : [x]"r"(x), [ptr]"r"(f): "ft3", "memory"); ++} ++ ++/* TODO: the following functions are not really safe, compare previous two ++ functions */ ++ + /* FPU fence to synchronize the FPU and integer core in Snitch. */ + inline void snrt_fpu_fence() { + unsigned tmp; +diff --git a/src/math/expm1.c b/src/math/expm1.c +index ac1e61e..d94f57f 100644 +--- a/src/math/expm1.c ++++ b/src/math/expm1.c +@@ -121,9 +121,14 @@ Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */ + double expm1(double x) + { + double_t y,hi,lo,c,t,e,hxs,hfx,r1,twopk; +- union {double f; uint64_t i;} u = {x}; +- uint32_t hx = u.i>>32 & 0x7fffffff; +- int k, sign = u.i>>63; ++ /// Original implementation ++ // union {double f; uint64_t i;} u = {x}; ++ // uint32_t hx = u.i>>32 & 0x7fffffff; ++ // int k, sign = u.i>>63; ++ /// Safe implementation in Snitch ++ uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x); ++ uint32_t hx = upper_32b_x & 0x7fffffff; ++ int k, sign = upper_32b_x>>31; + + /* filter out huge and non-finite argument */ + if (hx >= 0x4043687A) { /* if |x|>=56*ln2 */ +@@ -182,8 +187,12 @@ double expm1(double x) + return -2.0*(e-(x+0.5)); + return 1.0+2.0*(x-e); + } +- u.i = (uint64_t)(0x3ff + k)<<52; /* 2^k */ +- twopk = u.f; ++ /// Original implementation ++ // u.i = (uint64_t)(0x3ff + k)<<52; /* 2^k */ ++ // twopk = u.f; ++ /// Safe implementation in Snitch ++ uint32_t u_i = (uint32_t)(0x3ff + k)<<20; ++ safe_inject_into_upper_32b_double(u_i, 
&twopk); + if (k < 0 || k > 56) { /* suffice to return exp(x)-1 */ + y = x - e + 1.0; + if (k == 1024) +@@ -192,10 +201,19 @@ double expm1(double x) + y = y*twopk; + return y - 1.0; + } +- u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */ ++ /// Original implementation ++ // u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */ ++ // if (k < 20) ++ // y = (x-e+(1-u.f))*twopk; ++ // else ++ // y = (x-(e+u.f)+1)*twopk; ++ /// Safe implementation in Snitch ++ u_i = (uint32_t)(0x3ff - k)<<20; ++ double u_f = 0; ++ safe_inject_into_upper_32b_double(u_i, &u_f); + if (k < 20) +- y = (x-e+(1-u.f))*twopk; ++ y = (x-e+(1-u_f))*twopk; + else +- y = (x-(e+u.f)+1)*twopk; ++ y = (x-(e+u_f)+1)*twopk; + return y; + } +diff --git a/src/math/tanh.c b/src/math/tanh.c +index 20d6dbc..2481db1 100644 +--- a/src/math/tanh.c ++++ b/src/math/tanh.c +@@ -6,16 +6,23 @@ + */ + double tanh(double x) + { +- union {double f; uint64_t i;} u = {.f = x}; + uint32_t w; + int sign; + double_t t; + + /* x = |x| */ +- sign = u.i >> 63; +- u.i &= (uint64_t)-1/2; +- x = u.f; +- w = u.i >> 32; ++ /// Original implementation ++ // union {double f; uint64_t i;} u = {.f = x}; ++ // sign = u.i >> 63; ++ // u.i &= (uint64_t)-1/2; ++ // x = u.f; ++ // w = u.i >> 32; ++ /// Safe implementation in Snitch ++ uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x); ++ sign = upper_32b_x >> 31; ++ uint32_t sign_mask = (~(1 << 31)); ++ w = upper_32b_x & sign_mask; ++ safe_inject_into_upper_32b_double(w, &x); + + if (w > 0x3fe193ea) { + /* |x| > log(3)/2 ~= 0.5493 or nan */ +-- +2.28.0 + diff --git a/sw/math/Makefile b/sw/math/Makefile index 132795388..afb3192d1 100644 --- a/sw/math/Makefile +++ b/sw/math/Makefile @@ -1,17 +1,86 @@ -BITS_DIR = include/bits +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande +# Viviane Potocnik, ETH Zurich + +# Usage of absolute paths is required to externally include +# this Makefile from multiple different locations +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) + +############### +# Directories # +############### + +BUILDDIR ?= $(abspath build) +SRC_DIR = $(MK_DIR)/src/math +BITS_DIR = $(MK_DIR)/include/bits + +################### +# Build variables # +################### + +INCDIRS += $(MK_DIR)/arch/riscv64/ +INCDIRS += $(MK_DIR)/arch/generic +INCDIRS += $(MK_DIR)/src/include +INCDIRS += $(MK_DIR)/src/internal +INCDIRS += $(MK_DIR)/include/bits +INCDIRS += $(MK_DIR)/include + +SRCS = $(abspath $(wildcard $(SRC_DIR)/*.c)) + +########### +# Outputs # +########### + ALLTYPES_H = $(BITS_DIR)/alltypes.h +OBJS = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS))))) +DEPS = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS))))) +LIB = $(BUILDDIR)/libmath.a +DUMP = $(BUILDDIR)/libmath.dump +ALL_OUTPUTS = $(LIB) $(DUMP) -.PHONY: all clean +######### +# Rules # +######### -all: $(ALLTYPES_H) +.PHONY: all +all: $(ALL_OUTPUTS) +.PHONY: clean clean: rm -rf $(BITS_DIR) rm -f $(ALLTYPES_H) + rm -rf $(BUILDDIR) $(BITS_DIR): mkdir -p $@ $(ALLTYPES_H): | $(BITS_DIR) - sed -f tools/mkalltypes.sed arch/riscv64/bits/alltypes.h.in include/alltypes.h.in > $@ + sed -f $(MK_DIR)/tools/mkalltypes.sed $(MK_DIR)/arch/riscv64/bits/alltypes.h.in $(MK_DIR)/include/alltypes.h.in > $@ + +$(DEPS): $(ALLTYPES_H) + +$(BUILDDIR): + mkdir -p $@ + +$(BUILDDIR)/%.o: $(SRC_DIR)/%.S | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ + +$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ + +$(BUILDDIR)/%.d: $(SRC_DIR)/%.c | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@ + +$(LIB): $(OBJS) | $(BUILDDIR) + $(RISCV_AR) $(RISCV_ARFLAGS) $@ $^ + +$(DUMP): $(LIB) | $(BUILDDIR) + $(RISCV_OBJDUMP) -D $< > $@ + 
+ifneq ($(MAKECMDGOALS),clean) +-include $(DEPS) +endif diff --git a/sw/math/include/float.h b/sw/math/include/float.h new file mode 100644 index 000000000..713aadb90 --- /dev/null +++ b/sw/math/include/float.h @@ -0,0 +1,52 @@ +#ifndef _FLOAT_H +#define _FLOAT_H + +#ifdef __cplusplus +extern "C" { +#endif + +int __flt_rounds(void); +#define FLT_ROUNDS (__flt_rounds()) + +#define FLT_RADIX 2 + +#define FLT_TRUE_MIN 1.40129846432481707092e-45F +#define FLT_MIN 1.17549435082228750797e-38F +#define FLT_MAX 3.40282346638528859812e+38F +#define FLT_EPSILON 1.1920928955078125e-07F + +#define FLT_MANT_DIG 24 +#define FLT_MIN_EXP (-125) +#define FLT_MAX_EXP 128 +#define FLT_HAS_SUBNORM 1 + +#define FLT_DIG 6 +#define FLT_DECIMAL_DIG 9 +#define FLT_MIN_10_EXP (-37) +#define FLT_MAX_10_EXP 38 + +#define DBL_TRUE_MIN 4.94065645841246544177e-324 +#define DBL_MIN 2.22507385850720138309e-308 +#define DBL_MAX 1.79769313486231570815e+308 +#define DBL_EPSILON 2.22044604925031308085e-16 + +#define DBL_MANT_DIG 53 +#define DBL_MIN_EXP (-1021) +#define DBL_MAX_EXP 1024 +#define DBL_HAS_SUBNORM 1 + +#define DBL_DIG 15 +#define DBL_DECIMAL_DIG 17 +#define DBL_MIN_10_EXP (-307) +#define DBL_MAX_10_EXP 308 + +#define LDBL_HAS_SUBNORM 1 +#define LDBL_DECIMAL_DIG DECIMAL_DIG + +#include + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sw/math/include/math.h b/sw/math/include/math.h index 6dad71c1e..14f28ec8c 100644 --- a/sw/math/include/math.h +++ b/sw/math/include/math.h @@ -435,9 +435,6 @@ float pow10f(float); long double pow10l(long double); #endif -#include "../src/math/expm1.c" -#include "../src/math/tanh.c" - #ifdef __cplusplus } #endif diff --git a/sw/math/src/internal/libm.h b/sw/math/src/internal/libm.h index 72ad17d8e..c96c0eced 100644 --- a/sw/math/src/internal/libm.h +++ b/sw/math/src/internal/libm.h @@ -96,6 +96,78 @@ static int32_t converttoint(double_t); #define predict_false(x) (x) #endif +/* Memory-consistent functions to manipulate the upper word of a + 
double-precision floating-point number in the integer core. + Since there is no dedicated instruction to move the upper 32-bits + of a double-precision floating point register to an integer register + the compiler resorts to moving the value through the memory. However in + Snitch neither the program ordering between floating-point and integer + instructions is guaranteed, nor is memory consistency between the integer + and floating-point threads. */ + +static inline uint32_t safe_extract_upper_32b_from_double(double x) { + double f; + uint32_t result; + asm volatile("fsd %[x], 0(%[ptr]) \n" + "fld ft3, 0(%[ptr]) \n" + "fmv.x.w t0, ft3 \n" + "mv t0, t0 \n" + "lw %[result], 4(%[ptr]) \n" + : [result]"=r"(result) : [x]"f"(x), [ptr]"r"(&f): "ft3", "t0", "memory"); + return result; +} + +static inline void safe_inject_into_upper_32b_double(uint32_t x, double *f) { + asm volatile("sw %[x], 4(%[ptr]) \n" + "lw %[x], 4(%[ptr]) \n" + "fmv.w.x ft3, %[x] \n" + : : [x]"r"(x), [ptr]"r"(f): "ft3", "memory"); +} + +/* TODO: the following functions are not really safe, compare previous two + functions */ + +/* FPU fence to synchronize the FPU and integer core in Snitch. */ +inline void snrt_fpu_fence() { + unsigned tmp; + __asm__ volatile( + "fmv.x.w %0, fa0\n" + "mv %0, %0\n" + : "+r"(tmp)::"memory"); +} + +/* Synch-secure double to uint64 conversion functions. */ +static inline uint64_t asuint64(double f) { + uint64_t result; + snrt_fpu_fence(); + result = *(uint64_t *)&f; + return result; +} + +/* Synch-secure float to uint conversion functions. */ +static inline uint64_t asuint(float f) { + uint32_t result; + snrt_fpu_fence(); + result = *(uint32_t *)&f; + return result; +} + +/* Synch-secure uint64 to double conversion functions. */ +static inline double asdouble(uint64_t i) { + double result; + snrt_fpu_fence(); + result = *(double *)&i; + return result; +} + +/* Synch-secure uint to float conversion functions. 
*/ +static inline float asfloat(uint32_t i) { + float result; + snrt_fpu_fence(); + result = *(float *)&i; + return result; +} + /* Evaluate an expression as the specified type. With standard excess precision handling a type cast or assignment is enough (with -ffloat-store an assignment is required, in old compilers argument @@ -187,10 +259,12 @@ static inline void fp_force_evall(long double x) } \ } while(0) -#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i -#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f -#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i -#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f +// Unsafe in Snitch due to the decoupled FPU and integer +// arithmetic units. Use at your own risk. +#define asuint_unsafe(f) ((union{float _f; uint32_t _i;}){f})._i +#define asfloat_unsafe(i) ((union{uint32_t _i; float _f;}){i})._f +#define asuint64_unsafe(f) ((union{double _f; uint64_t _i;}){f})._i +#define asdouble_unsafe(i) ((union{uint64_t _i; double _f;}){i})._f #define EXTRACT_WORDS(hi,lo,d) \ do { \ diff --git a/sw/math/src/math/__math_divzero.c b/sw/math/src/math/__math_divzero.c new file mode 100644 index 000000000..59d213500 --- /dev/null +++ b/sw/math/src/math/__math_divzero.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_divzero(uint32_t sign) +{ + return fp_barrier(sign ? 
-1.0 : 1.0) / 0.0; +} diff --git a/sw/math/src/math/__math_invalid.c b/sw/math/src/math/__math_invalid.c new file mode 100644 index 000000000..177404900 --- /dev/null +++ b/sw/math/src/math/__math_invalid.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_invalid(double x) +{ + return (x - x) / (x - x); +} diff --git a/sw/math/src/math/__math_invalidf.c b/sw/math/src/math/__math_invalidf.c new file mode 100644 index 000000000..357d4b121 --- /dev/null +++ b/sw/math/src/math/__math_invalidf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_invalidf(float x) +{ + return (x - x) / (x - x); +} diff --git a/sw/math/src/math/__math_invalidl.c b/sw/math/src/math/__math_invalidl.c new file mode 100644 index 000000000..1fca99de4 --- /dev/null +++ b/sw/math/src/math/__math_invalidl.c @@ -0,0 +1,9 @@ +#include +#include "libm.h" + +#if LDBL_MANT_DIG != DBL_MANT_DIG +long double __math_invalidl(long double x) +{ + return (x - x) / (x - x); +} +#endif diff --git a/sw/math/src/math/__math_oflow.c b/sw/math/src/math/__math_oflow.c new file mode 100644 index 000000000..c85dbf982 --- /dev/null +++ b/sw/math/src/math/__math_oflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_oflow(uint32_t sign) +{ + return __math_xflow(sign, 0x1p769); +} diff --git a/sw/math/src/math/__math_oflowf.c b/sw/math/src/math/__math_oflowf.c new file mode 100644 index 000000000..fa7d06208 --- /dev/null +++ b/sw/math/src/math/__math_oflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_oflowf(uint32_t sign) +{ + return __math_xflowf(sign, 0x1p97f); +} diff --git a/sw/math/src/math/__math_uflow.c b/sw/math/src/math/__math_uflow.c new file mode 100644 index 000000000..b90594aee --- /dev/null +++ b/sw/math/src/math/__math_uflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_uflow(uint32_t sign) +{ + return __math_xflow(sign, 0x1p-767); +} diff --git a/sw/math/src/math/__math_uflowf.c b/sw/math/src/math/__math_uflowf.c new file mode 100644 index 000000000..94d50f2bf --- /dev/null +++ 
b/sw/math/src/math/__math_uflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_uflowf(uint32_t sign) +{ + return __math_xflowf(sign, 0x1p-95f); +} diff --git a/sw/math/src/math/__math_xflow.c b/sw/math/src/math/__math_xflow.c new file mode 100644 index 000000000..744203c4c --- /dev/null +++ b/sw/math/src/math/__math_xflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_xflow(uint32_t sign, double y) +{ + return eval_as_double(fp_barrier(sign ? -y : y) * y); +} diff --git a/sw/math/src/math/__math_xflowf.c b/sw/math/src/math/__math_xflowf.c new file mode 100644 index 000000000..f2c84784f --- /dev/null +++ b/sw/math/src/math/__math_xflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_xflowf(uint32_t sign, float y) +{ + return eval_as_float(fp_barrierf(sign ? -y : y) * y); +} diff --git a/sw/math/src/math/ceil.c b/sw/math/src/math/ceil.c new file mode 100644 index 000000000..b13e6f2d6 --- /dev/null +++ b/sw/math/src/math/ceil.c @@ -0,0 +1,31 @@ +#include "libm.h" + +#if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1 +#define EPS DBL_EPSILON +#elif FLT_EVAL_METHOD==2 +#define EPS LDBL_EPSILON +#endif +static const double_t toint = 1/EPS; + +double ceil(double x) +{ + union {double f; uint64_t i;} u = {x}; + int e = u.i >> 52 & 0x7ff; + double_t y; + + if (e >= 0x3ff+52 || x == 0) + return x; + /* y = int(x) - x, where int(x) is an integer neighbor of x */ + if (u.i >> 63) + y = x - toint + toint - x; + else + y = x + toint - toint - x; + /* special case because of non-nearest rounding modes */ + if (e <= 0x3ff-1) { + FORCE_EVAL(y); + return u.i >> 63 ? 
-0.0 : 1; + } + if (y < 0) + return x + y + 1; + return x + y; +} diff --git a/sw/math/src/math/ceilf.c b/sw/math/src/math/ceilf.c new file mode 100644 index 000000000..869835f39 --- /dev/null +++ b/sw/math/src/math/ceilf.c @@ -0,0 +1,27 @@ +#include "libm.h" + +float ceilf(float x) +{ + union {float f; uint32_t i;} u = {x}; + int e = (int)(u.i >> 23 & 0xff) - 0x7f; + uint32_t m; + + if (e >= 23) + return x; + if (e >= 0) { + m = 0x007fffff >> e; + if ((u.i & m) == 0) + return x; + FORCE_EVAL(x + 0x1p120f); + if (u.i >> 31 == 0) + u.i += m; + u.i &= ~m; + } else { + FORCE_EVAL(x + 0x1p120f); + if (u.i >> 31) + u.f = -0.0; + else if (u.i << 1) + u.f = 1.0; + } + return u.f; +} diff --git a/sw/math/src/math/ceill.c b/sw/math/src/math/ceill.c new file mode 100644 index 000000000..60a83020d --- /dev/null +++ b/sw/math/src/math/ceill.c @@ -0,0 +1,34 @@ +#include "libm.h" + +#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 +long double ceill(long double x) +{ + return ceil(x); +} +#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384 + +static const long double toint = 1/LDBL_EPSILON; + +long double ceill(long double x) +{ + union ldshape u = {x}; + int e = u.i.se & 0x7fff; + long double y; + + if (e >= 0x3fff+LDBL_MANT_DIG-1 || x == 0) + return x; + /* y = int(x) - x, where int(x) is an integer neighbor of x */ + if (u.i.se >> 15) + y = x - toint + toint - x; + else + y = x + toint - toint - x; + /* special case because of non-nearest rounding modes */ + if (e <= 0x3fff-1) { + FORCE_EVAL(y); + return u.i.se >> 15 ? -0.0 : 1; + } + if (y < 0) + return x + y + 1; + return x + y; +} +#endif diff --git a/sw/math/src/math/exp2f_data.c b/sw/math/src/math/exp2f_data.c new file mode 100644 index 000000000..be324727f --- /dev/null +++ b/sw/math/src/math/exp2f_data.c @@ -0,0 +1,35 @@ +/* + * Shared data between expf, exp2f and powf. + * + * Copyright (c) 2017-2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "exp2f_data.h" + +#define N (1 << EXP2F_TABLE_BITS) + +const struct exp2f_data __exp2f_data = { + /* tab[i] = uint(2^(i/N)) - (i << 52-BITS) + used for computing 2^(k/N) for an int |k| < 150 N as + double(tab[k%N] + (k << 52-BITS)) */ + .tab = { +0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, +0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, +0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, +0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, +0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, +0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, +0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, +0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, + .shift_scaled = 0x1.8p+52 / N, + .poly = { + 0x1.c6af84b912394p-5, 0x1.ebfce50fac4f3p-3, 0x1.62e42ff0c52d6p-1, + }, + .shift = 0x1.8p+52, + .invln2_scaled = 0x1.71547652b82fep+0 * N, + .poly_scaled = { + 0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N, + }, +}; diff --git a/sw/math/src/math/exp2f_data.h b/sw/math/src/math/exp2f_data.h new file mode 100644 index 000000000..fe744f15b --- /dev/null +++ b/sw/math/src/math/exp2f_data.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _EXP2F_DATA_H +#define _EXP2F_DATA_H + +#include +#include + +/* Shared between expf, exp2f and powf. 
*/ +#define EXP2F_TABLE_BITS 5 +#define EXP2F_POLY_ORDER 3 +extern hidden const struct exp2f_data { + uint64_t tab[1 << EXP2F_TABLE_BITS]; + double shift_scaled; + double poly[EXP2F_POLY_ORDER]; + double shift; + double invln2_scaled; + double poly_scaled[EXP2F_POLY_ORDER]; +} __exp2f_data; + +#endif diff --git a/sw/math/src/math/expf.c b/sw/math/src/math/expf.c new file mode 100644 index 000000000..f9fbf8e72 --- /dev/null +++ b/sw/math/src/math/expf.c @@ -0,0 +1,80 @@ +/* + * Single-precision e^x function. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "exp2f_data.h" + +/* +EXP2F_TABLE_BITS = 5 +EXP2F_POLY_ORDER = 3 + +ULP error: 0.502 (nearest rounding.) +Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.) +Wrong count: 170635 (all nearest rounding wrong results with fma.) +Non-nearest ULP error: 1 (rounded ULP error) +*/ + +#define N (1 << EXP2F_TABLE_BITS) +#define InvLn2N __exp2f_data.invln2_scaled +#define T __exp2f_data.tab +#define C __exp2f_data.poly_scaled + +static inline uint32_t top12(float x) +{ + return asuint(x) >> 20; +} + +float expf(float x) +{ + uint32_t abstop; + uint64_t ki, t; + double_t kd, xd, z, r, r2, y, s; + + xd = (double_t)x; + abstop = top12(x) & 0x7ff; + if (predict_false(abstop >= top12(88.0f))) { + /* |x| >= 88 or x is nan. */ + if (asuint(x) == asuint(-INFINITY)) + return 0.0f; + if (abstop >= top12(INFINITY)) + return x + x; + if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */ + return __math_oflowf(0); + if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */ + return __math_uflowf(0); + } + + /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. */ + z = InvLn2N * xd; + + /* Round and convert z to int, the result is in [-150*N, 128*N] and + ideally ties-to-even rule is used, otherwise the magnitude of r + can be bigger which gives larger approximation error. 
*/ +#if TOINT_INTRINSICS + kd = roundtoint(z); + ki = converttoint(z); +#else +# define SHIFT __exp2f_data.shift + kd = eval_as_double(z + SHIFT); + ki = asuint64(kd); + kd -= SHIFT; +#endif + r = z - kd; + + /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ + t = T[ki % N]; + t += ki << (52 - EXP2F_TABLE_BITS); + s = asdouble(t); + z = C[0] * r + C[1]; + r2 = r * r; + y = C[2] * r + 1; + y = z * r2 + y; + y = y * s; + return eval_as_float(y); +} diff --git a/sw/math/src/math/expm1.c b/sw/math/src/math/expm1.c index ac1e61e4f..d94f57fe5 100644 --- a/sw/math/src/math/expm1.c +++ b/sw/math/src/math/expm1.c @@ -121,9 +121,14 @@ Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */ double expm1(double x) { double_t y,hi,lo,c,t,e,hxs,hfx,r1,twopk; - union {double f; uint64_t i;} u = {x}; - uint32_t hx = u.i>>32 & 0x7fffffff; - int k, sign = u.i>>63; + /// Original implementation + // union {double f; uint64_t i;} u = {x}; + // uint32_t hx = u.i>>32 & 0x7fffffff; + // int k, sign = u.i>>63; + /// Safe implementation in Snitch + uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x); + uint32_t hx = upper_32b_x & 0x7fffffff; + int k, sign = upper_32b_x>>31; /* filter out huge and non-finite argument */ if (hx >= 0x4043687A) { /* if |x|>=56*ln2 */ @@ -182,8 +187,12 @@ double expm1(double x) return -2.0*(e-(x+0.5)); return 1.0+2.0*(x-e); } - u.i = (uint64_t)(0x3ff + k)<<52; /* 2^k */ - twopk = u.f; + /// Original implementation + // u.i = (uint64_t)(0x3ff + k)<<52; /* 2^k */ + // twopk = u.f; + /// Safe implementation in Snitch + uint32_t u_i = (uint32_t)(0x3ff + k)<<20; + safe_inject_into_upper_32b_double(u_i, &twopk); if (k < 0 || k > 56) { /* suffice to return exp(x)-1 */ y = x - e + 1.0; if (k == 1024) @@ -192,10 +201,19 @@ double expm1(double x) y = y*twopk; return y - 1.0; } - u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */ + /// Original implementation + // u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */ + // if (k < 20) + // y = 
(x-e+(1-u.f))*twopk; + // else + // y = (x-(e+u.f)+1)*twopk; + /// Safe implementation in Snitch + u_i = (uint32_t)(0x3ff - k)<<20; + double u_f = 0; + safe_inject_into_upper_32b_double(u_i, &u_f); if (k < 20) - y = (x-e+(1-u.f))*twopk; + y = (x-e+(1-u_f))*twopk; else - y = (x-(e+u.f)+1)*twopk; + y = (x-(e+u_f)+1)*twopk; return y; } diff --git a/sw/math/src/math/log2.c b/sw/math/src/math/log2.c new file mode 100644 index 000000000..1276ed4e3 --- /dev/null +++ b/sw/math/src/math/log2.c @@ -0,0 +1,122 @@ +/* + * Double-precision log2(x) function. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "log2_data.h" + +#define T __log2_data.tab +#define T2 __log2_data.tab2 +#define B __log2_data.poly1 +#define A __log2_data.poly +#define InvLn2hi __log2_data.invln2hi +#define InvLn2lo __log2_data.invln2lo +#define N (1 << LOG2_TABLE_BITS) +#define OFF 0x3fe6000000000000 + +/* Top 16 bits of a double. */ +static inline uint32_t top16(double x) +{ + return asuint64(x) >> 48; +} + +double log2(double x) +{ + double_t z, r, r2, r4, y, invc, logc, kd, hi, lo, t1, t2, t3, p; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64(x); + top = top16(x); +#define LO asuint64(1.0 - 0x1.5b51p-5) +#define HI asuint64(1.0 + 0x1.6ab2p-5) + if (predict_false(ix - LO < HI - LO)) { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && predict_false(ix == asuint64(1.0))) + return 0; + r = x - 1.0; +#if __FP_FAST_FMA + hi = r * InvLn2hi; + lo = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -hi); +#else + double_t rhi, rlo; + rhi = asdouble(asuint64(r) & -1ULL << 32); + rlo = r - rhi; + hi = rhi * InvLn2hi; + lo = rlo * InvLn2hi + r * InvLn2lo; +#endif + r2 = r * r; /* rounding error: 0x1p-62. */ + r4 = r2 * r2; + /* Worst-case error is less than 0.54 ULP (0.55 ULP without fma). 
*/ + p = r2 * (B[0] + r * B[1]); + y = hi + p; + lo += hi - y + p; + lo += r4 * (B[2] + r * B[3] + r2 * (B[4] + r * B[5]) + + r4 * (B[6] + r * B[7] + r2 * (B[8] + r * B[9]))); + y += lo; + return eval_as_double(y); + } + if (predict_false(top - 0x0010 >= 0x7ff0 - 0x0010)) { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero(1); + if (ix == asuint64(INFINITY)) /* log(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid(x); + /* x is subnormal, normalize it. */ + ix = asuint64(x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG2_TABLE_BITS)) % N; + k = (int64_t)tmp >> 52; /* arithmetic shift */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble(iz); + kd = (double_t)k; + + /* log2(x) = log2(z/c) + log2(c) + k. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if __FP_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = __builtin_fma(z, invc, -1.0); + t1 = r * InvLn2hi; + t2 = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -t1); +#else + double_t rhi, rlo; + /* rounding error: 0x1p-55/N + 0x1p-65. */ + r = (z - T2[i].chi - T2[i].clo) * invc; + rhi = asdouble(asuint64(r) & -1ULL << 32); + rlo = r - rhi; + t1 = rhi * InvLn2hi; + t2 = rlo * InvLn2hi + r * InvLn2lo; +#endif + + /* hi + lo = r/ln2 + log2(c) + k. */ + t3 = kd + logc; + hi = t3 + t1; + lo = t3 - hi + t1 + t2; + + /* log2(r+1) = r/ln2 + r^2*poly(r). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + r4 = r2 * r2; + /* Worst-case error if |y| > 0x1p-4: 0.547 ULP (0.550 ULP without fma). + ~ 0.5 + 2/N/ln2 + abs-poly-error*0x1p56 ULP (+ 0.003 ULP without fma). 
*/ + p = A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + r4 * (A[4] + r * A[5]); + y = lo + r2 * p + hi; + return eval_as_double(y); +} diff --git a/sw/math/src/math/log2_data.c b/sw/math/src/math/log2_data.c new file mode 100644 index 000000000..3dd1ca514 --- /dev/null +++ b/sw/math/src/math/log2_data.c @@ -0,0 +1,201 @@ +/* + * Data for log2. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "log2_data.h" + +#define N (1 << LOG2_TABLE_BITS) + +const struct log2_data __log2_data = { +// First coefficient: 0x1.71547652b82fe1777d0ffda0d24p0 +.invln2hi = 0x1.7154765200000p+0, +.invln2lo = 0x1.705fc2eefa200p-33, +.poly1 = { +// relative error: 0x1.2fad8188p-63 +// in -0x1.5b51p-5 0x1.6ab2p-5 +-0x1.71547652b82fep-1, +0x1.ec709dc3a03f7p-2, +-0x1.71547652b7c3fp-2, +0x1.2776c50f05be4p-2, +-0x1.ec709dd768fe5p-3, +0x1.a61761ec4e736p-3, +-0x1.7153fbc64a79bp-3, +0x1.484d154f01b4ap-3, +-0x1.289e4a72c383cp-3, +0x1.0b32f285aee66p-3, +}, +.poly = { +// relative error: 0x1.a72c2bf8p-58 +// abs error: 0x1.67a552c8p-66 +// in -0x1.f45p-8 0x1.f45p-8 +-0x1.71547652b8339p-1, +0x1.ec709dc3a04bep-2, +-0x1.7154764702ffbp-2, +0x1.2776c50034c48p-2, +-0x1.ec7b328ea92bcp-3, +0x1.a6225e117f92ep-3, +}, +/* Algorithm: + + x = 2^k z + log2(x) = k + log2(c) + log2(z/c) + log2(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = (double)log2(c) + tab2[i].chi = (double)c + tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + + 1) the rounding error in 0x1.8p10 + logc is 0, + 2) the rounding error in z - chi - clo is < 0x1p-64 and + 3) the rounding error in (double)log2(c) is minimized (< 0x1p-68). 
+ +Note: 1) ensures that k + logc can be computed without rounding error, 2) +ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a +single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log2(x)| < 0x1p-4, this is not enough so that is special cased. */ +.tab = { +{0x1.724286bb1acf8p+0, -0x1.1095feecdb000p-1}, +{0x1.6e1f766d2cca1p+0, -0x1.08494bd76d000p-1}, +{0x1.6a13d0e30d48ap+0, -0x1.00143aee8f800p-1}, +{0x1.661ec32d06c85p+0, -0x1.efec5360b4000p-2}, +{0x1.623fa951198f8p+0, -0x1.dfdd91ab7e000p-2}, +{0x1.5e75ba4cf026cp+0, -0x1.cffae0cc79000p-2}, +{0x1.5ac055a214fb8p+0, -0x1.c043811fda000p-2}, +{0x1.571ed0f166e1ep+0, -0x1.b0b67323ae000p-2}, +{0x1.53909590bf835p+0, -0x1.a152f5a2db000p-2}, +{0x1.5014fed61adddp+0, -0x1.9217f5af86000p-2}, +{0x1.4cab88e487bd0p+0, -0x1.8304db0719000p-2}, +{0x1.49539b4334feep+0, -0x1.74189f9a9e000p-2}, +{0x1.460cbdfafd569p+0, -0x1.6552bb5199000p-2}, +{0x1.42d664ee4b953p+0, -0x1.56b23a29b1000p-2}, +{0x1.3fb01111dd8a6p+0, -0x1.483650f5fa000p-2}, +{0x1.3c995b70c5836p+0, -0x1.39de937f6a000p-2}, +{0x1.3991c4ab6fd4ap+0, -0x1.2baa1538d6000p-2}, +{0x1.3698e0ce099b5p+0, -0x1.1d98340ca4000p-2}, +{0x1.33ae48213e7b2p+0, -0x1.0fa853a40e000p-2}, +{0x1.30d191985bdb1p+0, -0x1.01d9c32e73000p-2}, +{0x1.2e025cab271d7p+0, -0x1.e857da2fa6000p-3}, +{0x1.2b404cf13cd82p+0, -0x1.cd3c8633d8000p-3}, +{0x1.288b02c7ccb50p+0, -0x1.b26034c14a000p-3}, +{0x1.25e2263944de5p+0, -0x1.97c1c2f4fe000p-3}, +{0x1.234563d8615b1p+0, -0x1.7d6023f800000p-3}, +{0x1.20b46e33eaf38p+0, -0x1.633a71a05e000p-3}, +{0x1.1e2eefdcda3ddp+0, -0x1.494f5e9570000p-3}, +{0x1.1bb4a580b3930p+0, -0x1.2f9e424e0a000p-3}, +{0x1.19453847f2200p+0, -0x1.162595afdc000p-3}, +{0x1.16e06c0d5d73cp+0, -0x1.f9c9a75bd8000p-4}, +{0x1.1485f47b7e4c2p+0, -0x1.c7b575bf9c000p-4}, +{0x1.12358ad0085d1p+0, -0x1.960c60ff48000p-4}, +{0x1.0fef00f532227p+0, -0x1.64ce247b60000p-4}, +{0x1.0db2077d03a8fp+0, 
-0x1.33f78b2014000p-4}, +{0x1.0b7e6d65980d9p+0, -0x1.0387d1a42c000p-4}, +{0x1.0953efe7b408dp+0, -0x1.a6f9208b50000p-5}, +{0x1.07325cac53b83p+0, -0x1.47a954f770000p-5}, +{0x1.05197e40d1b5cp+0, -0x1.d23a8c50c0000p-6}, +{0x1.03091c1208ea2p+0, -0x1.16a2629780000p-6}, +{0x1.0101025b37e21p+0, -0x1.720f8d8e80000p-8}, +{0x1.fc07ef9caa76bp-1, 0x1.6fe53b1500000p-7}, +{0x1.f4465d3f6f184p-1, 0x1.11ccce10f8000p-5}, +{0x1.ecc079f84107fp-1, 0x1.c4dfc8c8b8000p-5}, +{0x1.e573a99975ae8p-1, 0x1.3aa321e574000p-4}, +{0x1.de5d6f0bd3de6p-1, 0x1.918a0d08b8000p-4}, +{0x1.d77b681ff38b3p-1, 0x1.e72e9da044000p-4}, +{0x1.d0cb5724de943p-1, 0x1.1dcd2507f6000p-3}, +{0x1.ca4b2dc0e7563p-1, 0x1.476ab03dea000p-3}, +{0x1.c3f8ee8d6cb51p-1, 0x1.7074377e22000p-3}, +{0x1.bdd2b4f020c4cp-1, 0x1.98ede8ba94000p-3}, +{0x1.b7d6c006015cap-1, 0x1.c0db86ad2e000p-3}, +{0x1.b20366e2e338fp-1, 0x1.e840aafcee000p-3}, +{0x1.ac57026295039p-1, 0x1.0790ab4678000p-2}, +{0x1.a6d01bc2731ddp-1, 0x1.1ac056801c000p-2}, +{0x1.a16d3bc3ff18bp-1, 0x1.2db11d4fee000p-2}, +{0x1.9c2d14967feadp-1, 0x1.406464ec58000p-2}, +{0x1.970e4f47c9902p-1, 0x1.52dbe093af000p-2}, +{0x1.920fb3982bcf2p-1, 0x1.651902050d000p-2}, +{0x1.8d30187f759f1p-1, 0x1.771d2cdeaf000p-2}, +{0x1.886e5ebb9f66dp-1, 0x1.88e9c857d9000p-2}, +{0x1.83c97b658b994p-1, 0x1.9a80155e16000p-2}, +{0x1.7f405ffc61022p-1, 0x1.abe186ed3d000p-2}, +{0x1.7ad22181415cap-1, 0x1.bd0f2aea0e000p-2}, +{0x1.767dcf99eff8cp-1, 0x1.ce0a43dbf4000p-2}, +}, +#if !__FP_FAST_FMA +.tab2 = { +{0x1.6200012b90a8ep-1, 0x1.904ab0644b605p-55}, +{0x1.66000045734a6p-1, 0x1.1ff9bea62f7a9p-57}, +{0x1.69fffc325f2c5p-1, 0x1.27ecfcb3c90bap-55}, +{0x1.6e00038b95a04p-1, 0x1.8ff8856739326p-55}, +{0x1.71fffe09994e3p-1, 0x1.afd40275f82b1p-55}, +{0x1.7600015590e1p-1, -0x1.2fd75b4238341p-56}, +{0x1.7a00012655bd5p-1, 0x1.808e67c242b76p-56}, +{0x1.7e0003259e9a6p-1, -0x1.208e426f622b7p-57}, +{0x1.81fffedb4b2d2p-1, -0x1.402461ea5c92fp-55}, +{0x1.860002dfafcc3p-1, 0x1.df7f4a2f29a1fp-57}, +{0x1.89ffff78c6b5p-1, 
-0x1.e0453094995fdp-55}, +{0x1.8e00039671566p-1, -0x1.a04f3bec77b45p-55}, +{0x1.91fffe2bf1745p-1, -0x1.7fa34400e203cp-56}, +{0x1.95fffcc5c9fd1p-1, -0x1.6ff8005a0695dp-56}, +{0x1.9a0003bba4767p-1, 0x1.0f8c4c4ec7e03p-56}, +{0x1.9dfffe7b92da5p-1, 0x1.e7fd9478c4602p-55}, +{0x1.a1fffd72efdafp-1, -0x1.a0c554dcdae7ep-57}, +{0x1.a5fffde04ff95p-1, 0x1.67da98ce9b26bp-55}, +{0x1.a9fffca5e8d2bp-1, -0x1.284c9b54c13dep-55}, +{0x1.adfffddad03eap-1, 0x1.812c8ea602e3cp-58}, +{0x1.b1ffff10d3d4dp-1, -0x1.efaddad27789cp-55}, +{0x1.b5fffce21165ap-1, 0x1.3cb1719c61237p-58}, +{0x1.b9fffd950e674p-1, 0x1.3f7d94194cep-56}, +{0x1.be000139ca8afp-1, 0x1.50ac4215d9bcp-56}, +{0x1.c20005b46df99p-1, 0x1.beea653e9c1c9p-57}, +{0x1.c600040b9f7aep-1, -0x1.c079f274a70d6p-56}, +{0x1.ca0006255fd8ap-1, -0x1.a0b4076e84c1fp-56}, +{0x1.cdfffd94c095dp-1, 0x1.8f933f99ab5d7p-55}, +{0x1.d1ffff975d6cfp-1, -0x1.82c08665fe1bep-58}, +{0x1.d5fffa2561c93p-1, -0x1.b04289bd295f3p-56}, +{0x1.d9fff9d228b0cp-1, 0x1.70251340fa236p-55}, +{0x1.de00065bc7e16p-1, -0x1.5011e16a4d80cp-56}, +{0x1.e200002f64791p-1, 0x1.9802f09ef62ep-55}, +{0x1.e600057d7a6d8p-1, -0x1.e0b75580cf7fap-56}, +{0x1.ea00027edc00cp-1, -0x1.c848309459811p-55}, +{0x1.ee0006cf5cb7cp-1, -0x1.f8027951576f4p-55}, +{0x1.f2000782b7dccp-1, -0x1.f81d97274538fp-55}, +{0x1.f6000260c450ap-1, -0x1.071002727ffdcp-59}, +{0x1.f9fffe88cd533p-1, -0x1.81bdce1fda8bp-58}, +{0x1.fdfffd50f8689p-1, 0x1.7f91acb918e6ep-55}, +{0x1.0200004292367p+0, 0x1.b7ff365324681p-54}, +{0x1.05fffe3e3d668p+0, 0x1.6fa08ddae957bp-55}, +{0x1.0a0000a85a757p+0, -0x1.7e2de80d3fb91p-58}, +{0x1.0e0001a5f3fccp+0, -0x1.1823305c5f014p-54}, +{0x1.11ffff8afbaf5p+0, -0x1.bfabb6680bac2p-55}, +{0x1.15fffe54d91adp+0, -0x1.d7f121737e7efp-54}, +{0x1.1a00011ac36e1p+0, 0x1.c000a0516f5ffp-54}, +{0x1.1e00019c84248p+0, -0x1.082fbe4da5dap-54}, +{0x1.220000ffe5e6ep+0, -0x1.8fdd04c9cfb43p-55}, +{0x1.26000269fd891p+0, 0x1.cfe2a7994d182p-55}, +{0x1.2a00029a6e6dap+0, -0x1.00273715e8bc5p-56}, +{0x1.2dfffe0293e39p+0, 
0x1.b7c39dab2a6f9p-54}, +{0x1.31ffff7dcf082p+0, 0x1.df1336edc5254p-56}, +{0x1.35ffff05a8b6p+0, -0x1.e03564ccd31ebp-54}, +{0x1.3a0002e0eaeccp+0, 0x1.5f0e74bd3a477p-56}, +{0x1.3e000043bb236p+0, 0x1.c7dcb149d8833p-54}, +{0x1.4200002d187ffp+0, 0x1.e08afcf2d3d28p-56}, +{0x1.460000d387cb1p+0, 0x1.20837856599a6p-55}, +{0x1.4a00004569f89p+0, -0x1.9fa5c904fbcd2p-55}, +{0x1.4e000043543f3p+0, -0x1.81125ed175329p-56}, +{0x1.51fffcc027f0fp+0, 0x1.883d8847754dcp-54}, +{0x1.55ffffd87b36fp+0, -0x1.709e731d02807p-55}, +{0x1.59ffff21df7bap+0, 0x1.7f79f68727b02p-55}, +{0x1.5dfffebfc3481p+0, -0x1.180902e30e93ep-54}, +}, +#endif +}; diff --git a/sw/math/src/math/log2_data.h b/sw/math/src/math/log2_data.h new file mode 100644 index 000000000..276a786d1 --- /dev/null +++ b/sw/math/src/math/log2_data.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _LOG2_DATA_H +#define _LOG2_DATA_H + +#include + +#define LOG2_TABLE_BITS 6 +#define LOG2_POLY_ORDER 7 +#define LOG2_POLY1_ORDER 11 +extern hidden const struct log2_data { + double invln2hi; + double invln2lo; + double poly[LOG2_POLY_ORDER - 1]; + double poly1[LOG2_POLY1_ORDER - 1]; + struct { + double invc, logc; + } tab[1 << LOG2_TABLE_BITS]; +#if !__FP_FAST_FMA + struct { + double chi, clo; + } tab2[1 << LOG2_TABLE_BITS]; +#endif +} __log2_data; + +#endif diff --git a/sw/math/src/math/log2f.c b/sw/math/src/math/log2f.c new file mode 100644 index 000000000..c368f88f3 --- /dev/null +++ b/sw/math/src/math/log2f.c @@ -0,0 +1,72 @@ +/* + * Single-precision log2 function. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "log2f_data.h" + +/* +LOG2F_TABLE_BITS = 4 +LOG2F_POLY_ORDER = 4 + +ULP error: 0.752 (nearest rounding.) +Relative error: 1.9 * 2^-26 (before rounding.) 
+*/ + +#define N (1 << LOG2F_TABLE_BITS) +#define T __log2f_data.tab +#define A __log2f_data.poly +#define OFF 0x3f330000 + +float log2f(float x) +{ + double_t z, r, r2, p, y, y0, invc, logc; + uint32_t ix, iz, top, tmp; + int k, i; + + ix = asuint(x); + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && predict_false(ix == 0x3f800000)) + return 0; + if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { + /* x < 0x1p-126 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzerof(1); + if (ix == 0x7f800000) /* log2(inf) == inf. */ + return x; + if ((ix & 0x80000000) || ix * 2 >= 0xff000000) + return __math_invalidf(x); + /* x is subnormal, normalize it. */ + ix = asuint(x * 0x1p23f); + ix -= 23 << 23; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (23 - LOG2F_TABLE_BITS)) % N; + top = tmp & 0xff800000; + iz = ix - top; + k = (int32_t)tmp >> 23; /* arithmetic shift */ + invc = T[i].invc; + logc = T[i].logc; + z = (double_t)asfloat(iz); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ + r = z * invc - 1; + y0 = logc + (double_t)k; + + /* Pipelined polynomial evaluation to approximate log1p(r)/ln2. */ + r2 = r * r; + y = A[1] * r + A[2]; + y = A[0] * r2 + y; + p = A[3] * r + y0; + y = y * r2 + p; + return eval_as_float(y); +} diff --git a/sw/math/src/math/log2f_data.c b/sw/math/src/math/log2f_data.c new file mode 100644 index 000000000..24e450f1e --- /dev/null +++ b/sw/math/src/math/log2f_data.c @@ -0,0 +1,33 @@ +/* + * Data definition for log2f. + * + * Copyright (c) 2017-2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "log2f_data.h" + +const struct log2f_data __log2f_data = { + .tab = { + { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 }, + { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 }, + { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 }, + { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 }, + { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 }, + { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 }, + { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 }, + { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 }, + { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 }, + { 0x1p+0, 0x0p+0 }, + { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 }, + { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 }, + { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 }, + { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 }, + { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 }, + { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 }, + }, + .poly = { + -0x1.712b6f70a7e4dp-2, 0x1.ecabf496832ep-2, -0x1.715479ffae3dep-1, + 0x1.715475f35c8b8p0, + } +}; diff --git a/sw/math/src/math/log2f_data.h b/sw/math/src/math/log2f_data.h new file mode 100644 index 000000000..4fa489560 --- /dev/null +++ b/sw/math/src/math/log2f_data.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _LOG2F_DATA_H +#define _LOG2F_DATA_H + +#include + +#define LOG2F_TABLE_BITS 4 +#define LOG2F_POLY_ORDER 4 +extern hidden const struct log2f_data { + struct { + double invc, logc; + } tab[1 << LOG2F_TABLE_BITS]; + double poly[LOG2F_POLY_ORDER]; +} __log2f_data; + +#endif diff --git a/sw/math/src/math/sqrt.c b/sw/math/src/math/sqrt.c new file mode 100644 index 000000000..5ba265596 --- /dev/null +++ b/sw/math/src/math/sqrt.c @@ -0,0 +1,158 @@ +#include +#include +#include "libm.h" +#include "sqrt_data.h" + +#define FENV_SUPPORT 1 + +/* returns a*b*2^-32 - e, with error 0 <= e < 1. 
*/ +static inline uint32_t mul32(uint32_t a, uint32_t b) +{ + return (uint64_t)a*b >> 32; +} + +/* returns a*b*2^-64 - e, with error 0 <= e < 3. */ +static inline uint64_t mul64(uint64_t a, uint64_t b) +{ + uint64_t ahi = a>>32; + uint64_t alo = a&0xffffffff; + uint64_t bhi = b>>32; + uint64_t blo = b&0xffffffff; + return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32); +} + +double sqrt(double x) +{ + uint64_t ix, top, m; + + /* special case handling. */ + ix = asuint64(x); + top = ix >> 52; + if (predict_false(top - 0x001 >= 0x7ff - 0x001)) { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return x; + if (ix == 0x7ff0000000000000) + return x; + if (ix > 0x7ff0000000000000) + return __math_invalid(x); + /* x is subnormal, normalize it. */ + ix = asuint64(x * 0x1p52); + top = ix >> 52; + top -= 52; + } + + /* argument reduction: + x = 4^e m; with integer e, and m in [1, 4) + m: fixed point representation [2.62] + 2^e is the exponent part of the result. */ + int even = top & 1; + m = (ix << 11) | 0x8000000000000000; + if (even) m >>= 1; + top = (top + 0x3ff) >> 1; + + /* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4) + + initial estimate: + 7bit table lookup (1bit exponent and 6bit significand). + + iterative approximation: + using 2 goldschmidt iterations with 32bit int arithmetics + and a final iteration with 64bit int arithmetics. + + details: + + the relative error (e = r0 sqrt(m)-1) of a linear estimate + (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best, + a table lookup is faster and needs one less iteration + 6 bit lookup table (128b) gives |e| < 0x1.f9p-8 + 7 bit lookup table (256b) gives |e| < 0x1.fdp-9 + for single and double prec 6bit is enough but for quad + prec 7bit is needed (or modified iterations). to avoid + one more iteration >=13bit table would be needed (16k). 
+ + a newton-raphson iteration for r is + w = r*r + u = 3 - m*w + r = r*u/2 + can use a goldschmidt iteration for s at the end or + s = m*r + + first goldschmidt iteration is + s = m*r + u = 3 - s*r + r = r*u/2 + s = s*u/2 + next goldschmidt iteration is + u = 3 - s*r + r = r*u/2 + s = s*u/2 + and at the end r is not computed only s. + + they use the same amount of operations and converge at the + same quadratic rate, i.e. if + r1 sqrt(m) - 1 = e, then + r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3 + the advantage of goldschmidt is that the mul for s and r + are independent (computed in parallel), however it is not + "self synchronizing": it only uses the input m in the + first iteration so rounding errors accumulate. at the end + or when switching to larger precision arithmetics rounding + errors dominate so the first iteration should be used. + + the fixed point representations are + m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30 + and after switching to 64 bit + m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62 */ + + static const uint64_t three = 0xc0000000; + uint64_t r, s, d, u, i; + + i = (ix >> 46) % 128; + r = (uint32_t)__rsqrt_tab[i] << 16; + /* |r sqrt(m) - 1| < 0x1.fdp-9 */ + s = mul32(m>>32, r); + /* |s/sqrt(m) - 1| < 0x1.fdp-9 */ + d = mul32(s, r); + u = three - d; + r = mul32(r, u) << 1; + /* |r sqrt(m) - 1| < 0x1.7bp-16 */ + s = mul32(s, u) << 1; + /* |s/sqrt(m) - 1| < 0x1.7bp-16 */ + d = mul32(s, r); + u = three - d; + r = mul32(r, u) << 1; + /* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */ + r = r << 32; + s = mul64(m, r); + d = mul64(s, r); + u = (three<<32) - d; + s = mul64(s, u); /* repr: 3.61 */ + /* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */ + s = (s - 2) >> 9; /* repr: 12.52 */ + /* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */ + + /* s < sqrt(m) < s + 0x1.09p-52, + compute nearest rounded result: + the nearest result to 52 bits is either s or s+0x1p-52, + we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m. 
*/ + uint64_t d0, d1, d2; + double y, t; + d0 = (m << 42) - s*s; + d1 = s - d0; + d2 = d1 + s + 1; + s += d1 >> 63; + s &= 0x000fffffffffffff; + s |= top << 52; + y = asdouble(s); + if (FENV_SUPPORT) { + /* handle rounding modes and inexact exception: + only (s+1)^2 == 2^42 m case is exact otherwise + add a tiny value to cause the fenv effects. */ + uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000; + tiny |= (d1^d2) & 0x8000000000000000; + t = asdouble(tiny); + y = eval_as_double(y + t); + } + return y; +} diff --git a/sw/math/src/math/sqrt_data.c b/sw/math/src/math/sqrt_data.c new file mode 100644 index 000000000..61bc22f43 --- /dev/null +++ b/sw/math/src/math/sqrt_data.c @@ -0,0 +1,19 @@ +#include "sqrt_data.h" +const uint16_t __rsqrt_tab[128] = { +0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43, +0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b, +0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1, +0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430, +0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59, +0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925, +0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479, +0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040, +0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234, +0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2, +0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1, +0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192, +0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f, +0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4, +0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59, +0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560, +}; diff --git a/sw/math/src/math/sqrt_data.h b/sw/math/src/math/sqrt_data.h new file mode 100644 index 000000000..260c7f9c2 --- /dev/null +++ b/sw/math/src/math/sqrt_data.h @@ -0,0 +1,13 @@ +#ifndef _SQRT_DATA_H +#define _SQRT_DATA_H + +#include +#include + +/* if x in [1,2): i = (int)(64*x); + 
if x in [2,4): i = (int)(32*x-64); + __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error: + |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 */ +extern hidden const uint16_t __rsqrt_tab[128]; + +#endif diff --git a/sw/math/src/math/sqrtf.c b/sw/math/src/math/sqrtf.c new file mode 100644 index 000000000..740d81cba --- /dev/null +++ b/sw/math/src/math/sqrtf.c @@ -0,0 +1,83 @@ +#include +#include +#include "libm.h" +#include "sqrt_data.h" + +#define FENV_SUPPORT 1 + +static inline uint32_t mul32(uint32_t a, uint32_t b) +{ + return (uint64_t)a*b >> 32; +} + +/* see sqrt.c for more detailed comments. */ + +float sqrtf(float x) +{ + uint32_t ix, m, m1, m0, even, ey; + + ix = asuint(x); + if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { + /* x < 0x1p-126 or inf or nan. */ + if (ix * 2 == 0) + return x; + if (ix == 0x7f800000) + return x; + if (ix > 0x7f800000) + return __math_invalidf(x); + /* x is subnormal, normalize it. */ + ix = asuint(x * 0x1p23f); + ix -= 23 << 23; + } + + /* x = 4^e m; with int e and m in [1, 4). */ + even = ix & 0x00800000; + m1 = (ix << 8) | 0x80000000; + m0 = (ix << 7) & 0x7fffffff; + m = even ? m0 : m1; + + /* 2^e is the exponent part of the return value. */ + ey = ix >> 1; + ey += 0x3f800000 >> 1; + ey &= 0x7f800000; + + /* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations. */ + static const uint32_t three = 0xc0000000; + uint32_t r, s, d, u, i; + i = (ix >> 17) % 128; + r = (uint32_t)__rsqrt_tab[i] << 16; + /* |r*sqrt(m) - 1| < 0x1p-8 */ + s = mul32(m, r); + /* |s/sqrt(m) - 1| < 0x1p-8 */ + d = mul32(s, r); + u = three - d; + r = mul32(r, u) << 1; + /* |r*sqrt(m) - 1| < 0x1.7bp-16 */ + s = mul32(s, u) << 1; + /* |s/sqrt(m) - 1| < 0x1.7bp-16 */ + d = mul32(s, r); + u = three - d; + s = mul32(s, u); + /* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */ + s = (s - 1)>>6; + /* s < sqrt(m) < s + 0x1.08p-23 */ + + /* compute nearest rounded result. 
*/ + uint32_t d0, d1, d2; + float y, t; + d0 = (m << 16) - s*s; + d1 = s - d0; + d2 = d1 + s + 1; + s += d1 >> 31; + s &= 0x007fffff; + s |= ey; + y = asfloat(s); + if (FENV_SUPPORT) { + /* handle rounding and inexact exception. */ + uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000; + tiny |= (d1^d2) & 0x80000000; + t = asfloat(tiny); + y = eval_as_float(y + t); + } + return y; +} diff --git a/sw/math/src/math/tanh.c b/sw/math/src/math/tanh.c index 20d6dbcf4..2481db1dc 100644 --- a/sw/math/src/math/tanh.c +++ b/sw/math/src/math/tanh.c @@ -6,16 +6,23 @@ */ double tanh(double x) { - union {double f; uint64_t i;} u = {.f = x}; uint32_t w; int sign; double_t t; /* x = |x| */ - sign = u.i >> 63; - u.i &= (uint64_t)-1/2; - x = u.f; - w = u.i >> 32; + /// Original implementation + // union {double f; uint64_t i;} u = {.f = x}; + // sign = u.i >> 63; + // u.i &= (uint64_t)-1/2; + // x = u.f; + // w = u.i >> 32; + /// Safe implementation in Snitch + uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x); + sign = upper_32b_x >> 31; + uint32_t sign_mask = (~(1 << 31)); + w = upper_32b_x & sign_mask; + safe_inject_into_upper_32b_double(w, &x); if (w > 0x3fe193ea) { /* |x| > log(3)/2 ~= 0.5493 or nan */ diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index 7c94acdd9..169e54d7b 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -8,43 +8,49 @@ typedef uint32_t snrt_dma_txid_t; /// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers. 
inline snrt_dma_txid_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, size_t size) { - register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 - register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11 - register uint32_t reg_src_low asm("a2") = src >> 0; // 12 - register uint32_t reg_src_high asm("a3") = src >> 32; // 13 - register uint32_t reg_size asm("a4") = size; // 14 - - // dmsrc a2, a3 - asm volatile( - ".word (0b0000000 << 25) | \ - ( (13) << 20) | \ - ( (12) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" ::"r"(reg_src_high), - "r"(reg_src_low)); - - // dmdst a0, a1 - asm volatile( - ".word (0b0000001 << 25) | \ - ( (11) << 20) | \ - ( (10) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" ::"r"(reg_dst_high), - "r"(reg_dst_low)); - - // dmcpyi a0, a4, 0b00 - register uint32_t reg_txid asm("a0"); // 10 - asm volatile( - ".word (0b0000010 << 25) | \ - ( 0b00000 << 20) | \ - ( (14) << 15) | \ - ( 0b000 << 12) | \ - ( (10) << 7) | \ - (0b0101011 << 0) \n" - : "=r"(reg_txid) - : "r"(reg_size)); - - return reg_txid; + // Current DMA does not allow transfers with size == 0 (blocks) + // TODO(colluca) remove this check once new DMA is integrated + if (size > 0) { + register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 + register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11 + register uint32_t reg_src_low asm("a2") = src >> 0; // 12 + register uint32_t reg_src_high asm("a3") = src >> 32; // 13 + register uint32_t reg_size asm("a4") = size; // 14 + + // dmsrc a2, a3 + asm volatile( + ".word (0b0000000 << 25) | \ + ( (13) << 20) | \ + ( (12) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" ::"r"(reg_src_high), + "r"(reg_src_low)); + + // dmdst a0, a1 + asm volatile( + ".word (0b0000001 << 25) | \ + ( (11) << 20) | \ + ( (10) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" ::"r"(reg_dst_high), + "r"(reg_dst_low)); + + // dmcpyi a0, a4, 0b00 + register uint32_t reg_txid asm("a0"); // 10 + asm volatile( + ".word 
(0b0000010 << 25) | \ + ( 0b00000 << 20) | \ + ( (14) << 15) | \ + ( 0b000 << 12) | \ + ( (10) << 7) | \ + (0b0101011 << 0) \n" + : "=r"(reg_txid) + : "r"(reg_size)); + + return reg_txid; + } else { + return -1; + } } /// Initiate an asynchronous 1D DMA transfer. @@ -58,65 +64,71 @@ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat) { - register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 - register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11 - register uint32_t reg_src_low asm("a2") = src >> 0; // 12 - register uint32_t reg_src_high asm("a3") = src >> 32; // 13 - register uint32_t reg_size asm("a4") = size; // 14 - register uint32_t reg_dst_stride asm("a5") = dst_stride; // 15 - register uint32_t reg_src_stride asm("a6") = src_stride; // 16 - register uint32_t reg_repeat asm("a7") = repeat; // 17 - - // dmsrc a0, a1 - asm volatile( - ".word (0b0000000 << 25) | \ - ( (13) << 20) | \ - ( (12) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" ::"r"(reg_src_high), - "r"(reg_src_low)); - - // dmdst a0, a1 - asm volatile( - ".word (0b0000001 << 25) | \ - ( (11) << 20) | \ - ( (10) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" ::"r"(reg_dst_high), - "r"(reg_dst_low)); - - // dmstr a5, a6 - asm volatile( - ".word (0b0000110 << 25) | \ - ( (15) << 20) | \ - ( (16) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" - : - : "r"(reg_dst_stride), "r"(reg_src_stride)); - - // dmrep a7 - asm volatile( - ".word (0b0000111 << 25) | \ - ( (17) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" - : - : "r"(reg_repeat)); - - // dmcpyi a0, a4, 0b10 - register uint32_t reg_txid asm("a0"); // 10 - asm volatile( - ".word (0b0000010 << 25) | \ - ( 0b00010 << 20) | \ - ( (14) << 15) | \ - ( 0b000 << 12) | \ - ( (10) << 7) | \ - (0b0101011 << 0) \n" - : "=r"(reg_txid) - : "r"(reg_size)); - - return reg_txid; + // Current DMA does not allow transfers with size == 0 
(blocks) + // TODO(colluca) remove this check once new DMA is integrated + if (size > 0) { + register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 + register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11 + register uint32_t reg_src_low asm("a2") = src >> 0; // 12 + register uint32_t reg_src_high asm("a3") = src >> 32; // 13 + register uint32_t reg_size asm("a4") = size; // 14 + register uint32_t reg_dst_stride asm("a5") = dst_stride; // 15 + register uint32_t reg_src_stride asm("a6") = src_stride; // 16 + register uint32_t reg_repeat asm("a7") = repeat; // 17 + + // dmsrc a0, a1 + asm volatile( + ".word (0b0000000 << 25) | \ + ( (13) << 20) | \ + ( (12) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" ::"r"(reg_src_high), + "r"(reg_src_low)); + + // dmdst a0, a1 + asm volatile( + ".word (0b0000001 << 25) | \ + ( (11) << 20) | \ + ( (10) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" ::"r"(reg_dst_high), + "r"(reg_dst_low)); + + // dmstr a5, a6 + asm volatile( + ".word (0b0000110 << 25) | \ + ( (15) << 20) | \ + ( (16) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" + : + : "r"(reg_dst_stride), "r"(reg_src_stride)); + + // dmrep a7 + asm volatile( + ".word (0b0000111 << 25) | \ + ( (17) << 15) | \ + ( 0b000 << 12) | \ + (0b0101011 << 0) \n" + : + : "r"(reg_repeat)); + + // dmcpyi a0, a4, 0b10 + register uint32_t reg_txid asm("a0"); // 10 + asm volatile( + ".word (0b0000010 << 25) | \ + ( 0b00010 << 20) | \ + ( (14) << 15) | \ + ( 0b000 << 12) | \ + ( (10) << 7) | \ + (0b0101011 << 0) \n" + : "=r"(reg_txid) + : "r"(reg_size)); + + return reg_txid; + } else { + return -1; + } } /// Initiate an asynchronous 2D DMA transfer. diff --git a/sw/snRuntime/src/dump.h b/sw/snRuntime/src/dump.h index 8f24cc1b9..1d65395b5 100644 --- a/sw/snRuntime/src/dump.h +++ b/sw/snRuntime/src/dump.h @@ -4,6 +4,7 @@ // // Authors: Samuel Riedel, ETH Zurich // Viviane Potocnik, ETH Zurich +// Luca Colagrande, ETH Zurich // Dump a value via CSR // !!! 
Careful: This is only supported in simulation and an experimental @@ -11,18 +12,14 @@ // This can be exploited to quickly print measurement values from all cores // simultaneously without the hassle of printf. To specify multiple metrics, // different CSRs can be used. The macro will define a function that will then -// always print via the same CSR. E.g., `dump(errors, 8)` will define a function -// with the following signature: `dump_errors(uint32_t val)`, which will print -// the given value via the 8th register. Alternatively, the `write_csr(reg, -// val)` macro can be used directly. +// always print via the same CSR. E.g., `dump(uint32_t, errors, 8)` will define +// a function with the following signature: `dump_errors(uint32_t val)`, which +// will print the given value via the 8th register. Alternatively, the +// `write_csr(reg, val)` macro can be used directly. -#define dump_float(name, reg) \ - static __attribute__((always_inline)) inline void dump_##name(float val) { \ - asm volatile("csrw " #reg ", %0" ::"rK"(val)); \ +#define NAMED_DUMP(type, name, reg) \ + static __attribute__((always_inline)) inline void dump_##name(type val) { \ + asm volatile("csrw " #reg ", %0" ::"rK"(val)); \ } -#define dump_uint(name, reg) \ - static \ - __attribute__((always_inline)) inline void dump_##name(uint32_t val) { \ - asm volatile("csrw " #reg ", %0" ::"rK"(val)); \ - } \ No newline at end of file +#define DUMP(val) ({ asm volatile("csrw 0x7C3, %0" ::"rK"(val)); }) diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c index 3fb338f4a..4e4cd2152 100644 --- a/sw/snRuntime/src/start.c +++ b/sw/snRuntime/src/start.c @@ -20,22 +20,38 @@ static inline void snrt_init_tls() { extern volatile uint32_t __tdata_start, __tdata_end; extern volatile uint32_t __tbss_start, __tbss_end; - volatile uint32_t* p; - volatile uint32_t* tls_ptr; + size_t size; + volatile uint32_t tls_ptr; - asm volatile("mv %0, tp" : "=r"(tls_ptr) : :); - - // Copy tdata section - for (p = 
(uint32_t*)(&__tdata_start); p < (uint32_t*)(&__tdata_end); p++) { - *tls_ptr = *p; - tls_ptr++; + // To avoid contentions in main memory, and take advantage of the + // bandwidth of the DMA, the DM core initializes the TLS section + // for every core in a cluster. + if (snrt_is_dm_core()) { + size = (size_t)(&__tdata_end) - (size_t)(&__tdata_start); + + // First initialize the DM core's .tdata section from main memory + asm volatile("mv %0, tp" : "=r"(tls_ptr) : :); + snrt_dma_start_1d((void*)tls_ptr, (void*)(&__tdata_start), size); + + // Then initialize all other cores' .tdata sections from the DM + // core's. The offset between the TLS section of successive cores + // is defined in start.S + size_t tls_offset = (1 << SNRT_LOG2_STACK_SIZE) + 8; + for (int i = 1; i < snrt_cluster_core_num(); i++) { + snrt_dma_start_1d((void*)(tls_ptr + i * tls_offset), (void*)tls_ptr, + size); + } + + // Initialize all cores' .tbss sections + tls_ptr += size; + size = (size_t)(&__tbss_end) - (size_t)(&__tbss_start); + for (int i = 0; i < snrt_cluster_core_num(); i++) { + snrt_dma_start_1d((void*)(tls_ptr + i * tls_offset), + (void*)(snrt_zero_memory_ptr()), size); + } } - // Clear tbss section - for (p = (uint32_t*)(&__tbss_start); p < (uint32_t*)(&__tbss_end); p++) { - *tls_ptr = 0; - tls_ptr++; - } + snrt_cluster_hw_barrier(); } #endif @@ -66,7 +82,7 @@ static inline void snrt_init_cls() { // Copy cdata section to base of the TCDM size = (size_t)(&__cdata_end) - (size_t)(&__cdata_start); - if (size > 0) snrt_dma_start_1d(ptr, (void*)(&__cdata_start), size); + snrt_dma_start_1d(ptr, (void*)(&__cdata_start), size); // Clear cbss section ptr = (void*)((uint32_t)ptr + size); diff --git a/target/common/common.mk b/target/common/common.mk index 6b9c679d0..0cf03c463 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -2,26 +2,41 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 -LOGS_DIR ?= logs -TB_DIR ?= $(SNITCH_ROOT)/target/common/test -UTIL_DIR ?= $(SNITCH_ROOT)/util +# Makefile invocation +DEBUG ?= OFF # ON to turn on wave logging + +# Directories +LOGS_DIR ?= logs +TB_DIR ?= $(SNITCH_ROOT)/target/common/test +UTIL_DIR ?= $(SNITCH_ROOT)/util + +# SEPP packages +QUESTA_SEPP ?= +VCS_SEPP ?= +VERILATOR_SEPP ?= # External executables -BENDER ?= bender -DASM ?= spike-dasm -VLT ?= verilator -VERIBLE_FMT ?= verible-verilog-format -CLANG_FORMAT ?= clang-format +BENDER ?= bender +DASM ?= spike-dasm +VLT ?= $(VERILATOR_SEPP) verilator +VCS ?= $(VCS_SEPP) vcs +VERIBLE_FMT ?= verible-verilog-format +CLANG_FORMAT ?= clang-format +VSIM ?= $(QUESTA_SEPP) vsim +VOPT ?= $(QUESTA_SEPP) vopt +VLOG ?= $(QUESTA_SEPP) vlog +VLIB ?= $(QUESTA_SEPP) vlib # Internal executables -BIN2JTAG ?= $(UTIL_DIR)/bin2jtag.py -GENTRACE ?= $(UTIL_DIR)/trace/gen_trace.py -ANNOTATE_PY ?= $(UTIL_DIR)/trace/annotate.py -EVENTS_PY ?= $(UTIL_DIR)/trace/events.py -PERF_CSV_PY ?= $(UTIL_DIR)/trace/perf_csv.py +GENTRACE_PY ?= $(UTIL_DIR)/trace/gen_trace.py +ANNOTATE_PY ?= $(UTIL_DIR)/trace/annotate.py +EVENTS_PY ?= $(UTIL_DIR)/trace/events.py +PERF_CSV_PY ?= $(UTIL_DIR)/trace/perf_csv.py +LAYOUT_EVENTS_PY ?= $(UTIL_DIR)/trace/layout_events.py +EVENTVIS_PY ?= $(UTIL_DIR)/trace/eventvis.py -VERILATOR_ROOT ?= $(dir $(shell which $(VLT)))/../share/verilator -VLT_ROOT ?= ${VERILATOR_ROOT} +VERILATOR_ROOT ?= $(dir $(shell $(VERILATOR_SEPP) which verilator)).. 
+VLT_ROOT ?= ${VERILATOR_ROOT} MATCH_END := '/+incdir+/ s/$$/\/*\/*/' MATCH_BGN := 's/+incdir+//g' @@ -29,7 +44,14 @@ SED_SRCS := sed -e ${MATCH_END} -e ${MATCH_BGN} VSIM_BENDER += -t test -t rtl -t simulation -t vsim VSIM_SOURCES = $(shell ${BENDER} script flist ${VSIM_BENDER} | ${SED_SRCS}) -VSIM_BUILDDIR := work-vsim +VSIM_BUILDDIR ?= work-vsim +VSIM_FLAGS += -t 1ps +ifeq ($(DEBUG), ON) +VSIM_FLAGS += -do "log -r /*; run -a" +VOPT_FLAGS = +acc +else +VSIM_FLAGS += -do "run -a" +endif # VCS_BUILDDIR should to be the same as the `DEFAULT : ./work-vcs` # in target/snitch_cluster/synopsys_sim.setup @@ -38,8 +60,8 @@ VCS_SOURCES = $(shell ${BENDER} script flist ${VCS_BENDER} | ${SED_SRCS}) VCS_BUILDDIR := work-vcs # fesvr is being installed here -FESVR ?= ${MKFILE_DIR}work -FESVR_VERSION ?= 35d50bc40e59ea1d5566fbd3d9226023821b1bb6 +FESVR ?= ${MKFILE_DIR}work +FESVR_VERSION ?= 35d50bc40e59ea1d5566fbd3d9226023821b1bb6 VLT_BENDER += -t rtl VLT_SOURCES = $(shell ${BENDER} script flist ${VLT_BENDER} | ${SED_SRCS}) @@ -146,25 +168,33 @@ endef # Modelsim # ############ +$(VSIM_BUILDDIR): + mkdir -p $@ + +# Expects vlog/vcom script in $< (e.g. as output by bender) +# Expects the top module name in $1 +# Produces a binary used to run the simulation at the path specified by $@ define QUESTASIM - ${VSIM} -c -do "source $<; quit" | tee $(dir $<)vsim.log - @! grep -P "Errors: [1-9]*," $(dir $<)vsim.log - @mkdir -p bin + ${VSIM} -c -do "source $<; quit" | tee $(dir $<)vlog.log + @! grep -P "Errors: [1-9]*," $(dir $<)vlog.log + $(VOPT) $(VOPT_FLAGS) -work $(VSIM_BUILDDIR) $1 -o $(1)_opt | tee $(dir $<)vopt.log + @! 
grep -P "Errors: [1-9]*," $(dir $<)vopt.log + @mkdir -p $(dir $@) @echo "#!/bin/bash" > $@ - @echo 'binary=$$(realpath --relative-to=${MKFILE_DIR} $$1)' >> $@ - @echo 'cd ${MKFILE_DIR}' >> $@ + @echo 'binary=$$(realpath $$1)' >> $@ + @echo 'mkdir -p $(LOGS_DIR)' >> $@ @echo 'echo $$binary > $(LOGS_DIR)/.rtlbinary' >> $@ @echo '${VSIM} +permissive ${VSIM_FLAGS} $$3 -work ${MKFILE_DIR}/${VSIM_BUILDDIR} -c \ -ldflags "-Wl,-rpath,${FESVR}/lib -L${FESVR}/lib -lfesvr -lutil" \ - $1 +permissive-off ++$$binary ++$$2' >> $@ + $(1)_opt +permissive-off ++$$binary ++$$2' >> $@ @chmod +x $@ @echo "#!/bin/bash" > $@.gui - @echo 'binary=$$(pwd)/$$1' >> $@.gui - @echo 'cd ${MKFILE_DIR}' >> $@.gui + @echo 'binary=$$(realpath $$1)' >> $@.gui + @echo 'mkdir -p $(LOGS_DIR)' >> $@.gui @echo 'echo $$binary > $(LOGS_DIR)/.rtlbinary' >> $@.gui @echo '${VSIM} +permissive ${VSIM_FLAGS} -work ${MKFILE_DIR}/${VSIM_BUILDDIR} \ -ldflags "-Wl,-rpath,${FESVR}/lib -L${FESVR}/lib -lfesvr -lutil" \ - $1 +permissive-off ++$$binary ++$$2' >> $@.gui + $(1)_opt +permissive-off ++$$binary ++$$2' >> $@.gui @chmod +x $@.gui endef @@ -175,7 +205,7 @@ $(VCS_BUILDDIR)/compile.sh: mkdir -p $(VCS_BUILDDIR) ${BENDER} script vcs ${VCS_BENDER} --vlog-arg="${VLOGAN_FLAGS}" --vcom-arg="${VHDLAN_FLAGS}" > $@ chmod +x $@ - $@ > $(VCS_BUILDDIR)/compile.log + $(VCS_SEPP) $@ > $(VCS_BUILDDIR)/compile.log ######## # Util # @@ -189,26 +219,56 @@ define reggen_generate_header @$(CLANG_FORMAT) -i $1 endef -$(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE) - $(DASM) < $< | $(PYTHON) $(GENTRACE) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt +# Arg 1: binary +# Arg 2: max size in bytes +define BINARY_SIZE_CHECK + echo "Binary size: $$(stat -c %s $(1))B" + @[ "$$(stat -c %s $(1))" -lt "$(2)" ] || (echo "Binary exceeds specified size of $(2)B"; exit 1) +endef + +########## +# Traces # +########## + +DASM_TRACES = $(shell (ls 
$(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null)) +TXT_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.txt/g')) +PERF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g')) +ANNOTATED_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.s/g')) +DIFF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.diff/g')) -traces: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.txt/') || echo "") \ - $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/') || echo "") +GENTRACE_OUTPUTS = $(TXT_TRACES) $(PERF_TRACES) +ANNOTATE_OUTPUTS = $(ANNOTATED_TRACES) +PERF_CSV = $(LOGS_DIR)/perf.csv +EVENT_CSV = $(LOGS_DIR)/event.csv +TRACE_CSV = $(LOGS_DIR)/trace.csv +TRACE_JSON = $(LOGS_DIR)/trace.json + +.PHONY: traces annotate perf-csv event-csv layout +traces: $(GENTRACE_OUTPUTS) +annotate: $(ANNOTATE_OUTPUTS) +perf-csv: $(PERF_CSV) +event-csv: $(EVENT_CSV) +layout: $(TRACE_CSV) $(TRACE_JSON) + +$(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) + $(DASM) < $< | $(PYTHON) $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt -# make annotate # Generate source-code interleaved traces for all harts. 
Reads the binary from # the logs/.rtlbinary file that is written at start of simulation in the vsim script +BINARY ?= $(shell cat $(LOGS_DIR)/.rtlbinary) $(LOGS_DIR)/trace_hart_%.s: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY} $(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< $(LOGS_DIR)/trace_hart_%.diff: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY} $(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -d -BINARY ?= $(shell cat $(LOGS_DIR)/.rtlbinary) -annotate: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.s/') || echo "") \ - $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/\.dasm/\.diff/') || echo "") -# Arg 1: binary -# Arg 2: max size in bytes -define BINRAY_SIZE_CHECK - echo "Binary size: $$(stat -c %s $(1))B" - @[ "$$(stat -c %s $(1))" -lt "$(2)" ] || (echo "Binary exceeds specified size of $(2)B"; exit 1) -endef +$(PERF_CSV): $(PERF_TRACES) $(PERF_CSV_PY) + $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES) + +$(EVENT_CSV): $(PERF_TRACES) $(PERF_CSV_PY) + $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES) --filter tstart tend + +$(TRACE_CSV): $(EVENT_CSV) $(LAYOUT_FILE) $(LAYOUT_EVENTS_PY) + $(PYTHON) $(LAYOUT_EVENTS_PY) $(LAYOUT_EVENTS_FLAGS) $(EVENT_CSV) $(LAYOUT_FILE) -o $@ + +$(TRACE_JSON): $(TRACE_CSV) $(EVENTVIS_PY) + $(PYTHON) $(EVENTVIS_PY) -o $@ $(TRACE_CSV) diff --git a/target/common/test/ipc.cc b/target/common/test/ipc.cc index 5eaffcf85..09188a7b3 100644 --- a/target/common/test/ipc.cc +++ b/target/common/test/ipc.cc @@ -19,60 +19,67 @@ void* IpcIface::ipc_thread_handle(void* in) { // Handle commands ipc_op_t op; - while (!feof(tx)) { - uint8_t ret_value = fread(&op, sizeof(ipc_op_t), 1, tx); - if (ret_value != 1) { - if (ferror(tx)) { - continue; // jumps to while() again - } - } - switch (op.opcode) { - case Read: - // Read full blocks until one full block or less left - printf("[IPC] Read from 0x%x len 0x%x ...\n", op.addr, op.len); - for (uint64_t i = op.len; i > IPC_BUF_SIZE; i -= 
IPC_BUF_SIZE) { - sim::MEM.read(op.addr, IPC_BUF_SIZE, buf_data); - fwrite(buf_data, IPC_BUF_SIZE, 1, rx); - op.addr += IPC_BUF_SIZE; - op.len -= IPC_BUF_SIZE; - } - sim::MEM.read(op.addr, op.len, buf_data); - fwrite(buf_data, op.len, 1, rx); - fflush(rx); - break; - case Write: - // Write full blocks until one full block or less left - printf("[IPC] Write to 0x%x len %d ...\n", op.addr, op.len); - for (uint64_t i = op.len; i > IPC_BUF_SIZE; i -= IPC_BUF_SIZE) { - fread(buf_data, IPC_BUF_SIZE, 1, tx); - sim::MEM.write(op.addr, IPC_BUF_SIZE, buf_data, buf_strb); - op.addr += IPC_BUF_SIZE; - op.len -= IPC_BUF_SIZE; - } - fread(buf_data, op.len, 1, tx); - sim::MEM.write(op.addr, op.len, buf_data, buf_strb); - break; - case Poll: - // Unpack 32b checking mask and expected value from length - uint32_t mask = op.len & 0xFFFFFFFF; - uint32_t expected = (op.len >> 32) & 0xFFFFFFFF; - printf("[IPC] Poll on 0x%x mask 0x%x expected 0x%x ...\n", - op.addr, mask, expected); - uint32_t read; - do { - sim::MEM.read(op.addr, sizeof(uint32_t), - (uint8_t*)(void*)&read); - nanosleep( - (const struct timespec[]){{0, IPC_POLL_PERIOD_NS}}, - NULL); - } while ((read & mask) == (expected & mask)); - // Send back read 32b word - fwrite(&read, sizeof(uint32_t), 1, rx); - fflush(rx); + while (1) { + if (!fread(&op, sizeof(ipc_op_t), 1, tx)) { + if (feof(tx)) { + printf( + "[IPC] All messages read. 
Closing FIFOs and joining main " + "thread.\n"); break; + } + } else { + switch (op.opcode) { + case Read: + // Read full blocks until one full block or less left + printf("[IPC] Read from 0x%x len 0x%x ...\n", op.addr, + op.len); + for (uint64_t i = op.len; i > IPC_BUF_SIZE; + i -= IPC_BUF_SIZE) { + sim::MEM.read(op.addr, IPC_BUF_SIZE, buf_data); + fwrite(buf_data, IPC_BUF_SIZE, 1, rx); + op.addr += IPC_BUF_SIZE; + op.len -= IPC_BUF_SIZE; + } + sim::MEM.read(op.addr, op.len, buf_data); + fwrite(buf_data, op.len, 1, rx); + fflush(rx); + break; + case Write: + // Write full blocks until one full block or less left + printf("[IPC] Write to 0x%x len %d ...\n", op.addr, op.len); + for (uint64_t i = op.len; i > IPC_BUF_SIZE; + i -= IPC_BUF_SIZE) { + fread(buf_data, IPC_BUF_SIZE, 1, tx); + sim::MEM.write(op.addr, IPC_BUF_SIZE, buf_data, + buf_strb); + op.addr += IPC_BUF_SIZE; + op.len -= IPC_BUF_SIZE; + } + fread(buf_data, op.len, 1, tx); + sim::MEM.write(op.addr, op.len, buf_data, buf_strb); + break; + case Poll: + // Unpack 32b checking mask and expected value from length + uint32_t mask = op.len & 0xFFFFFFFF; + uint32_t expected = (op.len >> 32) & 0xFFFFFFFF; + printf("[IPC] Poll on 0x%x mask 0x%x expected 0x%x ...\n", + op.addr, mask, expected); + uint32_t read; + do { + sim::MEM.read(op.addr, sizeof(uint32_t), + (uint8_t*)(void*)&read); + nanosleep( + (const struct timespec[]){{0, IPC_POLL_PERIOD_NS}}, + NULL); + } while ((read & mask) == (expected & mask)); + // Send back read 32b word + fwrite(&read, sizeof(uint32_t), 1, rx); + fflush(rx); + break; + } } - printf("[IPC] ... done\n"); } + // TX FIFO closed at other end: close both FIFOs and join main thread fclose(tx); fclose(rx); diff --git a/target/common/test/verilator_lib.cc b/target/common/test/verilator_lib.cc index 63ac66d5b..3e1ae89e1 100644 --- a/target/common/test/verilator_lib.cc +++ b/target/common/test/verilator_lib.cc @@ -14,10 +14,15 @@ namespace sim { // Number of cycles between HTIF checks. 
const int HTIFTimeInterval = 200; + +// We want to return timestamp in picosecond accuracy, assuming that one cycle +// takes 1ns Since 1 cycle takes 2 sim::TIME increments, scale by 500 to get +// time = cycle * 1000 + +const int TIME_CYCLES_TO_TIMESTAMP = 500; void sim_thread_main(void *arg) { ((Sim *)arg)->main(); } // Sim time. -int TIME = 0; +vluint64_t TIME = 0; Sim::Sim(int argc, char **argv) : htif_t(argc, argv), ipc(argc, argv) { // Search arguments for `--vcd` flag and enable waves if requested @@ -78,7 +83,7 @@ void Sim::main() { } // namespace sim // Verilator callback to get the current time. -double sc_time_stamp() { return sim::TIME * 1e-9; } +double sc_time_stamp() { return sim::TIME * sim::TIME_CYCLES_TO_TIMESTAMP; } // DPI calls. void tb_memory_read(long long addr, int len, const svOpenArrayHandle data) { diff --git a/target/snitch_cluster/.gitignore b/target/snitch_cluster/.gitignore index b7f1de414..f74d9fde4 100644 --- a/target/snitch_cluster/.gitignore +++ b/target/snitch_cluster/.gitignore @@ -6,4 +6,5 @@ /work-vsim/ /work-vlt/ /work-vcs/ -/*.log \ No newline at end of file +/*.log +/runs/ \ No newline at end of file diff --git a/target/snitch_cluster/Makefile b/target/snitch_cluster/Makefile index 7b38bbad6..037621213 100644 --- a/target/snitch_cluster/Makefile +++ b/target/snitch_cluster/Makefile @@ -9,7 +9,7 @@ # Makefile invocation # ####################### -DEBUG ?= OFF # ON to turn on debugging symbols +DEBUG ?= OFF # ON to turn on debugging symbols and wave logging CFG_OVERRIDE ?= # Override default config file SELECT_RUNTIME ?= # Select snRuntime implementation: "banshee" or "rtl" (default) @@ -37,9 +37,6 @@ REGGEN ?= $(shell $(BENDER) path register_interface)/vendor/lowrisc_ope CLUSTER_GEN ?= $(ROOT)/util/clustergen.py CLUSTER_GEN_SRC ?= $(wildcard $(ROOT)/util/clustergen/*.py) -VSIM ?= vsim -VLOG ?= vlog - ######################### # Files and directories # ######################### @@ -71,9 +68,6 @@ QUESTA_64BIT = -64 VLOG_64BIT = 
-64 VSIM_FLAGS += ${QUESTA_64BIT} -VSIM_FLAGS += -t 1ps -VSIM_FLAGS += -voptargs=+acc -VSIM_FLAGS += -do "log -r /*; run -a" VLOG_FLAGS += -svinputport=compat VLOG_FLAGS += -override_timescale 1ns/1ps @@ -245,7 +239,7 @@ clean-vsim: clean-work rm -rf bin/snitch_cluster.vsim bin/snitch_cluster.vsim.gui $(VSIM_BUILDDIR) vsim.wlf ${VSIM_BUILDDIR}/compile.vsim.tcl: - vlib $(dir $@) + $(VLIB) $(dir $@) ${BENDER} script vsim ${VSIM_BENDER} --vlog-arg="${VLOG_FLAGS} -work $(dir $@) " > $@ echo '${VLOG} -work $(dir $@) ${TB_CC_SOURCES} ${TB_ASM_SOURCES} -vv -ccflags "$(TB_CC_FLAGS)"' >> $@ echo 'return 0' >> $@ @@ -267,22 +261,10 @@ clean-vcs: clean-work # Build compilation script and compile all sources for VCS simulation bin/snitch_cluster.vcs: ${VCS_SOURCES} ${TB_SRCS} $(TB_CC_SOURCES) $(TB_ASM_SOURCES) $(VCS_BUILDDIR)/compile.sh work/lib/libfesvr.a mkdir -p bin - vcs -Mlib=$(VCS_BUILDDIR) -Mdir=$(VCS_BUILDDIR) -o bin/snitch_cluster.vcs -cc $(CC) -cpp $(CXX) \ + $(VCS) -Mlib=$(VCS_BUILDDIR) -Mdir=$(VCS_BUILDDIR) -o bin/snitch_cluster.vcs -cc $(CC) -cpp $(CXX) \ -assert disable_cover -override_timescale=1ns/1ps -full64 tb_bin $(TB_CC_SOURCES) $(TB_ASM_SOURCES) \ -CFLAGS "$(TB_CC_FLAGS)" -LDFLAGS "-L${FESVR}/lib" -lfesvr -########## -# Traces # -########## - -$(LOGS_DIR)/perf.csv: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/')) \ - $(PERF_CSV_PY) - $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(LOGS_DIR)/hart_*_perf.json - -$(LOGS_DIR)/event.csv: $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null | sed 's/trace_hart/hart/' | sed 's/.dasm/_perf.json/')) \ - $(PERF_CSV_PY) - $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(LOGS_DIR)/hart_*_perf.json --filter tstart tend - ######## # Util # ######## diff --git a/target/snitch_cluster/cfg/default.hjson b/target/snitch_cluster/cfg/default.hjson index c39c2a490..7f28a1073 100644 --- a/target/snitch_cluster/cfg/default.hjson +++ b/target/snitch_cluster/cfg/default.hjson @@ -34,8 +34,8 @@ 
lat_comp_fp8: 1, lat_comp_fp8_alt: 1, lat_noncomp: 1, - lat_conv: 1, - lat_sdotp: 2, + lat_conv: 2, + lat_sdotp: 3, fpu_pipe_config: "BEFORE" narrow_xbar_latency: "CUT_ALL_PORTS", wide_xbar_latency: "CUT_ALL_PORTS", diff --git a/target/snitch_cluster/run.py b/target/snitch_cluster/run.py new file mode 100755 index 000000000..bef478ef7 --- /dev/null +++ b/target/snitch_cluster/run.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).parent / '../../util/sim')) +from sim_utils import parser, get_simulations, run_simulations # noqa: E402 +from Simulator import QuestaSimulator, VCSSimulator, VerilatorSimulator, \ + BansheeSimulator # noqa: E402 + + +SIMULATORS = { + 'vsim': QuestaSimulator(Path(__file__).parent.resolve() / 'bin/snitch_cluster.vsim'), + 'vcs': VCSSimulator(Path(__file__).parent.resolve() / 'bin/snitch_cluster.vcs'), + 'verilator': VerilatorSimulator(Path(__file__).parent.resolve() / 'bin/snitch_cluster.vlt'), + 'banshee': BansheeSimulator(Path(__file__).parent.resolve() / 'src/banshee.yaml') +} + + +def main(): + args = parser('vsim', SIMULATORS.keys()).parse_args() + simulations = get_simulations(args.testlist, SIMULATORS[args.simulator], args.run_dir) + return run_simulations(simulations, + n_procs=args.n_procs, + dry_run=args.dry_run, + early_exit=args.early_exit, + verbose=args.verbose) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/target/snitch_cluster/sw/Makefile b/target/snitch_cluster/sw/Makefile index 9badf70ea..a0115d00a 100644 --- a/target/snitch_cluster/sw/Makefile +++ b/target/snitch_cluster/sw/Makefile @@ -13,21 +13,19 @@ else RUNTIME = runtime/rtl endif -MATH = ../../../sw/math - -SUBDIRS = runtime/banshee runtime/rtl $(MATH) apps tests +SUBDIRS = runtime/banshee 
runtime/rtl math apps tests .PHONY: all $(SUBDIRS) all: $(SUBDIRS) # Explicit dependency of apps on runtime -apps: $(RUNTIME) $(MATH) +apps: $(RUNTIME) math $(MAKE) -C $@ TARGET=$(TARGET) # Explicit dependency of tests on runtime -tests: $(RUNTIME) $(MATH) +tests: $(RUNTIME) math $(MAKE) -C $@ $(TARGET) -runtime/rtl runtime/banshee $(MATH): +runtime/rtl runtime/banshee math: $(MAKE) -C $@ $(TARGET) diff --git a/target/snitch_cluster/sw/apps/common.mk b/target/snitch_cluster/sw/apps/common.mk index d8b0659a4..e27a19cfd 100644 --- a/target/snitch_cluster/sw/apps/common.mk +++ b/target/snitch_cluster/sw/apps/common.mk @@ -22,6 +22,7 @@ RISCV_CFLAGS += -DBIST else RUNTIME_DIR := $(ROOT)/target/snitch_cluster/sw/runtime/rtl endif +MATH_DIR := $(ROOT)/target/snitch_cluster/sw/math # Paths relative to the app including this Makefile BUILDDIR = $(abspath build) @@ -37,19 +38,18 @@ INCDIRS += $(SNRT_DIR)/api/omp INCDIRS += $(SNRT_DIR)/src INCDIRS += $(SNRT_DIR)/src/omp INCDIRS += $(ROOT)/sw/deps/riscv-opcodes - -# Math library override -INCDIRS += $(ROOT)/sw/math/arch/riscv64/bits/ -INCDIRS += $(ROOT)/sw/math/arch/generic -INCDIRS += $(ROOT)/sw/math/src/include -INCDIRS += $(ROOT)/sw/math/src/internal -INCDIRS += $(ROOT)/sw/math/include/bits INCDIRS += $(ROOT)/sw/math/include +LIBS = $(MATH_DIR)/build/libmath.a +LIBS += $(RUNTIME_DIR)/build/libsnRuntime.a + +LIBDIRS = $(dir $(LIBS)) +LIBNAMES = $(patsubst lib%,%,$(notdir $(basename $(LIBS)))) + RISCV_LDFLAGS += -L$(abspath $(RUNTIME_DIR)) RISCV_LDFLAGS += -T$(abspath $(SNRT_DIR)/base.ld) -RISCV_LDFLAGS += -L$(abspath $(RUNTIME_DIR)/build/) -RISCV_LDFLAGS += -lsnRuntime +RISCV_LDFLAGS += $(addprefix -L,$(LIBDIRS)) +RISCV_LDFLAGS += $(addprefix -l,$(LIBNAMES)) ########### # Outputs # @@ -78,11 +78,11 @@ $(BUILDDIR): $(DEP): $(SRCS) | $(BUILDDIR) $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(ELF)' $< > $@ -$(ELF): $(SRCS) $(DEP) | $(BUILDDIR) +$(ELF): $(SRCS) $(DEP) $(LIBS) | $(BUILDDIR) $(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) 
$(SRCS) -o $@ $(DUMP): $(ELF) | $(BUILDDIR) - $(RISCV_OBJDUMP) -D $< > $@ + $(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) $< > $@ $(DWARF): $(ELF) | $(BUILDDIR) $(RISCV_DWARFDUMP) $< > $@ diff --git a/target/snitch_cluster/sw/math/Makefile b/target/snitch_cluster/sw/math/Makefile new file mode 100644 index 000000000..d0a83e86a --- /dev/null +++ b/target/snitch_cluster/sw/math/Makefile @@ -0,0 +1,8 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +include ../toolchain.mk +include ../../../../sw/math/Makefile diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index f25ea7641..ce241a8d4 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -68,11 +68,11 @@ runs: - elf: tests/build/varargs_2.elf - elf: tests/build/zero_mem.elf - elf: tests/build/non_null_exitcode.elf - exit_code: 14 + retcode: 14 - elf: apps/blas/axpy/build/axpy.elf - cmd: ../../sw/blas/axpy/verify.py {sim_bin} {elf} + cmd: [../../../sw/blas/axpy/verify.py, "${sim_bin}", "${elf}"] - elf: apps/blas/gemm/build/gemm.elf - cmd: ../../sw/blas/gemm/verify.py {sim_bin} {elf} + cmd: [../../../sw/blas/gemm/verify.py, "${sim_bin}", "${elf}"] - elf: apps/dnn/batchnorm/build/batchnorm.elf - elf: apps/dnn/linear/build/linear.elf - elf: apps/dnn/maxpool/build/maxpool.elf diff --git a/target/snitch_cluster/sw/toolchain.mk b/target/snitch_cluster/sw/toolchain.mk index 4fa0fc5af..3d50974b8 100644 --- a/target/snitch_cluster/sw/toolchain.mk +++ b/target/snitch_cluster/sw/toolchain.mk @@ -34,6 +34,7 @@ RISCV_CFLAGS += -mcmodel=medany # RISCV_CFLAGS += -mno-fdiv # Not supported by Clang RISCV_CFLAGS += -ffast-math RISCV_CFLAGS += -fno-builtin-printf +RISCV_CFLAGS += -fno-builtin-sqrtf RISCV_CFLAGS += -fno-common RISCV_CFLAGS += -fopenmp RISCV_CFLAGS += -ftls-model=local-exec @@ -54,3 +55,7 @@ RISCV_LDFLAGS += 
-lclang_rt.builtins-riscv32 # Archiver flags RISCV_ARFLAGS = rcs + +# Objdump flags +RISCV_OBJDUMP_FLAGS += --mcpu=snitch +RISCV_OBJDUMP_FLAGS += -D diff --git a/util/container/Dockerfile b/util/container/Dockerfile index ea320f325..d917a6790 100644 --- a/util/container/Dockerfile +++ b/util/container/Dockerfile @@ -7,7 +7,11 @@ # 1. Stage FROM ubuntu:18.04 AS builder ARG CMAKE_VERSION=3.19.4 +ARG PYTHON_VERSION=3.9.12 +# Run dpkg without interactive dialogue +ARG DEBIAN_FRONTEND=noninteractive +# Install APT requirements COPY apt-requirements.txt /tmp/apt-requirements.txt RUN apt-get update && \ sed 's/#.*//' /tmp/apt-requirements.txt \ @@ -20,8 +24,26 @@ RUN apt-get update && \ lsb-release \ software-properties-common \ unzip \ - wget \ - zlib1g-dev + wget +# Required to install Python +RUN apt-get update && apt-get install -y \ + zlib1g-dev \ + libreadline-gplv2-dev \ + libncursesw5-dev \ + libssl-dev \ + libsqlite3-dev \ + tk-dev \ + libgdbm-dev \ + libc6-dev \ + libbz2-dev \ + libffi-dev + +# Install Python +RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz +RUN tar xzf Python-${PYTHON_VERSION}.tgz +RUN cd Python-${PYTHON_VERSION} && \ + ./configure --enable-optimizations --prefix=/opt/python/ && \ + make install -j # Build Rust tools RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y @@ -37,6 +59,7 @@ RUN wget https://apt.llvm.org/llvm.sh RUN chmod +x llvm.sh RUN ./llvm.sh 12 +# Change working directory WORKDIR /tools # Install a newer version of cmake (we need this for banshee) @@ -73,9 +96,11 @@ RUN apt-get update && \ sed 's/#.*//' /tmp/apt-requirements.txt \ | xargs apt-get install -y && \ apt-get install -y --no-install-recommends \ + ca-certificates \ gnupg2 \ curl \ wget \ + build-essential \ git && \ apt-get clean ; \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/doc/* @@ -86,12 +111,7 @@ RUN echo 'deb http://download.opensuse.org/repositories/home:/phiwag:/edatools/x apt-get 
update && apt-get install -y verilator-${VERILATOR_VERSION} && \ apt-get clean ; \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/doc/* - -# Install Python requirements -COPY python-requirements.txt /tmp/python-requirements.txt -COPY docs/requirements.txt /tmp/docs/requirements.txt -COPY sw/dnn/requirements.txt /tmp/sw/dnn/requirements.txt -RUN pip3 install -r /tmp/python-requirements.txt +ENV VLT_ROOT "/usr/share/verilator" # Get the precompiled LLVM toolchain RUN latest_tag=`curl -s -H "Accept: application/vnd.github.v3+json" https://api.github.com/repos/pulp-platform/llvm-project/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/'` && \ @@ -119,6 +139,17 @@ RUN apt-get update && apt-get install software-properties-common -y && \ # Copy artifacts from stage 1. COPY --from=builder /root/.cargo/bin/bender bin/ COPY --from=builder /root/.cargo/bin/banshee bin/ +COPY --from=builder /opt/python /opt/python + +# Create and activate virtual environment +ENV VIRTUAL_ENV "/root/.venvs/snitch_cluster" +RUN /opt/python/bin/python3 -m venv ${VIRTUAL_ENV} +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" +# Install Python requirements +COPY python-requirements.txt /tmp/python-requirements.txt +COPY docs/requirements.txt /tmp/docs/requirements.txt +COPY sw/dnn/requirements.txt /tmp/sw/dnn/requirements.txt +RUN pip install -r /tmp/python-requirements.txt # Set locale to UTF-8, required because Python 3.6 defaults on ASCII encoding. # See https://click.palletsprojects.com/en/8.1.x/unicode-support/ diff --git a/util/sim/Simulation.py b/util/sim/Simulation.py new file mode 100644 index 000000000..3cc219389 --- /dev/null +++ b/util/sim/Simulation.py @@ -0,0 +1,242 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from termcolor import colored, cprint +from pathlib import Path +import subprocess +import re +import os +from mako.template import Template + + +class Simulation(object): + """Provides a common interface to manage simulations.""" + + LOG_FILE = 'sim.txt' + + def __init__(self, elf=None, dry_run=False, retcode=0, run_dir=None): + """Constructor for the Simulation class. + + A Simulation object is defined at a minimum by a software + binary to be simulated on the desired hardware. The hardware is + implicitly determined by the simulation command. + + Arguments: + elf: The software binary to simulate. + run_dir: The directory where to launch the simulation + command. If none is passed, the current working + directory is assumed. + dry_run: A preview of the simulation command will be + displayed without actually launching the simulation. + """ + self.elf = elf + self.dry_run = dry_run + self.run_dir = run_dir if run_dir is not None else Path.cwd() + self.testname = Path(self.elf).stem + self.cmd = [] + self.log = None + self.process = None + self.expected_retcode = int(retcode) + + def launch(self, dry_run=None): + """Launch the simulation. + + Launch the simulation by invoking the command stored in the + `cmd` attribute of the class. Subclasses are required to define + a non-empty `cmd` attribute prior to invoking this method. + + Arguments: + dry_run: A preview of the simulation command is displayed + without actually launching the simulation. 
+ """ + # Override dry_run setting at launch time + if dry_run is not None: + self.dry_run = dry_run + + # Print launch message and simulation command + cprint(f'Run test {colored(self.elf, "cyan")}', attrs=["bold"]) + cmd_string = ' '.join(self.cmd) + print(f'[{self.run_dir}]$ {cmd_string}', flush=True) + + # Launch simulation if not doing a dry run + if not self.dry_run: + # Create run directory and log file + os.makedirs(self.run_dir, exist_ok=True) + self.log = self.run_dir / self.LOG_FILE + # Launch simulation subprocess + with open(self.log, 'w') as f: + self.process = subprocess.Popen(self.cmd, stdout=f, stderr=subprocess.STDOUT, + cwd=self.run_dir, universal_newlines=True) + + def completed(self): + """Return whether the simulation completed.""" + if self.dry_run: + return True + elif self.process: + return self.process.poll() is not None + else: + return False + + def get_retcode(self): + """Get the return code of the simulation.""" + if self.dry_run: + return 0 + else: + if self.process: + return int(self.process.returncode) + + def successful(self): + """Return whether the simulation was successful.""" + actual_retcode = self.get_retcode() + if actual_retcode is not None: + return int(actual_retcode) == int(self.expected_retcode) + else: + return False + + def print_log(self): + """Print a log of the simulation to stdout.""" + with open(self.log, 'r') as f: + print(f.read()) + + def print_status(self): + """Print a status message to stdout. + + The status message reports whether the test is still running + or, if it completed, whether it was successful or failed. + """ + if self.completed(): + if self.successful(): + cprint(f'{self.elf} test passed', 'green', attrs=['bold'], flush=True) + else: + cprint(f'{self.elf} test failed', 'red', attrs=['bold'], flush=True) + else: + cprint(f'{self.elf} test running', 'black', flush=True) + + +class RTLSimulation(Simulation): + """A simulation run on an RTL simulator. 
+ + An RTL simulation is launched through a simulation binary built + in advance from some RTL design. + """ + + def __init__(self, sim_bin=None, **kwargs): + """Constructor for the RTLSimulation class. + + Arguments: + sim_bin: The simulation binary. + kwargs: Arguments passed to the base class constructor. + """ + super().__init__(**kwargs) + self.cmd = [str(sim_bin), str(self.elf)] + + +class VerilatorSimulation(RTLSimulation): + """An RTL simulation running on Verilator. + + The return code of the simulation is returned directly as the + return code of the command launching the simulation. + """ + + def get_retcode(self): + return self.process.returncode + + +class QuestaVCSSimulation(RTLSimulation): + """An RTL simulation running on QuestaSim or VCS. + + QuestaSim and VCS print out the simulation return code in the + simulation log. This is parsed to extract the return code. + """ + + def get_retcode(self): + # Extract the application's return code from the simulation log + with open(self.log, 'r') as f: + for line in f.readlines(): + regex_success = r'\[SUCCESS\] Program finished successfully' + match_success = re.search(regex_success, line) + if match_success: + return 0 + else: + regex_fail = r'\[FAILURE\] Finished with exit code\s+(\d+)' + match = re.search(regex_fail, line) + if match: + return int(match.group(1)) + + def successful(self): + # Check that simulation return code matches expected value (in super class) + # and that the simulation process terminated correctly + success = super().successful() + if self.process.returncode != 0: + return False + else: + return success + + +class QuestaSimulation(QuestaVCSSimulation): + """An RTL simulation running on QuestaSim.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.cmd += ['', '-batch'] + + +class VCSSimulation(QuestaVCSSimulation): + """An RTL simulation running on VCS.""" + pass + + +class BansheeSimulation(Simulation): + """A simulation running on Banshee. 
+ + The return code of the simulation is returned directly as the + return code of the command launching the simulation. + """ + + def __init__(self, banshee_cfg=None, **kwargs): + """Constructor for the BansheeSimulation class. + + Arguments: + banshee_cfg: A Banshee config file. + kwargs: Arguments passed to the base class constructor. + """ + super().__init__(**kwargs) + self.cmd = ['banshee', '--no-opt-llvm', '--no-opt-jit', '--configuration', + str(banshee_cfg), '--trace', str(self.elf)] + + +class CustomSimulation(Simulation): + """A simulation which is run through a custom command. + + The custom command generally invokes an RTL simulator binary behind + the scenes and executes some additional verification logic after + the end of the simulation. + + Custom simulations are considered unsuccessful if the return code + of the custom command is non-null. As a custom command can + implement any verification logic, there is no reason to implement + any additional logic here. + """ + + def __init__(self, sim_bin=None, cmd=None, **kwargs): + """Constructor for the CustomSimulation class. + + Arguments: + sim_bin: The simulation binary. + cmd: The custom command used to launch the simulation. + kwargs: Arguments passed to the base class constructor. + """ + super().__init__(**kwargs) + self.dynamic_args = { + 'sim_bin': str(sim_bin), + 'elf': str(self.elf), + 'run_dir': str(self.run_dir) + } + self.cmd = cmd + + def launch(self, **kwargs): + self.cmd = [Template(arg).render(**self.dynamic_args) for arg in self.cmd] + super().launch(**kwargs) diff --git a/util/sim/Simulator.py b/util/sim/Simulator.py new file mode 100644 index 000000000..3d3090573 --- /dev/null +++ b/util/sim/Simulator.py @@ -0,0 +1,187 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from Simulation import QuestaSimulation, VCSSimulation, VerilatorSimulation, BansheeSimulation, \ + CustomSimulation + + +class Simulator(object): + """An object capable of constructing Simulation objects. + + A simulator constructs a [Simulation][Simulation.Simulation] object + from a test object, as defined e.g. in a test suite specification + file. + + At minimum, a test is defined by a binary (`elf`) which is to be + simulated and a set of simulators it can be run on. A test could be + defined by a class of its own, but at the moment we assume a test + to be represented by a dictionary with the `elf` and `simulators` + keys at minimum. + """ + + def __init__(self, name, simulation_cls): + """Constructor for the Simulator class. + + A simulator must be identifiable by a unique identifier string + and construct at least one type of + [Simulation][Simulation.Simulation] object. + + Arguments: + name: The unique identifier of the simulator. + simulation_cls: One type of + [Simulation][Simulation.Simulation] object the + simulator can construct. + """ + self.name = name + self.simulation_cls = simulation_cls + + def supports(self, test): + """Check whether a certain test is supported by the simulator. + + Arguments: + test: The test to check. + """ + return 'simulators' not in test or self.name in test['simulators'] + + def get_simulation(self, test, simulation_cls=None, **kwargs): + """Construct a Simulation object from the specified test. + + Arguments: + test: The test for which a Simulation object must be + constructed. + simulation_cls: Create a simulation instance of this + Simulation subclass. Use `self.simulation_cls` by + default. 
+ """ + kwargs.update({key: test[key] for key in ['elf', 'run_dir', 'retcode'] if key in test}) + if simulation_cls is not None: + return simulation_cls(**kwargs) + else: + return self.simulation_cls(**kwargs) + + +class RTLSimulator(Simulator): + """Base class for RTL simulators. + + An RTL simulator requires a simulation binary built from an RTL + design to launch a simulation. + + A test may need to be run with a custom command, itself invoking + the simulation binary behind the scenes, e.g. for verification + purposes. Such a test carries the custom command (a list of args) + under the `cmd` key. In such case, the RTL simulator constructs a + [CustomSimulation][Simulation.CustomSimulation] object from the + given test, with the custom command and simulation binary. + """ + + def __init__(self, binary, **kwargs): + """Constructor for the RTLSimulator class. + + Arguments: + binary: The simulation binary. + kwargs: Arguments passed to the base class constructor. + """ + super().__init__(**kwargs) + self.binary = binary + + def get_simulation(self, test): + if 'cmd' in test: + return super().get_simulation( + test, + simulation_cls=CustomSimulation, + sim_bin=self.binary, + cmd=test['cmd']) + else: + return super().get_simulation( + test, + sim_bin=self.binary + ) + + +class VCSSimulator(RTLSimulator): + """VCS simulator + + An [RTL simulator][Simulator.RTLSimulator], identified by the name + `vcs`, tailored to the creation of + [VCS simulations][Simulation.VCSSimulation]. + """ + + def __init__(self, binary): + """Constructor for the VCSSimulator class. + + Arguments: + binary: The VCS simulation binary. + """ + super().__init__(binary, name='vcs', simulation_cls=VCSSimulation) + + +class QuestaSimulator(RTLSimulator): + """QuestaSim simulator + + An [RTL simulator][Simulator.RTLSimulator], identified by the name + `vsim`, tailored to the creation of + [QuestaSim simulations][Simulation.QuestaSimulation]. 
+ """ + + def __init__(self, binary): + """Constructor for the QuestaSimulator class. + + Arguments: + binary: The QuestaSim simulation binary. + """ + super().__init__(binary, name='vsim', simulation_cls=QuestaSimulation) + + +class VerilatorSimulator(RTLSimulator): + """Verilator simulator + + An [RTL simulator][Simulator.RTLSimulator], identified by the name + `verilator`, tailored to the creation of + [Verilator simulations][Simulation.VerilatorSimulation]. + """ + + def __init__(self, binary): + """Constructor for the VerilatorSimulator class. + + Arguments: + binary: The Verilator simulation binary. + """ + super().__init__(binary, name='verilator', simulation_cls=VerilatorSimulation) + + +class BansheeSimulator(Simulator): + """Banshee simulator + + A simulator, identified by the name `banshee`, tailored to the + creation of [Banshee simulations][Simulation.BansheeSimulation]. + """ + + def __init__(self, cfg): + """Constructor for the BansheeSimulator class. + + Arguments: + cfg: A Banshee config file. + """ + super().__init__(name='banshee', simulation_cls=BansheeSimulation) + self.cfg = cfg + + def supports(self, test): + """See base class. + + The Banshee simulator does not support tests carrying a custom + command. + """ + supported = super().supports(test) + if 'cmd' in test: + return False + else: + return supported + + def get_simulation(self, test): + return super().get_simulation( + test, + banshee_cfg=self.cfg + ) diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py index 664e2624b..2ed260d3f 100644 --- a/util/sim/data_utils.py +++ b/util/sim/data_utils.py @@ -9,7 +9,7 @@ def emit_license(): - s = (f"// Copyright {datetime.now().year} ETH Zurich and University of Bologna." 
def get_symbol_contents(self, uid):
    """Return the contents of the symbol identified by `uid`.

    The symbol's address and size are looked up through
    `get_symbol_address()` and `get_symbol_size()`, the address is
    translated to an offset into the ELF file and `size` bytes are
    read from the file at that offset.

    Args:
        uid: Identifier of the symbol, as accepted by
            `get_symbol_address()` and `get_symbol_size()`.

    Returns:
        The `size` bytes making up the symbol. If the symbol's address
        does not map to any offset in the file, `size` zero bytes are
        returned as an immutable `bytes` object.
    """
    addr = self.get_symbol_address(uid)
    size = self.get_symbol_size(uid)
    # Only the first matching offset is of interest, so there is no need
    # to materialize all of them.
    fpos = next(iter(self.elf.address_offsets(addr, size)), None)
    if fpos is None:
        # We assume all segments in our ELF are of type PT_LOAD and
        # that the only section whose contents are not stored in
        # the ELF file is the .bss section. Therefore, whenever
        # `address_offsets()` fails to return a valid offset into the
        # file we assume that the address falls in the .bss section.
        return bytes(size)
    self.elf.stream.seek(fpos)
    return self.elf.stream.read(size)
a command-line interface used to launch the simulations +3. an interface to the simulators supported by the framework + +The framework can be divided into three components each managing one of +the defined interfaces: + +1. a test suite frontend +2. a command-line frontend +3. a simulation backend + +A fourth component, the core, serves to glue all other components +together. + +The [parser()][sim_utils.parser] function provides a minimum +command-line interface to control the tool. + +The [get_simulations()][sim_utils.get_simulations] function +provides a common means to implement the test suite frontend. At the +input interface it assumes a test suite specification file in YAML +syntax, and returns a list of simulation objects which implement a +common interface to the simulation backend. This interface is defined +by the [Simulation][Simulation.Simulation] class. + +The core logic of the framework is implemented in the +[run_simulations()][sim_utils.run_simulations] function. It takes +the output from [get_simulations()][sim_utils.get_simulations] and +launches the simulations through the interface to the simulation +backend. + +The simulation backend is implemented by the +[Simulation][Simulation.Simulation] and +[Simulator][Simulator.Simulator] classes and their subclasses. +""" + +import argparse +from termcolor import colored, cprint +from pathlib import Path +import os +import time +import yaml +import signal +import psutil + +POLL_PERIOD = 0.2 + + +def parser(default_simulator='vsim', simulator_choices=['vsim']): + """Default command-line parser for Python simulation frameworks. + + Returns a Python `argparse` parser with common options used to + simulate one or multiple binaries on an RTL design. Can be extended + by adding arguments to it. + + Args: + default_simulator: The simulator to be used when none is + specified on the command-line. + simulator_choices: All simulator choices which can be passed on + the command-line. 
+ """ + # Argument parsing + parser = argparse.ArgumentParser() + parser.add_argument( + 'testlist', + help='File specifying a list of apps to run') + parser.add_argument( + '--simulator', + action='store', + nargs='?', + default=default_simulator, + choices=simulator_choices, + help='Choose a simulator to run the test with') + parser.add_argument( + '--run-dir', + action='store', + default='runs', + nargs='?', + help='Parent directory of each test run directory') + parser.add_argument( + '--dry-run', + action='store_true', + help='Preview the simulation commands which will be run') + parser.add_argument( + '--early-exit', + action='store_true', + help='Exit as soon as any test fails') + parser.add_argument( + '--verbose', + action='store_true', + help='Activate verbose printing') + parser.add_argument( + '-j', + action='store', + dest='n_procs', + nargs='?', + type=int, + default=1, + const=os.cpu_count(), + help=('Maximum number of tests to run in parallel. ' + 'One if the option is not present. Equal to the number of CPU cores ' + 'if the option is present but not followed by an argument.')) + return parser + + +def _resolve_relative_path(base_path, s): + """Resolve a relative path string w.r.t. a ceratin base. + + Checks if an input string represents a valid relative path w.r.t. + to a certain base path and resolves it to an absolute path, if this + is the case. Otherwise returns the original string. 
+ + Args: + s: The input string + base_path: The base path + """ + try: + base_path = Path(base_path).resolve() # Get the absolute path of the base directory + input_path = Path(s) + if input_path.is_absolute() or not s.startswith(("./", "../")): + return s + else: + # Resolve the path against the base directory and check existence + absolute_path = (base_path / input_path).resolve() + return str(absolute_path) + except (TypeError, ValueError): + # Handle invalid base_path or s + return s + except Exception as e: + # Handle other exceptions like permission errors, etc. + print(f"An error occurred: {str(e)}") + return s + + +def get_simulations(testlist, simulator, run_dir=None): + """Create simulation objects from a test list file. + + Args: + testlist: Path to a test list file. A test list file is a YAML + file describing a set of tests. + simulator: The simulator to use to run the tests. A test run on + a specific simulator defines a simulation. + run_dir: A directory under which all tests should be run. If + provided, a unique subdirectory for each test will be + created under this directory, based on the test name. + + Returns: + A list of `Simulation` objects. The list contains a + `Simulation` object for every test which supports the given + `simulator`. This object defines a simulation of the test on + that particular `simulator`. 
+ """ + # Get tests from test list file + testlist_path = Path(testlist).absolute() + with open(testlist_path, 'r') as f: + tests = yaml.safe_load(f)['runs'] + # Convert relative paths in testlist file to absolute paths + for test in tests: + test['elf'] = testlist_path.parent / test['elf'] + if 'cmd' in test: + test['cmd'] = [_resolve_relative_path(testlist_path.parent, arg) for arg in test['cmd']] + # Create simulation object for every test which supports the specified simulator + simulations = [simulator.get_simulation(test) for test in tests if simulator.supports(test)] + # Set simulation run directory + if run_dir is not None: + for sim in simulations: + sim.run_dir = Path(run_dir) / sim.testname + return simulations + + +def print_summary(failed_sims, early_exit=False, dry_run=False): + """Print a summary of the simulation suite's exit status. + + Args: + failed_sims: A list of failed simulations from the simulation + suite. + early_exit: Whether the simulation suite was configured to + terminate upon the first failing simulation. + dry_run: Whether the simulation suite was launched in dry run + mode. + """ + if not dry_run: + header = f'==== Test summary {"(early exit)" if early_exit else ""} ====' + cprint(header, attrs=['bold']) + if failed_sims: + [sim.print_status() for sim in failed_sims] + else: + print(f'{colored("All tests passed!", "green")}') + + +def terminate_processes(): + print('Terminate processes') + # Get PID and PGID of parent process (current Python script) + ppid = os.getpid() + pgid = os.getpgid(0) + # Kill processes in current process group, except parent process + for proc in psutil.process_iter(['pid', 'name']): + pid = proc.info['pid'] + if os.getpgid(pid) == pgid and pid != ppid: + os.kill(pid, signal.SIGKILL) + + +def get_unique_run_dir(sim, prefix=None): + """Get unique run directory for a simulation. + + If the simulation was already assigned a run directory at creation + time, None is returned. 
Otherwise, return a unique run directory + based on the testname under an optional prefix directory. + + Args: + sim: The simulation for which the run directory is + requested. + prefix: Get a unique run directory under a directory which + could be common to multiple simulations. We call this + a prefix. By default the current working directory is + assumed as the prefix. + """ + if sim.run_dir is None: + if prefix is None: + prefix = Path.cwd() + return prefix / sim.testname + + +def run_simulations(simulations, n_procs=1, dry_run=None, early_exit=False, + verbose=False): + """Run simulations defined by a list of `Simulation` objects. + + Args: + simulations: A list of `Simulation` objects as returned e.g. by + [sim_utils.get_simulations][]. + + Returns: + The number of failed simulations. + """ + # Register SIGTERM handler, used to gracefully terminate all simulation subprocesses + signal.signal(signal.SIGTERM, lambda _, __: terminate_processes()) + + # Spawn a process for every test, wait for all running tests to terminate and check results + running_sims = [] + failed_sims = [] + early_exit_requested = False + try: + while (len(simulations) or len(running_sims)) and not early_exit_requested: + # If there are still simulations to run and there are less running simulations than + # the maximum number of processes allowed in parallel, spawn new simulation + if len(simulations) and len(running_sims) < n_procs: + running_sims.append(simulations.pop(0)) + running_sims[-1].launch(dry_run=dry_run) + # Remove completed sims from running sims list + idcs = [i for i, sim in enumerate(running_sims) if sim.completed()] + completed_sims = [running_sims.pop(i) for i in sorted(idcs, reverse=True)] + # Check completed sims and report status + for sim in completed_sims: + if sim.successful(): + sim.print_status() + else: + failed_sims.append(sim) + if verbose: + sim.print_log() + sim.print_status() + # If in early-exit mode, terminate as soon as any simulation fails + if 
early_exit: + early_exit_requested = True + break + time.sleep(POLL_PERIOD) + except KeyboardInterrupt: + early_exit_requested = True + + # Clean up after early exit + if early_exit_requested: + terminate_processes() + + # Print summary + print_summary(failed_sims, early_exit_requested) + return len(failed_sims) diff --git a/util/sim/simulate.py b/util/sim/simulate.py deleted file mode 100755 index 4e36cc1e1..000000000 --- a/util/sim/simulate.py +++ /dev/null @@ -1,270 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2023 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Luca Colagrande - -# TODO colluca: timeout feature - -import argparse -import multiprocessing -from pathlib import Path -import subprocess -from termcolor import colored, cprint -import os -import re -import sys -import time -import yaml - - -BANSHEE_CFG = 'src/banshee.yaml' - -# Tool settings -SIMULATORS = ['vsim', 'banshee', 'verilator', 'vcs', 'other'] -DEFAULT_SIMULATOR = SIMULATORS[0] -SIMULATOR_BINS = { - 'vsim': 'bin/snitch_cluster.vsim', - 'banshee': 'banshee', - 'verilator': 'bin/snitch_cluster.vlt', - 'vcs': 'bin/snitch_cluster.vcs' -} -SIMULATOR_CMDS = { - 'vsim': '{sim_bin} {elf} "" -batch', - 'banshee': ('{{sim_bin}} --no-opt-llvm --no-opt-jit --configuration {cfg}' - ' --trace {{elf}} > /dev/null').format(cfg=BANSHEE_CFG), - 'verilator': '{sim_bin} {elf}', - 'vcs': '{sim_bin} {elf}' -} - - -def parse_args(): - # Argument parsing - parser = argparse.ArgumentParser() - parser.add_argument( - 'testlist', - help='File specifying a list of apps to run') - parser.add_argument( - '--simulator', - action='store', - nargs='?', - default=DEFAULT_SIMULATOR, - choices=SIMULATORS, - help='Choose a simulator to run the test with') - parser.add_argument( - '--sim-bin', - action='store', - nargs='?', - help='Override default path to simulator binary') - parser.add_argument( - '--dry-run', - 
action='store_true', - help='Preview the simulation commands which will be run') - parser.add_argument( - '--early-exit', - action='store_true', - help='Exit as soon as any test fails') - parser.add_argument( - '-j', - action='store', - dest='n_procs', - nargs='?', - type=int, - default=1, - const=os.cpu_count(), - help=('Maximum number of tests to run in parallel. ' - 'One if the option is not present. Equal to the number of CPU cores ' - 'if the option is present but not followed by an argument.')) - parser.add_argument( - '--verbose', - action='store_true', - help=('Option to print simulation logs when multiple tests are run in parallel.' - 'Logs are always printed when n_procs == 1')) - args = parser.parse_args() - return args - - -# Get tests from a test list file -def get_tests(testlist_path): - testlist_path = Path(testlist_path).absolute() - with open(testlist_path, 'r') as file: - tests = yaml.safe_load(file)['runs'] - return tests - - -def check_exit_code(test, exit_code): - if 'exit_code' in test: - return not (int(test['exit_code']) == int(exit_code)) - else: - return exit_code - - -def multiple_processes(args): - return args.n_procs != 1 - - -def run_simulation(cmd, simulator, test, quiet=False): - # Defaults - result = 1 - log = '' - - # Spawn simulation subprocess - p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - universal_newlines=True) - - # Poll simulation subprocess and log its output - while p.poll() is None: - line = p.stdout.readline() - log += line - if not quiet: - print(line, end='', flush=True) - - # When simulating with vsim or vcs, we need to parse the simulation - # log to catch the application's return code - if simulator in ['vsim', 'vcs']: - # Capture success - regex_success = r'\[SUCCESS\] Program finished successfully' - match_success = re.search(regex_success, line) - if match_success: - result = 0 - else: - regex_fail = r'\[FAILURE\] Finished with exit code\s+(\d+)' - match = 
re.search(regex_fail, line) - if match: - exit_code = match.group(1) - result = check_exit_code(test, exit_code) - - # Check if the subprocess terminated correctly - exit_code = p.poll() - # In Banshee and Verilator the exit code of the Snitch binary is returned - # through the exit code of the simulation command - if simulator in ['banshee', 'verilator']: - result = check_exit_code(test, exit_code) - # For custom commands the return code is that of the command - elif simulator == 'other': - result = exit_code - # For standard simulation commands the simulated Snitch binary exit - # code is overriden only if the simulator failed - else: - if exit_code != 0: - result = exit_code - - return result, log - - -def run_test(test, args): - # Extract args - simulator = args.simulator - sim_bin = args.sim_bin if args.sim_bin else SIMULATOR_BINS[simulator] - dry_run = args.dry_run - testlist = args.testlist - quiet = multiple_processes(args) - - # Check if simulator is supported for this test - if 'simulators' in test: - if simulator not in test['simulators']: - return (0, '') - - # Construct path to executable - elf = Path(test['elf']) - if testlist: - elf = Path(testlist).absolute().parent / elf - cprint(f'Run test {colored(elf, "cyan")}', attrs=["bold"]) - - # Construct simulation command (override only supported for RTL) - if 'cmd' in test and simulator != 'banshee': - cmd = test['cmd'] - cmd = cmd.format(sim_bin=sim_bin, elf=elf, simulator=simulator) - simulator = 'other' - else: - cmd = SIMULATOR_CMDS[simulator] - cmd = cmd.format(sim_bin=sim_bin, elf=elf) - if not quiet: - print(f'$ {cmd}', flush=True) - - # Run simulation - result = 0 - log = '' - if not dry_run: - result, log = run_simulation(cmd, simulator, test, quiet) - - # Report failure or success - if result != 0: - cprint(f'{elf} test failed', 'red', attrs=['bold'], flush=True) - else: - cprint(f'{elf} test passed', 'green', attrs=['bold'], flush=True) - - return (result, log) - - -def 
print_failed_test(test): - print(f'{colored(test["elf"], "cyan")} test {colored("failed", "red")}') - - -def print_test_summary(failed_tests, args): - if not args.dry_run: - header = f'\n==== Test summary {"(early exit)" if args.early_exit else ""} ====' - cprint(header, attrs=['bold']) - if failed_tests: - for failed_test in failed_tests: - print_failed_test(failed_test) - else: - print(f'{colored("All tests passed!", "green")}') - - -def run_tests(tests, args): - - # Create a process Pool - with multiprocessing.Pool(args.n_procs) as pool: - - # Create a shared object which parent and child processes can access - # concurrently to terminate the pool early as soon as one process fails - exit_early = multiprocessing.Value('B') - exit_early.value = 0 - - # Define callback for early exit - def completion_callback(return_value): - result = return_value[0] - log = return_value[1] - if args.early_exit and result != 0: - exit_early.value = 1 - # Printing the log all at once here, rather than line-by-line - # in run_simulation, ensures that the logs of different processes - # are not interleaved in stdout. - # However, as we prefer line-by-line printing when a single process - # is used, we have to make sure we don't print twice. 
- if args.verbose and multiple_processes(args): - print(log) - - # Queue tests to process pool - results = [] - for test in tests: - result = pool.apply_async(run_test, args=(test, args), callback=completion_callback) - results.append(result) - - # Wait for all tests to complete - running = range(len(tests)) - while len(running) != 0 and not exit_early.value: - time.sleep(1) - running = [i for i in running if not results[i].ready()] - - # Query test results - failed_tests = [] - for test, result in zip(tests, results): - if result.ready() and result.get()[0] != 0: - failed_tests.append(test) - - print_test_summary(failed_tests, args) - - return len(failed_tests) - - -def main(): - args = parse_args() - tests = get_tests(args.testlist) - return run_tests(tests, args) - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/util/trace/perf_csv.py b/util/trace/perf_csv.py index 450758c70..f26e242e2 100755 --- a/util/trace/perf_csv.py +++ b/util/trace/perf_csv.py @@ -17,7 +17,7 @@ import pandas as pd -HARTID_REGEX = r'\D*(\d*)\D*' +HARTID_REGEX = r'hart_([0-9a-f]+)_perf.json' def main():