From 3a4122377c55ec22e5465948b31497c0fbf07dd6 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 1 Oct 2024 16:11:05 +0200 Subject: [PATCH] treewide: Update docs and tutorial --- README.md | 95 +- docs/publications.md | 145 +- docs/rm/{ => hw}/custom_instructions.md | 2 +- docs/rm/{ => hw}/peripherals.md | 2 +- docs/rm/hw/reqrsp_interface.md | 1 + docs/rm/hw/snitch.md | 1 + docs/rm/hw/snitch_cluster.md | 1 + docs/rm/reqrsp_interface.md | 1 - docs/rm/snitch.md | 1 - docs/rm/snitch_cluster.md | 1 - docs/rm/{ => sw}/bench/join.md | 0 docs/rm/{ => sw}/bench/roi.md | 0 docs/rm/{ => sw}/bench/visualize.md | 0 docs/rm/{ => sw}/sim/Elf.md | 0 docs/rm/{ => sw}/sim/Simulation.md | 0 docs/rm/{ => sw}/sim/Simulator.md | 0 docs/rm/{ => sw}/sim/data_utils.md | 0 docs/rm/{ => sw}/sim/sim_utils.md | 0 docs/rm/{ => sw}/sim/verif_utils.md | 0 docs/rm/{ => sw}/snitch_target_utils/build.md | 0 docs/rm/{ => sw}/snitch_target_utils/run.md | 0 docs/rm/{ => sw}/trace/annotate.md | 0 docs/rm/{ => sw}/trace/events.md | 0 docs/rm/{ => sw}/trace/gen_trace.md | 0 docs/ug/code_optimization.md | 78 + docs/ug/documentation.md | 23 +- docs/ug/example_trace.html | 123935 --------------- docs/ug/trace_analysis.md | 11 +- docs/ug/tutorial.md | 350 +- hw/reqrsp_interface/doc/index.md | 3 + hw/snitch/doc/index.md | 5 +- mkdocs.yml | 49 +- target/snitch_cluster/README.md | 341 +- util/container/README.md | 11 +- 34 files changed, 533 insertions(+), 124523 deletions(-) rename docs/rm/{ => hw}/custom_instructions.md (99%) rename docs/rm/{ => hw}/peripherals.md (76%) create mode 120000 docs/rm/hw/reqrsp_interface.md create mode 120000 docs/rm/hw/snitch.md create mode 120000 docs/rm/hw/snitch_cluster.md delete mode 120000 docs/rm/reqrsp_interface.md delete mode 120000 docs/rm/snitch.md delete mode 120000 docs/rm/snitch_cluster.md rename docs/rm/{ => sw}/bench/join.md (100%) rename docs/rm/{ => sw}/bench/roi.md (100%) rename docs/rm/{ => sw}/bench/visualize.md (100%) rename docs/rm/{ => sw}/sim/Elf.md 
(100%) rename docs/rm/{ => sw}/sim/Simulation.md (100%) rename docs/rm/{ => sw}/sim/Simulator.md (100%) rename docs/rm/{ => sw}/sim/data_utils.md (100%) rename docs/rm/{ => sw}/sim/sim_utils.md (100%) rename docs/rm/{ => sw}/sim/verif_utils.md (100%) rename docs/rm/{ => sw}/snitch_target_utils/build.md (100%) rename docs/rm/{ => sw}/snitch_target_utils/run.md (100%) rename docs/rm/{ => sw}/trace/annotate.md (100%) rename docs/rm/{ => sw}/trace/events.md (100%) rename docs/rm/{ => sw}/trace/gen_trace.md (100%) create mode 100644 docs/ug/code_optimization.md delete mode 100644 docs/ug/example_trace.html diff --git a/README.md b/README.md index 301c2d1e1..8d5963898 100644 --- a/README.md +++ b/README.md @@ -40,20 +40,24 @@ licenses. See the respective folder for the licenses used. ## Publications + If you use the Snitch cluster or its extensions in your work, you can cite us:
-Snitch: A tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads +Snitch: A Tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads

``` -@article{zaruba2020snitch, - title={Snitch: A tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads}, +@ARTICLE{zaruba2021snitch, author={Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca}, - journal={IEEE Transactions on Computers}, - year={2020}, - publisher={IEEE} + journal={IEEE Transactions on Computers}, + title={Snitch: A Tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads}, + year={2021}, + volume={70}, + number={11}, + pages={1845-1860}, + doi={10.1109/TC.2020.3027900} } ``` @@ -61,19 +65,19 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:

-Stream semantic registers: A lightweight risc-v isa extension achieving full compute utilization in single-issue cores +Stream Semantic Registers: A Lightweight RISC-V ISA Extension Achieving Full Compute Utilization in Single-Issue Cores

``` -@article{schuiki2020stream, - title={Stream semantic registers: A lightweight risc-v isa extension achieving full compute utilization in single-issue cores}, +@ARTICLE{schuiki2021ssr, author={Schuiki, Fabian and Zaruba, Florian and Hoefler, Torsten and Benini, Luca}, - journal={IEEE Transactions on Computers}, + journal={IEEE Transactions on Computers}, + title={Stream Semantic Registers: A Lightweight RISC-V ISA Extension Achieving Full Compute Utilization in Single-Issue Cores}, + year={2021}, volume={70}, number={2}, - pages={212--227}, - year={2020}, - publisher={IEEE} + pages={212-227}, + doi={10.1109/TC.2020.2987314} } ``` @@ -81,14 +85,14 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:

-Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra +Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra

``` -@article{scheffler2023sparsessr, +@ARTICLE{scheffler2023sparsessr, author={Scheffler, Paul and Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca}, - journal={IEEE Transactions on Parallel and Distributed Systems}, - title={Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra}, + journal={IEEE Transactions on Parallel and Distributed Systems}, + title={Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra}, year={2023}, volume={34}, number={12}, @@ -101,37 +105,39 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:

-A High-performance, Energy-efficient Modular DMA Engine Architecture +A High-Performance, Energy-Efficient Modular DMA Engine Architecture

``` -@ARTICLE{benz2023idma, +@ARTICLE{benz2024idma, author={Benz, Thomas and Rogenmoser, Michael and Scheffler, Paul and Riedel, Samuel and Ottaviano, Alessandro and Kurth, Andreas and Hoefler, Torsten and Benini, Luca}, - journal={IEEE Transactions on Computers}, - title={A High-performance, Energy-efficient Modular DMA Engine Architecture}, - year={2023}, - volume={}, - number={}, - pages={1-14}, - doi={10.1109/TC.2023.3329930}} + journal={IEEE Transactions on Computers}, + title={A High-Performance, Energy-Efficient Modular DMA Engine Architecture}, + year={2024}, + volume={73}, + number={1}, + pages={263-277}, + doi={10.1109/TC.2023.3329930} +} ```

-MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores +MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores

``` -@inproceedings{bertaccini2022minifloat, +@INPROCEEDINGS{bertaccini2022minifloat, author={Bertaccini, Luca and Paulin, Gianna and Fischer, Tim and Mach, Stefan and Benini, Luca}, - booktitle={2022 IEEE 29th Symposium on Computer Arithmetic (ARITH)}, - title={MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores}, + booktitle={2022 IEEE 29th Symposium on Computer Arithmetic (ARITH)}, + title={MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores}, year={2022}, volume={}, number={}, - pages={1-8} + pages={1-8}, + doi={10.1109/ARITH54963.2022.00010} } ``` @@ -139,14 +145,14 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:

-Soft Tiles: Capturing Physical Implementation Flexibility for Tightly-Coupled Parallel Processing Clusters +Soft Tiles: Capturing Physical Implementation Flexibility for Tightly-Coupled Parallel Processing Clusters

``` -@inproceedings{paulin2022softtiles, +@INPROCEEDINGS{paulin2022softtiles, author={Paulin, Gianna and Cavalcante, Matheus and Scheffler, Paul and Bertaccini, Luca and Zhang, Yichao and Gürkaynak, Frank and Benini, Luca}, - booktitle={2022 IEEE Computer Society Annual Symposium on VLSI (ISVLSI)}, - title={Soft Tiles: Capturing Physical Implementation Flexibility for Tightly-Coupled Parallel Processing Clusters}, + booktitle={2022 IEEE Computer Society Annual Symposium on VLSI (ISVLSI)}, + title={Soft Tiles: Capturing Physical Implementation Flexibility for Tightly-Coupled Parallel Processing Clusters}, year={2022}, volume={}, number={}, @@ -159,20 +165,23 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:

-SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers +SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers

``` @misc{scheffler2024saris, - title={SARIS: Accelerating Stencil Computations on Energy-Efficient - RISC-V Compute Clusters with Indirect Stream Registers}, - author={Paul Scheffler and Luca Colagrande and Luca Benini}, - year={2024}, - eprint={2404.05303}, - archivePrefix={arXiv}, - primaryClass={cs.MS} + title={SARIS: Accelerating Stencil Computations on Energy-Efficient + RISC-V Compute Clusters with Indirect Stream Registers}, + author={Paul Scheffler and Luca Colagrande and Luca Benini}, + year={2024}, + eprint={2404.05303}, + archivePrefix={arXiv}, + primaryClass={cs.MS}, + url={https://arxiv.org/abs/2404.05303} } ```

+ + diff --git a/docs/publications.md b/docs/publications.md index 4c1e08d0c..eb38ee9d8 100644 --- a/docs/publications.md +++ b/docs/publications.md @@ -1,141 +1,8 @@ # Publications -If you use the Snitch cluster or its extensions in your work, you can cite us: - - - -
-Snitch: A tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads -

- -``` -@article{zaruba2020snitch, - title={Snitch: A tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads}, - author={Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca}, - journal={IEEE Transactions on Computers}, - year={2020}, - publisher={IEEE} -} -``` - -

-
- -
-Stream semantic registers: A lightweight risc-v isa extension achieving full compute utilization in single-issue cores -

- -``` -@article{schuiki2020stream, - title={Stream semantic registers: A lightweight risc-v isa extension achieving full compute utilization in single-issue cores}, - author={Schuiki, Fabian and Zaruba, Florian and Hoefler, Torsten and Benini, Luca}, - journal={IEEE Transactions on Computers}, - volume={70}, - number={2}, - pages={212--227}, - year={2020}, - publisher={IEEE} -} -``` - -

-
- -
-Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra -

- -``` -@article{scheffler2023sparsessr, - author={Scheffler, Paul and Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca}, - journal={IEEE Transactions on Parallel and Distributed Systems}, - title={Sparse Stream Semantic Registers: A Lightweight ISA Extension Accelerating General Sparse Linear Algebra}, - year={2023}, - volume={34}, - number={12}, - pages={3147-3161}, - doi={10.1109/TPDS.2023.3322029} -} -``` - -

-
- -
-A High-performance, Energy-efficient Modular DMA Engine Architecture -

- -``` -@ARTICLE{benz2023idma, - author={Benz, Thomas and Rogenmoser, Michael and Scheffler, Paul and Riedel, Samuel and Ottaviano, Alessandro and Kurth, Andreas and Hoefler, Torsten and Benini, Luca}, - journal={IEEE Transactions on Computers}, - title={A High-performance, Energy-efficient Modular DMA Engine Architecture}, - year={2023}, - volume={}, - number={}, - pages={1-14}, - doi={10.1109/TC.2023.3329930}} -``` - -

-
- -
-MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores -

- -``` -@inproceedings{bertaccini2022minifloat, - author={Bertaccini, Luca and Paulin, Gianna and Fischer, Tim and Mach, Stefan and Benini, Luca}, - booktitle={2022 IEEE 29th Symposium on Computer Arithmetic (ARITH)}, - title={MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores}, - year={2022}, - volume={}, - number={}, - pages={1-8} -} -``` - -

-
- -
-Soft Tiles: Capturing Physical Implementation Flexibility for Tightly-Coupled Parallel Processing Clusters -

- -``` -@inproceedings{paulin2022softtiles, - author={Paulin, Gianna and Cavalcante, Matheus and Scheffler, Paul and Bertaccini, Luca and Zhang, Yichao and Gürkaynak, Frank and Benini, Luca}, - booktitle={2022 IEEE Computer Society Annual Symposium on VLSI (ISVLSI)}, - title={Soft Tiles: Capturing Physical Implementation Flexibility for Tightly-Coupled Parallel Processing Clusters}, - year={2022}, - volume={}, - number={}, - pages={44-49}, - doi={10.1109/ISVLSI54635.2022.00021} -} -``` - -

-
- -
-SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers -

- -``` -@misc{scheffler2024saris, - title={SARIS: Accelerating Stencil Computations on Energy-Efficient - RISC-V Compute Clusters with Indirect Stream Registers}, - author={Paul Scheffler and Luca Colagrande and Luca Benini}, - year={2024}, - eprint={2404.05303}, - archivePrefix={arXiv}, - primaryClass={cs.MS} -} -``` - -

-
- - +{% + include-markdown '../README.md' + start="" + end="" + comments=false +%} diff --git a/docs/rm/custom_instructions.md b/docs/rm/hw/custom_instructions.md similarity index 99% rename from docs/rm/custom_instructions.md rename to docs/rm/hw/custom_instructions.md index 913bcc7be..f270eab43 100644 --- a/docs/rm/custom_instructions.md +++ b/docs/rm/hw/custom_instructions.md @@ -37,7 +37,7 @@ The FREP instruction has the following signature: | max_inst | max_rpt | stagger_max | stagger_mask | 0 | OP-CUSTOM1 | FREP.I | | max_inst | max_rpt | stagger_max | stagger_mask | 1 | OP-CUSTOM1 | FREP.O | -FREP.I and FREP.O repeat the *max_inst + 1* instructions following the FREP instruction for *max_rpt + 1* times. The FREP.I instruction (*I* stands for inner) repeats every instruction the specified number of times and moves on to executing and repeating the next. The FREP.O instruction (*O* stands for outer) repeats the whole sequence of instructions *max_rpt + 1* times. Register staggering can be enabled and configured via the *stagger_mask* and *stagger_max* immediates. A detailed explanation of their use can be found in the Snitch [paper](/publications). +FREP.I and FREP.O repeat the *max_inst + 1* instructions following the FREP instruction for *max_rpt + 1* times. The FREP.I instruction (*I* stands for inner) repeats every instruction the specified number of times and moves on to executing and repeating the next. The FREP.O instruction (*O* stands for outer) repeats the whole sequence of instructions *max_rpt + 1* times. Register staggering can be enabled and configured via the *stagger_mask* and *stagger_max* immediates. A detailed explanation of their use can be found in the Snitch [paper](../..//publications.md). 
The assembly instruction signature follows: diff --git a/docs/rm/peripherals.md b/docs/rm/hw/peripherals.md similarity index 76% rename from docs/rm/peripherals.md rename to docs/rm/hw/peripherals.md index 98ff1063e..43e3b8626 100644 --- a/docs/rm/peripherals.md +++ b/docs/rm/hw/peripherals.md @@ -2,4 +2,4 @@ This section documents the registers exposed by the Snitch cluster to interface with various cluster-level peripherals, including the performance counters. -{% include-markdown '../generated/peripherals.md' %} +{% include-markdown '../../generated/peripherals.md' %} diff --git a/docs/rm/hw/reqrsp_interface.md b/docs/rm/hw/reqrsp_interface.md new file mode 120000 index 000000000..915f3847e --- /dev/null +++ b/docs/rm/hw/reqrsp_interface.md @@ -0,0 +1 @@ +../../../hw/reqrsp_interface/doc/index.md \ No newline at end of file diff --git a/docs/rm/hw/snitch.md b/docs/rm/hw/snitch.md new file mode 120000 index 000000000..29f373f54 --- /dev/null +++ b/docs/rm/hw/snitch.md @@ -0,0 +1 @@ +../../../hw/snitch/doc/index.md \ No newline at end of file diff --git a/docs/rm/hw/snitch_cluster.md b/docs/rm/hw/snitch_cluster.md new file mode 120000 index 000000000..e63fc9b72 --- /dev/null +++ b/docs/rm/hw/snitch_cluster.md @@ -0,0 +1 @@ +../../../hw/snitch_cluster/doc/index.md \ No newline at end of file diff --git a/docs/rm/reqrsp_interface.md b/docs/rm/reqrsp_interface.md deleted file mode 120000 index f7e23bc4e..000000000 --- a/docs/rm/reqrsp_interface.md +++ /dev/null @@ -1 +0,0 @@ -../../hw/reqrsp_interface/doc/index.md \ No newline at end of file diff --git a/docs/rm/snitch.md b/docs/rm/snitch.md deleted file mode 120000 index e36c4db10..000000000 --- a/docs/rm/snitch.md +++ /dev/null @@ -1 +0,0 @@ -../../hw/snitch/doc/index.md \ No newline at end of file diff --git a/docs/rm/snitch_cluster.md b/docs/rm/snitch_cluster.md deleted file mode 120000 index 61c4a3593..000000000 --- a/docs/rm/snitch_cluster.md +++ /dev/null @@ -1 +0,0 @@ -../../hw/snitch_cluster/doc/index.md \ 
No newline at end of file diff --git a/docs/rm/bench/join.md b/docs/rm/sw/bench/join.md similarity index 100% rename from docs/rm/bench/join.md rename to docs/rm/sw/bench/join.md diff --git a/docs/rm/bench/roi.md b/docs/rm/sw/bench/roi.md similarity index 100% rename from docs/rm/bench/roi.md rename to docs/rm/sw/bench/roi.md diff --git a/docs/rm/bench/visualize.md b/docs/rm/sw/bench/visualize.md similarity index 100% rename from docs/rm/bench/visualize.md rename to docs/rm/sw/bench/visualize.md diff --git a/docs/rm/sim/Elf.md b/docs/rm/sw/sim/Elf.md similarity index 100% rename from docs/rm/sim/Elf.md rename to docs/rm/sw/sim/Elf.md diff --git a/docs/rm/sim/Simulation.md b/docs/rm/sw/sim/Simulation.md similarity index 100% rename from docs/rm/sim/Simulation.md rename to docs/rm/sw/sim/Simulation.md diff --git a/docs/rm/sim/Simulator.md b/docs/rm/sw/sim/Simulator.md similarity index 100% rename from docs/rm/sim/Simulator.md rename to docs/rm/sw/sim/Simulator.md diff --git a/docs/rm/sim/data_utils.md b/docs/rm/sw/sim/data_utils.md similarity index 100% rename from docs/rm/sim/data_utils.md rename to docs/rm/sw/sim/data_utils.md diff --git a/docs/rm/sim/sim_utils.md b/docs/rm/sw/sim/sim_utils.md similarity index 100% rename from docs/rm/sim/sim_utils.md rename to docs/rm/sw/sim/sim_utils.md diff --git a/docs/rm/sim/verif_utils.md b/docs/rm/sw/sim/verif_utils.md similarity index 100% rename from docs/rm/sim/verif_utils.md rename to docs/rm/sw/sim/verif_utils.md diff --git a/docs/rm/snitch_target_utils/build.md b/docs/rm/sw/snitch_target_utils/build.md similarity index 100% rename from docs/rm/snitch_target_utils/build.md rename to docs/rm/sw/snitch_target_utils/build.md diff --git a/docs/rm/snitch_target_utils/run.md b/docs/rm/sw/snitch_target_utils/run.md similarity index 100% rename from docs/rm/snitch_target_utils/run.md rename to docs/rm/sw/snitch_target_utils/run.md diff --git a/docs/rm/trace/annotate.md b/docs/rm/sw/trace/annotate.md similarity index 100% rename 
from docs/rm/trace/annotate.md rename to docs/rm/sw/trace/annotate.md diff --git a/docs/rm/trace/events.md b/docs/rm/sw/trace/events.md similarity index 100% rename from docs/rm/trace/events.md rename to docs/rm/sw/trace/events.md diff --git a/docs/rm/trace/gen_trace.md b/docs/rm/sw/trace/gen_trace.md similarity index 100% rename from docs/rm/trace/gen_trace.md rename to docs/rm/sw/trace/gen_trace.md diff --git a/docs/ug/code_optimization.md b/docs/ug/code_optimization.md new file mode 100644 index 000000000..649917f3b --- /dev/null +++ b/docs/ug/code_optimization.md @@ -0,0 +1,78 @@ +# Code Optimization + +The methods covered in the [Debugging and benchmarking](tutorial.md#debugging-and-benchmarking) and [Trace Analysis](trace_analysis.md) pages, show you how to analyze the performance of an application and identify limiting factors. + +The following table summarizes the most common bottlenecks and methods to address them in Snitch. In the next sections we will look individually at each method in detail. + +|Bottleneck |Solution | +|-------------------------------------------------------------|-----------------------------------------| +|[I$ misses](#instruction-cache-misses) |Cache preheating | +|[High-latency load/stores](#high-latency-loadstores) |Pre-loading to L1 TCDM memory | +|[TCDM bank conflicts](#tcdm-bank-conflicts) |Smart data placement | +|[Explicit load/stores](#explicit-loadstores) |Stream-semantic registers (SSRs) | +|[Loop overheads](#loop-overheads) |Loop unrolling or hardware loops (FREP) | +|[Read-after-write (RAW) stalls](#read-after-write-raw-stalls)|Instruction reordering | + +!!! tip + Have a look at the optimized kernels within this repository to see how these optimizations can be implemented. [AXPY](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/sw/blas/axpy) and [DOT](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/sw/blas/dot) are accessible examples for beginners. 
+ +## Instruction cache misses + +While instruction cache (I$) misses are a natural phenomenon which needs to be accounted for, we may want to temporarily remove these to isolate the effect of some other bottleneck. + +L1 I$ misses appear as delays on instructions aligned to the L1 I$ line length. This is specified by the `cacheline` parameter in the [hardware configuration file](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/target/snitch_cluster/cfg/default.hjson). + +To reduce the effect of I$ misses, you can wrap the code you're interested in benchmarking in a loop, while ensuring the compiler does not unroll it (the `volatile` keyword is required to this end): + +```C +for (volatile int i = 0; i < 2; i++) { + snrt_mcycle(); + + snrt_mcycle(); +} +``` + +The first iteration only serves to pre-heat the I$. The second can be benchmarked with the effect of I$ misses reduced to a minimum. + +If the working set size of your code is large enough to still observe I$ misses on the second iteration, you can increase the L1 I$ size by tuning the parameters in the hardware configuration file. + +## High-latency load/stores + +Where variables in your code are placed in memory by the compiler determines the access-latency to these variables. + +Most global and constant data is placed by the compiler in the `.data` and `.bss` linker sections. These are mapped to the [last-level (or L3) memory](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/sw/snRuntime/base.ld), that is the simulation memory in the Snitch cluster testbench. Data which is accessed repeatedly from L3 will repeatedly incur the long round-trip access times to L3. In these cases, caching the data in the cluster's L1 TCDM memory can be beneficial. Since the TCDM memory is not a traditional hardware-managed cache, but a software-managed scratch-pad memory, the data must be explicitly loaded into the cluster's TCDM memory from L3 in software. 
+ +When data needs to be loaded (stored) in bulk from (to) the L3 memory, using the DMA engine in the cluster can be beneficial. The DMA transfers data in _bursts_ and the round-trip latency is paid once for the burst, while successive items, or _beats_, in the burst are delivered in a pipelined fashion. + +## TCDM bank conflicts + +The TCDM memory in the cluster is divided into multiple banks, and the TCDM interconnect connects every port from the Snitch cores to every bank of the memory. Accesses to distinct banks can be executed in parallel, while accesses to the same bank result in a conflict, resulting in wasted cycles until access to the bank is granted to each port. + +Depending on the access patterns of an application, it may be possible to reduce bank conflicts by smartly arranging the data in memory. + +!!! note + Conflicts may occur within a single core, e.g. between the SSR and LSU ports, between cores in the cluster, or even between the DMA engine and the cores. + +## Explicit load/stores + +On a single-issue core, such as Snitch, load/store instructions may limit the performance of an application, as useful compute instructions cannot be issued while a load/store is issued, potentially leading to under-utilization of the compute resources. + +Snitch provides an ISA extension, _Stream semantic registers (SSRs)_, which can be used to stream data from memory, without having to issue load/store instructions. + +!!! info + For more information, please consult the [SSR paper](https://doi.org/10.1109/TC.2020.2987314). + +## Loop overheads + +Similar to load/store instructions, loop management instructions may also represent a significant overhead to a computation. In some cases, it is possible to reduce these overheads to some extent through the use of loop unrolling. By unrolling, the overhead is paid once for every N original loop iterations, where N is equal to the applied unrolling factor. + +Hardware loops allow removing a loop's overhead altogether.
In Snitch, the FREP ISA extension provides hardware loop capabilities for loops comprising floating-point instructions exclusively. In addition to eliminating the loop overheads, the FREP extension provides Snitch with pseudo-dual issue capabilities. + +!!! info + For more information, please consult the [Snitch paper](https://doi.org/10.1109/TC.2020.3027900). + +## Read-after-write (RAW) stalls + +The FPU in Snitch is pipelined, causing instructions to take multiple cycles, from the moment they are issued to the moment they write back to the register file. If the instruction following a floating-point operation depends on the result of the previous operation, i.e. there is a read-after-write (RAW) dependency between successive instructions, the latter will be stalled for a few cycles until the result from the previous is available. + +If other independent instructions are present, it may be possible to reorder these between the two dependent instructions, hiding the RAW latency under other useful instructions. In some cases, loop unrolling is coupled to this technique, to provide independent instructions for the reordering. diff --git a/docs/ug/documentation.md b/docs/ug/documentation.md index 24467bc24..ad464b590 100644 --- a/docs/ug/documentation.md +++ b/docs/ug/documentation.md @@ -1,22 +1,23 @@ # Documentation -Documentation of the generator and related infrastructure is hosted under -`docs`. Static `html` documentation is build from the latest `main` branch by -the CI. We use [mkdocs](https://www.mkdocs.org/) together with the [material -theme](https://squidfunk.github.io/mkdocs-material/). Before building the -documentation, make sure you have the required dependencies installed: +Documentation pages for the Snitch cluster are hosted under `docs`. Static +`html` documentation is built and deployed from the latest `main` branch by the +CI.
We use [mkdocs](https://www.mkdocs.org/) together with the [material +theme](https://squidfunk.github.io/mkdocs-material/). -```shell -pip install . -``` - -After everything is installed, you can build a static copy of the `html` documentation by -executing (in the root directory): +You can build a static copy of the `html` documentation by +executing (in the root of this repository): ```shell make docs ``` +Documentation for the Python sources in this repository is generated from the +docstrings contained within the sources themselves, using +[mkdocstrings](https://mkdocstrings.github.io/). +Documentation for the C sources in this repository is generated from the +Doxygen-style comments within the sources themselves, using Doxygen. + ## Organization The `docs` folder is organized as follows: diff --git a/docs/ug/example_trace.html b/docs/ug/example_trace.html deleted file mode 100644 index ec1467686..000000000 --- a/docs/ug/example_trace.html +++ /dev/null @@ -1,123935 +0,0 @@ - - - - - - - - Trace from trace.json - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/ug/trace_analysis.md b/docs/ug/trace_analysis.md index 0fefb2dfd..2a9d1bad4 100644 --- a/docs/ug/trace_analysis.md +++ b/docs/ug/trace_analysis.md @@ -2,9 +2,12 @@ ## Trace generation -During RTL simulation, the Snitch core complex (CC) dumps a wide set of information to the `logs/trace_hart_XXXXX.dasm` file (see [snitch_cc.sv](https://github.com/pulp-platform/snitch_cluster/blob/main/hw/snitch_cluster/src/snitch_cc.sv)), `XXXXX` denoting the index of the Snitch core in the system. 
+During RTL simulation, the Snitch _core complex (CC)_ dumps a wide set of information to the `logs/trace_hart_XXXXX.dasm` file (see [snitch_cc.sv](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/hw/snitch_cluster/src/snitch_cc.sv)), `XXXXX` denoting the hart ID of the Snitch core in the system. -The [gen_trace.py](../rm/trace/gen_trace.md) script can be used to elaborate this information into a human-readable form, and is invoked by the `make traces` target to generate `logs/trace_hart_XXXXX.txt`. +The [gen_trace.py](../rm/sw/trace/gen_trace.md) script can be used to elaborate this information into a human-readable form, and is invoked by the `make traces` target to generate `logs/trace_hart_XXXXX.txt`. + +!!! info + For more information on the topics covered in this page have a look inside the [gen_trace.py](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/util/trace/gen_trace.py) script. ## Trace walkthrough @@ -79,7 +82,7 @@ One last note should be made about `frep` loops. While not visible from this tra ## Performance metrics -Finally, at the end of the trace, a collection of performance metrics automatically computed from the trace is reported. The performance metrics are associated to regions defined in your code. More information on how to define these regions can be found in the Snitch [tutorial](../../target/snitch_cluster/README.md). +Finally, at the end of the trace, a collection of performance metrics automatically computed from the trace is reported. The performance metrics are associated to regions defined in your code. More information on how to define these regions can be found in the [tutorial](tutorial.md#debugging-and-benchmarking). ``` ## Performance metrics @@ -104,7 +107,7 @@ cycles 87 total_ipc 0.8046 ``` -The trace will contain the most relevant performance metrics for manual inspection. 
These and additional performance metrics can also be dumped to a JSON file for further processing (see [gen_trace.py](../../util/trace/gen_trace.py)). +The trace will contain the most relevant performance metrics for manual inspection. These and additional performance metrics are also dumped to a `.json` file for further processing. In the following table you can find a complete list of all the performance metrics extracted from the trace along with their description: |Metric |Description | diff --git a/docs/ug/tutorial.md b/docs/ug/tutorial.md index 363fa82e3..538f00e34 100644 --- a/docs/ug/tutorial.md +++ b/docs/ug/tutorial.md @@ -2,27 +2,345 @@ The following tutorial will guide you through the use of the Snitch cluster. You will learn how to develop, simulate, debug and benchmark software for the Snitch cluster architecture. - +You can assume the working directory to be `target/snitch_cluster`. All paths are to be assumed relative to this directory. Paths relative to the root of the repository are prefixed with a slash. + +## Setup + +If you don't have access to an IIS machine, and you have set up the Snitch Docker container as described in the [getting started guide](getting_started.md), all of the commands presented in this tutorial will have to be executed in the Docker container. + {% - include-markdown '../../target/snitch_cluster/README.md' + include-markdown '../../util/container/README.md' + start="## Usage" + end="## Limitations" comments=false - start="## Tutorial" + heading-offset=1 %} -## Using Verilator with LLVM +Where you should replace `` with the path to the root directory of the Snitch cluster repository cloned on your machine. + +!!! warning + As QuestaSim and VCS are proprietary tools and require a license, only Verilator is provided within the container for RTL simulations. + +## Building the hardware + +To run software on Snitch without a physical chip, you will need a simulation model of the Snitch cluster. 
You can build a cycle-accurate simulation model from the RTL sources directly using QuestaSim, VCS or Verilator, with either of the following commands: + +=== "Verilator" + ```shell + make bin/snitch_cluster.vlt + ``` + +=== "Questa" + ```shell + make DEBUG=ON bin/snitch_cluster.vsim + ``` + +=== "VCS" + ```shell + make bin/snitch_cluster.vcs + ``` + +These commands compile the RTL sources respectively in `work-vlt`, `work-vsim` and `work-vcs`. Additionally, common C++ testbench sources (e.g. the [frontend server (fesvr)](https://github.com/riscv-software-src/riscv-isa-sim)) are compiled under `work`. Each command will also generate a script or an executable (e.g. `bin/snitch_cluster.vsim`) which we can use to simulate software on Snitch, as we will see in section [Running a simulation](#running-a-simulation). + +!!! info + The variable `DEBUG=ON` is required when using QuestaSim to preserve the visibility of all internal signals. If you need to inspect the simulation waveforms, you should set this variable when building the simulation model. For faster simulations you can omit the variable assignment, allowing QuestaSim to optimize internal signals away. + + +## Building the Banshee simulator + +Instead of building a simulation model from the RTL sources, you can use our instruction-accurate simulator called `banshee`. To install the simulator, please follow the instructions provided in the [Banshee repository](https://github.com/pulp-platform/banshee). + +## Configuring the hardware + +The Snitch cluster RTL sources are partly automatically generated from a configuration file provided in `.hjson` format. Several RTL files are templated and use the `.hjson` configuration file as input to fill in the template. An example is [snitch_cluster_wrapper.sv.tpl](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl). 
+ +In the [`cfg`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/target/snitch_cluster/cfg) folder, different configurations are provided. The [`cfg/default.hjson`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/target/snitch_cluster/cfg/default.hjson) configuration instantiates 8 compute cores + 1 DMA core in the cluster. + +The command you previously executed automatically generated the RTL sources from the templates, and it implicitly used the default configuration file. In this configuration the FPU is not equipped with a floating-point divide and square-root unit. +To override the default configuration file, e.g. to use the configuration with FDIV/FSQRT unit, define the following variable when you invoke `make`: +```shell +make CFG_OVERRIDE=cfg/fdiv.hjson bin/snitch_cluster.vlt +``` + +If you want to use a custom configuration, just point `CFG_OVERRIDE` to the path of your configuration file. + +!!! tip + When you override the configuration file on the `make` command-line, the configuration is stored in the `cfg/lru.hjson` file. Successive invocations of `make` will automatically pick up the `cfg/lru.hjson` file. You can therefore omit the `CFG_OVERRIDE` definition in successive commands unless you want to override the least-recently used configuration. + +## Building the software + +To build all of the software for the Snitch cluster, run the following command. Different simulators may require different C runtime or library function implementations, so different options have to be specified to select the appropriate implementation, e.g. for Banshee simulations or OpenOCD semi-hosting: + +=== "RTL" + + ```bash + make DEBUG=ON sw -j + ``` + +=== "Banshee" + + ```bash + make DEBUG=ON SELECT_RUNTIME=banshee sw -j + ``` + +=== "OpenOCD" + + ```bash + make DEBUG=ON OPENOCD_SEMIHOSTING=ON sw -j + ``` + +This builds all software targets defined in the repository, e.g. the Snitch runtime library and all applications. 
Artifacts are stored in the build directory of each target. For example, have a look inside `sw/apps/blas/axpy/build/` and you will find the artifacts of the AXPY application build, e.g. the compiled executable `axpy.elf` and a disassembly `axpy.dump`. + +If you only want to build a specific software target, you can do so by replacing `sw` with the name of that target, e.g. the name of an application: + +```bash +make DEBUG=ON axpy -j +``` + +For this to be possible, we require all software targets to have unique and distinct names from any other Make target. + +!!! warning + The RTL is not the only source which is generated from the configuration file. The software stack also depends on the configuration file. Make sure you always build the software with the same configuration of the hardware you are going to run it on. + +!!! info + The `DEBUG=ON` flag is used to tell the compiler to produce debugging symbols and disassemble the generated ELF binaries for inspection (`.dump` files in the build directories). Debugging symbols are required by the `annotate` target, showcased in the [Debugging and benchmarking](#debugging-and-benchmarking) section of this guide. + +!!! tip + On GVSOC, it is better to use OpenOCD semi-hosting to prevent putchar from disturbing the DRAMSys timing model.
+ +## Running a simulation + +Run one of the executables which was compiled in the previous step on your Snitch cluster simulator of choice: + +=== "Verilator" + + ```shell + bin/snitch_cluster.vlt sw/apps/blas/axpy/build/axpy.elf + ``` + +=== "Questa" + + ```shell + bin/snitch_cluster.vsim sw/apps/blas/axpy/build/axpy.elf + ``` + +=== "VCS" + + ```shell + bin/snitch_cluster.vcs sw/apps/blas/axpy/build/axpy.elf + ``` + +=== "Banshee" + + ```shell + banshee --no-opt-llvm --no-opt-jit --configuration src/banshee.yaml --trace sw/apps/blas/axpy/build/axpy.elf + ``` + +The simulator binaries can be invoked from any directory, just adapt the relative paths in the preceding commands accordingly, or use absolute paths. We refer to the working directory where the simulation is launched as the _simulation directory_. Within it, you will find several log files produced by the RTL simulation. + +!!! tip + If you don't want your log files to be overridden when you run another simulation, just create separate simulation directories for every simulation whose artifacts you want to preserve, and run the simulations therein. + +The previous commands will launch the simulation on the console. QuestaSim simulations can also be launched with the GUI, e.g. for waveform inspection. Just adapt the previous command to: + +```shell +bin/snitch_cluster.vsim.gui sw/apps/blas/axpy/build/axpy.elf +``` + +## Debugging and benchmarking + +When you run a simulation, every core logs all the instructions it executes in a trace file. The traces are located in the `logs` folder within the simulation directory. Every trace is identified by a hart ID, that is a unique ID for every _hardware thread (hart)_ in a RISC-V system (and since all our cores have a single thread that is a unique ID per core). + +The simulation dumps the traces in a non-human-readable format with `.dasm` extension.
To convert these to a human-readable form run: + +```bash +make traces -j +``` + +If the simulation directory does not coincide with the current working directory, you will have to provide the path to the simulation directory explicitly; this holds for all of the commands in this section: + +```bash +make traces SIM_DIR= -j +``` + +This will generate human-readable traces with `.txt` extension. In addition, several performance metrics will be computed and appended to the end of the trace. These and additional metrics are also dumped to a `.json` file for further processing. Detailed information on how to interpret the traces and performance metrics can be found in the [Trace Analysis](trace_analysis.md) page. + +Debugging a program from the traces alone can be quite tedious and time-consuming, as it would require you to manually understand which lines in your source code every instruction originates from. Surely, you can help yourself with the disassembly, but we can do better. + +You can automatically annotate every instruction with the originating source line using: + +```bash +make annotate -j +``` + +This will produce a `.s` file from every `.txt` trace, in which the instructions from the `.txt` trace are now interleaved with comments indicating which source lines those instructions correspond to. + +!!! note + The `annotate` target uses the `addr2line` binutil behind the scenes, which needs debugging symbols to correlate instruction addresses with originating source code lines. The `DEBUG=ON` flag you specified when building the software is necessary for this step to succeed. + +Every performance metric is associated to a region in the trace. You can define regions by instrumenting your code with calls to the `snrt_mcycle()` function. Every call to this function defines two code regions: -LLVM+clang can be used to build the Verilator model. Optionally specify a path -to the LLVM toolchain in `CLANG_PATH` and set `VLT_USE_LLVM=ON`.
-For the verilated model itself to be complied with LLVM, verilator must be built -with LLVM (`CC=clang CXX=clang++ ./configure`). The `VLT` environment variable -can then be used to point to the verilator binary. +- the code preceding the call, up to the previous `snrt_mcycle()` call or the start of the program +- the code following the call, up to the next `snrt_mcycle()` call or the end of the program +If you would like to benchmark a specific part of your program, you would call `snrt_mcycle()` before and after that part. Performance metrics, such as the IPC, will be extracted for that region separately from other regions. + +Sometimes you may want to graphically visualize the regions in your traces, to have a holistic and high-level view over all cores' operations. This can be useful e.g. to visualize if the compute and DMA phases in a double-buffered application overlap correctly and to what extent. To achieve this, you can use the following command, provided a file specifying the _regions of interest (ROI)_ and associating a textual label to each region: + +```shell +make visual-trace ROI_SPEC=../../sw/blas/axpy/roi.json +``` + +Where `ROI_SPEC` points to the mentioned specification file. + +This command generates the `logs/trace.json` file, which you can graphically visualize in your browser. Go to [http://ui.perfetto.dev/](http://ui.perfetto.dev/) and load the trace file. You can now graphically view the compute and DMA transfer regions in your code. If you click on a region, you will be able to see the performance metrics extracted for that region. Furthermore, you can also view the low-level traces of each core, with the individual instructions. Click on an instruction, and you will be able to see the originating source line information, the same you've seen to be generated by the `annotate` target. + +!!! 
note + As mentioned also for the `annotate` target, the `DEBUG=ON` flag is required when building the software for the source line information to be extracted. + +!!! info + If you want to dig deeper into the ROI specification file syntax and how the visual trace is built behind the scenes, have a look at the documentation for the [`roi.py`](../rm/sw/bench/roi.md) and [`visualize.py`](../rm/sw/bench/visualize.md) scripts or at the sources themselves, hosted in the [`bench`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/util/bench) folder. + +## Developing your first Snitch application + +In the following you will create your own AXPY kernel implementation as an example how to develop software for Snitch. + +### Writing the C code + +Create a directory for your AXPY kernel: + +```bash +mkdir sw/apps/tutorial +``` + +And a `src` subdirectory to host your source code: + +```bash +mkdir sw/apps/tutorial/src +``` + +Here, create a new file named `tutorial.c` with the following contents: + +```C +#include "snrt.h" +#include "data.h" + +// Define your kernel +void axpy(uint32_t l, double a, double *x, double *y, double *z) { + int core_idx = snrt_cluster_core_idx(); + int offset = core_idx * l; + + for (int i = 0; i < l; i++) { + z[offset] = a * x[offset] + y[offset]; + offset++; + } + snrt_fpu_fence(); +} + +int main() { + // Read the mcycle CSR (this is our way to mark/delimit a specific code region for benchmarking) + uint32_t start_cycle = snrt_mcycle(); + + // DM core does not participate in the computation + if(snrt_is_compute_core()) + axpy(L / snrt_cluster_compute_core_num(), a, x, y, z); + + // Read the mcycle CSR + uint32_t end_cycle = snrt_mcycle(); +} + +``` + +The [`snrt.h`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/target/snitch_cluster/sw/runtime/rtl/src/snrt.h) file implements the snRuntime API, a library of convenience functions to program Snitch-cluster-based systems, and it is automatically referenced by our 
compilation scripts. Documentation for the snRuntime can be found at the [Snitch Runtime](../doxygen/html/index.html) pages. + +!!! note + The [snRuntime sources](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/sw/snRuntime) only define the snRuntime API, and provide a base implementation for a subset of functions. A complete implementation of the snRuntime for RTL simulation can be found under [`target/snitch_cluster/sw/runtime/rtl`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/target/snitch_cluster/sw/runtime/rtl). + +We will have to instead create the `data.h` file ourselves. Create a folder to host the data for your kernel to operate on: + +```bash +mkdir sw/apps/tutorial/data +``` + +Here, create a C file named `data.h` with the following contents: + +```C +uint32_t L = 16; + +double a = 2; + +double x[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + +double y[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + +double z[16]; + +``` + +In this file we hardcode the data to be used by the kernel. This data will be loaded in memory together with your application code. + +### Compiling the C code + +In your `tutorial` folder, create a new file named `app.mk` with the following contents: + +```make +APP = tutorial +SRCS = src/tutorial.c +INCDIRS = data + +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk +``` + +This file will be included in the top-level Makefile, compiling your source code into an executable with the name provided in the `APP` variable. + +In order for the top-level Makefile to find your application, add your application's directory to the `APPS` variable in [`sw.mk`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/target/snitch_cluster/sw.mk): + +``` +APPS += sw/apps/tutorial +``` + +Now you can recompile the software, including your newly added tutorial application, as shown in section [Building the software](#building-the-software). + +!!! 
note + Only the software targets depending on the sources you have added/modified have been recompiled. + +!!! info + If you want to dig deeper into how our build system works and how these files were generated you can start from the [top-level Makefile](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/target/snitch_cluster/Makefile) and work your way through the other Makefiles included within it. + +### Running your application + +You can then run your application as shown in section [Running a simulation](#running-a-simulation). Make sure to pick up the right binary, i.e. `sw/apps/tutorial/build/tutorial.elf`. + +### Generating input data + +In general, you may want to randomly generate the data for your application. You may also want to test your kernel on different problem sizes, e.g. varying the length of the AXPY vectors, without having to manually rewrite the file. + +The approach we use is to generate the header file with a Python script. An input `.json` file can be used to configure the data generation, e.g. to set the length of the AXPY vectors. Have a look at the [`datagen.py`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/sw/blas/axpy/scripts/datagen.py) and [`params.json`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/sw/blas/axpy/data/params.json) files in our full-fledged [AXPY application](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/sw/blas/axpy/) as an example. As you can see, the data generation script reuses many convenience classes and functions from the [`data_utils`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/util/sim/data_utils.py) module. We advise you to do the same. Documentation for this module can be found at the [auto-generated pages](../rm/sw/sim/data_utils.md). + +### Verifying your application + +When developing an application, it is good practice to verify the results of your application against a golden model. 
The traditional approach is to generate expected results in your data generation script, dump these into the header file and extend your application to check its results against the expected results, _in simulation_! Every cycle spent on verification is simulated, and this may take a significant time for large designs. We refer to this approach as the _Built-in self-test (BIST)_ approach. + +A better alternative is to read out the results from your application at the end of the simulation, and compare them outside of the simulation. You may have a look at our AXPY's [`verify.py`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/sw/blas/axpy/scripts/verify.py) script as an example. This script can be used to verify the AXPY application by prepending it to the usual simulation command, as: + +```shell +../../sw/blas/axpy/scripts/verify.py bin/snitch_cluster.vlt sw/apps/blas/axpy/build/axpy.elf +``` + +You can test if the verification passed by checking that the exit code of the previous command is 0 (e.g. in a bash terminal): ```bash -# Optional: Specify which llvm to use -export CLANG_PATH=/path/to/llvm-12.0.1 -# Optional: Point to a verilator binary compiled with LLVM -export VLT=/path/to/verilator-llvm/bin/verilator -make VLT_USE_LLVM=ON bin/snitch_cluster.vlt +echo $? ``` + +Again, most of the logic in the script is implemented in convenience classes and functions provided by the [`verif_utils`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/util/sim/verif_utils.py) module. Documentation for this module can be found at the [auto-generated pages](../rm/sw/sim/verif_utils.md). + +!!! info + The `verif_utils` functions build upon a complex verification infrastructure, which uses inter-process communication (IPC) between the Python process and the simulation process to get the results of your application at the end of the simulation. 
If you want to dig deeper into how this framework is implemented, have a look at the [`SnitchSim.py`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/util/sim/SnitchSim.py) module and the IPC files within the [`test`](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/target/common/test) folder. + +## Code reuse + +As you may have noticed, there is a good deal of code which is independent of the hardware platform we execute our AXPY kernel on. This is true for the `data.h` file and possible data generation scripts. The Snitch AXPY kernel itself is not specific to the Snitch cluster, but can be ported to any platform which provides an implementation of the snRuntime API. An example is Occamy, with its own testbench and SW development environment. + +It is thus preferable to develop the data generation scripts and Snitch kernels in a shared location, from which multiple platforms can take and include the code. The `sw` directory in the root of this repository was created with this goal in mind. For the AXPY example, shared sources are hosted under the `sw/blas/axpy` directory. + +We recommend that you follow this approach in your own developments as well, for as much of the code as can be reused. diff --git a/hw/reqrsp_interface/doc/index.md b/hw/reqrsp_interface/doc/index.md index bac6eaab3..7d14b13db 100644 --- a/hw/reqrsp_interface/doc/index.md +++ b/hw/reqrsp_interface/doc/index.md @@ -1,5 +1,8 @@ # Reqrsp Interface +!!! warning + This page is no longer maintained, and may contain outdated information. + The `reqrsp_interface` (request and response) is a custom interface based on common principles found in other interconnects such as AXI or TileLink. It has only two channels (request and response) which are handshaked according to the diff --git a/hw/snitch/doc/index.md b/hw/snitch/doc/index.md index 2711e6089..8b4badc0a 100644 --- a/hw/snitch/doc/index.md +++ b/hw/snitch/doc/index.md @@ -1,5 +1,8 @@ # Snitch +!!!
warning + This page is no longer maintained, and may contain outdated information. + Snitch is a single-stage, single-issue, in-order RISC-V core (RV32I or RV32E) tuned for simplicity and minimal area footprint. Furthermore it is highly configurable and can be used in a plethora of different applications. @@ -7,7 +10,7 @@ configurable and can be used in a plethora of different applications. The core has an optional accelerator interface which can be used to control and off-load RISC-V instructions. The load/store interface is a dual-channel interface with a separately handshaked request and response channel. More -information can be found [here](../../rm/reqrsp_interface). +information can be found [here](reqrsp_interface.md). This folder contains the main Snitch core, incl. L0 translation lookaside buffer (TLB), register file and load store unit (LSU). diff --git a/mkdocs.yml b/mkdocs.yml index d13ad0eb1..43e2d23a8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,9 +4,10 @@ site_name: Snitch Cluster theme: name: material - icon: repo: fontawesome/brands/github + features: + - content.code.copy repo_url: https://github.com/pulp-platform/snitch_cluster repo_name: pulp-platform/snitch_cluster @@ -16,10 +17,12 @@ markdown_extensions: - def_list - pymdownx.highlight - pymdownx.superfences - - pymdownx.tabbed + - pymdownx.tabbed: + alternate_style: true - pymdownx.emoji: emoji_index: !!python/name:material.extensions.emoji.twemoji emoji_generator: !!python/name:material.extensions.emoji.to_svg + - toc plugins: - include-markdown - mkdocstrings: @@ -45,37 +48,35 @@ nav: - Base: ug/tutorial.md - Advanced: - Trace Analysis: ug/trace_analysis.md + - Code Optimization: ug/code_optimization.md - Documentation: ug/documentation.md - # Remove - # - Trace Example: ug/example_trace.html - Reference Manual: - Hardware: - - Snitch: rm/snitch.md + - Snitch: rm/hw/snitch.md - Snitch Cluster: - - Overview: rm/snitch_cluster.md - - Peripherals: rm/peripherals.md + - Overview: 
rm/hw/snitch_cluster.md + - Peripherals: rm/hw/peripherals.md - Schema: schema-doc/snitch_cluster.md - - Reqrsp Interface: rm/reqrsp_interface.md - - Custom Instructions: rm/custom_instructions.md - # - Solder: rm/solder.md + - Reqrsp Interface: rm/hw/reqrsp_interface.md + - Custom Instructions: rm/hw/custom_instructions.md - Software: - Simulation Utilities: - - data_utils: rm/sim/data_utils.md - - verif_utils: rm/sim/verif_utils.md - - sim_utils: rm/sim/sim_utils.md - - rm/sim/Simulation.md - - rm/sim/Simulator.md - - rm/sim/Elf.md + - data_utils: rm/sw/sim/data_utils.md + - verif_utils: rm/sw/sim/verif_utils.md + - sim_utils: rm/sw/sim/sim_utils.md + - rm/sw/sim/Simulation.md + - rm/sw/sim/Simulator.md + - rm/sw/sim/Elf.md - Trace Utilities: - - gen_trace.py: rm/trace/gen_trace.md - - annotate.py: rm/trace/annotate.md - - events.py: rm/trace/events.md + - gen_trace.py: rm/sw/trace/gen_trace.md + - annotate.py: rm/sw/trace/annotate.md + - events.py: rm/sw/trace/events.md - Benchmarking Utilities: - - join.py: rm/bench/join.md - - roi.py: rm/bench/roi.md - - visualize.py: rm/bench/visualize.md + - join.py: rm/sw/bench/join.md + - roi.py: rm/sw/bench/roi.md + - visualize.py: rm/sw/bench/visualize.md - Snitch Target Utilities: - - run.py: rm/snitch_target_utils/run.md - - build.py: rm/snitch_target_utils/build.md + - run.py: rm/sw/snitch_target_utils/run.md + - build.py: rm/sw/snitch_target_utils/build.md - Snitch Runtime: doxygen/html/index.html - Publications: publications.md diff --git a/target/snitch_cluster/README.md b/target/snitch_cluster/README.md index ce67ea9ec..f50adae0a 100644 --- a/target/snitch_cluster/README.md +++ b/target/snitch_cluster/README.md @@ -1,346 +1,9 @@ # Snitch cluster target The Snitch cluster target (`target/snitch_cluster`) is a simple RTL testbench -around a Snitch cluster. The cluster can be configured using a config file. By default, the config file which will be picked up is `target/snitch_cluster/cfg/default.hsjon`. 
- -The configuration parameters are documented using JSON schema. Documentation for the schema and available configuration options can be found in `docs/schema-doc/snitch_cluster/`). +around a Snitch cluster. The cluster testbench simulates an infinite memory. The RISC-V ELF file to be simulated is preloaded using RISC-V's Front-End Server (`fesvr`). -## Tutorial - -In the following tutorial you can assume the working directory to be `target/snitch_cluster`. All paths are to be assumed relative to this directory. Paths relative to the root of the repository are prefixed with a slash. - -### Building the hardware - -To compile the hardware for simulation run one of the following commands, depending on the desired simulator: - -```shell -# Verilator -make bin/snitch_cluster.vlt - -# Questa -make DEBUG=ON bin/snitch_cluster.vsim - -# VCS -make bin/snitch_cluster.vcs -``` - -These commands compile the RTL sources respectively in `work-vlt`, `work-vsim` and `work-vcs`. Additionally, common C++ testbench sources (e.g. the [frontend server (fesvr)](https://github.com/riscv-software-src/riscv-isa-sim)) are compiled under `work`. Each command will also generate a script or an executable (e.g. `bin/snitch_cluster.vsim`) which you can invoke to simulate the hardware. We will see how to do this in a later section. -The variable `DEBUG=ON` is used to preserve the visibility of all the internal signals during simulation. - -### Building the Banshee simulator -Instead of running an RTL simulation, you can use our instruction-accurate simulator called `banshee`. To install the simulator, please follow the instructions of the Banshee repository: [https://github.com/pulp-platform/banshee](https://github.com/pulp-platform/banshee). - -### Cluster configuration - -Note that the Snitch cluster RTL sources are partly automatically generated from a configuration file provided in `.hjson` format. 
Several RTL files are templated and use the `.hjson` configuration file to fill the template entries. An example is `/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl`. - -Under the `cfg` folder, different configurations are provided. The `cfg/default.hjson` configuration instantiates 8 compute cores + 1 DMA core in the cluster. If you need a specific configuration you can create your own configuration file. - -The command you executed previously automatically generated the templated RTL sources. It implicitly used the default configuration file. -To override the default configuration file, define the following variable when you invoke `make`: -```shell -make CFG_OVERRIDE=cfg/custom.hjson bin/snitch_cluster.vlt -``` - -___Note:__ whenever you override the configuration file on the `make` command-line, the configuration will be stored in the `cfg/lru.hjson` file. Successive invocations of `make` will automatically pick up the `cfg/lru.hjson` file. You can therefore omit the `CFG_OVERRIDE` definition in successive commands unless you want to override the least-recently used configuration._ - -Banshee uses also a cluster configuration file, however, that is given directly when simulating a specific binary with banshee with the help of `--configuration `. - -### Building the software - -To build all of the software for the Snitch cluster, run the following command: - -```bash -# for RTL simulation -make DEBUG=ON sw - -# for Banshee simulation (requires slightly different runtime) -make SELECT_RUNTIME=banshee DEBUG=ON sw - -# to use OpenOCD semi-hosting for putchar and termination -make DEBUG=ON OPENOCD_SEMIHOSTING=ON sw -``` - -The `sw` target first generates some C header files which depend on the hardware configuration. Hence, the need to generate the software for the same configuration as your hardware. Afterwards, it recursively invokes the `make` target in the `sw` subdirectory to build the apps/kernels which have been developed in that directory. 
- -The `DEBUG=ON` flag is used to tell the compiler to produce debugging symbols. It is necessary for the `annotate` target, showcased in the Debugging section of this guide, to work. - -The `SELECT_RUNTIME` flag is set by default to `rtl`. To build the software with the Banshee runtime, set the flag to `banshee`. - -___Note:__ the RTL is not the only source which is generated from the configuration file. The software stack also depends on the configuration file. Make sure you always build the software with the same configuration of the hardware you are going to run it on._ - -___Note:__ on GVSOC, it is better to use OpenOCD semi-hosting to prevent putchar from disturbing the DRAMSys timing model._ - -### Running a simulation - -Run one of the executables which was compiled in the previous step on your Snitch cluster simulator of choice: - -```shell -# Verilator -bin/snitch_cluster.vlt sw/apps/blas/axpy/build/axpy.elf - -# Questa -bin/snitch_cluster.vsim sw/apps/blas/axpy/build/axpy.elf - -# VCS -bin/snitch_cluster.vcs sw/apps/blas/axpy/build/axpy.elf - -# Banshee -banshee --no-opt-llvm --no-opt-jit --configuration src/banshee.yaml --trace sw/apps/blas/axpy/build/axpy.elf -``` - -The Snitch cluster simulator binaries can be invoked from any directory, just adapt the relative paths in the preceding commands accordingly, or use absolute paths. We refer to the working directory where the simulation is launched as the simulation directory. Within it, you will find several log files produced by the RTL simulation. - -The previous commands will launch the simulation on the console. QuestaSim simulations can also be launched with the QuestaSim GUI, by adapting the previous command to: - -```shell -# Questa -bin/snitch_cluster.vsim.gui sw/apps/blas/axpy/build/axpy.elf -``` - -For Banshee, you need to give a specific cluster configuration to the simulator with the flag `--configuration `. A default Snitch cluster configuration is given (`src/banshee.yaml`). 
The flag `--trace` enables the printing of the traces similar to the RTL simulation. -For more information and debug options, please have a look at the Banshee repository: [https://github.com/pulp-platform/banshee](https://github.com/pulp-platform/banshee). - -### Creating your first Snitch app - -In the following you will create your own AXPY kernel implementation as an example how to develop software for Snitch. - -#### Writing the C Code - -Create a directory for your AXPY kernel under `sw/`: - -```bash -mkdir sw/apps/axpy -``` - -And a `src` subdirectory to host your source code: - -```bash -mkdir sw/apps/axpy/src -``` - -Here, create a new file named `axpy.c` inside the `src` directory with the following contents: - -```C -#include "snrt.h" -#include "data.h" - -// Define your kernel -void axpy(uint32_t l, double a, double *x, double *y, double *z) { - for (uint32_t i = 0; i < l ; i++) { - z[i] = a * x[i] + y[i]; - } - snrt_fpu_fence(); -} - -int main() { - // Read the mcycle CSR (this is our way to mark/delimit a specific code region for benchmarking) - uint32_t start_cycle = snrt_mcycle(); - - // DM core does not participate in the computation - if(snrt_is_compute_core()) - axpy(L, a, x, y, z); - - // Read the mcycle CSR - uint32_t end_cycle = snrt_mcycle(); -} - -``` - -The `snrt.h` file implements the snRuntime API, a library of convenience functions to program Snitch cluster based systems. These sources are located under `target/snitch_cluster/sw/runtime/rtl` and are automatically referenced by our compilation scripts. - -___Note:__ Have a look at the files inside `sw/snRuntime` in the root of this repository to see what kind of functionality the snRuntime API defines. Note this is only an API, with some base implementations. The Snitch cluster implementation of the snRuntime for RTL simulation can be found under `target/snitch_cluster/sw/runtime/rtl`. 
It is automatically built and linked with user applications thanks to our compilation scripts._ - -We will have to instead create the `data.h` file ourselves. Create a `target/snitch_cluster/sw/apps/axpy/data` folder to host the data for your kernel to operate on: - -```bash -mkdir sw/apps/axpy/data -``` - -Here, create a C file named `data.h` with the following contents: - -```C -uint32_t L = 16; - -double a = 2; - -double x[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - -double y[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - -double z[16]; - -``` - -In this file we hardcode the data to be used by the kernel. This data will be loaded in memory together with your application code. In general, to verify your code you may want to randomly generate the above data. You may also want to test your kernel on different problem sizes, e.g. varying the length of the vectors, without having to manually rewrite the file. This can be achieved by generating the data header file with a Python script. You may have a look at the `sw/blas/axpy/scripts/datagen.py` script in the root of this repository as an example. As you can see, it reuses many convenience classes and functions for data generation from the `data_utils` module. Documentation for this module can be found [here](https://pulp-platform.github.io/snitch_cluster/rm/sim/data_utils.html). - -#### Compiling the C Code - -In your `axpy` folder, create a new file named `Makefile` with the following contents: - -```make -APP = axpy -SRCS = src/axpy.c -INCDIRS = data - -include ../common.mk -``` - -This Makefile will be invoked recursively by the top-level Makefile, compiling your source code into an executable with the name provided in the `APP` variable. 
- -In order for the top-level Makefile to find your application, add your application's directory to the `APPS` variable in `sw.mk`: - -``` -APPS += sw/apps/axpy -``` - -Now you can recompile all software, including your newly added AXPY application: - -```shell -make DEBUG=ON sw -``` - -Note, only the targets depending on the sources you have added/modified will be recompiled. - -In the `sw/apps/axpy/build` directory, you will now find your `axpy.elf` executable and some other files which were automatically generated to aid debugging. Open `axpy.dump` and search for ``, `` and ``. You will see the addresses where the respective vectors defined in `data.h` have been allocated by the compiler. This file can also be very useful to see what assembly instructions your source code was compiled to, and correlate the traces (we will later see) with the source code. - -If you want to dig deeper into how our build system works and how these files were generated you can follow the recursive Makefile invocations starting from the `sw` target in `snitch_cluster/Makefile`. - -#### Run your application - -You can run your application in simulation as shown in the previous sections. Make sure to pick up the right binary, e.g.: - -```shell -bin/snitch_cluster.vsim sw/apps/axpy/build/axpy.elf -``` - -### Debugging and benchmarking - -When you run the simulation, every core will log all the instructions it executes (along with additional information, such as the value of the registers before/after the instruction) in a trace file. The traces are located in the `logs` folder within the simulation directory. The traces are identified by their hart ID, that is a unique ID for every hardware thread (hart) in a RISC-V system (and since all our cores have a single thread that is a unique ID per core). - -The simulation logs the traces in a non-human readable format with `.dasm` extension. 
To convert these to a human-readable form run: - -```bash -make -j traces -``` - -If the simulation directory does not coincide with the current working directory, you will have to specify the path explicitly: - -```bash -make -j traces SIM_DIR= -``` - -Detailed information on how to interpret the generated traces can be found [here](../../docs/ug/trace_analysis.md). - -In addition to generating readable traces (`.txt` format), the above command also computes several performance metrics from the trace and appends them at the end of the trace. These can be collected into a single CSV file with the following target: - -```bash -make logs/perf.csv -# View the CSV file -libreoffice logs/perf.csv -``` - -In this file you can find the `X_tstart` and `X_tend` metrics. These are the cycles in which a particular code region `X` starts and ends, and can hence be used to profile your code. Code regions are defined by calls to `snrt_mcycle()`. Every call to this function defines two code regions: -- the code preceding the call, up to the previous `snrt_mcycle()` call or the start of the source file -- the code following the call, up to the next `snrt_mcycle()` call or the end of the source file - -The CSV file can be useful to automate collection and post-processing of benchmarking data. - -Finally, debugging your program from the trace alone can be quite tedious and time-consuming. You would have to manually understand which instructions in the trace correspond to which lines in your source code. Surely, you can help yourself with the disassembly. - -Alternatively, you can automatically annotate the traces with that information. With the following commands you can view the trace instructions side-by-side with the corresponding source code lines they were compiled from: - -```bash -make -j annotate -kompare -o logs/trace_hart_00000.diff -``` - -If you prefer to view this information in a regular text editor (e.g. for search), you can open the `logs/trace_hart_xxxxx.s` files. 
Here, the annotations are interleaved with the trace rather than being presented side-by-side. - -___Note:__ the `annotate` target uses the `addr2line` binutil behind the scenes, which needs debugging symbols to correlate instruction addresses with originating source code lines. The `DEBUG=ON` flag you specified when building the software is used to tell the compiler to produce debugging symbols when compiling your code._ - -The traces contain a lot of information which we might not be interested at first. To simply visualize the runtime of the compute region in our code, first create a file named `layout.csv` in `sw/apps/axpy` with the following contents: - -``` - , compute -"range(0,8)", 1 -8 , - -``` - -Then run the following commands: - -```bash -# Similar to logs/perf.csv but filters all but tstart and tend metrics -make logs/event.csv -# Labels, filters and reorders the event regions as specified by an application-specific layout file -../../util/trace/layout_events.py logs/event.csv sw/apps/axpy/layout.csv -o logs/trace.csv -# Creates a trace file which can be visualized with Chrome's TraceViewer -../../util/trace/eventvis.py -o logs/trace.json logs/trace.csv -``` - -Go to `http://ui.perfetto.dev/`. Here you can load the `logs/trace.json` file and graphically view the runtime of the compute region in your code. To learn more about the layout file syntax and what the Python scripts do you can have a look at the description comment at the start of the scripts themselves. - -__Great, but, have you noticed a problem?__ - -Look into `sw/apps/axpy/build/axpy.dump` and search for the address of the output variable `` : - -``` -Disassembly of section .bss: - -80000960 : - ... -``` - -Now grep this address in your traces: - -```bash -grep 80000960 logs/*.txt -... -``` - -It appears in every trace! All the cores issue a `fsd` (float store double) to this address. You are not parallelizing your kernel but executing it 8 times! 
- -Modify `sw/apps/axpy/src/axpy.c` to truly parallelize your kernel: - -```C -#include "snrt.h" -#include "data.h" - -// Define your kernel -void axpy(uint32_t l, double a, double *x, double *y, double *z) { - int core_idx = snrt_cluster_core_idx(); - int offset = core_idx * l; - - for (int i = 0; i < l; i++) { - z[offset] = a * x[offset] + y[offset]; - offset++; - } - snrt_fpu_fence(); -} - -int main() { - // Read the mcycle CSR (this is our way to mark/delimit a specific code region for benchmarking) - uint32_t start_cycle = snrt_mcycle(); - - // DM core does not participate in the computation - if(snrt_is_compute_core()) - axpy(L / snrt_cluster_compute_core_num(), a, x, y, z); - - // Read the mcycle CSR - uint32_t end_cycle = snrt_mcycle(); -} -``` - -Now re-run your kernel and compare the execution time of the compute region with the previous version. - -## Code Reuse - -As you may have noticed, there is a good deal of code which is independent of the hardware platform we execute our AXPY kernel on. This is true for the `data.h` file and possible data generation scripts. The Snitch AXPY kernel itself is not specific to the Snitch cluster, but can be ported to any platform which provides an implementation of the snRuntime API. An example is Occamy, with its own testbench and SW development environment. - -It is thus preferable to develop the data generation scripts and Snitch kernels in a shared location, from which multiple platforms can take and include the code. The `sw` directory in the root of this repository was created with this goal in mind. For the AXPY example, shared sources are hosted under the `sw/blas/axpy` directory. As an example of how these shared sources are used to build an AXPY application for a specific platform (in this case the standalone Snitch cluster) you can have a look at the `target/snitch_cluster/sw/apps/blas/axpy`. 
- -We recommend that you follow this approach also in your own developments for as much of the code which can be reused. +You can find information on how to build and simulate the Snitch cluster in the dedicated [tutorial](https://pulp-platform.github.io/snitch_cluster/ug/tutorial.html). diff --git a/util/container/README.md b/util/container/README.md index 714e55a13..05a136926 100644 --- a/util/container/README.md +++ b/util/container/README.md @@ -10,7 +10,7 @@ There is a pre-built version of the container available online. This version is To download the container, first login to the GitHub container registry: ```shell -$ docker login ghcr.io +docker login ghcr.io ``` You will be asked for a username (your GitHub username). As a password you should use a @@ -19,16 +19,15 @@ that at least has package registry read permission. You can then install the container by running: ```shell -$ docker pull ghcr.io/pulp-platform/snitch_cluster:main +docker pull ghcr.io/pulp-platform/snitch_cluster:main ``` ### Build instructions -In case you cannot use the pre-built container, e.g. if you need to make changes to the Dockerfile, you can build the -container locally by running the following command in the root of the repository: +In case you cannot use the pre-built container, e.g. if you need to make changes to the Dockerfile, you can build the container locally by running the following command in the root of the repository: ```shell -$ sudo docker buildx build -t ghcr.io/pulp-platform/snitch_cluster:main -f util/container/Dockerfile . +sudo docker buildx build -t ghcr.io/pulp-platform/snitch_cluster:main -f util/container/Dockerfile . ``` ## Usage @@ -36,7 +35,7 @@ $ sudo docker buildx build -t ghcr.io/pulp-platform/snitch_cluster:main -f util/ To run the container in interactive mode: ```shell -$ docker run -it -v $REPO_TOP:/repo -w /repo ghcr.io/pulp-platform/snitch_cluster:main +docker run -it -v <path to repository root>:/repo -w /repo ghcr.io/pulp-platform/snitch_cluster:main ``` ## Limitations