From 546c9ae468ea8fbdbbce583e6ba5da1b7d98d9b8 Mon Sep 17 00:00:00 2001
From: Gianna Paulin
Date: Tue, 27 Jun 2023 10:25:00 +0200
Subject: [PATCH] docs: Adapt documentation and schemas for new repository

* clustergen: Require absolute paths to schemas
* ci: Freeze verible_version for CI checks
* doc: Update directory structure documentation to include READMEs of the
  main directories and add mkdocs-include-markdown-plugin as requirement
* doc: Update overall documentation
* docs: Reduce redundancy and organize user guide in a linear way
* ci: Update to new version of `pulp-actions`
* .github: Add CODEOWNERS file
* doc: Add README.md
* doc: Clean up publications, keep only cluster-related works
* doc: Address review feedback
* doc: Build on `main` branch
* doc: Some fixes

---------

Co-authored-by: Luca Colagrande
Co-authored-by: Paul Scheffler
---
 .github/CODEOWNERS | 11 +
 .github/workflows/gitlab-ci.yaml | 2 +-
 .github/workflows/lint.yml | 7 +-
 README.md | 143 +++++++++
 docs/doxybook2.json | 2 +-
 docs/index.md | 8 +-
 docs/publications.md | 96 +++++-
 docs/requirements.txt | 2 +
 docs/rm/reqrsp_interface.md | 1 +
 docs/rm/snitch.md | 1 +
 docs/rm/snitch_cluster.md | 1 +
 docs/schema/snitch_cluster.schema.json | 2 +-
 docs/schema/snitch_cluster_tb.schema.json | 4 +-
 docs/ug/directory_structure.md | 94 +++---
 docs/ug/docker.md | 1 -
 docs/ug/documentation.md | 12 +-
 docs/ug/getting_started.md | 176 +++++------
 docs/ug/setup-iis.md | 297 ------------------
 docs/ug/snitch_cluster.md | 110 -------
 docs/ug/tutorial.md | 28 ++
 hw/README.md | 10 +
 mkdocs.yml | 62 ++--
 sw/README.md | 7 +-
 target/README.md | 16 +
 target/snitch_cluster/README.md | 337 ++++++++++++++++++---
 target/snitch_cluster/WALKTHROUGH.md | 350 ----------------------
 util/clustergen/cluster.py | 24 +-
 util/container/README.md | 36 +--
 28 files changed, 797 insertions(+), 1043 deletions(-)
 create mode 100644 .github/CODEOWNERS
 create mode 100644 README.md
 create mode 120000 docs/rm/reqrsp_interface.md
 create mode 120000 docs/rm/snitch.md
 create mode 120000 docs/rm/snitch_cluster.md
 delete mode 120000 docs/ug/docker.md
 delete mode 100644 docs/ug/setup-iis.md
 delete mode 100644 docs/ug/snitch_cluster.md
 create mode 100644 docs/ug/tutorial.md
 create mode 100644 hw/README.md
 create mode 100644 target/README.md
 delete mode 100644 target/snitch_cluster/WALKTHROUGH.md

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 000000000..01fff63fb
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,11 @@
+# These owners will be the default owners for everything in the repo.
+# Unless a later match takes precedence, global owners below will be
+# requested for review when someone opens a pull request.
+
+* @paulsc96 @colluca
+
+hw/snitch_cluster @paulsc96 @lucabertaccini
+hw/snitch_dma @paulsc96 @thommythomaso
+hw/snitch_icache @paulsc96 @SamuelRiedel
+
+sw @colluca @fischeti @viv-eth
diff --git a/.github/workflows/gitlab-ci.yaml b/.github/workflows/gitlab-ci.yaml
index ff744fe72..e367a594b 100644
--- a/.github/workflows/gitlab-ci.yaml
+++ b/.github/workflows/gitlab-ci.yaml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-22.04
     steps:
       - name: Check Gitlab CI
-        uses: pulp-platform/pulp-actions/gitlab-ci@v2
+        uses: pulp-platform/pulp-actions/gitlab-ci@v2.1.0
         # Skip on forks or pull requests from forks due to missing secrets.
if: # yamllint disable rule:line-length diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ec6875f21..545fff3ab 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -12,6 +12,7 @@ jobs: # Verible Lint # ################ verible-lint: + name: Lint Verilog sources runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -25,15 +26,17 @@ jobs: fail_on_error: true reviewdog_reporter: github-check extra_args: "--waiver_files util/lint/waiver.verible" + verible_version: "v0.0-3318-g8d254167" ##################### # Vendor Up-to-Date # ##################### bender-vendor-up-to-date: + name: Check bender vendor up-to-date runs-on: ubuntu-latest steps: - name: Check bender vendor up-to-date - uses: pulp-platform/pulp-actions/bender-vendor-up-to-date@v2 + uses: pulp-platform/pulp-actions/bender-vendor-up-to-date@v2.1.0 ###################### # Opcodes Up-to-Date # @@ -61,7 +64,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check License - uses: pulp-platform/pulp-actions/lint-license@patch/license-checker + uses: pulp-platform/pulp-actions/lint-license@v2.1.0 with: patches: 0001-Allow-hash-comments-in-assembly.patch # We cover ETH Zurich and lowRISC licenses and Apache 2.0 diff --git a/README.md b/README.md new file mode 100644 index 000000000..9c0b8c954 --- /dev/null +++ b/README.md @@ -0,0 +1,143 @@ +![CI](https://github.com/pulp-platform/snitch_cluster/actions/workflows/ci.yml/badge.svg) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +# Snitch Cluster + +This repository hosts the hardware and software for the Snitch cluster and its generator. Snitch is a high-efficiency compute cluster platform focused on floating-point workloads. It is developed as part of the PULP project, a joint effort between ETH Zurich and the University of Bologna. + +## Getting Started + +To get started, check out the [getting started guide](https://pulp-platform.github.io/snitch_cluster/ug/getting_started.html). + +## Content + +What can you expect to find in this repository? + +- The RISC-V [Snitch integer core](https://pulp-platform.github.io/snitch_cluster/rm/snitch/). This can be useful stand-alone if you are just interested in re-using the core for your project, e.g., as a tiny control core or you want to make a peripheral smart. The sky is the limit. +- The [Snitch cluster](https://pulp-platform.github.io/snitch_cluster/ug/snitch_cluster/). A highly configurable cluster containing one to many integer cores with optional floating-point capabilities as well as our custom ISA extensions `Xssr`, `Xfrep`, and `Xdma`. +- A runtime and example applications for the Snitch cluster. +- RTL simulation environments for Verilator, Questa Advanced Simulator, and VCS, as well as configurations for our [Banshee system simulator](https://github.com/pulp-platform/banshee) + +This code was previously hosted in the [Snitch monorepo](https://github.com/pulp-platform/snitch) and was spun off into its own repository to simplify maintenance and dependency handling. Note that our Snitch-based manycore system [Occamy](https://github.com/pulp-platform/occamy) has also moved. + +## Tool Requirements + +* `verilator >= v4.1` +* `bender >= v0.27.0` + +## License + +Snitch is being made available under permissive open source licenses. 
+
+The following files are released under the Apache License 2.0 (`Apache-2.0`), see `LICENSE`:
+
+- `sw/`
+- `util/`
+
+The following files are released under the Solderpad v0.51 license (`SHL-0.51`), see `hw/LICENSE`:
+
+- `hw/`
+
+The `sw/deps` directory references submodules that come with their own
+licenses. See the respective folder for the licenses used.
+
+- `sw/deps/`
+
+## Publications
+
+If you use the Snitch cluster or its extensions in your work, you can cite us:
+
+<details>
+<summary><b>Snitch: A tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads</b></summary>
+<p>
+
+```
+@article{zaruba2020snitch,
+  title={Snitch: A tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads},
+  author={Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca},
+  journal={IEEE Transactions on Computers},
+  year={2020},
+  publisher={IEEE}
+}
+```
+
+</p>
+</details>
+
+<details>
+<summary><b>Stream semantic registers: A lightweight risc-v isa extension achieving full compute utilization in single-issue cores</b></summary>
+<p>
+
+```
+@article{schuiki2020stream,
+  title={Stream semantic registers: A lightweight risc-v isa extension achieving full compute utilization in single-issue cores},
+  author={Schuiki, Fabian and Zaruba, Florian and Hoefler, Torsten and Benini, Luca},
+  journal={IEEE Transactions on Computers},
+  volume={70},
+  number={2},
+  pages={212--227},
+  year={2020},
+  publisher={IEEE}
+}
+```
+
+</p>
+</details>
+
+<details>
+<summary><b>Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra</b></summary>
+<p>
+
+```
+@inproceedings{scheffler2021indirect,
+  author={Scheffler, Paul and Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca},
+  booktitle={2021 Design, Automation & Test in Europe Conference & Exhibition (DATE)},
+  title={Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra},
+  year={2021},
+  volume={},
+  number={},
+  pages={1787-1792}
+}
+```
+
+</p>
+</details>
+
+<details>
+<summary><b>MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores</b></summary>
+<p>
+
+```
+@inproceedings{bertaccini2022minifloat,
+  author={Bertaccini, Luca and Paulin, Gianna and Fischer, Tim and Mach, Stefan and Benini, Luca},
+  booktitle={2022 IEEE 29th Symposium on Computer Arithmetic (ARITH)},
+  title={MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores},
+  year={2022},
+  volume={},
+  number={},
+  pages={1-8}
+}
+```
+
+</p>
+</details>
+
+<details>
+<summary><b>Soft Tiles: Capturing Physical Implementation Flexibility for Tightly-Coupled Parallel Processing Clusters</b></summary>
+<p>
+
+```
+@inproceedings{paulin2022softtiles,
+  author={Paulin, Gianna and Cavalcante, Matheus and Scheffler, Paul and Bertaccini, Luca and Zhang, Yichao and Gürkaynak, Frank and Benini, Luca},
+  booktitle={2022 IEEE Computer Society Annual Symposium on VLSI (ISVLSI)},
+  title={Soft Tiles: Capturing Physical Implementation Flexibility for Tightly-Coupled Parallel Processing Clusters},
+  year={2022},
+  volume={},
+  number={},
+  pages={44-49},
+  doi={10.1109/ISVLSI54635.2022.00021}
+}
+```
+
+</p>
+</details>
diff --git a/docs/doxybook2.json b/docs/doxybook2.json index e70ed0f02..28d69396e 100644 --- a/docs/doxybook2.json +++ b/docs/doxybook2.json @@ -1,5 +1,5 @@ { - "baseUrl": "/snitch/runtime/", + "baseUrl": "/snitch_cluster/runtime/", "indexInFolders": true, "linkSuffix": "/", "indexClassesName": "index", diff --git a/docs/index.md b/docs/index.md index 081b20f8d..3f074bbc9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,4 +1,4 @@ -# Snitch System Generator +# Snitch The Snitch project is an open-source RISC-V hardware research project of ETH Zurich and University of Bologna targeting highest possible energy-efficiency. The system is designed around a versatile and small integer core, which we call Snitch. The system is ought to be highly parameterizable and suitable for many use-cases, ranging from small, control-only cores, to large many-core system made for pure number crunching in the HPC domain. @@ -8,11 +8,13 @@ See our dedicated [getting started guide](ug/getting_started.md). ## Documentation -The documentation is built from the latest master and hosted at github pages: [https://pulp-platform.github.io/snitch](https://pulp-platform.github.io/snitch). +The documentation is built from the latest master and hosted at github pages: [https://pulp-platform.github.io/snitch_cluster](https://pulp-platform.github.io/snitch_cluster). ## About this Repository -This repository is developed as a monorepo, external dependencies are "vendored-in" and checked in. Keeping it a monolithic repository helps to keep the hardware dependencies under control and enables precise snapshotting (invaluable when you are taping-out chips). +The original repository [https://github.com/pulp-platform/snitch](https://github.com/pulp-platform/snitch) was developed as a monorepo where external dependencies are "vendored-in" and checked in. For easier integration into heterogeneous systems with other PULP Platform IPs, the original repo was archived. This new repository [https://github.com/pulp-platform/snitch_cluster](https://github.com/pulp-platform/snitch_cluster) handles depenencies with [Bender](https://github.com/pulp-platform/bender) and has a couple of repositories as submodules. +The Occamy System part of the original repository is being moved to its own repository [https://github.com/pulp-platform/occamy](https://github.com/pulp-platform/occamy). + ## Licensing diff --git a/docs/publications.md b/docs/publications.md index 04e4a455b..dd0ebd23f 100644 --- a/docs/publications.md +++ b/docs/publications.md @@ -1,15 +1,97 @@ # Publications -The Snitch architecture is built on research that is described in the following publications. +If you use the Snitch cluster or its extensions in your work, you can cite us: -## 2021 +
+<details>
+<summary><b>Snitch: A tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads</b></summary>
+<p>
+
-F. Zaruba, F. Schuiki, T. Hoefler and L. Benini, "Snitch: A Tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads," in IEEE Transactions on Computers, vol. 70, no. 11, pp. 1845-1860, 1 Nov. 2021, [doi: 10.1109/TC.2020.3027900](http://www.doi.org/10.1109/TC.2020.3027900).
+```
+@article{zaruba2020snitch,
+  title={Snitch: A tiny Pseudo Dual-Issue Processor for Area and Energy Efficient Execution of Floating-Point Intensive Workloads},
+  author={Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca},
+  journal={IEEE Transactions on Computers},
+  year={2020},
+  publisher={IEEE}
+}
+```
-F. Schuiki, F. Zaruba, T. Hoefler and L. Benini, "Stream Semantic Registers: A Lightweight RISC-V ISA Extension Achieving Full Compute Utilization in Single-Issue Cores," in IEEE Transactions on Computers, vol. 70, no. 2, pp. 212-227, 1 Feb. 2021, [doi: 10.1109/TC.2020.2987314](http://www.doi.org/10.1109/TC.2020.2987314).
+
+</p>
+</details>
+
-S. Riedel, F. Schuiki, P. Scheffler, F. Zaruba and L. Benini, "Banshee: A Fast LLVM-Based RISC-V Binary Translator," 2021 IEEE/ACM International Conference On Computer Aided Design (ICCAD), 2021, pp. 1-9, [doi: 10.1109/ICCAD51958.2021.9643546](http://www.doi.org/10.1109/ICCAD51958.2021.9643546).
+<details>
+<summary><b>Stream semantic registers: A lightweight risc-v isa extension achieving full compute utilization in single-issue cores</b></summary>
+<p>
+
-F. Zaruba, F. Schuiki and L. Benini, "Manticore: A 4096-Core RISC-V Chiplet Architecture for Ultraefficient Floating-Point Computing," in IEEE Micro, vol. 41, no. 2, pp. 36-42, 1 March-April 2021, [doi: 10.1109/MM.2020.3045564](http://www.doi.org/10.1109/MM.2020.3045564).
+```
+@article{schuiki2020stream,
+  title={Stream semantic registers: A lightweight risc-v isa extension achieving full compute utilization in single-issue cores},
+  author={Schuiki, Fabian and Zaruba, Florian and Hoefler, Torsten and Benini, Luca},
+  journal={IEEE Transactions on Computers},
+  volume={70},
+  number={2},
+  pages={212--227},
+  year={2020},
+  publisher={IEEE}
+}
+```
-P. Scheffler, F. Zaruba, F. Schuiki, T. Hoefler and L. Benini, "Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra," 2021 Design, Automation & Test in Europe Conference & Exhibition (DATE), 2021, pp. 1787-1792, [doi: 10.23919/DATE51398.2021.9474230](http://www.doi.org/10.23919/DATE51398.2021.9474230).
+
+</p>
+</details>
+
+<details>
+<summary><b>Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra</b></summary>
+<p>
+
+```
+@inproceedings{scheffler2021indirect,
+  author={Scheffler, Paul and Zaruba, Florian and Schuiki, Fabian and Hoefler, Torsten and Benini, Luca},
+  booktitle={2021 Design, Automation & Test in Europe Conference & Exhibition (DATE)},
+  title={Indirection Stream Semantic Register Architecture for Efficient Sparse-Dense Linear Algebra},
+  year={2021},
+  volume={},
+  number={},
+  pages={1787-1792}
+}
+```
+
+</p>
+</details>
+
+<details>
+<summary><b>MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores</b></summary>
+<p>
+
+```
+@inproceedings{bertaccini2022minifloat,
+  author={Bertaccini, Luca and Paulin, Gianna and Fischer, Tim and Mach, Stefan and Benini, Luca},
+  booktitle={2022 IEEE 29th Symposium on Computer Arithmetic (ARITH)},
+  title={MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores},
+  year={2022},
+  volume={},
+  number={},
+  pages={1-8}
+}
+```
+
+</p>
+</details>
+
+<details>
+<summary><b>Soft Tiles: Capturing Physical Implementation Flexibility for Tightly-Coupled Parallel Processing Clusters</b></summary>
+<p>
+
+```
+@inproceedings{paulin2022softtiles,
+  author={Paulin, Gianna and Cavalcante, Matheus and Scheffler, Paul and Bertaccini, Luca and Zhang, Yichao and Gürkaynak, Frank and Benini, Luca},
+  booktitle={2022 IEEE Computer Society Annual Symposium on VLSI (ISVLSI)},
+  title={Soft Tiles: Capturing Physical Implementation Flexibility for Tightly-Coupled Parallel Processing Clusters},
+  year={2022},
+  volume={},
+  number={},
+  pages={44-49},
+  doi={10.1109/ISVLSI54635.2022.00021}
+}
+```
+
+</p>
+</details>
diff --git a/docs/requirements.txt b/docs/requirements.txt index 2877d2c49..6a766858d 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,3 +6,5 @@ mkdocs # Last version compatible with python-3.6 (default on Ubuntu 18.04) mkdocs-material <= 8.2.11 +mkdocs-include-markdown-plugin +mkdocs-macros-plugin \ No newline at end of file diff --git a/docs/rm/reqrsp_interface.md b/docs/rm/reqrsp_interface.md new file mode 120000 index 000000000..f7e23bc4e --- /dev/null +++ b/docs/rm/reqrsp_interface.md @@ -0,0 +1 @@ +../../hw/reqrsp_interface/doc/index.md \ No newline at end of file diff --git a/docs/rm/snitch.md b/docs/rm/snitch.md new file mode 120000 index 000000000..e36c4db10 --- /dev/null +++ b/docs/rm/snitch.md @@ -0,0 +1 @@ +../../hw/snitch/doc/index.md \ No newline at end of file diff --git a/docs/rm/snitch_cluster.md b/docs/rm/snitch_cluster.md new file mode 120000 index 000000000..61c4a3593 --- /dev/null +++ b/docs/rm/snitch_cluster.md @@ -0,0 +1 @@ +../../hw/snitch_cluster/doc/index.md \ No newline at end of file diff --git a/docs/schema/snitch_cluster.schema.json b/docs/schema/snitch_cluster.schema.json index 85a5d051b..6e8c1181a 100644 --- a/docs/schema/snitch_cluster.schema.json +++ b/docs/schema/snitch_cluster.schema.json @@ -1,6 +1,6 @@ { "$schema": "http://json-schema.org/draft-07/schema#", - "$id": "http://pulp-platform.org/snitch/snitch_cluster.schema.json", + "$id": "http://pulp-platform.org/snitch_cluster/snitch_cluster.schema.json", "title": "Snitch Cluster Schema", "description": "Base description of a Snitch cluster and its internal structure and configuration.", "type": "object", diff --git a/docs/schema/snitch_cluster_tb.schema.json b/docs/schema/snitch_cluster_tb.schema.json index 11e493951..4fe0cf435 100644 --- a/docs/schema/snitch_cluster_tb.schema.json +++ b/docs/schema/snitch_cluster_tb.schema.json @@ -1,6 +1,6 @@ { "$schema": "http://json-schema.org/draft-07/schema#", - "$id": "http://pulp-platform.org/snitch/snitch_cluster_tb.schema.json", + "$id": "http://pulp-platform.org/snitch_cluster/snitch_cluster_tb.schema.json", "title": "Snitch Cluster TB Schema", "description": "Description for a very simple single-cluster testbench. That is the most minimal system available. Most of the hardware is emulated by the testbench.", "type": "object", @@ -9,7 +9,7 @@ ], "properties": { "cluster": { - "$ref": "http://pulp-platform.org/snitch/snitch_cluster.schema.json" + "$ref": "http://pulp-platform.org/snitch_cluster/snitch_cluster.schema.json" }, "dram": { "title": "DRAM", diff --git a/docs/ug/directory_structure.md b/docs/ug/directory_structure.md index 66235a759..ea1fba0c1 100644 --- a/docs/ug/directory_structure.md +++ b/docs/ug/directory_structure.md @@ -5,65 +5,41 @@ are co-located. The top-level ist structured as follows: * `docs`: [Documentation](documentation.md) of the generator and software. Contains additional user guides. -* `hw`: All hardware components. +* `hw`: All hardware IP components. * `sw`: Hardware independent software, libraries, runtimes etc. +* `target`: Contains the testbench setup, cluster configuration specific hardware and software, libraries, runtimes etc. * `util`: Utility and helper scripts. -## Hardware - -* `ip`: Blocks which are instantiated in the design e.g., they are not - stand-alone. - * `src`: RTL sources - * `test`: Test-benches -* `vendor`: "Third-party" components which are updated using the vendor script. - They are not (primarily) developed as part of this repository. 
-* `system`: Specific systems built around Snitch components. - * `snitch-cluster`: Single cluster with a minimal environment to run - meaningful applications. - * `occamy`: Multi-cluster system with an environment to run applications. - -## Software - -* `vendor`: Software components which come with their own license requirements - from third parties. - -# Vendored Source Directories - -This repo is organized in a monolithic fashion, i.e., all resources are checked -in, we do not use git submodules or other ways of obtaining (HW) source files. -But not all IPs are developed with this repository. We rely on the `vendor` tool -to copy data from other repositories into the tree. We keep separate patches if -changes are necessary. Ideally, patches should be upstreamed to the originating -repository once things stabilize. - -## Creating Patches - -If you need to make changes to one of the IPs in the `hw/vendor` subdirectory -you need to obtain a set of patches which should be applied. CI will check -whether there are any changes without patches. Upon obtaining the sources the -vendor tool can automatically apply the patches for you. - -To create patches you first need to commit the changes. Then, in the current -directory create a set of patches (it will create a file for each commit) for -the commit (range) you are interested: - -``` -git format-patch --relative -o HEAD^1 -``` - -In the vendor file specify the path to the patches: - -``` -patch_dir: "" -``` - -## Updating Sources - -The vendor tool supports updating the sources. If you are in a clean directory -with no changes (you can `git stash` to achieve this), the vendor tool can -automatically commit the updates (`--commit`). For the `common_cells` for -example: - -``` -./util/vendor.py hw/vendor/pulp_platform_common_cells.vendor.hjson --update --commit -``` +## Hardware `hw` Directory + +{% + include-markdown '../../hw/README.md' + start="# Snitch Hardware" + comments=false +%} + +## Software `sw` Directory + +This subdirectory contains the various bits and pieces of software for the Snitch ecosystem. + + +{% + include-markdown '../../sw/README.md' + start="## Contents" + comments=false +%} + + +## Target `target` Directory + +{% + include-markdown '../../target/README.md' + start="# HW Targets" + comments=false +%} diff --git a/docs/ug/docker.md b/docs/ug/docker.md deleted file mode 120000 index 88d2ee5b5..000000000 --- a/docs/ug/docker.md +++ /dev/null @@ -1 +0,0 @@ -../../util/container/README.md \ No newline at end of file diff --git a/docs/ug/documentation.md b/docs/ug/documentation.md index e792efd94..b1179e70e 100644 --- a/docs/ug/documentation.md +++ b/docs/ug/documentation.md @@ -1,25 +1,31 @@ # Documentation Documentation of the generator and related infrastructure is hosted under -`docs`. Static `html` documentation is build from the latest `master` branch by +`docs`. Static `html` documentation is build from the latest `main` branch by the CI. We use [mkdocs](https://www.mkdocs.org/) together with the [material theme](https://squidfunk.github.io/mkdocs-material/). Before building the documentation, make sure you have the required dependencies installed: -```bash +```shell pip install -r docs/requirements.txt ``` After everything is installed, you can build and serve a local copy by executing (in the root directory): -```bash +```shell mkdocs serve ``` This opens a local webserver listening on [http://127.0.0.1:8000/](http://127.0.0.1:8000/). 
+Alternatively, you can build a static copy of the `html` documentation: + +```shell +mkdocs build +``` + ## Organization The `docs` folder is organized as follows: diff --git a/docs/ug/getting_started.md b/docs/ug/getting_started.md index 3383243b9..de2487ae5 100644 --- a/docs/ug/getting_started.md +++ b/docs/ug/getting_started.md @@ -1,106 +1,76 @@ # Getting Started -## Quick Start - -This will take you through the necessary steps to get a sample program running on a cluster of Snitch cores. - -1. Clone the repository. - ``` - git clone https://github.com/pulp-platform/snitch.git --recurse-submodules - ``` -2. Start the Docker container containing all necessary development tools. If you - do not want (or can not) use Docker please see the - [prerequisites](#prerequisites) sections on how to obtain all required tools. - ``` - docker run -it -v `pwd`/snitch:/repo -w /repo ghcr.io/pulp-platform/snitch - ``` -3. To simulate a cluster of Snitch cores go to `hw/system/snitch_cluster` and build the Verilator model for the Snitch cluster. - ``` - cd hw/system/snitch_cluster - make bin/snitch_cluster.vlt - ``` -4. Build the software. - ``` - mkdir sw/build - cd sw/build - cmake .. - make - ``` -5. Run a sample application on the Verilator model. - ``` - ./bin/snitch_cluster.vlt sw/build/benchmark/benchmark-matmul-all - ``` -6. Generate the annotated traces and inspect the trace for core 0. - ``` - make traces - less trace_hart_00000000.txt - ``` - Optionally you can inspect the dumped waveforms (`snitch_cluster.vcd`). - `spike-dasm` is required to generate the traces. Using the source from this repository supports disassembly of Snitch-custom instructions: - ``` - cd sw/vendor/riscv-isa-sim - mkdir build; cd build - ../configure; make spike-dasm - ``` -7. Visualize the traces with the `util/trace/tracevis.py` script. - ``` - ./util/trace/tracevis.py -o trace.json sw/build/benchmark/benchmark-matmul-all hw/system/snitch_cluster/logs/trace_hart_*.txt - ``` - The generated JSON file can be visualized with [Trace-Viewer](https://github.com/catapult-project/catapult/tree/master/tracing), or by loading it into Chrome's `about:tracing`. You can check out an example trace [here](../example_trace.html). -8. Annotate the traces with the `util/trace/annotate.py` script. - ``` - ./util/trace/annotate.py -o annotated.s sw/build/benchmark/benchmark-matmul-all hw/system/snitch_cluster/logs/trace_hart_00001.txt - ``` - The generated `annotated.s` interleaves source code with retired instructions. - -## Prerequisites - -We recommend using the Docker container. If that should not be possible (because -of missing privileges for example) you can install the required tools and -components yourself. - -We recommend a reasonable new Linux distribution, for example, Ubuntu 18.04: - -- Install essential packages: - ``` - sudo apt-get install build-essential python3 python3-pip python3-setuptools python3-wheel - ``` -- Install the Python requirements using: - ``` - pip3 install --user -r python-requirements.txt - ``` -- We are using `Bender` for file list generation. The easiest way to obtain `Bender` is through its binary release channel: - ``` - curl --proto '=https' --tlsv1.2 https://pulp-platform.github.io/bender/init -sSf | sh - ``` -- Finally, get a RISC-V toolchain. We recommend obtaining binary releases for your operating system from [SiFive's SW site](https://www.sifive.com/software). - - Unpack the toolchain to a location of your choice (assuming `$RISCV` here). 
For example for Ubuntu you do: - ``` - mkdir -p $RISCV && tar -x -f riscv64-unknown-elf-gcc-8.3.0-2020.04.0-x86_64-linux-ubuntu14.tar.gz --strip-components=1 -C $RISCV - ``` - - Add the `$RISCV/bin` folder to your path variable. - ``` - export PATH=$RISCV/bin:$PATH - ``` - - The downloaded toolchain is a multi-lib toolchain, nevertheless our SW scripts currently expect binaries named `riscv32-*`. You can just alias `riscv64-*` to `riscv32-*` using: - ``` - cd $RISCV/bin && for file in riscv64-*; do ln -s $file $(echo "$file" | sed 's/^riscv64/riscv32/g'); done - ``` - -An alternative way, if you have Rust installed, is `cargo install bender`. - -### Tool Requirements - -- `bender >= 0.21` -- `verilator >= 4.100` - -### Software Development - -- The `banshee` simulator is built using Rust. We recommend [`rustup`](https://rustup.rs/) if you haven't installed Rust already. -- C/C++ code is formatted using `clang-format`. - -### Hardware Development - -- We use `verible` for style linting. Either build it from [source](https://github.com/google/verible) or, if available for your platform, use one of the [pre-built images](https://github.com/google/verible/releases). -- We support simulation with Verilator, VCS and Modelsim. +## Installation +Clone the repository: +```shell +git clone https://github.com/pulp-platform/snitch_cluster.git --recurse-submodules +``` + +If you had already cloned the repository without the `--recurse-submodules` flag, clone its submodules: +```shell +git submodule init --recursive +``` + +## Tools and environment + +This repository requires several tools to be installed on your machine. Some of these tools require non-free licenses. However, most of the functionality in this repository can be reproduced with free tools alone. + +Note that installing all tools, in appropriate versions, may be non-trivial. For this purpose, we provide a Docker container with all free tools installed. + +The [following section](#docker-container) provides instructions to install the Docker container. + +Users with access to ETH Zurich IIS machines can find all tools already installed on these machines. To complete the setup, skip to the [IIS environment setup](#iis-environment-setup) section. + +If you do choose to setup a custom development environment on your own machine, we strongly recommend you take example from our [Docker file](https://github.com/pulp-platform/snitch_cluster/blob/{{ branch }}/util/container/README.md). + +## Docker container + +The following instructions are extracted from the Docker container [README.md](https://github.com/pulp-platform/snitch_cluster/blob/{{ branch }}/util/container/README.md). For additional information on the Docker container refer to that document. + +### Installation + +{% + include-markdown '../../util/container/README.md' + start="## Installation" + end="## Usage" + comments=false + heading-offset=1 +%} + +## IIS environment setup + +To make sure the right versions of each tool are picked up, set the following environment variables, e.g. in a bash shell: + +```bash +export PYTHON="/usr/local/anaconda3-2022.05/bin/python3" +export BENDER="bender-0.27.1" +export CC="gcc-9.2.0" +export CXX="g++-9.2.0" +export LLVM_BINROOT="/usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin" +# As a temporary workaround (until correct tool versions are installed system-wide): +export PATH=/home/colluca/snitch/bin:$PATH +export PATH=/usr/scratch/dachstein/colluca/opt/verible/bin:$PATH +``` + +Add these commands to your shell startup file (e.g. 
`~/.bashrc` if you use bash as the default shell) to ensure that the environment is set up correctly every time you open a new shell. + +Create a Python virtual environment: + +```shell +$PYTHON -m venv ~/.venvs/snitch_cluster +``` + +Activate your environment, e.g. in a bash shell: + +```bash +source ~/.venvs/snitch_cluster/bin/activate +``` + +You may want to add the last command to your shell startup file to ensure that the virtual environment is activated on every new shell you open. + +Install the required packages in the currently active virtual environment: + +```shell +pip install -r python-requirements.txt +``` diff --git a/docs/ug/setup-iis.md b/docs/ug/setup-iis.md deleted file mode 100644 index 34482f777..000000000 --- a/docs/ug/setup-iis.md +++ /dev/null @@ -1,297 +0,0 @@ -# Getting Started at IIS -Below you can find the flow to run - -First, be aware of the shell which you are using. -- We recommend using bash: - ```bash - bash - ``` - -## Scratch folder -Due to the limited size of your home directory, we recomment working in your scratch. You can select between `/scratch`, `/scratch2`, `/scratch3`. - -- Create yourself a scratch folder to work in it: - ```bash - # get your machine name - export MACHINE=$(hostname | cut -d . -f 1) - # Look how much free space there is in the scratch folders - df -h | grep scratch - # Pick one and create your folder in there, example : - mkdir -p /scratch/${USER} - # Note, contrary to your home folder, the scratch folder is local to your machine, but you can access it on any other machine over the network as follows: - cd /usr/scratch/${MACHINE}/${USER} - ``` - -## Installation -At IIS the default version of some tools (`gcc`, `cmake`, ...) might be too old for certain projects. - -- Create a install directory to install the needed tools: - ```bash - export INSTALL_DIR=/usr/scratch/${MACHINE}/${USER}/install-snitch - mkdir $INSTALL_DIR - cd $INSTALL_DIR - ``` - -- Use the pre-installed LLVM toolchain by adding the following to your path: - ```bash - export PATH=/usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/:$PATH - ``` - - or download the latest toolchain (andd add the location to your path): - ```bash - mkdir -p riscv-llvm - export LATEST_TAG=`curl -s -H "Accept: application/vnd.github.v3+json" https://api.github.com/repos/pulp-platform/llvm-project/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/'` - wget -qO- https://github.com/pulp-platform/llvm-project/releases/download/${LATEST_TAG}/riscv32-pulp-llvm-centos7-${LATEST_TAG}.tar.gz | tar xvz --strip-components=1 -C riscv-llvm - # go back to installation directory - cd ${INSTALL_DIR} - # add location to path - export PATH=${INSTALL_DIR}/riscv-llvm/bin/:${PATH} - # unset temporary variables - unset LATEST_TAG - ``` - -- Install the correct python version: - ```bash - export PYTHON_VERSION=3.9.10 - mkdir -p python-${PYTHON_VERSION} - # download into temporary directory - mkdir tmp - cd tmp - curl https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz | tar -xz --strip-components=1 - # install into location /usr/scratch/${MACHINE}/${USER}/python-${PYTHON_VERSION} - ./configure --prefix=${INSTALL_DIR}/python-${PYTHON_VERSION} --enable-ipv6 - make -j$(nproc) - make install - # go back to installation directory - cd ${INSTALL_DIR} - # delete temporary installation directory - rm -rf tmp - # add location to path - export PATH=${PWD}/python-${PYTHON_VERSION}/bin/:$PATH - # unset temporary variables - unset PYTHON_VERSION - ``` - -- Install the correct 
verilator version: - ```bash - export VERILATOR_VERSION=4.100 - mkdir tmp - wget -qO- https://github.com/verilator/verilator/archive/refs/tags/v${VERILATOR_VERSION}.tar.gz | tar xvz --strip-components=1 -C tmp - mkdir -p verilator-${VERILATOR_VERSION} - cd tmp - autoconf - unset VERILATOR_ROOT - ./configure --prefix=${INSTALL_DIR}/verilator-${VERILATOR_VERSION} - make -j$(nproc) - make install - # unset temporary variables - unset VERILATOR_VERSION - # go back to installation directory - cd ${INSTALL_DIR} - # create symbolic link - export PATH="${INSTALL_DIR}/verilator-${VERILATOR_VERSION}/bin/:$PATH" - # export INCLUDE_PATH="${INSTALL_DIR}/verilator-${VERILATOR_VERSION}/include/:$INCLUDE_PATH" - # export INCLUDE_PATH="${INSTALL_DIR}/verilator-${VERILATOR_VERSION}/include/vltstd/:$INCLUDE_PATH" - ``` - -- Install the correct `verible-verilog-lint` tool version: - ```bash - export VERIBLE_VERSION=0.0-3222-gb19cdf44 - mkdir -p verible-${VERIBLE_VERSION} - cd verible-${VERIBLE_VERSION} - curl -Ls -o verible.tar.gz https://github.com/google/verible/releases/download/v$VERIBLE_VERSION/verible-v$VERIBLE_VERSION-CentOS-7.9.2009-Core-x86_64.tar.gz - chmod 777 ${INSTALL_DIR}/verible-${VERIBLE_VERSION} - tar -C ${INSTALL_DIR}/verible-${VERIBLE_VERSION} -xf verible.tar.gz --strip-components=1 - # add location to path - export PATH="${INSTALL_DIR}/verible-${VERIBLE_VERSION}/bin/:$PATH" - # unset temporary variables - unset VERIBLE_VERSION - ``` - -For installing the last missing pieces you need to clone the repository. - -- Clone the repository: - ```bash - cd /usr/scratch/${MACHINE}/${USER} - git clone git@github.com:pulp-platform/snitch.git - ``` -- Create virtual environment and install the `python-requirements.txt`: - ```bash - # create virtual environment with correct and newly installed python version - python3.9 -m venv ~/.venvs/snitch - # activate the virtual environment - source ~/.venvs/snitch/bin/activate - # enter the cloned snitch directory - cd snitch - # install python requirements - pip install -r python-requirements.txt - ``` - -- Create a location for all you binaries in your home directory and create it to your path: - ```bash - mkdir -p /home/${USER}/.snitch-bin - # Add the created binary location to your path - export PATH=/home/${USER}/.snitch-bin:$PATH - ``` - -- Install the correct `spike-dasm` and create a symbolic link to your binary location `/home/${USER}/.snitch-bin`: - ```bash - cd sw/vendor/riscv-isa-sim - mkdir build - cd build - ../configure - make spike-dasm - # create symbolic link - ln -s /usr/scratch/${MACHINE}/${USER}/snitch/sw/vendor/riscv-isa-sim/build/spike-dasm /home/${USER}/.snitch-bin/spike-dasm - ``` - -- Use a newer `cmake` versions: - ```bash - # make sure you are in /home/${USER}/.snitch-bin - cd /home/${USER}/.snitch-bin - ln -s /usr/sepp/bin/cmake-3.18.1 cmake - ``` - - -### Tool Specific Versions -Unfortunately, depending on which RTL simulator you are using, you need to use a different GCC version. Therefore, you have to set the following variables **in addition** to the above commands. 
- -Let's go to the system `snitch_cluster`: - -```bash -cd /usr/scratch/${MACHINE}/${USER}/snitch -cd hw/system/snitch_cluster -``` - -#### Questasim - -First, let's prepare the environment to use Questasim and let's run some tests: - -```bash -# Use Questasim's older GCC version for correct DPI compilation -export QUESTA_HOME=/usr/pack/modelsim-10.7b-kgf/questasim/ -export CC=$QUESTA_HOME/gcc-5.3.0-linux_x86_64/bin/gcc -export CXX=$QUESTA_HOME/gcc-5.3.0-linux_x86_64/bin/g++ -export LD=$QUESTA_HOME/gcc-5.3.0-linux_x86_64/bin/ld - -# compile HW for Questasim -make bin/snitch_cluster.vsim - -# build and run all snRuntime SW tests on Questasim -make sw.test.vsim - -# undo the variables if you change simulator -unset QUESTA_HOME -unset CC -unset CXX -unset LD -``` - -#### VCS - -Next, let's test prepare the environment for VCS and let's run some tests: - -```bash -# set GCC and G++ to version 9.2 -export GCC_DIR="/usr/pack/gcc-9.2.0-af" -export GCC_DIR2="${GCC_DIR}/linux-x64" -# use correct CC and CXX -export CC="${GCC_DIR2}/bin/gcc" -export CXX="${GCC_DIR2}/bin/g++" -# set correct libraries -export LD_LIBRARY_PATH="${GCC_DIR2}/lib64" -export LIBRARY_PATH="${GCC_DIR2}/lib64" -# set correct include paths -export C_INCLUDE_PATH="${GCC_DIR}/include" -export CPLUS_INCLUDE_PATH="${GCC_DIR}/include" -# set correct PATH -export PATH="${GCC_DIR2}/linux-x64/bin:${PATH}" - -# compile HW for VCS with correct VCS version prefix -vcs-2020.12 make bin/snitch_cluster.vcs - -# build and run all snRuntime SW tests on VCS -vcs-2020.12 make sw.test.vcs - -# undo the variables if you change simulator -unset CC -unset CXX -``` - - -### Verilator - -Verilator uses the same GCC compiler as VCS: - -```bash -# compile HW for Verilator -make bin/snitch_cluster.vlt - -# build and run all snRuntime SW tests on Verilator -make sw.test.vlt -``` - -## Summary - -Next time you start with a fresh terminal, you can execute the following commands to use the correct tools: - -```bash -bash -# set all required variables -export MACHINE=$(hostname | cut -d . 
-f 1) -export INSTALL_DIR=/usr/scratch/${MACHINE}/${USER}/install-snitch -export PYTHON_VERSION=3.9.10 -export VERILATOR_VERSION=4.100 -export VERIBLE_VERSION=0.0-3222-gb19cdf44 - -# LLVM -# pre-installed -export PATH=/usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/:$PATH -# or manually installad -#export PATH=${INSTALL_DIR}/riscv-llvm/bin/:${PATH} -# Correct Python version -export PATH=${INSTALL_DIR}/python-${PYTHON_VERSION}/bin/:$PATH -# Activate the virtual python environment -source ~/.venvs/snitch/bin/activate -# Correct Verilator version -export PATH="${INSTALL_DIR}/verilator-${VERILATOR_VERSION}/bin/:$PATH" -# Correct Verible version -export PATH="${INSTALL_DIR}/verible-${VERIBLE_VERSION}/bin/:$PATH" -# Use correct `cmake` and `spike-dasm` version -export PATH=/home/${USER}/.snitch-bin:$PATH - -# unset all temporary variables -unset MACHINE -unset INSTALL_DIR -unset PYTHON_VERSION -unset VERILATOR_VERSION -unset VERIBLE_VERSION -``` - -If you use **Questasim**, set the following variables: - -```bash -export QUESTA_HOME=/usr/pack/modelsim-10.7b-kgf/questasim/ -export CC=$QUESTA_HOME/gcc-5.3.0-linux_x86_64/bin/gcc -export CXX=$QUESTA_HOME/gcc-5.3.0-linux_x86_64/bin/g++ -export LD=$QUESTA_HOME/gcc-5.3.0-linux_x86_64/bin/ld -``` - -If you use **VCS** or **Verilator**, set the following variables: - -```bash -# set GCC and G++ to version 9.2 -export GCC_DIR="/usr/pack/gcc-9.2.0-af" -export GCC_DIR2="${GCC_DIR}/linux-x64" -# use correct CC and CXX -export CC="${GCC_DIR2}/bin/gcc" -export CXX="${GCC_DIR2}/bin/g++" -# set correct libraries -export LD_LIBRARY_PATH="${GCC_DIR2}/lib64" -export LIBRARY_PATH="${GCC_DIR2}/lib64" -# set correct include paths -export C_INCLUDE_PATH="${GCC_DIR}/include" -export CPLUS_INCLUDE_PATH="${GCC_DIR}/include" -# set correct PATH -export PATH="${GCC_DIR2}/linux-x64/bin:${PATH}" -``` diff --git a/docs/ug/snitch_cluster.md b/docs/ug/snitch_cluster.md deleted file mode 100644 index c9379904d..000000000 --- a/docs/ug/snitch_cluster.md +++ /dev/null @@ -1,110 +0,0 @@ -# Snitch Cluster System - -The Snitch cluster system (`hw/system/snitch_cluster`) is a fundamental system -around a Snitch core. The cluster can be configured using a config file. - -The configuration parameters are documented using JSON schema, and documentation -is generated for the schema. The configuration options can be found [here](../../../schema-doc/snitch_cluster/). - -The cluster testbench simulates an infinite memory. The RISC-V ELF file is -preloaded using RISC-V's Front-end Server (`fesvr`). - -## Getting Started - -In `hw/system/snicht_cluster`: - -- Build the software: - ``` - mkdir sw/build - cd sw/build - cmake .. 
- make - ``` -- Compile the model for your simulator: - - === "Verilator" - - ``` - make bin/snitch_cluster.vlt - ``` - - === "Questasim" - - ``` - make bin/snitch_cluster.vsim - ``` - - === "VCS" - - ``` - make bin/snitch_cluster.vcs - ``` - -- Run a binary on the simulator: - - === "Verilator" - - ``` - bin/snitch_cluster.vlt path/to/riscv/binary - ``` - - === "Questasim" - - ``` - # Headless - bin/snitch_cluster.vsim path/to/riscv/binary - # GUI - bin/snitch_cluster.vsim.gui path/to/riscv/binary - ``` - - === "VCS" - - ``` - bin/snitch_cluster.vcs path/to/riscv/binary - ``` - -- Build the traces in .logs/trace_hart_.txt with the help of spike-dasm: - ``` - make traces - ``` - -- Annotate the traces in .logs/trace_hart_.s with the source code related with the retired instructions: - ``` - make annotate - ``` - -- Get an overview of all Makefile targets: - ``` - make help - ``` - -## Configure the Cluster - -To configure the cluster with a different configuration, either edit the -configuration files in the `cfg` folder or create a new configuration file and -pass it to the Makefile: - -``` -make bin/snitch_cluster.vlt CFG=cfg/single-core.hjson -``` - -The default config is in `cfg/cluster.default.hjson`. Alternatively, you can also -set your `CFG` environment variable, the Makefile will pick it up and override -the standard config. - - -## Using Verilator with LLVM - -LLVM+clang can be used to build the Verilator model. Optionally specify a path -to the LLVM toolchain in `CLANG_PATH` and set `VLT_USE_LLVM=ON`. -For the verilated model itself to be complied with LLVM, verilator must be built -with LLVM (`CC=clang CXX=clang++ ./configure`). The `VLT` environment variable -can then be used to point to the verilator binary. - -```bash -# Optional: Specify which llvm to use -export CLANG_PATH=/path/to/llvm-12.0.1 -# Optional: Point to a verilator binary compiled with LLVM -export VLT=/path/to/verilator-llvm/bin/verilator -make VLT_USE_LLVM=ON bin/snitch_cluster.vlt -``` diff --git a/docs/ug/tutorial.md b/docs/ug/tutorial.md new file mode 100644 index 000000000..363fa82e3 --- /dev/null +++ b/docs/ug/tutorial.md @@ -0,0 +1,28 @@ +# Tutorial + +The following tutorial will guide you through the use of the Snitch cluster. You will learn how to develop, simulate, debug and benchmark software for the Snitch cluster architecture. + + +{% + include-markdown '../../target/snitch_cluster/README.md' + comments=false + start="## Tutorial" +%} + +## Using Verilator with LLVM + +LLVM+clang can be used to build the Verilator model. Optionally specify a path +to the LLVM toolchain in `CLANG_PATH` and set `VLT_USE_LLVM=ON`. +For the verilated model itself to be complied with LLVM, verilator must be built +with LLVM (`CC=clang CXX=clang++ ./configure`). The `VLT` environment variable +can then be used to point to the verilator binary. + +```bash +# Optional: Specify which llvm to use +export CLANG_PATH=/path/to/llvm-12.0.1 +# Optional: Point to a verilator binary compiled with LLVM +export VLT=/path/to/verilator-llvm/bin/verilator +make VLT_USE_LLVM=ON bin/snitch_cluster.vlt +``` diff --git a/hw/README.md b/hw/README.md new file mode 100644 index 000000000..3dbbc0840 --- /dev/null +++ b/hw/README.md @@ -0,0 +1,10 @@ +# Snitch Hardware + +The `hw` directory contains various HW IPs which are instantiated in the Snitch Cluster design e.g., they are not stand-alone. +Some of the IPs have stand-alone test benches. 
All IPs inside the `hw` directory are structured as follows: + +- ``: each directory contains one IP that is instantiated in the cluster design, e.g., they are not stand-alone. + - `doc`: documentation if existing + - `src`: RTL sources + - `test`: Standalone testbenches if existing + - `util`: Helper scripts to run standalone test benches if existing diff --git a/mkdocs.yml b/mkdocs.yml index 9fc663b5b..0140a15f8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,15 +1,15 @@ # Copyright 2020 ETH Zurich and University of Bologna. # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 -site_name: Snitch +site_name: Snitch Cluster theme: name: material icon: repo: fontawesome/brands/github -repo_url: https://github.com/pulp-platform/snitch -repo_name: pulp-platform/snitch +repo_url: https://github.com/pulp-platform/snitch_cluster +repo_name: pulp-platform/snitch_cluster markdown_extensions: - admonition @@ -20,40 +20,36 @@ markdown_extensions: - pymdownx.emoji: emoji_index: !!python/name:materialx.emoji.twemoji emoji_generator: !!python/name:materialx.emoji.to_svg +plugins: + - include-markdown + - macros +use_directory_urls: false +extra: + branch: main nav: - Home: index.md - User Guide: - Getting Started: ug/getting_started.md - - Benchmarking: ug/benchmarking.md - - Tooling Setup at IIS: ug/setup-iis.md - - Trace Example: ug/example_trace.html - - Docker: ug/docker.md - - Directory Structure: ug/directory_structure.md + - Repository Structure: ug/directory_structure.md + - Tutorial: ug/tutorial.md - Documentation: ug/documentation.md - - Systems: - - Snitch Cluster: - - Guide: ug/snitch_cluster.md - - Schema: schema-doc/snitch_cluster.md - - Occamy: - - Guide: ug/occamy_system/occamy.md - - Architecture: - - Overview: ug/occamy_system/1_overview.md - - Address Map: ug/occamy_system/2_addrmap.md - - System Components: ug/occamy_system/3_system_components.md - - Clocking: ug/occamy_system/4_clocking.md - - Interrupts and Synchronization: 5_interrupts_and_synchronization.md - - Schema: schema-doc/occamy.md + # Remove + # - Benchmarking: ug/benchmarking.md + # - Trace Example: ug/example_trace.html - Reference Manual: - - Snitch: rm/snitch/index.md - - Snitch Cluster: rm/snitch_cluster/index.md - - Reqrsp Interface: rm/reqrsp_interface/index.md - - Custom Instructions: rm/custom_instructions.md - - Snitch Runtime: - - Pages: runtime/Pages/index.md - - Files: runtime/Files/index.md - - Classes: runtime/Classes/index.md - - Examples: runtime/Examples/index.md - - Modules: runtime/Modules/index.md - - Namespaces: runtime/Namespaces/index.md - # - Solder: rm/solder.md + - Hardware: + - Snitch: rm/snitch.md + - Snitch Cluster: + - Overview: rm/snitch_cluster.md + - Schema: schema-doc/snitch_cluster.md + - Reqrsp Interface: rm/reqrsp_interface.md + - Custom Instructions: rm/custom_instructions.md + # - Solder: rm/solder.md + - Software: + - Pages: runtime/Pages/index.md + - Files: runtime/Files/index.md + - Classes: runtime/Classes/index.md + - Examples: runtime/Examples/index.md + - Modules: runtime/Modules/index.md + - Namespaces: runtime/Namespaces/index.md - Publications: publications.md diff --git a/sw/README.md b/sw/README.md index 7a81ec774..5f1c15047 100644 --- a/sw/README.md +++ b/sw/README.md @@ -4,15 +4,11 @@ This subdirectory contains the various bits and pieces of software for the Snitc ## Contents -### Simulator - -The `banshee` directory contains an LLVM-based binary translation simulator for Snitch systems that is capable of 
specifically emulating the custom instruction set extensions. See `banshee/README.md` for more details.
-
 ### Libraries

 - `applications`: Contains applications and kernels, mostly NN-related with SW testbenches for performance profiling.
 - `cmake`: Bits and pieces for integration with the CMake build system.
-- `snRuntime`: The fundamental, bare-metal runtime for Snitch systems. Exposes a minimal API to manage execution of code across the available cores and clusters, query information about a thread's context, and to coordinate and exchange data with other threads.
+- `snRuntime`: The fundamental, bare-metal runtime for Snitch systems. Exposes a minimal API to manage execution of code across the available cores and clusters, query information about a thread's context, and to coordinate and exchange data with other threads. Hardware-configuration-dependent implementations of the `snRuntime` can be found, e.g., under `target/snitch_cluster/sw/snRuntime`.
 - `snBLAS`: A minimal reference implementation of the basic linear algebra subprograms that demonstrates the use of Snitch and its extensions.

 ### Tests
@@ -25,3 +21,4 @@ The `deps` directory contains third-party tools that we inline into this repository for ease of use.

 - `deps/riscv-opcodes`: Utilities to manage instruction encodings and generate functions and data structures for parsing and representation in various languages.
+- `deps/printf`: A printf / sprintf implementation for embedded systems.
diff --git a/target/README.md b/target/README.md
new file mode 100644
index 000000000..ec46e56a6
--- /dev/null
+++ b/target/README.md
@@ -0,0 +1,16 @@
+# HW Targets
+
+This subdirectory contains the supported systems and their simulation environments, including testbenches and bootrom.
+
+- `shared`: contains the shared fesvr-related testbench components.
+- `snitch_cluster`
+  - `cfg`: contains the configuration files (`*.hjson`).
+  - `generated`: contains the generated `bootdata.cc` and the RTL wrapper for the Snitch cluster, `snitch_cluster_wrapper.sv`.
+  - `src`: contains the [Banshee](https://github.com/pulp-platform/banshee) configuration for the Snitch cluster.
+  - `sw`: contains all shared software:
+    - `apps`: contains applications for the Snitch cluster.
+    - `runtime`: contains the HW-specific runtime implementation for the Snitch cluster.
+      - `rtl`: RTL-related startup implementations.
+      - `banshee`: Banshee-related startup SW implementations.
+    - `tests`: lists of tests that can run on the Snitch cluster.
+  - `test`: contains the testharness and bootrom of the Snitch cluster.
\ No newline at end of file
diff --git a/target/snitch_cluster/README.md b/target/snitch_cluster/README.md
index 78949a793..5f7641a18 100644
--- a/target/snitch_cluster/README.md
+++ b/target/snitch_cluster/README.md
@@ -1,55 +1,324 @@
-# Snitch Cluster
+# Snitch cluster target

-This system provides the minimum necessary logic which is needed around a Snitch
-cluster to be executing binaries. Further documentation can be found
-[here](https://pulp-platform.github.io/snitch/ug/snitch_cluster/).
+The Snitch cluster target (`target/snitch_cluster`) is a simple RTL testbench
+around a Snitch cluster. The cluster can be configured using a config file. By default, the config file that will be picked up is `target/snitch_cluster/cfg/default.hjson`.
+
+The configuration parameters are documented using JSON schema.
Documentation for the schema and available configuration options can be found in `docs/schema-doc/snitch_cluster/`.

The cluster testbench simulates an infinite memory. The RISC-V ELF file to be simulated is
preloaded using RISC-V's Front-End Server (`fesvr`).

## Tutorial

In the following tutorial, assume the working directory is `target/snitch_cluster`. All paths are relative to this directory. Paths relative to the root of the repository are prefixed with a slash.

### Building the hardware

To compile the hardware for simulation, run one of the following commands, depending on the desired simulator:

```shell
# Verilator (for Docker users)
make bin/snitch_cluster.vlt
# Verilator (for IIS users)
verilator-4.110 make bin/snitch_cluster.vlt

# Questa (for IIS users)
questa-2022.3 make bin/snitch_cluster.vsim

# VCS (for IIS users)
vcs-2020.12 make bin/snitch_cluster.vcs
```

These commands compile the RTL sources in `work-vlt`, `work-vsim` and `work-vcs`, respectively. Additionally, common C++ testbench sources (e.g. the [frontend server (fesvr)](https://github.com/riscv-software-src/riscv-isa-sim)) are compiled under `work`. Each command will also generate a script or an executable (e.g. `bin/snitch_cluster.vsim`) which you can invoke to simulate the hardware. We will see how to do this in a later section.

### Cluster configuration

Note that the Snitch cluster RTL sources are partly automatically generated from a configuration file provided in `.hjson` format. Several RTL files are templated and use the `.hjson` configuration file to fill in the template entries. An example is `/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl`.

Under the `cfg` folder, different configurations are provided. The `cfg/default.hjson` configuration instantiates 8 compute cores + 1 DMA core in the cluster. If you need a specific configuration, you can create your own configuration file.

The command you executed previously automatically generated the templated RTL sources. It implicitly used the default configuration file.
To override the default configuration file, define the following variable when you invoke `make`:

```shell
make CFG_OVERRIDE=cfg/custom.hjson bin/snitch_cluster.vlt
```

___Note:__ whenever you override the configuration file on the `make` command-line, the configuration will be stored in the `cfg/lru.hjson` file. Successive invocations of `make` will automatically pick up the `cfg/lru.hjson` file.
You can therefore omit the `CFG_OVERRIDE` definition in successive commands unless you want to override the least-recently used configuration._

### Building the software

To build all of the software for the Snitch cluster, run the following command:

```bash
make DEBUG=ON sw
```

The `sw` target first generates some C header files which depend on the hardware configuration. Hence the need to generate the software for the same configuration as your hardware. Afterwards, it recursively invokes the `make` target in the `sw` subdirectory to build the apps/kernels which have been developed in that directory.

The `DEBUG=ON` flag tells the compiler to produce debugging symbols. It is necessary for the `annotate` target, showcased in the Debugging section of this guide, to work.

___Note:__ the RTL is not the only source which is generated from the configuration file. The software stack also depends on the configuration file. Make sure you always build the software with the same configuration as the hardware you are going to run it on._

### Running a simulation

Create the `logs` directory to host the simulation traces:

```shell
# If it's the first time you run this, the logs/ folder won't exist and you will have to create it
mkdir logs
```

Run one of the executables which were compiled in the previous step on your Snitch cluster hardware with your preferred simulator:

```shell
# Verilator (for Docker users)
bin/snitch_cluster.vlt sw/apps/blas/axpy/build/axpy.elf
# Verilator (for IIS users)
verilator-4.110 bin/snitch_cluster.vlt sw/apps/blas/axpy/build/axpy.elf

# Questa (for IIS users)
questa-2022.3 bin/snitch_cluster.vsim sw/apps/blas/axpy/build/axpy.elf

# VCS (for IIS users)
vcs-2020.12 bin/snitch_cluster.vcs sw/apps/blas/axpy/build/axpy.elf
```

The previous commands will run the simulation in your current terminal. You can also run the simulation in the QuestaSim GUI by adapting the previous command to:

```shell
# Questa (for IIS users)
questa-2022.3 bin/snitch_cluster.vsim.gui sw/apps/blas/axpy/build/axpy.elf
```

### Creating your first Snitch app

In the following, you will create your own AXPY kernel implementation as an example of how to develop software for Snitch.

#### Writing the C Code

Create a directory for your AXPY kernel under `sw/apps`:

```bash
mkdir sw/apps/axpy
```

And a `src` subdirectory to host your source code:

```bash
mkdir sw/apps/axpy/src
```

Here, create a new file named `axpy.c` inside the `src` directory with the following contents:

```C
#include "snrt.h"
#include "data.h"

// Define your kernel
void axpy(uint32_t l, double a, double *x, double *y, double *z) {
    for (uint32_t i = 0; i < l; i++) {
        z[i] = a * x[i] + y[i];
    }
    snrt_fpu_fence();
}

int main() {
    // Read the mcycle CSR (this is our way to mark/delimit a specific code region for benchmarking)
    uint32_t start_cycle = mcycle();

    // DM core does not participate in the computation
    if (snrt_is_compute_core())
        axpy(L, a, x, y, z);

    // Read the mcycle CSR
    uint32_t end_cycle = mcycle();
}
```
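In this first version, every compute core that passes the `snrt_is_compute_core()` check redundantly executes the same loop over the full vectors. A natural next step is to split the work across the cluster's compute cores. The following is a minimal sketch of such a variant, not part of the tutorial's reference code; it assumes the snRuntime API provides `snrt_cluster_core_idx()` and `snrt_cluster_compute_core_num()` (names taken from the snRuntime sources, verify them against your version):

```C
#include "snrt.h"

// Sketch: each compute core processes an interleaved slice of the vectors.
// snrt_cluster_core_idx() is assumed to return this core's index within the
// cluster, snrt_cluster_compute_core_num() the number of compute cores.
void axpy_parallel(uint32_t l, double a, double *x, double *y, double *z) {
    uint32_t offset = snrt_cluster_core_idx();
    uint32_t stride = snrt_cluster_compute_core_num();
    // Interleaved (cyclic) distribution of the loop iterations
    for (uint32_t i = offset; i < l; i += stride) {
        z[i] = a * x[i] + y[i];
    }
    snrt_fpu_fence();
}
```

An interleaved distribution keeps the sketch short; a blocked distribution (contiguous chunks per core) would work just as well here.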
+#### Compiling the C Code
+
+In your `axpy` folder, create a new file named `Makefile` with the following contents:
+
+```make
+APP = axpy
+SRCS = src/axpy.c
+INCDIRS = data
+
+include ../common.mk
+```
+
+This Makefile will be invoked recursively by the top-level Makefile, compiling your source code into an executable with the name provided in the `APP` variable.
+
+In order for the top-level Makefile to find your application, add the following line to `sw/apps.list`:
+
+```
+apps/axpy
+```
+
+Now you can recompile all software, including your newly added AXPY application:
+
+```shell
+make DEBUG=ON sw
+```
+
+Note that only the targets depending on the sources you have added or modified will be recompiled.
+
+In the `sw/apps/axpy/build` directory, you will now find your `axpy.elf` executable and some other files which were automatically generated to aid debugging. Open `axpy.dump` and search for `<x>`, `<y>` and `<z>`. You will see the addresses where the respective vectors defined in `data.h` have been allocated by the compiler. This file can also be very useful to see what assembly instructions your source code was compiled to, and to correlate the traces (as we will see later) with the source code.
+
+If you want to dig deeper into how our build system works and how these files were generated, you can follow the recursive Makefile invocations starting from the `sw` target in `snitch_cluster/Makefile`.
+
+#### Run your application
+
+You can run your application in simulation as shown in the previous sections. Make sure to pick the right binary, e.g.:
+
+```shell
+questa-2022.3 bin/snitch_cluster.vsim sw/apps/axpy/build/axpy.elf
+```
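+
+As a quick sanity check, you can inspect the simulator's exit code. This sketch assumes the testbench propagates the application's return value to the host, which is common for `fesvr`-based testbenches but is an assumption here:
+
+```shell
+questa-2022.3 bin/snitch_cluster.vsim sw/apps/axpy/build/axpy.elf
+# A nonzero exit code would indicate a failed run
+echo "simulation exit code: $?"
+```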
+### Debugging and benchmarking
+
+When you run the simulation, every core logs all the instructions it executes (along with additional information, such as the value of the registers before/after the instruction) in a trace file, located in the `logs` directory. The traces are identified by their hart ID, that is, a unique ID for every hardware thread (hart) in a RISC-V system. Since all our cores are single-threaded, this is effectively a unique ID per core.
+
+The simulation logs the traces in a non-human-readable format with the `.dasm` extension. To convert these to a human-readable form, run:
+
+```bash
+make -j traces
+```
+
+In addition to generating readable traces (`.txt` format), the above command also computes several performance metrics from each trace and appends them at the end of the trace. These can be collected into a single CSV file with the following target:
+
+```bash
+make logs/perf.csv
+# View the CSV file
+libreoffice logs/perf.csv
+```
+
+In this file you can find the `X_tstart` and `X_tend` metrics. These are the cycles in which a particular code region `X` starts and ends, and can hence be used to profile your code. Code regions are defined by calls to `mcycle()`. Every call to this function defines two code regions:
+- the code preceding the call, up to the previous `mcycle()` call or the start of the source file
+- the code following the call, up to the next `mcycle()` call or the end of the source file
+
+The CSV file can be useful to automate collection and post-processing of benchmarking data.
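+
+For instance, the following sketch shows how additional `mcycle()` calls subdivide a program into more, separately profiled regions (the region indices are illustrative):
+
+```C
+#include "snrt.h"
+
+int main() {
+    uint32_t t0 = mcycle();  // ends region 0 (startup), starts region 1
+    // ... initialization code: profiled as region 1 ...
+    uint32_t t1 = mcycle();  // ends region 1, starts region 2
+    // ... compute code: profiled as region 2 ...
+    uint32_t t2 = mcycle();  // ends region 2, starts region 3 (teardown)
+}
+```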
+Finally, debugging your program from the trace alone can be quite tedious and time-consuming. You would have to manually figure out which instructions in the trace correspond to which lines in your source code. Of course, you can help yourself with the disassembly.
+
+Alternatively, you can automatically annotate the traces with that information. With the following commands you can view the trace instructions side-by-side with the corresponding source code lines they were compiled from:
+
+```bash
+make -j annotate
+kompare -o logs/trace_hart_00000.diff
+```
+
+If you prefer to view this information in a regular text editor (e.g. for search), you can open the `logs/trace_hart_xxxxx.s` files. Here, the annotations are interleaved with the trace rather than being presented side-by-side.
+
+___Note:__ the `annotate` target uses the `addr2line` binutil behind the scenes, which needs debugging symbols to correlate instruction addresses with originating source code lines. The `DEBUG=ON` flag you specified when building the software tells the compiler to produce debugging symbols when compiling your code._
+
+The traces contain a lot of information in which we might not be interested at first. To visualize only the runtime of the compute region in our code, first create a file named `layout.csv` in `sw/apps/axpy` with the following contents:
+
+```
+            , compute
+"range(0,9)", 1
+9           ,
+
+```
+
+Then run the following commands:
+
+```bash
+# Similar to logs/perf.csv but filters all but the tstart and tend metrics
+make logs/event.csv
+# Labels, filters and reorders the event regions as specified by an application-specific layout file
+../../../util/trace/layout_events.py logs/event.csv sw/apps/axpy/layout.csv -o logs/trace.csv
+# Creates a trace file which can be visualized with Chrome's TraceViewer
+../../../util/trace/eventvis.py -o logs/trace.json logs/trace.csv
+```
+
+Go to `https://ui.perfetto.dev/`. Here you can load the `logs/trace.json` file and graphically view the runtime of the compute region in your code. To learn more about the layout file syntax and what the Python scripts do, you can have a look at the description comment at the start of the scripts themselves.
+
+__Great! But have you noticed a problem?__
+
+Look into `sw/apps/axpy/build/axpy.dump` and search for the address of the output variable `<z>`:
+
+```
+Disassembly of section .bss:
+
+80000960 <z>:
+        ...
+```
+
+Now grep this address in your traces:
+
+```bash
+grep 80000960 logs/*.txt
+...
+```
+
+It appears in every trace! All the cores issue an `fsd` (float store double) to this address. You are not parallelizing your kernel, but executing it 8 times!
+
+Modify `sw/apps/axpy/src/axpy.c` to truly parallelize your kernel:
+
+```C
+#include "snrt.h"
+#include "data.h"
+
+// Define your kernel
+void axpy(uint32_t l, double a, double *x, double *y, double *z) {
+    int core_idx = snrt_cluster_core_idx();
+    int offset = core_idx * l;
+
+    for (int i = 0; i < l; i++) {
+        z[offset] = a * x[offset] + y[offset];
+        offset++;
+    }
+    snrt_fpu_fence();
+}
+
+int main() {
+    // Read the mcycle CSR (this is our way to mark/delimit a specific code region for benchmarking)
+    uint32_t start_cycle = mcycle();
+
+    // DM core does not participate in the computation
+    if (snrt_is_compute_core())
+        axpy(L / snrt_cluster_compute_core_num(), a, x, y, z);
+
+    // Read the mcycle CSR
+    uint32_t end_cycle = mcycle();
+}
+```
+
+Now re-run your kernel and compare the execution time of the compute region with the previous version.
+
+## Code Reuse
+
+As you may have noticed, there is a good deal of code which is independent of the hardware platform we execute our AXPY kernel on. This is true for the `data.h` file and possible data generation scripts. The Snitch AXPY kernel itself is not specific to the Snitch cluster, but can be ported to any platform which provides an implementation of the snRuntime API. An example is Occamy, with its own testbench and SW development environment.
+
+It is thus preferable to develop the data generation scripts and Snitch kernels in a shared location, from which multiple platforms can take and include the code. The `sw` directory in the root of this repository was created with this goal in mind. For the AXPY example, the shared sources are hosted under the `sw/blas/axpy` directory. As an example of how these shared sources are used to build an AXPY application for a specific platform (in this case the standalone Snitch cluster), you can have a look at the `target/snitch_cluster/sw/apps/blas/axpy` directory.
+
+We recommend that you also follow this approach in your own developments, for as much of your code as can be reused.
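+
+To make the split concrete, the resulting layout looks roughly as follows (the exact contents of the shared directory are indicative, not authoritative):
+
+```
+sw/blas/axpy/                              # shared, platform-independent sources
+├── datagen/                               # data generation scripts (e.g. datagen.py)
+└── ...                                    # kernel sources
+target/snitch_cluster/sw/apps/blas/axpy/   # thin, platform-specific wrapper
+```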
diff --git a/target/snitch_cluster/WALKTHROUGH.md b/target/snitch_cluster/WALKTHROUGH.md deleted file mode 100644 index f58bb0726..000000000 --- a/target/snitch_cluster/WALKTHROUGH.md +++ /dev/null @@ -1,350 +0,0 @@ -# Walkthrough - -## Fast setup at IIS - -### Scratch folder - -First, create yourself a folder to work in on the scratch disk. Your home directory is mounted from the network, and has tighter size and access speed constraints than the scratch disks in your machine. You can sometimes select between multiple scratch disks, such as `/scratch`, `/scratch2`, `/scratch3`. - -```bash -# Look how much free space there is in the scratch folders -df -h | grep scratch -# Pick one and create your folder in there, for example: -mkdir /scratch/[your username] -# Note, contrary to your home folder, the scratch folder is local to your machine, but you can access it on any other machine like so -cd /usr/scratch/[your machine]/[your username] -# You can find the name of a machine by running -hostname -# (Note, keep only the name before .ee.ethz.ch) -``` - -### Dependencies - -At IIS the default version of some tools (`gcc`, `cmake`, ...) might be too old for certain projects. You will need to setup your own default binary for these tools: - -```bash -# Create your own bin folder in your home directory -mkdir ~/bin && cd ~/bin -# There you can change the default binaries for your user -ln -s /usr/pack/gcc-9.2.0-af/linux-x64/bin/gcc gcc -ln -s /usr/pack/gcc-9.2.0-af/linux-x64/bin/g++ g++ -ln -s /usr/sepp/bin/cmake-3.18.1 cmake -ln -s /home/colluca/bin/spike-dasm spike-dasm -# Now you need to add this folder to your PATH: -# Open ~/.profile and add the lines -export PATH=~/bin:$PATH -export PATH=/usr/scratch/dachstein/colluca/opt/verible/bin:$PATH -``` - -Create a Python virtual environment: - -``` -python3.9 -m venv ~/.venvs/snitch -``` - -Activate your environment, e.g. in a bash shell: -``` -source ~/.venvs/snitch/bin/activate -``` - -Note that the default shell for IIS users is `tcsh`, hence you may need to adapt the previous command accordingly. - -Add the last line to your shell startup file (e.g. `~/.bashrc` if you use bash as the default shell) if you want the virtual environment to be activated by default when you open a new terminal. - -To compile your code to a RISC-V executable you will need a compiler toolchain for RISC-V. There are plenty of pre-compiled RISC-V toolchains at IIS, for Snitch you can use the following LLVM toolchain. - -```bash -# You can add this to your shell startup file such that you do not have to run this command every time you open a new terminal -export PATH=/usr/scratch2/rapanui/lbertaccini/snitch_occamy_vsum_test/riscv32-pulp-llvm-centos7-131/bin/:$PATH -``` - -## Cloning Snitch - -First, clone this repository on your scratch folder. We suggest you first make a private fork of the repo. - -```bash -git clone https://github.com/pulp-platform/snitch.git -cd snitch -``` - -Now install the required Python dependencies. Make sure you have activated your virtual environment before doing so. - -``` -pip install -r python-requirements.txt -``` - -## Compiling the Snitch hardware for simulation - -Go to the `snitch_cluster` folder, where most of your efforts will take place: - -``` -cd target/snitch_cluster -``` - -___Note:__ from now on, assume all paths to be relative to `target/snitch_cluster`._ - -The Snitch cluster RTL sources are partly automatically generated from a configuration file provided in `.hjson` format. 
Several RTL files are templated and use the `.hjson` configuration file to fill the template entries. An example is `hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl`. -Under the `cfg` folder different configurations are provided. The `default.hjson` configuration instantiates 8 compute cores + 1 DMA core in the cluster. If you need a specific configuration you can create your own configuration file. To override the default configuration, define the following variable when you invoke Make: - -```bash -# Compile the RTL for Questasim -make CFG_OVERRIDE=cfg/cluster.hjson bin/snitch_cluster.vsim -``` - -The previous command generates the templated RTL sources from the configuration file and compiles the RTL for Questasim simulation. - -The RTL simulation model is compiled in `./work-vsim` and the [frontend server (fesvr)](https://github.com/riscv-software-src/riscv-isa-sim) and other C++ sources used throughout the testbench are compiled into `./work`. A script named `bin/snitch_cluster.vsim` was also generated (_you can have a look inside the file_) as a wrapper for the command that you would invoke to simulate your hardware with Questasim. The script takes an executable compiled for Snitch as input, and feeds it as an argument to the simulator. The testbench relies on the `fesvr` utilities to load your executable into the simulated DRAM memory. - -Note the `CFG_OVERRIDE` variable need only be defined for those targets which make use of the configuration file, e.g. RTL generation. - -Note that the RTL is not the only source which is generated from the configuration file. The software stack also depends on the configuration file. Make sure you always build the software with the same configuration of the hardware you are going to run it on. By default, if you compile the software after you have compiled the hardware, this is ensured automatically for you. Whenever you override the configuration file on the Make command-line, the configuration will be stored in the `cfg/lru.hjson` file. Successive invocations of Make may omit the `CFG_OVERRIDE` flag and the least-recently used configuration saved in `cfg/lru.hjson` will be picked up automatically. - -___Note:__ When you have time, have a look at the `Makefile` and the commands that are executed by the `sw` and `bin/snitch_cluster.vsim` targets. Note that the Makefile includes the Make fragment in `target/common/common.mk` at the root of this repository where plenty of things are defined._ - -## Building the Snitch software - -To build all of the software for the Snitch cluster, run the following Make command: - -```bash -make DEBUG=ON sw -``` - -The `sw` target first generates some C header files which depend on the hardware configuration. Hence, the need to generate the software for the same configuration as your hardware. Afterwards, it recursively invokes the `make` target in the `sw` subdirectory to build the apps/kernels which have been developed in that directory. - -The `DEBUG=ON` flag is used to tell the compiler to produce debugging symbols. It is necessary for the `annotate` target, showcased in the Debugging section of this guide, to work. 
- -## Creating your first Snitch app - -### Writing the C code - -Create a directory for your AXPY kernel under `sw/`: - -```bash -mkdir sw/apps/axpy -``` - -And a `src` subdirectory to host your source code: - -```bash -mkdir sw/apps/axpy/src -``` - -Here, create a new file named `axpy.c` with the following contents: - -```C -#include "snrt.h" -#include "data.h" - -// Define your kernel -void axpy(uint32_t l, double a, double *x, double *y, double *z) { - for (uint32_t i = 0; i < l ; i++) { - z[i] = a * x[i] + y[i]; - } - snrt_fpu_fence(); -} - -int main() { - // Read the mcycle CSR (this is our way to mark/delimit a specific code region for benchmarking) - uint32_t start_cycle = mcycle(); - - // DM core does not participate in the computation - if(snrt_is_compute_core()) - axpy(L, a, x, y, z); - - // Read the mcycle CSR - uint32_t end_cycle = mcycle(); -} - -``` - -The `snrt.h` file implements the snRuntime API, a library of convenience functions to program Snitch cluster based systems. These sources are located under `sw/runtime/rtl` and are automatically referenced by our compilation scripts. - -___Note:__ When you have time, have a look at the files inside `sw/snRuntime` in the root of this repository to see what kind of functionality the snRuntime API defines. Note this is only an API, with some base implementations. The Snitch cluster implementation of the snRuntime for RTL simulation can be found under `sw/runtime/rtl`. It is automatically built and linked with user applications thanks to our compilation scripts._ - -We will have to instead create the `data.h` file ourselves. Create a `data` folder to host the data for your kernel to operate on: - -```bash -mkdir sw/apps/axpy/data -``` - -Here, create a C file named `data.h` with the following contents: - -```C -uint32_t L = 16; - -double a = 2; - -double x[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - -double y[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - -double z[16]; - -``` - -In this file we hardcode the data to be used by the kernel. This data will be loaded in memory together with your application code. In general, to verify your code you may want to randomly generate the above data. Or you may want to test your kernel on different problem sizes, e.g. varying the length of the vectors, without having to manually rewrite the file. This can be achieved by generating the data header file with a Python script. You may have a look at the `sw/blas/axpy/datagen` folder in the root of this repository as an example. You may reuse several of the functions defined in `sw/blas/axpy/datagen/datagen.py`. Eventually, we will promote these functions to a dedicated Python module which can be easily reused. - -### Compiling the C code - -In your `axpy` folder, create a new file named `Makefile` with the following contents: - -```make -APP = axpy -SRCS = src/axpy.c -INCDIRS = data - -include ../common.mk -``` - -This Makefile will be invoked recursively by the top-level Makefile, compiling your source code into an executable with the name provided in the `APP` variable. - -In order for the top-level Makefile to find your application, add the following line to `sw/apps.list`: - -``` -apps/axpy -``` - -Now you can recompile all software, including your newly added AXPY application, with the following command (in the `snitch_cluster` folder): - -```bash -make DEBUG=ON sw -``` - -Note, only the targets depending on the sources you have added/modified will be recompiled. 
- -In the `sw/apps/axpy/build` directory, you will now find your `axpy.elf` executable and some other files which were automatically generated to aid debugging. Open `axpy.dump` and search for ``, `` and ``. You will see the addresses where the respective vectors defined in `data.h` have been allocated by the compiler. This file can also be very useful to see what assembly instructions your source code was compiled to, and correlate the traces (we will later see) with the source code. - -If you want to dig deeper into how our build system works and how these files were generated you can follow the recursive Makefile invocations starting from the `sw` target in `snitch_cluster/Makefile`. - -### Running your application - -Run the executable on your Snitch cluster hardware in simulation: - -```bash -# If it's the first time you run this the logs/ folder won't exist and you will have to create it -mkdir logs -# Run the simulation in the current terminal -bin/snitch_cluster.vsim sw/apps/axpy/build/axpy.elf -# Run the simulation in the QuestaSim GUI -bin/snitch_cluster.vsim.gui sw/apps/axpy/build/axpy.elf -``` - -### Debugging and benchmarking - -When you run the simulation, every core will log all the instructions it executes (along with additional information, such as the value of the registers before/after the instruction) in a trace file, located in the `./logs` directory. The traces are identified by their hart ID, that is a unique ID for every hardware thread (hart) in a RISC-V system (and since all our cores have a single thread that is a unique ID per core) - -The simulation logs the traces in a non-human readable format with `.dasm` extension. To convert these to a human-readable form run: - -```bash -make -j traces -``` - -In addition to generating readable traces (`.txt` format), the above command also computes several performance metrics from the trace and appends them at the end of the trace. These can be collected into a single CSV file with the following target: - -```bash -make logs/perf.csv -# View the CSV file -libreoffice logs/perf.csv -``` - -In this file you can find the `X_tstart` and `X_tend` metrics. These are the cycles in which a particular code region `X` starts and ends, and can hence be used to profile your code. Code regions are defined by calls to `mcycle()`. Every call to this function defines two code regions: -- the code preceding the call, up to the previous `mcycle()` call or the start of the source file -- the code following the call, up to the next `mcycle()` call or the end of the source file - -The CSV file can be useful to automate collection and post-processing of benchmarking data. - -Finally, debugging your program from the trace alone can be quite tedious and time-consuming. You would have to manually understand which instructions in the trace correspond to which lines in your source code. Surely, you can help yourself with the disassembly. - -Alternatively, you can automatically annotate the traces with that information. With the following commands you can view the trace instructions side-by-side with the corresponding source code lines they were compiled from: - -```bash -make -j annotate -kompare -o logs/trace_hart_00000.diff -``` - -If you prefer to view this information in a regular text editor (e.g. for search), you can open the `logs/trace_hart_xxxxx.s` files. Here, the annotations are interleaved with the trace rather than being presented side-by-side. 
- -___Note:__ the `annotate` target uses the `addr2line` binutil behind the scenes, which needs debugging symbols to correlate instruction addresses with originating source code lines. The `DEBUG=ON` flag you specified when building the software is used to tell the compiler to produce debugging symbols when compiling your code._ - -The traces contain a lot of information which we might not be interested at first. To simply visualize the runtime of the compute region in our code, first create a file named `layout.csv` in `sw/apps/axpy` with the following contents: - -``` - , compute -"range(0,9)", 1 -9 , - -``` - -Then run the following commands: - -```bash -# Similar to logs/perf.csv but filters all but tstart and tend metrics -make logs/event.csv -# Labels, filters and reorders the event regions as specified by an application-specific layout file -../../../util/trace/layout_events.py logs/event.csv sw/apps/axpy/layout.csv -o logs/trace.csv -# Creates a trace file which can be visualized with Chrome's TraceViewer -../../../util/trace/eventvis.py -o logs/trace.json logs/trace.csv -``` - -Open a Chrome browser and go to `chrome://tracing`. Here you can load the `logs/trace.json` file and graphically view the runtime of the compute region in your code. To learn more about the layout file syntax and what the Python scripts do you can have a look at the description comment at the start of the scripts themselves. - -__Great, but, have you noticed a problem?__ - -Look into `sw/apps/axpy/build/axpy.dump` and search for the address of the output variable `` : - -``` -Disassembly of section .bss: - -80000960 : - ... -``` - -Now grep this address in your traces: - -```bash -grep 80000960 logs/*.txt -... -``` - -It appears in every trace! All the cores issue a `fsd` (float store double) to this address. You are not parallelizing your kernel but executing it 8 times! - -Modify `sw/apps/axpy/src/axpy.c` to truly parallelize your kernel: - -```C -#include "snrt.h" -#include "data.h" - -// Define your kernel -void axpy(uint32_t l, double a, double *x, double *y, double *z) { - int core_idx = snrt_cluster_core_idx(); - int offset = core_idx * l; - - for (int i = 0; i < l; i++) { - z[offset] = a * x[offset] + y[offset]; - offset++; - } - snrt_fpu_fence(); -} - -int main() { - // Read the mcycle CSR (this is our way to mark/delimit a specific code region for benchmarking) - uint32_t start_cycle = mcycle(); - - // DM core does not participate in the computation - if(snrt_is_compute_core()) - axpy(L / snrt_cluster_compute_core_num(), a, x, y, z); - - // Read the mcycle CSR - uint32_t end_cycle = mcycle(); -} -``` - -Now re-run your kernel and compare the execution time of the compute region with the previous version. - -## Code reuse - -As you may have noticed, there is a good deal of code which is independent of the hardware platform we execute our AXPY kernel on. This is true for the `data.h` file and possible data generation scripts. The Snitch AXPY kernel itself is not specific to the Snitch cluster, but can be ported to any platform which provides an implementation of the snRuntime API. An example is Occamy, with its own testbench and SW development environment. - -It is thus preferable to develop the data generation scripts and Snitch kernels in a shared location, from which multiple platforms can take and include the code. The `sw` directory in the root of this repository was created with this goal in mind. For the AXPY example, shared sources are hosted under the `sw/blas/axpy` directory. 
As an example of how these shared sources are used to build an AXPY application for a specific platform (in this case the standalone Snitch cluster) you can have a look at the `target/snitch_cluster/sw/apps/blas/axpy`. - -We recommend that you follow this approach also in your own developments for as much of the code which can be reused. diff --git a/util/clustergen/cluster.py b/util/clustergen/cluster.py index 8022091df..3b9badb64 100644 --- a/util/clustergen/cluster.py +++ b/util/clustergen/cluster.py @@ -12,7 +12,7 @@ import json import re import logging as log -import pathlib +from pathlib import Path # Fill in default values for config values which do not have a user-defined value. @@ -45,7 +45,7 @@ class Generator(object): DISCLAIMER = """// AUTOMATICALLY GENERATED by clustergen.py; edit the script or configuration // instead.""" - file_path = pathlib.Path(__file__).parent + file_path = Path(__file__).parent snitch_cluster_folder = file_path / "../../hw/snitch_cluster" templates = TemplateLookup(directories=[snitch_cluster_folder], @@ -54,19 +54,15 @@ class Generator(object): Generator class which contains common component to generate different systems. @root_schema: Schema object to which the generator corresponds. """ - def __init__(self, root_schema): - # Load the cluster schema. - absolute_path_to_schema_dir = self.file_path / "../../docs/schema" - root_schema_filename = root_schema - - self.root_schema = read_schema(absolute_path_to_schema_dir / - root_schema_filename) + def __init__(self, root_schema, remote_schemas=[]): + self.root_schema_path = root_schema + self.root_schema = read_schema(root_schema) store_set = dict() - # iterate over schema directory and generate a mapping from remote URLs + # iterate over remote schemas and generate a mapping from remote URLs # to local URIs. - for path in absolute_path_to_schema_dir.iterdir(): + for path in remote_schemas: schema = read_schema(path) store_set[schema["$id"]] = schema @@ -157,7 +153,7 @@ def __init__(self, cfg, pma_cfg): Initialize with a given configuration. The constructor checks conformans to the cluster schema and constructs a `cfg` object. """ - super().__init__("snitch_cluster.schema.json") + super().__init__(Path(__file__).parent / "../../docs/schema/snitch_cluster.schema.json") self.mems = set() self.mems_desc = dict() self.validate(cfg) @@ -365,7 +361,9 @@ class SnitchClusterTB(Generator): complex systems. """ def __init__(self, cfg): - super().__init__("snitch_cluster_tb.schema.json") + schema = Path(__file__).parent / "../../docs/schema/snitch_cluster_tb.schema.json" + remote_schemas = [Path(__file__).parent / "../../docs/schema/snitch_cluster.schema.json"] + super().__init__(schema, remote_schemas) # Validate the schema. self.validate(cfg) # from here we know that we have a valid object. diff --git a/util/container/README.md b/util/container/README.md index 0ef6040c6..e928c937f 100644 --- a/util/container/README.md +++ b/util/container/README.md @@ -1,42 +1,42 @@ -# Docker Container +# Docker container -Docker container based on Ubuntu 18.04 LTS containing various hardware and -software development tools for Snitch. +This directory contains the [Docker file](Dockerfile) used to build the `snitch_cluster` Docker container. The container is based on the Ubuntu 18.04 LTS image and comes with all free development tools for Snitch pre-installed. The environment is also already configured, such that no additional steps are required to work in the container after installation. 
-## Pre-built Container
+## Installation

-There is an experimental version of the container available.
-To download, first login to the GitHub container registry:
+### Pre-built container
+
+There is a pre-built version of the container available online. This version is up to date with the latest developments on the `main` branch. The CI publishes a new container every time a new commit is pushed to this branch.
+
+To download the container, first log in to the GitHub container registry:

```shell
$ docker login ghcr.io
```

You will be asked for a username (your GitHub username). As a password you should use a
-[PAT](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
+[personal access token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
that at least has package registry read permission.

-Then you can run:
-
+You can then pull the container by running:
```shell
-$ docker pull ghcr.io/pulp-platform/snitch
+$ docker pull ghcr.io/pulp-platform/snitch_cluster
```

-## Using the Container
+### Build instructions

-To run container in interactive mode:
+In case you cannot use the pre-built container, e.g. if you need to make changes to the Dockerfile, you can build the
+container locally by running the following command in the root of the repository:

```shell
-$ docker run -it -v $REPO_TOP:/repo -w /repo ghcr.io/pulp-platform/snitch
+$ sudo docker build -t ghcr.io/pulp-platform/snitch_cluster -f util/container/Dockerfile .
```

-## Local Build Instructions
+## Usage

-In case you do not want to use the pre-built container you can also build the
-container in local mode:
+To run the container in interactive mode:

```shell
-$ cd $REPO_TOP
-$ sudo docker build -t ghcr.io/pulp-platform/snitch -f util/container/Dockerfile .
+$ docker run -it -v $REPO_TOP:/repo -w /repo ghcr.io/pulp-platform/snitch_cluster
```

## Limitations