From d024b04d22d026f158549b3e8c1c56544b42a64d Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Wed, 28 Aug 2024 16:08:03 +0200
Subject: [PATCH] treewide: Add Doxygen-generated docs (#189)

* docs: Set up Doxygen documentation

* snRuntime: Document fundamental modules

* ci: Install Doxygen in container and use for doc deployment
---
 .github/workflows/publish-docs.yml |  33 ++---
 .gitignore                         |   1 +
 Makefile                           |  19 ++-
 docs/.gitignore                    |   1 -
 docs/Doxyfile                      |  11 ++
 docs/doxybook2.json                |  13 --
 docs/rm/snRuntime.md               |   1 +
 mkdocs.yml                         |   8 +-
 sw/snRuntime/src/alloc_v2.h        | 102 ++++++++++++---
 sw/snRuntime/src/dma.h             | 201 +++++++++++++++++++++++------
 sw/snRuntime/src/ssr.h             | 144 ++++++++++++++++-----
 sw/snRuntime/src/sync.h            |  96 ++++++++++----
 sw/snRuntime/src/team.h            |  86 ++++++++++++
 util/container/Dockerfile          |   6 +
 14 files changed, 566 insertions(+), 156 deletions(-)
 delete mode 100644 docs/.gitignore
 create mode 100644 docs/Doxyfile
 delete mode 100644 docs/doxybook2.json
 create mode 100644 docs/rm/snRuntime.md

diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml
index e98139f07..23a53bf4a 100644
--- a/.github/workflows/publish-docs.yml
+++ b/.github/workflows/publish-docs.yml
@@ -10,32 +10,19 @@ jobs:
   deploy:
     name: Deploy documentation
     runs-on: ubuntu-22.04
+    container:
+      image: ghcr.io/pulp-platform/snitch_cluster:main
     steps:
       - uses: actions/checkout@v2
-      - uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-      - uses: mattnotmitt/doxygen-action@v1
-        with:
-          working-directory: sw/
-      - name: bender install
-        uses: pulp-platform/pulp-actions/bender-install@v2
-        with:
-          version: 0.27.1
-      - name: Install Python requirements
-        run: pip install -r python-requirements.txt
-      - name: Generate runtime documentation
-        # yamllint disable rule:line-length
+      # For some reason, the checkout is done by a different user than
+      # the one deploying to GitHub (root, possibly due to Docker),
+      # so we need to mark the repository as a safe directory.
+      - name: Git config safe.directory
         run: |
-          mkdir doxybook2; cd doxybook2
-          wget https://github.com/matusnovak/doxybook2/releases/download/v1.4.0/doxybook2-linux-amd64-v1.4.0.zip
-          unzip doxybook2-linux-amd64-v1.4.0.zip; cd ../
-          chmod +x doxybook2/bin/doxybook2
-          mkdir docs/runtime
-          ./doxybook2/bin/doxybook2 --input sw/doxygen/xml --output docs/runtime --config docs/doxybook2.json
-          rm -rf doxybook2
-        # yamllint enable rule:line-length
+          git config --global --add safe.directory $GITHUB_WORKSPACE
       - name: Generate documentation sources
-        run: make doc-srcs
+        run: |
+          make doc-srcs
+          make doxygen-docs
       - name: Build and deploy documentation
         run: mkdocs gh-deploy --force
diff --git a/.gitignore b/.gitignore
index 5f7019e12..320ef1947 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ gmon.out
 # Docs
 /site/
 /docs/generated/
+/docs/doxygen/
 
 # Installation directories
 /.venv/
diff --git a/Makefile b/Makefile
index dcd0d6ef1..498c7d121 100644
--- a/Makefile
+++ b/Makefile
@@ -49,23 +49,36 @@ clean-nonfree:
 # Docs #
 ########
 
-GENERATED_DOCS_DIR = docs/generated
+DOCS_DIR = docs
+
+GENERATED_DOCS_DIR = $(DOCS_DIR)/generated
 GENERATED_DOC_SRCS = $(GENERATED_DOCS_DIR)/peripherals.md
 
+DOXYGEN_DOCS_DIR = $(DOCS_DIR)/doxygen
+DOXYGEN_INPUTS   = $(DOCS_DIR)/rm/snRuntime.md
+DOXYGEN_INPUTS  += $(shell find sw/snRuntime -name '*.c' -o -name '*.h')
+DOXYFILE         = $(DOCS_DIR)/Doxyfile
+
 all: docs
 clean: clean-docs
-.PHONY: doc-srcs docs clean-docs
+.PHONY: doc-srcs doxygen-docs docs clean-docs
 
 doc-srcs: $(GENERATED_DOC_SRCS)
 
-docs: doc-srcs
+doxygen-docs: $(DOXYGEN_DOCS_DIR)
+
+docs: doc-srcs doxygen-docs
 	mkdocs build
 
 clean-docs:
 	rm -rf $(GENERATED_DOCS_DIR)
+	rm -rf $(DOXYGEN_DOCS_DIR)
 
 $(GENERATED_DOCS_DIR):
 	mkdir -p $@
 
 $(GENERATED_DOCS_DIR)/peripherals.md: hw/snitch_cluster/src/snitch_cluster_peripheral/snitch_cluster_peripheral_reg.hjson | $(GENERATED_DOCS_DIR)
 	$(REGGEN) -d $< > $@
+# Touch the target so its mtime advances even if doxygen only rewrites subdirs.
+$(DOXYGEN_DOCS_DIR): $(DOXYFILE) $(DOXYGEN_INPUTS)
+	doxygen $< && touch $@
diff --git a/docs/.gitignore b/docs/.gitignore
deleted file mode 100644
index e4d05d91a..000000000
--- a/docs/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-runtime
diff --git a/docs/Doxyfile b/docs/Doxyfile
new file mode 100644
index 000000000..f9d60136b
--- /dev/null
+++ b/docs/Doxyfile
@@ -0,0 +1,11 @@
+PROJECT_NAME = "Snitch Runtime"
+INPUT = docs/rm/snRuntime.md sw/snRuntime
+RECURSIVE = YES
+FILE_PATTERNS = *.h *.c
+OUTPUT_DIRECTORY = docs/doxygen/
+USE_MDFILE_AS_MAINPAGE = docs/rm/snRuntime.md
+GENERATE_LATEX = NO
+ENABLE_PREPROCESSING = YES
+MACRO_EXPANSION = YES
+EXPAND_ONLY_PREDEF = YES
+PREDEFINED = __attribute__(x)=
\ No newline at end of file
diff --git a/docs/doxybook2.json b/docs/doxybook2.json
deleted file mode 100644
index 28d69396e..000000000
--- a/docs/doxybook2.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "baseUrl": "/snitch_cluster/runtime/",
-  "indexInFolders": true,
-  "linkSuffix": "/",
-  "indexClassesName": "index",
-  "indexFilesName": "index",
-  "indexGroupsName": "index",
-  "indexNamespacesName": "index",
-  "indexRelatedPagesName": "index",
-  "indexExamplesName": "index",
-  "mainPageInRoot": true,
-  "mainPageName": "index"
-}
diff --git a/docs/rm/snRuntime.md b/docs/rm/snRuntime.md
new file mode 100644
index 000000000..42eb97ef2
--- /dev/null
+++ b/docs/rm/snRuntime.md
@@ -0,0 +1 @@
+These pages host the documentation for the Snitch runtime, a set of runtime and library functions to make writing parallel and efficient C code for Snitch easier.
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 2ccad29a7..d13ad0eb1 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -77,11 +77,5 @@ nav:
           - Snitch Target Utilities:
               - run.py: rm/snitch_target_utils/run.md
               - build.py: rm/snitch_target_utils/build.md
-          - Snitch Runtime:
-              - Pages: runtime/Pages/index.md
-              - Files: runtime/Files/index.md
-              - Classes: runtime/Classes/index.md
-              - Examples: runtime/Examples/index.md
-              - Modules: runtime/Modules/index.md
-              - Namespaces: runtime/Namespaces/index.md
+          - Snitch Runtime: doxygen/html/index.html
   - Publications: publications.md
diff --git a/sw/snRuntime/src/alloc_v2.h b/sw/snRuntime/src/alloc_v2.h
index 29ffb81e5..2ca18b802 100644
--- a/sw/snRuntime/src/alloc_v2.h
+++ b/sw/snRuntime/src/alloc_v2.h
@@ -2,29 +2,61 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+/**
+ * @file
+ * @brief Defines functions to dynamically allocate the cluster's L1 memory.
+ *
+ * This file provides functions to dynamically allocate the cluster's L1
+ * memory. It includes functions for allocating memory for cluster-local
+ * variables, compute core-local variables, and for manipulating pointers to
+ * variables allocated by different cores or clusters.
+ */
+
 extern __thread snrt_allocator_t l1_allocator_v2;
 
+/**
+ * @brief Get a pointer to the L1 allocator.
+ *
+ * @return Pointer to the L1 allocator.
+ */
 inline snrt_allocator_t *snrt_l1_allocator_v2() { return &l1_allocator_v2; }
 
+/**
+ * @brief Get the next pointer of the L1 allocator.
+ *
+ * @return The next pointer of the L1 allocator.
+ */
 inline void *snrt_l1_next_v2() { return (void *)snrt_l1_allocator_v2()->next; }
 
 /**
- * @brief Override the L1 allocator next pointer
+ * @brief Override the L1 allocator next pointer.
+ *
+ * @param next The new value for the next pointer.
  */
 inline void snrt_l1_update_next_v2(void *next) {
     snrt_l1_allocator_v2()->next = (uint32_t)next;
 }
 
-// Check that allocation doesn't exceed allocator bounds, and raise an
-// exception otherwise
+/**
+ * @brief Check if the allocation exceeds the allocator bounds and raise an
+ *        exception if it does.
+ */
 inline void snrt_l1_alloc_check_bounds() {
     if (snrt_l1_allocator_v2()->next > snrt_l1_allocator_v2()->end)
         asm volatile("ecall \n");
 }
 
-// Dynamically allocate space for a variable of size `size` in the cluster's L1
-// memory. This function should be invoked by every core in a cluster. Every
-// core receives a pointer to the allocated variable.
+/**
+ * @brief Allocate space for a variable in the cluster's L1 memory.
+ *
+ * Dynamically allocates space for a variable of size `size` in the cluster's
+ * L1 memory, aligned to `alignment`. This function should be invoked by every
+ * core in the cluster; each core receives a pointer to the allocated variable.
+ *
+ * @param size The size of the variable to allocate.
+ * @param alignment The alignment of the allocation.
+ * @return Pointer to the allocated variable.
+ */
 inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) {
     snrt_l1_allocator_v2()->next =
         ALIGN_UP(snrt_l1_allocator_v2()->next, alignment);
@@ -34,11 +66,19 @@ inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) {
     return retval;
 }
 
-// Dynamically allocate space for N variables of size `size` in the cluster's
-// L1 memory, N being the number of compute cores in the cluster. This function
-// should be invoked by every core in a cluster. Every compute core receives a
-// pointer to a unique variable among the N which have been allocated. The
-// return value for the DM core is undefined.
+/**
+ * @brief Allocate space for N variables in the cluster's L1 memory.
+ *
+ * Dynamically allocates a contiguous block of N variables of size `size` in
+ * the cluster's L1 memory, N being the number of compute cores in the cluster.
+ * The whole block is aligned to `alignment`. This function should be invoked
+ * by every core in the cluster.
+ *
+ * @param size The size of each variable to allocate.
+ * @param alignment The alignment of the allocation.
+ * @return Pointer to the allocated variable for each compute core.
+ *         The return value for the DM core is undefined.
+ */
 inline void *snrt_l1_alloc_compute_core_local(size_t size,
                                               const size_t alignment) {
     snrt_l1_allocator_v2()->next =
@@ -49,24 +89,52 @@ inline void *snrt_l1_alloc_compute_core_local(size_t size,
     return retval;
 }
 
-// Takes a pointer to a variable allocated using
-// `snrt_l1_alloc_compute_core_local` and returns a pointer to the same
-// variable allocated by another core, as specified by `core_idx`.
-// The `size` argument should be the same used during allocation.
+/**
+ * @brief Get a pointer to the same variable allocated by another core.
+ *
+ * This function takes a pointer to a variable allocated using
+ * `snrt_l1_alloc_compute_core_local` and returns a pointer to the same
+ * variable allocated by another core, as specified by `core_idx`.
+ * The `size` argument should be the same used during allocation.
+ *
+ * @param ptr Pointer to the variable allocated by the current core.
+ * @param core_idx Index of the core that allocated the variable.
+ * @param size The size of the variable.
+ * @return Pointer to the same variable allocated by the specified core.
+ */
 inline void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx,
                                          size_t size) {
     size_t offset = (core_idx - snrt_cluster_core_idx()) * size;
     return (void *)((uintptr_t)ptr + offset);
 }
 
-// Takes a pointer to a variable in the source cluster's L1 memory and returns
-// a pointer to the same offset in the destination cluster's L1 memory.
+/**
+ * @brief Get a pointer to the same offset in another cluster's L1 memory.
+ *
+ * This function takes a pointer to a variable in the calling (source)
+ * cluster's L1 memory and returns a pointer to the same offset in the target
+ * (destination) cluster's L1 memory.
+ *
+ * @param ptr Pointer to the variable in the source cluster's L1 memory.
+ * @param src_cluster_idx Index of the source cluster.
+ * @param dst_cluster_idx Index of the destination cluster.
+ * @return Pointer to the same offset in the destination cluster's L1 memory.
+ */
 inline void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx,
                                 uint32_t dst_cluster_idx) {
     return (void *)((uintptr_t)ptr +
                     (dst_cluster_idx - src_cluster_idx) * SNRT_CLUSTER_OFFSET);
 }
 
+/**
+ * @brief Initialize the L1 allocator.
+ *
+ * This function initializes the L1 allocator by calculating the end address
+ * of the heap and setting the base, end, and next pointers of the allocator.
+ *
+ * @note This function should be called before using any of the allocation
+ *       functions.
+ */
 inline void snrt_alloc_init_v2() {
     // Calculate end address of the heap. The top of the TCDM address space is
     // reserved for the cluster-local storage (CLS) and the stack of every
diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h
index a4b82a7d3..363805346 100644
--- a/sw/snRuntime/src/dma.h
+++ b/sw/snRuntime/src/dma.h
@@ -2,6 +2,11 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+/**
+ * @file
+ * @brief This file provides functions to program the Snitch DMA.
+ */
+
 #pragma once
 
 #define OP_CUSTOM1 0b0101011
@@ -22,6 +27,13 @@
 /// A DMA transfer identifier.
 typedef uint32_t snrt_dma_txid_t;
 
+/**
+ * @brief Start an asynchronous 1D DMA transfer with 64-bit wide pointers.
+ * @param dst The destination address.
+ * @param src The source address.
+ * @param size The size of the transfer in bytes.
+ * @return The DMA transfer ID.
+ */
 inline uint32_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src,
                                           size_t size) {
     register uint32_t reg_dst_low asm("a0") = dst >> 0;    // 10
@@ -51,13 +63,30 @@ inline uint32_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src,
     return reg_txid;
 }
 
-/// Initiate an asynchronous 1D DMA transfer.
+/**
+ * @brief Start an asynchronous 1D DMA transfer with native-size pointers.
+ * @param dst The destination pointer.
+ * @param src The source pointer.
+ * @param size The size of the transfer in bytes.
+ * @return The DMA transfer ID.
+ */
 inline snrt_dma_txid_t snrt_dma_start_1d(void *dst, const void *src,
                                          size_t size) {
     return snrt_dma_start_1d_wideptr((size_t)dst, (size_t)src, size);
 }
 
-/// Initiate an asynchronous 2D DMA transfer with wide 64-bit pointers.
+/**
+ * @brief Start an asynchronous 2D DMA transfer with 64-bit wide pointers.
+ * @param dst The destination address.
+ * @param src The source address.
+ * @param size The size of every 1D transfer within the 2D transfer in bytes.
+ * @param dst_stride The offset between consecutive 1D transfers at the
+ *                   destination, in bytes.
+ * @param src_stride The offset between consecutive 1D transfers at the
+ *                   source, in bytes.
+ * @param repeat The number of 1D transfers composing the 2D transfer.
+ * @return The DMA transfer ID.
+ */
 inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src,
                                                  size_t size, size_t dst_stride,
                                                  size_t src_stride,
@@ -102,7 +131,18 @@ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src,
     return reg_txid;
 }
 
-/// Initiate an asynchronous 2D DMA transfer.
+/**
+ * @brief Start an asynchronous 2D DMA transfer with native-size pointers.
+ * @param dst The destination pointer.
+ * @param src The source pointer.
+ * @param size The size of every 1D transfer within the 2D transfer in bytes.
+ * @param dst_stride The offset between consecutive 1D transfers at the
+ *                   destination, in bytes.
+ * @param src_stride The offset between consecutive 1D transfers at the
+ *                   source, in bytes.
+ * @param repeat The number of 1D transfers composing the 2D transfer.
+ * @return The DMA transfer ID.
+ */
 inline snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src,
                                          size_t size, size_t dst_stride,
                                          size_t src_stride, size_t repeat) {
@@ -110,8 +150,15 @@ inline snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src,
                                      src_stride, repeat);
 }
 
-/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers and a
-/// specific channel.
+/**
+ * @brief Start an asynchronous 1D DMA transfer with 64-bit wide pointers on a
+ *        specific channel.
+ * @param dst The destination address.
+ * @param src The source address.
+ * @param size The size of the transfer in bytes.
+ * @param channel The index of the channel.
+ * @return The DMA transfer ID.
+ */
 inline snrt_dma_txid_t snrt_dma_start_1d_channel_wideptr(uint64_t dst,
                                                          uint64_t src,
                                                          size_t size,
@@ -144,7 +191,15 @@ inline snrt_dma_txid_t snrt_dma_start_1d_channel_wideptr(uint64_t dst,
     return reg_txid;
 }
 
-/// Initiate an asynchronous 1D DMA transfer and a specific channel.
+/**
+ * @brief Start an asynchronous 1D DMA transfer with native-size pointers on a
+ *        specific channel.
+ * @param dst The destination pointer.
+ * @param src The source pointer.
+ * @param size The size of the transfer in bytes.
+ * @param channel The index of the channel.
+ * @return The DMA transfer ID.
+ */
 inline snrt_dma_txid_t snrt_dma_start_1d_channel(void *dst, const void *src,
                                                  size_t size,
                                                  uint32_t channel) {
@@ -152,8 +207,20 @@ inline snrt_dma_txid_t snrt_dma_start_1d_channel(void *dst, const void *src,
                                              channel);
 }
 
-/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers and a
-/// specific channel.
+/**
+ * @brief Start an asynchronous 2D DMA transfer with 64-bit wide pointers on a
+ *        specific channel.
+ * @param dst The destination address.
+ * @param src The source address.
+ * @param size The size of every 1D transfer within the 2D transfer in bytes.
+ * @param dst_stride The offset between consecutive 1D transfers at the
+ *                   destination, in bytes.
+ * @param src_stride The offset between consecutive 1D transfers at the
+ *                   source, in bytes.
+ * @param repeat The number of 1D transfers composing the 2D transfer.
+ * @param channel The index of the channel.
+ * @return The DMA transfer ID.
+ */
 inline snrt_dma_txid_t snrt_dma_start_2d_channel_wideptr(
     uint64_t dst, uint64_t src, size_t size, size_t dst_stride,
     size_t src_stride, size_t repeat, uint32_t channel) {
@@ -198,7 +265,20 @@ inline snrt_dma_txid_t snrt_dma_start_2d_channel_wideptr(
     return reg_txid;
 }
 
-/// Initiate an asynchronous 2D DMA transfer and a specific channel.
+/**
+ * @brief Start an asynchronous 2D DMA transfer with native-size pointers on a
+ *        specific channel.
+ * @param dst The destination pointer.
+ * @param src The source pointer.
+ * @param size The size of every 1D transfer within the 2D transfer in bytes.
+ * @param dst_stride The offset between consecutive 1D transfers at the
+ *                   destination, in bytes.
+ * @param src_stride The offset between consecutive 1D transfers at the
+ *                   source, in bytes.
+ * @param repeat The number of 1D transfers composing the 2D transfer.
+ * @param channel The index of the channel.
+ * @return The DMA transfer ID.
+ */
 inline snrt_dma_txid_t snrt_dma_start_2d_channel(void *dst, const void *src,
                                                  size_t size, size_t dst_stride,
                                                  size_t src_stride,
@@ -209,7 +289,10 @@ inline snrt_dma_txid_t snrt_dma_start_2d_channel(void *dst, const void *src,
                                              channel);
 }
 
-/// Block until a transfer finishes.
+/**
+ * @brief Block until a DMA transfer finishes.
+ * @param tid The DMA transfer ID.
+ */
 inline void snrt_dma_wait(snrt_dma_txid_t tid) {
     // dmstati t0, 0  # 0=status.completed_id
     asm volatile(
@@ -221,7 +304,10 @@ inline void snrt_dma_wait(snrt_dma_txid_t tid) {
         : "t0");
 }
 
-/// Block until a transfer finishes on a specific channel.
+/**
+ * @brief Block until a DMA transfer finishes on a specific channel.
+ * @param tid The DMA transfer ID.
+ */
 inline void snrt_dma_wait_channel(snrt_dma_txid_t tid, uint32_t channel) {
     // dmstati t0, 0  # 0=status.completed_id
     register uint32_t cfg asm("t1") = channel << 2;
@@ -235,7 +321,9 @@ inline void snrt_dma_wait_channel(snrt_dma_txid_t tid, uint32_t channel) {
         : "t0");
 }
 
-/// Block until all operation on the DMA ceases.
+/**
+ * @brief Block until all DMA operations cease.
+ */
 inline void snrt_dma_wait_all() {
     // dmstati t0, 2  # 2=status.busy
     asm volatile(
@@ -246,7 +334,10 @@ inline void snrt_dma_wait_all() {
         : "t0");
 }
 
-/// Block until all operation on the DMA ceases on a specific channel.
+/**
+ * @brief Block until a specific DMA channel is idle.
+ * @param channel The index of the channel.
+ */
 inline void snrt_dma_wait_all_channel(uint32_t channel) {
     register uint32_t tmp;
     // dmstati t0, 2  # 2=status.busy
@@ -260,7 +351,10 @@ inline void snrt_dma_wait_all_channel(uint32_t channel) {
         : "t0");
 }
 
-/// Wait until all channels are idle
+/**
+ * @brief Block until the first @p num_channels channels are idle.
+ * @param num_channels The number of channels to wait on.
+ */
 inline void snrt_dma_wait_all_channels(uint32_t num_channels) {
     register uint32_t tmp;
     // dmstati t0, 2  # 2=status.busy
@@ -270,10 +364,10 @@ inline void snrt_dma_wait_all_channels(uint32_t num_channels) {
 }
 
 /**
- * @brief start tracking of dma performance region. Does not have any
+ * @brief Start tracking of dma performance region. Does not have any
  * implications on the HW. Only injects a marker in the DMA traces that can be
- * analyzed
- *
+ * analyzed.
+ * @deprecated
  */
 inline void snrt_dma_start_tracking() {
     // dmstati zero, 0
@@ -282,10 +376,10 @@ inline void snrt_dma_start_tracking() {
 }
 
 /**
- * @brief stop tracking of dma performance region. Does not have any
+ * @brief Stop tracking of dma performance region. Does not have any
  * implications on the HW. Only injects a marker in the DMA traces that can be
- * analyzed
- *
+ * analyzed.
+ * @deprecated
  */
 inline void snrt_dma_stop_tracking() {
     asm volatile(".word %0\n" ::"i"(
@@ -293,11 +387,10 @@ inline void snrt_dma_stop_tracking() {
 }
 
 /**
- * @brief fast memset function performed by DMA
- *
- * @param ptr pointer to the start of the region
- * @param value value to set
- * @param len number of bytes, must be multiple of DMA bus-width
+ * @brief Fast memset function performed by DMA.
+ * @param ptr Pointer to the start of the region.
+ * @param value Value to set.
+ * @param len Number of bytes, must be a multiple of the DMA bus width.
  */
 inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) {
     // set first 64bytes to value
@@ -314,9 +407,14 @@ inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) {
     snrt_dma_wait_all();
 }
 
-/// Load a 1D-tile of size tile_size from a 1D array. The specific tile is
-/// selected by tile_idx. Every element in the src and dst arrays has prec
-/// bytes.
+/**
+ * @brief Load a tile of a 1D array.
+ * @param dst Pointer to the tile destination.
+ * @param src Pointer to the source array.
+ * @param tile_idx Index of the tile in the 1D array.
+ * @param tile_size Number of elements within a tile of the 1D array.
+ * @param prec Number of bytes of each element in the 1D array.
+ */
 inline snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src,
                                              size_t tile_idx, size_t tile_size,
                                              uint32_t prec) {
@@ -324,9 +422,14 @@ inline snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src,
     return snrt_dma_start_1d(dst, src + tile_idx * tile_nbytes, tile_nbytes);
 }
 
-/// Store a 1D-tile of size tile_size to a 1D array. The specific tile is
-/// selected by tile_idx. Every element in the src and dst arrays has prec
-/// bytes.
+/**
+ * @brief Store a tile to a 1D array.
+ * @param dst Pointer to the destination array.
+ * @param src Pointer to the source tile.
+ * @param tile_idx Index of the tile in the 1D array.
+ * @param tile_size Number of elements within a tile of the 1D array.
+ * @param prec Number of bytes of each element in the 1D array.
+ */
 inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src,
                                               size_t tile_idx, size_t tile_size,
                                               uint32_t prec) {
@@ -334,10 +437,20 @@ inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src,
     return snrt_dma_start_1d(dst + tile_idx * tile_nbytes, src, tile_nbytes);
 }
 
-/// Load a 2D-tile of shape (tile_x1_size, tile_x0_size) from the 2D array
-/// of shape (full_x1_size, full_x0_size). The specific tile is selected
-/// by the (tile_x1_idx, tile_x0_idx) tuple. Every element in the src and
-/// destination arrays has prec bytes.
+/**
+ * @brief Load a 2D tile of a 2D array.
+ * @param dst Pointer to the tile destination.
+ * @param src Pointer to the source array.
+ * @param tile_x1_idx Outermost coordinate of the tile in the 2D array.
+ * @param tile_x0_idx Innermost coordinate of the tile in the 2D array.
+ * @param tile_x1_size Number of elements in the outermost dimension of the
+ *                     tile.
+ * @param tile_x0_size Number of elements in the innermost dimension of the
+ *                     tile.
+ * @param full_x0_size Number of elements in the innermost dimension of the
+ *                     array.
+ * @param prec Number of bytes of each element in the 2D array.
+ */
 inline snrt_dma_txid_t snrt_dma_load_2d_tile(
     void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
     size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
@@ -357,10 +470,20 @@ inline snrt_dma_txid_t snrt_dma_load_2d_tile(
     );
 }
 
-/// Store a 2D-tile of shape (tile_x1_size, tile_x0_size) to the 2D array
-/// of shape (full_x1_size, full_x0_size). The specific tile is selected
-/// by the (tile_x1_idx, tile_x0_idx) tuple. Every element in the src and
-/// destination arrays has prec bytes.
+/**
+ * @brief Store a 2D tile to a 2D array.
+ * @param dst Pointer to the destination array.
+ * @param src Pointer to the source tile.
+ * @param tile_x1_idx Outermost coordinate of the tile in the 2D array.
+ * @param tile_x0_idx Innermost coordinate of the tile in the 2D array.
+ * @param tile_x1_size Number of elements in the outermost dimension of the
+ *                     tile.
+ * @param tile_x0_size Number of elements in the innermost dimension of the
+ *                     tile.
+ * @param full_x0_size Number of elements in the innermost dimension of the
+ *                     array.
+ * @param prec Number of bytes of each element in the 2D array.
+ */
 inline snrt_dma_txid_t snrt_dma_store_2d_tile(
     void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
     size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
diff --git a/sw/snRuntime/src/ssr.h b/sw/snRuntime/src/ssr.h
index 1a067fea5..d8858baa4 100644
--- a/sw/snRuntime/src/ssr.h
+++ b/sw/snRuntime/src/ssr.h
@@ -2,9 +2,27 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+/**
+ * @file
+ * @brief This file contains functions to conveniently program Snitch's SSRs.
+ *
+ * An SSR stream can be configured to replace a store (or load) sequence as
+ * could be generated by an N-dimensional affine loop nest:
+ * @code{.c}
+ * for (int i = 0; i < b1; i++)
+ *     for (int j = 0; j < b0; j++)
+ *         array[i * s1 + j * s0] = 0;
+ * @endcode
+ *
+ * The configuration functions provided in this file reflect the parameters
+ * one would define to set up such a loop nest.
+ */
+
 #pragma once
 
-/// Synchronize the integer and float pipelines.
+/**
+ * @brief Synchronize the integer and float pipelines.
+ */
 inline void snrt_fpu_fence() {
     unsigned tmp;
     asm volatile(
@@ -13,34 +31,41 @@ inline void snrt_fpu_fence() {
         : "+r"(tmp)::"memory");
 }
 
-/// The different SSR data movers.
+/**
+ * @brief The different SSRs.
+ */
 enum snrt_ssr_dm {
-    SNRT_SSR_DM0 = 0,
-    SNRT_SSR_DM1 = 1,
-    SNRT_SSR_DM2 = 2,
-    // To write to all SSRs, use index 31
-    SNRT_SSR_DM_ALL = 31,
+    SNRT_SSR_DM0 = 0,    /**< SSR data mover 0 */
+    SNRT_SSR_DM1 = 1,    /**< SSR data mover 1 */
+    SNRT_SSR_DM2 = 2,    /**< SSR data mover 2 */
+    SNRT_SSR_DM_ALL = 31 /**< Write to all SSRs */
 };
 
-/// The different dimensions.
+/**
+ * @brief The different dimensions.
+ */
 enum snrt_ssr_dim {
-    SNRT_SSR_1D = 0,
-    SNRT_SSR_2D = 1,
-    SNRT_SSR_3D = 2,
-    SNRT_SSR_4D = 3,
+    SNRT_SSR_1D = 0, /**< 1D stream */
+    SNRT_SSR_2D = 1, /**< 2D stream */
+    SNRT_SSR_3D = 2, /**< 3D stream */
+    SNRT_SSR_4D = 3  /**< 4D stream */
 };
 
-/// The SSR configuration registers.
+/**
+ * @brief The SSR configuration registers.
+ */
 enum {
-    REG_STATUS = 0,
-    REG_REPEAT = 1,
-    REG_BOUNDS = 2,   // + loop index
-    REG_STRIDES = 6,  // + loop index
-    REG_RPTR = 24,    // + snrt_ssr_dim
-    REG_WPTR = 28,    // + snrt_ssr_dim
+    REG_STATUS = 0,  /**< SSR status register */
+    REG_REPEAT = 1,  /**< SSR repeat register */
+    REG_BOUNDS = 2,  /**< SSR bounds register (+ loop index) */
+    REG_STRIDES = 6, /**< SSR strides register (+ loop index) */
+    REG_RPTR = 24,   /**< SSR read pointer register (+ snrt_ssr_dim) */
+    REG_WPTR = 28    /**< SSR write pointer register (+ snrt_ssr_dim) */
 };
 
-/// Enable SSR.
+/**
+ * @brief Enable all SSRs.
+ */
 inline void snrt_ssr_enable() {
 #ifdef __TOOLCHAIN_LLVM__
     __builtin_ssr_enable();
@@ -49,7 +74,9 @@ inline void snrt_ssr_enable() {
 #endif
 }
 
-/// Disable SSR.
+/**
+ * @brief Disable all SSRs.
+ */
 inline void snrt_ssr_disable() {
 #ifdef __TOOLCHAIN_LLVM__
     __builtin_ssr_disable();
@@ -58,6 +85,12 @@ inline void snrt_ssr_disable() {
 #endif
 }
 
+/**
+ * @brief Read the value of an SSR configuration register.
+ * @param reg The register index.
+ * @param dm The SSR index.
+ * @return The value of the register.
+ */
 inline uint32_t read_ssr_cfg(uint32_t reg, uint32_t dm) {
     uint32_t value;
     asm volatile("scfgri %[value], %[dm] | %[reg]<<5\n"
@@ -66,12 +99,23 @@ inline uint32_t read_ssr_cfg(uint32_t reg, uint32_t dm) {
     return value;
 }
 
+/**
+ * @brief Write a value to an SSR configuration register.
+ * @param reg The register index.
+ * @param dm The SSR index.
+ * @param value The value to write.
+ */
 inline void write_ssr_cfg(uint32_t reg, uint32_t dm, uint32_t value) {
     asm volatile("scfgwi %[value], %[dm] | %[reg]<<5\n" ::[value] "r"(value),
                  [ dm ] "i"(dm), [ reg ] "i"(reg));
 }
 
-// Configure an SSR data mover for a 1D loop nest.
+/**
+ * @brief Configure an SSR data mover for a 1D loop nest.
+ * @param dm The SSR index.
+ * @param b0 The bound of the loop.
+ * @param s0 The stride of the loop.
+ */
 inline void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) {
     --b0;
     write_ssr_cfg(REG_BOUNDS + 0, dm, b0);
@@ -80,7 +124,14 @@ inline void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) {
     a += s0 * b0;
 }
 
-// Configure an SSR data mover for a 2D loop nest.
+/**
+ * @brief Configure an SSR data mover for a 2D loop nest.
+ * @param dm The SSR index.
+ * @param b0 The bound of the first loop.
+ * @param b1 The bound of the second loop.
+ * @param s0 The stride of the first loop.
+ * @param s1 The stride of the second loop.
+ */
 inline void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1,
                              size_t s0, size_t s1) {
     --b0;
@@ -94,7 +145,16 @@ inline void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1,
     a += s1 * b1;
 }
 
-// Configure an SSR data mover for a 3D loop nest.
+/**
+ * @brief Configure an SSR data mover for a 3D loop nest.
+ * @param dm The SSR index.
+ * @param b0 The bound of the first loop.
+ * @param b1 The bound of the second loop.
+ * @param b2 The bound of the third loop.
+ * @param s0 The stride of the first loop.
+ * @param s1 The stride of the second loop.
+ * @param s2 The stride of the third loop.
+ */
 inline void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1,
                              size_t b2, size_t s0, size_t s1, size_t s2) {
     --b0;
@@ -112,10 +172,18 @@ inline void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1,
     a += s2 * b2;
 }
 
-// Configure an SSR data mover for a 4D loop nest.
-// b0: Inner-most bound (limit of loop)
-// b3: Outer-most bound (limit of loop)
-// s0: increment size of inner-most loop
+/**
+ * @brief Configure an SSR data mover for a 4D loop nest.
+ * @param dm The SSR index.
+ * @param b0 The bound of the first loop.
+ * @param b1 The bound of the second loop.
+ * @param b2 The bound of the third loop.
+ * @param b3 The bound of the fourth loop.
+ * @param s0 The stride of the first loop.
+ * @param s1 The stride of the second loop.
+ * @param s2 The stride of the third loop.
+ * @param s3 The stride of the fourth loop.
+ */
 inline void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1,
                              size_t b2, size_t b3, size_t s0, size_t s1,
                              size_t s2, size_t s3) {
@@ -138,18 +206,32 @@ inline void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1,
     a += s3 * b3;
 }
 
-/// Configure the repetition count for a stream.
+/**
+ * @brief Configure the repetition count for a stream.
+ * @param dm The SSR index.
+ * @param count The repetition count.
+ */
 inline void snrt_ssr_repeat(enum snrt_ssr_dm dm, size_t count) {
     write_ssr_cfg(REG_REPEAT, dm, count - 1);
 }
 
-/// Start a streaming read.
+/**
+ * @brief Start a streaming read.
+ * @param dm The SSR index.
+ * @param dim The number of dimensions to use.
+ * @param ptr The pointer to the data.
+ */
 inline void snrt_ssr_read(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim,
                           volatile void *ptr) {
     write_ssr_cfg(REG_RPTR + dim, dm, (uintptr_t)ptr);
 }
 
-/// Start a streaming write.
+/**
+ * @brief Start a streaming write.
+ * @param dm The SSR index.
+ * @param dim The number of dimensions to use.
+ * @param ptr The pointer to the data.
+ */
 inline void snrt_ssr_write(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim,
                            volatile void *ptr) {
     write_ssr_cfg(REG_WPTR + dim, dm, (uintptr_t)ptr);
diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h
index fa4b75b24..add26fa08 100644
--- a/sw/snRuntime/src/sync.h
+++ b/sw/snRuntime/src/sync.h
@@ -5,6 +5,11 @@
 // Luca Colagrande <colluca@iis.ee.ethz.ch>
 // Viviane Potocnik <vivianep@iis.ee.ethz.ch>
 
+/**
+ * @file
+ * @brief This file provides functions to synchronize Snitch cores.
+ */
+
 #pragma once
 
 #include <math.h>
@@ -13,11 +18,18 @@
 // Mutex functions
 //================================================================================
 
+/**
+ * @brief Get a pointer to a mutex variable.
+ */
 inline volatile uint32_t *snrt_mutex() { return &_snrt_mutex; }
 
 /**
- * @brief lock a mutex, blocking
- * @details declare mutex with `static volatile uint32_t mtx = 0;`
+ * @brief Acquire a mutex, blocking.
+ * @details Test-and-set (TAS) implementation of a lock.
+ * @param pmtx A pointer to a variable which can be used as a mutex, i.e. one
+ *             which all cores can reference and which resides at a memory
+ *             location supporting atomic accesses. It can be declared e.g. as
+ *             `static volatile uint32_t mtx = 0;`.
  */
 inline void snrt_mutex_acquire(volatile uint32_t *pmtx) {
     asm volatile(
@@ -31,9 +43,9 @@ inline void snrt_mutex_acquire(volatile uint32_t *pmtx) {
 }
 
 /**
- * @brief lock a mutex, blocking
- * @details test and test-and-set (ttas) implementation of a lock.
- *          Declare mutex with `static volatile uint32_t mtx = 0;`
+ * @brief Acquire a mutex, blocking.
+ * @details Same as @ref snrt_mutex_acquire but acquires the lock using a test
+ *          and test-and-set (TTAS) strategy.
  */
 inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) {
     asm volatile(
@@ -50,7 +62,7 @@ inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) {
 }
 
 /**
- * @brief Release the mutex
+ * @brief Release a previously acquired mutex.
  */
 inline void snrt_mutex_release(volatile uint32_t *pmtx) {
     asm volatile("amoswap.w.rl  x0,x0,(%0)   # Release lock by storing 0\n"
@@ -61,13 +73,21 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx) {
 // Barrier functions
 //================================================================================
 
-/// Synchronize cores in a cluster with a hardware barrier
+/**
+ * @brief Synchronize cores in a cluster with a hardware barrier, blocking.
+ * @note Synchronizes all (both DM and compute) cores. All cores must invoke
+ *       this function, or the calling cores will stall indefinitely.
+ */
 inline void snrt_cluster_hw_barrier() {
     asm volatile("csrr x0, 0x7C2" ::: "memory");
 }
 
-// Synchronizes one core from every cluster with the others.
-// One core per cluster is expected to invoke this function.
+/**
+ * @brief Synchronize one core from every cluster with the others.
+ * @details Implemented as a software barrier.
+ * @note One core per cluster must invoke this function, or the calling cores
+ *       will stall indefinitely.
+ */
 inline void snrt_inter_cluster_barrier() {
     // Remember previous iteration
     uint32_t prev_barrier_iteration = _snrt_barrier.iteration;
@@ -84,7 +104,15 @@ inline void snrt_inter_cluster_barrier() {
     }
 }
 
-/// Synchronize clusters globally with a global software barrier
+/**
+ * @brief Synchronize all Snitch cores.
+ * @details Synchronization is performed hierarchically. Within a cluster,
+ *          cores are synchronized through a hardware barrier (see
+ *          @ref snrt_cluster_hw_barrier). Clusters are synchronized through
+ *          a software barrier (see @ref snrt_inter_cluster_barrier).
+ * @note Every Snitch core must invoke this function, or the calling cores
+ *       will stall indefinitely.
+ */
 inline void snrt_global_barrier() {
     snrt_cluster_hw_barrier();
 
@@ -96,17 +124,12 @@ inline void snrt_global_barrier() {
     snrt_cluster_hw_barrier();
 }
 
-inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) {
-    __atomic_add_fetch(&_reduction_result, value, __ATOMIC_RELAXED);
-    snrt_global_barrier();
-    return _reduction_result;
-}
-
 /**
- * @brief Generic barrier
- *
- * @param barr pointer to a barrier
- * @param n number of harts that have to enter before released
+ * @brief Generic software barrier.
+ * @param barr Pointer to a barrier variable.
+ * @param n Number of harts that must enter before the barrier is released.
+ * @note Exactly the specified number of harts must invoke this function, or
+ *       the calling cores will stall indefinitely.
  */
 inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) {
     // Remember previous iteration
@@ -128,8 +151,37 @@ inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) {
 // Reduction functions
 //================================================================================
 
-// Assumes the dst and src buffers are at the same offset in the TCDM of every
-// cluster
+/**
+ * @brief Perform a global sum reduction, blocking.
+ * @details All cores participate in the reduction and synchronize globally
+ *          to wait for the reduction to complete.
+ *          The synchronization is performed via @ref snrt_global_barrier.
+ * @param value The value to be summed.
+ * @return The result of the sum reduction.
+ * @note Every Snitch core must invoke this function, or the calling cores
+ *       will stall indefinitely.
+ */
+inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) {
+    __atomic_add_fetch(&_reduction_result, value, __ATOMIC_RELAXED);
+    snrt_global_barrier();
+    return _reduction_result;
+}
+
+/**
+ * @brief Perform a sum reduction among clusters, blocking.
+ * @details The reduction is performed in a logarithmic fashion. Half of the
+ *          clusters active in every level of the binary tree participate
+ *          as senders, the other half as receivers. Senders use the DMA to
+ *          send their data to the respective receiver's destination buffer.
+ *          The receiver then reduces each element in its destination buffer
+ *          with the respective element in its source buffer. It then proceeds
+ *          to the next level in the binary tree.
+ * @param dst_buffer The pointer to the calling cluster's destination buffer.
+ * @param src_buffer The pointer to the calling cluster's source buffer.
+ * @param len The amount of data in each buffer.
+ * @note The destination buffers must lie at the same offset in every cluster's
+ *       TCDM.
+ */
 inline void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer,
                                       size_t len) {
     // If we have a single cluster the reduction degenerates to a memcpy
diff --git a/sw/snRuntime/src/team.h b/sw/snRuntime/src/team.h
index eb06a4488..560fff7cd 100644
--- a/sw/snRuntime/src/team.h
+++ b/sw/snRuntime/src/team.h
@@ -2,67 +2,153 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+/**
+ * @file
+ * @brief This file contains functions and macros related to Snitch team
+ * management.
+ *
+ * The functions in this file provide information about the Snitch hardware
+ * configuration, such as the number of clusters, cores per cluster, and the
+ * current core's index within the system. These functions can be used for team
+ * management and core-specific operations.
+ */
+
 #pragma once
 
+/**
+ * @brief Get the RISC-V hardware thread ID (hartid).
+ *
+ * @return The hardware thread ID.
+ */
 inline uint32_t __attribute__((const)) snrt_hartid() {
     uint32_t hartid;
     asm("csrr %0, mhartid" : "=r"(hartid));
     return hartid;
 }
 
+/**
+ * @brief Get the number of Snitch clusters in the system.
+ *
+ * @return The number of clusters.
+ */
 inline uint32_t __attribute__((const)) snrt_cluster_num() {
     return SNRT_CLUSTER_NUM;
 }
 
+/**
+ * @brief Get the number of cores per cluster.
+ *
+ * @return The number of cores per cluster.
+ */
 inline uint32_t __attribute__((const)) snrt_cluster_core_num() {
     return SNRT_CLUSTER_CORE_NUM;
 }
 
+/**
+ * @brief Get the hartid of the first Snitch core in the system.
+ *
+ * @return The hartid of the first Snitch core in the system.
+ */
 inline uint32_t __attribute__((const)) snrt_global_core_base_hartid() {
     return SNRT_BASE_HARTID;
 }
 
+/**
+ * @brief Get the total number of Snitch cores in the system.
+ *
+ * @return The total number of cores.
+ */
 inline uint32_t __attribute__((const)) snrt_global_core_num() {
     return snrt_cluster_num() * snrt_cluster_core_num();
 }
 
+/**
+ * @brief Get the total number of Snitch compute cores in the system.
+ *
+ * @return The total number of compute cores.
+ */
 inline uint32_t __attribute__((const)) snrt_global_compute_core_num() {
     return snrt_cluster_num() * snrt_cluster_compute_core_num();
 }
 
+/**
+ * @brief Get the index (!= hartid) of the current Snitch core in the system.
+ *
+ * @return The index of the current Snitch core.
+ */
 inline uint32_t __attribute__((const)) snrt_global_core_idx() {
     return snrt_hartid() - snrt_global_core_base_hartid();
 }
 
+/**
+ * @brief Get the index of the current Snitch compute core in the system.
+ *
+ * @return The index of the current Snitch compute core.
+ */
 inline uint32_t __attribute__((const)) snrt_global_compute_core_idx() {
     return snrt_cluster_idx() * snrt_cluster_compute_core_num() +
            snrt_cluster_core_idx();
 }
 
+/**
+ * @brief Get the index of the current Snitch cluster in the system.
+ *
+ * @return The index of the current cluster.
+ */
 inline uint32_t __attribute__((const)) snrt_cluster_idx() {
     return snrt_global_core_idx() / snrt_cluster_core_num();
 }
 
+/**
+ * @brief Get the index of the current Snitch core within the cluster.
+ *
+ * @return The index of the current core within the cluster.
+ */
 inline uint32_t __attribute__((const)) snrt_cluster_core_idx() {
     return snrt_global_core_idx() % snrt_cluster_core_num();
 }
 
+/**
+ * @brief Get the number of data mover (DM) cores per cluster.
+ *
+ * @return The number of DM cores per cluster.
+ */
 inline uint32_t __attribute__((const)) snrt_cluster_dm_core_num() {
     return SNRT_CLUSTER_DM_CORE_NUM;
 }
 
+/**
+ * @brief Get the number of compute cores per cluster.
+ *
+ * @return The number of compute cores per cluster.
+ */
 inline uint32_t __attribute__((const)) snrt_cluster_compute_core_num() {
     return snrt_cluster_core_num() - snrt_cluster_dm_core_num();
 }
 
+/**
+ * @brief Check if the current core is a compute core.
+ *
+ * @return True if the current core is a compute core, false otherwise.
+ */
 inline int __attribute__((const)) snrt_is_compute_core() {
     return snrt_cluster_core_idx() < snrt_cluster_compute_core_num();
 }
 
+/**
+ * @brief Check if the current core is the last compute core in the cluster.
+ *
+ * @return True if the current core is the last compute core, false otherwise.
+ */
 inline int __attribute__((const)) snrt_cluster_is_last_compute_core() {
     return snrt_cluster_core_idx() == (snrt_cluster_compute_core_num() - 1);
 }
 
+/**
+ * @brief Check if the current core is a data mover (DM) core.
+ *
+ * @return True if the current core is a DM core, false otherwise.
+ */
 inline int __attribute__((const)) snrt_is_dm_core() {
     return !snrt_is_compute_core();
 }
diff --git a/util/container/Dockerfile b/util/container/Dockerfile
index f6e98f713..9cdc7d9aa 100644
--- a/util/container/Dockerfile
+++ b/util/container/Dockerfile
@@ -11,6 +11,7 @@ ARG PYTHON_VERSION=3.9.12
 ARG BENDER_VERSION=0.27.1
 ARG SPIKE_DASM_VERSION=0.1.0
 ARG VERILATOR_VERSION=5.006
+ARG DOXYGEN_VERSION=1.12.0
 # Run dpkg without interactive dialogue
 ARG DEBIAN_FRONTEND=noninteractive
 
@@ -90,6 +91,10 @@ RUN tar xzf bender-${BENDER_VERSION}-x86_64-linux-gnu-ubuntu18.04.tar.gz
 RUN wget https://github.com/pulp-platform/riscv-isa-sim/releases/download/snitch-v${SPIKE_DASM_VERSION}/snitch-spike-dasm-${SPIKE_DASM_VERSION}-x86_64-linux-gnu-ubuntu18.04.tar.gz
 RUN tar xzf snitch-spike-dasm-${SPIKE_DASM_VERSION}-x86_64-linux-gnu-ubuntu18.04.tar.gz
 
+# Install Doxygen
+RUN wget https://www.doxygen.nl/files/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz
+RUN tar xzf doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz
+
 # 2. Stage
 FROM ubuntu:22.04 AS snitch_cluster
 ARG SNITCH_LLVM_VERSION=latest
@@ -149,6 +154,7 @@ COPY --from=builder /tools/spike-dasm bin/
 COPY --from=builder /root/.cargo/bin/banshee bin/
 COPY --from=builder /opt/python /opt/python
 COPY --from=builder /tools/verilator /tools/verilator/
+COPY --from=builder /tools/doxygen-${DOXYGEN_VERSION}/bin/doxygen bin/
 
 # Create and activate virtual environment
 ENV VIRTUAL_ENV "/root/.venvs/snitch_cluster"