From d024b04d22d026f158549b3e8c1c56544b42a64d Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 28 Aug 2024 16:08:03 +0200 Subject: [PATCH] treewide: Add Doxygen-generated docs (#189) * docs: Set up Doxygen documentation * snRuntime: Document fundamental modules * ci: Install Doxygen in container and use for doc deployment --- .github/workflows/publish-docs.yml | 33 ++--- .gitignore | 1 + Makefile | 19 ++- docs/.gitignore | 1 - docs/Doxyfile | 11 ++ docs/doxybook2.json | 13 -- docs/rm/snRuntime.md | 1 + mkdocs.yml | 8 +- sw/snRuntime/src/alloc_v2.h | 102 ++++++++++++--- sw/snRuntime/src/dma.h | 201 +++++++++++++++++++++++------ sw/snRuntime/src/ssr.h | 144 ++++++++++++++++----- sw/snRuntime/src/sync.h | 96 ++++++++++---- sw/snRuntime/src/team.h | 86 ++++++++++++ util/container/Dockerfile | 6 + 14 files changed, 566 insertions(+), 156 deletions(-) delete mode 100644 docs/.gitignore create mode 100644 docs/Doxyfile delete mode 100644 docs/doxybook2.json create mode 100644 docs/rm/snRuntime.md diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index e98139f07..23a53bf4a 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -10,32 +10,19 @@ jobs: deploy: name: Deploy documentation runs-on: ubuntu-22.04 + container: + image: ghcr.io/pulp-platform/snitch_cluster:main steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v4 - with: - python-version: "3.11" - - uses: mattnotmitt/doxygen-action@v1 - with: - working-directory: sw/ - - name: bender install - uses: pulp-platform/pulp-actions/bender-install@v2 - with: - version: 0.27.1 - - name: Install Python requirements - run: pip install -r python-requirements.txt - - name: Generate runtime documentation - # yamllint disable rule:line-length + # For some reason, the checkout is done by a different user, + # than that deploying to Github (root, possibly due to Docker). + # So we need to set the repository as a safe directory. 
+ - name: Git config safe.directory run: | - mkdir doxybook2; cd doxybook2 - wget https://github.com/matusnovak/doxybook2/releases/download/v1.4.0/doxybook2-linux-amd64-v1.4.0.zip - unzip doxybook2-linux-amd64-v1.4.0.zip; cd ../ - chmod +x doxybook2/bin/doxybook2 - mkdir docs/runtime - ./doxybook2/bin/doxybook2 --input sw/doxygen/xml --output docs/runtime --config docs/doxybook2.json - rm -rf doxybook2 - # yamllint enable rule:line-length + git config --global --add safe.directory $GITHUB_WORKSPACE - name: Generate documentation sources - run: make doc-srcs + run: | + make doc-srcs + make doxygen-docs - name: Build and deploy documentation run: mkdocs gh-deploy --force diff --git a/.gitignore b/.gitignore index 5f7019e12..320ef1947 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ gmon.out # Docs /site/ /docs/generated/ +/docs/doxygen/ # Installation directories /.venv/ diff --git a/Makefile b/Makefile index dcd0d6ef1..498c7d121 100644 --- a/Makefile +++ b/Makefile @@ -49,23 +49,36 @@ clean-nonfree: # Docs # ######## -GENERATED_DOCS_DIR = docs/generated +DOCS_DIR = docs + +GENERATED_DOCS_DIR = $(DOCS_DIR)/generated GENERATED_DOC_SRCS = $(GENERATED_DOCS_DIR)/peripherals.md +DOXYGEN_DOCS_DIR = $(DOCS_DIR)/doxygen +DOXYGEN_INPUTS = $(DOCS_DIR)/rm/snRuntime.md +DOXYGEN_INPUTS += $(shell find sw/snRuntime -name '*.c' -o -name '*.h') +DOXYFILE = $(DOCS_DIR)/Doxyfile + all: docs clean: clean-docs -.PHONY: doc-srcs docs clean-docs +.PHONY: doc-srcs doxygen-docs docs clean-docs doc-srcs: $(GENERATED_DOC_SRCS) -docs: doc-srcs +doxygen-docs: $(DOXYGEN_DOCS_DIR) + +docs: doc-srcs doxygen-docs mkdocs build clean-docs: rm -rf $(GENERATED_DOCS_DIR) + rm -rf $(DOXYGEN_DOCS_DIR) $(GENERATED_DOCS_DIR): mkdir -p $@ $(GENERATED_DOCS_DIR)/peripherals.md: hw/snitch_cluster/src/snitch_cluster_peripheral/snitch_cluster_peripheral_reg.hjson | $(GENERATED_DOCS_DIR) $(REGGEN) -d $< > $@ + +$(DOXYGEN_DOCS_DIR): $(DOXYFILE) $(DOXYGEN_INPUTS) + doxygen $< diff --git a/docs/.gitignore 
b/docs/.gitignore deleted file mode 100644 index e4d05d91a..000000000 --- a/docs/.gitignore +++ /dev/null @@ -1 +0,0 @@ -runtime diff --git a/docs/Doxyfile b/docs/Doxyfile new file mode 100644 index 000000000..f9d60136b --- /dev/null +++ b/docs/Doxyfile @@ -0,0 +1,11 @@ +PROJECT_NAME = "Snitch Runtime" +INPUT = docs/rm/snRuntime.md sw/snRuntime +RECURSIVE = YES +FILE_PATTERNS = *.h *.c +OUTPUT_DIRECTORY = docs/doxygen/ +USE_MDFILE_AS_MAINPAGE = docs/rm/snRuntime.md +GENERATE_LATEX = NO +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = YES +EXPAND_ONLY_PREDEF = YES +PREDEFINED = __attribute__(x)= \ No newline at end of file diff --git a/docs/doxybook2.json b/docs/doxybook2.json deleted file mode 100644 index 28d69396e..000000000 --- a/docs/doxybook2.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "baseUrl": "/snitch_cluster/runtime/", - "indexInFolders": true, - "linkSuffix": "/", - "indexClassesName": "index", - "indexFilesName": "index", - "indexGroupsName": "index", - "indexNamespacesName": "index", - "indexRelatedPagesName": "index", - "indexExamplesName": "index", - "mainPageInRoot": true, - "mainPageName": "index" -} diff --git a/docs/rm/snRuntime.md b/docs/rm/snRuntime.md new file mode 100644 index 000000000..42eb97ef2 --- /dev/null +++ b/docs/rm/snRuntime.md @@ -0,0 +1 @@ +These pages host the documentation for the Snitch runtime, a set of runtime and library functions to make writing parallel and efficient C code for Snitch easier. 
\ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 2ccad29a7..d13ad0eb1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -77,11 +77,5 @@ nav: - Snitch Target Utilities: - run.py: rm/snitch_target_utils/run.md - build.py: rm/snitch_target_utils/build.md - - Snitch Runtime: - - Pages: runtime/Pages/index.md - - Files: runtime/Files/index.md - - Classes: runtime/Classes/index.md - - Examples: runtime/Examples/index.md - - Modules: runtime/Modules/index.md - - Namespaces: runtime/Namespaces/index.md + - Snitch Runtime: doxygen/html/index.html - Publications: publications.md diff --git a/sw/snRuntime/src/alloc_v2.h b/sw/snRuntime/src/alloc_v2.h index 29ffb81e5..2ca18b802 100644 --- a/sw/snRuntime/src/alloc_v2.h +++ b/sw/snRuntime/src/alloc_v2.h @@ -2,29 +2,61 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief Defines functions to dynamically allocate the cluster's L1 memory. + * + * This file provides functions to dynamically allocate the cluster's L1 + * memory. It includes functions for allocating memory for cluster-local + * variables, compute core-local variables, and for manipulating pointers to + * variables allocated by different cores or clusters. + */ + extern __thread snrt_allocator_t l1_allocator_v2; +/** + * @brief Get a pointer to the L1 allocator. + * + * @return Pointer to the L1 allocator. + */ inline snrt_allocator_t *snrt_l1_allocator_v2() { return &l1_allocator_v2; } +/** + * @brief Get the next pointer of the L1 allocator. + * + * @return The next pointer of the L1 allocator. + */ inline void *snrt_l1_next_v2() { return (void *)snrt_l1_allocator_v2()->next; } /** - * @brief Override the L1 allocator next pointer + * @brief Override the L1 allocator next pointer. + * + * @param next The new value for the next pointer. 
*/ inline void snrt_l1_update_next_v2(void *next) { snrt_l1_allocator_v2()->next = (uint32_t)next; } -// Check that allocation doesn't exceed allocator bounds, and raise an -// exception otherwise +/** + * @brief Check if the allocation exceeds the allocator bounds and raise an + * exception if it does. + */ inline void snrt_l1_alloc_check_bounds() { if (snrt_l1_allocator_v2()->next > snrt_l1_allocator_v2()->end) asm volatile("ecall \n"); } -// Dynamically allocate space for a variable of size `size` in the cluster's L1 -// memory. This function should be invoked by every core in a cluster. Every -// core receives a pointer to the allocated variable. +/** + * @brief Allocate space for a variable in the cluster's L1 memory. + * + * This function dynamically allocates space for a variable of size `size` in + * the cluster's L1 memory. + * The allocation is aligned to the specified `alignment`. + * + * @param size The size of the variable to allocate. + * @param alignment The alignment of the allocation. + * @return Pointer to the allocated variable. + */ inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) { snrt_l1_allocator_v2()->next = ALIGN_UP(snrt_l1_allocator_v2()->next, alignment); @@ -34,11 +66,19 @@ inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) { return retval; } -// Dynamically allocate space for N variables of size `size` in the cluster's -// L1 memory, N being the number of compute cores in the cluster. This function -// should be invoked by every core in a cluster. Every compute core receives a -// pointer to a unique variable among the N which have been allocated. The -// return value for the DM core is undefined. +/** + * @brief Allocate space for N variables in the cluster's L1 memory. + * + * This function dynamically allocates space for N variables of size `size` in + * the cluster's L1 memory, where N is the number of compute cores in the + * cluster. 
The variables are allocated in a contiguous block of memory. + * The whole block is aligned to the specified `alignment`. + * + * @param size The size of each variable to allocate. + * @param alignment The alignment of the allocation. + * @return Pointer to the allocated variable for each compute core. + * The return value for the DM core is undefined. + */ inline void *snrt_l1_alloc_compute_core_local(size_t size, const size_t alignment) { snrt_l1_allocator_v2()->next = @@ -49,24 +89,52 @@ inline void *snrt_l1_alloc_compute_core_local(size_t size, return retval; } -// Takes a pointer to a variable allocated using -// `snrt_l1_alloc_compute_core_local` and returns a pointer to the same -// variable allocated by another core, as specified by `core_idx`. -// The `size` argument should be the same used during allocation. +/** + * @brief Get a pointer to the same variable allocated by another core. + * + * This function takes a pointer to a variable allocated using + * `snrt_l1_alloc_compute_core_local` and returns a pointer to the same + * variable allocated by another core, as specified by `core_idx`. + * The `size` argument should be the same used during allocation. + * + * @param ptr Pointer to the variable allocated by the current core. + * @param core_idx Index of the core that allocated the variable. + * @param size The size of the variable. + * @return Pointer to the same variable allocated by the specified core. + */ inline void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx, size_t size) { size_t offset = (core_idx - snrt_cluster_core_idx()) * size; return (void *)((uintptr_t)ptr + offset); } -// Takes a pointer to a variable in the source cluster's L1 memory and returns -// a pointer to the same offset in the destination cluster's L1 memory. +/** + * @brief Get a pointer to the same offset in another cluster's L1 memory. 
+ * + * This function takes a pointer to a variable in the calling (source) + * cluster's L1 memory and returns a pointer to the same offset in the target + * (destination) cluster's L1 memory. + * + * @param ptr Pointer to the variable in the source cluster's L1 memory. + * @param src_cluster_idx Index of the source cluster. + * @param dst_cluster_idx Index of the destination cluster. + * @return Pointer to the same offset in the destination cluster's L1 memory. + */ inline void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx, uint32_t dst_cluster_idx) { return (void *)((uintptr_t)ptr + (dst_cluster_idx - src_cluster_idx) * SNRT_CLUSTER_OFFSET); } +/** + * @brief Initialize the L1 allocator. + * + * This function initializes the L1 allocator by calculating the end address + * of the heap and setting the base, end, and next pointers of the allocator. + * + * @note This function should be called before using any of the allocation + * functions. + */ inline void snrt_alloc_init_v2() { // Calculate end address of the heap. The top of the TCDM address space is // reserved for the cluster-local storage (CLS) and the stack of every diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index a4b82a7d3..363805346 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -2,6 +2,11 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief This file provides functions to program the Snitch DMA. + */ + #pragma once #define OP_CUSTOM1 0b0101011 @@ -22,6 +27,13 @@ /// A DMA transfer identifier. typedef uint32_t snrt_dma_txid_t; +/** + * @brief Start an asynchronous 1D DMA transfer with 64-bit wide pointers. + * @param dst The destination address. + * @param src The source address. + * @param size The size of the transfer in bytes. + * @return The DMA transfer ID. 
+ */ inline uint32_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, size_t size) { register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 @@ -51,13 +63,30 @@ inline uint32_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, return reg_txid; } -/// Initiate an asynchronous 1D DMA transfer. +/** + * @brief Start an asynchronous 1D DMA transfer with native-size pointers. + * @param dst The destination pointer. + * @param src The source pointer. + * @param size The size of the transfer in bytes. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_1d(void *dst, const void *src, size_t size) { return snrt_dma_start_1d_wideptr((size_t)dst, (size_t)src, size); } -/// Initiate an asynchronous 2D DMA transfer with wide 64-bit pointers. +/** + * @brief Start an asynchronous 2D DMA transfer with 64-bit wide pointers. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, @@ -102,7 +131,18 @@ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, return reg_txid; } -/// Initiate an asynchronous 2D DMA transfer. +/** + * @brief Start an asynchronous 2D DMA transfer with native-size pointers. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. 
+ * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat) { @@ -110,8 +150,15 @@ inline snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, src_stride, repeat); } -/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers and a -/// specific channel. +/** + * @brief Start an asynchronous 1D DMA transfer with 64-bit wide pointers on a + * specific channel. + * @param dst The destination address. + * @param src The source address. + * @param size The size of the transfer in bytes. + * @param channel The index of the channel. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_1d_channel_wideptr(uint64_t dst, uint64_t src, size_t size, @@ -144,7 +191,15 @@ inline snrt_dma_txid_t snrt_dma_start_1d_channel_wideptr(uint64_t dst, return reg_txid; } -/// Initiate an asynchronous 1D DMA transfer and a specific channel. +/** + * @brief Start an asynchronous 1D DMA transfer with native-size pointers on a + * specific channel. + * @param dst The destination pointer. + * @param src The source pointer. + * @param size The size of the transfer in bytes. + * @param channel The index of the channel. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_1d_channel(void *dst, const void *src, size_t size, uint32_t channel) { @@ -152,8 +207,20 @@ inline snrt_dma_txid_t snrt_dma_start_1d_channel(void *dst, const void *src, channel); } -/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers and a -/// specific channel. +/** + * @brief Start an asynchronous 2D DMA transfer with 64-bit wide pointers on a + * specific channel. + * @param dst The destination address. + * @param src The source address. 
+ * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @param channel The index of the channel. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d_channel_wideptr( uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat, uint32_t channel) { @@ -198,7 +265,20 @@ inline snrt_dma_txid_t snrt_dma_start_2d_channel_wideptr( return reg_txid; } -/// Initiate an asynchronous 2D DMA transfer and a specific channel. +/** + * @brief Start an asynchronous 2D DMA transfer with native-size pointers on a + * specific channel. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @param channel The index of the channel. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d_channel(void *dst, const void *src, size_t size, size_t dst_stride, size_t src_stride, @@ -209,7 +289,10 @@ inline snrt_dma_txid_t snrt_dma_start_2d_channel(void *dst, const void *src, channel); } -/// Block until a transfer finishes. +/** + * @brief Block until a DMA transfer finishes. + * @param tid The DMA transfer ID. + */ inline void snrt_dma_wait(snrt_dma_txid_t tid) { // dmstati t0, 0 # 0=status.completed_id asm volatile( @@ -221,7 +304,10 @@ inline void snrt_dma_wait(snrt_dma_txid_t tid) { : "t0"); } -/// Block until a transfer finishes on a specific channel. 
+/** + * @brief Block until a DMA transfer finishes on a specific channel. + * @param tid The DMA transfer ID. + */ inline void snrt_dma_wait_channel(snrt_dma_txid_t tid, uint32_t channel) { // dmstati t0, 0 # 0=status.completed_id register uint32_t cfg asm("t1") = channel << 2; @@ -235,7 +321,9 @@ inline void snrt_dma_wait_channel(snrt_dma_txid_t tid, uint32_t channel) { : "t0"); } -/// Block until all operation on the DMA ceases. +/** + * @brief Block until all DMA operation ceases. + */ inline void snrt_dma_wait_all() { // dmstati t0, 2 # 2=status.busy asm volatile( @@ -246,7 +334,10 @@ inline void snrt_dma_wait_all() { : "t0"); } -/// Block until all operation on the DMA ceases on a specific channel. +/** + * @brief Block until a specific DMA channel is idle. + * @param channel The index of the channel. + */ inline void snrt_dma_wait_all_channel(uint32_t channel) { register uint32_t tmp; // dmstati t0, 2 # 2=status.busy @@ -260,7 +351,10 @@ inline void snrt_dma_wait_all_channel(uint32_t channel) { : "t0"); } -/// Wait until all channels are idle +/** + * @brief Block until the first @p num_channels channels are idle. + * @param num_channels The number of channels to wait on. + */ inline void snrt_dma_wait_all_channels(uint32_t num_channels) { register uint32_t tmp; // dmstati t0, 2 # 2=status.busy @@ -270,10 +364,10 @@ inline void snrt_dma_wait_all_channels(uint32_t num_channels) { } /** - * @brief start tracking of dma performance region. Does not have any + * @brief Start tracking of dma performance region. Does not have any * implications on the HW. Only injects a marker in the DMA traces that can be - * analyzed - * + * analyzed. + * @deprecated */ inline void snrt_dma_start_tracking() { // dmstati zero, 0 @@ -282,10 +376,10 @@ inline void snrt_dma_start_tracking() { } /** - * @brief stop tracking of dma performance region. Does not have any + * @brief Stop tracking of dma performance region. Does not have any * implications on the HW. 
Only injects a marker in the DMA traces that can be - * analyzed - * + * analyzed. + * @deprecated */ inline void snrt_dma_stop_tracking() { asm volatile(".word %0\n" ::"i"( @@ -293,11 +387,10 @@ inline void snrt_dma_stop_tracking() { } /** - * @brief fast memset function performed by DMA - * - * @param ptr pointer to the start of the region - * @param value value to set - * @param len number of bytes, must be multiple of DMA bus-width + * @brief Fast memset function performed by DMA. + * @param ptr Pointer to the start of the region. + * @param value Value to set. + * @param len Number of bytes, must be a multiple of the DMA bus width. */ inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) { // set first 64bytes to value @@ -314,9 +407,14 @@ inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) { snrt_dma_wait_all(); } -/// Load a 1D-tile of size tile_size from a 1D array. The specific tile is -/// selected by tile_idx. Every element in the src and dst arrays has prec -/// bytes. +/** + * @brief Load a tile of a 1D array. + * @param dst Pointer to the tile destination. + * @param src Pointer to the source array. + * @param tile_idx Index of the tile in the 1D array. + * @param tile_size Number of elements within a tile of the 1D array. + * @param prec Number of bytes of each element in the 1D array. + */ inline snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec) { @@ -324,9 +422,14 @@ inline snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src, return snrt_dma_start_1d(dst, src + tile_idx * tile_nbytes, tile_nbytes); } -/// Store a 1D-tile of size tile_size to a 1D array. The specific tile is -/// selected by tile_idx. Every element in the src and dst arrays has prec -/// bytes. +/** + * @brief Store a tile to a 1D array. + * @param dst Pointer to the destination array. + * @param src Pointer to the source tile. + * @param tile_idx Index of the tile in the 1D array. 
+ * @param tile_size Number of elements within a tile of the 1D array. + * @param prec Number of bytes of each element in the 1D array. + */ inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec) { @@ -334,10 +437,20 @@ inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src, return snrt_dma_start_1d(dst + tile_idx * tile_nbytes, src, tile_nbytes); } -/// Load a 2D-tile of shape (tile_x1_size, tile_x0_size) from the 2D array -/// of shape (full_x1_size, full_x0_size). The specific tile is selected -/// by the (tile_x1_idx, tile_x0_idx) tuple. Every element in the src and -/// destination arrays has prec bytes. +/** + * @brief Load a 2D tile of a 2D array. + * @param dst Pointer to the tile destination. + * @param src Pointer to the source array. + * @param tile_x1_idx Outermost coordinate of the tile in the 2D array. + * @param tile_x0_idx Innermost coordinate of the tile in the 2D array. + * @param tile_x1_size Number of elements in the outermost dimension of the + * tile. + * @param tile_x0_size Number of elements in the innermost dimension of the + * tile. + * @param full_x0_size Number of elements in the innermost dimension of the + * array. + * @param prec Number of bytes of each element in the 2D array. + */ inline snrt_dma_txid_t snrt_dma_load_2d_tile( void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, @@ -357,10 +470,20 @@ inline snrt_dma_txid_t snrt_dma_load_2d_tile( ); } -/// Store a 2D-tile of shape (tile_x1_size, tile_x0_size) to the 2D array -/// of shape (full_x1_size, full_x0_size). The specific tile is selected -/// by the (tile_x1_idx, tile_x0_idx) tuple. Every element in the src and -/// destination arrays has prec bytes. +/** + * @brief Store a 2D tile to a 2D array. + * @param dst Pointer to the destination array. + * @param src Pointer to the source tile. 
+ * @param tile_x1_idx Outermost coordinate of the tile in the 2D array. + * @param tile_x0_idx Innermost coordinate of the tile in the 2D array. + * @param tile_x1_size Number of elements in the outermost dimension of the + * tile. + * @param tile_x0_size Number of elements in the innermost dimension of the + * tile. + * @param full_x0_size Number of elements in the innermost dimension of the + * array. + * @param prec Number of bytes of each element in the 2D array. + */ inline snrt_dma_txid_t snrt_dma_store_2d_tile( void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, diff --git a/sw/snRuntime/src/ssr.h b/sw/snRuntime/src/ssr.h index 1a067fea5..d8858baa4 100644 --- a/sw/snRuntime/src/ssr.h +++ b/sw/snRuntime/src/ssr.h @@ -2,9 +2,27 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief This file contains functions to conveniently program Snitch's SSRs. + * + * An SSR stream can be configured to replace a store (or load) sequence as + * could be generated by an N-dimensional affine loop nest: + * @code{.c} + * for (int i = 0; i < b1; i++) + * for (int j = 0; j < b0; j++) + * array[i * s1 + j * s0] = 0; + * @endcode + * + * The configuration functions provided in this file reflect the parameters + * one would define to set up such a loop nest. + */ + #pragma once -/// Synchronize the integer and float pipelines. +/** + * @brief Synchronize the integer and float pipelines. + */ inline void snrt_fpu_fence() { unsigned tmp; asm volatile( @@ -13,34 +31,41 @@ inline void snrt_fpu_fence() { : "+r"(tmp)::"memory"); } -/// The different SSR data movers. +/** + * @brief The different SSRs. 
+ */ enum snrt_ssr_dm { - SNRT_SSR_DM0 = 0, - SNRT_SSR_DM1 = 1, - SNRT_SSR_DM2 = 2, - // To write to all SSRs, use index 31 - SNRT_SSR_DM_ALL = 31, + SNRT_SSR_DM0 = 0, /**< SSR data mover 0 */ + SNRT_SSR_DM1 = 1, /**< SSR data mover 1 */ + SNRT_SSR_DM2 = 2, /**< SSR data mover 2 */ + SNRT_SSR_DM_ALL = 31 /**< Write to all SSRs */ }; -/// The different dimensions. +/** + * @brief The different dimensions. + */ enum snrt_ssr_dim { - SNRT_SSR_1D = 0, - SNRT_SSR_2D = 1, - SNRT_SSR_3D = 2, - SNRT_SSR_4D = 3, + SNRT_SSR_1D = 0, /**< 1D stream */ + SNRT_SSR_2D = 1, /**< 2D stream */ + SNRT_SSR_3D = 2, /**< 3D stream */ + SNRT_SSR_4D = 3 /**< 4D stream */ }; -/// The SSR configuration registers. +/** + * @brief The SSR configuration registers. + */ enum { - REG_STATUS = 0, - REG_REPEAT = 1, - REG_BOUNDS = 2, // + loop index - REG_STRIDES = 6, // + loop index - REG_RPTR = 24, // + snrt_ssr_dim - REG_WPTR = 28, // + snrt_ssr_dim + REG_STATUS = 0, /**< SSR status register */ + REG_REPEAT = 1, /**< SSR repeat register */ + REG_BOUNDS = 2, /**< SSR bounds register */ + REG_STRIDES = 6, /**< SSR strides register */ + REG_RPTR = 24, /**< SSR read pointer register */ + REG_WPTR = 28 /**< SSR write pointer register */ }; -/// Enable SSR. +/** + * @brief Enable all SSRs. + */ inline void snrt_ssr_enable() { #ifdef __TOOLCHAIN_LLVM__ __builtin_ssr_enable(); @@ -49,7 +74,9 @@ inline void snrt_ssr_enable() { #endif } -/// Disable SSR. +/** + * @brief Disable all SSRs. + */ inline void snrt_ssr_disable() { #ifdef __TOOLCHAIN_LLVM__ __builtin_ssr_disable(); @@ -58,6 +85,12 @@ inline void snrt_ssr_disable() { #endif } +/** + * @brief Read the value of an SSR configuration register. + * @param reg The register index. + * @param dm The SSR index. + * @return The value of the register. 
+ */ inline uint32_t read_ssr_cfg(uint32_t reg, uint32_t dm) { uint32_t value; asm volatile("scfgri %[value], %[dm] | %[reg]<<5\n" @@ -66,12 +99,23 @@ inline uint32_t read_ssr_cfg(uint32_t reg, uint32_t dm) { return value; } +/** + * @brief Write a value to an SSR configuration register. + * @param reg The register index. + * @param dm The SSR index. + * @param value The value to write. + */ inline void write_ssr_cfg(uint32_t reg, uint32_t dm, uint32_t value) { asm volatile("scfgwi %[value], %[dm] | %[reg]<<5\n" ::[value] "r"(value), [ dm ] "i"(dm), [ reg ] "i"(reg)); } -// Configure an SSR data mover for a 1D loop nest. +/** + * @brief Configure an SSR data mover for a 1D loop nest. + * @param dm The SSR index. + * @param b0 The bound of the loop. + * @param s0 The stride of the loop. + */ inline void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) { --b0; write_ssr_cfg(REG_BOUNDS + 0, dm, b0); @@ -80,7 +124,14 @@ inline void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) { a += s0 * b0; } -// Configure an SSR data mover for a 2D loop nest. +/** + * @brief Configure an SSR data mover for a 2D loop nest. + * @param dm The SSR index. + * @param b0 The bound of the first loop. + * @param b1 The bound of the second loop. + * @param s0 The stride of the first loop. + * @param s1 The stride of the second loop. + */ inline void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t s0, size_t s1) { --b0; @@ -94,7 +145,16 @@ inline void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1, a += s1 * b1; } -// Configure an SSR data mover for a 3D loop nest. +/** + * @brief Configure an SSR data mover for a 3D loop nest. + * @param dm The SSR index. + * @param b0 The bound of the first loop. + * @param b1 The bound of the second loop. + * @param b2 The bound of the third loop. + * @param s0 The stride of the first loop. + * @param s1 The stride of the second loop. + * @param s2 The stride of the third loop. 
+ */ inline void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2, size_t s0, size_t s1, size_t s2) { --b0; @@ -112,10 +172,18 @@ inline void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1, a += s2 * b2; } -// Configure an SSR data mover for a 4D loop nest. -// b0: Inner-most bound (limit of loop) -// b3: Outer-most bound (limit of loop) -// s0: increment size of inner-most loop +/** + * @brief Configure an SSR data mover for a 4D loop nest. + * @param dm The SSR index. + * @param b0 The bound of the first loop. + * @param b1 The bound of the second loop. + * @param b2 The bound of the third loop. + * @param b3 The bound of the fourth loop. + * @param s0 The stride of the first loop. + * @param s1 The stride of the second loop. + * @param s2 The stride of the third loop. + * @param s3 The stride of the fourth loop. + */ inline void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2, size_t b3, size_t s0, size_t s1, size_t s2, size_t s3) { @@ -138,18 +206,32 @@ inline void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, a += s3 * b3; } -/// Configure the repetition count for a stream. +/** + * @brief Configure the repetition count for a stream. + * @param dm The SSR index. + * @param count The repetition count. + */ inline void snrt_ssr_repeat(enum snrt_ssr_dm dm, size_t count) { write_ssr_cfg(REG_REPEAT, dm, count - 1); } -/// Start a streaming read. +/** + * @brief Start a streaming read. + * @param dm The SSR index. + * @param dim The number of dimensions to use. + * @param ptr The pointer to the data. + */ inline void snrt_ssr_read(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, volatile void *ptr) { write_ssr_cfg(REG_RPTR + dim, dm, (uintptr_t)ptr); } -/// Start a streaming write. +/** + * @brief Start a streaming write. + * @param dm The SSR index. + * @param dim The number of dimensions to use. + * @param ptr The pointer to the data. 
+ */ inline void snrt_ssr_write(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, volatile void *ptr) { write_ssr_cfg(REG_WPTR + dim, dm, (uintptr_t)ptr); diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h index fa4b75b24..add26fa08 100644 --- a/sw/snRuntime/src/sync.h +++ b/sw/snRuntime/src/sync.h @@ -5,6 +5,11 @@ // Luca Colagrande // Viviane Potocnik +/** + * @file + * @brief This file provides functions to synchronize Snitch cores. + */ + #pragma once #include @@ -13,11 +18,18 @@ // Mutex functions //================================================================================ +/** + * @brief Get a pointer to a mutex variable. + */ inline volatile uint32_t *snrt_mutex() { return &_snrt_mutex; } /** - * @brief lock a mutex, blocking - * @details declare mutex with `static volatile uint32_t mtx = 0;` + * @brief Acquire a mutex, blocking. + * @details Test-and-set (TAS) implementation of a lock. + * @param pmtx A pointer to a variable which can be used as a mutex, i.e. to + * which all cores have a reference and at a memory location to + * which atomic accesses can be made. This can be declared e.g. as + * `static volatile uint32_t mtx = 0;`. */ inline void snrt_mutex_acquire(volatile uint32_t *pmtx) { asm volatile( @@ -31,9 +43,9 @@ inline void snrt_mutex_acquire(volatile uint32_t *pmtx) { } /** - * @brief lock a mutex, blocking - * @details test and test-and-set (ttas) implementation of a lock. - * Declare mutex with `static volatile uint32_t mtx = 0;` + * @brief Acquire a mutex, blocking. + * @details Same as @ref snrt_mutex_acquire but acquires the lock using a test + * and test-and-set (TTAS) strategy. */ inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) { asm volatile( @@ -50,7 +62,7 @@ inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) { } /** - * @brief Release the mutex + * @brief Release a previously-acquired mutex. 
*/ inline void snrt_mutex_release(volatile uint32_t *pmtx) { asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n" @@ -61,13 +73,21 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx) { // Barrier functions //================================================================================ -/// Synchronize cores in a cluster with a hardware barrier +/** + * @brief Synchronize cores in a cluster with a hardware barrier, blocking. + * @note Synchronizes all (both DM and compute) cores. All cores must invoke + * this function, or the calling cores will stall indefinitely. + */ inline void snrt_cluster_hw_barrier() { asm volatile("csrr x0, 0x7C2" ::: "memory"); } -// Synchronizes one core from every cluster with the others. -// One core per cluster is expected to invoke this function. +/** + * @brief Synchronize one core from every cluster with the others. + * @details Implemented as a software barrier. + * @note One core per cluster must invoke this function, or the calling cores + * will stall indefinitely. + */ inline void snrt_inter_cluster_barrier() { // Remember previous iteration uint32_t prev_barrier_iteration = _snrt_barrier.iteration; @@ -84,7 +104,15 @@ inline void snrt_inter_cluster_barrier() { } } -/// Synchronize clusters globally with a global software barrier +/** + * @brief Synchronize all Snitch cores. + * @details Synchronization is performed hierarchically. Within a cluster, + * cores are synchronized through a hardware barrier (see + * @ref snrt_cluster_hw_barrier). Clusters are synchronized through + * a software barrier (see @ref snrt_inter_cluster_barrier). + * @note Every Snitch core must invoke this function, or the calling cores + * will stall indefinitely. 
+ */ inline void snrt_global_barrier() { snrt_cluster_hw_barrier(); @@ -96,17 +124,12 @@ inline void snrt_global_barrier() { snrt_cluster_hw_barrier(); } -inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) { - __atomic_add_fetch(&_reduction_result, value, __ATOMIC_RELAXED); - snrt_global_barrier(); - return _reduction_result; -} - /** - * @brief Generic barrier - * - * @param barr pointer to a barrier - * @param n number of harts that have to enter before released + * @brief Generic software barrier. + * @param barr pointer to a barrier variable. + * @param n number of harts that have to enter before released. + * @note Exactly the specified number of harts must invoke this function, or + * the calling cores will stall indefinitely. */ inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) { // Remember previous iteration @@ -128,8 +151,37 @@ inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) { // Reduction functions //================================================================================ -// Assumes the dst and src buffers are at the same offset in the TCDM of every -// cluster +/** + * @brief Perform a global sum reduction, blocking. + * @details All cores participate in the reduction and synchronize globally + * to wait for the reduction to complete. + * The synchronization is performed via @ref snrt_global_barrier. + * @param value The value to be summed. + * @return The result of the sum reduction. + * @note Every Snitch core must invoke this function, or the calling cores + * will stall indefinitely. + */ +inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) { + __atomic_add_fetch(&_reduction_result, value, __ATOMIC_RELAXED); + snrt_global_barrier(); + return _reduction_result; +} + +/** + * @brief Perform a sum reduction among clusters, blocking. + * @details The reduction is performed in a logarithmic fashion. 
Half of the + * clusters active in every level of the binary-tree participate as + * senders, the other half as receivers. Senders use the DMA to + * send their data to the respective receiver's destination buffer. + * The receiver then reduces each element in its destination buffer + * with the respective element in its source buffer. It then proceeds + * to the next level in the binary tree. + * @param dst_buffer The pointer to the calling cluster's destination buffer. + * @param src_buffer The pointer to the calling cluster's source buffer. + * @param len The amount of data in each buffer. + * @note The destination buffers must lie at the same offset in every cluster's + * TCDM. + */ inline void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer, size_t len) { // If we have a single cluster the reduction degenerates to a memcpy diff --git a/sw/snRuntime/src/team.h b/sw/snRuntime/src/team.h index eb06a4488..560fff7cd 100644 --- a/sw/snRuntime/src/team.h +++ b/sw/snRuntime/src/team.h @@ -2,67 +2,153 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief This file contains functions and macros related to Snitch team + * management. + * + * The functions in this file provide information about the Snitch hardware + * configuration, such as the number of clusters, cores per cluster, and the + * current core's index within the system. These functions can be used for team + * management and core-specific operations. + */ + #pragma once +/** + * @brief Get the RISC-V hardware thread ID (hartid). + * + * @return The hardware thread ID. + */ inline uint32_t __attribute__((const)) snrt_hartid() { uint32_t hartid; asm("csrr %0, mhartid" : "=r"(hartid)); return hartid; } +/** + * @brief Get the number of Snitch clusters in the system. + * + * @return The number of clusters. 
+ */ inline uint32_t __attribute__((const)) snrt_cluster_num() { return SNRT_CLUSTER_NUM; } +/** + * @brief Get the number of cores per cluster. + * + * @return The number of cores per cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_core_num() { return SNRT_CLUSTER_CORE_NUM; } +/** + * @brief Get the hartid of the first Snitch core in the system. + * + * @return The hartid of the first Snitch core in the system. + */ inline uint32_t __attribute__((const)) snrt_global_core_base_hartid() { return SNRT_BASE_HARTID; } +/** + * @brief Get the total number of Snitch cores in the system. + * + * @return The total number of cores. + */ inline uint32_t __attribute__((const)) snrt_global_core_num() { return snrt_cluster_num() * snrt_cluster_core_num(); } +/** + * @brief Get the total number of Snitch compute cores in the system. + * + * @return The total number of compute cores. + */ inline uint32_t __attribute__((const)) snrt_global_compute_core_num() { return snrt_cluster_num() * snrt_cluster_compute_core_num(); } +/** + * @brief Get the index (!= hartid) of the current Snitch core in the system. + * + * @return The index of the current Snitch core. + */ inline uint32_t __attribute__((const)) snrt_global_core_idx() { return snrt_hartid() - snrt_global_core_base_hartid(); } +/** + * @brief Get the index of the current Snitch compute core in the system. + * + * @return The index of the current Snitch compute core. + */ inline uint32_t __attribute__((const)) snrt_global_compute_core_idx() { return snrt_cluster_idx() * snrt_cluster_compute_core_num() + snrt_cluster_core_idx(); } +/** + * @brief Get the index of the current Snitch cluster in the system. + * + * @return The index of the current cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_idx() { return snrt_global_core_idx() / snrt_cluster_core_num(); } +/** + * @brief Get the index of the current Snitch core within the cluster. 
+ * + * @return The index of the current core within the cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_core_idx() { return snrt_global_core_idx() % snrt_cluster_core_num(); } +/** + * @brief Get the number of data mover (DM) cores per cluster. + * + * @return The number of DM cores per cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_dm_core_num() { return SNRT_CLUSTER_DM_CORE_NUM; } +/** + * @brief Get the number of compute cores per cluster. + * + * @return The number of compute cores per cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_compute_core_num() { return snrt_cluster_core_num() - snrt_cluster_dm_core_num(); } +/** + * @brief Check if the current core is a compute core. + * + * @return True if the current core is a compute core, false otherwise. + */ inline int __attribute__((const)) snrt_is_compute_core() { return snrt_cluster_core_idx() < snrt_cluster_compute_core_num(); } +/** + * @brief Check if the current core is the last compute core in the cluster. + * + * @return True if the current core is the last compute core, false otherwise. + */ inline int __attribute__((const)) snrt_cluster_is_last_compute_core() { return snrt_cluster_core_idx() == (snrt_cluster_compute_core_num() - 1); } +/** + * @brief Check if the current core is a data mover (DM) core. + * + * @return True if the current core is a DM core, false otherwise. 
+ */ inline int __attribute__((const)) snrt_is_dm_core() { return !snrt_is_compute_core(); } diff --git a/util/container/Dockerfile b/util/container/Dockerfile index f6e98f713..9cdc7d9aa 100644 --- a/util/container/Dockerfile +++ b/util/container/Dockerfile @@ -11,6 +11,7 @@ ARG PYTHON_VERSION=3.9.12 ARG BENDER_VERSION=0.27.1 ARG SPIKE_DASM_VERSION=0.1.0 ARG VERILATOR_VERSION=5.006 +ARG DOXYGEN_VERSION=1.12.0 # Run dpkg without interactive dialogue ARG DEBIAN_FRONTEND=noninteractive @@ -90,6 +91,10 @@ RUN tar xzf bender-${BENDER_VERSION}-x86_64-linux-gnu-ubuntu18.04.tar.gz RUN wget https://github.com/pulp-platform/riscv-isa-sim/releases/download/snitch-v${SPIKE_DASM_VERSION}/snitch-spike-dasm-${SPIKE_DASM_VERSION}-x86_64-linux-gnu-ubuntu18.04.tar.gz RUN tar xzf snitch-spike-dasm-${SPIKE_DASM_VERSION}-x86_64-linux-gnu-ubuntu18.04.tar.gz +# Install Doxygen +RUN wget https://www.doxygen.nl/files/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz +RUN tar xzf doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz + # 2. Stage FROM ubuntu:22.04 AS snitch_cluster ARG SNITCH_LLVM_VERSION=latest @@ -149,6 +154,7 @@ COPY --from=builder /tools/spike-dasm bin/ COPY --from=builder /root/.cargo/bin/banshee bin/ COPY --from=builder /opt/python /opt/python COPY --from=builder /tools/verilator /tools/verilator/ +COPY --from=builder /tools/doxygen-${DOXYGEN_VERSION}/bin/doxygen bin/ # Create and activate virtual environment ENV VIRTUAL_ENV "/root/.venvs/snitch_cluster"