Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Snitch Cluster Offloading #13

Merged
merged 15 commits into from
Dec 13, 2024
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
*~
.ninja*
**/build/*

.vscode/settings.json
10 changes: 10 additions & 0 deletions .vscode/c_cpp_properties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"configurations": [
{
"name": "cMake",
"configurationProvider": "ms-vscode.cmake-tools",
"compileCommands": "${config:cmake.buildDirectory}/compile_commands.json"
}
],
"version": 4
}
40 changes: 38 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#
# Moritz Scherer <[email protected]>
# Viviane Potocnik <[email protected]>
# Philip Wiese <[email protected]>

cmake_minimum_required(VERSION 3.13)

Expand All @@ -27,15 +28,50 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

project(chimera-sdk LANGUAGES C ASM)

# WIESEP: It is important to set the ISA and ABI for the host and the cluster snitch
set(ISA_HOST rv32imc)
set(ABI_HOST ilp32)
set(ISA_CLUSTER_SNITCH rv32im)
set(ABI_CLUSTER_SNITCH ilp32)

message(STATUS "[CHIMERA-SDK] ISA_HOST : ${ISA_HOST}")
message(STATUS "[CHIMERA-SDK] ABI_HOST : ${ABI_HOST}")
message(STATUS "[CHIMERA-SDK] ISA_CLUSTER_SNITCH : ${ISA_CLUSTER_SNITCH}")
message(STATUS "[CHIMERA-SDK] ABI_CLUSTER_SNITCH : ${ABI_CLUSTER_SNITCH}")

Xeratec marked this conversation as resolved.
Show resolved Hide resolved
include(${CMAKE_CURRENT_LIST_DIR}/cmake/Utils.cmake)

add_subdirectory(targets)
# WIESEP: Add a static library to collect all runtime sources
add_library(runtime STATIC)

target_compile_options(runtime
PRIVATE
-O2
Xeratec marked this conversation as resolved.
Show resolved Hide resolved
-march=${ISA_HOST}
-mabi=${ABI_HOST}
)

target_link_options(runtime
PRIVATE
-march=${ISA_HOST}
-mabi=${ABI_HOST}
)

# WIESEP: Expose common link option
target_link_options(runtime
PUBLIC
-nostartfiles
-nostdlib
Xeratec marked this conversation as resolved.
Show resolved Hide resolved
)

add_subdirectory(hal)
add_subdirectory(targets)
add_subdirectory(driver)

# WIESEP: Interface library to link against all components of the SDK
add_library(chimera-sdk INTERFACE)
target_link_libraries(chimera-sdk INTERFACE hal)
target_link_libraries(chimera-sdk INTERFACE runtime)
target_sources(chimera-sdk INTERFACE $<TARGET_OBJECTS:runtime>)

enable_testing()

Expand Down
40 changes: 33 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,43 @@ The resulting binaries will be stored in `build/bin`, and can be used within the

To format all source files, run
```
python scripts/run_clang_format.py -ir hal/
python scripts/run_clang_format.py -ir targets/
python scripts/run_clang_format.py -ir tests/
python scripts/run_clang_format.py -ir hal/ targets/ tests/ driver/
```

Our CI uses llvm-12 for clang-format, so on IIS machines you may run
```
python scripts/run_clang_format.py -ir hal/ --clang-format-executable=/usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/clang-format
python scripts/run_clang_format.py -ir tests/ hal/ targets/ driver/ --clang-format-executable=/usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/clang-format

python scripts/run_clang_format.py -ir targets/ --clang-format-executable=/usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/clang-format

python scripts/run_clang_format.py -ir tests/ --clang-format-executable=/usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/clang-format
```

## Visual Studio Code Integration

To enable automatic configuration of the C/C++ extension and support for the integrated CMake build flow on the IIS workstations, add the following content to `.vscode/settings.json`:
```json
{
"cmake.configureSettings": {
"TOOLCHAIN_DIR": "/usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin",
"TARGET_PLATFORM": "chimera-convolve",
},
"cmake.environment": {
"PATH": "/usr/pack/riscv-1.0-kgf/default/bin:${env:PATH}",
"LD_LIBRARY_PATH": "/usr/pack/riscv-1.0-kgf/lib64:/usr/pack/riscv-1.0-kgf/lib64",
}
}
Xeratec marked this conversation as resolved.
Show resolved Hide resolved
```

## Technical Details

### Mixed ISA Compilation
The current approach compiles all code for both the host and cluster cores into a single library. This requires careful handling to avoid invalid instructions caused by mismatched instruction set architectures (ISAs) or application binary interfaces (ABIs) between the host and cluster cores.
Hence, we define four CMake variables, `ISA_HOST`, `ABI_HOST`, `ISA_CLUSTER_SNITCH`, and `ABI_CLUSTER_SNITCH`, to specify the appropriate ISA and ABI for each core type.
Furthermore, the tests are split into `src_host` and `src_cluster` directories to clearly separate code executed on the host and cluster cores.

### CMake Build Flow
All runtime functions executed by the host core are compiled into a dedicated `runtime` static library. The trampoline function, which is executed by the cluster core, is a notable exception. To support its compilation with a different ISA, the trampoline function is built separately as an object library. This object library is then linked into the `runtime` library, ensuring that it integrates seamlessly while maintaining the necessary ISA compatibility.

### Warning
Special attention is required for functions that execute before the cluster core is fully initialized, such as the trampoline function and interrupt handlers. At this stage, critical resources like the stack, global pointer, and thread pointer are not yet configured. Consequently, the compiler must not generate code that allocates stack frames. To address this, such functions are implemented as naked functions, which prevent the compiler from adding prologues or epilogues that rely on stack operations.

**It is recommended to always check the generated assembly code to ensure that the correct instructions are generated for the target core!**

16 changes: 16 additions & 0 deletions driver/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright 2024 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Philip Wiese <[email protected]>


# WIESEP: Add all runtime drivers
add_subdirectory(cluster)

# WIESEP: Export this directory as root include directory for the drivers
target_include_directories(runtime PUBLIC ${CMAKE_CURRENT_LIST_DIR})




38 changes: 38 additions & 0 deletions driver/cluster/CMakeLists.txt
Xeratec marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright 2024 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Philip Wiese <[email protected]>

# WIESEP: Sources that execute on the Snitch cluster cores. They are kept apart
# because they are compiled with the cluster ISA/ABI instead of the host one.
file(GLOB_RECURSE SNITCH_CLUSTER_SOURCES
  "trampoline_snitchCluster.c"
)

# WIESEP: Host-side driver sources are all remaining C files below this directory
file(GLOB_RECURSE HOST_DRIVER_SOURCES
  "*.c"
)
list(REMOVE_ITEM HOST_DRIVER_SOURCES ${SNITCH_CLUSTER_SOURCES})

# WIESEP: Object library for the snitch cluster trampoline so that it can carry
# its own -march/-mabi flags
add_library(cluster_snitch OBJECT ${SNITCH_CLUSTER_SOURCES})

# Reuse the include directories exported by the runtime library
target_include_directories(cluster_snitch
  PRIVATE
    $<TARGET_PROPERTY:runtime,INTERFACE_INCLUDE_DIRECTORIES>
)

target_compile_options(cluster_snitch
  PRIVATE
    -O2
    -march=${ISA_CLUSTER_SNITCH}
    -mabi=${ABI_CLUSTER_SNITCH}
)

# Host sources are compiled as part of runtime; the cluster objects are attached
# to runtime's INTERFACE sources.
target_sources(runtime PRIVATE ${HOST_DRIVER_SOURCES})
target_sources(runtime INTERFACE $<TARGET_OBJECTS:cluster_snitch>)



196 changes: 196 additions & 0 deletions driver/cluster/offload_snitchCluster.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
// Copyright 2024 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Moritz Scherer <[email protected]>
// Philip Wiese <[email protected]>

#include "soc.h"
#include "offload_snitchCluster.h"

#include <stdint.h>
#include <stddef.h>

// Persistent trampoline function pointer for each core.
// Written by _generate_trampoline(); the offload functions in this file pass the
// hart id as the index.
void (*_trampoline_function[NUM_CLUSTER_CORES])(void *) = {NULL};

// Persistent argument storage for the trampoline function (same indexing as above)
void *_trampoline_args[NUM_CLUSTER_CORES] = {NULL};

// Persistent stack pointer storage for each core (same indexing as above)
void *_trampoline_stack[NUM_CLUSTER_CORES] = {NULL};

/**
 * @brief Trampoline function for the cluster core.
 * This function will set up the stack pointer and call the function.
 *
 * Only declared here; its address is the value returned by _generate_trampoline()
 * and written to the Snitch boot address register by the offload functions.
 *
 * @warning Make sure that this function is compiled with ISA for the Snitch cores (RV32IM)
 *
 */
extern void _trampoline();

/**
 * @brief Prepare the trampoline for one cluster core.
 * Records the function to run, its argument and the stack pointer in the persistent
 * per-core slots and hands back the address of the shared trampoline entry point.
 *
 * @param core_id Index of the per-core slot to fill (the callers pass the hart id)
 * @param function Function pointer to offload
 * @param args Arguments to pass to the function
 * @param stack Stack pointer for core
 * @return A pointer to the persistent trampoline function
 */
static void *_generate_trampoline(uint32_t core_id, void (*function)(void *), void *args,
                                  void *stack) {
  // Capture the call context in the persistent per-core storage
  _trampoline_stack[core_id] = stack;
  _trampoline_args[core_id] = args;
  _trampoline_function[core_id] = function;

  // Every core enters through the same trampoline symbol
  return (void *)_trampoline;
}

/**
 * @brief Setup the interrupt handler for the cluster cores.
 * All cores in all clusters will jump to the handler when an interrupt is triggered.
 *
 * The handler address is written to the SoC control register at
 * CHIMERA_SNITCH_INTR_HANDLER_ADDR_REG_OFFSET.
 *
 * @param handler Function pointer to the interrupt handler
 */
void setup_snitchCluster_interruptHandler(void *handler) {
  // The register itself must be volatile-qualified (`void *volatile *`). The previous
  // `volatile void **` only qualified the data the stored pointer refers to, so the
  // register write was an ordinary store the compiler was free to reorder or drop.
  void *volatile *snitchTrapHandlerAddr =
      (void *volatile *)(SOC_CTRL_BASE + CHIMERA_SNITCH_INTR_HANDLER_ADDR_REG_OFFSET);

  *snitchTrapHandlerAddr = handler;
}

/**
* @brief Offload a void function pointer to a cluster's core.
* The function will be executed on the specified core of the cluster.
*
* @param function Function pointer to offload
* @param args Arguments to pass to the function
* @param stack_ptr Stack pointer for the core
* @param clusterId ID of the cluster to offload to
* @param core_id ID of the core to offload to (cores are 0-indexed for each cluster)
*/
void offload_snitchCluster_core(void *function, void *args, void *stack_ptr, uint8_t clusterId,
uint32_t core_id) {
volatile void **snitchBootAddr =
(volatile void **)(SOC_CTRL_BASE + CHIMERA_SNITCH_BOOT_ADDR_REG_OFFSET);

// Core with hartid 0 is CVA6's, thus we start with 1
uint32_t hartId = 1 + core_id;
for (uint32_t i = 0; i < clusterId; i++) {
hartId += _chimera_numCores[i];
}

*snitchBootAddr = _generate_trampoline(hartId, function, args, stack_ptr);

// Check if the cluster is busy
wait_snitchCluster_busy(clusterId);

// Send interrupt to the core
volatile uint32_t *interruptTarget = ((uint32_t *)CLINT_CTRL_BASE) + hartId;
*interruptTarget = 1;
}

/**
* @brief Offload a void function pointer to a cluster.
* The function will be executed on all cores of the cluster.
*
* @param function Function pointer to offload
* @param clusterId ID of the cluster to offload to
*/
void offload_snitchCluster(void *function, void *args, void *stack_ptr, uint8_t clusterId) {
volatile void **snitchBootAddr =
(volatile void **)(SOC_CTRL_BASE + CHIMERA_SNITCH_BOOT_ADDR_REG_OFFSET);

// Core with hartid 0 is CVA6's, thus we start with 1
uint32_t hartId = 1;
for (uint32_t i = 0; i < clusterId; i++) {
hartId += _chimera_numCores[i];
}

// Check if the cluster is busy
wait_snitchCluster_busy(clusterId);

for (uint32_t i = 0; i < _chimera_numCores[clusterId]; i++) {
*snitchBootAddr = _generate_trampoline(hartId, function, args, stack_ptr);
// Send interrupt to the core
volatile uint32_t *interruptTarget = ((uint32_t *)CLINT_CTRL_BASE) + hartId + i;
*interruptTarget = 1;
}
}

/**
 * @brief Blocking wait for the cluster to become idle.
 * The function busy waits until the busy register of the cluster no longer reads 1 and
 * then executes a short fixed delay of NOPs.
 *
 * @warning In the current Snitch bootrom implementation each core clears the busy flag as
 * soon as it returns. Hence the busy flag does not reflect the actual status of the cluster.
 *
 * @todo Fix the bootrom after adding synchronization primitives for the Snitch cores.
 *
 * @param clusterId ID of the cluster to wait for (0-4). For any other value the function
 * returns immediately, because there is no busy register to poll.
 */
void wait_snitchCluster_busy(uint8_t clusterId) {
  volatile int32_t *busy_ptr;

  switch (clusterId) {
  case 0:
    busy_ptr = (volatile int32_t *)(SOC_CTRL_BASE + CHIMERA_CLUSTER_0_BUSY_REG_OFFSET);
    break;
  case 1:
    busy_ptr = (volatile int32_t *)(SOC_CTRL_BASE + CHIMERA_CLUSTER_1_BUSY_REG_OFFSET);
    break;
  case 2:
    busy_ptr = (volatile int32_t *)(SOC_CTRL_BASE + CHIMERA_CLUSTER_2_BUSY_REG_OFFSET);
    break;
  case 3:
    busy_ptr = (volatile int32_t *)(SOC_CTRL_BASE + CHIMERA_CLUSTER_3_BUSY_REG_OFFSET);
    break;
  case 4:
    busy_ptr = (volatile int32_t *)(SOC_CTRL_BASE + CHIMERA_CLUSTER_4_BUSY_REG_OFFSET);
    break;
  default:
    // Unknown cluster: previously an uninitialized pointer was dereferenced here
    return;
  }

  while (*busy_ptr == 1) {
  }
  // TODO: temporary race condition fix
  for (int i = 0; i < 100; i++) {
    // NOP
    asm volatile("addi x0, x0, 0\n" :::);
  }
}

/**
 * @brief Wait for the cluster to return a value.
 * The function busy waits until the return register of the cluster reads non-zero, then
 * reads the value, resets the register to 0 and returns the value.
 *
 * @warning The return values must be non-zero, otherwise the function will busy wait forever!
 *
 * @param clusterId ID of the cluster to wait for (0-4).
 * @return uint32_t Return value of the cluster, or 0 if @p clusterId does not name a
 * known cluster.
 */
uint32_t wait_snitchCluster_return(uint8_t clusterId) {
  volatile int32_t *snitchReturnAddr;

  switch (clusterId) {
  case 0:
    snitchReturnAddr =
        (volatile int32_t *)(SOC_CTRL_BASE + CHIMERA_SNITCH_CLUSTER_0_RETURN_REG_OFFSET);
    break;
  case 1:
    snitchReturnAddr =
        (volatile int32_t *)(SOC_CTRL_BASE + CHIMERA_SNITCH_CLUSTER_1_RETURN_REG_OFFSET);
    break;
  case 2:
    snitchReturnAddr =
        (volatile int32_t *)(SOC_CTRL_BASE + CHIMERA_SNITCH_CLUSTER_2_RETURN_REG_OFFSET);
    break;
  case 3:
    snitchReturnAddr =
        (volatile int32_t *)(SOC_CTRL_BASE + CHIMERA_SNITCH_CLUSTER_3_RETURN_REG_OFFSET);
    break;
  case 4:
    snitchReturnAddr =
        (volatile int32_t *)(SOC_CTRL_BASE + CHIMERA_SNITCH_CLUSTER_4_RETURN_REG_OFFSET);
    break;
  default:
    // Unknown cluster: previously an uninitialized pointer was dereferenced here.
    // 0 is never a valid cluster return value (see warning above), so it marks the error.
    return 0;
  }

  while (*snitchReturnAddr == 0) {
  }

  uint32_t retVal = *snitchReturnAddr;
  // Clear the register so the next wait does not see a stale value
  *snitchReturnAddr = 0;

  return retVal;
}
Loading