From 4734896a4b06aaff3c79bc1bd1fce3c634e97700 Mon Sep 17 00:00:00 2001 From: Cyril Koenig Date: Fri, 26 Apr 2024 21:36:49 +0200 Subject: [PATCH] hero: Added libomptarget dev --- hw/system/spatz_cluster/Makefile | 2 +- sw/snRuntime/link/common.ld.in | 4 + sw/snRuntime/src/team.c | 8 +- sw/spatzBenchmarks/CMakeLists.txt | 36 ++- sw/spatzBenchmarks/omptarget/io.h | 94 ++++++ sw/spatzBenchmarks/omptarget/main.c | 333 ++++++++++++++++++++++ sw/spatzBenchmarks/omptarget/sw_mailbox.c | 86 ++++++ sw/spatzBenchmarks/omptarget/sw_mailbox.h | 197 +++++++++++++ 8 files changed, 742 insertions(+), 18 deletions(-) create mode 100644 sw/spatzBenchmarks/omptarget/io.h create mode 100644 sw/spatzBenchmarks/omptarget/main.c create mode 100644 sw/spatzBenchmarks/omptarget/sw_mailbox.c create mode 100644 sw/spatzBenchmarks/omptarget/sw_mailbox.h diff --git a/hw/system/spatz_cluster/Makefile b/hw/system/spatz_cluster/Makefile index 04e165b4..3918e31f 100644 --- a/hw/system/spatz_cluster/Makefile +++ b/hw/system/spatz_cluster/Makefile @@ -177,7 +177,7 @@ lint/tmp/files: ${BENDER} ## Build SW into sw/build with the LLVM toolchain sw: clean.sw mkdir -p sw/build - cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DHTIF_SERVER=${HTIF_SERVER}-DSPATZ_CFG_FILENAME=${SPATZ_CFG_FILENAME} -DPYTHON=${PYTHON} .. && make + cd sw/build && ${CMAKE} -DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DHTIF_SERVER=${HTIF_SERVER} -DSPATZ_CFG_FILENAME=${SPATZ_CFG_FILENAME} -DPYTHON=${PYTHON} .. && make # VSIM ## Build SW into sw/build with the LLVM toolchain (including tests) for Questasim simulator diff --git a/sw/snRuntime/link/common.ld.in b/sw/snRuntime/link/common.ld.in index 41ec54d8..7de98d4e 100644 --- a/sw/snRuntime/link/common.ld.in +++ b/sw/snRuntime/link/common.ld.in @@ -12,6 +12,10 @@ MEMORY SECTIONS { + .init : + { + . 
= ALIGN(4); + } /* The program code and other data goes into DRAM */ .text : { diff --git a/sw/snRuntime/src/team.c b/sw/snRuntime/src/team.c index 4612e858..ba808492 100644 --- a/sw/snRuntime/src/team.c +++ b/sw/snRuntime/src/team.c @@ -53,12 +53,12 @@ uint32_t snrt_cluster_compute_core_idx() { uint32_t snrt_cluster_compute_core_num() { // TODO: Actually derive this from the device tree! - return snrt_cluster_core_num() - 1; + return snrt_cluster_core_num(); } uint32_t snrt_cluster_dm_core_idx() { // TODO: Actually derive this from the device tree! - return snrt_cluster_core_num() - 1; + return 0; } uint32_t snrt_cluster_dm_core_num() { @@ -68,12 +68,12 @@ uint32_t snrt_cluster_dm_core_num() { int snrt_is_compute_core() { // TODO: Actually derive this from the device tree! - return snrt_cluster_core_idx() < snrt_cluster_core_num() - 1; + return 1; } int snrt_is_dm_core() { // TODO: Actually derive this from the device tree! - return !snrt_is_compute_core(); + return snrt_cluster_core_idx() == snrt_cluster_dm_core_idx(); } uint32_t _snrt_barrier_reg_ptr() { diff --git a/sw/spatzBenchmarks/CMakeLists.txt b/sw/spatzBenchmarks/CMakeLists.txt index 474b1919..c31bc08a 100644 --- a/sw/spatzBenchmarks/CMakeLists.txt +++ b/sw/spatzBenchmarks/CMakeLists.txt @@ -23,24 +23,30 @@ add_compile_options(-O3 -g -ffunction-sections) # Macro to regenerate the golden values and compile a module macro(add_spatz_test_oneParam name file param1) - set(target_name ${name}_M${param1}) - add_snitch_test(${target_name} ${file}) - target_link_libraries(test-${SNITCH_TEST_PREFIX}${target_name} benchmark ${SNITCH_RUNTIME}) - target_compile_definitions(test-${SNITCH_TEST_PREFIX}${target_name} PUBLIC DATAHEADER="data/data_${param1}.h") + if (BUILD_TESTS) + set(target_name ${name}_M${param1}) + add_snitch_test(${target_name} ${file}) + target_link_libraries(test-${SNITCH_TEST_PREFIX}${target_name} benchmark ${SNITCH_RUNTIME}) + target_compile_definitions(test-${SNITCH_TEST_PREFIX}${target_name} 
PUBLIC DATAHEADER="data/data_${param1}.h") + endif() endmacro() macro(add_spatz_test_twoParam name file param1 param2) +if (BUILD_TESTS) set(target_name ${name}_M${param1}_N${param2}) add_snitch_test(${target_name} ${file}) target_link_libraries(test-${SNITCH_TEST_PREFIX}${target_name} benchmark ${SNITCH_RUNTIME}) target_compile_definitions(test-${SNITCH_TEST_PREFIX}${target_name} PUBLIC DATAHEADER="data/data_${param1}_${param2}.h") +endif() endmacro() macro(add_spatz_test_threeParam name file param1 param2 param3) +if (BUILD_TESTS) set(target_name ${name}_M${param1}_N${param2}_K${param3}) add_snitch_test(${target_name} ${file}) target_link_libraries(test-${SNITCH_TEST_PREFIX}${target_name} benchmark ${SNITCH_RUNTIME}) target_compile_definitions(test-${SNITCH_TEST_PREFIX}${target_name} PUBLIC DATAHEADER="data/data_${param1}_${param2}_${param3}.h") + endif() endmacro() # Benchmark library @@ -54,8 +60,8 @@ add_library(hp-fmatmul hp-fmatmul/kernel/hp-fmatmul.c) add_library(widening-hp-fmatmul widening-hp-fmatmul/kernel/widening-fmatmul.c) add_library(widening-bp-fmatmul widening-bp-fmatmul/kernel/widening-fmatmul.c) -add_library(sdotp-hp-fmatmul sdotp-hp-fmatmul/kernel/sdotp-fmatmul.c) -add_library(sdotp-bp-fmatmul sdotp-bp-fmatmul/kernel/sdotp-fmatmul.c) +# add_library(sdotp-hp-fmatmul sdotp-hp-fmatmul/kernel/sdotp-fmatmul.c) +# add_library(sdotp-bp-fmatmul sdotp-bp-fmatmul/kernel/sdotp-fmatmul.c) add_library(dp-faxpy dp-faxpy/kernel/faxpy.c) @@ -88,14 +94,16 @@ add_spatz_test_threeParam(widening-bp-fmatmul widening-bp-fmatmul/main.c 64 128 add_spatz_test_threeParam(widening-bp-fmatmul widening-bp-fmatmul/main.c 128 128 128) add_spatz_test_threeParam(widening-bp-fmatmul widening-bp-fmatmul/main.c 128 256 128) -add_spatz_test_threeParam(sdotp-hp-fmatmul sdotp-hp-fmatmul/main.c 64 64 64 ) -add_spatz_test_threeParam(sdotp-hp-fmatmul sdotp-hp-fmatmul/main.c 64 128 64 ) -add_spatz_test_threeParam(sdotp-hp-fmatmul sdotp-hp-fmatmul/main.c 128 128 128) +# Hero toolchain 
does not support sdotp at the moment -add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 64 64 64 ) -add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 64 128 64 ) -add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 128 128 128) -add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 128 256 128) +# add_spatz_test_threeParam(sdotp-hp-fmatmul sdotp-hp-fmatmul/main.c 64 64 64 ) +# add_spatz_test_threeParam(sdotp-hp-fmatmul sdotp-hp-fmatmul/main.c 64 128 64 ) +# add_spatz_test_threeParam(sdotp-hp-fmatmul sdotp-hp-fmatmul/main.c 128 128 128) + +# add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 64 64 64 ) +# add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 64 128 64 ) +# add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 128 128 128) +# add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 128 256 128) add_spatz_test_oneParam(dp-faxpy dp-faxpy/main.c 256) add_spatz_test_oneParam(dp-faxpy dp-faxpy/main.c 1024) @@ -110,3 +118,5 @@ add_spatz_test_twoParam(dp-fft dp-fft/main.c 128 2) add_spatz_test_twoParam(sp-fft sp-fft/main.c 256 2) add_spatz_test_twoParam(sp-fft sp-fft/main.c 512 2) + +add_library(omptarget omptarget/main.c omptarget/sw_mailbox.c) diff --git a/sw/spatzBenchmarks/omptarget/io.h b/sw/spatzBenchmarks/omptarget/io.h new file mode 100644 index 00000000..db3ce619 --- /dev/null +++ b/sw/spatzBenchmarks/omptarget/io.h @@ -0,0 +1,94 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 +// +// Robert Balas +// + +/* Description: Memory mapped register I/O access + */ + +#ifndef __IO_H +#define __IO_H + +#include <stdint.h> + + +/* generic I/O write: store val to address addr with one sb/sh/sw/sd; the "memory" clobber stops the compiler from moving other memory accesses across the store. NOTE(review): sd is an RV64 instruction, confirm writed is never instantiated on an RV32 build */ +static inline void writeb(uint8_t val, uintptr_t addr) +{ + asm volatile("sb %0, 0(%1)" + : + : "r"(val), "r"((volatile uint8_t *)addr) + : "memory"); +} + +static inline void writeh(uint16_t val, uintptr_t addr) +{ + asm volatile("sh %0, 0(%1)" + : + : "r"(val), "r"((volatile uint16_t *)addr) + : "memory"); +} + +static inline void writew(uint32_t val, uintptr_t addr) +{ + asm volatile("sw %0, 0(%1)" + : + : "r"(val), "r"((volatile uint32_t *)addr) + : "memory"); +} + +static inline void writed(uint64_t val, uintptr_t addr) +{ + asm volatile("sd %0, 0(%1)" + : + : "r"(val), "r"((volatile uint64_t *)addr) + : "memory"); +} + +/* generic I/O read: load from address addr with one load instruction and return the value; the narrow variants use lbu/lhu so the unsigned result is zero-extended. NOTE(review): ld is an RV64 instruction, confirm readd is never instantiated on an RV32 build */ +static inline uint8_t readb(const uintptr_t addr) +{ + uint8_t val; + + asm volatile("lbu %0, 0(%1)" + : "=r"(val) + : "r"((const volatile uint8_t *)addr) + : "memory"); + return val; +} + +static inline uint16_t readh(const uintptr_t addr) +{ + uint16_t val; + + asm volatile("lhu %0, 0(%1)" + : "=r"(val) + : "r"((const volatile uint16_t *)addr) + : "memory"); + return val; +} + +static inline uint32_t readw(const uintptr_t addr) +{ + uint32_t val; + + asm volatile("lw %0, 0(%1)" + : "=r"(val) + : "r"((const volatile uint32_t *)addr) + : "memory"); + return val; +} + +static inline uint64_t readd(const uintptr_t addr) +{ + uint64_t val; + + asm volatile("ld %0, 0(%1)" + : "=r"(val) + : "r"((const volatile uint64_t *)addr) + : "memory"); + return val; +} +#endif diff --git a/sw/spatzBenchmarks/omptarget/main.c b/sw/spatzBenchmarks/omptarget/main.c new file mode 100644 index 00000000..e09de774 --- /dev/null +++ b/sw/spatzBenchmarks/omptarget/main.c @@ -0,0 +1,333 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include + +#include "sw_mailbox.h" +#include "io.h" + +__attribute__((optimize("O0"))) void csleep(uint32_t cycles) { // busy-wait until at least cycles ticks of the mcycle CSR have elapsed + uint32_t start = read_csr(mcycle); + while ((read_csr(mcycle) - start) < cycles) {} +} + +void snrt_putchar(char c) { // write one character to the word register at 0x3002000, then busy-wait 100000 cycles + writew(c, (uintptr_t) 0x3002000); + csleep(100000); +} + +void snrt_puthalfbyte(uint8_t halfbyte) { // print the low nibble of halfbyte as one upper-case hex digit + uint32_t ascii[16] = {48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 65, 66, 67, 68, 69, 70}; + snrt_putchar(ascii[halfbyte & 0xf]); // mask so values above 15 cannot index past the 16-entry table +} + +void snrt_putbyte(uint8_t byte) { // print byte as two hex digits, high nibble first + snrt_puthalfbyte((byte >> 4) & 0xf); + snrt_puthalfbyte(byte & 0xf); +} + +void snrt_putword(uint32_t word) { // print word as eight hex digits, most significant byte first, followed by LF and CR + for(int i = 3; i >= 0; i--) + snrt_putbyte((word >> 8 * i) & 0xff); + snrt_putchar(10); + snrt_putchar(13); +} + +//void _putchar(char character) {snrt_putchar(character);} + +//================================================================================ +// MACROS AND SETTINGS +//================================================================================ + +// set to >0 for debugging +#define DEBUG_LEVEL_OFFLOAD_MANAGER 1 + +const uint32_t active_pe = 8; + +/* MAILBOX SIGNALING */ +#define MBOX_DEVICE_READY (0x01U) +#define MBOX_DEVICE_START (0x02U) +#define MBOX_DEVICE_BUSY (0x03U) +#define MBOX_DEVICE_DONE (0x04U) +#define MBOX_DEVICE_STOP (0x0FU) +#define MBOX_DEVICE_LOGLVL (0x10U) +#define MBOX_HOST_READY (0x1000U) +#define MBOX_HOST_DONE (0x3000U) + +#define TO_RUNTIME (0x10000000U) // bypass PULP driver +#define RAB_UPDATE (0x20000000U) // handled by PULP driver +#define RAB_SWITCH (0x30000000U) // handled by PULP driver + +//================================================================================ +// TYPES +//================================================================================ + +// Shrunk gomp_team_t descriptor +typedef struct offload_rab_miss_handler_desc_s { + void (*omp_task_f)(void *arg, uint32_t argc); + void *omp_args; + void *omp_argc; + int
barrier_id; +} offload_rab_miss_handler_desc_t; + +typedef uint32_t virt_addr_t; +typedef uint32_t virt_pfn_t; + +// This struct represents a miss in the RAB Miss Hardware FIFO. +typedef struct rab_miss_t { + virt_addr_t virt_addr; + int core_id; + int cluster_id; + int intra_cluster_id; + uint8_t is_prefetch; +} rab_miss_t; + +//================================================================================ +// Data +//================================================================================ +static volatile uint32_t g_printf_mutex = 0; + +static volatile uint32_t *soc_scratch = (uint32_t *)(0x02000014); + +const uint32_t snrt_stack_size __attribute__((weak, section(".rodata"))) = 12; + +// The boot data generated along with the system RTL. +// See `hw/system/snitch_cluster/test/tb_lib.hh` for details. +struct snrt_cluster_bootdata { + uint32_t boot_addr; + uint32_t core_count; + uint32_t hartid_base; + uint32_t tcdm_start; + uint32_t tcdm_size; + uint32_t tcdm_offset; + uint64_t global_mem_start; + uint64_t global_mem_end; + uint32_t cluster_count; + uint32_t s1_quadrant_count; + uint32_t clint_base; +}; + +/** + * @brief Called by each hart before the pre-main barrier in snrt crt0 + * Core 0 clears its software interrupt and raises the cluster interrupt of every other core; each other core clears its own cluster interrupt bit. + */ +void _snrt_hier_wakeup(void) { + const uint32_t core_id = snrt_cluster_core_idx(); + + // master core wakes other cluster cores through cluster local clint + if (core_id == 0) { + // clear the interrupt from cva6 + snrt_int_sw_clear(snrt_hartid()); + // wake remaining cluster cores (unsigned shifts: a signed 1 << n is undefined once n reaches 31) + const unsigned cluster_core_num = snrt_cluster_core_num(); + snrt_int_cluster_set(~0x1u & ((1u << cluster_core_num) - 1u)); + } else { + // clear my interrupt + snrt_int_cluster_clr(1u << core_id); + } +} + +//================================================================================ +// TODO: Symbols to declare somewhere else on a merge +//================================================================================ + +
+//================================================================================ +// HERO Functions +//================================================================================ + +static void offload_rab_misses_handler(void *arg, uint32_t argc) { + (void)arg; + (void)argc; + snrt_error("unimplemented!\r\n"); + // static void offload_rab_misses_handler(uint32_t *status) { + // uint32_t *status = (uint32_t)arg; + // if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("offload_rab_misses_handler: synch @%p (0x%x)\n", status, + // *(volatile unsigned int *)status); + // do { + // handle_rab_misses(); + // } while (*((volatile uint32_t *)status) != 0xdeadbeefU); + // if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("offload_rab_misses_handler: synch @%p (0x%x)\n", status, + // *(volatile unsigned int *)status); +} + +static int gomp_offload_manager() { + const uint32_t core_id = snrt_cluster_core_idx(); + + // Init the manager (handshake between host and accelerator is here) + // gomp_init_offload_manager(); + + // FIXME For the moment we are not using the cmd sent as trigger. + // It should be used to perform the deactivation of the accelerator, + // as well as other operations, like local data allocation or movement. + // FIXME Note that the offload at the moment uses the mailbox several times. + // We should compact the offload descriptor and just send a pointer to + // that descriptor.
+ uint32_t cmd = (uint32_t)NULL; + uint32_t data; + + // Offloaded function pointer and arguments + void (*offloadFn)(uint64_t) = NULL; + uint64_t offloadArgs = 0x0; + unsigned nbOffloadRabMissHandlers = 0x0; + uint32_t offload_rab_miss_sync = 0x0U; + // offload_rab_miss_handler_desc_t rab_miss_handler = {.omp_task_f = offload_rab_misses_handler, + // .omp_args = (void *)&offload_rab_miss_sync, + // .omp_argc = 1, + // .barrier_id = -1}; + + int cycles = 0; + uint32_t issue_fpu, dma_busy; + rab_miss_t rab_miss; + // reset_vmm(); + + while (1) { + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("Waiting for command...\n"); + + // (1) Wait for the offload trigger cmd == MBOX_DEVICE_START + mailbox_read((unsigned int *)&cmd, 1); + if (MBOX_DEVICE_STOP == cmd) { + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("Got MBOX_DEVICE_STOP from host, stopping execution now.\n"); + break; + } else if (MBOX_DEVICE_LOGLVL == cmd) { + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("Got command 0x%x, setting log level.\n", cmd); + mailbox_read((unsigned int *)&data, 1); + //snrt_debug_set_loglevel(data); + continue; + } else if (MBOX_DEVICE_START != cmd) { + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("Got unexpected command 0x%x, stopping execution now.\n", cmd); + break; + } + + // (2) The host sends through the mailbox the pointer to the function that should be + // executed on the accelerator. + mailbox_read((unsigned int *)&offloadFn, 1); + + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("tgt_fn @ 0x%x\n", (unsigned int)offloadFn); + + // (3) The host sends through the mailbox the pointer to the arguments that should + // be used. 
+ mailbox_read((unsigned int *)&offloadArgs, 1); + + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("tgt_vars @ 0x%x\n", (unsigned int)offloadArgs); + + // (3b) The host sends through the mailbox the number of rab misses handlers threads + mailbox_read((unsigned int *)&nbOffloadRabMissHandlers, 1); + + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("nbOffloadRabMissHandlers %d/%d\n", nbOffloadRabMissHandlers, active_pe); + + // (3c) Spawning nbOffloadRabMissHandlers + unsigned mhCoreMask = 0; + nbOffloadRabMissHandlers = + nbOffloadRabMissHandlers < active_pe - 1 ? nbOffloadRabMissHandlers : active_pe - 1; + if (nbOffloadRabMissHandlers) { + offload_rab_miss_sync = 0x0U; + for (int pid = active_pe - 1, i = nbOffloadRabMissHandlers; i > 0; i--, pid--) { + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("enabling RAB miss handler on %d\n", pid); + mhCoreMask |= (1 << pid); + } + } + omp_getData()->maxThreads = active_pe - nbOffloadRabMissHandlers; + omp_getData()->numThreads = active_pe - nbOffloadRabMissHandlers; + // eu_dispatch_team_config(mhCoreMask); + // eu_dispatch_push((unsigned int)&offload_rab_misses_handler); + // eu_dispatch_push((unsigned int)&offload_rab_miss_sync); + // eu_dispatch_team_config(omp_getData()->coreMask); + + // (4) Ensure access to offloadArgs. It might be in SVM. + if (offloadArgs != 0x0) { + // FIXME + // pulp_tryread((unsigned int *)offloadArgs); + } + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("begin offloading\n"); + // reset_timer(); + // start_timer(); + + //for (unsigned i = 0; i < 16; i += 2) { + // snrt_trace(" %2d: 0x%08x = ... ; %2d: 0x%08x = ...\n", i, ((uint32_t *)offloadArgs)[i], + // /* *((uint32_t *)(((uint32_t *)offloadArgs)[i])) ,*/ i + 1, + // ((uint32_t *)offloadArgs)[i + 1] /*, *((uint32_t *)(((uint32_t *)offloadArgs)[i + 1]))*/ ); + //} + + // (5) Execute the offloaded function. 
+ // snrt_reset_perf_counter(SNRT_PERF_CNT0); + // snrt_reset_perf_counter(SNRT_PERF_CNT1); + // snrt_start_perf_counter(SNRT_PERF_CNT0, SNRT_PERF_CNT_ISSUE_FPU, core_id); + // snrt_start_perf_counter(SNRT_PERF_CNT1, SNRT_PERF_CNT_DMA_BUSY, core_id); + cycles = read_csr(mcycle); + + offloadFn(offloadArgs); + + cycles = read_csr(mcycle) - cycles; + // snrt_stop_perf_counter(SNRT_PERF_CNT0); + // snrt_stop_perf_counter(SNRT_PERF_CNT1); + // issue_fpu = snrt_get_perf_counter(SNRT_PERF_CNT0); + // dma_busy = snrt_get_perf_counter(SNRT_PERF_CNT1); + + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("end offloading\n"); + + // (6) Report EOC and profiling + //snrt_info("cycles: %d\r\n", cycles); + + mailbox_write(MBOX_DEVICE_DONE); + mailbox_write(cycles); + + //if (DEBUG_LEVEL_OFFLOAD_MANAGER > 0) + // snrt_trace("Kernel execution time [Snitch cycles] = %d\n", cycles); + + if (nbOffloadRabMissHandlers) { + offload_rab_miss_sync = 0xdeadbeefU; + // gomp_atomic_add_thread_pool_idle_cores(nbOffloadRabMissHandlers); + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + (void)argc; + (void)argv; + unsigned core_idx = snrt_cluster_core_idx(); + unsigned core_num = snrt_cluster_core_num(); + + + /** + * One core initializes the global data structures + */ + if (core_idx == 0) { + // read memory layout from scratch2 + g_a2h_rb = NULL; + g_h2a_mbox = (struct ring_buf *)readw(0x3000000); + g_a2h_mbox = (struct ring_buf *)readw(0x3000004); + writew(0x1, 0x40000248); + writew(0x1, 0x40000208); + writew(0x1, 0x40000348); + writew(0x1, 0x40000308); + } + + __snrt_omp_bootstrap(core_idx); + + gomp_offload_manager(); + + //snrt_trace("bye\n"); + // exit + __snrt_omp_destroy(core_idx); + snrt_hero_exit(0); + return 0; +} diff --git a/sw/spatzBenchmarks/omptarget/sw_mailbox.c b/sw/spatzBenchmarks/omptarget/sw_mailbox.c new file mode 100644 index 00000000..0072eeb2 --- /dev/null +++ b/sw/spatzBenchmarks/omptarget/sw_mailbox.c @@ -0,0 +1,86 @@ +// Copyright 2020 ETH 
Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "sw_mailbox.h" +#include "io.h" + +/*********************************************************************************** + * MACROS + ***********************************************************************************/ + +#define SYS_exit 60 +#define SYS_write 64 +#define SYS_read 63 +#define SYS_wake 1235 +#define SYS_cycle 1236 + +/*********************************************************************************** + * DATA + ***********************************************************************************/ +volatile struct ring_buf *g_a2h_rb; +volatile struct ring_buf *g_a2h_mbox; +volatile struct ring_buf *g_h2a_mbox; + +/*********************************************************************************** + * FUNCTIONS + ***********************************************************************************/ +/* Packs the syscall number and five arguments into six 64-bit words and queues them on g_a2h_rb, sleeping 100 cycles between attempts while the ring is full. Returns the number of retries, or -1 when g_a2h_rb is NULL. */ +int syscall(uint64_t which, uint64_t arg0, uint64_t arg1, uint64_t arg2, + uint64_t arg3, uint64_t arg4) { + uint64_t magic_mem[6]; + int ret; + uint32_t retries = 0; + + volatile struct ring_buf *rb = g_a2h_rb; + if (!rb) { return -1; } // main() leaves g_a2h_rb NULL; fail here rather than read a ring descriptor at address 0 + magic_mem[0] = which; + magic_mem[1] = arg0; + magic_mem[2] = arg1; + magic_mem[3] = arg2; + magic_mem[4] = arg3; + magic_mem[5] = arg4; + + do { + ret = rb_device_put(rb, (void *)magic_mem); + if (ret) { + ++retries; + csleep(100); + } + } while (ret != 0); + return retries; +} + +void snrt_hero_exit(int code) { syscall(SYS_exit, code, 0, 0, 0, 0); } + +/*********************************************************************************** + * MAILBOX + ***********************************************************************************/ + +int mailbox_try_read(uint32_t *buffer) { + return rb_device_get(g_h2a_mbox, buffer) == 0 ?
1 : 0; +} +int mailbox_read(uint32_t *buffer, size_t n_words) { + int ret; + while (n_words--) { // NOTE(review): slots are filled from buffer[n_words - 1] down to buffer[0], so the first word received lands last; confirm this matches the host's send order (the calls in main.c all pass n_words == 1) + do { + ret = rb_device_get(g_h2a_mbox, &buffer[n_words]); + if (ret) { + csleep(10); + } + } while (ret); + } + return 0; +} +int mailbox_write(uint32_t word) { + int ret; + do { + ret = rb_device_put(g_a2h_mbox, &word); + if (ret) { + csleep(10); + } + } while (ret); + return ret; // always 0: the loop only exits once rb_device_put has succeeded +} diff --git a/sw/spatzBenchmarks/omptarget/sw_mailbox.h b/sw/spatzBenchmarks/omptarget/sw_mailbox.h new file mode 100644 index 00000000..8d532e46 --- /dev/null +++ b/sw/spatzBenchmarks/omptarget/sw_mailbox.h @@ -0,0 +1,197 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include +#include +#include +#include + +/*********************************************************************************** + * MACROS + ***********************************************************************************/ + +#define MBOX_DEVICE_READY (0x01U) +#define MBOX_DEVICE_START (0x02U) +#define MBOX_DEVICE_BUSY (0x03U) +#define MBOX_DEVICE_DONE (0x04U) +#define MBOX_DEVICE_STOP (0x0FU) +#define MBOX_DEVICE_LOGLVL (0x10U) +#define MBOX_HOST_READY (0x1000U) +#define MBOX_HOST_DONE (0x3000U) + +#define SYS_exit 60 +#define SYS_write 64 +#define SYS_read 63 +#define SYS_wake 1235 +#define SYS_cycle 1236 + +/*********************************************************************************** + * TYPES + ***********************************************************************************/ + +/** + * @brief Ring buffer for simple communication from accelerator to host. + * @tail: Points to the element in `data` which is read next + * @head: Points to the element in `data` which is written next + * @size: Number of elements in `data`.
Head and tail pointer wrap at `size` + * @element_size: Size of each element in bytes + * @data_p: points to the base of the data buffer in physical address + * @data_v: points to the base of the data buffer in virtual address space + */ +struct ring_buf { + uint32_t head; + uint32_t size; + uint32_t tail; + uint32_t element_size; + uint64_t data_v; + uint64_t data_p; +}; + + +/*********************************************************************************** + * DATA + ***********************************************************************************/ +extern volatile struct ring_buf *g_a2h_rb; +extern volatile struct ring_buf *g_a2h_mbox; +extern volatile struct ring_buf *g_h2a_mbox; + +/*********************************************************************************** + * INLINES + ***********************************************************************************/ +/* Debug helper: prints the raw bytes of *rbuf, eight per row, then the address and value of head, size, tail, data_p and data_v. Arguments are cast to match their printf conversions. */ +static inline void dump_mbox(struct ring_buf *rbuf) { + printf("---DUMPING NOW---\n\r"); + printf("mbox (%p)\n\r", (void *)rbuf); + uint8_t* addr = (uint8_t *)rbuf; + for(size_t i = 0; i < sizeof(struct ring_buf); i++) { + if(i % 8 == 0) + printf("\n\r(%p) : ", (void *)addr); + printf("%x-", (unsigned)*(addr++)); + } + printf("\n\r"); + printf("head : %p = %u\n\r" , (void *)&rbuf->head , (unsigned)rbuf->head ); + printf("size : %p = %u\n\r" , (void *)&rbuf->size , (unsigned)rbuf->size ); + printf("tail : %p = %u\n\r" , (void *)&rbuf->tail , (unsigned)rbuf->tail ); + printf("data_p : %p = %llx\n\r", (void *)&rbuf->data_p , (unsigned long long)rbuf->data_p ); + printf("data_v : %p = %llx\n\r", (void *)&rbuf->data_v , (unsigned long long)rbuf->data_v ); + //printf("tail %u, data_v %" PRIu64 ", element_size %u, size %u, data_p %" PRIu64 ", head %u\n\r", rbuf->tail, rbuf->data_v, rbuf->element_size, rbuf->size, rbuf->data_p, rbuf->head); + printf("---DUMPING ENDS---\n\r"); +} + +/** + * @brief Copy data from `el` in the next free slot in the ring-buffer on the + *physical addresses + * + * @param rb pointer to the ring buffer struct + * @param el pointer to the data to be copied into the ring buffer + * @return int 0 on success, -1 if the buffer
is full + */ +static inline int rb_device_put(volatile struct ring_buf *rb, void *el) { + uint32_t next_head = (rb->head + 1) % rb->size; + // caught the tail, can't put data + if (next_head == rb->tail) + return -1; + for (uint32_t i = 0; i < rb->element_size; i++) + *((uint8_t *)rb->data_p + rb->element_size *rb->head + i) = + *((uint8_t *)el + i); + rb->head = next_head; + return 0; +} +/** + * @brief Pop element from ring buffer on virtual addresses + * + * @param rb pointer to ring buffer struct + * @param el pointer to where element is copied to + * @return 0 on success, -1 if no element could be popped + */ +static inline int rb_host_get(volatile struct ring_buf *rb, void *el) { + // caught the head, can't get data + if (rb->tail == rb->head) + return -1; + for (uint32_t i = 0; i < rb->element_size; i++) + *((uint8_t *)el + i) = + *((uint8_t *)rb->data_v + rb->element_size * rb->tail + i); + rb->tail = (rb->tail + 1) % rb->size; + return 0; +} + +/** + * @brief Copy data from `el` in the next free slot in the ring-buffer on the + *virtual addresses + * + * @param rb pointer to the ring buffer struct + * @param el pointer to the data to be copied into the ring buffer + * @return int 0 on succes, -1 if the buffer is full + */ +static inline int rb_host_put(volatile struct ring_buf *rb, void *el) { + uint32_t next_head = (rb->head + 1) % rb->size; + // caught the tail, can't put data + if (next_head == rb->tail) + return -1; + for (uint32_t i = 0; i < rb->element_size; i++) + *((uint8_t *)rb->data_v + rb->element_size *rb->head + i) = + *((uint8_t *)el + i); + rb->head = next_head; + return 0; +} +/** + * @brief Pop element from ring buffer on physicl addresses + * + * @param rb pointer to ring buffer struct + * @param el pointer to where element is copied to + * @return 0 on success, -1 if no element could be popped + */ +static inline int rb_device_get(volatile struct ring_buf *rb, void *el) { + // caught the head, can't get data + if (rb->tail == rb->head) + 
return -1; + for (uint32_t i = 0; i < rb->element_size; i++) + *((uint8_t *)el + i) = + *((uint8_t *)rb->data_p + rb->element_size * rb->tail + i); + rb->tail = (rb->tail + 1) % rb->size; + return 0; +} +/** + * @brief Init the ring buffer. See `struct ring_buf` for details. Resets head and tail to 0; size and element_size are stored into 32-bit fields, so larger values are truncated. data_p and data_v are not touched. + */ +static inline void rb_init(volatile struct ring_buf *rb, uint64_t size, + uint64_t element_size) { + rb->tail = 0; + rb->head = 0; + rb->size = size; + rb->element_size = element_size; +} + +/** + * @brief Holds physical addresses of the shared L3 + * @a2h_rb: accelerator to host ring buffer + * @heap: base of heap memory + */ +struct l3_layout { + uint32_t a2h_rb; + uint32_t a2h_mbox; + uint32_t h2a_mbox; + uint32_t heap; +}; + +/*********************************************************************************** + * PUBLICS + ***********************************************************************************/ +int syscall(uint64_t which, uint64_t arg0, uint64_t arg1, uint64_t arg2, + uint64_t arg3, uint64_t arg4); +void csleep(uint32_t cycles); +void snrt_hero_exit(int code); +/** + * @brief Blocking mailbox read access + */ +int mailbox_read(uint32_t *buffer, size_t n_words); +/** + * @brief Non-Blocking mailbox read access. Return 1 on success, 0 on fail + */ +int mailbox_try_read(uint32_t *buffer); +/** + * @brief Blocking mailbox write access + */ +int mailbox_write(uint32_t word);