diff --git a/target/rtl/bootrom/src/bootrom.c b/target/rtl/bootrom/src/bootrom.c index 390c60c5..1cbe6970 100644 --- a/target/rtl/bootrom/src/bootrom.c +++ b/target/rtl/bootrom/src/bootrom.c @@ -33,6 +33,7 @@ void delay_cycles(uint64_t cycle) { // Boot modes. enum boot_mode_t { + HALT, TARGET_CHIPID, UART, COPY_TO_REMOTE, @@ -67,16 +68,17 @@ void bootrom() { print_u8(address_prefix, target_chip_id); print_str(address_prefix, "\r\n\t Enter the number to select the mode: "); - print_str(address_prefix, "\r\n\t 1. Change the target remote Chip ID"); - print_str(address_prefix, "\r\n\t 2. Load from UART to 0x"); + print_str(address_prefix, "\r\n\t 1. Halt the CVA6 Core"); + print_str(address_prefix, "\r\n\t 2. Change the target remote Chip ID"); + print_str(address_prefix, "\r\n\t 3. Load from UART to 0x"); print_u48(address_prefix, remote_chip_mem_start_address); print_str(address_prefix, - "\r\n\t 3. Copy memory from local chip to remote chip"); + "\r\n\t 4. Copy memory from local chip to remote chip"); print_str(address_prefix, - "\r\n\t 4. Copy memory from remote chip to local chip"); - print_str(address_prefix, "\r\n\t 5. Print memory from 0x"); + "\r\n\t 5. Copy memory from remote chip to local chip"); + print_str(address_prefix, "\r\n\t 6. Print memory from 0x"); print_u48(address_prefix, remote_chip_mem_start_address); - print_str(address_prefix, "\r\n\t 6. Continue to Boot from 0x"); + print_str(address_prefix, "\r\n\t 7. Continue to Boot from 0x"); print_u48(address_prefix, local_chip_mem_start_address); print_str(address_prefix, "\r\n"); @@ -85,6 +87,11 @@ void bootrom() { char* cur = 0; switch (boot_mode) { + case HALT: + print_str(address_prefix, "\r\n\t CVA6 Core is Halted. 
"); + getchar(address_prefix); + __asm__ volatile("wfi"); + break; case TARGET_CHIPID: print_str(address_prefix, "\r\n\t Enter the target remote Chip ID: "); @@ -92,12 +99,12 @@ void bootrom() { cur = in_buf; target_chip_id = 0; while (*cur != '\0') { - if (*cur >= '0' || *cur <= '9') { + if (*cur >= '0' && *cur <= '9') { target_chip_id = (target_chip_id << 4) + *cur - '0'; - } else if (*cur >= 'A' || *cur <= 'F') { + } else if (*cur >= 'A' && *cur <= 'F') { target_chip_id = (target_chip_id << 4) + *cur - 'A' + 10; - } else if (*cur >= 'a' || *cur <= 'f') { + } else if (*cur >= 'a' && *cur <= 'f') { target_chip_id = (target_chip_id << 4) + *cur - 'a' + 10; } else { diff --git a/target/sim/sw/device/apps/common.mk b/target/sim/sw/device/apps/common.mk index dd5de1c0..5d94fcf9 100644 --- a/target/sim/sw/device/apps/common.mk +++ b/target/sim/sw/device/apps/common.mk @@ -47,7 +47,6 @@ INCDIRS += $(SNRT_DIR)/../math/include BASE_LD = $(abspath $(SNRT_DIR)/base.ld) MEMORY_LD = $(abspath $(APPSDIR)/memory.ld) ORIGIN_LD = $(abspath $(BUILDDIR)/origin.ld) -BASE_LD = $(abspath $(SNRT_DIR)/base.ld) SNRT_LIB_DIR = $(abspath $(RUNTIME_DIR)/build/) SNRT_LIB_NAME = snRuntime SNRT_LIB = $(realpath $(SNRT_LIB_DIR)/lib$(SNRT_LIB_NAME).a) diff --git a/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c b/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c index 457f7511..d50feffa 100644 --- a/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c +++ b/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c @@ -26,7 +26,7 @@ int main() { if (snrt_is_dm_core()) { tcdm0_start_addr = (uint64_t)snrt_cluster_base_addrl(); tcdm0_start_addr += (uint64_t)snrt_cluster_base_addrh() << 32; - printf("The C0 TCDM ADDR is %p%p \n", + printf("The C0 TCDM ADDR is %p%p \r\n", (uint8_t*)(tcdm0_start_addr >> 32), (uint8_t*)tcdm0_start_addr); } @@ -37,7 +37,7 @@ int main() { if 
(snrt_is_dm_core()) { tcdm1_start_addr = (uint64_t)snrt_cluster_base_addrl(); tcdm1_start_addr += (uint64_t)snrt_cluster_base_addrh() << 32; - printf("The C1 TCDM ADDR is %p%p \n", + printf("The C1 TCDM ADDR is %p%p \r\n", (uint8_t*)(tcdm1_start_addr >> 32), (uint8_t*)tcdm1_start_addr); } @@ -48,7 +48,7 @@ int main() { if (snrt_is_dm_core()) { test_data_start_addr = (uint64_t)test_data; test_data_start_addr += (uint64_t)snrt_cluster_base_addrh() << 32; - printf("[C0] Start to load data from %p%p \n", + printf("[C0] Start to load data from %p%p \r\n", (uint8_t*)(test_data_start_addr >> 32), (uint8_t*)test_data_start_addr); snrt_dma_start_1d_wideptr(tcdm0_start_addr, test_data_start_addr, @@ -62,7 +62,7 @@ int main() { // Thenc C1 fetches data from C0 if (snrt_cluster_idx() == 1) { if (snrt_is_dm_core()) { - printf("[C1] Start to load data from %p%p \n", + printf("[C1] Start to load data from %p%p \r\n", (uint8_t*)(tcdm0_start_addr >> 32), (uint8_t*)tcdm0_start_addr); snrt_dma_start_1d_wideptr(tcdm1_start_addr, tcdm0_start_addr, @@ -76,12 +76,12 @@ int main() { // Start to check if (snrt_cluster_idx() == 0) { if (snrt_cluster_core_idx() == 0) { - printf("C0 Checking the results\n"); + printf("C0 Checking the results\r\n"); for (int i = 0; i < length_data; i++) { if (((int8_t*)tcdm0_start_addr)[i] != test_data[i]) { err++; - printf("C0 data is incorrect!\n"); - printf("tcdm0[%d]=%d, test_data[%d]=%d\n", i, + printf("C0 data is incorrect!\r\n"); + printf("tcdm0[%d]=%d, test_data[%d]=%d\r\n", i, ((int8_t*)tcdm0_start_addr)[i], i, test_data[i]); return -1; } @@ -91,12 +91,12 @@ int main() { snrt_global_barrier(); if (snrt_cluster_idx() == 1) { if (snrt_cluster_core_idx() == 0) { - printf("C1 Checking the results\n"); + printf("C1 Checking the results\r\n"); for (int i = 0; i < length_data; i++) { if (((int8_t*)tcdm1_start_addr)[i] != test_data[i]) { err++; - printf("C1 data is incorrect!\n"); - printf("tcdm0[%d]=%d, test_data[%d]=%d\n", i, + printf("C1 data is 
incorrect!\r\n"); + printf("tcdm0[%d]=%d, test_data[%d]=%d\r\n", i, ((int8_t*)tcdm1_start_addr)[i], i, test_data[i]); return -1; } @@ -107,7 +107,7 @@ int main() { snrt_global_barrier(); if (snrt_cluster_idx() == 0) { if (snrt_is_dm_core()) { - printf("Checking all done! No error!\n"); + printf("Checking all done! No error!\r\n"); } } diff --git a/target/sim/sw/device/runtime/src/occamy_device.h b/target/sim/sw/device/runtime/src/occamy_device.h index dcc1fbde..b769e8f4 100644 --- a/target/sim/sw/device/runtime/src/occamy_device.h +++ b/target/sim/sw/device/runtime/src/occamy_device.h @@ -55,14 +55,17 @@ inline void return_to_cva6(sync_t sync) { if (cnt == snrt_cluster_num()) { #endif *((volatile uint32_t*)barrier_ptr) = 0; - // Interrupt the local host to signal the exit code (snitch by default only has the access to local domain) - set_host_sw_interrupt(0); + // Interrupt the local host to signal the exit code (snitch by + // default only has the access to local domain) + comm_buffer_t* comm_buffer = get_communication_buffer(); + set_host_sw_interrupt(comm_buffer->chip_id); } } } // Otherwise assume cores are already synchronized and only // one core calls this function else { - set_host_sw_interrupt(0); + comm_buffer_t* comm_buffer = get_communication_buffer(); + set_host_sw_interrupt(comm_buffer->chip_id); } } diff --git a/target/sim/sw/device/runtime/src/occamy_start.c b/target/sim/sw/device/runtime/src/occamy_start.c index 00fca72b..e8c33381 100644 --- a/target/sim/sw/device/runtime/src/occamy_start.c +++ b/target/sim/sw/device/runtime/src/occamy_start.c @@ -26,7 +26,10 @@ static inline void snrt_exit_default(int exit_code); static inline void snrt_exit(int exit_code) { snrt_exit_default(exit_code); // Interrupt the local host to signal the exit code (snitch by default only has the access to local domain) - if (snrt_global_core_idx() == 0) set_host_sw_interrupt(0); + if (snrt_global_core_idx() == 0) { + comm_buffer_t* comm_buffer = get_communication_buffer(); 
+ set_host_sw_interrupt(comm_buffer->chip_id); + } } #include "start.c" diff --git a/target/sim/sw/host/Makefile b/target/sim/sw/host/Makefile index 21728704..35f09d05 100644 --- a/target/sim/sw/host/Makefile +++ b/target/sim/sw/host/Makefile @@ -6,7 +6,13 @@ # Add user applications to APPS variable APPS = hello_world +ifneq ($(findstring chiplet,$(CFG_OVERRIDE)),) +# If chiplet cfg is used, offload_multichip is compiled with the support to execute applications on cores at different chips +APPS += offload_multichip +else +# Otherwise, simple offload is compiled, which is mainly used for ci. APPS += offload +endif TARGET ?= all diff --git a/target/sim/sw/host/apps/common.mk b/target/sim/sw/host/apps/common.mk index 0d1a251d..cbfc3eec 100644 --- a/target/sim/sw/host/apps/common.mk +++ b/target/sim/sw/host/apps/common.mk @@ -92,7 +92,7 @@ finalize-build: $(FINAL_OUTPUTS) .PHONY: clean clean: rm -rf $(BUILDDIR) - rm -f $(OFFSET_LD) + rm -f $(ORIGIN_LD) $(BUILDDIR): mkdir -p $@ @@ -106,6 +106,7 @@ $(DEP): $(SRCS) | $(BUILDDIR) # Partially linked object $(PARTIAL_ELF): $(DEP) $(LD_SRCS) | $(BUILDDIR) + rm -f $(ORIGIN_LD) $(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@ $(PARTIAL_DUMP): $(PARTIAL_ELF) | $(BUILDDIR) diff --git a/target/sim/sw/host/apps/hello_world/src/hello_world.c b/target/sim/sw/host/apps/hello_world/src/hello_world.c index 377f6c8f..49327073 100644 --- a/target/sim/sw/host/apps/hello_world/src/hello_world.c +++ b/target/sim/sw/host/apps/hello_world/src/hello_world.c @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "chip_id.h" -#include "host.c" +#include "host.h" // Frequency at which the UART peripheral is clocked #define PERIPH_FREQ 50000000 diff --git a/target/sim/sw/host/apps/offload/Makefile b/target/sim/sw/host/apps/offload/Makefile index 38eeb69d..63b434f1 100644 --- a/target/sim/sw/host/apps/offload/Makefile +++ b/target/sim/sw/host/apps/offload/Makefile @@ -102,7 +102,7 @@ finalize-build: $(FINAL_OUTPUTS) 
.PHONY: clean clean: rm -rf $(BUILDDIR) - rm -f $(OFFSET_LD) + rm -f $(ORIGIN_LD) $(BUILDDIR): mkdir -p $@ @@ -124,6 +124,7 @@ $(PARTIAL_DUMP): $(PARTIAL_ELF) | $(BUILDDIR) $(RISCV_OBJDUMP) -D $< > $@ # Device object relocation address +.PHONY: $(DEVICE_DIR)/apps/%/build/origin.ld $(DEVICE_DIR)/apps/%/build/origin.ld: $(PARTIAL_ELF) | $(DEVICE_DIR)/apps/%/build @RELOC_ADDR=$$($(RISCV_OBJDUMP) -t $< | grep snitch_main | cut -c9-16); \ echo "Writing device object relocation address 0x$$RELOC_ADDR to $@"; \ diff --git a/target/sim/sw/host/apps/offload/src/offload.c b/target/sim/sw/host/apps/offload/src/offload.c index 7f8b9c6d..fb56de87 100644 --- a/target/sim/sw/host/apps/offload/src/offload.c +++ b/target/sim/sw/host/apps/offload/src/offload.c @@ -2,42 +2,60 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -#include "host.c" +#include "host.h" + +// Global Variables for communication buffer +volatile comm_buffer_t* comm_buffer_ptr = (comm_buffer_t*)0; int main() { // Reset and ungate all quadrants, deisolate - uintptr_t address_prefix = (uintptr_t)get_current_chip_baseaddress(); - uint32_t chip_id = get_current_chip_id(); - - init_uart(address_prefix, 50000000, 1000000); - print_str(address_prefix, "[Occamy] The Offload main function \r\n"); - print_str(address_prefix, "[Occamy] Current Chip ID is: "); - print_u8(address_prefix, chip_id); - print_str(address_prefix, "\r\n"); - reset_and_ungate_quadrants_all(chip_id); - print_str(address_prefix, "[Occamy] Snitch ungated. \r\n"); - deisolate_all(chip_id); - print_str(address_prefix, "[Occamy] Snitch deisolated. 
\r\n"); - // Enable interrupts to receive notice of job termination + uintptr_t current_chip_address_prefix = + (uintptr_t)get_current_chip_baseaddress(); + uint32_t current_chip_id = get_current_chip_id(); + + init_uart(current_chip_address_prefix, 50000000, 1000000); + print_str(current_chip_address_prefix, + "[Occamy] The Offload main function \r\n"); + print_str(current_chip_address_prefix, "[Occamy] Current Chip ID is: "); + print_u8(current_chip_address_prefix, current_chip_id); + print_str(current_chip_address_prefix, "\r\n"); + + comm_buffer_ptr = (comm_buffer_t*)(((uint64_t)&__narrow_spm_start) | + current_chip_address_prefix); + + // print_str(current_chip_address_prefix, + // "[Occamy] Snitch Communication Buffer is: "); + // print_u48(current_chip_address_prefix, (uint64_t)comm_buffer_ptr); + // print_str(current_chip_address_prefix, "\r\n"); + reset_and_ungate_quadrants_all(current_chip_id); + // print_str(current_chip_address_prefix, "[Occamy] Snitch ungated. \r\n"); + deisolate_all(current_chip_id); + // print_str(current_chip_address_prefix, "[Occamy] Snitch deisolated. + // \r\n"); Enable interrupts to receive notice of job termination enable_sw_interrupts(); // Program Snitch entry point and communication buffer - program_snitches(chip_id); - print_str(address_prefix, "[Occamy] Snitch Jump Address Programmed. \r\n"); + comm_buffer_ptr->lock = 0; + comm_buffer_ptr->chip_id = current_chip_id; + program_snitches(current_chip_id, comm_buffer_ptr); + // print_str(current_chip_address_prefix, + // "[Occamy] Snitch Jump Address Programmed. 
\r\n"); // Compiler fence to ensure Snitch entry point is // programmed before Snitches are woken up - asm volatile("" ::: "memory"); + asm volatile("fence.i" ::: "memory"); - print_str(address_prefix, "[Occamy] Calling snitch cluster to execute the task \r\n"); + print_str(current_chip_address_prefix, + "[Occamy] Calling snitch cluster to execute the task \r\n"); // Start Snitches - wakeup_snitches_cl(chip_id); + wakeup_snitches_cl(current_chip_id); - int ret = wait_snitches_done(chip_id); + int ret = wait_snitches_done(current_chip_id); - print_str(address_prefix, "[Occamy] Snitch cluster done with exit code "); - print_u32(address_prefix, ret); - print_str(address_prefix, "\r\n"); + print_str(current_chip_address_prefix, + "[Occamy] Snitch cluster done with exit code "); + print_u32(current_chip_address_prefix, ret); + print_str(current_chip_address_prefix, "\r\n"); // Wait for job done and return Snitch exit code return ret; diff --git a/target/sim/sw/host/apps/offload_multichip/Makefile b/target/sim/sw/host/apps/offload_multichip/Makefile new file mode 100644 index 00000000..bd355e81 --- /dev/null +++ b/target/sim/sw/host/apps/offload_multichip/Makefile @@ -0,0 +1,153 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +###################### +# Invocation options # +###################### + +DEBUG ?= OFF # ON to turn on debugging symbols + +################### +# Build variables # +################### + +APP = offload_multichip +SRCS = src/offload_multichip.c +# DEVICE_APPS += $(shell cd ../../../device/apps/;find -mindepth 2 -maxdepth 2 -type d | sed 's|^\./||') + +# Compiler toolchain +CVA6_GCC_ROOT = /tools/riscv/bin +RISCV_CC = $(CVA6_GCC_ROOT)/riscv64-unknown-elf-gcc +RISCV_OBJCOPY = $(CVA6_GCC_ROOT)/riscv64-unknown-elf-objcopy +RISCV_OBJDUMP = $(CVA6_GCC_ROOT)/riscv64-unknown-elf-objdump +RISCV_READELF = $(CVA6_GCC_ROOT)/riscv64-unknown-elf-readelf + +# Directories +BUILDDIR = $(abspath build) +HOST_DIR = $(abspath ../../) +RUNTIME_DIR = $(abspath $(HOST_DIR)/runtime) +DEVICE_DIR = $(abspath $(HOST_DIR)/../device) + +# now we only include the snax app +DEVICE_APPS += snax/snax-gemmx-matmul +DEVICE_APPS += snax/snax-gemmx-conv +DEVICE_APPS += snax/snax-test-integration +DEVICE_APPS += snax/snax-hypercorex-test-csr +DEVICE_APPS += snax/snax-hypercorex-char-recog +DEVICE_APPS += snax/snax-xdma-maxpool +DEVICE_APPS += snax/snax-xdma-memset + +# Dependencies +INCDIRS += $(RUNTIME_DIR) +INCDIRS += $(HOST_DIR)/../shared/platform/generated +INCDIRS += $(HOST_DIR)/../shared/runtime +SRCS += $(RUNTIME_DIR)/start.S + +# Compiler flags +RISCV_CFLAGS += $(addprefix -I,$(INCDIRS)) +RISCV_CFLAGS += -march=rv64imafdc +RISCV_CFLAGS += -mabi=lp64d +RISCV_CFLAGS += -mcmodel=medany +RISCV_CFLAGS += -ffast-math +RISCV_CFLAGS += -fno-builtin-printf +RISCV_CFLAGS += -fno-common +RISCV_CFLAGS += -O3 +RISCV_CFLAGS += -ffunction-sections +RISCV_CFLAGS += -Wextra +RISCV_CFLAGS += -Werror +ifeq ($(DEBUG), ON) +RISCV_CFLAGS += -g +endif + +# Linking sources +LINKER_SCRIPT = $(abspath $(HOST_DIR)/runtime/host.ld) +LD_SRCS = $(LINKER_SCRIPT) + +# Linker flags +RISCV_LDFLAGS += -nostartfiles +RISCV_LDFLAGS += -lm +RISCV_LDFLAGS += -lgcc 
+RISCV_LDFLAGS += -T$(LINKER_SCRIPT) + +# Device binaries +DEVICE_BUILDDIRS = $(addsuffix /build, $(addprefix $(DEVICE_DIR)/apps/, $(DEVICE_APPS))) + +########### +# Outputs # +########### + +PARTIAL_ELF = $(abspath $(BUILDDIR)/$(APP).part.elf) +DEP = $(abspath $(BUILDDIR)/$(APP).d) +PARTIAL_DUMP = $(abspath $(BUILDDIR)/$(APP).part.dump) +LD_ORIGINS = $(abspath $(addsuffix /origin.ld, $(DEVICE_BUILDDIRS))) +ELFS = $(abspath $(addsuffix .elf, $(addprefix $(BUILDDIR)/$(APP)-, $(notdir $(DEVICE_APPS))))) +DUMPS = $(abspath $(addsuffix .dump, $(addprefix $(BUILDDIR)/$(APP)-, $(notdir $(DEVICE_APPS))))) +DWARFS = $(abspath $(addsuffix .dwarf, $(addprefix $(BUILDDIR)/$(APP)-, $(notdir $(DEVICE_APPS))))) +PARTIAL_OUTPUTS = $(PARTIAL_ELF) $(PARTIAL_DUMP) $(LD_ORIGINS) +FINAL_OUTPUTS = $(ELFS) $(DUMPS) $(DWARFS) + +######### +# Rules # +######### + +.PHONY: partial-build +partial-build: $(PARTIAL_OUTPUTS) + +.PHONY: finalize-build +finalize-build: $(FINAL_OUTPUTS) + +.PHONY: clean +clean: + rm -rf $(BUILDDIR) + rm -f $(ORIGIN_LD) + +$(BUILDDIR): + mkdir -p $@ + +$(DEVICE_BUILDDIRS): + mkdir -p $@ + +$(DEP): $(SRCS) | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(PARTIAL_ELF)' $< > $@ + for elf in $(ELFS); do \ + $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$$elf' $< >> $@; \ + done + +# Partially linked object +$(PARTIAL_ELF): $(DEP) $(LD_SRCS) | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) $(RISCV_LDFLAGS) $(SRCS) -o $@ + +$(PARTIAL_DUMP): $(PARTIAL_ELF) | $(BUILDDIR) + $(RISCV_OBJDUMP) -D $< > $@ + +# Device object relocation address +.PHONY: $(DEVICE_DIR)/apps/%/build/origin.ld +$(DEVICE_DIR)/apps/%/build/origin.ld: $(PARTIAL_ELF) | $(DEVICE_DIR)/apps/%/build + @RELOC_ADDR=$$($(RISCV_OBJDUMP) -t $< | grep snitch_main | cut -c9-16); \ + echo "Writing device object relocation address 0x$$RELOC_ADDR to $@"; \ + echo "L3_ORIGIN = 0x$$RELOC_ADDR;" > $@ + +# Generates a rule which looks somewhat like: +# +# $(BUILDDIR)/$(APP)-%.elf: $(DEVICE_DIR)/apps/%/build/%.bin $(DEP) 
$(LD_SRCS) | $(BUILDDIR) +# $(RISCV_CC) $(RISCV_CFLAGS) -DDEVICEBIN=\"$<\" $(RISCV_LDFLAGS) $(SRCS) -o $@ +# +# This approach is required cause you can't use multiple %-signs in a prerequisite +define elf_rule_template = + $$(BUILDDIR)/$$(APP)-$(notdir $(1)).elf: $$(DEVICE_DIR)/apps/$(1)/build/$(notdir $(1)).bin $$(DEP) $$(LD_SRCS) | $$(BUILDDIR) + $$(RISCV_CC) $$(RISCV_CFLAGS) -DDEVICEBIN=\"$$<\" $$(RISCV_LDFLAGS) $$(SRCS) -o $$@ +endef +$(foreach f,$(DEVICE_APPS),$(eval $(call elf_rule_template,$(f)))) + +$(BUILDDIR)/$(APP)-%.dump: $(BUILDDIR)/$(APP)-%.elf | $(BUILDDIR) + $(RISCV_OBJDUMP) -D $< > $@ + +$(BUILDDIR)/$(APP)-%.dwarf: $(BUILDDIR)/$(APP)-%.elf | $(BUILDDIR) + $(RISCV_READELF) --debug-dump $< > $@ + +ifneq ($(MAKECMDGOALS),clean) +-include $(DEP) +endif \ No newline at end of file diff --git a/target/sim/sw/host/apps/offload_multichip/src/offload_multichip.c b/target/sim/sw/host/apps/offload_multichip/src/offload_multichip.c new file mode 100644 index 00000000..e2f7af1b --- /dev/null +++ b/target/sim/sw/host/apps/offload_multichip/src/offload_multichip.c @@ -0,0 +1,87 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "host.h" + +// Global Variables for communication buffer +volatile comm_buffer_t* comm_buffer_ptr = (comm_buffer_t*)0; + +int main() { + // Reset and ungate all quadrants, deisolate + uintptr_t current_chip_address_prefix = + (uintptr_t)get_current_chip_baseaddress(); + uint32_t current_chip_id = get_current_chip_id(); + uint32_t target_chip_id = 0; + char in_buf[8]; + + init_uart(current_chip_address_prefix, 50000000, 1000000); + print_str(current_chip_address_prefix, + "[Occamy] The Offload main function \r\n"); + print_str(current_chip_address_prefix, "[Occamy] Current Chip ID is: "); + print_u8(current_chip_address_prefix, current_chip_id); + print_str(current_chip_address_prefix, "\r\n"); + print_str(current_chip_address_prefix, "[Occamy] Enter target Chip ID: "); + scan_str(current_chip_address_prefix, in_buf); + print_str(current_chip_address_prefix, "\r\n"); + + char* cur = in_buf; + + while (*cur != '\0') { + if (*cur >= '0' && *cur <= '9') { + target_chip_id = (target_chip_id << 4) + *cur - '0'; + } else if (*cur >= 'A' && *cur <= 'F') { + target_chip_id = (target_chip_id << 4) + *cur - 'A' + 10; + } else if (*cur >= 'a' && *cur <= 'f') { + target_chip_id = (target_chip_id << 4) + *cur - 'a' + 10; + } else { + print_str(current_chip_address_prefix, + "[Occamy] Invalid target chip ID. \r\n"); + scan_char(current_chip_address_prefix); + break; + } + cur++; + } + + uintptr_t target_chip_address_prefix = + (uintptr_t)get_chip_baseaddress(target_chip_id); + comm_buffer_ptr = (comm_buffer_t*)(((uint64_t)&__narrow_spm_start) | + target_chip_address_prefix); + + // print_str(current_chip_address_prefix, + // "[Occamy] Snitch Communication Buffer is: "); + // print_u48(current_chip_address_prefix, (uint64_t)comm_buffer_ptr); + // print_str(current_chip_address_prefix, "\r\n"); + reset_and_ungate_quadrants_all(target_chip_id); + // print_str(current_chip_address_prefix, "[Occamy] Snitch ungated. 
\r\n"); + deisolate_all(target_chip_id); + // print_str(current_chip_address_prefix, "[Occamy] Snitch deisolated. + // \r\n"); Enable interrupts to receive notice of job termination + enable_sw_interrupts(); + // Program Snitch entry point and communication buffer + comm_buffer_ptr->lock = 0; + comm_buffer_ptr->chip_id = current_chip_id; + program_snitches(target_chip_id, comm_buffer_ptr); + // print_str(current_chip_address_prefix, + // "[Occamy] Snitch Jump Address Programmed. \r\n"); + + // Compiler fence to ensure Snitch entry point is + // programmed before Snitches are woken up + asm volatile("fence.i" ::: "memory"); + + print_str(current_chip_address_prefix, + "[Occamy] Calling snitch cluster to execute the task \r\n"); + + // Start Snitches + wakeup_snitches_cl(target_chip_id); + + int ret = wait_snitches_done(target_chip_id); + + print_str(current_chip_address_prefix, + "[Occamy] Snitch cluster done with exit code "); + print_u32(current_chip_address_prefix, ret); + print_str(current_chip_address_prefix, "\r\n"); + + // Wait for job done and return Snitch exit code + return ret; +} diff --git a/target/sim/sw/host/runtime/host.c b/target/sim/sw/host/runtime/host.c deleted file mode 100644 index 46c228a8..00000000 --- a/target/sim/sw/host/runtime/host.c +++ /dev/null @@ -1,803 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -#include "host.h" -#include "chip_id.h" -#include "occamy.h" -#include "sys_dma.h" -#include "uart.h" - -#include "heterogeneous_runtime.h" - -// Handle multireg degeneration to single register -#if OCCAMY_SOC_ISOLATE_MULTIREG_COUNT == 1 -#define OCCAMY_SOC_ISOLATE_0_REG_OFFSET OCCAMY_SOC_ISOLATE_REG_OFFSET -#define OCCAMY_SOC_ISOLATE_0_ISOLATE_0_MASK OCCAMY_SOC_ISOLATE_ISOLATE_0_MASK -#endif -#if OCCAMY_SOC_ISOLATED_MULTIREG_COUNT == 1 -#define OCCAMY_SOC_ISOLATED_0_REG_OFFSET OCCAMY_SOC_ISOLATED_REG_OFFSET -#endif -#if OCCAMY_SOC_SCRATCH_MULTIREG_COUNT == 1 -#define OCCAMY_SOC_SCRATCH_0_REG_OFFSET OCCAMY_SOC_SCRATCH_0_REG_OFFSET -#endif -#if OCCAMY_SOC_SCRATCH_MULTIREG_COUNT == 1 -#define OCCAMY_SOC_SCRATCH_0_REG_OFFSET OCCAMY_SOC_SCRATCH_REG_OFFSET -#endif - -//=============================================================== -// RISC-V -//=============================================================== - -#define MIP_MTIP_OFFSET 7 -#define MIP_MSIP_OFFSET 3 -#define MIE_MSIE_OFFSET 3 -#define MIE_MTIE_OFFSET 7 -#define MSTATUS_MIE_OFFSET 3 -#define MSTATUS_FS_OFFSET 13 - -// //=============================================================== -// // Memory map pointers -// //=============================================================== - -// #if SELECT_FLL == 0 // ETH FLL -// volatile uint32_t* const fll_system_base = -// (volatile uint32_t*)FLL_SYSTEM_BASE_ADDR; -// volatile uint32_t* const fll_periph_base = -// (volatile uint32_t*)FLL_PERIPH_BASE_ADDR; -// volatile uint32_t* const fll_hbm2e_base = -// (volatile uint32_t*)FLL_HBM2E_BASE_ADDR; -// #elif SELECT_FLL == 1 // GF FLL -// volatile uint32_t* const fll_system_base = -// (volatile uint32_t*)FLL_SYSTEM_BASE_ADDR + (0x200 >> 2); -// volatile uint32_t* const fll_periph_base = -// (volatile uint32_t*)FLL_PERIPH_BASE_ADDR + (0x200 >> 2); -// volatile uint32_t* const fll_hbm2e_base = -// (volatile uint32_t*)FLL_HBM2E_BASE_ADDR + (0x200 >> 2); -// #endif - -// volatile 
uint32_t* const fll_base[N_CLOCKS] = {fll_system_base, -// fll_periph_base, fll_hbm2e_base}; - -volatile uint64_t* const clint_mtime_ptr = - (volatile uint64_t*)(CLINT_BASE_ADDR + CLINT_MTIME_LOW_REG_OFFSET); -volatile uint64_t* const clint_mtimecmp0_ptr = - (volatile uint64_t*)(CLINT_BASE_ADDR + CLINT_MTIMECMP_LOW0_REG_OFFSET); - -//=============================================================== -// Globals -//=============================================================== - -volatile comm_buffer_t comm_buffer __attribute__((aligned(8))); - -//=============================================================== -// Anticipated function declarations -//=============================================================== - -static inline void set_sw_interrupts_unsafe(uint8_t chip_id, - uint32_t base_hartid, - uint32_t num_harts, - uint32_t stride); - -//=============================================================== -// Initialization -//=============================================================== - -void initialize_bss() { - extern volatile uint64_t __bss_start, __bss_end; - - size_t bss_size = (size_t)(&__bss_end) - (size_t)(&__bss_start); - if (bss_size) - sys_dma_blk_memcpy((uint64_t)(&__bss_start), WIDE_ZERO_MEM_BASE_ADDR, - bss_size); -} - -void initialize_wide_spm() { - extern volatile uint64_t __wide_spm_start, __wide_spm_end; - - size_t wide_spm_size = - (size_t)(&__wide_spm_end) - (size_t)(&__wide_spm_start); - if (wide_spm_size) - sys_dma_blk_memcpy(SPM_WIDE_BASE_ADDR, (uint64_t)(&__wide_spm_start), - wide_spm_size); -} - -void enable_fpu() { - uint64_t mstatus; - - asm volatile("csrr %[mstatus], mstatus" : [mstatus] "=r"(mstatus)); - mstatus |= (1 << MSTATUS_FS_OFFSET); - asm volatile("csrw mstatus, %[mstatus]" : : [mstatus] "r"(mstatus)); -} - -void set_d_cache_enable(uint16_t ena) { - asm volatile("csrw 0x701, %0" ::"r"(ena)); -} - -//=============================================================== -// Synchronization and mutual exclusion 
-//=============================================================== - -static inline void fence() { asm volatile("fence" : : : "memory"); } - -/** - * @brief lock a mutex, blocking - * @details test-and-set (tas) implementation of a lock. - * Declare mutex with `static volatile uint32_t mtx = 0;` - */ -void mutex_tas_acquire(volatile uint32_t* pmtx) { - asm volatile( - "li x5,1 # x5 = 1\n" - "1:\n" - " amoswap.w.aq x5,x5,(%0) # x5 = oldlock & lock = 1\n" - " bnez x5,1b # Retry if previously set)\n" - : "+r"(pmtx) - : - : "x5"); -} - -/** - * @brief lock a mutex, blocking - * @details test-and-test-and-set (ttas) implementation of a lock. - * Declare mutex with `static volatile uint32_t mtx = 0;` - */ -static inline void mutex_ttas_acquire(volatile uint32_t* pmtx) { - asm volatile( - "1:\n" - " lw x5, 0(%0)\n" - " bnez x5, 1b\n" - " li x5,1 # x5 = 1\n" - "2:\n" - " amoswap.w.aq x5,x5,(%0) # x5 = oldlock & lock = 1\n" - " bnez x5,2b # Retry if previously set)\n" - : "+r"(pmtx) - : - : "x5"); -} - -/** - * @brief Release the mutex - */ -static inline void mutex_release(volatile uint32_t* pmtx) { - asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n" - : "+r"(pmtx)); -} - -//=============================================================== -// Device programming -//=============================================================== - -extern void snitch_main(); - -static inline void wakeup_snitch(uint8_t chip_id, uint32_t hartid) { - set_sw_interrupt(chip_id, hartid); -} - -/** - * @brief Waits until snitches are parked in a `wfi` instruction - * - * @detail delays execution to wait for the Snitch cores to be ready. 
- * After being parked, the Snitch cores can accept an interrupt - * and start executing its binary - */ -// TODO: implement in a more robust manner -void wait_snitches_parked(uint32_t timeout) { delay_ns(100000); } - -/** - * @brief Programs the Snitches with the Snitch binary - * - * @detail After boot, the Snitches are "parked" on a WFI - * until they receive a software interrupt. Upon - * wakeup, the Snitch jumps to a minimal interrupt - * handler in boot ROM which loads the address of the - * user binary from the soc_ctrl_scratch_0 register. - * This routine programs the soc_ctrl_scratch_0 register - * with the address of the user binary. - */ -static inline void program_snitches(uint8_t chip_id) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - *(volatile uint32_t*)((uintptr_t)soc_ctrl_scratch_ptr(1) | base_addr) = - (uintptr_t)snitch_main; - *(volatile uint32_t*)((uintptr_t)soc_ctrl_scratch_ptr(2) | base_addr) = - (uintptr_t)&comm_buffer; -} - -/** - * @brief Wake-up a Snitch cluster - * - * @detail Send a cluster interrupt to all Snitches in a cluster - */ - -static inline void wakeup_cluster(uint8_t chip_id, uint32_t cluster_id) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - *(volatile uint32_t*)((uintptr_t)cluster_clint_set_ptr(cluster_id) | - base_addr) = 511; -} - -/** - * @brief Wake-up Snitches - * - * @detail All Snitches are "parked" in a WFI. A SW interrupt - * must be issued to "unpark" every Snitch. This function - * sends a SW interrupt to all Snitches. 
- */ -void wakeup_snitches(uint8_t chip_id) { - volatile uint32_t* lock = get_shared_lock(); - - mutex_ttas_acquire(lock); - set_sw_interrupts_unsafe(chip_id, 1, N_SNITCHES, 1); - mutex_release(lock); -} - -/** - * @brief Wake-up Snitches - * - * @detail Send a cluster interrupt to all Snitches - */ -static inline void wakeup_snitches_cl(uint8_t chip_id) { - for (int i = 0; i < N_CLUSTERS; i++) wakeup_cluster(chip_id, i); -} - -/** - * @brief Wake-up Snitches - * - * @detail All Snitches are "parked" in a WFI. A SW interrupt - * must be issued to "unpark" every Snitch. This function - * sends a SW interrupt to a given range of Snitches. - */ -void wakeup_snitches_selective(uint8_t chip_id, uint32_t base_hartid, - uint32_t num_harts, uint32_t stride) { - volatile uint32_t* lock = get_shared_lock(); - - mutex_ttas_acquire(lock); - set_sw_interrupts_unsafe(chip_id, base_hartid, num_harts, stride); - mutex_release(lock); -} - -// temporary deprecate this function since it uses the N_CORES_PER_CLUSTER - -// /** -// * @brief Wake-up Snitches -// * -// * @detail All Snitches are "parked" in a WFI. A SW interrupt -// * must be issued to "unpark" every Snitch. This function -// * sends a SW interrupt to one Snitch in every cluster, -// * the so called "master" of the cluster. The "master" is -// * then expected to wake-up all the other Snitches in its -// * cluster. The "master" Snitches can use the cluster-local -// * CLINTs without sending requests outside the cluster, -// * avoiding congestion. 
-// */ -// void wakeup_master_snitches() { -// volatile uint32_t* lock = get_shared_lock(); - -// mutex_ttas_acquire(lock); -// set_sw_interrupts_unsafe(1, N_CLUSTERS, N_CORES_PER_CLUSTER); -// mutex_release(lock); -// } - -/** - * @brief Waits until snitches are done executing - */ -static inline int wait_snitches_done(uint8_t chip_id) { - wait_sw_interrupt(); - uint8_t current_chip_id = get_current_chip_id(); - clear_host_sw_interrupt(current_chip_id); - - uintptr_t baseaddress = (uintptr_t)get_chip_baseaddress(chip_id); - uint32_t* retval_ptr = - (uint32_t*)((uintptr_t)soc_ctrl_scratch_ptr(3) | baseaddress); - int retval = *retval_ptr; - // LSB signals completion - if (retval & 1) - return retval >> 1; - else - return -1; -} - -static inline volatile uint32_t* get_shared_lock() { - return &(comm_buffer.lock); -} - -//=============================================================== -// Reset and clock gating -//=============================================================== - -static inline void set_clk_ena_quad(uint8_t chip_id, uint32_t quad_idx, - uint32_t value, uint32_t cluster_clk_enable_mask) { - uint32_t* clk_ena_ptr = - (uint32_t*)((uintptr_t)quad_cfg_clk_ena_ptr(quad_idx) | - (uintptr_t)get_chip_baseaddress(chip_id)); - *clk_ena_ptr = value & cluster_clk_enable_mask; -} - -// static inline void set_clk_ena_quad(uint32_t quad_idx, uint32_t value) { -// *quad_cfg_clk_ena_ptr(quad_idx) = value & 0x1; -// } - -static inline void set_reset_n_quad(uint8_t chip_id, uint32_t quad_idx, - uint32_t value) { - uint32_t* reset_n_ptr = - (uint32_t*)((uintptr_t)quad_cfg_reset_n_ptr(quad_idx) | - (uintptr_t)get_chip_baseaddress(chip_id)); - *reset_n_ptr = value & 0x1; -} - -static inline void reset_and_ungate_quad(uint8_t chip_id, - uint32_t quadrant_idx, uint32_t cluster_clk_enable_mask) { - set_reset_n_quad(chip_id, quadrant_idx, 0); - set_clk_ena_quad(chip_id, quadrant_idx, 0, cluster_clk_enable_mask); - __asm__ __volatile__("fence" ::: "memory"); - 
set_reset_n_quad(chip_id, quadrant_idx, 0xFFFFFFFF); - set_clk_ena_quad(chip_id, quadrant_idx, 0xFFFFFFFF, cluster_clk_enable_mask); -} - -static inline void reset_and_ungate_quadrants(uint8_t chip_id, uint32_t cluster_clk_enable_mask) { - for (int i = 0; i < N_QUADS; i++) reset_and_ungate_quad(chip_id, i, cluster_clk_enable_mask); -} - -static inline void reset_and_ungate_quadrants_all(uint8_t chip_id) { - for (int i = 0; i < N_QUADS; i++) reset_and_ungate_quad(chip_id, i, 0xFFFF); -} -//=============================================================== -// Interrupts -//=============================================================== - -static inline void wfi() { asm volatile("wfi"); } - -static inline void enable_sw_interrupts() { - uint64_t mie; - - asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); - mie |= (1 << MIE_MSIE_OFFSET); - asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); -} - -static inline uint32_t get_clint_msip_hart(uint8_t chip_id, uint32_t hartid) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - uint32_t field_offset = hartid % CLINT_MSIP_P_FIELDS_PER_REG; - uint32_t lsb_offset = field_offset * CLINT_MSIP_P_FIELD_WIDTH; - return (*(volatile uint32_t*)((uintptr_t)clint_msip_ptr(hartid) | - base_addr) >> - lsb_offset) & - 1; -} - -/** - * @brief Gets SW interrupt pending status from local CSR - * - * @detail Use this in favour of remote_sw_interrupt_pending() - * when polling a core's own interrupt pending - * status. This avoids unnecessary congestion on the - * interconnect and shared CLINT. 
- */ -static inline uint32_t sw_interrupt_pending() { - uint64_t mip; - - asm volatile("csrr %[mip], mip" : [mip] "=r"(mip)); - return mip & (1 << MIP_MSIP_OFFSET); -} - -// TODO: for portability to architectures where WFI is implemented as a NOP -// also sw_interrupts_enabled() should be checked -static inline void wait_sw_interrupt() { - do wfi(); - while (!sw_interrupt_pending()); -} - -static inline void clear_sw_interrupt_unsafe(uint8_t chip_id, uint32_t hartid) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - uint32_t field_offset = hartid % CLINT_MSIP_P_FIELDS_PER_REG; - uint32_t lsb_offset = field_offset * CLINT_MSIP_P_FIELD_WIDTH; - - *(volatile uint32_t*)((uintptr_t)clint_msip_ptr(hartid) | base_addr) &= - ~(1 << lsb_offset); -} - -static inline void clear_sw_interrupt(uint8_t chip_id, uint32_t hartid) { - volatile uint32_t* shared_lock = get_shared_lock(); - - mutex_tas_acquire(shared_lock); - clear_sw_interrupt_unsafe(chip_id, hartid); - mutex_release(shared_lock); -} - -/** - * @brief Gets SW interrupt pending status from CLINT - * - * @detail Use sw_interrupt_pending() in favour of this - * when polling a core's own interrupt pending - * status. That function interrogates a local CSR - * instead of the shared CLINT. 
- */ -static inline uint32_t remote_sw_interrupt_pending(uint8_t chip_id, - uint32_t hartid) { - return get_clint_msip_hart(chip_id, hartid); -} - -static inline uint32_t timer_interrupts_enabled() { - uint64_t mie; - asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); - return (mie >> MIE_MTIE_OFFSET) & 1; -} - -static inline void set_sw_interrupt_unsafe(uint8_t chip_id, uint32_t hartid) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - uint32_t field_offset = hartid % CLINT_MSIP_P_FIELDS_PER_REG; - uint32_t lsb_offset = field_offset * CLINT_MSIP_P_FIELD_WIDTH; - - *(volatile uint32_t*)((uintptr_t)clint_msip_ptr(hartid) | base_addr) |= - (1 << lsb_offset); -} - -void set_sw_interrupt(uint8_t chip_id, uint32_t hartid) { - volatile uint32_t* shared_lock = get_shared_lock(); - - mutex_ttas_acquire(shared_lock); - set_sw_interrupt_unsafe(chip_id, hartid); - mutex_release(shared_lock); -} - -static inline void set_sw_interrupts_unsafe(uint8_t chip_id, - uint32_t base_hartid, - uint32_t num_harts, - uint32_t stride) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - volatile uint32_t* ptr = - (uint32_t*)((uintptr_t)clint_msip_ptr(base_hartid) | base_addr); - - uint32_t num_fields = num_harts; - uint32_t field_idx = base_hartid; - uint32_t field_offset = field_idx % CLINT_MSIP_P_FIELDS_PER_REG; - uint32_t reg_idx = field_idx / CLINT_MSIP_P_FIELDS_PER_REG; - uint32_t prev_reg_idx = reg_idx; - uint32_t mask = 0; - uint32_t reg_jump; - uint32_t last_field = num_fields - 1; - - for (uint32_t i = 0; i < num_fields; i++) { - // put field in mask - mask |= 1 << field_offset; - - // calculate next field info - field_idx += stride; - field_offset = field_idx % CLINT_MSIP_P_FIELDS_PER_REG; - reg_idx = field_idx / CLINT_MSIP_P_FIELDS_PER_REG; - reg_jump = reg_idx - prev_reg_idx; - - // if next value is in another register - if (i != last_field && reg_jump) { - // store mask - if (mask == (uint32_t)(-1)) - *ptr = mask; - else - *ptr |= mask; - 
// update pointer and reset mask - ptr += reg_jump; - prev_reg_idx = reg_idx; - mask = 0; - } - } - - // store last mask - *ptr |= mask; -} - -void set_cluster_interrupt(uint8_t chip_id, uint32_t cluster_id, - uint32_t core_id) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - *(volatile uint32_t*)((uintptr_t)cluster_clint_set_ptr(cluster_id) | - base_addr) = (1 << core_id); -} - -static inline uint32_t timer_interrupt_pending() { - uint64_t mip; - - asm volatile("csrr %[mip], mip" : [mip] "=r"(mip)); - return mip & (1 << MIP_MTIP_OFFSET); -} - -void wait_timer_interrupt() { - do wfi(); - while (!timer_interrupt_pending() && timer_interrupts_enabled()); -} - -void enable_global_interrupts() { - uint64_t mstatus; - - asm volatile("csrr %[mstatus], mstatus" : [mstatus] "=r"(mstatus)); - mstatus |= (1 << MSTATUS_MIE_OFFSET); - asm volatile("csrw mstatus, %[mstatus]" : : [mstatus] "r"(mstatus)); -} - -void enable_timer_interrupts() { - uint64_t mie; - - asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); - mie |= (1 << MIE_MTIE_OFFSET); - asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); -} - -void disable_timer_interrupts() { - uint64_t mie; - - asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); - mie &= ~(1 << MIE_MTIE_OFFSET); - asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); -} - -void disable_sw_interrupts() { - uint64_t mie; - - asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); - mie &= ~(1 << MIE_MSIE_OFFSET); - asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); -} - -/** - * @brief Gets SW interrupt pending status from local CSR - * - * @detail Use this in favour of wait_remote_sw_interrupt_pending() - * when polling a core's own interrupt pending - * status. This avoids unnecessary congestion on the - * interconnect and shared CLINT. 
- */ -void wait_sw_interrupt_cleared() { while (sw_interrupt_pending()); } - -/** - * @brief Gets SW interrupt pending status from shared CLINT - * - * @detail Use wait_sw_interrupt_cleared() in favour of this - * when polling a core's own interrupt pending - * status. That function polls a local CSR instead - * of the shared CLINT. - */ -void wait_remote_sw_interrupt_pending(uint8_t chip_id, uint32_t hartid) { - while (remote_sw_interrupt_pending(chip_id, hartid)); -} - -//=============================================================== -// Timers -//=============================================================== - -static const float rtc_period = 30517.58; // ns - -static inline uint64_t mcycle() { - register uint64_t r; - asm volatile("csrr %0, mcycle" : "=r"(r)); - return r; -} - -static inline uint64_t mtime(uint8_t chip_id) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - return *(volatile uint64_t*)((uintptr_t)clint_mtime_ptr | base_addr); -} - -void set_timer_interrupt(uint8_t chip_id, uint64_t interval_ns) { - // Convert ns to RTC unit - uint64_t rtc_interval = interval_ns / (int64_t)rtc_period; - - // Calculate the base address for the chip - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - - // Offset interval by current time and set the timer interrupt - *(volatile uint64_t*)((uintptr_t)clint_mtimecmp0_ptr | base_addr) = - mtime(chip_id) + rtc_interval; -} - -/** - * @brief Clears timer interrupt - * - * @detail Pending timer interrupts are cleared in HW when - * writing to the mtimecmp register. Note that - * eventually the mtime register is going to be greater - * than the newly programmed mtimecmp register, reasserting - * the pending bit. If this is not desired, it is safer - * to disable the timer interrupt before clearing it. 
- */ -void clear_timer_interrupt(uint8_t chip_id) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - *(volatile uint64_t*)((uintptr_t)clint_mtimecmp0_ptr | base_addr) = - mtime(chip_id) + 1; -} - -// Minimum delay is of one RTC period -void delay_ns(uint64_t delay) { - uint8_t chip_id = get_current_chip_id(); - set_timer_interrupt(chip_id, delay); - - // Wait for set_timer_interrupt() to have effect - fence(); - enable_timer_interrupts(); - - wait_timer_interrupt(); - disable_timer_interrupts(); - clear_timer_interrupt(chip_id); -} - -//=============================================================== -// Clocks and FLLs -//=============================================================== - -// #define N_LOCK_CYCLES 10 - -// typedef enum { SYSTEM_CLK = 0, PERIPH_CLK = 1, HBM2E_CLK = 2 } clk_t; - -// static inline void fll_reg_write_u32(clk_t clk, uint32_t byte_offset, -// uint32_t val) { -// *(fll_base[clk] + (byte_offset / 4)) = val; -// } - -// static inline uint32_t fll_reg_read_u32(clk_t clk, uint32_t byte_offset) { -// return *(fll_base[clk] + (byte_offset / 4)); -// } - -/** - * @brief Returns the multiplier to the reference frequency of the FLL - */ -// uint32_t get_fll_freq(clk_t clk) { -// #if SELECT_FLL==0 // ETH FLL -// return fll_reg_read_u32(clk, ETH_FLL_STATUS_I_REG_OFFSET) & -// ETH_FLL_STATUS_I_MULTIPLIER_MASK; -// #elif SELECT_FLL==1 // GF FLL -// return fll_reg_read_u32(clk, FLL_FREQ_REG_OFFSET); -// #endif -// } - -// uint32_t fll_locked(clk_t clk) { -// #if SELECT_FLL==0 // ETH FLL -// return fll_reg_read_u32(clk, ETH_FLL_LOCK_REG_OFFSET) & -// ETH_FLL_LOCK_LOCKED_MASK; -// #elif SELECT_FLL==1 // GF FLL -// return fll_reg_read_u32(clk, FLL_STATE_REG_OFFSET) == 3; -// #endif -// } - -/** - * @brief Measures frequency of clock source - * - * @return Frequency in GHz - */ -// float measure_frequency(clk_t clk) { -// return freq_meter_ref_freqs[clk] * get_fll_freq(clk); -// } - -/** - * @brief Derives system frequency through RISC-V's 
- * mtime memory-mapped register and mcycle CSR - * - * @param rtc_cycles Number of RTC cycles to wait for measurement. - * The higher it is, the more precise the measurement. - * @return Frequency in GHz - */ -// float measure_system_frequency(uint32_t rtc_cycles) { -// uint64_t start_cycle; -// uint64_t end_cycle; -// float time_delta; -// uint64_t cycle_delta; - -// // Compute time delta -// time_delta = rtc_cycles * rtc_period; - -// // Measure cycle delta -// start_cycle = mcycle(); -// delay_ns(time_delta); -// end_cycle = mcycle(); -// cycle_delta = end_cycle - start_cycle; - -// // Return frequency -// return cycle_delta / time_delta; -// } - -/** - * @brief Reprogram the FLL in closed-loop mode with the specified divider - * @detail Blocking function, returns after the new frequency is locked - */ -// void program_fll(clk_t clk, uint32_t divider) { -// #if SELECT_FLL==0 // ETH FLL -// // Reconfigure FLL -// uint32_t val = 0; -// val |= 1 << 31; // Select closed loop mode -// val |= 1 << 30; // Gate output by LOCK signal -// val |= 1 << 26; // Set post-clock divider to 1 (neutral) -// val |= divider - 1; // Set refclk multiplier to specified value -// fll_reg_write_u32(clk, ETH_FLL_CONFIG_I_REG_OFFSET, val); -// // Wait new frequency locked -// while (!fll_locked(clk)); -// #elif SELECT_FLL==1 // GF FLL -// // Fallback to reference clock during reconfiguration -// fll_reg_write_u32(clk, FLL_BYPASS_REG_OFFSET, 1); -// // Disable DFG IP clock generation during reconfiguration -// fll_reg_write_u32(clk, FLL_CLKGENEN_REG_OFFSET, 0); -// // Reconfigure DFG IP input signals -// fll_reg_write_u32(clk, FLL_FIXLENMODE_REG_OFFSET, 0); // Closed-loop -// mode fll_reg_write_u32(clk, FLL_FBDIV_REG_OFFSET, divider - 1); -// fll_reg_write_u32(clk, FLL_CLKDIV_REG_OFFSET, 0); -// fll_reg_write_u32(clk, FLL_CLKSRCSEL_REG_OFFSET, 1); -// // Reconfigure lock settings -// fll_reg_write_u32(clk, FLL_UPPERTHRESH_REG_OFFSET, divider + 1); -// fll_reg_write_u32(clk, 
FLL_LOWERTHRESH_REG_OFFSET, divider - 1); -// fll_reg_write_u32(clk, FLL_LOCKCYCLES_REG_OFFSET, N_LOCK_CYCLES); -// // Enable DFG IP clock generation after new settings are applied -// fll_reg_write_u32(clk, FLL_CLKGENEN_REG_OFFSET, 1); -// // Wait new frequency locked -// while (!fll_locked(clk)); -// // Disable bypass of DFG clock -// fll_reg_write_u32(clk, FLL_BYPASS_REG_OFFSET, 0); -// #endif -// } - -//=============================================================== -// Isolation -//=============================================================== - -uint32_t const ISO_MASK_ALL = 0b1111; -uint32_t const ISO_MASK_NONE = 0; - -static inline void deisolate_quad(uint8_t chip_id, uint32_t quad_idx, - uint32_t iso_mask) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - volatile uint32_t* isolate_ptr = - (volatile uint32_t*)((uintptr_t)quad_cfg_isolate_ptr(quad_idx) | - base_addr); - *isolate_ptr &= ~iso_mask; -} - -/** - * @brief Loads the "isolated" register field for the quadrant requested - * - * @return Masked register field realigned to start at LSB - */ -static inline uint32_t get_quad_cfg_isolated(uint8_t chip_id, - uint32_t quad_idx) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - return *(volatile uint32_t*)((uintptr_t)quad_cfg_isolated_ptr(quad_idx) | - base_addr) & - ISO_MASK_ALL; -} - -void isolate_quad(uint8_t chip_id, uint32_t quad_idx, uint32_t iso_mask) { - uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); - volatile uint32_t* isolate_ptr = - (volatile uint32_t*)((uintptr_t)quad_cfg_isolate_ptr(quad_idx) | - base_addr); - *isolate_ptr |= iso_mask; - fence(); -} - -static inline void deisolate_all(uint8_t chip_id) { - for (uint32_t i = 0; i < N_QUADS; ++i) { - deisolate_quad(chip_id, i, ISO_MASK_ALL); - } -} - -/** - * @brief Check quadrant isolated or not - * - * @param iso_mask set bit to 1 to check if path is isolated, 0 de-isolated - * @return 1 is check passes, 0 otherwise - */ -uint32_t 
check_isolated_timeout(uint8_t chip_id, uint32_t max_tries, - uint32_t quadrant_idx, uint32_t iso_mask) { - for (uint32_t i = 0; i < max_tries; ++i) { - if (get_quad_cfg_isolated(chip_id, quadrant_idx) == iso_mask) { - return 1; - } - } - return 0; -} - -//=============================================================== -// SoC configuration -//=============================================================== - -// void activate_interleaved_mode_hbm() { -// uint64_t addr = -// OCCAMY_HBM_XBAR_INTERLEAVED_ENA_REG_OFFSET + HBM_XBAR_CFG_BASE_ADDR; -// *((volatile uint32_t*)addr) = 1; -// } - -// void deactivate_interleaved_mode_hbm() { -// uint64_t addr = -// OCCAMY_HBM_XBAR_INTERLEAVED_ENA_REG_OFFSET + HBM_XBAR_CFG_BASE_ADDR; -// *((volatile uint32_t*)addr) = 1; -// } diff --git a/target/sim/sw/host/runtime/host.h b/target/sim/sw/host/runtime/host.h index b8b2ac3d..cc9d7efd 100644 --- a/target/sim/sw/host/runtime/host.h +++ b/target/sim/sw/host/runtime/host.h @@ -1,16 +1,834 @@ // Copyright 2022 ETH Zurich and University of Bologna. // Licensed under the Apache License, Version 2.0, see LICENSE for details. 
// SPDX-License-Identifier: Apache-2.0 - +// #include "host.h" #include #include +#include "chip_id.h" +#include "heterogeneous_runtime.h" +#include "occamy.h" +#include "sys_dma.h" +#include "uart.h" + +extern uint64_t __narrow_spm_start; +extern uint64_t __narrow_spm_end; +extern uint64_t __wide_spm_start; +extern uint64_t __wide_spm_end; + +// Handle multireg degeneration to single register +#if OCCAMY_SOC_ISOLATE_MULTIREG_COUNT == 1 +#define OCCAMY_SOC_ISOLATE_0_REG_OFFSET OCCAMY_SOC_ISOLATE_REG_OFFSET +#define OCCAMY_SOC_ISOLATE_0_ISOLATE_0_MASK OCCAMY_SOC_ISOLATE_ISOLATE_0_MASK +#endif +#if OCCAMY_SOC_ISOLATED_MULTIREG_COUNT == 1 +#define OCCAMY_SOC_ISOLATED_0_REG_OFFSET OCCAMY_SOC_ISOLATED_REG_OFFSET +#endif +#if OCCAMY_SOC_SCRATCH_MULTIREG_COUNT == 1 +#define OCCAMY_SOC_SCRATCH_0_REG_OFFSET OCCAMY_SOC_SCRATCH_0_REG_OFFSET +#endif +#if OCCAMY_SOC_SCRATCH_MULTIREG_COUNT == 1 +#define OCCAMY_SOC_SCRATCH_0_REG_OFFSET OCCAMY_SOC_SCRATCH_REG_OFFSET +#endif + +//=============================================================== +// RISC-V +//=============================================================== + +#define MIP_MTIP_OFFSET 7 +#define MIP_MSIP_OFFSET 3 +#define MIE_MSIE_OFFSET 3 +#define MIE_MTIE_OFFSET 7 +#define MSTATUS_MIE_OFFSET 3 +#define MSTATUS_FS_OFFSET 13 + +// //=============================================================== +// // Memory map pointers +// //=============================================================== + +// #if SELECT_FLL == 0 // ETH FLL +// volatile uint32_t* const fll_system_base = +// (volatile uint32_t*)FLL_SYSTEM_BASE_ADDR; +// volatile uint32_t* const fll_periph_base = +// (volatile uint32_t*)FLL_PERIPH_BASE_ADDR; +// volatile uint32_t* const fll_hbm2e_base = +// (volatile uint32_t*)FLL_HBM2E_BASE_ADDR; +// #elif SELECT_FLL == 1 // GF FLL +// volatile uint32_t* const fll_system_base = +// (volatile uint32_t*)FLL_SYSTEM_BASE_ADDR + (0x200 >> 2); +// volatile uint32_t* const fll_periph_base = +// (volatile 
uint32_t*)FLL_PERIPH_BASE_ADDR + (0x200 >> 2); +// volatile uint32_t* const fll_hbm2e_base = +// (volatile uint32_t*)FLL_HBM2E_BASE_ADDR + (0x200 >> 2); +// #endif + +// volatile uint32_t* const fll_base[N_CLOCKS] = {fll_system_base, +// fll_periph_base, fll_hbm2e_base}; -static inline void set_sw_interrupt(uint8_t chip_id, uint32_t hartid); +volatile uint64_t* const clint_mtime_ptr = + (volatile uint64_t*)(CLINT_BASE_ADDR + CLINT_MTIME_LOW_REG_OFFSET); +volatile uint64_t* const clint_mtimecmp0_ptr = + (volatile uint64_t*)(CLINT_BASE_ADDR + CLINT_MTIMECMP_LOW0_REG_OFFSET); + +//=============================================================== +// Globals +//=============================================================== + +// volatile comm_buffer_t comm_buffer __attribute__((aligned(8))); + +//=============================================================== +// Anticipated function declarations +//=============================================================== + +static inline void set_sw_interrupts_unsafe(uint8_t chip_id, + uint32_t base_hartid, + uint32_t num_harts, + uint32_t stride); + +static inline void set_sw_interrupt(uint8_t chip_id, + volatile comm_buffer_t* comm_buffer_ptr, + uint32_t hartid); void delay_ns(uint64_t delay); -static inline volatile uint32_t* get_shared_lock(); +static inline volatile uint32_t* get_shared_lock( + volatile comm_buffer_t* comm_buffer_ptr); static inline void wait_sw_interrupt(); -static inline void clear_sw_interrupt(uint8_t chip_id, uint32_t hartid); +static inline void clear_sw_interrupt(uint8_t chip_id, + volatile comm_buffer_t* comm_buffer_ptr, + uint32_t hartid); + +//=============================================================== +// Initialization +//=============================================================== + +void initialize_bss() { + extern volatile uint64_t __bss_start, __bss_end; + + size_t bss_size = (size_t)(&__bss_end) - (size_t)(&__bss_start); + if (bss_size) + sys_dma_blk_memcpy((uint64_t)(&__bss_start), 
WIDE_ZERO_MEM_BASE_ADDR, + bss_size); +} + +void initialize_wide_spm() { + size_t wide_spm_size = + (size_t)(&__wide_spm_end) - (size_t)(&__wide_spm_start); + if (wide_spm_size) + sys_dma_blk_memcpy(SPM_WIDE_BASE_ADDR, (uint64_t)(&__wide_spm_start), + wide_spm_size); +} + +void enable_fpu() { + uint64_t mstatus; + + asm volatile("csrr %[mstatus], mstatus" : [mstatus] "=r"(mstatus)); + mstatus |= (1 << MSTATUS_FS_OFFSET); + asm volatile("csrw mstatus, %[mstatus]" : : [mstatus] "r"(mstatus)); +} + +void set_d_cache_enable(uint16_t ena) { + asm volatile("csrw 0x701, %0" ::"r"(ena)); +} + +//=============================================================== +// Synchronization and mutual exclusion +//=============================================================== + +static inline void fence() { asm volatile("fence" : : : "memory"); } + +/** + * @brief lock a mutex, blocking + * @details test-and-set (tas) implementation of a lock. + * Declare mutex with `static volatile uint32_t mtx = 0;` + */ +void mutex_tas_acquire(volatile uint32_t* pmtx) { + asm volatile( + "li x5,1 # x5 = 1\n" + "1:\n" + " amoswap.w.aq x5,x5,(%0) # x5 = oldlock & lock = 1\n" + " bnez x5,1b # Retry if previously set)\n" + : "+r"(pmtx) + : + : "x5"); +} + +/** + * @brief lock a mutex, blocking + * @details test-and-test-and-set (ttas) implementation of a lock. 
+ * Declare mutex with `static volatile uint32_t mtx = 0;` + */ +static inline void mutex_ttas_acquire(volatile uint32_t* pmtx) { + asm volatile( + "1:\n" + " lw x5, 0(%0)\n" + " bnez x5, 1b\n" + " li x5,1 # x5 = 1\n" + "2:\n" + " amoswap.w.aq x5,x5,(%0) # x5 = oldlock & lock = 1\n" + " bnez x5,2b # Retry if previously set)\n" + : "+r"(pmtx) + : + : "x5"); +} + +/** + * @brief Release the mutex + */ +static inline void mutex_release(volatile uint32_t* pmtx) { + asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n" + : "+r"(pmtx)); +} + +//=============================================================== +// Device programming +//=============================================================== + +extern void snitch_main(); + +static inline void wakeup_snitch(uint8_t chip_id, + volatile comm_buffer_t* comm_buffer_ptr, + uint32_t hartid) { + set_sw_interrupt(chip_id, comm_buffer_ptr, hartid); +} + +/** + * @brief Waits until snitches are parked in a `wfi` instruction + * + * @detail delays execution to wait for the Snitch cores to be ready. + * After being parked, the Snitch cores can accept an interrupt + * and start executing its binary + */ +// TODO: implement in a more robust manner +void wait_snitches_parked(uint32_t timeout) { delay_ns(100000); } + +/** + * @brief Programs the Snitches with the Snitch binary + * + * @detail After boot, the Snitches are "parked" on a WFI + * until they receive a software interrupt. Upon + * wakeup, the Snitch jumps to a minimal interrupt + * handler in boot ROM which loads the address of the + * user binary from the soc_ctrl_scratch_0 register. + * This routine programs the soc_ctrl_scratch_0 register + * with the address of the user binary. 
+ */ +static inline void program_snitches(uint8_t chip_id, + volatile comm_buffer_t* comm_buffer_ptr) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + *(volatile uint32_t*)((uintptr_t)soc_ctrl_scratch_ptr(1) | base_addr) = + (uintptr_t)snitch_main; + *(volatile uint32_t*)((uintptr_t)soc_ctrl_scratch_ptr(2) | base_addr) = + (uintptr_t)comm_buffer_ptr; +} + +/** + * @brief Wake-up a Snitch cluster + * + * @detail Send a cluster interrupt to all Snitches in a cluster + */ + +static inline void wakeup_cluster(uint8_t chip_id, uint32_t cluster_id) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + *(volatile uint32_t*)((uintptr_t)cluster_clint_set_ptr(cluster_id) | + base_addr) = 511; +} + +/** + * @brief Wake-up Snitches + * + * @detail All Snitches are "parked" in a WFI. A SW interrupt + * must be issued to "unpark" every Snitch. This function + * sends a SW interrupt to all Snitches. + */ +void wakeup_snitches(uint8_t chip_id, volatile comm_buffer_t* comm_buffer_ptr) { + volatile uint32_t* lock = get_shared_lock(comm_buffer_ptr); + + mutex_ttas_acquire(lock); + set_sw_interrupts_unsafe(chip_id, 1, N_SNITCHES, 1); + mutex_release(lock); +} + +/** + * @brief Wake-up Snitches + * + * @detail Send a cluster interrupt to all Snitches + */ +static inline void wakeup_snitches_cl(uint8_t chip_id) { + for (int i = 0; i < N_CLUSTERS; i++) wakeup_cluster(chip_id, i); +} + +/** + * @brief Wake-up Snitches + * + * @detail All Snitches are "parked" in a WFI. A SW interrupt + * must be issued to "unpark" every Snitch. This function + * sends a SW interrupt to a given range of Snitches. 
+ */ +void wakeup_snitches_selective(uint8_t chip_id, + volatile comm_buffer_t* comm_buffer_ptr, + uint32_t base_hartid, uint32_t num_harts, + uint32_t stride) { + volatile uint32_t* lock = get_shared_lock(comm_buffer_ptr); + + mutex_ttas_acquire(lock); + set_sw_interrupts_unsafe(chip_id, base_hartid, num_harts, stride); + mutex_release(lock); +} + +// temporary deprecate this function since it uses the N_CORES_PER_CLUSTER + +// /** +// * @brief Wake-up Snitches +// * +// * @detail All Snitches are "parked" in a WFI. A SW interrupt +// * must be issued to "unpark" every Snitch. This function +// * sends a SW interrupt to one Snitch in every cluster, +// * the so called "master" of the cluster. The "master" is +// * then expected to wake-up all the other Snitches in its +// * cluster. The "master" Snitches can use the cluster-local +// * CLINTs without sending requests outside the cluster, +// * avoiding congestion. +// */ +// void wakeup_master_snitches() { +// volatile uint32_t* lock = get_shared_lock(); + +// mutex_ttas_acquire(lock); +// set_sw_interrupts_unsafe(1, N_CLUSTERS, N_CORES_PER_CLUSTER); +// mutex_release(lock); +// } + +/** + * @brief Waits until snitches are done executing + */ +static inline int wait_snitches_done(uint8_t chip_id) { + wait_sw_interrupt(); + uint8_t current_chip_id = get_current_chip_id(); + clear_host_sw_interrupt(current_chip_id); + + uintptr_t baseaddress = (uintptr_t)get_chip_baseaddress(chip_id); + uint32_t* retval_ptr = + (uint32_t*)((uintptr_t)soc_ctrl_scratch_ptr(3) | baseaddress); + int retval = *retval_ptr; + // LSB signals completion + if (retval & 1) + return retval >> 1; + else + return -1; +} + +static inline volatile uint32_t* get_shared_lock( + volatile comm_buffer_t* comm_buffer_ptr) { + return &((*comm_buffer_ptr).lock); +} + +//=============================================================== +// Reset and clock gating +//=============================================================== + +static inline void 
set_clk_ena_quad(uint8_t chip_id, uint32_t quad_idx, + uint32_t value, + uint32_t cluster_clk_enable_mask) { + uint32_t* clk_ena_ptr = + (uint32_t*)((uintptr_t)quad_cfg_clk_ena_ptr(quad_idx) | + (uintptr_t)get_chip_baseaddress(chip_id)); + *clk_ena_ptr = value & cluster_clk_enable_mask; +} + +// static inline void set_clk_ena_quad(uint32_t quad_idx, uint32_t value) { +// *quad_cfg_clk_ena_ptr(quad_idx) = value & 0x1; +// } + +static inline void set_reset_n_quad(uint8_t chip_id, uint32_t quad_idx, + uint32_t value) { + uint32_t* reset_n_ptr = + (uint32_t*)((uintptr_t)quad_cfg_reset_n_ptr(quad_idx) | + (uintptr_t)get_chip_baseaddress(chip_id)); + *reset_n_ptr = value & 0x1; +} + +static inline void reset_and_ungate_quad(uint8_t chip_id, uint32_t quadrant_idx, + uint32_t cluster_clk_enable_mask) { + set_reset_n_quad(chip_id, quadrant_idx, 0); + set_clk_ena_quad(chip_id, quadrant_idx, 0, cluster_clk_enable_mask); + __asm__ __volatile__("fence" ::: "memory"); + set_reset_n_quad(chip_id, quadrant_idx, 0xFFFFFFFF); + set_clk_ena_quad(chip_id, quadrant_idx, 0xFFFFFFFF, + cluster_clk_enable_mask); +} + +static inline void reset_and_ungate_quadrants( + uint8_t chip_id, uint32_t cluster_clk_enable_mask) { + for (int i = 0; i < N_QUADS; i++) + reset_and_ungate_quad(chip_id, i, cluster_clk_enable_mask); +} + +static inline void reset_and_ungate_quadrants_all(uint8_t chip_id) { + for (int i = 0; i < N_QUADS; i++) reset_and_ungate_quad(chip_id, i, 0xFFFF); +} +//=============================================================== +// Interrupts +//=============================================================== + +static inline void wfi() { asm volatile("wfi"); } + +static inline void enable_sw_interrupts() { + uint64_t mie; + + asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); + mie |= (1 << MIE_MSIE_OFFSET); + asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); +} + +static inline uint32_t get_clint_msip_hart(uint8_t chip_id, uint32_t hartid) { + uintptr_t base_addr = 
(uintptr_t)get_chip_baseaddress(chip_id); + uint32_t field_offset = hartid % CLINT_MSIP_P_FIELDS_PER_REG; + uint32_t lsb_offset = field_offset * CLINT_MSIP_P_FIELD_WIDTH; + return (*(volatile uint32_t*)((uintptr_t)clint_msip_ptr(hartid) | + base_addr) >> + lsb_offset) & + 1; +} + +/** + * @brief Gets SW interrupt pending status from local CSR + * + * @detail Use this in favour of remote_sw_interrupt_pending() + * when polling a core's own interrupt pending + * status. This avoids unnecessary congestion on the + * interconnect and shared CLINT. + */ +static inline uint32_t sw_interrupt_pending() { + uint64_t mip; + + asm volatile("csrr %[mip], mip" : [mip] "=r"(mip)); + return mip & (1 << MIP_MSIP_OFFSET); +} + +// TODO: for portability to architectures where WFI is implemented as a NOP +// also sw_interrupts_enabled() should be checked +static inline void wait_sw_interrupt() { + do wfi(); + while (!sw_interrupt_pending()); +} + +static inline void clear_sw_interrupt_unsafe(uint8_t chip_id, uint32_t hartid) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + uint32_t field_offset = hartid % CLINT_MSIP_P_FIELDS_PER_REG; + uint32_t lsb_offset = field_offset * CLINT_MSIP_P_FIELD_WIDTH; + + *(volatile uint32_t*)((uintptr_t)clint_msip_ptr(hartid) | base_addr) &= + ~(1 << lsb_offset); +} + +static inline void clear_sw_interrupt(uint8_t chip_id, + volatile comm_buffer_t* comm_buffer_ptr, + uint32_t hartid) { + volatile uint32_t* shared_lock = get_shared_lock(comm_buffer_ptr); + + mutex_tas_acquire(shared_lock); + clear_sw_interrupt_unsafe(chip_id, hartid); + mutex_release(shared_lock); +} + +/** + * @brief Gets SW interrupt pending status from CLINT + * + * @detail Use sw_interrupt_pending() in favour of this + * when polling a core's own interrupt pending + * status. That function interrogates a local CSR + * instead of the shared CLINT. 
+ */ +static inline uint32_t remote_sw_interrupt_pending(uint8_t chip_id, + uint32_t hartid) { + return get_clint_msip_hart(chip_id, hartid); +} + +static inline uint32_t timer_interrupts_enabled() { + uint64_t mie; + asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); + return (mie >> MIE_MTIE_OFFSET) & 1; +} + +static inline void set_sw_interrupt_unsafe(uint8_t chip_id, uint32_t hartid) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + uint32_t field_offset = hartid % CLINT_MSIP_P_FIELDS_PER_REG; + uint32_t lsb_offset = field_offset * CLINT_MSIP_P_FIELD_WIDTH; + + *(volatile uint32_t*)((uintptr_t)clint_msip_ptr(hartid) | base_addr) |= + (1 << lsb_offset); +} + +void set_sw_interrupt(uint8_t chip_id, volatile comm_buffer_t* comm_buffer_ptr, + uint32_t hartid) { + volatile uint32_t* shared_lock = get_shared_lock(comm_buffer_ptr); + mutex_ttas_acquire(shared_lock); + set_sw_interrupt_unsafe(chip_id, hartid); + mutex_release(shared_lock); +} + +static inline void set_sw_interrupts_unsafe(uint8_t chip_id, + uint32_t base_hartid, + uint32_t num_harts, + uint32_t stride) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + volatile uint32_t* ptr = + (uint32_t*)((uintptr_t)clint_msip_ptr(base_hartid) | base_addr); + + uint32_t num_fields = num_harts; + uint32_t field_idx = base_hartid; + uint32_t field_offset = field_idx % CLINT_MSIP_P_FIELDS_PER_REG; + uint32_t reg_idx = field_idx / CLINT_MSIP_P_FIELDS_PER_REG; + uint32_t prev_reg_idx = reg_idx; + uint32_t mask = 0; + uint32_t reg_jump; + uint32_t last_field = num_fields - 1; + + for (uint32_t i = 0; i < num_fields; i++) { + // put field in mask + mask |= 1 << field_offset; + + // calculate next field info + field_idx += stride; + field_offset = field_idx % CLINT_MSIP_P_FIELDS_PER_REG; + reg_idx = field_idx / CLINT_MSIP_P_FIELDS_PER_REG; + reg_jump = reg_idx - prev_reg_idx; + + // if next value is in another register + if (i != last_field && reg_jump) { + // store mask + if (mask == 
(uint32_t)(-1)) + *ptr = mask; + else + *ptr |= mask; + // update pointer and reset mask + ptr += reg_jump; + prev_reg_idx = reg_idx; + mask = 0; + } + } + + // store last mask + *ptr |= mask; +} + +void set_cluster_interrupt(uint8_t chip_id, uint32_t cluster_id, + uint32_t core_id) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + *(volatile uint32_t*)((uintptr_t)cluster_clint_set_ptr(cluster_id) | + base_addr) = (1 << core_id); +} + +static inline uint32_t timer_interrupt_pending() { + uint64_t mip; + + asm volatile("csrr %[mip], mip" : [mip] "=r"(mip)); + return mip & (1 << MIP_MTIP_OFFSET); +} + +void wait_timer_interrupt() { + do wfi(); + while (!timer_interrupt_pending() && timer_interrupts_enabled()); +} + +void enable_global_interrupts() { + uint64_t mstatus; + + asm volatile("csrr %[mstatus], mstatus" : [mstatus] "=r"(mstatus)); + mstatus |= (1 << MSTATUS_MIE_OFFSET); + asm volatile("csrw mstatus, %[mstatus]" : : [mstatus] "r"(mstatus)); +} + +void enable_timer_interrupts() { + uint64_t mie; + + asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); + mie |= (1 << MIE_MTIE_OFFSET); + asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); +} + +void disable_timer_interrupts() { + uint64_t mie; + + asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); + mie &= ~(1 << MIE_MTIE_OFFSET); + asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); +} + +void disable_sw_interrupts() { + uint64_t mie; + + asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); + mie &= ~(1 << MIE_MSIE_OFFSET); + asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); +} + +/** + * @brief Gets SW interrupt pending status from local CSR + * + * @detail Use this in favour of wait_remote_sw_interrupt_pending() + * when polling a core's own interrupt pending + * status. This avoids unnecessary congestion on the + * interconnect and shared CLINT. 
+ */ +void wait_sw_interrupt_cleared() { while (sw_interrupt_pending()); } + +/** + * @brief Gets SW interrupt pending status from shared CLINT + * + * @detail Use wait_sw_interrupt_cleared() in favour of this + * when polling a core's own interrupt pending + * status. That function polls a local CSR instead + * of the shared CLINT. + */ +void wait_remote_sw_interrupt_pending(uint8_t chip_id, uint32_t hartid) { + while (remote_sw_interrupt_pending(chip_id, hartid)); +} + +//=============================================================== +// Timers +//=============================================================== + +static const float rtc_period = 30517.58; // ns + +static inline uint64_t mcycle() { + register uint64_t r; + asm volatile("csrr %0, mcycle" : "=r"(r)); + return r; +} + +static inline uint64_t mtime(uint8_t chip_id) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + return *(volatile uint64_t*)((uintptr_t)clint_mtime_ptr | base_addr); +} + +void set_timer_interrupt(uint8_t chip_id, uint64_t interval_ns) { + // Convert ns to RTC unit + uint64_t rtc_interval = interval_ns / (int64_t)rtc_period; + + // Calculate the base address for the chip + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + + // Offset interval by current time and set the timer interrupt + *(volatile uint64_t*)((uintptr_t)clint_mtimecmp0_ptr | base_addr) = + mtime(chip_id) + rtc_interval; +} + +/** + * @brief Clears timer interrupt + * + * @detail Pending timer interrupts are cleared in HW when + * writing to the mtimecmp register. Note that + * eventually the mtime register is going to be greater + * than the newly programmed mtimecmp register, reasserting + * the pending bit. If this is not desired, it is safer + * to disable the timer interrupt before clearing it. 
+ */ +void clear_timer_interrupt(uint8_t chip_id) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + *(volatile uint64_t*)((uintptr_t)clint_mtimecmp0_ptr | base_addr) = + mtime(chip_id) + 1; +} + +// Minimum delay is of one RTC period +void delay_ns(uint64_t delay) { + uint8_t chip_id = get_current_chip_id(); + set_timer_interrupt(chip_id, delay); + + // Wait for set_timer_interrupt() to have effect + fence(); + enable_timer_interrupts(); + + wait_timer_interrupt(); + disable_timer_interrupts(); + clear_timer_interrupt(chip_id); +} + +//=============================================================== +// Clocks and FLLs +//=============================================================== + +// #define N_LOCK_CYCLES 10 + +// typedef enum { SYSTEM_CLK = 0, PERIPH_CLK = 1, HBM2E_CLK = 2 } clk_t; + +// static inline void fll_reg_write_u32(clk_t clk, uint32_t byte_offset, +// uint32_t val) { +// *(fll_base[clk] + (byte_offset / 4)) = val; +// } + +// static inline uint32_t fll_reg_read_u32(clk_t clk, uint32_t byte_offset) { +// return *(fll_base[clk] + (byte_offset / 4)); +// } + +/** + * @brief Returns the multiplier to the reference frequency of the FLL + */ +// uint32_t get_fll_freq(clk_t clk) { +// #if SELECT_FLL==0 // ETH FLL +// return fll_reg_read_u32(clk, ETH_FLL_STATUS_I_REG_OFFSET) & +// ETH_FLL_STATUS_I_MULTIPLIER_MASK; +// #elif SELECT_FLL==1 // GF FLL +// return fll_reg_read_u32(clk, FLL_FREQ_REG_OFFSET); +// #endif +// } + +// uint32_t fll_locked(clk_t clk) { +// #if SELECT_FLL==0 // ETH FLL +// return fll_reg_read_u32(clk, ETH_FLL_LOCK_REG_OFFSET) & +// ETH_FLL_LOCK_LOCKED_MASK; +// #elif SELECT_FLL==1 // GF FLL +// return fll_reg_read_u32(clk, FLL_STATE_REG_OFFSET) == 3; +// #endif +// } + +/** + * @brief Measures frequency of clock source + * + * @return Frequency in GHz + */ +// float measure_frequency(clk_t clk) { +// return freq_meter_ref_freqs[clk] * get_fll_freq(clk); +// } + +/** + * @brief Derives system frequency through RISC-V's 
+ * mtime memory-mapped register and mcycle CSR + * + * @param rtc_cycles Number of RTC cycles to wait for measurement. + * The higher it is, the more precise the measurement. + * @return Frequency in GHz + */ +// float measure_system_frequency(uint32_t rtc_cycles) { +// uint64_t start_cycle; +// uint64_t end_cycle; +// float time_delta; +// uint64_t cycle_delta; + +// // Compute time delta +// time_delta = rtc_cycles * rtc_period; + +// // Measure cycle delta +// start_cycle = mcycle(); +// delay_ns(time_delta); +// end_cycle = mcycle(); +// cycle_delta = end_cycle - start_cycle; + +// // Return frequency +// return cycle_delta / time_delta; +// } + +/** + * @brief Reprogram the FLL in closed-loop mode with the specified divider + * @detail Blocking function, returns after the new frequency is locked + */ +// void program_fll(clk_t clk, uint32_t divider) { +// #if SELECT_FLL==0 // ETH FLL +// // Reconfigure FLL +// uint32_t val = 0; +// val |= 1 << 31; // Select closed loop mode +// val |= 1 << 30; // Gate output by LOCK signal +// val |= 1 << 26; // Set post-clock divider to 1 (neutral) +// val |= divider - 1; // Set refclk multiplier to specified value +// fll_reg_write_u32(clk, ETH_FLL_CONFIG_I_REG_OFFSET, val); +// // Wait new frequency locked +// while (!fll_locked(clk)); +// #elif SELECT_FLL==1 // GF FLL +// // Fallback to reference clock during reconfiguration +// fll_reg_write_u32(clk, FLL_BYPASS_REG_OFFSET, 1); +// // Disable DFG IP clock generation during reconfiguration +// fll_reg_write_u32(clk, FLL_CLKGENEN_REG_OFFSET, 0); +// // Reconfigure DFG IP input signals +// fll_reg_write_u32(clk, FLL_FIXLENMODE_REG_OFFSET, 0); // Closed-loop +// mode fll_reg_write_u32(clk, FLL_FBDIV_REG_OFFSET, divider - 1); +// fll_reg_write_u32(clk, FLL_CLKDIV_REG_OFFSET, 0); +// fll_reg_write_u32(clk, FLL_CLKSRCSEL_REG_OFFSET, 1); +// // Reconfigure lock settings +// fll_reg_write_u32(clk, FLL_UPPERTHRESH_REG_OFFSET, divider + 1); +// fll_reg_write_u32(clk, 
FLL_LOWERTHRESH_REG_OFFSET, divider - 1); +// fll_reg_write_u32(clk, FLL_LOCKCYCLES_REG_OFFSET, N_LOCK_CYCLES); +// // Enable DFG IP clock generation after new settings are applied +// fll_reg_write_u32(clk, FLL_CLKGENEN_REG_OFFSET, 1); +// // Wait new frequency locked +// while (!fll_locked(clk)); +// // Disable bypass of DFG clock +// fll_reg_write_u32(clk, FLL_BYPASS_REG_OFFSET, 0); +// #endif +// } + +//=============================================================== +// Isolation +//=============================================================== + +uint32_t const ISO_MASK_ALL = 0b1111; +uint32_t const ISO_MASK_NONE = 0; + +static inline void deisolate_quad(uint8_t chip_id, uint32_t quad_idx, + uint32_t iso_mask) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + volatile uint32_t* isolate_ptr = + (volatile uint32_t*)((uintptr_t)quad_cfg_isolate_ptr(quad_idx) | + base_addr); + *isolate_ptr &= ~iso_mask; +} + +/** + * @brief Loads the "isolated" register field for the quadrant requested + * + * @return Masked register field realigned to start at LSB + */ +static inline uint32_t get_quad_cfg_isolated(uint8_t chip_id, + uint32_t quad_idx) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + return *(volatile uint32_t*)((uintptr_t)quad_cfg_isolated_ptr(quad_idx) | + base_addr) & + ISO_MASK_ALL; +} + +void isolate_quad(uint8_t chip_id, uint32_t quad_idx, uint32_t iso_mask) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + volatile uint32_t* isolate_ptr = + (volatile uint32_t*)((uintptr_t)quad_cfg_isolate_ptr(quad_idx) | + base_addr); + *isolate_ptr |= iso_mask; + fence(); +} + +static inline void deisolate_all(uint8_t chip_id) { + for (uint32_t i = 0; i < N_QUADS; ++i) { + deisolate_quad(chip_id, i, ISO_MASK_ALL); + } +} + +/** + * @brief Check quadrant isolated or not + * + * @param iso_mask set bit to 1 to check if path is isolated, 0 de-isolated + * @return 1 is check passes, 0 otherwise + */ +uint32_t 
check_isolated_timeout(uint8_t chip_id, uint32_t max_tries, + uint32_t quadrant_idx, uint32_t iso_mask) { + for (uint32_t i = 0; i < max_tries; ++i) { + if (get_quad_cfg_isolated(chip_id, quadrant_idx) == iso_mask) { + return 1; + } + } + return 0; +} + +//=============================================================== +// SoC configuration +//=============================================================== + +// void activate_interleaved_mode_hbm() { +// uint64_t addr = +// OCCAMY_HBM_XBAR_INTERLEAVED_ENA_REG_OFFSET + HBM_XBAR_CFG_BASE_ADDR; +// *((volatile uint32_t*)addr) = 1; +// } + +// void deactivate_interleaved_mode_hbm() { +// uint64_t addr = +// OCCAMY_HBM_XBAR_INTERLEAVED_ENA_REG_OFFSET + HBM_XBAR_CFG_BASE_ADDR; +// *((volatile uint32_t*)addr) = 1; +// } diff --git a/target/sim/sw/host/runtime/host.ld b/target/sim/sw/host/runtime/host.ld index fefdf86e..7e0b3606 100644 --- a/target/sim/sw/host/runtime/host.ld +++ b/target/sim/sw/host/runtime/host.ld @@ -6,18 +6,18 @@ ENTRY(_start) MEMORY { - DRAM (rwxa) : ORIGIN = 0x80000000, LENGTH = 0x100000 + NARROW_SPM (rwxa) : ORIGIN = 0x70000000, LENGTH = 0x20000 + WIDE_SPM (rwxa) : ORIGIN = 0x80000000, LENGTH = 0x100000 } SECTIONS { - /* The program code and data goes into DRAM */ + /* The program code and data goes into WIDE_SPM */ .appl : { /* __stack_pointer$ = . + 0x70000; */ /* __global_pointer$ = . + 0x7f0; */ - - __return_pointer$ = ORIGIN(DRAM) + LENGTH(DRAM) - 4; + __return_pointer$ = ORIGIN(WIDE_SPM) + LENGTH(WIDE_SPM) - 4; *(.text.startup) *(.text .text.*) __SDATA_BEGIN__ = .; @@ -26,10 +26,10 @@ SECTIONS *(.sdata .sdata.* .gnu.linkonce.s.*) *(.data) *(.rodata .rodata.*) - } > DRAM + } > WIDE_SPM /* HTIF section for FESVR */ - .htif : { *(.htif) } > DRAM + .htif : { *(.htif) } > WIDE_SPM .bss (NOLOAD) : { @@ -40,20 +40,23 @@ SECTIONS *(COMMON) . = ALIGN(8); __bss_end = . ; - } > DRAM + } > WIDE_SPM .wide_spm : { - . = ALIGN(8); - __wide_spm_start = . ; - *(.wide_spm) - . = ALIGN(8); - __wide_spm_end = . 
; - } > DRAM + __wide_spm_start = ORIGIN(WIDE_SPM); + __wide_spm_end = ORIGIN(WIDE_SPM) + LENGTH(WIDE_SPM); + } > WIDE_SPM + + .narrow_spm : + { + __narrow_spm_start = ORIGIN(NARROW_SPM); + __narrow_spm_end = ORIGIN(NARROW_SPM) + LENGTH(NARROW_SPM); + } > WIDE_SPM __end = .; - .devicebin : { *(.devicebin) } > DRAM + .devicebin : { *(.devicebin) } > WIDE_SPM /* Discard sections */ /DISCARD/ : { *(.riscv.attributes) *(.comment) } diff --git a/target/sim/sw/shared/platform/generated/uart.h b/target/sim/sw/shared/platform/generated/uart.h index 3c7e545e..1646f08f 100644 --- a/target/sim/sw/shared/platform/generated/uart.h +++ b/target/sim/sw/shared/platform/generated/uart.h @@ -180,7 +180,9 @@ inline static void scan_str(uintptr_t address_prefix, char *str) { if (*cur == '\r') { *cur = '\0'; return; - } else + } else { + print_char(address_prefix, *cur); cur++; + } } } diff --git a/target/sim/sw/shared/runtime/chip_id.h b/target/sim/sw/shared/runtime/chip_id.h index 04928e84..52d6ab67 100644 --- a/target/sim/sw/shared/runtime/chip_id.h +++ b/target/sim/sw/shared/runtime/chip_id.h @@ -37,6 +37,10 @@ inline uint8_t *get_chip_baseaddress(uint8_t chip_id) { #endif } +inline uint32_t get_chip_baseaddress_h(uint8_t chip_id) { + return (uint32_t)(chip_id << 8); +} + inline uint32_t get_current_chip_baseaddress_h() { uint32_t chip_id = get_current_chip_id(); return (uint32_t)(chip_id << 8); diff --git a/target/sim/sw/shared/runtime/heterogeneous_runtime.h b/target/sim/sw/shared/runtime/heterogeneous_runtime.h index 7ab12a9d..96b3397d 100644 --- a/target/sim/sw/shared/runtime/heterogeneous_runtime.h +++ b/target/sim/sw/shared/runtime/heterogeneous_runtime.h @@ -13,6 +13,7 @@ // usr_data is an explicitly-sized integer field instead of a pointer typedef struct { volatile uint32_t lock; + volatile uint32_t chip_id; volatile uint32_t usr_data_ptr; } comm_buffer_t; @@ -21,26 +22,85 @@ typedef struct { /**************/ inline static void set_host_sw_interrupt(uint8_t chip_id) { +#if 
__riscv_xlen == 64 uint32_t* msip_ptr = (uint32_t*)(((uintptr_t)clint_msip_ptr(0)) | ((uintptr_t)get_chip_baseaddress(chip_id))); *msip_ptr = 1; +#elif __riscv_xlen == 32 + uint32_t* msip_ptr = clint_msip_ptr(0); + uint32_t target_addrh = get_chip_baseaddress_h(chip_id); + uint32_t current_addrh = get_current_chip_baseaddress_h(); + + register uint32_t reg_target_addrh asm("t0") = target_addrh; + register uint32_t reg_return_value asm("t1") = 1; + register uint32_t reg_msip_ptr asm("t2") = (uint32_t)msip_ptr; + register uint32_t reg_current_addrh asm("t3") = current_addrh; + + asm volatile( + "csrw 0xbc0, t0;" + "sw t1, 0(t2);" + "csrw 0xbc0, t3;" + : + : "r"(reg_target_addrh), "r"(reg_return_value), "r"(reg_msip_ptr), + "r"(reg_current_addrh) + : "memory"); +#endif } inline void clear_host_sw_interrupt_unsafe(uint8_t chip_id) { +#if __riscv_xlen == 64 uint32_t* msip_ptr = (uint32_t*)(((uintptr_t)clint_msip_ptr(0)) | ((uintptr_t)get_chip_baseaddress(chip_id))); *msip_ptr = 0; +#elif __riscv_xlen == 32 + uint32_t* msip_ptr = clint_msip_ptr(0); + uint32_t target_addrh = get_chip_baseaddress_h(chip_id); + uint32_t current_addrh = get_current_chip_baseaddress_h(); + + register uint32_t reg_target_addrh asm("t0") = target_addrh; + register uint32_t reg_return_value asm("t1") = 0; + register uint32_t reg_msip_ptr asm("t2") = (uint32_t)msip_ptr; + register uint32_t reg_current_addrh asm("t3") = current_addrh; + + asm volatile( + "csrw 0xbc0, t0;" + "sw t1, 0(t2);" + "csrw 0xbc0, t3;" + : + : "r"(reg_target_addrh), "r"(reg_return_value), "r"(reg_msip_ptr), + "r"(reg_current_addrh) + : "memory"); +#endif } inline void wait_host_sw_interrupt_clear(uint8_t chip_id) { +#if __riscv_xlen == 64 uint32_t* msip_ptr = (uint32_t*)(((uintptr_t)clint_msip_ptr(0)) | ((uintptr_t)get_chip_baseaddress(chip_id))); while (*msip_ptr); +#elif __riscv_xlen == 32 + uint32_t* msip_ptr = clint_msip_ptr(0); + uint32_t target_addrh = get_chip_baseaddress_h(chip_id); + uint32_t current_addrh =
get_current_chip_baseaddress_h(); + + register uint32_t reg_target_addrh asm("t0") = target_addrh; + register uint32_t reg_value asm("t1"); + register uint32_t reg_msip_ptr asm("t2") = (uint32_t)msip_ptr; + register uint32_t reg_current_addrh asm("t3") = current_addrh; + + do { + asm volatile( + "csrw 0xbc0, t0;" + "lw t1, 0(t2);" + "csrw 0xbc0, t3;" + : "=r"(reg_value) + : "r"(reg_target_addrh), "r"(reg_msip_ptr), "r"(reg_current_addrh) + : "memory"); + } while (reg_value); +#endif } static inline void clear_host_sw_interrupt(uint8_t chip_id) { diff --git a/target/sim_chip/apps/Makefile b/target/sim_chip/apps/Makefile index 5a7bb53d..72a2846a 100644 --- a/target/sim_chip/apps/Makefile +++ b/target/sim_chip/apps/Makefile @@ -13,11 +13,11 @@ RISCV_READELF = $(CVA6_GCC_ROOT)/riscv64-unknown-elf-readelf ELFS = $(shell find $(TARGET)/sim/sw/host/apps -type f -name "*.elf") # Generate BIN filenames from the ELF filenames BINS = $(ELFS:.elf=.bin) -$(info BINS: $(BINS)) -.PHONY: apps clean +.PHONY: apps hex clean apps: $(BINS) + python3 bin2hex.py # Use full paths for the dependencies %.bin: %.elf diff --git a/util/occamygen/occamy.py b/util/occamygen/occamy.py index 8dc50948..5be51ffa 100644 --- a/util/occamygen/occamy.py +++ b/util/occamygen/occamy.py @@ -315,10 +315,10 @@ def am_connect_soc_wide_xbar_quad(am, am_soc_narrow_xbar, am_wide_xbar_quadrant_ "quadrant_{}_cluster_{}_tcdm".format(i, j), clusters_tcdm_size[j+1], *bases_cluster - ).attach_to( - am_wide_xbar_quadrant_s1[i] ).attach_to( am_narrow_xbar_quadrant_s1[i] + ).attach_to( + am_wide_xbar_quadrant_s1[i] ) ) @@ -330,11 +330,11 @@ def am_connect_soc_wide_xbar_quad(am, am_soc_narrow_xbar, am_wide_xbar_quadrant_ "quadrant_{}_cluster_{}_periph".format(i, j), clusters_periph_size[j+1], *bases_cluster - ).attach_to( - am_wide_xbar_quadrant_s1[i] ).attach_to( am_narrow_xbar_quadrant_s1[i] - ) + ) # .attach_to( + # am_wide_xbar_quadrant_s1[i] + # ) ) bases_cluster = list() diff --git a/util/occamygen/occamygen.py 
b/util/occamygen/occamygen.py index cb529876..08a0e9ca 100755 --- a/util/occamygen/occamygen.py +++ b/util/occamygen/occamygen.py @@ -370,23 +370,22 @@ def main(): no_loopback=True, context="soc", node=am_soc_narrow_xbar) + + # Default port: wide xbar (Should stay on the first position) + soc_narrow_xbar.add_output_entry("soc_wide", am_soc_wide_xbar) + soc_narrow_xbar.add_input("soc_wide") for i in range(nr_s1_quadrants): - # soc_narrow_xbar.add_output_symbolic_multi("s1_quadrant_{}".format(i), - # [("s1_quadrant_base_addr", - # "S1QuadrantAddressSpace"), - # ("s1_quadrant_cfg_base_addr", - # "S1QuadrantCfgAddressSpace")]) - soc_narrow_xbar.add_output_entry( - "s1_quadrant_{}".format(i), am_narrow_xbar_quadrant_s1[i]) + soc_narrow_xbar.add_output_symbolic_multi("s1_quadrant_{}".format(i), + [("ClusterBaseOffset", + "S1QuadrantAddressSpace"), + ("S1QuadrantCfgBaseOffset", + "S1QuadrantCfgAddressSpace")]) soc_narrow_xbar.add_input("s1_quadrant_{}".format(i)) soc_narrow_xbar.add_input("cva6") - soc_narrow_xbar.add_input("soc_wide") soc_narrow_xbar.add_input("periph") - # Default port: wide xbar - soc_narrow_xbar.add_output_entry("soc_wide", am_soc_wide_xbar) soc_narrow_xbar.add_output_entry("periph", am_soc_axi_lite_periph_xbar) soc_narrow_xbar.add_output_entry("spm_narrow", am_spm_narrow) soc_narrow_xbar.add_output_entry("sys_idma_cfg", am_sys_idma_cfg) diff --git a/util/solder/solder.py b/util/solder/solder.py index 60746a3e..f4863103 100644 --- a/util/solder/solder.py +++ b/util/solder/solder.py @@ -1598,7 +1598,7 @@ def emit(self): for i, (idx, entries) in enumerate(self.symbolic_addrmap_multi): for base, length in entries: addrmap_lines.append( - " '{{ idx: {}, start_addr: {}[{i}], end_addr: {}[{i}] + {} }}".format( + " '{{ idx: {}, start_addr: {{chip_id_i, {}[39:0]}}, end_addr: {{chip_id_i, {{{} + {}}}[39:0] }} }}".format( idx, base, base, length, i=i)) addrmap += "{}\n}};\n".format(',\n'.join(addrmap_lines))