diff --git a/Bender.yml b/Bender.yml index 05f8c6bac..5a44efc47 100644 --- a/Bender.yml +++ b/Bender.yml @@ -101,7 +101,7 @@ sources: - target/rtl/src/occamy_xilinx.sv # target/sim - - target: any(simulation,simulation_vlt) + - target: simulation_occamy files: - target/rtl/test/uartdpi/uartdpi.sv - target/rtl/test/testharness.sv diff --git a/Makefile b/Makefile index 90f704a21..9a3fb58c4 100644 --- a/Makefile +++ b/Makefile @@ -117,7 +117,7 @@ occamy_system_vsim: # In ESAT Server $(MAKE) -C ./target/sim bin/occamy_top.vsim hemaia_system_vsim_preparation: # In SNAX Docker - $(MAKE) -C ./target/sim_chip work-vsim/compile.vsim.tcl + $(MAKE) -C ./target/sim_chip work-vsim/compile.vsim.tcl CFG_OVERRIDE=$(CFG) hemaia_system_vsim: # In ESAT Server $(MAKE) -C ./target/sim_chip bin/occamy_chip.vsim diff --git a/hw/occamy/occamy_soc.sv.tpl b/hw/occamy/occamy_soc.sv.tpl index c9f156abd..02452121e 100644 --- a/hw/occamy/occamy_soc.sv.tpl +++ b/hw/occamy/occamy_soc.sv.tpl @@ -350,6 +350,7 @@ module ${name}_soc ) i_axi_dma_backend_sys_idma ( .clk_i, .rst_ni, + .chip_id_i, .dma_id_i ( 'd0 ), .axi_dma_req_o ( ${in_sys_idma_mst.req_name()} ), .axi_dma_res_i ( ${in_sys_idma_mst.rsp_name()} ), diff --git a/hw/vendor/openhwgroup_cva6/common/local/util/instr_tracer.sv b/hw/vendor/openhwgroup_cva6/common/local/util/instr_tracer.sv index 81e547d79..c41118b43 100644 --- a/hw/vendor/openhwgroup_cva6/common/local/util/instr_tracer.sv +++ b/hw/vendor/openhwgroup_cva6/common/local/util/instr_tracer.sv @@ -20,7 +20,8 @@ module instr_tracer ( instr_tracer_if tracer_if, - input logic[riscv::XLEN-1:0] hart_id_i + input logic[riscv::XLEN-1:0] hart_id_i, + input ariane_pkg::chip_id_t chip_id_i ); // keep the decoded instructions in a queue @@ -44,11 +45,12 @@ module instr_tracer ( // static uvm_cmdline_processor uvcl = uvm_cmdline_processor::get_inst(); - function void create_file(logic [63:0] hart_id); + function void create_file(logic [7:0] chip_id, logic [63:0] hart_id); string fn, fn_commit_log; - $sformat(fn, "trace_hart_%0.0f.log", hart_id); - $sformat(fn_commit_log, "trace_hart_%0.0f_commit.log", hart_id); - $display("[TRACER] Output filename is: %s", fn); + $sformat(fn, "logs/trace_chip_%01x%01x_hart_%05x.log", chip_id[7:4], chip_id[3:0], hart_id); + $sformat(fn_commit_log, "logs/trace_chip_%01x%01x_hart_%05x_commit.log", chip_id[7:4], + chip_id[3:0], hart_id); + $display("[Tracer] Logging Hart %d to %s", hart_id, fn); f = $fopen(fn,"w"); if (ariane_pkg::ENABLE_SPIKE_COMMIT_LOG) commit_log = $fopen(fn_commit_log, "w"); @@ -214,7 +216,7 @@ module instr_tracer ( initial begin #15ns; - create_file(hart_id_i); + create_file(chip_id_i, hart_id_i); trace(); end diff --git a/hw/vendor/openhwgroup_cva6/core/cva6.sv b/hw/vendor/openhwgroup_cva6/core/cva6.sv index 88246b352..91d6d880c 100644 --- a/hw/vendor/openhwgroup_cva6/core/cva6.sv +++ b/hw/vendor/openhwgroup_cva6/core/cva6.sv @@ -961,7 +961,8 @@ input axi_rsp_t axi_resp_i instr_tracer instr_tracer_i ( .tracer_if(tracer_if), - .hart_id_i + .hart_id_i, + .chip_id_i ); // mock tracer for Verilator, to be used with spike-dasm diff --git a/target/rtl/cfg/occamy_cfg/hemaia.hjson b/target/rtl/cfg/occamy_cfg/hemaia.hjson index dee024921..f6aa203e9 100755 --- a/target/rtl/cfg/occamy_cfg/hemaia.hjson +++ b/target/rtl/cfg/occamy_cfg/hemaia.hjson @@ -7,7 +7,12 @@ hemaia_multichip: { chip_id_width: 8, single_chip: true, - single_chip_id: 0 + single_chip_id: 0, + testbench_cfg: { + // Emulate a four-chips configuration + upper_left_coordinate: [0, 0], + lower_right_coordinate: [2, 2] + } } addr_width: 48, data_width: 64, diff --git a/target/sim/sim.mk b/target/sim/sim.mk index 2b70b93e9..fa7a1d4d9 100644 --- a/target/sim/sim.mk +++ b/target/sim/sim.mk @@ -37,20 +37,20 @@ MATCH_END := '/+incdir+/ s/$$/\/*\/*/' MATCH_BGN := 's/+incdir+//g' SED_SRCS := sed -e ${MATCH_END} -e ${MATCH_BGN} -VSIM_BENDER += -t test -t rtl -t simulation -t vsim +VSIM_BENDER += -t test -t rtl -t simulation_occamy -t vsim VSIM_SOURCES = $(shell ${BENDER} script flist ${VSIM_BENDER} | ${SED_SRCS}) VSIM_BUILDDIR ?= work-vsim VOPT_FLAGS = +acc # VCS_BUILDDIR should to be the same as the `DEFAULT : ./work-vcs` # in target/snitch_cluster/synopsys_sim.setup -VCS_BENDER += -t test -t rtl -t simulation -t vcs +VCS_BENDER += -t test -t rtl -t simulation_occamy -t vcs VCS_SOURCES = $(shell ${BENDER} script flist ${VCS_BENDER} | ${SED_SRCS}) VCS_BUILDDIR := work-vcs # For synthesis with DC compiler SYN_FLIST ?= syn_flist.tcl -SYN_BENDER += -t test -t synthesis -t simulation +SYN_BENDER += -t test -t synthesis -t simulation_occamy ifeq ($(MEM_TYPE), exclude_tcsram) VSIM_BENDER += -t tech_cells_generic_exclude_tc_sram SYN_BENDER += -t tech_cells_generic_exclude_tc_sram @@ -70,7 +70,7 @@ SYN_BUILDDIR := work-syn FESVR ?= ${MKFILE_DIR}work FESVR_VERSION ?= 98d2c29e431f3b14feefbda48c5f70c2f451acf2 -VLT_BENDER += -t rtl -t simulation_vlt +VLT_BENDER += -t rtl -t simulation_occamy VLT_SOURCES = $(shell ${BENDER} script flist ${VLT_BENDER} | ${SED_SRCS}) VLT_BUILDDIR := work-vlt VLT_FESVR = $(VLT_BUILDDIR)/riscv-isa-sim @@ -245,9 +245,9 @@ endef # Traces # ########## -DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null)) +DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_chip_??_hart_*.dasm 2>/dev/null)) TXT_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.txt/g')) -PERF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g')) +PERF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/.dasm/_perf.json/g')) ANNOTATED_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.s/g')) DIFF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.diff/g')) @@ -265,16 +265,19 @@ perf-csv: $(PERF_CSV) event-csv: $(EVENT_CSV) layout: $(TRACE_CSV) $(TRACE_JSON) -$(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) - $(DASM) < $< | $(PYTHON) $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt - +$(LOGS_DIR)/%.txt $(LOGS_DIR)/%_perf.json: $(LOGS_DIR)/%.dasm $(GENTRACE_PY) + @CHIP=$(word 3,$(subst _, ,$*)) && \ + HART=$(word 5,$(subst _, ,$*)) && \ + echo "Processing Chip $$CHIP Hart $$HART" && \ + $(DASM) < $< | $(PYTHON) $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/chip_$$CHIP\_hart_$$HART\_perf.json > $(LOGS_DIR)/trace_chip_$$CHIP\_hart_$$HART.txt # Generate source-code interleaved traces for all harts. Reads the binary from # the logs/.rtlbinary file that is written at start of simulation in the vsim script BINARY ?= $(shell cat $(LOGS_DIR)/.rtlbinary) -$(LOGS_DIR)/trace_hart_%.s: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY} - $(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -$(LOGS_DIR)/trace_hart_%.diff: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY} - $(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -d + +$(LOGS_DIR)/%.s: $(LOGS_DIR)/%.txt $(ANNOTATE_PY) + $(PYTHON) $(ANNOTATE_PY) $(ANNOTATE_FLAGS) -o $@ $(BINARY) $< +$(LOGS_DIR)/%.diff: $(LOGS_DIR)/%.txt $(ANNOTATE_PY) + $(PYTHON) $(ANNOTATE_PY) $(ANNOTATE_FLAGS) -o $@ $(BINARY) $< -d $(PERF_CSV): $(PERF_TRACES) $(PERF_CSV_PY) $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES) diff --git a/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c b/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c index 4ca19a5c6..457f7511e 100644 --- a/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c +++ b/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c @@ -12,8 +12,11 @@ #include "data.h" #include "snrt.h" -int8_t* tcdm0_start_addr; -int8_t* tcdm1_start_addr; + +uint64_t tcdm0_start_addr; +uint64_t tcdm1_start_addr; +uint64_t test_data_start_addr; + int main() { int err = 0; // First set the addr of cluster 0 @@ -21,24 +24,35 @@ int main() { // tcdm1_start_addr = (int8_t*)0x10100000; if (snrt_cluster_idx() == 0) { if (snrt_is_dm_core()) { - tcdm0_start_addr = (int8_t*)snrt_cluster_base_addrl(); - printf("The C0 TCDM ADDR is %p \n", tcdm0_start_addr); + tcdm0_start_addr = (uint64_t)snrt_cluster_base_addrl(); + tcdm0_start_addr += (uint64_t)snrt_cluster_base_addrh() << 32; + printf("The C0 TCDM ADDR is %p%p \n", + (uint8_t*)(tcdm0_start_addr >> 32), + (uint8_t*)tcdm0_start_addr); } } snrt_global_barrier(); if (snrt_cluster_idx() == 1) { if (snrt_is_dm_core()) { - tcdm1_start_addr = (int8_t*)snrt_cluster_base_addrl(); - printf("The C1 TCDM ADDR is %p \n", tcdm1_start_addr); + tcdm1_start_addr = (uint64_t)snrt_cluster_base_addrl(); + tcdm1_start_addr += (uint64_t)snrt_cluster_base_addrh() << 32; + printf("The C1 TCDM ADDR is %p%p \n", + (uint8_t*)(tcdm1_start_addr >> 32), + (uint8_t*)tcdm1_start_addr); } } snrt_global_barrier(); // C0 Load the data from l3 -> l1 if (snrt_cluster_idx() == 0) { if (snrt_is_dm_core()) { - printf("[C0] Start to load data from %p\n", test_data); - snrt_dma_start_1d(tcdm0_start_addr, test_data, length_data); + test_data_start_addr = (uint64_t)test_data; + test_data_start_addr += (uint64_t)snrt_cluster_base_addrh() << 32; + printf("[C0] Start to load data from %p%p \n", + (uint8_t*)(test_data_start_addr >> 32), + (uint8_t*)test_data_start_addr); + snrt_dma_start_1d_wideptr(tcdm0_start_addr, test_data_start_addr, + length_data); snrt_dma_wait_all(); } } @@ -48,8 +62,11 @@ int main() { // Thenc C1 fetches data from C0 if (snrt_cluster_idx() == 1) { if (snrt_is_dm_core()) { - printf("[C1] Load data from C0 TCDM %p\n", tcdm0_start_addr); - snrt_dma_start_1d(tcdm1_start_addr, tcdm0_start_addr, length_data); + printf("[C1] Start to load data from %p%p \n", + (uint8_t*)(tcdm0_start_addr >> 32), + (uint8_t*)tcdm0_start_addr); + snrt_dma_start_1d_wideptr(tcdm1_start_addr, tcdm0_start_addr, + length_data); snrt_dma_wait_all(); } } @@ -61,11 +78,11 @@ int main() { if (snrt_cluster_core_idx() == 0) { printf("C0 Checking the results\n"); for (int i = 0; i < length_data; i++) { - if (tcdm0_start_addr[i] != test_data[i]) { + if (((int8_t*)tcdm0_start_addr)[i] != test_data[i]) { err++; printf("C0 data is incorrect!\n"); printf("tcdm0[%d]=%d, test_data[%d]=%d\n", i, - tcdm0_start_addr[i], i, test_data[i]); + ((int8_t*)tcdm0_start_addr)[i], i, test_data[i]); return -1; } } @@ -76,11 +93,11 @@ int main() { if (snrt_cluster_core_idx() == 0) { printf("C1 Checking the results\n"); for (int i = 0; i < length_data; i++) { - if (tcdm1_start_addr[i] != test_data[i]) { + if (((int8_t*)tcdm1_start_addr)[i] != test_data[i]) { err++; printf("C1 data is incorrect!\n"); printf("tcdm0[%d]=%d, test_data[%d]=%d\n", i, - tcdm1_start_addr[i], i, test_data[i]); + ((int8_t*)tcdm1_start_addr)[i], i, test_data[i]); return -1; } } @@ -95,4 +112,4 @@ int main() { } return 0; -} \ No newline at end of file +} diff --git a/target/sim/sw/device/runtime/src/occamy_device.h b/target/sim/sw/device/runtime/src/occamy_device.h index 24228d58c..dcc1fbde1 100644 --- a/target/sim/sw/device/runtime/src/occamy_device.h +++ b/target/sim/sw/device/runtime/src/occamy_device.h @@ -55,13 +55,14 @@ inline void return_to_cva6(sync_t sync) { if (cnt == snrt_cluster_num()) { #endif *((volatile uint32_t*)barrier_ptr) = 0; - set_host_sw_interrupt(); + // Interrupt the local host to signal the exit code (snitch by default only has the access to local domain) + set_host_sw_interrupt(0); } } } // Otherwise assume cores are already synchronized and only // one core calls this function else { - set_host_sw_interrupt(); + set_host_sw_interrupt(0); } } diff --git a/target/sim/sw/device/runtime/src/occamy_start.c b/target/sim/sw/device/runtime/src/occamy_start.c index e16e9812e..00fca72ba 100644 --- a/target/sim/sw/device/runtime/src/occamy_start.c +++ b/target/sim/sw/device/runtime/src/occamy_start.c @@ -25,7 +25,8 @@ static inline void snrt_exit_default(int exit_code); static inline void snrt_exit(int exit_code) { snrt_exit_default(exit_code); - if (snrt_global_core_idx() == 0) set_host_sw_interrupt(); + // Interrupt the local host to signal the exit code (snitch by default only has the access to local domain) + if (snrt_global_core_idx() == 0) set_host_sw_interrupt(0); } #include "start.c" diff --git a/target/sim/sw/device/runtime/src/putchar_chip.c b/target/sim/sw/device/runtime/src/putchar_chip.c index 30b21761f..31628cbe0 100644 --- a/target/sim/sw/device/runtime/src/putchar_chip.c +++ b/target/sim/sw/device/runtime/src/putchar_chip.c @@ -7,8 +7,6 @@ #include "uart.h" void _putchar(char character) { - while (is_transmit_empty() == 0) { - }; - - write_reg_u8(UART_THR, character); + // Print to UART of local chip + print_char((uintptr_t)0, character); } diff --git a/target/sim/sw/host/apps/hello_world/src/hello_world.c b/target/sim/sw/host/apps/hello_world/src/hello_world.c index c63df5b56..377f6c8f8 100644 --- a/target/sim/sw/host/apps/hello_world/src/hello_world.c +++ b/target/sim/sw/host/apps/hello_world/src/hello_world.c @@ -3,23 +3,26 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include "chip_id.h" #include "host.c" // Frequency at which the UART peripheral is clocked #define PERIPH_FREQ 50000000 int main() { - init_uart(PERIPH_FREQ, 1000000); + uintptr_t address_prefix = (uintptr_t)get_current_chip_baseaddress(); + + init_uart(address_prefix, PERIPH_FREQ, 1000000); asm volatile("fence" : : : "memory"); - print_uart("Hello world from Occamy in VCU128! \r\n"); + print_str(address_prefix, "Hello world from Occamy in VCU128! \r\n"); char uart_rx_buffer[512]; char uart_tx_buffer[512]; while (1) { - scan_uart(uart_rx_buffer); + scan_str(address_prefix, uart_rx_buffer); sprintf(uart_tx_buffer, "[Occamy] What you said is: %s", uart_rx_buffer); - print_uart(uart_tx_buffer); + print_str(address_prefix, uart_tx_buffer); // Artificial delay to ensure last symbol has been transmitted // (just waiting for the UART TSR register to be empty is not // sufficient) diff --git a/target/sim/sw/host/apps/offload/src/offload.c b/target/sim/sw/host/apps/offload/src/offload.c index 36a24cc44..20a0cf2ae 100644 --- a/target/sim/sw/host/apps/offload/src/offload.c +++ b/target/sim/sw/host/apps/offload/src/offload.c @@ -6,31 +6,38 @@ int main() { // Reset and ungate all quadrants, deisolate - init_uart(50000000, 1000000); - print_uart("[Occamy] The Offload main function \r\n"); - reset_and_ungate_quadrants(); - deisolate_all(); - + uintptr_t address_prefix = (uintptr_t)get_current_chip_baseaddress(); + uint32_t chip_id = get_current_chip_id(); + + init_uart(address_prefix, 50000000, 1000000); + print_str(address_prefix, "[Occamy] The Offload main function \r\n"); + print_str(address_prefix, "[Occamy] Current Chip ID is: "); + print_u8(address_prefix, chip_id); + print_str(address_prefix, "\r\n"); + reset_and_ungate_quadrants(chip_id); + print_str(address_prefix, "[Occamy] Snitch ungated. \r\n"); + deisolate_all(chip_id); + print_str(address_prefix, "[Occamy] Snitch deisolated. \r\n"); // Enable interrupts to receive notice of job termination enable_sw_interrupts(); - // Program Snitch entry point and communication buffer - program_snitches(); + program_snitches(chip_id); + print_str(address_prefix, "[Occamy] Snitch Jump Address Programmed. \r\n"); // Compiler fence to ensure Snitch entry point is // programmed before Snitches are woken up asm volatile("" ::: "memory"); - print_uart("[Occamy] Calling snitch cluster to execute the task \r\n"); + print_str(address_prefix, "[Occamy] Calling snitch cluster to execute the task \r\n"); // Start Snitches - wakeup_snitches_cl(); + wakeup_snitches_cl(chip_id); - int ret = wait_snitches_done(); + int ret = wait_snitches_done(chip_id); - print_uart("[Occamy] Snitch cluster done with exit code "); - print_uart_int(ret); - print_uart("\r\n"); + print_str(address_prefix, "[Occamy] Snitch cluster done with exit code "); + print_u32(address_prefix, ret); + print_str(address_prefix, "\r\n"); // Wait for job done and return Snitch exit code return ret; diff --git a/target/sim/sw/host/runtime/host.c b/target/sim/sw/host/runtime/host.c index 601189756..47327bea9 100644 --- a/target/sim/sw/host/runtime/host.c +++ b/target/sim/sw/host/runtime/host.c @@ -1,14 +1,14 @@ // Copyright 2022 ETH Zurich and University of Bologna. // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 - #include "host.h" - -#include "heterogeneous_runtime.h" +#include "chip_id.h" #include "occamy.h" #include "sys_dma.h" #include "uart.h" +#include "heterogeneous_runtime.h" + // Handle multireg degeneration to single register #if OCCAMY_SOC_ISOLATE_MULTIREG_COUNT == 1 #define OCCAMY_SOC_ISOLATE_0_REG_OFFSET OCCAMY_SOC_ISOLATE_REG_OFFSET @@ -73,7 +73,8 @@ volatile comm_buffer_t comm_buffer __attribute__((aligned(8))); // Anticipated function declarations //=============================================================== -static inline void set_sw_interrupts_unsafe(uint32_t base_hartid, +static inline void set_sw_interrupts_unsafe(uint8_t chip_id, + uint32_t base_hartid, uint32_t num_harts, uint32_t stride); @@ -103,9 +104,9 @@ void initialize_wide_spm() { void enable_fpu() { uint64_t mstatus; - asm volatile("csrr %[mstatus], mstatus" : [ mstatus ] "=r"(mstatus)); + asm volatile("csrr %[mstatus], mstatus" : [mstatus] "=r"(mstatus)); mstatus |= (1 << MSTATUS_FS_OFFSET); - asm volatile("csrw mstatus, %[mstatus]" : : [ mstatus ] "r"(mstatus)); + asm volatile("csrw mstatus, %[mstatus]" : : [mstatus] "r"(mstatus)); } void set_d_cache_enable(uint16_t ena) { @@ -167,7 +168,9 @@ static inline void mutex_release(volatile uint32_t* pmtx) { extern void snitch_main(); -static inline void wakeup_snitch(uint32_t hartid) { set_sw_interrupt(hartid); } +static inline void wakeup_snitch(uint8_t chip_id, uint32_t hartid) { + set_sw_interrupt(chip_id, hartid); +} /** * @brief Waits until snitches are parked in a `wfi` instruction @@ -190,9 +193,12 @@ void wait_snitches_parked(uint32_t timeout) { delay_ns(100000); } * This routine programs the soc_ctrl_scratch_0 register * with the address of the user binary. */ -static inline void program_snitches() { - *soc_ctrl_scratch_ptr(1) = (uintptr_t)snitch_main; - *soc_ctrl_scratch_ptr(2) = (uintptr_t)&comm_buffer; +static inline void program_snitches(uint8_t chip_id) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + *(volatile uint32_t*)((uintptr_t)soc_ctrl_scratch_ptr(1) | base_addr) = + (uintptr_t)snitch_main; + *(volatile uint32_t*)((uintptr_t)soc_ctrl_scratch_ptr(2) | base_addr) = + (uintptr_t)&comm_buffer; } /** @@ -201,8 +207,10 @@ static inline void program_snitches() { * @detail Send a cluster interrupt to all Snitches in a cluster */ -static inline void wakeup_cluster(uint32_t cluster_id) { - *(cluster_clint_set_ptr(cluster_id)) = 511; +static inline void wakeup_cluster(uint8_t chip_id, uint32_t cluster_id) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + *(volatile uint32_t*)((uintptr_t)cluster_clint_set_ptr(cluster_id) | + base_addr) = 511; } /** @@ -212,11 +220,11 @@ static inline void wakeup_cluster(uint32_t cluster_id) { * must be issued to "unpark" every Snitch. This function * sends a SW interrupt to all Snitches. */ -void wakeup_snitches() { +void wakeup_snitches(uint8_t chip_id) { volatile uint32_t* lock = get_shared_lock(); mutex_ttas_acquire(lock); - set_sw_interrupts_unsafe(1, N_SNITCHES, 1); + set_sw_interrupts_unsafe(chip_id, 1, N_SNITCHES, 1); mutex_release(lock); } @@ -225,8 +233,8 @@ void wakeup_snitches() { * * @detail Send a cluster interrupt to all Snitches */ -static inline void wakeup_snitches_cl() { - for (int i = 0; i < N_CLUSTERS; i++) wakeup_cluster(i); +static inline void wakeup_snitches_cl(uint8_t chip_id) { + for (int i = 0; i < N_CLUSTERS; i++) wakeup_cluster(chip_id, i); } /** @@ -236,12 +244,12 @@ static inline void wakeup_snitches_cl() { * must be issued to "unpark" every Snitch. This function * sends a SW interrupt to a given range of Snitches. */ -void wakeup_snitches_selective(uint32_t base_hartid, uint32_t num_harts, - uint32_t stride) { +void wakeup_snitches_selective(uint8_t chip_id, uint32_t base_hartid, + uint32_t num_harts, uint32_t stride) { volatile uint32_t* lock = get_shared_lock(); mutex_ttas_acquire(lock); - set_sw_interrupts_unsafe(base_hartid, num_harts, stride); + set_sw_interrupts_unsafe(chip_id, base_hartid, num_harts, stride); mutex_release(lock); } @@ -270,10 +278,15 @@ void wakeup_snitches_selective(uint32_t base_hartid, uint32_t num_harts, /** * @brief Waits until snitches are done executing */ -static inline int wait_snitches_done() { +static inline int wait_snitches_done(uint8_t chip_id) { wait_sw_interrupt(); - clear_host_sw_interrupt(); - int retval = *soc_ctrl_scratch_ptr(3); + uint8_t current_chip_id = get_current_chip_id(); + clear_host_sw_interrupt(current_chip_id); + + uintptr_t baseaddress = (uintptr_t)get_chip_baseaddress(chip_id); + uint32_t* retval_ptr = + (uint32_t*)((uintptr_t)soc_ctrl_scratch_ptr(3) | baseaddress); + int retval = *retval_ptr; // LSB signals completion if (retval & 1) return retval >> 1; @@ -289,23 +302,33 @@ static inline volatile uint32_t* get_shared_lock() { // Reset and clock gating //=============================================================== -static inline void set_clk_ena_quad(uint32_t quad_idx, uint32_t value) { - *quad_cfg_clk_ena_ptr(quad_idx) = value & 0x1; +static inline void set_clk_ena_quad(uint8_t chip_id, uint32_t quad_idx, + uint32_t value) { + uint32_t* clk_ena_ptr = + (uint32_t*)((uintptr_t)quad_cfg_clk_ena_ptr(quad_idx) | + (uintptr_t)get_chip_baseaddress(chip_id)); + *clk_ena_ptr = value & 0x1; } -static inline void set_reset_n_quad(uint32_t quad_idx, uint32_t value) { - *quad_cfg_reset_n_ptr(quad_idx) = value & 0x1; +static inline void set_reset_n_quad(uint8_t chip_id, uint32_t quad_idx, + uint32_t value) { + uint32_t* reset_n_ptr = + (uint32_t*)((uintptr_t)quad_cfg_reset_n_ptr(quad_idx) | + (uintptr_t)get_chip_baseaddress(chip_id)); + *reset_n_ptr = value & 0x1; } -static inline void reset_and_ungate_quad(uint32_t quadrant_idx) { - set_reset_n_quad(quadrant_idx, 0); - set_clk_ena_quad(quadrant_idx, 0); - set_reset_n_quad(quadrant_idx, 1); - set_clk_ena_quad(quadrant_idx, 1); +static inline void reset_and_ungate_quad(uint8_t chip_id, + uint32_t quadrant_idx) { + set_reset_n_quad(chip_id, quadrant_idx, 0); + set_clk_ena_quad(chip_id, quadrant_idx, 0); + __asm__ __volatile__("fence" ::: "memory"); + set_reset_n_quad(chip_id, quadrant_idx, 1); + set_clk_ena_quad(chip_id, quadrant_idx, 1); } -static inline void reset_and_ungate_quadrants() { - for (int i = 0; i < N_QUADS; i++) reset_and_ungate_quad(i); +static inline void reset_and_ungate_quadrants(uint8_t chip_id) { + for (int i = 0; i < N_QUADS; i++) reset_and_ungate_quad(chip_id, i); } //=============================================================== @@ -317,15 +340,19 @@ static inline void wfi() { asm volatile("wfi"); } static inline void enable_sw_interrupts() { uint64_t mie; - asm volatile("csrr %[mie], mie" : [ mie ] "=r"(mie)); + asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); mie |= (1 << MIE_MSIE_OFFSET); - asm volatile("csrw mie, %[mie]" : : [ mie ] "r"(mie)); + asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); } -static inline uint32_t get_clint_msip_hart(uint32_t hartid) { +static inline uint32_t get_clint_msip_hart(uint8_t chip_id, uint32_t hartid) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); uint32_t field_offset = hartid % CLINT_MSIP_P_FIELDS_PER_REG; uint32_t lsb_offset = field_offset * CLINT_MSIP_P_FIELD_WIDTH; - return (*clint_msip_ptr(hartid) >> lsb_offset) & 1; + return (*(volatile uint32_t*)((uintptr_t)clint_msip_ptr(hartid) | + base_addr) >> + lsb_offset) & + 1; } /** @@ -339,30 +366,31 @@ static inline uint32_t get_clint_msip_hart(uint32_t hartid) { static inline uint32_t sw_interrupt_pending() { uint64_t mip; - asm volatile("csrr %[mip], mip" : [ mip ] "=r"(mip)); + asm volatile("csrr %[mip], mip" : [mip] "=r"(mip)); return mip & (1 << MIP_MSIP_OFFSET); } // TODO: for portability to architectures where WFI is implemented as a NOP // also sw_interrupts_enabled() should be checked static inline void wait_sw_interrupt() { - do - wfi(); + do wfi(); while (!sw_interrupt_pending()); } -static inline void clear_sw_interrupt_unsafe(uint32_t hartid) { +static inline void clear_sw_interrupt_unsafe(uint8_t chip_id, uint32_t hartid) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); uint32_t field_offset = hartid % CLINT_MSIP_P_FIELDS_PER_REG; uint32_t lsb_offset = field_offset * CLINT_MSIP_P_FIELD_WIDTH; - *clint_msip_ptr(hartid) &= ~(1 << lsb_offset); + *(volatile uint32_t*)((uintptr_t)clint_msip_ptr(hartid) | base_addr) &= + ~(1 << lsb_offset); } -static inline void clear_sw_interrupt(uint32_t hartid) { +static inline void clear_sw_interrupt(uint8_t chip_id, uint32_t hartid) { volatile uint32_t* shared_lock = get_shared_lock(); mutex_tas_acquire(shared_lock); - clear_sw_interrupt_unsafe(hartid); + clear_sw_interrupt_unsafe(chip_id, hartid); mutex_release(shared_lock); } @@ -374,35 +402,41 @@ static inline void clear_sw_interrupt(uint32_t hartid) { * status. That function interrogates a local CSR * instead of the shared CLINT. */ -static inline uint32_t remote_sw_interrupt_pending(uint32_t hartid) { - return get_clint_msip_hart(hartid); +static inline uint32_t remote_sw_interrupt_pending(uint8_t chip_id, + uint32_t hartid) { + return get_clint_msip_hart(chip_id, hartid); } static inline uint32_t timer_interrupts_enabled() { uint64_t mie; - asm volatile("csrr %[mie], mie" : [ mie ] "=r"(mie)); + asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); return (mie >> MIE_MTIE_OFFSET) & 1; } -static inline void set_sw_interrupt_unsafe(uint32_t hartid) { +static inline void set_sw_interrupt_unsafe(uint8_t chip_id, uint32_t hartid) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); uint32_t field_offset = hartid % CLINT_MSIP_P_FIELDS_PER_REG; uint32_t lsb_offset = field_offset * CLINT_MSIP_P_FIELD_WIDTH; - *clint_msip_ptr(hartid) |= (1 << lsb_offset); + *(volatile uint32_t*)((uintptr_t)clint_msip_ptr(hartid) | base_addr) |= + (1 << lsb_offset); } -void set_sw_interrupt(uint32_t hartid) { +void set_sw_interrupt(uint8_t chip_id, uint32_t hartid) { volatile uint32_t* shared_lock = get_shared_lock(); mutex_ttas_acquire(shared_lock); - set_sw_interrupt_unsafe(hartid); + set_sw_interrupt_unsafe(chip_id, hartid); mutex_release(shared_lock); } -static inline void set_sw_interrupts_unsafe(uint32_t base_hartid, +static inline void set_sw_interrupts_unsafe(uint8_t chip_id, + uint32_t base_hartid, uint32_t num_harts, uint32_t stride) { - volatile uint32_t* ptr = clint_msip_ptr(base_hartid); + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + volatile uint32_t* ptr = + (uint32_t*)((uintptr_t)clint_msip_ptr(base_hartid) | base_addr); uint32_t num_fields = num_harts; uint32_t field_idx = base_hartid; @@ -441,53 +475,55 @@ static inline void set_sw_interrupts_unsafe(uint32_t base_hartid, *ptr |= mask; } -void set_cluster_interrupt(uint32_t cluster_id, uint32_t core_id) { - *(cluster_clint_set_ptr(cluster_id)) = (1 << core_id); +void set_cluster_interrupt(uint8_t chip_id, uint32_t cluster_id, + uint32_t core_id) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + *(volatile uint32_t*)((uintptr_t)cluster_clint_set_ptr(cluster_id) | + base_addr) = (1 << core_id); } static inline uint32_t timer_interrupt_pending() { uint64_t mip; - asm volatile("csrr %[mip], mip" : [ mip ] "=r"(mip)); + asm volatile("csrr %[mip], mip" : [mip] "=r"(mip)); return mip & (1 << MIP_MTIP_OFFSET); } void wait_timer_interrupt() { - do - wfi(); + do wfi(); while (!timer_interrupt_pending() && timer_interrupts_enabled()); } void enable_global_interrupts() { uint64_t mstatus; - asm volatile("csrr %[mstatus], mstatus" : [ mstatus ] "=r"(mstatus)); + asm volatile("csrr %[mstatus], mstatus" : [mstatus] "=r"(mstatus)); mstatus |= (1 << MSTATUS_MIE_OFFSET); - asm volatile("csrw mstatus, %[mstatus]" : : [ mstatus ] "r"(mstatus)); + asm volatile("csrw mstatus, %[mstatus]" : : [mstatus] "r"(mstatus)); } void enable_timer_interrupts() { uint64_t mie; - asm volatile("csrr %[mie], mie" : [ mie ] "=r"(mie)); + asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); mie |= (1 << MIE_MTIE_OFFSET); - asm volatile("csrw mie, %[mie]" : : [ mie ] "r"(mie)); + asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); } void disable_timer_interrupts() { uint64_t mie; - asm volatile("csrr %[mie], mie" : [ mie ] "=r"(mie)); + asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); mie &= ~(1 << MIE_MTIE_OFFSET); - asm volatile("csrw mie, %[mie]" : : [ mie ] "r"(mie)); + asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); } void disable_sw_interrupts() { uint64_t mie; - asm volatile("csrr %[mie], mie" : [ mie ] "=r"(mie)); + asm volatile("csrr %[mie], mie" : [mie] "=r"(mie)); mie &= ~(1 << MIE_MSIE_OFFSET); - asm volatile("csrw mie, %[mie]" : : [ mie ] "r"(mie)); + asm volatile("csrw mie, %[mie]" : : [mie] "r"(mie)); } /** @@ -498,10 +534,7 @@ void disable_sw_interrupts() { * status. This avoids unnecessary congestion on the * interconnect and shared CLINT. */ -void wait_sw_interrupt_cleared() { - while (sw_interrupt_pending()) - ; -} +void wait_sw_interrupt_cleared() { while (sw_interrupt_pending()); } /** * @brief Gets SW interrupt pending status from shared CLINT @@ -511,9 +544,8 @@ void wait_sw_interrupt_cleared() { * status. That function polls a local CSR instead * of the shared CLINT. */ -void wait_remote_sw_interrupt_pending(uint32_t hartid) { - while (remote_sw_interrupt_pending(hartid)) - ; +void wait_remote_sw_interrupt_pending(uint8_t chip_id, uint32_t hartid) { + while (remote_sw_interrupt_pending(chip_id, hartid)); } //=============================================================== @@ -528,14 +560,21 @@ static inline uint64_t mcycle() { return r; } -static inline uint64_t mtime() { return *clint_mtime_ptr; } +static inline uint64_t mtime(uint8_t chip_id) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + return *(volatile uint64_t*)((uintptr_t)clint_mtime_ptr | base_addr); +} -void set_timer_interrupt(uint64_t interval_ns) { +void set_timer_interrupt(uint8_t chip_id, uint64_t interval_ns) { // Convert ns to RTC unit uint64_t rtc_interval = interval_ns / (int64_t)rtc_period; - // Offset interval by current time - *clint_mtimecmp0_ptr = mtime() + rtc_interval; + // Calculate the base address for the chip + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + + // Offset interval by current time and set the timer interrupt + *(volatile uint64_t*)((uintptr_t)clint_mtimecmp0_ptr | base_addr) = + mtime(chip_id) + rtc_interval; } /** @@ -548,11 +587,16 @@ void set_timer_interrupt(uint64_t interval_ns) { * the pending bit. If this is not desired, it is safer * to disable the timer interrupt before clearing it. */ -void clear_timer_interrupt() { *clint_mtimecmp0_ptr = mtime() + 1; } +void clear_timer_interrupt(uint8_t chip_id) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + *(volatile uint64_t*)((uintptr_t)clint_mtimecmp0_ptr | base_addr) = + mtime(chip_id) + 1; +} // Minimum delay is of one RTC period void delay_ns(uint64_t delay) { - set_timer_interrupt(delay); + uint8_t chip_id = get_current_chip_id(); + set_timer_interrupt(chip_id, delay); // Wait for set_timer_interrupt() to have effect fence(); @@ -560,7 +604,7 @@ void delay_ns(uint64_t delay) { wait_timer_interrupt(); disable_timer_interrupts(); - clear_timer_interrupt(); + clear_timer_interrupt(chip_id); } //=============================================================== @@ -682,8 +726,13 @@ void delay_ns(uint64_t delay) { uint32_t const ISO_MASK_ALL = 0b1111; uint32_t const ISO_MASK_NONE = 0; -static inline void deisolate_quad(uint32_t quad_idx, uint32_t iso_mask) { - *quad_cfg_isolate_ptr(quad_idx) &= ~iso_mask; +static inline void deisolate_quad(uint8_t chip_id, uint32_t quad_idx, + uint32_t iso_mask) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + volatile uint32_t* isolate_ptr = + (volatile uint32_t*)((uintptr_t)quad_cfg_isolate_ptr(quad_idx) | + base_addr); + *isolate_ptr &= ~iso_mask; } /** @@ -691,17 +740,27 @@ static inline void deisolate_quad(uint32_t quad_idx, uint32_t iso_mask) { * * @return Masked register field realigned to start at LSB */ -static inline uint32_t get_quad_cfg_isolated(uint32_t quad_idx) { - return *quad_cfg_isolated_ptr(quad_idx) & ISO_MASK_ALL; -} - -void isolate_quad(uint32_t quad_idx, uint32_t iso_mask) { - *quad_cfg_isolate_ptr(quad_idx) |= iso_mask; +static inline uint32_t get_quad_cfg_isolated(uint8_t chip_id, + uint32_t quad_idx) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + return *(volatile uint32_t*)((uintptr_t)quad_cfg_isolated_ptr(quad_idx) | + base_addr) & + ISO_MASK_ALL; +} + +void isolate_quad(uint8_t chip_id, uint32_t quad_idx, uint32_t iso_mask) { + uintptr_t base_addr = (uintptr_t)get_chip_baseaddress(chip_id); + volatile uint32_t* isolate_ptr = + (volatile uint32_t*)((uintptr_t)quad_cfg_isolate_ptr(quad_idx) | + base_addr); + *isolate_ptr |= iso_mask; fence(); } -static inline void deisolate_all() { - for (uint32_t i = 0; i < N_QUADS; ++i) deisolate_quad(i, ISO_MASK_ALL); +static inline void deisolate_all(uint8_t chip_id) { + for (uint32_t i = 0; i < N_QUADS; ++i) { + deisolate_quad(chip_id, i, ISO_MASK_ALL); + } } /** @@ -710,10 +769,13 @@ static inline void deisolate_all() { * @param iso_mask set bit to 1 to check if path is isolated, 0 de-isolated * @return 1 is check passes, 0 otherwise */ -uint32_t check_isolated_timeout(uint32_t max_tries, uint32_t quadrant_idx, - uint32_t iso_mask) { - for (uint32_t i = 0; i < max_tries; ++i) - if (get_quad_cfg_isolated(quadrant_idx) == iso_mask) return 1; +uint32_t check_isolated_timeout(uint8_t chip_id, uint32_t max_tries, + uint32_t quadrant_idx, uint32_t iso_mask) { + for (uint32_t i = 0; i < max_tries; ++i) { + if (get_quad_cfg_isolated(chip_id, quadrant_idx) == iso_mask) { + return 1; + } + } return 0; } diff --git a/target/sim/sw/host/runtime/host.h b/target/sim/sw/host/runtime/host.h index 55e1623ab..b8b2ac3d9 100644 --- a/target/sim/sw/host/runtime/host.h +++ b/target/sim/sw/host/runtime/host.h @@ -5,7 +5,7 @@ #include #include -static inline void set_sw_interrupt(uint32_t hartid); +static inline void set_sw_interrupt(uint8_t chip_id, uint32_t hartid); void delay_ns(uint64_t delay); @@ -13,4 +13,4 @@ static inline volatile uint32_t* get_shared_lock(); static inline void wait_sw_interrupt(); -static inline void clear_sw_interrupt(uint32_t hartid); +static inline void clear_sw_interrupt(uint8_t chip_id, uint32_t hartid); diff --git a/target/sim/sw/shared/platform/generated/sys_dma.h b/target/sim/sw/shared/platform/generated/sys_dma.h index 3a513decd..0ac70fa30 100644 --- a/target/sim/sw/shared/platform/generated/sys_dma.h +++ b/target/sim/sw/shared/platform/generated/sys_dma.h @@ -47,6 +47,7 @@ extern "C" { #include #include "occamy_memory_map.h" +#include "chip_id.h" #define IDMA_SRC_ADDR \ (SYS_IDMA_CFG_BASE_ADDR + IDMA_REG64_FRONTEND_SRC_ADDR_REG_OFFSET) @@ -68,25 +69,32 @@ extern "C" { #define IDMA_CONF_SERIALIZE 0 inline volatile uint64_t *sys_dma_src_ptr(void) { - return (volatile uint64_t *)IDMA_SRC_ADDR; + return (volatile uint64_t *)(IDMA_SRC_ADDR | + (uintptr_t)get_current_chip_baseaddress()); } inline volatile uint64_t *sys_dma_dst_ptr(void) { - return (volatile uint64_t *)IDMA_DST_ADDR; + return (volatile uint64_t *)(IDMA_DST_ADDR | + (uintptr_t)get_current_chip_baseaddress()); } inline volatile uint64_t *sys_dma_num_bytes_ptr(void) { - return (volatile uint64_t *)IDMA_NUMBYTES_ADDR; + return (volatile uint64_t *)(IDMA_NUMBYTES_ADDR | + (uintptr_t)get_current_chip_baseaddress()); } inline volatile uint64_t *sys_dma_conf_ptr(void) { - return (volatile uint64_t *)IDMA_CONF_ADDR; + return (volatile uint64_t *)(IDMA_CONF_ADDR | + (uintptr_t)get_current_chip_baseaddress()); } inline volatile uint64_t *sys_dma_status_ptr(void) { - return (volatile uint64_t *)IDMA_STATUS_ADDR; + return (volatile uint64_t *)(IDMA_STATUS_ADDR | + (uintptr_t)get_current_chip_baseaddress()); } inline volatile uint64_t *sys_dma_nextid_ptr(void) { - return (volatile uint64_t *)IDMA_NEXTID_ADDR; + return (volatile uint64_t *)(IDMA_NEXTID_ADDR | + (uintptr_t)get_current_chip_baseaddress()); } inline volatile uint64_t *sys_dma_done_ptr(void) { - return (volatile uint64_t *)IDMA_DONE_ADDR; + return (volatile uint64_t *)(IDMA_DONE_ADDR | + (uintptr_t)get_current_chip_baseaddress()); } inline uint64_t sys_dma_memcpy(uint64_t dst, uint64_t src, uint64_t size) { diff --git a/target/sim/sw/shared/platform/generated/tlb.h b/target/sim/sw/shared/platform/generated/tlb.h.bak similarity index 97% rename from target/sim/sw/shared/platform/generated/tlb.h rename to target/sim/sw/shared/platform/generated/tlb.h.bak index 16be33ee8..06eb0fd1b 100644 --- a/target/sim/sw/shared/platform/generated/tlb.h +++ b/target/sim/sw/shared/platform/generated/tlb.h.bak @@ -2,6 +2,8 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +// Please be aware that tlb is fully removed from Quadrant + #pragma once #include diff --git a/target/sim/sw/shared/platform/generated/uart.h b/target/sim/sw/shared/platform/generated/uart.h index def2ae442..3c7e545e3 100644 --- a/target/sim/sw/shared/platform/generated/uart.h +++ b/target/sim/sw/shared/platform/generated/uart.h @@ -20,6 +20,26 @@ #define UART_DLAB_LSB UART_BASE_ADDR + 0 #define UART_DLAB_MSB UART_BASE_ADDR + 4 +/* + UART_LINE_CONTROL[1:0]: iLCR_WLS Word Length Select, 2'b11 for two bits mode + UART_LINE_CONTROL[2]: iLCE_STB, 1'b0 for one bit stop bit + UART_LINE_CONTROL[3]: iLCR_PEN, Parity Enable, disable as there is higher + level parity check algorithm UART_LINE_CONTROL[4]: iLCR_PES, also related to + parity check UART_LINE_CONTROL[5]: iLCR_SP, also related to parity check + UART_LINE_CONTROL[6]: iLCR_BC, signaling a break control + UART_LINE_CONTROL[7]: iLCR_DLAB, don't know what it is + +*/ +/* + UART_MODEM_CONTROL[0]: iMCR_DTR (DTR output, not used) + UART_MODEM_CONTROL[1]: iMCR_RTS (RTS output, set 1 to inform the device is + ready to receive the data) UART_MODEM_CONTROL[2]: iMCR_OUT1 (General Purpose + Output 1, not used) UART_MODEM_CONTROL[3]: iMCR_OUT2 (General Purpose Output + 2, not used) UART_MODEM_CONTROL[4]: iMCR_LOOP (Internal Loopback, should set + to 0) UART_MODEM_CONTROL[5]: iMCR_AFE (Automatic Flow Control, set to 1 to + automatically manage DTR and RTS) +*/ + inline static void write_reg_u8(uintptr_t addr, uint8_t value) { volatile uint8_t *loc_addr = (volatile uint8_t *)addr; *loc_addr = value; @@ -29,104 +49,138 @@ inline static uint8_t read_reg_u8(uintptr_t addr) { return *(volatile uint8_t *)addr; } -inline static int is_data_ready() { - return read_reg_u8(UART_LINE_STATUS) & 0x01; +inline static int is_data_ready(uintptr_t address_prefix) { + return read_reg_u8(address_prefix | UART_LINE_STATUS) & 0x01; } -inline static int is_data_overrun() { - return read_reg_u8(UART_LINE_STATUS) & 0x02; +inline static int is_data_overrun(uintptr_t address_prefix) { + return read_reg_u8(address_prefix | UART_LINE_STATUS) & 0x02; } -inline static int is_transmit_empty() { - return read_reg_u8(UART_LINE_STATUS) & 0x20; +inline static int is_transmit_empty(uintptr_t address_prefix) { + return read_reg_u8(address_prefix | UART_LINE_STATUS) & 0x20; } -inline static int is_transmit_done() { - return read_reg_u8(UART_LINE_STATUS) & 0x40; +inline static int is_transmit_done(uintptr_t address_prefix) { + return read_reg_u8(address_prefix | UART_LINE_STATUS) & 0x40; } -inline static void write_serial(char a) { - while (is_transmit_empty() == 0) { +inline static void init_uart(uintptr_t address_prefix, uint32_t freq, + uint32_t baud) { + uint32_t divisor = freq / (baud << 4); + + write_reg_u8(address_prefix | UART_INTERRUPT_ENABLE, + 0x00); // Disable all interrupts + write_reg_u8(address_prefix | UART_LINE_CONTROL, + 0x80); // Enable DLAB (set baud rate divisor) + write_reg_u8(address_prefix | UART_DLAB_LSB, divisor); // divisor (lo byte) + write_reg_u8(address_prefix | UART_DLAB_MSB, + (divisor >> 8) & 0xFF); // divisor (hi byte) + write_reg_u8(address_prefix | UART_LINE_CONTROL, + 0x03); // 8 bits, no parity, one stop bit + write_reg_u8(address_prefix | UART_FIFO_CONTROL, + 0xC7); // Enable FIFO, clear them, with 14-byte threshold + write_reg_u8(address_prefix | UART_MODEM_CONTROL, + 0x22); // Flow control enabled, auto flow control mode +} +inline static void print_char(uintptr_t address_prefix, char a) { + while (is_transmit_empty(address_prefix) == 0) { }; - write_reg_u8(UART_THR, a); + write_reg_u8(address_prefix | UART_THR, a); } -inline static uint8_t read_serial() { - while (is_data_ready() == 0) { +inline static uint8_t scan_char(uintptr_t address_prefix) { + while (is_data_ready(address_prefix) == 0) { }; - return read_reg_u8(UART_RBR); + return read_reg_u8(address_prefix | UART_RBR); } -inline static void init_uart(uint32_t freq, uint32_t baud) { - uint32_t divisor = freq / (baud << 4); - write_reg_u8(UART_INTERRUPT_ENABLE, 0x00); // Disable all interrupts - write_reg_u8(UART_LINE_CONTROL, - 0x80); // Enable DLAB (set baud rate divisor) - write_reg_u8(UART_DLAB_LSB, divisor); // divisor (lo byte) - write_reg_u8(UART_DLAB_MSB, (divisor >> 8) & 0xFF); // divisor (hi byte) - write_reg_u8(UART_LINE_CONTROL, 0x03); // 8 bits, no parity, one stop bit - write_reg_u8(UART_FIFO_CONTROL, - 0xC7); // Enable FIFO, clear them, with 14-byte threshold - write_reg_u8(UART_MODEM_CONTROL, 0x22); // Autoflow mode +// inline static int putchar(char a) { +// uintptr_t address_prefix = (uintptr_t)get_current_chip_baseaddress(); +// print_char_uart(address_prefix, a); +// return 0; +// } + +// inline static uint8_t getchar(void) { +// uintptr_t address_prefix = (uintptr_t)get_current_chip_baseaddress(); +// return scan_char(address_prefix); +// } + +inline static void print_u8(uintptr_t address_prefix, uint8_t value) { + char lut[16] = {'0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + print_char(address_prefix, lut[value / 16]); + print_char(address_prefix, lut[value % 16]); + while (!is_transmit_done(address_prefix)); } -inline static void print_uart(const char *str) { - const char *cur = &str[0]; - while (*cur != '\0') { - write_serial((uint8_t)*cur); - ++cur; +inline static void print_u32(uintptr_t address_prefix, uint32_t value) { + char lut[16] = {'0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + for (int i = 28; i >= 0; i = i - 4) { + print_char(address_prefix, lut[(value >> i) % 16]); } - while (!is_transmit_done()); + while (!is_transmit_done(address_prefix)); } -inline static void scan_uart(char *str) { - char *cur = &str[0]; - while (1) { - *cur = read_serial(); - if (*cur == '\r') { - *cur = '\0'; - return; - } else - cur++; +inline static void print_u48(uintptr_t address_prefix, uint64_t value) { + char lut[16] = {'0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + for (int i = 44; i >= 0; i = i - 4) { + print_char(address_prefix, lut[(value >> i) % 16]); } + while (!is_transmit_done(address_prefix)); } -static uint8_t bin_to_hex_table[16] = {'0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; - -inline static void bin_to_hex(uint8_t inp, uint8_t res[2]) { - res[1] = bin_to_hex_table[inp & 0xf]; - res[0] = bin_to_hex_table[(inp >> 4) & 0xf]; - return; +inline static void print_u64(uintptr_t address_prefix, uint64_t value) { + char lut[16] = {'0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + for (int i = 60; i >= 0; i = i - 4) { + print_char(address_prefix, lut[(value >> i) % 16]); + } + while (!is_transmit_done(address_prefix)); } -inline static void print_uart_int(uint32_t addr) { - int i; - for (i = 3; i > -1; i--) { - uint8_t cur = (addr >> (i * 8)) & 0xff; - uint8_t hex[2]; - bin_to_hex(cur, hex); - write_serial(hex[0]); - write_serial(hex[1]); +inline static void print_mem_hex(uintptr_t address_prefix, char *str, + uint32_t length) { + uint8_t lut[16] = {'0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + for (uint64_t i = (uint64_t)str; i < (uint64_t)str + length; i++) { + if (i % 16 == 0) { + print_char(address_prefix, '\r'); + print_char(address_prefix, '\n'); + for (int j = 28; j >= 0; j = j - 4) + print_char(address_prefix, lut[(i >> j) % 16]); + print_char(address_prefix, ':'); + print_char(address_prefix, ' '); + } + char temp = *((char *)i); + print_char(address_prefix, lut[temp / 16]); + print_char(address_prefix, lut[temp % 16]); + print_char(address_prefix, ' '); } + while (!is_transmit_done(address_prefix)); } -inline static void print_uart_addr(uint64_t addr) { - int i; - for (i = 7; i > -1; i--) { - uint8_t cur = (addr >> (i * 8)) & 0xff; - uint8_t hex[2]; - bin_to_hex(cur, hex); - write_serial(hex[0]); - write_serial(hex[1]); +inline static void print_str(uintptr_t address_prefix, const char *str) { + const char *cur = &str[0]; + while (*cur != '\0') { + print_char(address_prefix, (uint8_t)*cur); + ++cur; } + while (!is_transmit_done(address_prefix)); } -inline static void print_uart_byte(uint8_t byte) { - uint8_t hex[2]; - bin_to_hex(byte, hex); - write_serial(hex[0]); - write_serial(hex[1]); +inline static void scan_str(uintptr_t address_prefix, char *str) { + char *cur = &str[0]; + while (1) { + *cur = scan_char(address_prefix); + if (*cur == '\r') { + *cur = '\0'; + return; + } else + cur++; + } } diff --git a/target/sim/sw/shared/runtime/chip_id.h b/target/sim/sw/shared/runtime/chip_id.h new file mode 100644 index 000000000..04928e84f --- /dev/null +++ b/target/sim/sw/shared/runtime/chip_id.h @@ -0,0 +1,43 @@ +#pragma once +#include + +inline uint8_t get_current_chip_id() { + uint32_t chip_id; +# if __riscv_xlen == 64 + // 64-bit system (CVA6), get chip_id from 0xf15 + asm volatile("csrr %0, 0xf15" : "=r"(chip_id)); +# else + // 32-bit system, get chip_id from 0xbc2 (base_addrh) + // and shift it to the right by 8 bits + asm volatile ("csrr %0, 0xbc2" : "=r"(chip_id)); + chip_id = chip_id >> 8; +# endif + return (uint8_t)chip_id; +} + +inline uint8_t *get_current_chip_baseaddress() { +#if __riscv_xlen == 64 + // 64-bit system (CVA6), get chip_id from 0xf15 + uint32_t chip_id; + asm volatile("csrr %0, 0xf15" : "=r"(chip_id)); + return (uint8_t *)((uintptr_t)chip_id << 40); +#else + // 32-bit system, return 0 (not supported) + return (uint8_t *)0; +#endif +} + +inline uint8_t *get_chip_baseaddress(uint8_t chip_id) { +#if __riscv_xlen == 64 + // 64-bit system, perform the shift and return the base address + return (uint8_t *)((uintptr_t)chip_id << 40); +#else + // 32-bit system, return 0 (not supported) + return (uint8_t *)0; +#endif +} + +inline uint32_t get_current_chip_baseaddress_h() { + uint32_t chip_id = get_current_chip_id(); + return (uint32_t)(chip_id << 8); +} \ No newline at end of file diff --git a/target/sim/sw/shared/runtime/heterogeneous_runtime.h b/target/sim/sw/shared/runtime/heterogeneous_runtime.h index 77cfaeebb..7ab12a9d6 100644 --- a/target/sim/sw/shared/runtime/heterogeneous_runtime.h +++ b/target/sim/sw/shared/runtime/heterogeneous_runtime.h @@ -4,6 +4,7 @@ #include +#include "chip_id.h" #include "occamy.h" #include "occamy_memory_map.h" @@ -19,18 +20,32 @@ typedef struct { /* Interrupts */ /**************/ -inline void set_host_sw_interrupt() { *clint_msip_ptr(0) = 1; } +inline static void set_host_sw_interrupt(uint8_t chip_id) { + uint32_t* msip_ptr = + (uint32_t*)(((uintptr_t)clint_msip_ptr(0)) | + ((uintptr_t)get_chip_baseaddress(chip_id))); + *msip_ptr = 1; +} + +inline void clear_host_sw_interrupt_unsafe(uint8_t chip_id) { + uint32_t* msip_ptr = + (uint32_t*)(((uintptr_t)clint_msip_ptr(0)) | + ((uintptr_t)get_chip_baseaddress(chip_id))); + + *msip_ptr = 0; +} -inline void clear_host_sw_interrupt_unsafe() { *clint_msip_ptr(0) = 0; } +inline void wait_host_sw_interrupt_clear(uint8_t chip_id) { + uint32_t* msip_ptr = + (uint32_t*)(((uintptr_t)clint_msip_ptr(0)) | + ((uintptr_t)get_chip_baseaddress(chip_id))); -inline void wait_host_sw_interrupt_clear() { - while (*clint_msip_ptr(0)) - ; + while (*msip_ptr); } -inline void clear_host_sw_interrupt() { - clear_host_sw_interrupt_unsafe(); - wait_host_sw_interrupt_clear(); +static inline void clear_host_sw_interrupt(uint8_t chip_id) { + clear_host_sw_interrupt_unsafe(chip_id); + wait_host_sw_interrupt_clear(chip_id); } /**************************/ @@ -38,17 +53,23 @@ inline void clear_host_sw_interrupt() { /**************************/ // Configure RO cache address range -inline void configure_read_only_cache_addr_rule(uint32_t quad_idx, +inline void configure_read_only_cache_addr_rule(uint8_t chip_id, + uint32_t quad_idx, uint32_t rule_idx, uint64_t start_addr, uint64_t end_addr) { volatile uint64_t* rule_ptr = - quad_cfg_ro_cache_addr_rule_ptr(quad_idx, rule_idx); + (uint64_t*)(((uintptr_t)quad_cfg_ro_cache_addr_rule_ptr(quad_idx, + rule_idx)) | + ((uintptr_t)get_chip_baseaddress(chip_id))); *(rule_ptr) = start_addr; *(rule_ptr + 1) = end_addr; } // Enable RO cache -inline void enable_read_only_cache(uint32_t quad_idx) { - *(quad_cfg_ro_cache_enable_ptr(quad_idx)) = 1; +inline void enable_read_only_cache(uint8_t chip_id, uint32_t quad_idx) { + volatile uint32_t* enable_ptr = + (uint32_t*)(((uintptr_t)quad_cfg_ro_cache_enable_ptr(quad_idx)) | + ((uintptr_t)get_chip_baseaddress(chip_id))); + *enable_ptr = 1; } diff --git a/target/sim_chip/.gitignore b/target/sim_chip/.gitignore index 5d83521cd..534ec2d2d 100644 --- a/target/sim_chip/.gitignore +++ b/target/sim_chip/.gitignore @@ -2,3 +2,5 @@ /work-vlt /bin /apps/*.bin +/testharness/testharness.sv +/logs diff --git a/target/sim_chip/Makefile b/target/sim_chip/Makefile index c4844b2c5..00e91ab7e 100644 --- a/target/sim_chip/Makefile +++ b/target/sim_chip/Makefile @@ -29,6 +29,7 @@ ROOT := $(SIM_MKFILE_DIR)../.. SNITCH_ROOT := $(shell bender path snitch_cluster) TARGET = occamy_chip +LOGS_DIR = logs CVA6_TXT_TRACE = $(LOGS_DIR)/trace_hart_00000.txt CVA6_PERF_DUMP = $(LOGS_DIR)/hart_00000_perf.json @@ -109,10 +110,10 @@ VLT_COBJ += $(VLT_BUILDDIR)/testharness/testharness.o CVA6_BINARY ?= $(shell cat $(LOGS_DIR)/.rtlbinary) -CVA6_TXT_TRACE = $(LOGS_DIR)/trace_hart_00000.txt -CVA6_PERF_TRACE = $(LOGS_DIR)/hart_00000_perf.json -CVA6_ANNOTATED_TRACE = $(LOGS_DIR)/trace_hart_00000.s -CVA6_DIFF_TRACE = $(LOGS_DIR)/trace_hart_00000.diff +CVA6_TXT_TRACE = $(LOGS_DIR)/trace_chip_00_hart_00000.txt +CVA6_PERF_TRACE = $(LOGS_DIR)/chip_00_hart_00000_perf.json +CVA6_ANNOTATED_TRACE = $(LOGS_DIR)/trace_chip_00_hart_00000.s +CVA6_DIFF_TRACE = $(LOGS_DIR)/trace_chip_00_hart_00000.diff TXT_TRACES += $(CVA6_TXT_TRACE) PERF_TRACES += $(CVA6_PERF_TRACE) @@ -140,11 +141,40 @@ apps: clean-apps: $(MAKE) -C apps clean +############### +# Testharness # +############### + +TARGET_RTL = $(ROOT)/target/rtl + +CFG = $(TARGET_RTL)/cfg/occamy_cfg/lru.hjson + +OCCAMYGEN ?= $(ROOT)/util/occamygen/occamygen.py + +$(CFG): + @# If the LRU config file doesn't exist, we use the default config. + @if [ ! -e $@ ] ; then \ + DEFAULT_CFG="$(TARGET_RTL)/cfg/occamy_cfg/hemaia.hjson"; \ + echo "Using default config file: $$DEFAULT_CFG"; \ + cp $$DEFAULT_CFG $@; \ + fi + @# If a config file is provided on the command-line + @# then we override the LRU file with it + @if [ $(CFG_OVERRIDE) ] ; then \ + echo "Overriding config file with: $(CFG_OVERRIDE)"; \ + cp $(CFG_OVERRIDE) $@; \ + fi + + +testharness/testharness.sv: testharness/testharness.sv.tpl $(CFG) + @echo "[OCCAMYGEN] Generate $@" + @$(OCCAMYGEN) --cfg $(CFG) --outdir testharness --multichip-testharness-sv $< + ############# # Verilator # ############# -${VLT_AR}: ${VLT_SOURCES} ${TB_SRCS} +${VLT_AR}: ${VLT_SOURCES} ${TB_SRCS} testharness/testharness.sv $(call VERILATE,testharness) # Quick sanity check, not really meant for simulation. @@ -184,7 +214,7 @@ clean-vlt: clean-work $(VSIM_BUILDDIR): mkdir -p $@ -$(VSIM_BUILDDIR)/compile.vsim.tcl: $(BENDER_LOCK) | $(VSIM_BUILDDIR) +$(VSIM_BUILDDIR)/compile.vsim.tcl: $(BENDER_LOCK) | $(VSIM_BUILDDIR) testharness/testharness.sv $(BENDER) script vsim $(VSIM_BENDER) --vlog-arg="$(VLOG_FLAGS) -work $(dir $@) " > $@ echo '$(VLOG) -work $(dir $@) $(TB_CC_SOURCES) -vv -ccflags "$(TB_CC_FLAGS)"' >> $@ echo 'return 0' >> $@ diff --git a/target/sim_chip/apps/copybin.py b/target/sim_chip/apps/copybin.py new file mode 100755 index 000000000..8a106ca78 --- /dev/null +++ b/target/sim_chip/apps/copybin.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +import os +import shutil +import argparse + +def copy_file_m_n_times(input_file, m, n): + # Check if the input file exists + if not os.path.isfile(input_file): + print(f"Error: {input_file} does not exist.") + return + + os.makedirs("../bin/", exist_ok=True) + # Loop through m and n to create copies + for x in range(m): + for y in range(n): + # Define the new file name + output_file = f"../bin/app_chip_{x}_{y}.bin" + + # Copy the file to the new location with the new name + shutil.copy(input_file, output_file) + print(f"Copied {input_file} to {output_file}") + +def main(): + # Define argument parser + parser = argparse.ArgumentParser(description='Copy a file m*n times with specific naming convention.') + + # Positional arguments + parser.add_argument('-i', '--input-file', required=True, help='Path to the input file to be copied') + parser.add_argument('-m', required=True, type=int, help='Number of rows (copies along x-axis)') + parser.add_argument('-n', required=True, type=int, help='Number of columns (copies along y-axis)') + + # Parse the arguments + args = parser.parse_args() + + # Call the function with parsed arguments + copy_file_m_n_times(args.input_file, args.m, args.n) + +if __name__ == '__main__': + main() diff --git a/target/sim_chip/sim.mk b/target/sim_chip/sim.mk index bbb0b0982..082418f32 100644 --- a/target/sim_chip/sim.mk +++ b/target/sim_chip/sim.mk @@ -40,7 +40,7 @@ SED_SRCS := sed -e ${MATCH_END} -e ${MATCH_BGN} VSIM_BENDER += -t test -t rtl -t vsim VSIM_SOURCES = $(shell ${BENDER} script flist ${VSIM_BENDER} | ${SED_SRCS}) VSIM_BUILDDIR ?= work-vsim -VOPT_FLAGS = +acc +VOPT_FLAGS = +acc # VCS_BUILDDIR should to be the same as the `DEFAULT : ./work-vcs` # in target/snitch_cluster/synopsys_sim.setup @@ -207,9 +207,9 @@ endef # Traces # ########## -DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null)) +DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_chip_??_hart_*.dasm 2>/dev/null)) TXT_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.txt/g')) -PERF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g')) +PERF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/.dasm/_perf.json/g')) ANNOTATED_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.s/g')) DIFF_TRACES = $(shell (echo $(DASM_TRACES) | sed 's/\.dasm/\.diff/g')) @@ -227,16 +227,19 @@ perf-csv: $(PERF_CSV) event-csv: $(EVENT_CSV) layout: $(TRACE_CSV) $(TRACE_JSON) -$(LOGS_DIR)/trace_hart_%.txt $(LOGS_DIR)/hart_%_perf.json: $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) - $(DASM) < $< | $(PYTHON) $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/hart_$*_perf.json > $(LOGS_DIR)/trace_hart_$*.txt - +$(LOGS_DIR)/%.txt $(LOGS_DIR)/%_perf.json: $(LOGS_DIR)/%.dasm $(GENTRACE_PY) + @CHIP=$(word 3,$(subst _, ,$*)) && \ + HART=$(word 5,$(subst _, ,$*)) && \ + echo "Processing Chip $$CHIP Hart $$HART" && \ + $(DASM) < $< | $(PYTHON) $(GENTRACE_PY) --permissive -d $(LOGS_DIR)/chip_$$CHIP\_hart_$$HART\_perf.json > $(LOGS_DIR)/trace_chip_$$CHIP\_hart_$$HART.txt # Generate source-code interleaved traces for all harts. Reads the binary from # the logs/.rtlbinary file that is written at start of simulation in the vsim script BINARY ?= $(shell cat $(LOGS_DIR)/.rtlbinary) -$(LOGS_DIR)/trace_hart_%.s: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY} - $(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -$(LOGS_DIR)/trace_hart_%.diff: $(LOGS_DIR)/trace_hart_%.txt ${ANNOTATE_PY} - $(PYTHON) ${ANNOTATE_PY} ${ANNOTATE_FLAGS} -o $@ $(BINARY) $< -d + +$(LOGS_DIR)/%.s: $(LOGS_DIR)/%.txt $(ANNOTATE_PY) + $(PYTHON) $(ANNOTATE_PY) $(ANNOTATE_FLAGS) -o $@ $(BINARY) $< +$(LOGS_DIR)/%.diff: $(LOGS_DIR)/%.txt $(ANNOTATE_PY) + $(PYTHON) $(ANNOTATE_PY) $(ANNOTATE_FLAGS) -o $@ $(BINARY) $< -d $(PERF_CSV): $(PERF_TRACES) $(PERF_CSV_PY) $(PYTHON) $(PERF_CSV_PY) -o $@ -i $(PERF_TRACES) diff --git a/target/sim_chip/testharness/testharness.sv b/target/sim_chip/testharness/testharness.sv.tpl similarity index 61% rename from target/sim_chip/testharness/testharness.sv rename to target/sim_chip/testharness/testharness.sv.tpl index c5ed939bb..6088c593c 100644 --- a/target/sim_chip/testharness/testharness.sv +++ b/target/sim_chip/testharness/testharness.sv.tpl @@ -1,6 +1,15 @@ -// Copyright 2020 ETH Zurich and University of Bologna. +// Copyright 2024 KU Leuven. // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 +// Yunhao Deng + +<% + x = range(0, 1) + y = range(0, 1) + if multichip_cfg["single_chip"] is False: + x = range(multichip_cfg["testbench_cfg"]["upper_left_coordinate"][0], multichip_cfg["testbench_cfg"]["lower_right_coordinate"][0] + 1) + y = range(multichip_cfg["testbench_cfg"]["upper_left_coordinate"][1], multichip_cfg["testbench_cfg"]["lower_right_coordinate"][1] + 1) +%> `timescale 1ns / 1ps `include "axi/typedef.svh" @@ -18,20 +27,20 @@ module testharness logic rst_ni; logic rtc_i; - reg [511:0] wide_sram_buffer[SRAM_DEPTH-1:0]; // Task to load binary file into memory array - task automatic load_binary_to_buffer(input string filepath); + task automatic load_binary_to_hardware(input string filepath, + ref logic [511:0] wide_sram_buffer[SRAM_DEPTH-1:0]); integer fd; integer i; reg [7:0] buffer[SRAM_WIDTH*SRAM_DEPTH]; - foreach (buffer[i]) buffer[i] = 0; + foreach (buffer[i]) buffer[i] = 0; // Zero out the buffer // Open the binary file fd = $fopen(filepath, "rb"); if (fd == 0) begin $display("Failed to open binary file: %s", filepath); - $finish; + $finish(-1); end // Read the binary data into the buffer @@ -110,6 +119,8 @@ module testharness $display("Binary file '%s' loaded into memory.", filepath); endtask + // Chip finish signal + integer chip_finish[${max(x)}:${min(x)}][${max(y)}:${min(y)}]; // Generate reset and clock. initial begin @@ -117,6 +128,9 @@ module testharness clk_i = 0; // Reset the chip + foreach (chip_finish[i,j]) begin + chip_finish[i][j] = 0; + end rst_ni = 1; #0; $display("Resetting the system at %tns", $time / 1000); @@ -124,15 +138,37 @@ module testharness #(10 + $urandom % 10); $display("Reset released at %tns", $time / 1000); rst_ni = 1; + // Load the binaries +% for i in x: +% for j in y: + load_binary_to_hardware("app_chip_${i}_${j}.bin", i_occamy_${i}_${j}.i_spm_wide_cut.i_mem.i_tc_sram.sram); +% endfor +% endfor + end - // Initialize the memory - load_binary_to_buffer("app_chip_0_0.bin"); - - force i_occamy.i_spm_wide_cut.i_mem.i_tc_sram.sram = wide_sram_buffer; - #0; - release i_occamy.i_spm_wide_cut.i_mem.i_tc_sram.sram; + always_comb begin + automatic integer allFinished = 1; + automatic integer allCorrect = 1; + for (int i = ${min(x)}; i <= ${max(x)}; i = i + 1) begin + for (int j = ${min(y)}; j <= ${max(y)}; j = j + 1) begin + if (chip_finish[i][j] == 0) begin + allFinished = 0; + end + if (chip_finish[i][j] == -1) begin + allCorrect = 0; + end + end + end - // Places to load the binaries + if (allFinished == 1) begin + if (allCorrect == 1) begin + $display("All chips finished successfully at %tns", $time / 1000); + $finish; + end else begin + $error("All chips finished with errors at %tns", $time / 1000); + end + $finish(-1); + end end always #(CLKTCK / 2) begin @@ -152,36 +188,34 @@ module testharness end end - // Finish Block - always @(i_occamy.i_spm_wide_cut.i_mem.i_tc_sram.sram[SRAM_DEPTH-1][(SRAM_WIDTH*8-1)-:32]) begin - if (i_occamy.i_spm_wide_cut.i_mem.i_tc_sram.sram[SRAM_DEPTH-1][(SRAM_WIDTH*8-1)-:32] != 0) begin - if (i_occamy.i_spm_wide_cut.i_mem.i_tc_sram.sram[SRAM_DEPTH-1][(SRAM_WIDTH*8-1)-:32] == 32'd1) begin - $display("Simulation finished at %tns", $time / 1000); - $finish; - end else begin - $error("Simulation finished with errors %d at %tns", i_occamy.i_spm_wide_cut.i_mem.i_tc_sram.sram[SRAM_DEPTH-1][(SRAM_WIDTH*8-1)-:32], $time / 1000); - end - end - end - logic clk_periph_i, rst_periph_ni; assign clk_periph_i = clk_i; assign rst_periph_ni = rst_ni; + // Must be the frequency of i_uart0.clk_i in Hz + localparam int unsigned UartDPIFreq = 1_000_000_000; + +% for i in x: +% for j in y: /// Uart signals - logic tx, rx; + logic tx_${i}_${j}, rx_${i}_${j}; + + <% + i_hex_string = "{:01x}".format(i) + j_hex_string = "{:01x}".format(j) + %> - occamy_chip i_occamy ( + occamy_chip i_occamy_${i}_${j} ( .clk_i, .rst_ni, .clk_periph_i, .rst_periph_ni, .rtc_i, - .chip_id_i('0), + .chip_id_i(8'h${i_hex_string}${j_hex_string}), .test_mode_i(1'b0), .boot_mode_i(0), - .uart_tx_o(tx), - .uart_rx_i(rx), + .uart_tx_o(tx_${i}_${j}), + .uart_rx_i(rx_${i}_${j}), .uart_rts_no(), .uart_cts_ni('0), .gpio_d_i('0), @@ -200,19 +234,34 @@ module testharness .ext_irq_i('0) ); - // Must be the frequency of i_uart0.clk_i in Hz - localparam int unsigned UartDPIFreq = 1_000_000_000; - uartdpi #( .BAUD('d20_000_000), // Frequency shouldn't matter since we are sending with the same clock. .FREQ(UartDPIFreq), - .NAME("uart0") - ) i_uart0 ( + .NAME("uart_${i}_${j}") + ) i_uart_${i}_${j} ( .clk_i (clk_i), .rst_ni(rst_ni), - .tx_o (rx), - .rx_i (tx) + .tx_o (rx_${i}_${j}), + .rx_i (tx_${i}_${j}) ); + // Chip Status Monitor Block + always @(i_occamy_${i}_${j}.i_spm_wide_cut.i_mem.i_tc_sram.sram[SRAM_DEPTH-1][(SRAM_WIDTH*8-1)-:32]) begin + if (i_occamy_${i}_${j}.i_spm_wide_cut.i_mem.i_tc_sram.sram[SRAM_DEPTH-1][(SRAM_WIDTH*8-1)-:32] != 0) begin + if (i_occamy_${i}_${j}.i_spm_wide_cut.i_mem.i_tc_sram.sram[SRAM_DEPTH-1][(SRAM_WIDTH*8-1)-:32] == 32'd1) begin + $display("Simulation of chip_${i}_${j} is finished at %tns", $time / 1000); + chip_finish[${i}][${j}] = 1; + end else begin + $error("Simulation of chip_${i}_${j} is finished with errors %d at %tns", + i_occamy_${i}_${j}.i_spm_wide_cut.i_mem.i_tc_sram.sram[SRAM_DEPTH-1][(SRAM_WIDTH*8-1)-:32], + $time / 1000); + chip_finish[${i}][${j}] = -1; + end + end + end + +% endfor +% endfor + endmodule diff --git a/util/occamygen/occamy.py b/util/occamygen/occamy.py index 6345ba1a3..593b37565 100644 --- a/util/occamygen/occamy.py +++ b/util/occamygen/occamy.py @@ -641,7 +641,6 @@ def get_bootdata_kwargs(occamy_cfg, cluster_generators, name): } return bootdata_kwargs - def get_testharness_kwargs(soc_wide_xbar, soc_axi_lite_narrow_periph_xbar, chip_id, solder, name): testharness_kwargs = { "name": name, @@ -652,6 +651,12 @@ def get_testharness_kwargs(soc_wide_xbar, soc_axi_lite_narrow_periph_xbar, chip_ } return testharness_kwargs +def get_multichip_testharness_kwargs(occamy_cfg, name): + testharness_kwargs = { + "name": name, + "multichip_cfg": occamy_cfg["hemaia_multichip"] + } + return testharness_kwargs def get_chip_kwargs(soc_wide_xbar, soc_axi_lite_narrow_periph_xbar, occamy_cfg, cluster_generators, util, name): core_per_cluster_list = [cluster_generator.cfg["nr_cores"] diff --git a/util/occamygen/occamygen.py b/util/occamygen/occamygen.py index 49e80e50d..38b8bed38 100755 --- a/util/occamygen/occamygen.py +++ b/util/occamygen/occamygen.py @@ -92,6 +92,9 @@ def main(): parser.add_argument("--testharness-sv", metavar="TESTHARNESS_SV", help="Name of the testharness wrapper file (output).") + parser.add_argument("--multichip-testharness-sv", + metavar="MULTICHIP_TESTHARNESS_SV", + help="Name of the multichip testharness wrapper file (output).") parser.add_argument("--cva6-sv", metavar="CVA6_SV", help="Name of the CVA6 wrapper file (output).") @@ -604,6 +607,10 @@ def main(): soc_wide_xbar, soc_axi_lite_narrow_periph_xbar, occamy_cfg["hemaia_multichip"]["single_chip_id"], solder, name) write_template(args.testharness_sv, outdir, **testharness_kwargs) + if args.multichip_testharness_sv: + multichip_testharness_kwargs = occamy.get_multichip_testharness_kwargs(occamy_cfg, name) + write_template(args.multichip_testharness_sv, outdir, **multichip_testharness_kwargs) + ############ # BOOTDATA # ############