diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2a1c8e11..1e748b5e9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -235,6 +235,7 @@ jobs: compile-ara: runs-on: ubuntu-20.04 strategy: + fail-fast: false matrix: ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: ["tc-verilator", "tc-isa-sim"] diff --git a/.gitmodules b/.gitmodules index f7d26db14..32546e093 100644 --- a/.gitmodules +++ b/.gitmodules @@ -21,9 +21,6 @@ [submodule "hardware/deps/common_verification"] path = hardware/deps/common_verification url = https://github.com/pulp-platform/common_verification.git -[submodule "hardware/deps/cva6"] - path = hardware/deps/cva6 - url = https://github.com/pulp-platform/cva6.git [submodule "toolchain/newlib"] path = toolchain/newlib url = https://sourceware.org/git/newlib-cygwin.git @@ -32,3 +29,9 @@ path = toolchain/riscv-llvm url = https://github.com/llvm/llvm-project.git ignore = dirty +[submodule "hardware/deps/apb"] + path = hardware/deps/apb + url = https://github.com/pulp-platform/apb.git +[submodule "hardware/deps/cva6"] + path = hardware/deps/cva6 + url = git@github.com:MaistoV/cva6_fork.git diff --git a/Bender.lock b/Bender.lock index ae080d9ac..cf547586a 100644 --- a/Bender.lock +++ b/Bender.lock @@ -1,36 +1,62 @@ --- packages: + apb: + revision: 77ddf073f194d44b9119949d2421be59789e69ae + version: 0.2.4 + source: + Git: "https://github.com/pulp-platform/apb.git" + dependencies: + - common_cells axi: - revision: 442ff3375710513623f95944d66cc2bd09b2f155 - version: 0.29.1 + revision: 9251564ed67e3e71adf46dbeba62ef4435d2524c + version: 0.31.1 source: Git: "https://github.com/pulp-platform/axi.git" dependencies: - common_cells - common_verification common_cells: - revision: 015917ff33e5f944e866814f72f2074fb0f4220f - version: 1.22.1 + revision: 53b0b58af2db5bd3c850a7038fae170ed78326bb + version: 1.31.1 source: Git: "https://github.com/pulp-platform/common_cells.git" dependencies: - common_verification - 
tech_cells_generic common_verification: - revision: 6fc76fb013315af9fabbb90b431863d498df2d6d - version: 0.2.0 + revision: 9c07fa860593b2caabd9b5681740c25fac04b878 + version: 0.2.3 source: Git: "https://github.com/pulp-platform/common_verification.git" dependencies: [] cva6: - revision: 3245e44ec49c1cdcd19eb298cd81f0672eaf81ca + revision: 5e2e520696aa63545b91fca38ce340314291be5c version: ~ source: - Git: "https://github.com/pulp-platform/cva6.git" - dependencies: [] + Git: "https://github.com/MaistoV/cva6_fork.git" + dependencies: + - axi + - common_cells + - fpnew + - tech_cells_generic + fpnew: + revision: 3116391bf66660f806b45e212b9949c528b4e270 + version: 0.7.0 + source: + Git: "https://github.com/openhwgroup/cvfpu.git" + dependencies: + - common_cells + - fpu_div_sqrt_mvp + fpu_div_sqrt_mvp: + revision: 86e1f558b3c95e91577c41b2fc452c86b04e85ac + version: 1.0.4 + source: + Git: "https://github.com/pulp-platform/fpu_div_sqrt_mvp.git" + dependencies: + - common_cells tech_cells_generic: - revision: 203038f857158ae4634c47ce0281f402cc2a1344 - version: 0.2.4 + revision: 298b7297d220ba2601d0f24f684f97ff32f61123 + version: 0.2.12 source: Git: "https://github.com/pulp-platform/tech_cells_generic.git" dependencies: diff --git a/Bender.yml b/Bender.yml index e50de82cc..5518043c3 100644 --- a/Bender.yml +++ b/Bender.yml @@ -10,8 +10,9 @@ package: dependencies: axi: { git: "https://github.com/pulp-platform/axi.git", version: 0.29.1 } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", version: 1.22.1 } - cva6: { git: "https://github.com/pulp-platform/cva6.git", rev: acc_port } + cva6: { git: "https://github.com/MaistoV/cva6_fork.git", rev: ara_cheshire } tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.1 } + apb: { git: "https://github.com/pulp-platform/apb.git", version: 0.2.4 } workspace: checkout_dir: "hardware/deps" @@ -27,7 +28,6 @@ sources: # Sources # Level 1 - - hardware/src/axi_to_mem.sv - 
hardware/src/ctrl_registers.sv - hardware/src/cva6_accel_first_pass_decoder.sv - hardware/src/ara_dispatcher.sv diff --git a/apps/.gitignore b/apps/.gitignore index ef412f9ba..9bf00c4f9 100644 --- a/apps/.gitignore +++ b/apps/.gitignore @@ -1,2 +1,4 @@ bin common/link.ld +*.o* +data.S* \ No newline at end of file diff --git a/apps/Makefile b/apps/Makefile index 6bb74f304..c06b873b2 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -21,6 +21,12 @@ APPS_DIR := $(ROOT_DIR) COMMON_DIR := $(ROOT_DIR)/common TESTS_DIR := $(ROOT_DIR)/riscv-tests/isa +# Build environment for Linux +LINUX ?= 0 +ifeq ($(LINUX), 1) +include $(COMMON_DIR)/linux.mk +endif + # This will overwrite the ROOT_DIR variable from the included makefile include $(COMMON_DIR)/runtime.mk include $(COMMON_DIR)/riscv_tests.mk @@ -33,9 +39,9 @@ BINARIES := $(filter-out bin/benchmarks, $(addprefix bin/,$(APPS))) CVA6_EXTENSIONS := rv64ui rv64uc rv64um rv64uf rv64ud rv64si # Atomics are messy, since there is currently no memory region capable of handling them # CVA6_EXTENSIONS := rv64ua -CVA6_BINARIES := $(addprefix bin/, $(cva6_tests)) +CVA6_BINARIES := $(addsuffix $(IS_LINUX_EXTENSION), $(addprefix bin/, $(cva6_tests))) ARA_EXTENSIONS := rv64uv -ARA_BINARIES := $(addprefix bin/, $(ara_tests)) +ARA_BINARIES := $(addsuffix $(IS_LINUX_EXTENSION), $(addprefix bin/, $(ara_tests))) # FFT requires special treatment because of its header files ifeq ($(ENV_DEFINES),) @@ -95,14 +101,18 @@ endef $(foreach app,$(APPS),$(eval $(call app_compile_template_spike,$(app)))) define app_compile_template -bin/$1: $1/data.S.o $(addsuffix .o, $(shell find $(1) -name "*.c" -o -name "*.S")) $(RUNTIME_LLVM) linker_script +bin/$1: $1/data.S.o$$(IS_LINUX_EXTENSION) $(addsuffix .o$$(IS_LINUX_EXTENSION), $(shell find $(1) -name "*.c" -o -name "*.S")) $(RUNTIME_LLVM) linker_script mkdir -p bin/ - $$(RISCV_CC) -Iinclude $(RISCV_CCFLAGS) -o $$@ $$(addsuffix .o, $$(shell find $(1) -name "*.c" -o -name "*.S")) $(RUNTIME_LLVM) $$(RISCV_LDFLAGS) 
-T$$(CURDIR)/common/link.ld - $$(RISCV_OBJDUMP) $$(RISCV_OBJDUMP_FLAGS) -D $$@ > $$@.dump - $$(RISCV_STRIP) $$@ -S --strip-unneeded + $$(RISCV_CC) $(RISCV_CCFLAGS) -o $$@$$(IS_LINUX_EXTENSION) $$(addsuffix .o$$(IS_LINUX_EXTENSION), $$(shell find $(1) -name "*.c" -o -name "*.S")) $(RUNTIME_LLVM) $$(RISCV_LDFLAGS) $$(LD_FLAGS) + $$(RISCV_OBJDUMP) $$(RISCV_OBJDUMP_FLAGS) -D $$@$$(IS_LINUX_EXTENSION) > $$@$$(IS_LINUX_EXTENSION).dump + # Don't strip symbols for Linux build since need them for debug + if [ "$$(IS_LINUX_EXTENSION)" == "" ]; then \ + $$(RISCV_STRIP) $$@$$(IS_LINUX_EXTENSION) -S --strip-unneeded; \ + fi endef $(foreach app,$(APPS),$(eval $(call app_compile_template,$(app)))) + # Make the RISC-V tests riscv_tests: $(CVA6_BINARIES) $(ARA_BINARIES) @@ -111,7 +121,7 @@ TESTS_$(1) := $(addprefix bin/, $($(addsuffix _ara_tests, $1))) bin/$(1)-ara-%: $(TESTS_DIR)/$(1)/%.$(2) $(RUNTIME_GCC) linker_script mkdir -p bin/ - $$(RISCV_CC_GCC) -Iinclude -I$$(TESTS_DIR)/macros/scalar -I$$(TESTS_DIR)/macros/vector $$(RISCV_CCFLAGS_GCC) $$(RISCV_LDFLAGS_GCC) -o $$@ $$< $(RUNTIME_GCC) -T$$(CURDIR)/common/link.ld + $$(RISCV_CC_GCC) -Iinclude -I$$(TESTS_DIR)/macros/scalar -I$$(TESTS_DIR)/macros/vector $$(RISCV_CCFLAGS_GCC) $$(RISCV_LDFLAGS_GCC) -o $$@ $$< $(RUNTIME_GCC) $$(LD_FLAGS) $$(RISCV_OBJDUMP) $$(RISCV_OBJDUMP_FLAGS) -D $$@ > $$@.dump $$(RISCV_STRIP) $$@ -S --strip-unneeded endef @@ -121,7 +131,7 @@ TESTS_$(1) := $(addprefix bin/, $($(addsuffix _ara_tests, $1))) bin/$(1)-ara-%: $(TESTS_DIR)/$(1)/%.$(2) $(RUNTIME_LLVM) linker_script mkdir -p bin/ - $$(RISCV_CC) -Iinclude -I$$(TESTS_DIR)/macros/scalar -I$$(TESTS_DIR)/macros/vector $$(RISCV_CCFLAGS) $$(RISCV_LDFLAGS) -o $$@ $$< $(RUNTIME_LLVM) -T$$(CURDIR)/common/link.ld + $$(RISCV_CC) -Iinclude -I$$(TESTS_DIR)/macros/scalar -I$$(TESTS_DIR)/macros/vector $$(RISCV_CCFLAGS) $$(RISCV_LDFLAGS) -o $$@ $$< $(RUNTIME_LLVM) $$(LD_FLAGS) $$(RISCV_OBJDUMP) $$(RISCV_OBJDUMP_FLAGS) -D $$@ > $$@.dump $$(RISCV_STRIP) $$@ -S 
--strip-unneeded endef @@ -169,13 +179,14 @@ benchmarks_clean: .PHONY: clean clean: riscv_tests_spike_clean benchmarks_clean + rm -vf bin/* rm -vf $(BINARIES) rm -vf $(CVA6_BINARIES) rm -vf $(ARA_BINARIES) rm -vf $(addsuffix .dump,$(BINARIES)) rm -vf $(addsuffix .dump,$(CVA6_BINARIES)) rm -vf $(addsuffix .dump,$(ARA_BINARIES)) - rm -vf $(addsuffix /main.c.o,$(APPS)) + rm -vf $(addsuffix /main.c.o$(IS_LINUX_EXTENSION),$(APPS)) rm -vf $(RUNTIME_GCC) rm -vf $(RUNTIME_LLVM) rm -vf $(RUNTIME_SPIKE) diff --git a/apps/common/linux.mk b/apps/common/linux.mk new file mode 100644 index 000000000..65daa016e --- /dev/null +++ b/apps/common/linux.mk @@ -0,0 +1,47 @@ +IS_LINUX_EXTENSION := .linux + +CVA6_SDK ?= /usr/scratch/fenga3/vmaisto/cva6-sdk_fork_backup +ROOTFS_DEST ?= $(CVA6_SDK)/rootfs/ara/apps/bin +cp_to_rootfs: + mkdir -p $(ROOTFS_DEST) + @echo "[Copying binaries to rootfs directory $(ROOTFS_DEST)]" + cp -v bin/*.linux $(ROOTFS_DEST) + +# Set the runtime variables to empty, the Linux libs will takcare of that +LD_FLAGS := +RUNTIME_GCC ?= common/util-gcc.c.o +RUNTIME_LLVM ?= common/util-llvm.c.o + + +# Override +INSTALL_DIR ?= $(ARA_DIR)/install +GCC_INSTALL_DIR ?= $(CVA6_SDK)/buildroot/output/host/ +LLVM_INSTALL_DIR ?= $(INSTALL_DIR)/riscv-llvm + +RISCV_XLEN ?= 64 +RISCV_ARCH ?= rv$(RISCV_XLEN)gcv +RISCV_ABI ?= lp64d +RISCV_TARGET ?= riscv$(RISCV_XLEN)-buildroot-linux-gnu- + +# Don't use LLVM +RISCV_PREFIX ?= $(GCC_INSTALL_DIR)/bin/$(RISCV_TARGET) +RISCV_CC ?= $(RISCV_PREFIX)gcc +RISCV_CXX ?= $(RISCV_PREFIX)g++ +RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump +RISCV_OBJCOPY ?= $(RISCV_PREFIX)objcopy +RISCV_AS ?= $(RISCV_PREFIX)as +RISCV_AR ?= $(RISCV_PREFIX)ar +RISCV_LD ?= $(RISCV_PREFIX)ld +RISCV_STRIP ?= $(RISCV_PREFIX)strip + +# Override flags +# LLVM_FLAGS ?= -march=rv64gcv_zfh_zvfh0p1 -mabi=$(RISCV_ABI) -mno-relax -fuse-ld=lld +LLVM_FLAGS ?= -march=rv64gcv -mabi=$(RISCV_ABI) +LLVM_V_FLAGS ?= #+no-optimized-zero-stride-load +# RISCV_FLAGS ?= $(LLVM_FLAGS) $(LLVM_V_FLAGS) 
-mcmodel=medany -I$(CURDIR)/common -std=gnu99 -O3 -ffast-math -fno-common -fno-builtin-printf $(DEFINES) $(RISCV_WARNINGS) +RISCV_FLAGS ?= -g $(LLVM_FLAGS) $(LLVM_V_FLAGS) -I$(CURDIR)/common -std=gnu99 -O0 $(DEFINES) $(RISCV_WARNINGS) +RISCV_CCFLAGS ?= $(RISCV_FLAGS) #-ffunction-sections -fdata-sections +RISCV_CXXFLAGS ?= $(RISCV_FLAGS) -ffunction-sections -fdata-sections +RISCV_LDFLAGS ?= #-static -nostartfiles -lm -Wl,--gc-sections + +RISCV_OBJDUMP_FLAGS ?= -S diff --git a/apps/common/printf.h b/apps/common/printf.h index dd8d0d514..53bdc952c 100644 --- a/apps/common/printf.h +++ b/apps/common/printf.h @@ -29,6 +29,11 @@ // /////////////////////////////////////////////////////////////////////////////// + +#ifdef __linux__ + #include +#else // ! __linux__ + #ifndef _PRINTF_H_ #define _PRINTF_H_ @@ -100,4 +105,5 @@ int fctprintf(void (*out)(char character, void *arg), void *arg, } #endif +#endif // __linux__ #endif // _PRINTF_H_ diff --git a/apps/common/runtime.h b/apps/common/runtime.h index 4e8dbb44d..e61d5fda8 100644 --- a/apps/common/runtime.h +++ b/apps/common/runtime.h @@ -7,13 +7,19 @@ asm volatile( \ "csrs mstatus, %[bits];" ::[bits] "r"(0x00000600 & (0x00000600 >> 1))) -extern int64_t event_trigger; -extern int64_t timer; -// SoC-level CSR -extern uint64_t hw_cnt_en_reg; +// SoC-level CSR, put in memory for Linux build +#ifdef __linux__ + int64_t event_trigger; + int64_t timer; + uint64_t hw_cnt_en_reg; +#else // ! 
__linux__ + extern int64_t event_trigger; + extern int64_t timer; + extern uint64_t hw_cnt_en_reg; +#endif // __linux__ // Return the current value of the cycle counter -inline int64_t get_cycle_count() { +int64_t get_cycle_count() { int64_t cycle_count; // The fence is needed to be sure that Ara is idle, and it is not performing // the last vector stores when we read mcycle with stop_timer() @@ -31,26 +37,26 @@ inline int64_t get_cycle_count() { #define HW_CNT_READY hw_cnt_en_reg = 1; #define HW_CNT_NOT_READY hw_cnt_en_reg = 0; // Start and stop the counter -inline void start_timer() { timer = -get_cycle_count(); } -inline void stop_timer() { timer += get_cycle_count(); } +void start_timer() { timer = -get_cycle_count(); } +void stop_timer() { timer += get_cycle_count(); } // Get the value of the timer -inline int64_t get_timer() { return timer; } +int64_t get_timer() { return timer; } #else #define HW_CNT_READY ; #define HW_CNT_NOT_READY ; // Start and stop the counter -inline void start_timer() { +void start_timer() { while (0) ; } -inline void stop_timer() { +void stop_timer() { while (0) ; } // Get the value of the timer -inline int64_t get_timer() { return 0; } +int64_t get_timer() { return 0; } #endif #endif // _RUNTIME_H_ diff --git a/apps/common/runtime.mk b/apps/common/runtime.mk index 66b05f660..5205ba0ba 100644 --- a/apps/common/runtime.mk +++ b/apps/common/runtime.mk @@ -42,7 +42,7 @@ ISA_SIM_MOD_INSTALL_DIR ?= $(INSTALL_DIR)/riscv-isa-sim-mod RISCV_XLEN ?= 64 RISCV_ARCH ?= rv$(RISCV_XLEN)gcv RISCV_ABI ?= lp64d -RISCV_TARGET ?= riscv$(RISCV_XLEN)-unknown-elf +RISCV_TARGET ?= riscv$(RISCV_XLEN)-unknown-elf- # Use LLVM RISCV_PREFIX ?= $(LLVM_INSTALL_DIR)/bin/ @@ -56,7 +56,9 @@ RISCV_LD ?= $(RISCV_PREFIX)ld.lld RISCV_STRIP ?= $(RISCV_PREFIX)llvm-strip # Use gcc to compile scalar riscv-tests -RISCV_CC_GCC ?= $(GCC_INSTALL_DIR)/bin/$(RISCV_TARGET)-gcc +RISCV_CC_GCC ?= $(GCC_INSTALL_DIR)/bin/$(RISCV_TARGET)gcc +RISCV_OBJCOPY_GCC ?= 
$(GCC_INSTALL_DIR)/bin/$(RISCV_TARGET)objcopy +RISCV_OBJDUMP_GCC ?= $(GCC_INSTALL_DIR)/bin/$(RISCV_TARGET)objdump # Benchmark with spike spike_env_dir ?= $(ARA_DIR)/apps/riscv-tests @@ -109,6 +111,8 @@ RUNTIME_GCC ?= common/crt0-gcc.S.o common/printf-gcc.c.o common/string-gcc.c.o RUNTIME_LLVM ?= common/crt0-llvm.S.o common/printf-llvm.c.o common/string-llvm.c.o common/serial-llvm.c.o common/util-llvm.c.o RUNTIME_SPIKE ?= $(spike_env_dir)/benchmarks/common/crt.S.o.spike $(spike_env_dir)/benchmarks/common/syscalls.c.o.spike common/util.c.o.spike +LD_FLAGS ?= -T$(CURDIR)/common/link.ld + .INTERMEDIATE: $(RUNTIME_GCC) $(RUNTIME_LLVM) %-gcc.S.o: %.S @@ -123,10 +127,10 @@ RUNTIME_SPIKE ?= $(spike_env_dir)/benchmarks/common/crt.S.o.spike $(spike_env_di %-llvm.c.o: %.c $(RISCV_CC) $(RISCV_CCFLAGS) -c $< -o $@ -%.S.o: %.S +%.S.o$(IS_LINUX_EXTENSION): %.S $(RISCV_CC) $(RISCV_CCFLAGS) -c $< -o $@ -%.c.o: %.c +%.c.o$(IS_LINUX_EXTENSION): %.c $(RISCV_CC) $(RISCV_CCFLAGS) -c $< -o $@ %.S.o.spike: %.S patch-spike-crt0 diff --git a/apps/riscv-tests/benchmarks/Makefile b/apps/riscv-tests/benchmarks/Makefile index cc327145a..43df648b8 100644 --- a/apps/riscv-tests/benchmarks/Makefile +++ b/apps/riscv-tests/benchmarks/Makefile @@ -35,12 +35,12 @@ bmarks = \ # Build rules #-------------------------------------------------------------------- -RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf- +RISCV_PREFIX ?= /usr/scratch/fenga3/vmaisto/cva6-sdk_fork/buildroot/output/host/bin/riscv64-buildroot-linux-gnu- RISCV_GCC ?= $(RISCV_PREFIX)gcc -RISCV_GCC_OPTS ?= -DPREALLOCATE=1 -mcmodel=medany -static -std=gnu99 -O2 -ffast-math -fno-common -fno-builtin-printf -RISCV_LINK ?= $(RISCV_GCC) -T $(src_dir)/common/test.ld $(incs) -RISCV_LINK_OPTS ?= -static -nostdlib -nostartfiles -lm -lgcc -T $(src_dir)/common/test.ld -RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump --disassemble-all --disassemble-zeroes --section=.text --section=.text.startup --section=.text.init --section=.data +RISCV_GCC_OPTS ?= -DPREALLOCATE=1 
-mcmodel=medany -std=gnu99 -O2 -ffast-math -fPIC +RISCV_LINK ?= $(RISCV_GCC) $(incs) +RISCV_LINK_OPTS ?= +RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump --disassemble-all --disassemble-zeroes -S RISCV_SIM ?= spike --isa=rv$(XLEN)gc incs += -I$(src_dir)/../env -I$(src_dir)/common $(addprefix -I$(src_dir)/, $(bmarks)) @@ -48,7 +48,7 @@ objs := define compile_template $(1).riscv: $(wildcard $(src_dir)/$(1)/*) $(wildcard $(src_dir)/common/*) - $$(RISCV_GCC) $$(incs) $$(RISCV_GCC_OPTS) -o $$@ $(wildcard $(src_dir)/$(1)/*.c) $(wildcard $(src_dir)/common/*.c) $(wildcard $(src_dir)/common/*.S) $$(RISCV_LINK_OPTS) + $$(RISCV_GCC) $$(incs) $$(RISCV_GCC_OPTS) -o $$@ $(wildcard $(src_dir)/$(1)/*.c) $(wildcard $(src_dir)/common/*.c) $$(RISCV_LINK_OPTS) endef $(foreach bmark,$(bmarks),$(eval $(call compile_template,$(bmark)))) diff --git a/apps/riscv-tests/benchmarks/common/syscalls.c b/apps/riscv-tests/benchmarks/common/syscalls.c index 4d20be9e4..b9d33c368 100644 --- a/apps/riscv-tests/benchmarks/common/syscalls.c +++ b/apps/riscv-tests/benchmarks/common/syscalls.c @@ -8,6 +8,36 @@ #include #include "util.h" +#define NUM_COUNTERS 2 +static uintptr_t counters[NUM_COUNTERS]; +static char* counter_names[NUM_COUNTERS]; + +void setStats(int enable) +{ + int i = 0; +#define READ_CTR(name) do { \ + while (i >= NUM_COUNTERS) ; \ + uintptr_t csr = read_csr(name); \ + if (!enable) { csr -= counters[i]; counter_names[i] = #name; } \ + counters[i++] = csr; \ + } while (0) + + // Read from user CSRs + READ_CTR(cycle); + READ_CTR(instret); + +#undef READ_CTR +} + +int __attribute__((weak)) main(int argc, char** argv) +{ + // single-threaded programs override this function. 
+ printstr("Implement main(), foo!\n"); + return -1; +} + +#ifndef __linux__ + #define SYS_write 64 #undef strcmp @@ -33,26 +63,6 @@ static uintptr_t syscall(uintptr_t which, uint64_t arg0, uint64_t arg1, uint64_t return magic_mem[0]; } -#define NUM_COUNTERS 2 -static uintptr_t counters[NUM_COUNTERS]; -static char* counter_names[NUM_COUNTERS]; - -void setStats(int enable) -{ - int i = 0; -#define READ_CTR(name) do { \ - while (i >= NUM_COUNTERS) ; \ - uintptr_t csr = read_csr(name); \ - if (!enable) { csr -= counters[i]; counter_names[i] = #name; } \ - counters[i++] = csr; \ - } while (0) - - READ_CTR(mcycle); - READ_CTR(minstret); - -#undef READ_CTR -} - void __attribute__((noreturn)) tohost_exit(uintptr_t code) { tohost = (code << 1) | 1; @@ -86,19 +96,13 @@ void __attribute__((weak)) thread_entry(int cid, int nc) while (cid != 0); } -int __attribute__((weak)) main(int argc, char** argv) -{ - // single-threaded programs override this function. - printstr("Implement main(), foo!\n"); - return -1; -} - static void init_tls() { register void* thread_pointer asm("tp"); - extern char _tdata_begin, _tdata_end, _tbss_end; + extern char _tls_data; + extern __thread char _tdata_begin, _tdata_end, _tbss_end; size_t tdata_size = &_tdata_end - &_tdata_begin; - memcpy(thread_pointer, &_tdata_begin, tdata_size); + memcpy(thread_pointer, &_tls_data, tdata_size); size_t tbss_size = &_tbss_end - &_tdata_end; memset(thread_pointer + tdata_size, 0, tbss_size); } @@ -115,7 +119,7 @@ void _init(int cid, int nc) char* pbuf = buf; for (int i = 0; i < NUM_COUNTERS; i++) if (counters[i]) - pbuf += sprintf(pbuf, "%s = %ld\n", counter_names[i], counters[i]); + pbuf += sprintf(pbuf, "%s = %d\n", counter_names[i], counters[i]); if (pbuf != buf) printstr(buf); @@ -226,7 +230,7 @@ static void vprintfmt(void (*putch)(int, void**), void **putdat, const char *fmt case '-': padc = '-'; goto reswitch; - + // flag to pad with 0's instead of spaces case '0': padc = '0'; @@ -335,7 +339,7 @@ static 
void vprintfmt(void (*putch)(int, void**), void **putdat, const char *fmt case '%': putch(ch, putdat); break; - + // unrecognized escape sequence - just print it literally default: putch('%', putdat); @@ -356,19 +360,19 @@ int printf(const char* fmt, ...) return 0; // incorrect return value, but who cares, anyway? } -void sprintf_putch(int ch, void** data) -{ - char** pstr = (char**)data; - **pstr = ch; - (*pstr)++; -} - int sprintf(char* str, const char* fmt, ...) { va_list ap; char* str0 = str; va_start(ap, fmt); + void sprintf_putch(int ch, void** data) + { + char** pstr = (char**)data; + **pstr = ch; + (*pstr)++; + } + vprintfmt(sprintf_putch, (void**)&str, fmt, ap); *str = 0; @@ -467,3 +471,5 @@ long atol(const char* str) return sign ? -res : res; } + +#endif // __linux__ diff --git a/apps/riscv-tests/mt/Makefile b/apps/riscv-tests/mt/Makefile index b45e55182..81052bafc 100644 --- a/apps/riscv-tests/mt/Makefile +++ b/apps/riscv-tests/mt/Makefile @@ -75,12 +75,12 @@ bmarks = $(bmarks_vvadd) $(bmarks_matmul) # Build rules #-------------------------------------------------------------------- -RISCV_PREFIX=riscv$(XLEN)-unknown-elf- +RISCV_PREFIX := /scratch/vmaisto/cva6-sdk_fork/buildroot/output/host/bin/riscv64-buildroot-linux-gnu- RISCV_GCC = $(RISCV_PREFIX)gcc RISCV_GCC_OPTS = -std=gnu99 -O2 -ffast-math -RISCV_LINK = $(RISCV_GCC) -T $(common)/test.ld $(incs) +RISCV_LINK = $(RISCV_GCC) $(incs) RISCV_LINK_OPTS = -nostdlib -nostartfiles -ffast-math -lc -RISCV_OBJDUMP = $(RISCV_PREFIX)objdump --disassemble-all --disassemble-zeroes --section=.text --section=.text.startup --section=.data +RISCV_OBJDUMP = $(RISCV_PREFIX)objdump --disassemble-all --disassemble-zeroes -S RISCV_SIM = spike -p2 VPATH += $(common) $(common)/../mt-matmul $(common)/../mt-vvadd diff --git a/hardware/Makefile b/hardware/Makefile index d85fd638c..c8a281ad6 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -102,7 +102,7 @@ vlog_args += -suppress vlog-2583 -suppress vlog-13314 -suppress 
vlog-13233 vlog_args += -work $(library) # Defines -bender_defs += --define NR_LANES=$(nr_lanes) --define VLEN=$(vlen) --define RVV_ARIANE=1 +bender_defs += --define NR_LANES=$(nr_lanes) --define VLEN=$(vlen) --define ARIANE_ACCELERATOR_PORT=1 # Default target all: compile @@ -114,7 +114,9 @@ $(buildpath): # Bender bender: @[ -x ./bender ] && echo "Bender already exists." || \ - curl --proto '=https' --tlsv1.2 https://fabianschuiki.github.io/bender/init -sSf | sh -s -- 0.23.1 + wget https://github.com/pulp-platform/bender/releases/download/v0.23.1/bender-0.23.1-x86_64-linux-gnu.tar.gz + tar xf bender-0.23.1-x86_64-linux-gnu.tar.gz + rm -rf bender-0.23.1-x86_64-linux-gnu.tar.gz @echo "$$(./bender --version) available." # Patches @@ -132,7 +134,7 @@ $(buildpath)/$(library): .PHONY: compile compile: dpi lib $(buildpath) bender $(buildpath)/compile_$(config).tcl $(buildpath)/compile_$(config).tcl: $(config_file) Makefile ../Bender.yml $(shell find src -type f) $(shell find ../config -type f) $(shell find include -type f) $(shell find tb -type f) $(shell find deps -type f) - ./bender script vsim --vlog-arg="$(vlog_args)" -t rtl -t asic -t ara_test -t cva6_test $(bender_defs) > $(buildpath)/compile_$(config).tcl + ./bender script vsim --vlog-arg="$(vlog_args)" -t rtl -t asic -t ara_test -t cva6_test -t cv64a6_imafdcv_sv39 $(bender_defs) > $(buildpath)/compile_$(config).tcl echo "exit" >> $(buildpath)/compile_$(config).tcl cd $(buildpath) && $(questa_cmd) vsim -work $(library) -c -do compile_$(config).tcl # Remove the file if compilation did not succeed @@ -164,11 +166,13 @@ verilate: $(buildpath) bender $(veril_library)/V$(veril_top) $(veril_library)/V$(veril_top): $(config_file) Makefile ../Bender.yml $(shell find src -type f) $(shell find ../config -type f) $(shell find include -type f) $(shell find tb -type f) $(shell find deps -type f) rm -rf $(veril_library); mkdir -p $(veril_library) - ./bender script verilator -t rtl -t ara_test -t cva6_test -t verilator 
$(bender_defs) > $(veril_library)/bender_script_$(config) + ./bender script verilator -t rtl -t ara_test -t cva6_test -t cv64a6_imafdcv_sv39 -t verilator $(bender_defs) > $(veril_library)/bender_script_$(config) # Verilate the design $(veril_path)/verilator -f $(veril_library)/bender_script_$(config) \ -GNrLanes=$(nr_lanes) \ -O3 \ + -Wno-fatal \ + -Wno-PINCONNECTEMPTY \ -Wno-BLKANDNBLK \ -Wno-CASEINCOMPLETE \ -Wno-CMPCONST \ @@ -179,6 +183,7 @@ $(veril_library)/V$(veril_top): $(config_file) Makefile ../Bender.yml $(shell fi -Wno-UNSIGNED \ -Wno-WIDTH \ -Wno-WIDTHCONCAT \ + -Wall \ --hierarchical \ tb/verilator/waiver.vlt \ --Mdir $(veril_library) \ @@ -222,7 +227,7 @@ lint: spyglass/tmp/files spyglass/sdc/func.sdc spyglass/scripts/run_lint.tcl spyglass/tmp/files: $(bender) mkdir -p spyglass/tmp - ./bender script verilator -t rtl -t spyglass -t cva6_test $(bender_defs) --define SPYGLASS > spyglass/tmp/files + ./bender script verilator -t rtl -t spyglass -t cva6_test -t cv64a6_imafdcv_sv39 $(bender_defs) --define SPYGLASS > spyglass/tmp/files # DPIs .PHONY: dpi diff --git a/hardware/deps/apb b/hardware/deps/apb new file mode 160000 index 000000000..77ddf073f --- /dev/null +++ b/hardware/deps/apb @@ -0,0 +1 @@ +Subproject commit 77ddf073f194d44b9119949d2421be59789e69ae diff --git a/hardware/deps/axi b/hardware/deps/axi index 442ff3375..bfee21757 160000 --- a/hardware/deps/axi +++ b/hardware/deps/axi @@ -1 +1 @@ -Subproject commit 442ff3375710513623f95944d66cc2bd09b2f155 +Subproject commit bfee21757bf090ec8e358456314b0b0fd3c90809 diff --git a/hardware/deps/cva6 b/hardware/deps/cva6 index bebbc1475..5e2e52069 160000 --- a/hardware/deps/cva6 +++ b/hardware/deps/cva6 @@ -1 +1 @@ -Subproject commit bebbc1475f9ffba661e8354d8773e27ab9338db1 +Subproject commit 5e2e520696aa63545b91fca38ce340314291be5c diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 014b00473..b8ffa78c8 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ 
-155,17 +155,17 @@ package ara_pkg; } ara_op_e; // Return true if op is a load operation - function automatic is_load(ara_op_e op); + function automatic logic is_load(ara_op_e op); is_load = op inside {[VLE:VLXE]}; endfunction : is_load // Return true if op is a store operation - function automatic is_store(ara_op_e op); + function automatic logic is_store(ara_op_e op); is_store = op inside {[VSE:VSXE]}; endfunction : is_store // Return true of op is either VCPOP or VFIRST - function automatic vd_scalar(ara_op_e op); + function automatic logic vd_scalar(ara_op_e op); vd_scalar = op inside {[VCPOP:VFIRST]}; endfunction : vd_scalar @@ -239,8 +239,8 @@ package ara_pkg; ///////////////////////////// // Use Ariane's accelerator interface. - typedef ariane_pkg::accelerator_req_t accelerator_req_t; - typedef ariane_pkg::accelerator_resp_t accelerator_resp_t; + typedef acc_pkg::accelerator_req_t accelerator_req_t; + typedef acc_pkg::accelerator_resp_t accelerator_resp_t; ///////////////////////// // Backend interface // @@ -322,11 +322,11 @@ package ara_pkg; // Scalar response elen_t resp; - // Instruction triggered an error - logic error; + // Instruction triggered an exception + ariane_pkg::exception_t exception; // New value for vstart - vlen_t error_vl; + vlen_t exception_vl; } ara_resp_t; //////////////////// @@ -974,11 +974,20 @@ package ara_pkg; } opqueue_e; // Each lane has eight VRF banks + // NOTE: values != 8 are not supported localparam int unsigned NrVRFBanksPerLane = 8; - // Find the starting address of a vector register vid + // Find the starting address (in bytes) of a vector register chunk of vid function automatic logic [63:0] vaddr(logic [4:0] vid, int NrLanes); - vaddr = vid * (VLENB / NrLanes / 8); + // Each vector register spans multiple words in each bank in each lane + // The start address is the same in every lane + // Therefore, within each lane, each vector register chunk starts on a given offset + vaddr = vid * (VLENB / NrLanes / 
NrVRFBanksPerLane); + // NOTE: the only extensively tested configuration of Ara keeps: + // - (VLEN / NrLanes) constant to 1024; + // - NrVRFBanksPerLane always equal to 8. + // Given so, each vector register will span 2 words across all the banks and lanes, + // therefore, vaddr = vid * 16 endfunction: vaddr // Differenciate between SLDU and ADDRGEN operands from opqueue @@ -1016,7 +1025,7 @@ package ara_pkg; typedef struct packed { rvv_pkg::vew_e eew; // Effective element width - vlen_t vl; // Vector length + vlen_t elem_count; // Vector body length opqueue_conversion_e conv; // Type conversion logic [1:0] ntr_red; // Neutral type for reductions logic is_reduct; // Is this a reduction? diff --git a/hardware/scripts/wave_core.tcl b/hardware/scripts/wave_core.tcl index 7f0434ad7..757f814e7 100644 --- a/hardware/scripts/wave_core.tcl +++ b/hardware/scripts/wave_core.tcl @@ -7,15 +7,15 @@ add wave -noupdate -group CVA6 -group core /ara_tb/dut/i_ara_soc/i_system/i_ariane/* add wave -noupdate -group CVA6 -group frontend /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/* -add wave -noupdate -group CVA6 -group frontend -group icache /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_cache_subsystem/i_cva6_icache/* -add wave -noupdate -group CVA6 -group frontend -group ras /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_ras/* -add wave -noupdate -group CVA6 -group frontend -group btb /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_btb/* -add wave -noupdate -group CVA6 -group frontend -group bht /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_bht/* +add wave -noupdate -group CVA6 -group frontend -group icache /ara_tb/dut/i_ara_soc/i_system/i_ariane/genblk4/i_cache_subsystem/* +# add wave -noupdate -group CVA6 -group frontend -group ras /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_ras/* +# add wave -noupdate -group CVA6 -group frontend -group btb /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_btb/* +# add wave -noupdate -group CVA6 -group frontend 
-group bht /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_bht/* # add wave -noupdate -group CVA6 -group frontend -group instr_scan /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/*/i_instr_scan/* # add wave -noupdate -group CVA6 -group frontend -group fetch_fifo /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_fetch_fifo/* add wave -noupdate -group CVA6 -group id_stage -group decoder /ara_tb/dut/i_ara_soc/i_system/i_ariane/id_stage_i/decoder_i/* -add wave -noupdate -group CVA6 -group id_stage -group compressed_decoder /ara_tb/dut/i_ara_soc/i_system/i_ariane/id_stage_i/compressed_decoder_i/* +add wave -noupdate -group CVA6 -group id_stage -group compressed_decoder /ara_tb/dut/i_ara_soc/i_system/i_ariane/id_stage_i/genblk1/compressed_decoder_i/* add wave -noupdate -group CVA6 -group id_stage /ara_tb/dut/i_ara_soc/i_system/i_ariane/id_stage_i/* add wave -noupdate -group CVA6 -group issue_stage -group scoreboard /ara_tb/dut/i_ara_soc/i_system/i_ariane/issue_stage_i/i_scoreboard/* @@ -32,10 +32,10 @@ add wave -noupdate -group CVA6 -group ex_stage -group fpu -group fpnew /ara_tb/d add wave -noupdate -group CVA6 -group ex_stage -group lsu /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/* add wave -noupdate -group CVA6 -group ex_stage -group lsu -group lsu_bypass /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/lsu_bypass_i/* -add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_mmu/* -add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group itlb /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_mmu/i_itlb/* -add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group dtlb /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_mmu/i_dtlb/* -add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group ptw /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_mmu/i_ptw/* +add wave -noupdate -group CVA6 
-group ex_stage -group lsu -group mmu /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/gen_mmu_sv39/i_cva6_mmu/* +add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group itlb /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/gen_mmu_sv39/i_cva6_mmu/i_itlb/* +add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group dtlb /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/gen_mmu_sv39/i_cva6_mmu/i_dtlb/* +add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group ptw /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/gen_mmu_sv39/i_cva6_mmu/i_ptw/* add wave -noupdate -group CVA6 -group ex_stage -group lsu -group store_unit /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_store_unit/* add wave -noupdate -group CVA6 -group ex_stage -group lsu -group store_unit -group store_buffer /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_store_unit/store_buffer_i/* @@ -46,7 +46,6 @@ add wave -noupdate -group CVA6 -group ex_stage -group branch_unit /ara_tb/dut/i_ add wave -noupdate -group CVA6 -group ex_stage -group csr_buffer /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/csr_buffer_i/* -add wave -noupdate -group CVA6 -group ex_stage -group dispatcher /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/gen_accelerator/i_acc_dispatcher/* add wave -noupdate -group CVA6 -group ex_stage /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/* add wave -noupdate -group CVA6 -group commit_stage /ara_tb/dut/i_ara_soc/i_system/i_ariane/commit_stage_i/* @@ -55,10 +54,12 @@ add wave -noupdate -group CVA6 -group csr_file /ara_tb/dut/i_ara_soc/i_system/i_ add wave -noupdate -group CVA6 -group controller /ara_tb/dut/i_ara_soc/i_system/i_ariane/controller_i/* -add wave -noupdate -group CVA6 -group wt_dcache /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_cache_subsystem/i_wt_dcache/* -add wave -noupdate -group CVA6 -group wt_dcache -group miss_handler 
/ara_tb/dut/i_ara_soc/i_system/i_ariane/i_cache_subsystem/i_wt_dcache/i_wt_dcache_missunit/* +add wave -noupdate -group CVA6 -group wt_dcache /ara_tb/dut/i_ara_soc/i_system/i_ariane/genblk4/i_cache_subsystem/i_wt_dcache/* +add wave -noupdate -group CVA6 -group wt_dcache -group miss_handler /ara_tb/dut/i_ara_soc/i_system/i_ariane/genblk4/i_cache_subsystem/i_wt_dcache/i_wt_dcache_missunit/* -add wave -noupdate -group CVA6 -group wt_dcache -group load {/ara_tb/dut/i_ara_soc/i_system/i_ariane/i_cache_subsystem/i_wt_dcache/gen_rd_ports[0]/i_wt_dcache_ctrl/*} -add wave -noupdate -group CVA6 -group wt_dcache -group ptw {/ara_tb/dut/i_ara_soc/i_system/i_ariane/i_cache_subsystem/i_wt_dcache/gen_rd_ports[1]/i_wt_dcache_ctrl/*} +add wave -noupdate -group CVA6 -group wt_dcache -group load {/ara_tb/dut/i_ara_soc/i_system/i_ariane/genblk4/i_cache_subsystem/i_wt_dcache/gen_rd_ports[0]/i_wt_dcache_ctrl/*} +add wave -noupdate -group CVA6 -group wt_dcache -group ptw {/ara_tb/dut/i_ara_soc/i_system/i_ariane/genblk4/i_cache_subsystem/i_wt_dcache/gen_rd_ports[1]/i_wt_dcache_ctrl/*} -add wave -noupdate -group CVA6 -group perf_counters /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_perf_counters/* +add wave -noupdate -group CVA6 -group dispatcher /ara_tb/dut/i_ara_soc/i_system/i_ariane/gen_accelerator/i_acc_dispatcher/* + +add wave -noupdate -group CVA6 -group perf_counters /ara_tb/dut/i_ara_soc/i_system/i_ariane/gen_perf_counter/perf_counters_i/* diff --git a/hardware/src/accel_dispatcher_ideal.sv b/hardware/src/accel_dispatcher_ideal.sv index 8c564b34c..b89d93474 100644 --- a/hardware/src/accel_dispatcher_ideal.sv +++ b/hardware/src/accel_dispatcher_ideal.sv @@ -25,11 +25,7 @@ module accel_dispatcher_ideal import axi_pkg::*; import ara_pkg::*; ( input logic rst_ni, // Accelerator interaface output accelerator_req_t acc_req_o, - output logic acc_req_valid_o, - input logic acc_req_ready_i, - input accelerator_resp_t acc_resp_i, - input logic acc_resp_valid_i, - output logic acc_resp_ready_o + 
input accelerator_resp_t acc_resp_i ); localparam string vtrace = `STRINGIFY(`VTRACE); @@ -69,7 +65,7 @@ module accel_dispatcher_ideal import axi_pkg::*; import ara_pkg::*; ( status_cnt_n = status_cnt_q; fifo_data_raw = fifo_q[read_pointer_q]; - if (acc_req_ready_i && ~fifo_empty) begin + if (acc_resp_i.req_ready && ~fifo_empty) begin // read from the queue is a default assignment // but increment the read pointer... if (read_pointer_n == N_VINSN - 1) @@ -94,16 +90,16 @@ module accel_dispatcher_ideal import axi_pkg::*; import ara_pkg::*; ( assign fifo_empty = (status_cnt_q == 0); - // Always valid until empty - assign acc_req_valid_o = ~fifo_empty; - // Flush the answer - assign acc_resp_ready_o = 1'b1; // Output assignment assign fifo_data = fifo_payload_t'(fifo_data_raw); assign acc_req_o = '{ insn : fifo_data.insn, rs1 : fifo_data.rs1, rs2 : fifo_data.rs2, + // Always valid until empty + req_valid : ~fifo_empty, + // Flush the answer + resp_ready : 1'b1, default : '0 }; @@ -133,7 +129,7 @@ module accel_dispatcher_ideal import axi_pkg::*; import ara_pkg::*; ( // Stop the computation when the instructions are over and ara has returned idle // Just check that we are after reset always_ff @(posedge clk_i) begin - if (rst_ni && was_reset && !acc_req_valid_o && i_system.i_ara.ara_idle) begin + if (rst_ni && was_reset && !acc_req_o.req_valid && i_system.i_ara.ara_idle) begin $display("[hw-cycles]: %d", int'(perf_cnt_q)); $info("Core Test ", $sformatf("*** SUCCESS *** (tohost = %0d)", 0)); $finish(0); @@ -160,10 +156,10 @@ endmodule fifo_payload_t payload; acc_req_o = '0; - acc_req_valid_o = 1'b0; + acc_req_o.req_valid = 1'b0; // Flush the answer - acc_resp_ready_o = 1'b1; + acc_req_o.resp_ready = 1'b1; acc_req_o = '0; acc_req_o.frm = fpnew_pkg::RNE; @@ -176,17 +172,17 @@ endmodule while ($fscanf(fd, "%h", payload) == 1) begin // Always valid - acc_req_valid_o = 1'b1; + acc_req_o.req_valid = 1'b1; acc_req_o.insn = payload.insn; acc_req_o.rs1 = payload.rs1; // Wait for 
the handshake - wait(acc_req_ready_i); + wait(acc_resp_i.req_ready); @(posedge clk_i); @(negedge clk_i); end // Stop dispatching - acc_req_valid_o = 1'b0; + acc_req_o.req_valid = 1'b0; $fclose(fd); end diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index c6976be6f..2bb6c6d08 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -37,13 +37,28 @@ module ara import ara_pkg::*; #( input logic scan_enable_i, input logic scan_data_i, output logic scan_data_o, + + // CSR input + input logic en_ld_st_translation_i, + + // Interface with CVA6's sv39 MMU + // This is everything the MMU can provide, it might be overcomplete for Ara and some signals be useless + output ariane_pkg::exception_t mmu_misaligned_ex_o, + output logic mmu_req_o, // request address translation + output logic [riscv::VLEN-1:0] mmu_vaddr_o, // virtual address out + output logic mmu_is_store_o, // the translation is requested by a store + // if we need to walk the page table we can't grant in the same cycle + // Cycle 0 + input logic mmu_dtlb_hit_i, // sent in the same cycle as the request if translation hits in the DTLB + input logic [riscv::PPNW-1:0] mmu_dtlb_ppn_i, // ppn (send same cycle as hit) + // Cycle 1 + input logic mmu_valid_i, // translation is valid + input logic [riscv::PLEN-1:0] mmu_paddr_i, // translated address + input ariane_pkg::exception_t mmu_exception_i, // address translation threw an exception + // Interface with Ariane input accelerator_req_t acc_req_i, - input logic acc_req_valid_i, - output logic acc_req_ready_o, output accelerator_resp_t acc_resp_o, - output logic acc_resp_valid_o, - input logic acc_resp_ready_i, // AXI interface output axi_req_t axi_req_o, input axi_resp_t axi_resp_i @@ -95,11 +110,7 @@ module ara import ara_pkg::*; #( .rst_ni (rst_ni ), // Interface with Ariane .acc_req_i (acc_req_i ), - .acc_req_valid_i (acc_req_valid_i ), - .acc_req_ready_o (acc_req_ready_o ), .acc_resp_o (acc_resp_o ), - .acc_resp_valid_o (acc_resp_valid_o), - 
.acc_resp_ready_i (acc_resp_ready_i), // Interface with the sequencer .ara_req_o (ara_req ), .ara_req_valid_o (ara_req_valid ), @@ -131,8 +142,8 @@ module ara import ara_pkg::*; #( pe_resp_t [NrPEs-1:0] pe_resp; // Interface with the address generator logic addrgen_ack; - logic addrgen_error; - vlen_t addrgen_error_vl; + ariane_pkg::exception_t addrgen_exception; + vlen_t addrgen_exception_vl; logic [NrLanes-1:0] alu_vinsn_done; logic [NrLanes-1:0] mfpu_vinsn_done; // Interface with the operand requesters @@ -179,8 +190,8 @@ module ara import ara_pkg::*; #( .pe_scalar_resp_ready_o(pe_scalar_resp_ready ), // Interface with the address generator .addrgen_ack_i (addrgen_ack ), - .addrgen_error_i (addrgen_error ), - .addrgen_error_vl_i (addrgen_error_vl ) + .addrgen_exception_i (addrgen_exception ), + .addrgen_exception_vl_i(addrgen_exception_vl ) ); // Scalar move support @@ -345,8 +356,8 @@ module ara import ara_pkg::*; #( .pe_req_ready_o (pe_req_ready[NrLanes+OffsetStore : NrLanes+OffsetLoad]), .pe_resp_o (pe_resp[NrLanes+OffsetStore : NrLanes+OffsetLoad] ), .addrgen_ack_o (addrgen_ack ), - .addrgen_error_o (addrgen_error ), - .addrgen_error_vl_o (addrgen_error_vl ), + .addrgen_exception_o (addrgen_exception ), + .addrgen_exception_vl_o (addrgen_exception_vl ), // Interface with the Mask unit .mask_i (mask ), .mask_valid_i (mask_valid ), @@ -362,6 +373,18 @@ module ara import ara_pkg::*; #( .addrgen_operand_target_fu_i(sldu_addrgen_operand_target_fu ), .addrgen_operand_valid_i (sldu_addrgen_operand_valid ), .addrgen_operand_ready_o (addrgen_operand_ready ), + // CSR input + .en_ld_st_translation_i, + // Interface with CVA6's sv39 MMU + .mmu_misaligned_ex_o , + .mmu_req_o , + .mmu_vaddr_o , + .mmu_is_store_o , + .mmu_dtlb_hit_i , + .mmu_dtlb_ppn_i , + .mmu_valid_i , + .mmu_paddr_i , + .mmu_exception_i , // Load unit .ldu_result_req_o (ldu_result_req ), .ldu_result_addr_o (ldu_result_addr ), diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv 
index b48f33c66..6daf98e99 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -22,11 +22,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( input logic rst_ni, // Interfaces with Ariane input accelerator_req_t acc_req_i, - input logic acc_req_valid_i, - output logic acc_req_ready_o, output accelerator_resp_t acc_resp_o, - output logic acc_resp_valid_o, - input logic acc_resp_ready_i, // Interface with Ara's backend output ara_req_t ara_req_o, output logic ara_req_valid_o, @@ -57,17 +53,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // CSRs // //////////// - vlen_t vstart_d, vstart_q; - vlen_t vl_d, vl_q; - vtype_t vtype_d, vtype_q; - vxsat_e vxsat_d, vxsat_q; - vxrm_t vxrm_d, vxrm_q; - - `FF(vstart_q, vstart_d, '0) - `FF(vl_q, vl_d, '0) - `FF(vtype_q, vtype_d, '{vill: 1'b1, default: '0}) - `FF(vxsat_q, vxsat_d, '0) - `FF(vxrm_q, vxrm_d, '0) + vlen_t csr_vstart_d, csr_vstart_q; + vlen_t csr_vl_d, csr_vl_q; + vtype_t csr_vtype_d, csr_vtype_q; + vxsat_e csr_vxsat_d, csr_vxsat_q; + vxrm_t csr_vxrm_d, csr_vxrm_q; + + `FF(csr_vstart_q, csr_vstart_d, '0) + `FF(csr_vl_q, csr_vl_d, '0) + `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, default: '0}) + `FF(csr_vxsat_q, csr_vxsat_d, '0) + `FF(csr_vxrm_q, csr_vxrm_d, '0) // Converts between the internal representation of `vtype_t` and the full XLEN-bit CSR. function automatic riscv::xlen_t xlen_vtype(vtype_t vtype); xlen_vtype = {vtype.vill, {riscv::XLEN-9{1'b0}}, vtype.vma, vtype.vta, vtype.vsew, @@ -138,7 +134,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( NORMAL_OPERATION, WAIT_IDLE, RESHUFFLE, - SLDU_SEQUENCER + SLDU_SEQUENCER // NOTE: this is never used! } state_e; state_e state_d, state_q; @@ -197,9 +193,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // its counters of pending memory operations // Ara should tell Ariane when a memory operation is completed, so that it can modify // its pending load/store counters. 
- // A memory operation can be completed both when it is over and when vl_q == 0. In the latter case, + // A memory operation can be completed both when it is over and when csr_vl_q == 0. In the latter case, // Ara's decoder answers immediately, and this can cause a collision with an answer from Ara's VLSU. - // To avoid collisions, we give precedence to the VLSU, and we delay the vl_q == 0 memory op + // To avoid collisions, we give precedence to the VLSU, and we delay the csr_vl_q == 0 memory op // completion signal if a collision occurs logic load_zero_vl, store_zero_vl; // Do not checks vregs validity against current LMUL @@ -209,14 +205,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( logic is_decoding; // Is this an in-lane operation? logic in_lane_op; - // If the vslideup offset is greater than vl_q, the vslideup has no effects + // If the vslideup offset is greater than csr_vl_q, the vslideup has no effects logic null_vslideup; // Pipeline the VLSU's load and store complete signals, for timing reasons logic load_complete_q; logic store_complete_q; - `FF(load_complete_q, load_complete_i, 1'b0) - `FF(store_complete_q, store_complete_i, 1'b0) + logic illegal_insn_load, illegal_insn_store; + `FF(load_complete_q, load_complete_i || illegal_insn_load, 1'b0) + `FF(store_complete_q, store_complete_i || illegal_insn_store, 1'b0) // NP2 Slide support logic is_stride_np2; @@ -240,14 +237,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( always_comb begin: p_decoder // Default values - vstart_d = vstart_q; - vl_d = vl_q; - vtype_d = vtype_q; + csr_vstart_d = csr_vstart_q; + csr_vl_d = csr_vl_q; + csr_vtype_d = csr_vtype_q; state_d = state_q; eew_d = eew_q; eew_valid_d = eew_valid_q; - lmul_vs2 = vtype_q.vlmul; - lmul_vs1 = vtype_q.vlmul; + lmul_vs2 = csr_vtype_q.vlmul; + lmul_vs1 = csr_vtype_q.vlmul; reshuffle_req_d = reshuffle_req_q; eew_old_buffer_d = eew_old_buffer_q; @@ -259,8 +256,10 @@ module ara_dispatcher import 
ara_pkg::*; import rvv_pkg::*; #( rs_mask_request_d = 1'b0; illegal_insn = 1'b0; - vxsat_d = vxsat_q; - vxrm_d = vxrm_q; + illegal_insn_load = 1'b0; + illegal_insn_store = 1'b0; + csr_vxsat_d = csr_vxsat_q; + csr_vxrm_d = csr_vxrm_q; is_vload = 1'b0; is_vstore = 1'b0; @@ -275,8 +274,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( is_decoding = 1'b0; in_lane_op = 1'b0; - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; acc_resp_o = '{ trans_id : acc_req_i.trans_id, load_complete : load_zero_vl | load_complete_q, @@ -285,18 +282,20 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( fflags_valid : |fflags_ex_valid_i, default : '0 }; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; // fflags for (int lane = 0; lane < NrLanes; lane++) acc_resp_o.fflags |= fflags_ex_i[lane]; ara_req_d = '{ - vl : vl_q, - vstart : vstart_q, - vtype : vtype_q, - emul : vtype_q.vlmul, - eew_vs1 : vtype_q.vsew, - eew_vs2 : vtype_q.vsew, - eew_vd_op : vtype_q.vsew, + vl : csr_vl_q, + vstart : csr_vstart_q, + vtype : csr_vtype_q, + emul : csr_vtype_q.vlmul, + eew_vs1 : csr_vtype_q.vsew, + eew_vs2 : csr_vtype_q.vsew, + eew_vd_op : csr_vtype_q.vsew, eew_vmask : eew_q[VMASK], cvt_resize : CVT_SAME, default : '0 @@ -307,9 +306,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b0; // Saturation in any lane will raise vxsat flag - vxsat_d |= |vxsat_flag_i; + csr_vxsat_d |= |vxsat_flag_i; // Fixed-point rounding mode is applied to all lanes - for (int lane = 0; lane < NrLanes; lane++) alu_vxrm_o[lane] = vxrm_q; + for (int lane = 0; lane < NrLanes; lane++) alu_vxrm_o[lane] = csr_vxrm_q; // Rounding mode is shared between all lanes for (int lane = 0; lane < NrLanes; lane++) acc_resp_o.fflags |= fflags_ex_i[lane]; // Special states @@ -325,8 +324,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); // Stall the 
interface, wait for the backend to accept the injected uop - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; // Handle LMUL > 1 rs_lmul_cnt_d = rs_lmul_cnt_q; @@ -424,14 +423,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end end end - endcase + endcase // state_q - if (state_d == NORMAL_OPERATION && state_q != RESHUFFLE) begin - if (acc_req_valid_i && ara_req_ready_i && acc_resp_ready_i) begin + if (state_d == NORMAL_OPERATION && state_q != RESHUFFLE) begin : not_reshuffling + if (acc_req_i.req_valid && ara_req_ready_i && acc_req_i.resp_ready) begin : ready // Decoding is_decoding = 1'b1; // Acknowledge the request - acc_req_ready_o = ara_req_ready_i; + acc_resp_o.req_ready = 1'b1; // Decode the instructions based on their opcode unique case (acc_req_i.insn.itype.opcode) @@ -439,45 +438,46 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Vector Arithmetic instructions // ////////////////////////////////////// - riscv::OpcodeVec: begin + riscv::OpcodeVec: begin : OpcodeVec // Instruction is of one of the RVV types automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); - // These always respond at the same cycle - acc_resp_valid_o = 1'b1; + // These (mostly) always respond at the same cycle + acc_resp_o.resp_valid = 1'b1; // Decode based on their func3 field unique case (insn.varith_type.func3) // Configuration instructions OPCFG: begin: opcfg // These can be acknowledged regardless of the state of Ara - acc_req_ready_o = 1'b1; + // NOTE: unless there is a pending fault-only first vector load + // acc_resp_o.req_ready = 1'b1; is_config = 1'b1; // Update vtype if (insn.vsetvli_type.func1 == 1'b0) begin // vsetvli - vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetvli_type.zimm11)); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetvli_type.zimm11)); end else if (insn.vsetivli_type.func2 == 2'b11) begin // vsetivli - vtype_d = 
vtype_xlen(riscv::xlen_t'(insn.vsetivli_type.zimm10)); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetivli_type.zimm10)); end else if (insn.vsetvl_type.func7 == 7'b100_0000) begin // vsetvl - vtype_d = vtype_xlen(riscv::xlen_t'(acc_req_i.rs2[7:0])); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(acc_req_i.rs2[7:0])); end else - acc_resp_o.error = 1'b1; + illegal_insn = 1'b1; // Check whether the updated vtype makes sense - if ((vtype_d.vsew > rvv_pkg::vew_e'($clog2(ELENB))) || // SEW <= ELEN - (vtype_d.vlmul == LMUL_RSVD) || // reserved value + if ((csr_vtype_d.vsew > rvv_pkg::vew_e'($clog2(ELENB))) || // SEW <= ELEN + (csr_vtype_d.vlmul == LMUL_RSVD) || // reserved value // LMUL >= SEW/ELEN - (signed'($clog2(ELENB)) + signed'(vtype_d.vlmul) < signed'(vtype_d.vsew))) begin - vtype_d = '{vill: 1'b1, default: '0}; - vl_d = '0; + (signed'($clog2(ELENB)) + signed'(csr_vtype_d.vlmul) < signed'(csr_vtype_d.vsew))) begin + csr_vtype_d = '{vill: 1'b1, default: '0}; + csr_vl_d = '0; end // Update the vector length else begin // Maximum vector length. VLMAX = LMUL * VLEN / SEW. 
- automatic int unsigned vlmax = VLENB >> vtype_d.vsew; - unique case (vtype_d.vlmul) + automatic int unsigned vlmax = VLENB >> csr_vtype_d.vsew; + unique case (csr_vtype_d.vlmul) LMUL_1 : vlmax <<= 0; LMUL_2 : vlmax <<= 1; LMUL_4 : vlmax <<= 2; @@ -490,24 +490,24 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase if (insn.vsetivli_type.func2 == 2'b11) begin // vsetivli - vl_d = vlen_t'(insn.vsetivli_type.uimm5); + csr_vl_d = vlen_t'(insn.vsetivli_type.uimm5); end else begin // vsetvl || vsetvli if (insn.vsetvl_type.rs1 == '0 && insn.vsetvl_type.rd == '0) begin // Do not update the vector length - vl_d = vl_q; + csr_vl_d = csr_vl_q; end else if (insn.vsetvl_type.rs1 == '0 && insn.vsetvl_type.rd != '0) begin // Set the vector length to vlmax - vl_d = vlmax; + csr_vl_d = vlmax; end else begin // Normal stripmining - vl_d = ((|acc_req_i.rs1[$bits(acc_req_i.rs1)-1:$bits(vl_d)]) || + csr_vl_d = ((|acc_req_i.rs1[$bits(acc_req_i.rs1)-1:$bits(csr_vl_d)]) || (vlen_t'(acc_req_i.rs1) > vlmax)) ? vlmax : vlen_t'(acc_req_i.rs1); end end end // Return the new vl - acc_resp_o.result = vl_d; + acc_resp_o.result = csr_vl_d; // If the vtype has changed, wait for the backend before issuing any new instructions. // This is to avoid hazards on implicit register labels when LMUL_old > LMUL_new @@ -515,7 +515,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Checking only lmul_q is a trick: we want to stall only if both lmuls have // zero MSB. 
If lmul_q has zero MSB, it's greater than lmul_d only if also // lmul_d has zero MSB since the slice comparison is intrinsically unsigned - if (!vtype_q.vlmul[2] && (vtype_d.vlmul[2:0] < vtype_q.vlmul[2:0])) + if (!csr_vtype_q.vlmul[2] && (csr_vtype_d.vlmul[2:0] < csr_vtype_q.vlmul[2:0])) state_d = WAIT_IDLE; end @@ -635,7 +635,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.varith_type.vm) begin ara_req_d.eew_vs1 = eew_q[ara_req_d.vs1]; ara_req_d.vtype.vsew = eew_q[ara_req_d.vs1]; - ara_req_d.vl = (vl_q << vtype_q.vsew[1:0]) >> ara_req_d.eew_vs1[1:0]; + ara_req_d.vl = (csr_vl_q << csr_vtype_q.vsew[1:0]) >> ara_req_d.eew_vs1[1:0]; end end 6'b100000: ara_req_d.op = ara_pkg::VSADDU; @@ -651,11 +651,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -669,11 +669,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -686,28 +686,28 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end // Reductions encode in cvt_resize the neutral value bits // CVT_WIDE is 2'b00 (hack to save wires) 6'b110000: begin ara_req_d.op = ara_pkg::VWREDSUMU; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin ara_req_d.op = ara_pkg::VWREDSUM; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -731,7 +731,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPIVX: begin: opivx @@ -761,7 +761,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin ara_req_d.op = ara_pkg::VSLIDEUP; 
ara_req_d.stride = acc_req_i.rs1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslideup/vslide1up on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Vl refers to current system vsew, but operand requesters @@ -769,13 +769,13 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // i.e., request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = acc_req_i.rs1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslidedown/vslide1down on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling @@ -884,11 +884,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -902,11 +902,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -919,11 +919,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -941,7 +941,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPIVI: begin: opivi @@ -969,19 +969,19 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = {{ELEN{insn.varith_type.rs1[19]}}, insn.varith_type.rs1}; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslideup/vslide1up on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] 
|| - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = {{ELEN{insn.varith_type.rs1[19]}}, insn.varith_type.rs1}; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslidedown/vslide1down on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling @@ -1094,11 +1094,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -1112,11 +1112,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -1129,11 +1129,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -1151,7 +1151,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPMVV: begin: opmvv @@ -1215,8 +1215,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b010000: begin // VWXUNARY0 // vmv.x.s // Stall the interface until we get the result - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; case (insn.varith_type.rs1) 5'b00000: begin @@ -1240,7 +1240,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // Sign extend operands - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW8: begin ara_req_d.conversion_vs2 = OpQueueConversionSExt8; end @@ -1254,13 +1254,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_req_ready_o = 1'b1; - acc_resp_o.result = ara_resp_i.resp; - acc_resp_o.error = ara_resp_i.error; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; - end + if ( ara_resp_valid_i ) begin : ara_resp_valid + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.result = ara_resp_i.resp; + acc_resp_o.exception = 
ara_resp_i.exception; + // Clear request to backend + ara_req_valid_d = 1'b0; + end : ara_resp_valid end 6'b010100: begin ara_req_d.use_vd_op = 1'b1; @@ -1360,8 +1361,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW64) || - int'(vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW64) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00011: begin // VSEXT.VF8 @@ -1370,44 +1371,44 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW64) || - int'(vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW64) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00100: begin // VZEXT.VF4 ara_req_d.conversion_vs2 = OpQueueConversionZExt4; - ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); + ara_req_d.eew_vs2 = prev_prev_ew(csr_vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW32) || - int'(vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00101: begin // VSEXT.VF4 ara_req_d.conversion_vs2 = OpQueueConversionSExt4; - ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); + ara_req_d.eew_vs2 = prev_prev_ew(csr_vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW32) || - int'(vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00110: begin // VZEXT.VF2 ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = 
vtype_q.vsew.prev(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW16) || int'(csr_vtype_q.vlmul) inside {LMUL_1_8}) illegal_insn = 1'b1; end 5'b00111: begin // VSEXT.VF2 ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.prev(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW16) || int'(csr_vtype_q.vlmul) inside {LMUL_1_8}) illegal_insn = 1'b1; end default: illegal_insn = 1'b1; @@ -1447,92 +1448,92 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Widening instructions 6'b110000: begin // VWADDU ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; 
ara_req_d.cvt_resize = CVT_WIDE; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110111: begin // VWSUB.W ara_req_d.op = 
ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -1540,31 +1541,31 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); 
ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end default: illegal_insn = 1'b1; @@ -1598,7 +1599,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPMVX: begin: opmvx @@ -1623,17 +1624,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin // vslide1up ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if 
(|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin // vslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; end @@ -1641,7 +1642,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // vmv.s.x ara_req_d.op = ara_pkg::VMVSX; ara_req_d.use_vs2 = 1'b0; - ara_req_d.vl = |vl_q ? 1 : '0; + ara_req_d.vl = |csr_vl_q ? 1 : '0; // This instruction ignores LMUL checks skip_lmul_checks = 1'b1; end @@ -1679,92 +1680,92 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Widening instructions 6'b110000: begin // VWADDU ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = 
OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110111: begin // VWSUB.W 
ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -1772,41 +1773,41 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); 
ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111110: begin // VWMACCUS ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end default: illegal_insn = 1'b1; @@ -1834,7 +1835,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; // Instruction is invalid if the vtype is invalid - if 
(vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPFVV: begin: opfvv @@ -1893,8 +1894,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b010000: begin // VWFUNARY0 // vmv.f.s // Stall the interface until we get the result - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; ara_req_d.op = ara_pkg::VFMVFS; ara_req_d.use_vd = 1'b0; @@ -1904,7 +1905,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // Zero-extend operands - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW16: begin ara_req_d.conversion_vs2 = OpQueueConversionZExt4; end @@ -1915,13 +1916,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_req_ready_o = 1'b1; - acc_resp_o.result = ara_resp_i.resp; - acc_resp_o.error = ara_resp_i.error; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; - end + if ( ara_resp_valid_i ) begin : ara_resp_valid + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.result = ara_resp_i.resp; + acc_resp_o.exception = ara_resp_i.exception; + // Clear request to backend + ara_req_valid_d = 1'b0; + end : ara_resp_valid end 6'b011000: ara_req_d.op = ara_pkg::VMFEQ; 6'b011001: ara_req_d.op = ara_pkg::VMFLE; @@ -1942,96 +1944,95 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000: begin // Widening VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01001: begin // Widening VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = 
next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01010: begin // Widening VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01011: begin // Widening VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01100: begin // Widening VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01110: begin // Widening VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01111: begin // Widening VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b10000: begin // Narrowing VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = 
CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10001: begin // Narrowing VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10010: begin // Narrowing VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10011: begin // Narrowing VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10100: begin // Narrowing VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10101: begin // Narrowing VFNCVTRODFF ara_req_d.op = VFNCVTRODFF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10110: begin // Narrowing VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10111: begin // Narrowing VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: begin // Trigger an error - acc_resp_o.error = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; end endcase end @@ -2094,99 +2095,99 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b110000: begin // VFWADD ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = 
OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b110001: begin // VFWREDUSUM ara_req_d.op = ara_pkg::VFWREDUSUM; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vs1 = vtype_q.vsew.next(); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = resize_e'(2'b00); end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b110011: begin // VFWREDOSUM ara_req_d.op = ara_pkg::VFWREDOSUM; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vs1 = vtype_q.vsew.next(); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = resize_e'(2'b00); end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); 
ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = 
next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -2242,7 +2243,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end else illegal_insn = 1'b1; // Vector FP instructions are disabled end @@ -2281,17 +2282,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin // vfslide1up ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin // vfslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request 
will need reshuffling ara_req_d.scale_vl = 1'b1; end @@ -2299,7 +2300,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // vmv.s.f ara_req_d.op = ara_pkg::VFMVSF; ara_req_d.use_vs2 = 1'b0; - ara_req_d.vl = |vl_q ? 1 : '0; + ara_req_d.vl = |csr_vl_q ? 1 : '0; // This instruction ignores LMUL checks skip_lmul_checks = 1'b1; end @@ -2360,85 +2361,85 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b110000: begin // VFWADD ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.wide_fp_imm = 1'b1; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = 
next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.wide_fp_imm = 1'b1; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = 
next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase // Check if the FP scalar operand is NaN-boxed. If not, replace it with a NaN. - case (vtype_q.vsew) + case (csr_vtype_q.vsew) EW16: if (~(&acc_req_i.rs1[63:16])) ara_req_d.scalar_op = 64'h0000000000007e00; EW32: if (~(&acc_req_i.rs1[63:32])) ara_req_d.scalar_op = 64'h000000007fc00000; endcase @@ -2481,17 +2482,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end else illegal_insn = 1'b1; // Vector FP instructions are disabled end endcase - end + end : OpcodeVec //////////////////// // Vector Loads // //////////////////// - riscv::OpcodeLoadFp: begin + riscv::OpcodeLoadFp: begin : OpcodeLoadFp // Instruction is of one of the RVV types automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); @@ -2499,7 +2500,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( is_vload = 1'b1; // Wait before acknowledging this instruction - acc_req_ready_o = 1'b0; + acc_resp_o.req_ready = 1'b0; // These generate a request to Ara's backend ara_req_d.vd = insn.vmem_type.rd; @@ -2515,7 +2516,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW8; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW8; end end @@ -2523,7 +2524,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && 
insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW16; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW16; end end @@ -2531,7 +2532,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW32; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW32; end end @@ -2539,15 +2540,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW64; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW64; end end default: begin // Invalid. Element is too wide, or encoding is non-existant. - acc_req_ready_o = 1'b1; - acc_resp_o.error = 1'b1; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + illegal_insn = 1'b1; + ara_req_valid_d = 1'b0; end endcase @@ -2562,19 +2563,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000:; // Unit-strided, whole registers 5'b01011: begin // Unit-strided, mask load, EEW=1 // We operate ceil(vl/8) bytes - ara_req_d.vl = (vl_q >> 3) + |vl_q[2:0]; + ara_req_d.vl = (csr_vl_q >> 3) + |csr_vl_q[2:0]; ara_req_d.vtype.vsew = EW8; end 5'b10000: begin // Unit-strided, fault-only first // TODO: Not implemented - illegal_insn = 1'b1; - acc_req_ready_o = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end default: begin // Reserved - illegal_insn = 1'b1; - acc_req_ready_o = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end endcase end @@ -2594,24 +2591,22 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // For memory operations: EMUL = LMUL * (EEW / SEW) // EEW is encoded in the instruction - ara_req_d.emul = 
vlmul_e'(vtype_q.vlmul + (ara_req_d.vtype.vsew - vtype_q.vsew)); + ara_req_d.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req_d.vtype.vsew - csr_vtype_q.vsew)); // Exception if EMUL > 8 or < 1/8 - unique case ({vtype_q.vlmul[2], ara_req_d.emul[2]}) + unique case ({csr_vtype_q.vlmul[2], ara_req_d.emul[2]}) // The new emul is lower than the previous lmul 2'b01: begin // But the new eew is greater than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) > 0) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) > 0) begin + illegal_insn_load = 1'b1; end end // The new emul is greater than the previous lmul 2'b10: begin // But the new eew is lower than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) < 0) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) < 0) begin + illegal_insn_load = 1'b1; end end default:; @@ -2621,20 +2616,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // access. unique case (ara_req_d.emul) LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end LMUL_RSVD: begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end default:; endcase @@ -2644,9 +2635,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Execute also if vl == 0 ignore_zero_vl_check = 1'b1; // The LMUL value is kept in the instruction itself - illegal_insn = 1'b0; - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + illegal_insn_load = 1'b0; ara_req_valid_d = 1'b1; // Maximum vector length. VLMAX = nf * VLEN / EW8. 
@@ -2670,22 +2659,23 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default: begin // Trigger an error for the reserved simm values - illegal_insn = 1'b1; + illegal_insn_load = 1'b1; end endcase end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_req_ready_o = 1'b1; - acc_resp_o.error = ara_resp_i.error; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; - // In case of error, modify vstart - if (ara_resp_i.error) - vstart_d = ara_resp_i.error_vl; - end - end + if ( ara_resp_valid_i ) begin : ara_resp_valid + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; // Clear request to backend + // In case of exception, modify vstart + if ( ara_resp_i.exception.valid ) begin : exception + csr_vstart_d = ara_resp_i.exception_vl; + end : exception + end : ara_resp_valid + end : OpcodeLoadFp ///////////////////// // Vector Stores // @@ -2697,7 +2687,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // The current vector length refers to the target EEW! // Vector stores never re-shuffle the source register! - riscv::OpcodeStoreFp: begin + riscv::OpcodeStoreFp: begin : OpcodeStoreFp // Instruction is of one of the RVV types automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); @@ -2705,7 +2695,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( is_vstore = 1'b1; // Wait before acknowledging this instruction - acc_req_ready_o = 1'b0; + acc_resp_o.req_ready = 1'b0; // vl depends on the EEW encoded in the instruction. // Ara does not reshuffle source vregs upon vector stores, @@ -2728,7 +2718,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW8; // ara_req_d.vtype.vsew is the target EEW! 
end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW8; end end @@ -2736,7 +2726,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW16; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW16; end end @@ -2744,7 +2734,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW32; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW32; end end @@ -2752,15 +2742,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW64; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW64; end end default: begin // Invalid. Element is too wide, or encoding is non-existant. 
- acc_req_ready_o = 1'b1; - acc_resp_o.error = 1'b1; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; end endcase @@ -2775,13 +2762,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000:; // Unit-strided, whole registers 5'b01011: begin // Unit-strided, mask store, EEW=1 // We operate ceil(vl/8) bytes - ara_req_d.vl = (vl_q >> 3) + |vl_q[2:0]; + ara_req_d.vl = (csr_vl_q >> 3) + |csr_vl_q[2:0]; ara_req_d.vtype.vsew = EW8; end default: begin // Reserved - illegal_insn = 1'b1; - acc_req_ready_o = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_store = 1'b1; end endcase end @@ -2801,24 +2786,22 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // For memory operations: EMUL = LMUL * (EEW / SEW) // EEW is encoded in the instruction - ara_req_d.emul = vlmul_e'(vtype_q.vlmul + (ara_req_d.vtype.vsew - vtype_q.vsew)); + ara_req_d.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req_d.vtype.vsew - csr_vtype_q.vsew)); // Exception if EMUL > 8 or < 1/8 - unique case ({vtype_q.vlmul[2], ara_req_d.emul[2]}) + unique case ({csr_vtype_q.vlmul[2], ara_req_d.emul[2]}) // The new emul is lower than the previous lmul 2'b01: begin // But the new eew is greater than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) > 0) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) > 0) begin + illegal_insn_store = 1'b1; end end // The new emul is greater than the previous lmul 2'b10: begin // But the new eew is lower than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) < 0) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) < 0) begin + illegal_insn_store = 1'b1; end end default:; @@ -2828,20 +2811,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // access. 
unique case (ara_req_d.emul) LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_store = 1'b1; end LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_store = 1'b1; end LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_store = 1'b1; end LMUL_RSVD: begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_store = 1'b1; end default:; endcase @@ -2873,227 +2852,309 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default: begin // Trigger an error for the reserved simm values - illegal_insn = 1'b1; + illegal_insn_store = 1'b1; end endcase - illegal_insn = 1'b0; - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + // illegal_insn_store = 1'b0; // TODO: IS THIS A BUG? + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; ara_req_valid_d = 1'b1; end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_req_ready_o = 1'b1; - acc_resp_o.error = ara_resp_i.error; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; - // If there is an error, change vstart - if (ara_resp_i.error) - vstart_d = ara_resp_i.error_vl; - end - end + if ( ara_resp_valid_i ) begin : ara_resp_valid + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; // Clear request to backend + // In case of exception, modify vstart + if ( ara_resp_i.exception.valid ) begin : exception + csr_vstart_d = ara_resp_i.exception_vl; + end : exception + end : ara_resp_valid + end : OpcodeStoreFp //////////////////////////// // CSR Reads and Writes // //////////////////////////// - riscv::OpcodeSystem: begin - // These always respond at the same cycle - acc_resp_valid_o = 1'b1; - is_config = 1'b1; - - unique case 
(acc_req_i.insn.itype.funct3) - 3'b001: begin // csrrw - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - // Only vstart can be written with CSR instructions. - riscv::CSR_VSTART: begin - vstart_d = acc_req_i.rs1; - acc_resp_o.result = vstart_q; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vlen_t'(vxsat_q); - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b010: begin // csrrs - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q | vlen_t'(acc_req_i.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_q | vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_q | vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vlen_t'(vxsat_q); - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b011: begin // csrrc - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q & ~vlen_t'(acc_req_i.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b101: begin // csrrwi - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - // Only vstart can be written with CSR instructions. - riscv::CSR_VSTART: begin - vstart_d = vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = acc_req_i.insn.itype.rs1[15]; - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b110: begin // csrrsi - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q | vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = vxsat_q | vxsat_e'(acc_req_i.insn.itype.rs1[15]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b111: begin // csrrci - // Decode the CSR. - unique case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q & ~vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = vxsat_q & ~vxsat_e'(acc_req_i.insn.itype.rs1[15]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - default: begin - // Trigger an illegal instruction - acc_resp_o.error = 1'b1; - acc_resp_valid_o = 1'b1; - end - endcase - end + riscv::OpcodeSystem: begin : OpcodeSystem + // CSR ops have 
semantic dependency from vector instrucitons. + // Therefore, Ara must be idle before performing any CSR operation. + + // Stall if there is any pending vector instruction + // NOTE: This is overconstraining. Not all CSR ops actually need to stall if a vector instruction is pending. + // E.g., CSR vl is never updated by instructions past ara_dispatcher, except for "unit-stride fault-only-first loads". Reading vl would be safe otherwise. + // E.g., CSR vlenb is a design-constant parameter, reading is always safe. + // E.g., CSRs vxrm and vxsat have no influence on-non fixed-point instructions, it could be read and written safely when no fixed-point operation is running. + // By better analyzing the spec, more of optimizations of such can be made. For the sake of simplicity, the current implementation treats CSR ops as one block. + if ( ara_idle_i ) begin : ara_idle + // These always respond at the same cycle + acc_resp_o.resp_valid = 1'b1; + is_config = 1'b1; + + unique case (acc_req_i.insn.itype.funct3) + 3'b001: begin // csrrw + // Decode the CSR. + case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + // Only vstart can be written with CSR instructions. + riscv::CSR_VSTART: begin + csr_vstart_d = acc_req_i.rs1; + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[16:15]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'(csr_vxsat_q); + end + riscv::CSR_VCSR: begin + csr_vxrm_d = vxrm_t'( acc_req_i.rs1[17:16] ); + csr_vxsat_d = vxsat_e'( acc_req_i.rs1[15] ); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b010: begin // csrrs + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q | vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[16:15]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'(csr_vxsat_q); + end + riscv::CSR_VCSR: begin + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[17:16]); + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b011: begin // csrrc + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q & ~vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b101: begin // csrrwi + // Decode the CSR. + case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + // Only vstart can be written with CSR instructions. + riscv::CSR_VSTART: begin + csr_vstart_d = vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = acc_req_i.rs1[0]; + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + default: illegal_insn = 1'b1; + endcase + end + 3'b110: begin // csrrsi + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q | vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = { csr_vxrm_q, csr_vxsat_q }; + end + default: illegal_insn = 1'b1; + endcase + end + 3'b111: begin // csrrci + // Decode the CSR. 
+ unique case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q & ~vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q & ~vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = { csr_vxrm_q, csr_vxsat_q }; + end + default: illegal_insn= 1'b1; + endcase + end + default: begin + // Trigger an illegal instruction + illegal_insn = 1'b1; + end + endcase // acc_req_i.insn.itype.funct3 + end : ara_idle + else begin : csr_stall + acc_resp_o.req_ready = 1'b0; + end : csr_stall + end : OpcodeSystem default: begin // Trigger an illegal instruction - acc_resp_o.error = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn = 1'b1; end - endcase - end + + endcase // acc_req_i.insn.itype.opcode + end : ready // Check that we have fixed-point support if requested // vxsat and vxrm are always accessible anyway - if (ara_req_valid_d && (ara_req_d.op inside {[VSADDU:VNCLIPU], VSMUL}) && (FixPtSupport == FixedPointDisable)) + if (ara_req_valid_d && (ara_req_d.op inside {[VSADDU:VNCLIPU], VSMUL}) && (FixPtSupport == FixedPointDisable)) begin : fixed_point_check 
illegal_insn = 1'b1; + end : fixed_point_check // Check that we have we have vfrec7, vfrsqrt7 - if (ara_req_valid_d && (ara_req_d.op inside {VFREC7, VFRSQRT7}) && (FPExtSupport == FPExtSupportDisable)) + if (ara_req_valid_d && (ara_req_d.op inside {VFREC7, VFRSQRT7}) && (FPExtSupport == FPExtSupportDisable)) begin : vfrec7_vfrsqrt7_support_check illegal_insn = 1'b1; + end : vfrec7_vfrsqrt7_support_check + + + // Raise an illegal instruction exception + if ( illegal_insn || illegal_insn_load || illegal_insn_store ) begin : illegal_instruction + ara_req_valid_d = 1'b0; + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.exception.valid = 1'b1; + acc_resp_o.exception.cause = riscv::ILLEGAL_INSTR; + acc_resp_o.exception.tval = acc_req_i.insn; + end : illegal_instruction + + // Reset vstart to zero for successful vector instructions + // Corner cases: + // * vstart exception reporting, e.g., VLSU, is handled above + // * CSR operations are not considered vector instructions + if ( acc_resp_o.resp_valid + & !acc_resp_o.exception.valid + & (acc_req_i.insn.itype.opcode != riscv::OpcodeSystem) + ) begin : reset_vstart + csr_vstart_d = '0; + end : reset_vstart // Check if we need to reshuffle our vector registers involved in the operation // This operation is costly when occurs, so avoid it if possible - if (ara_req_valid_d && !acc_resp_o.error) begin + if ( ara_req_valid_d && !acc_resp_o.exception.valid ) begin : check_reshuffle automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); // Is the instruction an in-lane one and could it be subject to reshuffling? @@ -3104,7 +3165,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Optimization: reshuffle vd only if we are not overwriting the whole vector register! 
reshuffle_req_d = {ara_req_d.use_vs1 && (ara_req_d.eew_vs1 != eew_q[ara_req_d.vs1]) && eew_valid_q[ara_req_d.vs1] && in_lane_op, ara_req_d.use_vs2 && (ara_req_d.eew_vs2 != eew_q[ara_req_d.vs2]) && eew_valid_q[ara_req_d.vs2] && in_lane_op, - ara_req_d.use_vd && (ara_req_d.vtype.vsew != eew_q[ara_req_d.vd ]) && eew_valid_q[ara_req_d.vd ] && vl_q != (VLENB >> ara_req_d.vtype.vsew)}; + ara_req_d.use_vd && (ara_req_d.vtype.vsew != eew_q[ara_req_d.vd ]) && eew_valid_q[ara_req_d.vd ] && csr_vl_q != (VLENB >> ara_req_d.vtype.vsew)}; // Prepare the information to reshuffle the vector registers during the next cycles // Reshuffle in the following order: vd, v2, v1. The order is arbitrary. @@ -3126,7 +3187,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default:; endcase - end + end : check_reshuffle // Reshuffle if at least one of the three registers needs a reshuffle if (|reshuffle_req_d) begin @@ -3134,8 +3195,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); // Stall the interface, and inject a reshuffling instruction - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; ara_req_valid_d = 1'b0; // Initialize the reshuffle counter limit to handle LMUL > 1 @@ -3149,13 +3210,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Reshuffle state_d = RESHUFFLE; end - end - - // Raise an illegal instruction exception - if (illegal_insn) begin - acc_resp_o.error = 1'b1; - ara_req_valid_d = 1'b0; - end + end : not_reshuffling // Update the EEW if (ara_req_valid_d && ara_req_d.use_vd && ara_req_ready_i) begin @@ -3195,14 +3250,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Any valid non-config instruction is a NOP if vl == 0, with some exceptions, // e.g. 
whole vector memory operations / whole vector register move - if (is_decoding && (vl_q == '0 || null_vslideup) && !is_config && - !ignore_zero_vl_check && !acc_resp_o.error) begin + if (is_decoding && (csr_vl_q == '0 || null_vslideup) && !is_config && + !ignore_zero_vl_check && !acc_resp_o.exception.valid) begin // If we are acknowledging a memory operation, we must tell Ariane that the memory // operation was resolved (to decrement its pending load/store counter) // This can collide with the same signal from the vector load/store unit, so we must // delay the zero_vl acknowledge by 1 cycle - acc_req_ready_o = ~((is_vload & load_complete_q) | (is_vstore & store_complete_q)); - acc_resp_valid_o = ~((is_vload & load_complete_q) | (is_vstore & store_complete_q)); + acc_resp_o.req_ready = ~((is_vload & load_complete_q) | (is_vstore & store_complete_q)); + acc_resp_o.resp_valid = ~((is_vload & load_complete_q) | (is_vstore & store_complete_q)); ara_req_valid_d = 1'b0; load_zero_vl = is_vload; store_zero_vl = is_vstore; diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 5fb0abff1..f384eaa63 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -40,8 +40,8 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i output logic pe_scalar_resp_ready_o, // Interface with the Address Generation input logic addrgen_ack_i, - input logic addrgen_error_i, - input vlen_t addrgen_error_vl_i + input ariane_pkg::exception_t addrgen_exception_i, + input vlen_t addrgen_exception_vl_i ); /////////////////////////////////// @@ -438,8 +438,8 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i state_d = IDLE; ara_req_ready_o = 1'b1; ara_resp_valid_o = 1'b1; - ara_resp_o.error = addrgen_error_i; - ara_resp_o.error_vl = addrgen_error_vl_i; + ara_resp_o.exception = addrgen_exception_i; + ara_resp_o.exception_vl = addrgen_exception_vl_i; end // Wait for the scalar result diff 
--git a/hardware/src/ara_soc.sv b/hardware/src/ara_soc.sv index 431605ef0..5c1ac53db 100644 --- a/hardware/src/ara_soc.sv +++ b/hardware/src/ara_soc.sv @@ -53,6 +53,7 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( `include "axi/assign.svh" `include "axi/typedef.svh" `include "common_cells/registers.svh" + `include "apb/typedef.svh" ////////////////////// // Memory Regions // @@ -137,7 +138,8 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( UniqueIds : 1'b0, AxiAddrWidth : AxiAddrWidth, AxiDataWidth : AxiWideDataWidth, - NoAddrRules : NrAXISlaves + NoAddrRules : NrAXISlaves, + default : '0 }; axi_pkg::xbar_rule_64_t [NrAXISlaves-1:0] routing_rules; @@ -187,8 +189,8 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( axi_atop_filter #( .AxiIdWidth (AxiSocIdWidth ), .AxiMaxWriteTxns(4 ), - .req_t (soc_wide_req_t ), - .resp_t (soc_wide_resp_t) + .axi_req_t (soc_wide_req_t ), + .axi_resp_t (soc_wide_resp_t) ) i_l2mem_atop_filter ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -257,96 +259,102 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( // UART // //////////// - axi2apb_64_32 #( - .AXI4_ADDRESS_WIDTH(AxiAddrWidth ), - .AXI4_RDATA_WIDTH (AxiNarrowDataWidth), - .AXI4_WDATA_WIDTH (AxiNarrowDataWidth), - .AXI4_ID_WIDTH (AxiSocIdWidth ), - .AXI4_USER_WIDTH (AxiUserWidth ), - .BUFF_DEPTH_SLAVE (2 ), - .APB_ADDR_WIDTH (32 ) - ) i_axi2apb_64_32_uart ( - .ACLK (clk_i ), - .ARESETn (rst_ni ), - .test_en_i (1'b0 ), - .AWID_i (periph_narrow_axi_req[UART].aw.id ), - .AWADDR_i (periph_narrow_axi_req[UART].aw.addr ), - .AWLEN_i (periph_narrow_axi_req[UART].aw.len ), - .AWSIZE_i (periph_narrow_axi_req[UART].aw.size ), - .AWBURST_i (periph_narrow_axi_req[UART].aw.burst ), - .AWLOCK_i (periph_narrow_axi_req[UART].aw.lock ), - .AWCACHE_i (periph_narrow_axi_req[UART].aw.cache ), - .AWPROT_i (periph_narrow_axi_req[UART].aw.prot ), - .AWREGION_i(periph_narrow_axi_req[UART].aw.region), - .AWUSER_i (periph_narrow_axi_req[UART].aw.user ), - .AWQOS_i 
(periph_narrow_axi_req[UART].aw.qos ), - .AWVALID_i (periph_narrow_axi_req[UART].aw_valid ), - .AWREADY_o (periph_narrow_axi_resp[UART].aw_ready), - .WDATA_i (periph_narrow_axi_req[UART].w.data ), - .WSTRB_i (periph_narrow_axi_req[UART].w.strb ), - .WLAST_i (periph_narrow_axi_req[UART].w.last ), - .WUSER_i (periph_narrow_axi_req[UART].w.user ), - .WVALID_i (periph_narrow_axi_req[UART].w_valid ), - .WREADY_o (periph_narrow_axi_resp[UART].w_ready ), - .BID_o (periph_narrow_axi_resp[UART].b.id ), - .BRESP_o (periph_narrow_axi_resp[UART].b.resp ), - .BVALID_o (periph_narrow_axi_resp[UART].b_valid ), - .BUSER_o (periph_narrow_axi_resp[UART].b.user ), - .BREADY_i (periph_narrow_axi_req[UART].b_ready ), - .ARID_i (periph_narrow_axi_req[UART].ar.id ), - .ARADDR_i (periph_narrow_axi_req[UART].ar.addr ), - .ARLEN_i (periph_narrow_axi_req[UART].ar.len ), - .ARSIZE_i (periph_narrow_axi_req[UART].ar.size ), - .ARBURST_i (periph_narrow_axi_req[UART].ar.burst ), - .ARLOCK_i (periph_narrow_axi_req[UART].ar.lock ), - .ARCACHE_i (periph_narrow_axi_req[UART].ar.cache ), - .ARPROT_i (periph_narrow_axi_req[UART].ar.prot ), - .ARREGION_i(periph_narrow_axi_req[UART].ar.region), - .ARUSER_i (periph_narrow_axi_req[UART].ar.user ), - .ARQOS_i (periph_narrow_axi_req[UART].ar.qos ), - .ARVALID_i (periph_narrow_axi_req[UART].ar_valid ), - .ARREADY_o (periph_narrow_axi_resp[UART].ar_ready), - .RID_o (periph_narrow_axi_resp[UART].r.id ), - .RDATA_o (periph_narrow_axi_resp[UART].r.data ), - .RRESP_o (periph_narrow_axi_resp[UART].r.resp ), - .RLAST_o (periph_narrow_axi_resp[UART].r.last ), - .RUSER_o (periph_narrow_axi_resp[UART].r.user ), - .RVALID_o (periph_narrow_axi_resp[UART].r_valid ), - .RREADY_i (periph_narrow_axi_req[UART].r_ready ), - .PENABLE (uart_penable_o ), - .PWRITE (uart_pwrite_o ), - .PADDR (uart_paddr_o ), - .PSEL (uart_psel_o ), - .PWDATA (uart_pwdata_o ), - .PRDATA (uart_prdata_i ), - .PREADY (uart_pready_i ), - .PSLVERR (uart_pslverr_i ) + `AXI_TYPEDEF_ALL(uart_axi, 
axi_addr_t, axi_soc_id_t, logic [31:0], logic [3:0], axi_user_t) + `AXI_LITE_TYPEDEF_ALL(uart_lite, axi_addr_t, logic [31:0], logic [3:0]) + `APB_TYPEDEF_ALL(uart_apb, axi_addr_t, logic [31:0], logic [3:0]) + + uart_axi_req_t uart_axi_req; + uart_axi_resp_t uart_axi_resp; + uart_lite_req_t uart_lite_req; + uart_lite_resp_t uart_lite_resp; + uart_apb_req_t uart_apb_req; + uart_apb_resp_t uart_apb_resp; + + assign uart_penable_o = uart_apb_req.penable; + assign uart_pwrite_o = uart_apb_req.pwrite; + assign uart_paddr_o = uart_apb_req.paddr; + assign uart_psel_o = uart_apb_req.psel; + assign uart_pwdata_o = uart_apb_req.pwdata; + assign uart_apb_resp.prdata = uart_prdata_i; + assign uart_apb_resp.pready = uart_pready_i; + assign uart_apb_resp.pslverr = uart_pslverr_i; + + typedef struct packed { + int unsigned idx; + axi_addr_t start_addr; + axi_addr_t end_addr; + } uart_apb_rule_t; + + uart_apb_rule_t uart_apb_map = '{idx: 0, start_addr: '0, end_addr: '1}; + + axi_lite_to_apb #( + .NoApbSlaves (32'd1 ), + .NoRules (32'd1 ), + .AddrWidth (AxiAddrWidth ), + .DataWidth (32'd32 ), + .PipelineRequest (1'b0 ), + .PipelineResponse(1'b0 ), + .axi_lite_req_t (uart_lite_req_t ), + .axi_lite_resp_t (uart_lite_resp_t), + .apb_req_t (uart_apb_req_t ), + .apb_resp_t (uart_apb_resp_t ), + .rule_t (uart_apb_rule_t ) + ) i_axi_lite_to_apb_uart ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .axi_lite_req_i (uart_lite_req ), + .axi_lite_resp_o(uart_lite_resp), + .apb_req_o (uart_apb_req ), + .apb_resp_i (uart_apb_resp ), + .addr_map_i (uart_apb_map ) + ); + + axi_to_axi_lite #( + .AxiAddrWidth (AxiAddrWidth ), + .AxiDataWidth (32'd32 ), + .AxiIdWidth (AxiSocIdWidth ), + .AxiUserWidth (AxiUserWidth ), + .AxiMaxWriteTxns(32'd1 ), + .AxiMaxReadTxns (32'd1 ), + .FallThrough (1'b1 ), + .full_req_t (uart_axi_req_t ), + .full_resp_t (uart_axi_resp_t ), + .lite_req_t (uart_lite_req_t ), + .lite_resp_t (uart_lite_resp_t) + ) i_axi_to_axi_lite_uart ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + 
.test_i (1'b0 ), + .slv_req_i (uart_axi_req ), + .slv_resp_o(uart_axi_resp ), + .mst_req_o (uart_lite_req ), + .mst_resp_i(uart_lite_resp) ); axi_dw_converter #( - .AxiSlvPortDataWidth(AxiWideDataWidth ), - .AxiMstPortDataWidth(AxiNarrowDataWidth ), - .AxiAddrWidth (AxiAddrWidth ), - .AxiIdWidth (AxiSocIdWidth ), - .AxiMaxReads (2 ), - .ar_chan_t (soc_wide_ar_chan_t ), - .mst_r_chan_t (soc_narrow_r_chan_t ), - .slv_r_chan_t (soc_wide_r_chan_t ), - .aw_chan_t (soc_narrow_aw_chan_t ), - .b_chan_t (soc_wide_b_chan_t ), - .mst_w_chan_t (soc_narrow_w_chan_t ), - .slv_w_chan_t (soc_wide_w_chan_t ), - .axi_mst_req_t (soc_narrow_req_t ), - .axi_mst_resp_t (soc_narrow_resp_t ), - .axi_slv_req_t (soc_wide_req_t ), - .axi_slv_resp_t (soc_wide_resp_t ) + .AxiSlvPortDataWidth(AxiWideDataWidth ), + .AxiMstPortDataWidth(32 ), + .AxiAddrWidth (AxiAddrWidth ), + .AxiIdWidth (AxiSocIdWidth ), + .AxiMaxReads (1 ), + .ar_chan_t (soc_wide_ar_chan_t), + .mst_r_chan_t (uart_axi_r_chan_t ), + .slv_r_chan_t (soc_wide_r_chan_t ), + .aw_chan_t (uart_axi_aw_chan_t), + .b_chan_t (soc_wide_b_chan_t ), + .mst_w_chan_t (uart_axi_w_chan_t ), + .slv_w_chan_t (soc_wide_w_chan_t ), + .axi_mst_req_t (uart_axi_req_t ), + .axi_mst_resp_t (uart_axi_resp_t ), + .axi_slv_req_t (soc_wide_req_t ), + .axi_slv_resp_t (soc_wide_resp_t ) ) i_axi_slave_uart_dwc ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .slv_req_i (periph_wide_axi_req[UART] ), - .slv_resp_o(periph_wide_axi_resp[UART] ), - .mst_req_o (periph_narrow_axi_req[UART] ), - .mst_resp_i(periph_narrow_axi_resp[UART]) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (periph_wide_axi_req[UART] ), + .slv_resp_o(periph_wide_axi_resp[UART]), + .mst_req_o (uart_axi_req ), + .mst_resp_i(uart_axi_resp ) ); ///////////////////////// @@ -450,7 +458,7 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( CachedRegionAddrBase : {DRAMBase}, CachedRegionLength : {DRAMLength}, // cache config - Axi64BitCompliant : 1'b1, + AxiCompliant : 1'b1, SwapEndianess : 
1'b0, // debug DmBaseAddress : 64'h0, diff --git a/hardware/src/ara_system.sv b/hardware/src/ara_system.sv index f8c32d44e..c2e7a7c5b 100644 --- a/hardware/src/ara_system.sv +++ b/hardware/src/ara_system.sv @@ -73,13 +73,11 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( // Ara and Ariane // ////////////////////// - import ariane_pkg::accelerator_req_t; - import ariane_pkg::accelerator_resp_t; + import acc_pkg::accelerator_req_t; + import acc_pkg::accelerator_resp_t; // Accelerator ports accelerator_req_t acc_req; - logic acc_req_valid; - logic acc_req_ready; accelerator_resp_t acc_resp; logic acc_resp_valid; logic acc_resp_ready; @@ -98,15 +96,23 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( .clk_i (clk_i ), .rst_ni (rst_ni ), .acc_req_o (acc_req ), - .acc_req_valid_o (acc_req_valid ), - .acc_req_ready_i (acc_req_ready ), .acc_resp_i (acc_resp ), .acc_resp_valid_i (acc_resp_valid ), .acc_resp_ready_o (acc_resp_ready ) ); `else - ariane #( - .ArianeCfg(ArianeCfg) + cva6 #( + .ArianeCfg(ArianeCfg), + .cvxif_req_t (acc_pkg::accelerator_req_t), + .cvxif_resp_t (acc_pkg::accelerator_resp_t), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiNarrowDataWidth ), + .AxiIdWidth ( AxiIdWidth ), + .axi_ar_chan_t (ariane_axi_ar_t), + .axi_aw_chan_t (ariane_axi_aw_t), + .axi_w_chan_t (ariane_axi_w_t), + .axi_req_t (ariane_axi_req_t), + .axi_rsp_t (ariane_axi_resp_t) ) i_ariane ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -116,19 +122,20 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( .ipi_i ('0 ), .time_irq_i ('0 ), .debug_req_i ('0 ), - .axi_req_o (ariane_narrow_axi_req ), - .axi_resp_i (ariane_narrow_axi_resp), - // Accelerator ports - .acc_req_o (acc_req ), - .acc_req_valid_o (acc_req_valid ), - .acc_req_ready_i (acc_req_ready ), - .acc_resp_i (acc_resp ), - .acc_resp_valid_i (acc_resp_valid ), - .acc_resp_ready_o (acc_resp_ready ), + // Invalidation requests .acc_cons_en_o (acc_cons_en ), .inval_addr_i (inval_addr ), 
.inval_valid_i (inval_valid ), - .inval_ready_o (inval_ready ) + .inval_ready_o (inval_ready ), + .rvfi_o ( ), + // Accelerator ports + .cvxif_req_o (acc_req ), + .cvxif_resp_i (acc_resp ), + .l15_req_o ( ), + .l15_rtrn_i ( '0 ), + // Memory interface + .axi_req_o (ariane_narrow_axi_req ), + .axi_resp_i (ariane_narrow_axi_resp) ); `endif @@ -211,11 +218,7 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( .scan_data_i (1'b0 ), .scan_data_o (/* Unused */ ), .acc_req_i (acc_req ), - .acc_req_valid_i (acc_req_valid ), - .acc_req_ready_o (acc_req_ready ), .acc_resp_o (acc_resp ), - .acc_resp_valid_o(acc_resp_valid), - .acc_resp_ready_i(acc_resp_ready), .axi_req_o (ara_axi_req ), .axi_resp_i (ara_axi_resp ) ); diff --git a/hardware/src/axi_to_mem.sv b/hardware/src/axi_to_mem.sv deleted file mode 100644 index 7a3db70de..000000000 --- a/hardware/src/axi_to_mem.sv +++ /dev/null @@ -1,691 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Authors: -// - Andreas Kurth -// - Wolfgang Roenninger - -`include "common_cells/registers.svh" -/// AXI4+ATOP slave module which translates AXI bursts into a memory stream. -/// If both read and write channels of the AXI4+ATOP are active, both will have an -/// utilization of 50%. -module axi_to_mem #( - /// AXI4+ATOP request type. See `include/axi/typedef.svh`. - parameter type axi_req_t = logic, - /// AXI4+ATOP response type. See `include/axi/typedef.svh`. - parameter type axi_resp_t = logic, - /// Address width, has to be less or equal than the width off the AXI address field. - /// Determines the width of `mem_addr_o`. Has to be wide enough to emit the memory region - /// which should be accessible. - parameter int unsigned AddrWidth = 0, - /// AXI4+ATOP data width. - parameter int unsigned DataWidth = 0, - /// AXI4+ATOP ID width. 
- parameter int unsigned IdWidth = 0, - /// Number of banks at output, must evenly divide `DataWidth`. - parameter int unsigned NumBanks = 0, - /// Depth of memory response buffer. This should be equal to the memory response latency. - parameter int unsigned BufDepth = 1, - /// Dependent parameter, do not override. Memory address type. - localparam type addr_t = logic [AddrWidth-1:0], - /// Dependent parameter, do not override. Memory data type. - localparam type mem_data_t = logic [DataWidth/NumBanks-1:0], - /// Dependent parameter, do not override. Memory write strobe type. - localparam type mem_strb_t = logic [DataWidth/NumBanks/8-1:0] -) ( - /// Clock input. - input logic clk_i, - /// Asynchronous reset, active low. - input logic rst_ni, - /// The unit is busy handling an AXI4+ATOP request. - output logic busy_o, - /// AXI4+ATOP slave port, request input. - input axi_req_t axi_req_i, - /// AXI4+ATOP slave port, response output. - output axi_resp_t axi_resp_o, - /// Memory stream master, request is valid for this bank. - output logic [NumBanks-1:0] mem_req_o, - /// Memory stream master, request can be granted by this bank. - input logic [NumBanks-1:0] mem_gnt_i, - /// Memory stream master, byte address of the request. - output addr_t [NumBanks-1:0] mem_addr_o, - /// Memory stream master, write data for this bank. Valid when `mem_req_o`. - output mem_data_t [NumBanks-1:0] mem_wdata_o, - /// Memory stream master, byte-wise strobe (byte enable). - output mem_strb_t [NumBanks-1:0] mem_strb_o, - /// Memory stream master, `axi_pkg::atop_t` signal associated with this request. - output axi_pkg::atop_t [NumBanks-1:0] mem_atop_o, - /// Memory stream master, write enable. Then asserted store of `mem_w_data` is requested. - output logic [NumBanks-1:0] mem_we_o, - /// Memory stream master, response is valid. This module expects always a response valid for a - /// request regardless if the request was a write or a read. 
- input logic [NumBanks-1:0] mem_rvalid_i, - /// Memory stream master, read response data. - input mem_data_t [NumBanks-1:0] mem_rdata_i -); - - typedef logic [DataWidth-1:0] axi_data_t; - typedef logic [DataWidth/8-1:0] axi_strb_t; - typedef logic [IdWidth-1:0] axi_id_t; - - typedef struct packed { - addr_t addr; - axi_pkg::atop_t atop; - axi_strb_t strb; - axi_data_t wdata; - logic we; - } mem_req_t; - - typedef struct packed { - addr_t addr; - axi_pkg::atop_t atop; - axi_id_t id; - logic last; - axi_pkg::qos_t qos; - axi_pkg::size_t size; - logic write; - } meta_t; - - axi_data_t mem_rdata, - m2s_resp; - axi_pkg::len_t r_cnt_d, r_cnt_q, - w_cnt_d, w_cnt_q; - logic arb_valid, arb_ready, - rd_valid, rd_ready, - wr_valid, wr_ready, - sel_b, sel_buf_b, - sel_r, sel_buf_r, - sel_valid, sel_ready, - sel_buf_valid, sel_buf_ready, - sel_lock_d, sel_lock_q, - meta_valid, meta_ready, - meta_buf_valid, meta_buf_ready, - meta_sel_d, meta_sel_q, - m2s_req_valid, m2s_req_ready, - m2s_resp_valid, m2s_resp_ready, - mem_req_valid, mem_req_ready, - mem_rvalid; - mem_req_t m2s_req, - mem_req; - meta_t rd_meta, - rd_meta_d, rd_meta_q, - wr_meta, - wr_meta_d, wr_meta_q, - meta, meta_buf; - - assign busy_o = axi_req_i.aw_valid | axi_req_i.ar_valid | axi_req_i.w_valid | - axi_resp_o.b_valid | axi_resp_o.r_valid | - (r_cnt_q > 0) | (w_cnt_q > 0); - - // Handle reads. - always_comb begin - // Default assignments - axi_resp_o.ar_ready = 1'b0; - rd_meta_d = rd_meta_q; - rd_meta = meta_t'{default: '0}; - rd_valid = 1'b0; - r_cnt_d = r_cnt_q; - // Handle R burst in progress. - if (r_cnt_q > '0) begin - rd_meta_d.last = (r_cnt_q == 8'd1); - rd_meta = rd_meta_d; - rd_meta.addr = rd_meta_q.addr + axi_pkg::num_bytes(rd_meta_q.size); - rd_valid = 1'b1; - if (rd_ready) begin - r_cnt_d--; - rd_meta_d.addr = rd_meta.addr; - end - // Handle new AR if there is one. 
- end else if (axi_req_i.ar_valid) begin - rd_meta_d = '{ - addr: addr_t'(axi_pkg::aligned_addr(axi_req_i.ar.addr, axi_req_i.ar.size)), - atop: '0, - id: axi_req_i.ar.id, - last: (axi_req_i.ar.len == '0), - qos: axi_req_i.ar.qos, - size: axi_req_i.ar.size, - write: 1'b0 - }; - rd_meta = rd_meta_d; - rd_meta.addr = addr_t'(axi_req_i.ar.addr); - rd_valid = 1'b1; - if (rd_ready) begin - r_cnt_d = axi_req_i.ar.len; - axi_resp_o.ar_ready = 1'b1; - end - end - end - - // Handle writes. - always_comb begin - // Default assignments - axi_resp_o.aw_ready = 1'b0; - axi_resp_o.w_ready = 1'b0; - wr_meta_d = wr_meta_q; - wr_meta = meta_t'{default: '0}; - wr_valid = 1'b0; - w_cnt_d = w_cnt_q; - // Handle W bursts in progress. - if (w_cnt_q > '0) begin - wr_meta_d.last = (w_cnt_q == 8'd1); - wr_meta = wr_meta_d; - wr_meta.addr = wr_meta_q.addr + axi_pkg::num_bytes(wr_meta_q.size); - if (axi_req_i.w_valid) begin - wr_valid = 1'b1; - if (wr_ready) begin - axi_resp_o.w_ready = 1'b1; - w_cnt_d--; - wr_meta_d.addr = wr_meta.addr; - end - end - // Handle new AW if there is one. - end else if (axi_req_i.aw_valid && axi_req_i.w_valid) begin - wr_meta_d = '{ - addr: addr_t'(axi_pkg::aligned_addr(axi_req_i.aw.addr, axi_req_i.aw.size)), - atop: axi_req_i.aw.atop, - id: axi_req_i.aw.id, - last: (axi_req_i.aw.len == '0), - qos: axi_req_i.aw.qos, - size: axi_req_i.aw.size, - write: 1'b1 - }; - wr_meta = wr_meta_d; - wr_meta.addr = addr_t'(axi_req_i.aw.addr); - wr_valid = 1'b1; - if (wr_ready) begin - w_cnt_d = axi_req_i.aw.len; - axi_resp_o.aw_ready = 1'b1; - axi_resp_o.w_ready = 1'b1; - end - end - end - - // Arbitrate between reads and writes. 
- stream_mux #( - .DATA_T ( meta_t ), - .N_INP ( 32'd2 ) - ) i_ax_mux ( - .inp_data_i ({wr_meta, rd_meta }), - .inp_valid_i ({wr_valid, rd_valid}), - .inp_ready_o ({wr_ready, rd_ready}), - .inp_sel_i ( meta_sel_d ), - .oup_data_o ( meta ), - .oup_valid_o ( arb_valid ), - .oup_ready_i ( arb_ready ) - ); - always_comb begin - meta_sel_d = meta_sel_q; - sel_lock_d = sel_lock_q; - if (sel_lock_q) begin - meta_sel_d = meta_sel_q; - if (arb_valid && arb_ready) begin - sel_lock_d = 1'b0; - end - end else begin - if (wr_valid ^ rd_valid) begin - // If either write or read is valid but not both, select the valid one. - meta_sel_d = wr_valid; - end else if (wr_valid && rd_valid) begin - // If both write and read are valid, decide according to QoS then burst properties. - // Prioritize higher QoS. - if (wr_meta.qos > rd_meta.qos) begin - meta_sel_d = 1'b1; - end else if (rd_meta.qos > wr_meta.qos) begin - meta_sel_d = 1'b0; - // Decide requests with identical QoS. - end else if (wr_meta.qos == rd_meta.qos) begin - // 1. Prioritize individual writes over read bursts. - // Rationale: Read bursts can be interleaved on AXI but write bursts cannot. - if (wr_meta.last && !rd_meta.last) begin - meta_sel_d = 1'b1; - // 2. Prioritize ongoing burst. - // Rationale: Stalled bursts create back-pressure or require costly buffers. - end else if (w_cnt_q > '0) begin - meta_sel_d = 1'b1; - end else if (r_cnt_q > '0) begin - meta_sel_d = 1'b0; - // 3. Otherwise arbitrate round robin to prevent starvation. - end else begin - meta_sel_d = ~meta_sel_q; - end - end - end - // Lock arbitration if valid but not yet ready. - if (arb_valid && !arb_ready) begin - sel_lock_d = 1'b1; - end - end - end - - // Fork arbitrated stream to meta data, memory requests, and R/B channel selection. 
- stream_fork #( - .N_OUP ( 32'd3 ) - ) i_fork ( - .clk_i, - .rst_ni, - .valid_i ( arb_valid ), - .ready_o ( arb_ready ), - .valid_o ({sel_valid, meta_valid, m2s_req_valid}), - .ready_i ({sel_ready, meta_ready, m2s_req_ready}) - ); - - assign sel_b = meta.write & meta.last; - assign sel_r = ~meta.write | meta.atop[5]; - - stream_fifo #( - .FALL_THROUGH ( 1'b1 ), - .DEPTH ( 32'd1 + BufDepth ), - .T ( logic[1:0] ) - ) i_sel_buf ( - .clk_i, - .rst_ni, - .flush_i ( 1'b0 ), - .testmode_i ( 1'b0 ), - .data_i ({sel_b, sel_r }), - .valid_i ( sel_valid ), - .ready_o ( sel_ready ), - .data_o ({sel_buf_b, sel_buf_r}), - .valid_o ( sel_buf_valid ), - .ready_i ( sel_buf_ready ), - .usage_o ( /* unused */ ) - ); - - stream_fifo #( - .FALL_THROUGH ( 1'b1 ), - .DEPTH ( 32'd1 + BufDepth ), - .T ( meta_t ) - ) i_meta_buf ( - .clk_i, - .rst_ni, - .flush_i ( 1'b0 ), - .testmode_i ( 1'b0 ), - .data_i ( meta ), - .valid_i ( meta_valid ), - .ready_o ( meta_ready ), - .data_o ( meta_buf ), - .valid_o ( meta_buf_valid ), - .ready_i ( meta_buf_ready ), - .usage_o ( /* unused */ ) - ); - - // Assemble the actual memory request from meta information and write data. - assign m2s_req = mem_req_t'{ - addr: meta.addr, - atop: meta.atop, - strb: axi_req_i.w.strb, - wdata: axi_req_i.w.data, - we: meta.write - }; - - // Interface memory as stream. - stream_to_mem #( - .mem_req_t ( mem_req_t ), - .mem_resp_t ( axi_data_t ), - .BufDepth ( BufDepth ) - ) i_stream_to_mem ( - .clk_i, - .rst_ni, - .req_i ( m2s_req ), - .req_valid_i ( m2s_req_valid ), - .req_ready_o ( m2s_req_ready ), - .resp_o ( m2s_resp ), - .resp_valid_o ( m2s_resp_valid ), - .resp_ready_i ( m2s_resp_ready ), - .mem_req_o ( mem_req ), - .mem_req_valid_o ( mem_req_valid ), - .mem_req_ready_i ( mem_req_ready ), - .mem_resp_i ( mem_rdata ), - .mem_resp_valid_i ( mem_rvalid ) - ); - - // Split single memory request to desired number of banks. 
- mem_to_banks #( - .AddrWidth ( AddrWidth ), - .DataWidth ( DataWidth ), - .NumBanks ( NumBanks ) - ) i_mem_to_banks ( - .clk_i, - .rst_ni, - .req_i ( mem_req_valid ), - .gnt_o ( mem_req_ready ), - .addr_i ( mem_req.addr ), - .wdata_i ( mem_req.wdata ), - .strb_i ( mem_req.strb ), - .atop_i ( mem_req.atop ), - .we_i ( mem_req.we ), - .rvalid_o ( mem_rvalid ), - .rdata_o ( mem_rdata ), - .bank_req_o ( mem_req_o ), - .bank_gnt_i ( mem_gnt_i ), - .bank_addr_o ( mem_addr_o ), - .bank_wdata_o ( mem_wdata_o ), - .bank_strb_o ( mem_strb_o ), - .bank_atop_o ( mem_atop_o ), - .bank_we_o ( mem_we_o ), - .bank_rvalid_i ( mem_rvalid_i ), - .bank_rdata_i ( mem_rdata_i ) - ); - - // Join memory read data and meta data stream. - logic mem_join_valid, mem_join_ready; - stream_join #( - .N_INP ( 32'd2 ) - ) i_join ( - .inp_valid_i ({m2s_resp_valid, meta_buf_valid}), - .inp_ready_o ({m2s_resp_ready, meta_buf_ready}), - .oup_valid_o ( mem_join_valid ), - .oup_ready_i ( mem_join_ready ) - ); - - // Dynamically fork the joined stream to B and R channels. - stream_fork_dynamic #( - .N_OUP ( 32'd2 ) - ) i_fork_dynamic ( - .clk_i, - .rst_ni, - .valid_i ( mem_join_valid ), - .ready_o ( mem_join_ready ), - .sel_i ({sel_buf_b, sel_buf_r }), - .sel_valid_i ( sel_buf_valid ), - .sel_ready_o ( sel_buf_ready ), - .valid_o ({axi_resp_o.b_valid, axi_resp_o.r_valid}), - .ready_i ({axi_req_i.b_ready, axi_req_i.r_ready }) - ); - - // Compose B responses. - assign axi_resp_o.b = '{ - id: meta_buf.id, - resp: axi_pkg::RESP_OKAY, - user: '0 - }; - - // Compose R responses. 
- assign axi_resp_o.r = '{ - data: m2s_resp, - id: meta_buf.id, - last: meta_buf.last, - resp: axi_pkg::RESP_OKAY, - user: '0 - }; - - // Registers - `FFARN(meta_sel_q, meta_sel_d, 1'b0, clk_i, rst_ni) - `FFARN(sel_lock_q, sel_lock_d, 1'b0, clk_i, rst_ni) - `FFARN(rd_meta_q, rd_meta_d, meta_t'{default: '0}, clk_i, rst_ni) - `FFARN(wr_meta_q, wr_meta_d, meta_t'{default: '0}, clk_i, rst_ni) - `FFARN(r_cnt_q, r_cnt_d, '0, clk_i, rst_ni) - `FFARN(w_cnt_q, w_cnt_d, '0, clk_i, rst_ni) - - // Assertions - // pragma translate_off - `ifndef VERILATOR - default disable iff (!rst_ni); - assume property (@(posedge clk_i) - axi_req_i.ar_valid && !axi_resp_o.ar_ready |=> $stable(axi_req_i.ar)) - else $error("AR must remain stable until handshake has happened!"); - assert property (@(posedge clk_i) - axi_resp_o.r_valid && !axi_req_i.r_ready |=> $stable(axi_resp_o.r)) - else $error("R must remain stable until handshake has happened!"); - assume property (@(posedge clk_i) - axi_req_i.aw_valid && !axi_resp_o.aw_ready |=> $stable(axi_req_i.aw)) - else $error("AW must remain stable until handshake has happened!"); - assume property (@(posedge clk_i) - axi_req_i.w_valid && !axi_resp_o.w_ready |=> $stable(axi_req_i.w)) - else $error("W must remain stable until handshake has happened!"); - assert property (@(posedge clk_i) - axi_resp_o.b_valid && !axi_req_i.b_ready |=> $stable(axi_resp_o.b)) - else $error("B must remain stable until handshake has happened!"); - assert property (@(posedge clk_i) axi_req_i.ar_valid && axi_req_i.ar.len > 0 |-> - axi_req_i.ar.burst == axi_pkg::BURST_INCR) - else $error("Non-incrementing bursts are not supported!"); - assert property (@(posedge clk_i) axi_req_i.aw_valid && axi_req_i.aw.len > 0 |-> - axi_req_i.aw.burst == axi_pkg::BURST_INCR) - else $error("Non-incrementing bursts are not supported!"); - assert property (@(posedge clk_i) meta_valid && meta.atop != '0 |-> meta.write) - else $warning("Unexpected atomic operation on read."); - `endif - // pragma 
translate_on -endmodule - - -`include "axi/assign.svh" -`include "axi/typedef.svh" -/// Interface wrapper for module `axi_to_mem`. -module axi_to_mem_intf #( - /// See `axi_to_mem`, parameter `AddrWidth`. - parameter int unsigned ADDR_WIDTH = 32'd0, - /// See `axi_to_mem`, parameter `DataWidth`. - parameter int unsigned DATA_WIDTH = 32'd0, - /// AXI4+ATOP ID width. - parameter int unsigned ID_WIDTH = 32'd0, - /// AXI4+ATOP user width. - parameter int unsigned USER_WIDTH = 32'd0, - /// See `axi_to_mem`, parameter `NumBanks`. - parameter int unsigned NUM_BANKS = 32'd0, - /// See `axi_to_mem`, parameter `BufDepth`. - parameter int unsigned BUF_DEPTH = 32'd1, - /// Dependent parameter, do not override. See `axi_to_mem`, parameter `addr_t`. - localparam type addr_t = logic [ADDR_WIDTH-1:0], - /// Dependent parameter, do not override. See `axi_to_mem`, parameter `mem_data_t`. - localparam type mem_data_t = logic [DATA_WIDTH/NUM_BANKS-1:0], - /// Dependent parameter, do not override. See `axi_to_mem`, parameter `mem_strb_t`. - localparam type mem_strb_t = logic [DATA_WIDTH/NUM_BANKS/8-1:0] -) ( - /// Clock input. - input logic clk_i, - /// Asynchronous reset, active low. - input logic rst_ni, - /// See `axi_to_mem`, port `busy_o`. - output logic busy_o, - /// AXI4+ATOP slave interface port. - AXI_BUS.Slave slv, - /// See `axi_to_mem`, port `mem_req_o`. - output logic [NUM_BANKS-1:0] mem_req_o, - /// See `axi_to_mem`, port `mem_gnt_i`. - input logic [NUM_BANKS-1:0] mem_gnt_i, - /// See `axi_to_mem`, port `mem_addr_o`. - output addr_t [NUM_BANKS-1:0] mem_addr_o, - /// See `axi_to_mem`, port `mem_wdata_o`. - output mem_data_t [NUM_BANKS-1:0] mem_wdata_o, - /// See `axi_to_mem`, port `mem_strb_o`. - output mem_strb_t [NUM_BANKS-1:0] mem_strb_o, - /// See `axi_to_mem`, port `mem_atop_o`. - output axi_pkg::atop_t [NUM_BANKS-1:0] mem_atop_o, - /// See `axi_to_mem`, port `mem_we_o`. - output logic [NUM_BANKS-1:0] mem_we_o, - /// See `axi_to_mem`, port `mem_rvalid_i`. 
- input logic [NUM_BANKS-1:0] mem_rvalid_i, - /// See `axi_to_mem`, port `mem_rdata_i`. - input mem_data_t [NUM_BANKS-1:0] mem_rdata_i -); - typedef logic [ID_WIDTH-1:0] id_t; - typedef logic [DATA_WIDTH-1:0] data_t; - typedef logic [DATA_WIDTH/8-1:0] strb_t; - typedef logic [USER_WIDTH-1:0] user_t; - `AXI_TYPEDEF_AW_CHAN_T(aw_chan_t, addr_t, id_t, user_t) - `AXI_TYPEDEF_W_CHAN_T(w_chan_t, data_t, strb_t, user_t) - `AXI_TYPEDEF_B_CHAN_T(b_chan_t, id_t, user_t) - `AXI_TYPEDEF_AR_CHAN_T(ar_chan_t, addr_t, id_t, user_t) - `AXI_TYPEDEF_R_CHAN_T(r_chan_t, data_t, id_t, user_t) - `AXI_TYPEDEF_REQ_T(req_t, aw_chan_t, w_chan_t, ar_chan_t) - `AXI_TYPEDEF_RESP_T(resp_t, b_chan_t, r_chan_t) - req_t req; - resp_t resp; - `AXI_ASSIGN_TO_REQ(req, slv) - `AXI_ASSIGN_FROM_RESP(slv, resp) - axi_to_mem #( - .axi_req_t ( req_t ), - .axi_resp_t ( resp_t ), - .AddrWidth ( ADDR_WIDTH ), - .DataWidth ( DATA_WIDTH ), - .IdWidth ( ID_WIDTH ), - .NumBanks ( NUM_BANKS ), - .BufDepth ( BUF_DEPTH ) - ) i_axi_to_mem ( - .clk_i, - .rst_ni, - .busy_o, - .axi_req_i ( req ), - .axi_resp_o ( resp ), - .mem_req_o, - .mem_gnt_i, - .mem_addr_o, - .mem_wdata_o, - .mem_strb_o, - .mem_atop_o, - .mem_we_o, - .mem_rvalid_i, - .mem_rdata_i - ); -endmodule - -/// Split memory access over multiple parallel banks, where each bank has its own req/gnt -/// request and valid response direction. -module mem_to_banks #( - /// Input address width. - parameter int unsigned AddrWidth = 32'd0, - /// Input data width, must be a power of two. - parameter int unsigned DataWidth = 32'd0, - /// Number of banks at output, must evenly divide `DataWidth`. - parameter int unsigned NumBanks = 32'd0, - /// Dependent parameter, do not override! Address type. - localparam type addr_t = logic [AddrWidth-1:0], - /// Dependent parameter, do not override! Input data type. - localparam type inp_data_t = logic [DataWidth-1:0], - /// Dependent parameter, do not override! Input write strobe type. 
- localparam type inp_strb_t = logic [DataWidth/8-1:0], - /// Dependent parameter, do not override! Output data type. - localparam type oup_data_t = logic [DataWidth/NumBanks-1:0], - /// Dependent parameter, do not override! Output write strobe type. - localparam type oup_strb_t = logic [DataWidth/NumBanks/8-1:0] -) ( - /// Clock input. - input logic clk_i, - /// Asynchronous reset, active low. - input logic rst_ni, - /// Memory request to split, request is valid. - input logic req_i, - /// Memory request to split, request can be granted. - output logic gnt_o, - /// Memory request to split, request address, byte-wise. - input addr_t addr_i, - /// Memory request to split, request write data. - input inp_data_t wdata_i, - /// Memory request to split, request write strobe. - input inp_strb_t strb_i, - /// Memory request to split, request Atomic signal from AXI4+ATOP. - input axi_pkg::atop_t atop_i, - /// Memory request to split, request write enable, active high. - input logic we_i, - /// Memory request to split, response is valid. Required for read and write requests - output logic rvalid_o, - /// Memory request to split, response read data. - output inp_data_t rdata_o, - /// Memory bank request, request is valid. - output logic [NumBanks-1:0] bank_req_o, - /// Memory bank request, request can be granted. - input logic [NumBanks-1:0] bank_gnt_i, - /// Memory bank request, request address, byte-wise. Will be different for each bank. - output addr_t [NumBanks-1:0] bank_addr_o, - /// Memory bank request, request write data. - output oup_data_t [NumBanks-1:0] bank_wdata_o, - /// Memory bank request, request write strobe. - output oup_strb_t [NumBanks-1:0] bank_strb_o, - /// Memory bank request, request Atomic signal from AXI4+ATOP. - output axi_pkg::atop_t [NumBanks-1:0] bank_atop_o, - /// Memory bank request, request write enable, active high. - output logic [NumBanks-1:0] bank_we_o, - /// Memory bank request, response is valid. 
Required for read and write requests - input logic [NumBanks-1:0] bank_rvalid_i, - /// Memory bank request, response read data. - input oup_data_t [NumBanks-1:0] bank_rdata_i -); - - localparam DataBytes = $bits(inp_strb_t); - localparam BitsPerBank = $bits(oup_data_t); - localparam BytesPerBank = $bits(oup_strb_t); - - typedef struct packed { - addr_t addr; - oup_data_t wdata; - oup_strb_t strb; - axi_pkg::atop_t atop; - logic we; - } req_t; - - logic req_valid; - logic [NumBanks-1:0] req_ready, - resp_valid, resp_ready; - req_t [NumBanks-1:0] bank_req, - bank_oup; - - function automatic addr_t align_addr(input addr_t addr); - return (addr >> $clog2(DataBytes)) << $clog2(DataBytes); - endfunction - - // Handle requests. - assign req_valid = req_i & gnt_o; - for (genvar i = 0; unsigned'(i) < NumBanks; i++) begin : gen_reqs - assign bank_req[i].addr = align_addr(addr_i) + i * BytesPerBank; - assign bank_req[i].wdata = wdata_i[i*BitsPerBank+:BitsPerBank]; - assign bank_req[i].strb = strb_i[i*BytesPerBank+:BytesPerBank]; - assign bank_req[i].atop = atop_i; - assign bank_req[i].we = we_i; - fall_through_register #( - .T ( req_t ) - ) i_ft_reg ( - .clk_i, - .rst_ni, - .clr_i ( 1'b0 ), - .testmode_i ( 1'b0 ), - .valid_i ( req_valid ), - .ready_o ( req_ready[i] ), - .data_i ( bank_req[i] ), - .valid_o ( bank_req_o[i] ), - .ready_i ( bank_gnt_i[i] ), - .data_o ( bank_oup[i] ) - ); - assign bank_addr_o[i] = bank_oup[i].addr; - assign bank_wdata_o[i] = bank_oup[i].wdata; - assign bank_strb_o[i] = bank_oup[i].strb; - assign bank_atop_o[i] = bank_oup[i].atop; - assign bank_we_o[i] = bank_oup[i].we; - end - - // Grant output if all our requests have been granted. - assign gnt_o = (&req_ready) & (&resp_ready); - - // Handle responses. 
- for (genvar i = 0; unsigned'(i) < NumBanks; i++) begin : gen_resp_regs - fall_through_register #( - .T ( oup_data_t ) - ) i_ft_reg ( - .clk_i, - .rst_ni, - .clr_i ( 1'b0 ), - .testmode_i ( 1'b0 ), - .valid_i ( bank_rvalid_i[i] ), - .ready_o ( resp_ready[i] ), - .data_i ( bank_rdata_i[i] ), - .data_o ( rdata_o[i*BitsPerBank+:BitsPerBank] ), - .ready_i ( rvalid_o ), - .valid_o ( resp_valid[i] ) - ); - end - assign rvalid_o = &resp_valid; - - // Assertions - // pragma translate_off - `ifndef VERILATOR - initial begin - assume (DataWidth != 0 && (DataWidth & (DataWidth - 1)) == 0) - else $fatal(1, "Data width must be a power of two!"); - assume (DataWidth % NumBanks == 0) - else $fatal(1, "Data width must be evenly divisible over banks!"); - assume ((DataWidth / NumBanks) % 8 == 0) - else $fatal(1, "Data width of each bank must be divisible into 8-bit bytes!"); - end - `endif - // pragma translate_on -endmodule diff --git a/hardware/src/cva6_accel_first_pass_decoder.sv b/hardware/src/cva6_accel_first_pass_decoder.sv index 0519d58b7..74c7e14e2 100644 --- a/hardware/src/cva6_accel_first_pass_decoder.sv +++ b/hardware/src/cva6_accel_first_pass_decoder.sv @@ -7,36 +7,49 @@ // instruction, whether it reads scalar registers, and whether // it writes to a destination scalar register -module cva6_accel_first_pass_decoder import rvv_pkg::*; ( - input logic [31:0] instruction_i, // instruction from IF - output logic is_accel_o, // is a vector instruction - output logic is_rs1_o, - output logic is_rs2_o, - output logic is_rd_o, - output logic is_fs1_o, - output logic is_fs2_o, - output logic is_fd_o, - output logic is_vfp_o, // is a vector floating-point instruction - output logic is_load_o, - output logic is_store_o +module cva6_accel_first_pass_decoder import rvv_pkg::*; import ariane_pkg::*; ( + input logic [31:0] instruction_i, // instruction from IF + input riscv::xs_t fs_i, // floating point extension status + input riscv::xs_t vs_i, // vector extension status + output 
logic is_accel_o, // is a vector instruction + output scoreboard_entry_t instruction_o, // predecoded instruction + output logic illegal_instr_o, // is an illegal instruction + output logic is_control_flow_instr_o ); + logic is_rs1; + logic is_rs2; + logic is_rd; + logic is_fs1; + logic is_fs2; + logic is_fd; + logic is_vfp; // is a vector floating-point instruction + logic is_load; + logic is_store; + // Cast instruction into the `rvv_instruction_t` struct rvv_instruction_t instr; assign instr = rvv_instruction_t'(instruction_i); + // Cast instruction into scalar `instruction_t` struct + riscv::instruction_t instr_scalar; + assign instr_scalar = riscv::instruction_t'(instruction_i); + + // Vector instructions never change control flow + assign is_control_flow_instr_o = 1'b0; + always_comb begin // Default values is_accel_o = 1'b0; - is_rs1_o = 1'b0; - is_rs2_o = 1'b0; - is_rd_o = 1'b0; - is_fs1_o = 1'b0; - is_fs2_o = 1'b0; - is_fd_o = 1'b0; - is_vfp_o = 1'b0; - is_load_o = instr.i_type.opcode == riscv::OpcodeLoadFp; - is_store_o = instr.i_type.opcode == riscv::OpcodeStoreFp; + is_rs1 = 1'b0; + is_rs2 = 1'b0; + is_rd = 1'b0; + is_fs1 = 1'b0; + is_fs2 = 1'b0; + is_fd = 1'b0; + is_vfp = 1'b0; + is_load = instr.i_type.opcode == riscv::OpcodeLoadFp; + is_store = instr.i_type.opcode == riscv::OpcodeStoreFp; // Decode based on the opcode case (instr.i_type.opcode) @@ -46,20 +59,20 @@ module cva6_accel_first_pass_decoder import rvv_pkg::*; ( is_accel_o = 1'b1; case (instr.varith_type.func3) OPFVV: begin - is_fd_o = instr.varith_type.func6 == 6'b010_000; // VFWUNARY0 - is_vfp_o = 1'b1; + is_fd = instr.varith_type.func6 == 6'b010_000; // VFWUNARY0 + is_vfp = 1'b1; end - OPMVV: is_rd_o = instr.varith_type.func6 == 6'b010_000; // VWXUNARY0 - OPIVX: is_rs1_o = 1'b1 ; + OPMVV: is_rd = instr.varith_type.func6 == 6'b010_000; // VWXUNARY0 + OPIVX: is_rs1 = 1'b1 ; OPFVF: begin - is_fs1_o = 1'b1; - is_vfp_o = 1'b1; + is_fs1 = 1'b1; + is_vfp = 1'b1; end - OPMVX: is_rs1_o = 1'b1 ; + 
OPMVX: is_rs1 = 1'b1 ; OPCFG: begin - is_rs1_o = instr.vsetivli_type.func2 != 2'b11; // not vsetivli - is_rs2_o = instr.vsetvl_type.func7 == 7'b100_0000; // vsetvl - is_rd_o = 1'b1 ; + is_rs1 = instr.vsetivli_type.func2 != 2'b11; // not vsetivli + is_rs2 = instr.vsetvl_type.func7 == 7'b100_0000; // vsetvl + is_rd = 1'b1 ; end endcase end @@ -77,8 +90,8 @@ module cva6_accel_first_pass_decoder import rvv_pkg::*; ( 4'b1110, //VLxE512/VSxE512 4'b1111: begin //VLxE1024/VSxE1024 is_accel_o = 1'b1 ; - is_rs1_o = 1'b1 ; - is_rs2_o = instr.vmem_type.mop == 2'b10; // Strided operation + is_rs1 = 1'b1 ; + is_rs2 = instr.vmem_type.mop == 2'b10; // Strided operation end endcase end @@ -91,7 +104,7 @@ module cva6_accel_first_pass_decoder import rvv_pkg::*; ( 3'b110, //VAMO*EI32.V 3'b111: begin //VAMO*EI64.V is_accel_o = 1'b1; - is_rs1_o = 1'b1; + is_rs1 = 1'b1; end endcase end @@ -106,13 +119,44 @@ module cva6_accel_first_pass_decoder import rvv_pkg::*; ( 3'b110, //CSRRSI 3'b111: begin //CSRRCI is_accel_o = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); - is_rs1_o = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); - is_rs2_o = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); - is_rd_o = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); + is_rs1 = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); + is_rs2 = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); + is_rd = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); end endcase end endcase end + always_comb begin + instruction_o = '0; + illegal_instr_o = 1'b1; + + if (is_accel_o && vs_i != riscv::Off) begin // trigger illegal instruction if the vector extension is turned off + // TODO: Instruction going to other accelerators might need to distinguish whether the value of vs_i is needed or not. + // Send accelerator instructions to the coprocessor + instruction_o.fu = ACCEL; + instruction_o.vfp = is_vfp; + instruction_o.rs1 = (is_rs1 || is_fs1) ? 
instr_scalar.rtype.rs1 : {REG_ADDR_SIZE{1'b0}}; + instruction_o.rs2 = (is_rs2 || is_fs2) ? instr_scalar.rtype.rs2 : {REG_ADDR_SIZE{1'b0}}; + instruction_o.rd = (is_rd || is_fd) ? instr_scalar.rtype.rd : {REG_ADDR_SIZE{1'b0}}; + + // Decode the vector operation + unique case ({is_store, is_load, is_fs1, is_fs2, is_fd}) + 5'b10000: instruction_o.op = ACCEL_OP_STORE; + 5'b01000: instruction_o.op = ACCEL_OP_LOAD; + 5'b00100: instruction_o.op = ACCEL_OP_FS1; + 5'b00001: instruction_o.op = ACCEL_OP_FD; + 5'b00000: instruction_o.op = ACCEL_OP; + endcase + + // Check that mstatus.FS is not OFF if we have a FP instruction for the accelerator + illegal_instr_o = (is_vfp && (fs_i == riscv::Off)) ? 1'b1 : 1'b0; + + // result holds the undecoded instruction + instruction_o.result = { {riscv::XLEN-32{1'b0}}, instruction_i[31:0] }; + instruction_o.use_imm = 1'b0; + end + end + endmodule : cva6_accel_first_pass_decoder diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 386b9823c..ba82f8922 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -113,7 +113,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // bits that indicate whether there is a hazard between different vector // instructions. Such hazards must be continuously cleared based on the // value of the currently running loops from the main sequencer. 
- operand_request_cmd_t [NrOperandQueues-1:0] operand_request_i; + operand_request_cmd_t [NrOperandQueues-1:0] operand_request; logic [NrOperandQueues-1:0] operand_request_push; operand_request_cmd_t [NrOperandQueues-1:0] operand_request_d; @@ -133,7 +133,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Got a new request if (operand_request_push[queue]) begin - operand_request_d[queue] = operand_request_i[queue]; + operand_request_d[queue] = operand_request[queue]; operand_request_valid_d[queue] = 1'b1; end end @@ -189,7 +189,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: pe_resp_o.vinsn_done = vinsn_done_q; // Make no requests to the operand requester - operand_request_i = '0; + operand_request = '0; operand_request_push = '0; // Make no requests to the lane's VFUs @@ -197,7 +197,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vfu_operation_valid_d = 1'b0; // If the operand requesters are busy, abort the request and wait for another cycle. 
- if (pe_req_valid) begin + if (pe_req_valid) begin : stall_op_req_busy unique case (pe_req.vfu) VFU_Alu : begin pe_req_ready = !(operand_request_valid_o[AluA] || @@ -230,11 +230,11 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: pe_req_ready = !(operand_request_valid_o[MaskB]); end default:; - endcase + endcase // stall_op_req_busy end // We received a new vector instruction - if (pe_req_valid && pe_req_ready && !vinsn_running_d[pe_req.id]) begin + if (pe_req_valid && pe_req_ready && !vinsn_running_d[pe_req.id]) begin : pe_req_valid // Populate the VFU request vfu_operation_d = '{ id : pe_req.id, @@ -263,9 +263,13 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0]) vfu_operation_d.vl += 1; // Vector start calculation - vfu_operation_d.vstart = pe_req.vstart / NrLanes; - // If lane_id_i < vstart % NrLanes, this lane needs to execute one micro-operation less. - if (lane_id_i < pe_req.vstart[idx_width(NrLanes)-1:0]) vfu_operation_d.vstart -= 1; + // TODO: check for LMUL = 4, 8 + // TODO: check for SEW != 64 + vfu_operation_d.vstart = pe_req.vstart / NrLanes; // High bits + // If lane_id_i < (vstart % NrLanes), this lane needs to execute one micro-operation less. + if (lane_id_i < pe_req.vstart[idx_width(NrLanes)-1:0]) begin : adjust_vstart_lane + vfu_operation_d.vstart += 1; + end : adjust_vstart_lane // Mark the vector instruction as running vinsn_running_d[pe_req.id] = (vfu_operation_d.vfu != VFU_None) ? 
1'b1 : 1'b0; @@ -287,7 +291,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: unique case (pe_req.vfu) VFU_Alu: begin - operand_request_i[AluA] = '{ + operand_request[AluA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -306,7 +310,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; operand_request_push[AluA] = pe_req.use_vs1; - operand_request_i[AluB] = '{ + operand_request[AluB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -328,24 +332,24 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request_push[AluB] = pe_req.use_vs2; // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_MFpu: begin - operand_request_i[MulFPUA] = '{ + operand_request[MulFPUA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -365,7 +369,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; operand_request_push[MulFPUA] = pe_req.use_vs1; - operand_request_i[MulFPUB] = '{ + operand_request[MulFPUB] = '{ id : pe_req.id, vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, eew : pe_req.swap_vs2_vd_op ? 
pe_req.eew_vd_op : pe_req.eew_vs2, @@ -388,7 +392,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request_push[MulFPUB] = pe_req.swap_vs2_vd_op ? pe_req.use_vd_op : pe_req.use_vs2; - operand_request_i[MulFPUC] = '{ + operand_request[MulFPUC] = '{ id : pe_req.id, vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, @@ -411,42 +415,42 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: pe_req.use_vs2 : pe_req.use_vd_op; // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_LoadUnit : begin // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Load indexed - operand_request_i[SlideAddrGenA] = '{ + operand_request[SlideAddrGenA] = '{ id : pe_req_i.id, vs : pe_req_i.vs2, eew : pe_req_i.eew_vs2, @@ -461,13 +465,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if (operand_request_i[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) - operand_request_i[SlideAddrGenA].vl += 1; + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) + operand_request[SlideAddrGenA].vl += 1; operand_request_push[SlideAddrGenA] = pe_req_i.op == VLXE; end VFU_StoreUnit : begin - operand_request_i[StA] = '{ + // vstart is supported here + operand_request[StA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -481,28 +486,34 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, default : '0 }; - if (operand_request_i[StA].vl * NrLanes != pe_req.vl) operand_request_i[StA].vl += 1; + // vl is not an integer multiple of NrLanes + // I.e., ( ( pe_req.vl / NrLanes * NrLanes ) == vl ) <=> ( ( vl % NrLanes ) != 0 ) + if ( ( operand_request[StA].vl * NrLanes ) != pe_req.vl ) begin : tweak_vl_StA + operand_request[StA].vl += 1; + end : tweak_vl_StA operand_request_push[StA] = pe_req.use_vs1; // This vector instruction uses masks - 
operand_request_i[MaskM] = '{ + // TODO: add vstart support here + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Store indexed - operand_request_i[SlideAddrGenA] = '{ + // TODO: add vstart support here + operand_request[SlideAddrGenA] = '{ id : pe_req_i.id, vs : pe_req_i.vs2, eew : pe_req_i.eew_vs2, @@ -517,13 +528,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if (operand_request_i[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) - operand_request_i[SlideAddrGenA].vl += 1; + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) begin : tweak_vl_SlideAddrGenA + operand_request[SlideAddrGenA].vl += 1; + end : tweak_vl_SlideAddrGenA operand_request_push[SlideAddrGenA] = pe_req_i.op == VSXE; end VFU_SlideUnit: begin - operand_request_i[SlideAddrGenA] = '{ + operand_request[SlideAddrGenA] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -543,7 +555,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // as operands by the slide unit. 
// Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request_i[SlideAddrGenA].vl = + operand_request[SlideAddrGenA].vl = (pe_req.vl - pe_req.stride + NrLanes - 1) / NrLanes; end VSLIDEDOWN: begin @@ -554,7 +566,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // We need to trim full words from the start of the vector that are not used // as operands by the slide unit. - operand_request_i[SlideAddrGenA].vstart = pe_req.stride / NrLanes; + operand_request[SlideAddrGenA].vstart = pe_req.stride / NrLanes; // The stride move the initial address in boundaries of 8*NrLanes Byte. // If the stride is not multiple of a full VRF word (8*NrLanes Byte), @@ -576,15 +588,15 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vl_tot += extra_stride; // Ask the elements, and ask one more if we do not perfectly divide NrLanes - operand_request_i[SlideAddrGenA].vl = vl_tot / NrLanes; - if (operand_request_i[SlideAddrGenA].vl * NrLanes != vl_tot) - operand_request_i[SlideAddrGenA].vl += 1; + operand_request[SlideAddrGenA].vl = vl_tot / NrLanes; + if (operand_request[SlideAddrGenA].vl * NrLanes != vl_tot) + operand_request[SlideAddrGenA].vl += 1; end default:; endcase // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, @@ -601,32 +613,32 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // as operands by the slide unit. // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- operand_request_i[MaskM].vl = + operand_request[MaskM].vl = ((pe_req.vl - pe_req.stride + NrLanes - 1) / 8 / NrLanes) - >> int'(pe_req.vtype.vsew); + >> unsigned'(pe_req.vtype.vsew); - if (((operand_request_i[MaskM].vl + pe_req.stride) << - int'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl)) - operand_request_i[MaskM].vl += 1; + if (((operand_request[MaskM].vl + pe_req.stride) << + unsigned'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl)) + operand_request[MaskM].vl += 1; // SLIDEUP only uses mask bits whose indices are > stride // Don't send the previous (unused) ones to the MASKU if (pe_req.stride >= NrLanes * 64) - operand_request_i[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8; + operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8; end VSLIDEDOWN: begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request_i[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> int'( + operand_request[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> unsigned'( pe_req.vtype.vsew)); - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * NrLanes * 8 != pe_req.vl) - operand_request_i[MaskM].vl += 1; + operand_request[MaskM].vl += 1; end endcase end VFU_MaskUnit: begin - operand_request_i[AluA] = '{ + operand_request[AluA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -640,21 +652,21 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request_i[AluA].vl = vfu_operation_d.vl; + operand_request[AluA].vl = vfu_operation_d.vl; end // This is an operation that runs normally on the ALU, and then gets reshuffled at the // Mask Unit. 
else begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request_i[AluA].vl = (pe_req.vl / NrLanes) >> - (int'(EW64) - int'(pe_req.eew_vs1)); - if ((operand_request_i[AluA].vl << (int'(EW64) - int'(pe_req.eew_vs1))) * NrLanes != - pe_req.vl) operand_request_i[AluA].vl += 1; + operand_request[AluA].vl = (pe_req.vl / NrLanes) >> + (unsigned'(EW64) - unsigned'(pe_req.eew_vs1)); + if ((operand_request[AluA].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs1))) * NrLanes != + pe_req.vl) operand_request[AluA].vl += 1; end operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]}); - operand_request_i[AluB] = '{ + operand_request[AluB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -667,21 +679,21 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request_i[AluB].vl = vfu_operation_d.vl; + operand_request[AluB].vl = vfu_operation_d.vl; end // This is an operation that runs normally on the ALU, and then gets reshuffled at the // Mask Unit. else begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- operand_request_i[AluB].vl = (pe_req.vl / NrLanes) >> - (int'(EW64) - int'(pe_req.eew_vs2)); - if ((operand_request_i[AluB].vl << (int'(EW64) - int'(pe_req.eew_vs2))) * NrLanes != - pe_req.vl) operand_request_i[AluB].vl += 1; + operand_request[AluB].vl = (pe_req.vl / NrLanes) >> + (unsigned'(EW64) - unsigned'(pe_req.eew_vs2)); + if ((operand_request[AluB].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs2))) * NrLanes != + pe_req.vl) operand_request[AluB].vl += 1; end operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]}); - operand_request_i[MulFPUA] = '{ + operand_request[MulFPUA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -694,10 +706,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. - operand_request_i[MulFPUA].vl = vfu_operation_d.vl; + operand_request[MulFPUA].vl = vfu_operation_d.vl; operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]}; - operand_request_i[MulFPUB] = '{ + operand_request[MulFPUB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -709,10 +721,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. 
- operand_request_i[MulFPUB].vl = vfu_operation_d.vl; + operand_request[MulFPUB].vl = vfu_operation_d.vl; operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]}; - operand_request_i[MaskB] = '{ + operand_request[MaskB] = '{ id : pe_req.id, vs : pe_req.vd, eew : pe_req.eew_vd_op, @@ -720,16 +732,16 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / ELEN) << (int'(EW64) - int'(pe_req.vtype.vsew)), + vl : (pe_req.vl / NrLanes / ELEN) << (unsigned'(EW64) - unsigned'(pe_req.vtype.vsew)), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vd, default : '0 }; if (((pe_req.vl / NrLanes / ELEN) * NrLanes * ELEN) != - pe_req.vl) operand_request_i[MaskB].vl += 1; + pe_req.vl) operand_request[MaskB].vl += 1; operand_request_push[MaskB] = pe_req.use_vd_op; - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, @@ -741,13 +753,13 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: hazard : pe_req.hazard_vm, default: '0 }; - if ((operand_request_i[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin - operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin + operand_request[MaskM].vl += 1; end operand_request_push[MaskM] = !pe_req.vm; end VFU_None: begin - operand_request_i[MaskB] = '{ + operand_request[MaskB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -763,8 +775,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request_push[MaskB] = 1'b1; end default:; - endcase - end + endcase // pe_req.vfu + end : pe_req_valid end: sequencer always_ff @(posedge clk_i or negedge rst_ni) begin: p_sequencer_ff diff --git 
a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv index 72c8202e1..9b8c1464c 100644 --- a/hardware/src/lane/operand_queue.sv +++ b/hardware/src/lane/operand_queue.sv @@ -127,7 +127,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i /////////////////////// // Count how many operands were already produced - vlen_t vl_d, vl_q; + vlen_t elem_count_d, elem_count_q; elen_t conv_operand; // Decide whether we are taking the operands from the lower or from the upper half of the input @@ -226,23 +226,23 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i end // Assert the signal if the last 64-bit packet will contain also - // elements with idx >= vl (they should not contribute to the result!). + // elements with idx >= elem_count (they should not contribute to the result!). // Gate for power saving // Power optimization: // The optimal solution would be to act on the mask bits in the two // processing units (valu and vmfpu), masking the unused elements. unique case (cmd.eew) EW8 : begin - incomplete_packet = |cmd.vl[2:0]; - last_packet = ((cmd.vl - vl_q) <= 8) ? 1'b1 : 1'b0; + incomplete_packet = |cmd.elem_count[2:0]; + last_packet = ((cmd.elem_count - elem_count_q) <= 8) ? 1'b1 : 1'b0; end EW16: begin - incomplete_packet = |cmd.vl[1:0]; - last_packet = ((cmd.vl - vl_q) <= 4) ? 1'b1 : 1'b0; + incomplete_packet = |cmd.elem_count[1:0]; + last_packet = ((cmd.elem_count - elem_count_q) <= 4) ? 1'b1 : 1'b0; end EW32: begin - incomplete_packet = |cmd.vl[0:0]; - last_packet = ((cmd.vl - vl_q) <= 2) ? 1'b1 : 1'b0; + incomplete_packet = |cmd.elem_count[0:0]; + last_packet = ((cmd.elem_count - elem_count_q) <= 2) ? 
1'b1 : 1'b0; end default: begin incomplete_packet = 1'b0; @@ -373,15 +373,15 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i if (SupportNtrVal) unique case (cmd.eew) EW8 : for (int unsigned b = 0; b < 8; b++) begin automatic int unsigned bs = shuffle_index(b, 1, EW8); - if ((b >> 0) >= cmd.vl[2:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; + if ((b >> 0) >= cmd.elem_count[2:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; end EW16: for (int unsigned b = 0; b < 8; b++) begin automatic int unsigned bs = shuffle_index(b, 1, EW16); - if ((b >> 1) >= cmd.vl[1:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; + if ((b >> 1) >= cmd.elem_count[1:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; end EW32: for (int unsigned b = 0; b < 8; b++) begin automatic int unsigned bs = shuffle_index(b, 1, EW32); - if ((b >> 2) >= cmd.vl[0:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; + if ((b >> 2) >= cmd.elem_count[0:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; end default:; endcase @@ -401,7 +401,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // Maintain state select_d = select_q; - vl_d = vl_q; + elem_count_d = elem_count_q; // Send the operand operand_o = conv_operand; @@ -418,16 +418,16 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i OpQueueConversionZExt2, OpQueueConversionWideFP2, OpQueueAdjustFPCvt: - if (SupportIntExt2) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 2; + if (SupportIntExt2) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 2; OpQueueConversionSExt4, OpQueueConversionZExt4: - if (SupportIntExt4) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 4; + if (SupportIntExt4) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 4; OpQueueConversionSExt8, OpQueueConversionZExt8: - if (SupportIntExt8) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 8; + if (SupportIntExt8) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - 
unsigned'(cmd.eew))) / 8; OpQueueReductionZExt: - vl_d = vl_q + 1; - default: vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))); + elem_count_d = elem_count_q + 1; + default: elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))); endcase // Update the pointer to the input operand @@ -443,22 +443,22 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i if ((select_q != '0 && select_d == '0) || cmd.conv == OpQueueConversionNone) ibuf_pop = 1'b1; // Finished execution - if (vl_d >= cmd.vl) begin + if (elem_count_d >= cmd.elem_count) begin : finished_elems ibuf_pop = 1'b1; cmd_pop = 1'b1; select_d = '0; - vl_d = '0; - end + elem_count_d = '0; + end : finished_elems end end : obuf_control always_ff @(posedge clk_i or negedge rst_ni) begin: p_type_conversion_ff if (!rst_ni) begin select_q <= '0; - vl_q <= '0; + elem_count_q <= '0; end else begin select_q <= select_d; - vl_q <= vl_d; + elem_count_q <= elem_count_d; end end : p_type_conversion_ff diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 54590fbc3..4bbafdc75 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -199,7 +199,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Operand request // /////////////////////// - // There is an operand requester for each operand queue. Each one + // There is an operand requester_index for each operand queue. Each one // can be in one of the following two states. 
typedef enum logic { IDLE, @@ -223,216 +223,230 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( logic [NrBanks-1:0][NrMasters-1:0] operand_gnt; payload_t [NrMasters-1:0] operand_payload; - for (genvar requester = 0; requester < NrOperandQueues; requester++) begin: gen_operand_requester - // State of this operand requester + // Metadata required to request all elements of this vector operand + typedef struct packed { + // ID of the instruction for this requester_index + vid_t id; + // Address of the next element to be read + vaddr_t addr; + // How many elements remain to be read + vlen_t len; + // Element width + vew_e vew; + + // Hazards between vector instructions + logic [NrVInsn-1:0] hazard; + + // Widening instructions produces two writes of every read + // In case of a WAW with a previous instruction, + // read once every two writes of the previous instruction + logic is_widening; + // One-bit counters + logic [NrVInsn-1:0] waw_hazard_counter; + } requester_metadata_t; + + for (genvar requester_index = 0; requester_index < NrOperandQueues; requester_index++) begin : gen_operand_requester + // State of this operand requester_index state_t state_d, state_q; - // Metadata required to request all elements of this vector operand - struct packed { - // ID of the instruction for this requester - vid_t id; - // Address of the next element to be read - vaddr_t addr; - // How many elements remain to be read - vlen_t len; - // Element width - vew_e vew; - - // Hazards between vector instructions - logic [NrVInsn-1:0] hazard; - - // Widening instructions produces two writes of every read - // In case of a WAW with a previous instruction, - // read once every two writes of the previous instruction - logic is_widening; - // One-bit counters - logic [NrVInsn-1:0] waw_hazard_counter; - } requester_d, requester_q; - + requester_metadata_t requester_metadata_d, requester_metadata_q; // Is there a hazard during this cycle? 
logic stall; - assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & - (~{NrVInsn{requester_q.is_widening}} | requester_q.waw_hazard_counter))); + assign stall = |(requester_metadata_q.hazard & ~(vinsn_result_written_q & + (~{NrVInsn{requester_metadata_q.is_widening}} | requester_metadata_q.waw_hazard_counter))); // Did we get a grant? logic [NrBanks-1:0] operand_requester_gnt; for (genvar bank = 0; bank < NrBanks; bank++) begin: gen_operand_requester_gnt - assign operand_requester_gnt[bank] = operand_gnt[bank][requester]; + assign operand_requester_gnt[bank] = operand_gnt[bank][requester_index]; end // Did we issue a word to this operand queue? - assign operand_issued_o[requester] = |(operand_requester_gnt); + assign operand_issued_o[requester_index] = |(operand_requester_gnt); always_comb begin: operand_requester + // Helper local variables + automatic operand_queue_cmd_t operand_queue_cmd_tmp; + automatic requester_metadata_t requester_metadata_tmp; + automatic vlen_t vector_body_length; + automatic vlen_t scaled_vector_body_length; + automatic vlen_t effective_vector_body_length; + automatic vaddr_t vrf_addr; + // Maintain state state_d = state_q; - requester_d = requester_q; + requester_metadata_d = requester_metadata_q; // Make no requests to the VRF - operand_payload[requester] = '0; - for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester] = 1'b0; + operand_payload[requester_index] = '0; + for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester_index] = 1'b0; - // Do not acknowledge any operand requester commands - operand_request_ready_o[requester] = 1'b0; + // Do not acknowledge any operand requester_index commands + operand_request_ready_o[requester_index] = 1'b0; // Do not send any operand conversion commands - operand_queue_cmd_o[requester] = '0; - operand_queue_cmd_valid_o[requester] = 1'b0; + operand_queue_cmd_o[requester_index] = '0; + operand_queue_cmd_valid_o[requester_index] = 1'b0; + + // Prepare 
metadata upfront + // Length of vector body in elements, i.e., vl - vstart + vector_body_length = operand_request_i[requester_index].vl - operand_request_i[requester_index].vstart; + // For memory operations, the number of elements initially refers to the new EEW (vsew here), + // but the requester_index must refer to the old EEW (eew here) + // This reasoning cannot be applied also to widening instructions, which modify vsew + // treating it as the EEW of vd + scaled_vector_body_length = ( + vector_body_length + << operand_request_i[requester_index].vtype.vsew + ) >> operand_request_i[requester_index].eew; + // Final computed length + effective_vector_body_length = ( operand_request_i[requester_index].scale_vl ) + ? scaled_vector_body_length + : vector_body_length; + // Address of the vstart element of the vector in the VRF + vrf_addr = vaddr(operand_request_i[requester_index].vs, NrLanes) + + ( + operand_request_i[requester_index].vstart + >> (unsigned'(EW64) - unsigned'(operand_request_i[requester_index].eew)) + ); + // Init helper variables + requester_metadata_tmp = '{ + id : operand_request_i[requester_index].id, + addr : vrf_addr, + len : effective_vector_body_length, + vew : operand_request_i[requester_index].eew, + hazard : operand_request_i[requester_index].hazard, + is_widening : operand_request_i[requester_index].cvt_resize == CVT_WIDE, + default: '0 + }; + operand_queue_cmd_tmp = '{ + eew : operand_request_i[requester_index].eew, + elem_count: effective_vector_body_length, + conv : operand_request_i[requester_index].conv, + ntr_red : operand_request_i[requester_index].cvt_resize, + target_fu : operand_request_i[requester_index].target_fu, + is_reduct : operand_request_i[requester_index].is_reduct + }; case (state_q) - IDLE: begin + IDLE: begin : state_q_IDLE // Accept a new instruction - if (operand_request_valid_i[requester]) begin + if (operand_request_valid_i[requester_index]) begin : op_req_valid state_d = REQUESTING; // Acknowledge the request - 
operand_request_ready_o[requester] = 1'b1; + operand_request_ready_o[requester_index] = 1'b1; // Send a command to the operand queue - operand_queue_cmd_o[requester] = '{ - eew : operand_request_i[requester].eew, - // For memory operations, the number of elements initially refers to the new EEW (vsew here), - // but the requester must refer to the old EEW (eew here) - // This reasoning cannot be applied also to widening instructions, which modify vsew - // treating it as the EEW of vd - vl : (operand_request_i[requester].scale_vl) ? - ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - conv : operand_request_i[requester].conv, - ntr_red : operand_request_i[requester].cvt_resize, - target_fu: operand_request_i[requester].target_fu, - is_reduct: operand_request_i[requester].is_reduct - }; + operand_queue_cmd_o[requester_index] = operand_queue_cmd_tmp; + operand_queue_cmd_valid_o[requester_index] = 1'b1; + // The length should be at least one after the rescaling - if (operand_queue_cmd_o[requester].vl == '0) - operand_queue_cmd_o[requester].vl = 1; - operand_queue_cmd_valid_o[requester] = 1'b1; + if (operand_queue_cmd_o[requester_index].elem_count == '0) begin : cmd_zero_rescaled_vl + operand_queue_cmd_o[requester_index].elem_count = 1; + end : cmd_zero_rescaled_vl // Store the request - requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - // For memory operations, the number of elements initially refers to the new EEW (vsew here), - // but the requester must refer to the old EEW (eew here) - // This reasoning cannot be applied also to widening instructions, which modify vsew - // treating it as the EEW of vd - len : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE, - default: '0 - }; + requester_metadata_d = requester_metadata_tmp; + // The length should be at least one after the rescaling - if (requester_d.len == '0) - requester_d.len = 1; + if (requester_metadata_d.len == '0) begin : req_zero_rescaled_vl + requester_metadata_d.len = 1; + end : req_zero_rescaled_vl + // Mute the requisition if the vl is zero - if (operand_request_i[requester].vl == '0) begin + if (operand_request_i[requester_index].vl == '0) begin : zero_vl state_d = IDLE; - operand_queue_cmd_valid_o[requester] = 1'b0; - end - end - end + operand_queue_cmd_valid_o[requester_index] = 1'b0; + end : zero_vl + end : op_req_valid + end : state_q_IDLE - REQUESTING: begin + REQUESTING: begin : state_q_REQUESTING // Update waw counters - for (int b = 0; b < NrVInsn; b++) - if (vinsn_result_written_d[b]) - requester_d.waw_hazard_counter[b] = ~requester_q.waw_hazard_counter[b]; + for (int b = 0; b < NrVInsn; b++) begin : waw_counters_update + if ( vinsn_result_written_d[b] ) begin : result_valid + requester_metadata_d.waw_hazard_counter[b] = ~requester_metadata_q.waw_hazard_counter[b]; + end : result_valid + end : waw_counters_update - if (operand_queue_ready_i[requester]) begin + if (operand_queue_ready_i[requester_index]) begin : op_queue_ready // Bank we are currently requesting - automatic int bank = requester_q.addr[idx_width(NrBanks)-1:0]; + automatic int bank = requester_metadata_q.addr[idx_width(NrBanks)-1:0]; + automatic vlen_t num_bytes; // Operand request - operand_req[bank][requester] = !stall; - operand_payload[requester] = '{ - addr : requester_q.addr >> $clog2(NrBanks), - opqueue: opqueue_e'(requester), - default: '0 + 
operand_req[bank][requester_index] = !stall; + operand_payload[requester_index] = '{ + addr : requester_metadata_q.addr >> $clog2(NrBanks), + opqueue: opqueue_e'(requester_index), + default: '0 // this is a read operation }; // Received a grant. - if (|operand_requester_gnt) begin + if (|operand_requester_gnt) begin : op_req_grant // Bump the address pointer - requester_d.addr = requester_q.addr + 1'b1; + requester_metadata_d.addr = requester_metadata_q.addr + 1'b1; // We read less than 64 bits worth of elements - if (requester_q.len < (1 << (int'(EW64) - int'(requester_q.vew)))) - requester_d.len = 0; - else requester_d.len = requester_q.len - (1 << (int'(EW64) - int'(requester_q.vew))); - end + num_bytes = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) ); + if (requester_metadata_q.len < num_bytes) begin + requester_metadata_d.len = 0; + end + else begin + requester_metadata_d.len = requester_metadata_q.len - num_bytes; + end + end : op_req_grant // Finished requesting all the elements - if (requester_d.len == '0) begin + if (requester_metadata_d.len == '0) begin : req_finished state_d = IDLE; // Accept a new instruction - if (operand_request_valid_i[requester]) begin + if (operand_request_valid_i[requester_index]) begin : op_req_valid state_d = REQUESTING; // Acknowledge the request - operand_request_ready_o[requester] = 1'b1; + operand_request_ready_o[requester_index] = 1'b1; // Send a command to the operand queue - operand_queue_cmd_o[requester] = '{ - eew : operand_request_i[requester].eew, - vl : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - conv : operand_request_i[requester].conv, - ntr_red : operand_request_i[requester].cvt_resize, - target_fu: operand_request_i[requester].target_fu, - is_reduct: operand_request_i[requester].is_reduct - }; - operand_queue_cmd_valid_o[requester] = 1'b1; + operand_queue_cmd_o[requester_index] = operand_queue_cmd_tmp; + operand_queue_cmd_valid_o[requester_index] = 1'b1; + // The length should be at least one after the rescaling - if (operand_queue_cmd_o[requester].vl == '0) - operand_queue_cmd_o[requester].vl = 1; + if (operand_queue_cmd_o[requester_index].elem_count == '0) begin : cmd_zero_rescaled_vl + operand_queue_cmd_o[requester_index].elem_count = 1; + end : cmd_zero_rescaled_vl // Store the request - requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - len : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - default: '0 - }; + requester_metadata_d = requester_metadata_tmp; + // The length should be at least one after the rescaling - if (requester_d.len == '0) - requester_d.len = 1; - end - end - end - end - endcase + if (requester_metadata_d.len == '0) begin : req_zero_rescaled_vl + requester_metadata_d.len = 1; + end : req_zero_rescaled_vl + + // Mute the requisition if the vl is zero + if (operand_request_i[requester_index].vl == '0) begin : zero_vl + state_d = IDLE; + operand_queue_cmd_valid_o[requester_index] = 1'b0; + end : zero_vl + end : op_req_valid + end : req_finished + end : op_queue_ready + end : state_q_REQUESTING + endcase // state_q // Always keep the hazard bits up to date with the global hazard table - requester_d.hazard &= global_hazard_table_i[requester_d.id]; + requester_metadata_d.hazard &= global_hazard_table_i[requester_metadata_d.id]; end : operand_requester always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin state_q <= IDLE; - requester_q <= '0; + requester_metadata_q <= '0; end else begin state_q <= state_d; - requester_q <= requester_d; + requester_metadata_q <= requester_metadata_d; end end end : gen_operand_requester @@ -452,7 +466,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( operand_req[bank][NrOperandQueues + VFU_LoadUnit] = 1'b0; end - // Generate the payload + // Generate the payloads for write back operations operand_payload[NrOperandQueues + VFU_Alu] = '{ addr : alu_result_addr_i >> $clog2(NrBanks), wen : 1'b1, @@ -523,7 +537,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( logic payload_hp_req; logic payload_hp_gnt; rr_arb_tree #( - .NumIn (int'(MulFPUC) - int'(AluA) + 1 + int'(VFU_MFpu) - int'(VFU_Alu) + 1), + .NumIn 
(unsigned'(MulFPUC) - unsigned'(AluA) + 1 + unsigned'(VFU_MFpu) - unsigned'(VFU_Alu) + 1), .DataWidth($bits(payload_t) ), .AxiVldRdy(1'b0 ) ) i_hp_vrf_arbiter ( @@ -548,7 +562,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( logic payload_lp_req; logic payload_lp_gnt; rr_arb_tree #( - .NumIn(int'(SlideAddrGenA)- int'(MaskB) + 1 + int'(VFU_LoadUnit) - int'(VFU_SlideUnit) + 1), + .NumIn(unsigned'(SlideAddrGenA)- unsigned'(MaskB) + 1 + unsigned'(VFU_LoadUnit) - unsigned'(VFU_SlideUnit) + 1), .DataWidth($bits(payload_t) ), .AxiVldRdy(1'b0 ) ) i_lp_vrf_arbiter ( diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index 8d8b1024d..369784f78 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -449,7 +449,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? 
- automatic logic [3:0] element_cnt = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); + automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); + automatic vlen_t vector_body_length = vinsn_issue_q.vl - vinsn_issue_q.vstart; + if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -465,7 +467,12 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Store the result in the result queue result_queue_d[result_queue_write_pnt_q].wdata = result_queue_q[result_queue_write_pnt_q].wdata | valu_result; - result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_issue_q.vd, NrLanes) + ((vinsn_issue_q.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue_q.vtype.vsew)); + result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_issue_q.vd, NrLanes) + + ( + ( vinsn_issue_q.vl - issue_cnt_q ) // vstart is already considered in issue_cnt_q + >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew) + ) + ); result_queue_d[result_queue_write_pnt_q].id = vinsn_issue_q.id; result_queue_d[result_queue_write_pnt_q].mask = vinsn_issue_q.vfu == VFU_MaskUnit; if (!narrowing(vinsn_issue_q.op) || !narrowing_select_q) @@ -474,7 +481,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Is this a narrowing instruction? if (narrowing(vinsn_issue_q.op)) begin // How many elements did we calculate in this iteration? 
- automatic logic [3:0] element_cnt_narrow = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))) / 2; + automatic logic [3:0] element_cnt_narrow = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))) / 2; if (element_cnt_narrow > issue_cnt_q) element_cnt_narrow = issue_cnt_q; @@ -523,12 +530,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end end @@ -547,7 +557,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1 || !first_op_q) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? 
- automatic logic [3:0] element_cnt = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); + automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -654,12 +664,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end @@ -690,12 +703,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += 
|vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end @@ -750,8 +766,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Decrement the counter of remaining vector elements waiting to be written // Don't do it in case of a reduction if (!is_reduction(vinsn_commit.op)) - commit_cnt_d = commit_cnt_q - (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); - if (commit_cnt_q < (1 << (int'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0; + commit_cnt_d = commit_cnt_q - (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew)); + if (commit_cnt_q < (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0; end // Finished committing the results of a vector instruction @@ -765,16 +781,20 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; else vinsn_queue_d.commit_pnt += 1; // Update the commit counter for the next instruction - if (vinsn_queue_d.commit_cnt != '0) + if (vinsn_queue_d.commit_cnt != '0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl; + commit_cnt_d = vector_body_length; else begin // We are asking for bits, and we want at least one chunk of bits if // vl > 0. 
Therefore, commit_cnt = ceil((vl / 8) >> sew) - commit_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + commit_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew; - commit_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl[2:0]; + commit_cnt_d += |vector_body_length[2:0]; end + end // Initialize counters and alu state if needed by the next instruction // After a reduction, the next instructions starts after the reduction commits @@ -796,14 +816,18 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; if (!vinsn_queue_full && vfu_operation_valid_i && (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMXNOR]})) begin + automatic vlen_t vector_body_length = vfu_operation_i.vl - vfu_operation_i.vstart; + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; // Do not wait for masks if, during a reduction, this lane is just a pass-through // The only valid instructions here with vl == '0 are reductions + // TODO: check if vector_body_length should be used insteada of plain vl here vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.vm | (vfu_operation_i.vl == '0); // Initialize counters and alu state if the instruction queue was empty // and the lane is not reducing if ((vinsn_queue_d.issue_cnt == '0) && !prevent_commit) begin + alu_state_d = is_reduction(vfu_operation_i.op) ? 
INTRA_LANE_REDUCTION : NO_REDUCTION; // The next will be the first operation of this instruction // This information is useful for reduction operation @@ -812,22 +836,24 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; reduction_rx_cnt_d = reduction_rx_cnt_init(NrLanes, lane_id_i); sldu_transactions_cnt_d = $clog2(NrLanes) + 1; - issue_cnt_d = vfu_operation_i.vl; + issue_cnt_d = vector_body_length; if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vfu_operation_i.vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vfu_operation_i.vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vfu_operation_i.vtype.vsew; - issue_cnt_d += |vfu_operation_i.vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end if (vinsn_queue_d.commit_cnt == '0) if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vfu_operation_i.vl; + commit_cnt_d = vector_body_length; else begin + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); // Operations between mask vectors operate on bits - commit_cnt_d = (vfu_operation_i.vl / 8) >> vfu_operation_i.vtype.vsew; - commit_cnt_d += |vfu_operation_i.vl[2:0]; + commit_cnt_d = (vector_body_length / 8) >> vfu_operation_i.vtype.vsew; + commit_cnt_d += |vector_body_length[2:0]; end // Bump pointers and counters of the vector instruction queue diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index 36c76df21..fbe3f1a49 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -768,7 +768,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; }; // Don't compress classify result - localparam int unsigned TrueSIMDClass = 1; + localparam int unsigned TrueSIMDClass = 1; + localparam int unsigned EnableSIMDMask = 1; operation_e fp_op; logic fp_opmod; @@ -969,9 +970,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; 
import fpnew_pkg::*; .Features (FPUFeatures ), .Implementation(FPUImplementation), .TagType (strb_t ), - .NumLanes (FPULanes ), .TrueSIMDClass (TrueSIMDClass ), - .MaskType (fpu_mask_t ) + .EnableSIMDMask(EnableSIMDMask ) ) i_fpnew_bulk ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index 2fbe05e55..09c7bfaaa 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -26,15 +26,33 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( output axi_aw_t axi_aw_o, output logic axi_aw_valid_o, input logic axi_aw_ready_i, + // CSR input + input logic en_ld_st_translation_i, + // Interface with CVA6's sv39 MMU + // This is everything the MMU can provide, it might be overcomplete for Ara and some signals be useless + output ariane_pkg::exception_t mmu_misaligned_ex_o, + output logic mmu_req_o, // request address translation + output logic [riscv::VLEN-1:0] mmu_vaddr_o, // virtual address out + output logic mmu_is_store_o, // the translation is requested by a store + // if we need to walk the page table we can't grant in the same cycle + // Cycle 0 + input logic mmu_dtlb_hit_i, // sent in the same cycle as the request if translation hits in the DTLB + input logic [riscv::PPNW-1:0] mmu_dtlb_ppn_i, // ppn (send same cycle as hit) + // Cycle 1 + input logic mmu_valid_i, // translation is valid + input logic [riscv::PLEN-1:0] mmu_paddr_i, // translated address + input ariane_pkg::exception_t mmu_exception_i, // address translation threw an exception // Interace with the dispatcher input logic core_st_pending_i, // Interface with the main sequencer input pe_req_t pe_req_i, input logic pe_req_valid_i, input logic [NrVInsn-1:0] pe_vinsn_running_i, - output logic addrgen_error_o, + output ariane_pkg::exception_t addrgen_exception_o, output logic addrgen_ack_o, - output vlen_t addrgen_error_vl_o, + output vlen_t addrgen_exception_vl_o, + output logic addrgen_exception_load_o, + output logic 
addrgen_exception_store_o, // Interface with the load/store units output addrgen_axi_req_t axi_addrgen_req_o, output logic axi_addrgen_req_valid_o, @@ -47,11 +65,29 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( output logic addrgen_operand_ready_o ); + localparam unsigned DataWidth = $bits(elen_t); + localparam unsigned DataWidthB = DataWidth / 8; + + /////////////////// + // Assignments // + /////////////////// + + // Ara reports misaligned exceptions on its own + assign mmu_misaligned_ex_o = '0; + assign mmu_is_store_o = !axi_addrgen_q.is_load; + + /////////////// + // Imports // + /////////////// import cf_math_pkg::idx_width; import axi_pkg::aligned_addr; import axi_pkg::BURST_INCR; import axi_pkg::CACHE_MODIFIABLE; + /////////////////// + // Definitions // + /////////////////// + // Check if the address is aligned to a particular width function automatic logic is_addr_error(axi_addr_t addr, vew_e vew); is_addr_error = |(addr & (elen_t'(1 << vew) - 1)); @@ -114,10 +150,10 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( logic [$bits(elen_t)*NrLanes-1:0] shuffled_word; logic [$bits(elen_t)*NrLanes-1:0] deshuffled_word; elen_t reduced_word; - axi_addr_t idx_final_addr_d, idx_final_addr_q; - elen_t idx_addr; + axi_addr_t idx_final_vaddr_d, idx_final_vaddr_q; + elen_t idx_vaddr; logic idx_op_error_d, idx_op_error_q; - vlen_t addrgen_error_vl_d; + vlen_t addrgen_exception_vl_d; // Pointer to point to the correct logic [$clog2(NrLanes)-1:0] word_lane_ptr_d, word_lane_ptr_q; @@ -126,8 +162,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( vlen_t idx_op_cnt_d, idx_op_cnt_q; // Spill reg signals - logic idx_addr_valid_d, idx_addr_valid_q; - logic idx_addr_ready_d, idx_addr_ready_q; + logic idx_vaddr_valid_d, idx_vaddr_valid_q; + logic idx_vaddr_ready_d, idx_vaddr_ready_q; // Break the path from the VRF to the AXI request spill_register #( @@ -135,17 +171,19 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( ) 
i_addrgen_idx_op_spill_reg ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .valid_i(idx_addr_valid_d), - .ready_o(idx_addr_ready_q), - .data_i (idx_final_addr_d), - .valid_o(idx_addr_valid_q), - .ready_i(idx_addr_ready_d), - .data_o (idx_final_addr_q) + .valid_i(idx_vaddr_valid_d), + .ready_o(idx_vaddr_ready_q), + .data_i (idx_final_vaddr_d), + .valid_o(idx_vaddr_valid_q), + .ready_i(idx_vaddr_ready_d), + .data_o (idx_final_vaddr_q) ); ////////////////////////// // Address generation // ////////////////////////// + ariane_pkg::exception_t mmu_exception_d, mmu_exception_q; + logic last_translation_completed; // Running vector instructions logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; @@ -156,13 +194,18 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // ADDRGEN_IDX_OP: Generates a series of AXI requests from a // vector instruction, but reading a vector of offsets from Ara's lanes. // This is used for scatter and gather operations. - enum logic [1:0] { + // WAIT_LAST_TRANSLATION: Wait for the last address translation to be acknowledged + enum logic [2:0] { IDLE, ADDRGEN, ADDRGEN_IDX_OP, - ADDRGEN_IDX_OP_END + ADDRGEN_IDX_OP_END, + WAIT_LAST_TRANSLATION } state_q, state_d; + // TODO: Masked elements do not generate exceptions on: + // * EEW misalignment + // * page faults always_comb begin: addr_generation // Maintain state state_d = state_q; @@ -177,16 +220,20 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Nothing to acknowledge addrgen_ack_o = 1'b0; - addrgen_error_o = 1'b0; + addrgen_exception_o.valid = 1'b0; + addrgen_exception_o.tval = '0; + addrgen_exception_o.cause = '0; + addrgen_exception_load_o = 1'b0; + addrgen_exception_store_o = 1'b0; // No valid words for the spill register - idx_addr_valid_d = 1'b0; + idx_vaddr_valid_d = 1'b0; addrgen_operand_ready_o = 1'b0; reduced_word = '0; elm_ptr_d = elm_ptr_q; idx_op_cnt_d = idx_op_cnt_q; word_lane_ptr_d = word_lane_ptr_q; - idx_final_addr_d = idx_final_addr_q; + idx_final_vaddr_d = 
idx_final_vaddr_q; last_elm_subw_d = last_elm_subw_q; // Support for indexed operations @@ -201,10 +248,10 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( for (int unsigned lane = 0; lane < NrLanes; lane++) if (lane == word_lane_ptr_q) reduced_word = deshuffled_word[word_lane_ptr_q*$bits(elen_t) +: $bits(elen_t)]; - idx_addr = reduced_word; + idx_vaddr = reduced_word; case (state_q) - IDLE: begin + IDLE: begin : state_IDLE // Received a new request if (pe_req_valid_i && (is_load(pe_req_i.op) || is_store(pe_req_i.op)) && !vinsn_running_q[pe_req_i.id]) begin @@ -229,22 +276,41 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( endcase // Load element counter - idx_op_cnt_d = pe_req_i.vl; + idx_op_cnt_d = pe_req_i.vl - pe_req_i.vstart; end default: state_d = ADDRGEN; - endcase + endcase // pe_req_i.op + end - end - ADDRGEN: begin + end : state_IDLE + + ADDRGEN: begin : ADDRGEN // Ara does not support misaligned AXI requests - if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin + if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin : eew_misaligned_error state_d = IDLE; addrgen_ack_o = 1'b1; - addrgen_error_o = 1'b1; - end else begin + addrgen_exception_o.valid = 1'b1; + addrgen_exception_o.cause = riscv::ILLEGAL_INSTR; + addrgen_exception_o.tval = '0; + end : eew_misaligned_error + else begin : address_valid + // NOTE: indexed are not covered here + automatic logic [riscv::VLEN-1:0] vaddr_start; + + case ( pe_req_q.op ) + // Unit-stride: address = base + (vstart in elements) + VLE, VSE : vaddr_start = pe_req_q.scalar_op + ( pe_req_q.vstart << unsigned'(pe_req_q.vtype.vsew) ); + // Strided: address = base + (vstart * stride) + // NOTE: this multiplier might cause some timing issues + VLSE, VSSE: vaddr_start = pe_req_q.scalar_op + ( pe_req_q.vstart * pe_req_q.stride ) ; + // Indexed: let the next stage take care of vstart + VLXE, VSXE: vaddr_start = pe_req_q.scalar_op; + default : vaddr_start = '0; + endcase // pe_req_q.op + 
addrgen_req = '{ - addr : pe_req_q.scalar_op, - len : pe_req_q.vl, + addr : vaddr_start, + len : pe_req_q.vl - pe_req_q.vstart, stride : pe_req_q.stride, vew : pe_req_q.vtype.vsew, is_load : is_load(pe_req_q.op), @@ -253,20 +319,35 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( }; addrgen_req_valid = 1'b1; - if (addrgen_req_ready) begin + if (addrgen_req_ready) begin : finished addrgen_req_valid = '0; addrgen_ack_o = 1'b1; state_d = IDLE; - end - end - end - ADDRGEN_IDX_OP: begin + end : finished + + // If load/store translation is enabled + if ( en_ld_st_translation_i ) begin : translation_enabled + // We need to wait for the last translation to be over before acking back + // addrgen_req_valid = '0; TODO: figure out if set/reset here + addrgen_ack_o = 1'b0; + state_d = WAIT_LAST_TRANSLATION; + end : translation_enabled + end : address_valid + end : ADDRGEN + + ADDRGEN_IDX_OP: begin : ADDRGEN_IDX_OP + // NOTE: vstart is not supported for indexed operations + // the logic shuld be introduced: + // 1. in the addrgen_operand_i operand read + // 2. 
in idx_vaddr computation + automatic logic [NrLanes-1:0] addrgen_operand_valid; + // Stall the interface until the operation is over to catch possible exceptions // Every address can generate an exception addrgen_req = '{ - addr : pe_req_q.scalar_op, - len : pe_req_q.vl, + addr : pe_req_q.scalar_op, + len : pe_req_q.vl - pe_req_q.vstart, stride : pe_req_q.stride, vew : pe_req_q.vtype.vsew, is_load : is_load(pe_req_q.op), @@ -275,51 +356,66 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( }; addrgen_req_valid = 1'b1; + // Adjust valid signals to the next block "operands_ready" + addrgen_operand_valid = addrgen_operand_valid_i; + for ( int unsigned lane = 0; lane < NrLanes; lane++ ) begin : adjust_operand_valid + // - We are left with less byte than the maximim to issue, + // this means that at least one lane is not going to push us any operand anymore + // - For the lanes which index % NrLanes != 0 + if ( ( ( idx_op_cnt_q << pe_req_q.vtype.vsew ) < (NrLanes * DataWidthB) ) + & ( lane < pe_req_q.vstart[idx_width(NrLanes)-1:0] ) + ) begin : vstart_lane_adjust + addrgen_operand_valid[lane] |= 1'b1; + end : vstart_lane_adjust + end : adjust_operand_valid + // TODO: apply the same vstart logic also to mask_valid_i + // Handle handshake and data between VRF and spill register // We accept all the incoming data, without any checks // since Ara stalls on an indexed memory operation - if (&addrgen_operand_valid_i & addrgen_operand_target_fu_i[0] == MFPU_ADDRGEN) begin + if (&addrgen_operand_valid & addrgen_operand_target_fu_i[0] == MFPU_ADDRGEN) begin // Valid data for the spill register - idx_addr_valid_d = 1'b1; + idx_vaddr_valid_d = 1'b1; // Select the correct element, and zero extend it depending on vsew case (pe_req_q.eew_vs2) EW8: begin for (int unsigned b = 0; b < 8; b++) if (b == elm_ptr_q) - idx_addr = reduced_word[b*8 +: 8]; + idx_vaddr = reduced_word[b*8 +: 8]; end EW16: begin for (int unsigned h = 0; h < 4; h++) if (h == elm_ptr_q) - idx_addr = 
reduced_word[h*16 +: 16]; + idx_vaddr = reduced_word[h*16 +: 16]; end EW32: begin for (int unsigned w = 0; w < 2; w++) if (w == elm_ptr_q) - idx_addr = reduced_word[w*32 +: 32]; + idx_vaddr = reduced_word[w*32 +: 32]; end EW64: begin for (int unsigned d = 0; d < 1; d++) if (d == elm_ptr_q) - idx_addr = reduced_word[d*64 +: 64]; + idx_vaddr = reduced_word[d*64 +: 64]; end default: begin for (int unsigned b = 0; b < 8; b++) if (b == elm_ptr_q) - idx_addr = reduced_word[b*8 +: 8]; + idx_vaddr = reduced_word[b*8 +: 8]; end endcase // Compose the address - idx_final_addr_d = pe_req_q.scalar_op + idx_addr; + idx_final_vaddr_d = pe_req_q.scalar_op + idx_vaddr; // When the data is accepted - if (idx_addr_ready_q) begin + if (idx_vaddr_ready_q) begin // Consumed one element idx_op_cnt_d = idx_op_cnt_q - 1; // Have we finished a full NrLanes*64b word? + // TODO: check for the need of vstart logic here if (elm_ptr_q == last_elm_subw_q) begin // Bump lane pointer elm_ptr_d = '0; @@ -339,13 +435,13 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( end end - if (idx_op_error_d || addrgen_req_ready) begin + if (idx_op_error_d || addrgen_req_ready || mmu_exception_d.valid ) begin state_d = ADDRGEN_IDX_OP_END; end - end + end : ADDRGEN_IDX_OP // This state exists not to create combinatorial paths on the interface - ADDRGEN_IDX_OP_END : begin + ADDRGEN_IDX_OP_END : begin : ADDRGEN_IDX_OP_END // Acknowledge the indexed memory operation addrgen_ack_o = 1'b1; addrgen_req_valid = '0; @@ -355,11 +451,38 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( word_lane_ptr_d = '0; // Raise an error if necessary if (idx_op_error_q) begin - addrgen_error_o = 1'b1; + // In this case, we always get EEW-misaligned exceptions + addrgen_exception_o.valid = 1'b1; + addrgen_exception_o.cause = riscv::ILLEGAL_INSTR; + addrgen_exception_o.tval = '0; end - end - endcase - end + // Propagate the exception from the MMU (if any) + // NOTE: this would override + if ( mmu_exception_q.valid ) 
begin + addrgen_exception_o = mmu_exception_q; + end + end : ADDRGEN_IDX_OP_END + + WAIT_LAST_TRANSLATION : begin : WAIT_LAST_TRANSLATION + if ( last_translation_completed | mmu_exception_q.valid ) begin + // Acknowledge the indexed memory operation + addrgen_ack_o = 1'b1; + addrgen_req_valid = '0; + state_d = IDLE; + // Reset pointers + elm_ptr_d = '0; + word_lane_ptr_d = '0; + // Propagate the exception from the MMU (if any) + addrgen_exception_o = mmu_exception_q; + end + end : WAIT_LAST_TRANSLATION + endcase // state_q + + if ( addrgen_exception_o.valid & addrgen_ack_o ) begin + addrgen_exception_load_o = is_load(pe_req_q.op); + addrgen_exception_store_o = !is_load(pe_req_q.op); + end + end : addr_generation always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin @@ -371,7 +494,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_q <= '0; last_elm_subw_q <= '0; idx_op_error_q <= '0; - addrgen_error_vl_o <= '0; + addrgen_exception_vl_o <= '0; + mmu_exception_q <= '0; end else begin state_q <= state_d; pe_req_q <= pe_req_d; @@ -381,7 +505,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_q <= idx_op_cnt_d; last_elm_subw_q <= last_elm_subw_d; idx_op_error_q <= idx_op_error_d; - addrgen_error_vl_o <= addrgen_error_vl_d; + addrgen_exception_vl_o <= addrgen_exception_vl_d; + mmu_exception_q <= mmu_exception_d; end end @@ -389,25 +514,27 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Support for misaligned stores // ///////////////////////////////////// + localparam clog2_AxiStrobeWidth = $clog2(AxiDataWidth/8); + // AXI Request Generation signals, declared here for convenience addrgen_req_t axi_addrgen_d, axi_addrgen_q; // Narrower AXI Data Byte-Width used for misaligned stores - logic [$clog2(AxiDataWidth/8)-1:0] narrow_axi_data_bwidth; + logic [clog2_AxiStrobeWidth-1:0] narrow_axi_data_bwidth; // Helper signal to calculate the narrow_axi_data_bwidth // It carries information about the 
misalignment of the start address w.r.t. the AxiDataWidth - logic [$clog2(AxiDataWidth/8)-1:0] axi_addr_misalignment; + logic [clog2_AxiStrobeWidth-1:0] axi_addr_misalignment; // Number of trailing 0s of axi_addr_misalignment - logic [idx_width($clog2(AxiDataWidth/8))-1:0] zeroes_cnt; + logic [idx_width(clog2_AxiStrobeWidth)-1:0] zeroes_cnt; // Get the misalignment information for this vector memory instruction - assign axi_addr_misalignment = axi_addrgen_d.addr[$clog2(AxiDataWidth/8)-1:0]; + assign axi_addr_misalignment = axi_addrgen_d.addr[clog2_AxiStrobeWidth-1:0]; // Calculate the maximum number of Bytes we can send in a store-misaligned beat. // This number must be a power of 2 not to get misaligned wrt the pack of data that the // store unit receives from the lanes lzc #( - .WIDTH($clog2(AxiDataWidth/8)), + .WIDTH(clog2_AxiStrobeWidth), .MODE (1'b0 ) ) i_lzc ( .in_i (axi_addr_misalignment), @@ -416,14 +543,18 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( ); // Effective AXI data width for misaligned stores - assign narrow_axi_data_bwidth = (AxiDataWidth/8) >> ($clog2(AxiDataWidth/8) - zeroes_cnt); + assign narrow_axi_data_bwidth = (AxiDataWidth/8) >> (clog2_AxiStrobeWidth - zeroes_cnt); ////////////////////////////// // AXI Request Generation // ////////////////////////////// - enum logic [1:0] { - AXI_ADDRGEN_IDLE, AXI_ADDRGEN_MISALIGNED, AXI_ADDRGEN_WAITING, AXI_ADDRGEN_REQUESTING + enum logic [2:0] { + AXI_ADDRGEN_IDLE, + AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED, // Misaligned vector store to AxiDataWidth/8, needs special treatement + AXI_ADDRGEN_WAITING_CORE_STORE_PENDING, // Wait until (core_st_pending_i == 0) + AXI_ADDRGEN_REQUESTING, // Perform AW/AR transactions and push addrgen_req to VSTU/VLDU + AXI_ADDRGEN_WAIT_TRANSLATION // Wait for MMU to ack back } axi_addrgen_state_d, axi_addrgen_state_q; axi_addr_t aligned_start_addr_d, aligned_start_addr_q; @@ -433,8 +564,39 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // MSb of the 
next-next page (page selector for page 2 positions after the current one) logic [($bits(aligned_start_addr_d) - 12)-1:0] next_2page_msb_d, next_2page_msb_q; - logic [$clog2(AxiDataWidth/8):0] eff_axi_dw_d, eff_axi_dw_q; - logic [idx_width($clog2(AxiDataWidth/8)):0] eff_axi_dw_log_d, eff_axi_dw_log_q; + logic [clog2_AxiStrobeWidth:0] eff_axi_dw_d, eff_axi_dw_q; + logic [idx_width(clog2_AxiStrobeWidth):0] eff_axi_dw_log_d, eff_axi_dw_log_q; + + function automatic void set_end_addr ( + input logic [($bits(axi_addr_t) - 12)-1:0] next_2page_msb, + input int unsigned num_bytes, + input axi_addr_t addr, + input logic [clog2_AxiStrobeWidth:0] eff_axi_dw, + input logic [idx_width(clog2_AxiStrobeWidth):0] eff_axi_dw_log, + input axi_addr_t aligned_start_addr_d, + output axi_addr_t aligned_end_addr_d, + output axi_addr_t aligned_next_start_addr_d + ); + + // POSSIBLE BUG: given this is really the maximum number of bytes per burst, + // this assumes the burst length is always the maximum possible, i.e., 256. + automatic int unsigned max_burst_bytes = addr + (256 << eff_axi_dw_log); + + // The final address can be found similarly... 
+ if (num_bytes >= max_burst_bytes) begin + aligned_next_start_addr_d = aligned_addr(addr + max_burst_bytes, clog2_AxiStrobeWidth); + end else begin + aligned_next_start_addr_d = aligned_addr(addr + num_bytes - 1, eff_axi_dw_log) + eff_axi_dw; + end + aligned_end_addr_d = aligned_next_start_addr_d - 1; + + // But since AXI requests are aligned in 4 KiB pages, aligned_end_addr must be in the + // same page as aligned_start_addr + if (aligned_start_addr_d[AxiAddrWidth-1:12] != aligned_end_addr_d[AxiAddrWidth-1:12]) begin + aligned_end_addr_d = {aligned_start_addr_d[AxiAddrWidth-1:12], 12'hFFF}; + aligned_next_start_addr_d = { next_2page_msb , 12'h000}; + end + endfunction // set_end_addr always_comb begin: axi_addrgen // Maintain state @@ -450,8 +612,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( eff_axi_dw_d = eff_axi_dw_q; eff_axi_dw_log_d = eff_axi_dw_log_q; - idx_addr_ready_d = 1'b0; - addrgen_error_vl_d = '0; + idx_vaddr_ready_d = 1'b0; + addrgen_exception_vl_d = '0; // No error by default idx_op_error_d = 1'b0; @@ -469,82 +631,82 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_aw_o = '0; axi_aw_valid_o = 1'b0; - case (axi_addrgen_state_q) - AXI_ADDRGEN_IDLE: begin + // MMU + mmu_exception_d = mmu_exception_q; + mmu_req_o = 1'b0; + mmu_vaddr_o = '0; + + // For addrgen FSM + last_translation_completed = 1'b0; + + case (axi_addrgen_state_q) + AXI_ADDRGEN_IDLE: begin : axi_addrgen_state_AXI_ADDRGEN_IDLE if (addrgen_req_valid) begin axi_addrgen_d = addrgen_req; - axi_addrgen_state_d = core_st_pending_i ? AXI_ADDRGEN_WAITING : AXI_ADDRGEN_REQUESTING; + axi_addrgen_state_d = core_st_pending_i ? 
AXI_ADDRGEN_WAITING_CORE_STORE_PENDING : AXI_ADDRGEN_REQUESTING; // In case of a misaligned store, reduce the effective width of the AXI transaction, // since the store unit does not support misalignments between the AXI bus and the lanes - if ((axi_addrgen_d.addr[$clog2(AxiDataWidth/8)-1:0] != '0) && !axi_addrgen_d.is_load) + // BUG: this address check is not valid for indexed operations + if ((axi_addrgen_d.addr[clog2_AxiStrobeWidth-1:0] != '0) && !axi_addrgen_d.is_load) begin - // Calculate the start and the end addresses in the AXI_ADDRGEN_MISALIGNED state - axi_addrgen_state_d = AXI_ADDRGEN_MISALIGNED; + // Calculate the start and the end addresses in the AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED state + axi_addrgen_state_d = AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED; eff_axi_dw_d = {1'b0, narrow_axi_data_bwidth}; eff_axi_dw_log_d = zeroes_cnt; end else begin eff_axi_dw_d = AxiDataWidth/8; - eff_axi_dw_log_d = $clog2(AxiDataWidth/8); + eff_axi_dw_log_d = clog2_AxiStrobeWidth; end // The start address is found by aligning the original request address by the width of // the memory interface. - aligned_start_addr_d = aligned_addr(axi_addrgen_d.addr, $clog2(AxiDataWidth/8)); + aligned_start_addr_d = aligned_addr(axi_addrgen_d.addr, clog2_AxiStrobeWidth); // Pre-calculate the next_2page_msb. This should not require much energy if the addr // has zeroes in the upper positions. next_2page_msb_d = aligned_start_addr_d[AxiAddrWidth-1:12] + 1; // The final address can be found similarly... 
- if (axi_addrgen_d.len << int'(axi_addrgen_d.vew) >= (256 << $clog2(AxiDataWidth/8))) begin - aligned_next_start_addr_d = - aligned_addr(axi_addrgen_d.addr + (256 << $clog2(AxiDataWidth/8)), $clog2(AxiDataWidth/8)); - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end else begin - aligned_next_start_addr_d = - aligned_addr(axi_addrgen_d.addr + (axi_addrgen_d.len << int'(axi_addrgen_d.vew)) - 1, - $clog2(AxiDataWidth/8)) + AxiDataWidth/8; - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end - // But since AXI requests are aligned in 4 KiB pages, aligned_end_addr must be in the - // same page as aligned_start_addr - if (aligned_start_addr_d[AxiAddrWidth-1:12] != aligned_end_addr_d[AxiAddrWidth-1:12]) begin - aligned_end_addr_d = {aligned_start_addr_d[AxiAddrWidth-1:12], 12'hFFF}; - aligned_next_start_addr_d = { next_2page_msb_d, 12'h000}; - end + set_end_addr ( + next_2page_msb_d, + (axi_addrgen_d.len << unsigned'(axi_addrgen_d.vew)), + axi_addrgen_d.addr, + AxiDataWidth/8, + clog2_AxiStrobeWidth, + aligned_start_addr_d, + aligned_end_addr_d, + aligned_next_start_addr_d + ); end - end - AXI_ADDRGEN_MISALIGNED: begin - axi_addrgen_state_d = core_st_pending_i ? AXI_ADDRGEN_WAITING : AXI_ADDRGEN_REQUESTING; + end : axi_addrgen_state_AXI_ADDRGEN_IDLE + + AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED: begin : axi_addrgen_state_AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED + axi_addrgen_state_d = core_st_pending_i ? AXI_ADDRGEN_WAITING_CORE_STORE_PENDING : AXI_ADDRGEN_REQUESTING; // The start address is found by aligning the original request address by the width of // the memory interface. aligned_start_addr_d = aligned_addr(axi_addrgen_q.addr, eff_axi_dw_log_q); - // The final address can be found similarly... 
- if (axi_addrgen_q.len << int'(axi_addrgen_q.vew) >= (256 << eff_axi_dw_log_q)) begin - aligned_next_start_addr_d = - aligned_addr(axi_addrgen_q.addr + (256 << eff_axi_dw_log_q), eff_axi_dw_log_q); - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end else begin - aligned_next_start_addr_d = - aligned_addr(axi_addrgen_q.addr + (axi_addrgen_q.len << int'(axi_addrgen_q.vew)) - 1, - eff_axi_dw_log_q) + eff_axi_dw_q; - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end - // But since AXI requests are aligned in 4 KiB pages, aligned_end_addr must be in the - // same page as aligned_start_addr - if (aligned_start_addr_d[AxiAddrWidth-1:12] != aligned_end_addr_d[AxiAddrWidth-1:12]) begin - aligned_end_addr_d = {aligned_start_addr_d[AxiAddrWidth-1:12], 12'hFFF}; - aligned_next_start_addr_d = { next_2page_msb_q, 12'h000}; - end - end - AXI_ADDRGEN_WAITING: begin - if (!core_st_pending_i) + + set_end_addr ( + next_2page_msb_q, + (axi_addrgen_q.len << unsigned'(axi_addrgen_q.vew)), + axi_addrgen_q.addr, + eff_axi_dw_q, + eff_axi_dw_log_q, + aligned_start_addr_d, + aligned_end_addr_d, + aligned_next_start_addr_d + ); + end : axi_addrgen_state_AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED + + AXI_ADDRGEN_WAITING_CORE_STORE_PENDING: begin : axi_addrgen_state_AXI_ADDRGEN_WAITING_CORE_STORE_PENDING + if (!core_st_pending_i) begin axi_addrgen_state_d = AXI_ADDRGEN_REQUESTING; - end - AXI_ADDRGEN_REQUESTING : begin - automatic logic axi_ax_ready = (axi_addrgen_q.is_load && axi_ar_ready_i) || (! - axi_addrgen_q.is_load && axi_aw_ready_i); + end + end : axi_addrgen_state_AXI_ADDRGEN_WAITING_CORE_STORE_PENDING + + AXI_ADDRGEN_REQUESTING : begin : axi_addrgen_state_AXI_ADDRGEN_REQUESTING + automatic logic axi_ax_ready = (axi_addrgen_q.is_load && axi_ar_ready_i) || (!axi_addrgen_q.is_load && axi_aw_ready_i); // Pre-calculate the next_2page_msb. This should not require much energy if the addr // has zeroes in the upper positions. 
@@ -553,14 +715,25 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Before starting a transaction on a different channel, wait the formers to complete // Otherwise, the ordering of the responses is not guaranteed, and with the current // implementation we can incur in deadlocks - if (axi_addrgen_queue_empty || (axi_addrgen_req_o.is_load && axi_addrgen_q.is_load) || - (~axi_addrgen_req_o.is_load && ~axi_addrgen_q.is_load)) begin - if (!axi_addrgen_queue_full && axi_ax_ready) begin - if (axi_addrgen_q.is_burst) begin + // NOTE: this might be referring to an obsolete axi_cut implementation + if ( axi_addrgen_queue_empty || + (axi_addrgen_req_o.is_load && axi_addrgen_q.is_load) || + (~axi_addrgen_req_o.is_load && ~axi_addrgen_q.is_load + ) + ) begin : axi_ax_idle + if (!axi_addrgen_queue_full && axi_ax_ready) begin : start_req + automatic logic [riscv::PLEN-1:0] paddr; + + if (axi_addrgen_q.is_burst) begin : unit_stride ///////////////////////// // Unit-Stride access // ///////////////////////// + // NOTE: all these variables could be narrowed to the minimum number of bits + automatic int unsigned num_beats; + automatic int unsigned num_bytes; + automatic int unsigned burst_len_bytes; + automatic int unsigned axi_addrgen_bytes; // AXI burst length automatic int unsigned burst_length; @@ -570,10 +743,10 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // 2 - The AXI burst length cannot be longer than the number of beats required // to access the memory regions between aligned_start_addr and // aligned_end_addr - if (burst_length > ((aligned_end_addr_q[11:0] - aligned_start_addr_q[11:0]) >> - eff_axi_dw_log_q) + 1) - burst_length = ((aligned_end_addr_q[11:0] - aligned_start_addr_q[11:0]) >> - eff_axi_dw_log_q) + 1; + num_beats = ((aligned_end_addr_q[11:0] - aligned_start_addr_q[11:0]) >> eff_axi_dw_log_q) + 1; + if (burst_length > num_beats) begin + burst_length = num_beats; + end // AR Channel if (axi_addrgen_q.is_load) begin @@ -612,107 +785,57 
@@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_addrgen_queue_push = 1'b1; // Account for the requested operands - axi_addrgen_d.len = axi_addrgen_q.len - - ((aligned_end_addr_q[11:0] - axi_addrgen_q.addr[11:0] + 1) - >> int'(axi_addrgen_q.vew)); - if (axi_addrgen_q.len < - ((aligned_end_addr_q[11:0] - axi_addrgen_q.addr[11:0] + 1) - >> int'(axi_addrgen_q.vew))) + num_bytes = ( (aligned_end_addr_q[11:0] - axi_addrgen_q.addr[11:0] + 1) >> unsigned'(axi_addrgen_q.vew) ); + if (axi_addrgen_q.len >= num_bytes) begin + axi_addrgen_d.len = axi_addrgen_q.len - num_bytes; + end + else begin axi_addrgen_d.len = 0; - axi_addrgen_d.addr = aligned_next_start_addr_q; - - // Finished generating AXI requests - if (axi_addrgen_d.len == 0) begin - addrgen_req_ready = 1'b1; - axi_addrgen_state_d = AXI_ADDRGEN_IDLE; end + axi_addrgen_d.addr = aligned_next_start_addr_q; // Calculate the addresses for the next iteration // The start address is found by aligning the original request address by the width of // the memory interface. In our case, we have it already. aligned_start_addr_d = axi_addrgen_d.addr; // The final address can be found similarly. - // How many B we requested? 
No more than (256 << burst_size) - if (axi_addrgen_d.len << int'(axi_addrgen_q.vew) >= (256 << eff_axi_dw_log_q)) begin - aligned_next_start_addr_d = - aligned_addr(aligned_start_addr_d + (256 << eff_axi_dw_log_q), eff_axi_dw_log_q); - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end else begin - aligned_next_start_addr_d = - aligned_addr(aligned_start_addr_d + (axi_addrgen_d.len << int'(axi_addrgen_q.vew)) - - 1, eff_axi_dw_log_q) + eff_axi_dw_q; - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end - // But since AXI requests are aligned in 4 KiB pages, aligned_end_addr must be in the - // same page as aligned_start_addr - if (aligned_start_addr_d[AxiAddrWidth-1:12] != aligned_end_addr_d[AxiAddrWidth-1:12]) begin - aligned_end_addr_d = {aligned_start_addr_d[AxiAddrWidth-1:12], 12'hFFF}; - aligned_next_start_addr_d = { next_2page_msb_d, 12'h000}; - end - end else if (state_q != ADDRGEN_IDX_OP) begin + // How many B we requested? No more than (256 << burst_len_bytes) + burst_len_bytes = (256 << eff_axi_dw_log_q); + axi_addrgen_bytes = (axi_addrgen_d.len << unsigned'(axi_addrgen_q.vew)); + set_end_addr ( + next_2page_msb_d, + (axi_addrgen_d.len << unsigned'(axi_addrgen_d.vew)), + aligned_start_addr_d, + eff_axi_dw_q, + eff_axi_dw_log_q, + aligned_start_addr_d, + aligned_end_addr_d, + aligned_next_start_addr_d + ); + end : unit_stride + else if (state_q != ADDRGEN_IDX_OP) begin : strided ///////////////////// // Strided access // ///////////////////// - // AR Channel - if (axi_addrgen_q.is_load) begin - axi_ar_o = '{ - addr : axi_addrgen_q.addr, - len : 0, - size : axi_addrgen_q.vew, - cache : CACHE_MODIFIABLE, - burst : BURST_INCR, - default: '0 - }; - axi_ar_valid_o = 1'b1; - end - // AW Channel - else begin - axi_aw_o = '{ - addr : axi_addrgen_q.addr, - len : 0, - size : axi_addrgen_q.vew, - cache : CACHE_MODIFIABLE, - burst : BURST_INCR, - default: '0 - }; - axi_aw_valid_o = 1'b1; - end + if ( en_ld_st_translation_i ) begin : en_ld_st_translation + 
// Request an address translation + mmu_req_o = 1'b1; + mmu_vaddr_o = axi_addrgen_q.addr; + axi_addrgen_state_d = AXI_ADDRGEN_WAIT_TRANSLATION; + end : en_ld_st_translation - // Send this request to the load/store units - axi_addrgen_queue = '{ - addr : axi_addrgen_q.addr, - size : axi_addrgen_q.vew, - len : 0, - is_load: axi_addrgen_q.is_load - }; - axi_addrgen_queue_push = 1'b1; - - // Account for the requested operands - axi_addrgen_d.len = axi_addrgen_q.len - 1; - // Calculate the addresses for the next iteration, adding the correct stride - axi_addrgen_d.addr = axi_addrgen_q.addr + axi_addrgen_q.stride; - - // Finished generating AXI requests - if (axi_addrgen_d.len == 0) begin - addrgen_req_ready = 1'b1; - axi_addrgen_state_d = AXI_ADDRGEN_IDLE; - end - end else begin - - ////////////////////// - // Indexed access // - ////////////////////// - - if (idx_addr_valid_q) begin - // We consumed a word - idx_addr_ready_d = 1'b1; + // Mux target address + paddr = ( en_ld_st_translation_i ) ? 
mmu_paddr_i : axi_addrgen_q.addr; + // Either we got a valid address translation from the MMU + // or virtual memory is disabled + if ( mmu_valid_i | !en_ld_st_translation_i ) begin : addr_valid // AR Channel if (axi_addrgen_q.is_load) begin axi_ar_o = '{ - addr : idx_final_addr_q, + addr : paddr, len : 0, size : axi_addrgen_q.vew, cache : CACHE_MODIFIABLE, @@ -724,7 +847,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // AW Channel else begin axi_aw_o = '{ - addr : idx_final_addr_q, + addr : paddr, len : 0, size : axi_addrgen_q.vew, cache : CACHE_MODIFIABLE, @@ -736,7 +859,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Send this request to the load/store units axi_addrgen_queue = '{ - addr : idx_final_addr_q, + addr : paddr, size : axi_addrgen_q.vew, len : 0, is_load: axi_addrgen_q.is_load @@ -744,29 +867,124 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_addrgen_queue_push = 1'b1; // Account for the requested operands - axi_addrgen_d.len = axi_addrgen_q.len - 1; + axi_addrgen_d.len = axi_addrgen_q.len - 1; + // Calculate the addresses for the next iteration, adding the correct stride + // NOTE: there is no need to check for misaligned erros, since the stride always produces EEW-aligned to the first address + axi_addrgen_d.addr = axi_addrgen_q.addr + axi_addrgen_q.stride; + end : addr_valid + end : strided + else begin : indexed + automatic logic [riscv::PLEN-1:0] idx_final_paddr; + ////////////////////// + // Indexed access // + ////////////////////// + // TODO: check if idx_vaddr_valid_q is stable + if (idx_vaddr_valid_q) begin : idx_vaddr_valid_q - // Check if the address does generate an exception - if (is_addr_error(idx_final_addr_q, axi_addrgen_q.vew)) begin + // Check if the virtual address generates an exception + // NOTE: we can do this even before address translation, since the + // page offset (2^12) is the same for both physical and virtual addresses + if (is_addr_error(idx_final_vaddr_q, 
axi_addrgen_q.vew)) begin : eew_misaligned_error // Generate an error idx_op_error_d = 1'b1; // Forward next vstart info to the dispatcher - addrgen_error_vl_d = addrgen_req.len - axi_addrgen_q.len - 1; + addrgen_exception_vl_d = addrgen_req.len - axi_addrgen_q.len - 1; addrgen_req_ready = 1'b1; axi_addrgen_state_d = AXI_ADDRGEN_IDLE; - end - - // Finished generating AXI requests - if (axi_addrgen_d.len == 0) begin - addrgen_req_ready = 1'b1; - axi_addrgen_state_d = AXI_ADDRGEN_IDLE; - end + end : eew_misaligned_error + else begin : aligned_vaddress + if ( en_ld_st_translation_i ) begin : en_ld_st_translation + // Request an address translation + mmu_req_o = 1'b1; + mmu_vaddr_o = idx_final_vaddr_q; + axi_addrgen_state_d = AXI_ADDRGEN_WAIT_TRANSLATION; + end : en_ld_st_translation + + // Mux target address + idx_final_paddr = ( en_ld_st_translation_i ) ? mmu_paddr_i : idx_final_vaddr_q; + + // Either we got a valid address translation from the MMU + // or virtual memory is disabled + if ( mmu_valid_i | !en_ld_st_translation_i ) begin : addr_valid + // We consumed a word + idx_vaddr_ready_d = 1'b1; + + // AR Channel + if (axi_addrgen_q.is_load) begin + axi_ar_o = '{ + addr : idx_final_paddr, + len : 0, + size : axi_addrgen_q.vew, + cache : CACHE_MODIFIABLE, + burst : BURST_INCR, + default: '0 + }; + axi_ar_valid_o = 1'b1; + end + // AW Channel + else begin + axi_aw_o = '{ + addr : idx_final_paddr, + len : 0, + size : axi_addrgen_q.vew, + cache : CACHE_MODIFIABLE, + burst : BURST_INCR, + default: '0 + }; + axi_aw_valid_o = 1'b1; + end + + // Send this request to the load/store units + axi_addrgen_queue = '{ + addr : idx_final_paddr, + size : axi_addrgen_q.vew, + len : 0, + is_load: axi_addrgen_q.is_load + }; + axi_addrgen_queue_push = 1'b1; + + // Account for the requested operands + axi_addrgen_d.len = axi_addrgen_q.len - 1; + end : addr_valid + end : aligned_vaddress + end : idx_vaddr_valid_q + end : indexed + + // Finished generating AXI requests + if 
(axi_addrgen_d.len == 0) begin + addrgen_req_ready = 1'b1; + axi_addrgen_state_d = AXI_ADDRGEN_IDLE; + if ( en_ld_st_translation_i ) begin + last_translation_completed = 1'b1; end end - end - end - end - endcase + end : start_req + end : axi_ax_idle + end : axi_addrgen_state_AXI_ADDRGEN_REQUESTING + + AXI_ADDRGEN_WAIT_TRANSLATION : begin : axi_addrgen_state_AXI_ADDRGEN_WAIT_TRANSLATION + // keep request high + mmu_req_o = 1'b1; + + // Wait for MMU to respond + if ( mmu_valid_i ) begin : mmu_valid + // Perform request + axi_addrgen_state_d = AXI_ADDRGEN_REQUESTING; + + // Replace virtual address with translated address + axi_addrgen_d.addr = mmu_paddr_i; + + // Sample MMU exception + if ( mmu_exception_i.valid ) begin : mmu_exception_valid + // the other FSM will pick up the _q on the next cycle + mmu_exception_d = mmu_exception_i; + addrgen_req_ready = 1'b1; + axi_addrgen_state_d = AXI_ADDRGEN_IDLE; + end : mmu_exception_valid + end : mmu_valid + + end : axi_addrgen_state_AXI_ADDRGEN_WAIT_TRANSLATION + endcase // axi_addrgen_state_q end: axi_addrgen always_ff @(posedge clk_i or negedge rst_ni) begin diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 51042ed8e..467ae4a70 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -35,6 +35,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( output pe_resp_t pe_resp_o, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, + input logic addrgen_exception_valid_i, input logic axi_addrgen_req_valid_i, output logic axi_addrgen_req_ready_o, // Interface with the lanes @@ -136,7 +137,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // reading from and writing into the lanes (read_pnt). logic [idx_width(ResultQueueDepth)-1:0] result_queue_write_pnt_d, result_queue_write_pnt_q; logic [idx_width(ResultQueueDepth)-1:0] result_queue_read_pnt_d, result_queue_read_pnt_q; - // We need to count how many valid elements are there in this result queue. 
+ // We need to count how many valid elements (payload_t) are there in this result queue. logic [idx_width(ResultQueueDepth):0] result_queue_cnt_d, result_queue_cnt_q; // Vector to register the final grants from the operand requesters, which indicate // that the result was actually written in the VRF (while the normal grant just says @@ -174,33 +175,37 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; // Interface with the main sequencer - pe_resp_t pe_resp; + pe_resp_t pe_resp_d; // Remaining bytes of the current instruction in the issue phase - vlen_t issue_cnt_d, issue_cnt_q; + vlen_t issue_cnt_bytes_d, issue_cnt_bytes_q; // Remaining bytes of the current instruction in the commit phase - vlen_t commit_cnt_d, commit_cnt_q; + vlen_t commit_cnt_bytes_d, commit_cnt_bytes_q; // Pointers // // We need several pointers to copy data from the memory interface // into the VRF. Namely, we need: // - A counter of how many beats are left in the current AXI burst - axi_pkg::len_t len_d, len_q; + axi_pkg::len_t axi_len_d, axi_len_q; // - A pointer to which byte in the current R beat we are reading data from. - logic [idx_width(AxiDataWidth/8):0] r_pnt_d, r_pnt_q; + logic [idx_width(AxiDataWidth/8):0] axi_r_byte_pnt_d, axi_r_byte_pnt_q; // - A pointer to which byte in the full VRF word we are writing data into. 
- logic [idx_width(DataWidth*NrLanes/8):0] vrf_pnt_d, vrf_pnt_q; + logic [idx_width(DataWidth*NrLanes/8):0] vrf_word_byte_pnt_d, vrf_word_byte_pnt_q; + + localparam unsigned DataWidthB = DataWidth / 8; + + vlen_t vstart_lane; always_comb begin: p_vldu // Maintain state vinsn_queue_d = vinsn_queue_q; - issue_cnt_d = issue_cnt_q; - commit_cnt_d = commit_cnt_q; + issue_cnt_bytes_d = issue_cnt_bytes_q; + commit_cnt_bytes_d = commit_cnt_bytes_q; - len_d = len_q; - r_pnt_d = r_pnt_q; - vrf_pnt_d = vrf_pnt_q; + axi_len_d = axi_len_q; + axi_r_byte_pnt_d = axi_r_byte_pnt_q; + vrf_word_byte_pnt_d = vrf_word_byte_pnt_q; result_queue_d = result_queue_q; result_queue_valid_d = result_queue_valid_q; @@ -215,7 +220,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // We are not ready, by default axi_addrgen_req_ready_o = 1'b0; - pe_resp = '0; + pe_resp_d = '0; axi_r_ready_o = 1'b0; mask_ready_o = 1'b0; load_complete_o = 1'b0; @@ -232,128 +237,168 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // - The Address Generator sent us the data about the corresponding AR beat // - There is place in the result queue to write the data read from the R channel if (axi_r_valid_i && axi_addrgen_req_valid_i - && axi_addrgen_req_i.is_load && !result_queue_full) begin + && axi_addrgen_req_i.is_load && !result_queue_full) begin : axi_r_beat_read // Bytes valid in the current R beat // If non-unit strided load, we do not progress within the beat automatic shortint unsigned lower_byte = beat_lower_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); automatic shortint unsigned upper_byte = beat_upper_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); - + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); + // Is there a vector instruction 
ready to be issued? // Do we have the operands for it? - if (vinsn_issue_valid && (vinsn_issue_q.vm || (|mask_valid_i))) begin + if (vinsn_issue_valid && (vinsn_issue_q.vm || (|mask_valid_i))) begin : operands_valid // Account for the issued bytes // How many bytes are valid in this VRF word - automatic vlen_t vrf_valid_bytes = NrLanes * 8 - vrf_pnt_q; + automatic vlen_t vrf_valid_bytes = (NrLanes * DataWidthB) - vrf_word_byte_pnt_q; // How many bytes are valid in this instruction - automatic vlen_t vinsn_valid_bytes = issue_cnt_q - vrf_pnt_q; + automatic vlen_t vinsn_valid_bytes = issue_cnt_bytes_q - vrf_word_byte_pnt_q; // How many bytes are valid in this AXI word - automatic vlen_t axi_valid_bytes = upper_byte - lower_byte - r_pnt_q + 1; + automatic vlen_t axi_valid_bytes = upper_byte - lower_byte - axi_r_byte_pnt_q + 1; + // How many bytes are we committing? automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; - valid_bytes = issue_cnt_q < NrLanes * 8 ? vinsn_valid_bytes : vrf_valid_bytes; - valid_bytes = valid_bytes < axi_valid_bytes ? valid_bytes : axi_valid_bytes; + valid_bytes = ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) ? vinsn_valid_bytes : vrf_valid_bytes; + valid_bytes = ( valid_bytes < axi_valid_bytes ) ? valid_bytes : axi_valid_bytes; - r_pnt_d = r_pnt_q + valid_bytes; - vrf_pnt_d = vrf_pnt_q + valid_bytes; + // Bump R beat and VRF word pointers + axi_r_byte_pnt_d = axi_r_byte_pnt_q + valid_bytes; + vrf_word_byte_pnt_d = vrf_word_byte_pnt_q + valid_bytes; // Copy data from the R channel into the result queue - for (int axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin + for (int unsigned axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : axi_r_to_result_queue // Is this byte a valid byte in the R beat? 
- if (axi_byte >= lower_byte + r_pnt_q && axi_byte <= upper_byte) begin + if ( ( axi_byte >= ( lower_byte + axi_r_byte_pnt_q ) ) && + ( axi_byte <= upper_byte ) + ) begin : is_axi_r_byte // Map axi_byte to the corresponding byte in the VRF word (sequential) - automatic int vrf_seq_byte = axi_byte - lower_byte - r_pnt_q + vrf_pnt_q; + automatic int unsigned vrf_seq_byte = axi_byte - lower_byte - axi_r_byte_pnt_q + vrf_word_byte_pnt_q; // And then shuffle it - automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.vtype.vsew); + automatic int unsigned vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.vtype.vsew); // Is this byte a valid byte in the VRF word? - if (vrf_seq_byte < issue_cnt_q && vrf_seq_byte < NrLanes * 8) begin + if (vrf_seq_byte < issue_cnt_bytes_q && vrf_seq_byte < (NrLanes * DataWidthB)) begin : is_vrf_byte // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? - automatic int vrf_lane = vrf_byte >> 3; - automatic int vrf_offset = vrf_byte[2:0]; + automatic int unsigned vrf_offset = vrf_byte[2:0]; + // Consider also vstart and make sure this index wraps around the number of lane + automatic int unsigned vrf_lane = (vrf_byte >> 3); + // Adjust lane selection w.r.t. 
vstart + vrf_lane += vinsn_issue_q.vstart[idx_width(NrLanes)-1:0]; + if ( vrf_lane >= NrLanes ) begin : vstart_lane_adjust + vrf_lane -= NrLanes; + end : vstart_lane_adjust + // Copy data and byte strobe result_queue_d[result_queue_write_pnt_q][vrf_lane].wdata[8*vrf_offset +: 8] = axi_r_i.data[8*axi_byte +: 8]; result_queue_d[result_queue_write_pnt_q][vrf_lane].be[vrf_offset] = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset]; - end - end - end - - // Initialize id and addr fields of the result queue requests - for (int lane = 0; lane < NrLanes; lane++) begin + end : is_vrf_byte + end : is_axi_r_byte + end : axi_r_to_result_queue + + for (int unsigned lane = 0; lane < NrLanes; lane++) begin : compute_vrf_addr + automatic vlen_t issue_cnt_elems; + // elements per lane (each lane processes num elements / NrLanes) + automatic vlen_t elem_left_per_lane; + // 64-bit aligned address + automatic vlen_t lane_word_offset; + // How many elements in the vector body + automatic vlen_t elem_body_count; + // vstart value local ot the lane + automatic vlen_t vstart_lane; + + // Compute VRF chunk address per lane + elem_body_count = vinsn_issue_q.vl - vinsn_issue_q.vstart; + issue_cnt_elems = issue_cnt_bytes_q >> unsigned'(vinsn_issue_q.vtype.vsew); + elem_left_per_lane = ( elem_body_count - issue_cnt_elems ) / NrLanes; + lane_word_offset = elem_left_per_lane >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)); + + vstart_lane = vinsn_issue_q.vstart / NrLanes; + // If lane_id < (vstart % NrLanes), this lane needs to execute one micro-operation less. 
+ if ( lane < vinsn_issue_q.vstart[idx_width(NrLanes)-1:0] ) begin : vstart_lane_adjust + vstart_lane += 1; + end : vstart_lane_adjust + + // Store in result queue + result_queue_d[result_queue_write_pnt_q][lane].addr = vaddr(vinsn_issue_q.vd, NrLanes) + lane_word_offset + vstart_lane; result_queue_d[result_queue_write_pnt_q][lane].id = vinsn_issue_q.id; - result_queue_d[result_queue_write_pnt_q][lane].addr = vaddr(vinsn_issue_q.vd, NrLanes) + - (((vinsn_issue_q.vl - (issue_cnt_q >> int'(vinsn_issue_q.vtype.vsew))) / NrLanes) >> - (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); - end - end + end : compute_vrf_addr + end : operands_valid // We have a word ready to be sent to the lanes - if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_q) begin + if (vrf_word_byte_pnt_d == (NrLanes * DataWidthB) || vrf_word_byte_pnt_d == issue_cnt_bytes_q) begin : vrf_word_ready // Increment result queue pointers and counters result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) + if (result_queue_write_pnt_q == ResultQueueDepth-1) begin : result_queue_write_pnt_overflow result_queue_write_pnt_d = '0; - else + end : result_queue_write_pnt_overflow + else begin : result_queue_write_pnt_increment result_queue_write_pnt_d = result_queue_write_pnt_q + 1; + end : result_queue_write_pnt_increment // Trigger the request signal + // TODO: check if triggering all lanes is actually necessary here result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; // Acknowledge the mask operands mask_ready_o = !vinsn_issue_q.vm; // Reset the pointer in the VRF word - vrf_pnt_d = '0; + vrf_word_byte_pnt_d = '0; // Account for the results that were issued - issue_cnt_d = issue_cnt_q - NrLanes * 8; - if (issue_cnt_q < NrLanes * 8) - issue_cnt_d = '0; - end + issue_cnt_bytes_d = issue_cnt_bytes_q - (NrLanes * DataWidthB); + if (issue_cnt_bytes_q < (NrLanes * DataWidthB)) begin : issue_cnt_bytes_overflow + issue_cnt_bytes_d = '0; + end : issue_cnt_bytes_overflow + end : 
vrf_word_ready // Consumed all valid bytes in this R beat - if (r_pnt_d == upper_byte - lower_byte + 1 || issue_cnt_d == '0) begin + if ( ( axi_r_byte_pnt_d == ( upper_byte - lower_byte + 1 ) ) || ( issue_cnt_bytes_d == '0 ) ) begin : axi_r_beat_finish // Request another beat axi_r_ready_o = 1'b1; - r_pnt_d = '0; + axi_r_byte_pnt_d = '0; // Account for the beat we consumed - len_d = len_q + 1; - end + axi_len_d = axi_len_q + 1; + end : axi_r_beat_finish // Consumed all beats from this burst - if ($unsigned(len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin + if ($unsigned(axi_len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin : axi_finish // Reset AXI pointers - len_d = '0; - r_pnt_d = '0; + axi_len_d = '0; + axi_r_byte_pnt_d = '0; // Wait for another AXI request axi_addrgen_req_ready_o = 1'b1; - end + end : axi_finish // Finished issuing results - if (vinsn_issue_valid && issue_cnt_d == '0) begin + if (vinsn_issue_valid && (issue_cnt_bytes_d == '0)) begin : vrf_results_finish // Increment vector instruction queue pointers and counters vinsn_queue_d.issue_cnt -= 1; - if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) + if (vinsn_queue_q.issue_pnt == (VInsnQueueDepth-1)) begin : issue_pnt_overflow vinsn_queue_d.issue_pnt = '0; - else + end : issue_pnt_overflow + else begin : issue_pnt_increment vinsn_queue_d.issue_pnt += 1; + end : issue_pnt_increment // Prepare for the next vector instruction - if (vinsn_queue_d.issue_cnt != 0) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl << int'(vinsn_queue_q.vinsn[ - vinsn_queue_d.issue_pnt].vtype.vsew); - end - end + if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update + issue_cnt_bytes_d = ( + vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart + ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew); + end : issue_cnt_bytes_update + end : vrf_results_finish + end : axi_r_beat_read 
////////////////////////////////// // Write results into the VRF // ////////////////////////////////// - for (int lane = 0; lane < NrLanes; lane++) begin: result_write + for (int unsigned lane = 0; lane < NrLanes; lane++) begin: vrf_result_write ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane]; ldu_result_addr_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].addr; ldu_result_id_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].id; @@ -365,39 +410,43 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Received a grant from the VRF. // Deactivate the request, but do not bump the pointers for now. - if (ldu_result_req_o[lane] && ldu_result_gnt_i[lane]) begin + if (ldu_result_req_o[lane] && ldu_result_gnt_i[lane]) begin : vrf_grant result_queue_valid_d[result_queue_read_pnt_q][lane] = 1'b0; result_queue_d[result_queue_read_pnt_q][lane] = '0; // Reset the final gnt vector since we are now waiting for another final gnt result_final_gnt_d[lane] = 1'b0; - end - end: result_write + end : vrf_grant + end: vrf_result_write // All lanes accepted the VRF request // Wait for all the final grants, to be sure that all the results were written back if (!(|result_queue_valid_d[result_queue_read_pnt_q]) && - (&result_final_gnt_d || commit_cnt_q > (NrLanes * 8))) + (&result_final_gnt_d || commit_cnt_bytes_q > (NrLanes * DataWidthB))) begin : wait_for_write_back // There is something waiting to be written - if (!result_queue_empty) begin + if (!result_queue_empty) begin : result_available // Increment the read pointer - if (result_queue_read_pnt_q == ResultQueueDepth-1) + if (result_queue_read_pnt_q == (ResultQueueDepth-1)) begin : result_queue_read_pnt_overflow result_queue_read_pnt_d = 0; - else + end : result_queue_read_pnt_overflow + else begin : result_queue_read_pnt_increment result_queue_read_pnt_d = result_queue_read_pnt_q + 1; + end : result_queue_read_pnt_increment // Decrement the counter of results waiting to be written 
result_queue_cnt_d -= 1; // Decrement the counter of remaining vector elements waiting to be written - commit_cnt_d = commit_cnt_q - NrLanes * 8; - if (commit_cnt_q < (NrLanes * 8)) - commit_cnt_d = '0; - end + commit_cnt_bytes_d = commit_cnt_bytes_q - (NrLanes * DataWidthB); + if (commit_cnt_bytes_q < (NrLanes * DataWidthB)) begin : commit_cnt_bytes_overflow + commit_cnt_bytes_d = '0; + end : commit_cnt_bytes_overflow + end : result_available + end : wait_for_write_back // Finished committing the results of a vector instruction - if (vinsn_commit_valid && commit_cnt_d == '0) begin + if (vinsn_commit_valid && commit_cnt_bytes_d == '0) begin : vinsn_done // Mark the vector instruction as being done - pe_resp.vinsn_done[vinsn_commit.id] = 1'b1; + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; // Signal complete load load_complete_o = 1'b1; @@ -411,51 +460,62 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Update the commit counter for the next instruction if (vinsn_queue_d.commit_cnt != '0) - commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl << int'(vinsn_queue_q.vinsn[ - vinsn_queue_d.commit_pnt].vtype.vsew); - end + commit_cnt_bytes_d = ( + vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart + ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew); + end : vinsn_done + + // Clear instruction queue in case of exceptions from addrgen + if ( addrgen_exception_valid_i ) begin : exception + // Signal done to sequencer + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; + // Clear counters and flags + end : exception ////////////////////////////// // Accept new instruction // ////////////////////////////// if (!vinsn_queue_full && pe_req_valid_i && !vinsn_running_q[pe_req_i.id] && - pe_req_i.vfu == VFU_LoadUnit) begin + pe_req_i.vfu == VFU_LoadUnit) begin : pe_req_valid vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = pe_req_i; vinsn_running_d[pe_req_i.id] = 1'b1; // Initialize 
counters - if (vinsn_queue_d.issue_cnt == '0) - issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); - if (vinsn_queue_d.commit_cnt == '0) - commit_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); + if (vinsn_queue_d.issue_cnt == '0) begin : issue_cnt_bytes_init + issue_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); + end : issue_cnt_bytes_init + if (vinsn_queue_d.commit_cnt == '0) begin : commit_cnt_bytes_init + commit_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); + end : commit_cnt_bytes_init // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; vinsn_queue_d.issue_cnt += 1; vinsn_queue_d.commit_cnt += 1; - end + end : pe_req_valid end: p_vldu always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - vinsn_running_q <= '0; - issue_cnt_q <= '0; - commit_cnt_q <= '0; - len_q <= '0; - r_pnt_q <= '0; - vrf_pnt_q <= '0; - pe_resp_o <= '0; - result_final_gnt_q <= '0; + vinsn_running_q <= '0; + issue_cnt_bytes_q <= '0; + commit_cnt_bytes_q <= '0; + axi_len_q <= '0; + axi_r_byte_pnt_q <= '0; + vrf_word_byte_pnt_q <= '0; + pe_resp_o <= '0; + result_final_gnt_q <= '0; end else begin - vinsn_running_q <= vinsn_running_d; - issue_cnt_q <= issue_cnt_d; - commit_cnt_q <= commit_cnt_d; - len_q <= len_d; - r_pnt_q <= r_pnt_d; - vrf_pnt_q <= vrf_pnt_d; - pe_resp_o <= pe_resp; - result_final_gnt_q <= result_final_gnt_d; + vinsn_running_q <= vinsn_running_d; + issue_cnt_bytes_q <= issue_cnt_bytes_d; + commit_cnt_bytes_q <= commit_cnt_bytes_d; + axi_len_q <= axi_len_d; + axi_r_byte_pnt_q <= axi_r_byte_pnt_d; + vrf_word_byte_pnt_q <= vrf_word_byte_pnt_d; + pe_resp_o <= pe_resp_d; + result_final_gnt_q <= result_final_gnt_d; end end diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index aa2e05283..68cd3add5 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -42,8 +42,8 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; 
#( output logic [1:0] pe_req_ready_o, // Load (0) and Store (1) units output pe_resp_t [1:0] pe_resp_o, // Load (0) and Store (1) units output logic addrgen_ack_o, - output logic addrgen_error_o, - output vlen_t addrgen_error_vl_o, + output ariane_pkg::exception_t addrgen_exception_o, + output vlen_t addrgen_exception_vl_o, // Interface with the lanes // Store unit operands input elen_t [NrLanes-1:0] stu_operand_i, @@ -59,6 +59,25 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrLanes-1:0] mask_valid_i, output logic vldu_mask_ready_o, output logic vstu_mask_ready_o, + + // CSR input + input logic en_ld_st_translation_i, + + // Interface with CVA6's sv39 MMU + // This is everything the MMU can provide, it might be overcomplete for Ara and some signals be useless + output ariane_pkg::exception_t mmu_misaligned_ex_o, + output logic mmu_req_o, // request address translation + output logic [riscv::VLEN-1:0] mmu_vaddr_o, // virtual address out + output logic mmu_is_store_o, // the translation is requested by a store + // if we need to walk the page table we can't grant in the same cycle + // Cycle 0 + input logic mmu_dtlb_hit_i, // sent in the same cycle as the request if translation hits in the DTLB + input logic [riscv::PPNW-1:0] mmu_dtlb_ppn_i, // ppn (send same cycle as hit) + // Cycle 1 + input logic mmu_valid_i, // translation is valid + input logic [riscv::PLEN-1:0] mmu_paddr_i, // translated address + input ariane_pkg::exception_t mmu_exception_i, // address translation threw an exception + // Results output logic [NrLanes-1:0] ldu_result_req_o, output vid_t [NrLanes-1:0] ldu_result_id_o, @@ -69,6 +88,11 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrLanes-1:0] ldu_result_final_gnt_i ); + logic load_complete, store_complete; + logic addrgen_exception_load, addrgen_exception_store; + assign load_complete_o = load_complete | addrgen_exception_load; + assign store_complete_o = store_complete | addrgen_exception_store; + 
/////////////////// // Definitions // /////////////////// @@ -89,8 +113,8 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .aw_chan_t(axi_aw_t ), .w_chan_t (axi_w_t ), .b_chan_t (axi_b_t ), - .req_t (axi_req_t ), - .resp_t (axi_resp_t) + .axi_req_t (axi_req_t ), + .axi_resp_t(axi_resp_t) ) i_axi_cut ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -133,8 +157,10 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_req_valid_i (pe_req_valid_i ), .pe_vinsn_running_i (pe_vinsn_running_i ), .addrgen_ack_o (addrgen_ack_o ), - .addrgen_error_o (addrgen_error_o ), - .addrgen_error_vl_o (addrgen_error_vl_o ), + .addrgen_exception_o ( addrgen_exception_o ), + .addrgen_exception_vl_o ( addrgen_exception_vl_o ), + .addrgen_exception_load_o ( addrgen_exception_load ), + .addrgen_exception_store_o ( addrgen_exception_store ), // Interface with the lanes .addrgen_operand_i (addrgen_operand_i ), .addrgen_operand_target_fu_i(addrgen_operand_target_fu_i), @@ -144,7 +170,19 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_addrgen_req_o (axi_addrgen_req ), .axi_addrgen_req_valid_o (axi_addrgen_req_valid ), .ldu_axi_addrgen_req_ready_i(ldu_axi_addrgen_req_ready ), - .stu_axi_addrgen_req_ready_i(stu_axi_addrgen_req_ready ) + .stu_axi_addrgen_req_ready_i(stu_axi_addrgen_req_ready ), + + // CSR input + .en_ld_st_translation_i, + .mmu_misaligned_ex_o, + .mmu_req_o, + .mmu_vaddr_o, + .mmu_is_store_o, + .mmu_dtlb_hit_i, + .mmu_dtlb_ppn_i, + .mmu_valid_i, + .mmu_paddr_i, + .mmu_exception_i ); //////////////////////// @@ -165,7 +203,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_r_valid_i (axi_resp.r_valid ), .axi_r_ready_o (axi_req.r_ready ), // Interface with the dispatcher - .load_complete_o (load_complete_o ), + .load_complete_o (load_complete ), // Interface with the main sequencer .pe_req_i (pe_req_i ), .pe_req_valid_i (pe_req_valid_i ), @@ -173,6 +211,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_req_ready_o 
(pe_req_ready_o[OffsetLoad]), .pe_resp_o (pe_resp_o[OffsetLoad] ), // Interface with the address generator + .addrgen_exception_valid_i ( addrgen_ack_o & addrgen_exception_o.valid ), .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ), .axi_addrgen_req_ready_o(ldu_axi_addrgen_req_ready ), @@ -213,7 +252,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_b_ready_o (axi_req.b_ready ), // Interface with the dispatcher .store_pending_o (store_pending_o ), - .store_complete_o (store_complete_o ), + .store_complete_o (store_complete ), // Interface with the main sequencer .pe_req_i (pe_req_i ), .pe_req_valid_i (pe_req_valid_i ), @@ -221,6 +260,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_req_ready_o (pe_req_ready_o[OffsetStore]), .pe_resp_o (pe_resp_o[OffsetStore] ), // Interface with the address generator + .addrgen_exception_valid_i ( addrgen_ack_o & addrgen_exception_o.valid ), .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ), .axi_addrgen_req_ready_o(stu_axi_addrgen_req_ready ), diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv index 9580f59b0..f6e5e38ca 100644 --- a/hardware/src/vlsu/vstu.sv +++ b/hardware/src/vlsu/vstu.sv @@ -46,6 +46,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( output pe_resp_t pe_resp_o, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, + input logic addrgen_exception_valid_i, input logic axi_addrgen_req_valid_i, output logic axi_addrgen_req_ready_o, // Interface with the lanes @@ -63,12 +64,14 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( import axi_pkg::beat_upper_byte; import axi_pkg::BURST_INCR; + localparam unsigned DataWidthB = DataWidth / 8; + /////////////////////// // Spill registers // /////////////////////// elen_t [NrLanes-1:0] stu_operand; - logic [NrLanes-1:0] stu_operand_valid; + logic [NrLanes-1:0] stu_operand_valid_lanes; logic stu_operand_ready; for 
(genvar lane = 0; lane < NrLanes; lane++) begin: gen_regs @@ -83,7 +86,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( .valid_i (stu_operand_valid_i[lane]), .ready_o (stu_operand_ready_o[lane]), .data_o (stu_operand[lane] ), - .valid_o (stu_operand_valid[lane] ), + .valid_o (stu_operand_valid_lanes[lane] ), .ready_i (stu_operand_ready ) ); end: gen_regs @@ -153,30 +156,47 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // Store Unit // ////////////////// + // NOTE: these are out here only for debug visibility, they could go in p_vstu as automatic variables + int unsigned vrf_seq_byte; + int unsigned vrf_byte ; + vlen_t vrf_valid_bytes ; + vlen_t vinsn_valid_bytes; + vlen_t axi_valid_bytes ; + logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; + + // Vector instructions currently running logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; // Interface with the main sequencer - pe_resp_t pe_resp; + pe_resp_t pe_resp_d; // Remaining bytes of the current instruction in the issue phase - vlen_t issue_cnt_d, issue_cnt_q; + vlen_t issue_cnt_bytes_d, issue_cnt_bytes_q; // Pointers // // We need several pointers to copy data to the memory interface // from the VRF. Namely, we need: // - A counter of how many beats are left in the current AXI burst - axi_pkg::len_t len_d, len_q; + axi_pkg::len_t axi_len_d, axi_len_q; // - A pointer to which byte in the full VRF word we are reading data from. 
logic [idx_width(DataWidth*NrLanes/8):0] vrf_pnt_d, vrf_pnt_q; always_comb begin: p_vstu + // NOTE: these are out here only for debug visibility, they could go in p_vstu as automatic variables + vrf_seq_byte = '0; + vrf_byte = '0; + vrf_valid_bytes = '0; + vinsn_valid_bytes = '0; + axi_valid_bytes = '0; + valid_bytes = '0; + // Maintain state vinsn_queue_d = vinsn_queue_q; - issue_cnt_d = issue_cnt_q; + issue_cnt_bytes_d = issue_cnt_bytes_q; - len_d = len_q; + axi_len_d = axi_len_q; vrf_pnt_d = vrf_pnt_q; // Vector instructions currently running @@ -184,7 +204,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // We are not ready, by default axi_addrgen_req_ready_o = 1'b0; - pe_resp = '0; + pe_resp_d = '0; axi_w_o = '0; axi_w_valid_o = 1'b0; axi_b_ready_o = 1'b0; @@ -204,92 +224,130 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // - We received all the operands from the lanes // - The address generator generated an AXI AW request for this write beat // - The AXI subsystem is ready to accept this W beat - if (vinsn_issue_valid && &stu_operand_valid && (vinsn_issue_q.vm || (|mask_valid_i)) && - axi_addrgen_req_valid_i && !axi_addrgen_req_i.is_load && axi_w_ready_i) begin + if (vinsn_issue_valid && + axi_addrgen_req_valid_i && !axi_addrgen_req_i.is_load && axi_w_ready_i) begin : issue_valid // Bytes valid in the current W beat automatic shortint unsigned lower_byte = beat_lower_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); automatic shortint unsigned upper_byte = beat_upper_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); + + // For non-zero vstart values, the last operand read is not going to involve all the lanes + automatic logic [NrLanes-1:0] 
stu_operand_valid; + automatic logic [NrLanes-1:0] mask_valid; + // How many bytes are we committing? + // automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; + // Account for the issued bytes // How many bytes are valid in this VRF word - automatic vlen_t vrf_valid_bytes = NrLanes * 8 - vrf_pnt_q; + vrf_valid_bytes = (NrLanes * DataWidthB) - vrf_pnt_q; // How many bytes are valid in this instruction - automatic vlen_t vinsn_valid_bytes = issue_cnt_q - vrf_pnt_q; + vinsn_valid_bytes = issue_cnt_bytes_q - vrf_pnt_q; // How many bytes are valid in this AXI word - automatic vlen_t axi_valid_bytes = upper_byte - lower_byte + 1; - - // How many bytes are we committing? - automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; - valid_bytes = issue_cnt_q < NrLanes * 8 ? vinsn_valid_bytes : vrf_valid_bytes; - valid_bytes = valid_bytes < axi_valid_bytes ? valid_bytes : axi_valid_bytes; - - vrf_pnt_d = vrf_pnt_q + valid_bytes; - - // Copy data from the operands into the W channel - for (int axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin - // Is this byte a valid byte in the W beat? - if (axi_byte >= lower_byte && axi_byte <= upper_byte) begin - // Map axy_byte to the corresponding byte in the VRF word (sequential) - automatic int vrf_seq_byte = axi_byte - lower_byte + vrf_pnt_q; - // And then shuffle it - automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.eew_vs1); - - // Is this byte a valid byte in the VRF word? - if (vrf_seq_byte < issue_cnt_q) begin - // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? - automatic int vrf_lane = vrf_byte >> 3; - automatic int vrf_offset = vrf_byte[2:0]; - - // Copy data - axi_w_o.data[8*axi_byte +: 8] = stu_operand[vrf_lane][8*vrf_offset +: 8]; - axi_w_o.strb[axi_byte] = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset]; + axi_valid_bytes = upper_byte - lower_byte + 1; + + valid_bytes = ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) ? 
vinsn_valid_bytes : vrf_valid_bytes; + valid_bytes = ( valid_bytes < axi_valid_bytes ) ? valid_bytes : axi_valid_bytes; + + // Adjust valid signals to the next block "operands_ready" + stu_operand_valid = stu_operand_valid_lanes; + for ( int unsigned lane = 0; lane < NrLanes; lane++ ) begin : adjust_operand_valid + // - We are left with fewer bytes than the maximum to issue, + // this means that at least one lane is not going to push us any operand anymore + // - For the lanes which index % NrLanes != 0 + if ( ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) + & ( lane < vinsn_issue_q.vstart[idx_width(NrLanes)-1:0] ) + ) begin : vstart_lane_adjust + stu_operand_valid[lane] |= 1'b1; + end : vstart_lane_adjust + end : adjust_operand_valid + + // TODO: apply the same vstart logic also to mask_valid_i + // For now, assume (vstart % NrLanes == 0) + mask_valid = mask_valid_i; + + // Wait for all expected operands from the lanes + if ( &stu_operand_valid && (vinsn_issue_q.vm || (|mask_valid_i) ) ) begin : operands_ready + vrf_pnt_d = vrf_pnt_q + valid_bytes; + + // Copy data from the operands into the W channel + for (int unsigned axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : stu_operand_to_axi_w + // Is this byte a valid byte in the W beat? + if (axi_byte >= lower_byte && axi_byte <= upper_byte) begin + // Map axi_byte to the corresponding byte in the VRF word (sequential) + vrf_seq_byte = axi_byte - lower_byte + vrf_pnt_q; + // And then shuffle it + vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.eew_vs1); + + // Is this byte a valid byte in the VRF word? + if (vrf_seq_byte < issue_cnt_bytes_q) begin + // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? 
+ automatic int unsigned vrf_offset = vrf_byte[2:0]; + + // Consider also vstart and make sure this index wraps around the number of lanes + // automatic logic [$clog2(NrLanes)-1:0] vrf_lane = (vrf_byte >> 3) + vinsn_issue_q.vstart[idx_width(NrLanes)-1:0]; + automatic int unsigned vrf_lane = (vrf_byte >> 3); + // Adjust lane selection w.r.t. vstart + vrf_lane += vinsn_issue_q.vstart[idx_width(NrLanes)-1:0]; + if ( vrf_lane >= NrLanes ) begin : vstart_lane_adjust + vrf_lane -= NrLanes; + end : vstart_lane_adjust + + // Copy data + axi_w_o.data[8*axi_byte +: 8] = stu_operand[vrf_lane][8*vrf_offset +: 8]; + axi_w_o.strb[axi_byte] = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset]; + end end - end - end - - // Send the W beat - axi_w_valid_o = 1'b1; - // Account for the beat we sent - len_d = len_q + 1; - // We wrote all the beats for this AW burst - if ($unsigned(len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin - axi_w_o.last = 1'b1; - // Ask for another burst by the address generator - axi_addrgen_req_ready_o = 1'b1; - // Reset AXI pointers - len_d = '0; - end - - // We consumed a whole word from the lanes - if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_q) begin - // Reset the pointer in the VRF word - vrf_pnt_d = '0; - // Acknowledge the operands with the lanes - stu_operand_ready = '1; - // Acknowledge the mask operand - mask_ready_o = !vinsn_issue_q.vm; - // Account for the results that were issued - issue_cnt_d = issue_cnt_q - NrLanes * 8; - if (issue_cnt_q < NrLanes * 8) - issue_cnt_d = '0; - end - end + end : stu_operand_to_axi_w + + // Send the W beat + axi_w_valid_o = 1'b1; + // Account for the beat we sent + axi_len_d = axi_len_q + 1; + // We wrote all the beats for this AW burst + if ($unsigned(axi_len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin : beats_complete + axi_w_o.last = 1'b1; + // Ask for another burst by the address generator + axi_addrgen_req_ready_o = 1'b1; + // Reset AXI pointers + axi_len_d = 
'0; + end : beats_complete + + // We consumed a whole word from the lanes + if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_bytes_q) begin : vrf_word_done + // Reset the pointer in the VRF word + vrf_pnt_d = '0; + // Acknowledge the operands with the lanes + stu_operand_ready = '1; + // Acknowledge the mask operand + mask_ready_o = !vinsn_issue_q.vm; + // Account for the results that were issued + issue_cnt_bytes_d = issue_cnt_bytes_q - (NrLanes * DataWidthB); + if (issue_cnt_bytes_q < (NrLanes * DataWidthB)) begin : issue_cnt_bytes_overflow + issue_cnt_bytes_d = '0; + end : issue_cnt_bytes_overflow + end : vrf_word_done + end : operands_ready + end : issue_valid // Finished issuing W beats for this vector store - if (vinsn_issue_valid && issue_cnt_d == 0) begin + if (vinsn_issue_valid && issue_cnt_bytes_d == 0) begin : axi_w_beat_finish // Bump issue counters and pointers of the vector instruction queue vinsn_queue_d.issue_cnt -= 1; - if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) + if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) begin : issue_pnt_overflow vinsn_queue_d.issue_pnt = 0; - else + end : issue_pnt_overflow + else begin : issue_pnt_increment vinsn_queue_d.issue_pnt += 1; + end : issue_pnt_increment - if (vinsn_queue_d.issue_cnt != 0) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl << - int'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew); - end + if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update + issue_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl - + vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart + ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew); + end : issue_cnt_bytes_update + end : axi_w_beat_finish //////////////////////////// // Handle the B channel // @@ -297,63 +355,66 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // TODO: We cannot handle errors on the B channel. // We just acknowledge any AXI requests that come on the B channel. 
- if (axi_b_valid_i) begin + if (axi_b_valid_i) begin : axi_b_valid // Acknowledge the B beat axi_b_ready_o = 1'b1; // Mark the vector instruction as being done - if (vinsn_queue_d.issue_pnt != vinsn_queue_d.commit_pnt) begin + if (vinsn_queue_d.issue_pnt != vinsn_queue_d.commit_pnt) begin : instr_done // Signal complete store store_complete_o = 1'b1; - pe_resp.vinsn_done[vinsn_commit.id] = 1'b1; + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; // Update the commit counters and pointers vinsn_queue_d.commit_cnt -= 1; - if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) + if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) begin : commit_pnt_overflow vinsn_queue_d.commit_pnt = '0; - else + end : commit_pnt_overflow + else begin : commit_pnt_increment vinsn_queue_d.commit_pnt += 1; - end - end + end : commit_pnt_increment + end : instr_done + end : axi_b_valid ////////////////////////////// // Accept new instruction // ////////////////////////////// if (!vinsn_queue_full && pe_req_valid_i && !vinsn_running_q[pe_req_i.id] && - pe_req_i.vfu == VFU_StoreUnit) begin + pe_req_i.vfu == VFU_StoreUnit) begin : issue_cnt_bytes_init vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = pe_req_i; vinsn_running_d[pe_req_i.id] = 1'b1; // Initialize counters - if (vinsn_queue_d.issue_cnt == '0) - issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); + if (vinsn_queue_d.issue_cnt == '0) begin : issue_cnt_bytes_init + issue_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); + end : issue_cnt_bytes_init // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; vinsn_queue_d.issue_cnt += 1; vinsn_queue_d.commit_cnt += 1; - end + end : issue_cnt_bytes_init end: p_vstu always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin vinsn_running_q <= '0; - issue_cnt_q <= '0; + issue_cnt_bytes_q <= '0; - len_q <= '0; + axi_len_q <= '0; vrf_pnt_q <= '0; pe_resp_o <= '0; end else begin vinsn_running_q <= vinsn_running_d; - 
issue_cnt_q <= issue_cnt_d; + issue_cnt_bytes_q <= issue_cnt_bytes_d; - len_q <= len_d; + axi_len_q <= axi_len_d; vrf_pnt_q <= vrf_pnt_d; - pe_resp_o <= pe_resp; + pe_resp_o <= pe_resp_d; end end diff --git a/hardware/tb/ara_testharness.sv b/hardware/tb/ara_testharness.sv index 09901b262..84edf4c8e 100644 --- a/hardware/tb/ara_testharness.sv +++ b/hardware/tb/ara_testharness.sv @@ -153,7 +153,7 @@ module ara_testharness #( // If disabled if (!runtime_cnt_en_q) // Start only if the software allowed the enable and we detect the first V instruction - runtime_cnt_en_d = i_ara_soc.i_system.i_ara.acc_req_valid_i & cnt_en_mask; + runtime_cnt_en_d = i_ara_soc.i_system.i_ara.acc_req_i.req_valid & cnt_en_mask; // If enabled if (runtime_cnt_en_q) // Stop counting only if the software disabled the counter and Ara returned idle @@ -177,14 +177,14 @@ module ara_testharness #( runtime_to_be_updated_d = runtime_to_be_updated_q; // Assert the update flag upon a new valid vector instruction - if (!runtime_to_be_updated_q && i_ara_soc.i_system.i_ara.acc_req_valid_i) begin + if (!runtime_to_be_updated_q && i_ara_soc.i_system.i_ara.acc_req_i.req_valid) begin runtime_to_be_updated_d = 1'b1; end // Update the internal runtime and reset the update flag if (runtime_to_be_updated_q && i_ara_soc.i_system.i_ara.ara_idle && - !i_ara_soc.i_system.i_ara.acc_req_valid_i) begin + !i_ara_soc.i_system.i_ara.acc_req_i.req_valid) begin runtime_buf_d = runtime_cnt_q; runtime_to_be_updated_d = 1'b0; end diff --git a/hardware/tb/dpi/elfloader.cc b/hardware/tb/dpi/elfloader.cc index 60f06c358..7e0528f54 120000 --- a/hardware/tb/dpi/elfloader.cc +++ b/hardware/tb/dpi/elfloader.cc @@ -1 +1 @@ -../../deps/cva6/tb/dpi/elfloader.cc \ No newline at end of file +../../deps/cva6/corev_apu/tb/dpi/elfloader.cc \ No newline at end of file diff --git a/scripts/check_cycles.py b/scripts/check_cycles.py index 7b7040c07..24a861d1a 100644 --- a/scripts/check_cycles.py +++ b/scripts/check_cycles.py @@ -24,19 +24,19 @@ 
import numpy as np threshold = { - 'imatmul' : 300, - 'fmatmul' : 300, - 'iconv2d' : 300, - 'fconv2d' : 300, - 'fconv3d' : 300, - 'jacobi2d' : 300, - 'dropout' : 300, - 'fft' : 300, - 'dwt' : 300, - 'exp' : 300, - 'softmax' : 300, - 'pathfinder' : 300, - 'roi_align' : 300, + 'imatmul' : 500, + 'fmatmul' : 500, + 'iconv2d' : 500, + 'fconv2d' : 500, + 'fconv3d' : 500, + 'jacobi2d' : 500, + 'dropout' : 500, + 'fft' : 500, + 'dwt' : 500, + 'exp' : 500, + 'softmax' : 500, + 'pathfinder' : 500, + 'roi_align' : 500, } skip_check = {