diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2a1c8e11..1e748b5e9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -235,6 +235,7 @@ jobs: compile-ara: runs-on: ubuntu-20.04 strategy: + fail-fast: false matrix: ara_config: [2_lanes, 4_lanes, 8_lanes, 16_lanes] needs: ["tc-verilator", "tc-isa-sim"] diff --git a/.gitmodules b/.gitmodules index f7d26db14..32546e093 100644 --- a/.gitmodules +++ b/.gitmodules @@ -21,9 +21,6 @@ [submodule "hardware/deps/common_verification"] path = hardware/deps/common_verification url = https://github.com/pulp-platform/common_verification.git -[submodule "hardware/deps/cva6"] - path = hardware/deps/cva6 - url = https://github.com/pulp-platform/cva6.git [submodule "toolchain/newlib"] path = toolchain/newlib url = https://sourceware.org/git/newlib-cygwin.git @@ -32,3 +29,9 @@ path = toolchain/riscv-llvm url = https://github.com/llvm/llvm-project.git ignore = dirty +[submodule "hardware/deps/apb"] + path = hardware/deps/apb + url = https://github.com/pulp-platform/apb.git +[submodule "hardware/deps/cva6"] + path = hardware/deps/cva6 + url = git@github.com:MaistoV/cva6_fork.git diff --git a/Bender.lock b/Bender.lock index ae080d9ac..cf547586a 100644 --- a/Bender.lock +++ b/Bender.lock @@ -1,36 +1,62 @@ --- packages: + apb: + revision: 77ddf073f194d44b9119949d2421be59789e69ae + version: 0.2.4 + source: + Git: "https://github.com/pulp-platform/apb.git" + dependencies: + - common_cells axi: - revision: 442ff3375710513623f95944d66cc2bd09b2f155 - version: 0.29.1 + revision: 9251564ed67e3e71adf46dbeba62ef4435d2524c + version: 0.31.1 source: Git: "https://github.com/pulp-platform/axi.git" dependencies: - common_cells - common_verification common_cells: - revision: 015917ff33e5f944e866814f72f2074fb0f4220f - version: 1.22.1 + revision: 53b0b58af2db5bd3c850a7038fae170ed78326bb + version: 1.31.1 source: Git: "https://github.com/pulp-platform/common_cells.git" dependencies: - common_verification - 
tech_cells_generic common_verification: - revision: 6fc76fb013315af9fabbb90b431863d498df2d6d - version: 0.2.0 + revision: 9c07fa860593b2caabd9b5681740c25fac04b878 + version: 0.2.3 source: Git: "https://github.com/pulp-platform/common_verification.git" dependencies: [] cva6: - revision: 3245e44ec49c1cdcd19eb298cd81f0672eaf81ca + revision: 5e2e520696aa63545b91fca38ce340314291be5c version: ~ source: - Git: "https://github.com/pulp-platform/cva6.git" - dependencies: [] + Git: "https://github.com/MaistoV/cva6_fork.git" + dependencies: + - axi + - common_cells + - fpnew + - tech_cells_generic + fpnew: + revision: 3116391bf66660f806b45e212b9949c528b4e270 + version: 0.7.0 + source: + Git: "https://github.com/openhwgroup/cvfpu.git" + dependencies: + - common_cells + - fpu_div_sqrt_mvp + fpu_div_sqrt_mvp: + revision: 86e1f558b3c95e91577c41b2fc452c86b04e85ac + version: 1.0.4 + source: + Git: "https://github.com/pulp-platform/fpu_div_sqrt_mvp.git" + dependencies: + - common_cells tech_cells_generic: - revision: 203038f857158ae4634c47ce0281f402cc2a1344 - version: 0.2.4 + revision: 298b7297d220ba2601d0f24f684f97ff32f61123 + version: 0.2.12 source: Git: "https://github.com/pulp-platform/tech_cells_generic.git" dependencies: diff --git a/Bender.yml b/Bender.yml index e50de82cc..5518043c3 100644 --- a/Bender.yml +++ b/Bender.yml @@ -10,8 +10,9 @@ package: dependencies: axi: { git: "https://github.com/pulp-platform/axi.git", version: 0.29.1 } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", version: 1.22.1 } - cva6: { git: "https://github.com/pulp-platform/cva6.git", rev: acc_port } + cva6: { git: "https://github.com/MaistoV/cva6_fork.git", rev: ara_cheshire } tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.1 } + apb: { git: "https://github.com/pulp-platform/apb.git", version: 0.2.4 } workspace: checkout_dir: "hardware/deps" @@ -27,7 +28,6 @@ sources: # Sources # Level 1 - - hardware/src/axi_to_mem.sv - 
hardware/src/ctrl_registers.sv - hardware/src/cva6_accel_first_pass_decoder.sv - hardware/src/ara_dispatcher.sv diff --git a/apps/.gitignore b/apps/.gitignore index ef412f9ba..9bf00c4f9 100644 --- a/apps/.gitignore +++ b/apps/.gitignore @@ -1,2 +1,4 @@ bin common/link.ld +*.o* +data.S* \ No newline at end of file diff --git a/apps/Makefile b/apps/Makefile index 6bb74f304..c06b873b2 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -21,6 +21,12 @@ APPS_DIR := $(ROOT_DIR) COMMON_DIR := $(ROOT_DIR)/common TESTS_DIR := $(ROOT_DIR)/riscv-tests/isa +# Build environment for Linux +LINUX ?= 0 +ifeq ($(LINUX), 1) +include $(COMMON_DIR)/linux.mk +endif + # This will overwrite the ROOT_DIR variable from the included makefile include $(COMMON_DIR)/runtime.mk include $(COMMON_DIR)/riscv_tests.mk @@ -33,9 +39,9 @@ BINARIES := $(filter-out bin/benchmarks, $(addprefix bin/,$(APPS))) CVA6_EXTENSIONS := rv64ui rv64uc rv64um rv64uf rv64ud rv64si # Atomics are messy, since there is currently no memory region capable of handling them # CVA6_EXTENSIONS := rv64ua -CVA6_BINARIES := $(addprefix bin/, $(cva6_tests)) +CVA6_BINARIES := $(addsuffix $(IS_LINUX_EXTENSION), $(addprefix bin/, $(cva6_tests))) ARA_EXTENSIONS := rv64uv -ARA_BINARIES := $(addprefix bin/, $(ara_tests)) +ARA_BINARIES := $(addsuffix $(IS_LINUX_EXTENSION), $(addprefix bin/, $(ara_tests))) # FFT requires special treatment because of its header files ifeq ($(ENV_DEFINES),) @@ -95,14 +101,18 @@ endef $(foreach app,$(APPS),$(eval $(call app_compile_template_spike,$(app)))) define app_compile_template -bin/$1: $1/data.S.o $(addsuffix .o, $(shell find $(1) -name "*.c" -o -name "*.S")) $(RUNTIME_LLVM) linker_script +bin/$1: $1/data.S.o$$(IS_LINUX_EXTENSION) $(addsuffix .o$$(IS_LINUX_EXTENSION), $(shell find $(1) -name "*.c" -o -name "*.S")) $(RUNTIME_LLVM) linker_script mkdir -p bin/ - $$(RISCV_CC) -Iinclude $(RISCV_CCFLAGS) -o $$@ $$(addsuffix .o, $$(shell find $(1) -name "*.c" -o -name "*.S")) $(RUNTIME_LLVM) $$(RISCV_LDFLAGS) 
-T$$(CURDIR)/common/link.ld - $$(RISCV_OBJDUMP) $$(RISCV_OBJDUMP_FLAGS) -D $$@ > $$@.dump - $$(RISCV_STRIP) $$@ -S --strip-unneeded + $$(RISCV_CC) $(RISCV_CCFLAGS) -o $$@$$(IS_LINUX_EXTENSION) $$(addsuffix .o$$(IS_LINUX_EXTENSION), $$(shell find $(1) -name "*.c" -o -name "*.S")) $(RUNTIME_LLVM) $$(RISCV_LDFLAGS) $$(LD_FLAGS) + $$(RISCV_OBJDUMP) $$(RISCV_OBJDUMP_FLAGS) -D $$@$$(IS_LINUX_EXTENSION) > $$@$$(IS_LINUX_EXTENSION).dump + # Don't strip symbols for Linux build since need them for debug + if [ "$$(IS_LINUX_EXTENSION)" == "" ]; then \ + $$(RISCV_STRIP) $$@$$(IS_LINUX_EXTENSION) -S --strip-unneeded; \ + fi endef $(foreach app,$(APPS),$(eval $(call app_compile_template,$(app)))) + # Make the RISC-V tests riscv_tests: $(CVA6_BINARIES) $(ARA_BINARIES) @@ -111,7 +121,7 @@ TESTS_$(1) := $(addprefix bin/, $($(addsuffix _ara_tests, $1))) bin/$(1)-ara-%: $(TESTS_DIR)/$(1)/%.$(2) $(RUNTIME_GCC) linker_script mkdir -p bin/ - $$(RISCV_CC_GCC) -Iinclude -I$$(TESTS_DIR)/macros/scalar -I$$(TESTS_DIR)/macros/vector $$(RISCV_CCFLAGS_GCC) $$(RISCV_LDFLAGS_GCC) -o $$@ $$< $(RUNTIME_GCC) -T$$(CURDIR)/common/link.ld + $$(RISCV_CC_GCC) -Iinclude -I$$(TESTS_DIR)/macros/scalar -I$$(TESTS_DIR)/macros/vector $$(RISCV_CCFLAGS_GCC) $$(RISCV_LDFLAGS_GCC) -o $$@ $$< $(RUNTIME_GCC) $$(LD_FLAGS) $$(RISCV_OBJDUMP) $$(RISCV_OBJDUMP_FLAGS) -D $$@ > $$@.dump $$(RISCV_STRIP) $$@ -S --strip-unneeded endef @@ -121,7 +131,7 @@ TESTS_$(1) := $(addprefix bin/, $($(addsuffix _ara_tests, $1))) bin/$(1)-ara-%: $(TESTS_DIR)/$(1)/%.$(2) $(RUNTIME_LLVM) linker_script mkdir -p bin/ - $$(RISCV_CC) -Iinclude -I$$(TESTS_DIR)/macros/scalar -I$$(TESTS_DIR)/macros/vector $$(RISCV_CCFLAGS) $$(RISCV_LDFLAGS) -o $$@ $$< $(RUNTIME_LLVM) -T$$(CURDIR)/common/link.ld + $$(RISCV_CC) -Iinclude -I$$(TESTS_DIR)/macros/scalar -I$$(TESTS_DIR)/macros/vector $$(RISCV_CCFLAGS) $$(RISCV_LDFLAGS) -o $$@ $$< $(RUNTIME_LLVM) $$(LD_FLAGS) $$(RISCV_OBJDUMP) $$(RISCV_OBJDUMP_FLAGS) -D $$@ > $$@.dump $$(RISCV_STRIP) $$@ -S 
--strip-unneeded endef @@ -169,13 +179,14 @@ benchmarks_clean: .PHONY: clean clean: riscv_tests_spike_clean benchmarks_clean + rm -vf bin/* rm -vf $(BINARIES) rm -vf $(CVA6_BINARIES) rm -vf $(ARA_BINARIES) rm -vf $(addsuffix .dump,$(BINARIES)) rm -vf $(addsuffix .dump,$(CVA6_BINARIES)) rm -vf $(addsuffix .dump,$(ARA_BINARIES)) - rm -vf $(addsuffix /main.c.o,$(APPS)) + rm -vf $(addsuffix /main.c.o$(IS_LINUX_EXTENSION),$(APPS)) rm -vf $(RUNTIME_GCC) rm -vf $(RUNTIME_LLVM) rm -vf $(RUNTIME_SPIKE) diff --git a/apps/common/linux.mk b/apps/common/linux.mk new file mode 100644 index 000000000..65daa016e --- /dev/null +++ b/apps/common/linux.mk @@ -0,0 +1,47 @@ +IS_LINUX_EXTENSION := .linux + +CVA6_SDK ?= /usr/scratch/fenga3/vmaisto/cva6-sdk_fork_backup +ROOTFS_DEST ?= $(CVA6_SDK)/rootfs/ara/apps/bin +cp_to_rootfs: + mkdir -p $(ROOTFS_DEST) + @echo "[Copying binaries to rootfs directory $(ROOTFS_DEST)]" + cp -v bin/*.linux $(ROOTFS_DEST) + +# Set the runtime variables to empty, the Linux libs will takcare of that +LD_FLAGS := +RUNTIME_GCC ?= common/util-gcc.c.o +RUNTIME_LLVM ?= common/util-llvm.c.o + + +# Override +INSTALL_DIR ?= $(ARA_DIR)/install +GCC_INSTALL_DIR ?= $(CVA6_SDK)/buildroot/output/host/ +LLVM_INSTALL_DIR ?= $(INSTALL_DIR)/riscv-llvm + +RISCV_XLEN ?= 64 +RISCV_ARCH ?= rv$(RISCV_XLEN)gcv +RISCV_ABI ?= lp64d +RISCV_TARGET ?= riscv$(RISCV_XLEN)-buildroot-linux-gnu- + +# Don't use LLVM +RISCV_PREFIX ?= $(GCC_INSTALL_DIR)/bin/$(RISCV_TARGET) +RISCV_CC ?= $(RISCV_PREFIX)gcc +RISCV_CXX ?= $(RISCV_PREFIX)g++ +RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump +RISCV_OBJCOPY ?= $(RISCV_PREFIX)objcopy +RISCV_AS ?= $(RISCV_PREFIX)as +RISCV_AR ?= $(RISCV_PREFIX)ar +RISCV_LD ?= $(RISCV_PREFIX)ld +RISCV_STRIP ?= $(RISCV_PREFIX)strip + +# Override flags +# LLVM_FLAGS ?= -march=rv64gcv_zfh_zvfh0p1 -mabi=$(RISCV_ABI) -mno-relax -fuse-ld=lld +LLVM_FLAGS ?= -march=rv64gcv -mabi=$(RISCV_ABI) +LLVM_V_FLAGS ?= #+no-optimized-zero-stride-load +# RISCV_FLAGS ?= $(LLVM_FLAGS) $(LLVM_V_FLAGS) 
-mcmodel=medany -I$(CURDIR)/common -std=gnu99 -O3 -ffast-math -fno-common -fno-builtin-printf $(DEFINES) $(RISCV_WARNINGS) +RISCV_FLAGS ?= -g $(LLVM_FLAGS) $(LLVM_V_FLAGS) -I$(CURDIR)/common -std=gnu99 -O0 $(DEFINES) $(RISCV_WARNINGS) +RISCV_CCFLAGS ?= $(RISCV_FLAGS) #-ffunction-sections -fdata-sections +RISCV_CXXFLAGS ?= $(RISCV_FLAGS) -ffunction-sections -fdata-sections +RISCV_LDFLAGS ?= #-static -nostartfiles -lm -Wl,--gc-sections + +RISCV_OBJDUMP_FLAGS ?= -S diff --git a/apps/common/printf.h b/apps/common/printf.h index dd8d0d514..53bdc952c 100644 --- a/apps/common/printf.h +++ b/apps/common/printf.h @@ -29,6 +29,11 @@ // /////////////////////////////////////////////////////////////////////////////// + +#ifdef __linux__ + #include +#else // ! __linux__ + #ifndef _PRINTF_H_ #define _PRINTF_H_ @@ -100,4 +105,5 @@ int fctprintf(void (*out)(char character, void *arg), void *arg, } #endif +#endif // __linux__ #endif // _PRINTF_H_ diff --git a/apps/common/runtime.h b/apps/common/runtime.h index 4e8dbb44d..e61d5fda8 100644 --- a/apps/common/runtime.h +++ b/apps/common/runtime.h @@ -7,13 +7,19 @@ asm volatile( \ "csrs mstatus, %[bits];" ::[bits] "r"(0x00000600 & (0x00000600 >> 1))) -extern int64_t event_trigger; -extern int64_t timer; -// SoC-level CSR -extern uint64_t hw_cnt_en_reg; +// SoC-level CSR, put in memory for Linux build +#ifdef __linux__ + int64_t event_trigger; + int64_t timer; + uint64_t hw_cnt_en_reg; +#else // ! 
__linux__ + extern int64_t event_trigger; + extern int64_t timer; + extern uint64_t hw_cnt_en_reg; +#endif // __linux__ // Return the current value of the cycle counter -inline int64_t get_cycle_count() { +int64_t get_cycle_count() { int64_t cycle_count; // The fence is needed to be sure that Ara is idle, and it is not performing // the last vector stores when we read mcycle with stop_timer() @@ -31,26 +37,26 @@ inline int64_t get_cycle_count() { #define HW_CNT_READY hw_cnt_en_reg = 1; #define HW_CNT_NOT_READY hw_cnt_en_reg = 0; // Start and stop the counter -inline void start_timer() { timer = -get_cycle_count(); } -inline void stop_timer() { timer += get_cycle_count(); } +void start_timer() { timer = -get_cycle_count(); } +void stop_timer() { timer += get_cycle_count(); } // Get the value of the timer -inline int64_t get_timer() { return timer; } +int64_t get_timer() { return timer; } #else #define HW_CNT_READY ; #define HW_CNT_NOT_READY ; // Start and stop the counter -inline void start_timer() { +void start_timer() { while (0) ; } -inline void stop_timer() { +void stop_timer() { while (0) ; } // Get the value of the timer -inline int64_t get_timer() { return 0; } +int64_t get_timer() { return 0; } #endif #endif // _RUNTIME_H_ diff --git a/apps/common/runtime.mk b/apps/common/runtime.mk index 66b05f660..5205ba0ba 100644 --- a/apps/common/runtime.mk +++ b/apps/common/runtime.mk @@ -42,7 +42,7 @@ ISA_SIM_MOD_INSTALL_DIR ?= $(INSTALL_DIR)/riscv-isa-sim-mod RISCV_XLEN ?= 64 RISCV_ARCH ?= rv$(RISCV_XLEN)gcv RISCV_ABI ?= lp64d -RISCV_TARGET ?= riscv$(RISCV_XLEN)-unknown-elf +RISCV_TARGET ?= riscv$(RISCV_XLEN)-unknown-elf- # Use LLVM RISCV_PREFIX ?= $(LLVM_INSTALL_DIR)/bin/ @@ -56,7 +56,9 @@ RISCV_LD ?= $(RISCV_PREFIX)ld.lld RISCV_STRIP ?= $(RISCV_PREFIX)llvm-strip # Use gcc to compile scalar riscv-tests -RISCV_CC_GCC ?= $(GCC_INSTALL_DIR)/bin/$(RISCV_TARGET)-gcc +RISCV_CC_GCC ?= $(GCC_INSTALL_DIR)/bin/$(RISCV_TARGET)gcc +RISCV_OBJCOPY_GCC ?= 
$(GCC_INSTALL_DIR)/bin/$(RISCV_TARGET)objcopy +RISCV_OBJDUMP_GCC ?= $(GCC_INSTALL_DIR)/bin/$(RISCV_TARGET)objdump # Benchmark with spike spike_env_dir ?= $(ARA_DIR)/apps/riscv-tests @@ -109,6 +111,8 @@ RUNTIME_GCC ?= common/crt0-gcc.S.o common/printf-gcc.c.o common/string-gcc.c.o RUNTIME_LLVM ?= common/crt0-llvm.S.o common/printf-llvm.c.o common/string-llvm.c.o common/serial-llvm.c.o common/util-llvm.c.o RUNTIME_SPIKE ?= $(spike_env_dir)/benchmarks/common/crt.S.o.spike $(spike_env_dir)/benchmarks/common/syscalls.c.o.spike common/util.c.o.spike +LD_FLAGS ?= -T$(CURDIR)/common/link.ld + .INTERMEDIATE: $(RUNTIME_GCC) $(RUNTIME_LLVM) %-gcc.S.o: %.S @@ -123,10 +127,10 @@ RUNTIME_SPIKE ?= $(spike_env_dir)/benchmarks/common/crt.S.o.spike $(spike_env_di %-llvm.c.o: %.c $(RISCV_CC) $(RISCV_CCFLAGS) -c $< -o $@ -%.S.o: %.S +%.S.o$(IS_LINUX_EXTENSION): %.S $(RISCV_CC) $(RISCV_CCFLAGS) -c $< -o $@ -%.c.o: %.c +%.c.o$(IS_LINUX_EXTENSION): %.c $(RISCV_CC) $(RISCV_CCFLAGS) -c $< -o $@ %.S.o.spike: %.S patch-spike-crt0 diff --git a/apps/riscv-tests/benchmarks/Makefile b/apps/riscv-tests/benchmarks/Makefile index cc327145a..43df648b8 100644 --- a/apps/riscv-tests/benchmarks/Makefile +++ b/apps/riscv-tests/benchmarks/Makefile @@ -35,12 +35,12 @@ bmarks = \ # Build rules #-------------------------------------------------------------------- -RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf- +RISCV_PREFIX ?= /usr/scratch/fenga3/vmaisto/cva6-sdk_fork/buildroot/output/host/bin/riscv64-buildroot-linux-gnu- RISCV_GCC ?= $(RISCV_PREFIX)gcc -RISCV_GCC_OPTS ?= -DPREALLOCATE=1 -mcmodel=medany -static -std=gnu99 -O2 -ffast-math -fno-common -fno-builtin-printf -RISCV_LINK ?= $(RISCV_GCC) -T $(src_dir)/common/test.ld $(incs) -RISCV_LINK_OPTS ?= -static -nostdlib -nostartfiles -lm -lgcc -T $(src_dir)/common/test.ld -RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump --disassemble-all --disassemble-zeroes --section=.text --section=.text.startup --section=.text.init --section=.data +RISCV_GCC_OPTS ?= -DPREALLOCATE=1 
-mcmodel=medany -std=gnu99 -O2 -ffast-math -fPIC +RISCV_LINK ?= $(RISCV_GCC) $(incs) +RISCV_LINK_OPTS ?= +RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump --disassemble-all --disassemble-zeroes -S RISCV_SIM ?= spike --isa=rv$(XLEN)gc incs += -I$(src_dir)/../env -I$(src_dir)/common $(addprefix -I$(src_dir)/, $(bmarks)) @@ -48,7 +48,7 @@ objs := define compile_template $(1).riscv: $(wildcard $(src_dir)/$(1)/*) $(wildcard $(src_dir)/common/*) - $$(RISCV_GCC) $$(incs) $$(RISCV_GCC_OPTS) -o $$@ $(wildcard $(src_dir)/$(1)/*.c) $(wildcard $(src_dir)/common/*.c) $(wildcard $(src_dir)/common/*.S) $$(RISCV_LINK_OPTS) + $$(RISCV_GCC) $$(incs) $$(RISCV_GCC_OPTS) -o $$@ $(wildcard $(src_dir)/$(1)/*.c) $(wildcard $(src_dir)/common/*.c) $$(RISCV_LINK_OPTS) endef $(foreach bmark,$(bmarks),$(eval $(call compile_template,$(bmark)))) diff --git a/apps/riscv-tests/benchmarks/common/syscalls.c b/apps/riscv-tests/benchmarks/common/syscalls.c index 4d20be9e4..b9d33c368 100644 --- a/apps/riscv-tests/benchmarks/common/syscalls.c +++ b/apps/riscv-tests/benchmarks/common/syscalls.c @@ -8,6 +8,36 @@ #include #include "util.h" +#define NUM_COUNTERS 2 +static uintptr_t counters[NUM_COUNTERS]; +static char* counter_names[NUM_COUNTERS]; + +void setStats(int enable) +{ + int i = 0; +#define READ_CTR(name) do { \ + while (i >= NUM_COUNTERS) ; \ + uintptr_t csr = read_csr(name); \ + if (!enable) { csr -= counters[i]; counter_names[i] = #name; } \ + counters[i++] = csr; \ + } while (0) + + // Read from user CSRs + READ_CTR(cycle); + READ_CTR(instret); + +#undef READ_CTR +} + +int __attribute__((weak)) main(int argc, char** argv) +{ + // single-threaded programs override this function. 
+ printstr("Implement main(), foo!\n"); + return -1; +} + +#ifndef __linux__ + #define SYS_write 64 #undef strcmp @@ -33,26 +63,6 @@ static uintptr_t syscall(uintptr_t which, uint64_t arg0, uint64_t arg1, uint64_t return magic_mem[0]; } -#define NUM_COUNTERS 2 -static uintptr_t counters[NUM_COUNTERS]; -static char* counter_names[NUM_COUNTERS]; - -void setStats(int enable) -{ - int i = 0; -#define READ_CTR(name) do { \ - while (i >= NUM_COUNTERS) ; \ - uintptr_t csr = read_csr(name); \ - if (!enable) { csr -= counters[i]; counter_names[i] = #name; } \ - counters[i++] = csr; \ - } while (0) - - READ_CTR(mcycle); - READ_CTR(minstret); - -#undef READ_CTR -} - void __attribute__((noreturn)) tohost_exit(uintptr_t code) { tohost = (code << 1) | 1; @@ -86,19 +96,13 @@ void __attribute__((weak)) thread_entry(int cid, int nc) while (cid != 0); } -int __attribute__((weak)) main(int argc, char** argv) -{ - // single-threaded programs override this function. - printstr("Implement main(), foo!\n"); - return -1; -} - static void init_tls() { register void* thread_pointer asm("tp"); - extern char _tdata_begin, _tdata_end, _tbss_end; + extern char _tls_data; + extern __thread char _tdata_begin, _tdata_end, _tbss_end; size_t tdata_size = &_tdata_end - &_tdata_begin; - memcpy(thread_pointer, &_tdata_begin, tdata_size); + memcpy(thread_pointer, &_tls_data, tdata_size); size_t tbss_size = &_tbss_end - &_tdata_end; memset(thread_pointer + tdata_size, 0, tbss_size); } @@ -115,7 +119,7 @@ void _init(int cid, int nc) char* pbuf = buf; for (int i = 0; i < NUM_COUNTERS; i++) if (counters[i]) - pbuf += sprintf(pbuf, "%s = %ld\n", counter_names[i], counters[i]); + pbuf += sprintf(pbuf, "%s = %d\n", counter_names[i], counters[i]); if (pbuf != buf) printstr(buf); @@ -226,7 +230,7 @@ static void vprintfmt(void (*putch)(int, void**), void **putdat, const char *fmt case '-': padc = '-'; goto reswitch; - + // flag to pad with 0's instead of spaces case '0': padc = '0'; @@ -335,7 +339,7 @@ static 
void vprintfmt(void (*putch)(int, void**), void **putdat, const char *fmt case '%': putch(ch, putdat); break; - + // unrecognized escape sequence - just print it literally default: putch('%', putdat); @@ -356,19 +360,19 @@ int printf(const char* fmt, ...) return 0; // incorrect return value, but who cares, anyway? } -void sprintf_putch(int ch, void** data) -{ - char** pstr = (char**)data; - **pstr = ch; - (*pstr)++; -} - int sprintf(char* str, const char* fmt, ...) { va_list ap; char* str0 = str; va_start(ap, fmt); + void sprintf_putch(int ch, void** data) + { + char** pstr = (char**)data; + **pstr = ch; + (*pstr)++; + } + vprintfmt(sprintf_putch, (void**)&str, fmt, ap); *str = 0; @@ -467,3 +471,5 @@ long atol(const char* str) return sign ? -res : res; } + +#endif // __linux__ diff --git a/apps/riscv-tests/mt/Makefile b/apps/riscv-tests/mt/Makefile index b45e55182..81052bafc 100644 --- a/apps/riscv-tests/mt/Makefile +++ b/apps/riscv-tests/mt/Makefile @@ -75,12 +75,12 @@ bmarks = $(bmarks_vvadd) $(bmarks_matmul) # Build rules #-------------------------------------------------------------------- -RISCV_PREFIX=riscv$(XLEN)-unknown-elf- +RISCV_PREFIX := /scratch/vmaisto/cva6-sdk_fork/buildroot/output/host/bin/riscv64-buildroot-linux-gnu- RISCV_GCC = $(RISCV_PREFIX)gcc RISCV_GCC_OPTS = -std=gnu99 -O2 -ffast-math -RISCV_LINK = $(RISCV_GCC) -T $(common)/test.ld $(incs) +RISCV_LINK = $(RISCV_GCC) $(incs) RISCV_LINK_OPTS = -nostdlib -nostartfiles -ffast-math -lc -RISCV_OBJDUMP = $(RISCV_PREFIX)objdump --disassemble-all --disassemble-zeroes --section=.text --section=.text.startup --section=.data +RISCV_OBJDUMP = $(RISCV_PREFIX)objdump --disassemble-all --disassemble-zeroes -S RISCV_SIM = spike -p2 VPATH += $(common) $(common)/../mt-matmul $(common)/../mt-vvadd diff --git a/hardware/Makefile b/hardware/Makefile index d85fd638c..c8a281ad6 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -102,7 +102,7 @@ vlog_args += -suppress vlog-2583 -suppress vlog-13314 -suppress 
vlog-13233 vlog_args += -work $(library) # Defines -bender_defs += --define NR_LANES=$(nr_lanes) --define VLEN=$(vlen) --define RVV_ARIANE=1 +bender_defs += --define NR_LANES=$(nr_lanes) --define VLEN=$(vlen) --define ARIANE_ACCELERATOR_PORT=1 # Default target all: compile @@ -114,7 +114,9 @@ $(buildpath): # Bender bender: @[ -x ./bender ] && echo "Bender already exists." || \ - curl --proto '=https' --tlsv1.2 https://fabianschuiki.github.io/bender/init -sSf | sh -s -- 0.23.1 + wget https://github.com/pulp-platform/bender/releases/download/v0.23.1/bender-0.23.1-x86_64-linux-gnu.tar.gz + tar xf bender-0.23.1-x86_64-linux-gnu.tar.gz + rm -rf bender-0.23.1-x86_64-linux-gnu.tar.gz @echo "$$(./bender --version) available." # Patches @@ -132,7 +134,7 @@ $(buildpath)/$(library): .PHONY: compile compile: dpi lib $(buildpath) bender $(buildpath)/compile_$(config).tcl $(buildpath)/compile_$(config).tcl: $(config_file) Makefile ../Bender.yml $(shell find src -type f) $(shell find ../config -type f) $(shell find include -type f) $(shell find tb -type f) $(shell find deps -type f) - ./bender script vsim --vlog-arg="$(vlog_args)" -t rtl -t asic -t ara_test -t cva6_test $(bender_defs) > $(buildpath)/compile_$(config).tcl + ./bender script vsim --vlog-arg="$(vlog_args)" -t rtl -t asic -t ara_test -t cva6_test -t cv64a6_imafdcv_sv39 $(bender_defs) > $(buildpath)/compile_$(config).tcl echo "exit" >> $(buildpath)/compile_$(config).tcl cd $(buildpath) && $(questa_cmd) vsim -work $(library) -c -do compile_$(config).tcl # Remove the file if compilation did not succeed @@ -164,11 +166,13 @@ verilate: $(buildpath) bender $(veril_library)/V$(veril_top) $(veril_library)/V$(veril_top): $(config_file) Makefile ../Bender.yml $(shell find src -type f) $(shell find ../config -type f) $(shell find include -type f) $(shell find tb -type f) $(shell find deps -type f) rm -rf $(veril_library); mkdir -p $(veril_library) - ./bender script verilator -t rtl -t ara_test -t cva6_test -t verilator 
$(bender_defs) > $(veril_library)/bender_script_$(config) + ./bender script verilator -t rtl -t ara_test -t cva6_test -t cv64a6_imafdcv_sv39 -t verilator $(bender_defs) > $(veril_library)/bender_script_$(config) # Verilate the design $(veril_path)/verilator -f $(veril_library)/bender_script_$(config) \ -GNrLanes=$(nr_lanes) \ -O3 \ + -Wno-fatal \ + -Wno-PINCONNECTEMPTY \ -Wno-BLKANDNBLK \ -Wno-CASEINCOMPLETE \ -Wno-CMPCONST \ @@ -179,6 +183,7 @@ $(veril_library)/V$(veril_top): $(config_file) Makefile ../Bender.yml $(shell fi -Wno-UNSIGNED \ -Wno-WIDTH \ -Wno-WIDTHCONCAT \ + -Wall \ --hierarchical \ tb/verilator/waiver.vlt \ --Mdir $(veril_library) \ @@ -222,7 +227,7 @@ lint: spyglass/tmp/files spyglass/sdc/func.sdc spyglass/scripts/run_lint.tcl spyglass/tmp/files: $(bender) mkdir -p spyglass/tmp - ./bender script verilator -t rtl -t spyglass -t cva6_test $(bender_defs) --define SPYGLASS > spyglass/tmp/files + ./bender script verilator -t rtl -t spyglass -t cva6_test -t cv64a6_imafdcv_sv39 $(bender_defs) --define SPYGLASS > spyglass/tmp/files # DPIs .PHONY: dpi diff --git a/hardware/deps/apb b/hardware/deps/apb new file mode 160000 index 000000000..77ddf073f --- /dev/null +++ b/hardware/deps/apb @@ -0,0 +1 @@ +Subproject commit 77ddf073f194d44b9119949d2421be59789e69ae diff --git a/hardware/deps/axi b/hardware/deps/axi index 442ff3375..bfee21757 160000 --- a/hardware/deps/axi +++ b/hardware/deps/axi @@ -1 +1 @@ -Subproject commit 442ff3375710513623f95944d66cc2bd09b2f155 +Subproject commit bfee21757bf090ec8e358456314b0b0fd3c90809 diff --git a/hardware/deps/cva6 b/hardware/deps/cva6 index bebbc1475..5e2e52069 160000 --- a/hardware/deps/cva6 +++ b/hardware/deps/cva6 @@ -1 +1 @@ -Subproject commit bebbc1475f9ffba661e8354d8773e27ab9338db1 +Subproject commit 5e2e520696aa63545b91fca38ce340314291be5c diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 014b00473..b8ffa78c8 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ 
-155,17 +155,17 @@ package ara_pkg; } ara_op_e; // Return true if op is a load operation - function automatic is_load(ara_op_e op); + function automatic logic is_load(ara_op_e op); is_load = op inside {[VLE:VLXE]}; endfunction : is_load // Return true if op is a store operation - function automatic is_store(ara_op_e op); + function automatic logic is_store(ara_op_e op); is_store = op inside {[VSE:VSXE]}; endfunction : is_store // Return true of op is either VCPOP or VFIRST - function automatic vd_scalar(ara_op_e op); + function automatic logic vd_scalar(ara_op_e op); vd_scalar = op inside {[VCPOP:VFIRST]}; endfunction : vd_scalar @@ -239,8 +239,8 @@ package ara_pkg; ///////////////////////////// // Use Ariane's accelerator interface. - typedef ariane_pkg::accelerator_req_t accelerator_req_t; - typedef ariane_pkg::accelerator_resp_t accelerator_resp_t; + typedef acc_pkg::accelerator_req_t accelerator_req_t; + typedef acc_pkg::accelerator_resp_t accelerator_resp_t; ///////////////////////// // Backend interface // @@ -322,11 +322,11 @@ package ara_pkg; // Scalar response elen_t resp; - // Instruction triggered an error - logic error; + // Instruction triggered an exception + ariane_pkg::exception_t exception; // New value for vstart - vlen_t error_vl; + vlen_t exception_vl; } ara_resp_t; //////////////////// @@ -974,11 +974,20 @@ package ara_pkg; } opqueue_e; // Each lane has eight VRF banks + // NOTE: values != 8 are not supported localparam int unsigned NrVRFBanksPerLane = 8; - // Find the starting address of a vector register vid + // Find the starting address (in bytes) of a vector register chunk of vid function automatic logic [63:0] vaddr(logic [4:0] vid, int NrLanes); - vaddr = vid * (VLENB / NrLanes / 8); + // Each vector register spans multiple words in each bank in each lane + // The start address is the same in every lane + // Therefore, within each lane, each vector register chunk starts on a given offset + vaddr = vid * (VLENB / NrLanes / 
NrVRFBanksPerLane); + // NOTE: the only extensively tested configuration of Ara keeps: + // - (VLEN / NrLanes) constant to 1024; + // - NrVRFBanksPerLane always equal to 8. + // Given so, each vector register will span 2 words across all the banks and lanes, + // therefore, vaddr = vid * 16 endfunction: vaddr // Differenciate between SLDU and ADDRGEN operands from opqueue @@ -1016,7 +1025,7 @@ package ara_pkg; typedef struct packed { rvv_pkg::vew_e eew; // Effective element width - vlen_t vl; // Vector length + vlen_t elem_count; // Vector body length opqueue_conversion_e conv; // Type conversion logic [1:0] ntr_red; // Neutral type for reductions logic is_reduct; // Is this a reduction? diff --git a/hardware/scripts/wave_core.tcl b/hardware/scripts/wave_core.tcl index 7f0434ad7..757f814e7 100644 --- a/hardware/scripts/wave_core.tcl +++ b/hardware/scripts/wave_core.tcl @@ -7,15 +7,15 @@ add wave -noupdate -group CVA6 -group core /ara_tb/dut/i_ara_soc/i_system/i_ariane/* add wave -noupdate -group CVA6 -group frontend /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/* -add wave -noupdate -group CVA6 -group frontend -group icache /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_cache_subsystem/i_cva6_icache/* -add wave -noupdate -group CVA6 -group frontend -group ras /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_ras/* -add wave -noupdate -group CVA6 -group frontend -group btb /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_btb/* -add wave -noupdate -group CVA6 -group frontend -group bht /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_bht/* +add wave -noupdate -group CVA6 -group frontend -group icache /ara_tb/dut/i_ara_soc/i_system/i_ariane/genblk4/i_cache_subsystem/* +# add wave -noupdate -group CVA6 -group frontend -group ras /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_ras/* +# add wave -noupdate -group CVA6 -group frontend -group btb /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_btb/* +# add wave -noupdate -group CVA6 -group frontend 
-group bht /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_bht/* # add wave -noupdate -group CVA6 -group frontend -group instr_scan /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/*/i_instr_scan/* # add wave -noupdate -group CVA6 -group frontend -group fetch_fifo /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_frontend/i_fetch_fifo/* add wave -noupdate -group CVA6 -group id_stage -group decoder /ara_tb/dut/i_ara_soc/i_system/i_ariane/id_stage_i/decoder_i/* -add wave -noupdate -group CVA6 -group id_stage -group compressed_decoder /ara_tb/dut/i_ara_soc/i_system/i_ariane/id_stage_i/compressed_decoder_i/* +add wave -noupdate -group CVA6 -group id_stage -group compressed_decoder /ara_tb/dut/i_ara_soc/i_system/i_ariane/id_stage_i/genblk1/compressed_decoder_i/* add wave -noupdate -group CVA6 -group id_stage /ara_tb/dut/i_ara_soc/i_system/i_ariane/id_stage_i/* add wave -noupdate -group CVA6 -group issue_stage -group scoreboard /ara_tb/dut/i_ara_soc/i_system/i_ariane/issue_stage_i/i_scoreboard/* @@ -32,10 +32,10 @@ add wave -noupdate -group CVA6 -group ex_stage -group fpu -group fpnew /ara_tb/d add wave -noupdate -group CVA6 -group ex_stage -group lsu /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/* add wave -noupdate -group CVA6 -group ex_stage -group lsu -group lsu_bypass /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/lsu_bypass_i/* -add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_mmu/* -add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group itlb /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_mmu/i_itlb/* -add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group dtlb /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_mmu/i_dtlb/* -add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group ptw /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_mmu/i_ptw/* +add wave -noupdate -group CVA6 
-group ex_stage -group lsu -group mmu /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/gen_mmu_sv39/i_cva6_mmu/* +add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group itlb /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/gen_mmu_sv39/i_cva6_mmu/i_itlb/* +add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group dtlb /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/gen_mmu_sv39/i_cva6_mmu/i_dtlb/* +add wave -noupdate -group CVA6 -group ex_stage -group lsu -group mmu -group ptw /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/gen_mmu_sv39/i_cva6_mmu/i_ptw/* add wave -noupdate -group CVA6 -group ex_stage -group lsu -group store_unit /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_store_unit/* add wave -noupdate -group CVA6 -group ex_stage -group lsu -group store_unit -group store_buffer /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/lsu_i/i_store_unit/store_buffer_i/* @@ -46,7 +46,6 @@ add wave -noupdate -group CVA6 -group ex_stage -group branch_unit /ara_tb/dut/i_ add wave -noupdate -group CVA6 -group ex_stage -group csr_buffer /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/csr_buffer_i/* -add wave -noupdate -group CVA6 -group ex_stage -group dispatcher /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/gen_accelerator/i_acc_dispatcher/* add wave -noupdate -group CVA6 -group ex_stage /ara_tb/dut/i_ara_soc/i_system/i_ariane/ex_stage_i/* add wave -noupdate -group CVA6 -group commit_stage /ara_tb/dut/i_ara_soc/i_system/i_ariane/commit_stage_i/* @@ -55,10 +54,12 @@ add wave -noupdate -group CVA6 -group csr_file /ara_tb/dut/i_ara_soc/i_system/i_ add wave -noupdate -group CVA6 -group controller /ara_tb/dut/i_ara_soc/i_system/i_ariane/controller_i/* -add wave -noupdate -group CVA6 -group wt_dcache /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_cache_subsystem/i_wt_dcache/* -add wave -noupdate -group CVA6 -group wt_dcache -group miss_handler 
/ara_tb/dut/i_ara_soc/i_system/i_ariane/i_cache_subsystem/i_wt_dcache/i_wt_dcache_missunit/* +add wave -noupdate -group CVA6 -group wt_dcache /ara_tb/dut/i_ara_soc/i_system/i_ariane/genblk4/i_cache_subsystem/i_wt_dcache/* +add wave -noupdate -group CVA6 -group wt_dcache -group miss_handler /ara_tb/dut/i_ara_soc/i_system/i_ariane/genblk4/i_cache_subsystem/i_wt_dcache/i_wt_dcache_missunit/* -add wave -noupdate -group CVA6 -group wt_dcache -group load {/ara_tb/dut/i_ara_soc/i_system/i_ariane/i_cache_subsystem/i_wt_dcache/gen_rd_ports[0]/i_wt_dcache_ctrl/*} -add wave -noupdate -group CVA6 -group wt_dcache -group ptw {/ara_tb/dut/i_ara_soc/i_system/i_ariane/i_cache_subsystem/i_wt_dcache/gen_rd_ports[1]/i_wt_dcache_ctrl/*} +add wave -noupdate -group CVA6 -group wt_dcache -group load {/ara_tb/dut/i_ara_soc/i_system/i_ariane/genblk4/i_cache_subsystem/i_wt_dcache/gen_rd_ports[0]/i_wt_dcache_ctrl/*} +add wave -noupdate -group CVA6 -group wt_dcache -group ptw {/ara_tb/dut/i_ara_soc/i_system/i_ariane/genblk4/i_cache_subsystem/i_wt_dcache/gen_rd_ports[1]/i_wt_dcache_ctrl/*} -add wave -noupdate -group CVA6 -group perf_counters /ara_tb/dut/i_ara_soc/i_system/i_ariane/i_perf_counters/* +add wave -noupdate -group CVA6 -group dispatcher /ara_tb/dut/i_ara_soc/i_system/i_ariane/gen_accelerator/i_acc_dispatcher/* + +add wave -noupdate -group CVA6 -group perf_counters /ara_tb/dut/i_ara_soc/i_system/i_ariane/gen_perf_counter/perf_counters_i/* diff --git a/hardware/src/accel_dispatcher_ideal.sv b/hardware/src/accel_dispatcher_ideal.sv index 8c564b34c..b89d93474 100644 --- a/hardware/src/accel_dispatcher_ideal.sv +++ b/hardware/src/accel_dispatcher_ideal.sv @@ -25,11 +25,7 @@ module accel_dispatcher_ideal import axi_pkg::*; import ara_pkg::*; ( input logic rst_ni, // Accelerator interaface output accelerator_req_t acc_req_o, - output logic acc_req_valid_o, - input logic acc_req_ready_i, - input accelerator_resp_t acc_resp_i, - input logic acc_resp_valid_i, - output logic acc_resp_ready_o + 
input accelerator_resp_t acc_resp_i ); localparam string vtrace = `STRINGIFY(`VTRACE); @@ -69,7 +65,7 @@ module accel_dispatcher_ideal import axi_pkg::*; import ara_pkg::*; ( status_cnt_n = status_cnt_q; fifo_data_raw = fifo_q[read_pointer_q]; - if (acc_req_ready_i && ~fifo_empty) begin + if (acc_resp_i.req_ready && ~fifo_empty) begin // read from the queue is a default assignment // but increment the read pointer... if (read_pointer_n == N_VINSN - 1) @@ -94,16 +90,16 @@ module accel_dispatcher_ideal import axi_pkg::*; import ara_pkg::*; ( assign fifo_empty = (status_cnt_q == 0); - // Always valid until empty - assign acc_req_valid_o = ~fifo_empty; - // Flush the answer - assign acc_resp_ready_o = 1'b1; // Output assignment assign fifo_data = fifo_payload_t'(fifo_data_raw); assign acc_req_o = '{ insn : fifo_data.insn, rs1 : fifo_data.rs1, rs2 : fifo_data.rs2, + // Always valid until empty + req_valid : ~fifo_empty, + // Flush the answer + resp_ready : 1'b1, default : '0 }; @@ -133,7 +129,7 @@ module accel_dispatcher_ideal import axi_pkg::*; import ara_pkg::*; ( // Stop the computation when the instructions are over and ara has returned idle // Just check that we are after reset always_ff @(posedge clk_i) begin - if (rst_ni && was_reset && !acc_req_valid_o && i_system.i_ara.ara_idle) begin + if (rst_ni && was_reset && !acc_req_o.req_valid && i_system.i_ara.ara_idle) begin $display("[hw-cycles]: %d", int'(perf_cnt_q)); $info("Core Test ", $sformatf("*** SUCCESS *** (tohost = %0d)", 0)); $finish(0); @@ -160,10 +156,10 @@ endmodule fifo_payload_t payload; acc_req_o = '0; - acc_req_valid_o = 1'b0; + acc_req_o.req_valid = 1'b0; // Flush the answer - acc_resp_ready_o = 1'b1; + acc_req_o.resp_ready = 1'b1; acc_req_o = '0; acc_req_o.frm = fpnew_pkg::RNE; @@ -176,17 +172,17 @@ endmodule while ($fscanf(fd, "%h", payload) == 1) begin // Always valid - acc_req_valid_o = 1'b1; + acc_req_o.req_valid = 1'b1; acc_req_o.insn = payload.insn; acc_req_o.rs1 = payload.rs1; // Wait for 
the handshake - wait(acc_req_ready_i); + wait(acc_resp_i.req_ready); @(posedge clk_i); @(negedge clk_i); end // Stop dispatching - acc_req_valid_o = 1'b0; + acc_req_o.req_valid = 1'b0; $fclose(fd); end diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index c6976be6f..2bb6c6d08 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -37,13 +37,28 @@ module ara import ara_pkg::*; #( input logic scan_enable_i, input logic scan_data_i, output logic scan_data_o, + + // CSR input + input logic en_ld_st_translation_i, + + // Interface with CVA6's sv39 MMU + // This is everything the MMU can provide, it might be overcomplete for Ara and some signals be useless + output ariane_pkg::exception_t mmu_misaligned_ex_o, + output logic mmu_req_o, // request address translation + output logic [riscv::VLEN-1:0] mmu_vaddr_o, // virtual address out + output logic mmu_is_store_o, // the translation is requested by a store + // if we need to walk the page table we can't grant in the same cycle + // Cycle 0 + input logic mmu_dtlb_hit_i, // sent in the same cycle as the request if translation hits in the DTLB + input logic [riscv::PPNW-1:0] mmu_dtlb_ppn_i, // ppn (send same cycle as hit) + // Cycle 1 + input logic mmu_valid_i, // translation is valid + input logic [riscv::PLEN-1:0] mmu_paddr_i, // translated address + input ariane_pkg::exception_t mmu_exception_i, // address translation threw an exception + // Interface with Ariane input accelerator_req_t acc_req_i, - input logic acc_req_valid_i, - output logic acc_req_ready_o, output accelerator_resp_t acc_resp_o, - output logic acc_resp_valid_o, - input logic acc_resp_ready_i, // AXI interface output axi_req_t axi_req_o, input axi_resp_t axi_resp_i @@ -95,11 +110,7 @@ module ara import ara_pkg::*; #( .rst_ni (rst_ni ), // Interface with Ariane .acc_req_i (acc_req_i ), - .acc_req_valid_i (acc_req_valid_i ), - .acc_req_ready_o (acc_req_ready_o ), .acc_resp_o (acc_resp_o ), - .acc_resp_valid_o (acc_resp_valid_o), - 
.acc_resp_ready_i (acc_resp_ready_i), // Interface with the sequencer .ara_req_o (ara_req ), .ara_req_valid_o (ara_req_valid ), @@ -131,8 +142,8 @@ module ara import ara_pkg::*; #( pe_resp_t [NrPEs-1:0] pe_resp; // Interface with the address generator logic addrgen_ack; - logic addrgen_error; - vlen_t addrgen_error_vl; + ariane_pkg::exception_t addrgen_exception; + vlen_t addrgen_exception_vl; logic [NrLanes-1:0] alu_vinsn_done; logic [NrLanes-1:0] mfpu_vinsn_done; // Interface with the operand requesters @@ -179,8 +190,8 @@ module ara import ara_pkg::*; #( .pe_scalar_resp_ready_o(pe_scalar_resp_ready ), // Interface with the address generator .addrgen_ack_i (addrgen_ack ), - .addrgen_error_i (addrgen_error ), - .addrgen_error_vl_i (addrgen_error_vl ) + .addrgen_exception_i (addrgen_exception ), + .addrgen_exception_vl_i(addrgen_exception_vl ) ); // Scalar move support @@ -345,8 +356,8 @@ module ara import ara_pkg::*; #( .pe_req_ready_o (pe_req_ready[NrLanes+OffsetStore : NrLanes+OffsetLoad]), .pe_resp_o (pe_resp[NrLanes+OffsetStore : NrLanes+OffsetLoad] ), .addrgen_ack_o (addrgen_ack ), - .addrgen_error_o (addrgen_error ), - .addrgen_error_vl_o (addrgen_error_vl ), + .addrgen_exception_o (addrgen_exception ), + .addrgen_exception_vl_o (addrgen_exception_vl ), // Interface with the Mask unit .mask_i (mask ), .mask_valid_i (mask_valid ), @@ -362,6 +373,18 @@ module ara import ara_pkg::*; #( .addrgen_operand_target_fu_i(sldu_addrgen_operand_target_fu ), .addrgen_operand_valid_i (sldu_addrgen_operand_valid ), .addrgen_operand_ready_o (addrgen_operand_ready ), + // CSR input + .en_ld_st_translation_i, + // Interface with CVA6's sv39 MMU + .mmu_misaligned_ex_o , + .mmu_req_o , + .mmu_vaddr_o , + .mmu_is_store_o , + .mmu_dtlb_hit_i , + .mmu_dtlb_ppn_i , + .mmu_valid_i , + .mmu_paddr_i , + .mmu_exception_i , // Load unit .ldu_result_req_o (ldu_result_req ), .ldu_result_addr_o (ldu_result_addr ), diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv 
index b48f33c66..6daf98e99 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -22,11 +22,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( input logic rst_ni, // Interfaces with Ariane input accelerator_req_t acc_req_i, - input logic acc_req_valid_i, - output logic acc_req_ready_o, output accelerator_resp_t acc_resp_o, - output logic acc_resp_valid_o, - input logic acc_resp_ready_i, // Interface with Ara's backend output ara_req_t ara_req_o, output logic ara_req_valid_o, @@ -57,17 +53,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // CSRs // //////////// - vlen_t vstart_d, vstart_q; - vlen_t vl_d, vl_q; - vtype_t vtype_d, vtype_q; - vxsat_e vxsat_d, vxsat_q; - vxrm_t vxrm_d, vxrm_q; - - `FF(vstart_q, vstart_d, '0) - `FF(vl_q, vl_d, '0) - `FF(vtype_q, vtype_d, '{vill: 1'b1, default: '0}) - `FF(vxsat_q, vxsat_d, '0) - `FF(vxrm_q, vxrm_d, '0) + vlen_t csr_vstart_d, csr_vstart_q; + vlen_t csr_vl_d, csr_vl_q; + vtype_t csr_vtype_d, csr_vtype_q; + vxsat_e csr_vxsat_d, csr_vxsat_q; + vxrm_t csr_vxrm_d, csr_vxrm_q; + + `FF(csr_vstart_q, csr_vstart_d, '0) + `FF(csr_vl_q, csr_vl_d, '0) + `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, default: '0}) + `FF(csr_vxsat_q, csr_vxsat_d, '0) + `FF(csr_vxrm_q, csr_vxrm_d, '0) // Converts between the internal representation of `vtype_t` and the full XLEN-bit CSR. function automatic riscv::xlen_t xlen_vtype(vtype_t vtype); xlen_vtype = {vtype.vill, {riscv::XLEN-9{1'b0}}, vtype.vma, vtype.vta, vtype.vsew, @@ -138,7 +134,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( NORMAL_OPERATION, WAIT_IDLE, RESHUFFLE, - SLDU_SEQUENCER + SLDU_SEQUENCER // NOTE: this is never used! } state_e; state_e state_d, state_q; @@ -197,9 +193,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // its counters of pending memory operations // Ara should tell Ariane when a memory operation is completed, so that it can modify // its pending load/store counters. 
- // A memory operation can be completed both when it is over and when vl_q == 0. In the latter case, + // A memory operation can be completed both when it is over and when csr_vl_q == 0. In the latter case, // Ara's decoder answers immediately, and this can cause a collision with an answer from Ara's VLSU. - // To avoid collisions, we give precedence to the VLSU, and we delay the vl_q == 0 memory op + // To avoid collisions, we give precedence to the VLSU, and we delay the csr_vl_q == 0 memory op // completion signal if a collision occurs logic load_zero_vl, store_zero_vl; // Do not checks vregs validity against current LMUL @@ -209,14 +205,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( logic is_decoding; // Is this an in-lane operation? logic in_lane_op; - // If the vslideup offset is greater than vl_q, the vslideup has no effects + // If the vslideup offset is greater than csr_vl_q, the vslideup has no effects logic null_vslideup; // Pipeline the VLSU's load and store complete signals, for timing reasons logic load_complete_q; logic store_complete_q; - `FF(load_complete_q, load_complete_i, 1'b0) - `FF(store_complete_q, store_complete_i, 1'b0) + logic illegal_insn_load, illegal_insn_store; + `FF(load_complete_q, load_complete_i || illegal_insn_load, 1'b0) + `FF(store_complete_q, store_complete_i || illegal_insn_store, 1'b0) // NP2 Slide support logic is_stride_np2; @@ -240,14 +237,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( always_comb begin: p_decoder // Default values - vstart_d = vstart_q; - vl_d = vl_q; - vtype_d = vtype_q; + csr_vstart_d = csr_vstart_q; + csr_vl_d = csr_vl_q; + csr_vtype_d = csr_vtype_q; state_d = state_q; eew_d = eew_q; eew_valid_d = eew_valid_q; - lmul_vs2 = vtype_q.vlmul; - lmul_vs1 = vtype_q.vlmul; + lmul_vs2 = csr_vtype_q.vlmul; + lmul_vs1 = csr_vtype_q.vlmul; reshuffle_req_d = reshuffle_req_q; eew_old_buffer_d = eew_old_buffer_q; @@ -259,8 +256,10 @@ module ara_dispatcher import 
ara_pkg::*; import rvv_pkg::*; #( rs_mask_request_d = 1'b0; illegal_insn = 1'b0; - vxsat_d = vxsat_q; - vxrm_d = vxrm_q; + illegal_insn_load = 1'b0; + illegal_insn_store = 1'b0; + csr_vxsat_d = csr_vxsat_q; + csr_vxrm_d = csr_vxrm_q; is_vload = 1'b0; is_vstore = 1'b0; @@ -275,8 +274,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( is_decoding = 1'b0; in_lane_op = 1'b0; - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; acc_resp_o = '{ trans_id : acc_req_i.trans_id, load_complete : load_zero_vl | load_complete_q, @@ -285,18 +282,20 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( fflags_valid : |fflags_ex_valid_i, default : '0 }; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; // fflags for (int lane = 0; lane < NrLanes; lane++) acc_resp_o.fflags |= fflags_ex_i[lane]; ara_req_d = '{ - vl : vl_q, - vstart : vstart_q, - vtype : vtype_q, - emul : vtype_q.vlmul, - eew_vs1 : vtype_q.vsew, - eew_vs2 : vtype_q.vsew, - eew_vd_op : vtype_q.vsew, + vl : csr_vl_q, + vstart : csr_vstart_q, + vtype : csr_vtype_q, + emul : csr_vtype_q.vlmul, + eew_vs1 : csr_vtype_q.vsew, + eew_vs2 : csr_vtype_q.vsew, + eew_vd_op : csr_vtype_q.vsew, eew_vmask : eew_q[VMASK], cvt_resize : CVT_SAME, default : '0 @@ -307,9 +306,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b0; // Saturation in any lane will raise vxsat flag - vxsat_d |= |vxsat_flag_i; + csr_vxsat_d |= |vxsat_flag_i; // Fixed-point rounding mode is applied to all lanes - for (int lane = 0; lane < NrLanes; lane++) alu_vxrm_o[lane] = vxrm_q; + for (int lane = 0; lane < NrLanes; lane++) alu_vxrm_o[lane] = csr_vxrm_q; // Rounding mode is shared between all lanes for (int lane = 0; lane < NrLanes; lane++) acc_resp_o.fflags |= fflags_ex_i[lane]; // Special states @@ -325,8 +324,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); // Stall the 
interface, wait for the backend to accept the injected uop - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; // Handle LMUL > 1 rs_lmul_cnt_d = rs_lmul_cnt_q; @@ -424,14 +423,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end end end - endcase + endcase // state_q - if (state_d == NORMAL_OPERATION && state_q != RESHUFFLE) begin - if (acc_req_valid_i && ara_req_ready_i && acc_resp_ready_i) begin + if (state_d == NORMAL_OPERATION && state_q != RESHUFFLE) begin : not_reshuffling + if (acc_req_i.req_valid && ara_req_ready_i && acc_req_i.resp_ready) begin : ready // Decoding is_decoding = 1'b1; // Acknowledge the request - acc_req_ready_o = ara_req_ready_i; + acc_resp_o.req_ready = 1'b1; // Decode the instructions based on their opcode unique case (acc_req_i.insn.itype.opcode) @@ -439,45 +438,46 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Vector Arithmetic instructions // ////////////////////////////////////// - riscv::OpcodeVec: begin + riscv::OpcodeVec: begin : OpcodeVec // Instruction is of one of the RVV types automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); - // These always respond at the same cycle - acc_resp_valid_o = 1'b1; + // These (mostly) always respond at the same cycle + acc_resp_o.resp_valid = 1'b1; // Decode based on their func3 field unique case (insn.varith_type.func3) // Configuration instructions OPCFG: begin: opcfg // These can be acknowledged regardless of the state of Ara - acc_req_ready_o = 1'b1; + // NOTE: unless there is a pending fault-only first vector load + // acc_resp_o.req_ready = 1'b1; is_config = 1'b1; // Update vtype if (insn.vsetvli_type.func1 == 1'b0) begin // vsetvli - vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetvli_type.zimm11)); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetvli_type.zimm11)); end else if (insn.vsetivli_type.func2 == 2'b11) begin // vsetivli - vtype_d = 
vtype_xlen(riscv::xlen_t'(insn.vsetivli_type.zimm10)); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetivli_type.zimm10)); end else if (insn.vsetvl_type.func7 == 7'b100_0000) begin // vsetvl - vtype_d = vtype_xlen(riscv::xlen_t'(acc_req_i.rs2[7:0])); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(acc_req_i.rs2[7:0])); end else - acc_resp_o.error = 1'b1; + illegal_insn = 1'b1; // Check whether the updated vtype makes sense - if ((vtype_d.vsew > rvv_pkg::vew_e'($clog2(ELENB))) || // SEW <= ELEN - (vtype_d.vlmul == LMUL_RSVD) || // reserved value + if ((csr_vtype_d.vsew > rvv_pkg::vew_e'($clog2(ELENB))) || // SEW <= ELEN + (csr_vtype_d.vlmul == LMUL_RSVD) || // reserved value // LMUL >= SEW/ELEN - (signed'($clog2(ELENB)) + signed'(vtype_d.vlmul) < signed'(vtype_d.vsew))) begin - vtype_d = '{vill: 1'b1, default: '0}; - vl_d = '0; + (signed'($clog2(ELENB)) + signed'(csr_vtype_d.vlmul) < signed'(csr_vtype_d.vsew))) begin + csr_vtype_d = '{vill: 1'b1, default: '0}; + csr_vl_d = '0; end // Update the vector length else begin // Maximum vector length. VLMAX = LMUL * VLEN / SEW. 
- automatic int unsigned vlmax = VLENB >> vtype_d.vsew; - unique case (vtype_d.vlmul) + automatic int unsigned vlmax = VLENB >> csr_vtype_d.vsew; + unique case (csr_vtype_d.vlmul) LMUL_1 : vlmax <<= 0; LMUL_2 : vlmax <<= 1; LMUL_4 : vlmax <<= 2; @@ -490,24 +490,24 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase if (insn.vsetivli_type.func2 == 2'b11) begin // vsetivli - vl_d = vlen_t'(insn.vsetivli_type.uimm5); + csr_vl_d = vlen_t'(insn.vsetivli_type.uimm5); end else begin // vsetvl || vsetvli if (insn.vsetvl_type.rs1 == '0 && insn.vsetvl_type.rd == '0) begin // Do not update the vector length - vl_d = vl_q; + csr_vl_d = csr_vl_q; end else if (insn.vsetvl_type.rs1 == '0 && insn.vsetvl_type.rd != '0) begin // Set the vector length to vlmax - vl_d = vlmax; + csr_vl_d = vlmax; end else begin // Normal stripmining - vl_d = ((|acc_req_i.rs1[$bits(acc_req_i.rs1)-1:$bits(vl_d)]) || + csr_vl_d = ((|acc_req_i.rs1[$bits(acc_req_i.rs1)-1:$bits(csr_vl_d)]) || (vlen_t'(acc_req_i.rs1) > vlmax)) ? vlmax : vlen_t'(acc_req_i.rs1); end end end // Return the new vl - acc_resp_o.result = vl_d; + acc_resp_o.result = csr_vl_d; // If the vtype has changed, wait for the backend before issuing any new instructions. // This is to avoid hazards on implicit register labels when LMUL_old > LMUL_new @@ -515,7 +515,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Checking only lmul_q is a trick: we want to stall only if both lmuls have // zero MSB. 
If lmul_q has zero MSB, it's greater than lmul_d only if also // lmul_d has zero MSB since the slice comparison is intrinsically unsigned - if (!vtype_q.vlmul[2] && (vtype_d.vlmul[2:0] < vtype_q.vlmul[2:0])) + if (!csr_vtype_q.vlmul[2] && (csr_vtype_d.vlmul[2:0] < csr_vtype_q.vlmul[2:0])) state_d = WAIT_IDLE; end @@ -635,7 +635,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.varith_type.vm) begin ara_req_d.eew_vs1 = eew_q[ara_req_d.vs1]; ara_req_d.vtype.vsew = eew_q[ara_req_d.vs1]; - ara_req_d.vl = (vl_q << vtype_q.vsew[1:0]) >> ara_req_d.eew_vs1[1:0]; + ara_req_d.vl = (csr_vl_q << csr_vtype_q.vsew[1:0]) >> ara_req_d.eew_vs1[1:0]; end end 6'b100000: ara_req_d.op = ara_pkg::VSADDU; @@ -651,11 +651,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -669,11 +669,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -686,28 +686,28 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end // Reductions encode in cvt_resize the neutral value bits // CVT_WIDE is 2'b00 (hack to save wires) 6'b110000: begin ara_req_d.op = ara_pkg::VWREDSUMU; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin ara_req_d.op = ara_pkg::VWREDSUM; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -731,7 +731,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPIVX: begin: opivx @@ -761,7 +761,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin ara_req_d.op = ara_pkg::VSLIDEUP; 
ara_req_d.stride = acc_req_i.rs1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslideup/vslide1up on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Vl refers to current system vsew, but operand requesters @@ -769,13 +769,13 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // i.e., request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = acc_req_i.rs1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslidedown/vslide1down on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling @@ -884,11 +884,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -902,11 +902,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -919,11 +919,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -941,7 +941,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPIVI: begin: opivi @@ -969,19 +969,19 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = {{ELEN{insn.varith_type.rs1[19]}}, insn.varith_type.rs1}; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslideup/vslide1up on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] 
|| - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = {{ELEN{insn.varith_type.rs1[19]}}, insn.varith_type.rs1}; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslidedown/vslide1down on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling @@ -1094,11 +1094,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -1112,11 +1112,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -1129,11 +1129,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -1151,7 +1151,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPMVV: begin: opmvv @@ -1215,8 +1215,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b010000: begin // VWXUNARY0 // vmv.x.s // Stall the interface until we get the result - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; case (insn.varith_type.rs1) 5'b00000: begin @@ -1240,7 +1240,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // Sign extend operands - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW8: begin ara_req_d.conversion_vs2 = OpQueueConversionSExt8; end @@ -1254,13 +1254,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_req_ready_o = 1'b1; - acc_resp_o.result = ara_resp_i.resp; - acc_resp_o.error = ara_resp_i.error; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; - end + if ( ara_resp_valid_i ) begin : ara_resp_valid + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.result = ara_resp_i.resp; + acc_resp_o.exception = 
ara_resp_i.exception; + // Clear request to backend + ara_req_valid_d = 1'b0; + end : ara_resp_valid end 6'b010100: begin ara_req_d.use_vd_op = 1'b1; @@ -1360,8 +1361,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW64) || - int'(vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW64) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00011: begin // VSEXT.VF8 @@ -1370,44 +1371,44 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW64) || - int'(vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW64) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00100: begin // VZEXT.VF4 ara_req_d.conversion_vs2 = OpQueueConversionZExt4; - ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); + ara_req_d.eew_vs2 = prev_prev_ew(csr_vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW32) || - int'(vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00101: begin // VSEXT.VF4 ara_req_d.conversion_vs2 = OpQueueConversionSExt4; - ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); + ara_req_d.eew_vs2 = prev_prev_ew(csr_vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW32) || - int'(vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00110: begin // VZEXT.VF2 ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = 
vtype_q.vsew.prev(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW16) || int'(csr_vtype_q.vlmul) inside {LMUL_1_8}) illegal_insn = 1'b1; end 5'b00111: begin // VSEXT.VF2 ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.prev(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW16) || int'(csr_vtype_q.vlmul) inside {LMUL_1_8}) illegal_insn = 1'b1; end default: illegal_insn = 1'b1; @@ -1447,92 +1448,92 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Widening instructions 6'b110000: begin // VWADDU ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; 
ara_req_d.cvt_resize = CVT_WIDE; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110111: begin // VWSUB.W ara_req_d.op = 
ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -1540,31 +1541,31 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); 
ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end default: illegal_insn = 1'b1; @@ -1598,7 +1599,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPMVX: begin: opmvx @@ -1623,17 +1624,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin // vslide1up ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if 
(|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin // vslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; end @@ -1641,7 +1642,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // vmv.s.x ara_req_d.op = ara_pkg::VMVSX; ara_req_d.use_vs2 = 1'b0; - ara_req_d.vl = |vl_q ? 1 : '0; + ara_req_d.vl = |csr_vl_q ? 1 : '0; // This instruction ignores LMUL checks skip_lmul_checks = 1'b1; end @@ -1679,92 +1680,92 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Widening instructions 6'b110000: begin // VWADDU ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = 
OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110111: begin // VWSUB.W 
ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -1772,41 +1773,41 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); 
ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111110: begin // VWMACCUS ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end default: illegal_insn = 1'b1; @@ -1834,7 +1835,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; // Instruction is invalid if the vtype is invalid - if 
(vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPFVV: begin: opfvv @@ -1893,8 +1894,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b010000: begin // VWFUNARY0 // vmv.f.s // Stall the interface until we get the result - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; ara_req_d.op = ara_pkg::VFMVFS; ara_req_d.use_vd = 1'b0; @@ -1904,7 +1905,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // Zero-extend operands - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW16: begin ara_req_d.conversion_vs2 = OpQueueConversionZExt4; end @@ -1915,13 +1916,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_req_ready_o = 1'b1; - acc_resp_o.result = ara_resp_i.resp; - acc_resp_o.error = ara_resp_i.error; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; - end + if ( ara_resp_valid_i ) begin : ara_resp_valid + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.result = ara_resp_i.resp; + acc_resp_o.exception = ara_resp_i.exception; + // Clear request to backend + ara_req_valid_d = 1'b0; + end : ara_resp_valid end 6'b011000: ara_req_d.op = ara_pkg::VMFEQ; 6'b011001: ara_req_d.op = ara_pkg::VMFLE; @@ -1942,96 +1944,95 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000: begin // Widening VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01001: begin // Widening VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = 
next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01010: begin // Widening VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01011: begin // Widening VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01100: begin // Widening VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01110: begin // Widening VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01111: begin // Widening VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b10000: begin // Narrowing VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = 
CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10001: begin // Narrowing VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10010: begin // Narrowing VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10011: begin // Narrowing VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10100: begin // Narrowing VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10101: begin // Narrowing VFNCVTRODFF ara_req_d.op = VFNCVTRODFF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10110: begin // Narrowing VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10111: begin // Narrowing VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: begin // Trigger an error - acc_resp_o.error = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; end endcase end @@ -2094,99 +2095,99 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b110000: begin // VFWADD ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = 
OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b110001: begin // VFWREDUSUM ara_req_d.op = ara_pkg::VFWREDUSUM; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vs1 = vtype_q.vsew.next(); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = resize_e'(2'b00); end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b110011: begin // VFWREDOSUM ara_req_d.op = ara_pkg::VFWREDOSUM; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vs1 = vtype_q.vsew.next(); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = resize_e'(2'b00); end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); 
ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = 
next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -2242,7 +2243,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end else illegal_insn = 1'b1; // Vector FP instructions are disabled end @@ -2281,17 +2282,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin // vfslide1up ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin // vfslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request 
will need reshuffling ara_req_d.scale_vl = 1'b1; end @@ -2299,7 +2300,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // vmv.s.f ara_req_d.op = ara_pkg::VFMVSF; ara_req_d.use_vs2 = 1'b0; - ara_req_d.vl = |vl_q ? 1 : '0; + ara_req_d.vl = |csr_vl_q ? 1 : '0; // This instruction ignores LMUL checks skip_lmul_checks = 1'b1; end @@ -2360,85 +2361,85 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b110000: begin // VFWADD ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.wide_fp_imm = 1'b1; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = 
next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.wide_fp_imm = 1'b1; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = 
next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase // Check if the FP scalar operand is NaN-boxed. If not, replace it with a NaN. - case (vtype_q.vsew) + case (csr_vtype_q.vsew) EW16: if (~(&acc_req_i.rs1[63:16])) ara_req_d.scalar_op = 64'h0000000000007e00; EW32: if (~(&acc_req_i.rs1[63:32])) ara_req_d.scalar_op = 64'h000000007fc00000; endcase @@ -2481,17 +2482,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end else illegal_insn = 1'b1; // Vector FP instructions are disabled end endcase - end + end : OpcodeVec //////////////////// // Vector Loads // //////////////////// - riscv::OpcodeLoadFp: begin + riscv::OpcodeLoadFp: begin : OpcodeLoadFp // Instruction is of one of the RVV types automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); @@ -2499,7 +2500,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( is_vload = 1'b1; // Wait before acknowledging this instruction - acc_req_ready_o = 1'b0; + acc_resp_o.req_ready = 1'b0; // These generate a request to Ara's backend ara_req_d.vd = insn.vmem_type.rd; @@ -2515,7 +2516,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW8; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW8; end end @@ -2523,7 +2524,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && 
insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW16; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW16; end end @@ -2531,7 +2532,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW32; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW32; end end @@ -2539,15 +2540,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW64; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW64; end end default: begin // Invalid. Element is too wide, or encoding is non-existant. - acc_req_ready_o = 1'b1; - acc_resp_o.error = 1'b1; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + illegal_insn = 1'b1; + ara_req_valid_d = 1'b0; end endcase @@ -2562,19 +2563,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000:; // Unit-strided, whole registers 5'b01011: begin // Unit-strided, mask load, EEW=1 // We operate ceil(vl/8) bytes - ara_req_d.vl = (vl_q >> 3) + |vl_q[2:0]; + ara_req_d.vl = (csr_vl_q >> 3) + |csr_vl_q[2:0]; ara_req_d.vtype.vsew = EW8; end 5'b10000: begin // Unit-strided, fault-only first // TODO: Not implemented - illegal_insn = 1'b1; - acc_req_ready_o = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end default: begin // Reserved - illegal_insn = 1'b1; - acc_req_ready_o = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end endcase end @@ -2594,24 +2591,22 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // For memory operations: EMUL = LMUL * (EEW / SEW) // EEW is encoded in the instruction - ara_req_d.emul = 
vlmul_e'(vtype_q.vlmul + (ara_req_d.vtype.vsew - vtype_q.vsew)); + ara_req_d.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req_d.vtype.vsew - csr_vtype_q.vsew)); // Exception if EMUL > 8 or < 1/8 - unique case ({vtype_q.vlmul[2], ara_req_d.emul[2]}) + unique case ({csr_vtype_q.vlmul[2], ara_req_d.emul[2]}) // The new emul is lower than the previous lmul 2'b01: begin // But the new eew is greater than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) > 0) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) > 0) begin + illegal_insn_load = 1'b1; end end // The new emul is greater than the previous lmul 2'b10: begin // But the new eew is lower than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) < 0) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) < 0) begin + illegal_insn_load = 1'b1; end end default:; @@ -2621,20 +2616,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // access. unique case (ara_req_d.emul) LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end LMUL_RSVD: begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_load = 1'b1; end default:; endcase @@ -2644,9 +2635,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Execute also if vl == 0 ignore_zero_vl_check = 1'b1; // The LMUL value is kept in the instruction itself - illegal_insn = 1'b0; - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + illegal_insn_load = 1'b0; ara_req_valid_d = 1'b1; // Maximum vector length. VLMAX = nf * VLEN / EW8. 
@@ -2670,22 +2659,23 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default: begin // Trigger an error for the reserved simm values - illegal_insn = 1'b1; + illegal_insn_load = 1'b1; end endcase end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_req_ready_o = 1'b1; - acc_resp_o.error = ara_resp_i.error; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; - // In case of error, modify vstart - if (ara_resp_i.error) - vstart_d = ara_resp_i.error_vl; - end - end + if ( ara_resp_valid_i ) begin : ara_resp_valid + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; // Clear request to backend + // In case of exception, modify vstart + if ( ara_resp_i.exception.valid ) begin : exception + csr_vstart_d = ara_resp_i.exception_vl; + end : exception + end : ara_resp_valid + end : OpcodeLoadFp ///////////////////// // Vector Stores // @@ -2697,7 +2687,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // The current vector length refers to the target EEW! // Vector stores never re-shuffle the source register! - riscv::OpcodeStoreFp: begin + riscv::OpcodeStoreFp: begin : OpcodeStoreFp // Instruction is of one of the RVV types automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); @@ -2705,7 +2695,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( is_vstore = 1'b1; // Wait before acknowledging this instruction - acc_req_ready_o = 1'b0; + acc_resp_o.req_ready = 1'b0; // vl depends on the EEW encoded in the instruction. // Ara does not reshuffle source vregs upon vector stores, @@ -2728,7 +2718,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW8; // ara_req_d.vtype.vsew is the target EEW! 
end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW8; end end @@ -2736,7 +2726,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW16; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW16; end end @@ -2744,7 +2734,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW32; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW32; end end @@ -2752,15 +2742,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW64; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW64; end end default: begin // Invalid. Element is too wide, or encoding is non-existant. 
- acc_req_ready_o = 1'b1; - acc_resp_o.error = 1'b1; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; end endcase @@ -2775,13 +2762,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000:; // Unit-strided, whole registers 5'b01011: begin // Unit-strided, mask store, EEW=1 // We operate ceil(vl/8) bytes - ara_req_d.vl = (vl_q >> 3) + |vl_q[2:0]; + ara_req_d.vl = (csr_vl_q >> 3) + |csr_vl_q[2:0]; ara_req_d.vtype.vsew = EW8; end default: begin // Reserved - illegal_insn = 1'b1; - acc_req_ready_o = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_store = 1'b1; end endcase end @@ -2801,24 +2786,22 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // For memory operations: EMUL = LMUL * (EEW / SEW) // EEW is encoded in the instruction - ara_req_d.emul = vlmul_e'(vtype_q.vlmul + (ara_req_d.vtype.vsew - vtype_q.vsew)); + ara_req_d.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req_d.vtype.vsew - csr_vtype_q.vsew)); // Exception if EMUL > 8 or < 1/8 - unique case ({vtype_q.vlmul[2], ara_req_d.emul[2]}) + unique case ({csr_vtype_q.vlmul[2], ara_req_d.emul[2]}) // The new emul is lower than the previous lmul 2'b01: begin // But the new eew is greater than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) > 0) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) > 0) begin + illegal_insn_store = 1'b1; end end // The new emul is greater than the previous lmul 2'b10: begin // But the new eew is lower than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) < 0) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) < 0) begin + illegal_insn_store = 1'b1; end end default:; @@ -2828,20 +2811,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // access. 
unique case (ara_req_d.emul) LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_store = 1'b1; end LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_store = 1'b1; end LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_store = 1'b1; end LMUL_RSVD: begin - illegal_insn = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn_store = 1'b1; end default:; endcase @@ -2873,227 +2852,309 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default: begin // Trigger an error for the reserved simm values - illegal_insn = 1'b1; + illegal_insn_store = 1'b1; end endcase - illegal_insn = 1'b0; - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + // illegal_insn_store = 1'b0; // TODO: IS THIS A BUG? + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; ara_req_valid_d = 1'b1; end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_req_ready_o = 1'b1; - acc_resp_o.error = ara_resp_i.error; - acc_resp_valid_o = 1'b1; - ara_req_valid_d = 1'b0; - // If there is an error, change vstart - if (ara_resp_i.error) - vstart_d = ara_resp_i.error_vl; - end - end + if ( ara_resp_valid_i ) begin : ara_resp_valid + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; // Clear request to backend + // In case of exception, modify vstart + if ( ara_resp_i.exception.valid ) begin : exception + csr_vstart_d = ara_resp_i.exception_vl; + end : exception + end : ara_resp_valid + end : OpcodeStoreFp //////////////////////////// // CSR Reads and Writes // //////////////////////////// - riscv::OpcodeSystem: begin - // These always respond at the same cycle - acc_resp_valid_o = 1'b1; - is_config = 1'b1; - - unique case 
(acc_req_i.insn.itype.funct3) - 3'b001: begin // csrrw - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - // Only vstart can be written with CSR instructions. - riscv::CSR_VSTART: begin - vstart_d = acc_req_i.rs1; - acc_resp_o.result = vstart_q; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vlen_t'(vxsat_q); - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b010: begin // csrrs - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q | vlen_t'(acc_req_i.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_q | vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_q | vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vlen_t'(vxsat_q); - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b011: begin // csrrc - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q & ~vlen_t'(acc_req_i.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b101: begin // csrrwi - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - // Only vstart can be written with CSR instructions. - riscv::CSR_VSTART: begin - vstart_d = vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = acc_req_i.insn.itype.rs1[15]; - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b110: begin // csrrsi - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q | vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = vxsat_q | vxsat_e'(acc_req_i.insn.itype.rs1[15]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b111: begin // csrrci - // Decode the CSR. - unique case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q & ~vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = vxsat_q & ~vxsat_e'(acc_req_i.insn.itype.rs1[15]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - default: begin - // Trigger an illegal instruction - acc_resp_o.error = 1'b1; - acc_resp_valid_o = 1'b1; - end - endcase - end + riscv::OpcodeSystem: begin : OpcodeSystem + // CSR ops have 
semantic dependency from vector instrucitons. + // Therefore, Ara must be idle before performing any CSR operation. + + // Stall if there is any pending vector instruction + // NOTE: This is overconstraining. Not all CSR ops actually need to stall if a vector instruction is pending. + // E.g., CSR vl is never updated by instructions past ara_dispatcher, except for "unit-stride fault-only-first loads". Reading vl would be safe otherwise. + // E.g., CSR vlenb is a design-constant parameter, reading is always safe. + // E.g., CSRs vxrm and vxsat have no influence on-non fixed-point instructions, it could be read and written safely when no fixed-point operation is running. + // By better analyzing the spec, more of optimizations of such can be made. For the sake of simplicity, the current implementation treats CSR ops as one block. + if ( ara_idle_i ) begin : ara_idle + // These always respond at the same cycle + acc_resp_o.resp_valid = 1'b1; + is_config = 1'b1; + + unique case (acc_req_i.insn.itype.funct3) + 3'b001: begin // csrrw + // Decode the CSR. + case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + // Only vstart can be written with CSR instructions. + riscv::CSR_VSTART: begin + csr_vstart_d = acc_req_i.rs1; + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[16:15]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'(csr_vxsat_q); + end + riscv::CSR_VCSR: begin + csr_vxrm_d = vxrm_t'( acc_req_i.rs1[17:16] ); + csr_vxsat_d = vxsat_e'( acc_req_i.rs1[15] ); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b010: begin // csrrs + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q | vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[16:15]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'(csr_vxsat_q); + end + riscv::CSR_VCSR: begin + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[17:16]); + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b011: begin // csrrc + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q & ~vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b101: begin // csrrwi + // Decode the CSR. + case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + // Only vstart can be written with CSR instructions. + riscv::CSR_VSTART: begin + csr_vstart_d = vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = acc_req_i.rs1[0]; + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + default: illegal_insn = 1'b1; + endcase + end + 3'b110: begin // csrrsi + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q | vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = { csr_vxrm_q, csr_vxsat_q }; + end + default: illegal_insn = 1'b1; + endcase + end + 3'b111: begin // csrrci + // Decode the CSR. 
+ unique case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q & ~vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q & ~vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = { csr_vxrm_q, csr_vxsat_q }; + end + default: illegal_insn= 1'b1; + endcase + end + default: begin + // Trigger an illegal instruction + illegal_insn = 1'b1; + end + endcase // acc_req_i.insn.itype.funct3 + end : ara_idle + else begin : csr_stall + acc_resp_o.req_ready = 1'b0; + end : csr_stall + end : OpcodeSystem default: begin // Trigger an illegal instruction - acc_resp_o.error = 1'b1; - acc_resp_valid_o = 1'b1; + illegal_insn = 1'b1; end - endcase - end + + endcase // acc_req_i.insn.itype.opcode + end : ready // Check that we have fixed-point support if requested // vxsat and vxrm are always accessible anyway - if (ara_req_valid_d && (ara_req_d.op inside {[VSADDU:VNCLIPU], VSMUL}) && (FixPtSupport == FixedPointDisable)) + if (ara_req_valid_d && (ara_req_d.op inside {[VSADDU:VNCLIPU], VSMUL}) && (FixPtSupport == FixedPointDisable)) begin : fixed_point_check 
illegal_insn = 1'b1; + end : fixed_point_check // Check that we have we have vfrec7, vfrsqrt7 - if (ara_req_valid_d && (ara_req_d.op inside {VFREC7, VFRSQRT7}) && (FPExtSupport == FPExtSupportDisable)) + if (ara_req_valid_d && (ara_req_d.op inside {VFREC7, VFRSQRT7}) && (FPExtSupport == FPExtSupportDisable)) begin : vfrec7_vfrsqrt7_support_check illegal_insn = 1'b1; + end : vfrec7_vfrsqrt7_support_check + + + // Raise an illegal instruction exception + if ( illegal_insn || illegal_insn_load || illegal_insn_store ) begin : illegal_instruction + ara_req_valid_d = 1'b0; + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.exception.valid = 1'b1; + acc_resp_o.exception.cause = riscv::ILLEGAL_INSTR; + acc_resp_o.exception.tval = acc_req_i.insn; + end : illegal_instruction + + // Reset vstart to zero for successful vector instructions + // Corner cases: + // * vstart exception reporting, e.g., VLSU, is handled above + // * CSR operations are not considered vector instructions + if ( acc_resp_o.resp_valid + & !acc_resp_o.exception.valid + & (acc_req_i.insn.itype.opcode != riscv::OpcodeSystem) + ) begin : reset_vstart + csr_vstart_d = '0; + end : reset_vstart // Check if we need to reshuffle our vector registers involved in the operation // This operation is costly when occurs, so avoid it if possible - if (ara_req_valid_d && !acc_resp_o.error) begin + if ( ara_req_valid_d && !acc_resp_o.exception.valid ) begin : check_reshuffle automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); // Is the instruction an in-lane one and could it be subject to reshuffling? @@ -3104,7 +3165,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Optimization: reshuffle vd only if we are not overwriting the whole vector register! 
reshuffle_req_d = {ara_req_d.use_vs1 && (ara_req_d.eew_vs1 != eew_q[ara_req_d.vs1]) && eew_valid_q[ara_req_d.vs1] && in_lane_op, ara_req_d.use_vs2 && (ara_req_d.eew_vs2 != eew_q[ara_req_d.vs2]) && eew_valid_q[ara_req_d.vs2] && in_lane_op, - ara_req_d.use_vd && (ara_req_d.vtype.vsew != eew_q[ara_req_d.vd ]) && eew_valid_q[ara_req_d.vd ] && vl_q != (VLENB >> ara_req_d.vtype.vsew)}; + ara_req_d.use_vd && (ara_req_d.vtype.vsew != eew_q[ara_req_d.vd ]) && eew_valid_q[ara_req_d.vd ] && csr_vl_q != (VLENB >> ara_req_d.vtype.vsew)}; // Prepare the information to reshuffle the vector registers during the next cycles // Reshuffle in the following order: vd, v2, v1. The order is arbitrary. @@ -3126,7 +3187,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default:; endcase - end + end : check_reshuffle // Reshuffle if at least one of the three registers needs a reshuffle if (|reshuffle_req_d) begin @@ -3134,8 +3195,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); // Stall the interface, and inject a reshuffling instruction - acc_req_ready_o = 1'b0; - acc_resp_valid_o = 1'b0; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; ara_req_valid_d = 1'b0; // Initialize the reshuffle counter limit to handle LMUL > 1 @@ -3149,13 +3210,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Reshuffle state_d = RESHUFFLE; end - end - - // Raise an illegal instruction exception - if (illegal_insn) begin - acc_resp_o.error = 1'b1; - ara_req_valid_d = 1'b0; - end + end : not_reshuffling // Update the EEW if (ara_req_valid_d && ara_req_d.use_vd && ara_req_ready_i) begin @@ -3195,14 +3250,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Any valid non-config instruction is a NOP if vl == 0, with some exceptions, // e.g. 
whole vector memory operations / whole vector register move - if (is_decoding && (vl_q == '0 || null_vslideup) && !is_config && - !ignore_zero_vl_check && !acc_resp_o.error) begin + if (is_decoding && (csr_vl_q == '0 || null_vslideup) && !is_config && + !ignore_zero_vl_check && !acc_resp_o.exception.valid) begin // If we are acknowledging a memory operation, we must tell Ariane that the memory // operation was resolved (to decrement its pending load/store counter) // This can collide with the same signal from the vector load/store unit, so we must // delay the zero_vl acknowledge by 1 cycle - acc_req_ready_o = ~((is_vload & load_complete_q) | (is_vstore & store_complete_q)); - acc_resp_valid_o = ~((is_vload & load_complete_q) | (is_vstore & store_complete_q)); + acc_resp_o.req_ready = ~((is_vload & load_complete_q) | (is_vstore & store_complete_q)); + acc_resp_o.resp_valid = ~((is_vload & load_complete_q) | (is_vstore & store_complete_q)); ara_req_valid_d = 1'b0; load_zero_vl = is_vload; store_zero_vl = is_vstore; diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 5fb0abff1..f384eaa63 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -40,8 +40,8 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i output logic pe_scalar_resp_ready_o, // Interface with the Address Generation input logic addrgen_ack_i, - input logic addrgen_error_i, - input vlen_t addrgen_error_vl_i + input ariane_pkg::exception_t addrgen_exception_i, + input vlen_t addrgen_exception_vl_i ); /////////////////////////////////// @@ -438,8 +438,8 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i state_d = IDLE; ara_req_ready_o = 1'b1; ara_resp_valid_o = 1'b1; - ara_resp_o.error = addrgen_error_i; - ara_resp_o.error_vl = addrgen_error_vl_i; + ara_resp_o.exception = addrgen_exception_i; + ara_resp_o.exception_vl = addrgen_exception_vl_i; end // Wait for the scalar result diff 
--git a/hardware/src/ara_soc.sv b/hardware/src/ara_soc.sv index 431605ef0..5c1ac53db 100644 --- a/hardware/src/ara_soc.sv +++ b/hardware/src/ara_soc.sv @@ -53,6 +53,7 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( `include "axi/assign.svh" `include "axi/typedef.svh" `include "common_cells/registers.svh" + `include "apb/typedef.svh" ////////////////////// // Memory Regions // @@ -137,7 +138,8 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( UniqueIds : 1'b0, AxiAddrWidth : AxiAddrWidth, AxiDataWidth : AxiWideDataWidth, - NoAddrRules : NrAXISlaves + NoAddrRules : NrAXISlaves, + default : '0 }; axi_pkg::xbar_rule_64_t [NrAXISlaves-1:0] routing_rules; @@ -187,8 +189,8 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( axi_atop_filter #( .AxiIdWidth (AxiSocIdWidth ), .AxiMaxWriteTxns(4 ), - .req_t (soc_wide_req_t ), - .resp_t (soc_wide_resp_t) + .axi_req_t (soc_wide_req_t ), + .axi_resp_t (soc_wide_resp_t) ) i_l2mem_atop_filter ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -257,96 +259,102 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( // UART // //////////// - axi2apb_64_32 #( - .AXI4_ADDRESS_WIDTH(AxiAddrWidth ), - .AXI4_RDATA_WIDTH (AxiNarrowDataWidth), - .AXI4_WDATA_WIDTH (AxiNarrowDataWidth), - .AXI4_ID_WIDTH (AxiSocIdWidth ), - .AXI4_USER_WIDTH (AxiUserWidth ), - .BUFF_DEPTH_SLAVE (2 ), - .APB_ADDR_WIDTH (32 ) - ) i_axi2apb_64_32_uart ( - .ACLK (clk_i ), - .ARESETn (rst_ni ), - .test_en_i (1'b0 ), - .AWID_i (periph_narrow_axi_req[UART].aw.id ), - .AWADDR_i (periph_narrow_axi_req[UART].aw.addr ), - .AWLEN_i (periph_narrow_axi_req[UART].aw.len ), - .AWSIZE_i (periph_narrow_axi_req[UART].aw.size ), - .AWBURST_i (periph_narrow_axi_req[UART].aw.burst ), - .AWLOCK_i (periph_narrow_axi_req[UART].aw.lock ), - .AWCACHE_i (periph_narrow_axi_req[UART].aw.cache ), - .AWPROT_i (periph_narrow_axi_req[UART].aw.prot ), - .AWREGION_i(periph_narrow_axi_req[UART].aw.region), - .AWUSER_i (periph_narrow_axi_req[UART].aw.user ), - .AWQOS_i 
(periph_narrow_axi_req[UART].aw.qos ), - .AWVALID_i (periph_narrow_axi_req[UART].aw_valid ), - .AWREADY_o (periph_narrow_axi_resp[UART].aw_ready), - .WDATA_i (periph_narrow_axi_req[UART].w.data ), - .WSTRB_i (periph_narrow_axi_req[UART].w.strb ), - .WLAST_i (periph_narrow_axi_req[UART].w.last ), - .WUSER_i (periph_narrow_axi_req[UART].w.user ), - .WVALID_i (periph_narrow_axi_req[UART].w_valid ), - .WREADY_o (periph_narrow_axi_resp[UART].w_ready ), - .BID_o (periph_narrow_axi_resp[UART].b.id ), - .BRESP_o (periph_narrow_axi_resp[UART].b.resp ), - .BVALID_o (periph_narrow_axi_resp[UART].b_valid ), - .BUSER_o (periph_narrow_axi_resp[UART].b.user ), - .BREADY_i (periph_narrow_axi_req[UART].b_ready ), - .ARID_i (periph_narrow_axi_req[UART].ar.id ), - .ARADDR_i (periph_narrow_axi_req[UART].ar.addr ), - .ARLEN_i (periph_narrow_axi_req[UART].ar.len ), - .ARSIZE_i (periph_narrow_axi_req[UART].ar.size ), - .ARBURST_i (periph_narrow_axi_req[UART].ar.burst ), - .ARLOCK_i (periph_narrow_axi_req[UART].ar.lock ), - .ARCACHE_i (periph_narrow_axi_req[UART].ar.cache ), - .ARPROT_i (periph_narrow_axi_req[UART].ar.prot ), - .ARREGION_i(periph_narrow_axi_req[UART].ar.region), - .ARUSER_i (periph_narrow_axi_req[UART].ar.user ), - .ARQOS_i (periph_narrow_axi_req[UART].ar.qos ), - .ARVALID_i (periph_narrow_axi_req[UART].ar_valid ), - .ARREADY_o (periph_narrow_axi_resp[UART].ar_ready), - .RID_o (periph_narrow_axi_resp[UART].r.id ), - .RDATA_o (periph_narrow_axi_resp[UART].r.data ), - .RRESP_o (periph_narrow_axi_resp[UART].r.resp ), - .RLAST_o (periph_narrow_axi_resp[UART].r.last ), - .RUSER_o (periph_narrow_axi_resp[UART].r.user ), - .RVALID_o (periph_narrow_axi_resp[UART].r_valid ), - .RREADY_i (periph_narrow_axi_req[UART].r_ready ), - .PENABLE (uart_penable_o ), - .PWRITE (uart_pwrite_o ), - .PADDR (uart_paddr_o ), - .PSEL (uart_psel_o ), - .PWDATA (uart_pwdata_o ), - .PRDATA (uart_prdata_i ), - .PREADY (uart_pready_i ), - .PSLVERR (uart_pslverr_i ) + `AXI_TYPEDEF_ALL(uart_axi, 
axi_addr_t, axi_soc_id_t, logic [31:0], logic [3:0], axi_user_t) + `AXI_LITE_TYPEDEF_ALL(uart_lite, axi_addr_t, logic [31:0], logic [3:0]) + `APB_TYPEDEF_ALL(uart_apb, axi_addr_t, logic [31:0], logic [3:0]) + + uart_axi_req_t uart_axi_req; + uart_axi_resp_t uart_axi_resp; + uart_lite_req_t uart_lite_req; + uart_lite_resp_t uart_lite_resp; + uart_apb_req_t uart_apb_req; + uart_apb_resp_t uart_apb_resp; + + assign uart_penable_o = uart_apb_req.penable; + assign uart_pwrite_o = uart_apb_req.pwrite; + assign uart_paddr_o = uart_apb_req.paddr; + assign uart_psel_o = uart_apb_req.psel; + assign uart_pwdata_o = uart_apb_req.pwdata; + assign uart_apb_resp.prdata = uart_prdata_i; + assign uart_apb_resp.pready = uart_pready_i; + assign uart_apb_resp.pslverr = uart_pslverr_i; + + typedef struct packed { + int unsigned idx; + axi_addr_t start_addr; + axi_addr_t end_addr; + } uart_apb_rule_t; + + uart_apb_rule_t uart_apb_map = '{idx: 0, start_addr: '0, end_addr: '1}; + + axi_lite_to_apb #( + .NoApbSlaves (32'd1 ), + .NoRules (32'd1 ), + .AddrWidth (AxiAddrWidth ), + .DataWidth (32'd32 ), + .PipelineRequest (1'b0 ), + .PipelineResponse(1'b0 ), + .axi_lite_req_t (uart_lite_req_t ), + .axi_lite_resp_t (uart_lite_resp_t), + .apb_req_t (uart_apb_req_t ), + .apb_resp_t (uart_apb_resp_t ), + .rule_t (uart_apb_rule_t ) + ) i_axi_lite_to_apb_uart ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .axi_lite_req_i (uart_lite_req ), + .axi_lite_resp_o(uart_lite_resp), + .apb_req_o (uart_apb_req ), + .apb_resp_i (uart_apb_resp ), + .addr_map_i (uart_apb_map ) + ); + + axi_to_axi_lite #( + .AxiAddrWidth (AxiAddrWidth ), + .AxiDataWidth (32'd32 ), + .AxiIdWidth (AxiSocIdWidth ), + .AxiUserWidth (AxiUserWidth ), + .AxiMaxWriteTxns(32'd1 ), + .AxiMaxReadTxns (32'd1 ), + .FallThrough (1'b1 ), + .full_req_t (uart_axi_req_t ), + .full_resp_t (uart_axi_resp_t ), + .lite_req_t (uart_lite_req_t ), + .lite_resp_t (uart_lite_resp_t) + ) i_axi_to_axi_lite_uart ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + 
.test_i (1'b0 ), + .slv_req_i (uart_axi_req ), + .slv_resp_o(uart_axi_resp ), + .mst_req_o (uart_lite_req ), + .mst_resp_i(uart_lite_resp) ); axi_dw_converter #( - .AxiSlvPortDataWidth(AxiWideDataWidth ), - .AxiMstPortDataWidth(AxiNarrowDataWidth ), - .AxiAddrWidth (AxiAddrWidth ), - .AxiIdWidth (AxiSocIdWidth ), - .AxiMaxReads (2 ), - .ar_chan_t (soc_wide_ar_chan_t ), - .mst_r_chan_t (soc_narrow_r_chan_t ), - .slv_r_chan_t (soc_wide_r_chan_t ), - .aw_chan_t (soc_narrow_aw_chan_t ), - .b_chan_t (soc_wide_b_chan_t ), - .mst_w_chan_t (soc_narrow_w_chan_t ), - .slv_w_chan_t (soc_wide_w_chan_t ), - .axi_mst_req_t (soc_narrow_req_t ), - .axi_mst_resp_t (soc_narrow_resp_t ), - .axi_slv_req_t (soc_wide_req_t ), - .axi_slv_resp_t (soc_wide_resp_t ) + .AxiSlvPortDataWidth(AxiWideDataWidth ), + .AxiMstPortDataWidth(32 ), + .AxiAddrWidth (AxiAddrWidth ), + .AxiIdWidth (AxiSocIdWidth ), + .AxiMaxReads (1 ), + .ar_chan_t (soc_wide_ar_chan_t), + .mst_r_chan_t (uart_axi_r_chan_t ), + .slv_r_chan_t (soc_wide_r_chan_t ), + .aw_chan_t (uart_axi_aw_chan_t), + .b_chan_t (soc_wide_b_chan_t ), + .mst_w_chan_t (uart_axi_w_chan_t ), + .slv_w_chan_t (soc_wide_w_chan_t ), + .axi_mst_req_t (uart_axi_req_t ), + .axi_mst_resp_t (uart_axi_resp_t ), + .axi_slv_req_t (soc_wide_req_t ), + .axi_slv_resp_t (soc_wide_resp_t ) ) i_axi_slave_uart_dwc ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .slv_req_i (periph_wide_axi_req[UART] ), - .slv_resp_o(periph_wide_axi_resp[UART] ), - .mst_req_o (periph_narrow_axi_req[UART] ), - .mst_resp_i(periph_narrow_axi_resp[UART]) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (periph_wide_axi_req[UART] ), + .slv_resp_o(periph_wide_axi_resp[UART]), + .mst_req_o (uart_axi_req ), + .mst_resp_i(uart_axi_resp ) ); ///////////////////////// @@ -450,7 +458,7 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( CachedRegionAddrBase : {DRAMBase}, CachedRegionLength : {DRAMLength}, // cache config - Axi64BitCompliant : 1'b1, + AxiCompliant : 1'b1, SwapEndianess : 
1'b0, // debug DmBaseAddress : 64'h0, diff --git a/hardware/src/ara_system.sv b/hardware/src/ara_system.sv index f8c32d44e..c2e7a7c5b 100644 --- a/hardware/src/ara_system.sv +++ b/hardware/src/ara_system.sv @@ -73,13 +73,11 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( // Ara and Ariane // ////////////////////// - import ariane_pkg::accelerator_req_t; - import ariane_pkg::accelerator_resp_t; + import acc_pkg::accelerator_req_t; + import acc_pkg::accelerator_resp_t; // Accelerator ports accelerator_req_t acc_req; - logic acc_req_valid; - logic acc_req_ready; accelerator_resp_t acc_resp; logic acc_resp_valid; logic acc_resp_ready; @@ -98,15 +96,23 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( .clk_i (clk_i ), .rst_ni (rst_ni ), .acc_req_o (acc_req ), - .acc_req_valid_o (acc_req_valid ), - .acc_req_ready_i (acc_req_ready ), .acc_resp_i (acc_resp ), .acc_resp_valid_i (acc_resp_valid ), .acc_resp_ready_o (acc_resp_ready ) ); `else - ariane #( - .ArianeCfg(ArianeCfg) + cva6 #( + .ArianeCfg(ArianeCfg), + .cvxif_req_t (acc_pkg::accelerator_req_t), + .cvxif_resp_t (acc_pkg::accelerator_resp_t), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiNarrowDataWidth ), + .AxiIdWidth ( AxiIdWidth ), + .axi_ar_chan_t (ariane_axi_ar_t), + .axi_aw_chan_t (ariane_axi_aw_t), + .axi_w_chan_t (ariane_axi_w_t), + .axi_req_t (ariane_axi_req_t), + .axi_rsp_t (ariane_axi_resp_t) ) i_ariane ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -116,19 +122,20 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( .ipi_i ('0 ), .time_irq_i ('0 ), .debug_req_i ('0 ), - .axi_req_o (ariane_narrow_axi_req ), - .axi_resp_i (ariane_narrow_axi_resp), - // Accelerator ports - .acc_req_o (acc_req ), - .acc_req_valid_o (acc_req_valid ), - .acc_req_ready_i (acc_req_ready ), - .acc_resp_i (acc_resp ), - .acc_resp_valid_i (acc_resp_valid ), - .acc_resp_ready_o (acc_resp_ready ), + // Invalidation requests .acc_cons_en_o (acc_cons_en ), .inval_addr_i (inval_addr ), 
.inval_valid_i (inval_valid ), - .inval_ready_o (inval_ready ) + .inval_ready_o (inval_ready ), + .rvfi_o ( ), + // Accelerator ports + .cvxif_req_o (acc_req ), + .cvxif_resp_i (acc_resp ), + .l15_req_o ( ), + .l15_rtrn_i ( '0 ), + // Memory interface + .axi_req_o (ariane_narrow_axi_req ), + .axi_resp_i (ariane_narrow_axi_resp) ); `endif @@ -211,11 +218,7 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( .scan_data_i (1'b0 ), .scan_data_o (/* Unused */ ), .acc_req_i (acc_req ), - .acc_req_valid_i (acc_req_valid ), - .acc_req_ready_o (acc_req_ready ), .acc_resp_o (acc_resp ), - .acc_resp_valid_o(acc_resp_valid), - .acc_resp_ready_i(acc_resp_ready), .axi_req_o (ara_axi_req ), .axi_resp_i (ara_axi_resp ) ); diff --git a/hardware/src/axi_to_mem.sv b/hardware/src/axi_to_mem.sv deleted file mode 100644 index 7a3db70de..000000000 --- a/hardware/src/axi_to_mem.sv +++ /dev/null @@ -1,691 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Authors: -// - Andreas Kurth -// - Wolfgang Roenninger - -`include "common_cells/registers.svh" -/// AXI4+ATOP slave module which translates AXI bursts into a memory stream. -/// If both read and write channels of the AXI4+ATOP are active, both will have an -/// utilization of 50%. -module axi_to_mem #( - /// AXI4+ATOP request type. See `include/axi/typedef.svh`. - parameter type axi_req_t = logic, - /// AXI4+ATOP response type. See `include/axi/typedef.svh`. - parameter type axi_resp_t = logic, - /// Address width, has to be less or equal than the width off the AXI address field. - /// Determines the width of `mem_addr_o`. Has to be wide enough to emit the memory region - /// which should be accessible. - parameter int unsigned AddrWidth = 0, - /// AXI4+ATOP data width. - parameter int unsigned DataWidth = 0, - /// AXI4+ATOP ID width. 
- parameter int unsigned IdWidth = 0, - /// Number of banks at output, must evenly divide `DataWidth`. - parameter int unsigned NumBanks = 0, - /// Depth of memory response buffer. This should be equal to the memory response latency. - parameter int unsigned BufDepth = 1, - /// Dependent parameter, do not override. Memory address type. - localparam type addr_t = logic [AddrWidth-1:0], - /// Dependent parameter, do not override. Memory data type. - localparam type mem_data_t = logic [DataWidth/NumBanks-1:0], - /// Dependent parameter, do not override. Memory write strobe type. - localparam type mem_strb_t = logic [DataWidth/NumBanks/8-1:0] -) ( - /// Clock input. - input logic clk_i, - /// Asynchronous reset, active low. - input logic rst_ni, - /// The unit is busy handling an AXI4+ATOP request. - output logic busy_o, - /// AXI4+ATOP slave port, request input. - input axi_req_t axi_req_i, - /// AXI4+ATOP slave port, response output. - output axi_resp_t axi_resp_o, - /// Memory stream master, request is valid for this bank. - output logic [NumBanks-1:0] mem_req_o, - /// Memory stream master, request can be granted by this bank. - input logic [NumBanks-1:0] mem_gnt_i, - /// Memory stream master, byte address of the request. - output addr_t [NumBanks-1:0] mem_addr_o, - /// Memory stream master, write data for this bank. Valid when `mem_req_o`. - output mem_data_t [NumBanks-1:0] mem_wdata_o, - /// Memory stream master, byte-wise strobe (byte enable). - output mem_strb_t [NumBanks-1:0] mem_strb_o, - /// Memory stream master, `axi_pkg::atop_t` signal associated with this request. - output axi_pkg::atop_t [NumBanks-1:0] mem_atop_o, - /// Memory stream master, write enable. Then asserted store of `mem_w_data` is requested. - output logic [NumBanks-1:0] mem_we_o, - /// Memory stream master, response is valid. This module expects always a response valid for a - /// request regardless if the request was a write or a read. 
- input logic [NumBanks-1:0] mem_rvalid_i, - /// Memory stream master, read response data. - input mem_data_t [NumBanks-1:0] mem_rdata_i -); - - typedef logic [DataWidth-1:0] axi_data_t; - typedef logic [DataWidth/8-1:0] axi_strb_t; - typedef logic [IdWidth-1:0] axi_id_t; - - typedef struct packed { - addr_t addr; - axi_pkg::atop_t atop; - axi_strb_t strb; - axi_data_t wdata; - logic we; - } mem_req_t; - - typedef struct packed { - addr_t addr; - axi_pkg::atop_t atop; - axi_id_t id; - logic last; - axi_pkg::qos_t qos; - axi_pkg::size_t size; - logic write; - } meta_t; - - axi_data_t mem_rdata, - m2s_resp; - axi_pkg::len_t r_cnt_d, r_cnt_q, - w_cnt_d, w_cnt_q; - logic arb_valid, arb_ready, - rd_valid, rd_ready, - wr_valid, wr_ready, - sel_b, sel_buf_b, - sel_r, sel_buf_r, - sel_valid, sel_ready, - sel_buf_valid, sel_buf_ready, - sel_lock_d, sel_lock_q, - meta_valid, meta_ready, - meta_buf_valid, meta_buf_ready, - meta_sel_d, meta_sel_q, - m2s_req_valid, m2s_req_ready, - m2s_resp_valid, m2s_resp_ready, - mem_req_valid, mem_req_ready, - mem_rvalid; - mem_req_t m2s_req, - mem_req; - meta_t rd_meta, - rd_meta_d, rd_meta_q, - wr_meta, - wr_meta_d, wr_meta_q, - meta, meta_buf; - - assign busy_o = axi_req_i.aw_valid | axi_req_i.ar_valid | axi_req_i.w_valid | - axi_resp_o.b_valid | axi_resp_o.r_valid | - (r_cnt_q > 0) | (w_cnt_q > 0); - - // Handle reads. - always_comb begin - // Default assignments - axi_resp_o.ar_ready = 1'b0; - rd_meta_d = rd_meta_q; - rd_meta = meta_t'{default: '0}; - rd_valid = 1'b0; - r_cnt_d = r_cnt_q; - // Handle R burst in progress. - if (r_cnt_q > '0) begin - rd_meta_d.last = (r_cnt_q == 8'd1); - rd_meta = rd_meta_d; - rd_meta.addr = rd_meta_q.addr + axi_pkg::num_bytes(rd_meta_q.size); - rd_valid = 1'b1; - if (rd_ready) begin - r_cnt_d--; - rd_meta_d.addr = rd_meta.addr; - end - // Handle new AR if there is one. 
- end else if (axi_req_i.ar_valid) begin - rd_meta_d = '{ - addr: addr_t'(axi_pkg::aligned_addr(axi_req_i.ar.addr, axi_req_i.ar.size)), - atop: '0, - id: axi_req_i.ar.id, - last: (axi_req_i.ar.len == '0), - qos: axi_req_i.ar.qos, - size: axi_req_i.ar.size, - write: 1'b0 - }; - rd_meta = rd_meta_d; - rd_meta.addr = addr_t'(axi_req_i.ar.addr); - rd_valid = 1'b1; - if (rd_ready) begin - r_cnt_d = axi_req_i.ar.len; - axi_resp_o.ar_ready = 1'b1; - end - end - end - - // Handle writes. - always_comb begin - // Default assignments - axi_resp_o.aw_ready = 1'b0; - axi_resp_o.w_ready = 1'b0; - wr_meta_d = wr_meta_q; - wr_meta = meta_t'{default: '0}; - wr_valid = 1'b0; - w_cnt_d = w_cnt_q; - // Handle W bursts in progress. - if (w_cnt_q > '0) begin - wr_meta_d.last = (w_cnt_q == 8'd1); - wr_meta = wr_meta_d; - wr_meta.addr = wr_meta_q.addr + axi_pkg::num_bytes(wr_meta_q.size); - if (axi_req_i.w_valid) begin - wr_valid = 1'b1; - if (wr_ready) begin - axi_resp_o.w_ready = 1'b1; - w_cnt_d--; - wr_meta_d.addr = wr_meta.addr; - end - end - // Handle new AW if there is one. - end else if (axi_req_i.aw_valid && axi_req_i.w_valid) begin - wr_meta_d = '{ - addr: addr_t'(axi_pkg::aligned_addr(axi_req_i.aw.addr, axi_req_i.aw.size)), - atop: axi_req_i.aw.atop, - id: axi_req_i.aw.id, - last: (axi_req_i.aw.len == '0), - qos: axi_req_i.aw.qos, - size: axi_req_i.aw.size, - write: 1'b1 - }; - wr_meta = wr_meta_d; - wr_meta.addr = addr_t'(axi_req_i.aw.addr); - wr_valid = 1'b1; - if (wr_ready) begin - w_cnt_d = axi_req_i.aw.len; - axi_resp_o.aw_ready = 1'b1; - axi_resp_o.w_ready = 1'b1; - end - end - end - - // Arbitrate between reads and writes. 
- stream_mux #( - .DATA_T ( meta_t ), - .N_INP ( 32'd2 ) - ) i_ax_mux ( - .inp_data_i ({wr_meta, rd_meta }), - .inp_valid_i ({wr_valid, rd_valid}), - .inp_ready_o ({wr_ready, rd_ready}), - .inp_sel_i ( meta_sel_d ), - .oup_data_o ( meta ), - .oup_valid_o ( arb_valid ), - .oup_ready_i ( arb_ready ) - ); - always_comb begin - meta_sel_d = meta_sel_q; - sel_lock_d = sel_lock_q; - if (sel_lock_q) begin - meta_sel_d = meta_sel_q; - if (arb_valid && arb_ready) begin - sel_lock_d = 1'b0; - end - end else begin - if (wr_valid ^ rd_valid) begin - // If either write or read is valid but not both, select the valid one. - meta_sel_d = wr_valid; - end else if (wr_valid && rd_valid) begin - // If both write and read are valid, decide according to QoS then burst properties. - // Prioritize higher QoS. - if (wr_meta.qos > rd_meta.qos) begin - meta_sel_d = 1'b1; - end else if (rd_meta.qos > wr_meta.qos) begin - meta_sel_d = 1'b0; - // Decide requests with identical QoS. - end else if (wr_meta.qos == rd_meta.qos) begin - // 1. Prioritize individual writes over read bursts. - // Rationale: Read bursts can be interleaved on AXI but write bursts cannot. - if (wr_meta.last && !rd_meta.last) begin - meta_sel_d = 1'b1; - // 2. Prioritize ongoing burst. - // Rationale: Stalled bursts create back-pressure or require costly buffers. - end else if (w_cnt_q > '0) begin - meta_sel_d = 1'b1; - end else if (r_cnt_q > '0) begin - meta_sel_d = 1'b0; - // 3. Otherwise arbitrate round robin to prevent starvation. - end else begin - meta_sel_d = ~meta_sel_q; - end - end - end - // Lock arbitration if valid but not yet ready. - if (arb_valid && !arb_ready) begin - sel_lock_d = 1'b1; - end - end - end - - // Fork arbitrated stream to meta data, memory requests, and R/B channel selection. 
- stream_fork #( - .N_OUP ( 32'd3 ) - ) i_fork ( - .clk_i, - .rst_ni, - .valid_i ( arb_valid ), - .ready_o ( arb_ready ), - .valid_o ({sel_valid, meta_valid, m2s_req_valid}), - .ready_i ({sel_ready, meta_ready, m2s_req_ready}) - ); - - assign sel_b = meta.write & meta.last; - assign sel_r = ~meta.write | meta.atop[5]; - - stream_fifo #( - .FALL_THROUGH ( 1'b1 ), - .DEPTH ( 32'd1 + BufDepth ), - .T ( logic[1:0] ) - ) i_sel_buf ( - .clk_i, - .rst_ni, - .flush_i ( 1'b0 ), - .testmode_i ( 1'b0 ), - .data_i ({sel_b, sel_r }), - .valid_i ( sel_valid ), - .ready_o ( sel_ready ), - .data_o ({sel_buf_b, sel_buf_r}), - .valid_o ( sel_buf_valid ), - .ready_i ( sel_buf_ready ), - .usage_o ( /* unused */ ) - ); - - stream_fifo #( - .FALL_THROUGH ( 1'b1 ), - .DEPTH ( 32'd1 + BufDepth ), - .T ( meta_t ) - ) i_meta_buf ( - .clk_i, - .rst_ni, - .flush_i ( 1'b0 ), - .testmode_i ( 1'b0 ), - .data_i ( meta ), - .valid_i ( meta_valid ), - .ready_o ( meta_ready ), - .data_o ( meta_buf ), - .valid_o ( meta_buf_valid ), - .ready_i ( meta_buf_ready ), - .usage_o ( /* unused */ ) - ); - - // Assemble the actual memory request from meta information and write data. - assign m2s_req = mem_req_t'{ - addr: meta.addr, - atop: meta.atop, - strb: axi_req_i.w.strb, - wdata: axi_req_i.w.data, - we: meta.write - }; - - // Interface memory as stream. - stream_to_mem #( - .mem_req_t ( mem_req_t ), - .mem_resp_t ( axi_data_t ), - .BufDepth ( BufDepth ) - ) i_stream_to_mem ( - .clk_i, - .rst_ni, - .req_i ( m2s_req ), - .req_valid_i ( m2s_req_valid ), - .req_ready_o ( m2s_req_ready ), - .resp_o ( m2s_resp ), - .resp_valid_o ( m2s_resp_valid ), - .resp_ready_i ( m2s_resp_ready ), - .mem_req_o ( mem_req ), - .mem_req_valid_o ( mem_req_valid ), - .mem_req_ready_i ( mem_req_ready ), - .mem_resp_i ( mem_rdata ), - .mem_resp_valid_i ( mem_rvalid ) - ); - - // Split single memory request to desired number of banks. 
- mem_to_banks #( - .AddrWidth ( AddrWidth ), - .DataWidth ( DataWidth ), - .NumBanks ( NumBanks ) - ) i_mem_to_banks ( - .clk_i, - .rst_ni, - .req_i ( mem_req_valid ), - .gnt_o ( mem_req_ready ), - .addr_i ( mem_req.addr ), - .wdata_i ( mem_req.wdata ), - .strb_i ( mem_req.strb ), - .atop_i ( mem_req.atop ), - .we_i ( mem_req.we ), - .rvalid_o ( mem_rvalid ), - .rdata_o ( mem_rdata ), - .bank_req_o ( mem_req_o ), - .bank_gnt_i ( mem_gnt_i ), - .bank_addr_o ( mem_addr_o ), - .bank_wdata_o ( mem_wdata_o ), - .bank_strb_o ( mem_strb_o ), - .bank_atop_o ( mem_atop_o ), - .bank_we_o ( mem_we_o ), - .bank_rvalid_i ( mem_rvalid_i ), - .bank_rdata_i ( mem_rdata_i ) - ); - - // Join memory read data and meta data stream. - logic mem_join_valid, mem_join_ready; - stream_join #( - .N_INP ( 32'd2 ) - ) i_join ( - .inp_valid_i ({m2s_resp_valid, meta_buf_valid}), - .inp_ready_o ({m2s_resp_ready, meta_buf_ready}), - .oup_valid_o ( mem_join_valid ), - .oup_ready_i ( mem_join_ready ) - ); - - // Dynamically fork the joined stream to B and R channels. - stream_fork_dynamic #( - .N_OUP ( 32'd2 ) - ) i_fork_dynamic ( - .clk_i, - .rst_ni, - .valid_i ( mem_join_valid ), - .ready_o ( mem_join_ready ), - .sel_i ({sel_buf_b, sel_buf_r }), - .sel_valid_i ( sel_buf_valid ), - .sel_ready_o ( sel_buf_ready ), - .valid_o ({axi_resp_o.b_valid, axi_resp_o.r_valid}), - .ready_i ({axi_req_i.b_ready, axi_req_i.r_ready }) - ); - - // Compose B responses. - assign axi_resp_o.b = '{ - id: meta_buf.id, - resp: axi_pkg::RESP_OKAY, - user: '0 - }; - - // Compose R responses. 
- assign axi_resp_o.r = '{ - data: m2s_resp, - id: meta_buf.id, - last: meta_buf.last, - resp: axi_pkg::RESP_OKAY, - user: '0 - }; - - // Registers - `FFARN(meta_sel_q, meta_sel_d, 1'b0, clk_i, rst_ni) - `FFARN(sel_lock_q, sel_lock_d, 1'b0, clk_i, rst_ni) - `FFARN(rd_meta_q, rd_meta_d, meta_t'{default: '0}, clk_i, rst_ni) - `FFARN(wr_meta_q, wr_meta_d, meta_t'{default: '0}, clk_i, rst_ni) - `FFARN(r_cnt_q, r_cnt_d, '0, clk_i, rst_ni) - `FFARN(w_cnt_q, w_cnt_d, '0, clk_i, rst_ni) - - // Assertions - // pragma translate_off - `ifndef VERILATOR - default disable iff (!rst_ni); - assume property (@(posedge clk_i) - axi_req_i.ar_valid && !axi_resp_o.ar_ready |=> $stable(axi_req_i.ar)) - else $error("AR must remain stable until handshake has happened!"); - assert property (@(posedge clk_i) - axi_resp_o.r_valid && !axi_req_i.r_ready |=> $stable(axi_resp_o.r)) - else $error("R must remain stable until handshake has happened!"); - assume property (@(posedge clk_i) - axi_req_i.aw_valid && !axi_resp_o.aw_ready |=> $stable(axi_req_i.aw)) - else $error("AW must remain stable until handshake has happened!"); - assume property (@(posedge clk_i) - axi_req_i.w_valid && !axi_resp_o.w_ready |=> $stable(axi_req_i.w)) - else $error("W must remain stable until handshake has happened!"); - assert property (@(posedge clk_i) - axi_resp_o.b_valid && !axi_req_i.b_ready |=> $stable(axi_resp_o.b)) - else $error("B must remain stable until handshake has happened!"); - assert property (@(posedge clk_i) axi_req_i.ar_valid && axi_req_i.ar.len > 0 |-> - axi_req_i.ar.burst == axi_pkg::BURST_INCR) - else $error("Non-incrementing bursts are not supported!"); - assert property (@(posedge clk_i) axi_req_i.aw_valid && axi_req_i.aw.len > 0 |-> - axi_req_i.aw.burst == axi_pkg::BURST_INCR) - else $error("Non-incrementing bursts are not supported!"); - assert property (@(posedge clk_i) meta_valid && meta.atop != '0 |-> meta.write) - else $warning("Unexpected atomic operation on read."); - `endif - // pragma 
translate_on -endmodule - - -`include "axi/assign.svh" -`include "axi/typedef.svh" -/// Interface wrapper for module `axi_to_mem`. -module axi_to_mem_intf #( - /// See `axi_to_mem`, parameter `AddrWidth`. - parameter int unsigned ADDR_WIDTH = 32'd0, - /// See `axi_to_mem`, parameter `DataWidth`. - parameter int unsigned DATA_WIDTH = 32'd0, - /// AXI4+ATOP ID width. - parameter int unsigned ID_WIDTH = 32'd0, - /// AXI4+ATOP user width. - parameter int unsigned USER_WIDTH = 32'd0, - /// See `axi_to_mem`, parameter `NumBanks`. - parameter int unsigned NUM_BANKS = 32'd0, - /// See `axi_to_mem`, parameter `BufDepth`. - parameter int unsigned BUF_DEPTH = 32'd1, - /// Dependent parameter, do not override. See `axi_to_mem`, parameter `addr_t`. - localparam type addr_t = logic [ADDR_WIDTH-1:0], - /// Dependent parameter, do not override. See `axi_to_mem`, parameter `mem_data_t`. - localparam type mem_data_t = logic [DATA_WIDTH/NUM_BANKS-1:0], - /// Dependent parameter, do not override. See `axi_to_mem`, parameter `mem_strb_t`. - localparam type mem_strb_t = logic [DATA_WIDTH/NUM_BANKS/8-1:0] -) ( - /// Clock input. - input logic clk_i, - /// Asynchronous reset, active low. - input logic rst_ni, - /// See `axi_to_mem`, port `busy_o`. - output logic busy_o, - /// AXI4+ATOP slave interface port. - AXI_BUS.Slave slv, - /// See `axi_to_mem`, port `mem_req_o`. - output logic [NUM_BANKS-1:0] mem_req_o, - /// See `axi_to_mem`, port `mem_gnt_i`. - input logic [NUM_BANKS-1:0] mem_gnt_i, - /// See `axi_to_mem`, port `mem_addr_o`. - output addr_t [NUM_BANKS-1:0] mem_addr_o, - /// See `axi_to_mem`, port `mem_wdata_o`. - output mem_data_t [NUM_BANKS-1:0] mem_wdata_o, - /// See `axi_to_mem`, port `mem_strb_o`. - output mem_strb_t [NUM_BANKS-1:0] mem_strb_o, - /// See `axi_to_mem`, port `mem_atop_o`. - output axi_pkg::atop_t [NUM_BANKS-1:0] mem_atop_o, - /// See `axi_to_mem`, port `mem_we_o`. - output logic [NUM_BANKS-1:0] mem_we_o, - /// See `axi_to_mem`, port `mem_rvalid_i`. 
- input logic [NUM_BANKS-1:0] mem_rvalid_i, - /// See `axi_to_mem`, port `mem_rdata_i`. - input mem_data_t [NUM_BANKS-1:0] mem_rdata_i -); - typedef logic [ID_WIDTH-1:0] id_t; - typedef logic [DATA_WIDTH-1:0] data_t; - typedef logic [DATA_WIDTH/8-1:0] strb_t; - typedef logic [USER_WIDTH-1:0] user_t; - `AXI_TYPEDEF_AW_CHAN_T(aw_chan_t, addr_t, id_t, user_t) - `AXI_TYPEDEF_W_CHAN_T(w_chan_t, data_t, strb_t, user_t) - `AXI_TYPEDEF_B_CHAN_T(b_chan_t, id_t, user_t) - `AXI_TYPEDEF_AR_CHAN_T(ar_chan_t, addr_t, id_t, user_t) - `AXI_TYPEDEF_R_CHAN_T(r_chan_t, data_t, id_t, user_t) - `AXI_TYPEDEF_REQ_T(req_t, aw_chan_t, w_chan_t, ar_chan_t) - `AXI_TYPEDEF_RESP_T(resp_t, b_chan_t, r_chan_t) - req_t req; - resp_t resp; - `AXI_ASSIGN_TO_REQ(req, slv) - `AXI_ASSIGN_FROM_RESP(slv, resp) - axi_to_mem #( - .axi_req_t ( req_t ), - .axi_resp_t ( resp_t ), - .AddrWidth ( ADDR_WIDTH ), - .DataWidth ( DATA_WIDTH ), - .IdWidth ( ID_WIDTH ), - .NumBanks ( NUM_BANKS ), - .BufDepth ( BUF_DEPTH ) - ) i_axi_to_mem ( - .clk_i, - .rst_ni, - .busy_o, - .axi_req_i ( req ), - .axi_resp_o ( resp ), - .mem_req_o, - .mem_gnt_i, - .mem_addr_o, - .mem_wdata_o, - .mem_strb_o, - .mem_atop_o, - .mem_we_o, - .mem_rvalid_i, - .mem_rdata_i - ); -endmodule - -/// Split memory access over multiple parallel banks, where each bank has its own req/gnt -/// request and valid response direction. -module mem_to_banks #( - /// Input address width. - parameter int unsigned AddrWidth = 32'd0, - /// Input data width, must be a power of two. - parameter int unsigned DataWidth = 32'd0, - /// Number of banks at output, must evenly divide `DataWidth`. - parameter int unsigned NumBanks = 32'd0, - /// Dependent parameter, do not override! Address type. - localparam type addr_t = logic [AddrWidth-1:0], - /// Dependent parameter, do not override! Input data type. - localparam type inp_data_t = logic [DataWidth-1:0], - /// Dependent parameter, do not override! Input write strobe type. 
- localparam type inp_strb_t = logic [DataWidth/8-1:0], - /// Dependent parameter, do not override! Output data type. - localparam type oup_data_t = logic [DataWidth/NumBanks-1:0], - /// Dependent parameter, do not override! Output write strobe type. - localparam type oup_strb_t = logic [DataWidth/NumBanks/8-1:0] -) ( - /// Clock input. - input logic clk_i, - /// Asynchronous reset, active low. - input logic rst_ni, - /// Memory request to split, request is valid. - input logic req_i, - /// Memory request to split, request can be granted. - output logic gnt_o, - /// Memory request to split, request address, byte-wise. - input addr_t addr_i, - /// Memory request to split, request write data. - input inp_data_t wdata_i, - /// Memory request to split, request write strobe. - input inp_strb_t strb_i, - /// Memory request to split, request Atomic signal from AXI4+ATOP. - input axi_pkg::atop_t atop_i, - /// Memory request to split, request write enable, active high. - input logic we_i, - /// Memory request to split, response is valid. Required for read and write requests - output logic rvalid_o, - /// Memory request to split, response read data. - output inp_data_t rdata_o, - /// Memory bank request, request is valid. - output logic [NumBanks-1:0] bank_req_o, - /// Memory bank request, request can be granted. - input logic [NumBanks-1:0] bank_gnt_i, - /// Memory bank request, request address, byte-wise. Will be different for each bank. - output addr_t [NumBanks-1:0] bank_addr_o, - /// Memory bank request, request write data. - output oup_data_t [NumBanks-1:0] bank_wdata_o, - /// Memory bank request, request write strobe. - output oup_strb_t [NumBanks-1:0] bank_strb_o, - /// Memory bank request, request Atomic signal from AXI4+ATOP. - output axi_pkg::atop_t [NumBanks-1:0] bank_atop_o, - /// Memory bank request, request write enable, active high. - output logic [NumBanks-1:0] bank_we_o, - /// Memory bank request, response is valid. 
Required for read and write requests - input logic [NumBanks-1:0] bank_rvalid_i, - /// Memory bank request, response read data. - input oup_data_t [NumBanks-1:0] bank_rdata_i -); - - localparam DataBytes = $bits(inp_strb_t); - localparam BitsPerBank = $bits(oup_data_t); - localparam BytesPerBank = $bits(oup_strb_t); - - typedef struct packed { - addr_t addr; - oup_data_t wdata; - oup_strb_t strb; - axi_pkg::atop_t atop; - logic we; - } req_t; - - logic req_valid; - logic [NumBanks-1:0] req_ready, - resp_valid, resp_ready; - req_t [NumBanks-1:0] bank_req, - bank_oup; - - function automatic addr_t align_addr(input addr_t addr); - return (addr >> $clog2(DataBytes)) << $clog2(DataBytes); - endfunction - - // Handle requests. - assign req_valid = req_i & gnt_o; - for (genvar i = 0; unsigned'(i) < NumBanks; i++) begin : gen_reqs - assign bank_req[i].addr = align_addr(addr_i) + i * BytesPerBank; - assign bank_req[i].wdata = wdata_i[i*BitsPerBank+:BitsPerBank]; - assign bank_req[i].strb = strb_i[i*BytesPerBank+:BytesPerBank]; - assign bank_req[i].atop = atop_i; - assign bank_req[i].we = we_i; - fall_through_register #( - .T ( req_t ) - ) i_ft_reg ( - .clk_i, - .rst_ni, - .clr_i ( 1'b0 ), - .testmode_i ( 1'b0 ), - .valid_i ( req_valid ), - .ready_o ( req_ready[i] ), - .data_i ( bank_req[i] ), - .valid_o ( bank_req_o[i] ), - .ready_i ( bank_gnt_i[i] ), - .data_o ( bank_oup[i] ) - ); - assign bank_addr_o[i] = bank_oup[i].addr; - assign bank_wdata_o[i] = bank_oup[i].wdata; - assign bank_strb_o[i] = bank_oup[i].strb; - assign bank_atop_o[i] = bank_oup[i].atop; - assign bank_we_o[i] = bank_oup[i].we; - end - - // Grant output if all our requests have been granted. - assign gnt_o = (&req_ready) & (&resp_ready); - - // Handle responses. 
- for (genvar i = 0; unsigned'(i) < NumBanks; i++) begin : gen_resp_regs - fall_through_register #( - .T ( oup_data_t ) - ) i_ft_reg ( - .clk_i, - .rst_ni, - .clr_i ( 1'b0 ), - .testmode_i ( 1'b0 ), - .valid_i ( bank_rvalid_i[i] ), - .ready_o ( resp_ready[i] ), - .data_i ( bank_rdata_i[i] ), - .data_o ( rdata_o[i*BitsPerBank+:BitsPerBank] ), - .ready_i ( rvalid_o ), - .valid_o ( resp_valid[i] ) - ); - end - assign rvalid_o = &resp_valid; - - // Assertions - // pragma translate_off - `ifndef VERILATOR - initial begin - assume (DataWidth != 0 && (DataWidth & (DataWidth - 1)) == 0) - else $fatal(1, "Data width must be a power of two!"); - assume (DataWidth % NumBanks == 0) - else $fatal(1, "Data width must be evenly divisible over banks!"); - assume ((DataWidth / NumBanks) % 8 == 0) - else $fatal(1, "Data width of each bank must be divisible into 8-bit bytes!"); - end - `endif - // pragma translate_on -endmodule diff --git a/hardware/src/cva6_accel_first_pass_decoder.sv b/hardware/src/cva6_accel_first_pass_decoder.sv index 0519d58b7..74c7e14e2 100644 --- a/hardware/src/cva6_accel_first_pass_decoder.sv +++ b/hardware/src/cva6_accel_first_pass_decoder.sv @@ -7,36 +7,49 @@ // instruction, whether it reads scalar registers, and whether // it writes to a destination scalar register -module cva6_accel_first_pass_decoder import rvv_pkg::*; ( - input logic [31:0] instruction_i, // instruction from IF - output logic is_accel_o, // is a vector instruction - output logic is_rs1_o, - output logic is_rs2_o, - output logic is_rd_o, - output logic is_fs1_o, - output logic is_fs2_o, - output logic is_fd_o, - output logic is_vfp_o, // is a vector floating-point instruction - output logic is_load_o, - output logic is_store_o +module cva6_accel_first_pass_decoder import rvv_pkg::*; import ariane_pkg::*; ( + input logic [31:0] instruction_i, // instruction from IF + input riscv::xs_t fs_i, // floating point extension status + input riscv::xs_t vs_i, // vector extension status + output 
logic is_accel_o, // is a vector instruction + output scoreboard_entry_t instruction_o, // predecoded instruction + output logic illegal_instr_o, // is an illegal instruction + output logic is_control_flow_instr_o ); + logic is_rs1; + logic is_rs2; + logic is_rd; + logic is_fs1; + logic is_fs2; + logic is_fd; + logic is_vfp; // is a vector floating-point instruction + logic is_load; + logic is_store; + // Cast instruction into the `rvv_instruction_t` struct rvv_instruction_t instr; assign instr = rvv_instruction_t'(instruction_i); + // Cast instruction into scalar `instruction_t` struct + riscv::instruction_t instr_scalar; + assign instr_scalar = riscv::instruction_t'(instruction_i); + + // Vector instructions never change control flow + assign is_control_flow_instr_o = 1'b0; + always_comb begin // Default values is_accel_o = 1'b0; - is_rs1_o = 1'b0; - is_rs2_o = 1'b0; - is_rd_o = 1'b0; - is_fs1_o = 1'b0; - is_fs2_o = 1'b0; - is_fd_o = 1'b0; - is_vfp_o = 1'b0; - is_load_o = instr.i_type.opcode == riscv::OpcodeLoadFp; - is_store_o = instr.i_type.opcode == riscv::OpcodeStoreFp; + is_rs1 = 1'b0; + is_rs2 = 1'b0; + is_rd = 1'b0; + is_fs1 = 1'b0; + is_fs2 = 1'b0; + is_fd = 1'b0; + is_vfp = 1'b0; + is_load = instr.i_type.opcode == riscv::OpcodeLoadFp; + is_store = instr.i_type.opcode == riscv::OpcodeStoreFp; // Decode based on the opcode case (instr.i_type.opcode) @@ -46,20 +59,20 @@ module cva6_accel_first_pass_decoder import rvv_pkg::*; ( is_accel_o = 1'b1; case (instr.varith_type.func3) OPFVV: begin - is_fd_o = instr.varith_type.func6 == 6'b010_000; // VFWUNARY0 - is_vfp_o = 1'b1; + is_fd = instr.varith_type.func6 == 6'b010_000; // VFWUNARY0 + is_vfp = 1'b1; end - OPMVV: is_rd_o = instr.varith_type.func6 == 6'b010_000; // VWXUNARY0 - OPIVX: is_rs1_o = 1'b1 ; + OPMVV: is_rd = instr.varith_type.func6 == 6'b010_000; // VWXUNARY0 + OPIVX: is_rs1 = 1'b1 ; OPFVF: begin - is_fs1_o = 1'b1; - is_vfp_o = 1'b1; + is_fs1 = 1'b1; + is_vfp = 1'b1; end - OPMVX: is_rs1_o = 1'b1 ; + 
OPMVX: is_rs1 = 1'b1 ; OPCFG: begin - is_rs1_o = instr.vsetivli_type.func2 != 2'b11; // not vsetivli - is_rs2_o = instr.vsetvl_type.func7 == 7'b100_0000; // vsetvl - is_rd_o = 1'b1 ; + is_rs1 = instr.vsetivli_type.func2 != 2'b11; // not vsetivli + is_rs2 = instr.vsetvl_type.func7 == 7'b100_0000; // vsetvl + is_rd = 1'b1 ; end endcase end @@ -77,8 +90,8 @@ module cva6_accel_first_pass_decoder import rvv_pkg::*; ( 4'b1110, //VLxE512/VSxE512 4'b1111: begin //VLxE1024/VSxE1024 is_accel_o = 1'b1 ; - is_rs1_o = 1'b1 ; - is_rs2_o = instr.vmem_type.mop == 2'b10; // Strided operation + is_rs1 = 1'b1 ; + is_rs2 = instr.vmem_type.mop == 2'b10; // Strided operation end endcase end @@ -91,7 +104,7 @@ module cva6_accel_first_pass_decoder import rvv_pkg::*; ( 3'b110, //VAMO*EI32.V 3'b111: begin //VAMO*EI64.V is_accel_o = 1'b1; - is_rs1_o = 1'b1; + is_rs1 = 1'b1; end endcase end @@ -106,13 +119,44 @@ module cva6_accel_first_pass_decoder import rvv_pkg::*; ( 3'b110, //CSRRSI 3'b111: begin //CSRRCI is_accel_o = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); - is_rs1_o = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); - is_rs2_o = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); - is_rd_o = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); + is_rs1 = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); + is_rs2 = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); + is_rd = is_vector_csr(riscv::csr_reg_t'(instr.i_type.imm)); end endcase end endcase end + always_comb begin + instruction_o = '0; + illegal_instr_o = 1'b1; + + if (is_accel_o && vs_i != riscv::Off) begin // trigger illegal instruction if the vector extension is turned off + // TODO: Instruction going to other accelerators might need to distinguish whether the value of vs_i is needed or not. + // Send accelerator instructions to the coprocessor + instruction_o.fu = ACCEL; + instruction_o.vfp = is_vfp; + instruction_o.rs1 = (is_rs1 || is_fs1) ? 
instr_scalar.rtype.rs1 : {REG_ADDR_SIZE{1'b0}}; + instruction_o.rs2 = (is_rs2 || is_fs2) ? instr_scalar.rtype.rs2 : {REG_ADDR_SIZE{1'b0}}; + instruction_o.rd = (is_rd || is_fd) ? instr_scalar.rtype.rd : {REG_ADDR_SIZE{1'b0}}; + + // Decode the vector operation + unique case ({is_store, is_load, is_fs1, is_fs2, is_fd}) + 5'b10000: instruction_o.op = ACCEL_OP_STORE; + 5'b01000: instruction_o.op = ACCEL_OP_LOAD; + 5'b00100: instruction_o.op = ACCEL_OP_FS1; + 5'b00001: instruction_o.op = ACCEL_OP_FD; + 5'b00000: instruction_o.op = ACCEL_OP; + endcase + + // Check that mstatus.FS is not OFF if we have a FP instruction for the accelerator + illegal_instr_o = (is_vfp && (fs_i == riscv::Off)) ? 1'b1 : 1'b0; + + // result holds the undecoded instruction + instruction_o.result = { {riscv::XLEN-32{1'b0}}, instruction_i[31:0] }; + instruction_o.use_imm = 1'b0; + end + end + endmodule : cva6_accel_first_pass_decoder diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 386b9823c..ba82f8922 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -113,7 +113,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // bits that indicate whether there is a hazard between different vector // instructions. Such hazards must be continuously cleared based on the // value of the currently running loops from the main sequencer. 
- operand_request_cmd_t [NrOperandQueues-1:0] operand_request_i; + operand_request_cmd_t [NrOperandQueues-1:0] operand_request; logic [NrOperandQueues-1:0] operand_request_push; operand_request_cmd_t [NrOperandQueues-1:0] operand_request_d; @@ -133,7 +133,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Got a new request if (operand_request_push[queue]) begin - operand_request_d[queue] = operand_request_i[queue]; + operand_request_d[queue] = operand_request[queue]; operand_request_valid_d[queue] = 1'b1; end end @@ -189,7 +189,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: pe_resp_o.vinsn_done = vinsn_done_q; // Make no requests to the operand requester - operand_request_i = '0; + operand_request = '0; operand_request_push = '0; // Make no requests to the lane's VFUs @@ -197,7 +197,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vfu_operation_valid_d = 1'b0; // If the operand requesters are busy, abort the request and wait for another cycle. 
- if (pe_req_valid) begin + if (pe_req_valid) begin : stall_op_req_busy unique case (pe_req.vfu) VFU_Alu : begin pe_req_ready = !(operand_request_valid_o[AluA] || @@ -230,11 +230,11 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: pe_req_ready = !(operand_request_valid_o[MaskB]); end default:; - endcase + endcase // stall_op_req_busy end // We received a new vector instruction - if (pe_req_valid && pe_req_ready && !vinsn_running_d[pe_req.id]) begin + if (pe_req_valid && pe_req_ready && !vinsn_running_d[pe_req.id]) begin : pe_req_valid // Populate the VFU request vfu_operation_d = '{ id : pe_req.id, @@ -263,9 +263,13 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0]) vfu_operation_d.vl += 1; // Vector start calculation - vfu_operation_d.vstart = pe_req.vstart / NrLanes; - // If lane_id_i < vstart % NrLanes, this lane needs to execute one micro-operation less. - if (lane_id_i < pe_req.vstart[idx_width(NrLanes)-1:0]) vfu_operation_d.vstart -= 1; + // TODO: check for LMUL = 4, 8 + // TODO: check for SEW != 64 + vfu_operation_d.vstart = pe_req.vstart / NrLanes; // High bits + // If lane_id_i < (vstart % NrLanes), this lane needs to execute one micro-operation less. + if (lane_id_i < pe_req.vstart[idx_width(NrLanes)-1:0]) begin : adjust_vstart_lane + vfu_operation_d.vstart += 1; + end : adjust_vstart_lane // Mark the vector instruction as running vinsn_running_d[pe_req.id] = (vfu_operation_d.vfu != VFU_None) ? 
1'b1 : 1'b0; @@ -287,7 +291,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: unique case (pe_req.vfu) VFU_Alu: begin - operand_request_i[AluA] = '{ + operand_request[AluA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -306,7 +310,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; operand_request_push[AluA] = pe_req.use_vs1; - operand_request_i[AluB] = '{ + operand_request[AluB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -328,24 +332,24 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request_push[AluB] = pe_req.use_vs2; // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_MFpu: begin - operand_request_i[MulFPUA] = '{ + operand_request[MulFPUA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -365,7 +369,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; operand_request_push[MulFPUA] = pe_req.use_vs1; - operand_request_i[MulFPUB] = '{ + operand_request[MulFPUB] = '{ id : pe_req.id, vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, eew : pe_req.swap_vs2_vd_op ? 
pe_req.eew_vd_op : pe_req.eew_vs2, @@ -388,7 +392,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request_push[MulFPUB] = pe_req.swap_vs2_vd_op ? pe_req.use_vd_op : pe_req.use_vs2; - operand_request_i[MulFPUC] = '{ + operand_request[MulFPUC] = '{ id : pe_req.id, vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, @@ -411,42 +415,42 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: pe_req.use_vs2 : pe_req.use_vd_op; // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_LoadUnit : begin // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Load indexed - operand_request_i[SlideAddrGenA] = '{ + operand_request[SlideAddrGenA] = '{ id : pe_req_i.id, vs : pe_req_i.vs2, eew : pe_req_i.eew_vs2, @@ -461,13 +465,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if (operand_request_i[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) - operand_request_i[SlideAddrGenA].vl += 1; + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) + operand_request[SlideAddrGenA].vl += 1; operand_request_push[SlideAddrGenA] = pe_req_i.op == VLXE; end VFU_StoreUnit : begin - operand_request_i[StA] = '{ + // vstart is supported here + operand_request[StA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -481,28 +486,34 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, default : '0 }; - if (operand_request_i[StA].vl * NrLanes != pe_req.vl) operand_request_i[StA].vl += 1; + // vl is not an integer multiple of NrLanes + // I.e., ( ( pe_req.vl / NrLanes * NrLanes ) == vl ) <=> ( ( vl % NrLanes ) != 0 ) + if ( ( operand_request[StA].vl * NrLanes ) != pe_req.vl ) begin : tweak_vl_StA + operand_request[StA].vl += 1; + end : tweak_vl_StA operand_request_push[StA] = pe_req.use_vs1; // This vector instruction uses masks - 
operand_request_i[MaskM] = '{ + // TODO: add vstart support here + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> int'(pe_req.vtype.vsew), + vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * + NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Store indexed - operand_request_i[SlideAddrGenA] = '{ + // TODO: add vstart support here + operand_request[SlideAddrGenA] = '{ id : pe_req_i.id, vs : pe_req_i.vs2, eew : pe_req_i.eew_vs2, @@ -517,13 +528,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if (operand_request_i[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) - operand_request_i[SlideAddrGenA].vl += 1; + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) begin : tweak_vl_SlideAddrGenA + operand_request[SlideAddrGenA].vl += 1; + end : tweak_vl_SlideAddrGenA operand_request_push[SlideAddrGenA] = pe_req_i.op == VSXE; end VFU_SlideUnit: begin - operand_request_i[SlideAddrGenA] = '{ + operand_request[SlideAddrGenA] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -543,7 +555,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // as operands by the slide unit. 
// Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request_i[SlideAddrGenA].vl = + operand_request[SlideAddrGenA].vl = (pe_req.vl - pe_req.stride + NrLanes - 1) / NrLanes; end VSLIDEDOWN: begin @@ -554,7 +566,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // We need to trim full words from the start of the vector that are not used // as operands by the slide unit. - operand_request_i[SlideAddrGenA].vstart = pe_req.stride / NrLanes; + operand_request[SlideAddrGenA].vstart = pe_req.stride / NrLanes; // The stride move the initial address in boundaries of 8*NrLanes Byte. // If the stride is not multiple of a full VRF word (8*NrLanes Byte), @@ -576,15 +588,15 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vl_tot += extra_stride; // Ask the elements, and ask one more if we do not perfectly divide NrLanes - operand_request_i[SlideAddrGenA].vl = vl_tot / NrLanes; - if (operand_request_i[SlideAddrGenA].vl * NrLanes != vl_tot) - operand_request_i[SlideAddrGenA].vl += 1; + operand_request[SlideAddrGenA].vl = vl_tot / NrLanes; + if (operand_request[SlideAddrGenA].vl * NrLanes != vl_tot) + operand_request[SlideAddrGenA].vl += 1; end default:; endcase // This vector instruction uses masks - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, @@ -601,32 +613,32 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // as operands by the slide unit. // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- operand_request_i[MaskM].vl = + operand_request[MaskM].vl = ((pe_req.vl - pe_req.stride + NrLanes - 1) / 8 / NrLanes) - >> int'(pe_req.vtype.vsew); + >> unsigned'(pe_req.vtype.vsew); - if (((operand_request_i[MaskM].vl + pe_req.stride) << - int'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl)) - operand_request_i[MaskM].vl += 1; + if (((operand_request[MaskM].vl + pe_req.stride) << + unsigned'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl)) + operand_request[MaskM].vl += 1; // SLIDEUP only uses mask bits whose indices are > stride // Don't send the previous (unused) ones to the MASKU if (pe_req.stride >= NrLanes * 64) - operand_request_i[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8; + operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8; end VSLIDEDOWN: begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request_i[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> int'( + operand_request[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> unsigned'( pe_req.vtype.vsew)); - if ((operand_request_i[MaskM].vl << int'(pe_req.vtype.vsew)) * + if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * NrLanes * 8 != pe_req.vl) - operand_request_i[MaskM].vl += 1; + operand_request[MaskM].vl += 1; end endcase end VFU_MaskUnit: begin - operand_request_i[AluA] = '{ + operand_request[AluA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -640,21 +652,21 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request_i[AluA].vl = vfu_operation_d.vl; + operand_request[AluA].vl = vfu_operation_d.vl; end // This is an operation that runs normally on the ALU, and then gets reshuffled at the // Mask Unit. 
else begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request_i[AluA].vl = (pe_req.vl / NrLanes) >> - (int'(EW64) - int'(pe_req.eew_vs1)); - if ((operand_request_i[AluA].vl << (int'(EW64) - int'(pe_req.eew_vs1))) * NrLanes != - pe_req.vl) operand_request_i[AluA].vl += 1; + operand_request[AluA].vl = (pe_req.vl / NrLanes) >> + (unsigned'(EW64) - unsigned'(pe_req.eew_vs1)); + if ((operand_request[AluA].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs1))) * NrLanes != + pe_req.vl) operand_request[AluA].vl += 1; end operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]}); - operand_request_i[AluB] = '{ + operand_request[AluB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -667,21 +679,21 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request_i[AluB].vl = vfu_operation_d.vl; + operand_request[AluB].vl = vfu_operation_d.vl; end // This is an operation that runs normally on the ALU, and then gets reshuffled at the // Mask Unit. else begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- operand_request_i[AluB].vl = (pe_req.vl / NrLanes) >> - (int'(EW64) - int'(pe_req.eew_vs2)); - if ((operand_request_i[AluB].vl << (int'(EW64) - int'(pe_req.eew_vs2))) * NrLanes != - pe_req.vl) operand_request_i[AluB].vl += 1; + operand_request[AluB].vl = (pe_req.vl / NrLanes) >> + (unsigned'(EW64) - unsigned'(pe_req.eew_vs2)); + if ((operand_request[AluB].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs2))) * NrLanes != + pe_req.vl) operand_request[AluB].vl += 1; end operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]}); - operand_request_i[MulFPUA] = '{ + operand_request[MulFPUA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, @@ -694,10 +706,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. - operand_request_i[MulFPUA].vl = vfu_operation_d.vl; + operand_request[MulFPUA].vl = vfu_operation_d.vl; operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]}; - operand_request_i[MulFPUB] = '{ + operand_request[MulFPUB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -709,10 +721,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. 
- operand_request_i[MulFPUB].vl = vfu_operation_d.vl; + operand_request[MulFPUB].vl = vfu_operation_d.vl; operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]}; - operand_request_i[MaskB] = '{ + operand_request[MaskB] = '{ id : pe_req.id, vs : pe_req.vd, eew : pe_req.eew_vd_op, @@ -720,16 +732,16 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / ELEN) << (int'(EW64) - int'(pe_req.vtype.vsew)), + vl : (pe_req.vl / NrLanes / ELEN) << (unsigned'(EW64) - unsigned'(pe_req.vtype.vsew)), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vd, default : '0 }; if (((pe_req.vl / NrLanes / ELEN) * NrLanes * ELEN) != - pe_req.vl) operand_request_i[MaskB].vl += 1; + pe_req.vl) operand_request[MaskB].vl += 1; operand_request_push[MaskB] = pe_req.use_vd_op; - operand_request_i[MaskM] = '{ + operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, eew : pe_req.vtype.vsew, @@ -741,13 +753,13 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: hazard : pe_req.hazard_vm, default: '0 }; - if ((operand_request_i[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin - operand_request_i[MaskM].vl += 1; + if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin + operand_request[MaskM].vl += 1; end operand_request_push[MaskM] = !pe_req.vm; end VFU_None: begin - operand_request_i[MaskB] = '{ + operand_request[MaskB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, @@ -763,8 +775,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request_push[MaskB] = 1'b1; end default:; - endcase - end + endcase // pe_req.vfu + end : pe_req_valid end: sequencer always_ff @(posedge clk_i or negedge rst_ni) begin: p_sequencer_ff diff --git 
a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv index 72c8202e1..9b8c1464c 100644 --- a/hardware/src/lane/operand_queue.sv +++ b/hardware/src/lane/operand_queue.sv @@ -127,7 +127,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i /////////////////////// // Count how many operands were already produced - vlen_t vl_d, vl_q; + vlen_t elem_count_d, elem_count_q; elen_t conv_operand; // Decide whether we are taking the operands from the lower or from the upper half of the input @@ -226,23 +226,23 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i end // Assert the signal if the last 64-bit packet will contain also - // elements with idx >= vl (they should not contribute to the result!). + // elements with idx >= elem_count (they should not contribute to the result!). // Gate for power saving // Power optimization: // The optimal solution would be to act on the mask bits in the two // processing units (valu and vmfpu), masking the unused elements. unique case (cmd.eew) EW8 : begin - incomplete_packet = |cmd.vl[2:0]; - last_packet = ((cmd.vl - vl_q) <= 8) ? 1'b1 : 1'b0; + incomplete_packet = |cmd.elem_count[2:0]; + last_packet = ((cmd.elem_count - elem_count_q) <= 8) ? 1'b1 : 1'b0; end EW16: begin - incomplete_packet = |cmd.vl[1:0]; - last_packet = ((cmd.vl - vl_q) <= 4) ? 1'b1 : 1'b0; + incomplete_packet = |cmd.elem_count[1:0]; + last_packet = ((cmd.elem_count - elem_count_q) <= 4) ? 1'b1 : 1'b0; end EW32: begin - incomplete_packet = |cmd.vl[0:0]; - last_packet = ((cmd.vl - vl_q) <= 2) ? 1'b1 : 1'b0; + incomplete_packet = |cmd.elem_count[0:0]; + last_packet = ((cmd.elem_count - elem_count_q) <= 2) ? 
1'b1 : 1'b0; end default: begin incomplete_packet = 1'b0; @@ -373,15 +373,15 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i if (SupportNtrVal) unique case (cmd.eew) EW8 : for (int unsigned b = 0; b < 8; b++) begin automatic int unsigned bs = shuffle_index(b, 1, EW8); - if ((b >> 0) >= cmd.vl[2:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; + if ((b >> 0) >= cmd.elem_count[2:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; end EW16: for (int unsigned b = 0; b < 8; b++) begin automatic int unsigned bs = shuffle_index(b, 1, EW16); - if ((b >> 1) >= cmd.vl[1:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; + if ((b >> 1) >= cmd.elem_count[1:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; end EW32: for (int unsigned b = 0; b < 8; b++) begin automatic int unsigned bs = shuffle_index(b, 1, EW32); - if ((b >> 2) >= cmd.vl[0:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; + if ((b >> 2) >= cmd.elem_count[0:0]) conv_operand[8*bs +: 8] = ntr.w8[b]; end default:; endcase @@ -401,7 +401,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // Maintain state select_d = select_q; - vl_d = vl_q; + elem_count_d = elem_count_q; // Send the operand operand_o = conv_operand; @@ -418,16 +418,16 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i OpQueueConversionZExt2, OpQueueConversionWideFP2, OpQueueAdjustFPCvt: - if (SupportIntExt2) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 2; + if (SupportIntExt2) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 2; OpQueueConversionSExt4, OpQueueConversionZExt4: - if (SupportIntExt4) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 4; + if (SupportIntExt4) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))) / 4; OpQueueConversionSExt8, OpQueueConversionZExt8: - if (SupportIntExt8) vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))) / 8; + if (SupportIntExt8) elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - 
unsigned'(cmd.eew))) / 8; OpQueueReductionZExt: - vl_d = vl_q + 1; - default: vl_d = vl_q + (1 << (int'(EW64) - int'(cmd.eew))); + elem_count_d = elem_count_q + 1; + default: elem_count_d = elem_count_q + (1 << (unsigned'(EW64) - unsigned'(cmd.eew))); endcase // Update the pointer to the input operand @@ -443,22 +443,22 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i if ((select_q != '0 && select_d == '0) || cmd.conv == OpQueueConversionNone) ibuf_pop = 1'b1; // Finished execution - if (vl_d >= cmd.vl) begin + if (elem_count_d >= cmd.elem_count) begin : finished_elems ibuf_pop = 1'b1; cmd_pop = 1'b1; select_d = '0; - vl_d = '0; - end + elem_count_d = '0; + end : finished_elems end end : obuf_control always_ff @(posedge clk_i or negedge rst_ni) begin: p_type_conversion_ff if (!rst_ni) begin select_q <= '0; - vl_q <= '0; + elem_count_q <= '0; end else begin select_q <= select_d; - vl_q <= vl_d; + elem_count_q <= elem_count_d; end end : p_type_conversion_ff diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 54590fbc3..4bbafdc75 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -199,7 +199,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Operand request // /////////////////////// - // There is an operand requester for each operand queue. Each one + // There is an operand requester_index for each operand queue. Each one // can be in one of the following two states. 
typedef enum logic { IDLE, @@ -223,216 +223,230 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( logic [NrBanks-1:0][NrMasters-1:0] operand_gnt; payload_t [NrMasters-1:0] operand_payload; - for (genvar requester = 0; requester < NrOperandQueues; requester++) begin: gen_operand_requester - // State of this operand requester + // Metadata required to request all elements of this vector operand + typedef struct packed { + // ID of the instruction for this requester_index + vid_t id; + // Address of the next element to be read + vaddr_t addr; + // How many elements remain to be read + vlen_t len; + // Element width + vew_e vew; + + // Hazards between vector instructions + logic [NrVInsn-1:0] hazard; + + // Widening instructions produces two writes of every read + // In case of a WAW with a previous instruction, + // read once every two writes of the previous instruction + logic is_widening; + // One-bit counters + logic [NrVInsn-1:0] waw_hazard_counter; + } requester_metadata_t; + + for (genvar requester_index = 0; requester_index < NrOperandQueues; requester_index++) begin : gen_operand_requester + // State of this operand requester_index state_t state_d, state_q; - // Metadata required to request all elements of this vector operand - struct packed { - // ID of the instruction for this requester - vid_t id; - // Address of the next element to be read - vaddr_t addr; - // How many elements remain to be read - vlen_t len; - // Element width - vew_e vew; - - // Hazards between vector instructions - logic [NrVInsn-1:0] hazard; - - // Widening instructions produces two writes of every read - // In case of a WAW with a previous instruction, - // read once every two writes of the previous instruction - logic is_widening; - // One-bit counters - logic [NrVInsn-1:0] waw_hazard_counter; - } requester_d, requester_q; - + requester_metadata_t requester_metadata_d, requester_metadata_q; // Is there a hazard during this cycle? 
logic stall; - assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & - (~{NrVInsn{requester_q.is_widening}} | requester_q.waw_hazard_counter))); + assign stall = |(requester_metadata_q.hazard & ~(vinsn_result_written_q & + (~{NrVInsn{requester_metadata_q.is_widening}} | requester_metadata_q.waw_hazard_counter))); // Did we get a grant? logic [NrBanks-1:0] operand_requester_gnt; for (genvar bank = 0; bank < NrBanks; bank++) begin: gen_operand_requester_gnt - assign operand_requester_gnt[bank] = operand_gnt[bank][requester]; + assign operand_requester_gnt[bank] = operand_gnt[bank][requester_index]; end // Did we issue a word to this operand queue? - assign operand_issued_o[requester] = |(operand_requester_gnt); + assign operand_issued_o[requester_index] = |(operand_requester_gnt); always_comb begin: operand_requester + // Helper local variables + automatic operand_queue_cmd_t operand_queue_cmd_tmp; + automatic requester_metadata_t requester_metadata_tmp; + automatic vlen_t vector_body_length; + automatic vlen_t scaled_vector_body_length; + automatic vlen_t effective_vector_body_length; + automatic vaddr_t vrf_addr; + // Maintain state state_d = state_q; - requester_d = requester_q; + requester_metadata_d = requester_metadata_q; // Make no requests to the VRF - operand_payload[requester] = '0; - for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester] = 1'b0; + operand_payload[requester_index] = '0; + for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester_index] = 1'b0; - // Do not acknowledge any operand requester commands - operand_request_ready_o[requester] = 1'b0; + // Do not acknowledge any operand requester_index commands + operand_request_ready_o[requester_index] = 1'b0; // Do not send any operand conversion commands - operand_queue_cmd_o[requester] = '0; - operand_queue_cmd_valid_o[requester] = 1'b0; + operand_queue_cmd_o[requester_index] = '0; + operand_queue_cmd_valid_o[requester_index] = 1'b0; + + // Prepare 
metadata upfront + // Length of vector body in elements, i.e., vl - vstart + vector_body_length = operand_request_i[requester_index].vl - operand_request_i[requester_index].vstart; + // For memory operations, the number of elements initially refers to the new EEW (vsew here), + // but the requester_index must refer to the old EEW (eew here) + // This reasoning cannot be applied also to widening instructions, which modify vsew + // treating it as the EEW of vd + scaled_vector_body_length = ( + vector_body_length + << operand_request_i[requester_index].vtype.vsew + ) >> operand_request_i[requester_index].eew; + // Final computed length + effective_vector_body_length = ( operand_request_i[requester_index].scale_vl ) + ? scaled_vector_body_length + : vector_body_length; + // Address of the vstart element of the vector in the VRF + vrf_addr = vaddr(operand_request_i[requester_index].vs, NrLanes) + + ( + operand_request_i[requester_index].vstart + >> (unsigned'(EW64) - unsigned'(operand_request_i[requester_index].eew)) + ); + // Init helper variables + requester_metadata_tmp = '{ + id : operand_request_i[requester_index].id, + addr : vrf_addr, + len : effective_vector_body_length, + vew : operand_request_i[requester_index].eew, + hazard : operand_request_i[requester_index].hazard, + is_widening : operand_request_i[requester_index].cvt_resize == CVT_WIDE, + default: '0 + }; + operand_queue_cmd_tmp = '{ + eew : operand_request_i[requester_index].eew, + elem_count: effective_vector_body_length, + conv : operand_request_i[requester_index].conv, + ntr_red : operand_request_i[requester_index].cvt_resize, + target_fu : operand_request_i[requester_index].target_fu, + is_reduct : operand_request_i[requester_index].is_reduct + }; case (state_q) - IDLE: begin + IDLE: begin : state_q_IDLE // Accept a new instruction - if (operand_request_valid_i[requester]) begin + if (operand_request_valid_i[requester_index]) begin : op_req_valid state_d = REQUESTING; // Acknowledge the request - 
operand_request_ready_o[requester] = 1'b1; + operand_request_ready_o[requester_index] = 1'b1; // Send a command to the operand queue - operand_queue_cmd_o[requester] = '{ - eew : operand_request_i[requester].eew, - // For memory operations, the number of elements initially refers to the new EEW (vsew here), - // but the requester must refer to the old EEW (eew here) - // This reasoning cannot be applied also to widening instructions, which modify vsew - // treating it as the EEW of vd - vl : (operand_request_i[requester].scale_vl) ? - ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - conv : operand_request_i[requester].conv, - ntr_red : operand_request_i[requester].cvt_resize, - target_fu: operand_request_i[requester].target_fu, - is_reduct: operand_request_i[requester].is_reduct - }; + operand_queue_cmd_o[requester_index] = operand_queue_cmd_tmp; + operand_queue_cmd_valid_o[requester_index] = 1'b1; + // The length should be at least one after the rescaling - if (operand_queue_cmd_o[requester].vl == '0) - operand_queue_cmd_o[requester].vl = 1; - operand_queue_cmd_valid_o[requester] = 1'b1; + if (operand_queue_cmd_o[requester_index].elem_count == '0) begin : cmd_zero_rescaled_vl + operand_queue_cmd_o[requester_index].elem_count = 1; + end : cmd_zero_rescaled_vl // Store the request - requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - // For memory operations, the number of elements initially refers to the new EEW (vsew here), - // but the requester must refer to the old EEW (eew here) - // This reasoning cannot be applied also to widening instructions, which modify vsew - // treating it as the EEW of vd - len : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE, - default: '0 - }; + requester_metadata_d = requester_metadata_tmp; + // The length should be at least one after the rescaling - if (requester_d.len == '0) - requester_d.len = 1; + if (requester_metadata_d.len == '0) begin : req_zero_rescaled_vl + requester_metadata_d.len = 1; + end : req_zero_rescaled_vl + // Mute the requisition if the vl is zero - if (operand_request_i[requester].vl == '0) begin + if (operand_request_i[requester_index].vl == '0) begin : zero_vl state_d = IDLE; - operand_queue_cmd_valid_o[requester] = 1'b0; - end - end - end + operand_queue_cmd_valid_o[requester_index] = 1'b0; + end : zero_vl + end : op_req_valid + end : state_q_IDLE - REQUESTING: begin + REQUESTING: begin : state_q_REQUESTING // Update waw counters - for (int b = 0; b < NrVInsn; b++) - if (vinsn_result_written_d[b]) - requester_d.waw_hazard_counter[b] = ~requester_q.waw_hazard_counter[b]; + for (int b = 0; b < NrVInsn; b++) begin : waw_counters_update + if ( vinsn_result_written_d[b] ) begin : result_valid + requester_metadata_d.waw_hazard_counter[b] = ~requester_metadata_q.waw_hazard_counter[b]; + end : result_valid + end : waw_counters_update - if (operand_queue_ready_i[requester]) begin + if (operand_queue_ready_i[requester_index]) begin : op_queue_ready // Bank we are currently requesting - automatic int bank = requester_q.addr[idx_width(NrBanks)-1:0]; + automatic int bank = requester_metadata_q.addr[idx_width(NrBanks)-1:0]; + automatic vlen_t num_bytes; // Operand request - operand_req[bank][requester] = !stall; - operand_payload[requester] = '{ - addr : requester_q.addr >> $clog2(NrBanks), - opqueue: opqueue_e'(requester), - default: '0 + 
operand_req[bank][requester_index] = !stall; + operand_payload[requester_index] = '{ + addr : requester_metadata_q.addr >> $clog2(NrBanks), + opqueue: opqueue_e'(requester_index), + default: '0 // this is a read operation }; // Received a grant. - if (|operand_requester_gnt) begin + if (|operand_requester_gnt) begin : op_req_grant // Bump the address pointer - requester_d.addr = requester_q.addr + 1'b1; + requester_metadata_d.addr = requester_metadata_q.addr + 1'b1; // We read less than 64 bits worth of elements - if (requester_q.len < (1 << (int'(EW64) - int'(requester_q.vew)))) - requester_d.len = 0; - else requester_d.len = requester_q.len - (1 << (int'(EW64) - int'(requester_q.vew))); - end + num_bytes = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) ); + if (requester_metadata_q.len < num_bytes) begin + requester_metadata_d.len = 0; + end + else begin + requester_metadata_d.len = requester_metadata_q.len - num_bytes; + end + end : op_req_grant // Finished requesting all the elements - if (requester_d.len == '0) begin + if (requester_metadata_d.len == '0) begin : req_finished state_d = IDLE; // Accept a new instruction - if (operand_request_valid_i[requester]) begin + if (operand_request_valid_i[requester_index]) begin : op_req_valid state_d = REQUESTING; // Acknowledge the request - operand_request_ready_o[requester] = 1'b1; + operand_request_ready_o[requester_index] = 1'b1; // Send a command to the operand queue - operand_queue_cmd_o[requester] = '{ - eew : operand_request_i[requester].eew, - vl : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - conv : operand_request_i[requester].conv, - ntr_red : operand_request_i[requester].cvt_resize, - target_fu: operand_request_i[requester].target_fu, - is_reduct: operand_request_i[requester].is_reduct - }; - operand_queue_cmd_valid_o[requester] = 1'b1; + operand_queue_cmd_o[requester_index] = operand_queue_cmd_tmp; + operand_queue_cmd_valid_o[requester_index] = 1'b1; + // The length should be at least one after the rescaling - if (operand_queue_cmd_o[requester].vl == '0) - operand_queue_cmd_o[requester].vl = 1; + if (operand_queue_cmd_o[requester_index].elem_count == '0) begin : cmd_zero_rescaled_vl + operand_queue_cmd_o[requester_index].elem_count = 1; + end : cmd_zero_rescaled_vl // Store the request - requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - len : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - default: '0 - }; + requester_metadata_d = requester_metadata_tmp; + // The length should be at least one after the rescaling - if (requester_d.len == '0) - requester_d.len = 1; - end - end - end - end - endcase + if (requester_metadata_d.len == '0) begin : req_zero_rescaled_vl + requester_metadata_d.len = 1; + end : req_zero_rescaled_vl + + // Mute the requisition if the vl is zero + if (operand_request_i[requester_index].vl == '0) begin : zero_vl + state_d = IDLE; + operand_queue_cmd_valid_o[requester_index] = 1'b0; + end : zero_vl + end : op_req_valid + end : req_finished + end : op_queue_ready + end : state_q_REQUESTING + endcase // state_q // Always keep the hazard bits up to date with the global hazard table - requester_d.hazard &= global_hazard_table_i[requester_d.id]; + requester_metadata_d.hazard &= global_hazard_table_i[requester_metadata_d.id]; end : operand_requester always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin state_q <= IDLE; - requester_q <= '0; + requester_metadata_q <= '0; end else begin state_q <= state_d; - requester_q <= requester_d; + requester_metadata_q <= requester_metadata_d; end end end : gen_operand_requester @@ -452,7 +466,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( operand_req[bank][NrOperandQueues + VFU_LoadUnit] = 1'b0; end - // Generate the payload + // Generate the payloads for write back operations operand_payload[NrOperandQueues + VFU_Alu] = '{ addr : alu_result_addr_i >> $clog2(NrBanks), wen : 1'b1, @@ -523,7 +537,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( logic payload_hp_req; logic payload_hp_gnt; rr_arb_tree #( - .NumIn (int'(MulFPUC) - int'(AluA) + 1 + int'(VFU_MFpu) - int'(VFU_Alu) + 1), + .NumIn 
(unsigned'(MulFPUC) - unsigned'(AluA) + 1 + unsigned'(VFU_MFpu) - unsigned'(VFU_Alu) + 1), .DataWidth($bits(payload_t) ), .AxiVldRdy(1'b0 ) ) i_hp_vrf_arbiter ( @@ -548,7 +562,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( logic payload_lp_req; logic payload_lp_gnt; rr_arb_tree #( - .NumIn(int'(SlideAddrGenA)- int'(MaskB) + 1 + int'(VFU_LoadUnit) - int'(VFU_SlideUnit) + 1), + .NumIn(unsigned'(SlideAddrGenA)- unsigned'(MaskB) + 1 + unsigned'(VFU_LoadUnit) - unsigned'(VFU_SlideUnit) + 1), .DataWidth($bits(payload_t) ), .AxiVldRdy(1'b0 ) ) i_lp_vrf_arbiter ( diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index 8d8b1024d..369784f78 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -449,7 +449,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? 
- automatic logic [3:0] element_cnt = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); + automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); + automatic vlen_t vector_body_length = vinsn_issue_q.vl - vinsn_issue_q.vstart; + if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -465,7 +467,12 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Store the result in the result queue result_queue_d[result_queue_write_pnt_q].wdata = result_queue_q[result_queue_write_pnt_q].wdata | valu_result; - result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_issue_q.vd, NrLanes) + ((vinsn_issue_q.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue_q.vtype.vsew)); + result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_issue_q.vd, NrLanes) + + ( + ( vinsn_issue_q.vl - issue_cnt_q ) // vstart is already considered in issue_cnt_q + >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew) + ) + ); result_queue_d[result_queue_write_pnt_q].id = vinsn_issue_q.id; result_queue_d[result_queue_write_pnt_q].mask = vinsn_issue_q.vfu == VFU_MaskUnit; if (!narrowing(vinsn_issue_q.op) || !narrowing_select_q) @@ -474,7 +481,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Is this a narrowing instruction? if (narrowing(vinsn_issue_q.op)) begin // How many elements did we calculate in this iteration? 
- automatic logic [3:0] element_cnt_narrow = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))) / 2; + automatic logic [3:0] element_cnt_narrow = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))) / 2; if (element_cnt_narrow > issue_cnt_q) element_cnt_narrow = issue_cnt_q; @@ -523,12 +530,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end end @@ -547,7 +557,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1 || !first_op_q) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? 
- automatic logic [3:0] element_cnt = (1 << (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); + automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -654,12 +664,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end @@ -690,12 +703,15 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += 
|vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end @@ -750,8 +766,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Decrement the counter of remaining vector elements waiting to be written // Don't do it in case of a reduction if (!is_reduction(vinsn_commit.op)) - commit_cnt_d = commit_cnt_q - (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); - if (commit_cnt_q < (1 << (int'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0; + commit_cnt_d = commit_cnt_q - (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew)); + if (commit_cnt_q < (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0; end // Finished committing the results of a vector instruction @@ -765,16 +781,20 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; else vinsn_queue_d.commit_pnt += 1; // Update the commit counter for the next instruction - if (vinsn_queue_d.commit_cnt != '0) + if (vinsn_queue_d.commit_cnt != '0) begin + automatic vlen_t vector_body_length = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vstart; if (!(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl; + commit_cnt_d = vector_body_length; else begin // We are asking for bits, and we want at least one chunk of bits if // vl > 0. 
Therefore, commit_cnt = ceil((vl / 8) >> sew) - commit_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + commit_cnt_d = (vector_body_length / 8) >> vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew; - commit_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl[2:0]; + commit_cnt_d += |vector_body_length[2:0]; end + end // Initialize counters and alu state if needed by the next instruction // After a reduction, the next instructions starts after the reduction commits @@ -796,14 +816,18 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; if (!vinsn_queue_full && vfu_operation_valid_i && (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMXNOR]})) begin + automatic vlen_t vector_body_length = vfu_operation_i.vl - vfu_operation_i.vstart; + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; // Do not wait for masks if, during a reduction, this lane is just a pass-through // The only valid instructions here with vl == '0 are reductions + // TODO: check if vector_body_length should be used insteada of plain vl here vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.vm | (vfu_operation_i.vl == '0); // Initialize counters and alu state if the instruction queue was empty // and the lane is not reducing if ((vinsn_queue_d.issue_cnt == '0) && !prevent_commit) begin + alu_state_d = is_reduction(vfu_operation_i.op) ? 
INTRA_LANE_REDUCTION : NO_REDUCTION; // The next will be the first operation of this instruction // This information is useful for reduction operation @@ -812,22 +836,24 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; reduction_rx_cnt_d = reduction_rx_cnt_init(NrLanes, lane_id_i); sldu_transactions_cnt_d = $clog2(NrLanes) + 1; - issue_cnt_d = vfu_operation_i.vl; + issue_cnt_d = vector_body_length; if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vfu_operation_i.vl; + issue_cnt_d = vector_body_length; else begin - issue_cnt_d = (vfu_operation_i.vl / 8) >> + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); + issue_cnt_d = (vector_body_length / 8) >> vfu_operation_i.vtype.vsew; - issue_cnt_d += |vfu_operation_i.vl[2:0]; + issue_cnt_d += |vector_body_length[2:0]; end end if (vinsn_queue_d.commit_cnt == '0) if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vfu_operation_i.vl; + commit_cnt_d = vector_body_length; else begin + $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); // Operations between mask vectors operate on bits - commit_cnt_d = (vfu_operation_i.vl / 8) >> vfu_operation_i.vtype.vsew; - commit_cnt_d += |vfu_operation_i.vl[2:0]; + commit_cnt_d = (vector_body_length / 8) >> vfu_operation_i.vtype.vsew; + commit_cnt_d += |vector_body_length[2:0]; end // Bump pointers and counters of the vector instruction queue diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index 36c76df21..fbe3f1a49 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -768,7 +768,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; }; // Don't compress classify result - localparam int unsigned TrueSIMDClass = 1; + localparam int unsigned TrueSIMDClass = 1; + localparam int unsigned EnableSIMDMask = 1; operation_e fp_op; logic fp_opmod; @@ -969,9 +970,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; 
import fpnew_pkg::*; .Features (FPUFeatures ), .Implementation(FPUImplementation), .TagType (strb_t ), - .NumLanes (FPULanes ), .TrueSIMDClass (TrueSIMDClass ), - .MaskType (fpu_mask_t ) + .EnableSIMDMask(EnableSIMDMask ) ) i_fpnew_bulk ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index 2fbe05e55..09c7bfaaa 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -26,15 +26,33 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( output axi_aw_t axi_aw_o, output logic axi_aw_valid_o, input logic axi_aw_ready_i, + // CSR input + input logic en_ld_st_translation_i, + // Interface with CVA6's sv39 MMU + // This is everything the MMU can provide, it might be overcomplete for Ara and some signals be useless + output ariane_pkg::exception_t mmu_misaligned_ex_o, + output logic mmu_req_o, // request address translation + output logic [riscv::VLEN-1:0] mmu_vaddr_o, // virtual address out + output logic mmu_is_store_o, // the translation is requested by a store + // if we need to walk the page table we can't grant in the same cycle + // Cycle 0 + input logic mmu_dtlb_hit_i, // sent in the same cycle as the request if translation hits in the DTLB + input logic [riscv::PPNW-1:0] mmu_dtlb_ppn_i, // ppn (send same cycle as hit) + // Cycle 1 + input logic mmu_valid_i, // translation is valid + input logic [riscv::PLEN-1:0] mmu_paddr_i, // translated address + input ariane_pkg::exception_t mmu_exception_i, // address translation threw an exception // Interace with the dispatcher input logic core_st_pending_i, // Interface with the main sequencer input pe_req_t pe_req_i, input logic pe_req_valid_i, input logic [NrVInsn-1:0] pe_vinsn_running_i, - output logic addrgen_error_o, + output ariane_pkg::exception_t addrgen_exception_o, output logic addrgen_ack_o, - output vlen_t addrgen_error_vl_o, + output vlen_t addrgen_exception_vl_o, + output logic addrgen_exception_load_o, + output logic 
addrgen_exception_store_o, // Interface with the load/store units output addrgen_axi_req_t axi_addrgen_req_o, output logic axi_addrgen_req_valid_o, @@ -47,11 +65,29 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( output logic addrgen_operand_ready_o ); + localparam unsigned DataWidth = $bits(elen_t); + localparam unsigned DataWidthB = DataWidth / 8; + + /////////////////// + // Assignments // + /////////////////// + + // Ara reports misaligned exceptions on its own + assign mmu_misaligned_ex_o = '0; + assign mmu_is_store_o = !axi_addrgen_q.is_load; + + /////////////// + // Imports // + /////////////// import cf_math_pkg::idx_width; import axi_pkg::aligned_addr; import axi_pkg::BURST_INCR; import axi_pkg::CACHE_MODIFIABLE; + /////////////////// + // Definitions // + /////////////////// + // Check if the address is aligned to a particular width function automatic logic is_addr_error(axi_addr_t addr, vew_e vew); is_addr_error = |(addr & (elen_t'(1 << vew) - 1)); @@ -114,10 +150,10 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( logic [$bits(elen_t)*NrLanes-1:0] shuffled_word; logic [$bits(elen_t)*NrLanes-1:0] deshuffled_word; elen_t reduced_word; - axi_addr_t idx_final_addr_d, idx_final_addr_q; - elen_t idx_addr; + axi_addr_t idx_final_vaddr_d, idx_final_vaddr_q; + elen_t idx_vaddr; logic idx_op_error_d, idx_op_error_q; - vlen_t addrgen_error_vl_d; + vlen_t addrgen_exception_vl_d; // Pointer to point to the correct logic [$clog2(NrLanes)-1:0] word_lane_ptr_d, word_lane_ptr_q; @@ -126,8 +162,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( vlen_t idx_op_cnt_d, idx_op_cnt_q; // Spill reg signals - logic idx_addr_valid_d, idx_addr_valid_q; - logic idx_addr_ready_d, idx_addr_ready_q; + logic idx_vaddr_valid_d, idx_vaddr_valid_q; + logic idx_vaddr_ready_d, idx_vaddr_ready_q; // Break the path from the VRF to the AXI request spill_register #( @@ -135,17 +171,19 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( ) 
i_addrgen_idx_op_spill_reg ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .valid_i(idx_addr_valid_d), - .ready_o(idx_addr_ready_q), - .data_i (idx_final_addr_d), - .valid_o(idx_addr_valid_q), - .ready_i(idx_addr_ready_d), - .data_o (idx_final_addr_q) + .valid_i(idx_vaddr_valid_d), + .ready_o(idx_vaddr_ready_q), + .data_i (idx_final_vaddr_d), + .valid_o(idx_vaddr_valid_q), + .ready_i(idx_vaddr_ready_d), + .data_o (idx_final_vaddr_q) ); ////////////////////////// // Address generation // ////////////////////////// + ariane_pkg::exception_t mmu_exception_d, mmu_exception_q; + logic last_translation_completed; // Running vector instructions logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; @@ -156,13 +194,18 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // ADDRGEN_IDX_OP: Generates a series of AXI requests from a // vector instruction, but reading a vector of offsets from Ara's lanes. // This is used for scatter and gather operations. - enum logic [1:0] { + // WAIT_LAST_TRANSLATION: Wait for the last address translation to be acknowledged + enum logic [2:0] { IDLE, ADDRGEN, ADDRGEN_IDX_OP, - ADDRGEN_IDX_OP_END + ADDRGEN_IDX_OP_END, + WAIT_LAST_TRANSLATION } state_q, state_d; + // TODO: Masked elements do not generate exceptions on: + // * EEW misalignment + // * page faults always_comb begin: addr_generation // Maintain state state_d = state_q; @@ -177,16 +220,20 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Nothing to acknowledge addrgen_ack_o = 1'b0; - addrgen_error_o = 1'b0; + addrgen_exception_o.valid = 1'b0; + addrgen_exception_o.tval = '0; + addrgen_exception_o.cause = '0; + addrgen_exception_load_o = 1'b0; + addrgen_exception_store_o = 1'b0; // No valid words for the spill register - idx_addr_valid_d = 1'b0; + idx_vaddr_valid_d = 1'b0; addrgen_operand_ready_o = 1'b0; reduced_word = '0; elm_ptr_d = elm_ptr_q; idx_op_cnt_d = idx_op_cnt_q; word_lane_ptr_d = word_lane_ptr_q; - idx_final_addr_d = idx_final_addr_q; + idx_final_vaddr_d = 
idx_final_vaddr_q; last_elm_subw_d = last_elm_subw_q; // Support for indexed operations @@ -201,10 +248,10 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( for (int unsigned lane = 0; lane < NrLanes; lane++) if (lane == word_lane_ptr_q) reduced_word = deshuffled_word[word_lane_ptr_q*$bits(elen_t) +: $bits(elen_t)]; - idx_addr = reduced_word; + idx_vaddr = reduced_word; case (state_q) - IDLE: begin + IDLE: begin : state_IDLE // Received a new request if (pe_req_valid_i && (is_load(pe_req_i.op) || is_store(pe_req_i.op)) && !vinsn_running_q[pe_req_i.id]) begin @@ -229,22 +276,41 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( endcase // Load element counter - idx_op_cnt_d = pe_req_i.vl; + idx_op_cnt_d = pe_req_i.vl - pe_req_i.vstart; end default: state_d = ADDRGEN; - endcase + endcase // pe_req_i.op + end - end - ADDRGEN: begin + end : state_IDLE + + ADDRGEN: begin : ADDRGEN // Ara does not support misaligned AXI requests - if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin + if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin : eew_misaligned_error state_d = IDLE; addrgen_ack_o = 1'b1; - addrgen_error_o = 1'b1; - end else begin + addrgen_exception_o.valid = 1'b1; + addrgen_exception_o.cause = riscv::ILLEGAL_INSTR; + addrgen_exception_o.tval = '0; + end : eew_misaligned_error + else begin : address_valid + // NOTE: indexed are not covered here + automatic logic [riscv::VLEN-1:0] vaddr_start; + + case ( pe_req_q.op ) + // Unit-stride: address = base + (vstart in elements) + VLE, VSE : vaddr_start = pe_req_q.scalar_op + ( pe_req_q.vstart << unsigned'(pe_req_q.vtype.vsew) ); + // Strided: address = base + (vstart * stride) + // NOTE: this multiplier might cause some timing issues + VLSE, VSSE: vaddr_start = pe_req_q.scalar_op + ( pe_req_q.vstart * pe_req_q.stride ) ; + // Indexed: let the next stage take care of vstart + VLXE, VSXE: vaddr_start = pe_req_q.scalar_op; + default : vaddr_start = '0; + endcase // pe_req_q.op + 
addrgen_req = '{ - addr : pe_req_q.scalar_op, - len : pe_req_q.vl, + addr : vaddr_start, + len : pe_req_q.vl - pe_req_q.vstart, stride : pe_req_q.stride, vew : pe_req_q.vtype.vsew, is_load : is_load(pe_req_q.op), @@ -253,20 +319,35 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( }; addrgen_req_valid = 1'b1; - if (addrgen_req_ready) begin + if (addrgen_req_ready) begin : finished addrgen_req_valid = '0; addrgen_ack_o = 1'b1; state_d = IDLE; - end - end - end - ADDRGEN_IDX_OP: begin + end : finished + + // If load/store translation is enabled + if ( en_ld_st_translation_i ) begin : translation_enabled + // We need to wait for the last translation to be over before acking back + // addrgen_req_valid = '0; TODO: figure out if set/reset here + addrgen_ack_o = 1'b0; + state_d = WAIT_LAST_TRANSLATION; + end : translation_enabled + end : address_valid + end : ADDRGEN + + ADDRGEN_IDX_OP: begin : ADDRGEN_IDX_OP + // NOTE: vstart is not supported for indexed operations + // the logic shuld be introduced: + // 1. in the addrgen_operand_i operand read + // 2. 
in idx_vaddr computation + automatic logic [NrLanes-1:0] addrgen_operand_valid; + // Stall the interface until the operation is over to catch possible exceptions // Every address can generate an exception addrgen_req = '{ - addr : pe_req_q.scalar_op, - len : pe_req_q.vl, + addr : pe_req_q.scalar_op, + len : pe_req_q.vl - pe_req_q.vstart, stride : pe_req_q.stride, vew : pe_req_q.vtype.vsew, is_load : is_load(pe_req_q.op), @@ -275,51 +356,66 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( }; addrgen_req_valid = 1'b1; + // Adjust valid signals to the next block "operands_ready" + addrgen_operand_valid = addrgen_operand_valid_i; + for ( int unsigned lane = 0; lane < NrLanes; lane++ ) begin : adjust_operand_valid + // - We are left with less byte than the maximim to issue, + // this means that at least one lane is not going to push us any operand anymore + // - For the lanes which index % NrLanes != 0 + if ( ( ( idx_op_cnt_q << pe_req_q.vtype.vsew ) < (NrLanes * DataWidthB) ) + & ( lane < pe_req_q.vstart[idx_width(NrLanes)-1:0] ) + ) begin : vstart_lane_adjust + addrgen_operand_valid[lane] |= 1'b1; + end : vstart_lane_adjust + end : adjust_operand_valid + // TODO: apply the same vstart logic also to mask_valid_i + // Handle handshake and data between VRF and spill register // We accept all the incoming data, without any checks // since Ara stalls on an indexed memory operation - if (&addrgen_operand_valid_i & addrgen_operand_target_fu_i[0] == MFPU_ADDRGEN) begin + if (&addrgen_operand_valid & addrgen_operand_target_fu_i[0] == MFPU_ADDRGEN) begin // Valid data for the spill register - idx_addr_valid_d = 1'b1; + idx_vaddr_valid_d = 1'b1; // Select the correct element, and zero extend it depending on vsew case (pe_req_q.eew_vs2) EW8: begin for (int unsigned b = 0; b < 8; b++) if (b == elm_ptr_q) - idx_addr = reduced_word[b*8 +: 8]; + idx_vaddr = reduced_word[b*8 +: 8]; end EW16: begin for (int unsigned h = 0; h < 4; h++) if (h == elm_ptr_q) - idx_addr = 
reduced_word[h*16 +: 16]; + idx_vaddr = reduced_word[h*16 +: 16]; end EW32: begin for (int unsigned w = 0; w < 2; w++) if (w == elm_ptr_q) - idx_addr = reduced_word[w*32 +: 32]; + idx_vaddr = reduced_word[w*32 +: 32]; end EW64: begin for (int unsigned d = 0; d < 1; d++) if (d == elm_ptr_q) - idx_addr = reduced_word[d*64 +: 64]; + idx_vaddr = reduced_word[d*64 +: 64]; end default: begin for (int unsigned b = 0; b < 8; b++) if (b == elm_ptr_q) - idx_addr = reduced_word[b*8 +: 8]; + idx_vaddr = reduced_word[b*8 +: 8]; end endcase // Compose the address - idx_final_addr_d = pe_req_q.scalar_op + idx_addr; + idx_final_vaddr_d = pe_req_q.scalar_op + idx_vaddr; // When the data is accepted - if (idx_addr_ready_q) begin + if (idx_vaddr_ready_q) begin // Consumed one element idx_op_cnt_d = idx_op_cnt_q - 1; // Have we finished a full NrLanes*64b word? + // TODO: check for the need of vstart logic here if (elm_ptr_q == last_elm_subw_q) begin // Bump lane pointer elm_ptr_d = '0; @@ -339,13 +435,13 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( end end - if (idx_op_error_d || addrgen_req_ready) begin + if (idx_op_error_d || addrgen_req_ready || mmu_exception_d.valid ) begin state_d = ADDRGEN_IDX_OP_END; end - end + end : ADDRGEN_IDX_OP // This state exists not to create combinatorial paths on the interface - ADDRGEN_IDX_OP_END : begin + ADDRGEN_IDX_OP_END : begin : ADDRGEN_IDX_OP_END // Acknowledge the indexed memory operation addrgen_ack_o = 1'b1; addrgen_req_valid = '0; @@ -355,11 +451,38 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( word_lane_ptr_d = '0; // Raise an error if necessary if (idx_op_error_q) begin - addrgen_error_o = 1'b1; + // In this case, we always get EEW-misaligned exceptions + addrgen_exception_o.valid = 1'b1; + addrgen_exception_o.cause = riscv::ILLEGAL_INSTR; + addrgen_exception_o.tval = '0; end - end - endcase - end + // Propagate the exception from the MMU (if any) + // NOTE: this would override + if ( mmu_exception_q.valid ) 
begin + addrgen_exception_o = mmu_exception_q; + end + end : ADDRGEN_IDX_OP_END + + WAIT_LAST_TRANSLATION : begin : WAIT_LAST_TRANSLATION + if ( last_translation_completed | mmu_exception_q.valid ) begin + // Acknowledge the indexed memory operation + addrgen_ack_o = 1'b1; + addrgen_req_valid = '0; + state_d = IDLE; + // Reset pointers + elm_ptr_d = '0; + word_lane_ptr_d = '0; + // Propagate the exception from the MMU (if any) + addrgen_exception_o = mmu_exception_q; + end + end : WAIT_LAST_TRANSLATION + endcase // state_q + + if ( addrgen_exception_o.valid & addrgen_ack_o ) begin + addrgen_exception_load_o = is_load(pe_req_q.op); + addrgen_exception_store_o = !is_load(pe_req_q.op); + end + end : addr_generation always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin @@ -371,7 +494,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_q <= '0; last_elm_subw_q <= '0; idx_op_error_q <= '0; - addrgen_error_vl_o <= '0; + addrgen_exception_vl_o <= '0; + mmu_exception_q <= '0; end else begin state_q <= state_d; pe_req_q <= pe_req_d; @@ -381,7 +505,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_q <= idx_op_cnt_d; last_elm_subw_q <= last_elm_subw_d; idx_op_error_q <= idx_op_error_d; - addrgen_error_vl_o <= addrgen_error_vl_d; + addrgen_exception_vl_o <= addrgen_exception_vl_d; + mmu_exception_q <= mmu_exception_d; end end @@ -389,25 +514,27 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Support for misaligned stores // ///////////////////////////////////// + localparam clog2_AxiStrobeWidth = $clog2(AxiDataWidth/8); + // AXI Request Generation signals, declared here for convenience addrgen_req_t axi_addrgen_d, axi_addrgen_q; // Narrower AXI Data Byte-Width used for misaligned stores - logic [$clog2(AxiDataWidth/8)-1:0] narrow_axi_data_bwidth; + logic [clog2_AxiStrobeWidth-1:0] narrow_axi_data_bwidth; // Helper signal to calculate the narrow_axi_data_bwidth // It carries information about the 
misalignment of the start address w.r.t. the AxiDataWidth - logic [$clog2(AxiDataWidth/8)-1:0] axi_addr_misalignment; + logic [clog2_AxiStrobeWidth-1:0] axi_addr_misalignment; // Number of trailing 0s of axi_addr_misalignment - logic [idx_width($clog2(AxiDataWidth/8))-1:0] zeroes_cnt; + logic [idx_width(clog2_AxiStrobeWidth)-1:0] zeroes_cnt; // Get the misalignment information for this vector memory instruction - assign axi_addr_misalignment = axi_addrgen_d.addr[$clog2(AxiDataWidth/8)-1:0]; + assign axi_addr_misalignment = axi_addrgen_d.addr[clog2_AxiStrobeWidth-1:0]; // Calculate the maximum number of Bytes we can send in a store-misaligned beat. // This number must be a power of 2 not to get misaligned wrt the pack of data that the // store unit receives from the lanes lzc #( - .WIDTH($clog2(AxiDataWidth/8)), + .WIDTH(clog2_AxiStrobeWidth), .MODE (1'b0 ) ) i_lzc ( .in_i (axi_addr_misalignment), @@ -416,14 +543,18 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( ); // Effective AXI data width for misaligned stores - assign narrow_axi_data_bwidth = (AxiDataWidth/8) >> ($clog2(AxiDataWidth/8) - zeroes_cnt); + assign narrow_axi_data_bwidth = (AxiDataWidth/8) >> (clog2_AxiStrobeWidth - zeroes_cnt); ////////////////////////////// // AXI Request Generation // ////////////////////////////// - enum logic [1:0] { - AXI_ADDRGEN_IDLE, AXI_ADDRGEN_MISALIGNED, AXI_ADDRGEN_WAITING, AXI_ADDRGEN_REQUESTING + enum logic [2:0] { + AXI_ADDRGEN_IDLE, + AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED, // Misaligned vector store to AxiDataWidth/8, needs special treatement + AXI_ADDRGEN_WAITING_CORE_STORE_PENDING, // Wait until (core_st_pending_i == 0) + AXI_ADDRGEN_REQUESTING, // Perform AW/AR transactions and push addrgen_req to VSTU/VLDU + AXI_ADDRGEN_WAIT_TRANSLATION // Wait for MMU to ack back } axi_addrgen_state_d, axi_addrgen_state_q; axi_addr_t aligned_start_addr_d, aligned_start_addr_q; @@ -433,8 +564,39 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // MSb of the 
next-next page (page selector for page 2 positions after the current one) logic [($bits(aligned_start_addr_d) - 12)-1:0] next_2page_msb_d, next_2page_msb_q; - logic [$clog2(AxiDataWidth/8):0] eff_axi_dw_d, eff_axi_dw_q; - logic [idx_width($clog2(AxiDataWidth/8)):0] eff_axi_dw_log_d, eff_axi_dw_log_q; + logic [clog2_AxiStrobeWidth:0] eff_axi_dw_d, eff_axi_dw_q; + logic [idx_width(clog2_AxiStrobeWidth):0] eff_axi_dw_log_d, eff_axi_dw_log_q; + + function automatic void set_end_addr ( + input logic [($bits(axi_addr_t) - 12)-1:0] next_2page_msb, + input int unsigned num_bytes, + input axi_addr_t addr, + input logic [clog2_AxiStrobeWidth:0] eff_axi_dw, + input logic [idx_width(clog2_AxiStrobeWidth):0] eff_axi_dw_log, + input axi_addr_t aligned_start_addr_d, + output axi_addr_t aligned_end_addr_d, + output axi_addr_t aligned_next_start_addr_d + ); + + // POSSIBLE BUG: given this is really the maximum number of bytes per burst, + // this assumes the burst length is always the maximum possible, i.e., 256. + automatic int unsigned max_burst_bytes = addr + (256 << eff_axi_dw_log); + + // The final address can be found similarly... 
+ if (num_bytes >= max_burst_bytes) begin + aligned_next_start_addr_d = aligned_addr(addr + max_burst_bytes, clog2_AxiStrobeWidth); + end else begin + aligned_next_start_addr_d = aligned_addr(addr + num_bytes - 1, eff_axi_dw_log) + eff_axi_dw; + end + aligned_end_addr_d = aligned_next_start_addr_d - 1; + + // But since AXI requests are aligned in 4 KiB pages, aligned_end_addr must be in the + // same page as aligned_start_addr + if (aligned_start_addr_d[AxiAddrWidth-1:12] != aligned_end_addr_d[AxiAddrWidth-1:12]) begin + aligned_end_addr_d = {aligned_start_addr_d[AxiAddrWidth-1:12], 12'hFFF}; + aligned_next_start_addr_d = { next_2page_msb , 12'h000}; + end + endfunction // set_end_addr always_comb begin: axi_addrgen // Maintain state @@ -450,8 +612,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( eff_axi_dw_d = eff_axi_dw_q; eff_axi_dw_log_d = eff_axi_dw_log_q; - idx_addr_ready_d = 1'b0; - addrgen_error_vl_d = '0; + idx_vaddr_ready_d = 1'b0; + addrgen_exception_vl_d = '0; // No error by default idx_op_error_d = 1'b0; @@ -469,82 +631,82 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_aw_o = '0; axi_aw_valid_o = 1'b0; - case (axi_addrgen_state_q) - AXI_ADDRGEN_IDLE: begin + // MMU + mmu_exception_d = mmu_exception_q; + mmu_req_o = 1'b0; + mmu_vaddr_o = '0; + + // For addrgen FSM + last_translation_completed = 1'b0; + + case (axi_addrgen_state_q) + AXI_ADDRGEN_IDLE: begin : axi_addrgen_state_AXI_ADDRGEN_IDLE if (addrgen_req_valid) begin axi_addrgen_d = addrgen_req; - axi_addrgen_state_d = core_st_pending_i ? AXI_ADDRGEN_WAITING : AXI_ADDRGEN_REQUESTING; + axi_addrgen_state_d = core_st_pending_i ? 
AXI_ADDRGEN_WAITING_CORE_STORE_PENDING : AXI_ADDRGEN_REQUESTING; // In case of a misaligned store, reduce the effective width of the AXI transaction, // since the store unit does not support misalignments between the AXI bus and the lanes - if ((axi_addrgen_d.addr[$clog2(AxiDataWidth/8)-1:0] != '0) && !axi_addrgen_d.is_load) + // BUG: this address check is not valid for indexed operations + if ((axi_addrgen_d.addr[clog2_AxiStrobeWidth-1:0] != '0) && !axi_addrgen_d.is_load) begin - // Calculate the start and the end addresses in the AXI_ADDRGEN_MISALIGNED state - axi_addrgen_state_d = AXI_ADDRGEN_MISALIGNED; + // Calculate the start and the end addresses in the AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED state + axi_addrgen_state_d = AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED; eff_axi_dw_d = {1'b0, narrow_axi_data_bwidth}; eff_axi_dw_log_d = zeroes_cnt; end else begin eff_axi_dw_d = AxiDataWidth/8; - eff_axi_dw_log_d = $clog2(AxiDataWidth/8); + eff_axi_dw_log_d = clog2_AxiStrobeWidth; end // The start address is found by aligning the original request address by the width of // the memory interface. - aligned_start_addr_d = aligned_addr(axi_addrgen_d.addr, $clog2(AxiDataWidth/8)); + aligned_start_addr_d = aligned_addr(axi_addrgen_d.addr, clog2_AxiStrobeWidth); // Pre-calculate the next_2page_msb. This should not require much energy if the addr // has zeroes in the upper positions. next_2page_msb_d = aligned_start_addr_d[AxiAddrWidth-1:12] + 1; // The final address can be found similarly... 
- if (axi_addrgen_d.len << int'(axi_addrgen_d.vew) >= (256 << $clog2(AxiDataWidth/8))) begin - aligned_next_start_addr_d = - aligned_addr(axi_addrgen_d.addr + (256 << $clog2(AxiDataWidth/8)), $clog2(AxiDataWidth/8)); - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end else begin - aligned_next_start_addr_d = - aligned_addr(axi_addrgen_d.addr + (axi_addrgen_d.len << int'(axi_addrgen_d.vew)) - 1, - $clog2(AxiDataWidth/8)) + AxiDataWidth/8; - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end - // But since AXI requests are aligned in 4 KiB pages, aligned_end_addr must be in the - // same page as aligned_start_addr - if (aligned_start_addr_d[AxiAddrWidth-1:12] != aligned_end_addr_d[AxiAddrWidth-1:12]) begin - aligned_end_addr_d = {aligned_start_addr_d[AxiAddrWidth-1:12], 12'hFFF}; - aligned_next_start_addr_d = { next_2page_msb_d, 12'h000}; - end + set_end_addr ( + next_2page_msb_d, + (axi_addrgen_d.len << unsigned'(axi_addrgen_d.vew)), + axi_addrgen_d.addr, + AxiDataWidth/8, + clog2_AxiStrobeWidth, + aligned_start_addr_d, + aligned_end_addr_d, + aligned_next_start_addr_d + ); end - end - AXI_ADDRGEN_MISALIGNED: begin - axi_addrgen_state_d = core_st_pending_i ? AXI_ADDRGEN_WAITING : AXI_ADDRGEN_REQUESTING; + end : axi_addrgen_state_AXI_ADDRGEN_IDLE + + AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED: begin : axi_addrgen_state_AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED + axi_addrgen_state_d = core_st_pending_i ? AXI_ADDRGEN_WAITING_CORE_STORE_PENDING : AXI_ADDRGEN_REQUESTING; // The start address is found by aligning the original request address by the width of // the memory interface. aligned_start_addr_d = aligned_addr(axi_addrgen_q.addr, eff_axi_dw_log_q); - // The final address can be found similarly... 
- if (axi_addrgen_q.len << int'(axi_addrgen_q.vew) >= (256 << eff_axi_dw_log_q)) begin - aligned_next_start_addr_d = - aligned_addr(axi_addrgen_q.addr + (256 << eff_axi_dw_log_q), eff_axi_dw_log_q); - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end else begin - aligned_next_start_addr_d = - aligned_addr(axi_addrgen_q.addr + (axi_addrgen_q.len << int'(axi_addrgen_q.vew)) - 1, - eff_axi_dw_log_q) + eff_axi_dw_q; - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end - // But since AXI requests are aligned in 4 KiB pages, aligned_end_addr must be in the - // same page as aligned_start_addr - if (aligned_start_addr_d[AxiAddrWidth-1:12] != aligned_end_addr_d[AxiAddrWidth-1:12]) begin - aligned_end_addr_d = {aligned_start_addr_d[AxiAddrWidth-1:12], 12'hFFF}; - aligned_next_start_addr_d = { next_2page_msb_q, 12'h000}; - end - end - AXI_ADDRGEN_WAITING: begin - if (!core_st_pending_i) + + set_end_addr ( + next_2page_msb_q, + (axi_addrgen_q.len << unsigned'(axi_addrgen_q.vew)), + axi_addrgen_q.addr, + eff_axi_dw_q, + eff_axi_dw_log_q, + aligned_start_addr_d, + aligned_end_addr_d, + aligned_next_start_addr_d + ); + end : axi_addrgen_state_AXI_ADDRGEN_AXI_DW_STORE_MISALIGNED + + AXI_ADDRGEN_WAITING_CORE_STORE_PENDING: begin : axi_addrgen_state_AXI_ADDRGEN_WAITING_CORE_STORE_PENDING + if (!core_st_pending_i) begin axi_addrgen_state_d = AXI_ADDRGEN_REQUESTING; - end - AXI_ADDRGEN_REQUESTING : begin - automatic logic axi_ax_ready = (axi_addrgen_q.is_load && axi_ar_ready_i) || (! - axi_addrgen_q.is_load && axi_aw_ready_i); + end + end : axi_addrgen_state_AXI_ADDRGEN_WAITING_CORE_STORE_PENDING + + AXI_ADDRGEN_REQUESTING : begin : axi_addrgen_state_AXI_ADDRGEN_REQUESTING + automatic logic axi_ax_ready = (axi_addrgen_q.is_load && axi_ar_ready_i) || (!axi_addrgen_q.is_load && axi_aw_ready_i); // Pre-calculate the next_2page_msb. This should not require much energy if the addr // has zeroes in the upper positions. 
@@ -553,14 +715,25 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Before starting a transaction on a different channel, wait the formers to complete // Otherwise, the ordering of the responses is not guaranteed, and with the current // implementation we can incur in deadlocks - if (axi_addrgen_queue_empty || (axi_addrgen_req_o.is_load && axi_addrgen_q.is_load) || - (~axi_addrgen_req_o.is_load && ~axi_addrgen_q.is_load)) begin - if (!axi_addrgen_queue_full && axi_ax_ready) begin - if (axi_addrgen_q.is_burst) begin + // NOTE: this might be referring to an obsolete axi_cut implementation + if ( axi_addrgen_queue_empty || + (axi_addrgen_req_o.is_load && axi_addrgen_q.is_load) || + (~axi_addrgen_req_o.is_load && ~axi_addrgen_q.is_load + ) + ) begin : axi_ax_idle + if (!axi_addrgen_queue_full && axi_ax_ready) begin : start_req + automatic logic [riscv::PLEN-1:0] paddr; + + if (axi_addrgen_q.is_burst) begin : unit_stride ///////////////////////// // Unit-Stride access // ///////////////////////// + // NOTE: all these variables could be narrowed to the minimum number of bits + automatic int unsigned num_beats; + automatic int unsigned num_bytes; + automatic int unsigned burst_len_bytes; + automatic int unsigned axi_addrgen_bytes; // AXI burst length automatic int unsigned burst_length; @@ -570,10 +743,10 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // 2 - The AXI burst length cannot be longer than the number of beats required // to access the memory regions between aligned_start_addr and // aligned_end_addr - if (burst_length > ((aligned_end_addr_q[11:0] - aligned_start_addr_q[11:0]) >> - eff_axi_dw_log_q) + 1) - burst_length = ((aligned_end_addr_q[11:0] - aligned_start_addr_q[11:0]) >> - eff_axi_dw_log_q) + 1; + num_beats = ((aligned_end_addr_q[11:0] - aligned_start_addr_q[11:0]) >> eff_axi_dw_log_q) + 1; + if (burst_length > num_beats) begin + burst_length = num_beats; + end // AR Channel if (axi_addrgen_q.is_load) begin @@ -612,107 +785,57 
@@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_addrgen_queue_push = 1'b1; // Account for the requested operands - axi_addrgen_d.len = axi_addrgen_q.len - - ((aligned_end_addr_q[11:0] - axi_addrgen_q.addr[11:0] + 1) - >> int'(axi_addrgen_q.vew)); - if (axi_addrgen_q.len < - ((aligned_end_addr_q[11:0] - axi_addrgen_q.addr[11:0] + 1) - >> int'(axi_addrgen_q.vew))) + num_bytes = ( (aligned_end_addr_q[11:0] - axi_addrgen_q.addr[11:0] + 1) >> unsigned'(axi_addrgen_q.vew) ); + if (axi_addrgen_q.len >= num_bytes) begin + axi_addrgen_d.len = axi_addrgen_q.len - num_bytes; + end + else begin axi_addrgen_d.len = 0; - axi_addrgen_d.addr = aligned_next_start_addr_q; - - // Finished generating AXI requests - if (axi_addrgen_d.len == 0) begin - addrgen_req_ready = 1'b1; - axi_addrgen_state_d = AXI_ADDRGEN_IDLE; end + axi_addrgen_d.addr = aligned_next_start_addr_q; // Calculate the addresses for the next iteration // The start address is found by aligning the original request address by the width of // the memory interface. In our case, we have it already. aligned_start_addr_d = axi_addrgen_d.addr; // The final address can be found similarly. - // How many B we requested? 
No more than (256 << burst_size) - if (axi_addrgen_d.len << int'(axi_addrgen_q.vew) >= (256 << eff_axi_dw_log_q)) begin - aligned_next_start_addr_d = - aligned_addr(aligned_start_addr_d + (256 << eff_axi_dw_log_q), eff_axi_dw_log_q); - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end else begin - aligned_next_start_addr_d = - aligned_addr(aligned_start_addr_d + (axi_addrgen_d.len << int'(axi_addrgen_q.vew)) - - 1, eff_axi_dw_log_q) + eff_axi_dw_q; - aligned_end_addr_d = aligned_next_start_addr_d - 1; - end - // But since AXI requests are aligned in 4 KiB pages, aligned_end_addr must be in the - // same page as aligned_start_addr - if (aligned_start_addr_d[AxiAddrWidth-1:12] != aligned_end_addr_d[AxiAddrWidth-1:12]) begin - aligned_end_addr_d = {aligned_start_addr_d[AxiAddrWidth-1:12], 12'hFFF}; - aligned_next_start_addr_d = { next_2page_msb_d, 12'h000}; - end - end else if (state_q != ADDRGEN_IDX_OP) begin + // How many B we requested? No more than (256 << burst_len_bytes) + burst_len_bytes = (256 << eff_axi_dw_log_q); + axi_addrgen_bytes = (axi_addrgen_d.len << unsigned'(axi_addrgen_q.vew)); + set_end_addr ( + next_2page_msb_d, + (axi_addrgen_d.len << unsigned'(axi_addrgen_d.vew)), + aligned_start_addr_d, + eff_axi_dw_q, + eff_axi_dw_log_q, + aligned_start_addr_d, + aligned_end_addr_d, + aligned_next_start_addr_d + ); + end : unit_stride + else if (state_q != ADDRGEN_IDX_OP) begin : strided ///////////////////// // Strided access // ///////////////////// - // AR Channel - if (axi_addrgen_q.is_load) begin - axi_ar_o = '{ - addr : axi_addrgen_q.addr, - len : 0, - size : axi_addrgen_q.vew, - cache : CACHE_MODIFIABLE, - burst : BURST_INCR, - default: '0 - }; - axi_ar_valid_o = 1'b1; - end - // AW Channel - else begin - axi_aw_o = '{ - addr : axi_addrgen_q.addr, - len : 0, - size : axi_addrgen_q.vew, - cache : CACHE_MODIFIABLE, - burst : BURST_INCR, - default: '0 - }; - axi_aw_valid_o = 1'b1; - end + if ( en_ld_st_translation_i ) begin : en_ld_st_translation + 
// Request an address translation + mmu_req_o = 1'b1; + mmu_vaddr_o = axi_addrgen_q.addr; + axi_addrgen_state_d = AXI_ADDRGEN_WAIT_TRANSLATION; + end : en_ld_st_translation - // Send this request to the load/store units - axi_addrgen_queue = '{ - addr : axi_addrgen_q.addr, - size : axi_addrgen_q.vew, - len : 0, - is_load: axi_addrgen_q.is_load - }; - axi_addrgen_queue_push = 1'b1; - - // Account for the requested operands - axi_addrgen_d.len = axi_addrgen_q.len - 1; - // Calculate the addresses for the next iteration, adding the correct stride - axi_addrgen_d.addr = axi_addrgen_q.addr + axi_addrgen_q.stride; - - // Finished generating AXI requests - if (axi_addrgen_d.len == 0) begin - addrgen_req_ready = 1'b1; - axi_addrgen_state_d = AXI_ADDRGEN_IDLE; - end - end else begin - - ////////////////////// - // Indexed access // - ////////////////////// - - if (idx_addr_valid_q) begin - // We consumed a word - idx_addr_ready_d = 1'b1; + // Mux target address + paddr = ( en_ld_st_translation_i ) ? 
mmu_paddr_i : axi_addrgen_q.addr; + // Either we got a valid address translation from the MMU + // or virtual memory is disabled + if ( mmu_valid_i | !en_ld_st_translation_i ) begin : addr_valid // AR Channel if (axi_addrgen_q.is_load) begin axi_ar_o = '{ - addr : idx_final_addr_q, + addr : paddr, len : 0, size : axi_addrgen_q.vew, cache : CACHE_MODIFIABLE, @@ -724,7 +847,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // AW Channel else begin axi_aw_o = '{ - addr : idx_final_addr_q, + addr : paddr, len : 0, size : axi_addrgen_q.vew, cache : CACHE_MODIFIABLE, @@ -736,7 +859,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Send this request to the load/store units axi_addrgen_queue = '{ - addr : idx_final_addr_q, + addr : paddr, size : axi_addrgen_q.vew, len : 0, is_load: axi_addrgen_q.is_load @@ -744,29 +867,124 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_addrgen_queue_push = 1'b1; // Account for the requested operands - axi_addrgen_d.len = axi_addrgen_q.len - 1; + axi_addrgen_d.len = axi_addrgen_q.len - 1; + // Calculate the addresses for the next iteration, adding the correct stride + // NOTE: there is no need to check for misaligned erros, since the stride always produces EEW-aligned to the first address + axi_addrgen_d.addr = axi_addrgen_q.addr + axi_addrgen_q.stride; + end : addr_valid + end : strided + else begin : indexed + automatic logic [riscv::PLEN-1:0] idx_final_paddr; + ////////////////////// + // Indexed access // + ////////////////////// + // TODO: check if idx_vaddr_valid_q is stable + if (idx_vaddr_valid_q) begin : idx_vaddr_valid_q - // Check if the address does generate an exception - if (is_addr_error(idx_final_addr_q, axi_addrgen_q.vew)) begin + // Check if the virtual address generates an exception + // NOTE: we can do this even before address translation, since the + // page offset (2^12) is the same for both physical and virtual addresses + if (is_addr_error(idx_final_vaddr_q, 
axi_addrgen_q.vew)) begin : eew_misaligned_error // Generate an error idx_op_error_d = 1'b1; // Forward next vstart info to the dispatcher - addrgen_error_vl_d = addrgen_req.len - axi_addrgen_q.len - 1; + addrgen_exception_vl_d = addrgen_req.len - axi_addrgen_q.len - 1; addrgen_req_ready = 1'b1; axi_addrgen_state_d = AXI_ADDRGEN_IDLE; - end - - // Finished generating AXI requests - if (axi_addrgen_d.len == 0) begin - addrgen_req_ready = 1'b1; - axi_addrgen_state_d = AXI_ADDRGEN_IDLE; - end + end : eew_misaligned_error + else begin : aligned_vaddress + if ( en_ld_st_translation_i ) begin : en_ld_st_translation + // Request an address translation + mmu_req_o = 1'b1; + mmu_vaddr_o = idx_final_vaddr_q; + axi_addrgen_state_d = AXI_ADDRGEN_WAIT_TRANSLATION; + end : en_ld_st_translation + + // Mux target address + idx_final_paddr = ( en_ld_st_translation_i ) ? mmu_paddr_i : idx_final_vaddr_q; + + // Either we got a valid address translation from the MMU + // or virtual memory is disabled + if ( mmu_valid_i | !en_ld_st_translation_i ) begin : addr_valid + // We consumed a word + idx_vaddr_ready_d = 1'b1; + + // AR Channel + if (axi_addrgen_q.is_load) begin + axi_ar_o = '{ + addr : idx_final_paddr, + len : 0, + size : axi_addrgen_q.vew, + cache : CACHE_MODIFIABLE, + burst : BURST_INCR, + default: '0 + }; + axi_ar_valid_o = 1'b1; + end + // AW Channel + else begin + axi_aw_o = '{ + addr : idx_final_paddr, + len : 0, + size : axi_addrgen_q.vew, + cache : CACHE_MODIFIABLE, + burst : BURST_INCR, + default: '0 + }; + axi_aw_valid_o = 1'b1; + end + + // Send this request to the load/store units + axi_addrgen_queue = '{ + addr : idx_final_paddr, + size : axi_addrgen_q.vew, + len : 0, + is_load: axi_addrgen_q.is_load + }; + axi_addrgen_queue_push = 1'b1; + + // Account for the requested operands + axi_addrgen_d.len = axi_addrgen_q.len - 1; + end : addr_valid + end : aligned_vaddress + end : idx_vaddr_valid_q + end : indexed + + // Finished generating AXI requests + if 
(axi_addrgen_d.len == 0) begin + addrgen_req_ready = 1'b1; + axi_addrgen_state_d = AXI_ADDRGEN_IDLE; + if ( en_ld_st_translation_i ) begin + last_translation_completed = 1'b1; end end - end - end - end - endcase + end : start_req + end : axi_ax_idle + end : axi_addrgen_state_AXI_ADDRGEN_REQUESTING + + AXI_ADDRGEN_WAIT_TRANSLATION : begin : axi_addrgen_state_AXI_ADDRGEN_WAIT_TRANSLATION + // keep request high + mmu_req_o = 1'b1; + + // Wait for MMU to respond + if ( mmu_valid_i ) begin : mmu_valid + // Perform request + axi_addrgen_state_d = AXI_ADDRGEN_REQUESTING; + + // Replace virtual address with translated address + axi_addrgen_d.addr = mmu_paddr_i; + + // Sample MMU exception + if ( mmu_exception_i.valid ) begin : mmu_exception_valid + // the other FSM will pick up the _q on the next cycle + mmu_exception_d = mmu_exception_i; + addrgen_req_ready = 1'b1; + axi_addrgen_state_d = AXI_ADDRGEN_IDLE; + end : mmu_exception_valid + end : mmu_valid + + end : axi_addrgen_state_AXI_ADDRGEN_WAIT_TRANSLATION + endcase // axi_addrgen_state_q end: axi_addrgen always_ff @(posedge clk_i or negedge rst_ni) begin diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 51042ed8e..467ae4a70 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -35,6 +35,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( output pe_resp_t pe_resp_o, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, + input logic addrgen_exception_valid_i, input logic axi_addrgen_req_valid_i, output logic axi_addrgen_req_ready_o, // Interface with the lanes @@ -136,7 +137,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // reading from and writing into the lanes (read_pnt). logic [idx_width(ResultQueueDepth)-1:0] result_queue_write_pnt_d, result_queue_write_pnt_q; logic [idx_width(ResultQueueDepth)-1:0] result_queue_read_pnt_d, result_queue_read_pnt_q; - // We need to count how many valid elements are there in this result queue. 
+ // We need to count how many valid elements (payload_t) are there in this result queue. logic [idx_width(ResultQueueDepth):0] result_queue_cnt_d, result_queue_cnt_q; // Vector to register the final grants from the operand requesters, which indicate // that the result was actually written in the VRF (while the normal grant just says @@ -174,33 +175,37 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; // Interface with the main sequencer - pe_resp_t pe_resp; + pe_resp_t pe_resp_d; // Remaining bytes of the current instruction in the issue phase - vlen_t issue_cnt_d, issue_cnt_q; + vlen_t issue_cnt_bytes_d, issue_cnt_bytes_q; // Remaining bytes of the current instruction in the commit phase - vlen_t commit_cnt_d, commit_cnt_q; + vlen_t commit_cnt_bytes_d, commit_cnt_bytes_q; // Pointers // // We need several pointers to copy data from the memory interface // into the VRF. Namely, we need: // - A counter of how many beats are left in the current AXI burst - axi_pkg::len_t len_d, len_q; + axi_pkg::len_t axi_len_d, axi_len_q; // - A pointer to which byte in the current R beat we are reading data from. - logic [idx_width(AxiDataWidth/8):0] r_pnt_d, r_pnt_q; + logic [idx_width(AxiDataWidth/8):0] axi_r_byte_pnt_d, axi_r_byte_pnt_q; // - A pointer to which byte in the full VRF word we are writing data into. 
- logic [idx_width(DataWidth*NrLanes/8):0] vrf_pnt_d, vrf_pnt_q; + logic [idx_width(DataWidth*NrLanes/8):0] vrf_word_byte_pnt_d, vrf_word_byte_pnt_q; + + localparam unsigned DataWidthB = DataWidth / 8; + + vlen_t vstart_lane; always_comb begin: p_vldu // Maintain state vinsn_queue_d = vinsn_queue_q; - issue_cnt_d = issue_cnt_q; - commit_cnt_d = commit_cnt_q; + issue_cnt_bytes_d = issue_cnt_bytes_q; + commit_cnt_bytes_d = commit_cnt_bytes_q; - len_d = len_q; - r_pnt_d = r_pnt_q; - vrf_pnt_d = vrf_pnt_q; + axi_len_d = axi_len_q; + axi_r_byte_pnt_d = axi_r_byte_pnt_q; + vrf_word_byte_pnt_d = vrf_word_byte_pnt_q; result_queue_d = result_queue_q; result_queue_valid_d = result_queue_valid_q; @@ -215,7 +220,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // We are not ready, by default axi_addrgen_req_ready_o = 1'b0; - pe_resp = '0; + pe_resp_d = '0; axi_r_ready_o = 1'b0; mask_ready_o = 1'b0; load_complete_o = 1'b0; @@ -232,128 +237,168 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // - The Address Generator sent us the data about the corresponding AR beat // - There is place in the result queue to write the data read from the R channel if (axi_r_valid_i && axi_addrgen_req_valid_i - && axi_addrgen_req_i.is_load && !result_queue_full) begin + && axi_addrgen_req_i.is_load && !result_queue_full) begin : axi_r_beat_read // Bytes valid in the current R beat // If non-unit strided load, we do not progress within the beat automatic shortint unsigned lower_byte = beat_lower_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); automatic shortint unsigned upper_byte = beat_upper_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); - + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); + // Is there a vector instruction 
ready to be issued? // Do we have the operands for it? - if (vinsn_issue_valid && (vinsn_issue_q.vm || (|mask_valid_i))) begin + if (vinsn_issue_valid && (vinsn_issue_q.vm || (|mask_valid_i))) begin : operands_valid // Account for the issued bytes // How many bytes are valid in this VRF word - automatic vlen_t vrf_valid_bytes = NrLanes * 8 - vrf_pnt_q; + automatic vlen_t vrf_valid_bytes = (NrLanes * DataWidthB) - vrf_word_byte_pnt_q; // How many bytes are valid in this instruction - automatic vlen_t vinsn_valid_bytes = issue_cnt_q - vrf_pnt_q; + automatic vlen_t vinsn_valid_bytes = issue_cnt_bytes_q - vrf_word_byte_pnt_q; // How many bytes are valid in this AXI word - automatic vlen_t axi_valid_bytes = upper_byte - lower_byte - r_pnt_q + 1; + automatic vlen_t axi_valid_bytes = upper_byte - lower_byte - axi_r_byte_pnt_q + 1; + // How many bytes are we committing? automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; - valid_bytes = issue_cnt_q < NrLanes * 8 ? vinsn_valid_bytes : vrf_valid_bytes; - valid_bytes = valid_bytes < axi_valid_bytes ? valid_bytes : axi_valid_bytes; + valid_bytes = ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) ? vinsn_valid_bytes : vrf_valid_bytes; + valid_bytes = ( valid_bytes < axi_valid_bytes ) ? valid_bytes : axi_valid_bytes; - r_pnt_d = r_pnt_q + valid_bytes; - vrf_pnt_d = vrf_pnt_q + valid_bytes; + // Bump R beat and VRF word pointers + axi_r_byte_pnt_d = axi_r_byte_pnt_q + valid_bytes; + vrf_word_byte_pnt_d = vrf_word_byte_pnt_q + valid_bytes; // Copy data from the R channel into the result queue - for (int axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin + for (int unsigned axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : axi_r_to_result_queue // Is this byte a valid byte in the R beat? 
- if (axi_byte >= lower_byte + r_pnt_q && axi_byte <= upper_byte) begin + if ( ( axi_byte >= ( lower_byte + axi_r_byte_pnt_q ) ) && + ( axi_byte <= upper_byte ) + ) begin : is_axi_r_byte // Map axi_byte to the corresponding byte in the VRF word (sequential) - automatic int vrf_seq_byte = axi_byte - lower_byte - r_pnt_q + vrf_pnt_q; + automatic int unsigned vrf_seq_byte = axi_byte - lower_byte - axi_r_byte_pnt_q + vrf_word_byte_pnt_q; // And then shuffle it - automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.vtype.vsew); + automatic int unsigned vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.vtype.vsew); // Is this byte a valid byte in the VRF word? - if (vrf_seq_byte < issue_cnt_q && vrf_seq_byte < NrLanes * 8) begin + if (vrf_seq_byte < issue_cnt_bytes_q && vrf_seq_byte < (NrLanes * DataWidthB)) begin : is_vrf_byte // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? - automatic int vrf_lane = vrf_byte >> 3; - automatic int vrf_offset = vrf_byte[2:0]; + automatic int unsigned vrf_offset = vrf_byte[2:0]; + // Consider also vstart and make sure this index wraps around the number of lane + automatic int unsigned vrf_lane = (vrf_byte >> 3); + // Adjust lane selection w.r.t. 
vstart + vrf_lane += vinsn_issue_q.vstart[idx_width(NrLanes)-1:0]; + if ( vrf_lane >= NrLanes ) begin : vstart_lane_adjust + vrf_lane -= NrLanes; + end : vstart_lane_adjust + // Copy data and byte strobe result_queue_d[result_queue_write_pnt_q][vrf_lane].wdata[8*vrf_offset +: 8] = axi_r_i.data[8*axi_byte +: 8]; result_queue_d[result_queue_write_pnt_q][vrf_lane].be[vrf_offset] = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset]; - end - end - end - - // Initialize id and addr fields of the result queue requests - for (int lane = 0; lane < NrLanes; lane++) begin + end : is_vrf_byte + end : is_axi_r_byte + end : axi_r_to_result_queue + + for (int unsigned lane = 0; lane < NrLanes; lane++) begin : compute_vrf_addr + automatic vlen_t issue_cnt_elems; + // elements per lane (each lane processes num elements / NrLanes) + automatic vlen_t elem_left_per_lane; + // 64-bit aligned address + automatic vlen_t lane_word_offset; + // How many elements in the vector body + automatic vlen_t elem_body_count; + // vstart value local ot the lane + automatic vlen_t vstart_lane; + + // Compute VRF chunk address per lane + elem_body_count = vinsn_issue_q.vl - vinsn_issue_q.vstart; + issue_cnt_elems = issue_cnt_bytes_q >> unsigned'(vinsn_issue_q.vtype.vsew); + elem_left_per_lane = ( elem_body_count - issue_cnt_elems ) / NrLanes; + lane_word_offset = elem_left_per_lane >> (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)); + + vstart_lane = vinsn_issue_q.vstart / NrLanes; + // If lane_id < (vstart % NrLanes), this lane needs to execute one micro-operation less. 
+ if ( lane < vinsn_issue_q.vstart[idx_width(NrLanes)-1:0] ) begin : vstart_lane_adjust + vstart_lane += 1; + end : vstart_lane_adjust + + // Store in result queue + result_queue_d[result_queue_write_pnt_q][lane].addr = vaddr(vinsn_issue_q.vd, NrLanes) + lane_word_offset + vstart_lane; result_queue_d[result_queue_write_pnt_q][lane].id = vinsn_issue_q.id; - result_queue_d[result_queue_write_pnt_q][lane].addr = vaddr(vinsn_issue_q.vd, NrLanes) + - (((vinsn_issue_q.vl - (issue_cnt_q >> int'(vinsn_issue_q.vtype.vsew))) / NrLanes) >> - (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); - end - end + end : compute_vrf_addr + end : operands_valid // We have a word ready to be sent to the lanes - if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_q) begin + if (vrf_word_byte_pnt_d == (NrLanes * DataWidthB) || vrf_word_byte_pnt_d == issue_cnt_bytes_q) begin : vrf_word_ready // Increment result queue pointers and counters result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) + if (result_queue_write_pnt_q == ResultQueueDepth-1) begin : result_queue_write_pnt_overflow result_queue_write_pnt_d = '0; - else + end : result_queue_write_pnt_overflow + else begin : result_queue_write_pnt_increment result_queue_write_pnt_d = result_queue_write_pnt_q + 1; + end : result_queue_write_pnt_increment // Trigger the request signal + // TODO: check if triggering all lanes is actually necessary here result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; // Acknowledge the mask operands mask_ready_o = !vinsn_issue_q.vm; // Reset the pointer in the VRF word - vrf_pnt_d = '0; + vrf_word_byte_pnt_d = '0; // Account for the results that were issued - issue_cnt_d = issue_cnt_q - NrLanes * 8; - if (issue_cnt_q < NrLanes * 8) - issue_cnt_d = '0; - end + issue_cnt_bytes_d = issue_cnt_bytes_q - (NrLanes * DataWidthB); + if (issue_cnt_bytes_q < (NrLanes * DataWidthB)) begin : issue_cnt_bytes_overflow + issue_cnt_bytes_d = '0; + end : issue_cnt_bytes_overflow + end : 
vrf_word_ready // Consumed all valid bytes in this R beat - if (r_pnt_d == upper_byte - lower_byte + 1 || issue_cnt_d == '0) begin + if ( ( axi_r_byte_pnt_d == ( upper_byte - lower_byte + 1 ) ) || ( issue_cnt_bytes_d == '0 ) ) begin : axi_r_beat_finish // Request another beat axi_r_ready_o = 1'b1; - r_pnt_d = '0; + axi_r_byte_pnt_d = '0; // Account for the beat we consumed - len_d = len_q + 1; - end + axi_len_d = axi_len_q + 1; + end : axi_r_beat_finish // Consumed all beats from this burst - if ($unsigned(len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin + if ($unsigned(axi_len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin : axi_finish // Reset AXI pointers - len_d = '0; - r_pnt_d = '0; + axi_len_d = '0; + axi_r_byte_pnt_d = '0; // Wait for another AXI request axi_addrgen_req_ready_o = 1'b1; - end + end : axi_finish // Finished issuing results - if (vinsn_issue_valid && issue_cnt_d == '0) begin + if (vinsn_issue_valid && (issue_cnt_bytes_d == '0)) begin : vrf_results_finish // Increment vector instruction queue pointers and counters vinsn_queue_d.issue_cnt -= 1; - if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) + if (vinsn_queue_q.issue_pnt == (VInsnQueueDepth-1)) begin : issue_pnt_overflow vinsn_queue_d.issue_pnt = '0; - else + end : issue_pnt_overflow + else begin : issue_pnt_increment vinsn_queue_d.issue_pnt += 1; + end : issue_pnt_increment // Prepare for the next vector instruction - if (vinsn_queue_d.issue_cnt != 0) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl << int'(vinsn_queue_q.vinsn[ - vinsn_queue_d.issue_pnt].vtype.vsew); - end - end + if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update + issue_cnt_bytes_d = ( + vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart + ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew); + end : issue_cnt_bytes_update + end : vrf_results_finish + end : axi_r_beat_read 
////////////////////////////////// // Write results into the VRF // ////////////////////////////////// - for (int lane = 0; lane < NrLanes; lane++) begin: result_write + for (int unsigned lane = 0; lane < NrLanes; lane++) begin: vrf_result_write ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane]; ldu_result_addr_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].addr; ldu_result_id_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].id; @@ -365,39 +410,43 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Received a grant from the VRF. // Deactivate the request, but do not bump the pointers for now. - if (ldu_result_req_o[lane] && ldu_result_gnt_i[lane]) begin + if (ldu_result_req_o[lane] && ldu_result_gnt_i[lane]) begin : vrf_grant result_queue_valid_d[result_queue_read_pnt_q][lane] = 1'b0; result_queue_d[result_queue_read_pnt_q][lane] = '0; // Reset the final gnt vector since we are now waiting for another final gnt result_final_gnt_d[lane] = 1'b0; - end - end: result_write + end : vrf_grant + end: vrf_result_write // All lanes accepted the VRF request // Wait for all the final grants, to be sure that all the results were written back if (!(|result_queue_valid_d[result_queue_read_pnt_q]) && - (&result_final_gnt_d || commit_cnt_q > (NrLanes * 8))) + (&result_final_gnt_d || commit_cnt_bytes_q > (NrLanes * DataWidthB))) begin : wait_for_write_back // There is something waiting to be written - if (!result_queue_empty) begin + if (!result_queue_empty) begin : result_available // Increment the read pointer - if (result_queue_read_pnt_q == ResultQueueDepth-1) + if (result_queue_read_pnt_q == (ResultQueueDepth-1)) begin : result_queue_read_pnt_overflow result_queue_read_pnt_d = 0; - else + end : result_queue_read_pnt_overflow + else begin : result_queue_read_pnt_increment result_queue_read_pnt_d = result_queue_read_pnt_q + 1; + end : result_queue_read_pnt_increment // Decrement the counter of results waiting to be written 
result_queue_cnt_d -= 1; // Decrement the counter of remaining vector elements waiting to be written - commit_cnt_d = commit_cnt_q - NrLanes * 8; - if (commit_cnt_q < (NrLanes * 8)) - commit_cnt_d = '0; - end + commit_cnt_bytes_d = commit_cnt_bytes_q - (NrLanes * DataWidthB); + if (commit_cnt_bytes_q < (NrLanes * DataWidthB)) begin : commit_cnt_bytes_overflow + commit_cnt_bytes_d = '0; + end : commit_cnt_bytes_overflow + end : result_available + end : wait_for_write_back // Finished committing the results of a vector instruction - if (vinsn_commit_valid && commit_cnt_d == '0) begin + if (vinsn_commit_valid && commit_cnt_bytes_d == '0) begin : vinsn_done // Mark the vector instruction as being done - pe_resp.vinsn_done[vinsn_commit.id] = 1'b1; + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; // Signal complete load load_complete_o = 1'b1; @@ -411,51 +460,62 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Update the commit counter for the next instruction if (vinsn_queue_d.commit_cnt != '0) - commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl << int'(vinsn_queue_q.vinsn[ - vinsn_queue_d.commit_pnt].vtype.vsew); - end + commit_cnt_bytes_d = ( + vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl + - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart + ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew); + end : vinsn_done + + // Clear instruction queue in case of exceptions from addrgen + if ( addrgen_exception_valid_i ) begin : exception + // Signal done to sequencer + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; + // Clear counters and flags + end : exception ////////////////////////////// // Accept new instruction // ////////////////////////////// if (!vinsn_queue_full && pe_req_valid_i && !vinsn_running_q[pe_req_i.id] && - pe_req_i.vfu == VFU_LoadUnit) begin + pe_req_i.vfu == VFU_LoadUnit) begin : pe_req_valid vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = pe_req_i; vinsn_running_d[pe_req_i.id] = 1'b1; // Initialize 
counters - if (vinsn_queue_d.issue_cnt == '0) - issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); - if (vinsn_queue_d.commit_cnt == '0) - commit_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); + if (vinsn_queue_d.issue_cnt == '0) begin : issue_cnt_bytes_init + issue_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); + end : issue_cnt_bytes_init + if (vinsn_queue_d.commit_cnt == '0) begin : commit_cnt_bytes_init + commit_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); + end : commit_cnt_bytes_init // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; vinsn_queue_d.issue_cnt += 1; vinsn_queue_d.commit_cnt += 1; - end + end : pe_req_valid end: p_vldu always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - vinsn_running_q <= '0; - issue_cnt_q <= '0; - commit_cnt_q <= '0; - len_q <= '0; - r_pnt_q <= '0; - vrf_pnt_q <= '0; - pe_resp_o <= '0; - result_final_gnt_q <= '0; + vinsn_running_q <= '0; + issue_cnt_bytes_q <= '0; + commit_cnt_bytes_q <= '0; + axi_len_q <= '0; + axi_r_byte_pnt_q <= '0; + vrf_word_byte_pnt_q <= '0; + pe_resp_o <= '0; + result_final_gnt_q <= '0; end else begin - vinsn_running_q <= vinsn_running_d; - issue_cnt_q <= issue_cnt_d; - commit_cnt_q <= commit_cnt_d; - len_q <= len_d; - r_pnt_q <= r_pnt_d; - vrf_pnt_q <= vrf_pnt_d; - pe_resp_o <= pe_resp; - result_final_gnt_q <= result_final_gnt_d; + vinsn_running_q <= vinsn_running_d; + issue_cnt_bytes_q <= issue_cnt_bytes_d; + commit_cnt_bytes_q <= commit_cnt_bytes_d; + axi_len_q <= axi_len_d; + axi_r_byte_pnt_q <= axi_r_byte_pnt_d; + vrf_word_byte_pnt_q <= vrf_word_byte_pnt_d; + pe_resp_o <= pe_resp_d; + result_final_gnt_q <= result_final_gnt_d; end end diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index aa2e05283..68cd3add5 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -42,8 +42,8 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; 
#( output logic [1:0] pe_req_ready_o, // Load (0) and Store (1) units output pe_resp_t [1:0] pe_resp_o, // Load (0) and Store (1) units output logic addrgen_ack_o, - output logic addrgen_error_o, - output vlen_t addrgen_error_vl_o, + output ariane_pkg::exception_t addrgen_exception_o, + output vlen_t addrgen_exception_vl_o, // Interface with the lanes // Store unit operands input elen_t [NrLanes-1:0] stu_operand_i, @@ -59,6 +59,25 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrLanes-1:0] mask_valid_i, output logic vldu_mask_ready_o, output logic vstu_mask_ready_o, + + // CSR input + input logic en_ld_st_translation_i, + + // Interface with CVA6's sv39 MMU + // This is everything the MMU can provide, it might be overcomplete for Ara and some signals be useless + output ariane_pkg::exception_t mmu_misaligned_ex_o, + output logic mmu_req_o, // request address translation + output logic [riscv::VLEN-1:0] mmu_vaddr_o, // virtual address out + output logic mmu_is_store_o, // the translation is requested by a store + // if we need to walk the page table we can't grant in the same cycle + // Cycle 0 + input logic mmu_dtlb_hit_i, // sent in the same cycle as the request if translation hits in the DTLB + input logic [riscv::PPNW-1:0] mmu_dtlb_ppn_i, // ppn (send same cycle as hit) + // Cycle 1 + input logic mmu_valid_i, // translation is valid + input logic [riscv::PLEN-1:0] mmu_paddr_i, // translated address + input ariane_pkg::exception_t mmu_exception_i, // address translation threw an exception + // Results output logic [NrLanes-1:0] ldu_result_req_o, output vid_t [NrLanes-1:0] ldu_result_id_o, @@ -69,6 +88,11 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrLanes-1:0] ldu_result_final_gnt_i ); + logic load_complete, store_complete; + logic addrgen_exception_load, addrgen_exception_store; + assign load_complete_o = load_complete | addrgen_exception_load; + assign store_complete_o = store_complete | addrgen_exception_store; + 
/////////////////// // Definitions // /////////////////// @@ -89,8 +113,8 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .aw_chan_t(axi_aw_t ), .w_chan_t (axi_w_t ), .b_chan_t (axi_b_t ), - .req_t (axi_req_t ), - .resp_t (axi_resp_t) + .axi_req_t (axi_req_t ), + .axi_resp_t(axi_resp_t) ) i_axi_cut ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -133,8 +157,10 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_req_valid_i (pe_req_valid_i ), .pe_vinsn_running_i (pe_vinsn_running_i ), .addrgen_ack_o (addrgen_ack_o ), - .addrgen_error_o (addrgen_error_o ), - .addrgen_error_vl_o (addrgen_error_vl_o ), + .addrgen_exception_o ( addrgen_exception_o ), + .addrgen_exception_vl_o ( addrgen_exception_vl_o ), + .addrgen_exception_load_o ( addrgen_exception_load ), + .addrgen_exception_store_o ( addrgen_exception_store ), // Interface with the lanes .addrgen_operand_i (addrgen_operand_i ), .addrgen_operand_target_fu_i(addrgen_operand_target_fu_i), @@ -144,7 +170,19 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_addrgen_req_o (axi_addrgen_req ), .axi_addrgen_req_valid_o (axi_addrgen_req_valid ), .ldu_axi_addrgen_req_ready_i(ldu_axi_addrgen_req_ready ), - .stu_axi_addrgen_req_ready_i(stu_axi_addrgen_req_ready ) + .stu_axi_addrgen_req_ready_i(stu_axi_addrgen_req_ready ), + + // CSR input + .en_ld_st_translation_i, + .mmu_misaligned_ex_o, + .mmu_req_o, + .mmu_vaddr_o, + .mmu_is_store_o, + .mmu_dtlb_hit_i, + .mmu_dtlb_ppn_i, + .mmu_valid_i, + .mmu_paddr_i, + .mmu_exception_i ); //////////////////////// @@ -165,7 +203,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_r_valid_i (axi_resp.r_valid ), .axi_r_ready_o (axi_req.r_ready ), // Interface with the dispatcher - .load_complete_o (load_complete_o ), + .load_complete_o (load_complete ), // Interface with the main sequencer .pe_req_i (pe_req_i ), .pe_req_valid_i (pe_req_valid_i ), @@ -173,6 +211,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_req_ready_o 
(pe_req_ready_o[OffsetLoad]), .pe_resp_o (pe_resp_o[OffsetLoad] ), // Interface with the address generator + .addrgen_exception_valid_i ( addrgen_ack_o & addrgen_exception_o.valid ), .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ), .axi_addrgen_req_ready_o(ldu_axi_addrgen_req_ready ), @@ -213,7 +252,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_b_ready_o (axi_req.b_ready ), // Interface with the dispatcher .store_pending_o (store_pending_o ), - .store_complete_o (store_complete_o ), + .store_complete_o (store_complete ), // Interface with the main sequencer .pe_req_i (pe_req_i ), .pe_req_valid_i (pe_req_valid_i ), @@ -221,6 +260,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_req_ready_o (pe_req_ready_o[OffsetStore]), .pe_resp_o (pe_resp_o[OffsetStore] ), // Interface with the address generator + .addrgen_exception_valid_i ( addrgen_ack_o & addrgen_exception_o.valid ), .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ), .axi_addrgen_req_ready_o(stu_axi_addrgen_req_ready ), diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv index 9580f59b0..f6e5e38ca 100644 --- a/hardware/src/vlsu/vstu.sv +++ b/hardware/src/vlsu/vstu.sv @@ -46,6 +46,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( output pe_resp_t pe_resp_o, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, + input logic addrgen_exception_valid_i, input logic axi_addrgen_req_valid_i, output logic axi_addrgen_req_ready_o, // Interface with the lanes @@ -63,12 +64,14 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( import axi_pkg::beat_upper_byte; import axi_pkg::BURST_INCR; + localparam unsigned DataWidthB = DataWidth / 8; + /////////////////////// // Spill registers // /////////////////////// elen_t [NrLanes-1:0] stu_operand; - logic [NrLanes-1:0] stu_operand_valid; + logic [NrLanes-1:0] stu_operand_valid_lanes; logic stu_operand_ready; for 
(genvar lane = 0; lane < NrLanes; lane++) begin: gen_regs @@ -83,7 +86,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( .valid_i (stu_operand_valid_i[lane]), .ready_o (stu_operand_ready_o[lane]), .data_o (stu_operand[lane] ), - .valid_o (stu_operand_valid[lane] ), + .valid_o (stu_operand_valid_lanes[lane] ), .ready_i (stu_operand_ready ) ); end: gen_regs @@ -153,30 +156,47 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // Store Unit // ////////////////// + // NOTE: these are out here only for debug visibility, they could go in p_vstu as automatic variables + int unsigned vrf_seq_byte; + int unsigned vrf_byte ; + vlen_t vrf_valid_bytes ; + vlen_t vinsn_valid_bytes; + vlen_t axi_valid_bytes ; + logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; + + // Vector instructions currently running logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; // Interface with the main sequencer - pe_resp_t pe_resp; + pe_resp_t pe_resp_d; // Remaining bytes of the current instruction in the issue phase - vlen_t issue_cnt_d, issue_cnt_q; + vlen_t issue_cnt_bytes_d, issue_cnt_bytes_q; // Pointers // // We need several pointers to copy data to the memory interface // from the VRF. Namely, we need: // - A counter of how many beats are left in the current AXI burst - axi_pkg::len_t len_d, len_q; + axi_pkg::len_t axi_len_d, axi_len_q; // - A pointer to which byte in the full VRF word we are reading data from. 
logic [idx_width(DataWidth*NrLanes/8):0] vrf_pnt_d, vrf_pnt_q; always_comb begin: p_vstu + // NOTE: these are out here only for debug visibility, they could go in p_vstu as automatic variables + vrf_seq_byte = '0; + vrf_byte = '0; + vrf_valid_bytes = '0; + vinsn_valid_bytes = '0; + axi_valid_bytes = '0; + valid_bytes = '0; + // Maintain state vinsn_queue_d = vinsn_queue_q; - issue_cnt_d = issue_cnt_q; + issue_cnt_bytes_d = issue_cnt_bytes_q; - len_d = len_q; + axi_len_d = axi_len_q; vrf_pnt_d = vrf_pnt_q; // Vector instructions currently running @@ -184,7 +204,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // We are not ready, by default axi_addrgen_req_ready_o = 1'b0; - pe_resp = '0; + pe_resp_d = '0; axi_w_o = '0; axi_w_valid_o = 1'b0; axi_b_ready_o = 1'b0; @@ -204,92 +224,130 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // - We received all the operands from the lanes // - The address generator generated an AXI AW request for this write beat // - The AXI subsystem is ready to accept this W beat - if (vinsn_issue_valid && &stu_operand_valid && (vinsn_issue_q.vm || (|mask_valid_i)) && - axi_addrgen_req_valid_i && !axi_addrgen_req_i.is_load && axi_w_ready_i) begin + if (vinsn_issue_valid && + axi_addrgen_req_valid_i && !axi_addrgen_req_i.is_load && axi_w_ready_i) begin : issue_valid // Bytes valid in the current W beat automatic shortint unsigned lower_byte = beat_lower_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); automatic shortint unsigned upper_byte = beat_upper_byte(axi_addrgen_req_i.addr, - axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, len_q); + axi_addrgen_req_i.size, axi_addrgen_req_i.len, BURST_INCR, AxiDataWidth/8, axi_len_q); + + // For non-zero vstart values, the last operand read is not going to involve all the lanes + automatic logic [NrLanes-1:0] 
stu_operand_valid; + automatic logic [NrLanes-1:0] mask_valid; + // How many bytes are we committing? + // automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; + // Account for the issued bytes // How many bytes are valid in this VRF word - automatic vlen_t vrf_valid_bytes = NrLanes * 8 - vrf_pnt_q; + vrf_valid_bytes = (NrLanes * DataWidthB) - vrf_pnt_q; // How many bytes are valid in this instruction - automatic vlen_t vinsn_valid_bytes = issue_cnt_q - vrf_pnt_q; + vinsn_valid_bytes = issue_cnt_bytes_q - vrf_pnt_q; // How many bytes are valid in this AXI word - automatic vlen_t axi_valid_bytes = upper_byte - lower_byte + 1; - - // How many bytes are we committing? - automatic logic [idx_width(DataWidth*NrLanes/8):0] valid_bytes; - valid_bytes = issue_cnt_q < NrLanes * 8 ? vinsn_valid_bytes : vrf_valid_bytes; - valid_bytes = valid_bytes < axi_valid_bytes ? valid_bytes : axi_valid_bytes; - - vrf_pnt_d = vrf_pnt_q + valid_bytes; - - // Copy data from the operands into the W channel - for (int axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin - // Is this byte a valid byte in the W beat? - if (axi_byte >= lower_byte && axi_byte <= upper_byte) begin - // Map axy_byte to the corresponding byte in the VRF word (sequential) - automatic int vrf_seq_byte = axi_byte - lower_byte + vrf_pnt_q; - // And then shuffle it - automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.eew_vs1); - - // Is this byte a valid byte in the VRF word? - if (vrf_seq_byte < issue_cnt_q) begin - // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? - automatic int vrf_lane = vrf_byte >> 3; - automatic int vrf_offset = vrf_byte[2:0]; - - // Copy data - axi_w_o.data[8*axi_byte +: 8] = stu_operand[vrf_lane][8*vrf_offset +: 8]; - axi_w_o.strb[axi_byte] = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset]; + axi_valid_bytes = upper_byte - lower_byte + 1; + + valid_bytes = ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) ? 
vinsn_valid_bytes : vrf_valid_bytes; + valid_bytes = ( valid_bytes < axi_valid_bytes ) ? valid_bytes : axi_valid_bytes; + + // Adjust valid signals to the next block "operands_ready" + stu_operand_valid = stu_operand_valid_lanes; + for ( int unsigned lane = 0; lane < NrLanes; lane++ ) begin : adjust_operand_valid + // - We are left with fewer bytes than the maximum to issue, + // this means that at least one lane is not going to push us any operand anymore + // - For the lanes which index % NrLanes != 0 + if ( ( issue_cnt_bytes_q < (NrLanes * DataWidthB) ) + & ( lane < vinsn_issue_q.vstart[idx_width(NrLanes)-1:0] ) + ) begin : vstart_lane_adjust + stu_operand_valid[lane] |= 1'b1; + end : vstart_lane_adjust + end : adjust_operand_valid + + // TODO: apply the same vstart logic also to mask_valid_i + // For now, assume (vstart % NrLanes == 0) + mask_valid = mask_valid_i; + + // Wait for all expected operands from the lanes + if ( &stu_operand_valid && (vinsn_issue_q.vm || (|mask_valid_i) ) ) begin : operands_ready + vrf_pnt_d = vrf_pnt_q + valid_bytes; + + // Copy data from the operands into the W channel + for (int unsigned axi_byte = 0; axi_byte < AxiDataWidth/8; axi_byte++) begin : stu_operand_to_axi_w + // Is this byte a valid byte in the W beat? + if (axi_byte >= lower_byte && axi_byte <= upper_byte) begin + // Map axi_byte to the corresponding byte in the VRF word (sequential) + vrf_seq_byte = axi_byte - lower_byte + vrf_pnt_q; + // And then shuffle it + vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue_q.eew_vs1); + + // Is this byte a valid byte in the VRF word? + if (vrf_seq_byte < issue_cnt_bytes_q) begin + // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? 
+ automatic int unsigned vrf_offset = vrf_byte[2:0]; + + // Consider also vstart and make sure this index wraps around the number of lanes + // automatic logic [$clog2(NrLanes)-1:0] vrf_lane = (vrf_byte >> 3) + vinsn_issue_q.vstart[idx_width(NrLanes)-1:0]; + automatic int unsigned vrf_lane = (vrf_byte >> 3); + // Adjust lane selection w.r.t. vstart + vrf_lane += vinsn_issue_q.vstart[idx_width(NrLanes)-1:0]; + if ( vrf_lane >= NrLanes ) begin : vstart_lane_adjust + vrf_lane -= NrLanes; + end : vstart_lane_adjust + + // Copy data + axi_w_o.data[8*axi_byte +: 8] = stu_operand[vrf_lane][8*vrf_offset +: 8]; + axi_w_o.strb[axi_byte] = vinsn_issue_q.vm || mask_i[vrf_lane][vrf_offset]; + end end - end - end - - // Send the W beat - axi_w_valid_o = 1'b1; - // Account for the beat we sent - len_d = len_q + 1; - // We wrote all the beats for this AW burst - if ($unsigned(len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin - axi_w_o.last = 1'b1; - // Ask for another burst by the address generator - axi_addrgen_req_ready_o = 1'b1; - // Reset AXI pointers - len_d = '0; - end - - // We consumed a whole word from the lanes - if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_q) begin - // Reset the pointer in the VRF word - vrf_pnt_d = '0; - // Acknowledge the operands with the lanes - stu_operand_ready = '1; - // Acknowledge the mask operand - mask_ready_o = !vinsn_issue_q.vm; - // Account for the results that were issued - issue_cnt_d = issue_cnt_q - NrLanes * 8; - if (issue_cnt_q < NrLanes * 8) - issue_cnt_d = '0; - end - end + end : stu_operand_to_axi_w + + // Send the W beat + axi_w_valid_o = 1'b1; + // Account for the beat we sent + axi_len_d = axi_len_q + 1; + // We wrote all the beats for this AW burst + if ($unsigned(axi_len_d) == axi_pkg::len_t'($unsigned(axi_addrgen_req_i.len) + 1)) begin : beats_complete + axi_w_o.last = 1'b1; + // Ask for another burst by the address generator + axi_addrgen_req_ready_o = 1'b1; + // Reset AXI pointers + axi_len_d = 
'0; + end : beats_complete + + // We consumed a whole word from the lanes + if (vrf_pnt_d == NrLanes*8 || vrf_pnt_d == issue_cnt_bytes_q) begin : vrf_word_done + // Reset the pointer in the VRF word + vrf_pnt_d = '0; + // Acknowledge the operands with the lanes + stu_operand_ready = '1; + // Acknowledge the mask operand + mask_ready_o = !vinsn_issue_q.vm; + // Account for the results that were issued + issue_cnt_bytes_d = issue_cnt_bytes_q - (NrLanes * DataWidthB); + if (issue_cnt_bytes_q < (NrLanes * DataWidthB)) begin : issue_cnt_bytes_overflow + issue_cnt_bytes_d = '0; + end : issue_cnt_bytes_overflow + end : vrf_word_done + end : operands_ready + end : issue_valid // Finished issuing W beats for this vector store - if (vinsn_issue_valid && issue_cnt_d == 0) begin + if (vinsn_issue_valid && issue_cnt_bytes_d == 0) begin : axi_w_beat_finish // Bump issue counters and pointers of the vector instruction queue vinsn_queue_d.issue_cnt -= 1; - if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) + if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) begin : issue_pnt_overflow vinsn_queue_d.issue_pnt = 0; - else + end : issue_pnt_overflow + else begin : issue_pnt_increment vinsn_queue_d.issue_pnt += 1; + end : issue_pnt_increment - if (vinsn_queue_d.issue_cnt != 0) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl << - int'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew); - end + if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update + issue_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl - + vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart + ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew); + end : issue_cnt_bytes_update + end : axi_w_beat_finish //////////////////////////// // Handle the B channel // @@ -297,63 +355,66 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // TODO: We cannot handle errors on the B channel. // We just acknowledge any AXI requests that come on the B channel. 
- if (axi_b_valid_i) begin + if (axi_b_valid_i) begin : axi_b_valid // Acknowledge the B beat axi_b_ready_o = 1'b1; // Mark the vector instruction as being done - if (vinsn_queue_d.issue_pnt != vinsn_queue_d.commit_pnt) begin + if (vinsn_queue_d.issue_pnt != vinsn_queue_d.commit_pnt) begin : instr_done // Signal complete store store_complete_o = 1'b1; - pe_resp.vinsn_done[vinsn_commit.id] = 1'b1; + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; // Update the commit counters and pointers vinsn_queue_d.commit_cnt -= 1; - if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) + if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) begin : commit_pnt_overflow vinsn_queue_d.commit_pnt = '0; - else + end : commit_pnt_overflow + else begin : commit_pnt_increment vinsn_queue_d.commit_pnt += 1; - end - end + end : commit_pnt_increment + end : instr_done + end : axi_b_valid ////////////////////////////// // Accept new instruction // ////////////////////////////// if (!vinsn_queue_full && pe_req_valid_i && !vinsn_running_q[pe_req_i.id] && - pe_req_i.vfu == VFU_StoreUnit) begin + pe_req_i.vfu == VFU_StoreUnit) begin : issue_cnt_bytes_init vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = pe_req_i; vinsn_running_d[pe_req_i.id] = 1'b1; // Initialize counters - if (vinsn_queue_d.issue_cnt == '0) - issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); + if (vinsn_queue_d.issue_cnt == '0) begin : issue_cnt_bytes_init + issue_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); + end : issue_cnt_bytes_init // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; vinsn_queue_d.issue_cnt += 1; vinsn_queue_d.commit_cnt += 1; - end + end : issue_cnt_bytes_init end: p_vstu always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin vinsn_running_q <= '0; - issue_cnt_q <= '0; + issue_cnt_bytes_q <= '0; - len_q <= '0; + axi_len_q <= '0; vrf_pnt_q <= '0; pe_resp_o <= '0; end else begin vinsn_running_q <= vinsn_running_d; - 
issue_cnt_q <= issue_cnt_d; + issue_cnt_bytes_q <= issue_cnt_bytes_d; - len_q <= len_d; + axi_len_q <= axi_len_d; vrf_pnt_q <= vrf_pnt_d; - pe_resp_o <= pe_resp; + pe_resp_o <= pe_resp_d; end end diff --git a/hardware/tb/ara_testharness.sv b/hardware/tb/ara_testharness.sv index 09901b262..84edf4c8e 100644 --- a/hardware/tb/ara_testharness.sv +++ b/hardware/tb/ara_testharness.sv @@ -153,7 +153,7 @@ module ara_testharness #( // If disabled if (!runtime_cnt_en_q) // Start only if the software allowed the enable and we detect the first V instruction - runtime_cnt_en_d = i_ara_soc.i_system.i_ara.acc_req_valid_i & cnt_en_mask; + runtime_cnt_en_d = i_ara_soc.i_system.i_ara.acc_req_i.req_valid & cnt_en_mask; // If enabled if (runtime_cnt_en_q) // Stop counting only if the software disabled the counter and Ara returned idle @@ -177,14 +177,14 @@ module ara_testharness #( runtime_to_be_updated_d = runtime_to_be_updated_q; // Assert the update flag upon a new valid vector instruction - if (!runtime_to_be_updated_q && i_ara_soc.i_system.i_ara.acc_req_valid_i) begin + if (!runtime_to_be_updated_q && i_ara_soc.i_system.i_ara.acc_req_i.req_valid) begin runtime_to_be_updated_d = 1'b1; end // Update the internal runtime and reset the update flag if (runtime_to_be_updated_q && i_ara_soc.i_system.i_ara.ara_idle && - !i_ara_soc.i_system.i_ara.acc_req_valid_i) begin + !i_ara_soc.i_system.i_ara.acc_req_i.req_valid) begin runtime_buf_d = runtime_cnt_q; runtime_to_be_updated_d = 1'b0; end diff --git a/hardware/tb/dpi/elfloader.cc b/hardware/tb/dpi/elfloader.cc index 60f06c358..7e0528f54 120000 --- a/hardware/tb/dpi/elfloader.cc +++ b/hardware/tb/dpi/elfloader.cc @@ -1 +1 @@ -../../deps/cva6/tb/dpi/elfloader.cc \ No newline at end of file +../../deps/cva6/corev_apu/tb/dpi/elfloader.cc \ No newline at end of file diff --git a/scripts/check_cycles.py b/scripts/check_cycles.py index 7b7040c07..24a861d1a 100644 --- a/scripts/check_cycles.py +++ b/scripts/check_cycles.py @@ -24,19 +24,19 @@ 
import numpy as np threshold = { - 'imatmul' : 300, - 'fmatmul' : 300, - 'iconv2d' : 300, - 'fconv2d' : 300, - 'fconv3d' : 300, - 'jacobi2d' : 300, - 'dropout' : 300, - 'fft' : 300, - 'dwt' : 300, - 'exp' : 300, - 'softmax' : 300, - 'pathfinder' : 300, - 'roi_align' : 300, + 'imatmul' : 500, + 'fmatmul' : 500, + 'iconv2d' : 500, + 'fconv2d' : 500, + 'fconv3d' : 500, + 'jacobi2d' : 500, + 'dropout' : 500, + 'fft' : 500, + 'dwt' : 500, + 'exp' : 500, + 'softmax' : 500, + 'pathfinder' : 500, + 'roi_align' : 500, } skip_check = {