diff --git a/Bender.lock b/Bender.lock index 0458a760d..e3514dcd2 100644 --- a/Bender.lock +++ b/Bender.lock @@ -30,10 +30,10 @@ packages: Git: https://github.com/pulp-platform/common_verification.git dependencies: [] cva6: - revision: ee89dcc00e6c1a1f4cf97ee1835e950fcfdeebb5 + revision: ea86d0ac5fe23ac7889cf8a8b8df7a8c0813bfad version: null source: - Git: https://github.com/pulp-platform/cva6.git + Git: https://github.com/mp-17/cva6.git dependencies: - axi - common_cells diff --git a/Bender.yml b/Bender.yml index 142dcd8d0..8bcf21ee4 100644 --- a/Bender.yml +++ b/Bender.yml @@ -10,7 +10,7 @@ package: dependencies: axi: { git: "https://github.com/pulp-platform/axi.git", version: 0.39.1 } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", version: 1.22.1 } - cva6: { git: "https://github.com/pulp-platform/cva6.git", rev: ee89dcc00e6c1a1f4cf97ee1835e950fcfdeebb5 } # pulp-v1 + cva6: { git: "https://github.com/mp-17/cva6.git", rev: ea86d0ac5fe23ac7889cf8a8b8df7a8c0813bfad } # rebase/pulp-v1-os tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.13 } apb: { git: "https://github.com/pulp-platform/apb.git", version: 0.2.4 } @@ -42,7 +42,7 @@ sources: - hardware/src/lane/simd_mul.sv - hardware/src/lane/vector_regfile.sv - hardware/src/lane/power_gating_generic.sv - - hardware/src/masku/masku.sv + - hardware/src/masku/masku_operands.sv - hardware/src/sldu/p2_stride_gen.sv - hardware/src/sldu/sldu_op_dp.sv - hardware/src/sldu/sldu.sv @@ -55,6 +55,7 @@ sources: - hardware/src/lane/vmfpu.sv - hardware/src/lane/fixed_p_rounding.sv - hardware/src/vlsu/vlsu.sv + - hardware/src/masku/masku.sv # Level 3 - hardware/src/lane/vector_fus_stage.sv # Level 4 diff --git a/apps/verification/Makefile b/apps/verification/Makefile new file mode 100644 index 000000000..777068a6f --- /dev/null +++ b/apps/verification/Makefile @@ -0,0 +1,46 @@ +# Copyright 2024 ETH Zurich and University of Bologna. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Matteo Perotti, ETH Zurich + +# Variables for sequence length and number of sequences +SEQ_LENGTH ?= 6 +NUM_SEQS ?= 10 + +# Python +PYTHON ?= python3 + +# Directories +SRC_DIR = src +SCRIPT_DIR = script +OUTPUT_DIR = ../rand_seq_autogen + +# Source files +INSTRUCTIONS_FILE = $(SRC_DIR)/vinsn_list.txt +PYTHON_SCRIPT = $(SCRIPT_DIR)/vinsn_trace_gen.py +# Output files +OUTPUT_SEQ_FILE = $(OUTPUT_DIR)/vinsn_rand_seq.S +OUTPUT_MAIN_FILE = $(OUTPUT_DIR)/main.c + +# Target to generate the sequences and main file +all: $(OUTPUT_DIR) $(OUTPUT_SEQ_FILE) $(OUTPUT_MAIN_FILE) + +# Target to create the output directory +$(OUTPUT_DIR): + mkdir -p $(OUTPUT_DIR) + +$(OUTPUT_SEQ_FILE) $(OUTPUT_MAIN_FILE): $(INSTRUCTIONS_FILE) $(PYTHON_SCRIPT) + $(PYTHON) $(PYTHON_SCRIPT) $(INSTRUCTIONS_FILE) $(OUTPUT_SEQ_FILE) $(SEQ_LENGTH) $(NUM_SEQS) $(OUTPUT_MAIN_FILE) + +.PHONY: all clean diff --git a/apps/verification/README.md b/apps/verification/README.md new file mode 100644 index 000000000..54ceb92e5 --- /dev/null +++ b/apps/verification/README.md @@ -0,0 +1,10 @@ +# Usage +To generate the main.c and vinsn_rand_seq.S files with specific sequence length and number of sequences, run: + +```bash +make SEQ_LENGTH=6 NUM_SEQS=10 +``` + +This will create the output directory in the parent directory and place the main.c and vinsn_rand_seq.S files inside it. 
+The SEQ_LENGTH and NUM_SEQS variables can be adjusted as needed when running the make command. +The generated files will include comments at the beginning indicating they were auto-generated by the Python script. \ No newline at end of file diff --git a/apps/verification/script/README.md b/apps/verification/script/README.md new file mode 100644 index 000000000..3b3768cc3 --- /dev/null +++ b/apps/verification/script/README.md @@ -0,0 +1,11 @@ +Execute the script with the following command: + +```bash +python vinsn_trace_gen.py instructions.txt rand_seq.S 6 10 main.c +``` + + - instructions.txt is the input file with the list of instructions. + - rand_seq.S is the output file where the random sequences will be written. + - 6 is the length of each random sequence (including the initial vsetvli instruction). + - 10 is the number of random sequences to generate. + - main.c is the file where the main function and function declarations will be written. \ No newline at end of file diff --git a/apps/verification/script/vinsn_trace_gen.py b/apps/verification/script/vinsn_trace_gen.py new file mode 100644 index 000000000..8a3636480 --- /dev/null +++ b/apps/verification/script/vinsn_trace_gen.py @@ -0,0 +1,92 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Author: Matteo Perotti, ETH Zurich + +import random +import sys + +def load_instructions(file_path): + with open(file_path, 'r') as file: + instructions = [line.strip() for line in file if line.strip() and not line.strip().startswith('#')] + return instructions + +def generate_random_sequences(instructions, sequence_length, num_sequences): + sequences = [] + for _ in range(num_sequences): + XX = random.choice([8, 16, 32, 64]) + Y = random.choice([1, 2, 4, 8]) + initial_instruction = f'vsetvli t0, x0, e{XX}, m{Y}, ta, ma' + sequence = [initial_instruction] + random.choices(instructions, k=sequence_length - 1) + sequences.append(sequence) + return sequences + +def write_sequences_to_file(sequences, output_file_path, script_name): + with open(output_file_path, 'w') as file: + file.write(f'# This file was auto-generated by {script_name}\n') + file.write('.text\n') + for i in range(len(sequences)): + file.write(f'.global rand_seq_{i}\n') + file.write('\n') + + for i, sequence in enumerate(sequences): + file.write(f'rand_seq_{i}:\n') + for instruction in sequence: + file.write(f' {instruction}\n') + file.write(' ret\n\n') + +def write_main_file(num_sequences, output_file_path, script_name): + with open(output_file_path, 'w') as file: + file.write(f'// This file was auto-generated by {script_name}\n') + file.write('#include <stdint.h>\n') + file.write('#include <string.h>\n\n') + file.write('#ifndef SPIKE\n') + file.write('#include "printf.h"\n') + file.write('#else\n') + file.write('#include "util.h"\n') + file.write('#include <stdio.h>\n') + file.write('#endif\n\n') + + for i in range(num_sequences): + file.write(f'void rand_seq_{i}();\n') + file.write('\n') + file.write('int main() {\n') + for i in range(num_sequences): + file.write(f' printf("Rand Seq {i}\\n");\n') + file.write(f' rand_seq_{i}();\n\n') + file.write(' printf("Program end\\n");\n\n') + file.write(' return 0;\n') + file.write('}\n') + +def main(): + if len(sys.argv) != 6: + print("Usage: python generate_sequences.py <instructions_file> <output_seq_file> <sequence_length> <num_sequences> <output_main_file>") + 
sys.exit(1) + + input_file_path = sys.argv[1] + output_seq_file_path = sys.argv[2] + sequence_length = int(sys.argv[3]) + num_sequences = int(sys.argv[4]) + output_main_file_path = sys.argv[5] + + instructions = load_instructions(input_file_path) + sequences = generate_random_sequences(instructions, sequence_length, num_sequences) + script_name = sys.argv[0] + write_sequences_to_file(sequences, output_seq_file_path, script_name) + write_main_file(num_sequences, output_main_file_path, script_name) + print(f'{num_sequences} random sequences of length {sequence_length} have been written to {output_seq_file_path}') + print(f'Main file written to {output_main_file_path}') + +if __name__ == '__main__': + main() diff --git a/apps/verification/src/vinsn_list.txt b/apps/verification/src/vinsn_list.txt new file mode 100644 index 000000000..5f7f304e2 --- /dev/null +++ b/apps/verification/src/vinsn_list.txt @@ -0,0 +1,575 @@ +# Author: Camel Coder, camel-cdr, + +vadd.vv v8,v16,v24 +vadd.vv v8,v16,v24,v0.t +vadd.vx v8,v16,t0 +vadd.vx v8,v16,t0,v0.t +vadd.vi v8,v16,13 +vadd.vi v8,v16,13,v0.t +vsub.vv v8,v16,v24 +vsub.vv v8,v16,v24,v0.t +vsub.vx v8,v16,t0 +vsub.vx v8,v16,t0,v0.t +vrsub.vx v8,v16,t0 +vrsub.vx v8,v16,t0,v0.t +vrsub.vi v8,v16,13 +vrsub.vi v8,v16,13,v0.t +vminu.vv v8,v16,v24 +vminu.vv v8,v16,v24,v0.t +vminu.vx v8,v16,t0 +vminu.vx v8,v16,t0,v0.t +vmin.vv v8,v16,v24 +vmin.vv v8,v16,v24,v0.t +vmin.vx v8,v16,t0 +vmin.vx v8,v16,t0,v0.t +vmaxu.vv v8,v16,v24 +vmaxu.vv v8,v16,v24,v0.t +vmaxu.vx v8,v16,t0 +vmaxu.vx v8,v16,t0,v0.t +vmax.vv v8,v16,v24 +vmax.vv v8,v16,v24,v0.t +vmax.vx v8,v16,t0 +vmax.vx v8,v16,t0,v0.t +vand.vv v8,v16,v24 +vand.vv v8,v16,v24,v0.t +vand.vx v8,v16,t0 +vand.vx v8,v16,t0,v0.t +vand.vi v8,v16,13 +vand.vi v8,v16,13,v0.t +vor.vv v8,v16,v24 +vor.vv v8,v16,v24,v0.t +vor.vx v8,v16,t0 +vor.vx v8,v16,t0,v0.t +vor.vi v8,v16,13 +vor.vi v8,v16,13,v0.t +vxor.vv v8,v16,v24 +vxor.vv v8,v16,v24,v0.t +vxor.vx v8,v16,t0 +vxor.vx v8,v16,t0,v0.t +vxor.vi v8,v16,13 
+vxor.vi v8,v16,13,v0.t +vrgather.vv v8,v16,v24 +vrgather.vv v8,v16,v24,v0.t +vrgather.vx v8,v16,t0 +vrgather.vx v8,v16,t0,v0.t +vrgather.vi v8,v16,3 +vrgather.vi v8,v16,3,v0.t +vslideup.vx v8,v16,t0 +vslideup.vx v8,v16,t0,v0.t +vslideup.vi v8,v16,3 +vslideup.vi v8,v16,3,v0.t +vrgatherei16.vv v8,v16,v24 +vrgatherei16.vv v8,v16,v24,v0.t +vslidedown.vx v8,v16,t0 +vslidedown.vx v8,v16,t0,v0.t +vslidedown.vi v8,v16,3 +vslidedown.vi v8,v16,3,v0.t +vredsum.vs v8,v16,v24 +vredsum.vs v8,v16,v24,v0.t +vredand.vs v8,v16,v24 +vredand.vs v8,v16,v24,v0.t +vredor.vs v8,v16,v24 +vredor.vs v8,v16,v24,v0.t +vredxor.vs v8,v16,v24 +vredxor.vs v8,v16,v24,v0.t +vredminu.vs v8,v16,v24 +vredminu.vs v8,v16,v24,v0.t +vredmin.vs v8,v16,v24 +vredmin.vs v8,v16,v24,v0.t +vredmaxu.vs v8,v16,v24 +vredmaxu.vs v8,v16,v24,v0.t +vredmax.vs v8,v16,v24 +vredmax.vs v8,v16,v24,v0.t +vaaddu.vv v8,v16,v24 +vaaddu.vv v8,v16,v24,v0.t +vaaddu.vx v8,v16,t0 +vaaddu.vx v8,v16,t0,v0.t +vaadd.vv v8,v16,v24 +vaadd.vv v8,v16,v24,v0.t +vaadd.vx v8,v16,t0 +vaadd.vx v8,v16,t0,v0.t +vasubu.vv v8,v16,v24 +vasubu.vv v8,v16,v24,v0.t +vasubu.vx v8,v16,t0 +vasubu.vx v8,v16,t0,v0.t +vasub.vv v8,v16,v24 +vasub.vv v8,v16,v24,v0.t +vasub.vx v8,v16,t0 +vasub.vx v8,v16,t0,v0.t +vslide1up.vx v8,v16,t0 +vslide1up.vx v8,v16,t0,v0.t +vslide1down.vx v8,v16,t0 +vslide1down.vx v8,v16,t0,v0.t +vadc.vvm v8,v16,v24,v0 +vadc.vxm v8,v16,t0,v0 +vadc.vim v8,v16,13,v0 +vmadc.vvm v8,v16,v24,v0 +vmadc.vxm v8,v16,t0,v0 +vmadc.vim v8,v16,13,v0 +vsbc.vvm v8,v16,v24,v0 +vsbc.vxm v8,v16,t0,v0 +vmsbc.vvm v8,v16,v24,v0 +vmsbc.vxm v8,v16,t0,v0 +vmerge.vvm v8,v16,v24,v0 +vmerge.vxm v8,v16,t0,v0 +vmerge.vim v8,v16,13,v0 +vmv.v.v v8,v16 +vmv.v.x v8,t0 +vmv.v.i v8,13 +vmseq.vv v8,v16,v24 +vmseq.vv v8,v16,v24,v0.t +vmseq.vx v8,v16,t0 +vmseq.vx v8,v16,t0,v0.t +vmseq.vi v8,v16,13 +vmseq.vi v8,v16,13,v0.t +vmsne.vv v8,v16,v24 +vmsne.vv v8,v16,v24,v0.t +vmsne.vx v8,v16,t0 +vmsne.vx v8,v16,t0,v0.t +vmsne.vi v8,v16,13 +vmsne.vi v8,v16,13,v0.t +vmsltu.vv v8,v16,v24 
+vmsltu.vv v8,v16,v24,v0.t +vmsltu.vx v8,v16,t0 +vmsltu.vx v8,v16,t0,v0.t +vmslt.vv v8,v16,v24 +vmslt.vv v8,v16,v24,v0.t +vmslt.vx v8,v16,t0 +vmslt.vx v8,v16,t0,v0.t +vmsleu.vv v8,v16,v24 +vmsleu.vv v8,v16,v24,v0.t +vmsleu.vx v8,v16,t0 +vmsleu.vx v8,v16,t0,v0.t +vmsleu.vi v8,v16,13 +vmsleu.vi v8,v16,13,v0.t +vmsle.vv v8,v16,v24 +vmsle.vv v8,v16,v24,v0.t +vmsle.vx v8,v16,t0 +vmsle.vx v8,v16,t0,v0.t +vmsle.vi v8,v16,13 +vmsle.vi v8,v16,13,v0.t +vmsgtu.vx v8,v16,t0 +vmsgtu.vx v8,v16,t0,v0.t +vmsgtu.vi v8,v16,13 +vmsgtu.vi v8,v16,13,v0.t +vmsgt.vx v8,v16,t0 +vmsgt.vx v8,v16,t0,v0.t +vmsgt.vi v8,v16,13 +vmsgt.vi v8,v16,13,v0.t +vcompress.vm v0,v8,v16 +vmandn.mm v0,v8,v16 +vmand.mm v0,v8,v16 +vmor.mm v0,v8,v16 +vmxor.mm v0,v8,v16 +vmorn.mm v0,v8,v16 +vmnand.mm v0,v8,v16 +vmnor.mm v0,v8,v16 +vmxnor.mm v0,v8,v16 +vsaddu.vv v8,v16,v24 +vsaddu.vv v8,v16,v24,v0.t +vsaddu.vx v8,v16,t0 +vsaddu.vx v8,v16,t0,v0.t +vsaddu.vi v8,v16,13 +vsaddu.vi v8,v16,13,v0.t +vsadd.vv v8,v16,v24 +vsadd.vv v8,v16,v24,v0.t +vsadd.vx v8,v16,t0 +vsadd.vx v8,v16,t0,v0.t +vsadd.vi v8,v16,13 +vsadd.vi v8,v16,13,v0.t +vssubu.vv v8,v16,v24 +vssubu.vv v8,v16,v24,v0.t +vssubu.vx v8,v16,t0 +vssubu.vx v8,v16,t0,v0.t +vssub.vv v8,v16,v24 +vssub.vv v8,v16,v24,v0.t +vssub.vx v8,v16,t0 +vssub.vx v8,v16,t0,v0.t +vsll.vv v8,v16,v24 +vsll.vv v8,v16,v24,v0.t +vsll.vx v8,v16,t0 +vsll.vx v8,v16,t0,v0.t +vsll.vi v8,v16,13 +vsll.vi v8,v16,13,v0.t +vsmul.vv v8,v16,v24 +vsmul.vv v8,v16,v24,v0.t +vsmul.vx v8,v16,t0 +vsmul.vx v8,v16,t0,v0.t +vmv1r.v v8,v16 +vmv2r.v v8,v16 +vmv4r.v v8,v16 +vmv8r.v v8,v16 +vsrl.vv v8,v16,v24 +vsrl.vv v8,v16,v24,v0.t +vsrl.vx v8,v16,t0 +vsrl.vx v8,v16,t0,v0.t +vsrl.vi v8,v16,13 +vsrl.vi v8,v16,13,v0.t +vsra.vv v8,v16,v24 +vsra.vv v8,v16,v24,v0.t +vsra.vx v8,v16,t0 +vsra.vx v8,v16,t0,v0.t +vsra.vi v8,v16,13 +vsra.vi v8,v16,13,v0.t +vssrl.vv v8,v16,v24 +vssrl.vv v8,v16,v24,v0.t +vssrl.vx v8,v16,t0 +vssrl.vx v8,v16,t0,v0.t +vssrl.vi v8,v16,13 +vssrl.vi v8,v16,13,v0.t +vdivu.vv v8,v16,v24 
+vdivu.vv v8,v16,v24,v0.t +vdivu.vx v8,v16,t0 +vdivu.vx v8,v16,t0,v0.t +vdiv.vv v8,v16,v24 +vdiv.vv v8,v16,v24,v0.t +vdiv.vx v8,v16,t0 +vdiv.vx v8,v16,t0,v0.t +vremu.vv v8,v16,v24 +vremu.vv v8,v16,v24,v0.t +vremu.vx v8,v16,t0 +vremu.vx v8,v16,t0,v0.t +vrem.vv v8,v16,v24 +vrem.vv v8,v16,v24,v0.t +vrem.vx v8,v16,t0 +vrem.vx v8,v16,t0,v0.t +vmulhu.vv v8,v16,v24 +vmulhu.vv v8,v16,v24,v0.t +vmulhu.vx v8,v16,t0 +vmulhu.vx v8,v16,t0,v0.t +vmul.vv v8,v16,v24 +vmul.vv v8,v16,v24,v0.t +vmul.vx v8,v16,t0 +vmul.vx v8,v16,t0,v0.t +vmulhsu.vv v8,v16,v24 +vmulhsu.vv v8,v16,v24,v0.t +vmulhsu.vx v8,v16,t0 +vmulhsu.vx v8,v16,t0,v0.t +vmulh.vv v8,v16,v24 +vmulh.vv v8,v16,v24,v0.t +vmulh.vx v8,v16,t0 +vmulh.vx v8,v16,t0,v0.t +vmadd.vv v8,v16,v24 +vmadd.vv v8,v16,v24,v0.t +vmadd.vx v8,t0,v16 +vmadd.vx v8,t0,v16,v0.t +vmacc.vv v8,v16,v24 +vmacc.vv v8,v16,v24,v0.t +vmacc.vx v8,t0,v16 +vmacc.vx v8,t0,v16,v0.t +vnsrl.wv v8,v16,v24 +vnsrl.wv v8,v16,v24,v0.t +vnsrl.wx v8,v16,t0 +vnsrl.wx v8,v16,t0,v0.t +vnsrl.wi v8,v16,13 +vnsrl.wi v8,v16,13,v0.t +vnsra.wv v8,v16,v24 +vnsra.wv v8,v16,v24,v0.t +vnsra.wx v8,v16,t0 +vnsra.wx v8,v16,t0,v0.t +vnsra.wi v8,v16,13 +vnsra.wi v8,v16,13,v0.t +vnclipu.wv v8,v16,v24 +vnclipu.wv v8,v16,v24,v0.t +vnclipu.wx v8,v16,t0 +vnclipu.wx v8,v16,t0,v0.t +vnclipu.wi v8,v16,13 +vnclipu.wi v8,v16,13,v0.t +vnclip.wv v8,v16,v24 +vnclip.wv v8,v16,v24,v0.t +vnclip.wx v8,v16,t0 +vnclip.wx v8,v16,t0,v0.t +vnclip.wi v8,v16,13 +vnclip.wi v8,v16,13,v0.t +vnmsub.vv v8,v16,v24 +vnmsub.vv v8,v16,v24,v0.t +vnmsub.vx v8,t0,v16 +vnmsub.vx v8,t0,v16,v0.t +vnmsac.vv v8,v16,v24 +vnmsac.vv v8,v16,v24,v0.t +vnmsac.vx v8,t0,v16 +vnmsac.vx v8,t0,v16,v0.t +vwaddu.vv v8,v16,v24 +vwaddu.vv v8,v16,v24,v0.t +vwaddu.vx v8,v16,t0 +vwaddu.vx v8,v16,t0,v0.t +vwadd.vv v8,v16,v24 +vwadd.vv v8,v16,v24,v0.t +vwadd.vx v8,v16,t0 +vwadd.vx v8,v16,t0,v0.t +vwsub.vv v8,v16,v24 +vwsub.vv v8,v16,v24,v0.t +vwsub.vx v8,v16,t0 +vwsub.vx v8,v16,t0,v0.t +vwaddu.wv v8,v16,v24 +vwaddu.wv v8,v16,v24,v0.t +vwaddu.wx 
v8,v16,t0 +vwaddu.wx v8,v16,t0,v0.t +vwadd.wv v8,v16,v24 +vwadd.wv v8,v16,v24,v0.t +vwadd.wx v8,v16,t0 +vwadd.wx v8,v16,t0,v0.t +vwsub.wv v8,v16,v24 +vwsub.wv v8,v16,v24,v0.t +vwsub.wx v8,v16,t0 +vwsub.wx v8,v16,t0,v0.t +vwmulu.vv v8,v16,v24 +vwmulu.vv v8,v16,v24,v0.t +vwmulu.vx v8,v16,t0 +vwmulu.vx v8,v16,t0,v0.t +vwmulsu.vv v8,v16,v24 +vwmulsu.vv v8,v16,v24,v0.t +vwmulsu.vx v8,v16,t0 +vwmulsu.vx v8,v16,t0,v0.t +vwmul.vv v8,v16,v24 +vwmul.vv v8,v16,v24,v0.t +vwmul.vx v8,v16,t0 +vwmul.vx v8,v16,t0,v0.t +vwmaccu.vv v8,v16,v24 +vwmaccu.vv v8,v16,v24,v0.t +vwmaccu.vx v8,t0,v16 +vwmaccu.vx v8,t0,v16,v0.t +vwmacc.vv v8,v16,v24 +vwmacc.vv v8,v16,v24,v0.t +vwmacc.vx v8,t0,v16 +vwmacc.vx v8,t0,v16,v0.t +vwmaccsu.vv v8,v16,v24 +vwmaccsu.vv v8,v16,v24,v0.t +vwmaccsu.vx v8,t0,v16 +vwmaccsu.vx v8,t0,v16,v0.t +vwmaccus.vx v8,t0,v16 +vwmaccus.vx v8,t0,v16,v0.t +vfadd.vv v8,v16,v24 +vfadd.vv v8,v16,v24,v0.t +vfadd.vf v8,v16,ft0 +vfadd.vf v8,v16,ft0,v0.t +vfsub.vv v8,v16,v24 +vfsub.vv v8,v16,v24,v0.t +vfsub.vf v8,v16,ft0 +vfsub.vf v8,v16,ft0,v0.t +vfmin.vv v8,v16,v24 +vfmin.vv v8,v16,v24,v0.t +vfmin.vf v8,v16,ft0 +vfmin.vf v8,v16,ft0,v0.t +vfmax.vv v8,v16,v24 +vfmax.vv v8,v16,v24,v0.t +vfmax.vf v8,v16,ft0 +vfmax.vf v8,v16,ft0,v0.t +vfsgnj.vv v8,v16,v24 +vfsgnj.vv v8,v16,v24,v0.t +vfsgnj.vf v8,v16,ft0 +vfsgnj.vf v8,v16,ft0,v0.t +vfsgnjn.vv v8,v16,v24 +vfsgnjn.vv v8,v16,v24,v0.t +vfsgnjn.vf v8,v16,ft0 +vfsgnjn.vf v8,v16,ft0,v0.t +vfsgnjx.vv v8,v16,v24 +vfsgnjx.vv v8,v16,v24,v0.t +vfsgnjx.vf v8,v16,ft0 +vfsgnjx.vf v8,v16,ft0,v0.t +vfslide1up.vf v8,v16,ft0 +vfslide1up.vf v8,v16,ft0,v0.t +vfslide1down.vf v8,v16,ft0 +vfslide1down.vf v8,v16,ft0,v0.t +vfredusum.vs v8,v16,v24 +vfredusum.vs v8,v16,v24,v0.t +vfredosum.vs v8,v16,v24 +vfredosum.vs v8,v16,v24,v0.t +vfredmin.vs v8,v16,v24 +vfredmin.vs v8,v16,v24,v0.t +vfredmax.vs v8,v16,v24 +vfredmax.vs v8,v16,v24,v0.t +vfmerge.vfm v8,v16,ft0,v0 +vfmv.v.f v8,ft0 +vmfeq.vv v8,v16,v24 +vmfeq.vv v8,v16,v24,v0.t +vmfeq.vf v8,v16,ft0 +vmfeq.vf 
v8,v16,ft0,v0.t +vmfle.vv v8,v16,v24 +vmfle.vv v8,v16,v24,v0.t +vmfle.vf v8,v16,ft0 +vmfle.vf v8,v16,ft0,v0.t +vmflt.vv v8,v16,v24 +vmflt.vv v8,v16,v24,v0.t +vmflt.vf v8,v16,ft0 +vmflt.vf v8,v16,ft0,v0.t +vmfne.vv v8,v16,v24 +vmfne.vv v8,v16,v24,v0.t +vmfne.vf v8,v16,ft0 +vmfne.vf v8,v16,ft0,v0.t +vmfgt.vv v8,v16,v24 +vmfgt.vv v8,v16,v24,v0.t +vmfgt.vf v8,v16,ft0 +vmfgt.vf v8,v16,ft0,v0.t +vmfge.vv v8,v16,v24 +vmfge.vv v8,v16,v24,v0.t +vmfge.vf v8,v16,ft0 +vmfge.vf v8,v16,ft0,v0.t +vfdiv.vv v8,v16,v24 +vfdiv.vv v8,v16,v24,v0.t +vfdiv.vf v8,v16,ft0 +vfdiv.vf v8,v16,ft0,v0.t +vfrdiv.vf v8,v16,ft0 +vfrdiv.vf v8,v16,ft0,v0.t +vfmul.vv v8,v16,v24 +vfmul.vv v8,v16,v24,v0.t +vfmul.vf v8,v16,ft0 +vfmul.vf v8,v16,ft0,v0.t +vfrsub.vf v8,v16,ft0 +vfrsub.vf v8,v16,ft0,v0.t +vfmadd.vv v8,v16,v24 +vfmadd.vv v8,v16,v24,v0.t +vfmadd.vf v8,ft0,v16 +vfmadd.vf v8,ft0,v16,v0.t +vfmsub.vv v8,v16,v24 +vfmsub.vv v8,v16,v24,v0.t +vfmsub.vf v8,ft0,v16 +vfmsub.vf v8,ft0,v16,v0.t +vfmacc.vv v8,v16,v24 +vfmacc.vv v8,v16,v24,v0.t +vfmacc.vf v8,ft0,v16 +vfmacc.vf v8,ft0,v16,v0.t +vfmsac.vv v8,v16,v24 +vfmsac.vv v8,v16,v24,v0.t +vfmsac.vf v8,ft0,v16 +vfmsac.vf v8,ft0,v16,v0.t +vfnmsac.vv v8,v16,v24 +vfnmsac.vv v8,v16,v24,v0.t +vfnmsac.vf v8,ft0,v16 +vfnmsac.vf v8,ft0,v16,v0.t +vfnmacc.vv v8,v16,v24 +vfnmacc.vv v8,v16,v24,v0.t +vfnmacc.vf v8,ft0,v16 +vfnmacc.vf v8,ft0,v16,v0.t +vfnmsub.vv v8,v16,v24 +vfnmsub.vv v8,v16,v24,v0.t +vfnmsub.vf v8,ft0,v16 +vfnmsub.vf v8,ft0,v16,v0.t +vfnmadd.vv v8,v16,v24 +vfnmadd.vv v8,v16,v24,v0.t +vfnmadd.vf v8,ft0,v16 +vfnmadd.vf v8,ft0,v16,v0.t +vwredsumu.vs v8,v16,v24 +vwredsumu.vs v8,v16,v24,v0.t +vwredsum.vs v8,v16,v24 +vwredsum.vs v8,v16,v24,v0.t +vfwadd.vv v8,v16,v24 +vfwadd.vv v8,v16,v24,v0.t +vfwadd.vf v8,v16,ft0 +vfwadd.vf v8,v16,ft0,v0.t +vfwsub.vv v8,v16,v24 +vfwsub.vv v8,v16,v24,v0.t +vfwsub.vf v8,v16,ft0 +vfwsub.vf v8,v16,ft0,v0.t +vfwadd.wv v8,v16,v24 +vfwadd.wv v8,v16,v24,v0.t +vfwadd.wf v8,v16,ft0 +vfwadd.wf v8,v16,ft0,v0.t +vfwsub.wv v8,v16,v24 
+vfwsub.wv v8,v16,v24,v0.t +vfwsub.wf v8,v16,ft0 +vfwsub.wf v8,v16,ft0,v0.t +vfwmul.vv v8,v16,v24 +vfwmul.vv v8,v16,v24,v0.t +vfwmul.vf v8,v16,ft0 +vfwmul.vf v8,v16,ft0,v0.t +vfwmacc.vv v8,v16,v24 +vfwmacc.vv v8,v16,v24,v0.t +vfwmacc.vf v8,ft0,v16 +vfwmacc.vf v8,ft0,v16,v0.t +vfwnmacc.vv v8,v16,v24 +vfwnmacc.vv v8,v16,v24,v0.t +vfwnmacc.vf v8,ft0,v16 +vfwnmacc.vf v8,ft0,v16,v0.t +vfwmsac.vv v8,v16,v24 +vfwmsac.vv v8,v16,v24,v0.t +vfwmsac.vf v8,ft0,v16 +vfwmsac.vf v8,ft0,v16,v0.t +vfwnmsac.vv v8,v16,v24 +vfwnmsac.vv v8,v16,v24,v0.t +vfwnmsac.vf v8,ft0,v16 +vfwnmsac.vf v8,ft0,v16,v0.t +vfwredosum.vs v8,v16,v24 +vfwredosum.vs v8,v16,v24,v0.t +vfwredusum.vs v8,v16,v24 +vfwredusum.vs v8,v16,v24,v0.t +vmv.s.x v8,t0 +vmv.x.s t0,v8 +vcpop.m t0,v8 +vcpop.m t0,v8,v0.t +vfirst.m t0,v8 +vfirst.m t0,v8,v0.t +vzext.vf2 v8,v16 +vzext.vf2 v8,v16,v0.t +vsext.vf2 v8,v16 +vsext.vf2 v8,v16,v0.t +vzext.vf4 v8,v16 +vzext.vf4 v8,v16,v0.t +vsext.vf4 v8,v16 +vsext.vf4 v8,v16,v0.t +vzext.vf8 v8,v16 +vzext.vf8 v8,v16,v0.t +vsext.vf8 v8,v16 +vsext.vf8 v8,v16,v0.t +vfmv.f.s ft0,v8 +vfmv.s.f v8,ft0 +vfcvt.xu.f.v v8,v16 +vfcvt.xu.f.v v8,v16,v0.t +vfcvt.x.f.v v8,v16 +vfcvt.x.f.v v8,v16,v0.t +vfcvt.f.xu.v v8,v16 +vfcvt.f.xu.v v8,v16,v0.t +vfcvt.f.x.v v8,v16 +vfcvt.f.x.v v8,v16,v0.t +vfcvt.rtz.x.f.v v8,v16 +vfcvt.rtz.x.f.v v8,v16,v0.t +vfcvt.rtz.xu.f.v v8,v16 +vfcvt.rtz.xu.f.v v8,v16,v0.t +vfwcvt.xu.f.v v8,v16 +vfwcvt.xu.f.v v8,v16,v0.t +vfwcvt.x.f.v v8,v16 +vfwcvt.x.f.v v8,v16,v0.t +vfwcvt.f.xu.v v8,v16 +vfwcvt.f.xu.v v8,v16,v0.t +vfwcvt.f.x.v v8,v16 +vfwcvt.f.x.v v8,v16,v0.t +vfwcvt.f.f.v v8,v16 +vfwcvt.f.f.v v8,v16,v0.t +vfwcvt.rtz.xu.f.v v8,v16 +vfwcvt.rtz.xu.f.v v8,v16,v0.t +vfwcvt.rtz.x.f.v v8,v16 +vfwcvt.rtz.x.f.v v8,v16,v0.t +vfncvt.xu.f.w v8,v16 +vfncvt.xu.f.w v8,v16,v0.t +vfncvt.x.f.w v8,v16 +vfncvt.x.f.w v8,v16,v0.t +vfncvt.f.xu.w v8,v16 +vfncvt.f.xu.w v8,v16,v0.t +vfncvt.f.x.w v8,v16 +vfncvt.f.x.w v8,v16,v0.t +vfncvt.f.f.w v8,v16 +vfncvt.f.f.w v8,v16,v0.t +vfncvt.rtz.x.f.w v8,v16 
+vfncvt.rtz.x.f.w v8,v16,v0.t +vfncvt.rtz.xu.f.w v8,v16 +vfncvt.rtz.xu.f.w v8,v16,v0.t +vfncvt.rod.f.f.w v8,v16 +vfncvt.rod.f.f.w v8,v16,v0.t +vfsqrt.v v8,v16 +vfsqrt.v v8,v16,v0.t +vfrsqrt7.v v8,v16 +vfrsqrt7.v v8,v16,v0.t +vfrec7.v v8,v16 +vfrec7.v v8,v16,v0.t +vfclass.v v8,v16 +vfclass.v v8,v16,v0.t +vmsbf.m v8,v16 +vmsbf.m v8,v16,v0.t +vmsof.m v8,v16 +vmsof.m v8,v16,v0.t +vmsif.m v8,v16 +vmsif.m v8,v16,v0.t +viota.m v8,v16 +viota.m v8,v16,v0.t +vid.v v8 +vid.v v8,v0.t diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 8d51aa576..d1dd3788f 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -324,11 +324,11 @@ package ara_pkg; // Scalar response elen_t resp; - // Instruction triggered an error - logic error; + // Instruction triggered an exception + ariane_pkg::exception_t exception; // New value for vstart - vlen_t error_vl; + vlen_t exception_vstart; } ara_resp_t; //////////////////// diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 9b1b74463..8e12543c0 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -123,8 +123,8 @@ module ara import ara_pkg::*; #( pe_resp_t [NrPEs-1:0] pe_resp; // Interface with the address generator logic addrgen_ack; - logic addrgen_error; - vlen_t addrgen_error_vl; + ariane_pkg::exception_t addrgen_exception; + vlen_t addrgen_exception_vstart; logic [NrLanes-1:0] alu_vinsn_done; logic [NrLanes-1:0] mfpu_vinsn_done; // Interface with the operand requesters @@ -171,8 +171,8 @@ module ara import ara_pkg::*; #( .pe_scalar_resp_ready_o(pe_scalar_resp_ready ), // Interface with the address generator .addrgen_ack_i (addrgen_ack ), - .addrgen_error_i (addrgen_error ), - .addrgen_error_vl_i (addrgen_error_vl ) + .addrgen_exception_i (addrgen_exception ), + .addrgen_exception_vstart_i(addrgen_exception_vstart ) ); // Scalar move support @@ -337,8 +337,8 @@ module ara import ara_pkg::*; #( .pe_req_ready_o (pe_req_ready[NrLanes+OffsetStore : NrLanes+OffsetLoad]), .pe_resp_o 
(pe_resp[NrLanes+OffsetStore : NrLanes+OffsetLoad] ), .addrgen_ack_o (addrgen_ack ), - .addrgen_error_o (addrgen_error ), - .addrgen_error_vl_o (addrgen_error_vl ), + .addrgen_exception_o (addrgen_exception ), + .addrgen_exception_vstart_o (addrgen_exception_vstart ), // Interface with the Mask unit .mask_i (mask ), .mask_valid_i (mask_valid ), @@ -458,6 +458,9 @@ module ara import ara_pkg::*; #( if (ara_pkg::VLEN == 0) $error("[ara] The vector length must be greater than zero."); + if (ara_pkg::VLENB < 8 * NrLanes) + $error("[ara] Every vector register with LMUL1 should have at least 8 Byte/lane."); + if (ara_pkg::VLEN < ELEN) $error( "[ara] The vector length must be greater or equal than the maximum size of a single vector element" diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 998e84230..b06afd097 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -53,17 +53,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // CSRs // //////////// - vlen_t vstart_d, vstart_q; - vlen_t vl_d, vl_q; - vtype_t vtype_d, vtype_q; - vxsat_e vxsat_d, vxsat_q; - vxrm_t vxrm_d, vxrm_q; - - `FF(vstart_q, vstart_d, '0) - `FF(vl_q, vl_d, '0) - `FF(vtype_q, vtype_d, '{vill: 1'b1, default: '0}) - `FF(vxsat_q, vxsat_d, '0) - `FF(vxrm_q, vxrm_d, '0) + vlen_t csr_vstart_d, csr_vstart_q; + vlen_t csr_vl_d, csr_vl_q; + vtype_t csr_vtype_d, csr_vtype_q; + vxsat_e csr_vxsat_d, csr_vxsat_q; + vxrm_t csr_vxrm_d, csr_vxrm_q; + + `FF(csr_vstart_q, csr_vstart_d, '0) + `FF(csr_vl_q, csr_vl_d, '0) + `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, default: '0}) + `FF(csr_vxsat_q, csr_vxsat_d, '0) + `FF(csr_vxrm_q, csr_vxrm_d, '0) // Converts between the internal representation of `vtype_t` and the full XLEN-bit CSR. 
function automatic riscv::xlen_t xlen_vtype(vtype_t vtype); xlen_vtype = {vtype.vill, {riscv::XLEN-9{1'b0}}, vtype.vma, vtype.vta, vtype.vsew, @@ -133,14 +133,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( typedef enum logic [1:0] { NORMAL_OPERATION, WAIT_IDLE, - RESHUFFLE, - SLDU_SEQUENCER + RESHUFFLE } state_e; state_e state_d, state_q; // We need to memorize the element width used to store each vector on the lanes, so that we are // able to deshuffle it when needed. rvv_pkg::vew_e [31:0] eew_d, eew_q; + // eew buffers for reshuffling + rvv_pkg::vew_e reshuffle_eew_vs1_d, reshuffle_eew_vs1_q; + rvv_pkg::vew_e reshuffle_eew_vs2_d, reshuffle_eew_vs2_q; + rvv_pkg::vew_e reshuffle_eew_vd_d, reshuffle_eew_vd_q; // If the reg was not written, the content is unknown. No need to reshuffle // when writing with != EEW logic [31:0] eew_valid_d, eew_valid_q; @@ -167,6 +170,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( rs_lmul_cnt_q <= '0; rs_lmul_cnt_limit_q <= '0; rs_mask_request_q <= 1'b0; + reshuffle_eew_vs1_q <= rvv_pkg::EW8; + reshuffle_eew_vs2_q <= rvv_pkg::EW8; + reshuffle_eew_vd_q <= rvv_pkg::EW8; end else begin state_q <= state_d; eew_q <= eew_d; @@ -178,6 +184,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( rs_lmul_cnt_q <= rs_lmul_cnt_d; rs_lmul_cnt_limit_q <= rs_lmul_cnt_limit_d; rs_mask_request_q <= rs_mask_request_d; + reshuffle_eew_vs1_q <= reshuffle_eew_vs1_d; + reshuffle_eew_vs2_q <= reshuffle_eew_vs2_d; + reshuffle_eew_vd_q <= reshuffle_eew_vd_d; end end @@ -193,26 +202,26 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // its counters of pending memory operations // Ara should tell Ariane when a memory operation is completed, so that it can modify // its pending load/store counters. - // A memory operation can be completed both when it is over and when vl_q == 0. In the latter case, + // A memory operation can be completed both when it is over and when csr_vl_q == 0. 
In the latter case, // Ara's decoder answers immediately, and this can cause a collision with an answer from Ara's VLSU. - // To avoid collisions, we give precedence to the VLSU, and we delay the vl_q == 0 memory op + // To avoid collisions, we give precedence to the VLSU, and we delay the csr_vl_q == 0 memory op // completion signal if a collision occurs logic load_zero_vl, store_zero_vl; // Do not checks vregs validity against current LMUL logic skip_lmul_checks; - logic skip_vs1_lmul_checks; // Are we decoding? logic is_decoding; // Is this an in-lane operation? logic in_lane_op; - // If the vslideup offset is greater than vl_q, the vslideup has no effects + // If the vslideup offset is greater than csr_vl_q, the vslideup has no effects logic null_vslideup; // Pipeline the VLSU's load and store complete signals, for timing reasons logic load_complete_q; logic store_complete_q; - `FF(load_complete_q, load_complete_i, 1'b0) - `FF(store_complete_q, store_complete_i, 1'b0) + logic illegal_insn_load, illegal_insn_store; + `FF(load_complete_q, load_complete_i || illegal_insn_load, 1'b0) + `FF(store_complete_q, store_complete_i || illegal_insn_store, 1'b0) // NP2 Slide support logic is_stride_np2; @@ -237,27 +246,32 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( always_comb begin: p_decoder // Default values - vstart_d = vstart_q; - vl_d = vl_q; - vtype_d = vtype_q; + csr_vstart_d = csr_vstart_q; + csr_vl_d = csr_vl_q; + csr_vtype_d = csr_vtype_q; state_d = state_q; eew_d = eew_q; eew_valid_d = eew_valid_q; - lmul_vs2 = vtype_q.vlmul; - lmul_vs1 = vtype_q.vlmul; + lmul_vs2 = csr_vtype_q.vlmul; + lmul_vs1 = csr_vtype_q.vlmul; - reshuffle_req_d = reshuffle_req_q; - eew_old_buffer_d = eew_old_buffer_q; - eew_new_buffer_d = eew_new_buffer_q; - vs_buffer_d = vs_buffer_q; + reshuffle_req_d = reshuffle_req_q; + eew_old_buffer_d = eew_old_buffer_q; + eew_new_buffer_d = eew_new_buffer_q; + vs_buffer_d = vs_buffer_q; + reshuffle_eew_vs1_d = 
reshuffle_eew_vs1_q; + reshuffle_eew_vs2_d = reshuffle_eew_vs2_q; + reshuffle_eew_vd_d = reshuffle_eew_vd_q; rs_lmul_cnt_d = '0; rs_lmul_cnt_limit_d = '0; rs_mask_request_d = 1'b0; illegal_insn = 1'b0; - vxsat_d = vxsat_q; - vxrm_d = vxrm_q; + illegal_insn_load = 1'b0; + illegal_insn_store = 1'b0; + csr_vxsat_d = csr_vxsat_q; + csr_vxrm_d = csr_vxrm_q; is_vload = 1'b0; is_vstore = 1'b0; @@ -265,15 +279,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( store_zero_vl = 1'b0; skip_lmul_checks = 1'b0; - skip_vs1_lmul_checks = 1'b0; null_vslideup = 1'b0; - is_decoding = 1'b0; - in_lane_op = 1'b0; + is_decoding = 1'b0; + in_lane_op = 1'b0; - acc_resp_o.req_ready = 1'b0; - acc_resp_o.resp_valid = 1'b0; acc_resp_o = '{ trans_id : acc_req_i.trans_id, load_complete : load_zero_vl | load_complete_q, @@ -282,18 +293,20 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( fflags_valid : |fflags_ex_valid_i, default : '0 }; + acc_resp_o.req_ready = 1'b0; + acc_resp_o.resp_valid = 1'b0; // fflags for (int lane = 0; lane < NrLanes; lane++) acc_resp_o.fflags |= fflags_ex_i[lane]; ara_req_d = '{ - vl : vl_q, - vstart : vstart_q, - vtype : vtype_q, - emul : vtype_q.vlmul, - eew_vs1 : vtype_q.vsew, - eew_vs2 : vtype_q.vsew, - eew_vd_op : vtype_q.vsew, + vl : csr_vl_q, + vstart : csr_vstart_q, + vtype : csr_vtype_q, + emul : csr_vtype_q.vlmul, + eew_vs1 : csr_vtype_q.vsew, + eew_vs2 : csr_vtype_q.vsew, + eew_vd_op : csr_vtype_q.vsew, eew_vmask : eew_q[VMASK], cvt_resize : CVT_SAME, default : '0 @@ -304,9 +317,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b0; // Saturation in any lane will raise vxsat flag - vxsat_d |= |vxsat_flag_i; + csr_vxsat_d |= |vxsat_flag_i; // Fixed-point rounding mode is applied to all lanes - for (int lane = 0; lane < NrLanes; lane++) alu_vxrm_o[lane] = vxrm_q; + for (int lane = 0; lane < NrLanes; lane++) alu_vxrm_o[lane] = csr_vxrm_q; // Rounding mode is shared between all lanes for 
(int lane = 0; lane < NrLanes; lane++) acc_resp_o.fflags |= fflags_ex_i[lane]; // Special states @@ -330,6 +343,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( rs_lmul_cnt_limit_d = rs_lmul_cnt_limit_q; rs_mask_request_d = 1'b0; + // Every single reshuffle request refers to LMUL == 1 + ara_req_d.emul = LMUL_1; + + // vstart is always 0 for a reshuffle + ara_req_d.vstart = '0; + // These generate a reshuffle request to Ara's backend // When LMUL > 1, not all the regs that compose a large // register should always be reshuffled @@ -369,26 +388,35 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Prepare the information to reshuffle the vector registers during the next cycles // Reshuffle in the following order: vd, v2, v1. The order is arbitrary. + // If we are here, vd has been already reshuffled. unique casez (reshuffle_req_d) - 3'b??1: begin - eew_old_buffer_d = eew_q[insn.vmem_type.rd]; - eew_new_buffer_d = ara_req_d.vtype.vsew; - vs_buffer_d = insn.varith_type.rd; - end 3'b?10: begin eew_old_buffer_d = eew_q[insn.vmem_type.rs2]; - eew_new_buffer_d = ara_req_d.eew_vs2; + eew_new_buffer_d = reshuffle_eew_vs2_q; vs_buffer_d = insn.varith_type.rs2; end 3'b100: begin eew_old_buffer_d = eew_q[insn.vmem_type.rs1]; - eew_new_buffer_d = ara_req_d.eew_vs1; + eew_new_buffer_d = reshuffle_eew_vs1_q; vs_buffer_d = insn.varith_type.rs1; end default:; endcase - if (reshuffle_req_d == 3'b0) state_d = NORMAL_OPERATION; + if (reshuffle_req_d == 3'b0) begin + // If LMUL_X has X > 1, Ara can inject different reshuffle ops during RESHUFFLE, + // one per LMUL_1-register that needs to be reshuffled. In mixed cases, we have + // multiple instructions that reshuffle parts of the original LMUL_X-register + // (e.g., LMUL_8, vd = v0, eew = 64, and only v1 and v5 have eew = 64). In this + // case, the dependency of the next LMUL_8 instruction on v0 should be on all + // the reshuffle micro operations. 
This is not possible with the current architecture. + // Therefore, we either set the dependency on the very last instruction only, or + // we just wait until the reshuffle is over. + // The best optimization would be injecting contiguous reshuffles with X > 1 and + // an extended vl. If we injected only one reshuffle, we can skip the wait idle. + if (csr_vtype_q.vlmul != LMUL_1) state_d = WAIT_IDLE; + else state_d = NORMAL_OPERATION; + end // The register is not completely reshuffled (LMUL > 1) end else begin // Count up @@ -401,17 +429,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 3'b??1: begin vs_buffer_d = vs_buffer_q + 1; eew_old_buffer_d = eew_q[vs_buffer_d]; - eew_new_buffer_d = ara_req_d.vtype.vsew; + eew_new_buffer_d = reshuffle_eew_vd_q; end 3'b?10: begin vs_buffer_d = vs_buffer_q + 1; eew_old_buffer_d = eew_q[vs_buffer_d]; - eew_new_buffer_d = ara_req_d.eew_vs2; + eew_new_buffer_d = reshuffle_eew_vs2_q; end 3'b100: begin vs_buffer_d = vs_buffer_q + 1; eew_old_buffer_d = eew_q[vs_buffer_d]; - eew_new_buffer_d = ara_req_d.eew_vs1; + eew_new_buffer_d = reshuffle_eew_vs1_q; end default:; endcase @@ -428,7 +456,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Decoding is_decoding = 1'b1; // Acknowledge the request - acc_resp_o.req_ready = ara_req_ready_i; + acc_resp_o.req_ready = 1'b1; // Decode the instructions based on their opcode unique case (acc_req_i.insn.itype.opcode) @@ -440,7 +468,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Instruction is of one of the RVV types automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); - // These always respond at the same cycle + // These (mostly) always respond at the same cycle acc_resp_o.resp_valid = 1'b1; // Decode based on their func3 field @@ -448,33 +476,33 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Configuration instructions OPCFG: begin: opcfg // These can be acknowledged regardless of the 
state of Ara - acc_resp_o.req_ready = 1'b1; + // NOTE: unless there is a pending fault-only first vector load is_config = 1'b1; // Update vtype if (insn.vsetvli_type.func1 == 1'b0) begin // vsetvli - vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetvli_type.zimm11)); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetvli_type.zimm11)); end else if (insn.vsetivli_type.func2 == 2'b11) begin // vsetivli - vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetivli_type.zimm10)); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(insn.vsetivli_type.zimm10)); end else if (insn.vsetvl_type.func7 == 7'b100_0000) begin // vsetvl - vtype_d = vtype_xlen(riscv::xlen_t'(acc_req_i.rs2[7:0])); + csr_vtype_d = vtype_xlen(riscv::xlen_t'(acc_req_i.rs2[7:0])); end else - acc_resp_o.error = 1'b1; + illegal_insn = 1'b1; // Check whether the updated vtype makes sense - if ((vtype_d.vsew > rvv_pkg::vew_e'($clog2(ELENB))) || // SEW <= ELEN - (vtype_d.vlmul == LMUL_RSVD) || // reserved value + if ((csr_vtype_d.vsew > rvv_pkg::vew_e'($clog2(ELENB))) || // SEW <= ELEN + (csr_vtype_d.vlmul == LMUL_RSVD) || // reserved value // LMUL >= SEW/ELEN - (signed'($clog2(ELENB)) + signed'(vtype_d.vlmul) < signed'(vtype_d.vsew))) begin - vtype_d = '{vill: 1'b1, default: '0}; - vl_d = '0; + (signed'($clog2(ELENB)) + signed'(csr_vtype_d.vlmul) < signed'(csr_vtype_d.vsew))) begin + csr_vtype_d = '{vill: 1'b1, default: '0}; + csr_vl_d = '0; end // Update the vector length else begin // Maximum vector length. VLMAX = LMUL * VLEN / SEW. 
- automatic int unsigned vlmax = VLENB >> vtype_d.vsew; - unique case (vtype_d.vlmul) + automatic int unsigned vlmax = VLENB >> csr_vtype_d.vsew; + unique case (csr_vtype_d.vlmul) LMUL_1 : vlmax <<= 0; LMUL_2 : vlmax <<= 1; LMUL_4 : vlmax <<= 2; @@ -487,24 +515,24 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase if (insn.vsetivli_type.func2 == 2'b11) begin // vsetivli - vl_d = vlen_t'(insn.vsetivli_type.uimm5); + csr_vl_d = vlen_t'(insn.vsetivli_type.uimm5); end else begin // vsetvl || vsetvli if (insn.vsetvl_type.rs1 == '0 && insn.vsetvl_type.rd == '0) begin // Do not update the vector length - vl_d = vl_q; + csr_vl_d = csr_vl_q; end else if (insn.vsetvl_type.rs1 == '0 && insn.vsetvl_type.rd != '0) begin // Set the vector length to vlmax - vl_d = vlmax; + csr_vl_d = vlmax; end else begin // Normal stripmining - vl_d = ((|acc_req_i.rs1[$bits(acc_req_i.rs1)-1:$bits(vl_d)]) || + csr_vl_d = ((|acc_req_i.rs1[$bits(acc_req_i.rs1)-1:$bits(csr_vl_d)]) || (vlen_t'(acc_req_i.rs1) > vlmax)) ? vlmax : vlen_t'(acc_req_i.rs1); end end end // Return the new vl - acc_resp_o.result = vl_d; + acc_resp_o.result = csr_vl_d; // If the vtype has changed, wait for the backend before issuing any new instructions. // This is to avoid hazards on implicit register labels when LMUL_old > LMUL_new @@ -512,7 +540,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Checking only lmul_q is a trick: we want to stall only if both lmuls have // zero MSB. 
If lmul_q has zero MSB, it's greater than lmul_d only if also // lmul_d has zero MSB since the slice comparison is intrinsically unsigned - if (!vtype_q.vlmul[2] && (vtype_d.vlmul[2:0] < vtype_q.vlmul[2:0])) + if (!csr_vtype_q.vlmul[2] && (csr_vtype_d.vlmul[2:0] < csr_vtype_q.vlmul[2:0])) state_d = WAIT_IDLE; end @@ -549,26 +577,10 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010001: begin ara_req_d.op = ara_pkg::VMADC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 - unique case (ara_req_d.emul) - LMUL_2: - if (((insn.varith_type.rs1 & 5'b00001) == (insn.varith_type.rd & 5'b00001)) || - ((insn.varith_type.rs2 & 5'b00001) == (insn.varith_type.rd & 5'b00001))) - illegal_insn = 1'b1; - LMUL_4: - if (((insn.varith_type.rs1 & 5'b00011) == (insn.varith_type.rd & 5'b00011)) || - ((insn.varith_type.rs2 & 5'b00011) == (insn.varith_type.rd & 5'b00011))) - illegal_insn = 1'b1; - LMUL_8: - if (((insn.varith_type.rs1 & 5'b00111) == (insn.varith_type.rd & 5'b00111)) || - ((insn.varith_type.rs2 & 5'b00111) == (insn.varith_type.rd & 5'b00111))) - illegal_insn = 1'b1; - default: - if ((insn.varith_type.rs1 == insn.varith_type.rd) || - (insn.varith_type.rs2 == insn.varith_type.rd)) illegal_insn = 1'b1; - endcase + if ((insn.varith_type.rs1 == insn.varith_type.rd) || + (insn.varith_type.rs2 == insn.varith_type.rd)) illegal_insn = 1'b1; end 6'b010010: begin ara_req_d.op = ara_pkg::VSBC; @@ -579,50 +591,28 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010011: begin ara_req_d.op = ara_pkg::VMSBC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 - unique case (ara_req_d.emul) - LMUL_2: - if (((insn.varith_type.rs1 & 5'b00001) == (insn.varith_type.rd & 5'b00001)) || - ((insn.varith_type.rs2 & 5'b00001) == ( insn.varith_type.rd & 5'b00001))) - illegal_insn = 1'b1; - LMUL_4: - if (((insn.varith_type.rs1 & 5'b00011) == (insn.varith_type.rd & 5'b00011)) || - ((insn.varith_type.rs2 & 
5'b00011) == (insn.varith_type.rd & 5'b00011))) - illegal_insn = 1'b1; - LMUL_8: - if (((insn.varith_type.rs1 & 5'b00111) == (insn.varith_type.rd & 5'b00111)) || - ((insn.varith_type.rs2 & 5'b00111) == (insn.varith_type.rd & 5'b00111))) - illegal_insn = 1'b1; - default: - if ((insn.varith_type.rs1 == insn.varith_type.rd) || - (insn.varith_type.rs2 == insn.varith_type.rd)) illegal_insn = 1'b1; - endcase + if ((insn.varith_type.rs1 == insn.varith_type.rd) || + (insn.varith_type.rs2 == insn.varith_type.rd)) illegal_insn = 1'b1; end 6'b011000: begin ara_req_d.op = ara_pkg::VMSEQ; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMSNE; - ara_req_d.use_vd_op = 1'b1; end 6'b011010: begin ara_req_d.op = ara_pkg::VMSLTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011011: begin ara_req_d.op = ara_pkg::VMSLT; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMSLEU; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMSLE; - ara_req_d.use_vd_op = 1'b1; end 6'b010111: begin ara_req_d.op = ara_pkg::VMERGE; @@ -632,7 +622,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.varith_type.vm) begin ara_req_d.eew_vs1 = eew_q[ara_req_d.vs1]; ara_req_d.vtype.vsew = eew_q[ara_req_d.vs1]; - ara_req_d.vl = (vl_q << vtype_q.vsew[1:0]) >> ara_req_d.eew_vs1[1:0]; + ara_req_d.vl = (csr_vl_q << csr_vtype_q.vsew[1:0]) >> ara_req_d.eew_vs1[1:0]; end end 6'b100000: ara_req_d.op = ara_pkg::VSADDU; @@ -648,11 +638,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -666,11 +656,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -683,28 +673,28 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end // Reductions encode in cvt_resize the neutral value bits // CVT_WIDE is 2'b00 (hack to save wires) 6'b110000: begin ara_req_d.op = ara_pkg::VWREDSUMU; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin ara_req_d.op = ara_pkg::VWREDSUM; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.eew_vs1 = 
csr_vtype_q.vsew.next(); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -728,7 +718,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPIVX: begin: opivx @@ -758,7 +748,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = acc_req_i.rs1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslideup/vslide1up on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Vl refers to current system vsew, but operand requesters @@ -766,13 +756,13 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // i.e., request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = acc_req_i.rs1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslidedown/vslide1down on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling @@ -789,21 +779,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010001: begin ara_req_d.op = ara_pkg::VMADC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 - unique case (ara_req_d.emul) - LMUL_2: - if ((insn.varith_type.rs2 & 5'b00001) == (insn.varith_type.rd & 5'b00001)) - illegal_insn = 1'b1; - LMUL_4: - if ((insn.varith_type.rs2 & 
5'b00011) == (insn.varith_type.rd & 5'b00011)) - illegal_insn = 1'b1; - LMUL_8: - if ((insn.varith_type.rs2 & 5'b00111) == (insn.varith_type.rd & 5'b00111)) - illegal_insn = 1'b1; - default: if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; - endcase + if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; end 6'b010010: begin ara_req_d.op = ara_pkg::VSBC; @@ -816,53 +794,33 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010011: begin ara_req_d.op = ara_pkg::VMSBC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 - unique case (ara_req_d.emul) - LMUL_2: - if ((insn.varith_type.rs2 & 5'b00001) == (insn.varith_type.rd & 5'b00001)) - illegal_insn = 1'b1; - LMUL_4: - if ((insn.varith_type.rs2 & 5'b00011) == (insn.varith_type.rd & 5'b00011)) - illegal_insn = 1'b1; - LMUL_8: - if ((insn.varith_type.rs2 & 5'b00111) == (insn.varith_type.rd & 5'b00111)) - illegal_insn = 1'b1; - default: if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; - endcase + if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; end 6'b011000: begin ara_req_d.op = ara_pkg::VMSEQ; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMSNE; - ara_req_d.use_vd_op = 1'b1; end 6'b011010: begin ara_req_d.op = ara_pkg::VMSLTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011011: begin ara_req_d.op = ara_pkg::VMSLT; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMSLEU; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMSLE; - ara_req_d.use_vd_op = 1'b1; end 6'b011110: begin ara_req_d.op = ara_pkg::VMSGTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011111: begin ara_req_d.op = ara_pkg::VMSGT; - ara_req_d.use_vd_op = 1'b1; end 6'b010111: begin ara_req_d.op = ara_pkg::VMERGE; @@ -881,11 +839,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = 
OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -899,11 +857,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -916,11 +874,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -938,7 +896,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPIVI: begin: opivi @@ -966,19 +924,19 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = {{ELEN{insn.varith_type.rs1[19]}}, insn.varith_type.rs1}; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode 
vslideup/vslide1up on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = {{ELEN{insn.varith_type.rs1[19]}}, insn.varith_type.rs1}; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Encode vslidedown/vslide1down on the use_scalar_op field ara_req_d.use_scalar_op = 1'b0; // Request will need reshuffling @@ -995,45 +953,27 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010001: begin ara_req_d.op = ara_pkg::VMADC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 - unique case (ara_req_d.emul) - LMUL_2: - if ((insn.varith_type.rs2 & 5'b00001) == (insn.varith_type.rd & 5'b00001)) - illegal_insn = 1'b1; - LMUL_4: - if ((insn.varith_type.rs2 & 5'b00011) == (insn.varith_type.rd & 5'b00011)) - illegal_insn = 1'b1; - LMUL_8: - if ((insn.varith_type.rs2 & 5'b00111) == (insn.varith_type.rd & 5'b00111)) - illegal_insn = 1'b1; - default: if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; - endcase + if (insn.varith_type.rs2 == insn.varith_type.rd) illegal_insn = 1'b1; end 6'b011000: begin ara_req_d.op = ara_pkg::VMSEQ; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMSNE; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMSLEU; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMSLE; - ara_req_d.use_vd_op = 1'b1; end 6'b011110: begin ara_req_d.op = ara_pkg::VMSGTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011111: begin ara_req_d.op = ara_pkg::VMSGT; - 
ara_req_d.use_vd_op = 1'b1; end 6'b010111: begin ara_req_d.op = ara_pkg::VMERGE; @@ -1091,11 +1031,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101100: begin ara_req_d.op = ara_pkg::VNSRL; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. - if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -1109,11 +1049,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b101101: begin ara_req_d.op = ara_pkg::VNSRA; ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); - lmul_vs2 = next_lmul(vtype_q.vlmul); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); // Check whether the EEW is not too wide. 
- if (int'(vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; // Check whether we can access vs2 unique case (ara_req_d.emul.next()) @@ -1126,11 +1066,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b101110: begin ara_req_d.op = ara_pkg::VNCLIPU; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 6'b101111: begin ara_req_d.op = ara_pkg::VNCLIP; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -1148,7 +1088,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPMVV: begin: opmvv @@ -1237,7 +1177,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // Sign extend operands - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW8: begin ara_req_d.conversion_vs2 = OpQueueConversionSExt8; end @@ -1251,12 +1191,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin - acc_resp_o.req_ready = 1'b1; - acc_resp_o.result = ara_resp_i.resp; - acc_resp_o.error = ara_resp_i.error; - acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; + if ( ara_resp_valid_i ) begin + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.result = ara_resp_i.resp; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; end end 6'b010100: begin @@ -1283,7 +1223,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMAND; @@ -1291,7 +1230,6 @@ module 
ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011010: begin ara_req_d.op = ara_pkg::VMOR; @@ -1299,7 +1237,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011011: begin ara_req_d.op = ara_pkg::VMXOR; @@ -1307,7 +1244,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMORNOT; @@ -1315,7 +1251,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMNAND; @@ -1323,7 +1258,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011110: begin ara_req_d.op = ara_pkg::VMNOR; @@ -1331,7 +1265,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011111: begin ara_req_d.op = ara_pkg::VMXNOR; @@ -1339,12 +1272,10 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b010010: begin // VXUNARY0 // These instructions do not use vs1 ara_req_d.use_vs1 = 1'b0; - skip_vs1_lmul_checks = 1'b1; // They are always encoded as ADDs with zero. 
ara_req_d.op = ara_pkg::VADD; ara_req_d.use_scalar_op = 1'b1; @@ -1357,8 +1288,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW64) || - int'(vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW64) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00011: begin // VSEXT.VF8 @@ -1367,44 +1298,44 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW64) || - int'(vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW64) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_2, LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00100: begin // VZEXT.VF4 ara_req_d.conversion_vs2 = OpQueueConversionZExt4; - ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); + ara_req_d.eew_vs2 = prev_prev_ew(csr_vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW32) || - int'(vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00101: begin // VSEXT.VF4 ara_req_d.conversion_vs2 = OpQueueConversionSExt4; - ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); + ara_req_d.eew_vs2 = prev_prev_ew(csr_vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW32) || - int'(vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; + if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vlmul) inside {LMUL_1_4, LMUL_1_8}) illegal_insn = 1'b1; end 5'b00110: begin // VZEXT.VF2 ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.prev(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.prev(); ara_req_d.cvt_resize = 
CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW16) || int'(csr_vtype_q.vlmul) inside {LMUL_1_8}) illegal_insn = 1'b1; end 5'b00111: begin // VSEXT.VF2 ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.prev(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; // Invalid conversion - if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) + if (int'(csr_vtype_q.vsew) < int'(EW16) || int'(csr_vtype_q.vlmul) inside {LMUL_1_8}) illegal_insn = 1'b1; end default: illegal_insn = 1'b1; @@ -1444,92 +1375,92 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Widening instructions 6'b110000: begin // VWADDU ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; - 
ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - 
ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -1537,31 +1468,31 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - 
ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end default: illegal_insn = 1'b1; @@ -1572,21 +1503,21 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. 
if (!skip_lmul_checks) begin unique case (ara_req_d.emul) - LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vd; default:; endcase unique case (lmul_vs2) - LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs2; default:; endcase unique case (lmul_vs1) - LMUL_2: if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs1; + LMUL_4: if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs1; + LMUL_8: if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs1; default:; endcase end @@ -1595,7 +1526,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if 
(csr_vtype_q.vill) illegal_insn = 1'b1; end OPMVX: begin: opmvx @@ -1620,17 +1551,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin // vslide1up ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin // vslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; end @@ -1638,7 +1569,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // vmv.s.x ara_req_d.op = ara_pkg::VMVSX; ara_req_d.use_vs2 = 1'b0; - ara_req_d.vl = |vl_q ? 1 : '0; + ara_req_d.vl = |csr_vl_q ? 
1 : '0; // This instruction ignores LMUL checks skip_lmul_checks = 1'b1; end @@ -1676,92 +1607,92 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Widening instructions 6'b110000: begin // VWADDU ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = 
next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + 
ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; @@ -1769,41 +1700,41 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = 
csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111110: begin // VWMACCUS ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; end default: illegal_insn = 1'b1; @@ -1814,15 +1745,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. 
if (!skip_lmul_checks) begin unique case (ara_req_d.emul) - LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vd; default:; endcase unique case (lmul_vs2) - LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs2; default:; endcase end @@ -1831,7 +1762,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (int'(ara_req_d.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end OPFVV: begin: opfvv @@ -1901,7 +1832,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ignore_zero_vl_check = 1'b1; // Zero-extend operands - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW16: begin ara_req_d.conversion_vs2 = OpQueueConversionZExt4; end @@ -1912,7 +1843,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // NaN-box the result if needed - unique case (vtype_q.vsew) + unique case (csr_vtype_q.vsew) EW16: begin vfmvfs_result[63:16] = '1; vfmvfs_result[15:0] = 
ara_resp_i.resp[15:0]; @@ -1926,11 +1857,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Wait until the back-end answers to acknowledge those instructions if (ara_resp_valid_i) begin - acc_resp_o.req_ready = 1'b1; - acc_resp_o.result = vfmvfs_result; - acc_resp_o.error = ara_resp_i.error; - acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.result = vfmvfs_result; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; end end 6'b011000: ara_req_d.op = ara_pkg::VMFEQ; @@ -1940,7 +1871,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b010010: begin // VFUNARY0 // These instructions do not use vs1 ara_req_d.use_vs1 = 1'b0; - skip_vs1_lmul_checks = 1'b1; case (insn.varith_type.rs1) 5'b00000: ara_req_d.op = VFCVTXUF; @@ -1952,103 +1882,101 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000: begin // Widening VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01001: begin // Widening VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01010: begin // Widening VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01011: begin // Widening VFCVTFX 
ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01100: begin // Widening VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01110: begin // Widening VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b01111: begin // Widening VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_WIDE; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; end 5'b10000: begin // Narrowing VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10001: begin // Narrowing VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10010: begin // Narrowing VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10011: begin // Narrowing VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_NARROW; - 
ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10100: begin // Narrowing VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10101: begin // Narrowing VFNCVTRODFF ara_req_d.op = VFNCVTRODFF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10110: begin // Narrowing VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end 5'b10111: begin // Narrowing VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_NARROW; - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); end default: begin // Trigger an error - acc_resp_o.error = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; end endcase end 6'b010011: begin // VFUNARY1 // These instructions do not use vs1 ara_req_d.use_vs1 = 1'b0; - skip_vs1_lmul_checks = 1'b1; unique case (insn.varith_type.rs1) 5'b00000: ara_req_d.op = ara_pkg::VFSQRT; @@ -2104,99 +2032,99 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b110000: begin // VFWADD ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b110001: begin // VFWREDUSUM ara_req_d.op = ara_pkg::VFWREDUSUM; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = 
OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vs1 = vtype_q.vsew.next(); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = resize_e'(2'b00); end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b110011: begin // VFWREDOSUM ara_req_d.op = ara_pkg::VFWREDOSUM; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vs1 = vtype_q.vsew.next(); + ara_req_d.eew_vs1 = csr_vtype_q.vsew.next(); ara_req_d.cvt_resize = resize_e'(2'b00); end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + 
ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111111: begin // VFWNMSAC ara_req_d.op 
= ara_pkg::VFNMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase @@ -2206,28 +2134,26 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. if (!skip_lmul_checks) begin unique case (ara_req_d.emul) - LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vd; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase unique case (lmul_vs2) - LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_RSVD: illegal_insn = 1'b1; + default:; + endcase + unique case (lmul_vs1) + LMUL_2 : if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs1; + LMUL_4 : if 
((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs1; + LMUL_8 : if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs1; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase - if (!skip_vs1_lmul_checks) begin - unique case (lmul_vs1) - LMUL_2 : if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; - LMUL_RSVD: illegal_insn = 1'b1; - default:; - endcase - end end // Ara can support 16-bit float, 32-bit float, 64-bit float. @@ -2252,7 +2178,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) illegal_insn = 1'b1; end else illegal_insn = 1'b1; // Vector FP instructions are disabled end @@ -2291,17 +2217,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001110: begin // vfslide1up ara_req_d.op = ara_pkg::VSLIDEUP; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; // If stride > vl, the vslideup has no effects - if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || - (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; + if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(csr_vl_q)] || + (vlen_t'(ara_req_d.stride) >= csr_vl_q)) null_vslideup = 1'b1; end 6'b001111: begin // vfslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; + ara_req_d.eew_vs2 = csr_vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; end @@ -2309,7 +2235,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // vmv.s.f ara_req_d.op = ara_pkg::VFMVSF; ara_req_d.use_vs2 = 1'b0; - ara_req_d.vl = |vl_q ? 
1 : '0; + ara_req_d.vl = |csr_vl_q ? 1 : '0; // This instruction ignores LMUL checks skip_lmul_checks = 1'b1; end @@ -2370,85 +2296,85 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b110000: begin // VFWADD ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.wide_fp_imm = 1'b1; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; ara_req_d.swap_vs2_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - lmul_vs2 = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); - ara_req_d.eew_vs2 = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + lmul_vs2 = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); + ara_req_d.eew_vs2 = csr_vtype_q.vsew.next(); ara_req_d.wide_fp_imm = 1'b1; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; - ara_req_d.emul = 
next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; - ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; ara_req_d.use_vd_op = 1'b1; - ara_req_d.emul = next_lmul(vtype_q.vlmul); - ara_req_d.vtype.vsew = vtype_q.vsew.next(); + ara_req_d.emul = next_lmul(csr_vtype_q.vlmul); + ara_req_d.vtype.vsew = csr_vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.wide_fp_imm = 1'b1; 
- ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.eew_vd_op = csr_vtype_q.vsew.next(); end default: illegal_insn = 1'b1; endcase // Check if the FP scalar operand is NaN-boxed. If not, replace it with a NaN. - case (vtype_q.vsew) + case (csr_vtype_q.vsew) EW16: if (~(&acc_req_i.rs1[63:16])) ara_req_d.scalar_op = 64'h0000000000007e00; EW32: if (~(&acc_req_i.rs1[63:32])) ara_req_d.scalar_op = 64'h000000007fc00000; endcase @@ -2458,16 +2384,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. if (!skip_lmul_checks) begin unique case (ara_req_d.emul) - LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vd; + LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vd; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase unique case (lmul_vs2) - LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req_d.use_vs2; + LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req_d.use_vs2; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase @@ -2491,7 +2417,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase // Instruction is invalid if the vtype is invalid - if (vtype_q.vill) illegal_insn = 1'b1; + if (csr_vtype_q.vill) 
illegal_insn = 1'b1; end else illegal_insn = 1'b1; // Vector FP instructions are disabled end endcase @@ -2525,7 +2451,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW8; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW8; end end @@ -2533,7 +2459,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW16; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW16; end end @@ -2541,7 +2467,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW32; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW32; end end @@ -2549,15 +2475,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW64; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW64; end end default: begin // Invalid. Element is too wide, or encoding is non-existant. 
acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = 1'b1; acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; + ara_req_valid_d = 1'b0; end endcase @@ -2572,19 +2498,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000:; // Unit-strided, whole registers 5'b01011: begin // Unit-strided, mask load, EEW=1 // We operate ceil(vl/8) bytes - ara_req_d.vl = (vl_q >> 3) + |vl_q[2:0]; + ara_req_d.vl = (csr_vl_q >> 3) + |csr_vl_q[2:0]; ara_req_d.vtype.vsew = EW8; end 5'b10000: begin // Unit-strided, fault-only first // TODO: Not implemented - illegal_insn = 1'b1; - acc_resp_o.req_ready = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end default: begin // Reserved - illegal_insn = 1'b1; - acc_resp_o.req_ready = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end endcase end @@ -2604,24 +2526,22 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // For memory operations: EMUL = LMUL * (EEW / SEW) // EEW is encoded in the instruction - ara_req_d.emul = vlmul_e'(vtype_q.vlmul + (ara_req_d.vtype.vsew - vtype_q.vsew)); + ara_req_d.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req_d.vtype.vsew - csr_vtype_q.vsew)); // Exception if EMUL > 8 or < 1/8 - unique case ({vtype_q.vlmul[2], ara_req_d.emul[2]}) + unique case ({csr_vtype_q.vlmul[2], ara_req_d.emul[2]}) // The new emul is lower than the previous lmul 2'b01: begin // But the new eew is greater than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) > 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) > 0) begin + illegal_insn_load = 1'b1; end end // The new emul is greater than the previous lmul 2'b10: begin // But the new eew is lower than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) < 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) < 0) begin + illegal_insn_load = 1'b1; end end 
default:; @@ -2631,20 +2551,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // access. unique case (ara_req_d.emul) LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end LMUL_RSVD: begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_load = 1'b1; end default:; endcase @@ -2654,9 +2570,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Execute also if vl == 0 ignore_zero_vl_check = 1'b1; // The LMUL value is kept in the instruction itself - illegal_insn = 1'b0; - acc_resp_o.req_ready = 1'b0; - acc_resp_o.resp_valid = 1'b0; + illegal_insn_load = 1'b0; ara_req_valid_d = 1'b1; // Maximum vector length. VLMAX = nf * VLEN / EW8. 
@@ -2680,20 +2594,21 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default: begin // Trigger an error for the reserved simm values - illegal_insn = 1'b1; + illegal_insn_load = 1'b1; end endcase end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin + if ( ara_resp_valid_i ) begin acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = ara_resp_i.error; acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; - // In case of error, modify vstart - if (ara_resp_i.error) - vstart_d = ara_resp_i.error_vl; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; + // In case of exception, modify vstart + if ( ara_resp_i.exception.valid ) begin + csr_vstart_d = ara_resp_i.exception_vstart; + end end end @@ -2738,7 +2653,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW8; // ara_req_d.vtype.vsew is the target EEW! 
end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW8; end end @@ -2746,7 +2661,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW16; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW16; end end @@ -2754,7 +2669,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW32; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW32; end end @@ -2762,15 +2677,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (insn.vmem_type.mop != 2'b01 && insn.vmem_type.mop != 2'b11) begin ara_req_d.vtype.vsew = EW64; end else begin - ara_req_d.vtype.vsew = vtype_q.vsew; + ara_req_d.vtype.vsew = csr_vtype_q.vsew; ara_req_d.eew_vs2 = EW64; end end default: begin // Invalid. Element is too wide, or encoding is non-existant. 
- acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = 1'b1; - acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; + illegal_insn = 1'b1; end endcase @@ -2785,13 +2697,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000:; // Unit-strided, whole registers 5'b01011: begin // Unit-strided, mask store, EEW=1 // We operate ceil(vl/8) bytes - ara_req_d.vl = (vl_q >> 3) + |vl_q[2:0]; + ara_req_d.vl = (csr_vl_q >> 3) + |csr_vl_q[2:0]; ara_req_d.vtype.vsew = EW8; end default: begin // Reserved - illegal_insn = 1'b1; - acc_resp_o.req_ready = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end endcase end @@ -2811,24 +2721,22 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // For memory operations: EMUL = LMUL * (EEW / SEW) // EEW is encoded in the instruction - ara_req_d.emul = vlmul_e'(vtype_q.vlmul + (ara_req_d.vtype.vsew - vtype_q.vsew)); + ara_req_d.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req_d.vtype.vsew - csr_vtype_q.vsew)); // Exception if EMUL > 8 or < 1/8 - unique case ({vtype_q.vlmul[2], ara_req_d.emul[2]}) + unique case ({csr_vtype_q.vlmul[2], ara_req_d.emul[2]}) // The new emul is lower than the previous lmul 2'b01: begin // But the new eew is greater than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) > 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) > 0) begin + illegal_insn_store = 1'b1; end end // The new emul is greater than the previous lmul 2'b10: begin // But the new eew is lower than vsew - if (signed'(ara_req_d.vtype.vsew - vtype_q.vsew) < 0) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + if (signed'(ara_req_d.vtype.vsew - csr_vtype_q.vsew) < 0) begin + illegal_insn_store = 1'b1; end end default:; @@ -2838,20 +2746,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // access. 
unique case (ara_req_d.emul) LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end LMUL_RSVD: begin - illegal_insn = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn_store = 1'b1; end default:; endcase @@ -2861,6 +2765,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (ara_req_d.op == VSE && insn.vmem_type.rs2 == 5'b01000) begin // Execute also if vl == 0 ignore_zero_vl_check = 1'b1; + illegal_insn_store = 1'b0; // Maximum vector length. VLMAX = nf * VLEN / EW8. ara_req_d.vtype.vsew = EW8; @@ -2883,25 +2788,25 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end default: begin // Trigger an error for the reserved simm values - illegal_insn = 1'b1; + illegal_insn_store = 1'b1; end endcase - illegal_insn = 1'b0; acc_resp_o.req_ready = 1'b0; acc_resp_o.resp_valid = 1'b0; ara_req_valid_d = 1'b1; end // Wait until the back-end answers to acknowledge those instructions - if (ara_resp_valid_i) begin + if ( ara_resp_valid_i ) begin acc_resp_o.req_ready = 1'b1; - acc_resp_o.error = ara_resp_i.error; acc_resp_o.resp_valid = 1'b1; - ara_req_valid_d = 1'b0; - // If there is an error, change vstart - if (ara_resp_i.error) - vstart_d = ara_resp_i.error_vl; + acc_resp_o.exception = ara_resp_i.exception; + ara_req_valid_d = 1'b0; + // In case of exception, modify vstart + if ( ara_resp_i.exception.valid ) begin + csr_vstart_d = ara_resp_i.exception_vstart; + end end end @@ -2910,184 +2815,240 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( //////////////////////////// riscv::OpcodeSystem: begin - // These always respond at the 
same cycle - acc_resp_o.resp_valid = 1'b1; - is_config = 1'b1; - - unique case (acc_req_i.insn.itype.funct3) - 3'b001: begin // csrrw - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - // Only vstart can be written with CSR instructions. - riscv::CSR_VSTART: begin - vstart_d = acc_req_i.rs1; - acc_resp_o.result = vstart_q; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vlen_t'(vxsat_q); - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b010: begin // csrrs - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q | vlen_t'(acc_req_i.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_q | vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_q | vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vlen_t'(vxsat_q); - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b011: begin // csrrc - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q & ~vlen_t'(acc_req_i.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - vxsat_d = vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b101: begin // csrrwi - // Decode the CSR. - case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - // Only vstart can be written with CSR instructions. - riscv::CSR_VSTART: begin - vstart_d = vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VXRM: begin - vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); - acc_resp_o.result = vlen_t'(vxrm_q); - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = acc_req_i.insn.itype.rs1[15]; - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b110: begin // csrrsi - // Decode the CSR. 
- case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q | vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = vxsat_q | vxsat_e'(acc_req_i.insn.itype.rs1[15]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - 3'b111: begin // csrrci - // Decode the CSR. - unique case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) - riscv::CSR_VSTART: begin - vstart_d = vstart_q & ~vlen_t'(acc_req_i.insn.itype.rs1); - acc_resp_o.result = vstart_q; - end - riscv::CSR_VTYPE: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(vtype_q); - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VL: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = vl_q; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VLENB: begin - // Only reads are allowed - if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; - else acc_resp_o.error = 1'b1; - end - riscv::CSR_VXSAT: begin - // logic [19:15] rs1; So, LSB is [15] - vxsat_d = vxsat_q & ~vxsat_e'(acc_req_i.insn.itype.rs1[15]); - acc_resp_o.result = vxsat_q; - end - default: acc_resp_o.error = 1'b1; - endcase - end - default: begin - // Trigger an illegal instruction - acc_resp_o.error = 1'b1; - acc_resp_o.resp_valid = 1'b1; - end - endcase + // CSR ops have semantic dependency from vector instrucitons. 
+ // Therefore, Ara must be idle before performing any CSR operation. + + // Stall if there is any pending vector instruction + // NOTE: This is overconstraining. Not all CSR ops actually need to stall if a vector instruction is pending. + // E.g., CSR vl is never updated by instructions past ara_dispatcher, except for "unit-stride fault-only-first loads". Reading vl would be safe otherwise. + // E.g., CSR vlenb is a design-constant parameter, reading is always safe. + // E.g., CSRs vxrm and vxsat have no influence on-non fixed-point instructions, it could be read and written safely when no fixed-point operation is running. + // By better analyzing the spec, more of optimizations of such can be made. For the sake of simplicity, the current implementation treats CSR ops as one block. + if ( ara_idle_i ) begin + // These always respond at the same cycle + acc_resp_o.resp_valid = 1'b1; + is_config = 1'b1; + + unique case (acc_req_i.insn.itype.funct3) + 3'b001: begin // csrrw + // Decode the CSR. + case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + // Only vstart can be written with CSR instructions. + riscv::CSR_VSTART: begin + csr_vstart_d = acc_req_i.rs1; + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[16:15]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'(csr_vxsat_q); + end + riscv::CSR_VCSR: begin + csr_vxrm_d = vxrm_t'( acc_req_i.rs1[17:16] ); + csr_vxsat_d = vxsat_e'( acc_req_i.rs1[15] ); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b010: begin // csrrs + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q | vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[16:15]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'(csr_vxsat_q); + end + riscv::CSR_VCSR: begin + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[17:16]); + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[15]); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b011: begin // csrrc + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q & ~vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = vlen_t'( { csr_vxrm_q, csr_vxsat_q } ); + end + default: illegal_insn = 1'b1; + endcase + end + 3'b101: begin // csrrwi + // Decode the CSR. + case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + // Only vstart can be written with CSR instructions. + riscv::CSR_VSTART: begin + csr_vstart_d = vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[1:0]); + acc_resp_o.result = vlen_t'(csr_vxrm_q); + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = acc_req_i.rs1[0]; + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + default: illegal_insn = 1'b1; + endcase + end + 3'b110: begin // csrrsi + // Decode the CSR. 
+ case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q | vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q | vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q | vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = { csr_vxrm_q, csr_vxsat_q }; + end + default: illegal_insn = 1'b1; + endcase + end + 3'b111: begin // csrrci + // Decode the CSR. 
+ unique case (riscv::csr_addr_t'(acc_req_i.insn.itype.imm)) + riscv::CSR_VSTART: begin + csr_vstart_d = csr_vstart_q & ~vlen_t'(acc_req_i.rs1); + acc_resp_o.result = csr_vstart_q; + end + riscv::CSR_VTYPE: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = xlen_vtype(csr_vtype_q); + else illegal_insn = 1'b1; + end + riscv::CSR_VL: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = csr_vl_q; + else illegal_insn = 1'b1; + end + riscv::CSR_VLENB: begin + // Only reads are allowed + if (acc_req_i.insn.itype.rs1 == '0) acc_resp_o.result = VLENB; + else illegal_insn = 1'b1; + end + riscv::CSR_VXSAT: begin + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = csr_vxsat_q; + end + riscv::CSR_VXRM: begin + csr_vxrm_d = csr_vxrm_q & ~vxsat_e'(acc_req_i.rs1[1:0]); + acc_resp_o.result = csr_vxrm_q; + end + riscv::CSR_VCSR: begin + // logic [19:15] rs1; So, LSB is [15] + csr_vxrm_d = csr_vxrm_q & ~vxrm_t'(acc_req_i.rs1[2:1]); + csr_vxsat_d = csr_vxsat_q & ~vxsat_e'(acc_req_i.rs1[0]); + acc_resp_o.result = { csr_vxrm_q, csr_vxsat_q }; + end + default: illegal_insn= 1'b1; + endcase + end + default: begin + // Trigger an illegal instruction + illegal_insn = 1'b1; + end + endcase // acc_req_i.insn.itype.funct3 + end + else begin + acc_resp_o.req_ready = 1'b0; + end end default: begin // Trigger an illegal instruction - acc_resp_o.error = 1'b1; - acc_resp_o.resp_valid = 1'b1; + illegal_insn = 1'b1; end endcase end @@ -3101,9 +3062,19 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( if (ara_req_valid_d && (ara_req_d.op inside {VFREC7, VFRSQRT7}) && (FPExtSupport == FPExtSupportDisable)) illegal_insn = 1'b1; + // Raise an illegal instruction exception + if ( illegal_insn || illegal_insn_load || illegal_insn_store ) begin + ara_req_valid_d = 1'b0; + acc_resp_o.req_ready = 1'b1; + acc_resp_o.resp_valid = 1'b1; + acc_resp_o.exception.valid = 1'b1; + 
acc_resp_o.exception.cause = riscv::ILLEGAL_INSTR; + acc_resp_o.exception.tval = acc_req_i.insn; + end + // Check if we need to reshuffle our vector registers involved in the operation // This operation is costly when occurs, so avoid it if possible - if (ara_req_valid_d && !acc_resp_o.error) begin + if ( ara_req_valid_d && !acc_resp_o.exception.valid ) begin automatic rvv_instruction_t insn = rvv_instruction_t'(acc_req_i.insn.instr); // Is the instruction an in-lane one and could it be subject to reshuffling? @@ -3114,7 +3085,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Optimization: reshuffle vd only if we are not overwriting the whole vector register! reshuffle_req_d = {ara_req_d.use_vs1 && (ara_req_d.eew_vs1 != eew_q[ara_req_d.vs1]) && eew_valid_q[ara_req_d.vs1] && in_lane_op, ara_req_d.use_vs2 && (ara_req_d.eew_vs2 != eew_q[ara_req_d.vs2]) && eew_valid_q[ara_req_d.vs2] && in_lane_op, - ara_req_d.use_vd && (ara_req_d.vtype.vsew != eew_q[ara_req_d.vd ]) && eew_valid_q[ara_req_d.vd ] && vl_q != (VLENB >> ara_req_d.vtype.vsew)}; + ara_req_d.use_vd && (ara_req_d.vtype.vsew != eew_q[ara_req_d.vd ]) && eew_valid_q[ara_req_d.vd ] && csr_vl_q != ((VLENB << ara_req_d.emul[1:0]) >> ara_req_d.vtype.vsew)}; + // Mask out requests if they refer to the same register! + reshuffle_req_d &= { + (insn.varith_type.rs1 != insn.varith_type.rs2) && (insn.varith_type.rs1 != insn.varith_type.rd), + (insn.varith_type.rs2 != insn.varith_type.rd), + 1'b1}; // Prepare the information to reshuffle the vector registers during the next cycles // Reshuffle in the following order: vd, v2, v1. The order is arbitrary. 
@@ -3156,17 +3132,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( default: rs_lmul_cnt_limit_d = 0; endcase + // Save info for next reshuffles + reshuffle_eew_vs1_d = ara_req_d.eew_vs1; + reshuffle_eew_vs2_d = ara_req_d.eew_vs2; + reshuffle_eew_vd_d = ara_req_d.vtype.vsew; + // Reshuffle state_d = RESHUFFLE; end end - // Raise an illegal instruction exception - if (illegal_insn) begin - acc_resp_o.error = 1'b1; - ara_req_valid_d = 1'b0; - end - // Update the EEW if (ara_req_valid_d && ara_req_d.use_vd && ara_req_ready_i) begin unique case (ara_req_d.emul) @@ -3205,8 +3180,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Any valid non-config instruction is a NOP if vl == 0, with some exceptions, // e.g. whole vector memory operations / whole vector register move - if (is_decoding && (vl_q == '0 || null_vslideup) && !is_config && - !ignore_zero_vl_check && !acc_resp_o.error) begin + if (is_decoding && (csr_vstart_q >= csr_vl_q || null_vslideup) && !is_config && + !ignore_zero_vl_check && !acc_resp_o.exception.valid) begin // If we are acknowledging a memory operation, we must tell Ariane that the memory // operation was resolved (to decrement its pending load/store counter) // This can collide with the same signal from the vector load/store unit, so we must @@ -3218,6 +3193,17 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( store_zero_vl = is_vstore; end + // Reset vstart to zero for successful vector instructions + // Corner cases: + // * vstart exception reporting, e.g., VLSU, is handled above + // * CSR operations are not considered vector instructions + if ( acc_resp_o.resp_valid + & !acc_resp_o.exception.valid + & (acc_req_i.insn.itype.opcode != riscv::OpcodeSystem) + ) begin + csr_vstart_d = '0; + end + acc_resp_o.load_complete = load_zero_vl | load_complete_q; acc_resp_o.store_complete = store_zero_vl | store_complete_q; diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 
5fb0abff1..74fce4573 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -40,8 +40,8 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i output logic pe_scalar_resp_ready_o, // Interface with the Address Generation input logic addrgen_ack_i, - input logic addrgen_error_i, - input vlen_t addrgen_error_vl_i + input ariane_pkg::exception_t addrgen_exception_i, + input vlen_t addrgen_exception_vstart_i ); /////////////////////////////////// @@ -438,8 +438,8 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i state_d = IDLE; ara_req_ready_o = 1'b1; ara_resp_valid_o = 1'b1; - ara_resp_o.error = addrgen_error_i; - ara_resp_o.error_vl = addrgen_error_vl_i; + ara_resp_o.exception = addrgen_exception_i; + ara_resp_o.exception_vstart = addrgen_exception_vstart_i; end // Wait for the scalar result diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 386b9823c..d93b79fda 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -652,7 +652,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: if ((operand_request_i[AluA].vl << (int'(EW64) - int'(pe_req.eew_vs1))) * NrLanes != pe_req.vl) operand_request_i[AluA].vl += 1; end - operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF}); operand_request_i[AluB] = '{ id : pe_req.id, @@ -679,7 +679,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: if ((operand_request_i[AluB].vl << (int'(EW64) - int'(pe_req.eew_vs2))) * NrLanes != pe_req.vl) operand_request_i[AluB].vl += 1; end - operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, 
VMSBF, VFIRST}); operand_request_i[MulFPUA] = '{ id : pe_req.id, @@ -695,7 +695,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. operand_request_i[MulFPUA].vl = vfu_operation_d.vl; - operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]}; + operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF}); operand_request_i[MulFPUB] = '{ id : pe_req.id, @@ -710,24 +710,26 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. operand_request_i[MulFPUB].vl = vfu_operation_d.vl; - operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]}; + operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF, VFIRST}); operand_request_i[MaskB] = '{ id : pe_req.id, - vs : pe_req.vd, - eew : pe_req.eew_vd_op, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, scale_vl: pe_req.scale_vl, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. vl : (pe_req.vl / NrLanes / ELEN) << (int'(EW64) - int'(pe_req.vtype.vsew)), vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vd, + hazard : (pe_req.op inside {VMSBF, VMSOF, VMSIF}) ? 
pe_req.hazard_vs2 : pe_req.hazard_vs2 | pe_req.hazard_vd, default : '0 }; - if (((pe_req.vl / NrLanes / ELEN) * NrLanes * ELEN) != - pe_req.vl) operand_request_i[MaskB].vl += 1; - operand_request_push[MaskB] = pe_req.use_vd_op; + operand_request_i[MaskB].vl = pe_req.vl / (NrLanes * (8 << pe_req.vtype.vsew)); + if ((pe_req.vl % (NrLanes*ELEN)) != 0) begin + operand_request_i[MaskB].vl += 1'b1; + end + operand_request_push[MaskB] = pe_req.use_vs2 && pe_req.op inside {VCPOP, VFIRST, VMSIF, VMSOF, VMSBF}; operand_request_i[MaskM] = '{ id : pe_req.id, diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index 9bfc120f4..f9c447d62 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -795,7 +795,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; ////////////////////////////// if (!vinsn_queue_full && vfu_operation_valid_i && - (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMXNOR]})) begin + (vfu_operation_i.vfu == VFU_Alu || (vfu_operation_i.op inside {[VMSEQ:VMXNOR]} && + !(vfu_operation_i.op inside {VCPOP, VMSIF, VMSOF, VMSBF, VFIRST})))) begin vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; // Do not wait for masks if, during a reduction, this lane is just a pass-through // The only valid instructions here with vl == '0 are reductions diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index da6fb002a..855f89440 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -50,6 +50,20 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( import cf_math_pkg::idx_width; + // Pointers + // + // We need a pointer to which bit on the full VRF word we are reading mask operands from. + logic [idx_width(DataWidth*NrLanes):0] mask_pnt_d, mask_pnt_q; + // We need a pointer to which bit on the full VRF word we are writing results to. 
+ logic [idx_width(DataWidth*NrLanes):0] vrf_pnt_d, vrf_pnt_q; + + // Remaining elements of the current instruction in the read operand phase + vlen_t read_cnt_d, read_cnt_q; + // Remaining elements of the current instruction in the issue phase + vlen_t issue_cnt_d, issue_cnt_q; + // Remaining elements of the current instruction in the commit phase + vlen_t commit_cnt_d, commit_cnt_q; + //////////////// // Operands // //////////////// @@ -57,39 +71,130 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Information about which is the target FU of the request masku_fu_e masku_operand_fu; - // ALU/FPU result - elen_t [NrLanes-1:0] masku_operand_a_i; - logic [NrLanes-1:0] masku_operand_a_valid_i; - logic [NrLanes-1:0] masku_operand_a_ready_o; + // ALU/FPU result (shuffled) + elen_t [NrLanes-1:0] masku_operand_alu; + logic [NrLanes-1:0] masku_operand_alu_valid; + logic [NrLanes-1:0] masku_operand_alu_ready; - // Previous value of the destination vector register - elen_t [NrLanes-1:0] masku_operand_b_i; - logic [NrLanes-1:0] masku_operand_b_valid_i; - logic [NrLanes-1:0] masku_operand_b_ready_o; + // ALU/FPU result (deshuffled) + logic [NrLanes*ELEN-1:0] masku_operand_alu_seq; + + // vs2 (shuffled) + elen_t [NrLanes-1:0] masku_operand_vs2; + logic [NrLanes-1:0] masku_operand_vs2_valid; + logic [NrLanes-1:0] masku_operand_vs2_ready; + + assign masku_operand_vs2_ready = 1'b0; + + // vs2 (deshuffled) + logic [NrLanes*ELEN-1:0] masku_operand_vs2_seq; + logic [ NrLanes-1:0] masku_operand_vs2_seq_valid; + logic [ NrLanes-1:0] masku_operand_vs2_seq_ready; // Mask - elen_t [NrLanes-1:0] masku_operand_m_i; - logic [NrLanes-1:0] masku_operand_m_valid_i; - logic [NrLanes-1:0] masku_operand_m_ready_o; + elen_t [NrLanes-1:0] masku_operand_m; + logic [NrLanes-1:0] masku_operand_m_valid; + logic [NrLanes-1:0] masku_operand_m_ready; + + // Mask deshuffled + logic [NrLanes*ELEN-1:0] masku_operand_m_seq; + logic [NrLanes-1:0] masku_operand_m_seq_valid; + logic [NrLanes-1:0] 
masku_operand_m_seq_ready; // Insn-queue related signal pe_req_t vinsn_issue; - for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_unpack_masku_operands - assign masku_operand_a_i[lane] = masku_operand_i[lane][2 + masku_operand_fu]; - assign masku_operand_a_valid_i[lane] = masku_operand_valid_i[lane][2 + masku_operand_fu]; - for (genvar operand_fu = 0; operand_fu < NrMaskFUnits; operand_fu++) begin: gen_masku_operand_ready - assign masku_operand_ready_o[lane][2 + operand_fu] = (masku_fu_e'(operand_fu) == masku_operand_fu) && masku_operand_a_ready_o[lane]; - end: gen_masku_operand_ready + logic [NrLanes*ELEN-1:0] bit_enable_mask; + logic [NrLanes*ELEN-1:0] bit_enable_shuffle; + logic [NrLanes*ELEN-1:0] alu_result_compressed; + + // Performs all shuffling and deshuffling of mask operands (including masks for mask instructions) + // Furthermore, it buffers certain operands that would create long critical paths + masku_operands #( + .NrLanes ( NrLanes ) + ) i_masku_operands ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + // Control logic + .masku_fu_i ( masku_operand_fu ), + .vinsn_issue_i ( vinsn_issue ), + .vrf_pnt_i ( vrf_pnt_q ), + // Operands coming from lanes + .masku_operand_valid_i ( masku_operand_valid_i ), + .masku_operand_ready_o ( masku_operand_ready_o ), + .masku_operands_i ( masku_operand_i ), + // Operands prepared for mask unit execution + .masku_operand_alu_o ( masku_operand_alu ), + .masku_operand_alu_valid_o ( masku_operand_alu_valid ), + .masku_operand_alu_ready_i ( masku_operand_alu_ready ), + .masku_operand_alu_seq_o ( masku_operand_alu_seq ), + .masku_operand_alu_seq_valid_o ( ), + .masku_operand_alu_seq_ready_i ( ), + .masku_operand_vs2_o ( masku_operand_vs2 ), + .masku_operand_vs2_valid_o ( masku_operand_vs2_valid ), + .masku_operand_vs2_ready_i ( masku_operand_vs2_ready ), + .masku_operand_vs2_seq_o ( masku_operand_vs2_seq ), + .masku_operand_vs2_seq_valid_o ( masku_operand_vs2_seq_valid ), + .masku_operand_vs2_seq_ready_i ( 
masku_operand_vs2_seq_ready ), + .masku_operand_m_o ( masku_operand_m ), + .masku_operand_m_valid_o ( masku_operand_m_valid ), + .masku_operand_m_ready_i ( masku_operand_m_ready ), + .masku_operand_m_seq_o ( masku_operand_m_seq ), + .masku_operand_m_seq_valid_o ( ), + .masku_operand_m_seq_ready_i ( ), + .bit_enable_mask_o ( bit_enable_mask ), + .shuffled_vl_bit_mask_o ( bit_enable_shuffle ), + .alu_result_compressed_o ( alu_result_compressed ) + ); + - assign masku_operand_b_i[lane] = masku_operand_i[lane][1]; - assign masku_operand_b_valid_i[lane] = (vinsn_issue.op inside {[VMSBF:VID]}) ? '1 : masku_operand_valid_i[lane][1]; - assign masku_operand_ready_o[lane][1] = masku_operand_b_ready_o[lane]; + // Local Parameter W_CPOP and W_VFIRST + // + // Description: Parameters W_CPOP and W_VFIRST enable time multiplexing of vcpop.m and vfirst.m instruction. + // + // Legal range W_CPOP: {64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 + // Legal range W_VFIRST: {64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 + // + // Execution time example for vcpop.m (similar for vfirst.m): + // W_CPOP = 64; VLEN = 1024; vl = 1024 + // t_vcpop.m = VLEN/W_CPOP = 8 [Cycles] + localparam int W_CPOP = 64; + localparam int W_VFIRST = 64; + // derived parameters + localparam int MAX_W_CPOP_VFIRST = (W_CPOP > W_VFIRST) ? 
W_CPOP : W_VFIRST; + localparam int N_SLICES_CPOP = NrLanes * DataWidth / W_CPOP; + localparam int N_SLICES_VFIRST = NrLanes * DataWidth / W_VFIRST; + // Check if parameters are within range + if (((W_CPOP & (W_CPOP - 1)) != 0) || (W_CPOP < 64)) begin + $fatal(1, "Parameter W_CPOP must be power of 2."); + end else if (((W_VFIRST & (W_VFIRST - 1)) != 0) || (W_VFIRST < 64)) begin + $fatal(1, "Parameter W_VFIRST must be power of 2."); + end + + // VFIRST and VCPOP Signals + logic [NrLanes*ELEN-1:0] vcpop_operand; + logic [$clog2(W_VFIRST):0] popcount; + logic [$clog2(VLEN):0] popcount_d, popcount_q; + logic [$clog2(W_VFIRST)-1:0] vfirst_count; + logic [$clog2(VLEN)-1:0] vfirst_count_d, vfirst_count_q; + logic vfirst_empty; + logic [NrLanes-1:0] vcpop_vfirst_vs2_ready; + // counter to keep track of how many slices of the vcpop_operand have been processed + logic [$clog2(MAX_W_CPOP_VFIRST):0] vcpop_slice_cnt_d, vcpop_slice_cnt_q; + logic [W_CPOP-1:0] vcpop_slice; + logic [W_VFIRST-1:0] vfirst_slice; + + // vmsbf, vmsif, vmsof, viota, vid, vcpop, vfirst variables + logic [NrLanes*DataWidth-1:0] alu_result_f, alu_result_ff; + logic [NrLanes*DataWidth-1:0] masku_operand_alu_seq_m, masku_operand_alu_seq_f, masku_operand_alu_seq_ff; + logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_seq; + logic [NrLanes*DataWidth-1:0] alu_src_idx, alu_src_idx_m; + logic [ 13:0] iteration_count_d, iteration_count_q; + logic not_found_one_d, not_found_one_q; + logic [ NrLanes-1:0] vmsif_vmsof_vmsbf_vs2_ready; - assign masku_operand_m_i[lane] = masku_operand_i[lane][0]; - assign masku_operand_m_valid_i[lane] = masku_operand_valid_i[lane][0]; - assign masku_operand_ready_o[lane][0] = masku_operand_m_ready_o[lane]; - end: gen_unpack_masku_operands + // Control flow for mask operands + assign masku_operand_vs2_seq_ready = vcpop_vfirst_vs2_ready | vmsif_vmsof_vmsbf_vs2_ready; //////////////////////////////// // Vector instruction queue // @@ -217,16 +322,6 @@ module 
masku import ara_pkg::*; import rvv_pkg::*; #( logic result_queue_empty; assign result_queue_empty = (result_queue_cnt_q == '0); - // vmsbf, vmsif, vmsof, viota, vid, vcpop, vfirst variables - logic [NrLanes*DataWidth-1:0] alu_result_f, alu_result_ff; - logic [NrLanes*DataWidth-1:0] alu_operand_a, alu_operand_a_seq, alu_operand_a_seq_f; - logic [NrLanes*DataWidth-1:0] alu_operand_b, alu_operand_b_seq, alu_operand_b_seq_m, alu_operand_b_seq_f, alu_operand_b_seq_ff; - logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_seq; - logic [NrLanes*DataWidth-1:0] masku_operand_vd; - logic [NrLanes*DataWidth-1:0] alu_src_idx, alu_src_idx_m; - logic [4:0] iteration_count_d, iteration_count_q; - logic not_found_one_d, not_found_one_q; - always_ff @(posedge clk_i or negedge rst_ni) begin: p_result_queue_ff if (!rst_ni) begin result_queue_q <= '0; @@ -238,8 +333,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( alu_result_f <= '0; alu_result_ff <= '0; not_found_one_q <= 1'b1; - alu_operand_b_seq_f <= '0; - alu_operand_b_seq_ff <= '0; + masku_operand_alu_seq_f <= '0; + masku_operand_alu_seq_ff <= '0; iteration_count_q <= '0; end else begin result_queue_q <= result_queue_d; @@ -251,15 +346,15 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( alu_result_f <= (pe_req_ready_o) ? '0 : (!vinsn_issue.vm) ? alu_result_vm : alu_result_vm_seq; alu_result_ff <= alu_result_f; not_found_one_q <= not_found_one_d; - alu_operand_b_seq_f <= (pe_req_ready_o) ? '0 : alu_operand_b_seq_m; - alu_operand_b_seq_ff <= alu_operand_b_seq_f; + masku_operand_alu_seq_f <= (pe_req_ready_o) ? 
'0 : masku_operand_alu_seq_m; + masku_operand_alu_seq_ff <= masku_operand_alu_seq_f; iteration_count_q <= iteration_count_d; end end // iteration count for masked instrctions always_comb begin - if (vinsn_issue_valid && &masku_operand_a_valid_i) begin + if (vinsn_issue_valid && (&masku_operand_alu_valid || &masku_operand_vs2_seq_valid)) begin iteration_count_d = iteration_count_q + 1'b1; end else begin iteration_count_d = iteration_count_q; @@ -291,114 +386,49 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( //////////////// elen_t [NrLanes-1:0] alu_result; - logic [NrLanes*ELEN-1:0] bit_enable; - logic [NrLanes*ELEN-1:0] bit_enable_shuffle; - logic [NrLanes*ELEN-1:0] bit_enable_mask; - rvv_pkg::vew_e bit_enable_shuffle_eew; logic [NrLanes*ELEN-1:0] mask; - logic [NrLanes*ELEN-1:0] vcpop_operand; - logic [$clog2(DataWidth*NrLanes):0] popcount; - logic [$clog2(VLEN):0] popcount_d, popcount_q; - logic [$clog2(DataWidth*NrLanes)-1:0] vfirst_count; - logic [$clog2(VLEN)-1:0] vfirst_count_d, vfirst_count_q; - logic vfirst_empty; - // Pointers - // - // We need a pointer to which bit on the full VRF word we are reading mask operands from. - logic [idx_width(DataWidth*NrLanes):0] mask_pnt_d, mask_pnt_q; - // We need a pointer to which bit on the full VRF word we are writing results to. 
- logic [idx_width(DataWidth*NrLanes):0] vrf_pnt_d, vrf_pnt_q; + // keep track if first 1 mask element was found + logic vfirst_found; - // Remaining elements of the current instruction in the read operand phase - vlen_t read_cnt_d, read_cnt_q; - // Remaining elements of the current instruction in the issue phase - vlen_t issue_cnt_d, issue_cnt_q; - // Remaining elements of the current instruction in the commit phase - vlen_t commit_cnt_d, commit_cnt_q; + // assign operand slices to be processed by popcount and lzc + assign vcpop_slice = vcpop_operand[(vcpop_slice_cnt_q * W_CPOP) +: W_CPOP]; + assign vfirst_slice = vcpop_operand[(vcpop_slice_cnt_q * W_VFIRST) +: W_VFIRST]; // Population count for vcpop.m instruction popcount #( - .INPUT_WIDTH (DataWidth*NrLanes) + .INPUT_WIDTH (W_CPOP) ) i_popcount ( - .data_i (vcpop_operand), + .data_i (vcpop_slice), .popcount_o(popcount ) ); // Trailing zero counter lzc #( - .WIDTH(DataWidth*NrLanes), + .WIDTH(W_VFIRST), .MODE (0) ) i_clz ( - .in_i (vcpop_operand), + .in_i (vfirst_slice ), .cnt_o (vfirst_count ), .empty_o (vfirst_empty ) ); always_comb begin: p_mask_alu alu_result = '0; - bit_enable = '0; - bit_enable_shuffle = '0; - bit_enable_mask = '0; not_found_one_d = pe_req_ready_o ? 1'b1 : not_found_one_q; alu_result_vm = '0; alu_result_vm_m = '0; alu_result_vm_seq = '0; - alu_operand_b_seq = '0; - alu_operand_b_seq_m = '0; + masku_operand_alu_seq_m = '0; mask = '0; - masku_operand_vd = '0; vcpop_operand = '0; - // Comparisons work on vtype.vsew from VALU or VMFPU - bit_enable_shuffle_eew = vinsn_issue.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]} - ? vinsn_issue.vtype.vsew - : vinsn_issue.eew_vd_op; - if (vinsn_issue_valid) begin - // Calculate bit enable - // The result can be taken either from the result of an operation (mask_operand_a_i), or - // from the previous value of the destination register (mask_operand_b_i). Byte strobes - // do not work here, since this has to be done at a bit granularity. 
Therefore, the Mask Unit - // received both operands, and does a masking depending on the value of the vl. - if (vinsn_issue.vl >= ELEN*NrLanes) - bit_enable = '1; - else begin - bit_enable[vinsn_issue.vl] = 1'b1; - bit_enable = bit_enable - 1; - end - - // Shuffle the bit enable signal - for (int b = 0; b < NrLanes*StrbWidth; b++) begin - automatic int vrf_byte = shuffle_index(b, NrLanes, bit_enable_shuffle_eew); - bit_enable_shuffle[8*vrf_byte +: 8] = bit_enable[8*b +: 8]; - - // Take the mask into account - if (!vinsn_issue.vm) begin - automatic int mask_byte = shuffle_index(b, NrLanes, vinsn_issue.eew_vmask); - automatic int mask_byte_lane = mask_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int mask_byte_offset = mask_byte[idx_width(StrbWidth)-1:0]; - bit_enable_mask[8*vrf_byte +: 8] = bit_enable_shuffle[8*vrf_byte +: 8] & - masku_operand_m_i[mask_byte_lane][8*mask_byte_offset +: 8]; - end else begin - bit_enable_mask[8*vrf_byte +: 8] = bit_enable_shuffle[8*vrf_byte +: 8]; - end - end - - alu_operand_a = masku_operand_a_i; - alu_operand_b = masku_operand_b_i; - - // Deshuffle the operands for the mask instructions - for (int b = 0; b < (NrLanes*StrbWidth); b++) begin - automatic int deshuffle_byte = deshuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); - alu_operand_b_seq[8*deshuffle_byte +: 8] = alu_operand_a[8*b +: 8]; - masku_operand_vd [8*deshuffle_byte +: 8] = alu_operand_b[8*b +: 8]; - end // Mask generation unique case (vinsn_issue.op) inside [VMSBF:VID] : - if (&masku_operand_a_valid_i) begin + if (&masku_operand_alu_valid) begin unique case (vinsn_issue.vtype.vsew) EW8 : for (int i = 0; i < (DataWidth * NrLanes)/8; i++) mask [(i*8) +: 8] = {8{bit_enable_mask [i+(((DataWidth * NrLanes)/8)*(iteration_count_d-1))]}}; @@ -417,156 +447,77 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Evaluate the instruction unique case (vinsn_issue.op) inside - [VMANDNOT:VMXNOR]: alu_result = (masku_operand_a_i & bit_enable_mask) | - 
(masku_operand_b_i & ~bit_enable_mask); - [VMFEQ:VMSGTU], [VMSGT:VMSBC] : begin - automatic logic [ELEN*NrLanes-1:0] alu_result_flat = '0; - - unique case (vinsn_issue.vtype.vsew) - EW8: for (int b = 0; b < 8*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(1*b, NrLanes, EW8); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW8); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? - masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - EW16: for (int b = 0; b < 4*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(2*b, NrLanes, EW16); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW16); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? 
- masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - EW32: for (int b = 0; b < 2*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(4*b, NrLanes, EW32); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW32); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? - masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - EW64: for (int b = 0; b < 1*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(8*b, NrLanes, EW64); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW64); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? 
- masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - default:; - endcase - - // Final assignment - alu_result = (alu_result_flat & bit_enable_shuffle) | - (masku_operand_b_i & ~bit_enable_shuffle); - end + [VMANDNOT:VMXNOR]: alu_result = (masku_operand_alu) | (~bit_enable_shuffle); + [VMFEQ:VMSGTU], [VMSGT:VMSBC]: alu_result = (alu_result_compressed & bit_enable_mask) | (~bit_enable_shuffle); [VMSBF:VMSIF] : begin - if (&masku_operand_a_valid_i) begin - for (int i = 0; i < NrLanes * DataWidth; i++) begin - if (alu_operand_b_seq[i] == 1'b0) begin - alu_result_vm[i] = (vinsn_issue.op == VMSOF) ? 1'b0 : not_found_one_d; - end else begin - not_found_one_d = 1'b0; - alu_result_vm[i] = (vinsn_issue.op == VMSBF) ? not_found_one_d : 1'b1; - break; - end + if (&masku_operand_vs2_seq_valid && (&masku_operand_m_valid || vinsn_issue.vm)) begin + for (int i = 0; i < NrLanes * DataWidth; i++) begin + if (masku_operand_vs2_seq[i] == 1'b0) begin + alu_result_vm[i] = (vinsn_issue.op == VMSOF) ? 1'b0 : not_found_one_d; + end else begin + not_found_one_d = 1'b0; + alu_result_vm[i] = (vinsn_issue.op == VMSBF) ? not_found_one_d : 1'b1; + break; end - alu_result_vm_m = (!vinsn_issue.vm) ? alu_result_vm & bit_enable_mask : alu_result_vm; + end + alu_result_vm_m = (!vinsn_issue.vm) ? 
alu_result_vm & bit_enable_mask : alu_result_vm; end else begin alu_result_vm = '0; end end VIOTA: begin - if (&masku_operand_a_valid_i) begin - alu_operand_b_seq_m = alu_operand_b_seq & bit_enable_mask; + if (&masku_operand_alu_valid) begin + masku_operand_alu_seq_m = masku_operand_alu_seq & bit_enable_mask; unique case (vinsn_issue.vtype.vsew) EW8 : begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [7:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:8] + alu_result_ff [(NrLanes*DataWidth)-1-:8]; + alu_result_vm [7:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:8] + alu_result_ff [(NrLanes*DataWidth)-1-:8]; end else begin alu_result_vm [7:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/8; index++) begin - alu_result_vm [(index*8) +: 7] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*8) +: 7]; - alu_result_vm_m [(index*8) +: 7] = (|mask[(index*8) +: 7]) ? alu_result_vm [(index*8) +: 7] : masku_operand_vd [(index*8) +: 7]; + alu_result_vm [(index*8) +: 7] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*8) +: 7]; + alu_result_vm_m [(index*8) +: 7] = alu_result_vm [(index*8) +: 7]; end end EW16: begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [15:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:16] + alu_result_ff [(NrLanes*DataWidth)-1-:16]; + alu_result_vm [15:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:16] + alu_result_ff [(NrLanes*DataWidth)-1-:16]; end else begin alu_result_vm [15:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/16; index++) begin - alu_result_vm [(index*16) +: 15] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*16) +: 15]; - alu_result_vm_m [(index*16) +: 15] = (|mask[(index*16) +: 15]) ? 
alu_result_vm [(index*16) +: 15] : masku_operand_vd [(index*16) +: 15]; + alu_result_vm [(index*16) +: 15] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*16) +: 15]; + alu_result_vm_m [(index*16) +: 15] = alu_result_vm [(index*16) +: 15]; end end EW32: begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [31:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:32] + alu_result_ff [(NrLanes*DataWidth)-1-:32]; + alu_result_vm [31:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:32] + alu_result_ff [(NrLanes*DataWidth)-1-:32]; end else begin alu_result_vm [31:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/32; index++) begin - alu_result_vm [(index*32) +: 31] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*32) +: 31]; - alu_result_vm_m [(index*32) +: 31] = (|mask[(index*32) +: 31]) ? alu_result_vm [(index*32) +: 31] : masku_operand_vd [(index*32) +: 31]; + alu_result_vm [(index*32) +: 31] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*32) +: 31]; + alu_result_vm_m [(index*32) +: 31] = alu_result_vm [(index*32) +: 31]; end end EW64: begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [63:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:64] + alu_result_ff [(NrLanes*DataWidth)-1-:64]; + alu_result_vm [63:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:64] + alu_result_ff [(NrLanes*DataWidth)-1-:64]; end else begin alu_result_vm [63:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/64; index++) begin - alu_result_vm [(index*64) +: 63] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*64) +: 63]; - alu_result_vm_m [(index*64) +: 63] = (|mask[(index*64) +: 63]) ? 
alu_result_vm [(index*64) +: 63] : masku_operand_vd [(index*64) +: 63]; + alu_result_vm [(index*64) +: 63] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*64) +: 63]; + alu_result_vm_m [(index*64) +: 63] = alu_result_vm [(index*64) +: 63]; end end endcase end end VID: begin - if (&masku_operand_a_valid_i) begin + if (&masku_operand_alu_valid) begin unique case (vinsn_issue.vtype.vsew) EW8 : begin for (int index = 1; index < (NrLanes*DataWidth)/8; index++) begin @@ -596,7 +547,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( end end [VCPOP:VFIRST] : begin - vcpop_operand = (!vinsn_issue.vm) ? masku_operand_a_i & bit_enable_mask : masku_operand_a_i; + vcpop_operand = (!vinsn_issue.vm) ? masku_operand_vs2_seq & bit_enable_mask : masku_operand_vs2_seq; end default: begin alu_result = '0; @@ -639,9 +590,16 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic last_incoming_a; logic unbalanced_a; + // Control signals for better code-readability (this signals goes high if a result is valid and can be pushed to the result_queue) + logic vreg_wb_valid; + // Information about which is the target FU of the request assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? 
MaskFUMFpu : MaskFUAlu; + // Byte enable for the result queue + logic [NrLanes*ELENB-1:0] result_queue_be_seq; + logic [NrLanes*ELENB-1:0] result_queue_be; + always_comb begin: p_masku // Maintain state vinsn_queue_d = vinsn_queue_q; @@ -652,8 +610,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( mask_pnt_d = mask_pnt_q; vrf_pnt_d = vrf_pnt_q; - popcount_d = popcount_q; - vfirst_count_d = vfirst_count_q; + vcpop_slice_cnt_d = vcpop_slice_cnt_q; + popcount_d = popcount_q; + vfirst_count_d = vfirst_count_q; mask_queue_d = mask_queue_q; mask_queue_valid_d = mask_queue_valid_q; @@ -676,9 +635,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // We are not ready, by default pe_resp = '0; - masku_operand_a_ready_o = '0; - masku_operand_b_ready_o = '0; - masku_operand_m_ready_o = '0; + masku_operand_alu_ready = '0; + masku_operand_m_ready = '0; // Inform the main sequencer if we are idle pe_req_ready_o = !vinsn_queue_full; @@ -705,7 +663,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( if (vinsn_issue_valid && !(vd_scalar(vinsn_issue.op))) begin // Is there place in the mask queue to write the mask operands? // Did we receive the mask bits on the MaskM channel? - if (!vinsn_issue.vm && !mask_queue_full && &masku_operand_m_valid_i) begin + if (!vinsn_issue.vm && !mask_queue_full && &masku_operand_m_valid && !(vinsn_issue.op inside {VMSBF, VMSOF, VMSIF})) begin // Copy data from the mask operands into the mask queue for (int vrf_seq_byte = 0; vrf_seq_byte < NrLanes*StrbWidth; vrf_seq_byte++) begin // Map vrf_seq_byte to the corresponding byte in the VRF word. 
@@ -737,7 +695,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Copy the mask operand mask_queue_d[mask_queue_write_pnt_q][vrf_lane][vrf_offset] = - masku_operand_m_i[mask_lane][mask_offset]; + masku_operand_m[mask_lane][mask_offset]; end // Account for the used operands @@ -766,7 +724,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Consumed all valid bytes from the lane operands if (mask_pnt_d == NrLanes*64 || read_cnt_d == '0) begin // Request another beat - masku_operand_m_ready_o = '1; + masku_operand_m_ready = '1; // Reset the pointer mask_pnt_d = '0; end @@ -777,31 +735,62 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Calculate scalar results // ////////////////////////////// + vcpop_vfirst_vs2_ready = 1'b0; + // Is there an instruction ready to be issued? if (vinsn_issue_valid && vd_scalar(vinsn_issue.op)) begin - if (&(masku_operand_a_valid_i | fake_a_valid) && (&masku_operand_m_valid_i || vinsn_issue.vm)) begin - - masku_operand_a_ready_o = masku_operand_a_valid_i; + if (&(masku_operand_vs2_seq_valid | fake_a_valid) && (&masku_operand_m_valid || vinsn_issue.vm)) begin + + // increment slice counter + vcpop_slice_cnt_d = vcpop_slice_cnt_q + 1'b1; + + // request new operand (by completing ready-valid handshake) once all slices have been processed + vcpop_vfirst_vs2_ready = 1'b0; + if (((vcpop_slice_cnt_q == N_SLICES_CPOP - 1) && vinsn_issue.op == VCPOP) || + ((vcpop_slice_cnt_q == N_SLICES_VFIRST-1) && vinsn_issue.op == VFIRST)) begin + vcpop_slice_cnt_d = '0; + vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; + if (!vinsn_issue.vm) begin + masku_operand_m_ready = '1; + end + end // Account for the elements that were processed - issue_cnt_d = issue_cnt_q - ((NrLanes*DataWidth)/(8 << vinsn_issue.vtype.vsew)); - if (iteration_count_d >= (((8 << vinsn_issue.vtype.vsew)*vinsn_issue.vl)/(DataWidth*NrLanes))) - issue_cnt_d = '0; + issue_cnt_d = issue_cnt_q - W_CPOP; - // Acknowledge the operands, also triggers another 
beat if necessary - if (!vinsn_issue.vm) masku_operand_m_ready_o = '1; + // abruptly stop processing elements if vl is reached + if (iteration_count_d >= (vinsn_issue.vl/(W_CPOP)) || (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin + issue_cnt_d = '0; + commit_cnt_d = '0; + read_cnt_d ='0; + vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; + if (!vinsn_issue.vm) begin + masku_operand_m_ready = '1; + end + end popcount_d = popcount_q + popcount; vfirst_count_d = vfirst_count_q + vfirst_count; // if this is the last beat, commit the result to the scalar_result queue - if (iteration_count_d >= (((8 << vinsn_issue.vtype.vsew)*vinsn_issue.vl)/(DataWidth*NrLanes))) begin + if ((iteration_count_d >= (vinsn_issue.vl/W_CPOP) && vinsn_issue.op == VCPOP) || + (iteration_count_d >= (vinsn_issue.vl/W_VFIRST) && vinsn_issue.op == VFIRST) || + (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin result_scalar_d = (vinsn_issue.op == VCPOP) ? popcount_d : (vfirst_empty) ? -1 : vfirst_count_d; result_scalar_valid_d = '1; // Decrement the commit counter by the entire number of elements, // since we only commit one result for everything commit_cnt_d = '0; + + // reset vcpop slice counter, since instruction is finished + vcpop_slice_cnt_d = '0; + + // acknowledge operand a + vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; + if (!vinsn_issue.vm) begin + masku_operand_m_ready = '1; + end end end end @@ -810,14 +799,17 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Write results to the lanes // ////////////////////////////////// + result_queue_be = '1; + result_queue_be_seq = '1; + vmsif_vmsof_vmsbf_vs2_ready = '0; + // Is there an instruction ready to be issued? if (vinsn_issue_valid && !vd_scalar(vinsn_issue.op)) begin // This instruction executes on the Mask Unit if (vinsn_issue.vfu == VFU_MaskUnit) begin // Is there place in the result queue to write the results? // Did we receive the operands? 
- if (!result_queue_full && &(masku_operand_a_valid_i | fake_a_valid) && - (!vinsn_issue.use_vd_op || &masku_operand_b_valid_i)) begin + if (!result_queue_full && (&(masku_operand_alu_valid | fake_a_valid | masku_operand_vs2_seq_valid))) begin // How many elements are we committing in total? // Since we are committing bits instead of bytes, we carry out the following calculation // with ceil(vl/8) instead. @@ -832,7 +824,37 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the operands of this instruction. // At this stage, acknowledge only the first operand, "a", coming from the ALU/VMFpu. - masku_operand_a_ready_o = masku_operand_a_valid_i; + masku_operand_alu_ready = masku_operand_alu_valid; + vmsif_vmsof_vmsbf_vs2_ready = (&masku_operand_m_valid || vinsn_issue.vm) ? '1 : '0; + + if (!vinsn_issue.vm) begin + unique case (vinsn_issue.vtype.vsew) + EW8 : result_queue_be_seq = masku_operand_m_seq[NrLanes*ELENB-1:0]; + EW16: begin + for (int i = 0; i < NrLanes * ELENB / 2; i++) begin + result_queue_be_seq[2*i +: 2] = {2{bit_enable_mask[i]}}; + end + end + EW32: begin + for (int i = 0; i < NrLanes * ELENB / 4; i++) begin + result_queue_be_seq[4*i +: 4] = {4{bit_enable_mask[i]}}; + end + end + EW64: begin + for (int i = 0; i < NrLanes * ELENB / 8; i++) begin + result_queue_be_seq[8*i +: 8] = {8{bit_enable_mask[i]}}; + end + end + default: ; // Not sure what should be the default + endcase + for (int i = 0; i < NrLanes*ELENB; i++) begin + result_queue_be[shuffle_index(i, NrLanes, vinsn_issue.vtype.vsew)] = result_queue_be_seq[i]; + end + end + + if (vinsn_issue.op inside {[VMSBF: VMSIF], VID}) begin + result_queue_be = '1; + end // Store the result in the operand queue for (int unsigned lane = 0; lane < NrLanes; lane++) begin @@ -843,8 +865,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_d[result_queue_write_pnt_q][lane] = '{ wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata | alu_result[lane], - be : 
(vinsn_issue.op inside {[VMSBF:VID]}) ? '1 : be(element_cnt, vinsn_issue.vtype.vsew), - addr : (vinsn_issue.op inside {[VMSBF:VID]}) ? vaddr(vinsn_issue.vd, NrLanes) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes) + + be : (vinsn_issue.op inside {[VMSBF:VID]}) ? result_queue_be[lane*ELENB +: ELENB] : be(element_cnt, vinsn_issue.vtype.vsew), + addr : (vinsn_issue.op inside {[VIOTA:VID]}) ? vaddr(vinsn_issue.vd, NrLanes) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes) + (((vinsn_issue.vl - issue_cnt_q) / NrLanes / DataWidth)), id : vinsn_issue.id }; @@ -858,9 +880,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( if (vrf_pnt_d == DataWidth*NrLanes || vrf_pnt_d >= issue_cnt_q) begin result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - // Acknowledge the rest of the operands, which are accessed bit by bit. - masku_operand_b_ready_o = masku_operand_b_valid_i; - // Reset VRF pointer vrf_pnt_d = '0; @@ -877,33 +896,34 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( issue_cnt_d = '0; end end else if (vinsn_issue.op inside {[VMSBF:VID]}) begin - result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - - // Acknowledge the previous value of the destination vector register. 
- masku_operand_b_ready_o = masku_operand_b_valid_i; + if (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {VIOTA, VID}) begin + result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - // Increment result queue pointers and counters - result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) - result_queue_write_pnt_d = '0; - else - result_queue_write_pnt_d = result_queue_write_pnt_q + 1; + // Increment result queue pointers and counters + result_queue_cnt_d += 1; + if (result_queue_write_pnt_q == ResultQueueDepth-1) + result_queue_write_pnt_d = '0; + else + result_queue_write_pnt_d = result_queue_write_pnt_q + 1; - if (result_queue_read_pnt_q == ResultQueueDepth-1) - result_queue_read_pnt_d = '0; - else - result_queue_read_pnt_d = result_queue_read_pnt_m; + if (result_queue_read_pnt_q == ResultQueueDepth-1) + result_queue_read_pnt_d = '0; + else + result_queue_read_pnt_d = result_queue_read_pnt_m; - // Account for the results that were issued - issue_cnt_d = issue_cnt_q - (1 << (int'(EW64) - vinsn_issue.vtype.vsew)); - if ((vinsn_issue.vl-issue_cnt_d)*4 >= vinsn_issue.vl) - issue_cnt_d = '0; + // Account for the results that were issued + if (vinsn_issue.op inside {VIOTA, VID}) begin + issue_cnt_d = issue_cnt_q - (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew)); + if ((vinsn_issue.vl-issue_cnt_d) >= vinsn_issue.vl) + issue_cnt_d = '0; + end else begin + issue_cnt_d = issue_cnt_q - NrLanes * DataWidth; + if ((vinsn_issue.vl-issue_cnt_d) >= vinsn_issue.vl) + issue_cnt_d = '0; + end + end end else begin result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - - // Acknowledge the previous value of the destination vector register. 
- masku_operand_b_ready_o = masku_operand_b_valid_i; - // Increment result queue pointers and counters result_queue_cnt_d += 1; if (result_queue_write_pnt_q == ResultQueueDepth-1) @@ -923,13 +943,17 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( /////////////////////////// //// Masked Instruction /// /////////////////////////// - if (vinsn_commit_valid && vinsn_commit.op inside {[VMSBF:VID]}) begin - if (&masku_operand_a_valid_i && (&masku_operand_m_valid_i || vinsn_issue.vm)) begin - // if this is the last beat, commit the result to the scalar_result queue - commit_cnt_d = commit_cnt_q - (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); - if ((vinsn_commit.vl-commit_cnt_d)*4 >= vinsn_commit.vl) begin - commit_cnt_d = '0; - end + if ((|masku_operand_alu_valid && !result_queue_full) && (&masku_operand_m_valid || vinsn_issue.vm) && vinsn_commit_valid && vinsn_commit.op inside {[VIOTA:VID]}) begin + // if this is the last beat, commit the result to the scalar_result queue + commit_cnt_d = commit_cnt_q - (NrLanes << (int'(EW64) - vinsn_commit.vtype.vsew)); + if ((vinsn_commit.vl-commit_cnt_d) >= vinsn_commit.vl) begin + commit_cnt_d = '0; + end + end + if ((&masku_operand_alu_valid || &masku_operand_vs2_seq_valid) && (&masku_operand_m_valid || vinsn_issue.vm) && vinsn_commit_valid && vinsn_commit.op inside {VMSBF, VMSOF, VMSIF}) begin + commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; + if ((vinsn_commit.vl-commit_cnt_d) >= vinsn_commit.vl) begin + commit_cnt_d = '0; end end @@ -983,9 +1007,11 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( mask_queue_cnt_d -= 1; // Decrement the counter of remaining vector elements waiting to be used - commit_cnt_d = commit_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); - if (commit_cnt_q < (NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)))) - commit_cnt_d = '0; + if (vldu_mask_ready_i || vstu_mask_ready_i || sldu_mask_ready_i || vinsn_issue.vm || (vinsn_issue.vfu != VFU_MaskUnit)) begin + 
commit_cnt_d = commit_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); + if (commit_cnt_q < (NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)))) + commit_cnt_d = '0; + end end ////////////////////////////////// @@ -1030,9 +1056,11 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_d[result_queue_read_pnt_q] = '0; // Decrement the counter of remaining vector elements waiting to be written - commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; - if (commit_cnt_q < (NrLanes * DataWidth)) - commit_cnt_d = '0; + if (!(vinsn_issue.op inside {VID, VSE})) begin + commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; + if (commit_cnt_q < (NrLanes * DataWidth)) + commit_cnt_d = '0; + end end /////////////////////////// @@ -1135,6 +1163,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( mask_pnt_q <= '0; pe_resp_o <= '0; result_final_gnt_q <= '0; + vcpop_slice_cnt_q <= '0; popcount_q <= '0; vfirst_count_q <= '0; end else begin @@ -1146,6 +1175,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( mask_pnt_q <= mask_pnt_d; pe_resp_o <= pe_resp; result_final_gnt_q <= result_final_gnt_d; + vcpop_slice_cnt_q <= vcpop_slice_cnt_d; popcount_q <= popcount_d; vfirst_count_q <= vfirst_count_d; end diff --git a/hardware/src/masku/masku_operands.sv b/hardware/src/masku/masku_operands.sv new file mode 100644 index 000000000..86b5e6988 --- /dev/null +++ b/hardware/src/masku/masku_operands.sv @@ -0,0 +1,233 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Mask Unit Operands Module +// +// Author: Moritz Imfeld +// +// +// Description: +// Module takes operands coming from the lanes and then unpacks and prepares them +// for mask instruction execution. 
+// +// +// Incoming Operands: +// masku_operands_i = {v0.m, vs2, alu_result, fpu_result} +// + +module masku_operands import ara_pkg::*; import rvv_pkg::*; #( + parameter int unsigned NrLanes = 0 + ) ( + input logic clk_i, + input logic rst_ni, + + // Control logic + input masku_fu_e masku_fu_i, // signal deciding from which functional unit the result should be taken from + input pe_req_t vinsn_issue_i, + input logic [idx_width(ELEN*NrLanes):0] vrf_pnt_i, + + // Operands and operand handshake signals coming from lanes + input logic [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand_valid_i, + output logic [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand_ready_o, + input elen_t [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operands_i, + + // Operands prepared for masku execution + output elen_t [ NrLanes-1:0] masku_operand_alu_o, // ALU/FPU result (shuffled, uncompressed) + output logic [ NrLanes-1:0] masku_operand_alu_valid_o, + input logic [ NrLanes-1:0] masku_operand_alu_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_alu_seq_o, // ALU/FPU result (deshuffled, uncompressed) + output logic [ NrLanes-1:0] masku_operand_alu_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_alu_seq_ready_i, + output elen_t [ NrLanes-1:0] masku_operand_vs2_o, // vs2 (shuffled) + output logic [ NrLanes-1:0] masku_operand_vs2_valid_o, + input logic [ NrLanes-1:0] masku_operand_vs2_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_vs2_seq_o, // vs2 (deshuffled) + output logic [ NrLanes-1:0] masku_operand_vs2_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_vs2_seq_ready_i, + output elen_t [ NrLanes-1:0] masku_operand_m_o, // Mask (shuffled) + output logic [ NrLanes-1:0] masku_operand_m_valid_o, + input logic [ NrLanes-1:0] masku_operand_m_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_m_seq_o, // Mask (deshuffled) + output logic [ NrLanes-1:0] masku_operand_m_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_m_seq_ready_i, + output logic 
[NrLanes*ELEN-1:0] bit_enable_mask_o, // Bit mask for mask unit instructions (shuffled like mask register) + output logic [NrLanes*ELEN-1:0] shuffled_vl_bit_mask_o, // vl mask for mask unit instructions (first vl bits are 1, others 0) (shuffled like mask register) + output logic [NrLanes*ELEN-1:0] alu_result_compressed_o // ALU/FPU results compressed (from sew to 1-bit) (shuffled, in mask format) + ); + + // Imports + import cf_math_pkg::idx_width; + + // Local Parameter + localparam int unsigned DATAPATH_WIDTH = NrLanes * ELEN; // Mask Unit datapath width + localparam int unsigned ELEN_BYTES = ELEN / 8; + + // Helper signals + logic [DATAPATH_WIDTH-1:0] deshuffled_vl_bit_mask; // this bit enable signal is only dependent on vl + logic [DATAPATH_WIDTH-1:0] shuffled_vl_bit_mask; // this bit enable signal is only dependent on vl + vew_e bit_enable_shuffle_eew; + + elen_t [NrLanes-1:0] masku_operand_vs2_d; + logic masku_operand_vs2_lane_valid; + logic masku_operand_vs2_lane_ready; + logic masku_operand_vs2_spill_valid; + logic masku_operand_vs2_spill_ready; + + + // Extract operands from input (input comes in "shuffled form" from the lanes) + for (genvar lane = 0; lane < NrLanes; lane++) begin + assign masku_operand_m_o[lane] = masku_operands_i[lane][0]; + assign masku_operand_vs2_d[lane] = masku_operands_i[lane][1]; + assign masku_operand_alu_o[lane] = masku_operands_i[lane][2 + masku_fu_i]; + end + + // ---------- + // Deshuffle vs2 + // ---------- + always_comb begin + masku_operand_m_seq_o = '0; + masku_operand_vs2_seq_o = '0; + masku_operand_alu_seq_o = '0; + for (int b = 0; b < (NrLanes * ELEN_BYTES); b++) begin + automatic int deshuffle_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew); + automatic int deshuffle_m_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.eew_vmask); + automatic int lane_idx = b / ELEN_BYTES; // rounded down to nearest integer + automatic int lane_offset = b % ELEN_BYTES; + masku_operand_alu_seq_o[8*deshuffle_idx +: 8] = 
masku_operand_alu_o[lane_idx][8*lane_offset +: 8]; + masku_operand_vs2_seq_o[8*deshuffle_idx +: 8] = masku_operand_vs2_o[lane_idx][8*lane_offset +: 8]; + masku_operand_m_seq_o[8*deshuffle_m_idx +: 8] = masku_operand_m_o[lane_idx][8*lane_offset +: 8]; + end + end + + always_comb begin + masku_operand_vs2_spill_ready = 1'b1; + for (int lane = 0; lane < NrLanes; lane++) begin + masku_operand_vs2_spill_ready &= masku_operand_vs2_ready_i[lane] | masku_operand_vs2_seq_ready_i[lane]; + end + end + + spill_register #( + .T ( elen_t [NrLanes-1:0] ), + .Bypass ( 1'b0 ) + ) i_spill_register_vs2 ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (masku_operand_vs2_lane_valid), + .ready_o (masku_operand_vs2_lane_ready), + .data_i (masku_operand_vs2_d), + .valid_o (masku_operand_vs2_spill_valid), + .ready_i (masku_operand_vs2_spill_ready), + .data_o (masku_operand_vs2_o) + ); + + for (genvar lane = 0; lane < NrLanes; lane++) begin + assign masku_operand_vs2_valid_o[lane] = masku_operand_vs2_spill_valid; + assign masku_operand_vs2_seq_valid_o[lane] = masku_operand_vs2_spill_valid; + end + + always_comb begin + masku_operand_vs2_lane_valid = 1'b1; + for (int lane = 0; lane < NrLanes; lane++) begin + masku_operand_vs2_lane_valid &= masku_operand_valid_i[lane][1]; + end + end + + // ------------------------------------------------ + // Generate shuffled and unshuffled bit level masks + // ------------------------------------------------ + + // Generate shuffled bit level mask + assign bit_enable_shuffle_eew = vinsn_issue_i.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]} ? 
vinsn_issue_i.vtype.vsew : vinsn_issue_i.eew_vd_op; + + always_comb begin + // Default assignments + deshuffled_vl_bit_mask = '0; + shuffled_vl_bit_mask = '0; + bit_enable_mask_o = '0; + + // Generate deshuffled vl bit mask + for (int unsigned i = 0; i < DATAPATH_WIDTH; i++) begin + if (i < vinsn_issue_i.vl) begin + deshuffled_vl_bit_mask[i] = 1'b1; + end + end + + for (int unsigned b = 0; b < NrLanes * ELEN_BYTES; b++) begin + // local helper signals + logic [idx_width(DATAPATH_WIDTH)-1:0] src_operand_byte_shuffle_index; + logic [idx_width(DATAPATH_WIDTH)-1:0] mask_operand_byte_shuffle_index; + logic [ idx_width(NrLanes)-1:0] mask_operand_byte_shuffle_lane_index; + logic [ idx_width(ELEN_BYTES)-1:0] mask_operand_byte_shuffle_lane_offset; + + // get shuffle idices + // Note: two types of shuffle indices are needed because the source operand and the + // mask register might not have the same effective element width (eew) + src_operand_byte_shuffle_index = shuffle_index(b, NrLanes, bit_enable_shuffle_eew); + mask_operand_byte_shuffle_index = shuffle_index(b, NrLanes, vinsn_issue_i.eew_vmask); + mask_operand_byte_shuffle_lane_index = mask_operand_byte_shuffle_index[idx_width(ELEN_BYTES) +: idx_width(NrLanes)]; + mask_operand_byte_shuffle_lane_offset = mask_operand_byte_shuffle_index[idx_width(ELEN_BYTES)-1:0]; + + // shuffle bit enable + shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8] = deshuffled_vl_bit_mask[8*b +: 8]; + + // Generate bit-level mask + bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] = shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8]; + if (!vinsn_issue_i.vm && !(vinsn_issue_i.op inside {VMADC, VMSBC})) begin // exception for VMADC and VMSBC, because they use the mask register as a source operand (and not as a mask) + bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] &= masku_operand_m_o[mask_operand_byte_shuffle_lane_index][8*mask_operand_byte_shuffle_lane_offset +: 8]; + end + end + end + + assign 
shuffled_vl_bit_mask_o = shuffled_vl_bit_mask; + + + // ------------------------------------------- + // Compress ALU/FPU results into a mask vector + // ------------------------------------------- + always_comb begin + alu_result_compressed_o = '0; + for (int b = 0; b < ELEN_BYTES * NrLanes; b++) begin + if ((b % (1 << vinsn_issue_i.vtype.vsew)) == '0) begin + automatic int src_byte = shuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew); + automatic int src_byte_lane = src_byte[idx_width(ELEN_BYTES) +: idx_width(NrLanes)]; + automatic int src_byte_offset = src_byte[idx_width(ELEN_BYTES)-1:0]; + + automatic int dest_bit_seq = (b >> vinsn_issue_i.vtype.vsew) + vrf_pnt_i; + automatic int dest_byte_seq = dest_bit_seq / ELEN_BYTES; + automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, vinsn_issue_i.vtype.vsew); + alu_result_compressed_o[ELEN_BYTES * dest_byte + dest_bit_seq[idx_width(ELEN_BYTES)-1:0]] = masku_operand_alu_o[src_byte_lane][8 * src_byte_offset]; + end + end + end + + + // Control + for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_unpack_masku_operands + // immediately acknowledge operands coming from functional units + assign masku_operand_alu_valid_o[lane] = masku_operand_valid_i[lane][2 + masku_fu_i]; + + assign masku_operand_m_valid_o[lane] = masku_operand_valid_i[lane][0]; + + assign masku_operand_m_seq_valid_o[lane] = masku_operand_valid_i[lane][0]; + end: gen_unpack_masku_operands + + + // assign the operand_ready signal that goes to the lane operand queues + always_comb begin + // by default, assign '0 to operand ready signals + masku_operand_ready_o = '0; + for (int lane = 0; lane < NrLanes; lane++) begin + // Acknowledge alu operand + for (int operand_fu = 0; operand_fu < NrMaskFUnits; operand_fu++) begin + masku_operand_ready_o[lane][2 + operand_fu] = (masku_fu_e'(operand_fu) == masku_fu_i) && masku_operand_alu_ready_i[lane]; + end + // Acknowledge vs2 operands + masku_operand_ready_o[lane][1] = masku_operand_vs2_lane_ready; + 
// Acknowledge mask operand + masku_operand_ready_o[lane][0] = masku_operand_m_ready_i[lane]; + end + end + + +endmodule : masku_operands diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv index a7b384ef9..45c4bda3b 100644 --- a/hardware/src/sldu/sldu.sv +++ b/hardware/src/sldu/sldu.sv @@ -733,13 +733,14 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( result_queue_write_pnt_d = NP2_BUFFER_PNT; // Prepare the read pointer result_queue_read_pnt_d = NP2_RESULT_PNT; - // Setup the mux sel as soon as we get one operand - if (sldu_operand_valid_i[0]) + // Setup the mux sel as soon as we get the operands + if (&(sldu_operand_valid_i | sldu_operand_valid)) np2_loop_mux_sel_d = NP2_LOOP_SEL; // Setup the p2-stride generator p2_stride_gen_stride_d = stride_t'(vinsn_issue_q.stride >> vinsn_issue_q.vtype.vsew); p2_stride_gen_valid_d = 1'b1; // Start processing the first VRF chunk as soon as the result queue is completely empty + // and the VRF chunk is complete if (np2_loop_mux_sel_q == NP2_LOOP_SEL && result_queue_empty) begin state_d = SLIDE_NP2_RUN; end diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index a25d086a1..824834199 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -32,9 +32,11 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( input pe_req_t pe_req_i, input logic pe_req_valid_i, input logic [NrVInsn-1:0] pe_vinsn_running_i, - output logic addrgen_error_o, + output ariane_pkg::exception_t addrgen_exception_o, output logic addrgen_ack_o, - output vlen_t addrgen_error_vl_o, + output vlen_t addrgen_exception_vstart_o, + output logic addrgen_exception_load_o, + output logic addrgen_exception_store_o, // Interface with the load/store units output addrgen_axi_req_t axi_addrgen_req_o, output logic axi_addrgen_req_valid_o, @@ -117,7 +119,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( axi_addr_t idx_final_addr_d, idx_final_addr_q; elen_t idx_addr; logic idx_op_error_d, 
idx_op_error_q; - vlen_t addrgen_error_vl_d; + vlen_t addrgen_exception_vstart_d; // Pointer to point to the correct logic [$clog2(NrLanes)-1:0] word_lane_ptr_d, word_lane_ptr_q; @@ -177,7 +179,11 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Nothing to acknowledge addrgen_ack_o = 1'b0; - addrgen_error_o = 1'b0; + addrgen_exception_o.valid = 1'b0; + addrgen_exception_o.tval = '0; + addrgen_exception_o.cause = '0; + addrgen_exception_load_o = 1'b0; + addrgen_exception_store_o = 1'b0; // No valid words for the spill register idx_addr_valid_d = 1'b0; @@ -240,7 +246,9 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin state_d = IDLE; addrgen_ack_o = 1'b1; - addrgen_error_o = 1'b1; + addrgen_exception_o.valid = 1'b1; + addrgen_exception_o.cause = riscv::ILLEGAL_INSTR; + addrgen_exception_o.tval = '0; end else begin addrgen_req = '{ addr : pe_req_q.scalar_op, @@ -356,10 +364,16 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( word_lane_ptr_d = '0; // Raise an error if necessary if (idx_op_error_q) begin - addrgen_error_o = 1'b1; + addrgen_exception_o.valid = 1'b1; + addrgen_exception_o.cause = riscv::ILLEGAL_INSTR; + addrgen_exception_o.tval = '0; end end endcase + if ( addrgen_exception_o.valid & addrgen_ack_o ) begin + addrgen_exception_load_o = is_load(pe_req_q.op); + addrgen_exception_store_o = !is_load(pe_req_q.op); + end end always_ff @(posedge clk_i or negedge rst_ni) begin @@ -372,7 +386,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_q <= '0; last_elm_subw_q <= '0; idx_op_error_q <= '0; - addrgen_error_vl_o <= '0; + addrgen_exception_vstart_o <= '0; end else begin state_q <= state_d; pe_req_q <= pe_req_d; @@ -382,7 +396,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( idx_op_cnt_q <= idx_op_cnt_d; last_elm_subw_q <= last_elm_subw_d; idx_op_error_q <= idx_op_error_d; - addrgen_error_vl_o <= addrgen_error_vl_d; + 
addrgen_exception_vstart_o <= addrgen_exception_vstart_d; end end @@ -452,7 +466,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( eff_axi_dw_log_d = eff_axi_dw_log_q; idx_addr_ready_d = 1'b0; - addrgen_error_vl_d = '0; + addrgen_exception_vstart_d = '0; // No error by default idx_op_error_d = 1'b0; @@ -752,7 +766,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Generate an error idx_op_error_d = 1'b1; // Forward next vstart info to the dispatcher - addrgen_error_vl_d = addrgen_req.len - axi_addrgen_q.len - 1; + addrgen_exception_vstart_d = addrgen_req.len - axi_addrgen_q.len - 1; addrgen_req_ready = 1'b1; axi_addrgen_state_d = AXI_ADDRGEN_IDLE; end diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index b6904850c..7b1056667 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -42,8 +42,8 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( output logic [1:0] pe_req_ready_o, // Load (0) and Store (1) units output pe_resp_t [1:0] pe_resp_o, // Load (0) and Store (1) units output logic addrgen_ack_o, - output logic addrgen_error_o, - output vlen_t addrgen_error_vl_o, + output ariane_pkg::exception_t addrgen_exception_o, + output vlen_t addrgen_exception_vstart_o, // Interface with the lanes // Store unit operands input elen_t [NrLanes-1:0] stu_operand_i, @@ -69,6 +69,11 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrLanes-1:0] ldu_result_final_gnt_i ); + logic load_complete, store_complete; + logic addrgen_exception_load, addrgen_exception_store; + assign load_complete_o = load_complete | addrgen_exception_load; + assign store_complete_o = store_complete | addrgen_exception_store; + /////////////////// // Definitions // /////////////////// @@ -133,8 +138,10 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_req_valid_i (pe_req_valid_i ), .pe_vinsn_running_i (pe_vinsn_running_i ), .addrgen_ack_o (addrgen_ack_o ), - .addrgen_error_o (addrgen_error_o ), - 
.addrgen_error_vl_o (addrgen_error_vl_o ), + .addrgen_exception_o ( addrgen_exception_o ), + .addrgen_exception_vstart_o ( addrgen_exception_vstart_o ), + .addrgen_exception_load_o ( addrgen_exception_load ), + .addrgen_exception_store_o ( addrgen_exception_store ), // Interface with the lanes .addrgen_operand_i (addrgen_operand_i ), .addrgen_operand_target_fu_i(addrgen_operand_target_fu_i), @@ -165,7 +172,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_r_valid_i (axi_resp.r_valid ), .axi_r_ready_o (axi_req.r_ready ), // Interface with the dispatcher - .load_complete_o (load_complete_o ), + .load_complete_o (load_complete ), // Interface with the main sequencer .pe_req_i (pe_req_i ), .pe_req_valid_i (pe_req_valid_i ), @@ -213,7 +220,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_b_ready_o (axi_req.b_ready ), // Interface with the dispatcher .store_pending_o (store_pending_o ), - .store_complete_o (store_complete_o ), + .store_complete_o (store_complete ), // Interface with the main sequencer .pe_req_i (pe_req_i ), .pe_req_valid_i (pe_req_valid_i ),