From 55fded5b14550a0682573049cc1ea9195f869f25 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Sat, 9 Nov 2024 14:23:50 +0100
Subject: [PATCH 1/8] [hardware] Simplify the operand queues

---
 hardware/src/lane/operand_queues_stage.sv | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv
index 1445236c0..d03e8a6a4 100644
--- a/hardware/src/lane/operand_queues_stage.sv
+++ b/hardware/src/lane/operand_queues_stage.sv
@@ -57,7 +57,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (ValuInsnQueueDepth ),
     .DataBufDepth (5 ),
-    .FPUSupport (FPUSupport ),
+    .FPUSupport (FPUSupportNone ),
     .NrLanes (NrLanes ),
     .VLEN (VLEN ),
     .SupportIntExt2 (1'b1 ),
@@ -86,7 +86,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (ValuInsnQueueDepth ),
     .DataBufDepth (5 ),
-    .FPUSupport (FPUSupport ),
+    .FPUSupport (FPUSupportNone ),
     .NrLanes (NrLanes ),
     .VLEN (VLEN ),
     .SupportIntExt2 (1'b1 ),
@@ -204,7 +204,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (VstuInsnQueueDepth + MaskuInsnQueueDepth),
     .DataBufDepth (2 ),
-    .FPUSupport (FPUSupport ),
+    .FPUSupport (FPUSupportNone ),
     .NrLanes (NrLanes ),
     .VLEN (VLEN ),
     .operand_queue_cmd_t(operand_queue_cmd_t )
@@ -248,7 +248,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (VlduInsnQueueDepth ),
     .DataBufDepth (2 ),
-    .FPUSupport (FPUSupport ),
+    .FPUSupport (FPUSupportNone ),
     .NrLanes (NrLanes ),
     .VLEN (VLEN ),
     .operand_queue_cmd_t(operand_queue_cmd_t )
@@ -276,7 +276,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (MaskuInsnQueueDepth ),
     .DataBufDepth (1 ),
-    .FPUSupport (FPUSupport ),
+    .FPUSupport (FPUSupportNone ),
     .SupportIntExt2 (1'b1 ),
     .SupportIntExt4 (1'b1 ),
     .SupportIntExt8 (1'b1 ),
@@ -303,6 +303,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   operand_queue #(
     .CmdBufDepth (MaskuInsnQueueDepth ),
     .DataBufDepth (1 ),
+    .FPUSupport (FPUSupportNone ),
     .NrLanes (NrLanes ),
     .VLEN (VLEN ),
     .operand_queue_cmd_t(operand_queue_cmd_t )

From a570818cf68ab60978cd11eb5573fd7048a01619 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Sat, 9 Nov 2024 14:24:12 +0100
Subject: [PATCH 2/8] [hardware] Slim down addrgen check function

---
 hardware/src/vlsu/addrgen.sv | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv
index ed3bc59bc..005116546 100644
--- a/hardware/src/vlsu/addrgen.sv
+++ b/hardware/src/vlsu/addrgen.sv
@@ -92,8 +92,13 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
   import axi_pkg::CACHE_MODIFIABLE;

   // Check if the address is aligned to a particular width
+  // Max element width: 8 bytes
   function automatic logic is_addr_error(axi_addr_t addr, logic [1:0] vew);
-    is_addr_error = |(addr & (elen_t'(1 << vew) - 1));
+    // log2(MAX_ELEMENT_WIDTH_BYTE)
+    localparam LOG2_MAX_SEW_BYTE = 3;
+    typedef logic [LOG2_MAX_SEW_BYTE:0] max_sew_byte_t;
+
+    is_addr_error = |(max_sew_byte_t'(addr[LOG2_MAX_SEW_BYTE-1:0]) & (max_sew_byte_t'(1 << vew) - 1));
   endfunction // is_addr_error

   ////////////////////
@@ -332,7 +337,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
        };

        // Ara does not support misaligned AXI requests
-        if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew)) begin
+        if (is_addr_error(pe_req_q.scalar_op, pe_req_q.vtype.vsew[1:0])) begin
          state_d = IDLE;
          addrgen_ack_o = 1'b1;
          addrgen_exception_o.valid = 1'b1;
@@ -926,7 +931,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
      // Check if the virtual address generates an exception
      // NOTE: we can do this even before address translation, since the
      // page offset (2^12) is the same for both physical and virtual addresses
-      if (is_addr_error(idx_final_vaddr_q, axi_addrgen_q.vew)) begin : eew_misaligned_error
+      if (is_addr_error(idx_final_vaddr_q, axi_addrgen_q.vew[1:0])) begin : eew_misaligned_error
        // Generate an error
        idx_op_error_d = 1'b1;
        // Forward next vstart info to the dispatcher

From 66c4f34193935b05ffc34ee7c2331a212241e874 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Sat, 9 Nov 2024 14:24:41 +0100
Subject: [PATCH 3/8] [hardware] Replace stream regs with spill regs

---
 hardware/src/lane/valu.sv  | 4 +---
 hardware/src/lane/vmfpu.sv | 4 +---
 hardware/src/sldu/sldu.sv  | 4 +---
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv
index 7f2be6614..d3ce82bee 100644
--- a/hardware/src/lane/valu.sv
+++ b/hardware/src/lane/valu.sv
@@ -180,13 +180,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
   assign mask_operand_gnt = mask_operand_ready && result_queue_q[result_queue_read_pnt_q].mask &&
     result_queue_valid_q[result_queue_read_pnt_q];

-  stream_register #(
+  spill_register #(
     .T(elen_t)
   ) i_mask_operand_register (
     .clk_i (clk_i ),
     .rst_ni (rst_ni ),
-    .clr_i (1'b0 ),
-    .testmode_i(1'b0 ),
     .data_o (mask_operand_o ),
     .valid_o (mask_operand_valid_o ),
     .ready_i (mask_operand_ready_i ),
diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv
index 4822ce7b7..fdf10363a 100644
--- a/hardware/src/lane/vmfpu.sv
+++ b/hardware/src/lane/vmfpu.sv
@@ -245,13 +245,11 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
   assign mask_operand_gnt = mask_operand_ready && result_queue_q[result_queue_read_pnt_q].mask &&
     result_queue_valid_q[result_queue_read_pnt_q];

-  stream_register #(
+  spill_register #(
     .T(elen_t)
   ) i_mask_operand_register (
     .clk_i (clk_i ),
     .rst_ni (rst_ni ),
-    .clr_i (1'b0 ),
-    .testmode_i(1'b0 ),
     .data_o (mask_operand_o ),
     .valid_o (mask_operand_valid_o ),
     .ready_i (mask_operand_ready_i ),
diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv
index 423ee092a..66527b442 100644
--- a/hardware/src/sldu/sldu.sv
+++ b/hardware/src/sldu/sldu.sv
@@ -239,13 +239,11 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
   logic [NrLanes-1:0] mask_ready_q;

   for (genvar l = 0; l < NrLanes; l++) begin
-    stream_register #(
+    spill_register #(
       .T(strb_t)
     ) i_mask_operand_register (
       .clk_i (clk_i ),
       .rst_ni (rst_ni ),
-      .clr_i (1'b0 ),
-      .testmode_i(1'b0 ),
       .data_o (mask_q[l] ),
       .valid_o (mask_valid_q[l] ),
       .ready_i (mask_ready_d ),

From 5995d455ed7d76b060d2f2e71ee75134a46dc0a4 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Mon, 25 Nov 2024 14:28:36 +0100
Subject: [PATCH 4/8] [hardware] Fix some latches

---
 hardware/src/lane/operand_queue.sv |  2 +-
 hardware/src/lane/vmfpu.sv         |  2 +-
 hardware/src/vlsu/addrgen.sv       | 10 ++++++++--
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv
index 640d77f14..0ccdcc6ae 100644
--- a/hardware/src/lane/operand_queue.sv
+++ b/hardware/src/lane/operand_queue.sv
@@ -209,7 +209,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
      last_packet = 1'b0;

      for (int i = 0; i < 2; i++) fp16[i] = '0;
-      for (int i = 0; i < 1; i++) fp32[i] = '0;
+      fp32 = '0;

      // Reductions need to mask away the inactive elements
      // A temporary solution is to send a neutral value directly
diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv
index fdf10363a..bbeb78f32 100644
--- a/hardware/src/lane/vmfpu.sv
+++ b/hardware/src/lane/vmfpu.sv
@@ -1391,7 +1391,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
        : {vinsn_issue_q.use_vd_op, vinsn_issue_q.use_vs2, vinsn_issue_q.use_vs1};

      for (int i = 0; i < 2; i++) fp16[i] = '0;
-      for (int i = 0; i < 1; i++) fp32[i] = '0;
+      fp32 = '0;

      first_op_d = first_op_q;
      simd_red_cnt_d = simd_red_cnt_q;
diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv
index 005116546..2a21199a3 100644
--- a/hardware/src/vlsu/addrgen.sv
+++ b/hardware/src/vlsu/addrgen.sv
@@ -232,8 +232,14 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
    addrgen_req_valid = 1'b0;

    // Nothing to acknowledge
-    addrgen_ack_o = 1'b0;
-    addrgen_exception_o = '0;
+    addrgen_ack_o = 1'b0;
+    addrgen_exception_o = '0;
+    addrgen_exception_o.valid = 1'b0;
+    addrgen_exception_o.gva = '0;
+    addrgen_exception_o.tinst = '0;
+    addrgen_exception_o.tval = '0;
+    addrgen_exception_o.tval2 = '0;
+    addrgen_exception_o.cause = '0;
    addrgen_illegal_load_o = 1'b0;
    addrgen_illegal_store_o = 1'b0;

From b04759de09180f113dd0afe49d3904b9e96f6539 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Sat, 9 Nov 2024 15:16:55 +0100
Subject: [PATCH 5/8] [apps] Improve MASKU riscv-tests

---
 apps/riscv-tests/isa/macros/vector/dataset.h |   2 +-
 .../isa/macros/vector/vector_macros.h        |  11 +
 apps/riscv-tests/isa/rv64uv/vcpop.c          |  86 +++++-
 apps/riscv-tests/isa/rv64uv/vid.c            |  13 +-
 apps/riscv-tests/isa/rv64uv/viota.c          | 247 +++++++++++++++++-
 apps/riscv-tests/isa/rv64uv/vmsbf.c          | 135 +++++++++-
 apps/riscv-tests/isa/rv64uv/vmseq.c          |  59 +++++
 apps/riscv-tests/isa/rv64uv/vmsif.c          |  50 +++-
 apps/riscv-tests/isa/rv64uv/vmsof.c          |  18 +-
 apps/script/viota.py                         |  57 ++++
 10 files changed, 652 insertions(+), 26 deletions(-)
 create mode 100755 apps/script/viota.py

diff --git a/apps/riscv-tests/isa/macros/vector/dataset.h b/apps/riscv-tests/isa/macros/vector/dataset.h
index 90baacdae..cfad9409f 100644
--- a/apps/riscv-tests/isa/macros/vector/dataset.h
+++ b/apps/riscv-tests/isa/macros/vector/dataset.h
@@ -7,7 +7,7 @@
 #ifndef __DATASET_H__
 #define __DATASET_H__

-#define SIZE 64
+#define SIZE 1024
 #define L_SIZE 1024

 static volatile uint64_t Au64[SIZE] __attribute__((aligned(128)));
diff --git a/apps/riscv-tests/isa/macros/vector/vector_macros.h b/apps/riscv-tests/isa/macros/vector/vector_macros.h
index d303a7cb9..9193717e1 100644
--- a/apps/riscv-tests/isa/macros/vector/vector_macros.h
+++ b/apps/riscv-tests/isa/macros/vector/vector_macros.h
@@ -209,6 +209,17 @@ int test_case;
    asm volatile("vsetvl zero, %[vl], %[vtype]" :: [vl] "r" (vl), [vtype] "r" (vtype)); \
  } while(0)

+#define VCLEAR_AT_ONE(register) \
+  do { \
+    MEMORY_BARRIER; \
+    uint64_t vtype; uint64_t vl; uint64_t vlmax; \
+    asm volatile("csrr %[vtype], vtype" : [vtype] "=r" (vtype)); \
+    asm volatile("csrr %[vl], vl" : [vl] "=r" (vl)); \
+    asm volatile("vsetvl %[vlmax], zero, %[vtype]" : [vlmax] "=r" (vlmax) : [vtype] "r" (vtype)); \
+    asm volatile("vmv.v.i "#register", -1"); \
+    asm volatile("vsetvl zero, %[vl], %[vtype]" :: [vl] "r" (vl), [vtype] "r" (vtype)); \
+  } while(0)
+
 // Macro to initialize a vector with progressive values from a counter
 #define INIT_MEM_CNT(vec_name, size) \
  counter = 0; \
diff --git a/apps/riscv-tests/isa/rv64uv/vcpop.c b/apps/riscv-tests/isa/rv64uv/vcpop.c
index 0d0794db9..3c964215d 100644
--- a/apps/riscv-tests/isa/rv64uv/vcpop.c
+++ b/apps/riscv-tests/isa/rv64uv/vcpop.c
@@ -15,24 +15,90 @@ void TEST_CASE1(void) {
  VLOAD_32(v0, 5, 0, 0, 0);
  volatile uint32_t scalar = 1337;
  volatile uint32_t OUP[] = {0, 0, 0, 0};
-  __asm__ volatile("vpopc.m %[A], v2, v0.t \n"
-                   "sw %[A], (%1) \n"
-                   :
-                   : [A] "r"(scalar), "r"(OUP));
+  asm volatile("vpopc.m %[A], v2, v0.t \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
  XCMP(1, OUP[0], 2);
+
+  VSET(32, e32, m1);
+  VLOAD_32(v8, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88,
+           0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF,
+           0x88, 0x1, 0x1F, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_32(v0, 0xffffffffffffffff, 0xfffffffffffffff7, 0xffffffffffffffff,
+           0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+           0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+           0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+           0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+           0xefffffffffffffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  VSET(1024, e8, m8);
+  asm volatile("vpopc.m %[A], v8, v0.t \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(2, OUP[0], 159);
 }

 // unmasked
 void TEST_CASE2(void) {
  VSET(4, e32, m1);
-  VLOAD_32(v2, 0xF, 0, 0, 0);
+  VLOAD_32(v2, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F);
  volatile uint32_t scalar = 1337;
  volatile uint32_t OUP[] = {0, 0, 0, 0};
-  __asm__ volatile("vpopc.m %[A], v2 \n"
-                   "sw %[A], (%1) \n"
-                   :
-                   : [A] "r"(scalar), "r"(OUP));
-  XCMP(2, OUP[0], 4);
+  VSET(128, e32, m2);
+  asm volatile("vpopc.m %[A], v2 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(3, OUP[0], 40);
+
+  VSET(8, e32, m1);
+  VLOAD_32(v0, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88,
+           0x1, 0x1F);
+  VSET(256, e8, m8);
+  asm volatile("vpopc.m %[A], v0 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(4, OUP[0], 80);
+
+  VSET(16, e32, m1);
+  VLOAD_32(v0, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88,
+           0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF,
+           0x88, 0x1, 0x1F);
+  VSET(1024, e8, m8);
+  asm volatile("vpopc.m %[A], v0 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(5, OUP[0], 160);
+
+  VSET(8, e32, m1);
+  VLOAD_32(v2, 0xFFFFFFF7FFFFFFFF, 0x88, 0x1, 0x1F, 0xFFFFFFF7FFFFFFFF, 0x88,
+           0x1, 0x1F);
+  VSET(256, e8, m1);
+  asm volatile("vpopc.m %[A], v2 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(6, OUP[0], 80);
+
+  VSET(2, e32, m1);
+  VLOAD_8(v2, 0xFF, 0x88);
+  VSET(16, e16, m1);
+  asm volatile("vcpop.m %[A], v2 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(7, OUP[0], 10);
+
+  VSET(4, e32, m1);
+  VLOAD_32(v2, 0xF, 0, 0, 0);
+  asm volatile("vpopc.m %[A], v2 \n"
+               "sw %[A], (%1) \n"
+               :
+               : [A] "r"(scalar), "r"(OUP));
+  XCMP(8, OUP[0], 4);
 }

 int main(void) {
diff --git a/apps/riscv-tests/isa/rv64uv/vid.c b/apps/riscv-tests/isa/rv64uv/vid.c
index 7db9a1fc5..796e3e166 100644
--- a/apps/riscv-tests/isa/rv64uv/vid.c
+++ b/apps/riscv-tests/isa/rv64uv/vid.c
@@ -11,6 +11,17 @@ void TEST_CASE1() {
  VSET(16, e8, m1);
  __asm__ volatile("vid.v v1");
  VCMP_U8(1, v1, 0, 1, 2, 3, 4, 5, 6, 7);
+  VSET(10, e8, m1);
+
+  VLOAD_8(v1, 0b00001110, 0b00000111, 0b00010000, 0b01001011, 0b00110100,
+          0b01111101, 0b11001100, 0b00011000, 0b01000111, 0b00010100);
+  VSET(77, e8, m1);
+  asm volatile("vid.v v2");
+  VCMP_U8(2, v2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+          18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
+          35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+          52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+          69, 70, 71, 72, 73, 74, 75, 76);
 }

 void TEST_CASE2() {
@@ -18,7 +29,7 @@ void TEST_CASE2() {
  VLOAD_8(v0, 85, 0, 0, 0, 0, 0, 0, 0);
  VCLEAR(v1);
  __asm__ volatile("vid.v v1, v0.t");
-  VCMP_U8(2, v1, 0, 0, 2, 0, 4, 0, 6, 0);
+  VCMP_U8(3, v1, 0, 0, 2, 0, 4, 0, 6, 0);
 }

 int main(void) {
diff --git a/apps/riscv-tests/isa/rv64uv/viota.c b/apps/riscv-tests/isa/rv64uv/viota.c
index 9fb17080f..bd8dbec6a 100644
--- a/apps/riscv-tests/isa/rv64uv/viota.c
+++ b/apps/riscv-tests/isa/rv64uv/viota.c
@@ -10,9 +10,208 @@
 void TEST_CASE1() {
  VSET(1, e8, m1);
  VLOAD_8(v1, 0b10001001);
-  VSET(16, e8, m1);
+  VSET(8, e8, m1);
  asm volatile("viota.m v2, v1");
  VCMP_U8(1, v2, 0, 1, 1, 1, 2, 2, 2, 2);
+
+  VSET(2, e8, m1);
+  VLOAD_8(v1, 0b01100010, 0b01001100);
+  VSET(16, e8, m1);
+  asm volatile("viota.m v2, v1");
+  VCMP_U8(2, v2, 0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 3, 4, 5, 5, 5, 6);
+
+  VSET(1, e8, m8);
+  VLOAD_8(v0, 0b00000001);
+  VSET(4, e16, m8);
+  asm volatile("viota.m v8, v0");
+  VCMP_U16(3, v8, 0, 1, 1, 1);
+
+  VSET(64, e8, m4);
+  VLOAD_8(
+      v0, 0b10101101, 0b10000000, 0b00000110, 0b10011100, 0b10010101,
+      0b01001100, 0b01101010, 0b11000100, 0b00011110, 0b10111010, 0b00100110,
+      0b11001010, 0b01101101, 0b11001010, 0b01000101, 0b00010110, 0b00001000,
+      0b10111000, 0b11011100, 0b11100000, 0b00110101, 0b10011110, 0b01001111,
+      0b01011101, 0b00001010, 0b01111000, 0b11010100, 0b01011101, 0b11000101,
+      0b10010010, 0b01011100, 0b11010101, 0b00100010, 0b10000100, 0b11001011,
+      0b01001101, 0b01010000, 0b10110011, 0b00011000, 0b10000101, 0b01110101,
+      0b00001111, 0b10111100, 0b00010101, 0b10011101, 0b11011001, 0b11010101,
+      0b00100001, 0b01101110, 0b10000001, 0b01100100, 0b00010001, 0b00010100,
+      0b00101011, 0b11111000, 0b10010000, 0b01010000, 0b01001111, 0b00000011,
+      0b10100100, 0b10001010, 0b01110011, 0b10100010, 0b01111110);
+  VSET(512, e8, m4);
+  asm volatile("viota.m v8, v0");
+  VCMP_U8(
+      4, v8, 0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 7, 8, 8, 8,
+      8, 8, 8, 8, 8, 9, 10, 11, 11, 11, 12, 13, 13, 14, 14, 15, 15, 15, 16, 16,
+      16, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 22, 23, 23, 23, 23, 24,
+      24, 24, 24, 25, 26, 26, 27, 28, 29, 30, 30, 30, 30, 30, 31, 31, 32, 33,
+      34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41,
+      42, 43, 43, 44, 45, 45, 46, 47, 47, 47, 48, 48, 49, 49, 49, 50, 51, 52,
+      52, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 57, 57, 57, 57, 57, 57, 57,
+      58, 58, 58, 58, 58, 58, 58, 58, 59, 60, 61, 61, 62, 62, 62, 63, 64, 65,
+      65, 66, 67, 67, 67, 67, 67, 67, 68, 69, 70, 71, 71, 72, 72, 73, 74, 74,
+      74, 74, 75, 76, 77, 78, 78, 78, 79, 80, 81, 82, 83, 83, 83, 84, 84, 85,
+      85, 86, 87, 88, 88, 89, 89, 89, 90, 90, 91, 91, 91, 91, 91, 91, 91, 91,
+      92, 93, 94, 95, 95, 95, 95, 96, 96, 97, 97, 98, 99, 100, 100, 101, 102,
+      103, 103, 104, 104, 105, 105, 106, 106, 106, 106, 107, 108, 108, 109, 109,
+      109, 110, 110, 110, 111, 111, 111, 112, 113, 114, 114, 115, 115, 116, 116,
+      117, 117, 118, 118, 119, 120, 120, 121, 121, 121, 121, 122, 122, 122, 122,
+      122, 123, 123, 123, 123, 123, 124, 125, 126, 126, 127, 127, 127, 128, 129,
+      130, 130, 131, 132, 132, 132, 133, 133, 133, 133, 133, 133, 134, 134, 135,
+      135, 136, 137, 137, 137, 138, 139, 139, 140, 140, 140, 140, 141, 142, 142,
+      142, 142, 143, 143, 144, 144, 144, 144, 144, 145, 146, 146, 147, 147, 148,
+      149, 150, 150, 151, 152, 153, 154, 154, 154, 154, 154, 154, 154, 155, 156,
+      157, 158, 158, 159, 160, 160, 161, 161, 162, 162, 162, 162, 163, 163, 164,
+      165, 166, 166, 166, 167, 168, 168, 168, 169, 170, 170, 171, 172, 173, 173,
+      174, 174, 175, 175, 176, 177, 178, 178, 178, 178, 178, 179, 179, 179, 179,
+      180, 181, 182, 182, 183, 184, 184, 185, 185, 185, 185, 185, 185, 185, 186,
+      186, 186, 187, 187, 187, 188, 189, 189, 190, 190, 190, 190, 191, 191, 191,
+      191, 191, 191, 192, 192, 193, 193, 193, 193, 194, 195, 195, 196, 196, 197,
+      197, 197, 197, 197, 197, 198, 199, 200, 201, 202, 202, 202, 202, 202, 203,
+      203, 203, 204, 204, 204, 204, 204, 205, 205, 206, 206, 207, 208, 209, 210,
+      210, 210, 211, 211, 212, 213, 213, 213, 213, 213, 213, 213, 213, 213, 214,
+      214, 214, 215, 215, 216, 216, 217, 217, 218, 218, 218, 218, 219, 220, 221,
+      221, 221, 222, 223, 224, 224, 224, 225, 225, 225, 225, 226, 226, 227, 227,
+      228, 229, 230, 231, 232, 233);
+
+  VSET(128, e8, m8);
+  VLOAD_8(
+      v0, 0b00000001, 0b00000011, 0b00111110, 0b00000100, 0b01101000,
+      0b10111011, 0b11010110, 0b10111111, 0b00110011, 0b00011100, 0b11010100,
+      0b00011010, 0b10100001, 0b10110100, 0b10010111, 0b01010100, 0b00011010,
+      0b01101011, 0b00101010, 0b11111111, 0b10000100, 0b11100110, 0b00100001,
+      0b01101000, 0b10110100, 0b01100010, 0b11100001, 0b10011100, 0b00110111,
+      0b01010011, 0b01010111, 0b10010001, 0b11001000, 0b01001011, 0b01000000,
+      0b10001111, 0b00001111, 0b01110100, 0b10101100, 0b00010101, 0b00110100,
+      0b10010010, 0b00001101, 0b11110011, 0b10101101, 0b10000100, 0b01111000,
+      0b11010101, 0b10110110, 0b00110110, 0b01010001, 0b01001000, 0b11100011,
+      0b01001110, 0b11101101, 0b01111000, 0b10111101, 0b00111011, 0b10111001,
+      0b11000110, 0b00000011, 0b00001110, 0b00001111, 0b00000010, 0b01010110,
+      0b00000010, 0b11011011, 0b01010100, 0b10110110, 0b10100011, 0b10100101,
+      0b11110101, 0b00000110, 0b10011111, 0b01000110, 0b00100000, 0b00100011,
+      0b11110100, 0b10111101, 0b10000010, 0b11110011, 0b00111111, 0b11000010,
+      0b00011001, 0b10000010, 0b00110011, 0b11000110, 0b11001100, 0b10011100,
+      0b11001011, 0b10101101, 0b11011110, 0b11010110, 0b11010110, 0b00100100,
+      0b01111010, 0b00111001, 0b10111000, 0b01101000, 0b00001001, 0b10010100,
+      0b11111101, 0b00001101, 0b10100111, 0b11000110, 0b01100111, 0b01010111,
+      0b10011001, 0b01100111, 0b00001011, 0b01001011, 0b10001101, 0b11110110,
+      0b10001001, 0b10010101, 0b10010010, 0b10100100, 0b01010110, 0b10110110,
+      0b10111001, 0b01010000, 0b01010001, 0b01001100, 0b11001101, 0b10111100,
+      0b11110010, 0b00001000, 0b10111000);
+  VSET(1024, e16, m8);
+  asm volatile("viota.m v8, v0");
+  VCMP_U16(
+      5, v8, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 6, 7,
+      8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 11, 12, 12, 13, 14, 14,
+      15, 16, 17, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 24, 25, 26, 27, 28,
+      29, 29, 30, 31, 32, 32, 32, 33, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+      37, 37, 37, 38, 38, 39, 39, 40, 41, 41, 42, 42, 43, 44, 44, 44, 44, 45,
+      45, 45, 45, 45, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 54,
+      54, 55, 55, 55, 56, 56, 56, 57, 57, 58, 58, 59, 59, 59, 60, 60, 61, 62,
+      62, 62, 62, 63, 64, 64, 65, 65, 66, 67, 67, 67, 68, 68, 69, 69, 70, 70,
+      70, 71, 72, 73, 74, 75, 76, 77, 78, 78, 78, 79, 79, 79, 79, 79, 80, 80,
+      81, 82, 82, 82, 83, 84, 85, 86, 86, 86, 86, 86, 87, 87, 87, 87, 87, 87,
+      88, 88, 89, 90, 90, 90, 90, 91, 91, 92, 93, 93, 94, 94, 95, 95, 95, 95,
+      96, 97, 97, 98, 98, 98, 98, 98, 99, 100, 101, 101, 101, 102, 103, 104,
+      104, 104, 105, 106, 107, 108, 108, 109, 110, 110, 110, 111, 112, 112, 112,
+      113, 113, 114, 114, 115, 116, 117, 117, 118, 118, 119, 119, 120, 120, 120,
+      120, 121, 121, 121, 122, 122, 122, 122, 123, 123, 123, 124, 125, 126, 127,
+      127, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 130, 130, 131,
+      132, 133, 134, 134, 134, 134, 135, 136, 137, 138, 139, 139, 139, 139, 139,
+      139, 139, 140, 140, 141, 142, 143, 143, 143, 143, 144, 145, 145, 146, 146,
+      147, 148, 148, 149, 149, 150, 150, 150, 150, 150, 150, 151, 151, 152, 153,
+      153, 153, 153, 154, 154, 154, 155, 155, 155, 156, 157, 157, 158, 159, 159,
+      159, 159, 159, 160, 161, 161, 161, 162, 163, 164, 165, 166, 166, 167, 168,
+      168, 169, 169, 170, 170, 170, 171, 171, 171, 171, 171, 172, 172, 172, 172,
+      173, 174, 175, 176, 176, 177, 177, 178, 178, 179, 179, 180, 181, 181, 182,
+      183, 183, 184, 185, 185, 186, 186, 187, 188, 188, 189, 190, 190, 190, 191,
+      191, 191, 191, 192, 192, 193, 193, 193, 193, 193, 194, 194, 194, 195, 195,
+      196, 197, 197, 197, 197, 198, 199, 200, 200, 201, 202, 203, 203, 203, 204,
+      204, 205, 205, 206, 207, 207, 208, 209, 210, 210, 210, 210, 211, 212, 213,
+      214, 214, 215, 215, 216, 217, 218, 219, 219, 220, 221, 222, 222, 223, 224,
+      225, 225, 225, 226, 226, 226, 227, 228, 229, 229, 230, 230, 231, 232, 232,
+      232, 232, 233, 234, 235, 236, 236, 236, 236, 236, 236, 236, 236, 237, 238,
+      239, 239, 239, 239, 239, 240, 241, 242, 243, 243, 243, 243, 243, 243, 244,
+      244, 244, 244, 244, 244, 244, 244, 245, 246, 246, 247, 247, 248, 248, 248,
+      249, 249, 249, 249, 249, 249, 249, 250, 251, 251, 252, 253, 253, 254, 255,
+      255, 255, 256, 256, 257, 257, 258, 258, 258, 259, 260, 260, 261, 262, 262,
+      263, 264, 265, 265, 265, 265, 266, 266, 267, 268, 268, 269, 269, 269, 270,
+      270, 271, 272, 272, 273, 273, 274, 275, 276, 277, 277, 278, 279, 279, 279,
+      279, 279, 279, 280, 281, 282, 283, 284, 284, 284, 285, 285, 286, 287, 287,
+      287, 287, 288, 288, 288, 288, 288, 288, 288, 289, 289, 289, 290, 291, 291,
+      291, 291, 292, 292, 292, 292, 292, 293, 293, 294, 295, 296, 297, 298, 298,
+      299, 300, 301, 302, 302, 303, 303, 304, 304, 304, 304, 304, 304, 305, 306,
+      307, 307, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 317, 317,
+      317, 318, 318, 318, 318, 318, 319, 320, 321, 321, 321, 322, 323, 323, 323,
+      323, 323, 324, 324, 324, 324, 324, 324, 325, 326, 327, 327, 327, 328, 329,
+      329, 329, 329, 330, 331, 331, 331, 331, 332, 333, 333, 333, 334, 335, 335,
+      335, 336, 337, 337, 337, 338, 339, 340, 340, 340, 341, 342, 343, 343, 344,
+      344, 344, 345, 346, 347, 347, 348, 349, 349, 350, 350, 351, 351, 352, 353,
+      354, 355, 355, 356, 357, 357, 358, 359, 359, 360, 360, 361, 362, 362, 363,
+      364, 364, 365, 365, 366, 367, 367, 367, 368, 368, 368, 369, 369, 369, 369,
+      370, 370, 371, 372, 373, 374, 374, 375, 375, 375, 376, 377, 378, 378, 378,
+      378, 378, 378, 379, 380, 381, 381, 382, 382, 382, 382, 383, 383, 384, 385,
+      385, 386, 386, 386, 387, 387, 387, 387, 387, 387, 387, 388, 388, 389, 389,
+      389, 390, 391, 391, 392, 393, 394, 395, 396, 397, 398, 398, 399, 400, 400,
+      400, 400, 400, 401, 402, 403, 403, 403, 404, 404, 405, 405, 406, 407, 407,
+      407, 407, 408, 409, 410, 411, 412, 412, 412, 413, 414, 414, 415, 416, 417,
+      417, 418, 418, 419, 419, 420, 420, 420, 421, 422, 422, 422, 423, 424, 425,
+      426, 426, 426, 427, 428, 428, 429, 430, 430, 431, 431, 431, 431, 431, 432,
+      433, 433, 434, 434, 434, 435, 435, 436, 436, 437, 438, 438, 438, 438, 439,
+      439, 440, 441, 441, 442, 443, 444, 445, 446, 446, 446, 447, 447, 447, 447,
+      448, 449, 449, 450, 450, 451, 451, 451, 452, 452, 453, 453, 453, 454, 454,
+      454, 455, 455, 455, 456, 456, 456, 457, 457, 458, 458, 459, 460, 460, 461,
+      461, 462, 462, 462, 463, 464, 464, 465, 466, 466, 467, 468, 468, 468, 469,
+      470, 471, 471, 472, 472, 472, 472, 472, 473, 473, 474, 474, 475, 475, 475,
+      475, 476, 476, 477, 477, 477, 477, 478, 479, 479, 479, 480, 480, 481, 481,
+      482, 483, 483, 483, 484, 485, 485, 485, 486, 487, 488, 489, 489, 490, 490,
+      491, 491, 491, 492, 493, 494, 495, 495, 495, 495, 496, 496, 496, 496, 496,
+      496, 496, 496, 497, 498, 499, 499);
+
+  VSET(10, e8, m1);
+  VLOAD_8(v1, 0b00001110, 0b00000111, 0b00010000, 0b01001011, 0b00110100,
+          0b01111101, 0b11001100, 0b00011000, 0b01000111, 0b00010100);
+  VSET(77, e8, m1);
+  asm volatile("viota.m v2, v1");
+  VCMP_U8(6, v2, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+          7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+          14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+          24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+          30, 31, 31);
+
+  VSET(10, e8, m1);
+  VLOAD_8(v0, 0b00001110, 0b00000111, 0b00010000, 0b01001011, 0b00110100,
+          0b01111101, 0b11001100, 0b00011000, 0b01000111, 0b00010100);
+  VSET(77, e32, m2);
+  asm volatile("viota.m v2, v0");
+  VCMP_U32(7, v2, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+           7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+           14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+           24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+           30, 31, 31);
+
+  VSET(10, e8, m1);
+  VLOAD_8(v0, 0b00001110, 0b00000111, 0b00010000, 0b01001011, 0b00110100,
+          0b01111101, 0b11001100, 0b00011000, 0b01000111, 0b00010100);
+  VSET(77, e64, m4);
+  asm volatile("viota.m v4, v0");
+  VCMP_U64(8, v4, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+           7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+           14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+           24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+           30, 31, 31);
+
+  VSET(5, e16, m1);
+  VLOAD_16(v0, 0b0000011100001110, 0b0100101100010000, 0b0111110100110100,
+           0b0001100011001100, 0b0001010001000111);
+  VSET(77, e8, m4);
+  asm volatile("viota.m v4, v0");
+  VCMP_U8(9, v4, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+          7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+          14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+          24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+          30, 31, 31);
 }

 void TEST_CASE2() {
@@ -24,7 +223,51 @@ void TEST_CASE2() {
  VLOAD_8(v0, 0b11000111);
  VSET(16, e8, m1);
  asm volatile("viota.m v2, v1, v0.t");
-  VCMP_U8(2, v2, 0, 1, 1, 3, 4, 5, 1, 1);
+  VCMP_U8(10, v2, 0, 1, 1, 3, 4, 5, 1, 1);
+
+  VSET(5, e16, m1);
+  VLOAD_16(v8, 0b0000011100001110, 0b0100101100010000, 0b0111110100110100,
+           0b0001100011001100, 0b0001010001000111);
+  VSET(10, e16, m1);
+  VLOAD_8(v0, 0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b11111111,
+          0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b00001111);
+  VSET(77, e8, m4);
+  VCLEAR(v4);
+  asm volatile("viota.m v4, v8, v0.t");
+  VCMP_U8(11, v4, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+          7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+          14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+          24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+          30, 31, 0);
+
+  VSET(5, e16, m1);
+  VLOAD_16(v8, 0b0000011100001110, 0b0100101100010000, 0b0111110100110100,
+           0b0001100011001100, 0b0001010001000111);
+  VSET(10, e16, m1);
+  VLOAD_8(v0, 0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b11111111,
+          0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b00001011);
+  VSET(77, e8, m4);
+  VCLEAR(v4);
+  asm volatile("viota.m v4, v8, v0.t");
+  VCMP_U8(12, v4, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+          7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+          14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+          24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30, 0,
+          30, 0);
+
+  VSET(5, e16, m1);
+  VLOAD_16(v8, 0b0000011100001110, 0b0100101100010000, 0b0111110100110100,
+           0b0001100011001100, 0b0001010001000111);
+  VSET(10, e16, m1);
+  VLOAD_8(v0, 0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b11111111,
+          0b11111111, 0b11111111, 0b11111111, 0b11111111, 0b11111111);
+  VSET(77, e8, m4);
+  asm volatile("viota.m v4, v8, v0.t");
+  VCMP_U8(13, v4, 0, 0, 1, 2, 3, 3, 3, 3, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+          7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 14, 14,
+          14, 15, 15, 16, 17, 18, 19, 20, 20, 20, 20, 21, 22, 22, 22, 23, 24,
+          24, 24, 24, 25, 26, 26, 26, 26, 27, 28, 29, 29, 29, 29, 30, 30, 30,
+          30, 31, 31);
 }

 int main(void) {
diff --git a/apps/riscv-tests/isa/rv64uv/vmsbf.c b/apps/riscv-tests/isa/rv64uv/vmsbf.c
index 7a4e15e9f..eb92f3ddb 100644
--- a/apps/riscv-tests/isa/rv64uv/vmsbf.c
+++ b/apps/riscv-tests/isa/rv64uv/vmsbf.c
@@ -10,17 +10,141 @@
 void TEST_CASE1() {
  VSET(16, e8, m1);
  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
-  __asm__ volatile("vmsbf.m v2, v3");
-  VCMP_U8(1, v2, 7, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsbf.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(1, v2, 7, 0);
 }

 void TEST_CASE2() {
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 4, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsbf.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(2, v2, 3, 0);
+}
+
+void TEST_CASE3() {
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsbf.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(3, v2, 0xff, 0xff);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsbf.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(4, v2, 0, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x08, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  VSET(16, e32, m1);
+  asm volatile("vmsbf.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(5, v2, 0x07, 0x00);
+
+  VSET(8, e64, m2);
+  VLOAD_64(v4, 0, 0, 0, 0, 0, 1, 0, 0);
+  VSET(512, e8, m2);
+  asm volatile("vmsbf.m v2, v4");
+  VSET(16, e32, m2);
+  VCMP_U32(6, v2, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+           0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0,
+           0, 0, 0);
+
+  VSET(16, e64, m2);
+  VLOAD_64(v4, 0, 0, 0, 0, 0, 1, 0, 0, 1685, 0, 0, 1, 0, 0, 0, 0);
+  VSET(1024, e8, m2);
+  asm volatile("vmsbf.m v2, v4");
+  VSET(32, e32, m2);
+  VCMP_U32(7, v2, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+           0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0,
+           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+void TEST_CASE4() {
  VSET(16, e8, m1);
  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
  VLOAD_8(v0, 3, 0, 0, 0, 0, 0, 0, 0);
  VCLEAR(v2);
-  __asm__ volatile("vmsbf.m v2, v3, v0.t");
-  VCMP_U8(2, v2, 3, 0, 0, 0, 0, 0, 0, 0);
+  VSET(2, e8, m1);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VCMP_U8(8, v2, 3, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 5, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  VSET(16, e8, m1);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(9, v2, 5, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x18, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xf7, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(10, v2, 0x07, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x18, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xef, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(11, v2, 0x07, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x8, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xf7, 0xff, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(12, v2, 0xf7, 0xff);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x38, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xf7, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(13, v2, 0x7, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 5, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  VSET(16, e8, m1);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VCMP_U8(14, v2, 5, 0);
+}
+
+void TEST_CASE5() {
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 11, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(15, v2, 3, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 11, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR_AT_ONE(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(16, v2, 0xf7, 0xff);
+
+  VSET(8, e8, m1);
+  VLOAD_8(v3, 0x94, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xC3, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsbf.m v2, v3, v0.t");
+  VSET(1, e8, m1);
+  VCMP_U8(17, v2, 0x43);
 }

 int main(void) {
@@ -29,5 +153,8 @@ int main(void) {
  INIT_CHECK();
  enable_vec();
  enable_fp();
  TEST_CASE1();
  TEST_CASE2();
+  TEST_CASE3();
+  TEST_CASE4();
+  TEST_CASE5();
  EXIT_CHECK();
 }
diff --git a/apps/riscv-tests/isa/rv64uv/vmseq.c b/apps/riscv-tests/isa/rv64uv/vmseq.c
index 582ba8f81..b881857d6 100644
--- a/apps/riscv-tests/isa/rv64uv/vmseq.c
+++ b/apps/riscv-tests/isa/rv64uv/vmseq.c
@@ -291,6 +291,64 @@ void TEST_CASE6(void) {
  VCMP_U8(24, v1, 0x10, 0x10);
 };

+void TEST_CASE7(void) {
+  VSET(16, e8, m1);
+  VLOAD_8(v2, 0xff, 0x00, 0xf0, 0x0f, 0xff, 0x00, 0xf0, 0x0f, 0xff, 0x00, 0xf0,
+          0x0f, 0xff, 0x00, 0xf0, 0x0f);
+  VLOAD_8(v3, 0xf2, 0x01, 0xf0, 0x0f, 0xf2, 0x01, 0xf0, 0x0f, 0xf2, 0x01, 0xf0,
+          0x0f, 0xf2, 0x01, 0xf0, 0x0f);
+  VLOAD_8(v0, 0xaa, 0xaa);
+  VCLEAR_AT_ONE(v1);
+  asm volatile("vmseq.vv v1, v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(25, v1, 0xdd, 0xdd);
+
+  VSET(16, e16, m1);
+  VLOAD_16(v2, 0xffff, 0x0000, 0xf0f0, 0x0f0f, 0xffff, 0x0000, 0xf0f0, 0x0f0f,
+           0xffff, 0x0000, 0xf0f0, 0x0f0f, 0xffff, 0x0000, 0xf0f0, 0x0f0f);
+  VLOAD_16(v3, 0xf2ff, 0x0100, 0xf0f0, 0x0f0f, 0xf2ff, 0x0100, 0xf0f0, 0x0f0f,
+           0xf2ff, 0x0100, 0xf0f0, 0x0f0f, 0xf2ff, 0x0100, 0xf0f0, 0x0f0f);
+  VLOAD_8(v0, 0xaa, 0xaa);
+  VCLEAR_AT_ONE(v1);
+  asm volatile("vmseq.vv v1, v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(26, v1, 0xdd, 0xdd);
+
+  VSET(16, e32, m1);
+  VLOAD_32(v2, 0xffffffff, 0x00000000, 0xf0f0f0f0, 0x0f0f0f0f, 0xffffffff,
+           0x00000000, 0xf0f0f0f0, 0x0f0f0f0f, 0xffffffff, 0x00000000,
+           0xf0f0f0f0, 0x0f0f0f0f, 0xffffffff, 0x00000000, 0xf0f0f0f0,
+           0x0f0f0f0f);
+  VLOAD_32(v3, 0xfff2ffff, 0x01000000, 0xf0f0f0f0, 0x0f0f0f0f, 0xfff2ffff,
+           0x01000000, 0xf0f0f0f0, 0x0f0f0f0f, 0xfff2ffff, 0x01000000,
+           0xf0f0f0f0, 0x0f0f0f0f, 0xfff2ffff, 0x01000000, 0xf0f0f0f0,
+           0x0f0f0f0f);
+  VLOAD_8(v0, 0xaa, 0xaa);
+  VCLEAR_AT_ONE(v1);
+  asm volatile("vmseq.vv v1, v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(27, v1, 0xdd, 0xdd);
+
+  VSET(16, e64, m1);
+  VLOAD_64(v2, 0xffffffffffffffff, 0x0000000000000000, 0xf0f0f0f0f0f0f0f0,
+           0x0f0f0f0f0f0f0f0f, 0xffffffffffffffff, 0x0000000000000000,
+           0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f, 0xffffffffffffffff,
+           0x0000000000000000, 0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f,
+           0xffffffffffffffff, 0x0000000000000000, 0xf0f0f0f0f0f0f0f0,
+           0x0f0f0f0f0f0f0f0f);
+  VLOAD_64(v3, 0xfff2ffffffffffff, 0x0100000000000000, 0xf0f0f0f0f0f0f0f0,
+           0x0f0f0f0f0f0f0f0f, 0xfff2ffffffffffff, 0x0100000000000000,
+           0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f, 0xfff2ffffffffffff,
+           0x0100000000000000, 0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f,
+           0xfff2ffffffffffff, 0x0100000000000000, 0xf0f0f0f0f0f0f0f0,
+           0x0f0f0f0f0f0f0f0f);
+  VLOAD_8(v0, 0xaa, 0xaa);
+  VCLEAR_AT_ONE(v1);
+  asm volatile("vmseq.vv v1, v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(28, v1, 0xdd, 0xdd);
+};
+
 int main(void) {
  INIT_CHECK();
  enable_vec();
@@ -301,6 +359,7 @@ int main(void) {
  TEST_CASE4();
  TEST_CASE5();
  TEST_CASE6();
+  TEST_CASE7();

  EXIT_CHECK();
 }
diff --git a/apps/riscv-tests/isa/rv64uv/vmsif.c b/apps/riscv-tests/isa/rv64uv/vmsif.c
index 7f682bba3..24a5f3fd3 100644
--- a/apps/riscv-tests/isa/rv64uv/vmsif.c
+++ b/apps/riscv-tests/isa/rv64uv/vmsif.c
@@ -10,17 +10,58 @@
 void TEST_CASE1() {
  VSET(16, e8, m1);
  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
-  __asm__ volatile("vmsif.m v2, v3");
-  VCMP_U8(1, v2, 15, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsif.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(1, v2, 15, 0);
 }

 void TEST_CASE2() {
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsif.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(2, v2, 0xff, 0xff);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsif.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(3, v2, 1, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x08, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  VSET(16, e32, m1);
+  asm volatile("vmsif.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(4, v2, 0x0F, 0x00);
+
+  VSET(8, e64, m2);
+  VLOAD_64(v4, 0, 0, 0, 0, 0, 1, 0, 0);
+  VSET(512, e8, m2);
+  asm volatile("vmsif.m v2, v4");
+  VSET(16, e32, m2);
+  VCMP_U32(5, v2, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+           0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 1, 0, 0,
+           0, 0, 0);
+
+  VSET(16, e64, m2);
+  VLOAD_64(v4, 0, 0, 0, 0, 0, 1, 0, 0, 1685, 0, 0, 1, 0, 0, 0, 0);
+  VSET(1024, e8, m4);
+  asm volatile("vmsif.m v0, v4");
+  VSET(32, e32, m2);
+  VCMP_U32(6, v0, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+           0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 1, 0, 0,
+           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+void TEST_CASE3() {
  VSET(16, e8, m1);
  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
  VLOAD_8(v0, 11, 0, 0, 0, 0, 0, 0, 0);
  VCLEAR(v2);
-  __asm__ volatile("vmsif.m v2, v3, v0.t");
-  VCMP_U8(2, v2, 11, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsif.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(7, v2, 11, 0, 0, 0, 0, 0, 0, 0);
 }

 int main(void) {
@@ -29,5 +70,6 @@ int main(void) {
  INIT_CHECK();
  enable_vec();
  enable_fp();
  TEST_CASE1();
  TEST_CASE2();
+  TEST_CASE3();
  EXIT_CHECK();
 }
diff --git a/apps/riscv-tests/isa/rv64uv/vmsof.c b/apps/riscv-tests/isa/rv64uv/vmsof.c
index b5dc5aae1..24db47531 100644
--- a/apps/riscv-tests/isa/rv64uv/vmsof.c
+++ b/apps/riscv-tests/isa/rv64uv/vmsof.c
@@ -10,8 +10,9 @@
 void TEST_CASE1() {
  VSET(16, e8, m1);
  VLOAD_8(v3, 8, 0, 0, 0, 0, 0, 0, 0);
-  __asm__ volatile("vmsof.m v2, v3");
-  VCMP_U8(1, v2, 8, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsof.m v2, v3");
+  VSET(2, e8, m1);
+  VCMP_U8(1, v2, 8, 0);
 }

 void TEST_CASE2() {
@@ -19,8 +20,17 @@ void TEST_CASE2() {
  VLOAD_8(v3, 0, 0, 0, 1, 0, 0, 0, 0);
  VLOAD_8(v0, 3, 0, 0, 0, 0, 0, 0, 0);
  VCLEAR(v2);
-  __asm__ volatile("vmsof.m v2, v3, v0.t");
-  VCMP_U8(2, v2, 0, 0, 0, 0, 0, 0, 0, 0);
+  asm volatile("vmsof.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(2, v2, 0, 0);
+
+  VSET(16, e8, m1);
+  VLOAD_8(v3, 0x38, 0, 0, 0, 0, 0, 0, 0);
+  VLOAD_8(v0, 0xf7, 0, 0, 0, 0, 0, 0, 0);
+  VCLEAR(v2);
+  asm volatile("vmsof.m v2, v3, v0.t");
+  VSET(2, e8, m1);
+  VCMP_U8(3, v2, 0x10, 0);
 }

 int main(void) {
diff --git a/apps/script/viota.py b/apps/script/viota.py
new file mode 100755
index 000000000..d74327252
--- /dev/null
+++ b/apps/script/viota.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+import random
+import sys
+
+def generate_bit_vector(size):
+    # Generate a random bit vector I of size 'size'
+    I = [random.randint(0, 1) for _ in range(size)]
+
+    # Initialize the accumulator and the output vector O
+    accumulator = 0
+    O = []
+
+    # Compute each element of O based on I and the accumulator
+    for bit in I:
+        O.append(accumulator)
+        accumulator += bit
+
+    return I, O
+
+def format_I_vector_as_binary(I, size):
+    # Format the I vector in chunks of 8 bits in reverse order
+    bin_chunks = [
+        "0b" + "".join(str(bit) for bit in I[i:i+8][::-1])
+        for i in range(0, size, 8)
+    ]
+    return ", ".join(bin_chunks)
+
+def format_O_vector(O):
+    # Format the O vector as individual elements
+    return ", ".join(f"{val}" for val in O)
+
+def generate_test_case(size):
+    # Generate I and O vectors
+    I, O = generate_bit_vector(size)
+
+    # Format I as binary strings in chunks of 8 bits and O as individual elements
+    I_formatted = format_I_vector_as_binary(I, size)
+    O_formatted = format_O_vector(O)
+
+    # Prepare the test case template
+    test_case = f"""
+void TEST_CASE1() {{
+  VSET({int(size/8)}, e8, m1);
+  VLOAD_8(v1, {I_formatted});
+  VSET({size}, e8, m1);
+  asm volatile("viota.m v2, v1");
+  VCMP_U8(1, v2, {O_formatted});
+}}
+"""
+
+    return test_case
+
+# Example of using the function
+if __name__ == "__main__":
+    size = int(sys.argv[1])
+    print(generate_test_case(size))

From 7015639c194b704d7f136861cea67e968cfa59ff Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Sat, 9 Nov 2024 14:25:17 +0100
Subject: [PATCH 6/8] [hardware] Refactor the MASKU

Comment: the lanes are not synchronized when sending operands.
Therefore, the spill regs need to handshake the lanes individually.
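A minimal sketch of the per-lane handshake this implies (illustrative only:
the masku_operand_* names below are placeholders, not the actual MASKU port
list; the spill_register interface is the same one already used in sldu.sv):

    // One spill register per lane. Every lane owns a private valid/ready
    // pair, so a stalling lane does not back-pressure the other lanes.
    for (genvar l = 0; l < NrLanes; l++) begin : gen_masku_operand_spill_regs
      spill_register #(
        .T(elen_t)
      ) i_masku_operand_spill_reg (
        .clk_i  (clk_i                   ),
        .rst_ni (rst_ni                  ),
        .data_i (masku_operand_d[l]      ),
        .valid_i(masku_operand_valid_d[l]),
        .ready_o(masku_operand_ready_o[l]),
        .data_o (masku_operand_q[l]      ),
        .valid_o(masku_operand_valid_q[l]),
        .ready_i(masku_operand_ready_i[l])
      );
    end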
--- hardware/include/ara_pkg.sv | 8 +- hardware/include/rvv_pkg.sv | 7 + hardware/src/ara_dispatcher.sv | 352 ++++-- hardware/src/lane/lane_sequencer.sv | 218 ++-- hardware/src/lane/operand_requester.sv | 18 +- hardware/src/lane/simd_alu.sv | 7 +- hardware/src/lane/valu.sv | 119 +- hardware/src/lane/vmfpu.sv | 13 +- hardware/src/masku/masku.sv | 1412 +++++++++++++----------- hardware/src/masku/masku_operands.sv | 181 +-- 10 files changed, 1362 insertions(+), 973 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 6fa695a7f..d071463f4 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -140,9 +140,15 @@ package ara_pkg; // Floating-point comparison instructions VMFEQ, VMFLE, VMFLT, VMFNE, VMFGT, VMFGE, // Integer comparison instructions - VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSBF, VMSOF, VMSIF, VIOTA, VID, VCPOP, VFIRST, VMSGT, + VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSGT, // Integer add-with-carry and subtract-with-borrow carry-out instructions VMADC, VMSBC, + // Mask to mask + VMSBF, VMSOF, VMSIF, + // Mask to non-mask + VIOTA, VID, + // Mask to scalar + VCPOP, VFIRST, // Mask operations VMANDNOT, VMAND, VMOR, VMXOR, VMORNOT, VMNAND, VMNOR, VMXNOR, // Scalar moves from VRF diff --git a/hardware/include/rvv_pkg.sv b/hardware/include/rvv_pkg.sv index 12a859408..7120ac295 100644 --- a/hardware/include/rvv_pkg.sv +++ b/hardware/include/rvv_pkg.sv @@ -161,4 +161,11 @@ package rvv_pkg; // The mask register is always vreg[0] localparam VMASK = 5'b00000; + ///////////////////////// + // VLEN restrictions // + ///////////////////////// + + // RISC-V Maximum VLEN == 64Ki + localparam int unsigned RISCV_MAX_VLEN = 1 << 16; + endpackage : rvv_pkg diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 022a35a89..1915c47db 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -69,7 +69,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( `FF(csr_vstart_q, csr_vstart_d, '0) `FF(csr_vl_q, csr_vl_d, '0) - `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, default: '0}) + `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, vsew: EW8, vlmul: LMUL_1, default: '0}) `FF(csr_vxsat_q, csr_vxsat_d, '0) `FF(csr_vxrm_q, csr_vxrm_d, '0) // Converts between the internal representation of `vtype_t` and the full XLEN-bit CSR. 
@@ -337,6 +337,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( null_vslideup = 1'b0; + vfmvfs_result = ara_resp_i.resp; + is_decoding = 1'b0; in_lane_op = 1'b0; @@ -551,7 +553,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( (csr_vtype_d.vlmul == LMUL_RSVD) || // reserved value // LMUL >= SEW/ELEN (signed'($clog2(ELENB)) + signed'(csr_vtype_d.vlmul) < signed'(csr_vtype_d.vsew))) begin - csr_vtype_d = '{vill: 1'b1, default: '0}; + csr_vtype_d = '{vill: 1'b1, vsew: EW8, vlmul: LMUL_1, default: '0}; csr_vl_d = '0; end @@ -684,22 +686,52 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase end 6'b011000: begin - ara_req.op = ara_pkg::VMSEQ; + ara_req.op = ara_pkg::VMSEQ; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011001: begin ara_req.op = ara_pkg::VMSNE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011010: begin ara_req.op = ara_pkg::VMSLTU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011011: begin ara_req.op = ara_pkg::VMSLT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011100: begin ara_req.op = ara_pkg::VMSLEU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011101: begin ara_req.op = ara_pkg::VMSLE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b010111: begin ara_req.op = ara_pkg::VMERGE; @@ -908,28 +940,68 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase end 6'b011000: begin - ara_req.op = ara_pkg::VMSEQ; + ara_req.op = ara_pkg::VMSEQ; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011001: begin ara_req.op = ara_pkg::VMSNE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011010: begin ara_req.op = ara_pkg::VMSLTU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011011: begin ara_req.op = ara_pkg::VMSLT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011100: begin ara_req.op = ara_pkg::VMSLEU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011101: begin ara_req.op = ara_pkg::VMSLE; + ara_req.use_vd_op = 1'b1; + 
ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011110: begin ara_req.op = ara_pkg::VMSGTU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011111: begin ara_req.op = ara_pkg::VMSGT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b010111: begin ara_req.op = ara_pkg::VMERGE; @@ -1078,22 +1150,52 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( endcase end 6'b011000: begin - ara_req.op = ara_pkg::VMSEQ; + ara_req.op = ara_pkg::VMSEQ; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011001: begin ara_req.op = ara_pkg::VMSNE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011100: begin ara_req.op = ara_pkg::VMSLEU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011101: begin ara_req.op = ara_pkg::VMSLE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011110: begin ara_req.op = ara_pkg::VMSGTU; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011111: begin ara_req.op = ara_pkg::VMSGT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b010111: begin ara_req.op = ara_pkg::VMERGE; @@ -1282,11 +1384,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 5'b10000: begin ara_req.op = ara_pkg::VCPOP; - ara_req.use_vs1 = 1'b0; + ara_req.eew_vs2 = eew_q[ara_req.vs2]; end 5'b10001: begin ara_req.op = ara_pkg::VFIRST; - ara_req.use_vs1 = 1'b0; + ara_req.eew_vs2 = eew_q[ara_req.vs2]; end default :; endcase @@ -1320,14 +1422,40 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end end 6'b010100: begin - ara_req.use_vd_op = 1'b1; - ara_req.use_vs1 = 1'b0; + // VMSBF, -OF, -IF, require bit-level masking + // vd is fetched for correct mask undisturbed + ara_req.use_vs1 = 1'b0; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs2 = eew_q[ara_req.vs2]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; case (insn.varith_type.rs1) - 5'b00001: ara_req.op = ara_pkg::VMSBF; - 5'b00010: ara_req.op = ara_pkg::VMSOF; - 5'b00011: ara_req.op = ara_pkg::VMSIF; - 5'b10000: ara_req.op = ara_pkg::VIOTA; - 5'b10001: ara_req.op = ara_pkg::VID; + 5'b00001: begin + ara_req.op = ara_pkg::VMSBF; + // This is a mask-to-mask operation, vsew does not have any meaning + // So, avoid reshuffling + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 5'b00010: begin + ara_req.op = ara_pkg::VMSOF; + // This is a mask-to-mask 
operation, vsew does not have any meaning + // So, avoid reshuffling + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 5'b00011: begin + ara_req.op = ara_pkg::VMSIF; + // This is a mask-to-mask operation, vsew does not have any meaning + // So, avoid reshuffling + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 5'b10000: begin + ara_req.op = ara_pkg::VIOTA; + ara_req.use_vd_op = 1'b0; + end + 5'b10001: begin + ara_req.op = ara_pkg::VID; + ara_req.use_vd_op = 1'b0; + ara_req.use_vs2 = 1'b0; + end endcase end 6'b001000: ara_req.op = ara_pkg::VAADDU; @@ -1335,63 +1463,61 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001010: ara_req.op = ara_pkg::VASUBU; 6'b001011: ara_req.op = ara_pkg::VASUB; 6'b011000: begin - ara_req.op = ara_pkg::VMANDNOT; - // Prefer mask operation on EW8 encoding - // In mask operations, vs1, vs2, vd should - // have the same encoding. - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.op = ara_pkg::VMANDNOT; + // The source operands should have the same byte encoding + // Minimize reshuffling on mask operations + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011001: begin ara_req.op = ara_pkg::VMAND; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011010: begin ara_req.op = ara_pkg::VMOR; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011011: begin ara_req.op = ara_pkg::VMXOR; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011100: begin ara_req.op = ara_pkg::VMORNOT; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011101: begin ara_req.op = ara_pkg::VMNAND; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011110: begin ara_req.op = ara_pkg::VMNOR; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 = eew_q[ara_req.vs1]; // Force reshuffle + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b011111: begin ara_req.op = ara_pkg::VMXNOR; - ara_req.eew_vs1 = EW8; - ara_req.eew_vs2 = EW8; - ara_req.eew_vd_op = EW8; - ara_req.vtype.vsew = EW8; + ara_req.eew_vs1 = eew_q[ara_req.vs1]; + ara_req.eew_vs2 
= eew_q[ara_req.vs1]; // Force reshuffle + ara_req.vtype.vsew = eew_q[ara_req.vd]; end 6'b010010: begin // VXUNARY0 // These instructions do not use vs1 @@ -1985,10 +2111,38 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_valid = 1'b0; end end - 6'b011000: ara_req.op = ara_pkg::VMFEQ; - 6'b011001: ara_req.op = ara_pkg::VMFLE; - 6'b011011: ara_req.op = ara_pkg::VMFLT; - 6'b011100: ara_req.op = ara_pkg::VMFNE; + 6'b011000: begin + ara_req.op = ara_pkg::VMFEQ; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011001: begin + ara_req.op = ara_pkg::VMFLE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011011: begin + ara_req.op = ara_pkg::VMFLT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011100: begin + ara_req.op = ara_pkg::VMFNE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end 6'b010010: begin // VFUNARY0 // These instructions do not use vs1 ara_req.use_vs1 = 1'b0; @@ -2284,20 +2438,20 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Ara can support 16-bit float, 32-bit float, 64-bit float. // Ara cannot support instructions who operates on more than 64 bits. unique case (FPUSupport) - FPUSupportHalfSingleDouble: if (int'(ara_req.vtype.vsew) < int'(EW16) || - int'(ara_req.vtype.vsew) > int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) + FPUSupportHalfSingleDouble: if (int'(csr_vtype_q.vsew) < int'(EW16) || + int'(csr_vtype_q.vsew) > int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) illegal_insn = 1'b1; - FPUSupportHalfSingle: if (int'(ara_req.vtype.vsew) < int'(EW16) || - int'(ara_req.vtype.vsew) > int'(EW32) || int'(ara_req.eew_vs2) > int'(EW32)) + FPUSupportHalfSingle: if (int'(csr_vtype_q.vsew) < int'(EW16) || + int'(csr_vtype_q.vsew) > int'(EW32) || int'(ara_req.eew_vs2) > int'(EW32)) illegal_insn = 1'b1; - FPUSupportSingleDouble: if (int'(ara_req.vtype.vsew) < int'(EW32) || - int'(ara_req.vtype.vsew) > int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) + FPUSupportSingleDouble: if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vsew) > int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) illegal_insn = 1'b1; - FPUSupportHalf: if (int'(ara_req.vtype.vsew) != int'(EW16) || int'(ara_req.eew_vs2) > int'(EW16)) + FPUSupportHalf: if (int'(csr_vtype_q.vsew) != int'(EW16) || int'(ara_req.eew_vs2) > int'(EW16)) illegal_insn = 1'b1; - FPUSupportSingle: if (int'(ara_req.vtype.vsew) != int'(EW32) || int'(ara_req.eew_vs2) > int'(EW32)) + FPUSupportSingle: if (int'(csr_vtype_q.vsew) != int'(EW32) || int'(ara_req.eew_vs2) > int'(EW32)) illegal_insn = 1'b1; - FPUSupportDouble: if (int'(ara_req.vtype.vsew) != int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) + FPUSupportDouble: if (int'(csr_vtype_q.vsew) != int'(EW64) || int'(ara_req.eew_vs2) > int'(EW64)) illegal_insn = 1'b1; default: illegal_insn = 1'b1; // Unsupported configuration endcase @@ -2365,12 +2519,54 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( skip_lmul_checks = 1'b1; end 
6'b010111: ara_req.op = ara_pkg::VMERGE; - 6'b011000: ara_req.op = ara_pkg::VMFEQ; - 6'b011001: ara_req.op = ara_pkg::VMFLE; - 6'b011011: ara_req.op = ara_pkg::VMFLT; - 6'b011100: ara_req.op = ara_pkg::VMFNE; - 6'b011101: ara_req.op = ara_pkg::VMFGT; - 6'b011111: ara_req.op = ara_pkg::VMFGE; + 6'b011000: begin + ara_req.op = ara_pkg::VMFEQ; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011001: begin + ara_req.op = ara_pkg::VMFLE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011011: begin + ara_req.op = ara_pkg::VMFLT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011100: begin + ara_req.op = ara_pkg::VMFNE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011101: begin + ara_req.op = ara_pkg::VMFGT; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end + 6'b011111: begin + ara_req.op = ara_pkg::VMFGE; + ara_req.use_vd_op = 1'b1; + ara_req.eew_vs1 = csr_vtype_q.vsew; + ara_req.eew_vs2 = csr_vtype_q.vsew; + ara_req.eew_vd_op = eew_q[ara_req.vd]; + ara_req.vtype.vsew = eew_q[ara_req.vd]; + end 6'b100100: ara_req.op = ara_pkg::VFMUL; 6'b100000: ara_req.op = ara_pkg::VFDIV; 6'b100001: ara_req.op = ara_pkg::VFRDIV; @@ -2527,16 +2723,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Ara can support 16-bit float, 32-bit float, 64-bit float. // Ara cannot support instructions that operate on more than 64 bits.
unique case (FPUSupport) - FPUSupportHalfSingleDouble: if (int'(ara_req.vtype.vsew) < int'(EW16) || - int'(ara_req.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; - FPUSupportHalfSingle: if (int'(ara_req.vtype.vsew) < int'(EW16) || - int'(ara_req.vtype.vsew) > int'(EW32)) illegal_insn = 1'b1; - FPUSupportSingleDouble: if (int'(ara_req.vtype.vsew) < int'(EW32) || - int'(ara_req.vtype.vsew) > int'(EW64)) illegal_insn = 1'b1; - FPUSupportHalf: if (int'(ara_req.vtype.vsew) != int'(EW16)) illegal_insn = 1'b1; - FPUSupportSingle: if (int'(ara_req.vtype.vsew) != int'(EW32)) + FPUSupportHalfSingleDouble: if (int'(csr_vtype_q.vsew) < int'(EW16) || + int'(csr_vtype_q.vsew) > int'(EW64)) illegal_insn = 1'b1; + FPUSupportHalfSingle: if (int'(csr_vtype_q.vsew) < int'(EW16) || + int'(csr_vtype_q.vsew) > int'(EW32)) illegal_insn = 1'b1; + FPUSupportSingleDouble: if (int'(csr_vtype_q.vsew) < int'(EW32) || + int'(csr_vtype_q.vsew) > int'(EW64)) illegal_insn = 1'b1; + FPUSupportHalf: if (int'(csr_vtype_q.vsew) != int'(EW16)) illegal_insn = 1'b1; + FPUSupportSingle: if (int'(csr_vtype_q.vsew) != int'(EW32)) illegal_insn = 1'b1; - FPUSupportDouble: if (int'(ara_req.vtype.vsew) != int'(EW64)) + FPUSupportDouble: if (int'(csr_vtype_q.vsew) != int'(EW64)) illegal_insn = 1'b1; default: illegal_insn = 1'b1; // Unsupported configuration endcase diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 3ddcfa6eb..6a3dd2b52 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -259,12 +259,15 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vtype : pe_req.vtype, default : '0 }; + vfu_operation_d.vtype.vsew = pe_req.op inside {[VMFEQ:VMSGT]} ? pe_req.eew_vs2 : pe_req.vtype.vsew; vfu_operation_valid_d = (vfu_operation_d.vfu != VFU_None) ? 1'b1 : 1'b0; // Vector length calculation vfu_operation_d.vl = pe_req.vl / NrLanes; // If lane_id_i < vl % NrLanes, this lane has to execute one extra micro-operation. - if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0]) vfu_operation_d.vl += 1; + // Also, if the ALU/VMFPU should pre-process data for the MASKU, force a balanced payload + if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0] || (|pe_req.vl[idx_width(NrLanes)-1:0] && pe_req.op inside {[VMFEQ:VMXNOR]})) + vfu_operation_d.vl += 1; // Calculate the start element for Lane[i]. This will be forwarded to both opqueues // and operand requesters, with some light modification in the case of a vslide. @@ -277,9 +280,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vinsn_running_d[pe_req.id] = (vfu_operation_d.vfu != VFU_None) ? 1'b1 : 1'b0; // Mute request if the instruction runs in the lane and the vl is zero. - // Exception 1: insn on mask vectors, as MASKU has to receive something from all lanes - // and the partial results come from VALU and VMFPU. - // Exception 2: during a reduction, all the lanes must cooperate anyway. + // Exception: during a reduction, all the lanes must cooperate anyway. 
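// Restating the mute condition below as a stand-alone predicate (illustrative
// sketch, assuming ara_pkg's vfu_e and ara_op_e types): a lane with no elements
// can drop ALU/MFPU work, except for reductions, which still need this lane for
// the inter-lane accumulation steps.
function automatic logic mute_lane_op(vlen_t vl, vfu_e vfu, ara_op_e op);
  return (vl == '0) && (vfu inside {VFU_Alu, VFU_MFpu})
      && !(op inside {[VREDSUM:VWREDSUM], [VFREDUSUM:VFWREDOSUM]});
endfunction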
if (vfu_operation_d.vl == '0 && (vfu_operation_d.vfu inside {VFU_Alu, VFU_MFpu}) && !(vfu_operation_d.op inside {[VREDSUM:VWREDSUM], [VFREDUSUM:VFWREDOSUM]})) begin vfu_operation_valid_d = 1'b0; // We are already done with this instruction @@ -337,17 +338,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_MFpu: begin @@ -420,17 +421,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_LoadUnit : begin @@ -438,17 +439,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Load indexed @@ -490,26 +491,25 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // extra operand regardless of whether it is valid in this lane or not. // This is done to balance the data received by the store unit, which expects // L*64-bits packets only. 
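// Worked example of the balancing, assuming NrLanes = 4 (L = 4): if the last
// element sits in lane 1 (pe_req.end_lane = 1), lanes 2 and 3 must still fetch
// one extra operand each, so the final 4 x 64-bit packet reaches the store unit
// complete. The padding predicate is simply lane_id_i > pe_req.end_lane, as the
// code below shows.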
- if (lane_id_i > pe_req.end_lane) begin + if (lane_id_i > pe_req.end_lane) operand_request[StA].vl += 1; - end operand_request_push[StA] = pe_req.use_vs1; // This vector instruction uses masks operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Store indexed @@ -529,9 +529,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) begin + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) operand_request[SlideAddrGenA].vl += 1; - end operand_request_push[SlideAddrGenA] = pe_req_i.op == VSXE; end @@ -601,7 +600,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, is_slide: 1'b1, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, @@ -614,61 +613,61 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: VSLIDEUP: begin // We need to trim full words from the end of the vector that are not used // as operands by the slide unit. + operand_request[MaskM].vl = (pe_req.vl - pe_req.stride) / NrLanes / ELEN; + // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request[MaskM].vl = - ((pe_req.vl - pe_req.stride + NrLanes - 1) / 8 / NrLanes) - >> unsigned'(pe_req.vtype.vsew); - - if (((operand_request[MaskM].vl + pe_req.stride) << - unsigned'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl)) + if ((operand_request[MaskM].vl) * NrLanes * ELEN != + pe_req.vl - pe_req.stride) operand_request[MaskM].vl += 1; // SLIDEUP only uses mask bits whose indices are > stride // Don't send the previous (unused) ones to the MASKU if (pe_req.stride >= NrLanes * 64) - operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8; + operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * ELEN) << NrLanes * ELEN) / 8; end VSLIDEDOWN: begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not.
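// The recurring ceil-division pattern for mask operands, as a stand-alone
// sketch (illustrative; assumes ELEN = 64): vl mask bits are spread over
// NrLanes lanes fetching ELEN bits per access, and any remainder costs one
// extra access per lane.
function automatic vlen_t mask_accesses_per_lane(vlen_t vl, int unsigned n_lanes);
  vlen_t acc = vl / n_lanes / 64;
  if (acc * n_lanes * 64 != vl) acc += 1; // round up on a partial last word
  return acc;
endfunction
// e.g., NrLanes = 4, vl = 300 -> 300 / 256 rounds up to 2 accesses per lane.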
- operand_request[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> unsigned'( - pe_req.vtype.vsew)); - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) + operand_request[MaskM].vl = pe_req.vl / NrLanes / ELEN; + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) operand_request[MaskM].vl += 1; end endcase end VFU_MaskUnit: begin + // todo: balance the mask comparison source requests + + // Mask logical and integer comparisons operand_request[AluA] = '{ id : pe_req.id, vs : pe_req.vs1, - eew : pe_req.eew_vs1, scale_vl: pe_req.scale_vl, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, default : '0 }; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. - // This is an operation that runs normally on the ALU, and then gets *condensed* and - // reshuffled at the Mask Unit. + // Integer comparisons run on the ALU and then get reshuffled and masked in the MASKU if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request[AluA].vl = vfu_operation_d.vl; - end - // This is an operation that runs normally on the ALU, and then gets reshuffled at the - // Mask Unit. - else begin - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - operand_request[AluA].vl = (pe_req.vl / NrLanes) >> - (unsigned'(EW64) - unsigned'(pe_req.eew_vs1)); - if ((operand_request[AluA].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs1))) * NrLanes != - pe_req.vl) operand_request[AluA].vl += 1; + // These source regs contain non-mask vectors. + operand_request[AluA].eew = pe_req.eew_vs1; + operand_request[AluA].vl = pe_req.vl / NrLanes; + if ((operand_request[AluA].vl * NrLanes) != pe_req.vl) + operand_request[AluA].vl += 1; + end else begin // Mask logical operations + // These source regs contain mask vectors. + operand_request[AluA].eew = EW64; + operand_request[AluA].vl = pe_req.vl / NrLanes / ELEN; + if (operand_request[AluA].vl * NrLanes * ELEN != pe_req.vl) + operand_request[AluA].vl += 1; end - operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF}); + operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + // Mask logical, integer comparisons, VIOTA, VID, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST operand_request[AluB] = '{ id : pe_req.id, vs : pe_req.vs2, @@ -679,88 +678,117 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, default : '0 }; - // This is an operation that runs normally on the ALU, and then gets *condensed* and - // reshuffled at the Mask Unit. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + + // Integer comparisons run on the ALU and then get reshuffled and masked in the MASKU if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request[AluB].vl = vfu_operation_d.vl; + // These source regs contain non-mask vectors. + operand_request[AluB].eew = pe_req.eew_vs2; + operand_request[AluB].vl = pe_req.vl / NrLanes; + if ((operand_request[AluB].vl * NrLanes) != pe_req.vl) + operand_request[AluB].vl += 1; + end else begin // Mask logical, VIOTA, VID, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST + // These source regs contain mask vectors.
+ operand_request[AluB].eew = EW64; + operand_request[AluB].vl = pe_req.vl / NrLanes / ELEN; + if (operand_request[AluB].vl * NrLanes * ELEN != pe_req.vl) + operand_request[AluB].vl += 1; end - // This is an operation that runs normally on the ALU, and then gets reshuffled at the - // Mask Unit. - else begin - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - operand_request[AluB].vl = (pe_req.vl / NrLanes) >> - (unsigned'(EW64) - unsigned'(pe_req.eew_vs2)); - if ((operand_request[AluB].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs2))) * NrLanes != - pe_req.vl) operand_request[AluB].vl += 1; - end - operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF, VFIRST}); + operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + // Mask fp comparisons operand_request[MulFPUA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, scale_vl: pe_req.scale_vl, + vl : pe_req.vl / NrLanes, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, default : '0 }; - - // This is an operation that runs normally on the ALU, and then gets *condensed* and + // This is an operation that runs normally on the VMFPU, and then gets *condensed* and // reshuffled at the Mask Unit. - operand_request[MulFPUA].vl = vfu_operation_d.vl; - operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF}); + // Request a balanced load from every lane, regardless of whether it is active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MulFPUA].vl * NrLanes) != pe_req.vl) + operand_request[MulFPUA].vl += 1; + operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]}; + // Mask fp comparisons operand_request[MulFPUB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, scale_vl: pe_req.scale_vl, + vl : pe_req.vl / NrLanes, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, default : '0 }; - // This is an operation that runs normally on the ALU, and then gets *condensed* and + // This is an operation that runs normally on the VMFPU, and then gets *condensed* and // reshuffled at the Mask Unit. - operand_request[MulFPUB].vl = vfu_operation_d.vl; - operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF, VFIRST}); + // Request a balanced load from every lane, regardless of whether it is active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MulFPUB].vl * NrLanes) != pe_req.vl) + operand_request[MulFPUB].vl += 1; + operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]}; + // Vd register to provide the correct mask-undisturbed policy at bit level + // This can be a mask or a normal register operand_request[MaskB] = '{ id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, + vs : pe_req.vd, scale_vl: pe_req.scale_vl, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not.
- vl : (pe_req.vl / NrLanes / ELEN) << (unsigned'(EW64) - unsigned'(pe_req.vtype.vsew)), vstart : vfu_operation_d.vstart, - hazard : (pe_req.op inside {VMSBF, VMSOF, VMSIF}) ? pe_req.hazard_vs2 : pe_req.hazard_vs2 | pe_req.hazard_vd, + hazard : pe_req.hazard_vd, default : '0 }; - operand_request[MaskB].vl = pe_req.vl / (NrLanes * (8 << pe_req.vtype.vsew)); - if ((pe_req.vl % (NrLanes*ELEN)) != 0) begin - operand_request[MaskB].vl += 1'b1; + // vl and eew depend on the actual eew we are working on + if (pe_req.op inside {VIOTA,VID}) begin + // Non-mask layout + operand_request[MaskB].eew = pe_req.vtype.vsew; + operand_request[MaskB].vl = pe_req.vl / NrLanes; + // Request a balanced load from every lane, regardless of whether it is active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MaskB].vl * NrLanes) != pe_req.vl) + operand_request[MaskB].vl += 1; + end else begin // Mask logical, comparisons, VMSBF, VMSIF, VMSOF + // Mask layout + operand_request[MaskB].eew = EW64; + operand_request[MaskB].vl = (pe_req.vl / NrLanes / ELEN); + // Request a balanced load from every lane, regardless of whether it is active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MaskB].vl * NrLanes * ELEN) != pe_req.vl) + operand_request[MaskB].vl += 1; end - operand_request_push[MaskB] = pe_req.use_vs2 && pe_req.op inside {VCPOP, VFIRST, VMSIF, VMSOF, VMSBF}; + operand_request_push[MaskB] = pe_req.use_vd_op; + // All masked operations + // This is always a mask register operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. vl : (pe_req.vl / NrLanes / ELEN), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm, default: '0 }; - if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin + // Request a balanced load from every lane, regardless of whether it is active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) operand_request[MaskM].vl += 1; - end operand_request_push[MaskM] = !pe_req.vm; end VFU_None: begin diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 1baec0780..de2cc4f82 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -291,7 +291,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( automatic elen_t vl_byte; automatic elen_t vstart_byte; automatic elen_t vector_body_len_byte; - automatic elen_t vector_body_len_packets; + automatic elen_t vector_body_len_elements; // Bank we are currently requesting automatic int bank = requester_metadata_q.addr[idx_width(NrBanks)-1:0]; @@ -324,13 +324,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( ?
0 : operand_request_i[requester_index].vstart << operand_request_i[requester_index].vtype.vsew; vector_body_len_byte = vl_byte - vstart_byte + (vstart_byte % 8); - vector_body_len_packets = vector_body_len_byte >> operand_request_i[requester_index].eew; - if (vector_body_len_packets << operand_request_i[requester_index].eew < vector_body_len_byte) - vector_body_len_packets += 1; + vector_body_len_elements = vector_body_len_byte >> operand_request_i[requester_index].eew; + if (vector_body_len_elements << operand_request_i[requester_index].eew < vector_body_len_byte) + vector_body_len_elements += 1; // Final computed length effective_vector_body_length = (operand_request_i[requester_index].scale_vl) - ? vector_body_len_packets + ? vector_body_len_elements : vector_body_length; // Address of the vstart element of the vector in the VRF @@ -401,7 +401,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( end : waw_counters_update if (operand_queue_ready_i[requester_index]) begin - automatic vlen_t num_bytes; + automatic vlen_t num_elements; // Operand request lane_operand_req_transposed[requester_index][bank] = !stall; @@ -417,12 +417,12 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( requester_metadata_d.addr = requester_metadata_q.addr + 1'b1; // We read less than 64 bits worth of elements - num_bytes = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) ); - if (requester_metadata_q.len < num_bytes) begin + num_elements = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) ); + if (requester_metadata_q.len < num_elements) begin requester_metadata_d.len = 0; end else begin - requester_metadata_d.len = requester_metadata_q.len - num_bytes; + requester_metadata_d.len = requester_metadata_q.len - num_elements; end end : op_req_grant diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv index 242c0d2bc..572bc35af 100644 --- a/hardware/src/lane/simd_alu.sv +++ b/hardware/src/lane/simd_alu.sv @@ -132,11 +132,8 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( VMXOR : res = operand_a_i ^ operand_b_i; VMXNOR : res = ~(operand_a_i ^ operand_b_i); - // vmsbf, vmsof, vmsif and viota operand generation - VMSBF, VMSOF, VMSIF, VIOTA : res = opb; - - // Vector count population and find first set bit instructions - VCPOP, VFIRST : res = operand_b_i; + // Mask operands pass-through + VCPOP, VFIRST, VMSBF, VMSOF, VMSIF, VIOTA: res = operand_b_i; // Arithmetic instructions VSADDU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index d3ce82bee..53a14e177 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -175,22 +175,25 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Mask operands // ///////////////////// + logic mask_operand_valid; logic mask_operand_ready; logic mask_operand_gnt; - assign mask_operand_gnt = mask_operand_ready && result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q]; + assign mask_operand_valid = result_queue_q[result_queue_read_pnt_q].mask + & result_queue_valid_q[result_queue_read_pnt_q]; + assign mask_operand_gnt = mask_operand_valid & mask_operand_ready; spill_register #( .T(elen_t) ) i_mask_operand_register ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .data_o (mask_operand_o ), - .valid_o (mask_operand_valid_o ), - .ready_i (mask_operand_ready_i ), - .data_i (result_queue_q[result_queue_read_pnt_q].wdata ), - .valid_i 
(result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q]), - .ready_o (mask_operand_ready ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .data_o (mask_operand_o ), + .valid_o (mask_operand_valid_o ), + .ready_i (mask_operand_ready_i ), + .data_i (result_queue_q[result_queue_read_pnt_q].wdata ), + .valid_i (mask_operand_valid ), + .ready_o (mask_operand_ready ) ); ////////////////////// @@ -395,6 +398,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Remaining elements of the current instruction in the commit phase vlen_t commit_cnt_d, commit_cnt_q; + // How many elements are issued/committed + logic [3:0] element_cnt_buf_issue, element_cnt_buf_commit; + logic [6:0] element_cnt_issue; + logic [6:0] element_cnt_commit; + always_comb begin: p_valu // Maintain state vinsn_queue_d = vinsn_queue_q; @@ -436,6 +444,13 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Don't prevent commit by default prevent_commit = 1'b0; + // How many elements are we processing this cycle? + element_cnt_buf_issue = 1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)); + element_cnt_issue = vinsn_issue_q.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_issue}; + + element_cnt_buf_commit = 1 << (unsigned'(EW64) - unsigned'(vinsn_commit.vtype.vsew)); + element_cnt_commit = vinsn_commit.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_commit}; + //////////////////////////////////////// // Write data into the result queue // //////////////////////////////////////// @@ -450,7 +465,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? - automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); + automatic logic [6:0] element_cnt = element_cnt_issue; if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -524,16 +539,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; // Assign vector length for next instruction in the instruction queue - if (vinsn_queue_d.issue_cnt != 0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; - else begin - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.issue_cnt != 0) + issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; end end end @@ -550,7 +557,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1 || !first_op_q) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? 
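// Expanded view of element_cnt_issue computed above (illustrative sketch,
// assuming ELEN = 64): mask-logical operations pack one element per bit, so a
// 64-bit ALU word completes 64 elements; every other operation completes
// 1 << (EW64 - vsew) elements per word.
function automatic logic [6:0] elements_per_word(logic is_mask_op, vew_e vsew);
  return is_mask_op ? 7'd64 : 7'(1 << (unsigned'(EW64) - unsigned'(vsew)));
endfunction
// e.g., EW8 -> 8 elements/word, EW64 -> 1 element/word, VMAND -> 64 elements/word.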
- automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); + automatic logic [6:0] element_cnt = element_cnt_issue; + if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -656,16 +664,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; // Assign vector length for next instruction in the instruction queue - if (vinsn_queue_d.issue_cnt != 0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; - else begin - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.issue_cnt != 0) + issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; // Give the done to the main sequencer commit_cnt_d = '0; @@ -693,16 +693,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; // Assign vector length for next instruction in the instruction queue - if (vinsn_queue_d.issue_cnt != 0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; - else begin - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.issue_cnt != 0) + issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; // Commit and give the done to the main sequencer commit_cnt_d = '0; @@ -739,7 +731,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; if (|result_queue_valid_q) vxsat_flag_o = |(alu_vxsat_q & result_queue_q[result_queue_read_pnt_q].be); - // Received a grant from the VRF. + // Received a grant from the VRF or MASKU. // Deactivate the request. 
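// The queue entry has two possible consumers: the VRF write-back path
// (alu_result_gnt_i) and the MASKU operand path (mask_operand_gnt). They should
// be mutually exclusive per entry, since a mask-destined result is not offered
// to the VRF, so a single pop condition covers both (illustrative name):
// assign result_queue_pop = alu_result_gnt_i | mask_operand_gnt;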
if (alu_result_gnt_i || mask_operand_gnt) begin result_queue_valid_d[result_queue_read_pnt_q] = 1'b0; @@ -754,9 +746,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Decrement the counter of remaining vector elements waiting to be written // Don't do it in case of a reduction - if (!is_reduction(vinsn_commit.op)) - commit_cnt_d = commit_cnt_q - (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew)); - if (commit_cnt_q < (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0; + if (!is_reduction(vinsn_commit.op)) begin + automatic logic [6:0] element_cnt = element_cnt_commit; + commit_cnt_d = commit_cnt_q - element_cnt; + if (commit_cnt_q < element_cnt) commit_cnt_d = '0; + end end // Finished committing the results of a vector instruction @@ -770,18 +764,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; else vinsn_queue_d.commit_pnt += 1; // Update the commit counter for the next instruction - if (vinsn_queue_d.commit_cnt != '0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl; - else begin - // We are asking for bits, and we want at least one chunk of bits if - // vl > 0. Therefore, commit_cnt = ceil((vl / 8) >> sew) - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - commit_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew; - commit_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.commit_cnt != '0) + commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl; // Initialize counters and alu state if needed by the next instruction // After a reduction, the next instruction starts after the reduction commits @@ -806,7 +790,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; // Do not wait for masks if, during a reduction, this lane is just a pass-through // The only valid instructions here with vl == '0 are reductions + // Instructions that execute in the mask unit will process the mask there directly + // VMADC/VMSBC require mask bits in the ALU + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMSEQ:VMXNOR]} && !(vfu_operation_i.op inside {[VMADC:VMSBC]}) + ?
1'b1 + : vfu_operation_i.vm | (vfu_operation_i.vl == '0); // Initialize counters and alu state if the instruction queue was empty // and the lane is not reducing @@ -822,22 +810,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; sldu_transactions_cnt_d = $clog2(NrLanes) + 1; issue_cnt_d = vfu_operation_i.vl; - if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vfu_operation_i.vl; - else begin - issue_cnt_d = (vfu_operation_i.vl / 8) >> - vfu_operation_i.vtype.vsew; - issue_cnt_d += |vfu_operation_i.vl[2:0]; - end end if (vinsn_queue_d.commit_cnt == '0) - if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vfu_operation_i.vl; - else begin - // Operations between mask vectors operate on bits - commit_cnt_d = (vfu_operation_i.vl / 8) >> vfu_operation_i.vtype.vsew; - commit_cnt_d += |vfu_operation_i.vl[2:0]; - end + commit_cnt_d = vfu_operation_i.vl; // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index bbeb78f32..846638243 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -1245,21 +1245,18 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; (vinsn_processing_q.op == VMFNE) ? ~vfpu_processed_result[16*b] : vfpu_processed_result[16*b]; - for (int b = 0; b < 4; b++) vfpu_processed_result[16*b+1] = vfpu_mask[2*b]; end EW32: begin for (int b = 0; b < 2; b++) vfpu_processed_result[32*b] = (vinsn_processing_q.op == VMFNE) ? ~vfpu_processed_result[32*b] : vfpu_processed_result[32*b]; - for (int b = 0; b < 2; b++) vfpu_processed_result[32*b+1] = vfpu_mask[4*b]; end EW64: begin for (int b = 0; b < 1; b++) vfpu_processed_result[b] = (vinsn_processing_q.op == VMFNE) ? ~vfpu_processed_result[b] : vfpu_processed_result[b]; - for (int b = 0; b < 1; b++) vfpu_processed_result[b+1] = vfpu_mask[8*b]; end endcase end @@ -2180,7 +2177,15 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; if (!vinsn_queue_full && vfu_operation_valid_i && (vfu_operation_i.vfu == VFU_MFpu || vfu_operation_i.op inside {[VMFEQ:VMFGE]})) begin - vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; + // Masks are handled in the MASKU directly for comparisons + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMFEQ:VMFGE]} + ? 1'b1 + : vfu_operation_i.vm; + // During comparisons, vd_op is for the masku, not for the VMFPU + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].use_vd_op = vfu_operation_i.op inside {[VMFEQ:VMFGE]} + ? 
1'b0 + : vfu_operation_i.use_vd_op; // Initialize counters if (vinsn_queue_d.issue_cnt == '0 && !prevent_commit) begin diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index 1ea497dc4..e83998965 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -65,6 +65,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( vlen_t read_cnt_d, read_cnt_q; // Remaining elements of the current instruction in the issue phase vlen_t issue_cnt_d, issue_cnt_q; + // Remaining elements of the current instruction to be validated in the result queue + vlen_t processing_cnt_d, processing_cnt_q; // Remaining elements of the current instruction in the commit phase vlen_t commit_cnt_d, commit_cnt_q; @@ -81,19 +83,17 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic [NrLanes-1:0] masku_operand_alu_ready; // ALU/FPU result (deshuffled) - logic [NrLanes*ELEN-1:0] masku_operand_alu_seq; + logic [NrLanes*DataWidth-1:0] masku_operand_alu_seq; - // vs2 (shuffled) - elen_t [NrLanes-1:0] masku_operand_vs2; - logic [NrLanes-1:0] masku_operand_vs2_valid; - logic [NrLanes-1:0] masku_operand_vs2_ready; + // vd (shuffled) + elen_t [NrLanes-1:0] masku_operand_vd; + logic [NrLanes-1:0] masku_operand_vd_valid; + logic [NrLanes-1:0] masku_operand_vd_ready; - assign masku_operand_vs2_ready = 1'b0; - - // vs2 (deshuffled) - logic [NrLanes*ELEN-1:0] masku_operand_vs2_seq; - logic [ NrLanes-1:0] masku_operand_vs2_seq_valid; - logic [ NrLanes-1:0] masku_operand_vs2_seq_ready; + // vd (deshuffled) + logic [NrLanes*DataWidth-1:0] masku_operand_vd_seq; + logic [ NrLanes-1:0] masku_operand_vd_seq_valid; + logic [ NrLanes-1:0] masku_operand_vd_seq_ready; // Mask elen_t [NrLanes-1:0] masku_operand_m; @@ -101,15 +101,13 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic [NrLanes-1:0] masku_operand_m_ready; // Mask deshuffled - logic [NrLanes*ELEN-1:0] masku_operand_m_seq; - logic [NrLanes-1:0] masku_operand_m_seq_valid; - logic [NrLanes-1:0] masku_operand_m_seq_ready; + logic [NrLanes*DataWidth-1:0] masku_operand_m_seq; // Insn-queue related signal pe_req_t vinsn_issue; - logic [NrLanes*ELEN-1:0] bit_enable_mask; - logic [NrLanes*ELEN-1:0] alu_result_compressed; + logic [NrLanes*DataWidth-1:0] bit_enable_mask; + logic [NrLanes*DataWidth-1:0] alu_result_compressed_seq; // Performs all shuffling and deshuffling of mask operands (including masks for mask instructions) // Furthermore, it buffers certain operands that would create long critical paths @@ -133,72 +131,100 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( .masku_operand_alu_valid_o ( masku_operand_alu_valid ), .masku_operand_alu_ready_i ( masku_operand_alu_ready ), .masku_operand_alu_seq_o ( masku_operand_alu_seq ), - .masku_operand_alu_seq_valid_o ( ), - .masku_operand_alu_seq_ready_i ( ), - .masku_operand_vs2_o ( masku_operand_vs2 ), - .masku_operand_vs2_valid_o ( masku_operand_vs2_valid ), - .masku_operand_vs2_ready_i ( masku_operand_vs2_ready ), - .masku_operand_vs2_seq_o ( masku_operand_vs2_seq ), - .masku_operand_vs2_seq_valid_o ( masku_operand_vs2_seq_valid ), - .masku_operand_vs2_seq_ready_i ( masku_operand_vs2_seq_ready ), + .masku_operand_alu_seq_valid_o ( ), + .masku_operand_alu_seq_ready_i ( '0 ), + .masku_operand_vd_o ( masku_operand_vd ), + .masku_operand_vd_valid_o ( masku_operand_vd_valid ), + .masku_operand_vd_ready_i ( masku_operand_vd_ready ), + .masku_operand_vd_seq_o ( masku_operand_vd_seq ), + .masku_operand_vd_seq_valid_o ( masku_operand_vd_seq_valid ), + 
.masku_operand_vd_seq_ready_i ( '0 ), .masku_operand_m_o ( masku_operand_m ), .masku_operand_m_valid_o ( masku_operand_m_valid ), .masku_operand_m_ready_i ( masku_operand_m_ready ), .masku_operand_m_seq_o ( masku_operand_m_seq ), - .masku_operand_m_seq_valid_o ( ), - .masku_operand_m_seq_ready_i ( ), + .masku_operand_m_seq_valid_o ( ), + .masku_operand_m_seq_ready_i ( '0 ), .bit_enable_mask_o ( bit_enable_mask ), - .alu_result_compressed_o ( alu_result_compressed ) + .alu_result_compressed_seq_o ( alu_result_compressed_seq ) ); + // Local Parameter for mask logical instructions + // + // Don't change this parameter! + localparam integer unsigned VmLogicalParallelism = NrLanes*DataWidth; - // Local Parameter W_CPOP and W_VFIRST + // Local Parameter VMSBF, VMSIF, VMSOF + // + localparam integer unsigned VmsxfParallelism = NrLanes < 4 ? 2 : NrLanes/2; + // Ancillary signals + logic [VmsxfParallelism-1:0] vmsbf_buffer; + logic [NrLanes*DataWidth-1:0] alu_result_vmsif_vm; + logic [NrLanes*DataWidth-1:0] alu_result_vmsbf_vm; + logic [NrLanes*DataWidth-1:0] alu_result_vmsof_vm; + + // Local Parameter VIOTA, VID + // + // How many output results are computed in parallel by VIOTA + localparam integer unsigned ViotaParallelism = NrLanes < 4 ? 2 : NrLanes/2; + // Check if parameters are within range + if (ViotaParallelism > NrLanes || (ViotaParallelism & (ViotaParallelism - 1)) != 0) begin + $fatal(1, "Parameter ViotaParallelism cannot be higher than NrLanes and should be a power of 2."); + end + // VLENMAX can be 64Ki elements at most - 16 bits per adder are enough + logic [ViotaParallelism-1:0] [idx_width(RISCV_MAX_VLEN)-1:0] viota_res; + logic [idx_width(RISCV_MAX_VLEN)-1:0] viota_acc, viota_acc_d, viota_acc_q; + // Ancillary signal to tweak the VRF byte-enable, accounting for an unbalanced write, + // i.e., when the number of elements does not perfectly divide NrLanes + logic [3:0] elm_per_lane; // From 0 to 8 elements per lane + logic [NrLanes-1:0] additional_elm; // There can be an additional element for some lanes + // BE signals for VIOTA + logic [NrLanes*DataWidth/8-1:0] be_viota_seq_d, be_viota_seq_q, be_viota_shuf; + + // Local Parameter VcpopParallelism and VfirstParallelism // - // Description: Parameters W_CPOP and W_VFIRST enable time multiplexing of vcpop.m and vfirst.m instruction. + // Description: Parameters VcpopParallelism and VfirstParallelism enable time multiplexing of the vcpop.m and vfirst.m instructions. // - // Legal range W_CPOP: {16, 32, 64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 - // Legal range W_VFIRST: {16, 32, 64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 + // Legal range VcpopParallelism: {16, 32, 64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 + // Legal range VfirstParallelism: {16, 32, 64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 // // Execution time example for vcpop.m (similar for vfirst.m): - // W_CPOP = 64; VLEN = 1024; vl = 1024 - // t_vcpop.m = VLEN/W_CPOP = 8 [Cycles] + // VcpopParallelism = 64; VLEN = 1024; vl = 1024 + // t_vcpop.m = VLEN/VcpopParallelism = 16 [Cycles] - localparam int W_CPOP = 16; - localparam int W_VFIRST = 16; + localparam int VcpopParallelism = 16; + localparam int VfirstParallelism = 16; // derived parameters - localparam int MAX_W_CPOP_VFIRST = (W_CPOP > W_VFIRST) ? W_CPOP : W_VFIRST; - localparam int N_SLICES_CPOP = NrLanes * DataWidth / W_CPOP; - localparam int N_SLICES_VFIRST = NrLanes * DataWidth / W_VFIRST; + localparam int MAX_VcpopParallelism_VFIRST = (VcpopParallelism > VfirstParallelism) ?
VcpopParallelism : VfirstParallelism; + localparam int N_SLICES_CPOP = NrLanes * DataWidth / VcpopParallelism; + localparam int N_SLICES_VFIRST = NrLanes * DataWidth / VfirstParallelism; // Check if parameters are within range - if (((W_CPOP & (W_CPOP - 1)) != 0) || (W_CPOP < 8)) begin - $fatal(1, "Parameter W_CPOP must be power of 2."); - end else if (((W_VFIRST & (W_VFIRST - 1)) != 0) || (W_VFIRST < 8)) begin - $fatal(1, "Parameter W_VFIRST must be power of 2."); + if (((VcpopParallelism & (VcpopParallelism - 1)) != 0) || (VcpopParallelism < 8)) begin + $fatal(1, "Parameter VcpopParallelism must be a power of 2, at least 8."); + end else if (((VfirstParallelism & (VfirstParallelism - 1)) != 0) || (VfirstParallelism < 8)) begin + $fatal(1, "Parameter VfirstParallelism must be a power of 2, at least 8."); end // VFIRST and VCPOP Signals - logic [NrLanes*ELEN-1:0] vcpop_operand; - logic [$clog2(W_CPOP):0] popcount; + logic [NrLanes*DataWidth-1:0] vcpop_operand; + logic [$clog2(VcpopParallelism):0] popcount; logic [$clog2(VLEN):0] popcount_d, popcount_q; - logic [$clog2(W_VFIRST)-1:0] vfirst_count; + logic [$clog2(VfirstParallelism)-1:0] vfirst_count; logic [$clog2(VLEN)-1:0] vfirst_count_d, vfirst_count_q; logic vfirst_empty; - logic [NrLanes-1:0] vcpop_vfirst_vs2_ready; // counter to keep track of how many slices of the vcpop_operand have been processed - logic [$clog2(MAX_W_CPOP_VFIRST):0] vcpop_slice_cnt_d, vcpop_slice_cnt_q; - logic [W_CPOP-1:0] vcpop_slice; - logic [W_VFIRST-1:0] vfirst_slice; + logic [VcpopParallelism-1:0] vcpop_slice; + logic [VfirstParallelism-1:0] vfirst_slice; // vmsbf, vmsif, vmsof, viota, vid, vcpop, vfirst variables - logic [NrLanes*DataWidth-1:0] alu_result_f, alu_result_ff; - logic [NrLanes*DataWidth-1:0] masku_operand_alu_seq_m, masku_operand_alu_seq_f, masku_operand_alu_seq_ff; - logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_seq; - logic [NrLanes*DataWidth-1:0] alu_src_idx, alu_src_idx_m; - logic [ 13:0] iteration_count_d, iteration_count_q; - logic not_found_one_d, not_found_one_q; - logic [ NrLanes-1:0] vmsif_vmsof_vmsbf_vs2_ready; + logic [NrLanes*DataWidth-1:0] masku_operand_alu_seq_m; + logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_shuf; + logic found_one, found_one_d, found_one_q; - // Control flow for mask operands - assign masku_operand_vs2_seq_ready = vcpop_vfirst_vs2_ready | vmsif_vmsof_vmsbf_vs2_ready; + // How many elements we are processing per cycle + logic [idx_width(NrLanes*DataWidth):0] delta_elm_d, delta_elm_q; + + // MASKU ALU: is a VRF word result or a scalar result fully valid? + logic out_vrf_word_valid, out_scalar_valid; //////////////////////////////// // Vector instruction queue // //////////////////////////////// @@ -311,7 +337,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // and one pointer to indicate which `payload_t` we are currently // reading from and writing into the lanes (read_pnt). logic [idx_width(ResultQueueDepth)-1:0] result_queue_write_pnt_d, result_queue_write_pnt_q; - logic [idx_width(ResultQueueDepth)-1:0] result_queue_read_pnt_d, result_queue_read_pnt_q, result_queue_read_pnt_m; + logic [idx_width(ResultQueueDepth)-1:0] result_queue_read_pnt_d, result_queue_read_pnt_q; // We need to count how many valid elements there are in this result queue.
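// Why idx_width(ResultQueueDepth)+1 bits: the occupancy counter declared below
// must be able to hold the value ResultQueueDepth itself, which the "full"
// comparison uses. A sketch of the sizing rule (hypothetical helper):
function automatic int unsigned occupancy_width(int unsigned depth);
  return cf_math_pkg::idx_width(depth) + 1; // one extra bit to encode `depth`
endfunction
// e.g., ResultQueueDepth = 2 -> a 2-bit counter counting 0, 1, 2.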
logic [idx_width(ResultQueueDepth):0] result_queue_cnt_d, result_queue_cnt_q; // Vector to register the final grants from the operand requesters, which indicate @@ -319,6 +345,11 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // that the result was accepted by the operand requester stage logic [NrLanes-1:0] result_final_gnt_d, result_final_gnt_q; + // Result queue + elen_t [NrLanes-1:0] result_queue_background_data; + elen_t [NrLanes-1:0] result_queue_mask_seq; + logic [NrLanes*DataWidth-1:0] background_data_init_seq, background_data_init_shuf; + // Is the result queue full? logic result_queue_full; assign result_queue_full = (result_queue_cnt_q == ResultQueueDepth); @@ -332,41 +363,135 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_valid_q <= '0; result_queue_write_pnt_q <= '0; result_queue_read_pnt_q <= '0; - result_queue_read_pnt_m <= '0; result_queue_cnt_q <= '0; - alu_result_f <= '0; - alu_result_ff <= '0; - not_found_one_q <= 1'b1; - masku_operand_alu_seq_f <= '0; - masku_operand_alu_seq_ff <= '0; - iteration_count_q <= '0; end else begin result_queue_q <= result_queue_d; result_queue_valid_q <= result_queue_valid_d; result_queue_write_pnt_q <= result_queue_write_pnt_d; - result_queue_read_pnt_m <= result_queue_write_pnt_q; - result_queue_read_pnt_q <= (vinsn_issue.op inside {[VMSBF:VID]}) ? result_queue_read_pnt_m : result_queue_read_pnt_d; + result_queue_read_pnt_q <= result_queue_read_pnt_d; result_queue_cnt_q <= result_queue_cnt_d; - alu_result_f <= (pe_req_ready_o) ? '0 : (!vinsn_issue.vm) ? alu_result_vm : alu_result_vm_seq; - alu_result_ff <= alu_result_f; - not_found_one_q <= not_found_one_d; - masku_operand_alu_seq_f <= (pe_req_ready_o) ? '0 : masku_operand_alu_seq_m; - masku_operand_alu_seq_ff <= masku_operand_alu_seq_f; - iteration_count_q <= iteration_count_d; end end - // iteration count for masked instrctions - always_comb begin - if (vinsn_issue_valid && (&masku_operand_alu_valid || &masku_operand_vs2_seq_valid)) begin - iteration_count_d = iteration_count_q + 1'b1; - end else begin - iteration_count_d = iteration_count_q; - end - if (pe_req_ready_o && !vinsn_issue_valid) begin - iteration_count_d = '0; - end - end + //////////////////// + // ALU counters // + //////////////////// + + // Compile-time minimum among five different numbers + function automatic int unsigned min5(int unsigned a, int unsigned b, int unsigned c, int unsigned d, int unsigned e); + return (a < b) ? ((a < c) ? ((a < d) ? ((a < e) ? a : e) : (d < e ? d : e)) + : (c < d) ? ((c < e) ? c : e) : (d < e ? d : e)) + : ((b < c) ? ((b < d) ? ((b < e) ? b : e) : (d < e ? d : e)) + : (c < d) ? ((c < e) ? c : e) : (d < e ? d : e)); + endfunction + + // What is the minimum supported parallelism? + localparam int unsigned MIN_MASKU_ALU_WIDTH = min5( + ViotaParallelism, + VmsxfParallelism, + VmLogicalParallelism, + VcpopParallelism, + VfirstParallelism + ); + + localparam int unsigned IN_READY_CNT_WIDTH = idx_width(NrLanes * DataWidth / MIN_MASKU_ALU_WIDTH); + typedef logic [IN_READY_CNT_WIDTH-1:0] in_ready_cnt_t; + logic in_ready_cnt_en, in_ready_cnt_clr; + in_ready_cnt_t in_ready_cnt_delta_q, in_ready_cnt_q; + in_ready_cnt_t in_ready_threshold_d, in_ready_threshold_q; + + assign in_ready_cnt_delta_q = 1; + + // Counter to trigger the input ready. + // Ready triggered when all the slices of the VRF word have been consumed. 
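// Sketch of the enable/clear discipline intended for the counter instantiated
// below (the slice-handshake name is illustrative; the actual drivers are set
// in the MASKU control logic further down):
// assign in_ready_cnt_en  = masku_alu_slice_handshake;
// assign in_ready_cnt_clr = masku_alu_slice_handshake
//                         && (in_ready_cnt_q == in_ready_threshold_q);
// One step per consumed slice; the clear fires together with the lanes' operand
// ready, once all slices of the current VRF word have been processed.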
+ delta_counter #( + .WIDTH(IN_READY_CNT_WIDTH) + ) i_in_ready_cnt ( + .clk_i, + .rst_ni, + .clear_i(in_ready_cnt_clr ), + .en_i (in_ready_cnt_en ), + .load_i (1'b0 ), + .down_i (1'b0 ), + .delta_i(in_ready_cnt_delta_q), + .d_i ('0 ), + .q_o (in_ready_cnt_q ), + .overflow_o(/* Unused */) + ); + + localparam int unsigned IN_M_READY_CNT_WIDTH = idx_width(NrLanes * DataWidth / MIN_MASKU_ALU_WIDTH); + typedef logic [IN_M_READY_CNT_WIDTH-1:0] in_m_ready_cnt_t; + logic in_m_ready_cnt_en, in_m_ready_cnt_clr; + in_m_ready_cnt_t in_m_ready_cnt_q, in_m_ready_cnt_delta_q; + in_ready_cnt_t in_m_ready_threshold_d, in_m_ready_threshold_q; + + assign in_m_ready_cnt_delta_q = 1; + + // Counter to trigger the input ready. + // Ready triggered when all the slices of the VRF word have been consumed. + delta_counter #( + .WIDTH(IN_M_READY_CNT_WIDTH) + ) i_in_m_ready_cnt ( + .clk_i, + .rst_ni, + .clear_i(in_m_ready_cnt_clr ), + .en_i (in_m_ready_cnt_en ), + .load_i (1'b0 ), + .down_i (1'b0 ), + .delta_i(in_m_ready_cnt_delta_q), + .d_i ('0 ), + .q_o (in_m_ready_cnt_q ), + .overflow_o(/* Unused */) + ); + + localparam int unsigned OUT_VALID_CNT_WIDTH = idx_width(NrLanes * DataWidth / MIN_MASKU_ALU_WIDTH); + typedef logic [OUT_VALID_CNT_WIDTH-1:0] out_valid_cnt_t; + logic out_valid_cnt_en, out_valid_cnt_clr; + out_valid_cnt_t out_valid_cnt_q, out_valid_cnt_delta_q; + out_valid_cnt_t out_valid_threshold_d, out_valid_threshold_q; + + assign out_valid_cnt_delta_q = 1; + + // Counter to trigger the output valid. + // Valid triggered when all the slices of the VRF word have been consumed. + delta_counter #( + .WIDTH(OUT_VALID_CNT_WIDTH) + ) i_out_valid_cnt ( + .clk_i, + .rst_ni, + .clear_i(out_valid_cnt_clr ), + .en_i (out_valid_cnt_en ), + .load_i (1'b0 ), + .down_i (1'b0 ), + .delta_i(out_valid_cnt_delta_q), + .d_i ('0 ), + .q_o (out_valid_cnt_q ), + .overflow_o(/* Unused */) + ); + + // How many (64*NrLanes)-bit VRF words we can get, maximum? 
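// Worked derivation of the bound below, assuming ELEN = 64 and max LMUL = 8:
// a register group spans up to 8 * VLEN bits, and one VRF word carries
// NrLanes * 64 bits, so the word count tops out at
//   (8 * VLEN) / (NrLanes * 64) = VLEN / NrLanes / 8.
// e.g., VLEN = 4096, NrLanes = 4 -> at most 128 VRF words per instruction.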
+ localparam int unsigned MAX_NUM_VRF_WORDS = VLEN / NrLanes / 8; + logic iteration_cnt_clr; + logic [idx_width(MAX_NUM_VRF_WORDS)-1:0] iteration_cnt_q, iteration_cnt_delta_q; + + assign iteration_cnt_delta_q = 1; + + // Iteration count for masked instructions + // One iteration == One full output slice processed + delta_counter #( + .WIDTH(idx_width(MAX_NUM_VRF_WORDS)) + ) i_iteration_cnt ( + .clk_i, + .rst_ni, + .clear_i(iteration_cnt_clr ), + .en_i (out_valid_cnt_clr ), + .load_i (1'b0 ), + .down_i (1'b0 ), + .delta_i(iteration_cnt_delta_q), + .d_i ('0 ), + .q_o (iteration_cnt_q ), + .overflow_o(/* Unused */) + ); //////////////////////////// //// Scalar result reg //// @@ -389,19 +514,15 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Mask ALU // //////////////// - elen_t [NrLanes-1:0] alu_result; - logic [NrLanes*ELEN-1:0] mask; - - // keep track if first 1 mask element was found - logic vfirst_found; + elen_t [NrLanes-1:0] alu_result; // assign operand slices to be processed by popcount and lzc - assign vcpop_slice = vcpop_operand[(vcpop_slice_cnt_q * W_CPOP) +: W_CPOP]; - assign vfirst_slice = vcpop_operand[(vcpop_slice_cnt_q * W_VFIRST) +: W_VFIRST]; + assign vcpop_slice = vcpop_operand[(in_ready_cnt_q[idx_width(N_SLICES_CPOP)-1:0] * VcpopParallelism) +: VcpopParallelism]; + assign vfirst_slice = vcpop_operand[(in_ready_cnt_q[idx_width(N_SLICES_VFIRST)-1:0] * VfirstParallelism) +: VfirstParallelism]; // Population count for vcpop.m instruction popcount #( - .INPUT_WIDTH (W_CPOP) + .INPUT_WIDTH (VcpopParallelism) ) i_popcount ( .data_i (vcpop_slice), .popcount_o(popcount ) @@ -409,7 +530,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Trailing zero counter lzc #( - .WIDTH(W_VFIRST), + .WIDTH(VfirstParallelism), .MODE (0) ) i_clz ( .in_i (vfirst_slice ), @@ -417,204 +538,190 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( .empty_o (vfirst_empty ) ); - always_comb begin: p_mask_alu - alu_result = '0; - not_found_one_d = pe_req_ready_o ? 
1'b1 : not_found_one_q; - alu_result_vm = '0; - alu_result_vm_m = '0; - alu_result_vm_seq = '0; - masku_operand_alu_seq_m = '0; - mask = '0; - vcpop_operand = '0; + // Vector instructions currently running + logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; - if (vinsn_issue_valid) begin + // Interface with the main sequencer + pe_resp_t pe_resp; - // Mask generation - unique case (vinsn_issue.op) inside - [VMSBF:VID] : - if (&masku_operand_alu_valid) begin - unique case (vinsn_issue.vtype.vsew) - EW8 : for (int i = 0; i < (DataWidth * NrLanes)/8; i++) - mask [(i*8) +: 8] = {8{bit_enable_mask [i+(((DataWidth * NrLanes)/8)*(iteration_count_d-1))]}}; - EW16: for (int i = 0; i < (DataWidth * NrLanes)/16; i++) - mask [(i*16) +: 16] = {16{bit_enable_mask [i+(((DataWidth * NrLanes)/16)*(iteration_count_d-1))]}}; - EW32: for (int i = 0; i < (DataWidth * NrLanes)/32; i++) - mask [(i*32) +: 32] = {32{bit_enable_mask [i+(((DataWidth * NrLanes)/32)*(iteration_count_d-1))]}}; - EW64: for (int i = 0; i < (DataWidth * NrLanes)/64; i++) - mask [(i*64) +: 64] = {64{bit_enable_mask [i+(((DataWidth * NrLanes)/64)*(iteration_count_d-1))]}}; - endcase - end else begin - mask = '0; - end - default:; - endcase + // Effective MASKU stride in case of VSLIDEUP + // MASKU receives chunks of 64 * NrLanes mask bits from the lanes + // VSLIDEUP only needs the bits whose index >= than its stride + // So, the operand requester does not send vl mask bits to MASKU + // and trims all the unused 64 * NrLanes mask bits chunks + // Therefore, the stride needs to be trimmed, too + elen_t trimmed_stride; + + // Information about which is the target FU of the request + assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? MaskFUMFpu : MaskFUAlu; + + always_comb begin + // Tail-agnostic bus + alu_result = '1; + alu_result_vm = '1; + alu_result_vm_m = '1; + alu_result_vm_shuf = '1; + alu_result_vmsif_vm = '1; + alu_result_vmsbf_vm = '1; + alu_result_vmsof_vm = '1; + alu_result_vm = '1; + + vcpop_operand = '0; + + // The result mask should be created here since the output is a non-mask vector + be_viota_seq_d = be_viota_seq_q; + + // Create a bit-masked ALU sequential vector + masku_operand_alu_seq_m = masku_operand_alu_seq + & (masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}}); + + // VMSBF, VMSIF, VMSOF default assignments + found_one = found_one_q; + found_one_d = found_one_q; + vmsbf_buffer = '0; + // VIOTA default assignments + viota_acc = viota_acc_q; + viota_acc_d = viota_acc_q; + for (int i = 0; i < ViotaParallelism; i++) viota_res[i] = '0; + if (vinsn_issue_valid) begin // Evaluate the instruction unique case (vinsn_issue.op) inside - [VMANDNOT:VMXNOR]: alu_result = masku_operand_alu; - [VMFEQ:VMSGTU], [VMSGT:VMSBC]: alu_result = alu_result_compressed & bit_enable_mask; + // Mask logical: pass through the result already computed in the ALU + // This operation is never masked + // This operation always writes to multiple of VRF words, and it does not need vd + // This operation can overwrite the destination register without constraints on tail elements + [VMANDNOT:VMXNOR]: alu_result_vm_m = masku_operand_alu_seq; + // Comparisons: mask out the masked out bits of this pre-computed slice + [VMFEQ:VMSGT]: alu_result_vm_m = alu_result_compressed_seq + | ~(masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}}); + // Add/sub-with-carry/borrow: the masks are all 1 since these operations are NOT masked + [VMADC:VMSBC]: alu_result_vm_m = alu_result_compressed_seq; + // VMSBF, VMSOF, VMSIF: compute a 
slice of the output and mask out the masked out bits [VMSBF:VMSIF] : begin - if (&masku_operand_vs2_seq_valid && (&masku_operand_m_valid || vinsn_issue.vm)) begin - for (int i = 0; i < NrLanes * DataWidth; i++) begin - if (masku_operand_vs2_seq[i] == 1'b0) begin - alu_result_vm[i] = (vinsn_issue.op == VMSOF) ? 1'b0 : not_found_one_d; - end else begin - not_found_one_d = 1'b0; - alu_result_vm[i] = (vinsn_issue.op == VMSBF) ? not_found_one_d : 1'b1; - break; - end - end - alu_result_vm_m = (!vinsn_issue.vm) ? alu_result_vm & bit_enable_mask : alu_result_vm; - end else begin - alu_result_vm = '0; - end - end - VIOTA: begin - if (&masku_operand_alu_valid) begin - masku_operand_alu_seq_m = masku_operand_alu_seq & bit_enable_mask; - unique case (vinsn_issue.vtype.vsew) - EW8 : begin - if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [7:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:8] + alu_result_ff [(NrLanes*DataWidth)-1-:8]; - end else begin - alu_result_vm [7:0] = '0; - end - for (int index = 1; index < (NrLanes*DataWidth)/8; index++) begin - alu_result_vm [(index*8) +: 7] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*8) +: 7]; - alu_result_vm_m [(index*8) +: 7] = alu_result_vm [(index*8) +: 7]; - end - end - EW16: begin - if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [15:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:16] + alu_result_ff [(NrLanes*DataWidth)-1-:16]; - end else begin - alu_result_vm [15:0] = '0; - end - for (int index = 1; index < (NrLanes*DataWidth)/16; index++) begin - alu_result_vm [(index*16) +: 15] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*16) +: 15]; - alu_result_vm_m [(index*16) +: 15] = alu_result_vm [(index*16) +: 15]; - end - end - EW32: begin - if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [31:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:32] + alu_result_ff [(NrLanes*DataWidth)-1-:32]; - end else begin - alu_result_vm [31:0] = '0; - end - for (int index = 1; index < (NrLanes*DataWidth)/32; index++) begin - alu_result_vm [(index*32) +: 31] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*32) +: 31]; - alu_result_vm_m [(index*32) +: 31] = alu_result_vm [(index*32) +: 31]; - end - end - EW64: begin - if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [63:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:64] + alu_result_ff [(NrLanes*DataWidth)-1-:64]; - end else begin - alu_result_vm [63:0] = '0; - end - for (int index = 1; index < (NrLanes*DataWidth)/64; index++) begin - alu_result_vm [(index*64) +: 63] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*64) +: 63]; - alu_result_vm_m [(index*64) +: 63] = alu_result_vm [(index*64) +: 63]; - end - end - endcase + vmsbf_buffer[0] = ~(masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism] | found_one_q); + for (int i = 1; i < VmsxfParallelism; i++) begin + vmsbf_buffer[i] = ~((masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism + i]) | ~vmsbf_buffer[i-1]); end + // Have we found a 1 in the current slice? 
+ found_one = |(masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism]) | found_one_q; + + alu_result_vmsbf_vm[out_valid_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism] = vmsbf_buffer; + alu_result_vmsif_vm[out_valid_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism] = {vmsbf_buffer[VmsxfParallelism-2:0], ~found_one_q}; + alu_result_vmsof_vm[out_valid_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism] = ~vmsbf_buffer & {vmsbf_buffer[VmsxfParallelism-2:0], ~found_one_q}; + + unique case (vinsn_issue.op) + VMSBF: alu_result_vm = alu_result_vmsbf_vm; + VMSIF: alu_result_vm = alu_result_vmsif_vm; + // VMSOF + default: alu_result_vm = alu_result_vmsof_vm; + endcase + + // Mask the result + alu_result_vm_m = (!vinsn_issue.vm) || (vinsn_issue.op inside {[VMADC:VMSBC]}) ? alu_result_vm | ~masku_operand_m_seq : alu_result_vm; end - VID: begin - if (&masku_operand_alu_valid) begin - unique case (vinsn_issue.vtype.vsew) - EW8 : begin - for (int index = 1; index < (NrLanes*DataWidth)/8; index++) begin - alu_result_vm [(index*8) +: 7] = (((NrLanes * DataWidth)/8) >= vinsn_issue.vl) ? index : index-(((vinsn_issue.vl/((NrLanes * DataWidth)/8))-iteration_count_d)*32); - alu_result_vm_m = alu_result_vm & mask; - end - end - EW16: begin - for (int index = 1; index < (NrLanes*DataWidth)/16; index++) begin - alu_result_vm [(index*16) +: 15] = (((NrLanes * DataWidth)/8) >= vinsn_issue.vl) ? index : index-(((vinsn_issue.vl/((NrLanes * DataWidth)/8))-iteration_count_d)*16); - alu_result_vm_m = alu_result_vm & mask; - end - end - EW32: begin - for (int index = 1; index < (NrLanes*DataWidth)/32; index++) begin - alu_result_vm [(index*32) +: 31] = (((NrLanes * DataWidth)/8) >= vinsn_issue.vl) ? index : index-(((vinsn_issue.vl/((NrLanes * DataWidth)/8))-iteration_count_d)*8); - alu_result_vm_m = alu_result_vm & mask; - end - end - EW64: begin - for (int index = 1; index < (NrLanes*DataWidth)/64; index++) begin - alu_result_vm [(index*64) +: 63] = (((NrLanes * DataWidth)/8) >= vinsn_issue.vl) ? index : index-(((vinsn_issue.vl/((NrLanes * DataWidth)/8))-iteration_count_d)*4); - alu_result_vm_m = alu_result_vm & mask; - end - end - endcase + // VIOTA, VID: compute a slice of the output and mask out the masked elements + // VID re-uses the VIOTA datapath + VIOTA, VID: begin + // Mask the input vector + // VID uses the same datapath of VIOTA, but with implicit input vector at '1 + masku_operand_alu_seq_m = (vinsn_issue.op == VID) + ? 
'1 // VID mask does NOT modify the count
+ : masku_operand_alu_seq
+ & (masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}}); // VIOTA mask DOES modify the count
+
+ // Compute output results on `ViotaParallelism 16-bit adders
+ viota_res[0] = viota_acc_q;
+ for (int i = 0; i < ViotaParallelism - 1; i++) begin
+ viota_res[i+1] = viota_res[i] + masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i];
+ end
+ viota_acc = viota_res[ViotaParallelism-1] + masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + ViotaParallelism - 1];
+
+ // This datapath should be relatively simple:
+ // `ViotaParallelism bytes connected, in line, to output byte chunks
+ // Multiple limited-width counters should help the synthesizer reduce wiring
+ unique case (vinsn_issue.vtype.vsew)
+ EW8: for (int i = 0; i < ViotaParallelism; i++) begin
+ alu_result_vm_m[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism * 8 + i*8 +: 8] = viota_res[i][7:0];
+ end
+ EW16: for (int i = 0; i < ViotaParallelism; i++) begin
+ alu_result_vm_m[out_valid_cnt_q[idx_width(NrLanes*DataWidth/16/ViotaParallelism)-1:0] * ViotaParallelism * 16 + i*16 +: 16] = viota_res[i];
+ end
+ EW32: for (int i = 0; i < ViotaParallelism; i++) begin
+ alu_result_vm_m[out_valid_cnt_q[idx_width(NrLanes*DataWidth/32/ViotaParallelism)-1:0] * ViotaParallelism * 32 + i*32 +: 32] = {{32{1'b0}}, viota_res[i]};
+ end
+ default: for (int i = 0; i < ViotaParallelism; i++) begin // EW64
+ alu_result_vm_m[out_valid_cnt_q[idx_width(NrLanes*DataWidth/64/ViotaParallelism)-1:0] * ViotaParallelism * 64 + i*64 +: 64] = {{48{1'b0}}, viota_res[i]};
+ end
+ endcase
+
+ // BE signal for VIOTA, VID
+ unique case (vinsn_issue.vtype.vsew)
+ EW8: for (int i = 0; i < ViotaParallelism; i++) begin
+ be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism * 1 + 1*i +: 1] =
+ {1{vinsn_issue.vm}} | {1{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}};
+ end
+ EW16: for (int i = 0; i < ViotaParallelism; i++) begin
+ be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/16/ViotaParallelism)-1:0] * ViotaParallelism * 2 + 2*i +: 2] =
+ {2{vinsn_issue.vm}} | {2{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}};
+ end
+ EW32: for (int i = 0; i < ViotaParallelism; i++) begin
+ be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/32/ViotaParallelism)-1:0] * ViotaParallelism * 4 + 4*i +: 4] =
+ {4{vinsn_issue.vm}} | {4{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}};
+ end
+ default: for (int i = 0; i < ViotaParallelism; i++) begin // EW64
+ be_viota_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/64/ViotaParallelism)-1:0] * ViotaParallelism * 8 + 8*i +: 8] =
+ {8{vinsn_issue.vm}} | {8{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]}};
+ end
+ endcase
+ end
+ // VCPOP, VFIRST: mask the current slice and feed the popc or lzc unit
 [VCPOP:VFIRST] : begin
- vcpop_operand = (!vinsn_issue.vm) ? masku_operand_vs2_seq & bit_enable_mask : masku_operand_vs2_seq;
- end
- default: begin
- alu_result = '0;
- alu_result_vm = '0;
+ vcpop_operand = (!vinsn_issue.vm) ? 
masku_operand_alu_seq & masku_operand_m_seq : masku_operand_alu_seq; end + default:; endcase end - // Shuffle result for masked instructions + // Shuffle the sequential result with vtype.vsew encoding + for (int b = 0; b < (NrLanes*StrbWidth); b++) begin + automatic int shuffle_byte = shuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); + alu_result_vm_shuf[8*shuffle_byte +: 8] = alu_result_vm_m[8*b +: 8]; + end + + // Shuffle the VIOTA, VID byte enable signal + be_viota_shuf = '0; for (int b = 0; b < (NrLanes*StrbWidth); b++) begin - automatic int shuffle_byte = shuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); - alu_result_vm_seq[8*shuffle_byte +: 8] = alu_result_vm_m[8*b +: 8]; + automatic int shuffle_byte = shuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); + be_viota_shuf[shuffle_byte] = be_viota_seq_d[b]; end - // alu_result propagation mux - if (vinsn_issue.op inside {[VMSBF:VID]}) - alu_result = alu_result_vm_seq; + // Simplify layout handling + alu_result = alu_result_vm_shuf; - end: p_mask_alu + // Prepare the background data with vtype.vsew encoding + result_queue_mask_seq = vinsn_issue.op inside {[VIOTA:VID]} ? '0 : masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}} | {NrLanes*DataWidth{vinsn_issue.op inside {[VMADC:VMSBC]}}}; + background_data_init_seq = masku_operand_vd_seq | result_queue_mask_seq; + background_data_init_shuf = '0; + for (int b = 0; b < (NrLanes*StrbWidth); b++) begin + automatic int shuffle_byte = shuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); + background_data_init_shuf[8*shuffle_byte +: 8] = background_data_init_seq[8*b +: 8]; + end ///////////////// // Mask unit // ///////////////// - // Vector instructions currently running - logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; - - // Interface with the main sequencer - pe_resp_t pe_resp; - - // Effective MASKU stride in case of VSLIDEUP - // MASKU receives chunks of 64 * NrLanes mask bits from the lanes - // VSLIDEUP only needs the bits whose index >= than its stride - // So, the operand requester does not send vl mask bits to MASKU - // and trims all the unused 64 * NrLanes mask bits chunks - // Therefore, the stride needs to be trimmed, too - elen_t trimmed_stride; - - logic [NrLanes-1:0] fake_a_valid; - logic last_incoming_a; - logic unbalanced_a; - - // Control signals for better code-readability (this signals goes high if a result is valid and can be pushed to the result_queue) - logic vreg_wb_valid; - - // Information about which is the target FU of the request - assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? 
MaskFUMFpu : MaskFUAlu; - - // Byte enable for the result queue - logic [NrLanes*ELENB-1:0] result_queue_be_seq; - logic [NrLanes*ELENB-1:0] result_queue_be; - - always_comb begin: p_masku // Maintain state - vinsn_queue_d = vinsn_queue_q; - read_cnt_d = read_cnt_q; - issue_cnt_d = issue_cnt_q; - commit_cnt_d = commit_cnt_q; + vinsn_queue_d = vinsn_queue_q; + read_cnt_d = read_cnt_q; + issue_cnt_d = issue_cnt_q; + processing_cnt_d = processing_cnt_q; + commit_cnt_d = commit_cnt_q; mask_pnt_d = mask_pnt_q; vrf_pnt_d = vrf_pnt_q; - vcpop_slice_cnt_d = vcpop_slice_cnt_q; popcount_d = popcount_q; vfirst_count_d = vfirst_count_q; @@ -634,13 +741,19 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( trimmed_stride = pe_req_i.stride; + out_vrf_word_valid = 1'b0; + out_scalar_valid = 1'b0; + // Vector instructions currently running vinsn_running_d = vinsn_running_q & pe_vinsn_running_i; + // Mask the response, by default + pe_resp = '0; + // We are not ready, by default - pe_resp = '0; - masku_operand_alu_ready = '0; - masku_operand_m_ready = '0; + masku_operand_alu_ready = '0; + masku_operand_m_ready = '0; + masku_operand_vd_ready = '0; // Inform the main sequencer if we are idle pe_req_ready_o = !vinsn_queue_full; @@ -649,329 +762,110 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_scalar_d = result_scalar_o; result_scalar_valid_d = result_scalar_valid_o; - // Balance the incoming valid - unbalanced_a = (|commit_cnt_q[idx_width(NrLanes)-1:0] != 1'b0) ? 1'b1 : 1'b0; - last_incoming_a = ((commit_cnt_q - vrf_pnt_q) < NrLanes) ? 1'b1 : 1'b0; - fake_a_valid[0] = 1'b0; - for (int unsigned i = 1; i < NrLanes; i++) - if (i >= {1'b0, commit_cnt_q[idx_width(NrLanes)-1:0]}) - fake_a_valid[i] = last_incoming_a & unbalanced_a; - else - fake_a_valid = 1'b0; + // Don't handshake the inputs + in_ready_cnt_en = 1'b0; + in_m_ready_cnt_en = 1'b0; + out_valid_cnt_en = 1'b0; + + // Result queue background data + for (int unsigned lane = 0; lane < NrLanes; lane++) + result_queue_background_data[lane] = result_queue_q[result_queue_write_pnt_q][lane].wdata; + + // Maintain state + delta_elm_d = delta_elm_q; + in_ready_threshold_d = in_ready_threshold_q; + in_m_ready_threshold_d = in_m_ready_threshold_q; + out_valid_threshold_d = out_valid_threshold_q; + + in_ready_cnt_clr = 1'b0; + in_m_ready_cnt_clr = 1'b0; + out_valid_cnt_clr = 1'b0; + iteration_cnt_clr = 1'b0; ///////////////////// // Mask Operands // ///////////////////// - // Is there an instruction ready to be issued? - if (vinsn_issue_valid && !(vd_scalar(vinsn_issue.op))) begin - // Is there place in the mask queue to write the mask operands? - // Did we receive the mask bits on the MaskM channel? - if (!vinsn_issue.vm && !mask_queue_full && &masku_operand_m_valid && !(vinsn_issue.op inside {VMSBF, VMSOF, VMSIF})) begin - // Copy data from the mask operands into the mask queue - for (int vrf_seq_byte = 0; vrf_seq_byte < NrLanes*StrbWidth; vrf_seq_byte++) begin - // Map vrf_seq_byte to the corresponding byte in the VRF word. - automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue.vtype.vsew); - - // At which lane, and what is the byte offset in that lane, of the byte vrf_byte? - // NOTE: This does not work if the number of lanes is not a power of two. - // If that is needed, the following two lines must be changed accordingly. 
- automatic int vrf_lane = vrf_byte >> $clog2(StrbWidth);
- automatic int vrf_offset = vrf_byte[idx_width(StrbWidth)-1:0];
-
- // The VRF pointer can be broken into a byte offset, and a bit offset
- automatic int vrf_pnt_byte_offset = mask_pnt_q >> $clog2(StrbWidth);
- automatic int vrf_pnt_bit_offset = mask_pnt_q[idx_width(StrbWidth)-1:0];
-
- // A single bit from the mask operands can be used several times, depending on the eew.
- automatic int mask_seq_bit = vrf_seq_byte >> int'(vinsn_issue.vtype.vsew);
- automatic int mask_seq_byte = (mask_seq_bit >> $clog2(StrbWidth)) + vrf_pnt_byte_offset;
- // Shuffle this source byte
- automatic int mask_byte = shuffle_index(mask_seq_byte, NrLanes, vinsn_issue.eew_vmask);
- // Account for the bit offset
- automatic int mask_bit = (mask_byte << $clog2(StrbWidth)) +
- mask_seq_bit[idx_width(StrbWidth)-1:0] + vrf_pnt_bit_offset;
-
- // At which lane, and what is the bit offset in that lane, of the mask operand from
- // mask_seq_bit?
- automatic int mask_lane = mask_bit >> idx_width(DataWidth);
- automatic int mask_offset = mask_bit[idx_width(DataWidth)-1:0];
-
- // Copy the mask operand
- mask_queue_d[mask_queue_write_pnt_q][vrf_lane][vrf_offset] =
- masku_operand_m[mask_lane][mask_offset];
- end
-
- // Account for the used operands
- mask_pnt_d += NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew));
-
- // Increment result queue pointers and counters
- mask_queue_cnt_d += 1;
- if (mask_queue_write_pnt_q == MaskQueueDepth-1)
- mask_queue_write_pnt_d = '0;
- else
- mask_queue_write_pnt_d = mask_queue_write_pnt_q + 1;
-
- // Account for the operands that were issued
- read_cnt_d = read_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew));
- if (read_cnt_q < NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew)))
- read_cnt_d = '0;
-
- // Trigger the request signal
- mask_queue_valid_d[mask_queue_write_pnt_q] = {NrLanes{1'b1}};
-
- // Are there lanes with no valid elements?
- // If so, mute their request signal
- if (read_cnt_q < NrLanes)
- mask_queue_valid_d[mask_queue_write_pnt_q] = (1 << read_cnt_q) - 1;
-
- // Consumed all valid bytes from the lane operands
- if (mask_pnt_d == NrLanes*64 || read_cnt_d == '0) begin
- // Request another beat
- masku_operand_m_ready = '1;
- // Reset the pointer
- mask_pnt_d = '0;
- end
+ // Instructions that run in other units, but need mask strobes for predicated execution
+
+ // Is there space in the mask queue?
+ if (!mask_queue_full) begin
+ // Copy data from the mask operands into the mask queue
+ for (int vrf_seq_byte = 0; vrf_seq_byte < NrLanes*StrbWidth; vrf_seq_byte++) begin
+ // Map vrf_seq_byte to the corresponding byte in the VRF word.
+ automatic int vrf_byte = shuffle_index(vrf_seq_byte, NrLanes, vinsn_issue.vtype.vsew);
+
+ // At which lane, and what is the byte offset in that lane, of the byte vrf_byte?
+ // NOTE: This does not work if the number of lanes is not a power of two.
+ // If that is needed, the following two lines must be changed accordingly.
+ automatic int vrf_lane = vrf_byte >> $clog2(StrbWidth);
+ automatic int vrf_offset = vrf_byte[idx_width(StrbWidth)-1:0];
+
+ // The VRF pointer can be broken into a byte offset, and a bit offset
+ automatic int vrf_pnt_byte_offset = mask_pnt_q >> $clog2(StrbWidth);
+ automatic int vrf_pnt_bit_offset = mask_pnt_q[idx_width(StrbWidth)-1:0];
+
+ // A single bit from the mask operands can be used several times, depending on the eew. 
+ automatic int mask_seq_bit = vrf_seq_byte >> int'(vinsn_issue.vtype.vsew); + automatic int mask_seq_byte = (mask_seq_bit >> $clog2(StrbWidth)) + vrf_pnt_byte_offset; + // Shuffle this source byte + automatic int mask_byte = shuffle_index(mask_seq_byte, NrLanes, vinsn_issue.eew_vmask); + // Account for the bit offset + automatic int mask_bit = (mask_byte << $clog2(StrbWidth)) + + mask_seq_bit[idx_width(StrbWidth)-1:0] + vrf_pnt_bit_offset; + + // At which lane, and what is the bit offset in that lane, of the mask operand from + // mask_seq_bit? + automatic int mask_lane = mask_bit >> idx_width(DataWidth); + automatic int mask_offset = mask_bit[idx_width(DataWidth)-1:0]; + + // Copy the mask operand + mask_queue_d[mask_queue_write_pnt_q][vrf_lane][vrf_offset] = + masku_operand_m[mask_lane][mask_offset]; end - end - - ////////////////////////////// - // Calculate scalar results // - ////////////////////////////// - - vcpop_vfirst_vs2_ready = 1'b0; - - // Is there an instruction ready to be issued? - if (vinsn_issue_valid && vd_scalar(vinsn_issue.op)) begin - if (&(masku_operand_vs2_seq_valid | fake_a_valid) && (&masku_operand_m_valid || vinsn_issue.vm)) begin - - // increment slice counter - vcpop_slice_cnt_d = vcpop_slice_cnt_q + 1'b1; - - // request new operand (by completing ready-valid handshake) once all slices have been processed - vcpop_vfirst_vs2_ready = 1'b0; - if (((vcpop_slice_cnt_q == N_SLICES_CPOP - 1) && vinsn_issue.op == VCPOP) || - ((vcpop_slice_cnt_q == N_SLICES_VFIRST-1) && vinsn_issue.op == VFIRST)) begin - vcpop_slice_cnt_d = '0; - vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; - if (!vinsn_issue.vm) begin - masku_operand_m_ready = '1; - end - end - // Account for the elements that were processed - issue_cnt_d = issue_cnt_q - W_CPOP; - - // abruptly stop processing elements if vl is reached - if (iteration_count_d >= (vinsn_issue.vl/(W_CPOP)) || (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin - issue_cnt_d = '0; - commit_cnt_d = '0; - read_cnt_d ='0; - vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; - if (!vinsn_issue.vm) begin - masku_operand_m_ready = '1; - end - end - - popcount_d = popcount_q + popcount; - vfirst_count_d = vfirst_count_q + vfirst_count; - - // if this is the last beat, commit the result to the scalar_result queue - if ((iteration_count_d >= (vinsn_issue.vl/W_CPOP) && vinsn_issue.op == VCPOP) || - (iteration_count_d >= (vinsn_issue.vl/W_VFIRST) && vinsn_issue.op == VFIRST) || - (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin - result_scalar_d = (vinsn_issue.op == VCPOP) ? popcount_d : (vfirst_empty) ? -1 : vfirst_count_d; - result_scalar_valid_d = '1; - - // Decrement the commit counter by the entire number of elements, - // since we only commit one result for everything - commit_cnt_d = '0; - - // reset vcpop slice counter, since instruction is finished - vcpop_slice_cnt_d = '0; - - // acknowledge operand a - vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; - if (!vinsn_issue.vm) begin + // Is there an instruction ready to be issued? + if (vinsn_issue_valid && ((vinsn_issue.vfu != VFU_MaskUnit) || (vinsn_issue.op inside {[VMADC:VMSBC]}))) begin + // Is there place in the mask queue to write the mask operands? + // Did we receive the mask bits on the MaskM channel? 
+ if (!vinsn_issue.vm && &masku_operand_m_valid) begin + // Account for the used operands + mask_pnt_d += NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew)); + + // Increment result queue pointers and counters + mask_queue_cnt_d += 1; + if (mask_queue_write_pnt_q == MaskQueueDepth-1) + mask_queue_write_pnt_d = '0; + else + mask_queue_write_pnt_d = mask_queue_write_pnt_q + 1; + + // Account for the operands that were issued + read_cnt_d = read_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew)); + if (read_cnt_q < NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew))) + read_cnt_d = '0; + + // Trigger the request signal + mask_queue_valid_d[mask_queue_write_pnt_q] = {NrLanes{1'b1}}; + + // Are there lanes with no valid elements? + // If so, mute their request signal + if (read_cnt_q < NrLanes) + mask_queue_valid_d[mask_queue_write_pnt_q] = (1 << read_cnt_q) - 1; + + // Consumed all valid bytes from the lane operands + if (mask_pnt_d == NrLanes*DataWidth || read_cnt_d == '0) begin + // Request another beat masku_operand_m_ready = '1; + // Reset the pointer + mask_pnt_d = '0; end end end end - ////////////////////////////////// - // Write results to the lanes // - ////////////////////////////////// - - result_queue_be = '1; - result_queue_be_seq = '1; - vmsif_vmsof_vmsbf_vs2_ready = '0; - - // Is there an instruction ready to be issued? - if (vinsn_issue_valid && !vd_scalar(vinsn_issue.op)) begin - // This instruction executes on the Mask Unit - if (vinsn_issue.vfu == VFU_MaskUnit) begin - // Is there place in the result queue to write the results? - // Did we receive the operands? - if (!result_queue_full && (&(masku_operand_alu_valid | fake_a_valid | masku_operand_vs2_seq_valid))) begin - // How many elements are we committing in total? - // Since we are committing bits instead of bytes, we carry out the following calculation - // with ceil(vl/8) instead. - automatic int element_cnt_all_lanes = (ELENB * NrLanes) >> int'(vinsn_issue.vtype.vsew); - // How many elements are remaining to be committed? Carry out the calculation with - // ceil(issue_cnt/8). - automatic int remaining_element_cnt_all_lanes = (issue_cnt_q + 7) / 8; - remaining_element_cnt_all_lanes = (remaining_element_cnt_all_lanes + - (1 << int'(vinsn_issue.vtype.vsew)) - 1) >> int'(vinsn_issue.vtype.vsew); - if (element_cnt_all_lanes > remaining_element_cnt_all_lanes) - element_cnt_all_lanes = remaining_element_cnt_all_lanes; - - // Acknowledge the operands of this instruction. - // At this stage, acknowledge only the first operand, "a", coming from the ALU/VMFpu. - masku_operand_alu_ready = masku_operand_alu_valid; - vmsif_vmsof_vmsbf_vs2_ready = (&masku_operand_m_valid || vinsn_issue.vm) ? 
'1 : '0; - - if (!vinsn_issue.vm) begin - unique case (vinsn_issue.vtype.vsew) - EW8 : result_queue_be_seq = masku_operand_m_seq[NrLanes*ELENB-1:0]; - EW16: begin - for (int i = 0; i < NrLanes * ELENB / 2; i++) begin - result_queue_be_seq[2*i +: 2] = {2{bit_enable_mask[i]}}; - end - end - EW32: begin - for (int i = 0; i < NrLanes * ELENB / 4; i++) begin - result_queue_be_seq[4*i +: 4] = {4{bit_enable_mask[i]}}; - end - end - EW64: begin - for (int i = 0; i < NrLanes * ELENB / 8; i++) begin - result_queue_be_seq[8*i +: 8] = {8{bit_enable_mask[i]}}; - end - end - default: ; // Not sure what should be the default - endcase - for (int i = 0; i < NrLanes*ELENB; i++) begin - result_queue_be[shuffle_index(i, NrLanes, vinsn_issue.vtype.vsew)] = result_queue_be_seq[i]; - end - end - - if (vinsn_issue.op inside {[VMSBF: VMSIF], VID}) begin - result_queue_be = '1; - end - - // Store the result in the operand queue - for (int unsigned lane = 0; lane < NrLanes; lane++) begin - // How many elements are we committing in this lane? - automatic int element_cnt = element_cnt_all_lanes / NrLanes; - if (lane < element_cnt_all_lanes[idx_width(NrLanes)-1:0]) - element_cnt += 1; - - result_queue_d[result_queue_write_pnt_q][lane] = '{ - wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata | alu_result[lane], - be : (vinsn_issue.op inside {[VMSBF:VID]}) ? result_queue_be[lane*ELENB +: ELENB] : be(element_cnt, vinsn_issue.vtype.vsew), - addr : (vinsn_issue.op inside {[VIOTA:VID]}) ? vaddr(vinsn_issue.vd, NrLanes, VLEN) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes, VLEN) + - (((vinsn_issue.vl - issue_cnt_q) / NrLanes / DataWidth)), - id : vinsn_issue.id - }; - end - - // Increment the VRF pointer - if (vinsn_issue.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]}) begin - vrf_pnt_d = vrf_pnt_q + (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew)); - - // Filled-up a word, or finished execution - if (vrf_pnt_d == DataWidth*NrLanes || vrf_pnt_d >= issue_cnt_q) begin - result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - - // Reset VRF pointer - vrf_pnt_d = '0; - - // Increment result queue pointers and counters - result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) - result_queue_write_pnt_d = '0; - else - result_queue_write_pnt_d = result_queue_write_pnt_q + 1; - - // Account for the results that were issued - issue_cnt_d = issue_cnt_q - NrLanes * DataWidth; - if (issue_cnt_q < NrLanes * DataWidth) - issue_cnt_d = '0; - end - end else if (vinsn_issue.op inside {[VMSBF:VID]}) begin - if (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {VIOTA, VID}) begin - result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - - // Increment result queue pointers and counters - result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) - result_queue_write_pnt_d = '0; - else - result_queue_write_pnt_d = result_queue_write_pnt_q + 1; - - if (result_queue_read_pnt_q == ResultQueueDepth-1) - result_queue_read_pnt_d = '0; - else - result_queue_read_pnt_d = result_queue_read_pnt_m; - - // Account for the results that were issued - if (vinsn_issue.op inside {VIOTA, VID}) begin - issue_cnt_d = issue_cnt_q - (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew)); - if ((vinsn_issue.vl-issue_cnt_d) >= vinsn_issue.vl) - issue_cnt_d = '0; - end else begin - issue_cnt_d = issue_cnt_q - NrLanes * DataWidth; - if ((vinsn_issue.vl-issue_cnt_d) >= vinsn_issue.vl) - issue_cnt_d = '0; - end - end - end 
else begin - result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - // Increment result queue pointers and counters - result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) - result_queue_write_pnt_d = '0; - else - result_queue_write_pnt_d = result_queue_write_pnt_q + 1; - - // Account for the results that were issued - issue_cnt_d = issue_cnt_q - NrLanes * DataWidth; - if (issue_cnt_q < NrLanes * DataWidth) - issue_cnt_d = '0; - end - end - end - end - - /////////////////////////// - //// Masked Instruction /// - /////////////////////////// - if ((|masku_operand_alu_valid && !result_queue_full) && (&masku_operand_m_valid || vinsn_issue.vm) && vinsn_commit_valid && vinsn_commit.op inside {[VIOTA:VID]}) begin - // if this is the last beat, commit the result to the scalar_result queue - commit_cnt_d = commit_cnt_q - (NrLanes << (int'(EW64) - vinsn_commit.vtype.vsew)); - if ((vinsn_commit.vl-commit_cnt_d) >= vinsn_commit.vl) begin - commit_cnt_d = '0; - end - end - if ((&masku_operand_alu_valid || &masku_operand_vs2_seq_valid) && (&masku_operand_m_valid || vinsn_issue.vm) && vinsn_commit_valid && vinsn_commit.op inside {VMSBF, VMSOF, VMSIF}) begin - commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; - if ((vinsn_commit.vl-commit_cnt_d) >= vinsn_commit.vl) begin - commit_cnt_d = '0; - end - end - - // Finished issuing results - if (vinsn_issue_valid && ( - ( (vinsn_issue.vm || vinsn_issue.vfu == VFU_MaskUnit) && issue_cnt_d == '0) || - (!(vinsn_issue.vm || vinsn_issue.vfu == VFU_MaskUnit) && read_cnt_d == '0))) begin - // Increment vector instruction queue pointers and counters - vinsn_queue_d.issue_cnt -= 1; - end - - ///////////////////////////////// - // Send operands to the VFUs // - ///////////////////////////////// + ////////////////////////////////////// + // Send Mask Operands to the VFUs // + ////////////////////////////////////// for (int lane = 0; lane < NrLanes; lane++) begin: send_operand mask_valid_o[lane] = mask_queue_valid_q[mask_queue_read_pnt_q][lane]; @@ -980,7 +874,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // The VLDU and the VSTU acknowledge all the operands at once. // Only accept the acknowledgement from the lanes if the current instruction is executing there. // Deactivate the request, but do not bump the pointers for now. - if ((lane_mask_ready_i[lane] && mask_valid_o[lane] && vinsn_issue.vfu inside {VFU_Alu, VFU_MFpu, VFU_MaskUnit}) || + if ((lane_mask_ready_i[lane] && mask_valid_o[lane] && (vinsn_issue.vfu inside {VFU_Alu, VFU_MFpu} || vinsn_issue.op inside {[VMADC:VMSBC]})) || vldu_mask_ready_i || vstu_mask_ready_i || sldu_mask_ready_i) begin mask_queue_valid_d[mask_queue_read_pnt_q][lane] = 1'b0; mask_queue_d[mask_queue_read_pnt_q][lane] = '0; @@ -990,12 +884,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Is this operand going to the lanes? mask_valid_lane_o = vinsn_issue.vfu inside {VFU_Alu, VFU_MFpu, VFU_MaskUnit}; - if (vd_scalar(vinsn_issue.op)) begin - mask_valid_o = (vinsn_issue.vm) ? 
'0 : '1;
- end
-
 // All lanes accepted the VRF request
- if (!(|mask_queue_valid_d[mask_queue_read_pnt_q]))
+ if (!(|mask_queue_valid_d[mask_queue_read_pnt_q])) begin
 // There is something waiting to be written
 if (!mask_queue_empty) begin
 // Increment the read pointer
@@ -1017,10 +907,171 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 commit_cnt_d = '0;
 end
 end
+ end
+
+ ///////////////////////
+ // MASKU ALU Control //
+ ///////////////////////
+
+ // Instructions that natively run in the MASKU
+
+ // The main data packets come from the lanes' ALUs.
+ // Also, mask- and tail-undisturbed policies are implemented by fetching the destination register,
+ // which is the default value of the result queue.
+
+ // Almost all the operations are time multiplexed. Moreover, some operations (e.g., VIOTA) work on
+ // different input and output data widths, meaning that the input ready and the final output valid
+ // are not always synchronized.
+
+ // How many elements {VIOTA|VID} write to each lane
+ elm_per_lane = processing_cnt_q / NrLanes;
+ if ((processing_cnt_q / NrLanes) > 4'b1000)
+ elm_per_lane = 4'b1000;
+ for (int l = 0; l < NrLanes; l++) additional_elm[l] = processing_cnt_q[idx_width(NrLanes)-1:0] > l;
+
+ // Default result queue assignment
+ for (int unsigned lane = 0; lane < NrLanes; lane++) begin
+ result_queue_d[result_queue_write_pnt_q][lane] = '{
+ wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata, // Retain the last-cycle's data
+ // VIOTA, VID generate a non-mask vector and should comply with undisturbed policy
+ // This means that we can use the byte-enable signal
+ be : vinsn_issue.op inside {[VIOTA:VID]} ? be(elm_per_lane + additional_elm[lane], vinsn_issue.vtype.vsew) & be_viota_shuf[lane*StrbWidth +: StrbWidth] : '1,
+ addr : vaddr(vinsn_issue.vd, NrLanes, VLEN) + iteration_cnt_q,
+ id : vinsn_issue.id
+ };
+ end
+
+ // Is there an instruction ready to be issued?
+ if (vinsn_issue_valid && vinsn_issue.op inside {[VMFEQ:VMXNOR]}) begin
+ // Compute one slice if we can write and the necessary inputs are valid
+ if (!result_queue_full && (&masku_operand_alu_valid || vinsn_issue.op == VID)
+ && (&masku_operand_vd_valid || !vinsn_issue.use_vd_op)
+ && (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {[VMADC:VMSBC]})) begin
+
+ // Initialize the result queue with the background data - either vd or the previous result
+ // For mask vectors, set to 1 (a tail-agnostic safe value) both the background body
+ // elements that will be written by the MASKU ALU and the tail elements.
+ for (int unsigned lane = 0; lane < NrLanes; lane++) begin
+ result_queue_background_data[lane] = (out_valid_cnt_q != '0)
+ ? result_queue_q[result_queue_write_pnt_q][lane].wdata
+ : vinsn_issue.op inside {[VIOTA:VID]} ? '1 : background_data_init_shuf[lane*DataWidth +: DataWidth];
+ end
+ for (int unsigned lane = 0; lane < NrLanes; lane++) begin
+ // The alu_result has all the bits at 1 except for the portion of bits to write.
+ // The masking is already applied in the MASKU ALU. 
+ result_queue_d[result_queue_write_pnt_q][lane].wdata = result_queue_background_data[lane] & alu_result[lane];
+ end
+ // Write the scalar accumulator
+ popcount_d = popcount_q + popcount;
+ vfirst_count_d = vfirst_count_q + vfirst_count;
+
+ // Bump MASKU ALU state
+ found_one_d = found_one;
+ viota_acc_d = viota_acc;
+ vrf_pnt_d = vrf_pnt_q + delta_elm_q;
+
+ // Increment the input, input-mask, and output slice counters
+ in_ready_cnt_en = 1'b1;
+ in_m_ready_cnt_en = 1'b1;
+ out_valid_cnt_en = 1'b1;
+
+ // Account for the elements that have been processed
+ issue_cnt_d = issue_cnt_q - delta_elm_q;
+ if (issue_cnt_q < delta_elm_q)
+ issue_cnt_d = '0;
+
+ // Request new input (by completing ready-valid handshake) once all slices have been processed
+ // Alu input is accessed in different widths
+ if ((in_ready_cnt_q == in_ready_threshold_q) || (issue_cnt_d == '0)) begin
+ in_ready_cnt_clr = 1'b1;
+ if (vinsn_issue.op != VID) begin
+ masku_operand_alu_ready = '1;
+ end
+ end
+ // Mask is always accessed at bit level
+ // VMADC, VMSBC handle masks in the mask queue
+ if ((in_m_ready_cnt_q == in_m_ready_threshold_q) || (issue_cnt_d == '0) && !(vinsn_issue.op inside {[VMADC:VMSBC]})) begin
+ in_m_ready_cnt_clr = 1'b1;
+ if (!vinsn_issue.vm) begin
+ masku_operand_m_ready = '1;
+ end
+ end
+
+ // Write to the result queue if the entry is full or
+ // if this is the last output slice of the vector.
+ // Also, handshake the vd input, which follows the output.
+ if ((out_valid_cnt_q == out_valid_threshold_q) || (issue_cnt_d == '0)) begin
+ out_valid_cnt_clr = 1'b1;
+ // Handshake vd input
+ if (vinsn_issue.use_vd_op) begin
+ masku_operand_vd_ready = '1;
+ end
+ // Assert valid result queue output
+ out_vrf_word_valid = !vd_scalar(vinsn_issue.op);
+ end
+
+ // The scalar result is valid for write back at the end of the operation.
+ // VFIRST can also interrupt the operation in advance when the 1 is found.
+ if (issue_cnt_d == '0 || (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin
+ // Assert valid scalar output
+ out_scalar_valid = vd_scalar(vinsn_issue.op);
+ end
+
+ // Have we finished insn execution? Clear MASKU ALU state
+ if (issue_cnt_d == '0) begin
+ be_viota_seq_d = '1; // Default: write
+ viota_acc_d = '0;
+ found_one_d = '0;
+ end
+ end
+ end
+
+ /////////////////////
+ // Write results //
+ /////////////////////
+
+ // Write VRF words to the result queue
+ if (out_vrf_word_valid) begin
+ // Write to the lanes
+ result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}};
+
+ // Increment result queue pointers and counters
+ result_queue_cnt_d += 1;
+ result_queue_write_pnt_d = result_queue_write_pnt_q + 1;
+ if (result_queue_write_pnt_q == ResultQueueDepth-1) begin
+ result_queue_write_pnt_d = '0;
+ end

- //////////////////////////////////
- // Write results into the VRF //
- //////////////////////////////////
+ // Clear MASKU ALU state
+ be_viota_seq_d = '0;
+
+ // Account for the written results
+ // VIOTA and VID do not write bits!
+ processing_cnt_d = vinsn_issue.op inside {[VIOTA:VID]} ? processing_cnt_q - ((NrLanes * DataWidth / 8) >> vinsn_issue.vtype.vsew) : processing_cnt_q - NrLanes * DataWidth;
+ end
+
+ // The scalar result is ready to be sent to the dispatcher
+ if (out_scalar_valid) begin
+ result_scalar_d = (vinsn_issue.op == VCPOP) ? popcount_d : ((vfirst_empty) ? 
-1 : vfirst_count_d);
+ result_scalar_valid_d = '1;
+
+ // The instruction is over
+ issue_cnt_d = '0;
+ processing_cnt_d = '0;
+ commit_cnt_d = '0;
+ end
+
+ // Finished issuing results
+ if (vinsn_issue_valid && (
+ ( (vinsn_issue.vm || vinsn_issue.vfu == VFU_MaskUnit) && issue_cnt_d == '0) ||
+ (!(vinsn_issue.vm || vinsn_issue.vfu == VFU_MaskUnit) && read_cnt_d == '0))) begin
+ // The instruction finished its issue phase
+ vinsn_queue_d.issue_cnt -= 1;
+ end
+
+ //////////////
+ // Commit //
+ //////////////

 for (int lane = 0; lane < NrLanes; lane++) begin: result_write
 masku_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane];
@@ -1044,7 +1095,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 // All lanes accepted the VRF request
 if (!(|result_queue_valid_d[result_queue_read_pnt_q]) &&
- (&result_final_gnt_d || (commit_cnt_q > (NrLanes * DataWidth))))
+ (&result_final_gnt_d || (commit_cnt_q > (NrLanes * DataWidth)))) begin
 // There is something waiting to be written
 if (!result_queue_empty) begin
 // Increment the read pointer
@@ -1060,41 +1111,59 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 result_queue_d[result_queue_read_pnt_q] = '0;

 // Decrement the counter of remaining vector elements waiting to be written
- if (!(vinsn_issue.op inside {VID, VSE})) begin
- commit_cnt_d = commit_cnt_q - NrLanes * DataWidth;
- if (commit_cnt_q < (NrLanes * DataWidth))
- commit_cnt_d = '0;
+ if (!(vinsn_commit.op inside {VSE})) begin
+ if (vinsn_commit.op inside {[VIOTA:VID]}) begin
+ commit_cnt_d = commit_cnt_q - ((NrLanes * DataWidth / 8) >> unsigned'(vinsn_commit.vtype.vsew));
+ if (commit_cnt_q < ((NrLanes * DataWidth / 8) >> unsigned'(vinsn_commit.vtype.vsew)))
+ commit_cnt_d = '0;
+ end else begin
+ commit_cnt_d = commit_cnt_q - NrLanes * DataWidth;
+ if (commit_cnt_q < (NrLanes * DataWidth))
+ commit_cnt_d = '0;
+ end
 end
 end
+ end
+
+ // Finished committing the results of a vector instruction
+ if (vinsn_commit_valid && commit_cnt_d == '0) begin
+ // Clear the output valid counter
+ out_valid_cnt_clr = 1'b1;
+
+ // Clear the vrf pointer for comparisons
+ vrf_pnt_d = '0;
+
+ // Clear the iteration counter
+ iteration_cnt_clr = 1'b1;
+
+ if(&result_final_gnt_d || vd_scalar(vinsn_commit.op) || vinsn_commit.vfu != VFU_MaskUnit) begin
+ // Mark the vector instruction as being done
+ pe_resp.vinsn_done[vinsn_commit.id] = 1'b1;
+
+ // Update the commit counters and pointers
+ vinsn_queue_d.commit_cnt -= 1;
+ end
+ end

 ///////////////////////////
 // Commit scalar results //
 ///////////////////////////

- // The scalar result has been sent to and acknowledged by the dispatcher
- if (vinsn_commit.op inside {[VCPOP:VFIRST]} && result_scalar_valid_o == 1) begin
-
- // reset result_scalar
+ // This is one cycle after asserting out_scalar_valid
+ // Ara's frontend is always ready to accept the scalar result
+ if (result_scalar_valid_o) begin
+ // Reset result_scalar
 result_scalar_d = '0;
 result_scalar_valid_d = '0;

- // reset the popcount and vfirst_count
+ // Clear the iteration counter
+ iteration_cnt_clr = 1'b1;
+
+ // Reset the popcount and vfirst_count
 popcount_d = '0;
 vfirst_count_d = '0;
 end

- // Finished committing the results of a vector instruction
- // Some instructions forward operands to the lanes before writing the VRF
- // In this case, wait for the lanes to be written
- if (vinsn_commit_valid && commit_cnt_d == '0 &&
- (!(vinsn_commit.op inside {[VMFEQ:VID], [VMSGT:VMSBC]}) || &result_final_gnt_d)) begin
- // Mark the vector instruction as being done
- 
pe_resp.vinsn_done[vinsn_commit.id] = 1'b1; - - // Update the commit counters and pointers - vinsn_queue_d.commit_cnt -= 1; - end - ////////////////////////////// // Accept new instruction // ////////////////////////////// @@ -1112,12 +1181,14 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Initialize counters if (vinsn_queue_d.issue_cnt == '0) begin - issue_cnt_d = pe_req_i.vl; - read_cnt_d = pe_req_i.vl; + issue_cnt_d = pe_req_i.vl; + processing_cnt_d = pe_req_i.vl; + read_cnt_d = pe_req_i.vl; // Trim skipped words if (pe_req_i.op == VSLIDEUP) begin - issue_cnt_d -= vlen_t'(trimmed_stride); + issue_cnt_d -= vlen_t'(trimmed_stride); + processing_cnt_d -= vlen_t'(trimmed_stride); case (pe_req_i.vtype.vsew) EW8: begin read_cnt_d -= (vlen_t'(trimmed_stride) >> $clog2(NrLanes << 3)) << $clog2(NrLanes << 3); @@ -1139,9 +1210,68 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( endcase end + // Initialize ALU MASKU counters and pointers + unique case (pe_req_i.op) inside + [VMFEQ:VMSGT]: begin + // Mask to mask - encoded + delta_elm_d = NrLanes << (EW64 - pe_req_i.eew_vs2[1:0]); + + in_ready_threshold_d = 0; + in_m_ready_threshold_d = (DataWidth >> (EW64 - pe_req_i.eew_vs2[1:0]))-1; + out_valid_threshold_d = (DataWidth >> (EW64 - pe_req_i.eew_vs2[1:0]))-1; + end + [VMADC:VMSBC]: begin + // Mask to mask - encoded + delta_elm_d = NrLanes << (EW64 - pe_req_i.eew_vs2[1:0]); + + in_ready_threshold_d = 0; + in_m_ready_threshold_d = (DataWidth >> (EW64 - pe_req_i.eew_vs2[1:0]))-1; + out_valid_threshold_d = (DataWidth >> (EW64 - pe_req_i.eew_vs2[1:0]))-1; + end + [VMANDNOT:VMXNOR]: begin + // Mask to mask + delta_elm_d = VmLogicalParallelism; + + in_ready_threshold_d = NrLanes*DataWidth/VmLogicalParallelism-1; + in_m_ready_threshold_d = NrLanes*DataWidth/VmLogicalParallelism-1; + out_valid_threshold_d = NrLanes*DataWidth/VmLogicalParallelism-1; + end + [VMSBF:VMSIF]: begin + // Mask to mask + delta_elm_d = VmsxfParallelism; + + in_ready_threshold_d = NrLanes*DataWidth/VmsxfParallelism-1; + in_m_ready_threshold_d = NrLanes*DataWidth/VmsxfParallelism-1; + out_valid_threshold_d = NrLanes*DataWidth/VmsxfParallelism-1; + end + [VIOTA:VID]: begin + // Mask to non-mask + delta_elm_d = ViotaParallelism; + + in_ready_threshold_d = NrLanes*DataWidth/ViotaParallelism-1; + in_m_ready_threshold_d = NrLanes*DataWidth/ViotaParallelism-1; + out_valid_threshold_d = ((NrLanes*DataWidth/8/ViotaParallelism) >> pe_req_i.vtype.vsew[1:0])-1; + end + VCPOP: begin + // Mask to scalar + delta_elm_d = VcpopParallelism; + + in_ready_threshold_d = NrLanes*DataWidth/VcpopParallelism-1; + in_m_ready_threshold_d = NrLanes*DataWidth/VcpopParallelism-1; + out_valid_threshold_d = '0; + end + default: begin // VFIRST + // Mask to scalar + delta_elm_d = VfirstParallelism; + + in_ready_threshold_d = NrLanes*DataWidth/VfirstParallelism-1; + in_m_ready_threshold_d = NrLanes*DataWidth/VfirstParallelism-1; + out_valid_threshold_d = '0; + end + endcase + // Reset the final grant vector // Be aware: this works only if the insn queue length is 1 - result_final_gnt_d = '0; end if (vinsn_queue_d.commit_cnt == '0) begin @@ -1155,33 +1285,47 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( vinsn_queue_d.issue_cnt += 1; vinsn_queue_d.commit_cnt += 1; end - end: p_masku + end always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - vinsn_running_q <= '0; - read_cnt_q <= '0; - issue_cnt_q <= '0; - commit_cnt_q <= '0; - vrf_pnt_q <= '0; - mask_pnt_q <= '0; - pe_resp_o <= '0; - result_final_gnt_q <= '0; - 
vcpop_slice_cnt_q <= '0;
- popcount_q <= '0;
- vfirst_count_q <= '0;
+ vinsn_running_q <= '0;
+ read_cnt_q <= '0;
+ issue_cnt_q <= '0;
+ processing_cnt_q <= '0;
+ commit_cnt_q <= '0;
+ vrf_pnt_q <= '0;
+ mask_pnt_q <= '0;
+ pe_resp_o <= '0;
+ result_final_gnt_q <= '0;
+ popcount_q <= '0;
+ vfirst_count_q <= '0;
+ delta_elm_q <= '0;
+ in_ready_threshold_q <= '0;
+ in_m_ready_threshold_q <= '0;
+ out_valid_threshold_q <= '0;
+ viota_acc_q <= '0;
+ found_one_q <= '0;
+ be_viota_seq_q <= '1; // Default: write
 end else begin
- vinsn_running_q <= vinsn_running_d;
- read_cnt_q <= read_cnt_d;
- issue_cnt_q <= issue_cnt_d;
- commit_cnt_q <= commit_cnt_d;
- vrf_pnt_q <= vrf_pnt_d;
- mask_pnt_q <= mask_pnt_d;
- pe_resp_o <= pe_resp;
- result_final_gnt_q <= result_final_gnt_d;
- vcpop_slice_cnt_q <= vcpop_slice_cnt_d;
- popcount_q <= popcount_d;
- vfirst_count_q <= vfirst_count_d;
+ vinsn_running_q <= vinsn_running_d;
+ read_cnt_q <= read_cnt_d;
+ issue_cnt_q <= issue_cnt_d;
+ processing_cnt_q <= processing_cnt_d;
+ commit_cnt_q <= commit_cnt_d;
+ vrf_pnt_q <= vrf_pnt_d;
+ mask_pnt_q <= mask_pnt_d;
+ pe_resp_o <= pe_resp;
+ result_final_gnt_q <= result_final_gnt_d;
+ popcount_q <= popcount_d;
+ vfirst_count_q <= vfirst_count_d;
+ delta_elm_q <= delta_elm_d;
+ in_ready_threshold_q <= in_ready_threshold_d;
+ in_m_ready_threshold_q <= in_m_ready_threshold_d;
+ out_valid_threshold_q <= out_valid_threshold_d;
+ viota_acc_q <= viota_acc_d;
+ found_one_q <= found_one_d;
+ be_viota_seq_q <= be_viota_seq_d;
 end
 end

diff --git a/hardware/src/masku/masku_operands.sv b/hardware/src/masku/masku_operands.sv
index 503f5b207..2788652c7 100644
--- a/hardware/src/masku/masku_operands.sv
+++ b/hardware/src/masku/masku_operands.sv
@@ -13,13 +13,17 @@
 //
 //
 // Incoming Operands:
-// masku_operands_i = {v0.m, vs2, alu_result, fpu_result}
+// masku_operands_i = {v0.m, vd, alu_result, fpu_result}
 //

 module masku_operands import ara_pkg::*; import rvv_pkg::*; #(
 parameter int unsigned NrLanes = 0,
 parameter type pe_req_t = logic,
- parameter type pe_resp_t = logic
+ parameter type pe_resp_t = logic,
+ // Vl bit mask disabled by default since we fetch vd from opqueues
+ // to provide tail undisturbed policy at bit granularity.
+ // Enable this if the datapath is changed and vd is no longer fetched. 
+ localparam int unsigned VlBitMaskEnable = 0 ) ( input logic clk_i, input logic rst_ni, @@ -41,20 +45,20 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #( output logic [NrLanes*ELEN-1:0] masku_operand_alu_seq_o, // ALU/FPU result (deshuffled, uncompressed) output logic [ NrLanes-1:0] masku_operand_alu_seq_valid_o, input logic [ NrLanes-1:0] masku_operand_alu_seq_ready_i, - output elen_t [ NrLanes-1:0] masku_operand_vs2_o, // vs2 (shuffled) - output logic [ NrLanes-1:0] masku_operand_vs2_valid_o, - input logic [ NrLanes-1:0] masku_operand_vs2_ready_i, - output logic [NrLanes*ELEN-1:0] masku_operand_vs2_seq_o, // vs2 (deshuffled) - output logic [ NrLanes-1:0] masku_operand_vs2_seq_valid_o, - input logic [ NrLanes-1:0] masku_operand_vs2_seq_ready_i, + output elen_t [ NrLanes-1:0] masku_operand_vd_o, // vd (shuffled) + output logic [ NrLanes-1:0] masku_operand_vd_valid_o, + input logic [ NrLanes-1:0] masku_operand_vd_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_vd_seq_o, // vd (deshuffled) + output logic [ NrLanes-1:0] masku_operand_vd_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_vd_seq_ready_i, output elen_t [ NrLanes-1:0] masku_operand_m_o, // Mask (shuffled) output logic [ NrLanes-1:0] masku_operand_m_valid_o, input logic [ NrLanes-1:0] masku_operand_m_ready_i, output logic [NrLanes*ELEN-1:0] masku_operand_m_seq_o, // Mask (deshuffled) output logic [ NrLanes-1:0] masku_operand_m_seq_valid_o, input logic [ NrLanes-1:0] masku_operand_m_seq_ready_i, - output logic [NrLanes*ELEN-1:0] bit_enable_mask_o, // Bit mask for mask unit instructions (shuffled like mask register) - output logic [NrLanes*ELEN-1:0] alu_result_compressed_o // ALU/FPU results compressed (from sew to 1-bit) (shuffled, in mask format) + output logic [NrLanes*ELEN-1:0] bit_enable_mask_o, // Bit mask for mask unit instructions (shuffled like mask register) + output logic [NrLanes*ELEN-1:0] alu_result_compressed_seq_o // ALU/FPU results compressed (from sew to 1-bit) (deshuffled, in mask format) ); // Imports @@ -62,75 +66,101 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #( // Local Parameter localparam int unsigned DATAPATH_WIDTH = NrLanes * ELEN; // Mask Unit datapath width - localparam int unsigned ELEN_BYTES = ELEN / 8; // Helper signals logic [DATAPATH_WIDTH-1:0] deshuffled_vl_bit_mask; // this bit enable signal is only dependent on vl logic [DATAPATH_WIDTH-1:0] shuffled_vl_bit_mask; // this bit enable signal is only dependent on vl vew_e bit_enable_shuffle_eew; - elen_t [NrLanes-1:0] masku_operand_vs2_d; - logic masku_operand_vs2_lane_valid; - logic masku_operand_vs2_lane_ready; - logic masku_operand_vs2_spill_valid; - logic masku_operand_vs2_spill_ready; + elen_t [NrLanes-1:0] masku_operand_vd_d; + logic [NrLanes-1:0] masku_operand_vd_lane_valid; + logic [NrLanes-1:0] masku_operand_vd_lane_ready; + logic [NrLanes-1:0] masku_operand_vd_spill_valid; + logic [NrLanes-1:0] masku_operand_vd_spill_ready; + elen_t [NrLanes-1:0] masku_operand_m_d; + logic [NrLanes-1:0] masku_operand_m_lane_valid; + logic [NrLanes-1:0] masku_operand_m_lane_ready; + logic [NrLanes-1:0] masku_operand_m_spill_valid; + logic [NrLanes-1:0] masku_operand_m_spill_ready; // Extract operands from input (input comes in "shuffled form" from the lanes) for (genvar lane = 0; lane < NrLanes; lane++) begin - assign masku_operand_m_o[lane] = masku_operands_i[lane][0]; - assign masku_operand_vs2_d[lane] = masku_operands_i[lane][1]; assign masku_operand_alu_o[lane] = masku_operands_i[lane][2 + masku_fu_i]; + 
assign masku_operand_vd_d[lane] = masku_operands_i[lane][1]; + assign masku_operand_m_d[lane] = masku_operands_i[lane][0]; end // ---------- - // Deshuffle vs2 + // Deshuffle input sources // ---------- always_comb begin masku_operand_m_seq_o = '0; - masku_operand_vs2_seq_o = '0; + masku_operand_vd_seq_o = '0; masku_operand_alu_seq_o = '0; - for (int b = 0; b < (NrLanes * ELEN_BYTES); b++) begin - automatic int deshuffle_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew); + for (int b = 0; b < (NrLanes * ELENB); b++) begin + automatic int deshuffle_alu_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.eew_vs2); + automatic int deshuffle_vd_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.eew_vd_op); automatic int deshuffle_m_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.eew_vmask); - automatic int lane_idx = b / ELEN_BYTES; // rounded down to nearest integer - automatic int lane_offset = b % ELEN_BYTES; - masku_operand_alu_seq_o[8*deshuffle_idx +: 8] = masku_operand_alu_o[lane_idx][8*lane_offset +: 8]; - masku_operand_vs2_seq_o[8*deshuffle_idx +: 8] = masku_operand_vs2_o[lane_idx][8*lane_offset +: 8]; + automatic int lane_idx = b / ELENB; // rounded down to nearest integer + automatic int lane_offset = b % ELENB; + masku_operand_alu_seq_o[8*deshuffle_alu_idx +: 8] = masku_operand_alu_o[lane_idx][8*lane_offset +: 8]; + masku_operand_vd_seq_o[8*deshuffle_vd_idx +: 8] = masku_operand_vd_o[lane_idx][8*lane_offset +: 8]; masku_operand_m_seq_o[8*deshuffle_m_idx +: 8] = masku_operand_m_o[lane_idx][8*lane_offset +: 8]; end end always_comb begin - masku_operand_vs2_spill_ready = 1'b1; + masku_operand_vd_spill_ready = 1'b0; + masku_operand_m_spill_ready = 1'b0; for (int lane = 0; lane < NrLanes; lane++) begin - masku_operand_vs2_spill_ready &= masku_operand_vs2_ready_i[lane] | masku_operand_vs2_seq_ready_i[lane]; + masku_operand_vd_spill_ready[lane] = masku_operand_vd_ready_i[lane] | masku_operand_vd_seq_ready_i[lane]; + masku_operand_m_spill_ready[lane] = masku_operand_m_ready_i[lane] | masku_operand_m_seq_ready_i[lane]; end end - spill_register #( - .T ( elen_t [NrLanes-1:0] ), - .Bypass ( 1'b0 ) - ) i_spill_register_vs2 ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .valid_i (masku_operand_vs2_lane_valid), - .ready_o (masku_operand_vs2_lane_ready), - .data_i (masku_operand_vs2_d), - .valid_o (masku_operand_vs2_spill_valid), - .ready_i (masku_operand_vs2_spill_ready), - .data_o (masku_operand_vs2_o) - ); + for (genvar lane = 0; lane < NrLanes; lane++) begin : gen_masku_operands_spill_regs + spill_register #( + .T ( elen_t ) + ) i_spill_register_vd ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (masku_operand_vd_lane_valid[lane]), + .ready_o (masku_operand_vd_lane_ready[lane]), + .data_i (masku_operand_vd_d[lane]), + .valid_o (masku_operand_vd_spill_valid[lane]), + .ready_i (masku_operand_vd_spill_ready[lane]), + .data_o (masku_operand_vd_o[lane]) + ); + + spill_register #( + .T ( elen_t ) + ) i_spill_register_m ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (masku_operand_m_lane_valid[lane]), + .ready_o (masku_operand_m_lane_ready[lane]), + .data_i (masku_operand_m_d[lane]), + .valid_o (masku_operand_m_spill_valid[lane]), + .ready_i (masku_operand_m_spill_ready[lane]), + .data_o (masku_operand_m_o[lane]) + ); + end for (genvar lane = 0; lane < NrLanes; lane++) begin - assign masku_operand_vs2_valid_o[lane] = masku_operand_vs2_spill_valid; - assign masku_operand_vs2_seq_valid_o[lane] = masku_operand_vs2_spill_valid; + assign masku_operand_vd_valid_o[lane] = 
masku_operand_vd_spill_valid[lane]; + assign masku_operand_vd_seq_valid_o[lane] = masku_operand_vd_spill_valid[lane]; + + assign masku_operand_m_valid_o[lane] = masku_operand_m_spill_valid[lane]; + assign masku_operand_m_seq_valid_o[lane] = masku_operand_m_spill_valid[lane]; end always_comb begin - masku_operand_vs2_lane_valid = 1'b1; + masku_operand_vd_lane_valid = 1'b0; + masku_operand_m_lane_valid = 1'b0; for (int lane = 0; lane < NrLanes; lane++) begin - masku_operand_vs2_lane_valid &= masku_operand_valid_i[lane][1]; + masku_operand_vd_lane_valid[lane] = masku_operand_valid_i[lane][1]; + masku_operand_m_lane_valid[lane] = masku_operand_valid_i[lane][0]; end end @@ -139,7 +169,7 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #( // ------------------------------------------------ // Generate shuffled bit level mask - assign bit_enable_shuffle_eew = vinsn_issue_i.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]} ? vinsn_issue_i.vtype.vsew : vinsn_issue_i.eew_vd_op; + assign bit_enable_shuffle_eew = vinsn_issue_i.op inside {[VMFEQ:VMSBC]} ? vinsn_issue_i.vtype.vsew : vinsn_issue_i.eew_vd_op; always_comb begin // Default assignments @@ -148,32 +178,39 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #( bit_enable_mask_o = '0; // Generate deshuffled vl bit mask - for (int unsigned i = 0; i < DATAPATH_WIDTH; i++) begin - if (i < vinsn_issue_i.vl) begin - deshuffled_vl_bit_mask[i] = 1'b1; + if (VlBitMaskEnable) begin + for (int unsigned i = 0; i < DATAPATH_WIDTH; i++) begin + if (i < vinsn_issue_i.vl) begin + deshuffled_vl_bit_mask[i] = 1'b1; + end end end - for (int unsigned b = 0; b < NrLanes * ELEN_BYTES; b++) begin + for (int unsigned b = 0; b < NrLanes * ELENB; b++) begin // local helper signals logic [idx_width(DATAPATH_WIDTH)-1:0] src_operand_byte_shuffle_index; logic [idx_width(DATAPATH_WIDTH)-1:0] mask_operand_byte_shuffle_index; logic [ idx_width(NrLanes)-1:0] mask_operand_byte_shuffle_lane_index; - logic [ idx_width(ELEN_BYTES)-1:0] mask_operand_byte_shuffle_lane_offset; + logic [ idx_width(ELENB)-1:0] mask_operand_byte_shuffle_lane_offset; // get shuffle idices // Note: two types of shuffle indices are needed because the source operand and the // mask register might not have the same effective element width (eew) src_operand_byte_shuffle_index = shuffle_index(b, NrLanes, bit_enable_shuffle_eew); mask_operand_byte_shuffle_index = shuffle_index(b, NrLanes, vinsn_issue_i.eew_vmask); - mask_operand_byte_shuffle_lane_index = mask_operand_byte_shuffle_index[idx_width(ELEN_BYTES) +: idx_width(NrLanes)]; - mask_operand_byte_shuffle_lane_offset = mask_operand_byte_shuffle_index[idx_width(ELEN_BYTES)-1:0]; + mask_operand_byte_shuffle_lane_index = mask_operand_byte_shuffle_index[idx_width(ELENB) +: idx_width(NrLanes)]; + mask_operand_byte_shuffle_lane_offset = mask_operand_byte_shuffle_index[idx_width(ELENB)-1:0]; // shuffle bit enable - shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8] = deshuffled_vl_bit_mask[8*b +: 8]; + if (VlBitMaskEnable) begin + shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8] = deshuffled_vl_bit_mask[8*b +: 8]; + // Generate bit-level mask + bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] = shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8]; + end else begin + shuffled_vl_bit_mask = '0; + bit_enable_mask_o = '0; + end - // Generate bit-level mask - bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] = shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8]; if (!vinsn_issue_i.vm && 
!(vinsn_issue_i.op inside {VMADC, VMSBC})) begin // Exception for VMADC and VMSBC: they use the mask register as a source operand, not as a mask
         bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] &= masku_operand_m_o[mask_operand_byte_shuffle_lane_index][8*mask_operand_byte_shuffle_lane_offset +: 8];
       end
@@ -184,30 +221,24 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #(
   // Compress ALU/FPU results into a mask vector
   // -------------------------------------------
   always_comb begin
-    alu_result_compressed_o = '0;
-    for (int b = 0; b < ELEN_BYTES * NrLanes; b++) begin
-      if ((b % (1 << vinsn_issue_i.vtype.vsew)) == '0) begin
-        automatic int src_byte = shuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew);
-        automatic int src_byte_lane = src_byte[idx_width(ELEN_BYTES) +: idx_width(NrLanes)];
-        automatic int src_byte_offset = src_byte[idx_width(ELEN_BYTES)-1:0];
-
-        automatic int dest_bit_seq = (b >> vinsn_issue_i.vtype.vsew) + vrf_pnt_i;
-        automatic int dest_byte_seq = dest_bit_seq / ELEN_BYTES;
-        automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, vinsn_issue_i.vtype.vsew);
-        alu_result_compressed_o[ELEN_BYTES * dest_byte + dest_bit_seq[idx_width(ELEN_BYTES)-1:0]] = masku_operand_alu_o[src_byte_lane][8 * src_byte_offset];
+    alu_result_compressed_seq_o = '1;
+    for (int b = 0; b < ELENB * NrLanes; b++) begin
+      if ((b % (1 << vinsn_issue_i.eew_vs2)) == '0) begin
+        automatic int src_byte = shuffle_index(b, NrLanes, vinsn_issue_i.eew_vs2);
+        automatic int src_byte_lane = src_byte[idx_width(ELENB) +: idx_width(NrLanes)];
+        automatic int src_byte_offset = src_byte[idx_width(ELENB)-1:0];
+
+        automatic int dest_bit_seq = (b >> vinsn_issue_i.eew_vs2) + vrf_pnt_i;
+        automatic int dest_byte_seq = dest_bit_seq / ELENB;
+        alu_result_compressed_seq_o[ELENB * dest_byte_seq + dest_bit_seq[idx_width(ELENB)-1:0]] = masku_operand_alu_o[src_byte_lane][8 * src_byte_offset];
       end
     end
   end
-  // Control
   for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_unpack_masku_operands
     // immediately acknowledge operands coming from functional units
     assign masku_operand_alu_valid_o[lane] = masku_operand_valid_i[lane][2 + masku_fu_i];
-
-    assign masku_operand_m_valid_o[lane] = masku_operand_valid_i[lane][0];
-
-    assign masku_operand_m_seq_valid_o[lane] = masku_operand_valid_i[lane][0];
   end: gen_unpack_masku_operands
 
@@ -220,10 +251,10 @@ module masku_operands import ara_pkg::*; import rvv_pkg::*; #(
       for (int operand_fu = 0; operand_fu < NrMaskFUnits; operand_fu++) begin
         masku_operand_ready_o[lane][2 + operand_fu] = (masku_fu_e'(operand_fu) == masku_fu_i) && masku_operand_alu_ready_i[lane];
       end
-      // Acknowledge vs2 operands
-      masku_operand_ready_o[lane][1] = masku_operand_vs2_lane_ready;
+      // Acknowledge vd operands
+      masku_operand_ready_o[lane][1] = masku_operand_vd_lane_ready[lane];
       // Acknowledge mask operand
-      masku_operand_ready_o[lane][0] = masku_operand_m_ready_i[lane];
+      masku_operand_ready_o[lane][0] = masku_operand_m_lane_ready[lane];
     end
   end

From 215edf798f31e58376f8dad41109de37441b8a64 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Fri, 15 Nov 2024 17:32:53 +0100
Subject: [PATCH 7/8] [hardware] :bug: Fix legality check in dispatcher

The LMUL-alignment checks on vd, vs2, and vs1 now fire only when the
instruction actually uses the corresponding register (ara_req.use_vd,
use_vs2, use_vs1). This supersedes the ad-hoc skip_vs1_lmul_checks flag
and keeps unused register fields from flagging legal instructions as
illegal. A standalone sketch of the resulting check is appended after
the changelog patch.

---
 hardware/src/ara_dispatcher.sv | 75 +++++++++++++++------------------
 1 file changed, 34 insertions(+), 41 deletions(-)

diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv
index 1915c47db..c19efa2a1 100644
--- a/hardware/src/ara_dispatcher.sv
+++ b/hardware/src/ara_dispatcher.sv
@@ -224,7 +224,6 @@ module 
ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
   logic load_zero_vl, store_zero_vl;
   // Do not check vregs validity against current LMUL
   logic skip_lmul_checks;
-  logic skip_vs1_lmul_checks;
   // Are we decoding?
   logic is_decoding;
   // Is this an in-lane operation?
@@ -333,7 +332,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
     store_zero_vl        = 1'b0;
 
     skip_lmul_checks     = 1'b0;
-    skip_vs1_lmul_checks = 1'b0;
 
     null_vslideup = 1'b0;
 
@@ -1522,7 +1520,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
           6'b010010: begin // VXUNARY0
             // These instructions do not use vs1
             ara_req.use_vs1       = 1'b0;
-            skip_vs1_lmul_checks  = 1'b1;
             // They are always encoded as ADDs with zero.
             ara_req.op            = ara_pkg::VADD;
             ara_req.use_scalar_op = 1'b1;
@@ -1750,21 +1747,21 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
           // destination register.
           if (!skip_lmul_checks) begin
             unique case (ara_req.emul)
-              LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1;
+              LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vd;
+              LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vd;
+              LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vd;
               default:;
             endcase
             unique case (lmul_vs2)
-              LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1;
+              LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs2;
+              LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs2;
+              LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs2;
               default:;
             endcase
             unique case (lmul_vs1)
-              LMUL_2: if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_4: if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_8: if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = 1'b1;
+              LMUL_2: if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs1;
+              LMUL_4: if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs1;
+              LMUL_8: if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs1;
               default:;
             endcase
           end
@@ -1992,15 +1989,15 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
           // destination register.
if (!skip_lmul_checks) begin unique case (ara_req.emul) - LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vd; + LMUL_4: if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vd; + LMUL_8: if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vd; default:; endcase unique case (lmul_vs2) - LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2: if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs2; + LMUL_4: if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs2; + LMUL_8: if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs2; default:; endcase end @@ -2146,7 +2143,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b010010: begin // VFUNARY0 // These instructions do not use vs1 ara_req.use_vs1 = 1'b0; - skip_vs1_lmul_checks = 1'b1; case (insn.varith_type.rs1) 5'b00000: ara_req.op = VFCVTXUF; @@ -2253,7 +2249,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b010011: begin // VFUNARY1 // These instructions do not use vs1 ara_req.use_vs1 = 1'b0; - skip_vs1_lmul_checks = 1'b1; unique case (insn.varith_type.rs1) 5'b00000: ara_req.op = ara_pkg::VFSQRT; @@ -2411,28 +2406,26 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. 
if (!skip_lmul_checks) begin unique case (ara_req.emul) - LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vd; + LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vd; + LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vd; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase unique case (lmul_vs2) - LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; + LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs2; + LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs2; + LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs2; + LMUL_RSVD: illegal_insn = 1'b1; + default:; + endcase + unique case (lmul_vs1) + LMUL_2 : if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs1; + LMUL_4 : if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs1; + LMUL_8 : if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs1; LMUL_RSVD: illegal_insn = 1'b1; default:; endcase - if (!skip_vs1_lmul_checks) begin - unique case (lmul_vs1) - LMUL_2 : if ((insn.varith_type.rs1 & 5'b00001) != 5'b00000) illegal_insn = 1'b1; - LMUL_4 : if ((insn.varith_type.rs1 & 5'b00011) != 5'b00000) illegal_insn = 1'b1; - LMUL_8 : if ((insn.varith_type.rs1 & 5'b00111) != 5'b00000) illegal_insn = 1'b1; - LMUL_RSVD: illegal_insn = 1'b1; - default:; - endcase - end end // Ara can support 16-bit float, 32-bit float, 64-bit float. @@ -2705,16 +2698,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // destination register. 
if (!skip_lmul_checks) begin
             unique case (ara_req.emul)
-              LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = 1'b1;
+              LMUL_2 : if ((insn.varith_type.rd & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vd;
+              LMUL_4 : if ((insn.varith_type.rd & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vd;
+              LMUL_8 : if ((insn.varith_type.rd & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vd;
               LMUL_RSVD: illegal_insn = 1'b1;
               default:;
             endcase
             unique case (lmul_vs2)
-              LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = 1'b1;
-              LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = 1'b1;
+              LMUL_2 : if ((insn.varith_type.rs2 & 5'b00001) != 5'b00000) illegal_insn = ara_req.use_vs2;
+              LMUL_4 : if ((insn.varith_type.rs2 & 5'b00011) != 5'b00000) illegal_insn = ara_req.use_vs2;
+              LMUL_8 : if ((insn.varith_type.rs2 & 5'b00111) != 5'b00000) illegal_insn = ara_req.use_vs2;
               LMUL_RSVD: illegal_insn = 1'b1;
               default:;
             endcase

From 634a7534047b9e1fcad5a54cba47eb6c7c84d911 Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Mon, 25 Nov 2024 14:57:46 +0100
Subject: [PATCH 8/8] [CHANGELOG] Update Changelog

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 919277f9c..36a4bb1fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Bump upload and delete artifact actions
 - Fix synthesis-unfriendly constructs
 - Fix vector slicing bug in operand requesters
+- Fix legality check for allowed registers in dispatcher
+- Remove a couple of latches
 
 ### Added
 
@@ -39,6 +41,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Add multi-precision conv3d
 - Add support for unit-stride, non-unit-stride, indexed segment memory instructions
 - Add support for fault-only-first loads
+- Extend the MASKU-related tests in riscv-tests
 
 ### Changed
 
@@ -69,6 +72,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Refactor MASKU
   - Remove bit-support for tail elements
   - Adapt mask tests to this behavior
+- Refactor the MASKU
+  - The MASKU now always receives balanced payloads from the lanes
+  - Remove FPU support from the operand queues that do not need it
 
 ## 3.0.0 - 2023-09-08
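
---

For reference, the dispatcher change in PATCH 7 boils down to one pattern: an
LMUL-alignment violation on vd, vs2, or vs1 may only mark the instruction
illegal if that register is actually used. Below is a standalone SystemVerilog
sketch of that check. It is illustrative only: the lmul_e encoding and the
helper function are assumptions made for this sketch (the real rvv_pkg also
has fractional LMUL encodings), while the 5-bit register index, the alignment
masks, and the use_* gating mirror the diff above.

    // Hypothetical, simplified LMUL encoding; not the rvv_pkg one.
    typedef enum logic [2:0] {LMUL_1, LMUL_2, LMUL_4, LMUL_8, LMUL_RSVD} lmul_e;

    // A register group must start at an index aligned to its LMUL.
    // The check may raise the illegal flag only if the instruction
    // actually reads or writes this register (use_reg): this is the
    // gating PATCH 7 introduces in place of the unconditional
    // illegal_insn = 1'b1 and the ad-hoc skip_vs1_lmul_checks flag.
    function automatic logic lmul_alignment_illegal(
      logic [4:0] vreg,    // vd, vs1, or vs2 index
      lmul_e      lmul,    // effective LMUL of this operand
      logic       use_reg  // does the instruction use this register?
    );
      unique case (lmul)
        LMUL_2:    return ((vreg & 5'b00001) != 5'b00000) && use_reg; // even index
        LMUL_4:    return ((vreg & 5'b00011) != 5'b00000) && use_reg; // multiple of 4
        LMUL_8:    return ((vreg & 5'b00111) != 5'b00000) && use_reg; // multiple of 8
        LMUL_RSVD: return 1'b1;  // reserved encoding: always illegal
        default:   return 1'b0;  // LMUL_1: any index is legal
      endcase
    endfunction

The patch itself keeps the three explicit case blocks; the helper above only
condenses them, e.g. a call like
lmul_alignment_illegal(insn.varith_type.rs1, lmul_vs1, ara_req.use_vs1)
would stand in for the vs1 block.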