From 261dadedd8f16ffc50158fed77abe8ef285111f3 Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Sun, 10 Nov 2024 21:55:00 +0100
Subject: [PATCH] WIP: update ara_pkg and lane_sequencer for mask operations

---
 hardware/include/ara_pkg.sv            |   8 +-
 hardware/src/lane/lane_sequencer.sv    | 217 ++++++++++++++-----------
 hardware/src/lane/operand_requester.sv |  18 +-
 hardware/src/lane/simd_alu.sv          |   7 +-
 hardware/src/lane/valu.sv              |  25 +--
 5 files changed, 154 insertions(+), 121 deletions(-)

diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv
index 13b5b0424..dcc37d7f7 100644
--- a/hardware/include/ara_pkg.sv
+++ b/hardware/include/ara_pkg.sv
@@ -134,9 +134,15 @@ package ara_pkg;
     // Floating-point comparison instructions
     VMFEQ, VMFLE, VMFLT, VMFNE, VMFGT, VMFGE,
     // Integer comparison instructions
-    VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSBF, VMSOF, VMSIF, VIOTA, VID, VCPOP, VFIRST, VMSGT,
+    VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSGT,
     // Integer add-with-carry and subtract-with-borrow carry-out instructions
     VMADC, VMSBC,
+    // Mask to mask
+    VMSBF, VMSOF, VMSIF,
+    // Mask to non-mask
+    VIOTA, VID,
+    // Mask to scalar
+    VCPOP, VFIRST,
     // Mask operations
     VMANDNOT, VMAND, VMOR, VMXOR, VMORNOT, VMNAND, VMNOR, VMXNOR,
     // Scalar moves from VRF
diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv
index 3ddcfa6eb..0db9660eb 100644
--- a/hardware/src/lane/lane_sequencer.sv
+++ b/hardware/src/lane/lane_sequencer.sv
@@ -264,7 +264,9 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
       // Vector length calculation
       vfu_operation_d.vl = pe_req.vl / NrLanes;
       // If lane_id_i < vl % NrLanes, this lane has to execute one extra micro-operation.
-      if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0]) vfu_operation_d.vl += 1;
+      // Also, if the ALU/VMFPU pre-process data for the MASKU, balance the payload: ceil(vl / NrLanes)
+      if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0] || ((|pe_req.vl[idx_width(NrLanes)-1:0]) && (pe_req.op inside {[VMFEQ:VMXNOR]})))
+        vfu_operation_d.vl += 1;
 
       // Calculate the start element for Lane[i]. This will be forwarded to both opqueues
       // and operand requesters, with some light modification in the case of a vslide.
@@ -277,9 +279,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
       vinsn_running_d[pe_req.id] = (vfu_operation_d.vfu != VFU_None) ? 1'b1 : 1'b0;
 
       // Mute request if the instruction runs in the lane and the vl is zero.
-      // Exception 1: insn on mask vectors, as MASKU has to receive something from all lanes
-      // and the partial results come from VALU and VMFPU.
-      // Exception 2: during a reduction, all the lanes must cooperate anyway.
+      // Exception: during a reduction, all the lanes must cooperate anyway.
       if (vfu_operation_d.vl == '0 && (vfu_operation_d.vfu inside {VFU_Alu, VFU_MFpu}) && !(vfu_operation_d.op inside {[VREDSUM:VWREDSUM], [VFREDUSUM:VFWREDOSUM]})) begin
         vfu_operation_valid_d = 1'b0;
         // We are already done with this instruction
@@ -337,17 +337,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
-            eew    : pe_req.vtype.vsew,
+            eew    : EW64,
             vtype  : pe_req.vtype,
-            // Since this request goes outside of the lane, we might need to request an
-            // extra operand regardless of whether it is valid in this lane or not.
-            vl     : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew),
+            vl     : pe_req.vl / NrLanes / ELEN,
             vstart : vfu_operation_d.vstart,
             hazard : pe_req.hazard_vm | pe_req.hazard_vd,
             default: '0
           };
-          if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) *
-              NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1;
+          // Since this request goes outside of the lane, we might need to request an
+          // extra operand regardless of whether it is valid in this lane or not.
+          if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl)
+            operand_request[MaskM].vl += 1;
           operand_request_push[MaskM] = !pe_req.vm;
         end
         VFU_MFpu: begin
@@ -420,17 +420,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
-            eew    : pe_req.vtype.vsew,
+            eew    : EW64,
             vtype  : pe_req.vtype,
-            // Since this request goes outside of the lane, we might need to request an
-            // extra operand regardless of whether it is valid in this lane or not.
-            vl     : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew),
+            vl     : pe_req.vl / NrLanes / ELEN,
             vstart : vfu_operation_d.vstart,
             hazard : pe_req.hazard_vm | pe_req.hazard_vd,
             default: '0
           };
-          if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) *
-              NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1;
+          // Since this request goes outside of the lane, we might need to request an
+          // extra operand regardless of whether it is valid in this lane or not.
+          if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl)
+            operand_request[MaskM].vl += 1;
           operand_request_push[MaskM] = !pe_req.vm;
         end
         VFU_LoadUnit : begin
@@ -438,17 +438,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
-            eew    : pe_req.vtype.vsew,
+            eew    : EW64,
             vtype  : pe_req.vtype,
-            // Since this request goes outside of the lane, we might need to request an
-            // extra operand regardless of whether it is valid in this lane or not.
-            vl     : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew),
+            vl     : pe_req.vl / NrLanes / ELEN,
             vstart : vfu_operation_d.vstart,
             hazard : pe_req.hazard_vm | pe_req.hazard_vd,
             default: '0
           };
-          if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) *
-              NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1;
+          // Since this request goes outside of the lane, we might need to request an
+          // extra operand regardless of whether it is valid in this lane or not.
+          if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl)
+            operand_request[MaskM].vl += 1;
           operand_request_push[MaskM] = !pe_req.vm;
 
           // Load indexed
@@ -490,26 +490,25 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           // extra operand regardless of whether it is valid in this lane or not.
           // This is done to balance the data received by the store unit, which expects
           // L*64-bits packets only.
-          if (lane_id_i > pe_req.end_lane) begin
+          if (lane_id_i > pe_req.end_lane)
             operand_request[StA].vl += 1;
-          end
           operand_request_push[StA] = pe_req.use_vs1;
 
           // This vector instruction uses masks
           operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
-            eew    : pe_req.vtype.vsew,
+            eew    : EW64,
             vtype  : pe_req.vtype,
-            // Since this request goes outside of the lane, we might need to request an
-            // extra operand regardless of whether it is valid in this lane or not.
-            vl     : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew),
+            vl     : pe_req.vl / NrLanes / ELEN,
             vstart : vfu_operation_d.vstart,
             hazard : pe_req.hazard_vm | pe_req.hazard_vd,
             default: '0
           };
-          if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) *
-              NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1;
+          // Since this request goes outside of the lane, we might need to request an
+          // extra operand regardless of whether it is valid in this lane or not.
+          if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl)
+            operand_request[MaskM].vl += 1;
           operand_request_push[MaskM] = !pe_req.vm;
 
           // Store indexed
@@ -529,9 +528,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           };
           // Since this request goes outside of the lane, we might need to request an
           // extra operand regardless of whether it is valid in this lane or not.
-          if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) begin
+          if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl)
             operand_request[SlideAddrGenA].vl += 1;
-          end
           operand_request_push[SlideAddrGenA] = pe_req_i.op == VSXE;
         end
 
@@ -601,7 +599,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           operand_request[MaskM] = '{
             id      : pe_req.id,
             vs      : VMASK,
-            eew     : pe_req.vtype.vsew,
+            eew     : EW64,
             is_slide: 1'b1,
             vtype   : pe_req.vtype,
             vstart  : vfu_operation_d.vstart,
@@ -614,61 +612,61 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
             VSLIDEUP: begin
               // We need to trim full words from the end of the vector that are not used
               // as operands by the slide unit.
+              operand_request[MaskM].vl = (pe_req.vl - pe_req.stride) / NrLanes / ELEN;
+
               // Since this request goes outside of the lane, we might need to request an
               // extra operand regardless of whether it is valid in this lane or not.
-              operand_request[MaskM].vl =
-              ((pe_req.vl - pe_req.stride + NrLanes - 1) / 8 / NrLanes)
-              >> unsigned'(pe_req.vtype.vsew);
-
-              if (((operand_request[MaskM].vl + pe_req.stride) <<
-                    unsigned'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl))
+              if ((operand_request[MaskM].vl) * NrLanes * ELEN != (pe_req.vl - pe_req.stride))
+                operand_request[MaskM].vl += 1;
 
               // SLIDEUP only uses mask bits whose indices are > stride
               // Don't send the previous (unused) ones to the MASKU
               if (pe_req.stride >= NrLanes * 64)
-                operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8;
+                operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * ELEN) << NrLanes * ELEN) / 8;
             end
             VSLIDEDOWN: begin
               // Since this request goes outside of the lane, we might need to request an
               // extra operand regardless of whether it is valid in this lane or not.
-              operand_request[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> unsigned'(
-                    pe_req.vtype.vsew));
-              if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) *
-                  NrLanes * 8 != pe_req.vl)
+              operand_request[MaskM].vl = pe_req.vl / NrLanes / ELEN;
+              if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl)
                 operand_request[MaskM].vl += 1;
             end
           endcase
         end
         VFU_MaskUnit: begin
+          // todo: balance mask comparison source requested
+          // todo:
+
+          // Mask logical and integer comparisons
           operand_request[AluA] = '{
             id      : pe_req.id,
             vs      : pe_req.vs1,
-            eew     : pe_req.eew_vs1,
             scale_vl: pe_req.scale_vl,
             vtype   : pe_req.vtype,
             vstart  : vfu_operation_d.vstart,
             hazard  : pe_req.hazard_vs1 | pe_req.hazard_vd,
             default : '0
           };
+          // Since this request goes outside of the lane, we might need to request an
+          // extra operand regardless of whether it is valid in this lane or not.
 
-          // This is an operation that runs normally on the ALU, and then gets *condensed* and
-          // reshuffled at the Mask Unit.
+          // Integer comparisons run on the ALU and then get reshuffled and masked in the MASKU
           if (pe_req.op inside {[VMSEQ:VMSBC]}) begin
-            operand_request[AluA].vl = vfu_operation_d.vl;
-          end
-          // This is an operation that runs normally on the ALU, and then gets reshuffled at the
-          // Mask Unit.
-          else begin
-            // Since this request goes outside of the lane, we might need to request an
-            // extra operand regardless of whether it is valid in this lane or not.
-            operand_request[AluA].vl = (pe_req.vl / NrLanes) >>
-            (unsigned'(EW64) - unsigned'(pe_req.eew_vs1));
-            if ((operand_request[AluA].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs1))) * NrLanes !=
-                pe_req.vl) operand_request[AluA].vl += 1;
+            // These source regs contain non-mask vectors.
+            operand_request[AluA].eew = pe_req.eew_vs1;
+            operand_request[AluA].vl  = pe_req.vl / NrLanes;
+            if ((operand_request[AluA].vl * NrLanes) != pe_req.vl)
+              operand_request[AluA].vl += 1;
+          end else begin // Mask logical operations
+            // These source regs contain mask vectors.
+            operand_request[AluA].eew = EW64;
+            operand_request[AluA].vl  = pe_req.vl / NrLanes / ELEN;
+            if (operand_request[AluA].vl * NrLanes * ELEN != pe_req.vl)
+              operand_request[AluA].vl += 1;
           end
-          operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF});
+          operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]});
 
+          // Mask logical, integer comparisons, VIOTA, VID, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST
           operand_request[AluB] = '{
             id      : pe_req.id,
             vs      : pe_req.vs2,
@@ -679,88 +677,117 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
             hazard  : pe_req.hazard_vs2 | pe_req.hazard_vd,
             default : '0
           };
-          // This is an operation that runs normally on the ALU, and then gets *condensed* and
-          // reshuffled at the Mask Unit.
+          // Since this request goes outside of the lane, we might need to request an
+          // extra operand regardless of whether it is valid in this lane or not.
+
+          // Integer comparisons run on the ALU and then get reshuffled and masked in the MASKU
           if (pe_req.op inside {[VMSEQ:VMSBC]}) begin
-            operand_request[AluB].vl = vfu_operation_d.vl;
+            // These source regs contain non-mask vectors.
+            operand_request[AluB].eew = pe_req.eew_vs2;
+            operand_request[AluB].vl  = pe_req.vl / NrLanes;
+            if ((operand_request[AluB].vl * NrLanes) != pe_req.vl)
+              operand_request[AluB].vl += 1;
+          end else begin // Mask logical, VIOTA, VID, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST
+            // These source regs contain mask vectors.
+            operand_request[AluB].eew = EW64;
+            operand_request[AluB].vl  = pe_req.vl / NrLanes / ELEN;
+            if (operand_request[AluB].vl * NrLanes * ELEN != pe_req.vl)
+              operand_request[AluB].vl += 1;
           end
-          // This is an operation that runs normally on the ALU, and then gets reshuffled at the
-          // Mask Unit.
-          else begin
-            // Since this request goes outside of the lane, we might need to request an
-            // extra operand regardless of whether it is valid in this lane or not.
-            operand_request[AluB].vl = (pe_req.vl / NrLanes) >>
-            (unsigned'(EW64) - unsigned'(pe_req.eew_vs2));
-            if ((operand_request[AluB].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs2))) * NrLanes !=
-                pe_req.vl) operand_request[AluB].vl += 1;
-          end
-          operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF, VFIRST});
+          operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]});
 
+          // Mask fp comparisons
           operand_request[MulFPUA] = '{
             id      : pe_req.id,
             vs      : pe_req.vs1,
             eew     : pe_req.eew_vs1,
             scale_vl: pe_req.scale_vl,
+            vl      : pe_req.vl / NrLanes,
             vtype   : pe_req.vtype,
             vstart  : vfu_operation_d.vstart,
             hazard  : pe_req.hazard_vs1 | pe_req.hazard_vd,
             default : '0
           };
-
-          // This is an operation that runs normally on the ALU, and then gets *condensed* and
+          // This is an operation that runs normally on the VMFPU, and then gets *condensed* and
           // reshuffled at the Mask Unit.
-          operand_request[MulFPUA].vl = vfu_operation_d.vl;
-          operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF});
+          // Request a balanced load from every lane despite it being active or not.
+          // Since this request goes outside of the lane, we might need to request an
+          // extra operand regardless of whether it is valid in this lane or not.
+          if ((operand_request[MulFPUA].vl * NrLanes) != pe_req.vl)
+            operand_request[MulFPUA].vl += 1;
+          operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]};
 
+          // Mask fp comparisons
           operand_request[MulFPUB] = '{
             id      : pe_req.id,
             vs      : pe_req.vs2,
             eew     : pe_req.eew_vs2,
             scale_vl: pe_req.scale_vl,
+            vl      : pe_req.vl / NrLanes,
             vtype   : pe_req.vtype,
             vstart  : vfu_operation_d.vstart,
             hazard  : pe_req.hazard_vs2 | pe_req.hazard_vd,
             default : '0
           };
-          // This is an operation that runs normally on the ALU, and then gets *condensed* and
+          // This is an operation that runs normally on the VMFPU, and then gets *condensed* and
           // reshuffled at the Mask Unit.
-          operand_request[MulFPUB].vl = vfu_operation_d.vl;
-          operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF, VFIRST});
+          // Request a balanced load from every lane despite it being active or not.
+          // Since this request goes outside of the lane, we might need to request an
+          // extra operand regardless of whether it is valid in this lane or not.
+          if ((operand_request[MulFPUB].vl * NrLanes) != pe_req.vl)
+            operand_request[MulFPUB].vl += 1;
+          operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]};
 
+          // Vd register to provide correct mask undisturbed policy at bit-level
+          // This can be either a mask or a normal (non-mask) register
           operand_request[MaskB] = '{
             id      : pe_req.id,
-            vs      : pe_req.vs2,
-            eew     : pe_req.eew_vs2,
+            vs      : pe_req.vd,
             scale_vl: pe_req.scale_vl,
             vtype   : pe_req.vtype,
-            // Since this request goes outside of the lane, we might need to request an
-            // extra operand regardless of whether it is valid in this lane or not.
-            vl      : (pe_req.vl / NrLanes / ELEN) << (unsigned'(EW64) - unsigned'(pe_req.vtype.vsew)),
             vstart  : vfu_operation_d.vstart,
-            hazard  : (pe_req.op inside {VMSBF, VMSOF, VMSIF}) ? pe_req.hazard_vs2 : pe_req.hazard_vs2 | pe_req.hazard_vd,
+            hazard  : pe_req.hazard_vd,
             default : '0
           };
-          operand_request[MaskB].vl = pe_req.vl / (NrLanes * (8 << pe_req.vtype.vsew));
-          if ((pe_req.vl % (NrLanes*ELEN)) != 0) begin
-            operand_request[MaskB].vl += 1'b1;
+          // vl and eew of the vd request depend on the layout (mask or non-mask) of vd
+          if (pe_req.op inside {VIOTA,VID}) begin
+            // Non-mask layout
+            operand_request[MaskB].eew = pe_req.vtype.vsew;
+            operand_request[MaskB].vl  = pe_req.vl / NrLanes;
+            // Request a balanced load from every lane, regardless of whether it is active.
+            // Since this request goes outside of the lane, we might need to request an
+            // extra operand regardless of whether it is valid in this lane or not.
+            if ((operand_request[MaskB].vl * NrLanes) != pe_req.vl)
+              operand_request[MaskB].vl += 1;
+          end else begin // Mask logical, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST
+            // Mask layout
+            operand_request[MaskB].eew = EW64;
+            operand_request[MaskB].vl  = (pe_req.vl / NrLanes / ELEN);
+            // Request a balanced load from every lane, regardless of whether it is active.
+            // Since this request goes outside of the lane, we might need to request an
+            // extra operand regardless of whether it is valid in this lane or not.
+            if ((operand_request[MaskB].vl * NrLanes * ELEN) != pe_req.vl)
+              operand_request[MaskB].vl += 1;
+          end
-          operand_request_push[MaskB] = pe_req.use_vs2 && pe_req.op inside {VCPOP, VFIRST, VMSIF, VMSOF, VMSBF};
+          operand_request_push[MaskB] = pe_req.use_vd_op;
 
+          // All masked operations
+          // This is always a mask register
           operand_request[MaskM] = '{
             id     : pe_req.id,
             vs     : VMASK,
-            eew    : pe_req.vtype.vsew,
+            eew    : EW64,
             vtype  : pe_req.vtype,
-            // Since this request goes outside of the lane, we might need to request an
-            // extra operand regardless of whether it is valid in this lane or not.
             vl     : (pe_req.vl / NrLanes / ELEN),
             vstart : vfu_operation_d.vstart,
             hazard : pe_req.hazard_vm,
             default: '0
           };
-          if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin
+          // Request a balanced load from every lane despite it being active or not.
+          // Since this request goes outside of the lane, we might need to request an
+          // extra operand regardless of whether it is valid in this lane or not.
+          if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl)
             operand_request[MaskM].vl += 1;
-          end
           operand_request_push[MaskM] = !pe_req.vm;
         end
         VFU_None: begin
diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv
index 1baec0780..de2cc4f82 100644
--- a/hardware/src/lane/operand_requester.sv
+++ b/hardware/src/lane/operand_requester.sv
@@ -291,7 +291,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
       automatic elen_t vl_byte;
       automatic elen_t vstart_byte;
       automatic elen_t vector_body_len_byte;
-      automatic elen_t vector_body_len_packets;
+      automatic elen_t vector_body_len_elements;
 
       // Bank we are currently requesting
       automatic int bank = requester_metadata_q.addr[idx_width(NrBanks)-1:0];
@@ -324,13 +324,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
                   ? 0
                   : operand_request_i[requester_index].vstart << operand_request_i[requester_index].vtype.vsew;
       vector_body_len_byte = vl_byte - vstart_byte + (vstart_byte % 8);
-      vector_body_len_packets = vector_body_len_byte >> operand_request_i[requester_index].eew;
-      if (vector_body_len_packets << operand_request_i[requester_index].eew < vector_body_len_byte)
-        vector_body_len_packets += 1;
+      vector_body_len_elements = vector_body_len_byte >> operand_request_i[requester_index].eew;
+      if (vector_body_len_elements << operand_request_i[requester_index].eew < vector_body_len_byte)
+        vector_body_len_elements += 1;
 
       // Final computed length
       effective_vector_body_length = (operand_request_i[requester_index].scale_vl)
-                                   ? vector_body_len_packets
+                                   ? vector_body_len_elements
                                    : vector_body_length;
 
       // Address of the vstart element of the vector in the VRF
@@ -401,7 +401,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
           end : waw_counters_update
 
           if (operand_queue_ready_i[requester_index]) begin
-            automatic vlen_t num_bytes;
+            automatic vlen_t num_elements;
 
             // Operand request
             lane_operand_req_transposed[requester_index][bank] = !stall;
@@ -417,12 +417,12 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
               requester_metadata_d.addr = requester_metadata_q.addr + 1'b1;
 
               // We read less than 64 bits worth of elements
-              num_bytes = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) );
-              if (requester_metadata_q.len < num_bytes) begin
+              num_elements = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) );
+              if (requester_metadata_q.len < num_elements) begin
                 requester_metadata_d.len    = 0;
               end
               else begin
-                requester_metadata_d.len = requester_metadata_q.len - num_bytes;
+                requester_metadata_d.len = requester_metadata_q.len - num_elements;
               end
             end : op_req_grant
 
diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv
index 242c0d2bc..572bc35af 100644
--- a/hardware/src/lane/simd_alu.sv
+++ b/hardware/src/lane/simd_alu.sv
@@ -132,11 +132,8 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
         VMXOR   : res = operand_a_i ^ operand_b_i;
         VMXNOR  : res = ~(operand_a_i ^ operand_b_i);
 
-        // vmsbf, vmsof, vmsif and viota operand generation
-        VMSBF, VMSOF, VMSIF, VIOTA : res = opb;
-
-	      // Vector count population and find first set bit instructions
-        VCPOP, VFIRST : res = operand_b_i;
+        // Mask operands pass-through
+        VCPOP, VFIRST, VMSBF, VMSOF, VMSIF, VIOTA: res = operand_b_i;
 
         // Arithmetic instructions
         VSADDU: if (FixPtSupport == FixedPointEnable) unique case (vew_i)
diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv
index d3ce82bee..c4c9cdced 100644
--- a/hardware/src/lane/valu.sv
+++ b/hardware/src/lane/valu.sv
@@ -175,22 +175,25 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
   //  Mask operands  //
   /////////////////////
 
+  logic mask_operand_valid;
   logic mask_operand_ready;
   logic mask_operand_gnt;
 
-  assign mask_operand_gnt = mask_operand_ready && result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q];
+  assign mask_operand_valid = result_queue_q[result_queue_read_pnt_q].mask
+                            & result_queue_valid_q[result_queue_read_pnt_q];
+  assign mask_operand_gnt = mask_operand_valid & mask_operand_ready;
 
   spill_register #(
     .T(elen_t)
   ) i_mask_operand_register (
-    .clk_i     (clk_i                                                                                        ),
-    .rst_ni    (rst_ni                                                                                       ),
-    .data_o    (mask_operand_o                                                                               ),
-    .valid_o   (mask_operand_valid_o                                                                         ),
-    .ready_i   (mask_operand_ready_i                                                                         ),
-    .data_i    (result_queue_q[result_queue_read_pnt_q].wdata                                                ),
-    .valid_i   (result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q]),
-    .ready_o   (mask_operand_ready                                                                           )
+    .clk_i     (clk_i                                         ),
+    .rst_ni    (rst_ni                                        ),
+    .data_o    (mask_operand_o                                ),
+    .valid_o   (mask_operand_valid_o                          ),
+    .ready_i   (mask_operand_ready_i                          ),
+    .data_i    (result_queue_q[result_queue_read_pnt_q].wdata ),
+    .valid_i   (mask_operand_valid                            ),
+    .ready_o   (mask_operand_ready                            )
   );
 
   //////////////////////
@@ -739,7 +742,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
     if (|result_queue_valid_q)
       vxsat_flag_o = |(alu_vxsat_q & result_queue_q[result_queue_read_pnt_q].be);
 
-    // Received a grant from the VRF.
+    // Received a grant from the VRF or MASKU.
     // Deactivate the request.
     if (alu_result_gnt_i || mask_operand_gnt) begin
       result_queue_valid_d[result_queue_read_pnt_q] = 1'b0;
@@ -802,7 +805,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
     //////////////////////////////
 
     if (!vinsn_queue_full && vfu_operation_valid_i &&
-      (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMXNOR]})) begin
+      (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMSBC],[VMANDNOT:VMXNOR]})) begin
       vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i;
       // Do not wait for masks if, during a reduction, this lane is just a pass-through
       // The only valid instructions here with vl == '0 are reductions