diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv
index 1569793b4..364c5ea7e 100644
--- a/hardware/src/masku/masku.sv
+++ b/hardware/src/masku/masku.sv
@@ -175,14 +175,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   if (ViotaParallelism > NrLanes || ViotaParallelism % 2 != 0) begin
     $fatal(1, "Parameter ViotaParallelism cannot be higher than NrLanes and should be a power of 2.");
   end
-  // VIOTA/VID counters
-  logic [idx_width(NrLanes*ELEN/ViotaParallelism)-1:0] viota_in_cnt_q;
-  logic [idx_width(NrLanes*ELEN/8/ViotaParallelism)-1:0] viota_out_cnt_ew8_q;
-  logic [idx_width(NrLanes*ELEN/16/ViotaParallelism)-1:0] viota_out_cnt_ew16_q;
-  logic [idx_width(NrLanes*ELEN/32/ViotaParallelism)-1:0] viota_out_cnt_ew32_q;
-  logic [idx_width(NrLanes*ELEN/64/ViotaParallelism)-1:0] viota_out_cnt_ew64_q;
   // VLENMAX can be 64Ki elements at most - 16 bit per adder are enough
   logic [15:0] viota_res [ViotaParallelism];
+  logic [NrLanes*ELEN-1:0] viota_input_vector; // VIOTA/VID input bit vector (a bit vector, not an index, hence NrLanes*ELEN bits)
 
   // Local Parameter W_CPOP and W_VFIRST
   //
@@ -510,13 +505,14 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   );
 
   always_comb begin: p_mask_alu
-    alu_result          = '0;
-    not_found_one_d     = pe_req_ready_o ? 1'b1 : not_found_one_q;
-    alu_result_vm       = '0;
-    alu_result_vm_m     = '0;
-    alu_result_vm_shuf  = '0;
+    alu_result              = '0;
+    not_found_one_d         = pe_req_ready_o ? 1'b1 : not_found_one_q;
+    alu_result_vm           = '0;
+    alu_result_vm_m         = '0;
+    alu_result_vm_shuf      = '0;
     masku_operand_alu_seq_m = '0;
-    vcpop_operand       = '0;
+    vcpop_operand           = '0;
+    viota_input_vector      = '0; // default value: only the VIOTA/VID branch drives it, so avoid an inferred latch
 
     if (vinsn_issue_valid) begin
       // Evaluate the instruction
@@ -525,8 +520,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 		// This operation is never masked
         [VMANDNOT:VMXNOR]: alu_result_vm_m = masku_operand_alu_seq;
         // Comparisons: mask out the masked out bits of this pre-computed slice
+        [VMFEQ:VMSGT]: alu_result = alu_result_compressed & masku_operand_m; // result bits whose mask bit is 0 are cleared
 		// Add/sub-with-carry/borrow: the masks are all 1 since these operations are NOT masked
-        [VMFEQ:VMSBC]: alu_result = alu_result_compressed & bit_enable_mask;
+        [VMADC:VMSBC]: alu_result = alu_result_compressed; // forwarded as-is, no AND with a mask
         // VMSBF, VMSOF, VMSIF: compute a slice of the output and mask out the masked out bits
         [VMSBF:VMSIF] : begin
           masku_operand_alu_seq_m = masku_operand_alu_seq & masku_operand_m_seq;
@@ -563,8 +559,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         // VIOTA, VID: compute a slice of the output and mask out the masked elements
 		// VID re-uses the VIOTA datapath
         VIOTA, VID: begin
+          viota_input_vector = vinsn_issue.op == VID ? '1 : masku_operand_alu_seq; // VID feeds all ones instead of the ALU operand (presumably so every element is counted by the adders below)
+
           // Mask the input vector
-          masku_operand_alu_seq_m = masku_operand_alu_seq & masku_operand_m_seq;
+          masku_operand_alu_seq_m = viota_input_vector & masku_operand_m_seq; // bits whose sequential mask bit is 0 are cleared
 
           // Compute output results on `ViotaParallelism 16-bit adders
           viota_res[0] = viota_acc_q;
@@ -578,7 +576,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
-          // This datapath should be relativeley simple:
+          // This datapath should be relatively simple:
           // `ViotaParallelism bytes connected, in line, to output byte chunks
           // Multiple limited-width counters should help the synthesizer reduce wiring
-          logic [NrLanes*ELEN/ViotaParallelism-1:0] viota_out_cnt_q;
 		  unique case (vinsn_issue.vtype.vsew)
             EW8: for (int i = 0; i < ViotaParallelism; i++)
               alu_result_vm_m[out_valid_cnt_q[NrLanes*ELEN/8/ViotaParallelism-1:0]  * ViotaParallelism * 8  + i*8  +: 8]  = viota_res[i][7:0];
@@ -592,7 +589,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         end
         // VCPOP, VFIRST: mask the current slice and feed the popc or lzc unit
         [VCPOP:VFIRST] : begin
-          vcpop_operand = (!vinsn_issue.vm) ? masku_operand_vd_seq & bit_enable_mask : masku_operand_vd_seq;
+          vcpop_operand = (!vinsn_issue.vm) ? masku_operand_vd_seq & masku_operand_m_seq : masku_operand_vd_seq; // when vm is 0, AND with the sequential mask operand; otherwise pass vd through
         end
         default:;
       endcase
@@ -730,7 +727,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
       if (vinsn_issue_valid && !(vd_scalar(vinsn_issue.op))) begin
         // Is there place in the mask queue to write the mask operands?
         // Did we receive the mask bits on the MaskM channel?
-        if (!vinsn_issue.vm && &masku_operand_m_valid && !(vinsn_issue.op inside {[VMSBF:VMSIF]})) begin
+        if (!vinsn_issue.vm && &masku_operand_m_valid && !(vinsn_issue.op inside {[VMFEQ:VMSIF]})) begin // value range needs [lo:hi]; ops in VMFEQ..VMSIF skip this mask-pointer accounting
           // Account for the used operands
           mask_pnt_d += NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew));
 
@@ -923,7 +920,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         result_queue_write_pnt_d = '0;
       end
 
-      vrf_pnt_d = vrf_pnt_q + (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew));
+      vrf_pnt_d = vrf_pnt_q + (NrLanes << (int'(EW64) - vinsn_issue.eew_vs2)); // step is NrLanes * 2^(EW64 - eew_vs2), scaled by the vs2 element width
     end
 
     // The scalar result has been sent to and acknowledged by the dispatcher
@@ -1070,19 +1067,19 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         unique case (pe_req_i.op)
-          {[VMFEQ:VMXNOR]}: begin
+          {[VMFEQ:VMSGT]}: begin // comparisons only: VMADC/VMSBC and VMANDNOT..VMXNOR have their own items below and must not overlap in a unique case
             // Mask to mask - encoded
-            delta_elm_d = ;
+            delta_elm_d = NrLanes << (EW64 - pe_req_i.eew_vs2[1:0]);
 
-            in_ready_threshold_d   = ;
-            in_m_ready_threshold_d = ;
-            out_valid_threshold_d  = ;
+            in_ready_threshold_d   = 1;
+            in_m_ready_threshold_d = ELEN >> (EW64 - pe_req_i.eew_vs2[1:0]);
+            out_valid_threshold_d  = ELEN >> (EW64 - pe_req_i.eew_vs2[1:0]);
           end
           {[VMADC:VMSBC]}: begin
             // Mask to mask - encoded
-            delta_elm_d = ;
+            delta_elm_d = NrLanes << (EW64 - pe_req_i.eew_vs2[1:0]);
 
-            in_ready_threshold_d   = ;
-            in_m_ready_threshold_d = ;
-            out_valid_threshold_d  = ;
+            in_ready_threshold_d   = 1;
+            in_m_ready_threshold_d = ELEN >> (EW64 - pe_req_i.eew_vs2[1:0]);
+            out_valid_threshold_d  = ELEN >> (EW64 - pe_req_i.eew_vs2[1:0]);
           end
           {[VMANDNOT:VMXNOR]}: begin
             // Mask to mask
@@ -1106,7 +1103,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 
             in_ready_threshold_d   = NrLanes*ELEN/ViotaParallelism;
             in_m_ready_threshold_d = NrLanes*ELEN/ViotaParallelism;
-            out_valid_threshold_d  = (NrLanes*ELEN/ViotaParallelism) >> pe_req_i.vtype.vsew;
+            out_valid_threshold_d  = (NrLanes*ELEN/8/ViotaParallelism) >> pe_req_i.vtype.vsew[1:0]; // output chunks per pass: NrLanes*ELEN / (SEW * ViotaParallelism), SEW = 8 << vsew
           end
           default: begin // {[VCPOP:VFIRST]}
             // Mask to scalar