From c54eba65ace92878eaac9baaf6d7e0afd1fcd995 Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Thu, 14 Nov 2024 19:39:20 +0100
Subject: [PATCH] [hardware] Debugging VIOTA (two commits ago, it worked until
 masked test)

---
 hardware/src/ara_dispatcher.sv      | 28 ++++++---
 hardware/src/lane/lane_sequencer.sv | 10 +--
 hardware/src/lane/valu.sv           | 95 ++++++++++-------------------
 hardware/src/masku/masku.sv         | 92 ++++++++++------------------
 4 files changed, 92 insertions(+), 133 deletions(-)

diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv
index fb5559bd8..818207b45 100644
--- a/hardware/src/ara_dispatcher.sv
+++ b/hardware/src/ara_dispatcher.sv
@@ -67,7 +67,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
 
   `FF(csr_vstart_q, csr_vstart_d, '0)
   `FF(csr_vl_q, csr_vl_d, '0)
-  `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, default: '0})
+  `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, vsew: EW8, vlmul: LMUL_1, default: '0})
   `FF(csr_vxsat_q, csr_vxsat_d, '0)
   `FF(csr_vxrm_q, csr_vxrm_d, '0)
   // Converts between the internal representation of `vtype_t` and the full XLEN-bit CSR.
@@ -505,7 +505,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     (csr_vtype_d.vlmul == LMUL_RSVD) ||                    // reserved value
                     // LMUL >= SEW/ELEN
                     (signed'($clog2(ELENB)) + signed'(csr_vtype_d.vlmul) < signed'(csr_vtype_d.vsew))) begin
-                  csr_vtype_d = '{vill: 1'b1, default: '0};
+                  csr_vtype_d = '{vill: 1'b1, vsew: EW8, vlmul: LMUL_1, default: '0};
                   csr_vl_d    = '0;
                 end
 
@@ -1279,12 +1279,26 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.use_vs1    = 1'b0;
                     ara_req_d.use_vd_op  = 1'b1;
                     ara_req_d.eew_vs2    = eew_q[ara_req_d.vs2]; // Force reshuffle
-                    ara_req_d.eew_vd_op  = eew_q[ara_req_d.vd]; // Force reshuffle
-                    ara_req_d.vtype.vsew = eew_q[ara_req_d.vd];
+                    ara_req_d.eew_vd_op  = eew_q[ara_req_d.vd];
                     case (insn.varith_type.rs1)
-                      5'b00001: ara_req_d.op = ara_pkg::VMSBF;
-                      5'b00010: ara_req_d.op = ara_pkg::VMSOF;
-                      5'b00011: ara_req_d.op = ara_pkg::VMSIF;
+                      5'b00001: begin
+                        ara_req_d.op = ara_pkg::VMSBF;
+                        // Mask-to-mask operation: vsew carries no meaning here.
+                        // Reuse the EEW recorded for vd as vsew, to avoid reshuffling.
+                        ara_req_d.vtype.vsew = eew_q[ara_req_d.vd];
+                      end
+                      5'b00010: begin
+                        ara_req_d.op = ara_pkg::VMSOF;
+                        // Mask-to-mask operation: vsew carries no meaning here.
+                        // Reuse the EEW recorded for vd as vsew, to avoid reshuffling.
+                        ara_req_d.vtype.vsew = eew_q[ara_req_d.vd];
+                      end
+                      5'b00011: begin
+                        ara_req_d.op = ara_pkg::VMSIF;
+                        // Mask-to-mask operation: vsew carries no meaning here.
+                        // Reuse the EEW recorded for vd as vsew, to avoid reshuffling.
+                        ara_req_d.vtype.vsew = eew_q[ara_req_d.vd];
+                      end
                       5'b10000: ara_req_d.op = ara_pkg::VIOTA;
                       5'b10001: ara_req_d.op = ara_pkg::VID;
                     endcase
diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv
index 0db9660eb..c2c8c1d14 100644
--- a/hardware/src/lane/lane_sequencer.sv
+++ b/hardware/src/lane/lane_sequencer.sv
@@ -265,7 +265,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
       vfu_operation_d.vl = pe_req.vl / NrLanes;
       // If lane_id_i < vl % NrLanes, this lane has to execute one extra micro-operation.
       // Also, if the ALU/VMFPU should pre-process data for the MASKU, force a balanced payload
-      if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0] || pe_req.op inside {[VMFEQ:VMXNOR]})
+      if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0] || (|pe_req.vl[idx_width(NrLanes)-1:0] && pe_req.op inside {[VMFEQ:VMXNOR]}))
         vfu_operation_d.vl += 1;
 
       // Calculate the start element for Lane[i]. This will be forwarded to both opqueues
@@ -757,8 +757,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
             // Request a balanced load from every lane despite it being active or not.
             // Since this request goes outside of the lane, we might need to request an
             // extra operand regardless of whether it is valid in this lane or not.
-            if ((operand_request[MaskM].vl * NrLanes) != pe_req.vl)
-              operand_request[MaskM].vl += 1;
+            if ((operand_request[MaskB].vl * NrLanes) != pe_req.vl)
+              operand_request[MaskB].vl += 1;
           end else begin // Mask logical, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST
             // Mask layout
             operand_request[MaskB].eew = EW64;
@@ -766,8 +766,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
             // Request a balanced load from every lane despite it being active or not.
             // Since this request goes outside of the lane, we might need to request an
             // extra operand regardless of whether it is valid in this lane or not.
-            if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl)
-              operand_request[MaskM].vl += 1;
+            if ((operand_request[MaskB].vl * NrLanes * ELEN) != pe_req.vl)
+              operand_request[MaskB].vl += 1;
           end
           operand_request_push[MaskB] = pe_req.use_vd_op;
 
diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv
index e9179b0e7..a61242f67 100644
--- a/hardware/src/lane/valu.sv
+++ b/hardware/src/lane/valu.sv
@@ -398,6 +398,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
   // Remaining elements of the current instruction in the commit phase
   vlen_t commit_cnt_d, commit_cnt_q;
 
+  // Number of elements handled per cycle, tracked separately for the issue and commit phases
+  logic [3:0] element_cnt_buf_issue, element_cnt_buf_commit; // vsew-derived intermediate values
+  logic [6:0] element_cnt_issue;  // Issue-side count; set to ELEN for ops in [VMSBF:VMXNOR]
+  logic [6:0] element_cnt_commit; // Commit-side count; set to ELEN for ops in [VMSBF:VMXNOR]
+
   always_comb begin: p_valu
     // Maintain state
     vinsn_queue_d = vinsn_queue_q;
@@ -439,6 +444,13 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
     // Don't prevent commit by default
     prevent_commit = 1'b0;
 
+    // Elements processed per cycle: ELEN for ops in [VMSBF:VMXNOR], else 2^(EW64 - vsew)
+    element_cnt_buf_issue = 1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew));
+    element_cnt_issue = vinsn_issue_q.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_issue};
+
+    element_cnt_buf_commit = 1 << (unsigned'(EW64) - unsigned'(vinsn_commit.vtype.vsew));
+    element_cnt_commit = vinsn_commit.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_commit};
+
     ////////////////////////////////////////
     //  Write data into the result queue  //
     ////////////////////////////////////////
@@ -453,7 +465,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                 (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1) &&
                 (mask_valid_i || vinsn_issue_q.vm)) begin
               // How many elements are we committing with this word?
-              automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)));
+              automatic logic [6:0] element_cnt = element_cnt_issue;
 
               if (element_cnt > issue_cnt_q)
                 element_cnt = issue_cnt_q;
@@ -527,16 +539,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                   vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1;
 
                 // Assign vector length for next instruction in the instruction queue
-                if (vinsn_queue_d.issue_cnt != 0) begin
-                  if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
-                    issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
-                  else begin
-                    $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
-                    issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
-                      vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
-                    issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
-                  end
-                end
+                if (vinsn_queue_d.issue_cnt != 0)
+                  issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
               end
             end
           end
@@ -553,7 +557,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                 (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1 || !first_op_q) &&
                 (mask_valid_i || vinsn_issue_q.vm)) begin
               // How many elements are we committing with this word?
-              automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)));
+              automatic logic [6:0] element_cnt = element_cnt_issue;
+
               if (element_cnt > issue_cnt_q)
                 element_cnt = issue_cnt_q;
 
@@ -659,16 +664,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
             vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1;
 
           // Assign vector length for next instruction in the instruction queue
-          if (vinsn_queue_d.issue_cnt != 0) begin
-            if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
-              issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
-            else begin
-              $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
-              issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
-                vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
-              issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
-            end
-          end
+          if (vinsn_queue_d.issue_cnt != 0)
+            issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
 
           // Give the done to the main sequencer
           commit_cnt_d = '0;
@@ -696,16 +693,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                 vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1;
 
               // Assign vector length for next instruction in the instruction queue
-              if (vinsn_queue_d.issue_cnt != 0) begin
-                if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
-                  issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
-                else begin
-                  $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
-                  issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >>
-                    vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew;
-                  issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0];
-                end
-              end
+              if (vinsn_queue_d.issue_cnt != 0)
+                issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl;
 
               // Commit and give the done to the main sequencer
               commit_cnt_d = '0;
@@ -757,9 +746,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
 
       // Decrement the counter of remaining vector elements waiting to be written
       // Don't do it in case of a reduction
-      if (!is_reduction(vinsn_commit.op))
-        commit_cnt_d = commit_cnt_q - (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew));
-      if (commit_cnt_q < (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0;
+      if (!is_reduction(vinsn_commit.op)) begin
+        // Saturating decrement: clamp the commit counter to zero instead of underflowing
+        commit_cnt_d = (commit_cnt_q < element_cnt_commit) ? '0
+                                                           : commit_cnt_q - element_cnt_commit;
+      end
     end
 
     // Finished committing the results of a vector instruction
@@ -773,18 +764,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
       else vinsn_queue_d.commit_pnt += 1;
 
       // Update the commit counter for the next instruction
-      if (vinsn_queue_d.commit_cnt != '0) begin
-        if (!(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op inside {[VMANDNOT:VMXNOR]}))
-          commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl;
-        else begin
-          // We are asking for bits, and we want at least one chunk of bits if
-          // vl > 0. Therefore, commit_cnt = ceil((vl / 8) >> sew)
-          $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}");
-          commit_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl / 8) >>
-            vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew;
-          commit_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl[2:0];
-        end
-      end
+      if (vinsn_queue_d.commit_cnt != '0)
+        commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl;
 
       // Initialize counters and alu state if needed by the next instruction
       // After a reduction, the next instructions starts after the reduction commits
@@ -809,7 +790,10 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
       vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i;
       // Do not wait for masks if, during a reduction, this lane is just a pass-through
       // The only valid instructions here with vl == '0 are reductions
-      vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.vm | (vfu_operation_i.vl == '0);
+      // Instructions that execute in the mask unit will process the mask there directly
+      vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMSEQ:VMXNOR]}
+                                                       ? 1'b1
+                                                       : vfu_operation_i.vm | (vfu_operation_i.vl == '0);
 
       // Initialize counters and alu state if the instruction queue was empty
       // and the lane is not reducing
@@ -825,22 +809,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
         sldu_transactions_cnt_d = $clog2(NrLanes) + 1;
 
         issue_cnt_d = vfu_operation_i.vl;
-        if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]}))
-          issue_cnt_d = vfu_operation_i.vl;
-        else begin
-          issue_cnt_d = (vfu_operation_i.vl / 8) >>
-            vfu_operation_i.vtype.vsew;
-          issue_cnt_d += |vfu_operation_i.vl[2:0];
-        end
       end
       if (vinsn_queue_d.commit_cnt == '0)
-        if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]}))
-          commit_cnt_d = vfu_operation_i.vl;
-        else begin
-          // Operations between mask vectors operate on bits
-          commit_cnt_d  = (vfu_operation_i.vl / 8) >> vfu_operation_i.vtype.vsew;
-          commit_cnt_d += |vfu_operation_i.vl[2:0];
-        end
+        commit_cnt_d = vfu_operation_i.vl;
 
       // Bump pointers and counters of the vector instruction queue
       vinsn_queue_d.accept_pnt += 1;
diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv
index 367e33a9d..d7bc525c6 100644
--- a/hardware/src/masku/masku.sv
+++ b/hardware/src/masku/masku.sv
@@ -180,7 +180,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   logic [NrLanes-1:0] additional_elm; // There can be an additional element for some lanes
   // BE signals for VIOTA
   logic [NrLanes*DataWidth/8-1:0] be_viota_seq_d, be_viota_seq_q, be_viota_shuf;
-  logic masku_alu_be_clr;
 
   // Local Parameter VcpopParallelism and VfirstParallelism
   //
@@ -520,8 +519,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 
   elen_t [NrLanes-1:0] alu_result;
 
-  logic masku_alu_en, masku_alu_clr;
-
   // assign operand slices to be processed by popcount and lzc
   assign vcpop_slice  = vcpop_operand[(in_ready_cnt_q[idx_width(N_SLICES_CPOP)-1:0] * VcpopParallelism) +: VcpopParallelism];
   assign vfirst_slice = vcpop_operand[(in_ready_cnt_q[idx_width(N_SLICES_VFIRST)-1:0] * VfirstParallelism) +: VfirstParallelism];
@@ -544,7 +541,24 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     .empty_o (vfirst_empty )
   );
 
-  always_comb begin: p_mask_alu
+  // Vector instructions currently running
+  logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q;
+
+  // Interface with the main sequencer
+  pe_resp_t pe_resp;
+
+  // Effective MASKU stride in case of VSLIDEUP
+  // MASKU receives chunks of 64 * NrLanes mask bits from the lanes
+  // VSLIDEUP only needs the bits whose index >= than its stride
+  // So, the operand requester does not send vl mask bits to MASKU
+  // and trims all the unused 64 * NrLanes mask bits chunks
+  // Therefore, the stride needs to be trimmed, too
+  elen_t trimmed_stride;
+
+  // Information about which is the target FU of the request
+  assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? MaskFUMFpu : MaskFUAlu;
+
+  always_comb begin
     // Tail-agnostic bus
     alu_result          = '0;
     alu_result_vm       = '0;
@@ -557,28 +571,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 
     vcpop_operand = '0;
 
-    // MASKU ALU control
-    masku_alu_en     = 1'b0;
-    masku_alu_clr    = 1'b0;
-    masku_alu_be_clr = 1'b0;
-
     // The result mask should be created here since the output is a non-mask vector
     be_viota_seq_d = be_viota_seq_q;
 
-    // Is there an instruction ready to be issued?
-    if (vinsn_issue_valid && vinsn_issue.op inside {[VMFEQ:VMXNOR]})
-      // Compute one slice if we can write and the necessary inputs are valid
-      if (!result_queue_full && (&masku_operand_alu_valid || vinsn_issue.op == VID)
-                             && (&masku_operand_vd_valid  || !vinsn_issue.use_vd_op)
-                             && (&masku_operand_m_valid   || vinsn_issue.vm))
-        masku_alu_en = 1'b1;
-
-    // Have we finished insn execution?
-    if (vinsn_issue_valid && issue_cnt_d == '0) masku_alu_clr = 1'b1;
-
-    // Have we written the result queue?
-    if (vinsn_issue_valid && out_vrf_word_valid) masku_alu_be_clr = 1'b1;
-
     // Create a bit-masked ALU sequential vector
     masku_operand_alu_seq_m = masku_operand_alu_seq
                             & (masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}});
@@ -608,7 +603,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
           for (int i = 1; i < VmsxfParallelism; i++) begin
             vmsxf_buffer[i] = ~((masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism + i]) | vmsxf_buffer[i-1]);
           end
-          found_one_d = masku_alu_en ? |(masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism]) | found_one_q : found_one_q;
 
           alu_result_vmsif_vm[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism] = vmsxf_buffer;
           alu_result_vmsbf_vm[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism] = {~found_one_d, vmsxf_buffer[VmsxfParallelism-1:1]};
@@ -623,9 +617,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 
           // Mask the result
           alu_result_vm_m = (!vinsn_issue.vm) ? alu_result_vm & masku_operand_m_seq : alu_result_vm;
-
-          // Clean-up state upon instruction end
-          if (masku_alu_clr) found_one_d = '0;
         end
         // VIOTA, VID: compute a slice of the output and mask out the masked elements
 		// VID re-uses the VIOTA datapath
@@ -643,9 +634,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
             viota_res[i+1] = viota_res[i] + masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i];
           end
 
-          // Save last result in the accumulator for next slice upon processing
-          viota_acc_d = masku_alu_en ? viota_res[ViotaParallelism-1] + masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + ViotaParallelism - 1] : viota_acc_q;
-
           // This datapath should be relativeley simple:
           // `ViotaParallelism bytes connected, in line, to output byte chunks
           // Multiple limited-width counters should help the synthesizer reduce wiring
@@ -683,14 +671,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
                 {8{vinsn_issue.vm}} | {8{masku_operand_m_seq[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism + i]}};
             end
           endcase
-
-          // Clean-up state upon instruction end
-          if (masku_alu_clr) begin
-            viota_acc_d    = '0;
-            be_viota_seq_d = '0;
-          end
-
-          if (masku_alu_be_clr) be_viota_seq_d = '0;
         end
         // VCPOP, VFIRST: mask the current slice and feed the popc or lzc unit
         [VCPOP:VFIRST] : begin
@@ -716,30 +696,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     if (vinsn_issue.op inside {[VMSBF:VID]})
       alu_result = alu_result_vm_shuf;
 
-  end: p_mask_alu
-
   /////////////////
   //  Mask unit  //
   /////////////////
 
-  // Vector instructions currently running
-  logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q;
-
-  // Interface with the main sequencer
-  pe_resp_t pe_resp;
-
-  // Effective MASKU stride in case of VSLIDEUP
-  // MASKU receives chunks of 64 * NrLanes mask bits from the lanes
-  // VSLIDEUP only needs the bits whose index >= than its stride
-  // So, the operand requester does not send vl mask bits to MASKU
-  // and trims all the unused 64 * NrLanes mask bits chunks
-  // Therefore, the stride needs to be trimmed, too
-  elen_t trimmed_stride;
-
-  // Information about which is the target FU of the request
-  assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? MaskFUMFpu : MaskFUAlu;
-
-  always_comb begin: p_masku
     // Maintain state
     vinsn_queue_d    = vinsn_queue_q;
     read_cnt_d       = read_cnt_q;
@@ -994,6 +954,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         popcount_d = popcount_q + popcount;
         vfirst_count_d = vfirst_count_q + vfirst_count;
 
+        // Bump MASKU ALU state
+        found_one_d = |(masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism]) | found_one_q;
+        viota_acc_d = viota_res[ViotaParallelism-1] + masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + ViotaParallelism - 1];
+
         // Increment the input, input-mask, and output slice counters
         in_ready_cnt_en   = 1'b1;
         in_m_ready_cnt_en = 1'b1;
@@ -1040,6 +1004,13 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
           // Assert valid scalar output
           out_scalar_valid = vd_scalar(vinsn_issue.op);
         end
+
+        // Have we finished insn execution? Clear MASKU ALU state
+        if (issue_cnt_d == '0) begin
+          viota_acc_d    = '0;
+          be_viota_seq_d = '0;
+          found_one_d    = '0;
+        end
       end
     end
 
@@ -1059,6 +1030,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         result_queue_write_pnt_d = '0;
       end
 
+      // Clear MASKU ALU state
+      be_viota_seq_d = '0;
+
       // Account for the written results
       // VIOTA and VID do not write bits!
       processing_cnt_d = vinsn_issue.op inside {[VIOTA:VID]} ? processing_cnt_q - ((NrLanes * DataWidth / 8) >> vinsn_issue.vtype.vsew) : processing_cnt_q - NrLanes * DataWidth;
@@ -1296,7 +1270,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
       vinsn_queue_d.issue_cnt += 1;
       vinsn_queue_d.commit_cnt += 1;
     end
-  end: p_masku
+  end
 
   always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin