From ef73126c04ea1d695abd4f0cbc06dfae44c7f5d5 Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Mon, 25 Nov 2024 19:05:47 +0100
Subject: [PATCH] [hardware] WIP: vrgather / vcompress debug

---
 hardware/include/ara_pkg.sv               |  4 ++
 hardware/src/lane/lane.sv                 |  6 +++
 hardware/src/lane/lane_sequencer.sv       | 29 ++++++++++++--
 hardware/src/lane/operand_queue.sv        | 14 ++++++-
 hardware/src/lane/operand_queues_stage.sv | 33 +++++++++++-----
 hardware/src/lane/operand_requester.sv    |  2 +
 hardware/src/lane/valu.sv                 |  4 +-
 hardware/src/masku/masku.sv               | 46 +++++++++--------------
 8 files changed, 95 insertions(+), 43 deletions(-)

diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv
index 10b444d8c..a8526da88 100644
--- a/hardware/include/ara_pkg.sv
+++ b/hardware/include/ara_pkg.sv
@@ -967,6 +967,10 @@ package ara_pkg;
   // VRGATHER / VCOMPRESS //
   //////////////////////////
 
+  // Extra cmd/data entries buffered in the MaskB opqueue for VRGATHER/VCOMPRESS
+  // Must be a power of 2 (checked at elaboration in operand_queues_stage)
+  localparam int unsigned VrgatherOpQueueBufDepth = 2;
+
   // Indices are 16-bit at most because of RISC-V V VLEN limitation at 64Kibit
   typedef logic [$clog2(rvv_pkg::RISCV_MAX_VLEN)-1:0] max_vlen_t;
 
diff --git a/hardware/src/lane/lane.sv b/hardware/src/lane/lane.sv
index e453e2c61..2af0f9125 100644
--- a/hardware/src/lane/lane.sv
+++ b/hardware/src/lane/lane.sv
@@ -217,6 +217,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #(
   logic                 [NrVInsn-1:0]         alu_vinsn_done;
   logic                                       mfpu_ready;
   logic                 [NrVInsn-1:0]         mfpu_vinsn_done;
+  // Interface with the MaskB operand queue (VRGATHER/VCOMPRESS)
+  logic                                       mask_b_cmd_pop;
 
   // Additional signals to please Verilator's hierarchical verilation
   pe_req_t  pe_req;
@@ -246,6 +248,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #(
     .operand_request_ready_i(operand_request_ready),
     .alu_vinsn_done_o       (alu_vinsn_done_o     ),
     .mfpu_vinsn_done_o      (mfpu_vinsn_done_o    ),
+    // Interface with the Operand Queue
+    .mask_b_cmd_pop_i       (mask_b_cmd_pop       ),
     // Interface with the VFUs
     .vfu_operation_o        (vfu_operation        ),
     .vfu_operation_valid_o  (vfu_operation_valid  ),
@@ -437,6 +441,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #(
     .operand_queue_ready_o            (operand_queue_ready                ),
     .operand_queue_cmd_i              (operand_queue_cmd                  ),
     .operand_queue_cmd_valid_i        (operand_queue_cmd_valid            ),
+    // Interface with the Lane Sequencer
+    .mask_b_cmd_pop_o         (mask_b_cmd_pop          ),
     // Interface with the VFUs
     // ALU
     .alu_operand_o                    (alu_operand                        ),
diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv
index d7824a219..dab9ac1e2 100644
--- a/hardware/src/lane/lane_sequencer.sv
+++ b/hardware/src/lane/lane_sequencer.sv
@@ -31,6 +31,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
     input  logic                 [NrOperandQueues-1:0]    operand_request_ready_i,
     output logic                                          alu_vinsn_done_o,
     output logic                                          mfpu_vinsn_done_o,
+    // Interface with the Operand Queue (MaskB - for VRGATHER)
+    input  logic                                          mask_b_cmd_pop_i,
     // Interface with the lane's VFUs
     output vfu_operation_t                                vfu_operation_o,
     output logic                                          vfu_operation_valid_o,
@@ -167,6 +169,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
   vrgat_req_t masku_vrgat_req_q;
   logic masku_vrgat_req_ready_d, masku_vrgat_req_valid_q;
 
+  logic [idx_width(VrgatherOpQueueBufDepth)-1:0] vrgat_cmd_req_cnt_d, vrgat_cmd_req_cnt_q;
+
   spill_register #(
     .T       ( vrgat_req_t )
   ) i_spill_register_vrgat_req (
@@ -183,6 +187,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
   always_comb begin
     masku_vrgat_req_ready_d = 1'b0;
 
+    vrgat_state_d = vrgat_state_q;
+
+    vrgat_cmd_req_cnt_d = vrgat_cmd_req_cnt_q;
+
     // If MASKU request arrives, wait until the MaskB requester is free
     // Also, lock the MaskB opqueue
     unique case (vrgat_state_q)
@@ -192,8 +200,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
         end
       end
       REQUESTING: begin
-        // Pop if the operand requester is ready to accept a request
-        masku_vrgat_req_ready_d = masku_vrgat_req_valid_q && !(operand_request_valid_o[MaskB]);
+        // Pop if the operand requester and the queue are ready to accept requests
+        masku_vrgat_req_ready_d = masku_vrgat_req_valid_q & !(operand_request_valid_o[MaskB])
+                                & (vrgat_cmd_req_cnt_q != (VrgatherOpQueueBufDepth-1));
+
+        // Count outstanding cmds: +1 on handshake, -1 on MaskB opqueue cmd pop.
+        // A handshake and a pop in the same cycle cancel out (counter holds).
+        // NOTE(review): pops outside REQUESTING are not counted here - confirm.
+        if (masku_vrgat_req_ready_d != mask_b_cmd_pop_i)
+          vrgat_cmd_req_cnt_d = masku_vrgat_req_ready_d ? vrgat_cmd_req_cnt_q + 1
+                                                        : vrgat_cmd_req_cnt_q - 1;
+
         // If the MASKU is over with VRGATHER/VCOMPRESS, return to idle
         if (masku_vrgat_req_ready_d && masku_vrgat_req_q.is_last_req) begin
           vrgat_state_d = IDLE;
@@ -874,7 +891,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
         hazard     : '0,
         default    : '0
       };
-      operand_request_push[MaskB] = masku_vrgat_req_valid_q && !(operand_request_valid_o[MaskB]);
+      operand_request_push[MaskB] = masku_vrgat_req_ready_d;
     end
   end: sequencer
 
@@ -888,6 +905,9 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
 
       alu_vinsn_done_o  <= 1'b0;
       mfpu_vinsn_done_o <= 1'b0;
+
+      vrgat_state_q       <= IDLE;
+      vrgat_cmd_req_cnt_q <= '0;
     end else begin
       vinsn_done_q    <= vinsn_done_d;
       vinsn_running_q <= vinsn_running_d;
@@ -897,6 +917,9 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
 
       alu_vinsn_done_o  <= alu_vinsn_done_d;
       mfpu_vinsn_done_o <= mfpu_vinsn_done_d;
+
+      vrgat_state_q       <= vrgat_state_d;
+      vrgat_cmd_req_cnt_q <= vrgat_cmd_req_cnt_d;
     end
   end
 
diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv
index 0ccdcc6ae..e3b2a6b70 100644
--- a/hardware/src/lane/operand_queue.sv
+++ b/hardware/src/lane/operand_queue.sv
@@ -14,6 +14,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
     parameter  int           unsigned NrSlaves            = 1,
     parameter  int           unsigned NrLanes             = 0,
     parameter  int           unsigned VLEN                = 0,
+    parameter  bit                    IsVrgatherOpqueue   = 0,
     // Support for floating-point data types
     parameter  fpu_support_e          FPUSupport          = FPUSupportHalfSingleDouble,
     // Supported conversions
@@ -37,6 +38,8 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
     // Interface with the Operand Requester
     input  operand_queue_cmd_t                operand_queue_cmd_i,
     input  logic                              operand_queue_cmd_valid_i,
+    // Interface with the Lane Sequencer
+    output logic                              mask_b_cmd_pop_o,
     // Interface with the Vector Register File
     input  elen_t                             operand_i,
     input  logic                              operand_valid_i,
@@ -54,7 +57,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
   //////////////////////
 
   operand_queue_cmd_t cmd;
-  logic               cmd_pop;
+  logic               cmd_pop, cmd_pop_q;
 
   fifo_v3 #(
     .DEPTH(CmdBufDepth        ),
@@ -73,6 +76,13 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
     .usage_o   (/* Unused */             )
   );
 
+  // If this is the MaskB opqueue, propagate the
+  // pop information for the cmd buffer
+  if (IsVrgatherOpqueue)
+    assign mask_b_cmd_pop_o = cmd_pop_q;
+  else
+    assign mask_b_cmd_pop_o = 1'b0;
+
   //////////////
   //  Buffer  //
   //////////////
@@ -123,8 +133,10 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
   always_ff @(posedge clk_i or negedge rst_ni) begin: p_ibuf_usage_ff
     if (!rst_ni) begin
       ibuf_usage_q <= '0;
+      cmd_pop_q    <= 1'b0;
     end else begin
       ibuf_usage_q <= ibuf_usage_d;
+      cmd_pop_q    <= cmd_pop;
     end
   end
 
diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv
index d03e8a6a4..4fb49dbb2 100644
--- a/hardware/src/lane/operand_queues_stage.sv
+++ b/hardware/src/lane/operand_queues_stage.sv
@@ -24,6 +24,8 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     output logic               [NrOperandQueues-1:0] operand_queue_ready_o,
     input  operand_queue_cmd_t [NrOperandQueues-1:0] operand_queue_cmd_i,
     input  logic               [NrOperandQueues-1:0] operand_queue_cmd_valid_i,
+    // Interface with the Lane Sequencer
+    output logic                                     mask_b_cmd_pop_o,
     // Interface with the VFUs
     // ALU
     output elen_t              [1:0]                 alu_operand_o,
@@ -73,6 +75,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     .lane_id_i                (lane_id_i                      ),
     .operand_queue_cmd_i      (operand_queue_cmd_i[AluA]      ),
     .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[AluA]),
+    .mask_b_cmd_pop_o         (/* Unused */                   ),
     .operand_i                (operand_i[AluA]                ),
     .operand_valid_i          (operand_valid_i[AluA]          ),
     .operand_issued_i         (operand_issued_i[AluA]         ),
@@ -102,6 +105,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     .lane_id_i                (lane_id_i                      ),
     .operand_queue_cmd_i      (operand_queue_cmd_i[AluB]      ),
     .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[AluB]),
+    .mask_b_cmd_pop_o         (/* Unused */                   ),
     .operand_i                (operand_i[AluB]                ),
     .operand_valid_i          (operand_valid_i[AluB]          ),
     .operand_issued_i         (operand_issued_i[AluB]         ),
@@ -133,6 +137,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     .lane_id_i                (lane_id_i                         ),
     .operand_queue_cmd_i      (operand_queue_cmd_i[MulFPUA]      ),
     .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MulFPUA]),
+    .mask_b_cmd_pop_o         (/* Unused */                      ),
     .operand_i                (operand_i[MulFPUA]                ),
     .operand_valid_i          (operand_valid_i[MulFPUA]          ),
     .operand_issued_i         (operand_issued_i[MulFPUA]         ),
@@ -160,6 +165,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     .lane_id_i                (lane_id_i                         ),
     .operand_queue_cmd_i      (operand_queue_cmd_i[MulFPUB]      ),
     .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MulFPUB]),
+    .mask_b_cmd_pop_o         (/* Unused */                      ),
     .operand_i                (operand_i[MulFPUB]                ),
     .operand_valid_i          (operand_valid_i[MulFPUB]          ),
     .operand_issued_i         (operand_issued_i[MulFPUB]         ),
@@ -187,6 +193,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     .lane_id_i                (lane_id_i                         ),
     .operand_queue_cmd_i      (operand_queue_cmd_i[MulFPUC]      ),
     .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MulFPUC]),
+    .mask_b_cmd_pop_o         (/* Unused */                      ),
     .operand_i                (operand_i[MulFPUC]                ),
     .operand_valid_i          (operand_valid_i[MulFPUC]          ),
     .operand_issued_i         (operand_issued_i[MulFPUC]         ),
@@ -215,6 +222,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     .lane_id_i                (lane_id_i                     ),
     .operand_queue_cmd_i      (operand_queue_cmd_i[StA]      ),
     .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[StA]),
+    .mask_b_cmd_pop_o         (/* Unused */                  ),
     .operand_i                (operand_i[StA]                ),
     .operand_valid_i          (operand_valid_i[StA]          ),
     .operand_issued_i         (operand_issued_i[StA]         ),
@@ -259,6 +267,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     .lane_id_i                (lane_id_i                                                   ),
     .operand_queue_cmd_i      (operand_queue_cmd_i[SlideAddrGenA]                          ),
     .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[SlideAddrGenA]                    ),
+    .mask_b_cmd_pop_o         (/* Unused */                                                ),
     .operand_i                (operand_i[SlideAddrGenA]                                    ),
     .operand_valid_i          (operand_valid_i[SlideAddrGenA]                              ),
     .operand_issued_i         (operand_issued_i[SlideAddrGenA]                             ),
@@ -274,15 +283,16 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   /////////////////
 
   operand_queue #(
-    .CmdBufDepth        (MaskuInsnQueueDepth  ),
-    .DataBufDepth       (1                    ),
-    .FPUSupport         (FPUSupportNone       ),
-    .SupportIntExt2     (1'b1                 ),
-    .SupportIntExt4     (1'b1                 ),
-    .SupportIntExt8     (1'b1                 ),
-    .NrLanes            (NrLanes              ),
-    .VLEN               (VLEN                 ),
-    .operand_queue_cmd_t(operand_queue_cmd_t  )
+    .CmdBufDepth        (MaskuInsnQueueDepth + VrgatherOpQueueBufDepth ),
+    .DataBufDepth       (MaskuInsnQueueDepth + VrgatherOpQueueBufDepth ),
+    .IsVrgatherOpqueue  (1'b1                                          ),
+    .FPUSupport         (FPUSupportNone                                ),
+    .SupportIntExt2     (1'b1                                          ),
+    .SupportIntExt4     (1'b1                                          ),
+    .SupportIntExt8     (1'b1                                          ),
+    .NrLanes            (NrLanes                                       ),
+    .VLEN               (VLEN                                          ),
+    .operand_queue_cmd_t(operand_queue_cmd_t                           )
   ) i_operand_queue_mask_b (
     .clk_i                    (clk_i                           ),
     .rst_ni                   (rst_ni                          ),
@@ -290,6 +300,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     .lane_id_i                (lane_id_i                       ),
     .operand_queue_cmd_i      (operand_queue_cmd_i[MaskB]      ),
     .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MaskB]),
+    .mask_b_cmd_pop_o         (mask_b_cmd_pop_o                ),
     .operand_i                (operand_i[MaskB]                ),
     .operand_valid_i          (operand_valid_i[MaskB]          ),
     .operand_issued_i         (operand_issued_i[MaskB]         ),
@@ -314,6 +325,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     .lane_id_i                (lane_id_i                       ),
     .operand_queue_cmd_i      (operand_queue_cmd_i[MaskM]      ),
     .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MaskM]),
+    .mask_b_cmd_pop_o         (/* Unused */                    ),
     .operand_i                (operand_i[MaskM]                ),
     .operand_valid_i          (operand_valid_i[MaskM]          ),
     .operand_issued_i         (operand_issued_i[MaskM]         ),
@@ -324,4 +336,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
     .operand_ready_i          (mask_operand_ready_i[0]         )
   );
 
+  // Checks: power of 2 required; depth 1 would block the lane sequencer's `cnt != Depth-1` test
+  if ((VrgatherOpQueueBufDepth < 2) || ((2**$clog2(VrgatherOpQueueBufDepth)) != VrgatherOpQueueBufDepth)) $fatal(1, "Parameter VrgatherOpQueueBufDepth must be a power of 2 and at least 2.");
+
 endmodule : operand_queues_stage
diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv
index f085d8471..8af7512ff 100644
--- a/hardware/src/lane/operand_requester.sv
+++ b/hardware/src/lane/operand_requester.sv
@@ -39,6 +39,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
     output logic                 [NrOperandQueues-1:0] operand_issued_o,
     output operand_queue_cmd_t   [NrOperandQueues-1:0] operand_queue_cmd_o,
     output logic                 [NrOperandQueues-1:0] operand_queue_cmd_valid_o,
+    // VRGATHER/VCOMPRESS support - NOTE(review): not driven or connected by this patch yet
+    output logic                                       masku_b_operand_queue_ready_o,
     // Interface with the VFUs
     // ALU
     input  logic                                       alu_result_req_i,
diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv
index 53a14e177..74c560019 100644
--- a/hardware/src/lane/valu.sv
+++ b/hardware/src/lane/valu.sv
@@ -786,13 +786,13 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
     //////////////////////////////
 
     if (!vinsn_queue_full && vfu_operation_valid_i &&
-      (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMXNOR]})) begin
+      (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VCOMPRESS]})) begin
       vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i;
       // Do not wait for masks if, during a reduction, this lane is just a pass-through
       // The only valid instructions here with vl == '0 are reductions
       // Instructions that execute in the mask unit will process the mask there directly
       // VMADC/VMSBC requires mask bits in the ALU
-      vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMSEQ:VMXNOR]} && !(vfu_operation_i.op inside {[VMADC:VMSBC]})
+      vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMSEQ:VCOMPRESS]} && !(vfu_operation_i.op inside {[VMADC:VMSBC]})
                                                        ? 1'b1
                                                        : vfu_operation_i.vm | (vfu_operation_i.vl == '0);
 
diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv
index b12675768..46f6cf65f 100644
--- a/hardware/src/masku/masku.sv
+++ b/hardware/src/masku/masku.sv
@@ -46,7 +46,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     output logic     [NrLanes-1:0]                     masku_vrgat_req_valid_o,
     input  logic     [NrLanes-1:0]                     masku_vrgat_req_ready_i,
     output vrgat_req_t                                 masku_vrgat_req_o,
-    output logic                                       masku_vrgat_end_o,
     // Interface with the VFUs
     output strb_t    [NrLanes-1:0]                     mask_o,
     output logic     [NrLanes-1:0]                     mask_valid_o,
@@ -98,7 +97,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   // vd (deshuffled)
   logic  [NrLanes*DataWidth-1:0] masku_operand_vd_seq;
   logic  [     NrLanes-1:0] masku_operand_vd_seq_valid;
-  logic  [     NrLanes-1:0] masku_operand_vd_seq_ready;
 
   // Mask
   elen_t [NrLanes-1:0] masku_operand_m;
@@ -154,6 +152,11 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     .alu_result_compressed_seq_o   (   alu_result_compressed_seq )
   );
 
+  // Local Parameter for VRGATHER/VCOMPRESS instructions
+  //
+  // Don't change this parameter!
+  localparam integer unsigned VrgatherParallelism = 1;
+
   // Local Parameter for mask logical instructions
   //
   // Don't change this parameter!
@@ -226,10 +229,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   logic  [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_shuf;
   logic                          found_one, found_one_d, found_one_q;
 
-  // VRGATHER/VCOMPRESS signals
-  // Current vrgather index
-  logic [15:0] vrgat_idx;
-
   // How many elements we are processing per cycle
   logic [idx_width(NrLanes*DataWidth):0] delta_elm_d, delta_elm_q;
 
@@ -387,22 +386,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   //  ALU counters  //
   ////////////////////
 
-  // Compile-time minimum among five different numbers
-  function automatic int unsigned min5(int unsigned a, int unsigned b, int unsigned c, int unsigned d, int unsigned e);
-      return (a < b) ? ((a < c) ? ((a < d) ? ((a < e) ? a : e) : (d < e ? d : e))
-                                 : (c < d) ? ((c < e) ? c : e) : (d < e ? d : e))
-                     : ((b < c) ? ((b < d) ? ((b < e) ? b : e) : (d < e ? d : e))
-                                 : (c < d) ? ((c < e) ? c : e) : (d < e ? d : e));
-  endfunction
-
   // What is the minimum supported parallelism?
-  localparam int unsigned MIN_MASKU_ALU_WIDTH = min5(
-      ViotaParallelism,
-      VmsxfParallelism,
-      VmLogicalParallelism,
-      VcpopParallelism,
-      VfirstParallelism
-  );
+  localparam int unsigned MIN_MASKU_ALU_WIDTH = VrgatherParallelism; // Narrowest MASKU ALU operation
 
   localparam int unsigned IN_READY_CNT_WIDTH = idx_width(NrLanes * DataWidth / MIN_MASKU_ALU_WIDTH);
   typedef logic [IN_READY_CNT_WIDTH-1:0] in_ready_cnt_t;
@@ -517,7 +502,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 
   // Sequential counter for vcompress
   vlen_t vrgat_cnt_d, vrgat_cnt_q;
-  logic [NrLanes*DataWidth-1:0] vrgat_threshold_d, vrgat_threshold_q;
   logic vcompress_bit;
 
   // FIFO-related signals
@@ -531,6 +515,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   logic [4:0] vrgat_req_vs_d;
   logic vrgat_req_is_last_req_d;
 
+  // If VRGATHEREI16, vsew == EW16 -> shift-by-1
+  logic [1:0] vrgat_eff_vsew;
+  assign vrgat_eff_vsew = (pe_req_i.op == VRGATHEREI16) ? 2'b1 : unsigned'(pe_req_i.vtype.vsew);
+
   assign vrgat_req_eew_d = vinsn_issue.op == VRGATHEREI16 ? EW16 : vinsn_issue.vtype.vsew;
   assign vrgat_req_vs_d  = vinsn_issue.vs1;
 
@@ -551,7 +539,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   // Synchronize the handshake between MASKU and lanes since we are making a single request
   // to all the lanes, which can also answer individually
   always_comb begin
-
     // Don't do anything by default
     vrgat_req_fifo_pop = 1'b0;
 
@@ -569,7 +556,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     if (&masku_vrgat_req_ready_i) vrgat_req_valid_mask_d = '0;
 
     // Pop the current address if all the lanes have handshaked it
-    if (&(masku_vrgat_req_ready_i | vrgat_req_valid_mask_q)) vrgat_req_fifo_pop = 1'b1;
+    if (&(masku_vrgat_req_ready_i | vrgat_req_valid_mask_q) && ~vrgat_req_fifo_empty) vrgat_req_fifo_pop = 1'b1;
   end
 
   // Overflow after 16-bits
@@ -707,6 +694,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     viota_acc_d = viota_acc_q;
     for (int i = 0; i < ViotaParallelism; i++) viota_res[i] = '0;
 
+    be_vrgat_seq_d = '0;
+
     if (vinsn_issue_valid) begin
       // Evaluate the instruction
       unique case (vinsn_issue.op) inside
@@ -1220,6 +1209,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     // different input and output data widths, meaning that the input ready and the final output valid
     // are not always synchronized.
 
+    vrgat_idx_fifo_pop = 1'b0;
+
     // How many elements {VIOTA|VID} are writing to each lane
     elm_per_lane = processing_cnt_q / NrLanes;
     if ((processing_cnt_q / NrLanes) > 4'b1000)
@@ -1247,7 +1238,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
       // Therefore, VRGATHER/VCOMPRESS's operand are special. Only the vd operand works in the MASKU ALU.
       if (!result_queue_full && (&masku_operand_alu_valid || vinsn_issue.op inside {VID,[VRGATHER:VCOMPRESS]})
                              && (&masku_operand_vd_valid  || !vinsn_issue.use_vd_op)
-                             && (&masku_operand_m_valid   || vinsn_issue.vm || vinsn_issue.op inside {[VMADC:VMSBC]})) begin
+                             && (&masku_operand_m_valid   || vinsn_issue.vm || vinsn_issue.op inside {[VMADC:VMSBC]})
+                             && (!vrgat_idx_fifo_empty    || !(vinsn_issue.op inside {[VRGATHER:VCOMPRESS]}))) begin
 
         // Write the result queue on the background data - either vd or the previous result
         // The mask vector writes at 1 (tail-agnostic ok value) both the background body
@@ -1270,6 +1262,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         found_one_d = found_one;
         viota_acc_d = viota_acc;
         vrf_pnt_d   = vrf_pnt_q + delta_elm_q;
+        vrgat_idx_fifo_pop = (vinsn_issue.op inside {[VRGATHER:VCOMPRESS]}); // Pop only for index-based ops
 
         // Increment the input, input-mask, and output slice counters
         if (!(vinsn_issue.op inside {[VRGATHER:VCOMPRESS]})) in_ready_cnt_en = 1'b1;
@@ -1567,9 +1560,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
             out_valid_threshold_d  = '0;
           end
           default: begin // VRGATHER, VRGATHEREI16, VCOMPRESS
-            // If VRGATHEREI16, vsew == EW16 -> shift-by-1
-            static logic [1:0] vrgat_eff_vsew = (pe_req_i.op == VRGATHEREI16) ? 1 : pe_req_i.vtype.vsew[1:0];
-
             delta_elm_d = 1;
 
             in_ready_threshold_d   = pe_req_i.op == VCOMPRESS ? NrLanes*DataWidth : ((NrLanes*DataWidth/8) >> vrgat_eff_vsew)-1;