diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv
index abdec2c35..90f9b2163 100644
--- a/hardware/src/ara.sv
+++ b/hardware/src/ara.sv
@@ -594,6 +594,9 @@ module ara import ara_pkg::*; #(
     .masku_result_be_o       (masku_result_be        ),
     .masku_result_gnt_i      (masku_result_gnt       ),
     .masku_result_final_gnt_i(masku_result_final_gnt ),
+    .masku_vrgat_req_valid_o (                       ),
+    .masku_vrgat_req_ready_i ('0                     ),
+    .masku_vrgat_addr_o      (                       ),
     // Interface with the VFUs
     .mask_o                  (mask                   ),
     .mask_valid_o            (mask_valid             ),
diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv
index cdd2efb6d..3cde6ba00 100644
--- a/hardware/src/masku/masku.sv
+++ b/hardware/src/masku/masku.sv
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: SHL-0.51
 //
 // Author: Matheus Cavalcante
+// Author: Matteo Perotti
 // Description:
 // This is Ara's mask unit. It fetches operands from any one the lanes, and
 // then sends back to them whether the elements are predicated or not.
@@ -42,6 +43,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   output strb_t  [NrLanes-1:0] masku_result_be_o,
   input  logic   [NrLanes-1:0] masku_result_gnt_i,
   input  logic   [NrLanes-1:0] masku_result_final_gnt_i,
+  output logic   [NrLanes-1:0] masku_vrgat_req_valid_o,
+  input  logic   [NrLanes-1:0] masku_vrgat_req_ready_i,
+  output vaddr_t               masku_vrgat_addr_o,
   // Interface with the VFUs
   output strb_t  [NrLanes-1:0] mask_o,
   output logic   [NrLanes-1:0] mask_valid_o,
@@ -498,6 +502,94 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     .overflow_o(/* Unused */)
   );

+  ///////////////////////////////
+  ////  VRGATHER / VCOMPRESS ////
+  ///////////////////////////////
+
+  // Indices are 16-bit at most because RISC-V V caps VLEN at 64 Kibit
+  typedef logic [$clog2(RISCV_MAX_VLEN)-1:0] max_vlen_t;
+
+  // How deep are the VRGATHER/VCOMPRESS address/index FIFOs?
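+  // The depth bounds how many index/address pairs can be in flight between index
+  // generation and the lane grants; a deeper FIFO buys decoupling at the cost of area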
+  localparam int unsigned VrgatFifoDepth = 3;
+
+  // Mask bit sequentially selected by the m-operand delta counter
+  // VRGATHER: used as a mask bit by the MASKU ALU (write-back phase of VRGATHER)
+  // VCOMPRESS: used as an index bit to build the next index for address generation (first phase of VCOMPRESS)
+  logic vrgat_m_seq_bit;
+
+  // FIFO-related signals
+  logic vrgat_addr_fifo_empty, vrgat_addr_fifo_full, vrgat_addr_fifo_push, vrgat_addr_fifo_pop;
+  logic vrgat_idx_fifo_empty, vrgat_idx_fifo_full, vrgat_idx_fifo_push, vrgat_idx_fifo_pop;
+
+  // Mask for the valids: keep asserted only the lanes that have not handshaked yet and hide the others
+  logic [NrLanes-1:0] vrgat_addr_valid_mask_d, vrgat_addr_valid_mask_q;
+
+  // Build the address from the index
+  assign vrgat_addr_d = vaddr(vinsn_issue.vs2, NrLanes, VLEN) + vrgat_idx_d;
+
+  // Broadcast the address request to all the lanes
+  assign masku_vrgat_addr_o = vrgat_addr_q;
+
+  // Synchronize the handshake between MASKU and lanes since we are making a single request
+  // to all the lanes, which can also answer individually
+  always_comb begin
+    // Don't do anything by default
+    vrgat_addr_fifo_pop = 1'b0;
+
+    // Don't hide the valids by default
+    vrgat_addr_valid_mask_d = vrgat_addr_valid_mask_q;
+
+    for (int lane = 0; lane < NrLanes; lane++) begin
+      // Valid address request if the address fifo is not empty and if the valid is not masked
+      masku_vrgat_req_valid_o[lane] = ~vrgat_addr_fifo_empty & ~vrgat_addr_valid_mask_q[lane];
+      // Mask the next valid on this lane if the lane is handshaking
+      if (masku_vrgat_req_valid_o[lane] & masku_vrgat_req_ready_i[lane]) vrgat_addr_valid_mask_d[lane] = 1'b1;
+    end
+
+    // Pop the current address if all the lanes have handshaked it
+    if (&(masku_vrgat_req_ready_i | vrgat_addr_valid_mask_q)) vrgat_addr_fifo_pop = 1'b1;
+
+    // Unmask all the valids for the next address once the current one is popped
+    if (vrgat_addr_fifo_pop) vrgat_addr_valid_mask_d = '0;
+  end
+
+  // Save the indices into the MASKU ALU vrgather/vcompress queue for later use
+  // Also, save whether the index is out of range
+  fifo_v3 #(
+    .DATA_WIDTH($clog2(RISCV_MAX_VLEN) + 1),
+    .DEPTH     (VrgatFifoDepth            )
+  ) i_fifo_vrgat_idx (
+    .clk_i,
+    .rst_ni,
+    .flush_i   (1'b0                          ),
+    .testmode_i(1'b0                          ),
+    .full_o    (vrgat_idx_fifo_full           ),
+    .empty_o   (vrgat_idx_fifo_empty          ),
+    .usage_o   (/* unused */                  ),
+    .data_i    ({vrgat_idx_oor_d, vrgat_idx_d}),
+    .push_i    (vrgat_idx_fifo_push           ),
+    .data_o    ({vrgat_idx_oor_q, vrgat_idx_q}),
+    .pop_i     (vrgat_idx_fifo_pop            )
+  );
+
+  // Buffer the address requests to the lanes
+  fifo_v3 #(
+    .dtype(vaddr_t       ),
+    .DEPTH(VrgatFifoDepth)
+  ) i_fifo_vrgat_addr (
+    .clk_i,
+    .rst_ni,
+    .flush_i   (1'b0                 ),
+    .testmode_i(1'b0                 ),
+    .full_o    (vrgat_addr_fifo_full ),
+    .empty_o   (vrgat_addr_fifo_empty),
+    .usage_o   (/* unused */         ),
+    .data_i    (vrgat_addr_d         ),
+    .push_i    (vrgat_addr_fifo_push ),
+    .data_o    (vrgat_addr_q         ),
+    .pop_i     (vrgat_addr_fifo_pop  )
+  );
+
   ////////////////////////////
   ////  Scalar result reg ////
   ////////////////////////////
@@ -573,6 +665,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(

     vcpop_operand = '0;

+    vrgat_m_seq_bit = 1'b0;
+
     // The result mask should be created here since the output is a non-mask vector
     be_viota_seq_d = be_viota_seq_q;

@@ -712,23 +806,27 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         end
       endcase

-      // BE signal for VRGATHER
+      // BE signal for VRGATHER and VCOMPRESS
       unique case (vinsn_issue.vtype.vsew)
         EW8: begin
+          vrgat_m_seq_bit = masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]];
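+          // Write back this element either if the instruction is unmasked (vm set)
+          // or if its mask bit is set; the same pattern repeats for EW16/EW32/EW64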
           be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8)-1:0] * 1 +: 1] =
-            {1{vinsn_issue.vm}} | {1{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
+            {1{vinsn_issue.vm}} | {1{vrgat_m_seq_bit}};
         end
         EW16: begin
+          vrgat_m_seq_bit = masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]];
           be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/16)-1:0] * 2 +: 2] =
-            {2{vinsn_issue.vm}} | {2{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
+            {2{vinsn_issue.vm}} | {2{vrgat_m_seq_bit}};
         end
         EW32: begin
+          vrgat_m_seq_bit = masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]];
           be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/32)-1:0] * 4 +: 4] =
-            {4{vinsn_issue.vm}} | {4{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
+            {4{vinsn_issue.vm}} | {4{vrgat_m_seq_bit}};
         end
         default: begin // EW64
+          vrgat_m_seq_bit = masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]];
           be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/64)-1:0] * 8 +: 8] =
-            {8{vinsn_issue.vm}} | {8{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
+            {8{vinsn_issue.vm}} | {8{vrgat_m_seq_bit}};
         end
       endcase

@@ -975,20 +1073,93 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
       // We ask all the lanes in parallel for a precise index, and we will get a balanced payload from them.
       // Only one element of the payload is important, the rest is discarded.
       // This can be easily optimized by asking only the correct lane and by handling unbalanced payloads.
-      logic [NrLanes-1:0] vrgat_idx_req;
-      logic vrgat_idx_addr;
-
-      // VRGATHER
-
-
-      // VCOMPRESS
+      vcompress_cnt_d = vcompress_cnt_q;
+
+      vrgat_idx_d          = '0;
+      vrgat_idx_fifo_push  = 1'b0;
+      vrgat_addr_fifo_push = 1'b0;
+
+      // Track if an index overflow occurred past the 16 sampled bits
+      vrgat_idx_overflow = 1'b0;
+
+      // Track if the index is out of range
+      vrgat_idx_oor_d = 1'b0;
+
+      unique case (vinsn_issue.op)
+        VCOMPRESS: begin
+          vrgat_idx_d = vcompress_cnt_q;
+          if (~vrgat_idx_fifo_full && ~vrgat_addr_fifo_full) begin
+            // Count up if we can process the current mask bit
+            vcompress_cnt_d = vcompress_cnt_q + 1;
+            // Reusing vrgat_m_seq_bit is safe: VRGATHER and VCOMPRESS are mutually
+            // exclusive and use masku_operand_m in different ways
+            if (vrgat_m_seq_bit) begin
+              // Push this index and its address, since the FIFOs are free and the mask bit is set
+              vrgat_idx_fifo_push  = 1'b1;
+              vrgat_addr_fifo_push = 1'b1;
+            end
+          end
+        end
+        VRGATHER,
+        VRGATHEREI16: begin
+          // Find the maximum vector length. VLMAX = LMUL * VLEN / SEW.
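+          // VLENB >> vsew is the element count of one register (LMUL = 1); the LMUL
+          // shift below scales it, e.g., VLEN = 4096, SEW = 32, LMUL_2 -> VLMAX = 256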
+          automatic int unsigned vlmax = VLENB >> vinsn_issue.vtype.vsew;
+          unique case (vinsn_issue.vtype.vlmul)
+            LMUL_1  : vlmax <<= 0;
+            LMUL_2  : vlmax <<= 1;
+            LMUL_4  : vlmax <<= 2;
+            LMUL_8  : vlmax <<= 3;
+            // Fractional LMUL
+            LMUL_1_2: vlmax >>= 1;
+            LMUL_1_4: vlmax >>= 2;
+            LMUL_1_8: vlmax >>= 3;
+            default:;
+          endcase
+
+          // VRGATHER: treat the index as a vtype.vsew-bit number
+          if (vinsn_issue.op == VRGATHER) begin
+            unique case (vinsn_issue.vtype.vsew)
+              EW8: begin
+                vrgat_idx_d = {8'b0, masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/8)-1:0] * 8 +: 8]};
+              end
+              EW16: begin
+                vrgat_idx_d = masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/16)-1:0] * 16 +: 16];
+              end
+              EW32: begin
+                vrgat_idx_d        = masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/32)-1:0] * 32 +: 16];
+                vrgat_idx_overflow = |masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/32)-1:0] * 32 + 16 +: 32 - 16];
+              end
+              default: begin // EW64
+                vrgat_idx_d        = masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/64)-1:0] * 64 +: 16];
+                vrgat_idx_overflow = |masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/64)-1:0] * 64 + 16 +: 64 - 16];
+              end
+            endcase
+          end else begin
+            // VRGATHEREI16: treat the index as a 16-bit number
+            vrgat_idx_d = masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/16)-1:0] * 16 +: 16];
+          end

-          // Stream the address requests into the lanes in parallel
+          // VRGATHER.v[x|i] splats one scalar into Vd. The scalar is not truncated:
+          // its upper bits take part in the overflow check
+          if (vinsn_issue.use_scalar_op) begin
+            vrgat_idx_d        = vinsn_issue.scalar_op[15:0];
+            vrgat_idx_overflow = |vinsn_issue.scalar_op[16 +: ELEN - 16];
+          end

-          // Save the indices into the MASKU ALU vrgather/vcompress queue for later use
+          vrgat_idx_oor_d = (vrgat_idx_d >= vlmax) | vrgat_idx_overflow;

+          // Proceed if the FIFOs are not full
+          if (~vrgat_idx_fifo_full && ~vrgat_addr_fifo_full) begin
+            // Push the index no matter what
+            vrgat_idx_fifo_push = 1'b1;
+            // Request to the lanes only if the index is within range
+            if (!vrgat_idx_oor_d) begin
+              vrgat_addr_fifo_push = 1'b1;
+            end
+          end
+        end
+        default:;
+      endcase

     ///////////////////////
     // MASKU ALU Control //
     ///////////////////////
@@ -1026,8 +1197,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     if (vinsn_issue_valid && vinsn_issue.op inside {[VMFEQ:VMXNOR]}) begin
       // Compute one slice if we can write and the necessary inputs are valid
       // VID does not require any operand, while VRGATHER/VCOMPRESS's ALU operand is just preprocessed to get the indices.
-      // Therefore, VRGATHER/VCOMPRESS's ALU timing should follow the operands only, passed through Vd
+      // Therefore, VRGATHER/VCOMPRESS's operands are special: only the vd operand goes through the MASKU ALU.
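+      // The index phase above consumes the ALU operand on its own, so the condition
+      // below does not wait for masku_operand_alu_valid on VRGATHER/VCOMPRESS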
+      if (!result_queue_full && (&masku_operand_alu_valid || vinsn_issue.op inside {VID,[VRGATHER:VCOMPRESS]})
           && (&masku_operand_vd_valid || !vinsn_issue.use_vd_op)
           && (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {[VMADC:VMSBC]})) begin

@@ -1054,8 +1225,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         vrf_pnt_d = vrf_pnt_q + delta_elm_q;

         // Increment the input, input-mask, and output slice counters
-        in_ready_cnt_en   = 1'b1;
-        in_m_ready_cnt_en = 1'b1;
+        if (!(vinsn_issue.op inside {[VRGATHER:VCOMPRESS]})) in_ready_cnt_en = 1'b1;
+        if (!(vinsn_issue.op inside {[VMADC:VMSBC], VCOMPRESS})) in_m_ready_cnt_en = 1'b1;
         out_valid_cnt_en = 1'b1;

         // Account for the elements that have been processed
@@ -1065,7 +1236,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(

         // Request new input (by completing ready-valid handshake) once all slices have been processed
         // Alu input is accessed in different widths
-        if ((in_ready_cnt_q == in_ready_threshold_q) || (issue_cnt_d == '0)) begin
+        // VRGATHER and VCOMPRESS consume the ALU operand during index generation, before the MASKU ALU sees it
+        if (((in_ready_cnt_q == in_ready_threshold_q) || (issue_cnt_d == '0)) && !(vinsn_issue.op inside {[VRGATHER:VCOMPRESS]})) begin
           in_ready_cnt_clr = 1'b1;
           if (vinsn_issue.op != VID) begin
             masku_operand_alu_ready = '1;
           end
@@ -1073,7 +1245,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         // Mask is always accessed at bit level
         // VMADC, VMSBC handle masks in the mask queue
-        if ((in_m_ready_cnt_q == in_m_ready_threshold_q) || (issue_cnt_d == '0) && !(vinsn_issue.op inside {[VMADC:VMSBC]})) begin
+        // VCOMPRESS consumes the mask during index generation, before the MASKU ALU sees it
+        if (((in_m_ready_cnt_q == in_m_ready_threshold_q) || (issue_cnt_d == '0)) && !(vinsn_issue.op inside {[VMADC:VMSBC], VCOMPRESS})) begin
           in_m_ready_cnt_clr = 1'b1;
           if (!vinsn_issue.vm) begin
             masku_operand_m_ready = '1;
           end
@@ -1339,7 +1512,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
           in_m_ready_threshold_d = NrLanes*DataWidth/VcpopParallelism-1;
           out_valid_threshold_d  = '0;
         end
-        default: begin // VFIRST
+        VFIRST: begin
           // Mask to scalar
           delta_elm_d = VfirstParallelism;

@@ -1347,6 +1520,16 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
           in_m_ready_threshold_d = NrLanes*DataWidth/VfirstParallelism-1;
           out_valid_threshold_d  = '0;
         end
+        default: begin // VRGATHER, VRGATHEREI16, VCOMPRESS
+          // If VRGATHEREI16, vsew == EW16 -> shift-by-1
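+          // The indices are consumed at their own element width: the instruction's
+          // vsew for VRGATHER, but always 16 bit for VRGATHEREI16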
+          automatic logic [1:0] vrgat_eff_vsew = (pe_req_i.op == VRGATHEREI16) ? 1 : pe_req_i.vtype.vsew[1:0];
+
+          delta_elm_d = 1;
+
+          in_ready_threshold_d   = ((NrLanes*DataWidth/8) >> vrgat_eff_vsew)-1;
+          in_m_ready_threshold_d = NrLanes*DataWidth-1;
+          out_valid_threshold_d  = ((NrLanes*DataWidth/8) >> pe_req_i.vtype.vsew[1:0])-1;
+        end
       endcase

       // Reset the final grant vector
@@ -1368,45 +1551,47 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(

   always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin
-      vinsn_running_q        <= '0;
-      read_cnt_q             <= '0;
-      issue_cnt_q            <= '0;
-      processing_cnt_q       <= '0;
-      commit_cnt_q           <= '0;
-      vrf_pnt_q              <= '0;
-      mask_pnt_q             <= '0;
-      pe_resp_o              <= '0;
-      result_final_gnt_q     <= '0;
-      popcount_q             <= '0;
-      vfirst_count_q         <= '0;
-      delta_elm_q            <= '0;
-      in_ready_threshold_q   <= '0;
-      in_m_ready_threshold_q <= '0;
-      out_valid_threshold_q  <= '0;
-      viota_acc_q            <= '0;
-      found_one_q            <= '0;
-      be_viota_seq_q         <= '1; // Default: write
-      be_vrgat_seq_q         <= '1; // Default: write
+      vinsn_running_q         <= '0;
+      read_cnt_q              <= '0;
+      issue_cnt_q             <= '0;
+      processing_cnt_q        <= '0;
+      commit_cnt_q            <= '0;
+      vrf_pnt_q               <= '0;
+      mask_pnt_q              <= '0;
+      pe_resp_o               <= '0;
+      result_final_gnt_q      <= '0;
+      popcount_q              <= '0;
+      vfirst_count_q          <= '0;
+      delta_elm_q             <= '0;
+      in_ready_threshold_q    <= '0;
+      in_m_ready_threshold_q  <= '0;
+      out_valid_threshold_q   <= '0;
+      viota_acc_q             <= '0;
+      found_one_q             <= '0;
+      be_viota_seq_q          <= '1; // Default: write
+      be_vrgat_seq_q          <= '1; // Default: write
+      vrgat_addr_valid_mask_q <= '0;
     end else begin
-      vinsn_running_q        <= vinsn_running_d;
-      read_cnt_q             <= read_cnt_d;
-      issue_cnt_q            <= issue_cnt_d;
-      processing_cnt_q       <= processing_cnt_d;
-      commit_cnt_q           <= commit_cnt_d;
-      vrf_pnt_q              <= vrf_pnt_d;
-      mask_pnt_q             <= mask_pnt_d;
-      pe_resp_o              <= pe_resp;
-      result_final_gnt_q     <= result_final_gnt_d;
-      popcount_q             <= popcount_d;
-      vfirst_count_q         <= vfirst_count_d;
-      delta_elm_q            <= delta_elm_d;
-      in_ready_threshold_q   <= in_ready_threshold_d;
-      in_m_ready_threshold_q <= in_m_ready_threshold_d;
-      out_valid_threshold_q  <= out_valid_threshold_d;
-      viota_acc_q            <= viota_acc_d;
-      found_one_q            <= found_one_d;
-      be_viota_seq_q         <= be_viota_seq_d;
-      be_vrgat_seq_q         <= be_vrgat_seq_d;
+      vinsn_running_q         <= vinsn_running_d;
+      read_cnt_q              <= read_cnt_d;
+      issue_cnt_q             <= issue_cnt_d;
+      processing_cnt_q        <= processing_cnt_d;
+      commit_cnt_q            <= commit_cnt_d;
+      vrf_pnt_q               <= vrf_pnt_d;
+      mask_pnt_q              <= mask_pnt_d;
+      pe_resp_o               <= pe_resp;
+      result_final_gnt_q      <= result_final_gnt_d;
+      popcount_q              <= popcount_d;
+      vfirst_count_q          <= vfirst_count_d;
+      delta_elm_q             <= delta_elm_d;
+      in_ready_threshold_q    <= in_ready_threshold_d;
+      in_m_ready_threshold_q  <= in_m_ready_threshold_d;
+      out_valid_threshold_q   <= out_valid_threshold_d;
+      viota_acc_q             <= viota_acc_d;
+      found_one_q             <= found_one_d;
+      be_viota_seq_q          <= be_viota_seq_d;
+      be_vrgat_seq_q          <= be_vrgat_seq_d;
+      vrgat_addr_valid_mask_q <= vrgat_addr_valid_mask_d;
     end
   end
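Note for reviewers: the MASKU-to-lanes address handshake added above follows a simple
broadcast pattern. The sketch below is illustrative only (module and port names are
made up, not part of this patch): one request is broadcast to N receivers, each
receiver may accept in a different cycle, and the source consumes the request only
once every receiver has accepted it.

  module broadcast_handshake #(
    parameter int unsigned N = 4
  ) (
    input  logic         clk_i,
    input  logic         rst_ni,
    input  logic         req_valid_i, // A request is available (e.g., FIFO not empty)
    output logic         req_pop_o,   // All receivers accepted: consume the request
    output logic [N-1:0] valid_o,     // Per-receiver valid
    input  logic [N-1:0] ready_i      // Per-receiver ready
  );
    // Receivers that have already accepted the current request
    logic [N-1:0] done_d, done_q;

    always_comb begin
      done_d    = done_q;
      req_pop_o = 1'b0;
      for (int i = 0; i < N; i++) begin
        // Keep the valid asserted only towards receivers that have not accepted yet
        valid_o[i] = req_valid_i & ~done_q[i];
        // Record the acceptance on a completed handshake
        if (valid_o[i] & ready_i[i]) done_d[i] = 1'b1;
      end
      // Once every receiver has accepted, consume the request and start over
      if (&done_d) begin
        req_pop_o = 1'b1;
        done_d    = '0;
      end
    end

    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) done_q <= '0;
      else         done_q <= done_d;
    end
  endmodule

The MASKU implements the same idea with the vrgat address FIFO as the request source
and the per-lane valid mask playing the role of done_q.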