From ef73126c04ea1d695abd4f0cbc06dfae44c7f5d5 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Mon, 25 Nov 2024 19:05:47 +0100 Subject: [PATCH] [hardware] WIP: vrgather / vcompress debug --- hardware/include/ara_pkg.sv | 4 ++ hardware/src/lane/lane.sv | 6 +++ hardware/src/lane/lane_sequencer.sv | 29 ++++++++++++-- hardware/src/lane/operand_queue.sv | 14 ++++++- hardware/src/lane/operand_queues_stage.sv | 33 +++++++++++----- hardware/src/lane/operand_requester.sv | 2 + hardware/src/lane/valu.sv | 4 +- hardware/src/masku/masku.sv | 46 +++++++++-------------- 8 files changed, 95 insertions(+), 43 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 10b444d8c..a8526da88 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -967,6 +967,10 @@ package ara_pkg; // VRGATHER / VCOMPRESS // ////////////////////////// + // Buffer more elements in MaskB opqueue + // This should be a power of 2 + localparam VrgatherOpQueueBufDepth = 2; + // Indices are 16-bit at most because of RISC-V V VLEN limitation at 64Kibit typedef logic [$clog2(rvv_pkg::RISCV_MAX_VLEN)-1:0] max_vlen_t; diff --git a/hardware/src/lane/lane.sv b/hardware/src/lane/lane.sv index e453e2c61..2af0f9125 100644 --- a/hardware/src/lane/lane.sv +++ b/hardware/src/lane/lane.sv @@ -217,6 +217,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( logic [NrVInsn-1:0] alu_vinsn_done; logic mfpu_ready; logic [NrVInsn-1:0] mfpu_vinsn_done; + // Interface with the MaskB operand queue (VRGATHER/VCOMPRESS) + logic mask_b_cmd_pop; // Additional signals to please Verilator's hierarchical verilation pe_req_t pe_req; @@ -246,6 +248,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( .operand_request_ready_i(operand_request_ready), .alu_vinsn_done_o (alu_vinsn_done_o ), .mfpu_vinsn_done_o (mfpu_vinsn_done_o ), + // Interface with the Operand Queue + .mask_b_cmd_pop_i (mask_b_cmd_pop ), // Interface with the VFUs .vfu_operation_o (vfu_operation ), 
.vfu_operation_valid_o (vfu_operation_valid ), @@ -437,6 +441,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( .operand_queue_ready_o (operand_queue_ready ), .operand_queue_cmd_i (operand_queue_cmd ), .operand_queue_cmd_valid_i (operand_queue_cmd_valid ), + // Interface with the Lane Sequencer + .mask_b_cmd_pop_o (mask_b_cmd_pop ), // Interface with the VFUs // ALU .alu_operand_o (alu_operand ), diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index d7824a219..dab9ac1e2 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -31,6 +31,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: input logic [NrOperandQueues-1:0] operand_request_ready_i, output logic alu_vinsn_done_o, output logic mfpu_vinsn_done_o, + // Interface with the Operand Queue (MaskB - for VRGATHER) + input logic mask_b_cmd_pop_i, // Interface with the lane's VFUs output vfu_operation_t vfu_operation_o, output logic vfu_operation_valid_o, @@ -167,6 +169,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vrgat_req_t masku_vrgat_req_q; logic masku_vrgat_req_ready_d, masku_vrgat_req_valid_q; + logic [idx_width(VrgatherOpQueueBufDepth)-1:0] vrgat_cmd_req_cnt_d, vrgat_cmd_req_cnt_q; + spill_register #( .T ( vrgat_req_t ) ) i_spill_register_vrgat_req ( @@ -183,6 +187,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: always_comb begin masku_vrgat_req_ready_d = 1'b0; + vrgat_state_d = vrgat_state_q; + + vrgat_cmd_req_cnt_d = vrgat_cmd_req_cnt_q; + // If MASKU request arrives, wait until the MaskB requester is free // Also, lock the MaskB opqueue unique case (vrgat_state_q) @@ -192,8 +200,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: end end REQUESTING: begin - // Pop if the operand requester is ready to accept a request - masku_vrgat_req_ready_d = masku_vrgat_req_valid_q && 
!(operand_request_valid_o[MaskB]); + // Pop if the operand requester and the queue are ready to accept requests + masku_vrgat_req_ready_d = masku_vrgat_req_valid_q & !(operand_request_valid_o[MaskB]) + & (vrgat_cmd_req_cnt_q != (VrgatherOpQueueBufDepth-1)); + + // Increase the counter if we handshake + if (masku_vrgat_req_ready_d) + vrgat_cmd_req_cnt_d = vrgat_cmd_req_cnt_q + 1; + // Decrease the counter if the MaskB opqueue popped a cmd + if (mask_b_cmd_pop_i) + vrgat_cmd_req_cnt_d = vrgat_cmd_req_cnt_q - 1; + // If the MASKU is over with VRGATHER/VCOMPRESS, return to idle if (masku_vrgat_req_ready_d && masku_vrgat_req_q.is_last_req) begin vrgat_state_d = IDLE; @@ -874,7 +891,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: hazard : '0, default : '0 }; - operand_request_push[MaskB] = masku_vrgat_req_valid_q && !(operand_request_valid_o[MaskB]); + operand_request_push[MaskB] = masku_vrgat_req_ready_d; end end: sequencer @@ -888,6 +905,9 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: alu_vinsn_done_o <= 1'b0; mfpu_vinsn_done_o <= 1'b0; + + vrgat_state_q <= IDLE; + vrgat_cmd_req_cnt_q <= '0; end else begin vinsn_done_q <= vinsn_done_d; vinsn_running_q <= vinsn_running_d; @@ -897,6 +917,9 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: alu_vinsn_done_o <= alu_vinsn_done_d; mfpu_vinsn_done_o <= mfpu_vinsn_done_d; + + vrgat_state_q <= vrgat_state_d; + vrgat_cmd_req_cnt_q <= vrgat_cmd_req_cnt_d; end end diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv index 0ccdcc6ae..e3b2a6b70 100644 --- a/hardware/src/lane/operand_queue.sv +++ b/hardware/src/lane/operand_queue.sv @@ -14,6 +14,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i parameter int unsigned NrSlaves = 1, parameter int unsigned NrLanes = 0, parameter int unsigned VLEN = 0, + parameter bit IsVrgatherOpqueue = 0, // Support for 
floating-point data types parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, // Supported conversions @@ -37,6 +38,8 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // Interface with the Operand Requester input operand_queue_cmd_t operand_queue_cmd_i, input logic operand_queue_cmd_valid_i, + // Interface with the Lane Sequencer + output logic mask_b_cmd_pop_o, // Interface with the Vector Register File input elen_t operand_i, input logic operand_valid_i, @@ -54,7 +57,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i ////////////////////// operand_queue_cmd_t cmd; - logic cmd_pop; + logic cmd_pop, cmd_pop_q; fifo_v3 #( .DEPTH(CmdBufDepth ), @@ -73,6 +76,13 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i .usage_o (/* Unused */ ) ); + // If this is the MaskB opqueue, propagate the + // pop information for the cmd buffer + if (IsVrgatherOpqueue) + assign mask_b_cmd_pop_o = cmd_pop_q; + else + assign mask_b_cmd_pop_o = 1'b0; + ////////////// // Buffer // ////////////// @@ -123,8 +133,10 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i always_ff @(posedge clk_i or negedge rst_ni) begin: p_ibuf_usage_ff if (!rst_ni) begin ibuf_usage_q <= '0; + cmd_pop_q <= 1'b0; end else begin ibuf_usage_q <= ibuf_usage_d; + cmd_pop_q <= cmd_pop; end end diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv index d03e8a6a4..4fb49dbb2 100644 --- a/hardware/src/lane/operand_queues_stage.sv +++ b/hardware/src/lane/operand_queues_stage.sv @@ -24,6 +24,8 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math output logic [NrOperandQueues-1:0] operand_queue_ready_o, input operand_queue_cmd_t [NrOperandQueues-1:0] operand_queue_cmd_i, input logic [NrOperandQueues-1:0] operand_queue_cmd_valid_i, + // Interface with the Lane Sequencer + output logic mask_b_cmd_pop_o, // 
Interface with the VFUs // ALU output elen_t [1:0] alu_operand_o, @@ -73,6 +75,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[AluA] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[AluA]), + .mask_b_cmd_pop_o (/* Unused */ ), .operand_i (operand_i[AluA] ), .operand_valid_i (operand_valid_i[AluA] ), .operand_issued_i (operand_issued_i[AluA] ), @@ -102,6 +105,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[AluB] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[AluB]), + .mask_b_cmd_pop_o (/* Unused */ ), .operand_i (operand_i[AluB] ), .operand_valid_i (operand_valid_i[AluB] ), .operand_issued_i (operand_issued_i[AluB] ), @@ -133,6 +137,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[MulFPUA] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MulFPUA]), + .mask_b_cmd_pop_o (/* Unused */ ), .operand_i (operand_i[MulFPUA] ), .operand_valid_i (operand_valid_i[MulFPUA] ), .operand_issued_i (operand_issued_i[MulFPUA] ), @@ -160,6 +165,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[MulFPUB] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MulFPUB]), + .mask_b_cmd_pop_o (/* Unused */ ), .operand_i (operand_i[MulFPUB] ), .operand_valid_i (operand_valid_i[MulFPUB] ), .operand_issued_i (operand_issued_i[MulFPUB] ), @@ -187,6 +193,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[MulFPUC] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MulFPUC]), + .mask_b_cmd_pop_o (/* Unused */ ), .operand_i (operand_i[MulFPUC] ), 
.operand_valid_i (operand_valid_i[MulFPUC] ), .operand_issued_i (operand_issued_i[MulFPUC] ), @@ -215,6 +222,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[StA] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[StA]), + .mask_b_cmd_pop_o (/* Unused */ ), .operand_i (operand_i[StA] ), .operand_valid_i (operand_valid_i[StA] ), .operand_issued_i (operand_issued_i[StA] ), @@ -259,6 +267,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[SlideAddrGenA] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[SlideAddrGenA] ), + .mask_b_cmd_pop_o (/* Unused */ ), .operand_i (operand_i[SlideAddrGenA] ), .operand_valid_i (operand_valid_i[SlideAddrGenA] ), .operand_issued_i (operand_issued_i[SlideAddrGenA] ), @@ -274,15 +283,16 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ///////////////// operand_queue #( - .CmdBufDepth (MaskuInsnQueueDepth ), - .DataBufDepth (1 ), - .FPUSupport (FPUSupportNone ), - .SupportIntExt2 (1'b1 ), - .SupportIntExt4 (1'b1 ), - .SupportIntExt8 (1'b1 ), - .NrLanes (NrLanes ), - .VLEN (VLEN ), - .operand_queue_cmd_t(operand_queue_cmd_t ) + .CmdBufDepth (MaskuInsnQueueDepth + VrgatherOpQueueBufDepth ), + .DataBufDepth (MaskuInsnQueueDepth + VrgatherOpQueueBufDepth ), + .IsVrgatherOpqueue (1'b1 ), + .FPUSupport (FPUSupportNone ), + .SupportIntExt2 (1'b1 ), + .SupportIntExt4 (1'b1 ), + .SupportIntExt8 (1'b1 ), + .NrLanes (NrLanes ), + .VLEN (VLEN ), + .operand_queue_cmd_t(operand_queue_cmd_t ) ) i_operand_queue_mask_b ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -290,6 +300,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[MaskB] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MaskB]), + 
.mask_b_cmd_pop_o (mask_b_cmd_pop_o ), .operand_i (operand_i[MaskB] ), .operand_valid_i (operand_valid_i[MaskB] ), .operand_issued_i (operand_issued_i[MaskB] ), @@ -314,6 +325,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[MaskM] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MaskM]), + .mask_b_cmd_pop_o (/* Unused */ ), .operand_i (operand_i[MaskM] ), .operand_valid_i (operand_valid_i[MaskM] ), .operand_issued_i (operand_issued_i[MaskM] ), @@ -324,4 +336,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math .operand_ready_i (mask_operand_ready_i[0] ) ); + // Checks + if (VrgatherOpQueueBufDepth % 2 != 0) $fatal(1, "Parameter VrgatherOpQueueBufDepth must be power of 2."); + endmodule : operand_queues_stage diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index f085d8471..8af7512ff 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -39,6 +39,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( output logic [NrOperandQueues-1:0] operand_issued_o, output operand_queue_cmd_t [NrOperandQueues-1:0] operand_queue_cmd_o, output logic [NrOperandQueues-1:0] operand_queue_cmd_valid_o, + // VRGATHER/VCOMPRESS support + output logic masku_b_operand_queue_ready_o, // Interface with the VFUs // ALU input logic alu_result_req_i, diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index 53a14e177..74c560019 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -786,13 +786,13 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; ////////////////////////////// if (!vinsn_queue_full && vfu_operation_valid_i && - (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMXNOR]})) begin + (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside 
{[VMSEQ:VCOMPRESS]})) begin vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; // Do not wait for masks if, during a reduction, this lane is just a pass-through // The only valid instructions here with vl == '0 are reductions // Instructions that execute in the mask unit will process the mask there directly // VMADC/VMSBC requires mask bits in the ALU - vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMSEQ:VMXNOR]} && !(vfu_operation_i.op inside {[VMADC:VMSBC]}) + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMSEQ:VCOMPRESS]} && !(vfu_operation_i.op inside {[VMADC:VMSBC]}) ? 1'b1 : vfu_operation_i.vm | (vfu_operation_i.vl == '0); diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index b12675768..46f6cf65f 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -46,7 +46,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( output logic [NrLanes-1:0] masku_vrgat_req_valid_o, input logic [NrLanes-1:0] masku_vrgat_req_ready_i, output vrgat_req_t masku_vrgat_req_o, - output logic masku_vrgat_end_o, // Interface with the VFUs output strb_t [NrLanes-1:0] mask_o, output logic [NrLanes-1:0] mask_valid_o, @@ -98,7 +97,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // vd (deshuffled) logic [NrLanes*DataWidth-1:0] masku_operand_vd_seq; logic [ NrLanes-1:0] masku_operand_vd_seq_valid; - logic [ NrLanes-1:0] masku_operand_vd_seq_ready; // Mask elen_t [NrLanes-1:0] masku_operand_m; @@ -154,6 +152,11 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( .alu_result_compressed_seq_o ( alu_result_compressed_seq ) ); + // Local Parameter for VRGATHER/VCOMPRESS instructions + // + // Don't change this parameter! + localparam integer unsigned VrgatherParallelism = 1; + // Local Parameter for mask logical instructions // // Don't change this parameter! 
@@ -226,10 +229,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_shuf; logic found_one, found_one_d, found_one_q; - // VRGATHER/VCOMPRESS signals - // Current vrgather index - logic [15:0] vrgat_idx; - // How many elements we are processing per cycle logic [idx_width(NrLanes*DataWidth):0] delta_elm_d, delta_elm_q; @@ -387,22 +386,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // ALU counters // //////////////////// - // Compile-time minimum among five different numbers - function automatic int unsigned min5(int unsigned a, int unsigned b, int unsigned c, int unsigned d, int unsigned e); - return (a < b) ? ((a < c) ? ((a < d) ? ((a < e) ? a : e) : (d < e ? d : e)) - : (c < d) ? ((c < e) ? c : e) : (d < e ? d : e)) - : ((b < c) ? ((b < d) ? ((b < e) ? b : e) : (d < e ? d : e)) - : (c < d) ? ((c < e) ? c : e) : (d < e ? d : e)); - endfunction - // What is the minimum supported parallelism? - localparam int unsigned MIN_MASKU_ALU_WIDTH = min5( - ViotaParallelism, - VmsxfParallelism, - VmLogicalParallelism, - VcpopParallelism, - VfirstParallelism - ); + localparam int unsigned MIN_MASKU_ALU_WIDTH = 1; // VrgatherParallelism localparam int unsigned IN_READY_CNT_WIDTH = idx_width(NrLanes * DataWidth / MIN_MASKU_ALU_WIDTH); typedef logic [IN_READY_CNT_WIDTH-1:0] in_ready_cnt_t; @@ -517,7 +502,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Sequential counter for vcompress vlen_t vrgat_cnt_d, vrgat_cnt_q; - logic [NrLanes*DataWidth-1:0] vrgat_threshold_d, vrgat_threshold_q; logic vcompress_bit; // FIFO-related signals @@ -531,6 +515,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic [4:0] vrgat_req_vs_d; logic vrgat_req_is_last_req_d; + // If VRGATHEREI16, vsew == EW16 -> shift-by-1 + logic [1:0] vrgat_eff_vsew; + assign vrgat_eff_vsew = (pe_req_i.op == VRGATHEREI16) ? 
2'b1 : unsigned'(pe_req_i.vtype.vsew); + assign vrgat_req_eew_d = vinsn_issue.op == VRGATHEREI16 ? EW16 : vinsn_issue.vtype.vsew; assign vrgat_req_vs_d = vinsn_issue.vs1; @@ -551,7 +539,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Synchronize the handshake between MASKU and lanes since we are making a single request // to all the lanes, which can also answer individually always_comb begin - // Don't do anything by default vrgat_req_fifo_pop = 1'b0; @@ -569,7 +556,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( if (&masku_vrgat_req_ready_i) vrgat_req_valid_mask_d = '0; // Pop the current address if all the lanes have handshaked it - if (&(masku_vrgat_req_ready_i | vrgat_req_valid_mask_q)) vrgat_req_fifo_pop = 1'b1; + if (&(masku_vrgat_req_ready_i | vrgat_req_valid_mask_q) && ~vrgat_req_fifo_empty) vrgat_req_fifo_pop = 1'b1; end // Overflow after 16-bits @@ -707,6 +694,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( viota_acc_d = viota_acc_q; for (int i = 0; i < ViotaParallelism; i++) viota_res[i] = '0; + be_vrgat_seq_d = '0; + if (vinsn_issue_valid) begin // Evaluate the instruction unique case (vinsn_issue.op) inside @@ -1220,6 +1209,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // different input and output data widths, meaning that the input ready and the final output valid // are not always synchronized. + vrgat_idx_fifo_pop = 1'b0; + // How many elements {VIOTA|VID} are writing to each lane elm_per_lane = processing_cnt_q / NrLanes; if ((processing_cnt_q / NrLanes) > 4'b1000) @@ -1247,7 +1238,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Therefore, VRGATHER/VCOMPRESS's operand are special. Only the vd operand works in the MASKU ALU. 
if (!result_queue_full && (&masku_operand_alu_valid || vinsn_issue.op inside {VID,[VRGATHER:VCOMPRESS]}) && (&masku_operand_vd_valid || !vinsn_issue.use_vd_op) - && (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {[VMADC:VMSBC]})) begin + && (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {[VMADC:VMSBC]}) + && (!vrgat_idx_fifo_empty || vinsn_issue.op inside {[VRGATHER:VCOMPRESS]})) begin // Write the result queue on the background data - either vd or the previous result // The mask vector writes at 1 (tail-agnostic ok value) both the background body @@ -1270,6 +1262,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( found_one_d = found_one; viota_acc_d = viota_acc; vrf_pnt_d = vrf_pnt_q + delta_elm_q; + vrgat_idx_fifo_pop = 1'b1; // Increment the input, input-mask, and output slice counters if (!(vinsn_issue.op inside {[VRGATHER:VCOMPRESS]})) in_ready_cnt_en = 1'b1; @@ -1567,9 +1560,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( out_valid_threshold_d = '0; end default: begin // VRGATHER, VRGATHEREI16, VCOMPRESS - // If VRGATHEREI16, vsew == EW16 -> shift-by-1 - static logic [1:0] vrgat_eff_vsew = (pe_req_i.op == VRGATHEREI16) ? 1 : pe_req_i.vtype.vsew[1:0]; - delta_elm_d = 1; in_ready_threshold_d = pe_req_i.op == VCOMPRESS ? NrLanes*DataWidth : ((NrLanes*DataWidth/8) >> vrgat_eff_vsew)-1;