diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv
index abdec2c35..90f9b2163 100644
--- a/hardware/src/ara.sv
+++ b/hardware/src/ara.sv
@@ -594,6 +594,9 @@ module ara import ara_pkg::*; #(
     .masku_result_be_o       (masku_result_be        ),
     .masku_result_gnt_i      (masku_result_gnt       ),
     .masku_result_final_gnt_i(masku_result_final_gnt ),
+    .masku_vrgat_req_valid_o (                       ),
+    .masku_vrgat_req_ready_i ('0                     ),
+    .masku_vrgat_addr_o      (                       ),
     // Interface with the VFUs
     .mask_o                  (mask                   ),
     .mask_valid_o            (mask_valid             ),
diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv
index cdd2efb6d..3cde6ba00 100644
--- a/hardware/src/masku/masku.sv
+++ b/hardware/src/masku/masku.sv
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: SHL-0.51
 //
 // Author: Matheus Cavalcante
+// Author: Matteo Perotti
 // Description:
 // This is Ara's mask unit. It fetches operands from any one the lanes, and
 // then sends back to them whether the elements are predicated or not.
@@ -42,6 +43,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   output strb_t  [NrLanes-1:0] masku_result_be_o,
   input  logic   [NrLanes-1:0] masku_result_gnt_i,
   input  logic   [NrLanes-1:0] masku_result_final_gnt_i,
+  output logic   [NrLanes-1:0] masku_vrgat_req_valid_o,
+  input  logic   [NrLanes-1:0] masku_vrgat_req_ready_i,
+  output vaddr_t               masku_vrgat_addr_o,
   // Interface with the VFUs
   output strb_t  [NrLanes-1:0] mask_o,
   output logic   [NrLanes-1:0] mask_valid_o,
@@ -498,6 +502,94 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     .overflow_o(/* Unused */)
   );

+  ///////////////////////////////
+  ////  VRGATHER / VCOMPRESS ////
+  ///////////////////////////////
+
+  // Indices are 16-bit at most because RISC-V V caps VLEN at 64 Kibit
+  typedef logic [$clog2(RISCV_MAX_VLEN)-1:0] max_vlen_t;
+
+  // How deep are the VRGATHER/VCOMPRESS address/index FIFOs?
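+  // The depth bounds how many index/address pairs can be in flight between index
+  // generation and the lane grants; a deeper FIFO buys decoupling at the cost of area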
+  localparam int unsigned VrgatFifoDepth = 3;
+
+  // Mask bit sequentially selected by the m-operand delta counter
+  // VRGATHER: used as a mask bit by the MASKU ALU (write-back phase of VRGATHER)
+  // VCOMPRESS: used as an index bit to build the next index for address generation (first phase of VCOMPRESS)
+  logic vrgat_m_seq_bit;
+
+  // FIFO-related signals
+  logic vrgat_addr_fifo_empty, vrgat_addr_fifo_full, vrgat_addr_fifo_push, vrgat_addr_fifo_pop;
+  logic vrgat_idx_fifo_empty, vrgat_idx_fifo_full, vrgat_idx_fifo_push, vrgat_idx_fifo_pop;
+
+  // Mask for the valids: keep asserted only the lanes that have not handshaked yet and hide the others
+  logic [NrLanes-1:0] vrgat_addr_valid_mask_d, vrgat_addr_valid_mask_q;
+
+  // Build the address from the index
+  assign vrgat_addr_d = vaddr(vinsn_issue.vs2, NrLanes, VLEN) + vrgat_idx_d;
+
+  // Broadcast the address request to all the lanes
+  assign masku_vrgat_addr_o = vrgat_addr_q;
+
+  // Synchronize the handshake between MASKU and lanes since we are making a single request
+  // to all the lanes, which can also answer individually
+  always_comb begin
+    // Don't do anything by default
+    vrgat_addr_fifo_pop = 1'b0;
+
+    // Don't hide the valids by default
+    vrgat_addr_valid_mask_d = vrgat_addr_valid_mask_q;
+
+    for (int lane = 0; lane < NrLanes; lane++) begin
+      // Valid address request if the address fifo is not empty and if the valid is not masked
+      masku_vrgat_req_valid_o[lane] = ~vrgat_addr_fifo_empty & ~vrgat_addr_valid_mask_q[lane];
+      // Mask the next valid on this lane if the lane is handshaking
+      if (masku_vrgat_req_valid_o[lane] & masku_vrgat_req_ready_i[lane]) vrgat_addr_valid_mask_d[lane] = 1'b1;
+    end
+
+    // Pop the current address if all the lanes have handshaked it
+    if (&(masku_vrgat_req_ready_i | vrgat_addr_valid_mask_q)) vrgat_addr_fifo_pop = 1'b1;
+
+    // Unmask all the valids for the next address once the current one is popped
+    if (vrgat_addr_fifo_pop) vrgat_addr_valid_mask_d = '0;
+  end
+
+  // Save the indices into the MASKU ALU vrgather/vcompress queue for later use
+  // Also, save whether the index is out of range
+  fifo_v3 #(
+    .DATA_WIDTH($clog2(RISCV_MAX_VLEN) + 1),
+    .DEPTH     (VrgatFifoDepth            )
+  ) i_fifo_vrgat_idx (
+    .clk_i,
+    .rst_ni,
+    .flush_i   (1'b0                          ),
+    .testmode_i(1'b0                          ),
+    .full_o    (vrgat_idx_fifo_full           ),
+    .empty_o   (vrgat_idx_fifo_empty          ),
+    .usage_o   (/* unused */                  ),
+    .data_i    ({vrgat_idx_oor_d, vrgat_idx_d}),
+    .push_i    (vrgat_idx_fifo_push           ),
+    .data_o    ({vrgat_idx_oor_q, vrgat_idx_q}),
+    .pop_i     (vrgat_idx_fifo_pop            )
+  );
+
+  // Buffer the address requests to the lanes
+  fifo_v3 #(
+    .dtype(vaddr_t       ),
+    .DEPTH(VrgatFifoDepth)
+  ) i_fifo_vrgat_addr (
+    .clk_i,
+    .rst_ni,
+    .flush_i   (1'b0                 ),
+    .testmode_i(1'b0                 ),
+    .full_o    (vrgat_addr_fifo_full ),
+    .empty_o   (vrgat_addr_fifo_empty),
+    .usage_o   (/* unused */         ),
+    .data_i    (vrgat_addr_d         ),
+    .push_i    (vrgat_addr_fifo_push ),
+    .data_o    (vrgat_addr_q         ),
+    .pop_i     (vrgat_addr_fifo_pop  )
+  );
+
   ////////////////////////////
   ////  Scalar result reg ////
   ////////////////////////////
@@ -573,6 +665,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(

     vcpop_operand = '0;

+    vrgat_m_seq_bit = 1'b0;
+
     // The result mask should be created here since the output is a non-mask vector
     be_viota_seq_d = be_viota_seq_q;

@@ -712,23 +806,27 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         end
       endcase

-      // BE signal for VRGATHER
+      // BE signal for VRGATHER and VCOMPRESS
       unique case (vinsn_issue.vtype.vsew)
         EW8: begin
+          vrgat_m_seq_bit = masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]];
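+          // Write back this element either if the instruction is unmasked (vm set)
+          // or if its mask bit is set; the same pattern repeats for EW16/EW32/EW64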
           be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8)-1:0] * 1 +: 1] =
-            {1{vinsn_issue.vm}} | {1{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
+            {1{vinsn_issue.vm}} | {1{vrgat_m_seq_bit}};
         end
         EW16: begin
+          vrgat_m_seq_bit = masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]];
           be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/16)-1:0] * 2 +: 2] =
-            {2{vinsn_issue.vm}} | {2{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
+            {2{vinsn_issue.vm}} | {2{vrgat_m_seq_bit}};
         end
         EW32: begin
+          vrgat_m_seq_bit = masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]];
           be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/32)-1:0] * 4 +: 4] =
-            {4{vinsn_issue.vm}} | {4{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
+            {4{vinsn_issue.vm}} | {4{vrgat_m_seq_bit}};
         end
         default: begin // EW64
+          vrgat_m_seq_bit = masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]];
           be_vrgat_seq_d[out_valid_cnt_q[idx_width(NrLanes*DataWidth/64)-1:0] * 8 +: 8] =
-            {8{vinsn_issue.vm}} | {8{masku_operand_m_seq[in_m_ready_cnt_q[idx_width(NrLanes*DataWidth)-1:0]]}};
+            {8{vinsn_issue.vm}} | {8{vrgat_m_seq_bit}};
         end
       endcase

@@ -975,20 +1073,93 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
       // We ask all the lanes in parallel for a precise index, and we will get a balanced payload from them.
       // Only one element of the payload is important, the rest is discarded.
       // This can be easily optimized by asking only the correct lane and by handling unbalanced payloads.
-      logic [NrLanes-1:0] vrgat_idx_req;
-      logic vrgat_idx_addr;
-
-      // VRGATHER
-
-
-      // VCOMPRESS
+      vcompress_cnt_d = vcompress_cnt_q;
+
+      vrgat_idx_d          = '0;
+      vrgat_idx_fifo_push  = 1'b0;
+      vrgat_addr_fifo_push = 1'b0;
+
+      // Track if an index overflow occurred past the 16 sampled bits
+      vrgat_idx_overflow = 1'b0;
+
+      // Track if the index is out of range
+      vrgat_idx_oor_d = 1'b0;
+
+      unique case (vinsn_issue.op)
+        VCOMPRESS: begin
+          vrgat_idx_d = vcompress_cnt_q;
+          if (~vrgat_idx_fifo_full && ~vrgat_addr_fifo_full) begin
+            // Count up if we can process the current mask bit
+            vcompress_cnt_d = vcompress_cnt_q + 1;
+            // Reusing vrgat_m_seq_bit is safe: VRGATHER and VCOMPRESS are mutually
+            // exclusive and use masku_operand_m in different ways
+            if (vrgat_m_seq_bit) begin
+              // Push this index and its address, since the FIFOs are free and the mask bit is set
+              vrgat_idx_fifo_push  = 1'b1;
+              vrgat_addr_fifo_push = 1'b1;
+            end
+          end
+        end
+        VRGATHER,
+        VRGATHEREI16: begin
+          // Find the maximum vector length. VLMAX = LMUL * VLEN / SEW.
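+          // VLENB >> vsew is the element count of one register (LMUL = 1); the LMUL
+          // shift below scales it, e.g., VLEN = 4096, SEW = 32, LMUL_2 -> VLMAX = 256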
+          automatic int unsigned vlmax = VLENB >> vinsn_issue.vtype.vsew;
+          unique case (vinsn_issue.vtype.vlmul)
+            LMUL_1  : vlmax <<= 0;
+            LMUL_2  : vlmax <<= 1;
+            LMUL_4  : vlmax <<= 2;
+            LMUL_8  : vlmax <<= 3;
+            // Fractional LMUL
+            LMUL_1_2: vlmax >>= 1;
+            LMUL_1_4: vlmax >>= 2;
+            LMUL_1_8: vlmax >>= 3;
+            default:;
+          endcase
+
+          // VRGATHER: treat the index as a vtype.vsew-bit number
+          if (vinsn_issue.op == VRGATHER) begin
+            unique case (vinsn_issue.vtype.vsew)
+              EW8: begin
+                vrgat_idx_d = {8'b0, masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/8)-1:0] * 8 +: 8]};
+              end
+              EW16: begin
+                vrgat_idx_d = masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/16)-1:0] * 16 +: 16];
+              end
+              EW32: begin
+                vrgat_idx_d        = masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/32)-1:0] * 32 +: 16];
+                vrgat_idx_overflow = |masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/32)-1:0] * 32 + 16 +: 32 - 16];
+              end
+              default: begin // EW64
+                vrgat_idx_d        = masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/64)-1:0] * 64 +: 16];
+                vrgat_idx_overflow = |masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/64)-1:0] * 64 + 16 +: 64 - 16];
+              end
+            endcase
+          end else begin
+            // VRGATHEREI16: treat the index as a 16-bit number
+            vrgat_idx_d = masku_operand_alu_seq[in_ready_cnt_q[idx_width(NrLanes*DataWidth/16)-1:0] * 16 +: 16];
+          end

-          // Stream the address requests into the lanes in parallel
+          // VRGATHER.v[x|i] splats one scalar into Vd. The scalar is not truncated:
+          // its upper bits take part in the overflow check
+          if (vinsn_issue.use_scalar_op) begin
+            vrgat_idx_d        = vinsn_issue.scalar_op[15:0];
+            vrgat_idx_overflow = |vinsn_issue.scalar_op[16 +: ELEN - 16];
+          end

-          // Save the indices into the MASKU ALU vrgather/vcompress queue for later use
+          vrgat_idx_oor_d = (vrgat_idx_d >= vlmax) | vrgat_idx_overflow;

+          // Proceed if the FIFOs are not full
+          if (~vrgat_idx_fifo_full && ~vrgat_addr_fifo_full) begin
+            // Push the index no matter what
+            vrgat_idx_fifo_push = 1'b1;
+            // Request to the lanes only if the index is within range
+            if (!vrgat_idx_oor_d) begin
+              vrgat_addr_fifo_push = 1'b1;
+            end
+          end
+        end
+        default:;
+      endcase

     ///////////////////////
     // MASKU ALU Control //
     ///////////////////////
@@ -1026,8 +1197,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     if (vinsn_issue_valid && vinsn_issue.op inside {[VMFEQ:VMXNOR]}) begin
       // Compute one slice if we can write and the necessary inputs are valid
       // VID does not require any operand, while VRGATHER/VCOMPRESS's ALU operand is just preprocessed to get the indices.
-      // Therefore, VRGATHER/VCOMPRESS's ALU timing should follow the operands only, passed through Vd
+      // Therefore, VRGATHER/VCOMPRESS's operands are special: only the vd operand goes through the MASKU ALU.
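+      // The index phase above consumes the ALU operand on its own, so the condition
+      // below does not wait for masku_operand_alu_valid on VRGATHER/VCOMPRESS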
+      if (!result_queue_full && (&masku_operand_alu_valid || vinsn_issue.op inside {VID,[VRGATHER:VCOMPRESS]})
           && (&masku_operand_vd_valid || !vinsn_issue.use_vd_op)
           && (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {[VMADC:VMSBC]})) begin

@@ -1054,8 +1225,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         vrf_pnt_d = vrf_pnt_q + delta_elm_q;

         // Increment the input, input-mask, and output slice counters
-        in_ready_cnt_en   = 1'b1;
-        in_m_ready_cnt_en = 1'b1;
+        if (!(vinsn_issue.op inside {[VRGATHER:VCOMPRESS]})) in_ready_cnt_en = 1'b1;
+        if (!(vinsn_issue.op inside {[VMADC:VMSBC], VCOMPRESS})) in_m_ready_cnt_en = 1'b1;
         out_valid_cnt_en = 1'b1;

         // Account for the elements that have been processed
@@ -1065,7 +1236,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(

         // Request new input (by completing ready-valid handshake) once all slices have been processed
         // Alu input is accessed in different widths
-        if ((in_ready_cnt_q == in_ready_threshold_q) || (issue_cnt_d == '0)) begin
+        // VRGATHER and VCOMPRESS consume the ALU operand during index generation, before the MASKU ALU sees it
+        if (((in_ready_cnt_q == in_ready_threshold_q) || (issue_cnt_d == '0)) && !(vinsn_issue.op inside {[VRGATHER:VCOMPRESS]})) begin
           in_ready_cnt_clr = 1'b1;
           if (vinsn_issue.op != VID) begin
             masku_operand_alu_ready = '1;
           end
@@ -1073,7 +1245,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         // Mask is always accessed at bit level
         // VMADC, VMSBC handle masks in the mask queue
-        if ((in_m_ready_cnt_q == in_m_ready_threshold_q) || (issue_cnt_d == '0) && !(vinsn_issue.op inside {[VMADC:VMSBC]})) begin
+        // VCOMPRESS consumes the mask during index generation, before the MASKU ALU sees it
+        if (((in_m_ready_cnt_q == in_m_ready_threshold_q) || (issue_cnt_d == '0)) && !(vinsn_issue.op inside {[VMADC:VMSBC], VCOMPRESS})) begin
           in_m_ready_cnt_clr = 1'b1;
           if (!vinsn_issue.vm) begin
             masku_operand_m_ready = '1;
           end
@@ -1339,7 +1512,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
           in_m_ready_threshold_d = NrLanes*DataWidth/VcpopParallelism-1;
           out_valid_threshold_d  = '0;
         end
-        default: begin // VFIRST
+        VFIRST: begin
           // Mask to scalar
           delta_elm_d = VfirstParallelism;

@@ -1347,6 +1520,16 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
           in_m_ready_threshold_d = NrLanes*DataWidth/VfirstParallelism-1;
           out_valid_threshold_d  = '0;
         end
+        default: begin // VRGATHER, VRGATHEREI16, VCOMPRESS
+          // If VRGATHEREI16, vsew == EW16 -> shift-by-1
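+          // The indices are consumed at their own element width: the instruction's
+          // vsew for VRGATHER, but always 16 bit for VRGATHEREI16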
+          automatic logic [1:0] vrgat_eff_vsew = (pe_req_i.op == VRGATHEREI16) ? 1 : pe_req_i.vtype.vsew[1:0];
+
+          delta_elm_d = 1;
+
+          in_ready_threshold_d   = ((NrLanes*DataWidth/8) >> vrgat_eff_vsew)-1;
+          in_m_ready_threshold_d = NrLanes*DataWidth-1;
+          out_valid_threshold_d  = ((NrLanes*DataWidth/8) >> pe_req_i.vtype.vsew[1:0])-1;
+        end
       endcase

       // Reset the final grant vector
@@ -1368,45 +1551,47 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(

   always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin
-      vinsn_running_q        <= '0;
-      read_cnt_q             <= '0;
-      issue_cnt_q            <= '0;
-      processing_cnt_q       <= '0;
-      commit_cnt_q           <= '0;
-      vrf_pnt_q              <= '0;
-      mask_pnt_q             <= '0;
-      pe_resp_o              <= '0;
-      result_final_gnt_q     <= '0;
-      popcount_q             <= '0;
-      vfirst_count_q         <= '0;
-      delta_elm_q            <= '0;
-      in_ready_threshold_q   <= '0;
-      in_m_ready_threshold_q <= '0;
-      out_valid_threshold_q  <= '0;
-      viota_acc_q            <= '0;
-      found_one_q            <= '0;
-      be_viota_seq_q         <= '1; // Default: write
-      be_vrgat_seq_q         <= '1; // Default: write
+      vinsn_running_q         <= '0;
+      read_cnt_q              <= '0;
+      issue_cnt_q             <= '0;
+      processing_cnt_q        <= '0;
+      commit_cnt_q            <= '0;
+      vrf_pnt_q               <= '0;
+      mask_pnt_q              <= '0;
+      pe_resp_o               <= '0;
+      result_final_gnt_q      <= '0;
+      popcount_q              <= '0;
+      vfirst_count_q          <= '0;
+      delta_elm_q             <= '0;
+      in_ready_threshold_q    <= '0;
+      in_m_ready_threshold_q  <= '0;
+      out_valid_threshold_q   <= '0;
+      viota_acc_q             <= '0;
+      found_one_q             <= '0;
+      be_viota_seq_q          <= '1; // Default: write
+      be_vrgat_seq_q          <= '1; // Default: write
+      vrgat_addr_valid_mask_q <= '0;
     end else begin
-      vinsn_running_q        <= vinsn_running_d;
-      read_cnt_q             <= read_cnt_d;
-      issue_cnt_q            <= issue_cnt_d;
-      processing_cnt_q       <= processing_cnt_d;
-      commit_cnt_q           <= commit_cnt_d;
-      vrf_pnt_q              <= vrf_pnt_d;
-      mask_pnt_q             <= mask_pnt_d;
-      pe_resp_o              <= pe_resp;
-      result_final_gnt_q     <= result_final_gnt_d;
-      popcount_q             <= popcount_d;
-      vfirst_count_q         <= vfirst_count_d;
-      delta_elm_q            <= delta_elm_d;
-      in_ready_threshold_q   <= in_ready_threshold_d;
-      in_m_ready_threshold_q <= in_m_ready_threshold_d;
-      out_valid_threshold_q  <= out_valid_threshold_d;
-      viota_acc_q            <= viota_acc_d;
-      found_one_q            <= found_one_d;
-      be_viota_seq_q         <= be_viota_seq_d;
-      be_vrgat_seq_q         <= be_vrgat_seq_d;
+      vinsn_running_q         <= vinsn_running_d;
+      read_cnt_q              <= read_cnt_d;
+      issue_cnt_q             <= issue_cnt_d;
+      processing_cnt_q        <= processing_cnt_d;
+      commit_cnt_q            <= commit_cnt_d;
+      vrf_pnt_q               <= vrf_pnt_d;
+      mask_pnt_q              <= mask_pnt_d;
+      pe_resp_o               <= pe_resp;
+      result_final_gnt_q      <= result_final_gnt_d;
+      popcount_q              <= popcount_d;
+      vfirst_count_q          <= vfirst_count_d;
+      delta_elm_q             <= delta_elm_d;
+      in_ready_threshold_q    <= in_ready_threshold_d;
+      in_m_ready_threshold_q  <= in_m_ready_threshold_d;
+      out_valid_threshold_q   <= out_valid_threshold_d;
+      viota_acc_q             <= viota_acc_d;
+      found_one_q             <= found_one_d;
+      be_viota_seq_q          <= be_viota_seq_d;
+      be_vrgat_seq_q          <= be_vrgat_seq_d;
+      vrgat_addr_valid_mask_q <= vrgat_addr_valid_mask_d;
     end
   end
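Note for reviewers: the MASKU-to-lanes address handshake added above follows a simple
broadcast pattern. The sketch below is illustrative only (module and port names are
made up, not part of this patch): one request is broadcast to N receivers, each
receiver may accept in a different cycle, and the source consumes the request only
once every receiver has accepted it.

  module broadcast_handshake #(
    parameter int unsigned N = 4
  ) (
    input  logic         clk_i,
    input  logic         rst_ni,
    input  logic         req_valid_i, // A request is available (e.g., FIFO not empty)
    output logic         req_pop_o,   // All receivers accepted: consume the request
    output logic [N-1:0] valid_o,     // Per-receiver valid
    input  logic [N-1:0] ready_i      // Per-receiver ready
  );
    // Receivers that have already accepted the current request
    logic [N-1:0] done_d, done_q;

    always_comb begin
      done_d    = done_q;
      req_pop_o = 1'b0;
      for (int i = 0; i < N; i++) begin
        // Keep the valid asserted only towards receivers that have not accepted yet
        valid_o[i] = req_valid_i & ~done_q[i];
        // Record the acceptance on a completed handshake
        if (valid_o[i] & ready_i[i]) done_d[i] = 1'b1;
      end
      // Once every receiver has accepted, consume the request and start over
      if (&done_d) begin
        req_pop_o = 1'b1;
        done_d    = '0;
      end
    end

    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) done_q <= '0;
      else         done_q <= done_d;
    end
  endmodule

The MASKU implements the same idea with the vrgat address FIFO as the request source
and the per-lane valid mask playing the role of done_q.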