diff --git a/Bender.yml b/Bender.yml index 89ef503d6..5f0faae81 100644 --- a/Bender.yml +++ b/Bender.yml @@ -41,7 +41,7 @@ sources: - hardware/src/lane/simd_mul.sv - hardware/src/lane/vector_regfile.sv - hardware/src/lane/power_gating_generic.sv - - hardware/src/masku/masku.sv + - hardware/src/masku/masku_operands.sv - hardware/src/sldu/p2_stride_gen.sv - hardware/src/sldu/sldu_op_dp.sv - hardware/src/sldu/sldu.sv @@ -54,6 +54,7 @@ sources: - hardware/src/lane/vmfpu.sv - hardware/src/lane/fixed_p_rounding.sv - hardware/src/vlsu/vlsu.sv + - hardware/src/masku/masku.sv # Level 3 - hardware/src/lane/vector_fus_stage.sv # Level 4 diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 6063cdfe9..0aec42e05 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -588,7 +588,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010001: begin ara_req_d.op = ara_pkg::VMADC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 unique case (ara_req_d.emul) @@ -618,7 +617,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010011: begin ara_req_d.op = ara_pkg::VMSBC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 unique case (ara_req_d.emul) @@ -641,27 +639,21 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b011000: begin ara_req_d.op = ara_pkg::VMSEQ; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMSNE; - ara_req_d.use_vd_op = 1'b1; end 6'b011010: begin ara_req_d.op = ara_pkg::VMSLTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011011: begin ara_req_d.op = ara_pkg::VMSLT; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMSLEU; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMSLE; - ara_req_d.use_vd_op = 1'b1; end 6'b010111: begin ara_req_d.op = ara_pkg::VMERGE; @@ -828,7 +820,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010001: begin ara_req_d.op = ara_pkg::VMADC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 unique case (ara_req_d.emul) @@ -855,7 +846,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010011: begin ara_req_d.op = ara_pkg::VMSBC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 unique case (ara_req_d.emul) @@ -873,35 +863,27 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b011000: begin ara_req_d.op = ara_pkg::VMSEQ; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMSNE; - ara_req_d.use_vd_op = 1'b1; end 6'b011010: begin ara_req_d.op = ara_pkg::VMSLTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011011: begin ara_req_d.op = ara_pkg::VMSLT; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMSLEU; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMSLE; - ara_req_d.use_vd_op = 1'b1; end 6'b011110: begin ara_req_d.op = ara_pkg::VMSGTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011111: begin ara_req_d.op = ara_pkg::VMSGT; - ara_req_d.use_vd_op = 1'b1; end 6'b010111: begin ara_req_d.op = ara_pkg::VMERGE; @@ -1034,7 +1016,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b010001: begin ara_req_d.op = ara_pkg::VMADC; - ara_req_d.use_vd_op = 1'b1; // Check whether we can access vs1 and vs2 unique case (ara_req_d.emul) @@ -1052,27 +1033,21 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( end 6'b011000: begin ara_req_d.op = ara_pkg::VMSEQ; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMSNE; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMSLEU; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMSLE; - ara_req_d.use_vd_op = 1'b1; end 6'b011110: begin ara_req_d.op = ara_pkg::VMSGTU; - ara_req_d.use_vd_op = 1'b1; end 6'b011111: begin ara_req_d.op = ara_pkg::VMSGT; - ara_req_d.use_vd_op = 1'b1; end 6'b010111: begin ara_req_d.op = ara_pkg::VMERGE; @@ -1322,7 +1297,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011001: begin ara_req_d.op = ara_pkg::VMAND; @@ -1330,7 +1304,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011010: begin ara_req_d.op = ara_pkg::VMOR; @@ -1338,7 +1311,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011011: begin ara_req_d.op = ara_pkg::VMXOR; @@ -1346,7 +1318,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011100: begin ara_req_d.op = ara_pkg::VMORNOT; @@ -1354,7 +1325,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011101: begin ara_req_d.op = ara_pkg::VMNAND; @@ -1362,7 +1332,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011110: begin ara_req_d.op = ara_pkg::VMNOR; @@ -1370,7 +1339,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b011111: begin ara_req_d.op = ara_pkg::VMXNOR; @@ -1378,7 +1346,6 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = EW8; ara_req_d.eew_vd_op = EW8; ara_req_d.vtype.vsew = EW8; - ara_req_d.use_vd_op = 1'b1; end 6'b010010: begin // VXUNARY0 // These instructions do not use vs1 diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index fcaf85f03..5fc19ec47 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -667,7 +667,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: if ((operand_request[AluA].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs1))) * NrLanes != pe_req.vl) operand_request[AluA].vl += 1; end - operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF}); operand_request[AluB] = '{ id : pe_req.id, @@ -694,7 +694,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: if ((operand_request[AluB].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs2))) * NrLanes != pe_req.vl) operand_request[AluB].vl += 1; end - operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF, VFIRST}); operand_request[MulFPUA] = '{ id : pe_req.id, @@ -710,7 +710,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. operand_request[MulFPUA].vl = vfu_operation_d.vl; - operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]}; + operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF}); operand_request[MulFPUB] = '{ id : pe_req.id, @@ -725,24 +725,26 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // This is an operation that runs normally on the ALU, and then gets *condensed* and // reshuffled at the Mask Unit. operand_request[MulFPUB].vl = vfu_operation_d.vl; - operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]}; + operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF, VFIRST}); operand_request[MaskB] = '{ id : pe_req.id, - vs : pe_req.vd, - eew : pe_req.eew_vd_op, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, scale_vl: pe_req.scale_vl, vtype : pe_req.vtype, // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. vl : (pe_req.vl / NrLanes / ELEN) << (unsigned'(EW64) - unsigned'(pe_req.vtype.vsew)), vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vd, + hazard : (pe_req.op inside {VMSBF, VMSOF, VMSIF}) ? pe_req.hazard_vs2 : pe_req.hazard_vs2 | pe_req.hazard_vd, default : '0 }; - if (((pe_req.vl / NrLanes / ELEN) * NrLanes * ELEN) != - pe_req.vl) operand_request[MaskB].vl += 1; - operand_request_push[MaskB] = pe_req.use_vd_op; + operand_request[MaskB].vl = pe_req.vl / (NrLanes * (8 << pe_req.vtype.vsew)); + if ((pe_req.vl % (NrLanes*ELEN)) != 0) begin + operand_request[MaskB].vl += 1'b1; + end + operand_request_push[MaskB] = pe_req.use_vs2 && pe_req.op inside {VCPOP, VFIRST, VMSIF, VMSOF, VMSBF}; operand_request[MaskM] = '{ id : pe_req.id, diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index 4120b9ea1..f386a9c96 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -54,6 +54,20 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( import cf_math_pkg::idx_width; + // Pointers + // + // We need a pointer to which bit on the full VRF word we are reading mask operands from. + logic [idx_width(DataWidth*NrLanes):0] mask_pnt_d, mask_pnt_q; + // We need a pointer to which bit on the full VRF word we are writing results to. + logic [idx_width(DataWidth*NrLanes):0] vrf_pnt_d, vrf_pnt_q; + + // Remaining elements of the current instruction in the read operand phase + vlen_t read_cnt_d, read_cnt_q; + // Remaining elements of the current instruction in the issue phase + vlen_t issue_cnt_d, issue_cnt_q; + // Remaining elements of the current instruction in the commit phase + vlen_t commit_cnt_d, commit_cnt_q; + //////////////// // Operands // //////////////// @@ -61,39 +75,130 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Information about which is the target FU of the request masku_fu_e masku_operand_fu; - // ALU/FPU result - elen_t [NrLanes-1:0] masku_operand_a_i; - logic [NrLanes-1:0] masku_operand_a_valid_i; - logic [NrLanes-1:0] masku_operand_a_ready_o; + // ALU/FPU result (shuffled) + elen_t [NrLanes-1:0] masku_operand_alu; + logic [NrLanes-1:0] masku_operand_alu_valid; + logic [NrLanes-1:0] masku_operand_alu_ready; + + // ALU/FPU result (deshuffled) + logic [NrLanes*ELEN-1:0] masku_operand_alu_seq; - // Previous value of the destination vector register - elen_t [NrLanes-1:0] masku_operand_b_i; - logic [NrLanes-1:0] masku_operand_b_valid_i; - logic [NrLanes-1:0] masku_operand_b_ready_o; + // vs2 (shuffled) + elen_t [NrLanes-1:0] masku_operand_vs2; + logic [NrLanes-1:0] masku_operand_vs2_valid; + logic [NrLanes-1:0] masku_operand_vs2_ready; + + assign masku_operand_vs2_ready = 1'b0; + + // vs2 (deshuffled) + logic [NrLanes*ELEN-1:0] masku_operand_vs2_seq; + logic [ NrLanes-1:0] masku_operand_vs2_seq_valid; + logic [ NrLanes-1:0] masku_operand_vs2_seq_ready; // Mask - elen_t [NrLanes-1:0] masku_operand_m_i; - logic [NrLanes-1:0] masku_operand_m_valid_i; - logic [NrLanes-1:0] masku_operand_m_ready_o; + elen_t [NrLanes-1:0] masku_operand_m; + logic [NrLanes-1:0] masku_operand_m_valid; + logic [NrLanes-1:0] masku_operand_m_ready; + + // Mask deshuffled + logic [NrLanes*ELEN-1:0] masku_operand_m_seq; + logic [NrLanes-1:0] masku_operand_m_seq_valid; + logic [NrLanes-1:0] masku_operand_m_seq_ready; // Insn-queue related signal pe_req_t vinsn_issue; - for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_unpack_masku_operands - assign masku_operand_a_i[lane] = masku_operand_i[lane][2 + masku_operand_fu]; - assign masku_operand_a_valid_i[lane] = masku_operand_valid_i[lane][2 + masku_operand_fu]; - for (genvar operand_fu = 0; operand_fu < NrMaskFUnits; operand_fu++) begin: gen_masku_operand_ready - assign masku_operand_ready_o[lane][2 + operand_fu] = (masku_fu_e'(operand_fu) == masku_operand_fu) && masku_operand_a_ready_o[lane]; - end: gen_masku_operand_ready + logic [NrLanes*ELEN-1:0] bit_enable_mask; + logic [NrLanes*ELEN-1:0] bit_enable_shuffle; + logic [NrLanes*ELEN-1:0] alu_result_compressed; + + // Performs all shuffling and deshuffling of mask operands (including masks for mask instructions) + // Furthermore, it buffers certain operands that would create long critical paths + masku_operands #( + .NrLanes ( NrLanes ) + ) i_masku_operands ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + // Control logic + .masku_fu_i ( masku_operand_fu ), + .vinsn_issue_i ( vinsn_issue ), + .vrf_pnt_i ( vrf_pnt_q ), + // Operands coming from lanes + .masku_operand_valid_i ( masku_operand_valid_i ), + .masku_operand_ready_o ( masku_operand_ready_o ), + .masku_operands_i ( masku_operand_i ), + // Operands prepared for mask unit execution + .masku_operand_alu_o ( masku_operand_alu ), + .masku_operand_alu_valid_o ( masku_operand_alu_valid ), + .masku_operand_alu_ready_i ( masku_operand_alu_ready ), + .masku_operand_alu_seq_o ( masku_operand_alu_seq ), + .masku_operand_alu_seq_valid_o ( ), + .masku_operand_alu_seq_ready_i ( ), + .masku_operand_vs2_o ( masku_operand_vs2 ), + .masku_operand_vs2_valid_o ( masku_operand_vs2_valid ), + .masku_operand_vs2_ready_i ( masku_operand_vs2_ready ), + .masku_operand_vs2_seq_o ( masku_operand_vs2_seq ), + .masku_operand_vs2_seq_valid_o ( masku_operand_vs2_seq_valid ), + .masku_operand_vs2_seq_ready_i ( masku_operand_vs2_seq_ready ), + .masku_operand_m_o ( masku_operand_m ), + .masku_operand_m_valid_o ( masku_operand_m_valid ), + .masku_operand_m_ready_i ( masku_operand_m_ready ), + .masku_operand_m_seq_o ( masku_operand_m_seq ), + .masku_operand_m_seq_valid_o ( ), + .masku_operand_m_seq_ready_i ( ), + .bit_enable_mask_o ( bit_enable_mask ), + .shuffled_vl_bit_mask_o ( bit_enable_shuffle ), + .alu_result_compressed_o ( alu_result_compressed ) + ); + + + // Local Parameter W_CPOP and W_VFIRST + // + // Description: Parameters W_CPOP and W_VFIRST enable time multiplexing of vcpop.m and vfirst.m instruction. + // + // Legal range W_CPOP: {64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 + // Legal range W_VFIRST: {64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 + // + // Execution time example for vcpop.m (similar for vfirst.m): + // W_CPOP = 64; VLEN = 1024; vl = 1024 + // t_vcpop.m = VLEN/W_CPOP = 8 [Cycles] + localparam int W_CPOP = 64; + localparam int W_VFIRST = 64; + // derived parameters + localparam int MAX_W_CPOP_VFIRST = (W_CPOP > W_VFIRST) ? W_CPOP : W_VFIRST; + localparam int N_SLICES_CPOP = NrLanes * DataWidth / W_CPOP; + localparam int N_SLICES_VFIRST = NrLanes * DataWidth / W_VFIRST; + // Check if parameters are within range + if (((W_CPOP & (W_CPOP - 1)) != 0) || (W_CPOP < 64)) begin + $fatal(1, "Parameter W_CPOP must be power of 2."); + end else if (((W_VFIRST & (W_VFIRST - 1)) != 0) || (W_VFIRST < 64)) begin + $fatal(1, "Parameter W_VFIRST must be power of 2."); + end + + // VFIRST and VCPOP Signals + logic [NrLanes*ELEN-1:0] vcpop_operand; + logic [$clog2(W_VFIRST):0] popcount; + logic [$clog2(VLEN):0] popcount_d, popcount_q; + logic [$clog2(W_VFIRST)-1:0] vfirst_count; + logic [$clog2(VLEN)-1:0] vfirst_count_d, vfirst_count_q; + logic vfirst_empty; + logic [NrLanes-1:0] vcpop_vfirst_vs2_ready; + // counter to keep track of how many slices of the vcpop_operand have been processed + logic [$clog2(MAX_W_CPOP_VFIRST):0] vcpop_slice_cnt_d, vcpop_slice_cnt_q; + logic [W_CPOP-1:0] vcpop_slice; + logic [W_VFIRST-1:0] vfirst_slice; - assign masku_operand_b_i[lane] = masku_operand_i[lane][1]; - assign masku_operand_b_valid_i[lane] = (vinsn_issue.op inside {[VMSBF:VID]}) ? '1 : masku_operand_valid_i[lane][1]; - assign masku_operand_ready_o[lane][1] = masku_operand_b_ready_o[lane]; + // vmsbf, vmsif, vmsof, viota, vid, vcpop, vfirst variables + logic [NrLanes*DataWidth-1:0] alu_result_f, alu_result_ff; + logic [NrLanes*DataWidth-1:0] masku_operand_alu_seq_m, masku_operand_alu_seq_f, masku_operand_alu_seq_ff; + logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_seq; + logic [NrLanes*DataWidth-1:0] alu_src_idx, alu_src_idx_m; + logic [ 13:0] iteration_count_d, iteration_count_q; + logic not_found_one_d, not_found_one_q; + logic [ NrLanes-1:0] vmsif_vmsof_vmsbf_vs2_ready; - assign masku_operand_m_i[lane] = masku_operand_i[lane][0]; - assign masku_operand_m_valid_i[lane] = masku_operand_valid_i[lane][0]; - assign masku_operand_ready_o[lane][0] = masku_operand_m_ready_o[lane]; - end: gen_unpack_masku_operands + // Control flow for mask operands + assign masku_operand_vs2_seq_ready = vcpop_vfirst_vs2_ready | vmsif_vmsof_vmsbf_vs2_ready; //////////////////////////////// // Vector instruction queue // @@ -221,16 +326,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic result_queue_empty; assign result_queue_empty = (result_queue_cnt_q == '0); - // vmsbf, vmsif, vmsof, viota, vid, vcpop, vfirst variables - logic [NrLanes*DataWidth-1:0] alu_result_f, alu_result_ff; - logic [NrLanes*DataWidth-1:0] alu_operand_a, alu_operand_a_seq, alu_operand_a_seq_f; - logic [NrLanes*DataWidth-1:0] alu_operand_b, alu_operand_b_seq, alu_operand_b_seq_m, alu_operand_b_seq_f, alu_operand_b_seq_ff; - logic [NrLanes*DataWidth-1:0] alu_result_vm, alu_result_vm_m, alu_result_vm_seq; - logic [NrLanes*DataWidth-1:0] masku_operand_vd; - logic [NrLanes*DataWidth-1:0] alu_src_idx, alu_src_idx_m; - logic [4:0] iteration_count_d, iteration_count_q; - logic not_found_one_d, not_found_one_q; - always_ff @(posedge clk_i or negedge rst_ni) begin: p_result_queue_ff if (!rst_ni) begin result_queue_q <= '0; @@ -242,8 +337,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( alu_result_f <= '0; alu_result_ff <= '0; not_found_one_q <= 1'b1; - alu_operand_b_seq_f <= '0; - alu_operand_b_seq_ff <= '0; + masku_operand_alu_seq_f <= '0; + masku_operand_alu_seq_ff <= '0; iteration_count_q <= '0; end else begin result_queue_q <= result_queue_d; @@ -255,15 +350,15 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( alu_result_f <= (pe_req_ready_o) ? '0 : (!vinsn_issue.vm) ? alu_result_vm : alu_result_vm_seq; alu_result_ff <= alu_result_f; not_found_one_q <= not_found_one_d; - alu_operand_b_seq_f <= (pe_req_ready_o) ? '0 : alu_operand_b_seq_m; - alu_operand_b_seq_ff <= alu_operand_b_seq_f; + masku_operand_alu_seq_f <= (pe_req_ready_o) ? '0 : masku_operand_alu_seq_m; + masku_operand_alu_seq_ff <= masku_operand_alu_seq_f; iteration_count_q <= iteration_count_d; end end // iteration count for masked instrctions always_comb begin - if (vinsn_issue_valid && &masku_operand_a_valid_i) begin + if (vinsn_issue_valid && (&masku_operand_alu_valid || &masku_operand_vs2_seq_valid)) begin iteration_count_d = iteration_count_q + 1'b1; end else begin iteration_count_d = iteration_count_q; @@ -294,60 +389,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Mask ALU // //////////////// - // Local Parameter W_CPOP and W_VFIRST - // - // Description: Parameters W_CPOP and W_VFIRST enable time multiplexing of vcpop.m and vfirst.m instruction. - // - // Legal range W_CPOP: {64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 - // Legal range W_VFIRST: {64, 128, ... , DataWidth*NrLanes} // DataWidth = 64 - // - // Execution time example for vcpop.m (similar for vfirst.m): - // W_CPOP = 64; VLEN = 1024; vl = 1024 - // t_vcpop.m = VLEN/W_CPOP = 8 [Cycles] - localparam int W_CPOP = 64; - localparam int W_VFIRST = 64; - // derived parameters - localparam int MAX_W_CPOP_VFIRST = (W_CPOP > W_VFIRST) ? W_CPOP : W_VFIRST; - localparam int N_SLICES_CPOP = NrLanes * DataWidth / W_CPOP; - localparam int N_SLICES_VFIRST = NrLanes * DataWidth / W_VFIRST; - // Check if parameters are within range - if (((W_CPOP & (W_CPOP - 1)) != 0) || (W_CPOP < 64)) begin - $fatal(1, "Parameter W_CPOP must be power of 2."); - end else if (((W_VFIRST & (W_VFIRST - 1)) != 0) || (W_VFIRST < 64)) begin - $fatal(1, "Parameter W_VFIRST must be power of 2."); - end - elen_t [NrLanes-1:0] alu_result; - logic [NrLanes*ELEN-1:0] bit_enable; - logic [NrLanes*ELEN-1:0] bit_enable_shuffle; - logic [NrLanes*ELEN-1:0] bit_enable_mask; - rvv_pkg::vew_e bit_enable_shuffle_eew; logic [NrLanes*ELEN-1:0] mask; - logic [NrLanes*ELEN-1:0] vcpop_operand; - logic [$clog2(W_VFIRST):0] popcount; - logic [$clog2(VLEN):0] popcount_d, popcount_q; - logic [$clog2(W_VFIRST)-1:0] vfirst_count; - logic [$clog2(VLEN)-1:0] vfirst_count_d, vfirst_count_q; - logic vfirst_empty; - - // Pointers - // - // We need a pointer to which bit on the full VRF word we are reading mask operands from. - logic [idx_width(DataWidth*NrLanes):0] mask_pnt_d, mask_pnt_q; - // We need a pointer to which bit on the full VRF word we are writing results to. - logic [idx_width(DataWidth*NrLanes):0] vrf_pnt_d, vrf_pnt_q; - - // Remaining elements of the current instruction in the read operand phase - vlen_t read_cnt_d, read_cnt_q; - // Remaining elements of the current instruction in the issue phase - vlen_t issue_cnt_d, issue_cnt_q; - // Remaining elements of the current instruction in the commit phase - vlen_t commit_cnt_d, commit_cnt_q; - - // counter to keep track of how many slices of the vcpop_operand have been processed - logic [$clog2(MAX_W_CPOP_VFIRST):0] vcpop_slice_cnt_d, vcpop_slice_cnt_q; - logic [W_CPOP-1:0] vcpop_slice; - logic [W_VFIRST-1:0] vfirst_slice; // keep track if first 1 mask element was found logic vfirst_found; @@ -376,68 +419,20 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( always_comb begin: p_mask_alu alu_result = '0; - bit_enable = '0; - bit_enable_shuffle = '0; - bit_enable_mask = '0; not_found_one_d = pe_req_ready_o ? 1'b1 : not_found_one_q; alu_result_vm = '0; alu_result_vm_m = '0; alu_result_vm_seq = '0; - alu_operand_b_seq = '0; - alu_operand_b_seq_m = '0; + masku_operand_alu_seq_m = '0; mask = '0; - masku_operand_vd = '0; vcpop_operand = '0; - // Comparisons work on vtype.vsew from VALU or VMFPU - bit_enable_shuffle_eew = vinsn_issue.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]} - ? vinsn_issue.vtype.vsew - : vinsn_issue.eew_vd_op; - if (vinsn_issue_valid) begin - // Calculate bit enable - // The result can be taken either from the result of an operation (mask_operand_a_i), or - // from the previous value of the destination register (mask_operand_b_i). Byte strobes - // do not work here, since this has to be done at a bit granularity. Therefore, the Mask Unit - // received both operands, and does a masking depending on the value of the vl. - if (vinsn_issue.vl >= ELEN*NrLanes) - bit_enable = '1; - else begin - bit_enable[vinsn_issue.vl] = 1'b1; - bit_enable = bit_enable - 1; - end - - // Shuffle the bit enable signal - for (int b = 0; b < NrLanes*StrbWidth; b++) begin - automatic int vrf_byte = shuffle_index(b, NrLanes, bit_enable_shuffle_eew); - bit_enable_shuffle[8*vrf_byte +: 8] = bit_enable[8*b +: 8]; - - // Take the mask into account - if (!vinsn_issue.vm) begin - automatic int mask_byte = shuffle_index(b, NrLanes, vinsn_issue.eew_vmask); - automatic int mask_byte_lane = mask_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int mask_byte_offset = mask_byte[idx_width(StrbWidth)-1:0]; - bit_enable_mask[8*vrf_byte +: 8] = bit_enable_shuffle[8*vrf_byte +: 8] & - masku_operand_m_i[mask_byte_lane][8*mask_byte_offset +: 8]; - end else begin - bit_enable_mask[8*vrf_byte +: 8] = bit_enable_shuffle[8*vrf_byte +: 8]; - end - end - - alu_operand_a = masku_operand_a_i; - alu_operand_b = masku_operand_b_i; - - // Deshuffle the operands for the mask instructions - for (int b = 0; b < (NrLanes*StrbWidth); b++) begin - automatic int deshuffle_byte = deshuffle_index(b, NrLanes, vinsn_issue.vtype.vsew); - alu_operand_b_seq[8*deshuffle_byte +: 8] = alu_operand_a[8*b +: 8]; - masku_operand_vd [8*deshuffle_byte +: 8] = alu_operand_b[8*b +: 8]; - end // Mask generation unique case (vinsn_issue.op) inside [VMSBF:VID] : - if (&masku_operand_a_valid_i) begin + if (&masku_operand_alu_valid) begin unique case (vinsn_issue.vtype.vsew) EW8 : for (int i = 0; i < (DataWidth * NrLanes)/8; i++) mask [(i*8) +: 8] = {8{bit_enable_mask [i+(((DataWidth * NrLanes)/8)*(iteration_count_d-1))]}}; @@ -456,156 +451,77 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Evaluate the instruction unique case (vinsn_issue.op) inside - [VMANDNOT:VMXNOR]: alu_result = (masku_operand_a_i & bit_enable_mask) | - (masku_operand_b_i & ~bit_enable_mask); - [VMFEQ:VMSGTU], [VMSGT:VMSBC] : begin - automatic logic [ELEN*NrLanes-1:0] alu_result_flat = '0; - - unique case (vinsn_issue.vtype.vsew) - EW8: for (int b = 0; b < 8*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(1*b, NrLanes, EW8); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW8); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? - masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - EW16: for (int b = 0; b < 4*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(2*b, NrLanes, EW16); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW16); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? - masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - EW32: for (int b = 0; b < 2*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(4*b, NrLanes, EW32); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW32); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? - masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - EW64: for (int b = 0; b < 1*NrLanes; b++) begin - // Shuffle the source byte, then find the lane and the offset of this byte in the - // full operand word. - automatic int src_byte = shuffle_index(8*b, NrLanes, EW64); - automatic int src_byte_lane = src_byte[idx_width(StrbWidth) +: idx_width(NrLanes)]; - automatic int src_byte_offset = src_byte[idx_width(StrbWidth)-1:0]; - - // Find the destination byte - automatic int dest_bit_seq = b + vrf_pnt_q; - automatic int dest_byte_seq = dest_bit_seq / StrbWidth; - automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, EW64); - - alu_result_flat[StrbWidth*dest_byte + dest_bit_seq[idx_width(StrbWidth)-1:0]] = - (!vinsn_issue.vm && !masku_operand_a_i[src_byte_lane][8*src_byte_offset+1]) ? - masku_operand_b_i[src_byte_lane][8*src_byte_offset] : - masku_operand_a_i[src_byte_lane][8*src_byte_offset]; - end - default:; - endcase - - // Final assignment - alu_result = (alu_result_flat & bit_enable_shuffle) | - (masku_operand_b_i & ~bit_enable_shuffle); - end + [VMANDNOT:VMXNOR]: alu_result = (masku_operand_alu) | (~bit_enable_shuffle); + [VMFEQ:VMSGTU], [VMSGT:VMSBC]: alu_result = (alu_result_compressed & bit_enable_mask) | (~bit_enable_shuffle); [VMSBF:VMSIF] : begin - if (&masku_operand_a_valid_i) begin - for (int i = 0; i < NrLanes * DataWidth; i++) begin - if (alu_operand_b_seq[i] == 1'b0) begin - alu_result_vm[i] = (vinsn_issue.op == VMSOF) ? 1'b0 : not_found_one_d; - end else begin - not_found_one_d = 1'b0; - alu_result_vm[i] = (vinsn_issue.op == VMSBF) ? not_found_one_d : 1'b1; - break; - end + if (&masku_operand_vs2_seq_valid && (&masku_operand_m_valid || vinsn_issue.vm)) begin + for (int i = 0; i < NrLanes * DataWidth; i++) begin + if (masku_operand_vs2_seq[i] == 1'b0) begin + alu_result_vm[i] = (vinsn_issue.op == VMSOF) ? 1'b0 : not_found_one_d; + end else begin + not_found_one_d = 1'b0; + alu_result_vm[i] = (vinsn_issue.op == VMSBF) ? not_found_one_d : 1'b1; + break; end - alu_result_vm_m = (!vinsn_issue.vm) ? alu_result_vm & bit_enable_mask : alu_result_vm; + end + alu_result_vm_m = (!vinsn_issue.vm) ? alu_result_vm & bit_enable_mask : alu_result_vm; end else begin alu_result_vm = '0; end end VIOTA: begin - if (&masku_operand_a_valid_i) begin - alu_operand_b_seq_m = alu_operand_b_seq & bit_enable_mask; + if (&masku_operand_alu_valid) begin + masku_operand_alu_seq_m = masku_operand_alu_seq & bit_enable_mask; unique case (vinsn_issue.vtype.vsew) EW8 : begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [7:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:8] + alu_result_ff [(NrLanes*DataWidth)-1-:8]; + alu_result_vm [7:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:8] + alu_result_ff [(NrLanes*DataWidth)-1-:8]; end else begin alu_result_vm [7:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/8; index++) begin - alu_result_vm [(index*8) +: 7] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*8) +: 7]; - alu_result_vm_m [(index*8) +: 7] = (|mask[(index*8) +: 7]) ? alu_result_vm [(index*8) +: 7] : masku_operand_vd [(index*8) +: 7]; + alu_result_vm [(index*8) +: 7] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*8) +: 7]; + alu_result_vm_m [(index*8) +: 7] = alu_result_vm [(index*8) +: 7]; end end EW16: begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [15:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:16] + alu_result_ff [(NrLanes*DataWidth)-1-:16]; + alu_result_vm [15:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:16] + alu_result_ff [(NrLanes*DataWidth)-1-:16]; end else begin alu_result_vm [15:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/16; index++) begin - alu_result_vm [(index*16) +: 15] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*16) +: 15]; - alu_result_vm_m [(index*16) +: 15] = (|mask[(index*16) +: 15]) ? alu_result_vm [(index*16) +: 15] : masku_operand_vd [(index*16) +: 15]; + alu_result_vm [(index*16) +: 15] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*16) +: 15]; + alu_result_vm_m [(index*16) +: 15] = alu_result_vm [(index*16) +: 15]; end end EW32: begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [31:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:32] + alu_result_ff [(NrLanes*DataWidth)-1-:32]; + alu_result_vm [31:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:32] + alu_result_ff [(NrLanes*DataWidth)-1-:32]; end else begin alu_result_vm [31:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/32; index++) begin - alu_result_vm [(index*32) +: 31] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*32) +: 31]; - alu_result_vm_m [(index*32) +: 31] = (|mask[(index*32) +: 31]) ? alu_result_vm [(index*32) +: 31] : masku_operand_vd [(index*32) +: 31]; + alu_result_vm [(index*32) +: 31] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*32) +: 31]; + alu_result_vm_m [(index*32) +: 31] = alu_result_vm [(index*32) +: 31]; end end EW64: begin if (issue_cnt_q < vinsn_issue.vl) begin - alu_result_vm [63:0] = alu_operand_b_seq_ff [(NrLanes*DataWidth)-1-:64] + alu_result_ff [(NrLanes*DataWidth)-1-:64]; + alu_result_vm [63:0] = masku_operand_alu_seq_ff [(NrLanes*DataWidth)-1-:64] + alu_result_ff [(NrLanes*DataWidth)-1-:64]; end else begin alu_result_vm [63:0] = '0; end for (int index = 1; index < (NrLanes*DataWidth)/64; index++) begin - alu_result_vm [(index*64) +: 63] = alu_operand_b_seq_m [index-1] + alu_result_vm [((index-1)*64) +: 63]; - alu_result_vm_m [(index*64) +: 63] = (|mask[(index*64) +: 63]) ? alu_result_vm [(index*64) +: 63] : masku_operand_vd [(index*64) +: 63]; + alu_result_vm [(index*64) +: 63] = masku_operand_alu_seq_m [index-1] + alu_result_vm [((index-1)*64) +: 63]; + alu_result_vm_m [(index*64) +: 63] = alu_result_vm [(index*64) +: 63]; end end endcase end end VID: begin - if (&masku_operand_a_valid_i) begin + if (&masku_operand_alu_valid) begin unique case (vinsn_issue.vtype.vsew) EW8 : begin for (int index = 1; index < (NrLanes*DataWidth)/8; index++) begin @@ -635,7 +551,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( end end [VCPOP:VFIRST] : begin - vcpop_operand = (!vinsn_issue.vm) ? masku_operand_a_i & bit_enable_mask : masku_operand_a_i; + vcpop_operand = (!vinsn_issue.vm) ? masku_operand_vs2_seq & bit_enable_mask : masku_operand_vs2_seq; end default: begin alu_result = '0; @@ -678,9 +594,16 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic last_incoming_a; logic unbalanced_a; + // Control signals for better code-readability (this signals goes high if a result is valid and can be pushed to the result_queue) + logic vreg_wb_valid; + // Information about which is the target FU of the request assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? MaskFUMFpu : MaskFUAlu; + // Byte enable for the result queue + logic [NrLanes*ELENB-1:0] result_queue_be_seq; + logic [NrLanes*ELENB-1:0] result_queue_be; + always_comb begin: p_masku // Maintain state vinsn_queue_d = vinsn_queue_q; @@ -716,9 +639,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // We are not ready, by default pe_resp = '0; - masku_operand_a_ready_o = '0; - masku_operand_b_ready_o = '0; - masku_operand_m_ready_o = '0; + masku_operand_alu_ready = '0; + masku_operand_m_ready = '0; // Inform the main sequencer if we are idle pe_req_ready_o = !vinsn_queue_full; @@ -745,7 +667,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( if (vinsn_issue_valid && !(vd_scalar(vinsn_issue.op))) begin // Is there place in the mask queue to write the mask operands? // Did we receive the mask bits on the MaskM channel? - if (!vinsn_issue.vm && !mask_queue_full && &masku_operand_m_valid_i) begin + if (!vinsn_issue.vm && !mask_queue_full && &masku_operand_m_valid && !(vinsn_issue.op inside {VMSBF, VMSOF, VMSIF})) begin // Copy data from the mask operands into the mask queue for (int vrf_seq_byte = 0; vrf_seq_byte < NrLanes*StrbWidth; vrf_seq_byte++) begin // Map vrf_seq_byte to the corresponding byte in the VRF word. @@ -777,7 +699,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Copy the mask operand mask_queue_d[mask_queue_write_pnt_q][vrf_lane][vrf_offset] = - masku_operand_m_i[mask_lane][mask_offset]; + masku_operand_m[mask_lane][mask_offset]; end // Account for the used operands @@ -806,7 +728,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Consumed all valid bytes from the lane operands if (mask_pnt_d == NrLanes*64 || read_cnt_d == '0) begin // Request another beat - masku_operand_m_ready_o = '1; + masku_operand_m_ready = '1; // Reset the pointer mask_pnt_d = '0; end @@ -817,33 +739,37 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Calculate scalar results // ////////////////////////////// + vcpop_vfirst_vs2_ready = 1'b0; + // Is there an instruction ready to be issued? if (vinsn_issue_valid && vd_scalar(vinsn_issue.op)) begin - if (&(masku_operand_a_valid_i | fake_a_valid) && (&masku_operand_m_valid_i || vinsn_issue.vm)) begin + if (&(masku_operand_vs2_seq_valid | fake_a_valid) && (&masku_operand_m_valid || vinsn_issue.vm)) begin // increment slice counter vcpop_slice_cnt_d = vcpop_slice_cnt_q + 1'b1; // request new operand (by completing ready-valid handshake) once all slices have been processed - masku_operand_a_ready_o = 1'b0; + vcpop_vfirst_vs2_ready = 1'b0; if (((vcpop_slice_cnt_q == N_SLICES_CPOP - 1) && vinsn_issue.op == VCPOP) || ((vcpop_slice_cnt_q == N_SLICES_VFIRST-1) && vinsn_issue.op == VFIRST)) begin vcpop_slice_cnt_d = '0; - masku_operand_a_ready_o = masku_operand_a_valid_i; - if (!vinsn_issue.vm) begin - masku_operand_m_ready_o = '1; + vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; + if (!vinsn_issue.vm) begin + masku_operand_m_ready = '1; end end // Account for the elements that were processed - issue_cnt_d = issue_cnt_q - (W_CPOP/(8 << vinsn_issue.vtype.vsew)); + issue_cnt_d = issue_cnt_q - W_CPOP; // abruptly stop processing elements if vl is reached - if (iteration_count_d >= (vinsn_issue.vl/W_CPOP) || (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin + if (iteration_count_d >= (vinsn_issue.vl/(W_CPOP)) || (!vfirst_empty && (vinsn_issue.op == VFIRST))) begin issue_cnt_d = '0; - masku_operand_a_ready_o = masku_operand_a_valid_i; - if (!vinsn_issue.vm) begin - masku_operand_m_ready_o = '1; + commit_cnt_d = '0; + read_cnt_d ='0; + vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; + if (!vinsn_issue.vm) begin + masku_operand_m_ready = '1; end end @@ -865,9 +791,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( vcpop_slice_cnt_d = '0; // acknowledge operand a - masku_operand_a_ready_o = masku_operand_a_valid_i; - if (!vinsn_issue.vm) begin - masku_operand_m_ready_o = '1; + vcpop_vfirst_vs2_ready = masku_operand_vs2_seq_valid; + if (!vinsn_issue.vm) begin + masku_operand_m_ready = '1; end end end @@ -877,14 +803,17 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Write results to the lanes // ////////////////////////////////// + result_queue_be = '1; + result_queue_be_seq = '1; + vmsif_vmsof_vmsbf_vs2_ready = '0; + // Is there an instruction ready to be issued? if (vinsn_issue_valid && !vd_scalar(vinsn_issue.op)) begin // This instruction executes on the Mask Unit if (vinsn_issue.vfu == VFU_MaskUnit) begin // Is there place in the result queue to write the results? // Did we receive the operands? - if (!result_queue_full && &(masku_operand_a_valid_i | fake_a_valid) && - (!vinsn_issue.use_vd_op || &masku_operand_b_valid_i)) begin + if (!result_queue_full && (&(masku_operand_alu_valid | fake_a_valid | masku_operand_vs2_seq_valid))) begin // How many elements are we committing in total? // Since we are committing bits instead of bytes, we carry out the following calculation // with ceil(vl/8) instead. @@ -899,7 +828,37 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the operands of this instruction. // At this stage, acknowledge only the first operand, "a", coming from the ALU/VMFpu. - masku_operand_a_ready_o = masku_operand_a_valid_i; + masku_operand_alu_ready = masku_operand_alu_valid; + vmsif_vmsof_vmsbf_vs2_ready = (&masku_operand_m_valid || vinsn_issue.vm) ? '1 : '0; + + if (!vinsn_issue.vm) begin + unique case (vinsn_issue.vtype.vsew) + EW8 : result_queue_be_seq = masku_operand_m_seq[NrLanes*ELENB-1:0]; + EW16: begin + for (int i = 0; i < NrLanes * ELENB / 2; i++) begin + result_queue_be_seq[2*i +: 2] = {2{bit_enable_mask[i]}}; + end + end + EW32: begin + for (int i = 0; i < NrLanes * ELENB / 4; i++) begin + result_queue_be_seq[4*i +: 4] = {4{bit_enable_mask[i]}}; + end + end + EW64: begin + for (int i = 0; i < NrLanes * ELENB / 8; i++) begin + result_queue_be_seq[8*i +: 8] = {8{bit_enable_mask[i]}}; + end + end + default: ; // Not sure what should be the default + endcase + for (int i = 0; i < NrLanes*ELENB; i++) begin + result_queue_be[shuffle_index(i, NrLanes, vinsn_issue.vtype.vsew)] = result_queue_be_seq[i]; + end + end + + if (vinsn_issue.op inside {[VMSBF: VMSIF], VID}) begin + result_queue_be = '1; + end // Store the result in the operand queue for (int unsigned lane = 0; lane < NrLanes; lane++) begin @@ -910,8 +869,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_d[result_queue_write_pnt_q][lane] = '{ wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata | alu_result[lane], - be : (vinsn_issue.op inside {[VMSBF:VID]}) ? '1 : be(element_cnt, vinsn_issue.vtype.vsew), - addr : (vinsn_issue.op inside {[VMSBF:VID]}) ? vaddr(vinsn_issue.vd, NrLanes, VLEN) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes, VLEN) + + be : (vinsn_issue.op inside {[VMSBF:VID]}) ? result_queue_be[lane*ELENB +: ELENB] : be(element_cnt, vinsn_issue.vtype.vsew), + addr : (vinsn_issue.op inside {[VIOTA:VID]}) ? vaddr(vinsn_issue.vd, NrLanes, VLEN) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes, VLEN) + (((vinsn_issue.vl - issue_cnt_q) / NrLanes / DataWidth)), id : vinsn_issue.id }; @@ -925,9 +884,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( if (vrf_pnt_d == DataWidth*NrLanes || vrf_pnt_d >= issue_cnt_q) begin result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - // Acknowledge the rest of the operands, which are accessed bit by bit. - masku_operand_b_ready_o = masku_operand_b_valid_i; - // Reset VRF pointer vrf_pnt_d = '0; @@ -944,33 +900,34 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( issue_cnt_d = '0; end end else if (vinsn_issue.op inside {[VMSBF:VID]}) begin - result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - - // Acknowledge the previous value of the destination vector register. - masku_operand_b_ready_o = masku_operand_b_valid_i; + if (&masku_operand_m_valid || vinsn_issue.vm || vinsn_issue.op inside {VIOTA, VID}) begin + result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - // Increment result queue pointers and counters - result_queue_cnt_d += 1; - if (result_queue_write_pnt_q == ResultQueueDepth-1) - result_queue_write_pnt_d = '0; - else - result_queue_write_pnt_d = result_queue_write_pnt_q + 1; + // Increment result queue pointers and counters + result_queue_cnt_d += 1; + if (result_queue_write_pnt_q == ResultQueueDepth-1) + result_queue_write_pnt_d = '0; + else + result_queue_write_pnt_d = result_queue_write_pnt_q + 1; - if (result_queue_read_pnt_q == ResultQueueDepth-1) - result_queue_read_pnt_d = '0; - else - result_queue_read_pnt_d = result_queue_read_pnt_m; + if (result_queue_read_pnt_q == ResultQueueDepth-1) + result_queue_read_pnt_d = '0; + else + result_queue_read_pnt_d = result_queue_read_pnt_m; - // Account for the results that were issued - issue_cnt_d = issue_cnt_q - (1 << (int'(EW64) - vinsn_issue.vtype.vsew)); - if ((vinsn_issue.vl-issue_cnt_d)*4 >= vinsn_issue.vl) - issue_cnt_d = '0; + // Account for the results that were issued + if (vinsn_issue.op inside {VIOTA, VID}) begin + issue_cnt_d = issue_cnt_q - (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew)); + if ((vinsn_issue.vl-issue_cnt_d) >= vinsn_issue.vl) + issue_cnt_d = '0; + end else begin + issue_cnt_d = issue_cnt_q - NrLanes * DataWidth; + if ((vinsn_issue.vl-issue_cnt_d) >= vinsn_issue.vl) + issue_cnt_d = '0; + end + end end else begin result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; - - // Acknowledge the previous value of the destination vector register. - masku_operand_b_ready_o = masku_operand_b_valid_i; - // Increment result queue pointers and counters result_queue_cnt_d += 1; if (result_queue_write_pnt_q == ResultQueueDepth-1) @@ -990,13 +947,17 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( /////////////////////////// //// Masked Instruction /// /////////////////////////// - if (vinsn_commit_valid && vinsn_commit.op inside {[VMSBF:VID]}) begin - if (&masku_operand_a_valid_i && (&masku_operand_m_valid_i || vinsn_issue.vm)) begin - // if this is the last beat, commit the result to the scalar_result queue - commit_cnt_d = commit_cnt_q - (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); - if ((vinsn_commit.vl-commit_cnt_d)*4 >= vinsn_commit.vl) begin - commit_cnt_d = '0; - end + if ((|masku_operand_alu_valid && !result_queue_full) && (&masku_operand_m_valid || vinsn_issue.vm) && vinsn_commit_valid && vinsn_commit.op inside {[VIOTA:VID]}) begin + // if this is the last beat, commit the result to the scalar_result queue + commit_cnt_d = commit_cnt_q - (NrLanes << (int'(EW64) - vinsn_commit.vtype.vsew)); + if ((vinsn_commit.vl-commit_cnt_d) >= vinsn_commit.vl) begin + commit_cnt_d = '0; + end + end + if ((&masku_operand_alu_valid || &masku_operand_vs2_seq_valid) && (&masku_operand_m_valid || vinsn_issue.vm) && vinsn_commit_valid && vinsn_commit.op inside {VMSBF, VMSOF, VMSIF}) begin + commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; + if ((vinsn_commit.vl-commit_cnt_d) >= vinsn_commit.vl) begin + commit_cnt_d = '0; end end @@ -1050,9 +1011,11 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( mask_queue_cnt_d -= 1; // Decrement the counter of remaining vector elements waiting to be used - commit_cnt_d = commit_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); - if (commit_cnt_q < (NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)))) - commit_cnt_d = '0; + if (vldu_mask_ready_i || vstu_mask_ready_i || sldu_mask_ready_i || vinsn_issue.vm || (vinsn_issue.vfu != VFU_MaskUnit)) begin + commit_cnt_d = commit_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)); + if (commit_cnt_q < (NrLanes * (1 << (int'(EW64) - vinsn_commit.vtype.vsew)))) + commit_cnt_d = '0; + end end ////////////////////////////////// @@ -1097,9 +1060,11 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_d[result_queue_read_pnt_q] = '0; // Decrement the counter of remaining vector elements waiting to be written - commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; - if (commit_cnt_q < (NrLanes * DataWidth)) - commit_cnt_d = '0; + if (!(vinsn_issue.op inside {VID, VSE})) begin + commit_cnt_d = commit_cnt_q - NrLanes * DataWidth; + if (commit_cnt_q < (NrLanes * DataWidth)) + commit_cnt_d = '0; + end end /////////////////////////// diff --git a/hardware/src/masku/masku_operands.sv b/hardware/src/masku/masku_operands.sv new file mode 100644 index 000000000..86b5e6988 --- /dev/null +++ b/hardware/src/masku/masku_operands.sv @@ -0,0 +1,233 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Mask Unit Operands Module +// +// Author: Moritz Imfeld +// +// +// Description: +// Module takes operands coming from the lanes and then unpacks and prepares them +// for mask instruction execution. +// +// +// Incoming Operands: +// masku_operands_i = {v0.m, vs2, alu_result, fpu_result} +// + +module masku_operands import ara_pkg::*; import rvv_pkg::*; #( + parameter int unsigned NrLanes = 0 + ) ( + input logic clk_i, + input logic rst_ni, + + // Control logic + input masku_fu_e masku_fu_i, // signal deciding from which functional unit the result should be taken from + input pe_req_t vinsn_issue_i, + input logic [idx_width(ELEN*NrLanes):0] vrf_pnt_i, + + // Operands and operand handshake signals coming from lanes + input logic [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand_valid_i, + output logic [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand_ready_o, + input elen_t [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operands_i, + + // Operands prepared for masku execution + output elen_t [ NrLanes-1:0] masku_operand_alu_o, // ALU/FPU result (shuffled, uncompressed) + output logic [ NrLanes-1:0] masku_operand_alu_valid_o, + input logic [ NrLanes-1:0] masku_operand_alu_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_alu_seq_o, // ALU/FPU result (deshuffled, uncompressed) + output logic [ NrLanes-1:0] masku_operand_alu_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_alu_seq_ready_i, + output elen_t [ NrLanes-1:0] masku_operand_vs2_o, // vs2 (shuffled) + output logic [ NrLanes-1:0] masku_operand_vs2_valid_o, + input logic [ NrLanes-1:0] masku_operand_vs2_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_vs2_seq_o, // vs2 (deshuffled) + output logic [ NrLanes-1:0] masku_operand_vs2_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_vs2_seq_ready_i, + output elen_t [ NrLanes-1:0] masku_operand_m_o, // Mask (shuffled) + output logic [ NrLanes-1:0] masku_operand_m_valid_o, + input logic [ NrLanes-1:0] masku_operand_m_ready_i, + output logic [NrLanes*ELEN-1:0] masku_operand_m_seq_o, // Mask (deshuffled) + output logic [ NrLanes-1:0] masku_operand_m_seq_valid_o, + input logic [ NrLanes-1:0] masku_operand_m_seq_ready_i, + output logic [NrLanes*ELEN-1:0] bit_enable_mask_o, // Bit mask for mask unit instructions (shuffled like mask register) + output logic [NrLanes*ELEN-1:0] shuffled_vl_bit_mask_o, // vl mask for mask unit instructions (first vl bits are 1, others 0) (shuffled like mask register) + output logic [NrLanes*ELEN-1:0] alu_result_compressed_o // ALU/FPU results compressed (from sew to 1-bit) (shuffled, in mask format) + ); + + // Imports + import cf_math_pkg::idx_width; + + // Local Parameter + localparam int unsigned DATAPATH_WIDTH = NrLanes * ELEN; // Mask Unit datapath width + localparam int unsigned ELEN_BYTES = ELEN / 8; + + // Helper signals + logic [DATAPATH_WIDTH-1:0] deshuffled_vl_bit_mask; // this bit enable signal is only dependent on vl + logic [DATAPATH_WIDTH-1:0] shuffled_vl_bit_mask; // this bit enable signal is only dependent on vl + vew_e bit_enable_shuffle_eew; + + elen_t [NrLanes-1:0] masku_operand_vs2_d; + logic masku_operand_vs2_lane_valid; + logic masku_operand_vs2_lane_ready; + logic masku_operand_vs2_spill_valid; + logic masku_operand_vs2_spill_ready; + + + // Extract operands from input (input comes in "shuffled form" from the lanes) + for (genvar lane = 0; lane < NrLanes; lane++) begin + assign masku_operand_m_o[lane] = masku_operands_i[lane][0]; + assign masku_operand_vs2_d[lane] = masku_operands_i[lane][1]; + assign masku_operand_alu_o[lane] = masku_operands_i[lane][2 + masku_fu_i]; + end + + // ---------- + // Deshuffle vs2 + // ---------- + always_comb begin + masku_operand_m_seq_o = '0; + masku_operand_vs2_seq_o = '0; + masku_operand_alu_seq_o = '0; + for (int b = 0; b < (NrLanes * ELEN_BYTES); b++) begin + automatic int deshuffle_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew); + automatic int deshuffle_m_idx = deshuffle_index(b, NrLanes, vinsn_issue_i.eew_vmask); + automatic int lane_idx = b / ELEN_BYTES; // rounded down to nearest integer + automatic int lane_offset = b % ELEN_BYTES; + masku_operand_alu_seq_o[8*deshuffle_idx +: 8] = masku_operand_alu_o[lane_idx][8*lane_offset +: 8]; + masku_operand_vs2_seq_o[8*deshuffle_idx +: 8] = masku_operand_vs2_o[lane_idx][8*lane_offset +: 8]; + masku_operand_m_seq_o[8*deshuffle_m_idx +: 8] = masku_operand_m_o[lane_idx][8*lane_offset +: 8]; + end + end + + always_comb begin + masku_operand_vs2_spill_ready = 1'b1; + for (int lane = 0; lane < NrLanes; lane++) begin + masku_operand_vs2_spill_ready &= masku_operand_vs2_ready_i[lane] | masku_operand_vs2_seq_ready_i[lane]; + end + end + + spill_register #( + .T ( elen_t [NrLanes-1:0] ), + .Bypass ( 1'b0 ) + ) i_spill_register_vs2 ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (masku_operand_vs2_lane_valid), + .ready_o (masku_operand_vs2_lane_ready), + .data_i (masku_operand_vs2_d), + .valid_o (masku_operand_vs2_spill_valid), + .ready_i (masku_operand_vs2_spill_ready), + .data_o (masku_operand_vs2_o) + ); + + for (genvar lane = 0; lane < NrLanes; lane++) begin + assign masku_operand_vs2_valid_o[lane] = masku_operand_vs2_spill_valid; + assign masku_operand_vs2_seq_valid_o[lane] = masku_operand_vs2_spill_valid; + end + + always_comb begin + masku_operand_vs2_lane_valid = 1'b1; + for (int lane = 0; lane < NrLanes; lane++) begin + masku_operand_vs2_lane_valid &= masku_operand_valid_i[lane][1]; + end + end + + // ------------------------------------------------ + // Generate shuffled and unshuffled bit level masks + // ------------------------------------------------ + + // Generate shuffled bit level mask + assign bit_enable_shuffle_eew = vinsn_issue_i.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]} ? vinsn_issue_i.vtype.vsew : vinsn_issue_i.eew_vd_op; + + always_comb begin + // Default assignments + deshuffled_vl_bit_mask = '0; + shuffled_vl_bit_mask = '0; + bit_enable_mask_o = '0; + + // Generate deshuffled vl bit mask + for (int unsigned i = 0; i < DATAPATH_WIDTH; i++) begin + if (i < vinsn_issue_i.vl) begin + deshuffled_vl_bit_mask[i] = 1'b1; + end + end + + for (int unsigned b = 0; b < NrLanes * ELEN_BYTES; b++) begin + // local helper signals + logic [idx_width(DATAPATH_WIDTH)-1:0] src_operand_byte_shuffle_index; + logic [idx_width(DATAPATH_WIDTH)-1:0] mask_operand_byte_shuffle_index; + logic [ idx_width(NrLanes)-1:0] mask_operand_byte_shuffle_lane_index; + logic [ idx_width(ELEN_BYTES)-1:0] mask_operand_byte_shuffle_lane_offset; + + // get shuffle idices + // Note: two types of shuffle indices are needed because the source operand and the + // mask register might not have the same effective element width (eew) + src_operand_byte_shuffle_index = shuffle_index(b, NrLanes, bit_enable_shuffle_eew); + mask_operand_byte_shuffle_index = shuffle_index(b, NrLanes, vinsn_issue_i.eew_vmask); + mask_operand_byte_shuffle_lane_index = mask_operand_byte_shuffle_index[idx_width(ELEN_BYTES) +: idx_width(NrLanes)]; + mask_operand_byte_shuffle_lane_offset = mask_operand_byte_shuffle_index[idx_width(ELEN_BYTES)-1:0]; + + // shuffle bit enable + shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8] = deshuffled_vl_bit_mask[8*b +: 8]; + + // Generate bit-level mask + bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] = shuffled_vl_bit_mask[8*src_operand_byte_shuffle_index +: 8]; + if (!vinsn_issue_i.vm && !(vinsn_issue_i.op inside {VMADC, VMSBC})) begin // exception for VMADC and VMSBC, because they use the mask register as a source operand (and not as a mask) + bit_enable_mask_o[8*src_operand_byte_shuffle_index +: 8] &= masku_operand_m_o[mask_operand_byte_shuffle_lane_index][8*mask_operand_byte_shuffle_lane_offset +: 8]; + end + end + end + + assign shuffled_vl_bit_mask_o = shuffled_vl_bit_mask; + + + // ------------------------------------------- + // Compress ALU/FPU results into a mask vector + // ------------------------------------------- + always_comb begin + alu_result_compressed_o = '0; + for (int b = 0; b < ELEN_BYTES * NrLanes; b++) begin + if ((b % (1 << vinsn_issue_i.vtype.vsew)) == '0) begin + automatic int src_byte = shuffle_index(b, NrLanes, vinsn_issue_i.vtype.vsew); + automatic int src_byte_lane = src_byte[idx_width(ELEN_BYTES) +: idx_width(NrLanes)]; + automatic int src_byte_offset = src_byte[idx_width(ELEN_BYTES)-1:0]; + + automatic int dest_bit_seq = (b >> vinsn_issue_i.vtype.vsew) + vrf_pnt_i; + automatic int dest_byte_seq = dest_bit_seq / ELEN_BYTES; + automatic int dest_byte = shuffle_index(dest_byte_seq, NrLanes, vinsn_issue_i.vtype.vsew); + alu_result_compressed_o[ELEN_BYTES * dest_byte + dest_bit_seq[idx_width(ELEN_BYTES)-1:0]] = masku_operand_alu_o[src_byte_lane][8 * src_byte_offset]; + end + end + end + + + // Control + for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_unpack_masku_operands + // immediately acknowledge operands coming from functional units + assign masku_operand_alu_valid_o[lane] = masku_operand_valid_i[lane][2 + masku_fu_i]; + + assign masku_operand_m_valid_o[lane] = masku_operand_valid_i[lane][0]; + + assign masku_operand_m_seq_valid_o[lane] = masku_operand_valid_i[lane][0]; + end: gen_unpack_masku_operands + + + // assign the operand_ready signal that goes to the lane operand queues + always_comb begin + // by default, assign '0 to operand ready signals + masku_operand_ready_o = '0; + for (int lane = 0; lane < NrLanes; lane++) begin + // Acknowledge alu operand + for (int operand_fu = 0; operand_fu < NrMaskFUnits; operand_fu++) begin + masku_operand_ready_o[lane][2 + operand_fu] = (masku_fu_e'(operand_fu) == masku_fu_i) && masku_operand_alu_ready_i[lane]; + end + // Acknowledge vs2 operands + masku_operand_ready_o[lane][1] = masku_operand_vs2_lane_ready; + // Acknowledge mask operand + masku_operand_ready_o[lane][0] = masku_operand_m_ready_i[lane]; + end + end + + +endmodule : masku_operands