From c54eba65ace92878eaac9baaf6d7e0afd1fcd995 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Thu, 14 Nov 2024 19:39:20 +0100 Subject: [PATCH] [hardware] Debugging VIOTA (two commits ago, it worked until masked test) --- hardware/src/ara_dispatcher.sv | 28 ++++++--- hardware/src/lane/lane_sequencer.sv | 10 +-- hardware/src/lane/valu.sv | 95 ++++++++++------------------- hardware/src/masku/masku.sv | 92 ++++++++++------------------ 4 files changed, 92 insertions(+), 133 deletions(-) diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index fb5559bd8..818207b45 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -67,7 +67,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( `FF(csr_vstart_q, csr_vstart_d, '0) `FF(csr_vl_q, csr_vl_d, '0) - `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, default: '0}) + `FF(csr_vtype_q, csr_vtype_d, '{vill: 1'b1, vsew: EW8, vlmul: LMUL_1, default: '0}) `FF(csr_vxsat_q, csr_vxsat_d, '0) `FF(csr_vxrm_q, csr_vxrm_d, '0) // Converts between the internal representation of `vtype_t` and the full XLEN-bit CSR. 
@@ -505,7 +505,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( (csr_vtype_d.vlmul == LMUL_RSVD) || // reserved value // LMUL >= SEW/ELEN (signed'($clog2(ELENB)) + signed'(csr_vtype_d.vlmul) < signed'(csr_vtype_d.vsew))) begin - csr_vtype_d = '{vill: 1'b1, default: '0}; + csr_vtype_d = '{vill: 1'b1, vsew: EW8, vlmul: LMUL_1, default: '0}; csr_vl_d = '0; end @@ -1279,12 +1279,26 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.use_vs1 = 1'b0; ara_req_d.use_vd_op = 1'b1; ara_req_d.eew_vs2 = eew_q[ara_req_d.vs2]; // Force reshuffle - ara_req_d.eew_vd_op = eew_q[ara_req_d.vd]; // Force reshuffle - ara_req_d.vtype.vsew = eew_q[ara_req_d.vd]; + ara_req_d.eew_vd_op = eew_q[ara_req_d.vd]; case (insn.varith_type.rs1) - 5'b00001: ara_req_d.op = ara_pkg::VMSBF; - 5'b00010: ara_req_d.op = ara_pkg::VMSOF; - 5'b00011: ara_req_d.op = ara_pkg::VMSIF; + 5'b00001: begin + ara_req_d.op = ara_pkg::VMSBF; + // This is a mask-to-mask operation, vsew does not have any meaning + // So, avoid reshuffling + ara_req_d.vtype.vsew = eew_q[ara_req_d.vd]; + end + 5'b00010: begin + ara_req_d.op = ara_pkg::VMSOF; + // This is a mask-to-mask operation, vsew does not have any meaning + // So, avoid reshuffling + ara_req_d.vtype.vsew = eew_q[ara_req_d.vd]; + end + 5'b00011: begin + ara_req_d.op = ara_pkg::VMSIF; + // This is a mask-to-mask operation, vsew does not have any meaning + // So, avoid reshuffling + ara_req_d.vtype.vsew = eew_q[ara_req_d.vd]; + end 5'b10000: ara_req_d.op = ara_pkg::VIOTA; 5'b10001: ara_req_d.op = ara_pkg::VID; endcase diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 0db9660eb..c2c8c1d14 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -265,7 +265,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vfu_operation_d.vl = pe_req.vl / NrLanes; // If lane_id_i < vl % NrLanes, this lane has to execute one extra 
micro-operation. // Also, if the ALU/VMFPU should pre-process data for the MASKU, force a balanced payload - if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0] || pe_req.op inside {[VMFEQ:VMXNOR]}) + if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0] || (|pe_req.vl[idx_width(NrLanes)-1:0] && pe_req.op inside {[VMFEQ:VMXNOR]})) vfu_operation_d.vl += 1; // Calculate the start element for Lane[i]. This will be forwarded to both opqueues @@ -757,8 +757,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Request a balanced load from every lane despite it being active or not. // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if ((operand_request[MaskM].vl * NrLanes) != pe_req.vl) - operand_request[MaskM].vl += 1; + if ((operand_request[MaskB].vl * NrLanes) != pe_req.vl) + operand_request[MaskB].vl += 1; end else begin // Mask logical, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST // Mask layout operand_request[MaskB].eew = EW64; @@ -766,8 +766,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Request a balanced load from every lane despite it being active or not. // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) - operand_request[MaskM].vl += 1; + if ((operand_request[MaskB].vl * NrLanes * ELEN) != pe_req.vl) + operand_request[MaskB].vl += 1; end operand_request_push[MaskB] = pe_req.use_vd_op; diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index e9179b0e7..a61242f67 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -398,6 +398,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Remaining elements of the current instruction in the commit phase vlen_t commit_cnt_d, commit_cnt_q; + // How many elements are issued/committed + logic [3:0] element_cnt_buf_issue, element_cnt_buf_commit; + logic [6:0] element_cnt_issue; + logic [6:0] element_cnt_commit; + always_comb begin: p_valu // Maintain state vinsn_queue_d = vinsn_queue_q; @@ -439,6 +444,13 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Don't prevent commit by default prevent_commit = 1'b0; + // How many elements are we processing this cycle? 2^(EW64 - vsew) per word, or ELEN for ops in [VMSBF:VMXNOR] + element_cnt_buf_issue = 1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew)); + element_cnt_issue = vinsn_issue_q.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_issue}; + + element_cnt_buf_commit = 1 << (unsigned'(EW64) - unsigned'(vinsn_commit.vtype.vsew)); + element_cnt_commit = vinsn_commit.op inside {[VMSBF:VMXNOR]} ? ELEN : {2'b0, element_cnt_buf_commit}; + //////////////////////////////////////// // Write data into the result queue // //////////////////////////////////////// @@ -453,7 +465,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? 
- automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); + automatic logic [6:0] element_cnt = element_cnt_issue; if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -527,16 +539,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; // Assign vector length for next instruction in the instruction queue - if (vinsn_queue_d.issue_cnt != 0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; - else begin - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.issue_cnt != 0) + issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; end end end @@ -553,7 +557,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; (alu_operand_valid_i[0] || !vinsn_issue_q.use_vs1 || !first_op_q) && (mask_valid_i || vinsn_issue_q.vm)) begin // How many elements are we committing with this word? 
- automatic logic [3:0] element_cnt = (1 << (unsigned'(EW64) - unsigned'(vinsn_issue_q.vtype.vsew))); + automatic logic [6:0] element_cnt = element_cnt_issue; + if (element_cnt > issue_cnt_q) element_cnt = issue_cnt_q; @@ -659,16 +664,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; // Assign vector length for next instruction in the instruction queue - if (vinsn_queue_d.issue_cnt != 0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; - else begin - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.issue_cnt != 0) + issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; // Give the done to the main sequencer commit_cnt_d = '0; @@ -696,16 +693,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; // Assign vector length for next instruction in the instruction queue - if (vinsn_queue_d.issue_cnt != 0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; - else begin - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - issue_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vtype.vsew; - issue_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.issue_cnt != 0) + issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl; // Commit and give the done to the main sequencer commit_cnt_d = '0; @@ -757,9 
+746,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Decrement the counter of remaining vector elements waiting to be written // Don't do it in case of a reduction - if (!is_reduction(vinsn_commit.op)) - commit_cnt_d = commit_cnt_q - (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew)); - if (commit_cnt_q < (1 << (unsigned'(EW64) - vinsn_commit.vtype.vsew))) commit_cnt_d = '0; + if (!is_reduction(vinsn_commit.op)) begin + automatic logic [6:0] element_cnt = element_cnt_commit; + commit_cnt_d = commit_cnt_q - element_cnt; + if (commit_cnt_q < element_cnt) commit_cnt_d = '0; + end end // Finished committing the results of a vector instruction @@ -773,18 +764,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; else vinsn_queue_d.commit_pnt += 1; // Update the commit counter for the next instruction - if (vinsn_queue_d.commit_cnt != '0) begin - if (!(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl; - else begin - // We are asking for bits, and we want at least one chunk of bits if - // vl > 0. 
Therefore, commit_cnt = ceil((vl / 8) >> sew) - $warning("vstart was never tested for op inside {[VMANDNOT:VMXNOR]}"); - commit_cnt_d = (vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl / 8) >> - vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew; - commit_cnt_d += |vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl[2:0]; - end - end + if (vinsn_queue_d.commit_cnt != '0) + commit_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl; // Initialize counters and alu state if needed by the next instruction // After a reduction, the next instructions starts after the reduction commits @@ -809,7 +790,10 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; // Do not wait for masks if, during a reduction, this lane is just a pass-through // The only valid instructions here with vl == '0 are reductions - vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.vm | (vfu_operation_i.vl == '0); + // Instructions that execute in the mask unit will process the mask there directly + vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].vm = vfu_operation_i.op inside {[VMSEQ:VMXNOR]} + ? 
1'b1 + : vfu_operation_i.vm | (vfu_operation_i.vl == '0); // Initialize counters and alu state if the instruction queue was empty // and the lane is not reducing @@ -825,22 +809,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; sldu_transactions_cnt_d = $clog2(NrLanes) + 1; issue_cnt_d = vfu_operation_i.vl; - if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - issue_cnt_d = vfu_operation_i.vl; - else begin - issue_cnt_d = (vfu_operation_i.vl / 8) >> - vfu_operation_i.vtype.vsew; - issue_cnt_d += |vfu_operation_i.vl[2:0]; - end end if (vinsn_queue_d.commit_cnt == '0) - if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) - commit_cnt_d = vfu_operation_i.vl; - else begin - // Operations between mask vectors operate on bits - commit_cnt_d = (vfu_operation_i.vl / 8) >> vfu_operation_i.vtype.vsew; - commit_cnt_d += |vfu_operation_i.vl[2:0]; - end + commit_cnt_d = vfu_operation_i.vl; // Bump pointers and counters of the vector instruction queue vinsn_queue_d.accept_pnt += 1; diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index 367e33a9d..d7bc525c6 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -180,7 +180,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( logic [NrLanes-1:0] additional_elm; // There can be an additional element for some lanes // BE signals for VIOTA logic [NrLanes*DataWidth/8-1:0] be_viota_seq_d, be_viota_seq_q, be_viota_shuf; - logic masku_alu_be_clr; // Local Parameter VcpopParallelism and VfirstParallelism // @@ -520,8 +519,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( elen_t [NrLanes-1:0] alu_result; - logic masku_alu_en, masku_alu_clr; - // assign operand slices to be processed by popcount and lzc assign vcpop_slice = vcpop_operand[(in_ready_cnt_q[idx_width(N_SLICES_CPOP)-1:0] * VcpopParallelism) +: VcpopParallelism]; assign vfirst_slice = vcpop_operand[(in_ready_cnt_q[idx_width(N_SLICES_VFIRST)-1:0] * VfirstParallelism) +: 
VfirstParallelism]; @@ -544,7 +541,24 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( .empty_o (vfirst_empty ) ); - always_comb begin: p_mask_alu + // Vector instructions currently running + logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; + + // Interface with the main sequencer + pe_resp_t pe_resp; + + // Effective MASKU stride in case of VSLIDEUP + // MASKU receives chunks of 64 * NrLanes mask bits from the lanes + // VSLIDEUP only needs the bits whose index >= than its stride + // So, the operand requester does not send vl mask bits to MASKU + // and trims all the unused 64 * NrLanes mask bits chunks + // Therefore, the stride needs to be trimmed, too + elen_t trimmed_stride; + + // Information about which is the target FU of the request + assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? MaskFUMFpu : MaskFUAlu; + + always_comb begin // Tail-agnostic bus alu_result = '0; alu_result_vm = '0; @@ -557,28 +571,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( vcpop_operand = '0; - // MASKU ALU control - masku_alu_en = 1'b0; - masku_alu_clr = 1'b0; - masku_alu_be_clr = 1'b0; - // The result mask should be created here since the output is a non-mask vector be_viota_seq_d = be_viota_seq_q; - // Is there an instruction ready to be issued? - if (vinsn_issue_valid && vinsn_issue.op inside {[VMFEQ:VMXNOR]}) - // Compute one slice if we can write and the necessary inputs are valid - if (!result_queue_full && (&masku_operand_alu_valid || vinsn_issue.op == VID) - && (&masku_operand_vd_valid || !vinsn_issue.use_vd_op) - && (&masku_operand_m_valid || vinsn_issue.vm)) - masku_alu_en = 1'b1; - - // Have we finished insn execution? - if (vinsn_issue_valid && issue_cnt_d == '0) masku_alu_clr = 1'b1; - - // Have we written the result queue? 
- if (vinsn_issue_valid && out_vrf_word_valid) masku_alu_be_clr = 1'b1; - // Create a bit-masked ALU sequential vector masku_operand_alu_seq_m = masku_operand_alu_seq & (masku_operand_m_seq | {NrLanes*DataWidth{vinsn_issue.vm}}); @@ -608,7 +603,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( for (int i = 1; i < VmsxfParallelism; i++) begin vmsxf_buffer[i] = ~((masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism + i]) | vmsxf_buffer[i-1]); end - found_one_d = masku_alu_en ? |(masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism]) | found_one_q : found_one_q; alu_result_vmsif_vm[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism] = vmsxf_buffer; alu_result_vmsbf_vm[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism] = {~found_one_d, vmsxf_buffer[VmsxfParallelism-1:1]}; @@ -623,9 +617,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Mask the result alu_result_vm_m = (!vinsn_issue.vm) ? alu_result_vm & masku_operand_m_seq : alu_result_vm; - - // Clean-up state upon instruction end - if (masku_alu_clr) found_one_d = '0; end // VIOTA, VID: compute a slice of the output and mask out the masked elements // VID re-uses the VIOTA datapath @@ -643,9 +634,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( viota_res[i+1] = viota_res[i] + masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + i]; end - // Save last result in the accumulator for next slice upon processing - viota_acc_d = masku_alu_en ? 
viota_res[ViotaParallelism-1] + masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + ViotaParallelism - 1] : viota_acc_q; - // This datapath should be relativeley simple: // `ViotaParallelism bytes connected, in line, to output byte chunks // Multiple limited-width counters should help the synthesizer reduce wiring @@ -683,14 +671,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( {8{vinsn_issue.vm}} | {8{masku_operand_m_seq[out_valid_cnt_q[idx_width(NrLanes*DataWidth/8/ViotaParallelism)-1:0] * ViotaParallelism + i]}}; end endcase - - // Clean-up state upon instruction end - if (masku_alu_clr) begin - viota_acc_d = '0; - be_viota_seq_d = '0; - end - - if (masku_alu_be_clr) be_viota_seq_d = '0; end // VCPOP, VFIRST: mask the current slice and feed the popc or lzc unit [VCPOP:VFIRST] : begin @@ -716,30 +696,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( if (vinsn_issue.op inside {[VMSBF:VID]}) alu_result = alu_result_vm_shuf; - end: p_mask_alu - ///////////////// // Mask unit // ///////////////// - // Vector instructions currently running - logic [NrVInsn-1:0] vinsn_running_d, vinsn_running_q; - - // Interface with the main sequencer - pe_resp_t pe_resp; - - // Effective MASKU stride in case of VSLIDEUP - // MASKU receives chunks of 64 * NrLanes mask bits from the lanes - // VSLIDEUP only needs the bits whose index >= than its stride - // So, the operand requester does not send vl mask bits to MASKU - // and trims all the unused 64 * NrLanes mask bits chunks - // Therefore, the stride needs to be trimmed, too - elen_t trimmed_stride; - - // Information about which is the target FU of the request - assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? 
MaskFUMFpu : MaskFUAlu; - - always_comb begin: p_masku // Maintain state vinsn_queue_d = vinsn_queue_q; read_cnt_d = read_cnt_q; @@ -994,6 +954,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( popcount_d = popcount_q + popcount; vfirst_count_d = vfirst_count_q + vfirst_count; + // Bump MASKU ALU state + found_one_d = |(masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/VmsxfParallelism)-1:0] * VmsxfParallelism +: VmsxfParallelism]) | found_one_q; + viota_acc_d = viota_res[ViotaParallelism-1] + masku_operand_alu_seq_m[in_ready_cnt_q[idx_width(NrLanes*DataWidth/ViotaParallelism)-1:0] * ViotaParallelism + ViotaParallelism - 1]; + // Increment the input, input-mask, and output slice counters in_ready_cnt_en = 1'b1; in_m_ready_cnt_en = 1'b1; @@ -1040,6 +1004,13 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Assert valid scalar output out_scalar_valid = vd_scalar(vinsn_issue.op); end + + // Have we finished insn execution? Clear MASKU ALU state + if (issue_cnt_d == '0) begin + viota_acc_d = '0; + be_viota_seq_d = '0; + found_one_d = '0; + end end end @@ -1059,6 +1030,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_write_pnt_d = '0; end + // Clear MASKU ALU state + be_viota_seq_d = '0; + // Account for the written results // VIOTA and VID do not write bits! processing_cnt_d = vinsn_issue.op inside {[VIOTA:VID]} ? processing_cnt_q - ((NrLanes * DataWidth / 8) >> vinsn_issue.vtype.vsew) : processing_cnt_q - NrLanes * DataWidth; @@ -1296,7 +1270,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( vinsn_queue_d.issue_cnt += 1; vinsn_queue_d.commit_cnt += 1; end - end: p_masku + end always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin