From 261dadedd8f16ffc50158fed77abe8ef285111f3 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Sun, 10 Nov 2024 21:55:00 +0100 Subject: [PATCH] WIP: update ara_pkg and lane_sequencer for mask operations --- hardware/include/ara_pkg.sv | 8 +- hardware/src/lane/lane_sequencer.sv | 217 ++++++++++++++----------- hardware/src/lane/operand_requester.sv | 18 +- hardware/src/lane/simd_alu.sv | 7 +- hardware/src/lane/valu.sv | 25 +-- 5 files changed, 154 insertions(+), 121 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 13b5b0424..dcc37d7f7 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -134,9 +134,15 @@ package ara_pkg; // Floating-point comparison instructions VMFEQ, VMFLE, VMFLT, VMFNE, VMFGT, VMFGE, // Integer comparison instructions - VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSBF, VMSOF, VMSIF, VIOTA, VID, VCPOP, VFIRST, VMSGT, + VMSEQ, VMSNE, VMSLTU, VMSLT, VMSLEU, VMSLE, VMSGTU, VMSGT, // Integer add-with-carry and subtract-with-borrow carry-out instructions VMADC, VMSBC, + // Mask to mask + VMSBF, VMSOF, VMSIF, + // Mask to non-mask + VIOTA, VID, + // Mask to scalar + VCPOP, VFIRST, // Mask operations VMANDNOT, VMAND, VMOR, VMXOR, VMORNOT, VMNAND, VMNOR, VMXNOR, // Scalar moves from VRF diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 3ddcfa6eb..0db9660eb 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -264,7 +264,9 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Vector length calculation vfu_operation_d.vl = pe_req.vl / NrLanes; // If lane_id_i < vl % NrLanes, this lane has to execute one extra micro-operation. 
- if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0]) vfu_operation_d.vl += 1; + // Also, if the ALU/VMFPU should pre-process data for the MASKU, force a balanced payload + if (lane_id_i < pe_req.vl[idx_width(NrLanes)-1:0] || pe_req.op inside {[VMFEQ:VMXNOR]}) + vfu_operation_d.vl += 1; // Calculate the start element for Lane[i]. This will be forwarded to both opqueues // and operand requesters, with some light modification in the case of a vslide. @@ -277,9 +279,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: vinsn_running_d[pe_req.id] = (vfu_operation_d.vfu != VFU_None) ? 1'b1 : 1'b0; // Mute request if the instruction runs in the lane and the vl is zero. - // Exception 1: insn on mask vectors, as MASKU has to receive something from all lanes - // and the partial results come from VALU and VMFPU. - // Exception 2: during a reduction, all the lanes must cooperate anyway. + // Exception: during a reduction, all the lanes must cooperate anyway. if (vfu_operation_d.vl == '0 && (vfu_operation_d.vfu inside {VFU_Alu, VFU_MFpu}) && !(vfu_operation_d.op inside {[VREDSUM:VWREDSUM], [VFREDUSUM:VFWREDOSUM]})) begin vfu_operation_valid_d = 1'b0; // We are already done with this instruction @@ -337,17 +337,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. 
- vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_MFpu: begin @@ -420,17 +420,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. 
+ if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; end VFU_LoadUnit : begin @@ -438,17 +438,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Load indexed @@ -490,26 +490,25 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // extra operand regardless of whether it is valid in this lane or not. // This is done to balance the data received by the store unit, which expects // L*64-bits packets only. - if (lane_id_i > pe_req.end_lane) begin + if (lane_id_i > pe_req.end_lane) operand_request[StA].vl += 1; - end operand_request_push[StA] = pe_req.use_vs1; // This vector instruction uses masks operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. 
- vl : (pe_req.vl / NrLanes / 8) >> unsigned'(pe_req.vtype.vsew), + vl : pe_req.vl / NrLanes / ELEN, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm | pe_req.hazard_vd, default: '0 }; - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) operand_request[MaskM].vl += 1; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) + operand_request[MaskM].vl += 1; operand_request_push[MaskM] = !pe_req.vm; // Store indexed @@ -529,9 +528,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) begin + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) operand_request[SlideAddrGenA].vl += 1; - end operand_request_push[SlideAddrGenA] = pe_req_i.op == VSXE; end @@ -601,7 +599,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, is_slide: 1'b1, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, @@ -614,61 +612,61 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: VSLIDEUP: begin // We need to trim full words from the end of the vector that are not used // as operands by the slide unit. + operand_request[MaskM].vl = (pe_req.vl - pe_req.stride) / NrLanes / ELEN; + // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- operand_request[MaskM].vl = - ((pe_req.vl - pe_req.stride + NrLanes - 1) / 8 / NrLanes) - >> unsigned'(pe_req.vtype.vsew); - - if (((operand_request[MaskM].vl + pe_req.stride) << - unsigned'(pe_req.vtype.vsew) * NrLanes * 8 != pe_req.vl)) + if ((operand_request[MaskM].vl) * NrLanes * ELEN != + pe_req.vl - pe_req.stride) operand_request[MaskM].vl += 1; // SLIDEUP only uses mask bits whose indices are > stride // Don't send the previous (unused) ones to the MASKU if (pe_req.stride >= NrLanes * 64) - operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * 64) << NrLanes * 64) / 8; + operand_request[MaskM].vstart += ((pe_req.stride >> NrLanes * ELEN) << NrLanes * ELEN) / 8; end VSLIDEDOWN: begin // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - operand_request[MaskM].vl = ((pe_req.vl / NrLanes / 8) >> unsigned'( - pe_req.vtype.vsew)); - if ((operand_request[MaskM].vl << unsigned'(pe_req.vtype.vsew)) * - NrLanes * 8 != pe_req.vl) + operand_request[MaskM].vl = pe_req.vl / NrLanes / ELEN; + if (operand_request[MaskM].vl * NrLanes * ELEN != pe_req.vl) operand_request[MaskM].vl += 1; end endcase end VFU_MaskUnit: begin + // todo: balance mask comparison source requested + // todo: + + // Mask logical and integer comparisons operand_request[AluA] = '{ id : pe_req.id, vs : pe_req.vs1, - eew : pe_req.eew_vs1, scale_vl: pe_req.scale_vl, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, default : '0 }; + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. 
+ // Integer comparisons run on the ALU and then get reshuffled and masked in the MASKU if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request[AluA].vl = vfu_operation_d.vl; - end - // This is an operation that runs normally on the ALU, and then gets reshuffled at the - // Mask Unit. - else begin - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - operand_request[AluA].vl = (pe_req.vl / NrLanes) >> - (unsigned'(EW64) - unsigned'(pe_req.eew_vs1)); - if ((operand_request[AluA].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs1))) * NrLanes != - pe_req.vl) operand_request[AluA].vl += 1; + // These source regs contain non-mask vectors. + operand_request[AluA].eew = pe_req.eew_vs1; + operand_request[AluA].vl = pe_req.vl / NrLanes; + if ((operand_request[AluA].vl * NrLanes) != pe_req.vl) + operand_request[AluA].vl += 1; + end else begin // Mask logical operations + // These source regs contain mask vectors. + operand_request[AluA].eew = EW64; + operand_request[AluA].vl = pe_req.vl / NrLanes / ELEN; + if (operand_request[AluA].vl * NrLanes * ELEN != pe_req.vl) + operand_request[AluA].vl += 1; end - operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF}); + operand_request_push[AluA] = pe_req.use_vs1 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + // Mask logical, integer comparisons, VIOTA, VID, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST operand_request[AluB] = '{ id : pe_req.id, vs : pe_req.vs2, @@ -679,88 +677,117 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, default : '0 }; - // This is an operation that runs normally on the ALU, and then gets *condensed* and - // reshuffled at the Mask Unit. 
+ // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + + // Integer comparisons run on the ALU and then get reshuffled and masked in the MASKU if (pe_req.op inside {[VMSEQ:VMSBC]}) begin - operand_request[AluB].vl = vfu_operation_d.vl; + // These source regs contain non-mask vectors. + operand_request[AluB].eew = pe_req.eew_vs2; + operand_request[AluB].vl = pe_req.vl / NrLanes; + if ((operand_request[AluB].vl * NrLanes) != pe_req.vl) + operand_request[AluB].vl += 1; + end else begin // Mask logical, VIOTA, VID, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST + // These source regs contain mask vectors. + operand_request[AluB].eew = EW64; + operand_request[AluB].vl = pe_req.vl / NrLanes / ELEN; + if (operand_request[AluB].vl * NrLanes * ELEN != pe_req.vl) + operand_request[AluB].vl += 1; end - // This is an operation that runs normally on the ALU, and then gets reshuffled at the - // Mask Unit. - else begin - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. 
- operand_request[AluB].vl = (pe_req.vl / NrLanes) >> - (unsigned'(EW64) - unsigned'(pe_req.eew_vs2)); - if ((operand_request[AluB].vl << (unsigned'(EW64) - unsigned'(pe_req.eew_vs2))) * NrLanes != - pe_req.vl) operand_request[AluB].vl += 1; - end - operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE], VCPOP, VMSIF, VMSOF, VMSBF, VFIRST}); + operand_request_push[AluB] = pe_req.use_vs2 && !(pe_req.op inside {[VMFEQ:VMFGE]}); + // Mask fp comparisons operand_request[MulFPUA] = '{ id : pe_req.id, vs : pe_req.vs1, eew : pe_req.eew_vs1, scale_vl: pe_req.scale_vl, + vl : pe_req.vl / NrLanes, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, default : '0 }; - - // This is an operation that runs normally on the ALU, and then gets *condensed* and + // This is an operation that runs normally on the VMFPU, and then gets *condensed* and // reshuffled at the Mask Unit. - operand_request[MulFPUA].vl = vfu_operation_d.vl; - operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF}); + // Request a balanced load from every lane despite it being active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. 
+ if ((operand_request[MulFPUA].vl * NrLanes) != pe_req.vl) + operand_request[MulFPUA].vl += 1; + operand_request_push[MulFPUA] = pe_req.use_vs1 && pe_req.op inside {[VMFEQ:VMFGE]}; + // Mask fp comparisons operand_request[MulFPUB] = '{ id : pe_req.id, vs : pe_req.vs2, eew : pe_req.eew_vs2, scale_vl: pe_req.scale_vl, + vl : pe_req.vl / NrLanes, vtype : pe_req.vtype, vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, default : '0 }; - // This is an operation that runs normally on the ALU, and then gets *condensed* and + // This is an operation that runs normally on the VMFPU, and then gets *condensed* and // reshuffled at the Mask Unit. - operand_request[MulFPUB].vl = vfu_operation_d.vl; - operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]} && !(pe_req.op inside {VCPOP, VMSIF, VMSOF, VMSBF, VFIRST}); + // Request a balanced load from every lane despite it being active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MulFPUB].vl * NrLanes) != pe_req.vl) + operand_request[MulFPUB].vl += 1; + operand_request_push[MulFPUB] = pe_req.use_vs2 && pe_req.op inside {[VMFEQ:VMFGE]}; + // Vd register to provide correct mask undisturbed policy at bit-level + // This can be a mask or normal register operand_request[MaskB] = '{ id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, + vs : pe_req.vd, scale_vl: pe_req.scale_vl, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. - vl : (pe_req.vl / NrLanes / ELEN) << (unsigned'(EW64) - unsigned'(pe_req.vtype.vsew)), vstart : vfu_operation_d.vstart, hazard : (pe_req.op inside {VMSBF, VMSOF, VMSIF}) ? 
pe_req.hazard_vs2 : pe_req.hazard_vs2 | pe_req.hazard_vd, + hazard : pe_req.hazard_vd, default : '0 }; - operand_request[MaskB].vl = pe_req.vl / (NrLanes * (8 << pe_req.vtype.vsew)); - if ((pe_req.vl % (NrLanes*ELEN)) != 0) begin - operand_request[MaskB].vl += 1'b1; + // vl and eew depend on the real eew we are working on + if (pe_req.op inside {VIOTA,VID}) begin + // Non-mask layout + operand_request[MaskB].eew = pe_req.vtype.vsew; + operand_request[MaskB].vl = pe_req.vl / NrLanes; + // Request a balanced load from every lane despite it being active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MaskB].vl * NrLanes) != pe_req.vl) + operand_request[MaskB].vl += 1; + end else begin // Mask logical, VMSBF, VMSIF, VMSOF, VCPOP, VFIRST + // Mask layout + operand_request[MaskB].eew = EW64; + operand_request[MaskB].vl = (pe_req.vl / NrLanes / ELEN); + // Request a balanced load from every lane despite it being active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MaskB].vl * NrLanes * ELEN) != pe_req.vl) + operand_request[MaskB].vl += 1; + end - operand_request_push[MaskB] = pe_req.use_vs2 && pe_req.op inside {VCPOP, VFIRST, VMSIF, VMSOF, VMSBF}; + operand_request_push[MaskB] = pe_req.use_vd_op; + // All masked operations + // This is always a mask register operand_request[MaskM] = '{ id : pe_req.id, vs : VMASK, - eew : pe_req.vtype.vsew, + eew : EW64, vtype : pe_req.vtype, - // Since this request goes outside of the lane, we might need to request an - // extra operand regardless of whether it is valid in this lane or not. 
vl : (pe_req.vl / NrLanes / ELEN), vstart : vfu_operation_d.vstart, hazard : pe_req.hazard_vm, default: '0 }; - if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) begin + // Request a balanced load from every lane despite it being active or not. + // Since this request goes outside of the lane, we might need to request an + // extra operand regardless of whether it is valid in this lane or not. + if ((operand_request[MaskM].vl * NrLanes * ELEN) != pe_req.vl) operand_request[MaskM].vl += 1; - end operand_request_push[MaskM] = !pe_req.vm; end VFU_None: begin diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 1baec0780..de2cc4f82 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -291,7 +291,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( automatic elen_t vl_byte; automatic elen_t vstart_byte; automatic elen_t vector_body_len_byte; - automatic elen_t vector_body_len_packets; + automatic elen_t vector_body_len_elements; // Bank we are currently requesting automatic int bank = requester_metadata_q.addr[idx_width(NrBanks)-1:0]; @@ -324,13 +324,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( ? 0 : operand_request_i[requester_index].vstart << operand_request_i[requester_index].vtype.vsew; vector_body_len_byte = vl_byte - vstart_byte + (vstart_byte % 8); - vector_body_len_packets = vector_body_len_byte >> operand_request_i[requester_index].eew; - if (vector_body_len_packets << operand_request_i[requester_index].eew < vector_body_len_byte) - vector_body_len_packets += 1; + vector_body_len_elements = vector_body_len_byte >> operand_request_i[requester_index].eew; + if (vector_body_len_elements << operand_request_i[requester_index].eew < vector_body_len_byte) + vector_body_len_elements += 1; // Final computed length effective_vector_body_length = (operand_request_i[requester_index].scale_vl) - ? vector_body_len_packets + ? 
vector_body_len_elements : vector_body_length; // Address of the vstart element of the vector in the VRF @@ -401,7 +401,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( end : waw_counters_update if (operand_queue_ready_i[requester_index]) begin - automatic vlen_t num_bytes; + automatic vlen_t num_elements; // Operand request lane_operand_req_transposed[requester_index][bank] = !stall; @@ -417,12 +417,12 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( requester_metadata_d.addr = requester_metadata_q.addr + 1'b1; // We read less than 64 bits worth of elements - num_bytes = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) ); - if (requester_metadata_q.len < num_bytes) begin + num_elements = ( 1 << ( unsigned'(EW64) - unsigned'(requester_metadata_q.vew) ) ); + if (requester_metadata_q.len < num_elements) begin requester_metadata_d.len = 0; end else begin - requester_metadata_d.len = requester_metadata_q.len - num_bytes; + requester_metadata_d.len = requester_metadata_q.len - num_elements; end end : op_req_grant diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv index 242c0d2bc..572bc35af 100644 --- a/hardware/src/lane/simd_alu.sv +++ b/hardware/src/lane/simd_alu.sv @@ -132,11 +132,8 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( VMXOR : res = operand_a_i ^ operand_b_i; VMXNOR : res = ~(operand_a_i ^ operand_b_i); - // vmsbf, vmsof, vmsif and viota operand generation - VMSBF, VMSOF, VMSIF, VIOTA : res = opb; - - // Vector count population and find first set bit instructions - VCPOP, VFIRST : res = operand_b_i; + // Mask operands pass-through + VCPOP, VFIRST, VMSBF, VMSOF, VMSIF, VIOTA: res = operand_b_i; // Arithmetic instructions VSADDU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index d3ce82bee..c4c9cdced 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -175,22 +175,25 
@@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // Mask operands // ///////////////////// + logic mask_operand_valid; logic mask_operand_ready; logic mask_operand_gnt; - assign mask_operand_gnt = mask_operand_ready && result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q]; + assign mask_operand_valid = result_queue_q[result_queue_read_pnt_q].mask + & result_queue_valid_q[result_queue_read_pnt_q]; + assign mask_operand_gnt = mask_operand_valid & mask_operand_ready; spill_register #( .T(elen_t) ) i_mask_operand_register ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .data_o (mask_operand_o ), - .valid_o (mask_operand_valid_o ), - .ready_i (mask_operand_ready_i ), - .data_i (result_queue_q[result_queue_read_pnt_q].wdata ), - .valid_i (result_queue_q[result_queue_read_pnt_q].mask && result_queue_valid_q[result_queue_read_pnt_q]), - .ready_o (mask_operand_ready ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .data_o (mask_operand_o ), + .valid_o (mask_operand_valid_o ), + .ready_i (mask_operand_ready_i ), + .data_i (result_queue_q[result_queue_read_pnt_q].wdata ), + .valid_i (mask_operand_valid ), + .ready_o (mask_operand_ready ) ); ////////////////////// @@ -739,7 +742,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; if (|result_queue_valid_q) vxsat_flag_o = |(alu_vxsat_q & result_queue_q[result_queue_read_pnt_q].be); - // Received a grant from the VRF. + // Received a grant from the VRF or MASKU. // Deactivate the request. 
if (alu_result_gnt_i || mask_operand_gnt) begin result_queue_valid_d[result_queue_read_pnt_q] = 1'b0; @@ -802,7 +805,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; ////////////////////////////// if (!vinsn_queue_full && vfu_operation_valid_i && - (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMXNOR]})) begin + (vfu_operation_i.vfu == VFU_Alu || vfu_operation_i.op inside {[VMSEQ:VMSBC],[VMANDNOT:VMXNOR]})) begin vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt] = vfu_operation_i; // Do not wait for masks if, during a reduction, this lane is just a pass-through // The only valid instructions here with vl == '0 are reductions