diff --git a/omreaderlib/src/T1OMReaderAPI.scala b/omreaderlib/src/T1OMReaderAPI.scala index 754d07b75..22005a8e4 100644 --- a/omreaderlib/src/T1OMReaderAPI.scala +++ b/omreaderlib/src/T1OMReaderAPI.scala @@ -54,7 +54,7 @@ object Path { implicit val rw: ReadWriter[Instruction] = macroRW def parse(str: String): Path = str match { - case s"OMInstanceTarget:~${top}|${hier}>${local}" => + case s"OMReferenceTarget:~${top}|${hier}>${local}" => Path( top, hier @@ -65,7 +65,7 @@ object Path { }), Some(local) ) - case s"OMInstanceTarget:~${top}|${hier}" => + case s"OMInstanceTarget:~${top}|${hier}" => Path( top, hier diff --git a/omreaderlib/src/t1/T1.scala b/omreaderlib/src/t1/T1.scala index 5e040886e..dc304d860 100644 --- a/omreaderlib/src/t1/T1.scala +++ b/omreaderlib/src/t1/T1.scala @@ -18,6 +18,20 @@ class T1(val mlirbc: Array[Byte]) extends T1OMReaderAPI { def instructions: Seq[Instruction] = t1("decoder").obj("instructions").list.elements().map(_.obj).map(getInstruction) def sram: Seq[SRAM] = t1("lanes").list.elements().map(_.obj("vrf").obj).flatMap(getSRAM) - def retime: Seq[Retime] = + + def permutation: Seq[Retime] = { + val permutation = t1("permutation") + val reduceUnit = permutation.obj("reduceUnit").obj + val compressUnit = permutation.obj("compress").obj + // TODO: need fieldOpt(name: String) + val floatAdder = + Option.when(reduceUnit.fieldNames().contains("floatAdder"))(reduceUnit("floatAdder").obj) + + (Seq(compressUnit) ++ floatAdder).flatMap(getRetime) + } + + def vfus: Seq[Retime] = t1("lanes").list.elements().map(_.obj("vfus")).flatMap(_.list.elements().map(_.obj)).flatMap(getRetime) + + def retime = (vfus ++ permutation).distinct } diff --git a/omreaderlib/src/t1rocketv/T1RocketTile.scala b/omreaderlib/src/t1rocketv/T1RocketTile.scala index 1224726e4..0c33bd3f6 100644 --- a/omreaderlib/src/t1rocketv/T1RocketTile.scala +++ b/omreaderlib/src/t1rocketv/T1RocketTile.scala @@ -20,8 +20,21 @@ class T1RocketTile(val mlirbc: Array[Byte]) extends T1OMReaderAPI { t1("lanes").list.elements().map(_.obj("vrf").obj).flatMap(getSRAM) def cache: Seq[SRAM] = Seq(tile("frontend").obj("icache").obj, tile("hellaCache").obj).flatMap(getSRAM) - def vfu: Seq[Retime] = + + def permutation: Seq[Retime] = { + val permutation = t1("permutation") + val reduceUnit = permutation.obj("reduceUnit").obj + val compressUnit = permutation.obj("compress").obj + // TODO: need fieldOpt(name: String) + val floatAdder = + Option.when(reduceUnit.fieldNames().contains("floatAdder"))(reduceUnit("floatAdder").obj) + + (Seq(compressUnit) ++ floatAdder).flatMap(getRetime) + } + + def vfus: Seq[Retime] = t1("lanes").list.elements().map(_.obj("vfus")).flatMap(_.list.elements().map(_.obj)).flatMap(getRetime) - def retime = vfu + + def retime = (vfus ++ permutation).distinct def sram = vrf ++ cache } diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index 59b317b5b..d59a97d75 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -138,6 +138,8 @@ class LaneRequest(param: LaneParameter) extends Bundle { /** data of rs1 */ val readFromScalar: UInt = UInt(param.datapathWidth.W) + val csrInterface: CSRInterface = new CSRInterface(param.vlMaxBits) + // vmacc 的vd需要跨lane读 TODO: move to [[V]] def ma: Bool = decodeResult(Decoder.multiplier) && decodeResult(Decoder.uop)(1, 0).xorR && !decodeResult(Decoder.vwmacc) @@ -220,10 +222,6 @@ class InstructionControlRecord(param: LaneParameter) extends Bundle { /** Store request from [[T1]]. */ val laneRequest: LaneRequest = new LaneRequest(param) - /** csr follows the instruction. TODO: move to [[laneRequest]] - */ - val csr: CSRInterface = new CSRInterface(param.vlMaxBits) - /** which group is the last group for instruction. */ val lastGroupForInstruction: UInt = UInt(param.groupNumberBits.W) @@ -362,9 +360,9 @@ class LaneResponseFeedback(param: LaneParameter) extends Bundle { val complete: Bool = Bool() } -class V0Update(param: LaneParameter) extends Bundle { - val data: UInt = UInt(param.datapathWidth.W) - val offset: UInt = UInt(param.vrfOffsetBits.W) +class V0Update(datapathWidth: Int, vrfOffsetBits: Int) extends Bundle { + val data: UInt = UInt(datapathWidth.W) + val offset: UInt = UInt(vrfOffsetBits.W) // mask/ld类型的有可能不会写完整的32bit val mask: UInt = UInt(4.W) } @@ -794,6 +792,5 @@ class MaskUnitReadVs1(parameter: T1Parameter) extends Bundle { } class LaneTokenBundle extends Bundle { - val maskResponseRelease: Bool = Output(Bool()) - val maskRequestRelease: Bool = Input(Bool()) + val maskRequestRelease: Bool = Input(Bool()) } diff --git a/t1/src/FloatModule.scala b/t1/src/FloatModule.scala index 8862937c2..cff306c9c 100644 --- a/t1/src/FloatModule.scala +++ b/t1/src/FloatModule.scala @@ -17,7 +17,7 @@ object FloatAdderParameter { implicit def rwP = upickle.default.macroRW[FloatAdderParameter] } -case class FloatAdderParameter(expWidth: Int, sigWidth: Int) extends SerializableModuleParameter +case class FloatAdderParameter(expWidth: Int, sigWidth: Int, latency: Int) extends SerializableModuleParameter class FloatAdderInterface(val parameter: FloatAdderParameter) extends Bundle { val expWidth = parameter.expWidth @@ -61,8 +61,8 @@ class FloatAdder(val parameter: FloatAdderParameter) addRecFN.io.roundingMode := io.roundingMode addRecFN.io.detectTininess := false.B - io.out := fNFromRecFN(8, 24, addRecFN.io.out) - io.exceptionFlags := addRecFN.io.exceptionFlags + io.out := Pipe(true.B, fNFromRecFN(8, 24, addRecFN.io.out), parameter.latency).bits + io.exceptionFlags := Pipe(true.B, addRecFN.io.exceptionFlags, parameter.latency).bits } object FloatCompareParameter { diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index ec52d2bb7..1c8893634 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -98,6 +98,7 @@ case class LaneParameter( crossLaneVRFWriteEscapeQueueSize: Int, fpuEnable: Boolean, portFactor: Int, + maskRequestLatency: Int, vrfRamType: RamType, decoderParam: DecoderParam, vfuInstantiateParameter: VFUInstantiateParameter) @@ -233,11 +234,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ @public val laneRequest: DecoupledIO[LaneRequest] = IO(Flipped(Decoupled(new LaneRequest(parameter)))) - /** CSR Interface. TODO: merge to [[laneRequest]] - */ - @public - val csrInterface: CSRInterface = IO(Input(new CSRInterface(parameter.vlMaxBits))) - @public val maskUnitRequest: ValidIO[MaskUnitExeReq] = IO(Valid(new MaskUnitExeReq(parameter))) @@ -280,7 +276,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ /** V0 update in the lane should also update [[T1.v0]] */ @public - val v0Update: ValidIO[V0Update] = IO(Valid(new V0Update(parameter))) + val v0Update: ValidIO[V0Update] = IO(Valid(new V0Update(parameter.datapathWidth, parameter.vrfOffsetBits))) /** input of mask data */ @public @@ -319,27 +315,14 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ @public val laneProbe = IO(Output(Probe(new LaneProbe(parameter), layers.Verification))) - @public - val vrfAllocateIssue: Bool = IO(Output(Bool())) - // TODO: remove dontTouch(writeBusPort) + val csrInterface: CSRInterface = laneRequest.bits.csrInterface /** VRF instantces. */ val vrf: Instance[VRF] = Instantiate(new VRF(parameter.vrfParam)) omInstance.vrfIn := Property(vrf.om.asAnyClassType) - /** TODO: review later - */ - val maskGroupedOrR: UInt = VecInit( - maskInput.asBools - .grouped(parameter.dataPathByteWidth) - .toSeq - .map( - VecInit(_).asUInt.orR - ) - ).asUInt - val fullMask: UInt = (-1.S(parameter.datapathWidth.W)).asUInt /** the slot is occupied by instruction */ @@ -353,9 +336,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val maskIndexVec: Vec[UInt] = RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(log2Ceil(parameter.maskGroupWidth).W)))) - /** the find first one index register in this lane. */ - val ffoIndexReg: UInt = RegInit(0.U(log2Ceil(parameter.vLen / 8).W)) - /** result of reduce instruction. */ val reduceResult: UInt = RegInit(0.U(parameter.datapathWidth.W)) @@ -363,7 +343,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ */ val vrfWriteArbiter: Vec[DecoupledIO[VRFWriteRequest]] = Wire( Vec( - parameter.chainingSize + 2, + parameter.chainingSize + 1, Decoupled( new VRFWriteRequest( parameter.vrfParam.regNumBits, @@ -375,31 +355,76 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) ) - val lsuWriteQueue: QueueIO[VRFWriteRequest] = Queue.io(vrfWriteType, 1, flow = true) - // connect lsuWriteQueue.enq - lsuWriteQueue.enq.valid := vrfWriteChannel.valid && !writeFromMask - lsuWriteQueue.enq.bits := vrfWriteChannel.bits - vrfWriteChannel.ready := writeFromMask || lsuWriteQueue.enq.ready + vrfWriteArbiter(parameter.chainingSize).valid := vrfWriteChannel.valid + vrfWriteArbiter(parameter.chainingSize).bits := vrfWriteChannel.bits + vrfWriteChannel.ready := vrfWriteArbiter(parameter.chainingSize).ready - val maskWriteQueue: QueueIO[VRFWriteRequest] = Queue.io(vrfWriteType, parameter.maskUnitVefWriteQueueSize) - // connect maskWriteQueue.enq - maskWriteQueue.enq.valid := vrfWriteChannel.valid && writeFromMask - maskWriteQueue.enq.bits := vrfWriteChannel.bits + val allVrfWriteAfterCheck: Seq[VRFWriteRequest] = Seq.tabulate(parameter.chainingSize + 3) { i => + RegInit(0.U.asTypeOf(vrfWriteArbiter.head.bits)) + } + val afterCheckValid: Seq[Bool] = Seq.tabulate(parameter.chainingSize + 3) { _ => RegInit(false.B) } + val afterCheckDequeueReady: Vec[Bool] = Wire(Vec(parameter.chainingSize + 3, Bool())) + val afterCheckDequeueFire: Seq[Bool] = afterCheckValid.zip(afterCheckDequeueReady).map { case (v, r) => v && r } - vrfWriteArbiter(parameter.chainingSize).valid := lsuWriteQueue.deq.valid - vrfWriteArbiter(parameter.chainingSize).bits := lsuWriteQueue.deq.bits - lsuWriteQueue.deq.ready := vrfWriteArbiter(parameter.chainingSize).ready + // todo: mv to bundle.scala + class MaskControl(parameter: LaneParameter) extends Bundle { + val index: UInt = UInt(parameter.instructionIndexBits.W) + val sew: UInt = UInt(2.W) + val maskData: UInt = UInt(parameter.datapathWidth.W) + val group: UInt = UInt(parameter.maskGroupSizeBits.W) + val dataValid: Bool = Bool() + val waiteResponse: Bool = Bool() + val controlValid: Bool = Bool() + } - vrfWriteArbiter(parameter.chainingSize + 1).valid := maskWriteQueue.deq.valid - vrfWriteArbiter(parameter.chainingSize + 1).bits := maskWriteQueue.deq.bits - maskWriteQueue.deq.ready := vrfWriteArbiter(parameter.chainingSize + 1).ready + val maskControlRelease: Vec[ValidIO[UInt]] = + Wire(Vec(parameter.chainingSize, Valid(UInt(parameter.instructionIndexBits.W)))) + + val maskControlEnq: UInt = Wire(UInt(parameter.chainingSize.W)) + val maskControlDataDeq: UInt = Wire(UInt(parameter.chainingSize.W)) + val maskControlReq: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) + val maskControlReqSelect: UInt = ffo(maskControlReq.asUInt) + // mask request & response handle + val maskControlVec: Seq[MaskControl] = Seq.tabulate(parameter.chainingSize) { index => + val state = RegInit(0.U.asTypeOf(new MaskControl(parameter))) + val releaseHit: Bool = maskControlRelease.map(r => r.valid && (r.bits === state.index)).reduce(_ || _) + val responseFire = + Pipe(maskControlReqSelect(index), 0.U.asTypeOf(new EmptyBundle), parameter.maskRequestLatency).valid + + when(maskControlEnq(index)) { + state := 0.U.asTypeOf(state) + state.index := laneRequest.bits.instructionIndex + state.sew := laneRequest.bits.csrInterface.vSew + state.controlValid := true.B + } - val allVrfWriteAfterCheck: Seq[VRFWriteRequest] = Seq.tabulate(parameter.chainingSize + 4) { i => - RegInit(0.U.asTypeOf(vrfWriteArbiter.head.bits)) + when(state.controlValid) { + when(releaseHit) { + state.controlValid := false.B + } + } + + maskControlReq(index) := state.controlValid && !state.dataValid && !state.waiteResponse + when(maskControlReqSelect(index)) { + state.waiteResponse := true.B + state.group := state.group + 1.U + } + + when(responseFire) { + state.dataValid := true.B + state.waiteResponse := false.B + state.maskData := maskInput + } + + when(maskControlDataDeq(index)) { + state.dataValid := false.B + } + + state } - val afterCheckValid: Seq[Bool] = Seq.tabulate(parameter.chainingSize + 4) { _ => RegInit(false.B) } - val afterCheckDequeueReady: Vec[Bool] = Wire(Vec(parameter.chainingSize + 4, Bool())) - val afterCheckDequeueFire: Seq[Bool] = afterCheckValid.zip(afterCheckDequeueReady).map { case (v, r) => v && r } + val maskControlFree: Seq[Bool] = maskControlVec.map(s => !s.controlValid && !s.waiteResponse) + val freeSelect: UInt = ffo(VecInit(maskControlFree).asUInt) + maskControlEnq := maskAnd(laneRequest.fire && laneRequest.bits.mask, freeSelect) /** for each slot, assert when it is asking [[T1]] to change mask */ val slotMaskRequestVec: Vec[ValidIO[UInt]] = Wire( @@ -410,7 +435,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) /** which slot wins the arbitration for requesting mask. */ - val maskRequestFireOH: UInt = Wire(UInt(parameter.chainingSize.W)) + val maskRequestFireOH: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) + val maskDataVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.maskGroupWidth.W))) /** FSM control for each slot. if index == 0, * - slot can support write v0 in mask type, see [[Decoder.maskDestination]] [[Decoder.maskSource]] @@ -515,8 +541,10 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // Overflow occurs val vxsatEnq: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt((2 * parameter.chainingSize).W))) + + val instructionFinishInSlot: UInt = Wire(UInt((2 * parameter.chainingSize).W)) // vxsatEnq and instructionFinished cannot happen at the same time - vxsatResult := (vxsatEnq.reduce(_ | _) | vxsatResult) & (~instructionFinished).asUInt + vxsatResult := (vxsatEnq.reduce(_ | _) | vxsatResult) & (~instructionFinishInSlot).asUInt /** assert when a instruction will not use mask unit */ val instructionUnrelatedMaskUnitVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.chainingSize.W))) @@ -555,7 +583,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ record.laneRequest.decodeResult(Decoder.maskLogic) /** onehot value of SEW. */ - val vSew1H: UInt = UIntToOH(record.csr.vSew)(2, 0) + val vSew1H: UInt = UIntToOH(record.laneRequest.csrInterface.vSew)(2, 0) /** if asserted, the element won't be executed. adc: vm = 0; madc: vm = 0 -> s0 + s1 + c, vm = 1 -> s0 + s1 */ @@ -598,7 +626,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ maskUnitRequest <> mask.maskReq maskRequestToLSU <> mask.maskRequestToLSU tokenIO <> mask.tokenIO - tokenIO.maskResponseRelease := maskWriteQueue.deq.fire mask.dequeue }.getOrElse(stage3EnqWire) stage3.enqueue <> stage3EnqSelect @@ -611,7 +638,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ laneState.lastGroupForInstruction := record.lastGroupForInstruction laneState.isLastLaneForInstruction := record.isLastLaneForInstruction laneState.instructionFinished := record.instructionFinished - laneState.csr := record.csr + laneState.csr := record.laneRequest.csrInterface laneState.maskType := record.laneRequest.mask laneState.maskNotMaskedElement := !record.laneRequest.mask || record.laneRequest.decodeResult(Decoder.maskSource) || @@ -634,13 +661,16 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ laneState.elements.get(k).foreach(stateData => d := stateData) } + maskControlRelease(index).valid := false.B + maskControlRelease(index).bits := record.laneRequest.instructionIndex // update lane state when(stage0.enqueue.fire) { maskGroupCountVec(index) := stage0.updateLaneState.maskGroupCount // todo: handle all elements in first group are masked maskIndexVec(index) := stage0.updateLaneState.maskIndex when(stage0.updateLaneState.outOfExecutionRange) { - slotOccupied(index) := false.B + slotOccupied(index) := false.B + maskControlRelease(index).valid := true.B } } @@ -655,7 +685,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val maskFailure: Bool = stage0.updateLaneState.maskExhausted && stage0.enqueue.fire // update mask register when(maskUpdateFire) { - record.mask.bits := maskInput + record.mask.bits := maskDataVec(index) } when(maskUpdateFire ^ maskFailure) { record.mask.valid := maskUpdateFire @@ -853,7 +883,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // It’s been a long time since I selected it. Need pipe val queueBeforeMaskWrite: QueueIO[VRFWriteRequest] = Queue.io(chiselTypeOf(maskedWriteUnit.enqueue.bits), entries = 1, pipe = true) - val writeSelect: UInt = Wire(UInt((parameter.chainingSize + 4).W)) + val writeSelect: UInt = Wire(UInt((parameter.chainingSize + 3).W)) val writeCavitation: UInt = VecInit(allVrfWriteAfterCheck.map(_.mask === 0.U)).asUInt // 处理 rf @@ -931,20 +961,21 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ } { - // 处理mask的请求 - val maskSelectArbitrator = ffo( - VecInit(slotMaskRequestVec.map(_.valid)).asUInt ## - (laneRequest.valid && (laneRequest.bits.mask || laneRequest.bits.decodeResult(Decoder.maskSource))) - ) - maskRequestFireOH := maskSelectArbitrator(parameter.chainingSize, 1) - maskSelect := Mux1H( - maskSelectArbitrator, - 0.U.asTypeOf(slotMaskRequestVec.head.bits) +: slotMaskRequestVec.map(_.bits) - ) - maskSelectSew := Mux1H( - maskSelectArbitrator, - csrInterface.vSew +: slotControl.map(_.csr.vSew) - ) + maskSelect := Mux1H(maskControlReqSelect, maskControlVec.map(_.group)) + maskSelectSew := Mux1H(maskControlReqSelect, maskControlVec.map(_.sew)) + maskControlDataDeq := slotMaskRequestVec.zipWithIndex.map { case (req, index) => + val slotIndex = slotControl(index).laneRequest.instructionIndex + val hitMaskControl = VecInit(maskControlVec.map(_.index === slotIndex)).asUInt + val dataValid = Mux1H(hitMaskControl, maskControlVec.map(_.dataValid)) + val data = Mux1H(hitMaskControl, maskControlVec.map(_.maskData)) + val group = Mux1H(hitMaskControl, maskControlVec.map(_.group)) + val sameGroup = group === req.bits + dontTouch(sameGroup) + val maskRequestFire = req.valid && dataValid + maskRequestFireOH(index) := maskRequestFire + maskDataVec(index) := data + maskAnd(maskRequestFire, hitMaskControl).asUInt + }.reduce(_ | _) } // package a control logic for incoming instruction. @@ -954,8 +985,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val maskLogicCompleted: Bool = laneRequest.bits.decodeResult(Decoder.maskLogic) && (laneIndex ## 0.U(parameter.datapathWidthBits.W) >= csrInterface.vl) - // latch CSR from V - entranceControl.csr := csrInterface entranceControl.laneRequest := laneRequest.bits // TODO: in scalar core, raise illegal instruction exception when vstart is nonzero. @@ -969,9 +998,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // for 'nr' type instructions, they will need another complete signal. !(laneRequest.bits.decodeResult(Decoder.nr) || laneRequest.bits.lsWholeReg) // indicate if this is the mask type. - entranceControl.mask.valid := laneRequest.bits.mask + entranceControl.mask.valid := false.B // assign mask from [[V]] - entranceControl.mask.bits := maskInput + entranceControl.mask.bits := DontCare // mask used for VRF write in this group. entranceControl.vrfWriteMask := 0.U @@ -1057,7 +1086,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // enqueue from lane request if (slotIndex == parameter.chainingSize - 1) { enqueueValid := laneRequest.valid - enqueueReady := slotShiftValid(slotIndex) && vrf.instructionWriteReport.ready + enqueueReady := slotShiftValid(slotIndex) when(enqueueFire) { slotControl(slotIndex) := entranceControl maskGroupCountVec(slotIndex) := 0.U(parameter.maskGroupSizeBits.W) @@ -1087,7 +1116,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // handshake // @todo @Clo91eaf lane can take request from Sequencer - laneRequest.ready := slotFree && vrf.instructionWriteReport.ready + laneRequest.ready := slotFree val instructionFinishAndNotReportByTop: Bool = entranceControl.instructionFinished && !laneRequest.bits.decodeResult(Decoder.readOnly) && (writeCount === 0.U) @@ -1119,7 +1148,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ vrf.instructionWriteReport.bits.state.wLaneLastReport := !laneRequest.valid vrf.instructionWriteReport.bits.state.wTopLastReport := !laneRequest.bits.decodeResult(Decoder.maskUnit) vrf.instructionWriteReport.bits.state.wLaneClear := false.B - vrfAllocateIssue := vrf.vrfAllocateIssue val elementSizeForOneRegister: Int = parameter.vLen / parameter.datapathWidth / parameter.laneNumber val nrMask: UInt = VecInit(Seq.tabulate(8) { i => @@ -1153,17 +1181,23 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ vrf.instructionWriteReport.bits.elementMask := selectMask + instructionFinishInSlot := (~instructionValid).asUInt & instructionValidNext + + val emptyInstValid: Bool = RegNext(laneRequest.bits.issueInst && !vrf.instructionWriteReport.valid, false.B) + val emptyInstCount: UInt = RegNext(indexToOH(laneRequest.bits.instructionIndex, parameter.chainingSize)) + val emptyReport: UInt = maskAnd(emptyInstValid, emptyInstCount).asUInt + // clear record by instructionFinished - vrf.instructionLastReport := instructionFinished + vrf.instructionLastReport := instructionFinishInSlot vrf.lsuLastReport := lsuLastReport vrf.loadDataInLSUWriteQueue := loadDataInLSUWriteQueue vrf.dataInLane := instructionValid - instructionFinished := (~instructionValid).asUInt & instructionValidNext + instructionFinished := vrf.vrfSlotRelease | emptyReport writeReadyForLsu := vrf.writeReadyForLsu vrfReadyToStore := vrf.vrfReadyToStore tokenManager.crossWriteReports.zipWithIndex.foreach { case (rpt, rptIndex) => - rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 2 + rptIndex) - rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 2 + rptIndex).instructionIndex + rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 1 + rptIndex) + rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1 + rptIndex).instructionIndex } // todo: add mask unit write token tokenManager.responseReport.valid := maskUnitRequest.valid @@ -1199,13 +1233,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ tokenManager.topWriteEnq.valid := vrfWriteChannel.fire tokenManager.topWriteEnq.bits := vrfWriteChannel.bits.instructionIndex - tokenManager.fromMask := writeFromMask - - tokenManager.lsuWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize) - tokenManager.lsuWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex - tokenManager.maskWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize + 1) - tokenManager.maskWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1).instructionIndex + tokenManager.topWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize) + tokenManager.topWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex tokenManager.maskUnitLastReport := lsuLastReport diff --git a/t1/src/T1.scala b/t1/src/T1.scala index f739c885c..dc76eb784 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -66,10 +66,10 @@ class T1OM(parameter: T1Parameter) extends GeneralOM[T1Parameter, T1](parameter) val decoderIn = IO(Input(Property[AnyClassType]())) decoder := decoderIn - val permutatuon = IO(Output(Property[AnyClassType]())) + val permutation = IO(Output(Property[AnyClassType]())) @public - val permutatuonIn = IO(Input(Property[AnyClassType]())) - permutatuon := permutatuonIn + val permutationIn = IO(Input(Property[AnyClassType]())) + permutation := permutationIn } object T1Parameter { @@ -252,6 +252,19 @@ case class T1Parameter( // and the values are their respective delays. val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1)) + val laneRequestTokenSize: Int = 4 + val laneRequestShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + + val maskUnitReadTokenSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 4) + val maskUnitReadShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + + val lsuReadTokenSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 4) + val lsuReadShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + + val maskRequestLatency = 2 + + val releaseShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions) /** paraemter for AXI4. */ @@ -290,6 +303,7 @@ case class T1Parameter( crossLaneVRFWriteEscapeQueueSize = vrfWriteQueueSize, fpuEnable = fpuEnable, portFactor = vrfBankSize, + maskRequestLatency = 2 * maskRequestLatency, vrfRamType = vrfRamType, decoderParam = decoderParam, vfuInstantiateParameter = vfuInstantiateParameter @@ -391,7 +405,7 @@ class T1(val parameter: T1Parameter) val maskUnit: Instance[MaskUnit] = Instantiate(new MaskUnit(parameter)) maskUnit.io.clock := implicitClock maskUnit.io.reset := implicitReset - omInstance.permutatuonIn := Property(maskUnit.io.om.asAnyClassType) + omInstance.permutationIn := Property(maskUnit.io.om.asAnyClassType) val tokenManager: Instance[T1TokenManager] = Instantiate(new T1TokenManager(parameter)) @@ -624,9 +638,21 @@ class T1(val parameter: T1Parameter) control } - /** lane is ready to receive new instruction. */ - val laneReady: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - val allLaneReady: Bool = laneReady.asUInt.andR + // Close to top + val laneRequestSourceWire: Vec[DecoupledIO[LaneRequest]] = Wire( + Vec(parameter.laneNumber, Decoupled(new LaneRequest(parameter.laneParam))) + ) + // Close to lane + val laneRequestSinkWire: Vec[DecoupledIO[LaneRequest]] = Wire( + Vec(parameter.laneNumber, Decoupled(new LaneRequest(parameter.laneParam))) + ) + + laneRequestSourceWire.zipWithIndex.foreach { case (source, index) => + val sink = laneRequestSinkWire(index) + connectDecoupledWithShifter(parameter.laneRequestShifterSize(index), parameter.laneRequestTokenSize)(source, sink) + } + + val allLaneReady: Bool = VecInit(laneRequestSourceWire.map(_.ready)).asUInt.andR // TODO: review later // todo: 把scheduler的反馈也加上,lsu有更高的优先级 @@ -635,25 +661,18 @@ class T1(val parameter: T1Parameter) val completeIndexInstruction: Bool = ohCheck(lsu.lastReport, slots.last.record.instructionIndex, parameter.chainingSize) && !slots.last.state.idle - val vrfWrite: Vec[DecoupledIO[VRFWriteRequest]] = Wire( - Vec( - parameter.laneNumber, - Decoupled( - new VRFWriteRequest( - parameter.vrfParam.regNumBits, - parameter.vrfParam.vrfOffsetBits, - parameter.instructionIndexBits, - parameter.datapathWidth - ) - ) - ) - ) - val freeOR: Bool = VecInit(slots.map(_.state.idle)).asUInt.orR /** slot is ready to accept new instructions. */ val slotReady: Bool = Mux(specialInstruction, slots.map(_.state.idle).last, freeOR) + val olderCheck: Bool = slots.map { re => + // The same lsb will make it difficult to distinguish between the new and the old + val notSameLSB: Bool = re.record.instructionIndex(parameter.instructionIndexBits - 2, 0) =/= + requestReg.bits.instructionIndex(parameter.instructionIndexBits - 2, 0) + re.state.idle || (instIndexL(re.record.instructionIndex, requestReg.bits.instructionIndex) && notSameLSB) + }.reduce(_ && _) + val source1Select: UInt = Mux( decodeResult(Decoder.gather), @@ -688,92 +707,89 @@ class T1(val parameter: T1Parameter) requestReg.bits.issue.vl ) - /** instantiate lanes. TODO: move instantiate to top of class. - */ - val laneVec: Seq[Instance[Lane]] = Seq.tabulate(parameter.laneNumber) { index => - val lane: Instance[Lane] = Instantiate(new Lane(parameter.laneParam)) - // lane.laneRequest.valid -> requestRegDequeue.ready -> lane.laneRequest.ready -> lane.laneRequest.bits - // TODO: this is harmful for PnR design, since it broadcast ready singal to each lanes, which will significantly - // reduce the scalability for large number of lanes. - lane.laneRequest.valid := requestRegDequeue.fire && !noOffsetReadLoadStore && !maskUnitInstruction + laneRequestSourceWire.foreach { request => + request.valid := requestRegDequeue.fire // hard wire - lane.laneRequest.bits.instructionIndex := requestReg.bits.instructionIndex - lane.laneRequest.bits.decodeResult := decodeResult - lane.laneRequest.bits.vs1 := requestRegDequeue.bits.instruction(19, 15) - lane.laneRequest.bits.vs2 := requestRegDequeue.bits.instruction(24, 20) - lane.laneRequest.bits.vd := requestRegDequeue.bits.instruction(11, 7) - lane.laneRequest.bits.segment := Mux( + request.bits.instructionIndex := requestReg.bits.instructionIndex + request.bits.decodeResult := decodeResult + request.bits.vs1 := requestRegDequeue.bits.instruction(19, 15) + request.bits.vs2 := requestRegDequeue.bits.instruction(24, 20) + request.bits.vd := requestRegDequeue.bits.instruction(11, 7) + request.bits.segment := Mux( decodeResult(Decoder.nr), requestRegDequeue.bits.instruction(17, 15), requestRegDequeue.bits.instruction(31, 29) ) - lane.laneRequest.bits.loadStoreEEW := requestRegDequeue.bits.instruction(13, 12) + request.bits.loadStoreEEW := requestRegDequeue.bits.instruction(13, 12) // if the instruction is vi and vx type of gather, gather from rs2 with mask VRF read channel from one lane, // and broadcast to all lanes. - lane.laneRequest.bits.readFromScalar := source1Select + request.bits.readFromScalar := source1Select - lane.laneRequest.bits.issueInst := requestRegDequeue.fire - lane.laneRequest.bits.loadStore := isLoadStoreType + request.bits.issueInst := !noOffsetReadLoadStore && !maskUnitInstruction + request.bits.loadStore := isLoadStoreType // let record in VRF to know there is a store instruction. - lane.laneRequest.bits.store := isStoreType + request.bits.store := isStoreType // let lane know if this is a special instruction, which need group-level synchronization between lane and [[V]] - lane.laneRequest.bits.special := specialInstruction - lane.laneRequest.bits.lsWholeReg := lsWholeReg + request.bits.special := specialInstruction + request.bits.lsWholeReg := lsWholeReg // mask type instruction. - lane.laneRequest.bits.mask := maskType - laneReady(index) := lane.laneRequest.ready + request.bits.mask := maskType - lane.csrInterface := requestRegCSR + // connect csrInterface + request.bits.csrInterface := requestRegCSR // index type EEW Decoded in the instruction - lane.csrInterface.vSew := vSewSelect - lane.csrInterface.vl := evlForLane - lane.laneIndex := index.U - - // lsu 优先会有死锁: - // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写 - // vse32.v v1, (a0) -> 依赖上一条,但是会先发出read - - // Mask priority will also be - // vse32.v v19, (a0) - // vfslide1down.vf v19, v10, x1 - val maskUnitFirst = RegInit(false.B) - val tryToRead = lsu.vrfReadDataPorts(index).valid || maskUnit.io.readChannel(index).valid - when(tryToRead && !lane.vrfReadAddressChannel.fire) { - maskUnitFirst := !maskUnitFirst - } - lane.vrfReadAddressChannel.valid := Mux( - maskUnitFirst, - maskUnit.io.readChannel(index).valid, - lsu.vrfReadDataPorts(index).valid + request.bits.csrInterface.vSew := vSewSelect + request.bits.csrInterface.vl := evlForLane + } + + /** instantiate lanes. TODO: move instantiate to top of class. + */ + val laneVec: Seq[Instance[Lane]] = Seq.tabulate(parameter.laneNumber) { index => + val lane: Instance[Lane] = Instantiate(new Lane(parameter.laneParam)) + lane.laneRequest.valid := laneRequestSinkWire(index).valid && laneRequestSinkWire(index).bits.issueInst + lane.laneRequest.bits := laneRequestSinkWire(index).bits + lane.laneRequest.bits.issueInst := laneRequestSinkWire(index).fire + laneRequestSinkWire(index).ready := !laneRequestSinkWire(index).bits.issueInst || lane.laneRequest.ready + + lane.laneIndex := index.U + + connectVrfAccess( + Seq(parameter.maskUnitReadShifterSize(index), parameter.lsuReadShifterSize(index)), + Seq(parameter.maskUnitReadTokenSize(index), parameter.lsuReadTokenSize(index)), + Some(parameter.vrfReadLatency) + )( + VecInit(Seq(maskUnit.io.readChannel(index), lsu.vrfReadDataPorts(index))), + lane.vrfReadAddressChannel, + 0, + Some(lane.vrfReadDataChannel), + Some(Seq(maskUnit.io.readResult(index), lsu.vrfReadResults(index))) + ) + + connectVrfAccess( + Seq(parameter.maskUnitReadShifterSize(index), parameter.lsuReadShifterSize(index)), + Seq(parameter.maskUnitReadTokenSize(index), parameter.lsuReadTokenSize(index)) + )( + VecInit(Seq(maskUnit.io.exeResp(index), lsu.vrfWritePort(index))), + lane.vrfWriteChannel, + 0 ) - lane.vrfReadAddressChannel.bits := - Mux(maskUnitFirst, maskUnit.io.readChannel(index).bits, lsu.vrfReadDataPorts(index).bits) - lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnitFirst - maskUnit.io.readChannel(index).ready := lane.vrfReadAddressChannel.ready && maskUnitFirst - maskUnit.io.readResult(index) := lane.vrfReadDataChannel - lsu.vrfReadResults(index) := lane.vrfReadDataChannel - - val maskTryToWrite = maskUnit.io.exeResp(index) - // lsu & mask unit write lane - // Mask write has absolute priority because it has a token - lane.vrfWriteChannel.valid := vrfWrite(index).valid || maskTryToWrite.valid - lane.vrfWriteChannel.bits := Mux(maskTryToWrite.valid, maskTryToWrite.bits, vrfWrite(index).bits) - vrfWrite(index).ready := lane.vrfWriteChannel.ready && !maskTryToWrite.valid - lane.writeFromMask := maskTryToWrite.valid + lane.writeFromMask := maskUnit.io.exeResp(index).fire lsu.offsetReadResult(index).valid := lane.maskUnitRequest.valid && lane.maskRequestToLSU lsu.offsetReadResult(index).bits := lane.maskUnitRequest.bits.source2 lsu.offsetReadIndex(index) := lane.maskUnitRequest.bits.index + val instructionFinishedPipe = Pipe(true.B, lane.instructionFinished, parameter.releaseShifterSize(index)).bits instructionFinished(index).zip(slots.map(_.record.instructionIndex)).foreach { case (d, f) => - d := ohCheck(lane.instructionFinished, f, parameter.chainingSize) + d := ohCheck(instructionFinishedPipe, f, parameter.chainingSize) } - vxsatReportVec(index) := lane.vxsatReport - lane.maskInput := maskUnit.io.laneMaskInput(index) - maskUnit.io.laneMaskSelect(index) := lane.maskSelect - maskUnit.io.laneMaskSewSelect(index) := lane.maskSelectSew + vxsatReportVec(index) := lane.vxsatReport + lane.maskInput := Pipe(true.B, maskUnit.io.laneMaskInput(index), parameter.maskRequestLatency).bits + maskUnit.io.laneMaskSelect(index) := Pipe(true.B, lane.maskSelect, parameter.maskRequestLatency).bits + maskUnit.io.laneMaskSewSelect(index) := Pipe(true.B, lane.maskSelectSew, parameter.maskRequestLatency).bits maskUnit.io.v0UpdateVec(index) <> lane.v0Update + lsu.v0UpdateVec(index) <> lane.v0Update lane.lsuLastReport := lsu.lastReport | maskUnit.io.lastReport @@ -785,7 +801,7 @@ class T1(val parameter: T1Parameter) (requestReg.bits.writeByte(rowWith - 1, 0) > ((parameter.datapathWidth / 8) * index).U) // token manager - tokenManager.instructionFinish(index) := lane.instructionFinished + tokenManager.instructionFinish(index) := instructionFinishedPipe lane } @@ -808,12 +824,10 @@ class T1(val parameter: T1Parameter) lsu.request.bits.instructionInformation.isStore := isStoreType lsu.request.bits.instructionInformation.maskedLoadStore := maskType - maskUnit.io.lsuMaskSelect := lsu.maskSelect - lsu.maskInput := maskUnit.io.lsuMaskInput - lsu.csrInterface := requestRegCSR - lsu.csrInterface.vl := evlForLsu - lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR - lsu.vrfReadyToStore := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR + lsu.csrInterface := requestRegCSR + lsu.csrInterface.vl := evlForLsu + lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR + lsu.vrfReadyToStore := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR // connect mask unit maskUnit.io.instReq.valid := requestRegDequeue.fire && requestReg.bits.decodeResult(Decoder.maskUnit) @@ -838,7 +852,6 @@ class T1(val parameter: T1Parameter) } maskUnit.io.tokenIO.zip(laneVec).zipWithIndex.foreach { case ((token, lane), index) => - token.maskResponseRelease := lane.tokenIO.maskResponseRelease lane.tokenIO.maskRequestRelease := token.maskRequestRelease || lsu.tokenIO.offsetGroupRelease(index) } @@ -875,8 +888,6 @@ class T1(val parameter: T1Parameter) io.highBandwidthLoadStorePort <> lsu.axi4Port io.indexedLoadStorePort <> lsu.simpleAccessPorts - // 暂时直接连lsu的写,后续需要处理scheduler的写 - vrfWrite.zip(lsu.vrfWritePort).foreach { case (sink, source) => sink <> source } /** Slot has free entries. */ val free = VecInit(slots.map(_.state.idle)).asUInt @@ -898,7 +909,6 @@ class T1(val parameter: T1Parameter) /** for lsu instruction lsu is ready, for normal instructions, lanes are ready. */ val executionReady: Bool = (!isLoadStoreType || lsu.request.ready) && (noOffsetReadLoadStore || allLaneReady) - val vrfAllocate: Bool = VecInit(laneVec.map(_.vrfAllocateIssue)).asUInt.andR // - ready to issue instruction // - for vi and vx type of gather, it need to access vs2 for one time, we read vs2 firstly in `gatherReadFinish` // and convert it to mv instruction. @@ -907,7 +917,7 @@ class T1(val parameter: T1Parameter) // we detect the hazard and decide should we issue this slide or // issue the instruction after the slide which already in the slot. requestRegDequeue.ready := executionReady && slotReady && (!gatherNeedRead || maskUnit.io.gatherData.valid) && - tokenManager.issueAllow && instructionIndexFree && vrfAllocate + tokenManager.issueAllow && instructionIndexFree && olderCheck instructionToSlotOH := Mux(requestRegDequeue.fire, slotToEnqueue, 0.U) @@ -963,13 +973,12 @@ class T1(val parameter: T1Parameter) probeWire.requestRegReady := requestRegDequeue.ready // maskUnitWrite maskUnitWriteReady probeWire.writeQueueEnqVec.zip(maskUnit.io.exeResp).foreach { case (probe, write) => - probe.valid := write.valid && write.bits.mask.orR + probe.valid := write.fire && write.bits.mask.orR probe.bits := write.bits.instructionIndex } - probeWire.instructionValid := maskAnd( - !slots.last.state.wMaskUnitLast && !slots.last.state.idle, - indexToOH(slots.last.record.instructionIndex, parameter.chainingSize) - ).asUInt + probeWire.instructionValid := slots + .map(s => maskAnd(!s.state.idle, indexToOH(s.record.instructionIndex, parameter.chainingSize)).asUInt) + .reduce(_ | _) probeWire.responseCounter := responseCounter probeWire.laneProbes.zip(laneVec).foreach { case (p, l) => p := probe.read(l.laneProbe) } probeWire.lsuProbe := probe.read(lsu.lsuProbe) diff --git a/t1/src/laneStage/MaskExchangeUnit.scala b/t1/src/laneStage/MaskExchangeUnit.scala index cb8257f83..49faff45f 100644 --- a/t1/src/laneStage/MaskExchangeUnit.scala +++ b/t1/src/laneStage/MaskExchangeUnit.scala @@ -54,8 +54,7 @@ class MaskExchangeUnit(parameter: LaneParameter) extends Module { val maskRequestEnqReady: Bool = !enqIsMaskRequest || maskRequestAllow - dequeue.valid := enqueue.valid && enqSendToDeq - dequeue.bits := enqueue.bits - enqueue.ready := Mux(enqSendToDeq, dequeue.ready, maskRequestEnqReady) - tokenIO.maskResponseRelease := DontCare + dequeue.valid := enqueue.valid && enqSendToDeq + dequeue.bits := enqueue.bits + enqueue.ready := Mux(enqSendToDeq, dequeue.ready, maskRequestEnqReady) } diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala index 22eef760b..dadea0fd5 100644 --- a/t1/src/laneStage/SlotTokenManager.scala +++ b/t1/src/laneStage/SlotTokenManager.scala @@ -94,13 +94,7 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val topWriteEnq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) @public - val fromMask: Bool = IO(Input(Bool())) - - @public - val lsuWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) - - @public - val maskWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) + val topWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) @public val instructionValid: UInt = IO(Output(UInt((2 * parameter.chainingSize).W))) @@ -212,27 +206,16 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val instructionInWritePipe: UInt = tokenUpdate(writePipeToken, writePipeEnq, writePipeDeq) // lsu & mask write token - val lsuWriteToken: Seq[UInt] = Seq.tabulate(2 * parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) - val maskWriteToken: Seq[UInt] = Seq.tabulate(2 * parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) + val topWriteToken: Seq[UInt] = Seq.tabulate(2 * parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) val topWriteDoEnq: UInt = maskAnd(topWriteEnq.valid, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt - val lsuWriteDoEnq: UInt = - maskAnd(topWriteEnq.valid && !fromMask, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt - - val maskWriteDoEnq: UInt = - maskAnd(topWriteEnq.valid && fromMask, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt - - val lsuWriteDoDeq: UInt = - maskAnd(lsuWriteDeq.valid, indexToOH(lsuWriteDeq.bits, parameter.chainingSize)).asUInt - - val maskWriteDoDeq: UInt = - maskAnd(maskWriteDeq.valid, indexToOH(maskWriteDeq.bits, parameter.chainingSize)).asUInt + val topWriteDoDeq: UInt = + maskAnd(topWriteDeq.valid, indexToOH(topWriteDeq.bits, parameter.chainingSize)).asUInt - val lsuInTopWrite = tokenUpdate(lsuWriteToken, lsuWriteDoEnq, lsuWriteDoDeq) - val maskInTopWrite = tokenUpdate(maskWriteToken, maskWriteDoEnq, maskWriteDoDeq) + val topWrite: UInt = tokenUpdate(topWriteToken, topWriteDoEnq, topWriteDoDeq) - dataInWritePipe := instructionInWritePipe | lsuInTopWrite | maskInTopWrite + dataInWritePipe := instructionInWritePipe | topWrite instructionValid := dataInWritePipe | instructionInSlot } diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala index 7c93e09ed..cd29f90be 100644 --- a/t1/src/lsu/LSU.scala +++ b/t1/src/lsu/LSU.scala @@ -118,15 +118,10 @@ class LSU(param: LSUParameter) extends Module { @public val request: DecoupledIO[LSURequest] = IO(Flipped(Decoupled(new LSURequest(param.datapathWidth)))) - /** mask from [[V]] TODO: since mask is one-cycle information for a mask group, we should latch it in the LSU, and - * reduce the IO width. this needs PnR information. - */ - @public - val maskInput: Vec[UInt] = IO(Input(Vec(param.lsuMSHRSize, UInt(param.maskGroupWidth.W)))) - - /** the address of the mask group in the [[V]]. */ @public - val maskSelect: Vec[UInt] = IO(Output(Vec(param.lsuMSHRSize, UInt(param.maskGroupSizeBits.W)))) + val v0UpdateVec: Vec[ValidIO[V0Update]] = IO( + Flipped(Vec(param.laneNumber, Valid(new V0Update(param.datapathWidth, param.vrfOffsetBits)))) + ) @public val axi4Port: AXI4RWIrrevocable = IO(new AXI4RWIrrevocable(param.axi4BundleParameter)) @@ -150,7 +145,7 @@ class LSU(param: LSUParameter) extends Module { /** hard wire form Top. TODO: merge to [[vrfReadDataPorts]] */ @public - val vrfReadResults: Vec[UInt] = IO(Input(Vec(param.laneNumber, UInt(param.datapathWidth.W)))) + val vrfReadResults: Vec[ValidIO[UInt]] = IO(Vec(param.laneNumber, Flipped(Valid(UInt(param.datapathWidth.W))))) /** write channel to [[V]], which will redirect it to [[Lane.vrf]]. */ @public @@ -197,6 +192,25 @@ class LSU(param: LSUParameter) extends Module { val storeUnit: StoreUnit = Module(new StoreUnit(param.mshrParam)) val otherUnit: SimpleAccessUnit = Module(new SimpleAccessUnit(param.mshrParam)) + /** duplicate v0 in lsu */ + val v0: Vec[UInt] = RegInit( + VecInit(Seq.fill(param.vLen / param.datapathWidth)(0.U(param.datapathWidth.W))) + ) + + // write v0(mask) + v0.zipWithIndex.foreach { case (data, index) => + // 属于哪个lane + val laneIndex: Int = index % param.laneNumber + // 取出写的端口 + val v0Write = v0UpdateVec(laneIndex) + // offset + val offset: Int = index / param.laneNumber + val maskExt = FillInterleaved(8, v0Write.bits.mask) + when(v0Write.valid && v0Write.bits.offset === offset.U) { + data := (data & (~maskExt).asUInt) | (maskExt & v0Write.bits.data) + } + } + val unitVec = Seq(loadUnit, storeUnit, otherUnit) /** Always merge into cache line */ @@ -222,8 +236,8 @@ class LSU(param: LSUParameter) extends Module { mshr.lsuRequest.valid := reqEnq(index) mshr.lsuRequest.bits := request.bits - maskSelect(index) := Mux(mshr.maskSelect.valid, mshr.maskSelect.bits, 0.U) - mshr.maskInput := maskInput(index) + val maskSelect = Mux(mshr.maskSelect.valid, mshr.maskSelect.bits, 0.U) + mshr.maskInput := cutUInt(v0.asUInt, param.maskGroupWidth)(maskSelect) // broadcast CSR mshr.csrInterface := csrInterface @@ -250,7 +264,8 @@ class LSU(param: LSUParameter) extends Module { otherUnit.vrfReadDataPorts.ready := (otherTryReadVrf & VecInit(vrfReadDataPorts.map(_.ready)).asUInt).orR val pipeOtherRead: ValidIO[UInt] = Pipe(otherUnit.vrfReadDataPorts.fire, otherUnit.status.targetLane, param.vrfReadLatency) - otherUnit.vrfReadResults.bits := Mux1H(pipeOtherRead.bits, vrfReadResults) + // todo: read data reorder + otherUnit.vrfReadResults.bits := Mux1H(pipeOtherRead.bits, vrfReadResults.map(_.bits)) otherUnit.vrfReadResults.valid := pipeOtherRead.valid // write vrf diff --git a/t1/src/lsu/SimpleAccessUnit.scala b/t1/src/lsu/SimpleAccessUnit.scala index d7409500b..bc517d5eb 100644 --- a/t1/src/lsu/SimpleAccessUnit.scala +++ b/t1/src/lsu/SimpleAccessUnit.scala @@ -120,6 +120,10 @@ case class MSHRParam( // outstanding of MaskExchangeUnit.maskReq // todo: param from T1Param val maskRequestQueueSize: Int = 8 + + // outstanding of StoreUnit.vrfReadDataPorts + // todo: param from T1Param + val storeUnitReadOutStanding: Int = 8 } /** Miss Status Handler Register this is used to record the outstanding memory access request for each instruction. it diff --git a/t1/src/lsu/StoreUnit.scala b/t1/src/lsu/StoreUnit.scala index 1fb72096b..00f261a4b 100644 --- a/t1/src/lsu/StoreUnit.scala +++ b/t1/src/lsu/StoreUnit.scala @@ -39,9 +39,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { /** hard wire form Top. see [[LSU.vrfReadResults]] */ @public - val vrfReadResults: Vec[UInt] = IO(Input(Vec(param.laneNumber, UInt(param.datapathWidth.W)))) + val vrfReadResults: Vec[ValidIO[UInt]] = IO(Input(Vec(param.laneNumber, Valid(UInt(param.datapathWidth.W))))) @public - val vrfReadyToStore: Bool = IO(Input(Bool())) + val vrfReadyToStore: Bool = IO(Input(Bool())) @public val storeResponse = IO(Input(Bool())) @@ -62,8 +62,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { // todo: need hazardCheck? val hazardCheck: Bool = RegEnable(vrfReadyToStore && !lsuRequest.valid, false.B, lsuRequest.valid || vrfReadyToStore) // read stage dequeue ready need all source valid, Or add a queue to coordinate - val vrfReadQueueVec: Seq[QueueIO[UInt]] = - Seq.tabulate(param.laneNumber)(_ => Queue.io(UInt(param.datapathWidth.W), 2, flow = true, pipe = true)) + val vrfReadQueueVec: Seq[QueueIO[UInt]] = Seq.tabulate(param.laneNumber)(_ => + Queue.io(UInt(param.datapathWidth.W), param.storeUnitReadOutStanding, flow = true, pipe = true) + ) // 从vrf里面读数据 val readStageValid: Bool = Seq @@ -72,9 +73,6 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { val segPtr: UInt = RegInit(0.U(3.W)) val readCount: UInt = RegInit(0.U(dataGroupBits.W)) val stageValid = RegInit(false.B) - // queue for read latency - val queue: QueueIO[UInt] = - Queue.io(UInt(param.datapathWidth.W), param.vrfReadLatency, flow = true) val lastReadPtr: Bool = segPtr === 0.U @@ -104,8 +102,14 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { readCount := nextReadCount } + val readCounter = RegInit(0.U(log2Ceil(param.storeUnitReadOutStanding + 1).W)) + val counterChange: UInt = Mux(readPort.fire, 1.U, -1.S(readCounter.getWidth.W).asUInt) + when(readPort.fire ^ vrfReadQueueVec(laneIndex).deq.fire) { + readCounter := readCounter + counterChange + } + // vrf read request - readPort.valid := stageValid && vrfReadQueueVec(laneIndex).enq.ready + readPort.valid := stageValid && !readCounter.asBools.last readPort.bits.vs := lsuRequestReg.instructionInformation.vs3 + segPtr * segmentInstructionIndexInterval + @@ -114,15 +118,11 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { readPort.bits.offset := readCount readPort.bits.instructionIndex := lsuRequestReg.instructionIndex - // pipe read fire - val readResultFire = Pipe(readPort.fire, 0.U.asTypeOf(new EmptyBundle), param.vrfReadLatency).valid - // latency queue enq - queue.enq.valid := readResultFire - queue.enq.bits := vrfReadResults(laneIndex) - AssertProperty(BoolSequence(!queue.enq.valid || queue.enq.ready)) - vrfReadQueueVec(laneIndex).enq <> queue.deq - stageValid || RegNext(readPort.fire) + AssertProperty(BoolSequence(!vrfReadQueueVec(laneIndex).enq.valid || vrfReadQueueVec(laneIndex).enq.ready)) + vrfReadQueueVec(laneIndex).enq.valid := vrfReadResults(laneIndex).valid + vrfReadQueueVec(laneIndex).enq.bits := vrfReadResults(laneIndex).bits + stageValid || readCounter.orR } .reduce(_ || _) diff --git a/t1/src/mask/BitLevelMaskWrite.scala b/t1/src/mask/BitLevelMaskWrite.scala index cefe31868..5bc73491e 100644 --- a/t1/src/mask/BitLevelMaskWrite.scala +++ b/t1/src/mask/BitLevelMaskWrite.scala @@ -43,8 +43,8 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module { ) } - val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => - IO(Input(UInt(parameter.datapathWidth.W))) + val readResult: Seq[ValidIO[UInt]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Valid(UInt(parameter.datapathWidth.W)))) } val stageClear: Bool = IO(Output(Bool())) @@ -52,7 +52,7 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module { val stageClearVec: Seq[Bool] = in.zipWithIndex.map { case (req, index) => val reqQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), 4) val readPort = readChannel(index) - val readData = readResult(index) + val readData = readResult(index).bits val res = out(index) val WaitReadQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), readVRFLatency) @@ -68,7 +68,7 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module { readPort.bits.vs := vd + (reqQueue.deq.bits.groupCounter >> readPort.bits.offset.getWidth).asUInt readPort.bits.offset := changeUIntSize(reqQueue.deq.bits.groupCounter, readPort.bits.offset.getWidth) - val readValidPipe = Pipe(readPort.fire, false.B, readVRFLatency).valid + val readValidPipe = Pipe(readPort.fire, false.B, readVRFLatency).valid && readResult(index).valid val readResultValid = !needWAR || readValidPipe val WARData = (WaitReadQueue.deq.bits.data & WaitReadQueue.deq.bits.bitMask) | diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala index 24f597259..03520db7a 100644 --- a/t1/src/mask/MaskCompress.scala +++ b/t1/src/mask/MaskCompress.scala @@ -4,9 +4,26 @@ package org.chipsalliance.t1.rtl import chisel3._ +import chisel3.experimental.hierarchy.{instantiable, Instance, Instantiate} +import chisel3.experimental.{SerializableModule, SerializableModuleParameter} +import chisel3.properties.{AnyClassType, Path, Property} import chisel3.util._ +import org.chipsalliance.stdlib.GeneralOM -class CompressInput(parameter: T1Parameter) extends Bundle { +case class CompressParam( + datapathWidth: Int, + xLen: Int, + vLen: Int, + laneNumber: Int, + groupNumberBits: Int, + latency: Int) + extends SerializableModuleParameter + +object CompressParam { + implicit def rwP = upickle.default.macroRW[CompressParam] +} + +class CompressInput(parameter: CompressParam) extends Bundle { val maskType: Bool = Bool() val eew: UInt = UInt(2.W) val uop: UInt = UInt(3.W) @@ -14,26 +31,53 @@ class CompressInput(parameter: T1Parameter) extends Bundle { val source1: UInt = UInt(parameter.datapathWidth.W) val mask: UInt = UInt(parameter.datapathWidth.W) val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) - val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) val ffoInput: UInt = UInt(parameter.laneNumber.W) val validInput: UInt = UInt(parameter.laneNumber.W) val lastCompress: Bool = Bool() } -class CompressOutput(parameter: T1Parameter) extends Bundle { +class CompressOutput(parameter: CompressParam) extends Bundle { val data: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) val mask: UInt = UInt((parameter.laneNumber * parameter.datapathWidth / 8).W) - val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) val ffoOutput: UInt = UInt(parameter.laneNumber.W) val compressValid: Bool = Bool() } -class MaskCompress(parameter: T1Parameter) extends Module { - val in: ValidIO[CompressInput] = IO(Flipped(Valid(new CompressInput(parameter)))) - val out: CompressOutput = IO(Output(new CompressOutput(parameter))) - val newInstruction: Bool = IO(Input(Bool())) - val ffoInstruction: Bool = IO(Input(Bool())) - val writeData: UInt = IO(Output(UInt(parameter.xLen.W))) +class MaskCompressInterFace(parameter: CompressParam) extends Bundle { + val clock = Input(Clock()) + val reset = Input(Reset()) + + val in: ValidIO[CompressInput] = Flipped(Valid(new CompressInput(parameter))) + val out: CompressOutput = Output(new CompressOutput(parameter)) + val newInstruction: Bool = Input(Bool()) + val ffoInstruction: Bool = Input(Bool()) + val writeData: UInt = Output(UInt(parameter.xLen.W)) + val om = Output(Property[AnyClassType]()) +} + +@instantiable +class MaskCompressOM(parameter: CompressParam) extends GeneralOM[CompressParam, MaskCompress](parameter) {} + +class MaskCompress(val parameter: CompressParam) + extends FixedIORawModule(new MaskCompressInterFace(parameter)) + with SerializableModule[CompressParam] + with ImplicitClock + with ImplicitReset { + + protected def implicitClock = io.clock + protected def implicitReset = io.reset + + val omInstance: Instance[MaskCompressOM] = Instantiate(new MaskCompressOM(parameter)) + io.om := omInstance.getPropertyReference + omInstance.retimeIn.foreach(_ := Property(Path(io.clock))) + + val in = io.in + val out = io.out + val newInstruction = io.newInstruction + val ffoInstruction = io.ffoInstruction + val writeData = io.writeData val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 @@ -44,11 +88,16 @@ class MaskCompress(parameter: T1Parameter) extends Module { val writeRD = in.bits.uop === BitPat("b?11") val ffoType = in.bits.uop === BitPat("b11?") + val outWire: CompressOutput = Wire(new CompressOutput(parameter)) + val eew1H: UInt = UIntToOH(in.bits.eew)(2, 0) val compressInit: UInt = RegInit(0.U(log2Ceil(parameter.vLen).W)) val compressVec: Vec[UInt] = Wire(Vec(maskSize, UInt(compressInit.getWidth.W))) - val compressMaskVec: Seq[Bool] = changeUIntSize(in.bits.source1 & in.bits.mask, maskSize).asBools - val compressCount: UInt = compressMaskVec.zipWithIndex.foldLeft(compressInit) { case (pre, (mask, index)) => + val maskInput: UInt = changeUIntSize(in.bits.source1 & in.bits.mask, maskSize) + val compressMaskVec: Vec[Bool] = VecInit(maskInput.asBools) + val compressCount: UInt = compressInit + PopCount(maskInput) + + compressMaskVec.zipWithIndex.foldLeft(compressInit) { case (pre, (mask, index)) => compressVec(index) := pre pre + mask } @@ -58,6 +107,7 @@ class MaskCompress(parameter: T1Parameter) extends Module { val ffoValid: Bool = RegInit(false.B) writeData := ffoIndex + // compress & viota stage 1: update compressInit when(newInstruction) { compressInit := 0.U } @@ -66,7 +116,7 @@ class MaskCompress(parameter: T1Parameter) extends Module { val dataByte = 1 << sewInt val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte val countWidth = log2Ceil(elementSizePerSet) - val compressDeqValid = (compressCount >> countWidth).asUInt.orR + val compressDeqValid = (compressCount >> countWidth).asUInt(0) val compressUpdate = changeUIntSize(compressCount, countWidth) (compressDeqValid, compressUpdate) } @@ -83,28 +133,43 @@ class MaskCompress(parameter: T1Parameter) extends Module { } } - val viotaResult: UInt = Mux1H( + // compress & viota stage 2: get result + // pipe stage1 result + def initRegEnable[T <: Data](data: T, enable: Bool) = { + RegEnable(data, 0.U.asTypeOf(data), enable) + } + val compressVecPipe: Vec[UInt] = initRegEnable(compressVec, in.fire) + val compressMaskVecPipe: Vec[Bool] = initRegEnable(compressMaskVec, in.fire) + val maskPipe: UInt = initRegEnable(in.bits.mask, in.fire) + val source2Pipe: UInt = initRegEnable(in.bits.source2, in.fire) + val lastCompressPipe: Bool = initRegEnable(in.bits.lastCompress, in.fire) + val stage2Valid: Bool = RegNext(in.fire, false.B) + val newInstructionPipe: Bool = RegNext(newInstruction, false.B) + val compressInitPipe: UInt = initRegEnable(compressInit, in.fire) + val compressDeqValidPipe: Bool = initRegEnable(compressDeqValid, in.fire) + val groupCounterPipe: UInt = initRegEnable(in.bits.groupCounter, in.fire) + val viotaResult: UInt = Mux1H( eew1H, Seq(1, 2, 4).map { eew => VecInit(Seq.tabulate(parameter.laneNumber) { index => // data width: eew * 8, data path 32, need [4 / eew] element val dataSize = 4 / eew val res: Seq[UInt] = Seq.tabulate(dataSize) { i => - changeUIntSize(compressVec(dataSize * index + i), eew * 8) + changeUIntSize(compressVecPipe(dataSize * index + i), eew * 8) } // each data path VecInit(res).asUInt }).asUInt } ) - val viotaMask: UInt = Mux1H( + val viotaMask: UInt = Mux1H( eew1H, Seq(1, 2, 4).map { eew => VecInit(Seq.tabulate(parameter.laneNumber) { index => val dataSize = 4 / eew val res: Seq[UInt] = Seq.tabulate(dataSize) { i => val maskIndex: Int = (parameter.datapathWidth - 1).min(dataSize * index + i) - Fill(eew, in.bits.mask(maskIndex)) + Fill(eew, maskPipe(maskIndex)) } // 4 bit mask VecInit(res).asUInt @@ -115,21 +180,23 @@ class MaskCompress(parameter: T1Parameter) extends Module { val tailCount: UInt = { val minElementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 val maxCountWidth = log2Ceil(minElementSizePerSet) - changeUIntSize(compressInit, maxCountWidth) + changeUIntSize(compressInitPipe, maxCountWidth) } val compressDataReg = RegInit(0.U((parameter.laneNumber * parameter.datapathWidth).W)) val compressTailValid: Bool = RegInit(false.B) - val compressWriteGroupCount: UInt = RegInit(0.U(parameter.laneParam.groupNumberBits.W)) + val compressWriteGroupCount: UInt = RegInit(0.U(parameter.groupNumberBits.W)) val compressDataVec = Seq(1, 2, 4).map { dataByte => val dataBit = dataByte * 8 val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte VecInit(Seq.tabulate(elementSizePerSet * 2) { index => val hitReq = - Seq.tabulate(elementSizePerSet)(maskIndex => compressMaskVec(maskIndex) && compressVec(maskIndex) === index.U) + Seq.tabulate(elementSizePerSet)(maskIndex => + compressMaskVecPipe(maskIndex) && compressVecPipe(maskIndex) === index.U + ) val selectReqData = Mux1H( hitReq, - cutUInt(in.bits.source2, dataBit) + cutUInt(source2Pipe, dataBit) ) if (index < elementSizePerSet) { val useTail = index.U < tailCount @@ -141,18 +208,18 @@ class MaskCompress(parameter: T1Parameter) extends Module { }).asUInt } val compressResult: UInt = Mux1H(eew1H, compressDataVec) - val lastCompressEnq: Bool = in.fire && in.bits.lastCompress - when(newInstruction || lastCompressEnq || out.compressValid) { + val lastCompressEnq: Bool = stage2Valid && lastCompressPipe + when(newInstructionPipe || lastCompressEnq || outWire.compressValid) { compressTailValid := lastCompressEnq && compress } - when(newInstruction || out.compressValid) { - compressWriteGroupCount := Mux(newInstruction, 0.U, compressWriteGroupCount + 1.U) + when(newInstructionPipe || outWire.compressValid) { + compressWriteGroupCount := Mux(newInstructionPipe, 0.U, compressWriteGroupCount + 1.U) } val splitCompressResult: Vec[UInt] = cutUIntBySize(compressResult, 2) - when(in.fire) { - compressDataReg := Mux(compressDeqValid, splitCompressResult(1), splitCompressResult(0)) + when(stage2Valid) { + compressDataReg := Mux(compressDeqValidPipe, splitCompressResult(1), splitCompressResult(0)) } // todo: connect & update compressInit @@ -177,7 +244,7 @@ class MaskCompress(parameter: T1Parameter) extends Module { val ffoMask: UInt = FillInterleaved(parameter.datapathWidth / 8, in.bits.validInput) - out.data := Mux1H( + outWire.data := Mux1H( Seq( compress -> compressResult, viota -> viotaResult, @@ -187,7 +254,7 @@ class MaskCompress(parameter: T1Parameter) extends Module { ) // todo: compressMask - out.mask := Mux1H( + outWire.mask := Mux1H( Seq( compress -> compressMask, viota -> viotaMask, @@ -197,9 +264,10 @@ class MaskCompress(parameter: T1Parameter) extends Module { ) // todo - out.compressValid := (compressTailValid || (compressDeqValid && in.fire)) && !writeRD - out.groupCounter := Mux(compress, compressWriteGroupCount, in.bits.groupCounter) + outWire.compressValid := (compressTailValid || (compressDeqValidPipe && stage2Valid)) && !writeRD + outWire.groupCounter := Mux(compress, compressWriteGroupCount, groupCounterPipe) + // ffo type execute when(newInstruction && ffoInstruction) { ffoIndex := -1.S(parameter.datapathWidth.W).asUInt ffoValid := false.B @@ -235,5 +303,6 @@ class MaskCompress(parameter: T1Parameter) extends Module { }.elsewhen(mvRd) { ffoIndex := source1SigExtend } - out.ffoOutput := completedLeftOr | Fill(parameter.laneNumber, ffoValid) + outWire.ffoOutput := completedLeftOr | Fill(parameter.laneNumber, ffoValid) + out := RegNext(outWire, 0.U.asTypeOf(outWire)) } diff --git a/t1/src/mask/MaskReduce.scala b/t1/src/mask/MaskReduce.scala index adba8a813..d479b064b 100644 --- a/t1/src/mask/MaskReduce.scala +++ b/t1/src/mask/MaskReduce.scala @@ -75,7 +75,8 @@ class MaskReduce(val parameter: MaskReduceParameter) val omInstance: Instance[MaskReduceOM] = Instantiate(new MaskReduceOM(parameter)) io.om := omInstance.getPropertyReference - val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 + val floatAdderLatency: Int = 1 + val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 // todo: uop decode val order: Bool = in.bits.uop === "b101".U @@ -89,7 +90,7 @@ class MaskReduce(val parameter: MaskReduceParameter) val logicUnit: Instance[LaneLogic] = Instantiate(new LaneLogic(LaneLogicParameter(parameter.datapathWidth))) // option unit for flot reduce val floatAdder: Option[Instance[FloatAdder]] = - Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(FloatAdderParameter(8, 24)))) + Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(FloatAdderParameter(8, 24, floatAdderLatency)))) omInstance.floatAdderIn.zip(floatAdder).foreach { case (l, r) => l := r.io.om.asAnyClassType } val flotCompare: Option[Instance[FloatCompare]] = Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(FloatCompareParameter(8, 24)))) @@ -114,6 +115,7 @@ class MaskReduce(val parameter: MaskReduceParameter) val floatType: Bool = reqReg.uop(2) || reqReg.uop(1, 0).andR val NotAdd: Bool = reqReg.uop(1) val widen: Bool = reqReg.uop === "b001".U || reqReg.uop(2, 1) === "b11".U + val floatAdd: Bool = floatType && !NotAdd // eew1HReg(0) || (eew1HReg(1) && !widen) val needFold: Bool = false.B val writeEEW: UInt = Mux(pop, 2.U, reqReg.eew + widen) @@ -123,16 +125,21 @@ class MaskReduce(val parameter: MaskReduceParameter) // crossFold: reduce between lane // lastFold: reduce in data path // orderRed: order reduce - val idle :: crossFold :: lastFold :: orderRed :: Nil = Enum(4) + val idle :: crossFold :: lastFold :: orderRed :: waitRes :: Nil = Enum(5) val state: UInt = RegInit(idle) val stateIdle: Bool = state === idle val stateCross: Bool = state === crossFold val stateLast: Bool = state === lastFold val stateOrder: Bool = state === orderRed + val stateWait: Bool = state === waitRes + // wait float + val waitCount: UInt = RegInit(0.U(log2Ceil(floatAdderLatency.max(2)).W)) + when(stateWait) { waitCount := waitCount + 1.U } + val resFire: Bool = stateWait && waitCount === (floatAdderLatency - 1).U updateResult := - stateLast || ((stateCross || stateOrder) && sourceValid) + stateLast || ((stateCross || stateOrder) && sourceValid && !floatAdd) || resFire // state update in.ready := stateIdle @@ -143,9 +150,21 @@ class MaskReduce(val parameter: MaskReduceParameter) } when(stateCross) { + when(floatAdd) { + state := waitRes + waitCount := 0.U + }.elsewhen(groupLastReduce) { + state := Mux(reqReg.lastGroup && needFold, lastFold, idle) + outValid := reqReg.lastGroup && !needFold + } + } + + when(stateWait && resFire) { when(groupLastReduce) { state := Mux(reqReg.lastGroup && needFold, lastFold, idle) outValid := reqReg.lastGroup && !needFold + }.otherwise { + state := crossFold } } diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index 932e3a6ba..641ea9bd4 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -39,15 +39,15 @@ import org.chipsalliance.t1.rtl.decoder.Decoder // 11 11 1 -> maskdestination class MaskUnitInterface(parameter: T1Parameter) extends Bundle { - val clock: Clock = Input(Clock()) - val reset: Reset = Input(Reset()) - val instReq: ValidIO[MaskUnitInstReq] = Flipped(Valid(new MaskUnitInstReq(parameter))) - val exeReq: Vec[ValidIO[MaskUnitExeReq]] = Flipped( + val clock: Clock = Input(Clock()) + val reset: Reset = Input(Reset()) + val instReq: ValidIO[MaskUnitInstReq] = Flipped(Valid(new MaskUnitInstReq(parameter))) + val exeReq: Vec[ValidIO[MaskUnitExeReq]] = Flipped( Vec(parameter.laneNumber, Valid(new MaskUnitExeReq(parameter.laneParam))) ) - val exeResp: Vec[ValidIO[VRFWriteRequest]] = Vec( + val exeResp: Vec[DecoupledIO[VRFWriteRequest]] = Vec( parameter.laneNumber, - Valid( + Decoupled( new VRFWriteRequest( parameter.vrfParam.regNumBits, parameter.laneParam.vrfOffsetBits, @@ -56,8 +56,8 @@ class MaskUnitInterface(parameter: T1Parameter) extends Bundle { ) ) ) - val tokenIO: Vec[LaneTokenBundle] = Flipped(Vec(parameter.laneNumber, new LaneTokenBundle)) - val readChannel: Vec[DecoupledIO[VRFReadRequest]] = Vec( + val tokenIO: Vec[LaneTokenBundle] = Flipped(Vec(parameter.laneNumber, new LaneTokenBundle)) + val readChannel: Vec[DecoupledIO[VRFReadRequest]] = Vec( parameter.laneNumber, Decoupled( new VRFReadRequest( @@ -67,28 +67,32 @@ class MaskUnitInterface(parameter: T1Parameter) extends Bundle { ) ) ) - val readResult: Vec[UInt] = Flipped(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) - val writeRD: ValidIO[UInt] = Valid(UInt(parameter.datapathWidth.W)) - val lastReport: UInt = Output(UInt((2 * parameter.chainingSize).W)) - val lsuMaskInput: Vec[UInt] = Output(Vec(parameter.lsuMSHRSize, UInt(parameter.maskGroupWidth.W))) - val lsuMaskSelect: Vec[UInt] = Input(Vec(parameter.lsuMSHRSize, UInt(parameter.lsuParameters.maskGroupSizeBits.W))) - val laneMaskInput: Vec[UInt] = Output(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) - val laneMaskSelect: Vec[UInt] = Input(Vec(parameter.laneNumber, UInt(parameter.laneParam.maskGroupSizeBits.W))) - val laneMaskSewSelect: Vec[UInt] = Input(Vec(parameter.laneNumber, UInt(2.W))) - val v0UpdateVec: Vec[ValidIO[V0Update]] = Flipped(Vec(parameter.laneNumber, Valid(new V0Update(parameter.laneParam)))) - val writeRDData: UInt = Output(UInt(parameter.xLen.W)) - val gatherData: DecoupledIO[UInt] = Decoupled(UInt(parameter.xLen.W)) - val gatherRead: Bool = Input(Bool()) - val om: Property[ClassType] = Output(Property[AnyClassType]()) + val readResult: Vec[ValidIO[UInt]] = Flipped(Vec(parameter.laneNumber, Valid(UInt(parameter.datapathWidth.W)))) + val writeRD: ValidIO[UInt] = Valid(UInt(parameter.datapathWidth.W)) + val lastReport: UInt = Output(UInt((2 * parameter.chainingSize).W)) + val laneMaskInput: Vec[UInt] = Output(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) + val laneMaskSelect: Vec[UInt] = Input(Vec(parameter.laneNumber, UInt(parameter.laneParam.maskGroupSizeBits.W))) + val laneMaskSewSelect: Vec[UInt] = Input(Vec(parameter.laneNumber, UInt(2.W))) + val v0UpdateVec: Vec[ValidIO[V0Update]] = Flipped( + Vec(parameter.laneNumber, Valid(new V0Update(parameter.laneParam.datapathWidth, parameter.laneParam.vrfOffsetBits))) + ) + val writeRDData: UInt = Output(UInt(parameter.xLen.W)) + val gatherData: DecoupledIO[UInt] = Decoupled(UInt(parameter.xLen.W)) + val gatherRead: Bool = Input(Bool()) + val om: Property[ClassType] = Output(Property[AnyClassType]()) } @instantiable class MaskUnitOM(parameter: T1Parameter) extends GeneralOM[T1Parameter, MaskUnit](parameter) { - @public val reduceUnit = IO(Output(Property[AnyClassType]())) @public val reduceUnitIn = IO(Input(Property[AnyClassType]())) reduceUnit := reduceUnitIn + + val compress = IO(Output(Property[AnyClassType]())) + @public + val compressIn = IO(Input(Property[AnyClassType]())) + compress := compressIn } // TODO: no T1Parameter here. @@ -116,8 +120,6 @@ class MaskUnit(val parameter: T1Parameter) val readResult = io.readResult val writeRD = io.writeRD val lastReport = io.lastReport - val lsuMaskInput = io.lsuMaskInput - val lsuMaskSelect = io.lsuMaskSelect val laneMaskInput = io.laneMaskInput val laneMaskSelect = io.laneMaskSelect val laneMaskSewSelect = io.laneMaskSewSelect @@ -168,11 +170,6 @@ class MaskUnit(val parameter: T1Parameter) input := cutUInt(v0SelectBySew, parameter.datapathWidth)(laneMaskSelect(index)) } - // lsu - lsuMaskInput.zip(lsuMaskSelect).foreach { case (data, index) => - data := cutUInt(v0.asUInt, parameter.maskGroupWidth)(index) - } - val maskedWrite: BitLevelMaskWrite = Module(new BitLevelMaskWrite(parameter)) def gatherIndex(elementIndex: UInt, vlmul: UInt, sew: UInt): (UInt, UInt, UInt, UInt, Bool) = { @@ -795,7 +792,7 @@ class MaskUnit(val parameter: T1Parameter) val dataOffset: UInt = Mux1H(readResultSelect, pipeDataOffset) readTokenRelease(index) := readDataQueue.deq.fire readDataQueue.enq.valid := readResultSelect.orR - readDataQueue.enq.bits := Mux1H(readResultSelect, readResult) >> (dataOffset ## 0.U(3.W)) + readDataQueue.enq.bits := Mux1H(readResultSelect, readResult.map(_.bits)) >> (dataOffset ## 0.U(3.W)) readDataQueue.deq } @@ -905,14 +902,24 @@ class MaskUnit(val parameter: T1Parameter) // Determine whether the data is ready val executeEnqValid: Bool = otherTypeRequestDeq && !readType + val compressParam: CompressParam = CompressParam( + parameter.datapathWidth, + parameter.xLen, + parameter.vLen, + parameter.laneNumber, + parameter.laneParam.groupNumberBits, + 2 + ) // start execute - val compressUnit: MaskCompress = Module(new MaskCompress(parameter)) - val reduceUnit = Instantiate( + val compressUnit = Instantiate(new MaskCompress(compressParam)) + val reduceUnit = Instantiate( new MaskReduce( MaskReduceParameter(parameter.datapathWidth, parameter.laneNumber, parameter.fpuEnable) ) ) omInstance.reduceUnitIn := reduceUnit.io.om.asAnyClassType + omInstance.compressIn := compressUnit.io.om.asAnyClassType + val extendUnit: MaskExtend = Module(new MaskExtend(parameter)) // todo @@ -942,28 +949,30 @@ class MaskUnit(val parameter: T1Parameter) val compressSource1: UInt = Mux1H(sew1H, vs1Split.map(_._1)) val source1Select: UInt = Mux(mv, readVS1Reg.data, compressSource1) val source1Change: Bool = Mux1H(sew1H, vs1Split.map(_._2)) - when(source1Change && compressUnit.in.fire) { + when(source1Change && compressUnit.io.in.fire) { readVS1Reg.dataValid := false.B readVS1Reg.requestSend := false.B readVS1Reg.readIndex := readVS1Reg.readIndex + 1.U } - viotaCounterAdd := compressUnit.in.fire - - compressUnit.in.valid := executeEnqValid && unitType(1) - compressUnit.in.bits.maskType := instReg.maskType - compressUnit.in.bits.eew := instReg.sew - compressUnit.in.bits.uop := instReg.decodeResult(Decoder.topUop) - compressUnit.in.bits.readFromScalar := instReg.readFromScala - compressUnit.in.bits.source1 := source1Select - compressUnit.in.bits.mask := executeElementMask - compressUnit.in.bits.source2 := source2 - compressUnit.in.bits.groupCounter := requestCounter - compressUnit.in.bits.lastCompress := lastGroup - compressUnit.in.bits.ffoInput := VecInit(exeReqReg.map(_.bits.ffo)).asUInt - compressUnit.in.bits.validInput := VecInit(exeReqReg.map(_.valid)).asUInt - compressUnit.newInstruction := instReq.valid - compressUnit.ffoInstruction := instReq.bits.decodeResult(Decoder.topUop)(2, 0) === BitPat("b11?") + viotaCounterAdd := compressUnit.io.in.fire + + compressUnit.io.clock := implicitClock + compressUnit.io.reset := implicitReset + compressUnit.io.in.valid := executeEnqValid && unitType(1) + compressUnit.io.in.bits.maskType := instReg.maskType + compressUnit.io.in.bits.eew := instReg.sew + compressUnit.io.in.bits.uop := instReg.decodeResult(Decoder.topUop) + compressUnit.io.in.bits.readFromScalar := instReg.readFromScala + compressUnit.io.in.bits.source1 := source1Select + compressUnit.io.in.bits.mask := executeElementMask + compressUnit.io.in.bits.source2 := source2 + compressUnit.io.in.bits.groupCounter := requestCounter + compressUnit.io.in.bits.lastCompress := lastGroup + compressUnit.io.in.bits.ffoInput := VecInit(exeReqReg.map(_.bits.ffo)).asUInt + compressUnit.io.in.bits.validInput := VecInit(exeReqReg.map(_.valid)).asUInt + compressUnit.io.newInstruction := instReq.valid + compressUnit.io.ffoInstruction := instReq.bits.decodeResult(Decoder.topUop)(2, 0) === BitPat("b11?") reduceUnit.io.clock := implicitClock reduceUnit.io.reset := implicitReset @@ -987,7 +996,7 @@ class MaskUnit(val parameter: T1Parameter) sink := VecInit(exeReqReg.map(_.bits.fpReduceValid.get)).asUInt } - when(reduceUnit.io.in.fire || compressUnit.in.fire) { + when(reduceUnit.io.in.fire || compressUnit.io.in.fire) { readVS1Reg.sendToExecution := true.B } @@ -1008,7 +1017,7 @@ class MaskUnit(val parameter: T1Parameter) val executeResult: UInt = Mux1H( unitType(3, 1), Seq( - compressUnit.out.data, + compressUnit.io.out.data, reduceUnit.io.out.bits.data, extendUnit.out ) @@ -1028,7 +1037,7 @@ class MaskUnit(val parameter: T1Parameter) val executeValid: Bool = Mux1H( unitType(3, 1), Seq( - compressUnit.out.compressValid, + compressUnit.io.out.compressValid, false.B, executeEnqValid ) @@ -1046,13 +1055,13 @@ class MaskUnit(val parameter: T1Parameter) val executeDeqGroupCounter: UInt = Mux1H( unitType(3, 1), Seq( - compressUnit.out.groupCounter, + compressUnit.io.out.groupCounter, requestCounter, extendGroupCount ) ) - val executeWriteByteMask: UInt = Mux(compress || ffo || mvVd, compressUnit.out.mask, executeByteMask) + val executeWriteByteMask: UInt = Mux(compress || ffo || mvVd, compressUnit.io.out.mask, executeByteMask) maskedWrite.needWAR := maskDestinationType maskedWrite.vd := instReg.vd maskedWrite.in.zipWithIndex.foreach { case (req, index) => @@ -1064,7 +1073,7 @@ class MaskUnit(val parameter: T1Parameter) req.bits.pipeData := exeReqReg(index).bits.source1 req.bits.bitMask := bitMask req.bits.groupCounter := executeDeqGroupCounter - req.bits.ffoByOther := compressUnit.out.ffoOutput(index) && ffo + req.bits.ffoByOther := compressUnit.io.out.ffoOutput(index) && ffo if (index == 0) { // reduce result when(unitType(2)) { @@ -1091,17 +1100,10 @@ class MaskUnit(val parameter: T1Parameter) } queue.enq.bits.index := instReg.instructionIndex - // write token - val tokenCounter = RegInit(0.U(log2Ceil(parameter.maskUnitVefWriteQueueSize + 1).W)) - val tokenAllow: Bool = queue.deq.fire - val counterChange: UInt = Mux(tokenAllow, 1.U, -1.S(tokenCounter.getWidth.W).asUInt) - when(tokenAllow ^ tokenIO(index).maskResponseRelease) { - tokenCounter := tokenCounter + counterChange - } // write vrf val writePort = exeResp(index) - queue.deq.ready := !tokenCounter.asBools.last - writePort.valid := tokenAllow + queue.deq.ready := writePort.ready + writePort.valid := queue.deq.valid writePort.bits.last := DontCare writePort.bits.instructionIndex := instReg.instructionIndex writePort.bits.data := Mux(queue.deq.bits.ffoByOther, queue.deq.bits.pipeData, queue.deq.bits.writeData.data) @@ -1131,7 +1133,7 @@ class MaskUnit(val parameter: T1Parameter) val executeStageInvalid: Bool = Mux1H( unitType(3, 1), Seq( - !compressUnit.out.compressValid, + !compressUnit.io.out.compressValid, reduceUnit.io.in.ready, true.B ) @@ -1150,7 +1152,7 @@ class MaskUnit(val parameter: T1Parameter) lastReportValid, indexToOH(instReg.instructionIndex, parameter.chainingSize) ) - writeRDData := Mux(pop, reduceUnit.io.out.bits.data, compressUnit.writeData) + writeRDData := Mux(pop, reduceUnit.io.out.bits.data, compressUnit.io.writeData) // gather read state when(gatherRequestFire) { diff --git a/t1/src/package.scala b/t1/src/package.scala index 13bde15a9..7b648bc75 100644 --- a/t1/src/package.scala +++ b/t1/src/package.scala @@ -7,6 +7,7 @@ import chisel3._ import chisel3.experimental.hierarchy.{Instance, Instantiate} import chisel3.util._ import chisel3.util.experimental.decode.DecodeBundle +import org.chipsalliance.dwbb.stdlib.queue.Queue import org.chipsalliance.t1.rtl.decoder.{Decoder, TableGenerator} import org.chipsalliance.t1.rtl.lane.Distributor @@ -221,6 +222,89 @@ package object rtl { id.map(f => (shifterReg :+ source).map(p => Mux(p.valid, indexToOH(f(p.bits), 4), 0.U)).reduce(_ | _)) } + def connectDecoupledWithShifter[T <: Data](latency: Int, tokenSize: Int)(source: DecoupledIO[T], sink: DecoupledIO[T]) + : Unit = { + val queue = Queue.io(chiselTypeOf(source.bits), tokenSize, flow = true) + // Reverse pipe release + val releasePipe = Pipe( + sink.fire, + 0.U.asTypeOf(new EmptyBundle), + latency + ).valid + val tokenCheck: Bool = pipeToken(tokenSize)(source.fire, releasePipe) + source.ready := tokenCheck + + // Complete the handshake at the source end and convert the result of the handshake into a data stream + val validSource: ValidIO[T] = Wire(Valid(chiselTypeOf(source.bits))) + validSource.valid := source.fire + validSource.bits := source.bits + + val validSink: ValidIO[T] = Wire(Valid(chiselTypeOf(source.bits))) + + // Shift Data + connectWithShifter(latency)(validSource, validSink) + // Throw the moved data into the queue + // todo: assert(queue.enq.ready || !queue.enq.valid) + queue.enq.valid := validSink.valid + queue.enq.bits := validSink.bits + // Finally, send the data to the sink + sink <> queue.deq + } + + def maskUnitReadArbitrate[T <: Data](source: Vec[DecoupledIO[T]]): DecoupledIO[T] = { + require(source.size == 2) + val maskRead = source.head + val lsuRead = source.last + val sinkWire: DecoupledIO[T] = Wire(Decoupled(chiselTypeOf(maskRead.bits))) + val maskUnitFirst = RegInit(false.B) + val tryToRead = maskRead.valid || lsuRead.valid + when(tryToRead && !sinkWire.fire) { + maskUnitFirst := !maskUnitFirst + } + + sinkWire.valid := Mux( + maskUnitFirst, + maskRead.valid, + lsuRead.valid + ) + sinkWire.bits := + Mux(maskUnitFirst, maskRead.bits, lsuRead.bits) + lsuRead.ready := sinkWire.ready && !maskUnitFirst + maskRead.ready := sinkWire.ready && maskUnitFirst + sinkWire + } + + def connectVrfAccess[T <: Data]( + latencyVec: Seq[Int], + tokenSizeVec: Seq[Int], + vrfReadLatency: Option[Int] = None + )(sourceVec: Vec[DecoupledIO[T]], + sink: DecoupledIO[T], + arb: Int, + dataAck: Option[UInt] = None, + dataToSource: Option[Seq[ValidIO[UInt]]] = None + ): Unit = { + val sinkVec: Vec[DecoupledIO[T]] = VecInit(sourceVec.zipWithIndex.map { case (source, index) => + val sinkWire: DecoupledIO[T] = Wire(Decoupled(chiselTypeOf(source.bits))) + connectDecoupledWithShifter(latencyVec(index), tokenSizeVec(index))(source, sinkWire) + sinkWire + }) + if (arb == 0) { + sink <> maskUnitReadArbitrate(sinkVec) + } + dataToSource.foreach { sourceDataVec => + require(dataAck.isDefined) + sourceDataVec.zipWithIndex.foreach { case (sourceData, index) => + val sinkRequest = sinkVec(index) + val accessDataValid = Pipe(sinkRequest.fire, 0.U.asTypeOf(new EmptyBundle), vrfReadLatency.get).valid + val accessDataSource = Wire(Valid(chiselTypeOf(dataAck.get))) + accessDataSource.valid := accessDataValid + accessDataSource.bits := dataAck.get + connectWithShifter(latencyVec(index))(accessDataSource, sourceData) + } + } + } + def instantiateVFU( parameter: VFUInstantiateParameter )(requestVec: Vec[SlotRequestToVFU], diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala index 721c888f4..0292d8409 100644 --- a/t1/src/vrf/VRF.scala +++ b/t1/src/vrf/VRF.scala @@ -15,6 +15,7 @@ import chisel3.properties.{AnyClassType, Class, ClassType, Path, Property} import org.chipsalliance.stdlib.GeneralOM import org.chipsalliance.t1.rtl.{ ffo, + indexToOH, instIndexL, instIndexLE, ohCheck, @@ -199,7 +200,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar @public val writeCheck: Vec[LSUWriteCheck] = IO( Vec( - parameter.chainingSize + 4, + parameter.chainingSize + 3, Input( new LSUWriteCheck( parameter.regNumBits, @@ -211,11 +212,11 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar ) @public - val writeAllow: Vec[Bool] = IO(Vec(parameter.chainingSize + 4, Output(Bool()))) + val writeAllow: Vec[Bool] = IO(Vec(parameter.chainingSize + 3, Output(Bool()))) /** when instruction is fired, record it in the VRF for chaining. */ @public - val instructionWriteReport: DecoupledIO[VRFWriteReport] = IO(Flipped(Decoupled(new VRFWriteReport(parameter)))) + val instructionWriteReport: ValidIO[VRFWriteReport] = IO(Flipped(Valid(new VRFWriteReport(parameter)))) /** similar to [[flush]]. */ @public @@ -224,6 +225,9 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar @public val lsuLastReport: UInt = IO(Input(UInt((2 * parameter.chainingSize).W))) + @public + val vrfSlotRelease: UInt = IO(Output(UInt((2 * parameter.chainingSize).W))) + @public val dataInLane: UInt = IO(Input(UInt((2 * parameter.chainingSize).W))) @@ -232,9 +236,6 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar @public val vrfReadyToStore: Bool = IO(Output(Bool())) - @public - val vrfAllocateIssue: Bool = IO(Output(Bool())) - /** we can only chain LSU instructions, after [[LSU.writeQueueVec]] is cleared. */ @public val loadDataInLSUWriteQueue: UInt = IO(Input(UInt((2 * parameter.chainingSize).W))) @@ -275,6 +276,13 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar val chainingRecordCopy: Vec[ValidIO[VRFWriteReport]] = RegInit( VecInit(Seq.fill(parameter.chainingSize + 1)(0.U.asTypeOf(Valid(new VRFWriteReport(parameter))))) ) + val recordRelease: Vec[UInt] = WireDefault( + VecInit( + Seq.fill(parameter.chainingSize + 1)( + 0.U.asTypeOf(UInt((parameter.chainingSize * 2).W)) + ) + ) + ) val recordValidVec: Seq[Bool] = chainingRecord.map(r => !r.bits.elementMask.andR && r.valid) // first read @@ -500,23 +508,12 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar // @todo @Clo91eaf VRF ready signal for performance. val freeRecord: UInt = VecInit(chainingRecord.map(!_.valid)).asUInt val recordFFO: UInt = ffo(freeRecord) - val recordEnq: UInt = Wire(UInt((parameter.chainingSize + 1).W)) - val olderCheck = chainingRecord.map { re => - // The same lsb will make it difficult to distinguish between the new and the old - val notSameLSB: Bool = re.bits.instIndex(parameter.instructionIndexBits - 2, 0) =/= - instructionWriteReport.bits.instIndex(parameter.instructionIndexBits - 2, 0) - !re.valid || (instIndexL(re.bits.instIndex, instructionWriteReport.bits.instIndex) && notSameLSB) - }.reduce(_ && _) - // handle VRF hazard - // @todo @Clo91eaf VRF ready signal for performance. - instructionWriteReport.ready := freeRecord.orR && olderCheck - recordEnq := Mux( + val recordEnq: UInt = Mux( // 纯粹的lsu指令的记录不需要ready instructionWriteReport.valid, recordFFO, 0.U((parameter.chainingSize + 1).W) ) - vrfAllocateIssue := freeRecord.orR && olderCheck val writePort: Seq[ValidIO[VRFWriteRequest]] = Seq(writePipe) val loadUnitReadPorts: Seq[DecoupledIO[VRFReadRequest]] = Seq(readRequests.last) @@ -570,6 +567,9 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar when(stateClear) { record.valid := false.B + when(record.valid) { + recordRelease(i) := indexToOH(record.bits.instIndex, parameter.chainingSize) + } } when(recordEnq(i)) { @@ -617,6 +617,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar } writeReadyForLsu := !hazardVec.map(_.map(_._1).reduce(_ || _)).reduce(_ || _) vrfReadyToStore := !hazardVec.map(_.map(_._2).reduce(_ || _)).reduce(_ || _) + vrfSlotRelease := recordRelease.reduce(_ | _) writeCheck.zip(writeAllow).foreach { case (check, allow) => allow := chainingRecordCopy