From 6a3349332c730f3a7e767eab669f40788b449dcf Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Fri, 6 Dec 2024 16:43:47 +0800 Subject: [PATCH] [rtl] connect vrf read with shifter. --- t1/src/T1.scala | 38 +++++++++----------- t1/src/lsu/LSU.scala | 5 +-- t1/src/lsu/StoreUnit.scala | 12 +++---- t1/src/mask/BitLevelMaskWrite.scala | 8 ++--- t1/src/mask/MaskUnit.scala | 6 ++-- t1/src/package.scala | 54 +++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 38 deletions(-) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index f5990174a..70c515d23 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -263,6 +263,12 @@ case class T1Parameter( val laneRequestTokenSize: Int = 4 val laneRequestShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + val maskUnitReadTokenSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 4) + val maskUnitReadShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + + val lsuReadTokenSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 4) + val lsuReadShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1) + val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions) /** paraemter for AXI4. 
*/ @@ -758,29 +764,17 @@ class T1(val parameter: T1Parameter) laneRequestSinkWire(index).ready := lane.laneRequest.ready && lane.vrfAllocateIssue lane.laneIndex := index.U - // lsu 优先会有死锁: - // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写 - // vse32.v v1, (a0) -> 依赖上一条,但是会先发出read - - // Mask priority will also be - // vse32.v v19, (a0) - // vfslide1down.vf v19, v10, x1 - val maskUnitFirst = RegInit(false.B) - val tryToRead = lsu.vrfReadDataPorts(index).valid || maskUnit.readChannel(index).valid - when(tryToRead && !lane.vrfReadAddressChannel.fire) { - maskUnitFirst := !maskUnitFirst - } - lane.vrfReadAddressChannel.valid := Mux( - maskUnitFirst, - maskUnit.readChannel(index).valid, - lsu.vrfReadDataPorts(index).valid + connectVrfAccess( + Seq(parameter.maskUnitReadShifterSize(index), parameter.lsuReadShifterSize(index)), + Seq(parameter.maskUnitReadTokenSize(index), parameter.lsuReadTokenSize(index)), + Some(parameter.vrfReadLatency) + )( + VecInit(Seq(maskUnit.readChannel(index), lsu.vrfReadDataPorts(index))), + lane.vrfReadAddressChannel, + 0, + Some(lane.vrfReadDataChannel), + Some(Seq(maskUnit.readResult(index), lsu.vrfReadResults(index))) ) - lane.vrfReadAddressChannel.bits := - Mux(maskUnitFirst, maskUnit.readChannel(index).bits, lsu.vrfReadDataPorts(index).bits) - lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnitFirst - maskUnit.readChannel(index).ready := lane.vrfReadAddressChannel.ready && maskUnitFirst - maskUnit.readResult(index) := lane.vrfReadDataChannel - lsu.vrfReadResults(index) := lane.vrfReadDataChannel val maskTryToWrite = maskUnit.exeResp(index) // lsu & mask unit write lane diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala index 7c93e09ed..b7bd81202 100644 --- a/t1/src/lsu/LSU.scala +++ b/t1/src/lsu/LSU.scala @@ -150,7 +150,7 @@ class LSU(param: LSUParameter) extends Module { /** hard wire form Top. 
TODO: merge to [[vrfReadDataPorts]] */ @public - val vrfReadResults: Vec[UInt] = IO(Input(Vec(param.laneNumber, UInt(param.datapathWidth.W)))) + val vrfReadResults: Vec[ValidIO[UInt]] = IO(Vec(param.laneNumber, Flipped(Valid(UInt(param.datapathWidth.W))))) /** write channel to [[V]], which will redirect it to [[Lane.vrf]]. */ @public @@ -250,7 +250,8 @@ class LSU(param: LSUParameter) extends Module { otherUnit.vrfReadDataPorts.ready := (otherTryReadVrf & VecInit(vrfReadDataPorts.map(_.ready)).asUInt).orR val pipeOtherRead: ValidIO[UInt] = Pipe(otherUnit.vrfReadDataPorts.fire, otherUnit.status.targetLane, param.vrfReadLatency) - otherUnit.vrfReadResults.bits := Mux1H(pipeOtherRead.bits, vrfReadResults) + // todo: read data reorder + otherUnit.vrfReadResults.bits := Mux1H(pipeOtherRead.bits, vrfReadResults.map(_.bits)) otherUnit.vrfReadResults.valid := pipeOtherRead.valid // write vrf diff --git a/t1/src/lsu/StoreUnit.scala b/t1/src/lsu/StoreUnit.scala index 1fb72096b..a4e125296 100644 --- a/t1/src/lsu/StoreUnit.scala +++ b/t1/src/lsu/StoreUnit.scala @@ -39,9 +39,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { /** hard wire form Top. 
see [[LSU.vrfReadResults]] */ @public - val vrfReadResults: Vec[UInt] = IO(Input(Vec(param.laneNumber, UInt(param.datapathWidth.W)))) + val vrfReadResults: Vec[ValidIO[UInt]] = IO(Input(Vec(param.laneNumber, Valid(UInt(param.datapathWidth.W))))) @public - val vrfReadyToStore: Bool = IO(Input(Bool())) + val vrfReadyToStore: Bool = IO(Input(Bool())) @public val storeResponse = IO(Input(Bool())) @@ -73,6 +73,7 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { val readCount: UInt = RegInit(0.U(dataGroupBits.W)) val stageValid = RegInit(false.B) // queue for read latency + // todo: param.vrfReadLatency => param.vrfReadLatency + shifterLatency val queue: QueueIO[UInt] = Queue.io(UInt(param.datapathWidth.W), param.vrfReadLatency, flow = true) @@ -114,12 +115,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { readPort.bits.offset := readCount readPort.bits.instructionIndex := lsuRequestReg.instructionIndex - // pipe read fire - val readResultFire = Pipe(readPort.fire, 0.U.asTypeOf(new EmptyBundle), param.vrfReadLatency).valid - // latency queue enq - queue.enq.valid := readResultFire - queue.enq.bits := vrfReadResults(laneIndex) + queue.enq.valid := vrfReadResults(laneIndex).valid + queue.enq.bits := vrfReadResults(laneIndex).bits AssertProperty(BoolSequence(!queue.enq.valid || queue.enq.ready)) vrfReadQueueVec(laneIndex).enq <> queue.deq stageValid || RegNext(readPort.fire) diff --git a/t1/src/mask/BitLevelMaskWrite.scala b/t1/src/mask/BitLevelMaskWrite.scala index cefe31868..5bc73491e 100644 --- a/t1/src/mask/BitLevelMaskWrite.scala +++ b/t1/src/mask/BitLevelMaskWrite.scala @@ -43,8 +43,8 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module { ) } - val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => - IO(Input(UInt(parameter.datapathWidth.W))) + val readResult: Seq[ValidIO[UInt]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Valid(UInt(parameter.datapathWidth.W)))) } 
val stageClear: Bool = IO(Output(Bool())) @@ -52,7 +52,7 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module { val stageClearVec: Seq[Bool] = in.zipWithIndex.map { case (req, index) => val reqQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), 4) val readPort = readChannel(index) - val readData = readResult(index) + val readData = readResult(index).bits val res = out(index) val WaitReadQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), readVRFLatency) @@ -68,7 +68,7 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module { readPort.bits.vs := vd + (reqQueue.deq.bits.groupCounter >> readPort.bits.offset.getWidth).asUInt readPort.bits.offset := changeUIntSize(reqQueue.deq.bits.groupCounter, readPort.bits.offset.getWidth) - val readValidPipe = Pipe(readPort.fire, false.B, readVRFLatency).valid + val readValidPipe = Pipe(readPort.fire, false.B, readVRFLatency).valid && readResult(index).valid val readResultValid = !needWAR || readValidPipe val WARData = (WaitReadQueue.deq.bits.data & WaitReadQueue.deq.bits.bitMask) | diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala index 215e79f59..ab3f8c526 100644 --- a/t1/src/mask/MaskUnit.scala +++ b/t1/src/mask/MaskUnit.scala @@ -82,8 +82,8 @@ class MaskUnit(parameter: T1Parameter) extends Module { } @public - val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => - IO(Input(UInt(parameter.datapathWidth.W))) + val readResult: Seq[ValidIO[UInt]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Valid(UInt(parameter.datapathWidth.W)))) } @public @@ -789,7 +789,7 @@ class MaskUnit(parameter: T1Parameter) extends Module { val dataOffset: UInt = Mux1H(readResultSelect, pipeDataOffset) readTokenRelease(index) := readDataQueue.deq.fire readDataQueue.enq.valid := readResultSelect.orR - readDataQueue.enq.bits := Mux1H(readResultSelect, readResult) >> (dataOffset ## 0.U(3.W)) + readDataQueue.enq.bits := Mux1H(readResultSelect, 
readResult.map(_.bits)) >> (dataOffset ## 0.U(3.W)) readDataQueue.deq } diff --git a/t1/src/package.scala b/t1/src/package.scala index 4bb5835da..5eca78e39 100644 --- a/t1/src/package.scala +++ b/t1/src/package.scala @@ -251,6 +251,93 @@ package object rtl { sink <> queue.deq }
+  /** Arbitrate two VRF read requests onto one channel with an alternating priority.
+    *
+    * `source.head` is handled as the mask unit request and `source.last` as the LSU request. A one-bit
+    * register selects which of them drives the returned channel. It flips in every cycle in which a
+    * request is pending but the returned channel does not fire, so a blocked requester does not keep
+    * the priority. A fixed priority for either requester can deadlock, hence the alternation.
+    *
+    * @param source exactly two request channels: mask unit first, LSU second
+    * @return the arbitrated request channel; its `ready` is driven by the caller
+    */
+  def maskUnitReadArbitrate[T <: Data](source: Vec[DecoupledIO[T]]): DecoupledIO[T] = {
+    require(source.size == 2)
+    val maskRead = source.head
+    val lsuRead  = source.last
+    val sinkWire: DecoupledIO[T] = Wire(Decoupled(chiselTypeOf(maskRead.bits)))
+    // false after reset, i.e. the LSU request is selected first
+    val maskUnitFirst = RegInit(false.B)
+    val tryToRead     = maskRead.valid || lsuRead.valid
+    // swap the priority whenever a request is pending but nothing was accepted in this cycle
+    when(tryToRead && !sinkWire.fire) {
+      maskUnitFirst := !maskUnitFirst
+    }
+
+    sinkWire.valid := Mux(
+      maskUnitFirst,
+      maskRead.valid,
+      lsuRead.valid
+    )
+    sinkWire.bits  :=
+      Mux(maskUnitFirst, maskRead.bits, lsuRead.bits)
+    lsuRead.ready  := sinkWire.ready && !maskUnitFirst
+    maskRead.ready := sinkWire.ready && maskUnitFirst
+    sinkWire
+  }
+
+  /** Connect several VRF access requesters to one sink channel and route the returned data back.
+    *
+    * Every request in `sourceVec` goes through `connectDecoupledWithShifter` with its own entry of
+    * `latencyVec` and `tokenSizeVec`; the shifted requests are then arbitrated onto `sink`. When
+    * `dataToSource` is given, `dataAck` is paired with a valid bit that follows each accepted request
+    * by `vrfReadLatency` cycles and is sent to the matching requester through `connectWithShifter`.
+    *
+    * @param latencyVec     per-requester shifter size, used for the request and for the data path
+    * @param tokenSizeVec   per-requester token size passed to `connectDecoupledWithShifter`
+    * @param vrfReadLatency cycles between an accepted request and its data on `dataAck`; must be
+    *                       defined when `dataToSource` is
+    * @param sourceVec      request channels, one per requester
+    * @param sink           request channel that receives the arbitrated requests
+    * @param arb            arbitration scheme; only `0` (`maskUnitReadArbitrate`) is implemented, any
+    *                       other value leaves `sink` unconnected
+    * @param dataAck        data returned for requests accepted by `sink`; must be defined when
+    *                       `dataToSource` is
+    * @param dataToSource   per-requester data outputs, in the same order as `sourceVec`
+    */
+  def connectVrfAccess[T <: Data](
+    latencyVec:     Seq[Int],
+    tokenSizeVec:   Seq[Int],
+    vrfReadLatency: Option[Int]
+  )(sourceVec:      Vec[DecoupledIO[T]],
+    sink:           DecoupledIO[T],
+    arb:            Int,
+    dataAck:        Option[UInt] = None,
+    dataToSource:   Option[Seq[ValidIO[UInt]]] = None
+  ): Unit = {
+    val sinkVec: Vec[DecoupledIO[T]] = VecInit(sourceVec.zipWithIndex.map { case (source, index) =>
+      val sinkWire: DecoupledIO[T] = Wire(Decoupled(chiselTypeOf(source.bits)))
+      connectDecoupledWithShifter(latencyVec(index), tokenSizeVec(index))(source, sinkWire)
+      sinkWire
+    })
+    if (arb == 0) {
+      sink <> maskUnitReadArbitrate(sinkVec)
+    }
+    dataToSource.foreach { sourceDataVec =>
+      require(dataAck.isDefined, "dataAck is required when dataToSource is defined")
+      require(vrfReadLatency.isDefined, "vrfReadLatency is required when dataToSource is defined")
+      sourceDataVec.zipWithIndex.foreach { case (sourceData, index) =>
+        val sinkRequest      = sinkVec(index)
+        val accessDataValid  = Pipe(sinkRequest.fire, 0.U.asTypeOf(new EmptyBundle), vrfReadLatency.get).valid
+        val accessDataSource = Wire(Valid(chiselTypeOf(dataAck.get)))
+        accessDataSource.valid := accessDataValid
+        // forward the returned data itself; the valid bit only qualifies it
+        accessDataSource.bits  := dataAck.get
+        connectWithShifter(latencyVec(index))(accessDataSource, sourceData)
+      }
+    }
+  }
+
 def instantiateVFU( parameter: VFUInstantiateParameter )(requestVec: Vec[SlotRequestToVFU],