From 8831fc0a2dd8054edfbc3ab69fe16d31afa4ab0f Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Sun, 17 Nov 2024 21:10:37 +0800 Subject: [PATCH 1/6] [rtl] migrate to DLEN-sized Mask Unit --- t1/src/Bundles.scala | 101 +- t1/src/Lane.scala | 203 ++-- t1/src/T1.scala | 1086 +++--------------- t1/src/VectorFunctionUnit.scala | 7 + t1/src/decoder/Decoder.scala | 43 +- t1/src/decoder/attribute/isSwrite.scala | 64 -- t1/src/decoder/attribute/topUop.scala | 263 ++++- t1/src/laneStage/LaneExecutionBridge.scala | 9 +- t1/src/laneStage/LaneStage.scala | 1 - t1/src/laneStage/LaneStage0.scala | 1 - t1/src/laneStage/LaneStage1.scala | 20 +- t1/src/laneStage/LaneStage3.scala | 62 +- t1/src/laneStage/MaskExchangeUnit.scala | 61 + t1/src/laneStage/SlotTokenManager.scala | 65 +- t1/src/lsu/Bundle.scala | 8 +- t1/src/lsu/LSU.scala | 21 +- t1/src/lsu/SimpleAccessUnit.scala | 85 +- t1/src/lsu/StoreUnit.scala | 9 +- t1/src/mask/BitLevelMaskWrite.scala | 95 ++ t1/src/mask/MaskCompress.scala | 239 ++++ t1/src/mask/MaskExtend.scala | 75 ++ t1/src/mask/MaskReduce.scala | 212 ++++ t1/src/mask/MaskUnit.scala | 1166 ++++++++++++++++++++ t1/src/mask/MaskUnitReadCrossBar.scala | 48 + t1/src/mask/SlideIndexGen.scala | 128 +++ t1/src/package.scala | 32 + t1/src/sequencer/T1TokenManager.scala | 74 +- t1/src/vrf/VRF.scala | 24 +- t1/src/vrf/WriteCheck.scala | 3 +- t1emu/src/TestBench.scala | 2 +- t1rocketemu/src/TestBench.scala | 2 +- 31 files changed, 2886 insertions(+), 1323 deletions(-) create mode 100644 t1/src/laneStage/MaskExchangeUnit.scala create mode 100644 t1/src/mask/BitLevelMaskWrite.scala create mode 100644 t1/src/mask/MaskCompress.scala create mode 100644 t1/src/mask/MaskExtend.scala create mode 100644 t1/src/mask/MaskReduce.scala create mode 100644 t1/src/mask/MaskUnit.scala create mode 100644 t1/src/mask/MaskUnitReadCrossBar.scala create mode 100644 t1/src/mask/SlideIndexGen.scala diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index fd833f07e..59b317b5b 100644 --- 
a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -66,7 +66,7 @@ class InstructionState extends Bundle { val idle: Bool = Bool() /** used for mask unit, schedule mask unit to execute. */ - val sMaskUnitExecution: Bool = Bool() + val wMaskUnitLast: Bool = Bool() /** wait for vrf write finish. */ val wVRFWrite: Bool = Bool() @@ -455,6 +455,8 @@ class VRFWriteReport(param: VRFParam) extends Bundle { val indexType: Bool = Bool() // 乘加 val ma: Bool = Bool() + // Read everything, but write very little + val onlyRead: Bool = Bool() // 慢指令 mask unit val slow: Bool = Bool() // which element will access(write or store read) @@ -698,3 +700,100 @@ class T1Retire(xLen: Int) extends Bundle { val csr: ValidIO[T1CSRRetire] = Valid(new T1CSRRetire) val mem: ValidIO[EmptyBundle] = Valid(new EmptyBundle) } + +class MaskUnitReadState(parameter: T1Parameter) extends Bundle { + val groupReadState: UInt = UInt(parameter.laneNumber.W) + val needRead: UInt = UInt(parameter.laneNumber.W) + val elementValid: UInt = UInt(parameter.laneNumber.W) + val replaceVs1: UInt = UInt(parameter.laneNumber.W) + val readOffset: UInt = UInt((parameter.laneNumber * parameter.laneParam.vrfOffsetBits).W) + val accessLane: Vec[UInt] = Vec(parameter.laneNumber, UInt(log2Ceil(parameter.laneNumber).W)) + // 3: log2Ceil(8); 8: Use up to 8 registers + val vsGrowth: Vec[UInt] = Vec(parameter.laneNumber, UInt(3.W)) + val executeGroup: UInt = UInt((parameter.laneParam.groupNumberBits + 2).W) + val readDataOffset: UInt = UInt((log2Ceil(parameter.datapathWidth / 8) * parameter.laneNumber).W) + val last: Bool = Bool() +} + +class MaskUnitInstReq(parameter: T1Parameter) extends Bundle { + val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) + val decodeResult: DecodeBundle = Decoder.bundle(parameter.decoderParam) + val readFromScala: UInt = UInt(parameter.datapathWidth.W) + val sew: UInt = UInt(2.W) + val vlmul: UInt = UInt(3.W) + val maskType: Bool = Bool() + val vxrm: UInt = UInt(3.W) + val vs2: UInt = 
UInt(5.W) + val vs1: UInt = UInt(5.W) + val vd: UInt = UInt(5.W) + val vl: UInt = UInt(parameter.laneParam.vlMaxBits.W) +} + +class MaskUnitExeReq(parameter: LaneParameter) extends Bundle { + // source1, read vs + val source1: UInt = UInt(parameter.datapathWidth.W) + // source2, read offset + val source2: UInt = UInt(parameter.datapathWidth.W) + val index: UInt = UInt(parameter.instructionIndexBits.W) + val ffo: Bool = Bool() + // Is there a valid element? + val fpReduceValid: Option[Bool] = Option.when(parameter.fpuEnable)(Bool()) +} + +class MaskUnitExeResponse(parameter: LaneParameter) extends Bundle { + val ffoByOther: Bool = Bool() + val writeData = new MaskUnitWriteBundle(parameter) + val pipeData: UInt = UInt(parameter.datapathWidth.W) + val index: UInt = UInt(parameter.instructionIndexBits.W) +} + +class MaskUnitReadReq(parameter: T1Parameter) extends Bundle { + val vs: UInt = UInt(5.W) + // source2, read offset + val offset: UInt = UInt(parameter.laneParam.vrfOffsetBits.W) + // Read which lane + val readLane: UInt = UInt(log2Ceil(parameter.laneNumber).W) + // from which request + val requestIndex: UInt = UInt(log2Ceil(parameter.laneNumber).W) + // data position in data path + val dataOffset: UInt = UInt(log2Ceil(parameter.datapathWidth / 8).W) +} + +class MaskUnitReadQueue(parameter: T1Parameter) extends Bundle { + val vs: UInt = UInt(5.W) + // source2, read offset + val offset: UInt = UInt(parameter.laneParam.vrfOffsetBits.W) + // Which channel will this read request be written to? 
+ val writeIndex: UInt = UInt(log2Ceil(parameter.laneNumber).W) + val dataOffset: UInt = UInt(log2Ceil(parameter.datapathWidth / 8).W) +} + +class MaskUnitWaitReadQueue(parameter: T1Parameter) extends Bundle { + val executeGroup: UInt = UInt((parameter.laneParam.groupNumberBits + 2).W) + val sourceValid: UInt = UInt(parameter.laneNumber.W) + val replaceVs1: UInt = UInt(parameter.laneNumber.W) + val needRead: UInt = UInt(parameter.laneNumber.W) + val last: Bool = Bool() +} + +class MaskUnitWriteBundle(parameter: LaneParameter) extends Bundle { + val data: UInt = UInt(parameter.datapathWidth.W) + val mask: UInt = UInt((parameter.datapathWidth / 8).W) + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) + val vd: UInt = UInt(5.W) +} + +class MaskUnitReadVs1(parameter: T1Parameter) extends Bundle { + val indexSize: Int = log2Ceil(parameter.vLen * 8 / parameter.datapathWidth / parameter.laneNumber) + val dataValid: Bool = Bool() + val requestSend: Bool = Bool() + val sendToExecution: Bool = Bool() + val data: UInt = UInt(parameter.datapathWidth.W) + val readIndex: UInt = UInt(indexSize.W) + val laneIndex: UInt = UInt(parameter.laneNumber.W) +} + +class LaneTokenBundle extends Bundle { + val maskResponseRelease: Bool = Output(Bool()) + val maskRequestRelease: Bool = Input(Bool()) +} diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index eb0dd00ea..e15c74a03 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -103,6 +103,7 @@ case class LaneParameter( decoderParam: DecoderParam, vfuInstantiateParameter: VFUInstantiateParameter) extends SerializableModuleParameter { + val maskUnitVefWriteQueueSize: Int = 8 /** 1 in MSB for instruction order. 
*/ val instructionIndexBits: Int = log2Ceil(chainingSize) + 1 @@ -183,6 +184,9 @@ case class LaneParameter( */ val executionQueueSize: Int = 4 + // outstanding of MaskExchangeUnit.maskReq + val maskRequestQueueSize: Int = 8 + /** Parameter for [[VRF]] */ def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, portFactor, vrfRamType) } @@ -235,17 +239,18 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ @public val csrInterface: CSRInterface = IO(Input(new CSRInterface(parameter.vlMaxBits))) - /** response to [[T1.lsu]] or mask unit in [[T1]] */ @public - val laneResponse: ValidIO[LaneResponse] = IO(Valid(new LaneResponse(parameter))) + val maskUnitRequest: ValidIO[MaskUnitExeReq] = IO(Valid(new MaskUnitExeReq(parameter))) + + @public + val maskRequestToLSU: Bool = IO(Output(Bool())) - /** feedback from [[T1]] to [[Lane]] for [[laneResponse]] */ @public - val laneResponseFeedback: ValidIO[LaneResponseFeedback] = IO(Flipped(Valid(new LaneResponseFeedback(parameter)))) + val tokenIO: LaneTokenBundle = IO(new LaneTokenBundle) /** for LSU and V accessing lane, this is not a part of ring, but a direct connection. 
*/ @public - val vrfReadAddressChannel: DecoupledIO[VRFReadRequest] = IO( + val vrfReadAddressChannel: DecoupledIO[VRFReadRequest] = IO( Flipped( Decoupled( new VRFReadRequest(parameter.vrfParam.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) @@ -253,21 +258,21 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) ) @public - val vrfReadDataChannel: UInt = IO(Output(UInt(parameter.datapathWidth.W))) - @public - val vrfWriteChannel: DecoupledIO[VRFWriteRequest] = IO( - Flipped( - Decoupled( - new VRFWriteRequest( - parameter.vrfParam.regNumBits, - parameter.vrfOffsetBits, - parameter.instructionIndexBits, - parameter.datapathWidth - ) - ) - ) + val vrfReadDataChannel: UInt = IO(Output(UInt(parameter.datapathWidth.W))) + + val vrfWriteType: VRFWriteRequest = new VRFWriteRequest( + parameter.vrfParam.regNumBits, + parameter.vrfOffsetBits, + parameter.instructionIndexBits, + parameter.datapathWidth ) + @public + val vrfWriteChannel: DecoupledIO[VRFWriteRequest] = IO(Flipped(Decoupled(vrfWriteType))) + + @public + val writeFromMask: Bool = IO(Input(Bool())) + /** for each instruction in the slot, response to top when instruction is finished in this lane. 
*/ @public val instructionFinished: UInt = IO(Output(UInt(parameter.chainingSize.W))) @@ -363,7 +368,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ */ val vrfWriteArbiter: Vec[DecoupledIO[VRFWriteRequest]] = Wire( Vec( - parameter.chainingSize + 1, + parameter.chainingSize + 2, Decoupled( new VRFWriteRequest( parameter.vrfParam.regNumBits, @@ -374,16 +379,31 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) ) ) - val topWriteQueue: DecoupledIO[VRFWriteRequest] = Queue(vrfWriteChannel, 1, flow = true) - vrfWriteArbiter(parameter.chainingSize).valid := topWriteQueue.valid - vrfWriteArbiter(parameter.chainingSize).bits := topWriteQueue.bits - topWriteQueue.ready := vrfWriteArbiter(parameter.chainingSize).ready - val allVrfWriteAfterCheck: Seq[VRFWriteRequest] = Seq.tabulate(parameter.chainingSize + 3) { i => + val lsuWriteQueue: QueueIO[VRFWriteRequest] = Queue.io(vrfWriteType, 1, flow = true) + // connect lsuWriteQueue.enq + lsuWriteQueue.enq.valid := vrfWriteChannel.valid && !writeFromMask + lsuWriteQueue.enq.bits := vrfWriteChannel.bits + vrfWriteChannel.ready := writeFromMask || lsuWriteQueue.enq.ready + + val maskWriteQueue: QueueIO[VRFWriteRequest] = Queue.io(vrfWriteType, parameter.maskUnitVefWriteQueueSize) + // connect maskWriteQueue.enq + maskWriteQueue.enq.valid := vrfWriteChannel.valid && writeFromMask + maskWriteQueue.enq.bits := vrfWriteChannel.bits + + vrfWriteArbiter(parameter.chainingSize).valid := lsuWriteQueue.deq.valid + vrfWriteArbiter(parameter.chainingSize).bits := lsuWriteQueue.deq.bits + lsuWriteQueue.deq.ready := vrfWriteArbiter(parameter.chainingSize).ready + + vrfWriteArbiter(parameter.chainingSize + 1).valid := maskWriteQueue.deq.valid + vrfWriteArbiter(parameter.chainingSize + 1).bits := maskWriteQueue.deq.bits + maskWriteQueue.deq.ready := vrfWriteArbiter(parameter.chainingSize + 1).ready + + val allVrfWriteAfterCheck: Seq[VRFWriteRequest] = 
Seq.tabulate(parameter.chainingSize + 4) { i => RegInit(0.U.asTypeOf(vrfWriteArbiter.head.bits)) } - val afterCheckValid: Seq[Bool] = Seq.tabulate(parameter.chainingSize + 3) { _ => RegInit(false.B) } - val afterCheckDequeueReady: Vec[Bool] = Wire(Vec(parameter.chainingSize + 3, Bool())) + val afterCheckValid: Seq[Bool] = Seq.tabulate(parameter.chainingSize + 4) { _ => RegInit(false.B) } + val afterCheckDequeueReady: Vec[Bool] = Wire(Vec(parameter.chainingSize + 4, Bool())) val afterCheckDequeueFire: Seq[Bool] = afterCheckValid.zip(afterCheckDequeueReady).map { case (v, r) => v && r } /** for each slot, assert when it is asking [[T1]] to change mask */ @@ -412,8 +432,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) ) - val ffoRecord: FFORecord = RegInit(0.U.asTypeOf(new FFORecord)) - /** VRF read request for each slot, 3 is for [[source1]] [[source2]] [[source3]] */ val vrfReadRequest: Vec[Vec[DecoupledIO[VRFReadRequest]]] = Wire( @@ -569,14 +587,26 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ slotCanShift(index) := true.B } - val laneState: LaneState = Wire(new LaneState(parameter)) - val stage0: Instance[LaneStage0] = Instantiate(new LaneStage0(parameter, isLastSlot)) - val stage1: Instance[LaneStage1] = Instantiate(new LaneStage1(parameter, isLastSlot)) - val stage2: Instance[LaneStage2] = Instantiate(new LaneStage2(parameter, isLastSlot)) - val executionUnit: Instance[LaneExecutionBridge] = Instantiate( + val laneState: LaneState = Wire(new LaneState(parameter)) + val stage0: Instance[LaneStage0] = Instantiate(new LaneStage0(parameter, isLastSlot)) + val stage1: Instance[LaneStage1] = Instantiate(new LaneStage1(parameter, isLastSlot)) + val stage2: Instance[LaneStage2] = Instantiate(new LaneStage2(parameter, isLastSlot)) + val executionUnit: Instance[LaneExecutionBridge] = Instantiate( new LaneExecutionBridge(parameter, isLastSlot, index) ) - val stage3: Instance[LaneStage3] = 
Instantiate(new LaneStage3(parameter, isLastSlot)) + val maskStage: Option[Instance[MaskExchangeUnit]] = + Option.when(isLastSlot)(Instantiate(new MaskExchangeUnit(parameter))) + val stage3: Instance[LaneStage3] = Instantiate(new LaneStage3(parameter, isLastSlot)) + val stage3EnqWire: DecoupledIO[LaneStage3Enqueue] = Wire(Decoupled(new LaneStage3Enqueue(parameter, isLastSlot))) + val stage3EnqSelect: DecoupledIO[LaneStage3Enqueue] = maskStage.map { mask => + mask.enqueue <> stage3EnqWire + maskUnitRequest <> mask.maskReq + maskRequestToLSU <> mask.maskRequestToLSU + tokenIO <> mask.tokenIO + tokenIO.maskResponseRelease := maskWriteQueue.deq.fire + mask.dequeue + }.getOrElse(stage3EnqWire) + stage3.enqueue <> stage3EnqSelect // slot state laneState.vSew1H := vSew1H @@ -596,7 +626,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ laneState.vd := record.laneRequest.vd laneState.instructionIndex := record.laneRequest.instructionIndex laneState.skipEnable := skipEnable - laneState.ffoByOtherLanes := ffoRecord.ffoByOtherLanes laneState.additionalRW := record.additionalRW laneState.skipRead := record.laneRequest.decodeResult(Decoder.other) && (record.laneRequest.decodeResult(Decoder.uop) === 9.U) @@ -740,9 +769,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ sink := source } - executionUnit.ffoByOtherLanes := ffoRecord.ffoByOtherLanes - executionUnit.selfCompleted := ffoRecord.selfCompleted - // executionUnit <> vfu requestVec(index) := executionUnit.vfuRequest.bits executeDecodeVec(index) := executionUnit.executeDecode @@ -758,51 +784,35 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ 0.U(parameter.chainingSize.W) ) AssertProperty(BoolSequence(!executionUnit.dequeue.valid || stage2.dequeue.valid)) - stage3.enqueue.valid := executionUnit.dequeue.valid - executionUnit.dequeue.ready := stage3.enqueue.ready + stage3EnqWire.valid := executionUnit.dequeue.valid + 
executionUnit.dequeue.ready := stage3EnqWire.ready stage2.dequeue.ready := executionUnit.dequeue.fire if (!isLastSlot) { - stage3.enqueue.bits := DontCare + stage3EnqWire.bits := DontCare } // pipe state from stage0 - stage3.enqueue.bits.decodeResult := stage2.dequeue.bits.decodeResult - stage3.enqueue.bits.instructionIndex := stage2.dequeue.bits.instructionIndex - stage3.enqueue.bits.loadStore := stage2.dequeue.bits.loadStore - stage3.enqueue.bits.vd := stage2.dequeue.bits.vd - stage3.enqueue.bits.ffoByOtherLanes := ffoRecord.ffoByOtherLanes - stage3.enqueue.bits.groupCounter := stage2.dequeue.bits.groupCounter - stage3.enqueue.bits.mask := stage2.dequeue.bits.mask + stage3EnqWire.bits.decodeResult := stage2.dequeue.bits.decodeResult + stage3EnqWire.bits.instructionIndex := stage2.dequeue.bits.instructionIndex + stage3EnqWire.bits.loadStore := stage2.dequeue.bits.loadStore + stage3EnqWire.bits.vd := stage2.dequeue.bits.vd + stage3EnqWire.bits.ffoByOtherLanes := false.B + stage3EnqWire.bits.groupCounter := stage2.dequeue.bits.groupCounter + stage3EnqWire.bits.mask := stage2.dequeue.bits.mask if (isLastSlot) { - stage3.enqueue.bits.sSendResponse := stage2.dequeue.bits.sSendResponse.get - stage3.enqueue.bits.ffoSuccess := executionUnit.dequeue.bits.ffoSuccess.get - stage3.enqueue.bits.fpReduceValid.zip(executionUnit.dequeue.bits.fpReduceValid).foreach { case (sink, source) => + stage3EnqWire.bits.sSendResponse := stage2.dequeue.bits.sSendResponse.get + stage3EnqWire.bits.ffoSuccess := executionUnit.dequeue.bits.ffoSuccess.get + stage3EnqWire.bits.fpReduceValid.zip(executionUnit.dequeue.bits.fpReduceValid).foreach { case (sink, source) => sink := source } } - stage3.enqueue.bits.data := executionUnit.dequeue.bits.data - stage3.enqueue.bits.pipeData := stage2.dequeue.bits.pipeData.getOrElse(DontCare) - stage3.enqueue.bits.ffoIndex := executionUnit.dequeue.bits.ffoIndex - executionUnit.dequeue.bits.crossWriteData.foreach(data => stage3.enqueue.bits.crossWriteData := 
data) - stage2.dequeue.bits.sSendResponse.foreach(_ => stage3.enqueue.bits.sSendResponse := _) - executionUnit.dequeue.bits.ffoSuccess.foreach(_ => stage3.enqueue.bits.ffoSuccess := _) - - if (isLastSlot) { - when(laneResponseFeedback.valid) { - when(laneResponseFeedback.bits.complete) { - ffoRecord.ffoByOtherLanes := true.B - } - } - when(stage3.enqueue.fire) { - executionUnit.dequeue.bits.ffoSuccess.foreach(ffoRecord.selfCompleted := _) - // This group found means the next group ended early - ffoRecord.ffoByOtherLanes := ffoRecord.ffoByOtherLanes || ffoRecord.selfCompleted - } - - laneResponse <> stage3.laneResponse.get - stage3.laneResponseFeedback.get <> laneResponseFeedback - } + stage3EnqWire.bits.data := executionUnit.dequeue.bits.data + stage3EnqWire.bits.pipeData := stage2.dequeue.bits.pipeData.getOrElse(DontCare) + stage3EnqWire.bits.ffoIndex := executionUnit.dequeue.bits.ffoIndex + executionUnit.dequeue.bits.crossWriteData.foreach(data => stage3EnqWire.bits.crossWriteData := data) + stage2.dequeue.bits.sSendResponse.foreach(_ => stage3EnqWire.bits.sSendResponse := _) + executionUnit.dequeue.bits.ffoSuccess.foreach(_ => stage3EnqWire.bits.ffoSuccess := _) // --- stage 3 end & stage 4 start --- // vrfWriteQueue try to write vrf @@ -818,14 +828,13 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // cross write bus <> write queue crossLaneWriteQueue.zipWithIndex.foreach { case (queue, index) => - val port = writeBusPort(index) - // ((counter << 1) >> parameter.vrfParam.vrfOffsetBits).low(3) - val registerIncreaseBase = parameter.vrfParam.vrfOffsetBits - 1 + val port = writeBusPort(index) + val baseIndex = slotControl.head.laneRequest.vd ## 0.U(parameter.vrfOffsetBits.W) + val indexGrowth: UInt = changeUIntSize(port.enq.bits.counter ## index.U(1.W), port.enq.bits.counter.getWidth) + val finalIndex: UInt = baseIndex + indexGrowth queue.enq.valid := port.enq.valid - queue.enq.bits.vd := - // 3: 8 reg => log(2, 8) - 
slotControl.head.laneRequest.vd + port.enq.bits.counter(registerIncreaseBase + 3 - 1, registerIncreaseBase) - queue.enq.bits.offset := port.enq.bits.counter ## index.U(1.W) + queue.enq.bits.vd := finalIndex >> parameter.vrfOffsetBits + queue.enq.bits.offset := finalIndex queue.enq.bits.data := port.enq.bits.data queue.enq.bits.last := DontCare queue.enq.bits.instructionIndex := port.enq.bits.instructionIndex @@ -849,7 +858,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // It’s been a long time since I selected it. Need pipe val queueBeforeMaskWrite: QueueIO[VRFWriteRequest] = Queue.io(chiselTypeOf(maskedWriteUnit.enqueue.bits), entries = 1, pipe = true) - val writeSelect: UInt = Wire(UInt((parameter.chainingSize + 3).W)) + val writeSelect: UInt = Wire(UInt((parameter.chainingSize + 4).W)) val writeCavitation: UInt = VecInit(allVrfWriteAfterCheck.map(_.mask === 0.U)).asUInt // 处理 rf @@ -1073,14 +1082,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ } } - // slot 0 update - when(slotEnqueueFire.head) { - // new ffo enq - when(slotControl(1).laneRequest.decodeResult(Decoder.ffo)) { - ffoRecord := 0.U.asTypeOf(ffoRecord) - } - } - val slotDequeueFire: Seq[Bool] = (slotCanShift.head && slotOccupied.head) +: slotEnqueueFire Seq.tabulate(parameter.chainingSize) { slotIndex => when(slotEnqueueFire(slotIndex) ^ slotDequeueFire(slotIndex)) { @@ -1109,6 +1110,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ vrf.instructionWriteReport.bits.indexType := laneRequest.valid && laneRequest.bits.loadStore // TODO: move ma to [[V]] vrf.instructionWriteReport.bits.ma := laneRequest.bits.ma + vrf.instructionWriteReport.bits.onlyRead := laneRequest.bits.decodeResult(Decoder.popCount) // for mask unit vrf.instructionWriteReport.bits.slow := laneRequest.bits.decodeResult(Decoder.special) vrf.instructionWriteReport.bits.ls := laneRequest.bits.loadStore @@ -1171,14 +1173,15 @@ 
class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ writeReadyForLsu := vrf.writeReadyForLsu vrfReadyToStore := vrf.vrfReadyToStore tokenManager.crossWriteReports.zipWithIndex.foreach { case (rpt, rptIndex) => - rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 1 + rptIndex) - rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1 + rptIndex).instructionIndex + rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 2 + rptIndex) + rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 2 + rptIndex).instructionIndex } // todo: add mask unit write token - tokenManager.responseReport.valid := laneResponse.valid - tokenManager.responseReport.bits := laneResponse.bits.instructionIndex - tokenManager.responseFeedbackReport.valid := laneResponseFeedback.valid - tokenManager.responseFeedbackReport.bits := laneResponseFeedback.bits.instructionIndex + tokenManager.responseReport.valid := maskUnitRequest.valid + tokenManager.responseReport.bits := maskUnitRequest.bits.index + // todo: delete feedback token + tokenManager.responseFeedbackReport.valid := vrfWriteChannel.fire && writeFromMask + tokenManager.responseFeedbackReport.bits := vrfWriteChannel.bits.instructionIndex val instInSlot: UInt = slotControl .zip(slotOccupied) .map { case (slotState, occupied) => @@ -1207,9 +1210,15 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ tokenManager.topWriteEnq.valid := vrfWriteChannel.fire tokenManager.topWriteEnq.bits := vrfWriteChannel.bits.instructionIndex + tokenManager.fromMask := writeFromMask + + tokenManager.lsuWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize) + tokenManager.lsuWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex + + tokenManager.maskWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize + 1) + tokenManager.maskWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1).instructionIndex - 
tokenManager.topWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize) - tokenManager.topWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex + tokenManager.maskUnitLastReport := lsuLastReport layer.block(layers.Verification) { val probeWire = Wire(new LaneProbe(parameter)) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 6a78a2b7c..8e52e136a 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -252,6 +252,8 @@ case class T1Parameter( val vrfReadLatency = 2 + val maskUnitVefWriteQueueSize: Int = 8 + // each element: Each lane will be connected to the other two lanes, // and the values are their respective delays. val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1)) @@ -333,8 +335,7 @@ class T1Probe(parameter: T1Parameter) extends Bundle { val requestReg: ValidIO[InstructionPipeBundle] = ValidIO(new InstructionPipeBundle(parameter)) val requestRegReady: Bool = Bool() // write queue enq for mask unit - val writeQueueEnq: ValidIO[UInt] = Valid(UInt(parameter.instructionIndexBits.W)) - val writeQueueEnqMask: UInt = UInt((parameter.datapathWidth / 8).W) + val writeQueueEnqVec: Vec[ValidIO[UInt]] = Vec(parameter.laneNumber, Valid(UInt(parameter.instructionIndexBits.W))) // mask unit instruction valid val instructionValid: UInt = UInt((parameter.chainingSize * 2).W) // instruction index for check rd @@ -395,8 +396,9 @@ class T1(val parameter: T1Parameter) /** the LSU Module */ - val lsu: Instance[LSU] = Instantiate(new LSU(parameter.lsuParameters)) - val decode: Instance[VectorDecoder] = Instantiate(new VectorDecoder(parameter.decoderParam)) + val lsu: Instance[LSU] = Instantiate(new LSU(parameter.lsuParameters)) + val decode: Instance[VectorDecoder] = Instantiate(new VectorDecoder(parameter.decoderParam)) + val maskUnit: Instance[MaskUnit] = Instantiate(new MaskUnit(parameter)) omInstance.decoderIn := Property(decode.om.asAnyClassType) val tokenManager: Instance[T1TokenManager] = Instantiate(new 
T1TokenManager(parameter)) @@ -504,35 +506,9 @@ class T1(val parameter: T1Parameter) Fill(8, imm(4) && (vSew1H(1) || vSew1H(2) || src1IsSInt)) ## Fill(3, imm(4)) ## imm - /** duplicate v0 for mask */ - val v0: Vec[UInt] = RegInit( - VecInit(Seq.fill(parameter.vLen / parameter.datapathWidth)(0.U(parameter.datapathWidth.W))) - ) - // TODO: uarch doc for the regroup - val regroupV0: Seq[UInt] = Seq(4, 2, 1).map { groupSize => - VecInit( - cutUInt(v0.asUInt, groupSize) - .grouped(parameter.laneNumber) - .toSeq - .transpose - .map(seq => VecInit(seq).asUInt) - ).asUInt - } - /** which slot the instruction is entering */ val instructionToSlotOH: UInt = Wire(UInt(parameter.chainingSize.W)) - /** synchronize signal from each lane, for mask units.(ffo) */ - val laneSynchronize: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - - /** all lanes are synchronized. */ - val synchronized: Bool = WireDefault(false.B) - - /** for mask unit that need to access VRF from lanes, use this signal to indicate it is finished access VRF(but - * instruction might not finish). - */ - val maskUnitReadOnlyFinish: Bool = WireDefault(false.B) - /** last slot is committing. */ val lastSlotCommit: Bool = Wire(Bool()) @@ -543,11 +519,6 @@ class T1(val parameter: T1Parameter) val vxsatReportVec: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.chainingSize.W))) val vxsatReport = vxsatReportVec.reduce(_ | _) - // todo: 把lsu也放decode里去 - val maskUnitType: Bool = decodeResult(Decoder.maskUnit) && requestRegDequeue.bits.instruction(6) - val maskDestination = decodeResult(Decoder.maskDestination) - val unOrderType: Bool = decodeResult(Decoder.unOrderWrite) - /** Special instructions which will be allocate to the last slot. * - mask unit * - Lane <-> Top has data exchange(top might forward to LSU.) 
TODO: move to normal slots(add `offset` fields) @@ -558,99 +529,18 @@ class T1(val parameter: T1Parameter) val dataInWritePipeVec: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.chainingSize.W))) val dataInWritePipe: UInt = dataInWritePipeVec.reduce(_ | _) - /** designed for unordered instruction(slide), it doesn't go to lane, it has RAW hazzard. - */ - val instructionRAWReady: Bool = Wire(Bool()) - val allSlotFree: Bool = Wire(Bool()) - val existMaskType: Bool = Wire(Bool()) - - // mask Unit 与lane交换数据 - val writeType: VRFWriteRequest = new VRFWriteRequest( - parameter.vrfParam.regNumBits, - parameter.vrfParam.vrfOffsetBits, - parameter.instructionIndexBits, - parameter.datapathWidth - ) - val maskUnitWrite: ValidIO[VRFWriteRequest] = Wire(Valid(writeType)) - val maskUnitWriteVec: Vec[ValidIO[VRFWriteRequest]] = Wire(Vec(3, Valid(writeType))) - val maskWriteLaneSelect: Vec[UInt] = Wire(Vec(3, UInt(parameter.laneNumber.W))) - // 默认是head - val maskUnitWriteSelect: UInt = Mux1H(maskUnitWriteVec.map(_.valid), maskWriteLaneSelect) - maskUnitWriteVec.foreach(_ := DontCare) - maskUnitWrite := Mux1H(maskUnitWriteVec.map(_.valid), maskUnitWriteVec) - val writeSelectMaskUnit: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - val maskUnitWriteReady: Bool = writeSelectMaskUnit.asUInt.orR + // todo: instructionRAWReady -> v0 write token + val allSlotFree: Bool = Wire(Bool()) + val existMaskType: Bool = Wire(Bool()) // read - val readType: VRFReadRequest = new VRFReadRequest( + val readType: VRFReadRequest = new VRFReadRequest( parameter.vrfParam.regNumBits, parameter.vrfParam.vrfOffsetBits, parameter.instructionIndexBits ) - val maskUnitRead: ValidIO[VRFReadRequest] = Wire(Valid(readType)) - val maskUnitReadVec: Vec[ValidIO[VRFReadRequest]] = Wire(Vec(3, Valid(readType))) - val maskReadLaneSelect: Vec[UInt] = Wire(Vec(3, UInt(parameter.laneNumber.W))) - val maskUnitReadSelect: UInt = Mux1H(maskUnitReadVec.map(_.valid), maskReadLaneSelect) - maskUnitRead := 
Mux1H(maskUnitReadVec.map(_.valid), maskUnitReadVec) - val readSelectMaskUnit: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - val maskUnitReadReady = readSelectMaskUnit.asUInt.orR - val laneReadResult: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) - val WARRedResult: ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W)))) - // mask unit 最后的写 - val maskUnitFlushVrf: Bool = WireDefault(false.B) - - // gather read state - val gatherOverlap: Bool = Wire(Bool()) - val gatherNeedRead: Bool = requestRegDequeue.valid && decodeResult(Decoder.gather) && - !decodeResult(Decoder.vtype) && !gatherOverlap - val gatherReadFinish: Bool = - RegEnable( - !requestRegDequeue.fire, - false.B, - (RegNext(RegNext(maskUnitReadReady)) && gatherNeedRead) || requestRegDequeue.fire - ) - val gatherReadDataOffset: UInt = Wire(UInt(5.W)) - val gatherData: UInt = Mux(gatherOverlap, 0.U, (WARRedResult.bits >> gatherReadDataOffset).asUInt) - /** data that need to be compute at top. */ - val data: Vec[ValidIO[UInt]] = RegInit( - VecInit(Seq.fill(parameter.laneNumber)(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W))))) - ) - val flotReduceValid: Seq[Option[Bool]] = Seq.tabulate(parameter.laneNumber) { _ => - Option.when(parameter.fpuEnable)(RegInit(false.B)) - } - val maskDataForCompress: UInt = RegInit(0.U(parameter.datapathWidth.W)) - // clear the previous set of data from lane - val dataClear: Bool = WireDefault(false.B) - val completedVec: Vec[Bool] = RegInit(VecInit(Seq.fill(parameter.laneNumber)(false.B))) - // ffoIndexReg.valid: Already found the first one - val ffoIndexReg: ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.xLen.W)))) - val ffoType: Bool = Wire(Bool()) - - /** for find first one, need to tell the lane with higher index `1` . 
*/ - val completedLeftOr: UInt = (scanLeftOr(completedVec.asUInt) << 1).asUInt(parameter.laneNumber - 1, 0) - // 按指定的sew拼成 {laneNumer * dataPathWidth} bit, 然后根据sew选择出来 - val sortedData: UInt = Mux1H( - vSewOHForMask, - Seq(4, 2, 1).map { groupSize => - VecInit(data.map { element => - element.bits.asBools // [x] * 32 eg: sew = 1 - .grouped(groupSize) // [x, x] * 16 - .toSeq - .map(VecInit(_).asUInt) // [xx] * 16 - }.transpose.map(VecInit(_).asUInt)).asUInt // [x*16] * 16 -> x * 256 - } - ) - // 把已经排过序的数据重新分给各个lane - val regroupData: Vec[UInt] = VecInit(Seq.tabulate(parameter.laneNumber) { laneIndex => - sortedData( - laneIndex * parameter.datapathWidth + parameter.datapathWidth - 1, - laneIndex * parameter.datapathWidth - ) - }) - val dataResult: ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W)))) - - val executeForLastLaneFire: Bool = WireDefault(false.B) + val gatherNeedRead: Bool = requestRegDequeue.valid && decodeResult(Decoder.gather) /** state machine register for each instruction. */ val slots: Seq[InstructionControl] = Seq.tabulate(parameter.chainingSize) { index => @@ -661,8 +551,6 @@ class T1(val parameter: T1Parameter) .asTypeOf(new InstructionControl(parameter.instructionIndexBits, parameter.laneNumber)) ) - val mvToVRF: Option[Bool] = Option.when(index == parameter.chainingSize - 1)(RegInit(false.B)) - /** the execution is finished. (but there might still exist some data in the ring.) 
*/ val laneAndLSUFinish: Bool = control.endTag.asUInt.andR @@ -688,6 +576,7 @@ class T1(val parameter: T1Parameter) control.state.wLast := false.B control.state.sCommit := false.B control.state.wVRFWrite := !requestReg.bits.decodeResult(Decoder.maskUnit) + control.state.wMaskUnitLast := !requestReg.bits.decodeResult(Decoder.maskUnit) control.vxsat := false.B // two different initial states for endTag: // for load/store instruction, use the last bit to indicate whether it is the last instruction @@ -696,11 +585,14 @@ class T1(val parameter: T1Parameter) } // state machine starts here .otherwise { + when(maskUnit.lastReport.orR) { + control.state.wMaskUnitLast := true.B + } when(laneAndLSUFinish && v0WriteFinish) { control.state.wLast := true.B } - when(control.state.wLast && control.state.sMaskUnitExecution && !dataInWritePipeCheck) { + when(control.state.wLast && control.state.wMaskUnitLast && !dataInWritePipeCheck) { control.state.wVRFWrite := true.B } @@ -708,7 +600,7 @@ class T1(val parameter: T1Parameter) control.state.sCommit := true.B } - when(control.state.sCommit && control.state.wVRFWrite && control.state.sMaskUnitExecution) { + when(control.state.sCommit && control.state.wVRFWrite && control.state.wMaskUnitLast) { control.state.idle := true.B } @@ -720,735 +612,22 @@ class T1(val parameter: T1Parameter) control.vxsat := true.B } } - // logic like mask&reduce will be put to last slot - // TODO: review later if (index == (parameter.chainingSize - 1)) { - val feedBack: UInt = RegInit(0.U(parameter.laneNumber.W)) - val executeCounter: UInt = RegInit(0.U((log2Ceil(parameter.laneNumber) + 1).W)) - // mask destination时这两count都是以写vrf为视角 - val writeBackCounter: UInt = RegInit(0.U(log2Ceil(parameter.laneNumber).W)) - val groupCounter: UInt = RegInit(0.U(parameter.groupNumberMaxBits.W)) - val iotaCount: UInt = RegInit(0.U((parameter.laneParam.vlMaxBits - 1).W)) - val maskTypeInstruction = RegInit(false.B) - val vd = RegInit(0.U(5.W)) - val vs1 = RegInit(0.U(5.W)) - 
val vs2 = RegInit(0.U(5.W)) - val rs1 = RegInit(0.U(parameter.xLen.W)) - val vm = RegInit(false.B) - val executeFinishReg = RegInit(true.B) - val unOrderTypeInstruction = RegInit(false.B) - val decodeResultReg = RegInit(0.U.asTypeOf(decodeResult)) - val gather: Bool = decodeResultReg(Decoder.gather) - // for slid - val elementIndexCount = RegInit(0.U(parameter.laneParam.vlMaxBits.W)) - val compressWriteCount = RegInit(0.U(parameter.laneParam.vlMaxBits.W)) - val nextElementIndex: UInt = elementIndexCount + 1.U - val firstElement = elementIndexCount === 0.U - val lastElement: Bool = nextElementIndex === csrRegForMaskUnit.vl - val updateMaskIndex = WireDefault(false.B) - when(updateMaskIndex) { elementIndexCount := nextElementIndex } - // 特殊的指令,会阻止 wLast 后把 sExecute 拉回来, 因为需要等待读后才写 - val mixedUnit: Bool = Wire(Bool()) - // slid & gather & extend - val slidUnitIdle: Bool = RegInit(true.B) - // compress & iota - val iotaUnitIdle: Bool = RegInit(true.B) - val orderedReduceGroupCount: Option[UInt] = Option.when(parameter.fpuEnable)( - RegInit(0.U(log2Ceil(parameter.vLen / parameter.laneNumber).W)) - ) - val orderedReduceIdle: Option[Bool] = Option.when(parameter.fpuEnable)(RegInit(true.B)) - val maskUnitIdle = (Seq(slidUnitIdle, iotaUnitIdle) ++ orderedReduceIdle).reduce(_ && _) - val reduce = decodeResultReg(Decoder.red) - val orderedReduce: Bool = if (parameter.fpuEnable) decodeResultReg(Decoder.orderReduce) else false.B - val popCount = decodeResultReg(Decoder.popCount) - val extend = decodeResultReg(Decoder.extend) - // first type instruction - val firstLane = ffo(completedVec.asUInt) - val firstLaneIndex: UInt = OHToUInt(firstLane)(log2Ceil(parameter.laneNumber) - 1, 0) - io.retire.rd.valid := lastSlotCommit && decodeResultReg(Decoder.targetRd) + val writeRD = RegInit(false.B) + val float: Option[Bool] = Option.when(parameter.fpuEnable)(RegInit(false.B)) + val vd = RegInit(0.U(5.W)) + when(instructionToSlotOH(index)) { + writeRD := decodeResult(Decoder.targetRd) + 
float.foreach(_ := decodeResult(Decoder.float)) + vd := requestRegDequeue.bits.instruction(11, 7) + } + io.retire.rd.valid := lastSlotCommit && writeRD io.retire.rd.bits.rdAddress := vd if (parameter.fpuEnable) { - io.retire.rd.bits.isFp := decodeResultReg(Decoder.float) + io.retire.rd.bits.isFp := float.getOrElse(false.B) } else { io.retire.rd.bits.isFp := false.B } - when(requestRegDequeue.fire) { - ffoIndexReg.valid := false.B - ffoIndexReg.bits := -1.S(parameter.xLen.W).asUInt - }.elsewhen(synchronized && completedVec.asUInt.orR && !ffoIndexReg.valid) { - ffoIndexReg.valid := true.B - ffoIndexReg.bits := Mux1H( - firstLane, - // 3: firstLaneIndex.width - data.map(i => i.bits(parameter.xLen - 1 - 3, 5) ## firstLaneIndex ## i.bits(4, 0)) - ) - } - ffoType := decodeResultReg(Decoder.ffo) - - /** vlmax = vLen * (2**lmul) / (2 ** sew * 8) \= (vLen / 8) * 2 ** (lmul - sew) \= vlb * 2 ** (lmul - sew) lmul <- - * (-3, -2, -1, 0 ,1, 2, 3) sew <- (0, 1, 2) lmul - sew <- [-5, 3] 选择信号 +5 -> lmul - sew + 5 <- [0, 8] - */ - def largeThanVLMax(source: UInt, advance: Bool = false.B, lmul: UInt, sew: UInt): Bool = { - val vlenLog2 = log2Ceil(parameter.vLen) // 10 - val cut = - if (source.getWidth >= vlenLog2) source(vlenLog2 - 1, vlenLog2 - 9) - else (0.U(vlenLog2.W) ## source)(vlenLog2 - 1, vlenLog2 - 9) - // 9: lmul - sew 的可能值的个数 - val largeList: Vec[Bool] = Wire(Vec(9, Bool())) - cut.asBools.reverse.zipWithIndex.foldLeft(advance) { case (a, (b, i)) => - largeList(i) := a - a || b - } - val extendVlmul = lmul(2) ## lmul - val selectWire = UIntToOH(5.U(4.W) + extendVlmul - sew)(8, 0).asBools.reverse - Mux1H(selectWire, largeList) - } - // 算req上面的分开吧 - val gatherWire = - Mux(decodeResult(Decoder.itype), requestRegDequeue.bits.instruction(19, 15), requestRegDequeue.bits.rs1Data) - val gatherAdvance = (gatherWire >> log2Ceil(parameter.vLen)).asUInt.orR - gatherOverlap := largeThanVLMax( - gatherWire, - gatherAdvance, - T1Issue.vlmul(requestReg.bits.issue), - 
T1Issue.vsew(requestReg.bits.issue) - ) - val slotValid = !control.state.idle - val storeAfterSlide = isStoreType && (requestRegDequeue.bits.instruction(11, 7) === vd) - instructionRAWReady := !((unOrderTypeInstruction && slotValid && - // slid 类的会比执行得慢的指令快(div),会修改前面的指令的source - ((vd === requestRegDequeue.bits.instruction(24, 20)) || - (vd === requestRegDequeue.bits.instruction(19, 15)) || - storeAfterSlide || - // slid 类的会比执行得快的指令慢(mv),会被后来的指令修改 source2 - (vs2 === requestRegDequeue.bits.instruction(11, 7))) || - (unOrderType && !allSlotFree) || - (requestReg.bits.vdIsV0 && existMaskType)) || - (vd === 0.U && maskType && slotValid)) - when(instructionToSlotOH(index)) { - writeBackCounter := 0.U - groupCounter := 0.U - executeCounter := 0.U - elementIndexCount := 0.U - compressWriteCount := 0.U - iotaCount := 0.U - slidUnitIdle := !((decodeResult(Decoder.slid) || (decodeResult(Decoder.gather) && decodeResult(Decoder.vtype)) - || decodeResult(Decoder.extend)) && instructionValid) - iotaUnitIdle := !((decodeResult(Decoder.compress) || decodeResult(Decoder.iota)) && instructionValid) - orderedReduceIdle.foreach(_ := !(decodeResult(Decoder.orderReduce) && instructionValid)) - orderedReduceGroupCount.foreach(_ := 0.U) - vd := requestRegDequeue.bits.instruction(11, 7) - vs1 := requestRegDequeue.bits.instruction(19, 15) - vs2 := requestRegDequeue.bits.instruction(24, 20) - vm := requestRegDequeue.bits.instruction(25) - executeFinishReg := false.B - rs1 := requestRegDequeue.bits.rs1Data - decodeResultReg := decodeResult - csrRegForMaskUnit := requestRegCSR - // todo: decode need execute - control.state.sMaskUnitExecution := !maskUnitType - maskTypeInstruction := maskType && !decodeResult(Decoder.maskSource) - completedVec.foreach(_ := false.B) - WARRedResult.valid := false.B - unOrderTypeInstruction := unOrderType - dataResult := 0.U.asTypeOf(dataResult) - }.elsewhen(control.state.wLast && maskUnitIdle) { - // 如果真需要执行的lane会wScheduler,不会提前发出last确认 - when(!mixedUnit) { - 
control.state.sMaskUnitExecution := true.B - } - maskUnitFlushVrf := !control.state.idle - } - when(laneSynchronize.asUInt.orR) { - feedBack := feedBack | laneSynchronize.asUInt - }.elsewhen(lastSlotCommit) { - feedBack := 0.U - } - // 执行 - // mask destination write - /** 对于mask destination 类型的指令需要特别注意两种不对齐 第一种是我们以 32(dataPatWidth) * 8(laneNumber) 为一个组, 但是我们vl可能不对齐一整个组 第二种是 - * 32(dataPatWidth) 的时候对不齐 vl假设最大1024,相应的会有11位的vl xxx xxx xxxxx - */ - val dataPathMisaligned = csrRegForMaskUnit.vl(parameter.dataPathWidthBits - 1, 0).orR - val groupMisaligned = - if (parameter.laneNumber > 1) - csrRegForMaskUnit - .vl(parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - 1, parameter.dataPathWidthBits) - .orR - else false.B - - /** 我们需要计算最后一次写的 [[writeBackCounter]] & [[groupCounter]] lastGroupCounter = vl(10, 8) - !([[dataPathMisaligned]] - * \|| [[groupMisaligned]]) lastExecuteCounter = vl(7, 5) - ![[dataPathMisaligned]] - */ - val lastGroupCounter: UInt = - csrRegForMaskUnit.vl( - parameter.laneParam.vlMaxBits - 1, - parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - ) - !(dataPathMisaligned || groupMisaligned) - val lastExecuteCounter: UInt = if (parameter.laneNumber > 1) { - csrRegForMaskUnit.vl( - parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - 1, - parameter.dataPathWidthBits - ) - !dataPathMisaligned - } else 0.U - val lastGroup = groupCounter === lastGroupCounter - val lastExecute = lastGroup && writeBackCounter === lastExecuteCounter - val lastExecuteForGroup = writeBackCounter.andR - // 计算正写的这个lane是不是在边界上 - val endOH = UIntToOH(csrRegForMaskUnit.vl(parameter.dataPathWidthBits - 1, 0)) - val border = lastExecute && dataPathMisaligned && - !(decodeResultReg(Decoder.compress) || decodeResultReg(Decoder.gather)) - val lastGroupMask = scanRightOr(endOH(parameter.datapathWidth - 1, 1)) - val mvType = decodeResultReg(Decoder.mv) - val readMv = mvType && decodeResultReg(Decoder.targetRd) - val writeMv = mvType && 
!decodeResultReg(Decoder.targetRd) && - csrRegForMaskUnit.vl > csrRegForMaskUnit.vStart - mvToVRF.foreach(d => when(requestRegDequeue.fire) { d := writeMv }) - // 读后写中的读 - val needWAR = (maskTypeInstruction || border || reduce || readMv) && !popCount - val skipLaneData: Bool = decodeResultReg(Decoder.mv) - mixedUnit := writeMv || readMv - maskReadLaneSelect.head := UIntToOH(writeBackCounter) - maskReadLaneSelect.head := UIntToOH(writeBackCounter) - maskWriteLaneSelect.head := maskReadLaneSelect.head - maskUnitReadVec.head.valid := false.B - maskUnitReadVec.head.bits.vs := Mux(readMv, vs2, Mux(reduce, vs1, vd)) - maskUnitReadVec.head.bits.readSource := Mux(readMv, 1.U, Mux(reduce, 0.U, 2.U)) - maskUnitReadVec.head.bits.offset := groupCounter - maskUnitRead.bits.instructionIndex := control.record.instructionIndex - val readResultSelectResult = Mux1H( - Pipe(true.B, maskUnitReadSelect, parameter.vrfReadLatency).bits, - laneReadResult - ) - // 把mask选出来 - val maskSelect = v0(groupCounter ## writeBackCounter) - val fullMask: UInt = (-1.S(parameter.datapathWidth.W)).asUInt - - /** 正常全1 mask:[[maskSelect]] border: [[lastGroupMask]] mask && border: [[maskSelect]] & [[lastGroupMask]] - */ - val maskCorrect: UInt = Mux(maskTypeInstruction, maskSelect, fullMask) & - Mux(border, lastGroupMask, fullMask) - // mask - val sew1HCorrect = Mux(decodeResultReg(Decoder.widenReduce), vSewOHForMask ## false.B, vSewOHForMask) - // 写的data - val writeData = (WARRedResult.bits & (~maskCorrect).asUInt) | (regroupData(writeBackCounter) & maskCorrect) - val writeMask = Mux(sew1HCorrect(2) || !reduce, 15.U, Mux(sew1HCorrect(1), 3.U, 1.U)) - maskUnitWriteVec.head.valid := false.B - maskUnitWriteVec.head.bits.vd := vd - maskUnitWriteVec.head.bits.offset := groupCounter - maskUnitWriteVec.head.bits.data := Mux(writeMv, rs1, Mux(reduce, dataResult.bits, writeData)) - maskUnitWriteVec.head.bits.last := control.state.wLast || reduce - maskUnitWriteVec.head.bits.instructionIndex := 
control.record.instructionIndex - - val waitReadResult: Bool = Wire(Bool()) - val maskUnitReadVrf = maskUnitReadReady && maskUnitReadVec.map(_.valid).reduce(_ || _) && !waitReadResult - val readNext = RegNext(maskUnitReadVrf) - waitReadResult := RegNext(readNext) || readNext - when(Pipe(maskUnitReadVrf, false.B, parameter.vrfReadLatency).valid) { - WARRedResult.bits := readResultSelectResult - WARRedResult.valid := true.B - } - // alu start - val aluInput1 = Mux( - (Seq(executeCounter === 0.U) ++ orderedReduceGroupCount.map(_ === 0.U)).reduce(_ && _), - Mux( - needWAR, - WARRedResult.bits & FillInterleaved(8, writeMask), - 0.U - ), - dataResult.bits - ) - val aluInput2 = Mux1H(UIntToOH(executeCounter), data.map(d => Mux(d.valid, d.bits, 0.U))) - val skipFlotReduce: Bool = !Mux1H(UIntToOH(executeCounter), flotReduceValid.map(_.getOrElse(false.B))) - // red alu instance - val adder: Instance[ReduceAdder] = Instantiate(new ReduceAdder(parameter.datapathWidth)) - val logicUnit: Instance[LaneLogic] = Instantiate(new LaneLogic(parameter.datapathWidth)) - // option unit for flot reduce - val floatAdder: Option[Instance[FloatAdder]] = - Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(8, 24))) - val flotCompare: Option[Instance[FloatCompare]] = - Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(8, 24))) - - val sign = !decodeResultReg(Decoder.unsigned1) - adder.request.src := VecInit( - Seq( - (aluInput1(parameter.datapathWidth - 1) && sign) ## aluInput1, - (aluInput2(parameter.datapathWidth - 1) && sign) ## aluInput2 - ) - ) - // popCount 在top视为reduce add - adder.request.opcode := Mux(popCount, 0.U, decodeResultReg(Decoder.uop)) - adder.request.sign := sign - adder.request.vSew := Mux(popCount, 2.U, OHToUInt(sew1HCorrect)) - - floatAdder.foreach { fAdder => - fAdder.io.a := aluInput1 - fAdder.io.b := aluInput2 - fAdder.io.roundingMode := csrRegForMaskUnit.vxrm - } - - flotCompare.foreach { fCompare => - fCompare.io.a := aluInput1 - fCompare.io.b := 
aluInput2 - // max -> 12, min -> 8 - fCompare.io.isMax := decodeResultReg(Decoder.uop)(2) - } - - logicUnit.req.src := VecInit(Seq(aluInput1, aluInput2)) - logicUnit.req.opcode := decodeResultReg(Decoder.uop) - - // reduce resultSelect - val intReduceResult = Mux( - decodeResultReg(Decoder.adder) || popCount, - adder.response.data, - logicUnit.resp - ) - val flotReduceResult: Option[UInt] = Option.when(parameter.fpuEnable)( - Mux( - skipFlotReduce, - aluInput1, - Mux(decodeResultReg(Decoder.fpExecutionType) === 0.U, floatAdder.get.io.out, flotCompare.get.io.out) - ) - ) - val aluOutPut = Mux1H( - Seq(if (parameter.fpuEnable) reduce && !decodeResultReg(Decoder.float) else reduce) ++ - Option.when(parameter.fpuEnable)(reduce && decodeResultReg(Decoder.float)), - Seq(intReduceResult) ++ flotReduceResult - ) - // slid & gather unit - val slideUp = decodeResultReg(Decoder.topUop)(1) - val slide1 = decodeResultReg(Decoder.topUop)(0) && decodeResultReg(Decoder.slid) - - /** special uop 里面编码了extend的信息: specialUop(1,0): 倍率 specialUop(2):是否是符号 - */ - val extendSourceSew: Bool = (csrRegForMaskUnit.vSew >> decodeResultReg(Decoder.topUop)(1, 0))(0) - val extendSign: Bool = decodeResultReg(Decoder.topUop)(2) - // gather 相关的控制 - val gather16: Bool = decodeResultReg(Decoder.gather16) - val maskUnitEEW = Mux(gather16, 1.U, Mux(extend, extendSourceSew, csrRegForMaskUnit.vSew)) - val maskUnitEEW1H: UInt = UIntToOH(maskUnitEEW) - val maskUnitByteEnable = maskUnitEEW1H(2) ## maskUnitEEW1H(2) ## maskUnitEEW1H(2, 1).orR ## true.B - val maskUnitBitEnable = FillInterleaved(8, maskUnitByteEnable) - maskUnitWriteVec.head.bits.mask := Mux(writeMv, maskUnitByteEnable, writeMask) - // log2(dataWidth * laneNumber / 8) - val maskUnitDataOffset = - (elementIndexCount << maskUnitEEW).asUInt( - log2Ceil(parameter.datapathWidth * parameter.laneNumber / 8) - 1, - 0 - ) ## 0.U(3.W) - val maskUnitData = ((VecInit(data.map(_.bits)).asUInt >> maskUnitDataOffset).asUInt & maskUnitBitEnable)( - 
parameter.datapathWidth - 1, - 0 - ) - - val compareWire = Mux(decodeResultReg(Decoder.slid), rs1, maskUnitData) - val compareAdvance: Bool = (compareWire >> log2Ceil(parameter.vLen)).asUInt.orR - val compareResult: Bool = - largeThanVLMax(compareWire, compareAdvance, csrRegForMaskUnit.vlmul, csrRegForMaskUnit.vSew) - // 正在被gather使用的数据在data的那个组里 - val gatherDataSelect = - UIntToOH((false.B ## maskUnitDataOffset)(5 + (log2Ceil(parameter.laneNumber).max(1)) - 1, 5)) - val dataTail = Mux1H(UIntToOH(maskUnitEEW)(1, 0), Seq(3.U(2.W), 2.U(2.W))) - val lastElementForData = gatherDataSelect.asBools.last && maskUnitDataOffset(4, 3) === dataTail - val lastElementForCompressMask = elementIndexCount(log2Ceil(parameter.datapathWidth) - 1, 0).andR - val maskUnitDataReady: Bool = (gatherDataSelect & VecInit(data.map(_.valid)).asUInt).orR - // 正在被gather使用的数据是否就绪了 - val isSlide = !(gather || extend) - val slidUnitDataReady: Bool = maskUnitDataReady || isSlide - val compressDataReady = maskUnitDataReady || !(decodeResultReg(Decoder.compress) || decodeResultReg(Decoder.iota)) - // slid 先用状态机 - val idle :: sRead :: sWrite :: Nil = Enum(3) - val slideState = RegInit(idle) - val readState = slideState === sRead - - // slid 的立即数是0扩展的 - val slidSize = Mux(slide1, 1.U, Mux(decodeResultReg(Decoder.itype), vs1, rs1)) - // todo: 这里是否有更好的处理方式 - val slidSizeLSB = slidSize(parameter.laneParam.vlMaxBits - 1, 0) - // down + - // up - - val directionSelection = Mux(slideUp, (~slidSizeLSB).asUInt, slidSizeLSB) - val slideReadIndex = elementIndexCount + directionSelection + slideUp - val readIndex: UInt = Mux( - !maskUnitIdle, - Mux( - decodeResultReg(Decoder.slid), - slideReadIndex, - maskUnitData - ), - gatherWire - ) - - def indexAnalysis(elementIndex: UInt, csrInput: CSRInterface = csrRegForMaskUnit) = { - val sewInput = csrInput.vSew - val sewOHInput = UIntToOH(csrInput.vSew)(2, 0) - val intLMULInput: UInt = (1.U << csrInput.vlmul(1, 0)).asUInt - val dataPosition = 
(elementIndex(parameter.laneParam.vlMaxBits - 2, 0) << sewInput) - .asUInt(parameter.laneParam.vlMaxBits - 2, 0) - val accessMask = Mux1H( - sewOHInput(2, 0), - Seq( - UIntToOH(dataPosition(1, 0)), - FillInterleaved(2, UIntToOH(dataPosition(1))), - 15.U(4.W) - ) - ) - // 数据起始位置在32bit(暂时只32)中的偏移,由于数据会有跨lane的情况,融合的优化时再做 - val dataOffset = (dataPosition(1) && sewOHInput(1, 0).orR) ## (dataPosition(0) && sewOHInput(0)) ## 0.U(3.W) - val accessLane = if (parameter.laneNumber > 1) dataPosition(log2Ceil(parameter.laneNumber) + 1, 2) else 0.U(1.W) - // 32 bit / group - val dataGroup = (dataPosition >> (log2Ceil(parameter.laneNumber) + 2)).asUInt - val offsetWidth: Int = parameter.laneParam.vrfParam.vrfOffsetBits - val offset = dataGroup(offsetWidth - 1, 0) - val accessRegGrowth = (dataGroup >> offsetWidth).asUInt - val decimalProportion = offset ## accessLane - // 1/8 register - val decimal = decimalProportion(decimalProportion.getWidth - 1, 0.max(decimalProportion.getWidth - 3)) - - /** elementIndex 需要与vlMax比较, vLen * lmul /sew 这个计算太复杂了 我们可以换一个角度,计算读寄存器的增量与lmul比较,就能知道下标是否超vlMax了 vlmul - * 需要区分整数与浮点 - */ - val overlap = - (csrInput.vlmul(2) && decimal >= intLMULInput(3, 1)) || - (!csrInput.vlmul(2) && accessRegGrowth >= intLMULInput) - accessRegGrowth >= csrInput.vlmul - val reallyGrowth = accessRegGrowth(2, 0) - (accessMask, dataOffset, accessLane, offset, reallyGrowth, overlap) - } - val srcOverlap: Bool = !decodeResultReg(Decoder.itype) && (rs1 >= csrRegForMaskUnit.vl) - // rs1 >= vlMax - val srcOversize = !decodeResultReg(Decoder.itype) && !slide1 && compareResult - val signBit = Mux1H( - vSewOHForMask, - readIndex(parameter.laneParam.vlMaxBits - 1, parameter.laneParam.vlMaxBits - 3).asBools.reverse - ) - // 对于up来说小于offset的element是不变得的 - val slideUpUnderflow = slideUp && !slide1 && (signBit || srcOverlap) - val elementActive: Bool = v0.asUInt(elementIndexCount) || vm - val slidActive = elementActive && (!slideUpUnderflow || !decodeResultReg(Decoder.slid)) - // index >= 
vlMax 是写0 - val overlapVlMax: Bool = !slideUp && (signBit || srcOversize) - // select csr - val csrSelect = Mux(control.state.idle, requestRegCSR, csrRegForMaskUnit) - // slid read - val (_, readDataOffset, readLane, readOffset, readGrowth, lmulOverlap) = indexAnalysis(readIndex, csrSelect) - gatherReadDataOffset := readDataOffset - val readOverlap = lmulOverlap || overlapVlMax - val skipRead = readOverlap || (gather && compareResult) || extend - val maskUnitWriteVecFire1 = maskUnitReadVec(1).valid && maskUnitReadReady - val readFireNext1: Bool = RegNext(maskUnitWriteVecFire1) - val readFireNextNext1: Bool = RegNext(readFireNext1) - val port1WaitForResult: Bool = readFireNext1 || readFireNextNext1 - val gatherTryToRead = - gatherNeedRead && !VecInit(lsu.vrfReadDataPorts.map(_.valid)).asUInt.orR && !gatherReadFinish - maskUnitReadVec(1).valid := (readState || gatherTryToRead) && !port1WaitForResult - maskUnitReadVec(1).bits.vs := Mux(readState, vs2, requestRegDequeue.bits.instruction(24, 20)) + readGrowth - maskUnitReadVec(1).bits.readSource := 1.U - maskUnitReadVec(1).bits.offset := readOffset - maskReadLaneSelect(1) := UIntToOH(readLane) - // slid write, vlXXX: 用element index 算出来的 - val (vlMask, vlDataOffset, vlLane, vlOffset, vlGrowth, _) = indexAnalysis(elementIndexCount) - val writeState = slideState === sWrite - // 处理数据,先硬移位吧 - val slidReadData: UInt = ((WARRedResult.bits >> readDataOffset) << vlDataOffset) - .asUInt(parameter.datapathWidth - 1, 0) - val selectRS1 = slide1 && ((slideUp && firstElement) || (!slideUp && lastElement)) - // extend 类型的扩展和移位 - val extendData: UInt = (Mux( - extendSourceSew, - Fill(parameter.datapathWidth - 16, extendSign && maskUnitData(15)) ## maskUnitData(15, 0), - Fill(parameter.datapathWidth - 8, extendSign && maskUnitData(7)) ## maskUnitData(7, 0) - ) << vlDataOffset).asUInt(parameter.xLen - 1, 0) - - /** vd 的值有4种: - * 1. 用readIndex读出来的vs2的值 - * 1. 0 - * 1. slide1 时插进来的rs1 - * 1. 
extend 的值 - */ - val slidWriteData = Mux1H( - Seq((!(readOverlap || selectRS1 || extend)) || (gather && !compareResult), selectRS1, extend), - Seq(slidReadData, (rs1 << vlDataOffset).asUInt(parameter.xLen - 1, 0), extendData) - ) - maskUnitWriteVec(1).valid := writeState && slidActive - maskUnitWriteVec(1).bits.vd := vd + vlGrowth - maskUnitWriteVec(1).bits.offset := vlOffset - maskUnitWriteVec(1).bits.mask := vlMask - maskUnitWriteVec(1).bits.data := slidWriteData - maskUnitWriteVec(1).bits.last := lastElement - maskUnitWriteVec(1).bits.instructionIndex := control.record.instructionIndex - maskWriteLaneSelect(1) := UIntToOH(vlLane) - // slid 跳状态机 - when(slideState === idle) { - when((!slidUnitIdle) && slidUnitDataReady) { - when(skipRead) { - slideState := sWrite - }.otherwise { - slideState := sRead - } - } - } - when(readState) { - // 不需要valid,因为这个状态下一定是valid的 - when(readFireNextNext1) { - slideState := sWrite - } - } - when(writeState) { - when(maskUnitWriteReady || !slidActive) { - when(lastElement) { - slideState := idle - slidUnitIdle := true.B - when(gather || extend) { - synchronized := true.B - dataClear := true.B - maskUnitReadOnlyFinish := true.B - } - }.otherwise { - when(lastElementForData && (gather || extend)) { - synchronized := true.B - dataClear := true.B - slideState := idle - }.otherwise { - // todo: skip read - slideState := sRead - } - updateMaskIndex := true.B - } - } - } - - // compress & iota - val idle1 :: sReadMask :: sWrite1 :: Nil = Enum(3) - val compressState = RegInit(idle1) - val compressStateIdle = compressState === idle1 - val compressStateRead = compressState === sReadMask - val compressStateWrite = compressState === sWrite1 - - // compress 用vs1当mask,需要先读vs1 - val readCompressMaskNext = Pipe(maskUnitReadReady && compressStateRead, false.B, parameter.vrfReadLatency).valid - when(readCompressMaskNext) { - maskDataForCompress := readResultSelectResult - } - - // 处理 iota - val iotaDataOffset: UInt = 
elementIndexCount(log2Ceil(parameter.datapathWidth * parameter.laneNumber) - 1, 0) - val lastDataForIota: Bool = iotaDataOffset.andR - val iotaData = VecInit(data.map(_.bits)).asUInt(iotaDataOffset) - val iota = decodeResultReg(Decoder.iota) - - val maskUnitReadFire2: Bool = maskUnitReadVec(2).valid && maskUnitReadReady - val readFireNext2 = RegNext(maskUnitReadFire2) - val readFireNextNext2 = RegNext(readFireNext2) - val port2WaitForResult = readFireNextNext2 || readFireNext2 - - /** 计算需要读的mask的相关 elementIndexCount -> 11bit 只会访问单寄存器 elementIndexCount(4, 0)做为32bit内的offset elementIndexCount(7, - * 5)作为lane的选择 elementIndexCount(9, 8)作为offset - */ - // compress read - maskUnitReadVec(2).valid := compressStateRead && !port2WaitForResult - maskUnitReadVec(2).bits.vs := vs1 - maskUnitReadVec(2).bits.readSource := 0.U - maskUnitReadVec(2).bits.offset := elementIndexCount( - log2Ceil(parameter.datapathWidth) + log2Ceil(parameter.laneNumber) + - parameter.laneParam.vrfParam.vrfOffsetBits - 1, - log2Ceil(parameter.datapathWidth) + log2Ceil(parameter.laneNumber) - ) - maskReadLaneSelect(2) := UIntToOH( - elementIndexCount( - log2Ceil(parameter.datapathWidth) + ((log2Ceil(parameter.laneNumber) - 1).max(0)), - log2Ceil(parameter.datapathWidth) - ) - ) - // val lastElementForMask: Bool = elementIndexCount(4, 0).andR - val maskForCompress: Bool = maskDataForCompress(elementIndexCount(log2Ceil(parameter.datapathWidth) - 1, 0)) - - // compress vm=0 是保留的 - val skipWrite = !Mux(decodeResultReg(Decoder.compress), maskForCompress, elementActive) - val dataGroupTailForCompressUnit: Bool = Mux(iota, lastDataForIota, lastElementForData) - - // 计算compress write的位置信息 - val (compressMask, compressDataOffset, compressLane, compressOffset, compressGrowth, _) = - indexAnalysis(compressWriteCount) - val compressWriteData = (maskUnitData << compressDataOffset).asUInt - val iotaWriteData = (iotaCount << vlDataOffset).asUInt - // compress write - maskUnitWriteVec(2).valid := compressStateWrite && 
!skipWrite - maskUnitWriteVec(2).bits.vd := vd + Mux(iota, vlGrowth, compressGrowth) - maskUnitWriteVec(2).bits.offset := Mux(iota, vlOffset, compressOffset) - maskUnitWriteVec(2).bits.mask := Mux(iota, vlMask, compressMask) - maskUnitWriteVec(2).bits.data := Mux(iota, iotaWriteData, compressWriteData) - maskUnitWriteVec(2).bits.last := lastElement - maskUnitWriteVec(2).bits.instructionIndex := control.record.instructionIndex - maskWriteLaneSelect(2) := UIntToOH(Mux(iota, vlLane, compressLane)) - - // 跳状态机 - // compress每组数据先读mask - val firstState = Mux(iota, sWrite1, sReadMask) - when(compressStateIdle && (!iotaUnitIdle) && compressDataReady) { - compressState := firstState - } - - when(compressStateRead && readFireNextNext2) { - compressState := sWrite1 - } - - when(compressStateWrite) { - when(maskUnitWriteReady || skipWrite) { - when(!skipWrite) { - compressWriteCount := compressWriteCount + 1.U - iotaCount := iotaCount + iotaData - } - when(lastElement) { - compressState := idle - iotaUnitIdle := true.B - synchronized := true.B - dataClear := true.B - maskUnitReadOnlyFinish := true.B - }.otherwise { - when(lastElementForCompressMask) { - // update vs1 as mask for compress - compressState := sRead - } - when(dataGroupTailForCompressUnit) { - synchronized := true.B - dataClear := true.B - compressState := idle - } - updateMaskIndex := true.B - } - } - } - // for small vl & reduce - val accessByte = (csrRegForMaskUnit.vl << csrRegForMaskUnit.vSew).asUInt - // vl < row(vl) - val smallVL = accessByte < (parameter.datapathWidth * parameter.laneNumber / 8).U - val byteSizePerDataPathBits = log2Ceil(parameter.datapathWidth / 8) - val lastExecuteCounterForReduce: UInt = if (parameter.laneNumber > 1) { - accessByte( - byteSizePerDataPathBits + log2Ceil(parameter.laneNumber) - 1, - byteSizePerDataPathBits - ) - !accessByte(byteSizePerDataPathBits - 1, 0).orR - } else 0.U - val lastGroupDataWaitMaskForRed: UInt = scanRightOr(UIntToOH(lastExecuteCounterForReduce)) - // alu 
end - val maskOperation = - decodeResultReg(Decoder.maskLogic) || - decodeResultReg(Decoder.maskDestination) || - decodeResultReg(Decoder.ffo) - // How many data path(32 bit) will used by maskDestination instruction. - val maskDestinationByteSize: Bits = - csrRegForMaskUnit.vl(log2Ceil(parameter.dLen) - 1, 0) << csrRegForMaskUnit.vSew - val maskDestinationUseDataPathSize = - (maskDestinationByteSize >> 2).asUInt + maskDestinationByteSize(1, 0).orR - val lastGroupCountForThisGroup: UInt = maskDestinationUseDataPathSize(log2Ceil(parameter.laneNumber) - 1, 0) - val counterForMaskDestination: UInt = if (parameter.laneNumber > 1) { - (lastGroupCountForThisGroup - 1.U) | - Fill( - log2Ceil(parameter.laneNumber), - (maskDestinationUseDataPathSize >> log2Ceil(parameter.laneNumber)).asUInt.orR - ) - } else 0.U - - val waitSourceDataCounter = - Mux(decodeResultReg(Decoder.maskDestination), counterForMaskDestination, lastExecuteCounter) - val lastGroupDataWaitMask = scanRightOr(UIntToOH(waitSourceDataCounter)) - // todo: other ways - val lastOrderedGroup: Option[Bool] = orderedReduceGroupCount.map(count => - (count ## 0 - .U(log2Ceil(parameter.laneNumber).W) + -1.S(log2Ceil(parameter.laneNumber).W).asUInt) >= csrRegForMaskUnit.vl - ) - val misalignedOrdered: Bool = if (parameter.fpuEnable) { - lastOrderedGroup.get && csrRegForMaskUnit.vl(log2Ceil(parameter.laneNumber) - 1, 0).orR && decodeResultReg( - Decoder.float - ) - } else false.B - val dataMask = - Mux( - maskOperation && lastGroup, - lastGroupDataWaitMask, - Mux( - reduce && (smallVL || misalignedOrdered), - lastGroupDataWaitMaskForRed, - -1.S(parameter.laneNumber.W).asUInt - ) - ) - val dataReady = ((~dataMask).asUInt | VecInit(data.map(_.valid)).asUInt).andR || skipLaneData - when( - // data ready - dataReady && - // state check - !control.state.sMaskUnitExecution - ) { - // 读 - when(needWAR && !WARRedResult.valid) { - maskUnitReadVec.head.valid := true.B - } - // 可能有的计算 - val nextExecuteIndex: UInt = executeCounter 
+ 1.U - val isLastExecuteForGroup: Bool = executeCounter(log2Ceil(parameter.laneNumber) - 1, 0).andR - val lastExecuteForInstruction: Option[Bool] = orderedReduceGroupCount.map(count => - (count ## 0.U(log2Ceil(parameter.laneNumber).W) + nextExecuteIndex) === csrRegForMaskUnit.vl - ) - val readFinish = WARRedResult.valid || !needWAR - val readDataSign = - Mux1H(vSewOHForMask(2, 0), Seq(WARRedResult.bits(7), WARRedResult.bits(15), WARRedResult.bits(31))) - when(readFinish && !executeFinishReg) { - when(readMv) { - control.state.sMaskUnitExecution := true.B - // signExtend for vmv.x.s - dataResult.bits := Mux(vSewOHForMask(2), WARRedResult.bits(31, 16), Fill(16, readDataSign)) ## - Mux(vSewOHForMask(0), Fill(8, readDataSign), WARRedResult.bits(15, 8)) ## - WARRedResult.bits(7, 0) - - }.otherwise { - executeCounter := nextExecuteIndex - when(executeCounter =/= csrRegForMaskUnit.vl) { - dataResult.bits := aluOutPut - } - if (parameter.fpuEnable) { - when(!orderedReduceIdle.get) { - when(lastExecuteForInstruction.get) { - orderedReduceIdle.get := true.B - }.elsewhen(isLastExecuteForGroup) { - synchronized := true.B - executeCounter := 0.U - dataClear := true.B - orderedReduceGroupCount.foreach(d => d := d + 1.U) - } - } - } - } - } - // for vfredmax - val lastReduceCounter = - executeCounter === csrRegForMaskUnit.vl || executeCounter(log2Ceil(parameter.laneNumber)) - dontTouch(lastReduceCounter) - val executeFinish: Bool = - (lastReduceCounter || !(reduce || popCount) || orderedReduce) && maskUnitIdle - val schedulerWrite = decodeResultReg(Decoder.maskDestination) || (reduce && !popCount) || writeMv - val groupSync = decodeResultReg(Decoder.ffo) - // 写回 - when(readFinish && (executeFinish || writeMv || executeFinishReg)) { - maskUnitWriteVec.head.valid := schedulerWrite - executeFinishReg := true.B - when(maskUnitWriteReady || !schedulerWrite) { - WARRedResult.valid := false.B - writeBackCounter := writeBackCounter + schedulerWrite - when(lastExecuteForGroup || 
lastExecute || reduce || groupSync || writeMv || popCount) { - synchronized := true.B - dataClear := true.B - when(lastExecuteForGroup || groupSync) { - executeForLastLaneFire := true.B - groupCounter := groupCounter + 1.U - } - when(lastExecute || reduce || writeMv || popCount) { - control.state.sMaskUnitExecution := true.B - } - } - } - } - } } control } @@ -1484,10 +663,14 @@ class T1(val parameter: T1Parameter) val slotReady: Bool = Mux(specialInstruction, slots.map(_.state.idle).last, freeOR) val source1Select: UInt = - Mux(decodeResult(Decoder.gather), gatherData, Mux(decodeResult(Decoder.itype), immSignExtend, source1Extend)) + Mux( + decodeResult(Decoder.gather), + maskUnit.gatherData.bits, + Mux(decodeResult(Decoder.itype), immSignExtend, source1Extend) + ) // data eew for extend type - val extendDataEEW: Bool = (csrRegForMaskUnit.vSew >> decodeResult(Decoder.topUop)(1, 0))(0) + val extendDataEEW: Bool = (T1Issue.vsew(requestReg.bits.issue) - decodeResult(Decoder.topUop)(2, 1))(0) val gather16: Bool = decodeResult(Decoder.gather16) val vSewSelect: UInt = Mux( isLoadStoreType, @@ -1555,58 +738,52 @@ class T1(val parameter: T1Parameter) lane.csrInterface.vl := evlForLane lane.laneIndex := index.U - // - LSU request next offset of group - // - all lane are synchronized - // - the index type of instruction is finished. - lane.laneResponseFeedback.valid := lsu.lsuOffsetRequest || synchronized || completeIndexInstruction - // - the index type of instruction is finished. - // - for find first one. 
- lane.laneResponseFeedback.bits.complete := - completeIndexInstruction || - completedLeftOr(index) || - maskUnitReadOnlyFinish - // tell lane which - lane.laneResponseFeedback.bits.instructionIndex := slots.last.record.instructionIndex - // lsu 优先会有死锁: // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写 // vse32.v v1, (a0) -> 依赖上一条,但是会先发出read - // 读 lane - lane.vrfReadAddressChannel.valid := lsu.vrfReadDataPorts(index).valid || - (maskUnitRead.valid && maskUnitReadSelect(index)) + + // Mask priority will also be + // vse32.v v19, (a0) + // vfslide1down.vf v19, v10, x1 + val maskUnitFirst = RegInit(false.B) + val tryToRead = lsu.vrfReadDataPorts(index).valid || maskUnit.readChannel(index).valid + when(tryToRead && !lane.vrfReadAddressChannel.fire) { + maskUnitFirst := !maskUnitFirst + } + lane.vrfReadAddressChannel.valid := Mux( + maskUnitFirst, + maskUnit.readChannel(index).valid, + lsu.vrfReadDataPorts(index).valid + ) lane.vrfReadAddressChannel.bits := - Mux(maskUnitRead.valid, maskUnitRead.bits, lsu.vrfReadDataPorts(index).bits) - lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnitRead.valid - readSelectMaskUnit(index) := - lane.vrfReadAddressChannel.ready && maskUnitReadSelect(index) - laneReadResult(index) := lane.vrfReadDataChannel + Mux(maskUnitFirst, maskUnit.readChannel(index).bits, lsu.vrfReadDataPorts(index).bits) + lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnitFirst + maskUnit.readChannel(index).ready := lane.vrfReadAddressChannel.ready && maskUnitFirst + maskUnit.readResult(index) := lane.vrfReadDataChannel lsu.vrfReadResults(index) := lane.vrfReadDataChannel - // 写lane - lane.vrfWriteChannel.valid := vrfWrite(index).valid || (maskUnitWrite.valid && maskUnitWriteSelect(index)) - lane.vrfWriteChannel.bits := - Mux(vrfWrite(index).valid, vrfWrite(index).bits, maskUnitWrite.bits) - vrfWrite(index).ready := lane.vrfWriteChannel.ready - writeSelectMaskUnit(index) := - lane.vrfWriteChannel.ready && 
!vrfWrite(index).valid && maskUnitWriteSelect(index) + val maskTryToWrite = maskUnit.exeResp(index) + // lsu & mask unit write lane + // Mask write has absolute priority because it has a token + lane.vrfWriteChannel.valid := vrfWrite(index).valid || maskTryToWrite.valid + lane.vrfWriteChannel.bits := Mux(maskTryToWrite.valid, maskTryToWrite.bits, vrfWrite(index).bits) + vrfWrite(index).ready := lane.vrfWriteChannel.ready && !maskTryToWrite.valid + lane.writeFromMask := maskTryToWrite.valid - lsu.offsetReadResult(index).valid := lane.laneResponse.valid && lane.laneResponse.bits.toLSU - lsu.offsetReadResult(index).bits := lane.laneResponse.bits.data - lsu.offsetReadIndex(index) := lane.laneResponse.bits.instructionIndex + lsu.offsetReadResult(index).valid := lane.maskUnitRequest.valid && lane.maskRequestToLSU + lsu.offsetReadResult(index).bits := lane.maskUnitRequest.bits.source2 + lsu.offsetReadIndex(index) := lane.maskUnitRequest.bits.index instructionFinished(index).zip(slots.map(_.record.instructionIndex)).foreach { case (d, f) => d := (UIntToOH(f(parameter.instructionIndexBits - 2, 0)) & lane.instructionFinished).orR } - vxsatReportVec(index) := lane.vxsatReport - val v0ForThisLane: Seq[UInt] = regroupV0.map(rv => cutUInt(rv, parameter.vLen / parameter.laneNumber)(index)) - val v0SelectBySew = Mux1H(UIntToOH(lane.maskSelectSew)(2, 0), v0ForThisLane) - lane.maskInput := cutUInt(v0SelectBySew, parameter.datapathWidth)(lane.maskSelect) - lane.lsuLastReport := lsu.lastReport | - Mux( - maskUnitFlushVrf, - indexToOH(slots.last.record.instructionIndex, parameter.chainingSize), - 0.U - ) + vxsatReportVec(index) := lane.vxsatReport + lane.maskInput := maskUnit.laneMaskInput(index) + maskUnit.laneMaskSelect(index) := lane.maskSelect + maskUnit.laneMaskSewSelect(index) := lane.maskSelectSew + maskUnit.v0UpdateVec(index) <> lane.v0Update + + lane.lsuLastReport := lsu.lastReport | maskUnit.lastReport lane.lsuMaskGroupChange := lsu.lsuMaskGroupChange 
lane.loadDataInLSUWriteQueue := lsu.dataInWriteQueue(index) @@ -1616,18 +793,7 @@ class T1(val parameter: T1Parameter) (requestReg.bits.writeByte >> rowWith).asUInt + (requestReg.bits.writeByte(rowWith - 1, 0) > ((parameter.datapathWidth / 8) * index).U) - // 处理lane的mask类型请求 - laneSynchronize(index) := lane.laneResponse.valid && !lane.laneResponse.bits.toLSU - when(laneSynchronize(index)) { - data(index).valid := true.B - data(index).bits := lane.laneResponse.bits.data - completedVec(index) := lane.laneResponse.bits.ffoSuccess - flotReduceValid(index).foreach(d => d := lane.laneResponse.bits.fpReduceValid.get) - } - // token manager - tokenManager.writeV0(index).valid := lane.vrfWriteChannel.fire && (lane.vrfWriteChannel.bits.vd === 0.U) - tokenManager.writeV0(index).bits := lane.vrfWriteChannel.bits.instructionIndex tokenManager.instructionFinish(index) := lane.instructionFinished lane @@ -1651,13 +817,39 @@ class T1(val parameter: T1Parameter) lsu.request.bits.instructionInformation.isStore := isStoreType lsu.request.bits.instructionInformation.maskedLoadStore := maskType - lsu.maskInput.zip(lsu.maskSelect).foreach { case (data, index) => - data := cutUInt(v0.asUInt, parameter.maskGroupWidth)(index) + maskUnit.lsuMaskSelect := lsu.maskSelect + lsu.maskInput := maskUnit.lsuMaskInput + lsu.csrInterface := requestRegCSR + lsu.csrInterface.vl := evlForLsu + lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR + lsu.vrfReadyToStore := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR + + // connect mask unit + maskUnit.instReq.valid := requestRegDequeue.fire && requestReg.bits.decodeResult(Decoder.maskUnit) + maskUnit.instReq.bits.instructionIndex := requestReg.bits.instructionIndex + maskUnit.instReq.bits.decodeResult := decodeResult + maskUnit.instReq.bits.readFromScala := Mux(decodeResult(Decoder.itype), imm, requestRegDequeue.bits.rs1Data) + maskUnit.instReq.bits.sew := T1Issue.vsew(requestReg.bits.issue) + maskUnit.instReq.bits.maskType 
:= maskType + maskUnit.instReq.bits.vxrm := requestReg.bits.issue.vcsr(2, 1) + maskUnit.instReq.bits.vlmul := requestReg.bits.issue.vtype(2, 0) + maskUnit.instReq.bits.vs1 := requestRegDequeue.bits.instruction(19, 15) + maskUnit.instReq.bits.vs2 := requestRegDequeue.bits.instruction(24, 20) + maskUnit.instReq.bits.vd := requestRegDequeue.bits.instruction(11, 7) + maskUnit.instReq.bits.vl := requestReg.bits.issue.vl + // gather read + maskUnit.gatherRead := gatherNeedRead + maskUnit.gatherData.ready := requestRegDequeue.fire + + maskUnit.exeReq.zip(laneVec).foreach { case (maskInput, lane) => + maskInput.valid := lane.maskUnitRequest.valid && !lane.maskRequestToLSU + maskInput.bits := lane.maskUnitRequest.bits + } + + maskUnit.tokenIO.zip(laneVec).zipWithIndex.foreach { case ((token, lane), index) => + token.maskResponseRelease := lane.tokenIO.maskResponseRelease + lane.tokenIO.maskRequestRelease := token.maskRequestRelease || lsu.tokenIO.offsetGroupRelease(index) } - lsu.csrInterface := requestRegCSR - lsu.csrInterface.vl := evlForLsu - lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR - lsu.vrfReadyToStore := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR // 连lane的环 parameter.crossLaneConnectCycles.zipWithIndex.foreach { case (cycles, index) => @@ -1723,16 +915,30 @@ class T1(val parameter: T1Parameter) // - for slide instruction, it is unordered, and may have RAW hazard, // we detect the hazard and decide should we issue this slide or // issue the instruction after the slide which already in the slot. 
- requestRegDequeue.ready := executionReady && slotReady && (!gatherNeedRead || gatherReadFinish) && - instructionRAWReady && instructionIndexFree && vrfAllocate + requestRegDequeue.ready := executionReady && slotReady && (!gatherNeedRead || maskUnit.gatherData.valid) && + tokenManager.issueAllow && instructionIndexFree && vrfAllocate instructionToSlotOH := Mux(requestRegDequeue.fire, slotToEnqueue, 0.U) + tokenManager.instructionIssue.valid := requestRegDequeue.fire + tokenManager.instructionIssue.bits.instructionIndex := requestReg.bits.instructionIndex + tokenManager.instructionIssue.bits.writeV0 := + (!requestReg.bits.decodeResult(Decoder.targetRd) && !isStoreType) && requestReg.bits.vdIsV0 + tokenManager.instructionIssue.bits.useV0AsMask := maskType + tokenManager.instructionIssue.bits.isLoadStore := !requestRegDequeue.bits.instruction(6) + tokenManager.instructionIssue.bits.toLane := !noOffsetReadLoadStore && !maskUnitInstruction + tokenManager.instructionIssue.bits.toMask := requestReg.bits.decodeResult(Decoder.maskUnit) + tokenManager.lsuWriteV0.zip(lsu.vrfWritePort).foreach { case (token, write) => + token.valid := write.fire && write.bits.vd === 0.U && write.bits.mask.orR + token.bits := write.bits.instructionIndex + } + tokenManager.maskUnitFree := slots.last.state.idle + // instruction commit { val slotCommit: Vec[Bool] = VecInit(slots.map { inst => // mask unit finish - inst.state.sMaskUnitExecution && + inst.state.wMaskUnitLast && // lane|lsu finish inst.state.wLast && // mask unit write finish @@ -1743,7 +949,7 @@ class T1(val parameter: T1Parameter) inst.record.instructionIndex === responseCounter }) retire := slotCommit.asUInt.orR - io.retire.rd.bits.rdData := Mux(ffoType, ffoIndexReg.bits, dataResult.bits) + io.retire.rd.bits.rdData := maskUnit.writeRDData // TODO: csr retire. 
io.retire.csr.bits.vxsat := (slotCommit.asUInt & VecInit(slots.map(_.vxsat)).asUInt).orR io.retire.csr.bits.fflag := DontCare @@ -1752,53 +958,35 @@ class T1(val parameter: T1Parameter) lastSlotCommit := slotCommit.last } - // write v0(mask) - v0.zipWithIndex.foreach { case (data, index) => - // 属于哪个lane - val laneIndex: Int = index % parameter.laneNumber - // 取出写的端口 - val v0Write = laneVec(laneIndex).v0Update - // offset - val offset: Int = index / parameter.laneNumber - val maskExt = FillInterleaved(8, v0Write.bits.mask) - when(v0Write.valid && v0Write.bits.offset === offset.U) { - data := (data & (~maskExt).asUInt) | (maskExt & v0Write.bits.data) - } - } - when(dataClear) { - data.foreach(_.valid := false.B) - } - // don't care有可能会导致先读后写失败 - maskUnitReadVec.foreach(_.bits.instructionIndex := slots.last.record.instructionIndex) - layer.block(layers.Verification) { /** Probes */ val probeWire = Wire(new T1Probe(parameter)) define(io.t1Probe, ProbeValue(probeWire)) - probeWire.instructionCounter := instructionCounter - probeWire.instructionIssue := requestRegDequeue.fire - probeWire.issueTag := requestReg.bits.instructionIndex - probeWire.retireValid := retire - probeWire.requestReg := requestReg - probeWire.requestRegReady := requestRegDequeue.ready + probeWire.instructionCounter := instructionCounter + probeWire.instructionIssue := requestRegDequeue.fire + probeWire.issueTag := requestReg.bits.instructionIndex + probeWire.retireValid := retire + probeWire.requestReg := requestReg + probeWire.requestRegReady := requestRegDequeue.ready // maskUnitWrite maskUnitWriteReady - probeWire.writeQueueEnq.valid := maskUnitWrite.valid && maskUnitWriteReady - probeWire.writeQueueEnq.bits := maskUnitWrite.bits.instructionIndex - probeWire.writeQueueEnqMask := maskUnitWrite.bits.mask - probeWire.instructionValid := maskAnd( - !slots.last.state.sMaskUnitExecution && !slots.last.state.idle, + probeWire.writeQueueEnqVec.zip(maskUnit.exeResp).foreach { case (probe, write) => + 
probe.valid := write.valid && write.bits.mask.orR + probe.bits := write.bits.instructionIndex + } + probeWire.instructionValid := maskAnd( + !slots.last.state.wMaskUnitLast && !slots.last.state.idle, indexToOH(slots.last.record.instructionIndex, parameter.chainingSize * 2) ).asUInt - probeWire.responseCounter := responseCounter + probeWire.responseCounter := responseCounter probeWire.laneProbes.zip(laneVec).foreach { case (p, l) => p := probe.read(l.laneProbe) } - probeWire.lsuProbe := probe.read(lsu.lsuProbe) - probeWire.issue.valid := io.issue.fire - probeWire.issue.bits := instructionCounter - probeWire.retire.valid := io.retire.rd.valid - probeWire.retire.bits := io.retire.rd.bits.rdData - probeWire.idle := slots.map(_.state.idle).reduce(_ && _) + probeWire.lsuProbe := probe.read(lsu.lsuProbe) + probeWire.issue.valid := io.issue.fire + probeWire.issue.bits := instructionCounter + probeWire.retire.valid := io.retire.rd.valid + probeWire.retire.bits := io.retire.rd.bits.rdData + probeWire.idle := slots.map(_.state.idle).reduce(_ && _) } // coverage diff --git a/t1/src/VectorFunctionUnit.scala b/t1/src/VectorFunctionUnit.scala index 9450a11d2..4c16022b0 100644 --- a/t1/src/VectorFunctionUnit.scala +++ b/t1/src/VectorFunctionUnit.scala @@ -132,6 +132,13 @@ object VFUInstantiateParameter { case (false, true) => VFUInstantiateParameter.zvbb(vLen, dLen) case (true, true) => VFUInstantiateParameter.zvbbFP(vLen, dLen) } + case "huge" => + (fp, zvbb) match { + case (false, false) => VFUInstantiateParameter.smallInt(vLen, dLen) + case (true, false) => VFUInstantiateParameter.hugeFP(vLen, dLen) + case (false, true) => VFUInstantiateParameter.zvbb(vLen, dLen) + case (true, true) => VFUInstantiateParameter.zvbbFP(vLen, dLen) + } } // instantiate each module and connect to all scoreboards diff --git a/t1/src/decoder/Decoder.scala b/t1/src/decoder/Decoder.scala index 4072ae589..dd27a1657 100644 --- a/t1/src/decoder/Decoder.scala +++ b/t1/src/decoder/Decoder.scala @@ -33,7 
+33,7 @@ trait T1UopField extends T1DecodeFiled[UInt] with FieldName { } trait T1TopUopField extends T1DecodeFiled[UInt] with FieldName { - def chiselType: UInt = UInt(3.W) + def chiselType: UInt = UInt(5.W) } trait T1fpExecutionTypeUopField extends T1DecodeFiled[UInt] with FieldName { @@ -227,14 +227,39 @@ object Decoder { object topUop extends T1TopUopField { override def genTable(pattern: T1DecodePattern): BitPat = pattern.topUop.value match { - case _: TopT0.type => BitPat("b000") - case _: TopT1.type => BitPat("b001") - case _: TopT2.type => BitPat("b010") - case _: TopT3.type => BitPat("b011") - case _: TopT5.type => BitPat("b101") - case _: TopT6.type => BitPat("b110") - case _: TopT7.type => BitPat("b111") - case _ => BitPat.dontCare(3) + case _: TopT0.type => BitPat("b00000") + case _: TopT1.type => BitPat("b00001") + case _: TopT2.type => BitPat("b00010") + case _: TopT3.type => BitPat("b00011") + case _: TopT4.type => BitPat("b00100") + case _: TopT5.type => BitPat("b00101") + case _: TopT6.type => BitPat("b00110") + case _: TopT7.type => BitPat("b00111") + case _: TopT8.type => BitPat("b01000") + case _: TopT9.type => BitPat("b01001") + case _: TopT10.type => BitPat("b01010") + case _: TopT11.type => BitPat("b01011") + case _: TopT12.type => BitPat("b01100") + case _: TopT13.type => BitPat("b01101") + case _: TopT14.type => BitPat("b01110") + case _: TopT15.type => BitPat("b01111") + case _: TopT16.type => BitPat("b10000") + case _: TopT17.type => BitPat("b10001") + case _: TopT18.type => BitPat("b10010") + case _: TopT19.type => BitPat("b10011") + case _: TopT20.type => BitPat("b10100") + case _: TopT21.type => BitPat("b10101") + case _: TopT22.type => BitPat("b10110") + case _: TopT23.type => BitPat("b10111") + case _: TopT24.type => BitPat("b11000") + case _: TopT25.type => BitPat("b11001") + case _: TopT26.type => BitPat("b11010") + case _: TopT27.type => BitPat("b11011") + case _: TopT28.type => BitPat("b11100") + case _: TopT29.type => 
BitPat("b11101") + case _: TopT30.type => BitPat("b11110") + case _: TopT31.type => BitPat("b11111") + case _ => BitPat.dontCare(5) } } diff --git a/t1/src/decoder/attribute/isSwrite.scala b/t1/src/decoder/attribute/isSwrite.scala index f14bad4c0..921f29dc5 100644 --- a/t1/src/decoder/attribute/isSwrite.scala +++ b/t1/src/decoder/attribute/isSwrite.scala @@ -17,17 +17,9 @@ object isSwrite { def y(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched = Seq( - "vcompress.vm", "vcpop.m", "vfirst.m", "vfmv.f.s", - "vfredmax.vs", - "vfredmin.vs", - "vfredosum.vs", - "vfredusum.vs", - "vfwredosum.vs", - "vfwredusum.vs", - "viota.m", "vl1re16.v", "vl1re32.v", "vl1re64.v", @@ -85,57 +77,7 @@ object isSwrite { "vluxei512.v", "vluxei64.v", "vluxei8.v", - "vmadc.vi", - "vmadc.vim", - "vmadc.vv", - "vmadc.vvm", - "vmadc.vx", - "vmadc.vxm", - "vmfeq.vf", - "vmfeq.vv", - "vmfge.vf", - "vmfgt.vf", - "vmfle.vf", - "vmfle.vv", - "vmflt.vf", - "vmflt.vv", - "vmfne.vf", - "vmfne.vv", - "vmsbc.vv", - "vmsbc.vvm", - "vmsbc.vx", - "vmsbc.vxm", - "vmseq.vi", - "vmseq.vv", - "vmseq.vx", - "vmsgt.vi", - "vmsgt.vx", - "vmsgtu.vi", - "vmsgtu.vx", - "vmsle.vi", - "vmsle.vv", - "vmsle.vx", - "vmsleu.vi", - "vmsleu.vv", - "vmsleu.vx", - "vmslt.vv", - "vmslt.vx", - "vmsltu.vv", - "vmsltu.vx", - "vmsne.vi", - "vmsne.vv", - "vmsne.vx", "vmv.x.s", - "vredand.vs", - "vredmax.vs", - "vredmaxu.vs", - "vredmin.vs", - "vredminu.vs", - "vredor.vs", - "vredsum.vs", - "vredxor.vs", - "vrgather.vv", - "vrgatherei16.vv", "vs1r.v", "vs2r.v", "vs4r.v", @@ -148,9 +90,6 @@ object isSwrite { "vse512.v", "vse64.v", "vse8.v", - "vsext.vf2", - "vsext.vf4", - "vsext.vf8", "vsm.v", "vsoxei1024.v", "vsoxei128.v", @@ -207,9 +146,6 @@ object isSwrite { "vwsubu.vx", "vwsubu.wv", "vwsubu.wx", - "vzext.vf2", - "vzext.vf4", - "vzext.vf8", // rv_zvbb "vwsll.vv", "vwsll.vx", diff --git a/t1/src/decoder/attribute/topUop.scala b/t1/src/decoder/attribute/topUop.scala index ae8beeca1..c9447af9e 100644 --- 
a/t1/src/decoder/attribute/topUop.scala +++ b/t1/src/decoder/attribute/topUop.scala @@ -10,77 +10,286 @@ object TopT0 extends TopUopType object TopT1 extends TopUopType object TopT2 extends TopUopType object TopT3 extends TopUopType +object TopT4 extends TopUopType object TopT5 extends TopUopType object TopT6 extends TopUopType object TopT7 extends TopUopType +object TopT8 extends TopUopType +object TopT9 extends TopUopType +object TopT10 extends TopUopType +object TopT11 extends TopUopType +object TopT12 extends TopUopType +object TopT13 extends TopUopType +object TopT14 extends TopUopType +object TopT15 extends TopUopType +object TopT16 extends TopUopType +object TopT17 extends TopUopType +object TopT18 extends TopUopType +object TopT19 extends TopUopType +object TopT20 extends TopUopType +object TopT21 extends TopUopType +object TopT22 extends TopUopType +object TopT23 extends TopUopType +object TopT24 extends TopUopType +object TopT25 extends TopUopType +object TopT26 extends TopUopType +object TopT27 extends TopUopType +object TopT28 extends TopUopType +object TopT29 extends TopUopType +object TopT30 extends TopUopType +object TopT31 extends TopUopType object TopUop { def apply(t1DecodePattern: T1DecodePattern): TopUop = { Seq( - t0 _ -> TopT0, - t1 _ -> TopT1, - t2 _ -> TopT2, - t3 _ -> TopT3, - t5 _ -> TopT5, - t6 _ -> TopT6, - t7 _ -> TopT7 + t0 _ -> TopT0, + t1 _ -> TopT1, + t2 _ -> TopT2, + t3 _ -> TopT3, + t4 _ -> TopT4, + t5 _ -> TopT5, + t6 _ -> TopT6, + t7 _ -> TopT7, + t8 _ -> TopT8, + t9 _ -> TopT9, + t10 _ -> TopT10, + t11 _ -> TopT11, + t12 _ -> TopT12, + t13 _ -> TopT13, + t14 _ -> TopT14, + t15 _ -> TopT15, + t16 _ -> TopT16, + t17 _ -> TopT17, + t18 _ -> TopT18, + t19 _ -> TopT19, + t20 _ -> TopT20, + t21 _ -> TopT21, + t22 _ -> TopT22, + t23 _ -> TopT23, + t24 _ -> TopT24, + t25 _ -> TopT25, + t26 _ -> TopT26, + t27 _ -> TopT27, + t28 _ -> TopT28, + t29 _ -> TopT29, + t30 _ -> TopT30, + t31 _ -> TopT31 ).collectFirst { case (fn, tpe) if 
fn(t1DecodePattern) => TopUop(tpe) }.getOrElse(TopUop(TopT0)) } def t0(t1DecodePattern: T1DecodePattern): Boolean = { - val allMatched = t1DecodePattern.param.allInstructions.filter(i => - !(t1(t1DecodePattern) - || t2(t1DecodePattern) - || t3(t1DecodePattern) - || t5(t1DecodePattern) - || t6(t1DecodePattern) - || t7(t1DecodePattern)) + val allMatched: Seq[String] = Seq( + "vslidedown.vi", + "vslidedown.vx" ) - allMatched.contains(t1DecodePattern.instruction) + allMatched.contains(t1DecodePattern.instruction.name) } def t1(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vfslide1down.vf", - "vslide1down.vx", - "vzext.vf2" + "vslideup.vi", + "vslideup.vx" ) allMatched.contains(t1DecodePattern.instruction.name) } def t2(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vslideup.vi", - "vslideup.vx", - "vzext.vf4" + "vslide1down.vx", + "vfslide1down.vf" ) allMatched.contains(t1DecodePattern.instruction.name) } def t3(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vfslide1up.vf", "vslide1up.vx", - "vzext.vf8" + "vfslide1up.vf" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t4(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vrgather.vv" ) allMatched.contains(t1DecodePattern.instruction.name) } def t5(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vsext.vf2" + "vrgatherei16.vv" ) allMatched.contains(t1DecodePattern.instruction.name) } def t6(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def t7(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def t8(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("viota.m") + 
allMatched.contains(t1DecodePattern.instruction.name) + } + def t9(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vcompress.vm") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t10(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vsext.vf4" + "vfmv.s.f", + "vmv.s.x" ) allMatched.contains(t1DecodePattern.instruction.name) } - def t7(t1DecodePattern: T1DecodePattern): Boolean = { + def t11(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vfmv.f.s", + "vmv.x.s" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t12(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def t13(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def t14(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vmsbf.m", + "vmsif.m", + "vmsof.m" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t15(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vfirst.m") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t16(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vcpop.m", + "vredmax.vs", + "vredmaxu.vs", + "vredmin.vs", + "vredminu.vs", + "vredsum.vs" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t17(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vwredsum.vs", + "vwredsumu.vs" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t18(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vredand.vs", + "vredor.vs", + "vredxor.vs" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def 
t19(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + "vfredmax.vs", + "vfredmin.vs" + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t20(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vfredusum.vs") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t21(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vfredosum.vs") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t22(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vfwredusum.vs") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t23(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vfwredosum.vs") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t24(t1DecodePattern: T1DecodePattern): Boolean = { val allMatched: Seq[String] = Seq( - "vsext.vf8" + "vmadc.vi", + "vmadc.vim", + "vmadc.vv", + "vmadc.vvm", + "vmadc.vx", + "vmadc.vxm", + "vmfeq.vf", + "vmfeq.vv", + "vmfge.vf", + "vmfgt.vf", + "vmfle.vf", + "vmfle.vv", + "vmflt.vf", + "vmflt.vv", + "vmfne.vf", + "vmfne.vv", + "vmsbc.vv", + "vmsbc.vvm", + "vmsbc.vx", + "vmsbc.vxm", + "vmseq.vi", + "vmseq.vv", + "vmseq.vx", + "vmsgt.vi", + "vmsgt.vx", + "vmsgtu.vi", + "vmsgtu.vx", + "vmsle.vi", + "vmsle.vv", + "vmsle.vx", + "vmsleu.vi", + "vmsleu.vv", + "vmsleu.vx", + "vmslt.vv", + "vmslt.vx", + "vmsltu.vv", + "vmsltu.vx", + "vmsne.vi", + "vmsne.vv", + "vmsne.vx" ) allMatched.contains(t1DecodePattern.instruction.name) } + def t25(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def t26(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vzext.vf2") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t27(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: 
Seq[String] = Seq("vsext.vf2") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t28(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vzext.vf4") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t29(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vsext.vf4") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t30(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vzext.vf8") + allMatched.contains(t1DecodePattern.instruction.name) + } + def t31(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq("vsext.vf8") + allMatched.contains(t1DecodePattern.instruction.name) + } } case class TopUop(value: TopUopType) extends UopDecodeAttribute[TopUopType] { diff --git a/t1/src/laneStage/LaneExecutionBridge.scala b/t1/src/laneStage/LaneExecutionBridge.scala index e1456f7db..e3356dfa9 100644 --- a/t1/src/laneStage/LaneExecutionBridge.scala +++ b/t1/src/laneStage/LaneExecutionBridge.scala @@ -75,11 +75,6 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd @public val dataResponse: ValidIO[VFUResponseToSlot] = IO(Flipped(Valid(new VFUResponseToSlot(parameter)))) - @public - val ffoByOtherLanes: Bool = IO(Input(Bool())) - @public - val selfCompleted: Bool = IO(Input(Bool())) - @public val executeDecode: DecodeBundle = IO(Output(Decoder.bundle(parameter.decoderParam))) @public @@ -318,7 +313,7 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd vfuRequest.bits.popInit := reduceResult.getOrElse(0.U) vfuRequest.bits.groupIndex := executionRecord.groupCounter vfuRequest.bits.laneIndex := executionRecord.laneIndex - vfuRequest.bits.complete := ffoByOtherLanes || selfCompleted + vfuRequest.bits.complete := false.B vfuRequest.bits.maskType := executionRecord.maskType vfuRequest.bits.narrow := narrowInRecord vfuRequest.bits.unitSelet.foreach(_ := 
executionRecord.decodeResult(Decoder.fpExecutionType)) @@ -411,7 +406,7 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd maskResult(1, 0) << (recordQueue.deq.bits.groupCounter(3, 0) ## false.B), // 1 bit per data group, it will had 32 data groups -> executeIndex1H << 1 * groupCounter(4, 0) - maskResult(0) << recordQueue.deq.bits.groupCounter(4, 0) + maskResult(0) << recordQueue.deq.bits.groupCounter(4.min(parameter.groupNumberBits - 1), 0) ) ).asUInt diff --git a/t1/src/laneStage/LaneStage.scala b/t1/src/laneStage/LaneStage.scala index 395e96c16..05cc18816 100644 --- a/t1/src/laneStage/LaneStage.scala +++ b/t1/src/laneStage/LaneStage.scala @@ -25,7 +25,6 @@ class LaneState(parameter: LaneParameter) extends Bundle { val maskType: Bool = Bool() val maskNotMaskedElement: Bool = Bool() val skipEnable: Bool = Bool() - val ffoByOtherLanes: Bool = Bool() /** vs1 or imm */ val vs1: UInt = UInt(5.W) diff --git a/t1/src/laneStage/LaneStage0.scala b/t1/src/laneStage/LaneStage0.scala index acbfe6aef..0aefdb69a 100644 --- a/t1/src/laneStage/LaneStage0.scala +++ b/t1/src/laneStage/LaneStage0.scala @@ -31,7 +31,6 @@ class LaneStage0Enqueue(parameter: LaneParameter) extends Bundle { // vm = 0 val maskType: Bool = Bool() val maskNotMaskedElement: Bool = Bool() - val ffoByOtherLanes: Bool = Bool() /** vs1 or imm */ val vs1: UInt = UInt(5.W) diff --git a/t1/src/laneStage/LaneStage1.scala b/t1/src/laneStage/LaneStage1.scala index 5b5809080..ce1836583 100644 --- a/t1/src/laneStage/LaneStage1.scala +++ b/t1/src/laneStage/LaneStage1.scala @@ -212,27 +212,35 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { // cross read enqueue queueBeforeCheckLSB.foreach { q => - q.enq.bits.vs := Mux( + val baseVs = Mux( enqueue.bits.decodeResult(Decoder.vwmacc), // cross read vd for vwmacc, since it need dual [[dataPathWidth]], use vs2 port to read LSB part of it. 
enqueue.bits.vd, // read vs2 for other instruction enqueue.bits.vs2 - ) + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + ) + val baseIndex = baseVs ## 0.U(parameter.vrfOffsetBits.W) + val indexGrowth: UInt = changeUIntSize(groupCounter ## false.B, groupCounter.getWidth) + val finalIndex: UInt = baseIndex + indexGrowth + q.enq.bits.vs := finalIndex >> parameter.vrfOffsetBits q.enq.bits.readSource := Mux(enqueue.bits.decodeResult(Decoder.vwmacc), 2.U, 1.U) - q.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## false.B + q.enq.bits.offset := finalIndex } queueBeforeCheckMSB.foreach { q => - q.enq.bits.vs := Mux( + val baseVs = Mux( enqueue.bits.decodeResult(Decoder.vwmacc), // cross read vd for vwmacc enqueue.bits.vd, // cross lane access use vs2 enqueue.bits.vs2 - ) + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + ) + val baseIndex = baseVs ## 0.U(parameter.vrfOffsetBits.W) + val indexGrowth: UInt = changeUIntSize(groupCounter ## true.B, groupCounter.getWidth) + val finalIndex: UInt = baseIndex + indexGrowth + q.enq.bits.vs := finalIndex >> parameter.vrfOffsetBits q.enq.bits.readSource := Mux(enqueue.bits.decodeResult(Decoder.vwmacc), 2.U, 1.U) - q.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + q.enq.bits.offset := finalIndex } // read pipe diff --git a/t1/src/laneStage/LaneStage3.scala b/t1/src/laneStage/LaneStage3.scala index 46427ade9..401eeb5da 100644 --- a/t1/src/laneStage/LaneStage3.scala +++ b/t1/src/laneStage/LaneStage3.scala @@ -47,18 +47,11 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val pipeEnqueue: Option[LaneStage3Enqueue] = Option.when(isLastSlot)(RegInit(0.U.asTypeOf(enqueue.bits))) - /** response to [[T1.lsu]] or mask unit in [[T1]] */ @public - val laneResponse: Option[ValidIO[LaneResponse]] = Option.when(isLastSlot)(IO(Valid(new LaneResponse(parameter)))) - @public - val stageValid: Bool = IO(Output(Bool())) + 
val stageValid: Bool = IO(Output(Bool())) - /** feedback from [[T1]] to [[Lane]] for [[laneResponse]] */ - @public - val laneResponseFeedback: Option[ValidIO[LaneResponseFeedback]] = - Option.when(isLastSlot)(IO(Flipped(Valid(new LaneResponseFeedback(parameter))))) @public - val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = + val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = Option.when(isLastSlot)(IO(Vec(2, Decoupled(new WriteBusData(parameter))))) val stageValidReg: Option[Bool] = Option.when(isLastSlot)(RegInit(false.B)) @@ -69,28 +62,17 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { /** schedule cross lane write MSB */ val sCrossWriteMSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - // state for response to scheduler - /** schedule send [[LaneResponse]] to scheduler */ - val sSendResponse: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - - /** wait scheduler send [[LaneResponseFeedback]] */ - val wResponseFeedback: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - // update register when(enqueue.fire) { pipeEnqueue.foreach(_ := enqueue.bits) (sCrossWriteLSB ++ sCrossWriteMSB).foreach(_ := !enqueue.bits.decodeResult(Decoder.crossWrite)) - (sSendResponse ++ wResponseFeedback).foreach( - _ := enqueue.bits.decodeResult(Decoder.scheduler) || enqueue.bits.sSendResponse - ) } // Used to cut off back pressure forward - val vrfWriteQueue: QueueIO[VRFWriteRequest] = - Queue.io(vrfWriteBundle, entries = 4, pipe = false, flow = false) + val vrfWriteQueue: QueueIO[VRFWriteRequest] = Queue.io(vrfWriteBundle, 4) // The load of the pointer is a bit large, copy one - val vrfPtrReplica: QueueIO[UInt] = - Queue.io(UInt(parameter.vrfParam.vrfOffsetBits.W), entries = 4, pipe = false, flow = false) + val offsetBit: Int = 1.max(parameter.vrfParam.vrfOffsetBits) + val vrfPtrReplica: QueueIO[UInt] = Queue.io(UInt(offsetBit.W), 4) vrfPtrReplica.enq.valid := vrfWriteQueue.enq.valid 
vrfPtrReplica.enq.bits := vrfWriteQueue.enq.bits.offset vrfPtrReplica.deq.ready := vrfWriteQueue.deq.ready @@ -112,43 +94,17 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { sendState(index) := true.B } } - // scheduler synchronization - val schedulerFinish: Bool = (sSendResponse ++ wResponseFeedback).reduce(_ && _) val dataSelect: Option[UInt] = Option.when(isLastSlot) { Mux( - pipeEnqueue.get.decodeResult(Decoder.nr) || - (enqueue.bits.ffoByOtherLanes && pipeEnqueue.get.decodeResult(Decoder.ffo)) || - pipeEnqueue.get.decodeResult(Decoder.dontNeedExecuteInLane), + pipeEnqueue.get.decodeResult(Decoder.nr) || pipeEnqueue.get.ffoByOtherLanes, pipeEnqueue.get.pipeData, pipeEnqueue.get.data ) } - // mask request - laneResponse.head.valid := stageValidReg.get && !sSendResponse.get - laneResponse.head.bits.data := Mux( - pipeEnqueue.get.decodeResult(Decoder.ffo), - pipeEnqueue.get.ffoIndex, - dataSelect.get - ) - laneResponse.head.bits.toLSU := pipeEnqueue.get.loadStore - laneResponse.head.bits.instructionIndex := pipeEnqueue.get.instructionIndex - laneResponse.head.bits.ffoSuccess := pipeEnqueue.get.ffoSuccess - laneResponse.head.bits.fpReduceValid.zip(pipeEnqueue.get.fpReduceValid).foreach { case (s, f) => s := f } - - sSendResponse.foreach(state => - when(laneResponse.head.valid) { - state := true.B - } - ) - wResponseFeedback.foreach(state => - when(laneResponseFeedback.head.valid) { - state := true.B - } - ) // enqueue write for last slot - vrfWriteQueue.enq.valid := stageValidReg.get && schedulerFinish && !pipeEnqueue.get.decodeResult(Decoder.sWrite) + vrfWriteQueue.enq.valid := stageValidReg.get && !pipeEnqueue.get.decodeResult(Decoder.sWrite) // UInt(5.W) + UInt(3.W), use `+` here vrfWriteQueue.enq.bits.vd := pipeEnqueue.get.vd + pipeEnqueue.get.groupCounter( @@ -166,8 +122,8 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { /** Cross-lane writing is over */ val CrossLaneWriteOver: Bool = 
(sCrossWriteLSB ++ sCrossWriteMSB).reduce(_ && _) - enqueue.ready := !stageValidReg.get || (CrossLaneWriteOver && schedulerFinish && vrfWriteReady) - val dequeueFire = stageValidReg.get && CrossLaneWriteOver && schedulerFinish && vrfWriteReady + enqueue.ready := !stageValidReg.get || (CrossLaneWriteOver && vrfWriteReady) + val dequeueFire = stageValidReg.get && CrossLaneWriteOver && vrfWriteReady stageValidReg.foreach { data => when(dequeueFire ^ enqueue.fire) { data := enqueue.fire diff --git a/t1/src/laneStage/MaskExchangeUnit.scala b/t1/src/laneStage/MaskExchangeUnit.scala new file mode 100644 index 000000000..cb8257f83 --- /dev/null +++ b/t1/src/laneStage/MaskExchangeUnit.scala @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl.lane + +import chisel3._ +import chisel3.experimental.hierarchy.{instantiable, public} +import chisel3.util._ +import org.chipsalliance.t1.rtl._ +import org.chipsalliance.t1.rtl.decoder.Decoder + +@instantiable +class MaskExchangeUnit(parameter: LaneParameter) extends Module { + @public + val enqueue: DecoupledIO[LaneStage3Enqueue] = + IO(Flipped(Decoupled(new LaneStage3Enqueue(parameter, true)))) + + @public + val dequeue: DecoupledIO[LaneStage3Enqueue] = + IO(Decoupled(new LaneStage3Enqueue(parameter, true))) + + @public + val maskReq: ValidIO[MaskUnitExeReq] = IO(Valid(new MaskUnitExeReq(parameter))) + + @public + val maskRequestToLSU: Bool = IO(Output(Bool())) + + @public + val tokenIO: LaneTokenBundle = IO(new LaneTokenBundle) + + // todo: sSendResponse -> sendResponse + val enqIsMaskRequest: Bool = !enqueue.bits.sSendResponse + // not maskUnit && not send out + val enqSendToDeq: Bool = !enqueue.bits.decodeResult(Decoder.maskUnit) && enqueue.bits.sSendResponse + val enqFFoIndex: Bool = enqueue.bits.decodeResult(Decoder.ffo) && + enqueue.bits.decodeResult(Decoder.targetRd) + + val maskRequestAllow: Bool = + 
pipeToken(parameter.maskRequestQueueSize)(maskReq.valid, tokenIO.maskRequestRelease) + // todo: connect mask request & response + maskReq.valid := enqIsMaskRequest && enqueue.valid && maskRequestAllow + maskReq.bits.source1 := enqueue.bits.pipeData + maskReq.bits.source2 := Mux( + enqFFoIndex, + enqueue.bits.ffoIndex, + enqueue.bits.data + ) + maskReq.bits.index := enqueue.bits.instructionIndex + maskReq.bits.ffo := enqueue.bits.ffoSuccess + + maskReq.bits.fpReduceValid.zip(enqueue.bits.fpReduceValid).foreach { case (sink, source) => sink := source } + + maskRequestToLSU := enqueue.bits.loadStore + + val maskRequestEnqReady: Bool = !enqIsMaskRequest || maskRequestAllow + + dequeue.valid := enqueue.valid && enqSendToDeq + dequeue.bits := enqueue.bits + enqueue.ready := Mux(enqSendToDeq, dequeue.ready, maskRequestEnqReady) + tokenIO.maskResponseRelease := DontCare +} diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala index 61212900f..f7893a926 100644 --- a/t1/src/laneStage/SlotTokenManager.scala +++ b/t1/src/laneStage/SlotTokenManager.scala @@ -94,7 +94,13 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val topWriteEnq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) @public - val topWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) + val fromMask: Bool = IO(Input(Bool())) + + @public + val lsuWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) + + @public + val maskWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) @public val instructionValid: UInt = IO(Output(UInt(parameter.chainingSize.W))) @@ -102,6 +108,9 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { @public val dataInWritePipe: UInt = IO(Output(UInt(parameter.chainingSize.W))) + @public + val maskUnitLastReport: UInt = IO(Input(UInt(parameter.chainingSize.W))) + def tokenUpdate(tokenData: 
Seq[UInt], enqWire: UInt, deqWire: UInt): UInt = { tokenData.zipWithIndex.foreach { case (t, i) => val e = enqWire(i) @@ -115,12 +124,15 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { } // todo: Precise feedback - def feedbackUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt): UInt = { + def feedbackUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt, clear: UInt): UInt = { tokenData.zipWithIndex.foreach { case (t, i) => val e = enqWire(i) val d = deqWire(i) + val c = clear(i) val change = Mux(e, 1.U(tokenWith.W), -1.S(tokenWith.W).asUInt) - when((e ^ d) && (e || t =/= 0.U)) { + when(c) { + t := 0.U + }.elsewhen((e ^ d) && (e || t =/= 0.U)) { t := t + change } } @@ -133,7 +145,11 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val enqOH = indexToOH(enqReport.bits.instructionIndex, parameter.chainingSize) val writeDoEnq: UInt = - maskAnd(enqReport.valid && !enqReport.bits.decodeResult(Decoder.sWrite), enqOH).asUInt + maskAnd( + enqReport.valid && !enqReport.bits.decodeResult(Decoder.sWrite) && + !enqReport.bits.decodeResult(Decoder.maskUnit), + enqOH + ).asUInt val writeDoDeq: UInt = maskAnd( @@ -141,7 +157,9 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { indexToOH(slotWriteReport(slotIndex).bits, parameter.chainingSize) ).asUInt - val pendingSlotWrite = tokenUpdate(writeToken, writeDoEnq, writeDoDeq) + val writeEnqSelect: UInt = Wire(UInt(parameter.chainingSize.W)) + + val pendingSlotWrite = tokenUpdate(writeToken, writeEnqSelect, writeDoDeq) if (slotIndex == 0) { val responseToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) @@ -176,19 +194,17 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val responseDoDeq: UInt = maskAnd(responseReport.valid, indexToOH(responseReport.bits, parameter.chainingSize)).asUInt - val feedbackIndexSelect = Mux( - responseIndexQueue.deq.valid, - responseIndexQueue.deq.bits, - 
responseFeedbackReport.bits - ) val feedbackDoDeq: UInt = - maskAnd(responseFeedbackReport.valid, indexToOH(feedbackIndexSelect, parameter.chainingSize)).asUInt + maskAnd(responseFeedbackReport.valid, indexToOH(responseFeedbackReport.bits, parameter.chainingSize)).asUInt + + writeEnqSelect := writeDoEnq val pendingResponse = tokenUpdate(responseToken, responseDoEnq, responseDoDeq) // todo: Precise feedback - val pendingFeedback = feedbackUpdate(feedbackToken, responseDoEnq, feedbackDoDeq) + val pendingFeedback = feedbackUpdate(feedbackToken, responseDoEnq, feedbackDoDeq, maskUnitLastReport) pendingSlotWrite | pendingCrossWriteLSB | pendingCrossWriteMSB | pendingResponse | pendingFeedback } else { + writeEnqSelect := writeDoEnq pendingSlotWrite } }.reduce(_ | _) @@ -202,16 +218,25 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val instructionInWritePipe: UInt = tokenUpdate(writePipeToken, writePipeEnq, writePipeDeq) - // top write token - val topWriteToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) - val topWriteDoEnq: UInt = - maskAnd(topWriteEnq.valid, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt + // lsu & mask write token + val lsuWriteToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) + val maskWriteToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) + + val lsuWriteDoEnq: UInt = + maskAnd(topWriteEnq.valid && !fromMask, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt + + val maskWriteDoEnq: UInt = + maskAnd(topWriteEnq.valid && fromMask, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt + + val lsuWriteDoDeq: UInt = + maskAnd(lsuWriteDeq.valid, indexToOH(lsuWriteDeq.bits, parameter.chainingSize)).asUInt - val topWriteDoDeq: UInt = - maskAnd(topWriteDeq.valid, indexToOH(topWriteDeq.bits, parameter.chainingSize)).asUInt + val maskWriteDoDeq: UInt = + maskAnd(maskWriteDeq.valid, 
indexToOH(maskWriteDeq.bits, parameter.chainingSize)).asUInt - val instructionInTopWritePipe = tokenUpdate(topWriteToken, topWriteDoEnq, topWriteDoDeq) + val lsuInTopWrite = tokenUpdate(lsuWriteToken, lsuWriteDoEnq, lsuWriteDoDeq) + val maskInTopWrite = tokenUpdate(maskWriteToken, maskWriteDoEnq, maskWriteDoDeq) - dataInWritePipe := instructionInWritePipe | instructionInTopWritePipe + dataInWritePipe := instructionInWritePipe | lsuInTopWrite | maskInTopWrite instructionValid := dataInWritePipe | instructionInSlot } diff --git a/t1/src/lsu/Bundle.scala b/t1/src/lsu/Bundle.scala index ecc450cdc..4bcee26d8 100644 --- a/t1/src/lsu/Bundle.scala +++ b/t1/src/lsu/Bundle.scala @@ -27,10 +27,6 @@ class LSUBaseStatus extends Bundle { class SimpleAccessStatus(laneNumber: Int) extends LSUBaseStatus { - /** the MSHR finished the current offset group, need to notify Scheduler for next index group. - */ - val offsetGroupEnd: Bool = Bool() - /** the current lane that this MSHR is accessing. */ val targetLane: UInt = UInt(laneNumber.W) @@ -104,3 +100,7 @@ class SimpleMemWrite(param: MSHRParam) extends Bundle { val address: UInt = UInt(param.paWidth.W) val size: UInt = UInt(2.W) } + +class LSUToken(parameter: LSUParameter) extends Bundle { + val offsetGroupRelease: UInt = Output(UInt(parameter.laneNumber.W)) +} diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala index 6d1b86b1e..614ccfb2e 100644 --- a/t1/src/lsu/LSU.scala +++ b/t1/src/lsu/LSU.scala @@ -58,7 +58,7 @@ case class LSUParameter( */ val vLenBits: Int = log2Ceil(vLen) + 1 - val sourceQueueSize: Int = vLen * 8 / (transferSize * 8) + val sourceQueueSize: Int = 32.min(vLen * 8 / (transferSize * 8)) def mshrParam: MSHRParam = MSHRParam(chainingSize, datapathWidth, vLen, laneNumber, paWidth, transferSize, vrfReadLatency) @@ -187,16 +187,14 @@ class LSU(param: LSUParameter) extends Module { @public val lsuMaskGroupChange: UInt = IO(Output(UInt(param.chainingSize.W))) - /** interface to [[V]], redirect to 
[[org.chipsalliance.t1.rtl.Lane]]. this group of offset is finish, request the - * next group of offset. - */ - @public - val lsuOffsetRequest: Bool = IO(Output(Bool())) @public val writeReadyForLsu: Bool = IO(Input(Bool())) @public val vrfReadyToStore: Bool = IO(Input(Bool())) + @public + val tokenIO: LSUToken = IO(new LSUToken(param)) + // TODO: make it D/I val loadUnit: LoadUnit = Module(new LoadUnit(param.mshrParam)) val storeUnit: StoreUnit = Module(new StoreUnit(param.mshrParam)) @@ -437,15 +435,14 @@ class LSU(param: LSUParameter) extends Module { otherUnit.offsetReadResult := offsetReadResult // gather last signal from all MSHR to notify LSU - lastReport := + lastReport := unitVec.map(m => Mux(m.status.last, indexToOH(m.status.instructionIndex, param.chainingSize), 0.U)).reduce(_ | _) - lsuMaskGroupChange := unitVec + lsuMaskGroupChange := unitVec .map(m => Mux(m.status.changeMaskGroup, indexToOH(m.status.instructionIndex, param.chainingSize), 0.U)) .reduce(_ | _) - lsuOffsetRequest := (otherUnit.status.offsetGroupEnd | otherUnit.status.last | - (otherUnit.status.idle && offsetReadResult.map(_.valid).reduce(_ | _))) && otherUnit.status.isIndexLS - loadUnit.writeReadyForLsu := writeReadyForLsu - storeUnit.vrfReadyToStore := vrfReadyToStore + tokenIO.offsetGroupRelease := otherUnit.offsetRelease.asUInt + loadUnit.writeReadyForLsu := writeReadyForLsu + storeUnit.vrfReadyToStore := vrfReadyToStore val unitOrder: Bool = instIndexLE(loadUnit.status.instructionIndex, storeUnit.status.instructionIndex) val loadAddressConflict: Bool = (loadUnit.status.startAddress >= storeUnit.status.startAddress) && diff --git a/t1/src/lsu/SimpleAccessUnit.scala b/t1/src/lsu/SimpleAccessUnit.scala index 8cb26d359..d4598036c 100644 --- a/t1/src/lsu/SimpleAccessUnit.scala +++ b/t1/src/lsu/SimpleAccessUnit.scala @@ -116,6 +116,10 @@ case class MSHRParam( /** The maximum number of cache lines that will be accessed, a counter is needed. 
+1 Corresponding unaligned case */ val cacheLineIndexBits: Int = log2Ceil(vLen / lsuTransposeSize + 1) + + // outstanding of MaskExchangeUnit.maskReq + // todo: param from T1Param + val maskRequestQueueSize: Int = 8 } /** Miss Status Handler Register this is used to record the outstanding memory access request for each instruction. it @@ -189,6 +193,26 @@ class SimpleAccessUnit(param: MSHRParam) extends Module with LSUPublic { @public val probe = IO(Output(Probe(new MemoryWriteProbe(param), layers.Verification))) + @public + val offsetRelease: Vec[Bool] = IO(Output(Vec(param.laneNumber, Bool()))) + + val requestOffset: Bool = Wire(Bool()) + val stateIdle: Bool = Wire(Bool()) + val waitQueueDeq: Vec[Bool] = Wire(Vec(param.laneNumber, Bool())) + val offsetQueueVec: Seq[QueueIO[UInt]] = offsetReadResult.zipWithIndex.map { case (req, index) => + val queue: QueueIO[UInt] = Queue.io(chiselTypeOf(req.bits), param.maskRequestQueueSize) + val deqLock: Bool = RegInit(false.B) + waitQueueDeq(index) := deqLock + when(lsuRequest.valid || requestOffset || queue.deq.fire) { + deqLock := queue.deq.fire + } + offsetRelease(index) := queue.deq.fire + queue.enq.valid := req.valid + queue.enq.bits := req.bits + queue.deq.ready := !deqLock || stateIdle + queue + } + val s0Fire: Bool = Wire(Bool()) val s1Fire: Bool = Wire(Bool()) val memRequestFire: Bool = memReadRequest.fire || memWriteRequest.fire @@ -317,7 +341,7 @@ class SimpleAccessUnit(param: MSHRParam) extends Module with LSUPublic { * indexOfIndexedInstructionOffsetsNext: UInt = Wire(UInt(2.W))` */ val indexOfIndexedInstructionOffsets: UInt = - RegEnable(indexOfIndexedInstructionOffsetsNext, lsuRequest.valid || offsetReadResult.head.valid) + RegEnable(indexOfIndexedInstructionOffsetsNext, lsuRequest.valid || offsetQueueVec.head.deq.fire) indexOfIndexedInstructionOffsetsNext := Mux(lsuRequest.valid, 3.U(2.W), indexOfIndexedInstructionOffsets + 1.U) /** record the used [[indexedInstructionOffsets]] for sending memory transactions. 
*/ @@ -326,10 +350,10 @@ class SimpleAccessUnit(param: MSHRParam) extends Module with LSUPublic { indexedInstructionOffsets.zipWithIndex.foreach { case (offset, index) => // offsetReadResult(index).valid: new offset came // (offset.valid && !usedIndexedInstructionOffsets(index)): old unused offset - offset.valid := offsetReadResult(index).valid || + offset.valid := offsetQueueVec(index).deq.fire || (offset.valid && !usedIndexedInstructionOffsets(index) && !status.last) // select from new and old. - offset.bits := Mux(offsetReadResult(index).valid, offsetReadResult(index).bits, offset.bits) + offset.bits := Mux(offsetQueueVec(index).deq.fire, offsetQueueVec(index).deq.bits, offset.bits) } /** register to latch mask */ @@ -584,7 +608,7 @@ class SimpleAccessUnit(param: MSHRParam) extends Module with LSUPublic { indexedInstructionOffsetExhausted ) || // change offset group - status.offsetGroupEnd || + (requestOffset && waitQueueDeq.asUInt.andR) || // change mask group // TODO: remove [[maskNeedUpdate]]? maskGroupEndAndRequestNewMask @@ -622,40 +646,14 @@ class SimpleAccessUnit(param: MSHRParam) extends Module with LSUPublic { ) ) - /** signal indicate that the offset group for all lanes are valid. */ - val allOffsetValid: Bool = VecInit(indexedInstructionOffsets.map(_.valid)).asUInt.andR - - /** signal used for aligning offset groups. eg: vl = 65, eew = 16, only wait for first group of offset - */ - val offsetGroupsAlign: Vec[Bool] = RegInit(VecInit(Seq.fill(param.laneNumber)(false.B))) - // to fix the bug that after the first group being used, the second group is not valid, - // MSHR will change group by mistake. 
- offsetGroupsAlign.zip(offsetReadResult).foreach { case (a, d) => - when(!a && d.valid) { - a := true.B - }.elsewhen(status.offsetGroupEnd) { - a := false.B - } - } - - val alignCheck: Bool = - (offsetGroupsAlign.asUInt >> ( - // offsetOfOffsetGroup is in byte level - offsetOfOffsetGroup >> - // shift it to word level - log2Ceil(param.datapathWidth / 8) - ).asUInt).asUInt(0) - /** the current element is the last element to execute in the pipeline. */ val last: Bool = nextElementIndex >= evl /** no need mask, there still exist unsent masked requests, don't need to update mask. */ val maskCheck: Bool = !isMaskedLoadStore || !noMoreMaskedUnsentMemoryRequests - val skipAllGroupOffset: Bool = isIndexedLoadStore && offsetValidCheck && alignCheck && !offsetGroupCheck - /** no need index, when use a index, check it is valid or not. */ - val indexCheck: Bool = !isIndexedLoadStore || (offsetValidCheck && offsetGroupCheck && alignCheck) + val indexCheck: Bool = !isIndexedLoadStore || (offsetValidCheck && offsetGroupCheck) // handle fault only first /** the current TileLink message in A Channel is the first transaction in this instruction. */ @@ -676,24 +674,10 @@ class SimpleAccessUnit(param: MSHRParam) extends Module with LSUPublic { /** all check is ready, being able to send request to pipeline. */ val stateReady: Bool = stateIsRequest && maskCheck && indexCheck && fofCheck - /** only need to request offset when changing offset group, don't send request for the first offset group for each - * instruction. - */ - val needRequestOffset: Bool = - RegEnable(offsetReadResult.head.valid, false.B, offsetReadResult.head.valid || lsuRequest.valid) - + // state === idle: All the remaining elements are removed by the mask, + // but there is still offset left. /** signal to request offset in the pipeline, only assert for one cycle. 
*/ - val requestOffset: Bool = stateIsRequest && maskCheck && !indexCheck && fofCheck - - /** lock [[status.offsetGroupEnd]] */ - val offsetRequestLock: Bool = RegInit(false.B) - - when(status.offsetGroupEnd || offsetReadResult.head.valid) { - offsetRequestLock := status.offsetGroupEnd - } - - // ask Scheduler to change offset group - status.offsetGroupEnd := needRequestOffset && requestOffset && !offsetRequestLock + requestOffset := stateIsRequest && maskCheck && !indexCheck && fofCheck val s0DequeueFire: Bool = Wire(Bool()) @@ -979,6 +963,7 @@ class SimpleAccessUnit(param: MSHRParam) extends Module with LSUPublic { // handle corner case for vl=0 val invalidInstruction: Bool = csrInterface.vl === 0.U && !requestIsWholeRegisterLoadStore && lsuRequest.valid val invalidInstructionNext: Bool = RegNext(invalidInstruction) + val allElementsMasked: Bool = state === idle && offsetQueueVec.map(_.deq.fire).reduce(_ || _) // change state to request when(lsuRequest.valid && !invalidInstruction) { @@ -990,9 +975,9 @@ class SimpleAccessUnit(param: MSHRParam) extends Module with LSUPublic { status.instructionIndex := lsuRequestReg.instructionIndex /** the current state is idle. 
*/ - val stateIdle = state === idle + stateIdle := state === idle status.idle := stateIdle - status.last := (!RegNext(stateIdle) && stateIdle) || invalidInstructionNext + status.last := (!RegNext(stateIdle) && stateIdle) || invalidInstructionNext || allElementsMasked status.changeMaskGroup := updateOffsetGroupEnable // which lane to access status.targetLane := { diff --git a/t1/src/lsu/StoreUnit.scala b/t1/src/lsu/StoreUnit.scala index a5afb5a05..1fb72096b 100644 --- a/t1/src/lsu/StoreUnit.scala +++ b/t1/src/lsu/StoreUnit.scala @@ -248,8 +248,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { } // 连接 alignedDequeue - val needSendTail: Bool = bufferBaseCacheLineIndex === cacheLineNumberReg - memRequest.valid := bufferValid || (canSendTail && needSendTail) + val needSendTail: Bool = bufferBaseCacheLineIndex === cacheLineNumberReg + val addressQueueFree: Bool = Wire(Bool()) + memRequest.valid := (bufferValid || (canSendTail && needSendTail)) && addressQueueFree // aligned memRequest.bits.data := multiShifter(right = false, multiSize = 8)(dataBuffer.head ## cacheLineTemp, initOffset) >> cacheLineTemp.getWidth @@ -263,12 +264,14 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic { 0.U(param.cacheLineBits.W) memRequest.bits.address := alignedDequeueAddress - val addressQueueSize: Int = (param.vLen * 8) / (param.datapathWidth * param.laneNumber) + 1 + // todo: param outstanding + val addressQueueSize: Int = 32.min((param.vLen * 8) / (param.datapathWidth * param.laneNumber) + 1) // address Wait For Response val addressQueue: QueueIO[UInt] = Queue.io(UInt(param.paWidth.W), addressQueueSize) addressQueue.enq.valid := memRequest.fire addressQueue.enq.bits := alignedDequeueAddress addressQueue.deq.ready := storeResponse + addressQueueFree := addressQueue.enq.ready status.idle := !bufferValid && !readStageValid && readQueueClear && !bufferFull && !addressQueue.deq.valid val idleNext: Bool = RegNext(status.idle, true.B) 
diff --git a/t1/src/mask/BitLevelMaskWrite.scala b/t1/src/mask/BitLevelMaskWrite.scala new file mode 100644 index 000000000..cefe31868 --- /dev/null +++ b/t1/src/mask/BitLevelMaskWrite.scala @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.dwbb.stdlib.queue.{Queue, QueueIO} + +class BitLevelWriteRequest(parameter: T1Parameter) extends Bundle { + val data: UInt = UInt(parameter.datapathWidth.W) + val pipeData: UInt = UInt(parameter.datapathWidth.W) + val bitMask: UInt = UInt(parameter.datapathWidth.W) + val mask: UInt = UInt((parameter.datapathWidth / 8).W) + val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val ffoByOther: Bool = Bool() +} + +class BitLevelMaskWrite(parameter: T1Parameter) extends Module { + // todo + val readVRFLatency: Int = 2 + + val needWAR: Bool = IO(Input(Bool())) + val vd: UInt = IO(Input(UInt(5.W))) + + val in: Seq[DecoupledIO[BitLevelWriteRequest]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Decoupled(new BitLevelWriteRequest(parameter)))) + } + + val out: Seq[DecoupledIO[MaskUnitExeResponse]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Decoupled(new MaskUnitExeResponse(parameter.laneParam))) + } + + val readChannel: Seq[DecoupledIO[VRFReadRequest]] = Seq.tabulate(parameter.laneNumber) { _ => + IO( + Decoupled( + new VRFReadRequest( + parameter.vrfParam.regNumBits, + parameter.laneParam.vrfOffsetBits, + parameter.instructionIndexBits + ) + ) + ) + } + + val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Input(UInt(parameter.datapathWidth.W))) + } + + val stageClear: Bool = IO(Output(Bool())) + + val stageClearVec: Seq[Bool] = in.zipWithIndex.map { case (req, index) => + val reqQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), 4) + val readPort = readChannel(index) + val readData = readResult(index) + 
val res = out(index) + + val WaitReadQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), readVRFLatency) + val readReady = !needWAR || readPort.ready + + reqQueue.enq <> req + WaitReadQueue.enq.valid := reqQueue.deq.valid && readReady + WaitReadQueue.enq.bits := reqQueue.deq.bits + reqQueue.deq.ready := WaitReadQueue.enq.ready && readReady + + readPort.valid := reqQueue.deq.valid && needWAR && WaitReadQueue.enq.ready + readPort.bits := DontCare + readPort.bits.vs := vd + (reqQueue.deq.bits.groupCounter >> readPort.bits.offset.getWidth).asUInt + readPort.bits.offset := changeUIntSize(reqQueue.deq.bits.groupCounter, readPort.bits.offset.getWidth) + + val readValidPipe = Pipe(readPort.fire, false.B, readVRFLatency).valid + val readResultValid = !needWAR || readValidPipe + + val WARData = (WaitReadQueue.deq.bits.data & WaitReadQueue.deq.bits.bitMask) | + (readData & (~WaitReadQueue.deq.bits.bitMask).asUInt) + + res.valid := WaitReadQueue.deq.valid && readResultValid + WaitReadQueue.deq.ready := res.ready && readResultValid + res.bits := DontCare + res.bits.pipeData := WaitReadQueue.deq.bits.pipeData + res.bits.ffoByOther := WaitReadQueue.deq.bits.ffoByOther + res.bits.writeData.data := Mux(needWAR, WARData, WaitReadQueue.deq.bits.data) + res.bits.writeData.groupCounter := WaitReadQueue.deq.bits.groupCounter + res.bits.writeData.mask := maskEnable(!needWAR, WaitReadQueue.deq.bits.mask) + + // valid token + val counter = RegInit(0.U(3.W)) + val counterChange = Mux(req.fire, 1.U(3.W), 7.U(3.W)) + when(req.fire ^ res.fire) { + counter := counter + counterChange + } + counter === 0.U + } + stageClear := stageClearVec.reduce(_ && _) +} diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala new file mode 100644 index 000000000..24f597259 --- /dev/null +++ b/t1/src/mask/MaskCompress.scala @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import 
chisel3._ +import chisel3.util._ + +class CompressInput(parameter: T1Parameter) extends Bundle { + val maskType: Bool = Bool() + val eew: UInt = UInt(2.W) + val uop: UInt = UInt(3.W) + val readFromScalar: UInt = UInt(parameter.datapathWidth.W) + val source1: UInt = UInt(parameter.datapathWidth.W) + val mask: UInt = UInt(parameter.datapathWidth.W) + val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) + val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val ffoInput: UInt = UInt(parameter.laneNumber.W) + val validInput: UInt = UInt(parameter.laneNumber.W) + val lastCompress: Bool = Bool() +} + +class CompressOutput(parameter: T1Parameter) extends Bundle { + val data: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) + val mask: UInt = UInt((parameter.laneNumber * parameter.datapathWidth / 8).W) + val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val ffoOutput: UInt = UInt(parameter.laneNumber.W) + val compressValid: Bool = Bool() +} + +class MaskCompress(parameter: T1Parameter) extends Module { + val in: ValidIO[CompressInput] = IO(Flipped(Valid(new CompressInput(parameter)))) + val out: CompressOutput = IO(Output(new CompressOutput(parameter))) + val newInstruction: Bool = IO(Input(Bool())) + val ffoInstruction: Bool = IO(Input(Bool())) + val writeData: UInt = IO(Output(UInt(parameter.xLen.W))) + + val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 + + val compress = in.bits.uop === "b001".U + val viota = in.bits.uop === "b000".U + val mv = in.bits.uop === "b010".U + val mvRd = in.bits.uop === "b011".U + val writeRD = in.bits.uop === BitPat("b?11") + val ffoType = in.bits.uop === BitPat("b11?") + + val eew1H: UInt = UIntToOH(in.bits.eew)(2, 0) + val compressInit: UInt = RegInit(0.U(log2Ceil(parameter.vLen).W)) + val compressVec: Vec[UInt] = Wire(Vec(maskSize, UInt(compressInit.getWidth.W))) + val compressMaskVec: Seq[Bool] = changeUIntSize(in.bits.source1 & 
in.bits.mask, maskSize).asBools + val compressCount: UInt = compressMaskVec.zipWithIndex.foldLeft(compressInit) { case (pre, (mask, index)) => + compressVec(index) := pre + pre + mask + } + + // ffo + val ffoIndex: UInt = RegInit(0.U(parameter.datapathWidth.W)) + val ffoValid: Bool = RegInit(false.B) + writeData := ffoIndex + + when(newInstruction) { + compressInit := 0.U + } + + val countSplit: Seq[(Bool, UInt)] = Seq(0, 1, 2).map { sewInt => + val dataByte = 1 << sewInt + val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte + val countWidth = log2Ceil(elementSizePerSet) + val compressDeqValid = (compressCount >> countWidth).asUInt.orR + val compressUpdate = changeUIntSize(compressCount, countWidth) + (compressDeqValid, compressUpdate) + } + + val compressDeqValid: Bool = Mux1H(eew1H, countSplit.map(_._1)) || !compress + val compressCountSelect: UInt = Mux1H(eew1H, countSplit.map(_._2)) + + when(in.fire) { + when(viota) { + compressInit := compressCount + }.otherwise { + // count update compress + compressInit := compressCountSelect + } + } + + val viotaResult: UInt = Mux1H( + eew1H, + Seq(1, 2, 4).map { eew => + VecInit(Seq.tabulate(parameter.laneNumber) { index => + // data width: eew * 8, data path 32, need [4 / eew] element + val dataSize = 4 / eew + val res: Seq[UInt] = Seq.tabulate(dataSize) { i => + changeUIntSize(compressVec(dataSize * index + i), eew * 8) + } + // each data path + VecInit(res).asUInt + }).asUInt + } + ) + val viotaMask: UInt = Mux1H( + eew1H, + Seq(1, 2, 4).map { eew => + VecInit(Seq.tabulate(parameter.laneNumber) { index => + val dataSize = 4 / eew + val res: Seq[UInt] = Seq.tabulate(dataSize) { i => + val maskIndex: Int = (parameter.datapathWidth - 1).min(dataSize * index + i) + Fill(eew, in.bits.mask(maskIndex)) + } + // 4 bit mask + VecInit(res).asUInt + }).asUInt + } + ) + + val tailCount: UInt = { + val minElementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 + val maxCountWidth = 
log2Ceil(minElementSizePerSet) + changeUIntSize(compressInit, maxCountWidth) + } + + val compressDataReg = RegInit(0.U((parameter.laneNumber * parameter.datapathWidth).W)) + val compressTailValid: Bool = RegInit(false.B) + val compressWriteGroupCount: UInt = RegInit(0.U(parameter.laneParam.groupNumberBits.W)) + val compressDataVec = Seq(1, 2, 4).map { dataByte => + val dataBit = dataByte * 8 + val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte + VecInit(Seq.tabulate(elementSizePerSet * 2) { index => + val hitReq = + Seq.tabulate(elementSizePerSet)(maskIndex => compressMaskVec(maskIndex) && compressVec(maskIndex) === index.U) + val selectReqData = Mux1H( + hitReq, + cutUInt(in.bits.source2, dataBit) + ) + if (index < elementSizePerSet) { + val useTail = index.U < tailCount + val tailData = cutUInt(compressDataReg, dataBit)(index) + Mux(useTail, tailData, selectReqData) + } else { + selectReqData + } + }).asUInt + } + val compressResult: UInt = Mux1H(eew1H, compressDataVec) + val lastCompressEnq: Bool = in.fire && in.bits.lastCompress + when(newInstruction || lastCompressEnq || out.compressValid) { + compressTailValid := lastCompressEnq && compress + } + + when(newInstruction || out.compressValid) { + compressWriteGroupCount := Mux(newInstruction, 0.U, compressWriteGroupCount + 1.U) + } + + val splitCompressResult: Vec[UInt] = cutUIntBySize(compressResult, 2) + when(in.fire) { + compressDataReg := Mux(compressDeqValid, splitCompressResult(1), splitCompressResult(0)) + } + + // todo: connect & update compressInit + val compressMask = Wire(UInt(out.mask.getWidth.W)) + // todo: optimization + val compressTailMask: UInt = Mux1H( + eew1H, + Seq(0, 1, 2).map { sewInt => + val dataByte = 1 << sewInt + val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte + VecInit(Seq.tabulate(elementSizePerSet) { elementIndex => + val elementValid = elementIndex.U < tailCount + val elementMask = Fill(dataByte, elementValid) 
+ elementMask + }).asUInt + } + ) + compressMask := Mux(compressTailValid, compressTailMask, (-1.S(out.mask.getWidth.W)).asUInt) + + val mvMask = Mux1H(eew1H, Seq(1.U, 3.U, 15.U)) + val mvData = in.bits.readFromScalar + + val ffoMask: UInt = FillInterleaved(parameter.datapathWidth / 8, in.bits.validInput) + + out.data := Mux1H( + Seq( + compress -> compressResult, + viota -> viotaResult, + mv -> mvData, + ffoType -> in.bits.source2 + ) + ) + + // todo: compressMask + out.mask := Mux1H( + Seq( + compress -> compressMask, + viota -> viotaMask, + mv -> mvMask, + ffoType -> ffoMask + ) + ) + + // todo + out.compressValid := (compressTailValid || (compressDeqValid && in.fire)) && !writeRD + out.groupCounter := Mux(compress, compressWriteGroupCount, in.bits.groupCounter) + + when(newInstruction && ffoInstruction) { + ffoIndex := -1.S(parameter.datapathWidth.W).asUInt + ffoValid := false.B + } + val firstLane: UInt = ffo(in.bits.ffoInput) + val firstLaneIndex: UInt = OHToUInt(firstLane)(log2Ceil(parameter.laneNumber) - 1, 0) + + val source1SigExtend: UInt = Mux1H( + eew1H, + Seq(1, 2, 4).map { byteSize => + val dataBits = byteSize * 8 + if (parameter.xLen > dataBits) { + Fill(parameter.xLen - dataBits, in.bits.source1(dataBits - 1)) ## in.bits.source1(dataBits - 1, 0) + } else { + in.bits.source1 + } + } + ) + + /** for find first one, need to tell the lane with higher index `1` . 
*/ + val completedLeftOr: UInt = (scanLeftOr(in.bits.ffoInput) << 1).asUInt(parameter.laneNumber - 1, 0) + when(in.fire && in.bits.ffoInput.orR && ffoType) { + ffoValid := true.B + when(!ffoValid) { + ffoIndex := Mux1H( + firstLane, + // 3: firstLaneIndex.width + cutUInt(in.bits.source2, parameter.datapathWidth).map(i => + i(parameter.xLen - 1 - 3, 5) ## firstLaneIndex ## i(4, 0) + ) + ) + } + }.elsewhen(mvRd) { + ffoIndex := source1SigExtend + } + out.ffoOutput := completedLeftOr | Fill(parameter.laneNumber, ffoValid) +} diff --git a/t1/src/mask/MaskExtend.scala b/t1/src/mask/MaskExtend.scala new file mode 100644 index 000000000..5df5ed2ee --- /dev/null +++ b/t1/src/mask/MaskExtend.scala @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.util._ + +class ExtendInput(parameter: T1Parameter) extends Bundle { + val eew: UInt = UInt(2.W) + val uop: UInt = UInt(3.W) + val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) + val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) +} + +class MaskExtend(parameter: T1Parameter) extends Module { + val in: ExtendInput = IO(Input(new ExtendInput(parameter))) + val out: UInt = IO(Output(UInt((parameter.laneNumber * parameter.datapathWidth).W))) + + val eew1H: UInt = UIntToOH(in.eew)(2, 0) + + val isMaskDestination: Bool = !in.uop(2, 0).orR + val sourceDataVec: Vec[UInt] = cutUInt(in.source2, parameter.datapathWidth) + val maskDestinationResult: UInt = + Mux1H( + eew1H, + Seq(4, 2, 1).map { groupSize => + VecInit(sourceDataVec.map { element => + element.asBools // [x] * 32 eg: sew = 1 + .grouped(groupSize) // [x, x] * 16 + .toSeq + .map(VecInit(_).asUInt) // [xx] * 16 + }.transpose.map(VecInit(_).asUInt)).asUInt // [x*16] * 16 -> x * 256 + } + ) + + // extend + val sign: Bool = in.uop(0) + // extend ratio + // todo: Currently only vf2 and vf4 + // 0b10 -> 4, 0b01 -> 2 + 
val extendRatio: Bool = in.uop(2) + + // select source2 + // extendRatio: 0 -> vf2; 1-> vf4 + val source2: UInt = Mux( + extendRatio, + Mux1H( + UIntToOH(in.groupCounter(1, 0)), + cutUInt(in.source2, parameter.laneNumber * parameter.datapathWidth / 4) + ), + Mux1H( + UIntToOH(in.groupCounter(0)), + cutUInt(in.source2, parameter.laneNumber * parameter.datapathWidth / 2) + ) + ) + + val extendResult: UInt = Mux1H( + eew1H(2, 1), + Seq(2, 4).map { dataWidth => + Mux1H( + UIntToOH(extendRatio), + Seq(2, 4).map { ratio => + val resWidth = dataWidth * 8 + val sourceWidth = resWidth / ratio + VecInit(cutUInt(source2, sourceWidth).map { sourceData => + Fill(resWidth - sourceWidth, sourceData(sourceWidth - 1) && sign) ## sourceData + }).asUInt + } + ) + } + ) + + out := Mux(isMaskDestination, maskDestinationResult, extendResult) +} diff --git a/t1/src/mask/MaskReduce.scala b/t1/src/mask/MaskReduce.scala new file mode 100644 index 000000000..288a190f5 --- /dev/null +++ b/t1/src/mask/MaskReduce.scala @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.experimental.hierarchy.{Instance, Instantiate} +import chisel3.util._ + +class ReduceInput(parameter: T1Parameter) extends Bundle { + val maskType: Bool = Bool() + val eew: UInt = UInt(2.W) + val uop: UInt = UInt(3.W) + val readVS1: UInt = UInt(parameter.datapathWidth.W) + val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) + val sourceValid: UInt = UInt(parameter.laneNumber.W) + val lastGroup: Bool = Bool() + val vxrm: UInt = UInt(3.W) + val aluUop: UInt = UInt(4.W) + val sign: Bool = Bool() + // for fpu + val fpSourceValid: Option[UInt] = Option.when(parameter.fpuEnable)(UInt(parameter.laneNumber.W)) +} + +class ReduceOutput(parameter: T1Parameter) extends Bundle { + val data: UInt = UInt(parameter.datapathWidth.W) + val mask: UInt = UInt((parameter.datapathWidth / 8).W) +} + 
/** Sequential reduction engine for the mask unit.
  *
  * A small FSM folds one DLEN row at a time into an accumulator:
  *   - crossFold: fold lane results into the accumulator, one lane word per cycle;
  *   - orderRed:  same walk but for ordered (in-order FP) reductions;
  *   - lastFold:  final fold of the accumulator down to element width.
  * `pop` reuses the adder datapath for vcpop (treated as a reduce-add).
  */
class MaskReduce(parameter: T1Parameter) extends Module {
  val in: DecoupledIO[ReduceInput] = IO(Flipped(Decoupled(new ReduceInput(parameter))))
  val out: ValidIO[ReduceOutput] = IO(Valid(new ReduceOutput(parameter)))
  val firstGroup: Bool = IO(Input(Bool()))
  val newInstruction: Bool = IO(Input(Bool()))
  val validInst: Bool = IO(Input(Bool()))
  val pop: Bool = IO(Input(Bool()))

  val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8

  // todo: uop decode
  val order: Bool = in.bits.uop === "b101".U
  // widen reduce: b001 (widen add) or b11? (float widen)
  val reqWiden: Bool = in.bits.uop === "b001".U || in.bits.uop(2, 1) === "b11".U

  val eew1H: UInt = UIntToOH(in.bits.eew)(2, 0)
  val nextFoldCount: Bool = eew1H(0) && !reqWiden

  // reduce function units
  val adder: Instance[ReduceAdder] = Instantiate(new ReduceAdder(parameter.datapathWidth))
  val logicUnit: Instance[LaneLogic] = Instantiate(new LaneLogic(parameter.datapathWidth))
  // optional units for float reduce
  val floatAdder: Option[Instance[FloatAdder]] =
    Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(8, 24)))
  val flotCompare: Option[Instance[FloatCompare]] =
    Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(8, 24)))

  // accumulator state
  val reduceInit: UInt = RegInit(0.U(parameter.datapathWidth.W))
  val reduceResult: UInt = Wire(UInt(parameter.datapathWidth.W))
  val crossFoldCount: UInt = RegInit(0.U(log2Ceil(parameter.laneNumber).W))
  val lastFoldCount: Bool = RegInit(false.B)
  val updateResult: Bool = Wire(Bool())
  val sourceValid: Bool = Wire(Bool())

  val reqReg = RegEnable(in.bits, 0.U.asTypeOf(in.bits), in.fire)
  // todo: handle reqReg.sourceValid
  val groupLastReduce = crossFoldCount.andR
  val lastFoldEnd = !lastFoldCount
  val outValid: Bool = WireDefault(false.B)
  // todo: skip float reduce
  val skipFlotReduce: Bool = WireDefault(false.B)

  val eew1HReg: UInt = UIntToOH(reqReg.eew)(2, 0)
  val floatType: Bool = reqReg.uop(2) || reqReg.uop(1, 0).andR
  val NotAdd: Bool = reqReg.uop(1)
  val widen: Bool = reqReg.uop === "b001".U || reqReg.uop(2, 1) === "b11".U
  // NOTE(review): in-datapath folding is currently disabled; the intended condition is kept
  // below — confirm whether lastFold is meant to be dead for now.
  // eew1HReg(0) || (eew1HReg(1) && !widen)
  val needFold: Bool = false.B
  val writeEEW: UInt = Mux(pop, 2.U, reqReg.eew + widen)
  val writeEEW1H: UInt = UIntToOH(writeEEW)(2, 0)
  val writeMask: UInt = Fill(2, writeEEW1H(2)) ## !writeEEW1H(0) ## true.B

  // crossFold: reduce between lanes
  // lastFold: reduce within the datapath
  // orderRed: ordered reduce
  val idle :: crossFold :: lastFold :: orderRed :: Nil = Enum(4)
  val state: UInt = RegInit(idle)

  val stateIdle: Bool = state === idle
  val stateCross: Bool = state === crossFold
  val stateLast: Bool = state === lastFold
  val stateOrder: Bool = state === orderRed

  // The accumulator only absorbs a lane word when that lane's element is valid.
  updateResult :=
    stateLast || ((stateCross || stateOrder) && sourceValid)

  // state transitions
  in.ready := stateIdle
  when(stateIdle) {
    when(in.valid) {
      state := Mux(order, orderRed, crossFold)
    }
  }

  when(stateCross) {
    when(groupLastReduce) {
      state := Mux(reqReg.lastGroup && needFold, lastFold, idle)
      outValid := reqReg.lastGroup && !needFold
    }
  }

  when(stateOrder) {
    when(groupLastReduce) {
      state := idle
      outValid := reqReg.lastGroup
    }
  }

  when(stateLast) {
    when(lastFoldEnd) {
      state := idle
      outValid := true.B
    }
  }

  // Byte enables used to clip the vs1 init value to element width.
  val widenEnqMask: UInt = Fill(2, in.bits.eew.orR) ## true.B ## true.B
  val normalMask: UInt = Fill(2, in.bits.eew(1)) ## in.bits.eew.orR ## true.B
  val enqWriteMask: UInt = Mux(reqWiden, widenEnqMask, normalMask)
  val updateInitMask: UInt = FillInterleaved(8, enqWriteMask)
  val updateMask: UInt = FillInterleaved(8, writeMask)
  when(firstGroup || newInstruction) {
    // vcpop starts from 0; a normal reduce seeds the accumulator with vs1[0].
    reduceInit := Mux(pop || newInstruction, 0.U, in.bits.readVS1 & updateInitMask)
    crossFoldCount := 0.U
    lastFoldCount := nextFoldCount
  }

  // lane-walk counter
  // todo: stateCross <=> stateOrder ??
  when(stateCross || stateOrder || in.fire) {
    crossFoldCount := Mux(in.fire, 0.U, crossFoldCount + 1.U)
  }

  // accumulator update
  when(updateResult) {
    reduceInit := reduceResult & updateMask
  }

  when(stateLast) {
    lastFoldCount := false.B
  }

  // Pick the lane word currently being folded, and its validity.
  val selectLaneResult: UInt = Mux1H(
    UIntToOH(crossFoldCount),
    cutUInt(reqReg.source2, parameter.datapathWidth)
  )
  val sourceValidCalculate: UInt =
    reqReg.fpSourceValid
      .map(fv => Mux(floatType, fv & reqReg.sourceValid, reqReg.sourceValid))
      .getOrElse(reqReg.sourceValid)
  sourceValid := Mux1H(
    UIntToOH(crossFoldCount),
    sourceValidCalculate.asBools
  )
  val reduceDataVec = cutUInt(reduceInit, 8)
  // reduceFoldCount = false => abcd -> xxab | xxcd -> mask 0011
  // reduceFoldCount = true  => abcd -> xaxc | xbxd -> mask 0101
  val lastFoldSource1: UInt = Mux(
    lastFoldCount,
    reduceDataVec(3) ## reduceDataVec(3) ## reduceDataVec(1),
    reduceDataVec(3) ## reduceDataVec(3) ## reduceDataVec(2)
  )
  val source2Select: UInt = Mux(stateCross || stateOrder, selectLaneResult, lastFoldSource1)

  // popCount is treated as a reduce-add by the top level
  adder.request.src := VecInit(Seq(reduceInit, source2Select))
  adder.request.opcode := Mux(pop, 0.U, reqReg.aluUop)
  adder.request.sign := reqReg.sign
  adder.request.vSew := writeEEW

  floatAdder.foreach { fAdder =>
    fAdder.io.a := reduceInit
    fAdder.io.b := source2Select
    // NOTE(review): the vxrm field carries the FP rounding mode here — confirm the
    // sequencer packs frm into vxrm for float reductions.
    fAdder.io.roundingMode := reqReg.vxrm
  }

  flotCompare.foreach { fCompare =>
    fCompare.io.a := reduceInit
    fCompare.io.b := source2Select
    // max -> 12, min -> 8
    fCompare.io.isMax := reqReg.aluUop(2)
  }

  logicUnit.req.src := VecInit(Seq(reduceInit, source2Select))
  logicUnit.req.opcode := reqReg.aluUop

  val flotReduceResult: Option[UInt] = Option.when(parameter.fpuEnable)(
    Mux(
      skipFlotReduce,
      reduceInit,
      Mux(NotAdd, flotCompare.get.io.out, floatAdder.get.io.out)
    )
  )
  // result mux: float units when floatType, else logic/adder
  reduceResult := Mux(
    floatType,
    flotReduceResult.getOrElse(adder.response.data),
    Mux(NotAdd, logicUnit.resp, adder.response.data)
  )

  out.valid := outValid && !pop
  out.bits.data := Mux(updateResult, reduceResult, reduceInit)
  // NOTE(review): Fill(4, ...) assumes datapathWidth == 32 (mask is datapathWidth/8 bits) — confirm.
  out.bits.mask := writeMask & Fill(4, validInst)
}
diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala
new file mode 100644
index 000000000..fa71fc1c8
--- /dev/null
+++ b/t1/src/mask/MaskUnit.scala
@@ -0,0 +1,1166 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: 2022 Jiuyang Liu

package org.chipsalliance.t1.rtl

import chisel3._
import chisel3.experimental.hierarchy.{instantiable, public}
import chisel3.util._
import org.chipsalliance.t1.rtl.decoder.Decoder
import org.chipsalliance.dwbb.stdlib.queue.{Queue, QueueIO}

// top uop decode
// uu ii x -> uu: unit index; ii: Internal encoding, x: additional encode

// slid & gather unit, need read vrf in mask unit(00)
// 00 00 x -> slid; x? up: down
// 00 01 x -> slid1; x? up: down
// 00 10 x -> gather; x? 16 : sew todo:(multi address check/ index -> data cache?)

// compress & viota unit & vmv(01)
// These instructions cannot extend their execution width indefinitely.
// 01 00 x -> x ? compress : viota
// 01 01 x -> vmv; x: write rd ?

// reduce unit(10) n + 8 + m -> n + 3 + m // Folded into datapath, then folded into sew
// The Reduce instruction folds the data.
// Considering the sequential addition, a state machine is needed to control it.
// 10 00 x -> adder; x: widen reduce?
// 10 01 x -> logic; x: dc
// 10 10 x -> floatAdder; x: order?
// 10 11 x -> flotCompare; x: dc

// extend unit & maskdestination(11)
// These instructions write an entire data path each time they are executed.
// 11 mm x -> s(z)ext; mm: multiple(00, 01, 10); x ?
sign : zero +// 11 11 1 -> maskdestination +@instantiable +class MaskUnit(parameter: T1Parameter) extends Module { + // todo: param + val readQueueSize: Int = 4 + val readVRFLatency: Int = 2 + val maskUnitWriteQueueSize: Int = 8 + + @public + val instReq: ValidIO[MaskUnitInstReq] = IO(Flipped(Valid(new MaskUnitInstReq(parameter)))) + + @public + val exeReq: Seq[ValidIO[MaskUnitExeReq]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Valid(new MaskUnitExeReq(parameter.laneParam)))) + } + + @public + val exeResp: Seq[ValidIO[VRFWriteRequest]] = Seq.tabulate(parameter.laneNumber) { _ => + IO( + Valid( + new VRFWriteRequest( + parameter.vrfParam.regNumBits, + parameter.laneParam.vrfOffsetBits, + parameter.instructionIndexBits, + parameter.datapathWidth + ) + ) + ) + } + + @public + val tokenIO: Seq[LaneTokenBundle] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(new LaneTokenBundle)) + } + + @public + val readChannel: Seq[DecoupledIO[VRFReadRequest]] = Seq.tabulate(parameter.laneNumber) { _ => + IO( + Decoupled( + new VRFReadRequest( + parameter.vrfParam.regNumBits, + parameter.laneParam.vrfOffsetBits, + parameter.instructionIndexBits + ) + ) + ) + } + + @public + val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Input(UInt(parameter.datapathWidth.W))) + } + + @public + val writeRD: ValidIO[UInt] = IO(Valid(UInt(parameter.datapathWidth.W))) + + @public + val lastReport: UInt = IO(Output(UInt(parameter.chainingSize.W))) + + // mask + @public + val lsuMaskInput: Vec[UInt] = IO(Output(Vec(parameter.lsuMSHRSize, UInt(parameter.maskGroupWidth.W)))) + + @public + val lsuMaskSelect: Vec[UInt] = + IO(Input(Vec(parameter.lsuMSHRSize, UInt(parameter.lsuParameters.maskGroupSizeBits.W)))) + + // mask + @public + val laneMaskInput: Vec[UInt] = IO(Output(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W)))) + + @public + val laneMaskSelect: Vec[UInt] = + IO(Input(Vec(parameter.laneNumber, 
UInt(parameter.laneParam.maskGroupSizeBits.W)))) + + @public + val laneMaskSewSelect: Vec[UInt] = IO(Input(Vec(parameter.laneNumber, UInt(2.W)))) + + @public + val v0UpdateVec = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Valid(new V0Update(parameter.laneParam)))) + } + + @public + val writeRDData: UInt = IO(Output(UInt(parameter.xLen.W))) + + @public + val gatherData: DecoupledIO[UInt] = IO(Decoupled(UInt(parameter.xLen.W))) + + @public + val gatherRead: Bool = IO(Input(Bool())) + + /** duplicate v0 for mask */ + val v0: Vec[UInt] = RegInit( + VecInit(Seq.fill(parameter.vLen / parameter.datapathWidth)(0.U(parameter.datapathWidth.W))) + ) + + // write v0(mask) + v0.zipWithIndex.foreach { case (data, index) => + // 属于哪个lane + val laneIndex: Int = index % parameter.laneNumber + // 取出写的端口 + val v0Write = v0UpdateVec(laneIndex) + // offset + val offset: Int = index / parameter.laneNumber + val maskExt = FillInterleaved(8, v0Write.bits.mask) + when(v0Write.valid && v0Write.bits.offset === offset.U) { + data := (data & (~maskExt).asUInt) | (maskExt & v0Write.bits.data) + } + } + + // mask update & select + // lane + // TODO: uarch doc for the regroup + val regroupV0: Seq[UInt] = Seq(4, 2, 1).map { groupSize => + VecInit( + cutUInt(v0.asUInt, groupSize) + .grouped(parameter.laneNumber) + .toSeq + .transpose + .map(seq => VecInit(seq).asUInt) + ).asUInt + } + laneMaskInput.zipWithIndex.foreach { case (input, index) => + val v0ForThisLane: Seq[UInt] = regroupV0.map(rv => cutUInt(rv, parameter.vLen / parameter.laneNumber)(index)) + val v0SelectBySew = Mux1H(UIntToOH(laneMaskSewSelect(index))(2, 0), v0ForThisLane) + input := cutUInt(v0SelectBySew, parameter.datapathWidth)(laneMaskSelect(index)) + } + + // lsu + lsuMaskInput.zip(lsuMaskSelect).foreach { case (data, index) => + data := cutUInt(v0.asUInt, parameter.maskGroupWidth)(index) + } + + val maskedWrite: BitLevelMaskWrite = Module(new BitLevelMaskWrite(parameter)) + + def gatherIndex(elementIndex: UInt, vlmul: 
UInt, sew: UInt): (UInt, UInt, UInt, UInt, Bool) = { + val intLMULInput: UInt = (1.U << vlmul(1, 0)).asUInt + val positionSize = parameter.laneParam.vlMaxBits - 1 + val dataPosition = (changeUIntSize(elementIndex, positionSize) << sew).asUInt(positionSize - 1, 0) + val sewOHInput = UIntToOH(sew)(2, 0) + + // The offset of the data starting position in 32 bits (currently only 32). + // Since the data may cross lanes, it will be optimized during fusion. + val dataOffset: UInt = (dataPosition(1) && sewOHInput(1, 0).orR) ## (dataPosition(0) && sewOHInput(0)) + val accessLane = if (parameter.laneNumber > 1) dataPosition(log2Ceil(parameter.laneNumber) + 1, 2) else 0.U(1.W) + // 32 bit / group + val dataGroup = (dataPosition >> (log2Ceil(parameter.laneNumber) + 2)).asUInt + val offsetWidth: Int = parameter.laneParam.vrfParam.vrfOffsetBits + val offset = dataGroup(offsetWidth - 1, 0) + val accessRegGrowth = (dataGroup >> offsetWidth).asUInt + val decimalProportion = offset ## accessLane + // 1/8 register + val decimal = decimalProportion(decimalProportion.getWidth - 1, 0.max(decimalProportion.getWidth - 3)) + + /** elementIndex needs to be compared with vlMax(vLen * lmul /sew) This calculation is too complicated We can change + * the angle. Calculate the increment of the read register and compare it with lmul to know whether the index + * exceeds vlMax. 
vlmul needs to distinguish between integers and floating points + */ + val overlap = + (vlmul(2) && decimal >= intLMULInput(3, 1)) || + (!vlmul(2) && accessRegGrowth >= intLMULInput) || + (elementIndex >> log2Ceil(parameter.vLen)).asUInt.orR + val notNeedRead = overlap + val reallyGrowth: UInt = changeUIntSize(accessRegGrowth, 3) + (dataOffset, accessLane, offset, reallyGrowth, notNeedRead) + } + val (dataOffset, accessLane, offset, reallyGrowth, notNeedRead) = + gatherIndex(instReq.bits.readFromScala, instReq.bits.vlmul, instReq.bits.sew) + val idle :: sRead :: wRead :: sResponse :: Nil = Enum(4) + val gatherReadState: UInt = RegInit(idle) + val gatherRequestFire: Bool = Wire(Bool()) + val gatherSRead: Bool = gatherReadState === sRead + val gatherWaiteRead: Bool = gatherReadState === wRead + val gatherResponse: Bool = gatherReadState === sResponse + val gatherDatOffset: UInt = RegEnable(dataOffset, 0.U, gatherRequestFire) + val gatherLane: UInt = RegEnable(accessLane, 0.U, gatherRequestFire) + val gatherOffset: UInt = RegEnable(offset, 0.U, gatherRequestFire) + val gatherGrowth: UInt = RegEnable(reallyGrowth, 0.U, gatherRequestFire) + + val instReg: MaskUnitInstReq = RegEnable(instReq.bits, 0.U.asTypeOf(instReq.bits), instReq.valid) + val enqMvRD: Bool = instReq.bits.decodeResult(Decoder.topUop) === BitPat("b01011") + val instVlValid: Bool = + RegEnable((instReq.bits.vl.orR || enqMvRD) && instReq.valid, false.B, instReq.valid || lastReport.orR) + gatherRequestFire := gatherReadState === idle && gatherRead && !instVlValid + // viota mask read vs2. 
Also pretending to be reading vs1 + val viotaReq: Bool = instReq.bits.decodeResult(Decoder.topUop) === "b01000".U + when(instReq.valid && (viotaReq || enqMvRD) || gatherRequestFire) { + instReg.vs1 := instReq.bits.vs2 + instReg.instructionIndex := instReq.bits.instructionIndex + } + // register for read vs1 + val readVS1Reg: MaskUnitReadVs1 = RegInit(0.U.asTypeOf(new MaskUnitReadVs1(parameter))) + val sew1H: UInt = UIntToOH(instReg.sew)(2, 0) + // request for read vs1 + val readVS1Req: MaskUnitReadReq = WireDefault(0.U.asTypeOf(new MaskUnitReadReq(parameter))) + + when(instReq.valid || gatherRequestFire) { + readVS1Reg.requestSend := false.B + readVS1Reg.dataValid := false.B + readVS1Reg.sendToExecution := false.B + readVS1Reg.readIndex := 0.U + } + + // from decode + val unitType: UInt = UIntToOH(instReg.decodeResult(Decoder.topUop)(4, 3)) + val subType: UInt = UIntToOH(instReg.decodeResult(Decoder.topUop)(2, 1)) + val readType: Bool = unitType(0) + val gather16: Bool = instReg.decodeResult(Decoder.topUop) === "b00101".U + val maskDestinationType: Bool = instReg.decodeResult(Decoder.topUop) === "b11000".U + val compress: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b0100?") + val viota: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b01000") + val mv: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b0101?") + val mvRd: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b01011") + val mvVd: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b01010") + val orderReduce: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b101?1") + val ffo: Bool = instReg.decodeResult(Decoder.topUop) === BitPat("b0111?") + val extendType: Bool = unitType(3) && (subType(2) || subType(1)) + val pop: Bool = instReg.decodeResult(Decoder.popCount) + + // Instructions for writing vd without source + val noSource: Bool = mv || viota + + val allGroupExecute: Bool = maskDestinationType || unitType(2) || compress || ffo + val useDefaultSew: Bool = 
unitType(0) && !gather16 + // todo: decode ? + // Indicates how many times a set of data will be executed + // 0 -> 4 times + // 1 -> 2 times + // 3 -> 1 times + val dataSplitSew: UInt = Mux1H( + Seq( + useDefaultSew -> instReg.sew, + // extend + (unitType(3) && subType(2)) -> 0.U, + (unitType(3) && subType(1) || gather16) -> 1.U, + allGroupExecute -> 2.U + ) + ) + + // Indicates that an element will use the width of the original data + val sourceDataUseDefaultSew: Bool = !(unitType(3) || gather16) + val sourceDataEEW: UInt = Mux1H( + Seq( + sourceDataUseDefaultSew -> instReg.sew, + // extend + unitType(3) -> (instReg.sew >> subType(2, 1)).asUInt, + gather16 -> 1.U + ) + ) + + // ExecuteIndex is only related to how many times it will be executed, so use [dataSplitSew] + val lastExecuteIndex: UInt = Mux1H(UIntToOH(dataSplitSew), Seq(3.U(2.W), 2.U(2.W), 0.U(2.W))) + + // calculate last group + val sourceDataEEW1H: UInt = UIntToOH(sourceDataEEW)(2, 0) + val lastElementIndex: UInt = (instReg.vl - instReg.vl.orR)(parameter.laneParam.vlMaxBits - 2, 0) + + val maskFormatSource: Bool = ffo || maskDestinationType + + // When one row is not enough, should we prioritize filling one lane? 
+ val prioritizeLane: Bool = ffo + + // Seq(1, 2, 4) => element byte size + val processingVl: Seq[(UInt, UInt)] = Seq(1, 2, 4).map { eByte => + val eByteLog = log2Ceil(eByte) + val lastByteIndex = (lastElementIndex << eByteLog).asUInt + // The width of a row of data + val rowWidth = parameter.datapathWidth * parameter.laneNumber / 8 + val rowWidthLog: Int = log2Ceil(rowWidth) + val lastGroupRemaining: UInt = changeUIntSize(lastByteIndex, rowWidthLog) + // get last group index + val lastRowIndex = (lastByteIndex >> rowWidthLog).asUInt + + // for last group remainder lastGroupRemaining + val laneDatalog = log2Ceil(parameter.datapathWidth) + val lastLaneIndex = (lastGroupRemaining >> laneDatalog).asUInt + val lastGroupDataNeed = scanRightOr(UIntToOH(lastLaneIndex)) + (lastRowIndex, lastGroupDataNeed) + } + + // mask format source, 1 bit/element + val processingMaskVl: Seq[(UInt, UInt)] = Seq(1).map { eBit => + val lastBitIndex = lastElementIndex + // The width of a row of data + val rowWidth = parameter.datapathWidth * parameter.laneNumber + val rowWidthLog: Int = log2Ceil(rowWidth) + val lastGroupRemaining: UInt = changeUIntSize(lastBitIndex, rowWidthLog) + val lastGroupMisAlign: Bool = lastGroupRemaining.orR + // get last group index + val lastRowIndex = (lastBitIndex >> rowWidthLog).asUInt + + // for prioritizeLane + // for last group remainder lastGroupRemaining + val laneDatalog = log2Ceil(parameter.datapathWidth) + val lastLaneIndex = (lastGroupRemaining >> laneDatalog).asUInt - + !changeUIntSize(lastGroupRemaining, laneDatalog).orR + val dataNeedForPL = scanRightOr(UIntToOH(lastLaneIndex)) + + // for !prioritizeLane + // Seq(4, 2, 1) => If it is in normal form, one datapath corresponds to several elements + val dataNeedForNPL = Mux1H( + sew1H, + Seq(4, 2, 1).map { eSize => + val eSizeLog = log2Ceil(eSize) + val misAlign = if (eSizeLog > 0) changeUIntSize(lastGroupRemaining, eSizeLog).orR else false.B + // How many datapaths will there be? 
+ val datapathSize = (lastGroupRemaining >> eSizeLog).asUInt +& misAlign + + val laneNumLog = log2Ceil(parameter.laneNumber) + // More than one group + val allNeed = (datapathSize >> laneNumLog).asUInt.orR + val lastLaneIndex = changeUIntSize(datapathSize, laneNumLog) + val dataNeed: UInt = (~scanLeftOr(UIntToOH(lastLaneIndex))).asUInt | Fill(parameter.laneNumber, allNeed) + dataNeed + } + ) + val lastGroupDataNeed = Mux(prioritizeLane, dataNeedForPL, dataNeedForNPL) + (lastRowIndex, lastGroupDataNeed) + } + + val reduceLastDataNeed: UInt = Mux1H( + sew1H, + Seq(1, 2, 4).map { eByte => + val eLog = log2Ceil(eByte) + // byte size per row + val rowByteSize = parameter.datapathWidth * parameter.laneNumber / 8 + // byte size for vl + val byteForVl = (instReg.vl << eLog).asUInt + + val vlMSB: Bool = (byteForVl >> log2Ceil(rowByteSize)).asUInt.orR + // Unaligned row parts + val vlLSB: UInt = changeUIntSize(instReg.vl, log2Ceil(rowByteSize)) + + val dLog = log2Ceil(parameter.datapathWidth / 8) + // How many datapaths does LSB contain? 
+ val lsbDSize = (vlLSB >> dLog).asUInt - !changeUIntSize(vlLSB, dLog).orR + scanRightOr(UIntToOH(lsbDSize)) | Fill(parameter.laneNumber, vlMSB) + } + ) + + val dataSourceSew: UInt = Mux( + unitType(3), + instReg.sew - instReg.decodeResult(Decoder.topUop)(2, 1), + Mux(gather16, 1.U, instReg.sew) + ) + val dataSourceSew1H: UInt = UIntToOH(dataSourceSew)(2, 0) + + val unorderReduce: Bool = !orderReduce && unitType(2) + val normalFormat: Bool = !maskFormatSource && !unorderReduce && !mv + val lastGroupForInstruction: UInt = Mux1H( + Seq( + (unorderReduce || mv) -> 0.U, + maskFormatSource -> processingMaskVl.head._1, + (normalFormat && dataSourceSew1H(0)) -> processingVl.head._1, + (normalFormat && dataSourceSew1H(1)) -> processingVl(1)._1, + (normalFormat && dataSourceSew1H(2)) -> processingVl(2)._1 + ) + ) + + val popDataNeed: UInt = { + val dataPathBit = log2Ceil(parameter.datapathWidth) + val lastLaneIndex = (lastElementIndex >> dataPathBit).asUInt + scanRightOr(UIntToOH(lastLaneIndex)) + } + val lastGroupDataNeed: UInt = Mux1H( + Seq( + (unorderReduce && pop) -> popDataNeed, + (unorderReduce && !pop) -> reduceLastDataNeed, + maskFormatSource -> processingMaskVl.head._2, + (normalFormat && dataSourceSew1H(0)) -> processingVl.head._2, + (normalFormat && dataSourceSew1H(1)) -> processingVl(1)._2, + (normalFormat && dataSourceSew1H(2)) -> processingVl(2)._2 + ) + ) + + val groupSizeForMaskDestination: Int = parameter.laneNumber * parameter.datapathWidth + val elementTailForMaskDestination: UInt = lastElementIndex(log2Ceil(groupSizeForMaskDestination) - 1, 0) + + val exeRequestQueue: Seq[QueueIO[MaskUnitExeReq]] = exeReq.zipWithIndex.map { case (req, index) => + // todo: max or token? 
+ val queue: QueueIO[MaskUnitExeReq] = + Queue.io(chiselTypeOf(req.bits), parameter.laneParam.maskRequestQueueSize, flow = true) + tokenIO(index).maskRequestRelease := queue.deq.fire + queue.enq.valid := req.valid + queue.enq.bits := req.bits + queue + } + + val exeReqReg: Seq[ValidIO[MaskUnitExeReq]] = Seq.tabulate(parameter.laneNumber) { _ => + RegInit(0.U.asTypeOf(Valid(new MaskUnitExeReq(parameter.laneParam)))) + } + val requestCounter: UInt = RegInit(0.U(parameter.laneParam.groupNumberBits.W)) + val executeGroupCounter: UInt = Wire(UInt(parameter.laneParam.groupNumberBits.W)) + + val counterValid: Bool = requestCounter <= lastGroupForInstruction + val lastGroup: Bool = + requestCounter === lastGroupForInstruction || (!orderReduce && unitType(2)) || mv + val slideAddressGen: SlideIndexGen = Module(new SlideIndexGen(parameter)) + slideAddressGen.newInstruction := instReq.valid & instReq.bits.vl.orR + slideAddressGen.instructionReq := instReg + slideAddressGen.slideMaskInput := cutUInt(v0.asUInt, 8)(slideAddressGen.slideGroupOut) + + val firstRequest: Bool = RegInit(false.B) + val viotaCounterAdd: Bool = Wire(Bool()) + val groupCounterAdd: Bool = Mux(noSource, viotaCounterAdd, exeRequestQueue.head.deq.fire && firstRequest) + when(exeRequestQueue.head.deq.fire || instReq.valid) { + firstRequest := exeRequestQueue.head.deq.fire + } + when(instReq.valid || groupCounterAdd) { + requestCounter := Mux(instReq.valid, 0.U, requestCounter + 1.U) + } + + // todo: mask + val groupDataNeed: UInt = Mux(lastGroup, lastGroupDataNeed, (-1.S(parameter.laneNumber.W)).asUInt) + // For read type, only sew * laneNumber data will be consumed each time + // There will be a maximum of (dataPath * laneNumber) / (sew * laneNumber) times + val executeIndex: UInt = RegInit(0.U(2.W)) + // The status of an execution + // Each execution ends with executeIndex + 1 + val readIssueStageState: MaskUnitReadState = RegInit(0.U.asTypeOf(new MaskUnitReadState(parameter))) + val readIssueStageValid: 
Bool = RegInit(false.B) + + def indexAnalysis(sewInt: Int)(elementIndex: UInt, vlmul: UInt, valid: Option[Bool] = None): Seq[UInt] = { + val intLMULInput: UInt = (1.U << vlmul(1, 0)).asUInt + val positionSize = parameter.laneParam.vlMaxBits - 1 + val dataPosition = (changeUIntSize(elementIndex, positionSize) << sewInt).asUInt(positionSize - 1, 0) + val accessMask: UInt = Seq( + UIntToOH(dataPosition(1, 0)), + FillInterleaved(2, UIntToOH(dataPosition(1))), + 15.U(4.W) + )(sewInt) + // The offset of the data starting position in 32 bits (currently only 32). + // Since the data may cross lanes, it will be optimized during fusion. + // (dataPosition(1) && sewOHInput(1, 0).orR) ## (dataPosition(0) && sewOHInput(0)) + val dataOffset: UInt = + (if (sewInt < 2) dataPosition(1) else false.B) ## + (if (sewInt == 0) dataPosition(0) else false.B) + val accessLane = if (parameter.laneNumber > 1) dataPosition(log2Ceil(parameter.laneNumber) + 1, 2) else 0.U(1.W) + // 32 bit / group + val dataGroup = (dataPosition >> (log2Ceil(parameter.laneNumber) + 2)).asUInt + val offsetWidth: Int = parameter.laneParam.vrfParam.vrfOffsetBits + val offset = dataGroup(offsetWidth - 1, 0) + val accessRegGrowth = (dataGroup >> offsetWidth).asUInt + val decimalProportion = offset ## accessLane + // 1/8 register + val decimal = decimalProportion(decimalProportion.getWidth - 1, 0.max(decimalProportion.getWidth - 3)) + + /** elementIndex needs to be compared with vlMax(vLen * lmul /sew) This calculation is too complicated We can change + * the angle. Calculate the increment of the read register and compare it with lmul to know whether the index + * exceeds vlMax. 
vlmul needs to distinguish between integers and floating points + */ + val overlap = + (vlmul(2) && decimal >= intLMULInput(3, 1)) || + (!vlmul(2) && accessRegGrowth >= intLMULInput) || + (elementIndex >> log2Ceil(parameter.vLen)).asUInt.orR + val elementValid = valid.getOrElse(true.B) + val notNeedRead = overlap || !elementValid + val reallyGrowth: UInt = changeUIntSize(accessRegGrowth, 3) + Seq(accessMask, dataOffset, accessLane, offset, reallyGrowth, notNeedRead, elementValid) + } + + // datapath bit per mask group + // laneNumber bit per execute group + val executeGroup: UInt = Mux1H( + UIntToOH(dataSplitSew)(2, 0), + Seq( + requestCounter ## executeIndex, + requestCounter ## executeIndex(1), + requestCounter + ) + ) + + // read vl boundary + val executeSizeBit: Int = log2Ceil(parameter.laneNumber) + val vlMisAlign = instReg.vl(executeSizeBit - 1, 0).orR + val lastexecuteGroup: UInt = (instReg.vl >> executeSizeBit).asUInt - !vlMisAlign + val isVlBoundary: Bool = executeGroup === lastexecuteGroup + val validExecuteGroup: Bool = executeGroup <= lastexecuteGroup + val vlBoundaryCorrection: UInt = Mux( + vlMisAlign && isVlBoundary, + (~scanLeftOr(UIntToOH(instReg.vl(executeSizeBit - 1, 0)))).asUInt, + -1.S(parameter.laneNumber.W).asUInt + ) & Fill(parameter.laneNumber, validExecuteGroup) + + // handle mask + val readMaskSelect: UInt = + (executeGroup >> log2Ceil(parameter.datapathWidth / parameter.laneNumber)).asUInt + val readMaskInput: UInt = cutUInt(v0.asUInt, parameter.maskGroupWidth)(readMaskSelect) + val selectReadStageMask: UInt = cutUIntBySize(readMaskInput, 4)(executeGroup(1, 0)) + val readMaskCorrection: UInt = + Mux(instReg.maskType, selectReadStageMask, -1.S(parameter.laneNumber.W).asUInt) & + vlBoundaryCorrection + + // write mask for normal execute + val maskSplit = Seq(0, 1, 2).map { sewInt => + // byte / element + val dataByte = 1 << sewInt + val rowElementSize: Int = parameter.laneNumber * parameter.datapathWidth / dataByte / 8 + val maskSelect = 
cutUInt(v0.asUInt, rowElementSize)(executeGroupCounter) + + val executeSizeBit: Int = log2Ceil(rowElementSize) + val vlMisAlign = instReg.vl(executeSizeBit - 1, 0).orR + val lastexecuteGroup: UInt = (instReg.vl >> executeSizeBit).asUInt - !vlMisAlign + val isVlBoundary: Bool = executeGroupCounter === lastexecuteGroup + val validExecuteGroup: Bool = executeGroupCounter <= lastexecuteGroup + val vlBoundaryCorrection: UInt = maskEnable( + vlMisAlign && isVlBoundary, + (~scanLeftOr(UIntToOH(instReg.vl(executeSizeBit - 1, 0)))).asUInt + ) & Fill(rowElementSize, validExecuteGroup) + val elementMask = maskEnable(instReg.maskType, maskSelect) & vlBoundaryCorrection + val byteMask = FillInterleaved(dataByte, elementMask) + (byteMask, elementMask) + } + val executeByteMask: UInt = Mux1H(sew1H, maskSplit.map(_._1)) + val executeElementMask: UInt = Mux1H(sew1H, maskSplit.map(_._2)) + + // mask for destination + val maskForDestination: UInt = cutUInt(v0.asUInt, groupSizeForMaskDestination)(requestCounter) + val lastGroupMask: UInt = scanRightOr(UIntToOH(elementTailForMaskDestination)) + val currentMaskGroupForDestination: UInt = maskEnable(lastGroup, lastGroupMask) & + maskEnable(instReg.maskType && !instReg.decodeResult(Decoder.maskSource), maskForDestination) + + // select source & valid + val minSourceSize: Int = 8 * parameter.laneNumber + val minValidSize = parameter.laneNumber + val groupSourceData: UInt = VecInit(exeReqReg.map(_.bits.source1)).asUInt + val groupSourceValid: UInt = VecInit(exeReqReg.map(_.valid)).asUInt + val shifterSize: UInt = Wire(UInt(2.W)) + shifterSize := Mux1H( + sourceDataEEW1H(1, 0), + Seq( + executeIndex, + executeIndex(1) ## false.B + ) + ) + val shifterSource: UInt = Mux1H( + UIntToOH(shifterSize), + Seq( + groupSourceData, + (groupSourceData >> minSourceSize).asUInt, + (groupSourceData >> (minSourceSize * 2)).asUInt, + (groupSourceData >> (minSourceSize * 3)).asUInt + ) + ) + val selectValid: UInt = Mux1H( + sourceDataEEW1H, + Seq( + 
cutUIntBySize(FillInterleaved(4, groupSourceValid), 4)(executeIndex), + cutUIntBySize(FillInterleaved(2, groupSourceValid), 2)(executeIndex(1)), + groupSourceValid + ) + ) + val source: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) + source.zipWithIndex.foreach { case (d, i) => + d := Mux1H( + sourceDataEEW1H, + Seq( + cutUInt(shifterSource, 8)(i), + cutUInt(shifterSource, 16)(i), + cutUInt(shifterSource, 32)(i) + ) + ) + } + + val checkVec: Seq[Seq[UInt]] = Seq(0, 1, 2).map { sewInt => + val validVec = selectValid & readMaskCorrection + // read index check + // (accessMask, dataOffset, accessLane, offset, reallyGrowth, overlap) + val checkResultVec: Seq[Seq[UInt]] = source.zipWithIndex.map { case (s, i) => + indexAnalysis(sewInt)(s, instReg.vlmul, Some(validVec(i))) + } + val checkResult = checkResultVec.transpose.map(a => VecInit(a).asUInt) + checkResult + } + val sewCorrection1H: UInt = sew1H + val dataOffsetSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(1))) + val accessLaneSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(2))) + val offsetSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(3))) + val growthSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(4))) + val notReadSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(5))) + val elementValidSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(6))) + + val readCrossBar: MaskUnitReadCrossBar = Module(new MaskUnitReadCrossBar(parameter)) + + // read data queue deq release + val readTokenRelease: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) + + // todo: param + val readDataQueueSize: Int = 8 + + // The queue waiting to read data. This queue contains other information about this group. + // 64: todo: max or token? 
+ val readWaitQueue: QueueIO[MaskUnitWaitReadQueue] = Queue.io(new MaskUnitWaitReadQueue(parameter), 64) + + // s0 pipe request from lane + val lastExecuteGroupDeq: Bool = Wire(Bool()) + exeRequestQueue.zip(exeReqReg).foreach { case (req, reg) => + req.deq.ready := !reg.valid || lastExecuteGroupDeq || viota + when(req.deq.fire) { + reg.bits := req.deq.bits + } + when(req.deq.fire ^ lastExecuteGroupDeq) { + reg.valid := req.deq.fire && !viota + } + } + + val isLastExecuteGroup: Bool = executeIndex === lastExecuteIndex + val allDataValid: Bool = exeReqReg.zipWithIndex.map { case (d, i) => d.valid || !groupDataNeed(i) }.reduce(_ && _) + val anyDataValid: Bool = exeReqReg.zipWithIndex.map { case (d, i) => d.valid }.reduce(_ || _) + + // try to read vs1 + val readVs1Valid: Bool = + (unitType(2) || compress || mvRd) && !readVS1Reg.requestSend || gatherSRead + readVS1Req.vs := instReg.vs1 + when(compress) { + val logLaneNumber = log2Ceil(parameter.laneNumber) + readVS1Req.vs := instReg.vs1 + (readVS1Reg.readIndex >> (parameter.laneParam.vrfOffsetBits + logLaneNumber)) + readVS1Req.offset := readVS1Reg.readIndex >> logLaneNumber + readVS1Req.readLane := changeUIntSize(readVS1Reg.readIndex, logLaneNumber) + }.elsewhen(gatherSRead) { + readVS1Req.vs := instReg.vs1 + gatherGrowth + readVS1Req.offset := gatherOffset + readVS1Req.readLane := gatherLane + readVS1Req.dataOffset := gatherDatOffset + } + + // select execute group + val pipeReadFire: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) + val selectExecuteReq: Seq[ValidIO[MaskUnitReadReq]] = exeReqReg.zipWithIndex.map { case (_, index) => + val res: ValidIO[MaskUnitReadReq] = WireInit(0.U.asTypeOf(Valid(new MaskUnitReadReq(parameter)))) + res.bits.vs := instReg.vs2 + readIssueStageState.vsGrowth(index) + if (parameter.laneParam.vrfOffsetBits > 0) { + res.bits.offset := cutUIntBySize(readIssueStageState.readOffset, parameter.laneNumber)(index) + } + res.bits.readLane := readIssueStageState.accessLane(index) + 
res.bits.dataOffset := cutUIntBySize(readIssueStageState.readDataOffset, parameter.laneNumber)(index) + res.bits.requestIndex := index.U + res.valid := readIssueStageValid && !readIssueStageState.groupReadState(index) && + readIssueStageState.needRead(index) && unitType(0) + if (index == 0) { + when(readVs1Valid) { + res.valid := true.B + res.bits := readVS1Req + } + pipeReadFire(index) := !readVs1Valid && readCrossBar.input(index).fire + } else { + pipeReadFire(index) := readCrossBar.input(index).fire + } + res + } + + when(readCrossBar.input.head.fire) { + readVS1Reg.requestSend := true.B + } + + // read arbitration + readCrossBar.input.zip(selectExecuteReq).zipWithIndex.foreach { case ((cross, req), index) => + // read token + val tokenCheck: Bool = pipeToken(readDataQueueSize)(cross.fire, readTokenRelease(index)) + cross.valid := req.valid && tokenCheck + cross.bits := req.bits + } + + // read control register update + val readFire: UInt = pipeReadFire.asUInt + val anyReadFire: Bool = readFire.orR + val readStateUpdate: UInt = readFire | readIssueStageState.groupReadState + val groupReadFinish: Bool = readStateUpdate === readIssueStageState.needRead + val readTypeRequestDeq: Bool = + (anyReadFire && groupReadFinish) || (readIssueStageValid && readIssueStageState.needRead === 0.U) + + val noSourceValid: Bool = noSource && counterValid && + (instReg.vl.orR || (mvRd && !readVS1Reg.sendToExecution)) + val vs1DataValid: Bool = readVS1Reg.dataValid || !(unitType(2) || compress || mvRd) + val executeReady: Bool = Wire(Bool()) + val executeDeqReady: Bool = VecInit(maskedWrite.in.map(_.ready)).asUInt.andR + val otherTypeRequestDeq: Bool = + Mux(noSource, noSourceValid, allDataValid) && + vs1DataValid && instVlValid && executeDeqReady + val requestStageDeq: Bool = Mux(readType, readTypeRequestDeq, otherTypeRequestDeq && executeReady) + val readIssueStageEnq: Bool = + (allDataValid || slideAddressGen.indexDeq.valid) && + (readTypeRequestDeq || !readIssueStageValid) && 
instVlValid && readType + slideAddressGen.indexDeq.ready := readTypeRequestDeq || !readIssueStageValid + when(anyReadFire) { + readIssueStageState.groupReadState := readStateUpdate + } + + when(readTypeRequestDeq ^ readIssueStageEnq) { + readIssueStageValid := readIssueStageEnq + } + + val executeIndexGrowth: UInt = (1.U << dataSplitSew).asUInt + when(requestStageDeq && anyDataValid) { + executeIndex := executeIndex + executeIndexGrowth + } + when(readIssueStageEnq) { + readIssueStageState.groupReadState := 0.U + readIssueStageState.needRead := (~notReadSelect).asUInt + readIssueStageState.elementValid := elementValidSelect + readIssueStageState.replaceVs1 := 0.U + readIssueStageState.accessLane := cutUIntBySize(accessLaneSelect, parameter.laneNumber) + readIssueStageState.vsGrowth := cutUIntBySize(growthSelect, parameter.laneNumber) + readIssueStageState.readOffset := offsetSelect + readIssueStageState.executeGroup := executeGroup + readIssueStageState.readDataOffset := dataOffsetSelect + readIssueStageState.last := isVlBoundary + when(slideAddressGen.indexDeq.fire) { + readIssueStageState := slideAddressGen.indexDeq.bits + } + } + + readWaitQueue.enq.valid := readTypeRequestDeq + readWaitQueue.enq.bits.executeGroup := readIssueStageState.executeGroup + readWaitQueue.enq.bits.sourceValid := readIssueStageState.elementValid + readWaitQueue.enq.bits.replaceVs1 := readIssueStageState.replaceVs1 + readWaitQueue.enq.bits.needRead := readIssueStageState.needRead + readWaitQueue.enq.bits.last := readIssueStageState.last + + // last execute group in this request group dequeue + lastExecuteGroupDeq := requestStageDeq && isLastExecuteGroup + + // s1 read vrf + val write1HPipe: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.laneNumber.W))) + val pipeDataOffset: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(log2Ceil(parameter.datapathWidth / 8).W))) + + readCrossBar.output.zipWithIndex.foreach { case (request, index) => + val sourceLane = 
UIntToOH(request.bits.writeIndex) + readChannel(index).valid := request.valid + readChannel(index).bits.readSource := 2.U + readChannel(index).bits.vs := request.bits.vs + readChannel(index).bits.offset := request.bits.offset + readChannel(index).bits.instructionIndex := instReg.instructionIndex + request.ready := readChannel(index).ready + + maskedWrite.readChannel(index).ready := readChannel(index).ready + maskedWrite.readResult(index) := readResult(index) + when(maskDestinationType) { + readChannel(index).valid := maskedWrite.readChannel(index).valid + readChannel(index).bits.vs := maskedWrite.readChannel(index).bits.vs + readChannel(index).bits.offset := maskedWrite.readChannel(index).bits.offset + } + + // pipe read fire + val pipeRead = Pipe( + readChannel(index).fire && !maskDestinationType, + sourceLane, + readVRFLatency + ) + val pipeOffset = Pipe(readChannel(index).fire, request.bits.dataOffset, readVRFLatency) + write1HPipe(index) := Mux(pipeRead.valid, pipeRead.bits, 0.U(parameter.laneNumber.W)) + pipeDataOffset(index) := pipeOffset.bits + } + + // Processing read results + val readData: Seq[DecoupledIO[UInt]] = Seq.tabulate(parameter.laneNumber) { index => + val readDataQueue = Queue.io(UInt(parameter.datapathWidth.W), readDataQueueSize, flow = true) + val readResultSelect = VecInit(write1HPipe.map(_(index))).asUInt + val dataOffset: UInt = Mux1H(readResultSelect, pipeDataOffset) + readTokenRelease(index) := readDataQueue.deq.fire + readDataQueue.enq.valid := readResultSelect.orR + readDataQueue.enq.bits := Mux1H(readResultSelect, readResult) >> (dataOffset ## 0.U(3.W)) + readDataQueue.deq + } + + /** todo: [[waiteReadDataPipeReg]] enq && [[readWaitQueue]] enq * */ + // reg before execute + val waiteReadDataPipeReg: MaskUnitWaitReadQueue = RegInit(0.U.asTypeOf(new MaskUnitWaitReadQueue(parameter))) + val waiteReadData: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => RegInit(0.U(parameter.datapathWidth.W)) } + val waiteReadSate: UInt = 
RegInit(0.U(parameter.laneNumber.W)) + val waiteReadStageValid: Bool = RegInit(false.B) + + // Process the data that needs to be written + val dlen: Int = parameter.datapathWidth * parameter.laneNumber + // Execute at most 4 times, each index represents 1/4 of dlen + val eachIndexSize = dlen / 4 + val executeIndexVec: Seq[UInt] = Seq( + waiteReadDataPipeReg.executeGroup(1, 0), + waiteReadDataPipeReg.executeGroup(0) ## false.B, + false.B + ) + val writeDataVec = Seq(0, 1, 2).map { sewInt => + val dataByte = 1 << sewInt + val data = VecInit(Seq.tabulate(parameter.laneNumber) { laneIndex => + val dataElement: UInt = Wire(UInt((dataByte * 8).W)) + val dataIsRead = waiteReadDataPipeReg.needRead(laneIndex) + val unreadData = Mux(waiteReadDataPipeReg.replaceVs1(laneIndex), instReg.readFromScala, 0.U) + + dataElement := Mux(dataIsRead, waiteReadData(laneIndex), unreadData) + dataElement + }).asUInt + val executeIndex = executeIndexVec(sewInt) + val shifterData = (data << (executeIndex ## 0.U(log2Ceil(eachIndexSize).W))).asUInt + // align + changeUIntSize(shifterData, dlen) + } + val writeData = Mux1H(sew1H, writeDataVec) + + val writeMaskVec: Seq[UInt] = Seq(0, 1, 2).map { sewInt => + val MaskMagnification = 1 << sewInt + val mask = FillInterleaved(MaskMagnification, waiteReadDataPipeReg.sourceValid) + val executeIndex = executeIndexVec(sewInt) + val shifterMask = (mask << (executeIndex ## 0.U(log2Ceil(eachIndexSize / 8).W))).asUInt + // align + changeUIntSize(shifterMask, dlen / 8) + } + val writeMask = Mux1H(sew1H, writeMaskVec) + + val writeRequest: Seq[MaskUnitExeResponse] = Seq.tabulate(parameter.laneNumber) { laneIndex => + val res: MaskUnitExeResponse = Wire(new MaskUnitExeResponse(parameter.laneParam)) + res.ffoByOther := DontCare + res.pipeData := DontCare + res.index := instReg.instructionIndex + res.writeData.groupCounter := (waiteReadDataPipeReg.executeGroup << instReg.sew >> 2).asUInt + res.writeData.vd := instReg.vd + res.writeData.data := 
cutUIntBySize(writeData, parameter.laneNumber)(laneIndex) + res.writeData.mask := cutUIntBySize(writeMask, parameter.laneNumber)(laneIndex) + res + } + val WillWriteLane: UInt = VecInit(cutUIntBySize(writeMask, parameter.laneNumber).map(_.orR)).asUInt + + // update waite read stage + val waiteStageDeqValid: Bool = + waiteReadStageValid && + (waiteReadSate === waiteReadDataPipeReg.needRead || waiteReadDataPipeReg.needRead === 0.U) + val waiteStageDeqReady: Bool = Wire(Bool()) + val waiteStageDeqFire: Bool = waiteStageDeqValid && waiteStageDeqReady + + val waiteStageEnqReady: Bool = !waiteReadStageValid || waiteStageDeqFire + val waiteStageEnqFire: Bool = readWaitQueue.deq.valid && waiteStageEnqReady + + readWaitQueue.deq.ready := waiteStageEnqReady + + when(waiteStageEnqFire) { + waiteReadDataPipeReg := readWaitQueue.deq.bits + } + + when(waiteStageDeqFire ^ waiteStageEnqFire) { + waiteReadStageValid := waiteStageEnqFire + } + + waiteReadData.zipWithIndex.foreach { case (reg, index) => + val isWaiteForThisData = waiteReadDataPipeReg.needRead(index) && !waiteReadSate(index) && waiteReadStageValid + val read = readData(index) + read.ready := isWaiteForThisData + if (index == 0) { + read.ready := isWaiteForThisData || unitType(2) || compress || gatherWaiteRead || mvRd + when(read.fire) { + readVS1Reg.data := read.bits + readVS1Reg.dataValid := true.B + when(gatherWaiteRead) { + gatherReadState := sResponse + } + } + } + when(read.fire) { + reg := read.bits + } + } + val readResultValid: UInt = VecInit(readData.map(_.fire)).asUInt + when(waiteStageEnqFire && readResultValid.orR) { + waiteReadSate := readResultValid + }.elsewhen(readResultValid.orR) { + waiteReadSate := waiteReadSate | readResultValid + }.elsewhen(waiteStageEnqFire) { + waiteReadSate := 0.U + } + + // Determine whether the data is ready + val executeEnqValid: Bool = otherTypeRequestDeq && !readType + + // start execute + val compressUnit: MaskCompress = Module(new MaskCompress(parameter)) + val 
reduceUnit: MaskReduce = Module(new MaskReduce(parameter)) + val extendUnit: MaskExtend = Module(new MaskExtend(parameter)) + + // todo + val source2: UInt = VecInit(exeReqReg.map(_.bits.source2)).asUInt + val source1: UInt = VecInit(exeReqReg.map(_.bits.source1)).asUInt + + // compress data + // compress executes a whole set of data + val vs1Split: Seq[(UInt, Bool)] = Seq(0, 1, 2).map { sewInt => + val dataByte = 1 << sewInt + // For compress, a set of data requires vs1Size bits of vs1 + val vs1Size = (parameter.datapathWidth / 8) * parameter.laneNumber / dataByte + // How many sets of vs1 can a dataPath have? + val setSize = parameter.datapathWidth / vs1Size + val vs1SetIndex: UInt = + if (parameter.datapathWidth <= vs1Size) true.B + else + requestCounter(log2Ceil(setSize) - 1, 0) + val selectVS1: UInt = + if (parameter.datapathWidth <= vs1Size) readVS1Reg.data + else + cutUIntBySize(readVS1Reg.data, setSize)(vs1SetIndex) + val willChangeVS1Index = vs1SetIndex.andR + (selectVS1, willChangeVS1Index) + } + + val compressSource1: UInt = Mux1H(sew1H, vs1Split.map(_._1)) + val source1Select: UInt = Mux(mv, readVS1Reg.data, compressSource1) + val source1Change: Bool = Mux1H(sew1H, vs1Split.map(_._2)) + when(source1Change && compressUnit.in.fire) { + readVS1Reg.dataValid := false.B + readVS1Reg.requestSend := false.B + readVS1Reg.readIndex := readVS1Reg.readIndex + 1.U + + } + viotaCounterAdd := compressUnit.in.fire + + compressUnit.in.valid := executeEnqValid && unitType(1) + compressUnit.in.bits.maskType := instReg.maskType + compressUnit.in.bits.eew := instReg.sew + compressUnit.in.bits.uop := instReg.decodeResult(Decoder.topUop) + compressUnit.in.bits.readFromScalar := instReg.readFromScala + compressUnit.in.bits.source1 := source1Select + compressUnit.in.bits.mask := executeElementMask + compressUnit.in.bits.source2 := source2 + compressUnit.in.bits.groupCounter := requestCounter + compressUnit.in.bits.lastCompress := lastGroup + compressUnit.in.bits.ffoInput := 
VecInit(exeReqReg.map(_.bits.ffo)).asUInt + compressUnit.in.bits.validInput := VecInit(exeReqReg.map(_.valid)).asUInt + compressUnit.newInstruction := instReq.valid + compressUnit.ffoInstruction := instReq.bits.decodeResult(Decoder.topUop)(2, 0) === BitPat("b11?") + + reduceUnit.in.valid := executeEnqValid && unitType(2) + reduceUnit.in.bits.maskType := instReg.maskType + reduceUnit.in.bits.eew := instReg.sew + reduceUnit.in.bits.uop := instReg.decodeResult(Decoder.topUop) + reduceUnit.in.bits.readVS1 := readVS1Reg.data + reduceUnit.in.bits.source2 := source2 + reduceUnit.in.bits.sourceValid := VecInit(exeReqReg.map(_.valid)).asUInt + reduceUnit.in.bits.lastGroup := lastGroup + reduceUnit.in.bits.vxrm := instReg.vxrm + reduceUnit.in.bits.aluUop := instReg.decodeResult(Decoder.uop) + reduceUnit.in.bits.sign := !instReg.decodeResult(Decoder.unsigned1) + reduceUnit.firstGroup := !readVS1Reg.sendToExecution && reduceUnit.in.fire + reduceUnit.newInstruction := instReq.fire + reduceUnit.validInst := instReg.vl.orR + reduceUnit.pop := pop + + reduceUnit.in.bits.fpSourceValid.foreach { sink => + sink := VecInit(exeReqReg.map(_.bits.fpReduceValid.get)).asUInt + } + + when(reduceUnit.in.fire || compressUnit.in.fire) { + readVS1Reg.sendToExecution := true.B + } + + val extendGroupCount: UInt = Mux( + extendType, + Mux( + subType(2), + requestCounter ## executeIndex, + requestCounter ## executeIndex(1) + ), + requestCounter + ) + extendUnit.in.eew := instReg.sew + extendUnit.in.uop := instReg.decodeResult(Decoder.topUop) + extendUnit.in.source2 := source2 + extendUnit.in.groupCounter := extendGroupCount + + val executeResult: UInt = Mux1H( + unitType(3, 1), + Seq( + compressUnit.out.data, + reduceUnit.out.bits.data, + extendUnit.out + ) + ) + + // todo + executeReady := Mux1H( + unitType, + Seq( + true.B, // read type + true.B, // compress + reduceUnit.in.ready && readVS1Reg.dataValid, // reduce + executeEnqValid // extend unit + ) + ) + + val executeValid: Bool = Mux1H( + 
unitType(3, 1), + Seq( + compressUnit.out.compressValid, + false.B, + executeEnqValid + ) + ) + + executeGroupCounter := Mux1H( + unitType(3, 1), + Seq( + requestCounter, + requestCounter, + extendGroupCount + ) + ) + + val executeDeqGroupCounter: UInt = Mux1H( + unitType(3, 1), + Seq( + compressUnit.out.groupCounter, + requestCounter, + extendGroupCount + ) + ) + + val executeWriteByteMask: UInt = Mux(compress || ffo || mvVd, compressUnit.out.mask, executeByteMask) + maskedWrite.needWAR := maskDestinationType + maskedWrite.vd := instReg.vd + maskedWrite.in.zipWithIndex.foreach { case (req, index) => + val bitMask = cutUInt(currentMaskGroupForDestination, parameter.datapathWidth)(index) + val maskFilter = !maskDestinationType || bitMask.orR + req.valid := executeValid && maskFilter + req.bits.mask := cutUIntBySize(executeWriteByteMask, parameter.laneNumber)(index) + req.bits.data := cutUInt(executeResult, parameter.datapathWidth)(index) + req.bits.pipeData := exeReqReg(index).bits.source1 + req.bits.bitMask := bitMask + req.bits.groupCounter := executeDeqGroupCounter + req.bits.ffoByOther := compressUnit.out.ffoOutput(index) && ffo + if (index == 0) { + // reduce result + when(unitType(2)) { + req.valid := reduceUnit.out.valid + req.bits.mask := reduceUnit.out.bits.mask + req.bits.data := reduceUnit.out.bits.data + req.bits.groupCounter := 0.U + } + } + } + + // mask unit write queue + val writeQueue: Seq[QueueIO[MaskUnitExeResponse]] = Seq.tabulate(parameter.laneNumber) { _ => + Queue.io(new MaskUnitExeResponse(parameter.laneParam), maskUnitWriteQueueSize) + } + + writeQueue.zipWithIndex.foreach { case (queue, index) => + val readTypeWriteVrf: Bool = waiteStageDeqFire && WillWriteLane(index) + queue.enq.valid := maskedWrite.out(index).valid || readTypeWriteVrf + maskedWrite.out(index).ready := queue.enq.ready + queue.enq.bits := maskedWrite.out(index).bits + when(readTypeWriteVrf) { + queue.enq.bits := writeRequest(index) + } + queue.enq.bits.index := 
instReg.instructionIndex + + // write token + val tokenCounter = RegInit(0.U(log2Ceil(parameter.maskUnitVefWriteQueueSize + 1).W)) + val tokenAllow: Bool = queue.deq.fire + val counterChange: UInt = Mux(tokenAllow, 1.U, -1.S(tokenCounter.getWidth.W).asUInt) + when(tokenAllow ^ tokenIO(index).maskResponseRelease) { + tokenCounter := tokenCounter + counterChange + } + // write vrf + val writePort = exeResp(index) + queue.deq.ready := !tokenCounter.asBools.last + writePort.valid := tokenAllow + writePort.bits.last := DontCare + writePort.bits.instructionIndex := instReg.instructionIndex + writePort.bits.data := Mux(queue.deq.bits.ffoByOther, queue.deq.bits.pipeData, queue.deq.bits.writeData.data) + writePort.bits.mask := queue.deq.bits.writeData.mask + writePort.bits.vd := instReg.vd + queue.deq.bits.writeData.groupCounter( + parameter.laneParam.groupNumberBits - 1, + parameter.laneParam.vrfOffsetBits + ) + writePort.bits.offset := queue.deq.bits.writeData.groupCounter + } + waiteStageDeqReady := writeQueue.zipWithIndex.map { case (queue, index) => + !WillWriteLane(index) || queue.enq.ready + }.reduce(_ && _) + writeRD <> DontCare + + // todo: token + val waiteLastRequest: Bool = RegInit(false.B) + val waitQueueClear: Bool = RegInit(false.B) + val lastReportValid = waitQueueClear && !writeQueue.map(_.deq.valid).reduce(_ || _) + when(lastReportValid) { + waitQueueClear := false.B + waiteLastRequest := false.B + } + when(!readType && requestStageDeq && lastGroup) { + waiteLastRequest := true.B + } + val executeStageInvalid: Bool = Mux1H( + unitType(3, 1), + Seq( + !compressUnit.out.compressValid, + reduceUnit.in.ready, + true.B + ) + ) + val executeStageClean: Bool = Mux( + readType, + waiteStageDeqFire && waiteReadDataPipeReg.last, + waiteLastRequest && maskedWrite.stageClear && executeStageInvalid + ) + val alwaysNeedExecute: Bool = enqMvRD + val invalidEnq: Bool = instReq.fire && !instReq.bits.vl && !alwaysNeedExecute + when(executeStageClean || invalidEnq) { + 
waitQueueClear := true.B + } + lastReport := maskAnd( + lastReportValid, + indexToOH(instReg.instructionIndex, parameter.chainingSize) + ) + writeRDData := Mux(pop, reduceUnit.out.bits.data, compressUnit.writeData) + + // gather read state + when(gatherRequestFire) { + when(notNeedRead) { + gatherReadState := sResponse + }.otherwise { + gatherReadState := sRead + } + } + + when(readCrossBar.input.head.fire && gatherSRead) { + gatherReadState := wRead + } + + gatherData.valid := gatherResponse + gatherData.bits := Mux(readVS1Reg.dataValid, readVS1Reg.data, 0.U) + when(gatherData.fire) { + gatherReadState := idle + } +} diff --git a/t1/src/mask/MaskUnitReadCrossBar.scala b/t1/src/mask/MaskUnitReadCrossBar.scala new file mode 100644 index 000000000..dab845d9e --- /dev/null +++ b/t1/src/mask/MaskUnitReadCrossBar.scala @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.util._ + +class MaskUnitReadCrossBar(parameter: T1Parameter) extends Module { + val input: Seq[DecoupledIO[MaskUnitReadReq]] = Seq.tabulate(parameter.laneNumber)(_ => + IO( + Flipped( + Decoupled( + new MaskUnitReadReq(parameter) + ) + ) + ) + ) + val output: Seq[DecoupledIO[MaskUnitReadQueue]] = Seq.tabulate(parameter.laneNumber)(_ => + IO( + Decoupled( + new MaskUnitReadQueue(parameter) + ) + ) + ) + + val inputSelect1H: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.laneNumber.W))) + + input.zipWithIndex.foldLeft(0.U(parameter.laneNumber.W)) { case (laneOccupied, (req, index)) => + val requestReadLane = UIntToOH(req.bits.readLane) + // read lane free + val free: Bool = (requestReadLane & (~laneOccupied).asUInt).orR + val outReady: Bool = Mux1H(requestReadLane, output.map(_.ready)) + req.ready := free && outReady + inputSelect1H(index) := Mux(req.valid && free, requestReadLane, 0.U(parameter.laneNumber.W)) + laneOccupied | inputSelect1H(index) + } + + 
output.zipWithIndex.foreach { case (req, index) => + val tryToRead: UInt = VecInit(inputSelect1H.map(_(index))).asUInt + req.valid := tryToRead.orR + val selectReq: DecoupledIO[MaskUnitReadReq] = Mux1H(tryToRead, input) + req.bits.vs := selectReq.bits.vs + req.bits.offset := selectReq.bits.offset + req.bits.writeIndex := selectReq.bits.requestIndex + req.bits.dataOffset := selectReq.bits.dataOffset + } +} diff --git a/t1/src/mask/SlideIndexGen.scala b/t1/src/mask/SlideIndexGen.scala new file mode 100644 index 000000000..93d06ae59 --- /dev/null +++ b/t1/src/mask/SlideIndexGen.scala @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.t1.rtl.decoder.Decoder + +class SlideIndexGen(parameter: T1Parameter) extends Module { + val newInstruction: Bool = IO(Input(Bool())) + + val instructionReq: MaskUnitInstReq = IO(Input(new MaskUnitInstReq(parameter))) + + val indexDeq: DecoupledIO[MaskUnitReadState] = IO(Decoupled(new MaskUnitReadState(parameter))) + + val lgNumber: Int = parameter.laneParam.laneNumberBits + val slideGroupOut: UInt = IO(Output(UInt((parameter.laneParam.vlMaxBits - lgNumber).W))) + val slideMaskInput: UInt = IO(Input(UInt(parameter.laneNumber.W))) + + val lastFire: Bool = Wire(Bool()) + val InstructionValid: Bool = RegEnable(newInstruction, false.B, newInstruction || lastFire) + val vl: UInt = instructionReq.vl + + val isSlide: Bool = instructionReq.decodeResult(Decoder.topUop) === BitPat("b000??") + val slideUp: Bool = instructionReq.decodeResult(Decoder.topUop)(0) + val slide1: Bool = instructionReq.decodeResult(Decoder.topUop)(1) + + // each slide group laneNumber element + val slideGroup = RegInit(0.U((parameter.laneParam.vlMaxBits - lgNumber).W)) + val vlTail: UInt = changeUIntSize(vl, lgNumber) + val lastSlideGroup = (vl >> lgNumber).asUInt - !changeUIntSize(vl, lgNumber).orR + val lastValidVec = 
(~scanLeftOr(UIntToOH(vlTail))).asUInt + + val groupVlValid = maskEnable(slideGroup === lastSlideGroup && vlTail.orR, lastValidVec) + val groupMaskValid = maskEnable(instructionReq.maskType, slideMaskInput) + + val validVec = groupVlValid & groupMaskValid + + val lastElementValid: UInt = ((groupVlValid >> 1).asUInt ^ groupVlValid) & groupMaskValid + val replaceWithVs1: UInt = Mux1H( + Seq( + (slideGroup === 0.U && slide1 && slideUp) -> (1.U(parameter.laneNumber.W) & groupMaskValid(0)), + (slideGroup === lastSlideGroup && slide1 && !slideUp) -> lastElementValid + ) + ).asUInt + + lastFire := slideGroup === lastSlideGroup && indexDeq.fire + slideGroupOut := slideGroup + when(newInstruction || indexDeq.fire) { + slideGroup := Mux(newInstruction, 0.U, slideGroup + 1.U) + } + + val slideValue: UInt = Mux(slide1, 1.U, instructionReq.readFromScala) + // Positive and negative select + val PNSelect: UInt = Mux(slideUp, (~slideValue).asUInt, slideValue) + + val baseIndex: UInt = (slideGroup << lgNumber).asUInt + PNSelect + slideUp + val lagerThanVL: Bool = (slideValue >> parameter.laneParam.vlMaxBits).asUInt.orR + + def indexAnalysis(sewInt: Int)(elementIndex: UInt, vlmul: UInt, valid: Bool): Seq[UInt] = { + val intLMULInput: UInt = (1.U << vlmul(1, 0)).asUInt + val positionSize = parameter.laneParam.vlMaxBits - 1 + val allDataPosition = (elementIndex << sewInt).asUInt + val dataPosition = changeUIntSize(allDataPosition, positionSize) + // The offset of the data starting position in 32 bits (currently only 32). + // Since the data may cross lanes, it will be optimized during fusion. 
+ // (dataPosition(1) && sewOHInput(1, 0).orR) ## (dataPosition(0) && sewOHInput(0)) + val dataOffset: UInt = + (if (sewInt < 2) dataPosition(1) else false.B) ## + (if (sewInt == 0) dataPosition(0) else false.B) + val accessLane = if (parameter.laneNumber > 1) dataPosition(log2Ceil(parameter.laneNumber) + 1, 2) else 0.U(1.W) + // 32 bit / group + val dataGroup = (dataPosition >> (log2Ceil(parameter.laneNumber) + 2)).asUInt + val offsetWidth: Int = parameter.laneParam.vrfParam.vrfOffsetBits + val offset = dataGroup(offsetWidth - 1, 0) + val accessRegGrowth = (dataGroup >> offsetWidth).asUInt + val decimalProportion = offset ## accessLane + // 1/8 register + val decimal = decimalProportion(decimalProportion.getWidth - 1, 0.max(decimalProportion.getWidth - 3)) + + /** elementIndex needs to be compared with vlMax(vLen * lmul /sew) This calculation is too complicated We can change + * the angle. Calculate the increment of the read register and compare it with lmul to know whether the index + * exceeds vlMax. 
vlmul needs to distinguish between integers and floating points + */ + val overlap = + (vlmul(2) && decimal >= intLMULInput(3, 1)) || + (!vlmul(2) && accessRegGrowth >= intLMULInput) || + (allDataPosition >> log2Ceil(parameter.vLen)).asUInt.orR + val unChange = slideUp && (elementIndex.asBools.last || lagerThanVL) + val elementValid = valid && !unChange + val notNeedRead = overlap || !elementValid || lagerThanVL || unChange + val reallyGrowth: UInt = changeUIntSize(accessRegGrowth, 3) + Seq(dataOffset, accessLane, offset, reallyGrowth, notNeedRead, elementValid) + } + + val sew1H: UInt = UIntToOH(instructionReq.sew)(2, 0) + + val indexVec = Seq.tabulate(parameter.laneNumber) { index => + val readIndex = baseIndex + index.U(lgNumber.W) + val checkResult: Seq[Seq[UInt]] = Seq(0, 1, 2).map { sewInt => + indexAnalysis(sewInt)(readIndex, instructionReq.vlmul, validVec(index)) + } + + val dataOffset = Mux1H(sew1H, checkResult.map(_.head)) + val accessLane = Mux1H(sew1H, checkResult.map(_(1))) + val offset = Mux1H(sew1H, checkResult.map(_(2))) + val reallyGrowth = Mux1H(sew1H, checkResult.map(_(3))) + val notNeedRead = Mux1H(sew1H, checkResult.map(_(4))) + val elementValid = Mux1H(sew1H, checkResult.map(_(5))) + Seq(dataOffset, accessLane, offset, reallyGrowth, notNeedRead, elementValid) + } + + indexDeq.valid := InstructionValid && isSlide + indexDeq.bits := DontCare + // 0: dataOffset, 1: accessLane, + // 2: offset, 3: reallyGrowth, + // 4: notNeedRead, 5: elementValid + indexDeq.bits.needRead := VecInit(indexVec.map(!_(4))).asUInt & (~replaceWithVs1).asUInt + indexDeq.bits.elementValid := VecInit(indexVec.map(_(5))).asUInt | replaceWithVs1 + indexDeq.bits.replaceVs1 := replaceWithVs1 + indexDeq.bits.readOffset := VecInit(indexVec.map(_(2))).asUInt + indexDeq.bits.accessLane := VecInit(indexVec.map(_(1))) + indexDeq.bits.vsGrowth := VecInit(indexVec.map(_(3))) + indexDeq.bits.executeGroup := slideGroup + indexDeq.bits.readDataOffset := 
VecInit(indexVec.map(_.head)).asUInt + indexDeq.bits.last := slideGroup === lastSlideGroup +} diff --git a/t1/src/package.scala b/t1/src/package.scala index b0afc12be..33bd0f728 100644 --- a/t1/src/package.scala +++ b/t1/src/package.scala @@ -10,6 +10,8 @@ import chisel3.util.experimental.decode.DecodeBundle import org.chipsalliance.t1.rtl.decoder.{Decoder, TableGenerator} import org.chipsalliance.t1.rtl.lane.Distributor +import scala.jdk.CollectionConverters._ + package object rtl { def csa32(s: UInt, c: UInt, a: UInt): (UInt, UInt) = { val xor = s ^ c @@ -41,6 +43,10 @@ package object rtl { Mux(mask, data, 0.U.asTypeOf(data)) } + def maskEnable(enable: Bool, mask: UInt): UInt = { + Mux(enable, mask, (-1.S(mask.getWidth.W)).asUInt.asTypeOf(mask)) + } + def indexToOH(index: UInt, chainingSize: Int): UInt = { UIntToOH(index(log2Ceil(chainingSize) - 1, 0)) } @@ -74,6 +80,21 @@ package object rtl { }) } + def cutUIntBySize(data: UInt, size: Int): Vec[UInt] = { + require(data.getWidth % size == 0) + val width: Int = data.getWidth / size + cutUInt(data, width) + } + + def changeUIntSize(data: UInt, size: Int, sign: Boolean = false): UInt = { + if (data.getWidth >= size) { + data(size - 1, 0) + } else { + val extend = if (sign) data(data.getWidth - 1) else false.B + Fill(size - data.getWidth, extend) ## data + } + } + def calculateSegmentWriteMask( datapath: Int, laneNumber: Int, @@ -372,4 +393,15 @@ package object rtl { ) ++ Option.when(parameter.fpuEnable)(inputDecode(Decoder.float)) VecInit(executeList).asUInt } + + def pipeToken(size: Int)(enq: Bool, deq: Bool): Bool = { + require(isPow2(size)) + val counterSize: Int = log2Ceil(size) + 1 + val counter: UInt = RegInit(0.U(counterSize.W)) + val counterChange: UInt = Mux(enq, 1.U, (-1.S(counterSize.W)).asUInt) + when(enq ^ deq) { + counter := counter + counterChange + } + !counter(log2Ceil(size)) + } } diff --git a/t1/src/sequencer/T1TokenManager.scala b/t1/src/sequencer/T1TokenManager.scala index b80ed2dcd..a46c19bda 
100644 --- a/t1/src/sequencer/T1TokenManager.scala +++ b/t1/src/sequencer/T1TokenManager.scala @@ -7,10 +7,27 @@ import chisel3._ import chisel3.experimental.hierarchy.{instantiable, public} import chisel3.util._ +class IssueToken(parameter: T1Parameter) extends Bundle { + val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) + val writeV0: Bool = Bool() + val useV0AsMask: Bool = Bool() + val isLoadStore: Bool = Bool() + val toLane: Bool = Bool() + val toMask: Bool = Bool() +} + @instantiable class T1TokenManager(parameter: T1Parameter) extends Module { @public - val writeV0 = IO(Vec(parameter.laneNumber, Flipped(Valid(UInt(parameter.instructionIndexBits.W))))) + val instructionIssue: ValidIO[IssueToken] = IO(Flipped(Valid(new IssueToken(parameter)))) + + @public + val lsuWriteV0: Vec[ValidIO[UInt]] = IO( + Vec(parameter.laneNumber, Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) + ) + + @public + val issueAllow: Bool = IO(Output(Bool())) @public val instructionFinish: Vec[UInt] = IO(Vec(parameter.laneNumber, Input(UInt(parameter.chainingSize.W)))) @@ -18,19 +35,60 @@ class T1TokenManager(parameter: T1Parameter) extends Module { @public val v0WriteValid = IO(Output(UInt(parameter.chainingSize.W))) - // v0 write token - val v0WriteValidVec: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { laneIndex => - val update: ValidIO[UInt] = writeV0(laneIndex) - val clear: UInt = instructionFinish(laneIndex) - val updateOH = maskAnd(update.valid, indexToOH(update.bits, parameter.chainingSize)).asUInt + @public + val maskUnitFree: Bool = IO(Input(Bool())) + + val issueIndex1H: UInt = indexToOH(instructionIssue.bits.instructionIndex, parameter.chainingSize) + + // Boolean type token clear & set + def updateBooleanToken(set: UInt, clear: UInt): UInt = { VecInit(Seq.tabulate(parameter.chainingSize) { chainingIndex => val res = RegInit(false.B) - when(updateOH(chainingIndex) || clear(chainingIndex)) { - res := updateOH(chainingIndex) + when(set(chainingIndex) 
|| clear(chainingIndex)) { + res := set(chainingIndex) } res }).asUInt } + // v0 write token + val v0WriteValidVec: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { laneIndex => + val lsuWriteSet = maskAnd( + lsuWriteV0(laneIndex).valid, + indexToOH(lsuWriteV0(laneIndex).bits, parameter.chainingSize) + ).asUInt + val v0WriteIssue = instructionIssue.valid && instructionIssue.bits.writeV0 && instructionIssue.bits.toLane + val clear: UInt = instructionFinish(laneIndex) + val updateOH = maskAnd(v0WriteIssue, issueIndex1H).asUInt + updateBooleanToken(updateOH | lsuWriteSet, clear) + } + + val useV0AsMaskToken: UInt = Seq + .tabulate(parameter.laneNumber) { laneIndex => + val useV0Issue = instructionIssue.valid && instructionIssue.bits.useV0AsMask && + instructionIssue.bits.toLane + val clear: UInt = instructionFinish(laneIndex) + val updateOH = maskAnd(useV0Issue, issueIndex1H).asUInt + updateBooleanToken(updateOH, clear) + } + .reduce(_ | _) + + val maskUnitWriteV0: Bool = { + val set = instructionIssue.valid && instructionIssue.bits.writeV0 && instructionIssue.bits.toMask + val clear = maskUnitFree + val res = RegInit(false.B) + when(set || clear) { + res := set + } + res + } + v0WriteValid := v0WriteValidVec.reduce(_ | _) + + // v0 read-write conflict + val v0Conflict: Bool = + (instructionIssue.bits.writeV0 && useV0AsMaskToken.orR) || + (instructionIssue.bits.useV0AsMask && (v0WriteValid.orR || maskUnitWriteV0)) + + issueAllow := !v0Conflict } diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala index be019fc58..7e59380eb 100644 --- a/t1/src/vrf/VRF.scala +++ b/t1/src/vrf/VRF.scala @@ -202,7 +202,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar @public val writeCheck: Vec[LSUWriteCheck] = IO( Vec( - parameter.chainingSize + 3, + parameter.chainingSize + 4, Input( new LSUWriteCheck( parameter.regNumBits, @@ -214,7 +214,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar ) @public - val 
writeAllow: Vec[Bool] = IO(Vec(parameter.chainingSize + 3, Output(Bool()))) + val writeAllow: Vec[Bool] = IO(Vec(parameter.chainingSize + 4, Output(Bool()))) /** when instruction is fired, record it in the VRF for chaining. */ @public @@ -268,8 +268,9 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar val portFireCount: UInt = PopCount(VecInit(readRequests.map(_.fire) :+ write.fire)) dontTouch(portFireCount) - val writeBank: UInt = - if (parameter.rfBankNum == 1) true.B else UIntToOH(write.bits.offset(log2Ceil(parameter.rfBankNum) - 1, 0)) + val writeIndex: UInt = write.bits.vd ## write.bits.offset + val writeBank: UInt = + if (parameter.rfBankNum == 1) true.B else UIntToOH(writeIndex(log2Ceil(parameter.rfBankNum) - 1, 0)) // Add one more record slot to prevent there is no free slot when the instruction comes in // (the slot will die a few cycles later than the instruction) @@ -350,8 +351,9 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar .reduce(_ && _) && portConflictCheck } val validCorrect: Bool = if (i == (readRequests.size - 1)) v.valid && checkResult.get else v.valid + val address = v.bits.vs ## v.bits.offset // select bank - val bank = if (parameter.rfBankNum == 1) true.B else UIntToOH(v.bits.offset(log2Ceil(parameter.rfBankNum) - 1, 0)) + val bank = if (parameter.rfBankNum == 1) true.B else UIntToOH(address(log2Ceil(parameter.rfBankNum) - 1, 0)) val pipeBank = Pipe(true.B, bank, parameter.vrfReadLatency).bits val bankCorrect = Mux(validCorrect, bank, 0.U(parameter.rfBankNum.W)) val readPortCheckSelect = parameter.ramType match { @@ -528,7 +530,11 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar val dataInLsuQueue = ohCheck(loadDataInLSUWriteQueue, record.bits.instIndex, parameter.chainingSize) // elementMask update by write val writeUpdateValidVec: Seq[Bool] = - writePort.map(p => p.fire && p.bits.instructionIndex === record.bits.instIndex && 
p.bits.mask(3)) + writePort.map(p => + p.fire && p.bits.instructionIndex === record.bits.instIndex && + // Only index load will split the datapath into separate parts. + (p.bits.mask(3) || !record.bits.ls) + ) val writeUpdate1HVec: Seq[UInt] = writeOH.zip(writeUpdateValidVec).map { case (oh, v) => Mux(v, oh, 0.U) } // elementMask update by read of store instruction val loadUpdateValidVec = @@ -544,7 +550,8 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar val waitLaneClear = record.bits.state.stFinish && record.bits.state.wWriteQueueClear && record.bits.state.wLaneLastReport && record.bits.state.wTopLastReport - val stateClear: Bool = waitLaneClear && record.bits.state.wLaneClear + val stateClear: Bool = waitLaneClear && record.bits.state.wLaneClear || + record.bits.elementMask.andR && !record.bits.onlyRead when(topLastReport) { record.bits.state.stFinish := true.B @@ -606,7 +613,8 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar Mux(older, sourceVdEqSinkVs, sinkVdEqSourceVs) ) val rawForeStore = Mux(older, isStore.head && isSlow.last, isStore.last && isSlow.head) && samVd - (hazardForeLoad, rawForeStore) + // (hazardForeLoad, rawForeStore) todo: need check hazard? 
+ (false.B, false.B) } } writeReadyForLsu := !hazardVec.map(_.map(_._1).reduce(_ || _)).reduce(_ || _) diff --git a/t1/src/vrf/WriteCheck.scala b/t1/src/vrf/WriteCheck.scala index 57fa5479f..b454782b7 100644 --- a/t1/src/vrf/WriteCheck.scala +++ b/t1/src/vrf/WriteCheck.scala @@ -54,7 +54,8 @@ class WriteCheck(val parameter: VRFParam) extends Module { ) val notHitVs1: Bool = (checkOH & vs1Mask) === 0.U val war1: Bool = record.bits.vs1.valid && check.vd(4, 3) === record.bits.vs1.bits(4, 3) && notHitVs1 - val vs2Mask: UInt = (((-1.S(parameter.elementSize.W)).asUInt ## record.bits.elementMask) << + val maskForVs2: UInt = record.bits.elementMask & Fill(parameter.elementSize, !record.bits.onlyRead) + val vs2Mask: UInt = (((-1.S(parameter.elementSize.W)).asUInt ## maskForVs2) << ((8.U + record.bits.vs2(2, 0) - record.bits.vd.bits(2, 0)) << parameter.vrfOffsetBits).asUInt).asUInt( 2 * 8 * parameter.singleGroupSize - 1, 8 * parameter.singleGroupSize diff --git a/t1emu/src/TestBench.scala b/t1emu/src/TestBench.scala index d43b91c5a..5b07b2348 100644 --- a/t1emu/src/TestBench.scala +++ b/t1emu/src/TestBench.scala @@ -221,7 +221,7 @@ class TestBench(val parameter: T1Parameter) // vrf write from lsu lsuProbe.slots.map(slot => slot.dataInstruction === tag.U && slot.writeValid && slot.dataMask.orR) ++ // vrf write from Sequencer - Some(t1Probe.writeQueueEnq.bits === tag.U && t1Probe.writeQueueEnq.valid && t1Probe.writeQueueEnqMask.orR) + t1Probe.writeQueueEnqVec.map(maskWrite => maskWrite.valid && maskWrite.bits === tag.U) ).asUInt // always equal to array index scoreboard.bits := scoreboard.bits + PopCount(writeEnq) diff --git a/t1rocketemu/src/TestBench.scala b/t1rocketemu/src/TestBench.scala index 0457b9c3e..2e26647d5 100644 --- a/t1rocketemu/src/TestBench.scala +++ b/t1rocketemu/src/TestBench.scala @@ -285,7 +285,7 @@ class TestBench(val parameter: T1RocketTileParameter) // vrf write from lsu lsuProbe.slots.map(slot => slot.dataInstruction === tag.U && slot.writeValid && 
slot.dataMask.orR) ++ // vrf write from Sequencer - Some(t1Probe.writeQueueEnq.bits === tag.U && t1Probe.writeQueueEnq.valid && t1Probe.writeQueueEnqMask.orR) + t1Probe.writeQueueEnqVec.map(maskWrite => maskWrite.valid && maskWrite.bits === tag.U) ).asUInt // always equal to array index scoreboard.bits := scoreboard.bits + PopCount(writeEnq) From 4b09b2c5726b3665bacee022df28bf11dcc9260c Mon Sep 17 00:00:00 2001 From: Avimitin Date: Wed, 23 Oct 2024 00:38:06 +0800 Subject: [PATCH 2/6] [ci] make sure all config specify in designs dir elaboratable Signed-off-by: Avimitin --- .github/workflows/pd.yml | 49 +++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/.github/workflows/pd.yml b/.github/workflows/pd.yml index b152176be..f93a87351 100644 --- a/.github/workflows/pd.yml +++ b/.github/workflows/pd.yml @@ -16,42 +16,39 @@ concurrency: cancel-in-progress: true jobs: - test-emit: + generate-test-plan: if: '! github.event.pull_request.draft' + name: "Generate CI target for RTL elaborate" + runs-on: [self-hosted, linux, nixos] + outputs: + testplan: ${{ steps.get-testplan.outputs.testplan }} + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - id: "get-testplan" + run: | + testFile="./designs/org.chipsalliance.t1.elaborator.t1rocketv.T1RocketTile.toml" + jsonData=$(nix shell ".#dasel" -c dasel -r toml -f "$testFile" -w json) + echo "testplan=$(echo "$jsonData" | nix shell ".#jq" -c jq -c -r '{ "config": keys }')" \ + > $GITHUB_OUTPUT + + test-emit: name: "Test elaborate" + needs: [generate-test-plan] runs-on: [self-hosted, linux, nixos] strategy: - fail-fast: false + matrix: ${{ fromJSON(needs.generate-test-plan.outputs.testplan) }} steps: - uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} - name: "Test elaborate" run: | - configArray=( $(ls .github/designs) ) - ipArray=( "t1emu" "t1rocketemu" ) - for cfg in "${configArray[@]}"; do - for ip in 
"${ipArray[@]}"; do - echo "Building .#t1.${cfg}.${ip}.rtl" - if ! nix build ".#t1.${cfg}.${ip}.rtl" -L; then - failed_rtl_configs+=("${cfg}.${ip}") - fi - if ! nix run ".#ci-helper" -- runOMTests --config ${cfg}; then - failed_om+=("${cfg}") - fi - done - done - if [ -n "${failed_rtl_configs[*]}" ]; then - echo -e "## Failed RTL\n" >> "$GITHUB_STEP_SUMMARY" - for cfg in "${failed_rtl_configs[@]}"; do - echo "* ${cfg}" >> $GITHUB_STEP_SUMMARY - done - echo -e "## Failed RTL\n" >> "$GITHUB_STEP_SUMMARY" - for cfg in "${failed_om[@]}"; do - echo "* ${cfg}" >> $GITHUB_STEP_SUMMARY - done - printf "\n" >> $GITHUB_STEP_SUMMARY - fi + testAttr="t1rocketv" + testConfig=${{ matrix.config }} + echo "Building .#t1.${testConfig}.${testAttr}.rtl" + nix build ".#t1.${testConfig}.${testAttr}.rtl" -L physical-design-report: name: "Request physical design report" From 944c813067f33d64c442302e6711fac36fd917f2 Mon Sep 17 00:00:00 2001 From: Avimitin Date: Fri, 8 Nov 2024 16:59:29 +0800 Subject: [PATCH 3/6] [ci] remove exit on single case failure Signed-off-by: Avimitin --- script/ci/src/Main.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/script/ci/src/Main.scala b/script/ci/src/Main.scala index 5eb3f58f0..e90f5a7ff 100644 --- a/script/ci/src/Main.scala +++ b/script/ci/src/Main.scala @@ -181,7 +181,10 @@ object Main: s"Online driver for config $config, case $caseName fail, please check manually on local machine" ) Logger.error(s"nix build $testAttr --impure") - Logger.fatal("Online Drive run fail, exiting CI") + // Create fake path for latter assertion to fail + val fakedir = os.temp.dir() + os.write(fakedir / "offline-check-status", "1") + fakedir Logger.info("Checking RTL event from event log") val testSuccess = From 17c553bdb08f1065a3f8d181a229ea0d50379108 Mon Sep 17 00:00:00 2001 From: Avimitin Date: Mon, 11 Nov 2024 13:05:05 +0800 Subject: [PATCH 4/6] [ci] move failed test report to each bucket Report CI result at final step might fail when RTL 
changes causing online driver to fail. And developers might find it hard to get a list of fail tests list to get fixed. This commit move the report step at running stage. Signed-off-by: Avimitin --- .github/workflows/t1rocket.yml | 3 --- .github/workflows/vcs.yml | 3 --- .github/workflows/verilator.yml | 2 -- script/ci/src/Main.scala | 25 +++++-------------------- script/emu/src/Main.scala | 2 +- 5 files changed, 6 insertions(+), 29 deletions(-) diff --git a/.github/workflows/t1rocket.yml b/.github/workflows/t1rocket.yml index 638549fbb..b61857a01 100644 --- a/.github/workflows/t1rocket.yml +++ b/.github/workflows/t1rocket.yml @@ -104,14 +104,11 @@ jobs: nix run ".#ci-helper" -- postCI \ --urg-report-file-path ./urg-report.md \ --cycle-update-file-path ./cycle-update.md \ - --failed-tests-file-path ./failed-tests.md \ --emu-lib "vcs" \ --top ${{ env.EMU_TOP }} cat ./urg-report.md >> $GITHUB_STEP_SUMMARY echo >> $GITHUB_STEP_SUMMARY cat ./cycle-update.md >> $GITHUB_STEP_SUMMARY - echo >> $GITHUB_STEP_SUMMARY - cat ./failed-tests.md >> $GITHUB_STEP_SUMMARY - name: "Commit cycle updates" run: | git config user.name github-actions diff --git a/.github/workflows/vcs.yml b/.github/workflows/vcs.yml index 810638c08..58749a8df 100644 --- a/.github/workflows/vcs.yml +++ b/.github/workflows/vcs.yml @@ -104,14 +104,11 @@ jobs: nix run ".#ci-helper" -- postCI \ --urg-report-file-path ./urg-report.md \ --cycle-update-file-path ./cycle-update.md \ - --failed-tests-file-path ./failed-tests.md \ --emu-lib "vcs" \ --top ${{ env.EMU_TOP }} cat ./urg-report.md >> $GITHUB_STEP_SUMMARY echo >> $GITHUB_STEP_SUMMARY cat ./cycle-update.md >> $GITHUB_STEP_SUMMARY - echo >> $GITHUB_STEP_SUMMARY - cat ./failed-tests.md >> $GITHUB_STEP_SUMMARY - name: "Commit cycle updates" run: | git config user.name github-actions diff --git a/.github/workflows/verilator.yml b/.github/workflows/verilator.yml index fdfc4e44e..634489afa 100644 --- a/.github/workflows/verilator.yml +++ 
b/.github/workflows/verilator.yml @@ -108,10 +108,8 @@ jobs: - name: "Print step summary" run: | nix run ".#ci-helper" -- postCI \ - --failed-tests-file-path ./failed-tests.md \ --cycle-update-file-path ./cycle-update.md --emu-lib "verilator" \ --top ${{ env.EMU_TOP }} - cat ./failed-tests.md >> $GITHUB_STEP_SUMMARY echo >> $GITHUB_STEP_SUMMARY cat ./cycle-update.md >> $GITHUB_STEP_SUMMARY diff --git a/script/ci/src/Main.scala b/script/ci/src/Main.scala index e90f5a7ff..0b6469286 100644 --- a/script/ci/src/Main.scala +++ b/script/ci/src/Main.scala @@ -200,6 +200,10 @@ object Main: val failedTests = findFailedTests() if failedTests.isEmpty then Logger.info(s"All tests passed") else + os.write.append( + os.Path(sys.env("GITHUB_STEP_SUMMARY")), + failedTests.map(s => s"* ${s}\n").mkString + ) Logger.fatal( s"${BOLD}${failedTests.length} tests failed${RESET}" ) @@ -211,10 +215,6 @@ object Main: // * collect and report cycle update @main def postCI( - @arg( - name = "failed-tests-file-path", - doc = "specify the failed tests markdown file output path" - ) failedTestsFilePath: String, @arg( name = "cycle-update-file-path", doc = "specify the cycle update markdown file output path" @@ -232,9 +232,6 @@ object Main: doc = "Specify emulator ip top, Eg. t1rocketemu/t1emu..." 
) top: String ) = - val failedTestsFile = os.Path(failedTestsFilePath, os.pwd) - os.write.over(failedTestsFile, "## Failed Tests\n") - os.write.over( os.Path(cycleUpdateFilePath, os.pwd), "## Cycle Update\n" @@ -256,18 +253,6 @@ object Main: ) ) - Logger.info("Collecting failed tests") - os.walk(emuResultPath) - .filter(path => path.last == "offline-check-status") - .filter(path => os.read(path).trim() != "0") - .map(path => { - val caseName = path.segments.toSeq.reverse.drop(1).head - os.write.append( - failedTestsFile, - s"* ${config} - ${caseName}: `nix build .#t1.$config.${top}.$caseName.$emuLib-emu -L --impure`\n" - ) - }) - if cycleUpdateFilePath.nonEmpty then Logger.info("Collecting cycle update info") val allCycleUpdates = os @@ -397,5 +382,5 @@ object Main: .pipe(println) end generateRegressionTestPlan - def main(args: Array[String]): Unit = ParserForMethods(this).runOrExit(args) + def main(args: Array[String]): Unit = ParserForMethods(this).runOrExit(args.toSeq) end Main diff --git a/script/emu/src/Main.scala b/script/emu/src/Main.scala index 4fc68e5d8..370327af1 100644 --- a/script/emu/src/Main.scala +++ b/script/emu/src/Main.scala @@ -431,5 +431,5 @@ object Main: .foreach(p => println(s"* ${p}")) end listCases - def main(args: Array[String]): Unit = ParserForMethods(this).runOrExit(args) + def main(args: Array[String]): Unit = ParserForMethods(this).runOrExit(args.toSeq) end Main From 26f7f3bd8aeb7edf2ecfcc7a5bae81fca4e81535 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sun, 17 Nov 2024 14:15:27 +0000 Subject: [PATCH 5/6] [ci] update t1 test case cycle data --- .github/designs/blastoise/t1emu.json | 38 ++++++++++++++-------------- .github/designs/rookidee/t1emu.json | 28 ++++++++++---------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/.github/designs/blastoise/t1emu.json b/.github/designs/blastoise/t1emu.json index 44a756c3f..bb5b4fdda 100644 --- a/.github/designs/blastoise/t1emu.json +++ b/.github/designs/blastoise/t1emu.json 
@@ -1,23 +1,23 @@ { - "pytorch.demo": 76, - "pytorch.lenet": 1117626, - "pytorch.matmul": 14497, - "mlir.rvv_vp_intrinsic_add": 457, - "mlir.rvv_vp_intrinsic_add_scalable": 745, - "mlir.hello": 144, - "mlir.stripmining": 10228, - "asm.mmm": 92606, - "asm.smoke": 8130, + "pytorch.demo": 77, + "pytorch.lenet": 1116581, + "pytorch.matmul": 14556, + "mlir.rvv_vp_intrinsic_add": 448, + "mlir.rvv_vp_intrinsic_add_scalable": 729, + "mlir.hello": 137, + "mlir.stripmining": 8869, + "asm.mmm": 50444, + "asm.smoke": 7991, "intrinsic.conv2d_less_m2": 2647, - "intrinsic.linear_normalization": 3429, - "intrinsic.softmax": 7036, - "rvv_bench.ascii_to_utf16": 36936, - "rvv_bench.ascii_to_utf32": 11092, - "rvv_bench.byteswap": 108514, - "rvv_bench.mandelbrot": 257235, - "rvv_bench.memcpy": 36037, + "intrinsic.linear_normalization": 3416, + "intrinsic.softmax": 7032, + "rvv_bench.ascii_to_utf16": 11282, + "rvv_bench.ascii_to_utf32": 4694, + "rvv_bench.byteswap": 19955, + "rvv_bench.mandelbrot": 247231, + "rvv_bench.memcpy": 34534, "rvv_bench.memset": 11501, - "rvv_bench.mergelines": 90569, - "rvv_bench.strlen": 23191, - "rvv_bench.utf8_count": 151499 + "rvv_bench.mergelines": 24911, + "rvv_bench.strlen": 22697, + "rvv_bench.utf8_count": 151124 } \ No newline at end of file diff --git a/.github/designs/rookidee/t1emu.json b/.github/designs/rookidee/t1emu.json index 4dbb06cf4..8a4ccbffa 100644 --- a/.github/designs/rookidee/t1emu.json +++ b/.github/designs/rookidee/t1emu.json @@ -1,17 +1,17 @@ { - "asm.mmm": 96230, - "asm.smoke": 8156, - "intrinsic.conv2d_less_m2": 2709, - "mlir.hello": 143, - "mlir.rvv_vp_intrinsic_add": 456, - "mlir.rvv_vp_intrinsic_add_scalable": 641, - "mlir.stripmining": 27819, - "rvv_bench.ascii_to_utf16": 41474, - "rvv_bench.ascii_to_utf32": 12345, - "rvv_bench.byteswap": 119778, - "rvv_bench.memcpy": 48026, + "asm.mmm": 54919, + "asm.smoke": 7770, + "intrinsic.conv2d_less_m2": 2710, + "mlir.hello": 136, + "mlir.rvv_vp_intrinsic_add": 445, + 
"mlir.rvv_vp_intrinsic_add_scalable": 637, + "mlir.stripmining": 27810, + "rvv_bench.ascii_to_utf16": 16070, + "rvv_bench.ascii_to_utf32": 6057, + "rvv_bench.byteswap": 43275, + "rvv_bench.memcpy": 46320, "rvv_bench.memset": 19350, - "rvv_bench.mergelines": 92398, - "rvv_bench.strlen": 34125, - "rvv_bench.utf8_count": 202107 + "rvv_bench.mergelines": 38578, + "rvv_bench.strlen": 34517, + "rvv_bench.utf8_count": 206393 } \ No newline at end of file From d47ceea20d4eb3d9aad09292dbd3667b145c0312 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sun, 17 Nov 2024 14:22:00 +0000 Subject: [PATCH 6/6] [ci] update t1 test case cycle data --- .github/designs/blastoise/t1rocketemu.json | 946 ++++++++++----------- .github/designs/rookidee/t1rocketemu.json | 734 ++++++++-------- 2 files changed, 840 insertions(+), 840 deletions(-) diff --git a/.github/designs/blastoise/t1rocketemu.json b/.github/designs/blastoise/t1rocketemu.json index ca4b191f6..5c41e56cb 100644 --- a/.github/designs/blastoise/t1rocketemu.json +++ b/.github/designs/blastoise/t1rocketemu.json @@ -1,417 +1,417 @@ { "asm.memcpy": 751, - "asm.mmm": 93690, - "asm.smoke": 8140, - "asm.strlen": 7995, - "asm.utf8_count": 204, - "codegen.vaadd_vv": 170556, - "codegen.vaadd_vx": 491346, - "codegen.vaaddu_vv": 170556, - "codegen.vaaddu_vx": 491346, - "codegen.vadc_vim": 45517, - "codegen.vadc_vvm": 21499, - "codegen.vadc_vxm": 61826, - "codegen.vadd_vi": 89916, - "codegen.vadd_vv": 43308, - "codegen.vadd_vx": 123867, - "codegen.vand_vi": 90050, - "codegen.vand_vv": 43308, - "codegen.vand_vx": 124030, - "codegen.vasub_vv": 170556, - "codegen.vasub_vx": 491346, - "codegen.vasubu_vv": 170556, - "codegen.vasubu_vx": 491346, - "codegen.vcompress_vm": 64137, - "codegen.vcpop_m": 3588, - "codegen.vdiv_vv": 71197, - "codegen.vdiv_vx": 596818, - "codegen.vdivu_vv": 71626, - "codegen.vdivu_vx": 628867, - "codegen.vfadd_vf": 659554, - "codegen.vfadd_vv": 154180, - "codegen.vfclass_v": 12596, - "codegen.vfcvt_f_x_v": 12584, - 
"codegen.vfcvt_f_xu_v": 12584, - "codegen.vfcvt_rtz_x_f_v": 12584, - "codegen.vfcvt_rtz_xu_f_v": 12584, - "codegen.vfcvt_x_f_v": 12584, - "codegen.vfcvt_xu_f_v": 12584, - "codegen.vfdiv_vf": 2320265, - "codegen.vfdiv_vv": 307033, - "codegen.vfirst_m": 3500, - "codegen.vfmacc_vf": 686820, - "codegen.vfmacc_vv": 155174, - "codegen.vfmadd_vf": 686820, - "codegen.vfmadd_vv": 155174, - "codegen.vfmax_vf": 659554, - "codegen.vfmax_vv": 154180, - "codegen.vfmerge_vfm": 497630, - "codegen.vfmin_vf": 659554, - "codegen.vfmin_vv": 154180, - "codegen.vfmsac_vf": 686820, - "codegen.vfmsac_vv": 155174, - "codegen.vfmsub_vf": 686820, - "codegen.vfmsub_vv": 155174, - "codegen.vfmul_vf": 659554, - "codegen.vfmul_vv": 154180, - "codegen.vfmv_f_s": 13196, - "codegen.vfmv_s_f": 1907, - "codegen.vfmv_v_f": 4303, - "codegen.vfnmacc_vf": 686820, - "codegen.vfnmacc_vv": 155174, - "codegen.vfnmadd_vf": 686820, - "codegen.vfnmadd_vv": 155174, - "codegen.vfnmsac_vf": 686820, - "codegen.vfnmsac_vv": 155174, - "codegen.vfnmsub_vf": 686828, - "codegen.vfnmsub_vv": 155174, - "codegen.vfrdiv_vf": 2320265, - "codegen.vfrec7_v": 12712, - "codegen.vfredmax_vs": 230224, - "codegen.vfredmin_vs": 230224, - "codegen.vfredosum_vs": 375766, - "codegen.vfredusum_vs": 230224, - "codegen.vfrsqrt7_v": 12584, - "codegen.vfrsub_vf": 659554, - "codegen.vfsgnj_vf": 557835, - "codegen.vfsgnj_vv": 154180, - "codegen.vfsgnjn_vf": 557835, - "codegen.vfsgnjn_vv": 154180, - "codegen.vfsgnjx_vf": 557835, - "codegen.vfsgnjx_vv": 155382, - "codegen.vfslide1down_vf": 9138743, - "codegen.vfslide1up_vf": 9111351, - "codegen.vfsqrt_v": 29247, - "codegen.vfsub_vf": 659554, - "codegen.vfsub_vv": 154196, - "codegen.vid_v": 29452, - "codegen.viota_m": 116460, + "asm.mmm": 51749, + "asm.smoke": 8001, + "asm.strlen": 7986, + "asm.utf8_count": 205, + "codegen.vaadd_vv": 170452, + "codegen.vaadd_vx": 490423, + "codegen.vaaddu_vv": 170452, + "codegen.vaaddu_vx": 490423, + "codegen.vadc_vim": 45186, + "codegen.vadc_vvm": 21491, + 
"codegen.vadc_vxm": 61706, + "codegen.vadd_vi": 89522, + "codegen.vadd_vv": 43282, + "codegen.vadd_vx": 123663, + "codegen.vand_vi": 89612, + "codegen.vand_vv": 43282, + "codegen.vand_vx": 123814, + "codegen.vasub_vv": 170452, + "codegen.vasub_vx": 490423, + "codegen.vasubu_vv": 170452, + "codegen.vasubu_vx": 490423, + "codegen.vcompress_vm": 19341, + "codegen.vcpop_m": 3603, + "codegen.vdiv_vv": 70806, + "codegen.vdiv_vx": 595233, + "codegen.vdivu_vv": 71250, + "codegen.vdivu_vx": 627278, + "codegen.vfadd_vf": 659445, + "codegen.vfadd_vv": 152058, + "codegen.vfclass_v": 12627, + "codegen.vfcvt_f_x_v": 12623, + "codegen.vfcvt_f_xu_v": 12623, + "codegen.vfcvt_rtz_x_f_v": 12623, + "codegen.vfcvt_rtz_xu_f_v": 12623, + "codegen.vfcvt_x_f_v": 12623, + "codegen.vfcvt_xu_f_v": 12623, + "codegen.vfdiv_vf": 2322335, + "codegen.vfdiv_vv": 302637, + "codegen.vfirst_m": 3474, + "codegen.vfmacc_vf": 697013, + "codegen.vfmacc_vv": 152821, + "codegen.vfmadd_vf": 697013, + "codegen.vfmadd_vv": 152821, + "codegen.vfmax_vf": 659445, + "codegen.vfmax_vv": 152058, + "codegen.vfmerge_vfm": 496779, + "codegen.vfmin_vf": 659445, + "codegen.vfmin_vv": 152058, + "codegen.vfmsac_vf": 697013, + "codegen.vfmsac_vv": 152821, + "codegen.vfmsub_vf": 697013, + "codegen.vfmsub_vv": 152821, + "codegen.vfmul_vf": 659445, + "codegen.vfmul_vv": 152058, + "codegen.vfmv_f_s": 13054, + "codegen.vfmv_s_f": 1818, + "codegen.vfmv_v_f": 4141, + "codegen.vfnmacc_vf": 697013, + "codegen.vfnmacc_vv": 152821, + "codegen.vfnmadd_vf": 697013, + "codegen.vfnmadd_vv": 152821, + "codegen.vfnmsac_vf": 697013, + "codegen.vfnmsac_vv": 152821, + "codegen.vfnmsub_vf": 697078, + "codegen.vfnmsub_vv": 152821, + "codegen.vfrdiv_vf": 2322335, + "codegen.vfrec7_v": 12751, + "codegen.vfredmax_vs": 225887, + "codegen.vfredmin_vs": 225887, + "codegen.vfredosum_vs": 309223, + "codegen.vfredusum_vs": 225887, + "codegen.vfrsqrt7_v": 12623, + "codegen.vfrsub_vf": 659445, + "codegen.vfsgnj_vf": 557860, + "codegen.vfsgnj_vv": 152058, + 
"codegen.vfsgnjn_vf": 557860, + "codegen.vfsgnjn_vv": 152058, + "codegen.vfsgnjx_vf": 557860, + "codegen.vfsgnjx_vv": 153048, + "codegen.vfslide1down_vf": 1030070, + "codegen.vfslide1up_vf": 1081477, + "codegen.vfsqrt_v": 29261, + "codegen.vfsub_vf": 659445, + "codegen.vfsub_vv": 152118, + "codegen.vid_v": 27213, + "codegen.viota_m": 38960, "codegen.vl1re16_v": 1844, "codegen.vl1re32_v": 1844, "codegen.vl1re8_v": 1844, - "codegen.vl2re16_v": 1972, - "codegen.vl2re32_v": 1972, - "codegen.vl2re8_v": 1972, - "codegen.vl4re16_v": 2228, - "codegen.vl4re32_v": 2228, - "codegen.vl4re8_v": 2448, + "codegen.vl2re16_v": 1973, + "codegen.vl2re32_v": 1973, + "codegen.vl2re8_v": 1973, + "codegen.vl4re16_v": 2229, + "codegen.vl4re32_v": 2229, + "codegen.vl4re8_v": 2449, "codegen.vl8re16_v": 2803, - "codegen.vl8re32_v": 2740, - "codegen.vl8re8_v": 2740, - "codegen.vle16_v": 8906, - "codegen.vle16ff_v": 25539, - "codegen.vle32_v": 7624, - "codegen.vle32ff_v": 15363, - "codegen.vle8_v": 10396, - "codegen.vle8ff_v": 47173, - "codegen.vlm_v": 2025, - "codegen.vloxei16_v": 72844, - "codegen.vloxei32_v": 40153, - "codegen.vloxei8_v": 112117, - "codegen.vloxseg2ei16_v": 74510, - "codegen.vloxseg2ei32_v": 41260, - "codegen.vloxseg2ei8_v": 93533, - "codegen.vloxseg3ei16_v": 50793, - "codegen.vloxseg3ei32_v": 32445, - "codegen.vloxseg3ei8_v": 69535, - "codegen.vloxseg4ei16_v": 61472, - "codegen.vloxseg4ei32_v": 37598, - "codegen.vloxseg4ei8_v": 86155, - "codegen.vloxseg5ei16_v": 37155, - "codegen.vloxseg5ei32_v": 17675, - "codegen.vloxseg5ei8_v": 50049, - "codegen.vloxseg6ei16_v": 42130, - "codegen.vloxseg6ei32_v": 19387, - "codegen.vloxseg6ei8_v": 57397, - "codegen.vloxseg7ei16_v": 47105, - "codegen.vloxseg7ei32_v": 21099, - "codegen.vloxseg7ei8_v": 64745, - "codegen.vloxseg8ei16_v": 52080, - "codegen.vloxseg8ei32_v": 22811, - "codegen.vloxseg8ei8_v": 72093, - "codegen.vlse16_v": 117005, - "codegen.vlse32_v": 67293, - "codegen.vlse8_v": 224501, - "codegen.vlseg2e16_v": 7630, - 
"codegen.vlseg2e32_v": 6153, - "codegen.vlseg2e8_v": 8932, - "codegen.vlseg3e16_v": 6090, - "codegen.vlseg3e32_v": 4997, - "codegen.vlseg3e8_v": 7448, - "codegen.vlseg4e16_v": 6468, - "codegen.vlseg4e32_v": 5311, - "codegen.vlseg4e8_v": 7658, - "codegen.vlseg5e16_v": 5597, - "codegen.vlseg5e32_v": 3711, - "codegen.vlseg5e8_v": 7269, - "codegen.vlseg6e16_v": 5787, - "codegen.vlseg6e32_v": 3823, - "codegen.vlseg6e8_v": 7480, - "codegen.vlseg7e16_v": 5977, - "codegen.vlseg7e32_v": 3935, - "codegen.vlseg7e8_v": 7708, - "codegen.vlseg8e16_v": 6150, - "codegen.vlseg8e32_v": 4047, - "codegen.vlseg8e8_v": 7953, - "codegen.vlsseg2e16_v": 111727, - "codegen.vlsseg2e32_v": 56680, - "codegen.vlsseg2e8_v": 204238, - "codegen.vlsseg3e16_v": 80320, - "codegen.vlsseg3e32_v": 39661, - "codegen.vlsseg3e8_v": 162886, - "codegen.vlsseg4e16_v": 100425, - "codegen.vlsseg4e32_v": 48241, - "codegen.vlsseg4e8_v": 207523, - "codegen.vlsseg5e16_v": 64750, - "codegen.vlsseg5e32_v": 23178, - "codegen.vlsseg5e8_v": 134802, - "codegen.vlsseg6e16_v": 74320, - "codegen.vlsseg6e32_v": 26028, - "codegen.vlsseg6e8_v": 156552, - "codegen.vlsseg7e16_v": 83890, - "codegen.vlsseg7e32_v": 28878, - "codegen.vlsseg7e8_v": 178302, - "codegen.vlsseg8e16_v": 93460, - "codegen.vlsseg8e32_v": 31728, - "codegen.vlsseg8e8_v": 200052, - "codegen.vluxei16_v": 72844, - "codegen.vluxei32_v": 40153, - "codegen.vluxei8_v": 112117, - "codegen.vluxseg2ei16_v": 74510, - "codegen.vluxseg2ei32_v": 41260, - "codegen.vluxseg2ei8_v": 93533, - "codegen.vluxseg3ei16_v": 50793, - "codegen.vluxseg3ei32_v": 32445, - "codegen.vluxseg3ei8_v": 69535, - "codegen.vluxseg4ei16_v": 61472, - "codegen.vluxseg4ei32_v": 37598, - "codegen.vluxseg4ei8_v": 86155, - "codegen.vluxseg5ei16_v": 37155, - "codegen.vluxseg5ei32_v": 17675, - "codegen.vluxseg5ei8_v": 50049, - "codegen.vluxseg6ei16_v": 42130, - "codegen.vluxseg6ei32_v": 19387, - "codegen.vluxseg6ei8_v": 57397, - "codegen.vluxseg7ei16_v": 47105, - "codegen.vluxseg7ei32_v": 21099, - 
"codegen.vluxseg7ei8_v": 64745, - "codegen.vluxseg8ei16_v": 52080, - "codegen.vluxseg8ei32_v": 22811, - "codegen.vluxseg8ei8_v": 72093, - "codegen.vmacc_vv": 43783, - "codegen.vmacc_vx": 129798, - "codegen.vmadc_vi": 58486, - "codegen.vmadc_vim": 60754, - "codegen.vmadc_vv": 18860, - "codegen.vmadc_vvm": 21407, - "codegen.vmadc_vx": 81621, - "codegen.vmadc_vxm": 83389, - "codegen.vmadd_vv": 43783, - "codegen.vmadd_vx": 129798, - "codegen.vmand_mm": 16487, - "codegen.vmandn_mm": 16487, - "codegen.vmax_vv": 43308, - "codegen.vmax_vx": 123867, - "codegen.vmaxu_vv": 43308, - "codegen.vmaxu_vx": 123867, - "codegen.vmerge_vim": 82376, - "codegen.vmerge_vvm": 26348, - "codegen.vmerge_vxm": 113446, - "codegen.vmfeq_vf": 1053178, - "codegen.vmfeq_vv": 168173, - "codegen.vmfge_vf": 1053178, - "codegen.vmfgt_vf": 1053178, - "codegen.vmfle_vf": 1053178, - "codegen.vmfle_vv": 168173, - "codegen.vmflt_vf": 1053178, - "codegen.vmflt_vv": 168173, - "codegen.vmfne_vf": 1053178, - "codegen.vmfne_vv": 168173, - "codegen.vmin_vv": 43308, - "codegen.vmin_vx": 123867, - "codegen.vminu_vv": 43308, - "codegen.vminu_vx": 123867, - "codegen.vmnand_mm": 16487, - "codegen.vmnor_mm": 16487, - "codegen.vmor_mm": 16487, - "codegen.vmorn_mm": 16487, - "codegen.vmsbc_vv": 18860, - "codegen.vmsbc_vvm": 21407, - "codegen.vmsbc_vx": 81310, - "codegen.vmsbc_vxm": 83389, - "codegen.vmsbf_m": 2702, - "codegen.vmseq_vi": 144220, - "codegen.vmseq_vv": 44317, - "codegen.vmseq_vx": 199993, - "codegen.vmsgt_vi": 144220, - "codegen.vmsgt_vv": 44317, - "codegen.vmsgt_vx": 199993, - "codegen.vmsgtu_vi": 144220, - "codegen.vmsgtu_vv": 44317, - "codegen.vmsgtu_vx": 199993, - "codegen.vmsif_m": 2702, - "codegen.vmsle_vi": 144220, - "codegen.vmsle_vv": 44317, - "codegen.vmsle_vx": 199993, - "codegen.vmsleu_vi": 144220, - "codegen.vmsleu_vv": 44317, - "codegen.vmsleu_vx": 199993, - "codegen.vmslt_vv": 44317, - "codegen.vmslt_vx": 199993, - "codegen.vmsltu_vv": 44317, - "codegen.vmsltu_vx": 199993, - 
"codegen.vmsne_vi": 144220, - "codegen.vmsne_vv": 44317, - "codegen.vmsne_vx": 199993, - "codegen.vmsof_m": 2702, - "codegen.vmul_vv": 43466, - "codegen.vmul_vx": 167670, - "codegen.vmulh_vv": 43466, - "codegen.vmulh_vx": 167670, - "codegen.vmulhsu_vv": 43466, - "codegen.vmulhsu_vx": 167670, - "codegen.vmulhu_vv": 43466, - "codegen.vmulhu_vx": 167670, - "codegen.vmv1r_v": 2862, + "codegen.vl8re32_v": 2741, + "codegen.vl8re8_v": 2741, + "codegen.vle16_v": 8861, + "codegen.vle16ff_v": 25407, + "codegen.vle32_v": 7594, + "codegen.vle32ff_v": 15259, + "codegen.vle8_v": 10366, + "codegen.vle8ff_v": 47015, + "codegen.vlm_v": 2027, + "codegen.vloxei16_v": 70166, + "codegen.vloxei32_v": 38066, + "codegen.vloxei8_v": 109654, + "codegen.vloxseg2ei16_v": 72886, + "codegen.vloxseg2ei32_v": 39913, + "codegen.vloxseg2ei8_v": 92475, + "codegen.vloxseg3ei16_v": 50116, + "codegen.vloxseg3ei32_v": 31720, + "codegen.vloxseg3ei8_v": 69073, + "codegen.vloxseg4ei16_v": 60795, + "codegen.vloxseg4ei32_v": 36872, + "codegen.vloxseg4ei8_v": 85693, + "codegen.vloxseg5ei16_v": 36868, + "codegen.vloxseg5ei32_v": 17437, + "codegen.vloxseg5ei8_v": 49894, + "codegen.vloxseg6ei16_v": 41843, + "codegen.vloxseg6ei32_v": 19149, + "codegen.vloxseg6ei8_v": 57242, + "codegen.vloxseg7ei16_v": 46818, + "codegen.vloxseg7ei32_v": 20861, + "codegen.vloxseg7ei8_v": 64590, + "codegen.vloxseg8ei16_v": 51793, + "codegen.vloxseg8ei32_v": 22573, + "codegen.vloxseg8ei8_v": 71938, + "codegen.vlse16_v": 117039, + "codegen.vlse32_v": 67298, + "codegen.vlse8_v": 224575, + "codegen.vlseg2e16_v": 7600, + "codegen.vlseg2e32_v": 6113, + "codegen.vlseg2e8_v": 8892, + "codegen.vlseg3e16_v": 5991, + "codegen.vlseg3e32_v": 4931, + "codegen.vlseg3e8_v": 7418, + "codegen.vlseg4e16_v": 6369, + "codegen.vlseg4e32_v": 5246, + "codegen.vlseg4e8_v": 7626, + "codegen.vlseg5e16_v": 5492, + "codegen.vlseg5e32_v": 3650, + "codegen.vlseg5e8_v": 7135, + "codegen.vlseg6e16_v": 5677, + "codegen.vlseg6e32_v": 3762, + "codegen.vlseg6e8_v": 
7350, + "codegen.vlseg7e16_v": 5865, + "codegen.vlseg7e32_v": 3874, + "codegen.vlseg7e8_v": 7574, + "codegen.vlseg8e16_v": 6056, + "codegen.vlseg8e32_v": 3986, + "codegen.vlseg8e8_v": 7811, + "codegen.vlsseg2e16_v": 111747, + "codegen.vlsseg2e32_v": 56705, + "codegen.vlsseg2e8_v": 204285, + "codegen.vlsseg3e16_v": 80332, + "codegen.vlsseg3e32_v": 39679, + "codegen.vlsseg3e8_v": 162954, + "codegen.vlsseg4e16_v": 100437, + "codegen.vlsseg4e32_v": 48259, + "codegen.vlsseg4e8_v": 207563, + "codegen.vlsseg5e16_v": 64763, + "codegen.vlsseg5e32_v": 23189, + "codegen.vlsseg5e8_v": 134835, + "codegen.vlsseg6e16_v": 74333, + "codegen.vlsseg6e32_v": 26039, + "codegen.vlsseg6e8_v": 156585, + "codegen.vlsseg7e16_v": 83903, + "codegen.vlsseg7e32_v": 28889, + "codegen.vlsseg7e8_v": 178335, + "codegen.vlsseg8e16_v": 93473, + "codegen.vlsseg8e32_v": 31739, + "codegen.vlsseg8e8_v": 200085, + "codegen.vluxei16_v": 70166, + "codegen.vluxei32_v": 38066, + "codegen.vluxei8_v": 109654, + "codegen.vluxseg2ei16_v": 72886, + "codegen.vluxseg2ei32_v": 39913, + "codegen.vluxseg2ei8_v": 92475, + "codegen.vluxseg3ei16_v": 50116, + "codegen.vluxseg3ei32_v": 31720, + "codegen.vluxseg3ei8_v": 69073, + "codegen.vluxseg4ei16_v": 60795, + "codegen.vluxseg4ei32_v": 36872, + "codegen.vluxseg4ei8_v": 85693, + "codegen.vluxseg5ei16_v": 36868, + "codegen.vluxseg5ei32_v": 17437, + "codegen.vluxseg5ei8_v": 49894, + "codegen.vluxseg6ei16_v": 41843, + "codegen.vluxseg6ei32_v": 19149, + "codegen.vluxseg6ei8_v": 57242, + "codegen.vluxseg7ei16_v": 46818, + "codegen.vluxseg7ei32_v": 20861, + "codegen.vluxseg7ei8_v": 64590, + "codegen.vluxseg8ei16_v": 51793, + "codegen.vluxseg8ei32_v": 22573, + "codegen.vluxseg8ei8_v": 71938, + "codegen.vmacc_vv": 43698, + "codegen.vmacc_vx": 132377, + "codegen.vmadc_vi": 52961, + "codegen.vmadc_vim": 55596, + "codegen.vmadc_vv": 17232, + "codegen.vmadc_vvm": 20130, + "codegen.vmadc_vx": 74385, + "codegen.vmadc_vxm": 76284, + "codegen.vmadd_vv": 43698, + "codegen.vmadd_vx": 
132377, + "codegen.vmand_mm": 16488, + "codegen.vmandn_mm": 16488, + "codegen.vmax_vv": 43282, + "codegen.vmax_vx": 123663, + "codegen.vmaxu_vv": 43282, + "codegen.vmaxu_vx": 123663, + "codegen.vmerge_vim": 81882, + "codegen.vmerge_vvm": 26334, + "codegen.vmerge_vxm": 113256, + "codegen.vmfeq_vf": 941663, + "codegen.vmfeq_vv": 155759, + "codegen.vmfge_vf": 941663, + "codegen.vmfgt_vf": 941663, + "codegen.vmfle_vf": 941663, + "codegen.vmfle_vv": 155759, + "codegen.vmflt_vf": 941663, + "codegen.vmflt_vv": 155759, + "codegen.vmfne_vf": 941663, + "codegen.vmfne_vv": 155759, + "codegen.vmin_vv": 43282, + "codegen.vmin_vx": 123663, + "codegen.vminu_vv": 43282, + "codegen.vminu_vx": 123663, + "codegen.vmnand_mm": 16488, + "codegen.vmnor_mm": 16488, + "codegen.vmor_mm": 16488, + "codegen.vmorn_mm": 16488, + "codegen.vmsbc_vv": 17232, + "codegen.vmsbc_vvm": 20130, + "codegen.vmsbc_vx": 74021, + "codegen.vmsbc_vxm": 76284, + "codegen.vmsbf_m": 2675, + "codegen.vmseq_vi": 109496, + "codegen.vmseq_vv": 39180, + "codegen.vmseq_vx": 150872, + "codegen.vmsgt_vi": 109496, + "codegen.vmsgt_vv": 39180, + "codegen.vmsgt_vx": 150872, + "codegen.vmsgtu_vi": 109496, + "codegen.vmsgtu_vv": 39180, + "codegen.vmsgtu_vx": 150872, + "codegen.vmsif_m": 2675, + "codegen.vmsle_vi": 109496, + "codegen.vmsle_vv": 39180, + "codegen.vmsle_vx": 150872, + "codegen.vmsleu_vi": 109496, + "codegen.vmsleu_vv": 39180, + "codegen.vmsleu_vx": 150872, + "codegen.vmslt_vv": 39180, + "codegen.vmslt_vx": 150872, + "codegen.vmsltu_vv": 39180, + "codegen.vmsltu_vx": 150872, + "codegen.vmsne_vi": 109496, + "codegen.vmsne_vv": 39180, + "codegen.vmsne_vx": 150872, + "codegen.vmsof_m": 2675, + "codegen.vmul_vv": 43454, + "codegen.vmul_vx": 167668, + "codegen.vmulh_vv": 43454, + "codegen.vmulh_vx": 167668, + "codegen.vmulhsu_vv": 43454, + "codegen.vmulhsu_vx": 167668, + "codegen.vmulhu_vv": 43454, + "codegen.vmulhu_vx": 167668, + "codegen.vmv1r_v": 2863, "codegen.vmv2r_v": 2951, - "codegen.vmv4r_v": 4323, - 
"codegen.vmv8r_v": 6937, - "codegen.vmv_s_x": 2514, - "codegen.vmv_v_i": 42917, - "codegen.vmv_v_v": 16733, - "codegen.vmv_v_x": 14463, - "codegen.vmv_x_s": 3859, - "codegen.vmxnor_mm": 16487, - "codegen.vmxor_mm": 16487, - "codegen.vnclip_wi": 338931, - "codegen.vnclip_wv": 106452, - "codegen.vnclip_wx": 471158, - "codegen.vnclipu_wi": 338931, - "codegen.vnclipu_wv": 106452, - "codegen.vnclipu_wx": 471158, - "codegen.vnmsac_vv": 43783, - "codegen.vnmsac_vx": 129798, - "codegen.vnmsub_vv": 43783, - "codegen.vnmsub_vx": 129845, - "codegen.vnsra_wi": 85460, - "codegen.vnsra_wv": 27050, - "codegen.vnsra_wx": 118024, - "codegen.vnsrl_wi": 85460, - "codegen.vnsrl_wv": 27050, - "codegen.vnsrl_wx": 118024, - "codegen.vor_vi": 90050, - "codegen.vor_vv": 43308, - "codegen.vor_vx": 124030, - "codegen.vredand_vs": 46035, - "codegen.vredmax_vs": 46035, - "codegen.vredmaxu_vs": 46035, - "codegen.vredmin_vs": 46035, - "codegen.vredminu_vs": 46035, - "codegen.vredor_vs": 46035, - "codegen.vredsum_vs": 46035, - "codegen.vredxor_vs": 46035, - "codegen.vrem_vv": 71197, - "codegen.vrem_vx": 596818, - "codegen.vremu_vv": 71626, - "codegen.vremu_vx": 628867, - "codegen.vrgather_vi": 194683, - "codegen.vrgather_vv": 347215, - "codegen.vrgather_vx": 235632, - "codegen.vrgatherei16_vv": 248363, - "codegen.vrsub_vi": 89916, - "codegen.vrsub_vx": 123867, - "codegen.vs1r_v": 1864, + "codegen.vmv4r_v": 4236, + "codegen.vmv8r_v": 6838, + "codegen.vmv_s_x": 2512, + "codegen.vmv_v_i": 42927, + "codegen.vmv_v_v": 16446, + "codegen.vmv_v_x": 13951, + "codegen.vmv_x_s": 3835, + "codegen.vmxnor_mm": 16488, + "codegen.vmxor_mm": 16488, + "codegen.vnclip_wi": 327597, + "codegen.vnclip_wv": 106245, + "codegen.vnclip_wx": 459101, + "codegen.vnclipu_wi": 327597, + "codegen.vnclipu_wv": 106245, + "codegen.vnclipu_wx": 459101, + "codegen.vnmsac_vv": 43698, + "codegen.vnmsac_vx": 132377, + "codegen.vnmsub_vv": 43698, + "codegen.vnmsub_vx": 132377, + "codegen.vnsra_wi": 82390, + "codegen.vnsra_wv": 26974, + 
"codegen.vnsra_wx": 114953, + "codegen.vnsrl_wi": 82390, + "codegen.vnsrl_wv": 26974, + "codegen.vnsrl_wx": 114953, + "codegen.vor_vi": 89612, + "codegen.vor_vv": 43282, + "codegen.vor_vx": 123814, + "codegen.vredand_vs": 46038, + "codegen.vredmax_vs": 46038, + "codegen.vredmaxu_vs": 46038, + "codegen.vredmin_vs": 46038, + "codegen.vredminu_vs": 46038, + "codegen.vredor_vs": 46038, + "codegen.vredsum_vs": 46038, + "codegen.vredxor_vs": 46038, + "codegen.vrem_vv": 70806, + "codegen.vrem_vx": 595233, + "codegen.vremu_vv": 71250, + "codegen.vremu_vx": 627278, + "codegen.vrgather_vi": 154310, + "codegen.vrgather_vv": 59125, + "codegen.vrgather_vx": 208229, + "codegen.vrgatherei16_vv": 43906, + "codegen.vrsub_vi": 89522, + "codegen.vrsub_vx": 123663, + "codegen.vs1r_v": 1872, "codegen.vs2r_v": 1960, "codegen.vs4r_v": 2216, "codegen.vs8r_v": 3018, - "codegen.vsadd_vi": 90872, - "codegen.vsadd_vv": 44263, - "codegen.vsadd_vx": 124447, - "codegen.vsaddu_vi": 90872, - "codegen.vsaddu_vv": 44263, - "codegen.vsaddu_vx": 124447, - "codegen.vsbc_vvm": 21499, - "codegen.vsbc_vxm": 91923, + "codegen.vsadd_vi": 90482, + "codegen.vsadd_vv": 44237, + "codegen.vsadd_vx": 124243, + "codegen.vsaddu_vi": 90482, + "codegen.vsaddu_vv": 44237, + "codegen.vsaddu_vx": 124243, + "codegen.vsbc_vvm": 21491, + "codegen.vsbc_vxm": 91809, "codegen.vse16_v": 8888, "codegen.vse32_v": 7681, "codegen.vse8_v": 10145, "codegen.vsetivli": 6005, "codegen.vsetvl": 1649, "codegen.vsetvli": 18626, - "codegen.vsext_vf2": 74544, - "codegen.vsext_vf4": 12424, - "codegen.vslide1down_vx": 3229852, - "codegen.vslide1up_vx": 3224140, - "codegen.vslidedown_vi": 2264530, - "codegen.vslidedown_vx": 3226300, - "codegen.vslideup_vi": 2261058, - "codegen.vslideup_vx": 3223172, - "codegen.vsll_vi": 144252, - "codegen.vsll_vv": 50841, - "codegen.vsll_vx": 201959, + "codegen.vsext_vf2": 13454, + "codegen.vsext_vf4": 4260, + "codegen.vslide1down_vx": 379997, + "codegen.vslide1up_vx": 379961, + "codegen.vslidedown_vi": 
266862, + "codegen.vslidedown_vx": 313411, + "codegen.vslideup_vi": 268568, + "codegen.vslideup_vx": 296567, + "codegen.vsll_vi": 143160, + "codegen.vsll_vv": 50559, + "codegen.vsll_vx": 201308, "codegen.vsm_v": 2062, - "codegen.vsmul_vv": 174798, - "codegen.vsmul_vx": 503126, - "codegen.vsoxei16_v": 79844, - "codegen.vsoxei32_v": 44251, - "codegen.vsoxei8_v": 121713, - "codegen.vsoxseg2ei16_v": 82246, - "codegen.vsoxseg2ei32_v": 46433, - "codegen.vsoxseg2ei8_v": 102066, - "codegen.vsoxseg3ei16_v": 57997, - "codegen.vsoxseg3ei32_v": 37706, - "codegen.vsoxseg3ei8_v": 77964, - "codegen.vsoxseg4ei16_v": 68676, - "codegen.vsoxseg4ei32_v": 42858, - "codegen.vsoxseg4ei8_v": 94584, - "codegen.vsoxseg5ei16_v": 44125, - "codegen.vsoxseg5ei32_v": 21045, - "codegen.vsoxseg5ei8_v": 59121, - "codegen.vsoxseg6ei16_v": 49100, - "codegen.vsoxseg6ei32_v": 22757, - "codegen.vsoxseg6ei8_v": 66469, - "codegen.vsoxseg7ei16_v": 54075, - "codegen.vsoxseg7ei32_v": 24469, - "codegen.vsoxseg7ei8_v": 73817, - "codegen.vsoxseg8ei16_v": 59050, - "codegen.vsoxseg8ei32_v": 26181, - "codegen.vsoxseg8ei8_v": 81165, - "codegen.vsra_vi": 144252, - "codegen.vsra_vv": 50833, - "codegen.vsra_vx": 201959, - "codegen.vsrl_vi": 144252, - "codegen.vsrl_vv": 50833, - "codegen.vsrl_vx": 201959, + "codegen.vsmul_vv": 174750, + "codegen.vsmul_vx": 502614, + "codegen.vsoxei16_v": 78491, + "codegen.vsoxei32_v": 43319, + "codegen.vsoxei8_v": 120521, + "codegen.vsoxseg2ei16_v": 80978, + "codegen.vsoxseg2ei32_v": 45428, + "codegen.vsoxseg2ei8_v": 101130, + "codegen.vsoxseg3ei16_v": 57245, + "codegen.vsoxseg3ei32_v": 37006, + "codegen.vsoxseg3ei8_v": 77378, + "codegen.vsoxseg4ei16_v": 67926, + "codegen.vsoxseg4ei32_v": 42162, + "codegen.vsoxseg4ei8_v": 93998, + "codegen.vsoxseg5ei16_v": 43764, + "codegen.vsoxseg5ei32_v": 20798, + "codegen.vsoxseg5ei8_v": 58866, + "codegen.vsoxseg6ei16_v": 48739, + "codegen.vsoxseg6ei32_v": 22510, + "codegen.vsoxseg6ei8_v": 66214, + "codegen.vsoxseg7ei16_v": 53714, + 
"codegen.vsoxseg7ei32_v": 24222, + "codegen.vsoxseg7ei8_v": 73562, + "codegen.vsoxseg8ei16_v": 58689, + "codegen.vsoxseg8ei32_v": 25934, + "codegen.vsoxseg8ei8_v": 80910, + "codegen.vsra_vi": 143160, + "codegen.vsra_vv": 50551, + "codegen.vsra_vx": 201308, + "codegen.vsrl_vi": 143160, + "codegen.vsrl_vv": 50551, + "codegen.vsrl_vx": 201308, "codegen.vsse16_v": 137067, "codegen.vsse32_v": 92941, "codegen.vsse8_v": 228853, "codegen.vsseg2e16_v": 7664, - "codegen.vsseg2e32_v": 6659, + "codegen.vsseg2e32_v": 6672, "codegen.vsseg2e8_v": 8870, "codegen.vsseg3e16_v": 7136, "codegen.vsseg3e32_v": 5678, "codegen.vsseg3e8_v": 8594, - "codegen.vsseg4e16_v": 7676, - "codegen.vsseg4e32_v": 6124, - "codegen.vsseg4e8_v": 9272, - "codegen.vsseg5e16_v": 6820, + "codegen.vsseg4e16_v": 7721, + "codegen.vsseg4e32_v": 6154, + "codegen.vsseg4e8_v": 9334, + "codegen.vsseg5e16_v": 6821, "codegen.vsseg5e32_v": 4193, - "codegen.vsseg5e8_v": 9430, - "codegen.vsseg6e16_v": 6898, - "codegen.vsseg6e32_v": 4298, - "codegen.vsseg6e8_v": 9515, - "codegen.vsseg7e16_v": 7228, - "codegen.vsseg7e32_v": 4507, - "codegen.vsseg7e8_v": 9850, - "codegen.vsseg8e16_v": 7626, - "codegen.vsseg8e32_v": 4723, - "codegen.vsseg8e8_v": 10377, - "codegen.vssra_vi": 575109, - "codegen.vssra_vv": 200640, - "codegen.vssra_vx": 1242295, - "codegen.vssrl_vi": 575109, - "codegen.vssrl_vv": 200640, - "codegen.vssrl_vx": 1242295, + "codegen.vsseg5e8_v": 9431, + "codegen.vsseg6e16_v": 6908, + "codegen.vsseg6e32_v": 4306, + "codegen.vsseg6e8_v": 9524, + "codegen.vsseg7e16_v": 7232, + "codegen.vsseg7e32_v": 4508, + "codegen.vsseg7e8_v": 9870, + "codegen.vsseg8e16_v": 7660, + "codegen.vsseg8e32_v": 4735, + "codegen.vsseg8e8_v": 10420, + "codegen.vssra_vi": 570699, + "codegen.vssra_vv": 199512, + "codegen.vssra_vx": 1238496, + "codegen.vssrl_vi": 570699, + "codegen.vssrl_vv": 199512, + "codegen.vssrl_vx": 1238496, "codegen.vssseg2e16_v": 130577, "codegen.vssseg2e32_v": 80467, "codegen.vssseg2e8_v": 211152, @@ -433,96 +433,96 @@ 
"codegen.vssseg8e16_v": 113525, "codegen.vssseg8e32_v": 44831, "codegen.vssseg8e8_v": 218268, - "codegen.vssub_vv": 43308, - "codegen.vssub_vx": 185107, - "codegen.vssubu_vv": 43308, - "codegen.vssubu_vx": 185107, - "codegen.vsub_vv": 43308, - "codegen.vsub_vx": 185107, - "codegen.vsuxei16_v": 79844, - "codegen.vsuxei32_v": 44251, - "codegen.vsuxei8_v": 121713, - "codegen.vsuxseg2ei16_v": 82246, - "codegen.vsuxseg2ei32_v": 46433, - "codegen.vsuxseg2ei8_v": 102066, - "codegen.vsuxseg3ei16_v": 57997, - "codegen.vsuxseg3ei32_v": 37706, - "codegen.vsuxseg3ei8_v": 77964, - "codegen.vsuxseg4ei16_v": 68676, - "codegen.vsuxseg4ei32_v": 42858, - "codegen.vsuxseg4ei8_v": 94584, - "codegen.vsuxseg5ei16_v": 44125, - "codegen.vsuxseg5ei32_v": 21045, - "codegen.vsuxseg5ei8_v": 59121, - "codegen.vsuxseg6ei16_v": 49100, - "codegen.vsuxseg6ei32_v": 22757, - "codegen.vsuxseg6ei8_v": 66469, - "codegen.vsuxseg7ei16_v": 54075, - "codegen.vsuxseg7ei32_v": 24469, - "codegen.vsuxseg7ei8_v": 73817, - "codegen.vsuxseg8ei16_v": 59050, - "codegen.vsuxseg8ei32_v": 26181, - "codegen.vsuxseg8ei8_v": 81165, - "codegen.vwadd_vv": 24689, - "codegen.vwadd_vx": 76158, - "codegen.vwadd_wv": 27195, - "codegen.vwadd_wx": 92809, - "codegen.vwaddu_vv": 24689, - "codegen.vwaddu_vx": 76158, - "codegen.vwaddu_wv": 27195, - "codegen.vwaddu_wx": 92809, - "codegen.vwmacc_vv": 26385, - "codegen.vwmacc_vx": 100646, - "codegen.vwmaccsu_vv": 26385, - "codegen.vwmaccsu_vx": 100646, - "codegen.vwmaccu_vv": 26385, - "codegen.vwmaccu_vx": 100646, - "codegen.vwmaccus_vx": 100646, - "codegen.vwmul_vv": 24720, - "codegen.vwmul_vx": 101803, - "codegen.vwmulsu_vv": 24720, - "codegen.vwmulsu_vx": 101803, - "codegen.vwmulu_vv": 24720, - "codegen.vwmulu_vx": 101803, + "codegen.vssub_vv": 43282, + "codegen.vssub_vx": 184911, + "codegen.vssubu_vv": 43282, + "codegen.vssubu_vx": 184911, + "codegen.vsub_vv": 43282, + "codegen.vsub_vx": 184911, + "codegen.vsuxei16_v": 78491, + "codegen.vsuxei32_v": 43319, + "codegen.vsuxei8_v": 
120521, + "codegen.vsuxseg2ei16_v": 80978, + "codegen.vsuxseg2ei32_v": 45428, + "codegen.vsuxseg2ei8_v": 101130, + "codegen.vsuxseg3ei16_v": 57245, + "codegen.vsuxseg3ei32_v": 37006, + "codegen.vsuxseg3ei8_v": 77378, + "codegen.vsuxseg4ei16_v": 67926, + "codegen.vsuxseg4ei32_v": 42162, + "codegen.vsuxseg4ei8_v": 93998, + "codegen.vsuxseg5ei16_v": 43764, + "codegen.vsuxseg5ei32_v": 20798, + "codegen.vsuxseg5ei8_v": 58866, + "codegen.vsuxseg6ei16_v": 48739, + "codegen.vsuxseg6ei32_v": 22510, + "codegen.vsuxseg6ei8_v": 66214, + "codegen.vsuxseg7ei16_v": 53714, + "codegen.vsuxseg7ei32_v": 24222, + "codegen.vsuxseg7ei8_v": 73562, + "codegen.vsuxseg8ei16_v": 58689, + "codegen.vsuxseg8ei32_v": 25934, + "codegen.vsuxseg8ei8_v": 80910, + "codegen.vwadd_vv": 24676, + "codegen.vwadd_vx": 75676, + "codegen.vwadd_wv": 27204, + "codegen.vwadd_wx": 91440, + "codegen.vwaddu_vv": 24676, + "codegen.vwaddu_vx": 75676, + "codegen.vwaddu_wv": 27204, + "codegen.vwaddu_wx": 91440, + "codegen.vwmacc_vv": 26382, + "codegen.vwmacc_vx": 99430, + "codegen.vwmaccsu_vv": 26382, + "codegen.vwmaccsu_vx": 99430, + "codegen.vwmaccu_vv": 26382, + "codegen.vwmaccu_vx": 99430, + "codegen.vwmaccus_vx": 99430, + "codegen.vwmul_vv": 24726, + "codegen.vwmul_vx": 101274, + "codegen.vwmulsu_vv": 24726, + "codegen.vwmulsu_vx": 101274, + "codegen.vwmulu_vv": 24726, + "codegen.vwmulu_vx": 101274, "codegen.vwredsum_vs": 26838, "codegen.vwredsumu_vs": 26838, - "codegen.vwsub_vv": 24689, - "codegen.vwsub_vx": 76158, - "codegen.vwsub_wv": 27195, - "codegen.vwsub_wx": 92809, - "codegen.vwsubu_vv": 24689, - "codegen.vwsubu_vx": 76158, - "codegen.vwsubu_wv": 27195, - "codegen.vwsubu_wx": 92809, - "codegen.vxor_vi": 90050, - "codegen.vxor_vv": 43308, - "codegen.vxor_vx": 124030, - "codegen.vzext_vf2": 74544, - "codegen.vzext_vf4": 12424, - "intrinsic.conv2d_less_m2": 2497, - "intrinsic.linear_normalization": 3413, - "intrinsic.matmul": 70908, - "intrinsic.softmax": 7051, + "codegen.vwsub_vv": 24676, + 
"codegen.vwsub_vx": 75676, + "codegen.vwsub_wv": 27204, + "codegen.vwsub_wx": 91440, + "codegen.vwsubu_vv": 24676, + "codegen.vwsubu_vx": 75676, + "codegen.vwsubu_wv": 27204, + "codegen.vwsubu_wx": 91440, + "codegen.vxor_vi": 89612, + "codegen.vxor_vv": 43282, + "codegen.vxor_vx": 123814, + "codegen.vzext_vf2": 13454, + "codegen.vzext_vf4": 4260, + "intrinsic.conv2d_less_m2": 2498, + "intrinsic.linear_normalization": 3350, + "intrinsic.matmul": 61748, + "intrinsic.softmax": 6793, "mlir.axpy_masked": 4048, - "mlir.conv": 124726, - "mlir.hello": 140, - "mlir.matmul": 55579, - "mlir.maxvl_tail_setvl_front": 726, - "mlir.rvv_vp_intrinsic_add": 477, - "mlir.rvv_vp_intrinsic_add_scalable": 813, - "mlir.stripmining": 10240, - "mlir.vectoradd": 17183, + "mlir.conv": 125859, + "mlir.hello": 131, + "mlir.matmul": 56059, + "mlir.maxvl_tail_setvl_front": 700, + "mlir.rvv_vp_intrinsic_add": 469, + "mlir.rvv_vp_intrinsic_add_scalable": 807, + "mlir.stripmining": 8882, + "mlir.vectoradd": 15530, "pytorch.demo": 31521, "pytorch.matmul": 69793, - "rvv_bench.ascii_to_utf16": 702730, - "rvv_bench.ascii_to_utf32": 233308, - "rvv_bench.byteswap": 488091, + "rvv_bench.ascii_to_utf16": 677090, + "rvv_bench.ascii_to_utf32": 226910, + "rvv_bench.byteswap": 399524, "rvv_bench.chacha20": 39957, - "rvv_bench.mandelbrot": 539087, - "rvv_bench.memcpy": 673836, - "rvv_bench.memset": 290713, - "rvv_bench.mergelines": 629787, + "rvv_bench.mandelbrot": 529063, + "rvv_bench.memcpy": 671955, + "rvv_bench.memset": 290725, + "rvv_bench.mergelines": 564228, "rvv_bench.poly1305": 39957, - "rvv_bench.strlen": 219699, - "rvv_bench.utf8_count": 2283633 + "rvv_bench.strlen": 219139, + "rvv_bench.utf8_count": 2283382 } \ No newline at end of file diff --git a/.github/designs/rookidee/t1rocketemu.json b/.github/designs/rookidee/t1rocketemu.json index e0feb693f..043fc72ca 100644 --- a/.github/designs/rookidee/t1rocketemu.json +++ b/.github/designs/rookidee/t1rocketemu.json @@ -1,357 +1,357 @@ { - "asm.mmm": 
93690, - "asm.smoke": 4859, - "codegen.vaadd_vv": 118066, - "codegen.vaadd_vx": 354670, - "codegen.vaaddu_vv": 118066, - "codegen.vaaddu_vx": 354670, - "codegen.vadc_vim": 31477, - "codegen.vadc_vvm": 14554, + "asm.mmm": 51749, + "asm.smoke": 4564, + "codegen.vaadd_vv": 118067, + "codegen.vaadd_vx": 354504, + "codegen.vaaddu_vv": 118067, + "codegen.vaaddu_vx": 354504, + "codegen.vadc_vim": 31478, + "codegen.vadc_vvm": 14555, "codegen.vadc_vxm": 44308, - "codegen.vadd_vi": 62874, - "codegen.vadd_vv": 29206, - "codegen.vadd_vx": 88766, - "codegen.vand_vi": 62874, - "codegen.vand_vv": 29206, - "codegen.vand_vx": 88766, - "codegen.vasub_vv": 118066, - "codegen.vasub_vx": 354670, - "codegen.vasubu_vv": 118066, - "codegen.vasubu_vx": 354670, - "codegen.vcompress_vm": 20331, - "codegen.vcpop_m": 2464, - "codegen.vdiv_vv": 33648, - "codegen.vdiv_vx": 200480, - "codegen.vdivu_vv": 33726, - "codegen.vdivu_vx": 207983, - "codegen.vfirst_m": 2376, - "codegen.vid_v": 18696, - "codegen.viota_m": 38506, + "codegen.vadd_vi": 62877, + "codegen.vadd_vv": 29207, + "codegen.vadd_vx": 88767, + "codegen.vand_vi": 62877, + "codegen.vand_vv": 29207, + "codegen.vand_vx": 88767, + "codegen.vasub_vv": 118067, + "codegen.vasub_vx": 354504, + "codegen.vasubu_vv": 118067, + "codegen.vasubu_vx": 354504, + "codegen.vcompress_vm": 11918, + "codegen.vcpop_m": 2479, + "codegen.vdiv_vv": 33501, + "codegen.vdiv_vx": 200221, + "codegen.vdivu_vv": 33544, + "codegen.vdivu_vx": 207584, + "codegen.vfirst_m": 2350, + "codegen.vid_v": 18586, + "codegen.viota_m": 22745, "codegen.vl1re16_v": 641, - "codegen.vl1re32_v": 633, - "codegen.vl1re8_v": 633, - "codegen.vl2re16_v": 635, - "codegen.vl2re32_v": 635, - "codegen.vl2re8_v": 635, + "codegen.vl1re32_v": 634, + "codegen.vl1re8_v": 634, + "codegen.vl2re16_v": 636, + "codegen.vl2re32_v": 636, + "codegen.vl2re8_v": 636, "codegen.vl4re16_v": 692, "codegen.vl4re32_v": 692, - "codegen.vl4re8_v": 873, - "codegen.vl8re16_v": 822, - "codegen.vl8re32_v": 820, - 
"codegen.vl8re8_v": 820, + "codegen.vl4re8_v": 874, + "codegen.vl8re16_v": 824, + "codegen.vl8re32_v": 821, + "codegen.vl8re8_v": 821, "codegen.vle16_v": 7129, - "codegen.vle16ff_v": 9062, + "codegen.vle16ff_v": 8995, "codegen.vle32_v": 5697, - "codegen.vle32ff_v": 6192, + "codegen.vle32ff_v": 6158, "codegen.vle8_v": 8556, - "codegen.vle8ff_v": 14809, - "codegen.vlm_v": 696, - "codegen.vloxei16_v": 27792, - "codegen.vloxei32_v": 17233, - "codegen.vloxei8_v": 38330, - "codegen.vloxseg2ei16_v": 24647, - "codegen.vloxseg2ei32_v": 15647, - "codegen.vloxseg2ei8_v": 29994, - "codegen.vloxseg3ei16_v": 16634, - "codegen.vloxseg3ei32_v": 10791, - "codegen.vloxseg3ei8_v": 21042, - "codegen.vloxseg4ei16_v": 18180, - "codegen.vloxseg4ei32_v": 11230, - "codegen.vloxseg4ei8_v": 24283, - "codegen.vloxseg5ei16_v": 9699, - "codegen.vloxseg5ei32_v": 5506, - "codegen.vloxseg5ei8_v": 14602, - "codegen.vloxseg6ei16_v": 10239, - "codegen.vloxseg6ei32_v": 5884, - "codegen.vloxseg6ei8_v": 16406, - "codegen.vloxseg7ei16_v": 10953, - "codegen.vloxseg7ei32_v": 6284, - "codegen.vloxseg7ei8_v": 18210, - "codegen.vloxseg8ei16_v": 11678, - "codegen.vloxseg8ei32_v": 6684, - "codegen.vloxseg8ei8_v": 20014, - "codegen.vlse16_v": 39627, - "codegen.vlse32_v": 25942, - "codegen.vlse8_v": 68058, - "codegen.vlseg2e16_v": 5738, + "codegen.vle8ff_v": 14696, + "codegen.vlm_v": 699, + "codegen.vloxei16_v": 27373, + "codegen.vloxei32_v": 16937, + "codegen.vloxei8_v": 37917, + "codegen.vloxseg2ei16_v": 24428, + "codegen.vloxseg2ei32_v": 15505, + "codegen.vloxseg2ei8_v": 29869, + "codegen.vloxseg3ei16_v": 16575, + "codegen.vloxseg3ei32_v": 10757, + "codegen.vloxseg3ei8_v": 21016, + "codegen.vloxseg4ei16_v": 18111, + "codegen.vloxseg4ei32_v": 11200, + "codegen.vloxseg4ei8_v": 24271, + "codegen.vloxseg5ei16_v": 9690, + "codegen.vloxseg5ei32_v": 5490, + "codegen.vloxseg5ei8_v": 14631, + "codegen.vloxseg6ei16_v": 10242, + "codegen.vloxseg6ei32_v": 5877, + "codegen.vloxseg6ei8_v": 16435, + "codegen.vloxseg7ei16_v": 
10967, + "codegen.vloxseg7ei32_v": 6277, + "codegen.vloxseg7ei8_v": 18239, + "codegen.vloxseg8ei16_v": 11692, + "codegen.vloxseg8ei32_v": 6677, + "codegen.vloxseg8ei8_v": 20043, + "codegen.vlse16_v": 39602, + "codegen.vlse32_v": 25930, + "codegen.vlse8_v": 68022, + "codegen.vlseg2e16_v": 5739, "codegen.vlseg2e32_v": 4294, - "codegen.vlseg2e8_v": 7150, + "codegen.vlseg2e8_v": 7151, "codegen.vlseg3e16_v": 4315, "codegen.vlseg3e32_v": 2915, - "codegen.vlseg3e8_v": 5734, - "codegen.vlseg4e16_v": 4318, + "codegen.vlseg3e8_v": 5735, + "codegen.vlseg4e16_v": 4319, "codegen.vlseg4e32_v": 2917, "codegen.vlseg4e8_v": 5737, - "codegen.vlseg5e16_v": 2925, + "codegen.vlseg5e16_v": 2926, "codegen.vlseg5e32_v": 1500, - "codegen.vlseg5e8_v": 4358, + "codegen.vlseg5e8_v": 4359, "codegen.vlseg6e16_v": 2926, "codegen.vlseg6e32_v": 1500, - "codegen.vlseg6e8_v": 4360, + "codegen.vlseg6e8_v": 4361, "codegen.vlseg7e16_v": 2928, "codegen.vlseg7e32_v": 1502, - "codegen.vlseg7e8_v": 4362, - "codegen.vlseg8e16_v": 2929, - "codegen.vlseg8e32_v": 1503, - "codegen.vlseg8e8_v": 4364, - "codegen.vlsseg2e16_v": 33278, - "codegen.vlsseg2e32_v": 19904, - "codegen.vlsseg2e8_v": 56918, - "codegen.vlsseg3e16_v": 23085, - "codegen.vlsseg3e32_v": 11962, - "codegen.vlsseg3e8_v": 43116, - "codegen.vlsseg4e16_v": 26561, - "codegen.vlsseg4e32_v": 12587, - "codegen.vlsseg4e8_v": 53325, - "codegen.vlsseg5e16_v": 14465, - "codegen.vlsseg5e32_v": 6438, - "codegen.vlsseg5e8_v": 35805, - "codegen.vlsseg6e16_v": 15835, - "codegen.vlsseg6e32_v": 7098, - "codegen.vlsseg6e8_v": 41200, - "codegen.vlsseg7e16_v": 17356, - "codegen.vlsseg7e32_v": 7758, - "codegen.vlsseg7e8_v": 46595, - "codegen.vlsseg8e16_v": 18951, - "codegen.vlsseg8e32_v": 8418, - "codegen.vlsseg8e8_v": 51990, - "codegen.vluxei16_v": 27792, - "codegen.vluxei32_v": 17233, - "codegen.vluxei8_v": 38330, - "codegen.vluxseg2ei16_v": 24647, - "codegen.vluxseg2ei32_v": 15647, - "codegen.vluxseg2ei8_v": 29994, - "codegen.vluxseg3ei16_v": 16634, - 
"codegen.vluxseg3ei32_v": 10791, - "codegen.vluxseg3ei8_v": 21042, - "codegen.vluxseg4ei16_v": 18180, - "codegen.vluxseg4ei32_v": 11230, - "codegen.vluxseg4ei8_v": 24283, - "codegen.vluxseg5ei16_v": 9699, - "codegen.vluxseg5ei32_v": 5506, - "codegen.vluxseg5ei8_v": 14602, - "codegen.vluxseg6ei16_v": 10239, - "codegen.vluxseg6ei32_v": 5884, - "codegen.vluxseg6ei8_v": 16406, - "codegen.vluxseg7ei16_v": 10953, - "codegen.vluxseg7ei32_v": 6284, - "codegen.vluxseg7ei8_v": 18210, - "codegen.vluxseg8ei16_v": 11678, - "codegen.vluxseg8ei32_v": 6684, - "codegen.vluxseg8ei8_v": 20014, + "codegen.vlseg7e8_v": 4363, + "codegen.vlseg8e16_v": 2930, + "codegen.vlseg8e32_v": 1504, + "codegen.vlseg8e8_v": 4365, + "codegen.vlsseg2e16_v": 33260, + "codegen.vlsseg2e32_v": 19897, + "codegen.vlsseg2e8_v": 56876, + "codegen.vlsseg3e16_v": 23073, + "codegen.vlsseg3e32_v": 11971, + "codegen.vlsseg3e8_v": 43076, + "codegen.vlsseg4e16_v": 26562, + "codegen.vlsseg4e32_v": 12588, + "codegen.vlsseg4e8_v": 53340, + "codegen.vlsseg5e16_v": 14443, + "codegen.vlsseg5e32_v": 6441, + "codegen.vlsseg5e8_v": 35770, + "codegen.vlsseg6e16_v": 15813, + "codegen.vlsseg6e32_v": 7101, + "codegen.vlsseg6e8_v": 41165, + "codegen.vlsseg7e16_v": 17324, + "codegen.vlsseg7e32_v": 7761, + "codegen.vlsseg7e8_v": 46560, + "codegen.vlsseg8e16_v": 18919, + "codegen.vlsseg8e32_v": 8421, + "codegen.vlsseg8e8_v": 51955, + "codegen.vluxei16_v": 27373, + "codegen.vluxei32_v": 16937, + "codegen.vluxei8_v": 37917, + "codegen.vluxseg2ei16_v": 24428, + "codegen.vluxseg2ei32_v": 15505, + "codegen.vluxseg2ei8_v": 29869, + "codegen.vluxseg3ei16_v": 16575, + "codegen.vluxseg3ei32_v": 10757, + "codegen.vluxseg3ei8_v": 21016, + "codegen.vluxseg4ei16_v": 18111, + "codegen.vluxseg4ei32_v": 11200, + "codegen.vluxseg4ei8_v": 24271, + "codegen.vluxseg5ei16_v": 9690, + "codegen.vluxseg5ei32_v": 5490, + "codegen.vluxseg5ei8_v": 14631, + "codegen.vluxseg6ei16_v": 10242, + "codegen.vluxseg6ei32_v": 5877, + "codegen.vluxseg6ei8_v": 16435, + 
"codegen.vluxseg7ei16_v": 10967, + "codegen.vluxseg7ei32_v": 6277, + "codegen.vluxseg7ei8_v": 18239, + "codegen.vluxseg8ei16_v": 11692, + "codegen.vluxseg8ei32_v": 6677, + "codegen.vluxseg8ei8_v": 20043, "codegen.vmacc_vv": 29209, - "codegen.vmacc_vx": 89468, - "codegen.vmadc_vi": 33340, - "codegen.vmadc_vim": 35381, - "codegen.vmadc_vv": 11915, - "codegen.vmadc_vvm": 14564, - "codegen.vmadc_vx": 48068, - "codegen.vmadc_vxm": 49602, + "codegen.vmacc_vx": 89389, + "codegen.vmadc_vi": 33249, + "codegen.vmadc_vim": 35158, + "codegen.vmadc_vv": 11916, + "codegen.vmadc_vvm": 14566, + "codegen.vmadc_vx": 47863, + "codegen.vmadc_vxm": 49236, "codegen.vmadd_vv": 29209, - "codegen.vmadd_vx": 89468, - "codegen.vmand_mm": 11906, - "codegen.vmandn_mm": 11906, - "codegen.vmax_vv": 29206, - "codegen.vmax_vx": 88766, - "codegen.vmaxu_vv": 29206, - "codegen.vmaxu_vx": 88766, - "codegen.vmerge_vim": 36518, - "codegen.vmerge_vvm": 15134, - "codegen.vmerge_vxm": 50735, - "codegen.vmin_vv": 29206, - "codegen.vmin_vx": 88766, - "codegen.vminu_vv": 29206, - "codegen.vminu_vx": 88766, - "codegen.vmnand_mm": 11906, - "codegen.vmnor_mm": 11906, - "codegen.vmor_mm": 11906, - "codegen.vmorn_mm": 11906, - "codegen.vmsbc_vv": 11915, - "codegen.vmsbc_vvm": 14564, - "codegen.vmsbc_vx": 47756, - "codegen.vmsbc_vxm": 49602, - "codegen.vmsbf_m": 1589, - "codegen.vmseq_vi": 75561, - "codegen.vmseq_vv": 29216, - "codegen.vmseq_vx": 106022, - "codegen.vmsgt_vi": 75561, - "codegen.vmsgt_vv": 29216, - "codegen.vmsgt_vx": 106022, - "codegen.vmsgtu_vi": 75561, - "codegen.vmsgtu_vv": 29216, - "codegen.vmsgtu_vx": 106022, - "codegen.vmsif_m": 1589, - "codegen.vmsle_vi": 75561, - "codegen.vmsle_vv": 29216, - "codegen.vmsle_vx": 106022, - "codegen.vmsleu_vi": 75561, - "codegen.vmsleu_vv": 29216, - "codegen.vmsleu_vx": 106022, - "codegen.vmslt_vv": 29216, - "codegen.vmslt_vx": 106022, - "codegen.vmsltu_vv": 29216, - "codegen.vmsltu_vx": 106022, - "codegen.vmsne_vi": 75561, - "codegen.vmsne_vv": 29216, - 
"codegen.vmsne_vx": 106022, - "codegen.vmsof_m": 1589, + "codegen.vmadd_vx": 89389, + "codegen.vmand_mm": 11907, + "codegen.vmandn_mm": 11907, + "codegen.vmax_vv": 29207, + "codegen.vmax_vx": 88767, + "codegen.vmaxu_vv": 29207, + "codegen.vmaxu_vx": 88767, + "codegen.vmerge_vim": 36304, + "codegen.vmerge_vvm": 15126, + "codegen.vmerge_vxm": 50678, + "codegen.vmin_vv": 29207, + "codegen.vmin_vx": 88767, + "codegen.vminu_vv": 29207, + "codegen.vminu_vx": 88767, + "codegen.vmnand_mm": 11907, + "codegen.vmnor_mm": 11907, + "codegen.vmor_mm": 11907, + "codegen.vmorn_mm": 11907, + "codegen.vmsbc_vv": 11916, + "codegen.vmsbc_vvm": 14566, + "codegen.vmsbc_vx": 47474, + "codegen.vmsbc_vxm": 49236, + "codegen.vmsbf_m": 1583, + "codegen.vmseq_vi": 70004, + "codegen.vmseq_vv": 29212, + "codegen.vmseq_vx": 98083, + "codegen.vmsgt_vi": 70004, + "codegen.vmsgt_vv": 29212, + "codegen.vmsgt_vx": 98083, + "codegen.vmsgtu_vi": 70004, + "codegen.vmsgtu_vv": 29212, + "codegen.vmsgtu_vx": 98083, + "codegen.vmsif_m": 1583, + "codegen.vmsle_vi": 70004, + "codegen.vmsle_vv": 29212, + "codegen.vmsle_vx": 98083, + "codegen.vmsleu_vi": 70004, + "codegen.vmsleu_vv": 29212, + "codegen.vmsleu_vx": 98083, + "codegen.vmslt_vv": 29212, + "codegen.vmslt_vx": 98083, + "codegen.vmsltu_vv": 29212, + "codegen.vmsltu_vx": 98083, + "codegen.vmsne_vi": 70004, + "codegen.vmsne_vv": 29212, + "codegen.vmsne_vx": 98083, + "codegen.vmsof_m": 1583, "codegen.vmul_vv": 29207, - "codegen.vmul_vx": 117167, + "codegen.vmul_vx": 117138, "codegen.vmulh_vv": 29207, - "codegen.vmulh_vx": 117167, + "codegen.vmulh_vx": 117138, "codegen.vmulhsu_vv": 29207, - "codegen.vmulhsu_vx": 117167, + "codegen.vmulhsu_vx": 117138, "codegen.vmulhu_vv": 29207, - "codegen.vmulhu_vx": 117167, + "codegen.vmulhu_vx": 117138, "codegen.vmv1r_v": 1798, "codegen.vmv2r_v": 1786, - "codegen.vmv4r_v": 1791, - "codegen.vmv8r_v": 1833, - "codegen.vmv_s_x": 1447, - "codegen.vmv_v_i": 18569, - "codegen.vmv_v_v": 9582, - "codegen.vmv_v_x": 6932, - 
"codegen.vmv_x_s": 2649, - "codegen.vmxnor_mm": 11906, - "codegen.vmxor_mm": 11906, - "codegen.vnclip_wi": 168000, - "codegen.vnclip_wv": 74313, - "codegen.vnclip_wx": 236219, - "codegen.vnclipu_wi": 168000, - "codegen.vnclipu_wv": 74313, - "codegen.vnclipu_wx": 236219, + "codegen.vmv4r_v": 1792, + "codegen.vmv8r_v": 1834, + "codegen.vmv_s_x": 1446, + "codegen.vmv_v_i": 18573, + "codegen.vmv_v_v": 9593, + "codegen.vmv_v_x": 6885, + "codegen.vmv_x_s": 2721, + "codegen.vmxnor_mm": 11907, + "codegen.vmxor_mm": 11907, + "codegen.vnclip_wi": 165634, + "codegen.vnclip_wv": 74314, + "codegen.vnclip_wx": 232622, + "codegen.vnclipu_wi": 165634, + "codegen.vnclipu_wv": 74314, + "codegen.vnclipu_wx": 232622, "codegen.vnmsac_vv": 29209, - "codegen.vnmsac_vx": 89468, + "codegen.vnmsac_vx": 89389, "codegen.vnmsub_vv": 29209, - "codegen.vnmsub_vx": 89468, - "codegen.vnsra_wi": 41062, - "codegen.vnsra_wv": 17517, - "codegen.vnsra_wx": 57905, - "codegen.vnsrl_wi": 41062, - "codegen.vnsrl_wv": 17517, - "codegen.vnsrl_wx": 57905, - "codegen.vor_vi": 62874, - "codegen.vor_vv": 29206, - "codegen.vor_vx": 88766, - "codegen.vredand_vs": 29600, - "codegen.vredmax_vs": 29600, - "codegen.vredmaxu_vs": 29600, - "codegen.vredmin_vs": 29600, - "codegen.vredminu_vs": 29600, - "codegen.vredor_vs": 29600, - "codegen.vredsum_vs": 29600, - "codegen.vredxor_vs": 29600, - "codegen.vrem_vv": 33648, - "codegen.vrem_vx": 200480, - "codegen.vremu_vv": 33724, - "codegen.vremu_vx": 207983, - "codegen.vrgather_vi": 87834, - "codegen.vrgather_vv": 94965, - "codegen.vrgather_vx": 109453, - "codegen.vrgatherei16_vv": 68358, - "codegen.vrsub_vi": 62874, - "codegen.vrsub_vx": 88766, - "codegen.vs1r_v": 678, - "codegen.vs2r_v": 680, - "codegen.vs4r_v": 712, + "codegen.vnmsub_vx": 89389, + "codegen.vnsra_wi": 40554, + "codegen.vnsra_wv": 17487, + "codegen.vnsra_wx": 57159, + "codegen.vnsrl_wi": 40554, + "codegen.vnsrl_wv": 17487, + "codegen.vnsrl_wx": 57159, + "codegen.vor_vi": 62877, + "codegen.vor_vv": 29207, + 
"codegen.vor_vx": 88767, + "codegen.vredand_vs": 29601, + "codegen.vredmax_vs": 29601, + "codegen.vredmaxu_vs": 29601, + "codegen.vredmin_vs": 29601, + "codegen.vredminu_vs": 29601, + "codegen.vredor_vs": 29601, + "codegen.vredsum_vs": 29601, + "codegen.vredxor_vs": 29601, + "codegen.vrem_vv": 33501, + "codegen.vrem_vx": 200221, + "codegen.vremu_vv": 33544, + "codegen.vremu_vx": 207584, + "codegen.vrgather_vi": 73225, + "codegen.vrgather_vv": 31781, + "codegen.vrgather_vx": 100955, + "codegen.vrgatherei16_vv": 27635, + "codegen.vrsub_vi": 62877, + "codegen.vrsub_vx": 88767, + "codegen.vs1r_v": 686, + "codegen.vs2r_v": 688, + "codegen.vs4r_v": 720, "codegen.vs8r_v": 1011, - "codegen.vsadd_vi": 64452, - "codegen.vsadd_vv": 30698, - "codegen.vsadd_vx": 90382, - "codegen.vsaddu_vi": 64452, - "codegen.vsaddu_vv": 30698, - "codegen.vsaddu_vx": 90382, - "codegen.vsbc_vvm": 14554, - "codegen.vsbc_vxm": 65358, + "codegen.vsadd_vi": 64453, + "codegen.vsadd_vv": 30699, + "codegen.vsadd_vx": 90384, + "codegen.vsaddu_vi": 64453, + "codegen.vsaddu_vv": 30699, + "codegen.vsaddu_vx": 90384, + "codegen.vsbc_vvm": 14555, + "codegen.vsbc_vxm": 65359, "codegen.vse16_v": 6270, "codegen.vse32_v": 5019, "codegen.vse8_v": 7504, "codegen.vsetivli": 6005, "codegen.vsetvl": 497, "codegen.vsetvli": 18626, - "codegen.vsext_vf2": 20769, - "codegen.vsext_vf4": 3582, - "codegen.vslide1down_vx": 859706, - "codegen.vslide1up_vx": 853994, - "codegen.vslidedown_vi": 602744, - "codegen.vslidedown_vx": 855482, - "codegen.vslideup_vi": 599196, - "codegen.vslideup_vx": 853162, - "codegen.vsll_vi": 70136, - "codegen.vsll_vv": 30059, - "codegen.vsll_vx": 98450, + "codegen.vsext_vf2": 11223, + "codegen.vsext_vf4": 3241, + "codegen.vslide1down_vx": 137587, + "codegen.vslide1up_vx": 139331, + "codegen.vslidedown_vi": 96987, + "codegen.vslidedown_vx": 118424, + "codegen.vslideup_vi": 96319, + "codegen.vslideup_vx": 111346, + "codegen.vsll_vi": 69759, + "codegen.vsll_vv": 30002, + "codegen.vsll_vx": 98144, 
"codegen.vsm_v": 734, "codegen.vsmul_vv": 124295, - "codegen.vsmul_vx": 360247, - "codegen.vsoxei16_v": 27633, - "codegen.vsoxei32_v": 16766, - "codegen.vsoxei8_v": 38831, - "codegen.vsoxseg2ei16_v": 25133, - "codegen.vsoxseg2ei32_v": 15152, - "codegen.vsoxseg2ei8_v": 30432, - "codegen.vsoxseg3ei16_v": 16722, - "codegen.vsoxseg3ei32_v": 10925, - "codegen.vsoxseg3ei8_v": 22055, - "codegen.vsoxseg4ei16_v": 18694, - "codegen.vsoxseg4ei32_v": 11678, - "codegen.vsoxseg4ei8_v": 26019, - "codegen.vsoxseg5ei16_v": 10787, - "codegen.vsoxseg5ei32_v": 6322, - "codegen.vsoxseg5ei8_v": 16810, - "codegen.vsoxseg6ei16_v": 11630, - "codegen.vsoxseg6ei32_v": 6722, - "codegen.vsoxseg6ei8_v": 18614, - "codegen.vsoxseg7ei16_v": 12473, - "codegen.vsoxseg7ei32_v": 7122, - "codegen.vsoxseg7ei8_v": 20418, - "codegen.vsoxseg8ei16_v": 13316, - "codegen.vsoxseg8ei32_v": 7522, - "codegen.vsoxseg8ei8_v": 22222, - "codegen.vsra_vi": 70136, - "codegen.vsra_vv": 30059, - "codegen.vsra_vx": 98450, - "codegen.vsrl_vi": 70136, - "codegen.vsrl_vv": 30059, - "codegen.vsrl_vx": 98450, + "codegen.vsmul_vx": 359954, + "codegen.vsoxei16_v": 27188, + "codegen.vsoxei32_v": 16473, + "codegen.vsoxei8_v": 38355, + "codegen.vsoxseg2ei16_v": 24811, + "codegen.vsoxseg2ei32_v": 14984, + "codegen.vsoxseg2ei8_v": 30206, + "codegen.vsoxseg3ei16_v": 16610, + "codegen.vsoxseg3ei32_v": 10816, + "codegen.vsoxseg3ei8_v": 21966, + "codegen.vsoxseg4ei16_v": 18582, + "codegen.vsoxseg4ei32_v": 11552, + "codegen.vsoxseg4ei8_v": 25930, + "codegen.vsoxseg5ei16_v": 10707, + "codegen.vsoxseg5ei32_v": 6288, + "codegen.vsoxseg5ei8_v": 16788, + "codegen.vsoxseg6ei16_v": 11550, + "codegen.vsoxseg6ei32_v": 6688, + "codegen.vsoxseg6ei8_v": 18592, + "codegen.vsoxseg7ei16_v": 12393, + "codegen.vsoxseg7ei32_v": 7088, + "codegen.vsoxseg7ei8_v": 20396, + "codegen.vsoxseg8ei16_v": 13236, + "codegen.vsoxseg8ei32_v": 7488, + "codegen.vsoxseg8ei8_v": 22200, + "codegen.vsra_vi": 69759, + "codegen.vsra_vv": 30002, + "codegen.vsra_vx": 98144, + 
"codegen.vsrl_vi": 69759, + "codegen.vsrl_vv": 30002, + "codegen.vsrl_vx": 98144, "codegen.vsse16_v": 51557, "codegen.vsse32_v": 37244, - "codegen.vsse8_v": 77683, + "codegen.vsse8_v": 77684, "codegen.vsseg2e16_v": 5059, "codegen.vsseg2e32_v": 3818, "codegen.vsseg2e8_v": 6247, "codegen.vsseg3e16_v": 3835, "codegen.vsseg3e32_v": 2583, "codegen.vsseg3e8_v": 5039, - "codegen.vsseg4e16_v": 3838, - "codegen.vsseg4e32_v": 2586, - "codegen.vsseg4e8_v": 5042, + "codegen.vsseg4e16_v": 3839, + "codegen.vsseg4e32_v": 2587, + "codegen.vsseg4e8_v": 5043, "codegen.vsseg5e16_v": 2556, - "codegen.vsseg5e32_v": 1382, + "codegen.vsseg5e32_v": 1398, "codegen.vsseg5e8_v": 3814, "codegen.vsseg6e16_v": 2559, - "codegen.vsseg6e32_v": 1430, - "codegen.vsseg6e8_v": 3816, + "codegen.vsseg6e32_v": 1446, + "codegen.vsseg6e8_v": 3817, "codegen.vsseg7e16_v": 2562, - "codegen.vsseg7e32_v": 1478, - "codegen.vsseg7e8_v": 3821, - "codegen.vsseg8e16_v": 2575, - "codegen.vsseg8e32_v": 1531, - "codegen.vsseg8e8_v": 3828, - "codegen.vssra_vi": 282211, - "codegen.vssra_vv": 121315, - "codegen.vssra_vx": 590997, - "codegen.vssrl_vi": 282211, - "codegen.vssrl_vv": 121315, - "codegen.vssrl_vx": 590997, - "codegen.vssseg2e16_v": 42526, + "codegen.vsseg7e32_v": 1494, + "codegen.vsseg7e8_v": 3822, + "codegen.vsseg8e16_v": 2604, + "codegen.vsseg8e32_v": 1542, + "codegen.vsseg8e8_v": 3827, + "codegen.vssra_vi": 280647, + "codegen.vssra_vv": 121087, + "codegen.vssra_vx": 588914, + "codegen.vssrl_vi": 280647, + "codegen.vssrl_vv": 121087, + "codegen.vssrl_vx": 588914, + "codegen.vssseg2e16_v": 42589, "codegen.vssseg2e32_v": 28551, - "codegen.vssseg2e8_v": 64395, + "codegen.vssseg2e8_v": 64477, "codegen.vssseg3e16_v": 30791, "codegen.vssseg3e32_v": 18119, "codegen.vssseg3e8_v": 49832, - "codegen.vssseg4e16_v": 33440, + "codegen.vssseg4e16_v": 33459, "codegen.vssseg4e32_v": 18899, - "codegen.vssseg4e8_v": 57551, + "codegen.vssseg4e8_v": 57710, "codegen.vssseg5e16_v": 21679, "codegen.vssseg5e32_v": 10505, 
"codegen.vssseg5e8_v": 44457, @@ -364,85 +364,85 @@ "codegen.vssseg8e16_v": 25555, "codegen.vssseg8e32_v": 12089, "codegen.vssseg8e8_v": 57405, - "codegen.vssub_vv": 29206, - "codegen.vssub_vx": 130493, - "codegen.vssubu_vv": 29206, - "codegen.vssubu_vx": 130493, - "codegen.vsub_vv": 29206, - "codegen.vsub_vx": 130493, - "codegen.vsuxei16_v": 27633, - "codegen.vsuxei32_v": 16766, - "codegen.vsuxei8_v": 38831, - "codegen.vsuxseg2ei16_v": 25133, - "codegen.vsuxseg2ei32_v": 15152, - "codegen.vsuxseg2ei8_v": 30432, - "codegen.vsuxseg3ei16_v": 16722, - "codegen.vsuxseg3ei32_v": 10925, - "codegen.vsuxseg3ei8_v": 22055, - "codegen.vsuxseg4ei16_v": 18694, - "codegen.vsuxseg4ei32_v": 11678, - "codegen.vsuxseg4ei8_v": 26019, - "codegen.vsuxseg5ei16_v": 10787, - "codegen.vsuxseg5ei32_v": 6322, - "codegen.vsuxseg5ei8_v": 16810, - "codegen.vsuxseg6ei16_v": 11630, - "codegen.vsuxseg6ei32_v": 6722, - "codegen.vsuxseg6ei8_v": 18614, - "codegen.vsuxseg7ei16_v": 12473, - "codegen.vsuxseg7ei32_v": 7122, - "codegen.vsuxseg7ei8_v": 20418, - "codegen.vsuxseg8ei16_v": 13316, - "codegen.vsuxseg8ei32_v": 7522, - "codegen.vsuxseg8ei8_v": 22222, - "codegen.vwadd_vv": 17507, - "codegen.vwadd_vx": 53483, + "codegen.vssub_vv": 29207, + "codegen.vssub_vx": 130478, + "codegen.vssubu_vv": 29207, + "codegen.vssubu_vx": 130478, + "codegen.vsub_vv": 29207, + "codegen.vsub_vx": 130478, + "codegen.vsuxei16_v": 27188, + "codegen.vsuxei32_v": 16473, + "codegen.vsuxei8_v": 38355, + "codegen.vsuxseg2ei16_v": 24811, + "codegen.vsuxseg2ei32_v": 14984, + "codegen.vsuxseg2ei8_v": 30206, + "codegen.vsuxseg3ei16_v": 16610, + "codegen.vsuxseg3ei32_v": 10816, + "codegen.vsuxseg3ei8_v": 21966, + "codegen.vsuxseg4ei16_v": 18582, + "codegen.vsuxseg4ei32_v": 11552, + "codegen.vsuxseg4ei8_v": 25930, + "codegen.vsuxseg5ei16_v": 10707, + "codegen.vsuxseg5ei32_v": 6288, + "codegen.vsuxseg5ei8_v": 16788, + "codegen.vsuxseg6ei16_v": 11550, + "codegen.vsuxseg6ei32_v": 6688, + "codegen.vsuxseg6ei8_v": 18592, + 
"codegen.vsuxseg7ei16_v": 12393, + "codegen.vsuxseg7ei32_v": 7088, + "codegen.vsuxseg7ei8_v": 20396, + "codegen.vsuxseg8ei16_v": 13236, + "codegen.vsuxseg8ei32_v": 7488, + "codegen.vsuxseg8ei8_v": 22200, + "codegen.vwadd_vv": 17508, + "codegen.vwadd_vx": 53494, "codegen.vwadd_wv": 17438, - "codegen.vwadd_wx": 54635, - "codegen.vwaddu_vv": 17507, - "codegen.vwaddu_vx": 53483, + "codegen.vwadd_wx": 54403, + "codegen.vwaddu_vv": 17508, + "codegen.vwaddu_vx": 53494, "codegen.vwaddu_wv": 17438, - "codegen.vwaddu_wx": 54635, + "codegen.vwaddu_wx": 54403, "codegen.vwmacc_vv": 17518, - "codegen.vwmacc_vx": 57966, + "codegen.vwmacc_vx": 57596, "codegen.vwmaccsu_vv": 17518, - "codegen.vwmaccsu_vx": 57966, + "codegen.vwmaccsu_vx": 57596, "codegen.vwmaccu_vv": 17518, - "codegen.vwmaccu_vx": 57966, - "codegen.vwmaccus_vx": 57966, + "codegen.vwmaccu_vx": 57596, + "codegen.vwmaccus_vx": 57596, "codegen.vwmul_vv": 17508, - "codegen.vwmul_vx": 70305, + "codegen.vwmul_vx": 70303, "codegen.vwmulsu_vv": 17508, - "codegen.vwmulsu_vx": 70305, + "codegen.vwmulsu_vx": 70303, "codegen.vwmulu_vv": 17508, - "codegen.vwmulu_vx": 70305, + "codegen.vwmulu_vx": 70303, "codegen.vwredsum_vs": 17578, "codegen.vwredsumu_vs": 17578, - "codegen.vwsub_vv": 17507, - "codegen.vwsub_vx": 53483, + "codegen.vwsub_vv": 17508, + "codegen.vwsub_vx": 53494, "codegen.vwsub_wv": 17438, - "codegen.vwsub_wx": 54635, - "codegen.vwsubu_vv": 17507, - "codegen.vwsubu_vx": 53483, + "codegen.vwsub_wx": 54403, + "codegen.vwsubu_vv": 17508, + "codegen.vwsubu_vx": 53494, "codegen.vwsubu_wv": 17438, - "codegen.vwsubu_wx": 54635, - "codegen.vxor_vi": 62874, - "codegen.vxor_vv": 29206, - "codegen.vxor_vx": 88766, - "codegen.vzext_vf2": 20769, - "codegen.vzext_vf4": 3582, - "intrinsic.conv2d_less_m2": 2497, - "mlir.hello": 139, - "mlir.rvv_vp_intrinsic_add": 454, - "mlir.rvv_vp_intrinsic_add_scalable": 634, - "mlir.stripmining": 26776, - "rvv_bench.ascii_to_utf16": 702047, - "rvv_bench.ascii_to_utf32": 232145, - 
"rvv_bench.byteswap": 493024, + "codegen.vwsubu_wx": 54403, + "codegen.vxor_vi": 62877, + "codegen.vxor_vv": 29207, + "codegen.vxor_vx": 88767, + "codegen.vzext_vf2": 11223, + "codegen.vzext_vf4": 3241, + "intrinsic.conv2d_less_m2": 2498, + "mlir.hello": 130, + "mlir.rvv_vp_intrinsic_add": 442, + "mlir.rvv_vp_intrinsic_add_scalable": 640, + "mlir.stripmining": 27798, + "rvv_bench.ascii_to_utf16": 676092, + "rvv_bench.ascii_to_utf32": 225585, + "rvv_bench.byteswap": 408359, "rvv_bench.chacha20": 39957, - "rvv_bench.memcpy": 677749, - "rvv_bench.memset": 291429, - "rvv_bench.mergelines": 630910, + "rvv_bench.memcpy": 675881, + "rvv_bench.memset": 291438, + "rvv_bench.mergelines": 575240, "rvv_bench.poly1305": 39957, - "rvv_bench.strlen": 230896, - "rvv_bench.utf8_count": 2344343 + "rvv_bench.strlen": 231237, + "rvv_bench.utf8_count": 2350031 } \ No newline at end of file