diff --git a/omreaderlib/src/T1OMReaderAPI.scala b/omreaderlib/src/T1OMReaderAPI.scala
index 754d07b75..22005a8e4 100644
--- a/omreaderlib/src/T1OMReaderAPI.scala
+++ b/omreaderlib/src/T1OMReaderAPI.scala
@@ -54,7 +54,7 @@ object Path {
   implicit val rw:        ReadWriter[Instruction] = macroRW
   def parse(str: String): Path                    =
     str match {
-      case s"OMInstanceTarget:~${top}|${hier}>${local}" =>
+      case s"OMReferenceTarget:~${top}|${hier}>${local}" =>
         Path(
           top,
           hier
@@ -65,7 +65,7 @@ object Path {
             }),
           Some(local)
         )
-      case s"OMInstanceTarget:~${top}|${hier}"          =>
+      case s"OMInstanceTarget:~${top}|${hier}"           =>
         Path(
           top,
           hier
diff --git a/omreaderlib/src/t1/T1.scala b/omreaderlib/src/t1/T1.scala
index 5e040886e..dc304d860 100644
--- a/omreaderlib/src/t1/T1.scala
+++ b/omreaderlib/src/t1/T1.scala
@@ -18,6 +18,20 @@ class T1(val mlirbc: Array[Byte]) extends T1OMReaderAPI {
   def instructions: Seq[Instruction]                  = t1("decoder").obj("instructions").list.elements().map(_.obj).map(getInstruction)
   def sram:         Seq[SRAM]                         =
     t1("lanes").list.elements().map(_.obj("vrf").obj).flatMap(getSRAM)
-  def retime:       Seq[Retime]                       =
+
+  /** Retime entries of the permutation unit: `compress`, plus `reduceUnit.floatAdder` when that field exists. */
+  def permutation: Seq[Retime] = {
+    val permutationObj = t1("permutation")
+    val reduce         = permutationObj.obj("reduceUnit").obj
+    val compress       = permutationObj.obj("compress").obj
+    // TODO: need fieldOpt(name: String)
+    val optionalAdder  = Option.when(reduce.fieldNames().contains("floatAdder"))(reduce("floatAdder").obj)
+
+    (compress +: optionalAdder.toSeq).flatMap(getRetime)
+  }
+
+  def vfus: Seq[Retime] =
     t1("lanes").list.elements().map(_.obj("vfus")).flatMap(_.list.elements().map(_.obj)).flatMap(getRetime)
+  /** Every retime entry: the VFU ones followed by the permutation-unit ones, duplicates removed. */
+  def retime: Seq[Retime] = (vfus ++ permutation).distinct
 }
diff --git a/omreaderlib/src/t1rocketv/T1RocketTile.scala b/omreaderlib/src/t1rocketv/T1RocketTile.scala
index 1224726e4..0c33bd3f6 100644
--- a/omreaderlib/src/t1rocketv/T1RocketTile.scala
+++ b/omreaderlib/src/t1rocketv/T1RocketTile.scala
@@ -20,8 +20,21 @@ class T1RocketTile(val mlirbc: Array[Byte]) extends T1OMReaderAPI {
     t1("lanes").list.elements().map(_.obj("vrf").obj).flatMap(getSRAM)
   def cache:        Seq[SRAM]                         =
     Seq(tile("frontend").obj("icache").obj, tile("hellaCache").obj).flatMap(getSRAM)
-  def vfu:          Seq[Retime]                       =
+
+  /** Retime entries of the permutation unit: `compress`, plus `reduceUnit.floatAdder` when that field exists. */
+  def permutation: Seq[Retime] = {
+    val permutationObj = t1("permutation")
+    val reduce         = permutationObj.obj("reduceUnit").obj
+    val compress       = permutationObj.obj("compress").obj
+    // TODO: need fieldOpt(name: String)
+    val optionalAdder  = Option.when(reduce.fieldNames().contains("floatAdder"))(reduce("floatAdder").obj)
+
+    (compress +: optionalAdder.toSeq).flatMap(getRetime)
+  }
+
+  def vfus: Seq[Retime] =
     t1("lanes").list.elements().map(_.obj("vfus")).flatMap(_.list.elements().map(_.obj)).flatMap(getRetime)
-  def retime = vfu
+  /** Every retime entry: the VFU ones followed by the permutation-unit ones, duplicates removed. */
+  def retime: Seq[Retime] = (vfus ++ permutation).distinct
   def sram   = vrf ++ cache
 }
diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala
index 59b317b5b..d59a97d75 100644
--- a/t1/src/Bundles.scala
+++ b/t1/src/Bundles.scala
@@ -138,6 +138,8 @@ class LaneRequest(param: LaneParameter) extends Bundle {
   /** data of rs1 */
   val readFromScalar: UInt = UInt(param.datapathWidth.W)
 
+  val csrInterface: CSRInterface = new CSRInterface(param.vlMaxBits)
+
   // vmacc 的vd需要跨lane读 TODO: move to [[V]]
   def ma: Bool =
     decodeResult(Decoder.multiplier) && decodeResult(Decoder.uop)(1, 0).xorR && !decodeResult(Decoder.vwmacc)
@@ -220,10 +222,6 @@ class InstructionControlRecord(param: LaneParameter) extends Bundle {
   /** Store request from [[T1]]. */
   val laneRequest: LaneRequest = new LaneRequest(param)
 
-  /** csr follows the instruction. TODO: move to [[laneRequest]]
-    */
-  val csr: CSRInterface = new CSRInterface(param.vlMaxBits)
-
   /** which group is the last group for instruction. */
   val lastGroupForInstruction: UInt = UInt(param.groupNumberBits.W)
 
@@ -362,9 +360,9 @@ class LaneResponseFeedback(param: LaneParameter) extends Bundle {
   val complete: Bool = Bool()
 }
 
-class V0Update(param: LaneParameter) extends Bundle {
-  val data:   UInt = UInt(param.datapathWidth.W)
-  val offset: UInt = UInt(param.vrfOffsetBits.W)
+class V0Update(datapathWidth: Int, vrfOffsetBits: Int) extends Bundle {
+  val data:   UInt = UInt(datapathWidth.W)
+  val offset: UInt = UInt(vrfOffsetBits.W)
   // mask/ld类型的有可能不会写完整的32bit
   val mask:   UInt = UInt(4.W)
 }
@@ -794,6 +792,5 @@ class MaskUnitReadVs1(parameter: T1Parameter) extends Bundle {
 }
 
 class LaneTokenBundle extends Bundle {
-  val maskResponseRelease: Bool = Output(Bool())
-  val maskRequestRelease:  Bool = Input(Bool())
+  val maskRequestRelease: Bool = Input(Bool())
 }
diff --git a/t1/src/FloatModule.scala b/t1/src/FloatModule.scala
index 8862937c2..cff306c9c 100644
--- a/t1/src/FloatModule.scala
+++ b/t1/src/FloatModule.scala
@@ -17,7 +17,7 @@ object FloatAdderParameter {
   implicit def rwP = upickle.default.macroRW[FloatAdderParameter]
 }
 
-case class FloatAdderParameter(expWidth: Int, sigWidth: Int) extends SerializableModuleParameter
+case class FloatAdderParameter(expWidth: Int, sigWidth: Int, latency: Int) extends SerializableModuleParameter
 
 class FloatAdderInterface(val parameter: FloatAdderParameter) extends Bundle {
   val expWidth = parameter.expWidth
@@ -61,8 +61,11 @@ class FloatAdder(val parameter: FloatAdderParameter)
   addRecFN.io.roundingMode   := io.roundingMode
   addRecFN.io.detectTininess := false.B
 
-  io.out            := fNFromRecFN(8, 24, addRecFN.io.out)
-  io.exceptionFlags := addRecFN.io.exceptionFlags
+  // Decode with the configured format instead of a hard-coded (8, 24) layout, and pass the result and
+  // its exception flags through Pipes of the same `latency` so both are delayed equally.
+  io.out            :=
+    Pipe(true.B, fNFromRecFN(parameter.expWidth, parameter.sigWidth, addRecFN.io.out), parameter.latency).bits
+  io.exceptionFlags := Pipe(true.B, addRecFN.io.exceptionFlags, parameter.latency).bits
 }
 
 object FloatCompareParameter {
diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala
index ec52d2bb7..1c8893634 100644
--- a/t1/src/Lane.scala
+++ b/t1/src/Lane.scala
@@ -98,6 +98,7 @@ case class LaneParameter(
   crossLaneVRFWriteEscapeQueueSize: Int,
   fpuEnable:                        Boolean,
   portFactor:                       Int,
+  maskRequestLatency:               Int,
   vrfRamType:                       RamType,
   decoderParam:                     DecoderParam,
   vfuInstantiateParameter:          VFUInstantiateParameter)
@@ -233,11 +234,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
   @public
   val laneRequest: DecoupledIO[LaneRequest] = IO(Flipped(Decoupled(new LaneRequest(parameter))))
 
-  /** CSR Interface. TODO: merge to [[laneRequest]]
-    */
-  @public
-  val csrInterface: CSRInterface = IO(Input(new CSRInterface(parameter.vlMaxBits)))
-
   @public
   val maskUnitRequest: ValidIO[MaskUnitExeReq] = IO(Valid(new MaskUnitExeReq(parameter)))
 
@@ -280,7 +276,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
 
   /** V0 update in the lane should also update [[T1.v0]] */
   @public
-  val v0Update: ValidIO[V0Update] = IO(Valid(new V0Update(parameter)))
+  val v0Update: ValidIO[V0Update] = IO(Valid(new V0Update(parameter.datapathWidth, parameter.vrfOffsetBits)))
 
   /** input of mask data */
   @public
@@ -319,27 +315,14 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
   @public
   val laneProbe = IO(Output(Probe(new LaneProbe(parameter), layers.Verification)))
 
-  @public
-  val vrfAllocateIssue: Bool = IO(Output(Bool()))
-
   // TODO: remove
   dontTouch(writeBusPort)
+  val csrInterface: CSRInterface = laneRequest.bits.csrInterface
 
   /** VRF instantces. */
   val vrf: Instance[VRF] = Instantiate(new VRF(parameter.vrfParam))
   omInstance.vrfIn := Property(vrf.om.asAnyClassType)
 
-  /** TODO: review later
-    */
-  val maskGroupedOrR: UInt = VecInit(
-    maskInput.asBools
-      .grouped(parameter.dataPathByteWidth)
-      .toSeq
-      .map(
-        VecInit(_).asUInt.orR
-      )
-  ).asUInt
-
   val fullMask: UInt = (-1.S(parameter.datapathWidth.W)).asUInt
 
   /** the slot is occupied by instruction */
@@ -353,9 +336,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
   val maskIndexVec: Vec[UInt] =
     RegInit(VecInit(Seq.fill(parameter.chainingSize)(0.U(log2Ceil(parameter.maskGroupWidth).W))))
 
-  /** the find first one index register in this lane. */
-  val ffoIndexReg: UInt = RegInit(0.U(log2Ceil(parameter.vLen / 8).W))
-
   /** result of reduce instruction. */
   val reduceResult: UInt = RegInit(0.U(parameter.datapathWidth.W))
 
@@ -363,7 +343,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
     */
   val vrfWriteArbiter: Vec[DecoupledIO[VRFWriteRequest]] = Wire(
     Vec(
-      parameter.chainingSize + 2,
+      parameter.chainingSize + 1,
       Decoupled(
         new VRFWriteRequest(
           parameter.vrfParam.regNumBits,
@@ -375,31 +355,76 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
     )
   )
 
-  val lsuWriteQueue: QueueIO[VRFWriteRequest] = Queue.io(vrfWriteType, 1, flow = true)
-  // connect lsuWriteQueue.enq
-  lsuWriteQueue.enq.valid := vrfWriteChannel.valid && !writeFromMask
-  lsuWriteQueue.enq.bits  := vrfWriteChannel.bits
-  vrfWriteChannel.ready   := writeFromMask || lsuWriteQueue.enq.ready
+  vrfWriteArbiter(parameter.chainingSize).valid := vrfWriteChannel.valid
+  vrfWriteArbiter(parameter.chainingSize).bits  := vrfWriteChannel.bits
+  vrfWriteChannel.ready                         := vrfWriteArbiter(parameter.chainingSize).ready
 
-  val maskWriteQueue: QueueIO[VRFWriteRequest] = Queue.io(vrfWriteType, parameter.maskUnitVefWriteQueueSize)
-  // connect maskWriteQueue.enq
-  maskWriteQueue.enq.valid := vrfWriteChannel.valid && writeFromMask
-  maskWriteQueue.enq.bits  := vrfWriteChannel.bits
+  val allVrfWriteAfterCheck:  Seq[VRFWriteRequest] = Seq.tabulate(parameter.chainingSize + 3) { i =>
+    RegInit(0.U.asTypeOf(vrfWriteArbiter.head.bits))
+  }
+  val afterCheckValid:        Seq[Bool]            = Seq.tabulate(parameter.chainingSize + 3) { _ => RegInit(false.B) }
+  val afterCheckDequeueReady: Vec[Bool]            = Wire(Vec(parameter.chainingSize + 3, Bool()))
+  val afterCheckDequeueFire:  Seq[Bool]            = afterCheckValid.zip(afterCheckDequeueReady).map { case (v, r) => v && r }
 
-  vrfWriteArbiter(parameter.chainingSize).valid := lsuWriteQueue.deq.valid
-  vrfWriteArbiter(parameter.chainingSize).bits  := lsuWriteQueue.deq.bits
-  lsuWriteQueue.deq.ready                       := vrfWriteArbiter(parameter.chainingSize).ready
+  // Per-entry bookkeeping for the lane's mask request/response tracking. todo: mv to bundle.scala
+  class MaskControl(parameter: LaneParameter) extends Bundle {
+    val index:         UInt = UInt(parameter.instructionIndexBits.W) // instruction index owning this entry
+    val sew:           UInt = UInt(2.W)                              // vSew captured when the entry is allocated
+    val maskData:      UInt = UInt(parameter.datapathWidth.W)        // maskInput latched when the response arrives
+    val group:         UInt = UInt(parameter.maskGroupSizeBits.W)    // incremented each time a request is issued
+    val dataValid:     Bool = Bool()                                 // maskData holds a response not yet consumed
+    val waiteResponse: Bool = Bool()                                 // request issued, response still pending
+    val controlValid:  Bool = Bool()                                 // entry allocated and not yet released
+  }
 
-  vrfWriteArbiter(parameter.chainingSize + 1).valid := maskWriteQueue.deq.valid
-  vrfWriteArbiter(parameter.chainingSize + 1).bits  := maskWriteQueue.deq.bits
-  maskWriteQueue.deq.ready                          := vrfWriteArbiter(parameter.chainingSize + 1).ready
+  val maskControlRelease: Vec[ValidIO[UInt]] =
+    Wire(Vec(parameter.chainingSize, Valid(UInt(parameter.instructionIndexBits.W))))
+
+  val maskControlEnq:       UInt             = Wire(UInt(parameter.chainingSize.W))
+  val maskControlDataDeq:   UInt             = Wire(UInt(parameter.chainingSize.W))
+  val maskControlReq:       Vec[Bool]        = Wire(Vec(parameter.chainingSize, Bool()))
+  val maskControlReqSelect: UInt             = ffo(maskControlReq.asUInt)
+  // mask request & response handle
+  val maskControlVec:       Seq[MaskControl] = Seq.tabulate(parameter.chainingSize) { index =>
+    val state = RegInit(0.U.asTypeOf(new MaskControl(parameter)))
+    val releaseHit: Bool = maskControlRelease.map(r => r.valid && (r.bits === state.index)).reduce(_ || _)
+    val responseFire =
+      Pipe(maskControlReqSelect(index), 0.U.asTypeOf(new EmptyBundle), parameter.maskRequestLatency).valid
+
+    when(maskControlEnq(index)) {
+      state              := 0.U.asTypeOf(state)
+      state.index        := laneRequest.bits.instructionIndex
+      state.sew          := laneRequest.bits.csrInterface.vSew
+      state.controlValid := true.B
+    }
 
-  val allVrfWriteAfterCheck:  Seq[VRFWriteRequest] = Seq.tabulate(parameter.chainingSize + 4) { i =>
-    RegInit(0.U.asTypeOf(vrfWriteArbiter.head.bits))
+    when(state.controlValid) {
+      when(releaseHit) {
+        state.controlValid := false.B
+      }
+    }
+
+    maskControlReq(index) := state.controlValid && !state.dataValid && !state.waiteResponse
+    when(maskControlReqSelect(index)) {
+      state.waiteResponse := true.B
+      state.group         := state.group + 1.U
+    }
+
+    when(responseFire) {
+      state.dataValid     := true.B
+      state.waiteResponse := false.B
+      state.maskData      := maskInput
+    }
+
+    when(maskControlDataDeq(index)) {
+      state.dataValid := false.B
+    }
+
+    state
   }
-  val afterCheckValid:        Seq[Bool]            = Seq.tabulate(parameter.chainingSize + 4) { _ => RegInit(false.B) }
-  val afterCheckDequeueReady: Vec[Bool]            = Wire(Vec(parameter.chainingSize + 4, Bool()))
-  val afterCheckDequeueFire:  Seq[Bool]            = afterCheckValid.zip(afterCheckDequeueReady).map { case (v, r) => v && r }
+  val maskControlFree:      Seq[Bool]        = maskControlVec.map(s => !s.controlValid && !s.waiteResponse)
+  val freeSelect:           UInt             = ffo(VecInit(maskControlFree).asUInt)
+  maskControlEnq := maskAnd(laneRequest.fire && laneRequest.bits.mask, freeSelect)
 
   /** for each slot, assert when it is asking [[T1]] to change mask */
   val slotMaskRequestVec: Vec[ValidIO[UInt]] = Wire(
@@ -410,7 +435,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
   )
 
   /** which slot wins the arbitration for requesting mask. */
-  val maskRequestFireOH: UInt = Wire(UInt(parameter.chainingSize.W))
+  val maskRequestFireOH: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool()))
+  val maskDataVec:       Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.maskGroupWidth.W)))
 
   /** FSM control for each slot. if index == 0,
     *   - slot can support write v0 in mask type, see [[Decoder.maskDestination]] [[Decoder.maskSource]]
@@ -515,8 +541,10 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
 
   // Overflow occurs
   val vxsatEnq: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt((2 * parameter.chainingSize).W)))
+
+  val instructionFinishInSlot: UInt = Wire(UInt((2 * parameter.chainingSize).W))
   // vxsatEnq and instructionFinished cannot happen at the same time
-  vxsatResult := (vxsatEnq.reduce(_ | _) | vxsatResult) & (~instructionFinished).asUInt
+  vxsatResult := (vxsatEnq.reduce(_ | _) | vxsatResult) & (~instructionFinishInSlot).asUInt
 
   /** assert when a instruction will not use mask unit */
   val instructionUnrelatedMaskUnitVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.chainingSize.W)))
@@ -555,7 +583,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
       record.laneRequest.decodeResult(Decoder.maskLogic)
 
     /** onehot value of SEW. */
-    val vSew1H: UInt = UIntToOH(record.csr.vSew)(2, 0)
+    val vSew1H: UInt = UIntToOH(record.laneRequest.csrInterface.vSew)(2, 0)
 
     /** if asserted, the element won't be executed. adc: vm = 0; madc: vm = 0 -> s0 + s1 + c, vm = 1 -> s0 + s1
       */
@@ -598,7 +626,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
       maskUnitRequest <> mask.maskReq
       maskRequestToLSU <> mask.maskRequestToLSU
       tokenIO <> mask.tokenIO
-      tokenIO.maskResponseRelease := maskWriteQueue.deq.fire
       mask.dequeue
     }.getOrElse(stage3EnqWire)
     stage3.enqueue <> stage3EnqSelect
@@ -611,7 +638,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
     laneState.lastGroupForInstruction  := record.lastGroupForInstruction
     laneState.isLastLaneForInstruction := record.isLastLaneForInstruction
     laneState.instructionFinished      := record.instructionFinished
-    laneState.csr                      := record.csr
+    laneState.csr                      := record.laneRequest.csrInterface
     laneState.maskType                 := record.laneRequest.mask
     laneState.maskNotMaskedElement     := !record.laneRequest.mask ||
       record.laneRequest.decodeResult(Decoder.maskSource) ||
@@ -634,13 +661,16 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
       laneState.elements.get(k).foreach(stateData => d := stateData)
     }
 
+    maskControlRelease(index).valid := false.B
+    maskControlRelease(index).bits  := record.laneRequest.instructionIndex
     // update lane state
     when(stage0.enqueue.fire) {
       maskGroupCountVec(index) := stage0.updateLaneState.maskGroupCount
       // todo: handle all elements in first group are masked
       maskIndexVec(index)      := stage0.updateLaneState.maskIndex
       when(stage0.updateLaneState.outOfExecutionRange) {
-        slotOccupied(index) := false.B
+        slotOccupied(index)             := false.B
+        maskControlRelease(index).valid := true.B
       }
     }
 
@@ -655,7 +685,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
     val maskFailure:    Bool = stage0.updateLaneState.maskExhausted && stage0.enqueue.fire
     // update mask register
     when(maskUpdateFire) {
-      record.mask.bits := maskInput
+      record.mask.bits := maskDataVec(index)
     }
     when(maskUpdateFire ^ maskFailure) {
       record.mask.valid := maskUpdateFire
@@ -853,7 +883,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
   // It’s been a long time since I selected it. Need pipe
   val queueBeforeMaskWrite: QueueIO[VRFWriteRequest] =
     Queue.io(chiselTypeOf(maskedWriteUnit.enqueue.bits), entries = 1, pipe = true)
-  val writeSelect:          UInt                     = Wire(UInt((parameter.chainingSize + 4).W))
+  val writeSelect:          UInt                     = Wire(UInt((parameter.chainingSize + 3).W))
   val writeCavitation:      UInt                     = VecInit(allVrfWriteAfterCheck.map(_.mask === 0.U)).asUInt
 
   // 处理 rf
@@ -931,20 +961,26 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
   }
 
   {
-    // 处理mask的请求
-    val maskSelectArbitrator = ffo(
-      VecInit(slotMaskRequestVec.map(_.valid)).asUInt ##
-        (laneRequest.valid && (laneRequest.bits.mask || laneRequest.bits.decodeResult(Decoder.maskSource)))
-    )
-    maskRequestFireOH := maskSelectArbitrator(parameter.chainingSize, 1)
-    maskSelect        := Mux1H(
-      maskSelectArbitrator,
-      0.U.asTypeOf(slotMaskRequestVec.head.bits) +: slotMaskRequestVec.map(_.bits)
-    )
-    maskSelectSew     := Mux1H(
-      maskSelectArbitrator,
-      csrInterface.vSew +: slotControl.map(_.csr.vSew)
-    )
+    // The entry selected by maskControlReqSelect supplies the group and sew driven onto maskSelect/maskSelectSew.
+    maskSelect         := Mux1H(maskControlReqSelect, maskControlVec.map(_.group))
+    maskSelectSew      := Mux1H(maskControlReqSelect, maskControlVec.map(_.sew))
+    // Hand buffered mask data to each slot and collect which entries were consumed.
+    maskControlDataDeq := slotMaskRequestVec.zipWithIndex.map { case (req, index) =>
+      val slotIndex       = slotControl(index).laneRequest.instructionIndex
+      // Match live entries only: a released entry keeps its old index (and possibly a set dataValid) until it is
+      // reallocated, so a reused instruction index could otherwise raise several select bits, which Mux1H does
+      // not define.
+      val hitMaskControl  = VecInit(maskControlVec.map(c => c.controlValid && (c.index === slotIndex))).asUInt
+      val dataValid       = Mux1H(hitMaskControl, maskControlVec.map(_.dataValid))
+      val data            = Mux1H(hitMaskControl, maskControlVec.map(_.maskData))
+      val group           = Mux1H(hitMaskControl, maskControlVec.map(_.group))
+      val sameGroup       = group === req.bits
+      // sameGroup is not consumed by any logic here; dontTouch keeps it from being optimized away.
+      dontTouch(sameGroup)
+      val maskRequestFire = req.valid && dataValid
+      maskRequestFireOH(index) := maskRequestFire
+      maskDataVec(index)       := data
+      maskAnd(maskRequestFire, hitMaskControl).asUInt
+    }.reduce(_ | _)
   }
 
   // package a control logic for incoming instruction.
@@ -954,8 +985,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
   val maskLogicCompleted: Bool =
     laneRequest.bits.decodeResult(Decoder.maskLogic) &&
       (laneIndex ## 0.U(parameter.datapathWidthBits.W) >= csrInterface.vl)
-  // latch CSR from V
-  entranceControl.csr := csrInterface
 
   entranceControl.laneRequest         := laneRequest.bits
   // TODO: in scalar core, raise illegal instruction exception when vstart is nonzero.
@@ -969,9 +998,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
       // for 'nr' type instructions, they will need another complete signal.
       !(laneRequest.bits.decodeResult(Decoder.nr) || laneRequest.bits.lsWholeReg)
   // indicate if this is the mask type.
-  entranceControl.mask.valid          := laneRequest.bits.mask
+  entranceControl.mask.valid          := false.B
   // assign mask from [[V]]
-  entranceControl.mask.bits           := maskInput
+  entranceControl.mask.bits           := DontCare
   // mask used for VRF write in this group.
   entranceControl.vrfWriteMask        := 0.U
 
@@ -1057,7 +1086,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
     // enqueue from lane request
     if (slotIndex == parameter.chainingSize - 1) {
       enqueueValid := laneRequest.valid
-      enqueueReady := slotShiftValid(slotIndex) && vrf.instructionWriteReport.ready
+      enqueueReady := slotShiftValid(slotIndex)
       when(enqueueFire) {
         slotControl(slotIndex)       := entranceControl
         maskGroupCountVec(slotIndex) := 0.U(parameter.maskGroupSizeBits.W)
@@ -1087,7 +1116,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
 
   // handshake
   // @todo @Clo91eaf lane can take request from Sequencer
-  laneRequest.ready := slotFree && vrf.instructionWriteReport.ready
+  laneRequest.ready := slotFree
 
   val instructionFinishAndNotReportByTop: Bool =
     entranceControl.instructionFinished && !laneRequest.bits.decodeResult(Decoder.readOnly) && (writeCount === 0.U)
@@ -1119,7 +1148,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
   vrf.instructionWriteReport.bits.state.wLaneLastReport  := !laneRequest.valid
   vrf.instructionWriteReport.bits.state.wTopLastReport   := !laneRequest.bits.decodeResult(Decoder.maskUnit)
   vrf.instructionWriteReport.bits.state.wLaneClear       := false.B
-  vrfAllocateIssue                                       := vrf.vrfAllocateIssue
 
   val elementSizeForOneRegister: Int  = parameter.vLen / parameter.datapathWidth / parameter.laneNumber
   val nrMask:                    UInt = VecInit(Seq.tabulate(8) { i =>
@@ -1153,17 +1181,23 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
 
   vrf.instructionWriteReport.bits.elementMask := selectMask
 
+  instructionFinishInSlot := (~instructionValid).asUInt & instructionValidNext
+
+  val emptyInstValid: Bool = RegNext(laneRequest.bits.issueInst && !vrf.instructionWriteReport.valid, false.B)
+  val emptyInstCount: UInt = RegNext(indexToOH(laneRequest.bits.instructionIndex, parameter.chainingSize))
+  val emptyReport:    UInt = maskAnd(emptyInstValid, emptyInstCount).asUInt
+
   // clear record by instructionFinished
-  vrf.instructionLastReport                 := instructionFinished
+  vrf.instructionLastReport                 := instructionFinishInSlot
   vrf.lsuLastReport                         := lsuLastReport
   vrf.loadDataInLSUWriteQueue               := loadDataInLSUWriteQueue
   vrf.dataInLane                            := instructionValid
-  instructionFinished                       := (~instructionValid).asUInt & instructionValidNext
+  instructionFinished                       := vrf.vrfSlotRelease | emptyReport
   writeReadyForLsu                          := vrf.writeReadyForLsu
   vrfReadyToStore                           := vrf.vrfReadyToStore
   tokenManager.crossWriteReports.zipWithIndex.foreach { case (rpt, rptIndex) =>
-    rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 2 + rptIndex)
-    rpt.bits  := allVrfWriteAfterCheck(parameter.chainingSize + 2 + rptIndex).instructionIndex
+    rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 1 + rptIndex)
+    rpt.bits  := allVrfWriteAfterCheck(parameter.chainingSize + 1 + rptIndex).instructionIndex
   }
   // todo: add mask unit write token
   tokenManager.responseReport.valid         := maskUnitRequest.valid
@@ -1199,13 +1233,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
 
   tokenManager.topWriteEnq.valid := vrfWriteChannel.fire
   tokenManager.topWriteEnq.bits  := vrfWriteChannel.bits.instructionIndex
-  tokenManager.fromMask          := writeFromMask
-
-  tokenManager.lsuWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize)
-  tokenManager.lsuWriteDeq.bits  := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex
 
-  tokenManager.maskWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize + 1)
-  tokenManager.maskWriteDeq.bits  := allVrfWriteAfterCheck(parameter.chainingSize + 1).instructionIndex
+  tokenManager.topWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize)
+  tokenManager.topWriteDeq.bits  := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex
 
   tokenManager.maskUnitLastReport := lsuLastReport
 
diff --git a/t1/src/T1.scala b/t1/src/T1.scala
index f739c885c..dc76eb784 100644
--- a/t1/src/T1.scala
+++ b/t1/src/T1.scala
@@ -66,10 +66,10 @@ class T1OM(parameter: T1Parameter) extends GeneralOM[T1Parameter, T1](parameter)
   val decoderIn = IO(Input(Property[AnyClassType]()))
   decoder := decoderIn
 
-  val permutatuon   = IO(Output(Property[AnyClassType]()))
+  val permutation   = IO(Output(Property[AnyClassType]()))
   @public
-  val permutatuonIn = IO(Input(Property[AnyClassType]()))
-  permutatuon := permutatuonIn
+  val permutationIn = IO(Input(Property[AnyClassType]()))
+  permutation := permutationIn
 }
 
 object T1Parameter {
@@ -252,6 +252,19 @@ case class T1Parameter(
   // and the values are their respective delays.
   val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1))
 
+  val laneRequestTokenSize:   Int      = 4
+  val laneRequestShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1)
+
+  val maskUnitReadTokenSize:   Seq[Int] = Seq.tabulate(laneNumber)(_ => 4)
+  val maskUnitReadShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1)
+
+  val lsuReadTokenSize:   Seq[Int] = Seq.tabulate(laneNumber)(_ => 4)
+  val lsuReadShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1)
+  // Passed doubled as LaneParameter.maskRequestLatency (presumably one hop for request, one for response — TODO confirm).
+  val maskRequestLatency: Int = 2
+
+  val releaseShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1)
+
   val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions)
 
   /** paraemter for AXI4. */
@@ -290,6 +303,7 @@ case class T1Parameter(
       crossLaneVRFWriteEscapeQueueSize = vrfWriteQueueSize,
       fpuEnable = fpuEnable,
       portFactor = vrfBankSize,
+      maskRequestLatency = 2 * maskRequestLatency,
       vrfRamType = vrfRamType,
       decoderParam = decoderParam,
       vfuInstantiateParameter = vfuInstantiateParameter
@@ -391,7 +405,7 @@ class T1(val parameter: T1Parameter)
   val maskUnit: Instance[MaskUnit] = Instantiate(new MaskUnit(parameter))
   maskUnit.io.clock        := implicitClock
   maskUnit.io.reset        := implicitReset
-  omInstance.permutatuonIn := Property(maskUnit.io.om.asAnyClassType)
+  omInstance.permutationIn := Property(maskUnit.io.om.asAnyClassType)
 
   val tokenManager: Instance[T1TokenManager] = Instantiate(new T1TokenManager(parameter))
 
@@ -624,9 +638,21 @@ class T1(val parameter: T1Parameter)
     control
   }
 
-  /** lane is ready to receive new instruction. */
-  val laneReady:    Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool()))
-  val allLaneReady: Bool      = laneReady.asUInt.andR
+  // Close to top
+  val laneRequestSourceWire: Vec[DecoupledIO[LaneRequest]] = Wire(
+    Vec(parameter.laneNumber, Decoupled(new LaneRequest(parameter.laneParam)))
+  )
+  // Close to lane
+  val laneRequestSinkWire:   Vec[DecoupledIO[LaneRequest]] = Wire(
+    Vec(parameter.laneNumber, Decoupled(new LaneRequest(parameter.laneParam)))
+  )
+
+  laneRequestSourceWire.zipWithIndex.foreach { case (source, index) =>
+    val sink = laneRequestSinkWire(index)
+    connectDecoupledWithShifter(parameter.laneRequestShifterSize(index), parameter.laneRequestTokenSize)(source, sink)
+  }
+
+  val allLaneReady: Bool = VecInit(laneRequestSourceWire.map(_.ready)).asUInt.andR
   // TODO: review later
   // todo: 把scheduler的反馈也加上,lsu有更高的优先级
 
@@ -635,25 +661,18 @@ class T1(val parameter: T1Parameter)
   val completeIndexInstruction: Bool =
     ohCheck(lsu.lastReport, slots.last.record.instructionIndex, parameter.chainingSize) && !slots.last.state.idle
 
-  val vrfWrite: Vec[DecoupledIO[VRFWriteRequest]] = Wire(
-    Vec(
-      parameter.laneNumber,
-      Decoupled(
-        new VRFWriteRequest(
-          parameter.vrfParam.regNumBits,
-          parameter.vrfParam.vrfOffsetBits,
-          parameter.instructionIndexBits,
-          parameter.datapathWidth
-        )
-      )
-    )
-  )
-
   val freeOR: Bool = VecInit(slots.map(_.state.idle)).asUInt.orR
 
   /** slot is ready to accept new instructions. */
   val slotReady: Bool = Mux(specialInstruction, slots.map(_.state.idle).last, freeOR)
 
+  val olderCheck: Bool = slots.map { re =>
+    // The same lsb will make it difficult to distinguish between the new and the old
+    val notSameLSB: Bool = re.record.instructionIndex(parameter.instructionIndexBits - 2, 0) =/=
+      requestReg.bits.instructionIndex(parameter.instructionIndexBits - 2, 0)
+    re.state.idle || (instIndexL(re.record.instructionIndex, requestReg.bits.instructionIndex) && notSameLSB)
+  }.reduce(_ && _)
+
   val source1Select: UInt =
     Mux(
       decodeResult(Decoder.gather),
@@ -688,92 +707,89 @@ class T1(val parameter: T1Parameter)
     requestReg.bits.issue.vl
   )
 
-  /** instantiate lanes. TODO: move instantiate to top of class.
-    */
-  val laneVec: Seq[Instance[Lane]] = Seq.tabulate(parameter.laneNumber) { index =>
-    val lane: Instance[Lane] = Instantiate(new Lane(parameter.laneParam))
-    // lane.laneRequest.valid -> requestRegDequeue.ready -> lane.laneRequest.ready -> lane.laneRequest.bits
-    // TODO: this is harmful for PnR design, since it broadcast ready singal to each lanes, which will significantly
-    //       reduce the scalability for large number of lanes.
-    lane.laneRequest.valid                 := requestRegDequeue.fire && !noOffsetReadLoadStore && !maskUnitInstruction
+  laneRequestSourceWire.foreach { request =>
+    request.valid                 := requestRegDequeue.fire
     // hard wire
-    lane.laneRequest.bits.instructionIndex := requestReg.bits.instructionIndex
-    lane.laneRequest.bits.decodeResult     := decodeResult
-    lane.laneRequest.bits.vs1              := requestRegDequeue.bits.instruction(19, 15)
-    lane.laneRequest.bits.vs2              := requestRegDequeue.bits.instruction(24, 20)
-    lane.laneRequest.bits.vd               := requestRegDequeue.bits.instruction(11, 7)
-    lane.laneRequest.bits.segment          := Mux(
+    request.bits.instructionIndex := requestReg.bits.instructionIndex
+    request.bits.decodeResult     := decodeResult
+    request.bits.vs1              := requestRegDequeue.bits.instruction(19, 15)
+    request.bits.vs2              := requestRegDequeue.bits.instruction(24, 20)
+    request.bits.vd               := requestRegDequeue.bits.instruction(11, 7)
+    request.bits.segment          := Mux(
       decodeResult(Decoder.nr),
       requestRegDequeue.bits.instruction(17, 15),
       requestRegDequeue.bits.instruction(31, 29)
     )
 
-    lane.laneRequest.bits.loadStoreEEW   := requestRegDequeue.bits.instruction(13, 12)
+    request.bits.loadStoreEEW   := requestRegDequeue.bits.instruction(13, 12)
     // if the instruction is vi and vx type of gather, gather from rs2 with mask VRF read channel from one lane,
     // and broadcast to all lanes.
-    lane.laneRequest.bits.readFromScalar := source1Select
+    request.bits.readFromScalar := source1Select
 
-    lane.laneRequest.bits.issueInst  := requestRegDequeue.fire
-    lane.laneRequest.bits.loadStore  := isLoadStoreType
+    request.bits.issueInst  := !noOffsetReadLoadStore && !maskUnitInstruction
+    request.bits.loadStore  := isLoadStoreType
     // let record in VRF to know there is a store instruction.
-    lane.laneRequest.bits.store      := isStoreType
+    request.bits.store      := isStoreType
     // let lane know if this is a special instruction, which need group-level synchronization between lane and [[V]]
-    lane.laneRequest.bits.special    := specialInstruction
-    lane.laneRequest.bits.lsWholeReg := lsWholeReg
+    request.bits.special    := specialInstruction
+    request.bits.lsWholeReg := lsWholeReg
     // mask type instruction.
-    lane.laneRequest.bits.mask       := maskType
-    laneReady(index)                 := lane.laneRequest.ready
+    request.bits.mask       := maskType
 
-    lane.csrInterface      := requestRegCSR
+    // connect csrInterface
+    request.bits.csrInterface      := requestRegCSR
     // index type EEW Decoded in the instruction
-    lane.csrInterface.vSew := vSewSelect
-    lane.csrInterface.vl   := evlForLane
-    lane.laneIndex         := index.U
-
-    // lsu 优先会有死锁:
-    // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写
-    // vse32.v v1, (a0) -> 依赖上一条,但是会先发出read
-
-    // Mask priority will also be
-    // vse32.v v19, (a0)
-    // vfslide1down.vf v19, v10, x1
-    val maskUnitFirst = RegInit(false.B)
-    val tryToRead     = lsu.vrfReadDataPorts(index).valid || maskUnit.io.readChannel(index).valid
-    when(tryToRead && !lane.vrfReadAddressChannel.fire) {
-      maskUnitFirst := !maskUnitFirst
-    }
-    lane.vrfReadAddressChannel.valid := Mux(
-      maskUnitFirst,
-      maskUnit.io.readChannel(index).valid,
-      lsu.vrfReadDataPorts(index).valid
+    request.bits.csrInterface.vSew := vSewSelect
+    request.bits.csrInterface.vl   := evlForLane
+  }
+
+  /** instantiate lanes. TODO: move instantiate to top of class.
+    */
+  val laneVec: Seq[Instance[Lane]] = Seq.tabulate(parameter.laneNumber) { index =>
+    val lane: Instance[Lane] = Instantiate(new Lane(parameter.laneParam))
+    lane.laneRequest.valid           := laneRequestSinkWire(index).valid && laneRequestSinkWire(index).bits.issueInst
+    lane.laneRequest.bits            := laneRequestSinkWire(index).bits
+    lane.laneRequest.bits.issueInst  := laneRequestSinkWire(index).fire
+    laneRequestSinkWire(index).ready := !laneRequestSinkWire(index).bits.issueInst || lane.laneRequest.ready
+
+    lane.laneIndex := index.U
+
+    connectVrfAccess(
+      Seq(parameter.maskUnitReadShifterSize(index), parameter.lsuReadShifterSize(index)),
+      Seq(parameter.maskUnitReadTokenSize(index), parameter.lsuReadTokenSize(index)),
+      Some(parameter.vrfReadLatency)
+    )(
+      VecInit(Seq(maskUnit.io.readChannel(index), lsu.vrfReadDataPorts(index))),
+      lane.vrfReadAddressChannel,
+      0,
+      Some(lane.vrfReadDataChannel),
+      Some(Seq(maskUnit.io.readResult(index), lsu.vrfReadResults(index)))
+    )
+
+    connectVrfAccess(
+      Seq(parameter.maskUnitReadShifterSize(index), parameter.lsuReadShifterSize(index)),
+      Seq(parameter.maskUnitReadTokenSize(index), parameter.lsuReadTokenSize(index))
+    )(
+      VecInit(Seq(maskUnit.io.exeResp(index), lsu.vrfWritePort(index))),
+      lane.vrfWriteChannel,
+      0
     )
-    lane.vrfReadAddressChannel.bits      :=
-      Mux(maskUnitFirst, maskUnit.io.readChannel(index).bits, lsu.vrfReadDataPorts(index).bits)
-    lsu.vrfReadDataPorts(index).ready    := lane.vrfReadAddressChannel.ready && !maskUnitFirst
-    maskUnit.io.readChannel(index).ready := lane.vrfReadAddressChannel.ready && maskUnitFirst
-    maskUnit.io.readResult(index)        := lane.vrfReadDataChannel
-    lsu.vrfReadResults(index)            := lane.vrfReadDataChannel
-
-    val maskTryToWrite = maskUnit.io.exeResp(index)
-    // lsu & mask unit write lane
-    // Mask write has absolute priority because it has a token
-    lane.vrfWriteChannel.valid := vrfWrite(index).valid || maskTryToWrite.valid
-    lane.vrfWriteChannel.bits  := Mux(maskTryToWrite.valid, maskTryToWrite.bits, vrfWrite(index).bits)
-    vrfWrite(index).ready      := lane.vrfWriteChannel.ready && !maskTryToWrite.valid
-    lane.writeFromMask         := maskTryToWrite.valid
+    lane.writeFromMask := maskUnit.io.exeResp(index).fire
 
     lsu.offsetReadResult(index).valid := lane.maskUnitRequest.valid && lane.maskRequestToLSU
     lsu.offsetReadResult(index).bits  := lane.maskUnitRequest.bits.source2
     lsu.offsetReadIndex(index)        := lane.maskUnitRequest.bits.index
 
+    val instructionFinishedPipe = Pipe(true.B, lane.instructionFinished, parameter.releaseShifterSize(index)).bits
     instructionFinished(index).zip(slots.map(_.record.instructionIndex)).foreach { case (d, f) =>
-      d := ohCheck(lane.instructionFinished, f, parameter.chainingSize)
+      d := ohCheck(instructionFinishedPipe, f, parameter.chainingSize)
     }
-    vxsatReportVec(index)                := lane.vxsatReport
-    lane.maskInput                       := maskUnit.io.laneMaskInput(index)
-    maskUnit.io.laneMaskSelect(index)    := lane.maskSelect
-    maskUnit.io.laneMaskSewSelect(index) := lane.maskSelectSew
+    vxsatReportVec(index) := lane.vxsatReport
+    lane.maskInput                       := Pipe(true.B, maskUnit.io.laneMaskInput(index), parameter.maskRequestLatency).bits
+    maskUnit.io.laneMaskSelect(index)    := Pipe(true.B, lane.maskSelect, parameter.maskRequestLatency).bits
+    maskUnit.io.laneMaskSewSelect(index) := Pipe(true.B, lane.maskSelectSew, parameter.maskRequestLatency).bits
     maskUnit.io.v0UpdateVec(index) <> lane.v0Update
+    lsu.v0UpdateVec(index) <> lane.v0Update
 
     lane.lsuLastReport := lsu.lastReport | maskUnit.io.lastReport
 
@@ -785,7 +801,7 @@ class T1(val parameter: T1Parameter)
         (requestReg.bits.writeByte(rowWith - 1, 0) > ((parameter.datapathWidth / 8) * index).U)
 
     // token manager
-    tokenManager.instructionFinish(index) := lane.instructionFinished
+    tokenManager.instructionFinish(index) := instructionFinishedPipe
 
     lane
   }
@@ -808,12 +824,10 @@ class T1(val parameter: T1Parameter)
   lsu.request.bits.instructionInformation.isStore         := isStoreType
   lsu.request.bits.instructionInformation.maskedLoadStore := maskType
 
-  maskUnit.io.lsuMaskSelect := lsu.maskSelect
-  lsu.maskInput             := maskUnit.io.lsuMaskInput
-  lsu.csrInterface          := requestRegCSR
-  lsu.csrInterface.vl       := evlForLsu
-  lsu.writeReadyForLsu      := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR
-  lsu.vrfReadyToStore       := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR
+  lsu.csrInterface     := requestRegCSR
+  lsu.csrInterface.vl  := evlForLsu
+  lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR
+  lsu.vrfReadyToStore  := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR
 
   // connect mask unit
   maskUnit.io.instReq.valid                 := requestRegDequeue.fire && requestReg.bits.decodeResult(Decoder.maskUnit)
@@ -838,7 +852,6 @@ class T1(val parameter: T1Parameter)
   }
 
   maskUnit.io.tokenIO.zip(laneVec).zipWithIndex.foreach { case ((token, lane), index) =>
-    token.maskResponseRelease       := lane.tokenIO.maskResponseRelease
     lane.tokenIO.maskRequestRelease := token.maskRequestRelease || lsu.tokenIO.offsetGroupRelease(index)
   }
 
@@ -875,8 +888,6 @@ class T1(val parameter: T1Parameter)
 
   io.highBandwidthLoadStorePort <> lsu.axi4Port
   io.indexedLoadStorePort <> lsu.simpleAccessPorts
-  // 暂时直接连lsu的写,后续需要处理scheduler的写
-  vrfWrite.zip(lsu.vrfWritePort).foreach { case (sink, source) => sink <> source }
 
   /** Slot has free entries. */
   val free = VecInit(slots.map(_.state.idle)).asUInt
@@ -898,7 +909,6 @@ class T1(val parameter: T1Parameter)
 
   /** for lsu instruction lsu is ready, for normal instructions, lanes are ready. */
   val executionReady: Bool = (!isLoadStoreType || lsu.request.ready) && (noOffsetReadLoadStore || allLaneReady)
-  val vrfAllocate:    Bool = VecInit(laneVec.map(_.vrfAllocateIssue)).asUInt.andR
   // - ready to issue instruction
   // - for vi and vx type of gather, it need to access vs2 for one time, we read vs2 firstly in `gatherReadFinish`
   //   and convert it to mv instruction.
@@ -907,7 +917,7 @@ class T1(val parameter: T1Parameter)
   //   we detect the hazard and decide should we issue this slide or
   //   issue the instruction after the slide which already in the slot.
   requestRegDequeue.ready := executionReady && slotReady && (!gatherNeedRead || maskUnit.io.gatherData.valid) &&
-    tokenManager.issueAllow && instructionIndexFree && vrfAllocate
+    tokenManager.issueAllow && instructionIndexFree && olderCheck
 
   instructionToSlotOH := Mux(requestRegDequeue.fire, slotToEnqueue, 0.U)
 
@@ -963,13 +973,12 @@ class T1(val parameter: T1Parameter)
     probeWire.requestRegReady    := requestRegDequeue.ready
     // maskUnitWrite maskUnitWriteReady
     probeWire.writeQueueEnqVec.zip(maskUnit.io.exeResp).foreach { case (probe, write) =>
-      probe.valid := write.valid && write.bits.mask.orR
+      probe.valid := write.fire && write.bits.mask.orR
       probe.bits  := write.bits.instructionIndex
     }
-    probeWire.instructionValid   := maskAnd(
-      !slots.last.state.wMaskUnitLast && !slots.last.state.idle,
-      indexToOH(slots.last.record.instructionIndex, parameter.chainingSize)
-    ).asUInt
+    probeWire.instructionValid   := slots
+      .map(s => maskAnd(!s.state.idle, indexToOH(s.record.instructionIndex, parameter.chainingSize)).asUInt)
+      .reduce(_ | _)
     probeWire.responseCounter    := responseCounter
     probeWire.laneProbes.zip(laneVec).foreach { case (p, l) => p := probe.read(l.laneProbe) }
     probeWire.lsuProbe           := probe.read(lsu.lsuProbe)
diff --git a/t1/src/laneStage/MaskExchangeUnit.scala b/t1/src/laneStage/MaskExchangeUnit.scala
index cb8257f83..49faff45f 100644
--- a/t1/src/laneStage/MaskExchangeUnit.scala
+++ b/t1/src/laneStage/MaskExchangeUnit.scala
@@ -54,8 +54,7 @@ class MaskExchangeUnit(parameter: LaneParameter) extends Module {
 
   val maskRequestEnqReady: Bool = !enqIsMaskRequest || maskRequestAllow
 
-  dequeue.valid               := enqueue.valid && enqSendToDeq
-  dequeue.bits                := enqueue.bits
-  enqueue.ready               := Mux(enqSendToDeq, dequeue.ready, maskRequestEnqReady)
-  tokenIO.maskResponseRelease := DontCare
+  dequeue.valid := enqueue.valid && enqSendToDeq
+  dequeue.bits  := enqueue.bits
+  enqueue.ready := Mux(enqSendToDeq, dequeue.ready, maskRequestEnqReady)
 }
diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala
index 22eef760b..dadea0fd5 100644
--- a/t1/src/laneStage/SlotTokenManager.scala
+++ b/t1/src/laneStage/SlotTokenManager.scala
@@ -94,13 +94,7 @@ class SlotTokenManager(parameter: LaneParameter) extends Module {
   val topWriteEnq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W))))
 
   @public
-  val fromMask: Bool = IO(Input(Bool()))
-
-  @public
-  val lsuWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W))))
-
-  @public
-  val maskWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W))))
+  val topWriteDeq: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W))))
 
   @public
   val instructionValid: UInt = IO(Output(UInt((2 * parameter.chainingSize).W)))
@@ -212,27 +206,16 @@ class SlotTokenManager(parameter: LaneParameter) extends Module {
   val instructionInWritePipe: UInt = tokenUpdate(writePipeToken, writePipeEnq, writePipeDeq)
 
   // lsu & mask write token
-  val lsuWriteToken:  Seq[UInt] = Seq.tabulate(2 * parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))
-  val maskWriteToken: Seq[UInt] = Seq.tabulate(2 * parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))
+  val topWriteToken: Seq[UInt] = Seq.tabulate(2 * parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))
 
   val topWriteDoEnq: UInt =
     maskAnd(topWriteEnq.valid, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt
 
-  val lsuWriteDoEnq: UInt =
-    maskAnd(topWriteEnq.valid && !fromMask, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt
-
-  val maskWriteDoEnq: UInt =
-    maskAnd(topWriteEnq.valid && fromMask, indexToOH(topWriteEnq.bits, parameter.chainingSize)).asUInt
-
-  val lsuWriteDoDeq: UInt =
-    maskAnd(lsuWriteDeq.valid, indexToOH(lsuWriteDeq.bits, parameter.chainingSize)).asUInt
-
-  val maskWriteDoDeq: UInt =
-    maskAnd(maskWriteDeq.valid, indexToOH(maskWriteDeq.bits, parameter.chainingSize)).asUInt
+  val topWriteDoDeq: UInt =
+    maskAnd(topWriteDeq.valid, indexToOH(topWriteDeq.bits, parameter.chainingSize)).asUInt
 
-  val lsuInTopWrite  = tokenUpdate(lsuWriteToken, lsuWriteDoEnq, lsuWriteDoDeq)
-  val maskInTopWrite = tokenUpdate(maskWriteToken, maskWriteDoEnq, maskWriteDoDeq)
+  val topWrite: UInt = tokenUpdate(topWriteToken, topWriteDoEnq, topWriteDoDeq)
 
-  dataInWritePipe  := instructionInWritePipe | lsuInTopWrite | maskInTopWrite
+  dataInWritePipe  := instructionInWritePipe | topWrite
   instructionValid := dataInWritePipe | instructionInSlot
 }
diff --git a/t1/src/lsu/LSU.scala b/t1/src/lsu/LSU.scala
index 7c93e09ed..cd29f90be 100644
--- a/t1/src/lsu/LSU.scala
+++ b/t1/src/lsu/LSU.scala
@@ -118,15 +118,10 @@ class LSU(param: LSUParameter) extends Module {
   @public
   val request: DecoupledIO[LSURequest] = IO(Flipped(Decoupled(new LSURequest(param.datapathWidth))))
 
-  /** mask from [[V]] TODO: since mask is one-cycle information for a mask group, we should latch it in the LSU, and
-    * reduce the IO width. this needs PnR information.
-    */
-  @public
-  val maskInput: Vec[UInt] = IO(Input(Vec(param.lsuMSHRSize, UInt(param.maskGroupWidth.W))))
-
-  /** the address of the mask group in the [[V]]. */
   @public
-  val maskSelect: Vec[UInt] = IO(Output(Vec(param.lsuMSHRSize, UInt(param.maskGroupSizeBits.W))))
+  val v0UpdateVec: Vec[ValidIO[V0Update]] = IO(
+    Flipped(Vec(param.laneNumber, Valid(new V0Update(param.datapathWidth, param.vrfOffsetBits))))
+  )
 
   @public
   val axi4Port: AXI4RWIrrevocable = IO(new AXI4RWIrrevocable(param.axi4BundleParameter))
@@ -150,7 +145,7 @@ class LSU(param: LSUParameter) extends Module {
   /** hard wire form Top. TODO: merge to [[vrfReadDataPorts]]
     */
   @public
-  val vrfReadResults: Vec[UInt] = IO(Input(Vec(param.laneNumber, UInt(param.datapathWidth.W))))
+  val vrfReadResults: Vec[ValidIO[UInt]] = IO(Vec(param.laneNumber, Flipped(Valid(UInt(param.datapathWidth.W)))))
 
   /** write channel to [[V]], which will redirect it to [[Lane.vrf]]. */
   @public
@@ -197,6 +192,25 @@ class LSU(param: LSUParameter) extends Module {
   val storeUnit: StoreUnit        = Module(new StoreUnit(param.mshrParam))
   val otherUnit: SimpleAccessUnit = Module(new SimpleAccessUnit(param.mshrParam))
 
+  /** LSU-local copy of v0, held as vLen / datapathWidth words of datapathWidth bits, all reset to zero. */
+  val v0: Vec[UInt] = RegInit(
+    VecInit(Seq.fill(param.vLen / param.datapathWidth)(0.U(param.datapathWidth.W)))
+  )
+
+  // Update the v0 (mask) copy from the per-lane v0UpdateVec ports; each mask bit is expanded to cover 8 data bits.
+  v0.zipWithIndex.foreach { case (data, index) =>
+    // lane that this word of v0 belongs to
+    val laneIndex: Int = index % param.laneNumber
+    // pick that lane's v0 update port
+    val v0Write = v0UpdateVec(laneIndex)
+    // word offset within the lane, compared against the offset carried by the update
+    val offset: Int = index / param.laneNumber
+    val maskExt = FillInterleaved(8, v0Write.bits.mask)
+    when(v0Write.valid && v0Write.bits.offset === offset.U) {
+      data := (data & (~maskExt).asUInt) | (maskExt & v0Write.bits.data)
+    }
+  }
+
   val unitVec = Seq(loadUnit, storeUnit, otherUnit)
 
   /** Always merge into cache line */
@@ -222,8 +236,8 @@ class LSU(param: LSUParameter) extends Module {
     mshr.lsuRequest.valid := reqEnq(index)
     mshr.lsuRequest.bits  := request.bits
 
-    maskSelect(index) := Mux(mshr.maskSelect.valid, mshr.maskSelect.bits, 0.U)
-    mshr.maskInput    := maskInput(index)
+    val maskSelect = Mux(mshr.maskSelect.valid, mshr.maskSelect.bits, 0.U)
+    mshr.maskInput := cutUInt(v0.asUInt, param.maskGroupWidth)(maskSelect)
 
     // broadcast CSR
     mshr.csrInterface := csrInterface
@@ -250,7 +264,8 @@ class LSU(param: LSUParameter) extends Module {
   otherUnit.vrfReadDataPorts.ready := (otherTryReadVrf & VecInit(vrfReadDataPorts.map(_.ready)).asUInt).orR
   val pipeOtherRead:   ValidIO[UInt] =
     Pipe(otherUnit.vrfReadDataPorts.fire, otherUnit.status.targetLane, param.vrfReadLatency)
-  otherUnit.vrfReadResults.bits  := Mux1H(pipeOtherRead.bits, vrfReadResults)
+  // todo: read data reorder
+  otherUnit.vrfReadResults.bits  := Mux1H(pipeOtherRead.bits, vrfReadResults.map(_.bits))
   otherUnit.vrfReadResults.valid := pipeOtherRead.valid
 
   // write vrf
diff --git a/t1/src/lsu/SimpleAccessUnit.scala b/t1/src/lsu/SimpleAccessUnit.scala
index d7409500b..bc517d5eb 100644
--- a/t1/src/lsu/SimpleAccessUnit.scala
+++ b/t1/src/lsu/SimpleAccessUnit.scala
@@ -120,6 +120,10 @@ case class MSHRParam(
   // outstanding of MaskExchangeUnit.maskReq
   // todo: param from T1Param
   val maskRequestQueueSize: Int = 8
+
+  // outstanding of StoreUnit.vrfReadDataPorts
+  // todo: param from T1Param
+  val storeUnitReadOutStanding: Int = 8
 }
 
 /** Miss Status Handler Register this is used to record the outstanding memory access request for each instruction. it
diff --git a/t1/src/lsu/StoreUnit.scala b/t1/src/lsu/StoreUnit.scala
index 1fb72096b..00f261a4b 100644
--- a/t1/src/lsu/StoreUnit.scala
+++ b/t1/src/lsu/StoreUnit.scala
@@ -39,9 +39,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic {
   /** hard wire form Top. see [[LSU.vrfReadResults]]
     */
   @public
-  val vrfReadResults:  Vec[UInt] = IO(Input(Vec(param.laneNumber, UInt(param.datapathWidth.W))))
+  val vrfReadResults:  Vec[ValidIO[UInt]] = IO(Input(Vec(param.laneNumber, Valid(UInt(param.datapathWidth.W)))))
   @public
-  val vrfReadyToStore: Bool      = IO(Input(Bool()))
+  val vrfReadyToStore: Bool               = IO(Input(Bool()))
   @public
   val storeResponse = IO(Input(Bool()))
 
@@ -62,8 +62,9 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic {
   // todo: need hazardCheck?
   val hazardCheck:     Bool               = RegEnable(vrfReadyToStore && !lsuRequest.valid, false.B, lsuRequest.valid || vrfReadyToStore)
   // read stage dequeue ready need all source valid, Or add a queue to coordinate
-  val vrfReadQueueVec: Seq[QueueIO[UInt]] =
-    Seq.tabulate(param.laneNumber)(_ => Queue.io(UInt(param.datapathWidth.W), 2, flow = true, pipe = true))
+  val vrfReadQueueVec: Seq[QueueIO[UInt]] = Seq.tabulate(param.laneNumber)(_ =>
+    Queue.io(UInt(param.datapathWidth.W), param.storeUnitReadOutStanding, flow = true, pipe = true)
+  )
 
   // 从vrf里面读数据
   val readStageValid: Bool = Seq
@@ -72,9 +73,6 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic {
       val segPtr:    UInt                        = RegInit(0.U(3.W))
       val readCount: UInt                        = RegInit(0.U(dataGroupBits.W))
       val stageValid = RegInit(false.B)
-      // queue for read latency
-      val queue: QueueIO[UInt] =
-        Queue.io(UInt(param.datapathWidth.W), param.vrfReadLatency, flow = true)
 
       val lastReadPtr: Bool = segPtr === 0.U
 
@@ -104,8 +102,14 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic {
         readCount := nextReadCount
       }
 
+      val readCounter = RegInit(0.U(log2Ceil(param.storeUnitReadOutStanding + 1).W))
+      val counterChange: UInt = Mux(readPort.fire, 1.U, -1.S(readCounter.getWidth.W).asUInt)
+      when(readPort.fire ^ vrfReadQueueVec(laneIndex).deq.fire) {
+        readCounter := readCounter + counterChange
+      }
+
       // vrf read request
-      readPort.valid                 := stageValid && vrfReadQueueVec(laneIndex).enq.ready
+      readPort.valid                 := stageValid && !readCounter.asBools.last
       readPort.bits.vs               :=
         lsuRequestReg.instructionInformation.vs3 +
           segPtr * segmentInstructionIndexInterval +
@@ -114,15 +118,11 @@ class StoreUnit(param: MSHRParam) extends StrideBase(param) with LSUPublic {
       readPort.bits.offset           := readCount
       readPort.bits.instructionIndex := lsuRequestReg.instructionIndex
 
-      // pipe read fire
-      val readResultFire = Pipe(readPort.fire, 0.U.asTypeOf(new EmptyBundle), param.vrfReadLatency).valid
-
       // latency queue enq
-      queue.enq.valid := readResultFire
-      queue.enq.bits  := vrfReadResults(laneIndex)
-      AssertProperty(BoolSequence(!queue.enq.valid || queue.enq.ready))
-      vrfReadQueueVec(laneIndex).enq <> queue.deq
-      stageValid || RegNext(readPort.fire)
+      AssertProperty(BoolSequence(!vrfReadQueueVec(laneIndex).enq.valid || vrfReadQueueVec(laneIndex).enq.ready))
+      vrfReadQueueVec(laneIndex).enq.valid := vrfReadResults(laneIndex).valid
+      vrfReadQueueVec(laneIndex).enq.bits  := vrfReadResults(laneIndex).bits
+      stageValid || readCounter.orR
     }
     .reduce(_ || _)
 
diff --git a/t1/src/mask/BitLevelMaskWrite.scala b/t1/src/mask/BitLevelMaskWrite.scala
index cefe31868..5bc73491e 100644
--- a/t1/src/mask/BitLevelMaskWrite.scala
+++ b/t1/src/mask/BitLevelMaskWrite.scala
@@ -43,8 +43,8 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module {
     )
   }
 
-  val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ =>
-    IO(Input(UInt(parameter.datapathWidth.W)))
+  val readResult: Seq[ValidIO[UInt]] = Seq.tabulate(parameter.laneNumber) { _ =>
+    IO(Flipped(Valid(UInt(parameter.datapathWidth.W))))
   }
 
   val stageClear: Bool = IO(Output(Bool()))
@@ -52,7 +52,7 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module {
   val stageClearVec: Seq[Bool] = in.zipWithIndex.map { case (req, index) =>
     val reqQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), 4)
     val readPort = readChannel(index)
-    val readData = readResult(index)
+    val readData = readResult(index).bits
     val res      = out(index)
 
     val WaitReadQueue: QueueIO[BitLevelWriteRequest] = Queue.io(chiselTypeOf(req.bits), readVRFLatency)
@@ -68,7 +68,7 @@ class BitLevelMaskWrite(parameter: T1Parameter) extends Module {
     readPort.bits.vs     := vd + (reqQueue.deq.bits.groupCounter >> readPort.bits.offset.getWidth).asUInt
     readPort.bits.offset := changeUIntSize(reqQueue.deq.bits.groupCounter, readPort.bits.offset.getWidth)
 
-    val readValidPipe   = Pipe(readPort.fire, false.B, readVRFLatency).valid
+    val readValidPipe   = Pipe(readPort.fire, false.B, readVRFLatency).valid && readResult(index).valid
     val readResultValid = !needWAR || readValidPipe
 
     val WARData = (WaitReadQueue.deq.bits.data & WaitReadQueue.deq.bits.bitMask) |
diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala
index 24f597259..03520db7a 100644
--- a/t1/src/mask/MaskCompress.scala
+++ b/t1/src/mask/MaskCompress.scala
@@ -4,9 +4,26 @@
 package org.chipsalliance.t1.rtl
 
 import chisel3._
+import chisel3.experimental.hierarchy.{instantiable, Instance, Instantiate}
+import chisel3.experimental.{SerializableModule, SerializableModuleParameter}
+import chisel3.properties.{AnyClassType, Path, Property}
 import chisel3.util._
+import org.chipsalliance.stdlib.GeneralOM
 
-class CompressInput(parameter: T1Parameter) extends Bundle {
+case class CompressParam(
+  datapathWidth:   Int,
+  xLen:            Int,
+  vLen:            Int,
+  laneNumber:      Int,
+  groupNumberBits: Int,
+  latency:         Int)
+    extends SerializableModuleParameter
+
+object CompressParam {
+  implicit def rwP: upickle.default.ReadWriter[CompressParam] = upickle.default.macroRW[CompressParam]
+}
+
+class CompressInput(parameter: CompressParam) extends Bundle {
   val maskType:       Bool = Bool()
   val eew:            UInt = UInt(2.W)
   val uop:            UInt = UInt(3.W)
@@ -14,26 +31,53 @@ class CompressInput(parameter: T1Parameter) extends Bundle {
   val source1:        UInt = UInt(parameter.datapathWidth.W)
   val mask:           UInt = UInt(parameter.datapathWidth.W)
   val source2:        UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W)
-  val groupCounter:   UInt = UInt(parameter.laneParam.groupNumberBits.W)
+  val groupCounter:   UInt = UInt(parameter.groupNumberBits.W)
   val ffoInput:       UInt = UInt(parameter.laneNumber.W)
   val validInput:     UInt = UInt(parameter.laneNumber.W)
   val lastCompress:   Bool = Bool()
 }
 
-class CompressOutput(parameter: T1Parameter) extends Bundle {
+class CompressOutput(parameter: CompressParam) extends Bundle {
   val data:          UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W)
   val mask:          UInt = UInt((parameter.laneNumber * parameter.datapathWidth / 8).W)
-  val groupCounter:  UInt = UInt(parameter.laneParam.groupNumberBits.W)
+  val groupCounter:  UInt = UInt(parameter.groupNumberBits.W)
   val ffoOutput:     UInt = UInt(parameter.laneNumber.W)
   val compressValid: Bool = Bool()
 }
 
-class MaskCompress(parameter: T1Parameter) extends Module {
-  val in:             ValidIO[CompressInput] = IO(Flipped(Valid(new CompressInput(parameter))))
-  val out:            CompressOutput         = IO(Output(new CompressOutput(parameter)))
-  val newInstruction: Bool                   = IO(Input(Bool()))
-  val ffoInstruction: Bool                   = IO(Input(Bool()))
-  val writeData:      UInt                   = IO(Output(UInt(parameter.xLen.W)))
+class MaskCompressInterFace(parameter: CompressParam) extends Bundle {
+  val clock = Input(Clock())
+  val reset = Input(Reset())
+
+  val in:             ValidIO[CompressInput] = Flipped(Valid(new CompressInput(parameter)))
+  val out:            CompressOutput         = Output(new CompressOutput(parameter))
+  val newInstruction: Bool                   = Input(Bool())
+  val ffoInstruction: Bool                   = Input(Bool())
+  val writeData:      UInt                   = Output(UInt(parameter.xLen.W))
+  val om = Output(Property[AnyClassType]())
+}
+
+@instantiable
+class MaskCompressOM(parameter: CompressParam) extends GeneralOM[CompressParam, MaskCompress](parameter) {}
+
+class MaskCompress(val parameter: CompressParam)
+    extends FixedIORawModule(new MaskCompressInterFace(parameter))
+    with SerializableModule[CompressParam]
+    with ImplicitClock
+    with ImplicitReset {
+
+  protected def implicitClock = io.clock
+  protected def implicitReset = io.reset
+
+  val omInstance: Instance[MaskCompressOM] = Instantiate(new MaskCompressOM(parameter))
+  io.om := omInstance.getPropertyReference
+  omInstance.retimeIn.foreach(_ := Property(Path(io.clock)))
+
+  val in             = io.in
+  val out            = io.out
+  val newInstruction = io.newInstruction
+  val ffoInstruction = io.ffoInstruction
+  val writeData      = io.writeData
 
   val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8
 
@@ -44,11 +88,16 @@ class MaskCompress(parameter: T1Parameter) extends Module {
   val writeRD  = in.bits.uop === BitPat("b?11")
   val ffoType  = in.bits.uop === BitPat("b11?")
 
+  val outWire: CompressOutput = Wire(new CompressOutput(parameter))
+
   val eew1H:           UInt      = UIntToOH(in.bits.eew)(2, 0)
   val compressInit:    UInt      = RegInit(0.U(log2Ceil(parameter.vLen).W))
   val compressVec:     Vec[UInt] = Wire(Vec(maskSize, UInt(compressInit.getWidth.W)))
-  val compressMaskVec: Seq[Bool] = changeUIntSize(in.bits.source1 & in.bits.mask, maskSize).asBools
-  val compressCount:   UInt      = compressMaskVec.zipWithIndex.foldLeft(compressInit) { case (pre, (mask, index)) =>
+  val maskInput:       UInt      = changeUIntSize(in.bits.source1 & in.bits.mask, maskSize)
+  val compressMaskVec: Vec[Bool] = VecInit(maskInput.asBools)
+  val compressCount:   UInt      = compressInit + PopCount(maskInput)
+
+  compressMaskVec.zipWithIndex.foldLeft(compressInit) { case (pre, (mask, index)) =>
     compressVec(index) := pre
     pre + mask
   }
@@ -58,6 +107,7 @@ class MaskCompress(parameter: T1Parameter) extends Module {
   val ffoValid: Bool = RegInit(false.B)
   writeData := ffoIndex
 
+  // compress & viota stage 1: update compressInit
   when(newInstruction) {
     compressInit := 0.U
   }
@@ -66,7 +116,7 @@ class MaskCompress(parameter: T1Parameter) extends Module {
     val dataByte          = 1 << sewInt
     val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte
     val countWidth        = log2Ceil(elementSizePerSet)
-    val compressDeqValid  = (compressCount >> countWidth).asUInt.orR
+    val compressDeqValid  = (compressCount >> countWidth).asUInt(0)
     val compressUpdate    = changeUIntSize(compressCount, countWidth)
     (compressDeqValid, compressUpdate)
   }
@@ -83,28 +133,43 @@ class MaskCompress(parameter: T1Parameter) extends Module {
     }
   }
 
-  val viotaResult: UInt = Mux1H(
+  // compress & viota stage 2: get result
+  // pipe stage1 result
+  // Register that captures `data` while `enable` is high; its reset value is `data`'s type filled with zero bits.
+  def initRegEnable[T <: Data](data: T, enable: Bool) =
+    RegEnable(data, 0.U.asTypeOf(data), enable)
+  val compressVecPipe:      Vec[UInt] = initRegEnable(compressVec, in.fire)
+  val compressMaskVecPipe:  Vec[Bool] = initRegEnable(compressMaskVec, in.fire)
+  val maskPipe:             UInt      = initRegEnable(in.bits.mask, in.fire)
+  val source2Pipe:          UInt      = initRegEnable(in.bits.source2, in.fire)
+  val lastCompressPipe:     Bool      = initRegEnable(in.bits.lastCompress, in.fire)
+  val stage2Valid:          Bool      = RegNext(in.fire, false.B)
+  val newInstructionPipe:   Bool      = RegNext(newInstruction, false.B)
+  val compressInitPipe:     UInt      = initRegEnable(compressInit, in.fire)
+  val compressDeqValidPipe: Bool      = initRegEnable(compressDeqValid, in.fire)
+  val groupCounterPipe:     UInt      = initRegEnable(in.bits.groupCounter, in.fire)
+  val viotaResult:          UInt      = Mux1H(
     eew1H,
     Seq(1, 2, 4).map { eew =>
       VecInit(Seq.tabulate(parameter.laneNumber) { index =>
         // data width: eew * 8, data path 32, need [4 / eew] element
         val dataSize = 4 / eew
         val res: Seq[UInt] = Seq.tabulate(dataSize) { i =>
-          changeUIntSize(compressVec(dataSize * index + i), eew * 8)
+          changeUIntSize(compressVecPipe(dataSize * index + i), eew * 8)
         }
         // each data path
         VecInit(res).asUInt
       }).asUInt
     }
   )
-  val viotaMask:   UInt = Mux1H(
+  val viotaMask:            UInt      = Mux1H(
     eew1H,
     Seq(1, 2, 4).map { eew =>
       VecInit(Seq.tabulate(parameter.laneNumber) { index =>
         val dataSize = 4 / eew
         val res: Seq[UInt] = Seq.tabulate(dataSize) { i =>
           val maskIndex: Int = (parameter.datapathWidth - 1).min(dataSize * index + i)
-          Fill(eew, in.bits.mask(maskIndex))
+          Fill(eew, maskPipe(maskIndex))
         }
         // 4 bit mask
         VecInit(res).asUInt
@@ -115,21 +180,23 @@ class MaskCompress(parameter: T1Parameter) extends Module {
   val tailCount: UInt = {
     val minElementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8
     val maxCountWidth        = log2Ceil(minElementSizePerSet)
-    changeUIntSize(compressInit, maxCountWidth)
+    changeUIntSize(compressInitPipe, maxCountWidth)
   }
 
   val compressDataReg = RegInit(0.U((parameter.laneNumber * parameter.datapathWidth).W))
   val compressTailValid:       Bool = RegInit(false.B)
-  val compressWriteGroupCount: UInt = RegInit(0.U(parameter.laneParam.groupNumberBits.W))
+  val compressWriteGroupCount: UInt = RegInit(0.U(parameter.groupNumberBits.W))
   val compressDataVec = Seq(1, 2, 4).map { dataByte =>
     val dataBit           = dataByte * 8
     val elementSizePerSet = parameter.laneNumber * parameter.datapathWidth / 8 / dataByte
     VecInit(Seq.tabulate(elementSizePerSet * 2) { index =>
       val hitReq        =
-        Seq.tabulate(elementSizePerSet)(maskIndex => compressMaskVec(maskIndex) && compressVec(maskIndex) === index.U)
+        Seq.tabulate(elementSizePerSet)(maskIndex =>
+          compressMaskVecPipe(maskIndex) && compressVecPipe(maskIndex) === index.U
+        )
       val selectReqData = Mux1H(
         hitReq,
-        cutUInt(in.bits.source2, dataBit)
+        cutUInt(source2Pipe, dataBit)
       )
       if (index < elementSizePerSet) {
         val useTail  = index.U < tailCount
@@ -141,18 +208,18 @@ class MaskCompress(parameter: T1Parameter) extends Module {
     }).asUInt
   }
   val compressResult: UInt = Mux1H(eew1H, compressDataVec)
-  val lastCompressEnq: Bool = in.fire && in.bits.lastCompress
-  when(newInstruction || lastCompressEnq || out.compressValid) {
+  val lastCompressEnq: Bool = stage2Valid && lastCompressPipe
+  when(newInstructionPipe || lastCompressEnq || outWire.compressValid) {
     compressTailValid := lastCompressEnq && compress
   }
 
-  when(newInstruction || out.compressValid) {
-    compressWriteGroupCount := Mux(newInstruction, 0.U, compressWriteGroupCount + 1.U)
+  when(newInstructionPipe || outWire.compressValid) {
+    compressWriteGroupCount := Mux(newInstructionPipe, 0.U, compressWriteGroupCount + 1.U)
   }
 
   val splitCompressResult: Vec[UInt] = cutUIntBySize(compressResult, 2)
-  when(in.fire) {
-    compressDataReg := Mux(compressDeqValid, splitCompressResult(1), splitCompressResult(0))
+  when(stage2Valid) {
+    compressDataReg := Mux(compressDeqValidPipe, splitCompressResult(1), splitCompressResult(0))
   }
 
   // todo: connect & update compressInit
@@ -177,7 +244,7 @@ class MaskCompress(parameter: T1Parameter) extends Module {
 
   val ffoMask: UInt = FillInterleaved(parameter.datapathWidth / 8, in.bits.validInput)
 
-  out.data := Mux1H(
+  outWire.data := Mux1H(
     Seq(
       compress -> compressResult,
       viota    -> viotaResult,
@@ -187,7 +254,7 @@ class MaskCompress(parameter: T1Parameter) extends Module {
   )
 
   // todo: compressMask
-  out.mask := Mux1H(
+  outWire.mask := Mux1H(
     Seq(
       compress -> compressMask,
       viota    -> viotaMask,
@@ -197,9 +264,10 @@ class MaskCompress(parameter: T1Parameter) extends Module {
   )
 
   // todo
-  out.compressValid := (compressTailValid || (compressDeqValid && in.fire)) && !writeRD
-  out.groupCounter  := Mux(compress, compressWriteGroupCount, in.bits.groupCounter)
+  outWire.compressValid := (compressTailValid || (compressDeqValidPipe && stage2Valid)) && !writeRD
+  outWire.groupCounter  := Mux(compress, compressWriteGroupCount, groupCounterPipe)
 
+  // ffo type execute
   when(newInstruction && ffoInstruction) {
     ffoIndex := -1.S(parameter.datapathWidth.W).asUInt
     ffoValid := false.B
@@ -235,5 +303,6 @@ class MaskCompress(parameter: T1Parameter) extends Module {
   }.elsewhen(mvRd) {
     ffoIndex := source1SigExtend
   }
-  out.ffoOutput := completedLeftOr | Fill(parameter.laneNumber, ffoValid)
+  outWire.ffoOutput := completedLeftOr | Fill(parameter.laneNumber, ffoValid)
+  out := RegNext(outWire, 0.U.asTypeOf(outWire))
 }
diff --git a/t1/src/mask/MaskReduce.scala b/t1/src/mask/MaskReduce.scala
index adba8a813..d479b064b 100644
--- a/t1/src/mask/MaskReduce.scala
+++ b/t1/src/mask/MaskReduce.scala
@@ -75,7 +75,8 @@ class MaskReduce(val parameter: MaskReduceParameter)
   val omInstance: Instance[MaskReduceOM] = Instantiate(new MaskReduceOM(parameter))
   io.om := omInstance.getPropertyReference
 
-  val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8
+  val floatAdderLatency: Int = 1
+  val maskSize:          Int = parameter.laneNumber * parameter.datapathWidth / 8
 
   // todo: uop decode
   val order:    Bool = in.bits.uop === "b101".U
@@ -89,7 +90,7 @@ class MaskReduce(val parameter: MaskReduceParameter)
   val logicUnit:   Instance[LaneLogic]            = Instantiate(new LaneLogic(LaneLogicParameter(parameter.datapathWidth)))
   // option unit for flot reduce
   val floatAdder:  Option[Instance[FloatAdder]]   =
-    Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(FloatAdderParameter(8, 24))))
+    Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(FloatAdderParameter(8, 24, floatAdderLatency))))
   omInstance.floatAdderIn.zip(floatAdder).foreach { case (l, r) => l := r.io.om.asAnyClassType }
   val flotCompare: Option[Instance[FloatCompare]] =
     Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(FloatCompareParameter(8, 24))))
@@ -114,6 +115,7 @@ class MaskReduce(val parameter: MaskReduceParameter)
   val floatType:  Bool = reqReg.uop(2) || reqReg.uop(1, 0).andR
   val NotAdd:     Bool = reqReg.uop(1)
   val widen:      Bool = reqReg.uop === "b001".U || reqReg.uop(2, 1) === "b11".U
+  val floatAdd:   Bool = floatType && !NotAdd
   // eew1HReg(0) || (eew1HReg(1) && !widen)
   val needFold:   Bool = false.B
   val writeEEW:   UInt = Mux(pop, 2.U, reqReg.eew + widen)
@@ -123,16 +125,21 @@ class MaskReduce(val parameter: MaskReduceParameter)
   // crossFold: reduce between lane
   // lastFold: reduce in data path
   // orderRed: order reduce
-  val idle :: crossFold :: lastFold :: orderRed :: Nil = Enum(4)
+  val idle :: crossFold :: lastFold :: orderRed :: waitRes :: Nil = Enum(5)
   val state: UInt = RegInit(idle)
 
   val stateIdle:  Bool = state === idle
   val stateCross: Bool = state === crossFold
   val stateLast:  Bool = state === lastFold
   val stateOrder: Bool = state === orderRed
+  val stateWait:  Bool = state === waitRes
 
+  // wait float
+  val waitCount: UInt = RegInit(0.U(log2Ceil(floatAdderLatency.max(2)).W))
+  when(stateWait) { waitCount := waitCount + 1.U }
+  val resFire:   Bool = stateWait && waitCount === (floatAdderLatency - 1).U
   updateResult :=
-    stateLast || ((stateCross || stateOrder) && sourceValid)
+    stateLast || ((stateCross || stateOrder) && sourceValid && !floatAdd) || resFire
 
   // state update
   in.ready := stateIdle
@@ -143,9 +150,21 @@ class MaskReduce(val parameter: MaskReduceParameter)
   }
 
   when(stateCross) {
+    when(floatAdd) {
+      state     := waitRes
+      waitCount := 0.U
+    }.elsewhen(groupLastReduce) {
+      state    := Mux(reqReg.lastGroup && needFold, lastFold, idle)
+      outValid := reqReg.lastGroup && !needFold
+    }
+  }
+
+  when(stateWait && resFire) {
     when(groupLastReduce) {
       state    := Mux(reqReg.lastGroup && needFold, lastFold, idle)
       outValid := reqReg.lastGroup && !needFold
+    }.otherwise {
+      state := crossFold
     }
   }
 
diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala
index 932e3a6ba..641ea9bd4 100644
--- a/t1/src/mask/MaskUnit.scala
+++ b/t1/src/mask/MaskUnit.scala
@@ -39,15 +39,15 @@ import org.chipsalliance.t1.rtl.decoder.Decoder
 // 11 11 1 -> maskdestination
 
 class MaskUnitInterface(parameter: T1Parameter) extends Bundle {
-  val clock:             Clock                            = Input(Clock())
-  val reset:             Reset                            = Input(Reset())
-  val instReq:           ValidIO[MaskUnitInstReq]         = Flipped(Valid(new MaskUnitInstReq(parameter)))
-  val exeReq:            Vec[ValidIO[MaskUnitExeReq]]     = Flipped(
+  val clock:             Clock                             = Input(Clock())
+  val reset:             Reset                             = Input(Reset())
+  val instReq:           ValidIO[MaskUnitInstReq]          = Flipped(Valid(new MaskUnitInstReq(parameter)))
+  val exeReq:            Vec[ValidIO[MaskUnitExeReq]]      = Flipped(
     Vec(parameter.laneNumber, Valid(new MaskUnitExeReq(parameter.laneParam)))
   )
-  val exeResp:           Vec[ValidIO[VRFWriteRequest]]    = Vec(
+  val exeResp:           Vec[DecoupledIO[VRFWriteRequest]] = Vec(
     parameter.laneNumber,
-    Valid(
+    Decoupled(
       new VRFWriteRequest(
         parameter.vrfParam.regNumBits,
         parameter.laneParam.vrfOffsetBits,
@@ -56,8 +56,8 @@ class MaskUnitInterface(parameter: T1Parameter) extends Bundle {
       )
     )
   )
-  val tokenIO:           Vec[LaneTokenBundle]             = Flipped(Vec(parameter.laneNumber, new LaneTokenBundle))
-  val readChannel:       Vec[DecoupledIO[VRFReadRequest]] = Vec(
+  val tokenIO:           Vec[LaneTokenBundle]              = Flipped(Vec(parameter.laneNumber, new LaneTokenBundle))
+  val readChannel:       Vec[DecoupledIO[VRFReadRequest]]  = Vec(
     parameter.laneNumber,
     Decoupled(
       new VRFReadRequest(
@@ -67,28 +67,32 @@ class MaskUnitInterface(parameter: T1Parameter) extends Bundle {
       )
     )
   )
-  val readResult:        Vec[UInt]                        = Flipped(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W)))
-  val writeRD:           ValidIO[UInt]                    = Valid(UInt(parameter.datapathWidth.W))
-  val lastReport:        UInt                             = Output(UInt((2 * parameter.chainingSize).W))
-  val lsuMaskInput:      Vec[UInt]                        = Output(Vec(parameter.lsuMSHRSize, UInt(parameter.maskGroupWidth.W)))
-  val lsuMaskSelect:     Vec[UInt]                        = Input(Vec(parameter.lsuMSHRSize, UInt(parameter.lsuParameters.maskGroupSizeBits.W)))
-  val laneMaskInput:     Vec[UInt]                        = Output(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W)))
-  val laneMaskSelect:    Vec[UInt]                        = Input(Vec(parameter.laneNumber, UInt(parameter.laneParam.maskGroupSizeBits.W)))
-  val laneMaskSewSelect: Vec[UInt]                        = Input(Vec(parameter.laneNumber, UInt(2.W)))
-  val v0UpdateVec:       Vec[ValidIO[V0Update]]           = Flipped(Vec(parameter.laneNumber, Valid(new V0Update(parameter.laneParam))))
-  val writeRDData:       UInt                             = Output(UInt(parameter.xLen.W))
-  val gatherData:        DecoupledIO[UInt]                = Decoupled(UInt(parameter.xLen.W))
-  val gatherRead:        Bool                             = Input(Bool())
-  val om:                Property[ClassType]              = Output(Property[AnyClassType]())
+  val readResult:        Vec[ValidIO[UInt]]                = Flipped(Vec(parameter.laneNumber, Valid(UInt(parameter.datapathWidth.W))))
+  val writeRD:           ValidIO[UInt]                     = Valid(UInt(parameter.datapathWidth.W))
+  val lastReport:        UInt                              = Output(UInt((2 * parameter.chainingSize).W))
+  val laneMaskInput:     Vec[UInt]                         = Output(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W)))
+  val laneMaskSelect:    Vec[UInt]                         = Input(Vec(parameter.laneNumber, UInt(parameter.laneParam.maskGroupSizeBits.W)))
+  val laneMaskSewSelect: Vec[UInt]                         = Input(Vec(parameter.laneNumber, UInt(2.W)))
+  val v0UpdateVec:       Vec[ValidIO[V0Update]]            = Flipped(
+    Vec(parameter.laneNumber, Valid(new V0Update(parameter.laneParam.datapathWidth, parameter.laneParam.vrfOffsetBits)))
+  )
+  val writeRDData:       UInt                              = Output(UInt(parameter.xLen.W))
+  val gatherData:        DecoupledIO[UInt]                 = Decoupled(UInt(parameter.xLen.W))
+  val gatherRead:        Bool                              = Input(Bool())
+  val om:                Property[ClassType]               = Output(Property[AnyClassType]())
 }
 
 @instantiable
 class MaskUnitOM(parameter: T1Parameter) extends GeneralOM[T1Parameter, MaskUnit](parameter) {
-  @public
   val reduceUnit   = IO(Output(Property[AnyClassType]()))
   @public
   val reduceUnitIn = IO(Input(Property[AnyClassType]()))
   reduceUnit := reduceUnitIn
+  // `compress` re-exports the property received on `compressIn`, mirroring the reduceUnit pair above.
+  val compress   = IO(Output(Property[AnyClassType]()))
+  @public
+  val compressIn = IO(Input(Property[AnyClassType]()))
+  compress := compressIn
 }
 
 // TODO: no T1Parameter here.
@@ -116,8 +120,6 @@ class MaskUnit(val parameter: T1Parameter)
   val readResult        = io.readResult
   val writeRD           = io.writeRD
   val lastReport        = io.lastReport
-  val lsuMaskInput      = io.lsuMaskInput
-  val lsuMaskSelect     = io.lsuMaskSelect
   val laneMaskInput     = io.laneMaskInput
   val laneMaskSelect    = io.laneMaskSelect
   val laneMaskSewSelect = io.laneMaskSewSelect
@@ -168,11 +170,6 @@ class MaskUnit(val parameter: T1Parameter)
     input := cutUInt(v0SelectBySew, parameter.datapathWidth)(laneMaskSelect(index))
   }
 
-  // lsu
-  lsuMaskInput.zip(lsuMaskSelect).foreach { case (data, index) =>
-    data := cutUInt(v0.asUInt, parameter.maskGroupWidth)(index)
-  }
-
   val maskedWrite: BitLevelMaskWrite = Module(new BitLevelMaskWrite(parameter))
 
   def gatherIndex(elementIndex: UInt, vlmul: UInt, sew: UInt): (UInt, UInt, UInt, UInt, Bool) = {
@@ -795,7 +792,7 @@ class MaskUnit(val parameter: T1Parameter)
     val dataOffset: UInt = Mux1H(readResultSelect, pipeDataOffset)
     readTokenRelease(index) := readDataQueue.deq.fire
     readDataQueue.enq.valid := readResultSelect.orR
-    readDataQueue.enq.bits  := Mux1H(readResultSelect, readResult) >> (dataOffset ## 0.U(3.W))
+    readDataQueue.enq.bits  := Mux1H(readResultSelect, readResult.map(_.bits)) >> (dataOffset ## 0.U(3.W))
     readDataQueue.deq
   }
 
@@ -905,14 +902,24 @@ class MaskUnit(val parameter: T1Parameter)
   // Determine whether the data is ready
   val executeEnqValid: Bool = otherTypeRequestDeq && !readType
 
+  val compressParam: CompressParam = CompressParam(
+    parameter.datapathWidth,
+    parameter.xLen,
+    parameter.vLen,
+    parameter.laneNumber,
+    parameter.laneParam.groupNumberBits,
+    2
+  )
   // start execute
-  val compressUnit: MaskCompress = Module(new MaskCompress(parameter))
-  val reduceUnit = Instantiate(
+  val compressUnit = Instantiate(new MaskCompress(compressParam))
+  val reduceUnit   = Instantiate(
     new MaskReduce(
       MaskReduceParameter(parameter.datapathWidth, parameter.laneNumber, parameter.fpuEnable)
     )
   )
   omInstance.reduceUnitIn := reduceUnit.io.om.asAnyClassType
+  omInstance.compressIn   := compressUnit.io.om.asAnyClassType
+
   val extendUnit: MaskExtend = Module(new MaskExtend(parameter))
 
   // todo
@@ -942,28 +949,30 @@ class MaskUnit(val parameter: T1Parameter)
   val compressSource1: UInt = Mux1H(sew1H, vs1Split.map(_._1))
   val source1Select:   UInt = Mux(mv, readVS1Reg.data, compressSource1)
   val source1Change:   Bool = Mux1H(sew1H, vs1Split.map(_._2))
-  when(source1Change && compressUnit.in.fire) {
+  when(source1Change && compressUnit.io.in.fire) {
     readVS1Reg.dataValid   := false.B
     readVS1Reg.requestSend := false.B
     readVS1Reg.readIndex   := readVS1Reg.readIndex + 1.U
 
   }
-  viotaCounterAdd := compressUnit.in.fire
-
-  compressUnit.in.valid               := executeEnqValid && unitType(1)
-  compressUnit.in.bits.maskType       := instReg.maskType
-  compressUnit.in.bits.eew            := instReg.sew
-  compressUnit.in.bits.uop            := instReg.decodeResult(Decoder.topUop)
-  compressUnit.in.bits.readFromScalar := instReg.readFromScala
-  compressUnit.in.bits.source1        := source1Select
-  compressUnit.in.bits.mask           := executeElementMask
-  compressUnit.in.bits.source2        := source2
-  compressUnit.in.bits.groupCounter   := requestCounter
-  compressUnit.in.bits.lastCompress   := lastGroup
-  compressUnit.in.bits.ffoInput       := VecInit(exeReqReg.map(_.bits.ffo)).asUInt
-  compressUnit.in.bits.validInput     := VecInit(exeReqReg.map(_.valid)).asUInt
-  compressUnit.newInstruction         := instReq.valid
-  compressUnit.ffoInstruction         := instReq.bits.decodeResult(Decoder.topUop)(2, 0) === BitPat("b11?")
+  viotaCounterAdd := compressUnit.io.in.fire
+
+  compressUnit.io.clock                  := implicitClock
+  compressUnit.io.reset                  := implicitReset
+  compressUnit.io.in.valid               := executeEnqValid && unitType(1)
+  compressUnit.io.in.bits.maskType       := instReg.maskType
+  compressUnit.io.in.bits.eew            := instReg.sew
+  compressUnit.io.in.bits.uop            := instReg.decodeResult(Decoder.topUop)
+  compressUnit.io.in.bits.readFromScalar := instReg.readFromScala
+  compressUnit.io.in.bits.source1        := source1Select
+  compressUnit.io.in.bits.mask           := executeElementMask
+  compressUnit.io.in.bits.source2        := source2
+  compressUnit.io.in.bits.groupCounter   := requestCounter
+  compressUnit.io.in.bits.lastCompress   := lastGroup
+  compressUnit.io.in.bits.ffoInput       := VecInit(exeReqReg.map(_.bits.ffo)).asUInt
+  compressUnit.io.in.bits.validInput     := VecInit(exeReqReg.map(_.valid)).asUInt
+  compressUnit.io.newInstruction         := instReq.valid
+  compressUnit.io.ffoInstruction         := instReq.bits.decodeResult(Decoder.topUop)(2, 0) === BitPat("b11?")
 
   reduceUnit.io.clock               := implicitClock
   reduceUnit.io.reset               := implicitReset
@@ -987,7 +996,7 @@ class MaskUnit(val parameter: T1Parameter)
     sink := VecInit(exeReqReg.map(_.bits.fpReduceValid.get)).asUInt
   }
 
-  when(reduceUnit.io.in.fire || compressUnit.in.fire) {
+  when(reduceUnit.io.in.fire || compressUnit.io.in.fire) {
     readVS1Reg.sendToExecution := true.B
   }
 
@@ -1008,7 +1017,7 @@ class MaskUnit(val parameter: T1Parameter)
   val executeResult: UInt = Mux1H(
     unitType(3, 1),
     Seq(
-      compressUnit.out.data,
+      compressUnit.io.out.data,
       reduceUnit.io.out.bits.data,
       extendUnit.out
     )
@@ -1028,7 +1037,7 @@ class MaskUnit(val parameter: T1Parameter)
   val executeValid: Bool = Mux1H(
     unitType(3, 1),
     Seq(
-      compressUnit.out.compressValid,
+      compressUnit.io.out.compressValid,
       false.B,
       executeEnqValid
     )
@@ -1046,13 +1055,13 @@ class MaskUnit(val parameter: T1Parameter)
   val executeDeqGroupCounter: UInt = Mux1H(
     unitType(3, 1),
     Seq(
-      compressUnit.out.groupCounter,
+      compressUnit.io.out.groupCounter,
       requestCounter,
       extendGroupCount
     )
   )
 
-  val executeWriteByteMask: UInt = Mux(compress || ffo || mvVd, compressUnit.out.mask, executeByteMask)
+  val executeWriteByteMask: UInt = Mux(compress || ffo || mvVd, compressUnit.io.out.mask, executeByteMask)
   maskedWrite.needWAR := maskDestinationType
   maskedWrite.vd      := instReg.vd
   maskedWrite.in.zipWithIndex.foreach { case (req, index) =>
@@ -1064,7 +1073,7 @@ class MaskUnit(val parameter: T1Parameter)
     req.bits.pipeData     := exeReqReg(index).bits.source1
     req.bits.bitMask      := bitMask
     req.bits.groupCounter := executeDeqGroupCounter
-    req.bits.ffoByOther   := compressUnit.out.ffoOutput(index) && ffo
+    req.bits.ffoByOther   := compressUnit.io.out.ffoOutput(index) && ffo
     if (index == 0) {
       // reduce result
       when(unitType(2)) {
@@ -1091,17 +1100,10 @@ class MaskUnit(val parameter: T1Parameter)
     }
     queue.enq.bits.index         := instReg.instructionIndex
 
-    // write token
-    val tokenCounter = RegInit(0.U(log2Ceil(parameter.maskUnitVefWriteQueueSize + 1).W))
-    val tokenAllow:    Bool = queue.deq.fire
-    val counterChange: UInt = Mux(tokenAllow, 1.U, -1.S(tokenCounter.getWidth.W).asUInt)
-    when(tokenAllow ^ tokenIO(index).maskResponseRelease) {
-      tokenCounter := tokenCounter + counterChange
-    }
     // write vrf
     val writePort = exeResp(index)
-    queue.deq.ready                 := !tokenCounter.asBools.last
-    writePort.valid                 := tokenAllow
+    queue.deq.ready                 := writePort.ready
+    writePort.valid                 := queue.deq.valid
     writePort.bits.last             := DontCare
     writePort.bits.instructionIndex := instReg.instructionIndex
     writePort.bits.data             := Mux(queue.deq.bits.ffoByOther, queue.deq.bits.pipeData, queue.deq.bits.writeData.data)
@@ -1131,7 +1133,7 @@ class MaskUnit(val parameter: T1Parameter)
   val executeStageInvalid: Bool = Mux1H(
     unitType(3, 1),
     Seq(
-      !compressUnit.out.compressValid,
+      !compressUnit.io.out.compressValid,
       reduceUnit.io.in.ready,
       true.B
     )
@@ -1150,7 +1152,7 @@ class MaskUnit(val parameter: T1Parameter)
     lastReportValid,
     indexToOH(instReg.instructionIndex, parameter.chainingSize)
   )
-  writeRDData := Mux(pop, reduceUnit.io.out.bits.data, compressUnit.writeData)
+  writeRDData := Mux(pop, reduceUnit.io.out.bits.data, compressUnit.io.writeData)
 
   // gather read state
   when(gatherRequestFire) {
diff --git a/t1/src/package.scala b/t1/src/package.scala
index 13bde15a9..7b648bc75 100644
--- a/t1/src/package.scala
+++ b/t1/src/package.scala
@@ -7,6 +7,7 @@ import chisel3._
 import chisel3.experimental.hierarchy.{Instance, Instantiate}
 import chisel3.util._
 import chisel3.util.experimental.decode.DecodeBundle
+import org.chipsalliance.dwbb.stdlib.queue.Queue
 import org.chipsalliance.t1.rtl.decoder.{Decoder, TableGenerator}
 import org.chipsalliance.t1.rtl.lane.Distributor
 
@@ -221,6 +222,89 @@ package object rtl {
     id.map(f => (shifterReg :+ source).map(p => Mux(p.valid, indexToOH(f(p.bits), 4), 0.U)).reduce(_ | _))
   }
 
+  /** Connects `source` to `sink` across a `latency`-stage shifter, with `tokenSize` tokens as back-pressure.
+    *
+    * `source.ready` is driven by the token check alone. Data leaving the shifter is written into the flow queue
+    * without consulting `enq.ready`; presumably the token count keeps the queue from overflowing — TODO confirm.
+    */
+  def connectDecoupledWithShifter[T <: Data](latency: Int, tokenSize: Int)(source: DecoupledIO[T], sink: DecoupledIO[T])
+    : Unit = {
+    val queue       = Queue.io(chiselTypeOf(source.bits), tokenSize, flow = true)
+    // Release runs sink -> source: each sink handshake goes through a `latency`-deep Pipe before pipeToken sees it.
+    val releasePipe = Pipe(sink.fire, 0.U.asTypeOf(new EmptyBundle), latency).valid
+    val tokenCheck: Bool = pipeToken(tokenSize)(source.fire, releasePipe)
+    source.ready := tokenCheck
+
+    // The handshake completes at the source end; past this point the request is a plain valid/bits stream.
+    val validSource: ValidIO[T] = Wire(Valid(chiselTypeOf(source.bits)))
+    val validSink:   ValidIO[T] = Wire(Valid(chiselTypeOf(source.bits)))
+    validSource.bits  := source.bits
+    validSource.valid := source.fire
+    connectWithShifter(latency)(validSource, validSink)
+
+    // Whatever comes out of the shifter is pushed into the queue.
+    // todo: assert(queue.enq.ready || !queue.enq.valid)
+    queue.enq.bits  := validSink.bits
+    queue.enq.valid := validSink.valid
+
+    // The queue output is what the sink finally sees.
+    sink <> queue.deq
+  }
+
+  /** Two-way arbiter between the mask-unit read (`source.head`) and the LSU read (`source.last`).
+    *
+    * `maskUnitFirst` selects the granted side: set picks the mask unit, clear picks the LSU. Only the selected
+    * side is forwarded to the returned port and only that side sees the port's `ready`.
+    */
+  def maskUnitReadArbitrate[T <: Data](source: Vec[DecoupledIO[T]]): DecoupledIO[T] = {
+    require(source.size == 2)
+    val maskRead = source.head
+    val lsuRead  = source.last
+    val sinkWire: DecoupledIO[T] = Wire(Decoupled(chiselTypeOf(maskRead.bits)))
+    val maskUnitFirst = RegInit(false.B)
+    val tryToRead     = maskRead.valid || lsuRead.valid
+    // A request is pending but nothing fired this cycle: flip the selection.
+    when(tryToRead && !sinkWire.fire) { maskUnitFirst := !maskUnitFirst }
+
+    sinkWire.valid := Mux(maskUnitFirst, maskRead.valid, lsuRead.valid)
+    sinkWire.bits  := Mux(maskUnitFirst, maskRead.bits, lsuRead.bits)
+    // Back-pressure is routed to the selected side only.
+    maskRead.ready := maskUnitFirst && sinkWire.ready
+    lsuRead.ready  := !maskUnitFirst && sinkWire.ready
+    sinkWire
+  }
+
+  /** Shifts every `sourceVec` request through its own token-controlled shifter; `arb == 0` merges them into `sink`
+    * via [[maskUnitReadArbitrate]] (any other `arb` leaves `sink` unconnected here). With `dataToSource`, `dataAck`
+    * is shifted back to each source, which requires both `dataAck` and `vrfReadLatency` to be defined. */
+  def connectVrfAccess[T <: Data](
+    latencyVec:     Seq[Int],
+    tokenSizeVec:   Seq[Int],
+    vrfReadLatency: Option[Int] = None
+  )(sourceVec:      Vec[DecoupledIO[T]],
+    sink:           DecoupledIO[T],
+    arb:            Int,
+    dataAck:        Option[UInt] = None,
+    dataToSource:   Option[Seq[ValidIO[UInt]]] = None
+  ): Unit = {
+    val sinkVec: Vec[DecoupledIO[T]] = VecInit(sourceVec.zipWithIndex.map { case (source, index) =>
+      val sinkWire: DecoupledIO[T] = Wire(Decoupled(chiselTypeOf(source.bits)))
+      connectDecoupledWithShifter(latencyVec(index), tokenSizeVec(index))(source, sinkWire)
+      sinkWire
+    })
+    if (arb == 0) sink <> maskUnitReadArbitrate(sinkVec)
+    dataToSource.foreach { sourceDataVec =>
+      require(dataAck.isDefined && vrfReadLatency.isDefined, "dataToSource needs dataAck and vrfReadLatency")
+      sourceDataVec.zipWithIndex.foreach { case (sourceData, index) =>
+        val accessDataValid  = Pipe(sinkVec(index).fire, 0.U.asTypeOf(new EmptyBundle), vrfReadLatency.get).valid
+        val accessDataSource = Wire(Valid(chiselTypeOf(dataAck.get)))
+        accessDataSource.valid := accessDataValid
+        accessDataSource.bits  := dataAck.get
+        connectWithShifter(latencyVec(index))(accessDataSource, sourceData)
+      }
+    }
+  }
+
   def instantiateVFU(
     parameter:          VFUInstantiateParameter
   )(requestVec:         Vec[SlotRequestToVFU],
diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala
index 721c888f4..0292d8409 100644
--- a/t1/src/vrf/VRF.scala
+++ b/t1/src/vrf/VRF.scala
@@ -15,6 +15,7 @@ import chisel3.properties.{AnyClassType, Class, ClassType, Path, Property}
 import org.chipsalliance.stdlib.GeneralOM
 import org.chipsalliance.t1.rtl.{
   ffo,
+  indexToOH,
   instIndexL,
   instIndexLE,
   ohCheck,
@@ -199,7 +200,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
   @public
   val writeCheck: Vec[LSUWriteCheck] = IO(
     Vec(
-      parameter.chainingSize + 4,
+      parameter.chainingSize + 3,
       Input(
         new LSUWriteCheck(
           parameter.regNumBits,
@@ -211,11 +212,11 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
   )
 
   @public
-  val writeAllow: Vec[Bool] = IO(Vec(parameter.chainingSize + 4, Output(Bool())))
+  val writeAllow: Vec[Bool] = IO(Vec(parameter.chainingSize + 3, Output(Bool())))
 
   /** when instruction is fired, record it in the VRF for chaining. */
   @public
-  val instructionWriteReport: DecoupledIO[VRFWriteReport] = IO(Flipped(Decoupled(new VRFWriteReport(parameter))))
+  val instructionWriteReport: ValidIO[VRFWriteReport] = IO(Flipped(Valid(new VRFWriteReport(parameter))))
 
   /** similar to [[flush]]. */
   @public
@@ -224,6 +225,9 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
   @public
   val lsuLastReport: UInt = IO(Input(UInt((2 * parameter.chainingSize).W)))
 
+  @public
+  val vrfSlotRelease: UInt = IO(Output(UInt((2 * parameter.chainingSize).W)))
+
   @public
   val dataInLane: UInt = IO(Input(UInt((2 * parameter.chainingSize).W)))
 
@@ -232,9 +236,6 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
   @public
   val vrfReadyToStore:  Bool = IO(Output(Bool()))
 
-  @public
-  val vrfAllocateIssue: Bool = IO(Output(Bool()))
-
   /** we can only chain LSU instructions, after [[LSU.writeQueueVec]] is cleared. */
   @public
   val loadDataInLSUWriteQueue: UInt = IO(Input(UInt((2 * parameter.chainingSize).W)))
@@ -275,6 +276,13 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
   val chainingRecordCopy: Vec[ValidIO[VRFWriteReport]] = RegInit(
     VecInit(Seq.fill(parameter.chainingSize + 1)(0.U.asTypeOf(Valid(new VRFWriteReport(parameter)))))
   )
+  val recordRelease:      Vec[UInt]                    = WireDefault(
+    VecInit(
+      Seq.fill(parameter.chainingSize + 1)(
+        0.U.asTypeOf(UInt((parameter.chainingSize * 2).W))
+      )
+    )
+  )
   val recordValidVec:     Seq[Bool]                    = chainingRecord.map(r => !r.bits.elementMask.andR && r.valid)
 
   // first read
@@ -500,23 +508,12 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
   // @todo @Clo91eaf VRF ready signal for performance.
   val freeRecord: UInt = VecInit(chainingRecord.map(!_.valid)).asUInt
   val recordFFO:  UInt = ffo(freeRecord)
-  val recordEnq:  UInt = Wire(UInt((parameter.chainingSize + 1).W))
-  val olderCheck = chainingRecord.map { re =>
-    // The same lsb will make it difficult to distinguish between the new and the old
-    val notSameLSB: Bool = re.bits.instIndex(parameter.instructionIndexBits - 2, 0) =/=
-      instructionWriteReport.bits.instIndex(parameter.instructionIndexBits - 2, 0)
-    !re.valid || (instIndexL(re.bits.instIndex, instructionWriteReport.bits.instIndex) && notSameLSB)
-  }.reduce(_ && _)
-  // handle VRF hazard
-  // @todo @Clo91eaf VRF ready signal for performance.
-  instructionWriteReport.ready := freeRecord.orR && olderCheck
-  recordEnq        := Mux(
+  val recordEnq:  UInt = Mux(
     // 纯粹的lsu指令的记录不需要ready
     instructionWriteReport.valid,
     recordFFO,
     0.U((parameter.chainingSize + 1).W)
   )
-  vrfAllocateIssue := freeRecord.orR && olderCheck
 
   val writePort:         Seq[ValidIO[VRFWriteRequest]]    = Seq(writePipe)
   val loadUnitReadPorts: Seq[DecoupledIO[VRFReadRequest]] = Seq(readRequests.last)
@@ -570,6 +567,9 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
 
       when(stateClear) {
         record.valid := false.B
+        when(record.valid) {
+          recordRelease(i) := indexToOH(record.bits.instIndex, parameter.chainingSize)
+        }
       }
 
       when(recordEnq(i)) {
@@ -617,6 +617,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
   }
   writeReadyForLsu := !hazardVec.map(_.map(_._1).reduce(_ || _)).reduce(_ || _)
   vrfReadyToStore := !hazardVec.map(_.map(_._2).reduce(_ || _)).reduce(_ || _)
+  vrfSlotRelease  := recordRelease.reduce(_ | _)
 
   writeCheck.zip(writeAllow).foreach { case (check, allow) =>
     allow := chainingRecordCopy