diff --git a/t1/src/T1.scala b/t1/src/T1.scala
index c3d71115f..a06472e3b 100644
--- a/t1/src/T1.scala
+++ b/t1/src/T1.scala
@@ -258,6 +258,9 @@ case class T1Parameter(
   // and the values are their respective delays.
   val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1))
 
+  val laneRequestTokenSize:   Int      = 4
+  val laneRequestShifterSize: Seq[Int] = Seq.tabulate(laneNumber)(_ => 1)
+
   val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions)
 
   /** paraemter for AXI4. */
@@ -632,9 +635,21 @@ class T1(val parameter: T1Parameter)
     control
   }
 
-  /** lane is ready to receive new instruction. */
-  val laneReady:    Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool()))
-  val allLaneReady: Bool      = laneReady.asUInt.andR
+  // Close to top
+  val laneRequestSourceWire: Vec[DecoupledIO[LaneRequest]] = Wire(
+    Vec(parameter.laneNumber, Decoupled(new LaneRequest(parameter.laneParam)))
+  )
+  // Close to lane
+  val laneRequestSinkWire:   Vec[DecoupledIO[LaneRequest]] = Wire(
+    Vec(parameter.laneNumber, Decoupled(new LaneRequest(parameter.laneParam)))
+  )
+
+  laneRequestSourceWire.zipWithIndex.foreach { case (source, index) =>
+    val sink = laneRequestSinkWire(index)
+    connectDecoupledWithShifter(parameter.laneRequestShifterSize(index), parameter.laneRequestTokenSize)(source, sink)
+  }
+
+  val allLaneReady: Bool = VecInit(laneRequestSourceWire.map(_.ready)).asUInt.andR
   // TODO: review later
   // todo: 把scheduler的反馈也加上,lsu有更高的优先级
 
@@ -696,48 +711,48 @@ class T1(val parameter: T1Parameter)
     requestReg.bits.issue.vl
   )
 
-  /** instantiate lanes. TODO: move instantiate to top of class.
-    */
-  val laneVec: Seq[Instance[Lane]] = Seq.tabulate(parameter.laneNumber) { index =>
-    val lane: Instance[Lane] = Instantiate(new Lane(parameter.laneParam))
-    // lane.laneRequest.valid -> requestRegDequeue.ready -> lane.laneRequest.ready -> lane.laneRequest.bits
-    // TODO: this is harmful for PnR design, since it broadcast ready singal to each lanes, which will significantly
-    //       reduce the scalability for large number of lanes.
-    lane.laneRequest.valid                 := requestRegDequeue.fire && !noOffsetReadLoadStore && !maskUnitInstruction
+  laneRequestSourceWire.foreach { request =>
+    request.valid                 := requestRegDequeue.fire && !noOffsetReadLoadStore && !maskUnitInstruction
     // hard wire
-    lane.laneRequest.bits.instructionIndex := requestReg.bits.instructionIndex
-    lane.laneRequest.bits.decodeResult     := decodeResult
-    lane.laneRequest.bits.vs1              := requestRegDequeue.bits.instruction(19, 15)
-    lane.laneRequest.bits.vs2              := requestRegDequeue.bits.instruction(24, 20)
-    lane.laneRequest.bits.vd               := requestRegDequeue.bits.instruction(11, 7)
-    lane.laneRequest.bits.segment          := Mux(
+    request.bits.instructionIndex := requestReg.bits.instructionIndex
+    request.bits.decodeResult     := decodeResult
+    request.bits.vs1              := requestRegDequeue.bits.instruction(19, 15)
+    request.bits.vs2              := requestRegDequeue.bits.instruction(24, 20)
+    request.bits.vd               := requestRegDequeue.bits.instruction(11, 7)
+    request.bits.segment          := Mux(
       decodeResult(Decoder.nr),
       requestRegDequeue.bits.instruction(17, 15),
       requestRegDequeue.bits.instruction(31, 29)
     )
 
-    lane.laneRequest.bits.loadStoreEEW   := requestRegDequeue.bits.instruction(13, 12)
+    request.bits.loadStoreEEW   := requestRegDequeue.bits.instruction(13, 12)
     // if the instruction is vi and vx type of gather, gather from rs2 with mask VRF read channel from one lane,
     // and broadcast to all lanes.
-    lane.laneRequest.bits.readFromScalar := source1Select
+    request.bits.readFromScalar := source1Select
 
-    lane.laneRequest.bits.issueInst  := requestRegDequeue.fire
-    lane.laneRequest.bits.loadStore  := isLoadStoreType
+    request.bits.issueInst  := requestRegDequeue.fire
+    request.bits.loadStore  := isLoadStoreType
     // let record in VRF to know there is a store instruction.
-    lane.laneRequest.bits.store      := isStoreType
+    request.bits.store      := isStoreType
     // let lane know if this is a special instruction, which need group-level synchronization between lane and [[V]]
-    lane.laneRequest.bits.special    := specialInstruction
-    lane.laneRequest.bits.lsWholeReg := lsWholeReg
+    request.bits.special    := specialInstruction
+    request.bits.lsWholeReg := lsWholeReg
     // mask type instruction.
-    lane.laneRequest.bits.mask       := maskType
-    laneReady(index)                 := lane.laneRequest.ready
+    request.bits.mask       := maskType
 
     // connect csrInterface
-    lane.laneRequest.bits.csrInterface := requestRegCSR
+    request.bits.csrInterface      := requestRegCSR
     // index type EEW Decoded in the instruction
-    lane.laneRequest.bits.csrInterface.vSew := vSewSelect
-    lane.laneRequest.bits.csrInterface.vl   := evlForLane
-    lane.laneIndex                          := index.U
+    request.bits.csrInterface.vSew := vSewSelect
+    request.bits.csrInterface.vl   := evlForLane
+  }
+
+  /** instantiate lanes. TODO: move instantiate to top of class.
+    */
+  val laneVec: Seq[Instance[Lane]] = Seq.tabulate(parameter.laneNumber) { index =>
+    val lane: Instance[Lane] = Instantiate(new Lane(parameter.laneParam))
+    lane.laneRequest <> laneRequestSinkWire(index)
+    lane.laneIndex := index.U
 
     // lsu 优先会有死锁:
     // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写
diff --git a/t1/src/package.scala b/t1/src/package.scala
index 13bde15a9..4bb5835da 100644
--- a/t1/src/package.scala
+++ b/t1/src/package.scala
@@ -7,6 +7,7 @@ import chisel3._
 import chisel3.experimental.hierarchy.{Instance, Instantiate}
 import chisel3.util._
 import chisel3.util.experimental.decode.DecodeBundle
+import org.chipsalliance.dwbb.stdlib.queue.Queue
 import org.chipsalliance.t1.rtl.decoder.{Decoder, TableGenerator}
 import org.chipsalliance.t1.rtl.lane.Distributor
 
@@ -221,6 +222,35 @@ package object rtl {
     id.map(f => (shifterReg :+ source).map(p => Mux(p.valid, indexToOH(f(p.bits), 4), 0.U)).reduce(_ | _))
   }
 
+  def connectDecoupledWithShifter[T <: Data](latency: Int, tokenSize: Int)(source: DecoupledIO[T], sink: DecoupledIO[T])
+    : Unit = {
+    val queue       = Queue.io(chiselTypeOf(source.bits), tokenSize, flow = true)
+    // Reverse pipe release
+    val releasePipe = Pipe(
+      sink.fire,
+      0.U.asTypeOf(new EmptyBundle),
+      latency
+    ).valid
+    val tokenCheck: Bool = pipeToken(tokenSize)(source.fire, releasePipe)
+    source.ready := tokenCheck
+
+    // Complete the handshake at the source end and convert the result of the handshake into a data stream
+    val validSource: ValidIO[T] = Wire(Valid(chiselTypeOf(source.bits)))
+    validSource.valid := source.fire
+    validSource.bits  := source.bits
+
+    val validSink: ValidIO[T] = Wire(Valid(chiselTypeOf(source.bits)))
+
+    // Shift Data
+    connectWithShifter(latency)(validSource, validSink)
+    // Throw the moved data into the queue
+    // todo: assert(queue.enq.ready || !queue.enq.valid)
+    queue.enq.valid := validSink.valid
+    queue.enq.bits  := validSink.bits
+    // Finally, send the data to the sink
+    sink <> queue.deq
+  }
+
   def instantiateVFU(
     parameter:          VFUInstantiateParameter
   )(requestVec:         Vec[SlotRequestToVFU],