From a2883533751e38f478fa8eec1298686c0617edaa Mon Sep 17 00:00:00 2001
From: qinjun-li <liqinjun1994@gmail.com>
Date: Thu, 5 Sep 2024 14:21:28 +0800
Subject: [PATCH] [rtl] refactor mask unit.

---
 t1/src/Bundles.scala                    |  76 ++
 t1/src/Lane.scala                       |  89 ++-
 t1/src/T1.scala                         | 961 +++---------------------
 t1/src/decoder/Decoder.scala            |  18 +-
 t1/src/decoder/attribute/isSwrite.scala |   2 -
 t1/src/laneStage/LaneStage3.scala       |  54 +-
 t1/src/laneStage/MaskExchangeUnit.scala |  65 ++
 t1/src/laneStage/SlotTokenManager.scala |  27 +-
 t1/src/mask/MaskCompress.scala          | 123 +++
 t1/src/mask/MaskExtend.scala            |  69 ++
 t1/src/mask/MaskReduce.scala            | 186 +++++
 t1/src/mask/MaskUnit.scala              | 580 ++++++++++++++
 t1/src/mask/MaskUnitReadCrossBar.scala  |  48 ++
 t1/src/package.scala                    |  26 +
 t1/src/sequencer/T1TokenManager.scala   |  49 +-
 t1/src/vrf/VRF.scala                    |  11 +-
 16 files changed, 1395 insertions(+), 989 deletions(-)
 create mode 100644 t1/src/laneStage/MaskExchangeUnit.scala
 create mode 100644 t1/src/mask/MaskCompress.scala
 create mode 100644 t1/src/mask/MaskExtend.scala
 create mode 100644 t1/src/mask/MaskReduce.scala
 create mode 100644 t1/src/mask/MaskUnit.scala
 create mode 100644 t1/src/mask/MaskUnitReadCrossBar.scala

diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala
index fd833f07e..42652cb51 100644
--- a/t1/src/Bundles.scala
+++ b/t1/src/Bundles.scala
@@ -698,3 +698,79 @@ class T1Retire(xLen: Int) extends Bundle {
   val csr: ValidIO[T1CSRRetire] = Valid(new T1CSRRetire)
   val mem: ValidIO[EmptyBundle] = Valid(new EmptyBundle)
 }
+
+class MaskUnitExecuteState(parameter: T1Parameter) extends Bundle { // mask unit execute-state bundle
+  val groupReadState: UInt      = UInt(parameter.laneNumber.W) // laneNumber bits (presumably one per lane)
+  val needRead:       UInt      = UInt(parameter.laneNumber.W) // laneNumber bits (presumably one per lane)
+  val elementValid:   UInt      = UInt(parameter.laneNumber.W) // laneNumber bits (presumably one per lane)
+  val readOffset:     UInt      = UInt((parameter.laneNumber * parameter.laneParam.vrfOffsetBits).W)
+  val accessLane:     Vec[UInt] = Vec(parameter.laneNumber, UInt(log2Ceil(parameter.laneNumber).W))
+  // vsGrowth is 3 bits wide: 3 = log2Ceil(8), because up to 8 registers are used
+  val vsGrowth:       Vec[UInt] = Vec(parameter.laneNumber, UInt(3.W))
+  val groupCount:     UInt      = UInt(parameter.laneParam.groupNumberBits.W)
+  val executeIndex:   UInt      = UInt(2.W)
+  val readDataOffset: UInt      = UInt((log2Ceil(parameter.datapathWidth / 8) * parameter.laneNumber).W)
+  val last:           Bool      = Bool()
+}
+
+class MaskUnitInstReq(parameter: T1Parameter) extends Bundle { // instruction request bundle for the mask unit
+  val instructionIndex: UInt         = UInt(parameter.instructionIndexBits.W)
+  val decodeResult:     DecodeBundle = Decoder.bundle(parameter.decoderParam)
+  val readFromScala:    UInt         = UInt(parameter.datapathWidth.W) // presumably the scalar operand ("Scala" looks like a typo) - confirm
+  val sew:              UInt         = UInt(2.W)
+  val vlmul:            UInt         = UInt(3.W)
+  val maskType:         Bool         = Bool()
+  val vxrm:             UInt         = UInt(3.W) // NOTE(review): the RVV vxrm field is 2 bits wide; confirm 3 is intended
+  val vs2:              UInt         = UInt(5.W)
+  val vl:               UInt         = UInt(parameter.laneParam.vlMaxBits.W)
+}
+
+class MaskUnitExeReq(parameter: LaneParameter) extends Bundle { // payload of Lane.maskUnitRequest
+  // source1: the vs to read (presumably a register index - confirm)
+  val source1:      UInt = UInt(parameter.datapathWidth.W)
+  // source2: the read offset
+  val source2:      UInt = UInt(parameter.datapathWidth.W)
+  val groupCounter: UInt = UInt(parameter.groupNumberBits.W)
+  val index:        UInt = UInt(parameter.instructionIndexBits.W) // Lane reports this to tokenManager.responseReport
+}
+
+class MaskUnitExeResponse(parameter: LaneParameter) extends Bundle { // payload of Lane.maskUnitResponse
+  val ffoByOther: Bool = Bool() // when the response is valid and this is set, Lane sets ffoRecord.ffoByOtherLanes
+  val writeData = new MaskUnitWriteBundle(parameter) // data, mask and groupCounter
+  val index: UInt = UInt(parameter.instructionIndexBits.W) // Lane reports this to tokenManager.responseFeedbackReport
+}
+
+class MaskUnitReadReq(parameter: T1Parameter) extends Bundle { // read request bundle of the mask unit
+  val vs:           UInt = UInt(5.W)
+  // read offset, taken from source2
+  val offset:       UInt = UInt(parameter.laneParam.vrfOffsetBits.W)
+  // index of the lane to read from
+  val readLane:     UInt = UInt(log2Ceil(parameter.laneNumber).W)
+  // index of the request this read originates from
+  val requestIndex: UInt = UInt(log2Ceil(parameter.laneNumber).W)
+  // position of the data inside the datapath word; log2Ceil(datapathWidth / 8) bits (presumably a byte offset)
+  val dataOffset:   UInt = UInt(log2Ceil(parameter.datapathWidth / 8).W)
+}
+
+class MaskUnitReadQueue(parameter: T1Parameter) extends Bundle { // queued form of a mask unit read request
+  val vs:         UInt = UInt(5.W)
+  // read offset, taken from source2
+  val offset:     UInt = UInt(parameter.laneParam.vrfOffsetBits.W)
+  // index of the channel the result of this read request will be written to
+  val writeIndex: UInt = UInt(log2Ceil(parameter.laneNumber).W)
+  val dataOffset: UInt = UInt(log2Ceil(parameter.datapathWidth / 8).W) // same width as MaskUnitReadReq.dataOffset
+}
+
+class MaskUnitWaitReadQueue(parameter: T1Parameter) extends Bundle { // presumably group state kept while reads are pending - confirm
+  val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W)
+  val executeIndex: UInt = UInt(2.W)
+  val sourceValid:  UInt = UInt(parameter.laneNumber.W) // laneNumber bits (presumably one per lane)
+  val needRead:     UInt = UInt(parameter.laneNumber.W) // laneNumber bits (presumably one per lane)
+  val last:         Bool = Bool()
+}
+
+class MaskUnitWriteBundle(parameter: LaneParameter) extends Bundle { // used as MaskUnitExeResponse.writeData
+  val data:         UInt = UInt(parameter.datapathWidth.W)
+  val mask:         UInt = UInt((parameter.datapathWidth / 8).W) // one mask bit per 8 bits of data (presumably a byte mask)
+  val groupCounter: UInt = UInt(parameter.groupNumberBits.W)
+}
diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala
index bb5f829e0..7a22b5cbf 100644
--- a/t1/src/Lane.scala
+++ b/t1/src/Lane.scala
@@ -234,13 +234,14 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
   @public
   val csrInterface: CSRInterface = IO(Input(new CSRInterface(parameter.vlMaxBits)))
 
-  /** response to [[T1.lsu]] or mask unit in [[T1]] */
   @public
-  val laneResponse: ValidIO[LaneResponse] = IO(Valid(new LaneResponse(parameter)))
+  val maskUnitRequest: DecoupledIO[MaskUnitExeReq] = IO(Decoupled(new MaskUnitExeReq(parameter)))
 
-  /** feedback from [[T1]] to [[Lane]] for [[laneResponse]] */
   @public
-  val laneResponseFeedback: ValidIO[LaneResponseFeedback] = IO(Flipped(Valid(new LaneResponseFeedback(parameter))))
+  val maskRequestToLSU: Bool = IO(Output(Bool()))
+
+  @public
+  val maskUnitResponse: ValidIO[MaskUnitExeResponse] = IO(Flipped(Valid(new MaskUnitExeResponse(parameter))))
 
   /** for LSU and V accessing lane, this is not a part of ring, but a direct connection. */
   @public
@@ -570,14 +571,25 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
       slotCanShift(index) := true.B
     }
 
-    val laneState:     LaneState                     = Wire(new LaneState(parameter))
-    val stage0:        Instance[LaneStage0]          = Instantiate(new LaneStage0(parameter, isLastSlot))
-    val stage1:        Instance[LaneStage1]          = Instantiate(new LaneStage1(parameter, isLastSlot))
-    val stage2:        Instance[LaneStage2]          = Instantiate(new LaneStage2(parameter, isLastSlot))
-    val executionUnit: Instance[LaneExecutionBridge] = Instantiate(
+    val laneState:       LaneState                          = Wire(new LaneState(parameter))
+    val stage0:          Instance[LaneStage0]               = Instantiate(new LaneStage0(parameter, isLastSlot))
+    val stage1:          Instance[LaneStage1]               = Instantiate(new LaneStage1(parameter, isLastSlot))
+    val stage2:          Instance[LaneStage2]               = Instantiate(new LaneStage2(parameter, isLastSlot))
+    val executionUnit:   Instance[LaneExecutionBridge]      = Instantiate(
       new LaneExecutionBridge(parameter, isLastSlot, index)
     )
-    val stage3:        Instance[LaneStage3]          = Instantiate(new LaneStage3(parameter, isLastSlot))
+    val maskStage:       Option[Instance[MaskExchangeUnit]] =
+      Option.when(isLastSlot)(Instantiate(new MaskExchangeUnit(parameter))) // only the last slot gets one
+    val stage3:          Instance[LaneStage3]               = Instantiate(new LaneStage3(parameter, isLastSlot))
+    val stage3EnqWire:   DecoupledIO[LaneStage3Enqueue]     = Wire(Decoupled(new LaneStage3Enqueue(parameter, isLastSlot)))
+    val stage3EnqSelect: DecoupledIO[LaneStage3Enqueue]     = maskStage.map { mask =>
+      mask.enqueue <> stage3EnqWire // last slot: stage3EnqWire goes through the MaskExchangeUnit first
+      maskUnitRequest <> mask.maskReq // expose the exchange unit's request on the lane IO
+      maskRequestToLSU <> mask.maskRequestToLSU
+      mask.maskUnitResponse := maskUnitResponse // lane IO input drives the exchange unit's response port
+      mask.dequeue
+    }.getOrElse(stage3EnqWire) // other slots: stage3 is fed by stage3EnqWire directly
+    stage3.enqueue <> stage3EnqSelect
 
     // slot state
     laneState.vSew1H                   := vSew1H
@@ -759,50 +771,47 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
       0.U(parameter.chainingSize.W)
     )
     AssertProperty(BoolSequence(!executionUnit.dequeue.valid || stage2.dequeue.valid))
-    stage3.enqueue.valid        := executionUnit.dequeue.valid
-    executionUnit.dequeue.ready := stage3.enqueue.ready
+    stage3EnqWire.valid         := executionUnit.dequeue.valid
+    executionUnit.dequeue.ready := stage3EnqWire.ready
     stage2.dequeue.ready        := executionUnit.dequeue.fire
 
     if (!isLastSlot) {
-      stage3.enqueue.bits := DontCare
+      stage3EnqWire.bits := DontCare
     }
 
     // pipe state from stage0
-    stage3.enqueue.bits.decodeResult     := stage2.dequeue.bits.decodeResult
-    stage3.enqueue.bits.instructionIndex := stage2.dequeue.bits.instructionIndex
-    stage3.enqueue.bits.loadStore        := stage2.dequeue.bits.loadStore
-    stage3.enqueue.bits.vd               := stage2.dequeue.bits.vd
-    stage3.enqueue.bits.ffoByOtherLanes  := ffoRecord.ffoByOtherLanes
-    stage3.enqueue.bits.groupCounter     := stage2.dequeue.bits.groupCounter
-    stage3.enqueue.bits.mask             := stage2.dequeue.bits.mask
+    stage3EnqWire.bits.decodeResult     := stage2.dequeue.bits.decodeResult
+    stage3EnqWire.bits.instructionIndex := stage2.dequeue.bits.instructionIndex
+    stage3EnqWire.bits.loadStore        := stage2.dequeue.bits.loadStore
+    stage3EnqWire.bits.vd               := stage2.dequeue.bits.vd
+    stage3EnqWire.bits.ffoByOtherLanes  := ffoRecord.ffoByOtherLanes
+    stage3EnqWire.bits.groupCounter     := stage2.dequeue.bits.groupCounter
+    stage3EnqWire.bits.mask             := stage2.dequeue.bits.mask
     if (isLastSlot) {
-      stage3.enqueue.bits.sSendResponse := stage2.dequeue.bits.sSendResponse.get
-      stage3.enqueue.bits.ffoSuccess    := executionUnit.dequeue.bits.ffoSuccess.get
-      stage3.enqueue.bits.fpReduceValid.zip(executionUnit.dequeue.bits.fpReduceValid).foreach { case (sink, source) =>
+      stage3EnqWire.bits.sSendResponse := stage2.dequeue.bits.sSendResponse.get
+      stage3EnqWire.bits.ffoSuccess    := executionUnit.dequeue.bits.ffoSuccess.get
+      stage3EnqWire.bits.fpReduceValid.zip(executionUnit.dequeue.bits.fpReduceValid).foreach { case (sink, source) =>
         sink := source
       }
     }
-    stage3.enqueue.bits.data             := executionUnit.dequeue.bits.data
-    stage3.enqueue.bits.pipeData         := stage2.dequeue.bits.pipeData.getOrElse(DontCare)
-    stage3.enqueue.bits.ffoIndex         := executionUnit.dequeue.bits.ffoIndex
-    executionUnit.dequeue.bits.crossWriteData.foreach(data => stage3.enqueue.bits.crossWriteData := data)
-    stage2.dequeue.bits.sSendResponse.foreach(_ => stage3.enqueue.bits.sSendResponse := _)
-    executionUnit.dequeue.bits.ffoSuccess.foreach(_ => stage3.enqueue.bits.ffoSuccess := _)
+    stage3EnqWire.bits.data             := executionUnit.dequeue.bits.data
+    stage3EnqWire.bits.pipeData         := stage2.dequeue.bits.pipeData.getOrElse(DontCare)
+    stage3EnqWire.bits.ffoIndex         := executionUnit.dequeue.bits.ffoIndex
+    executionUnit.dequeue.bits.crossWriteData.foreach(data => stage3EnqWire.bits.crossWriteData := data)
+    stage2.dequeue.bits.sSendResponse.foreach(_ => stage3EnqWire.bits.sSendResponse := _)
+    executionUnit.dequeue.bits.ffoSuccess.foreach(_ => stage3EnqWire.bits.ffoSuccess := _)
 
     if (isLastSlot) {
-      when(laneResponseFeedback.valid) {
-        when(laneResponseFeedback.bits.complete) {
+      when(maskUnitResponse.valid) {
+        when(maskUnitResponse.bits.ffoByOther) {
           ffoRecord.ffoByOtherLanes := true.B
         }
       }
-      when(stage3.enqueue.fire) {
+      when(stage3EnqWire.fire) {
         executionUnit.dequeue.bits.ffoSuccess.foreach(ffoRecord.selfCompleted := _)
         // This group found means the next group ended early
         ffoRecord.ffoByOtherLanes := ffoRecord.ffoByOtherLanes || ffoRecord.selfCompleted
       }
-
-      laneResponse <> stage3.laneResponse.get
-      stage3.laneResponseFeedback.get <> laneResponseFeedback
     }
 
     // --- stage 3 end & stage 4 start ---
@@ -1176,10 +1185,10 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
     rpt.bits  := allVrfWriteAfterCheck(parameter.chainingSize + 1 + rptIndex).instructionIndex
   }
   // todo: add mask unit write token
-  tokenManager.responseReport.valid         := laneResponse.valid
-  tokenManager.responseReport.bits          := laneResponse.bits.instructionIndex
-  tokenManager.responseFeedbackReport.valid := laneResponseFeedback.valid
-  tokenManager.responseFeedbackReport.bits  := laneResponseFeedback.bits.instructionIndex
+  tokenManager.responseReport.valid         := maskUnitRequest.valid
+  tokenManager.responseReport.bits          := maskUnitRequest.bits.index
+  tokenManager.responseFeedbackReport.valid := maskUnitResponse.valid
+  tokenManager.responseFeedbackReport.bits  := maskUnitResponse.bits.index
   val instInSlot: UInt = slotControl
     .zip(slotOccupied)
     .map { case (slotState, occupied) =>
@@ -1212,6 +1221,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[
   tokenManager.topWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize)
   tokenManager.topWriteDeq.bits  := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex
 
+  tokenManager.maskUnitLastReport := lsuLastReport
+
   layer.block(layers.Verification) {
     val probeWire = Wire(new LaneProbe(parameter))
     define(laneProbe, ProbeValue(probeWire))
diff --git a/t1/src/T1.scala b/t1/src/T1.scala
index b87b3d043..bc118284b 100644
--- a/t1/src/T1.scala
+++ b/t1/src/T1.scala
@@ -392,8 +392,9 @@ class T1(val parameter: T1Parameter)
 
   /** the LSU Module */
 
-  val lsu:    Instance[LSU]           = Instantiate(new LSU(parameter.lsuParameters))
-  val decode: Instance[VectorDecoder] = Instantiate(new VectorDecoder(parameter.decoderParam))
+  val lsu:      Instance[LSU]           = Instantiate(new LSU(parameter.lsuParameters))
+  val decode:   Instance[VectorDecoder] = Instantiate(new VectorDecoder(parameter.decoderParam))
+  val maskUnit: Instance[MaskUnit]      = Instantiate(new MaskUnit(parameter))
   omInstance.decoderIn := Property(decode.om.asAnyClassType)
 
   val tokenManager: Instance[T1TokenManager] = Instantiate(new T1TokenManager(parameter))
@@ -519,17 +520,6 @@ class T1(val parameter: T1Parameter)
   /** which slot the instruction is entering */
   val instructionToSlotOH: UInt = Wire(UInt(parameter.chainingSize.W))
 
-  /** synchronize signal from each lane, for mask units.(ffo) */
-  val laneSynchronize: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool()))
-
-  /** all lanes are synchronized. */
-  val synchronized: Bool = WireDefault(false.B)
-
-  /** for mask unit that need to access VRF from lanes, use this signal to indicate it is finished access VRF(but
-    * instruction might not finish).
-    */
-  val maskUnitReadOnlyFinish: Bool = WireDefault(false.B)
-
   /** last slot is committing. */
   val lastSlotCommit: Bool = Wire(Bool())
 
@@ -540,11 +530,6 @@ class T1(val parameter: T1Parameter)
   val vxsatReportVec: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.chainingSize.W)))
   val vxsatReport = vxsatReportVec.reduce(_ | _)
 
-  // todo: 把lsu也放decode里去
-  val maskUnitType: Bool = decodeResult(Decoder.maskUnit) && requestRegDequeue.bits.instruction(6)
-  val maskDestination = decodeResult(Decoder.maskDestination)
-  val unOrderType: Bool = decodeResult(Decoder.unOrderWrite)
-
   /** Special instructions which will be allocate to the last slot.
     *   - mask unit
     *   - Lane <-> Top has data exchange(top might forward to LSU.) TODO: move to normal slots(add `offset` fields)
@@ -555,99 +540,44 @@ class T1(val parameter: T1Parameter)
   val dataInWritePipeVec: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.chainingSize.W)))
   val dataInWritePipe:    UInt      = dataInWritePipeVec.reduce(_ | _)
 
-  /** designed for unordered instruction(slide), it doesn't go to lane, it has RAW hazzard.
-    */
-  val instructionRAWReady: Bool = Wire(Bool())
-  val allSlotFree:         Bool = Wire(Bool())
-  val existMaskType:       Bool = Wire(Bool())
-
-  // mask Unit 与lane交换数据
-  val writeType:           VRFWriteRequest               = new VRFWriteRequest(
-    parameter.vrfParam.regNumBits,
-    parameter.vrfParam.vrfOffsetBits,
-    parameter.instructionIndexBits,
-    parameter.datapathWidth
-  )
-  val maskUnitWrite:       ValidIO[VRFWriteRequest]      = Wire(Valid(writeType))
-  val maskUnitWriteVec:    Vec[ValidIO[VRFWriteRequest]] = Wire(Vec(3, Valid(writeType)))
-  val maskWriteLaneSelect: Vec[UInt]                     = Wire(Vec(3, UInt(parameter.laneNumber.W)))
-  // 默认是head
-  val maskUnitWriteSelect: UInt                          = Mux1H(maskUnitWriteVec.map(_.valid), maskWriteLaneSelect)
-  maskUnitWriteVec.foreach(_ := DontCare)
-  maskUnitWrite := Mux1H(maskUnitWriteVec.map(_.valid), maskUnitWriteVec)
-  val writeSelectMaskUnit: Vec[Bool]                     = Wire(Vec(parameter.laneNumber, Bool()))
-  val maskUnitWriteReady:  Bool                          = writeSelectMaskUnit.asUInt.orR
+  // todo: instructionRAWReady -> v0 write token
+  val allSlotFree:   Bool = Wire(Bool())
+  val existMaskType: Bool = Wire(Bool())
 
   // read
-  val readType:           VRFReadRequest               = new VRFReadRequest(
+  val readType: VRFReadRequest = new VRFReadRequest(
     parameter.vrfParam.regNumBits,
     parameter.vrfParam.vrfOffsetBits,
     parameter.instructionIndexBits
   )
-  val maskUnitRead:       ValidIO[VRFReadRequest]      = Wire(Valid(readType))
-  val maskUnitReadVec:    Vec[ValidIO[VRFReadRequest]] = Wire(Vec(3, Valid(readType)))
-  val maskReadLaneSelect: Vec[UInt]                    = Wire(Vec(3, UInt(parameter.laneNumber.W)))
-  val maskUnitReadSelect: UInt                         = Mux1H(maskUnitReadVec.map(_.valid), maskReadLaneSelect)
-  maskUnitRead := Mux1H(maskUnitReadVec.map(_.valid), maskUnitReadVec)
-  val readSelectMaskUnit: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool()))
-  val maskUnitReadReady = readSelectMaskUnit.asUInt.orR
-  val laneReadResult:   Vec[UInt]     = Wire(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W)))
-  val WARRedResult:     ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W))))
-  // mask unit 最后的写
-  val maskUnitFlushVrf: Bool          = WireDefault(false.B)
 
+  // todo: ix type gather read
   // gather read state
-  val gatherOverlap:        Bool = Wire(Bool())
-  val gatherNeedRead:       Bool = requestRegDequeue.valid && decodeResult(Decoder.gather) &&
+  val gatherOverlap:        Bool                        = Wire(Bool())
+  val gatherNeedRead:       Bool                        = requestRegDequeue.valid && decodeResult(Decoder.gather) &&
     !decodeResult(Decoder.vtype) && !gatherOverlap
+  val gatherData:           UInt                        = RegInit(0.U(parameter.datapathWidth.W))
+  val gatherReadRequest:    DecoupledIO[VRFReadRequest] = Wire(Decoupled(readType))
+  val gatherReadLaneSelect: UInt                        = Wire(UInt(parameter.laneNumber.W))
+  val gatherReadResultFire = Pipe(gatherReadRequest.fire, gatherReadLaneSelect, parameter.vrfReadLatency).valid
   val gatherReadFinish:     Bool =
     RegEnable(
       !requestRegDequeue.fire,
       false.B,
-      (RegNext(RegNext(maskUnitReadReady)) && gatherNeedRead) || requestRegDequeue.fire
+      (gatherReadResultFire && gatherNeedRead) || requestRegDequeue.fire
     )
   val gatherReadDataOffset: UInt = Wire(UInt(5.W))
-  val gatherData:           UInt = Mux(gatherOverlap, 0.U, (WARRedResult.bits >> gatherReadDataOffset).asUInt)
 
-  /** data that need to be compute at top. */
-  val data:                Vec[ValidIO[UInt]] = RegInit(
-    VecInit(Seq.fill(parameter.laneNumber)(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W)))))
-  )
-  val flotReduceValid:     Seq[Option[Bool]]  = Seq.tabulate(parameter.laneNumber) { _ =>
-    Option.when(parameter.fpuEnable)(RegInit(false.B))
-  }
-  val maskDataForCompress: UInt               = RegInit(0.U(parameter.datapathWidth.W))
-  // clear the previous set of data from lane
-  val dataClear:           Bool               = WireDefault(false.B)
-  val completedVec:        Vec[Bool]          = RegInit(VecInit(Seq.fill(parameter.laneNumber)(false.B)))
-  // ffoIndexReg.valid: Already found the first one
-  val ffoIndexReg:         ValidIO[UInt]      = RegInit(0.U.asTypeOf(Valid(UInt(parameter.xLen.W))))
-  val ffoType:             Bool               = Wire(Bool())
+  // todo: gather read is not wired up yet; the signals below are placeholders tied to DontCare
+  gatherReadRequest.valid := DontCare
+  gatherReadRequest.bits  := DontCare
+  gatherReadRequest.ready := DontCare
+  gatherOverlap           := DontCare
+  gatherReadLaneSelect    := DontCare
+  gatherReadDataOffset    := DontCare
 
   /** for find first one, need to tell the lane with higher index `1` . */
-  val completedLeftOr: UInt          = (scanLeftOr(completedVec.asUInt) << 1).asUInt(parameter.laneNumber - 1, 0)
-  // 按指定的sew拼成 {laneNumer * dataPathWidth} bit, 然后根据sew选择出来
-  val sortedData:      UInt          = Mux1H(
-    vSewOHForMask,
-    Seq(4, 2, 1).map { groupSize =>
-      VecInit(data.map { element =>
-        element.bits.asBools  // [x] * 32 eg: sew = 1
-          .grouped(groupSize) // [x, x] * 16
-          .toSeq
-          .map(VecInit(_).asUInt) // [xx] * 16
-      }.transpose.map(VecInit(_).asUInt)).asUInt // [x*16] * 16 -> x * 256
-    }
-  )
-  // 把已经排过序的数据重新分给各个lane
-  val regroupData:     Vec[UInt]     = VecInit(Seq.tabulate(parameter.laneNumber) { laneIndex =>
-    sortedData(
-      laneIndex * parameter.datapathWidth + parameter.datapathWidth - 1,
-      laneIndex * parameter.datapathWidth
-    )
-  })
-  val dataResult:      ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W))))
-
-  val executeForLastLaneFire: Bool = WireDefault(false.B)
+  val dataResult: UInt = RegInit(0.U.asTypeOf(UInt(parameter.datapathWidth.W)))
 
   /** state machine register for each instruction. */
   val slots: Seq[InstructionControl] = Seq.tabulate(parameter.chainingSize) { index =>
@@ -658,8 +588,6 @@ class T1(val parameter: T1Parameter)
         .asTypeOf(new InstructionControl(parameter.instructionIndexBits, parameter.laneNumber))
     )
 
-    val mvToVRF: Option[Bool] = Option.when(index == parameter.chainingSize - 1)(RegInit(false.B))
-
     /** the execution is finished. (but there might still exist some data in the ring.)
       */
     val laneAndLSUFinish: Bool = control.endTag.asUInt.andR
@@ -717,735 +645,22 @@ class T1(val parameter: T1Parameter)
           control.vxsat := true.B
         }
       }
-    // logic like mask&reduce will be put to last slot
-    // TODO: review later
     if (index == (parameter.chainingSize - 1)) {
-      val feedBack:         UInt = RegInit(0.U(parameter.laneNumber.W))
-      val executeCounter:   UInt = RegInit(0.U((log2Ceil(parameter.laneNumber) + 1).W))
-      // mask destination时这两count都是以写vrf为视角
-      val writeBackCounter: UInt = RegInit(0.U(log2Ceil(parameter.laneNumber).W))
-      val groupCounter:     UInt = RegInit(0.U(parameter.groupNumberMaxBits.W))
-      val iotaCount:        UInt = RegInit(0.U((parameter.laneParam.vlMaxBits - 1).W))
-      val maskTypeInstruction    = RegInit(false.B)
-      val vd                     = RegInit(0.U(5.W))
-      val vs1                    = RegInit(0.U(5.W))
-      val vs2                    = RegInit(0.U(5.W))
-      val rs1                    = RegInit(0.U(parameter.xLen.W))
-      val vm                     = RegInit(false.B)
-      val executeFinishReg       = RegInit(true.B)
-      val unOrderTypeInstruction = RegInit(false.B)
-      val decodeResultReg        = RegInit(0.U.asTypeOf(decodeResult))
-      val gather: Bool = decodeResultReg(Decoder.gather)
-      // for slid
-      val elementIndexCount  = RegInit(0.U(parameter.laneParam.vlMaxBits.W))
-      val compressWriteCount = RegInit(0.U(parameter.laneParam.vlMaxBits.W))
-      val nextElementIndex: UInt = elementIndexCount + 1.U
-      val firstElement = elementIndexCount === 0.U
-      val lastElement: Bool = nextElementIndex === csrRegForMaskUnit.vl
-      val updateMaskIndex = WireDefault(false.B)
-      when(updateMaskIndex) { elementIndexCount := nextElementIndex }
-      // 特殊的指令,会阻止 wLast 后把 sExecute 拉回来, 因为需要等待读后才写
-      val mixedUnit: Bool = Wire(Bool())
-      // slid & gather & extend
-      val slidUnitIdle:            Bool         = RegInit(true.B)
-      // compress & iota
-      val iotaUnitIdle:            Bool         = RegInit(true.B)
-      val orderedReduceGroupCount: Option[UInt] = Option.when(parameter.fpuEnable)(
-        RegInit(0.U(log2Ceil(parameter.vLen / parameter.laneNumber).W))
-      )
-      val orderedReduceIdle:       Option[Bool] = Option.when(parameter.fpuEnable)(RegInit(true.B))
-      val maskUnitIdle = (Seq(slidUnitIdle, iotaUnitIdle) ++ orderedReduceIdle).reduce(_ && _)
-      val reduce       = decodeResultReg(Decoder.red)
-      val orderedReduce: Bool = if (parameter.fpuEnable) decodeResultReg(Decoder.orderReduce) else false.B
-      val popCount  = decodeResultReg(Decoder.popCount)
-      val extend    = decodeResultReg(Decoder.extend)
-      // first type instruction
-      val firstLane = ffo(completedVec.asUInt)
-      val firstLaneIndex: UInt = OHToUInt(firstLane)(log2Ceil(parameter.laneNumber) - 1, 0)
-      io.retire.rd.valid          := lastSlotCommit && decodeResultReg(Decoder.targetRd)
+      val writeRD = RegInit(false.B)
+      val float: Option[Bool] = Option.when(parameter.fpuEnable)(RegInit(false.B))
+      val vd = RegInit(0.U(5.W))
+      when(instructionToSlotOH(index)) {
+        writeRD := decodeResult(Decoder.targetRd)
+        float.foreach(_ := decodeResult(Decoder.float))
+        vd      := requestRegDequeue.bits.instruction(11, 7)
+      }
+      io.retire.rd.valid := lastSlotCommit && writeRD
       io.retire.rd.bits.rdAddress := vd
       if (parameter.fpuEnable) {
-        io.retire.rd.bits.isFp := decodeResultReg(Decoder.float)
+        io.retire.rd.bits.isFp := float.getOrElse(false.B)
       } else {
         io.retire.rd.bits.isFp := false.B
       }
-      when(requestRegDequeue.fire) {
-        ffoIndexReg.valid := false.B
-        ffoIndexReg.bits  := -1.S(parameter.xLen.W).asUInt
-      }.elsewhen(synchronized && completedVec.asUInt.orR && !ffoIndexReg.valid) {
-        ffoIndexReg.valid := true.B
-        ffoIndexReg.bits  := Mux1H(
-          firstLane,
-          // 3: firstLaneIndex.width
-          data.map(i => i.bits(parameter.xLen - 1 - 3, 5) ## firstLaneIndex ## i.bits(4, 0))
-        )
-      }
-      ffoType                     := decodeResultReg(Decoder.ffo)
-
-      /** vlmax = vLen * (2**lmul) / (2 ** sew * 8) \= (vLen / 8) * 2 ** (lmul - sew) \= vlb * 2 ** (lmul - sew) lmul <-
-        * (-3, -2, -1, 0 ,1, 2, 3) sew <- (0, 1, 2) lmul - sew <- [-5, 3] 选择信号 +5 -> lmul - sew + 5 <- [0, 8]
-        */
-      def largeThanVLMax(source: UInt, advance: Bool = false.B, lmul: UInt, sew: UInt): Bool = {
-        val vlenLog2 = log2Ceil(parameter.vLen) // 10
-        val cut      =
-          if (source.getWidth >= vlenLog2) source(vlenLog2 - 1, vlenLog2 - 9)
-          else (0.U(vlenLog2.W) ## source)(vlenLog2 - 1, vlenLog2 - 9)
-        // 9: lmul - sew 的可能值的个数
-        val largeList: Vec[Bool] = Wire(Vec(9, Bool()))
-        cut.asBools.reverse.zipWithIndex.foldLeft(advance) { case (a, (b, i)) =>
-          largeList(i) := a
-          a || b
-        }
-        val extendVlmul = lmul(2) ## lmul
-        val selectWire = UIntToOH(5.U(4.W) + extendVlmul - sew)(8, 0).asBools.reverse
-        Mux1H(selectWire, largeList)
-      }
-      // 算req上面的分开吧
-      val gatherWire =
-        Mux(decodeResult(Decoder.itype), requestRegDequeue.bits.instruction(19, 15), requestRegDequeue.bits.rs1Data)
-      val gatherAdvance = (gatherWire >> log2Ceil(parameter.vLen)).asUInt.orR
-      gatherOverlap := largeThanVLMax(
-        gatherWire,
-        gatherAdvance,
-        T1Issue.vlmul(requestReg.bits.issue),
-        T1Issue.vsew(requestReg.bits.issue)
-      )
-      val slotValid       = !control.state.idle
-      val storeAfterSlide = isStoreType && (requestRegDequeue.bits.instruction(11, 7) === vd)
-      instructionRAWReady := !((unOrderTypeInstruction && slotValid &&
-        // slid 类的会比执行得慢的指令快(div),会修改前面的指令的source
-        ((vd === requestRegDequeue.bits.instruction(24, 20)) ||
-          (vd === requestRegDequeue.bits.instruction(19, 15)) ||
-          storeAfterSlide ||
-          // slid 类的会比执行得快的指令慢(mv),会被后来的指令修改 source2
-          (vs2 === requestRegDequeue.bits.instruction(11, 7))) ||
-        (unOrderType && !allSlotFree) ||
-        (requestReg.bits.vdIsV0 && existMaskType)) ||
-        (vd === 0.U && maskType && slotValid))
-      when(instructionToSlotOH(index)) {
-        writeBackCounter                 := 0.U
-        groupCounter                     := 0.U
-        executeCounter                   := 0.U
-        elementIndexCount                := 0.U
-        compressWriteCount               := 0.U
-        iotaCount                        := 0.U
-        slidUnitIdle                     := !((decodeResult(Decoder.slid) || (decodeResult(Decoder.gather) && decodeResult(Decoder.vtype))
-          || decodeResult(Decoder.extend)) && instructionValid)
-        iotaUnitIdle                     := !((decodeResult(Decoder.compress) || decodeResult(Decoder.iota)) && instructionValid)
-        orderedReduceIdle.foreach(_ := !(decodeResult(Decoder.orderReduce) && instructionValid))
-        orderedReduceGroupCount.foreach(_ := 0.U)
-        vd                               := requestRegDequeue.bits.instruction(11, 7)
-        vs1                              := requestRegDequeue.bits.instruction(19, 15)
-        vs2                              := requestRegDequeue.bits.instruction(24, 20)
-        vm                               := requestRegDequeue.bits.instruction(25)
-        executeFinishReg                 := false.B
-        rs1                              := requestRegDequeue.bits.rs1Data
-        decodeResultReg                  := decodeResult
-        csrRegForMaskUnit                := requestRegCSR
-        // todo: decode need execute
-        control.state.sMaskUnitExecution := !maskUnitType
-        maskTypeInstruction              := maskType && !decodeResult(Decoder.maskSource)
-        completedVec.foreach(_ := false.B)
-        WARRedResult.valid               := false.B
-        unOrderTypeInstruction           := unOrderType
-        dataResult                       := 0.U.asTypeOf(dataResult)
-      }.elsewhen(control.state.wLast && maskUnitIdle) {
-        // 如果真需要执行的lane会wScheduler,不会提前发出last确认
-        when(!mixedUnit) {
-          control.state.sMaskUnitExecution := true.B
-        }
-        maskUnitFlushVrf := !control.state.idle
-      }
-      when(laneSynchronize.asUInt.orR) {
-        feedBack := feedBack | laneSynchronize.asUInt
-      }.elsewhen(lastSlotCommit) {
-        feedBack := 0.U
-      }
-      // 执行
-      // mask destination write
-      /** 对于mask destination 类型的指令需要特别注意两种不对齐 第一种是我们以 32(dataPatWidth) * 8(laneNumber) 为一个组, 但是我们vl可能不对齐一整个组 第二种是
-        * 32(dataPatWidth) 的时候对不齐 vl假设最大1024,相应的会有11位的vl xxx xxx xxxxx
-        */
-      val dataPathMisaligned = csrRegForMaskUnit.vl(parameter.dataPathWidthBits - 1, 0).orR
-      val groupMisaligned =
-        if (parameter.laneNumber > 1)
-          csrRegForMaskUnit
-            .vl(parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - 1, parameter.dataPathWidthBits)
-            .orR
-        else false.B
-
-      /** 我们需要计算最后一次写的 [[writeBackCounter]] & [[groupCounter]] lastGroupCounter = vl(10, 8) - !([[dataPathMisaligned]]
-        * \|| [[groupMisaligned]]) lastExecuteCounter = vl(7, 5) - ![[dataPathMisaligned]]
-        */
-      val lastGroupCounter:   UInt =
-        csrRegForMaskUnit.vl(
-          parameter.laneParam.vlMaxBits - 1,
-          parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber)
-        ) - !(dataPathMisaligned || groupMisaligned)
-      val lastExecuteCounter: UInt = if (parameter.laneNumber > 1) {
-        csrRegForMaskUnit.vl(
-          parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - 1,
-          parameter.dataPathWidthBits
-        ) - !dataPathMisaligned
-      } else 0.U
-      val lastGroup           = groupCounter === lastGroupCounter
-      val lastExecute         = lastGroup && writeBackCounter === lastExecuteCounter
-      val lastExecuteForGroup = writeBackCounter.andR
-      // 计算正写的这个lane是不是在边界上
-      val endOH               = UIntToOH(csrRegForMaskUnit.vl(parameter.dataPathWidthBits - 1, 0))
-      val border              = lastExecute && dataPathMisaligned &&
-        !(decodeResultReg(Decoder.compress) || decodeResultReg(Decoder.gather))
-      val lastGroupMask       = scanRightOr(endOH(parameter.datapathWidth - 1, 1))
-      val mvType              = decodeResultReg(Decoder.mv)
-      val readMv              = mvType && decodeResultReg(Decoder.targetRd)
-      val writeMv             = mvType && !decodeResultReg(Decoder.targetRd) &&
-        csrRegForMaskUnit.vl > csrRegForMaskUnit.vStart
-      mvToVRF.foreach(d => when(requestRegDequeue.fire) { d := writeMv })
-      // 读后写中的读
-      val needWAR             = (maskTypeInstruction || border || reduce || readMv) && !popCount
-      val skipLaneData: Bool = decodeResultReg(Decoder.mv)
-      mixedUnit                            := writeMv || readMv
-      maskReadLaneSelect.head              := UIntToOH(writeBackCounter)
-      maskReadLaneSelect.head              := UIntToOH(writeBackCounter)
-      maskWriteLaneSelect.head             := maskReadLaneSelect.head
-      maskUnitReadVec.head.valid           := false.B
-      maskUnitReadVec.head.bits.vs         := Mux(readMv, vs2, Mux(reduce, vs1, vd))
-      maskUnitReadVec.head.bits.readSource := Mux(readMv, 1.U, Mux(reduce, 0.U, 2.U))
-      maskUnitReadVec.head.bits.offset     := groupCounter
-      maskUnitRead.bits.instructionIndex   := control.record.instructionIndex
-      val readResultSelectResult = Mux1H(
-        Pipe(true.B, maskUnitReadSelect, parameter.vrfReadLatency).bits,
-        laneReadResult
-      )
-      // 把mask选出来
-      val maskSelect             = v0(groupCounter ## writeBackCounter)
-      val fullMask: UInt = (-1.S(parameter.datapathWidth.W)).asUInt
-
-      /** 正常全1 mask:[[maskSelect]] border: [[lastGroupMask]] mask && border: [[maskSelect]] & [[lastGroupMask]]
-        */
-      val maskCorrect: UInt = Mux(maskTypeInstruction, maskSelect, fullMask) &
-        Mux(border, lastGroupMask, fullMask)
-      // mask
-      val sew1HCorrect = Mux(decodeResultReg(Decoder.widenReduce), vSewOHForMask ## false.B, vSewOHForMask)
-      // 写的data
-      val writeData    = (WARRedResult.bits & (~maskCorrect).asUInt) | (regroupData(writeBackCounter) & maskCorrect)
-      val writeMask    = Mux(sew1HCorrect(2) || !reduce, 15.U, Mux(sew1HCorrect(1), 3.U, 1.U))
-      maskUnitWriteVec.head.valid                 := false.B
-      maskUnitWriteVec.head.bits.vd               := vd
-      maskUnitWriteVec.head.bits.offset           := groupCounter
-      maskUnitWriteVec.head.bits.data             := Mux(writeMv, rs1, Mux(reduce, dataResult.bits, writeData))
-      maskUnitWriteVec.head.bits.last             := control.state.wLast || reduce
-      maskUnitWriteVec.head.bits.instructionIndex := control.record.instructionIndex
-
-      val waitReadResult: Bool = Wire(Bool())
-      val maskUnitReadVrf = maskUnitReadReady && maskUnitReadVec.map(_.valid).reduce(_ || _) && !waitReadResult
-      val readNext        = RegNext(maskUnitReadVrf)
-      waitReadResult := RegNext(readNext) || readNext
-      when(Pipe(maskUnitReadVrf, false.B, parameter.vrfReadLatency).valid) {
-        WARRedResult.bits  := readResultSelectResult
-        WARRedResult.valid := true.B
-      }
-      // alu start
-      val aluInput1 = Mux(
-        (Seq(executeCounter === 0.U) ++ orderedReduceGroupCount.map(_ === 0.U)).reduce(_ && _),
-        Mux(
-          needWAR,
-          WARRedResult.bits & FillInterleaved(8, writeMask),
-          0.U
-        ),
-        dataResult.bits
-      )
-      val aluInput2 = Mux1H(UIntToOH(executeCounter), data.map(d => Mux(d.valid, d.bits, 0.U)))
-      val skipFlotReduce: Bool                           = !Mux1H(UIntToOH(executeCounter), flotReduceValid.map(_.getOrElse(false.B)))
-      // red alu instance
-      val adder:          Instance[ReduceAdder]          = Instantiate(new ReduceAdder(parameter.datapathWidth))
-      val logicUnit:      Instance[LaneLogic]            = Instantiate(new LaneLogic(parameter.datapathWidth))
-      // option unit for flot reduce
-      val floatAdder:     Option[Instance[FloatAdder]]   =
-        Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(8, 24)))
-      val flotCompare:    Option[Instance[FloatCompare]] =
-        Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(8, 24)))
-
-      val sign = !decodeResultReg(Decoder.unsigned1)
-      adder.request.src    := VecInit(
-        Seq(
-          (aluInput1(parameter.datapathWidth - 1) && sign) ## aluInput1,
-          (aluInput2(parameter.datapathWidth - 1) && sign) ## aluInput2
-        )
-      )
-      // popCount 在top视为reduce add
-      adder.request.opcode := Mux(popCount, 0.U, decodeResultReg(Decoder.uop))
-      adder.request.sign   := sign
-      adder.request.vSew   := Mux(popCount, 2.U, OHToUInt(sew1HCorrect))
-
-      floatAdder.foreach { fAdder =>
-        fAdder.io.a            := aluInput1
-        fAdder.io.b            := aluInput2
-        fAdder.io.roundingMode := csrRegForMaskUnit.vxrm
-      }
-
-      flotCompare.foreach { fCompare =>
-        fCompare.io.a     := aluInput1
-        fCompare.io.b     := aluInput2
-        // max -> 12, min -> 8
-        fCompare.io.isMax := decodeResultReg(Decoder.uop)(2)
-      }
-
-      logicUnit.req.src    := VecInit(Seq(aluInput1, aluInput2))
-      logicUnit.req.opcode := decodeResultReg(Decoder.uop)
-
-      // reduce resultSelect
-      val intReduceResult = Mux(
-        decodeResultReg(Decoder.adder) || popCount,
-        adder.response.data,
-        logicUnit.resp
-      )
-      val flotReduceResult: Option[UInt] = Option.when(parameter.fpuEnable)(
-        Mux(
-          skipFlotReduce,
-          aluInput1,
-          Mux(decodeResultReg(Decoder.fpExecutionType) === 0.U, floatAdder.get.io.out, flotCompare.get.io.out)
-        )
-      )
-      val aluOutPut = Mux1H(
-        Seq(if (parameter.fpuEnable) reduce && !decodeResultReg(Decoder.float) else reduce) ++
-          Option.when(parameter.fpuEnable)(reduce && decodeResultReg(Decoder.float)),
-        Seq(intReduceResult) ++ flotReduceResult
-      )
-      // slid & gather unit
-      val slideUp   = decodeResultReg(Decoder.topUop)(1)
-      val slide1    = decodeResultReg(Decoder.topUop)(0) && decodeResultReg(Decoder.slid)
-
-      /** special uop 里面编码了extend的信息: specialUop(1,0): 倍率 specialUop(2):是否是符号
-        */
-      val extendSourceSew: Bool = (csrRegForMaskUnit.vSew >> decodeResultReg(Decoder.topUop)(1, 0))(0)
-      val extendSign:      Bool = decodeResultReg(Decoder.topUop)(2)
-      // gather 相关的控制
-      val gather16:        Bool = decodeResultReg(Decoder.gather16)
-      val maskUnitEEW = Mux(gather16, 1.U, Mux(extend, extendSourceSew, csrRegForMaskUnit.vSew))
-      val maskUnitEEW1H: UInt = UIntToOH(maskUnitEEW)
-      val maskUnitByteEnable = maskUnitEEW1H(2) ## maskUnitEEW1H(2) ## maskUnitEEW1H(2, 1).orR ## true.B
-      val maskUnitBitEnable  = FillInterleaved(8, maskUnitByteEnable)
-      maskUnitWriteVec.head.bits.mask := Mux(writeMv, maskUnitByteEnable, writeMask)
-      // log2(dataWidth * laneNumber / 8)
-      val maskUnitDataOffset =
-        (elementIndexCount << maskUnitEEW).asUInt(
-          log2Ceil(parameter.datapathWidth * parameter.laneNumber / 8) - 1,
-          0
-        ) ## 0.U(3.W)
-      val maskUnitData       = ((VecInit(data.map(_.bits)).asUInt >> maskUnitDataOffset).asUInt & maskUnitBitEnable)(
-        parameter.datapathWidth - 1,
-        0
-      )
-
-      val compareWire = Mux(decodeResultReg(Decoder.slid), rs1, maskUnitData)
-      val compareAdvance: Bool = (compareWire >> log2Ceil(parameter.vLen)).asUInt.orR
-      val compareResult:  Bool =
-        largeThanVLMax(compareWire, compareAdvance, csrRegForMaskUnit.vlmul, csrRegForMaskUnit.vSew)
-      // 正在被gather使用的数据在data的那个组里
-      val gatherDataSelect           =
-        UIntToOH((false.B ## maskUnitDataOffset)(5 + (log2Ceil(parameter.laneNumber).max(1)) - 1, 5))
-      val dataTail                   = Mux1H(UIntToOH(maskUnitEEW)(1, 0), Seq(3.U(2.W), 2.U(2.W)))
-      val lastElementForData         = gatherDataSelect.asBools.last && maskUnitDataOffset(4, 3) === dataTail
-      val lastElementForCompressMask = elementIndexCount(log2Ceil(parameter.datapathWidth) - 1, 0).andR
-      val maskUnitDataReady: Bool = (gatherDataSelect & VecInit(data.map(_.valid)).asUInt).orR
-      // 正在被gather使用的数据是否就绪了
-      val isSlide = !(gather || extend)
-      val slidUnitDataReady: Bool = maskUnitDataReady || isSlide
-      val compressDataReady              = maskUnitDataReady || !(decodeResultReg(Decoder.compress) || decodeResultReg(Decoder.iota))
-      // slid 先用状态机
-      val idle :: sRead :: sWrite :: Nil = Enum(3)
-      val slideState                     = RegInit(idle)
-      val readState                      = slideState === sRead
-
-      // slid 的立即数是0扩展的
-      val slidSize           = Mux(slide1, 1.U, Mux(decodeResultReg(Decoder.itype), vs1, rs1))
-      // todo: 这里是否有更好的处理方式
-      val slidSizeLSB        = slidSize(parameter.laneParam.vlMaxBits - 1, 0)
-      // down +
-      // up -
-      val directionSelection = Mux(slideUp, (~slidSizeLSB).asUInt, slidSizeLSB)
-      val slideReadIndex     = elementIndexCount + directionSelection + slideUp
-      val readIndex: UInt = Mux(
-        !maskUnitIdle,
-        Mux(
-          decodeResultReg(Decoder.slid),
-          slideReadIndex,
-          maskUnitData
-        ),
-        gatherWire
-      )
-
-      def indexAnalysis(elementIndex: UInt, csrInput: CSRInterface = csrRegForMaskUnit) = {
-        val sewInput   = csrInput.vSew
-        val sewOHInput = UIntToOH(csrInput.vSew)(2, 0)
-        val intLMULInput: UInt = (1.U << csrInput.vlmul(1, 0)).asUInt
-        val dataPosition = (elementIndex(parameter.laneParam.vlMaxBits - 2, 0) << sewInput)
-          .asUInt(parameter.laneParam.vlMaxBits - 2, 0)
-        val accessMask   = Mux1H(
-          sewOHInput(2, 0),
-          Seq(
-            UIntToOH(dataPosition(1, 0)),
-            FillInterleaved(2, UIntToOH(dataPosition(1))),
-            15.U(4.W)
-          )
-        )
-        // 数据起始位置在32bit(暂时只32)中的偏移,由于数据会有跨lane的情况,融合的优化时再做
-        val dataOffset   = (dataPosition(1) && sewOHInput(1, 0).orR) ## (dataPosition(0) && sewOHInput(0)) ## 0.U(3.W)
-        val accessLane   = if (parameter.laneNumber > 1) dataPosition(log2Ceil(parameter.laneNumber) + 1, 2) else 0.U(1.W)
-        // 32 bit / group
-        val dataGroup    = (dataPosition >> (log2Ceil(parameter.laneNumber) + 2)).asUInt
-        val offsetWidth: Int = parameter.laneParam.vrfParam.vrfOffsetBits
-        val offset            = dataGroup(offsetWidth - 1, 0)
-        val accessRegGrowth   = (dataGroup >> offsetWidth).asUInt
-        val decimalProportion = offset ## accessLane
-        // 1/8 register
-        val decimal           = decimalProportion(decimalProportion.getWidth - 1, 0.max(decimalProportion.getWidth - 3))
-
-        /** elementIndex 需要与vlMax比较, vLen * lmul /sew 这个计算太复杂了 我们可以换一个角度,计算读寄存器的增量与lmul比较,就能知道下标是否超vlMax了 vlmul
-          * 需要区分整数与浮点
-          */
-        val overlap      =
-          (csrInput.vlmul(2) && decimal >= intLMULInput(3, 1)) ||
-            (!csrInput.vlmul(2) && accessRegGrowth >= intLMULInput)
-        accessRegGrowth >= csrInput.vlmul
-        val reallyGrowth = accessRegGrowth(2, 0)
-        (accessMask, dataOffset, accessLane, offset, reallyGrowth, overlap)
-      }
-      val srcOverlap: Bool = !decodeResultReg(Decoder.itype) && (rs1 >= csrRegForMaskUnit.vl)
-      // rs1 >= vlMax
-      val srcOversize                                                                   = !decodeResultReg(Decoder.itype) && !slide1 && compareResult
-      val signBit                                                                       = Mux1H(
-        vSewOHForMask,
-        readIndex(parameter.laneParam.vlMaxBits - 1, parameter.laneParam.vlMaxBits - 3).asBools.reverse
-      )
-      // 对于up来说小于offset的element是不变得的
-      val slideUpUnderflow                                                              = slideUp && !slide1 && (signBit || srcOverlap)
-      val elementActive: Bool = v0.asUInt(elementIndexCount) || vm
-      val slidActive = elementActive && (!slideUpUnderflow || !decodeResultReg(Decoder.slid))
-      // index >= vlMax 是写0
-      val overlapVlMax: Bool = !slideUp && (signBit || srcOversize)
-      // select csr
-      val csrSelect                                                          = Mux(control.state.idle, requestRegCSR, csrRegForMaskUnit)
-      // slid read
-      val (_, readDataOffset, readLane, readOffset, readGrowth, lmulOverlap) = indexAnalysis(readIndex, csrSelect)
-      gatherReadDataOffset := readDataOffset
-      val readOverlap           = lmulOverlap || overlapVlMax
-      val skipRead              = readOverlap || (gather && compareResult) || extend
-      val maskUnitWriteVecFire1 = maskUnitReadVec(1).valid && maskUnitReadReady
-      val readFireNext1:      Bool = RegNext(maskUnitWriteVecFire1)
-      val readFireNextNext1:  Bool = RegNext(readFireNext1)
-      val port1WaitForResult: Bool = readFireNext1 || readFireNextNext1
-      val gatherTryToRead =
-        gatherNeedRead && !VecInit(lsu.vrfReadDataPorts.map(_.valid)).asUInt.orR && !gatherReadFinish
-      maskUnitReadVec(1).valid           := (readState || gatherTryToRead) && !port1WaitForResult
-      maskUnitReadVec(1).bits.vs         := Mux(readState, vs2, requestRegDequeue.bits.instruction(24, 20)) + readGrowth
-      maskUnitReadVec(1).bits.readSource := 1.U
-      maskUnitReadVec(1).bits.offset     := readOffset
-      maskReadLaneSelect(1)              := UIntToOH(readLane)
-      // slid write, vlXXX: 用element index 算出来的
-      val (vlMask, vlDataOffset, vlLane, vlOffset, vlGrowth, _) = indexAnalysis(elementIndexCount)
-      val writeState                                            = slideState === sWrite
-      // 处理数据,先硬移位吧
-      val slidReadData: UInt = ((WARRedResult.bits >> readDataOffset) << vlDataOffset)
-        .asUInt(parameter.datapathWidth - 1, 0)
-      val selectRS1 = slide1 && ((slideUp && firstElement) || (!slideUp && lastElement))
-      // extend 类型的扩展和移位
-      val extendData: UInt = (Mux(
-        extendSourceSew,
-        Fill(parameter.datapathWidth - 16, extendSign && maskUnitData(15)) ## maskUnitData(15, 0),
-        Fill(parameter.datapathWidth - 8, extendSign && maskUnitData(7)) ## maskUnitData(7, 0)
-      ) << vlDataOffset).asUInt(parameter.xLen - 1, 0)
-
-      /** vd 的值有4种:
-        *   1. 用readIndex读出来的vs2的值
-        *   1. 0
-        *   1. slide1 时插进来的rs1
-        *   1. extend 的值
-        */
-      val slidWriteData = Mux1H(
-        Seq((!(readOverlap || selectRS1 || extend)) || (gather && !compareResult), selectRS1, extend),
-        Seq(slidReadData, (rs1 << vlDataOffset).asUInt(parameter.xLen - 1, 0), extendData)
-      )
-      maskUnitWriteVec(1).valid                 := writeState && slidActive
-      maskUnitWriteVec(1).bits.vd               := vd + vlGrowth
-      maskUnitWriteVec(1).bits.offset           := vlOffset
-      maskUnitWriteVec(1).bits.mask             := vlMask
-      maskUnitWriteVec(1).bits.data             := slidWriteData
-      maskUnitWriteVec(1).bits.last             := lastElement
-      maskUnitWriteVec(1).bits.instructionIndex := control.record.instructionIndex
-      maskWriteLaneSelect(1)                    := UIntToOH(vlLane)
-      // slid 跳状态机
-      when(slideState === idle) {
-        when((!slidUnitIdle) && slidUnitDataReady) {
-          when(skipRead) {
-            slideState := sWrite
-          }.otherwise {
-            slideState := sRead
-          }
-        }
-      }
-      when(readState) {
-        // 不需要valid,因为这个状态下一定是valid的
-        when(readFireNextNext1) {
-          slideState := sWrite
-        }
-      }
-      when(writeState) {
-        when(maskUnitWriteReady || !slidActive) {
-          when(lastElement) {
-            slideState   := idle
-            slidUnitIdle := true.B
-            when(gather || extend) {
-              synchronized           := true.B
-              dataClear              := true.B
-              maskUnitReadOnlyFinish := true.B
-            }
-          }.otherwise {
-            when(lastElementForData && (gather || extend)) {
-              synchronized := true.B
-              dataClear    := true.B
-              slideState   := idle
-            }.otherwise {
-              // todo: skip read
-              slideState := sRead
-            }
-            updateMaskIndex := true.B
-          }
-        }
-      }
-
-      // compress & iota
-      val idle1 :: sReadMask :: sWrite1 :: Nil = Enum(3)
-      val compressState                        = RegInit(idle1)
-      val compressStateIdle                    = compressState === idle1
-      val compressStateRead                    = compressState === sReadMask
-      val compressStateWrite                   = compressState === sWrite1
-
-      // compress 用vs1当mask,需要先读vs1
-      val readCompressMaskNext = Pipe(maskUnitReadReady && compressStateRead, false.B, parameter.vrfReadLatency).valid
-      when(readCompressMaskNext) {
-        maskDataForCompress := readResultSelectResult
-      }
-
-      // 处理 iota
-      val iotaDataOffset:  UInt = elementIndexCount(log2Ceil(parameter.datapathWidth * parameter.laneNumber) - 1, 0)
-      val lastDataForIota: Bool = iotaDataOffset.andR
-      val iotaData = VecInit(data.map(_.bits)).asUInt(iotaDataOffset)
-      val iota     = decodeResultReg(Decoder.iota)
-
-      val maskUnitReadFire2: Bool = maskUnitReadVec(2).valid && maskUnitReadReady
-      val readFireNext2      = RegNext(maskUnitReadFire2)
-      val readFireNextNext2  = RegNext(readFireNext2)
-      val port2WaitForResult = readFireNextNext2 || readFireNext2
-
-      /** 计算需要读的mask的相关 elementIndexCount -> 11bit 只会访问单寄存器 elementIndexCount(4, 0)做为32bit内的offset elementIndexCount(7,
-        * 5)作为lane的选择 elementIndexCount(9, 8)作为offset
-        */
-      // compress read
-      maskUnitReadVec(2).valid           := compressStateRead && !port2WaitForResult
-      maskUnitReadVec(2).bits.vs         := vs1
-      maskUnitReadVec(2).bits.readSource := 0.U
-      maskUnitReadVec(2).bits.offset     := elementIndexCount(
-        log2Ceil(parameter.datapathWidth) + log2Ceil(parameter.laneNumber) +
-          parameter.laneParam.vrfParam.vrfOffsetBits - 1,
-        log2Ceil(parameter.datapathWidth) + log2Ceil(parameter.laneNumber)
-      )
-      maskReadLaneSelect(2)              := UIntToOH(
-        elementIndexCount(
-          log2Ceil(parameter.datapathWidth) + ((log2Ceil(parameter.laneNumber) - 1).max(0)),
-          log2Ceil(parameter.datapathWidth)
-        )
-      )
-      // val lastElementForMask: Bool = elementIndexCount(4, 0).andR
-      val maskForCompress: Bool = maskDataForCompress(elementIndexCount(log2Ceil(parameter.datapathWidth) - 1, 0))
-
-      // compress vm=0 是保留的
-      val skipWrite = !Mux(decodeResultReg(Decoder.compress), maskForCompress, elementActive)
-      val dataGroupTailForCompressUnit: Bool = Mux(iota, lastDataForIota, lastElementForData)
-
-      // 计算compress write的位置信息
-      val (compressMask, compressDataOffset, compressLane, compressOffset, compressGrowth, _) =
-        indexAnalysis(compressWriteCount)
-      val compressWriteData                                                                   = (maskUnitData << compressDataOffset).asUInt
-      val iotaWriteData                                                                       = (iotaCount << vlDataOffset).asUInt
-      // compress write
-      maskUnitWriteVec(2).valid                 := compressStateWrite && !skipWrite
-      maskUnitWriteVec(2).bits.vd               := vd + Mux(iota, vlGrowth, compressGrowth)
-      maskUnitWriteVec(2).bits.offset           := Mux(iota, vlOffset, compressOffset)
-      maskUnitWriteVec(2).bits.mask             := Mux(iota, vlMask, compressMask)
-      maskUnitWriteVec(2).bits.data             := Mux(iota, iotaWriteData, compressWriteData)
-      maskUnitWriteVec(2).bits.last             := lastElement
-      maskUnitWriteVec(2).bits.instructionIndex := control.record.instructionIndex
-      maskWriteLaneSelect(2)                    := UIntToOH(Mux(iota, vlLane, compressLane))
-
-      // 跳状态机
-      // compress每组数据先读mask
-      val firstState = Mux(iota, sWrite1, sReadMask)
-      when(compressStateIdle && (!iotaUnitIdle) && compressDataReady) {
-        compressState := firstState
-      }
-
-      when(compressStateRead && readFireNextNext2) {
-        compressState := sWrite1
-      }
-
-      when(compressStateWrite) {
-        when(maskUnitWriteReady || skipWrite) {
-          when(!skipWrite) {
-            compressWriteCount := compressWriteCount + 1.U
-            iotaCount          := iotaCount + iotaData
-          }
-          when(lastElement) {
-            compressState          := idle
-            iotaUnitIdle           := true.B
-            synchronized           := true.B
-            dataClear              := true.B
-            maskUnitReadOnlyFinish := true.B
-          }.otherwise {
-            when(lastElementForCompressMask) {
-              // update vs1 as mask for compress
-              compressState := sRead
-            }
-            when(dataGroupTailForCompressUnit) {
-              synchronized  := true.B
-              dataClear     := true.B
-              compressState := idle
-            }
-            updateMaskIndex := true.B
-          }
-        }
-      }
-      // for small vl & reduce
-      val accessByte              = (csrRegForMaskUnit.vl << csrRegForMaskUnit.vSew).asUInt
-      // vl < row(vl)
-      val smallVL                 = accessByte < (parameter.datapathWidth * parameter.laneNumber / 8).U
-      val byteSizePerDataPathBits = log2Ceil(parameter.datapathWidth / 8)
-      val lastExecuteCounterForReduce: UInt = if (parameter.laneNumber > 1) {
-        accessByte(
-          byteSizePerDataPathBits + log2Ceil(parameter.laneNumber) - 1,
-          byteSizePerDataPathBits
-        ) - !accessByte(byteSizePerDataPathBits - 1, 0).orR
-      } else 0.U
-      val lastGroupDataWaitMaskForRed: UInt = scanRightOr(UIntToOH(lastExecuteCounterForReduce))
-      // alu end
-      val maskOperation =
-        decodeResultReg(Decoder.maskLogic) ||
-          decodeResultReg(Decoder.maskDestination) ||
-          decodeResultReg(Decoder.ffo)
-      // How many data path(32 bit) will used by maskDestination instruction.
-      val maskDestinationByteSize: Bits =
-        csrRegForMaskUnit.vl(log2Ceil(parameter.dLen) - 1, 0) << csrRegForMaskUnit.vSew
-      val maskDestinationUseDataPathSize =
-        (maskDestinationByteSize >> 2).asUInt + maskDestinationByteSize(1, 0).orR
-      val lastGroupCountForThisGroup: UInt = maskDestinationUseDataPathSize(log2Ceil(parameter.laneNumber) - 1, 0)
-      val counterForMaskDestination:  UInt = if (parameter.laneNumber > 1) {
-        (lastGroupCountForThisGroup - 1.U) |
-          Fill(
-            log2Ceil(parameter.laneNumber),
-            (maskDestinationUseDataPathSize >> log2Ceil(parameter.laneNumber)).asUInt.orR
-          )
-      } else 0.U
-
-      val waitSourceDataCounter =
-        Mux(decodeResultReg(Decoder.maskDestination), counterForMaskDestination, lastExecuteCounter)
-      val lastGroupDataWaitMask = scanRightOr(UIntToOH(waitSourceDataCounter))
-      // todo: other ways
-      val lastOrderedGroup:  Option[Bool] = orderedReduceGroupCount.map(count =>
-        (count ## 0
-          .U(log2Ceil(parameter.laneNumber).W) + -1.S(log2Ceil(parameter.laneNumber).W).asUInt) >= csrRegForMaskUnit.vl
-      )
-      val misalignedOrdered: Bool         = if (parameter.fpuEnable) {
-        lastOrderedGroup.get && csrRegForMaskUnit.vl(log2Ceil(parameter.laneNumber) - 1, 0).orR && decodeResultReg(
-          Decoder.float
-        )
-      } else false.B
-      val dataMask  =
-        Mux(
-          maskOperation && lastGroup,
-          lastGroupDataWaitMask,
-          Mux(
-            reduce && (smallVL || misalignedOrdered),
-            lastGroupDataWaitMaskForRed,
-            -1.S(parameter.laneNumber.W).asUInt
-          )
-        )
-      val dataReady = ((~dataMask).asUInt | VecInit(data.map(_.valid)).asUInt).andR || skipLaneData
-      when(
-        // data ready
-        dataReady &&
-          // state check
-          !control.state.sMaskUnitExecution
-      ) {
-        // 读
-        when(needWAR && !WARRedResult.valid) {
-          maskUnitReadVec.head.valid := true.B
-        }
-        // 可能有的计算
-        val nextExecuteIndex:          UInt         = executeCounter + 1.U
-        val isLastExecuteForGroup:     Bool         = executeCounter(log2Ceil(parameter.laneNumber) - 1, 0).andR
-        val lastExecuteForInstruction: Option[Bool] = orderedReduceGroupCount.map(count =>
-          (count ## 0.U(log2Ceil(parameter.laneNumber).W) + nextExecuteIndex) === csrRegForMaskUnit.vl
-        )
-        val readFinish        = WARRedResult.valid || !needWAR
-        val readDataSign      =
-          Mux1H(vSewOHForMask(2, 0), Seq(WARRedResult.bits(7), WARRedResult.bits(15), WARRedResult.bits(31)))
-        when(readFinish && !executeFinishReg) {
-          when(readMv) {
-            control.state.sMaskUnitExecution := true.B
-            // signExtend for vmv.x.s
-            dataResult.bits                  := Mux(vSewOHForMask(2), WARRedResult.bits(31, 16), Fill(16, readDataSign)) ##
-              Mux(vSewOHForMask(0), Fill(8, readDataSign), WARRedResult.bits(15, 8)) ##
-              WARRedResult.bits(7, 0)
-
-          }.otherwise {
-            executeCounter := nextExecuteIndex
-            when(executeCounter =/= csrRegForMaskUnit.vl) {
-              dataResult.bits := aluOutPut
-            }
-            if (parameter.fpuEnable) {
-              when(!orderedReduceIdle.get) {
-                when(lastExecuteForInstruction.get) {
-                  orderedReduceIdle.get := true.B
-                }.elsewhen(isLastExecuteForGroup) {
-                  synchronized   := true.B
-                  executeCounter := 0.U
-                  dataClear      := true.B
-                  orderedReduceGroupCount.foreach(d => d := d + 1.U)
-                }
-              }
-            }
-          }
-        }
-        // for vfredmax
-        val lastReduceCounter =
-          executeCounter === csrRegForMaskUnit.vl || executeCounter(log2Ceil(parameter.laneNumber))
-        dontTouch(lastReduceCounter)
-        val executeFinish: Bool =
-          (lastReduceCounter || !(reduce || popCount) || orderedReduce) && maskUnitIdle
-        val schedulerWrite = decodeResultReg(Decoder.maskDestination) || (reduce && !popCount) || writeMv
-        val groupSync      = decodeResultReg(Decoder.ffo)
-        // 写回
-        when(readFinish && (executeFinish || writeMv || executeFinishReg)) {
-          maskUnitWriteVec.head.valid := schedulerWrite
-          executeFinishReg            := true.B
-          when(maskUnitWriteReady || !schedulerWrite) {
-            WARRedResult.valid := false.B
-            writeBackCounter   := writeBackCounter + schedulerWrite
-            when(lastExecuteForGroup || lastExecute || reduce || groupSync || writeMv || popCount) {
-              synchronized := true.B
-              dataClear    := true.B
-              when(lastExecuteForGroup || groupSync) {
-                executeForLastLaneFire := true.B
-                groupCounter           := groupCounter + 1.U
-              }
-              when(lastExecute || reduce || writeMv || popCount) {
-                control.state.sMaskUnitExecution := true.B
-              }
-            }
-          }
-        }
-      }
     }
     control
   }
@@ -1552,44 +767,26 @@ class T1(val parameter: T1Parameter)
     lane.csrInterface.vl   := evlForLane
     lane.laneIndex         := index.U
 
-    // - LSU request next offset of group
-    // - all lane are synchronized
-    // - the index type of instruction is finished.
-    lane.laneResponseFeedback.valid                 := lsu.lsuOffsetRequest || synchronized || completeIndexInstruction
-    // - the index type of instruction is finished.
-    // - for find first one.
-    lane.laneResponseFeedback.bits.complete         :=
-      completeIndexInstruction ||
-        completedLeftOr(index) ||
-        maskUnitReadOnlyFinish
-    // tell lane which
-    lane.laneResponseFeedback.bits.instructionIndex := slots.last.record.instructionIndex
-
     // lsu 优先会有死锁:
     // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写
     // vse32.v v1, (a0) -> 依赖上一条,但是会先发出read
     // 读 lane
-    lane.vrfReadAddressChannel.valid  := lsu.vrfReadDataPorts(index).valid ||
-      (maskUnitRead.valid && maskUnitReadSelect(index))
+    lane.vrfReadAddressChannel.valid  := lsu.vrfReadDataPorts(index).valid || maskUnit.readChannel(index).valid
     lane.vrfReadAddressChannel.bits   :=
-      Mux(maskUnitRead.valid, maskUnitRead.bits, lsu.vrfReadDataPorts(index).bits)
-    lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnitRead.valid
-    readSelectMaskUnit(index)         :=
-      lane.vrfReadAddressChannel.ready && maskUnitReadSelect(index)
-    laneReadResult(index)             := lane.vrfReadDataChannel
+      Mux(maskUnit.readChannel(index).valid, maskUnit.readChannel(index).bits, lsu.vrfReadDataPorts(index).bits)
+    lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnit.readChannel(index).valid
+    maskUnit.readChannel(index).ready := lane.vrfReadAddressChannel.ready
+    maskUnit.readResult(index)        := lane.vrfReadDataChannel
     lsu.vrfReadResults(index)         := lane.vrfReadDataChannel
 
-    // 写lane
-    lane.vrfWriteChannel.valid := vrfWrite(index).valid || (maskUnitWrite.valid && maskUnitWriteSelect(index))
-    lane.vrfWriteChannel.bits  :=
-      Mux(vrfWrite(index).valid, vrfWrite(index).bits, maskUnitWrite.bits)
+    // lsu & mask unit write lane
+    lane.vrfWriteChannel.valid := vrfWrite(index).valid
+    lane.vrfWriteChannel.bits  := vrfWrite(index).bits
     vrfWrite(index).ready      := lane.vrfWriteChannel.ready
-    writeSelectMaskUnit(index) :=
-      lane.vrfWriteChannel.ready && !vrfWrite(index).valid && maskUnitWriteSelect(index)
 
-    lsu.offsetReadResult(index).valid := lane.laneResponse.valid && lane.laneResponse.bits.toLSU
-    lsu.offsetReadResult(index).bits  := lane.laneResponse.bits.data
-    lsu.offsetReadIndex(index)        := lane.laneResponse.bits.instructionIndex
+    lsu.offsetReadResult(index).valid := lane.maskUnitRequest.valid && lane.maskRequestToLSU
+    lsu.offsetReadResult(index).bits  := lane.maskUnitRequest.bits.source2
+    lsu.offsetReadIndex(index)        := lane.maskUnitRequest.bits.index
 
     instructionFinished(index).zip(slots.map(_.record.instructionIndex)).foreach { case (d, f) =>
       d := (UIntToOH(f(parameter.instructionIndexBits - 2, 0)) & lane.instructionFinished).orR
@@ -1598,12 +795,7 @@ class T1(val parameter: T1Parameter)
     val v0ForThisLane: Seq[UInt] = regroupV0.map(rv => cutUInt(rv, parameter.vLen / parameter.laneNumber)(index))
     val v0SelectBySew = Mux1H(UIntToOH(lane.maskSelectSew)(2, 0), v0ForThisLane)
     lane.maskInput     := cutUInt(v0SelectBySew, parameter.datapathWidth)(lane.maskSelect)
-    lane.lsuLastReport := lsu.lastReport |
-      Mux(
-        maskUnitFlushVrf,
-        indexToOH(slots.last.record.instructionIndex, parameter.chainingSize),
-        0.U
-      )
+    lane.lsuLastReport := lsu.lastReport | maskUnit.lastReport
 
     lane.lsuMaskGroupChange      := lsu.lsuMaskGroupChange
     lane.loadDataInLSUWriteQueue := lsu.dataInWriteQueue(index)
@@ -1613,18 +805,7 @@ class T1(val parameter: T1Parameter)
       (requestReg.bits.writeByte >> rowWith).asUInt +
         (requestReg.bits.writeByte(rowWith - 1, 0) > ((parameter.datapathWidth / 8) * index).U)
 
-    // 处理lane的mask类型请求
-    laneSynchronize(index) := lane.laneResponse.valid && !lane.laneResponse.bits.toLSU
-    when(laneSynchronize(index)) {
-      data(index).valid   := true.B
-      data(index).bits    := lane.laneResponse.bits.data
-      completedVec(index) := lane.laneResponse.bits.ffoSuccess
-      flotReduceValid(index).foreach(d => d := lane.laneResponse.bits.fpReduceValid.get)
-    }
-
     // token manager
-    tokenManager.writeV0(index).valid     := lane.vrfWriteChannel.fire && (lane.vrfWriteChannel.bits.vd === 0.U)
-    tokenManager.writeV0(index).bits      := lane.vrfWriteChannel.bits.instructionIndex
     tokenManager.instructionFinish(index) := lane.instructionFinished
 
     lane
@@ -1656,6 +837,35 @@ class T1(val parameter: T1Parameter)
   lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR
   lsu.vrfReadyToStore  := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR
 
+  // connect mask unit: the instruction request is raised when requestRegDequeue fires for a Decoder.maskUnit instruction
+  maskUnit.instReq.valid                 := requestRegDequeue.fire && requestReg.bits.decodeResult(Decoder.maskUnit)
+  maskUnit.instReq.bits.instructionIndex := requestReg.bits.instructionIndex
+  maskUnit.instReq.bits.decodeResult     := decodeResult
+  maskUnit.instReq.bits.readFromScala    := source1Select
+  maskUnit.instReq.bits.sew              := T1Issue.vsew(requestReg.bits.issue)
+  maskUnit.instReq.bits.maskType         := maskType
+  maskUnit.instReq.bits.vxrm             := requestReg.bits.issue.vcsr(2, 1)
+  maskUnit.instReq.bits.vlmul            := requestReg.bits.issue.vtype(2, 0)
+  maskUnit.instReq.bits.vs2              := requestRegDequeue.bits.instruction(24, 20) // bits 24:20 of the instruction word
+  maskUnit.instReq.bits.vl               := requestReg.bits.issue.vl
+  maskUnit.maskInput                     := cutUInt(v0.asUInt, parameter.maskGroupWidth)(maskUnit.maskSelect)
+
+  maskUnit.exeReq.zip(laneVec).foreach { case (maskInput, lane) =>
+    maskInput <> lane.maskUnitRequest
+  }
+  maskUnit.exeResp.zip(laneVec).foreach { case (maskOutput, lane) =>
+    lane.maskUnitResponse <> maskOutput
+  }
+
+  val gatherResultSelect: UInt = Mux1H(
+    gatherReadLaneSelect,
+    laneVec.map(_.vrfReadDataChannel)
+  )
+  // gather read result: latch the selected lane's read data shifted by gatherReadDataOffset, or 0 when gatherOverlap is set
+  when(gatherReadResultFire) {
+    gatherData := Mux(gatherOverlap, 0.U, (gatherResultSelect >> gatherReadDataOffset).asUInt)
+  }
+
   // 连lane的环
   parameter.crossLaneConnectCycles.zipWithIndex.foreach { case (cycles, index) =>
     cycles.zipWithIndex.foreach { case (cycle, portIndex) =>
@@ -1721,10 +931,17 @@ class T1(val parameter: T1Parameter)
   //   we detect the hazard and decide should we issue this slide or
   //   issue the instruction after the slide which already in the slot.
   requestRegDequeue.ready := executionReady && slotReady && (!gatherNeedRead || gatherReadFinish) &&
-    instructionRAWReady && instructionIndexFree && vrfAllocate
+    tokenManager.issueAllow && instructionIndexFree && vrfAllocate
 
   instructionToSlotOH := Mux(requestRegDequeue.fire, slotToEnqueue, 0.U)
 
+  tokenManager.instructionIssue.valid                 := requestRegDequeue.fire // report every issued instruction
+  tokenManager.instructionIssue.bits.instructionIndex := requestReg.bits.instructionIndex
+  tokenManager.instructionIssue.bits.writeV0          :=
+    (!requestReg.bits.decodeResult(Decoder.targetRd) && !isStoreType) && requestReg.bits.vdIsV0 // not rd-targeting, not a store, vd is v0
+  tokenManager.instructionIssue.bits.useV0AsMask      := maskType
+  tokenManager.instructionIssue.bits.isLoadStore      := !requestRegDequeue.bits.instruction(6) // opcode bit 6 is 0 for LOAD-FP/STORE-FP, 1 for OP-V
+
   // instruction commit
   {
     val slotCommit: Vec[Bool] = VecInit(slots.map { inst =>
@@ -1740,7 +957,7 @@ class T1(val parameter: T1Parameter)
       inst.record.instructionIndex === responseCounter
     })
     retire                   := slotCommit.asUInt.orR
-    io.retire.rd.bits.rdData := Mux(ffoType, ffoIndexReg.bits, dataResult.bits)
+    io.retire.rd.bits.rdData := dataResult
     // TODO: csr retire.
     io.retire.csr.bits.vxsat := (slotCommit.asUInt & VecInit(slots.map(_.vxsat)).asUInt).orR
     io.retire.csr.bits.fflag := DontCare
@@ -1762,11 +979,6 @@ class T1(val parameter: T1Parameter)
       data := (data & (~maskExt).asUInt) | (maskExt & v0Write.bits.data)
     }
   }
-  when(dataClear) {
-    data.foreach(_.valid := false.B)
-  }
-  // don't care有可能会导致先读后写失败
-  maskUnitReadVec.foreach(_.bits.instructionIndex := slots.last.record.instructionIndex)
 
   layer.block(layers.Verification) {
 
@@ -1779,9 +991,10 @@ class T1(val parameter: T1Parameter)
     probeWire.issueTag            := requestReg.bits.instructionIndex
     probeWire.retireValid         := retire
     // maskUnitWrite maskUnitWriteReady
-    probeWire.writeQueueEnq.valid := maskUnitWrite.valid && maskUnitWriteReady
-    probeWire.writeQueueEnq.bits  := maskUnitWrite.bits.instructionIndex
-    probeWire.writeQueueEnqMask   := maskUnitWrite.bits.mask
+    // todo
+    probeWire.writeQueueEnq.valid := DontCare
+    probeWire.writeQueueEnq.bits  := DontCare
+    probeWire.writeQueueEnqMask   := DontCare
     probeWire.instructionValid    := maskAnd(
       !slots.last.state.sMaskUnitExecution && !slots.last.state.idle,
       indexToOH(slots.last.record.instructionIndex, parameter.chainingSize * 2)
diff --git a/t1/src/decoder/Decoder.scala b/t1/src/decoder/Decoder.scala
index 4072ae589..74b08d520 100644
--- a/t1/src/decoder/Decoder.scala
+++ b/t1/src/decoder/Decoder.scala
@@ -33,7 +33,7 @@ trait T1UopField extends T1DecodeFiled[UInt] with FieldName {
 }
 
 trait T1TopUopField extends T1DecodeFiled[UInt] with FieldName {
-  def chiselType: UInt = UInt(3.W)
+  def chiselType: UInt = UInt(5.W) // 5-bit top uop field; the TopT* patterns in Decoder.topUop only set the low three bits
 }
 
 trait T1fpExecutionTypeUopField extends T1DecodeFiled[UInt] with FieldName {
@@ -227,14 +227,14 @@ object Decoder {
 
   object topUop extends T1TopUopField {
     override def genTable(pattern: T1DecodePattern): BitPat = pattern.topUop.value match {
-      case _: TopT0.type => BitPat("b000")
-      case _: TopT1.type => BitPat("b001")
-      case _: TopT2.type => BitPat("b010")
-      case _: TopT3.type => BitPat("b011")
-      case _: TopT5.type => BitPat("b101")
-      case _: TopT6.type => BitPat("b110")
-      case _: TopT7.type => BitPat("b111")
-      case _ => BitPat.dontCare(3)
+      case _: TopT0.type => BitPat("b00000")
+      case _: TopT1.type => BitPat("b00001")
+      case _: TopT2.type => BitPat("b00010")
+      case _: TopT3.type => BitPat("b00011")
+      case _: TopT5.type => BitPat("b00101")
+      case _: TopT6.type => BitPat("b00110")
+      case _: TopT7.type => BitPat("b00111")
+      case _ => BitPat.dontCare(5)
     }
   }
 
diff --git a/t1/src/decoder/attribute/isSwrite.scala b/t1/src/decoder/attribute/isSwrite.scala
index f14bad4c0..9b33abe53 100644
--- a/t1/src/decoder/attribute/isSwrite.scala
+++ b/t1/src/decoder/attribute/isSwrite.scala
@@ -134,8 +134,6 @@ object isSwrite {
       "vredor.vs",
       "vredsum.vs",
       "vredxor.vs",
-      "vrgather.vv",
-      "vrgatherei16.vv",
       "vs1r.v",
       "vs2r.v",
       "vs4r.v",
diff --git a/t1/src/laneStage/LaneStage3.scala b/t1/src/laneStage/LaneStage3.scala
index d6fb70eca..4854c0941 100644
--- a/t1/src/laneStage/LaneStage3.scala
+++ b/t1/src/laneStage/LaneStage3.scala
@@ -46,18 +46,11 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module {
 
   val pipeEnqueue: Option[LaneStage3Enqueue] = Option.when(isLastSlot)(RegInit(0.U.asTypeOf(enqueue.bits)))
 
-  /** response to [[T1.lsu]] or mask unit in [[T1]] */
   @public
-  val laneResponse: Option[ValidIO[LaneResponse]] = Option.when(isLastSlot)(IO(Valid(new LaneResponse(parameter))))
-  @public
-  val stageValid:   Bool                          = IO(Output(Bool()))
+  val stageValid: Bool = IO(Output(Bool()))
 
-  /** feedback from [[T1]] to [[Lane]] for [[laneResponse]] */
-  @public
-  val laneResponseFeedback: Option[ValidIO[LaneResponseFeedback]]  =
-    Option.when(isLastSlot)(IO(Flipped(Valid(new LaneResponseFeedback(parameter)))))
   @public
-  val crossWritePort:       Option[Vec[DecoupledIO[WriteBusData]]] =
+  val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] =
     Option.when(isLastSlot)(IO(Vec(2, Decoupled(new WriteBusData(parameter)))))
 
   val stageValidReg: Option[Bool] = Option.when(isLastSlot)(RegInit(false.B))
@@ -68,20 +61,10 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module {
   /** schedule cross lane write MSB */
   val sCrossWriteMSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B))
 
-  // state for response to scheduler
-  /** schedule send [[LaneResponse]] to scheduler */
-  val sSendResponse: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B))
-
-  /** wait scheduler send [[LaneResponseFeedback]] */
-  val wResponseFeedback: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B))
-
   // update register
   when(enqueue.fire) {
     pipeEnqueue.foreach(_ := enqueue.bits)
     (sCrossWriteLSB ++ sCrossWriteMSB).foreach(_ := !enqueue.bits.decodeResult(Decoder.crossWrite))
-    (sSendResponse ++ wResponseFeedback).foreach(
-      _ := enqueue.bits.decodeResult(Decoder.scheduler) || enqueue.bits.sSendResponse
-    )
   }
 
   // Used to cut off back pressure forward
@@ -111,43 +94,18 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module {
         sendState(index) := true.B
       }
     }
-    // scheduler synchronization
-    val schedulerFinish: Bool = (sSendResponse ++ wResponseFeedback).reduce(_ && _)
 
     val dataSelect: Option[UInt] = Option.when(isLastSlot) {
       Mux(
         pipeEnqueue.get.decodeResult(Decoder.nr) ||
-          (enqueue.bits.ffoByOtherLanes && pipeEnqueue.get.decodeResult(Decoder.ffo)) ||
-          pipeEnqueue.get.decodeResult(Decoder.dontNeedExecuteInLane),
+          (enqueue.bits.ffoByOtherLanes && pipeEnqueue.get.decodeResult(Decoder.ffo)),
         pipeEnqueue.get.pipeData,
         pipeEnqueue.get.data
       )
     }
-    // mask request
-    laneResponse.head.valid                 := stageValidReg.get && !sSendResponse.get
-    laneResponse.head.bits.data             := Mux(
-      pipeEnqueue.get.decodeResult(Decoder.ffo),
-      pipeEnqueue.get.ffoIndex,
-      dataSelect.get
-    )
-    laneResponse.head.bits.toLSU            := pipeEnqueue.get.loadStore
-    laneResponse.head.bits.instructionIndex := pipeEnqueue.get.instructionIndex
-    laneResponse.head.bits.ffoSuccess       := pipeEnqueue.get.ffoSuccess
-    laneResponse.head.bits.fpReduceValid.zip(pipeEnqueue.get.fpReduceValid).foreach { case (s, f) => s := f }
-
-    sSendResponse.foreach(state =>
-      when(laneResponse.head.valid) {
-        state := true.B
-      }
-    )
-    wResponseFeedback.foreach(state =>
-      when(laneResponseFeedback.head.valid) {
-        state := true.B
-      }
-    )
 
     // enqueue write for last slot
-    vrfWriteQueue.io.enq.valid := stageValidReg.get && schedulerFinish && !pipeEnqueue.get.decodeResult(Decoder.sWrite)
+    vrfWriteQueue.io.enq.valid := stageValidReg.get && !pipeEnqueue.get.decodeResult(Decoder.sWrite)
 
     // UInt(5.W) + UInt(3.W), use `+` here
     vrfWriteQueue.io.enq.bits.vd := pipeEnqueue.get.vd + pipeEnqueue.get.groupCounter(
@@ -165,8 +123,8 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module {
     /** Cross-lane writing is over */
     val CrossLaneWriteOver: Bool = (sCrossWriteLSB ++ sCrossWriteMSB).reduce(_ && _)
 
-    enqueue.ready := !stageValidReg.get || (CrossLaneWriteOver && schedulerFinish && vrfWriteReady)
-    val dequeueFire = stageValidReg.get && CrossLaneWriteOver && schedulerFinish && vrfWriteReady
+    enqueue.ready := !stageValidReg.get || (CrossLaneWriteOver && vrfWriteReady)
+    val dequeueFire = stageValidReg.get && CrossLaneWriteOver && vrfWriteReady
     stageValidReg.foreach { data =>
       when(dequeueFire ^ enqueue.fire) {
         data := enqueue.fire
diff --git a/t1/src/laneStage/MaskExchangeUnit.scala b/t1/src/laneStage/MaskExchangeUnit.scala
new file mode 100644
index 000000000..9ac45c363
--- /dev/null
+++ b/t1/src/laneStage/MaskExchangeUnit.scala
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu <liu@jiuyang.me>
+
+package org.chipsalliance.t1.rtl.lane
+
+import chisel3._
+import chisel3.experimental.hierarchy.{instantiable, public}
+import chisel3.util._
+import org.chipsalliance.t1.rtl._
+// Single-entry stage between enqueue and dequeue: mask requests go out on maskReq, mask unit responses are merged into the buffered entry.
+@instantiable
+class MaskExchangeUnit(parameter: LaneParameter) extends Module {
+  @public
+  val enqueue: DecoupledIO[LaneStage3Enqueue] =
+    IO(Flipped(Decoupled(new LaneStage3Enqueue(parameter, true))))
+
+  @public
+  val dequeue: DecoupledIO[LaneStage3Enqueue] =
+    IO(Decoupled(new LaneStage3Enqueue(parameter, true)))
+
+  @public
+  val maskReq: DecoupledIO[MaskUnitExeReq] = IO(Decoupled(new MaskUnitExeReq(parameter)))
+
+  @public
+  val maskRequestToLSU: Bool = IO(Output(Bool())) // mirrors enqueue.bits.loadStore
+
+  @public
+  val maskUnitResponse: ValidIO[MaskUnitExeResponse] = IO(Flipped(Valid(new MaskUnitExeResponse(parameter))))
+
+  // pipe reg: one buffered entry plus its valid bit
+  val requestPipeReq:   LaneStage3Enqueue = RegInit(0.U.asTypeOf(enqueue.bits))
+  val pipeValid:        Bool              = RegInit(false.B)
+  // todo: sSendResponse -> sendResponse
+  val enqIsMaskRequest: Bool              = !enqueue.bits.sSendResponse // low sSendResponse marks a mask request
+
+  // todo: connect mask request & response; NOTE(review): maskReq.ready is not read anywhere in this module
+  maskReq.valid             := enqIsMaskRequest && enqueue.valid
+  maskReq.bits.source1      := enqueue.bits.pipeData
+  maskReq.bits.source2      := enqueue.bits.data
+  maskReq.bits.groupCounter := enqueue.bits.groupCounter
+  maskReq.bits.index        := enqueue.bits.instructionIndex
+
+  maskRequestToLSU := enqueue.bits.loadStore
+
+  // type change MaskUnitExeResponse -> LaneStage3Enqueue: start from the buffered entry, then overwrite the response fields
+  val maskUnitResponsePipeType: LaneStage3Enqueue = WireDefault(requestPipeReq)
+  maskUnitResponsePipeType.groupCounter     := maskUnitResponse.bits.writeData.groupCounter
+  maskUnitResponsePipeType.data             := maskUnitResponse.bits.writeData.data
+  maskUnitResponsePipeType.mask             := maskUnitResponse.bits.writeData.mask
+  maskUnitResponsePipeType.instructionIndex := maskUnitResponse.bits.index
+  maskUnitResponsePipeType.ffoByOtherLanes  := enqueue.bits.ffoByOtherLanes
+
+  val regEnq:      Bool = (enqueue.fire && !enqIsMaskRequest) || maskUnitResponse.valid // entry becomes valid for dequeue
+  val pipeRequest: Bool = enqueue.fire || maskUnitResponse.valid // register is also written by mask requests (without setting pipeValid)
+  when(pipeRequest) {
+    requestPipeReq := Mux(maskUnitResponse.valid, maskUnitResponsePipeType, enqueue.bits)
+  }
+  when(regEnq ^ dequeue.fire) {
+    pipeValid := regEnq
+  }
+
+  // mask requests are always accepted; other requests need a free (or draining) entry and no response arriving this cycle
+  enqueue.ready := ((!pipeValid || dequeue.ready) && !maskUnitResponse.valid) || enqIsMaskRequest
+  dequeue.valid := pipeValid
+  dequeue.bits  := requestPipeReq
+}
diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala
index b2ed09a4e..d842b9051 100644
--- a/t1/src/laneStage/SlotTokenManager.scala
+++ b/t1/src/laneStage/SlotTokenManager.scala
@@ -101,6 +101,9 @@ class SlotTokenManager(parameter: LaneParameter) extends Module {
   @public
   val dataInWritePipe: UInt = IO(Output(UInt(parameter.chainingSize.W)))
 
+  @public
+  val maskUnitLastReport: UInt = IO(Input(UInt(parameter.chainingSize.W)))
+
   def tokenUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt): UInt = {
     tokenData.zipWithIndex.foreach { case (t, i) =>
       val e      = enqWire(i)
@@ -114,12 +117,15 @@ class SlotTokenManager(parameter: LaneParameter) extends Module {
   }
 
   // todo: Precise feedback
-  def feedbackUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt): UInt = {
+  def feedbackUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt, clear: UInt): UInt = {
     tokenData.zipWithIndex.foreach { case (t, i) =>
       val e      = enqWire(i)
       val d      = deqWire(i)
+      val c      = clear(i)
       val change = Mux(e, 1.U(tokenWith.W), -1.S(tokenWith.W).asUInt)
-      when((e ^ d) && (e || t =/= 0.U)) {
+      when(c) {
+        t := 0.U
+      }.elsewhen((e ^ d) && (e || t =/= 0.U)) {
         t := t + change
       }
     }
@@ -132,7 +138,11 @@ class SlotTokenManager(parameter: LaneParameter) extends Module {
     val enqOH = indexToOH(enqReport.bits.instructionIndex, parameter.chainingSize)
 
     val writeDoEnq: UInt =
-      maskAnd(enqReport.valid && !enqReport.bits.decodeResult(Decoder.sWrite), enqOH).asUInt
+      maskAnd(
+        enqReport.valid && !enqReport.bits.decodeResult(Decoder.sWrite) &&
+          !enqReport.bits.decodeResult(Decoder.maskUnit),
+        enqOH
+      ).asUInt
 
     val writeDoDeq: UInt =
       maskAnd(
@@ -140,7 +150,9 @@ class SlotTokenManager(parameter: LaneParameter) extends Module {
         indexToOH(slotWriteReport(slotIndex).bits, parameter.chainingSize)
       ).asUInt
 
-    val pendingSlotWrite = tokenUpdate(writeToken, writeDoEnq, writeDoDeq)
+    val writeEnqSelect: UInt = Wire(UInt(parameter.chainingSize.W))
+
+    val pendingSlotWrite = tokenUpdate(writeToken, writeEnqSelect, writeDoDeq)
 
     if (slotIndex == 0) {
       val responseToken:      Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))
@@ -181,13 +193,16 @@ class SlotTokenManager(parameter: LaneParameter) extends Module {
         responseFeedbackReport.bits
       )
       val feedbackDoDeq: UInt =
-        maskAnd(responseFeedbackReport.valid, indexToOH(feedbackIndexSelect, parameter.chainingSize)).asUInt
+        maskAnd(responseFeedbackReport.valid, indexToOH(responseFeedbackReport.bits, parameter.chainingSize)).asUInt
+
+      writeEnqSelect := writeDoEnq | feedbackDoDeq
 
       val pendingResponse = tokenUpdate(responseToken, responseDoEnq, responseDoDeq)
       // todo: Precise feedback
-      val pendingFeedback = feedbackUpdate(feedbackToken, responseDoEnq, feedbackDoDeq)
+      val pendingFeedback = feedbackUpdate(feedbackToken, responseDoEnq, feedbackDoDeq, maskUnitLastReport)
       pendingSlotWrite | pendingCrossWriteLSB | pendingCrossWriteMSB | pendingResponse | pendingFeedback
     } else {
+      writeEnqSelect := writeDoEnq
       pendingSlotWrite
     }
   }.reduce(_ | _)
diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala
new file mode 100644
index 000000000..651724380
--- /dev/null
+++ b/t1/src/mask/MaskCompress.scala
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu <liu@jiuyang.me>
+
+package org.chipsalliance.t1.rtl
+
+import chisel3._
+import chisel3.util._
+// Request bundle consumed by MaskCompress.
+class CompressInput(parameter: T1Parameter) extends Bundle {
+  val maskType:       Bool = Bool() // not read by MaskCompress yet
+  val eew:            UInt = UInt(2.W) // decoded one-hot in MaskCompress to select 1, 2 or 4 byte elements
+  val uop:            UInt = UInt(3.W) // MaskCompress decodes 001 as compress, 000 as viota, 101 as mv
+  val readFromScalar: UInt = UInt(parameter.datapathWidth.W) // returned as the data for mv
+  val source1:        UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) // low bits are used as the element mask
+  val source2:        UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) // data that compress selects from
+  val groupCounter:   UInt = UInt(parameter.laneParam.groupNumberBits.W) // not read by MaskCompress yet
+  val lastCompress:   Bool = Bool() // not read by MaskCompress yet
+}
+// Result bundle produced by MaskCompress.
+class CompressOutput(parameter: T1Parameter) extends Bundle {
+  val data:          UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W)
+  val mask:          UInt = UInt((parameter.laneNumber * parameter.datapathWidth / 8).W) // one bit per byte of data
+  val compressValid: Bool = Bool() // currently tied to false.B in MaskCompress
+}
+// Produces data/mask for compress, viota and mv, selected by in.bits.uop; the compress path is still incomplete (see todos).
+class MaskCompress(parameter: T1Parameter) extends Module {
+  val in:             ValidIO[CompressInput] = IO(Flipped(Valid(new CompressInput(parameter))))
+  val out:            CompressOutput         = IO(Output(new CompressOutput(parameter)))
+  val newInstruction: Bool                   = IO(Input(Bool()))
+
+  val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8
+
+  // Source1 alignment: only the low maskSize bits of source1 are kept
+  val source1Aligned: UInt = Wire(UInt(maskSize.W))
+  // TODO: Align and align in advance
+  source1Aligned := in.bits.source1
+  val compress = in.bits.uop === "b001".U
+  val viota    = in.bits.uop === "b000".U
+  val mv       = in.bits.uop === "b101".U // NOTE(review): the uop table in MaskUnit.scala lists vmv as "01 01 x" - confirm 101
+
+  val eew1H:           UInt      = UIntToOH(in.bits.eew)(2, 0)
+  val compressInit:    UInt      = RegInit(0.U(log2Ceil(parameter.vLen).W)) // running count carried across requests
+  val compressVec:     Vec[UInt] = Wire(Vec(maskSize, UInt(compressInit.getWidth.W)))
+  val compressMaskVec: Seq[Bool] = source1Aligned.asBools
+  // prefix count: compressVec(i) = compressInit + number of set mask bits below i; the fold result is the new total
+  val compressCount:   UInt      = compressMaskVec.zipWithIndex.foldLeft(compressInit) { case (pre, (mask, index)) =>
+    compressVec(index) := pre
+    pre + mask
+  }
+  // todo: compress update; NOTE(review): this register is updated every cycle, in.valid is not consulted
+  compressInit := Mux(newInstruction, 0.U, compressCount)
+
+  // viota data: each element receives its prefix count, packed per lane (UIntWithSize presumably resizes to eew * 8 bits)
+  val viotaResult: UInt = Mux1H(
+    eew1H,
+    Seq(1, 2, 4).map { eew =>
+      VecInit(Seq.tabulate(parameter.laneNumber) { index =>
+        // element width is eew * 8 bits; with a 32-bit data path each lane holds 4 / eew elements
+        val dataSize = 4 / eew
+        val res: Seq[UInt] = Seq.tabulate(dataSize) { i =>
+          UIntWithSize(compressVec(dataSize * index + i), eew * 8)
+        }
+        // each data path
+        VecInit(res).asUInt
+      }).asUInt
+    }
+  )
+  val viotaMask:   UInt = Mux1H(
+    eew1H,
+    Seq(1, 2, 4).map { eew =>
+      VecInit(Seq.tabulate(parameter.laneNumber) { index =>
+        val dataSize = 4 / eew
+        val res: Seq[UInt] = Seq.tabulate(dataSize) { i =>
+          Fill(eew, compressMaskVec(dataSize * index + i)) // replicate the element's mask bit over its eew bytes
+        }
+        // 4 mask bits per lane (one per byte of a 32-bit data path)
+        VecInit(res).asUInt
+      }).asUInt
+    }
+  )
+
+  val tailCount       = compressInit
+  val compressDataReg = RegInit(0.U((parameter.laneNumber * parameter.datapathWidth).W)) // never written in this module yet
+  val compressDataVec = Seq(1, 2, 4).map { eew =>
+    VecInit(Seq.tabulate(parameter.laneNumber * 2) { index =>
+      val useTail       = index.U < tailCount
+      val tailData      = cutUInt(compressDataReg, eew)(index) // NOTE(review): width is eew here but eew * 8 in viotaResult - confirm unit
+      val maskSize      = 4 * parameter.laneNumber / eew // shadows the class-level maskSize; element count for a 32-bit data path
+      val hitReq        = Seq.tabulate(maskSize)(maskIndex => compressVec(maskIndex) === index.U)
+      val selectReqData = Mux1H(
+        hitReq,
+        cutUInt(in.bits.source2, eew) // NOTE(review): same slice-width question as tailData above
+      )
+      Mux(useTail, tailData, selectReqData)
+    }).asUInt
+  }
+  val compressResult: UInt = Mux1H(eew1H, compressDataVec)
+
+  // todo: connect & update compressInit
+  val compressTailMask = Wire(UInt(out.mask.getWidth.W))
+  compressTailMask := DontCare
+
+  val mvMask = Mux1H(eew1H, Seq(1.U, 3.U, 15.U)) // 1, 2 or 4 low mask bits set, matching the 1/2/4 byte element
+  val mvData = in.bits.readFromScalar
+
+  out.data := Mux1H(
+    Seq(
+      compress -> compressResult,
+      viota    -> viotaResult,
+      mv       -> mvData
+    )
+  )
+
+  // todo: compressMask (compressTailMask is DontCare for now)
+  out.mask := Mux1H(
+    Seq(
+      compress -> compressTailMask,
+      viota    -> viotaMask,
+      mv       -> mvMask
+    )
+  )
+
+  // todo
+  out.compressValid := false.B
+}
diff --git a/t1/src/mask/MaskExtend.scala b/t1/src/mask/MaskExtend.scala
new file mode 100644
index 000000000..27a27f9ea
--- /dev/null
+++ b/t1/src/mask/MaskExtend.scala
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu <liu@jiuyang.me>
+
+package org.chipsalliance.t1.rtl
+
+import chisel3._
+import chisel3.util._
+// Input bundle consumed by MaskExtend.
+class ExtendInput(parameter: T1Parameter) extends Bundle {
+  val eew:          UInt = UInt(2.W) // decoded one-hot in MaskExtend
+  val uop:          UInt = UInt(3.W) // 111: mask destination; otherwise bit 0 = sign extend, bit 1 = vf4 (else vf2)
+  val source2:      UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W)
+  val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) // low bits pick the half/quarter of source2 to extend
+}
+// Combinational unit producing either the mask-destination regrouping of source2 or its sign/zero extension (vf2 / vf4).
+class MaskExtend(parameter: T1Parameter) extends Module {
+  val in:  ExtendInput = IO(Input(new ExtendInput(parameter)))
+  val out: UInt        = IO(Output(UInt(parameter.datapathWidth.W)))
+
+  val eew1H: UInt = UIntToOH(in.eew)(2, 0)
+
+  val isMaskDestination:     Bool = in.uop.andR // uop == 111
+  val maskDestinationResult: UInt = Mux1H(
+    eew1H,
+    Seq(4, 2, 1).map { groupSize => // slices of source2 are taken laneNumber at a time and transposed (assumes cutUInt(x, w) yields w-bit slices)
+      VecInit(
+        cutUInt(in.source2, groupSize).grouped(parameter.laneNumber).toSeq.transpose.map(a => VecInit(a).asUInt)
+      ).asUInt
+    }
+  )
+
+  // extend: uop bit 0 selects sign extension (otherwise zero extension)
+  val sign:        Bool = in.uop(0)
+  // extend ratio
+  // todo: Currently only vf2 and vf4
+  val extendRatio: Bool = in.uop(1)
+
+  // select source2: vf4 uses one quarter of in.source2 (groupCounter(1, 0)), vf2 one half (groupCounter(0))
+  // extendRatio: 0 -> vf2; 1-> vf4
+  val source2: UInt = Mux(
+    extendRatio,
+    Mux1H(
+      UIntToOH(in.groupCounter(1, 0)),
+      cutUInt(in.source2, parameter.laneNumber * parameter.datapathWidth / 4)
+    ),
+    Mux1H(
+      UIntToOH(in.groupCounter(0)),
+      cutUInt(in.source2, parameter.laneNumber * parameter.datapathWidth / 2)
+    )
+  )
+
+  // outer select: result element of 2 or 4 bytes (eew1H bits 1 and 2); inner select: extension ratio 2 or 4
+  val extendResult: UInt = Mux1H(
+    eew1H(2, 1),
+    Seq(2, 4).map { dataWidth =>
+      Mux1H(
+        UIntToOH(extendRatio),
+        Seq(2, 4).map { ratio =>
+          val resWidth    = dataWidth * 8
+          val sourceWidth = resWidth / ratio
+          VecInit(cutUInt(source2, sourceWidth).map { sourceData =>
+            Fill(resWidth - sourceWidth, sourceData(sourceWidth - 1) && sign) ## sourceData // MSB fill when sign is set, else zero fill
+          }).asUInt
+        }
+      )
+    }
+  )
+
+  // NOTE(review): out is datapathWidth bits while both results appear to be laneNumber * datapathWidth wide - confirm width
+  out := Mux(isMaskDestination, maskDestinationResult, extendResult)
+}
diff --git a/t1/src/mask/MaskReduce.scala b/t1/src/mask/MaskReduce.scala
new file mode 100644
index 000000000..afee1bd88
--- /dev/null
+++ b/t1/src/mask/MaskReduce.scala
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu <liu@jiuyang.me>
+
+package org.chipsalliance.t1.rtl
+
+import chisel3._
+import chisel3.experimental.hierarchy.{Instance, Instantiate}
+import chisel3.util._
+// Request bundle consumed by MaskReduce.
+class ReduceInput(parameter: T1Parameter) extends Bundle {
+  val maskType:     Bool = Bool()
+  val eew:          UInt = UInt(2.W) // decoded one-hot in MaskReduce; also forwarded to the integer adder as vSew
+  val uop:          UInt = UInt(3.W) // bit 2: float, bit 1: not an add (logic / float compare); 001 = widen, 101 = ordered
+  val readVS1:      UInt = UInt(parameter.datapathWidth.W) // loaded into the accumulator when newInstruction is high
+  val source2:      UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) // folded one datapathWidth slice per cycle
+  val sourceValid:  UInt = UInt(parameter.laneNumber.W) // not consumed by MaskReduce yet (see the todo there)
+  val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W)
+  val lastGroup:    Bool = Bool() // out.valid is only raised for a request with lastGroup set
+  val vxrm:         UInt = UInt(3.W) // forwarded to the float adder as roundingMode
+  val aluUop:       UInt = UInt(4.W) // opcode for the logic unit; bit 2 also drives the adder opcode and float compare isMax
+  val sign:         Bool = Bool() // forwarded to the integer adder
+}
+// Result bundle produced by MaskReduce.
+class ReduceOutput(parameter: T1Parameter) extends Bundle {
+  val data: UInt = UInt(parameter.datapathWidth.W) // driven from reduceResult
+}
+// Reduction state machine: accumulates the slices of each request into reduceInit, with an optional final fold inside the data path.
+class MaskReduce(parameter: T1Parameter) extends Module {
+  val in:             DecoupledIO[ReduceInput] = IO(Flipped(Decoupled(new ReduceInput(parameter))))
+  val out:            ValidIO[ReduceOutput]    = IO(Valid(new ReduceOutput(parameter)))
+  val newInstruction: Bool                     = IO(Input(Bool()))
+
+  val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 // not referenced elsewhere in this module
+
+  // todo: uop decode (these two decode the incoming request, not reqReg)
+  val order:    Bool = in.bits.uop === "b101".U
+  val reqWiden: Bool = in.bits.uop === "b001".U
+
+  val eew1H:         UInt = UIntToOH(in.bits.eew)(2, 0)
+  val nextFoldCount: Bool = eew1H(0) && !reqWiden // initial lastFoldCount: set only for eew == 0 without widening
+
+  // reduce function unit
+  val adder:       Instance[ReduceAdder]          = Instantiate(new ReduceAdder(parameter.datapathWidth))
+  val logicUnit:   Instance[LaneLogic]            = Instantiate(new LaneLogic(parameter.datapathWidth))
+  // optional units for float reduce, present only when parameter.fpuEnable is set
+  val floatAdder:  Option[Instance[FloatAdder]]   =
+    Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(8, 24)))
+  val flotCompare: Option[Instance[FloatCompare]] =
+    Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(8, 24)))
+
+  // accumulator register, combinational result and fold counters
+  val reduceInit:     UInt = RegInit(0.U(parameter.datapathWidth.W))
+  val reduceResult:   UInt = Wire(UInt(parameter.datapathWidth.W))
+  val crossFoldCount: UInt = RegInit(0.U(log2Ceil(parameter.laneNumber).W))
+  val lastFoldCount:  Bool = RegInit(false.B)
+
+  val reqReg          = RegEnable(in.bits, 0.U.asTypeOf(in.bits), in.fire) // request captured on in.fire
+  // todo: handle reqReg.sourceValid
+  val groupLastReduce = crossFoldCount.andR // crossFoldCount is all ones
+  val lastFoldEnd     = !lastFoldCount
+  val outValid:       Bool = WireDefault(false.B)
+  // todo: skip float reduce
+  val skipFlotReduce: Bool = WireDefault(false.B)
+
+  val eew1HReg:  UInt = UIntToOH(reqReg.eew)(2, 0)
+  val floatType: Bool = reqReg.uop(2)
+  val NotAdd:    Bool = reqReg.uop(1)
+  val widen:     Bool = reqReg.uop === "b001".U
+  val needFold:  Bool = eew1HReg(0) || (eew1HReg(1) && !widen)
+
+  // crossFold: reduce between lane
+  // lastFold: reduce in data path
+  // orderRed: order reduce
+  val idle :: crossFold :: lastFold :: orderRed :: Nil = Enum(4)
+  val state: UInt = RegInit(idle)
+
+  val stateIdle:  Bool = state === idle
+  val stateCross: Bool = state === crossFold
+  val stateLast:  Bool = state === lastFold
+  val stateOrder: Bool = state === orderRed
+
+  // state update: a request is accepted only in idle
+  in.ready := stateIdle
+  when(stateIdle) {
+    when(in.valid) {
+      state := Mux(order, orderRed, crossFold)
+    }
+  }
+
+  when(stateCross) {
+    when(groupLastReduce) {
+      state    := Mux(reqReg.lastGroup && needFold, lastFold, idle)
+      outValid := reqReg.lastGroup && !needFold
+    }
+  }
+
+  when(stateOrder) {
+    when(groupLastReduce) {
+      state    := idle
+      outValid := reqReg.lastGroup
+    }
+  }
+
+  when(stateLast) {
+    when(lastFoldEnd) {
+      state    := idle
+      outValid := true.B
+    }
+  }
+
+  when(newInstruction) {
+    // todo: update reduceInit when first in.fire
+    reduceInit     := in.bits.readVS1
+    crossFoldCount := 0.U
+    lastFoldCount  := nextFoldCount
+  }
+
+  // count update: restart at 0 on in.fire, otherwise advance while in crossFold / orderRed
+  // todo: stateCross <=> stateOrder ??
+  when(stateCross || stateOrder || in.fire) {
+    crossFoldCount := Mux(in.fire, 0.U, crossFoldCount + 1.U)
+  }
+
+  // result update (as the later connection, this wins over the newInstruction load when both conditions hold)
+  when(!stateIdle) {
+    reduceInit := reduceResult
+  }
+
+  when(stateLast) {
+    lastFoldCount := false.B
+  }
+
+  val selectLaneResult: UInt = Mux1H(
+    UIntToOH(crossFoldCount),
+    cutUInt(reqReg.source2, parameter.datapathWidth)
+  )
+  val reduceDataVec = cutUInt(reduceInit, 8)
+  // lastFoldCount = false => abcd -> xxab | xxcd -> mask 0011
+  // lastFoldCount = true =>  abcd -> xaxc | xbxd -> mask 0101; NOTE(review): both concatenations below join three slices
+  val lastFoldSource1: UInt = Mux(
+    lastFoldCount,
+    reduceDataVec(3) ## reduceDataVec(3) ## reduceDataVec(1),
+    reduceDataVec(3) ## reduceDataVec(3) ## reduceDataVec(2)
+  )
+  val source2Select:   UInt = Mux(stateCross || stateOrder, selectLaneResult, lastFoldSource1)
+
+  // popCount is treated as a reduce add at the top level
+  adder.request.src    := VecInit(Seq(reduceInit, source2Select))
+  // todo: pop
+  adder.request.opcode := reqReg.aluUop(2)
+  adder.request.sign   := reqReg.sign
+  adder.request.vSew   := reqReg.eew
+
+  floatAdder.foreach { fAdder =>
+    fAdder.io.a            := reduceInit
+    fAdder.io.b            := source2Select
+    fAdder.io.roundingMode := reqReg.vxrm
+  }
+
+  flotCompare.foreach { fCompare =>
+    fCompare.io.a     := reduceInit
+    fCompare.io.b     := source2Select
+    // max -> 12, min -> 8 (the two values differ in bit 2)
+    fCompare.io.isMax := reqReg.aluUop(2)
+  }
+
+  logicUnit.req.src    := VecInit(Seq(reduceInit, source2Select))
+  logicUnit.req.opcode := reqReg.aluUop
+
+  val flotReduceResult: Option[UInt] = Option.when(parameter.fpuEnable)(
+    Mux(
+      skipFlotReduce,
+      reduceInit,
+      Mux(NotAdd, flotCompare.get.io.out, floatAdder.get.io.out)
+    )
+  )
+  // select result: float ops fall back to the integer adder output when the FPU is not instantiated
+  reduceResult := Mux(
+    floatType,
+    flotReduceResult.getOrElse(adder.response.data),
+    Mux(NotAdd, logicUnit.resp, adder.response.data)
+  )
+
+  out.valid     := outValid
+  out.bits.data := reduceResult
+}
diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala
new file mode 100644
index 000000000..5985a83fd
--- /dev/null
+++ b/t1/src/mask/MaskUnit.scala
@@ -0,0 +1,580 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu <liu@jiuyang.me>
+
+package org.chipsalliance.t1.rtl
+
+import chisel3._
+import chisel3.experimental.hierarchy.{instantiable, public}
+import chisel3.util._
+import org.chipsalliance.t1.rtl.decoder.Decoder
+
+// top uop decode
+// uu ii x -> uu: unit index; ii: Internal encoding, x: additional encode
+
+// slide & gather unit, needs to read the vrf in the mask unit(00)
+// 00 00 x -> slide; x ? up : down
+// 00 01 x -> slide1; x ? up : down
+// 00 10 x -> gather; x ? 16 : sew  todo:(multi address check/ index -> data cache?)
+
+// compress & viota unit & vmv(01)
+// These instructions cannot extend their execution width indefinitely.
+// 01 00 x -> x ? compress : viota
+// 01 01 x -> vmv; x: write rd ?
+
+// reduce unit(10) n + 8 + m -> n + 3 + m // Folded into datapath, then folded into sew
+// The Reduce instruction folds the data.
+// Considering the sequential addition, a state machine is needed to control it.
+// 10 00 x -> adder; x: widen reduce?
+// 10 01 x -> logic; x: dc
+// 10 10 x -> floatAdder; x: order?
+// 10 11 x -> flotCompare; x: dc
+
+// extend unit & maskdestination(11)
+// These instructions write an entire data path each time they are executed.
+// 11 mm x -> s(z)ext; mm: multiple(00, 01, 10); x ? sign : zero
+// 11 11 1 -> maskdestination
+@instantiable
+class MaskUnit(parameter: T1Parameter) extends Module {
+  // Mask unit: buffers per-lane requests, reads the VRF via readCrossBar, runs the compress/reduce/extend units and writes back per lane.
+  val readQueueSize:          Int = 4 // todo: param. NOTE(review): unused in this class; readDataQueue below hard-codes depth 4
+  val readVRFLatency:         Int = 2 // todo: param. Pipe stages between a read firing and its readResult being captured
+  val maskUnitWriteQueueSize: Int = 8 // todo: param. Depth of each per-lane write queue
+
+  @public
+  val instReq: ValidIO[MaskUnitInstReq] = IO(Flipped(Valid(new MaskUnitInstReq(parameter)))) // latched into instReg when valid
+
+  @public
+  val exeReq: Seq[DecoupledIO[MaskUnitExeReq]] = Seq.tabulate(parameter.laneNumber) { _ => // per-lane requests, buffered in exeRequestQueue
+    IO(Flipped(Decoupled(new MaskUnitExeReq(parameter.laneParam))))
+  }
+
+  @public
+  val exeResp: Seq[ValidIO[MaskUnitExeResponse]] = Seq.tabulate(parameter.laneNumber) { _ => // per-lane write-back, driven from writeQueue
+    IO(Valid(new MaskUnitExeResponse(parameter.laneParam)))
+  }
+
+  @public
+  val readChannel: Seq[DecoupledIO[VRFReadRequest]] = Seq.tabulate(parameter.laneNumber) { _ => // per-lane VRF read requests from readCrossBar.output
+    IO(
+      Decoupled(
+        new VRFReadRequest(
+          parameter.vrfParam.regNumBits,
+          parameter.laneParam.vrfOffsetBits,
+          parameter.instructionIndexBits
+        )
+      )
+    )
+  }
+
+  @public
+  val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => // per-lane VRF read data, selected by the piped read-fire info
+    IO(Input(UInt(parameter.datapathWidth.W)))
+  }
+
+  @public
+  val writeRD: ValidIO[UInt] = IO(Valid(UInt(parameter.datapathWidth.W))) // NOTE(review): currently connected to DontCare
+
+  /** input of mask data */
+  @public
+  val maskInput: UInt = IO(Input(UInt(parameter.maskGroupWidth.W)))
+
+  /** select which mask group. */
+  @public
+  val maskSelect: UInt = IO(Output(UInt(parameter.laneParam.maskGroupSizeBits.W)))
+
+  @public
+  val lastReport: UInt = IO(Output(UInt(parameter.chainingSize.W))) // instruction-index one-hot, gated by lastReportValid
+
+  val instReg:          MaskUnitInstReq = RegEnable(instReq.bits, 0.U.asTypeOf(instReq.bits), instReq.valid)
+  val sew1H:            UInt            = UIntToOH(instReg.sew)(2, 0)
+  val lastExecuteIndex: UInt            = Mux1H(sew1H, Seq(3.U(2.W), 2.U(2.W), 0.U(2.W))) // last executeIndex per sew; executeIndex steps by 1 << sew
+
+  // calculate last group
+  val readDataEew1H:    UInt = sew1H
+  val lastElementIndex: UInt = (instReg.vl - instReg.vl.orR)(parameter.laneParam.vlMaxBits - 2, 0) // vl - 1, kept at 0 when vl == 0
+  val laneNumberBits:   Int  = 1.max(log2Ceil(parameter.laneNumber))
+
+  /** Group that holds the last element of the instruction (not every lane takes part in it). lastElementIndex is
+    * split as group | lane | element-in-lane: xxxxx xxx xx -> vsew = 0, xxxxxx xxx x -> vsew = 1, xxxxxxx xxx -> vsew = 2
+    */
+  val lastGroupForInstruction: UInt = Mux1H(
+    readDataEew1H,
+    Seq(
+      lastElementIndex(parameter.laneParam.vlMaxBits - 2, laneNumberBits + 2),
+      lastElementIndex(parameter.laneParam.vlMaxBits - 2, laneNumberBits + 1),
+      lastElementIndex(parameter.laneParam.vlMaxBits - 2, laneNumberBits)
+    )
+  )
+
+  /** Which lane the last element is in. */
+  val lastLaneIndex:     UInt = Mux1H(
+    readDataEew1H,
+    Seq(
+      lastElementIndex(laneNumberBits + 2 - 1, 2),
+      lastElementIndex(laneNumberBits + 1 - 1, 1),
+      lastElementIndex(laneNumberBits - 1, 0)
+    )
+  )
+  val lastGroupDataNeed: UInt = scanRightOr(UIntToOH(lastLaneIndex)) // presumably lanes 0..lastLaneIndex; confirm scanRightOr direction
+
+  // from decode
+  val unitType: UInt = UIntToOH(instReg.decodeResult(Decoder.topUop)(4, 3)) // one-hot of the "uu" unit index described in the header comment
+  val readType: Bool = unitType(0)
+  val gather16: Bool = instReg.decodeResult(Decoder.topUop) === "b00101".U // topUop 00 10 1: gather with 16-bit index
+
+  val sewCorrection: UInt = Mux(gather16, 1.U, instReg.sew) // width used to decode the index data: forced to sew = 1 for gather16
+
+  val exeRequestQueue: Seq[Queue[MaskUnitExeReq]] = exeReq.map { req =>
+    // todo: max or token?
+    val queue: Queue[MaskUnitExeReq] = Module(new Queue(chiselTypeOf(req.bits), 16, flow = true))
+    queue.io.enq.valid := req.valid
+    req.ready          := queue.io.enq.ready
+    queue.io.enq.bits  := req.bits
+    queue
+  }
+
+  val exeReqReg:         Seq[ValidIO[MaskUnitExeReq]] = Seq.tabulate(parameter.laneNumber) { _ =>
+    RegInit(0.U.asTypeOf(Valid(new MaskUnitExeReq(parameter.laneParam))))
+  }
+  val lastGroup:         Bool                         = exeReqReg.head.bits.groupCounter === lastGroupForInstruction
+  // todo: mask
+  val groupDataNeed:     UInt                         = Mux(lastGroup, lastGroupDataNeed, (-1.S(parameter.laneNumber.W)).asUInt)
+  // For read type, only sew * laneNumber data will be consumed each time
+  // There will be a maximum of (dataPath * laneNumber) / (sew * laneNumber) times
+  val executeIndex:      UInt                         = RegInit(0.U(2.W))
+  // The status of an execution
+  // Each execution ends with executeIndex + 1
+  val readGroupState:    MaskUnitExecuteState         = RegInit(0.U.asTypeOf(new MaskUnitExecuteState(parameter)))
+  val executeStateValid: Bool                         = RegInit(false.B)
+  // Decode one element index at sew = sewInt into Seq(accessMask, dataOffset, accessLane, offset, reallyGrowth, notNeedRead, elementValid).
+  def indexAnalysis(sewInt: Int)(elementIndex: UInt, vlmul: UInt, valid: Option[Bool] = None): Seq[UInt] = {
+    val intLMULInput: UInt = (1.U << vlmul(1, 0)).asUInt
+    val positionSize = parameter.laneParam.vlMaxBits - 1
+    val dataPosition = (changeUIntSize(elementIndex, positionSize) << sewInt).asUInt(positionSize - 1, 0) // byte position: elementIndex << sewInt
+    val accessMask: UInt = Seq(
+      UIntToOH(dataPosition(1, 0)),
+      FillInterleaved(2, UIntToOH(dataPosition(1))),
+      15.U(4.W)
+    )(sewInt)
+    // The offset of the data starting position in 32 bits (currently only 32).
+    // Since the data may cross lanes, it will be optimized during fusion.
+    // (dataPosition(1) && sewOHInput(1, 0).orR) ## (dataPosition(0) && sewOHInput(0))
+    val dataOffset: UInt =
+      (if (sewInt < 2) dataPosition(1) else false.B) ##
+        (if (sewInt == 0) dataPosition(0) else false.B)
+    val accessLane = if (parameter.laneNumber > 1) dataPosition(log2Ceil(parameter.laneNumber) + 1, 2) else 0.U(1.W)
+    // 32 bit / group
+    val dataGroup  = (dataPosition >> (log2Ceil(parameter.laneNumber) + 2)).asUInt
+    val offsetWidth: Int = parameter.laneParam.vrfParam.vrfOffsetBits
+    val offset            = dataGroup(offsetWidth - 1, 0)
+    val accessRegGrowth   = (dataGroup >> offsetWidth).asUInt
+    val decimalProportion = offset ## accessLane
+    // 1/8 register
+    val decimal           = decimalProportion(decimalProportion.getWidth - 1, 0.max(decimalProportion.getWidth - 3))
+
+    /** elementIndex must be checked against vlMax (vLen * lmul / sew). Computing that directly is too complicated, so
+      * instead the register increment of the read is compared with lmul to detect an index beyond vlMax. vlmul(2)
+      * selects the compare on `decimal` (presumably fractional LMUL); otherwise accessRegGrowth is compared.
+      */
+    val overlap      =
+      (vlmul(2) && decimal >= intLMULInput(3, 1)) ||
+        (!vlmul(2) && accessRegGrowth >= intLMULInput) ||
+        (elementIndex >> log2Ceil(parameter.vLen)).asUInt.orR
+    val elementValid = valid.getOrElse(true.B)
+    val notNeedRead  = overlap || !elementValid
+    val reallyGrowth: UInt = changeUIntSize(accessRegGrowth, 3)
+    Seq(accessMask, dataOffset, accessLane, offset, reallyGrowth, notNeedRead, elementValid)
+  }
+
+  // datapath bit per mask group
+  // laneNumber bit per execute group
+  val executeGroup: UInt = Mux1H(
+    sew1H,
+    Seq(
+      exeReqReg.head.bits.groupCounter ## executeIndex,
+      exeReqReg.head.bits.groupCounter ## executeIndex(1),
+      exeReqReg.head.bits.groupCounter
+    )
+  )
+
+  val executeSizeBit: Int = log2Ceil(parameter.laneNumber)
+  val vlMisAlign = instReg.vl(executeSizeBit - 1, 0).orR
+  val lastexecuteGroup:     UInt = (instReg.vl >> executeSizeBit).asUInt - !vlMisAlign // execute group that contains element vl - 1
+  val isVlBoundary:         Bool = executeGroup === lastexecuteGroup
+  val validExecuteGroup:    Bool = executeGroup <= lastexecuteGroup
+  val vlBoundaryCorrection: UInt = Mux(
+    vlMisAlign && isVlBoundary,
+    (~scanLeftOr(UIntToOH(instReg.vl(executeSizeBit - 1, 0)))).asUInt,
+    -1.S(parameter.laneNumber.W).asUInt
+  ) & Fill(parameter.laneNumber, validExecuteGroup)
+
+  maskSelect := executeGroup >> log2Ceil(parameter.datapathWidth / parameter.laneNumber)
+  val selectReadStageMask: UInt = cutUIntBySize(maskInput, 4)(executeGroup(1, 0)) // NOTE(review): slice count 4 is hard-coded
+  val maskCorrection:      UInt =
+    Mux(instReg.maskType, selectReadStageMask, -1.S(parameter.laneNumber.W).asUInt) &
+      vlBoundaryCorrection
+
+  val checkVec:           Seq[Seq[UInt]] = Seq(0, 1, 2).map { sewInt =>
+    val dataByte = 1 << sewInt
+    // All data of this group
+    val groupSourceData:  UInt = VecInit(exeReqReg.map(_.bits.source1)).asUInt
+    val groupSourceValid: UInt = VecInit(exeReqReg.map(_.valid)).asUInt
+    // Single use length
+    val singleWidth  = dataByte * 8 * parameter.laneNumber
+    // How many times will a set of data be executed?
+    val executeTimes = (parameter.datapathWidth / 8) / dataByte
+    // Which part is selected as the source data this time?
+    val executeDataSelect1H: UInt = if (sewInt == 0) {
+      UIntToOH(executeIndex)
+    } else if (sewInt == 1) {
+      UIntToOH(executeIndex(1))
+    } else {
+      true.B
+    }
+    // Select source data
+    val sourceSelect = Mux1H(
+      executeDataSelect1H,
+      cutUInt(groupSourceData, singleWidth)
+    )
+    val validSelect: UInt = Mux1H(
+      executeDataSelect1H,
+      cutUInt(groupSourceValid, singleWidth / parameter.datapathWidth)
+    )
+
+    // The length of an element
+    val dataWidth = 8 * dataByte
+    // Split into elements
+    val source    = cutUInt(sourceSelect, dataWidth)
+    val validVec  = FillInterleaved(parameter.datapathWidth / dataWidth, validSelect) & maskCorrection
+    // read index check
+    // (accessMask, dataOffset, accessLane, offset, reallyGrowth, notNeedRead, elementValid)
+    val checkResultVec: Seq[Seq[UInt]] = source.zipWithIndex.map { case (s, i) =>
+      indexAnalysis(sewInt)(s, instReg.vlmul, Some(validVec(i)))
+    }
+    val checkResult = checkResultVec.transpose.map(a => VecInit(a).asUInt)
+    checkResult
+  }
+  val sewCorrection1H:    UInt           = UIntToOH(sewCorrection)(2, 0)
+  val dataOffsetSelect:   UInt           = Mux1H(sewCorrection1H, checkVec.map(_(1)))
+  val accessLaneSelect:   UInt           = Mux1H(sewCorrection1H, checkVec.map(_(2)))
+  val offsetSelect:       UInt           = Mux1H(sewCorrection1H, checkVec.map(_(3)))
+  val growthSelect:       UInt           = Mux1H(sewCorrection1H, checkVec.map(_(4)))
+  val notReadSelect:      UInt           = Mux1H(sewCorrection1H, checkVec.map(_(5)))
+  val elementValidSelect: UInt           = Mux1H(sewCorrection1H, checkVec.map(_(6)))
+
+  val readCrossBar: MaskUnitReadCrossBar = Module(new MaskUnitReadCrossBar(parameter))
+
+  // The queue waiting to read data. This queue contains other information about this group.
+  // 64: todo: max or token?
+  val readWaitQueue: Queue[MaskUnitWaitReadQueue] =
+    Module(new Queue(new MaskUnitWaitReadQueue(parameter), 64))
+
+  // s0 pipe request from lane
+  val laseExecuteGroupDeq: Bool = Wire(Bool()) // NOTE(review): "lase" is presumably a typo for "last"
+  exeRequestQueue.zip(exeReqReg).foreach { case (req, reg) =>
+    req.io.deq.ready := !reg.valid || laseExecuteGroupDeq
+    when(req.io.deq.fire) {
+      reg.bits := req.io.deq.bits
+    }
+    when(req.io.deq.fire ^ laseExecuteGroupDeq) {
+      reg.valid := req.io.deq.fire
+    }
+  }
+
+  val isLastExecuteGroup: Bool = executeIndex === lastExecuteIndex
+  val allDataValid:       Bool = exeReqReg.zipWithIndex.map { case (d, i) => d.valid || !groupDataNeed(i) }.reduce(_ && _)
+  val canIssueGroup:      Bool = allDataValid && readWaitQueue.io.enq.ready // NOTE(review): not referenced anywhere in this class
+
+  // select execute group
+  val selectExecuteReq: Seq[ValidIO[MaskUnitReadReq]] = exeReqReg.zipWithIndex.map { case (_, index) =>
+    val res: ValidIO[MaskUnitReadReq] = WireInit(0.U.asTypeOf(Valid(new MaskUnitReadReq(parameter))))
+    res.bits.vs           := instReg.vs2 + readGroupState.vsGrowth(index)
+    res.bits.offset       := readGroupState.readOffset(index)
+    res.bits.readLane     := readGroupState.accessLane(index)
+    res.bits.dataOffset   := cutUIntBySize(readGroupState.readDataOffset, parameter.laneNumber)(index)
+    res.bits.requestIndex := index.U
+    res.valid             := executeStateValid && !readGroupState.groupReadState(index) && readGroupState.needRead(index)
+    res
+  }
+
+  // read arbitration
+  readCrossBar.input.zip(selectExecuteReq).foreach { case (cross, req) =>
+    cross.valid := req.valid
+    cross.bits  := req.bits
+  }
+
+  // read control register update
+  val readFire:        UInt = VecInit(readCrossBar.input.map(_.fire)).asUInt
+  val anyReadFire:     Bool = readFire.orR
+  val readStateUpdate: UInt = readFire | readGroupState.groupReadState
+  val groupReadFinish: Bool = readStateUpdate === readGroupState.needRead
+  val readStateDeq:    Bool = (anyReadFire && groupReadFinish) || (executeStateValid && readGroupState.needRead === 0.U) // last needed read fires, or nothing to read
+  val executeStateEnq: Bool = allDataValid && (readStateDeq || !executeStateValid)
+  when(anyReadFire) {
+    readGroupState.groupReadState := readStateUpdate
+  }
+
+  when(readStateDeq ^ executeStateEnq) {
+    executeStateValid := executeStateEnq
+  }
+
+  val executeIndexGrowth: UInt = (1.U << instReg.sew).asUInt
+  when(executeStateEnq) {
+    readGroupState.groupReadState := 0.U
+    readGroupState.needRead       := (~notReadSelect).asUInt
+    readGroupState.elementValid   := elementValidSelect
+    readGroupState.accessLane     := cutUIntBySize(accessLaneSelect, parameter.laneNumber)
+    readGroupState.vsGrowth       := cutUIntBySize(growthSelect, parameter.laneNumber)
+    readGroupState.readOffset     := offsetSelect
+    readGroupState.groupCount     := exeReqReg.head.bits.groupCounter
+    readGroupState.executeIndex   := executeIndex
+    readGroupState.readDataOffset := dataOffsetSelect
+    readGroupState.last           := isVlBoundary
+    executeIndex                  := executeIndex + executeIndexGrowth
+  }
+
+  readWaitQueue.io.enq.valid             := readStateDeq // NOTE(review): readStateDeq does not include readWaitQueue.io.enq.ready
+  readWaitQueue.io.enq.bits.groupCounter := readGroupState.groupCount
+  readWaitQueue.io.enq.bits.executeIndex := readGroupState.executeIndex
+  readWaitQueue.io.enq.bits.sourceValid  := readGroupState.elementValid
+  readWaitQueue.io.enq.bits.needRead     := readGroupState.needRead
+  readWaitQueue.io.enq.bits.last         := readGroupState.last
+
+  laseExecuteGroupDeq := Mux(readType, executeStateEnq, readWaitQueue.io.enq.fire) && isLastExecuteGroup
+
+  // s1 read vrf
+  val write1HPipe:    Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.laneNumber.W)))
+  val pipeDataOffset: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(log2Ceil(parameter.datapathWidth / 8).W)))
+
+  readCrossBar.output.zipWithIndex.foreach { case (request, index) =>
+    val sourceLane = UIntToOH(request.bits.writeIndex)
+    readChannel(index).valid                 := request.valid
+    readChannel(index).bits.readSource       := 2.U
+    readChannel(index).bits.vs               := request.bits.vs
+    readChannel(index).bits.offset           := request.bits.offset
+    readChannel(index).bits.instructionIndex := instReg.instructionIndex
+    request.ready                            := readChannel(index).ready
+
+    // pipe read fire
+    val pipeRead   = Pipe(readChannel(index).fire, sourceLane, readVRFLatency)
+    val pipeOffset = Pipe(readChannel(index).fire, request.bits.dataOffset, readVRFLatency)
+    write1HPipe(index)    := Mux(pipeRead.valid, pipeRead.bits, 0.U(parameter.laneNumber.W))
+    pipeDataOffset(index) := pipeOffset.bits
+  }
+
+  // Processing read results
+  val readData: Seq[DecoupledIO[UInt]] = Seq.tabulate(parameter.laneNumber) { index =>
+    // todo: assert enq.read & use token
+    val readDataQueue    = Module(new Queue(UInt(parameter.datapathWidth.W), 4, flow = true))
+    val readResultSelect = VecInit(write1HPipe.map(_(index))).asUInt
+    val dataOffset: UInt = Mux1H(readResultSelect, pipeDataOffset)
+    readDataQueue.io.enq.valid := readResultSelect.orR
+    readDataQueue.io.enq.bits  := Mux1H(readResultSelect, readResult) >> (dataOffset ## 0.U(3.W)) // shift right by dataOffset bytes
+    readDataQueue.io.deq
+  }
+
+  /** todo: [[waiteReadDataPipeReg]] enq && [[readWaitQueue]] enq * */
+  // reg before execute
+  val waiteReadDataPipeReg: MaskUnitWaitReadQueue = RegInit(0.U.asTypeOf(new MaskUnitWaitReadQueue(parameter)))
+  val waiteReadData:        Seq[UInt]             = Seq.tabulate(parameter.laneNumber) { _ => RegInit(0.U(parameter.datapathWidth.W)) }
+  val waiteReadSate:        UInt                  = RegInit(0.U(parameter.laneNumber.W))
+  val waiteReadStageValid:  Bool                  = RegInit(false.B)
+
+  // Process the data that needs to be written
+  val dlen: Int = parameter.datapathWidth * parameter.laneNumber
+  // Execute at most 4 times, each index represents 1/4 of dlen
+  val eachIndexSize = dlen / 4
+  val writeDataVec  = Seq(0, 1, 2).map { sewInt =>
+    val dataByte = 1 << sewInt
+    val data     = VecInit(Seq.tabulate(parameter.laneNumber) { laneIndex =>
+      val dataElement: UInt = Wire(UInt((dataByte * 8).W))
+      val dataIsRead = waiteReadDataPipeReg.needRead(laneIndex)
+      // todo: select vs1 when slide1
+      dataElement := Mux(dataIsRead, waiteReadData(laneIndex), 0.U)
+      dataElement
+    }).asUInt
+
+    val shifterData = (data << (waiteReadDataPipeReg.executeIndex ## 0.U(log2Ceil(eachIndexSize).W))).asUInt // shift left by executeIndex * eachIndexSize bits
+    // align
+    changeUIntSize(shifterData, dlen)
+  }
+  val writeData     = Mux1H(sew1H, writeDataVec)
+
+  val writeMaskVec: Seq[UInt] = Seq(0, 1, 2).map { sewInt =>
+    val MaskMagnification = 1 << sewInt
+    val mask              = FillInterleaved(MaskMagnification, waiteReadDataPipeReg.sourceValid)
+    val shifterMask       = (mask << (waiteReadDataPipeReg.executeIndex ## 0.U(log2Ceil(eachIndexSize / 8).W))).asUInt // same placement, counted in bytes
+    // align
+    changeUIntSize(shifterMask, dlen / 8)
+  }
+  val writeMask = Mux1H(sew1H, writeMaskVec)
+
+  val writeRequest:  Seq[MaskUnitExeResponse] = Seq.tabulate(parameter.laneNumber) { laneIndex =>
+    val res: MaskUnitExeResponse = Wire(new MaskUnitExeResponse(parameter.laneParam))
+    res.ffoByOther             := false.B
+    res.index                  := instReg.instructionIndex
+    res.writeData.groupCounter := waiteReadDataPipeReg.groupCounter
+    res.writeData.data         := cutUIntBySize(writeData, parameter.laneNumber)(laneIndex)
+    res.writeData.mask         := cutUIntBySize(writeMask, parameter.laneNumber)(laneIndex)
+    res
+  }
+  val WillWriteLane: UInt                     = VecInit(cutUIntBySize(writeMask, parameter.laneNumber).map(_.orR)).asUInt
+
+  // update the wait-read stage
+  val waiteStageDeqValid: Bool =
+    waiteReadStageValid &&
+      (waiteReadSate === waiteReadDataPipeReg.needRead || waiteReadDataPipeReg.needRead === 0.U)
+  val waiteStageDeqReady: Bool = Wire(Bool())
+  val waiteStageDeqFire:  Bool = waiteStageDeqValid && waiteStageDeqReady
+
+  val waiteStageEnqReady: Bool = !waiteReadStageValid || waiteStageDeqFire
+  val waiteStageEnqFire:  Bool = readWaitQueue.io.deq.valid && waiteStageEnqReady
+
+  readWaitQueue.io.deq.ready := waiteStageEnqReady
+
+  when(waiteStageEnqFire) {
+    waiteReadDataPipeReg := readWaitQueue.io.deq.bits
+  }
+
+  when(waiteStageDeqFire ^ waiteStageEnqFire) {
+    waiteReadStageValid := waiteStageEnqFire
+  }
+
+  waiteReadData.zipWithIndex.foreach { case (reg, index) =>
+    val isWaiteForThisData = waiteReadDataPipeReg.needRead(index) && !waiteReadSate(index) && waiteReadStageValid
+    val read               = readData(index)
+    read.ready := isWaiteForThisData
+    when(read.fire) {
+      reg := read.bits
+    }
+  }
+  val readResultValid: UInt = VecInit(readData.map(_.fire)).asUInt
+  when(waiteStageEnqFire && readResultValid.orR) {
+    waiteReadSate := readResultValid // a new group enters: restart from this cycle's responses
+  }.elsewhen(readResultValid.orR) {
+    waiteReadSate := waiteReadSate | readResultValid
+  }.elsewhen(waiteStageEnqFire) {
+    waiteReadSate := 0.U
+  }
+
+  // Determine whether the data is ready
+  val executeEnqValid: Bool = waiteReadStageValid && waiteReadDataPipeReg.needRead === waiteReadSate
+
+  // start execute
+  val compressUnit: MaskCompress = Module(new MaskCompress(parameter))
+  val reduceUnit:   MaskReduce   = Module(new MaskReduce(parameter))
+  val extendUnit:   MaskExtend   = Module(new MaskExtend(parameter))
+
+  // todo
+  val source2: UInt = VecInit(exeReqReg.map(_.bits.source2)).asUInt
+  val source1: UInt = VecInit(exeReqReg.map(_.bits.source1)).asUInt
+
+  compressUnit.in.valid               := executeEnqValid // NOTE(review): not gated by unitType(1), unlike reduceUnit.in.valid below
+  compressUnit.in.bits.maskType       := instReg.maskType
+  compressUnit.in.bits.eew            := instReg.sew
+  compressUnit.in.bits.uop            := instReg.decodeResult(Decoder.topUop)
+  compressUnit.in.bits.readFromScalar := instReg.readFromScala
+  compressUnit.in.bits.source1        := source1
+  compressUnit.in.bits.source2        := source2
+  compressUnit.in.bits.groupCounter   := waiteReadDataPipeReg.groupCounter
+  compressUnit.in.bits.lastCompress   := lastGroup
+  compressUnit.newInstruction         := instReq.valid
+
+  reduceUnit.in.valid             := executeEnqValid && unitType(2)
+  reduceUnit.in.bits.maskType     := instReg.maskType
+  reduceUnit.in.bits.eew          := instReg.sew
+  reduceUnit.in.bits.uop          := instReg.decodeResult(Decoder.topUop)
+  reduceUnit.in.bits.readVS1      := source1
+  reduceUnit.in.bits.source2      := source2
+  reduceUnit.in.bits.sourceValid  := waiteReadDataPipeReg.sourceValid
+  reduceUnit.in.bits.groupCounter := waiteReadDataPipeReg.groupCounter
+  reduceUnit.in.bits.lastGroup    := lastGroup
+  reduceUnit.in.bits.vxrm         := instReg.vxrm
+  reduceUnit.in.bits.aluUop       := instReg.decodeResult(Decoder.uop)
+  reduceUnit.in.bits.sign         := !instReg.decodeResult(Decoder.unsigned1)
+  reduceUnit.newInstruction       := instReq.valid
+
+  extendUnit.in.eew          := instReg.sew
+  extendUnit.in.uop          := instReg.decodeResult(Decoder.topUop)
+  extendUnit.in.source2      := source2
+  extendUnit.in.groupCounter := waiteReadDataPipeReg.groupCounter
+
+  val executeResult = Mux1H(
+    unitType,
+    Seq(
+      source2,
+      compressUnit.out.data,
+      reduceUnit.out.bits.data,
+      extendUnit.out
+    )
+  )
+
+  // todo
+  val executeMask:     UInt = VecInit(exeReqReg.map(_.bits.source2)).asUInt // NOTE(review): placeholder, the write mask is taken from source2
+  val executeDeqCount: UInt = waiteReadDataPipeReg.groupCounter
+
+  //  val executeValid = Mux1H(
+  //    unitType,
+  //    Seq(
+  //      executeEnqValid,
+  //      compressUnit.out.compressValid,
+  //      reduceUnit.out.valid,
+  //      executeEnqValid
+  //    )
+  //  )
+  val executeValid: Bool = Mux1H(
+    unitType(3, 1),
+    Seq(
+      compressUnit.out.compressValid,
+      reduceUnit.out.valid,
+      executeEnqValid
+    )
+  )
+
+  // mask unit write queue
+  val writeQueue: Seq[Queue[MaskUnitExeResponse]] = Seq.tabulate(parameter.laneNumber) { _ =>
+    Module(
+      new Queue(
+        new MaskUnitExeResponse(parameter.laneParam),
+        maskUnitWriteQueueSize
+      )
+    )
+  }
+
+  writeQueue.zipWithIndex.foreach { case (queue, index) =>
+    val readTypeWriteVrf: Bool = waiteStageDeqFire && WillWriteLane(index) // write issued from the wait-read stage; overrides enq.bits below
+    queue.io.enq.valid                       := executeValid || readTypeWriteVrf
+    queue.io.enq.bits.writeData.data         := cutUInt(executeResult, parameter.datapathWidth)(index)
+    queue.io.enq.bits.writeData.mask         := cutUInt(executeMask, parameter.datapathWidth / 8)(index)
+    queue.io.enq.bits.writeData.groupCounter := executeDeqCount
+    queue.io.enq.bits.ffoByOther             := false.B // todo
+    queue.io.enq.bits.index                  := instReg.instructionIndex
+    when(readTypeWriteVrf) {
+      queue.io.enq.bits := writeRequest(index)
+    }
+
+    // write vrf
+    val writePort = exeResp(index)
+    queue.io.deq.ready := true.B // exeResp is Valid-only (no ready), so the queue always drains
+    writePort.valid    := queue.io.deq.valid
+    writePort.bits     := queue.io.deq.bits
+  }
+  waiteStageDeqReady := writeQueue.zipWithIndex.map { case (queue, index) =>
+    !WillWriteLane(index) || queue.io.enq.ready
+  }.reduce(_ && _)
+  writeRD <> DontCare // writeRD is left undriven for now
+
+  // todo: token
+  val waitQueueClear: Bool = RegInit(false.B) // set when the group marked `last` leaves the wait-read stage, cleared when lastReportValid
+  val lastReportValid = waitQueueClear && !writeQueue.map(_.io.deq.valid).reduce(_ || _)
+  when(lastReportValid) {
+    waitQueueClear := false.B
+  }
+  when(waiteStageDeqFire && waiteReadDataPipeReg.last) {
+    waitQueueClear := true.B
+  }
+  lastReport := maskAnd(
+    lastReportValid,
+    indexToOH(instReg.instructionIndex, parameter.chainingSize)
+  )
+}
diff --git a/t1/src/mask/MaskUnitReadCrossBar.scala b/t1/src/mask/MaskUnitReadCrossBar.scala
new file mode 100644
index 000000000..dab845d9e
--- /dev/null
+++ b/t1/src/mask/MaskUnitReadCrossBar.scala
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu <liu@jiuyang.me>
+
+package org.chipsalliance.t1.rtl
+
+import chisel3._
+import chisel3.util._
+
+class MaskUnitReadCrossBar(parameter: T1Parameter) extends Module {
+  // Fixed-priority crossbar between the read requesters and the per-lane read ports.
+  // Requester i targets lane `bits.readLane`; when several requesters target the same lane in a cycle,
+  // the lowest index is granted and the others see ready low.
+
+  // One request port per requester, in priority order.
+  val input:  Seq[DecoupledIO[MaskUnitReadReq]]   =
+    Seq.tabulate(parameter.laneNumber)(_ => IO(Flipped(Decoupled(new MaskUnitReadReq(parameter)))))
+
+  // One read port per target lane.
+  val output: Seq[DecoupledIO[MaskUnitReadQueue]] =
+    Seq.tabulate(parameter.laneNumber)(_ => IO(Decoupled(new MaskUnitReadQueue(parameter))))
+
+  // inputSelect1H(i) is the one-hot lane granted to requester i, or all zero when it is idle or blocked.
+  val inputSelect1H: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.laneNumber.W)))
+
+  // Visit the requesters in index order while accumulating the lanes that have already been granted.
+  input.zipWithIndex.foldLeft(0.U(parameter.laneNumber.W)) { case (grantedLanes, (requester, requesterIndex)) =>
+    val targetLane1H = UIntToOH(requester.bits.readLane)
+    // the target lane was not taken by a lower-index requester
+    val laneAvailable: Bool = (targetLane1H & (~grantedLanes).asUInt).orR
+    val targetReady:   Bool = Mux1H(targetLane1H, output.map(_.ready))
+    inputSelect1H(requesterIndex) := Mux(requester.valid && laneAvailable, targetLane1H, 0.U(parameter.laneNumber.W))
+    requester.ready               := laneAvailable && targetReady
+    // pass the updated occupancy on to the next requester
+    grantedLanes | inputSelect1H(requesterIndex)
+  }
+
+  // Each lane port forwards the request of whichever requester was granted that lane.
+  output.zipWithIndex.foreach { case (port, laneIndex) =>
+    // bit i is set when requester i holds the grant for this lane
+    val grantVec: UInt = VecInit(inputSelect1H.map(_(laneIndex))).asUInt
+    val granted:  DecoupledIO[MaskUnitReadReq] = Mux1H(grantVec, input)
+    port.valid           := grantVec.orR
+    port.bits.vs         := granted.bits.vs
+    port.bits.offset     := granted.bits.offset
+    port.bits.writeIndex := granted.bits.requestIndex
+    port.bits.dataOffset := granted.bits.dataOffset
+  }
+}
diff --git a/t1/src/package.scala b/t1/src/package.scala
index b0afc12be..ba1519b6b 100644
--- a/t1/src/package.scala
+++ b/t1/src/package.scala
@@ -10,6 +10,8 @@ import chisel3.util.experimental.decode.DecodeBundle
 import org.chipsalliance.t1.rtl.decoder.{Decoder, TableGenerator}
 import org.chipsalliance.t1.rtl.lane.Distributor
 
+import scala.jdk.CollectionConverters._
+
 package object rtl {
   def csa32(s: UInt, c: UInt, a: UInt): (UInt, UInt) = {
     val xor = s ^ c
@@ -74,6 +76,30 @@ package object rtl {
     })
   }
 
+  def cutUIntBySize(data: UInt, size: Int): Vec[UInt] = {
+    // Split `data` into `size` slices of equal width (the slicing itself is done by cutUInt).
+    require(size > 0 && data.getWidth % size == 0, s"cannot cut ${data.getWidth} bits into $size equal slices")
+    cutUInt(data, data.getWidth / size)
+  }
+
+  def changeUIntSize(data: UInt, size: Int, sign: Boolean = false): UInt = {
+    // Narrowing keeps the low `size` bits; widening pads the MSB side with the sign bit (or zero).
+    val missingBits: Int = size - data.getWidth
+    if (missingBits > 0) {
+      val padBit = if (sign) data(data.getWidth - 1) else false.B
+      Fill(missingBits, padBit) ## data
+    } else data(size - 1, 0)
+  }
+
+  def UIntWithSize(data: UInt, width: Int, signExtend: Boolean = false): UInt = {
+    // Resize `data` to exactly `width` bits: keep the low bits when narrowing, otherwise pad the MSB
+    // side with the sign bit (signExtend) or with zero.
+    // This is the same contract as changeUIntSize, so delegate to it: the equal-width case then
+    // returns the plain bit slice instead of concatenating a zero-width Fill, and the sign bit is
+    // only extracted when padding is actually needed.
+    changeUIntSize(data, width, signExtend)
+  }
+
   def calculateSegmentWriteMask(
     datapath:                  Int,
     laneNumber:                Int,
diff --git a/t1/src/sequencer/T1TokenManager.scala b/t1/src/sequencer/T1TokenManager.scala
index b80ed2dcd..de19e9be0 100644
--- a/t1/src/sequencer/T1TokenManager.scala
+++ b/t1/src/sequencer/T1TokenManager.scala
@@ -7,10 +7,20 @@ import chisel3._
 import chisel3.experimental.hierarchy.{instantiable, public}
 import chisel3.util._
 
+class IssueToken(parameter: T1Parameter) extends Bundle { // payload of T1TokenManager.instructionIssue
+  val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) // converted to one-hot (issueIndex1H) to pick the token bit
+  val writeV0:          Bool = Bool() // sets the v0 write token at issue; also checked against the use-v0-as-mask token
+  val useV0AsMask:      Bool = Bool() // sets the use-v0-as-mask token at issue; also checked against the v0 write token
+  val isLoadStore:      Bool = Bool() // NOTE(review): not read by the token logic visible in this patch
+}
+
 @instantiable
 class T1TokenManager(parameter: T1Parameter) extends Module {
   @public
-  val writeV0 = IO(Vec(parameter.laneNumber, Flipped(Valid(UInt(parameter.instructionIndexBits.W)))))
+  val instructionIssue: ValidIO[IssueToken] = IO(Flipped(Valid(new IssueToken(parameter))))
+
+  @public
+  val issueAllow: Bool = IO(Output(Bool()))
 
   @public
   val instructionFinish: Vec[UInt] = IO(Vec(parameter.laneNumber, Input(UInt(parameter.chainingSize.W))))
@@ -18,19 +28,42 @@ class T1TokenManager(parameter: T1Parameter) extends Module {
   @public
   val v0WriteValid = IO(Output(UInt(parameter.chainingSize.W)))
 
-  // v0 write token
-  val v0WriteValidVec: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { laneIndex =>
-    val update: ValidIO[UInt] = writeV0(laneIndex)
-    val clear:  UInt          = instructionFinish(laneIndex)
-    val updateOH = maskAnd(update.valid, indexToOH(update.bits, parameter.chainingSize)).asUInt
+  val issueIndex1H: UInt = indexToOH(instructionIssue.bits.instructionIndex, parameter.chainingSize)
+
+  // One register bit per chaining slot: set(i) raises bit i, clear(i) alone lowers it (set wins when both are high).
+  def updateBooleanToken(set: UInt, clear: UInt): UInt = {
     VecInit(Seq.tabulate(parameter.chainingSize) { chainingIndex =>
       val res = RegInit(false.B)
-      when(updateOH(chainingIndex) || clear(chainingIndex)) {
-        res := updateOH(chainingIndex)
+      when(set(chainingIndex) || clear(chainingIndex)) {
+        res := set(chainingIndex)
       }
       res
     }).asUInt
   }
 
+  // v0 write token: one token vector per lane, set when a v0-writing instruction issues
+  val v0WriteValidVec: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { laneIndex =>
+    val issueWritesV0: Bool = instructionIssue.valid && instructionIssue.bits.writeV0
+    val setOH:         UInt = maskAnd(issueWritesV0, issueIndex1H).asUInt
+    // each lane clears its own copy of the token through its finish report
+    updateBooleanToken(set = setOH, clear = instructionFinish(laneIndex))
+  }
+
+  val useV0AsMaskToken: UInt = Seq // OR over all lanes of the per-lane "reads v0 as mask" tokens
+    .tabulate(parameter.laneNumber) { laneIndex =>
+      val issueReadsMask: Bool = instructionIssue.valid && instructionIssue.bits.useV0AsMask
+      val setOH:          UInt = maskAnd(issueReadsMask, issueIndex1H).asUInt
+      // each lane clears its own copy of the token through its finish report
+      updateBooleanToken(set = setOH, clear = instructionFinish(laneIndex))
+    }
+    .reduce(_ | _)
+
   v0WriteValid := v0WriteValidVec.reduce(_ | _)
+
+  // v0 read-write conflict: a v0 writer meets an outstanding mask reader, or a mask reader meets an outstanding v0 writer
+  val v0Conflict: Bool = Seq(
+    instructionIssue.bits.writeV0 && useV0AsMaskToken.orR,
+    instructionIssue.bits.useV0AsMask && v0WriteValid.orR
+  ).reduce(_ || _)
+  issueAllow := !v0Conflict
 }
diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala
index eaccfee70..fa676c6fc 100644
--- a/t1/src/vrf/VRF.scala
+++ b/t1/src/vrf/VRF.scala
@@ -529,7 +529,11 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
       val dataInLsuQueue = ohCheck(loadDataInLSUWriteQueue, record.bits.instIndex, parameter.chainingSize)
       // elementMask update by write
       val writeUpdateValidVec: Seq[Bool] =
-        writePort.map(p => p.fire && p.bits.instructionIndex === record.bits.instIndex && p.bits.mask(3))
+        writePort.map(p =>
+          p.fire && p.bits.instructionIndex === record.bits.instIndex &&
+            // Only index load will split the datapath into separate parts.
+            (p.bits.mask(3) || !record.bits.ls)
+        )
       val writeUpdate1HVec:    Seq[UInt] = writeOH.zip(writeUpdateValidVec).map { case (oh, v) => Mux(v, oh, 0.U) }
       // elementMask update by read of store instruction
       val loadUpdateValidVec =
@@ -545,7 +549,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
       val waitLaneClear   =
         record.bits.state.stFinish && record.bits.state.wWriteQueueClear &&
           record.bits.state.wLaneLastReport && record.bits.state.wTopLastReport
-      val stateClear: Bool = waitLaneClear && record.bits.state.wLaneClear
+      val stateClear: Bool = waitLaneClear && record.bits.state.wLaneClear || record.bits.elementMask.andR
 
       when(topLastReport) {
         record.bits.state.stFinish       := true.B
@@ -607,7 +611,8 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar
             Mux(older, sourceVdEqSinkVs, sinkVdEqSourceVs)
         )
         val rawForeStore   = Mux(older, isStore.head && isSlow.last, isStore.last && isSlow.head) && samVd
-        (hazardForeLoad, rawForeStore)
+        // (hazardForeLoad, rawForeStore) todo: need check hazard?
+        (false.B, false.B)
       }
   }
   writeReadyForLsu := !hazardVec.map(_.map(_._1).reduce(_ || _)).reduce(_ || _)