bpu: optimize the clock gate efficiency of bpu/predictors_update_* and bpu/previous_* #3265

Closed
115 changes: 113 additions & 2 deletions src/main/scala/xiangshan/frontend/BPU.scala
@@ -629,7 +629,13 @@ class Predictor(implicit p: Parameters) extends XSModule with HasBPUConst with H
s1_pred_info.takenMask := resp.s1.full_pred.map(_.taken_mask_on_slot)
s1_pred_info.cfiIndex := resp.s1.cfiIndex.map { case x => x.bits }

val previous_s1_pred_info = RegEnable(s1_pred_info, 0.U.asTypeOf(new PreviousPredInfo), s1_fire_dup(0))
val previous_s1_pred_info = Wire(new PreviousPredInfo)
previous_s1_pred_info.hit := GatedRegEnable(s1_pred_info.hit, Some(0.U.asTypeOf(s1_pred_info.hit)), s1_fire_dup(0))
previous_s1_pred_info.taken := GatedRegEnable(s1_pred_info.taken, Some(0.U.asTypeOf(s1_pred_info.taken)), s1_fire_dup(0))
previous_s1_pred_info.target := GatedRegEnable.dupRegs(s1_pred_info.target, Some(0.U.asTypeOf(s1_pred_info.target)), s1_fire_dup(0))
previous_s1_pred_info.lastBrPosOH := GatedRegEnable.dupRegs(s1_pred_info.lastBrPosOH, Some(0.U.asTypeOf(s1_pred_info.lastBrPosOH)), s1_fire_dup(0))
previous_s1_pred_info.takenMask := GatedRegEnable.dupRegs(s1_pred_info.takenMask, Some(0.U.asTypeOf(s1_pred_info.takenMask)), s1_fire_dup(0))
previous_s1_pred_info.cfiIndex := GatedRegEnable.dupRegs(s1_pred_info.cfiIndex, Some(0.U.asTypeOf(s1_pred_info.cfiIndex)), s1_fire_dup(0))

val s2_redirect_s1_last_pred_vec_dup = preds_needs_redirect_vec_dup(previous_s1_pred_info, resp.s2)

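Note: GatedRegEnable and GatedRegEnable.dupRegs are not defined in this diff (they presumably come from the utility submodule bumped at the bottom of this change). A minimal sketch of the per-field idea in plain chisel3 follows; the names and widths below are illustrative assumptions, not the PR's own code:

import chisel3._
import chisel3.util.RegEnable

class PredInfoSketch(nDup: Int) extends Bundle {
  val hit    = Bool()
  val target = Vec(nDup, UInt(39.W)) // duplicated copies, like the *_dup signals
}

class PerFieldRegSketch(nDup: Int) extends Module {
  val io = IO(new Bundle {
    val in   = Input(new PredInfoSketch(nDup))
    val fire = Input(Bool())
    val out  = Output(new PredInfoSketch(nDup))
  })
  // One enabled register per field (instead of one RegEnable over the whole
  // bundle), so each field's register can be clock-gated on its own.
  io.out.hit := RegEnable(io.in.hit, false.B, io.fire)
  // One register per duplicated copy, keeping each copy's fanout local.
  io.out.target := VecInit(io.in.target.map(t => RegEnable(t, 0.U(39.W), io.fire)))
}
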
@@ -702,7 +708,25 @@ class Predictor(implicit p: Parameters) extends XSModule with HasBPUConst with H
)
)

val previous_s2_pred = RegEnable(resp.s2, 0.U.asTypeOf(resp.s2), s2_fire_dup(0))
// ------- To optimize Clock Gate Efficiency of bpu/previous_* -------
val previous_s2_pred = Wire(new BranchPredictionBundle)
previous_s2_pred.pc := SegmentedAddrNext.dupAddrs(resp.s2.pc, pcSegments, s2_fire_dup(0), Some("previous_s2_pred_pc")).map(segAddr => segAddr.getAddr())
previous_s2_pred.valid := RegEnable(resp.s2.valid, 0.U.asTypeOf(resp.s2.valid), s2_fire_dup(0))
previous_s2_pred.hasRedirect := RegEnable(resp.s2.hasRedirect, 0.U.asTypeOf(resp.s2.hasRedirect), s2_fire_dup(0))
previous_s2_pred.ftq_idx := RegEnable(resp.s2.ftq_idx, 0.U.asTypeOf(resp.s2.ftq_idx), s2_fire_dup(0))
previous_s2_pred.full_pred := RegEnable(resp.s2.full_pred, 0.U.asTypeOf(resp.s2.full_pred), s2_fire_dup(0))
previous_s2_pred.full_pred.zip(resp.s2.full_pred.zipWithIndex).map {case (prev_fp, (new_fp, dupIdx)) =>
prev_fp.targets.zip(new_fp.taken_mask_on_slot.zipWithIndex).map{ case(target, (taken_mask, slotIdx)) =>
target := RegEnable(new_fp.targets(slotIdx), 0.U.asTypeOf(new_fp.targets(slotIdx)), s2_fire_dup(0) && taken_mask)
}
prev_fp.fallThroughAddr := RegEnable(new_fp.fallThroughAddr, 0.U.asTypeOf(new_fp.fallThroughAddr), s2_fire_dup(0) && new_fp.hit)
}

XSPerfAccumulate("previous_s2_pred_full_pred_old", s2_fire_dup(0))
XSPerfAccumulate("previous_s2_pred_full_pred_fallThroughAddr_new", s2_fire_dup(0) && resp.s2.full_pred(0).hit)
XSPerfAccumulate("previous_s2_pred_full_pred_targets_new", s2_fire_dup(0) && resp.s2.full_pred(0).taken_mask_on_slot.reduce(_||_))
XSPerfAccumulate("previous_s2_pred_full_pred_target0_new", s2_fire_dup(0) && resp.s2.full_pred(0).taken_mask_on_slot(0))
XSPerfAccumulate("previous_s2_pred_full_pred_target1_new", s2_fire_dup(0) && resp.s2.full_pred(0).taken_mask_on_slot(1))

val s3_redirect_on_br_taken_dup = resp.s3.full_pred.zip(previous_s2_pred.full_pred).map {case (fp1, fp2) => fp1.real_br_taken_mask().asUInt =/= fp2.real_br_taken_mask().asUInt}
val s3_both_first_taken_dup = resp.s3.full_pred.zip(previous_s2_pred.full_pred).map {case (fp1, fp2) => fp1.real_br_taken_mask()(0) && fp2.real_br_taken_mask()(0)}
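
Note: the previous_s2_pred block above narrows each register's enable to the cycles in which its value is actually consumed: a slot's target is latched only when that slot is predicted taken, and fallThroughAddr only on an FTB hit; the XSPerfAccumulate counters compare how often the old blanket enable fired against the narrowed ones. A minimal sketch of this pattern in plain chisel3 (names and widths are illustrative assumptions, not the PR's code):

import chisel3._
import chisel3.util.RegEnable

class NarrowedEnableSketch extends Module {
  val io = IO(new Bundle {
    val fire     = Input(Bool())
    val hit      = Input(Bool())
    val taken    = Input(Bool())
    val target   = Input(UInt(39.W))
    val fallThru = Input(UInt(39.W))
    val prevTgt  = Output(UInt(39.W))
    val prevFall = Output(UInt(39.W))
  })
  // A slot's target is only compared later if that slot was predicted taken,
  // so its register is enabled by (fire && taken) rather than fire alone.
  io.prevTgt  := RegEnable(io.target, 0.U(39.W), io.fire && io.taken)
  // fallThroughAddr is only meaningful on an FTB hit.
  io.prevFall := RegEnable(io.fallThru, 0.U(39.W), io.fire && io.hit)
}
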
@@ -759,6 +783,93 @@ class Predictor(implicit p: Parameters) extends XSModule with HasBPUConst with H

predictors.io.update.valid := RegNext(io.ftq_to_bpu.update.valid, init = false.B)
predictors.io.update.bits := RegEnable(io.ftq_to_bpu.update.bits, io.ftq_to_bpu.update.valid)

// ------- To optimize Clock Gate Efficiency of bpu/predictors_update* -------
// Update pc
predictors.io.update.bits.pc := SegmentedAddrNext(io.ftq_to_bpu.update.bits.pc, pcSegments, io.ftq_to_bpu.update.valid, Some("predictors.io.update.pc")).getAddr()

// Update ftb_entry
val FTBEntryUpdateValid = io.ftq_to_bpu.update.bits.ftb_entry.valid && io.ftq_to_bpu.update.valid
predictors.io.update.bits.ftb_entry := RegEnable(io.ftq_to_bpu.update.bits.ftb_entry, FTBEntryUpdateValid)
predictors.io.update.bits.ftb_entry.valid := RegEnable(FTBEntryUpdateValid, io.ftq_to_bpu.update.valid) // not useful

// Get UpdateMeta of each Predictor
// | <------------- io.update.bits.meta ------------> |
// |---- x -----| uftb | tage-sc | ftb | ittage | ras |
val metaSizeSeq = predictors.asInstanceOf[Composer].getEachMetaSize()
val metaStartIdxWithSize = metaSizeSeq.foldLeft(Seq[(Int, Int)]()) { (acc, len) =>
val start = if (acc.isEmpty) 0 else acc.last._1 + acc.last._2
acc :+ (start, len)
}
val Seq(( ras_meta_sta, ras_meta_sz),
(ittage_meta_sta, ittage_meta_sz),
( ftb_meta_sta, ftb_meta_sz),
( tage_meta_sta, tage_meta_sz),
( uftb_meta_sta, uftb_meta_sz)) = metaStartIdxWithSize.take(5)

println(p"uftb_meta_sta = $uftb_meta_sta , uftb_meta_sz = $uftb_meta_sz ")
println(p"tage_meta_sta = $tage_meta_sta , tage_meta_sz = $tage_meta_sz ")
println(p"ftb_meta_sta = $ftb_meta_sta , ftb_meta_sz = $ftb_meta_sz ")
println(p"ittage_meta_sta = $ittage_meta_sta, ittage_meta_sz = $ittage_meta_sz")
println(p"ras_meta_sta = $ras_meta_sta , ras_meta_sz = $ras_meta_sz ")

val UpdateTageMeta = io.ftq_to_bpu.update.bits.meta( tage_meta_sta + tage_meta_sz - 1, tage_meta_sta).asTypeOf(new TageMeta)
val UpdateFTBMeta = io.ftq_to_bpu.update.bits.meta( ftb_meta_sta + ftb_meta_sz - 1, ftb_meta_sta).asTypeOf(new FTBMeta)
val UpdateITTageMeta = io.ftq_to_bpu.update.bits.meta(ittage_meta_sta + ittage_meta_sz - 1, ittage_meta_sta).asTypeOf(new ITTageMeta)
val UpdateRASMeta = io.ftq_to_bpu.update.bits.meta( ras_meta_sta + ras_meta_sz - 1, ras_meta_sta).asTypeOf(new RASMeta)
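
// Illustrative worked example (plain Scala, not part of this change): how the
// foldLeft above turns a size list into (start, size) pairs over the packed
// meta UInt. The sizes here are made up; the real widths are whatever
// getEachMetaSize() reports (and are printed above).
object MetaLayoutExample extends App {
  val metaSizeSeq = Seq(8, 16, 32, 24, 40) // same order as getEachMetaSize()
  val metaStartIdxWithSize = metaSizeSeq.foldLeft(Seq[(Int, Int)]()) { (acc, len) =>
    val start = if (acc.isEmpty) 0 else acc.last._1 + acc.last._2
    acc :+ (start, len)
  }
  // List((0,8), (8,16), (24,32), (56,24), (80,40)):
  // each predictor's meta is then meta(start + size - 1, start).
  println(metaStartIdxWithSize)
}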

// Update Meta of each Predictor
val new_uftb_meta = RegEnable(io.ftq_to_bpu.update.bits.meta(uftb_meta_sta + uftb_meta_sz - 1, uftb_meta_sta), io.ftq_to_bpu.update.valid)
val new_ftb_meta = RegEnable(UpdateFTBMeta, io.ftq_to_bpu.update.valid && !io.ftq_to_bpu.update.bits.old_entry)
val new_ras_meta = RegEnable(UpdateRASMeta, io.ftq_to_bpu.update.valid && (io.ftq_to_bpu.update.bits.is_call_taken || io.ftq_to_bpu.update.bits.is_ret_taken))

val new_ittage_meta = WireInit(0.U.asTypeOf(new ITTageMeta))
new_ittage_meta := RegEnable(UpdateITTageMeta, io.ftq_to_bpu.update.valid)
new_ittage_meta.provider.bits := RegEnable(UpdateITTageMeta.provider.bits , io.ftq_to_bpu.update.valid && UpdateITTageMeta.provider.valid )
new_ittage_meta.providerTarget := RegEnable(UpdateITTageMeta.providerTarget , io.ftq_to_bpu.update.valid && UpdateITTageMeta.provider.valid )
new_ittage_meta.allocate.bits := RegEnable(UpdateITTageMeta.allocate.bits , io.ftq_to_bpu.update.valid && UpdateITTageMeta.allocate.valid )
new_ittage_meta.altProvider.bits := RegEnable(UpdateITTageMeta.altProvider.bits , io.ftq_to_bpu.update.valid && UpdateITTageMeta.altProvider.valid)
new_ittage_meta.altProviderTarget := RegEnable(UpdateITTageMeta.altProviderTarget, io.ftq_to_bpu.update.valid && UpdateITTageMeta.provider.valid &&
UpdateITTageMeta.altProvider.valid &&
UpdateITTageMeta.providerCtr === 0.U &&
io.ftq_to_bpu.update.bits.mispred_mask(numBr) )

val new_tage_meta = WireInit(0.U.asTypeOf(new TageMeta))
new_tage_meta := RegEnable(UpdateTageMeta, io.ftq_to_bpu.update.valid)
val TageUpdateValids = VecInit((0 until TageBanks).map(w =>
io.ftq_to_bpu.update.bits.ftb_entry.brValids(w) && io.ftq_to_bpu.update.valid && !io.ftq_to_bpu.update.bits.ftb_entry.always_taken(w) &&
!(PriorityEncoder(io.ftq_to_bpu.update.bits.br_taken_mask) < w.U)))
for(i <- 0 until numBr){
val TageHasUpdate = TageUpdateValids(i)
val TageUpdateProvided = UpdateTageMeta.providers(i).valid
new_tage_meta.providers(i).bits := RegEnable(UpdateTageMeta.providers(i).bits, TageUpdateProvided && TageHasUpdate)
new_tage_meta.providerResps(i) := RegEnable(UpdateTageMeta.providerResps(i), TageHasUpdate) // `TageUpdateProvided` only affects perfAccumulate
new_tage_meta.altUsed(i) := RegEnable(UpdateTageMeta.altUsed(i), TageHasUpdate)
new_tage_meta.allocates(i) := RegEnable(UpdateTageMeta.allocates(i), TageHasUpdate && io.ftq_to_bpu.update.bits.mispred_mask(i))
}
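
// Illustrative note (plain Scala, not part of this change): what the
// "!(PriorityEncoder(io.ftq_to_bpu.update.bits.br_taken_mask) < w.U)" term in
// TageUpdateValids does. With two branch slots and slot 0 taken,
// PriorityEncoder returns 0, so the later slot (w = 1) is masked off and its
// TAGE meta registers keep their previous value.
object TageUpdateValidExample extends App {
  def priorityEncoder(mask: Seq[Boolean]): Int = mask.indexWhere(identity) // first set bit
  val brTakenMask = Seq(true, false) // slot 0 taken
  val brValids    = Seq(true, true)
  val alwaysTaken = Seq(false, false)
  val updateValid = true
  val tageUpdateValids = (0 until 2).map { w =>
    brValids(w) && updateValid && !alwaysTaken(w) && !(priorityEncoder(brTakenMask) < w)
  }
  println(tageUpdateValids) // Vector(true, false)
}
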
if(EnableSC){
for(w <- 0 until TageBanks){
new_tage_meta.scMeta.get.scPreds(w) := RegEnable(UpdateTageMeta.scMeta.get.scPreds(w), TageUpdateValids(w) && UpdateTageMeta.providers(w).valid)
new_tage_meta.scMeta.get.ctrs(w) := RegEnable(UpdateTageMeta.scMeta.get.ctrs(w), TageUpdateValids(w) && UpdateTageMeta.providers(w).valid)
}
}

predictors.io.update.bits.meta := Cat(0.U((MaxMetaLength - metaSizeSeq.foldLeft(0)(_ + _)).W),
new_uftb_meta.asUInt,
new_tage_meta.asUInt,
new_ftb_meta.asUInt,
new_ittage_meta.asUInt,
new_ras_meta.asUInt)

// Update full_target
val gatedCond1 = UpdateITTageMeta.provider.valid
val gatedCond2 = io.ftq_to_bpu.update.bits.mispred_mask(numBr) && !(UpdateITTageMeta.provider.valid && UpdateITTageMeta.providerCtr === 0.U)
predictors.io.update.bits.full_target := RegEnable(io.ftq_to_bpu.update.bits.full_target, io.ftq_to_bpu.update.valid && ( gatedCond1 || gatedCond2 ))

// Update cfi_idx
predictors.io.update.bits.cfi_idx.bits := RegEnable(io.ftq_to_bpu.update.bits.cfi_idx.bits, io.ftq_to_bpu.update.valid && io.ftq_to_bpu.update.bits.cfi_idx.valid)

// Update ghist
predictors.io.update.bits.ghist := RegEnable(
getHist(io.ftq_to_bpu.update.bits.spec_info.histPtr), io.ftq_to_bpu.update.valid)

8 changes: 8 additions & 0 deletions src/main/scala/xiangshan/frontend/Composer.scala
@@ -90,6 +90,14 @@ class Composer(implicit p: Parameters) extends BasePredictor with HasBPUConst wi
metas(idx)
}

def getEachMetaSize(): Seq[Int] = {
var meta_sz: Seq[Int] = Nil
for (c <- components.reverse) {
meta_sz = meta_sz :+ c.meta_size
}
meta_sz
}

override def getFoldedHistoryInfo = Some(components.map(_.getFoldedHistoryInfo.getOrElse(Set())).reduce(_++_))

override val perfEvents = components.map(_.getPerfEvents).reduce(_++_)
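
As a side note, a minimal sketch of an equivalent formulation of getEachMetaSize() over a stand-in trait (the trait and object names below are assumptions for illustration only, not Composer's actual types):

trait HasMetaSize { def meta_size: Int }

object MetaSizeSketch {
  // Collect each component's meta width, last component first,
  // matching the reverse traversal used in Composer above.
  def getEachMetaSize(components: Seq[HasMetaSize]): Seq[Int] =
    components.reverse.map(_.meta_size)
}
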
2 changes: 1 addition & 1 deletion utility (git submodule)