// xref: /XiangShan/src/main/scala/xiangshan/frontend/BPU.scala (revision e43855a9ca830e257120ed8b5937a1d1e87a4b75)
1package xiangshan.frontend
2
3import chisel3._
4import chisel3.util._
5import utils._
6import xiangshan._
7import xiangshan.backend.ALUOpType
8import xiangshan.backend.JumpOpType
9import chisel3.util.experimental.BoringUtils
10import xiangshan.backend.decode.XSTrap
11
// Views a virtual address as {tag | idx | offset} (MSB -> LSB) and provides
// bank-interleaved index extraction for banked prediction tables.
class TableAddr(val idxBits: Int, val banks: Int) extends XSBundle {
  // Everything above idx and the single offset bit is tag.
  def tagBits = VAddrBits - idxBits - 1

  // Field declaration order fixes the bit layout used by asTypeOf below.
  val tag = UInt(tagBits.W)
  val idx = UInt(idxBits.W)
  val offset = UInt(1.W)

  // Reinterpret a raw address (zero-extended/truncated to VAddrBits) as this layout.
  def fromUInt(x: UInt) = x.asTypeOf(UInt(VAddrBits.W)).asTypeOf(this)
  def getTag(x: UInt) = fromUInt(x).tag
  def getIdx(x: UInt) = fromUInt(x).idx
  // Low log2(banks) bits of idx select the bank ...
  def getBank(x: UInt) = fromUInt(x).idx(log2Up(banks) - 1, 0)
  // ... and the remaining high idx bits address a row inside that bank.
  def getBankIdx(x: UInt) = fromUInt(x).idx(idxBits - 1, log2Up(banks))
}
25
// Payload passed from BPU Stage1 to Stage2: the fetch PC plus the raw
// per-predictor results (BTB, JBTAC, TAGE), the per-instruction global
// history snapshots, and Stage1's quick prediction (for Stage3 to compare
// against).  NOTE: field order defines the bundle's bit layout.
class Stage1To2IO extends XSBundle {
  val pc = Output(UInt(VAddrBits.W))
  // per-slot BTB hit bits and predicted branch targets
  val btb = new Bundle {
    val hits = Output(UInt(FetchWidth.W))
    val targets = Output(Vec(FetchWidth, UInt(VAddrBits.W)))
  }
  // indirect-jump predictor: one-hot hit index and its target
  val jbtac = new Bundle {
    val hitIdx = Output(UInt(FetchWidth.W))
    val target = Output(UInt(VAddrBits.W))
  }
  // TAGE direction predictions per fetch slot
  val tage = new Bundle {
    val hits = Output(UInt(FetchWidth.W))
    val takens = Output(Vec(FetchWidth, Bool()))
  }
  // global history as seen by each instruction in the fetch packet
  val hist = Output(Vec(FetchWidth, UInt(HistoryLength.W)))
  // Stage1's quick prediction, forwarded so Stage3 can detect disagreement
  val btbPred = ValidIO(new BranchPrediction)
}
43
// BPU Stage1: first-cycle prediction.  Accepts a fetch PC every cycle, reads
// BTB/JBTAC/TAGE in parallel (1-cycle SRAM latency), maintains the
// speculative global history register (GHR), and produces
//   - s1OutPred: a quick prediction for the IFU, and
//   - out:       the Stage1To2IO payload for Stage2/Stage3 refinement.
class BPUStage1 extends XSModule {
  val io = IO(new Bundle() {
    val in = new Bundle { val pc = Flipped(Decoupled(UInt(VAddrBits.W))) }
    // from backend
    val redirectInfo = Input(new RedirectInfo)
    // from Stage3
    val flush = Input(Bool())
    val s3RollBackHist = Input(UInt(HistoryLength.W))
    val s3Taken = Input(Bool())
    // to ifu, quick prediction result
    val s1OutPred = ValidIO(new BranchPrediction)
    // to Stage2
    val out = Decoupled(new Stage1To2IO)
  })

  // Stage1 never back-pressures the fetch PC.
  io.in.pc.ready := true.B

  // flush Stage1 when io.flush
  // NOTE(review): flushS1 is never read — the commented-out io.out.valid below
  // (which also has a typo, "flushS1u") was its only consumer; dead hardware.
  val flushS1 = BoolStopWatch(io.flush, io.in.pc.fire(), startHighPriority = true)

  // global history register
  val ghr = RegInit(0.U(HistoryLength.W))
  // modify updateGhr and newGhr when updating ghr
  val updateGhr = WireInit(false.B)
  val newGhr = WireInit(0.U(HistoryLength.W))
  when (updateGhr) { ghr := newGhr }
  // use hist as global history!!!
  // (bypass: same-cycle update is visible to the predictors immediately)
  val hist = Mux(updateGhr, newGhr, ghr)

  // Tage predictor
  // val tage = Module(new FakeTAGE)
  val tage = if(EnableBPD) Module(new Tage) else Module(new FakeTAGE)
  tage.io.req.valid := io.in.pc.fire()
  tage.io.req.bits.pc := io.in.pc.bits
  tage.io.req.bits.hist := hist
  tage.io.redirectInfo <> io.redirectInfo
  io.out.bits.tage <> tage.io.out
  io.s1OutPred.bits.tageMeta := tage.io.meta

  // latch pc for 1 cycle latency when reading SRAM
  val pcLatch = RegEnable(io.in.pc.bits, io.in.pc.fire())

  val r = io.redirectInfo.redirect
  // reconstruct the fetch-packet start PC from the mispredicted instruction
  // (fetchIdx counts instructions; << 2 assumes 4-byte instructions, no RVC)
  val updateFetchpc = r.pc - (r.fetchIdx << 2.U)
  // BTB
  val btb = Module(new BTB)
  btb.io.in.pc <> io.in.pc
  btb.io.in.pcLatch := pcLatch
  // TODO: pass real mask in
  btb.io.in.mask := "b1111111111111111".asUInt
  btb.io.redirectValid := io.redirectInfo.valid
  btb.io.flush := io.flush

  // update path: driven by the backend redirect info
  // btb.io.update.fetchPC := updateFetchpc
  // btb.io.update.fetchIdx := r.fetchIdx
  btb.io.update.pc := r.pc
  btb.io.update.hit := r.btbHitWay
  btb.io.update.misPred := io.redirectInfo.misPred
  // btb.io.update.writeWay := r.btbVictimWay
  btb.io.update.oldCtr := r.btbPredCtr
  btb.io.update.taken := r.taken
  btb.io.update.target := r.brTarget
  btb.io.update._type := r._type
  // TODO: add RVC logic
  btb.io.update.isRVC := DontCare

  val btbHit = btb.io.out.hit
  val btbTaken = btb.io.out.taken
  val btbTakenIdx = btb.io.out.takenIdx
  val btbTakenTarget = btb.io.out.target
  // val btbWriteWay = btb.io.out.writeWay
  val btbNotTakens = btb.io.out.notTakens
  val btbCtrs = VecInit(btb.io.out.dEntries.map(_.pred))
  val btbValids = btb.io.out.hits
  val btbTargets = VecInit(btb.io.out.dEntries.map(_.target))
  val btbTypes = VecInit(btb.io.out.dEntries.map(_._type))


  // JBTAC: indirect-jump target predictor, indexed with pc ^ history
  val jbtac = Module(new JBTAC)
  jbtac.io.in.pc <> io.in.pc
  jbtac.io.in.pcLatch := pcLatch
  jbtac.io.in.hist := hist
  jbtac.io.redirectValid := io.redirectInfo.valid
  jbtac.io.flush := io.flush

  jbtac.io.update.fetchPC := updateFetchpc
  // << 1: JBTAC indexes half-words, while fetchIdx counts instructions
  jbtac.io.update.fetchIdx := r.fetchIdx << 1
  jbtac.io.update.misPred := io.redirectInfo.misPred
  jbtac.io.update._type := r._type
  jbtac.io.update.target := r.target
  jbtac.io.update.hist := r.hist

  val jbtacHit = jbtac.io.out.hit
  val jbtacTarget = jbtac.io.out.target
  val jbtacHitIdx = jbtac.io.out.hitIdx

  // calculate global history of each instr
  // each not-taken branch before slot j contributes one zero shifted into
  // slot j's history; shift(i)(j) is that per-instruction contribution.
  val firstHist = RegNext(hist)
  val histShift = Wire(Vec(FetchWidth, UInt(log2Up(FetchWidth).W)))
  val shift = Wire(Vec(FetchWidth, Vec(FetchWidth, UInt(1.W))))
  (0 until FetchWidth).map(i => shift(i) := Mux(!btbNotTakens(i), 0.U, ~LowerMask(UIntToOH(i.U), FetchWidth)).asTypeOf(Vec(FetchWidth, UInt(1.W))))
  for (j <- 0 until FetchWidth) {
    var tmp = 0.U
    for (i <- 0 until FetchWidth) {
      tmp = tmp + shift(i)(j)
    }
    histShift(j) := tmp
  }
  (0 until FetchWidth).map(i => io.s1OutPred.bits.hist(i) := firstHist << histShift(i))

  // update ghr
  updateGhr := io.s1OutPred.bits.redirect || io.flush
  val brJumpIdx = Mux(!(btbHit && btbTaken), 0.U, UIntToOH(btbTakenIdx))
  val indirectIdx = Mux(!jbtacHit, 0.U, UIntToOH(jbtacHitIdx))
  //val newTaken = Mux(io.redirectInfo.flush(), !(r._type === BTBtype.B && !r.taken), )
  // GHR source priority: backend redirect > Stage3 rollback > own prediction.
  newGhr := Mux(io.redirectInfo.flush(),    (r.hist << 1.U) | !(r._type === BTBtype.B && !r.taken),
            Mux(io.flush,                   Mux(io.s3Taken, (io.s3RollBackHist << 1.U) | 1.U, io.s3RollBackHist),
            Mux(io.s1OutPred.bits.redirect, (PriorityMux(brJumpIdx | indirectIdx, io.s1OutPred.bits.hist) << 1.U | 1.U),
                                            io.s1OutPred.bits.hist(0) << PopCount(btbNotTakens))))

  // redirect based on BTB and JBTAC
  // io.out.valid := RegNext(io.in.pc.fire()) && !flushS1u
  io.out.valid := RegNext(io.in.pc.fire()) && !io.flush

  io.s1OutPred.valid := io.out.valid
  // redirect if either a taken BTB branch or a JBTAC indirect jump hit
  io.s1OutPred.bits.redirect := btbHit && btbTaken || jbtacHit
  // io.s1OutPred.bits.instrValid := LowerMask(UIntToOH(btbTakenIdx), FetchWidth) & LowerMask(UIntToOH(jbtacHitIdx), FetchWidth)
  // on redirect, only instructions up to and including the first taken
  // jump remain valid
  io.s1OutPred.bits.instrValid := Mux(io.s1OutPred.bits.redirect, LowerMask(LowestBit(brJumpIdx | indirectIdx, FetchWidth), FetchWidth), Fill(FetchWidth, 1.U(1.W))).asTypeOf(Vec(FetchWidth, Bool()))
  // the earlier of the two redirect sources supplies the target
  io.s1OutPred.bits.target := Mux(brJumpIdx === LowestBit(brJumpIdx | indirectIdx, FetchWidth), btbTakenTarget, jbtacTarget)
  // io.s1OutPred.bits.btbVictimWay := btbWriteWay
  io.s1OutPred.bits.predCtr := btbCtrs
  io.s1OutPred.bits.btbHitWay := btbHit
  // RAS checkpoint fields are filled in by Stage3
  io.s1OutPred.bits.rasSp := DontCare
  io.s1OutPred.bits.rasTopCtr := DontCare

  io.out.bits.pc := pcLatch
  io.out.bits.btb.hits := btbValids.asUInt
  (0 until FetchWidth).map(i => io.out.bits.btb.targets(i) := btbTargets(i))
  io.out.bits.jbtac.hitIdx := UIntToOH(jbtacHitIdx)
  io.out.bits.jbtac.target := jbtacTarget
  // TODO: we don't need this repeatedly!
  io.out.bits.hist := io.s1OutPred.bits.hist
  io.out.bits.btbPred := io.s1OutPred



  // debug info
  XSDebug(true.B, "[BPUS1]in:(%d %d)   pc=%x ghr=%b\n", io.in.pc.valid, io.in.pc.ready, io.in.pc.bits, hist)
  XSDebug(true.B, "[BPUS1]outPred:(%d) pc=0x%x, redirect=%d instrValid=%b tgt=%x\n",
    io.s1OutPred.valid, pcLatch, io.s1OutPred.bits.redirect, io.s1OutPred.bits.instrValid.asUInt, io.s1OutPred.bits.target)
  XSDebug(io.flush && io.redirectInfo.flush(),
    "[BPUS1]flush from backend: pc=%x tgt=%x brTgt=%x _type=%b taken=%d oldHist=%b fetchIdx=%d isExcpt=%d\n",
    r.pc, r.target, r.brTarget, r._type, r.taken, r.hist, r.fetchIdx, r.isException)
  XSDebug(io.flush && !io.redirectInfo.flush(),
    "[BPUS1]flush from Stage3:  s3Taken=%d s3RollBackHist=%b\n", io.s3Taken, io.s3RollBackHist)

}
201
// Stage2 adds no fields of its own; it only re-times Stage1's payload,
// so the Stage2->Stage3 interface is identical to Stage1->Stage2.
class Stage2To3IO extends Stage1To2IO {
}
204
// BPU Stage2: a one-entry skid buffer between Stage1 and Stage3.
// It holds the Stage1 payload for one handshake and forwards it unchanged;
// a flush (from Stage3 or the backend) drops whatever is buffered.
class BPUStage2 extends XSModule {
  val io = IO(new Bundle() {
    // flush from Stage3
    val flush = Input(Bool())
    val in = Flipped(Decoupled(new Stage1To2IO))
    val out = Decoupled(new Stage2To3IO)
  })

  // Remembers that a flush arrived after the buffered request was accepted,
  // so the stale entry is suppressed until a fresh one is captured.
  val s2Flushed = BoolStopWatch(io.flush, io.in.fire(), startHighPriority = true)

  // Buffered payload and its valid bit.
  val bufData = RegInit(0.U.asTypeOf(io.in.bits))
  val bufValid = RegInit(false.B)
  when (io.in.fire()) { bufData := io.in.bits }
  // Priority (first match wins), identical to a when/elsewhen chain:
  // flush clears, a new input sets, an output handshake clears, else hold.
  bufValid := MuxCase(bufValid, Seq(
    io.flush      -> false.B,
    io.in.fire()  -> true.B,
    io.out.fire() -> false.B
  ))

  io.in.ready := !bufValid || io.out.fire()
  io.out.valid := !io.flush && !s2Flushed && bufValid

  // pass-through: Stage2 performs no computation on the payload
  io.out.bits := bufData

  // debug info
  XSDebug(true.B, "[BPUS2]in:(%d %d) pc=%x out:(%d %d) pc=%x\n",
    io.in.valid, io.in.ready, io.in.bits.pc, io.out.valid, io.out.ready, io.out.bits.pc)
  XSDebug(true.B, "[BPUS2]validLatch=%d pc=%x\n", bufValid, bufData.pc)
  XSDebug(io.flush, "[BPUS2]flush!!!\n")
}
238
// BPU Stage3: refines Stage1's quick prediction using the icache predecode
// information, the TAGE direction predictions and a return address stack
// (RAS).  When its conclusion differs from Stage1's (direction or target),
// it flushes Stage1/Stage2 via flushBPU and supplies the rolled-back global
// history through s1RollBackHist.
class BPUStage3 extends XSModule {
  val io = IO(new Bundle() {
    val flush = Input(Bool())
    val in = Flipped(Decoupled(new Stage2To3IO))
    val out = ValidIO(new BranchPrediction)
    // from icache
    val predecode = Flipped(ValidIO(new Predecode))
    // from backend
    val redirectInfo = Input(new RedirectInfo)
    // to Stage1 and Stage2
    val flushBPU = Output(Bool())
    // to Stage1, restore ghr in stage1 when flushBPU is valid
    val s1RollBackHist = Output(UInt(HistoryLength.W))
    val s3Taken = Output(Bool())
  })

  // suppress a request that was in flight when the flush arrived
  val flushS3 = BoolStopWatch(io.flush, io.in.fire(), startHighPriority = true)
  val inLatch = RegInit(0.U.asTypeOf(io.in.bits))
  val validLatch = RegInit(false.B)
  when (io.in.fire()) { inLatch := io.in.bits }
  when (io.flush) {
    validLatch := false.B
  }.elsewhen (io.in.fire()) {
    validLatch := true.B
  }.elsewhen (io.out.valid) {
    validLatch := false.B
  }
  // output only when the matching predecode info has arrived
  io.out.valid := validLatch && io.predecode.valid && !flushS3 && !io.flush
  io.in.ready := !validLatch || io.out.valid

  // RAS
  // TODO: split retAddr and ctr
  def rasEntry() = new Bundle {
    val retAddr = UInt(VAddrBits.W)
    val ctr = UInt(8.W) // layer of nested call functions
  }
  val ras = RegInit(VecInit(Seq.fill(RasSize)(0.U.asTypeOf(rasEntry()))))
  val sp = Counter(RasSize)
  val rasTop = ras(sp.value)
  val rasTopAddr = rasTop.retAddr

  // get the first taken branch/jal/call/jalr/ret in a fetch line
  // brTakenIdx/jalIdx/callIdx/jalrIdx/retIdx/jmpIdx is one-hot encoded.
  // brNotTakenIdx indicates all the not-taken branches before the first jump instruction.
  // Reverse(Cat(...)) maps the per-slot predecode vector onto the same bit
  // order as the BTB hit mask.
  val brIdx = inLatch.btb.hits & Reverse(Cat(io.predecode.bits.fuOpTypes.map { t => ALUOpType.isBranch(t) }).asUInt) & io.predecode.bits.mask
  val brTakenIdx = LowestBit(brIdx & inLatch.tage.takens.asUInt, FetchWidth)
  val jalIdx = LowestBit(inLatch.btb.hits & Reverse(Cat(io.predecode.bits.fuOpTypes.map { t => t === JumpOpType.jal }).asUInt) & io.predecode.bits.mask, FetchWidth)
  val callIdx = LowestBit(inLatch.btb.hits & io.predecode.bits.mask & Reverse(Cat(io.predecode.bits.fuOpTypes.map { t => t === JumpOpType.call }).asUInt), FetchWidth)
  val jalrIdx = LowestBit(inLatch.jbtac.hitIdx & io.predecode.bits.mask & Reverse(Cat(io.predecode.bits.fuOpTypes.map { t => t === JumpOpType.jalr }).asUInt), FetchWidth)
  val retIdx = LowestBit(io.predecode.bits.mask & Reverse(Cat(io.predecode.bits.fuOpTypes.map { t => t === JumpOpType.ret }).asUInt), FetchWidth)

  // the first taken control-flow instruction of any kind (ret only with RAS)
  val jmpIdx = if (EnableRAS) LowestBit(brTakenIdx | jalIdx | callIdx | jalrIdx | retIdx, FetchWidth)
               else LowestBit(brTakenIdx | jalIdx | callIdx | jalrIdx, FetchWidth)
  val brNotTakenIdx = brIdx & ~inLatch.tage.takens.asUInt & LowerMask(jmpIdx, FetchWidth) & io.predecode.bits.mask


  // forward Stage1's bookkeeping fields unchanged
  // io.out.bits.btbVictimWay := inLatch.btbPred.bits.btbVictimWay
  io.out.bits.predCtr := inLatch.btbPred.bits.predCtr
  io.out.bits.btbHitWay := inLatch.btbPred.bits.btbHitWay
  io.out.bits.tageMeta := inLatch.btbPred.bits.tageMeta
  //io.out.bits._type := Mux(jmpIdx === retIdx, BTBtype.R,
  //  Mux(jmpIdx === jalrIdx, BTBtype.I,
  //  Mux(jmpIdx === brTakenIdx, BTBtype.B, BTBtype.J)))
  val firstHist = inLatch.btbPred.bits.hist(0)
  // there may be several notTaken branches before the first jump instruction,
  // so we need to calculate how many zeroes should each instruction shift in its global history.
  // each history is exclusive of instruction's own jump direction.
  val histShift = Wire(Vec(FetchWidth, UInt(log2Up(FetchWidth).W)))
  val shift = Wire(Vec(FetchWidth, Vec(FetchWidth, UInt(1.W))))
  (0 until FetchWidth).map(i => shift(i) := Mux(!brNotTakenIdx(i), 0.U, ~LowerMask(UIntToOH(i.U), FetchWidth)).asTypeOf(Vec(FetchWidth, UInt(1.W))))
  for (j <- 0 until FetchWidth) {
    var tmp = 0.U
    for (i <- 0 until FetchWidth) {
      tmp = tmp + shift(i)(j)
    }
    histShift(j) := tmp
  }
  (0 until FetchWidth).map(i => io.out.bits.hist(i) := firstHist << histShift(i))
  // save ras checkpoint info (lets the backend restore the RAS on mispredict)
  io.out.bits.rasSp := sp.value
  io.out.bits.rasTopCtr := rasTop.ctr

  // flush BPU and redirect when target differs from the target predicted in Stage1
  val tToNt = inLatch.btbPred.bits.redirect && ~jmpIdx.orR.asBool
  val ntToT = ~inLatch.btbPred.bits.redirect && jmpIdx.orR.asBool
  val dirDiffers = tToNt || ntToT
  val tgtDiffers = inLatch.btbPred.bits.redirect && jmpIdx.orR.asBool && io.out.bits.target =/= inLatch.btbPred.bits.target
  io.out.bits.redirect := (if (EnableBPD) {dirDiffers || tgtDiffers} else false.B)
  // target priority: fall-through < RAS (ret) < JBTAC (jalr) < BTB target
  io.out.bits.target := Mux(jmpIdx === 0.U, inLatch.pc + (PopCount(io.predecode.bits.mask) << 2.U), // TODO: RVC
    Mux(jmpIdx === retIdx, rasTopAddr,
    Mux(jmpIdx === jalrIdx, inLatch.jbtac.target,
    PriorityMux(jmpIdx, inLatch.btb.targets))))
  io.out.bits.instrValid := Mux(ntToT || tgtDiffers, LowerMask(jmpIdx, FetchWidth), io.predecode.bits.mask).asTypeOf(Vec(FetchWidth, Bool()))

  io.flushBPU := io.out.bits.redirect && io.out.valid

  // speculative update RAS
  val rasWrite = WireInit(0.U.asTypeOf(rasEntry()))
  // return address = instruction after the call (no-RVC: +4)
  val retAddr = inLatch.pc + (OHToUInt(callIdx) << 2.U) + 4.U
  rasWrite.retAddr := retAddr
  // recursive call to the same site bumps the counter instead of pushing
  val allocNewEntry = rasWrite.retAddr =/= rasTopAddr
  rasWrite.ctr := Mux(allocNewEntry, 1.U, rasTop.ctr + 1.U)
  val rasWritePosition = Mux(allocNewEntry, sp.value + 1.U, sp.value)
  when (io.out.valid) {
    when (jmpIdx === callIdx) {
      ras(rasWritePosition) := rasWrite
      when (allocNewEntry) { sp.value := sp.value + 1.U }
    }.elsewhen (jmpIdx === retIdx) {
      // pop only when the nesting counter reaches zero
      when (rasTop.ctr === 1.U) {
        sp.value := Mux(sp.value === 0.U, 0.U, sp.value - 1.U)
      }.otherwise {
        ras(sp.value) := Cat(rasTop.ctr - 1.U, rasTopAddr).asTypeOf(rasEntry())
      }
    }
  }
  // use checkpoint to recover RAS
  val recoverSp = io.redirectInfo.redirect.rasSp
  val recoverCtr = io.redirectInfo.redirect.rasTopCtr
  when (io.redirectInfo.valid && io.redirectInfo.misPred) {
    sp.value := recoverSp
    ras(recoverSp) := Cat(recoverCtr, ras(recoverSp).retAddr).asTypeOf(rasEntry())
  }

  // roll back global history in S1 if S3 redirects
  io.s1RollBackHist := Mux(io.s3Taken, PriorityMux(jmpIdx, io.out.bits.hist), io.out.bits.hist(0) << PopCount(brIdx & ~inLatch.tage.takens.asUInt))
  // whether Stage3 has a taken jump
  io.s3Taken := jmpIdx.orR.asBool

  // FIX: format string was missing its terminating "\n", so the following
  // debug line was glued onto this one in the log.
  // NOTE(review): the condition uses io.in.fire()/io.in.bits while
  // rasWritePosition/retAddr derive from inLatch (previous packet) — looks
  // like a one-cycle skew in this debug print; confirm intent.
  XSDebug(io.in.fire() && callIdx.orR, "[RAS]:pc=0x%x, rasWritePosition=%d, rasWriteAddr=0x%x\n",
            io.in.bits.pc, rasWritePosition, retAddr)

  // debug info
  XSDebug(io.in.fire(), "[BPUS3]in:(%d %d) pc=%x\n", io.in.valid, io.in.ready, io.in.bits.pc)
  XSDebug(io.out.valid, "[BPUS3]out:%d pc=%x redirect=%d predcdMask=%b instrValid=%b tgt=%x\n",
    io.out.valid, inLatch.pc, io.out.bits.redirect, io.predecode.bits.mask, io.out.bits.instrValid.asUInt, io.out.bits.target)
  XSDebug(true.B, "[BPUS3]flushS3=%d\n", flushS3)
  XSDebug(true.B, "[BPUS3]validLatch=%d predecode.valid=%d\n", validLatch, io.predecode.valid)
  XSDebug(true.B, "[BPUS3]brIdx=%b brTakenIdx=%b brNTakenIdx=%b jalIdx=%b jalrIdx=%b callIdx=%b retIdx=%b\n",
    brIdx, brTakenIdx, brNotTakenIdx, jalIdx, jalrIdx, callIdx, retIdx)

  // BPU's TEMP Perf Cnt
  BoringUtils.addSource(io.out.valid, "MbpS3Cnt")
  BoringUtils.addSource(io.out.valid && io.out.bits.redirect, "MbpS3TageRed")
  BoringUtils.addSource(io.out.valid && (inLatch.btbPred.bits.redirect ^ jmpIdx.orR.asBool), "MbpS3TageRedDir")
  BoringUtils.addSource(io.out.valid && (inLatch.btbPred.bits.redirect
              && jmpIdx.orR.asBool && (io.out.bits.target =/= inLatch.btbPred.bits.target)), "MbpS3TageRedTar")
}
386
// Top-level branch prediction unit: a 3-stage pipeline.
//   Stage1 gives a quick BTB/JBTAC prediction (btbOut, consumed by the IFU);
//   Stage2 is a re-timing buffer;
//   Stage3 gives the TAGE/RAS-refined prediction (tageOut) and may flush
//   Stage1/Stage2 when it disagrees with Stage1.
class BPU extends XSModule {
  val io = IO(new Bundle() {
    // from backend
    // flush pipeline if misPred and update bpu based on redirect signals from brq
    val redirectInfo = Input(new RedirectInfo)

    val in = new Bundle { val pc = Flipped(Valid(UInt(VAddrBits.W))) }

    val btbOut = ValidIO(new BranchPrediction)
    val tageOut = ValidIO(new BranchPrediction)

    // predecode info from icache
    // TODO: simplify this after implement predecode unit
    val predecode = Flipped(ValidIO(new Predecode))
  })

  val s1 = Module(new BPUStage1)
  val s2 = Module(new BPUStage2)
  val s3 = Module(new BPUStage3)

  // Stage1/Stage2 are flushed by both a Stage3 re-prediction and a backend
  // redirect; Stage3 itself is only flushed by the backend.
  s1.io.redirectInfo <> io.redirectInfo
  s1.io.flush := s3.io.flushBPU || io.redirectInfo.flush()
  s1.io.in.pc.valid := io.in.pc.valid
  s1.io.in.pc.bits <> io.in.pc.bits
  io.btbOut <> s1.io.s1OutPred
  // history rollback path: Stage3 tells Stage1 how to rewind the GHR
  s1.io.s3RollBackHist := s3.io.s1RollBackHist
  s1.io.s3Taken := s3.io.s3Taken

  s1.io.out <> s2.io.in
  s2.io.flush := s3.io.flushBPU || io.redirectInfo.flush()

  s2.io.out <> s3.io.in
  s3.io.flush := io.redirectInfo.flush()
  s3.io.predecode <> io.predecode
  io.tageOut <> s3.io.out
  s3.io.redirectInfo <> io.redirectInfo

  // TODO: temp and ugly code, when perf counters is added( may after adding CSR), please mv the below counter
  // (event tag, padding for aligned printout); events arrive via BoringUtils
  val bpuPerfCntList = List(
    ("MbpInstr","         "),
    ("MbpRight","         "),
    ("MbpWrong","         "),
    ("MbpBRight","        "),
    ("MbpBWrong","        "),
    ("MbpJRight","        "),
    ("MbpJWrong","        "),
    ("MbpIRight","        "),
    ("MbpIWrong","        "),
    ("MbpRRight","        "),
    ("MbpRWrong","        "),
    ("MbpS3Cnt","         "),
    ("MbpS3TageRed","     "),
    ("MbpS3TageRedDir","  "),
    ("MbpS3TageRedTar","  ")
  )

  val bpuPerfCnts = List.fill(bpuPerfCntList.length)(RegInit(0.U(XLEN.W)))
  val bpuPerfCntConds = List.fill(bpuPerfCntList.length)(WireInit(false.B))
  (bpuPerfCnts zip bpuPerfCntConds) map { case (cnt, cond) => { when (cond) { cnt := cnt + 1.U }}}

  // each condition wire is driven by an addSource with the matching tag
  for(i <- bpuPerfCntList.indices) {
    BoringUtils.addSink(bpuPerfCntConds(i), bpuPerfCntList(i)._1)
  }

  val xsTrap = WireInit(false.B)
  BoringUtils.addSink(xsTrap, "XSTRAP_BPU")

  // dump the counters when the simulation traps
  // if (!p.FPGAPlatform) {
    when (xsTrap) {
      printf("=================BPU's PerfCnt================\n")
      for(i <- bpuPerfCntList.indices) {
        printf(bpuPerfCntList(i)._1 + bpuPerfCntList(i)._2 + " <- " + "%d\n", bpuPerfCnts(i))
      }
    }
  // }
}