xref: /XiangShan/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala (revision 708ceed4afe43fb0ea3a52407e46b2794c573634)
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.cache

import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import freechips.rocketchip.tilelink._
import bus.tilelink.TLMessages._
import difftest._
import huancun.{AliasKey, DirtyKey, PreferCacheKey, PrefetchKey}

class MissReq(implicit p: Parameters) extends DCacheBundle
{
  val source = UInt(sourceTypeWidth.W)
  val cmd    = UInt(M_SZ.W)
  // must be aligned to block
  val addr   = UInt(PAddrBits.W)
  val vaddr  = UInt(VAddrBits.W)

  // store
  val store_data   = UInt((cfg.blockBytes * 8).W)
  val store_mask   = UInt(cfg.blockBytes.W)

  // which word does amo work on?
  val word_idx = UInt(log2Up(blockWords).W)
  val amo_data = UInt(DataBits.W)
  val amo_mask = UInt((DataBits/8).W)

  // coherence state
  val coh = new ClientMetadata
  val id  = UInt(reqIdWidth.W)

  def dump() = {
    XSDebug("MissReq source: %d cmd: %d addr: %x store_data: %x store_mask: %x word_idx: %d amo_data: %x amo_mask: %x coh: %d id: %d\n",
      source, cmd, addr, store_data, store_mask, word_idx, amo_data, amo_mask, coh.state, id)
  }

  def isLoad = source === LOAD_SOURCE.U
  def isStore = source === STORE_SOURCE.U
}

// One miss entry deals with one missed block
class MissEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule {
  val io = IO(new Bundle {
    // MSHR ID
    val id = Input(UInt())

    // client requests
    // this entry is free and can be allocated to new reqs
    val primary_ready = Output(Bool())
    // this entry is busy, but it can merge the new req
    val secondary_ready = Output(Bool())
    // this entry is busy and it can not merge the new req
    val secondary_reject = Output(Bool())
    val req    = Flipped(ValidIO(new MissReq))
    val refill = ValidIO(new Refill)

    // bus
    val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle))
    val mem_grant   = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
    val mem_finish  = DecoupledIO(new TLBundleE(edge.bundle))

    val pipe_req  = DecoupledIO(new MainPipeReq)
    val pipe_resp = Flipped(ValidIO(new MainPipeResp))

    // block probe
    val block_addr = ValidIO(UInt(PAddrBits.W))
  })
  val tma_io = IO(new Bundle {
    val req    = Output(new MissReq)
    val state  = Output(UInt(5.W))
  })

  val req = Reg(new MissReq)
  val req_valid = RegInit(false.B)

  val s_acquire = RegInit(true.B)
  val s_grantack = RegInit(true.B)
  val s_pipe_req = RegInit(true.B)
  val w_grantfirst = RegInit(true.B)
  val w_grantlast = RegInit(true.B)
  val w_pipe_resp = RegInit(true.B)

  val no_schedule = s_grantack && s_pipe_req
  val no_wait = w_pipe_resp
  val release_entry = no_schedule && no_wait

  val acquire_not_sent = !s_acquire && !io.mem_acquire.ready
  val data_not_refilled = !w_grantlast
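  // Flag convention for the s_*/w_* registers above:
  //   s_x: "schedule x" - false while request x still has to be sent, set back to true
  //        once it fires (see the when blocks on mem_acquire / mem_finish / pipe_req below)
  //   w_x: "wait for x" - false while response x is still outstanding
  // All flags reset to true (idle). Allocation clears them; the entry is released
  // (release_entry) only once s_grantack, s_pipe_req and w_pipe_resp are all true again.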

  // should we refill the data to the load queue to wake up any missed load?
  val should_refill_data_reg = Reg(Bool())
  val should_refill_data = WireInit(should_refill_data_reg)

  val full_overwrite = req.isStore && req.store_mask.andR

  val (_, _, refill_done, refill_count) = edge.count(io.mem_grant)
  val grant_param = Reg(UInt(TLPermissions.bdWidth.W))

  val grant_beats = RegInit(0.U((beatBits).W))

  when (io.req.valid && io.primary_ready) {
    req_valid := true.B
    req := io.req.bits
    req.addr := get_block_addr(io.req.bits.addr)

    s_acquire := false.B
    s_grantack := false.B
    s_pipe_req := false.B
    w_grantfirst := false.B
    w_grantlast := false.B
    w_pipe_resp := false.B

    should_refill_data_reg := io.req.bits.isLoad

    grant_beats := 0.U
  }.elsewhen (release_entry) {
    req_valid := false.B
  }

  when (io.req.valid && io.secondary_ready) {
    // A merged req should never carry higher permissions than the one recorded here.
    // That would mean the cache silently upgraded the permission of our block
    // without merging with this miss queue request:
    // either our req came in with stale meta, or the req that upgraded the
    // permission failed to merge with this one. Both cases are DCache bugs.
    //
    // DCache may silently drop permissions (e.g. when probed or evicted),
    // but it should never silently upgrade them.
    assert (io.req.bits.coh.state <= req.coh.state)
    // use the most up-to-date meta
    req.coh := io.req.bits.coh

    // when merging with a store, we must record its info into our req,
    // otherwise we would not be able to replay the store later
    when (io.req.bits.isStore) {
      req := io.req.bits
    }

    should_refill_data := should_refill_data_reg || io.req.bits.isLoad
    should_refill_data_reg := should_refill_data
  }
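  // Illustrative note (not part of the original code, assuming rocket-chip's ClientStates
  // encoding where Nothing < Branch < Trunk < Dirty): if this entry recorded the block as
  // Branch, a merged req reporting Nothing or Branch is fine, but one reporting Trunk would
  // mean someone upgraded our permission behind our back and trips the assertion above.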

  // set state regs
  when (io.mem_acquire.fire()) {
    s_acquire := true.B
  }

  val refill_data = Reg(Vec(blockRows, UInt(rowBits.W)))
  val refill_data_raw = Reg(Vec(blockBytes/beatBytes, UInt(beatBits.W)))
  val new_data = Wire(Vec(blockRows, UInt(rowBits.W)))
  val new_mask = Wire(Vec(blockRows, UInt(rowBytes.W)))
  def mergePutData(old_data: UInt, new_data: UInt, wmask: UInt): UInt = {
    val full_wmask = FillInterleaved(8, wmask)
    (~full_wmask & old_data | full_wmask & new_data)
  }
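  // Worked example for mergePutData (illustrative only, assuming 64-bit rows): a store
  // writing the lower four bytes of a row,
  //   old_data = 0x1111222233334444, new_data = 0xaaaabbbbccccdddd, wmask = 0x0f
  //   full_wmask = FillInterleaved(8, 0x0f) = 0x00000000ffffffff
  //   result     = 0x11112222ccccdddd   (upper bytes kept, lower bytes overwritten)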
  for (i <- 0 until blockRows) {
    new_data(i) := req.store_data(rowBits * (i + 1) - 1, rowBits * i)
    // we only need to merge data for Store
    new_mask(i) := Mux(req.isStore, req.store_mask(rowBytes * (i + 1) - 1, rowBytes * i), 0.U)
  }
  val hasData = RegInit(true.B)
  val isDirty = RegInit(false.B)
  when (io.mem_grant.fire()) {
    w_grantfirst := true.B
    grant_param := io.mem_grant.bits.param
    when (edge.hasData(io.mem_grant.bits)) {
      // GrantData
      for (i <- 0 until beatRows) {
        val idx = (refill_count << log2Floor(beatRows)) + i.U
        val grant_row = io.mem_grant.bits.data(rowBits * (i + 1) - 1, rowBits * i)
        refill_data(idx) := mergePutData(grant_row, new_data(idx), new_mask(idx))
      }

      w_grantlast := w_grantlast || refill_done

      hasData := true.B

      grant_beats := grant_beats + 1.U
    }.otherwise {
      // Grant

      // Since we do not sync between MissQueue and WritebackQueue,
      // for an AcquireBlock BtoT we can not protect our block from being replaced
      // by another miss and written back by the WritebackQueue.
      // So for AcquireBlock BtoT we need L2 to give us GrantData, not Grant,
      // so that whether our block has been replaced or not, we can always refill it with valid data.
      // Therefore, if we get here, the request must have been an AcquirePerm, not an AcquireBlock!
      assert (full_overwrite)
      // when we only acquire perm, not data,
      // use the Store's data
      for (i <- 0 until blockRows) {
        refill_data(i) := new_data(i)
      }

      w_grantlast := true.B

      hasData := false.B
    }

    refill_data_raw(refill_count) := io.mem_grant.bits.data
    isDirty := io.mem_grant.bits.echo.lift(DirtyKey).getOrElse(false.B)
  }
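  // Illustrative note (assuming a 256-bit bus beat and 64-bit rows, i.e. beatRows = 4):
  // on the second GrantData beat (refill_count = 1), loop index i = 2 selects
  //   idx = (1 << log2Floor(4)) + 2 = 6
  //   grant_row = io.mem_grant.bits.data(191, 128)
  // so refill_data(6) receives that row, merged with any pending store bytes for it.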

  when (io.mem_finish.fire()) {
    s_grantack := true.B
  }

  when (io.pipe_req.fire()) {
    s_pipe_req := true.B
    assert(io.pipe_req.bits.vaddr =/= 0.U)
  }

  when (io.pipe_resp.valid) {
    w_pipe_resp := true.B
  }

//  def can_merge(new_req: MissReq): Bool = {
//    // caution: do not merge with AMO
//    // we can not do amoalu calculation in MissQueue
//    // so, we do not know the result after AMO calculation
//    // so do not merge with AMO
//
//    // before read acquire is fired, we can merge read or write
//    val before_read_sent = acquire_not_sent && req.source === LOAD_SOURCE.U && (new_req.source === LOAD_SOURCE.U || new_req.source === STORE_SOURCE.U)
//    // before read/write refills data to LoadQueue, we can merge any read
//    val before_data_refill = data_not_refilled && (req.source === LOAD_SOURCE.U || req.source === STORE_SOURCE.U) && new_req.source === LOAD_SOURCE.U
//
//    before_read_sent || before_data_refill
//  }

  def before_read_sent_can_merge(new_req: MissReq): Bool = {
    acquire_not_sent && req.source === LOAD_SOURCE.U && (new_req.source === LOAD_SOURCE.U || new_req.source === STORE_SOURCE.U)
  }

  def before_data_refill_can_merge(new_req: MissReq): Bool = {
    data_not_refilled && (req.source === LOAD_SOURCE.U || req.source === STORE_SOURCE.U) && new_req.source === LOAD_SOURCE.U
  }

  def should_merge(new_req: MissReq): Bool = {
    val block_match = req.addr === get_block_addr(new_req.addr)
    val beat_match = new_req.addr(blockOffBits - 1, beatOffBits) >= grant_beats
    block_match && (before_read_sent_can_merge(new_req) || beat_match && before_data_refill_can_merge(new_req))
  }

  def should_reject(new_req: MissReq): Bool = {
    val block_match = req.addr === get_block_addr(new_req.addr)
    // do not reject any req when we are in s_invalid
    block_match && !should_merge(new_req) && req_valid // TODO: optimize this
  }
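  // Illustrative scenario (assuming 64-byte blocks and 32-byte beats): this entry is
  // refilling block 0x8000 for a load and has received one GrantData beat (grant_beats = 1).
  //   - a new load to 0x8020: block matches and its beat index (1) >= grant_beats,
  //     so should_merge is true and the load piggybacks on this refill
  //   - a new load to 0x8008: its beat (0) has already been refilled and forwarded,
  //     so it can not merge and should_reject blocks it until this entry releases
  //   - a new store to 0x8020: stores may only merge before the Acquire is sent
  //     (before_read_sent_can_merge), so it is rejected here as well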

  io.primary_ready := !req_valid
  io.secondary_ready := should_merge(io.req.bits)
  io.secondary_reject := should_reject(io.req.bits)

  // should not allocate, merge or reject at the same time
  // one at a time
  OneHot.checkOneHot(Seq(io.primary_ready, io.secondary_ready, io.secondary_reject))

  // keep should_refill_data out of the RegNext,
  // so that when a load miss is merged at refill_done
  // we can still refill the data back
  //
  // Refill to the load queue is now l1BusDataWidth wide; the refill req to the
  // load queue is issued as soon as the data is ready (stored in regs in the miss queue)
  val refill_data_splited = WireInit(VecInit(Seq.tabulate(cfg.blockBytes * 8 / l1BusDataWidth)(i => {
    val data = refill_data.asUInt
    data((i + 1) * l1BusDataWidth - 1, i * l1BusDataWidth)
  })))
  io.refill.valid := RegNext(!w_grantlast && s_acquire && io.mem_grant.fire()) && should_refill_data
  io.refill.bits.addr := RegNext(req.addr + (refill_count << refillOffBits))
  io.refill.bits.data := refill_data_splited(RegNext(refill_count))
  io.refill.bits.refill_done := RegNext(refill_done && io.mem_grant.fire())
  io.refill.bits.hasdata := hasData
  io.refill.bits.data_raw := refill_data_raw.asUInt
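  // Illustrative note (assuming 64-byte blocks and a 256-bit l1BusDataWidth):
  // refill_data_splited has cfg.blockBytes * 8 / l1BusDataWidth = 512 / 256 = 2 slices.
  // Each GrantData beat is forwarded to the load queue one cycle after it arrives:
  // io.refill.valid is the registered mem_grant fire (gated by should_refill_data),
  // addr points at req.addr + (beat index << refillOffBits), and refill_done marks the last beat.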

  io.mem_acquire.valid := !s_acquire
  val grow_param = req.coh.onAccess(req.cmd)._2
  val acquireBlock = edge.AcquireBlock(
    fromSource = io.id,
    toAddress = req.addr,
    lgSize = (log2Up(cfg.blockBytes)).U,
    growPermissions = grow_param
  )._2
  val acquirePerm = edge.AcquirePerm(
    fromSource = io.id,
    toAddress = req.addr,
    lgSize = (log2Up(cfg.blockBytes)).U,
    growPermissions = grow_param
  )._2
  io.mem_acquire.bits := Mux(full_overwrite, acquirePerm, acquireBlock)
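  // Example of the selection above (illustrative): a store that overwrites the whole
  // block (full_overwrite, i.e. store_mask is all ones) does not need the old data,
  // so it issues AcquirePerm (e.g. NtoT when the block is not present at all) and the
  // refill data comes entirely from the store itself in the plain-Grant branch above.
  // Any load, partial store or AMO miss issues AcquireBlock so that L2 returns GrantData.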
  // resolve cache alias by L2
  io.mem_acquire.bits.user.lift(AliasKey).foreach( _ := req.vaddr(13, 12))
  // trigger prefetch
  io.mem_acquire.bits.user.lift(PrefetchKey).foreach(_ := true.B)
  // prefer not to cache data in L2 by default
  io.mem_acquire.bits.user.lift(PreferCacheKey).foreach(_ := false.B)
  require(nSets <= 256) // dcache size should not be more than 128KB
  io.mem_grant.ready := !w_grantlast && s_acquire
  val grantack = RegEnable(edge.GrantAck(io.mem_grant.bits), io.mem_grant.fire())
  val is_grant = RegEnable(edge.isRequest(io.mem_grant.bits), io.mem_grant.fire())
  io.mem_finish.valid := !s_grantack && w_grantfirst && is_grant
  io.mem_finish.bits := grantack

  io.pipe_req.valid := !s_pipe_req && w_grantlast
  val pipe_req = io.pipe_req.bits
  pipe_req.miss := true.B
  pipe_req.miss_id := io.id
  pipe_req.miss_param := grant_param
  pipe_req.miss_dirty := isDirty

  pipe_req.probe := false.B
  pipe_req.probe_param := DontCare
  pipe_req.probe_need_data := false.B

  pipe_req.source := req.source
  pipe_req.cmd    := req.cmd
  pipe_req.addr   := req.addr
  pipe_req.vaddr   := req.vaddr
  pipe_req.store_data := refill_data.asUInt
  // full overwrite
  pipe_req.store_mask := Fill(cfg.blockBytes, "b1".U)
  pipe_req.word_idx := req.word_idx
  pipe_req.amo_data   := req.amo_data
  pipe_req.amo_mask   := req.amo_mask
  pipe_req.id     := req.id

  io.block_addr.valid := req_valid && w_grantlast && !release_entry
  io.block_addr.bits := req.addr

  tma_io.req := req
  tma_io.state := DontCare // TODO

  XSPerfAccumulate("miss_req", io.req.valid && io.primary_ready)
  XSPerfAccumulate("miss_penalty", BoolStopWatch(io.req.valid && io.primary_ready, release_entry))
  XSPerfAccumulate("load_miss_penalty_to_use", should_refill_data && BoolStopWatch(io.req.valid && io.primary_ready, io.refill.valid, true))
  XSPerfAccumulate("pipeline_penalty", BoolStopWatch(io.pipe_req.fire(), io.pipe_resp.fire()))
  XSPerfAccumulate("penalty_blocked_by_channel_A", io.mem_acquire.valid && !io.mem_acquire.ready)
  XSPerfAccumulate("penalty_waiting_for_channel_D", s_acquire && !w_grantlast && !io.mem_grant.valid)
  XSPerfAccumulate("penalty_blocked_by_channel_E", io.mem_finish.valid && !io.mem_finish.ready)
  XSPerfAccumulate("penalty_blocked_by_pipeline", io.pipe_req.valid && !io.pipe_req.ready)

  val (mshr_penalty_sample, mshr_penalty) = TransactionLatencyCounter(RegNext(io.req.valid && io.primary_ready), release_entry)
  XSPerfHistogram("miss_penalty", mshr_penalty, mshr_penalty_sample, 0, 100, 10)

  val load_miss_begin = io.req.valid && io.primary_ready && io.req.bits.isLoad
  val refill_finished = RegNext(!w_grantlast && refill_done) && should_refill_data
  val (load_miss_penalty_sample, load_miss_penalty) = TransactionLatencyCounter(load_miss_begin, refill_finished) // not real refill finish time
  XSPerfHistogram("load_miss_penalty_to_use", load_miss_penalty, load_miss_penalty_sample, 0, 100, 10)

  val (a_to_d_penalty_sample, a_to_d_penalty) = TransactionLatencyCounter(io.mem_acquire.fire(), io.mem_grant.fire() && refill_done)
  XSPerfHistogram("a_to_d_penalty", a_to_d_penalty, a_to_d_penalty_sample, 0, 100, 10)
}

class MissQueue(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule with HasTLDump
{
  val io = IO(new Bundle {
    val req    = Flipped(DecoupledIO(new MissReq))
    val refill = ValidIO(new Refill)

    val mem_acquire = Decoupled(new TLBundleA(edge.bundle))
    val mem_grant   = Flipped(DecoupledIO(new TLBundleD(edge.bundle)))
    val mem_finish  = Decoupled(new TLBundleE(edge.bundle))

    val pipe_req  = DecoupledIO(new MainPipeReq)
    val pipe_resp = Flipped(ValidIO(new MainPipeResp))

    // block probe
    val probe_req = Input(UInt(PAddrBits.W))
    val probe_block = Output(Bool())

    val full = Output(Bool())
  })

  // 128KBL1: FIXME: provide vaddr for l2

  val pipe_req_arb = Module(new RRArbiter(new MainPipeReq, cfg.nMissEntries))
  val refill_arb   = Module(new Arbiter(new Refill, cfg.nMissEntries))

  // dispatch req to MSHR
  val primary_ready  = Wire(Vec(cfg.nMissEntries, Bool()))
  val secondary_ready  = Wire(Vec(cfg.nMissEntries, Bool()))
  val secondary_reject  = Wire(Vec(cfg.nMissEntries, Bool()))
  val probe_block_vec = Wire(Vec(cfg.nMissEntries, Bool()))

  // try merging with existing reqs
  val merge = secondary_ready.asUInt.orR
  val merge_idx = PriorityEncoder(secondary_ready)
  // some entry says the new request can not be merged
  val reject = secondary_reject.asUInt.orR
  // allocate a new entry for this req
  val allocate = !reject && !merge && primary_ready.asUInt.orR
  val alloc_idx = PriorityEncoder(primary_ready)

  // will this req be accepted?
  val accept = (merge || allocate) && !reject
  // if it's accepted, which entry will it enter
  val entry_idx = Mux(allocate, alloc_idx, merge_idx)
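  // Dispatch summary (illustrative, e.g. with 4 entries): if entry 2 already tracks the
  // missed block and can merge it (secondary_ready = b0100), the req merges into entry 2.
  // If entry 2 matches the block but can not merge (secondary_reject = b0100), the req is
  // rejected even though free entries exist, so one block never occupies two MSHRs.
  // Only when no entry matches does the req allocate the lowest-numbered free entry.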

  // for one block, there should be only one MSHR
  // one block should not stay in multiple MSHRs
  // if a req can not merge with existing reqs,
  // block it!
  OneHot.checkOneHot(secondary_ready)
  OneHot.checkOneHot(secondary_reject)
  // should not merge and reject at the same time
  OneHot.checkOneHot(Seq(merge, reject))

  io.req.ready := accept
  io.mem_grant.ready := false.B

  val entries = (0 until cfg.nMissEntries) map { i =>
    val entry = Module(new MissEntry(edge))

    entry.io.id := i.U(log2Up(cfg.nMissEntries).W)

    // entry req
    entry.io.req.valid  := (i.U === entry_idx) && accept && io.req.valid
    primary_ready(i)    := entry.io.primary_ready
    secondary_ready(i)  := entry.io.secondary_ready
    secondary_reject(i) := entry.io.secondary_reject
    probe_block_vec(i)  := entry.io.block_addr.valid && entry.io.block_addr.bits === io.probe_req
    entry.io.req.bits   := io.req.bits

    // entry refill
    refill_arb.io.in(i).valid := entry.io.refill.valid
    refill_arb.io.in(i).bits  := entry.io.refill.bits

    // pipe_req
    pipe_req_arb.io.in(i)     <> entry.io.pipe_req

    // pipe_resp
    entry.io.pipe_resp.valid  := false.B
    entry.io.pipe_resp.bits   := DontCare
    when (io.pipe_resp.bits.id === i.U) {
      entry.io.pipe_resp <> io.pipe_resp
    }

    entry.io.mem_grant.valid := false.B
    entry.io.mem_grant.bits  := DontCare
    when (io.mem_grant.bits.source === i.U) {
      entry.io.mem_grant <> io.mem_grant
    }

    /*
    XSPerf(
      "perfCntDCacheMissQueuePenaltyEntry" + Integer.toString(i, 10),
      BoolStopWatch(
        start = entry.io.req.fire(),
        stop = entry.io.resp.fire(),
        startHighPriority = true)
    )
    */

    entry
  }

  val pendingVec = entries.map(entry => (entry.tma_io.req.source =/= STORE_SOURCE.U) && (entry.tma_io.state =/= 0.U))
  ExcitingUtils.addSource(pendingVec.reduce(_||_), "TMA_l1miss")

  io.refill.valid := refill_arb.io.out.valid
  io.refill.bits  := refill_arb.io.out.bits
  refill_arb.io.out.ready := true.B

  if (!env.FPGAPlatform) {
    val difftest = Module(new DifftestRefillEvent)
    difftest.io.clock := clock
    difftest.io.coreid := hardId.U
    difftest.io.valid := io.refill.valid && io.refill.bits.hasdata && io.refill.bits.refill_done
    difftest.io.addr := io.refill.bits.addr
    difftest.io.data := io.refill.bits.data_raw.asTypeOf(difftest.io.data)
  }

  // one refill at a time
  OneHot.checkOneHot(refill_arb.io.in.map(r => r.valid))

  TLArbiter.lowest(edge, io.mem_acquire, entries.map(_.io.mem_acquire):_*)
  TLArbiter.lowest(edge, io.mem_finish,  entries.map(_.io.mem_finish):_*)

  io.pipe_req <> pipe_req_arb.io.out

  io.probe_block := probe_block_vec.asUInt.orR

  // print all input/output requests for debug purposes

  when (io.req.fire()) {
    io.req.bits.dump()
    // sanity check
    val source = io.req.bits.source
    val cmd = io.req.bits.cmd
    when (source === LOAD_SOURCE.U) {
      assert (cmd === M_XRD)
    }
    when (source === STORE_SOURCE.U) {
      assert (cmd === M_XWR)
    }

    when (source === AMO_SOURCE.U) {
      assert (
        cmd === M_XA_SWAP ||
        cmd === M_XLR     ||
        cmd === M_XSC     ||
        cmd === M_XA_ADD  ||
        cmd === M_XA_XOR  ||
        cmd === M_XA_OR   ||
        cmd === M_XA_AND  ||
        cmd === M_XA_MIN  ||
        cmd === M_XA_MAX  ||
        cmd === M_XA_MINU ||
        cmd === M_XA_MAXU)
    }
    // req addr must be aligned to block boundary
//    assert (io.req.bits.addr(blockOffBits - 1, 0) === 0.U)
  }

  when (io.refill.fire()) {
    io.refill.bits.dump()
  }

  when (io.mem_acquire.fire()) {
    XSDebug("mem_acquire ")
    io.mem_acquire.bits.dump
  }

  when (io.mem_grant.fire()) {
    XSDebug("mem_grant ")
    io.mem_grant.bits.dump
  }

  when (io.mem_finish.fire()) {
    XSDebug("mem_finish ")
    io.mem_finish.bits.dump
  }

  when (io.probe_block) {
    XSDebug(p"block probe req ${Hexadecimal(io.probe_req)}\n")
  }

  XSPerfAccumulate("miss_req", io.req.fire())
  XSPerfAccumulate("miss_req_allocate", io.req.fire() && allocate)
  XSPerfAccumulate("miss_req_merge_load", io.req.fire() && merge && !reject && io.req.bits.isLoad)
  XSPerfAccumulate("miss_req_reject_load", io.req.valid && reject && io.req.bits.isLoad)
  XSPerfAccumulate("probe_blocked_by_miss", io.probe_block)
  val max_inflight = RegInit(0.U((log2Up(cfg.nMissEntries) + 1).W))
  val num_valids = PopCount(~primary_ready.asUInt)
  when (num_valids > max_inflight) {
    max_inflight := num_valids
  }
  // max inflight (average) = max_inflight_total / cycle cnt
  XSPerfAccumulate("max_inflight", max_inflight)
  QueuePerf(cfg.nMissEntries, num_valids, num_valids === cfg.nMissEntries.U)
  io.full := num_valids === cfg.nMissEntries.U
  XSPerfHistogram("num_valids", num_valids, true.B, 0, cfg.nMissEntries, 1)
}