// xref: /XiangShan/src/main/scala/xiangshan/mem/prefetch/L1PrefetchComponent.scala (revision f7063a43ab34da917ba6c670d21871314340c550)
package xiangshan.mem.prefetch

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import utility._
import xiangshan.cache.HasDCacheParameters
import xiangshan.cache.mmu._
import xiangshan.mem.{L1PrefetchReq, LdPrefetchTrainBundle}
import xiangshan.mem.trace._
import xiangshan.mem.L1PrefetchSource

trait HasL1PrefetchHelper extends HasCircularQueuePtrHelper with HasDCacheParameters {
  // region related
  val REGION_SIZE = 1024
  val PAGE_OFFSET = 12
  val BLOCK_OFFSET = log2Up(dcacheParameters.blockBytes)
  val BIT_VEC_WITDH = REGION_SIZE / dcacheParameters.blockBytes
  val REGION_BITS = log2Up(BIT_VEC_WITDH)
  val REGION_TAG_OFFSET = BLOCK_OFFSET + REGION_BITS
  val REGION_TAG_BITS = VAddrBits - BLOCK_OFFSET - REGION_BITS
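  // e.g. assuming 64 B cache blocks and Sv39 (VAddrBits = 39): BLOCK_OFFSET = 6,
  // BIT_VEC_WITDH = 16, REGION_BITS = 4, REGION_TAG_OFFSET = 10, REGION_TAG_BITS = 29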

  // hash related
  val VADDR_HASH_WIDTH = 5
  val BLK_ADDR_RAW_WIDTH = 10
  val HASH_TAG_WIDTH = VADDR_HASH_WIDTH + BLK_ADDR_RAW_WIDTH
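  // i.e. a 15-bit tag: the low BLK_ADDR_RAW_WIDTH bits kept raw, plus a VADDR_HASH_WIDTH-bit
  // XOR fold of the next 3 * VADDR_HASH_WIDTH bits (see the *_hash_tag helpers below)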

  // capacity related
  val MLP_SIZE = 16

  // prefetch sink related
  val SINK_BITS = 2
  def SINK_L1 = "b00".U
  def SINK_L2 = "b01".U
  def SINK_L3 = "b10".U

  // vaddr: |       region tag        |  region bits  | block offset |
  def get_region_tag(vaddr: UInt) = {
    require(vaddr.getWidth == VAddrBits)
    vaddr(vaddr.getWidth - 1, REGION_TAG_OFFSET)
  }

  def get_region_bits(vaddr: UInt) = {
    require(vaddr.getWidth == VAddrBits)
    vaddr(REGION_TAG_OFFSET - 1, BLOCK_OFFSET)
  }

  def block_addr(x: UInt): UInt = {
    x(x.getWidth - 1, BLOCK_OFFSET)
  }

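  // XOR-fold the low 3 * VADDR_HASH_WIDTH bits of x down to VADDR_HASH_WIDTH bits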
  def vaddr_hash(x: UInt): UInt = {
    val width = VADDR_HASH_WIDTH
    val low = x(width - 1, 0)
    val mid = x(2 * width - 1, width)
    val high = x(3 * width - 1, 2 * width)
    low ^ mid ^ high
  }

  def pc_hash_tag(x: UInt): UInt = {
    val low = x(BLK_ADDR_RAW_WIDTH - 1, 0)
    val high = x(BLK_ADDR_RAW_WIDTH - 1 + 3 * VADDR_HASH_WIDTH, BLK_ADDR_RAW_WIDTH)
    val high_hash = vaddr_hash(high)
    Cat(high_hash, low)
  }

  def block_hash_tag(x: UInt): UInt = {
    val blk_addr = block_addr(x)
    val low = blk_addr(BLK_ADDR_RAW_WIDTH - 1, 0)
    val high = blk_addr(BLK_ADDR_RAW_WIDTH - 1 + 3 * VADDR_HASH_WIDTH, BLK_ADDR_RAW_WIDTH)
    val high_hash = vaddr_hash(high)
    Cat(high_hash, low)
  }

  def region_hash_tag(region_tag: UInt): UInt = {
    val low = region_tag(BLK_ADDR_RAW_WIDTH - 1, 0)
    val high = region_tag(BLK_ADDR_RAW_WIDTH - 1 + 3 * VADDR_HASH_WIDTH, BLK_ADDR_RAW_WIDTH)
    val high_hash = vaddr_hash(high)
    Cat(high_hash, low)
  }

  def region_to_block_addr(region_tag: UInt, region_bits: UInt): UInt = {
    Cat(region_tag, region_bits)
  }

  def get_candidate_oh(x: UInt): UInt = {
    require(x.getWidth == PAddrBits)
    UIntToOH(x(REGION_BITS + BLOCK_OFFSET - 1, BLOCK_OFFSET))
  }

  def toBinary(n: Int): String = n match {
    case 0|1 => s"$n"
    case _   => s"${toBinary(n/2)}${n%2}"
  }
}

trait HasTrainFilterHelper extends HasCircularQueuePtrHelper {
  def reorder[T <: LdPrefetchTrainBundle](source: Vec[ValidIO[T]]): Vec[ValidIO[T]] = {
    if(source.length == 1) {
      source
    }else if(source.length == 2) {
      val source_v = source.map(_.valid)
      val res = Wire(source.cloneType)
      // source 1 is older than source 0 (only when source0/1 are both valid)
      val source_1_older = Mux(Cat(source_v).andR,
        isBefore(source(1).bits.uop.robIdx, source(0).bits.uop.robIdx),
        false.B
      )
      when(source_1_older) {
        res(0) := source(1)
        res(1) := source(0)
      }.otherwise {
        res := source
      }

      res
    }else if(source.length == 3) {
      // TODO: generalize
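      // three pairwise compare-and-swap steps on robIdx age: (0,1), then (1,2), then (0,1);
      // a pair is only swapped when both of its elements are valid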
      val res_0_1 = Wire(source.cloneType)
      val res_1_2 = Wire(source.cloneType)
      val res = Wire(source.cloneType)

      val tmp = reorder(VecInit(source.slice(0, 2)))
      res_0_1(0) := tmp(0)
      res_0_1(1) := tmp(1)
      res_0_1(2) := source(2)
      val tmp_1 = reorder(VecInit(res_0_1.slice(1, 3)))
      res_1_2(0) := res_0_1(0)
      res_1_2(1) := tmp_1(0)
      res_1_2(2) := tmp_1(1)
      val tmp_2 = reorder(VecInit(res_1_2.slice(0, 2)))
      res(0) := tmp_2(0)
      res(1) := tmp_2(1)
      res(2) := res_1_2(2)

      res
    }else {
      require(false, "for now, 4 or more sources are invalid")
      source
    }
  }
}

// get prefetch train reqs from `backendParams.LduCnt` load pipelines (up to `backendParams.LduCnt`/cycle)
// filter by cache-line address, then send one train req per cycle to a prefetch meta array (stride or stream)
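// Typical wiring (mirrors L1Prefetcher at the end of this file), sketched with generic names:
//   val filter = Module(new TrainFilter(STRIDE_FILTER_SIZE, "stride"))
//   filter.io.ld_in(i).valid := io.ld_in(i).valid && enable
//   filter.io.ld_in(i).bits  := io.ld_in(i).bits
//   meta_array.io.train_req <> filter.io.train_req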
class TrainFilter(size: Int, name: String)(implicit p: Parameters) extends XSModule with HasL1PrefetchHelper with HasTrainFilterHelper {
  val io = IO(new Bundle() {
    val enable = Input(Bool())
    val flush = Input(Bool())
    // train input, only from load for now
    val ld_in = Flipped(Vec(backendParams.LduCnt, ValidIO(new LdPrefetchTrainBundle())))
    // filter out
    val train_req = DecoupledIO(new PrefetchReqBundle())
  })

  class Ptr(implicit p: Parameters) extends CircularQueuePtr[Ptr]( p => size ){}
  object Ptr {
    def apply(f: Bool, v: UInt)(implicit p: Parameters): Ptr = {
      val ptr = Wire(new Ptr)
      ptr.flag := f
      ptr.value := v
      ptr
    }
  }

  val entries = RegInit(VecInit(Seq.fill(size){ (0.U.asTypeOf(new PrefetchReqBundle())) }))
  val valids = RegInit(VecInit(Seq.fill(size){ (false.B) }))

  // enq
  val enqLen = backendParams.LduCnt
  val enqPtrExt = RegInit(VecInit((0 until enqLen).map(_.U.asTypeOf(new Ptr))))
  val deqPtrExt = RegInit(0.U.asTypeOf(new Ptr))

  val deqPtr = WireInit(deqPtrExt.value)

  require(size >= enqLen)

  val ld_in_reordered = reorder(io.ld_in)
  val reqs_l = ld_in_reordered.map(_.bits.asPrefetchReqBundle())
  val reqs_vl = ld_in_reordered.map(_.valid)
  val needAlloc = Wire(Vec(enqLen, Bool()))
  val canAlloc = Wire(Vec(enqLen, Bool()))
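  // a req allocates a new entry only if its cache-line hash matches neither an existing
  // valid entry nor an earlier req enqueued in the same cycle, and only while the queue
  // still has a free slot for it (its alloc pointer has not passed deqPtrExt)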

  for(i <- (0 until enqLen)) {
    val req = reqs_l(i)
    val req_v = reqs_vl(i)
    val index = PopCount(needAlloc.take(i))
    val allocPtr = enqPtrExt(index)
    val entry_match = Cat(entries.zip(valids).map {
      case(e, v) => v && block_hash_tag(e.vaddr) === block_hash_tag(req.vaddr)
    }).orR
    val prev_enq_match = if(i == 0) false.B else Cat(reqs_l.zip(reqs_vl).take(i).map {
      case(pre, pre_v) => pre_v && block_hash_tag(pre.vaddr) === block_hash_tag(req.vaddr)
    }).orR

    needAlloc(i) := req_v && !entry_match && !prev_enq_match
    canAlloc(i) := needAlloc(i) && allocPtr >= deqPtrExt && io.enable

    when(canAlloc(i)) {
      valids(allocPtr.value) := true.B
      entries(allocPtr.value) := req
    }
  }
  val allocNum = PopCount(canAlloc)

  enqPtrExt.foreach{case x => x := x + allocNum}

  // deq
  io.train_req.valid := false.B
  io.train_req.bits := DontCare
  valids.zip(entries).zipWithIndex.foreach {
    case((valid, entry), i) => {
      when(deqPtr === i.U) {
        io.train_req.valid := valid && io.enable
        io.train_req.bits := entry
      }
    }
  }

  when(io.train_req.fire) {
    valids(deqPtr) := false.B
    deqPtrExt := deqPtrExt + 1.U
  }

  when(RegNext(io.flush)) {
    valids.foreach {case valid => valid := false.B}
    (0 until enqLen).map {case i => enqPtrExt(i) := i.U.asTypeOf(new Ptr)}
    deqPtrExt := 0.U.asTypeOf(new Ptr)
  }

  XSPerfAccumulate(s"${name}_train_filter_full", PopCount(valids) === size.U)
  XSPerfAccumulate(s"${name}_train_filter_half", PopCount(valids) >= (size / 2).U)
  XSPerfAccumulate(s"${name}_train_filter_empty", PopCount(valids) === 0.U)

  val raw_enq_pattern = Cat(reqs_vl)
  val filtered_enq_pattern = Cat(needAlloc)
  val actual_enq_pattern = Cat(canAlloc)
  XSPerfAccumulate(s"${name}_train_filter_enq", allocNum > 0.U)
  XSPerfAccumulate(s"${name}_train_filter_deq", io.train_req.fire)
  for(i <- 0 until (1 << enqLen)) {
    XSPerfAccumulate(s"${name}_train_filter_raw_enq_pattern_${toBinary(i)}", raw_enq_pattern === i.U)
    XSPerfAccumulate(s"${name}_train_filter_filtered_enq_pattern_${toBinary(i)}", filtered_enq_pattern === i.U)
    XSPerfAccumulate(s"${name}_train_filter_actual_enq_pattern_${toBinary(i)}", actual_enq_pattern === i.U)
  }
}

class MLPReqFilterBundle(implicit p: Parameters) extends XSBundle with HasL1PrefetchHelper {
  val tag = UInt(HASH_TAG_WIDTH.W)
  val region = UInt(REGION_TAG_BITS.W)
  val bit_vec = UInt(BIT_VEC_WITDH.W)
  // NOTE: the L1 sink does not check sent_vec, so that more prefetch reqs can be issued to the L1 dcache
  val sent_vec = UInt(BIT_VEC_WITDH.W)
  val sink = UInt(SINK_BITS.W)
  val alias = UInt(2.W)
  val is_vaddr = Bool()
  val source = new L1PrefetchSource()

  def reset(index: Int) = {
    tag := region_hash_tag(index.U)
    region := index.U
    bit_vec := 0.U
    sent_vec := 0.U
    sink := SINK_L1
    alias := 0.U
    is_vaddr := false.B
    source.value := L1_HW_PREFETCH_NULL
  }

  def tag_match(new_tag: UInt): Bool = {
    require(new_tag.getWidth == HASH_TAG_WIDTH)
    tag === new_tag
  }

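  // a numerically smaller sink is closer to the core (SINK_L1 < SINK_L2 < SINK_L3); when an
  // update moves an entry to a closer sink, blocks already sent to the farther level are
  // dropped from bit_vec and the remaining pending plus the new blocks are kept, otherwise
  // the new blocks are simply OR-ed in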
  def update(update_bit_vec: UInt, update_sink: UInt) = {
    bit_vec := bit_vec | update_bit_vec
    when(update_sink < sink) {
      bit_vec := (bit_vec & ~sent_vec) | update_bit_vec
      sink := update_sink
    }

    assert(PopCount(update_bit_vec) >= 1.U, "the update vector should have at least one valid bit")
  }

  def can_send_pf(): Bool = {
    Mux(
      sink === SINK_L1,
      !is_vaddr && bit_vec.orR,
      !is_vaddr && (bit_vec & ~sent_vec).orR
    )
  }

  def get_pf_addr(): UInt = {
    require(PAddrBits <= VAddrBits)
    require((region.getWidth + REGION_BITS + BLOCK_OFFSET) == VAddrBits)

    val candidate = Mux(
      sink === SINK_L1,
      PriorityEncoder(bit_vec).asTypeOf(UInt(REGION_BITS.W)),
      PriorityEncoder(bit_vec & ~sent_vec).asTypeOf(UInt(REGION_BITS.W))
    )
    Cat(region, candidate, 0.U(BLOCK_OFFSET.W))
  }

  def get_tlb_va(): UInt = {
    require((region.getWidth + REGION_TAG_OFFSET) == VAddrBits)
    Cat(region, 0.U(REGION_TAG_OFFSET.W))
  }

  def fromStreamPrefetchReqBundle(x : StreamPrefetchReqBundle): MLPReqFilterBundle = {
    require(PAGE_OFFSET >= REGION_TAG_OFFSET, "if the region were larger than a 4K page, the alias bits would be incorrect")
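    // alias is taken from the two region-tag bits just above the 4K page offset
    // (vaddr bits 13:12 with 1 KiB regions), presumably for dcache VIPT alias handling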

    val res = Wire(new MLPReqFilterBundle)
    res.tag := region_hash_tag(x.region)
    res.region := x.region
    res.bit_vec := x.bit_vec
    res.sent_vec := 0.U
    res.sink := x.sink
    res.is_vaddr := true.B
    res.source := x.source
    res.alias := x.region(PAGE_OFFSET - REGION_TAG_OFFSET + 1, PAGE_OFFSET - REGION_TAG_OFFSET)

    res
  }

  def invalidate() = {
    // disable sending pf req
    when(sink === SINK_L1) {
      bit_vec := 0.U(BIT_VEC_WITDH.W)
    }.otherwise {
      sent_vec := ~(0.U(BIT_VEC_WITDH.W))
    }
    // disable sending tlb req
    is_vaddr := false.B
  }
}

// there are 5 independent pipelines inside
// 1. prefetch enqueue
// 2. tlb request
// 3. actual l1 prefetch
// 4. actual l2 prefetch
// 5. actual l3 prefetch
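// entries are looked up by region hash tag; on a miss a PLRU-chosen victim is overwritten,
// on a hit the existing entry's bit_vec/sink are merged via update()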
class MutiLevelPrefetchFilter(implicit p: Parameters) extends XSModule with HasL1PrefetchHelper {
  val io = IO(new XSBundle {
    val enable = Input(Bool())
    val flush = Input(Bool())
    val prefetch_req = Flipped(ValidIO(new StreamPrefetchReqBundle))
    val tlb_req = new TlbRequestIO(nRespDups = 2)
    val l1_req = DecoupledIO(new L1PrefetchReq())
    val l2_pf_addr = ValidIO(new L2PrefetchReq())
    val l3_pf_addr = ValidIO(UInt(PAddrBits.W)) // TODO: l3 pf source
    val confidence = Input(UInt(1.W))
    val l2PfqBusy = Input(Bool())
  })

  val array = Reg(Vec(MLP_SIZE, new MLPReqFilterBundle))
  val replacement = ReplacementPolicy.fromString("plru", MLP_SIZE)
  val tlb_req_arb = Module(new RRArbiterInit(new TlbReq, MLP_SIZE))
  val l1_pf_req_arb = Module(new RRArbiterInit(new L1PrefetchReq, MLP_SIZE))
  val l2_pf_req_arb = Module(new RRArbiterInit(new L2PrefetchReq, MLP_SIZE))
  val l3_pf_req_arb = Module(new RRArbiterInit(UInt(PAddrBits.W), MLP_SIZE))

  // enq
  // s0: hash tag match
  val s0_can_accept = Wire(Bool())
  val s0_valid = io.prefetch_req.valid && s0_can_accept
  val s0_region = io.prefetch_req.bits.region
  val s0_region_hash = region_hash_tag(s0_region)
  val s0_match_vec = array.map(_.tag_match(s0_region_hash))
  val s0_hit = VecInit(s0_match_vec).asUInt.orR
  val s0_index = Mux(s0_hit, OHToUInt(VecInit(s0_match_vec).asUInt), replacement.way)
  val s0_prefetch_req = (new MLPReqFilterBundle).fromStreamPrefetchReqBundle(io.prefetch_req.bits)

  when(s0_valid) {
    replacement.access(s0_index)
  }

  assert(!s0_valid || PopCount(VecInit(s0_match_vec)) <= 1.U, "req region should match no more than 1 entry")
  assert(!(s0_valid && RegNext(s0_valid) && !s0_hit && !RegNext(s0_hit) && replacement.way === RegNext(replacement.way)), "replacement error")

  XSPerfAccumulate("s0_enq_fire", s0_valid)
  XSPerfAccumulate("s0_enq_valid", io.prefetch_req.valid)
  XSPerfAccumulate("s0_cannot_enq", io.prefetch_req.valid && !s0_can_accept)

  // s1: alloc or update
  val s1_valid = RegNext(s0_valid)
  val s1_region = RegEnable(s0_region, s0_valid)
  val s1_region_hash = RegEnable(s0_region_hash, s0_valid)
  val s1_hit = RegEnable(s0_hit, s0_valid)
  val s1_index = RegEnable(s0_index, s0_valid)
  val s1_prefetch_req = RegEnable(s0_prefetch_req, s0_valid)
  val s1_alloc = s1_valid && !s1_hit
  val s1_update = s1_valid && s1_hit
  s0_can_accept := !(s1_valid && s1_alloc && (s0_region_hash === s1_region_hash))

  when(s1_alloc) {
    array(s1_index) := s1_prefetch_req
  }.elsewhen(s1_update) {
    array(s1_index).update(
      update_bit_vec = s1_prefetch_req.bit_vec,
      update_sink = s1_prefetch_req.sink
    )
  }

  // TODO: relax this constraint to support more prefetch depth settings
  // assert(!(s0_valid && s1_valid && s0_region === s1_region), "s0 and s1 must have different region")

  XSPerfAccumulate("s1_enq_valid", s1_valid)
  XSPerfAccumulate("s1_enq_alloc", s1_alloc)
  XSPerfAccumulate("s1_enq_update", s1_update)
  XSPerfAccumulate("hash_conflict", s0_valid && RegNext(s1_valid) && (s0_region =/= RegNext(s1_region)) && (s0_region_hash === RegNext(s1_region_hash)))

  // tlb req
  // s0: arb all tlb reqs
  val s0_tlb_fire_vec = VecInit((0 until MLP_SIZE).map{case i => tlb_req_arb.io.in(i).fire})
  val s1_tlb_fire_vec = RegNext(s0_tlb_fire_vec)
  val s2_tlb_fire_vec = RegNext(s1_tlb_fire_vec)
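  // an entry that issued a tlb req in the previous two cycles still has a resp in flight,
  // so it must not request again; an entry being overwritten by s1_alloc is masked off too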

  for(i <- 0 until MLP_SIZE) {
    val evict = s1_alloc && (s1_index === i.U)
    tlb_req_arb.io.in(i).valid := array(i).is_vaddr && !s1_tlb_fire_vec(i) && !s2_tlb_fire_vec(i) && !evict
    tlb_req_arb.io.in(i).bits.vaddr := array(i).get_tlb_va()
    tlb_req_arb.io.in(i).bits.cmd := TlbCmd.read
    tlb_req_arb.io.in(i).bits.size := 3.U
    tlb_req_arb.io.in(i).bits.kill := false.B
    tlb_req_arb.io.in(i).bits.no_translate := false.B
    tlb_req_arb.io.in(i).bits.memidx := DontCare
    tlb_req_arb.io.in(i).bits.debug := DontCare
    tlb_req_arb.io.in(i).bits.hlvx := DontCare
    tlb_req_arb.io.in(i).bits.hyperinst := DontCare
  }

  assert(PopCount(s0_tlb_fire_vec) <= 1.U, "s0_tlb_fire_vec should be one-hot or empty")

  // s1: send out the req
  val s1_tlb_req_valid = RegNext(tlb_req_arb.io.out.valid)
  val s1_tlb_req_bits = RegEnable(tlb_req_arb.io.out.bits, tlb_req_arb.io.out.valid)
  val s1_tlb_req_index = RegEnable(OHToUInt(s0_tlb_fire_vec.asUInt), tlb_req_arb.io.out.valid)
  val s1_tlb_evict = s1_alloc && (s1_index === s1_tlb_req_index)
  io.tlb_req.req.valid := s1_tlb_req_valid && !s1_tlb_evict
  io.tlb_req.req.bits := s1_tlb_req_bits
  io.tlb_req.req_kill := false.B
  tlb_req_arb.io.out.ready := true.B

  XSPerfAccumulate("s1_tlb_req_sent", io.tlb_req.req.valid)
  XSPerfAccumulate("s1_tlb_req_evict", s1_tlb_req_valid && s1_tlb_evict)

  // s2: get response from tlb
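  // on a tlb hit the entry's region is rewritten with the physical region and is_vaddr is
  // cleared; a load page fault or access fault invalidates the entry instead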
  val s2_tlb_resp = io.tlb_req.resp
  val s2_tlb_update_index = RegEnable(s1_tlb_req_index, s1_tlb_req_valid)
  val s2_tlb_evict = s1_alloc && (s1_index === s2_tlb_update_index)
  when(s2_tlb_resp.valid && !s2_tlb_evict) {
    array(s2_tlb_update_index).is_vaddr := s2_tlb_resp.bits.miss

    when(!s2_tlb_resp.bits.miss) {
      array(s2_tlb_update_index).region := Cat(0.U((VAddrBits - PAddrBits).W), s2_tlb_resp.bits.paddr.head(s2_tlb_resp.bits.paddr.head.getWidth - 1, REGION_TAG_OFFSET))
      when(s2_tlb_resp.bits.excp.head.pf.ld || s2_tlb_resp.bits.excp.head.af.ld) {
        array(s2_tlb_update_index).invalidate()
      }
    }
  }
  s2_tlb_resp.ready := true.B

  XSPerfAccumulate("s2_tlb_resp_valid", s2_tlb_resp.valid)
  XSPerfAccumulate("s2_tlb_resp_evict", s2_tlb_resp.valid && s2_tlb_evict)
  XSPerfAccumulate("s2_tlb_resp_miss", s2_tlb_resp.valid && !s2_tlb_evict && s2_tlb_resp.bits.miss)
  XSPerfAccumulate("s2_tlb_resp_updated", s2_tlb_resp.valid && !s2_tlb_evict && !s2_tlb_resp.bits.miss)
  XSPerfAccumulate("s2_tlb_resp_page_fault", s2_tlb_resp.valid && !s2_tlb_evict && !s2_tlb_resp.bits.miss && s2_tlb_resp.bits.excp.head.pf.ld)
  XSPerfAccumulate("s2_tlb_resp_access_fault", s2_tlb_resp.valid && !s2_tlb_evict && !s2_tlb_resp.bits.miss && s2_tlb_resp.bits.excp.head.af.ld)

  // l1 pf
  // s0: generate prefetch req paddr per entry, arb them
  val s0_pf_fire_vec = VecInit((0 until MLP_SIZE).map{case i => l1_pf_req_arb.io.in(i).fire})
  val s1_pf_fire_vec = RegNext(s0_pf_fire_vec)

  val s0_pf_fire = l1_pf_req_arb.io.out.fire
  val s0_pf_index = l1_pf_req_arb.io.chosen
  val s0_pf_candidate_oh = get_candidate_oh(l1_pf_req_arb.io.out.bits.paddr)

  for(i <- 0 until MLP_SIZE) {
    val evict = s1_alloc && (s1_index === i.U)
    l1_pf_req_arb.io.in(i).valid := array(i).can_send_pf() && (array(i).sink === SINK_L1) && !evict
    l1_pf_req_arb.io.in(i).bits.paddr := array(i).get_pf_addr()
    l1_pf_req_arb.io.in(i).bits.alias := array(i).alias
    l1_pf_req_arb.io.in(i).bits.confidence := io.confidence
    l1_pf_req_arb.io.in(i).bits.is_store := false.B
    l1_pf_req_arb.io.in(i).bits.pf_source := array(i).source
  }

  when(s0_pf_fire) {
    array(s0_pf_index).sent_vec := array(s0_pf_index).sent_vec | s0_pf_candidate_oh
  }

  assert(PopCount(s0_pf_fire_vec) <= 1.U, "s0_pf_fire_vec should be one-hot or empty")

  // s1: send out to dcache
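  // a block that wins arbitration in s0 is marked in sent_vec right away; it is removed
  // from bit_vec only once the req actually fires into dcache in s1, and the s1 req is
  // dropped if the enq pipeline re-allocates or updates the same entry in that cycle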
  val s1_pf_valid = Reg(Bool())
  val s1_pf_bits = RegEnable(l1_pf_req_arb.io.out.bits, l1_pf_req_arb.io.out.fire)
  val s1_pf_index = RegEnable(s0_pf_index, l1_pf_req_arb.io.out.fire)
  val s1_pf_candidate_oh = RegEnable(s0_pf_candidate_oh, l1_pf_req_arb.io.out.fire)
  val s1_pf_evict = s1_alloc && (s1_index === s1_pf_index)
  val s1_pf_update = s1_update && (s1_index === s1_pf_index)
  val s1_pf_can_go = io.l1_req.ready && !s1_pf_evict && !s1_pf_update
  val s1_pf_fire = s1_pf_valid && s1_pf_can_go

  when(s1_pf_can_go) {
    s1_pf_valid := false.B
  }

  when(l1_pf_req_arb.io.out.fire) {
    s1_pf_valid := true.B
  }

  when(s1_pf_fire) {
    array(s1_pf_index).bit_vec := array(s1_pf_index).bit_vec & ~s1_pf_candidate_oh
  }

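  // 0x8000_0000 is the DRAM base in the default XiangShan memory map, so prefetches to
  // lower (MMIO) addresses are suppressed here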
  io.l1_req.valid := s1_pf_valid && !s1_pf_evict && !s1_pf_update && (s1_pf_bits.paddr >= 0x80000000L.U) && io.enable
  io.l1_req.bits := s1_pf_bits

  l1_pf_req_arb.io.out.ready := s1_pf_can_go || !s1_pf_valid

  assert(!((s1_alloc || s1_update) && s1_pf_fire && (s1_index === s1_pf_index)), "pf pipeline & enq pipeline bit_vec hazard!")

  XSPerfAccumulate("s1_pf_valid", s1_pf_valid)
  XSPerfAccumulate("s1_pf_block_by_pipe_unready", s1_pf_valid && !io.l1_req.ready)
  XSPerfAccumulate("s1_pf_block_by_enq_alloc_harzard", s1_pf_valid && s1_pf_evict)
  XSPerfAccumulate("s1_pf_block_by_enq_update_harzard", s1_pf_valid && s1_pf_update)
  XSPerfAccumulate("s1_pf_fire", s1_pf_fire)

  // l2 pf
  // s0: generate prefetch req paddr per entry, arb them, send them out
  io.l2_pf_addr.valid := l2_pf_req_arb.io.out.valid
  io.l2_pf_addr.bits := l2_pf_req_arb.io.out.bits

  l2_pf_req_arb.io.out.ready := true.B

  for(i <- 0 until MLP_SIZE) {
    val evict = s1_alloc && (s1_index === i.U)
    l2_pf_req_arb.io.in(i).valid := array(i).can_send_pf() && (array(i).sink === SINK_L2) && !evict
    l2_pf_req_arb.io.in(i).bits.addr := array(i).get_pf_addr()
    l2_pf_req_arb.io.in(i).bits.source := MuxLookup(array(i).source.value, MemReqSource.Prefetch2L2Unknown.id.U)(Seq(
      L1_HW_PREFETCH_STRIDE -> MemReqSource.Prefetch2L2Stride.id.U,
      L1_HW_PREFETCH_STREAM -> MemReqSource.Prefetch2L2Stream.id.U
    ))
  }

  when(l2_pf_req_arb.io.out.valid) {
    array(l2_pf_req_arb.io.chosen).sent_vec := array(l2_pf_req_arb.io.chosen).sent_vec | get_candidate_oh(l2_pf_req_arb.io.out.bits.addr)
  }

  // last level cache pf
  // s0: generate prefetch req paddr per entry, arb them, send them out
  io.l3_pf_addr.valid := l3_pf_req_arb.io.out.valid
  io.l3_pf_addr.bits := l3_pf_req_arb.io.out.bits

  l3_pf_req_arb.io.out.ready := true.B

  for(i <- 0 until MLP_SIZE) {
    val evict = s1_alloc && (s1_index === i.U)
    l3_pf_req_arb.io.in(i).valid := array(i).can_send_pf() && (array(i).sink === SINK_L3) && !evict
    l3_pf_req_arb.io.in(i).bits := array(i).get_pf_addr()
  }

  when(l3_pf_req_arb.io.out.valid) {
    array(l3_pf_req_arb.io.chosen).sent_vec := array(l3_pf_req_arb.io.chosen).sent_vec | get_candidate_oh(l3_pf_req_arb.io.out.bits)
  }

  // reset meta to avoid multi-hit problem
  for(i <- 0 until MLP_SIZE) {
    when(reset.asBool || RegNext(io.flush)) {
      array(i).reset(i)
    }
  }

  XSPerfAccumulate("l2_prefetche_queue_busby", io.l2PfqBusy)
  XSPerfHistogram("filter_active", PopCount(VecInit(array.map(_.can_send_pf())).asUInt), true.B, 0, MLP_SIZE, 1)
  XSPerfHistogram("l1_filter_active", PopCount(VecInit(array.map(x => x.can_send_pf() && (x.sink === SINK_L1))).asUInt), true.B, 0, MLP_SIZE, 1)
  XSPerfHistogram("l2_filter_active", PopCount(VecInit(array.map(x => x.can_send_pf() && (x.sink === SINK_L2))).asUInt), true.B, 0, MLP_SIZE, 1)
  XSPerfHistogram("l3_filter_active", PopCount(VecInit(array.map(x => x.can_send_pf() && (x.sink === SINK_L3))).asUInt), true.B, 0, MLP_SIZE, 1)
}

class L1Prefetcher(implicit p: Parameters) extends BasePrefecher with HasStreamPrefetchHelper with HasStridePrefetchHelper {
  val pf_ctrl = IO(Input(new PrefetchControlBundle))
  val stride_train = IO(Flipped(Vec(backendParams.LduCnt + backendParams.HyuCnt, ValidIO(new LdPrefetchTrainBundle()))))
  val l2PfqBusy = IO(Input(Bool()))

  val stride_train_filter = Module(new TrainFilter(STRIDE_FILTER_SIZE, "stride"))
  val stride_meta_array = Module(new StrideMetaArray)
  val stream_train_filter = Module(new TrainFilter(STREAM_FILTER_SIZE, "stream"))
  val stream_bit_vec_array = Module(new StreamBitVectorArray)
  val pf_queue_filter = Module(new MutiLevelPrefetchFilter)

  // for now, if prefetching is disabled via pf_ctrl, training and prefetch generation still continue, but no reqs are sent out
  val enable = io.enable
  val flush = pf_ctrl.flush

  stream_train_filter.io.ld_in.zipWithIndex.foreach {
    case (ld_in, i) => {
      ld_in.valid := io.ld_in(i).valid && enable
      ld_in.bits := io.ld_in(i).bits
    }
  }
  stream_train_filter.io.enable := enable
  stream_train_filter.io.flush := flush

  stride_train_filter.io.ld_in.zipWithIndex.foreach {
    case (ld_in, i) => {
      ld_in.valid := stride_train(i).valid && enable
      ld_in.bits := stride_train(i).bits
    }
  }
  stride_train_filter.io.enable := enable
  stride_train_filter.io.flush := flush

  stream_bit_vec_array.io.enable := enable
  stream_bit_vec_array.io.flush := flush
  stream_bit_vec_array.io.dynamic_depth := pf_ctrl.dynamic_depth
  stream_bit_vec_array.io.train_req <> stream_train_filter.io.train_req

  stride_meta_array.io.enable := enable
  stride_meta_array.io.flush := flush
  stride_meta_array.io.dynamic_depth := 0.U
  stride_meta_array.io.train_req <> stride_train_filter.io.train_req
  stride_meta_array.io.stream_lookup_req <> stream_bit_vec_array.io.stream_lookup_req
  stride_meta_array.io.stream_lookup_resp <> stream_bit_vec_array.io.stream_lookup_resp

  // stream has higher priority than stride
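  // if both produce a req in the same cycle, the stride req is dropped rather than queued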
  pf_queue_filter.io.prefetch_req.valid := stream_bit_vec_array.io.prefetch_req.valid || stride_meta_array.io.prefetch_req.valid
  pf_queue_filter.io.prefetch_req.bits := Mux(
    stream_bit_vec_array.io.prefetch_req.valid,
    stream_bit_vec_array.io.prefetch_req.bits,
    stride_meta_array.io.prefetch_req.bits
  )

  io.l1_req.valid := pf_queue_filter.io.l1_req.valid && enable && pf_ctrl.enable
  io.l1_req.bits := pf_queue_filter.io.l1_req.bits

  pf_queue_filter.io.l1_req.ready := Mux(pf_ctrl.enable, io.l1_req.ready, true.B)
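  // when l1 prefetch is disabled by pf_ctrl, the filter's reqs are still drained (ready is
  // forced high) but simply dropped, so the filter does not back up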
  pf_queue_filter.io.tlb_req <> io.tlb_req
  pf_queue_filter.io.enable := enable
  pf_queue_filter.io.flush := flush
  pf_queue_filter.io.confidence := pf_ctrl.confidence
  pf_queue_filter.io.l2PfqBusy := l2PfqBusy

  io.l2_req.valid := pf_queue_filter.io.l2_pf_addr.valid && pf_queue_filter.io.l2_pf_addr.bits.addr > 0x80000000L.U && enable && pf_ctrl.enable
  io.l2_req.bits := pf_queue_filter.io.l2_pf_addr.bits

  io.l3_req.valid := pf_queue_filter.io.l3_pf_addr.valid && pf_queue_filter.io.l3_pf_addr.bits > 0x80000000L.U && enable && pf_ctrl.enable
  io.l3_req.bits := pf_queue_filter.io.l3_pf_addr.bits
653}