package xiangshan.mem.prefetch

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import utility._
import xiangshan.cache.HasDCacheParameters
import xiangshan.cache.mmu._
import xiangshan.mem.{L1PrefetchReq, LdPrefetchTrainBundle}
import xiangshan.mem.trace._
import xiangshan.mem.L1PrefetchSource

trait HasL1PrefetchHelper extends HasCircularQueuePtrHelper with HasDCacheParameters {
  // region related
  val REGION_SIZE = 1024
  val PAGE_OFFSET = 12
  val BLOCK_OFFSET = log2Up(dcacheParameters.blockBytes)
  val BIT_VEC_WITDH = REGION_SIZE / dcacheParameters.blockBytes
  val REGION_BITS = log2Up(BIT_VEC_WITDH)
  val REGION_TAG_OFFSET = BLOCK_OFFSET + REGION_BITS
  val REGION_TAG_BITS = VAddrBits - BLOCK_OFFSET - REGION_BITS

  // hash related
  val VADDR_HASH_WIDTH = 5
  val BLK_ADDR_RAW_WIDTH = 10
  val HASH_TAG_WIDTH = VADDR_HASH_WIDTH + BLK_ADDR_RAW_WIDTH

  // capacity related
  val MLP_SIZE = 16

  // prefetch sink related
  val SINK_BITS = 2
  def SINK_L1 = "b00".U
  def SINK_L2 = "b01".U
  def SINK_L3 = "b10".U

  // vaddr: | region tag | region bits | block offset |
  def get_region_tag(vaddr: UInt) = {
    require(vaddr.getWidth == VAddrBits)
    vaddr(vaddr.getWidth - 1, REGION_TAG_OFFSET)
  }

  def get_region_bits(vaddr: UInt) = {
    require(vaddr.getWidth == VAddrBits)
    vaddr(REGION_TAG_OFFSET - 1, BLOCK_OFFSET)
  }

  def block_addr(x: UInt): UInt = {
    x(x.getWidth - 1, BLOCK_OFFSET)
  }

  def vaddr_hash(x: UInt): UInt = {
    val width = VADDR_HASH_WIDTH
    val low = x(width - 1, 0)
    val mid = x(2 * width - 1, width)
    val high = x(3 * width - 1, 2 * width)
    low ^ mid ^ high
  }

  def pc_hash_tag(x: UInt): UInt = {
    val low = x(BLK_ADDR_RAW_WIDTH - 1, 0)
    val high = x(BLK_ADDR_RAW_WIDTH - 1 + 3 * VADDR_HASH_WIDTH, BLK_ADDR_RAW_WIDTH)
    val high_hash = vaddr_hash(high)
    Cat(high_hash, low)
  }

  def block_hash_tag(x: UInt): UInt = {
    val blk_addr = block_addr(x)
    val low = blk_addr(BLK_ADDR_RAW_WIDTH - 1, 0)
    val high = blk_addr(BLK_ADDR_RAW_WIDTH - 1 + 3 * VADDR_HASH_WIDTH, BLK_ADDR_RAW_WIDTH)
    val high_hash = vaddr_hash(high)
    Cat(high_hash, low)
  }

  def region_hash_tag(region_tag: UInt): UInt = {
    val low = region_tag(BLK_ADDR_RAW_WIDTH - 1, 0)
    val high = region_tag(BLK_ADDR_RAW_WIDTH - 1 + 3 * VADDR_HASH_WIDTH, BLK_ADDR_RAW_WIDTH)
    val high_hash = vaddr_hash(high)
    Cat(high_hash, low)
  }

  def region_to_block_addr(region_tag: UInt, region_bits: UInt): UInt = {
    Cat(region_tag, region_bits)
  }

  def get_candidate_oh(x: UInt): UInt = {
    require(x.getWidth == PAddrBits)
    UIntToOH(x(REGION_BITS + BLOCK_OFFSET - 1, BLOCK_OFFSET))
  }

  def toBinary(n: Int): String = n match {
    case 0 | 1 => s"$n"
    case _ => s"${toBinary(n / 2)}${n % 2}"
  }
}
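// Illustrative sketch (not part of the design): the address split used by the helpers above,
// assuming 64B cache blocks together with the 1024B region defined here (blockBytes actually
// comes from dcacheParameters at elaboration time). Under that assumption BLOCK_OFFSET = 6,
// REGION_BITS = 4, BIT_VEC_WITDH = 16 and REGION_TAG_OFFSET = 10. All names below are hypothetical.
object L1PrefetchAddrSketch {
  private val BlockOffsetAssumed = 6 // log2(64B block), an assumption
  private val RegionBitsAssumed  = 4 // log2(1024 / 64)

  // vaddr: | region tag | region bits | block offset |
  def regionTag(vaddr: Long): Long   = vaddr >>> (BlockOffsetAssumed + RegionBitsAssumed)
  def regionBits(vaddr: Long): Long  = (vaddr >>> BlockOffsetAssumed) & ((1L << RegionBitsAssumed) - 1)
  def blockOffset(vaddr: Long): Long = vaddr & ((1L << BlockOffsetAssumed) - 1)
}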
trait HasTrainFilterHelper extends HasCircularQueuePtrHelper {
  def reorder[T <: LdPrefetchTrainBundle](source: Vec[ValidIO[T]]): Vec[ValidIO[T]] = {
    if(source.length == 1) {
      source
    } else if(source.length == 2) {
      val source_v = source.map(_.valid)
      val res = Wire(source.cloneType)
      // source 1 is older than source 0 (only when source 0 and source 1 are both valid)
      val source_1_older = Mux(Cat(source_v).andR,
        isBefore(source(1).bits.uop.robIdx, source(0).bits.uop.robIdx),
        false.B
      )
      when(source_1_older) {
        res(0) := source(1)
        res(1) := source(0)
      }.otherwise {
        res := source
      }

      res
    } else if(source.length == 3) {
      // TODO: generalize
      val res_0_1 = Wire(source.cloneType)
      val res_1_2 = Wire(source.cloneType)
      val res = Wire(source.cloneType)

      val tmp = reorder(VecInit(source.slice(0, 2)))
      res_0_1(0) := tmp(0)
      res_0_1(1) := tmp(1)
      res_0_1(2) := source(2)
      val tmp_1 = reorder(VecInit(res_0_1.slice(1, 3)))
      res_1_2(0) := res_0_1(0)
      res_1_2(1) := tmp_1(0)
      res_1_2(2) := tmp_1(1)
      val tmp_2 = reorder(VecInit(res_1_2.slice(0, 2)))
      res(0) := tmp_2(0)
      res(1) := tmp_2(1)
      res(2) := res_1_2(2)

      res
    } else {
      require(false, "for now, 4 or more sources are not supported")
      source
    }
  }
}
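// Illustrative sketch (not part of the design): the 3-source case of `reorder` above is a
// 3-element sorting network built from the 2-source case: compare-exchange (0,1), then (1,2),
// then (0,1) again, so the oldest request (by ROB index) ends up at index 0. The software model
// below uses plain Int ages and ignores both the valid bits and the robIdx wrap-around that the
// hardware handles via `isBefore`. All names here are hypothetical.
object ReorderSketch {
  def sort3[T](src: Seq[(Int, T)]): Seq[(Int, T)] = {
    require(src.length == 3)
    // keep the older (smaller age) element first
    def cmpEx(a: (Int, T), b: (Int, T)): ((Int, T), (Int, T)) = if (b._1 < a._1) (b, a) else (a, b)
    val (a0, a1) = cmpEx(src(0), src(1))
    val (b1, b2) = cmpEx(a1, src(2))
    val (c0, c1) = cmpEx(a0, b1)
    Seq(c0, c1, b2)
  }
}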
// get prefetch train reqs from `backendParams.LduCnt` load pipelines (up to `backendParams.LduCnt`/cycle)
// filter by cache line address, send out train reqs to the stride/stream prefetcher (up to 1 req/cycle)
class TrainFilter(size: Int, name: String)(implicit p: Parameters) extends XSModule with HasL1PrefetchHelper with HasTrainFilterHelper {
  val io = IO(new Bundle() {
    val enable = Input(Bool())
    val flush = Input(Bool())
    // train input, only from load for now
    val ld_in = Flipped(Vec(backendParams.LduCnt, ValidIO(new LdPrefetchTrainBundle())))
    // filter out
    val train_req = DecoupledIO(new PrefetchReqBundle())
  })

  class Ptr(implicit p: Parameters) extends CircularQueuePtr[Ptr]( p => size ){}
  object Ptr {
    def apply(f: Bool, v: UInt)(implicit p: Parameters): Ptr = {
      val ptr = Wire(new Ptr)
      ptr.flag := f
      ptr.value := v
      ptr
    }
  }

  val entries = RegInit(VecInit(Seq.fill(size){ (0.U.asTypeOf(new PrefetchReqBundle())) }))
  val valids = RegInit(VecInit(Seq.fill(size){ (false.B) }))

  // enq
  val enqLen = backendParams.LduCnt
  val enqPtrExt = RegInit(VecInit((0 until enqLen).map(_.U.asTypeOf(new Ptr))))
  val deqPtrExt = RegInit(0.U.asTypeOf(new Ptr))

  val deqPtr = WireInit(deqPtrExt.value)

  require(size >= enqLen)

  val ld_in_reordered = reorder(io.ld_in)
  val reqs_l = ld_in_reordered.map(_.bits.asPrefetchReqBundle())
  val reqs_vl = ld_in_reordered.map(_.valid)
  val needAlloc = Wire(Vec(enqLen, Bool()))
  val canAlloc = Wire(Vec(enqLen, Bool()))

  for(i <- (0 until enqLen)) {
    val req = reqs_l(i)
    val req_v = reqs_vl(i)
    val index = PopCount(needAlloc.take(i))
    val allocPtr = enqPtrExt(index)
    val entry_match = Cat(entries.zip(valids).map {
      case(e, v) => v && block_hash_tag(e.vaddr) === block_hash_tag(req.vaddr)
    }).orR
    val prev_enq_match = if(i == 0) false.B else Cat(reqs_l.zip(reqs_vl).take(i).map {
      case(pre, pre_v) => pre_v && block_hash_tag(pre.vaddr) === block_hash_tag(req.vaddr)
    }).orR

    needAlloc(i) := req_v && !entry_match && !prev_enq_match
    canAlloc(i) := needAlloc(i) && allocPtr >= deqPtrExt && io.enable

    when(canAlloc(i)) {
      valids(allocPtr.value) := true.B
      entries(allocPtr.value) := req
    }
  }
  val allocNum = PopCount(canAlloc)

  enqPtrExt.foreach{case x => x := x + allocNum}

  // deq
  io.train_req.valid := false.B
  io.train_req.bits := DontCare
  valids.zip(entries).zipWithIndex.foreach {
    case((valid, entry), i) => {
      when(deqPtr === i.U) {
        io.train_req.valid := valid && io.enable
        io.train_req.bits := entry
      }
    }
  }

  when(io.train_req.fire) {
    valids(deqPtr) := false.B
    deqPtrExt := deqPtrExt + 1.U
  }

  when(RegNext(io.flush)) {
    valids.foreach {case valid => valid := false.B}
    (0 until enqLen).map {case i => enqPtrExt(i) := i.U.asTypeOf(new Ptr)}
    deqPtrExt := 0.U.asTypeOf(new Ptr)
  }

  XSPerfAccumulate(s"${name}_train_filter_full", PopCount(valids) === size.U)
  XSPerfAccumulate(s"${name}_train_filter_half", PopCount(valids) >= (size / 2).U)
  XSPerfAccumulate(s"${name}_train_filter_empty", PopCount(valids) === 0.U)

  val raw_enq_pattern = Cat(reqs_vl)
  val filtered_enq_pattern = Cat(needAlloc)
  val actual_enq_pattern = Cat(canAlloc)
  XSPerfAccumulate(s"${name}_train_filter_enq", allocNum > 0.U)
  XSPerfAccumulate(s"${name}_train_filter_deq", io.train_req.fire)
  for(i <- 0 until (1 << enqLen)) {
    XSPerfAccumulate(s"${name}_train_filter_raw_enq_pattern_${toBinary(i)}", raw_enq_pattern === i.U)
    XSPerfAccumulate(s"${name}_train_filter_filtered_enq_pattern_${toBinary(i)}", filtered_enq_pattern === i.U)
    XSPerfAccumulate(s"${name}_train_filter_actual_enq_pattern_${toBinary(i)}", actual_enq_pattern === i.U)
  }
}
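// Illustrative sketch (not part of the design): a software model of the TrainFilter dedup above.
// A same-cycle train request is dropped if any already-queued entry, or an older request from the
// same cycle, covers the same cache block. The hardware compares block_hash_tag(vaddr); this model
// compares raw block addresses and assumes 64B blocks. Capacity and enable checks are omitted, and
// all names here are hypothetical.
object TrainFilterDedupSketch {
  def accepted(queuedVaddrs: Seq[Long], sameCycleVaddrs: Seq[Long], blockBytes: Int = 64): Seq[Long] = {
    val seen = scala.collection.mutable.Set.empty[Long]
    seen ++= queuedVaddrs.map(_ / blockBytes)
    // Set.add returns false when the block is already present, i.e. the request is filtered out
    sameCycleVaddrs.filter(vaddr => seen.add(vaddr / blockBytes))
  }
}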
class MLPReqFilterBundle(implicit p: Parameters) extends XSBundle with HasL1PrefetchHelper {
  val tag = UInt(HASH_TAG_WIDTH.W)
  val region = UInt(REGION_TAG_BITS.W)
  val bit_vec = UInt(BIT_VEC_WITDH.W)
  // NOTE: the l1 sink does not consult sent_vec, so that more prefetch reqs can be issued to the l1 dcache
  val sent_vec = UInt(BIT_VEC_WITDH.W)
  val sink = UInt(SINK_BITS.W)
  val alias = UInt(2.W)
  val is_vaddr = Bool()
  val source = new L1PrefetchSource()

  def reset(index: Int) = {
    tag := region_hash_tag(index.U)
    region := index.U
    bit_vec := 0.U
    sent_vec := 0.U
    sink := SINK_L1
    alias := 0.U
    is_vaddr := false.B
    source.value := L1_HW_PREFETCH_NULL
  }

  def tag_match(new_tag: UInt): Bool = {
    require(new_tag.getWidth == HASH_TAG_WIDTH)
    tag === new_tag
  }

  def update(update_bit_vec: UInt, update_sink: UInt) = {
    bit_vec := bit_vec | update_bit_vec
    when(update_sink < sink) {
      bit_vec := (bit_vec & ~sent_vec) | update_bit_vec
      sink := update_sink
    }

    assert(PopCount(update_bit_vec) >= 1.U, "update vector should contain at least one valid bit")
  }

  def can_send_pf(): Bool = {
    Mux(
      sink === SINK_L1,
      !is_vaddr && bit_vec.orR,
      !is_vaddr && (bit_vec & ~sent_vec).orR
    )
  }

  def get_pf_addr(): UInt = {
    require(PAddrBits <= VAddrBits)
    require((region.getWidth + REGION_BITS + BLOCK_OFFSET) == VAddrBits)

    val candidate = Mux(
      sink === SINK_L1,
      PriorityEncoder(bit_vec).asTypeOf(UInt(REGION_BITS.W)),
      PriorityEncoder(bit_vec & ~sent_vec).asTypeOf(UInt(REGION_BITS.W))
    )
    Cat(region, candidate, 0.U(BLOCK_OFFSET.W))
  }

  def get_tlb_va(): UInt = {
    require((region.getWidth + REGION_TAG_OFFSET) == VAddrBits)
    Cat(region, 0.U(REGION_TAG_OFFSET.W))
  }

  def fromStreamPrefetchReqBundle(x: StreamPrefetchReqBundle): MLPReqFilterBundle = {
    require(PAGE_OFFSET >= REGION_TAG_OFFSET, "region is larger than 4KB, alias bit may be incorrect")

    val res = Wire(new MLPReqFilterBundle)
    res.tag := region_hash_tag(x.region)
    res.region := x.region
    res.bit_vec := x.bit_vec
    res.sent_vec := 0.U
    res.sink := x.sink
    res.is_vaddr := true.B
    res.source := x.source
    res.alias := x.region(PAGE_OFFSET - REGION_TAG_OFFSET + 1, PAGE_OFFSET - REGION_TAG_OFFSET)

    res
  }

  def invalidate() = {
    // disable sending pf req
    when(sink === SINK_L1) {
      bit_vec := 0.U(BIT_VEC_WITDH.W)
    }.otherwise {
      sent_vec := ~(0.U(BIT_VEC_WITDH.W))
    }
    // disable sending tlb req
    is_vaddr := false.B
  }
}
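// Illustrative sketch (not part of the design): a software model of the bit_vec / sent_vec
// bookkeeping above, packing the blocks of one region into an Int (16 bits when cache blocks are
// 64B, an assumption). For the L1 sink a block stays a candidate while its bit_vec bit is set
// (the l1 pipeline clears bit_vec itself once a req is accepted); for L2/L3 a block is a candidate
// only while it is set in bit_vec and clear in sent_vec. The hardware additionally requires a
// translated paddr (!is_vaddr), which this model omits. All names here are hypothetical.
object MLPEntrySketch {
  final case class Entry(bitVec: Int, sentVec: Int, sinkIsL1: Boolean) {
    private def pending: Int = if (sinkIsL1) bitVec else bitVec & ~sentVec
    def canSendPf: Boolean = pending != 0
    def nextCandidate: Int = Integer.numberOfTrailingZeros(pending) // block index inside the region
    def markSent: Entry = {
      require(canSendPf)
      copy(sentVec = sentVec | (1 << nextCandidate))
    }
  }
}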
// there are 5 independent pipelines inside
// 1. prefetch enqueue
// 2. tlb request
// 3. actual l1 prefetch
// 4. actual l2 prefetch
// 5. actual l3 prefetch
class MutiLevelPrefetchFilter(implicit p: Parameters) extends XSModule with HasL1PrefetchHelper {
  val io = IO(new XSBundle {
    val enable = Input(Bool())
    val flush = Input(Bool())
    val prefetch_req = Flipped(ValidIO(new StreamPrefetchReqBundle))
    val tlb_req = new TlbRequestIO(nRespDups = 2)
    val l1_req = DecoupledIO(new L1PrefetchReq())
    val l2_pf_addr = ValidIO(new L2PrefetchReq())
    val l3_pf_addr = ValidIO(UInt(PAddrBits.W)) // TODO: l3 pf source
    val confidence = Input(UInt(1.W))
    val l2PfqBusy = Input(Bool())
  })

  val array = Reg(Vec(MLP_SIZE, new MLPReqFilterBundle))
  val replacement = ReplacementPolicy.fromString("plru", MLP_SIZE)
  val tlb_req_arb = Module(new RRArbiterInit(new TlbReq, MLP_SIZE))
  val l1_pf_req_arb = Module(new RRArbiterInit(new L1PrefetchReq, MLP_SIZE))
  val l2_pf_req_arb = Module(new RRArbiterInit(new L2PrefetchReq, MLP_SIZE))
  val l3_pf_req_arb = Module(new RRArbiterInit(UInt(PAddrBits.W), MLP_SIZE))

  // enq
  // s0: hash tag match
  val s0_can_accept = Wire(Bool())
  val s0_valid = io.prefetch_req.valid && s0_can_accept
  val s0_region = io.prefetch_req.bits.region
  val s0_region_hash = region_hash_tag(s0_region)
  val s0_match_vec = array.map(_.tag_match(s0_region_hash))
  val s0_hit = VecInit(s0_match_vec).asUInt.orR
  val s0_index = Mux(s0_hit, OHToUInt(VecInit(s0_match_vec).asUInt), replacement.way)
  val s0_prefetch_req = (new MLPReqFilterBundle).fromStreamPrefetchReqBundle(io.prefetch_req.bits)

  when(s0_valid) {
    replacement.access(s0_index)
  }

  assert(!s0_valid || PopCount(VecInit(s0_match_vec)) <= 1.U, "req region should match no more than 1 entry")
  assert(!(s0_valid && RegNext(s0_valid) && !s0_hit && !RegNext(s0_hit) && replacement.way === RegNext(replacement.way)), "replacement error")

  XSPerfAccumulate("s0_enq_fire", s0_valid)
  XSPerfAccumulate("s0_enq_valid", io.prefetch_req.valid)
  XSPerfAccumulate("s0_cannot_enq", io.prefetch_req.valid && !s0_can_accept)

  // s1: alloc or update
  val s1_valid = RegNext(s0_valid)
  val s1_region = RegEnable(s0_region, s0_valid)
  val s1_region_hash = RegEnable(s0_region_hash, s0_valid)
  val s1_hit = RegEnable(s0_hit, s0_valid)
  val s1_index = RegEnable(s0_index, s0_valid)
  val s1_prefetch_req = RegEnable(s0_prefetch_req, s0_valid)
  val s1_alloc = s1_valid && !s1_hit
  val s1_update = s1_valid && s1_hit
  s0_can_accept := !(s1_valid && s1_alloc && (s0_region_hash === s1_region_hash))

  when(s1_alloc) {
    array(s1_index) := s1_prefetch_req
  }.elsewhen(s1_update) {
    array(s1_index).update(
      update_bit_vec = s1_prefetch_req.bit_vec,
      update_sink = s1_prefetch_req.sink
    )
  }

  // TODO: set this constraint looser to enable more kinds of depth
  // assert(!(s0_valid && s1_valid && s0_region === s1_region), "s0 and s1 must have different region")

  XSPerfAccumulate("s1_enq_valid", s1_valid)
  XSPerfAccumulate("s1_enq_alloc", s1_alloc)
  XSPerfAccumulate("s1_enq_update", s1_update)
  XSPerfAccumulate("hash_conflict", s0_valid && RegNext(s1_valid) && (s0_region =/= RegNext(s1_region)) && (s0_region_hash === RegNext(s1_region_hash)))
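
  // NOTE (descriptive only, no new logic): the enq pipeline above is the only place that can
  // overwrite a whole entry. The tlb / l1 / l2 / l3 pipelines below therefore all qualify their
  // per-entry requests with `!(s1_alloc && (s1_index === i.U))`, dropping in-flight work for an
  // entry that is being re-allocated in the same cycle, so a stale translation or prefetch never
  // lands in a freshly allocated entry. Likewise, s0_can_accept rejects a request whose region
  // hash matches an allocation still sitting in s1, so the same region is never allocated twice.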

  // tlb req
  // s0: arb all tlb reqs
  val s0_tlb_fire_vec = VecInit((0 until MLP_SIZE).map{case i => tlb_req_arb.io.in(i).fire})
  val s1_tlb_fire_vec = RegNext(s0_tlb_fire_vec)
  val s2_tlb_fire_vec = RegNext(s1_tlb_fire_vec)

  for(i <- 0 until MLP_SIZE) {
    val evict = s1_alloc && (s1_index === i.U)
    tlb_req_arb.io.in(i).valid := array(i).is_vaddr && !s1_tlb_fire_vec(i) && !s2_tlb_fire_vec(i) && !evict
    tlb_req_arb.io.in(i).bits.vaddr := array(i).get_tlb_va()
    tlb_req_arb.io.in(i).bits.cmd := TlbCmd.read
    tlb_req_arb.io.in(i).bits.size := 3.U
    tlb_req_arb.io.in(i).bits.kill := false.B
    tlb_req_arb.io.in(i).bits.no_translate := false.B
    tlb_req_arb.io.in(i).bits.memidx := DontCare
    tlb_req_arb.io.in(i).bits.debug := DontCare
    tlb_req_arb.io.in(i).bits.hlvx := DontCare
    tlb_req_arb.io.in(i).bits.hyperinst := DontCare
  }

  assert(PopCount(s0_tlb_fire_vec) <= 1.U, "s0_tlb_fire_vec should be one-hot or empty")

  // s1: send out the req
  val s1_tlb_req_valid = RegNext(tlb_req_arb.io.out.valid)
  val s1_tlb_req_bits = RegEnable(tlb_req_arb.io.out.bits, tlb_req_arb.io.out.valid)
  val s1_tlb_req_index = RegEnable(OHToUInt(s0_tlb_fire_vec.asUInt), tlb_req_arb.io.out.valid)
  val s1_tlb_evict = s1_alloc && (s1_index === s1_tlb_req_index)
  io.tlb_req.req.valid := s1_tlb_req_valid && !s1_tlb_evict
  io.tlb_req.req.bits := s1_tlb_req_bits
  io.tlb_req.req_kill := false.B
  tlb_req_arb.io.out.ready := true.B

  XSPerfAccumulate("s1_tlb_req_sent", io.tlb_req.req.valid)
  XSPerfAccumulate("s1_tlb_req_evict", s1_tlb_req_valid && s1_tlb_evict)

  // s2: get response from tlb
  val s2_tlb_resp = io.tlb_req.resp
  val s2_tlb_update_index = RegEnable(s1_tlb_req_index, s1_tlb_req_valid)
  val s2_tlb_evict = s1_alloc && (s1_index === s2_tlb_update_index)
  when(s2_tlb_resp.valid && !s2_tlb_evict) {
    array(s2_tlb_update_index).is_vaddr := s2_tlb_resp.bits.miss

    when(!s2_tlb_resp.bits.miss) {
      array(s2_tlb_update_index).region := Cat(0.U((VAddrBits - PAddrBits).W), s2_tlb_resp.bits.paddr.head(s2_tlb_resp.bits.paddr.head.getWidth - 1, REGION_TAG_OFFSET))
      when(s2_tlb_resp.bits.excp.head.pf.ld || s2_tlb_resp.bits.excp.head.af.ld) {
        array(s2_tlb_update_index).invalidate()
      }
    }
  }
  s2_tlb_resp.ready := true.B

  XSPerfAccumulate("s2_tlb_resp_valid", s2_tlb_resp.valid)
  XSPerfAccumulate("s2_tlb_resp_evict", s2_tlb_resp.valid && s2_tlb_evict)
  XSPerfAccumulate("s2_tlb_resp_miss", s2_tlb_resp.valid && !s2_tlb_evict && s2_tlb_resp.bits.miss)
  XSPerfAccumulate("s2_tlb_resp_updated", s2_tlb_resp.valid && !s2_tlb_evict && !s2_tlb_resp.bits.miss)
  XSPerfAccumulate("s2_tlb_resp_page_fault", s2_tlb_resp.valid && !s2_tlb_evict && !s2_tlb_resp.bits.miss && s2_tlb_resp.bits.excp.head.pf.ld)
  XSPerfAccumulate("s2_tlb_resp_access_fault", s2_tlb_resp.valid && !s2_tlb_evict && !s2_tlb_resp.bits.miss && s2_tlb_resp.bits.excp.head.af.ld)

  // l1 pf
  // s0: generate prefetch req paddr per entry, arb them
  val s0_pf_fire_vec = VecInit((0 until MLP_SIZE).map{case i => l1_pf_req_arb.io.in(i).fire})
  val s1_pf_fire_vec = RegNext(s0_pf_fire_vec)

  val s0_pf_fire = l1_pf_req_arb.io.out.fire
  val s0_pf_index = l1_pf_req_arb.io.chosen
  val s0_pf_candidate_oh = get_candidate_oh(l1_pf_req_arb.io.out.bits.paddr)

  for(i <- 0 until MLP_SIZE) {
    val evict = s1_alloc && (s1_index === i.U)
    l1_pf_req_arb.io.in(i).valid := array(i).can_send_pf() && (array(i).sink === SINK_L1) && !evict
    l1_pf_req_arb.io.in(i).bits.paddr := array(i).get_pf_addr()
    l1_pf_req_arb.io.in(i).bits.alias := array(i).alias
    l1_pf_req_arb.io.in(i).bits.confidence := io.confidence
    l1_pf_req_arb.io.in(i).bits.is_store := false.B
    l1_pf_req_arb.io.in(i).bits.pf_source := array(i).source
  }

  when(s0_pf_fire) {
    array(s0_pf_index).sent_vec := array(s0_pf_index).sent_vec | s0_pf_candidate_oh
  }

  assert(PopCount(s0_pf_fire_vec) <= 1.U, "s0_pf_fire_vec should be one-hot or empty")

  // s1: send out to dcache
  val s1_pf_valid = Reg(Bool())
  val s1_pf_bits = RegEnable(l1_pf_req_arb.io.out.bits, l1_pf_req_arb.io.out.fire)
  val s1_pf_index = RegEnable(s0_pf_index, l1_pf_req_arb.io.out.fire)
  val s1_pf_candidate_oh = RegEnable(s0_pf_candidate_oh, l1_pf_req_arb.io.out.fire)
  val s1_pf_evict = s1_alloc && (s1_index === s1_pf_index)
  val s1_pf_update = s1_update && (s1_index === s1_pf_index)
  val s1_pf_can_go = io.l1_req.ready && !s1_pf_evict && !s1_pf_update
  val s1_pf_fire = s1_pf_valid && s1_pf_can_go

  when(s1_pf_can_go) {
    s1_pf_valid := false.B
  }

  when(l1_pf_req_arb.io.out.fire) {
    s1_pf_valid := true.B
  }

  when(s1_pf_fire) {
    array(s1_pf_index).bit_vec := array(s1_pf_index).bit_vec & ~s1_pf_candidate_oh
  }

  io.l1_req.valid := s1_pf_valid && !s1_pf_evict && !s1_pf_update && (s1_pf_bits.paddr >= 0x80000000L.U) && io.enable
  io.l1_req.bits := s1_pf_bits

  l1_pf_req_arb.io.out.ready := s1_pf_can_go || !s1_pf_valid

  assert(!((s1_alloc || s1_update) && s1_pf_fire && (s1_index === s1_pf_index)), "pf pipeline & enq pipeline bit_vec hazard!")

  XSPerfAccumulate("s1_pf_valid", s1_pf_valid)
  XSPerfAccumulate("s1_pf_block_by_pipe_unready", s1_pf_valid && !io.l1_req.ready)
  XSPerfAccumulate("s1_pf_block_by_enq_alloc_harzard", s1_pf_valid && s1_pf_evict)
  XSPerfAccumulate("s1_pf_block_by_enq_update_harzard", s1_pf_valid && s1_pf_update)
  XSPerfAccumulate("s1_pf_fire", s1_pf_fire)
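
  // NOTE (descriptive only, no new logic): the l1 path above is a two-stage, backpressured
  // pipeline: a winning entry marks sent_vec at s0, and only when dcache accepts the req at s1
  // is the corresponding bit_vec bit cleared. The l2 / l3 paths below are fire-and-forget: the
  // arbiter output is always ready and the chosen entry simply marks sent_vec in the same cycle.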

  // l2 pf
  // s0: generate prefetch req paddr per entry, arb them, send out
  io.l2_pf_addr.valid := l2_pf_req_arb.io.out.valid
  io.l2_pf_addr.bits := l2_pf_req_arb.io.out.bits

  l2_pf_req_arb.io.out.ready := true.B

  for(i <- 0 until MLP_SIZE) {
    val evict = s1_alloc && (s1_index === i.U)
    l2_pf_req_arb.io.in(i).valid := array(i).can_send_pf() && (array(i).sink === SINK_L2) && !evict
    l2_pf_req_arb.io.in(i).bits.addr := array(i).get_pf_addr()
    l2_pf_req_arb.io.in(i).bits.source := MuxLookup(array(i).source.value, MemReqSource.Prefetch2L2Unknown.id.U)(Seq(
      L1_HW_PREFETCH_STRIDE -> MemReqSource.Prefetch2L2Stride.id.U,
      L1_HW_PREFETCH_STREAM -> MemReqSource.Prefetch2L2Stream.id.U
    ))
  }

  when(l2_pf_req_arb.io.out.valid) {
    array(l2_pf_req_arb.io.chosen).sent_vec := array(l2_pf_req_arb.io.chosen).sent_vec | get_candidate_oh(l2_pf_req_arb.io.out.bits.addr)
  }

  // last level cache pf
  // s0: generate prefetch req paddr per entry, arb them, send out
  io.l3_pf_addr.valid := l3_pf_req_arb.io.out.valid
  io.l3_pf_addr.bits := l3_pf_req_arb.io.out.bits

  l3_pf_req_arb.io.out.ready := true.B

  for(i <- 0 until MLP_SIZE) {
    val evict = s1_alloc && (s1_index === i.U)
    l3_pf_req_arb.io.in(i).valid := array(i).can_send_pf() && (array(i).sink === SINK_L3) && !evict
    l3_pf_req_arb.io.in(i).bits := array(i).get_pf_addr()
  }

  when(l3_pf_req_arb.io.out.valid) {
    array(l3_pf_req_arb.io.chosen).sent_vec := array(l3_pf_req_arb.io.chosen).sent_vec | get_candidate_oh(l3_pf_req_arb.io.out.bits)
  }

  // reset meta to avoid multi-hit problem
  for(i <- 0 until MLP_SIZE) {
    when(reset.asBool || RegNext(io.flush)) {
      array(i).reset(i)
    }
  }

  XSPerfAccumulate("l2_prefetche_queue_busby", io.l2PfqBusy)
  XSPerfHistogram("filter_active", PopCount(VecInit(array.map(_.can_send_pf())).asUInt), true.B, 0, MLP_SIZE, 1)
  XSPerfHistogram("l1_filter_active", PopCount(VecInit(array.map(x => x.can_send_pf() && (x.sink === SINK_L1))).asUInt), true.B, 0, MLP_SIZE, 1)
  XSPerfHistogram("l2_filter_active", PopCount(VecInit(array.map(x => x.can_send_pf() && (x.sink === SINK_L2))).asUInt), true.B, 0, MLP_SIZE, 1)
  XSPerfHistogram("l3_filter_active", PopCount(VecInit(array.map(x => x.can_send_pf() && (x.sink === SINK_L3))).asUInt), true.B, 0, MLP_SIZE, 1)
}
class L1Prefetcher(implicit p: Parameters) extends BasePrefecher with HasStreamPrefetchHelper with HasStridePrefetchHelper {
  val pf_ctrl = IO(Input(new PrefetchControlBundle))
  val stride_train = IO(Flipped(Vec(backendParams.LduCnt + backendParams.HyuCnt, ValidIO(new LdPrefetchTrainBundle()))))
  val l2PfqBusy = IO(Input(Bool()))

  val stride_train_filter = Module(new TrainFilter(STRIDE_FILTER_SIZE, "stride"))
  val stride_meta_array = Module(new StrideMetaArray)
  val stream_train_filter = Module(new TrainFilter(STREAM_FILTER_SIZE, "stream"))
  val stream_bit_vec_array = Module(new StreamBitVectorArray)
  val pf_queue_filter = Module(new MutiLevelPrefetchFilter)

  // for now, if the prefetcher is disabled, the train and prefetch process will continue, without sending out any reqs
  val enable = io.enable
  val flush = pf_ctrl.flush

  stream_train_filter.io.ld_in.zipWithIndex.foreach {
    case (ld_in, i) => {
      ld_in.valid := io.ld_in(i).valid && enable
      ld_in.bits := io.ld_in(i).bits
    }
  }
  stream_train_filter.io.enable := enable
  stream_train_filter.io.flush := flush

  stride_train_filter.io.ld_in.zipWithIndex.foreach {
    case (ld_in, i) => {
      ld_in.valid := stride_train(i).valid && enable
      ld_in.bits := stride_train(i).bits
    }
  }
  stride_train_filter.io.enable := enable
  stride_train_filter.io.flush := flush

  stream_bit_vec_array.io.enable := enable
  stream_bit_vec_array.io.flush := flush
  stream_bit_vec_array.io.dynamic_depth := pf_ctrl.dynamic_depth
  stream_bit_vec_array.io.train_req <> stream_train_filter.io.train_req

  stride_meta_array.io.enable := enable
  stride_meta_array.io.flush := flush
  stride_meta_array.io.dynamic_depth := 0.U
  stride_meta_array.io.train_req <> stride_train_filter.io.train_req
  stride_meta_array.io.stream_lookup_req <> stream_bit_vec_array.io.stream_lookup_req
  stride_meta_array.io.stream_lookup_resp <> stream_bit_vec_array.io.stream_lookup_resp

  // stream has higher priority than stride
  pf_queue_filter.io.prefetch_req.valid := stream_bit_vec_array.io.prefetch_req.valid || stride_meta_array.io.prefetch_req.valid
  pf_queue_filter.io.prefetch_req.bits := Mux(
    stream_bit_vec_array.io.prefetch_req.valid,
    stream_bit_vec_array.io.prefetch_req.bits,
    stride_meta_array.io.prefetch_req.bits
  )

  io.l1_req.valid := pf_queue_filter.io.l1_req.valid && enable && pf_ctrl.enable
  io.l1_req.bits := pf_queue_filter.io.l1_req.bits

  pf_queue_filter.io.l1_req.ready := Mux(pf_ctrl.enable, io.l1_req.ready, true.B)
  pf_queue_filter.io.tlb_req <> io.tlb_req
  pf_queue_filter.io.enable := enable
  pf_queue_filter.io.flush := flush
  pf_queue_filter.io.confidence := pf_ctrl.confidence
  pf_queue_filter.io.l2PfqBusy := l2PfqBusy

  io.l2_req.valid := pf_queue_filter.io.l2_pf_addr.valid && pf_queue_filter.io.l2_pf_addr.bits.addr > 0x80000000L.U && enable && pf_ctrl.enable
  io.l2_req.bits := pf_queue_filter.io.l2_pf_addr.bits

  io.l3_req.valid := pf_queue_filter.io.l3_pf_addr.valid && pf_queue_filter.io.l3_pf_addr.bits > 0x80000000L.U && enable && pf_ctrl.enable
  io.l3_req.bits := pf_queue_filter.io.l3_pf_addr.bits
}