package xiangshan.mem.prefetch

import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import utility._
import xiangshan.cache.HasDCacheParameters
import xiangshan.cache.mmu._
import xiangshan.mem.{L1PrefetchReq, LdPrefetchTrainBundle}
import xiangshan.mem.trace._
import scala.collection.SeqLike

/** Elaboration-time parameters shared by the stride-prefetch bundle and table. */
trait HasStridePrefetchHelper extends HasL1PrefetchHelper {
  val STRIDE_FILTER_SIZE = 6
  val STRIDE_ENTRY_NUM = 10                 // number of entries in StrideMetaArray
  val STRIDE_BITS = 10 + BLOCK_OFFSET       // width of the recorded stride
  val STRIDE_VADDR_BITS = 10 + BLOCK_OFFSET // low vaddr bits kept per entry
  val STRIDE_CONF_BITS = 2                  // width of the confidence counter

  // detail control
  val ALWAYS_UPDATE_PRE_VADDR = 1  // 1 for true, 0 for false
  val AGGRESIVE_POLICY = false     // if true, prefetch degree is greater than 1, 1 otherwise
  val STRIDE_LOOK_AHEAD_BLOCKS = 2 // aggressive degree
  val LOOK_UP_STREAM = false       // if true, avoid collision with stream

  // Must stay below the two vals it reads (trait vals initialise in declaration order).
  val STRIDE_WIDTH_BLOCKS = if (AGGRESIVE_POLICY) STRIDE_LOOK_AHEAD_BLOCKS else 1

  /** Largest value the confidence counter can hold. */
  def MAX_CONF = (1 << STRIDE_CONF_BITS) - 1
}

/**
 * One entry of the stride table: the low bits of the last trained address,
 * the recorded stride, a confidence counter and the hashed-PC tag.
 */
class StrideMetaBundle(implicit p: Parameters) extends XSBundle with HasStridePrefetchHelper {
  val pre_vaddr = UInt(STRIDE_VADDR_BITS.W)
  val stride = UInt(STRIDE_BITS.W)
  val confidence = UInt(STRIDE_CONF_BITS.W)
  val hash_pc = UInt(HASH_TAG_WIDTH.W)

  /**
   * Clear the entry and load its tag with `index`.
   * NOTE(review): presumably the per-entry index keeps tags distinct so that the
   * one-hot assertion in StrideMetaArray holds after reset/flush -- confirm.
   */
  def reset(index: Int) = {
    hash_pc := index.U
    confidence := 0.U
    stride := 0.U
    pre_vaddr := 0.U
  }

  /** Claim the entry for `alloc_hash_pc`, remembering the low bits of `vaddr`. */
  def alloc(vaddr: UInt, alloc_hash_pc: UInt) = {
    hash_pc := alloc_hash_pc
    pre_vaddr := vaddr(STRIDE_VADDR_BITS - 1, 0)
    stride := 0.U
    confidence := 0.U
  }

  /**
   * Train the entry with a new access address.
   *
   * @param vaddr                   address of the training access
   * @param always_update_pre_vaddr when set, `pre_vaddr` is refreshed even if the stride is not valid
   * @return (can_send_pf, new_stride): whether a prefetch may be issued, and the
   *         stride observed between `vaddr` and the stored `pre_vaddr`
   */
  def update(vaddr: UInt, always_update_pre_vaddr: Bool) = {
    val cur_vaddr = vaddr(STRIDE_VADDR_BITS - 1, 0)
    val delta = cur_vaddr - pre_vaddr
    val delta_blk = block_addr(delta)
    // NOTE: for now, negative strides are disabled (top bit of the difference must be 0);
    // block strides of 0 and 1 are rejected as well.
    val delta_usable = !(delta_blk === 0.U || delta_blk === 1.U) && !delta(STRIDE_VADDR_BITS - 1)
    val same_stride = delta === stride
    val conf_is_low = confidence < 2.U
    val conf_is_max = confidence === MAX_CONF.U
    val may_prefetch = delta_usable && same_stride && conf_is_max

    when(delta_usable && same_stride) {
      // saturating increment
      confidence := Mux(conf_is_max, confidence, confidence + 1.U)
    }.elsewhen(delta_usable) {
      // saturating decrement; only replace the stride once confidence is low
      confidence := Mux(confidence === 0.U, confidence, confidence - 1.U)
      when(conf_is_low) {
        stride := delta
      }
    }

    when(delta_usable || always_update_pre_vaddr) {
      pre_vaddr := cur_vaddr
    }

    (may_prefetch, delta)
  }

}

/**
 * PC-indexed stride table with a small pipeline:
 * s0 looks up the hashed PC, s1 allocates or trains the entry, s2 computes the
 * L1/L2 prefetch addresses, s3 presents the L1 request and s4 the L2 request.
 */
class StrideMetaArray(implicit p: Parameters) extends XSModule with HasStridePrefetchHelper {
  val io = IO(new XSBundle {
    val enable = Input(Bool())
    // TODO: flush all entry when process changing happens, or disable stream prefetch for a while
    val flush = Input(Bool())
    val dynamic_depth = Input(UInt(32.W)) // TODO: enable dynamic stride depth
    val train_req = Flipped(DecoupledIO(new PrefetchReqBundle))
    val prefetch_req = ValidIO(new StreamPrefetchReqBundle)
    // query Stream component to see if a stream pattern has already been detected
    val stream_lookup_req = ValidIO(new PrefetchReqBundle)
    val stream_lookup_resp = Input(Bool())
  })

  val array = Reg(Vec(STRIDE_ENTRY_NUM, new StrideMetaBundle))
  val replacement = ReplacementPolicy.fromString("plru", STRIDE_ENTRY_NUM)

  // ---------------------------------------------------------------------------
  // s0: hash the pc and compare it against every entry's tag
  // ---------------------------------------------------------------------------
  val s0_can_accept = Wire(Bool())
  val s0_valid = io.train_req.fire
  val s0_vaddr = io.train_req.bits.vaddr
  val s0_pc = io.train_req.bits.pc
  val s0_pc_hash = pc_hash_tag(s0_pc)
  // bit i is set when entry i matches (entry 0 ends up in the LSB)
  val s0_pc_match_vec = Cat(array.map(_.hash_pc === s0_pc_hash).reverse)
  val s0_hit = s0_pc_match_vec.orR
  // on a miss, the replacement policy picks the victim
  val s0_index = Mux(s0_hit, OHToUInt(s0_pc_match_vec), replacement.way)

  io.train_req.ready := s0_can_accept
  io.stream_lookup_req.bits := io.train_req.bits
  io.stream_lookup_req.valid := s0_valid

  when(s0_valid) {
    replacement.access(s0_index)
  }

  assert(PopCount(s0_pc_match_vec) <= 1.U)
  XSPerfAccumulate("s0_valid", s0_valid)
  XSPerfAccumulate("s0_hit", s0_valid && s0_hit)
  XSPerfAccumulate("s0_miss", s0_valid && !s0_hit)

  // ---------------------------------------------------------------------------
  // s1: allocate on miss, train on hit
  // ---------------------------------------------------------------------------
  val s1_valid = RegNext(s0_valid)
  val s1_index = RegEnable(s0_index, s0_valid)
  val s1_pc_hash = RegEnable(s0_pc_hash, s0_valid)
  val s1_vaddr = RegEnable(s0_vaddr, s0_valid)
  val s1_hit = RegEnable(s0_hit, s0_valid)
  val s1_alloc = s1_valid && !s1_hit
  val s1_update = s1_valid && s1_hit
  val s1_stride = array(s1_index).stride
  val s1_new_stride = WireInit(0.U(STRIDE_BITS.W))
  val s1_can_send_pf = WireInit(false.B)
  // refuse a new request whose hashed pc equals the one currently in s1
  s0_can_accept := !s1_valid || s1_pc_hash =/= s0_pc_hash

  val always_update = WireInit(Constantin.createRecord("always_update" + p(XSCoreParamsKey).HartId.toString, initValue = ALWAYS_UPDATE_PRE_VADDR.U)) === 1.U

  // s1_update and s1_alloc are mutually exclusive (hit vs. miss)
  when(s1_update) {
    val (pf_allowed, observed_stride) = array(s1_index).update(s1_vaddr, always_update)
    s1_can_send_pf := pf_allowed
    s1_new_stride := observed_stride
  }.elsewhen(s1_alloc) {
    array(s1_index).alloc(
      vaddr = s1_vaddr,
      alloc_hash_pc = s1_pc_hash
    )
  }

  val l1_stride_ratio_const = WireInit(Constantin.createRecord("l1_stride_ratio" + p(XSCoreParamsKey).HartId.toString, initValue = 2.U))
  val l1_stride_ratio = l1_stride_ratio_const(3, 0)
  val l2_stride_ratio_const = WireInit(Constantin.createRecord("l2_stride_ratio" + p(XSCoreParamsKey).HartId.toString, initValue = 5.U))
  val l2_stride_ratio = l2_stride_ratio_const(3, 0)

  // ---------------------------------------------------------------------------
  // s2: calculate L1 & L2 pf addr (vaddr + stride << ratio)
  // ---------------------------------------------------------------------------
  val s1_send_pf = s1_valid && s1_can_send_pf
  val s2_valid = RegNext(s1_send_pf)
  val s2_vaddr = RegEnable(s1_vaddr, s1_send_pf)
  val s2_stride = RegEnable(s1_stride, s1_send_pf)
  val s2_l1_depth = s2_stride << l1_stride_ratio
  val s2_l2_depth = s2_stride << l2_stride_ratio
  val s2_l1_pf_vaddr = (s2_vaddr + s2_l1_depth)(VAddrBits - 1, 0)
  val s2_l2_pf_vaddr = (s2_vaddr + s2_l2_depth)(VAddrBits - 1, 0)
  val s2_l1_pf_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    vaddr = s2_l1_pf_vaddr,
    width = STRIDE_WIDTH_BLOCKS,
    decr_mode = false.B,
    sink = SINK_L1,
    source = L1_HW_PREFETCH_STRIDE)
  val s2_l2_pf_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    vaddr = s2_l2_pf_vaddr,
    width = STRIDE_WIDTH_BLOCKS,
    decr_mode = false.B,
    sink = SINK_L2,
    source = L1_HW_PREFETCH_STRIDE)

  // ---------------------------------------------------------------------------
  // s3: send l1 pf out (optionally suppressed by the stream lookup response)
  // ---------------------------------------------------------------------------
  val s2_valid_delayed = RegNext(s2_valid)
  val s3_valid =
    if (LOOK_UP_STREAM) s2_valid_delayed && !io.stream_lookup_resp
    else s2_valid_delayed
  val s3_l1_pf_req_bits = RegEnable(s2_l1_pf_req_bits, s2_valid)
  val s3_l2_pf_req_bits = RegEnable(s2_l2_pf_req_bits, s2_valid)

  // ---------------------------------------------------------------------------
  // s4: send l2 pf out
  // ---------------------------------------------------------------------------
  val s4_valid = RegNext(s3_valid)
  val s4_l2_pf_req_bits = RegEnable(s3_l2_pf_req_bits, s3_valid)

  // When both stages are valid the s4 (L2) request wins and the s3 (L1) request
  // is not sent; that case is counted by "l1_pf_block" below.
  io.prefetch_req.bits := Mux(s4_valid, s4_l2_pf_req_bits, s3_l1_pf_req_bits)
  io.prefetch_req.valid := s3_valid || s4_valid

  XSPerfAccumulate("pf_valid", io.prefetch_req.valid)
  XSPerfAccumulate("l1_pf_valid", s3_valid && !s4_valid)
  XSPerfAccumulate("l1_pf_block", s3_valid && s4_valid)
  XSPerfAccumulate("l2_pf_valid", s4_valid)
  XSPerfAccumulate("detect_stream", io.stream_lookup_resp)
  XSPerfHistogram("high_conf_num", PopCount(array.map(_.confidence === MAX_CONF.U)), true.B, 0, STRIDE_ENTRY_NUM, 1)

  // a fully-confident entry that is being trained with a stride different from its own
  val s1_disturb = s1_update &&
    array(s1_index).confidence === MAX_CONF.U &&
    array(s1_index).stride =/= s1_new_stride
  (0 until STRIDE_ENTRY_NUM).foreach { i =>
    val entry_selected = i.U === s1_index
    XSPerfAccumulate(s"entry_${i}_update", entry_selected && s1_update)
    (0 until 4).foreach { j =>
      XSPerfAccumulate(s"entry_${i}_disturb_${j}", entry_selected && s1_disturb && j.U === s1_new_stride)
    }
  }

  // Reset / flush. Kept as the last connection so that it overrides any
  // alloc/update write issued above in the same cycle.
  val clear_all = reset.asBool || RegNext(io.flush)
  array.zipWithIndex.foreach { case (entry, i) =>
    when(clear_all) {
      entry.reset(i)
    }
  }
}