/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import utility._
import xiangshan.cache._

/** Constants and small combinational helpers shared by the store-prefetch modules in this file. */
trait HasStorePrefetchHelper extends HasCircularQueuePtrHelper with HasDCacheParameters {
  // common
  val PAGEOFFSET = 12 // page offset 4096 Bytes
  val BLOCKOFFSET = log2Up(dcacheParameters.blockBytes) // cache block offset (64-byte blocks presumably; derived from dcacheParameters)

  // spb parameters
  val ENABLE_SPB = EnableStorePrefetchSPB
  val ONLY_ON_MEMSET = false
  val SATURATE_COUNTER_BITS = 7
  val BURST_ENGINE_SIZE = 2
  val SPB_N = 48

  // serializer parameters
  val SERIALIZER_SIZE = 12

  /** Drops the in-block offset bits of `x`, leaving the cache-block number. */
  def block_addr(x: UInt): UInt = {
    val offset = log2Up(dcacheParameters.blockBytes)
    x(x.getWidth - 1, offset)
  }

  // filter logic (granularity: a page)
  /** True when both addresses have identical bits above `PAGEOFFSET`. */
  def same_page_addr(addr0: UInt, addr1: UInt): Bool = {
    addr0(addr0.getWidth - 1, PAGEOFFSET) === addr1(addr1.getWidth - 1, PAGEOFFSET)
  }

  /** True when any valid entry of `data_vec` lies in the same page as `incoming_vaddr`. */
  def filter_by_page_addr(valid_vec: Vec[Bool], data_vec: Vec[UInt], incoming_vaddr: UInt) : Bool = {
    val match_vec = (valid_vec zip data_vec).map {
      case (v, e_vaddr) => v && same_page_addr(e_vaddr, incoming_vaddr)
    }
    VecInit(match_vec).asUInt.orR
  }

  /** Low `SATURATE_COUNTER_BITS` bits of the signed difference between the two block numbers. */
  def cache_block_addr_difference(req_addr: UInt, last_addr: UInt): UInt = {
    (block_addr(req_addr).asSInt - block_addr(last_addr).asSInt)(SATURATE_COUNTER_BITS - 1, 0)
  }

  /** `st_count` with its three low bits dropped, i.e. `st_count / 8`. */
  def get_store_count_divided_by_8(st_count: UInt): UInt = {
    st_count(st_count.getWidth - 1, 3)
  }

  /** True once the store counter has exceeded `N`. */
  def trigger_check(st_count: UInt, N: UInt): Bool = {
    st_count > N
  }

  /** Burst condition; all three clauses below must hold. */
  def can_burst(st_count: UInt, N: UInt, sa_count: SInt): Bool = {
    // 1. counter overflows (st_count > N)
    // 2. counter / 8 == saturate counter
    // 3. saturate counter is not negative (sign bit clear)
    trigger_check(st_count, N) &&
      get_store_count_divided_by_8(st_count) === sa_count.asUInt &&
      sa_count(sa_count.getWidth - 1) === false.B
  }
}

// L1 Store prefetch component

/** A prefetch request generator used by SPB to burst prefetch requests to the L1 DCache.
  *
  * Holds `BURST_ENGINE_SIZE` entries. An allocation is accepted when a free entry exists and no
  * valid entry already covers the same page. Each valid entry offers its current address on
  * port 0 and the following cache block on port 1; port 1 is only valid when port 0 fires and
  * the following block keeps vaddr bit `PAGEOFFSET` equal to the bit recorded at allocation.
  * An entry advances by one or two blocks depending on which ports fired, and invalidates itself
  * when the advanced address flips that recorded bit (page crossing). Ports 2 and up are never
  * driven valid.
  *
  * @param is_store not referenced inside this module
  */
class PrefetchBurstGenerator(is_store: Boolean)(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
  val io = IO(new DCacheBundle {
    val alloc = Input(Bool())
    val vaddr = Input(UInt(VAddrBits.W))
    val prefetch_req = Vec(StorePipelineWidth, DecoupledIO(new StorePrefetchReq))
  })

  val maxStorePipelineWidth = (if (Enable3Load3Store) 3 else 2)
  require(
    StorePipelineWidth >= maxStorePipelineWidth,
    s"StorePipelineWidth ($StorePipelineWidth) must be at least $maxStorePipelineWidth"
  )

  val SIZE = BURST_ENGINE_SIZE

  val valids = RegInit(VecInit(Seq.fill(SIZE)(false.B)))
  val datas = RegInit(VecInit(Seq.fill(SIZE)(0.U.asTypeOf(io.vaddr))))
  // vaddr bit PAGEOFFSET captured at allocation; used to detect page crossing
  val pagebits = RegInit(VecInit(Seq.fill(SIZE)(0.U(1.W))))

  // enq
  val enq_valids = ~(valids.asUInt)
  val full = !(enq_valids.orR)
  val enq_idx = PriorityEncoder(enq_valids)
  val enq_filter = filter_by_page_addr(valids, datas, io.vaddr)
  // accept only when there is room and the page is not already being burst
  val do_alloc = io.alloc && !full && !enq_filter

  when(do_alloc) {
    valids(enq_idx) := true.B
    datas(enq_idx) := io.vaddr
    pagebits(enq_idx) := io.vaddr(PAGEOFFSET)
  }

  XSPerfAccumulate("burst_generator_alloc_success", do_alloc)
  XSPerfAccumulate("burst_generator_alloc_fail", io.alloc && full && !enq_filter)
  XSPerfAccumulate("burst_generator_full", full)

  // next prefetch address (+1 cache block)
  val datas_next = Wire(Vec(SIZE, chiselTypeOf(datas(0))))
  datas_next := datas.map(_ + Cat(1.U(1.W), 0.U(BLOCKOFFSET.W)))
  // double next prefetch address (+2 cache blocks)
  val datas_next_next = Wire(Vec(SIZE, chiselTypeOf(datas(0))))
  datas_next_next := datas.map(_ + Cat(2.U(2.W), 0.U(BLOCKOFFSET.W)))

  // deq
  val deq_decoupled = Wire(Vec(SIZE, Vec(StorePipelineWidth, Decoupled(new StorePrefetchReq))))

  for (i <- 0 until SIZE) {
    val out_decouple = deq_decoupled(i)

    out_decouple(0).valid := valids(i)
    out_decouple(0).bits := DontCare
    out_decouple(0).bits.vaddr := datas(i)
    // the second request is only offered together with the first one and within the same page
    out_decouple(1).valid := valids(i) && datas_next(i)(PAGEOFFSET) === pagebits(i) && out_decouple(0).fire
    out_decouple(1).bits := DontCare
    out_decouple(1).bits.vaddr := datas_next(i)
    out_decouple.drop(2).foreach { out => out.valid := false.B; out.bits := DontCare }

    when(out_decouple(1).fire) {
      // fired 2 prefetch reqs
      datas(i) := datas_next_next(i)
      when(datas_next_next(i)(PAGEOFFSET) =/= pagebits(i)) {
        // cross page, invalidate this entry
        valids(i) := false.B
      }
    }.elsewhen(out_decouple(0).fire) {
      // fired 1 prefetch req
      datas(i) := datas_next(i)
      when(datas_next(i)(PAGEOFFSET) =/= pagebits(i)) {
        // cross page, invalidate this entry
        valids(i) := false.B
      }
    }
  }

  // one `arbiter` call (helper defined outside this file) per output port, fed by all entries
  for (i <- 0 until StorePipelineWidth) {
    arbiter(deq_decoupled.map(_(i)), io.prefetch_req(i), Some(s"spb_deq_arb${i}"))
  }

  XSPerfAccumulate("burst_valid_num", PopCount(valids))
  XSPerfAccumulate("prefetch_req_fire_by_generator", PopCount(VecInit(io.prefetch_req.map(_.fire))))
}

/** Store Prefetch Bursts (SPB) trainer.
  *
  * Counts incoming store requests and accumulates the block-address difference between
  * consecutive stores. Once the store count exceeds `SPB_N` both counters are cleared; if at that
  * point `can_burst` also holds, a burst is allocated in the [[PrefetchBurstGenerator]] starting
  * at `get_block_addr` of the triggering store's vaddr.
  */
class StorePrefetchBursts(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
  val io = IO(new DCacheBundle {
    val enable = Input(Bool())
    val memSetPattenDetected = Input(Bool())
    val sbuffer_enq = Flipped(Valid(new DCacheWordReqWithVaddr))
    val prefetch_req = Vec(StorePipelineWidth, DecoupledIO(new StorePrefetchReq))
  })
  require(EnsbufferWidth == 2, s"StorePrefetchBursts requires EnsbufferWidth == 2, got $EnsbufferWidth")

  // meta for SPB
  val N = SPB_N
  // holds the full vaddr of the last store; block_addr() is applied when the difference is taken
  val last_st_block_addr = RegInit(0.U(VAddrBits.W))
  val saturate_counter = RegInit(0.S(SATURATE_COUNTER_BITS.W))
  val store_count = RegInit(0.U((log2Up(N) + 1).W))
  val burst_engine = Module(new PrefetchBurstGenerator(is_store = true))

  val sbuffer_fire = io.sbuffer_enq.valid
  val sbuffer_vaddr = io.sbuffer_enq.bits.vaddr

  val next_store_count = store_count + Mux(sbuffer_fire, 1.U, 0.U)
  val next_saturate_count = (saturate_counter + Mux(sbuffer_fire, cache_block_addr_difference(sbuffer_vaddr, last_st_block_addr).asSInt, 0.S)).asSInt

  when(sbuffer_fire) {
    last_st_block_addr := sbuffer_vaddr
  }

  val check = trigger_check(next_store_count, N.U)
  val burst = can_burst(next_store_count, N.U, next_saturate_count)

  // both counters restart whenever the check window closes
  store_count := Mux(burst || check, 0.U, next_store_count)
  saturate_counter := Mux(burst || check, 0.S, next_saturate_count)

  if (ONLY_ON_MEMSET) {
    // very strict: only burst on memset
    burst_engine.io.alloc := burst && io.enable && io.memSetPattenDetected
  } else {
    burst_engine.io.alloc := burst && io.enable
  }
  burst_engine.io.vaddr := get_block_addr(io.sbuffer_enq.bits.vaddr)
  burst_engine.io.prefetch_req <> io.prefetch_req

  // perf
  XSPerfAccumulate("trigger_burst", burst && io.enable)
  XSPerfAccumulate("trigger_check", check && io.enable)
}

// L2 Store prefetch component

/** Serializer: FIFO queue that receives up to `EnsbufferWidth` requests per cycle (sent from sq
  * to sbuffer), stores them, and pops them in order, one per cycle.
  *
  * When the incoming requests do not all fit, none of them is written that cycle.
  */
class Serializer(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
  val io = IO(new DCacheBundle {
    val sbuffer_enq = Vec(EnsbufferWidth, Flipped(Valid(new DCacheWordReqWithVaddr)))
    val prefetch_train = DecoupledIO(new DCacheWordReqWithVaddr)
  })
  val QueueSize = SERIALIZER_SIZE

  class SerializerPtr(implicit p: Parameters) extends CircularQueuePtr[SerializerPtr](p => QueueSize){}

  object SerializerPtr {
    def apply(f: Bool, v: UInt)(implicit p: Parameters): SerializerPtr = {
      val ptr = Wire(new SerializerPtr)
      ptr.flag := f
      ptr.value := v
      ptr
    }
  }

  // enqPtrExt(i) resets to i, so port i writes at offset i from the base enqueue pointer
  val enqPtrExt = RegInit(VecInit((0 until EnsbufferWidth).map(_.U.asTypeOf(new SerializerPtr))))
  val deqPtrExt = RegInit(0.U.asTypeOf(new SerializerPtr))

  val deqPtr = deqPtrExt.value

  // NOTE(review): each entry is reset to its own index reinterpreted as the bundle, not to 0.
  // Presumably only low payload bits are affected and `valid` still resets to false; confirm
  // against the bundle layout before changing this.
  val reqs = RegInit(VecInit((0 until QueueSize).map(_.U.asTypeOf(Valid(new DCacheWordReqWithVaddr)))))

  // deq
  io.prefetch_train.valid := reqs(deqPtr).valid
  io.prefetch_train.bits := reqs(deqPtr).bits

  when(io.prefetch_train.fire) {
    deqPtrExt := deqPtrExt + 1.U
    reqs(deqPtr).valid := false.B
  }

  // enq
  val count_vsreq = PopCount(io.sbuffer_enq.map(_.valid))
  val canEnqueue = (distanceBetween(enqPtrExt(0), deqPtrExt) + count_vsreq) <= QueueSize.U

  when(canEnqueue) {
    // NOTE(review): port i always writes slot enqPtrExt(i) while the pointers advance by the
    // number of valid ports, so this assumes valid ports are contiguous from port 0; verify
    // against the caller.
    for (i <- 0 until EnsbufferWidth) {
      when(io.sbuffer_enq(i).valid) {
        reqs(enqPtrExt(i).value) := io.sbuffer_enq(i)
      }
    }
    enqPtrExt.foreach(ptr => ptr := ptr + count_vsreq)
  }

  XSPerfAccumulate("canNotEnqueue", !canEnqueue)
  XSPerfAccumulate("prefetch_train_fire", io.prefetch_train.fire)
  XSPerfAccumulate("full", PopCount(reqs.map(_.valid)) === QueueSize.U)
}

/** Top-level store prefetch wrapper: feeds sbuffer enqueue requests through a [[Serializer]]
  * into [[StorePrefetchBursts]] and exposes the resulting prefetch requests.
  */
class StorePfWrapper()(implicit p: Parameters) extends DCacheModule with HasStorePrefetchHelper {
  val io = IO(new DCacheBundle {
    val sbuffer_enq = Vec(EnsbufferWidth, Flipped(Valid(new DCacheWordReqWithVaddr)))
    val prefetch_req = Vec(StorePipelineWidth, DecoupledIO(new StorePrefetchReq))
    val memSetPattenDetected = Input(Bool())
  })

  // TODO: remove serializer, use a ptr in sq
  val serializer = Module(new Serializer())
  val spb = Module(new StorePrefetchBursts())

  // give multiple reqs to serializer, serializer will give out one req per cycle
  for (i <- 0 until EnsbufferWidth) {
    serializer.io.sbuffer_enq(i).valid := io.sbuffer_enq(i).valid && ENABLE_SPB.B
    serializer.io.sbuffer_enq(i).bits := io.sbuffer_enq(i).bits
  }

  // train spb
  spb.io.enable := ENABLE_SPB.B
  spb.io.memSetPattenDetected := io.memSetPattenDetected
  spb.io.sbuffer_enq.valid := serializer.io.prefetch_train.valid
  spb.io.sbuffer_enq.bits := serializer.io.prefetch_train.bits
  // spb will always receive train req
  serializer.io.prefetch_train.ready := true.B

  // fire a prefetch req
  io.prefetch_req <> spb.io.prefetch_req
}