/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.fu.fpu.FPU
import xiangshan.backend.rob.RobLsqIO
import xiangshan.cache._
import xiangshan.frontend.FtqPtr
import xiangshan.ExceptionNO._
import chisel3.ExcitingUtils
import xiangshan.cache.dcache.ReplayCarry
import xiangshan.backend.Bundles.{DynInst, MemExuOutput, MemMicroOpRbExt}

class LqPtr(implicit p: Parameters) extends CircularQueuePtr[LqPtr](
  p => p(XSCoreParamsKey).LoadQueueSize
){
}

object LqPtr {
  def apply(f: Bool, v: UInt)(implicit p: Parameters): LqPtr = {
    val ptr = Wire(new LqPtr)
    ptr.flag := f
    ptr.value := v
    ptr
  }
}

trait HasLoadHelper { this: XSModule =>
  def rdataHelper(uop: DynInst, rdata: UInt): UInt = {
    val fpWen = uop.fpWen
    LookupTree(uop.fuOpType, List(
      LSUOpType.lb   -> SignExt(rdata(7, 0) , XLEN),
      LSUOpType.lh   -> SignExt(rdata(15, 0), XLEN),
      /*
        riscv-spec-20191213: 12.2 NaN Boxing of Narrower Values
        Any operation that writes a narrower result to an f register must write
        all 1s to the uppermost FLEN−n bits to yield a legal NaN-boxed value.
      */
      LSUOpType.lw   -> Mux(fpWen, FPU.box(rdata, FPU.S), SignExt(rdata(31, 0), XLEN)),
      LSUOpType.ld   -> Mux(fpWen, FPU.box(rdata, FPU.D), SignExt(rdata(63, 0), XLEN)),
      LSUOpType.lbu  -> ZeroExt(rdata(7, 0) , XLEN),
      LSUOpType.lhu  -> ZeroExt(rdata(15, 0), XLEN),
      LSUOpType.lwu  -> ZeroExt(rdata(31, 0), XLEN),
    ))
  }
}

class LqEnqIO(implicit p: Parameters) extends XSBundle {
  private val LsExuCnt = backendParams.StaCnt + backendParams.LduCnt
  val canAccept = Output(Bool())
  val sqCanAccept = Input(Bool())
  val needAlloc = Vec(LsExuCnt, Input(Bool()))
  val req = Vec(LsExuCnt, Flipped(ValidIO(new DynInst)))
  val resp = Vec(LsExuCnt, Output(new LqPtr))
}

class LqPaddrWriteBundle(implicit p: Parameters) extends XSBundle {
  val paddr = Output(UInt(PAddrBits.W))
  val lqIdx = Output(new LqPtr)
}

class LqVaddrWriteBundle(implicit p: Parameters) extends XSBundle {
  val vaddr = Output(UInt(VAddrBits.W))
  val lqIdx = Output(new LqPtr)
}

class LqTriggerIO(implicit p: Parameters) extends XSBundle {
  val hitLoadAddrTriggerHitVec = Input(Vec(3, Bool()))
  val lqLoadAddrTriggerHitVec = Output(Vec(3, Bool()))
}

class LoadQueueIOBundle(implicit p: Parameters) extends XSBundle {
  val enq = new LqEnqIO
  val brqRedirect = Flipped(ValidIO(new Redirect))
  val loadOut = Vec(LoadPipelineWidth, Decoupled(new LsPipelineBundle)) // select load from lq to load pipeline
  val loadPaddrIn = Vec(LoadPipelineWidth, Flipped(Valid(new LqPaddrWriteBundle)))
  val loadVaddrIn = Vec(LoadPipelineWidth, Flipped(Valid(new LqVaddrWriteBundle)))
  val loadIn = Vec(LoadPipelineWidth, Flipped(Valid(new LqWriteBundle)))
  val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
  val s2_load_data_forwarded = Vec(LoadPipelineWidth, Input(Bool()))
  val s3_delayed_load_error = Vec(LoadPipelineWidth, Input(Bool()))
  val s2_dcache_require_replay = Vec(LoadPipelineWidth, Input(Bool()))
  val s3_replay_from_fetch = Vec(LoadPipelineWidth, Input(Bool()))
  val ldout = Vec(2, DecoupledIO(new MemExuOutput)) // writeback int load
  val ldRawDataOut = Vec(2, Output(new LoadDataFromLQBundle))
  val load_s1 = Vec(LoadPipelineWidth, Flipped(new PipeLoadForwardQueryIO)) // TODO: to be renamed
  val loadViolationQuery = Vec(LoadPipelineWidth, Flipped(new LoadViolationQueryIO))
  val rob = Flipped(new RobLsqIO)
  val rollback = Output(Valid(new Redirect)) // replay now starts from load instead of store
  val refill = Flipped(ValidIO(new Refill)) // TODO: to be renamed
  val release = Flipped(ValidIO(new Release))
  val uncache = new UncacheWordIO
  val exceptionAddr = new ExceptionAddrIO
  val lqFull = Output(Bool())
  val lqCancelCnt = Output(UInt(log2Up(LoadQueueSize + 1).W))
  val trigger = Vec(LoadPipelineWidth, new LqTriggerIO)

  // for load replay (receive feedback from the load pipeline)
  val replayFast = Vec(LoadPipelineWidth, Flipped(new LoadToLsqFastIO))
  val replaySlow = Vec(LoadPipelineWidth, Flipped(new LoadToLsqSlowIO))

  val storeDataValidVec = Vec(StoreQueueSize, Input(Bool()))

  val tlbReplayDelayCycleCtrl = Vec(4, Input(UInt(ReSelectLen.W)))
}

// Load Queue
class LoadQueue(implicit p: Parameters) extends XSModule
  with HasDCacheParameters
  with HasCircularQueuePtrHelper
  with HasLoadHelper
  with HasPerfEvents
{
  val io = IO(new LoadQueueIOBundle())

  // dontTouch(io)

  println("LoadQueue: size:" + LoadQueueSize)

  val uop = Reg(Vec(LoadQueueSize, new DynInst))
  val replayCarryReg = RegInit(VecInit(List.fill(LoadQueueSize)(ReplayCarry(0.U, false.B))))
  // val data = Reg(Vec(LoadQueueSize, new LsRobEntry))
  val dataModule = Module(new LoadQueueDataWrapper(LoadQueueSize, wbNumWrite = LoadPipelineWidth))
  dataModule.io := DontCare
  // vaddrModule's read port 0 for exception addr, port 1 for uncache vaddr read, port {2, 3} for load replay
  val vaddrModule = Module(new SyncDataModuleTemplate(UInt(VAddrBits.W), LoadQueueSize, numRead = 1 + 1 + LoadPipelineWidth, numWrite = LoadPipelineWidth))
  vaddrModule.io := DontCare
  val vaddrTriggerResultModule = Module(new SyncDataModuleTemplate(Vec(3, Bool()), LoadQueueSize, numRead = LoadPipelineWidth, numWrite = LoadPipelineWidth))
  vaddrTriggerResultModule.io := DontCare
  val allocated = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // lq entry has been allocated
  val datavalid = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // data is valid
  val writebacked = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // inst has been written back to CDB
  val released = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // load data has been released by dcache
  val error = RegInit(VecInit(List.fill(LoadQueueSize)(false.B))) // load data has been corrupted
  val miss = Reg(Vec(LoadQueueSize, Bool())) // load inst missed, waiting for miss queue to accept miss request
  // val listening = Reg(Vec(LoadQueueSize, Bool())) // waiting for refill result
  val pending = Reg(Vec(LoadQueueSize, Bool())) // mmio pending: inst is an mmio inst, it will not be executed until it reaches the end of rob
  val refilling = WireInit(VecInit(List.fill(LoadQueueSize)(false.B))) // inst is being refilled this cycle (always false here: data is no longer refilled from dcache, see note below)

  /**
    * used for load replay control
    */

  val tlb_hited = RegInit(VecInit(List.fill(LoadQueueSize)(true.B)))
  val ld_ld_check_ok = RegInit(VecInit(List.fill(LoadQueueSize)(true.B)))
  val st_ld_check_ok = RegInit(VecInit(List.fill(LoadQueueSize)(true.B)))
  val cache_bank_no_conflict = RegInit(VecInit(List.fill(LoadQueueSize)(true.B)))
  val cache_no_replay = RegInit(VecInit(List.fill(LoadQueueSize)(true.B)))
  val forward_data_valid = RegInit(VecInit(List.fill(LoadQueueSize)(true.B)))
  val cache_hited = RegInit(VecInit(List.fill(LoadQueueSize)(true.B)))


  /**
    * used for re-select control
    */

  val credit = RegInit(VecInit(List.fill(LoadQueueSize)(0.U(ReSelectLen.W))))

  // ptrs to control which cycle to choose
  val block_ptr_tlb = RegInit(VecInit(List.fill(LoadQueueSize)(0.U(2.W))))
  val block_ptr_cache = RegInit(VecInit(List.fill(LoadQueueSize)(0.U(2.W))))
  val block_ptr_others = RegInit(VecInit(List.fill(LoadQueueSize)(0.U(2.W))))

  // specific cycles to block
  val block_cycles_tlb = Reg(Vec(4, UInt(ReSelectLen.W)))
  block_cycles_tlb := io.tlbReplayDelayCycleCtrl
  val block_cycles_cache = RegInit(VecInit(Seq(11.U(ReSelectLen.W), 0.U(ReSelectLen.W), 31.U(ReSelectLen.W), 0.U(ReSelectLen.W))))
  val block_cycles_others = RegInit(VecInit(Seq(0.U(ReSelectLen.W), 0.U(ReSelectLen.W), 0.U(ReSelectLen.W), 0.U(ReSelectLen.W))))

  val sel_blocked = RegInit(VecInit(List.fill(LoadQueueSize)(false.B)))

  // data forward block
  val block_sq_idx = RegInit(VecInit(List.fill(LoadQueueSize)(0.U((log2Ceil(StoreQueueSize).W)))))
  val block_by_data_forward_fail = RegInit(VecInit(List.fill(LoadQueueSize)(false.B)))

  // dcache miss block
  val miss_mshr_id = RegInit(VecInit(List.fill(LoadQueueSize)(0.U((log2Up(cfg.nMissEntries).W)))))
  val block_by_cache_miss = RegInit(VecInit(List.fill(LoadQueueSize)(false.B)))

  val true_cache_miss_replay = WireInit(VecInit(List.fill(LoadQueueSize)(false.B)))
  (0 until LoadQueueSize).map{i => {
    true_cache_miss_replay(i) := tlb_hited(i) && ld_ld_check_ok(i) && st_ld_check_ok(i) && cache_bank_no_conflict(i) &&
                                 cache_no_replay(i) && forward_data_valid(i) && !cache_hited(i)
  }}

  val creditUpdate = WireInit(VecInit(List.fill(LoadQueueSize)(0.U(ReSelectLen.W))))

  credit := creditUpdate

  (0 until LoadQueueSize).map(i => {
    creditUpdate(i) := Mux(credit(i) > 0.U(ReSelectLen.W), credit(i) - 1.U(ReSelectLen.W), credit(i))
    sel_blocked(i) := creditUpdate(i) =/= 0.U(ReSelectLen.W)
  })

  (0 until LoadQueueSize).map(i => {
    block_by_data_forward_fail(i) := Mux(block_by_data_forward_fail(i) === true.B && io.storeDataValidVec(block_sq_idx(i)) === true.B, false.B, block_by_data_forward_fail(i))
  })

  (0 until LoadQueueSize).map(i => {
    block_by_cache_miss(i) := Mux(block_by_cache_miss(i) === true.B && io.refill.valid && io.refill.bits.id === miss_mshr_id(i), false.B, block_by_cache_miss(i))
    when(creditUpdate(i) === 0.U && block_by_cache_miss(i) === true.B) {
      block_by_cache_miss(i) := false.B
    }
    when(block_by_cache_miss(i) === true.B && io.refill.valid && io.refill.bits.id === miss_mshr_id(i)) {
      creditUpdate(i) := 0.U
    }
  })

  val debug_mmio = Reg(Vec(LoadQueueSize, Bool())) // mmio: inst is an mmio inst
  val debug_paddr = Reg(Vec(LoadQueueSize, UInt(PAddrBits.W))) // debug: paddr of the inst

  val enqPtrExt = RegInit(VecInit((0 until io.enq.req.length).map(_.U.asTypeOf(new LqPtr))))
  val deqPtrExt = RegInit(0.U.asTypeOf(new LqPtr))
  val deqPtrExtNext = Wire(new LqPtr)

  val enqPtr = enqPtrExt(0).value
  val deqPtr = deqPtrExt.value

  val validCount = distanceBetween(enqPtrExt(0), deqPtrExt)
  val allowEnqueue = validCount <= (LoadQueueSize - LoadPipelineWidth).U

  val deqMask = UIntToMask(deqPtr, LoadQueueSize)
  val enqMask = UIntToMask(enqPtr, LoadQueueSize)

  val commitCount = RegNext(io.rob.lcommit)

  val release1cycle = io.release
  val release2cycle = RegNext(io.release)
  val release2cycle_dup_lsu = RegNext(io.release)

  /**
    * Enqueue at dispatch
    *
    * Currently, LoadQueue only allows enqueue when #emptyEntries > EnqWidth
    */
  io.enq.canAccept := allowEnqueue

  val canEnqueue = io.enq.req.map(_.valid)
  val enqCancel = io.enq.req.map(_.bits.robIdx.needFlush(io.brqRedirect))
  for (i <- 0 until io.enq.req.length) {
    val offset = if (i == 0) 0.U else PopCount(io.enq.needAlloc.take(i))
    val lqIdx = enqPtrExt(offset)
    val index = io.enq.req(i).bits.lqIdx.value
    when (canEnqueue(i) && !enqCancel(i)) {
      uop(index) := io.enq.req(i).bits
      // NOTE: the index will be used when replaying
      uop(index).lqIdx := lqIdx
      allocated(index) := true.B
      datavalid(index) := false.B
      writebacked(index) := false.B
      released(index) := false.B
      miss(index) := false.B
      pending(index) := false.B
      error(index) := false.B

      /**
        * used for load replay control
        */
      tlb_hited(index) := true.B
      ld_ld_check_ok(index) := true.B
      st_ld_check_ok(index) := true.B
      cache_bank_no_conflict(index) := true.B
      cache_no_replay(index) := true.B
      forward_data_valid(index) := true.B
      cache_hited(index) := true.B

      /**
        * used for delaying load (block-ptr to control how many cycles to block)
        */
      credit(index) := 0.U(ReSelectLen.W)
      block_ptr_tlb(index) := 0.U(2.W)
      block_ptr_cache(index) := 0.U(2.W)
      block_ptr_others(index) := 0.U(2.W)

      block_by_data_forward_fail(index) := false.B
      block_by_cache_miss(index) := false.B

      XSError(!io.enq.canAccept || !io.enq.sqCanAccept, s"must accept $i\n")
      XSError(index =/= lqIdx.value, s"must be the same entry $i\n")
    }
    io.enq.resp(i) := lqIdx
  }
  XSDebug(p"(ready, valid): ${io.enq.canAccept}, ${Binary(Cat(io.enq.req.map(_.valid)))}\n")

  val lastCycleRedirect = RegNext(io.brqRedirect)
  val lastlastCycleRedirect = RegNext(lastCycleRedirect)

  // replay logic
  // replay is split into 2 stages

  // stage1: select 2 entries and read their vaddr
  val s0_block_load_mask = WireInit(VecInit((0 until LoadQueueSize).map(x => false.B)))
  val s1_block_load_mask = RegNext(s0_block_load_mask)
  val s2_block_load_mask = RegNext(s1_block_load_mask)

  val loadReplaySel = Wire(Vec(LoadPipelineWidth, UInt(log2Up(LoadQueueSize).W))) // index selected last cycle
  val loadReplaySelV = Wire(Vec(LoadPipelineWidth, Bool())) // index selected in last cycle is valid

  val loadReplaySelVec = VecInit((0 until LoadQueueSize).map(i => {
    val blocked = s1_block_load_mask(i) || s2_block_load_mask(i) || sel_blocked(i) || block_by_data_forward_fail(i) || block_by_cache_miss(i)
    allocated(i) && (!tlb_hited(i) || !ld_ld_check_ok(i) || !st_ld_check_ok(i) || !cache_bank_no_conflict(i) || !cache_no_replay(i) || !forward_data_valid(i) || !cache_hited(i)) && !blocked
  })).asUInt() // use UInt instead of Vec to reduce verilog lines

  val remReplayDeqMask = Seq.tabulate(LoadPipelineWidth)(getRemBits(deqMask)(_))

  // generate lastCycleSelect mask
  val remReplayFireMask = Seq.tabulate(LoadPipelineWidth)(rem => getRemBits(UIntToOH(loadReplaySel(rem)))(rem))

  val loadReplayRemSelVecFire = Seq.tabulate(LoadPipelineWidth)(rem => getRemBits(loadReplaySelVec)(rem) & ~remReplayFireMask(rem))
  val loadReplayRemSelVecNotFire = Seq.tabulate(LoadPipelineWidth)(getRemBits(loadReplaySelVec)(_))

  val replayRemFire = Seq.tabulate(LoadPipelineWidth)(rem => WireInit(false.B))

  val loadReplayRemSel = Seq.tabulate(LoadPipelineWidth)(rem => Mux(
    replayRemFire(rem),
    getFirstOne(toVec(loadReplayRemSelVecFire(rem)), remReplayDeqMask(rem)),
    getFirstOne(toVec(loadReplayRemSelVecNotFire(rem)), remReplayDeqMask(rem))
  ))

  val loadReplaySelGen = Wire(Vec(LoadPipelineWidth, UInt(log2Up(LoadQueueSize).W)))
  val loadReplaySelVGen = Wire(Vec(LoadPipelineWidth, Bool()))

  (0 until LoadPipelineWidth).foreach(index => {
    loadReplaySelGen(index) := (
      if (LoadPipelineWidth > 1) Cat(loadReplayRemSel(index), index.U(log2Ceil(LoadPipelineWidth).W))
      else loadReplayRemSel(index)
    )
    loadReplaySelVGen(index) := Mux(replayRemFire(index), loadReplayRemSelVecFire(index).asUInt.orR, loadReplayRemSelVecNotFire(index).asUInt.orR)
  })

  (0 until LoadPipelineWidth).map(i => {
    vaddrModule.io.raddr(LoadPipelineWidth + i) := loadReplaySelGen(i)
  })

  (0 until LoadPipelineWidth).map(i => {
    loadReplaySel(i) := RegNext(loadReplaySelGen(i))
    loadReplaySelV(i) := RegNext(loadReplaySelVGen(i), init = false.B)
  })

  // stage2: replay to load pipeline (if no load in S0)
  (0 until LoadPipelineWidth).map(i => {
    when(replayRemFire(i)) {
      s0_block_load_mask(loadReplaySel(i)) := true.B
    }
  })
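  // Note on the selection scheme above (illustrative, assuming LoadPipelineWidth = 2): entries
  // are banked by index remainder, so replay port 0 only considers entries {0, 2, 4, ...} and
  // port 1 only considers entries {1, 3, 5, ...}. Within a bank, getFirstOne starts scanning at
  // deqPtr (via remReplayDeqMask) so older loads are replayed first. Stage 1 registers the
  // selected index/valid; stage 2 drives io.loadOut, and a fired entry is masked through
  // s0_block_load_mask so it cannot be re-selected during the next two cycles.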

  // init
  (0 until LoadPipelineWidth).map(i => {
    replayRemFire(i) := false.B
  })

  for (i <- 0 until LoadPipelineWidth) {
    val replayIdx = loadReplaySel(i)
    val notRedirectLastCycle = !uop(replayIdx).robIdx.needFlush(RegNext(io.brqRedirect))

    io.loadOut(i).valid := loadReplaySelV(i) && notRedirectLastCycle

    io.loadOut(i).bits := DontCare
    io.loadOut(i).bits.uop := uop(replayIdx)
    io.loadOut(i).bits.vaddr := vaddrModule.io.rdata(LoadPipelineWidth + i)
    io.loadOut(i).bits.mask := genWmask(vaddrModule.io.rdata(LoadPipelineWidth + i), uop(replayIdx).fuOpType(1, 0))
    io.loadOut(i).bits.isFirstIssue := false.B
    io.loadOut(i).bits.isLoadReplay := true.B
    io.loadOut(i).bits.replayCarry := replayCarryReg(replayIdx)
    io.loadOut(i).bits.mshrid := miss_mshr_id(replayIdx)
    io.loadOut(i).bits.forward_tlDchannel := true_cache_miss_replay(replayIdx)

    when(io.loadOut(i).fire) {
      replayRemFire(i) := true.B
    }
  }

  /**
    * Writeback load from load units
    *
    * Most load instructions writeback to regfile at the same time.
    * However,
    *   (1) For an mmio instruction with exceptions, it writes back to ROB immediately.
    *   (2) For an mmio instruction without exceptions, it does not write back.
    *       The mmio instruction will be sent to lower level when it reaches ROB's head.
    *       After uncache response, it will write back through arbiter with loadUnit.
    *   (3) For cache misses, it is marked miss and sent to dcache later.
    *       After cache refills, it will write back through arbiter with loadUnit.
    */
  for (i <- 0 until LoadPipelineWidth) {
    dataModule.io.wb.wen(i) := false.B
    dataModule.io.paddr.wen(i) := false.B
    vaddrModule.io.wen(i) := false.B
    vaddrTriggerResultModule.io.wen(i) := false.B
    val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value

    // most lq status bits need to be updated immediately after load writeback to lq
    // flag bits in lq need to be updated accurately
    when(io.loadIn(i).fire()) {
      when(io.loadIn(i).bits.miss) {
        XSInfo(io.loadIn(i).valid, "load miss write to lq idx %d pc 0x%x vaddr %x paddr %x mask %x forwardData %x forwardMask: %x mmio %x\n",
          io.loadIn(i).bits.uop.lqIdx.asUInt,
          io.loadIn(i).bits.uop.pc,
          io.loadIn(i).bits.vaddr,
          io.loadIn(i).bits.paddr,
          io.loadIn(i).bits.mask,
          io.loadIn(i).bits.forwardData.asUInt,
          io.loadIn(i).bits.forwardMask.asUInt,
          io.loadIn(i).bits.mmio
        )
      }.otherwise {
        XSInfo(io.loadIn(i).valid, "load hit write to cdb lqidx %d pc 0x%x vaddr %x paddr %x mask %x forwardData %x forwardMask: %x mmio %x\n",
          io.loadIn(i).bits.uop.lqIdx.asUInt,
          io.loadIn(i).bits.uop.pc,
          io.loadIn(i).bits.vaddr,
          io.loadIn(i).bits.paddr,
          io.loadIn(i).bits.mask,
          io.loadIn(i).bits.forwardData.asUInt,
          io.loadIn(i).bits.forwardMask.asUInt,
          io.loadIn(i).bits.mmio
        )
      }
      if (EnableFastForward) {
        datavalid(loadWbIndex) := !io.loadIn(i).bits.miss &&
          !io.loadIn(i).bits.mmio && // mmio data is not valid until we finished uncache access
          !io.s2_dcache_require_replay(i) // do not writeback if that inst will be resend from rs
      } else {
        datavalid(loadWbIndex) := !io.loadIn(i).bits.miss &&
          !io.loadIn(i).bits.mmio // mmio data is not valid until we finished uncache access
      }
      writebacked(loadWbIndex) := !io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio

      debug_mmio(loadWbIndex) := io.loadIn(i).bits.mmio
      debug_paddr(loadWbIndex) := io.loadIn(i).bits.paddr

      val dcacheMissed = io.loadIn(i).bits.miss && !io.loadIn(i).bits.mmio
      if (EnableFastForward) {
        miss(loadWbIndex) := dcacheMissed && !io.s2_load_data_forwarded(i) && !io.s2_dcache_require_replay(i)
      } else {
        miss(loadWbIndex) := dcacheMissed && !io.s2_load_data_forwarded(i)
      }
      pending(loadWbIndex) := io.loadIn(i).bits.mmio
      released(loadWbIndex) := release2cycle.valid &&
        io.loadIn(i).bits.paddr(PAddrBits-1, DCacheLineOffset) === release2cycle.bits.paddr(PAddrBits-1, DCacheLineOffset) ||
        release1cycle.valid &&
        io.loadIn(i).bits.paddr(PAddrBits-1, DCacheLineOffset) === release1cycle.bits.paddr(PAddrBits-1, DCacheLineOffset)
    }

    // data bit in lq can be updated when load_s2 valid
    // when(io.loadIn(i).bits.lq_data_wen){
    //   val loadWbData = Wire(new LQDataEntry)
    //   loadWbData.paddr := io.loadIn(i).bits.paddr
    //   loadWbData.mask := io.loadIn(i).bits.mask
    //   loadWbData.data := io.loadIn(i).bits.forwardData.asUInt // fwd data
    //   loadWbData.fwdMask := io.loadIn(i).bits.forwardMask
    //   dataModule.io.wbWrite(i, loadWbIndex, loadWbData)
    //   dataModule.io.wb.wen(i) := true.B

    //   // dirty code for load instr
    //   uop(loadWbIndex).pdest := io.loadIn(i).bits.uop.pdest
    //   uop(loadWbIndex).cf := io.loadIn(i).bits.uop.cf
    //   uop(loadWbIndex).ctrl := io.loadIn(i).bits.uop.ctrl
    //   uop(loadWbIndex).debugInfo := io.loadIn(i).bits.uop.debugInfo

    //   vaddrTriggerResultModule.io.waddr(i) := loadWbIndex
    //   vaddrTriggerResultModule.io.wdata(i) := io.trigger(i).hitLoadAddrTriggerHitVec

    //   vaddrTriggerResultModule.io.wen(i) := true.B
    // }

    // dirty code to reduce load_s2.valid fanout
    when(io.loadIn(i).bits.lq_data_wen_dup(0)){
      dataModule.io.wbWrite(i, loadWbIndex, io.loadIn(i).bits.mask)
      dataModule.io.wb.wen(i) := true.B
    }
    // dirty code for load instr
    // Todo: solve this elegantly
    when(io.loadIn(i).bits.lq_data_wen_dup(1)){
      uop(loadWbIndex) := io.loadIn(i).bits.uop
    }
    when(io.loadIn(i).bits.lq_data_wen_dup(4)){
      uop(loadWbIndex).debugInfo := io.loadIn(i).bits.uop.debugInfo
    }
    when(io.loadIn(i).bits.lq_data_wen_dup(5)){
      vaddrTriggerResultModule.io.waddr(i) := loadWbIndex
      vaddrTriggerResultModule.io.wdata(i) := io.trigger(i).hitLoadAddrTriggerHitVec
      vaddrTriggerResultModule.io.wen(i) := true.B
    }

    when(io.loadPaddrIn(i).valid) {
      dataModule.io.paddr.wen(i) := true.B
      dataModule.io.paddr.waddr(i) := io.loadPaddrIn(i).bits.lqIdx.value
      dataModule.io.paddr.wdata(i) := io.loadPaddrIn(i).bits.paddr
    }

    // update vaddr in load S1
    when(io.loadVaddrIn(i).valid) {
      vaddrModule.io.wen(i) := true.B
      vaddrModule.io.waddr(i) := io.loadVaddrIn(i).bits.lqIdx.value
      vaddrModule.io.wdata(i) := io.loadVaddrIn(i).bits.vaddr
    }

    /**
      * used for feedback and replay
      */
    when(io.replayFast(i).valid){
      val idx = io.replayFast(i).ld_idx
      val needreplay = !io.replayFast(i).ld_ld_check_ok || !io.replayFast(i).st_ld_check_ok || !io.replayFast(i).cache_bank_no_conflict

      ld_ld_check_ok(idx) := io.replayFast(i).ld_ld_check_ok
      st_ld_check_ok(idx) := io.replayFast(i).st_ld_check_ok
      cache_bank_no_conflict(idx) := io.replayFast(i).cache_bank_no_conflict

      when(needreplay) {
        creditUpdate(idx) := block_cycles_others(block_ptr_others(idx))
        block_ptr_others(idx) := Mux(block_ptr_others(idx) === 3.U(2.W), block_ptr_others(idx), block_ptr_others(idx) + 1.U(2.W))
        // try to replay this load in next cycle
        s1_block_load_mask(idx) := false.B
        s2_block_load_mask(idx) := false.B

        // replay this load in next cycle
        loadReplaySelGen(idx(log2Ceil(LoadPipelineWidth) - 1, 0)) := idx
        loadReplaySelVGen(idx(log2Ceil(LoadPipelineWidth) - 1, 0)) := true.B
      }
    }
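    // Illustrative summary of the fast feedback path above: it covers failures known early in
    // the pipe (ld-ld order, st-ld order, dcache bank conflict). Besides updating the per-entry
    // flags, it clears the s1/s2 block masks and overrides loadReplaySelGen/loadReplaySelVGen for
    // the port matching the entry's low index bits, so the failed load can be re-selected on the
    // very next cycle instead of waiting for the normal round-robin selection.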

    when(io.replaySlow(i).valid){
      val idx = io.replaySlow(i).ld_idx
      val needreplay = !io.replaySlow(i).tlb_hited || !io.replaySlow(i).st_ld_check_ok || !io.replaySlow(i).cache_no_replay || !io.replaySlow(i).forward_data_valid || !io.replaySlow(i).cache_hited

      tlb_hited(idx) := io.replaySlow(i).tlb_hited
      st_ld_check_ok(idx) := io.replaySlow(i).st_ld_check_ok
      cache_no_replay(idx) := io.replaySlow(i).cache_no_replay
      forward_data_valid(idx) := io.replaySlow(i).forward_data_valid
      replayCarryReg(idx) := io.replaySlow(i).replayCarry
      cache_hited(idx) := io.replaySlow(i).cache_hited

      val invalid_sq_idx = io.replaySlow(i).data_invalid_sq_idx

      when(needreplay) {
        // update credit and ptr
        val data_in_last_beat = io.replaySlow(i).data_in_last_beat
        creditUpdate(idx) := Mux(!io.replaySlow(i).tlb_hited, block_cycles_tlb(block_ptr_tlb(idx)),
                             Mux(!io.replaySlow(i).cache_hited, block_cycles_cache(block_ptr_cache(idx)) + data_in_last_beat,
                             Mux(!io.replaySlow(i).cache_no_replay || !io.replaySlow(i).st_ld_check_ok, block_cycles_others(block_ptr_others(idx)), 0.U)))
        when(!io.replaySlow(i).tlb_hited) {
          block_ptr_tlb(idx) := Mux(block_ptr_tlb(idx) === 3.U(2.W), block_ptr_tlb(idx), block_ptr_tlb(idx) + 1.U(2.W))
        }.elsewhen(!io.replaySlow(i).cache_hited) {
          block_ptr_cache(idx) := Mux(block_ptr_cache(idx) === 3.U(2.W), block_ptr_cache(idx), block_ptr_cache(idx) + 1.U(2.W))
        }.elsewhen(!io.replaySlow(i).cache_no_replay || !io.replaySlow(i).st_ld_check_ok) {
          block_ptr_others(idx) := Mux(block_ptr_others(idx) === 3.U(2.W), block_ptr_others(idx), block_ptr_others(idx) + 1.U(2.W))
        }
      }

      // special case: data forward fail
      block_by_data_forward_fail(idx) := false.B

      when(!io.replaySlow(i).forward_data_valid && io.replaySlow(i).tlb_hited) {
        when(!io.storeDataValidVec(invalid_sq_idx)) {
          block_by_data_forward_fail(idx) := true.B
          block_sq_idx(idx) := invalid_sq_idx
        }
      }

      // special case: cache miss
      miss_mshr_id(idx) := io.replaySlow(i).miss_mshr_id
      block_by_cache_miss(idx) := io.replaySlow(i).tlb_hited && io.replaySlow(i).cache_no_replay && io.replaySlow(i).st_ld_check_ok && // this load tlb hit and no cache replay
        !io.replaySlow(i).cache_hited && !io.replaySlow(i).can_forward_full_data && // cache miss
        !(io.refill.valid && io.refill.bits.id === io.replaySlow(i).miss_mshr_id) && // no refill in this cycle
        creditUpdate(idx) =/= 0.U // credit is not zero
    }

  }
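  // Illustrative timing of the slow replay feedback above (block-cycle values are the reset
  // values in this file; the TLB delays come from io.tlbReplayDelayCycleCtrl and are
  // configuration dependent): a load reporting !cache_hited with block_ptr_cache(idx) == 0 gets
  // creditUpdate(idx) = block_cycles_cache(0) + data_in_last_beat, i.e. 11 or 12 cycles of
  // sel_blocked, and block_ptr_cache(idx) saturates upward so repeated misses of the same entry
  // index into later entries of block_cycles_cache (11, 0, 31, 0 after reset). Forward-fail and
  // cache-miss replays are additionally gated by block_by_data_forward_fail / block_by_cache_miss,
  // which are cleared by storeDataValidVec and by a refill whose id matches miss_mshr_id.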

  when(io.refill.valid) {
    XSDebug("miss resp: paddr:0x%x data %x\n", io.refill.bits.addr, io.refill.bits.data)
  }

  // NOTE: we don't refill data from dcache now!

  val s2_dcache_require_replay = WireInit(VecInit((0 until LoadPipelineWidth).map(i => {
    RegNext(io.loadIn(i).fire()) && RegNext(io.s2_dcache_require_replay(i))
  })))
  dontTouch(s2_dcache_require_replay)

  for (i <- 0 until LoadPipelineWidth) {
    val loadWbIndex = io.loadIn(i).bits.uop.lqIdx.value
    val lastCycleLoadWbIndex = RegNext(loadWbIndex)
    // update miss state in load s3
    if (!EnableFastForward) {
      // s2_dcache_require_replay will be used to update lq flag 1 cycle later for better timing
      //
      // io.s2_dcache_require_replay comes from dcache miss req reject, which is quite slow to generate
      when(s2_dcache_require_replay(i)) {
        // do not writeback if that inst will be resend from rs
        // rob writeback will not be triggered by a refill before inst replay
        miss(lastCycleLoadWbIndex) := false.B // disable refill listening
        datavalid(lastCycleLoadWbIndex) := false.B // disable refill listening
        assert(!datavalid(lastCycleLoadWbIndex))
      }
    }
    // update load error state in load s3
    when(RegNext(io.loadIn(i).fire) && io.s3_delayed_load_error(i)){
      uop(lastCycleLoadWbIndex).exceptionVec(loadAccessFault) := true.B
    }
    // update inst replay from fetch flag in s3
    when(RegNext(io.loadIn(i).fire) && io.s3_replay_from_fetch(i)){
      uop(lastCycleLoadWbIndex).replayInst := true.B
    }
  }

  /**
    * Load commits
    *
    * When a load is committed, mark it as !allocated and move deqPtrExt forward.
    */
  (0 until CommitWidth).map(i => {
    when(commitCount > i.U){
      allocated((deqPtrExt + i.U).value) := false.B
      XSError(!allocated((deqPtrExt + i.U).value), s"why commit invalid entry $i?\n")
    }
  })

  def toVec(a: UInt): Vec[Bool] = {
    VecInit(a.asBools)
  }

  def getRemBits(input: UInt)(rem: Int): UInt = {
    VecInit((0 until LoadQueueSize / LoadPipelineWidth).map(i => { input(LoadPipelineWidth * i + rem) })).asUInt
  }

  def getFirstOne(mask: Vec[Bool], startMask: UInt) = {
    val length = mask.length
    val highBits = (0 until length).map(i => mask(i) & ~startMask(i))
    val highBitsUint = Cat(highBits.reverse)
    PriorityEncoder(Mux(highBitsUint.orR(), highBitsUint, mask.asUInt))
  }

  def getOldest[T <: MemMicroOpRbExt](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    assert(isPow2(valid.length))
    if (valid.length == 1) {
      (valid, bits)
    } else if (valid.length == 2) {
      val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0)))))
      for (i <- res.indices) {
        res(i).valid := valid(i)
        res(i).bits := bits(i)
      }
      val oldest = Mux(valid(0) && valid(1), Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx), res(1), res(0)), Mux(valid(0) && !valid(1), res(0), res(1)))
      (Seq(oldest.valid), Seq(oldest.bits))
    } else {
      val left = getOldest(valid.take(valid.length / 2), bits.take(valid.length / 2))
      val right = getOldest(valid.takeRight(valid.length / 2), bits.takeRight(valid.length / 2))
      getOldest(left._1 ++ right._1, left._2 ++ right._2)
    }
  }

  def getAfterMask(valid: Seq[Bool], uop: Seq[DynInst]) = {
    assert(valid.length == uop.length)
    val length = valid.length
    (0 until length).map(i => {
      (0 until length).map(j => {
        Mux(valid(i) && valid(j),
          isAfter(uop(i).robIdx, uop(j).robIdx),
          Mux(!valid(i), true.B, false.B))
      })
    })
  }
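  // Worked example for getFirstOne (hypothetical 8-entry queue, deqPtr = 5, so
  // startMask = UIntToMask(5, 8) = b0001_1111): with candidate mask b0100_0100 (entries 2 and 6),
  // highBits = b0100_0000 and entry 6 is chosen, since indices at or after deqPtr are older in
  // circular order; with only entry 2 set, highBits is empty and the wrapped entry 2 is chosen.
  // getOldest does the analogous selection by robIdx, reducing a power-of-two group of
  // valid/bits pairs pairwise with isAfter.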

  /**
    * Store-Load Memory violation detection
    *
    * When a store writes back, it searches the LoadQueue for younger load instructions
    * with the same load physical address. They loaded wrong data and need re-execution.
    *
    * Cycle 0: Store Writeback
    *   Generate match vector for store address with rangeMask(stPtr, enqPtr).
    * Cycle 1: Redirect Generation
    *   There are up to 2 possible redirect requests.
    *   Choose the oldest load (part 1).
    * Cycle 2: Redirect Fire
    *   Choose the oldest load (part 2).
    *   Prepare redirect request according to the detected violation.
    *   Fire redirect request (if valid)
    */

  // stage 0:        lq                 lq
  //                 |                  |  (paddr match)
  // stage 1:        lq                 lq
  //                 |                  |
  //                 |                  |
  //                 |                  |
  // stage 2:        lq                 lq
  //                 |                  |
  //                 --------------------
  //                          |
  //                      rollback req
  io.load_s1 := DontCare
  def detectRollback(i: Int) = {
    val startIndex = io.storeIn(i).bits.uop.lqIdx.value
    val lqIdxMask = UIntToMask(startIndex, LoadQueueSize)
    val xorMask = lqIdxMask ^ enqMask
    val sameFlag = io.storeIn(i).bits.uop.lqIdx.flag === enqPtrExt(0).flag
    val stToEnqPtrMask = Mux(sameFlag, xorMask, ~xorMask)

    // check if a load already in lq needs to be rolled back
    dataModule.io.violation(i).paddr := io.storeIn(i).bits.paddr
    dataModule.io.violation(i).mask := io.storeIn(i).bits.mask
    val addrMaskMatch = RegNext(dataModule.io.violation(i).violationMask)
    val entryNeedCheck = RegNext(VecInit((0 until LoadQueueSize).map(j => {
      allocated(j) && stToEnqPtrMask(j) && datavalid(j)
    })))
    val lqViolationVec = VecInit((0 until LoadQueueSize).map(j => {
      addrMaskMatch(j) && entryNeedCheck(j)
    }))
    val lqViolation = lqViolationVec.asUInt().orR() && RegNext(!io.storeIn(i).bits.miss)
    val lqViolationIndex = getFirstOne(lqViolationVec, RegNext(lqIdxMask))
    val lqViolationUop = uop(lqViolationIndex)
    // lqViolationUop.lqIdx.flag := deqMask(lqViolationIndex) ^ deqPtrExt.flag
    // lqViolationUop.lqIdx.value := lqViolationIndex
    XSDebug(lqViolation, p"${Binary(Cat(lqViolationVec))}, $startIndex, $lqViolationIndex\n")

    XSDebug(
      lqViolation,
      "need rollback (ld wb before store) pc %x robidx %d target %x\n",
      io.storeIn(i).bits.uop.pc, io.storeIn(i).bits.uop.robIdx.asUInt, lqViolationUop.robIdx.asUInt
    )

    (lqViolation, lqViolationUop)
  }

  def rollbackSel(a: Valid[MemMicroOpRbExt], b: Valid[MemMicroOpRbExt]): ValidIO[MemMicroOpRbExt] = {
    Mux(
      a.valid,
      Mux(
        b.valid,
        Mux(isAfter(a.bits.uop.robIdx, b.bits.uop.robIdx), b, a), // a,b both valid, sel oldest
        a // sel a
      ),
      b // sel b
    )
  }

  // S2: select rollback (part1) and generate rollback request
  // rollback check
  // Lq rollback seq check is done in s3 (next stage), as getting rollbackLq MicroOp is slow
  val rollbackLq = Wire(Vec(StorePipelineWidth, Valid(new MemMicroOpRbExt)))
  // store ftq index for store set update
  val stFtqIdxS2 = Wire(Vec(StorePipelineWidth, new FtqPtr))
  val stFtqOffsetS2 = Wire(Vec(StorePipelineWidth, UInt(log2Up(PredictWidth).W)))
  for (i <- 0 until StorePipelineWidth) {
    val detectedRollback = detectRollback(i)
    rollbackLq(i).valid := detectedRollback._1 && RegNext(io.storeIn(i).valid)
    rollbackLq(i).bits.uop := detectedRollback._2
    rollbackLq(i).bits.flag := i.U
    stFtqIdxS2(i) := RegNext(io.storeIn(i).bits.uop.ftqPtr)
    stFtqOffsetS2(i) := RegNext(io.storeIn(i).bits.uop.ftqOffset)
  }

  val rollbackLqVReg = rollbackLq.map(x => RegNext(x.valid))
  val rollbackLqReg = rollbackLq.map(x => RegEnable(x.bits, x.valid))
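  // Worked example for the age mask used in detectRollback above (hypothetical LoadQueueSize = 8):
  // a store with lqIdx.value = 2 while enqPtr = 6, both with the same flag, gives
  // lqIdxMask = b0000_0011, enqMask = b0011_1111 and xorMask = b0011_1100, so only entries 2..5
  // (the loads younger than the store) are checked for a paddr/mask overlap. When the flags
  // differ the queue has wrapped, and ~xorMask selects the complementary range instead.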

  // S3: select rollback (part2), generate rollback request, then fire rollback request
  // Note that we use robIdx - 1.U to flush the load instruction itself.
  // Thus, here if last cycle's robIdx equals this cycle's robIdx, it still triggers the redirect.

  // select uop in parallel
  val lqs = getOldest(rollbackLqVReg, rollbackLqReg)
  val rollbackUopExt = lqs._2(0)
  val stFtqIdxS3 = RegNext(stFtqIdxS2)
  val stFtqOffsetS3 = RegNext(stFtqOffsetS2)
  val rollbackUop = rollbackUopExt.uop
  val rollbackStFtqIdx = stFtqIdxS3(rollbackUopExt.flag)
  val rollbackStFtqOffset = stFtqOffsetS3(rollbackUopExt.flag)

  // check if rollback request is still valid in parallel
  io.rollback.bits.robIdx := rollbackUop.robIdx
  io.rollback.bits.ftqIdx := rollbackUop.ftqPtr
  io.rollback.bits.stFtqIdx := rollbackStFtqIdx
  io.rollback.bits.ftqOffset := rollbackUop.ftqOffset
  io.rollback.bits.stFtqOffset := rollbackStFtqOffset
  io.rollback.bits.level := RedirectLevel.flush
  io.rollback.bits.interrupt := DontCare
  io.rollback.bits.cfiUpdate := DontCare
  io.rollback.bits.cfiUpdate.target := rollbackUop.pc
  io.rollback.bits.debug_runahead_checkpoint_id := rollbackUop.debugInfo.runahead_checkpoint_id
  // io.rollback.bits.pc := DontCare

  io.rollback.valid := rollbackLqVReg.reduce(_|_) &&
    (!lastCycleRedirect.valid || isBefore(rollbackUop.robIdx, lastCycleRedirect.bits.robIdx)) &&
    (!lastlastCycleRedirect.valid || isBefore(rollbackUop.robIdx, lastlastCycleRedirect.bits.robIdx))

  when(io.rollback.valid) {
    // XSDebug("Mem rollback: pc %x robidx %d\n", io.rollback.bits.cfi, io.rollback.bits.robIdx.asUInt)
  }

  /**
    * Load-Load Memory violation detection
    *
    * When a load arrives at load_s1, it searches the LoadQueue for younger load instructions
    * with the same load physical address. If a younger load has been released (or observed),
    * the younger load needs to be re-executed.
    *
    * For now, if re-exec is found to be needed in load_s1, we mark the older load as replayInst,
    * and the two loads will be replayed if the older load becomes the head of rob.
    *
    * When dcache releases a line, mark all writebacked entries in load queue with
    * the same line paddr as released.
    */
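  // Illustrative scenario for the query below: a younger load to address X executes early and
  // sits in the queue; another core then writes X, which is observed here as a Release that sets
  // the entry's released flag. When the older load to X later reaches load_s1, its query finds a
  // younger, already-released entry with a matching paddr, so have_violation is raised and, per
  // the note above, the older load is marked replayInst and both loads re-execute in order once
  // it reaches the head of the ROB.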

  // Load-Load Memory violation query
  val deqRightMask = UIntToMask.rightmask(deqPtr, LoadQueueSize)
  (0 until LoadPipelineWidth).map(i => {
    dataModule.io.release_violation(i).paddr := io.loadViolationQuery(i).req.bits.paddr
    io.loadViolationQuery(i).req.ready := true.B
    io.loadViolationQuery(i).resp.valid := RegNext(io.loadViolationQuery(i).req.fire())
    // Generate real violation mask
    // Note that we use UIntToMask.rightmask here
    val startIndex = io.loadViolationQuery(i).req.bits.uop.lqIdx.value
    val lqIdxMask = UIntToMask(startIndex, LoadQueueSize)
    val xorMask = lqIdxMask ^ enqMask
    val sameFlag = io.loadViolationQuery(i).req.bits.uop.lqIdx.flag === enqPtrExt(0).flag
    val ldToEnqPtrMask = Mux(sameFlag, xorMask, ~xorMask)
    val ldld_violation_mask_gen_1 = WireInit(VecInit((0 until LoadQueueSize).map(j => {
      ldToEnqPtrMask(j) && // the load is younger than current load
      allocated(j) && // entry is valid
      released(j) && // cacheline is released
      (datavalid(j) || miss(j)) // paddr is valid
    })))
    val ldld_violation_mask_gen_2 = WireInit(VecInit((0 until LoadQueueSize).map(j => {
      dataModule.io.release_violation(i).match_mask(j) // addr match
      // addr match result is slow to generate, we RegNext() it
    })))
    val ldld_violation_mask = RegNext(ldld_violation_mask_gen_1).asUInt & RegNext(ldld_violation_mask_gen_2).asUInt
    dontTouch(ldld_violation_mask)
    ldld_violation_mask.suggestName("ldldViolationMask_" + i)
    io.loadViolationQuery(i).resp.bits.have_violation := ldld_violation_mask.orR
  })

  // "released" flag update
  //
  // When io.release.valid (release1cycle.valid), it uses the last ld-ld paddr cam port to
  // update release flag in 1 cycle

  when(release1cycle.valid){
    // Take over ld-ld paddr cam port
    dataModule.io.release_violation.takeRight(1)(0).paddr := release1cycle.bits.paddr
    io.loadViolationQuery.takeRight(1)(0).req.ready := false.B
  }

  when(release2cycle.valid){
    // If a load comes in that cycle, we cannot judge whether it has an ld-ld violation
    // We replay that load inst from RS
    io.loadViolationQuery.map(i => i.req.ready :=
      // use lsu side release2cycle_dup_lsu paddr for better timing
      !(i.req.bits.paddr(PAddrBits-1, DCacheLineOffset) === release2cycle_dup_lsu.bits.paddr(PAddrBits-1, DCacheLineOffset))
    )
    // io.loadViolationQuery.map(i => i.req.ready := false.B) // For better timing
  }

  (0 until LoadQueueSize).map(i => {
    when(RegNext(dataModule.io.release_violation.takeRight(1)(0).match_mask(i) &&
      allocated(i) &&
      datavalid(i) &&
      release1cycle.valid
    )){
      // Note: if a load has missed in dcache and is waiting for refill in load queue,
      // its released flag still needs to be set as true if addr matches.
      released(i) := true.B
    }
  })

  /**
    * Memory mapped IO / other uncached operations
    *
    * States:
    * (1) writeback from load units: mark as pending
    * (2) when they reach ROB's head, they can be sent to uncache channel
    * (3) response from uncache channel: mark as datavalid
    * (4) writeback to ROB (and other units): mark as writebacked
    * (5) ROB commits the instruction: same as normal instructions
    */
  //(2) when they reach ROB's head, they can be sent to uncache channel
  val lqTailMmioPending = WireInit(pending(deqPtr))
  val lqTailAllocated = WireInit(allocated(deqPtr))
  val s_idle :: s_req :: s_resp :: s_wait :: Nil = Enum(4)
  val uncacheState = RegInit(s_idle)
  switch(uncacheState) {
    is(s_idle) {
      when(RegNext(io.rob.pendingld && lqTailMmioPending && lqTailAllocated)) {
        uncacheState := s_req
      }
    }
    is(s_req) {
      when(io.uncache.req.fire()) {
        uncacheState := s_resp
      }
    }
    is(s_resp) {
      when(io.uncache.resp.fire()) {
        uncacheState := s_wait
      }
    }
    is(s_wait) {
      when(RegNext(io.rob.commit)) {
        uncacheState := s_idle // ready for next mmio
      }
    }
  }

  // used for uncache commit
  val uncacheData = RegInit(0.U(XLEN.W))
  val uncacheCommitFired = RegInit(false.B)

  when(uncacheState === s_req) {
    uncacheCommitFired := false.B
  }

  io.uncache.req.valid := uncacheState === s_req

  dataModule.io.uncache.raddr := deqPtrExtNext.value

  io.uncache.req.bits.cmd := MemoryOpConstants.M_XRD
  io.uncache.req.bits.addr := dataModule.io.uncache.rdata.paddr
  io.uncache.req.bits.data := DontCare
  io.uncache.req.bits.mask := dataModule.io.uncache.rdata.mask
  io.uncache.req.bits.id := RegNext(deqPtrExtNext.value)
  io.uncache.req.bits.instrtype := DontCare
  io.uncache.req.bits.replayCarry := DontCare
  io.uncache.req.bits.atomic := true.B

  io.uncache.resp.ready := true.B

  when (io.uncache.req.fire()) {
    pending(deqPtr) := false.B

    XSDebug("uncache req: pc %x addr %x data %x op %x mask %x\n",
      uop(deqPtr).pc,
      io.uncache.req.bits.addr,
      io.uncache.req.bits.data,
      io.uncache.req.bits.cmd,
      io.uncache.req.bits.mask
    )
  }

  // (3) response from uncache channel: mark as datavalid
  when(io.uncache.resp.fire()){
    datavalid(deqPtr) := true.B
    uncacheData := io.uncache.resp.bits.data(XLEN-1, 0)

    XSDebug("uncache resp: data %x\n", io.uncache.resp.bits.data)
  }

  // writeback mmio load, Note: only use ldout(0) to write back
  //
  // Int load writeback will finish (if not blocked) in one cycle
  io.ldout(0).bits.uop := uop(deqPtr)
  io.ldout(0).bits.uop.lqIdx := deqPtr.asTypeOf(new LqPtr)
  io.ldout(0).bits.data := DontCare // not used
  io.ldout(0).bits.debug.isMMIO := true.B
  io.ldout(0).bits.debug.isPerfCnt := false.B
  io.ldout(0).bits.debug.paddr := debug_paddr(deqPtr)
  io.ldout(0).bits.debug.vaddr := vaddrModule.io.rdata(1)

  io.ldout(0).valid := (uncacheState === s_wait) && !uncacheCommitFired

  io.ldout(1).bits := DontCare
  io.ldout(1).valid := false.B

  // merged data, uop and offset for data sel in load_s3
  io.ldRawDataOut(0).lqData := uncacheData
  io.ldRawDataOut(0).uop := io.ldout(0).bits.uop
  io.ldRawDataOut(0).addrOffset := dataModule.io.uncache.rdata.paddr

  io.ldRawDataOut(1) := DontCare

  when(io.ldout(0).fire()){
    uncacheCommitFired := true.B
  }

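  // Rough recap of the MMIO load lifecycle implemented above (illustrative): the load writes back
  // to the queue with pending set; once it is the deqPtr entry and the ROB reports pendingld,
  // s_req issues a single UncacheWordIO read (M_XRD); s_resp latches the low XLEN bits of the
  // response into uncacheData; s_wait then writes the result back exactly once through ldout(0),
  // guarded by uncacheCommitFired, and the FSM returns to s_idle after the ROB commit is observed.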
XSPerfAccumulate("uncache_load_write_back", io.ldout(0).fire()) 998 999 // Read vaddr for mem exception 1000 // no inst will be commited 1 cycle before tval update 1001 vaddrModule.io.raddr(0) := (deqPtrExt + commitCount).value 1002 io.exceptionAddr.vaddr := vaddrModule.io.rdata(0) 1003 1004 // read vaddr for mmio, and only port {1} is used 1005 vaddrModule.io.raddr(1) := deqPtr 1006 1007 (0 until LoadPipelineWidth).map(i => { 1008 if(i == 0) { 1009 vaddrTriggerResultModule.io.raddr(i) := deqPtr 1010 io.trigger(i).lqLoadAddrTriggerHitVec := Mux( 1011 io.ldout(i).valid, 1012 vaddrTriggerResultModule.io.rdata(i), 1013 VecInit(Seq.fill(3)(false.B)) 1014 ) 1015 }else { 1016 vaddrTriggerResultModule.io.raddr(i) := DontCare 1017 io.trigger(i).lqLoadAddrTriggerHitVec := VecInit(Seq.fill(3)(false.B)) 1018 } 1019 // vaddrTriggerResultModule.io.raddr(i) := loadWbSelGen(i) 1020 // io.trigger(i).lqLoadAddrTriggerHitVec := Mux( 1021 // loadWbSelV(i), 1022 // vaddrTriggerResultModule.io.rdata(i), 1023 // VecInit(Seq.fill(3)(false.B)) 1024 // ) 1025 }) 1026 1027 // misprediction recovery / exception redirect 1028 // invalidate lq term using robIdx 1029 val needCancel = Wire(Vec(LoadQueueSize, Bool())) 1030 for (i <- 0 until LoadQueueSize) { 1031 needCancel(i) := uop(i).robIdx.needFlush(io.brqRedirect) && allocated(i) 1032 when (needCancel(i)) { 1033 allocated(i) := false.B 1034 } 1035 } 1036 1037 /** 1038 * update pointers 1039 */ 1040 val lastEnqCancel = PopCount(RegNext(VecInit(canEnqueue.zip(enqCancel).map(x => x._1 && x._2)))) 1041 val lastCycleCancelCount = PopCount(RegNext(needCancel)) 1042 val enqNumber = Mux(io.enq.canAccept && io.enq.sqCanAccept, PopCount(io.enq.req.map(_.valid)), 0.U) 1043 when (lastCycleRedirect.valid) { 1044 // we recover the pointers in the next cycle after redirect 1045 enqPtrExt := VecInit(enqPtrExt.map(_ - (lastCycleCancelCount + lastEnqCancel))) 1046 }.otherwise { 1047 enqPtrExt := VecInit(enqPtrExt.map(_ + enqNumber)) 1048 } 1049 1050 deqPtrExtNext := deqPtrExt + commitCount 1051 deqPtrExt := deqPtrExtNext 1052 1053 io.lqCancelCnt := RegNext(lastCycleCancelCount + lastEnqCancel) 1054 1055 /** 1056 * misc 1057 */ 1058 // perf counter 1059 QueuePerf(LoadQueueSize, validCount, !allowEnqueue) 1060 io.lqFull := !allowEnqueue 1061 XSPerfAccumulate("rollback", io.rollback.valid) // rollback redirect generated 1062 XSPerfAccumulate("mmioCycle", uncacheState =/= s_idle) // lq is busy dealing with uncache req 1063 XSPerfAccumulate("mmioCnt", io.uncache.req.fire()) 1064 XSPerfAccumulate("refill", io.refill.valid) 1065 XSPerfAccumulate("writeback_success", PopCount(VecInit(io.ldout.map(i => i.fire())))) 1066 XSPerfAccumulate("writeback_blocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready)))) 1067 XSPerfAccumulate("utilization_miss", PopCount((0 until LoadQueueSize).map(i => allocated(i) && miss(i)))) 1068 1069 if (env.EnableTopDown) { 1070 val stall_loads_bound = WireDefault(0.B) 1071 ExcitingUtils.addSink(stall_loads_bound, "stall_loads_bound", ExcitingUtils.Perf) 1072 val have_miss_entry = (allocated zip miss).map(x => x._1 && x._2).reduce(_ || _) 1073 val l1d_loads_bound = stall_loads_bound && !have_miss_entry 1074 ExcitingUtils.addSource(l1d_loads_bound, "l1d_loads_bound", ExcitingUtils.Perf) 1075 XSPerfAccumulate("l1d_loads_bound", l1d_loads_bound) 1076 val stall_l1d_load_miss = stall_loads_bound && have_miss_entry 1077 ExcitingUtils.addSource(stall_l1d_load_miss, "stall_l1d_load_miss", ExcitingUtils.Perf) 1078 ExcitingUtils.addSink(WireInit(0.U), 
"stall_l1d_load_miss", ExcitingUtils.Perf) 1079 } 1080 1081 val perfValidCount = RegNext(validCount) 1082 1083 val perfEvents = Seq( 1084 ("rollback ", io.rollback.valid), 1085 ("mmioCycle ", uncacheState =/= s_idle), 1086 ("mmio_Cnt ", io.uncache.req.fire()), 1087 ("refill ", io.refill.valid), 1088 ("writeback_success", PopCount(VecInit(io.ldout.map(i => i.fire())))), 1089 ("writeback_blocked", PopCount(VecInit(io.ldout.map(i => i.valid && !i.ready)))), 1090 ("ltq_1_4_valid ", (perfValidCount < (LoadQueueSize.U/4.U))), 1091 ("ltq_2_4_valid ", (perfValidCount > (LoadQueueSize.U/4.U)) & (perfValidCount <= (LoadQueueSize.U/2.U))), 1092 ("ltq_3_4_valid ", (perfValidCount > (LoadQueueSize.U/2.U)) & (perfValidCount <= (LoadQueueSize.U*3.U/4.U))), 1093 ("ltq_4_4_valid ", (perfValidCount > (LoadQueueSize.U*3.U/4.U))) 1094 ) 1095 generatePerfEvent() 1096 1097 // debug info 1098 XSDebug("enqPtrExt %d:%d deqPtrExt %d:%d\n", enqPtrExt(0).flag, enqPtr, deqPtrExt.flag, deqPtr) 1099 1100 def PrintFlag(flag: Bool, name: String): Unit = { 1101 when(flag) { 1102 XSDebug(false, true.B, name) 1103 }.otherwise { 1104 XSDebug(false, true.B, " ") 1105 } 1106 } 1107 1108 for (i <- 0 until LoadQueueSize) { 1109 XSDebug(i + " pc %x pa %x ", uop(i).pc, debug_paddr(i)) 1110 PrintFlag(allocated(i), "a") 1111 PrintFlag(allocated(i) && datavalid(i), "v") 1112 PrintFlag(allocated(i) && writebacked(i), "w") 1113 PrintFlag(allocated(i) && miss(i), "m") 1114 PrintFlag(allocated(i) && pending(i), "p") 1115 XSDebug(false, true.B, "\n") 1116 } 1117 1118} 1119