xref: /XiangShan/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala (revision deb6421e9ab9b7980dc6c429456fc7bd2161357b)
1/***************************************************************************************
2* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
3* Copyright (c) 2020-2021 Peng Cheng Laboratory
4*
5* XiangShan is licensed under Mulan PSL v2.
6* You can use this software according to the terms and conditions of the Mulan PSL v2.
7* You may obtain a copy of Mulan PSL v2 at:
8*          http://license.coscl.org.cn/MulanPSL2
9*
10* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13*
14* See the Mulan PSL v2 for more details.
15***************************************************************************************/
16
17package xiangshan.mem
18
19import chipsalliance.rocketchip.config.Parameters
20import chisel3._
21import chisel3.util._
22import utils._
23import xiangshan._
24import xiangshan.cache._
25import xiangshan.cache.{DCacheWordIO, DCacheLineIO, MemoryOpConstants}
26import xiangshan.backend.rob.{RobLsqIO, RobPtr}
27import difftest._
28import device.RAMHelper
29
30class SqPtr(implicit p: Parameters) extends CircularQueuePtr[SqPtr](
31  p => p(XSCoreParamsKey).StoreQueueSize
32){
33}
34
35object SqPtr {
36  def apply(f: Bool, v: UInt)(implicit p: Parameters): SqPtr = {
37    val ptr = Wire(new SqPtr)
38    ptr.flag := f
39    ptr.value := v
40    ptr
41  }
42}
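// Illustrative sketch (not part of the original design): SqPtr is a circular queue pointer
// with an extra wrap flag. Assuming, for illustration, StoreQueueSize = 8:
//   enq = (flag = 1, value = 2), deq = (flag = 0, value = 6)
//   distanceBetween(enq, deq) = 8 - 6 + 2 = 4 valid entries, and isAfter(enq, deq) holds
//   because the flags differ while enq.value < deq.value.
// A minimal usage sketch (hypothetical, outside this file):
//   val p = SqPtr(false.B, 0.U)          // flag = 0, value = 0
//   val q = p + (StoreQueueSize + 1).U   // wraps once: flag = 1, value = 1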
43
44class SqEnqIO(implicit p: Parameters) extends XSBundle {
45  val canAccept = Output(Bool())
46  val lqCanAccept = Input(Bool())
47  val needAlloc = Vec(exuParameters.LsExuCnt, Input(Bool()))
48  val req = Vec(exuParameters.LsExuCnt, Flipped(ValidIO(new MicroOp)))
49  val resp = Vec(exuParameters.LsExuCnt, Output(new SqPtr))
50}
51
52class DataBufferEntry (implicit p: Parameters)  extends DCacheBundle {
53  val addr   = UInt(PAddrBits.W)
54  val vaddr  = UInt(VAddrBits.W)
55  val data   = UInt(DataBits.W)
56  val mask   = UInt((DataBits/8).W)
57  val wline = Bool()
58  val sqPtr  = new SqPtr
59}
60
61// Store Queue
62class StoreQueue(implicit p: Parameters) extends XSModule
63  with HasDCacheParameters with HasCircularQueuePtrHelper with HasPerfEvents {
64  val io = IO(new Bundle() {
65    val hartId = Input(UInt(8.W))
66    val enq = new SqEnqIO
67    val brqRedirect = Flipped(ValidIO(new Redirect))
68    val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle))) // store addr, data is not included
69    val storeInRe = Vec(StorePipelineWidth, Input(new LsPipelineBundle())) // store addr info replayed one cycle later, carrying mmio and exception
70    val storeDataIn = Vec(StorePipelineWidth, Flipped(Valid(new ExuOutput))) // store data, send to sq from rs
71    val storeMaskIn = Vec(StorePipelineWidth, Flipped(Valid(new StoreMaskBundle))) // store mask, send to sq from rs
72    val sbuffer = Vec(EnsbufferWidth, Decoupled(new DCacheWordReqWithVaddr)) // write committed store to sbuffer
73    val uncacheOutstanding = Input(Bool())
74    val mmioStout = DecoupledIO(new ExuOutput) // writeback uncached store
75    val forward = Vec(LoadPipelineWidth, Flipped(new PipeLoadForwardQueryIO))
76    val rob = Flipped(new RobLsqIO)
77    val uncache = new UncacheWordIO
78    // val refill = Flipped(Valid(new DCacheLineReq ))
79    val exceptionAddr = new ExceptionAddrIO
80    val sqempty = Output(Bool())
81    val issuePtrExt = Output(new SqPtr) // used to wake up delayed load/store
82    val sqFull = Output(Bool())
83    val sqCancelCnt = Output(UInt(log2Up(StoreQueueSize + 1).W))
84    val sqDeq = Output(UInt(log2Ceil(EnsbufferWidth + 1).W))
85    val storeDataValidVec = Vec(StoreQueueSize, Output(Bool()))
86  })
87
88  println("StoreQueue: size:" + StoreQueueSize)
89
90  // data modules
91  val uop = Reg(Vec(StoreQueueSize, new MicroOp))
92  // val data = Reg(Vec(StoreQueueSize, new LsqEntry))
93  val dataModule = Module(new SQDataModule(
94    numEntries = StoreQueueSize,
95    numRead = EnsbufferWidth,
96    numWrite = StorePipelineWidth,
97    numForward = StorePipelineWidth
98  ))
99  dataModule.io := DontCare
100  val paddrModule = Module(new SQAddrModule(
101    dataWidth = PAddrBits,
102    numEntries = StoreQueueSize,
103    numRead = EnsbufferWidth,
104    numWrite = StorePipelineWidth,
105    numForward = StorePipelineWidth
106  ))
107  paddrModule.io := DontCare
108  val vaddrModule = Module(new SQAddrModule(
109    dataWidth = VAddrBits,
110    numEntries = StoreQueueSize,
111    numRead = EnsbufferWidth + 1, // EnsbufferWidth read ports for sbuffer + 1 for badvaddr (TODO)
112    numWrite = StorePipelineWidth,
113    numForward = StorePipelineWidth
114  ))
115  vaddrModule.io := DontCare
116  val dataBuffer = Module(new DatamoduleResultBuffer(new DataBufferEntry))
117  val debug_paddr = Reg(Vec(StoreQueueSize, UInt((PAddrBits).W)))
118  val debug_vaddr = Reg(Vec(StoreQueueSize, UInt((VAddrBits).W)))
119  val debug_data = Reg(Vec(StoreQueueSize, UInt((XLEN).W)))
120
121  // state & misc
122  val allocated = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // sq entry has been allocated
123  val addrvalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // non-mmio addr is valid
124  val datavalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // non-mmio data is valid
125  val allvalid  = VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && datavalid(i))) // non-mmio data & addr is valid
126  val committed = Reg(Vec(StoreQueueSize, Bool())) // inst has been committed by rob
127  val pending = Reg(Vec(StoreQueueSize, Bool())) // mmio pending: inst is an mmio inst; it will not be executed until it reaches the end of rob
128  val mmio = Reg(Vec(StoreQueueSize, Bool())) // mmio: inst is an mmio inst
129  val atomic = Reg(Vec(StoreQueueSize, Bool()))
130
131  // ptr
132  val enqPtrExt = RegInit(VecInit((0 until io.enq.req.length).map(_.U.asTypeOf(new SqPtr))))
133  val rdataPtrExt = RegInit(VecInit((0 until EnsbufferWidth).map(_.U.asTypeOf(new SqPtr))))
134  val deqPtrExt = RegInit(VecInit((0 until EnsbufferWidth).map(_.U.asTypeOf(new SqPtr))))
135  val cmtPtrExt = RegInit(VecInit((0 until CommitWidth).map(_.U.asTypeOf(new SqPtr))))
136  val issuePtrExt = RegInit(0.U.asTypeOf(new SqPtr))
137  val validCounter = RegInit(0.U(log2Ceil(LoadQueueSize + 1).W))
138
139  val enqPtr = enqPtrExt(0).value
140  val deqPtr = deqPtrExt(0).value
141  val cmtPtr = cmtPtrExt(0).value
142
143  val validCount = distanceBetween(enqPtrExt(0), deqPtrExt(0))
144  val allowEnqueue = validCount <= (StoreQueueSize - StorePipelineWidth).U
145
146  val deqMask = UIntToMask(deqPtr, StoreQueueSize)
147  val enqMask = UIntToMask(enqPtr, StoreQueueSize)
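  // Worked example (assumes, for illustration, StoreQueueSize = 8): UIntToMask(ptr, n) sets
  // the low `ptr` bits, so with deqPtr = 2 and enqPtr = 5:
  //   deqMask = 0b0000_0011, enqMask = 0b0001_1111, deqMask ^ enqMask = 0b0001_1100,
  // i.e. entries 2..4 are currently occupied (same-flag case). These masks feed the
  // store-to-load forwarding range checks below.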
148
149  val commitCount = RegNext(io.rob.scommit)
150
151  (0 until StoreQueueSize).foreach { i =>
152    io.storeDataValidVec(i) := datavalid(i)
153  }
154
155  // Read dataModule
156  assert(EnsbufferWidth <= 2)
157  // the entries pointed to by rdataPtrExtNext will be read from dataModule
158  val rdataPtrExtNext = WireInit(Mux(dataBuffer.io.enq(1).fire(),
159    VecInit(rdataPtrExt.map(_ + 2.U)),
160    Mux(dataBuffer.io.enq(0).fire() || io.mmioStout.fire(),
161      VecInit(rdataPtrExt.map(_ + 1.U)),
162      rdataPtrExt
163    )
164  ))
165
166  // deqPtrExtNext tracks which inst is about to leave the store queue
167  //
168  // io.sbuffer(i).fire() is RegNexted, as the sbuffer data write takes 2 cycles.
169  // Before the data write finishes, sbuffer is unable to provide store-to-load
170  // forward data. As a workaround, the deqPtrExt and allocated flag updates
171  // are delayed so that loads can get the right data from the store queue.
172  //
173  // Modify deqPtrExtNext and io.sqDeq with care!
174  val deqPtrExtNext = Mux(RegNext(io.sbuffer(1).fire()),
175    VecInit(deqPtrExt.map(_ + 2.U)),
176    Mux(RegNext(io.sbuffer(0).fire()) || io.mmioStout.fire(),
177      VecInit(deqPtrExt.map(_ + 1.U)),
178      deqPtrExt
179    )
180  )
181  io.sqDeq := RegNext(Mux(RegNext(io.sbuffer(1).fire()), 2.U,
182    Mux(RegNext(io.sbuffer(0).fire()) || io.mmioStout.fire(), 1.U, 0.U)
183  ))
184  assert(!RegNext(RegNext(io.sbuffer(0).fire()) && io.mmioStout.fire()))
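  // Illustrative timing for the delayed dequeue described above:
  //   T0: io.sbuffer(0).fire()                  -- the entry leaves dataBuffer towards sbuffer
  //   T1: RegNext(io.sbuffer(0).fire()) is high -- deqPtrExtNext advances by one
  //   T2: deqPtrExt / allocated reflect the dequeue and io.sqDeq reports 1
  // Through T1 the entry is still marked allocated and keeps answering load forward queries,
  // which is exactly the workaround the comment above describes.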
185
186  for (i <- 0 until EnsbufferWidth) {
187    dataModule.io.raddr(i) := rdataPtrExtNext(i).value
188    paddrModule.io.raddr(i) := rdataPtrExtNext(i).value
189    vaddrModule.io.raddr(i) := rdataPtrExtNext(i).value
190  }
191
192  // no inst will be committed 1 cycle before tval update
193  vaddrModule.io.raddr(EnsbufferWidth) := (cmtPtrExt(0) + commitCount).value
194
195  /**
196    * Enqueue at dispatch
197    *
198    * Currently, StoreQueue only allows enqueue when #emptyEntries > EnqWidth
199    */
200  io.enq.canAccept := allowEnqueue
201  val canEnqueue = io.enq.req.map(_.valid)
202  val enqCancel = io.enq.req.map(_.bits.robIdx.needFlush(io.brqRedirect))
203  for (i <- 0 until io.enq.req.length) {
204    val offset = if (i == 0) 0.U else PopCount(io.enq.needAlloc.take(i))
205    val sqIdx = enqPtrExt(offset)
206    val index = io.enq.req(i).bits.sqIdx.value
207    when (canEnqueue(i) && !enqCancel(i)) {
208      uop(index) := io.enq.req(i).bits
209      // NOTE: the index will be used when the store is replayed
210      uop(index).sqIdx := sqIdx
211      allocated(index) := true.B
212      datavalid(index) := false.B
213      addrvalid(index) := false.B
214      committed(index) := false.B
215      pending(index) := false.B
216
217      XSError(!io.enq.canAccept || !io.enq.lqCanAccept, s"must accept $i\n")
218      XSError(index =/= sqIdx.value, s"must be the same entry $i\n")
219    }
220    io.enq.resp(i) := sqIdx
221  }
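  // Illustrative note: the enqueue offset counts how many earlier requests allocate a store
  // queue entry. E.g. if the first three needAlloc bits are (1, 0, 1), the offsets are
  // 0, 1, 1: req(0) gets enqPtrExt(0), req(2) gets enqPtrExt(1), and req(1) (not a store)
  // does not claim a store queue slot at all.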
222  XSDebug(p"(ready, valid): ${io.enq.canAccept}, ${Binary(Cat(io.enq.req.map(_.valid)))}\n")
223
224  /**
225    * Update issuePtr when issue from rs
226    */
227  // update issuePtr
228  val IssuePtrMoveStride = 4
229  require(IssuePtrMoveStride >= 2)
230
231  val issueLookupVec = (0 until IssuePtrMoveStride).map(issuePtrExt + _.U)
232  val issueLookup = issueLookupVec.map(ptr => allocated(ptr.value) && addrvalid(ptr.value) && datavalid(ptr.value) && ptr =/= enqPtrExt(0))
233  val nextIssuePtr = issuePtrExt + PriorityEncoder(VecInit(issueLookup.map(!_) :+ true.B))
234  issuePtrExt := nextIssuePtr
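  // Worked example: the issue pointer can skip over up to IssuePtrMoveStride consecutive
  // ready entries per cycle. If issueLookup = (1, 1, 0, 1), then
  // issueLookup.map(!_) :+ true.B = (0, 0, 1, 0, 1) and PriorityEncoder returns 2, so
  // issuePtrExt advances by 2 and stops at the first entry that is not yet ready.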
235
236  when (io.brqRedirect.valid) {
237    issuePtrExt := Mux(
238      isAfter(cmtPtrExt(0), deqPtrExt(0)),
239      cmtPtrExt(0),
240      deqPtrExtNext(0) // for mmio insts, deqPtr may be ahead of cmtPtr
241    )
242  }
243  // send issuePtrExt to rs
244  // io.issuePtrExt := cmtPtrExt(0)
245  io.issuePtrExt := issuePtrExt
246
247  /**
248    * Writeback store from store units
249    *
250    * Most store instructions writeback to regfile in the previous cycle.
251    * However,
252    *   (1) For an mmio instruction with exceptions, we need to mark it as addrvalid
253    * (in this way it will trigger an exception when it reaches the ROB's head)
254    * instead of pending, to avoid sending it to the lower level.
255    *   (2) For an mmio instruction without exceptions, we mark it as pending.
256    * When the instruction reaches ROB's head, StoreQueue sends it to uncache channel.
257    * Upon receiving the response, StoreQueue writes back the instruction
258    * through arbiter with store units. It will later commit as normal.
259    */
260
261  // Write addr to sq
262  for (i <- 0 until StorePipelineWidth) {
263    paddrModule.io.wen(i) := false.B
264    vaddrModule.io.wen(i) := false.B
265    dataModule.io.mask.wen(i) := false.B
266    val stWbIndex = io.storeIn(i).bits.uop.sqIdx.value
267    when (io.storeIn(i).fire()) {
268      val addr_valid = !io.storeIn(i).bits.miss
269      addrvalid(stWbIndex) := addr_valid //!io.storeIn(i).bits.mmio
270      // pending(stWbIndex) := io.storeIn(i).bits.mmio
271
272      paddrModule.io.waddr(i) := stWbIndex
273      paddrModule.io.wdata(i) := io.storeIn(i).bits.paddr
274      paddrModule.io.wlineflag(i) := io.storeIn(i).bits.wlineflag
275      paddrModule.io.wen(i) := true.B
276
277      vaddrModule.io.waddr(i) := stWbIndex
278      vaddrModule.io.wdata(i) := io.storeIn(i).bits.vaddr
279      vaddrModule.io.wlineflag(i) := io.storeIn(i).bits.wlineflag
280      vaddrModule.io.wen(i) := true.B
281
282      debug_paddr(paddrModule.io.waddr(i)) := paddrModule.io.wdata(i)
283
284      // mmio(stWbIndex) := io.storeIn(i).bits.mmio
285
286      uop(stWbIndex).ctrl := io.storeIn(i).bits.uop.ctrl
287      uop(stWbIndex).debugInfo := io.storeIn(i).bits.uop.debugInfo
288      XSInfo("store addr write to sq idx %d pc 0x%x miss:%d vaddr %x paddr %x mmio %x\n",
289        io.storeIn(i).bits.uop.sqIdx.value,
290        io.storeIn(i).bits.uop.cf.pc,
291        io.storeIn(i).bits.miss,
292        io.storeIn(i).bits.vaddr,
293        io.storeIn(i).bits.paddr,
294        io.storeIn(i).bits.mmio
295      )
296    }
297
298    // re-replenish mmio info, since the pma/pmp check returns mmio one cycle later
299    val storeInFireReg = RegNext(io.storeIn(i).fire() && !io.storeIn(i).bits.miss)
300    val stWbIndexReg = RegNext(stWbIndex)
301    when (storeInFireReg) {
302      pending(stWbIndexReg) := io.storeInRe(i).mmio
303      mmio(stWbIndexReg) := io.storeInRe(i).mmio
304      atomic(stWbIndexReg) := io.storeInRe(i).atomic
305    }
306
307    when(vaddrModule.io.wen(i)){
308      debug_vaddr(vaddrModule.io.waddr(i)) := vaddrModule.io.wdata(i)
309    }
310  }
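  // Illustrative timing of the address writeback above:
  //   cycle N  : io.storeIn(i) fires  -> paddr/vaddr are written, addrvalid is set when the access did not miss
  //   cycle N+1: io.storeInRe(i)      -> pending/mmio/atomic are updated with the pma/pmp result
  // A store is therefore only known to be mmio one cycle after its address writes back.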
311
312  // Write data to sq
313  // Now the store data pipeline actually takes 2 stages
314  for (i <- 0 until StorePipelineWidth) {
315    dataModule.io.data.wen(i) := false.B
316    val stWbIndex = io.storeDataIn(i).bits.uop.sqIdx.value
317    // sq data write takes 2 cycles:
318    // sq data write s0
319    when (io.storeDataIn(i).fire()) {
320      // send data write req to data module
321      dataModule.io.data.waddr(i) := stWbIndex
322      dataModule.io.data.wdata(i) := Mux(io.storeDataIn(i).bits.uop.ctrl.fuOpType === LSUOpType.cbo_zero,
323        0.U,
324        genWdata(io.storeDataIn(i).bits.data, io.storeDataIn(i).bits.uop.ctrl.fuOpType(1,0))
325      )
326      dataModule.io.data.wen(i) := true.B
327
328      debug_data(dataModule.io.data.waddr(i)) := dataModule.io.data.wdata(i)
329
330      XSInfo("store data write to sq idx %d pc 0x%x data %x -> %x\n",
331        io.storeDataIn(i).bits.uop.sqIdx.value,
332        io.storeDataIn(i).bits.uop.cf.pc,
333        io.storeDataIn(i).bits.data,
334        dataModule.io.data.wdata(i)
335      )
336    }
337    // sq data write s1
338    when (
339      RegNext(io.storeDataIn(i).fire())
340      // && !RegNext(io.storeDataIn(i).bits.uop).robIdx.needFlush(io.brqRedirect)
341    ) {
342      datavalid(RegNext(stWbIndex)) := true.B
343    }
344  }
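  // Illustrative note: the store data write is pipelined over two cycles:
  //   s0 (io.storeDataIn fires): the data (zero for cbo.zero) is written into dataModule
  //   s1 (one cycle later)     : datavalid(sqIdx) is set, making the entry forwardable
  // A load that queries the entry in between sees dataInvalid (see the forward logic below).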
345
346  // Write mask to sq
347  for (i <- 0 until StorePipelineWidth) {
348    // sq mask write s0
349    when (io.storeMaskIn(i).fire()) {
350      // send data write req to data module
351      dataModule.io.mask.waddr(i) := io.storeMaskIn(i).bits.sqIdx.value
352      dataModule.io.mask.wdata(i) := io.storeMaskIn(i).bits.mask
353      dataModule.io.mask.wen(i) := true.B
354    }
355  }
356
357  /**
358    * load forward query
359    *
360    * Check the store queue for stores that are older than the load.
361    * The response will be valid at the next cycle after req.
362    */
363  // check over all sq entries and forward data from the first matched store
364  for (i <- 0 until LoadPipelineWidth) {
365    // Compare deqPtr and forward.sqIdx; there are two cases:
366    // (1) if they have the same flag, we need to check range(tail, sqIdx)
367    // (2) if they have different flags, we need to check range(tail, StoreQueueSize) and range(0, sqIdx)
368    // Forward1: Mux(same_flag, range(tail, sqIdx), range(tail, StoreQueueSize))
369    // Forward2: Mux(same_flag, 0.U,                range(0, sqIdx)            )
370    // i.e. forward1 covers the entries with the same flag bits and forward2 the rest
371    val differentFlag = deqPtrExt(0).flag =/= io.forward(i).sqIdx.flag
372    val forwardMask = io.forward(i).sqIdxMask
373    // all addrvalid terms need to be checked
374    val addrValidVec = WireInit(VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && allocated(i))))
375    val dataValidVec = WireInit(VecInit((0 until StoreQueueSize).map(i => datavalid(i))))
376    val allValidVec = WireInit(VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && datavalid(i) && allocated(i))))
377    val canForward1 = Mux(differentFlag, ~deqMask, deqMask ^ forwardMask) & allValidVec.asUInt
378    val canForward2 = Mux(differentFlag, forwardMask, 0.U(StoreQueueSize.W)) & allValidVec.asUInt
379    val needForward = Mux(differentFlag, ~deqMask | forwardMask, deqMask ^ forwardMask)
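    // Worked example (assumes, for illustration, StoreQueueSize = 8, deqPtr = 2, so
    // deqMask = 0b0000_0011, and sqIdxMask presumably = UIntToMask(sqIdx.value, 8)):
    //   same flag,      sqIdx.value = 5: canForward1 uses deqMask ^ forwardMask = 0b0001_1100
    //                                    (entries 2..4), canForward2 covers nothing;
    //   different flag, sqIdx.value = 1: canForward1 uses ~deqMask = 0b1111_1100 (entries 2..7),
    //                                    canForward2 uses forwardMask = 0b0000_0001 (entry 0).
    // Both ranges are additionally qualified by allValidVec before the CAM lookup.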
380
381    XSDebug(p"$i f1 ${Binary(canForward1)} f2 ${Binary(canForward2)} " +
382      p"sqIdx ${io.forward(i).sqIdx} pa ${Hexadecimal(io.forward(i).paddr)}\n"
383    )
384
385    // do real fwd query (cam lookup in load_s1)
386    dataModule.io.needForward(i)(0) := canForward1 & vaddrModule.io.forwardMmask(i).asUInt
387    dataModule.io.needForward(i)(1) := canForward2 & vaddrModule.io.forwardMmask(i).asUInt
388
389    vaddrModule.io.forwardMdata(i) := io.forward(i).vaddr
390    paddrModule.io.forwardMdata(i) := io.forward(i).paddr
391
392    // the vaddr cam result does not equal the paddr cam result:
393    // a replay is needed
394    // val vpmaskNotEqual = ((paddrModule.io.forwardMmask(i).asUInt ^ vaddrModule.io.forwardMmask(i).asUInt) & needForward) =/= 0.U
395    // val vaddrMatchFailed = vpmaskNotEqual && io.forward(i).valid
396    val vpmaskNotEqual = (
397      (RegNext(paddrModule.io.forwardMmask(i).asUInt) ^ RegNext(vaddrModule.io.forwardMmask(i).asUInt)) &
398      RegNext(needForward) &
399      RegNext(addrValidVec.asUInt)
400    ) =/= 0.U
401    val vaddrMatchFailed = vpmaskNotEqual && RegNext(io.forward(i).valid)
402    when (vaddrMatchFailed) {
403      XSInfo("vaddrMatchFailed: pc %x pmask %x vmask %x\n",
404        RegNext(io.forward(i).uop.cf.pc),
405        RegNext(needForward & paddrModule.io.forwardMmask(i).asUInt),
406        RegNext(needForward & vaddrModule.io.forwardMmask(i).asUInt)
407      );
408    }
409    XSPerfAccumulate("vaddr_match_failed", vpmaskNotEqual)
410    XSPerfAccumulate("vaddr_match_really_failed", vaddrMatchFailed)
411
412    // Fast forward mask will be generated immediately (load_s1)
413    io.forward(i).forwardMaskFast := dataModule.io.forwardMaskFast(i)
414
415    // Forward result will be generated 1 cycle later (load_s2)
416    io.forward(i).forwardMask := dataModule.io.forwardMask(i)
417    io.forward(i).forwardData := dataModule.io.forwardData(i)
418
419    // If the addr matches but the data is not ready, mark it as dataInvalid
420    // load_s1: generate dataInvalid in load_s1 to set fastUop
421    val dataInvalidMask = (addrValidVec.asUInt & ~dataValidVec.asUInt & vaddrModule.io.forwardMmask(i).asUInt & needForward.asUInt)
422    io.forward(i).dataInvalidFast := dataInvalidMask.orR
423    val dataInvalidMaskReg = RegNext(dataInvalidMask)
424    // load_s2
425    io.forward(i).dataInvalid := RegNext(io.forward(i).dataInvalidFast)
426    // check if vaddr forward mismatched
427    io.forward(i).matchInvalid := vaddrMatchFailed
428    val dataInvalidMaskRegWire = Wire(UInt(StoreQueueSize.W))
429    dataInvalidMaskRegWire := dataInvalidMaskReg // make chisel happy
430    io.forward(i).dataInvalidSqIdx := PriorityEncoder(dataInvalidMaskRegWire)
431  }
432
433  /**
434    * Memory mapped IO / other uncached operations
435    *
436    * States:
437    * (1) writeback from store units: mark as pending
438    * (2) when they reach ROB's head, they can be sent to uncache channel
439    * (3) response from uncache channel: mark as datavalid
440    * (4) writeback to ROB (and other units): mark as writebacked
441    * (5) ROB commits the instruction: same as normal instructions
442    */
443  //(2) when they reach ROB's head, they can be sent to uncache channel
444  val s_idle :: s_req :: s_resp :: s_wb :: s_wait :: Nil = Enum(5)
445  val uncacheState = RegInit(s_idle)
446  switch(uncacheState) {
447    is(s_idle) {
448      when(RegNext(io.rob.pendingst && pending(deqPtr) && allocated(deqPtr) && datavalid(deqPtr) && addrvalid(deqPtr))) {
449        uncacheState := s_req
450      }
451    }
452    is(s_req) {
453      when (io.uncache.req.fire) {
454        when (io.uncacheOutstanding) {
455          uncacheState := s_wb
456        } .otherwise {
457          uncacheState := s_resp
458        }
459      }
460    }
461    is(s_resp) {
462      when(io.uncache.resp.fire()) {
463        uncacheState := s_wb
464      }
465    }
466    is(s_wb) {
467      when (io.mmioStout.fire()) {
468        uncacheState := s_wait
469      }
470    }
471    is(s_wait) {
472      when(commitCount > 0.U) {
473        uncacheState := s_idle // ready for next mmio
474      }
475    }
476  }
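  // Summary of the state machine above (illustrative):
  //   s_idle -> s_req : the pending mmio store reaches the head of the ROB
  //   s_req  -> s_resp: the uncache request fires (straight to s_wb if uncacheOutstanding)
  //   s_resp -> s_wb  : the uncache response fires
  //   s_wb   -> s_wait: the store writes back through io.mmioStout
  //   s_wait -> s_idle: the ROB commits it (commitCount > 0), ready for the next mmio store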
477  io.uncache.req.valid := uncacheState === s_req
478
479  io.uncache.req.bits.cmd  := MemoryOpConstants.M_XWR
480  io.uncache.req.bits.addr := paddrModule.io.rdata(0) // data(deqPtr) -> rdata(0)
481  io.uncache.req.bits.data := dataModule.io.rdata(0).data
482  io.uncache.req.bits.mask := dataModule.io.rdata(0).mask
483
484  // CBO op type check can be delayed for 1 cycle,
485  // as uncache op will not start in s_idle
486  val cbo_mmio_addr = paddrModule.io.rdata(0) >> 2 << 2 // clear lowest 2 bits for op
487  val cbo_mmio_op = 0.U //TODO
488  val cbo_mmio_data = cbo_mmio_addr | cbo_mmio_op
489  when(RegNext(LSUOpType.isCbo(uop(deqPtr).ctrl.fuOpType))){
490    io.uncache.req.bits.addr := DontCare // TODO
491    io.uncache.req.bits.data := paddrModule.io.rdata(0)
492    io.uncache.req.bits.mask := DontCare // TODO
493  }
494
495  io.uncache.req.bits.id   := DontCare
496  io.uncache.req.bits.instrtype   := DontCare
497  io.uncache.req.bits.atomic := atomic(RegNext(rdataPtrExtNext(0)).value)
498
499  when(io.uncache.req.fire()){
500    // mmio store should not be committed until uncache req is sent
501    pending(deqPtr) := false.B
502
503    XSDebug(
504      p"uncache req: pc ${Hexadecimal(uop(deqPtr).cf.pc)} " +
505      p"addr ${Hexadecimal(io.uncache.req.bits.addr)} " +
506      p"data ${Hexadecimal(io.uncache.req.bits.data)} " +
507      p"op ${Hexadecimal(io.uncache.req.bits.cmd)} " +
508      p"mask ${Hexadecimal(io.uncache.req.bits.mask)}\n"
509    )
510  }
511
512  // (3) response from uncache channel: mark as datavalid
513  io.uncache.resp.ready := true.B
514
515  // (4) writeback to ROB (and other units): mark as writebacked
516  io.mmioStout.valid := uncacheState === s_wb
517  io.mmioStout.bits.uop := uop(deqPtr)
518  io.mmioStout.bits.uop.sqIdx := deqPtrExt(0)
519  io.mmioStout.bits.data := dataModule.io.rdata(0).data // dataModule.io.rdata.read(deqPtr)
520  io.mmioStout.bits.redirectValid := false.B
521  io.mmioStout.bits.redirect := DontCare
522  io.mmioStout.bits.debug.isMMIO := true.B
523  io.mmioStout.bits.debug.paddr := DontCare
524  io.mmioStout.bits.debug.isPerfCnt := false.B
525  io.mmioStout.bits.fflags := DontCare
526  io.mmioStout.bits.debug.vaddr := DontCare
527  // Remove the MMIO inst from the store queue after the MMIO request has been sent
528  // That inst will be tracked by the uncache state machine
529  when (io.mmioStout.fire()) {
530    allocated(deqPtr) := false.B
531  }
532
533  /**
534    * ROB commits store instructions (mark them as committed)
535    *
536    * (1) When store commits, mark it as committed.
537    * (2) They will not be cancelled and can be sent to lower level.
538    */
539  XSError(uncacheState =/= s_idle && uncacheState =/= s_wait && commitCount > 0.U,
540   "should not commit instruction when MMIO has not been finished\n")
541  for (i <- 0 until CommitWidth) {
542    when (commitCount > i.U) { // MMIO inst is not in progress
543      if(i == 0){
544        // MMIO inst should not update committed flag
545        // Note that commit count has been delayed for 1 cycle
546        when(uncacheState === s_idle){
547          committed(cmtPtrExt(0).value) := true.B
548        }
549      } else {
550        committed(cmtPtrExt(i).value) := true.B
551      }
552    }
553  }
554  cmtPtrExt := cmtPtrExt.map(_ + commitCount)
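  // Illustrative note: commitCount is RegNext(io.rob.scommit), so if the ROB commits 3 stores
  // in cycle T, then in cycle T+1 committed() is set for the entries at cmtPtrExt(0..2) and
  // cmtPtrExt advances by 3. The entry at cmtPtrExt(0) is skipped when uncacheState =/= s_idle:
  // that commit belongs to the MMIO store, which leaves the queue through io.mmioStout instead
  // of the sbuffer path.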
555
556  // committed stores will not be cancelled and can be sent to lower level.
557  // remove retired insts from sq, add retired store to sbuffer
558
559  // Read data from data module
560  // As the store queue grows larger, the time needed to read data from the data
561  // module keeps increasing. So we give the data read a whole cycle.
562
563  val mmioStall = mmio(rdataPtrExt(0).value)
564  for (i <- 0 until EnsbufferWidth) {
565    val ptr = rdataPtrExt(i).value
566    dataBuffer.io.enq(i).valid := allocated(ptr) && committed(ptr) && !mmioStall
567    // Note that store data/addr should both be valid after store's commit
568    assert(!dataBuffer.io.enq(i).valid || allvalid(ptr))
569    dataBuffer.io.enq(i).bits.addr  := paddrModule.io.rdata(i)
570    dataBuffer.io.enq(i).bits.vaddr := vaddrModule.io.rdata(i)
571    dataBuffer.io.enq(i).bits.data  := dataModule.io.rdata(i).data
572    dataBuffer.io.enq(i).bits.mask  := dataModule.io.rdata(i).mask
573    dataBuffer.io.enq(i).bits.wline := paddrModule.io.rlineflag(i)
574    dataBuffer.io.enq(i).bits.sqPtr := rdataPtrExt(i)
575  }
576
577  // Send data stored in dataBuffer to sbuffer
578  for (i <- 0 until EnsbufferWidth) {
579    io.sbuffer(i).valid := dataBuffer.io.deq(i).valid
580    dataBuffer.io.deq(i).ready := io.sbuffer(i).ready
581    // Write line request should have all 1 mask
582    assert(!(io.sbuffer(i).valid && io.sbuffer(i).bits.wline && !io.sbuffer(i).bits.mask.andR))
583    io.sbuffer(i).bits.cmd   := MemoryOpConstants.M_XWR
584    io.sbuffer(i).bits.addr  := dataBuffer.io.deq(i).bits.addr
585    io.sbuffer(i).bits.vaddr := dataBuffer.io.deq(i).bits.vaddr
586    io.sbuffer(i).bits.data  := dataBuffer.io.deq(i).bits.data
587    io.sbuffer(i).bits.mask  := dataBuffer.io.deq(i).bits.mask
588    io.sbuffer(i).bits.wline := dataBuffer.io.deq(i).bits.wline
589    io.sbuffer(i).bits.id    := DontCare
590    io.sbuffer(i).bits.instrtype    := DontCare
591
592    // io.sbuffer(i).fire() is RegNexted, as sbuffer data write takes 2 cycles.
593    // Before the data write finishes, sbuffer is unable to provide store-to-load
594    // forward data. As a workaround, the deqPtrExt and allocated flag updates
595    // are delayed so that loads can get the right data from the store queue.
596    val ptr = dataBuffer.io.deq(i).bits.sqPtr.value
597    when (RegNext(io.sbuffer(i).fire())) {
598      allocated(RegEnable(ptr, io.sbuffer(i).fire())) := false.B
599      XSDebug("sbuffer "+i+" fire: ptr %d\n", ptr)
600    }
601  }
602  (1 until EnsbufferWidth).foreach(i => when(io.sbuffer(i).fire) { assert(io.sbuffer(i - 1).fire) })
603  if (coreParams.dcacheParametersOpt.isEmpty) {
604    for (i <- 0 until EnsbufferWidth) {
605      val ptr = deqPtrExt(i).value
606      val fakeRAM = Module(new RAMHelper(64L * 1024 * 1024 * 1024))
607      fakeRAM.clk   := clock
608      fakeRAM.en    := allocated(ptr) && committed(ptr) && !mmio(ptr)
609      fakeRAM.rIdx  := 0.U
610      fakeRAM.wIdx  := (paddrModule.io.rdata(i) - "h80000000".U) >> 3
611      fakeRAM.wdata := dataModule.io.rdata(i).data
612      fakeRAM.wmask := MaskExpand(dataModule.io.rdata(i).mask)
613      fakeRAM.wen   := allocated(ptr) && committed(ptr) && !mmio(ptr)
614    }
615  }
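  // Illustrative note for the no-dcache configuration above: committed stores are written
  // straight into the RAMHelper backing memory at doubleword granularity, e.g. a store to
  // paddr 0x8000_1008 uses wIdx = (0x8000_1008 - 0x8000_0000) >> 3 = 0x201.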
616
617  if (env.EnableDifftest) {
618    for (i <- 0 until EnsbufferWidth) {
619      val storeCommit = io.sbuffer(i).fire()
620      val waddr = SignExt(io.sbuffer(i).bits.addr, 64)
621      val wdata = io.sbuffer(i).bits.data & MaskExpand(io.sbuffer(i).bits.mask)
622      val wmask = io.sbuffer(i).bits.mask
623
624      val difftest = Module(new DifftestStoreEvent)
625      difftest.io.clock       := clock
626      difftest.io.coreid      := io.hartId
627      difftest.io.index       := i.U
628      difftest.io.valid       := RegNext(RegNext(storeCommit))
629      difftest.io.storeAddr   := RegNext(RegNext(waddr))
630      difftest.io.storeData   := RegNext(RegNext(wdata))
631      difftest.io.storeMask   := RegNext(RegNext(wmask))
632    }
633  }
634
635  // Read vaddr for mem exception
636  io.exceptionAddr.vaddr := vaddrModule.io.rdata(EnsbufferWidth)
637
638  // misprediction recovery / exception redirect
639  // invalidate sq term using robIdx
640  val needCancel = Wire(Vec(StoreQueueSize, Bool()))
641  for (i <- 0 until StoreQueueSize) {
642    needCancel(i) := uop(i).robIdx.needFlush(io.brqRedirect) && allocated(i) && !committed(i)
643    when (needCancel(i)) {
644      allocated(i) := false.B
645    }
646  }
647
648  /**
649    * update pointers
650    */
651  val lastEnqCancel = PopCount(RegNext(VecInit(canEnqueue.zip(enqCancel).map(x => x._1 && x._2))))
652  val lastCycleRedirect = RegNext(io.brqRedirect.valid)
653  val lastCycleCancelCount = PopCount(RegNext(needCancel))
654  val enqNumber = Mux(io.enq.canAccept && io.enq.lqCanAccept, PopCount(io.enq.req.map(_.valid)), 0.U)
655  when (lastCycleRedirect) {
656    // we recover the pointers in the next cycle after redirect
657    enqPtrExt := VecInit(enqPtrExt.map(_ - (lastCycleCancelCount + lastEnqCancel)))
658  }.otherwise {
659    enqPtrExt := VecInit(enqPtrExt.map(_ + enqNumber))
660  }
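  // Illustrative note: on a redirect seen at cycle T, the pointers are repaired at T+1:
  // enqPtrExt steps back by lastCycleCancelCount (already-allocated entries that were squashed)
  // plus lastEnqCancel (entries whose enqueue was cancelled in the redirect cycle), instead of
  // advancing by enqNumber.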
661
662  deqPtrExt := deqPtrExtNext
663  rdataPtrExt := rdataPtrExtNext
664
665  // val dequeueCount = Mux(io.sbuffer(1).fire(), 2.U, Mux(io.sbuffer(0).fire() || io.mmioStout.fire(), 1.U, 0.U))
666
667  // If redirect at T0, sqCancelCnt is at T2
668  io.sqCancelCnt := RegNext(lastCycleCancelCount + lastEnqCancel)
669
670  // io.sqempty will be used by sbuffer
671  // We delay it for 1 cycle for better timing
672  // When sbuffer needs to check whether it is empty, the pipeline is blocked, which means
673  // delaying io.sqempty for 1 cycle also guarantees that sq is empty in that cycle
674  io.sqempty := RegNext(
675    enqPtrExt(0).value === deqPtrExt(0).value &&
676    enqPtrExt(0).flag === deqPtrExt(0).flag
677  )
678
679  // perf counter
680  QueuePerf(StoreQueueSize, validCount, !allowEnqueue)
681  io.sqFull := !allowEnqueue
682  XSPerfAccumulate("mmioCycle", uncacheState =/= s_idle) // lq is busy dealing with uncache req
683  XSPerfAccumulate("mmioCnt", io.uncache.req.fire())
684  XSPerfAccumulate("mmio_wb_success", io.mmioStout.fire())
685  XSPerfAccumulate("mmio_wb_blocked", io.mmioStout.valid && !io.mmioStout.ready)
686  XSPerfAccumulate("validEntryCnt", distanceBetween(enqPtrExt(0), deqPtrExt(0)))
687  XSPerfAccumulate("cmtEntryCnt", distanceBetween(cmtPtrExt(0), deqPtrExt(0)))
688  XSPerfAccumulate("nCmtEntryCnt", distanceBetween(enqPtrExt(0), cmtPtrExt(0)))
689
690  val perfValidCount = distanceBetween(enqPtrExt(0), deqPtrExt(0))
691  val perfEvents = Seq(
692    ("mmioCycle      ", uncacheState =/= s_idle),
693    ("mmioCnt        ", io.uncache.req.fire()),
694    ("mmio_wb_success", io.mmioStout.fire()),
695    ("mmio_wb_blocked", io.mmioStout.valid && !io.mmioStout.ready),
696    ("stq_1_4_valid  ", (perfValidCount < (StoreQueueSize.U/4.U))),
697    ("stq_2_4_valid  ", (perfValidCount > (StoreQueueSize.U/4.U)) & (perfValidCount <= (StoreQueueSize.U/2.U))),
698    ("stq_3_4_valid  ", (perfValidCount > (StoreQueueSize.U/2.U)) & (perfValidCount <= (StoreQueueSize.U*3.U/4.U))),
699    ("stq_4_4_valid  ", (perfValidCount > (StoreQueueSize.U*3.U/4.U))),
700  )
701  generatePerfEvent()
702
703  // debug info
704  XSDebug("enqPtrExt %d:%d deqPtrExt %d:%d\n", enqPtrExt(0).flag, enqPtr, deqPtrExt(0).flag, deqPtr)
705
706  def PrintFlag(flag: Bool, name: String): Unit = {
707    when(flag) {
708      XSDebug(false, true.B, name)
709    }.otherwise {
710      XSDebug(false, true.B, " ")
711    }
712  }
713
714  for (i <- 0 until StoreQueueSize) {
715    XSDebug(i + ": pc %x va %x pa %x data %x ",
716      uop(i).cf.pc,
717      debug_vaddr(i),
718      debug_paddr(i),
719      debug_data(i)
720    )
721    PrintFlag(allocated(i), "a")
722    PrintFlag(allocated(i) && addrvalid(i), "a")
723    PrintFlag(allocated(i) && datavalid(i), "d")
724    PrintFlag(allocated(i) && committed(i), "c")
725    PrintFlag(allocated(i) && pending(i), "p")
726    PrintFlag(allocated(i) && mmio(i), "m")
727    XSDebug(false, true.B, "\n")
728  }
729
730}
731