// xref: /XiangShan/src/main/scala/xiangshan/mem/lsqueue/LoadQueueRAW.scala (revision e4f69d78f24895ac36a5a6c704cec53e4af72485)
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import chisel3._
import chisel3.util._
import chipsalliance.rocketchip.config._
import xiangshan._
import xiangshan.backend.rob.RobPtr
import xiangshan.cache._
import xiangshan.frontend.FtqPtr
import xiangshan.mem.mdp._
import utils._
import utility._

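// LoadQueueRAW records loads that were issued while one or more older stores still had
// unresolved addresses, so that store-to-load (RAW) ordering violations can be detected
// once those store addresses become known. When a store address arrives on io.storeIn and
// matches a younger recorded load, a rollback request is generated on io.rollback.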
class LoadQueueRAW(implicit p: Parameters) extends XSModule
  with HasDCacheParameters
  with HasCircularQueuePtrHelper
  with HasLoadHelper
  with HasPerfEvents
{
  val io = IO(new Bundle() {
    val redirect = Flipped(ValidIO(new Redirect))
    val query = Vec(LoadPipelineWidth, Flipped(new LoadViolationQueryIO))
    val storeIn = Vec(StorePipelineWidth, Flipped(Valid(new LsPipelineBundle)))
    val rollback = Output(Valid(new Redirect))
    val stAddrReadySqPtr = Input(new SqPtr)
    val stIssuePtr = Input(new SqPtr)
    val lqFull = Output(Bool())
  })

  println("LoadQueueRAW: size " + LoadQueueRAWSize)
  //  LoadQueueRAW entry fields
  //  +-----------+---------+-------+------+-----------+
  //  | Allocated | MicroOp | PAddr | Mask | Datavalid |
  //  +-----------+---------+-------+------+-----------+
  //
  //  Field descriptions:
  //  Allocated   : the entry has been allocated
  //  MicroOp     : the load instruction's micro-op
  //  PAddr       : physical address of the load
  //  Mask        : data (byte) mask of the load
  //  Datavalid   : the load's data is valid
  //
  val allocated = RegInit(VecInit(List.fill(LoadQueueRAWSize)(false.B))) // control signals need an explicit initial value
  val uop = Reg(Vec(LoadQueueRAWSize, new MicroOp))
  val paddrModule = Module(new LqPAddrModule(
    gen = UInt(PAddrBits.W),
    numEntries = LoadQueueRAWSize,
    numRead = LoadPipelineWidth,
    numWrite = LoadPipelineWidth,
    numWBank = LoadQueueNWriteBanks,
    numWDelay = 2,
    numCamPort = StorePipelineWidth
  ))
  paddrModule.io := DontCare
  val maskModule = Module(new LqMaskModule(
    gen = UInt(8.W),
    numEntries = LoadQueueRAWSize,
    numRead = LoadPipelineWidth,
    numWrite = LoadPipelineWidth,
    numWBank = LoadQueueNWriteBanks,
    numWDelay = 2,
    numCamPort = StorePipelineWidth
  ))
  maskModule.io := DontCare
  val datavalid = RegInit(VecInit(List.fill(LoadQueueRAWSize)(false.B)))

  // freelist: stores the indices of free entries
  // +---+---+--------------+-----+-----+
  // | 0 | 1 |      ......  | n-2 | n-1 |
  // +---+---+--------------+-----+-----+
  val freeList = Module(new FreeList(
    size = LoadQueueRAWSize,
    allocWidth = LoadPipelineWidth,
    freeWidth = 4,
    moduleName = "LoadQueueRAW freelist"
  ))
  freeList.io := DontCare

  //  LoadQueueRAW enqueue
  val canEnqueue = io.query.map(_.req.valid)
  val cancelEnqueue = io.query.map(_.req.bits.uop.robIdx.needFlush(io.redirect))
  val allAddrCheck = io.stIssuePtr === io.stAddrReadySqPtr
  val hasAddrInvalidStore = io.query.map(_.req.bits.uop.sqIdx).map(sqIdx => {
    Mux(!allAddrCheck, isBefore(io.stAddrReadySqPtr, sqIdx), false.B)
  })
  val needEnqueue = canEnqueue.zip(hasAddrInvalidStore).zip(cancelEnqueue).map { case ((v, r), c) => v && r && !c }
  val bypassPAddr = Reg(Vec(LoadPipelineWidth, UInt(PAddrBits.W)))
  val bypassMask = Reg(Vec(LoadPipelineWidth, UInt(8.W)))
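  // A load needs an entry only when some older store address is still unknown
  // (stAddrReadySqPtr has not yet passed the load's sqIdx) and the load is not being
  // flushed. bypassPAddr/bypassMask keep a copy of the most recently written paddr/mask
  // per load port; this presumably covers the write latency of the CAM modules
  // (numWDelay = 2), so a store arriving right after enqueue can still be matched
  // against the just-enqueued load (see detectRollback below).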

  // Allocate logic
  val enqValidVec = Wire(Vec(LoadPipelineWidth, Bool()))
  val enqIndexVec = Wire(Vec(LoadPipelineWidth, UInt()))
  val enqOffset = Wire(Vec(LoadPipelineWidth, UInt()))

  // Enqueue
  for ((enq, w) <- io.query.map(_.req).zipWithIndex) {
    paddrModule.io.wen(w) := false.B
    maskModule.io.wen(w) := false.B
    freeList.io.doAllocate(w) := false.B

    enqOffset(w) := PopCount(needEnqueue.take(w))
    freeList.io.allocateReq(w) := needEnqueue(w)

    //  Allocate ready
    enqValidVec(w) := freeList.io.canAllocate(enqOffset(w))
    enqIndexVec(w) := freeList.io.allocateSlot(enqOffset(w))
    enq.ready := Mux(needEnqueue(w), enqValidVec(w), true.B)

    val enqIndex = enqIndexVec(w)
    when (needEnqueue(w) && enq.ready) {
      val debug_robIdx = enq.bits.uop.robIdx.asUInt
      XSError(allocated(enqIndex), p"LoadQueueRAW: You can not write a valid entry! check: ldu $w, robIdx $debug_robIdx")

      freeList.io.doAllocate(w) := true.B

      //  Allocate new entry
      allocated(enqIndex) := true.B

      //  Write paddr
      paddrModule.io.wen(w) := true.B
      paddrModule.io.waddr(w) := enqIndex
      paddrModule.io.wdata(w) := enq.bits.paddr
      bypassPAddr(w) := enq.bits.paddr

      //  Write mask
      maskModule.io.wen(w) := true.B
      maskModule.io.waddr(w) := enqIndex
      maskModule.io.wdata(w) := enq.bits.mask
      bypassMask(w) := enq.bits.mask

      //  Fill info
      uop(enqIndex) := enq.bits.uop
      datavalid(enqIndex) := enq.bits.datavalid
    }
  }
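  // In the enqueue loop above, enqOffset compacts the allocation requests so that the
  // freeList ports are used densely. For example (hypothetical 2-wide case): if only
  // load port 1 needs to enqueue this cycle, PopCount(needEnqueue.take(1)) = 0, so
  // port 1 uses freeList.io.canAllocate(0) / allocateSlot(0), i.e. the first free slot.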

  for ((query, w) <- io.query.map(_.resp).zipWithIndex) {
    query.valid := RegNext(io.query(w).req.valid)
    query.bits.replayFromFetch := RegNext(false.B)
  }
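  // The query response above never asks the load to replay from fetch directly;
  // violations detected later (when the conflicting store address arrives) are reported
  // through the io.rollback redirect path instead.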

  //  LoadQueueRAW deallocate
  val freeMaskVec = Wire(Vec(LoadQueueRAWSize, Bool()))

  // init
  freeMaskVec.map(e => e := false.B)

  // An entry can be released once every store older than its load has a ready address
  // (the load is no longer at risk of a RAW violation), or when the load is flushed
  // by a redirect.
  for (i <- 0 until LoadQueueRAWSize) {
    val deqNotBlock = Mux(!allAddrCheck, !isBefore(io.stAddrReadySqPtr, uop(i).sqIdx), true.B)
    val needCancel = uop(i).robIdx.needFlush(io.redirect)

    when (allocated(i) && (deqNotBlock || needCancel)) {
      allocated(i) := false.B
      freeMaskVec(i) := true.B
    }
  }

  // Deallocate the entry allocated by a load port last cycle if that load needs replay.
  val lastCanAccept = RegNext(VecInit(needEnqueue.zip(enqValidVec).map(x => x._1 && x._2)))
  val lastAllocIndex = RegNext(enqIndexVec)

  for ((release, w) <- io.query.map(_.release).zipWithIndex) {
    val releaseValid = release && lastCanAccept(w)
    val releaseIndex = lastAllocIndex(w)

    when (allocated(releaseIndex) && releaseValid) {
      allocated(releaseIndex) := false.B
      freeMaskVec(releaseIndex) := true.B
    }
  }
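  // In the release loop above, io.query(w).release refers to the allocation made by the
  // same load port in the previous cycle (lastCanAccept / lastAllocIndex), which is why
  // the index comes from RegNext(enqIndexVec) rather than from the release request itself.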
  freeList.io.free := freeMaskVec.asUInt

  io.lqFull := freeList.io.empty

  /**
    * Store-Load Memory violation detection
    * Scheme 1 (current scheme): flush the pipeline and re-fetch from the load instruction (like the old load queue).
    * Scheme 2                 : re-fetch instructions starting from the first instruction after the store instruction.
    *
    * When a store writes back, it searches the LoadQueue for younger load instructions
    * with the same physical address. Those loads read wrong data and need re-execution.
    *
    * Cycle 0: Store Writeback
    *   Generate the match vector for the store address by CAM-matching the paddr/mask modules.
    * Cycle 1: Select the oldest load from each select group.
    * Cycle x: Redirect Fire
    *   Choose the oldest load among the per-store-port candidates.
    *   Prepare the redirect request according to the detected violation.
    *   Fire the redirect request (if valid).
    */
  //              SelectGroup 0         SelectGroup 1          SelectGroup y
  // stage 0:       lq  lq  lq  ......    lq  lq  lq  .......    lq  lq  lq
  //                |   |   |             |   |   |              |   |   |
  // stage 1:       lq  lq  lq  ......    lq  lq  lq  .......    lq  lq  lq
  //                 \  |  /    ......     \  |  /    .......     \  |  /
  // stage 2:           lq                    lq                     lq
  //                     \  |  /  .......  \  |  /   ........  \  |  /
  // stage 3:               lq                lq                  lq
  //                                          ...
  //                                          ...
  //                                           |
  // stage x:                                  lq
  //                                           |
  //                                       rollback req

  // select logic
  val SelectGroupSize = RollbackGroupSize
  val lgSelectGroupSize = log2Ceil(SelectGroupSize)
  val TotalSelectCycles = scala.math.ceil(log2Ceil(LoadQueueRAWSize).toFloat / lgSelectGroupSize).toInt + 1
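  // Worked example with assumed parameters: if LoadQueueRAWSize were 64 and
  // RollbackGroupSize were 8, then lgSelectGroupSize = 3 and
  // TotalSelectCycles = ceil(log2(64) / 3) + 1 = ceil(6 / 3) + 1 = 3.
  // The actual values come from the XiangShan configuration.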

  def selectPartialOldest[T <: XSBundleWithMicroOp](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    if (valid.length == 0 || valid.length == 1) {
      (valid, bits)
    } else if (valid.length == 2) {
      val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0)))))
      for (i <- res.indices) {
        res(i).valid := valid(i)
        res(i).bits := bits(i)
      }
      val oldest = Mux(valid(0) && valid(1), Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx), res(1), res(0)), Mux(valid(0) && !valid(1), res(0), res(1)))
      (Seq(oldest.valid), Seq(oldest.bits))
    } else {
      val left = selectPartialOldest(valid.take(valid.length / 2), bits.take(bits.length / 2))
      val right = selectPartialOldest(valid.takeRight(valid.length - (valid.length / 2)), bits.takeRight(bits.length - (bits.length / 2)))
      selectPartialOldest(left._1 ++ right._1, left._2 ++ right._2)
    }
  }
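  // selectPartialOldest builds a purely combinational tournament tree: the winner is the
  // valid entry with the oldest (smallest) robIdx. Usage sketch with hypothetical names:
  //   val (selValid, selBits) = selectPartialOldest(Seq(v0, v1, v2), Seq(b0, b1, b2))
  //   // selValid(0) is true if any input is valid; selBits(0) is the oldest such entry.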

  def selectOldest[T <: XSBundleWithMicroOp](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    val numSelectGroups = scala.math.ceil(valid.length.toFloat / SelectGroupSize).toInt

    // group info
    val selectValidGroups =
      if (valid.length <= SelectGroupSize) {
        Seq(valid)
      } else {
        (0 until numSelectGroups).map(g => {
          if (valid.length < (g + 1) * SelectGroupSize) {
            valid.takeRight(valid.length - g * SelectGroupSize)
          } else {
            (0 until SelectGroupSize).map(j => valid(g * SelectGroupSize + j))
          }
        })
      }
    val selectBitsGroups =
      if (bits.length <= SelectGroupSize) {
        Seq(bits)
      } else {
        (0 until numSelectGroups).map(g => {
          if (bits.length < (g + 1) * SelectGroupSize) {
            bits.takeRight(bits.length - g * SelectGroupSize)
          } else {
            (0 until SelectGroupSize).map(j => bits(g * SelectGroupSize + j))
          }
        })
      }

    // select logic
    if (valid.length <= SelectGroupSize) {
      val (selValid, selBits) = selectPartialOldest(valid, bits)
      (Seq(RegNext(selValid(0) && !selBits(0).uop.robIdx.needFlush(io.redirect))), Seq(RegNext(selBits(0))))
    } else {
      val select = (0 until numSelectGroups).map(g => {
        val (selValid, selBits) = selectPartialOldest(selectValidGroups(g), selectBitsGroups(g))
        (RegNext(selValid(0) && !selBits(0).uop.robIdx.needFlush(io.redirect)), RegNext(selBits(0)))
      })
      selectOldest(select.map(_._1), select.map(_._2))
    }
  }
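  // selectOldest splits the candidates into groups of SelectGroupSize, picks each group's
  // oldest entry combinationally (selectPartialOldest), registers the per-group winners
  // (one pipeline stage per tree level, dropping entries flushed by the redirect), and
  // recurses until a single winner remains. The extra +1 in TotalSelectCycles appears to
  // account for the registered CAM-match stage in detectRollback below.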

  def detectRollback(i: Int) = {
    paddrModule.io.violationMdata(i) := io.storeIn(i).bits.paddr
    maskModule.io.violationMdata(i) := io.storeIn(i).bits.mask

    val bypassPaddrMask = RegNext(VecInit((0 until LoadPipelineWidth).map(j => bypassPAddr(j)(PAddrBits-1, 3) === io.storeIn(i).bits.paddr(PAddrBits-1, 3))))
    val bypassMMask = RegNext(VecInit((0 until LoadPipelineWidth).map(j => (bypassMask(j) & io.storeIn(i).bits.mask).orR)))
    val bypassMaskUInt = (0 until LoadPipelineWidth).map(j =>
      Fill(LoadQueueRAWSize, RegNext(RegNext(io.query(j).req.fire))) & Mux(bypassPaddrMask(j) && bypassMMask(j), UIntToOH(RegNext(RegNext(enqIndexVec(j)))), 0.U(LoadQueueRAWSize.W))
    ).reduce(_|_)

    val addrMaskMatch = RegNext(paddrModule.io.violationMmask(i).asUInt & maskModule.io.violationMmask(i).asUInt) | bypassMaskUInt
    val entryNeedCheck = RegNext(VecInit((0 until LoadQueueRAWSize).map(j => {
      allocated(j) && isAfter(uop(j).robIdx, io.storeIn(i).bits.uop.robIdx) && datavalid(j) && !uop(j).robIdx.needFlush(io.redirect)
    })))
    val lqViolationSelVec = VecInit((0 until LoadQueueRAWSize).map(j => {
      addrMaskMatch(j) && entryNeedCheck(j)
    }))

    val lqViolationSelUopExts = uop.map(uop => {
      val wrapper = Wire(new XSBundleWithMicroOp)
      wrapper.uop := uop
      wrapper
    })

    // select logic
    val lqSelect = selectOldest(lqViolationSelVec, lqViolationSelUopExts)

    // select one inst
    val lqViolation = lqSelect._1(0)
    val lqViolationUop = lqSelect._2(0).uop

    XSDebug(
      lqViolation,
      "need rollback (ld wb before store) pc %x robidx %d target %x\n",
      io.storeIn(i).bits.uop.cf.pc, io.storeIn(i).bits.uop.robIdx.asUInt, lqViolationUop.robIdx.asUInt
    )

    (lqViolation, lqViolationUop)
  }

  // select rollback (part 1) and generate the rollback request
  // rollback check
  // The Lq rollback check across store ports is completed in the next stage (s3), as getting the rollback MicroOp is slow
  val rollbackLqWb = Wire(Vec(StorePipelineWidth, Valid(new MicroOpRbExt)))
  val stFtqIdx = Wire(Vec(StorePipelineWidth, new FtqPtr))
  val stFtqOffset = Wire(Vec(StorePipelineWidth, UInt(log2Up(PredictWidth).W)))
  for (w <- 0 until StorePipelineWidth) {
    val detectedRollback = detectRollback(w)
    rollbackLqWb(w).valid := detectedRollback._1 && DelayN(io.storeIn(w).valid && !io.storeIn(w).bits.miss, TotalSelectCycles)
    rollbackLqWb(w).bits.uop := detectedRollback._2
    rollbackLqWb(w).bits.flag := w.U
    stFtqIdx(w) := DelayN(io.storeIn(w).bits.uop.cf.ftqPtr, TotalSelectCycles)
    stFtqOffset(w) := DelayN(io.storeIn(w).bits.uop.cf.ftqOffset, TotalSelectCycles)
  }
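  // In the loop above, detectRollback performs the CAM match in the cycle the store
  // address arrives and then walks the registered select tree. DelayN(..., TotalSelectCycles)
  // lines the store's valid/miss and FTQ information up with the moment the tree delivers
  // its winner, so the per-port candidate in rollbackLqWb is coherent when consumed below.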

  val rollbackLqWbValid = rollbackLqWb.map(x => x.valid && !x.bits.uop.robIdx.needFlush(io.redirect))
  val rollbackLqWbBits = rollbackLqWb.map(x => x.bits)

  // select rollback (part 2): generate the rollback request, then fire it
  // Note that we use robIdx - 1.U to flush the load instruction itself.
  // Thus, even if last cycle's robIdx equals this cycle's robIdx, the redirect is still triggered.

  // select uop in parallel
  val lqs = selectPartialOldest(rollbackLqWbValid, rollbackLqWbBits)
  val rollbackUopExt = lqs._2(0)
  val rollbackUop = rollbackUopExt.uop
  val rollbackStFtqIdx = stFtqIdx(rollbackUopExt.flag)
  val rollbackStFtqOffset = stFtqOffset(rollbackUopExt.flag)

  // check if rollback request is still valid in parallel
  io.rollback.bits.robIdx := rollbackUop.robIdx
  io.rollback.bits.ftqIdx := rollbackUop.cf.ftqPtr
  io.rollback.bits.stFtqIdx := rollbackStFtqIdx
  io.rollback.bits.ftqOffset := rollbackUop.cf.ftqOffset
  io.rollback.bits.stFtqOffset := rollbackStFtqOffset
  io.rollback.bits.level := RedirectLevel.flush
  io.rollback.bits.interrupt := DontCare
  io.rollback.bits.cfiUpdate := DontCare
  io.rollback.bits.cfiUpdate.target := rollbackUop.cf.pc
  io.rollback.bits.debug_runahead_checkpoint_id := rollbackUop.debugInfo.runahead_checkpoint_id
  // io.rollback.bits.pc := DontCare

  io.rollback.valid := VecInit(rollbackLqWbValid).asUInt.orR
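  // io.rollback fires whenever at least one store port found a surviving violation this
  // cycle; selectPartialOldest above picks the oldest offending load across the
  // StorePipelineWidth candidates so only one redirect is issued.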

  // perf cnt
  val canEnqCount = PopCount(io.query.map(_.req.fire))
  val validCount = freeList.io.validCount
  val allowEnqueue = validCount <= (LoadQueueRAWSize - LoadPipelineWidth).U

  QueuePerf(LoadQueueRAWSize, validCount, !allowEnqueue)
  XSPerfAccumulate("enqs", canEnqCount)
  XSPerfAccumulate("stld_rollback", io.rollback.valid)
  val perfEvents: Seq[(String, UInt)] = Seq(
    ("enq ", canEnqCount),
    ("stld_rollback", io.rollback.valid),
  )
  generatePerfEvent()
  // end
}