// xref: /XiangShan/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala (revision 1bc48dd1fa0af361fd194c65bad3b86349ec2903)
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.fu.fpu.FPU
import xiangshan.backend.rob.RobLsqIO
import xiangshan.cache._
import xiangshan.frontend.FtqPtr
import xiangshan.ExceptionNO._
import xiangshan.cache.wpu.ReplayCarry
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles.{MemExuOutput, DynInst}

class LoadMisalignBuffer(implicit p: Parameters) extends XSModule
  with HasCircularQueuePtrHelper
  with HasLoadHelper
{
  private val enqPortNum = LoadPipelineWidth
  private val maxSplitNum = 2

  require(maxSplitNum == 2)
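  // A misaligned scalar load needs at most two aligned accesses: if it stays inside one
  // 16-byte region it is reissued as a single aligned 128-bit access, otherwise it is
  // split into a low-address part and a high-address part whose results are merged later.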

  private val LB = "b00".U(2.W)
  private val LH = "b01".U(2.W)
  private val LW = "b10".U(2.W)
  private val LD = "b11".U(2.W)

  // encoding of how many bytes to shift or truncate
  private val BYTE0 = "b000".U(3.W)
  private val BYTE1 = "b001".U(3.W)
  private val BYTE2 = "b010".U(3.W)
  private val BYTE3 = "b011".U(3.W)
  private val BYTE4 = "b100".U(3.W)
  private val BYTE5 = "b101".U(3.W)
  private val BYTE6 = "b110".U(3.W)
  private val BYTE7 = "b111".U(3.W)

  def getMask(sizeEncode: UInt) = LookupTree(sizeEncode, List(
    LB -> 0x1.U, // lb
    LH -> 0x3.U, // lh
    LW -> 0xf.U, // lw
    LD -> 0xff.U  // ld
  ))

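  // Shift `data` right by `shiftEncode` bytes, then keep only the low `truncateEncode`
  // bytes. Used to extract the bytes each split access contributes to the final result.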
  def getShiftAndTruncateData(shiftEncode: UInt, truncateEncode: UInt, data: UInt) = {
    val shiftData = LookupTree(shiftEncode, List(
      BYTE0 -> data(63,    0),
      BYTE1 -> data(63,    8),
      BYTE2 -> data(63,   16),
      BYTE3 -> data(63,   24),
      BYTE4 -> data(63,   32),
      BYTE5 -> data(63,   40),
      BYTE6 -> data(63,   48),
      BYTE7 -> data(63,   56)
    ))
    val truncateData = LookupTree(truncateEncode, List(
      BYTE0 -> 0.U(XLEN.W), // cannot truncate to a width of 0 bytes
      BYTE1 -> shiftData(7,    0),
      BYTE2 -> shiftData(15,   0),
      BYTE3 -> shiftData(23,   0),
      BYTE4 -> shiftData(31,   0),
      BYTE5 -> shiftData(39,   0),
      BYTE6 -> shiftData(47,   0),
      BYTE7 -> shiftData(55,   0)
    ))
    truncateData(XLEN - 1, 0)
  }

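  // Recursively select the oldest valid request among the candidates, ordered by robIdx
  // first and, for uops with the same robIdx, by uopIdx.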
  def selectOldest[T <: LqWriteBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    if (valid.length == 0 || valid.length == 1) {
      (valid, bits)
    } else if (valid.length == 2) {
      val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0)))))
      for (i <- res.indices) {
        res(i).valid := valid(i)
        res(i).bits := bits(i)
      }
      val oldest = Mux(valid(0) && valid(1),
        Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx) ||
          (isNotBefore(bits(0).uop.robIdx, bits(1).uop.robIdx) && bits(0).uop.uopIdx > bits(1).uop.uopIdx), res(1), res(0)),
        Mux(valid(0) && !valid(1), res(0), res(1)))
      (Seq(oldest.valid), Seq(oldest.bits))
    } else {
      val left = selectOldest(valid.take(valid.length / 2), bits.take(bits.length / 2))
      val right = selectOldest(valid.takeRight(valid.length - (valid.length / 2)), bits.takeRight(bits.length - (bits.length / 2)))
      selectOldest(left._1 ++ right._1, left._2 ++ right._2)
    }
  }

  val io = IO(new Bundle() {
    val redirect        = Flipped(Valid(new Redirect))
    val req             = Vec(enqPortNum, Flipped(Valid(new LqWriteBundle)))
    val rob             = Flipped(new RobLsqIO)
    val splitLoadReq    = Decoupled(new LsPipelineBundle)
    val splitLoadResp   = Flipped(Valid(new LqWriteBundle))
    val writeBack       = Decoupled(new MemExuOutput)
    val overwriteExpBuf = Output(new XSBundle {
      val valid  = Bool()
      val vaddr  = UInt(XLEN.W)
      val isHyper = Bool()
      val gpaddr = UInt(XLEN.W)
      val isForVSnonLeafPTE = Bool()
    })
    val flushLdExpBuff  = Output(Bool())
  })

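  // the misalign buffer does not report uop or mmio information through RobLsqIO,
  // so simply drive these fields with zeros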
  io.rob.mmio := 0.U.asTypeOf(Vec(LoadPipelineWidth, Bool()))
  io.rob.uop  := 0.U.asTypeOf(Vec(LoadPipelineWidth, new DynInst))

  val req_valid = RegInit(false.B)
  val req = Reg(new LqWriteBundle)

  // enqueue
  // s1:
  val s1_req = VecInit(io.req.map(_.bits))
  val s1_valid = VecInit(io.req.map(x => x.valid))

  // s2: delay 1 cycle
  val s2_req = RegNext(s1_req)
  val s2_valid = (0 until enqPortNum).map(i =>
    RegNext(s1_valid(i)) &&
    !s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) &&
    !s2_req(i).uop.robIdx.needFlush(io.redirect)
  )
  val s2_miss_aligned = s2_req.map(x =>
    x.uop.exceptionVec(loadAddrMisaligned) && !x.uop.exceptionVec(breakPoint) && !TriggerAction.isDmode(x.uop.trigger)
  )

  val s2_enqueue = Wire(Vec(enqPortNum, Bool()))
  for (w <- 0 until enqPortNum) {
    s2_enqueue(w) := s2_valid(w) && s2_miss_aligned(w)
  }

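  // The buffer holds at most one misaligned load. When the current request is flushed,
  // req_valid only stays high if a new misaligned load enqueues in the same cycle;
  // otherwise any enqueue sets it.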
  when (req_valid && req.uop.robIdx.needFlush(io.redirect)) {
    req_valid := s2_enqueue.asUInt.orR
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req_valid := req_valid || true.B
  }

  val reqSel = selectOldest(s2_enqueue, s2_req)

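  // If the buffer already holds a request, replace it only with an incoming request that
  // is older in program order; otherwise accept the selected oldest enqueuing request.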
  when (req_valid) {
    req := Mux(
      reqSel._1(0) && (isAfter(req.uop.robIdx, reqSel._2(0).uop.robIdx) || (isNotBefore(req.uop.robIdx, reqSel._2(0).uop.robIdx) && req.uop.uopIdx > reqSel._2(0).uop.uopIdx)),
      reqSel._2(0),
      req)
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req := reqSel._2(0)
  }

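  // Only start handling the misaligned load once the ROB reports it as the pending load
  // at its dequeue pointer, so the split accesses are issued non-speculatively.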
  val robMatch = req_valid && io.rob.pendingld && (io.rob.pendingPtr === req.uop.robIdx)

  // buffer control:
  //  - split the misaligned load into aligned loads
  //  - send the split loads to the ldu and collect their results
  //  - merge the results and write back to the rob
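  //
  // state machine:
  //  - s_idle:  wait until the misaligned load reaches the head of the ROB (robMatch)
  //  - s_split: compute the split (or single 128-bit) requests
  //  - s_req:   issue one split request to the load unit
  //  - s_resp:  wait for the corresponding response
  //  - s_comb:  combine the split results into the final load data
  //  - s_wb:    write the merged result (or the recorded exception) back
  //  - s_wait:  wait for the ROB to commit or flush this load, then reset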
  val s_idle :: s_split :: s_req :: s_resp :: s_comb :: s_wb :: s_wait :: Nil = Enum(7)
  val bufferState = RegInit(s_idle)
  val splitLoadReqs = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LsPipelineBundle))))
  val splitLoadResp = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LqWriteBundle))))
  val unSentLoads = RegInit(0.U(maxSplitNum.W))
  val curPtr = RegInit(0.U(log2Ceil(maxSplitNum).W))

  // whether any split load raised an exception or accessed mmio space
  val globalException = RegInit(false.B)
  val globalMMIO = RegInit(false.B)

  val hasException = ExceptionNO.selectByFu(io.splitLoadResp.bits.uop.exceptionVec, LduCfg).asUInt.orR
  val isMMIO = io.splitLoadResp.bits.mmio

  switch(bufferState) {
    is (s_idle) {
      when (robMatch) {
        bufferState := s_split
      }
    }

    is (s_split) {
      bufferState := s_req
    }

    is (s_req) {
      when (io.splitLoadReq.fire) {
        bufferState := s_resp
      }
    }

    is (s_resp) {
      when (io.splitLoadResp.valid) {
        val clearOh = UIntToOH(curPtr)
        when (hasException || isMMIO) {
          // commit directly when an exception occurs
          // if any split load reaches mmio space, delegate it to software as a loadAddrMisaligned exception
          bufferState := s_wb
          globalException := hasException
          globalMMIO := isMMIO
        } .elsewhen(io.splitLoadResp.bits.rep_info.need_rep || (unSentLoads & ~clearOh).orR) {
          // need to replay, or there are still unsent requests
          bufferState := s_req
        } .otherwise {
          // merge the split load results
          bufferState := s_comb
        }
      }
    }

    is (s_comb) {
      bufferState := s_wb
    }

    is (s_wb) {
      when(io.writeBack.fire) {
        bufferState := s_wait
      }
    }

    is (s_wait) {
      when(io.rob.lcommit =/= 0.U || req.uop.robIdx.needFlush(io.redirect)) {
        // the rob has committed the unaligned load or handled the exception; reset all state
        bufferState := s_idle
        req_valid := false.B
        curPtr := 0.U
        unSentLoads := 0.U
        globalException := false.B
        globalMMIO := false.B
      }
    }
  }

  val highAddress = LookupTree(req.uop.fuOpType(1, 0), List(
    LB -> 0.U,
    LH -> 1.U,
    LW -> 3.U,
    LD -> 7.U
  )) + req.vaddr(4, 0)
  // check whether (vaddr + opSize - 1) and vaddr fall in the same 16-byte region
  val cross16BytesBoundary = req_valid && (highAddress(4) =/= req.vaddr(4))
  val aligned16BytesAddr   = (req.vaddr >> 4) << 4 // req.vaddr & ~("b1111".U)
  val aligned16BytesSel    = req.vaddr(3, 0)
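  // When the access stays inside one 16-byte region, it is reissued as a single aligned
  // 128-bit load of that region; aligned16BytesSel later selects the addressed bytes.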

  // meta of the 128-bit load
  val new128Load = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  // meta of the split loads
  val lowAddrLoad  = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val highAddrLoad = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val lowResultShift = RegInit(0.U(3.W)) // how many bytes the returned data should be shifted right
  val lowResultWidth = RegInit(0.U(3.W)) // how many bytes to take from the shifted result
  val highResultShift = RegInit(0.U(3.W))
  val highResultWidth = RegInit(0.U(3.W))

  when (bufferState === s_split) {
    when (!cross16BytesBoundary) {
      // turn this unaligned load into a single 128-bit aligned load
      unSentLoads := 1.U
      curPtr := 0.U
      new128Load.vaddr := aligned16BytesAddr
      new128Load.fullva := req.fullva
      // new128Load.mask  := (getMask(req.uop.fuOpType(1, 0)) << aligned16BytesSel).asUInt
      new128Load.mask  := 0xffff.U
      new128Load.uop   := req.uop
      new128Load.uop.exceptionVec(loadAddrMisaligned) := false.B
      new128Load.is128bit := true.B
      splitLoadReqs(0) := new128Load
    } .otherwise {
      // split this unaligned load into `maxSplitNum` aligned loads
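      // Each split access is itself naturally aligned. For example, an 8-byte ld with
      // vaddr(2, 0) = b101 becomes a lw at vaddr - 1 (result shifted right by 1 byte and
      // truncated to 3 bytes) plus an ld at vaddr + 3 (truncated to 5 bytes), and the
      // 3 + 5 bytes are recombined into the full doubleword.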
      unSentLoads := Fill(maxSplitNum, 1.U(1.W))
      curPtr := 0.U
      lowAddrLoad.uop := req.uop
      lowAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      lowAddrLoad.fullva := req.fullva
      highAddrLoad.uop := req.uop
      highAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      highAddrLoad.fullva := req.fullva

      switch (req.uop.fuOpType(1, 0)) {
        is (LB) {
          assert(false.B, "lb should not trigger misalignment")
        }

        is (LH) {
          lowAddrLoad.uop.fuOpType := LB
          lowAddrLoad.vaddr := req.vaddr
          lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
          lowResultShift    := BYTE0
          lowResultWidth    := BYTE1

          highAddrLoad.uop.fuOpType := LB
          highAddrLoad.vaddr := req.vaddr + 1.U
          highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
          highResultShift    := BYTE0
          highResultWidth    := BYTE1
        }

        is (LW) {
          switch (req.vaddr(1, 0)) {
            is ("b00".U) {
              assert(false.B, "should not trigger misalignment")
            }

            is ("b01".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b10".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b11".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }
          }
        }

        is (LD) {
          switch (req.vaddr(2, 0)) {
            is ("b000".U) {
              assert(false.B, "should not trigger misalignment")
            }

            is ("b001".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE7

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 7.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b010".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 2.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE2
              lowResultWidth    := BYTE6

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 6.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b011".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 3.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE3
              lowResultWidth    := BYTE5

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 5.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }

            is ("b100".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE4

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 4.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE4
            }

            is ("b101".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE5
            }

            is ("b110".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE6
            }

            is ("b111".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE7
            }
          }
        }
      }

      splitLoadReqs(0) := lowAddrLoad
      splitLoadReqs(1) := highAddrLoad
    }
  }

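  // Split requests are issued one at a time, indexed by curPtr. unSentLoads keeps one bit
  // per split and clears it once that split returns a response that needs no replay.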
  io.splitLoadReq.valid := req_valid && (bufferState === s_req)
  io.splitLoadReq.bits  := splitLoadReqs(curPtr)

  when (io.splitLoadResp.valid) {
    splitLoadResp(curPtr) := io.splitLoadResp.bits
    when (isMMIO) {
      unSentLoads := 0.U
      splitLoadResp(curPtr).uop.exceptionVec := 0.U.asTypeOf(ExceptionVec())
      // delegate to software
      splitLoadResp(curPtr).uop.exceptionVec(loadAddrMisaligned) := true.B
    } .elsewhen (hasException) {
      unSentLoads := 0.U
    } .elsewhen (!io.splitLoadResp.bits.rep_info.need_rep) {
      unSentLoads := unSentLoads & ~UIntToOH(curPtr)
      curPtr := curPtr + 1.U
    }
  }

  val combinedData = RegInit(0.U(XLEN.W))

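  // Combine the responses: in the non-crossing case, select the addressed bytes out of the
  // 128-bit data with aligned16BytesSel and truncate to the access size; in the crossing
  // case, concatenate lowResultWidth bytes from the low split with the remaining bytes
  // from the high split. rdataHelper then applies the final sign/zero extension.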
  when (bufferState === s_comb) {
    when (!cross16BytesBoundary) {
      val shiftData = LookupTree(aligned16BytesSel, List(
        "b0000".U -> splitLoadResp(0).data(63,     0),
        "b0001".U -> splitLoadResp(0).data(71,     8),
        "b0010".U -> splitLoadResp(0).data(79,    16),
        "b0011".U -> splitLoadResp(0).data(87,    24),
        "b0100".U -> splitLoadResp(0).data(95,    32),
        "b0101".U -> splitLoadResp(0).data(103,   40),
        "b0110".U -> splitLoadResp(0).data(111,   48),
        "b0111".U -> splitLoadResp(0).data(119,   56),
        "b1000".U -> splitLoadResp(0).data(127,   64),
        "b1001".U -> splitLoadResp(0).data(127,   72),
        "b1010".U -> splitLoadResp(0).data(127,   80),
        "b1011".U -> splitLoadResp(0).data(127,   88),
        "b1100".U -> splitLoadResp(0).data(127,   96),
        "b1101".U -> splitLoadResp(0).data(127,  104),
        "b1110".U -> splitLoadResp(0).data(127,  112),
        "b1111".U -> splitLoadResp(0).data(127,  120)
      ))
      val truncateData = LookupTree(req.uop.fuOpType(1, 0), List(
        LB -> shiftData(7,  0), // lb
        LH -> shiftData(15, 0), // lh
        LW -> shiftData(31, 0), // lw
        LD -> shiftData(63, 0)  // ld
      ))
      combinedData := rdataHelper(req.uop, truncateData(XLEN - 1, 0))
    } .otherwise {
      val lowAddrResult = getShiftAndTruncateData(lowResultShift, lowResultWidth, splitLoadResp(0).data)
                            .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
      val highAddrResult = getShiftAndTruncateData(highResultShift, highResultWidth, splitLoadResp(1).data)
                            .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
      val catResult = Wire(Vec(XLEN / 8, UInt(8.W)))
      (0 until XLEN / 8) .map {
        case i => {
          when (i.U < lowResultWidth) {
            catResult(i) := lowAddrResult(i)
          } .otherwise {
            catResult(i) := highAddrResult(i.U - lowResultWidth)
          }
        }
      }
      combinedData := rdataHelper(req.uop, (catResult.asUInt)(XLEN - 1, 0))
    }
  }

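  // Write back the merged result. In the normal case flushPipe is set, so the pipeline is
  // flushed and refetched after this load commits; on MMIO or exception the exception
  // vector recorded from the split response is reported instead.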
  io.writeBack.valid := req_valid && (bufferState === s_wb)
  io.writeBack.bits.uop := req.uop
  io.writeBack.bits.uop.exceptionVec := Mux(
    globalMMIO || globalException,
    splitLoadResp(curPtr).uop.exceptionVec,
    0.U.asTypeOf(ExceptionVec()) // TODO: is this ok?
  )
  io.writeBack.bits.uop.flushPipe := Mux(globalMMIO || globalException, false.B, true.B)
  io.writeBack.bits.uop.replayInst := false.B
  io.writeBack.bits.data := combinedData
  io.writeBack.bits.debug.isMMIO := globalMMIO
  io.writeBack.bits.debug.isPerfCnt := false.B
  io.writeBack.bits.debug.paddr := req.paddr
  io.writeBack.bits.debug.vaddr := req.vaddr

  val flush = req_valid && req.uop.robIdx.needFlush(io.redirect)

  when (flush && (bufferState =/= s_idle)) {
    bufferState := s_idle
    req_valid := false.B
    curPtr := 0.U
    unSentLoads := 0.U
    globalException := false.B
    globalMMIO := false.B
  }

  // NOTE: special case (an unaligned load crosses a page boundary and the page fault happens in the next page)
  // if the exception happens in the higher-address page, overwrite the loadExceptionBuffer vaddr
  val overwriteExpBuf = GatedValidRegNext(req_valid && globalException)
  val overwriteVaddr = GatedRegNext(Mux(
    cross16BytesBoundary && (curPtr === 1.U),
    splitLoadResp(curPtr).vaddr,
    splitLoadResp(curPtr).fullva))
  val overwriteIsHyper = GatedRegNext(splitLoadResp(curPtr).isHyper)
  val overwriteGpaddr = GatedRegNext(splitLoadResp(curPtr).gpaddr)
  val overwriteIsForVSnonLeafPTE = GatedRegNext(splitLoadResp(curPtr).isForVSnonLeafPTE)

  io.overwriteExpBuf.valid := overwriteExpBuf
  io.overwriteExpBuf.vaddr := overwriteVaddr
  io.overwriteExpBuf.isHyper := overwriteIsHyper
  io.overwriteExpBuf.gpaddr := overwriteGpaddr
  io.overwriteExpBuf.isForVSnonLeafPTE := overwriteIsForVSnonLeafPTE

  // when there is no exception and no mmio access, flush the loadExceptionBuffer at s_wb
  val flushLdExpBuff = GatedValidRegNext(req_valid && (bufferState === s_wb) && !(globalMMIO || globalException))
  io.flushLdExpBuff := flushLdExpBuff

  XSPerfAccumulate("alloc",                  RegNext(!req_valid) && req_valid)
  XSPerfAccumulate("flush",                  flush)
  XSPerfAccumulate("flush_idle",             flush && (bufferState === s_idle))
  XSPerfAccumulate("flush_non_idle",         flush && (bufferState =/= s_idle))
}