// XiangShan/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala (revision 282dd18c7cd62babdfe80e8ee56edd19c842e80d)
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.fu.fpu.FPU
import xiangshan.backend.rob.RobLsqIO
import xiangshan.cache.mmu.HasTlbConst
import xiangshan.cache._
import xiangshan.frontend.FtqPtr
import xiangshan.ExceptionNO._
import xiangshan.cache.wpu.ReplayCarry
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles.{MemExuOutput, DynInst}

class LoadMisalignBuffer(implicit p: Parameters) extends XSModule
  with HasCircularQueuePtrHelper
  with HasLoadHelper
  with HasTlbConst
{
  private val enqPortNum = LoadPipelineWidth
  private val maxSplitNum = 2

  require(maxSplitNum == 2)
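
  // A scalar load is at most 8 bytes wide, so a misaligned access crosses at
  // most one 16-byte boundary; two aligned split accesses always suffice.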

  private val LB = "b00".U(2.W)
  private val LH = "b01".U(2.W)
  private val LW = "b10".U(2.W)
  private val LD = "b11".U(2.W)

  // encoding of how many bytes to shift or truncate
  private val BYTE0 = "b000".U(3.W)
  private val BYTE1 = "b001".U(3.W)
  private val BYTE2 = "b010".U(3.W)
  private val BYTE3 = "b011".U(3.W)
  private val BYTE4 = "b100".U(3.W)
  private val BYTE5 = "b101".U(3.W)
  private val BYTE6 = "b110".U(3.W)
  private val BYTE7 = "b111".U(3.W)

  def getMask(sizeEncode: UInt) = LookupTree(sizeEncode, List(
    LB -> 0x1.U, // lb
    LH -> 0x3.U, // lh
    LW -> 0xf.U, // lw
    LD -> 0xff.U  // ld
  ))

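  // getShiftAndTruncateData first drops `shiftEncode` low-order bytes from the
  // 64-bit input, then keeps the low `truncateEncode` bytes of what remains.
  // For example, shiftEncode = BYTE1 with truncateEncode = BYTE3 yields
  // data(31, 8): byte 0 is dropped and bytes 1..3 are kept.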
  def getShiftAndTruncateData(shiftEncode: UInt, truncateEncode: UInt, data: UInt) = {
    val shiftData = LookupTree(shiftEncode, List(
      BYTE0 -> data(63,    0),
      BYTE1 -> data(63,    8),
      BYTE2 -> data(63,   16),
      BYTE3 -> data(63,   24),
      BYTE4 -> data(63,   32),
      BYTE5 -> data(63,   40),
      BYTE6 -> data(63,   48),
      BYTE7 -> data(63,   56)
    ))
    val truncateData = LookupTree(truncateEncode, List(
      BYTE0 -> 0.U(XLEN.W), // cannot truncate to a zero-byte width
      BYTE1 -> shiftData(7,    0),
      BYTE2 -> shiftData(15,   0),
      BYTE3 -> shiftData(23,   0),
      BYTE4 -> shiftData(31,   0),
      BYTE5 -> shiftData(39,   0),
      BYTE6 -> shiftData(47,   0),
      BYTE7 -> shiftData(55,   0)
    ))
    truncateData(XLEN - 1, 0)
  }

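  // Tournament reduction: recursively halve the candidate list, keeping the
  // older request of each pair (ordered by robIdx, with uopIdx as the
  // tie-breaker within the same rob entry), until one candidate remains.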
  def selectOldest[T <: LqWriteBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    if (valid.length <= 1) {
      (valid, bits)
    } else if (valid.length == 2) {
      val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0)))))
      for (i <- res.indices) {
        res(i).valid := valid(i)
        res(i).bits := bits(i)
      }
      val oldest = Mux(valid(0) && valid(1),
        Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx) ||
          (isNotBefore(bits(0).uop.robIdx, bits(1).uop.robIdx) && bits(0).uop.uopIdx > bits(1).uop.uopIdx), res(1), res(0)),
        Mux(valid(0) && !valid(1), res(0), res(1)))
      (Seq(oldest.valid), Seq(oldest.bits))
    } else {
      val left = selectOldest(valid.take(valid.length / 2), bits.take(bits.length / 2))
      val right = selectOldest(valid.drop(valid.length / 2), bits.drop(bits.length / 2))
      selectOldest(left._1 ++ right._1, left._2 ++ right._2)
    }
  }

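  // IO summary:
  //  - req: misaligned loads enqueued from each load pipeline
  //  - splitLoadReq / splitLoadResp: issue split accesses to a load unit and
  //    collect their results
  //  - writeBack: merged result (or exception) sent back to the backend
  //  - overwriteExpBuf: corrected vaddr/gpaddr for the load exception buffer
  //    when the fault comes from a split access
  //  - flushLdExpBuff: clear the load exception buffer once the misaligned
  //    load writes back without exception or MMIO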
  val io = IO(new Bundle() {
    val redirect        = Flipped(Valid(new Redirect))
    val req             = Vec(enqPortNum, Flipped(Valid(new LqWriteBundle)))
    val rob             = Flipped(new RobLsqIO)
    val splitLoadReq    = Decoupled(new LsPipelineBundle)
    val splitLoadResp   = Flipped(Valid(new LqWriteBundle))
    val writeBack       = Decoupled(new MemExuOutput)
    val overwriteExpBuf = Output(new XSBundle {
      val valid  = Bool()
      val vaddr  = UInt(XLEN.W)
      val isHyper = Bool()
      val gpaddr = UInt(XLEN.W)
      val isForVSnonLeafPTE = Bool()
    })
    val flushLdExpBuff  = Output(Bool())
  })

  io.rob.mmio := 0.U.asTypeOf(Vec(LoadPipelineWidth, Bool()))
  io.rob.uop  := 0.U.asTypeOf(Vec(LoadPipelineWidth, new DynInst))

  val req_valid = RegInit(false.B)
  val req = Reg(new LqWriteBundle)

  // enqueue
  // s1:
  val s1_req = VecInit(io.req.map(_.bits))
  val s1_valid = VecInit(io.req.map(x => x.valid))

  // s2: delay 1 cycle
  val s2_req = RegNext(s1_req)
  val s2_valid = (0 until enqPortNum).map(i =>
    RegNext(s1_valid(i)) &&
    !s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) &&
    !s2_req(i).uop.robIdx.needFlush(io.redirect)
  )
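  // The redirect is checked against both the previous cycle's redirect (the
  // one that arrived while the request was in s1) and the current one, so a
  // request killed in either cycle never enqueues.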
  val s2_miss_aligned = s2_req.map(x =>
    x.uop.exceptionVec(loadAddrMisaligned) && !x.uop.exceptionVec(breakPoint) && !TriggerAction.isDmode(x.uop.trigger)
  )

  val s2_enqueue = Wire(Vec(enqPortNum, Bool()))
  for (w <- 0 until enqPortNum) {
    s2_enqueue(w) := s2_valid(w) && s2_miss_aligned(w)
  }

  when (req_valid && req.uop.robIdx.needFlush(io.redirect)) {
    req_valid := s2_enqueue.asUInt.orR
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req_valid := true.B
  }

  val reqSel = selectOldest(s2_enqueue, s2_req)

  when (req_valid) {
    req := Mux(
      reqSel._1(0) && (isAfter(req.uop.robIdx, reqSel._2(0).uop.robIdx) || (isNotBefore(req.uop.robIdx, reqSel._2(0).uop.robIdx) && req.uop.uopIdx > reqSel._2(0).uop.uopIdx)),
      reqSel._2(0),
      req)
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req := reqSel._2(0)
  }

  val robMatch = req_valid && io.rob.pendingld && (io.rob.pendingPtr === req.uop.robIdx)

  // buffer control:
  //  - split the misaligned load into aligned loads
  //  - send the split loads to the load unit and collect their results
  //  - merge the results and write back to the rob
  val s_idle :: s_split :: s_req :: s_resp :: s_comb :: s_wb :: s_wait :: Nil = Enum(7)
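  // s_idle:  wait until the rob's pending load matches the buffered request
  // s_split: compute the aligned split (or single 128-bit) requests
  // s_req:   issue one split request to the load unit
  // s_resp:  wait for the load unit's response
  // s_comb:  merge the split results into the final data
  // s_wb:    write the merged result (or the exception) back
  // s_wait:  wait for rob commit or a flush, then reset all state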
  val bufferState = RegInit(s_idle)
  val splitLoadReqs = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LsPipelineBundle))))
  val splitLoadResp = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LqWriteBundle))))
  val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec()))
  val unSentLoads = RegInit(0.U(maxSplitNum.W))
  val curPtr = RegInit(0.U(log2Ceil(maxSplitNum).W))

  // whether any split load raised an exception or touched MMIO space
  val globalException = RegInit(false.B)
  val globalMMIO = RegInit(false.B)

  val hasException = ExceptionNO.selectByFu(io.splitLoadResp.bits.uop.exceptionVec, LduCfg).asUInt.orR
  val isMMIO = io.splitLoadResp.bits.mmio

  switch(bufferState) {
    is (s_idle) {
      when (robMatch) {
        bufferState := s_split
      }
    }

    is (s_split) {
      bufferState := s_req
    }

    is (s_req) {
      when (io.splitLoadReq.fire) {
        bufferState := s_resp
      }
    }

    is (s_resp) {
      when (io.splitLoadResp.valid) {
        val clearOh = UIntToOH(curPtr)
        when (hasException || isMMIO) {
          // commit directly when an exception occurs
          // if any split load reaches mmio space, delegate to software as a loadAddrMisaligned exception
          bufferState := s_wb
          globalException := hasException
          globalMMIO := isMMIO
        } .elsewhen(io.splitLoadResp.bits.rep_info.need_rep || (unSentLoads & ~clearOh).orR) {
          // needs replay or still has unsent requests
          bufferState := s_req
        } .otherwise {
          // merge the split load results
          bufferState := s_comb
        }
      }
    }

    is (s_comb) {
      bufferState := s_wb
    }

    is (s_wb) {
      when(io.writeBack.fire) {
        bufferState := s_wait
      }
    }

    is (s_wait) {
      when(io.rob.lcommit =/= 0.U || req.uop.robIdx.needFlush(io.redirect)) {
        // the rob has committed the misaligned load or handled the exception; reset all state
        bufferState := s_idle
        req_valid := false.B
        curPtr := 0.U
        unSentLoads := 0.U
        globalException := false.B
        globalMMIO := false.B
      }
    }
  }

  val highAddress = LookupTree(req.uop.fuOpType(1, 0), List(
    LB -> 0.U,
    LH -> 1.U,
    LW -> 3.U,
    LD -> 7.U
  )) + req.vaddr(4, 0)
  // check whether (vaddr + opSize - 1) and vaddr fall in the same 16-byte region
  val cross16BytesBoundary = req_valid && (highAddress(4) =/= req.vaddr(4))
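  // e.g. an 8-byte ld with vaddr(4, 0) = 0x09 gives highAddress(4, 0) = 0x10;
  // bit 4 differs from vaddr(4), so the access crosses a 16-byte boundary.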
  val aligned16BytesAddr   = (req.vaddr >> 4) << 4 // req.vaddr & ~("b1111".U)
  val aligned16BytesSel    = req.vaddr(3, 0)

  // meta of the 128-bit load
  val new128Load = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  // meta of the split loads
  val lowAddrLoad  = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val highAddrLoad = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val lowResultShift = RegInit(0.U(3.W)) // how many bytes to shift right in the result
  val lowResultWidth = RegInit(0.U(3.W)) // how many bytes to take from the result
  val highResultShift = RegInit(0.U(3.W))
  val highResultWidth = RegInit(0.U(3.W))

  when (bufferState === s_split) {
    when (!cross16BytesBoundary) {
      // turn this misaligned load into a single 128-bit load
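      // The whole access lies within one aligned 16-byte region, so a single
      // 128-bit load fetches every needed byte; s_comb later shifts the data
      // right by aligned16BytesSel bytes and truncates it to the access size.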
      unSentLoads := 1.U
      curPtr := 0.U
      new128Load.vaddr := aligned16BytesAddr
      new128Load.fullva := req.fullva
      // new128Load.mask  := (getMask(req.uop.fuOpType(1, 0)) << aligned16BytesSel).asUInt
      new128Load.mask  := 0xffff.U
      new128Load.uop   := req.uop
      new128Load.uop.exceptionVec(loadAddrMisaligned) := false.B
      new128Load.is128bit := true.B
      splitLoadReqs(0) := new128Load
    } .otherwise {
      // split this misaligned load into `maxSplitNum` aligned loads
      unSentLoads := Fill(maxSplitNum, 1.U(1.W))
      curPtr := 0.U
      lowAddrLoad.uop := req.uop
      lowAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      lowAddrLoad.fullva := req.fullva
      highAddrLoad.uop := req.uop
      highAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      highAddrLoad.fullva := req.fullva

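      // Example: an 8-byte ld with vaddr(2, 0) = "b101" becomes a 4-byte lw at
      // vaddr - 1 (keeping bytes 1..3 of its result) plus an 8-byte ld at
      // vaddr + 3 (keeping bytes 0..4); together they recover all 8 bytes.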
      switch (req.uop.fuOpType(1, 0)) {
        is (LB) {
          assert(false.B, "lb should not trigger misalignment")
        }

        is (LH) {
          lowAddrLoad.uop.fuOpType := LB
          lowAddrLoad.vaddr := req.vaddr
          lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
          lowResultShift    := BYTE0
          lowResultWidth    := BYTE1

          highAddrLoad.uop.fuOpType := LB
          highAddrLoad.vaddr := req.vaddr + 1.U
          highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
          highResultShift    := BYTE0
          highResultWidth    := BYTE1
        }

        is (LW) {
          switch (req.vaddr(1, 0)) {
            is ("b00".U) {
              assert(false.B, "an aligned lw should not trigger misalignment")
            }

            is ("b01".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b10".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b11".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }
          }
        }

        is (LD) {
          switch (req.vaddr(2, 0)) {
            is ("b000".U) {
              assert(false.B, "an aligned ld should not trigger misalignment")
            }

            is ("b001".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE7

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 7.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b010".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 2.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE2
              lowResultWidth    := BYTE6

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 6.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b011".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 3.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE3
              lowResultWidth    := BYTE5

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 5.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }

            is ("b100".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE4

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 4.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE4
            }

            is ("b101".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE5
            }

            is ("b110".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE6
            }

            is ("b111".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE7
            }
          }
        }
      }

      splitLoadReqs(0) := lowAddrLoad
      splitLoadReqs(1) := highAddrLoad
    }
    exceptionVec := 0.U.asTypeOf(exceptionVec.cloneType)
  }

  io.splitLoadReq.valid := req_valid && (bufferState === s_req)
  io.splitLoadReq.bits  := splitLoadReqs(curPtr)
  // Restore the information of the H-extension (hypervisor) load
  // bit encoding: | hlv 1 | hlvx 1 | is unsigned(1bit) | size(2bit) |
  val reqIsHlv  = LSUOpType.isHlv(req.uop.fuOpType)
  val reqIsHlvx = LSUOpType.isHlvx(req.uop.fuOpType)
  io.splitLoadReq.bits.uop.fuOpType := Cat(reqIsHlv, reqIsHlvx, 0.U(1.W), splitLoadReqs(curPtr).uop.fuOpType(1, 0))

  when (io.splitLoadResp.valid) {
    val resp = io.splitLoadResp.bits
    splitLoadResp(curPtr) := io.splitLoadResp.bits
    when (isMMIO) {
      unSentLoads := 0.U
      exceptionVec := 0.U.asTypeOf(ExceptionVec())
      // delegate to software
      exceptionVec(loadAddrMisaligned) := true.B
    } .elsewhen (hasException) {
      unSentLoads := 0.U
      LduCfg.exceptionOut.foreach(no => exceptionVec(no) := exceptionVec(no) || resp.uop.exceptionVec(no))
    } .elsewhen (!io.splitLoadResp.bits.rep_info.need_rep) {
      unSentLoads := unSentLoads & ~UIntToOH(curPtr)
      curPtr := curPtr + 1.U
      exceptionVec := 0.U.asTypeOf(ExceptionVec())
    }
  }

  val combinedData = RegInit(0.U(XLEN.W))

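  // Reassemble the final value. In the non-crossing case the 128-bit result is
  // byte-shifted by the offset within the 16-byte region and truncated to the
  // access size; in the crossing case the low split supplies the first
  // lowResultWidth bytes and the high split supplies the remaining bytes.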
  when (bufferState === s_comb) {
    when (!cross16BytesBoundary) {
      val shiftData = LookupTree(aligned16BytesSel, List(
        "b0000".U -> splitLoadResp(0).data(63,     0),
        "b0001".U -> splitLoadResp(0).data(71,     8),
        "b0010".U -> splitLoadResp(0).data(79,    16),
        "b0011".U -> splitLoadResp(0).data(87,    24),
        "b0100".U -> splitLoadResp(0).data(95,    32),
        "b0101".U -> splitLoadResp(0).data(103,   40),
        "b0110".U -> splitLoadResp(0).data(111,   48),
        "b0111".U -> splitLoadResp(0).data(119,   56),
        "b1000".U -> splitLoadResp(0).data(127,   64),
        "b1001".U -> splitLoadResp(0).data(127,   72),
        "b1010".U -> splitLoadResp(0).data(127,   80),
        "b1011".U -> splitLoadResp(0).data(127,   88),
        "b1100".U -> splitLoadResp(0).data(127,   96),
        "b1101".U -> splitLoadResp(0).data(127,  104),
        "b1110".U -> splitLoadResp(0).data(127,  112),
        "b1111".U -> splitLoadResp(0).data(127,  120)
      ))
      val truncateData = LookupTree(req.uop.fuOpType(1, 0), List(
        LB -> shiftData(7,  0), // lb
        LH -> shiftData(15, 0), // lh
        LW -> shiftData(31, 0), // lw
        LD -> shiftData(63, 0)  // ld
      ))
      combinedData := rdataHelper(req.uop, truncateData(XLEN - 1, 0))
    } .otherwise {
      val lowAddrResult = getShiftAndTruncateData(lowResultShift, lowResultWidth, splitLoadResp(0).data)
                            .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
      val highAddrResult = getShiftAndTruncateData(highResultShift, highResultWidth, splitLoadResp(1).data)
                            .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
      val catResult = Wire(Vec(XLEN / 8, UInt(8.W)))
      (0 until XLEN / 8).foreach { i =>
        when (i.U < lowResultWidth) {
          catResult(i) := lowAddrResult(i)
        } .otherwise {
          catResult(i) := highAddrResult(i.U - lowResultWidth)
        }
      }
      combinedData := rdataHelper(req.uop, (catResult.asUInt)(XLEN - 1, 0))
    }
  }

  io.writeBack.valid := req_valid && (bufferState === s_wb)
  io.writeBack.bits.uop := req.uop
  io.writeBack.bits.uop.exceptionVec := DontCare
  LduCfg.exceptionOut.foreach(no => io.writeBack.bits.uop.exceptionVec(no) := (globalMMIO || globalException) && exceptionVec(no))
  io.writeBack.bits.uop.flushPipe := !(globalMMIO || globalException)
  io.writeBack.bits.uop.replayInst := false.B
  io.writeBack.bits.data := combinedData
  io.writeBack.bits.isFromLoadUnit := DontCare
  io.writeBack.bits.debug.isMMIO := globalMMIO
  io.writeBack.bits.debug.isPerfCnt := false.B
  io.writeBack.bits.debug.paddr := req.paddr
  io.writeBack.bits.debug.vaddr := req.vaddr

  val flush = req_valid && req.uop.robIdx.needFlush(io.redirect)

  when (flush && (bufferState =/= s_idle)) {
    bufferState := s_idle
    req_valid := false.B
    curPtr := 0.U
    unSentLoads := 0.U
    globalException := false.B
    globalMMIO := false.B
  }

  // NOTE: special case (a misaligned load crosses a page boundary and the page fault happens in the next page)
  // if the exception happens in the higher page address part, overwrite the loadExceptionBuffer vaddr
  val shouldOverwrite = req_valid && globalException
  val overwriteExpBuf = GatedValidRegNext(shouldOverwrite)
  val overwriteVaddr = RegEnable(
    Mux(
      cross16BytesBoundary && (curPtr === 1.U),
      splitLoadResp(curPtr).vaddr,
      splitLoadResp(curPtr).fullva),
    shouldOverwrite)
  val overwriteGpaddr = RegEnable(splitLoadResp(curPtr).gpaddr, shouldOverwrite)
  val overwriteIsHyper = RegEnable(splitLoadResp(curPtr).isHyper, shouldOverwrite)
  val overwriteIsForVSnonLeafPTE = RegEnable(splitLoadResp(curPtr).isForVSnonLeafPTE, shouldOverwrite)

  io.overwriteExpBuf.valid := overwriteExpBuf
  io.overwriteExpBuf.vaddr := overwriteVaddr
  io.overwriteExpBuf.isHyper := overwriteIsHyper
  io.overwriteExpBuf.gpaddr := overwriteGpaddr
  io.overwriteExpBuf.isForVSnonLeafPTE := overwriteIsForVSnonLeafPTE

  // when there is no exception or mmio, flush the loadExceptionBuffer at s_wb
  val flushLdExpBuff = GatedValidRegNext(req_valid && (bufferState === s_wb) && !(globalMMIO || globalException))
  io.flushLdExpBuff := flushLdExpBuff

  XSPerfAccumulate("alloc",                  RegNext(!req_valid) && req_valid)
  XSPerfAccumulate("flush",                  flush)
  XSPerfAccumulate("flush_idle",             flush && (bufferState === s_idle))
  XSPerfAccumulate("flush_non_idle",         flush && (bufferState =/= s_idle))
}
609}