/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.fu.fpu.FPU
import xiangshan.backend.rob.RobLsqIO
import xiangshan.cache.mmu.HasTlbConst
import xiangshan.cache._
import xiangshan.frontend.FtqPtr
import xiangshan.ExceptionNO._
import xiangshan.cache.wpu.ReplayCarry
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles.{MemExuOutput, DynInst}

class LoadMisalignBuffer(implicit p: Parameters) extends XSModule
  with HasCircularQueuePtrHelper
  with HasLoadHelper
  with HasTlbConst
{
  private val enqPortNum = LoadPipelineWidth
  private val maxSplitNum = 2

  require(maxSplitNum == 2)

  private val LB = "b00".U(2.W)
  private val LH = "b01".U(2.W)
  private val LW = "b10".U(2.W)
  private val LD = "b11".U(2.W)
  // encoding of how many bytes to shift or truncate
  private val BYTE0 = "b000".U(3.W)
  private val BYTE1 = "b001".U(3.W)
  private val BYTE2 = "b010".U(3.W)
  private val BYTE3 = "b011".U(3.W)
  private val BYTE4 = "b100".U(3.W)
  private val BYTE5 = "b101".U(3.W)
  private val BYTE6 = "b110".U(3.W)
  private val BYTE7 = "b111".U(3.W)

  def getMask(sizeEncode: UInt) = LookupTree(sizeEncode, List(
    LB -> 0x1.U, // lb
    LH -> 0x3.U, // lh
    LW -> 0xf.U, // lw
    LD -> 0xff.U  // ld
  ))
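  // e.g. getMask(LW) === 0xf.U: a lw touches 4 contiguous bytes.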

  def getShiftAndTruncateData(shiftEncode: UInt, truncateEncode: UInt, data: UInt) = {
    val shiftData = LookupTree(shiftEncode, List(
      BYTE0 -> data(63,    0),
      BYTE1 -> data(63,    8),
      BYTE2 -> data(63,   16),
      BYTE3 -> data(63,   24),
      BYTE4 -> data(63,   32),
      BYTE5 -> data(63,   40),
      BYTE6 -> data(63,   48),
      BYTE7 -> data(63,   56)
    ))
    val truncateData = LookupTree(truncateEncode, List(
      BYTE0 -> 0.U(XLEN.W), // cannot truncate to a width of 0 bytes
      BYTE1 -> shiftData(7,    0),
      BYTE2 -> shiftData(15,   0),
      BYTE3 -> shiftData(23,   0),
      BYTE4 -> shiftData(31,   0),
      BYTE5 -> shiftData(39,   0),
      BYTE6 -> shiftData(47,   0),
      BYTE7 -> shiftData(55,   0)
    ))
    truncateData(XLEN - 1, 0)
  }
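  // Worked example (matching the LD split case below where vaddr(2, 0) === "b101"):
  // the low half is issued as a lw at vaddr - 1, so its result carries one extra
  // byte below vaddr. getShiftAndTruncateData(BYTE1, BYTE3, data) shifts that
  // byte away and keeps the 3 requested bytes, i.e. data(31, 8) zero-extended.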

  def selectOldest[T <: LqWriteBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    if (valid.length == 0 || valid.length == 1) {
      (valid, bits)
    } else if (valid.length == 2) {
      val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0)))))
      for (i <- res.indices) {
        res(i).valid := valid(i)
        res(i).bits := bits(i)
      }
      val oldest = Mux(valid(0) && valid(1),
        Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx) ||
          (isNotBefore(bits(0).uop.robIdx, bits(1).uop.robIdx) && bits(0).uop.uopIdx > bits(1).uop.uopIdx), res(1), res(0)),
        Mux(valid(0) && !valid(1), res(0), res(1)))
      (Seq(oldest.valid), Seq(oldest.bits))
    } else {
      val left = selectOldest(valid.take(valid.length / 2), bits.take(bits.length / 2))
      val right = selectOldest(valid.takeRight(valid.length - (valid.length / 2)), bits.takeRight(bits.length - (bits.length / 2)))
      selectOldest(left._1 ++ right._1, left._2 ++ right._2)
    }
  }
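  // Age order used above: an entry is older when its robIdx is earlier; for two
  // uops sharing a robIdx (e.g. parts of one split instruction), the smaller
  // uopIdx is the older one.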

  val io = IO(new Bundle() {
    val redirect        = Flipped(Valid(new Redirect))
    val req             = Vec(enqPortNum, Flipped(Valid(new LqWriteBundle)))
    val rob             = Flipped(new RobLsqIO)
    val splitLoadReq    = Decoupled(new LsPipelineBundle)
    val splitLoadResp   = Flipped(Valid(new LqWriteBundle))
    val writeBack       = Decoupled(new MemExuOutput)
    val overwriteExpBuf = Output(new XSBundle {
      val valid  = Bool()
      val vaddr  = UInt(XLEN.W)
      val isHyper = Bool()
      val gpaddr = UInt(XLEN.W)
      val isForVSnonLeafPTE = Bool()
    })
    val flushLdExpBuff  = Output(Bool())
  })

  io.rob.mmio := 0.U.asTypeOf(Vec(LoadPipelineWidth, Bool()))
  io.rob.uop  := 0.U.asTypeOf(Vec(LoadPipelineWidth, new DynInst))

  val req_valid = RegInit(false.B)
  val req = Reg(new LqWriteBundle)

  // enqueue
  // s1:
  val s1_req = VecInit(io.req.map(_.bits))
  val s1_valid = VecInit(io.req.map(x => x.valid))

  // s2: delay 1 cycle
  val s2_req = RegNext(s1_req)
  val s2_valid = (0 until enqPortNum).map(i =>
    RegNext(s1_valid(i)) &&
    !s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) &&
    !s2_req(i).uop.robIdx.needFlush(io.redirect)
  )
  val s2_miss_aligned = s2_req.map(x =>
    x.uop.exceptionVec(loadAddrMisaligned) && !x.uop.exceptionVec(breakPoint) && !TriggerAction.isDmode(x.uop.trigger)
  )

  val s2_enqueue = Wire(Vec(enqPortNum, Bool()))
  for (w <- 0 until enqPortNum) {
    s2_enqueue(w) := s2_valid(w) && s2_miss_aligned(w)
  }
  when (req_valid && req.uop.robIdx.needFlush(io.redirect)) {
    req_valid := s2_enqueue.asUInt.orR
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req_valid := true.B
  }

  val reqSel = selectOldest(s2_enqueue, s2_req)

  when (req_valid) {
    req := Mux(
      reqSel._1(0) && (isAfter(req.uop.robIdx, reqSel._2(0).uop.robIdx) || (isNotBefore(req.uop.robIdx, reqSel._2(0).uop.robIdx) && req.uop.uopIdx > reqSel._2(0).uop.uopIdx)),
      reqSel._2(0),
      req)
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req := reqSel._2(0)
  }

  val robMatch = req_valid && io.rob.pendingld && (io.rob.pendingPtr === req.uop.robIdx)

  // buffer control:
  //  - split the misaligned load into aligned loads
  //  - send the split loads to the load unit (ldu) and collect the results
  //  - merge the results and write back to the rob
  val s_idle :: s_split :: s_req :: s_resp :: s_comb :: s_wb :: s_wait :: Nil = Enum(7)
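  // State flow: s_idle -> s_split -> (s_req -> s_resp, repeated per split and on
  // replay) -> s_comb -> s_wb -> s_wait -> s_idle; on exception or MMIO, s_resp
  // goes to s_wb directly and the combine step is skipped.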
  val bufferState = RegInit(s_idle)
  val splitLoadReqs = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LsPipelineBundle))))
  val splitLoadResp = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LqWriteBundle))))
  val unSentLoads = RegInit(0.U(maxSplitNum.W))
  val curPtr = RegInit(0.U(log2Ceil(maxSplitNum).W))

  // whether an exception or MMIO access occurred among the split loads
  val globalException = RegInit(false.B)
  val globalMMIO = RegInit(false.B)

  val hasException = ExceptionNO.selectByFu(io.splitLoadResp.bits.uop.exceptionVec, LduCfg).asUInt.orR
  val isMMIO = io.splitLoadResp.bits.mmio

  switch(bufferState) {
    is (s_idle) {
      when (robMatch) {
        bufferState := s_split
      }
    }

    is (s_split) {
      bufferState := s_req
    }

    is (s_req) {
      when (io.splitLoadReq.fire) {
        bufferState := s_resp
      }
    }

    is (s_resp) {
      when (io.splitLoadResp.valid) {
        val clearOh = UIntToOH(curPtr)
        when (hasException || isMMIO) {
          // commit directly when an exception occurs;
          // if any split load reaches mmio space, delegate to software as a loadAddrMisaligned exception
          bufferState := s_wb
          globalException := hasException
          globalMMIO := isMMIO
        } .elsewhen(io.splitLoadResp.bits.rep_info.need_rep || (unSentLoads & ~clearOh).orR) {
          // needs replay or still has unsent requests
          bufferState := s_req
        } .otherwise {
          // merge the split load results
          bufferState := s_comb
        }
      }
    }

    is (s_comb) {
      bufferState := s_wb
    }

    is (s_wb) {
      when(io.writeBack.fire) {
        bufferState := s_wait
      }
    }

    is (s_wait) {
      when(io.rob.lcommit =/= 0.U || req.uop.robIdx.needFlush(io.redirect)) {
        // the rob has committed the unaligned load or handled the exception; reset all state
        bufferState := s_idle
        req_valid := false.B
        curPtr := 0.U
        unSentLoads := 0.U
        globalException := false.B
        globalMMIO := false.B
      }
    }
  }

  val highAddress = LookupTree(req.uop.fuOpType(1, 0), List(
    LB -> 0.U,
    LH -> 1.U,
    LW -> 3.U,
    LD -> 7.U
  )) + req.vaddr(4, 0)
  // check whether (vaddr + opSize - 1) and vaddr fall in the same 16-byte region
  val cross16BytesBoundary = req_valid && (highAddress(4) =/= req.vaddr(4))
  val aligned16BytesAddr   = (req.vaddr >> 4) << 4 // req.vaddr & ~("b1111".U)
  val aligned16BytesSel    = req.vaddr(3, 0)
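  // e.g. an ld with vaddr(4, 0) === "b01101": highAddress = 0xd + 7 = 0x14, whose
  // bit 4 differs from vaddr(4), so the access crosses a 16-byte boundary and is
  // split in two; an ld at "b00101" ends at 0xc, stays inside one 16-byte region,
  // and is instead widened to a single 128-bit load.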

  // meta of the 128-bit load
  val new128Load = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  // meta of the split loads
  val lowAddrLoad  = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val highAddrLoad = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val lowResultShift = RegInit(0.U(3.W)) // how many bytes to shift right once the result returns
  val lowResultWidth = RegInit(0.U(3.W)) // how many bytes to take from the result
  val highResultShift = RegInit(0.U(3.W))
  val highResultWidth = RegInit(0.U(3.W))

  when (bufferState === s_split) {
    when (!cross16BytesBoundary) {
      // turn this unaligned load into a single 128-bit load
      unSentLoads := 1.U
      curPtr := 0.U
      new128Load.vaddr := aligned16BytesAddr
      new128Load.fullva := req.fullva
      // new128Load.mask  := (getMask(req.uop.fuOpType(1, 0)) << aligned16BytesSel).asUInt
      new128Load.mask  := 0xffff.U
      new128Load.uop   := req.uop
      new128Load.uop.exceptionVec(loadAddrMisaligned) := false.B
      new128Load.is128bit := true.B
      splitLoadReqs(0) := new128Load
    } .otherwise {
      // split this unaligned load into `maxSplitNum` aligned loads
      unSentLoads := Fill(maxSplitNum, 1.U(1.W))
      curPtr := 0.U
      lowAddrLoad.uop := req.uop
      lowAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      lowAddrLoad.fullva := req.fullva
      highAddrLoad.uop := req.uop
      highAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      highAddrLoad.fullva := req.fullva

      switch (req.uop.fuOpType(1, 0)) {
        is (LB) {
          assert(false.B, "lb should not trigger misalignment")
        }

        is (LH) {
          lowAddrLoad.uop.fuOpType := LB
          lowAddrLoad.vaddr := req.vaddr
          lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
          lowResultShift    := BYTE0
          lowResultWidth    := BYTE1

          highAddrLoad.uop.fuOpType := LB
          highAddrLoad.vaddr := req.vaddr + 1.U
          highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
          highResultShift    := BYTE0
          highResultWidth    := BYTE1
        }

        is (LW) {
          switch (req.vaddr(1, 0)) {
            is ("b00".U) {
              assert(false.B, "should not trigger misalignment")
            }

            is ("b01".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b10".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b11".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }
          }
        }

        is (LD) {
          switch (req.vaddr(2, 0)) {
            is ("b000".U) {
              assert(false.B, "should not trigger misalignment")
            }

            is ("b001".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE7

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 7.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b010".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 2.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE2
              lowResultWidth    := BYTE6

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 6.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b011".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 3.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE3
              lowResultWidth    := BYTE5

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 5.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }

            is ("b100".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE4

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 4.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE4
            }

            is ("b101".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE5
            }

            is ("b110".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE6
            }

            is ("b111".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE7
            }
          }
        }
      }

      splitLoadReqs(0) := lowAddrLoad
      splitLoadReqs(1) := highAddrLoad
    }
  }

  io.splitLoadReq.valid := req_valid && (bufferState === s_req)
  io.splitLoadReq.bits  := splitLoadReqs(curPtr)
  // restore the information of the H extension load
  // bit encoding: | hlv (1 bit) | hlvx (1 bit) | is unsigned (1 bit) | size (2 bits) |
  val reqIsHlv  = LSUOpType.isHlv(req.uop.fuOpType)
  val reqIsHlvx = LSUOpType.isHlvx(req.uop.fuOpType)
  io.splitLoadReq.bits.uop.fuOpType := Cat(reqIsHlv, reqIsHlvx, 0.U(1.W), splitLoadReqs(curPtr).uop.fuOpType(1, 0))
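  // e.g. for an hlv.w, the hlv bit is restored onto every split request while the
  // size field comes from the split computed above (LB/LH/LW/LD); the "is unsigned"
  // bit is left 0 here because final sign/zero extension is applied at the combine
  // step through rdataHelper(req.uop, ...).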

  when (io.splitLoadResp.valid) {
    splitLoadResp(curPtr) := io.splitLoadResp.bits
    when (isMMIO) {
      unSentLoads := 0.U
      splitLoadResp(curPtr).uop.exceptionVec := 0.U.asTypeOf(ExceptionVec())
      // delegate to software
      splitLoadResp(curPtr).uop.exceptionVec(loadAddrMisaligned) := true.B
    } .elsewhen (hasException) {
      unSentLoads := 0.U
    } .elsewhen (!io.splitLoadResp.bits.rep_info.need_rep) {
      unSentLoads := unSentLoads & ~UIntToOH(curPtr)
      curPtr := curPtr + 1.U
    }
  }

  val combinedData = RegInit(0.U(XLEN.W))

  when (bufferState === s_comb) {
    when (!cross16BytesBoundary) {
      val shiftData = LookupTree(aligned16BytesSel, List(
        "b0000".U -> splitLoadResp(0).data(63,     0),
        "b0001".U -> splitLoadResp(0).data(71,     8),
        "b0010".U -> splitLoadResp(0).data(79,    16),
        "b0011".U -> splitLoadResp(0).data(87,    24),
        "b0100".U -> splitLoadResp(0).data(95,    32),
        "b0101".U -> splitLoadResp(0).data(103,   40),
        "b0110".U -> splitLoadResp(0).data(111,   48),
        "b0111".U -> splitLoadResp(0).data(119,   56),
        "b1000".U -> splitLoadResp(0).data(127,   64),
        "b1001".U -> splitLoadResp(0).data(127,   72),
        "b1010".U -> splitLoadResp(0).data(127,   80),
        "b1011".U -> splitLoadResp(0).data(127,   88),
        "b1100".U -> splitLoadResp(0).data(127,   96),
        "b1101".U -> splitLoadResp(0).data(127,  104),
        "b1110".U -> splitLoadResp(0).data(127,  112),
        "b1111".U -> splitLoadResp(0).data(127,  120)
      ))
      val truncateData = LookupTree(req.uop.fuOpType(1, 0), List(
        LB -> shiftData(7,  0), // lb
        LH -> shiftData(15, 0), // lh
        LW -> shiftData(31, 0), // lw
        LD -> shiftData(63, 0)  // ld
      ))
      combinedData := rdataHelper(req.uop, truncateData(XLEN - 1, 0))
    } .otherwise {
      val lowAddrResult = getShiftAndTruncateData(lowResultShift, lowResultWidth, splitLoadResp(0).data)
                            .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
      val highAddrResult = getShiftAndTruncateData(highResultShift, highResultWidth, splitLoadResp(1).data)
                            .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
      val catResult = Wire(Vec(XLEN / 8, UInt(8.W)))
      (0 until XLEN / 8).foreach { i =>
        when (i.U < lowResultWidth) {
          catResult(i) := lowAddrResult(i)
        } .otherwise {
          catResult(i) := highAddrResult(i.U - lowResultWidth)
        }
      }
      combinedData := rdataHelper(req.uop, (catResult.asUInt)(XLEN - 1, 0))
    }
  }
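  // Continuing the ld @ vaddr(2, 0) === "b101" example: the low lw contributes
  // lowResultWidth = 3 bytes (catResult(0) to catResult(2)) and the high ld
  // contributes highResultWidth = 5 bytes (catResult(3) to catResult(7)),
  // rebuilding the full 8-byte value before rdataHelper extends it.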

  io.writeBack.valid := req_valid && (bufferState === s_wb)
  io.writeBack.bits.uop := req.uop
  io.writeBack.bits.uop.exceptionVec := ExceptionNO.selectByFu(Mux(
    globalMMIO || globalException,
    splitLoadResp(curPtr).uop.exceptionVec,
    0.U.asTypeOf(ExceptionVec()) // TODO: is this ok?
  ), LduCfg)
  io.writeBack.bits.uop.flushPipe := Mux(globalMMIO || globalException, false.B, true.B)
  io.writeBack.bits.uop.replayInst := false.B
  io.writeBack.bits.data := combinedData
  io.writeBack.bits.isFromLoadUnit := DontCare
  io.writeBack.bits.debug.isMMIO := globalMMIO
  io.writeBack.bits.debug.isPerfCnt := false.B
  io.writeBack.bits.debug.paddr := req.paddr
  io.writeBack.bits.debug.vaddr := req.vaddr

  val flush = req_valid && req.uop.robIdx.needFlush(io.redirect)

  when (flush && (bufferState =/= s_idle)) {
    bufferState := s_idle
    req_valid := false.B
    curPtr := 0.U
    unSentLoads := 0.U
    globalException := false.B
    globalMMIO := false.B
  }
  // NOTE: special case (an unaligned load crosses a page boundary and the page fault happens on the second page)
  // if the exception happens in the higher page-address part, overwrite the loadExceptionBuffer vaddr
  val shouldOverwrite = req_valid && globalException
  val overwriteExpBuf = GatedValidRegNext(shouldOverwrite)
  val overwriteVaddr = RegEnable(
    Mux(
      cross16BytesBoundary && (curPtr === 1.U),
      splitLoadResp(curPtr).vaddr,
      splitLoadResp(curPtr).fullva),
    shouldOverwrite)
  val overwriteGpaddr = RegEnable(
    Mux(
      cross16BytesBoundary && (curPtr === 1.U),
      // when crossing a page, the offset should always be 0
      Cat(get_pn(splitLoadResp(curPtr).gpaddr), get_off(0.U(splitLoadResp(curPtr).gpaddr.getWidth.W))),
      splitLoadResp(curPtr).gpaddr),
    shouldOverwrite)
  val overwriteIsHyper = RegEnable(splitLoadResp(curPtr).isHyper, shouldOverwrite)
  val overwriteIsForVSnonLeafPTE = RegEnable(splitLoadResp(curPtr).isForVSnonLeafPTE, shouldOverwrite)
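  // curPtr still points at the faulting split here: on an exception, unSentLoads
  // is cleared and curPtr is not advanced. For a fault on the high half of a
  // boundary-crossing load (curPtr === 1), the reported vaddr must be the split
  // (second-page) address rather than the original fullva.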

  io.overwriteExpBuf.valid := overwriteExpBuf
  io.overwriteExpBuf.vaddr := overwriteVaddr
  io.overwriteExpBuf.isHyper := overwriteIsHyper
  io.overwriteExpBuf.gpaddr := overwriteGpaddr
  io.overwriteExpBuf.isForVSnonLeafPTE := overwriteIsForVSnonLeafPTE

  // when there is no exception or mmio, flush the loadExceptionBuffer at s_wb
  val flushLdExpBuff = GatedValidRegNext(req_valid && (bufferState === s_wb) && !(globalMMIO || globalException))
  io.flushLdExpBuff := flushLdExpBuff

  XSPerfAccumulate("alloc",                  RegNext(!req_valid) && req_valid)
  XSPerfAccumulate("flush",                  flush)
  XSPerfAccumulate("flush_idle",             flush && (bufferState === s_idle))
  XSPerfAccumulate("flush_non_idle",         flush && (bufferState =/= s_idle))
}