/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.fu.FuType
import xiangshan.backend.fu.fpu.FPU
import xiangshan.backend.rob.RobLsqIO
import xiangshan.cache.mmu.HasTlbConst
import xiangshan.cache._
import xiangshan.frontend.FtqPtr
import xiangshan.ExceptionNO._
import xiangshan.cache.wpu.ReplayCarry
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles.{MemExuOutput, DynInst}
import xiangshan.backend.fu.FuConfig.LduCfg

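/**
 * LoadMisalignBuffer buffers the oldest load that raises a loadAddrMisaligned exception,
 * waits for it to become the pending load at the rob head, then replays it through the
 * load unit: either as a single 128-bit access when it stays inside one 16-byte region,
 * or as two aligned split accesses when it crosses a 16-byte boundary. The returned data
 * are merged and written back; exception and MMIO cases skip the merge and are written
 * back with the recorded exception (MMIO is delegated to software as loadAddrMisaligned).
 */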
class LoadMisalignBuffer(implicit p: Parameters) extends XSModule
  with HasCircularQueuePtrHelper
  with HasLoadHelper
  with HasTlbConst
{
  private val enqPortNum = LoadPipelineWidth
  private val maxSplitNum = 2

  require(maxSplitNum == 2)

  private val LB = "b00".U(2.W)
  private val LH = "b01".U(2.W)
  private val LW = "b10".U(2.W)
  private val LD = "b11".U(2.W)

  // encoding of how many bytes to shift or truncate
  private val BYTE0 = "b000".U(3.W)
  private val BYTE1 = "b001".U(3.W)
  private val BYTE2 = "b010".U(3.W)
  private val BYTE3 = "b011".U(3.W)
  private val BYTE4 = "b100".U(3.W)
  private val BYTE5 = "b101".U(3.W)
  private val BYTE6 = "b110".U(3.W)
  private val BYTE7 = "b111".U(3.W)

  def getMask(sizeEncode: UInt) = LookupTree(sizeEncode, List(
    LB -> 0x1.U, // lb
    LH -> 0x3.U, // lh
    LW -> 0xf.U, // lw
    LD -> 0xff.U  // ld
  ))

  def getShiftAndTruncateData(shiftEncode: UInt, truncateEncode: UInt, data: UInt) = {
    val shiftData = LookupTree(shiftEncode, List(
      BYTE0 -> data(63,    0),
      BYTE1 -> data(63,    8),
      BYTE2 -> data(63,   16),
      BYTE3 -> data(63,   24),
      BYTE4 -> data(63,   32),
      BYTE5 -> data(63,   40),
      BYTE6 -> data(63,   48),
      BYTE7 -> data(63,   56)
    ))
    val truncateData = LookupTree(truncateEncode, List(
      BYTE0 -> 0.U(XLEN.W), // cannot truncate to a 0-byte width
      BYTE1 -> shiftData(7,    0),
      BYTE2 -> shiftData(15,   0),
      BYTE3 -> shiftData(23,   0),
      BYTE4 -> shiftData(31,   0),
      BYTE5 -> shiftData(39,   0),
      BYTE6 -> shiftData(47,   0),
      BYTE7 -> shiftData(55,   0)
    ))
    truncateData(XLEN - 1, 0)
  }
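  // Example: shiftEncode = BYTE2 with truncateEncode = BYTE4 first drops the low two bytes
  // (shiftData = data(63, 16)) and then keeps four bytes of the result, yielding data(47, 16).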

  def selectOldest[T <: LqWriteBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    if (valid.length == 0 || valid.length == 1) {
      (valid, bits)
    } else if (valid.length == 2) {
      val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0)))))
      for (i <- res.indices) {
        res(i).valid := valid(i)
        res(i).bits := bits(i)
      }
      val oldest = Mux(valid(0) && valid(1),
        Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx) ||
          (isNotBefore(bits(0).uop.robIdx, bits(1).uop.robIdx) && bits(0).uop.uopIdx > bits(1).uop.uopIdx), res(1), res(0)),
        Mux(valid(0) && !valid(1), res(0), res(1)))
      (Seq(oldest.valid), Seq(oldest.bits))
    } else {
      val left = selectOldest(valid.take(valid.length / 2), bits.take(bits.length / 2))
      val right = selectOldest(valid.takeRight(valid.length - (valid.length / 2)), bits.takeRight(bits.length - (bits.length / 2)))
      selectOldest(left._1 ++ right._1, left._2 ++ right._2)
    }
  }
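  // selectOldest recursively reduces the valid inputs to the single oldest request in
  // program order: the earliest robIdx wins, and uopIdx breaks ties within the same robIdx.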

  val io = IO(new Bundle() {
    val redirect        = Flipped(Valid(new Redirect))
    val req             = Vec(enqPortNum, Flipped(Valid(new LqWriteBundle)))
    val rob             = Flipped(new RobLsqIO)
    val splitLoadReq    = Decoupled(new LsPipelineBundle)
    val splitLoadResp   = Flipped(Valid(new LqWriteBundle))
    val writeBack       = Decoupled(new MemExuOutput)
    val overwriteExpBuf = Output(new XSBundle {
      val valid  = Bool()
      val vaddr  = UInt(XLEN.W)
      val isHyper = Bool()
      val gpaddr = UInt(XLEN.W)
      val isForVSnonLeafPTE = Bool()
    })
    val flushLdExpBuff  = Output(Bool())
  })

  io.rob.mmio := 0.U.asTypeOf(Vec(LoadPipelineWidth, Bool()))
  io.rob.uop  := 0.U.asTypeOf(Vec(LoadPipelineWidth, new DynInst))

  val req_valid = RegInit(false.B)
  val req = Reg(new LqWriteBundle)

  // enqueue
  // s1:
  val s1_req = VecInit(io.req.map(_.bits))
  val s1_valid = VecInit(io.req.map(x => x.valid))

  // s2: delay 1 cycle
  val s2_req = RegNext(s1_req)
  val s2_valid = (0 until enqPortNum).map(i =>
    RegNext(s1_valid(i)) &&
    !s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) &&
    !s2_req(i).uop.robIdx.needFlush(io.redirect)
  )
  val s2_miss_aligned = s2_req.map(x =>
    x.uop.exceptionVec(loadAddrMisaligned) && !x.uop.exceptionVec(breakPoint) && !TriggerAction.isDmode(x.uop.trigger)
  )

  val s2_enqueue = Wire(Vec(enqPortNum, Bool()))
  for (w <- 0 until enqPortNum) {
    s2_enqueue(w) := s2_valid(w) && s2_miss_aligned(w)
  }

  when (req_valid && req.uop.robIdx.needFlush(io.redirect)) {
    req_valid := s2_enqueue.asUInt.orR
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req_valid := req_valid || true.B
  }

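  // selectOldest picks the oldest newly enqueued misaligned load; the buffered request
  // is only replaced when that selection is older than the one currently held.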
  val reqSel = selectOldest(s2_enqueue, s2_req)

  when (req_valid) {
    req := Mux(
      reqSel._1(0) && (isAfter(req.uop.robIdx, reqSel._2(0).uop.robIdx) || (isNotBefore(req.uop.robIdx, reqSel._2(0).uop.robIdx) && req.uop.uopIdx > reqSel._2(0).uop.uopIdx)),
      reqSel._2(0),
      req)
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req := reqSel._2(0)
  }

  val robMatch = req_valid && io.rob.pendingld && (io.rob.pendingPtr === req.uop.robIdx)

  // buffer control:
  //  - split the misaligned load into aligned loads
  //  - send the split loads to the ldu and collect their results
  //  - merge the results and write back to the rob
  val s_idle :: s_split :: s_req :: s_resp :: s_comb :: s_wb :: s_wait :: Nil = Enum(7)
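  // s_idle : wait until the buffered load becomes the pending load at the rob head
  // s_split: generate the split (or widened 128-bit) requests
  // s_req  : issue the current split request to the load unit
  // s_resp : wait for the load unit response; replay, continue, or finish
  // s_comb : merge the split results into the final data
  // s_wb   : write the merged result (or exception) back
  // s_wait : wait for rob commit or a redirect, then reset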
  val bufferState = RegInit(s_idle)
  val splitLoadReqs = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LsPipelineBundle))))
  val splitLoadResp = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LqWriteBundle))))
  val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec()))
  val unSentLoads = RegInit(0.U(maxSplitNum.W))
  val curPtr = RegInit(0.U(log2Ceil(maxSplitNum).W))

  // whether any split load raised an exception or accessed MMIO space
  val globalException = RegInit(false.B)
  val globalMMIO = RegInit(false.B)

  val hasException = ExceptionNO.selectByFu(io.splitLoadResp.bits.uop.exceptionVec, LduCfg).asUInt.orR
  val isMMIO = io.splitLoadResp.bits.mmio

  switch(bufferState) {
    is (s_idle) {
      when (robMatch) {
        bufferState := s_split
      }
    }

    is (s_split) {
      bufferState := s_req
    }

    is (s_req) {
      when (io.splitLoadReq.fire) {
        bufferState := s_resp
      }
    }

    is (s_resp) {
      when (io.splitLoadResp.valid) {
        val clearOh = UIntToOH(curPtr)
        when (hasException || isMMIO) {
          // commit directly when an exception occurs
          // if any split load reaches mmio space, raise loadAddrMisaligned and delegate to software
          bufferState := s_wb
          globalException := hasException
          globalMMIO := isMMIO
        } .elsewhen(io.splitLoadResp.bits.rep_info.need_rep || (unSentLoads & ~clearOh).orR) {
          // need replay or still has unsent requests
          bufferState := s_req
        } .otherwise {
          // merge the split load results
          bufferState := s_comb
        }
      }
    }

    is (s_comb) {
      bufferState := s_wb
    }

    is (s_wb) {
      when(io.writeBack.fire) {
        bufferState := s_wait
      }
    }

    is (s_wait) {
      when(io.rob.lcommit =/= 0.U || req.uop.robIdx.needFlush(io.redirect)) {
        // the rob has committed the unaligned load or handled the exception; reset all state
        bufferState := s_idle
        req_valid := false.B
        curPtr := 0.U
        unSentLoads := 0.U
        globalException := false.B
        globalMMIO := false.B
      }
    }
  }

  val highAddress = LookupTree(req.uop.fuOpType(1, 0), List(
    LB -> 0.U,
    LH -> 1.U,
    LW -> 3.U,
    LD -> 7.U
  )) + req.vaddr(4, 0)
  // check whether (vaddr + opSize - 1) and vaddr fall in the same 16-byte region
  val cross16BytesBoundary = req_valid && (highAddress(4) =/= req.vaddr(4))
  val aligned16BytesAddr   = (req.vaddr >> 4) << 4 // req.vaddr & ~("b1111".U)
  val aligned16BytesSel    = req.vaddr(3, 0)
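  // Example: an lw whose vaddr(4, 0) is 0x0e has highAddress = 0x0e + 3 = 0x11; bit 4
  // differs from vaddr(4), so the access crosses the 16-byte boundary and must be split.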

  // meta of the 128-bit load
  val new128Load = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  // meta of the split loads
  val lowAddrLoad  = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val highAddrLoad = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val lowResultShift = RegInit(0.U(3.W)) // how many bytes to shift right from the returned data
  val lowResultWidth = RegInit(0.U(3.W)) // how many bytes to take from the returned data
  val highResultShift = RegInit(0.U(3.W))
  val highResultWidth = RegInit(0.U(3.W))

  when (bufferState === s_split) {
    when (!cross16BytesBoundary) {
      // turn this unaligned load into a single 128-bit load
      unSentLoads := 1.U
      curPtr := 0.U
      new128Load.vaddr := aligned16BytesAddr
      new128Load.fullva := req.fullva
      // new128Load.mask  := (getMask(req.uop.fuOpType(1, 0)) << aligned16BytesSel).asUInt
      new128Load.mask  := 0xffff.U
      new128Load.uop   := req.uop
      new128Load.uop.exceptionVec(loadAddrMisaligned) := false.B
      new128Load.is128bit := true.B
      splitLoadReqs(0) := new128Load
    } .otherwise {
      // split this unaligned load into `maxSplitNum` aligned loads
      unSentLoads := Fill(maxSplitNum, 1.U(1.W))
      curPtr := 0.U
      lowAddrLoad.uop := req.uop
      lowAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      lowAddrLoad.fullva := req.fullva
      highAddrLoad.uop := req.uop
      highAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      highAddrLoad.fullva := req.fullva

      switch (req.uop.fuOpType(1, 0)) {
        is (LB) {
          assert(false.B, "lb should not trigger misalignment")
        }

        is (LH) {
          lowAddrLoad.uop.fuOpType := LB
          lowAddrLoad.vaddr := req.vaddr
          lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
          lowResultShift    := BYTE0
          lowResultWidth    := BYTE1

          highAddrLoad.uop.fuOpType := LB
          highAddrLoad.vaddr := req.vaddr + 1.U
          highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
          highResultShift    := BYTE0
          highResultWidth    := BYTE1
        }

        is (LW) {
          switch (req.vaddr(1, 0)) {
            is ("b00".U) {
              assert(false.B, "should not trigger misalignment")
            }

            is ("b01".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b10".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b11".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }
          }
        }

        is (LD) {
          switch (req.vaddr(2, 0)) {
            is ("b000".U) {
              assert(false.B, "should not trigger misalignment")
            }

            is ("b001".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE7

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 7.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b010".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 2.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE2
              lowResultWidth    := BYTE6

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 6.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b011".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 3.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE3
              lowResultWidth    := BYTE5

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 5.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }

            is ("b100".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE4

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 4.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE4
            }

            is ("b101".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE5
            }

            is ("b110".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE6
            }

            is ("b111".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE7
            }
          }
        }
      }
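      // Example: an ld at vaddr(3, 0) = 0b1011 needs bytes 11..18 of the 16-byte line.
      // The low access is an aligned ld at offset 8 (keep its upper 5 bytes: shift 3,
      // width 5); the high access is an aligned lw at offset 16 (keep its low 3 bytes).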

      splitLoadReqs(0) := lowAddrLoad
      splitLoadReqs(1) := highAddrLoad
    }
    exceptionVec := 0.U.asTypeOf(exceptionVec.cloneType)
  }

  io.splitLoadReq.valid := req_valid && (bufferState === s_req)
  io.splitLoadReq.bits  := splitLoadReqs(curPtr)
  // Restore the information of the H extension load
  // bit encoding: | hlv 1 | hlvx 1 | is unsigned(1bit) | size(2bit) |
  val reqIsHlv  = LSUOpType.isHlv(req.uop.fuOpType)
  val reqIsHlvx = LSUOpType.isHlvx(req.uop.fuOpType)
  io.splitLoadReq.bits.uop.fuOpType := Cat(reqIsHlv, reqIsHlvx, 0.U(1.W), splitLoadReqs(curPtr).uop.fuOpType(1, 0))

  when (io.splitLoadResp.valid) {
    val resp = io.splitLoadResp.bits
    splitLoadResp(curPtr) := io.splitLoadResp.bits
    when (isMMIO) {
      unSentLoads := 0.U
      exceptionVec := ExceptionNO.selectByFu(0.U.asTypeOf(exceptionVec.cloneType), LduCfg)
      // delegate to software
      exceptionVec(loadAddrMisaligned) := true.B
    } .elsewhen (hasException) {
      unSentLoads := 0.U
      LduCfg.exceptionOut.map(no => exceptionVec(no) := exceptionVec(no) || resp.uop.exceptionVec(no))
    } .elsewhen (!io.splitLoadResp.bits.rep_info.need_rep) {
      unSentLoads := unSentLoads & ~UIntToOH(curPtr)
      curPtr := curPtr + 1.U
      exceptionVec := 0.U.asTypeOf(ExceptionVec())
    }
  }

  val combinedData = RegInit(0.U(XLEN.W))

  when (bufferState === s_comb) {
    when (!cross16BytesBoundary) {
      val shiftData = LookupTree(aligned16BytesSel, List(
        "b0000".U -> splitLoadResp(0).data(63,     0),
        "b0001".U -> splitLoadResp(0).data(71,     8),
        "b0010".U -> splitLoadResp(0).data(79,    16),
        "b0011".U -> splitLoadResp(0).data(87,    24),
        "b0100".U -> splitLoadResp(0).data(95,    32),
        "b0101".U -> splitLoadResp(0).data(103,   40),
        "b0110".U -> splitLoadResp(0).data(111,   48),
        "b0111".U -> splitLoadResp(0).data(119,   56),
        "b1000".U -> splitLoadResp(0).data(127,   64),
        "b1001".U -> splitLoadResp(0).data(127,   72),
        "b1010".U -> splitLoadResp(0).data(127,   80),
        "b1011".U -> splitLoadResp(0).data(127,   88),
        "b1100".U -> splitLoadResp(0).data(127,   96),
        "b1101".U -> splitLoadResp(0).data(127,  104),
        "b1110".U -> splitLoadResp(0).data(127,  112),
        "b1111".U -> splitLoadResp(0).data(127,  120)
      ))
      val truncateData = LookupTree(req.uop.fuOpType(1, 0), List(
        LB -> shiftData(7,  0), // lb
        LH -> shiftData(15, 0), // lh
        LW -> shiftData(31, 0), // lw
        LD -> shiftData(63, 0)  // ld
      ))
      combinedData := rdataHelper(req.uop, truncateData(XLEN - 1, 0))
    } .otherwise {
      val lowAddrResult = getShiftAndTruncateData(lowResultShift, lowResultWidth, splitLoadResp(0).data)
                            .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
      val highAddrResult = getShiftAndTruncateData(highResultShift, highResultWidth, splitLoadResp(1).data)
                            .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
      val catResult = Wire(Vec(XLEN / 8, UInt(8.W)))
      (0 until XLEN / 8).map {
        case i => {
          when (i.U < lowResultWidth) {
            catResult(i) := lowAddrResult(i)
          } .otherwise {
            catResult(i) := highAddrResult(i.U - lowResultWidth)
          }
        }
      }
      combinedData := rdataHelper(req.uop, (catResult.asUInt)(XLEN - 1, 0))
    }
  }
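  // catResult keeps the lowResultWidth bytes from the low access in the least significant
  // positions and appends the high-access bytes above them; rdataHelper then applies the
  // final extension selected by the uop. For the ld example above, bytes 0..4 come from
  // the low access and bytes 5..7 from the high access.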

  io.writeBack.valid := req_valid && (bufferState === s_wb)
  io.writeBack.bits.uop := req.uop
  io.writeBack.bits.uop.exceptionVec := DontCare
  LduCfg.exceptionOut.map(no => io.writeBack.bits.uop.exceptionVec(no) := (globalMMIO || globalException) && exceptionVec(no))
  io.writeBack.bits.uop.fuType := FuType.ldu.U
  io.writeBack.bits.uop.flushPipe := Mux(globalMMIO || globalException, false.B, true.B)
  io.writeBack.bits.uop.replayInst := false.B
  io.writeBack.bits.data := combinedData
  io.writeBack.bits.isFromLoadUnit := DontCare
  io.writeBack.bits.debug.isMMIO := globalMMIO
  io.writeBack.bits.debug.isPerfCnt := false.B
  io.writeBack.bits.debug.paddr := req.paddr
  io.writeBack.bits.debug.vaddr := req.vaddr

  val flush = req_valid && req.uop.robIdx.needFlush(io.redirect)

  when (flush && (bufferState =/= s_idle)) {
    bufferState := s_idle
    req_valid := false.B
    curPtr := 0.U
    unSentLoads := 0.U
    globalException := false.B
    globalMMIO := false.B
  }

  // NOTE: special case (the unaligned load crosses a page boundary and the page fault happens in the next page)
  // if the exception happens in the higher page address part, overwrite the loadExceptionBuffer vaddr
  val shouldOverwrite = req_valid && globalException
  val overwriteExpBuf = GatedValidRegNext(shouldOverwrite)
  val overwriteVaddr = RegEnable(
    Mux(
      cross16BytesBoundary && (curPtr === 1.U),
      splitLoadResp(curPtr).vaddr,
      splitLoadResp(curPtr).fullva),
    shouldOverwrite)
  val overwriteGpaddr = RegEnable(splitLoadResp(curPtr).gpaddr, shouldOverwrite)
  val overwriteIsHyper = RegEnable(splitLoadResp(curPtr).isHyper, shouldOverwrite)
  val overwriteIsForVSnonLeafPTE = RegEnable(splitLoadResp(curPtr).isForVSnonLeafPTE, shouldOverwrite)

  io.overwriteExpBuf.valid := overwriteExpBuf
  io.overwriteExpBuf.vaddr := overwriteVaddr
  io.overwriteExpBuf.isHyper := overwriteIsHyper
  io.overwriteExpBuf.gpaddr := overwriteGpaddr
  io.overwriteExpBuf.isForVSnonLeafPTE := overwriteIsForVSnonLeafPTE

  // when there is no exception or mmio, flush the loadExceptionBuffer at s_wb
  val flushLdExpBuff = GatedValidRegNext(req_valid && (bufferState === s_wb) && !(globalMMIO || globalException))
  io.flushLdExpBuff := flushLdExpBuff

  XSPerfAccumulate("alloc",                  RegNext(!req_valid) && req_valid)
  XSPerfAccumulate("flush",                  flush)
  XSPerfAccumulate("flush_idle",             flush && (bufferState === s_idle))
  XSPerfAccumulate("flush_non_idle",         flush && (bufferState =/= s_idle))
}
611}