/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.fu.FuType
import xiangshan.backend.fu.fpu.FPU
import xiangshan.backend.rob.RobLsqIO
import xiangshan.cache.mmu.HasTlbConst
import xiangshan.cache._
import xiangshan.frontend.FtqPtr
import xiangshan.ExceptionNO._
import xiangshan.cache.wpu.ReplayCarry
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles.{MemExuOutput, DynInst}
import xiangshan.backend.fu.FuConfig.LduCfg

class LoadMisalignBuffer(implicit p: Parameters) extends XSModule
  with HasCircularQueuePtrHelper
  with HasLoadHelper
  with HasTlbConst
{
  private val enqPortNum = LoadPipelineWidth
  private val maxSplitNum = 2

  require(maxSplitNum == 2)

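  // load size encoding, matching the lowest two bits of fuOpType / alignedType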
  private val LB = "b00".U(2.W)
  private val LH = "b01".U(2.W)
  private val LW = "b10".U(2.W)
  private val LD = "b11".U(2.W)

  // encoding of how many bytes to shift or truncate
  private val BYTE0 = "b000".U(3.W)
  private val BYTE1 = "b001".U(3.W)
  private val BYTE2 = "b010".U(3.W)
  private val BYTE3 = "b011".U(3.W)
  private val BYTE4 = "b100".U(3.W)
  private val BYTE5 = "b101".U(3.W)
  private val BYTE6 = "b110".U(3.W)
  private val BYTE7 = "b111".U(3.W)

  def getMask(sizeEncode: UInt) = LookupTree(sizeEncode, List(
    LB -> 0x1.U, // lb
    LH -> 0x3.U, // lh
    LW -> 0xf.U, // lw
    LD -> 0xff.U  // ld
  ))

  def getShiftAndTruncateData(shiftEncode: UInt, truncateEncode: UInt, data: UInt) = {
    val shiftData = LookupTree(shiftEncode, List(
      BYTE0 -> data(63,    0),
      BYTE1 -> data(63,    8),
      BYTE2 -> data(63,   16),
      BYTE3 -> data(63,   24),
      BYTE4 -> data(63,   32),
      BYTE5 -> data(63,   40),
      BYTE6 -> data(63,   48),
      BYTE7 -> data(63,   56)
    ))
    val truncateData = LookupTree(truncateEncode, List(
      BYTE0 -> 0.U(XLEN.W), // cannot truncate to a 0-byte width
      BYTE1 -> shiftData(7,    0),
      BYTE2 -> shiftData(15,   0),
      BYTE3 -> shiftData(23,   0),
      BYTE4 -> shiftData(31,   0),
      BYTE5 -> shiftData(39,   0),
      BYTE6 -> shiftData(47,   0),
      BYTE7 -> shiftData(55,   0)
    ))
    truncateData(XLEN - 1, 0)
  }
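  // e.g. getShiftAndTruncateData(BYTE1, BYTE3, data) drops the lowest byte and keeps the
  // next three, i.e. it returns data(31, 8) zero-extended to XLEN bits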
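  // recursively pick the oldest valid entry: smaller robIdx wins, uopIdx breaks ties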
  def selectOldest[T <: LqWriteBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    if (valid.length == 0 || valid.length == 1) {
      (valid, bits)
    } else if (valid.length == 2) {
      val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0)))))
      for (i <- res.indices) {
        res(i).valid := valid(i)
        res(i).bits := bits(i)
      }
      val oldest = Mux(valid(0) && valid(1),
        Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx) ||
          (bits(0).uop.robIdx === bits(1).uop.robIdx && bits(0).uop.uopIdx > bits(1).uop.uopIdx), res(1), res(0)),
        Mux(valid(0) && !valid(1), res(0), res(1)))
      (Seq(oldest.valid), Seq(oldest.bits))
    } else {
      val left = selectOldest(valid.take(valid.length / 2), bits.take(bits.length / 2))
      val right = selectOldest(valid.takeRight(valid.length - (valid.length / 2)), bits.takeRight(bits.length - (bits.length / 2)))
      selectOldest(left._1 ++ right._1, left._2 ++ right._2)
    }
  }

  val io = IO(new Bundle() {
    val redirect        = Flipped(Valid(new Redirect))
    val req             = Vec(enqPortNum, Flipped(Decoupled(new LqWriteBundle)))
    val rob             = Flipped(new RobLsqIO)
    val splitLoadReq    = Decoupled(new LsPipelineBundle)
    val splitLoadResp   = Flipped(Valid(new LqWriteBundle))
    val writeBack       = Decoupled(new MemExuOutput)
    val vecWriteBack    = Decoupled(new VecPipelineFeedbackIO(isVStore = false))
    val loadOutValid    = Input(Bool())
    val loadVecOutValid = Input(Bool())
    val overwriteExpBuf = Output(new XSBundle {
      val valid  = Bool()
      val vaddr  = UInt(XLEN.W)
      val isHyper = Bool()
      val gpaddr = UInt(XLEN.W)
      val isForVSnonLeafPTE = Bool()
    })
    val flushLdExpBuff  = Output(Bool())
    val loadMisalignFull = Output(Bool())
  })

  io.rob.mmio := 0.U.asTypeOf(Vec(LoadPipelineWidth, Bool()))
  io.rob.uop  := 0.U.asTypeOf(Vec(LoadPipelineWidth, new DynInst))

  val req_valid = RegInit(false.B)
  val req = Reg(new LqWriteBundle)

  io.loadMisalignFull := req_valid

  (0 until io.req.length).map{i =>
    if (i == 0) {
      io.req(0).ready := !req_valid && io.req(0).valid
    }
    else {
      io.req(i).ready := !io.req.take(i).map(_.ready).reduce(_ || _) && !req_valid && io.req(i).valid
    }
  }
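  // at most one misaligned load is buffered at a time; when several ports request
  // in the same cycle, the lowest-indexed valid port wins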
  val select_req_bit   = ParallelPriorityMux(io.req.map(_.valid), io.req.map(_.bits))
  val select_req_valid = io.req.map(_.valid).reduce(_ || _)
  val canEnqValid = !req_valid && !select_req_bit.uop.robIdx.needFlush(io.redirect) && select_req_valid
  when(canEnqValid) {
    req := select_req_bit
    req_valid := true.B
  }

  // buffer control:
  //  - s_idle:   idle
  //  - s_split:  split the misaligned load
  //  - s_req:    issue a split memory access request
  //  - s_resp:   wait for the response of a split load request
  //  - s_comb_wakeup_rep: merge the data and issue a wakeup load
  //  - s_wb:     writeback to rob/vecMergeBuffer
  val s_idle :: s_split :: s_req :: s_resp :: s_comb_wakeup_rep :: s_wb :: Nil = Enum(6)
  val bufferState = RegInit(s_idle)
  val splitLoadReqs = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LsPipelineBundle))))
  val splitLoadResp = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LqWriteBundle))))
  val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec()))
  val unSentLoads = RegInit(0.U(maxSplitNum.W))
  val curPtr = RegInit(0.U(log2Ceil(maxSplitNum).W))
  val needWakeUpReqsWire = Wire(Bool())
  val needWakeUpWB       = RegInit(false.B)
  val data_select        = RegEnable(genRdataOH(select_req_bit.uop), 0.U(genRdataOH(select_req_bit.uop).getWidth.W), canEnqValid)

  // whether an exception or MMIO access occurred in any split load
  val globalException = RegInit(false.B)
  val globalMMIO = RegInit(false.B)

  val hasException = io.splitLoadResp.bits.vecActive &&
    ExceptionNO.selectByFu(io.splitLoadResp.bits.uop.exceptionVec, LduCfg).asUInt.orR || TriggerAction.isDmode(io.splitLoadResp.bits.uop.trigger)
  val isMMIO = io.splitLoadResp.bits.mmio
  needWakeUpReqsWire := false.B
  switch(bufferState) {
    is (s_idle) {
      when (req_valid) {
        bufferState := s_split
      }
    }

    is (s_split) {
      bufferState := s_req
    }

    is (s_req) {
      when (io.splitLoadReq.fire) {
        bufferState := s_resp
      }
    }

    is (s_resp) {
      when (io.splitLoadResp.valid) {
        val clearOh = UIntToOH(curPtr)
        when (hasException || isMMIO) {
          // commit directly when an exception occurs
          // if any split load reaches MMIO space, delegate to software via a loadAddrMisaligned exception
          bufferState := s_wb
          globalException := hasException
          globalMMIO := isMMIO
        } .elsewhen(io.splitLoadResp.bits.rep_info.need_rep || (unSentLoads & ~clearOh).orR) {
          // need replay or still has unsent requests
          bufferState := s_req
        } .otherwise {
          // merge the split load results
          bufferState := s_comb_wakeup_rep
          needWakeUpWB := !req.isvec
        }
      }
    }

    is (s_comb_wakeup_rep) {
      when(!req.isvec) {
        when(io.splitLoadReq.fire) {
          bufferState := s_wb
        }.otherwise {
          bufferState := s_comb_wakeup_rep
        }
        needWakeUpReqsWire := true.B
      } .otherwise {
        bufferState := s_wb
      }

    }

    is (s_wb) {
      when(req.isvec) {
        when(io.vecWriteBack.fire) {
          bufferState := s_idle
          req_valid := false.B
          curPtr := 0.U
          unSentLoads := 0.U
          globalException := false.B
          globalMMIO := false.B
          needWakeUpWB := false.B
        }

      } .otherwise {
        when(io.writeBack.fire) {
          bufferState := s_idle
          req_valid := false.B
          curPtr := 0.U
          unSentLoads := 0.U
          globalException := false.B
          globalMMIO := false.B
          needWakeUpWB := false.B
        }
      }

    }
  }

  val alignedType = Mux(req.isvec, req.alignedType(1,0), req.uop.fuOpType(1, 0))
  val highAddress = LookupTree(alignedType, List(
    LB -> 0.U,
    LH -> 1.U,
    LW -> 3.U,
    LD -> 7.U
  )) + req.vaddr(4, 0)
  // check whether (vaddr + opSize - 1) and vaddr are in the same 16-byte region
  val cross16BytesBoundary = req_valid && (highAddress(4) =/= req.vaddr(4))
  val aligned16BytesAddr   = (req.vaddr >> 4) << 4 // req.vaddr & ~("b1111".U)
  val aligned16BytesSel    = req.vaddr(3, 0)
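  // e.g. an 8-byte LD whose low address bits are 0x0B touches bytes 0x0B..0x12 of the
  // 32-byte window, so highAddress(4) differs from vaddr(4) and the access crosses a
  // 16-byte boundary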

  // meta of a 128-bit load
  val new128Load = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  // meta of split loads
  val lowAddrLoad  = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val highAddrLoad = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val lowResultShift = RegInit(0.U(3.W)) // how many bytes to shift right once the result returns
  val lowResultWidth = RegInit(0.U(3.W)) // how many bytes to take from the result
  val highResultShift = RegInit(0.U(3.W))
  val highResultWidth = RegInit(0.U(3.W))

  when (bufferState === s_split) {
    when (!cross16BytesBoundary) {
      assert(false.B, s"There should be no misaligned access that does not cross a 16-byte boundary.")
    } .otherwise {
      // split this unaligned load into `maxSplitNum` aligned loads
      unSentLoads := Fill(maxSplitNum, 1.U(1.W))
      curPtr := 0.U
      lowAddrLoad.uop := req.uop
      lowAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      lowAddrLoad.fullva := req.fullva
      highAddrLoad.uop := req.uop
      highAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      highAddrLoad.fullva := req.fullva

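      // for each half: choose an aligned access type and address, and record with
      // (shift, width) which bytes of that access belong to the original load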
      switch (alignedType(1, 0)) {
        is (LB) {
          assert(false.B, "lb should not trigger misalignment")
        }

        is (LH) {
          lowAddrLoad.uop.fuOpType := LB
          lowAddrLoad.vaddr := req.vaddr
          lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
          lowResultShift    := BYTE0
          lowResultWidth    := BYTE1

          highAddrLoad.uop.fuOpType := LB
          highAddrLoad.vaddr := req.vaddr + 1.U
          highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
          highResultShift    := BYTE0
          highResultWidth    := BYTE1
        }

        is (LW) {
          switch (req.vaddr(1, 0)) {
            is ("b00".U) {
              assert(false.B, "should not trigger misalignment")
            }

            is ("b01".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b10".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b11".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }
          }
        }

        is (LD) {
          switch (req.vaddr(2, 0)) {
            is ("b000".U) {
              assert(false.B, "should not trigger misalignment")
            }

            is ("b001".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE7

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 7.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b010".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 2.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE2
              lowResultWidth    := BYTE6

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 6.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b011".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 3.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE3
              lowResultWidth    := BYTE5

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 5.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }

            is ("b100".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE4

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 4.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE4
            }

            is ("b101".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE5
            }

            is ("b110".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE6
            }

            is ("b111".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE7
            }
          }
        }
      }

      splitLoadReqs(0) := lowAddrLoad
      splitLoadReqs(1) := highAddrLoad
    }
    exceptionVec := 0.U.asTypeOf(exceptionVec.cloneType)
  }

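  // issue split requests in s_req; in s_comb_wakeup_rep a scalar load issues one extra
  // request with misalignNeedWakeUp set (the "wakeup load" described in the state
  // comments above); curPtr(0) without the wakeup marks the final real split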
  io.splitLoadReq.valid := req_valid && (bufferState === s_req || bufferState === s_comb_wakeup_rep && needWakeUpReqsWire && !req.isvec)
  io.splitLoadReq.bits  := splitLoadReqs(curPtr)
  io.splitLoadReq.bits.isvec  := req.isvec
  io.splitLoadReq.bits.misalignNeedWakeUp  := needWakeUpReqsWire
  io.splitLoadReq.bits.isFinalSplit        := curPtr(0) && !needWakeUpReqsWire
  // restore the H-extension (hypervisor load) information
  // bit encoding: | hlv (1 bit) | hlvx (1 bit) | is unsigned (1 bit) | size (2 bits) |
  val reqIsHlv  = LSUOpType.isHlv(req.uop.fuOpType)
  val reqIsHlvx = LSUOpType.isHlvx(req.uop.fuOpType)
  io.splitLoadReq.bits.uop.fuOpType := Mux(req.isvec, req.uop.fuOpType, Cat(reqIsHlv, reqIsHlvx, 0.U(1.W), splitLoadReqs(curPtr).uop.fuOpType(1, 0)))
  io.splitLoadReq.bits.alignedType  := Mux(req.isvec, splitLoadReqs(curPtr).uop.fuOpType(1, 0), req.alignedType)

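  // response handling: an MMIO access stops splitting and is delegated to software by
  // raising loadAddrMisaligned; an exception stops splitting and merges the exception
  // vector; otherwise clear this split's bit in unSentLoads and move to the next one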
  when (io.splitLoadResp.valid) {
    val resp = io.splitLoadResp.bits
    splitLoadResp(curPtr) := io.splitLoadResp.bits
    when (isMMIO) {
      unSentLoads := 0.U
      exceptionVec := ExceptionNO.selectByFu(0.U.asTypeOf(exceptionVec.cloneType), LduCfg)
      // delegate to software
      exceptionVec(loadAddrMisaligned) := true.B
    } .elsewhen (hasException) {
      unSentLoads := 0.U
      LduCfg.exceptionOut.map(no => exceptionVec(no) := exceptionVec(no) || resp.uop.exceptionVec(no))
    } .elsewhen (!io.splitLoadResp.bits.rep_info.need_rep) {
      unSentLoads := unSentLoads & ~UIntToOH(curPtr)
      curPtr := curPtr + 1.U
      exceptionVec := 0.U.asTypeOf(ExceptionVec())
    }
  }

  val combinedData = RegInit(0.U(XLEN.W))

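  // combine the two split results byte by byte: the first lowResultWidth bytes come from
  // the low response, the remaining bytes from the high response; rdataHelper /
  // rdataVecHelper then applies the sign/zero extension of the original load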
  when (bufferState === s_comb_wakeup_rep) {
    val lowAddrResult = getShiftAndTruncateData(lowResultShift, lowResultWidth, splitLoadResp(0).data)
                          .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
    val highAddrResult = getShiftAndTruncateData(highResultShift, highResultWidth, splitLoadResp(1).data)
                          .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
    val catResult = Wire(Vec(XLEN / 8, UInt(8.W)))
    (0 until XLEN / 8) .map {
      case i => {
        when (i.U < lowResultWidth) {
          catResult(i) := lowAddrResult(i)
        } .otherwise {
          catResult(i) := highAddrResult(i.U - lowResultWidth)
        }
      }
    }
    combinedData := Mux(req.isvec, rdataVecHelper(req.alignedType, (catResult.asUInt)(XLEN - 1, 0)), rdataHelper(req.uop, (catResult.asUInt)(XLEN - 1, 0)))

  }

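  // scalar writeback: fires either when the extra wakeup request returns
  // (misalignNeedWakeUp) or directly on MMIO / exception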
  io.writeBack.valid := req_valid && (bufferState === s_wb) && (io.splitLoadResp.valid && io.splitLoadResp.bits.misalignNeedWakeUp || globalMMIO || globalException) && !io.loadOutValid && !req.isvec
  io.writeBack.bits.uop := req.uop
  io.writeBack.bits.uop.exceptionVec := DontCare
  LduCfg.exceptionOut.map(no => io.writeBack.bits.uop.exceptionVec(no) := (globalMMIO || globalException) && exceptionVec(no))
  io.writeBack.bits.uop.rfWen := !globalException && !globalMMIO && req.uop.rfWen
  io.writeBack.bits.uop.fuType := FuType.ldu.U
  io.writeBack.bits.uop.flushPipe := false.B
  io.writeBack.bits.uop.replayInst := false.B
  io.writeBack.bits.data := newRdataHelper(data_select, combinedData)
  io.writeBack.bits.isFromLoadUnit := needWakeUpWB
  io.writeBack.bits.debug.isMMIO := globalMMIO
  // FIXME lyq: temporarily set to false
  io.writeBack.bits.debug.isNC := false.B
  io.writeBack.bits.debug.isPerfCnt := false.B
  io.writeBack.bits.debug.paddr := req.paddr
  io.writeBack.bits.debug.vaddr := req.vaddr


  // vector output
  io.vecWriteBack.valid := req_valid && (bufferState === s_wb) && !io.loadVecOutValid && req.isvec

  io.vecWriteBack.bits.alignedType          := req.alignedType
  io.vecWriteBack.bits.vecFeedback          := true.B
  io.vecWriteBack.bits.vecdata.get          := combinedData
  io.vecWriteBack.bits.isvec                := req.isvec
  io.vecWriteBack.bits.elemIdx              := req.elemIdx
  io.vecWriteBack.bits.elemIdxInsideVd.get  := req.elemIdxInsideVd
  io.vecWriteBack.bits.mask                 := req.mask
  io.vecWriteBack.bits.reg_offset.get       := 0.U
  io.vecWriteBack.bits.usSecondInv          := req.usSecondInv
  io.vecWriteBack.bits.mBIndex              := req.mbIndex
  io.vecWriteBack.bits.hit                  := true.B
  io.vecWriteBack.bits.sourceType           := RSFeedbackType.lrqFull
  io.vecWriteBack.bits.trigger              := TriggerAction.None
  io.vecWriteBack.bits.flushState           := DontCare
  io.vecWriteBack.bits.exceptionVec         := ExceptionNO.selectByFu(exceptionVec, VlduCfg)
  io.vecWriteBack.bits.hasException         := globalException
  io.vecWriteBack.bits.vaddr                := req.fullva
  io.vecWriteBack.bits.vaNeedExt            := req.vaNeedExt
  io.vecWriteBack.bits.gpaddr               := req.gpaddr
  io.vecWriteBack.bits.isForVSnonLeafPTE    := req.isForVSnonLeafPTE
  io.vecWriteBack.bits.mmio                 := DontCare
  io.vecWriteBack.bits.vstart               := req.uop.vpu.vstart
  io.vecWriteBack.bits.vecTriggerMask       := req.vecTriggerMask
  io.vecWriteBack.bits.nc                   := false.B


  val flush = req_valid && req.uop.robIdx.needFlush(io.redirect)

  when (flush) {
    bufferState := s_idle
    req_valid := false.B
    curPtr := 0.U
    unSentLoads := 0.U
    globalException := false.B
    globalMMIO := false.B
  }

  // NOTE: special case (a misaligned load crosses a page boundary and the page fault happens in the next page)
  // if the exception happens in the higher page, overwrite the loadExceptionBuffer vaddr
  val shouldOverwrite = req_valid && globalException
  val overwriteExpBuf = GatedValidRegNext(shouldOverwrite)
  val overwriteVaddr = RegEnable(
    Mux(
      cross16BytesBoundary && (curPtr === 1.U),
      splitLoadResp(curPtr).vaddr,
      splitLoadResp(curPtr).fullva),
    shouldOverwrite)
  val overwriteGpaddr = RegEnable(splitLoadResp(curPtr).gpaddr, shouldOverwrite)
  val overwriteIsHyper = RegEnable(splitLoadResp(curPtr).isHyper, shouldOverwrite)
  val overwriteIsForVSnonLeafPTE = RegEnable(splitLoadResp(curPtr).isForVSnonLeafPTE, shouldOverwrite)

  // TODO: in theory there is no need to overwrite; the signal is retained for now
  // and will be removed after sufficient verification.
  io.overwriteExpBuf.valid := false.B
  io.overwriteExpBuf.vaddr := overwriteVaddr
  io.overwriteExpBuf.isHyper := overwriteIsHyper
  io.overwriteExpBuf.gpaddr := overwriteGpaddr
  io.overwriteExpBuf.isForVSnonLeafPTE := overwriteIsForVSnonLeafPTE

  // when there is no exception or MMIO, flush the loadExceptionBuffer at s_wb
  val flushLdExpBuff = GatedValidRegNext(req_valid && (bufferState === s_wb) && !(globalMMIO || globalException))
  io.flushLdExpBuff := flushLdExpBuff

  XSPerfAccumulate("alloc",                  RegNext(!req_valid) && req_valid)
  XSPerfAccumulate("flush",                  flush)
  XSPerfAccumulate("flush_idle",             flush && (bufferState === s_idle))
  XSPerfAccumulate("flush_non_idle",         flush && (bufferState =/= s_idle))
624}