/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.fu.fpu.FPU
import xiangshan.backend.rob.RobLsqIO
import xiangshan.cache._
import xiangshan.frontend.FtqPtr
import xiangshan.ExceptionNO._
import xiangshan.cache.wpu.ReplayCarry
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles.{MemExuOutput, DynInst}

class StoreMisalignBuffer(implicit p: Parameters) extends XSModule
  with HasCircularQueuePtrHelper
{
  private val enqPortNum = StorePipelineWidth
  private val maxSplitNum = 2

  require(maxSplitNum == 2)

  private val SB = "b00".U(2.W)
  private val SH = "b01".U(2.W)
  private val SW = "b10".U(2.W)
  private val SD = "b11".U(2.W)

  // encoding of how many bytes to shift or truncate
  private val BYTE0 = "b000".U(3.W)
  private val BYTE1 = "b001".U(3.W)
  private val BYTE2 = "b010".U(3.W)
  private val BYTE3 = "b011".U(3.W)
  private val BYTE4 = "b100".U(3.W)
  private val BYTE5 = "b101".U(3.W)
  private val BYTE6 = "b110".U(3.W)
  private val BYTE7 = "b111".U(3.W)

  def getMask(sizeEncode: UInt) = LookupTree(sizeEncode, List(
    SB -> 0x1.U,
    SH -> 0x3.U,
    SW -> 0xf.U,
    SD -> 0xff.U
  ))

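  // selectOldest: divide-and-conquer reduction of (valid, bits) pairs down to the
  // single oldest valid request, ordered by robIdx and, for equal robIdx (e.g. two
  // uops of the same instruction), by uopIdx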
  def selectOldest[T <: LsPipelineBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    if (valid.length == 0 || valid.length == 1) {
      (valid, bits)
    } else if (valid.length == 2) {
      val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0)))))
      for (i <- res.indices) {
        res(i).valid := valid(i)
        res(i).bits := bits(i)
      }
      val oldest = Mux(valid(0) && valid(1),
        Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx) ||
          (isNotBefore(bits(0).uop.robIdx, bits(1).uop.robIdx) && bits(0).uop.uopIdx > bits(1).uop.uopIdx), res(1), res(0)),
        Mux(valid(0) && !valid(1), res(0), res(1)))
      (Seq(oldest.valid), Seq(oldest.bits))
    } else {
      val left = selectOldest(valid.take(valid.length / 2), bits.take(bits.length / 2))
      val right = selectOldest(valid.takeRight(valid.length - (valid.length / 2)), bits.takeRight(bits.length - (bits.length / 2)))
      selectOldest(left._1 ++ right._1, left._2 ++ right._2)
    }
  }

  val io = IO(new Bundle() {
    val redirect        = Flipped(Valid(new Redirect))
    val req             = Vec(enqPortNum, Flipped(Valid(new LsPipelineBundle)))
    val rob             = Flipped(new RobLsqIO)
    val splitStoreReq   = Decoupled(new LsPipelineBundle)
    val splitStoreResp  = Flipped(Valid(new SqWriteBundle))
    val writeBack       = Decoupled(new MemExuOutput)
    val overwriteExpBuf = Output(new XSBundle {
      val valid = Bool()
      val vaddr = UInt(XLEN.W)
      val isHyper = Bool()
      val gpaddr = UInt(XLEN.W)
      val isForVSnonLeafPTE = Bool()
    })
    val sqControl       = new StoreMaBufToSqControlIO
  })

  io.rob.mmio := 0.U.asTypeOf(Vec(LoadPipelineWidth, Bool()))
  io.rob.uop  := 0.U.asTypeOf(Vec(LoadPipelineWidth, new DynInst))

  val req_valid = RegInit(false.B)
  val req = Reg(new LsPipelineBundle)

  // enqueue
  // s1:
  val s1_req = VecInit(io.req.map(_.bits))
  val s1_valid = VecInit(io.req.map(x => x.valid))

  // s2: delay 1 cycle
  val s2_req = RegNext(s1_req)
  val s2_valid = (0 until enqPortNum).map(i =>
    RegNext(s1_valid(i)) &&
    !s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) &&
    !s2_req(i).uop.robIdx.needFlush(io.redirect)
  )
  val s2_miss_aligned = s2_req.map(x =>
    x.uop.exceptionVec(storeAddrMisaligned) && !x.uop.exceptionVec(breakPoint) && !TriggerAction.isDmode(x.uop.trigger)
  )

  val s2_enqueue = Wire(Vec(enqPortNum, Bool()))
  for (w <- 0 until enqPortNum) {
    s2_enqueue(w) := s2_valid(w) && s2_miss_aligned(w)
  }

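  // req_valid tracking: when a redirect flushes the buffered store, it can be
  // replaced by a same-cycle enqueue (or dropped); otherwise any enqueue sets it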
  when (req_valid && req.uop.robIdx.needFlush(io.redirect)) {
    req_valid := s2_enqueue.asUInt.orR
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req_valid := true.B
  }

  val reqSel = selectOldest(s2_enqueue, s2_req)

  when (req_valid) {
    req := Mux(
      reqSel._1(0) && (isAfter(req.uop.robIdx, reqSel._2(0).uop.robIdx) || (isNotBefore(req.uop.robIdx, reqSel._2(0).uop.robIdx) && req.uop.uopIdx > reqSel._2(0).uop.uopIdx)),
      reqSel._2(0),
      req)
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req := reqSel._2(0)
  }

  val robMatch = req_valid && io.rob.pendingst && (io.rob.pendingPtr === req.uop.robIdx)

  // buffer control:
  //  - split the misaligned store into aligned stores
  //  - send the split stores to the STA pipeline and collect the results
  //  - control the SQ to write the split data into the SB
  //  - control the SQ to write this store back
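  //
  // state machine:
  //  - s_idle  : wait until the ROB marks the buffered store as its pending store
  //  - s_split : generate the split requests
  //  - s_req   : send split requests to the STA pipeline
  //  - s_resp  : collect STA results (exception / MMIO / replay)
  //  - s_cal   : align write data and masks for each split part
  //  - s_sq_req: drive the SQ to write the split parts into the SB
  //  - s_wb    : write the store back
  //  - s_wait  : wait for ROB commit or flush, then reset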
  val s_idle :: s_split :: s_req :: s_resp :: s_cal :: s_sq_req :: s_wb :: s_wait :: Nil = Enum(8)
  val bufferState = RegInit(s_idle)
  val splitStoreReqs = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LsPipelineBundle))))
  val splitStoreResp = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new SqWriteBundle))))
  val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec()))
  val unSentStores  = RegInit(0.U(maxSplitNum.W))
  val unWriteStores = RegInit(0.U(maxSplitNum.W))
  val curPtr = RegInit(0.U(log2Ceil(maxSplitNum).W))

  // whether an exception or MMIO access occurred in any split store
  val globalException = RegInit(false.B)
  val globalMMIO = RegInit(false.B)

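  // exception/MMIO results are only taken from split responses that do not need replay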
  val hasException = ExceptionNO.selectByFu(io.splitStoreResp.bits.uop.exceptionVec, StaCfg).asUInt.orR && !io.splitStoreResp.bits.need_rep
  val isMMIO = io.splitStoreResp.bits.mmio && !io.splitStoreResp.bits.need_rep

  switch(bufferState) {
    is (s_idle) {
      when (robMatch) {
        bufferState := s_split
      }
    }

    is (s_split) {
      bufferState := s_req
    }

    is (s_req) {
      when (io.splitStoreReq.fire) {
        bufferState := s_resp
      }
    }

    is (s_resp) {
      when (io.splitStoreResp.valid) {
        val clearOh = UIntToOH(curPtr)
        when (hasException || isMMIO) {
          // commit directly when an exception occurs
          // if any split store reaches MMIO space, delegate to software as a storeAddrMisaligned exception
          bufferState := s_wb
          globalException := hasException
          globalMMIO := isMMIO
        } .elsewhen(io.splitStoreResp.bits.need_rep || (unSentStores & ~clearOh).orR) {
          // need replay or still has unsent requests
          bufferState := s_req
        } .otherwise {
          // got all results; go on to calculate data and control the SQ
          bufferState := s_cal
        }
      }
    }

    is (s_cal) {
      when (io.sqControl.storeInfo.dataReady) {
        bufferState := s_sq_req
        curPtr := 0.U
      }
    }

    is (s_sq_req) {
      when (io.sqControl.storeInfo.completeSbTrans) {
        when (!((unWriteStores & ~UIntToOH(curPtr)).orR)) {
          bufferState := s_wb
        }
      }
    }

    is (s_wb) {
      when (io.writeBack.fire) {
        bufferState := s_wait
      }
    }

    is (s_wait) {
      when (io.rob.scommit =/= 0.U || req.uop.robIdx.needFlush(io.redirect)) {
        // the ROB committed the unaligned store or handled its exception; reset all state
        bufferState := s_idle
        req_valid := false.B
        curPtr := 0.U
        unSentStores := 0.U
        unWriteStores := 0.U
        globalException := false.B
        globalMMIO := false.B
      }
    }
  }

  val highAddress = LookupTree(req.uop.fuOpType(1, 0), List(
    SB -> 0.U,
    SH -> 1.U,
    SW -> 3.U,
    SD -> 7.U
  )) + req.vaddr(4, 0)
  // check whether (vaddr + opSize - 1) and vaddr fall into the same 16-byte region
  val cross16BytesBoundary = req_valid && (highAddress(4) =/= req.vaddr(4))
  val aligned16BytesAddr   = (req.vaddr >> 4) << 4 // req.vaddr & ~("b1111".U)
  val aligned16BytesSel    = req.vaddr(3, 0)
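
  // worked example: an sd (opSize 8) to a vaddr with vaddr(4, 0) = 0x09 gives
  // highAddress = 7 + 0x09 = 0x10, so highAddress(4) = 1 while vaddr(4) = 0:
  // the access crosses the 16-byte boundary and must be split in two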

  // meta of the 128-bit store
  val new128Store = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  // meta of the split stores
  val lowAddrStore  = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val highAddrStore = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  // final lowResult = Cat(`lowResultWidth` bytes of store data, zero padding up to VLEN)
  val lowResultWidth = RegInit(0.U(3.W)) // how many bytes of the store data go into the low half
  // final highResult = ZeroExt(store data >> (8 * lowResultWidth), VLEN)
  val highResultWidth = RegInit(0.U(3.W)) // how many bytes of the store data go into the high half

  when (bufferState === s_split) {
    when (!cross16BytesBoundary) {
      // change this unaligned store into a 128-bit store
      unWriteStores := 1.U
      unSentStores := 1.U
      curPtr := 0.U
      new128Store.vaddr := aligned16BytesAddr
      // new128Store.mask  := (getMask(req.uop.fuOpType(1, 0)) << aligned16BytesSel).asUInt
      new128Store.mask  := 0xffff.U
      new128Store.uop   := req.uop
      new128Store.uop.exceptionVec(storeAddrMisaligned) := false.B
      new128Store.is128bit := true.B
      splitStoreReqs(0) := new128Store
    } .otherwise {
      // split this unaligned store into `maxSplitNum` aligned stores
      unWriteStores := Fill(maxSplitNum, 1.U(1.W))
      unSentStores := Fill(maxSplitNum, 1.U(1.W))
      curPtr := 0.U
      lowAddrStore.uop := req.uop
      lowAddrStore.uop.exceptionVec(storeAddrMisaligned) := false.B
      highAddrStore.uop := req.uop
      highAddrStore.uop.exceptionVec(storeAddrMisaligned) := false.B

      switch (req.uop.fuOpType(1, 0)) {
        is (SB) {
          assert(false.B, "sb should not trigger misalign")
        }

        is (SH) {
          lowAddrStore.uop.fuOpType := SB
          lowAddrStore.vaddr := req.vaddr
          lowAddrStore.mask  := 0x1.U << lowAddrStore.vaddr(3, 0)
          lowResultWidth    := BYTE1

          highAddrStore.uop.fuOpType := SB
          highAddrStore.vaddr := req.vaddr + 1.U
          highAddrStore.mask  := 0x1.U << highAddrStore.vaddr(3, 0)
          highResultWidth    := BYTE1
        }

        is (SW) {
          switch (req.vaddr(1, 0)) {
            is ("b00".U) {
              assert(false.B, "should not trigger misalign")
            }

            is ("b01".U) {
              lowAddrStore.uop.fuOpType := SW
              lowAddrStore.vaddr := req.vaddr - 1.U
              lowAddrStore.mask  := 0xf.U << lowAddrStore.vaddr(3, 0)
              lowResultWidth    := BYTE3

              highAddrStore.uop.fuOpType := SB
              highAddrStore.vaddr := req.vaddr + 3.U
              highAddrStore.mask  := 0x1.U << highAddrStore.vaddr(3, 0)
              highResultWidth    := BYTE1
            }

            is ("b10".U) {
              lowAddrStore.uop.fuOpType := SH
              lowAddrStore.vaddr := req.vaddr
              lowAddrStore.mask  := 0x3.U << lowAddrStore.vaddr(3, 0)
              lowResultWidth    := BYTE2

              highAddrStore.uop.fuOpType := SH
              highAddrStore.vaddr := req.vaddr + 2.U
              highAddrStore.mask  := 0x3.U << highAddrStore.vaddr(3, 0)
              highResultWidth    := BYTE2
            }

            is ("b11".U) {
              lowAddrStore.uop.fuOpType := SB
              lowAddrStore.vaddr := req.vaddr
              lowAddrStore.mask  := 0x1.U << lowAddrStore.vaddr(3, 0)
              lowResultWidth    := BYTE1

              highAddrStore.uop.fuOpType := SW
              highAddrStore.vaddr := req.vaddr + 1.U
              highAddrStore.mask  := 0xf.U << highAddrStore.vaddr(3, 0)
              highResultWidth    := BYTE3
            }
          }
        }

        is (SD) {
          switch (req.vaddr(2, 0)) {
            is ("b000".U) {
              assert(false.B, "should not trigger misalign")
            }

            is ("b001".U) {
              lowAddrStore.uop.fuOpType := SD
              lowAddrStore.vaddr := req.vaddr - 1.U
              lowAddrStore.mask  := 0xff.U << lowAddrStore.vaddr(3, 0)
              lowResultWidth    := BYTE7

              highAddrStore.uop.fuOpType := SB
              highAddrStore.vaddr := req.vaddr + 7.U
              highAddrStore.mask  := 0x1.U << highAddrStore.vaddr(3, 0)
              highResultWidth    := BYTE1
            }

            is ("b010".U) {
              lowAddrStore.uop.fuOpType := SD
              lowAddrStore.vaddr := req.vaddr - 2.U
              lowAddrStore.mask  := 0xff.U << lowAddrStore.vaddr(3, 0)
              lowResultWidth    := BYTE6

              highAddrStore.uop.fuOpType := SH
              highAddrStore.vaddr := req.vaddr + 6.U
              highAddrStore.mask  := 0x3.U << highAddrStore.vaddr(3, 0)
              highResultWidth    := BYTE2
            }

            is ("b011".U) {
              lowAddrStore.uop.fuOpType := SD
              lowAddrStore.vaddr := req.vaddr - 3.U
              lowAddrStore.mask  := 0xff.U << lowAddrStore.vaddr(3, 0)
              lowResultWidth    := BYTE5

              highAddrStore.uop.fuOpType := SW
              highAddrStore.vaddr := req.vaddr + 5.U
              highAddrStore.mask  := 0xf.U << highAddrStore.vaddr(3, 0)
              highResultWidth    := BYTE3
            }

            is ("b100".U) {
              lowAddrStore.uop.fuOpType := SW
              lowAddrStore.vaddr := req.vaddr
              lowAddrStore.mask  := 0xf.U << lowAddrStore.vaddr(3, 0)
              lowResultWidth    := BYTE4

              highAddrStore.uop.fuOpType := SW
              highAddrStore.vaddr := req.vaddr + 4.U
              highAddrStore.mask  := 0xf.U << highAddrStore.vaddr(3, 0)
              highResultWidth    := BYTE4
            }

            is ("b101".U) {
              lowAddrStore.uop.fuOpType := SD
              lowAddrStore.vaddr := req.vaddr - 5.U
              lowAddrStore.mask  := 0xff.U << lowAddrStore.vaddr(3, 0)
              lowResultWidth    := BYTE3

              highAddrStore.uop.fuOpType := SD
              highAddrStore.vaddr := req.vaddr + 3.U
              highAddrStore.mask  := 0xff.U << highAddrStore.vaddr(3, 0)
              highResultWidth    := BYTE5
            }

            is ("b110".U) {
              lowAddrStore.uop.fuOpType := SD
              lowAddrStore.vaddr := req.vaddr - 6.U
              lowAddrStore.mask  := 0xff.U << lowAddrStore.vaddr(3, 0)
              lowResultWidth    := BYTE2

              highAddrStore.uop.fuOpType := SD
              highAddrStore.vaddr := req.vaddr + 2.U
              highAddrStore.mask  := 0xff.U << highAddrStore.vaddr(3, 0)
              highResultWidth    := BYTE6
            }

            is ("b111".U) {
              lowAddrStore.uop.fuOpType := SD
              lowAddrStore.vaddr := req.vaddr - 7.U
              lowAddrStore.mask  := 0xff.U << lowAddrStore.vaddr(3, 0)
              lowResultWidth    := BYTE1

              highAddrStore.uop.fuOpType := SD
              highAddrStore.vaddr := req.vaddr + 1.U
              highAddrStore.mask  := 0xff.U << highAddrStore.vaddr(3, 0)
              highResultWidth    := BYTE7
            }
          }
        }
      }

      splitStoreReqs(0) := lowAddrStore
      splitStoreReqs(1) := highAddrStore
    }
  }
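
  // worked example (from the table above): an sd with vaddr(2, 0) = b101 keeps
  // its low 3 bytes in the current 8-byte word and spills 5 bytes into the next
  // one, so the low split is an aligned sd at vaddr - 5 taking BYTE3 of the data,
  // and the high split is an aligned sd at vaddr + 3 taking BYTE5 of it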

  io.splitStoreReq.valid := req_valid && (bufferState === s_req)
  io.splitStoreReq.bits  := splitStoreReqs(curPtr)
  // restore the H-extension (hsv) store information
  // fuOpType bit encoding: | hsv (1 bit) | store 00 | size (2 bits) |
  val reqIsHsv  = LSUOpType.isHsv(req.uop.fuOpType)
  io.splitStoreReq.bits.uop.fuOpType := Cat(reqIsHsv, 0.U(2.W), splitStoreReqs(curPtr).uop.fuOpType(1, 0))
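  // e.g. a byte-sized split of an hsv.d becomes Cat(1.U, "b00".U, SB), so the
  // hypervisor-store attribute survives the split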

  when (io.splitStoreResp.valid) {
    val resp = io.splitStoreResp.bits
    splitStoreResp(curPtr) := io.splitStoreResp.bits
    when (isMMIO) {
      unWriteStores := 0.U
      unSentStores := 0.U
      exceptionVec := ExceptionNO.selectByFu(0.U.asTypeOf(exceptionVec.cloneType), StaCfg)
      // delegate to software
      exceptionVec(storeAddrMisaligned) := true.B
    } .elsewhen (hasException) {
      unWriteStores := 0.U
      unSentStores := 0.U
      StaCfg.exceptionOut.map(no => exceptionVec(no) := exceptionVec(no) || resp.uop.exceptionVec(no))
    } .elsewhen (!io.splitStoreResp.bits.need_rep) {
      unSentStores := unSentStores & ~UIntToOH(curPtr)
      curPtr := curPtr + 1.U
      exceptionVec := 0.U.asTypeOf(ExceptionVec())
    }
  }

  val splitStoreData = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new XSBundle {
    val wdata = UInt(VLEN.W)
    val wmask = UInt((VLEN / 8).W)
  }))))

  val unalignedStoreData = io.sqControl.storeInfo.data
  val wmaskLow  = Wire(Vec(VLEN / 8, Bool()))
  val wmaskHigh = Wire(Vec(VLEN / 8, Bool()))
  for (i <- 0 until (VLEN / 8)) {
    wmaskHigh(i) := i.U < highResultWidth
    wmaskLow(i)  := i.U < lowResultWidth
  }
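
  // note: wmaskLow is reversed before use (see below) because the low-half data
  // is left-aligned within VLEN, while the high-half data is right-aligned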

  when (bufferState === s_cal) {
    when (!cross16BytesBoundary) {
      splitStoreData(0).wdata := LookupTree(aligned16BytesSel, List(
        "b0000".U ->     unalignedStoreData,
        "b0001".U -> Cat(unalignedStoreData, 0.U(( 1 * 8).W)),
        "b0010".U -> Cat(unalignedStoreData, 0.U(( 2 * 8).W)),
        "b0011".U -> Cat(unalignedStoreData, 0.U(( 3 * 8).W)),
        "b0100".U -> Cat(unalignedStoreData, 0.U(( 4 * 8).W)),
        "b0101".U -> Cat(unalignedStoreData, 0.U(( 5 * 8).W)),
        "b0110".U -> Cat(unalignedStoreData, 0.U(( 6 * 8).W)),
        "b0111".U -> Cat(unalignedStoreData, 0.U(( 7 * 8).W)),
        "b1000".U -> Cat(unalignedStoreData, 0.U(( 8 * 8).W)),
        "b1001".U -> Cat(unalignedStoreData, 0.U(( 9 * 8).W)),
        "b1010".U -> Cat(unalignedStoreData, 0.U((10 * 8).W)),
        "b1011".U -> Cat(unalignedStoreData, 0.U((11 * 8).W)),
        "b1100".U -> Cat(unalignedStoreData, 0.U((12 * 8).W)),
        "b1101".U -> Cat(unalignedStoreData, 0.U((13 * 8).W)),
        "b1110".U -> Cat(unalignedStoreData, 0.U((14 * 8).W)),
        "b1111".U -> Cat(unalignedStoreData, 0.U((15 * 8).W))
      ))(VLEN - 1, 0)
      splitStoreData(0).wmask := getMask(req.uop.fuOpType(1, 0)) << aligned16BytesSel
    } .otherwise {
      // low 16-byte part
      val catData = LookupTree(lowResultWidth, List(
        BYTE0 -> unalignedStoreData,
        BYTE1 -> Cat(unalignedStoreData, 0.U((8 * 15).W)),
        BYTE2 -> Cat(unalignedStoreData, 0.U((8 * 14).W)),
        BYTE3 -> Cat(unalignedStoreData, 0.U((8 * 13).W)),
        BYTE4 -> Cat(unalignedStoreData, 0.U((8 * 12).W)),
        BYTE5 -> Cat(unalignedStoreData, 0.U((8 * 11).W)),
        BYTE6 -> Cat(unalignedStoreData, 0.U((8 * 10).W)),
        BYTE7 -> Cat(unalignedStoreData, 0.U((8 *  9).W))
      ))
      splitStoreData(0).wdata := catData(VLEN - 1, 0)
      splitStoreData(0).wmask := VecInit(wmaskLow.reverse).asUInt
      // high 16-byte part
      val shiftData = LookupTree(lowResultWidth, List(
        BYTE0 -> unalignedStoreData(VLEN - 1,    0),
        BYTE1 -> unalignedStoreData(VLEN - 1,    8),
        BYTE2 -> unalignedStoreData(VLEN - 1,   16),
        BYTE3 -> unalignedStoreData(VLEN - 1,   24),
        BYTE4 -> unalignedStoreData(VLEN - 1,   32),
        BYTE5 -> unalignedStoreData(VLEN - 1,   40),
        BYTE6 -> unalignedStoreData(VLEN - 1,   48),
        BYTE7 -> unalignedStoreData(VLEN - 1,   56)
      ))
      splitStoreData(1).wdata := LookupTree(highResultWidth, List(
        BYTE0 -> ZeroExt(shiftData, VLEN),
        BYTE1 -> ZeroExt(shiftData(7,    0), VLEN),
        BYTE2 -> ZeroExt(shiftData(15,   0), VLEN),
        BYTE3 -> ZeroExt(shiftData(23,   0), VLEN),
        BYTE4 -> ZeroExt(shiftData(31,   0), VLEN),
        BYTE5 -> ZeroExt(shiftData(39,   0), VLEN),
        BYTE6 -> ZeroExt(shiftData(47,   0), VLEN),
        BYTE7 -> ZeroExt(shiftData(55,   0), VLEN)
      ))
      splitStoreData(1).wmask := wmaskHigh.asUInt
    }
  }
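
  // worked example, continuing the sd with vaddr(2, 0) = b101 from above:
  // lowResultWidth = BYTE3, so catData left-shifts the data by 13 bytes, keeping
  // its low 3 bytes at the top of VLEN, and the reversed wmaskLow marks those top
  // 3 bytes; shiftData drops those 3 bytes, and the high part zero-extends the
  // remaining 5 bytes (BYTE5) at the bottom of VLEN, with wmaskHigh bits 0-4 set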

  io.sqControl.control.hasException := req_valid && globalException

  io.sqControl.control.writeSb := bufferState === s_sq_req
  io.sqControl.control.wdata   := splitStoreData(curPtr).wdata
  io.sqControl.control.wmask   := splitStoreData(curPtr).wmask
  // the paddr and vaddr here belong to the current split request, not to the exact addr of the original store
  io.sqControl.control.paddr   := splitStoreResp(curPtr).paddr
  io.sqControl.control.vaddr   := splitStoreResp(curPtr).vaddr
  io.sqControl.control.last    := !((unWriteStores & ~UIntToOH(curPtr)).orR)

  when (bufferState === s_sq_req) {
    when (io.sqControl.storeInfo.completeSbTrans) {
      unWriteStores := unWriteStores & ~UIntToOH(curPtr)
      curPtr := curPtr + 1.U
    }
  }
  io.writeBack.valid := req_valid && (bufferState === s_wb) && io.sqControl.storeInfo.dataReady
  io.writeBack.bits.uop := req.uop
  io.writeBack.bits.uop.exceptionVec := DontCare
  StaCfg.exceptionOut.map(no => io.writeBack.bits.uop.exceptionVec(no) := (globalMMIO || globalException) && exceptionVec(no))
  io.writeBack.bits.uop.flushPipe := Mux(globalMMIO || globalException, false.B, true.B)
  io.writeBack.bits.uop.replayInst := false.B
  io.writeBack.bits.data := unalignedStoreData
  io.writeBack.bits.isFromLoadUnit := DontCare
  io.writeBack.bits.debug.isMMIO := globalMMIO
  io.writeBack.bits.debug.isPerfCnt := false.B
  io.writeBack.bits.debug.paddr := req.paddr
  io.writeBack.bits.debug.vaddr := req.vaddr

  io.sqControl.control.removeSq := req_valid && (bufferState === s_wait) && !(globalMMIO || globalException) && (io.rob.scommit =/= 0.U)

  val flush = req_valid && req.uop.robIdx.needFlush(io.redirect)

  when (flush && (bufferState =/= s_idle)) {
    bufferState := s_idle
    req_valid := false.B
    curPtr := 0.U
    unSentStores := 0.U
    unWriteStores := 0.U
    globalException := false.B
    globalMMIO := false.B
  }

  // NOTE: special case: an unaligned store crosses a page boundary and the page fault happens on the higher page
  // if the exception comes from the higher-address part, overwrite the storeExceptionBuffer vaddr
  val shouldOverwrite = req_valid && cross16BytesBoundary && globalException && (curPtr === 1.U)
  val overwriteExpBuf = GatedValidRegNext(shouldOverwrite)
  val overwriteVaddr = RegEnable(splitStoreResp(curPtr).vaddr, shouldOverwrite)
  val overwriteIsHyper = RegEnable(splitStoreResp(curPtr).isHyper, shouldOverwrite)
  val overwriteGpaddr = RegEnable(splitStoreResp(curPtr).gpaddr, shouldOverwrite)
  val overwriteIsForVSnonLeafPTE = RegEnable(splitStoreResp(curPtr).isForVSnonLeafPTE, shouldOverwrite)

  io.overwriteExpBuf.valid := overwriteExpBuf
  io.overwriteExpBuf.vaddr := overwriteVaddr
  io.overwriteExpBuf.isHyper := overwriteIsHyper
  io.overwriteExpBuf.gpaddr := overwriteGpaddr
  io.overwriteExpBuf.isForVSnonLeafPTE := overwriteIsForVSnonLeafPTE

  XSPerfAccumulate("alloc",                  RegNext(!req_valid) && req_valid)
  XSPerfAccumulate("flush",                  flush)
  XSPerfAccumulate("flush_idle",             flush && (bufferState === s_idle))
  XSPerfAccumulate("flush_non_idle",         flush && (bufferState =/= s_idle))
}
618}