/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.fu.fpu.FPU
import xiangshan.backend.rob.RobLsqIO
import xiangshan.cache._
import xiangshan.frontend.FtqPtr
import xiangshan.ExceptionNO._
import xiangshan.cache.wpu.ReplayCarry
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles.{MemExuOutput, DynInst}

class LoadMisalignBuffer(implicit p: Parameters) extends XSModule
  with HasCircularQueuePtrHelper
  with HasLoadHelper
{
  private val enqPortNum = LoadPipelineWidth
  private val maxSplitNum = 2

  require(maxSplitNum == 2)

  private val LB = "b00".U(2.W)
  private val LH = "b01".U(2.W)
  private val LW = "b10".U(2.W)
  private val LD = "b11".U(2.W)
  // encoding of how many bytes to shift or truncate
  private val BYTE0 = "b000".U(3.W)
  private val BYTE1 = "b001".U(3.W)
  private val BYTE2 = "b010".U(3.W)
  private val BYTE3 = "b011".U(3.W)
  private val BYTE4 = "b100".U(3.W)
  private val BYTE5 = "b101".U(3.W)
  private val BYTE6 = "b110".U(3.W)
  private val BYTE7 = "b111".U(3.W)

  def getMask(sizeEncode: UInt) = LookupTree(sizeEncode, List(
    LB -> 0x1.U, // lb
    LH -> 0x3.U, // lh
    LW -> 0xf.U, // lw
    LD -> 0xff.U  // ld
  ))
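
  // Example: getMask(LW) = "b1111" is the byte mask of a 4-byte load; shifted by
  // the in-line byte offset it marks the byte lanes the access touches,
  // e.g. getMask(LW) << 5 sets lanes 5 to 8 of a 16-byte line.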

  def getShiftAndTruncateData(shiftEncode: UInt, truncateEncode: UInt, data: UInt) = {
    val shiftData = LookupTree(shiftEncode, List(
      BYTE0 -> data(63,    0),
      BYTE1 -> data(63,    8),
      BYTE2 -> data(63,   16),
      BYTE3 -> data(63,   24),
      BYTE4 -> data(63,   32),
      BYTE5 -> data(63,   40),
      BYTE6 -> data(63,   48),
      BYTE7 -> data(63,   56)
    ))
    val truncateData = LookupTree(truncateEncode, List(
      BYTE0 -> 0.U(XLEN.W), // cannot truncate to a width of 0 bytes
      BYTE1 -> shiftData(7,    0),
      BYTE2 -> shiftData(15,   0),
      BYTE3 -> shiftData(23,   0),
      BYTE4 -> shiftData(31,   0),
      BYTE5 -> shiftData(39,   0),
      BYTE6 -> shiftData(47,   0),
      BYTE7 -> shiftData(55,   0)
    ))
    truncateData(XLEN - 1, 0)
  }
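
  // Worked example: with data = 0x8877665544332211,
  // getShiftAndTruncateData(BYTE1, BYTE3, data) first drops the lowest byte
  // (shiftData = 0x0088776655443322) and then keeps the lowest three bytes of
  // the shifted value, yielding 0x443322, i.e. bytes [1, 3] of the original data.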

  def selectOldest[T <: LqWriteBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = {
    assert(valid.length == bits.length)
    if (valid.length == 0 || valid.length == 1) {
      (valid, bits)
    } else if (valid.length == 2) {
      val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0)))))
      for (i <- res.indices) {
        res(i).valid := valid(i)
        res(i).bits := bits(i)
      }
      val oldest = Mux(valid(0) && valid(1),
        Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx) ||
          (isNotBefore(bits(0).uop.robIdx, bits(1).uop.robIdx) && bits(0).uop.uopIdx > bits(1).uop.uopIdx), res(1), res(0)),
        Mux(valid(0) && !valid(1), res(0), res(1)))
      (Seq(oldest.valid), Seq(oldest.bits))
    } else {
      val left = selectOldest(valid.take(valid.length / 2), bits.take(bits.length / 2))
      val right = selectOldest(valid.takeRight(valid.length - (valid.length / 2)), bits.takeRight(bits.length - (bits.length / 2)))
      selectOldest(left._1 ++ right._1, left._2 ++ right._2)
    }
  }
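
  // selectOldest picks, among the valid entries, the request that is oldest in
  // program order: a smaller robIdx wins, and for entries with the same robIdx
  // (e.g. uops split from one instruction) the smaller uopIdx wins. With a
  // single valid entry that entry is returned unchanged.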

  val io = IO(new Bundle() {
    val redirect        = Flipped(Valid(new Redirect))
    val req             = Vec(enqPortNum, Flipped(Valid(new LqWriteBundle)))
    val rob             = Flipped(new RobLsqIO)
    val splitLoadReq    = Decoupled(new LsPipelineBundle)
    val splitLoadResp   = Flipped(Valid(new LqWriteBundle))
    val writeBack       = Decoupled(new MemExuOutput)
    val overwriteExpBuf = Output(new XSBundle {
      val valid = Bool()
      val vaddr = UInt(VAddrBits.W)
    })
    val flushLdExpBuff  = Output(Bool())
  })
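
  // Port overview: misaligned loads arrive on io.req from the load pipeline; the
  // selected request is reissued as aligned accesses through io.splitLoadReq /
  // io.splitLoadResp, and the merged result leaves through io.writeBack once the
  // ROB is pending on this load. io.overwriteExpBuf and io.flushLdExpBuff keep
  // the load exception buffer consistent with the split accesses (see the notes
  // near the end of this module).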

  io.rob.mmio := 0.U.asTypeOf(Vec(LoadPipelineWidth, Bool()))
  io.rob.uop  := 0.U.asTypeOf(Vec(LoadPipelineWidth, new DynInst))

  val req_valid = RegInit(false.B)
  val req = Reg(new LqWriteBundle)

  // enqueue
  // s1:
  val s1_req = VecInit(io.req.map(_.bits))
  val s1_valid = VecInit(io.req.map(x => x.valid))

  // s2: delay 1 cycle
  val s2_req = RegNext(s1_req)
  val s2_valid = (0 until enqPortNum).map(i =>
    RegNext(s1_valid(i)) &&
    !s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) &&
    !s2_req(i).uop.robIdx.needFlush(io.redirect)
  )
  val s2_miss_aligned = s2_req.map(x => x.uop.exceptionVec(loadAddrMisaligned))

  val s2_enqueue = Wire(Vec(enqPortNum, Bool()))
  for (w <- 0 until enqPortNum) {
    s2_enqueue(w) := s2_valid(w) && s2_miss_aligned(w)
  }

  when (req_valid && req.uop.robIdx.needFlush(io.redirect)) {
    req_valid := s2_enqueue.asUInt.orR
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req_valid := req_valid || true.B
  }

  val reqSel = selectOldest(s2_enqueue, s2_req)

  when (req_valid) {
    req := Mux(
      reqSel._1(0) && (isAfter(req.uop.robIdx, reqSel._2(0).uop.robIdx) || (isNotBefore(req.uop.robIdx, reqSel._2(0).uop.robIdx) && req.uop.uopIdx > reqSel._2(0).uop.uopIdx)),
      reqSel._2(0),
      req)
  } .elsewhen (s2_enqueue.asUInt.orR) {
    req := reqSel._2(0)
  }
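
  // Only one misaligned load is tracked at a time: an incoming request replaces
  // the buffered one only when it is older in program order (smaller robIdx, or
  // equal robIdx with a smaller uopIdx), so the buffer always holds the oldest
  // pending misaligned load seen so far.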

  val robMatch = req_valid && io.rob.pendingld && (io.rob.pendingPtr === req.uop.robIdx)

  // buffer control:
  //  - split the misaligned load into aligned loads
  //  - send the split loads to the LDU and collect the results
  //  - merge the results and write back to the ROB
  val s_idle :: s_split :: s_req :: s_resp :: s_comb :: s_wb :: s_wait :: Nil = Enum(7)
  val bufferState = RegInit(s_idle)
  val splitLoadReqs = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LsPipelineBundle))))
  val splitLoadResp = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LqWriteBundle))))
  val unSentLoads = RegInit(0.U(maxSplitNum.W))
  val curPtr = RegInit(0.U(log2Ceil(maxSplitNum).W))

  // whether any split load raised an exception or accessed MMIO space
  val globalException = RegInit(false.B)
  val globalMMIO = RegInit(false.B)

  val hasException = ExceptionNO.selectByFu(io.splitLoadResp.bits.uop.exceptionVec, LduCfg).asUInt.orR
  val isMMIO = io.splitLoadResp.bits.mmio

  switch(bufferState) {
    is (s_idle) {
      when (robMatch) {
        bufferState := s_split
      }
    }

    is (s_split) {
      bufferState := s_req
    }

    is (s_req) {
      when (io.splitLoadReq.fire) {
        bufferState := s_resp
      }
    }

    is (s_resp) {
      when (io.splitLoadResp.valid) {
        val clearOh = UIntToOH(curPtr)
        when (hasException || isMMIO) {
          // commit directly when an exception occurs
          // if any split load reaches MMIO space, delegate to software via a loadAddrMisaligned exception
          bufferState := s_wb
          globalException := hasException
          globalMMIO := isMMIO
        } .elsewhen(io.splitLoadResp.bits.rep_info.need_rep || (unSentLoads & ~clearOh).orR) {
          // the response needs replay, or some split requests are still unsent
          bufferState := s_req
        } .otherwise {
          // merge the split load results
          bufferState := s_comb
        }
      }
    }

    is (s_comb) {
      bufferState := s_wb
    }

    is (s_wb) {
      when(io.writeBack.fire) {
        bufferState := s_wait
      }
    }

    is (s_wait) {
      when(io.rob.lcommit =/= 0.U || req.uop.robIdx.needFlush(io.redirect)) {
        // the ROB has committed the unaligned load or handled the exception, reset all state
        bufferState := s_idle
        req_valid := false.B
        curPtr := 0.U
        unSentLoads := 0.U
        globalException := false.B
        globalMMIO := false.B
      }
    }
  }
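
  // Typical flow through the state machine:
  //   s_idle  -> s_split : the ROB is pending on the buffered load (robMatch)
  //   s_split -> s_req   : split into one 128-bit access or two aligned accesses
  //   s_req   -> s_resp  : a split request is accepted by the load pipeline
  //   s_resp  -> s_req   : the response needs replay, or another split is still unsent
  //   s_resp  -> s_wb    : an exception or an MMIO access is reported (skip merging)
  //   s_resp  -> s_comb  : all splits completed normally; merge their data
  //   s_comb  -> s_wb -> s_wait -> s_idle : write back, then wait for commit or flush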

  val highAddress = LookupTree(req.uop.fuOpType(1, 0), List(
    LB -> 0.U,
    LH -> 1.U,
    LW -> 3.U,
    LD -> 7.U
  )) + req.vaddr(4, 0)
  // check whether (vaddr + opSize - 1) and vaddr fall in the same 16-byte region
  val cross16BytesBoundary = req_valid && (highAddress(4) =/= req.vaddr(4))
  val aligned16BytesAddr   = (req.vaddr >> 4) << 4 // req.vaddr & ~("b1111".U)
  val aligned16BytesSel    = req.vaddr(3, 0)
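
  // Example of the boundary check: an ld (8 bytes) whose vaddr ends in 0x0b has
  // highAddress = 0x0b + 7 = 0x12, so bit 4 flips and the access crosses a
  // 16-byte boundary; an lw whose vaddr ends in 0x09 gives 0x09 + 3 = 0x0c, the
  // bit does not flip, and the whole access fits in one aligned 16-byte region.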

  // metadata of the 128-bit load
  val new128Load = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  // metadata of the split loads
  val lowAddrLoad  = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val highAddrLoad = WireInit(0.U.asTypeOf(new LsPipelineBundle))
  val lowResultShift = RegInit(0.U(3.W)) // how many bytes to shift right once the result returns
  val lowResultWidth = RegInit(0.U(3.W)) // how many bytes to take from the result
  val highResultShift = RegInit(0.U(3.W))
  val highResultWidth = RegInit(0.U(3.W))

  when (bufferState === s_split) {
    when (!cross16BytesBoundary) {
      // turn this unaligned load into a single 128-bit load
      unSentLoads := 1.U
      curPtr := 0.U
      new128Load.vaddr := aligned16BytesAddr
      // new128Load.mask  := (getMask(req.uop.fuOpType(1, 0)) << aligned16BytesSel).asUInt
      new128Load.mask  := 0xffff.U
      new128Load.uop   := req.uop
      new128Load.uop.exceptionVec(loadAddrMisaligned) := false.B
      new128Load.is128bit := true.B
      splitLoadReqs(0) := new128Load
    } .otherwise {
      // split this unaligned load into `maxSplitNum` aligned loads
      unSentLoads := Fill(maxSplitNum, 1.U(1.W))
      curPtr := 0.U
      lowAddrLoad.uop := req.uop
      lowAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B
      highAddrLoad.uop := req.uop
      highAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B

      switch (req.uop.fuOpType(1, 0)) {
        is (LB) {
          assert(false.B, "lb should not trigger misalignment")
        }

        is (LH) {
          lowAddrLoad.uop.fuOpType := LB
          lowAddrLoad.vaddr := req.vaddr
          lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
          lowResultShift    := BYTE0
          lowResultWidth    := BYTE1

          highAddrLoad.uop.fuOpType := LB
          highAddrLoad.vaddr := req.vaddr + 1.U
          highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
          highResultShift    := BYTE0
          highResultWidth    := BYTE1
        }

        is (LW) {
          switch (req.vaddr(1, 0)) {
            is ("b00".U) {
              assert(false.B, "should not trigger misalignment")
            }

            is ("b01".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b10".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b11".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }
          }
        }

        is (LD) {
          switch (req.vaddr(2, 0)) {
            is ("b000".U) {
              assert(false.B, "should not trigger misalignment")
            }

            is ("b001".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE7

              highAddrLoad.uop.fuOpType := LB
              highAddrLoad.vaddr := req.vaddr + 7.U
              highAddrLoad.mask  := 0x1.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE1
            }

            is ("b010".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 2.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE2
              lowResultWidth    := BYTE6

              highAddrLoad.uop.fuOpType := LH
              highAddrLoad.vaddr := req.vaddr + 6.U
              highAddrLoad.mask  := 0x3.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE2
            }

            is ("b011".U) {
              lowAddrLoad.uop.fuOpType := LD
              lowAddrLoad.vaddr := req.vaddr - 3.U
              lowAddrLoad.mask  := 0xff.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE3
              lowResultWidth    := BYTE5

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 5.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE3
            }

            is ("b100".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE4

              highAddrLoad.uop.fuOpType := LW
              highAddrLoad.vaddr := req.vaddr + 4.U
              highAddrLoad.mask  := 0xf.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE4
            }

            is ("b101".U) {
              lowAddrLoad.uop.fuOpType := LW
              lowAddrLoad.vaddr := req.vaddr - 1.U
              lowAddrLoad.mask  := 0xf.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE1
              lowResultWidth    := BYTE3

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 3.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE5
            }

            is ("b110".U) {
              lowAddrLoad.uop.fuOpType := LH
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x3.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE2

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 2.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE6
            }

            is ("b111".U) {
              lowAddrLoad.uop.fuOpType := LB
              lowAddrLoad.vaddr := req.vaddr
              lowAddrLoad.mask  := 0x1.U << lowAddrLoad.vaddr(3, 0)
              lowResultShift    := BYTE0
              lowResultWidth    := BYTE1

              highAddrLoad.uop.fuOpType := LD
              highAddrLoad.vaddr := req.vaddr + 1.U
              highAddrLoad.mask  := 0xff.U << highAddrLoad.vaddr(3, 0)
              highResultShift    := BYTE0
              highResultWidth    := BYTE7
            }
          }
        }
      }

      splitLoadReqs(0) := lowAddrLoad
      splitLoadReqs(1) := highAddrLoad
    }
  }
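
  // Split example: an ld with vaddr(2, 0) = "b101" that crosses the 16-byte
  // boundary is rebuilt from an lw at vaddr - 1 (keeping bytes [1, 3] of its
  // result, i.e. the three lowest bytes of the ld) and an ld at vaddr + 3
  // (keeping bytes [0, 4], i.e. the five highest bytes), 3 + 5 = 8 bytes total.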

  io.splitLoadReq.valid := req_valid && (bufferState === s_req)
  io.splitLoadReq.bits  := splitLoadReqs(curPtr)

  when (io.splitLoadResp.valid) {
    splitLoadResp(curPtr) := io.splitLoadResp.bits
    when (isMMIO) {
      unSentLoads := 0.U
      splitLoadResp(curPtr).uop.exceptionVec := 0.U.asTypeOf(ExceptionVec())
      // delegate to software
      splitLoadResp(curPtr).uop.exceptionVec(loadAddrMisaligned) := true.B
    } .elsewhen (hasException) {
      unSentLoads := 0.U
    } .elsewhen (!io.splitLoadResp.bits.rep_info.need_rep) {
      unSentLoads := unSentLoads & ~UIntToOH(curPtr)
      curPtr := curPtr + 1.U
    }
  }
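
  // Bookkeeping example for the two-split case: unSentLoads starts as "b11";
  // after the first successful response (curPtr = 0) it becomes "b10" and curPtr
  // advances to 1, and after the second it becomes "b00", which lets the s_resp
  // state move on to s_comb. A replayed response leaves both unSentLoads and
  // curPtr unchanged, so the same split is reissued.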

  val combinedData = RegInit(0.U(XLEN.W))

  when (bufferState === s_comb) {
    when (!cross16BytesBoundary) {
      val shiftData = LookupTree(aligned16BytesSel, List(
        "b0000".U -> splitLoadResp(0).data(63,     0),
        "b0001".U -> splitLoadResp(0).data(71,     8),
        "b0010".U -> splitLoadResp(0).data(79,    16),
        "b0011".U -> splitLoadResp(0).data(87,    24),
        "b0100".U -> splitLoadResp(0).data(95,    32),
        "b0101".U -> splitLoadResp(0).data(103,   40),
        "b0110".U -> splitLoadResp(0).data(111,   48),
        "b0111".U -> splitLoadResp(0).data(119,   56),
        "b1000".U -> splitLoadResp(0).data(127,   64),
        "b1001".U -> splitLoadResp(0).data(127,   72),
        "b1010".U -> splitLoadResp(0).data(127,   80),
        "b1011".U -> splitLoadResp(0).data(127,   88),
        "b1100".U -> splitLoadResp(0).data(127,   96),
        "b1101".U -> splitLoadResp(0).data(127,  104),
        "b1110".U -> splitLoadResp(0).data(127,  112),
        "b1111".U -> splitLoadResp(0).data(127,  120)
      ))
      val truncateData = LookupTree(req.uop.fuOpType(1, 0), List(
        LB -> shiftData(7,  0), // lb
        LH -> shiftData(15, 0), // lh
        LW -> shiftData(31, 0), // lw
        LD -> shiftData(63, 0)  // ld
      ))
      combinedData := rdataHelper(req.uop, truncateData(XLEN - 1, 0))
    } .otherwise {
      val lowAddrResult = getShiftAndTruncateData(lowResultShift, lowResultWidth, splitLoadResp(0).data)
                            .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
      val highAddrResult = getShiftAndTruncateData(highResultShift, highResultWidth, splitLoadResp(1).data)
                            .asTypeOf(Vec(XLEN / 8, UInt(8.W)))
      val catResult = Wire(Vec(XLEN / 8, UInt(8.W)))
      (0 until XLEN / 8) .map {
        case i => {
          when (i.U < lowResultWidth) {
            catResult(i) := lowAddrResult(i)
          } .otherwise {
            catResult(i) := highAddrResult(i.U - lowResultWidth)
          }
        }
      }
      combinedData := rdataHelper(req.uop, (catResult.asUInt)(XLEN - 1, 0))
    }
  }
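
  // Merging example (continuing the "b101" ld case above): lowAddrResult holds
  // the three low bytes and highAddrResult the five high bytes, so catResult
  // packs bytes 0..2 from the low half and bytes 3..7 from the high half before
  // rdataHelper applies the final sign/zero extension selected by the uop.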

  io.writeBack.valid := req_valid && (bufferState === s_wb)
  io.writeBack.bits.uop := req.uop
  io.writeBack.bits.uop.exceptionVec := Mux(
    globalMMIO || globalException,
    splitLoadResp(curPtr).uop.exceptionVec,
    0.U.asTypeOf(ExceptionVec()) // TODO: is this ok?
  )
  io.writeBack.bits.uop.flushPipe := Mux(globalMMIO || globalException, false.B, true.B)
  io.writeBack.bits.uop.replayInst := false.B
  io.writeBack.bits.data := combinedData
  io.writeBack.bits.debug.isMMIO := globalMMIO
  io.writeBack.bits.debug.isPerfCnt := false.B
  io.writeBack.bits.debug.paddr := req.paddr
  io.writeBack.bits.debug.vaddr := req.vaddr
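
  // Note: when the split loads complete without exception or MMIO, flushPipe is
  // raised on write-back, presumably so that instructions younger than the
  // misaligned load are refetched after it retires through this buffer; with a
  // real exception (or MMIO delegated as loadAddrMisaligned) the split's
  // exception vector is forwarded instead and no pipeline flush is requested here.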

  val flush = req_valid && req.uop.robIdx.needFlush(io.redirect)

  when (flush && (bufferState =/= s_idle)) {
    bufferState := s_idle
    req_valid := false.B
    curPtr := 0.U
    unSentLoads := 0.U
    globalException := false.B
    globalMMIO := false.B
  }

  // NOTE: special case (an unaligned load crosses a page boundary and the page fault happens in the next page)
  // if the exception happens in the higher page address part, overwrite the loadExceptionBuffer vaddr
  val overwriteExpBuf = GatedValidRegNext(req_valid && cross16BytesBoundary && globalException && (curPtr === 1.U))
  val overwriteAddr = GatedRegNext(splitLoadResp(curPtr).vaddr)

  io.overwriteExpBuf.valid := overwriteExpBuf
  io.overwriteExpBuf.vaddr := overwriteAddr

  // when there is no exception or MMIO access, flush the loadExceptionBuffer at s_wb
  val flushLdExpBuff = GatedValidRegNext(req_valid && (bufferState === s_wb) && !(globalMMIO || globalException))
  io.flushLdExpBuff := flushLdExpBuff

  XSPerfAccumulate("alloc",                  RegNext(!req_valid) && req_valid)
  XSPerfAccumulate("flush",                  flush)
  XSPerfAccumulate("flush_idle",             flush && (bufferState === s_idle))
  XSPerfAccumulate("flush_non_idle",         flush && (bufferState =/= s_idle))
}