xref: /XiangShan/src/main/scala/xiangshan/mem/vector/VSplit.scala (revision 08047a411f7538fe4818201d2e92523128b7ee23)
/***************************************************************************************
  * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
  * Copyright (c) 2020-2021 Peng Cheng Laboratory
  *
  * XiangShan is licensed under Mulan PSL v2.
  * You can use this software according to the terms and conditions of the Mulan PSL v2.
  * You may obtain a copy of Mulan PSL v2 at:
  *          http://license.coscl.org.cn/MulanPSL2
  *
  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  *
  * See the Mulan PSL v2 for more details.
  ***************************************************************************************/
16
17package xiangshan.mem
18
19import org.chipsalliance.cde.config.Parameters
20import chisel3._
21import chisel3.util._
22import utils._
23import utility._
24import xiangshan._
25import xiangshan.backend.rob.RobPtr
26import xiangshan.backend.Bundles._
27import xiangshan.mem._
28import xiangshan.backend.fu.vector.Bundles._
29
30
/**
  * Two-stage pipeline that pre-processes a vector load/store uop before it is queued
  * for splitting into flows.
  *
  *  - s0 (combinational on `io.in`): decodes the uop, derives EMUL, the flow mask and
  *    byte mask of this uop, and decides whether the uop must be split (`preIsSplit`).
  *  - s1 (registered copy of the s0 result): computes the uop offset and the stride,
  *    queries the merge buffer through `io.toMergeBuffer` and drives `io.out`.
  *
  * The three `us_*` classifiers return `false.B` here and are overridden by the
  * load/store specialisations.
  *
  * @param isVStore selects the store flavour of the IO and payload bundles
  */
class VSplitPipeline(isVStore: Boolean = false)(implicit p: Parameters) extends VLSUModule{
  val io = IO(new VSplitPipelineIO(isVStore))
  // overridden by the load/store specialisations
  def us_whole_reg(fuOpType: UInt): Bool = false.B
  def us_mask(fuOpType: UInt): Bool = false.B
  def us_fof(fuOpType: UInt): Bool = false.B

  // running vd index, sent to the merge buffer with every request
  val vdIdxReg = RegInit(0.U(3.W))

  val s1_ready = WireInit(false.B)
  io.in.ready := s1_ready

  /**-----------------------------------------------------------
    * s0 stage
    * decode and generate AlignedType, uop mask, preIsSplit
    * ----------------------------------------------------------
    */
  val s0_vtype = io.in.bits.uop.vpu.vtype
  val s0_sew = s0_vtype.vsew
  val s0_eew = io.in.bits.uop.vpu.veew
  val s0_lmul = s0_vtype.vlmul
  val s0_fuOpType = io.in.bits.uop.fuOpType
  val s0_mop = s0_fuOpType(6, 5)
  // whole-register accesses carry their register count in nf, so they are not segments
  val s0_nf = Mux(us_whole_reg(s0_fuOpType), 0.U, io.in.bits.uop.vpu.nf)
  val s0_vm = io.in.bits.uop.vpu.vm
  // whole-register: emul derived from nf; unit-stride mask: emul = 0;
  // otherwise emul = log2(eew) - sew + lmul
  val s0_emul = Mux(
    us_whole_reg(s0_fuOpType),
    GenUSWholeEmul(io.in.bits.uop.vpu.nf),
    Mux(us_mask(s0_fuOpType), 0.U(mulBits.W), EewLog2(s0_eew) - s0_sew + s0_lmul)
  )
  // everything except a unit-stride access that is not fault-only-first is pre-split
  val s0_preIsSplit = !(isUnitStride(s0_mop) && !us_fof(s0_fuOpType))
  val s0_nfield        = s0_nf +& 1.U

  val s0_valid         = Wire(Bool())
  val s0_kill          = io.in.bits.uop.robIdx.needFlush(io.redirect)
  val s0_can_go        = s1_ready
  val s0_fire          = s0_valid && s0_can_go
  val s0_out           = Wire(new VLSBundle(isVStore))

  val isUsWholeReg = isUnitStride(s0_mop) && us_whole_reg(s0_fuOpType)
  val isMaskReg = isUnitStride(s0_mop) && us_mask(s0_fuOpType)
  val isSegment = s0_nf =/= 0.U && !us_whole_reg(s0_fuOpType)
  val instType = Cat(isSegment, s0_mop)
  val uopIdx = io.in.bits.uop.vpu.vuopIdx
  val uopIdxInField = GenUopIdxInField(instType, s0_emul, s0_lmul, uopIdx)
  val vdIdxInField = GenVdIdxInField(instType, s0_emul, s0_lmul, uopIdxInField)
  // lmul/emul clamped to 0 when non-negative (only the fractional part is kept)
  val lmulLog2 = Mux(s0_lmul.asSInt >= 0.S, 0.U, s0_lmul)
  val emulLog2 = Mux(s0_emul.asSInt >= 0.S, 0.U, s0_emul)
  val numEewLog2 = emulLog2 - EewLog2(s0_eew)
  val numSewLog2 = lmulLog2 - s0_sew
  val numFlowsSameVdLog2 = Mux(
    isIndexed(instType),
    log2Up(VLENB).U - s0_sew(1,0),
    log2Up(VLENB).U - s0_eew(1,0)
  )
  // numUops = nf * max(lmul, emul)
  // lmul/emul clamped to 0 when negative (only the non-fractional part is kept)
  val lmulLog2Pos = Mux(s0_lmul.asSInt < 0.S, 0.U, s0_lmul)
  val emulLog2Pos = Mux(s0_emul.asSInt < 0.S, 0.U, s0_emul)
  val numUops = Mux(
    isIndexed(s0_mop) && s0_lmul.asSInt > s0_emul.asSInt,
    (s0_nf +& 1.U) << lmulLog2Pos,
    (s0_nf +& 1.U) << emulLog2Pos
  )

  val vvl = io.in.bits.src_vl.asTypeOf(VConfig()).vl
  // effective vl: whole-register and mask accesses use their own length
  val evl = Mux(isUsWholeReg,
                GenUSWholeRegVL(io.in.bits.uop.vpu.nf +& 1.U, s0_eew),
                Mux(isMaskReg,
                    GenUSMaskRegVL(vvl),
                    vvl))
  val vvstart = io.in.bits.uop.vpu.vstart
  val alignedType = Mux(isIndexed(instType), s0_sew(1, 0), s0_eew(1, 0))
  val broadenAligendType = Mux(s0_preIsSplit, Cat("b0".U, alignedType), "b100".U) // if is unit-stride, use 128-bits memory access
  val flowsLog2 = GenRealFlowLog2(instType, s0_emul, s0_lmul, s0_eew, s0_sew)
  val flowsPrevThisUop = uopIdxInField << flowsLog2 // # of flows before this uop in a field
  val flowsPrevThisVd = vdIdxInField << numFlowsSameVdLog2 // # of flows before this vd in a field
  val flowsIncludeThisUop = (uopIdxInField +& 1.U) << flowsLog2 // # of flows up to and including this uop
  val flowNum = io.in.bits.flowNum.get
  // an unmasked instruction (vm set) behaves as if every mask bit were 1
  val srcMask = GenFlowMask(Mux(s0_vm, Fill(VLEN, 1.U(1.W)), io.in.bits.src_mask), vvstart, evl, true)

  // keep only the mask bits of this uop's flows, then shift them down to the start of its vd
  val flowMask = ((srcMask &
    UIntToMask(flowsIncludeThisUop.asUInt, VLEN + 1) &
    (~UIntToMask(flowsPrevThisUop.asUInt, VLEN)).asUInt
  ) >> flowsPrevThisVd)(VLENB - 1, 0)
  val vlmax = GenVLMAX(s0_lmul, s0_sew)

  // connect
  s0_out := DontCare
  s0_out match {case x =>
    x.uop := io.in.bits.uop
    x.uop.vpu.vl := evl
    x.uop.uopIdx := uopIdx
    x.uop.numUops := numUops
    x.uop.lastUop := (uopIdx +& 1.U) === numUops
    x.uop.vpu.nf  := s0_nf
    x.flowMask := flowMask
    x.byteMask := GenUopByteMask(flowMask, Cat("b0".U, alignedType))(VLENB - 1, 0)
    x.fof := isUnitStride(s0_mop) && us_fof(s0_fuOpType)
    x.baseAddr := io.in.bits.src_rs1
    x.stride := io.in.bits.src_stride
    x.flowNum := flowNum
    x.nfields := s0_nfield
    x.vm := s0_vm
    x.usWholeReg := isUsWholeReg
    x.usMaskReg := isMaskReg
    x.eew := s0_eew
    x.sew := s0_sew
    x.emul := s0_emul
    x.lmul := s0_lmul
    x.vlmax := Mux(isUsWholeReg, evl, vlmax)
    x.instType := instType
    x.data := io.in.bits.src_vs3
    x.vdIdxInField := vdIdxInField
    x.preIsSplit  := s0_preIsSplit
    x.alignedType := broadenAligendType
  }
  s0_valid := io.in.valid && !s0_kill
  /**-------------------------------------
    * s1 stage
    * ------------------------------------
    * generate UopOffset
    */
  val s1_valid         = RegInit(false.B)
  val s1_kill          = Wire(Bool())
  val s1_in            = Wire(new VLSBundle(isVStore))
  // s1 may leave only when downstream is ready and the merge buffer responds
  val s1_can_go        = io.out.ready && io.toMergeBuffer.resp.valid
  val s1_fire          = s1_valid && !s1_kill && s1_can_go

  // s1 accepts a new uop when it is empty, being flushed, or draining this cycle
  s1_ready         := s1_kill || !s1_valid || s1_can_go

  when(s0_fire){
    s1_valid := true.B
  }.elsewhen(s1_fire){
    s1_valid := false.B
  }.elsewhen(s1_kill){
    s1_valid := false.B
  }
  s1_in := RegEnable(s0_out, s0_fire)

  val s1_uopidx           = s1_in.uop.vpu.vuopIdx
  val s1_nf               = s1_in.uop.vpu.nf
  val s1_nfields          = s1_in.nfields
  val s1_eew              = s1_in.eew
  val s1_instType         = s1_in.instType
  val s1_stride           = s1_in.stride
  val s1_vmask            = FillInterleaved(8, s1_in.byteMask)(VLEN-1, 0)
  val s1_alignedType      = s1_in.alignedType
  val s1_notIndexedStride = Mux( // stride for strided/unit-stride instruction
    isStrided(s1_instType),
    s1_stride(XLEN - 1, 0), // for strided load, stride = x[rs2]
    s1_nfields << s1_eew(1, 0) // for unit-stride load, stride = eew * NFIELDS
  )
  val uopOffset  = (s1_uopidx >> s1_nf) << s1_alignedType
  val stride     = Mux(isIndexed(s1_instType), s1_stride, s1_notIndexedStride) // if is index instructions, get index when split

  s1_kill               := s1_in.uop.robIdx.needFlush(io.redirect)

  // query mergeBuffer
  io.toMergeBuffer.req.valid             := s1_fire // only can_go will get MergeBuffer entry
  io.toMergeBuffer.req.bits.flowNum      := Mux(s1_in.preIsSplit, PopCount(s1_in.flowMask), flowNum)
  io.toMergeBuffer.req.bits.data         := s1_in.data
  io.toMergeBuffer.req.bits.uop          := s1_in.uop
  io.toMergeBuffer.req.bits.mask         := s1_in.flowMask
  io.toMergeBuffer.req.bits.vaddr        := DontCare
  io.toMergeBuffer.req.bits.vdIdx        := vdIdxReg
  io.toMergeBuffer.req.bits.fof          := s1_in.fof
  io.toMergeBuffer.req.bits.vlmax        := s1_in.vlmax
//   io.toMergeBuffer.req.bits.vdOffset :=

  // Advance the vd index only when s1 actually fires: a uop stalled in s1 must keep
  // presenting the same vdIdx to the merge buffer until its request is issued.
  when (s1_in.uop.lastUop && s1_fire || s1_kill) {
    vdIdxReg := 0.U
  }.elsewhen(s1_fire) {
    vdIdxReg := vdIdxReg + 1.U
    XSError(vdIdxReg + 1.U === 0.U, s"Overflow! The number of vd should be less than 8\n")
  }
  // out connect
  io.out.valid          := s1_valid && io.toMergeBuffer.resp.valid
  io.out.bits           := s1_in
  io.out.bits.uopOffset := uopOffset
  io.out.bits.stride    := stride
  io.out.bits.mBIndex   := io.toMergeBuffer.resp.bits.mBIndex

  XSPerfAccumulate("split_out",     io.out.fire)
  XSPerfAccumulate("pipe_block",    io.out.valid && !io.out.ready)
  XSPerfAccumulate("mbuffer_block", s1_valid && io.out.ready && !io.toMergeBuffer.resp.valid)
}
214
/**
  * Circular queue of uops that are split, one entry at a time, into the flows sent on
  * `io.out`.
  *
  * The entry at `deqPtr` is issued flow by flow; `splitIdx` counts the flows already
  * produced for it and `strideOffsetReg` accumulates the stride between flows. Entries
  * hit by a redirect are invalidated one cycle later and `enqPtr` is rolled back by the
  * number of flushed entries.
  *
  * Concrete subclasses supply `bufferSize`. It is read while this constructor runs, so
  * the subclasses in this file declare it as a `lazy val`.
  *
  * @param isVStore selects the store flavour of the IO and payload bundles
  */
abstract class VSplitBuffer(isVStore: Boolean = false)(implicit p: Parameters) extends VLSUModule{
  val io = IO(new VSplitBufferIO(isVStore))

  val bufferSize: Int

  class VSplitPtr(implicit p: Parameters) extends CircularQueuePtr[VSplitPtr](bufferSize){
  }

  object VSplitPtr {
    def apply(f: Bool, v: UInt)(implicit p: Parameters): VSplitPtr = {
      val ptr = Wire(new VSplitPtr)
      ptr.flag := f
      ptr.value := v
      ptr
    }
  }

  val uopq = Reg(Vec(bufferSize, new VLSBundle(isVStore)))
  val valid = RegInit(VecInit(Seq.fill(bufferSize)(false.B)))
  val srcMaskVec = Reg(Vec(bufferSize, UInt(VLEN.W)))
  // ptr
  val enqPtr = RegInit(0.U.asTypeOf(new VSplitPtr))
  val deqPtr = RegInit(0.U.asTypeOf(new VSplitPtr))
  // for split
  val splitIdx = RegInit(0.U(flowIdxBits.W))
  val strideOffsetReg = RegInit(0.U(VLEN.W))

  /**
    * Redirect
    */
  val flushed = WireInit(VecInit(Seq.fill(bufferSize)(false.B))) // entry has been flushed by the redirect that arrived in the previous cycle
  val flushVec = (valid zip flushed).zip(uopq).map { case ((v, f), entry) => v && entry.uop.robIdx.needFlush(io.redirect) && !f }
  val flushEnq = io.in.fire && io.in.bits.uop.robIdx.needFlush(io.redirect)
  val flushNumReg = RegNext(PopCount(flushEnq +: flushVec))
  val redirectReg = RegNext(io.redirect)
  val flushVecReg = RegNext(WireInit(VecInit(flushVec)))

  // enqueue
  when (io.in.fire && !flushEnq) {
    val id = enqPtr.value
    uopq(id) := io.in.bits
    valid(id) := true.B
  }
  io.in.ready := isNotBefore(enqPtr, deqPtr)

  // split uops: fields of the entry at the head of the queue
  val issueValid       = valid(deqPtr.value)
  val issueEntry       = uopq(deqPtr.value)
  val issueMbIndex     = issueEntry.mBIndex
  val issueFlowNum     = issueEntry.flowNum
  val issueBaseAddr    = issueEntry.baseAddr
  val issueUop         = issueEntry.uop
  val issueUopIdx      = issueUop.vpu.vuopIdx
  val issueInstType    = issueEntry.instType
  val issueUopOffset   = issueEntry.uopOffset
  val issueEew         = issueEntry.eew
  val issueSew         = issueEntry.sew
  // each value is read from its matching field, so GenElemIdx gets lmul and emul in the right slots
  val issueLmul        = issueEntry.lmul
  val issueEmul        = issueEntry.emul
  val issueAlignedType = issueEntry.alignedType
  val issuePreIsSplit  = issueEntry.preIsSplit
  val issueByteMask    = issueEntry.byteMask
  val elemIdx = GenElemIdx(
    instType = issueInstType,
    emul = issueEmul,
    lmul = issueLmul,
    eew = issueEew,
    sew = issueSew,
    uopIdx = issueUopIdx,
    flowIdx = splitIdx
  ) // elemIdx inside an inst, for exception
  val indexedStride    = IndexAddr( // index for indexed instruction
    index = issueEntry.stride,
    flow_inner_idx = ((splitIdx << issueEew(1, 0))(vOffsetBits - 1, 0) >> issueEew(1, 0)).asUInt,
    eew = issueEew
  )
  val issueStride = Mux(isIndexed(issueInstType), indexedStride, strideOffsetReg)
  val vaddr = issueBaseAddr + issueUopOffset + issueStride
  val mask = genVWmask128(vaddr ,issueAlignedType) // mask for a pre-split flow
  val flowMask = issueEntry.flowMask
  val vecActive = (flowMask & UIntToOH(splitIdx)).orR
  /*
   * Unit-Stride split to one flow or two flow.
   * for Unit-Stride, if uop's addr is aligned with 128-bits, split it to one flow, otherwise split two
   */

  val usAligned128     = (vaddr(3,0) === 0.U)// addr 128-bit aligned
  val usSplitMask      = genUSSplitMask(issueByteMask, splitIdx, vaddr(3,0))
  val usNoSplit        = (usAligned128 || !(vaddr(3,0) +& PopCount(usSplitMask))(4)) && !issuePreIsSplit && (splitIdx === 0.U)// unit-stride uop don't need to split into two flow
  val usSplitVaddr     = genUSSplitAddr(vaddr, splitIdx)
  val regOffset        = vaddr(3,0) // low 4 address bits: byte offset inside a 16-byte (128-bit) line
  XSError((splitIdx > 1.U && usNoSplit) || (splitIdx > 1.U && !issuePreIsSplit) , "Unit-Stride addr split error!\n")

  // data
  io.out.bits match { case x =>
    x.uop                   := issueUop
    x.vaddr                 := Mux(!issuePreIsSplit, usSplitVaddr, vaddr)
    x.alignedType           := issueAlignedType
    x.isvec                 := true.B
    x.mask                  := Mux(!issuePreIsSplit, usSplitMask, mask)
    x.reg_offset            := regOffset //for merge unit-stride data
    x.vecActive             := vecActive
    x.is_first_ele          := DontCare
    x.usSecondInv           := usNoSplit
    x.elemIdx               := elemIdx
    x.elemIdxInsideVd       := splitIdx // if is Unit-Stride, elemIdx is the index of 2 splited mem request (for merge data)
    x.uop_unit_stride_fof   := DontCare
    x.isFirstIssue          := DontCare
    x.mBIndex               := issueMbIndex
  }

  // update enqPtr: roll back by the number of entries flushed in the previous cycle
  when (redirectReg.valid && flushNumReg =/= 0.U) {
    enqPtr := enqPtr - flushNumReg
  }.otherwise {
    when (io.in.fire) {
      enqPtr := enqPtr + 1.U
    }
  }

  // flush queue
  for (i <- 0 until bufferSize) {
    when(flushVecReg(i) && redirectReg.valid && flushNumReg =/= 0.U) {
      valid(i) := false.B
      flushed(i) := true.B
    }
  }

  /* Execute logic */
  /** Issue flows through io.out **/
  val canIssue = Wire(Bool())
  val allowIssue = io.out.ready
  val doIssue = Wire(Bool())
  val deqValid = valid(deqPtr.value)
  // an inactive element of a pre-split uop is consumed without being sent out
  val inActiveIssue = deqValid && canIssue && !vecActive && issuePreIsSplit
  // a unit-stride uop that needs no split counts as two flows, so the second one is skipped
  val issueCount = Mux(usNoSplit, 2.U, (PopCount(inActiveIssue) + PopCount(doIssue)))

  // handshake
  val thisPtr = deqPtr.value
  canIssue := !issueUop.robIdx.needFlush(io.redirect) && deqPtr < enqPtr
  doIssue := canIssue && allowIssue
  when (!RegNext(io.redirect.valid) || distanceBetween(enqPtr, deqPtr) > flushNumReg) {
    when ((splitIdx < (issueFlowNum - issueCount))) {
      when (doIssue || inActiveIssue) {
        // The uop has not been entirely split yet
        splitIdx := splitIdx + issueCount
        strideOffsetReg := strideOffsetReg + issueEntry.stride
      }
    }.otherwise {
      when (doIssue || inActiveIssue) {
        // The uop is done splitting
        splitIdx := 0.U(flowIdxBits.W) // initialize flowIdx
        valid(deqPtr.value) := false.B
        strideOffsetReg := 0.U
        deqPtr := deqPtr + 1.U
      }
    }
  }.otherwise {
    // a redirect arrived last cycle and the flush covers every queued entry: restart the split state
    splitIdx := 0.U(flowIdxBits.W) // initialize flowIdx
    strideOffsetReg := 0.U
  }

  // out connect
  io.out.valid := canIssue && (vecActive || !issuePreIsSplit) // TODO: inactive uop do not send to pipeline

  XSPerfAccumulate("out_valid",             io.out.valid)
  XSPerfAccumulate("out_fire",              io.out.fire)
  XSPerfAccumulate("out_fire_unitstride",   io.out.fire && !issuePreIsSplit)
  XSPerfAccumulate("unitstride_vlenAlign",  io.out.fire && !issuePreIsSplit && io.out.bits.vaddr(3, 0) === 0.U)
  XSPerfAccumulate("unitstride_invalid",    io.out.ready && canIssue && !issuePreIsSplit && PopCount(io.out.bits.mask).orR)

  QueuePerf(bufferSize, distanceBetween(enqPtr, deqPtr), !io.in.ready)
}
388
/**
  * Store flavour of the split buffer. Besides the flow sent on `io.out`, it drives the
  * matching store data and mask on `io.vstd`, and gives every flow its own sqIdx
  * (the uop's sqIdx plus the flow index).
  */
class VSSplitBufferImp(implicit p: Parameters) extends VSplitBuffer(isVStore = true){
  // lazy: the parent constructor reads bufferSize before this body is evaluated
  override lazy val bufferSize = SplitBufferSize

  // data of a pre-split flow: pick the element, then place it for the write
  val splitData = genVSData(
    data = issueEntry.data.asUInt,
    elemIdx = splitIdx,
    alignedType = issueAlignedType
  )
  val flowData = genVWdata(splitData, issueAlignedType)
  // data of a unit-stride flow
  val usSplitData = genUSSplitData(issueEntry.data.asUInt, splitIdx, vaddr(3,0))

  // one store-queue index per flow
  val sqIdx = issueUop.sqIdx + splitIdx
  io.out.bits.uop.sqIdx := sqIdx

  // send data to sq
  val vstd = io.vstd.get
  vstd.valid := canIssue
  vstd.bits match { case d =>
    d.uop              := issueUop
    d.uop.sqIdx        := sqIdx // must follow the bulk uop connection to override it
    d.data             := Mux(issuePreIsSplit, flowData, usSplitData)
    d.mask.get         := Mux(issuePreIsSplit, mask, usSplitMask)
    d.debug            := DontCare
    d.vdIdx.get        := DontCare
    d.vdIdxInField.get := DontCare
  }
}
415
/**
  * Load flavour of the split buffer: every flow gets its own lqIdx, the uop's lqIdx
  * plus the index of the flow inside the uop.
  */
class VLSplitBufferImp(implicit p: Parameters) extends VSplitBuffer(isVStore = false){
  // lazy: the parent constructor reads bufferSize before this body is evaluated
  override lazy val bufferSize = SplitBufferSize

  // one load-queue index per flow
  val flowLqIdx = issueUop.lqIdx + splitIdx
  io.out.bits.uop.lqIdx := flowLqIdx
}
420
/**
  * Store flavour of the split pipeline: supplies the store opcode classifiers.
  */
class VSSplitPipelineImp(implicit p: Parameters) extends VSplitPipeline(isVStore = true){
  // dont have vector fof store
  override def us_fof(fuOpType: UInt): Bool = false.B

  // unit-stride mask store
  override def us_mask(fuOpType: UInt): Bool = {
    fuOpType === VstuType.vsm
  }

  // whole-register store
  override def us_whole_reg(fuOpType: UInt): Bool = {
    fuOpType === VstuType.vsr
  }
}
426
/**
  * Load flavour of the split pipeline: supplies the load opcode classifiers.
  */
class VLSplitPipelineImp(implicit p: Parameters) extends VSplitPipeline(isVStore = false){
  // fault-only-first load
  override def us_fof(fuOpType: UInt): Bool = {
    fuOpType === VlduType.vleff
  }

  // unit-stride mask load
  override def us_mask(fuOpType: UInt): Bool = {
    fuOpType === VlduType.vlm
  }

  // whole-register load
  override def us_whole_reg(fuOpType: UInt): Bool = {
    fuOpType === VlduType.vlr
  }
}
433
/**
  * Vector load split unit: the load split pipeline feeding the load split buffer.
  */
class VLSplitImp(implicit p: Parameters) extends VLSUModule{
  val io = IO(new VSplitIO(isVStore=false))
  val splitPipeline = Module(new VLSplitPipelineImp())
  val splitBuffer = Module(new VLSplitBufferImp())

  // both stages observe the same redirect
  splitPipeline.io.redirect <> io.redirect
  splitBuffer.io.redirect <> io.redirect

  // data path: in -> pipeline -> buffer -> out
  splitPipeline.io.in <> io.in
  splitBuffer.io.in <> splitPipeline.io.out
  io.out <> splitBuffer.io.out

  // the merge buffer is queried by the pipeline
  io.toMergeBuffer <> splitPipeline.io.toMergeBuffer
}
448
/**
  * Vector store split unit: the store split pipeline feeding the store split buffer,
  * plus the store-data port exported from the buffer.
  */
class VSSplitImp(implicit p: Parameters) extends VLSUModule{
  val io = IO(new VSplitIO(isVStore=true))
  val splitPipeline = Module(new VSSplitPipelineImp())
  val splitBuffer = Module(new VSSplitBufferImp())

  // both stages observe the same redirect
  splitPipeline.io.redirect <> io.redirect
  splitBuffer.io.redirect <> io.redirect

  // data path: in -> pipeline -> buffer -> out
  splitPipeline.io.in <> io.in
  splitBuffer.io.in <> splitPipeline.io.out
  io.out <> splitBuffer.io.out

  // the merge buffer is queried by the pipeline
  io.toMergeBuffer <> splitPipeline.io.toMergeBuffer

  // store data comes from the buffer
  io.vstd.get <> splitBuffer.io.vstd.get
}
464
465