/***************************************************************************************
  * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
  * Copyright (c) 2020-2021 Peng Cheng Laboratory
  *
  * XiangShan is licensed under Mulan PSL v2.
  * You can use this software according to the terms and conditions of the Mulan PSL v2.
  * You may obtain a copy of Mulan PSL v2 at:
  *          http://license.coscl.org.cn/MulanPSL2
  *
  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  *
  * See the Mulan PSL v2 for more details.
  ***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles._
import xiangshan.backend.fu.FuType

/**
  * Commonly used parameters and functions in the VLSU (vector load/store unit).
  */
trait VLSUConstants {
  val VLEN = 128
  // for packing unit-stride flows
  val AlignedNum = 4 // 1/2/4/8
  def VLENB = VLEN/8
  def vOffsetBits = log2Up(VLENB) // bit width to index an offset inside a vector reg
  lazy val vlmBindexBits = 8 // will be overridden later
  lazy val vsmBindexBits = 8 // will be overridden later

  def alignTypes = 5 // eew/sew = 1/2/4/8 bytes; the last indicates a 128-bit element
  def alignTypeBits = log2Up(alignTypes)
  def maxMUL = 8
  def maxFields = 8
  /**
    * In the most extreme case, e.g. a segment indexed instruction with eew=64, emul=8,
    * sew=8, lmul=1 and nf=8, each data reg is mapped to 8 index regs and there are
    * 8 data regs in total, one per field. Therefore an instruction can be divided
    * into at most 64 uops.
    */
  def maxUopNum = maxMUL * maxFields // 64
  def maxFlowNum = 16
  def maxElemNum = maxMUL * maxFlowNum // 128
  // def uopIdxBits = log2Up(maxUopNum) // to index a uop inside a robIdx
  def elemIdxBits = log2Up(maxElemNum) + 1 // to index an element within an instruction
  def flowIdxBits = log2Up(maxFlowNum) + 1 // to index a flow within a uop
  def fieldBits = log2Up(maxFields) + 1 // 4 bits to indicate 1~8

  def ewBits = 3 // bit width of EEW/SEW
  def mulBits = 3 // bit width of emul/lmul

  def getSlice(data: UInt, i: Int, alignBits: Int): UInt = {
    require(data.getWidth >= (i+1) * alignBits)
    data((i+1) * alignBits - 1, i * alignBits)
  }
  def getNoAlignedSlice(data: UInt, i: Int, alignBits: Int): UInt = {
    data(i * 8 + alignBits - 1, i * 8)
  }

  def getByte(data: UInt, i: Int = 0) = getSlice(data, i, 8)
  def getHalfWord(data: UInt, i: Int = 0) = getSlice(data, i, 16)
  def getWord(data: UInt, i: Int = 0) = getSlice(data, i, 32)
  def getDoubleWord(data: UInt, i: Int = 0) = getSlice(data, i, 64)
  def getDoubleDoubleWord(data: UInt, i: Int = 0) = getSlice(data, i, 128)
}
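
// A minimal illustration (not part of the original file) of the slicing helpers
// above, assuming VLEN = 128:
//   val d = "h0123_4567_89ab_cdef_0011_2233_4455_6677".U(128.W)
//   getByte(d, 0)       // bits (7, 0)    = 0x77
//   getHalfWord(d, 1)   // bits (31, 16)  = 0x4455
//   getWord(d, 3)       // bits (127, 96) = 0x01234567
//   getDoubleWord(d, 1) // bits (127, 64) = 0x0123456789abcdef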

trait HasVLSUParameters extends HasXSParameter with VLSUConstants {
  override val VLEN = coreParams.VLEN
  override lazy val vlmBindexBits = log2Up(coreParams.VlMergeBufferSize)
  override lazy val vsmBindexBits = log2Up(coreParams.VsMergeBufferSize)
  lazy val maxMemByteNum = 16 // maximum bytes for a single memory access
  /**
   * Get the low bits of an address used for alignment checking.
   * @param addr Address to be checked
   * @param width Width for checking alignment
   */
  def getCheckAddrLowBits(addr: UInt, width: Int): UInt = addr(log2Up(width) - 1, 0)
  def getOverflowBit(in: UInt, width: Int): UInt = in(log2Up(width))
  def isUnitStride(instType: UInt) = instType(1, 0) === "b00".U
  def isStrided(instType: UInt) = instType(1, 0) === "b10".U
  def isIndexed(instType: UInt) = instType(0) === "b1".U
  def isNotIndexed(instType: UInt) = instType(0) === "b0".U
  def isSegment(instType: UInt) = instType(2) === "b1".U
  def is128Bit(alignedType: UInt) = alignedType(2) === "b1".U

  def mergeDataWithMask(oldData: UInt, newData: UInt, mask: UInt): Vec[UInt] = {
    require(oldData.getWidth == newData.getWidth)
    require(oldData.getWidth == mask.getWidth * 8)
    VecInit(mask.asBools.zipWithIndex.map { case (en, i) =>
      Mux(en, getByte(newData, i), getByte(oldData, i))
    })
  }
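
  // For example (illustrative): with an 8-byte datapath (mask width 8),
  // mask = "b00000011".U takes bytes 0..1 from newData and bytes 2..7 from
  // oldData. The result is a Vec of bytes, so callers typically repack it
  // with .asUInt.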

  // def asBytes(data: UInt) = {
  //   require(data.getWidth % 8 == 0)
  //   (0 until data.getWidth/8).map(i => getByte(data, i))
  // }

  def mergeDataWithElemIdx(
    oldData: UInt,
    newData: Seq[UInt],
    alignedType: UInt,
    elemIdx: Seq[UInt],
    valids: Seq[Bool]
  ): UInt = {
    require(newData.length == elemIdx.length)
    require(newData.length == valids.length)
    LookupTree(alignedType, List(
      "b00".U -> VecInit(elemIdx.map(e => UIntToOH(e(3, 0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getByte(oldData, i) +: newData.map(getByte(_))
        )}).asUInt,
      "b01".U -> VecInit(elemIdx.map(e => UIntToOH(e(2, 0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getHalfWord(oldData, i) +: newData.map(getHalfWord(_))
        )}).asUInt,
      "b10".U -> VecInit(elemIdx.map(e => UIntToOH(e(1, 0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getWord(oldData, i) +: newData.map(getWord(_))
        )}).asUInt,
      "b11".U -> VecInit(elemIdx.map(e => UIntToOH(e(0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getDoubleWord(oldData, i) +: newData.map(getDoubleWord(_))
        )}).asUInt
    ))
  }

  def mergeDataWithElemIdx(oldData: UInt, newData: UInt, alignedType: UInt, elemIdx: UInt): UInt = {
    mergeDataWithElemIdx(oldData, Seq(newData), alignedType, Seq(elemIdx), Seq(true.B))
  }
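
  // For example (illustrative): with the single-port overload and
  // alignedType = "b10" (32-bit), elemIdx = 2 replaces word slot 2, i.e.
  // bytes 8..11 of oldData, and keeps the remaining bytes.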
  /**
    * Merge 128-bit unit-stride data byte by byte.
    */
  object mergeDataByByte{
    def apply(oldData: UInt, newData: UInt, mask: UInt): UInt = {
      val selVec = Seq(mask).map(_.asBools).transpose
      VecInit(selVec.zipWithIndex.map{ case (selV, i) =>
        ParallelPosteriorityMux(
          true.B +: selV.map(x => x),
          getByte(oldData, i) +: Seq(getByte(newData, i))
        )}).asUInt
    }
  }

  /**
    * Merge unit-stride data into 256 bits (two 128-bit halves).
    * With 3 ports:
    *   port 0 is a 6-to-1 multiplexer -> (128'b0, data) or (data, 128'b0) or (data, port1data) or (port1data, data) or (data, port2data) or (port2data, data)
    *   port 1 is a 4-to-1 multiplexer -> (128'b0, data) or (data, 128'b0) or (data, port2data) or (port2data, data)
    *   port 2 is a 2-to-1 multiplexer -> (128'b0, data) or (data, 128'b0)
    */
  object mergeDataByIndex{
    def apply(data:  Seq[UInt], mask: Seq[UInt], index: UInt, valids: Seq[Bool]): (UInt, UInt) = {
      require(data.length == valids.length)
      require(data.length == mask.length)
      val muxLength = data.length
      val selDataMatrix = Wire(Vec(muxLength, Vec(2, UInt((VLEN * 2).W)))) // 3 * 2 * 256
      val selMaskMatrix = Wire(Vec(muxLength, Vec(2, UInt((VLENB * 2).W)))) // 3 * 2 * 16
      dontTouch(selDataMatrix)
      dontTouch(selMaskMatrix)
      for(i <- 0 until muxLength){
        if(i == 0){
          selDataMatrix(i)(0) := Cat(0.U(VLEN.W), data(i))
          selDataMatrix(i)(1) := Cat(data(i), 0.U(VLEN.W))
          selMaskMatrix(i)(0) := Cat(0.U(VLENB.W), mask(i))
          selMaskMatrix(i)(1) := Cat(mask(i), 0.U(VLENB.W))
        }
        else{
          selDataMatrix(i)(0) := Cat(data(i), data(0))
          selDataMatrix(i)(1) := Cat(data(0), data(i))
          selMaskMatrix(i)(0) := Cat(mask(i), mask(0))
          selMaskMatrix(i)(1) := Cat(mask(0), mask(i))
        }
      }
      val selIdxVec = (0 until muxLength).map(_.U)
      val selIdx    = PriorityMux(valids.reverse, selIdxVec.reverse)

      val selData = Mux(index === 0.U,
                        selDataMatrix(selIdx)(0),
                        selDataMatrix(selIdx)(1))
      val selMask = Mux(index === 0.U,
                        selMaskMatrix(selIdx)(0),
                        selMaskMatrix(selIdx)(1))
      (selData, selMask)
    }
  }
  def mergeDataByIndex(data:  UInt, mask: UInt, index: UInt): (UInt, UInt) = {
    mergeDataByIndex(Seq(data), Seq(mask), index, Seq(true.B))
  }
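
  // A minimal usage sketch (illustrative, hypothetical signal names): merging
  // a single 128-bit beat into one half of a 256-bit buffer.
  //   val (mergedData, mergedMask) = mergeDataByIndex(io.in.bits.data, io.in.bits.mask, idx)
  //   // idx = 0 places data/mask in the low half, idx != 0 in the high half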
}
abstract class VLSUModule(implicit p: Parameters) extends XSModule
  with HasVLSUParameters
  with HasCircularQueuePtrHelper
abstract class VLSUBundle(implicit p: Parameters) extends XSBundle
  with HasVLSUParameters

class VLSUBundleWithMicroOp(implicit p: Parameters) extends VLSUBundle {
  val uop = new DynInst
}

class OnlyVecExuOutput(implicit p: Parameters) extends VLSUBundle {
  val isvec = Bool()
  val vecdata = UInt(VLEN.W)
  val mask = UInt(VLENB.W)
  // val rob_idx_valid = Vec(2, Bool())
  // val inner_idx = Vec(2, UInt(3.W))
  // val rob_idx = Vec(2, new RobPtr)
  // val offset = Vec(2, UInt(4.W))
  val reg_offset = UInt(vOffsetBits.W)
  val vecActive = Bool() // 1: active vector element, 0: inactive vector element
  val is_first_ele = Bool()
  val elemIdx = UInt(elemIdxBits.W) // element index
  val elemIdxInsideVd = UInt(elemIdxBits.W) // element index within the scope of vd
  // val uopQueuePtr = new VluopPtr
  // val flowPtr = new VlflowPtr
}

class VecExuOutput(implicit p: Parameters) extends MemExuOutput with HasVLSUParameters {
  val vec = new OnlyVecExuOutput
  val alignedType       = UInt(alignTypeBits.W)
  // feedback
  val vecFeedback       = Bool()
}

// class VecStoreExuOutput(implicit p: Parameters) extends MemExuOutput with HasVLSUParameters {
//   val elemIdx = UInt(elemIdxBits.W)
//   val uopQueuePtr = new VsUopPtr
//   val fieldIdx = UInt(fieldBits.W)
//   val segmentIdx = UInt(elemIdxBits.W)
//   val vaddr = UInt(VAddrBits.W)
//   // pack
//   val isPackage         = Bool()
//   val packageNum        = UInt((log2Up(VLENB) + 1).W)
//   val originAlignedType = UInt(alignTypeBits.W)
//   val alignedType       = UInt(alignTypeBits.W)
// }

class VecUopBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp {
  val flowMask       = UInt(VLENB.W) // each bit for a flow
  val byteMask       = UInt(VLENB.W) // each bit for a byte
  val data           = UInt(VLEN.W)
  // val fof            = Bool() // fof is only used for vector loads
  val excp_eew_index = UInt(elemIdxBits.W)
  // val exceptionVec   = ExceptionVec() // uop has exceptionVec
  val baseAddr = UInt(VAddrBits.W)
  val stride = UInt(VLEN.W)
  val flow_counter = UInt(flowIdxBits.W)

  // instruction decode result
  val flowNum = UInt(flowIdxBits.W) // # of flows in a uop
  // val flowNumLog2 = UInt(log2Up(flowIdxBits).W) // log2(flowNum), for better timing of multiplication
  val nfields = UInt(fieldBits.W) // NFIELDS
  val vm = Bool() // whether vector masking is enabled
  val usWholeReg = Bool() // unit-stride, whole register load
  val usMaskReg = Bool() // unit-stride, masked store/load
  val eew = UInt(ewBits.W) // size of memory elements
  val sew = UInt(ewBits.W)
  val emul = UInt(mulBits.W)
  val lmul = UInt(mulBits.W)
  val vlmax = UInt(elemIdxBits.W)
  val instType = UInt(3.W)
  val vd_last_uop = Bool()
  val vd_first_uop = Bool()
}

class VecFlowBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp {
  val vaddr             = UInt(VAddrBits.W)
  val mask              = UInt(VLENB.W)
  val alignedType       = UInt(alignTypeBits.W)
  val vecActive         = Bool()
  val elemIdx           = UInt(elemIdxBits.W)
  val is_first_ele      = Bool()

  // pack
  val isPackage         = Bool()
  val packageNum        = UInt((log2Up(VLENB) + 1).W)
  val originAlignedType = UInt(alignTypeBits.W)
}

class VecMemExuOutput(isVector: Boolean = false)(implicit p: Parameters) extends VLSUBundle{
  val output = new MemExuOutput(isVector)
  val vecFeedback = Bool()
  val mmio = Bool()
  val usSecondInv = Bool()
  val elemIdx = UInt(elemIdxBits.W)
  val alignedType = UInt(alignTypeBits.W)
  val mbIndex     = UInt(vsmBindexBits.W)
  val mask        = UInt(VLENB.W)
  val vaddr       = UInt(VAddrBits.W)
}

object MulNum {
  def apply (mul: UInt): UInt = { // mul means emul or lmul
    (LookupTree(mul,List(
      "b101".U -> 1.U , // 1/8
      "b110".U -> 1.U , // 1/4
      "b111".U -> 1.U , // 1/2
      "b000".U -> 1.U , // 1
      "b001".U -> 2.U , // 2
      "b010".U -> 4.U , // 4
      "b011".U -> 8.U   // 8
    )))}
}
/**
  * When emul is greater than or equal to 1, the entire register needs to be written;
  * otherwise, only the specified number of bytes is written.
  */
object MulDataSize {
  def apply (mul: UInt): UInt = { // mul means emul or lmul
    (LookupTree(mul,List(
      "b101".U -> 2.U  , // 1/8
      "b110".U -> 4.U  , // 1/4
      "b111".U -> 8.U  , // 1/2
      "b000".U -> 16.U , // 1
      "b001".U -> 16.U , // 2
      "b010".U -> 16.U , // 4
      "b011".U -> 16.U   // 8
    )))}
}
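
// Example (illustrative): MulDataSize("b110".U) = 4.U, i.e. an emul of 1/4
// touches only 4 of the 16 bytes in a VLEN = 128 register, while any
// emul >= 1 fills the whole 16-byte register.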

object OneRegNum {
  def apply (eew: UInt): UInt = { // number of elements in one register, given eew
    (LookupTree(eew,List(
      "b000".U -> 16.U , // 1
      "b101".U -> 8.U , // 2
      "b110".U -> 4.U , // 4
      "b111".U -> 2.U   // 8
    )))}
}

// bytes of data read per element by indexed instructions
object SewDataSize {
  def apply (sew: UInt): UInt = {
    (LookupTree(sew,List(
      "b000".U -> 1.U , // 1
      "b001".U -> 2.U , // 2
      "b010".U -> 4.U , // 4
      "b011".U -> 8.U   // 8
    )))}
}

// bytes of data read per element by strided instructions
object EewDataSize {
  def apply (eew: UInt): UInt = {
    (LookupTree(eew,List(
      "b000".U -> 1.U , // 1
      "b101".U -> 2.U , // 2
      "b110".U -> 4.U , // 4
      "b111".U -> 8.U   // 8
    )))}
}

object loadDataSize {
  def apply (instType: UInt, emul: UInt, eew: UInt, sew: UInt): UInt = {
    (LookupTree(instType,List(
      "b000".U ->  MulDataSize(emul), // unit-stride
      "b010".U ->  EewDataSize(eew)  , // strided
      "b001".U ->  SewDataSize(sew)  , // indexed-unordered
      "b011".U ->  SewDataSize(sew)  , // indexed-ordered
      "b100".U ->  EewDataSize(eew)  , // segment unit-stride
      "b110".U ->  EewDataSize(eew)  , // segment strided
      "b101".U ->  SewDataSize(sew)  , // segment indexed-unordered
      "b111".U ->  SewDataSize(sew)    // segment indexed-ordered
    )))}
}

object storeDataSize {
  def apply (instType: UInt, eew: UInt, sew: UInt): UInt = {
    (LookupTree(instType,List(
      "b000".U ->  EewDataSize(eew)  , // unit-stride, do not use
      "b010".U ->  EewDataSize(eew)  , // strided
      "b001".U ->  SewDataSize(sew)  , // indexed-unordered
      "b011".U ->  SewDataSize(sew)  , // indexed-ordered
      "b100".U ->  EewDataSize(eew)  , // segment unit-stride
      "b110".U ->  EewDataSize(eew)  , // segment strided
      "b101".U ->  SewDataSize(sew)  , // segment indexed-unordered
      "b111".U ->  SewDataSize(sew)    // segment indexed-ordered
    )))}
}

object GenVecStoreMask {
  def apply (instType: UInt, eew: UInt, sew: UInt): UInt = {
    val mask = Wire(UInt(16.W))
    mask := UIntToOH(storeDataSize(instType = instType, eew = eew, sew = sew)) - 1.U
    mask
  }
}
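
// Example (illustrative): a strided store (instType = "b010") with a 32-bit
// eew ("b110") has storeDataSize = 4, so GenVecStoreMask yields
// UIntToOH(4) - 1 = 16'b0000_0000_0000_1111.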

/**
  * These are used to extract the index element used for address calculation
  * in indexed instructions.
  */
object EewEq8 {
  def apply(index:UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx,List(
      0.U  -> index(7 ,0   ),
      1.U  -> index(15,8   ),
      2.U  -> index(23,16  ),
      3.U  -> index(31,24  ),
      4.U  -> index(39,32  ),
      5.U  -> index(47,40  ),
      6.U  -> index(55,48  ),
      7.U  -> index(63,56  ),
      8.U  -> index(71,64  ),
      9.U  -> index(79,72  ),
      10.U -> index(87,80  ),
      11.U -> index(95,88  ),
      12.U -> index(103,96 ),
      13.U -> index(111,104),
      14.U -> index(119,112),
      15.U -> index(127,120)
    )))}
}

object EewEq16 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx, List(
      0.U -> index(15, 0),
      1.U -> index(31, 16),
      2.U -> index(47, 32),
      3.U -> index(63, 48),
      4.U -> index(79, 64),
      5.U -> index(95, 80),
      6.U -> index(111, 96),
      7.U -> index(127, 112)
    )))}
}

object EewEq32 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx, List(
      0.U -> index(31, 0),
      1.U -> index(63, 32),
      2.U -> index(95, 64),
      3.U -> index(127, 96)
    )))}
}

object EewEq64 {
  def apply (index: UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx, List(
      0.U -> index(63, 0),
      1.U -> index(127, 64)
    )))}
}

object IndexAddr {
  def apply (index: UInt, flow_inner_idx: UInt, eew: UInt): UInt = {
    (LookupTree(eew,List(
      "b000".U -> EewEq8 (index = index, flow_inner_idx = flow_inner_idx ), // index element is 1 byte // TODO: index may cross registers
      "b101".U -> EewEq16(index = index, flow_inner_idx = flow_inner_idx ), // index element is 2 bytes
      "b110".U -> EewEq32(index = index, flow_inner_idx = flow_inner_idx ), // index element is 4 bytes
      "b111".U -> EewEq64(index = index, flow_inner_idx = flow_inner_idx )  // index element is 8 bytes
    )))}
}
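
// Example (illustrative): IndexAddr with eew = "b101" (16-bit index elements)
// and flow_inner_idx = 2 selects index(47, 32), the third 16-bit element of
// the index register.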

object Log2Num {
  def apply (num: UInt): UInt = {
    (LookupTree(num,List(
      16.U -> 4.U,
      8.U  -> 3.U,
      4.U  -> 2.U,
      2.U  -> 1.U,
      1.U  -> 0.U
    )))}
}

object GenUopIdxInField {
  /**
   * Used for normal vector instructions.
   * */
  def apply (instType: UInt, emul: UInt, lmul: UInt, uopIdx: UInt): UInt = {
    val isIndexed = instType(0)
    val mulInField = Mux(
      isIndexed,
      Mux(lmul.asSInt > emul.asSInt, lmul, emul),
      emul
    )
    LookupTree(mulInField, List(
      "b101".U -> 0.U,
      "b110".U -> 0.U,
      "b111".U -> 0.U,
      "b000".U -> 0.U,
      "b001".U -> uopIdx(0),
      "b010".U -> uopIdx(1, 0),
      "b011".U -> uopIdx(2, 0)
    ))
  }
  /**
   * Only used for segment instructions.
   * */
  def apply (select: UInt, uopIdx: UInt): UInt = {
    LookupTree(select, List(
      "b101".U -> 0.U,
      "b110".U -> 0.U,
      "b111".U -> 0.U,
      "b000".U -> 0.U,
      "b001".U -> uopIdx(0),
      "b010".U -> uopIdx(1, 0),
      "b011".U -> uopIdx(2, 0)
    ))
  }
}
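
// Example (illustrative): an indexed load with lmul = 4 ("b010") and
// emul = 1 ("b000") has 4 uops per field, so GenUopIdxInField returns
// uopIdx(1, 0).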

// eew decode
object EewLog2 extends VLSUConstants {
  // def apply (eew: UInt): UInt = {
  //   (LookupTree(eew,List(
  //     "b000".U -> "b000".U , // 1
  //     "b101".U -> "b001".U , // 2
  //     "b110".U -> "b010".U , // 4
  //     "b111".U -> "b011".U   // 8
  //   )))}
  def apply(eew: UInt): UInt = ZeroExt(eew(1, 0), ewBits)
}

object GenRealFlowNum {
  /**
   * Unit-stride instructions do not use this method; other instructions
   * generate realFlowNum as EmulDataSize >> eew(1,0), where EmulDataSize is
   * the number of bytes that need to be written to the register and eew(1,0)
   * is the log2 of the number of bytes written at once.
   *
   * @param instType As the name implies.
   * @param emul As the name implies.
   * @param lmul As the name implies.
   * @param eew As the name implies.
   * @param sew As the name implies.
   * @param isSegment Only modules related to segment need to set this to true.
   * @return FlowNum of the instruction.
   */
  def apply (instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt, isSegment: Boolean = false): UInt = {
    require(instType.getWidth == 3, "The instType width must be 3, (isSegment, mop)")
    // The new SegmentUnit handles segment instructions, but the previous implementation
    // is retained for the time being as a fallback.
    val segmentIndexFlowNum =  if (isSegment) (MulDataSize(lmul) >> sew(1,0)).asUInt
                                else Mux(emul.asSInt > lmul.asSInt, (MulDataSize(emul) >> eew(1,0)).asUInt, (MulDataSize(lmul) >> sew(1,0)).asUInt)
    (LookupTree(instType,List(
      "b000".U ->  (MulDataSize(emul) >> eew(1,0)).asUInt, // unit-stride: used by stores, not loads
      "b010".U ->  (MulDataSize(emul) >> eew(1,0)).asUInt, // strided
      "b001".U ->  Mux(emul.asSInt > lmul.asSInt, (MulDataSize(emul) >> eew(1,0)).asUInt, (MulDataSize(lmul) >> sew(1,0)).asUInt), // indexed-unordered
      "b011".U ->  Mux(emul.asSInt > lmul.asSInt, (MulDataSize(emul) >> eew(1,0)).asUInt, (MulDataSize(lmul) >> sew(1,0)).asUInt), // indexed-ordered
      "b100".U ->  (MulDataSize(emul) >> eew(1,0)).asUInt, // segment unit-stride
      "b110".U ->  (MulDataSize(emul) >> eew(1,0)).asUInt, // segment strided
      "b101".U ->  segmentIndexFlowNum, // segment indexed-unordered
      "b111".U ->  segmentIndexFlowNum  // segment indexed-ordered
    )))}
}
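
// Example (illustrative): a strided load ("b010") with emul = 2 ("b001") and
// eew = 8-bit ("b000") gives MulDataSize(emul) >> 0 = 16 flows per uop.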

object GenRealFlowLog2 extends VLSUConstants {
  /**
   * GenRealFlowLog2 = Log2(GenRealFlowNum)
   *
   * @param instType As the name implies.
   * @param emul As the name implies.
   * @param lmul As the name implies.
   * @param eew As the name implies.
   * @param sew As the name implies.
   * @param isSegment Only modules related to segment need to set this to true.
   * @return FlowNumLog2 of the instruction.
   */
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt, isSegment: Boolean = false): UInt = {
    require(instType.getWidth == 3, "The instType width must be 3, (isSegment, mop)")
    val emulLog2 = Mux(emul.asSInt >= 0.S, 0.U, emul)
    val lmulLog2 = Mux(lmul.asSInt >= 0.S, 0.U, lmul)
    val eewRealFlowLog2 = emulLog2 + log2Up(VLENB).U - eew(1, 0)
    val sewRealFlowLog2 = lmulLog2 + log2Up(VLENB).U - sew(1, 0)
    // The new SegmentUnit handles segment instructions, but the previous implementation
    // is retained for the time being as a fallback.
    val segmentIndexFlowLog2 = if (isSegment) sewRealFlowLog2 else Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2)
    (LookupTree(instType, List(
      "b000".U -> eewRealFlowLog2, // unit-stride
      "b010".U -> eewRealFlowLog2, // strided
      "b001".U -> Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2), // indexed-unordered
      "b011".U -> Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2), // indexed-ordered
      "b100".U -> eewRealFlowLog2, // segment unit-stride
      "b110".U -> eewRealFlowLog2, // segment strided
      "b101".U -> segmentIndexFlowLog2, // segment indexed-unordered
      "b111".U -> segmentIndexFlowLog2  // segment indexed-ordered
    )))
  }
}

/**
  * GenElemIdx generates an element index within an instruction, given a certain
  * uopIdx and a known flowIdx inside the uop.
  */
object GenElemIdx extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt,
    uopIdx: UInt, flowIdx: UInt): UInt = {
    val isIndexed = instType(0).asBool
    val eewUopFlowsLog2 = Mux(emul.asSInt > 0.S, 0.U, emul) + log2Up(VLENB).U - eew(1, 0)
    val sewUopFlowsLog2 = Mux(lmul.asSInt > 0.S, 0.U, lmul) + log2Up(VLENB).U - sew(1, 0)
    val uopFlowsLog2 = Mux(
      isIndexed,
      Mux(emul.asSInt > lmul.asSInt, eewUopFlowsLog2, sewUopFlowsLog2),
      eewUopFlowsLog2
    )
    LookupTree(uopFlowsLog2, List(
      0.U -> uopIdx,
      1.U -> uopIdx ## flowIdx(0),
      2.U -> uopIdx ## flowIdx(1, 0),
      3.U -> uopIdx ## flowIdx(2, 0),
      4.U -> uopIdx ## flowIdx(3, 0)
    ))
  }
}
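
// Example (illustrative): unit-stride with emul = 1 ("b000") and eew = 32-bit
// ("b110"): uopFlowsLog2 = 0 + 4 - 2 = 2, so elemIdx = uopIdx ## flowIdx(1, 0).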

/**
  * GenVLMAX calculates VLMAX, which equals LMUL * VLEN / SEW.
  */
object GenVLMAXLog2 extends VLSUConstants {
  def apply(lmul: UInt, sew: UInt): UInt = lmul + log2Up(VLENB).U - sew
}
object GenVLMAX {
  def apply(lmul: UInt, sew: UInt): UInt = 1.U << GenVLMAXLog2(lmul, sew)
}
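
// Example (illustrative, VLEN = 128): lmul = 2 ("b001") and sew = 16-bit
// ("b001", log2 of bytes): GenVLMAXLog2 = 1 + 4 - 1 = 4, so VLMAX = 16
// elements, matching 2 * 128 / 16.
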
/**
 * Generate a mask based on vlmax.
 * Example: vlmax = b100 gives mask = b011.
 * */
object GenVlMaxMask{
  def apply(vlmax: UInt, length: Int): UInt = (vlmax - 1.U)(length-1, 0)
}

object GenUSWholeRegVL extends VLSUConstants {
  def apply(nfields: UInt, eew: UInt): UInt = {
    LookupTree(eew(1, 0), List(
      "b00".U -> (nfields << (log2Up(VLENB) - 0)),
      "b01".U -> (nfields << (log2Up(VLENB) - 1)),
      "b10".U -> (nfields << (log2Up(VLENB) - 2)),
      "b11".U -> (nfields << (log2Up(VLENB) - 3))
    ))
  }
}
object GenUSWholeEmul extends VLSUConstants{
  def apply(nf: UInt): UInt={
    LookupTree(nf,List(
      "b000".U -> "b000".U(mulBits.W),
      "b001".U -> "b001".U(mulBits.W),
      "b011".U -> "b010".U(mulBits.W),
      "b111".U -> "b011".U(mulBits.W)
    ))
  }
}

object GenUSMaskRegVL extends VLSUConstants {
  def apply(vl: UInt): UInt = {
    Mux(vl(2,0) === 0.U , (vl >> 3.U), ((vl >> 3.U) + 1.U))
  }
}
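
// Example (illustrative): vl = 9 gives (9 >> 3) + 1 = 2, i.e. a unit-stride
// mask access covering 9 elements touches 2 bytes of the mask register.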

object GenUopByteMask {
  def apply(flowMask: UInt, alignedType: UInt): UInt = {
    LookupTree(alignedType, List(
      "b000".U -> flowMask,
      "b001".U -> FillInterleaved(2, flowMask),
      "b010".U -> FillInterleaved(4, flowMask),
      "b011".U -> FillInterleaved(8, flowMask),
      "b100".U -> FillInterleaved(16, flowMask)
    ))
  }
}

object GenVdIdxInField extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, uopIdx: UInt): UInt = {
    val vdIdx = Wire(UInt(log2Up(maxMUL).W))
    when (instType(1,0) === "b00".U || instType(1,0) === "b10".U || lmul.asSInt > emul.asSInt) {
      // unit-stride or strided, or indexed with lmul > emul
      vdIdx := uopIdx
    }.otherwise {
      // indexed with lmul <= emul
      val multiple = emul - lmul
      vdIdx := LookupTree(multiple, List(
        0.U -> uopIdx,
        1.U -> (uopIdx >> 1),
        2.U -> (uopIdx >> 2),
        3.U -> (uopIdx >> 3)
      ))
    }
    vdIdx
  }
}
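
// Example (illustrative): an indexed instruction with emul = 4 ("b010") and
// lmul = 1 ("b000") has multiple = 2, so four consecutive uops share one vd:
// vdIdx = uopIdx >> 2.
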
/**
 * Use start and vl to generate a flow active mask.
 * mod = true: return the active elements within [start, vl)
 * mod = false: return the inactive elements within [start, vl)
 */
object GenFlowMask extends VLSUConstants {
  def apply(elementMask: UInt, start: UInt, vl: UInt , mod: Boolean): UInt = {
    val startMask = ~UIntToMask(start, VLEN)
    val vlMask = UIntToMask(vl, VLEN)
    val maskVlStart = vlMask & startMask
    if(mod){
      elementMask & maskVlStart
    }
    else{
      (~elementMask).asUInt & maskVlStart
    }
  }
}

object CheckAligned extends VLSUConstants {
  def apply(addr: UInt): UInt = {
    val aligned_16 = (addr(0) === 0.U) // 16-bit
    val aligned_32 = (addr(1,0) === 0.U) // 32-bit
    val aligned_64 = (addr(2,0) === 0.U) // 64-bit
    val aligned_128 = (addr(3,0) === 0.U) // 128-bit
    Cat(true.B, aligned_16, aligned_32, aligned_64, aligned_128)
  }
}
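
// Example (illustrative): addr = 0x8 yields b11110: aligned for 8/16/32/64-bit
// accesses (bit 4, 8-bit alignment, always holds) but not for 128-bit (bit 0).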

/**
  * Search whether the mask has 'len' consecutive '1' bits starting from the head.
  * mask: source mask
  * len: search length
  */
object GenPackMask{
  def leadX(mask: Seq[Bool], len: Int): Bool = {
    if(len == 1){
      mask.head
    }
    else{
      leadX(mask.drop(1),len-1) & mask.head
    }
  }
  def leadOneVec(shiftMask: Seq[Bool]): UInt = {
    // the max flow width is 128-bit, so the max number of flows to pack is 16
    val lead1 = leadX(shiftMask, 1) // 1 consecutive bit
    val lead2 = leadX(shiftMask, 2) // 2 consecutive bits
    val lead4 = leadX(shiftMask, 4) // 4 consecutive bits
    val lead8 = leadX(shiftMask, 8) // 8 consecutive bits
    val lead16 = leadX(shiftMask, 16) // 16 consecutive bits
    Cat(lead1, lead2, lead4, lead8, lead16)
  }

  def apply(shiftMask: UInt) = {
    // pack mask
    val packMask = leadOneVec(shiftMask.asBools)
    packMask
  }
}
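
// Example (illustrative): for shiftMask bits (LSB first) 1, 1, 1, 0, ...:
// lead1 and lead2 are set but lead4 is not, so at most two consecutive flows
// can be packed from this position.
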
/**
  * PackEnable = (LeadXVec >> eew) & alignedVec, where bit 0 represents the
  * ability to merge into a 128-bit flow, bit 1 into a 64-bit flow, and so on.
  *
  * example:
  *   addr = 0x0, activeMask = b00011100101111, flowIdx = 0, eew = 0 (8-bit)
  *
  *   step 0 : addrAlignedVec = (1, 1, 1, 1) elemIdxAligned = (1, 1, 1, 1)
  *   step 1 : activePackVec = (1, 1, 1, 0), inactivePackVec = (0, 0, 0, 0)
  *   step 2 : activePackEnable = (1, 1, 1, 0), inactivePackVec = (0, 0, 0, 0)
  *
  *   so we can pack 4 active 8-bit flows into one 32-bit flow.
  */
object GenPackVec extends VLSUConstants{
  def apply(addr: UInt, shiftMask: UInt, eew: UInt, elemIdx: UInt): UInt = {
    val addrAlignedVec = CheckAligned(addr)
    val elemIdxAligned = CheckAligned(elemIdx)
    val packMask = GenPackMask(shiftMask)
    // generate packVec
    val packVec = addrAlignedVec & elemIdxAligned & (packMask.asUInt >> eew)

    packVec
  }
}

object GenPackAlignedType extends VLSUConstants{
  def apply(packVec: UInt): UInt = {
    val packAlignedType = PriorityMux(Seq(
      packVec(0) -> "b100".U,
      packVec(1) -> "b011".U,
      packVec(2) -> "b010".U,
      packVec(3) -> "b001".U,
      packVec(4) -> "b000".U
    ))
    packAlignedType
  }
}

object GenPackNum extends VLSUConstants{
  def apply(alignedType: UInt, packAlignedType: UInt): UInt = {
    (1.U << (packAlignedType - alignedType)).asUInt
  }
}
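
// Example (illustrative): if the current alignedType is 8-bit ("b000") and
// GenPackAlignedType selects 32-bit ("b010"), GenPackNum = 1 << 2 = 4 flows
// are packed into one access.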

object genVWmask128 {
  def apply(addr: UInt, sizeEncode: UInt): UInt = {
    (LookupTree(sizeEncode, List(
      "b000".U -> 0x1.U, // 0001 << addr(3:0)
      "b001".U -> 0x3.U, // 0011
      "b010".U -> 0xf.U, // 1111
      "b011".U -> 0xff.U, // 11111111
      "b100".U -> 0xffff.U // 1111111111111111
    )) << addr(3, 0)).asUInt
  }
}
/*
 * Only used when the maximum access width is 128 bits.
 */
object genVWdata {
  def apply(data: UInt, sizeEncode: UInt): UInt = {
    LookupTree(sizeEncode, List(
      "b000".U -> Fill(16, data(7, 0)),
      "b001".U -> Fill(8, data(15, 0)),
      "b010".U -> Fill(4, data(31, 0)),
      "b011".U -> Fill(2, data(63,0)),
      "b100".U -> data(127,0)
    ))
  }
}

object genUSSplitAddr{
  def apply(addr: UInt, index: UInt): UInt = {
    val tmpAddr = Cat(addr(38, 4), 0.U(4.W))
    val nextCacheline = tmpAddr + 16.U
    LookupTree(index, List(
      0.U -> tmpAddr,
      1.U -> nextCacheline
    ))
  }
}

object genUSSplitMask{
  def apply(mask: UInt, index: UInt, addrOffset: UInt): UInt = {
    val tmpMask = Cat(0.U(16.W), mask) << addrOffset // 32 bits
    LookupTree(index, List(
      0.U -> tmpMask(15, 0),
      1.U -> tmpMask(31, 16)
    ))
  }
}

object genUSSplitData{
  def apply(data: UInt, index: UInt, addrOffset: UInt): UInt = {
    val tmpData = WireInit(0.U(256.W))
    val lookupTable = (0 until 16).map{case i =>
      if(i == 0){
        i.U -> Cat(0.U(128.W), data)
      }else{
        i.U -> Cat(0.U(((16-i)*8).W), data, 0.U((i*8).W))
      }
    }
    tmpData := LookupTree(addrOffset, lookupTable).asUInt

    LookupTree(index, List(
      0.U -> tmpData(127, 0),
      1.U -> tmpData(255, 128)
    ))
  }
}
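
// Example (illustrative): addrOffset = 4 shifts the data up by 4 bytes inside
// a 256-bit window; index = 0 then returns the bytes falling in the first
// 16-byte line and index = 1 the bytes spilling into the next line.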

object genVSData extends VLSUConstants {
  def apply(data: UInt, elemIdx: UInt, alignedType: UInt): UInt = {
    LookupTree(alignedType, List(
      "b000".U -> ZeroExt(LookupTree(elemIdx(3, 0), List.tabulate(VLEN/8)(i => i.U -> getByte(data, i))), VLEN),
      "b001".U -> ZeroExt(LookupTree(elemIdx(2, 0), List.tabulate(VLEN/16)(i => i.U -> getHalfWord(data, i))), VLEN),
      "b010".U -> ZeroExt(LookupTree(elemIdx(1, 0), List.tabulate(VLEN/32)(i => i.U -> getWord(data, i))), VLEN),
      "b011".U -> ZeroExt(LookupTree(elemIdx(0), List.tabulate(VLEN/64)(i => i.U -> getDoubleWord(data, i))), VLEN),
      "b100".U -> data // a wider element would break this
    ))
  }
}

// TODO: more elegant
object genVStride extends VLSUConstants {
  def apply(uopIdx: UInt, stride: UInt): UInt = {
    LookupTree(uopIdx, List(
      0.U -> 0.U,
      1.U -> stride,
      2.U -> (stride << 1),
      3.U -> ((stride << 1).asUInt + stride),
      4.U -> (stride << 2),
      5.U -> ((stride << 2).asUInt + stride),
      6.U -> ((stride << 2).asUInt + (stride << 1)),
      7.U -> ((stride << 2).asUInt + (stride << 1) + stride)
    ))
  }
}
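
// Example (illustrative): genVStride(5.U, stride) = (stride << 2) + stride =
// 5 * stride, the base offset of the sixth uop of a strided access.
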
/**
 * Generate uopOffset; not used by segment instructions.
 * */
object genVUopOffset extends VLSUConstants {
  def apply(instType: UInt, isfof: Bool, uopidx: UInt, nf: UInt, eew: UInt, stride: UInt, alignedType: UInt): UInt = {
    val uopInsidefield = (uopidx >> nf).asUInt // when nf == 0, this equals uopidx

    val fofVUopOffset = (LookupTree(instType,List(
      "b000".U -> ( genVStride(uopInsidefield, stride) << (log2Up(VLENB).U - eew)   ) , // unit-stride fof
      "b100".U -> ( genVStride(uopInsidefield, stride) << (log2Up(VLENB).U - eew)   )   // segment unit-stride fof
    ))).asUInt

    val otherVUopOffset = (LookupTree(instType,List(
      "b000".U -> ( uopInsidefield << alignedType                                   ) , // unit-stride
      "b010".U -> ( genVStride(uopInsidefield, stride) << (log2Up(VLENB).U - eew)   ) , // strided
      "b001".U -> ( 0.U                                                             ) , // indexed-unordered
      "b011".U -> ( 0.U                                                             ) , // indexed-ordered
      "b100".U -> ( uopInsidefield << alignedType                                   ) , // segment unit-stride
      "b110".U -> ( genVStride(uopInsidefield, stride) << (log2Up(VLENB).U - eew)   ) , // segment strided
      "b101".U -> ( 0.U                                                             ) , // segment indexed-unordered
      "b111".U -> ( 0.U                                                             )   // segment indexed-ordered
    ))).asUInt

    Mux(isfof, fofVUopOffset, otherVUopOffset)
  }
}
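
// Example (illustrative): a non-fof unit-stride uop with nf = 0 and
// alignedType = "b100" (128-bit flows) gets offset = uopIdx << 4, i.e. each
// uop starts 16 bytes after the previous one.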

object genVFirstUnmask extends VLSUConstants {
  /**
   * Find the index of the lowest set (unmasked) bit.
   * example:
   *   mask = 16'b1111_1111_1110_0000
   *   return 5
   * @param mask 16 bits of mask.
   * @return index of the lowest set bit.
   */
  def apply(mask: UInt): UInt = {
    require(mask.getWidth == 16, "The mask width must be 16")
    val select = (0 until 16).zip(mask.asBools).map{case (i, v) =>
      (v, i.U)
    }
    PriorityMuxDefault(select, 0.U)
  }

  def apply(mask: UInt, regOffset: UInt): UInt = {
    require(mask.getWidth == 16, "The mask width must be 16")
    val realMask = (mask >> regOffset).asUInt
    val select = (0 until 16).zip(realMask.asBools).map{case (i, v) =>
      (v, i.U)
    }
    PriorityMuxDefault(select, 0.U)
  }
}
949