// XiangShan/src/main/scala/xiangshan/mem/vector/VecCommon.scala (revision 26af847e669bb208507278eafc6ebe52f03b0d19)
/***************************************************************************************
  * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
  * Copyright (c) 2020-2021 Peng Cheng Laboratory
  *
  * XiangShan is licensed under Mulan PSL v2.
  * You can use this software according to the terms and conditions of the Mulan PSL v2.
  * You may obtain a copy of Mulan PSL v2 at:
  *          http://license.coscl.org.cn/MulanPSL2
  *
  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  *
  * See the Mulan PSL v2 for more details.
  ***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles._
import xiangshan.backend.fu.FuType

/**
  * Commonly used parameters and functions in the VLSU
  */
trait VLSUConstants {
  val VLEN = 128
  // for packing unit-stride flows
  val AlignedNum = 4 // 1/2/4/8
  def VLENB = VLEN/8
  def vOffsetBits = log2Up(VLENB) // bit width to index an offset inside a vector reg
  lazy val vlmBindexBits = 8 // will be overridden later
  lazy val vsmBindexBits = 8 // will be overridden later

  def alignTypes = 5 // eew/sew = 1/2/4/8 bytes; the last one indicates a 128-bit element
  def alignTypeBits = log2Up(alignTypes)
  def maxMUL = 8
  def maxFields = 8
  /**
    * In the most extreme case, e.g. a segment indexed instruction with eew=64, emul=8, sew=8, lmul=1,
    * and nf=8, each data reg is mapped to 8 index regs and there are 8 data regs in total,
    * one for each field. Therefore an instruction can be divided into 64 uops at most.
    */
  def maxUopNum = maxMUL * maxFields // 64
  def maxFlowNum = 16
  def maxElemNum = maxMUL * maxFlowNum // 128
  // def uopIdxBits = log2Up(maxUopNum) // to index a uop inside a robIdx
  def elemIdxBits = log2Up(maxElemNum) + 1 // to index an element within an instruction
  def flowIdxBits = log2Up(maxFlowNum) + 1 // to index a flow within a uop
  def fieldBits = log2Up(maxFields) + 1 // 4 bits, to encode 1~8

  def ewBits = 3 // bit width of EEW/SEW
  def mulBits = 3 // bit width of emul/lmul

  def getSlice(data: UInt, i: Int, alignBits: Int): UInt = {
    require(data.getWidth >= (i+1) * alignBits)
    data((i+1) * alignBits - 1, i * alignBits)
  }

  def getByte(data: UInt, i: Int = 0) = getSlice(data, i, 8)
  def getHalfWord(data: UInt, i: Int = 0) = getSlice(data, i, 16)
  def getWord(data: UInt, i: Int = 0) = getSlice(data, i, 32)
  def getDoubleWord(data: UInt, i: Int = 0) = getSlice(data, i, 64)
  def getDoubleDoubleWord(data: UInt, i: Int = 0) = getSlice(data, i, 128)
}
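
// Illustrative note (not in the original source): getSlice carves out the
// i-th aligned element of the given width, so for a 128-bit `data`,
// getByte(data, 5) returns data(47, 40) and getWord(data, 2) returns
// data(95, 64).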

trait HasVLSUParameters extends HasXSParameter with VLSUConstants {
  override val VLEN = coreParams.VLEN
  override lazy val vlmBindexBits = log2Up(coreParams.VlMergeBufferSize)
  override lazy val vsmBindexBits = log2Up(coreParams.VsMergeBufferSize)
  def isUnitStride(instType: UInt) = instType(1, 0) === "b00".U
  def isStrided(instType: UInt) = instType(1, 0) === "b10".U
  def isIndexed(instType: UInt) = instType(0) === "b1".U
  def isNotIndexed(instType: UInt) = instType(0) === "b0".U
  def isSegment(instType: UInt) = instType(2) === "b1".U
  def is128Bit(alignedType: UInt) = alignedType(2) === "b1".U

  def mergeDataWithMask(oldData: UInt, newData: UInt, mask: UInt): Vec[UInt] = {
    require(oldData.getWidth == newData.getWidth)
    require(oldData.getWidth == mask.getWidth * 8)
    VecInit(mask.asBools.zipWithIndex.map { case (en, i) =>
      Mux(en, getByte(newData, i), getByte(oldData, i))
    })
  }
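
  // Illustrative example (assumed values): with 4-byte operands and
  // mask = "b0011".U, bytes 0 and 1 of the result come from newData while
  // bytes 2 and 3 keep oldData; each mask bit enables the byte at its index.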

  // def asBytes(data: UInt) = {
  //   require(data.getWidth % 8 == 0)
  //   (0 until data.getWidth/8).map(i => getByte(data, i))
  // }

  def mergeDataWithElemIdx(
    oldData: UInt,
    newData: Seq[UInt],
    alignedType: UInt,
    elemIdx: Seq[UInt],
    valids: Seq[Bool]
  ): UInt = {
    require(newData.length == elemIdx.length)
    require(newData.length == valids.length)
    LookupTree(alignedType, List(
      "b00".U -> VecInit(elemIdx.map(e => UIntToOH(e(3, 0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getByte(oldData, i) +: newData.map(getByte(_))
        )}).asUInt,
      "b01".U -> VecInit(elemIdx.map(e => UIntToOH(e(2, 0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getHalfWord(oldData, i) +: newData.map(getHalfWord(_))
        )}).asUInt,
      "b10".U -> VecInit(elemIdx.map(e => UIntToOH(e(1, 0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getWord(oldData, i) +: newData.map(getWord(_))
        )}).asUInt,
      "b11".U -> VecInit(elemIdx.map(e => UIntToOH(e(0)).asBools).transpose.zipWithIndex.map { case (selVec, i) =>
        ParallelPosteriorityMux(
          true.B +: selVec.zip(valids).map(x => x._1 && x._2),
          getDoubleWord(oldData, i) +: newData.map(getDoubleWord(_))
        )}).asUInt
    ))
  }

  def mergeDataWithElemIdx(oldData: UInt, newData: UInt, alignedType: UInt, elemIdx: UInt): UInt = {
    mergeDataWithElemIdx(oldData, Seq(newData), alignedType, Seq(elemIdx), Seq(true.B))
  }
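
  // Illustrative example (assumed values): with alignedType = "b00".U (8-bit
  // elements), merging newData at elemIdx = 5.U replaces byte 5 of oldData
  // (elemIdx(3, 0) is one-hot decoded to pick the byte lane) and keeps the
  // other 15 bytes.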
  /**
    * Merge 128-bit unit-stride data by byte offset.
    */
  object mergeDataByoffset {
    def apply(oldData: UInt, newData: Seq[UInt], mask: Seq[UInt], offset: Seq[UInt], valids: Seq[Bool]): UInt = {
      require(newData.length == valids.length)
      require(newData.length == offset.length)
      // if (i >= offset[k] && mask[k][i] == 1 && valids[k]) -> newData, else -> oldData
      val selVec = (mask zip offset).map{ case (m, e) =>
        ((~UIntToMask(e, VLENB)).asBools.zip(m.asBools).map(x => x._1 && x._2))}.transpose // one select vector per byte

      VecInit(selVec.zipWithIndex.map{ case (selV, i) => // 0 <= i < VLENB
        ParallelPosteriorityMux(
          true.B +: selV.zip(valids).map(x => x._1 && x._2),
          getByte(oldData, i) +: newData.map(getByte(_))
        )}).asUInt
    }
  }
  def mergeDataByoffset(oldData: UInt, newData: UInt, mask: UInt, offset: UInt): UInt = {
    mergeDataByoffset(oldData, Seq(newData), Seq(mask), Seq(offset), Seq(true.B))
  }
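
  // Illustrative example (assumed values): with offset = 4.U and
  // mask = "hfff0".U, bytes 0..3 of the result keep oldData while bytes 4..15
  // are taken from newData, matching the (i >= offset && mask(i)) condition above.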
}

abstract class VLSUModule(implicit p: Parameters) extends XSModule
  with HasVLSUParameters
  with HasCircularQueuePtrHelper
abstract class VLSUBundle(implicit p: Parameters) extends XSBundle
  with HasVLSUParameters

class VLSUBundleWithMicroOp(implicit p: Parameters) extends VLSUBundle {
  val uop = new DynInst
}

class OnlyVecExuOutput(implicit p: Parameters) extends VLSUBundle {
  val isvec = Bool()
  val vecdata = UInt(VLEN.W)
  val mask = UInt(VLENB.W)
  // val rob_idx_valid = Vec(2, Bool())
  // val inner_idx = Vec(2, UInt(3.W))
  // val rob_idx = Vec(2, new RobPtr)
  // val offset = Vec(2, UInt(4.W))
  val reg_offset = UInt(vOffsetBits.W)
  val vecActive = Bool() // 1: active vector element, 0: inactive vector element
  val is_first_ele = Bool()
  val elemIdx = UInt(elemIdxBits.W) // element index within the instruction
  val elemIdxInsideVd = UInt(elemIdxBits.W) // element index within vd
  // val uopQueuePtr = new VluopPtr
  // val flowPtr = new VlflowPtr
}

class VecExuOutput(implicit p: Parameters) extends MemExuOutput with HasVLSUParameters {
  val vec = new OnlyVecExuOutput
  // pack
  val isPackage         = Bool()
  val packageNum        = UInt((log2Up(VLENB) + 1).W)
  val originAlignedType = UInt(alignTypeBits.W)
  val alignedType       = UInt(alignTypeBits.W)
  // feedback
  val vecFeedback       = Bool()
}

// class VecStoreExuOutput(implicit p: Parameters) extends MemExuOutput with HasVLSUParameters {
//   val elemIdx = UInt(elemIdxBits.W)
//   val uopQueuePtr = new VsUopPtr
//   val fieldIdx = UInt(fieldBits.W)
//   val segmentIdx = UInt(elemIdxBits.W)
//   val vaddr = UInt(VAddrBits.W)
//   // pack
//   val isPackage         = Bool()
//   val packageNum        = UInt((log2Up(VLENB) + 1).W)
//   val originAlignedType = UInt(alignTypeBits.W)
//   val alignedType       = UInt(alignTypeBits.W)
// }

class VecUopBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp {
  val flowMask       = UInt(VLENB.W) // each bit for a flow
  val byteMask       = UInt(VLENB.W) // each bit for a byte
  val data           = UInt(VLEN.W)
  // val fof            = Bool() // fof is only used for vector loads
  val excp_eew_index = UInt(elemIdxBits.W)
  // val exceptionVec   = ExceptionVec() // uop has exceptionVec
  val baseAddr = UInt(VAddrBits.W)
  val stride = UInt(VLEN.W)
  val flow_counter = UInt(flowIdxBits.W)

  // instruction decode result
  val flowNum = UInt(flowIdxBits.W) // # of flows in a uop
  // val flowNumLog2 = UInt(log2Up(flowIdxBits).W) // log2(flowNum), for better timing of multiplication
  val nfields = UInt(fieldBits.W) // NFIELDS
  val vm = Bool() // whether vector masking is enabled
  val usWholeReg = Bool() // unit-stride, whole register load
  val usMaskReg = Bool() // unit-stride, masked store/load
  val eew = UInt(ewBits.W) // size of memory elements
  val sew = UInt(ewBits.W)
  val emul = UInt(mulBits.W)
  val lmul = UInt(mulBits.W)
  val vlmax = UInt(elemIdxBits.W)
  val instType = UInt(3.W)
  val vd_last_uop = Bool()
  val vd_first_uop = Bool()
}

class VecFlowBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp {
  val vaddr             = UInt(VAddrBits.W)
  val mask              = UInt(VLENB.W)
  val alignedType       = UInt(alignTypeBits.W)
  val vecActive         = Bool()
  val elemIdx           = UInt(elemIdxBits.W)
  val is_first_ele      = Bool()

  // pack
  val isPackage         = Bool()
  val packageNum        = UInt((log2Up(VLENB) + 1).W)
  val originAlignedType = UInt(alignTypeBits.W)
}

class VecMemExuOutput(isVector: Boolean = false)(implicit p: Parameters) extends VLSUBundle {
  val output = new MemExuOutput(isVector)
  val vecFeedback = Bool()
  val mmio = Bool()
  val usSecondInv = Bool()
  val elemIdx = UInt(elemIdxBits.W)
  val alignedType = UInt(alignTypeBits.W)
}

object MulNum {
  def apply (mul: UInt): UInt = { // mul means emul or lmul
    (LookupTree(mul, List(
      "b101".U -> 1.U, // 1/8
      "b110".U -> 1.U, // 1/4
      "b111".U -> 1.U, // 1/2
      "b000".U -> 1.U, // 1
      "b001".U -> 2.U, // 2
      "b010".U -> 4.U, // 4
      "b011".U -> 8.U  // 8
    )))}
}
/**
  * When emul is greater than or equal to 1, the entire register needs to be written;
  * otherwise, only the specified number of bytes is written. */
object MulDataSize {
  def apply (mul: UInt): UInt = { // mul means emul or lmul
    (LookupTree(mul, List(
      "b101".U -> 2.U,  // 1/8
      "b110".U -> 4.U,  // 1/4
      "b111".U -> 8.U,  // 1/2
      "b000".U -> 16.U, // 1
      "b001".U -> 16.U, // 2
      "b010".U -> 16.U, // 4
      "b011".U -> 16.U  // 8
    )))}
}
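
// Worked examples (values for illustration): MulDataSize("b110".U) = 4.U,
// since an emul of 1/4 only fills a quarter of the 16-byte register, while
// any emul >= 1 ("b000".U to "b011".U) writes all 16 bytes of one register.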

object OneRegNum {
  def apply (eew: UInt): UInt = { // number of elements that fit in one vector register
    (LookupTree(eew, List(
      "b000".U -> 16.U, // 1-byte elements
      "b101".U -> 8.U,  // 2-byte elements
      "b110".U -> 4.U,  // 4-byte elements
      "b111".U -> 2.U   // 8-byte elements
    )))}
}

// bytes of data accessed per element for indexed instructions (determined by SEW)
object SewDataSize {
  def apply (sew: UInt): UInt = {
    (LookupTree(sew, List(
      "b000".U -> 1.U, // 1
      "b001".U -> 2.U, // 2
      "b010".U -> 4.U, // 4
      "b011".U -> 8.U  // 8
    )))}
}

// bytes of data accessed per element for unit-stride/strided instructions (determined by EEW)
object EewDataSize {
  def apply (eew: UInt): UInt = {
    (LookupTree(eew, List(
      "b000".U -> 1.U, // 1
      "b101".U -> 2.U, // 2
      "b110".U -> 4.U, // 4
      "b111".U -> 8.U  // 8
    )))}
}

object loadDataSize {
  def apply (instType: UInt, emul: UInt, eew: UInt, sew: UInt): UInt = {
    (LookupTree(instType, List(
      "b000".U -> MulDataSize(emul), // unit-stride
      "b010".U -> EewDataSize(eew),  // strided
      "b001".U -> SewDataSize(sew),  // indexed-unordered
      "b011".U -> SewDataSize(sew),  // indexed-ordered
      "b100".U -> EewDataSize(eew),  // segment unit-stride
      "b110".U -> EewDataSize(eew),  // segment strided
      "b101".U -> SewDataSize(sew),  // segment indexed-unordered
      "b111".U -> SewDataSize(sew)   // segment indexed-ordered
    )))}
}

object storeDataSize {
  def apply (instType: UInt, eew: UInt, sew: UInt): UInt = {
    (LookupTree(instType, List(
      "b000".U -> EewDataSize(eew),  // unit-stride, do not use
      "b010".U -> EewDataSize(eew),  // strided
      "b001".U -> SewDataSize(sew),  // indexed-unordered
      "b011".U -> SewDataSize(sew),  // indexed-ordered
      "b100".U -> EewDataSize(eew),  // segment unit-stride
      "b110".U -> EewDataSize(eew),  // segment strided
      "b101".U -> SewDataSize(sew),  // segment indexed-unordered
      "b111".U -> SewDataSize(sew)   // segment indexed-ordered
    )))}
}

object GenVecStoreMask {
  def apply (instType: UInt, eew: UInt, sew: UInt): UInt = {
    val mask = Wire(UInt(16.W))
    mask := UIntToOH(storeDataSize(instType = instType, eew = eew, sew = sew)) - 1.U
    mask
  }
}
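
// Illustrative example (assumed values): for a strided store (instType =
// "b010".U) with eew = "b110".U (32-bit), storeDataSize is 4, so the mask is
// UIntToOH(4.U) - 1.U = "b1111".U, enabling the low four bytes.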

/**
  * These extract the immediate address offset for indexed instructions. */
object EewEq8 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx, List(
      0.U  -> index(7, 0),
      1.U  -> index(15, 8),
      2.U  -> index(23, 16),
      3.U  -> index(31, 24),
      4.U  -> index(39, 32),
      5.U  -> index(47, 40),
      6.U  -> index(55, 48),
      7.U  -> index(63, 56),
      8.U  -> index(71, 64),
      9.U  -> index(79, 72),
      10.U -> index(87, 80),
      11.U -> index(95, 88),
      12.U -> index(103, 96),
      13.U -> index(111, 104),
      14.U -> index(119, 112),
      15.U -> index(127, 120)
    )))}
}

object EewEq16 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx, List(
      0.U -> index(15, 0),
      1.U -> index(31, 16),
      2.U -> index(47, 32),
      3.U -> index(63, 48),
      4.U -> index(79, 64),
      5.U -> index(95, 80),
      6.U -> index(111, 96),
      7.U -> index(127, 112)
    )))}
}

object EewEq32 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx, List(
      0.U -> index(31, 0),
      1.U -> index(63, 32),
      2.U -> index(95, 64),
      3.U -> index(127, 96)
    )))}
}

object EewEq64 {
  def apply (index: UInt, flow_inner_idx: UInt): UInt = {
    (LookupTree(flow_inner_idx, List(
      0.U -> index(63, 0),
      1.U -> index(127, 64)
    )))}
}

object IndexAddr {
  def apply (index: UInt, flow_inner_idx: UInt, eew: UInt): UInt = {
    (LookupTree(eew, List(
      "b000".U -> EewEq8 (index = index, flow_inner_idx = flow_inner_idx), // Imm is 1 Byte // TODO: the index may cross registers
      "b101".U -> EewEq16(index = index, flow_inner_idx = flow_inner_idx), // Imm is 2 Bytes
      "b110".U -> EewEq32(index = index, flow_inner_idx = flow_inner_idx), // Imm is 4 Bytes
      "b111".U -> EewEq64(index = index, flow_inner_idx = flow_inner_idx)  // Imm is 8 Bytes
    )))}
}
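
// Illustrative example (assumed values): with eew = "b101".U (16-bit indices)
// and flow_inner_idx = 2.U, IndexAddr selects index(47, 32), i.e. the third
// 16-bit immediate offset held in the index register.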

object Log2Num {
  def apply (num: UInt): UInt = {
    (LookupTree(num, List(
      16.U -> 4.U,
      8.U  -> 3.U,
      4.U  -> 2.U,
      2.U  -> 1.U,
      1.U  -> 0.U
    )))}
}

object GenUopIdxInField {
  def apply (instType: UInt, emul: UInt, lmul: UInt, uopIdx: UInt): UInt = {
    val isIndexed = instType(0)
    val mulInField = Mux(
      isIndexed,
      Mux(lmul.asSInt > emul.asSInt, lmul, emul),
      emul
    )
    LookupTree(mulInField, List(
      "b101".U -> 0.U,
      "b110".U -> 0.U,
      "b111".U -> 0.U,
      "b000".U -> 0.U,
      "b001".U -> uopIdx(0),
      "b010".U -> uopIdx(1, 0),
      "b011".U -> uopIdx(2, 0)
    ))
  }
}
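
// Illustrative example (assumed values): for a strided access the field-local
// mul is emul; with emul = "b010".U (4 uops per field) and uopIdx = 6.U, the
// uop index within the field is uopIdx(1, 0) = 2.U.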

// eew decode
object EewLog2 extends VLSUConstants {
  // def apply (eew: UInt): UInt = {
  //   (LookupTree(eew, List(
  //     "b000".U -> "b000".U, // 1
  //     "b101".U -> "b001".U, // 2
  //     "b110".U -> "b010".U, // 4
  //     "b111".U -> "b011".U  // 8
  //   )))}
  def apply(eew: UInt): UInt = ZeroExt(eew(1, 0), ewBits)
}

/**
  * Unit-stride instructions don't use this method;
  * other instructions generate realFlowNum as EmulDataSize >> eew(1,0), where
  * EmulDataSize is the number of bytes that need to be written to the register
  * and eew(1,0) is the log2 of the number of bytes written at once. */
object GenRealFlowNum {
  def apply (instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt): UInt = {
    (LookupTree(instType, List(
      "b000".U -> (MulDataSize(emul) >> eew(1,0)).asUInt, // unit-stride (used by stores, not loads)
      "b010".U -> (MulDataSize(emul) >> eew(1,0)).asUInt, // strided
      "b001".U -> Mux(emul.asSInt > lmul.asSInt, (MulDataSize(emul) >> eew(1,0)).asUInt, (MulDataSize(lmul) >> sew(1,0)).asUInt), // indexed-unordered
      "b011".U -> Mux(emul.asSInt > lmul.asSInt, (MulDataSize(emul) >> eew(1,0)).asUInt, (MulDataSize(lmul) >> sew(1,0)).asUInt), // indexed-ordered
      "b100".U -> (MulDataSize(emul) >> eew(1,0)).asUInt, // segment unit-stride
      "b110".U -> (MulDataSize(emul) >> eew(1,0)).asUInt, // segment strided
      "b101".U -> Mux(emul.asSInt > lmul.asSInt, (MulDataSize(emul) >> eew(1,0)).asUInt, (MulDataSize(lmul) >> sew(1,0)).asUInt), // segment indexed-unordered
      "b111".U -> Mux(emul.asSInt > lmul.asSInt, (MulDataSize(emul) >> eew(1,0)).asUInt, (MulDataSize(lmul) >> sew(1,0)).asUInt)  // segment indexed-ordered
    )))}
}
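
// Worked example (values for illustration): a strided access (instType =
// "b010".U) with emul = "b001".U and eew = "b110".U (32-bit) yields
// MulDataSize(emul) >> eew(1, 0) = 16.U >> 2 = 4.U flows per uop.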

/**
  * GenRealFlowLog2 = Log2(GenRealFlowNum)
  */
object GenRealFlowLog2 extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt): UInt = {
    val emulLog2 = Mux(emul.asSInt >= 0.S, 0.U, emul)
    val lmulLog2 = Mux(lmul.asSInt >= 0.S, 0.U, lmul)
    val eewRealFlowLog2 = emulLog2 + log2Up(VLENB).U - eew(1, 0)
    val sewRealFlowLog2 = lmulLog2 + log2Up(VLENB).U - sew(1, 0)
    (LookupTree(instType, List(
      "b000".U -> eewRealFlowLog2, // unit-stride
      "b010".U -> eewRealFlowLog2, // strided
      "b001".U -> Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2), // indexed-unordered
      "b011".U -> Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2), // indexed-ordered
      "b100".U -> eewRealFlowLog2, // segment unit-stride
      "b110".U -> eewRealFlowLog2, // segment strided
      "b101".U -> Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2), // segment indexed-unordered
      "b111".U -> Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2)  // segment indexed-ordered
    )))
  }
}

/**
  * GenElemIdx generates an element index within an instruction, given a certain uopIdx and a known flowIdx
  * inside the uop.
  */
object GenElemIdx extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt,
    uopIdx: UInt, flowIdx: UInt): UInt = {
    val isIndexed = instType(0).asBool
    val eewUopFlowsLog2 = Mux(emul.asSInt > 0.S, 0.U, emul) + log2Up(VLENB).U - eew(1, 0)
    val sewUopFlowsLog2 = Mux(lmul.asSInt > 0.S, 0.U, lmul) + log2Up(VLENB).U - sew(1, 0)
    val uopFlowsLog2 = Mux(
      isIndexed,
      Mux(emul.asSInt > lmul.asSInt, eewUopFlowsLog2, sewUopFlowsLog2),
      eewUopFlowsLog2
    )
    LookupTree(uopFlowsLog2, List(
      0.U -> uopIdx,
      1.U -> uopIdx ## flowIdx(0),
      2.U -> uopIdx ## flowIdx(1, 0),
      3.U -> uopIdx ## flowIdx(2, 0),
      4.U -> uopIdx ## flowIdx(3, 0)
    ))
  }
}
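
// Illustrative example (assumed values): a strided access with emul = "b001".U
// and eew = "b110".U (32-bit) has uopFlowsLog2 = 0 + 4 - 2 = 2, so the element
// index is uopIdx ## flowIdx(1, 0); e.g. uopIdx = 1.U, flowIdx = 3.U gives 7.U.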

/**
  * GenVLMAX calculates VLMAX, which equals LMUL * VLEN / SEW
  */
object GenVLMAXLog2 extends VLSUConstants {
  def apply(lmul: UInt, sew: UInt): UInt = lmul + log2Up(VLENB).U - sew
}
object GenVLMAX {
  def apply(lmul: UInt, sew: UInt): UInt = 1.U << GenVLMAXLog2(lmul, sew)
}
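
// Worked example (values for illustration): with lmul = "b001".U (LMUL = 2)
// and sew = "b010".U (SEW = 32), GenVLMAXLog2 = 1 + 4 - 2 = 3, so
// GenVLMAX = 8.U, matching VLMAX = LMUL * VLEN / SEW = 2 * 128 / 32.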

object GenUSWholeRegVL extends VLSUConstants {
  def apply(nfields: UInt, eew: UInt): UInt = {
    LookupTree(eew(1, 0), List(
      "b00".U -> (nfields << (log2Up(VLENB) - 0)),
      "b01".U -> (nfields << (log2Up(VLENB) - 1)),
      "b10".U -> (nfields << (log2Up(VLENB) - 2)),
      "b11".U -> (nfields << (log2Up(VLENB) - 3))
    ))
  }
}
object GenUSWholeEmul extends VLSUConstants {
  def apply(nf: UInt): UInt = {
    LookupTree(nf, List(
      "b000".U -> "b000".U(mulBits.W),
      "b001".U -> "b001".U(mulBits.W),
      "b011".U -> "b010".U(mulBits.W),
      "b111".U -> "b011".U(mulBits.W)
    ))
  }
}

object GenUSMaskRegVL extends VLSUConstants {
  def apply(vl: UInt): UInt = {
    Mux(vl(2,0) === 0.U, (vl >> 3.U), ((vl >> 3.U) + 1.U))
  }
}
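
// Illustrative example (assumed value): a unit-stride mask access with
// vl = 17.U touches ceil(17 / 8) = 3 bytes of the mask register, so
// GenUSMaskRegVL(17.U) = 3.U.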

object GenUopByteMask {
  def apply(flowMask: UInt, alignedType: UInt): UInt = {
    LookupTree(alignedType, List(
      "b000".U -> flowMask,
      "b001".U -> FillInterleaved(2, flowMask),
      "b010".U -> FillInterleaved(4, flowMask),
      "b011".U -> FillInterleaved(8, flowMask),
      "b100".U -> FillInterleaved(16, flowMask)
    ))
  }
}
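
// Illustrative example (assumed values): flowMask = "b0011".U with
// alignedType = "b001".U (16-bit elements) expands each flow bit into two
// byte enables: FillInterleaved(2, "b0011".U) = "b00001111".U.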

object GenVdIdxInField extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, uopIdx: UInt): UInt = {
    val vdIdx = Wire(UInt(log2Up(maxMUL).W))
    when (instType(1,0) === "b00".U || instType(1,0) === "b10".U || lmul.asSInt > emul.asSInt) {
      // unit-stride or strided, or indexed with lmul > emul
      vdIdx := uopIdx
    }.otherwise {
      // indexed with lmul <= emul
      val multiple = emul - lmul
      vdIdx := LookupTree(multiple, List(
        0.U -> uopIdx,
        1.U -> (uopIdx >> 1),
        2.U -> (uopIdx >> 2),
        3.U -> (uopIdx >> 3)
      ))
    }
    vdIdx
  }
}
/**
  * Use start and vl to generate the active-flow mask.
  * mod = true: select elements whose mask bit is 1 (inactive ones are filled with 0)
  * mod = false: select elements whose mask bit is 0 (to be filled with 1)
  */
object GenFlowMask extends VLSUConstants {
  def apply(elementMask: UInt, start: UInt, vl: UInt, mod: Boolean): UInt = {
    val startMask = ~UIntToMask(start, VLEN)
    val vlMask = UIntToMask(vl, VLEN)
    val maskVlStart = vlMask & startMask
    if (mod) {
      elementMask & maskVlStart
    } else {
      (~elementMask).asUInt & maskVlStart
    }
  }
}

object CheckAligned extends VLSUConstants {
  def apply(addr: UInt): UInt = {
    val aligned_16 = (addr(0) === 0.U) // 16-bit
    val aligned_32 = (addr(1,0) === 0.U) // 32-bit
    val aligned_64 = (addr(2,0) === 0.U) // 64-bit
    val aligned_128 = (addr(3,0) === 0.U) // 128-bit
    Cat(true.B, aligned_16, aligned_32, aligned_64, aligned_128)
  }
}
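
// Illustrative example (assumed value): CheckAligned(0x8.U) = "b11110".U;
// the address is 16/32/64-bit aligned but not 128-bit aligned, so bit 0
// (the 128-bit case) is the only bit cleared.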

/**
  * Search whether the mask has 'len' consecutive '1' bits starting from the head.
  * mask: source mask
  * len: search length
  */
object GenPackMask {
  def leadX(mask: Seq[Bool], len: Int): Bool = {
    if (len == 1) {
      mask.head
    } else {
      leadX(mask.drop(1), len - 1) & mask.head
    }
  }
  def leadOneVec(shiftMask: Seq[Bool]): UInt = {
    // the widest flow is 128-bit, so at most 16 8-bit flows can be packed
    val lead1 = leadX(shiftMask, 1) // 1 consecutive bit
    val lead2 = leadX(shiftMask, 2) // 2 consecutive bits
    val lead4 = leadX(shiftMask, 4) // 4 consecutive bits
    val lead8 = leadX(shiftMask, 8) // 8 consecutive bits
    val lead16 = leadX(shiftMask, 16) // 16 consecutive bits
    Cat(lead1, lead2, lead4, lead8, lead16)
  }

  def apply(shiftMask: UInt) = {
    // pack mask
    val packMask = leadOneVec(shiftMask.asBools)
    packMask
  }
}

/**
  * PackEnable = (LeadXVec >> eew) & alignedVec, where the 0th bit represents the ability to merge into
  * a 128-bit flow, the 1st bit a 64-bit flow, and so on down to an 8-bit flow.
  *
  * Example:
  *   addr = 0x0, activeMask = b00011100101111, flowIdx = 0, eew = 0 (8-bit)
  *
  *   step 0: addrAlignedVec = (1, 1, 1, 1), elemIdxAligned = (1, 1, 1, 1)
  *   step 1: activePackVec = (1, 1, 1, 0), inactivePackVec = (0, 0, 0, 0)
  *   step 2: activePackEnable = (1, 1, 1, 0), inactivePackVec = (0, 0, 0, 0)
  *
  *   so four 8-bit active flows can be packed into one 32-bit flow.
  */
object GenPackVec extends VLSUConstants {
  def apply(addr: UInt, shiftMask: UInt, eew: UInt, elemIdx: UInt): UInt = {
    val addrAlignedVec = CheckAligned(addr)
    val elemIdxAligned = CheckAligned(elemIdx)
    val packMask = GenPackMask(shiftMask)
    // generate packVec
    val packVec = addrAlignedVec & elemIdxAligned & (packMask.asUInt >> eew)

    packVec
  }
}

object GenPackAlignedType extends VLSUConstants {
  def apply(packVec: UInt): UInt = {
    val packAlignedType = PriorityMux(Seq(
      packVec(0) -> "b100".U,
      packVec(1) -> "b011".U,
      packVec(2) -> "b010".U,
      packVec(3) -> "b001".U,
      packVec(4) -> "b000".U
    ))
    packAlignedType
  }
}

object GenPackNum extends VLSUConstants {
  def apply(alignedType: UInt, packAlignedType: UInt): UInt = {
    (1.U << (packAlignedType - alignedType)).asUInt
  }
}
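
// Worked example (values for illustration): packing 8-bit flows
// (alignedType = "b000".U) into a 32-bit flow (packAlignedType = "b010".U)
// merges GenPackNum = 1 << (2 - 0) = 4.U flows.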

object genVWmask128 {
  def apply(addr: UInt, sizeEncode: UInt): UInt = {
    (LookupTree(sizeEncode, List(
      "b000".U -> 0x1.U,    // 0001 << addr(3:0)
      "b001".U -> 0x3.U,    // 0011
      "b010".U -> 0xf.U,    // 1111
      "b011".U -> 0xff.U,   // 11111111
      "b100".U -> 0xffff.U  // 1111111111111111
    )) << addr(3, 0)).asUInt
  }
}
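
// Illustrative example (assumed values): genVWmask128(addr, "b010".U) with
// addr(3, 0) = 4.U gives 0xf.U << 4 = "h00f0".U, enabling bytes 4..7 of the
// 16-byte access lane.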
/*
 * only used when the maximum access width is 128 bits
 */
object genVWdata {
  def apply(data: UInt, sizeEncode: UInt): UInt = {
    LookupTree(sizeEncode, List(
      "b000".U -> Fill(16, data(7, 0)),
      "b001".U -> Fill(8, data(15, 0)),
      "b010".U -> Fill(4, data(31, 0)),
      "b011".U -> Fill(2, data(63, 0)),
      "b100".U -> data(127, 0)
    ))
  }
}

object genUSSplitAddr {
  def apply(addr: UInt, index: UInt): UInt = {
    val tmpAddr = Cat(addr(38, 4), 0.U(4.W))
    val nextCacheline = tmpAddr + 16.U
    LookupTree(index, List(
      0.U -> tmpAddr,
      1.U -> nextCacheline
    ))
  }
}

object genUSSplitMask {
  def apply(mask: UInt, index: UInt, addrOffset: UInt): UInt = {
    val tmpMask = Cat(0.U(16.W), mask) << addrOffset // 32 bits
    LookupTree(index, List(
      0.U -> tmpMask(15, 0),
      1.U -> tmpMask(31, 16)
    ))
  }
}

object genUSSplitData {
  def apply(data: UInt, index: UInt, addrOffset: UInt): UInt = {
    val tmpData = WireInit(0.U(256.W))
    val lookupTable = (0 until 16).map{ case i =>
      if (i == 0) {
        i.U -> Cat(0.U(128.W), data)
      } else {
        i.U -> Cat(0.U(((16-i)*8).W), data, 0.U((i*8).W))
      }
    }
    tmpData := LookupTree(addrOffset, lookupTable).asUInt

    LookupTree(index, List(
      0.U -> tmpData(127, 0),
      1.U -> tmpData(255, 128)
    ))
  }
}
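
// Illustrative example (assumed values): with addrOffset = 4.U the 128-bit
// data is shifted left by 4 bytes inside a 256-bit window; index = 0.U
// returns the bytes that stay in the first 16-byte beat and index = 1.U
// returns the bytes that spill into the next one.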

/**
  * Generate the offset within vd for flows; only used for unit-stride.
  * */
object genVdOffset {
  def apply(offset: UInt, index: UInt): UInt = {
    LookupTree(index, List(
      0.U -> 0.U,
      1.U -> ((~offset).asUInt + 1.U)
    ))
  }
}

object GenVSData extends VLSUConstants {
  def apply(data: UInt, elemIdx: UInt, alignedType: UInt): UInt = {
    LookupTree(alignedType, List(
      "b000".U -> ZeroExt(LookupTree(elemIdx(3, 0), List.tabulate(VLEN/8)(i => i.U -> getByte(data, i))), VLEN),
      "b001".U -> ZeroExt(LookupTree(elemIdx(2, 0), List.tabulate(VLEN/16)(i => i.U -> getHalfWord(data, i))), VLEN),
      "b010".U -> ZeroExt(LookupTree(elemIdx(1, 0), List.tabulate(VLEN/32)(i => i.U -> getWord(data, i))), VLEN),
      "b011".U -> ZeroExt(LookupTree(elemIdx(0), List.tabulate(VLEN/64)(i => i.U -> getDoubleWord(data, i))), VLEN),
      "b100".U -> data // if a wider element type existed, this would break
    ))
  }
}