xref: /XiangShan/src/main/scala/xiangshan/mem/vector/VecCommon.scala (revision 7531c765d87d92e3c772b0d55aa810c2041ba3e3)
1/***************************************************************************************
2  * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
3  * Copyright (c) 2020-2021 Peng Cheng Laboratory
4  *
5  * XiangShan is licensed under Mulan PSL v2.
6  * You can use this software according to the terms and conditions of the Mulan PSL v2.
7  * You may obtain a copy of Mulan PSL v2 at:
8  *          http://license.coscl.org.cn/MulanPSL2
9  *
10  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13  *
14  * See the Mulan PSL v2 for more details.
15  ***************************************************************************************/
16
17package xiangshan.mem
18
19import org.chipsalliance.cde.config.Parameters
20import chisel3._
21import chisel3.util._
22import utils._
23import utility._
24import xiangshan._
25import xiangshan.backend.rob.RobPtr
26import xiangshan.backend.Bundles._
27
/**
  * Compile-time constants and bit-slicing helpers shared by the vector load/store unit (VLSU).
  */
trait VLSUConstants {
  val VLEN = 128
  def VLENB: Int = VLEN / 8 // vector register length in bytes
  def vOffsetBits: Int = log2Up(VLENB) // bit-width of a byte offset inside one vector register

  def alignTypes: Int = 4 // eew/sew = 1/2/4/8
  def alignTypeBits: Int = log2Up(alignTypes)
  def maxMUL: Int = 8
  def maxFields: Int = 8

  /**
    * Upper bound on the uops of a single instruction.
    *
    * Worst case is a segment indexed instruction with eew=64, emul=8, sew=8, lmul=1 and nf=8:
    * every data register is paired with 8 index registers and there are 8 data registers
    * (one per field), giving maxMUL * maxFields = 64 uops.
    */
  def maxUopNum: Int = maxMUL * maxFields // 64
  def maxFlowNum: Int = 16
  def maxElemNum: Int = maxMUL * maxFlowNum // 128
  // def uopIdxBits = log2Up(maxUopNum) // to index uop inside an robIdx

  // Each of the widths below carries one bit more than log2Up of its maximum.
  def elemIdxBits: Int = log2Up(maxElemNum) + 1 // indexes an element within an instruction
  def flowIdxBits: Int = log2Up(maxFlowNum) + 1 // indexes a flow within a uop
  def fieldBits: Int = log2Up(maxFields) + 1 // 4 bits, enough to hold 1~8

  def ewBits: Int = 3 // bit-width of EEW/SEW
  def mulBits: Int = 3 // bit-width of emul/lmul

  /**
    * Extracts the `i`-th `alignBits`-wide slice of `data`, counting from the LSB.
    *
    * Fails at elaboration time when `data` is too narrow to contain that slice.
    */
  def getSlice(data: UInt, i: Int, alignBits: Int): UInt = {
    val lo = i * alignBits
    val hi = lo + alignBits - 1
    require(data.getWidth >= hi + 1)
    data(hi, lo)
  }

  // Fixed-width conveniences on top of getSlice; `i` defaults to the lowest slice.
  def getByte(data: UInt, i: Int = 0): UInt = getSlice(data, i, 8)
  def getHalfWord(data: UInt, i: Int = 0): UInt = getSlice(data, i, 16)
  def getWord(data: UInt, i: Int = 0): UInt = getSlice(data, i, 32)
  def getDoubleWord(data: UInt, i: Int = 0): UInt = getSlice(data, i, 64)
}
66
/**
  * VLSU constants bound to the core configuration, plus instruction-type predicates and
  * helpers that merge freshly loaded data into an existing register value.
  */
trait HasVLSUParameters extends HasXSParameter with VLSUConstants {
  // Use the configured VLEN instead of the VLSUConstants default.
  override val VLEN = coreParams.VLEN

  // Predicates on the low bits of the instruction type encoding.
  def isUnitStride(instType: UInt) = instType(1, 0) === "b00".U
  def isStrided(instType: UInt) = instType(1, 0) === "b10".U
  def isIndexed(instType: UInt) = instType(0) === "b1".U
  def isNotIndexed(instType: UInt) = !isIndexed(instType)

  /**
    * Byte-wise merge: byte `i` of the result comes from `newData` when `mask(i)` is set,
    * otherwise from `oldData`.
    *
    * @param oldData data to keep where the mask is clear
    * @param newData data to take where the mask is set; must be as wide as `oldData`
    * @param mask    one enable bit per byte of the data
    * @return the merged bytes, lowest byte first
    */
  def mergeDataWithMask(oldData: UInt, newData: UInt, mask: UInt): Vec[UInt] = {
    require(oldData.getWidth == newData.getWidth)
    val byteCount = mask.getWidth
    require(oldData.getWidth == byteCount * 8)
    val mergedBytes = for (i <- 0 until byteCount) yield {
      Mux(mask(i), getByte(newData, i), getByte(oldData, i))
    }
    VecInit(mergedBytes)
  }

  // def asBytes(data: UInt) = {
  //   require(data.getWidth % 8 == 0)
  //   (0 until data.getWidth/8).map(i => getByte(data, i))
  // }

  /**
    * Element-wise merge of several incoming elements into `oldData`.
    *
    * For every element lane, the lowest slice of each `newData(k)` competes for the lane when
    * `valids(k)` is set and the low bits of `elemIdx(k)` point at that lane; `oldData`'s own
    * slice is the always-enabled first candidate of the posteriority mux.
    * NOTE(review): presumably the highest-indexed enabled candidate wins - confirm against
    * ParallelPosteriorityMux.
    *
    * @param oldData     current register contents
    * @param newData     incoming elements, each held in the low bits of its UInt
    * @param alignedType element size selector: b00/b01/b10/b11 = 8/16/32/64-bit elements
    * @param elemIdx     destination element index of each incoming element
    * @param valids      whether each incoming element takes part in the merge
    */
  def mergeDataWithElemIdx(
    oldData: UInt,
    newData: Seq[UInt],
    alignedType: UInt,
    elemIdx: Seq[UInt],
    valids: Seq[Bool]
  ): UInt = {
    require(newData.length == elemIdx.length)
    require(newData.length == valids.length)

    // Builds the merged value for one element size: `idxBits` low bits of the element index
    // select among 2^idxBits lanes, each `alignBits` wide.
    def mergeAt(idxBits: Int, alignBits: Int): UInt = {
      val hitsPerLane = elemIdx.map(idx => UIntToOH(idx(idxBits - 1, 0)).asBools).transpose
      val lanes = hitsPerLane.zipWithIndex.map { case (hits, lane) =>
        val enables = true.B +: hits.zip(valids).map { case (hit, valid) => hit && valid }
        val candidates = getSlice(oldData, lane, alignBits) +: newData.map(getSlice(_, 0, alignBits))
        ParallelPosteriorityMux(enables, candidates)
      }
      VecInit(lanes).asUInt
    }

    LookupTree(alignedType, List(
      "b00".U -> mergeAt(4, 8),
      "b01".U -> mergeAt(3, 16),
      "b10".U -> mergeAt(2, 32),
      "b11".U -> mergeAt(1, 64)
    ))
  }

  /** Single-element form of the merge above; the element is always treated as valid. */
  def mergeDataWithElemIdx(oldData: UInt, newData: UInt, alignedType: UInt, elemIdx: UInt): UInt = {
    mergeDataWithElemIdx(
      oldData = oldData,
      newData = Seq(newData),
      alignedType = alignedType,
      elemIdx = Seq(elemIdx),
      valids = Seq(true.B)
    )
  }
}
/** Base class of VLSU modules: an XSModule with VLSU parameters and circular-queue pointer helpers. */
abstract class VLSUModule(implicit p: Parameters)
  extends XSModule
  with HasVLSUParameters
  with HasCircularQueuePtrHelper
/** Base class of VLSU bundles: an XSBundle with VLSU parameters mixed in. */
abstract class VLSUBundle(implicit p: Parameters)
  extends XSBundle
  with HasVLSUParameters
129
/** VLSU bundle that additionally carries a `DynInst` micro-op. */
class VLSUBundleWithMicroOp(implicit p: Parameters) extends VLSUBundle {
  val uop = new DynInst
}
133
// NOTE: open question from the original author - where is VecOperand used?
/**
  * Operands of a vector memory instruction.
  *
  * Field declaration order is kept as-is because it defines the bundle layout.
  */
class VecOperand(implicit p: Parameters) extends VLSUBundleWithMicroOp {
  val vmask     = UInt(VLEN.W)      // instruction mask, as read from the register
  val vecData   = UInt(VLEN.W)
  val baseAddr  = UInt(VAddrBits.W) // base address from rs1
  val stride    = UInt(XLEN.W)      // stride from rs2
  val index     = UInt(VLEN.W)      // index from vs2
  val pvd       = UInt(5.W)         // physical vector register destination
  val lmul      = UInt(3.W)
  val sew       = UInt(2.W)
  val vma       = Bool()
  val vta       = Bool()
  val inner_idx = UInt(3.W)         // index of this uop among the (up to 8) uops
  val vl        = UInt(8.W)
  // TODO: How will OOO calculate vector register numbers?
  //  (EEW / SEW) * LMUL or (vl * EEW) / VLEN ?
  //  So OOO will always use eew ?
  // val eew = UInt(3.W)
  val total_num = UInt(3.W)         // how many uops the instruction is split into
}
154
/**
  * Fields decoded straight from the bits of a vector memory instruction.
  */
class VecDecode(implicit p: Parameters) extends VLSUBundle {
  val uop_segment_num = UInt(3.W)
  val uop_type = UInt(2.W)
  val mask_en = Bool()
  val uop_unit_stride_whole_reg = Bool()
  val uop_unit_stride_mask = Bool()
  val uop_unit_stride_fof = Bool()
  val uop_eew = UInt(ewBits.W) // also the index width when the instruction is an indexed load

  /**
    * Drives every field of this bundle from the raw instruction word and returns the bundle.
    *
    * @param inst instruction bits; must span at least bits 31..12
    */
  def apply(inst: UInt) = {
    // Bits 24..20 distinguish the unit-stride variants.
    val lumop = inst(24, 20)

    uop_segment_num := inst(31, 29)
    uop_type := inst(27, 26)
    mask_en := inst(25)
    uop_unit_stride_whole_reg := (lumop === "b01000".U)
    uop_unit_stride_mask := (lumop === "b01011".U)
    uop_unit_stride_fof := (lumop === "b10000".U)
    uop_eew := inst(12 + ewBits - 1, 12)
    this
  }

  // Classification by uop_type; indexed accesses are identified by bit 0 alone.
  def isUnitStride = uop_type === "b00".U
  def isStrided = uop_type === "b10".U
  def isIndexed = uop_type(0) === "b1".U
}
179
/**
  * Vector-only part of a vector load execution result.
  *
  * Field declaration order is kept as-is because it defines the bundle layout.
  */
class OnlyVecExuOutput(implicit p: Parameters) extends VLSUBundle {
  val isvec         = Bool()
  val vecdata       = UInt(VLEN.W)
  val mask          = UInt(VLENB.W)
  // val rob_idx_valid = Vec(2, Bool())
  // val inner_idx = Vec(2, UInt(3.W))
  // val rob_idx = Vec(2, new RobPtr)
  // val offset = Vec(2, UInt(4.W))
  val reg_offset    = UInt(vOffsetBits.W)
  val exp           = Bool()
  val is_first_ele  = Bool()
  val exp_ele_index = UInt(elemIdxBits.W) // element index
  val uopQueuePtr   = new VluopPtr
  val flowPtr       = new VlflowPtr
}
195
/** `MemExuOutput` extended with the vector-specific result fields in `vec`. */
class VecExuOutput(implicit p: Parameters)
  extends MemExuOutput
  with HasVLSUParameters {
  val vec = new OnlyVecExuOutput
}
199
/** `MemExuOutput` for vector stores, extended with an element index and a uop-queue pointer. */
class VecStoreExuOutput(implicit p: Parameters)
  extends MemExuOutput
  with HasVLSUParameters {
  val exp_ele_index = UInt(elemIdxBits.W)
  val uopQueuePtr   = new VsUopPtr
}
204
/**
  * Per-uop state of a vector memory instruction: masks, data, addressing and decode results.
  *
  * Field declaration order is kept as-is because it defines the bundle layout.
  */
class VecUopBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp {
  val flowMask = UInt(VLENB.W) // one bit per flow
  val byteMask = UInt(VLENB.W) // one bit per byte
  val data = UInt(VLEN.W)
  // val fof            = Bool() // fof is only used for vector loads
  val excp_eew_index = UInt(elemIdxBits.W)
  // val exceptionVec   = ExceptionVec() // uop has exceptionVec
  val baseAddr = UInt(VAddrBits.W)
  val stride = UInt(VLEN.W)
  val flow_counter = UInt(flowIdxBits.W)

  // ---- instruction decode result ----
  val flowNum = UInt(flowIdxBits.W) // number of flows in this uop
  // val flowNumLog2 = UInt(log2Up(flowIdxBits).W) // log2(flowNum), for better timing of multiplication
  val nfields = UInt(fieldBits.W) // NFIELDS
  val vm = Bool() // whether vector masking is enabled
  val usWholeReg = Bool() // unit-stride, whole register load
  val usMaskReg = Bool() // unit-stride, masked store/load
  val eew = UInt(ewBits.W) // size of memory elements
  val sew = UInt(ewBits.W)
  val emul = UInt(mulBits.W)
  val lmul = UInt(mulBits.W)
  val vlmax = UInt(elemIdxBits.W)
  val instType = UInt(3.W)
}
230
/**
  * One flow of a vector memory uop.
  *
  * Field declaration order is kept as-is because it defines the bundle layout.
  */
class VecFlowBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp {
  val vaddr = UInt(VAddrBits.W)
  val mask = UInt(VLENB.W)
  val alignedType = UInt(alignTypeBits.W)
  val exp = Bool()
  val flow_idx = UInt(elemIdxBits.W)
  val is_first_ele = Bool()
}
239
/**
  * Maps a 3-bit multiplier encoding (emul or lmul) to a register count.
  * Fractional encodings (1/8, 1/4, 1/2) map to 1, like a multiplier of 1.
  */
object MulNum {
  def apply(mul: UInt): UInt = {
    // 1/8, 1/4, 1/2 and 1 all give 1.
    val singleReg = Seq("b101", "b110", "b111", "b000").map(_.U -> 1.U)
    val multiReg = Seq(
      "b001".U -> 2.U, // 2
      "b010".U -> 4.U, // 4
      "b011".U -> 8.U  // 8
    )
    LookupTree(mul, singleReg ++ multiReg)
  }
}
/**
  * Maps a 3-bit multiplier encoding (emul or lmul) to a byte count.
  *
  * A multiplier of 1 or more means the entire register is written (16 bytes);
  * a fractional multiplier writes only the corresponding part of it.
  */
object MulDataSize {
  def apply(mul: UInt): UInt = {
    val fractional = Seq(
      "b101".U -> 2.U, // 1/8
      "b110".U -> 4.U, // 1/4
      "b111".U -> 8.U  // 1/2
    )
    // 1, 2, 4 and 8 all give a full register.
    val wholeReg = Seq("b000", "b001", "b010", "b011").map(_.U -> 16.U)
    LookupTree(mul, fractional ++ wholeReg)
  }
}
267
/**
  * Maps an eew encoding to a count that halves with each wider element:
  * b000 -> 16, b101 -> 8, b110 -> 4, b111 -> 2.
  */
object OneRegNum {
  def apply(eew: UInt): UInt = {
    val encodings = Seq("b000", "b101", "b110", "b111")
    val table = encodings.zipWithIndex.map { case (enc, step) =>
      enc.U -> (16 >> step).U
    }
    LookupTree(eew, table)
  }
}
277
/**
  * Bytes read per element by an indexed instruction: sew encodings 0..3 map to 1/2/4/8.
  */
object SewDataSize {
  def apply(sew: UInt): UInt = {
    val table = (0 to 3).map(log2Bytes => log2Bytes.U -> (1 << log2Bytes).U)
    LookupTree(sew, table)
  }
}
288
/**
  * Bytes read per element by a strided instruction: eew encodings
  * b000/b101/b110/b111 map to 1/2/4/8.
  */
object EewDataSize {
  def apply(eew: UInt): UInt = {
    val encodings = Seq("b000", "b101", "b110", "b111")
    val table = encodings.zipWithIndex.map { case (enc, log2Bytes) =>
      enc.U -> (1 << log2Bytes).U
    }
    LookupTree(eew, table)
  }
}
299
/**
  * Data size of a vector load, selected by instruction type: unit-stride uses the emul-based
  * size, strided and segment non-indexed types use the eew-based size, and every indexed
  * type uses the sew-based size.
  */
object loadDataSize {
  def apply(instType: UInt, emul: UInt, eew: UInt, sew: UInt): UInt = {
    val byEmul = MulDataSize(emul)
    val byEew = EewDataSize(eew)
    val bySew = SewDataSize(sew)
    val table = Seq(
      "b000".U -> byEmul, // unit-stride
      "b010".U -> byEew,  // strided
      "b001".U -> bySew,  // indexed-unordered
      "b011".U -> bySew,  // indexed-ordered
      "b100".U -> byEew,  // segment unit-stride
      "b110".U -> byEew,  // segment strided
      "b101".U -> bySew,  // segment indexed-unordered
      "b111".U -> bySew   // segment indexed-ordered
    )
    LookupTree(instType, table)
  }
}
313
314// object GenVecLoadMask extends VLSUConstants {
315//   def apply(alignedType: UInt, vaddr: UInt): UInt = {
316//     LookupTree(alignedType, List(
317//       "b00".U -> 0x1.U, // b1
318//       "b01".U -> 0x3.U, // b11
319//       "b10".U -> 0xf.U, // b1111
320//       "b11".U -> 0xff.U // b11111111
321//     )) << vaddr(vOffsetBits - 1, 0)
322//   }
323// }
324
/**
  * Data size of a vector store, selected by instruction type: non-indexed types use the
  * eew-based size and indexed types use the sew-based size.
  */
object storeDataSize {
  def apply(instType: UInt, eew: UInt, sew: UInt): UInt = {
    val byEew = EewDataSize(eew)
    val bySew = SewDataSize(sew)
    val table = Seq(
      "b000".U -> byEew, // unit-stride, do not use
      "b010".U -> byEew, // strided
      "b001".U -> bySew, // indexed-unordered
      "b011".U -> bySew, // indexed-ordered
      "b100".U -> byEew, // segment unit-stride
      "b110".U -> byEew, // segment strided
      "b101".U -> bySew, // segment indexed-unordered
      "b111".U -> bySew  // segment indexed-ordered
    )
    LookupTree(instType, table)
  }
}
338
/**
  * 16-bit store mask with the low `storeDataSize` bits set.
  */
object GenVecStoreMask {
  def apply(instType: UInt, eew: UInt, sew: UInt): UInt = {
    val elemBytes = storeDataSize(instType = instType, eew = eew, sew = sew)
    val mask = Wire(UInt(16.W))
    // (1 << n) - 1 sets exactly the low n bits.
    mask := UIntToOH(elemBytes) - 1.U
    mask
  }
}
346
/**
  * EewEq8/16/32/64 pick one index element out of a 128-bit index register; they are used to
  * obtain the immediate address of an indexed instruction.
  *
  * EewEq8 selects the `flow_inner_idx`-th byte (0..15).
  */
object EewEq8 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    val elemBits = 8
    val table = (0 until 16).map { i =>
      i.U -> index(elemBits * i + elemBits - 1, elemBits * i)
    }
    LookupTree(flow_inner_idx, table)
  }
}
370
/** Selects the `flow_inner_idx`-th 16-bit element (0..7) of a 128-bit index register. */
object EewEq16 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    val elemBits = 16
    val table = (0 until 8).map { i =>
      i.U -> index(elemBits * i + elemBits - 1, elemBits * i)
    }
    LookupTree(flow_inner_idx, table)
  }
}
384
/** Selects the `flow_inner_idx`-th 32-bit element (0..3) of a 128-bit index register. */
object EewEq32 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    val elemBits = 32
    val table = (0 until 4).map { i =>
      i.U -> index(elemBits * i + elemBits - 1, elemBits * i)
    }
    LookupTree(flow_inner_idx, table)
  }
}
394
/** Selects the `flow_inner_idx`-th 64-bit element (0..1) of a 128-bit index register. */
object EewEq64 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    val elemBits = 64
    val table = (0 until 2).map { i =>
      i.U -> index(elemBits * i + elemBits - 1, elemBits * i)
    }
    LookupTree(flow_inner_idx, table)
  }
}
402
/**
  * Extracts the index element addressed by `flow_inner_idx`, with the element width chosen
  * by the eew encoding.
  */
object IndexAddr {
  def apply(index: UInt, flow_inner_idx: UInt, eew: UInt): UInt = {
    // TODO: index maybe cross register
    val byWidth = Seq(
      "b000".U -> EewEq8(index = index, flow_inner_idx = flow_inner_idx),  // Imm is 1 Byte
      "b101".U -> EewEq16(index = index, flow_inner_idx = flow_inner_idx), // Imm is 2 Byte
      "b110".U -> EewEq32(index = index, flow_inner_idx = flow_inner_idx), // Imm is 4 Byte
      "b111".U -> EewEq64(index = index, flow_inner_idx = flow_inner_idx)  // Imm is 8 Byte
    )
    LookupTree(eew, byWidth)
  }
}
412/*
413object RegFLowCnt {
414  def apply (emul: UInt, lmul:UInt, eew: UInt, uopIdx: UInt, flowIdx: UInt): UInt = {
415
416    (LookupTree(Cat(emul,lmul),List(
417      "b001000".U -> ((uopIdx(0  ) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),//emul = 2,lmul = 1
418      "b010000".U -> ((uopIdx(1,0) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),//emul = 4,lmul = 1
419      "b011000".U -> ((uopIdx(2,0) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),//emul = 8,lmul = 1
420      "b010001".U -> ((uopIdx(0  ) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),//emul = 4,lmul = 2
421      "b011001".U -> ((uopIdx(1,0) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx),//emul = 8,lmul = 2
422      "b011010".U -> ((uopIdx(0  ) << Log2Num((16.U >> eew(1,0)).asUInt)).asUInt + flowIdx) //emul = 8,lmul = 4
423    )))}
424}
425
426object AddrFLowCnt {
427  def apply (emul: UInt, lmul:UInt, sew:UInt, uopIdx: UInt, flowIdx: UInt):UInt = {
428    (LookupTree(Cat(lmul,emul),List(
429      "b001000".U -> ((uopIdx(0  ) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx),//lmul = 2, emul = 1
430      "b010000".U -> ((uopIdx(1,0) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx),//lmul = 4, emul = 1
431      "b011000".U -> ((uopIdx(2,0) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx),//lmul = 8, emul = 1
432      "b010001".U -> ((uopIdx(0  ) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx),//lmul = 4, emul = 2
433      "b011001".U -> ((uopIdx(1,0) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx),//lmul = 8, emul = 2
434      "b011011".U -> ((uopIdx(0  ) << Log2Num((MulDataSize(lmul) >> sew(1,0)).asUInt)).asUInt + flowIdx) //lmul = 8, emul = 4
435    )))}
436}
437*/
438
/**
  * Flow counter derived from the ratio emulNum / lmulNum: for a ratio of 2, 4 or 8 the low
  * 1, 2 or 3 bits of `uopIdx` are shifted left by log2(16 >> eew(1,0)) and added to `flowIdx`.
  *
  * A ratio of 1 has no table entry (it is commented out), so the result for it is whatever
  * LookupTree produces when no key matches.
  */
object RegFLowCnt {
  def apply(emulNum: UInt, lmulNum: UInt, eew: UInt, uopIdx: UInt, flowIdx: UInt): UInt = {
    val shift = Log2Num((16.U >> eew(1, 0)).asUInt)
    //"d1".U -> flowIdx,
    val table = (1 to 3).map { uopBits =>
      (1 << uopBits).U -> ((uopIdx(uopBits - 1, 0) << shift).asUInt + flowIdx)
    }
    LookupTree(emulNum / lmulNum, table)
  }
}
448
/**
  * Flow counter derived from the ratio lmulNum / emulNum: a ratio of 1 yields `flowIdx`
  * unchanged; for 2, 4 or 8 the low 1, 2 or 3 bits of `uopIdx` are shifted left by
  * log2(16 >> sew(1,0)) and added to `flowIdx`.
  */
object AddrFLowCnt {
  def apply(emulNum: UInt, lmulNum: UInt, sew: UInt, uopIdx: UInt, flowIdx: UInt): UInt = {
    val shift = Log2Num((16.U >> sew(1, 0)).asUInt)
    val scaled = (1 to 3).map { uopBits =>
      (1 << uopBits).U -> ((uopIdx(uopBits - 1, 0) << shift).asUInt + flowIdx)
    }
    LookupTree(lmulNum / emulNum, ("d1".U -> flowIdx) +: scaled)
  }
}
458
459
/** Table-based log2 for the powers of two 1, 2, 4, 8 and 16. */
object Log2Num {
  def apply(num: UInt): UInt = {
    // Listed from 16 down to 1, i.e. exponents 4 down to 0.
    val table = (0 to 4).reverse.map(exp => (1 << exp).U -> exp.U)
    LookupTree(num, table)
  }
}
470
/**
  * Field (nf) index of a segment uop.
  *
  * When the multiplier is 1 or fractional, the field index is `uopIdx` itself; for a
  * multiplier of 2 it is `uopIdx(2,1)`, for 4 it is `uopIdx(2)`, and for 8 it is 0.
  */
object GenSegNfIdx {
  def apply(mul: UInt, uopIdx: UInt): UInt = { // mul means lmul or emul
    // 1/8, 1/4, 1/2 and 1: one uop per field.
    val onePerField = Seq("b101", "b110", "b111", "b000").map(_.U -> uopIdx)
    val severalPerField: Seq[(UInt, UInt)] = Seq(
      "b001".U -> uopIdx(2, 1), // 2
      "b010".U -> uopIdx(2),    // 4
      "b011".U -> 0.U           // 8
    )
    LookupTree(mul, onePerField ++ severalPerField)
  }
}
486
/**
  * Field index of a segment uop when emul is larger than lmul.
  *
  * In every listed row the result is `uopIdx(5, n)` with n = log2(emul) + log2(lmul).
  * Combinations that are not listed have no table entry.
  */
object GenSegNfIdxMul {
  def apply(emul: UInt, lmul: UInt, uopIdx: UInt): UInt = {
    val key = Cat(emul, lmul)
    val table = Seq(
      "b001000".U -> uopIdx(5, 1), // emul = 2, lmul = 1
      "b010000".U -> uopIdx(5, 2), // emul = 4, lmul = 1
      "b011000".U -> uopIdx(5, 3), // emul = 8, lmul = 1
      "b010001".U -> uopIdx(5, 3), // emul = 4, lmul = 2
      "b011001".U -> uopIdx(5, 4), // emul = 8, lmul = 2
      "b011010".U -> uopIdx(5, 5)  // emul = 8, lmul = 4
    )
    LookupTree(key, table)
  }
}
498
/**
  * Register index of a segment uop inside its field.
  *
  * When the multiplier is 1 or fractional there is only one register per field, so the
  * index is 0; for a multiplier of 2 it is `uopIdx(0)`, for 4 `uopIdx(1,0)`, for 8 `uopIdx(2,0)`.
  */
object GenSegMulIdx {
  def apply(mul: UInt, uopIdx: UInt): UInt = { // mul means emul or lmul
    // 1/8, 1/4, 1/2 and 1: a single register per field.
    val singleReg = Seq("b101", "b110", "b111", "b000").map(_.U -> 0.U)
    val multiReg: Seq[(UInt, UInt)] = Seq(
      "b001".U -> uopIdx(0),    // 2
      "b010".U -> uopIdx(1, 0), // 4
      "b011".U -> uopIdx(2, 0)  // 8
    )
    LookupTree(mul, singleReg ++ multiReg)
  }
}
514
/**
  * Decodes an eew encoding to log2 of the element size in bytes.
  *
  * The encodings b000/b101/b110/b111 stand for 1/2/4/8 bytes, so the low two bits already
  * are the log2 value; they are zero-extended to `ewBits`.
  */
object EewLog2 extends VLSUConstants {
  def apply(eew: UInt): UInt = {
    val log2Bytes = eew(1, 0)
    ZeroExt(log2Bytes, ewBits)
  }
}
526
/**
  * Number of flows of a uop.
  *
  * Non-indexed types use MulDataSize(emul) >> eew(1,0): the bytes to be written to the
  * register divided by the element size. Indexed types use that value when emul > lmul
  * (signed comparison), otherwise MulDataSize(lmul) >> sew(1,0).
  * The unit-stride entry is used by stores only; loads do not use it.
  */
object GenRealFlowNum {
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt): UInt = {
    val byEew = (MulDataSize(emul) >> eew(1, 0)).asUInt
    val bySew = (MulDataSize(lmul) >> sew(1, 0)).asUInt
    val indexed = Mux(emul.asSInt > lmul.asSInt, byEew, bySew)
    val table = Seq(
      "b000".U -> byEew,   // store use, load do not use
      "b010".U -> byEew,   // strided
      "b001".U -> indexed, // indexed-unordered
      "b011".U -> indexed, // indexed-ordered
      "b100".U -> byEew,   // segment unit-stride
      "b110".U -> byEew,   // segment strided
      "b101".U -> indexed, // segment indexed-unordered
      "b111".U -> indexed  // segment indexed-ordered
    )
    LookupTree(instType, table)
  }
}
545
/**
  * GenRealFlowLog2 = Log2(GenRealFlowNum).
  *
  * Computed as mulTerm + log2(VLENB) - ew(1,0), where mulTerm is the multiplier encoding
  * when it is negative (fractional) and 0 otherwise.
  */
object GenRealFlowLog2 extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt): UInt = {
    val regBytesLog2 = log2Up(VLENB).U
    // Only a fractional multiplier reduces the flow count.
    val emulTerm = Mux(emul.asSInt < 0.S, emul, 0.U)
    val lmulTerm = Mux(lmul.asSInt < 0.S, lmul, 0.U)
    val byEew = emulTerm + regBytesLog2 - eew(1, 0)
    val bySew = lmulTerm + regBytesLog2 - sew(1, 0)
    val indexed = Mux(emul.asSInt > lmul.asSInt, byEew, bySew)
    val table = Seq(
      "b000".U -> byEew,   // unit-stride
      "b010".U -> byEew,   // strided
      "b001".U -> indexed, // indexed-unordered
      "b011".U -> indexed, // indexed-ordered
      "b100".U -> byEew,   // segment unit-stride
      "b110".U -> byEew,   // segment strided
      "b101".U -> indexed, // segment indexed-unordered
      "b111".U -> indexed  // segment indexed-ordered
    )
    LookupTree(instType, table)
  }
}
567
/**
  * GenElemIdx generates an element index within an instruction from a uop index and the
  * flow index inside that uop. The result is the low log2Up(maxElemNum) bits of:
  *
  * alignedType = 0: uopIdx ## flowIdx(3, 0)
  * alignedType = 1: uopIdx ## flowIdx(2, 0)
  * alignedType = 2: uopIdx ## flowIdx(1, 0)
  * alignedType = 3: uopIdx ## flowIdx(0)
  */
object GenElemIdx extends VLSUConstants {
  def apply(alignedType: UInt, uopIdx: UInt, flowIdx: UInt): UInt = {
    val regOffsetBits = log2Up(VLENB)
    val resultBits = log2Up(maxElemNum)
    val table = for (i <- 0 until alignTypes) yield {
      // Wider elements leave fewer flows per register, hence fewer flowIdx bits.
      val flowPart = flowIdx(regOffsetBits - i - 1, 0)
      i.U -> ((uopIdx ## flowPart)(resultBits - 1, 0))
    }
    LookupTree(alignedType, table)
  }
}
587
/**
  * GenVLMAXLog2 computes lmul + log2(VLENB) - sew.
  * NOTE(review): presumably `lmul` and `sew` are log2 encodings, which would make this
  * log2(VLMAX) = log2(LMUL * VLENB / SEW-in-bytes) - confirm against callers.
  */
object GenVLMAXLog2 extends VLSUConstants {
  def apply(lmul: UInt, sew: UInt): UInt = {
    val regBytesLog2 = log2Up(VLENB).U
    lmul + regBytesLog2 - sew
  }
}
/** VLMAX as a power of two: 1 shifted left by GenVLMAXLog2(lmul, sew). */
object GenVLMAX {
  def apply(lmul: UInt, sew: UInt): UInt = {
    val vlmaxLog2 = GenVLMAXLog2(lmul, sew)
    1.U << vlmaxLog2
  }
}
597
/**
  * Element count of a unit-stride whole-register access: `nfields` shifted left by
  * log2(VLENB) - eew(1,0), i.e. nfields times the elements that fit in one register.
  */
object GenUSWholeRegVL extends VLSUConstants {
  def apply(nfields: UInt, eew: UInt): UInt = {
    val regBytesLog2 = log2Up(VLENB)
    val table = (0 until 4).map { elemLog2 =>
      elemLog2.U -> (nfields << (regBytesLog2 - elemLog2))
    }
    LookupTree(eew(1, 0), table)
  }
}
/**
  * Effective vector length, in bytes, of a unit-stride mask load/store.
  *
  * Mask accesses transfer one bit per element, so `vl` elements need ceil(vl / 8) bytes.
  * A plain `vl >> 3` rounds down and drops the last, partially used byte whenever `vl` is
  * not a multiple of 8; the result is therefore rounded up instead.
  *
  * @param vl number of elements (mask bits) requested
  * @return ceil(vl / 8), with the same width as `vl`
  */
object GenUSMaskRegVL extends VLSUConstants {
  def apply(vl: UInt): UInt = {
    val fullBytes = (vl >> 3.U).asUInt
    // Any of the low three bits set means a trailing byte that is only partly filled.
    val hasPartialByte = (vl & 7.U).orR
    // fullBytes is at most 2^(w-3) - 1, so the increment cannot overflow the width of vl.
    Mux(hasPartialByte, fullBytes + 1.U, fullBytes)
  }
}
613
/**
  * Expands a per-flow mask into a per-byte mask: every flow bit is repeated 1, 2, 4 or 8
  * times for alignedType b00, b01, b10 or b11.
  */
object GenUopByteMask {
  def apply(flowMask: UInt, alignedType: UInt): UInt = {
    // Byte-sized flows already are a byte mask; wider flows replicate each bit.
    val widened = (1 to 3).map { elemLog2 =>
      elemLog2.U -> FillInterleaved(1 << elemLog2, flowMask)
    }
    LookupTree(alignedType, ("b00".U -> flowMask) +: widened)
  }
}
624
/**
  * Byte mask covering the one element addressed by `elemIdx` inside a 16-byte register:
  * the low bits of `elemIdx` are one-hot decoded and each bit is widened to the element size.
  */
object GenFlowMaskInsideReg extends VLSUConstants {
  def apply(alignedType: UInt, elemIdx: UInt): UInt = {
    val byteElems = "b00".U -> UIntToOH(elemIdx(3, 0))
    // Each doubling of the element size drops one index bit and doubles the bit replication.
    val widerElems = (1 to 3).map { elemLog2 =>
      elemLog2.U -> FillInterleaved(1 << elemLog2, UIntToOH(elemIdx(3 - elemLog2, 0)))
    }
    LookupTree(alignedType, byteElems +: widerElems)
  }
}
635
// TODO: delete this in vs flow queue
/**
  * 7-bit element index: `uopIdx` shifted left by log2 of the flows per uop, plus `flowIdx`.
  *
  * The flows per uop come from emul/eew for unit-stride and strided types (instType(1,0)
  * equal to b00 or b10) and whenever emul > lmul (signed); otherwise from lmul/sew.
  */
object GenEleIdx {
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt, uopIdx: UInt, flowIdx: UInt): UInt = {
    val accessKind = instType(1, 0)
    val useEew = (accessKind === "b00".U) || (accessKind === "b10".U) || (emul.asSInt > lmul.asSInt)
    val eewShift = Log2Num((MulDataSize(emul) >> eew(1, 0)).asUInt)
    val sewShift = Log2Num((MulDataSize(lmul) >> sew(1, 0)).asUInt)

    val eleIdx = Wire(UInt(7.W))
    eleIdx := (uopIdx << Mux(useEew, eewShift, sewShift)).asUInt + flowIdx
    eleIdx
  }
}
647}