xref: /XiangShan/src/main/scala/xiangshan/mem/vector/VecCommon.scala (revision f7063a43ab34da917ba6c670d21871314340c550)
1/***************************************************************************************
2  * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
3  * Copyright (c) 2020-2021 Peng Cheng Laboratory
4  *
5  * XiangShan is licensed under Mulan PSL v2.
6  * You can use this software according to the terms and conditions of the Mulan PSL v2.
7  * You may obtain a copy of Mulan PSL v2 at:
8  *          http://license.coscl.org.cn/MulanPSL2
9  *
10  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13  *
14  * See the Mulan PSL v2 for more details.
15  ***************************************************************************************/
16
17package xiangshan.mem
18
19import org.chipsalliance.cde.config.Parameters
20import chisel3._
21import chisel3.util._
22import utils._
23import utility._
24import xiangshan._
25import xiangshan.backend.rob.RobPtr
26import xiangshan.backend.Bundles._
27
/**
  * Constants and bit-slicing helpers shared across the vector load/store unit (VLSU).
  */
trait VLSUConstants {
  // Vector register width in bits (default; HasVLSUParameters overrides it)
  val VLEN = 128
  // Used when packing unit-stride flows; legal values are 1/2/4/8
  val AlignedNum = 4
  // Vector register width in bytes
  def VLENB = VLEN/8
  // Bit-width needed to index a byte offset inside one vector register
  def vOffsetBits = log2Up(VLENB)

  // Number of element sizes: eew/sew = 1/2/4/8 bytes
  def alignTypes = 4
  def alignTypeBits = log2Up(alignTypes)
  def maxMUL = 8
  def maxFields = 8
  /**
    * Worst case is a segment indexed instruction with eew=64, emul=8, sew=8, lmul=1 and nf=8:
    * every data register is paired with 8 index registers and there are 8 data registers
    * (one per field), so a single instruction splits into at most 64 uops.
    */
  def maxUopNum = maxMUL * maxFields // 64
  def maxFlowNum = 16
  def maxElemNum = maxMUL * maxFlowNum // 128
  // def uopIdxBits = log2Up(maxUopNum) // to index uop inside an robIdx
  // Index widths carry one extra bit on top of log2Up(max)
  def elemIdxBits = log2Up(maxElemNum) + 1 // indexes an element inside an instruction
  def flowIdxBits = log2Up(maxFlowNum) + 1 // indexes a flow inside a uop
  def fieldBits = log2Up(maxFields) + 1 // 4 bits, enough to express 1~8

  def ewBits = 3 // bit-width of an EEW/SEW encoding
  def mulBits = 3 // bit-width of an emul/lmul encoding

  /**
    * Extracts the i-th `alignBits`-wide slice of `data`, counting from the LSB.
    *
    * @param data      source bits; its width must cover slice `i` completely
    * @param i         slice index (elaboration-time constant)
    * @param alignBits slice width in bits
    */
  def getSlice(data: UInt, i: Int, alignBits: Int): UInt = {
    val lowBit = i * alignBits
    val highBit = lowBit + alignBits - 1
    require(data.getWidth > highBit)
    data(highBit, lowBit)
  }

  // Fixed-width conveniences on top of getSlice; `i` defaults to the lowest slice
  def getByte(data: UInt, i: Int = 0) = getSlice(data, i, 8)
  def getHalfWord(data: UInt, i: Int = 0) = getSlice(data, i, 16)
  def getWord(data: UInt, i: Int = 0) = getSlice(data, i, 32)
  def getDoubleWord(data: UInt, i: Int = 0) = getSlice(data, i, 64)
}
68
/**
  * VLSU parameters bound to the core configuration, together with instruction-type
  * predicates and data-merging helpers.
  */
trait HasVLSUParameters extends HasXSParameter with VLSUConstants {
  override val VLEN = coreParams.VLEN

  // instType predicates: bits (1, 0) encode the addressing mode, bit 2 flags a segment access
  def isUnitStride(instType: UInt) = instType(1, 0) === "b00".U
  def isStrided(instType: UInt) = instType(1, 0) === "b10".U
  def isIndexed(instType: UInt) = instType(0) === "b1".U
  def isNotIndexed(instType: UInt) = instType(0) === "b0".U
  def isSegment(instType: UInt) = instType(2) === "b1".U

  /**
    * Byte-wise merge: byte i of the result comes from `newData` when `mask(i)` is set,
    * otherwise from `oldData`.
    *
    * Both data operands must have the same width, equal to 8 bits per mask bit.
    */
  def mergeDataWithMask(oldData: UInt, newData: UInt, mask: UInt): Vec[UInt] = {
    require(oldData.getWidth == newData.getWidth)
    require(oldData.getWidth == mask.getWidth * 8)
    val mergedBytes = Seq.tabulate(mask.getWidth) { byteIdx =>
      Mux(mask(byteIdx), getByte(newData, byteIdx), getByte(oldData, byteIdx))
    }
    VecInit(mergedBytes)
  }

  // def asBytes(data: UInt) = {
  //   require(data.getWidth % 8 == 0)
  //   (0 until data.getWidth/8).map(i => getByte(data, i))
  // }

  /**
    * Writes elements into `oldData` at the slots named by `elemIdx`.
    *
    * For every slot, the lowest element of each valid `newData` entry whose (truncated)
    * `elemIdx` addresses that slot is a candidate; the candidates are resolved through
    * ParallelPosteriorityMux with the old slot contents as the always-selected first entry.
    *
    * @param oldData     current contents
    * @param newData     sources; only slice 0 of each entry is used
    * @param alignedType element size selector: b00/b01/b10/b11 = 8/16/32/64-bit
    * @param elemIdx     destination slot of each source (low bits only are decoded)
    * @param valids      per-source enable
    */
  def mergeDataWithElemIdx(
    oldData: UInt,
    newData: Seq[UInt],
    alignedType: UInt,
    elemIdx: Seq[UInt],
    valids: Seq[Bool]
  ): UInt = {
    require(newData.length == elemIdx.length)
    require(newData.length == valids.length)

    // Builds the merge network for one element size: `elemBits` wide elements,
    // addressed by the low `idxBits` bits of each elemIdx.
    def mergeAtWidth(elemBits: Int, idxBits: Int): UInt = {
      val slotHits = elemIdx.map(idx => UIntToOH(idx(idxBits - 1, 0)).asBools).transpose
      val mergedSlots = slotHits.zipWithIndex.map { case (hits, slot) =>
        val takeNew = hits.zip(valids).map { case (hit, valid) => hit && valid }
        val candidates = newData.map(getSlice(_, 0, elemBits))
        ParallelPosteriorityMux(
          true.B +: takeNew,
          getSlice(oldData, slot, elemBits) +: candidates
        )
      }
      VecInit(mergedSlots).asUInt
    }

    LookupTree(alignedType, List(
      "b00".U -> mergeAtWidth(elemBits = 8,  idxBits = 4),
      "b01".U -> mergeAtWidth(elemBits = 16, idxBits = 3),
      "b10".U -> mergeAtWidth(elemBits = 32, idxBits = 2),
      "b11".U -> mergeAtWidth(elemBits = 64, idxBits = 1)
    ))
  }

  // Single-source variant: the one element is always treated as valid
  def mergeDataWithElemIdx(oldData: UInt, newData: UInt, alignedType: UInt, elemIdx: UInt): UInt = {
    mergeDataWithElemIdx(oldData, Seq(newData), alignedType, Seq(elemIdx), Seq(true.B))
  }
}
// Base class for VLSU modules: an XSModule with the VLSU parameters/helpers
// and the circular-queue pointer helper mixed in.
127abstract class VLSUModule(implicit p: Parameters) extends XSModule
128  with HasVLSUParameters
129  with HasCircularQueuePtrHelper
// Base class for VLSU bundles: an XSBundle with the VLSU parameters/helpers mixed in.
130abstract class VLSUBundle(implicit p: Parameters) extends XSBundle
131  with HasVLSUParameters
132
// VLSU bundle that carries a DynInst micro-op in its `uop` field.
133class VLSUBundleWithMicroOp(implicit p: Parameters) extends VLSUBundle {
134  val uop = new DynInst
135}
136
// Vector-only part of an execution-unit output (embedded as `vec` in VecExuOutput).
137class OnlyVecExuOutput(implicit p: Parameters) extends VLSUBundle {
138  val isvec = Bool()
139  val vecdata = UInt(VLEN.W) // one vector register wide
140  val mask = UInt(VLENB.W) // VLENB bits wide; presumably one bit per byte of vecdata - confirm
141  // val rob_idx_valid = Vec(2, Bool())
142  // val inner_idx = Vec(2, UInt(3.W))
143  // val rob_idx = Vec(2, new RobPtr)
144  // val offset = Vec(2, UInt(4.W))
145  val reg_offset = UInt(vOffsetBits.W) // wide enough to index a byte offset inside a vector reg
146  val vecActive = Bool() // 1: vector active element, 0: vector not active element
147  val is_first_ele = Bool()
148  val elemIdx = UInt(elemIdxBits.W) // element index
149  val elemIdxInsideVd = UInt(elemIdxBits.W) // element index in scope of vd
150  val uopQueuePtr = new VluopPtr
151  val flowPtr = new VlflowPtr
152}
153
// MemExuOutput extended with the vector-only fields (`vec`) and flow-packing information.
154class VecExuOutput(implicit p: Parameters) extends MemExuOutput with HasVLSUParameters {
155  val vec = new OnlyVecExuOutput
156    // pack
157  val isPackage         = Bool()
158  val packageNum        = UInt(log2Up(VLENB).W)
159  val originAlignedType = UInt(alignTypeBits.W)
160  val alignedType       = UInt(alignTypeBits.W)
161}
162
// MemExuOutput extended with vector-store bookkeeping fields and flow-packing information.
163class VecStoreExuOutput(implicit p: Parameters) extends MemExuOutput with HasVLSUParameters {
164  val elemIdx = UInt(elemIdxBits.W)
165  val uopQueuePtr = new VsUopPtr
166  val fieldIdx = UInt(fieldBits.W)
167  val segmentIdx = UInt(elemIdxBits.W)
168  val vaddr = UInt(VAddrBits.W)
169  // pack
170  val isPackage         = Bool()
171  val packageNum        = UInt(log2Up(VLENB).W)
172  val originAlignedType = UInt(alignTypeBits.W)
173  val alignedType       = UInt(alignTypeBits.W)
174}
175
// Per-uop state of a vector memory instruction: masks, data, address operands
// and the decoded instruction attributes.
176class VecUopBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp {
177  val flowMask       = UInt(VLENB.W) // each bit for a flow
178  val byteMask       = UInt(VLENB.W) // each bit for a byte
179  val data           = UInt(VLEN.W)
180  // val fof            = Bool() // fof is only used for vector loads
181  val excp_eew_index = UInt(elemIdxBits.W)
182  // val exceptionVec   = ExceptionVec() // uop has exceptionVec
183  val baseAddr = UInt(VAddrBits.W)
184  val stride = UInt(VLEN.W)
185  val flow_counter = UInt(flowIdxBits.W)
186
187  // instruction decode result
188  val flowNum = UInt(flowIdxBits.W) // # of flows in a uop
189  // val flowNumLog2 = UInt(log2Up(flowIdxBits).W) // log2(flowNum), for better timing of multiplication
190  val nfields = UInt(fieldBits.W) // NFIELDS
191  val vm = Bool() // whether vector masking is enabled
192  val usWholeReg = Bool() // unit-stride, whole register load
193  val usMaskReg = Bool() // unit-stride, masked store/load
194  val eew = UInt(ewBits.W) // size of memory elements
195  val sew = UInt(ewBits.W)
196  val emul = UInt(mulBits.W)
197  val lmul = UInt(mulBits.W)
198  val vlmax = UInt(elemIdxBits.W)
199  val instType = UInt(3.W) // 3-bit type; see the isUnitStride/isStrided/isIndexed/isSegment predicates
200  val vd_last_uop = Bool()
201  val vd_first_uop = Bool()
202}
203
// Per-flow state of a vector memory access: address, mask, element size/index
// and flow-packing information.
204class VecFlowBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp {
205  val vaddr             = UInt(VAddrBits.W)
206  val mask              = UInt(VLENB.W)
207  val alignedType       = UInt(alignTypeBits.W)
208  val vecActive         = Bool()
209  val elemIdx           = UInt(elemIdxBits.W)
210  val is_first_ele      = Bool()
211
212  // pack
213  val isPackage         = Bool()
214  val packageNum        = UInt(log2Up(VLENB).W)
215  val originAlignedType = UInt(alignTypeBits.W)
216}
217
/**
  * Decodes a 3-bit emul/lmul encoding into a count: the fractional multipliers
  * (1/8, 1/4, 1/2) and 1 all give 1; 2, 4 and 8 give themselves.
  */
object MulNum {
  def apply (mul: UInt): UInt = { // mul is an emul or lmul encoding
    val countByMul = List(
      "b101".U -> 1.U, // 1/8
      "b110".U -> 1.U, // 1/4
      "b111".U -> 1.U, // 1/2
      "b000".U -> 1.U, // 1
      "b001".U -> 2.U, // 2
      "b010".U -> 4.U, // 4
      "b011".U -> 8.U  // 8
    )
    LookupTree(mul, countByMul)
  }
}
/**
  * Number of bytes to write for a given emul/lmul encoding.
  *
  * A multiplier greater than or equal to 1 means the entire register (16 bytes) is written;
  * a fractional multiplier only writes the corresponding fraction of it.
  */
object MulDataSize {
  def apply (mul: UInt): UInt = { // mul is an emul or lmul encoding
    val fractionalSizes = List(
      "b101".U -> 2.U, // 1/8
      "b110".U -> 4.U, // 1/4
      "b111".U -> 8.U  // 1/2
    )
    // mul = 1, 2, 4, 8: the whole register
    val wholeRegSizes = List("b000", "b001", "b010", "b011").map(enc => enc.U -> 16.U)
    LookupTree(mul, fractionalSizes ++ wholeRegSizes)
  }
}
245
/**
  * Maps an eew encoding (element size of 1/2/4/8 bytes) to 16/8/4/2,
  * i.e. 16 bytes divided by the element size.
  */
object OneRegNum {
  def apply (eew: UInt): UInt = {
    val countByEew = List(
      "b000".U -> 16.U, // 1-byte elements
      "b101".U -> 8.U,  // 2-byte elements
      "b110".U -> 4.U,  // 4-byte elements
      "b111".U -> 2.U   // 8-byte elements
    )
    LookupTree(eew, countByEew)
  }
}
255
/**
  * Element size in bytes for a sew encoding (b000..b011 -> 1/2/4/8).
  * Used as the per-access data size of indexed instructions.
  */
object SewDataSize {
  def apply (sew: UInt): UInt = {
    // encoding k selects 2^k bytes
    val bytesBySew = (0 until 4).map(k => k.U(3.W) -> (1 << k).U).toList
    LookupTree(sew, bytesBySew)
  }
}
266
/**
  * Element size in bytes for an eew encoding (b000/b101/b110/b111 -> 1/2/4/8).
  * Used as the per-access data size of strided instructions.
  */
object EewDataSize {
  def apply (eew: UInt): UInt = {
    val bytesByEew = List(
      "b000".U -> 1.U, // 1 byte
      "b101".U -> 2.U, // 2 bytes
      "b110".U -> 4.U, // 4 bytes
      "b111".U -> 8.U  // 8 bytes
    )
    LookupTree(eew, bytesByEew)
  }
}
277
/**
  * Data size of a vector load, selected by instruction type: unit-stride uses
  * MulDataSize(emul), strided and segment non-indexed types use EewDataSize(eew),
  * and every indexed type uses SewDataSize(sew).
  */
object loadDataSize {
  def apply (instType: UInt, emul: UInt, eew: UInt, sew: UInt): UInt = {
    def byEew = EewDataSize(eew)
    def bySew = SewDataSize(sew)
    val sizeByInstType = List(
      "b000".U -> MulDataSize(emul), // unit-stride
      "b010".U -> byEew,             // strided
      "b001".U -> bySew,             // indexed-unordered
      "b011".U -> bySew,             // indexed-ordered
      "b100".U -> byEew,             // segment unit-stride
      "b110".U -> byEew,             // segment strided
      "b101".U -> bySew,             // segment indexed-unordered
      "b111".U -> bySew              // segment indexed-ordered
    )
    LookupTree(instType, sizeByInstType)
  }
}
291
/**
  * Data size of a vector store, selected by instruction type: non-indexed types use
  * EewDataSize(eew) and indexed types use SewDataSize(sew).
  */
object storeDataSize {
  def apply (instType: UInt, eew: UInt, sew: UInt): UInt = {
    def byEew = EewDataSize(eew)
    def bySew = SewDataSize(sew)
    val sizeByInstType = List(
      "b000".U -> byEew, // unit-stride, do not use
      "b010".U -> byEew, // strided
      "b001".U -> bySew, // indexed-unordered
      "b011".U -> bySew, // indexed-ordered
      "b100".U -> byEew, // segment unit-stride
      "b110".U -> byEew, // segment strided
      "b101".U -> bySew, // segment indexed-unordered
      "b111".U -> bySew  // segment indexed-ordered
    )
    LookupTree(instType, sizeByInstType)
  }
}
305
/**
  * 16-bit store mask with the low `storeDataSize` bits set
  * (the one-hot of the size, minus one).
  */
object GenVecStoreMask {
  def apply (instType: UInt, eew: UInt, sew: UInt): UInt = {
    val sizeInBytes = storeDataSize(instType = instType, eew = eew, sew = sew)
    val lowOnes = UIntToOH(sizeInBytes) - 1.U
    val mask = Wire(UInt(16.W))
    mask := lowOnes
    mask
  }
}
313
/**
  * EewEq8/16/32/64 pick one index element out of a 128-bit index value; they are used to
  * obtain the immediate address offset of an indexed instruction.
  *
  * EewEq8 selects byte `flow_inner_idx` (0..15) of `index`.
  */
object EewEq8 {
  def apply(index:UInt, flow_inner_idx: UInt): UInt = {
    val byteLanes = (0 until 16).map { lane =>
      lane.U -> index(8 * lane + 7, 8 * lane)
    }.toList
    LookupTree(flow_inner_idx, byteLanes)
  }
}
337
/**
  * Selects 16-bit element `flow_inner_idx` (0..7) of a 128-bit `index` value.
  */
object EewEq16 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    val halfWordLanes = (0 until 8).map { lane =>
      lane.U -> index(16 * lane + 15, 16 * lane)
    }.toList
    LookupTree(flow_inner_idx, halfWordLanes)
  }
}
351
/**
  * Selects 32-bit element `flow_inner_idx` (0..3) of a 128-bit `index` value.
  */
object EewEq32 {
  def apply(index: UInt, flow_inner_idx: UInt): UInt = {
    val wordLanes = (0 until 4).map { lane =>
      lane.U -> index(32 * lane + 31, 32 * lane)
    }.toList
    LookupTree(flow_inner_idx, wordLanes)
  }
}
361
/**
  * Selects 64-bit element `flow_inner_idx` (0 or 1) of a 128-bit `index` value.
  */
object EewEq64 {
  def apply (index: UInt, flow_inner_idx: UInt): UInt = {
    val doubleWordLanes = (0 until 2).map { lane =>
      lane.U -> index(64 * lane + 63, 64 * lane)
    }.toList
    LookupTree(flow_inner_idx, doubleWordLanes)
  }
}
369
/**
  * Extracts the index element for flow `flow_inner_idx` from `index`, using the
  * element width given by the eew encoding (1/2/4/8 bytes).
  */
object IndexAddr {
  def apply (index: UInt, flow_inner_idx: UInt, eew: UInt): UInt = {
    // TODO: index maybe cross register
    val elementByEew = List(
      "b000".U -> EewEq8 (index, flow_inner_idx), // 1-byte index element
      "b101".U -> EewEq16(index, flow_inner_idx), // 2-byte index element
      "b110".U -> EewEq32(index, flow_inner_idx), // 4-byte index element
      "b111".U -> EewEq64(index, flow_inner_idx)  // 8-byte index element
    )
    LookupTree(eew, elementByEew)
  }
}
379
/**
  * Table-based log2 for the values 16, 8, 4, 2 and 1 (-> 4, 3, 2, 1, 0).
  */
object Log2Num {
  def apply (num: UInt): UInt = {
    val log2Table = List(16, 8, 4, 2, 1).map(n => n.U -> log2Ceil(n).U)
    LookupTree(num, log2Table)
  }
}
390
/**
  * Keeps the low bits of `uopIdx` according to the effective multiplier: 0 for
  * multipliers up to 1, then 1/2/3 low bits for multipliers 2/4/8.
  *
  * The effective multiplier is emul, except for indexed instructions (instType bit 0 set),
  * where it is the larger of lmul and emul in signed comparison.
  */
object GenUopIdxInField {
  def apply (instType: UInt, emul: UInt, lmul: UInt, uopIdx: UInt): UInt = {
    val isIndexed = instType(0)
    val largerMul = Mux(lmul.asSInt > emul.asSInt, lmul, emul)
    val mulInField = Mux(isIndexed, largerMul, emul)
    // multiplier <= 1: a single uop per field, index is always 0
    val singleUop: List[(UInt, UInt)] =
      List("b101", "b110", "b111", "b000").map(enc => enc.U -> 0.U)
    val multiUop: List[(UInt, UInt)] = List(
      "b001".U -> uopIdx(0),
      "b010".U -> uopIdx(1, 0),
      "b011".U -> uopIdx(2, 0)
    )
    LookupTree(mulInField, singleUop ++ multiUop)
  }
}
410
/**
  * Decodes an eew encoding into log2 of the element size in bytes: the low two bits of
  * the encoding (b000/b101/b110/b111 -> 0/1/2/3), zero-extended to `ewBits`.
  */
object EewLog2 extends VLSUConstants {
  def apply(eew: UInt): UInt = {
    val sizeLog2 = eew(1, 0)
    ZeroExt(sizeLog2, ewBits)
  }
}
422
/**
  * Number of flows in a uop.
  *
  * Unit-stride loads do not use this method. Other instructions compute the flow number as
  * MulDataSize >> ew(1, 0): MulDataSize is the number of bytes to be written to the
  * register, and ew(1, 0) is log2 of the bytes written at once. Indexed instructions use
  * the emul/eew pair when emul > lmul (signed), otherwise the lmul/sew pair.
  */
object GenRealFlowNum {
  def apply (instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt): UInt = {
    def flowsByEew = (MulDataSize(emul) >> eew(1,0)).asUInt
    def flowsBySew = (MulDataSize(lmul) >> sew(1,0)).asUInt
    def flowsIndexed = Mux(emul.asSInt > lmul.asSInt, flowsByEew, flowsBySew)
    val flowsByInstType = List(
      "b000".U -> flowsByEew,   // store use, load do not use
      "b010".U -> flowsByEew,   // strided
      "b001".U -> flowsIndexed, // indexed-unordered
      "b011".U -> flowsIndexed, // indexed-ordered
      "b100".U -> flowsByEew,   // segment unit-stride
      "b110".U -> flowsByEew,   // segment strided
      "b101".U -> flowsIndexed, // segment indexed-unordered
      "b111".U -> flowsIndexed  // segment indexed-ordered
    )
    LookupTree(instType, flowsByInstType)
  }
}
441
/**
  * GenRealFlowLog2 = Log2(GenRealFlowNum), computed directly from the encodings as
  * min(mul, 0) + log2(VLENB) - ew(1, 0).
  */
object GenRealFlowLog2 extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt): UInt = {
    // Non-negative multipliers contribute 0; negative (fractional) ones contribute their encoding
    def fractionOnly(mul: UInt): UInt = Mux(mul.asSInt >= 0.S, 0.U, mul)
    val regBytesLog2 = log2Up(VLENB).U
    val eewRealFlowLog2 = fractionOnly(emul) + regBytesLog2 - eew(1, 0)
    val sewRealFlowLog2 = fractionOnly(lmul) + regBytesLog2 - sew(1, 0)
    def indexedRealFlowLog2 = Mux(emul.asSInt > lmul.asSInt, eewRealFlowLog2, sewRealFlowLog2)
    val log2ByInstType = List(
      "b000".U -> eewRealFlowLog2,     // unit-stride
      "b010".U -> eewRealFlowLog2,     // strided
      "b001".U -> indexedRealFlowLog2, // indexed-unordered
      "b011".U -> indexedRealFlowLog2, // indexed-ordered
      "b100".U -> eewRealFlowLog2,     // segment unit-stride
      "b110".U -> eewRealFlowLog2,     // segment strided
      "b101".U -> indexedRealFlowLog2, // segment indexed-unordered
      "b111".U -> indexedRealFlowLog2  // segment indexed-ordered
    )
    LookupTree(instType, log2ByInstType)
  }
}
463
/**
  * GenElemIdx generates an element index within an instruction from a uopIdx and a flowIdx
  * inside that uop: the result is uopIdx concatenated with the low log2(flows per uop)
  * bits of flowIdx.
  */
object GenElemIdx extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, eew: UInt, sew: UInt,
    uopIdx: UInt, flowIdx: UInt): UInt = {
    // Positive multipliers contribute 0; the others contribute their encoding
    def fractionOnly(mul: UInt): UInt = Mux(mul.asSInt > 0.S, 0.U, mul)
    val regBytesLog2 = log2Up(VLENB).U
    val eewUopFlowsLog2 = fractionOnly(emul) + regBytesLog2 - eew(1, 0)
    val sewUopFlowsLog2 = fractionOnly(lmul) + regBytesLog2 - sew(1, 0)
    // Only indexed instructions whose emul is not greater than lmul use the sew-based count
    val isIndexed = instType(0).asBool
    val useSewFlows = isIndexed && !(emul.asSInt > lmul.asSInt)
    val uopFlowsLog2 = Mux(useSewFlows, sewUopFlowsLog2, eewUopFlowsLog2)
    val elemIdxByFlowBits: List[(UInt, UInt)] = (0 to 4).map { flowBits =>
      val idx = if (flowBits == 0) uopIdx else uopIdx ## flowIdx(flowBits - 1, 0)
      flowBits.U -> idx
    }.toList
    LookupTree(uopFlowsLog2, elemIdxByFlowBits)
  }
}
488
/**
  * GenVLMAXLog2 computes lmul + log2(VLENB) - sew on the raw encodings.
  * NOTE(review): this reads as log2(VLMAX) with lmul and sew given as log2 encodings
  * (VLMAX = LMUL * VLENB / element bytes) - confirm against the callers.
  */
object GenVLMAXLog2 extends VLSUConstants {
  def apply(lmul: UInt, sew: UInt): UInt = {
    val regBytesLog2 = log2Up(VLENB).U
    lmul + regBytesLog2 - sew
  }
}
/**
  * GenVLMAX returns 1 << GenVLMAXLog2(lmul, sew).
  */
object GenVLMAX {
  def apply(lmul: UInt, sew: UInt): UInt = {
    val vlmaxLog2 = GenVLMAXLog2(lmul, sew)
    1.U << vlmaxLog2
  }
}
498
/**
  * Returns nfields << (log2(VLENB) - eew(1, 0)), i.e. nfields times the number of
  * eew-sized elements in VLENB bytes.
  * NOTE(review): presumably the vl of a unit-stride whole-register access - confirm.
  */
object GenUSWholeRegVL extends VLSUConstants {
  def apply(nfields: UInt, eew: UInt): UInt = {
    val vlByElemSize = (0 until 4).map { sizeLog2 =>
      sizeLog2.U(2.W) -> (nfields << (log2Up(VLENB) - sizeLog2))
    }.toList
    LookupTree(eew(1, 0), vlByElemSize)
  }
}
/**
  * Maps nf = 0/1/3/7 to the emul encodings b000/b001/b010/b011.
  * NOTE(review): presumably nf + 1 is the register count of a whole-register access and the
  * result is its log2 - confirm.
  */
object GenUSWholeEmul extends VLSUConstants{
  def apply(nf: UInt): UInt={
    val emulByNf = (0 until 4).map { emulLog2 =>
      ((1 << emulLog2) - 1).U(3.W) -> emulLog2.U(mulBits.W)
    }.toList
    LookupTree(nf, emulByNf)
  }
}
519
520
/**
  * Returns vl / 8 rounded up.
  * NOTE(review): presumably the byte count of a unit-stride mask access - confirm.
  */
object GenUSMaskRegVL extends VLSUConstants {
  def apply(vl: UInt): UInt = {
    val wholeBytes = vl >> 3.U
    val hasPartialByte = vl(2,0) =/= 0.U
    Mux(hasPartialByte, wholeBytes + 1.U, wholeBytes)
  }
}
526
/**
  * Expands a per-flow mask into a per-byte mask: each flow bit is repeated 1/2/4/8 times
  * for alignedType b00/b01/b10/b11.
  */
object GenUopByteMask {
  def apply(flowMask: UInt, alignedType: UInt): UInt = {
    val byteMaskByType = (0 until 4).map { sizeLog2 =>
      // 1-byte flows need no expansion
      val expanded = if (sizeLog2 == 0) flowMask else FillInterleaved(1 << sizeLog2, flowMask)
      sizeLog2.U(2.W) -> expanded
    }.toList
    LookupTree(alignedType, byteMaskByType)
  }
}
537
/**
  * Maps a uop index to a destination-register (vd) index.
  *
  * Unit-stride and strided instructions, and any instruction with lmul > emul (signed),
  * use uopIdx directly. Otherwise uopIdx is shifted right by (emul - lmul).
  * The result is log2Up(maxMUL) bits wide.
  */
object GenVdIdxInField extends VLSUConstants {
  def apply(instType: UInt, emul: UInt, lmul: UInt, uopIdx: UInt): UInt = {
    val vdIdx = Wire(UInt(log2Up(maxMUL).W))
    when (instType(1,0) === "b00".U || instType(1,0) === "b10".U || lmul.asSInt > emul.asSInt) {
      // Unit-stride or Strided, or lmul > emul
      vdIdx := uopIdx
    }.otherwise {
      // Indexed with lmul <= emul (equal multipliers give multiple = 0, i.e. no shift)
      val multiple = emul - lmul
      vdIdx := LookupTree(multiple, List(
        0.U -> uopIdx,
        1.U -> (uopIdx >> 1),
        2.U -> (uopIdx >> 2),
        3.U -> (uopIdx >> 3)
      ))
    }
    vdIdx
  }
}
/**
  * Generates the flow-active mask from `start` and `vl`.
  *
  * The window is UIntToMask(vl, VLEN) & ~UIntToMask(start, VLEN)
  * (presumably the elements in [start, vl) - confirm against UIntToMask).
  *   mod = true : returns  elementMask & window
  *   mod = false: returns ~elementMask & window
  */
object GenFlowMask extends VLSUConstants {
  def apply(elementMask: UInt, start: UInt, vl: UInt , mod: Boolean): UInt = {
    val belowVl = UIntToMask(vl, VLEN)
    val fromStart = ~UIntToMask(start, VLEN)
    val window = belowVl & fromStart
    val selected = if (mod) elementMask else (~elementMask).asUInt
    selected & window
  }
}
576
/**
  * Alignment flags of `addr`, as a 4-bit vector:
  *   bit 3 = always 1, bit 2 = low 1 bit is zero (16-bit aligned),
  *   bit 1 = low 2 bits are zero (32-bit aligned), bit 0 = low 3 bits are zero (64-bit aligned).
  */
object CheckAligned extends VLSUConstants {
  def apply(addr: UInt): UInt = {
    val halfWordAligned = !addr(0)
    val wordAligned = !addr(1,0).orR
    val doubleWordAligned = !addr(2,0).orR
    Cat(true.B, halfWordAligned, wordAligned, doubleWordAligned)
  }
}
585
/**
  * Checks whether a mask starts with a run of consecutive '1' bits.
  */
object GenPackMask{
  /**
    * True when the first `len` bits of `mask` are all set.
    *
    * @param mask source mask, bit 0 first; must hold at least `len` bits
    * @param len  run length to test, at least 1
    */
  def leadX(mask: Seq[Bool], len: Int): Bool = {
    require(len >= 1, s"leadX: len must be at least 1, got $len")
    require(mask.length >= len, s"leadX: mask has ${mask.length} bits, need at least $len")
    mask.take(len).reduce(_ & _)
  }

  /**
    * Run flags for lengths 1/2/4/8, packed as Cat(lead1, lead2, lead4, lead8):
    * bit 3 = 1-bit run ... bit 0 = 8-bit run. `shiftMask` needs at least 8 bits.
    */
  def leadOneVec(shiftMask: Seq[Bool]): UInt = {
    // max is 64-bit, so the max num of flow to pack is 8
    val runLengths = Seq(1, 2, 4, 8)
    Cat(runLengths.map(len => leadX(shiftMask, len)))
  }

  // Pack mask of `shiftMask`, see leadOneVec
  def apply(shiftMask: UInt): UInt = {
    leadOneVec(shiftMask.asBools)
  }
}
/**
  * PackEnable = (LeadXVec >> eew) & alignedVec, where bit 0 represents the ability to merge
  * into a 64-bit flow, bit 1 the ability to merge into a 32-bit flow, and so on.
  *
  * Example:
  *   addr = 0x0, activeMask = b00011100101111, flowIdx = 0, eew = 0 (8-bit)
  *
  *   step 0 : addrAlignedVec = (1, 1, 1, 1), elemIdxAligned = (1, 1, 1, 1)
  *   step 1 : activePackVec = (1, 1, 1, 0), inactivePackVec = (0, 0, 0, 0)
  *   step 2 : activePackEnable = (1, 1, 1, 0), inactivePackEnable = (0, 0, 0, 0)
  *
  *   so 4 active 8-bit flows can be packed into one 32-bit flow.
  */
object GenPackVec extends VLSUConstants{
  def apply(addr: UInt, shiftMask: UInt, eew: UInt, elemIdx: UInt): UInt = {
    // Both the address and the element index must be aligned for a given pack size
    val alignedVec = CheckAligned(addr) & CheckAligned(elemIdx)
    // Shifting by eew turns "number of consecutive flows" into "packed width"
    val runVec = GenPackMask(shiftMask).asUInt >> eew
    alignedVec & runVec
  }
}
639
/**
  * Picks the widest pack size enabled in `packVec`: bit 0 -> b11, bit 1 -> b10,
  * bit 2 -> b01, otherwise b00 (the last PriorityMux entry is the default).
  */
object GenPackAlignedType extends VLSUConstants{
  def apply(packVec: UInt): UInt = {
    val enables = Seq(packVec(0), packVec(1), packVec(2), packVec(3))
    val alignedTypes = Seq("b11".U, "b10".U, "b01".U, "b00".U)
    PriorityMux(enables, alignedTypes)
  }
}
651
/**
  * Returns 1 << (packAlignedType - alignedType).
  * NOTE(review): presumably the number of original flows merged into one packed flow - confirm.
  */
object GenPackNum extends VLSUConstants{
  def apply(alignedType: UInt, packAlignedType: UInt): UInt = {
    val sizeRatioLog2 = packAlignedType - alignedType
    (1.U << sizeRatioLog2).asUInt
  }
}
656}