xref: /XiangShan/src/main/scala/xiangshan/backend/fu/wrapper/VFALU.scala (revision a63155a6a44b3c7714e55906b55ebf92e0efc125)
package xiangshan.backend.fu.wrapper

import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils.XSError
import xiangshan.backend.fu.FuConfig
import xiangshan.backend.fu.vector.Bundles.{VLmul, VSew, ma}
import xiangshan.backend.fu.vector.utils.VecDataSplitModule
import xiangshan.backend.fu.vector.{Mgu, VecInfo, VecPipedFuncUnit}
import yunsuan.{VfaluType, VfpuType}
import yunsuan.vector.VectorFloatAdder

class VFAlu(cfg: FuConfig)(implicit p: Parameters) extends VecPipedFuncUnit(cfg) {
  XSError(io.in.valid && io.in.bits.ctrl.fuOpType === VfpuType.dummy, "Vfalu OpType not supported")

  // params alias
  private val dataWidth = cfg.dataBits
  private val dataWidthOfDataModule = 64
  private val numVecModule = dataWidth / dataWidthOfDataModule
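  // e.g. with a 128-bit datapath (dataWidth = 128), numVecModule = 2: two 64-bit
  // VectorFloatAdder lanes work in parallel, each covering 1 x f64, 2 x f32 or 4 x f16.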

  // io alias
  private val opcode  = fuOpType(4, 0)
  private val resWiden  = fuOpType(5)
  private val opbWiden  = fuOpType(6)

  // modules
  private val vfalus = Seq.fill(numVecModule)(Module(new VectorFloatAdder))
  private val vs2Split = Module(new VecDataSplitModule(dataWidth, dataWidthOfDataModule))
  private val vs1Split = Module(new VecDataSplitModule(dataWidth, dataWidthOfDataModule))
  private val oldVdSplit  = Module(new VecDataSplitModule(dataWidth, dataWidthOfDataModule))
  private val mgu = Module(new Mgu(dataWidth))

  /**
    * Input connections of [[vs2Split]], [[vs1Split]] and [[oldVdSplit]]
    */
  vs2Split.io.inVecData := vs2
  vs1Split.io.inVecData := vs1
  oldVdSplit.io.inVecData := oldVd

  /**
    * Input connections of [[vfalus]]
    */
  // Vec(vs2(31,0), vs2(63,32), vs2(95,64), vs2(127,96)) ==>
  // Vec(
  //   Cat(vs2(95,64),  vs2(31,0)),
  //   Cat(vs2(127,96), vs2(63,32)),
  // )
  private val vs2GroupedVec: Vec[UInt] = VecInit(vs2Split.io.outVec32b.zipWithIndex.groupBy(_._2 % 2).map(x => x._1 -> x._2.map(_._1)).values.map(x => Cat(x.reverse)).toSeq)
  private val vs1GroupedVec: Vec[UInt] = VecInit(vs1Split.io.outVec32b.zipWithIndex.groupBy(_._2 % 2).map(x => x._1 -> x._2.map(_._1)).values.map(x => Cat(x.reverse)).toSeq)
  private val resultData = Wire(Vec(numVecModule, UInt(dataWidthOfDataModule.W)))
  private val fflagsData = Wire(Vec(numVecModule, UInt(20.W)))
  private val srcMaskRShiftForReduction = Wire(UInt((8 * numVecModule).W))
  // for reduction
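  // First-group uops are those that consume architectural vs2 elements directly:
  // uop 0 always; uop 1 as well for m4/m8; uops 2-3 as well for m8. Later uops
  // only combine partial results, so only first-group uops need the source mask.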
  val isFirstGroupUop = vuopIdx === 0.U ||
    (vuopIdx === 1.U && (vlmul === VLmul.m4 || vlmul === VLmul.m8)) ||
    ((vuopIdx === 2.U || vuopIdx === 3.U) && vlmul === VLmul.m8)
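  // Per-uop mask window for reductions: the window advances by vuopIdx * 16 bits
  // for e16, * 8 for e32, * 4 for e64, i.e. two source registers' worth of elements.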
  val maskRshiftWidthForReduction = Wire(UInt(6.W))
  maskRshiftWidthForReduction := Mux1H(
    Seq(
      (vsew === VSew.e16) -> (vuopIdx(1, 0) << 4),
      (vsew === VSew.e32) -> (vuopIdx(1, 0) << 3),
      (vsew === VSew.e64) -> (vuopIdx(1, 0) << 2),
    )
  )
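  // Clear mask bits at and beyond vl, then shift this uop's window down to bit 0.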
  val vlMaskForReduction = (~(Fill(VLEN, 1.U) << vl)).asUInt
  srcMaskRShiftForReduction := ((srcMask & vlMaskForReduction) >> maskRshiftWidthForReduction)(8 * numVecModule - 1, 0)

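  /**
    * Builds the 8-bit per-lane mask for reduction uops on lane i. The low 4 bits
    * gate the fp_a elements and the high 4 bits gate the fp_b elements.
    * First-group uops slice the (vl-trimmed) source mask; later fold uops use
    * constant masks, since their inputs are partial results rather than
    * architectural source elements.
    */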
  def genMaskForReduction(inmask: UInt, sew: UInt, i: Int): UInt = {
    val f64MaskNum = dataWidth / 64 * 2
    val f32MaskNum = dataWidth / 32 * 2
    val f16MaskNum = dataWidth / 16 * 2
    val f64Mask = inmask(f64MaskNum - 1, 0)
    val f32Mask = inmask(f32MaskNum - 1, 0)
    val f16Mask = inmask(f16MaskNum - 1, 0)
    // vs2 is reordered, so the mask uses the high bits
    val f64FirstFoldMask = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> Cat(0.U(3.W), f64Mask(0), 0.U(3.W), f64Mask(1)),
      )
    )
    val f32FirstFoldMask = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> Cat(0.U(2.W), f32Mask(1), f32Mask(0), 0.U(2.W), f32Mask(3), f32Mask(2)),
        vecCtrl.fpu.isFoldTo1_4 -> Cat(0.U(3.W), f32Mask(0), 0.U(3.W), f32Mask(1)),
      )
    )
    val f16FirstFoldMask = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> Cat(f16Mask(7, 4), f16Mask(3, 0)),
        vecCtrl.fpu.isFoldTo1_4 -> Cat(0.U(2.W), f16Mask(1), f16Mask(0), 0.U(2.W), f16Mask(3), f16Mask(2)),
        vecCtrl.fpu.isFoldTo1_8 -> Cat(0.U(3.W), f16Mask(0), 0.U(3.W), f16Mask(1)),
      )
    )
    val f64FoldMask = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> "b00010001".U,
      )
    )
    val f32FoldMask = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> "b00110011".U,
        vecCtrl.fpu.isFoldTo1_4 -> "b00010001".U,
      )
    )
    val f16FoldMask = Mux1H(
      Seq(
        vecCtrl.fpu.isFoldTo1_2 -> "b11111111".U,
        vecCtrl.fpu.isFoldTo1_4 -> "b00110011".U,
        vecCtrl.fpu.isFoldTo1_8 -> "b00010001".U,
      )
    )
    // low 4 bits for vs2 (fp_a), high 4 bits for vs1 (fp_b)
    val isFold = vecCtrl.fpu.isFoldTo1_2 || vecCtrl.fpu.isFoldTo1_4 || vecCtrl.fpu.isFoldTo1_8
    val f64FirstNotFoldMask = Cat(0.U(3.W), f64Mask(i + 2), 0.U(3.W), f64Mask(i))
    val f32FirstNotFoldMask = Cat(0.U(2.W), f32Mask(i + 5, i + 4), 0.U(2.W), f32Mask(i + 1, i))
    val f16FirstNotFoldMask = Cat(f16Mask(i + 11, i + 8), f16Mask(i + 3, i))
    val f64MaskI = Mux(isFirstGroupUop, Mux(isFold, f64FirstFoldMask, f64FirstNotFoldMask), Mux(isFold, f64FoldMask, Fill(8, 1.U)))
    val f32MaskI = Mux(isFirstGroupUop, Mux(isFold, f32FirstFoldMask, f32FirstNotFoldMask), Mux(isFold, f32FoldMask, Fill(8, 1.U)))
    val f16MaskI = Mux(isFirstGroupUop, Mux(isFold, f16FirstFoldMask, f16FirstNotFoldMask), Mux(isFold, f16FoldMask, Fill(8, 1.U)))
    val outMask = Mux1H(
      Seq(
        (sew === 3.U) -> f64MaskI,
        (sew === 2.U) -> f32MaskI,
        (sew === 1.U) -> f16MaskI,
      )
    )
    outMask
  }
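  /**
    * Slices the per-lane mask for masked (merge/move) operations on lane i,
    * zero-extended to 4 bits: one mask bit per lane at e64, two at e32, four at e16.
    */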
  def genMaskForMerge(inmask: UInt, sew: UInt, i: Int): UInt = {
    val f64MaskNum = dataWidth / 64
    val f32MaskNum = dataWidth / 32
    val f16MaskNum = dataWidth / 16
    val f64Mask = inmask(f64MaskNum - 1, 0)
    val f32Mask = inmask(f32MaskNum - 1, 0)
    val f16Mask = inmask(f16MaskNum - 1, 0)
    val f64MaskI = Cat(0.U(3.W), f64Mask(i))
    val f32MaskI = Cat(0.U(2.W), f32Mask(2 * i + 1, 2 * i))
    val f16MaskI = f16Mask(4 * i + 3, 4 * i)
    val outMask = Mux1H(
      Seq(
        (sew === 3.U) -> f64MaskI,
        (sew === 2.U) -> f32MaskI,
        (sew === 1.U) -> f16MaskI,
      )
    )
    outMask
  }
  val isScalarMove = (fuOpType === VfaluType.vfmv_f_s) || (fuOpType === VfaluType.vfmv_s_f)
  val srcMaskRShift = Wire(UInt((4 * numVecModule).W))
  val maskRshiftWidth = Wire(UInt(6.W))
  maskRshiftWidth := Mux1H(
    Seq(
      (vsew === VSew.e16) -> (vuopIdx(2, 0) << 3),
      (vsew === VSew.e32) -> (vuopIdx(2, 0) << 2),
      (vsew === VSew.e64) -> (vuopIdx(2, 0) << 1),
    )
  )
  srcMaskRShift := (srcMask >> maskRshiftWidth)(4 * numVecModule - 1, 0)
  val fp_aIsFpCanonicalNAN = Wire(Vec(numVecModule, Bool()))
  val fp_bIsFpCanonicalNAN = Wire(Vec(numVecModule, Bool()))
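  // NaN-boxing check for scalar FP instructions routed through the vector
  // datapath: a narrower operand whose upper bits are not all ones must be
  // treated as the canonical NaN (RISC-V F/D NaN-boxing rule).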
  vfalus.zipWithIndex.foreach {
    case (mod, i) =>
      mod.io.fp_a             := Mux(opbWiden, vs1Split.io.outVec64b(i), vs2Split.io.outVec64b(i))  // very dirty TODO
      mod.io.fp_b             := Mux(opbWiden, vs2Split.io.outVec64b(i), vs1Split.io.outVec64b(i))  // very dirty TODO
      mod.io.widen_a          := Cat(vs2Split.io.outVec32b(i + numVecModule), vs2Split.io.outVec32b(i))
      mod.io.widen_b          := Cat(vs1Split.io.outVec32b(i + numVecModule), vs1Split.io.outVec32b(i))
      mod.io.frs1             := 0.U     // already vf -> vv
      mod.io.is_frs1          := false.B // already vf -> vv
      mod.io.mask             := Mux(isScalarMove, !vuopIdx.orR, genMaskForMerge(inmask = srcMaskRShift, sew = vsew, i = i))
      mod.io.maskForReduction := genMaskForReduction(inmask = srcMaskRShiftForReduction, sew = vsew, i = i)
      mod.io.uop_idx          := vuopIdx(0)
      mod.io.is_vec           := true.B // Todo
      mod.io.round_mode       := frm
      mod.io.fp_format        := Mux(resWiden, vsew + 1.U, vsew)
      mod.io.opb_widening     := opbWiden
      mod.io.res_widening     := resWiden
      mod.io.op_code          := opcode
      resultData(i)           := mod.io.fp_result
      fflagsData(i)           := mod.io.fflags
      fp_aIsFpCanonicalNAN(i) := vecCtrl.fpu.isFpToVecInst & (
          ((vsew === VSew.e32) & (!vs2Split.io.outVec64b(i).head(32).andR)) |
          ((vsew === VSew.e16) & (!vs2Split.io.outVec64b(i).head(48).andR))
        )
      fp_bIsFpCanonicalNAN(i) := vecCtrl.fpu.isFpToVecInst & (
          ((vsew === VSew.e32) & (!vs1Split.io.outVec64b(i).head(32).andR)) |
          ((vsew === VSew.e16) & (!vs1Split.io.outVec64b(i).head(48).andR))
        )
      mod.io.fp_aIsFpCanonicalNAN := fp_aIsFpCanonicalNAN(i)
      mod.io.fp_bIsFpCanonicalNAN := fp_bIsFpCanonicalNAN(i)
  }
  val resultDataUInt = resultData.asUInt
  val cmpResultWidth = dataWidth / 16
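  // For mask-producing compares, each element's boolean result sits in that
  // element's least-significant bit; collect one bit per possible element
  // position (up to dataWidth/16 elements at e16).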
  val cmpResult = Wire(Vec(cmpResultWidth, Bool()))
  for (i <- 0 until cmpResultWidth) {
    if (i == 0) {
      cmpResult(i) := resultDataUInt(0)
    }
    else if (i < dataWidth / 64) {
      cmpResult(i) := Mux1H(
        Seq(
          (outVecCtrl.vsew === 1.U) -> resultDataUInt(i * 16),
          (outVecCtrl.vsew === 2.U) -> resultDataUInt(i * 32),
          (outVecCtrl.vsew === 3.U) -> resultDataUInt(i * 64)
        )
      )
    }
    else if (i < dataWidth / 32) {
      cmpResult(i) := Mux1H(
        Seq(
          (outVecCtrl.vsew === 1.U) -> resultDataUInt(i * 16),
          (outVecCtrl.vsew === 2.U) -> resultDataUInt(i * 32),
          (outVecCtrl.vsew === 3.U) -> false.B
        )
      )
    }
    else if (i < dataWidth / 16) {
      cmpResult(i) := Mux(outVecCtrl.vsew === 1.U, resultDataUInt(i * 16), false.B)
    }
  }

  val outEew = Mux(RegNext(resWiden), outVecCtrl.vsew + 1.U, outVecCtrl.vsew)
  val outVuopidx = outVecCtrl.vuopIdx(2, 0)
  val vlMax = ((VLEN / 8).U >> outEew).asUInt
  val lmulAbs = Mux(outVecCtrl.vlmul(2), (~outVecCtrl.vlmul(1, 0)).asUInt + 1.U, outVecCtrl.vlmul(1, 0))
  // vfmv_f_s needs vl = 1, the last uop of a reduction needs vl = 1, all other uops need vl = vlmax
  val numOfUopVFRED = {
    // addTime includes the add of frs1
    val addTime = MuxLookup(outVecCtrl.vlmul, 1.U(4.W), Array(
      VLmul.m2 -> 2.U,
      VLmul.m4 -> 4.U,
      VLmul.m8 -> 8.U,
    ))
    val foldLastVlmul = MuxLookup(outVecCtrl.vsew, "b000".U, Array(
      VSew.e16 -> VLmul.mf8,
      VSew.e32 -> VLmul.mf4,
      VSew.e64 -> VLmul.mf2,
    ))
    // lmul < 1: foldTime = vlmul - foldLastVlmul
    // lmul >= 1: foldTime = 0.U - foldLastVlmul
    val foldTime = Mux(outVecCtrl.vlmul(2), outVecCtrl.vlmul, 0.U) - foldLastVlmul
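    // The subtraction wraps modulo 8 (3-bit vlmul encoding, with the usual
    // mf8 = b101, mf4 = b110, mf2 = b111): e.g. for e32 with lmul >= 1,
    // foldTime = 0 - b110 = 2, i.e. two fold uops to reduce one register's
    // partial results down to a single element.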
    addTime + foldTime
  }
  val reductionVl = Mux(outVecCtrl.vuopIdx === numOfUopVFRED - 1.U, 1.U, vlMax)
  val outIsReduction = outCtrl.fuOpType === VfaluType.vfredusum || outCtrl.fuOpType === VfaluType.vfredmax || outCtrl.fuOpType === VfaluType.vfredmin
  val outVlFix = Mux(
    outVecCtrl.fpu.isFpToVecInst || (outCtrl.fuOpType === VfaluType.vfmv_f_s),
    1.U,
    Mux(
      outCtrl.fuOpType === VfaluType.vfmv_s_f,
      outVl.orR,
      Mux(outIsReduction, reductionVl, outVl)
    )
  )
  val vlMaxAllUop = Wire(outVl.cloneType)
  vlMaxAllUop := Mux(outVecCtrl.vlmul(2), vlMax >> lmulAbs, vlMax << lmulAbs).asUInt
  val vlMaxThisUop = Mux(outVecCtrl.vlmul(2), vlMax >> lmulAbs, vlMax).asUInt
  val vlSetThisUop = Mux(outVlFix > outVuopidx * vlMaxThisUop, outVlFix - outVuopidx * vlMaxThisUop, 0.U)
  val vlThisUop = Wire(UInt(3.W))
  vlThisUop := Mux(vlSetThisUop < vlMaxThisUop, vlSetThisUop, vlMaxThisUop)
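  // One flag bit per possible element position in this uop: only the first
  // vlThisUop bits are set, so elements at or beyond vl never raise fflags.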
  val vlMaskRShift = Wire(UInt((4 * numVecModule).W))
  vlMaskRShift := Fill(4 * numVecModule, 1.U(1.W)) >> ((4 * numVecModule).U - vlThisUop)

  private val needNoMask = outCtrl.fuOpType === VfaluType.vfmerge ||
    outCtrl.fuOpType === VfaluType.vfmv_s_f ||
    outIsReduction ||
    outVecCtrl.fpu.isFpToVecInst
  val maskToMgu = Mux(needNoMask, allMaskTrue, outSrcMask)
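  // fflags gating: each 64-bit lane reports 4 x 5 fflags bits. An element's
  // flags are accumulated only if it is active under the mask and below vl;
  // reductions accumulate unconditionally.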
  val allFFlagsEn = Wire(Vec(4 * numVecModule, Bool()))
  val outSrcMaskRShift = Wire(UInt((4 * numVecModule).W))
  outSrcMaskRShift := (maskToMgu >> (outVecCtrl.vuopIdx(2, 0) * vlMax))(4 * numVecModule - 1, 0)
  val f16FFlagsEn = outSrcMaskRShift
  val f32FFlagsEn = Wire(Vec(numVecModule, UInt(4.W)))
  for (i <- 0 until numVecModule) {
    f32FFlagsEn(i) := Cat(Fill(2, 0.U), outSrcMaskRShift(2 * i + 1, 2 * i))
  }
  val f64FFlagsEn = Wire(Vec(numVecModule, UInt(4.W)))
  for (i <- 0 until numVecModule) {
    f64FFlagsEn(i) := Cat(Fill(3, 0.U), outSrcMaskRShift(i))
  }
  val fflagsEn = Mux1H(
    Seq(
      (outEew === 1.U) -> f16FFlagsEn.asUInt,
      (outEew === 2.U) -> f32FFlagsEn.asUInt,
      (outEew === 3.U) -> f64FFlagsEn.asUInt
    )
  )
  allFFlagsEn := Mux(outIsReduction, Fill(4 * numVecModule, 1.U), (fflagsEn & vlMaskRShift)).asTypeOf(allFFlagsEn)

  val allFFlags = fflagsData.asTypeOf(Vec(4 * numVecModule, UInt(5.W)))
  val outFFlags = allFFlagsEn.zip(allFFlags).map {
    case (en, fflags) => Mux(en, fflags, 0.U(5.W))
  }.reduce(_ | _)
  io.out.bits.res.fflags.get := outFFlags

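  // Mask-agnostic handling for compare results: inactive element positions keep
  // the old vd bit, or are set to 1 when vma (mask-agnostic) is on.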
  val cmpResultOldVd = Wire(UInt(cmpResultWidth.W))
  val cmpResultOldVdRshiftWidth = Wire(UInt(6.W))
  cmpResultOldVdRshiftWidth := Mux1H(
    Seq(
      (outVecCtrl.vsew === VSew.e16) -> (outVecCtrl.vuopIdx(2, 0) << 3),
      (outVecCtrl.vsew === VSew.e32) -> (outVecCtrl.vuopIdx(2, 0) << 2),
      (outVecCtrl.vsew === VSew.e64) -> (outVecCtrl.vuopIdx(2, 0) << 1),
    )
  )
  cmpResultOldVd := (outOldVd >> cmpResultOldVdRshiftWidth)(4 * numVecModule - 1, 0)
  val cmpResultForMgu = Wire(Vec(cmpResultWidth, Bool()))
  for (i <- 0 until cmpResultWidth) {
    cmpResultForMgu(i) := Mux(outSrcMaskRShift(i), cmpResult(i), Mux(outVecCtrl.vma, true.B, cmpResultOldVd(i)))
  }

  mgu.io.in.vd := Mux(outVecCtrl.isDstMask, Cat(0.U((dataWidth / 16 * 15).W), cmpResultForMgu.asUInt), resultDataUInt)
  mgu.io.in.oldVd := outOldVd
  mgu.io.in.mask := maskToMgu
  mgu.io.in.info.ta := Mux(outCtrl.fuOpType === VfaluType.vfmv_f_s, true.B, outVecCtrl.vta)
  mgu.io.in.info.ma := Mux(outCtrl.fuOpType === VfaluType.vfmv_s_f, true.B, outVecCtrl.vma)
  mgu.io.in.info.vl := outVlFix
  mgu.io.in.info.vlmul := outVecCtrl.vlmul
  mgu.io.in.info.valid := io.out.valid
  mgu.io.in.info.vstart := Mux(outVecCtrl.fpu.isFpToVecInst, 0.U, outVecCtrl.vstart)
  mgu.io.in.info.eew := outEew
  mgu.io.in.info.vsew := outVecCtrl.vsew
  mgu.io.in.info.vdIdx := Mux(outIsReduction, 0.U, outVecCtrl.vuopIdx)
  mgu.io.in.info.narrow := outVecCtrl.isNarrow
  mgu.io.in.info.dstMask := outVecCtrl.isDstMask
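  // Scalar fclass/compare results via the vector datapath are at most 16 bits
  // wide; zero the upper bits of vd so stale lane data does not leak out.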
  val resultFpMask = Wire(UInt(VLEN.W))
  val isFclass = outVecCtrl.fpu.isFpToVecInst && (outCtrl.fuOpType === VfaluType.vfclass)
  val fpCmpFuOpType = Seq(VfaluType.vfeq, VfaluType.vflt, VfaluType.vfle)
  val isCmp = outVecCtrl.fpu.isFpToVecInst && (fpCmpFuOpType.map(_ === outCtrl.fuOpType).reduce(_ | _))
  resultFpMask := Mux(isFclass || isCmp, Fill(16, 1.U(1.W)), Fill(VLEN, 1.U(1.W)))
  io.out.bits.res.data := mgu.io.out.vd & resultFpMask

}

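// VFMgu below is an unused skeleton: its data path is not implemented, and
// VFAlu uses the shared Mgu module instead.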
class VFMgu(vlen: Int)(implicit p: Parameters) extends Module {
  val io = IO(new VFMguIO(vlen))

  val vd = io.in.vd
  val oldvd = io.in.oldVd
  val mask = io.in.mask
  val vsew = io.in.info.eew
  val num16bits = vlen / 16

  // Placeholder drive so the skeleton elaborates if instantiated;
  // merge/tail handling is not implemented here.
  io.out.vd := vd
}

class VFMguIO(vlen: Int)(implicit p: Parameters) extends Bundle {
  val in = new Bundle {
    val vd = Input(UInt(vlen.W))
    val oldVd = Input(UInt(vlen.W))
    val mask = Input(UInt(vlen.W))
    val info = Input(new VecInfo)
  }
  val out = new Bundle {
    val vd = Output(UInt(vlen.W))
  }
}