/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles._
import xiangshan.mem._
import xiangshan.backend.fu.vector.Bundles._


/**
  * Two-stage front end of the vector load/store split path.
  *
  *  - s0 decodes the incoming uop (eew/sew/lmul/emul, instruction type, flow mask, byte mask)
  *    and packs the result into a [[VLSBundle]].
  *  - s1 holds the registered bundle, requests a merge-buffer entry through
  *    `io.toMergeBuffer`, computes the uop offset / stride and drives `io.out`.
  *
  * @param isVStore true when this pipeline serves vector stores, false for vector loads
  */
class VSplitPipeline(isVStore: Boolean = false)(implicit p: Parameters) extends VLSUModule{
  val io = IO(new VSplitPipelineIO(isVStore))
  // Instruction-class predicates; the load/store specific subclasses override these.
  def us_whole_reg(fuOpType: UInt): Bool = false.B
  def us_mask(fuOpType: UInt): Bool = false.B
  def us_fof(fuOpType: UInt): Bool = false.B

  // Running index attached to merge-buffer requests; advanced once per uop leaving s1.
  val vdIdxReg = RegInit(0.U(3.W))

  val s1_ready = WireInit(false.B)
  io.in.ready := s1_ready

  /**-----------------------------------------------------------
    * s0 stage
    * decode and generate AlignedType, uop mask, preIsSplit
    * ----------------------------------------------------------
    */
  val s0_vtype = io.in.bits.uop.vpu.vtype
  val s0_sew = s0_vtype.vsew
  val s0_eew = io.in.bits.uop.vpu.veew
  val s0_lmul = s0_vtype.vlmul
  // when load whole register or unit-stride masked , emul should be 1
  val s0_fuOpType = io.in.bits.uop.fuOpType
  val s0_mop = s0_fuOpType(6, 5)
  val s0_nf = Mux(us_whole_reg(s0_fuOpType), 0.U, io.in.bits.uop.vpu.nf)
  val s0_vm = io.in.bits.uop.vpu.vm
  val s0_emul = Mux(
    us_whole_reg(s0_fuOpType),
    GenUSWholeEmul(io.in.bits.uop.vpu.nf),
    Mux(us_mask(s0_fuOpType), 0.U(mulBits.W), EewLog2(s0_eew) - s0_sew + s0_lmul)
  )
  // Everything except a plain (non-fof) unit-stride access is split per element later on.
  val s0_preIsSplit = !(isUnitStride(s0_mop) && !us_fof(s0_fuOpType))
  val s0_nfield = s0_nf +& 1.U

  val s0_valid = Wire(Bool())
  val s0_kill = io.in.bits.uop.robIdx.needFlush(io.redirect)
  val s0_can_go = s1_ready
  val s0_fire = s0_valid && s0_can_go
  val s0_out = Wire(new VLSBundle(isVStore))

  val isUsWholeReg = isUnitStride(s0_mop) && us_whole_reg(s0_fuOpType)
  val isMaskReg = isUnitStride(s0_mop) && us_mask(s0_fuOpType)
  val isSegment = s0_nf =/= 0.U && !us_whole_reg(s0_fuOpType)
  val instType = Cat(isSegment, s0_mop)
  val uopIdx = io.in.bits.uop.vpu.vuopIdx
  val uopIdxInField = GenUopIdxInField(instType, s0_emul, s0_lmul, uopIdx)
  val vdIdxInField = GenVdIdxInField(instType, s0_emul, s0_lmul, uopIdxInField)
  val lmulLog2 = Mux(s0_lmul.asSInt >= 0.S, 0.U, s0_lmul)
  val emulLog2 = Mux(s0_emul.asSInt >= 0.S, 0.U, s0_emul)
  val numEewLog2 = emulLog2 - EewLog2(s0_eew)
  val numSewLog2 = lmulLog2 - s0_sew
  val numFlowsSameVdLog2 = Mux(
    isIndexed(instType),
    log2Up(VLENB).U - s0_sew(1,0),
    log2Up(VLENB).U - s0_eew(1,0)
  )
  // numUops = nf * max(lmul, emul)
  val lmulLog2Pos = Mux(s0_lmul.asSInt < 0.S, 0.U, s0_lmul)
  val emulLog2Pos = Mux(s0_emul.asSInt < 0.S, 0.U, s0_emul)
  val numUops = Mux(
    isIndexed(s0_mop) && s0_lmul.asSInt > s0_emul.asSInt,
    (s0_nf +& 1.U) << lmulLog2Pos,
    (s0_nf +& 1.U) << emulLog2Pos
  )

  val vvl = io.in.bits.src_vl.asTypeOf(VConfig()).vl
  val evl = Mux(isUsWholeReg,
    GenUSWholeRegVL(io.in.bits.uop.vpu.nf +& 1.U, s0_eew),
    Mux(isMaskReg,
      GenUSMaskRegVL(vvl),
      vvl))
  val vvstart = io.in.bits.uop.vpu.vstart
  val alignedType = Mux(isIndexed(instType), s0_sew(1, 0), s0_eew(1, 0))
  val broadenAligendType = Mux(s0_preIsSplit, Cat("b0".U, alignedType), "b100".U) // if is unit-stride, use 128-bits memory access
  val flowsLog2 = GenRealFlowLog2(instType, s0_emul, s0_lmul, s0_eew, s0_sew)
  val flowsPrevThisUop = uopIdxInField << flowsLog2 // # of flows before this uop in a field
  val flowsPrevThisVd = vdIdxInField << numFlowsSameVdLog2 // # of flows before this vd in a field
  val flowsIncludeThisUop = (uopIdxInField +& 1.U) << flowsLog2 // # of flows up to and including this uop
  val flowNum = io.in.bits.flowNum.get
  val srcMask = GenFlowMask(Mux(s0_vm, Fill(VLEN, 1.U(1.W)), io.in.bits.src_mask), vvstart, evl, true)

  // Keep only the mask bits that belong to this uop, then shift them down to this vd's base.
  val flowMask = ((srcMask &
    UIntToMask(flowsIncludeThisUop.asUInt, VLEN + 1) &
    (~UIntToMask(flowsPrevThisUop.asUInt, VLEN)).asUInt
  ) >> flowsPrevThisVd)(VLENB - 1, 0)
  val vlmax = GenVLMAX(s0_lmul, s0_sew)

  // connect
  s0_out := DontCare
  s0_out match {case x =>
    x.uop := io.in.bits.uop
    x.uop.vpu.vl := evl
    x.uop.uopIdx := uopIdx
    x.uop.numUops := numUops
    x.uop.lastUop := (uopIdx +& 1.U) === numUops
    x.uop.vpu.nf := s0_nf
    x.flowMask := flowMask
    x.byteMask := GenUopByteMask(flowMask, Cat("b0".U, alignedType))(VLENB - 1, 0)
    x.fof := isUnitStride(s0_mop) && us_fof(s0_fuOpType)
    x.baseAddr := io.in.bits.src_rs1
    x.stride := io.in.bits.src_stride
    x.flowNum := flowNum
    x.nfields := s0_nfield
    x.vm := s0_vm
    x.usWholeReg := isUsWholeReg
    x.usMaskReg := isMaskReg
    x.eew := s0_eew
    x.sew := s0_sew
    x.emul := s0_emul
    x.lmul := s0_lmul
    x.vlmax := Mux(isUsWholeReg, evl, vlmax)
    x.instType := instType
    x.data := io.in.bits.src_vs3
    x.vdIdxInField := vdIdxInField
    x.preIsSplit := s0_preIsSplit
    x.alignedType := broadenAligendType
  }
  s0_valid := io.in.valid && !s0_kill
  /**-------------------------------------
    * s1 stage
    * ------------------------------------
    * generate UopOffset
    */
  val s1_valid = RegInit(false.B)
  val s1_kill = Wire(Bool())
  val s1_in = Wire(new VLSBundle(isVStore))
  val s1_can_go = io.out.ready && io.toMergeBuffer.resp.valid
  val s1_fire = s1_valid && !s1_kill && s1_can_go

  s1_ready := s1_kill || !s1_valid || io.out.ready && io.toMergeBuffer.resp.valid

  when(s0_fire){
    s1_valid := true.B
  }.elsewhen(s1_fire){
    s1_valid := false.B
  }.elsewhen(s1_kill){
    s1_valid := false.B
  }
  s1_in := RegEnable(s0_out, s0_fire)

  val s1_uopidx = s1_in.uop.vpu.vuopIdx
  val s1_nf = s1_in.uop.vpu.nf
  val s1_nfields = s1_in.nfields
  val s1_eew = s1_in.eew
  val s1_instType = s1_in.instType
  val s1_stride = s1_in.stride
  val s1_vmask = FillInterleaved(8, s1_in.byteMask)(VLEN-1, 0)
  val s1_alignedType = s1_in.alignedType
  val s1_notIndexedStride = Mux( // stride for strided/unit-stride instruction
    isStrided(s1_instType),
    s1_stride(XLEN - 1, 0), // for strided load, stride = x[rs2]
    s1_nfields << s1_eew(1, 0) // for unit-stride load, stride = eew * NFIELDS
  )
  val uopOffset = (s1_uopidx >> s1_nf) << s1_alignedType
  val stride = Mux(isIndexed(s1_instType), s1_stride, s1_notIndexedStride) // if is index instructions, get index when split

  s1_kill := s1_in.uop.robIdx.needFlush(io.redirect)

  // query mergeBuffer
  io.toMergeBuffer.req.valid := s1_fire // only can_go will get MergeBuffer entry
  // NOTE(review): `flowNum` below is the s0-side (unregistered) input value, not s1_in.flowNum
  // -- confirm this is intended before relying on it for back-to-back uops.
  io.toMergeBuffer.req.bits.flowNum := Mux(s1_in.preIsSplit, PopCount(s1_in.flowMask), flowNum)
  io.toMergeBuffer.req.bits.data := s1_in.data
  io.toMergeBuffer.req.bits.uop := s1_in.uop
  io.toMergeBuffer.req.bits.mask := s1_in.flowMask
  io.toMergeBuffer.req.bits.vaddr := DontCare
  io.toMergeBuffer.req.bits.vdIdx := vdIdxReg
  io.toMergeBuffer.req.bits.fof := s1_in.fof
  io.toMergeBuffer.req.bits.vlmax := s1_in.vlmax
//   io.toMergeBuffer.req.bits.vdOffset :=

  // Update vdIdxReg only when the uop actually leaves s1 (s1_fire), or on a kill.
  // Gating on s1_valid would move the counter on every stalled cycle, so the vdIdx
  // presented together with the merge-buffer request (valid on s1_fire) would be wrong.
  when (s1_in.uop.lastUop && s1_fire || s1_kill) {
    vdIdxReg := 0.U
  }.elsewhen(s1_fire) {
    vdIdxReg := vdIdxReg + 1.U
    XSError(vdIdxReg + 1.U === 0.U, s"Overflow! The number of vd should be less than 8\n")
  }
  // out connect
  io.out.valid := s1_valid && io.toMergeBuffer.resp.valid
  io.out.bits := s1_in
  io.out.bits.uopOffset := uopOffset
  io.out.bits.stride := stride
  io.out.bits.mBIndex := io.toMergeBuffer.resp.bits.mBIndex

  XSPerfAccumulate("split_out", io.out.fire)
  XSPerfAccumulate("pipe_block", io.out.valid && !io.out.ready)
  XSPerfAccumulate("mbuffer_block", s1_valid && io.out.ready && !io.toMergeBuffer.resp.valid)
}

/**
  * Circular queue of split-pipeline outputs. The entry at `deqPtr` is issued flow by flow:
  * `splitIdx` counts the flows already handled for that entry and `strideOffsetReg`
  * accumulates the entry's stride for the non-indexed case.
  *
  * Subclasses provide `bufferSize` and the load/store specific queue-index wiring.
  *
  * @param isVStore true when the buffer serves vector stores, false for vector loads
  */
abstract class VSplitBuffer(isVStore: Boolean = false)(implicit p: Parameters) extends VLSUModule{
  val io = IO(new VSplitBufferIO(isVStore))

  val bufferSize: Int

  class VSplitPtr(implicit p: Parameters) extends CircularQueuePtr[VSplitPtr](bufferSize){
  }

  object VSplitPtr {
    def apply(f: Bool, v: UInt)(implicit p: Parameters): VSplitPtr = {
      val ptr = Wire(new VSplitPtr)
      ptr.flag := f
      ptr.value := v
      ptr
    }
  }

  val uopq = Reg(Vec(bufferSize, new VLSBundle(isVStore)))
  val valid = RegInit(VecInit(Seq.fill(bufferSize)(false.B)))
  val srcMaskVec = Reg(Vec(bufferSize, UInt(VLEN.W)))
  // ptr
  val enqPtr = RegInit(0.U.asTypeOf(new VSplitPtr))
  val deqPtr = RegInit(0.U.asTypeOf(new VSplitPtr))
  // for split
  val splitIdx = RegInit(0.U(flowIdxBits.W))
  val strideOffsetReg = RegInit(0.U(VLEN.W))

  /**
    * Redirect
    */
  val flushed = WireInit(VecInit(Seq.fill(bufferSize)(false.B))) // entry has been flushed by the redirect arrived in the pre 1 cycle
  val flushVec = (valid zip flushed).zip(uopq).map { case ((v, f), entry) => v && entry.uop.robIdx.needFlush(io.redirect) && !f }
  val flushEnq = io.in.fire && io.in.bits.uop.robIdx.needFlush(io.redirect)
  val flushNumReg = RegNext(PopCount(flushEnq +: flushVec))
  val redirectReg = RegNext(io.redirect)
  val flushVecReg = RegNext(WireInit(VecInit(flushVec)))

  // enqueue
  when (io.in.fire && !flushEnq) {
    val id = enqPtr.value
    uopq(id) := io.in.bits
    valid(id) := true.B
  }
  io.in.ready := isNotBefore(enqPtr, deqPtr)

  //split uops
  val issueValid       = valid(deqPtr.value)
  val issueEntry       = uopq(deqPtr.value)
  val issueMbIndex     = issueEntry.mBIndex
  val issueFlowNum     = issueEntry.flowNum
  val issueBaseAddr    = issueEntry.baseAddr
  val issueUop         = issueEntry.uop
  val issueUopIdx      = issueUop.vpu.vuopIdx
  val issueInstType    = issueEntry.instType
  val issueUopOffset   = issueEntry.uopOffset
  val issueEew         = issueEntry.eew
  val issueSew         = issueEntry.sew
  // Each name is bound to its matching bundle field; they were previously crossed,
  // which fed GenElemIdx with lmul and emul exchanged.
  val issueLmul        = issueEntry.lmul
  val issueEmul        = issueEntry.emul
  val issueAlignedType = issueEntry.alignedType
  val issuePreIsSplit  = issueEntry.preIsSplit
  val issueByteMask    = issueEntry.byteMask
  val elemIdx = GenElemIdx(
    instType = issueInstType,
    emul = issueEmul,
    lmul = issueLmul,
    eew = issueEew,
    sew = issueSew,
    uopIdx = issueUopIdx,
    flowIdx = splitIdx
  ) // elemIdx inside an inst, for exception
  val indexedStride = IndexAddr( // index for indexed instruction
    index = issueEntry.stride,
    flow_inner_idx = ((splitIdx << issueEew(1, 0))(vOffsetBits - 1, 0) >> issueEew(1, 0)).asUInt,
    eew = issueEew
  )
  val issueStride = Mux(isIndexed(issueInstType), indexedStride, strideOffsetReg)
  val vaddr = issueBaseAddr + issueUopOffset + issueStride
  val mask = genVWmask128(vaddr ,issueAlignedType) // scalar mask for flow
  val flowMask = issueEntry.flowMask
  val vecActive = (flowMask & UIntToOH(splitIdx)).orR
  /*
   * Unit-Stride split to one flow or two flow.
   * for Unit-Stride, if uop's addr is aligned with 128-bits, split it to one flow, otherwise split two
   */

  val usAligned128 = (vaddr(3,0) === 0.U)// addr 128-bit aligned
  val usSplitMask = genUSSplitMask(issueByteMask, splitIdx, vaddr(3,0))
  val usNoSplit = (usAligned128 || !(vaddr(3,0) +& PopCount(usSplitMask))(4)) && !issuePreIsSplit && (splitIdx === 0.U)// unit-stride uop don't need to split into two flow
  val usSplitVaddr = genUSSplitAddr(vaddr, splitIdx)
  val regOffset = vaddr(3,0) // offset in 256-bits vd
  XSError((splitIdx > 1.U && usNoSplit) || (splitIdx > 1.U && !issuePreIsSplit) , "Unit-Stride addr split error!\n")

  // data
  io.out.bits match { case x =>
    x.uop := issueUop
    x.vaddr := Mux(!issuePreIsSplit, usSplitVaddr, vaddr)
    x.alignedType := issueAlignedType
    x.isvec := true.B
    x.mask := Mux(!issuePreIsSplit, usSplitMask, mask)
    x.reg_offset := regOffset //for merge unit-stride data
    x.vecActive := vecActive
    x.is_first_ele := DontCare
    x.usSecondInv := usNoSplit
    x.elemIdx := elemIdx
    x.elemIdxInsideVd := splitIdx // if is Unit-Stride, elemIdx is the index of 2 splited mem request (for merge data)
    x.uop_unit_stride_fof := DontCare
    x.isFirstIssue := DontCare
    x.mBIndex := issueMbIndex
  }

  //update enqptr
  when (redirectReg.valid && flushNumReg =/= 0.U) {
    enqPtr := enqPtr - flushNumReg
  }.otherwise {
    when (io.in.fire) {
      enqPtr := enqPtr + 1.U
    }
  }

  // flush queue
  for (i <- 0 until bufferSize) {
    when(flushVecReg(i) && redirectReg.valid && flushNumReg =/= 0.U) {
      valid(i) := false.B
      flushed(i) := true.B
    }
  }

  /* Execute logic */
  /** Issue to scalar pipeline**/
  val canIssue = Wire(Bool())
  val allowIssue = io.out.ready
  val doIssue = Wire(Bool())
  val deqValid = valid(deqPtr.value)
  // An inactive (masked-off) element of a pre-split uop is consumed without being sent out.
  val inActiveIssue = deqValid && canIssue && !vecActive && issuePreIsSplit
  val issueCount = Mux(usNoSplit, 2.U, (PopCount(inActiveIssue) + PopCount(doIssue))) // for dont need split unit-stride, issue two flow

  // handshake
  val thisPtr = deqPtr.value
  canIssue := !issueUop.robIdx.needFlush(io.redirect) && deqPtr < enqPtr
  doIssue := canIssue && allowIssue
  when (!RegNext(io.redirect.valid) || distanceBetween(enqPtr, deqPtr) > flushNumReg) {
    when ((splitIdx < (issueFlowNum - issueCount))) {
      when (doIssue || inActiveIssue) {
        // The uop has not been entirely split yet
        splitIdx := splitIdx + issueCount
        strideOffsetReg := strideOffsetReg + issueEntry.stride
      }
    }.otherwise {
      when (doIssue || inActiveIssue) {
        // The uop is done splitting
        splitIdx := 0.U(flowIdxBits.W) // initialize flowIdx
        valid(deqPtr.value) := false.B
        strideOffsetReg := 0.U
        deqPtr := deqPtr + 1.U
      }
    }
  }.otherwise {
    splitIdx := 0.U(flowIdxBits.W) // initialize flowIdx
    strideOffsetReg := 0.U
  }

  // out connect
  io.out.valid := canIssue && (vecActive || !issuePreIsSplit) // TODO: inactive uop do not send to pipeline

  XSPerfAccumulate("out_valid", io.out.valid)
  XSPerfAccumulate("out_fire", io.out.fire)
  XSPerfAccumulate("out_fire_unitstride", io.out.fire && !issuePreIsSplit)
  XSPerfAccumulate("unitstride_vlenAlign", io.out.fire && !issuePreIsSplit && io.out.bits.vaddr(3, 0) === 0.U)
  XSPerfAccumulate("unitstride_invalid", io.out.ready && canIssue && !issuePreIsSplit && PopCount(io.out.bits.mask).orR)

  QueuePerf(bufferSize, distanceBetween(enqPtr, deqPtr), !io.in.ready)
}

/** Store-side split buffer: additionally selects the per-flow store data and drives `io.vstd`. */
class VSSplitBufferImp(implicit p: Parameters) extends VSplitBuffer(isVStore = true){
  override lazy val bufferSize = SplitBufferSize
  // split data
  val splitData = genVSData(
    data = issueEntry.data.asUInt,
    elemIdx = splitIdx,
    alignedType = issueAlignedType
  )
  val flowData = genVWdata(splitData, issueAlignedType)
  val usSplitData = genUSSplitData(issueEntry.data.asUInt, splitIdx, vaddr(3,0))

  // Each flow uses the uop's sqIdx advanced by the flow index.
  val sqIdx = issueUop.sqIdx + splitIdx
  io.out.bits.uop.sqIdx := sqIdx

  // send data to sq
  val vstd = io.vstd.get
  vstd.valid := canIssue
  vstd.bits.uop := issueUop
  vstd.bits.uop.sqIdx := sqIdx
  vstd.bits.data := Mux(!issuePreIsSplit, usSplitData, flowData)
  vstd.bits.debug := DontCare
  vstd.bits.vdIdx.get := DontCare
  vstd.bits.vdIdxInField.get := DontCare
  vstd.bits.mask.get := Mux(!issuePreIsSplit, usSplitMask, mask)

}

/** Load-side split buffer: each flow uses the uop's lqIdx advanced by the flow index. */
class VLSplitBufferImp(implicit p: Parameters) extends VSplitBuffer(isVStore = false){
  override lazy val bufferSize = SplitBufferSize
  io.out.bits.uop.lqIdx := issueUop.lqIdx + splitIdx
}

/** Store-side split pipeline: instruction-class predicates keyed on [[VstuType]]. */
class VSSplitPipelineImp(implicit p: Parameters) extends VSplitPipeline(isVStore = true){
  override def us_whole_reg(fuOpType: UInt): Bool = fuOpType === VstuType.vsr
  override def us_mask(fuOpType: UInt): Bool      = fuOpType === VstuType.vsm
  override def us_fof(fuOpType: UInt): Bool       = false.B // dont have vector fof store
}

/** Load-side split pipeline: instruction-class predicates keyed on [[VlduType]]. */
class VLSplitPipelineImp(implicit p: Parameters) extends VSplitPipeline(isVStore = false){

  override def us_whole_reg(fuOpType: UInt): Bool = fuOpType === VlduType.vlr
  override def us_mask(fuOpType: UInt): Bool      = fuOpType === VlduType.vlm
  override def us_fof(fuOpType: UInt): Bool       = fuOpType === VlduType.vleff
}

/** Vector-load split unit: split pipeline feeding a split buffer. */
class VLSplitImp(implicit p: Parameters) extends VLSUModule{
  val io = IO(new VSplitIO(isVStore=false))
  val splitPipeline = Module(new VLSplitPipelineImp())
  val splitBuffer = Module(new VLSplitBufferImp())
  // Split Pipeline
  splitPipeline.io.in <> io.in
  splitPipeline.io.redirect <> io.redirect
  io.toMergeBuffer <> splitPipeline.io.toMergeBuffer

  // Split Buffer
  splitBuffer.io.in <> splitPipeline.io.out
  splitBuffer.io.redirect <> io.redirect
  io.out <> splitBuffer.io.out
}

/** Vector-store split unit: split pipeline feeding a split buffer, plus the store-data port. */
class VSSplitImp(implicit p: Parameters) extends VLSUModule{
  val io = IO(new VSplitIO(isVStore=true))
  val splitPipeline = Module(new VSSplitPipelineImp())
  val splitBuffer = Module(new VSSplitBufferImp())
  // Split Pipeline
  splitPipeline.io.in <> io.in
  splitPipeline.io.redirect <> io.redirect
  io.toMergeBuffer <> splitPipeline.io.toMergeBuffer

  // Split Buffer
  splitBuffer.io.in <> splitPipeline.io.out
  splitBuffer.io.redirect <> io.redirect
  io.out <> splitBuffer.io.out
  io.vstd.get <> splitBuffer.io.vstd.get
}