/***************************************************************************************
  * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
  * Copyright (c) 2020-2021 Peng Cheng Laboratory
  *
  * XiangShan is licensed under Mulan PSL v2.
  * You can use this software according to the terms and conditions of the Mulan PSL v2.
  * You may obtain a copy of Mulan PSL v2 at:
  *          http://license.coscl.org.cn/MulanPSL2
  *
  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  *
  * See the Mulan PSL v2 for more details.
  ***************************************************************************************/

package xiangshan.mem

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.Bundles._
import xiangshan.mem._
import xiangshan.backend.fu.FuType
import freechips.rocketchip.diplomacy.BufferParams
import xiangshan.cache.mmu._
import xiangshan.cache._
import xiangshan.cache.wpu.ReplayCarry
import xiangshan.backend.fu.util.SdtrigExt
import xiangshan.ExceptionNO._
import xiangshan.backend.fu.vector.Bundles.VConfig
import xiangshan.backend.datapath.NewPipelineConnect
import xiangshan.backend.fu.vector.Utils.VecDataToMaskDataVec

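// Context of the in-flight segment instruction, latched when its first uop enqueues.
// It also records the information needed to report an exception (vstart, faulting vaddr/gpaddr, etc.).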
class VSegmentBundle(implicit p: Parameters) extends VLSUBundle
{
  val baseVaddr        = UInt(VAddrBits.W)
  val uop              = new DynInst
  val paddr            = UInt(PAddrBits.W)
  val mask             = UInt(VLEN.W)
  val alignedType      = UInt(alignTypeBits.W)
  val vl               = UInt(elemIdxBits.W)
  val uopFlowNum       = UInt(elemIdxBits.W)
  val uopFlowNumMask   = UInt(elemIdxBits.W)
  // for exception
  val vstart           = UInt(elemIdxBits.W)
  val exceptionVaddr   = UInt(VAddrBits.W)
  val exceptionGpaddr  = UInt(GPAddrBits.W)
  val exception_va     = Bool()
  val exception_gpa    = Bool()
  val exception_pa     = Bool()
  val exceptionVstart  = UInt(elemIdxBits.W)
  val exceptionVl      = UInt(elemIdxBits.W)
  val isFof            = Bool()
}

// latch each uop's VecWen, pdest, v0Wen, uopIdx
class VSegmentUop(implicit p: Parameters) extends VLSUBundle{
  val uop              = new DynInst
}

class VSegmentUnit (implicit p: Parameters) extends VLSUModule
  with HasDCacheParameters
  with MemoryOpConstants
  with SdtrigExt
  with HasLoadHelper
{
  val io               = IO(new VSegmentUnitIO)

  val maxSize          = VSegmentBufferSize

  class VSegUPtr(implicit p: Parameters) extends CircularQueuePtr[VSegUPtr](maxSize){
  }

  object VSegUPtr {
    def apply(f: Bool, v: UInt)(implicit p: Parameters): VSegUPtr = {
      val ptr           = Wire(new VSegUPtr)
      ptr.flag         := f
      ptr.value        := v
      ptr
    }
  }


  /**
  ********************************************************************************************************
  *  Use an example to illustrate the working logic of a segmentunit:                                    *
  *    For:                                                                                              *
  *      lmul=2 sew=32 emul=2 eew=32  vl=16                                                              *
  *    Then:                                                                                             *
  *      Access memory in the order:                                                                     *
  *        (V2,S0),(V4,S0),(V6,S0),(V8,S0),                                                              *
  *        (V2,S1),(V4,S1),(V6,S1),(V8,S1),                                                              *
  *        (V2,S2),(V4,S2),(V6,S2),(V8,S2),                                                              *
  *        (V2,S3),(V4,S3),(V6,S3),(V8,S3),                                                              *
  *        (V3,S4),(V5,S4),(V7,S4),(V9,S4),                                                              *
  *        (V3,S5),(V5,S5),(V7,S5),(V9,S5),                                                              *
  *        (V3,S6),(V5,S6),(V7,S6),(V9,S6),                                                              *
  *        (V3,S7),(V5,S7),(V7,S7),(V9,S7),                                                              *
  *                                                                                                      *
  *                                                                                                      *
  *    [[data]] saves the data generated by the access and corresponds to the register.                  *
  *    [[splitPtr]] controls the destination register written to.                                        *
  *                                                                                                      *
  *    splitPtr offset can be seen in [[splitPtrNext]]'s assignment logic,                               *
  *    which is mainly calculated in terms of [[fieldIdx]] and [[segmentIdx]]                            *
  *    First access different fields of the same segment, and then access different segments.            *
  *    For the case of 'emul' greater than 1, such as the following example,                             *
  *    although 'v2' and 'v3' are different vds of the same field, they are still different segments,    *
  *    so they should be accessed sequentially, just like the 'Access memory in the order' above.        *
  *                                                                                                      *
  *                         [[segmentIdx]]                                                               *
  *                               |                                                                      *
  *                               |                                                                      *
  *                               V                                                                      *
  *                                                                                                      *
  *                               S0               S1                S2                 S3               *
  *                      ----------------------------------------------------------------------------    *
  *  [[splitPtr]]--> v2  |     field0     |      field0     |      field0     |      field0         |    *
  *                      ----------------------------------------------------------------------------    *
  *                               S4               S5                S6                 S7               *
  *                      ----------------------------------------------------------------------------    *
  *                  v3  |     field0     |      field0     |      field0     |      field0         |    *
  *                      ----------------------------------------------------------------------------    *
  *                               S0               S1                S2                 S3               *
  *                      ----------------------------------------------------------------------------    *
  *                  v4  |     field1     |      field1     |      field1     |      field1         |    *
  *                      ----------------------------------------------------------------------------    *
  *                               S4               S5                S6                 S7               *
  *                      ----------------------------------------------------------------------------    *
  *                  v5  |     field1     |      field1     |      field1     |      field1         |    *
  *                      ----------------------------------------------------------------------------    *
  *                               S0               S1                S2                 S3               *
  *                      ----------------------------------------------------------------------------    *
  *                  v6  |     field2     |      field2     |      field2     |      field2         |    *
  *                      ----------------------------------------------------------------------------    *
  *                               S4               S5                S6                 S7               *
  *                      ----------------------------------------------------------------------------    *
  *                  v7  |     field2     |      field2     |      field2     |      field2         |    *
  *                      ----------------------------------------------------------------------------    *
  *                               S0               S1                S2                 S3               *
  *                      ----------------------------------------------------------------------------    *
  *                  v8  |     field3     |      field3     |      field3     |      field3         |    *
  *                      ----------------------------------------------------------------------------    *
  *                               S4               S5                S6                 S7               *
  *                      ----------------------------------------------------------------------------    *
  *                  v9  |     field3     |      field3     |      field3     |      field3         |    *
  *                      ----------------------------------------------------------------------------    *
  *                                                                                                      *
  *                                                                                                      *
  ********************************************************************************************************
  **/
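  // In the diagram above each field occupies 'emul' consecutive vd registers, so within one segment
  // [[splitPtr]] advances by splitPtrOffset (v2 -> v4 -> v6 -> v8); once the last field of a segment
  // has been accessed, it jumps back to deqPtr + ((segmentIdx + 1) >> issueUopFlowNumLog2), i.e. the
  // v2 row for the first uopFlowNum segments and the v3 row afterwards.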


  // buffer uop
  val instMicroOp       = Reg(new VSegmentBundle)
  val instMicroOpValid  = RegInit(false.B)
  val data              = Reg(Vec(maxSize, UInt(VLEN.W)))
  val uopq              = Reg(Vec(maxSize, new VSegmentUop))
  val stride            = Reg(Vec(maxSize, UInt(VLEN.W)))
  val allocated         = RegInit(VecInit(Seq.fill(maxSize)(false.B)))
  val enqPtr            = RegInit(0.U.asTypeOf(new VSegUPtr))
  val deqPtr            = RegInit(0.U.asTypeOf(new VSegUPtr))
  val stridePtr         = WireInit(0.U.asTypeOf(new VSegUPtr)) // for selecting stride/index

  val segmentIdx        = RegInit(0.U(elemIdxBits.W))
  val fieldIdx          = RegInit(0.U(fieldBits.W))
  val segmentOffset     = RegInit(0.U(VAddrBits.W))
  val splitPtr          = RegInit(0.U.asTypeOf(new VSegUPtr)) // for selecting load/store data
  val splitPtrNext      = WireInit(0.U.asTypeOf(new VSegUPtr))

  val exception_va      = WireInit(false.B)
  val exception_gpa     = WireInit(false.B)
  val exception_pa      = WireInit(false.B)

  val maxSegIdx         = instMicroOp.vl - 1.U
  val maxNfields        = instMicroOp.uop.vpu.nf
  val latchVaddr        = RegInit(0.U(VAddrBits.W))

  XSError((segmentIdx > maxSegIdx) && instMicroOpValid, s"segmentIdx > vl, something is wrong!\n")
  XSError((fieldIdx > maxNfields) &&  instMicroOpValid, s"fieldIdx > nfields, something is wrong!\n")

  // MicroOp
  val baseVaddr                       = instMicroOp.baseVaddr
  val alignedType                     = instMicroOp.alignedType
  val fuType                          = instMicroOp.uop.fuType
  val mask                            = instMicroOp.mask
  val exceptionVec                    = instMicroOp.uop.exceptionVec
  val issueEew                        = instMicroOp.uop.vpu.veew
  val issueLmul                       = instMicroOp.uop.vpu.vtype.vlmul
  val issueSew                        = instMicroOp.uop.vpu.vtype.vsew
  val issueEmul                       = EewLog2(issueEew) - issueSew + issueLmul // emul = (eew / sew) * lmul, computed in log2 form
  val elemIdxInVd                     = segmentIdx & instMicroOp.uopFlowNumMask
  val issueInstType                   = Cat(true.B, instMicroOp.uop.fuOpType(6, 5)) // always a segment instruction
  val issueUopFlowNumLog2             = GenRealFlowLog2(issueInstType, issueEmul, issueLmul, issueEew, issueSew, true) // max element number log2 in vd
  val issueVlMax                      = instMicroOp.uopFlowNum // max elementIdx in vd
  val issueMaxIdxInIndex              = GenVLMAX(Mux(issueEmul.asSInt > 0.S, 0.U, issueEmul), issueEew(1, 0)) // max number of index elements in one index register
  val issueMaxIdxInIndexMask          = GenVlMaxMask(issueMaxIdxInIndex, elemIdxBits)
  val issueMaxIdxInIndexLog2          = GenVLMAXLog2(Mux(issueEmul.asSInt > 0.S, 0.U, issueEmul), issueEew(1, 0))
  val issueIndexIdx                   = segmentIdx & issueMaxIdxInIndexMask
  val segmentActive                   = (mask & UIntToOH(segmentIdx)).orR

  // sbuffer write interface
  val sbufferOut                      = Wire(Decoupled(new DCacheWordReqWithVaddrAndPfFlag))

  // Segment instruction's FSM
  /*
  * s_idle: wait for a request
  * s_flush_sbuffer_req: flush the sbuffer
  * s_wait_flush_sbuffer_resp: wait until the sbuffer is empty
  * s_tlb_req: send the tlb request
  * s_wait_tlb_resp: wait for the tlb response
  * s_pm: check pmp
  * s_cache_req: send the dcache request
  * s_cache_resp: wait for the dcache response
  * s_latch_and_merge_data: latch and merge load data
  * s_send_data: send store data to the sbuffer
  * s_wait_to_sbuffer: wait for data from the sbufferOut pipeline stage to be sent to the sbuffer
  * s_finish: write back the uops and dequeue
  * */
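  // Typical flow: a load walks s_tlb_req -> s_wait_tlb_resp -> s_pm -> s_cache_req -> s_cache_resp ->
  // s_latch_and_merge_data and loops back to s_tlb_req until the last field of the last segment;
  // a store skips the dcache states and goes to s_send_data (and finally s_wait_to_sbuffer) instead.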
  val s_idle :: s_flush_sbuffer_req :: s_wait_flush_sbuffer_resp :: s_tlb_req :: s_wait_tlb_resp :: s_pm :: s_cache_req :: s_cache_resp :: s_latch_and_merge_data :: s_send_data :: s_wait_to_sbuffer :: s_finish :: Nil = Enum(12)
  val state             = RegInit(s_idle)
  val stateNext         = WireInit(s_idle)
  val sbufferEmpty      = io.flush_sbuffer.empty

  /**
   * state update
   */
  state  := stateNext

  /**
   * state transfer
   */
  when(state === s_idle){
    stateNext := Mux(isAfter(enqPtr, deqPtr), s_flush_sbuffer_req, s_idle)
  }.elsewhen(state === s_flush_sbuffer_req){
    stateNext := Mux(sbufferEmpty, s_tlb_req, s_wait_flush_sbuffer_resp) // if the sbuffer is already empty, query the tlb directly

  }.elsewhen(state === s_wait_flush_sbuffer_resp){
    stateNext := Mux(sbufferEmpty, s_tlb_req, s_wait_flush_sbuffer_resp)

  }.elsewhen(state === s_tlb_req){
    stateNext := Mux(segmentActive, s_wait_tlb_resp, Mux(FuType.isVLoad(instMicroOp.uop.fuType), s_latch_and_merge_data, s_send_data))

  }.elsewhen(state === s_wait_tlb_resp){
    stateNext := Mux(io.dtlb.resp.fire,
                      Mux(!io.dtlb.resp.bits.miss,
                          s_pm,
                          s_tlb_req),
                      s_wait_tlb_resp)

  }.elsewhen(state === s_pm){
    /* for a vector store, data is sent to the sbuffer, so there is no need to query the dcache */
    stateNext := Mux(exception_pa || exception_va || exception_gpa,
                     s_finish,
                     Mux(FuType.isVLoad(instMicroOp.uop.fuType), s_cache_req, s_send_data))

  }.elsewhen(state === s_cache_req){
    stateNext := Mux(io.rdcache.req.fire, s_cache_resp, s_cache_req)

  }.elsewhen(state === s_cache_resp){
    when(io.rdcache.resp.fire) {
      when(io.rdcache.resp.bits.miss || io.rdcache.s2_bank_conflict) {
        stateNext := s_cache_req
      }.otherwise {
        stateNext := Mux(FuType.isVLoad(instMicroOp.uop.fuType), s_latch_and_merge_data, s_send_data)
      }
    }.otherwise{
      stateNext := s_cache_resp
    }
    /* if the segment is inactive, there is no need to access all of its fields */
  }.elsewhen(state === s_latch_and_merge_data) {
    when((segmentIdx === maxSegIdx) && (fieldIdx === maxNfields) ||
      ((segmentIdx === maxSegIdx) && !segmentActive)) {

      stateNext := s_finish // segment instruction finishes
    }.otherwise {
      stateNext := s_tlb_req // need to continue
    }
    /* if the segment is inactive, there is no need to access all of its fields */
  }.elsewhen(state === s_send_data) { // wait until the sbuffer accepts the data
    when(!sbufferOut.fire && segmentActive) {
      stateNext := s_send_data
    }.elsewhen(segmentIdx === maxSegIdx && (fieldIdx === maxNfields && sbufferOut.fire || !segmentActive && io.sbuffer.valid && !io.sbuffer.ready)) {
      stateNext := s_wait_to_sbuffer
    }.elsewhen(segmentIdx === maxSegIdx && !segmentActive){
      stateNext := s_finish // segment instruction finishes
    }.otherwise {
      stateNext := s_tlb_req // need to continue
    }

  }.elsewhen(state === s_wait_to_sbuffer){
    stateNext := Mux(io.sbuffer.fire, s_finish, s_wait_to_sbuffer)

  }.elsewhen(state === s_finish){ // writeback uop
    stateNext := Mux(distanceBetween(enqPtr, deqPtr) === 0.U, s_idle, s_finish)

  }.otherwise{
    stateNext := s_idle
    XSError(true.B, s"Unknown state!\n")
  }

  /*************************************************************************
   *                            enqueue logic
   *************************************************************************/
  io.in.ready                         := true.B
  val fuOpType                         = io.in.bits.uop.fuOpType
  val vtype                            = io.in.bits.uop.vpu.vtype
  val mop                              = fuOpType(6, 5)
  val instType                         = Cat(true.B, mop)
  val eew                              = io.in.bits.uop.vpu.veew
  val sew                              = vtype.vsew
  val lmul                             = vtype.vlmul
  val emul                             = EewLog2(eew) - sew + lmul
  val vl                               = instMicroOp.vl
  val vm                               = instMicroOp.uop.vpu.vm
  val vstart                           = instMicroOp.uop.vpu.vstart
  val srcMask                          = GenFlowMask(Mux(vm, Fill(VLEN, 1.U(1.W)), io.in.bits.src_mask), vstart, vl, true)
  // on the first uop's enqueue, latch the micro-op of the segment instruction
  when(io.in.fire && !instMicroOpValid){
    // element number in a vd
    // TODO Rewrite it in a more elegant way.
    val uopFlowNum                    = ZeroExt(GenRealFlowNum(instType, emul, lmul, eew, sew, true), elemIdxBits)
    instMicroOp.baseVaddr             := io.in.bits.src_rs1(VAddrBits - 1, 0)
    instMicroOpValid                  := true.B // set by the first uop
    instMicroOp.alignedType           := Mux(isIndexed(instType), sew(1, 0), eew)
    instMicroOp.uop                   := io.in.bits.uop
    instMicroOp.mask                  := srcMask
    instMicroOp.vstart                := 0.U
    instMicroOp.uopFlowNum            := uopFlowNum
    instMicroOp.uopFlowNumMask        := GenVlMaxMask(uopFlowNum, elemIdxBits) // for merging data
    instMicroOp.vl                    := io.in.bits.src_vl.asTypeOf(VConfig()).vl
    segmentOffset                     := 0.U
    instMicroOp.isFof                 := (fuOpType === VlduType.vleff) && FuType.isVLoad(fuType)
  }
  // latch data
  when(io.in.fire){
    data(enqPtr.value)                := io.in.bits.src_vs3
    stride(enqPtr.value)              := io.in.bits.src_stride
    uopq(enqPtr.value).uop            := io.in.bits.uop
  }

  // update enqPtr, only one enqueue port
  when(io.in.fire){
    enqPtr                            := enqPtr + 1.U
  }

  /*************************************************************************
   *                            output logic
   *************************************************************************/

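  // Address generation: each element's vaddr is baseVaddr + (fieldIdx << alignedType) plus either the
  // accumulated segmentOffset (unit-stride / strided) or the offset read from the index register (indexed).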
  val indexStride                     = IndexAddr( // index for indexed instruction
                                                    index = stride(stridePtr.value),
                                                    flow_inner_idx = issueIndexIdx,
                                                    eew = issueEew
                                                  )
  val realSegmentOffset               = Mux(isIndexed(issueInstType),
                                            indexStride,
                                            segmentOffset)
  val vaddr                           = baseVaddr + (fieldIdx << alignedType).asUInt + realSegmentOffset

  // latch vaddr
  when(state === s_tlb_req){
    latchVaddr := vaddr
  }
  /**
   * tlb req and tlb resp
   */

  // query DTLB IO Assign
  io.dtlb.req                         := DontCare
  io.dtlb.resp.ready                  := true.B
  io.dtlb.req.valid                   := state === s_tlb_req && segmentActive
  io.dtlb.req.bits.cmd                := Mux(FuType.isVLoad(fuType), TlbCmd.read, TlbCmd.write)
  io.dtlb.req.bits.vaddr              := vaddr
  io.dtlb.req.bits.size               := instMicroOp.alignedType(2,0)
  io.dtlb.req.bits.memidx.is_ld       := FuType.isVLoad(fuType)
  io.dtlb.req.bits.memidx.is_st       := FuType.isVStore(fuType)
  io.dtlb.req.bits.debug.robIdx       := instMicroOp.uop.robIdx
  io.dtlb.req.bits.no_translate       := false.B
  io.dtlb.req.bits.debug.pc           := instMicroOp.uop.pc
  io.dtlb.req.bits.debug.isFirstIssue := DontCare
  io.dtlb.req_kill                    := false.B

  val canTriggerException              = segmentIdx === 0.U || !instMicroOp.isFof // only element 0, or a non fault-only-first access, can trigger an exception
  // tlb resp
  when(io.dtlb.resp.fire && state === s_wait_tlb_resp){
      exceptionVec(storePageFault)      := io.dtlb.resp.bits.excp(0).pf.st && canTriggerException
      exceptionVec(loadPageFault)       := io.dtlb.resp.bits.excp(0).pf.ld && canTriggerException
      exceptionVec(storeGuestPageFault) := io.dtlb.resp.bits.excp(0).gpf.st && canTriggerException
      exceptionVec(loadGuestPageFault)  := io.dtlb.resp.bits.excp(0).gpf.ld && canTriggerException
      exceptionVec(storeAccessFault)    := io.dtlb.resp.bits.excp(0).af.st && canTriggerException
      exceptionVec(loadAccessFault)     := io.dtlb.resp.bits.excp(0).af.ld && canTriggerException
      when(!io.dtlb.resp.bits.miss){
        instMicroOp.paddr             := io.dtlb.resp.bits.paddr(0)
        instMicroOp.exceptionGpaddr   := io.dtlb.resp.bits.gpaddr(0)
      }
  }
  // pmp
  // NOTE: only load/store exceptions are handled here; other exceptions are not sent from here
  val pmp = WireInit(io.pmpResp)
  when(state === s_pm) {
    val addr_aligned = LookupTree(Mux(isIndexed(issueInstType), issueSew(1, 0), issueEew(1, 0)), List(
      "b00".U   -> true.B,                   //b
      "b01".U   -> (vaddr(0)    === 0.U), //h
      "b10".U   -> (vaddr(1, 0) === 0.U), //w
      "b11".U   -> (vaddr(2, 0) === 0.U)  //d
    ))
    val missAligned = !addr_aligned
    exceptionVec(loadAddrMisaligned)  := missAligned && FuType.isVLoad(fuType)  && canTriggerException
    exceptionVec(storeAddrMisaligned) := missAligned && FuType.isVStore(fuType) && canTriggerException

    exception_va  := exceptionVec(storePageFault) || exceptionVec(loadPageFault) ||
      exceptionVec(storeAccessFault) || exceptionVec(loadAccessFault) || (missAligned && canTriggerException)
    exception_gpa := exceptionVec(storeGuestPageFault) || exceptionVec(loadGuestPageFault)
    exception_pa  := (pmp.st || pmp.ld || pmp.mmio) && canTriggerException

    instMicroOp.exception_pa  := exception_pa
    instMicroOp.exception_va  := exception_va
    instMicroOp.exception_gpa := exception_gpa
    // update the access-fault bits; vector MMIO is currently not supported
    exceptionVec(loadAccessFault)  := (exceptionVec(loadAccessFault) || pmp.ld || pmp.mmio) && canTriggerException
    exceptionVec(storeAccessFault) := (exceptionVec(storeAccessFault) || pmp.st || pmp.mmio) && canTriggerException

    when(exception_va || exception_gpa || exception_pa) {
      when(canTriggerException) {
        instMicroOp.exceptionVaddr  := vaddr
        instMicroOp.exceptionVl     := segmentIdx // for exception
        instMicroOp.exceptionVstart := segmentIdx // for exception
      }.otherwise {
        instMicroOp.exceptionVl     := segmentIdx
      }
    }
  }

  /**
   * flush sbuffer IO Assign
   */
  io.flush_sbuffer.valid           := !sbufferEmpty && (state === s_flush_sbuffer_req)


  /**
   * merge data for load
   */
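  // Select the load data from the 128-bit dcache response: latchVaddr(3) picks the lower or upper
  // 64-bit half, and latchVaddr(2,0) gives the byte offset at which the element starts.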
  val cacheData = LookupTree(latchVaddr(3,0), List(
    "b0000".U -> io.rdcache.resp.bits.data_delayed(63,    0),
    "b0001".U -> io.rdcache.resp.bits.data_delayed(63,    8),
    "b0010".U -> io.rdcache.resp.bits.data_delayed(63,   16),
    "b0011".U -> io.rdcache.resp.bits.data_delayed(63,   24),
    "b0100".U -> io.rdcache.resp.bits.data_delayed(63,   32),
    "b0101".U -> io.rdcache.resp.bits.data_delayed(63,   40),
    "b0110".U -> io.rdcache.resp.bits.data_delayed(63,   48),
    "b0111".U -> io.rdcache.resp.bits.data_delayed(63,   56),
    "b1000".U -> io.rdcache.resp.bits.data_delayed(127,  64),
    "b1001".U -> io.rdcache.resp.bits.data_delayed(127,  72),
    "b1010".U -> io.rdcache.resp.bits.data_delayed(127,  80),
    "b1011".U -> io.rdcache.resp.bits.data_delayed(127,  88),
    "b1100".U -> io.rdcache.resp.bits.data_delayed(127,  96),
    "b1101".U -> io.rdcache.resp.bits.data_delayed(127, 104),
    "b1110".U -> io.rdcache.resp.bits.data_delayed(127, 112),
    "b1111".U -> io.rdcache.resp.bits.data_delayed(127, 120)
  ))
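  // pickData selects the element's bytes according to alignedType (the element width), and mergedData
  // writes them into slot elemIdxInVd of the vd data held at splitPtr.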
  val pickData  = rdataVecHelper(alignedType(1,0), cacheData)
  val mergedData = mergeDataWithElemIdx(
    oldData = data(splitPtr.value),
    newData = Seq(pickData),
    alignedType = alignedType(1,0),
    elemIdx = Seq(elemIdxInVd),
    valids = Seq(true.B)
  )
  when(state === s_latch_and_merge_data && segmentActive){
    data(splitPtr.value) := mergedData
  }
  /**
   * split data for store
   * */
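  // A rough sketch of the store path below: splitData picks the element at elemIdxInVd out of the vd,
  // genVWdata aligns it to the byte lanes of the store word, and wmask is the byte-enable at latchVaddr,
  // forced to all-zero when the segment is inactive.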
  val splitData = genVSData(
    data = data(splitPtr.value),
    elemIdx = elemIdxInVd,
    alignedType = alignedType
  )
  val flowData  = genVWdata(splitData, alignedType) // TODO: connect vstd, pass vector data
  val wmask     = genVWmask(latchVaddr, alignedType(1, 0)) & Fill(VLENB, segmentActive)

  /**
   * rdcache req; a store does not need to query the dcache, because its elements are written to the sbuffer
   */
  io.rdcache.req                    := DontCare
  io.rdcache.req.valid              := state === s_cache_req && FuType.isVLoad(fuType)
  io.rdcache.req.bits.cmd           := MemoryOpConstants.M_XRD
  io.rdcache.req.bits.vaddr         := latchVaddr
  io.rdcache.req.bits.mask          := mask
  io.rdcache.req.bits.data          := flowData
  io.rdcache.pf_source              := LOAD_SOURCE.U
  io.rdcache.req.bits.id            := DontCare
  io.rdcache.resp.ready             := true.B
  io.rdcache.s1_paddr_dup_lsu       := instMicroOp.paddr
  io.rdcache.s1_paddr_dup_dcache    := instMicroOp.paddr
  io.rdcache.s1_kill                := false.B
  io.rdcache.s1_kill_data_read      := false.B
  io.rdcache.s2_kill                := false.B
  if (env.FPGAPlatform){
    io.rdcache.s0_pc                := DontCare
    io.rdcache.s1_pc                := DontCare
    io.rdcache.s2_pc                := DontCare
  }else{
    io.rdcache.s0_pc                := instMicroOp.uop.pc
    io.rdcache.s1_pc                := instMicroOp.uop.pc
    io.rdcache.s2_pc                := instMicroOp.uop.pc
  }
  io.rdcache.replacementUpdated     := false.B
  io.rdcache.is128Req               := false.B


  /**
   * write data to sbuffer
   * */
  sbufferOut.bits                  := DontCare
  sbufferOut.valid                 := state === s_send_data && segmentActive
  sbufferOut.bits.vecValid         := state === s_send_data && segmentActive
  sbufferOut.bits.mask             := wmask
  sbufferOut.bits.data             := flowData
  sbufferOut.bits.vaddr            := latchVaddr
  sbufferOut.bits.cmd              := MemoryOpConstants.M_XWR
  sbufferOut.bits.id               := DontCare
  sbufferOut.bits.addr             := instMicroOp.paddr

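  // sbufferOut reaches io.sbuffer through one pipeline register stage (NewPipelineConnect below);
  // s_wait_to_sbuffer therefore waits for io.sbuffer.fire after the last element has left sbufferOut.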
  NewPipelineConnect(
    sbufferOut, io.sbuffer, io.sbuffer.fire,
    false.B,
    Option(s"VSegmentUnitPipelineConnect")
  )

  io.vecDifftestInfo.valid         := io.sbuffer.valid
  io.vecDifftestInfo.bits          := uopq(deqPtr.value).uop

  /**
   * update ptr
   * */
  private val fieldActiveWirteFinish = sbufferOut.fire && segmentActive // the write of an active segment's field has finished
  XSError(sbufferOut.fire && !segmentActive, "Attempted to write an inactive segment to the sbuffer, something is wrong!\n")

  private val segmentInactiveFinish = ((state === s_latch_and_merge_data) || (state === s_send_data)) && !segmentActive

  val splitPtrOffset = Mux(
    isIndexed(instType),
    Mux(lmul.asSInt < 0.S, 1.U, (1.U << lmul).asUInt),
    Mux(emul.asSInt < 0.S, 1.U, (1.U << emul).asUInt)
  )
  splitPtrNext :=
    Mux(fieldIdx === maxNfields || !segmentActive, // if the segment is active, complete it first; otherwise jump to the next segment
      // segment finished: shift right by 'issueUopFlowNumLog2' so that emul > 1 still generates the correct lateral offset
     (deqPtr + ((segmentIdx +& 1.U) >> issueUopFlowNumLog2).asUInt),
      // next field
     (splitPtr + splitPtrOffset)
    )
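  // For the example in the header comment (emul = 2), splitPtrOffset is 2, so within one segment the
  // destination steps v2 -> v4 -> v6 -> v8; after the last field of a segment, the pointer returns to
  // deqPtr + ((segmentIdx + 1) >> issueUopFlowNumLog2), selecting the next row of vd registers once
  // uopFlowNum segments have been completed.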

  dontTouch(issueUopFlowNumLog2)
  dontTouch(issueEmul)
  dontTouch(splitPtrNext)
  dontTouch(stridePtr)
  dontTouch(segmentActive)

  // update splitPtr
  when(state === s_latch_and_merge_data || (state === s_send_data && (fieldActiveWirteFinish || !segmentActive))){
    splitPtr := splitPtrNext
  }.elsewhen(io.in.fire && !instMicroOpValid){
    splitPtr := deqPtr // initial splitPtr
  }

  // update stridePtr, only used by indexed accesses
  val strideOffset = Mux(isIndexed(issueInstType), segmentIdx >> issueMaxIdxInIndexLog2, 0.U)
  stridePtr       := deqPtr + strideOffset

  // update fieldIdx
  when(io.in.fire && !instMicroOpValid){ // init
    fieldIdx := 0.U
  }.elsewhen(state === s_latch_and_merge_data && segmentActive ||
            (state === s_send_data && fieldActiveWirteFinish)){ // only if the segment is active

    /* advance to the next field, wrapping to 0 once the segment is complete */
    fieldIdx := Mux(fieldIdx === maxNfields, 0.U, fieldIdx + 1.U)
  }.elsewhen(segmentInactiveFinish){ // segment is inactive, go to the next segment
    fieldIdx := 0.U
  }
  // update segmentIdx
  when(io.in.fire && !instMicroOpValid){
    segmentIdx := 0.U
  }.elsewhen(fieldIdx === maxNfields && (state === s_latch_and_merge_data || (state === s_send_data && fieldActiveWirteFinish)) &&
             segmentIdx =/= maxSegIdx){ // next segment, only if the segment is active

    segmentIdx := segmentIdx + 1.U
  }.elsewhen(segmentInactiveFinish && segmentIdx =/= maxSegIdx){ // if the segment is inactive, go to the next segment
    segmentIdx := segmentIdx + 1.U
  }

  // update segmentOffset
  /* advance segmentOffset whenever a segment finishes, whether it was active or inactive */
  when((fieldIdx === maxNfields && (state === s_latch_and_merge_data || (state === s_send_data && fieldActiveWirteFinish))) ||
       segmentInactiveFinish){

    segmentOffset := segmentOffset + Mux(isUnitStride(issueInstType), (maxNfields +& 1.U) << issueEew(1, 0), stride(stridePtr.value))
  }

  // update deqPtr
  when((state === s_finish) && !isEmpty(enqPtr, deqPtr)){
    deqPtr := deqPtr + 1.U
  }

  /*************************************************************************
   *                            dequeue logic
   *************************************************************************/
  val vdIdxInField = GenUopIdxInField(Mux(isIndexed(instType), issueLmul, issueEmul), uopq(deqPtr.value).uop.vpu.vuopIdx)
  /* select the mask of the vd being written back; may be removed in the future */
  val realEw        = Mux(isIndexed(issueInstType), issueSew(1, 0), issueEew(1, 0))
  val maskDataVec: Vec[UInt] = VecDataToMaskDataVec(instMicroOp.mask, realEw)
  val maskUsed      = maskDataVec(vdIdxInField)

  when(stateNext === s_idle){
    instMicroOpValid := false.B
  }
  // writeback to backend
  val writebackOut                     = WireInit(io.uopwriteback.bits)
  val writebackValid                   = (state === s_finish) && !isEmpty(enqPtr, deqPtr)
  writebackOut.uop                    := uopq(deqPtr.value).uop
  writebackOut.uop.vpu                := instMicroOp.uop.vpu
  writebackOut.uop.exceptionVec       := instMicroOp.uop.exceptionVec
  writebackOut.mask.get               := instMicroOp.mask
  writebackOut.data                   := data(deqPtr.value)
  writebackOut.vdIdx.get              := vdIdxInField
  writebackOut.uop.vpu.vl             := instMicroOp.vl
  writebackOut.uop.vpu.vstart         := instMicroOp.vstart
  writebackOut.uop.vpu.vmask          := maskUsed
  writebackOut.uop.vpu.vuopIdx        := uopq(deqPtr.value).uop.vpu.vuopIdx
  writebackOut.debug                  := DontCare
  writebackOut.vdIdxInField.get       := vdIdxInField
  writebackOut.uop.robIdx             := instMicroOp.uop.robIdx
  writebackOut.uop.fuOpType           := instMicroOp.uop.fuOpType

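  // The writeback handshake is registered for timing: valid is RegNext of writebackValid and the
  // payload is captured with RegEnable, so the backend sees each uop one cycle after s_finish.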
  io.uopwriteback.valid               := RegNext(writebackValid)
  io.uopwriteback.bits                := RegEnable(writebackOut, writebackValid)

  dontTouch(writebackValid)

  // to RS
  val feedbackOut                      = WireInit(0.U.asTypeOf(io.feedback.bits))
  val feedbackValid                    = state === s_finish && !isEmpty(enqPtr, deqPtr)
  feedbackOut.hit                     := true.B
  feedbackOut.robIdx                  := instMicroOp.uop.robIdx
  feedbackOut.sourceType              := DontCare
  feedbackOut.flushState              := DontCare
  feedbackOut.dataInvalidSqIdx        := DontCare
  feedbackOut.sqIdx                   := uopq(deqPtr.value).uop.sqIdx
  feedbackOut.lqIdx                   := uopq(deqPtr.value).uop.lqIdx

  io.feedback.valid                   := RegNext(feedbackValid)
  io.feedback.bits                    := RegEnable(feedbackOut, feedbackValid)

  dontTouch(feedbackValid)

  // exception
  io.exceptionInfo                    := DontCare
  io.exceptionInfo.bits.robidx        := instMicroOp.uop.robIdx
  io.exceptionInfo.bits.uopidx        := uopq(deqPtr.value).uop.vpu.vuopIdx
  io.exceptionInfo.bits.vstart        := instMicroOp.exceptionVstart
  io.exceptionInfo.bits.vaddr         := instMicroOp.exceptionVaddr
  io.exceptionInfo.bits.gpaddr        := instMicroOp.exceptionGpaddr
  io.exceptionInfo.bits.vl            := instMicroOp.exceptionVl
  io.exceptionInfo.valid              := (state === s_finish) && instMicroOp.uop.exceptionVec.asUInt.orR && !isEmpty(enqPtr, deqPtr)
}

