// xref: /XiangShan/src/main/scala/xiangshan/backend/fu/SRT4Divider.scala (revision a58e33519795596dc4f85fe66907cbc7dde2d66a)
/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

// The "SRT4DividerDataModule" in this file is a Scala rewrite of the SRT4 divider by Yifei He, see
// https://github.com/OpenXiangShan/XS-Verilog-Library/tree/main/int_div_radix_4_v1
// Email of original author: [email protected]

package xiangshan.backend.fu

import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils.SignExt
import xiangshan.backend.fu.util.CSA3_2

/** A Radix-4 SRT Integer Divider
  *
  * 2 ~ (5 + (len+3)/2) cycles are needed for each division.
  *
  * Ports:
  *  - `src(0)` is the dividend (`a`), `src(1)` the divisor (`d`).
  *  - `valid` / `in_ready` form the request handshake; `in_ready` is high only in `s_idle`.
  *  - `sign` selects signed operation (operand MSBs are then treated as sign bits).
  *  - `kill_w` prevents an accepted request from leaving `s_idle`;
  *    `kill_r` forces the state machine back to `s_idle` from any state.
  *  - `isHi` selects the remainder (`rFinal`) instead of the quotient (`qFinal`) as result.
  *  - `isW` sign-extends the low 32 bits of the selected result to `len` bits.
  *  - `out_valid` / `out_ready` form the response handshake; `out_valid` is high in `s_finish`.
  *
  * The controller is a one-hot state machine:
  *  - `s_idle`   : sample operands and take their absolute values.
  *  - `s_pre_0`  : count leading zeros of both operands and normalise (left shift) them.
  *  - `s_pre_1`  : set up the initial remainder, the first quotient digit and the iteration count;
  *                 iteration is skipped when `dIsZero | aTooSmall | noIter`.
  *  - `s_iter`   : one radix-4 step per cycle (carry-save remainder update, digit selection,
  *                 on-the-fly quotient conversion).
  *  - `s_post_0` : resolve the carry-save remainder (`rNext`, `rNextPd`) and negate the quotient
  *                 candidates when `quotSignReg` is set.
  *  - `s_post_1` : pick the (possibly corrected) remainder and shift it right.
  *  - `s_finish` : hold the result until `out_fire`.
  *
  * @param len operand width in bits; `log2Up(len)` must be 6
  */
class SRT4DividerDataModule(len: Int) extends Module {
  val io = IO(new Bundle() {
    val src = Vec(2, Input(UInt(len.W)))
    val valid, sign, kill_w, kill_r, isHi, isW = Input(Bool())
    val in_ready = Output(Bool())
    val out_valid = Output(Bool())
    val out_data = Output(UInt(len.W))
    val out_ready = Input(Bool())
  })

  // consts
  val lzc_width = log2Up(len)
  val itn_len = 1 + len + 2 + 1 // width of the iteration (remainder) datapath
  require(lzc_width == 6, s"SRT4DividerDataModule requires log2Up(len) == 6, got $lzc_width (len = $len)")

  val (a, d, sign, valid, kill_w, kill_r, isHi, isW) =
    (io.src(0), io.src(1), io.sign, io.valid, io.kill_w, io.kill_r, io.isHi, io.isW)
  val in_fire = valid && io.in_ready
  val out_fire = io.out_ready && io.out_valid
  val newReq = in_fire
  val startHandShake = io.in_ready && valid
  val s_idle :: s_pre_0 :: s_pre_1 :: s_iter :: s_post_0 :: s_post_1 :: s_finish :: Nil = Enum(7)

  // One-hot encoded state register: bit `s_x` is set while the FSM is in state `s_x`.
  val state = RegInit(UIntToOH(s_idle, 7))

  // Bit positions of the one-hot encoded quotient digit (-2, -1, 0, +1, +2).
  val quot_neg_2 :: quot_neg_1 :: quot_0 :: quot_pos_1 :: quot_pos_2 :: Nil = Enum(5)

  val finished = state(s_finish)

  // reused wire declarations
  val aIsZero = Wire(Bool())
  val dIsZero = Wire(Bool())
  val aTooSmall = Wire(Bool()) // this is output of reg!
  val noIter = Wire(Bool()) // this is output of reg!
  val finalIter = Wire(Bool())
  val aLZC = Wire(UInt((lzc_width + 1).W))
  val dLZC = Wire(UInt((lzc_width + 1).W))
  val aNormAbs = Wire(UInt((len + 1).W))
  val dNormAbs = Wire(UInt((len + 1).W))
  val aInverter = Wire(UInt(len.W)) // results of global inverter
  val dInverter = Wire(UInt(len.W))

  val rPreShifted = Wire(UInt((len + 1).W))

  val quotIter = Wire(UInt(len.W))
  val quotM1Iter = Wire(UInt(len.W))
  val qIterEnd = Wire(UInt(5.W))

  val rNext = Wire(UInt(itn_len.W))
  val rNextPd = Wire(UInt(itn_len.W)) // non-redundant remainder plus d, 68, 67
  //reused ctrl regs

  //reused other regs
  // These two registers are time-multiplexed: |a| / |d| after s_idle, the normalised operands
  // after s_pre_0, and the resolved remainder / remainder+d after s_post_0.
  val aNormAbsReg = RegEnable(aNormAbs, startHandShake | state(s_pre_0) | state(s_post_0)) // reg for normalized a & d and rem & rem+d
  val dNormAbsReg = RegEnable(dNormAbs, startHandShake | state(s_pre_0) | state(s_post_0))
  // Quotient and quotient-minus-one accumulators.
  val quotIterReg = RegEnable(quotIter, state(s_pre_1) | state(s_iter) | state(s_post_0))
  val quotM1IterReg = RegEnable(quotM1Iter, state(s_pre_1) | state(s_iter) | state(s_post_0))

  // State transitions. kill_r has priority over everything else.
  when(kill_r) {
    state := UIntToOH(s_idle, 7)
  } .elsewhen(state(s_idle) && in_fire && !kill_w) {
    state := UIntToOH(s_pre_0, 7)
  } .elsewhen(state(s_pre_0)) { // leading zero detection
    state := UIntToOH(s_pre_1, 7)
  } .elsewhen(state(s_pre_1)) { // shift a/b
    state := Mux(dIsZero | aTooSmall | noIter, UIntToOH(s_post_0, 7), UIntToOH(s_iter, 7))
  } .elsewhen(state(s_iter)) { // (ws[j+1], wc[j+1]) = 4(ws[j],wc[j]) - q(j+1)*d
    state := Mux(finalIter, UIntToOH(s_post_0, 7), UIntToOH(s_iter, 7))
  } .elsewhen(state(s_post_0)) { // if rem < 0, rem = rem + d
    state := UIntToOH(s_post_1, 7)
  } .elsewhen(state(s_post_1)) {
    state := UIntToOH(s_finish, 7)
  } .elsewhen(state(s_finish) && out_fire) {
    state := UIntToOH(s_idle, 7)
  } .otherwise {
    state := state
  }

  // First cycle:
  // State is idle, we gain absolute value of a and b, using global inverter

  io.in_ready := state(s_idle)

  // The two negators are shared: in s_idle they negate the operands, in every other state
  // they negate the quotient / quotient-minus-one accumulators (used in s_post_0).
  aInverter := -Mux(state(s_idle), a, quotIterReg) // 64, 0
  dInverter := -Mux(state(s_idle), d, quotM1IterReg) // 64, 0

  val aSign = io.sign && a(len - 1) // 1
  val dSign = io.sign && d(len - 1)

  val aAbs = Mux(aSign, aInverter, a) // 64, 0
  val dAbs = Mux(dSign, dInverter, d)
  // Normalisation: left shift by the leading-zero count computed in the same cycle (s_pre_0).
  val aNorm = (aNormAbsReg(len - 1, 0) << aLZC(lzc_width - 1, 0))(len - 1, 0) // 64, 65
  val dNorm = (dNormAbsReg(len - 1, 0) << dLZC(lzc_width - 1, 0))(len - 1, 0)

  aNormAbs := Mux1H(Seq(
    state(s_idle) -> Cat(0.U(1.W), aAbs), // 65, 0
    state(s_pre_0) -> Cat(0.U(1.W), aNorm), // 65, 0
    state(s_post_0) -> rNext(len + 3, 3) // remainder 65, 64. highest is sign bit
  ))
  dNormAbs := Mux1H(Seq(
    state(s_idle) -> Cat(0.U(1.W), dAbs),
    state(s_pre_0) -> Cat(0.U(1.W), dNorm),
    state(s_post_0) -> rNextPd(len + 3, 3)
  ))

  // Second cycle, state is pre_0
  // calculate lzc and move div* and lzc diff check if no_iter_needed

  aLZC := PriorityEncoder(aNormAbsReg(len - 1, 0).asBools().reverse)
  dLZC := PriorityEncoder(dNormAbsReg(len - 1, 0).asBools().reverse)
  val aLZCReg = RegEnable(aLZC, state(s_pre_0)) // 7, 0
  val dLZCReg = RegEnable(dLZC, state(s_pre_0))



  val lzcWireDiff = Cat(0.U(1.W), dLZC(lzc_width - 1, 0)) - Cat(0.U(1.W), aLZC(lzc_width - 1, 0)) // 7, 0
  val lzcRegDiff = Cat(0.U(1.W), dLZCReg(lzc_width - 1, 0)) - Cat(0.U(1.W), aLZCReg(lzc_width - 1, 0))
  // The registered LZCs are only valid after s_pre_0, so the wire version is used in that state.
  val lzcDiff = Mux(state(s_pre_0), lzcWireDiff, lzcRegDiff)
  // NOTE(review): chisel3.util.PriorityEncoder over `len` inputs appears to yield a
  // log2Ceil(len)-bit result; if so, bit `lzc_width` of aLZC / dLZC is always 0 and
  // aIsZero / dIsZero below can never assert. Confirm against the Chisel version in use.
  aIsZero := aLZC(lzc_width) // this is state pre_0
  dIsZero := dLZCReg(lzc_width) // this is pre_1 and all stages after
  val dIsOne = dLZC(lzc_width - 1, 0).andR() // this is pre_0
  val noIterReg = RegEnable(dIsOne & aNormAbsReg(len - 1), state(s_pre_0)) // This means dividend has lzc 0 so iter is 17
  noIter := noIterReg
  val aTooSmallReg = RegEnable(aIsZero | lzcDiff(lzc_width), state(s_pre_0)) // a is zero or a smaller than d
  aTooSmall := aTooSmallReg

  // In s_idle the quotient sign is the XOR of the operand signs. Outside s_idle this value is
  // only sampled when state is s_pre_1 and dIsZero (see the enable of quotSignReg below), in
  // which case it is forced to true.
  val quotSign = Mux(state(s_idle), aSign ^ dSign, true.B)
  val rSign = aSign
  val quotSignReg = RegEnable(quotSign, startHandShake | (state(s_pre_1) & dIsZero))
  val rSignReg = RegEnable(rSign, startHandShake)

  val rShift = lzcDiff(0) // odd lzc diff, for SRT4
  val rightShifted = Wire(UInt(len.W))
  val rSumInit = Mux(aTooSmallReg | aIsZero, Cat(0.U(1.W), rightShifted, 0.U(3.W)), // right shift the dividend (which is already l-shifted)
                    Mux(noIterReg, 0.U(itn_len.W), //
                      Cat(0.U(3.W),
                          Mux(rShift, Cat(0.U(1.W), aNormAbsReg(len - 1, 0)), Cat(aNormAbsReg(len - 1, 0), 0.U(1.W)))
                        ) // Normal init value. 68, 67; For even lzcDiff, 0.001xxx0; for odd lzcDiff 0.0001xxx
                      )
                    ) // state is s_pre_1
  val rCarryInit = 0.U(itn_len.W)

  // The right shifter is shared between s_pre_1 (undo the dividend normalisation) and the
  // post-processing (shift the final remainder; the shift amount is 0 when aTooSmallReg | dIsZero).
  val rightShifter = Module(new RightShifter(len, lzc_width))
  rightShifter.io.in := Mux(state(s_pre_1), aNormAbsReg(len - 1, 0), rPreShifted(len - 1, 0))
  rightShifter.io.shiftNum := Mux(state(s_pre_1), aLZCReg,
                                  Mux(aTooSmallReg | dIsZero, 0.U(lzc_width.W), dLZCReg))
  rightShifter.io.msb := state(s_post_1) & rSignReg & rPreShifted(len)
  rightShifted := rightShifter.io.out

  // obtaining 1st quotient
  val rSumInitTrunc = Cat(0.U(1.W), rSumInit(itn_len - 4, itn_len - 4 - 4 + 1)) // 0.00___
  // Thresholds for the first digit, indexed by the three divisor bits below the MSB.
  val mInitPos1 = MuxLookup(dNormAbsReg(len - 2, len - 2 - 3 + 1), "b00100".U(5.W),
    Array(
      0.U -> "b00100".U(5.W),
      1.U -> "b00100".U(5.W),
      2.U -> "b00100".U(5.W),
      3.U -> "b00110".U(5.W),
      4.U -> "b00110".U(5.W),
      5.U -> "b00110".U(5.W),
      6.U -> "b00110".U(5.W),
      7.U -> "b01000".U(5.W),
    )
  )
  val mInitPos2 = MuxLookup(dNormAbsReg(len - 2, len - 2 - 3 + 1), "b01100".U(5.W),
    Array(
      0.U -> "b01100".U(5.W),
      1.U -> "b01110".U(5.W),
      2.U -> "b01111".U(5.W),
      3.U -> "b10000".U(5.W),
      4.U -> "b10010".U(5.W),
      5.U -> "b10100".U(5.W),
      6.U -> "b10110".U(5.W),
      7.U -> "b10110".U(5.W),
    )
  )
  val initCmpPos1 = rSumInitTrunc >= mInitPos1
  val initCmpPos2 = rSumInitTrunc >= mInitPos2
  // The first digit can only be 0, +1 or +2.
  val qInit = Mux(initCmpPos2, UIntToOH(quot_pos_2, 5), Mux(initCmpPos1, UIntToOH(quot_pos_1, 5), UIntToOH(quot_0, 5)))
  val qPrev = Mux(state(s_pre_1), qInit, qIterEnd)
  val qPrevReg = RegEnable(qPrev, state(s_pre_1) | state(s_iter))
  val specialDivisorReg = RegEnable(dNormAbsReg(len - 2, len - 2 - 3 + 1) === 0.U, state(s_pre_1)) // d=0.1000xxx

  // rCarry and rSum in Iteration
  // -q*d for the digit selected in the previous cycle. Positive digits use the one's
  // complement; the missing +1 is injected as the LSB of rCarryIter below.
  val qXd = Mux1H(Seq(
    qPrevReg(quot_neg_2) -> Cat(dNormAbsReg(len - 1, 0), 0.U(4.W)), // 68, 67 1.xxxxx0000
    qPrevReg(quot_neg_1) -> Cat(0.U(1.W), dNormAbsReg(len - 1, 0), 0.U(3.W)), // 0.1xxxxx000
    qPrevReg(quot_0)     -> 0.U(itn_len.W),
    qPrevReg(quot_pos_1) -> ~Cat(0.U(1.W), dNormAbsReg(len - 1, 0), 0.U(3.W)), // don't forget to plus 1 later
    qPrevReg(quot_pos_2) -> ~Cat(dNormAbsReg(len - 1, 0), 0.U(4.W))  // don't forget to plus 1 later
  ))
  val csa = Module(new CSA3_2(itn_len))

  val rSumIter = csa.io.out(0)
  val rCarryIter = Cat(csa.io.out(1)(itn_len - 2, 0), qPrevReg(quot_pos_1) | qPrevReg(quot_pos_2))
  val rSumReg = RegEnable(Mux(state(s_pre_1), rSumInit, rSumIter), state(s_pre_1) | state(s_iter)) // 68, 67
  val rCarryReg = RegEnable(Mux(state(s_pre_1), rCarryInit, rCarryIter), state(s_pre_1) | state(s_iter))
  csa.io.in(0) := rSumReg << 2
  csa.io.in(1) := rCarryReg << 2
  csa.io.in(2) := qXd

  val qds = Module(new SRT4QDS(len, itn_len))
  qds.io.remSum := rSumReg
  qds.io.remCarry := rCarryReg
  qds.io.d := dNormAbsReg(len - 1, 0) // Maybe optimize here to lower power consumption?
  qds.io.specialDivisor := specialDivisorReg
  qds.io.qPrev := qPrevReg
  qIterEnd := qds.io.qIterEnd

  //on the fly conversion
  val quotIterNext = Wire(UInt(len.W))
  val quotIterM1Next = Wire(UInt(len.W))
  quotIterNext := Mux1H(Seq(
    qPrevReg(quot_pos_2) -> (quotIterReg << 2 | "b10".U),
    qPrevReg(quot_pos_1) -> (quotIterReg << 2 | "b01".U),
    qPrevReg(quot_0)     -> (quotIterReg << 2 | "b00".U),
    qPrevReg(quot_neg_1) -> (quotM1IterReg << 2 | "b11".U),
    qPrevReg(quot_neg_2) -> (quotM1IterReg << 2 | "b10".U)
  ))
  quotIterM1Next := Mux1H(Seq(
    qPrevReg(quot_pos_2) -> (quotIterReg << 2 | "b01".U),
    qPrevReg(quot_pos_1) -> (quotIterReg << 2 | "b00".U),
    qPrevReg(quot_0)     -> (quotM1IterReg << 2 | "b11".U),
    qPrevReg(quot_neg_1) -> (quotM1IterReg << 2 | "b10".U),
    qPrevReg(quot_neg_2) -> (quotM1IterReg << 2 | "b01".U)
  ))


  // Next value of the quotient accumulator:
  //  - s_pre_1 : all ones when dIsZero, the dividend when noIterReg, otherwise 0;
  //  - s_iter  : on-the-fly converted value;
  //  - else    : (used in s_post_0) negated accumulator when quotSignReg is set.
  quotIter := Mux(state(s_pre_1),
                      Mux(dIsZero, VecInit(Seq.fill(len)(true.B)).asUInt,
                        Mux(noIterReg, aNormAbsReg(len - 1, 0), 0.U(len.W))),
                      Mux(state(s_iter), quotIterNext,
                        Mux(quotSignReg, aInverter, quotIterReg)))
  quotM1Iter := Mux(state(s_pre_1),
                        0.U(len.W), Mux(state(s_iter), quotIterM1Next,
                          Mux(quotSignReg, dInverter, quotM1IterReg)))


  // iter num
  // Down-counter loaded in s_pre_1 with ceil(lzcDiff / 2); the iteration ends when it reads 0.
  val iterNum = Wire(UInt((lzc_width - 1).W))
  val iterNumReg = RegEnable(iterNum, state(s_pre_1) | state(s_iter))

  iterNum := Mux(state(s_pre_1), lzcDiff(lzc_width - 1, 1) +% lzcDiff(0), iterNumReg -% 1.U)
  finalIter := iterNumReg === 0.U

  // Post Process

  // Resolve the carry-save remainder. For a negative dividend the negated remainder is
  // produced directly (one's complements plus the constant correction).
  when(rSignReg) {
    rNext := ~rSumReg + ~rCarryReg + 2.U
    rNextPd := ~rSumReg + ~rCarryReg + ~Cat(0.U(1.W), dNormAbsReg(len - 1, 0), 0.U(3.W)) + 3.U
  } .otherwise {
    rNext := rSumReg + rCarryReg
    rNextPd := rSumReg + rCarryReg + Cat(0.U(1.W), dNormAbsReg(len - 1, 0), 0.U(3.W))
  }

  // After s_post_0 the operand registers hold the remainder and the remainder plus d.
  val r = aNormAbsReg
  val rPd = dNormAbsReg
  val rIsZero = ~(r.orR())
  val needCorr = (~dIsZero & ~noIterReg) & Mux(rSignReg, ~r(len) & ~rIsZero, r(len)) // when we get pos rem for d<0 or neg rem for d>0
  rPreShifted := Mux(needCorr, rPd, r)
  val rFinal = RegEnable(rightShifted, state(s_post_1))// right shifted remainder. shift by the number of bits divisor is shifted
  val qFinal = Mux(needCorr, quotM1IterReg, quotIterReg)

  val res = Mux(isHi, rFinal, qFinal)
  io.out_data := Mux(isW,
    SignExt(res(31, 0), len),
    res
  )
  io.out_valid := state(s_finish) // state === s_finish
}

/** Logarithmic (barrel) right shifter that fills the vacated high bits with `msb`.
  *
  * Stage `k` shifts by `2^k` positions when bit `k` of `shiftNum` is set. Stages 0..4 are
  * always present; the 32-position stage exists only for `len == 64`.
  *
  * @param len       data width, must be 32 or 64
  * @param lzc_width width of the shift amount; must cover every bit of `shiftNum` that is
  *                  inspected (5 bits for `len == 32`, 6 bits for `len == 64`)
  */
class RightShifter(len: Int, lzc_width: Int) extends Module {
  val io = IO(new Bundle() {
    val shiftNum = Input(UInt(lzc_width.W))
    val in = Input(UInt(len.W))
    val msb = Input(Bool())
    val out = Output(UInt(len.W))
  })
  require(len == 64 || len == 32, s"RightShifter supports len == 32 or 64 only, got $len")
  // Bits 0..log2Ceil(len)-1 of shiftNum are indexed below; fail early with a clear message
  // instead of a bit-index error during elaboration.
  require(lzc_width >= log2Ceil(len), s"lzc_width ($lzc_width) is too narrow for len = $len")

  val shift = io.shiftNum
  val msb = io.msb
  val s0 = Mux(shift(0), Cat(Fill(1, msb), io.in(len - 1, 1)), io.in)
  val s1 = Mux(shift(1), Cat(Fill(2, msb), s0(len - 1, 2)), s0)
  val s2 = Mux(shift(2), Cat(Fill(4, msb), s1(len - 1, 4)), s1)
  val s3 = Mux(shift(3), Cat(Fill(8, msb), s2(len - 1, 8)), s2)
  val s4 = Mux(shift(4), Cat(Fill(16, msb), s3(len - 1, 16)), s3)
  val s5 = Wire(UInt(len.W))
  if (len == 64) {
    s5 := Mux(shift(5), Cat(Fill(32, msb), s4(len - 1, 32)), s4)
  } else {
    // len == 32 (guaranteed by the require above): no sixth stage.
    s5 := s4
  }
  io.out := s5
}

/** Constant tables for SRT4 quotient digit selection.
  *
  * `minus_m(i)` holds the table labelled -m[i - 1] (i.e. -m[-1], -m[0], -m[1], -m[2]), each
  * with eight entries keyed by a 3-bit index and written as 7-bit binary patterns.
  */
object mLookUpTable {
  // Usage (as in SRT4QDS):
  // result := MuxLookup(index, default, mLookUpTable.minus_m(i))
  val minus_m = Seq(
    Array( // -m[-1]
      0.U -> "b00_11010".U,
      1.U -> "b00_11110".U,
      2.U -> "b01_00000".U,
      3.U -> "b01_00100".U,
      4.U -> "b01_00110".U,
      5.U -> "b01_01010".U,
      6.U -> "b01_01100".U,
      7.U -> "b01_10000".U
    ),
    Array( // -m[0]
      0.U -> "b000_0101".U,
      1.U -> "b000_0110".U,
      2.U -> "b000_0110".U,
      3.U -> "b000_0110".U,
      4.U -> "b000_1001".U,
      5.U -> "b000_1000".U,
      6.U -> "b000_1000".U,
      7.U -> "b000_1000".U
    ),
    Array( //-m[1]
      0.U -> "b111_1101".U,
      1.U -> "b111_1100".U,
      2.U -> "b111_1100".U,
      3.U -> "b111_1100".U,
      4.U -> "b111_1011".U,
      5.U -> "b111_1010".U,
      6.U -> "b111_1010".U,
      7.U -> "b111_1010".U
    ),
    Array( //-m[2]
      0.U -> "b11_01000".U,
      1.U -> "b11_00100".U,
      2.U -> "b11_00010".U,
      3.U -> "b10_11110".U,
      4.U -> "b10_11100".U,
      5.U -> "b10_11000".U,
      6.U -> "b10_10110".U,
      7.U -> "b10_10010".U
    ))
}

/** SRT4 quotient digit selection.
  *
  * Selects the next quotient digit (one-hot over -2, -1, 0, +1, +2) from the carry-save
  * remainder registers, the normalised divisor and the digit applied in the current step.
  * For each of the four `mLookUpTable.minus_m` tables it sums a 7-bit slice of the scaled
  * remainder, the table entry and a 7-bit slice of the previous-digit multiple of `d`, and
  * keeps only bit 6 of that sum; the four resulting bits are decoded into `qIterEnd`.
  *
  * @param len     divisor width
  * @param itn_len width of the remainder datapath
  */
class SRT4QDS(len: Int, itn_len: Int) extends Module {
  // srt4 quotient digit selection
  val io = IO(new Bundle() {
    val remSum = Input(UInt(itn_len.W)) // 68, 67
    val remCarry = Input(UInt(itn_len.W))
    val d = Input(UInt(len.W)) // 64, 64
    val specialDivisor = Input(Bool()) // not read inside this module
    val qPrev = Input(UInt(5.W))
    val qIterEnd = Output(UInt(5.W))
  })
  val remSumX16 = io.remSum << 4 // 72, 67 Top 2 bits unused
  val remCarryX16 = io.remCarry << 4
  // Two 7-bit windows into the scaled values; trunc34 sits one bit higher than trunc25.
  def trunc25(rem: UInt): UInt = {rem(itn_len, itn_len - 7 + 1)}
  def trunc34(rem: UInt): UInt = {rem(itn_len + 1, itn_len + 1 - 7 + 1)}

  // Bit positions of the one-hot encoded quotient digit.
  val quot_neg_2 :: quot_neg_1 :: quot_0 :: quot_pos_1 :: quot_pos_2 :: Nil = Enum(5)

  val d = Cat(0.U(1.W), io.d, 0.U(3.W)) // 68, 67
  val (dX4, dX8, dXNeg4, dXNeg8) = (d << 2, d(itn_len - 2, 0) << 3, ~(d << 2), ~(d(itn_len - 2, 0) << 3)) // 70, 67
  // Table index: the three divisor bits just below the MSB.
  val dForLookup = io.d(len - 2, len - 2 - 3 + 1)

  // Multiple of d that corresponds to the digit in qPrev (one's complement for positive digits).
  val dXq = Mux1H(Seq(
    io.qPrev(quot_neg_2) -> dX8,
    io.qPrev(quot_neg_1) -> dX4,
    io.qPrev(quot_0) -> 0.U((itn_len + 2).W),
    io.qPrev(quot_pos_1) -> dXNeg4,
    io.qPrev(quot_pos_2) -> dXNeg8
  ))
  val signs = VecInit(Seq.tabulate(4){ // -1 0 1 2
    i => {
      // Two chained 3:2 compressors followed by a 7-bit add; only bit 6 of the sum is kept.
      val csa1 = Module(new CSA3_2(7))
      val csa2 = Module(new CSA3_2(7))
      if (i == 1 || i == 2) {
        csa1.io.in(0) := trunc34(remSumX16)
        csa1.io.in(1) := trunc34(remCarryX16)
        csa2.io.in(2) := trunc34(dXq)
      } else {
        csa1.io.in(0) := trunc25(remSumX16)
        csa1.io.in(1) := trunc25(remCarryX16)
        csa2.io.in(2) := trunc25(dXq)
      }
      csa1.io.in(2) := MuxLookup(dForLookup, "b0000000".U, mLookUpTable.minus_m(i))
      csa2.io.in(0) := csa1.io.out(0)
      csa2.io.in(1) := csa1.io.out(1)(5, 0) << 1
      (csa2.io.out(0) + (csa2.io.out(1)(5, 0) << 1))(6)
    }
  })
  // Decode the four comparison bits into a one-hot digit.
  val qVec = Wire(Vec(5, Bool()))
  qVec(quot_neg_2) := signs(0) && signs(1) && signs(2)
  qVec(quot_neg_1) := ~signs(0) && signs(1) && signs(2)
  qVec(quot_0) := signs(2) && ~signs(1)
  qVec(quot_pos_1) := signs(3) && ~signs(2) && ~signs(1)
  qVec(quot_pos_2) := ~signs(3) && ~signs(2) && ~signs(1)
  io.qIterEnd := qVec.asUInt
  // assert(PopCount(qVec) === 1.U)
}


/** Function-unit wrapper around [[SRT4DividerDataModule]].
  *
  * Latches the micro-op and control signals of an accepted request, derives the two kill
  * conditions from the redirect/flush inputs and forwards the handshake and result.
  * `io`, `ctrl` and `sign` are not defined in this class (presumably inherited from
  * `AbstractDivider` -- confirm there).
  *
  * @param len operand width passed to the data module
  */
class SRT4Divider(len: Int)(implicit p: Parameters) extends AbstractDivider(len) {

  val newReq = io.in.fire()

  // uop / ctrl of the request currently being processed.
  val uop = io.in.bits.uop
  val uopReg = RegEnable(uop, newReq)
  val ctrlReg = RegEnable(ctrl, newReq)

  val divDataModule = Module(new SRT4DividerDataModule(len))

  // kill_w: the incoming uop is being flushed, so it must not start.
  val kill_w = uop.roqIdx.needFlush(io.redirectIn, io.flushIn)
  // kill_r: the data module is not accepting input (in_ready low) and the latched uop is flushed.
  val kill_r = !divDataModule.io.in_ready && uopReg.roqIdx.needFlush(io.redirectIn, io.flushIn)

  divDataModule.io.src(0) := io.in.bits.src(0)
  divDataModule.io.src(1) := io.in.bits.src(1)
  divDataModule.io.valid := io.in.valid
  divDataModule.io.sign := sign
  divDataModule.io.kill_w := kill_w
  divDataModule.io.kill_r := kill_r
  // Result selection uses the latched control signals of the accepted request.
  divDataModule.io.isHi := ctrlReg.isHi
  divDataModule.io.isW := ctrlReg.isW
  divDataModule.io.out_ready := io.out.ready

  io.in.ready := divDataModule.io.in_ready
  io.out.valid := divDataModule.io.out_valid
  io.out.bits.data := divDataModule.io.out_data
  io.out.bits.uop := uopReg
}