/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*          http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package xiangshan.mem

import chipsalliance.rocketchip.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.cache._
import xiangshan.cache.{DCacheWordIO, DCacheLineIO, MemoryOpConstants}
import xiangshan.mem._
import xiangshan.backend.rob.RobPtr

class LQDataEntryWoPaddr(implicit p: Parameters) extends XSBundle {
  val mask = UInt(8.W)
  val data = UInt(XLEN.W)
  val fwdMask = Vec(8, Bool())
}

class LQDataEntry(implicit p: Parameters) extends LQDataEntryWoPaddr {
  val paddr = UInt(PAddrBits.W)
}

// Data module define
// These data modules are like SyncDataModuleTemplate, but support cam-like ops

// load queue paddr module
//
// It supports 3 cam sources:
// * st-ld violation addr cam
// * data release addr cam
// * data refill addr cam
class LQPaddrModule(numEntries: Int, numRead: Int, numWrite: Int, numWBanks: Int)(implicit p: Parameters) extends XSModule with HasDCacheParameters {
  val io = IO(new Bundle {
    // normal read/write ports
    val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W)))
    val rdata = Output(Vec(numRead, UInt((PAddrBits).W)))
    val wen   = Input(Vec(numWrite, Bool()))
    val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W)))
    val wdata = Input(Vec(numWrite, UInt((PAddrBits).W)))
    // violation cam: hit if addr is in the same word
    val violationMdata = Input(Vec(StorePipelineWidth, UInt((PAddrBits).W))) // addr
    val violationMmask = Output(Vec(StorePipelineWidth, Vec(numEntries, Bool()))) // cam result mask
    // release cam: hit if addr is in the same cacheline
    val releaseMdata = Input(Vec(LoadPipelineWidth, UInt((PAddrBits).W)))
    val releaseMmask = Output(Vec(LoadPipelineWidth, Vec(numEntries, Bool())))
    // refill cam: hit if addr is in the same cacheline
    val refillMdata = Input(UInt((PAddrBits).W))
    val refillMmask = Output(Vec(numEntries, Bool()))
  })

  require(isPow2(numWBanks))
  require(numWBanks >= 2)

  val numEntryPerBank = numEntries / numWBanks

  val data = Reg(Vec(numEntries, UInt((PAddrBits).W)))

  // read ports
  for (i <- 0 until numRead) {
    io.rdata(i) := data(RegNext(io.raddr(i)))
  }

  // write ports
  val waddr_dec = io.waddr.map(a => UIntToOH(a))
  def selectBankMask(in: UInt, bank: Int): UInt = {
    in((bank + 1) * numEntryPerBank - 1, bank * numEntryPerBank)
  }
  for (bank <- 0 until numWBanks) {
    // write ports
    // s0: write to bank level buffer
    val s0_bank_waddr_dec = waddr_dec.map(a => selectBankMask(a, bank))
    val s0_bank_write_en = io.wen.zip(s0_bank_waddr_dec).map(w => w._1 && w._2.orR)
    s0_bank_waddr_dec.zipWithIndex.map(a =>
      a._1.suggestName("s0_bank_waddr_dec" + bank + "_" + a._2)
    )
    s0_bank_write_en.zipWithIndex.map(a =>
      a._1.suggestName("s0_bank_write_en" + bank + "_" + a._2)
    )
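    // Note: the write path is pipelined. s0 selects the per-bank slice of the decoded
    // write address and computes the bank write enable; both are registered into s1,
    // which performs the actual entry write. A write therefore takes effect one cycle
    // after wen is asserted (the banking presumably reduces write-enable fanout per entry).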
    // s1: write data to entries
    val s1_bank_waddr_dec = s0_bank_waddr_dec.zip(s0_bank_write_en).map(w => RegEnable(w._1, w._2))
    val s1_bank_wen = RegNext(VecInit(s0_bank_write_en))
    val s1_wdata = io.wdata.zip(s0_bank_write_en).map(w => RegEnable(w._1, w._2))
    s1_bank_waddr_dec.zipWithIndex.map(a =>
      a._1.suggestName("s1_bank_waddr_dec" + bank + "_" + a._2)
    )
    s1_bank_wen.zipWithIndex.map(a =>
      a._1.suggestName("s1_bank_wen" + bank + "_" + a._2)
    )
    s1_wdata.zipWithIndex.map(a =>
      a._1.suggestName("s1_wdata" + bank + "_" + a._2)
    )

    // entry write
    for (entry <- 0 until numEntryPerBank) {
      // write ports
      val s1_entry_write_en_vec = s1_bank_wen.zip(s1_bank_waddr_dec).map(w => w._1 && w._2(entry))
      val s1_entry_write_en = VecInit(s1_entry_write_en_vec).asUInt.orR
      val s1_entry_write_data = Mux1H(s1_entry_write_en_vec, s1_wdata)
      when (s1_entry_write_en) {
        data(bank * numEntryPerBank + entry) := s1_entry_write_data
      }
      s1_entry_write_en_vec.zipWithIndex.map(a =>
        a._1.suggestName("s1_entry_write_en_vec" + bank + "_" + entry + "_" + a._2)
      )
      s1_entry_write_en.suggestName("s1_entry_write_en" + bank + "_" + entry)
      s1_entry_write_data.suggestName("s1_entry_write_data" + bank + "_" + entry)
    }
  }

  // content addressed match
  for (i <- 0 until StorePipelineWidth) {
    for (j <- 0 until numEntries) {
      io.violationMmask(i)(j) := io.violationMdata(i)(PAddrBits-1, DCacheWordOffset) === data(j)(PAddrBits-1, DCacheWordOffset)
    }
  }
  for (i <- 0 until LoadPipelineWidth) {
    for (j <- 0 until numEntries) {
      io.releaseMmask(i)(j) := io.releaseMdata(i)(PAddrBits-1, DCacheLineOffset) === data(j)(PAddrBits-1, DCacheLineOffset)
    }
  }
  for (j <- 0 until numEntries) {
    io.refillMmask(j) := get_refill_addr(io.refillMdata) === get_refill_addr(data(j))
  }

  // DataModuleTemplate should not be used when there are any write conflicts
  for (i <- 0 until numWrite) {
    for (j <- i+1 until numWrite) {
      assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
    }
  }
}

// load queue load mask module
class LQMaskModule(numEntries: Int, numRead: Int, numWrite: Int)(implicit p: Parameters) extends XSModule {
  val io = IO(new Bundle {
    val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W)))
    val rdata = Output(Vec(numRead, UInt(8.W)))
    val wen   = Input(Vec(numWrite, Bool()))
    val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W)))
    val wdata = Input(Vec(numWrite, UInt(8.W)))
    // st-ld violation check wmask compare
    val violationMdata = Input(Vec(StorePipelineWidth, UInt(8.W))) // input 8-bit wmask
    val violationMmask = Output(Vec(StorePipelineWidth, Vec(numEntries, Bool()))) // output wmask overlap vector
  })

  val data = Reg(Vec(numEntries, UInt(8.W)))

  // read ports
  for (i <- 0 until numRead) {
    io.rdata(i) := data(RegNext(io.raddr(i)))
  }

  // write ports
  val waddr_dec = io.waddr.map(a => UIntToOH(a))
  for (j <- 0 until numEntries) {
    val write_wen = io.wen.zip(waddr_dec).map(w => w._1 && w._2(j))
    when (VecInit(write_wen).asUInt.orR) {
      data(j) := Mux1H(write_wen, io.wdata)
    }
  }

  // st-ld violation check wmask compare
  for (i <- 0 until StorePipelineWidth) {
    for (j <- 0 until numEntries) {
      io.violationMmask(i)(j) := (io.violationMdata(i) & data(j)).orR
    }
  }

  // DataModuleTemplate should not be used when there are any write conflicts
  for (i <- 0 until numWrite) {
    for (j <- i+1 until numWrite) {
      assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
    }
  }
}

// LQDataModule is a wrapper of 8-bit MaskedBankedSyncDataModuleTemplates
//
// It also contains:
// * fwdMask, which is used to merge refill data and forwarded data
// * word index extracted from paddr, which is used to select data from refill data (a cacheline)
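//
// Each entry's data is stored as 8 byte slices (one data8 module per byte), so a refill
// can update individual bytes: bytes already provided by store-to-load forwarding
// (fwdMask bit set) are kept, and only the remaining bytes are filled from the refill line.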
class LQDataModule(numEntries: Int, numRead: Int, numWrite: Int)(implicit p: Parameters) extends XSModule with HasDCacheParameters {
  val io = IO(new Bundle {
    // sync read
    val raddr = Input(Vec(numRead, UInt(log2Up(numEntries).W)))
    val rdata = Output(Vec(numRead, UInt(XLEN.W)))
    // address indexed write
    val wen   = Input(Vec(numWrite, Bool()))
    val waddr = Input(Vec(numWrite, UInt(log2Up(numEntries).W)))
    val wdata = Input(Vec(numWrite, UInt(XLEN.W)))
    // forward mask needs to be recorded to merge data
    val fwdMaskWdata = Input(Vec(numWrite, UInt(8.W)))
    // refillOffBits - wordOffBits bits in paddr need to be stored in LQDataModule for refilling
    val paddrWdata = Input(Vec(numWrite, UInt((PAddrBits).W)))
    // masked write
    val mwmask = Input(Vec(numEntries, Bool()))
    val refillData = Input(UInt(l1BusDataWidth.W))
  })

  val data8 = Seq.fill(8)(Module(new MaskedBankedSyncDataModuleTemplate(
    UInt(8.W), numEntries, numRead, numWrite, numMWrite = refillWords, numWBanks = LoadQueueNWriteBanks
  )))
  val fwdMask = Reg(Vec(numEntries, UInt(8.W)))
  val wordIndex = Reg(Vec(numEntries, UInt((refillOffBits - wordOffBits).W)))

  // read ports
  for (i <- 0 until numRead) {
    for (j <- 0 until 8) {
      data8(j).io.raddr(i) := io.raddr(i)
    }
    io.rdata(i) := VecInit((0 until 8).map(j => data8(j).io.rdata(i))).asUInt
  }

  // below are the write ports (with priorities)
  for (i <- 0 until numWrite) {
    // write to data8
    for (j <- 0 until 8) {
      data8(j).io.waddr(i) := io.waddr(i)
      data8(j).io.wdata(i) := io.wdata(i)(8*(j+1)-1, 8*j)
      data8(j).io.wen(i) := io.wen(i)
    }

    // write ctrl info
    // TODO: optimize that
    when (io.wen(i)) {
      fwdMask(io.waddr(i)) := io.fwdMaskWdata(i)
    }
    when (io.wen(i)) {
      wordIndex(io.waddr(i)) := get_word(io.paddrWdata(i))
    }
  }

  // write refilled data to data8

  // select refill data
  // split dcache result into words
  val words = VecInit((0 until refillWords) map { i => io.refillData(DataBits * (i + 1) - 1, DataBits * i) })

  // select refill data according to wordIndex (paddr)
  for (i <- 0 until 8) {
    for (j <- 0 until refillWords) {
      data8(i).io.mwdata(j) := words(j)(8*(i+1)-1, 8*i)
    }
  }

  // gen refill wmask
  for (j <- 0 until refillWords) {
    for (k <- 0 until numEntries) {
      val wordMatch = wordIndex(k) === j.U
      for (i <- 0 until 8) {
        data8(i).io.mwmask(j)(k) := wordMatch && io.mwmask(k) && !fwdMask(k)(i)
      }
    }
  }

  // DataModuleTemplate should not be used when there are any write conflicts
  for (i <- 0 until numWrite) {
    for (j <- i+1 until numWrite) {
      assert(!(io.wen(i) && io.wen(j) && io.waddr(i) === io.waddr(j)))
    }
  }
}

// LoadQueueDataWrapper wraps:
// * load queue paddrModule
// * load queue maskModule
// * load queue dataModule
// and their interconnect
class LoadQueueDataWrapper(size: Int, wbNumRead: Int, wbNumWrite: Int)(implicit p: Parameters) extends XSModule with HasDCacheParameters with HasCircularQueuePtrHelper {
  val io = IO(new Bundle() {
    val paddr = new Bundle() {
      val wen = Vec(wbNumWrite, Input(Bool()))
      val waddr = Input(Vec(wbNumWrite, UInt(log2Up(size).W)))
      val wdata = Input(Vec(wbNumWrite, UInt(PAddrBits.W)))
    }
    val wb = new Bundle() {
      val wen = Vec(wbNumWrite, Input(Bool()))
      val waddr = Input(Vec(wbNumWrite, UInt(log2Up(size).W)))
      val wdata = Input(Vec(wbNumWrite, new LQDataEntry))
      val raddr = Input(Vec(wbNumRead, UInt(log2Up(size).W)))
      val rdata = Output(Vec(wbNumRead, new LQDataEntry))
    }
    val uncache = new Bundle() {
      val wen = Input(Bool())
      val waddr = Input(UInt(log2Up(size).W))
      val wdata = Input(UInt(XLEN.W)) // only write back uncache data
      val raddr = Input(UInt(log2Up(size).W))
      val rdata = Output(new LQDataEntry)
    }
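    // refill port: refillMask marks entries that are waiting for this refill,
    // matchMask is the paddr CAM result fed back to the load queue, and data is
    // the full refill line from dcache that gets merged into matching entries.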
    val refill = new Bundle() {
      val valid = Input(Bool())
      val paddr = Input(UInt(PAddrBits.W))
      val data = Input(UInt(l1BusDataWidth.W))
      val refillMask = Input(Vec(size, Bool()))
      val matchMask = Output(Vec(size, Bool()))
    }
    // st-ld violation query, word level cam
    val violation = Vec(StorePipelineWidth, new Bundle() {
      val paddr = Input(UInt(PAddrBits.W))
      val mask = Input(UInt(8.W))
      val violationMask = Output(Vec(size, Bool()))
    })
    // ld-ld violation query, cache line level cam
    val release_violation = Vec(LoadPipelineWidth, new Bundle() {
      val paddr = Input(UInt(PAddrBits.W))
      val match_mask = Output(Vec(size, Bool()))
      // if a ld-ld violation does happen, we replay from the older load
    })
    val debug = Output(Vec(size, new LQDataEntry))

    def wbWrite(channel: Int, waddr: UInt, wdata: LQDataEntry): Unit = {
      require(channel >= 0 && channel < wbNumWrite)
      // need extra "this.wb(channel).wen := true.B"
      this.wb.waddr(channel) := waddr
      this.wb.wdata(channel) := wdata
    }

    def uncacheWrite(waddr: UInt, wdata: UInt): Unit = {
      // need extra "this.uncache.wen := true.B"
      this.uncache.waddr := waddr
      this.uncache.wdata := wdata
    }
  })

  // data module
  val paddrModule = Module(new LQPaddrModule(size, numRead = LoadPipelineWidth+1, numWrite = LoadPipelineWidth, numWBanks = LoadQueueNWriteBanks))
  val maskModule = Module(new LQMaskModule(size, numRead = LoadPipelineWidth+1, numWrite = LoadPipelineWidth))
  val dataModule = Module(new LQDataModule(size, numRead = LoadPipelineWidth+1, numWrite = LoadPipelineWidth+1))

  // read data
  // read port 0 -> wbNumRead-1
  (0 until wbNumRead).map(i => {
    paddrModule.io.raddr(i) := io.wb.raddr(i)
    maskModule.io.raddr(i) := io.wb.raddr(i)
    dataModule.io.raddr(i) := io.wb.raddr(i)

    io.wb.rdata(i).paddr := paddrModule.io.rdata(i)
    io.wb.rdata(i).mask := maskModule.io.rdata(i)
    io.wb.rdata(i).data := dataModule.io.rdata(i)
    io.wb.rdata(i).fwdMask := DontCare
  })

  // read port wbNumRead
  paddrModule.io.raddr(wbNumRead) := io.uncache.raddr
  maskModule.io.raddr(wbNumRead) := io.uncache.raddr
  dataModule.io.raddr(wbNumRead) := io.uncache.raddr

  io.uncache.rdata.paddr := paddrModule.io.rdata(wbNumRead)
  io.uncache.rdata.mask := maskModule.io.rdata(wbNumRead)
  io.uncache.rdata.data := dataModule.io.rdata(wbNumRead)
  io.uncache.rdata.fwdMask := DontCare

  // write data
  // write port 0 -> wbNumWrite-1
  (0 until wbNumWrite).map(i => {
    paddrModule.io.wen(i) := false.B
    maskModule.io.wen(i) := false.B
    dataModule.io.wen(i) := false.B

    maskModule.io.waddr(i) := io.wb.waddr(i)
    dataModule.io.waddr(i) := io.wb.waddr(i)

    maskModule.io.wdata(i) := io.wb.wdata(i).mask
    dataModule.io.wdata(i) := io.wb.wdata(i).data
    dataModule.io.fwdMaskWdata(i) := io.wb.wdata(i).fwdMask.asUInt
    dataModule.io.paddrWdata(i) := io.wb.wdata(i).paddr

    when (io.wb.wen(i)) {
      maskModule.io.wen(i) := true.B
      dataModule.io.wen(i) := true.B
    }

    paddrModule.io.wen(i) := io.paddr.wen(i)
    paddrModule.io.waddr(i) := io.paddr.waddr(i)
    paddrModule.io.wdata(i) := io.paddr.wdata(i)
  })

  // write port wbNumWrite
  dataModule.io.wen(wbNumWrite) := io.uncache.wen
  // dataModule.io.fwdMaskWen(wbNumWrite) := false.B
  // dataModule.io.paddrWen(wbNumWrite) := false.B
  dataModule.io.waddr(wbNumWrite) := io.uncache.waddr
  dataModule.io.fwdMaskWdata(wbNumWrite) := DontCare
  dataModule.io.paddrWdata(wbNumWrite) := DontCare
  dataModule.io.wdata(wbNumWrite) := io.uncache.wdata
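  // A load entry hits the st-ld violation query when its recorded paddr matches the
  // store paddr at word granularity (LQPaddrModule) and its byte mask overlaps the
  // store's byte mask (LQMaskModule); the two per-entry CAM results are ANDed below.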
  // st-ld mem access violation check, gen violationMask
  (0 until StorePipelineWidth).map(i => {
    paddrModule.io.violationMdata(i) := io.violation(i).paddr
    maskModule.io.violationMdata(i) := io.violation(i).mask
    io.violation(i).violationMask := (paddrModule.io.violationMmask(i).asUInt & maskModule.io.violationMmask(i).asUInt).asBools
  })

  // ld-ld mem access violation check, gen violationMask (cam match mask)
  (0 until LoadPipelineWidth).map(i => {
    paddrModule.io.releaseMdata(i) := io.release_violation(i).paddr
    io.release_violation(i).match_mask := paddrModule.io.releaseMmask(i)
  })

  // gen paddr match mask
  paddrModule.io.refillMdata := io.refill.paddr
  io.refill.matchMask := paddrModule.io.refillMmask
  // io.refill.matchMask(i) := get_block_addr(data(i).paddr) === get_block_addr(io.refill.paddr)

  // refill data according to matchMask, refillMask and refill.valid
  dataModule.io.refillData := io.refill.data
  (0 until size).map(i => {
    dataModule.io.mwmask(i) := io.refill.valid && io.refill.matchMask(i) && io.refill.refillMask(i)
  })

  // debug data read
  io.debug := DontCare
}