// /XiangShan/src/main/scala/xiangshan/mem/prefetch/L1StreamPrefetcher.scala (revision bb2f3f51dd67f6e16e0cc1ffe43368c9fc7e4aef)
package xiangshan.mem.prefetch

import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import xiangshan._
import utils._
import utility._
import xiangshan.cache.HasDCacheParameters
import xiangshan.cache.mmu._
import xiangshan.mem.{L1PrefetchReq, LdPrefetchTrainBundle}
import xiangshan.mem.trace._
import xiangshan.mem.L1PrefetchSource

trait HasStreamPrefetchHelper extends HasL1PrefetchHelper {
  // capacity related
  val STREAM_FILTER_SIZE = 4
  val BIT_VEC_ARRAY_SIZE = 16
  val ACTIVE_THRESHOLD = BIT_VEC_WITDH - 4
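  // an entry becomes active once ACTIVE_THRESHOLD distinct blocks of its region have been
  // touched; BIT_VEC_WITDH is inherited from HasL1PrefetchHelper (presumably one bit per
  // cache block in a region, e.g. 16 for a 1 KB region of 64 B blocks, for a threshold of 12)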
  val INIT_DEC_MODE = false

  // bit_vector [StreamBitVectorBundle]:
  // `X`: valid; `.`: invalid; `H`: hit
  // [X X X X X X X X X . . H . X X X]                                                         [. . X X X X . . . . . . . . . .]
  //                    hit in 12th slot & active           --------------------->             prefetch bit_vector [StreamPrefetchReqBundle]
  //                        |  <---------------------------- depth ---------------------------->
  //                                                                                           | <-- width -- >
  val DEPTH_BYTES = 1024
  val DEPTH_CACHE_BLOCKS = DEPTH_BYTES / dcacheParameters.blockBytes
  val WIDTH_BYTES = 128
  val WIDTH_CACHE_BLOCKS = WIDTH_BYTES / dcacheParameters.blockBytes

  val L2_DEPTH_RATIO = 2
  val L2_WIDTH_BYTES = WIDTH_BYTES * 2
  val L2_WIDTH_CACHE_BLOCKS = L2_WIDTH_BYTES / dcacheParameters.blockBytes

  val L3_DEPTH_RATIO = 3
  val L3_WIDTH_BYTES = WIDTH_BYTES * 2 * 2
  val L3_WIDTH_CACHE_BLOCKS = L3_WIDTH_BYTES / dcacheParameters.blockBytes

  val DEPTH_LOOKAHEAD = 6
  val DEPTH_BITS = log2Up(DEPTH_CACHE_BLOCKS) + DEPTH_LOOKAHEAD
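  // worked example, assuming 64 B cache blocks: DEPTH_CACHE_BLOCKS = 1024 / 64 = 16,
  // so DEPTH_BITS = log2Up(16) + 6 = 10, letting io.dynamic_depth express depths of up
  // to 2^10 - 1 = 1023 blocks (64x the default depth of 16 blocks)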

  val ENABLE_DECR_MODE = false
  val ENABLE_STRICT_ACTIVE_DETECTION = true

  // constraints
  require((DEPTH_BYTES >= REGION_SIZE) && ((DEPTH_BYTES % REGION_SIZE) == 0) && ((DEPTH_BYTES / REGION_SIZE) > 0))
  require(((VADDR_HASH_WIDTH * 3) + BLK_ADDR_RAW_WIDTH) <= REGION_TAG_BITS)
  require(WIDTH_BYTES >= dcacheParameters.blockBytes)
}

class StreamBitVectorBundle(implicit p: Parameters) extends XSBundle with HasStreamPrefetchHelper {
  val tag = UInt(REGION_TAG_BITS.W)
  val bit_vec = UInt(BIT_VEC_WITDH.W)
  val active = Bool()
  // cnt can be optimized
  val cnt = UInt((log2Up(BIT_VEC_WITDH) + 1).W)
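  // (cnt mirrors PopCount(bit_vec): alloc seeds it with 1 and update increments it
  // once per newly-set bit, so it needs log2Up(BIT_VEC_WITDH) + 1 bits to hold the
  // all-ones value)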
  val decr_mode = Bool()

  // debug usage
  val trigger_full_va = UInt(VAddrBits.W)

  def reset(index: Int) = {
    tag := index.U
    bit_vec := 0.U
    active := false.B
    cnt := 0.U
    decr_mode := INIT_DEC_MODE.B
    trigger_full_va := 0xdeadbeefL.U
  }

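  // tags are compared through region_hash_tag (from HasL1PrefetchHelper): a narrower
  // comparator at the cost of occasional false matches, which the "hash_conflict"
  // perf counter at the bottom of this file measures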
  def tag_match(new_tag: UInt): Bool = {
    region_hash_tag(tag) === region_hash_tag(new_tag)
  }

  def alloc(alloc_tag: UInt, alloc_bit_vec: UInt, alloc_active: Bool, alloc_decr_mode: Bool, alloc_full_vaddr: UInt) = {
    tag := alloc_tag
    bit_vec := alloc_bit_vec
    active := alloc_active
    cnt := 1.U
    trigger_full_va := alloc_full_vaddr
    if (ENABLE_DECR_MODE) {
      decr_mode := alloc_decr_mode
    } else {
      decr_mode := INIT_DEC_MODE.B
    }

    assert(PopCount(alloc_bit_vec) === 1.U, "alloc vector should be one hot")
  }

  def update(update_bit_vec: UInt, update_active: Bool) = {
    // if the slot was 0 before, increment cnt
    val cnt_en = !((bit_vec & update_bit_vec).orR)
    val cnt_next = Mux(cnt_en, cnt + 1.U, cnt)

    bit_vec := bit_vec | update_bit_vec
    cnt := cnt_next
    when(cnt_next >= ACTIVE_THRESHOLD.U) {
      active := true.B
    }
    when(update_active) {
      active := true.B
    }

    assert(PopCount(update_bit_vec) === 1.U, "update vector should be one hot")
    assert(cnt <= BIT_VEC_WITDH.U, "cnt should always be less than or equal to the bit vector size")
  }
}

class StreamPrefetchReqBundle(implicit p: Parameters) extends XSBundle with HasStreamPrefetchHelper {
  val region = UInt(REGION_TAG_BITS.W)
  val bit_vec = UInt(BIT_VEC_WITDH.W)
  val sink = UInt(SINK_BITS.W)
  val source = new L1PrefetchSource()
  // debug usage
  val trigger_pc = UInt(VAddrBits.W)
  val trigger_va = UInt(VAddrBits.W)

  // align prefetch vaddr and width to region
  def getStreamPrefetchReqBundle(valid: Bool, vaddr: UInt, width: Int, decr_mode: Bool, sink: UInt, source: UInt, t_pc: UInt, t_va: UInt): StreamPrefetchReqBundle = {
    val res = Wire(new StreamPrefetchReqBundle)
    res.region := get_region_tag(vaddr)
    res.sink := sink
    res.source.value := source

    res.trigger_pc := t_pc
    res.trigger_va := t_va

    val region_bits = get_region_bits(vaddr)
    val region_bit_vec = UIntToOH(region_bits)
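    // expand the one-hot trigger bit into up to `width` consecutive blocks: shifted
    // towards the MSB in incr mode, towards the LSB in decr mode; bits shifted past
    // the region boundary are truncated, so requests near a region edge cover fewer blocks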
    res.bit_vec := Mux(
      decr_mode,
      (0 until width).map(i => region_bit_vec >> i).reduce(_ | _),
      (0 until width).map(i => region_bit_vec << i).reduce(_ | _)
    )

    assert(!valid || PopCount(res.bit_vec) <= width.U, "actual prefetch block number should be less than or equal to WIDTH_CACHE_BLOCKS")
    assert(!valid || PopCount(res.bit_vec) >= 1.U, "at least one block should be included")
    assert(sink <= SINK_L3, "invalid sink")
    for(i <- 0 until BIT_VEC_WITDH) {
      when(decr_mode) {
        when(i.U > region_bits) {
          assert(!valid || res.bit_vec(i) === 0.U, s"res.bit_vec(${i}) is not zero in decr_mode, prefetch vector is wrong!")
        }.elsewhen(i.U === region_bits) {
          assert(!valid || res.bit_vec(i) === 1.U, s"res.bit_vec(${i}) is zero in decr_mode, prefetch vector is wrong!")
        }
      }.otherwise {
        when(i.U < region_bits) {
          assert(!valid || res.bit_vec(i) === 0.U, s"res.bit_vec(${i}) is not zero in incr_mode, prefetch vector is wrong!")
        }.elsewhen(i.U === region_bits) {
          assert(!valid || res.bit_vec(i) === 1.U, s"res.bit_vec(${i}) is zero in incr_mode, prefetch vector is wrong!")
        }
      }
    }

    res
  }
}

class StreamBitVectorArray(implicit p: Parameters) extends XSModule with HasStreamPrefetchHelper {
  val io = IO(new XSBundle {
    val enable = Input(Bool())
    // TODO: flush all entries when a process change happens, or disable stream prefetching for a while
    val flush = Input(Bool())
    val dynamic_depth = Input(UInt(DEPTH_BITS.W))
    val train_req = Flipped(DecoupledIO(new PrefetchReqBundle))
    val l1_prefetch_req = ValidIO(new StreamPrefetchReqBundle)
    val l2_l3_prefetch_req = ValidIO(new StreamPrefetchReqBundle)

    // Stride sends its lookup requests here
    val stream_lookup_req  = Flipped(ValidIO(new PrefetchReqBundle))
    val stream_lookup_resp = Output(Bool())
  })

  val array = Reg(Vec(BIT_VEC_ARRAY_SIZE, new StreamBitVectorBundle))
  val replacement = ReplacementPolicy.fromString("plru", BIT_VEC_ARRAY_SIZE)
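  // a small fully-associative table (16 entries), one region per entry, managed by a
  // pseudo-LRU replacement policy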

  // s0: generate region tag, parallel match
  val s0_can_accept = Wire(Bool())
  val s0_valid = io.train_req.fire
  val s0_pc    = io.train_req.bits.pc
  val s0_vaddr = io.train_req.bits.vaddr
  val s0_region_bits = get_region_bits(s0_vaddr)
  val s0_region_tag = get_region_tag(s0_vaddr)
  val s0_region_tag_plus_one = get_region_tag(s0_vaddr) + 1.U
  val s0_region_tag_minus_one = get_region_tag(s0_vaddr) - 1.U
  val s0_region_tag_match_vec = array.map(_.tag_match(s0_region_tag))
  val s0_region_tag_plus_one_match_vec = array.map(_.tag_match(s0_region_tag_plus_one))
  val s0_region_tag_minus_one_match_vec = array.map(_.tag_match(s0_region_tag_minus_one))
  val s0_hit = Cat(s0_region_tag_match_vec).orR
  val s0_plus_one_hit = Cat(s0_region_tag_plus_one_match_vec).orR
  val s0_minus_one_hit = Cat(s0_region_tag_minus_one_match_vec).orR
  val s0_hit_vec = VecInit(s0_region_tag_match_vec).asUInt
  val s0_index = Mux(s0_hit, OHToUInt(s0_hit_vec), replacement.way)
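  // on a hit, s0_index selects the matching entry; on a miss, the PLRU victim that s1
  // will re-allocate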
  val s0_plus_one_index = OHToUInt(VecInit(s0_region_tag_plus_one_match_vec).asUInt)
  val s0_minus_one_index = OHToUInt(VecInit(s0_region_tag_minus_one_match_vec).asUInt)
  io.train_req.ready := s0_can_accept

  when(s0_valid) {
    replacement.access(s0_index)
  }

  val stream_pf_train_debug_table = ChiselDB.createTable("StreamTrainTraceTable" + p(XSCoreParamsKey).HartId.toString, new StreamTrainTraceEntry, basicDB = false)

  val spf_log_enable = s0_valid
  val spf_log_data = Wire(new StreamTrainTraceEntry)

  // WARNING: Type only records that this request was triggered by the stream prefetcher;
  // it says nothing about which cache level the prefetch will sink to
  spf_log_data.Type := MemReqSource.Prefetch2L2Stream.id.U
  spf_log_data.OldAddr := Mux(
    !s0_hit,
    s0_vaddr,
    array(s0_index).trigger_full_va
  )
  spf_log_data.CurAddr := s0_vaddr
  spf_log_data.Offset := DontCare
  spf_log_data.Score := DontCare
  spf_log_data.Miss := io.train_req.bits.miss

  stream_pf_train_debug_table.log(
    data = spf_log_data,
    en = spf_log_enable,
    site = "StreamTrainTraceTable",
    clock = clock,
    reset = reset
  )

  assert(!s0_valid || PopCount(VecInit(s0_region_tag_match_vec)) <= 1.U, "req region should match no more than 1 entry")
  assert(!s0_valid || PopCount(VecInit(s0_region_tag_plus_one_match_vec)) <= 1.U, "req region plus 1 should match no more than 1 entry")
  assert(!s0_valid || PopCount(VecInit(s0_region_tag_minus_one_match_vec)) <= 1.U, "req region minus 1 should match no more than 1 entry")
  assert(!s0_valid || !(s0_hit && s0_plus_one_hit && (s0_index === s0_plus_one_index)), "region and region plus 1 should not share an index")
  assert(!s0_valid || !(s0_hit && s0_minus_one_hit && (s0_index === s0_minus_one_index)), "region and region minus 1 should not share an index")
  assert(!s0_valid || !(s0_plus_one_hit && s0_minus_one_hit && (s0_minus_one_index === s0_plus_one_index)), "region plus 1 and region minus 1 should not share an index")
  assert(!(s0_valid && RegNext(s0_valid) && !s0_hit && !RegEnable(s0_hit, s0_valid) && replacement.way === RegEnable(replacement.way, s0_valid)), "replacement error")

  XSPerfAccumulate("s0_valid_train_req", s0_valid)
  val s0_hit_pattern_vec = Seq(s0_hit, s0_plus_one_hit, s0_minus_one_hit)
  for(i <- 0 until (1 << s0_hit_pattern_vec.size)) {
    XSPerfAccumulate(s"s0_hit_pattern_${toBinary(i)}", (VecInit(s0_hit_pattern_vec).asUInt === i.U) && s0_valid)
  }
  XSPerfAccumulate("s0_replace_the_neighbor", s0_valid && !s0_hit && ((s0_plus_one_hit && (s0_index === s0_plus_one_index)) || (s0_minus_one_hit && (s0_index === s0_minus_one_index))))
  XSPerfAccumulate("s0_req_valid", io.train_req.valid)
  XSPerfAccumulate("s0_req_cannot_accept", io.train_req.valid && !io.train_req.ready)

  val ratio_const = Constantin.createRecord(s"l2DepthRatio${p(XSCoreParamsKey).HartId}", initValue = L2_DEPTH_RATIO)
  val ratio = ratio_const(3, 0)

  val l3_ratio_const = Constantin.createRecord(s"l3DepthRatio${p(XSCoreParamsKey).HartId}", initValue = L3_DEPTH_RATIO)
  val l3_ratio = l3_ratio_const(3, 0)
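  // the depth ratios are runtime-tunable via Constantin: L2/L3 requests run
  // (dynamic_depth << ratio) blocks ahead of the trigger, i.e. 4x and 8x the L1 depth
  // with the default ratios of 2 and 3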

  // s1: alloc or update
  val s1_valid = GatedValidRegNext(s0_valid)
  val s1_index = RegEnable(s0_index, s0_valid)
  val s1_pc    = RegEnable(s0_pc, s0_valid)
  val s1_vaddr = RegEnable(s0_vaddr, s0_valid)
  val s1_plus_one_index = RegEnable(s0_plus_one_index, s0_valid)
  val s1_minus_one_index = RegEnable(s0_minus_one_index, s0_valid)
  val s1_hit = RegEnable(s0_hit, s0_valid)
  val s1_plus_one_hit =
    if (ENABLE_STRICT_ACTIVE_DETECTION)
      RegEnable(s0_plus_one_hit, s0_valid) && array(s1_plus_one_index).active && (array(s1_plus_one_index).cnt >= ACTIVE_THRESHOLD.U)
    else
      RegEnable(s0_plus_one_hit, s0_valid) && array(s1_plus_one_index).active
  val s1_minus_one_hit =
    if (ENABLE_STRICT_ACTIVE_DETECTION)
      RegEnable(s0_minus_one_hit, s0_valid) && array(s1_minus_one_index).active && (array(s1_minus_one_index).cnt >= ACTIVE_THRESHOLD.U)
    else
      RegEnable(s0_minus_one_hit, s0_valid) && array(s1_minus_one_index).active
  val s1_region_tag = RegEnable(s0_region_tag, s0_valid)
  val s1_region_bits = RegEnable(s0_region_bits, s0_valid)
  val s1_alloc = s1_valid && !s1_hit
  val s1_update = s1_valid && s1_hit
  val s1_pf_l1_incr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) + io.dynamic_depth, 0.U(BLOCK_OFFSET.W))
  val s1_pf_l1_decr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) - io.dynamic_depth, 0.U(BLOCK_OFFSET.W))
  val s1_pf_l2_incr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) + (io.dynamic_depth << ratio), 0.U(BLOCK_OFFSET.W))
  val s1_pf_l2_decr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) - (io.dynamic_depth << ratio), 0.U(BLOCK_OFFSET.W))
  val s1_pf_l3_incr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) + (io.dynamic_depth << l3_ratio), 0.U(BLOCK_OFFSET.W))
  val s1_pf_l3_decr_vaddr = Cat(region_to_block_addr(s1_region_tag, s1_region_bits) - (io.dynamic_depth << l3_ratio), 0.U(BLOCK_OFFSET.W))
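  // e.g. with io.dynamic_depth at its presumable default of DEPTH_CACHE_BLOCKS = 16,
  // L1 prefetches 16 blocks (1 KB) ahead of the trigger block, L2 64 blocks (4 KB),
  // and L3 128 blocks (8 KB), in the direction given by decr_mode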
  // TODO: remove this
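  // only re-trigger prefetches when the training access touches a block that is not yet
  // recorded in the entry's bit vector; an already-set bit has triggered them before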
  val s1_can_send_pf = Mux(s1_update, !((array(s1_index).bit_vec & UIntToOH(s1_region_bits)).orR), true.B)
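  // back-pressure s0 while s1 is updating the same (hashed) region, so s0 never reads
  // entry state that s1 is about to overwrite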
  s0_can_accept := !(s1_valid && (region_hash_tag(s1_region_tag) === region_hash_tag(s0_region_tag)))

  when(s1_alloc) {
    // alloc a new entry
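    // a hit on the next-higher region suggests a descending access pattern, so record
    // decr mode (honored only when ENABLE_DECR_MODE is set, see alloc())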
    array(s1_index).alloc(
      alloc_tag = s1_region_tag,
      alloc_bit_vec = UIntToOH(s1_region_bits),
      alloc_active = s1_plus_one_hit || s1_minus_one_hit,
      alloc_decr_mode = RegEnable(s0_plus_one_hit, s0_valid),
      alloc_full_vaddr = RegEnable(s0_vaddr, s0_valid)
    )
  }.elsewhen(s1_update) {
    // update an existing entry
    assert(array(s1_index).cnt =/= 0.U || array(s1_index).tag === s1_index, "entry should have been allocated before")
    array(s1_index).update(
      update_bit_vec = UIntToOH(s1_region_bits),
      update_active = s1_plus_one_hit || s1_minus_one_hit)
  }

  XSPerfAccumulate("s1_alloc", s1_alloc)
  XSPerfAccumulate("s1_update", s1_update)
  XSPerfAccumulate("s1_active_plus_one_hit", s1_valid && s1_plus_one_hit)
  XSPerfAccumulate("s1_active_minus_one_hit", s1_valid && s1_minus_one_hit)

  // s2: trigger prefetch if hit active bit vector, compute meta of prefetch req
  val s2_valid = GatedValidRegNext(s1_valid)
  val s2_index = RegEnable(s1_index, s1_valid)
  val s2_pc    = RegEnable(s1_pc, s1_valid)
  val s2_vaddr = RegEnable(s1_vaddr, s1_valid)
  val s2_region_bits = RegEnable(s1_region_bits, s1_valid)
  val s2_region_tag = RegEnable(s1_region_tag, s1_valid)
  val s2_pf_l1_incr_vaddr = RegEnable(s1_pf_l1_incr_vaddr, s1_valid)
  val s2_pf_l1_decr_vaddr = RegEnable(s1_pf_l1_decr_vaddr, s1_valid)
  val s2_pf_l2_incr_vaddr = RegEnable(s1_pf_l2_incr_vaddr, s1_valid)
  val s2_pf_l2_decr_vaddr = RegEnable(s1_pf_l2_decr_vaddr, s1_valid)
  val s2_pf_l3_incr_vaddr = RegEnable(s1_pf_l3_incr_vaddr, s1_valid)
  val s2_pf_l3_decr_vaddr = RegEnable(s1_pf_l3_decr_vaddr, s1_valid)
  val s2_can_send_pf = RegEnable(s1_can_send_pf, s1_valid)
  val s2_active = array(s2_index).active
  val s2_decr_mode = array(s2_index).decr_mode
  val s2_l1_vaddr = Mux(s2_decr_mode, s2_pf_l1_decr_vaddr, s2_pf_l1_incr_vaddr)
  val s2_l2_vaddr = Mux(s2_decr_mode, s2_pf_l2_decr_vaddr, s2_pf_l2_incr_vaddr)
  val s2_l3_vaddr = Mux(s2_decr_mode, s2_pf_l3_decr_vaddr, s2_pf_l3_incr_vaddr)
  val s2_will_send_pf = s2_valid && s2_active && s2_can_send_pf
  val s2_pf_req_valid = s2_will_send_pf && io.enable
  val s2_pf_l1_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l1_vaddr,
    width = WIDTH_CACHE_BLOCKS,
    decr_mode = s2_decr_mode,
    sink = SINK_L1,
    source = L1_HW_PREFETCH_STREAM,
    t_pc = s2_pc,
    t_va = s2_vaddr
  )
  val s2_pf_l2_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l2_vaddr,
    width = L2_WIDTH_CACHE_BLOCKS,
    decr_mode = s2_decr_mode,
    sink = SINK_L2,
    source = L1_HW_PREFETCH_STREAM,
    t_pc = s2_pc,
    t_va = s2_vaddr
  )
  val s2_pf_l3_req_bits = (new StreamPrefetchReqBundle).getStreamPrefetchReqBundle(
    valid = s2_valid,
    vaddr = s2_l3_vaddr,
    width = L3_WIDTH_CACHE_BLOCKS,
    decr_mode = s2_decr_mode,
    sink = SINK_L3,
    source = L1_HW_PREFETCH_STREAM,
    t_pc = s2_pc,
    t_va = s2_vaddr
  )

  XSPerfAccumulate("s2_valid", s2_valid)
  XSPerfAccumulate("s2_will_not_send_pf", s2_valid && !s2_will_send_pf)
  XSPerfAccumulate("s2_will_send_decr_pf", s2_valid && s2_will_send_pf && s2_decr_mode)
  XSPerfAccumulate("s2_will_send_incr_pf", s2_valid && s2_will_send_pf && !s2_decr_mode)

  // s3: send the l1 prefetch req out
  val s3_pf_l1_valid = GatedValidRegNext(s2_pf_req_valid)
  val s3_pf_l1_bits = RegEnable(s2_pf_l1_req_bits, s2_pf_req_valid)
  val s3_pf_l2_valid = GatedValidRegNext(s2_pf_req_valid)
  val s3_pf_l2_bits = RegEnable(s2_pf_l2_req_bits, s2_pf_req_valid)
  val s3_pf_l3_bits = RegEnable(s2_pf_l3_req_bits, s2_pf_req_valid)

  XSPerfAccumulate("s3_pf_sent", s3_pf_l1_valid)

  // s4: send the l2 prefetch req out
  val s4_pf_l2_valid = GatedValidRegNext(s3_pf_l2_valid)
  val s4_pf_l2_bits = RegEnable(s3_pf_l2_bits, s3_pf_l2_valid)
  val s4_pf_l3_bits = RegEnable(s3_pf_l3_bits, s3_pf_l2_valid)

  val enable_l3_pf = Constantin.createRecord(s"enableL3StreamPrefetch${p(XSCoreParamsKey).HartId}", initValue = false)
  // s5: send the l3 prefetch req out
  val s5_pf_l3_valid = GatedValidRegNext(s4_pf_l2_valid) && enable_l3_pf
  val s5_pf_l3_bits = RegEnable(s4_pf_l3_bits, s4_pf_l2_valid)

  io.l1_prefetch_req.valid := s3_pf_l1_valid
  io.l1_prefetch_req.bits := s3_pf_l1_bits
  io.l2_l3_prefetch_req.valid := s4_pf_l2_valid || s5_pf_l3_valid
  io.l2_l3_prefetch_req.bits := Mux(s4_pf_l2_valid, s4_pf_l2_bits, s5_pf_l3_bits)
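  // the shared l2_l3 channel favors L2: when an L2 and an L3 request are valid in the
  // same cycle, the L2 request is sent and that cycle's L3 request is dropped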

  XSPerfAccumulate("s4_pf_sent", s4_pf_l2_valid)
  XSPerfAccumulate("s5_pf_sent", !s4_pf_l2_valid && s5_pf_l3_valid)
  XSPerfAccumulate("pf_sent", PopCount(Seq(io.l1_prefetch_req.valid, io.l2_l3_prefetch_req.valid)))

  // Stride lookup starts here
  // S0: Stride sends the lookup req
  val s0_lookup_valid = io.stream_lookup_req.valid
  val s0_lookup_vaddr = io.stream_lookup_req.bits.vaddr
  val s0_lookup_tag = get_region_tag(s0_lookup_vaddr)
  // S1: match
  val s1_lookup_valid = GatedValidRegNext(s0_lookup_valid)
  val s1_lookup_tag = RegEnable(s0_lookup_tag, s0_lookup_valid)
  val s1_lookup_tag_match_vec = array.map(_.tag_match(s1_lookup_tag))
  val s1_lookup_hit = VecInit(s1_lookup_tag_match_vec).asUInt.orR
  val s1_lookup_index = OHToUInt(VecInit(s1_lookup_tag_match_vec))
  // S2: read active out
  val s2_lookup_valid = GatedValidRegNext(s1_lookup_valid)
  val s2_lookup_hit = RegEnable(s1_lookup_hit, s1_lookup_valid)
  val s2_lookup_index = RegEnable(s1_lookup_index, s1_lookup_valid)
  val s2_lookup_active = array(s2_lookup_index).active
  // S3: send back to Stride
  val s3_lookup_valid = GatedValidRegNext(s2_lookup_valid)
  val s3_lookup_hit = RegEnable(s2_lookup_hit, s2_lookup_valid)
  val s3_lookup_active = RegEnable(s2_lookup_active, s2_lookup_valid)
  io.stream_lookup_resp := s3_lookup_valid && s3_lookup_hit && s3_lookup_active
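  // respond only for hits on *active* streams; Stride presumably uses this to avoid
  // issuing redundant prefetches for access patterns the stream prefetcher already covers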

  // reset meta to avoid the multi-hit problem
  for(i <- 0 until BIT_VEC_ARRAY_SIZE) {
    when(reset.asBool || GatedValidRegNext(io.flush)) {
      array(i).reset(i)
    }
  }

  XSPerfHistogram("bit_vector_active", PopCount(VecInit(array.map(_.active)).asUInt), true.B, 0, BIT_VEC_ARRAY_SIZE, 1)
  XSPerfHistogram("bit_vector_decr_mode", PopCount(VecInit(array.map(_.decr_mode)).asUInt), true.B, 0, BIT_VEC_ARRAY_SIZE, 1)
  XSPerfAccumulate("hash_conflict", s0_valid && s2_valid && (s0_region_tag =/= s2_region_tag) && (region_hash_tag(s0_region_tag) === region_hash_tag(s2_region_tag)))
}
423}