xref: /aosp_15_r20/external/perfetto/src/trace_processor/containers/bit_vector.cc (revision 6dbdd20afdafa5e3ca9b8809fa73465d530080dc)
1 /*
2  * Copyright (C) 2019 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/trace_processor/containers/bit_vector.h"
18 
19 #include <algorithm>
20 #include <cstddef>
21 #include <cstdint>
22 #include <cstring>
23 #include <initializer_list>
24 #include <limits>
25 #include <utility>
26 #include <vector>
27 
28 #include "perfetto/base/build_config.h"
29 #include "perfetto/base/compiler.h"
30 #include "perfetto/base/logging.h"
31 #include "perfetto/public/compiler.h"
32 
33 #include "protos/perfetto/trace_processor/serialization.pbzero.h"
34 
35 #if PERFETTO_BUILDFLAG(PERFETTO_X64_CPU_OPT)
36 #include <immintrin.h>
37 #endif
38 
39 namespace perfetto::trace_processor {
40 namespace {
41 
42 // This function implements the PDEP instruction in x64 as a loop.
43 // See https://www.felixcloutier.com/x86/pdep for details on what PDEP does.
44 //
45 // Unfortunately, as we're emulating this in software, it scales with the number
46 // of set bits in |mask| rather than being a constant time instruction:
47 // therefore, this should be avoided where real instructions are available.
PdepSlow(uint64_t word,uint64_t mask)48 PERFETTO_ALWAYS_INLINE uint64_t PdepSlow(uint64_t word, uint64_t mask) {
49   if (word == 0 || mask == std::numeric_limits<uint64_t>::max())
50     return word;
51 
52   // This algorithm is for calculating PDEP was found to be the fastest "simple"
53   // one among those tested when writing this function.
54   uint64_t result = 0;
55   for (uint64_t bb = 1; mask; bb += bb) {
56     if (word & bb) {
57       // MSVC doesn't like -mask so work around this by doing 0 - mask.
58       result |= mask & (0ull - mask);
59     }
60     mask &= mask - 1;
61   }
62   return result;
63 }
64 
65 // See |PdepSlow| for information on PDEP.
Pdep(uint64_t word,uint64_t mask)66 PERFETTO_ALWAYS_INLINE uint64_t Pdep(uint64_t word, uint64_t mask) {
67 #if PERFETTO_BUILDFLAG(PERFETTO_X64_CPU_OPT)
68   base::ignore_result(PdepSlow);
69   return _pdep_u64(word, mask);
70 #else
71   return PdepSlow(word, mask);
72 #endif
73 }
74 
75 // This function implements the PEXT instruction in x64 as a loop.
76 // See https://www.felixcloutier.com/x86/pext for details on what PEXT does.
77 //
78 // Unfortunately, as we're emulating this in software, it scales with the number
79 // of set bits in |mask| rather than being a constant time instruction:
80 // therefore, this should be avoided where real instructions are available.
PextSlow(uint64_t word,uint64_t mask)81 PERFETTO_ALWAYS_INLINE uint64_t PextSlow(uint64_t word, uint64_t mask) {
82   if (word == 0 || mask == std::numeric_limits<uint64_t>::max())
83     return word;
84 
85   // This algorithm is for calculating PEXT was found to be the fastest "simple"
86   // one among those tested when writing this function.
87   uint64_t result = 0;
88   for (uint64_t bb = 1; mask; bb += bb) {
89     // MSVC doesn't like -mask so work around this by doing 0 - mask.
90     if (word & mask & (0ull - mask)) {
91       result |= bb;
92     }
93     mask &= mask - 1;
94   }
95   return result;
96 }
97 
98 // See |PextSlow| for information on PEXT.
Pext(uint64_t word,uint64_t mask)99 PERFETTO_ALWAYS_INLINE uint64_t Pext(uint64_t word, uint64_t mask) {
100 #if PERFETTO_BUILDFLAG(PERFETTO_X64_CPU_OPT)
101   base::ignore_result(PextSlow);
102   return _pext_u64(word, mask);
103 #else
104   return PextSlow(word, mask);
105 #endif
106 }
107 
108 // This function implements the tzcnt instruction.
109 // See https://www.felixcloutier.com/x86/tzcnt for details on what tzcnt does.
Tzcnt(uint64_t value)110 PERFETTO_ALWAYS_INLINE uint32_t Tzcnt(uint64_t value) {
111 #if PERFETTO_BUILDFLAG(PERFETTO_X64_CPU_OPT)
112   return static_cast<uint32_t>(_tzcnt_u64(value));
113 #elif defined(__GNUC__) || defined(__clang__)
114   return value ? static_cast<uint32_t>(__builtin_ctzll(value)) : 64u;
115 #else
116   unsigned long out;
117   return _BitScanForward64(&out, value) ? static_cast<uint32_t>(out) : 64u;
118 #endif
119 }
120 
121 }  // namespace
122 
123 BitVector::BitVector() = default;
124 
BitVector(std::initializer_list<bool> init)125 BitVector::BitVector(std::initializer_list<bool> init) {
126   for (bool x : init) {
127     if (x) {
128       AppendTrue();
129     } else {
130       AppendFalse();
131     }
132   }
133 }
134 
BitVector(uint32_t count,bool value)135 BitVector::BitVector(uint32_t count, bool value) {
136   Resize(count, value);
137 }
138 
BitVector(std::vector<uint64_t> words,std::vector<uint32_t> counts,uint32_t size)139 BitVector::BitVector(std::vector<uint64_t> words,
140                      std::vector<uint32_t> counts,
141                      uint32_t size)
142     : size_(size), counts_(std::move(counts)), words_(std::move(words)) {
143   PERFETTO_CHECK(words_.size() % Block::kWords == 0);
144 }
145 
Resize(uint32_t new_size,bool filler)146 void BitVector::Resize(uint32_t new_size, bool filler) {
147   uint32_t old_size = size_;
148   if (new_size == old_size)
149     return;
150 
151   // Empty bitvectors should be memory efficient so we don't keep any data
152   // around in the bitvector.
153   if (new_size == 0) {
154     words_.clear();
155     counts_.clear();
156     size_ = 0;
157     return;
158   }
159 
160   // Compute the address of the new last bit in the bitvector.
161   Address last_addr = IndexToAddress(new_size - 1);
162   auto old_blocks_size = static_cast<uint32_t>(counts_.size());
163   uint32_t new_blocks_size = last_addr.block_idx + 1;
164 
165   // Resize the block and count vectors to have the correct number of entries.
166   words_.resize(Block::kWords * new_blocks_size);
167   counts_.resize(new_blocks_size);
168 
169   if (new_size > old_size) {
170     if (filler) {
171       // If the new space should be filled with ones, then set all the bits
172       // between the address of the old size and the new last address.
173       const Address& start = IndexToAddress(old_size);
174       Set(start, last_addr);
175 
176       // We then need to update the counts vector to match the changes we
177       // made to the blocks.
178 
179       // We start by adding the bits we set in the first block to the
180       // cummulative count before the range we changed.
181       Address end_of_block = {start.block_idx,
182                               {Block::kWords - 1, BitWord::kBits - 1}};
183       uint32_t count_in_block_after_end =
184           AddressToIndex(end_of_block) - AddressToIndex(start) + 1;
185       uint32_t set_count = CountSetBits() + count_in_block_after_end;
186 
187       for (uint32_t i = start.block_idx + 1; i <= last_addr.block_idx; ++i) {
188         // Set the count to the cummulative count so far.
189         counts_[i] = set_count;
190 
191         // Add a full block of set bits to the count.
192         set_count += Block::kBits;
193       }
194     } else {
195       // If the newly added bits are false, we just need to update the
196       // counts vector with the current size of the bitvector for all
197       // the newly added blocks.
198       if (new_blocks_size > old_blocks_size) {
199         uint32_t count = CountSetBits();
200         for (uint32_t i = old_blocks_size; i < new_blocks_size; ++i) {
201           counts_[i] = count;
202         }
203       }
204     }
205   } else {
206     // Throw away all the bits after the new last bit. We do this to make
207     // future lookup, append and resize operations not have to worrying about
208     // trailing garbage bits in the last block.
209     BlockFromIndex(last_addr.block_idx).ClearAfter(last_addr.block_offset);
210   }
211 
212   // Actually update the size.
213   size_ = new_size;
214 }
215 
Copy() const216 BitVector BitVector::Copy() const {
217   return {words_, counts_, size_};
218 }
219 
Not()220 void BitVector::Not() {
221   if (size_ == 0) {
222     return;
223   }
224 
225   for (uint64_t& word : words_) {
226     BitWord(&word).Not();
227   }
228 
229   // Make sure to reset the last block's trailing bits to zero to preserve the
230   // invariant of BitVector.
231   Address last_addr = IndexToAddress(size_ - 1);
232   BlockFromIndex(last_addr.block_idx).ClearAfter(last_addr.block_offset);
233 
234   for (uint32_t i = 1; i < counts_.size(); ++i) {
235     counts_[i] = kBitsInBlock * i - counts_[i];
236   }
237 }
238 
Or(const BitVector & sec)239 void BitVector::Or(const BitVector& sec) {
240   PERFETTO_CHECK(size_ == sec.size());
241   for (uint32_t i = 0; i < words_.size(); ++i) {
242     BitWord(&words_[i]).Or(sec.words_[i]);
243   }
244   UpdateCounts(words_, counts_);
245 }
246 
And(const BitVector & sec)247 void BitVector::And(const BitVector& sec) {
248   Resize(std::min(size_, sec.size_));
249   for (uint32_t i = 0; i < words_.size(); ++i) {
250     BitWord(&words_[i]).And(sec.words_[i]);
251   }
252   UpdateCounts(words_, counts_);
253 }
254 
UpdateSetBits(const BitVector & update)255 void BitVector::UpdateSetBits(const BitVector& update) {
256   if (update.CountSetBits() == 0 || CountSetBits() == 0) {
257     *this = BitVector();
258     return;
259   }
260   PERFETTO_DCHECK(update.size() <= CountSetBits());
261 
262   // Get the start and end ptrs for the current bitvector.
263   // Safe because of the static_assert above.
264   uint64_t* ptr = words_.data();
265   const uint64_t* ptr_end = ptr + WordCount(size());
266 
267   // Get the start and end ptrs for the update bitvector.
268   // Safe because of the static_assert above.
269   const uint64_t* update_ptr = update.words_.data();
270   const uint64_t* update_ptr_end = update_ptr + WordCount(update.size());
271 
272   // |update_unused_bits| contains |unused_bits_count| bits at the bottom
273   // which indicates how the next |unused_bits_count| set bits in |this|
274   // should be changed. This is necessary because word boundaries in |this| will
275   // almost always *not* match the word boundaries in |update|.
276   uint64_t update_unused_bits = 0;
277   uint8_t unused_bits_count = 0;
278 
279   // The basic premise of this loop is, for each word in |this| we find
280   // enough bits from |update| to cover every set bit in the word. We then use
281   // the PDEP x64 instruction (or equivalent instructions/software emulation) to
282   // update the word and store it back in |this|.
283   for (; ptr != ptr_end; ++ptr) {
284     uint64_t current = *ptr;
285 
286     // If the current value is all zeros, there's nothing to update.
287     if (PERFETTO_UNLIKELY(current == 0))
288       continue;
289 
290     auto popcount = static_cast<uint8_t>(PERFETTO_POPCOUNT(current));
291     PERFETTO_DCHECK(popcount >= 1);
292 
293     // Check if we have enough unused bits from the previous iteration - if so,
294     // we don't need to read anything from |update|.
295     uint64_t update_for_current = update_unused_bits;
296     if (unused_bits_count >= popcount) {
297       // We have enough bits so just do the accounting to not reuse these bits
298       // for the future.
299       unused_bits_count -= popcount;
300       update_unused_bits = popcount == 64 ? 0 : update_unused_bits >> popcount;
301     } else {
302       // We don't have enough bits so we need to read the next word of bits from
303       // |current|.
304       uint64_t next_update = update_ptr == update_ptr_end ? 0 : *update_ptr++;
305 
306       // Bitwise or |64 - unused_bits_count| bits from the bottom of
307       // |next_update| to the top of |update_for_current|. Only |popcount| bits
308       // will actually be used by PDEP but masking off the unused bits takes
309       // *more* instructions than not doing anything.
310       update_for_current |= next_update << unused_bits_count;
311 
312       // PDEP will use |popcount| bits from update: this means it will use
313       // |unused_bits_count| from |update_for_current| and |popcount -
314       // unused_bits_count| from |next_update|
315       uint8_t used_next_bits = popcount - unused_bits_count;
316 
317       // Shift off any bits which will be used by current and store the
318       // remainder for use in the next iteration.
319       update_unused_bits =
320           used_next_bits == 64 ? 0 : next_update >> used_next_bits;
321       unused_bits_count = 64 - used_next_bits;
322     }
323 
324     // We should never end up with more than 64 bits available.
325     PERFETTO_CHECK(unused_bits_count <= 64);
326 
327     // PDEP precisely captures the notion of "updating set bits" for a single
328     // word.
329     *ptr = Pdep(update_for_current, current);
330   }
331 
332   // We shouldn't have any non-zero unused bits and we should have consumed the
333   // whole |update| bitvector. Note that we cannot really say anything about
334   // |unused_bits_count| because it's possible for the above algorithm to use
335   // some bits which are "past the end" of |update|; as long as these bits are
336   // zero, it meets the pre-condition of this function.
337   PERFETTO_DCHECK(update_unused_bits == 0);
338   PERFETTO_DCHECK(update_ptr == update_ptr_end);
339 
340   UpdateCounts(words_, counts_);
341 
342   // After the loop, we should have precisely the same number of bits
343   // set as |update|.
344   PERFETTO_DCHECK(update.CountSetBits() == CountSetBits());
345 }
346 
SelectBits(const BitVector & mask_bv)347 void BitVector::SelectBits(const BitVector& mask_bv) {
348   // Verify the precondition on the function: the algorithm relies on this
349   // being the case.
350   PERFETTO_DCHECK(size() <= mask_bv.size());
351 
352   // Get the set bits in the mask up to the end of |this|: this will precisely
353   // equal the number of bits in |this| at the end of this function.
354   uint32_t set_bits_in_mask = mask_bv.CountSetBits(size());
355 
356   const uint64_t* cur_word = words_.data();
357   const uint64_t* end_word = words_.data() + WordCount(size());
358   const uint64_t* cur_mask = mask_bv.words_.data();
359 
360   // Used to track the number of bits already set (i.e. by a previous loop
361   // iteration) in |out_word|.
362   uint32_t out_word_bits = 0;
363   uint64_t* out_word = words_.data();
364   for (; cur_word != end_word; ++cur_word, ++cur_mask) {
365     // Loop invariant: we should always have out_word and out_word_bits set
366     // such that there is room for at least one more bit.
367     PERFETTO_DCHECK(out_word_bits < 64);
368 
369     // The crux of this function: efficient parallel extract all bits in |this|
370     // which correspond to set bit positions in |this|.
371     uint64_t ext = Pext(*cur_word, *cur_mask);
372 
373     // If there are no bits in |out_word| from a previous iteration, set it to
374     // |ext|. Otherwise, concat the newly added bits to the top of the existing
375     // bits.
376     *out_word = out_word_bits == 0 ? ext : *out_word | (ext << out_word_bits);
377 
378     // Update the number of bits used in |out_word| by adding the number of set
379     // bit positions in |mask|.
380     auto popcount = static_cast<uint32_t>(PERFETTO_POPCOUNT(*cur_mask));
381     out_word_bits += popcount;
382 
383     // The below is a branch-free way to increment |out_word| pointer when we've
384     // packed 64 bits into it.
385     bool spillover = out_word_bits > 64;
386     out_word += out_word_bits >= 64;
387     out_word_bits %= 64;
388 
389     // If there was any "spillover" bits (i.e. bits which did not fit in the
390     // previous word), add them into the new out_word. Important: we *must* not
391     // change out_word if there was no spillover as |out_word| could be pointing
392     // to |data + 1| which needs to be preserved for the next loop iteration.
393     if (spillover) {
394       *out_word = ext >> (popcount - out_word_bits);
395     }
396   }
397 
398   // Loop post-condition: we must have written as many words as is required
399   // to store |set_bits_in_mask|.
400   PERFETTO_DCHECK(static_cast<uint32_t>(out_word - words_.data()) <=
401                   WordCount(set_bits_in_mask));
402 
403   // Resize the BitVector to equal to the number of elements in the  mask we
404   // calculated at the start of the loop.
405   Resize(set_bits_in_mask);
406 
407   // Fix up the counts to match the new values. The Resize above should ensure
408   // that a) the counts vector is correctly sized, b) the bits after
409   // |set_bits_in_mask| are cleared (allowing this count algortihm to be
410   // accurate).
411   UpdateCounts(words_, counts_);
412 }
413 
FromSortedIndexVector(const std::vector<int64_t> & indices)414 BitVector BitVector::FromSortedIndexVector(
415     const std::vector<int64_t>& indices) {
416   // The rest of the algorithm depends on |indices| being non empty.
417   if (indices.empty()) {
418     return {};
419   }
420 
421   // We are creating the smallest BitVector that can have all of the values
422   // from |indices| set. As we assume that |indices| is sorted, the size would
423   // be the last element + 1 and the last bit of the final BitVector will be
424   // set.
425   auto size = static_cast<uint32_t>(indices.back() + 1);
426 
427   uint32_t block_count = BlockCount(size);
428   std::vector<uint64_t> words(block_count * Block::kWords);
429   for (const int64_t i : indices) {
430     auto word_idx = static_cast<uint32_t>(i / kBitsInWord);
431     auto in_word_idx = static_cast<uint32_t>(i % kBitsInWord);
432     BitVector::BitWord(&words[word_idx]).Set(in_word_idx);
433   }
434 
435   std::vector<uint32_t> counts(block_count);
436   UpdateCounts(words, counts);
437   return {words, counts, size};
438 }
439 
FromUnsortedIndexVector(const std::vector<uint32_t> & indices)440 BitVector BitVector::FromUnsortedIndexVector(
441     const std::vector<uint32_t>& indices) {
442   // The rest of the algorithm depends on |indices| being non empty.
443   if (indices.empty()) {
444     return {};
445   }
446 
447   std::vector<uint64_t> words;
448   uint32_t max_idx = 0;
449   for (const uint32_t i : indices) {
450     auto word_idx = static_cast<uint32_t>(i / kBitsInWord);
451     max_idx = std::max(max_idx, i);
452     if (word_idx >= words.size()) {
453       words.resize(word_idx + 1);
454     }
455     auto in_word_idx = static_cast<uint32_t>(i % kBitsInWord);
456     BitVector::BitWord(&words[word_idx]).Set(in_word_idx);
457   }
458 
459   auto block_count = BlockCount(max_idx + 1);
460   words.resize(block_count * Block::kWords);
461   std::vector<uint32_t> counts(block_count);
462   UpdateCounts(words, counts);
463   return {words, counts, max_idx + 1};
464 }
465 
IntersectRange(uint32_t range_start,uint32_t range_end) const466 BitVector BitVector::IntersectRange(uint32_t range_start,
467                                     uint32_t range_end) const {
468   // We should skip all bits until the index of first set bit bigger than
469   // |range_start|.
470   uint32_t end_idx = std::min(range_end, size());
471 
472   if (range_start >= end_idx)
473     return {};
474 
475   Builder builder(end_idx, range_start);
476   uint32_t front_bits = builder.BitsUntilWordBoundaryOrFull();
477   uint32_t cur_index = range_start;
478   for (uint32_t i = 0; i < front_bits; ++i, ++cur_index) {
479     builder.Append(IsSet(cur_index));
480   }
481 
482   PERFETTO_DCHECK(cur_index == end_idx || cur_index % BitWord::kBits == 0);
483   uint32_t cur_words = cur_index / BitWord::kBits;
484   uint32_t full_words = builder.BitsInCompleteWordsUntilFull() / BitWord::kBits;
485   uint32_t total_full_words = cur_words + full_words;
486   for (; cur_words < total_full_words; ++cur_words) {
487     builder.AppendWord(words_[cur_words]);
488   }
489 
490   uint32_t last_bits = builder.BitsUntilFull();
491   cur_index += full_words * BitWord::kBits;
492   for (uint32_t i = 0; i < last_bits; ++i, ++cur_index) {
493     builder.Append(IsSet(cur_index));
494   }
495 
496   return std::move(builder).Build();
497 }
498 
GetSetBitIndices() const499 std::vector<uint32_t> BitVector::GetSetBitIndices() const {
500   uint32_t set_bits = CountSetBits();
501   if (set_bits == 0) {
502     return {};
503   }
504   std::vector<uint32_t> res(set_bits);
505 
506   // After measuring we discovered that not doing `push_back` creates a tangible
507   // performance improvement due to compiler unrolling the inner loop.
508   uint32_t res_idx = 0;
509   for (uint32_t i = 0; i < size_; i += BitWord::kBits) {
510     for (uint64_t word = words_[i / BitWord::kBits]; word; word &= word - 1) {
511       res[res_idx++] = i + Tzcnt(word);
512     }
513   }
514   return res;
515 }
516 
Serialize(protos::pbzero::SerializedColumn::BitVector * msg) const517 void BitVector::Serialize(
518     protos::pbzero::SerializedColumn::BitVector* msg) const {
519   msg->set_size(size_);
520   if (!counts_.empty()) {
521     msg->set_counts(reinterpret_cast<const uint8_t*>(counts_.data()),
522                     sizeof(uint32_t) * counts_.size());
523   }
524   if (!words_.empty()) {
525     msg->set_words(reinterpret_cast<const uint8_t*>(words_.data()),
526                    sizeof(uint64_t) * words_.size());
527   }
528 }
529 
530 // Deserialize BitVector from proto.
Deserialize(const protos::pbzero::SerializedColumn::BitVector::Decoder & bv_msg)531 void BitVector::Deserialize(
532     const protos::pbzero::SerializedColumn::BitVector::Decoder& bv_msg) {
533   size_ = bv_msg.size();
534   if (bv_msg.has_counts()) {
535     counts_.resize(
536         static_cast<size_t>(bv_msg.counts().size / sizeof(uint32_t)));
537     memcpy(counts_.data(), bv_msg.counts().data, bv_msg.counts().size);
538   } else {
539     counts_.clear();
540   }
541 
542   if (bv_msg.has_words()) {
543     words_.resize(static_cast<size_t>(bv_msg.words().size / sizeof(uint64_t)));
544     memcpy(words_.data(), bv_msg.words().data, bv_msg.words().size);
545   } else {
546     words_.clear();
547   }
548 }
549 
550 }  // namespace perfetto::trace_processor
551