xref: /aosp_15_r20/external/perfetto/src/trace_processor/containers/string_pool.h (revision 6dbdd20afdafa5e3ca9b8809fa73465d530080dc)
1 /*
2  * Copyright (C) 2019 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef SRC_TRACE_PROCESSOR_CONTAINERS_STRING_POOL_H_
18 #define SRC_TRACE_PROCESSOR_CONTAINERS_STRING_POOL_H_
19 
20 #include <cstddef>
21 #include <cstdint>
22 #include <functional>
23 #include <limits>
24 #include <memory>
25 #include <optional>
26 #include <string>
27 #include <utility>
28 #include <vector>
29 
30 #include "perfetto/base/logging.h"
31 #include "perfetto/ext/base/flat_hash_map.h"
32 #include "perfetto/ext/base/hash.h"
33 #include "perfetto/ext/base/paged_memory.h"
34 #include "perfetto/ext/base/string_view.h"
35 #include "perfetto/protozero/proto_utils.h"
36 #include "src/trace_processor/containers/null_term_string_view.h"
37 
38 namespace perfetto::trace_processor {
39 
40 // Interns strings in a string pool and hands out compact StringIds which can
41 // be used to retrieve the string in O(1).
42 class StringPool {
43  public:
44   struct Id {
45     Id() = default;
46 
47     constexpr bool operator==(const Id& other) const { return other.id == id; }
48     constexpr bool operator!=(const Id& other) const {
49       return !(other == *this);
50     }
51     constexpr bool operator<(const Id& other) const { return id < other.id; }
52 
is_nullId53     constexpr bool is_null() const { return id == 0u; }
54 
is_large_stringId55     constexpr bool is_large_string() const {
56       return id & kLargeStringFlagBitMask;
57     }
58 
block_offsetId59     constexpr uint32_t block_offset() const { return id & kBlockOffsetBitMask; }
60 
block_indexId61     constexpr uint32_t block_index() const {
62       return (id & kBlockIndexBitMask) >> kNumBlockOffsetBits;
63     }
64 
large_string_indexId65     constexpr uint32_t large_string_index() const {
66       PERFETTO_DCHECK(is_large_string());
67       return id & ~kLargeStringFlagBitMask;
68     }
69 
raw_idId70     constexpr uint32_t raw_id() const { return id; }
71 
LargeStringId72     static constexpr Id LargeString(size_t index) {
73       PERFETTO_DCHECK(index <= static_cast<uint32_t>(index));
74       PERFETTO_DCHECK(!(index & kLargeStringFlagBitMask));
75       return Id(kLargeStringFlagBitMask | static_cast<uint32_t>(index));
76     }
77 
BlockStringId78     static constexpr Id BlockString(size_t index, uint32_t offset) {
79       PERFETTO_DCHECK(index < (1u << (kNumBlockIndexBits + 1)));
80       PERFETTO_DCHECK(offset < (1u << (kNumBlockOffsetBits + 1)));
81       return Id(~kLargeStringFlagBitMask &
82                 (static_cast<uint32_t>(index << kNumBlockOffsetBits) |
83                  (offset & kBlockOffsetBitMask)));
84     }
85 
RawId86     static constexpr Id Raw(uint32_t raw) { return Id(raw); }
87 
NullId88     static constexpr Id Null() { return Id(0u); }
89 
90    private:
IdId91     constexpr explicit Id(uint32_t i) : id(i) {}
92 
93     uint32_t id;
94   };
95 
96   // Iterator over the strings in the pool.
97   class Iterator {
98    public:
99     explicit Iterator(const StringPool*);
100 
101     explicit operator bool() const;
102     Iterator& operator++();
103 
104     NullTermStringView StringView();
105     Id StringId();
106 
107    private:
108     const StringPool* pool_ = nullptr;
109     uint32_t block_index_ = 0;
110     uint32_t block_offset_ = 0;
111     uint32_t large_strings_index_ = 0;
112   };
113 
114   StringPool();
115   ~StringPool();
116 
117   // Allow std::move().
118   StringPool(StringPool&&) noexcept;
119   StringPool& operator=(StringPool&&) noexcept;
120 
121   // Disable implicit copy.
122   StringPool(const StringPool&) = delete;
123   StringPool& operator=(const StringPool&) = delete;
124 
InternString(base::StringView str)125   Id InternString(base::StringView str) {
126     if (str.data() == nullptr)
127       return Id::Null();
128 
129     auto hash = str.Hash();
130 
131     // Perform a hashtable insertion with a null ID just to check if the string
132     // is already inserted. If it's not, overwrite 0 with the actual Id.
133     auto it_and_inserted = string_index_.Insert(hash, Id());
134     Id* id = it_and_inserted.first;
135     if (!it_and_inserted.second) {
136       PERFETTO_DCHECK(Get(*id) == str);
137       return *id;
138     }
139     *id = InsertString(str, hash);
140     return *id;
141   }
142 
GetId(base::StringView str)143   std::optional<Id> GetId(base::StringView str) const {
144     if (str.data() == nullptr)
145       return Id::Null();
146 
147     auto hash = str.Hash();
148     Id* id = string_index_.Find(hash);
149     if (id) {
150       PERFETTO_DCHECK(Get(*id) == str);
151       return *id;
152     }
153     return std::nullopt;
154   }
155 
Get(Id id)156   NullTermStringView Get(Id id) const {
157     if (id.is_null())
158       return NullTermStringView();
159     if (id.is_large_string())
160       return GetLargeString(id);
161     return GetFromBlockPtr(IdToPtr(id));
162   }
163 
CreateIterator()164   Iterator CreateIterator() const { return Iterator(this); }
165 
size()166   size_t size() const { return string_index_.size(); }
167 
168   // Maximum Id of a small (not large) string in the string pool.
MaxSmallStringId()169   StringPool::Id MaxSmallStringId() const {
170     return Id::BlockString(blocks_.size() - 1, blocks_.back().pos());
171   }
172 
173   // Returns whether there is at least one large string in a string pool
HasLargeString()174   bool HasLargeString() const { return !large_strings_.empty(); }
175 
176  private:
177   using StringHash = uint64_t;
178 
179   struct Block {
BlockBlock180     explicit Block(size_t size)
181         : mem_(base::PagedMemory::Allocate(size,
182                                            base::PagedMemory::kDontCommit)),
183           size_(size) {}
184     ~Block() = default;
185 
186     // Allow std::move().
187     Block(Block&&) noexcept = default;
188     Block& operator=(Block&&) = default;
189 
190     // Disable implicit copy.
191     Block(const Block&) = delete;
192     Block& operator=(const Block&) = delete;
193 
GetBlock194     uint8_t* Get(uint32_t offset) const {
195       return static_cast<uint8_t*>(mem_.Get()) + offset;
196     }
197 
198     std::pair<bool /*success*/, uint32_t /*offset*/> TryInsert(
199         base::StringView str);
200 
OffsetOfBlock201     uint32_t OffsetOf(const uint8_t* ptr) const {
202       PERFETTO_DCHECK(Get(0) < ptr &&
203                       ptr <= Get(static_cast<uint32_t>(size_ - 1)));
204       return static_cast<uint32_t>(ptr - Get(0));
205     }
206 
posBlock207     uint32_t pos() const { return pos_; }
208 
209    private:
210     base::PagedMemory mem_;
211     uint32_t pos_ = 0;
212     size_t size_ = 0;
213   };
214 
215   friend class Iterator;
216   friend class StringPoolTest;
217 
218   // StringPool IDs are 32-bit. If the MSB is 1, the remaining bits of the ID
219   // are an index into the |large_strings_| vector. Otherwise, the next 6 bits
220   // are the index of the Block in the pool, and the remaining 25 bits the
221   // offset of the encoded string inside the pool.
222   //
223   // [31] [30:25] [24:0]
224   //  |      |       |
225   //  |      |       +---- offset in block (or LSB of large string index).
226   //  |      +------------ block index (or MSB of large string index).
227   //  +------------------- 1: large string, 0: string in a Block.
228   static constexpr size_t kNumBlockIndexBits = 6;
229   static constexpr size_t kNumBlockOffsetBits = 25;
230 
231   static constexpr size_t kLargeStringFlagBitMask = 1u << 31;
232   static constexpr size_t kBlockOffsetBitMask = (1u << kNumBlockOffsetBits) - 1;
233   static constexpr size_t kBlockIndexBitMask =
234       0xffffffff & ~kLargeStringFlagBitMask & ~kBlockOffsetBitMask;
235 
236   static constexpr size_t kBlockSizeBytes = kBlockOffsetBitMask + 1;  // 32 MB
237 
238   // If a string doesn't fit into the current block, we can either start a new
239   // block or insert the string into the |large_strings_| vector. To maximize
240   // the used proportion of each block's memory, we only start a new block if
241   // the string isn't very large.
242   static constexpr size_t kMinLargeStringSizeBytes = kBlockSizeBytes / 8;
243 
244   // Number of bytes to reserve for size and null terminator.
245   // This is the upper limit on metadata size: 5 bytes for max uint32,
246   // plus 1 byte for null terminator. The actual size may be lower.
247   static constexpr uint8_t kMaxMetadataSize = 6;
248 
249   // Inserts the string with the given hash into the pool and return its Id.
250   Id InsertString(base::StringView, uint64_t hash);
251 
252   // Insert a large string into the pool and return its Id.
253   Id InsertLargeString(base::StringView, uint64_t hash);
254 
255   // The returned pointer points to the start of the string metadata (i.e. the
256   // first byte of the size).
IdToPtr(Id id)257   const uint8_t* IdToPtr(Id id) const {
258     // If the MSB is set, the ID represents an index into |large_strings_|, so
259     // shouldn't be converted into a block pointer.
260     PERFETTO_DCHECK(!id.is_large_string());
261 
262     size_t block_index = id.block_index();
263     uint32_t block_offset = id.block_offset();
264 
265     PERFETTO_DCHECK(block_index < blocks_.size());
266     PERFETTO_DCHECK(block_offset < blocks_[block_index].pos());
267 
268     return blocks_[block_index].Get(block_offset);
269   }
270 
271   // |ptr| should point to the start of the string metadata (i.e. the first byte
272   // of the size).
273   // Returns pointer to the start of the string.
ReadSize(const uint8_t * ptr,uint32_t * size)274   static const uint8_t* ReadSize(const uint8_t* ptr, uint32_t* size) {
275     uint64_t value = 0;
276     const uint8_t* str_ptr = protozero::proto_utils::ParseVarInt(
277         ptr, ptr + kMaxMetadataSize, &value);
278     PERFETTO_DCHECK(str_ptr != ptr);
279     PERFETTO_DCHECK(value < std::numeric_limits<uint32_t>::max());
280     *size = static_cast<uint32_t>(value);
281     return str_ptr;
282   }
283 
284   // |ptr| should point to the start of the string metadata (i.e. the first byte
285   // of the size).
GetFromBlockPtr(const uint8_t * ptr)286   static NullTermStringView GetFromBlockPtr(const uint8_t* ptr) {
287     uint32_t size = 0;
288     const uint8_t* str_ptr = ReadSize(ptr, &size);
289     return {reinterpret_cast<const char*>(str_ptr), size};
290   }
291 
292   // Lookup a string in the |large_strings_| vector. |id| should have the MSB
293   // set.
GetLargeString(Id id)294   NullTermStringView GetLargeString(Id id) const {
295     PERFETTO_DCHECK(id.is_large_string());
296     size_t index = id.large_string_index();
297     PERFETTO_DCHECK(index < large_strings_.size());
298     const std::string* str = large_strings_[index].get();
299     return {str->c_str(), str->size()};
300   }
301 
302   // The actual memory storing the strings.
303   std::vector<Block> blocks_;
304 
305   // Any string that is too large to fit into a Block is stored separately
306   // (inside a unique_ptr to ensure any references to it remain valid even if
307   // |large_strings_| is resized).
308   std::vector<std::unique_ptr<std::string>> large_strings_;
309 
310   // Maps hashes of strings to the Id in the string pool.
311   base::FlatHashMap<StringHash,
312                     Id,
313                     base::AlreadyHashed<StringHash>,
314                     base::LinearProbe,
315                     /*AppendOnly=*/true>
316       string_index_{/*initial_capacity=*/4096u};
317 };
318 
319 }  // namespace perfetto::trace_processor
320 
321 template <>
322 struct std::hash<::perfetto::trace_processor::StringPool::Id> {
323   using argument_type = ::perfetto::trace_processor::StringPool::Id;
324   using result_type = size_t;
325 
326   result_type operator()(const argument_type& r) const {
327     return std::hash<uint32_t>{}(r.raw_id());
328   }
329 };
330 
331 #endif  // SRC_TRACE_PROCESSOR_CONTAINERS_STRING_POOL_H_
332