xref: /aosp_15_r20/external/perfetto/src/trace_processor/sorter/trace_token_buffer.cc (revision 6dbdd20afdafa5e3ca9b8809fa73465d530080dc)
/*
 * Copyright (C) 2023 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/trace_processor/sorter/trace_token_buffer.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <functional>
#include <limits>
#include <optional>
#include <utility>

#include "perfetto/base/compiler.h"
#include "perfetto/base/logging.h"
#include "perfetto/trace_processor/ref_counted.h"
#include "perfetto/trace_processor/trace_blob.h"
#include "perfetto/trace_processor/trace_blob_view.h"
#include "src/trace_processor/importers/common/parser_types.h"
#include "src/trace_processor/importers/proto/packet_sequence_state_generation.h"
#include "src/trace_processor/util/bump_allocator.h"

namespace perfetto::trace_processor {
namespace {

struct alignas(8) TrackEventDataDescriptor {
  static constexpr uint8_t kMaxOffsetFromInternedBlobBits = 25;
  static constexpr uint32_t kMaxOffsetFromInternedBlob =
      (1Ul << kMaxOffsetFromInternedBlobBits) - 1;

  static constexpr uint8_t kMaxExtraCountersBits = 4;
  static constexpr uint8_t kMaxExtraCounters = (1 << kMaxExtraCountersBits) - 1;
  static_assert(TrackEventData::kMaxNumExtraCounters <= kMaxExtraCounters,
                "Counter bits must be able to fit TrackEventData counters");

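  // Indices of the interned TraceBlob and PacketSequenceStateGeneration in the
  // per-chunk interning vectors, the offset of the packet relative to the
  // interned blob entry, and presence bits/counts for the optional fields of
  // TrackEventData.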
  uint16_t intern_blob_index;
  uint16_t intern_seq_index;
  uint32_t intern_blob_offset : kMaxOffsetFromInternedBlobBits;
  uint32_t has_thread_timestamp : 1;
  uint32_t has_thread_instruction_count : 1;
  uint32_t has_counter_value : 1;
  uint32_t extra_counter_count : kMaxExtraCountersBits;
};
static_assert(sizeof(TrackEventDataDescriptor) == 8,
              "TrackEventDataDescriptor must be small");
static_assert(alignof(TrackEventDataDescriptor) == 8,
              "TrackEventDataDescriptor must be 8-aligned");

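// Moves a value of type T out of the byte stream at |*ptr|, destroys the
// original in place and advances |*ptr| past it.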
template <typename T>
T ExtractFromPtr(uint8_t** ptr) {
  T* typed_ptr = reinterpret_cast<T*>(*ptr);
  T value(std::move(*typed_ptr));
  typed_ptr->~T();
  *ptr += sizeof(T);
  return value;
}

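// Placement-news |value| into the byte stream at |ptr| and returns a pointer
// one past the written object.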
template <typename T>
uint8_t* AppendToPtr(uint8_t* ptr, T value) {
  new (ptr) T(std::move(value));
  return ptr + sizeof(T);
}

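// Returns the number of bytes needed to store the descriptor itself, the
// packet size and one 8-byte slot for every optional/extra-counter value
// marked as present in |desc|.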
uint32_t GetAllocSize(const TrackEventDataDescriptor& desc) {
  uint32_t alloc_size = sizeof(TrackEventDataDescriptor);
  alloc_size += sizeof(uint64_t);
  alloc_size += desc.has_thread_instruction_count * sizeof(int64_t);
  alloc_size += desc.has_thread_timestamp * sizeof(int64_t);
  alloc_size += desc.has_counter_value * sizeof(double);
  alloc_size += desc.extra_counter_count * sizeof(double);
  return alloc_size;
}

}  // namespace

TraceTokenBuffer::Id TraceTokenBuffer::Append(TrackEventData ted) {
  // TrackEventData (and TracePacketData) are two of the biggest contributors
  // to the peak memory usage of the sorter. The main reasons for this are
  // a) object padding and b) using more bits than necessary to store their
  // contents.
  //
  // The purpose of this function is to "compress" the contents of
  // TrackEventData by utilising techniques like bitpacking, interning and
  // variable length encoding, so that only the data which really needs to be
  // kept is stored.
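  //
  // The resulting layout of each allocation is:
  //   [TrackEventDataDescriptor][packet size: uint64_t]
  //   [thread_instruction_count: int64_t]? [thread_timestamp: int64_t]?
  //   [counter_value: double]? [extra_counter_values: double x count]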

  // Compress all the booleans indicating the presence of a value into 4 bits
  // instead of the 4 bytes they would take inside std::optional.
  TrackEventDataDescriptor desc;
  desc.has_thread_instruction_count = ted.thread_instruction_count.has_value();
  desc.has_thread_timestamp = ted.thread_timestamp.has_value();
  desc.has_counter_value = std::not_equal_to<double>()(ted.counter_value, 0);
  desc.extra_counter_count = ted.CountExtraCounterValues();

  // Allocate enough memory using the BumpAllocator to store the data in |ted|.
  // Also figure out the interned index.
  BumpAllocator::AllocId alloc_id =
      AllocAndResizeInternedVectors(GetAllocSize(desc));
  InternedIndex interned_index = GetInternedIndex(alloc_id);

  // Compute the interning information for the TraceBlob and the SequenceState.
  TracePacketData& tpd = ted.trace_packet_data;
  desc.intern_blob_offset = InternTraceBlob(interned_index, tpd.packet);
  desc.intern_blob_index =
      static_cast<uint16_t>(interned_blobs_.at(interned_index).size() - 1);
  desc.intern_seq_index =
      InternSeqState(interned_index, std::move(tpd.sequence_state));

  // Store the descriptor.
  uint8_t* ptr = static_cast<uint8_t*>(allocator_.GetPointer(alloc_id));
  ptr = AppendToPtr(ptr, desc);

  // Store the packet size.
  uint64_t packet_size = static_cast<uint64_t>(tpd.packet.size());
  ptr = AppendToPtr(ptr, packet_size);

  // Add the "optional" fields of TrackEventData based on whether or not they
  // are non-null.
  if (desc.has_thread_instruction_count) {
    ptr = AppendToPtr(ptr, ted.thread_instruction_count.value());
  }
  if (desc.has_thread_timestamp) {
    ptr = AppendToPtr(ptr, ted.thread_timestamp.value());
  }
  if (desc.has_counter_value) {
    ptr = AppendToPtr(ptr, ted.counter_value);
  }
  for (uint32_t i = 0; i < desc.extra_counter_count; ++i) {
    ptr = AppendToPtr(ptr, ted.extra_counter_values[i]);
  }
  return Id{alloc_id};
}

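// Reverses Append(): reads back the descriptor, packet size and optional
// fields in the order they were written, rebuilds the TraceBlobView and
// sequence state from the interned entries and frees the allocation.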
template <>
TrackEventData TraceTokenBuffer::Extract<TrackEventData>(Id id) {
  uint8_t* ptr = static_cast<uint8_t*>(allocator_.GetPointer(id.alloc_id));
  TrackEventDataDescriptor desc =
      ExtractFromPtr<TrackEventDataDescriptor>(&ptr);
  uint64_t packet_size = ExtractFromPtr<uint64_t>(&ptr);

  InternedIndex interned_index = GetInternedIndex(id.alloc_id);
  BlobWithOffset& bwo =
      interned_blobs_.at(interned_index)[desc.intern_blob_index];
  TraceBlobView tbv(RefPtr<TraceBlob>::FromReleasedUnsafe(bwo.blob),
                    bwo.offset_in_blob + desc.intern_blob_offset,
                    static_cast<uint32_t>(packet_size));
  auto seq = RefPtr<PacketSequenceStateGeneration>::FromReleasedUnsafe(
      interned_seqs_.at(interned_index)[desc.intern_seq_index]);

  TrackEventData ted{std::move(tbv), std::move(seq)};
  if (desc.has_thread_instruction_count) {
    ted.thread_instruction_count = ExtractFromPtr<int64_t>(&ptr);
  }
  if (desc.has_thread_timestamp) {
    ted.thread_timestamp = ExtractFromPtr<int64_t>(&ptr);
  }
  if (desc.has_counter_value) {
    ted.counter_value = ExtractFromPtr<double>(&ptr);
  }
  for (uint32_t i = 0; i < desc.extra_counter_count; ++i) {
    ted.extra_counter_values[i] = ExtractFromPtr<double>(&ptr);
  }
  allocator_.Free(id.alloc_id);
  return ted;
}

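// Interns the TraceBlob backing |tbv| for the chunk identified by
// |interned_index| and returns the offset of |tbv| relative to the interned
// BlobWithOffset entry.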
uint32_t TraceTokenBuffer::InternTraceBlob(InternedIndex interned_index,
                                           const TraceBlobView& tbv) {
  BlobWithOffsets& blobs = interned_blobs_.at(interned_index);
  if (blobs.empty()) {
    return AddTraceBlob(interned_index, tbv);
  }

  BlobWithOffset& last_blob = blobs.back();
  if (last_blob.blob != tbv.blob().get()) {
    return AddTraceBlob(interned_index, tbv);
  }
  PERFETTO_CHECK(last_blob.offset_in_blob <= tbv.offset());

  // To keep the stored offsets small, we intern not only the TraceBlob pointer
  // but also the offset. With this double indirection we only need to store the
  // offset relative to the interned entry, at the cost of interning blobs here
  // more often: this more than pays for itself as in the majority of cases the
  // relative offsets are small anyway.
  size_t rel_offset = tbv.offset() - last_blob.offset_in_blob;
  if (rel_offset > TrackEventDataDescriptor::kMaxOffsetFromInternedBlob) {
    return AddTraceBlob(interned_index, tbv);
  }

  // Intentionally "leak" this pointer. This essentially keeps the refcount
  // of this TraceBlob one higher than the number of RefPtrs pointing to it,
  // which allows us to avoid storing the same RefPtr n times.
  //
  // Calls to this function are paired with Extract<TrackEventData>, which
  // picks up this "leaked" pointer.
  TraceBlob* leaked = tbv.blob().ReleaseUnsafe();
  base::ignore_result(leaked);
  return static_cast<uint32_t>(rel_offset);
}

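// Interns |ptr| for the chunk identified by |interned_index| and returns its
// index in the per-chunk vector of sequence states.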
uint16_t TraceTokenBuffer::InternSeqState(
    InternedIndex interned_index,
    RefPtr<PacketSequenceStateGeneration> ptr) {
  // Look back at most 32 elements. This should be far enough in most cases
  // unless either: a) we are essentially round-robining between >32 sequences
  // or b) we are churning through generations. Either case seems pathological.
  SequenceStates& states = interned_seqs_.at(interned_index);
  size_t lookback = std::min<size_t>(32u, states.size());
  for (uint32_t i = 0; i < lookback; ++i) {
    uint16_t idx = static_cast<uint16_t>(states.size() - 1 - i);
    if (states[idx] == ptr.get()) {
      // Intentionally "leak" this pointer. See |InternTraceBlob| for an
      // explanation.
      PacketSequenceStateGeneration* leaked = ptr.ReleaseUnsafe();
      base::ignore_result(leaked);
      return idx;
    }
  }
  states.emplace_back(ptr.ReleaseUnsafe());
  PERFETTO_CHECK(states.size() <= std::numeric_limits<uint16_t>::max());
  return static_cast<uint16_t>(states.size() - 1);
}

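// Appends a new BlobWithOffset entry for |tbv|, taking over one reference to
// the underlying TraceBlob; the offset of |tbv| relative to this new entry is
// always 0.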
uint32_t TraceTokenBuffer::AddTraceBlob(InternedIndex interned_index,
                                        const TraceBlobView& tbv) {
  BlobWithOffsets& blobs = interned_blobs_.at(interned_index);
  blobs.emplace_back(BlobWithOffset{tbv.blob().ReleaseUnsafe(), tbv.offset()});
  PERFETTO_CHECK(blobs.size() <= std::numeric_limits<uint16_t>::max());
  return 0u;
}

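// Drops fully-freed chunks from the front of the allocator together with the
// corresponding entries of the interning vectors.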
void TraceTokenBuffer::FreeMemory() {
  uint64_t erased = allocator_.EraseFrontFreeChunks();
  PERFETTO_CHECK(erased <= std::numeric_limits<size_t>::max());
  interned_blobs_.erase_front(static_cast<size_t>(erased));
  interned_seqs_.erase_front(static_cast<size_t>(erased));
  PERFETTO_CHECK(interned_blobs_.size() == interned_seqs_.size());
}

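// Allocates |size| bytes from the bump allocator and ensures the interning
// vectors have exactly one entry per live allocator chunk.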
BumpAllocator::AllocId TraceTokenBuffer::AllocAndResizeInternedVectors(
    uint32_t size) {
  uint64_t erased = allocator_.erased_front_chunks_count();
  BumpAllocator::AllocId alloc_id = allocator_.Alloc(size);
  uint64_t allocator_chunks_size = alloc_id.chunk_index - erased + 1;

  // The allocator should never "remove" chunks from being tracked.
  PERFETTO_DCHECK(allocator_chunks_size >= interned_blobs_.size());

  // A single allocation should add at most one chunk to the allocator.
  uint64_t chunks_added = allocator_chunks_size - interned_blobs_.size();
  PERFETTO_DCHECK(chunks_added <= 1);
  PERFETTO_DCHECK(interned_blobs_.size() == interned_seqs_.size());
  for (uint64_t i = 0; i < chunks_added; ++i) {
    interned_blobs_.emplace_back();
    interned_seqs_.emplace_back();
  }
  return alloc_id;
}

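// Maps the chunk index of |alloc_id| to an index into the interning vectors,
// accounting for chunks which have been erased from the front.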
TraceTokenBuffer::InternedIndex TraceTokenBuffer::GetInternedIndex(
    BumpAllocator::AllocId alloc_id) {
  uint64_t interned_index =
      alloc_id.chunk_index - allocator_.erased_front_chunks_count();
  PERFETTO_DCHECK(interned_index <= std::numeric_limits<size_t>::max());
  PERFETTO_DCHECK(interned_index < interned_blobs_.size());
  PERFETTO_DCHECK(interned_index < interned_seqs_.size());
  PERFETTO_DCHECK(interned_blobs_.size() == interned_seqs_.size());
  return static_cast<size_t>(interned_index);
}

}  // namespace perfetto::trace_processor