1 /*
2  * Copyright (C) 2024 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/trace_processor/importers/instruments/instruments_xml_tokenizer.h"
18 
19 #include <cctype>
20 #include <map>
21 
22 #include <expat.h>
23 #include <stdint.h>
24 
25 #include "perfetto/base/status.h"
26 #include "perfetto/ext/base/status_or.h"
27 #include "perfetto/public/fnv1a.h"
28 #include "protos/perfetto/trace/clock_snapshot.pbzero.h"
29 #include "src/trace_processor/importers/common/clock_tracker.h"
30 #include "src/trace_processor/importers/common/stack_profile_tracker.h"
31 #include "src/trace_processor/importers/instruments/row.h"
32 #include "src/trace_processor/importers/instruments/row_data_tracker.h"
33 #include "src/trace_processor/sorter/trace_sorter.h"
34 
35 #if !PERFETTO_BUILDFLAG(PERFETTO_TP_INSTRUMENTS)
36 #error \
37     "This file should not be built when enable_perfetto_trace_processor_mac_instruments=false"
38 #endif
39 
40 namespace perfetto::trace_processor::instruments_importer {
41 
42 namespace {
43 
MakeTrimmed(const char * chars,int len)44 std::string MakeTrimmed(const char* chars, int len) {
45   while (len > 0 && std::isspace(*chars)) {
46     chars++;
47     len--;
48   }
49   while (len > 0 && std::isspace(chars[len - 1])) {
50     len--;
51   }
52   return std::string(chars, static_cast<size_t>(len));
53 }
54 
55 }  // namespace
56 
57 // The Instruments XML tokenizer reads instruments traces exported with:
58 //
59 //   xctrace export --input /path/to/profile.trace --xpath
60 //     '//trace-toc/run/data/table[@schema="os-signpost and
61 //        @category="PointsOfInterest"] |
62 //      //trace-toc/run/data/table[@schema="time-sample"]'
63 //
64 // This exports two tables:
65 //   1. Points of interest signposts
66 //   2. Time samples
67 //
68 // The first is used for clock synchronization -- perfetto emits signpost events
69 // during tracing which allow synchronization of the xctrace clock (relative to
70 // start of profiling) with the perfetto boottime clock. The second contains
71 // the samples themselves.
72 //
73 // The expected format of the rows in the clock sync table is:
74 //
75 //     <row>
76 //       <event-time>1234</event-time>
77 //       <subsystem>dev.perfetto.clock_sync</subsystem>
78 //       <os-log-metadata>
79 //         <uint64>5678</uint64>
80 //       </os-log-metadata>
81 //     </row>
82 //
83 // There may be other rows with other data (from other subsystems), and
84 // additional data in the row (such as thread data and other metadata) -- this
85 // can be safely ignored.
86 //
87 // The expected format of the rows in the time sample table is:
88 //
89 //     <row>
90 //       <sample-time>1234</sample-time>
91 //       <thread fmt="Thread name">
92 //         <tid>1</tid>
93 //         <process fmt="Process name">
94 //           <pid>1<pid>
95 //         </process>
96 //       </thread>
97 //       <core>0</core>
98 //       <backtrace>
99 //         <frame addr="0x120001234">
100 //           <binary
101 //             name="MyBinary" UUID="01234567-89ABC-CDEF-0123-456789ABCDEF"
102 //             load-addr="0x120000000" path="/path/to/MyBinary.app/MyBinary" />
103 //         </frame>
104 //         ... more frames ...
105 //     </row>
106 //
107 // Here we do not expect other rows with other data -- every row should have a
108 // backtrace, and we use the presence of a backtrace to distinguish time samples
109 // and clock sync eventst. However, there can be additional data in the row
110 // (such as other metadata) -- this can be safely ignored.
111 //
112 // In addition, the XML format annotates elements with ids, to later reuse the
113 // same data by id without needing to repeat its contents. For example, you
114 // might have thread data for a sample:
115 //
116 //     <thread id="11" fmt="My Thread"><tid id="12">10</tid>...</thread>
117 //
118 // and subsequent samples on that thread will simply have
119 //
120 //     <thread ref="11" />
121 //
122 // This means that most elements have to have their pertinent data cached by id,
123 // including any data store in child elements (which themselves also have to
124 // be cached by id, like the <tid> in the example above).
125 //
126 // This importer reads the XML data using a streaming XML parser, which means
127 // it has to maintain some parsing state (such as the current stack of tags, or
128 // the current element for which we are reading data).
129 class InstrumentsXmlTokenizer::Impl {
130  public:
Impl(TraceProcessorContext * context)131   explicit Impl(TraceProcessorContext* context)
132       : context_(context), data_(RowDataTracker::GetOrCreate(context_)) {
133     parser_ = XML_ParserCreate(nullptr);
134     XML_SetElementHandler(parser_, ElementStart, ElementEnd);
135     XML_SetCharacterDataHandler(parser_, CharacterData);
136     XML_SetUserData(parser_, this);
137 
138     const char* subsystem = "dev.perfetto.instruments_clock";
139     clock_ = static_cast<ClockTracker::ClockId>(
140         PerfettoFnv1a(subsystem, strlen(subsystem)) | 0x80000000);
141 
142     // Use the above clock if we can, in case there is no other trace and
143     // no clock sync events.
144     context_->clock_tracker->SetTraceTimeClock(clock_);
145   }
~Impl()146   ~Impl() { XML_ParserFree(parser_); }
147 
Parse(TraceBlobView view)148   base::Status Parse(TraceBlobView view) {
149     if (!XML_Parse(parser_, reinterpret_cast<const char*>(view.data()),
150                    static_cast<int>(view.length()), false)) {
151       return base::ErrStatus("XML parse error at line %lu: %s\n",
152                              XML_GetCurrentLineNumber(parser_),
153                              XML_ErrorString(XML_GetErrorCode(parser_)));
154     }
155     return base::OkStatus();
156   }
157 
End()158   base::Status End() {
159     if (!XML_Parse(parser_, nullptr, 0, true)) {
160       return base::ErrStatus("XML parse error at end, line %lu: %s\n",
161                              XML_GetCurrentLineNumber(parser_),
162                              XML_ErrorString(XML_GetErrorCode(parser_)));
163     }
164     return base::OkStatus();
165   }
166 
167  private:
ElementStart(void * data,const char * el,const char ** attr)168   static void ElementStart(void* data, const char* el, const char** attr) {
169     reinterpret_cast<Impl*>(data)->ElementStart(el, attr);
170   }
ElementEnd(void * data,const char * el)171   static void ElementEnd(void* data, const char* el) {
172     reinterpret_cast<Impl*>(data)->ElementEnd(el);
173   }
CharacterData(void * data,const char * chars,int len)174   static void CharacterData(void* data, const char* chars, int len) {
175     reinterpret_cast<Impl*>(data)->CharacterData(chars, len);
176   }
177 
ElementStart(const char * el,const char ** attrs)178   void ElementStart(const char* el, const char** attrs) {
179     tag_stack_.emplace_back(el);
180     std::string_view tag_name = tag_stack_.back();
181 
182     if (tag_name == "row") {
183       current_row_ = Row{};
184     } else if (tag_name == "thread") {
185       MaybeCachedRef<ThreadId> thread_lookup =
186           GetOrInsertByRef(attrs, thread_ref_to_thread_);
187       if (thread_lookup.is_new) {
188         auto new_thread = data_.NewThread();
189         thread_lookup.ref = new_thread.id;
190 
191         for (int i = 2; attrs[i]; i += 2) {
192           std::string key(attrs[i]);
193           if (key == "fmt") {
194             new_thread.ptr->fmt = InternString(attrs[i + 1]);
195           }
196         }
197 
198         current_new_thread_ = new_thread.id;
199       }
200       current_row_.thread = thread_lookup.ref;
201     } else if (tag_name == "process") {
202       MaybeCachedRef<ProcessId> process_lookup =
203           GetOrInsertByRef(attrs, process_ref_to_process_);
204       if (process_lookup.is_new) {
205         // Can only be processing a new process when processing a new thread.
206         PERFETTO_DCHECK(current_new_thread_ != kNullId);
207         auto new_process = data_.NewProcess();
208         process_lookup.ref = new_process.id;
209 
210         for (int i = 2; attrs[i]; i += 2) {
211           std::string key(attrs[i]);
212           if (key == "fmt") {
213             new_process.ptr->fmt = InternString(attrs[i + 1]);
214           }
215         }
216 
217         current_new_process_ = new_process.id;
218       }
219       if (current_new_thread_) {
220         data_.GetThread(current_new_thread_)->process = process_lookup.ref;
221       }
222     } else if (tag_name == "core") {
223       MaybeCachedRef<uint32_t> core_id_lookup =
224           GetOrInsertByRef(attrs, core_ref_to_core_);
225       if (core_id_lookup.is_new) {
226         current_new_core_id_ = &core_id_lookup.ref;
227       } else {
228         current_row_.core_id = core_id_lookup.ref;
229       }
230     } else if (tag_name == "sample-time" || tag_name == "event-time") {
231       // Share time lookup logic between sample times and event times, including
232       // updating the current row's sample time for both.
233       MaybeCachedRef<int64_t> time_lookup =
234           GetOrInsertByRef(attrs, sample_time_ref_to_time_);
235       if (time_lookup.is_new) {
236         current_time_ref_ = &time_lookup.ref;
237       } else {
238         current_row_.timestamp_ = time_lookup.ref;
239       }
240     } else if (tag_name == "subsystem") {
241       MaybeCachedRef<std::string> subsystem_lookup =
242           GetOrInsertByRef(attrs, subsystem_ref_to_subsystem_);
243       current_subsystem_ref_ = &subsystem_lookup.ref;
244     } else if (tag_name == "uint64") {
245       // The only uint64 we care about is the one for the clock sync, which is
246       // expected to contain exactly one uint64 value -- we'll
247       // map all uint64 to a single value and check against the subsystem
248       // when the row is closed.
249       MaybeCachedRef<uint64_t> uint64_lookup =
250           GetOrInsertByRef(attrs, os_log_metadata_or_uint64_ref_to_uint64_);
251       if (uint64_lookup.is_new) {
252         current_uint64_ref_ = &uint64_lookup.ref;
253       } else {
254         if (current_os_log_metadata_uint64_ref_) {
255           // Update the os-log-metadata's uint64 value with this uint64 value.
256           *current_os_log_metadata_uint64_ref_ = uint64_lookup.ref;
257         }
258       }
259     } else if (tag_name == "os-log-metadata") {
260       // The only os-log-metadata we care about is the one with the single
261       // uint64 clock sync value, so also map this to uint64 values with its own
262       // id.
263       MaybeCachedRef<uint64_t> uint64_lookup =
264           GetOrInsertByRef(attrs, os_log_metadata_or_uint64_ref_to_uint64_);
265       current_os_log_metadata_uint64_ref_ = &uint64_lookup.ref;
266     } else if (tag_name == "backtrace") {
267       MaybeCachedRef<BacktraceId> backtrace_lookup =
268           GetOrInsertByRef(attrs, backtrace_ref_to_backtrace_);
269       if (backtrace_lookup.is_new) {
270         backtrace_lookup.ref = data_.NewBacktrace().id;
271       }
272       current_row_.backtrace = backtrace_lookup.ref;
273     } else if (tag_name == "frame") {
274       MaybeCachedRef<BacktraceFrameId> frame_lookup =
275           GetOrInsertByRef(attrs, frame_ref_to_frame_);
276       if (frame_lookup.is_new) {
277         IdPtr<Frame> new_frame = data_.NewFrame();
278         frame_lookup.ref = new_frame.id;
279         for (int i = 2; attrs[i]; i += 2) {
280           std::string key(attrs[i]);
281           if (key == "addr") {
282             new_frame.ptr->addr = strtoll(attrs[i + 1], nullptr, 16);
283           }
284         }
285         current_new_frame_ = new_frame.id;
286       }
287       data_.GetBacktrace(current_row_.backtrace)
288           ->frames.push_back(frame_lookup.ref);
289     } else if (tag_name == "binary") {
290       // Can only be processing a binary when processing a new frame.
291       PERFETTO_DCHECK(current_new_frame_ != kNullId);
292 
293       MaybeCachedRef<BinaryId> binary_lookup =
294           GetOrInsertByRef(attrs, binary_ref_to_binary_);
295       if (binary_lookup.is_new) {
296         auto new_binary = data_.NewBinary();
297         binary_lookup.ref = new_binary.id;
298         for (int i = 2; attrs[i]; i += 2) {
299           std::string key(attrs[i]);
300           if (key == "path") {
301             new_binary.ptr->path = std::string(attrs[i + 1]);
302           } else if (key == "UUID") {
303             new_binary.ptr->uuid =
304                 BuildId::FromHex(base::StringView(attrs[i + 1]));
305           } else if (key == "load-addr") {
306             new_binary.ptr->load_addr = strtoll(attrs[i + 1], nullptr, 16);
307           }
308         }
309         new_binary.ptr->max_addr = new_binary.ptr->load_addr;
310       }
311       PERFETTO_DCHECK(data_.GetFrame(current_new_frame_)->binary == kNullId);
312       data_.GetFrame(current_new_frame_)->binary = binary_lookup.ref;
313     }
314   }
315 
ElementEnd(const char * el)316   void ElementEnd(const char* el) {
317     PERFETTO_DCHECK(el == tag_stack_.back());
318     std::string tag_name = std::move(tag_stack_.back());
319     tag_stack_.pop_back();
320 
321     if (tag_name == "row") {
322       if (current_row_.backtrace) {
323         // Rows with backtraces are assumed to be time samples.
324         base::StatusOr<int64_t> trace_ts =
325             ToTraceTimestamp(current_row_.timestamp_);
326         if (!trace_ts.ok()) {
327           PERFETTO_DLOG("Skipping timestamp %" PRId64 ", no clock snapshot yet",
328                         current_row_.timestamp_);
329         } else {
330           context_->sorter->PushInstrumentsRow(*trace_ts,
331                                                std::move(current_row_));
332         }
333       } else if (current_subsystem_ref_ != nullptr) {
334         // Rows without backtraces are assumed to be signpost events -- filter
335         // these for `dev.perfetto.clock_sync` events.
336         if (*current_subsystem_ref_ == "dev.perfetto.clock_sync") {
337           PERFETTO_DCHECK(current_os_log_metadata_uint64_ref_ != nullptr);
338           uint64_t clock_sync_timestamp = *current_os_log_metadata_uint64_ref_;
339           if (latest_clock_sync_timestamp_ > clock_sync_timestamp) {
340             PERFETTO_DLOG("Skipping timestamp %" PRId64
341                           ", non-monotonic sync deteced",
342                           current_row_.timestamp_);
343           } else {
344             latest_clock_sync_timestamp_ = clock_sync_timestamp;
345             auto status = context_->clock_tracker->AddSnapshot(
346                 {{clock_, current_row_.timestamp_},
347                  {protos::pbzero::ClockSnapshot::Clock::BOOTTIME,
348                   static_cast<int64_t>(latest_clock_sync_timestamp_)}});
349             if (!status.ok()) {
350               PERFETTO_FATAL("Error adding clock snapshot: %s",
351                              status.status().c_message());
352             }
353           }
354         }
355         current_subsystem_ref_ = nullptr;
356         current_os_log_metadata_uint64_ref_ = nullptr;
357         current_uint64_ref_ = nullptr;
358       }
359     } else if (current_new_frame_ != kNullId && tag_name == "frame") {
360       Frame* frame = data_.GetFrame(current_new_frame_);
361       if (frame->binary) {
362         Binary* binary = data_.GetBinary(frame->binary);
363         // We don't know what the binary's mapping end is, but we know that the
364         // current frame is inside of it, so use that.
365         PERFETTO_DCHECK(frame->addr > binary->load_addr);
366         if (frame->addr > binary->max_addr) {
367           binary->max_addr = frame->addr;
368         }
369       }
370       current_new_frame_ = kNullId;
371     } else if (current_new_thread_ != kNullId && tag_name == "thread") {
372       current_new_thread_ = kNullId;
373     } else if (current_new_process_ != kNullId && tag_name == "process") {
374       current_new_process_ = kNullId;
375     } else if (current_new_core_id_ != nullptr && tag_name == "core") {
376       current_new_core_id_ = nullptr;
377     }
378   }
379 
CharacterData(const char * chars,int len)380   void CharacterData(const char* chars, int len) {
381     std::string_view tag_name = tag_stack_.back();
382     if (current_time_ref_ != nullptr &&
383         (tag_name == "sample-time" || tag_name == "event-time")) {
384       std::string s = MakeTrimmed(chars, len);
385       current_row_.timestamp_ = *current_time_ref_ = stoll(s);
386       current_time_ref_ = nullptr;
387     } else if (current_new_thread_ != kNullId && tag_name == "tid") {
388       std::string s = MakeTrimmed(chars, len);
389       data_.GetThread(current_new_thread_)->tid = stoi(s);
390     } else if (current_new_process_ != kNullId && tag_name == "pid") {
391       std::string s = MakeTrimmed(chars, len);
392       data_.GetProcess(current_new_process_)->pid = stoi(s);
393     } else if (current_new_core_id_ != nullptr && tag_name == "core") {
394       std::string s = MakeTrimmed(chars, len);
395       *current_new_core_id_ = static_cast<uint32_t>(stoul(s));
396     } else if (current_subsystem_ref_ != nullptr && tag_name == "subsystem") {
397       std::string s = MakeTrimmed(chars, len);
398       *current_subsystem_ref_ = s;
399     } else if (current_uint64_ref_ != nullptr &&
400                current_os_log_metadata_uint64_ref_ != nullptr &&
401                tag_name == "uint64") {
402       std::string s = MakeTrimmed(chars, len);
403       *current_os_log_metadata_uint64_ref_ = *current_uint64_ref_ = stoull(s);
404     }
405   }
406 
ToTraceTimestamp(int64_t time)407   base::StatusOr<int64_t> ToTraceTimestamp(int64_t time) {
408     base::StatusOr<int64_t> trace_ts =
409         context_->clock_tracker->ToTraceTime(clock_, time);
410 
411     if (PERFETTO_LIKELY(trace_ts.ok())) {
412       latest_timestamp_ = std::max(latest_timestamp_, *trace_ts);
413     }
414 
415     return trace_ts;
416   }
417 
InternString(base::StringView string_view)418   StringId InternString(base::StringView string_view) {
419     return context_->storage->InternString(string_view);
420   }
InternString(const char * string)421   StringId InternString(const char* string) {
422     return InternString(base::StringView(string));
423   }
InternString(const char * data,size_t len)424   StringId InternString(const char* data, size_t len) {
425     return InternString(base::StringView(data, len));
426   }
427 
428   template <typename Value>
429   struct MaybeCachedRef {
430     Value& ref;
431     bool is_new;
432   };
433   // Implement the element caching mechanism. Either insert an element by its
434   // id attribute into the given map, or look up the element in the cache by its
435   // ref attribute. The returned value is a reference into the map, to allow
436   // in-place modification.
437   template <typename Value>
GetOrInsertByRef(const char ** attrs,std::map<unsigned long,Value> & map)438   MaybeCachedRef<Value> GetOrInsertByRef(const char** attrs,
439                                          std::map<unsigned long, Value>& map) {
440     PERFETTO_DCHECK(attrs[0] != nullptr);
441     PERFETTO_DCHECK(attrs[1] != nullptr);
442     const char* key = attrs[0];
443     // The id or ref attribute has to be the first attribute on the element.
444     PERFETTO_DCHECK(strcmp(key, "ref") == 0 || strcmp(key, "id") == 0);
445     unsigned long id = strtoul(attrs[1], nullptr, 10);
446     // If the first attribute key is `id`, then this is a new entry in the
447     // cache -- otherwise, for lookup by ref, it should already exist.
448     bool is_new = strcmp(key, "id") == 0;
449     PERFETTO_DCHECK(is_new == (map.find(id) == map.end()));
450     return {map[id], is_new};
451   }
452 
453   TraceProcessorContext* context_;
454   RowDataTracker& data_;
455 
456   XML_Parser parser_;
457   std::vector<std::string> tag_stack_;
458   int64_t latest_timestamp_;
459 
460   // These maps store the cached element data. These currently have to be
461   // std::map, because they require pointer stability under insertion,
462   // as the various `current_foo_` pointers below point directly into the map
463   // data.
464   //
465   // TODO(leszeks): Relax this pointer stability requirement, and use
466   // base::FlatHashMap.
467   // TODO(leszeks): Consider merging these into a single map from ID to
468   // a variant (or similar).
469   std::map<unsigned long, ThreadId> thread_ref_to_thread_;
470   std::map<unsigned long, ProcessId> process_ref_to_process_;
471   std::map<unsigned long, uint32_t> core_ref_to_core_;
472   std::map<unsigned long, int64_t> sample_time_ref_to_time_;
473   std::map<unsigned long, BinaryId> binary_ref_to_binary_;
474   std::map<unsigned long, BacktraceFrameId> frame_ref_to_frame_;
475   std::map<unsigned long, BacktraceId> backtrace_ref_to_backtrace_;
476   std::map<unsigned long, std::string> subsystem_ref_to_subsystem_;
477   std::map<unsigned long, uint64_t> os_log_metadata_or_uint64_ref_to_uint64_;
478 
479   Row current_row_;
480   int64_t* current_time_ref_ = nullptr;
481   ThreadId current_new_thread_ = kNullId;
482   ProcessId current_new_process_ = kNullId;
483   uint32_t* current_new_core_id_ = nullptr;
484   BacktraceFrameId current_new_frame_ = kNullId;
485 
486   ClockTracker::ClockId clock_;
487   std::string* current_subsystem_ref_ = nullptr;
488   uint64_t* current_os_log_metadata_uint64_ref_ = nullptr;
489   uint64_t* current_uint64_ref_ = nullptr;
490   uint64_t latest_clock_sync_timestamp_ = 0;
491 };
492 
InstrumentsXmlTokenizer(TraceProcessorContext * context)493 InstrumentsXmlTokenizer::InstrumentsXmlTokenizer(TraceProcessorContext* context)
494     : impl_(new Impl(context)) {}
~InstrumentsXmlTokenizer()495 InstrumentsXmlTokenizer::~InstrumentsXmlTokenizer() {
496   delete impl_;
497 }
498 
Parse(TraceBlobView view)499 base::Status InstrumentsXmlTokenizer::Parse(TraceBlobView view) {
500   return impl_->Parse(std::move(view));
501 }
502 
NotifyEndOfFile()503 [[nodiscard]] base::Status InstrumentsXmlTokenizer::NotifyEndOfFile() {
504   return impl_->End();
505 }
506 
507 }  // namespace perfetto::trace_processor::instruments_importer
508