xref: /aosp_15_r20/external/perfetto/src/protozero/filtering/filter_util.cc (revision 6dbdd20afdafa5e3ca9b8809fa73465d530080dc)
1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/protozero/filtering/filter_util.h"
18 
19 #include <algorithm>
20 #include <deque>
21 #include <map>
22 #include <memory>
23 #include <set>
24 
25 #include <google/protobuf/compiler/importer.h>
26 
27 #include "perfetto/base/build_config.h"
28 #include "perfetto/ext/base/file_utils.h"
29 #include "perfetto/ext/base/getopt.h"
30 #include "perfetto/ext/base/string_utils.h"
31 #include "perfetto/protozero/proto_utils.h"
32 #include "src/protozero/filtering/filter_bytecode_generator.h"
33 #include "src/protozero/filtering/filter_bytecode_parser.h"
34 
35 namespace protozero {
36 
37 namespace {
38 
39 class MultiFileErrorCollectorImpl
40     : public google::protobuf::compiler::MultiFileErrorCollector {
41  public:
42   ~MultiFileErrorCollectorImpl() override;
43   void AddError(const std::string&, int, int, const std::string&) override;
44   void AddWarning(const std::string&, int, int, const std::string&) override;
45 };
46 
47 MultiFileErrorCollectorImpl::~MultiFileErrorCollectorImpl() = default;
48 
AddError(const std::string & filename,int line,int column,const std::string & message)49 void MultiFileErrorCollectorImpl::AddError(const std::string& filename,
50                                            int line,
51                                            int column,
52                                            const std::string& message) {
53   PERFETTO_ELOG("Error %s %d:%d: %s", filename.c_str(), line, column,
54                 message.c_str());
55 }
56 
AddWarning(const std::string & filename,int line,int column,const std::string & message)57 void MultiFileErrorCollectorImpl::AddWarning(const std::string& filename,
58                                              int line,
59                                              int column,
60                                              const std::string& message) {
61   PERFETTO_ELOG("Warning %s %d:%d: %s", filename.c_str(), line, column,
62                 message.c_str());
63 }
64 
65 }  // namespace
66 
67 FilterUtil::FilterUtil() = default;
68 FilterUtil::~FilterUtil() = default;
69 
LoadMessageDefinition(const std::string & proto_file,const std::string & root_message,const std::string & proto_dir_path,const std::set<std::string> & passthrough_fields,const std::set<std::string> & string_filter_fields)70 bool FilterUtil::LoadMessageDefinition(
71     const std::string& proto_file,
72     const std::string& root_message,
73     const std::string& proto_dir_path,
74     const std::set<std::string>& passthrough_fields,
75     const std::set<std::string>& string_filter_fields) {
76   passthrough_fields_ = passthrough_fields;
77   passthrough_fields_seen_.clear();
78   filter_string_fields_ = string_filter_fields;
79   filter_string_fields_seen_.clear();
80 
81   // The protobuf compiler doesn't like backslashes and prints an error like:
82   // Error C:\it7mjanpw3\perfetto-a16500 -1:0: Backslashes, consecutive slashes,
83   // ".", or ".." are not allowed in the virtual path.
84   // Given that C:\foo\bar is a legit path on windows, fix it at this level
85   // because the problem is really the protobuf compiler being too picky.
86   static auto normalize_for_win = [](const std::string& path) {
87 #if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN)
88     return perfetto::base::ReplaceAll(path, "\\", "/");
89 #else
90     return path;
91 #endif
92   };
93 
94   google::protobuf::compiler::DiskSourceTree dst;
95 #if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN)
96   // If the path is absolute, maps "C:/" -> "C:/" (without hardcoding 'C').
97   if (proto_file.size() > 3 && proto_file[1] == ':') {
98     char win_drive[4]{proto_file[0], ':', '/', '\0'};
99     dst.MapPath(win_drive, win_drive);
100   }
101 #endif
102   dst.MapPath("/", "/");  // We might still need this on Win under cygwin.
103   dst.MapPath("", normalize_for_win(proto_dir_path));
104   MultiFileErrorCollectorImpl mfe;
105   google::protobuf::compiler::Importer importer(&dst, &mfe);
106   const google::protobuf::FileDescriptor* root_file =
107       importer.Import(normalize_for_win(proto_file));
108   const google::protobuf::Descriptor* root_msg = nullptr;
109   if (!root_message.empty()) {
110     root_msg = importer.pool()->FindMessageTypeByName(root_message);
111   } else if (root_file->message_type_count() > 0) {
112     // The user didn't specfy the root type. Pick the first type in the file,
113     // most times it's the right guess.
114     root_msg = root_file->message_type(0);
115     if (root_msg)
116       PERFETTO_LOG(
117           "The guessed root message name is \"%s\". Pass -r com.MyName to "
118           "override",
119           root_msg->full_name().c_str());
120   }
121 
122   if (!root_msg) {
123     PERFETTO_ELOG("Could not find the root message \"%s\" in %s",
124                   root_message.c_str(), proto_file.c_str());
125     return false;
126   }
127 
128   // |descriptors_by_full_name| is passed by argument rather than being a member
129   // field so that we don't risk leaving it out of sync (and depending on it in
130   // future without realizing) when performing the Dedupe() pass.
131   DescriptorsByNameMap descriptors_by_full_name;
132   ParseProtoDescriptor(root_msg, &descriptors_by_full_name);
133 
134   // If the user specified a set of fields to pass through, print an error and
135   // fail if any of the passed fields have not been seen while recursing in the
136   // schema. This is to avoid typos or naming changes to be silently ignored.
137   std::vector<std::string> unused;
138   std::set_difference(passthrough_fields_.begin(), passthrough_fields_.end(),
139                       passthrough_fields_seen_.begin(),
140                       passthrough_fields_seen_.end(),
141                       std::back_inserter(unused));
142   for (const std::string& message_and_field : unused) {
143     PERFETTO_ELOG("Field not found %s", message_and_field.c_str());
144   }
145   if (!unused.empty()) {
146     PERFETTO_ELOG("Passthrough syntax: perfetto.protos.MessageName:field_name");
147     return false;
148   }
149   std::set_difference(
150       filter_string_fields_.begin(), filter_string_fields_.end(),
151       filter_string_fields_seen_.begin(), filter_string_fields_seen_.end(),
152       std::back_inserter(unused));
153   for (const std::string& message_and_field : unused) {
154     PERFETTO_ELOG("Field not found %s", message_and_field.c_str());
155   }
156   if (!unused.empty()) {
157     PERFETTO_ELOG(
158         "Filter string syntax: perfetto.protos.MessageName:field_name");
159     return false;
160   }
161   return true;
162 }
163 
164 // Generates a Message object for the given libprotobuf message descriptor.
165 // Recurses as needed into nested fields.
ParseProtoDescriptor(const google::protobuf::Descriptor * proto,DescriptorsByNameMap * descriptors_by_full_name)166 FilterUtil::Message* FilterUtil::ParseProtoDescriptor(
167     const google::protobuf::Descriptor* proto,
168     DescriptorsByNameMap* descriptors_by_full_name) {
169   auto descr_it = descriptors_by_full_name->find(proto->full_name());
170   if (descr_it != descriptors_by_full_name->end())
171     return descr_it->second;
172 
173   descriptors_.emplace_back();
174   Message* msg = &descriptors_.back();
175   msg->full_name = proto->full_name();
176   (*descriptors_by_full_name)[msg->full_name] = msg;
177   for (int i = 0; i < proto->field_count(); ++i) {
178     const auto* proto_field = proto->field(i);
179     const uint32_t field_id = static_cast<uint32_t>(proto_field->number());
180     PERFETTO_CHECK(msg->fields.count(field_id) == 0);
181     auto& field = msg->fields[field_id];
182     field.name = proto_field->name();
183     field.type = proto_field->type_name();
184 
185     std::string message_and_field = msg->full_name + ":" + field.name;
186     bool passthrough = false;
187     if (passthrough_fields_.count(message_and_field)) {
188       field.type = "bytes";
189       passthrough = true;
190       passthrough_fields_seen_.insert(message_and_field);
191     }
192     if (filter_string_fields_.count(message_and_field)) {
193       PERFETTO_CHECK(field.type == "string");
194       field.filter_string = true;
195       msg->has_filter_string_fields = true;
196       filter_string_fields_seen_.insert(message_and_field);
197     }
198     if (proto_field->message_type() && !passthrough) {
199       msg->has_nested_fields = true;
200       // Recurse.
201       field.nested_type = ParseProtoDescriptor(proto_field->message_type(),
202                                                descriptors_by_full_name);
203     }
204   }
205   return msg;
206 }
207 
Dedupe()208 void FilterUtil::Dedupe() {
209   std::map<std::string /*identity*/, Message*> index;
210 
211   std::map<Message*, Message*> dupe_graph;  // K,V: K shall be duped against V.
212 
213   // As a first pass, generate an |identity| string for each leaf message. The
214   // identity is simply the comma-separated stringification of its field ids.
215   // If another message with the same identity exists, add an edge to the graph.
216   const size_t initial_count = descriptors_.size();
217   size_t field_count = 0;
218   for (auto& descr : descriptors_) {
219     // Dedupe only leaf messages without nested or string filter fields.
220     if (descr.has_nested_fields || descr.has_filter_string_fields)
221       continue;
222     std::string identity;
223     for (const auto& id_and_field : descr.fields)
224       identity.append(std::to_string(id_and_field.first) + ",");
225     auto it_and_inserted = index.emplace(identity, &descr);
226     if (!it_and_inserted.second) {
227       // insertion failed, a dupe exists already.
228       Message* dupe_against = it_and_inserted.first->second;
229       dupe_graph.emplace(&descr, dupe_against);
230     }
231   }
232 
233   // Now apply de-duplications by re-directing the nested_type pointer to the
234   // equivalent descriptors that have the same set of allowed field ids.
235   std::set<Message*> referenced_descriptors;
236   referenced_descriptors.emplace(&descriptors_.front());  // The root.
237   for (auto& descr : descriptors_) {
238     for (auto& id_and_field : descr.fields) {
239       Message* target = id_and_field.second.nested_type;
240       if (!target)
241         continue;  // Only try to dedupe nested types.
242       auto it = dupe_graph.find(target);
243       if (it == dupe_graph.end()) {
244         referenced_descriptors.emplace(target);
245         continue;
246       }
247       ++field_count;
248       // Replace with the dupe.
249       id_and_field.second.nested_type = it->second;
250     }  // for (nested_fields).
251   }    // for (descriptors_).
252 
253   // Remove unreferenced descriptors. We should much rather crash in the case of
254   // a logic bug rathern than trying to use them but don't emit them.
255   size_t removed_count = 0;
256   for (auto it = descriptors_.begin(); it != descriptors_.end();) {
257     if (referenced_descriptors.count(&*it)) {
258       ++it;
259     } else {
260       ++removed_count;
261       it = descriptors_.erase(it);
262     }
263   }
264   PERFETTO_LOG(
265       "Deduplication removed %zu duped descriptors out of %zu descriptors from "
266       "%zu fields",
267       removed_count, initial_count, field_count);
268 }
269 
270 // Prints the list of messages and fields in a diff-friendly text format.
PrintAsText(std::optional<std::string> filter_bytecode)271 void FilterUtil::PrintAsText(std::optional<std::string> filter_bytecode) {
272   using perfetto::base::StripPrefix;
273   const std::string& root_name = descriptors_.front().full_name;
274   std::string root_prefix = root_name.substr(0, root_name.rfind('.'));
275   if (!root_prefix.empty())
276     root_prefix.append(".");
277 
278   FilterBytecodeParser parser;
279   if (filter_bytecode) {
280     PERFETTO_CHECK(
281         parser.Load(filter_bytecode->data(), filter_bytecode->size()));
282   }
283 
284   // <Filter msg_index, Message>
285   std::deque<std::pair<uint32_t, const Message*>> queue;
286   std::set<const Message*> seen_msgs{&descriptors_.front()};
287   queue.emplace_back(0u, &descriptors_.front());
288 
289   while (!queue.empty()) {
290     auto index_and_descr = queue.front();
291     queue.pop_front();
292     uint32_t msg_index = index_and_descr.first;
293     const auto& descr = *index_and_descr.second;
294 
295     for (const auto& id_and_field : descr.fields) {
296       const uint32_t field_id = id_and_field.first;
297       const auto& field = id_and_field.second;
298 
299       FilterBytecodeParser::QueryResult result{0, false};
300       if (filter_bytecode) {
301         result = parser.Query(msg_index, field_id);
302         if (!result.allowed) {
303           continue;
304         }
305       }
306 
307       const Message* nested_type = id_and_field.second.nested_type;
308       bool passthrough = false;
309       if (nested_type) {
310         // result.simple_field might be true if the generated bytecode is
311         // passing through a whole submessage without recursing.
312         passthrough = result.allowed && result.simple_field();
313         if (seen_msgs.find(nested_type) == seen_msgs.end()) {
314           seen_msgs.insert(nested_type);
315           queue.emplace_back(result.nested_msg_index, nested_type);
316         }
317       } else {  // simple field
318         PERFETTO_CHECK(result.simple_field() || result.filter_string_field() ||
319                        !filter_bytecode);
320         PERFETTO_CHECK(result.filter_string_field() == field.filter_string ||
321                        !filter_bytecode);
322       }
323 
324       auto stripped_name = StripPrefix(descr.full_name, root_prefix);
325       std::string stripped_nested =
326           nested_type ? " " + StripPrefix(nested_type->full_name, root_prefix)
327                       : "";
328       if (passthrough)
329         stripped_nested += "  # PASSTHROUGH";
330       if (field.filter_string)
331         stripped_nested += "  # FILTER STRING";
332       fprintf(print_stream_, "%-60s %3u %-8s %-32s%s\n", stripped_name.c_str(),
333               field_id, field.type.c_str(), field.name.c_str(),
334               stripped_nested.c_str());
335     }
336   }
337 }
338 
GenerateFilterBytecode()339 std::string FilterUtil::GenerateFilterBytecode() {
340   protozero::FilterBytecodeGenerator bytecode_gen;
341 
342   // Assign indexes to descriptors, simply by counting them in order;
343   std::map<Message*, uint32_t> descr_to_idx;
344   for (auto& descr : descriptors_)
345     descr_to_idx[&descr] = static_cast<uint32_t>(descr_to_idx.size());
346 
347   for (auto& descr : descriptors_) {
348     for (auto it = descr.fields.begin(); it != descr.fields.end();) {
349       uint32_t field_id = it->first;
350       const Message::Field& field = it->second;
351       if (field.nested_type) {
352         // Append the index of the target submessage.
353         PERFETTO_CHECK(descr_to_idx.count(field.nested_type));
354         uint32_t nested_msg_index = descr_to_idx[field.nested_type];
355         bytecode_gen.AddNestedField(field_id, nested_msg_index);
356         ++it;
357         continue;
358       }
359       if (field.filter_string) {
360         bytecode_gen.AddFilterStringField(field_id);
361         ++it;
362         continue;
363       }
364       // Simple field. Lookahead to see if we have a range of contiguous simple
365       // fields.
366       for (uint32_t range_len = 1;; ++range_len) {
367         ++it;
368         if (it != descr.fields.end() && it->first == field_id + range_len &&
369             it->second.is_simple()) {
370           continue;
371         }
372         // At this point it points to either the end() of the vector or a
373         // non-contiguous or non-simple field (which will be picked up by the
374         // next iteration).
375         if (range_len == 1) {
376           bytecode_gen.AddSimpleField(field_id);
377         } else {
378           bytecode_gen.AddSimpleFieldRange(field_id, range_len);
379         }
380         break;
381       }  // for (range_len)
382     }    // for (descr.fields)
383     bytecode_gen.EndMessage();
384   }  // for (descriptors)
385   return bytecode_gen.Serialize();
386 }
387 
LookupField(const std::string & varint_encoded_path)388 std::string FilterUtil::LookupField(const std::string& varint_encoded_path) {
389   const uint8_t* ptr =
390       reinterpret_cast<const uint8_t*>(varint_encoded_path.data());
391   const uint8_t* const end = ptr + varint_encoded_path.size();
392 
393   std::vector<uint32_t> fields;
394   while (ptr < end) {
395     uint64_t varint;
396     const uint8_t* next = proto_utils::ParseVarInt(ptr, end, &varint);
397     PERFETTO_CHECK(next != ptr);
398     fields.emplace_back(static_cast<uint32_t>(varint));
399     ptr = next;
400   }
401   return LookupField(fields.data(), fields.size());
402 }
403 
LookupField(const uint32_t * field_ids,size_t num_fields)404 std::string FilterUtil::LookupField(const uint32_t* field_ids,
405                                     size_t num_fields) {
406   const Message* msg = descriptors_.empty() ? nullptr : &descriptors_.front();
407   std::string res;
408   for (size_t i = 0; i < num_fields; ++i) {
409     const uint32_t field_id = field_ids[i];
410     const Message::Field* field = nullptr;
411     if (msg) {
412       auto it = msg->fields.find(field_id);
413       field = it == msg->fields.end() ? nullptr : &it->second;
414     }
415     res.append(".");
416     if (field) {
417       res.append(field->name);
418       msg = field->nested_type;
419     } else {
420       res.append(std::to_string(field_id));
421     }
422   }
423   return res;
424 }
425 
426 }  // namespace protozero
427