xref: /aosp_15_r20/external/perfetto/src/protozero/filtering/filter_bytecode_parser.h (revision 6dbdd20afdafa5e3ca9b8809fa73465d530080dc)
1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_
18 #define SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_
19 
20 #include <stddef.h>
21 #include <stdint.h>
22 
23 #include <optional>
24 #include <vector>
25 
26 namespace protozero {
27 
28 // Loads the proto-encoded bytecode in memory and allows fast lookups for tuples
29 // (msg_index, field_id) to tell if a given field should be allowed or not and,
30 // in the case of nested fields, what is the next message index to recurse into.
31 // This class does two things:
32 // 1. Expands the array of varint from the proto into a vector<uint32_t>. This
33 //    is to avoid performing varint decoding on every lookup, at the cost of
34 //    some extra memory (2KB-4KB). Note that the expanded vector is not just a
35 //    1:1 copy of the proto one (more below). This is to avoid O(Fields) linear
36 //    lookup complexity.
37 // 2. Creates an index of offsets to remember the start word for each message.
38 //    This is so we can jump to O(1) to the N-th message when recursing into a
39 //    nested fields, without having to scan and find the (N-1)-th END_OF_MESSAGE
40 //    marker.
41 // Overall lookups are O(1) for field ids < 128 (kDirectlyIndexLimit) and O(N),
42 // with N being the number of allowed field ranges for other fields.
43 // See comments around |word_| below for the structure of the word vector.
44 class FilterBytecodeParser {
45  public:
46   // Result of a Query() operation
47   struct QueryResult {
48     bool allowed;  // Whether the field is allowed at all or no.
49 
50     // If |allowed|==true && nested_msg_field() == true, this tells the message
51     // index of the nested field that should be used when recursing in the
52     // parser.
53     uint32_t nested_msg_index;
54 
55     // If |allowed|==true, specifies if the field is of a simple type (varint,
56     // fixed32/64, string or byte).
simple_fieldQueryResult57     bool simple_field() const { return nested_msg_index == kSimpleField; }
58 
59     // If |allowed|==true, specifies if this field is a string field that needs
60     // to be filtered.
filter_string_fieldQueryResult61     bool filter_string_field() const {
62       return nested_msg_index == kFilterStringField;
63     }
64 
65     // If |allowed|==true, specifies if the field is a nested field that needs
66     // recursion. The caller is expected to use |nested_msg_index| for the next
67     // Query() calls.
nested_msg_fieldQueryResult68     bool nested_msg_field() const {
69       static_assert(kFilterStringField < kSimpleField,
70                     "kFilterStringField < kSimpleField");
71       return nested_msg_index < kFilterStringField;
72     }
73   };
74 
75   // Loads a filter. The filter data consists of a sequence of varints which
76   // contains the filter opcodes and a final checksum.
77   bool Load(const void* filter_data, size_t len);
78 
79   // Checks wheter a given field is allowed or not.
80   // msg_index = 0 is the index of the root message, where all queries should
81   // start from (typically perfetto.protos.Trace).
82   QueryResult Query(uint32_t msg_index, uint32_t field_id) const;
83 
84   void Reset();
set_suppress_logs_for_fuzzer(bool x)85   void set_suppress_logs_for_fuzzer(bool x) { suppress_logs_for_fuzzer_ = x; }
86 
87  private:
88   static constexpr uint32_t kDirectlyIndexLimit = 128;
89   static constexpr uint32_t kAllowed = 1u << 31u;
90   static constexpr uint32_t kSimpleField = 0x7fffffff;
91   static constexpr uint32_t kFilterStringField = 0x7ffffffe;
92 
93   bool LoadInternal(const uint8_t* filter_data, size_t len);
94 
95   // The state of all fields for all messages is stored in one contiguous array.
96   // This is to avoid memory fragmentation and allocator overhead.
97   // We expect a high number of messages (hundreds), but each message is small.
98   // For each message we store two sets of uint32:
99   // 1. A set of "directly indexed" fields, for field ids < 128.
100   // 2. The remainder is a set of ranges.
101   // So each message descriptor consists of a sequence of words as follows:
102   //
103   // [0] -> how many directly indexed fields are stored next (up to 128)
104   //
105   // [1..N] -> One word per field id (See "field state" below).
106   //
107   // [N + 1] -> Start of field id range 1
108   // [N + 2] -> End of field id range 1 (exclusive, STL-style).
109   // [N + 3] -> Field state for fields in range 1 (below)
110   //
111   // [N + 4] -> Start of field id range 2
112   // [N + 5] -> End of field id range 2 (exclusive, STL-style).
113   // [N + 6] -> Field state for fields in range 2 (below)
114 
115   // The "field state" word is as follows:
116   // Bit 31: 1 if the field is allowed, 0 if disallowed.
117   //         Only directly indexed fields can be 0 (it doesn't make sense to add
118   //         a range and then say "btw it's NOT allowed".. don't add it then.
119   //         0 is only used for filling gaps in the directly indexed bucket.
120   // Bits [30..0] (only when MSB == allowed):
121   //  0x7fffffff: The field is "simple" (varint, fixed32/64, string, bytes) and
122   //      can be directly passed through in output. No recursion is needed.
123   //  0x7ffffffe: The field is string field which needs to be filtered.
124   //  [0, 7ffffffd]: The field is a nested submessage. The value is the index
125   //     that must be passed as first argument to the next Query() calls.
126   //     Note that the message index is purely a monotonic counter in the
127   std::vector<uint32_t> words_;
128 
129   // One entry for each message index stored in the filter plus a sentinel at
130   // the end. Maps each message index to the offset in |words_| where the
131   // Nth message start.
132   // message_offset_.size() - 2 == the max message id that can be parsed.
133   std::vector<uint32_t> message_offset_;
134 
135   bool suppress_logs_for_fuzzer_ = false;
136 };
137 
138 }  // namespace protozero
139 
140 #endif  // SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_
141