1 /* 2 * Copyright (C) 2021 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_ 18 #define SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_ 19 20 #include <stddef.h> 21 #include <stdint.h> 22 23 #include <optional> 24 #include <vector> 25 26 namespace protozero { 27 28 // Loads the proto-encoded bytecode in memory and allows fast lookups for tuples 29 // (msg_index, field_id) to tell if a given field should be allowed or not and, 30 // in the case of nested fields, what is the next message index to recurse into. 31 // This class does two things: 32 // 1. Expands the array of varint from the proto into a vector<uint32_t>. This 33 // is to avoid performing varint decoding on every lookup, at the cost of 34 // some extra memory (2KB-4KB). Note that the expanded vector is not just a 35 // 1:1 copy of the proto one (more below). This is to avoid O(Fields) linear 36 // lookup complexity. 37 // 2. Creates an index of offsets to remember the start word for each message. 38 // This is so we can jump to O(1) to the N-th message when recursing into a 39 // nested fields, without having to scan and find the (N-1)-th END_OF_MESSAGE 40 // marker. 41 // Overall lookups are O(1) for field ids < 128 (kDirectlyIndexLimit) and O(N), 42 // with N being the number of allowed field ranges for other fields. 43 // See comments around |word_| below for the structure of the word vector. 44 class FilterBytecodeParser { 45 public: 46 // Result of a Query() operation 47 struct QueryResult { 48 bool allowed; // Whether the field is allowed at all or no. 49 50 // If |allowed|==true && nested_msg_field() == true, this tells the message 51 // index of the nested field that should be used when recursing in the 52 // parser. 53 uint32_t nested_msg_index; 54 55 // If |allowed|==true, specifies if the field is of a simple type (varint, 56 // fixed32/64, string or byte). simple_fieldQueryResult57 bool simple_field() const { return nested_msg_index == kSimpleField; } 58 59 // If |allowed|==true, specifies if this field is a string field that needs 60 // to be filtered. filter_string_fieldQueryResult61 bool filter_string_field() const { 62 return nested_msg_index == kFilterStringField; 63 } 64 65 // If |allowed|==true, specifies if the field is a nested field that needs 66 // recursion. The caller is expected to use |nested_msg_index| for the next 67 // Query() calls. nested_msg_fieldQueryResult68 bool nested_msg_field() const { 69 static_assert(kFilterStringField < kSimpleField, 70 "kFilterStringField < kSimpleField"); 71 return nested_msg_index < kFilterStringField; 72 } 73 }; 74 75 // Loads a filter. The filter data consists of a sequence of varints which 76 // contains the filter opcodes and a final checksum. 77 bool Load(const void* filter_data, size_t len); 78 79 // Checks wheter a given field is allowed or not. 80 // msg_index = 0 is the index of the root message, where all queries should 81 // start from (typically perfetto.protos.Trace). 82 QueryResult Query(uint32_t msg_index, uint32_t field_id) const; 83 84 void Reset(); set_suppress_logs_for_fuzzer(bool x)85 void set_suppress_logs_for_fuzzer(bool x) { suppress_logs_for_fuzzer_ = x; } 86 87 private: 88 static constexpr uint32_t kDirectlyIndexLimit = 128; 89 static constexpr uint32_t kAllowed = 1u << 31u; 90 static constexpr uint32_t kSimpleField = 0x7fffffff; 91 static constexpr uint32_t kFilterStringField = 0x7ffffffe; 92 93 bool LoadInternal(const uint8_t* filter_data, size_t len); 94 95 // The state of all fields for all messages is stored in one contiguous array. 96 // This is to avoid memory fragmentation and allocator overhead. 97 // We expect a high number of messages (hundreds), but each message is small. 98 // For each message we store two sets of uint32: 99 // 1. A set of "directly indexed" fields, for field ids < 128. 100 // 2. The remainder is a set of ranges. 101 // So each message descriptor consists of a sequence of words as follows: 102 // 103 // [0] -> how many directly indexed fields are stored next (up to 128) 104 // 105 // [1..N] -> One word per field id (See "field state" below). 106 // 107 // [N + 1] -> Start of field id range 1 108 // [N + 2] -> End of field id range 1 (exclusive, STL-style). 109 // [N + 3] -> Field state for fields in range 1 (below) 110 // 111 // [N + 4] -> Start of field id range 2 112 // [N + 5] -> End of field id range 2 (exclusive, STL-style). 113 // [N + 6] -> Field state for fields in range 2 (below) 114 115 // The "field state" word is as follows: 116 // Bit 31: 1 if the field is allowed, 0 if disallowed. 117 // Only directly indexed fields can be 0 (it doesn't make sense to add 118 // a range and then say "btw it's NOT allowed".. don't add it then. 119 // 0 is only used for filling gaps in the directly indexed bucket. 120 // Bits [30..0] (only when MSB == allowed): 121 // 0x7fffffff: The field is "simple" (varint, fixed32/64, string, bytes) and 122 // can be directly passed through in output. No recursion is needed. 123 // 0x7ffffffe: The field is string field which needs to be filtered. 124 // [0, 7ffffffd]: The field is a nested submessage. The value is the index 125 // that must be passed as first argument to the next Query() calls. 126 // Note that the message index is purely a monotonic counter in the 127 std::vector<uint32_t> words_; 128 129 // One entry for each message index stored in the filter plus a sentinel at 130 // the end. Maps each message index to the offset in |words_| where the 131 // Nth message start. 132 // message_offset_.size() - 2 == the max message id that can be parsed. 133 std::vector<uint32_t> message_offset_; 134 135 bool suppress_logs_for_fuzzer_ = false; 136 }; 137 138 } // namespace protozero 139 140 #endif // SRC_PROTOZERO_FILTERING_FILTER_BYTECODE_PARSER_H_ 141