xref: /aosp_15_r20/external/perfetto/src/protozero/filtering/message_filter.h (revision 6dbdd20afdafa5e3ca9b8809fa73465d530080dc)
1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef SRC_PROTOZERO_FILTERING_MESSAGE_FILTER_H_
18 #define SRC_PROTOZERO_FILTERING_MESSAGE_FILTER_H_
19 
20 #include <stdint.h>
21 
22 #include <memory>
23 #include <string>
24 #include <unordered_map>
25 
26 #include "src/protozero/filtering/filter_bytecode_parser.h"
27 #include "src/protozero/filtering/message_tokenizer.h"
28 #include "src/protozero/filtering/string_filter.h"
29 
30 namespace protozero {
31 
32 // A class to filter binary-encoded proto messages using an allow-list of field
33 // ids, also known as "filter bytecode". The filter determines which fields are
34 // allowed to be passed through in output and strips all the other fields.
35 // See go/trace-filtering for full design.
36 // This class takes in input:
37 // 1) The filter bytecode, loaded once via the LoadFilterBytecode() method.
38 // 2) A proto-encoded binary message. The message doesn't have to be contiguous,
39 //    it can be passed as an array of arbitrarily chunked fragments.
40 // The FilterMessage*() method returns in output a proto message, stripping out
41 // all unknown fields. If the input is malformed (e.g., unknown proto field wire
42 // types, lengths out of bound) the whole filtering failed and the |error| flag
43 // of the FilteredMessage object is set to true.
44 // The filtering operation is based on rewriting a copy of the message into a
45 // self-allocated buffer, which is then returned in the output. The input buffer
46 // is NOT altered.
47 // Note also that the process of rewriting the protos gets rid of most redundant
48 // varint encoding (if present). So even if all fields are allow-listed, the
49 // output might NOT be bitwise identical to the input (but it will be
50 // semantically equivalent).
51 // Furthermore the enable_field_usage_tracking() method allows to keep track of
52 // a histogram of allowed / denied fields. It slows down filtering and is
53 // intended only on host tools.
54 class MessageFilter {
55  public:
56   class Config {
57    public:
58     bool LoadFilterBytecode(const void* filter_data, size_t len);
59     bool SetFilterRoot(std::initializer_list<uint32_t> field_ids);
60 
filter()61     const FilterBytecodeParser& filter() const { return filter_; }
string_filter()62     const StringFilter& string_filter() const { return string_filter_; }
string_filter()63     StringFilter& string_filter() { return string_filter_; }
root_msg_index()64     uint32_t root_msg_index() const { return root_msg_index_; }
65 
66    private:
67     FilterBytecodeParser filter_;
68     StringFilter string_filter_;
69     uint32_t root_msg_index_ = 0;
70   };
71 
72   MessageFilter();
73   explicit MessageFilter(Config);
74   ~MessageFilter();
75 
76   struct InputSlice {
77     const void* data;
78     size_t len;
79   };
80 
81   struct FilteredMessage {
FilteredMessageFilteredMessage82     FilteredMessage(std::unique_ptr<uint8_t[]> d, size_t s)
83         : data(std::move(d)), size(s) {}
84     std::unique_ptr<uint8_t[]> data;
85     size_t size;  // The used bytes in |data|. This is <= sizeof(data).
86     bool error = false;
87   };
88 
89   // Loads the filter bytecode that will be used to filter any subsequent
90   // message. Must be called before the first call to FilterMessage*().
91   // |filter_data| must point to a byte buffer for a proto-encoded ProtoFilter
92   // message (see proto_filter.proto).
LoadFilterBytecode(const void * filter_data,size_t len)93   bool LoadFilterBytecode(const void* filter_data, size_t len) {
94     return config_.LoadFilterBytecode(filter_data, len);
95   }
96 
97   // This affects the filter starting point of the subsequent FilterMessage*()
98   // calls. By default the filtering process starts from the message @ index 0,
99   // the root message passed to proto_filter when generating the bytecode
100   // (in typical tracing use-cases, this is perfetto.protos.Trace). However, the
101   // caller (TracingServiceImpl) might want to filter packets from the 2nd level
102   // (perfetto.protos.TracePacket) because the root level is pre-pended after
103   // the fact. This call allows to change the root message for the filter.
104   // The argument |field_ids| is an array of proto field ids and determines the
105   // path to the new root. For instance, in the case of [1,2,3] SetFilterRoot
106   // will identify the sub-message for the field "root.1.2.3" and use that.
107   // In order for this to succeed all the fields in the path must be allowed
108   // in the filter and must be a nested message type.
SetFilterRoot(std::initializer_list<uint32_t> field_ids)109   bool SetFilterRoot(std::initializer_list<uint32_t> field_ids) {
110     return config_.SetFilterRoot(field_ids);
111   }
112 
113   // Takes an input message, fragmented in arbitrary slices, and returns a
114   // filtered message in output.
115   FilteredMessage FilterMessageFragments(const InputSlice*, size_t num_slices);
116 
117   // Helper for tests, where the input is a contiguous buffer.
FilterMessage(const void * data,size_t len)118   FilteredMessage FilterMessage(const void* data, size_t len) {
119     InputSlice slice{data, len};
120     return FilterMessageFragments(&slice, 1);
121   }
122 
123   // When enabled returns a map of "field path" to "usage counter".
124   // The key (std::string) is a binary buffer (i.e. NOT an ASCII/UTF-8 string)
125   // which contains a varint for each field. Consider the following:
126   // message Root { Sub1 f1 = 1; };
127   // message Sub1 { Sub2 f2 = 7;}
128   // message Sub2 { string f3 = 5; }
129   // The field .f1.f2.f3 will be encoded as \x01\0x07\x05.
130   // The value is the number of times that field has been encountered. If the
131   // field is not allow-listed in the bytecode (the field is stripped in output)
132   // the count will be negative.
enable_field_usage_tracking(bool x)133   void enable_field_usage_tracking(bool x) { track_field_usage_ = x; }
field_usage()134   const std::unordered_map<std::string, int32_t>& field_usage() const {
135     return field_usage_;
136   }
137 
config()138   const Config& config() const { return config_; }
139 
140   // Retuns the helper class used to perform string filtering.
string_filter()141   StringFilter& string_filter() { return config_.string_filter(); }
142 
143  private:
144   // This is called by FilterMessageFragments().
145   // Inlining allows the compiler turn the per-byte call/return into a for loop,
146   // while, at the same time, keeping the code easy to read and reason about.
147   // It gives a 20-25% speedup (265ms vs 215ms for a 25MB trace).
148   void FilterOneByte(uint8_t octet) PERFETTO_ALWAYS_INLINE;
149 
150   // No-inline because this is a slowpath (only when usage tracking is enabled).
151   void IncrementCurrentFieldUsage(uint32_t field_id,
152                                   bool allowed) PERFETTO_NO_INLINE;
153 
154   // Gets into an error state which swallows all the input and emits no output.
155   void SetUnrecoverableErrorState();
156 
157   // We keep track of the nest of messages in a stack. Each StackState
158   // object corresponds to a level of nesting in the proto message structure.
159   // Every time a new field of type len-delimited that has a corresponding
160   // sub-message in the bytecode is encountered, a new StackState is pushed in
161   // |stack_|. stack_[0] is a sentinel to prevent over-popping without adding
162   // extra branches in the fastpath.
163   // |stack_|. stack_[1] is the state of the root message.
164   struct StackState {
165     uint32_t in_bytes = 0;  // Number of input bytes processed.
166 
167     // When |in_bytes| reaches this value, the current state should be popped.
168     // This is set when recursing into nested submessages. This is 0 only for
169     // stack_[0] (we don't know the size of the root message upfront).
170     uint32_t in_bytes_limit = 0;
171 
172     // This is set when a len-delimited message is encountered, either a string
173     // or a nested submessage that is NOT allow-listed in the bytecode.
174     // This causes input bytes to be consumed without being parsed from the
175     // input stream. If |passthrough_eaten_bytes| == true, they will be copied
176     // as-is in output (e.g. in the case of an allowed string/bytes field).
177     uint32_t eat_next_bytes = 0;
178 
179     // Keeps tracks of the stream_writer output counter (out_.written()) then
180     // the StackState is pushed. This is used to work out, when popping, how
181     // many bytes have been written for the current submessage.
182     uint32_t out_bytes_written_at_start = 0;
183 
184     uint32_t field_id = 0;   // The proto field id for the current message.
185     uint32_t msg_index = 0;  // The index of the message filter in the bytecode.
186 
187     // This is a pointer to the proto preamble for the current submessage
188     // (it's nullptr for stack_[0] and non-null elsewhere). This will be filled
189     // with the actual size of the message (out_.written() -
190     // |out_bytes_written_at_start|) when finishing (popping) the message.
191     // This must be filled using WriteRedundantVarint(). Note that the
192     // |size_field_len| is variable and depends on the actual length of the
193     // input message. If the output message has roughly the same size of the
194     // input message, the length will not be redundant.
195     // In other words: the length of the field is reserved when the submessage
196     // starts. At that point we know the upper-bound for the output message
197     // (a filtered submessage can be <= the original one, but not >). So we
198     // reserve as many bytes it takes to write the input length in varint.
199     // Then, when the message is finalized and we know the actual output size
200     // we backfill the field.
201     // Consider the example of a submessage where the input size = 130 (>127,
202     // 2 varint bytes) and the output is 120 bytes. The length will be 2 bytes
203     // wide even though could have been encoded with just one byte.
204     uint8_t* size_field = nullptr;
205     uint32_t size_field_len = 0;
206 
207     // The pointer to the start of the string to update the string if it is
208     // filtered.
209     uint8_t* filter_string_ptr = nullptr;
210 
211     // How |eat_next_bytes| should be handled. It seems that keeping this field
212     // at the end rather than next to |eat_next_bytes| makes the filter a little
213     // (but measurably) faster. (likely something related with struct layout vs
214     // cache sizes).
215     enum FilterAction {
216       kDrop,
217       kPassthrough,
218       kFilterString,
219     };
220     FilterAction action = FilterAction::kDrop;
221   };
222 
out_written()223   uint32_t out_written() { return static_cast<uint32_t>(out_ - &out_buf_[0]); }
224 
225   Config config_;
226 
227   std::unique_ptr<uint8_t[]> out_buf_;
228   uint8_t* out_ = nullptr;
229   uint8_t* out_end_ = nullptr;
230 
231   MessageTokenizer tokenizer_;
232   std::vector<StackState> stack_;
233 
234   bool error_ = false;
235   bool track_field_usage_ = false;
236   std::unordered_map<std::string, int32_t> field_usage_;
237 };
238 
239 }  // namespace protozero
240 
241 #endif  // SRC_PROTOZERO_FILTERING_MESSAGE_FILTER_H_
242