xref: /aosp_15_r20/external/perfetto/src/kallsyms/kernel_symbol_map.h (revision 6dbdd20afdafa5e3ca9b8809fa73465d530080dc)
1*6dbdd20aSAndroid Build Coastguard Worker /*
2*6dbdd20aSAndroid Build Coastguard Worker  * Copyright (C) 2019 The Android Open Source Project
3*6dbdd20aSAndroid Build Coastguard Worker  *
4*6dbdd20aSAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*6dbdd20aSAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*6dbdd20aSAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*6dbdd20aSAndroid Build Coastguard Worker  *
8*6dbdd20aSAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*6dbdd20aSAndroid Build Coastguard Worker  *
10*6dbdd20aSAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*6dbdd20aSAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*6dbdd20aSAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*6dbdd20aSAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*6dbdd20aSAndroid Build Coastguard Worker  * limitations under the License.
15*6dbdd20aSAndroid Build Coastguard Worker  */
16*6dbdd20aSAndroid Build Coastguard Worker 
17*6dbdd20aSAndroid Build Coastguard Worker #ifndef SRC_KALLSYMS_KERNEL_SYMBOL_MAP_H_
18*6dbdd20aSAndroid Build Coastguard Worker #define SRC_KALLSYMS_KERNEL_SYMBOL_MAP_H_
19*6dbdd20aSAndroid Build Coastguard Worker 
20*6dbdd20aSAndroid Build Coastguard Worker #include <stdint.h>
21*6dbdd20aSAndroid Build Coastguard Worker 
22*6dbdd20aSAndroid Build Coastguard Worker #include <array>
23*6dbdd20aSAndroid Build Coastguard Worker #include <forward_list>
24*6dbdd20aSAndroid Build Coastguard Worker #include <string>
25*6dbdd20aSAndroid Build Coastguard Worker #include <vector>
26*6dbdd20aSAndroid Build Coastguard Worker 
27*6dbdd20aSAndroid Build Coastguard Worker namespace perfetto {
28*6dbdd20aSAndroid Build Coastguard Worker 
29*6dbdd20aSAndroid Build Coastguard Worker namespace base {
30*6dbdd20aSAndroid Build Coastguard Worker class StringView;
31*6dbdd20aSAndroid Build Coastguard Worker }
32*6dbdd20aSAndroid Build Coastguard Worker 
33*6dbdd20aSAndroid Build Coastguard Worker // A parser and memory-efficient container for /proc/kallsyms.
34*6dbdd20aSAndroid Build Coastguard Worker // It can store a full kernel symbol table in ~1.2MB of memory and perform fast
35*6dbdd20aSAndroid Build Coastguard Worker // lookups using logarithmic binary searches + bounded linear scans.
36*6dbdd20aSAndroid Build Coastguard Worker //
37*6dbdd20aSAndroid Build Coastguard Worker // /proc/kallsyms is a ~10 MB text file that contains the map of kernel symbols,
38*6dbdd20aSAndroid Build Coastguard Worker // as follows:
39*6dbdd20aSAndroid Build Coastguard Worker // ffffff8f77682f8c t el0_sync_invalid
40*6dbdd20aSAndroid Build Coastguard Worker // ffffff8f77683060 t el0_irq_invalid
41*6dbdd20aSAndroid Build Coastguard Worker // ...
42*6dbdd20aSAndroid Build Coastguard Worker // In a typipcal Android kernel, it consists of 213K lines. Out of these, only
43*6dbdd20aSAndroid Build Coastguard Worker // 116K are interesting for the sake of symbolizing kernel functions, the rest
44*6dbdd20aSAndroid Build Coastguard Worker // are .rodata (variables), weak or other useless symbols.
45*6dbdd20aSAndroid Build Coastguard Worker // Still, even keeping around 116K pointers would require 116K * 8 ~= 1 MB of
46*6dbdd20aSAndroid Build Coastguard Worker // memory, without accounting for any strings for the symbols names.
47*6dbdd20aSAndroid Build Coastguard Worker // The SUM(str.len) for the 116K symbol names adds up to 2.7 MB (without
48*6dbdd20aSAndroid Build Coastguard Worker // counting their addresses).
49*6dbdd20aSAndroid Build Coastguard Worker // However consider the following:
50*6dbdd20aSAndroid Build Coastguard Worker // - Symbol addresses are mostly contiguous. Modulo the initial KASLR loading
51*6dbdd20aSAndroid Build Coastguard Worker //   address, most symbols are few hundreds bytes apart from each other.
52*6dbdd20aSAndroid Build Coastguard Worker // - Symbol names are made of tokens that are quite frequent (token: the result
53*6dbdd20aSAndroid Build Coastguard Worker //   of name.split('_')). If we tokenize the 2.7 MB of strings, the resulting
54*6dbdd20aSAndroid Build Coastguard Worker //   SUM(distinct_token.len) goes down 2.7MB -> 146 KB. This is because tokens
55*6dbdd20aSAndroid Build Coastguard Worker //   like "get", "set" or "event" show up thousands of times.
56*6dbdd20aSAndroid Build Coastguard Worker // - Symbol names are ASCII strings using only 7 out of 8 bits.
57*6dbdd20aSAndroid Build Coastguard Worker //
58*6dbdd20aSAndroid Build Coastguard Worker // In the light of this, the in-memory architecture of this data structure is
59*6dbdd20aSAndroid Build Coastguard Worker // as follows:
60*6dbdd20aSAndroid Build Coastguard Worker // We keep two tables around: (1) a token table and (2) a symbol table. Both
61*6dbdd20aSAndroid Build Coastguard Worker // table are a flat byte vector with some sparse lookaside index to make lookups
62*6dbdd20aSAndroid Build Coastguard Worker // faster and avoid full linear scans.
63*6dbdd20aSAndroid Build Coastguard Worker //
64*6dbdd20aSAndroid Build Coastguard Worker // Token table
65*6dbdd20aSAndroid Build Coastguard Worker // -----------
66*6dbdd20aSAndroid Build Coastguard Worker // The token table is a flat char buffer. Tokens are variable size (>0). Each
67*6dbdd20aSAndroid Build Coastguard Worker // token is identified by its ordinality, so token id 3 is the 3rd token in
68*6dbdd20aSAndroid Build Coastguard Worker // the table. All tokens are concatenated together.
69*6dbdd20aSAndroid Build Coastguard Worker // Given the ASCII encoding, the MSB is used as a terminator. So instead of
70*6dbdd20aSAndroid Build Coastguard Worker // wasting an extra NUL byte for each string, the last char of each token has
71*6dbdd20aSAndroid Build Coastguard Worker // the MSB set.
72*6dbdd20aSAndroid Build Coastguard Worker // Furthermore, a lookaside index stores the offset of tokens (i.e. Token N
73*6dbdd20aSAndroid Build Coastguard Worker // starts at offset O in the buffer) to allow fast lookups. In order to avoid
74*6dbdd20aSAndroid Build Coastguard Worker // wasting too much memory, the index is sparse and track the offsets of only
75*6dbdd20aSAndroid Build Coastguard Worker // one every kTokenIndexSamplinig tokens.
76*6dbdd20aSAndroid Build Coastguard Worker // When looking up a token ID N, the table seeks at the offset of the closest
77*6dbdd20aSAndroid Build Coastguard Worker // token <= N, and then scans linearly the next (at most kTokenIndexSamplinig)
78*6dbdd20aSAndroid Build Coastguard Worker // tokens, counting the MSBs found, until the right token id is found.
79*6dbdd20aSAndroid Build Coastguard Worker // buf:   set*get*kernel*load*fpsimd*return*wrapper*el0*skip*sync*neon*bit*aes
80*6dbdd20aSAndroid Build Coastguard Worker //        ^                   ^                         ^
81*6dbdd20aSAndroid Build Coastguard Worker //        |                   |                         |
82*6dbdd20aSAndroid Build Coastguard Worker // index: 0@0                 4@15                      8@21
83*6dbdd20aSAndroid Build Coastguard Worker 
84*6dbdd20aSAndroid Build Coastguard Worker // Symbol table
85*6dbdd20aSAndroid Build Coastguard Worker // ------------
86*6dbdd20aSAndroid Build Coastguard Worker // The symbol table is a flat char buffer that stores for each symbol: its
87*6dbdd20aSAndroid Build Coastguard Worker // address + the list of token indexes in the token table. The main caveats are
88*6dbdd20aSAndroid Build Coastguard Worker // that:
89*6dbdd20aSAndroid Build Coastguard Worker // - Symbol addresses are delta encoded (delta from prev symbol's addr).
90*6dbdd20aSAndroid Build Coastguard Worker // - Both delta addresses and token indexes are var-int encoded.
91*6dbdd20aSAndroid Build Coastguard Worker // - The LSB of token indexes is used as EOF marker (i.e. the next varint is
92*6dbdd20aSAndroid Build Coastguard Worker //   the delta-addr for the next symbol). This time the LSB is used because of
93*6dbdd20aSAndroid Build Coastguard Worker //   the varint encoding.
94*6dbdd20aSAndroid Build Coastguard Worker // At parsing time symbols are ordered by address and tokens are sorted by
95*6dbdd20aSAndroid Build Coastguard Worker // frequency, so that the top used 64 tokens can be represented with 1 byte.
96*6dbdd20aSAndroid Build Coastguard Worker // (Rationale for 64: 1 byte = 8 bits. The MSB bit of each byte is used for the
97*6dbdd20aSAndroid Build Coastguard Worker // varint encoding, the LSB bit of each number is used as end-of-tokens marker.
98*6dbdd20aSAndroid Build Coastguard Worker // There are 6 bits left -> 64 indexes can be represented using one byte).
99*6dbdd20aSAndroid Build Coastguard Worker // In summary the symbol table looks as follows:
100*6dbdd20aSAndroid Build Coastguard Worker //
101*6dbdd20aSAndroid Build Coastguard Worker // Base address: 0xbeef0000
102*6dbdd20aSAndroid Build Coastguard Worker // Symbol buffer:
103*6dbdd20aSAndroid Build Coastguard Worker // 0 1|0  4|0  6|1    // 0xbeef0000: 1,4,6 -> get_fpsimd_wrapper
104*6dbdd20aSAndroid Build Coastguard Worker // 8 7|0  3|1         // 0xbeef0008: 7,3   -> el0_load
105*6dbdd20aSAndroid Build Coastguard Worker // ...
106*6dbdd20aSAndroid Build Coastguard Worker // Like in the case of the token table, a lookaside index keeps track of the
107*6dbdd20aSAndroid Build Coastguard Worker // offset of one every kSymIndexSamplinig addresses.
108*6dbdd20aSAndroid Build Coastguard Worker // The Lookup(ADDR) function operates as follows:
109*6dbdd20aSAndroid Build Coastguard Worker // 1. Performs a logarithmic binary search in the symbols index, finding the
110*6dbdd20aSAndroid Build Coastguard Worker //    offset of the closest addres <= ADDR.
111*6dbdd20aSAndroid Build Coastguard Worker // 2. Skip over at most kSymIndexSamplinig until the symbol is found.
112*6dbdd20aSAndroid Build Coastguard Worker // 3. For each token index, lookup the corresponding token string and
113*6dbdd20aSAndroid Build Coastguard Worker //    concatenate them to build the symbol name.
114*6dbdd20aSAndroid Build Coastguard Worker 
115*6dbdd20aSAndroid Build Coastguard Worker class KernelSymbolMap {
116*6dbdd20aSAndroid Build Coastguard Worker  public:
117*6dbdd20aSAndroid Build Coastguard Worker   // The two constants below are changeable only for the benchmark use.
118*6dbdd20aSAndroid Build Coastguard Worker   // Trades off size of the root |index_| vs worst-case linear scans size.
119*6dbdd20aSAndroid Build Coastguard Worker   // A higher number makes the index more sparse.
120*6dbdd20aSAndroid Build Coastguard Worker   static size_t kSymIndexSampling;
121*6dbdd20aSAndroid Build Coastguard Worker 
122*6dbdd20aSAndroid Build Coastguard Worker   // Trades off size of the TokenTable |index_| vs worst-case linear scans size.
123*6dbdd20aSAndroid Build Coastguard Worker   static size_t kTokenIndexSampling;
124*6dbdd20aSAndroid Build Coastguard Worker 
125*6dbdd20aSAndroid Build Coastguard Worker   // Parses a kallsyms file. Returns the number of valid symbols decoded.
126*6dbdd20aSAndroid Build Coastguard Worker   size_t Parse(const std::string& kallsyms_path);
127*6dbdd20aSAndroid Build Coastguard Worker 
128*6dbdd20aSAndroid Build Coastguard Worker   // Looks up the closest symbol (i.e. the one with the highest address <=
129*6dbdd20aSAndroid Build Coastguard Worker   // |addr|) from its absolute 64-bit address.
130*6dbdd20aSAndroid Build Coastguard Worker   // Returns an empty string if the symbol is not found (which can happen only
131*6dbdd20aSAndroid Build Coastguard Worker   // if the passed |addr| is < min(addr)).
132*6dbdd20aSAndroid Build Coastguard Worker   std::string Lookup(uint64_t addr);
133*6dbdd20aSAndroid Build Coastguard Worker 
134*6dbdd20aSAndroid Build Coastguard Worker   // Returns the numberr of valid symbols decoded.
num_syms()135*6dbdd20aSAndroid Build Coastguard Worker   size_t num_syms() const { return num_syms_; }
136*6dbdd20aSAndroid Build Coastguard Worker 
137*6dbdd20aSAndroid Build Coastguard Worker   // Returns the size in bytes used by the adddress table (without counting
138*6dbdd20aSAndroid Build Coastguard Worker   // the tokens).
addr_bytes()139*6dbdd20aSAndroid Build Coastguard Worker   size_t addr_bytes() const { return buf_.size() + index_.size() * 8; }
140*6dbdd20aSAndroid Build Coastguard Worker 
141*6dbdd20aSAndroid Build Coastguard Worker   // Returns the total memory usage in bytes.
size_bytes()142*6dbdd20aSAndroid Build Coastguard Worker   size_t size_bytes() const { return addr_bytes() + tokens_.size_bytes(); }
143*6dbdd20aSAndroid Build Coastguard Worker 
144*6dbdd20aSAndroid Build Coastguard Worker   // Token table.
145*6dbdd20aSAndroid Build Coastguard Worker   class TokenTable {
146*6dbdd20aSAndroid Build Coastguard Worker    public:
147*6dbdd20aSAndroid Build Coastguard Worker     using TokenId = uint32_t;
148*6dbdd20aSAndroid Build Coastguard Worker     TokenTable();
149*6dbdd20aSAndroid Build Coastguard Worker     ~TokenTable();
150*6dbdd20aSAndroid Build Coastguard Worker     TokenId Add(const std::string&);
151*6dbdd20aSAndroid Build Coastguard Worker     base::StringView Lookup(TokenId);
size_bytes()152*6dbdd20aSAndroid Build Coastguard Worker     size_t size_bytes() const { return buf_.size() + index_.size() * 4; }
153*6dbdd20aSAndroid Build Coastguard Worker 
shrink_to_fit()154*6dbdd20aSAndroid Build Coastguard Worker     void shrink_to_fit() {
155*6dbdd20aSAndroid Build Coastguard Worker       buf_.shrink_to_fit();
156*6dbdd20aSAndroid Build Coastguard Worker       index_.shrink_to_fit();
157*6dbdd20aSAndroid Build Coastguard Worker     }
158*6dbdd20aSAndroid Build Coastguard Worker 
159*6dbdd20aSAndroid Build Coastguard Worker    private:
160*6dbdd20aSAndroid Build Coastguard Worker     TokenId num_tokens_ = 0;
161*6dbdd20aSAndroid Build Coastguard Worker 
162*6dbdd20aSAndroid Build Coastguard Worker     std::vector<char> buf_;  // Token buffer.
163*6dbdd20aSAndroid Build Coastguard Worker 
164*6dbdd20aSAndroid Build Coastguard Worker     // The value i-th in the vector contains the offset (within |buf_|) of the
165*6dbdd20aSAndroid Build Coastguard Worker     // (i * kTokenIndexSamplinig)-th token.
166*6dbdd20aSAndroid Build Coastguard Worker     std::vector<uint32_t> index_;
167*6dbdd20aSAndroid Build Coastguard Worker   };
168*6dbdd20aSAndroid Build Coastguard Worker 
169*6dbdd20aSAndroid Build Coastguard Worker  private:
170*6dbdd20aSAndroid Build Coastguard Worker   TokenTable tokens_;  // Token table.
171*6dbdd20aSAndroid Build Coastguard Worker 
172*6dbdd20aSAndroid Build Coastguard Worker   uint64_t base_addr_ = 0;    // Address of the first symbol (after sorting).
173*6dbdd20aSAndroid Build Coastguard Worker   size_t num_syms_ = 0;       // Number of valid symbols stored.
174*6dbdd20aSAndroid Build Coastguard Worker   std::vector<uint8_t> buf_;  // Symbol buffer.
175*6dbdd20aSAndroid Build Coastguard Worker 
176*6dbdd20aSAndroid Build Coastguard Worker   // The key is (address - base_addr_), the value is the byte offset in |buf_|
177*6dbdd20aSAndroid Build Coastguard Worker   // where the symbol entry starts (i.e. the start of the varint that tells the
178*6dbdd20aSAndroid Build Coastguard Worker   // delta from the previous symbol).
179*6dbdd20aSAndroid Build Coastguard Worker   std::vector<std::pair<uint32_t /*rel_addr*/, uint32_t /*offset*/>> index_;
180*6dbdd20aSAndroid Build Coastguard Worker };
181*6dbdd20aSAndroid Build Coastguard Worker 
182*6dbdd20aSAndroid Build Coastguard Worker }  // namespace perfetto
183*6dbdd20aSAndroid Build Coastguard Worker 
184*6dbdd20aSAndroid Build Coastguard Worker #endif  // SRC_KALLSYMS_KERNEL_SYMBOL_MAP_H_
185