1 // Copyright 2024 The Abseil Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/debugging/internal/demangle_rust.h"
16
17 #include <cstddef>
18 #include <cstdint>
19 #include <cstring>
20 #include <limits>
21
22 #include "absl/base/attributes.h"
23 #include "absl/base/config.h"
24 #include "absl/debugging/internal/decode_rust_punycode.h"
25
26 namespace absl {
27 ABSL_NAMESPACE_BEGIN
28 namespace debugging_internal {
29
30 namespace {
31
32 // Same step limit as the C++ demangler in demangle.cc uses.
33 constexpr int kMaxReturns = 1 << 17;
34
IsDigit(char c)35 bool IsDigit(char c) { return '0' <= c && c <= '9'; }
IsLower(char c)36 bool IsLower(char c) { return 'a' <= c && c <= 'z'; }
IsUpper(char c)37 bool IsUpper(char c) { return 'A' <= c && c <= 'Z'; }
IsAlpha(char c)38 bool IsAlpha(char c) { return IsLower(c) || IsUpper(c); }
IsIdentifierChar(char c)39 bool IsIdentifierChar(char c) { return IsAlpha(c) || IsDigit(c) || c == '_'; }
IsLowerHexDigit(char c)40 bool IsLowerHexDigit(char c) { return IsDigit(c) || ('a' <= c && c <= 'f'); }
41
BasicTypeName(char c)42 const char* BasicTypeName(char c) {
43 switch (c) {
44 case 'a': return "i8";
45 case 'b': return "bool";
46 case 'c': return "char";
47 case 'd': return "f64";
48 case 'e': return "str";
49 case 'f': return "f32";
50 case 'h': return "u8";
51 case 'i': return "isize";
52 case 'j': return "usize";
53 case 'l': return "i32";
54 case 'm': return "u32";
55 case 'n': return "i128";
56 case 'o': return "u128";
57 case 'p': return "_";
58 case 's': return "i16";
59 case 't': return "u16";
60 case 'u': return "()";
61 case 'v': return "...";
62 case 'x': return "i64";
63 case 'y': return "u64";
64 case 'z': return "!";
65 }
66 return nullptr;
67 }
68
69 // Parser for Rust symbol mangling v0, whose grammar is defined here:
70 //
71 // https://doc.rust-lang.org/rustc/symbol-mangling/v0.html#symbol-grammar-summary
72 class RustSymbolParser {
73 public:
74 // Prepares to demangle the given encoding, a Rust symbol name starting with
75 // _R, into the output buffer [out, out_end). The caller is expected to
76 // continue by calling the new object's Parse function.
RustSymbolParser(const char * encoding,char * out,char * const out_end)77 RustSymbolParser(const char* encoding, char* out, char* const out_end)
78 : encoding_(encoding), out_(out), out_end_(out_end) {
79 if (out_ != out_end_) *out_ = '\0';
80 }
81
82 // Parses the constructor's encoding argument, writing output into the range
83 // [out, out_end). Returns true on success and false for input whose
84 // structure was not recognized or exceeded implementation limits, such as by
85 // nesting structures too deep. In either case *this should not be used
86 // again.
Parse()87 ABSL_MUST_USE_RESULT bool Parse() && {
88 // Recursively parses the grammar production named by callee, then resumes
89 // execution at the next statement.
90 //
91 // Recursive-descent parsing is a beautifully readable translation of a
92 // grammar, but it risks stack overflow if implemented by naive recursion on
93 // the C++ call stack. So we simulate recursion by goto and switch instead,
94 // keeping a bounded stack of "return addresses" in the recursion_stack_
95 // member.
96 //
97 // The callee argument is a statement label. We goto that label after
98 // saving the "return address" on recursion_stack_. The next continue
99 // statement in the for loop below "returns" from this "call".
100 //
101 // The caller argument names the return point. Each value of caller must
102 // appear in only one ABSL_DEMANGLER_RECURSE call and be listed in the
103 // definition of enum ReturnAddress. The switch implements the control
104 // transfer from the end of a "called" subroutine back to the statement
105 // after the "call".
106 //
107 // Note that not all the grammar productions have to be packed into the
108 // switch, but only those which appear in a cycle in the grammar. Anything
109 // acyclic can be written as ordinary functions and function calls, e.g.,
110 // ParseIdentifier.
111 #define ABSL_DEMANGLER_RECURSE(callee, caller) \
112 do { \
113 if (recursion_depth_ == kStackSize) return false; \
114 /* The next continue will switch on this saved value ... */ \
115 recursion_stack_[recursion_depth_++] = caller; \
116 goto callee; \
117 /* ... and will land here, resuming the suspended code. */ \
118 case caller: {} \
119 } while (0)
120
121 // Parse the encoding, counting completed recursive calls to guard against
122 // excessively complex input and infinite-loop bugs.
123 int iter = 0;
124 goto whole_encoding;
125 for (; iter < kMaxReturns && recursion_depth_ > 0; ++iter) {
126 // This switch resumes the code path most recently suspended by
127 // ABSL_DEMANGLER_RECURSE.
128 switch (recursion_stack_[--recursion_depth_]) {
129 //
130 // symbol-name ->
131 // _R decimal-number? path instantiating-crate? vendor-specific-suffix?
132 whole_encoding:
133 if (!Eat('_') || !Eat('R')) return false;
134 // decimal-number? is always empty today, so proceed to path, which
135 // can't start with a decimal digit.
136 ABSL_DEMANGLER_RECURSE(path, kInstantiatingCrate);
137 if (IsAlpha(Peek())) {
138 ++silence_depth_; // Print nothing more from here on.
139 ABSL_DEMANGLER_RECURSE(path, kVendorSpecificSuffix);
140 }
141 switch (Take()) {
142 case '.': case '$': case '\0': return true;
143 }
144 return false; // unexpected trailing content
145
146 // path -> crate-root | inherent-impl | trait-impl | trait-definition |
147 // nested-path | generic-args | backref
148 //
149 // Note that ABSL_DEMANGLER_RECURSE does not work inside a nested switch
150 // (which would hide the generated case label). Thus we jump out of the
151 // inner switch with gotos before performing any fake recursion.
152 path:
153 switch (Take()) {
154 case 'C': goto crate_root;
155 case 'M': goto inherent_impl;
156 case 'X': goto trait_impl;
157 case 'Y': goto trait_definition;
158 case 'N': goto nested_path;
159 case 'I': goto generic_args;
160 case 'B': goto path_backref;
161 default: return false;
162 }
163
164 // crate-root -> C identifier (C consumed above)
165 crate_root:
166 if (!ParseIdentifier()) return false;
167 continue;
168
169 // inherent-impl -> M impl-path type (M already consumed)
170 inherent_impl:
171 if (!Emit("<")) return false;
172 ABSL_DEMANGLER_RECURSE(impl_path, kInherentImplType);
173 ABSL_DEMANGLER_RECURSE(type, kInherentImplEnding);
174 if (!Emit(">")) return false;
175 continue;
176
177 // trait-impl -> X impl-path type path (X already consumed)
178 trait_impl:
179 if (!Emit("<")) return false;
180 ABSL_DEMANGLER_RECURSE(impl_path, kTraitImplType);
181 ABSL_DEMANGLER_RECURSE(type, kTraitImplInfix);
182 if (!Emit(" as ")) return false;
183 ABSL_DEMANGLER_RECURSE(path, kTraitImplEnding);
184 if (!Emit(">")) return false;
185 continue;
186
187 // impl-path -> disambiguator? path (but never print it!)
188 impl_path:
189 ++silence_depth_;
190 {
191 int ignored_disambiguator;
192 if (!ParseDisambiguator(ignored_disambiguator)) return false;
193 }
194 ABSL_DEMANGLER_RECURSE(path, kImplPathEnding);
195 --silence_depth_;
196 continue;
197
198 // trait-definition -> Y type path (Y already consumed)
199 trait_definition:
200 if (!Emit("<")) return false;
201 ABSL_DEMANGLER_RECURSE(type, kTraitDefinitionInfix);
202 if (!Emit(" as ")) return false;
203 ABSL_DEMANGLER_RECURSE(path, kTraitDefinitionEnding);
204 if (!Emit(">")) return false;
205 continue;
206
207 // nested-path -> N namespace path identifier (N already consumed)
208 // namespace -> lower | upper
209 nested_path:
210 // Uppercase namespaces must be saved on a stack so we can print
211 // ::{closure#0} or ::{shim:vtable#0} or ::{X:name#0} as needed.
212 if (IsUpper(Peek())) {
213 if (!PushNamespace(Take())) return false;
214 ABSL_DEMANGLER_RECURSE(path, kIdentifierInUppercaseNamespace);
215 if (!Emit("::")) return false;
216 if (!ParseIdentifier(PopNamespace())) return false;
217 continue;
218 }
219
220 // Lowercase namespaces, however, are never represented in the output;
221 // they all emit just ::name.
222 if (IsLower(Take())) {
223 ABSL_DEMANGLER_RECURSE(path, kIdentifierInLowercaseNamespace);
224 if (!Emit("::")) return false;
225 if (!ParseIdentifier()) return false;
226 continue;
227 }
228
229 // Neither upper or lower
230 return false;
231
232 // type -> basic-type | array-type | slice-type | tuple-type |
233 // ref-type | mut-ref-type | const-ptr-type | mut-ptr-type |
234 // fn-type | dyn-trait-type | path | backref
235 //
236 // We use ifs instead of switch (Take()) because the default case jumps
237 // to path, which will need to see the first character not yet Taken
238 // from the input. Because we do not use a nested switch here,
239 // ABSL_DEMANGLER_RECURSE works fine in the 'S' case.
240 type:
241 if (IsLower(Peek())) {
242 const char* type_name = BasicTypeName(Take());
243 if (type_name == nullptr || !Emit(type_name)) return false;
244 continue;
245 }
246 if (Eat('A')) {
247 // array-type = A type const
248 if (!Emit("[")) return false;
249 ABSL_DEMANGLER_RECURSE(type, kArraySize);
250 if (!Emit("; ")) return false;
251 ABSL_DEMANGLER_RECURSE(constant, kFinishArray);
252 if (!Emit("]")) return false;
253 continue;
254 }
255 if (Eat('S')) {
256 if (!Emit("[")) return false;
257 ABSL_DEMANGLER_RECURSE(type, kSliceEnding);
258 if (!Emit("]")) return false;
259 continue;
260 }
261 if (Eat('T')) goto tuple_type;
262 if (Eat('R')) {
263 if (!Emit("&")) return false;
264 if (!ParseOptionalLifetime()) return false;
265 goto type;
266 }
267 if (Eat('Q')) {
268 if (!Emit("&mut ")) return false;
269 if (!ParseOptionalLifetime()) return false;
270 goto type;
271 }
272 if (Eat('P')) {
273 if (!Emit("*const ")) return false;
274 goto type;
275 }
276 if (Eat('O')) {
277 if (!Emit("*mut ")) return false;
278 goto type;
279 }
280 if (Eat('F')) goto fn_type;
281 if (Eat('D')) goto dyn_trait_type;
282 if (Eat('B')) goto type_backref;
283 goto path;
284
285 // tuple-type -> T type* E (T already consumed)
286 tuple_type:
287 if (!Emit("(")) return false;
288
289 // The toolchain should call the unit type u instead of TE, but the
290 // grammar and other demanglers also recognize TE, so we do too.
291 if (Eat('E')) {
292 if (!Emit(")")) return false;
293 continue;
294 }
295
296 // A tuple with one element is rendered (type,) instead of (type).
297 ABSL_DEMANGLER_RECURSE(type, kAfterFirstTupleElement);
298 if (Eat('E')) {
299 if (!Emit(",)")) return false;
300 continue;
301 }
302
303 // A tuple with two elements is of course (x, y).
304 if (!Emit(", ")) return false;
305 ABSL_DEMANGLER_RECURSE(type, kAfterSecondTupleElement);
306 if (Eat('E')) {
307 if (!Emit(")")) return false;
308 continue;
309 }
310
311 // And (x, y, z) for three elements.
312 if (!Emit(", ")) return false;
313 ABSL_DEMANGLER_RECURSE(type, kAfterThirdTupleElement);
314 if (Eat('E')) {
315 if (!Emit(")")) return false;
316 continue;
317 }
318
319 // For longer tuples we write (x, y, z, ...), printing none of the
320 // content of the fourth and later types. Thus we avoid exhausting
321 // output buffers and human readers' patience when some library has a
322 // long tuple as an implementation detail, without having to
323 // completely obfuscate all tuples.
324 if (!Emit(", ...)")) return false;
325 ++silence_depth_;
326 while (!Eat('E')) {
327 ABSL_DEMANGLER_RECURSE(type, kAfterSubsequentTupleElement);
328 }
329 --silence_depth_;
330 continue;
331
332 // fn-type -> F fn-sig (F already consumed)
333 // fn-sig -> binder? U? (K abi)? type* E type
334 // abi -> C | undisambiguated-identifier
335 //
336 // We follow the C++ demangler in suppressing details of function
337 // signatures. Every function type is rendered "fn...".
338 fn_type:
339 if (!Emit("fn...")) return false;
340 ++silence_depth_;
341 if (!ParseOptionalBinder()) return false;
342 (void)Eat('U');
343 if (Eat('K')) {
344 if (!Eat('C') && !ParseUndisambiguatedIdentifier()) return false;
345 }
346 while (!Eat('E')) {
347 ABSL_DEMANGLER_RECURSE(type, kContinueParameterList);
348 }
349 ABSL_DEMANGLER_RECURSE(type, kFinishFn);
350 --silence_depth_;
351 continue;
352
353 // dyn-trait-type -> D dyn-bounds lifetime (D already consumed)
354 // dyn-bounds -> binder? dyn-trait* E
355 //
356 // The grammar strangely allows an empty trait list, even though the
357 // compiler should never output one. We follow existing demanglers in
358 // rendering DEL_ as "dyn ".
359 //
360 // Because auto traits lengthen a type name considerably without
361 // providing much value to a search for related source code, it would be
362 // desirable to abbreviate
363 // dyn main::Trait + std::marker::Copy + std::marker::Send
364 // to
365 // dyn main::Trait + ...,
366 // eliding the auto traits. But it is difficult to do so correctly, in
367 // part because there is no guarantee that the mangling will list the
368 // main trait first. So we just print all the traits in their order of
369 // appearance in the mangled name.
370 dyn_trait_type:
371 if (!Emit("dyn ")) return false;
372 if (!ParseOptionalBinder()) return false;
373 if (!Eat('E')) {
374 ABSL_DEMANGLER_RECURSE(dyn_trait, kBeginAutoTraits);
375 while (!Eat('E')) {
376 if (!Emit(" + ")) return false;
377 ABSL_DEMANGLER_RECURSE(dyn_trait, kContinueAutoTraits);
378 }
379 }
380 if (!ParseRequiredLifetime()) return false;
381 continue;
382
383 // dyn-trait -> path dyn-trait-assoc-binding*
384 // dyn-trait-assoc-binding -> p undisambiguated-identifier type
385 //
386 // We render nonempty binding lists as <>, omitting their contents as
387 // for generic-args.
388 dyn_trait:
389 ABSL_DEMANGLER_RECURSE(path, kContinueDynTrait);
390 if (Peek() == 'p') {
391 if (!Emit("<>")) return false;
392 ++silence_depth_;
393 while (Eat('p')) {
394 if (!ParseUndisambiguatedIdentifier()) return false;
395 ABSL_DEMANGLER_RECURSE(type, kContinueAssocBinding);
396 }
397 --silence_depth_;
398 }
399 continue;
400
401 // const -> type const-data | p | backref
402 //
403 // const is a C++ keyword, so we use the label `constant` instead.
404 constant:
405 if (Eat('B')) goto const_backref;
406 if (Eat('p')) {
407 if (!Emit("_")) return false;
408 continue;
409 }
410
411 // Scan the type without printing it.
412 //
413 // The Rust language restricts the type of a const generic argument
414 // much more than the mangling grammar does. We do not enforce this.
415 //
416 // We also do not bother printing false, true, 'A', and '\u{abcd}' for
417 // the types bool and char. Because we do not print generic-args
418 // contents, we expect to print constants only in array sizes, and
419 // those should not be bool or char.
420 ++silence_depth_;
421 ABSL_DEMANGLER_RECURSE(type, kConstData);
422 --silence_depth_;
423
424 // const-data -> n? hex-digit* _
425 //
426 // Although the grammar doesn't say this, existing demanglers expect
427 // that zero is 0, not an empty digit sequence, and no nonzero value
428 // may have leading zero digits. Also n0_ is accepted and printed as
429 // -0, though a toolchain will probably never write that encoding.
430 if (Eat('n') && !EmitChar('-')) return false;
431 if (!Emit("0x")) return false;
432 if (Eat('0')) {
433 if (!EmitChar('0')) return false;
434 if (!Eat('_')) return false;
435 continue;
436 }
437 while (IsLowerHexDigit(Peek())) {
438 if (!EmitChar(Take())) return false;
439 }
440 if (!Eat('_')) return false;
441 continue;
442
443 // generic-args -> I path generic-arg* E (I already consumed)
444 //
445 // We follow the C++ demangler in omitting all the arguments from the
446 // output, printing only the list opening and closing tokens.
447 generic_args:
448 ABSL_DEMANGLER_RECURSE(path, kBeginGenericArgList);
449 if (!Emit("::<>")) return false;
450 ++silence_depth_;
451 while (!Eat('E')) {
452 ABSL_DEMANGLER_RECURSE(generic_arg, kContinueGenericArgList);
453 }
454 --silence_depth_;
455 continue;
456
457 // generic-arg -> lifetime | type | K const
458 generic_arg:
459 if (Peek() == 'L') {
460 if (!ParseOptionalLifetime()) return false;
461 continue;
462 }
463 if (Eat('K')) goto constant;
464 goto type;
465
466 // backref -> B base-62-number (B already consumed)
467 //
468 // The BeginBackref call parses and range-checks the base-62-number. We
469 // always do that much.
470 //
471 // The recursive call parses and prints what the backref points at. We
472 // save CPU and stack by skipping this work if the output would be
473 // suppressed anyway.
474 path_backref:
475 if (!BeginBackref()) return false;
476 if (silence_depth_ == 0) {
477 ABSL_DEMANGLER_RECURSE(path, kPathBackrefEnding);
478 }
479 EndBackref();
480 continue;
481
482 // This represents the same backref production as in path_backref but
483 // parses the target as a type instead of a path.
484 type_backref:
485 if (!BeginBackref()) return false;
486 if (silence_depth_ == 0) {
487 ABSL_DEMANGLER_RECURSE(type, kTypeBackrefEnding);
488 }
489 EndBackref();
490 continue;
491
492 const_backref:
493 if (!BeginBackref()) return false;
494 if (silence_depth_ == 0) {
495 ABSL_DEMANGLER_RECURSE(constant, kConstantBackrefEnding);
496 }
497 EndBackref();
498 continue;
499 }
500 }
501
502 return false; // hit iteration limit or a bug in our stack handling
503 }
504
505 private:
506 // Enumerates resumption points for ABSL_DEMANGLER_RECURSE calls.
507 enum ReturnAddress : uint8_t {
508 kInstantiatingCrate,
509 kVendorSpecificSuffix,
510 kIdentifierInUppercaseNamespace,
511 kIdentifierInLowercaseNamespace,
512 kInherentImplType,
513 kInherentImplEnding,
514 kTraitImplType,
515 kTraitImplInfix,
516 kTraitImplEnding,
517 kImplPathEnding,
518 kTraitDefinitionInfix,
519 kTraitDefinitionEnding,
520 kArraySize,
521 kFinishArray,
522 kSliceEnding,
523 kAfterFirstTupleElement,
524 kAfterSecondTupleElement,
525 kAfterThirdTupleElement,
526 kAfterSubsequentTupleElement,
527 kContinueParameterList,
528 kFinishFn,
529 kBeginAutoTraits,
530 kContinueAutoTraits,
531 kContinueDynTrait,
532 kContinueAssocBinding,
533 kConstData,
534 kBeginGenericArgList,
535 kContinueGenericArgList,
536 kPathBackrefEnding,
537 kTypeBackrefEnding,
538 kConstantBackrefEnding,
539 };
540
541 // Element counts for the stack arrays. Larger stack sizes accommodate more
542 // deeply nested names at the cost of a larger footprint on the C++ call
543 // stack.
544 enum {
545 // Maximum recursive calls outstanding at one time.
546 kStackSize = 256,
547
548 // Maximum N<uppercase> nested-paths open at once. We do not expect
549 // closures inside closures inside closures as much as functions inside
550 // modules inside other modules, so we can use a smaller array here.
551 kNamespaceStackSize = 64,
552
553 // Maximum number of nested backrefs. We can keep this stack pretty small
554 // because we do not follow backrefs inside generic-args or other contexts
555 // that suppress printing, so deep stacking is unlikely in practice.
556 kPositionStackSize = 16,
557 };
558
559 // Returns the next input character without consuming it.
Peek() const560 char Peek() const { return encoding_[pos_]; }
561
562 // Consumes and returns the next input character.
Take()563 char Take() { return encoding_[pos_++]; }
564
565 // If the next input character is the given character, consumes it and returns
566 // true; otherwise returns false without consuming a character.
Eat(char want)567 ABSL_MUST_USE_RESULT bool Eat(char want) {
568 if (encoding_[pos_] != want) return false;
569 ++pos_;
570 return true;
571 }
572
573 // Provided there is enough remaining output space, appends c to the output,
574 // writing a fresh NUL terminator afterward, and returns true. Returns false
575 // if the output buffer had less than two bytes free.
EmitChar(char c)576 ABSL_MUST_USE_RESULT bool EmitChar(char c) {
577 if (silence_depth_ > 0) return true;
578 if (out_end_ - out_ < 2) return false;
579 *out_++ = c;
580 *out_ = '\0';
581 return true;
582 }
583
584 // Provided there is enough remaining output space, appends the C string token
585 // to the output, followed by a NUL character, and returns true. Returns
586 // false if not everything fit into the output buffer.
Emit(const char * token)587 ABSL_MUST_USE_RESULT bool Emit(const char* token) {
588 if (silence_depth_ > 0) return true;
589 const size_t token_length = std::strlen(token);
590 const size_t bytes_to_copy = token_length + 1; // token and final NUL
591 if (static_cast<size_t>(out_end_ - out_) < bytes_to_copy) return false;
592 std::memcpy(out_, token, bytes_to_copy);
593 out_ += token_length;
594 return true;
595 }
596
597 // Provided there is enough remaining output space, appends the decimal form
598 // of disambiguator (if it's nonnegative) or "?" (if it's negative) to the
599 // output, followed by a NUL character, and returns true. Returns false if
600 // not everything fit into the output buffer.
EmitDisambiguator(int disambiguator)601 ABSL_MUST_USE_RESULT bool EmitDisambiguator(int disambiguator) {
602 if (disambiguator < 0) return EmitChar('?'); // parsed but too large
603 if (disambiguator == 0) return EmitChar('0');
604 // Convert disambiguator to decimal text. Three digits per byte is enough
605 // because 999 > 256. The bound will remain correct even if future
606 // maintenance changes the type of the disambiguator variable.
607 char digits[3 * sizeof(disambiguator)] = {};
608 size_t leading_digit_index = sizeof(digits) - 1;
609 for (; disambiguator > 0; disambiguator /= 10) {
610 digits[--leading_digit_index] =
611 static_cast<char>('0' + disambiguator % 10);
612 }
613 return Emit(digits + leading_digit_index);
614 }
615
616 // Consumes an optional disambiguator (s123_) from the input.
617 //
618 // On success returns true and fills value with the encoded value if it was
619 // not too big, otherwise with -1. If the optional disambiguator was omitted,
620 // value is 0. On parse failure returns false and sets value to -1.
ParseDisambiguator(int & value)621 ABSL_MUST_USE_RESULT bool ParseDisambiguator(int& value) {
622 value = -1;
623
624 // disambiguator = s base-62-number
625 //
626 // Disambiguators are optional. An omitted disambiguator is zero.
627 if (!Eat('s')) {
628 value = 0;
629 return true;
630 }
631 int base_62_value = 0;
632 if (!ParseBase62Number(base_62_value)) return false;
633 value = base_62_value < 0 ? -1 : base_62_value + 1;
634 return true;
635 }
636
637 // Consumes a base-62 number like _ or 123_ from the input.
638 //
639 // On success returns true and fills value with the encoded value if it was
640 // not too big, otherwise with -1. On parse failure returns false and sets
641 // value to -1.
ParseBase62Number(int & value)642 ABSL_MUST_USE_RESULT bool ParseBase62Number(int& value) {
643 value = -1;
644
645 // base-62-number = (digit | lower | upper)* _
646 //
647 // An empty base-62 digit sequence means 0.
648 if (Eat('_')) {
649 value = 0;
650 return true;
651 }
652
653 // A nonempty digit sequence denotes its base-62 value plus 1.
654 int encoded_number = 0;
655 bool overflowed = false;
656 while (IsAlpha(Peek()) || IsDigit(Peek())) {
657 const char c = Take();
658 if (encoded_number >= std::numeric_limits<int>::max()/62) {
659 // If we are close to overflowing an int, keep parsing but stop updating
660 // encoded_number and remember to return -1 at the end. The point is to
661 // avoid undefined behavior while parsing crate-root disambiguators,
662 // which are large in practice but not shown in demangling, while
663 // successfully computing closure and shim disambiguators, which are
664 // typically small and are printed out.
665 overflowed = true;
666 } else {
667 int digit;
668 if (IsDigit(c)) {
669 digit = c - '0';
670 } else if (IsLower(c)) {
671 digit = c - 'a' + 10;
672 } else {
673 digit = c - 'A' + 36;
674 }
675 encoded_number = 62 * encoded_number + digit;
676 }
677 }
678
679 if (!Eat('_')) return false;
680 if (!overflowed) value = encoded_number + 1;
681 return true;
682 }
683
684 // Consumes an identifier from the input, returning true on success.
685 //
686 // A nonzero uppercase_namespace specifies the character after the N in a
687 // nested-identifier, e.g., 'C' for a closure, allowing ParseIdentifier to
688 // write out the name with the conventional decoration for that namespace.
ParseIdentifier(char uppercase_namespace='\\0')689 ABSL_MUST_USE_RESULT bool ParseIdentifier(char uppercase_namespace = '\0') {
690 // identifier -> disambiguator? undisambiguated-identifier
691 int disambiguator = 0;
692 if (!ParseDisambiguator(disambiguator)) return false;
693
694 return ParseUndisambiguatedIdentifier(uppercase_namespace, disambiguator);
695 }
696
697 // Consumes from the input an identifier with no preceding disambiguator,
698 // returning true on success.
699 //
700 // When ParseIdentifier calls this, it passes the N<namespace> character and
701 // disambiguator value so that "{closure#42}" and similar forms can be
702 // rendered correctly.
703 //
704 // At other appearances of undisambiguated-identifier in the grammar, this
705 // treatment is not applicable, and the call site omits both arguments.
ParseUndisambiguatedIdentifier(char uppercase_namespace='\\0',int disambiguator=0)706 ABSL_MUST_USE_RESULT bool ParseUndisambiguatedIdentifier(
707 char uppercase_namespace = '\0', int disambiguator = 0) {
708 // undisambiguated-identifier -> u? decimal-number _? bytes
709 const bool is_punycoded = Eat('u');
710 if (!IsDigit(Peek())) return false;
711 int num_bytes = 0;
712 if (!ParseDecimalNumber(num_bytes)) return false;
713 (void)Eat('_'); // optional separator, needed if a digit follows
714 if (is_punycoded) {
715 DecodeRustPunycodeOptions options;
716 options.punycode_begin = &encoding_[pos_];
717 options.punycode_end = &encoding_[pos_] + num_bytes;
718 options.out_begin = out_;
719 options.out_end = out_end_;
720 out_ = DecodeRustPunycode(options);
721 if (out_ == nullptr) return false;
722 pos_ += static_cast<size_t>(num_bytes);
723 }
724
725 // Emit the beginnings of braced forms like {shim:vtable#0}.
726 if (uppercase_namespace != '\0') {
727 switch (uppercase_namespace) {
728 case 'C':
729 if (!Emit("{closure")) return false;
730 break;
731 case 'S':
732 if (!Emit("{shim")) return false;
733 break;
734 default:
735 if (!EmitChar('{') || !EmitChar(uppercase_namespace)) return false;
736 break;
737 }
738 if (num_bytes > 0 && !Emit(":")) return false;
739 }
740
741 // Emit the name itself.
742 if (!is_punycoded) {
743 for (int i = 0; i < num_bytes; ++i) {
744 const char c = Take();
745 if (!IsIdentifierChar(c) &&
746 // The spec gives toolchains the choice of Punycode or raw UTF-8 for
747 // identifiers containing code points above 0x7f, so accept bytes
748 // with the high bit set.
749 (c & 0x80) == 0) {
750 return false;
751 }
752 if (!EmitChar(c)) return false;
753 }
754 }
755
756 // Emit the endings of braced forms, e.g., "#42}".
757 if (uppercase_namespace != '\0') {
758 if (!EmitChar('#')) return false;
759 if (!EmitDisambiguator(disambiguator)) return false;
760 if (!EmitChar('}')) return false;
761 }
762
763 return true;
764 }
765
766 // Consumes a decimal number like 0 or 123 from the input. On success returns
767 // true and fills value with the encoded value. If the encoded value is too
768 // large or otherwise unparsable, returns false and sets value to -1.
ParseDecimalNumber(int & value)769 ABSL_MUST_USE_RESULT bool ParseDecimalNumber(int& value) {
770 value = -1;
771 if (!IsDigit(Peek())) return false;
772 int encoded_number = Take() - '0';
773 if (encoded_number == 0) {
774 // Decimal numbers are never encoded with extra leading zeroes.
775 value = 0;
776 return true;
777 }
778 while (IsDigit(Peek()) &&
779 // avoid overflow
780 encoded_number < std::numeric_limits<int>::max()/10) {
781 encoded_number = 10 * encoded_number + (Take() - '0');
782 }
783 if (IsDigit(Peek())) return false; // too big
784 value = encoded_number;
785 return true;
786 }
787
788 // Consumes a binder of higher-ranked lifetimes if one is present. On success
789 // returns true and discards the encoded lifetime count. On parse failure
790 // returns false.
ParseOptionalBinder()791 ABSL_MUST_USE_RESULT bool ParseOptionalBinder() {
792 // binder -> G base-62-number
793 if (!Eat('G')) return true;
794 int ignored_binding_count;
795 return ParseBase62Number(ignored_binding_count);
796 }
797
798 // Consumes a lifetime if one is present.
799 //
800 // On success returns true and discards the lifetime index. We do not print
801 // or even range-check lifetimes because they are a finer detail than other
802 // things we omit from output, such as the entire contents of generic-args.
803 //
804 // On parse failure returns false.
ParseOptionalLifetime()805 ABSL_MUST_USE_RESULT bool ParseOptionalLifetime() {
806 // lifetime -> L base-62-number
807 if (!Eat('L')) return true;
808 int ignored_de_bruijn_index;
809 return ParseBase62Number(ignored_de_bruijn_index);
810 }
811
812 // Consumes a lifetime just like ParseOptionalLifetime, but returns false if
813 // there is no lifetime here.
ParseRequiredLifetime()814 ABSL_MUST_USE_RESULT bool ParseRequiredLifetime() {
815 if (Peek() != 'L') return false;
816 return ParseOptionalLifetime();
817 }
818
819 // Pushes ns onto the namespace stack and returns true if the stack is not
820 // full, else returns false.
PushNamespace(char ns)821 ABSL_MUST_USE_RESULT bool PushNamespace(char ns) {
822 if (namespace_depth_ == kNamespaceStackSize) return false;
823 namespace_stack_[namespace_depth_++] = ns;
824 return true;
825 }
826
827 // Pops the last pushed namespace. Requires that the namespace stack is not
828 // empty (namespace_depth_ > 0).
PopNamespace()829 char PopNamespace() { return namespace_stack_[--namespace_depth_]; }
830
831 // Pushes position onto the position stack and returns true if the stack is
832 // not full, else returns false.
PushPosition(int position)833 ABSL_MUST_USE_RESULT bool PushPosition(int position) {
834 if (position_depth_ == kPositionStackSize) return false;
835 position_stack_[position_depth_++] = position;
836 return true;
837 }
838
839 // Pops the last pushed input position. Requires that the position stack is
840 // not empty (position_depth_ > 0).
PopPosition()841 int PopPosition() { return position_stack_[--position_depth_]; }
842
843 // Consumes a base-62-number denoting a backref target, pushes the current
844 // input position on the data stack, and sets the input position to the
845 // beginning of the backref target. Returns true on success. Returns false
846 // if parsing failed, the stack is exhausted, or the backref target position
847 // is out of range.
BeginBackref()848 ABSL_MUST_USE_RESULT bool BeginBackref() {
849 // backref = B base-62-number (B already consumed)
850 //
851 // Reject backrefs that don't parse, overflow int, or don't point backward.
852 // If the offset looks fine, adjust it to account for the _R prefix.
853 int offset = 0;
854 const int offset_of_this_backref =
855 pos_ - 2 /* _R */ - 1 /* B already consumed */;
856 if (!ParseBase62Number(offset) || offset < 0 ||
857 offset >= offset_of_this_backref) {
858 return false;
859 }
860 offset += 2;
861
862 // Save the old position to restore later.
863 if (!PushPosition(pos_)) return false;
864
865 // Move the input position to the backref target.
866 //
867 // Note that we do not check whether the new position points to the
868 // beginning of a construct matching the context in which the backref
869 // appeared. We just jump to it and see whether nested parsing succeeds.
870 // We therefore accept various wrong manglings, e.g., a type backref
871 // pointing to an 'l' character inside an identifier, which happens to mean
872 // i32 when parsed as a type mangling. This saves the complexity and RAM
873 // footprint of remembering which offsets began which kinds of
874 // substructures. Existing demanglers take similar shortcuts.
875 pos_ = offset;
876 return true;
877 }
878
879 // Cleans up after a backref production by restoring the previous input
880 // position from the data stack.
EndBackref()881 void EndBackref() { pos_ = PopPosition(); }
882
883 // The leftmost recursion_depth_ elements of recursion_stack_ contain the
884 // ReturnAddresses pushed by ABSL_DEMANGLER_RECURSE calls not yet completed.
885 ReturnAddress recursion_stack_[kStackSize] = {};
886 int recursion_depth_ = 0;
887
888 // The leftmost namespace_depth_ elements of namespace_stack_ contain the
889 // uppercase namespace identifiers for open nested-paths, e.g., 'C' for a
890 // closure.
891 char namespace_stack_[kNamespaceStackSize] = {};
892 int namespace_depth_ = 0;
893
894 // The leftmost position_depth_ elements of position_stack_ contain the input
895 // positions to return to after fully printing the targets of backrefs.
896 int position_stack_[kPositionStackSize] = {};
897 int position_depth_ = 0;
898
899 // Anything parsed while silence_depth_ > 0 contributes nothing to the
900 // demangled output. For constructs omitted from the demangling, such as
901 // impl-path and the contents of generic-args, we will increment
902 // silence_depth_ on the way in and decrement silence_depth_ on the way out.
903 int silence_depth_ = 0;
904
905 // Input: encoding_ points to a Rust mangled symbol, and encoding_[pos_] is
906 // the next input character to be scanned.
907 int pos_ = 0;
908 const char* encoding_ = nullptr;
909
910 // Output: *out_ is where the next output character should be written, and
911 // out_end_ points past the last byte of available space.
912 char* out_ = nullptr;
913 char* out_end_ = nullptr;
914 };
915
916 } // namespace
917
DemangleRustSymbolEncoding(const char * mangled,char * out,size_t out_size)918 bool DemangleRustSymbolEncoding(const char* mangled, char* out,
919 size_t out_size) {
920 return RustSymbolParser(mangled, out, out + out_size).Parse();
921 }
922
923 } // namespace debugging_internal
924 ABSL_NAMESPACE_END
925 } // namespace absl
926