xref: /aosp_15_r20/external/abseil-cpp/absl/debugging/internal/demangle_rust.cc (revision 9356374a3709195abf420251b3e825997ff56c0f)
1 // Copyright 2024 The Abseil Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/debugging/internal/demangle_rust.h"
16 
17 #include <cstddef>
18 #include <cstdint>
19 #include <cstring>
20 #include <limits>
21 
22 #include "absl/base/attributes.h"
23 #include "absl/base/config.h"
24 #include "absl/debugging/internal/decode_rust_punycode.h"
25 
26 namespace absl {
27 ABSL_NAMESPACE_BEGIN
28 namespace debugging_internal {
29 
30 namespace {
31 
32 // Same step limit as the C++ demangler in demangle.cc uses.
33 constexpr int kMaxReturns = 1 << 17;
34 
IsDigit(char c)35 bool IsDigit(char c) { return '0' <= c && c <= '9'; }
IsLower(char c)36 bool IsLower(char c) { return 'a' <= c && c <= 'z'; }
IsUpper(char c)37 bool IsUpper(char c) { return 'A' <= c && c <= 'Z'; }
IsAlpha(char c)38 bool IsAlpha(char c) { return IsLower(c) || IsUpper(c); }
IsIdentifierChar(char c)39 bool IsIdentifierChar(char c) { return IsAlpha(c) || IsDigit(c) || c == '_'; }
IsLowerHexDigit(char c)40 bool IsLowerHexDigit(char c) { return IsDigit(c) || ('a' <= c && c <= 'f'); }
41 
BasicTypeName(char c)42 const char* BasicTypeName(char c) {
43   switch (c) {
44     case 'a': return "i8";
45     case 'b': return "bool";
46     case 'c': return "char";
47     case 'd': return "f64";
48     case 'e': return "str";
49     case 'f': return "f32";
50     case 'h': return "u8";
51     case 'i': return "isize";
52     case 'j': return "usize";
53     case 'l': return "i32";
54     case 'm': return "u32";
55     case 'n': return "i128";
56     case 'o': return "u128";
57     case 'p': return "_";
58     case 's': return "i16";
59     case 't': return "u16";
60     case 'u': return "()";
61     case 'v': return "...";
62     case 'x': return "i64";
63     case 'y': return "u64";
64     case 'z': return "!";
65   }
66   return nullptr;
67 }
68 
69 // Parser for Rust symbol mangling v0, whose grammar is defined here:
70 //
71 // https://doc.rust-lang.org/rustc/symbol-mangling/v0.html#symbol-grammar-summary
72 class RustSymbolParser {
73  public:
74   // Prepares to demangle the given encoding, a Rust symbol name starting with
75   // _R, into the output buffer [out, out_end).  The caller is expected to
76   // continue by calling the new object's Parse function.
RustSymbolParser(const char * encoding,char * out,char * const out_end)77   RustSymbolParser(const char* encoding, char* out, char* const out_end)
78       : encoding_(encoding), out_(out), out_end_(out_end) {
79     if (out_ != out_end_) *out_ = '\0';
80   }
81 
82   // Parses the constructor's encoding argument, writing output into the range
83   // [out, out_end).  Returns true on success and false for input whose
84   // structure was not recognized or exceeded implementation limits, such as by
85   // nesting structures too deep.  In either case *this should not be used
86   // again.
Parse()87   ABSL_MUST_USE_RESULT bool Parse() && {
88     // Recursively parses the grammar production named by callee, then resumes
89     // execution at the next statement.
90     //
91     // Recursive-descent parsing is a beautifully readable translation of a
92     // grammar, but it risks stack overflow if implemented by naive recursion on
93     // the C++ call stack.  So we simulate recursion by goto and switch instead,
94     // keeping a bounded stack of "return addresses" in the recursion_stack_
95     // member.
96     //
97     // The callee argument is a statement label.  We goto that label after
98     // saving the "return address" on recursion_stack_.  The next continue
99     // statement in the for loop below "returns" from this "call".
100     //
101     // The caller argument names the return point.  Each value of caller must
102     // appear in only one ABSL_DEMANGLER_RECURSE call and be listed in the
103     // definition of enum ReturnAddress.  The switch implements the control
104     // transfer from the end of a "called" subroutine back to the statement
105     // after the "call".
106     //
107     // Note that not all the grammar productions have to be packed into the
108     // switch, but only those which appear in a cycle in the grammar.  Anything
109     // acyclic can be written as ordinary functions and function calls, e.g.,
110     // ParseIdentifier.
111 #define ABSL_DEMANGLER_RECURSE(callee, caller) \
112     do { \
113       if (recursion_depth_ == kStackSize) return false; \
114       /* The next continue will switch on this saved value ... */ \
115       recursion_stack_[recursion_depth_++] = caller; \
116       goto callee; \
117       /* ... and will land here, resuming the suspended code. */ \
118       case caller: {} \
119     } while (0)
120 
121     // Parse the encoding, counting completed recursive calls to guard against
122     // excessively complex input and infinite-loop bugs.
123     int iter = 0;
124     goto whole_encoding;
125     for (; iter < kMaxReturns && recursion_depth_ > 0; ++iter) {
126       // This switch resumes the code path most recently suspended by
127       // ABSL_DEMANGLER_RECURSE.
128       switch (recursion_stack_[--recursion_depth_]) {
129         //
130         // symbol-name ->
131         // _R decimal-number? path instantiating-crate? vendor-specific-suffix?
132         whole_encoding:
133           if (!Eat('_') || !Eat('R')) return false;
134           // decimal-number? is always empty today, so proceed to path, which
135           // can't start with a decimal digit.
136           ABSL_DEMANGLER_RECURSE(path, kInstantiatingCrate);
137           if (IsAlpha(Peek())) {
138             ++silence_depth_;  // Print nothing more from here on.
139             ABSL_DEMANGLER_RECURSE(path, kVendorSpecificSuffix);
140           }
141           switch (Take()) {
142             case '.': case '$': case '\0': return true;
143           }
144           return false;  // unexpected trailing content
145 
146         // path -> crate-root | inherent-impl | trait-impl | trait-definition |
147         //         nested-path | generic-args | backref
148         //
149         // Note that ABSL_DEMANGLER_RECURSE does not work inside a nested switch
150         // (which would hide the generated case label).  Thus we jump out of the
151         // inner switch with gotos before performing any fake recursion.
152         path:
153           switch (Take()) {
154             case 'C': goto crate_root;
155             case 'M': goto inherent_impl;
156             case 'X': goto trait_impl;
157             case 'Y': goto trait_definition;
158             case 'N': goto nested_path;
159             case 'I': goto generic_args;
160             case 'B': goto path_backref;
161             default: return false;
162           }
163 
164         // crate-root -> C identifier (C consumed above)
165         crate_root:
166           if (!ParseIdentifier()) return false;
167           continue;
168 
169         // inherent-impl -> M impl-path type (M already consumed)
170         inherent_impl:
171           if (!Emit("<")) return false;
172           ABSL_DEMANGLER_RECURSE(impl_path, kInherentImplType);
173           ABSL_DEMANGLER_RECURSE(type, kInherentImplEnding);
174           if (!Emit(">")) return false;
175           continue;
176 
177         // trait-impl -> X impl-path type path (X already consumed)
178         trait_impl:
179           if (!Emit("<")) return false;
180           ABSL_DEMANGLER_RECURSE(impl_path, kTraitImplType);
181           ABSL_DEMANGLER_RECURSE(type, kTraitImplInfix);
182           if (!Emit(" as ")) return false;
183           ABSL_DEMANGLER_RECURSE(path, kTraitImplEnding);
184           if (!Emit(">")) return false;
185           continue;
186 
187         // impl-path -> disambiguator? path (but never print it!)
188         impl_path:
189           ++silence_depth_;
190           {
191             int ignored_disambiguator;
192             if (!ParseDisambiguator(ignored_disambiguator)) return false;
193           }
194           ABSL_DEMANGLER_RECURSE(path, kImplPathEnding);
195           --silence_depth_;
196           continue;
197 
198         // trait-definition -> Y type path (Y already consumed)
199         trait_definition:
200           if (!Emit("<")) return false;
201           ABSL_DEMANGLER_RECURSE(type, kTraitDefinitionInfix);
202           if (!Emit(" as ")) return false;
203           ABSL_DEMANGLER_RECURSE(path, kTraitDefinitionEnding);
204           if (!Emit(">")) return false;
205           continue;
206 
207         // nested-path -> N namespace path identifier (N already consumed)
208         // namespace -> lower | upper
209         nested_path:
210           // Uppercase namespaces must be saved on a stack so we can print
211           // ::{closure#0} or ::{shim:vtable#0} or ::{X:name#0} as needed.
212           if (IsUpper(Peek())) {
213             if (!PushNamespace(Take())) return false;
214             ABSL_DEMANGLER_RECURSE(path, kIdentifierInUppercaseNamespace);
215             if (!Emit("::")) return false;
216             if (!ParseIdentifier(PopNamespace())) return false;
217             continue;
218           }
219 
220           // Lowercase namespaces, however, are never represented in the output;
221           // they all emit just ::name.
222           if (IsLower(Take())) {
223             ABSL_DEMANGLER_RECURSE(path, kIdentifierInLowercaseNamespace);
224             if (!Emit("::")) return false;
225             if (!ParseIdentifier()) return false;
226             continue;
227           }
228 
229           // Neither upper or lower
230           return false;
231 
232         // type -> basic-type | array-type | slice-type | tuple-type |
233         //         ref-type | mut-ref-type | const-ptr-type | mut-ptr-type |
234         //         fn-type | dyn-trait-type | path | backref
235         //
236         // We use ifs instead of switch (Take()) because the default case jumps
237         // to path, which will need to see the first character not yet Taken
238         // from the input.  Because we do not use a nested switch here,
239         // ABSL_DEMANGLER_RECURSE works fine in the 'S' case.
240         type:
241           if (IsLower(Peek())) {
242             const char* type_name = BasicTypeName(Take());
243             if (type_name == nullptr || !Emit(type_name)) return false;
244             continue;
245           }
246           if (Eat('A')) {
247             // array-type = A type const
248             if (!Emit("[")) return false;
249             ABSL_DEMANGLER_RECURSE(type, kArraySize);
250             if (!Emit("; ")) return false;
251             ABSL_DEMANGLER_RECURSE(constant, kFinishArray);
252             if (!Emit("]")) return false;
253             continue;
254           }
255           if (Eat('S')) {
256             if (!Emit("[")) return false;
257             ABSL_DEMANGLER_RECURSE(type, kSliceEnding);
258             if (!Emit("]")) return false;
259             continue;
260           }
261           if (Eat('T')) goto tuple_type;
262           if (Eat('R')) {
263             if (!Emit("&")) return false;
264             if (!ParseOptionalLifetime()) return false;
265             goto type;
266           }
267           if (Eat('Q')) {
268             if (!Emit("&mut ")) return false;
269             if (!ParseOptionalLifetime()) return false;
270             goto type;
271           }
272           if (Eat('P')) {
273             if (!Emit("*const ")) return false;
274             goto type;
275           }
276           if (Eat('O')) {
277             if (!Emit("*mut ")) return false;
278             goto type;
279           }
280           if (Eat('F')) goto fn_type;
281           if (Eat('D')) goto dyn_trait_type;
282           if (Eat('B')) goto type_backref;
283           goto path;
284 
285         // tuple-type -> T type* E (T already consumed)
286         tuple_type:
287           if (!Emit("(")) return false;
288 
289           // The toolchain should call the unit type u instead of TE, but the
290           // grammar and other demanglers also recognize TE, so we do too.
291           if (Eat('E')) {
292             if (!Emit(")")) return false;
293             continue;
294           }
295 
296           // A tuple with one element is rendered (type,) instead of (type).
297           ABSL_DEMANGLER_RECURSE(type, kAfterFirstTupleElement);
298           if (Eat('E')) {
299             if (!Emit(",)")) return false;
300             continue;
301           }
302 
303           // A tuple with two elements is of course (x, y).
304           if (!Emit(", ")) return false;
305           ABSL_DEMANGLER_RECURSE(type, kAfterSecondTupleElement);
306           if (Eat('E')) {
307             if (!Emit(")")) return false;
308             continue;
309           }
310 
311           // And (x, y, z) for three elements.
312           if (!Emit(", ")) return false;
313           ABSL_DEMANGLER_RECURSE(type, kAfterThirdTupleElement);
314           if (Eat('E')) {
315             if (!Emit(")")) return false;
316             continue;
317           }
318 
319           // For longer tuples we write (x, y, z, ...), printing none of the
320           // content of the fourth and later types.  Thus we avoid exhausting
321           // output buffers and human readers' patience when some library has a
322           // long tuple as an implementation detail, without having to
323           // completely obfuscate all tuples.
324           if (!Emit(", ...)")) return false;
325           ++silence_depth_;
326           while (!Eat('E')) {
327             ABSL_DEMANGLER_RECURSE(type, kAfterSubsequentTupleElement);
328           }
329           --silence_depth_;
330           continue;
331 
332         // fn-type -> F fn-sig (F already consumed)
333         // fn-sig -> binder? U? (K abi)? type* E type
334         // abi -> C | undisambiguated-identifier
335         //
336         // We follow the C++ demangler in suppressing details of function
337         // signatures.  Every function type is rendered "fn...".
338         fn_type:
339           if (!Emit("fn...")) return false;
340           ++silence_depth_;
341           if (!ParseOptionalBinder()) return false;
342           (void)Eat('U');
343           if (Eat('K')) {
344             if (!Eat('C') && !ParseUndisambiguatedIdentifier()) return false;
345           }
346           while (!Eat('E')) {
347             ABSL_DEMANGLER_RECURSE(type, kContinueParameterList);
348           }
349           ABSL_DEMANGLER_RECURSE(type, kFinishFn);
350           --silence_depth_;
351           continue;
352 
353         // dyn-trait-type -> D dyn-bounds lifetime (D already consumed)
354         // dyn-bounds -> binder? dyn-trait* E
355         //
356         // The grammar strangely allows an empty trait list, even though the
357         // compiler should never output one.  We follow existing demanglers in
358         // rendering DEL_ as "dyn ".
359         //
360         // Because auto traits lengthen a type name considerably without
361         // providing much value to a search for related source code, it would be
362         // desirable to abbreviate
363         //     dyn main::Trait + std::marker::Copy + std::marker::Send
364         // to
365         //     dyn main::Trait + ...,
366         // eliding the auto traits.  But it is difficult to do so correctly, in
367         // part because there is no guarantee that the mangling will list the
368         // main trait first.  So we just print all the traits in their order of
369         // appearance in the mangled name.
370         dyn_trait_type:
371           if (!Emit("dyn ")) return false;
372           if (!ParseOptionalBinder()) return false;
373           if (!Eat('E')) {
374             ABSL_DEMANGLER_RECURSE(dyn_trait, kBeginAutoTraits);
375             while (!Eat('E')) {
376               if (!Emit(" + ")) return false;
377               ABSL_DEMANGLER_RECURSE(dyn_trait, kContinueAutoTraits);
378             }
379           }
380           if (!ParseRequiredLifetime()) return false;
381           continue;
382 
383         // dyn-trait -> path dyn-trait-assoc-binding*
384         // dyn-trait-assoc-binding -> p undisambiguated-identifier type
385         //
386         // We render nonempty binding lists as <>, omitting their contents as
387         // for generic-args.
388         dyn_trait:
389           ABSL_DEMANGLER_RECURSE(path, kContinueDynTrait);
390           if (Peek() == 'p') {
391             if (!Emit("<>")) return false;
392             ++silence_depth_;
393             while (Eat('p')) {
394               if (!ParseUndisambiguatedIdentifier()) return false;
395               ABSL_DEMANGLER_RECURSE(type, kContinueAssocBinding);
396             }
397             --silence_depth_;
398           }
399           continue;
400 
401         // const -> type const-data | p | backref
402         //
403         // const is a C++ keyword, so we use the label `constant` instead.
404         constant:
405           if (Eat('B')) goto const_backref;
406           if (Eat('p')) {
407             if (!Emit("_")) return false;
408             continue;
409           }
410 
411           // Scan the type without printing it.
412           //
413           // The Rust language restricts the type of a const generic argument
414           // much more than the mangling grammar does.  We do not enforce this.
415           //
416           // We also do not bother printing false, true, 'A', and '\u{abcd}' for
417           // the types bool and char.  Because we do not print generic-args
418           // contents, we expect to print constants only in array sizes, and
419           // those should not be bool or char.
420           ++silence_depth_;
421           ABSL_DEMANGLER_RECURSE(type, kConstData);
422           --silence_depth_;
423 
424           // const-data -> n? hex-digit* _
425           //
426           // Although the grammar doesn't say this, existing demanglers expect
427           // that zero is 0, not an empty digit sequence, and no nonzero value
428           // may have leading zero digits.  Also n0_ is accepted and printed as
429           // -0, though a toolchain will probably never write that encoding.
430           if (Eat('n') && !EmitChar('-')) return false;
431           if (!Emit("0x")) return false;
432           if (Eat('0')) {
433             if (!EmitChar('0')) return false;
434             if (!Eat('_')) return false;
435             continue;
436           }
437           while (IsLowerHexDigit(Peek())) {
438             if (!EmitChar(Take())) return false;
439           }
440           if (!Eat('_')) return false;
441           continue;
442 
443         // generic-args -> I path generic-arg* E (I already consumed)
444         //
445         // We follow the C++ demangler in omitting all the arguments from the
446         // output, printing only the list opening and closing tokens.
447         generic_args:
448           ABSL_DEMANGLER_RECURSE(path, kBeginGenericArgList);
449           if (!Emit("::<>")) return false;
450           ++silence_depth_;
451           while (!Eat('E')) {
452             ABSL_DEMANGLER_RECURSE(generic_arg, kContinueGenericArgList);
453           }
454           --silence_depth_;
455           continue;
456 
457         // generic-arg -> lifetime | type | K const
458         generic_arg:
459           if (Peek() == 'L') {
460             if (!ParseOptionalLifetime()) return false;
461             continue;
462           }
463           if (Eat('K')) goto constant;
464           goto type;
465 
466         // backref -> B base-62-number (B already consumed)
467         //
468         // The BeginBackref call parses and range-checks the base-62-number.  We
469         // always do that much.
470         //
471         // The recursive call parses and prints what the backref points at.  We
472         // save CPU and stack by skipping this work if the output would be
473         // suppressed anyway.
474         path_backref:
475           if (!BeginBackref()) return false;
476           if (silence_depth_ == 0) {
477             ABSL_DEMANGLER_RECURSE(path, kPathBackrefEnding);
478           }
479           EndBackref();
480           continue;
481 
482         // This represents the same backref production as in path_backref but
483         // parses the target as a type instead of a path.
484         type_backref:
485           if (!BeginBackref()) return false;
486           if (silence_depth_ == 0) {
487             ABSL_DEMANGLER_RECURSE(type, kTypeBackrefEnding);
488           }
489           EndBackref();
490           continue;
491 
492         const_backref:
493           if (!BeginBackref()) return false;
494           if (silence_depth_ == 0) {
495             ABSL_DEMANGLER_RECURSE(constant, kConstantBackrefEnding);
496           }
497           EndBackref();
498           continue;
499       }
500     }
501 
502     return false;  // hit iteration limit or a bug in our stack handling
503   }
504 
505  private:
506   // Enumerates resumption points for ABSL_DEMANGLER_RECURSE calls.
507   enum ReturnAddress : uint8_t {
508     kInstantiatingCrate,
509     kVendorSpecificSuffix,
510     kIdentifierInUppercaseNamespace,
511     kIdentifierInLowercaseNamespace,
512     kInherentImplType,
513     kInherentImplEnding,
514     kTraitImplType,
515     kTraitImplInfix,
516     kTraitImplEnding,
517     kImplPathEnding,
518     kTraitDefinitionInfix,
519     kTraitDefinitionEnding,
520     kArraySize,
521     kFinishArray,
522     kSliceEnding,
523     kAfterFirstTupleElement,
524     kAfterSecondTupleElement,
525     kAfterThirdTupleElement,
526     kAfterSubsequentTupleElement,
527     kContinueParameterList,
528     kFinishFn,
529     kBeginAutoTraits,
530     kContinueAutoTraits,
531     kContinueDynTrait,
532     kContinueAssocBinding,
533     kConstData,
534     kBeginGenericArgList,
535     kContinueGenericArgList,
536     kPathBackrefEnding,
537     kTypeBackrefEnding,
538     kConstantBackrefEnding,
539   };
540 
541   // Element counts for the stack arrays.  Larger stack sizes accommodate more
542   // deeply nested names at the cost of a larger footprint on the C++ call
543   // stack.
544   enum {
545     // Maximum recursive calls outstanding at one time.
546     kStackSize = 256,
547 
548     // Maximum N<uppercase> nested-paths open at once.  We do not expect
549     // closures inside closures inside closures as much as functions inside
550     // modules inside other modules, so we can use a smaller array here.
551     kNamespaceStackSize = 64,
552 
553     // Maximum number of nested backrefs.  We can keep this stack pretty small
554     // because we do not follow backrefs inside generic-args or other contexts
555     // that suppress printing, so deep stacking is unlikely in practice.
556     kPositionStackSize = 16,
557   };
558 
559   // Returns the next input character without consuming it.
Peek() const560   char Peek() const { return encoding_[pos_]; }
561 
562   // Consumes and returns the next input character.
Take()563   char Take() { return encoding_[pos_++]; }
564 
565   // If the next input character is the given character, consumes it and returns
566   // true; otherwise returns false without consuming a character.
Eat(char want)567   ABSL_MUST_USE_RESULT bool Eat(char want) {
568     if (encoding_[pos_] != want) return false;
569     ++pos_;
570     return true;
571   }
572 
573   // Provided there is enough remaining output space, appends c to the output,
574   // writing a fresh NUL terminator afterward, and returns true.  Returns false
575   // if the output buffer had less than two bytes free.
EmitChar(char c)576   ABSL_MUST_USE_RESULT bool EmitChar(char c) {
577     if (silence_depth_ > 0) return true;
578     if (out_end_ - out_ < 2) return false;
579     *out_++ = c;
580     *out_ = '\0';
581     return true;
582   }
583 
584   // Provided there is enough remaining output space, appends the C string token
585   // to the output, followed by a NUL character, and returns true.  Returns
586   // false if not everything fit into the output buffer.
Emit(const char * token)587   ABSL_MUST_USE_RESULT bool Emit(const char* token) {
588     if (silence_depth_ > 0) return true;
589     const size_t token_length = std::strlen(token);
590     const size_t bytes_to_copy = token_length + 1;  // token and final NUL
591     if (static_cast<size_t>(out_end_ - out_) < bytes_to_copy) return false;
592     std::memcpy(out_, token, bytes_to_copy);
593     out_ += token_length;
594     return true;
595   }
596 
597   // Provided there is enough remaining output space, appends the decimal form
598   // of disambiguator (if it's nonnegative) or "?" (if it's negative) to the
599   // output, followed by a NUL character, and returns true.  Returns false if
600   // not everything fit into the output buffer.
EmitDisambiguator(int disambiguator)601   ABSL_MUST_USE_RESULT bool EmitDisambiguator(int disambiguator) {
602     if (disambiguator < 0) return EmitChar('?');  // parsed but too large
603     if (disambiguator == 0) return EmitChar('0');
604     // Convert disambiguator to decimal text.  Three digits per byte is enough
605     // because 999 > 256.  The bound will remain correct even if future
606     // maintenance changes the type of the disambiguator variable.
607     char digits[3 * sizeof(disambiguator)] = {};
608     size_t leading_digit_index = sizeof(digits) - 1;
609     for (; disambiguator > 0; disambiguator /= 10) {
610       digits[--leading_digit_index] =
611           static_cast<char>('0' + disambiguator % 10);
612     }
613     return Emit(digits + leading_digit_index);
614   }
615 
616   // Consumes an optional disambiguator (s123_) from the input.
617   //
618   // On success returns true and fills value with the encoded value if it was
619   // not too big, otherwise with -1.  If the optional disambiguator was omitted,
620   // value is 0.  On parse failure returns false and sets value to -1.
ParseDisambiguator(int & value)621   ABSL_MUST_USE_RESULT bool ParseDisambiguator(int& value) {
622     value = -1;
623 
624     // disambiguator = s base-62-number
625     //
626     // Disambiguators are optional.  An omitted disambiguator is zero.
627     if (!Eat('s')) {
628       value = 0;
629       return true;
630     }
631     int base_62_value = 0;
632     if (!ParseBase62Number(base_62_value)) return false;
633     value = base_62_value < 0 ? -1 : base_62_value + 1;
634     return true;
635   }
636 
637   // Consumes a base-62 number like _ or 123_ from the input.
638   //
639   // On success returns true and fills value with the encoded value if it was
640   // not too big, otherwise with -1.  On parse failure returns false and sets
641   // value to -1.
ParseBase62Number(int & value)642   ABSL_MUST_USE_RESULT bool ParseBase62Number(int& value) {
643     value = -1;
644 
645     // base-62-number = (digit | lower | upper)* _
646     //
647     // An empty base-62 digit sequence means 0.
648     if (Eat('_')) {
649       value = 0;
650       return true;
651     }
652 
653     // A nonempty digit sequence denotes its base-62 value plus 1.
654     int encoded_number = 0;
655     bool overflowed = false;
656     while (IsAlpha(Peek()) || IsDigit(Peek())) {
657       const char c = Take();
658       if (encoded_number >= std::numeric_limits<int>::max()/62) {
659         // If we are close to overflowing an int, keep parsing but stop updating
660         // encoded_number and remember to return -1 at the end.  The point is to
661         // avoid undefined behavior while parsing crate-root disambiguators,
662         // which are large in practice but not shown in demangling, while
663         // successfully computing closure and shim disambiguators, which are
664         // typically small and are printed out.
665         overflowed = true;
666       } else {
667         int digit;
668         if (IsDigit(c)) {
669           digit = c - '0';
670         } else if (IsLower(c)) {
671           digit = c - 'a' + 10;
672         } else {
673           digit = c - 'A' + 36;
674         }
675         encoded_number = 62 * encoded_number + digit;
676       }
677     }
678 
679     if (!Eat('_')) return false;
680     if (!overflowed) value = encoded_number + 1;
681     return true;
682   }
683 
684   // Consumes an identifier from the input, returning true on success.
685   //
686   // A nonzero uppercase_namespace specifies the character after the N in a
687   // nested-identifier, e.g., 'C' for a closure, allowing ParseIdentifier to
688   // write out the name with the conventional decoration for that namespace.
ParseIdentifier(char uppercase_namespace='\\0')689   ABSL_MUST_USE_RESULT bool ParseIdentifier(char uppercase_namespace = '\0') {
690     // identifier -> disambiguator? undisambiguated-identifier
691     int disambiguator = 0;
692     if (!ParseDisambiguator(disambiguator)) return false;
693 
694     return ParseUndisambiguatedIdentifier(uppercase_namespace, disambiguator);
695   }
696 
697   // Consumes from the input an identifier with no preceding disambiguator,
698   // returning true on success.
699   //
700   // When ParseIdentifier calls this, it passes the N<namespace> character and
701   // disambiguator value so that "{closure#42}" and similar forms can be
702   // rendered correctly.
703   //
704   // At other appearances of undisambiguated-identifier in the grammar, this
705   // treatment is not applicable, and the call site omits both arguments.
ParseUndisambiguatedIdentifier(char uppercase_namespace='\\0',int disambiguator=0)706   ABSL_MUST_USE_RESULT bool ParseUndisambiguatedIdentifier(
707       char uppercase_namespace = '\0', int disambiguator = 0) {
708     // undisambiguated-identifier -> u? decimal-number _? bytes
709     const bool is_punycoded = Eat('u');
710     if (!IsDigit(Peek())) return false;
711     int num_bytes = 0;
712     if (!ParseDecimalNumber(num_bytes)) return false;
713     (void)Eat('_');  // optional separator, needed if a digit follows
714     if (is_punycoded) {
715       DecodeRustPunycodeOptions options;
716       options.punycode_begin = &encoding_[pos_];
717       options.punycode_end = &encoding_[pos_] + num_bytes;
718       options.out_begin = out_;
719       options.out_end = out_end_;
720       out_ = DecodeRustPunycode(options);
721       if (out_ == nullptr) return false;
722       pos_ += static_cast<size_t>(num_bytes);
723     }
724 
725     // Emit the beginnings of braced forms like {shim:vtable#0}.
726     if (uppercase_namespace != '\0') {
727       switch (uppercase_namespace) {
728         case 'C':
729           if (!Emit("{closure")) return false;
730           break;
731         case 'S':
732           if (!Emit("{shim")) return false;
733           break;
734         default:
735           if (!EmitChar('{') || !EmitChar(uppercase_namespace)) return false;
736           break;
737       }
738       if (num_bytes > 0 && !Emit(":")) return false;
739     }
740 
741     // Emit the name itself.
742     if (!is_punycoded) {
743       for (int i = 0; i < num_bytes; ++i) {
744         const char c = Take();
745         if (!IsIdentifierChar(c) &&
746             // The spec gives toolchains the choice of Punycode or raw UTF-8 for
747             // identifiers containing code points above 0x7f, so accept bytes
748             // with the high bit set.
749             (c & 0x80) == 0) {
750           return false;
751         }
752         if (!EmitChar(c)) return false;
753       }
754     }
755 
756     // Emit the endings of braced forms, e.g., "#42}".
757     if (uppercase_namespace != '\0') {
758       if (!EmitChar('#')) return false;
759       if (!EmitDisambiguator(disambiguator)) return false;
760       if (!EmitChar('}')) return false;
761     }
762 
763     return true;
764   }
765 
766   // Consumes a decimal number like 0 or 123 from the input.  On success returns
767   // true and fills value with the encoded value.  If the encoded value is too
768   // large or otherwise unparsable, returns false and sets value to -1.
ParseDecimalNumber(int & value)769   ABSL_MUST_USE_RESULT bool ParseDecimalNumber(int& value) {
770     value = -1;
771     if (!IsDigit(Peek())) return false;
772     int encoded_number = Take() - '0';
773     if (encoded_number == 0) {
774       // Decimal numbers are never encoded with extra leading zeroes.
775       value = 0;
776       return true;
777     }
778     while (IsDigit(Peek()) &&
779            // avoid overflow
780            encoded_number < std::numeric_limits<int>::max()/10) {
781       encoded_number = 10 * encoded_number + (Take() - '0');
782     }
783     if (IsDigit(Peek())) return false;  // too big
784     value = encoded_number;
785     return true;
786   }
787 
788   // Consumes a binder of higher-ranked lifetimes if one is present.  On success
789   // returns true and discards the encoded lifetime count.  On parse failure
790   // returns false.
ParseOptionalBinder()791   ABSL_MUST_USE_RESULT bool ParseOptionalBinder() {
792     // binder -> G base-62-number
793     if (!Eat('G')) return true;
794     int ignored_binding_count;
795     return ParseBase62Number(ignored_binding_count);
796   }
797 
798   // Consumes a lifetime if one is present.
799   //
800   // On success returns true and discards the lifetime index.  We do not print
801   // or even range-check lifetimes because they are a finer detail than other
802   // things we omit from output, such as the entire contents of generic-args.
803   //
804   // On parse failure returns false.
ParseOptionalLifetime()805   ABSL_MUST_USE_RESULT bool ParseOptionalLifetime() {
806     // lifetime -> L base-62-number
807     if (!Eat('L')) return true;
808     int ignored_de_bruijn_index;
809     return ParseBase62Number(ignored_de_bruijn_index);
810   }
811 
812   // Consumes a lifetime just like ParseOptionalLifetime, but returns false if
813   // there is no lifetime here.
ParseRequiredLifetime()814   ABSL_MUST_USE_RESULT bool ParseRequiredLifetime() {
815     if (Peek() != 'L') return false;
816     return ParseOptionalLifetime();
817   }
818 
819   // Pushes ns onto the namespace stack and returns true if the stack is not
820   // full, else returns false.
PushNamespace(char ns)821   ABSL_MUST_USE_RESULT bool PushNamespace(char ns) {
822     if (namespace_depth_ == kNamespaceStackSize) return false;
823     namespace_stack_[namespace_depth_++] = ns;
824     return true;
825   }
826 
827   // Pops the last pushed namespace.  Requires that the namespace stack is not
828   // empty (namespace_depth_ > 0).
PopNamespace()829   char PopNamespace() { return namespace_stack_[--namespace_depth_]; }
830 
831   // Pushes position onto the position stack and returns true if the stack is
832   // not full, else returns false.
PushPosition(int position)833   ABSL_MUST_USE_RESULT bool PushPosition(int position) {
834     if (position_depth_ == kPositionStackSize) return false;
835     position_stack_[position_depth_++] = position;
836     return true;
837   }
838 
839   // Pops the last pushed input position.  Requires that the position stack is
840   // not empty (position_depth_ > 0).
PopPosition()841   int PopPosition() { return position_stack_[--position_depth_]; }
842 
843   // Consumes a base-62-number denoting a backref target, pushes the current
844   // input position on the data stack, and sets the input position to the
845   // beginning of the backref target.  Returns true on success.  Returns false
846   // if parsing failed, the stack is exhausted, or the backref target position
847   // is out of range.
BeginBackref()848   ABSL_MUST_USE_RESULT bool BeginBackref() {
849     // backref = B base-62-number (B already consumed)
850     //
851     // Reject backrefs that don't parse, overflow int, or don't point backward.
852     // If the offset looks fine, adjust it to account for the _R prefix.
853     int offset = 0;
854     const int offset_of_this_backref =
855         pos_ - 2 /* _R */ - 1 /* B already consumed */;
856     if (!ParseBase62Number(offset) || offset < 0 ||
857         offset >= offset_of_this_backref) {
858       return false;
859     }
860     offset += 2;
861 
862     // Save the old position to restore later.
863     if (!PushPosition(pos_)) return false;
864 
865     // Move the input position to the backref target.
866     //
867     // Note that we do not check whether the new position points to the
868     // beginning of a construct matching the context in which the backref
869     // appeared.  We just jump to it and see whether nested parsing succeeds.
870     // We therefore accept various wrong manglings, e.g., a type backref
871     // pointing to an 'l' character inside an identifier, which happens to mean
872     // i32 when parsed as a type mangling.  This saves the complexity and RAM
873     // footprint of remembering which offsets began which kinds of
874     // substructures.  Existing demanglers take similar shortcuts.
875     pos_ = offset;
876     return true;
877   }
878 
879   // Cleans up after a backref production by restoring the previous input
880   // position from the data stack.
EndBackref()881   void EndBackref() { pos_ = PopPosition(); }
882 
883   // The leftmost recursion_depth_ elements of recursion_stack_ contain the
884   // ReturnAddresses pushed by ABSL_DEMANGLER_RECURSE calls not yet completed.
885   ReturnAddress recursion_stack_[kStackSize] = {};
886   int recursion_depth_ = 0;
887 
888   // The leftmost namespace_depth_ elements of namespace_stack_ contain the
889   // uppercase namespace identifiers for open nested-paths, e.g., 'C' for a
890   // closure.
891   char namespace_stack_[kNamespaceStackSize] = {};
892   int namespace_depth_ = 0;
893 
894   // The leftmost position_depth_ elements of position_stack_ contain the input
895   // positions to return to after fully printing the targets of backrefs.
896   int position_stack_[kPositionStackSize] = {};
897   int position_depth_ = 0;
898 
899   // Anything parsed while silence_depth_ > 0 contributes nothing to the
900   // demangled output.  For constructs omitted from the demangling, such as
901   // impl-path and the contents of generic-args, we will increment
902   // silence_depth_ on the way in and decrement silence_depth_ on the way out.
903   int silence_depth_ = 0;
904 
905   // Input: encoding_ points to a Rust mangled symbol, and encoding_[pos_] is
906   // the next input character to be scanned.
907   int pos_ = 0;
908   const char* encoding_ = nullptr;
909 
910   // Output: *out_ is where the next output character should be written, and
911   // out_end_ points past the last byte of available space.
912   char* out_ = nullptr;
913   char* out_end_ = nullptr;
914 };
915 
916 }  // namespace
917 
DemangleRustSymbolEncoding(const char * mangled,char * out,size_t out_size)918 bool DemangleRustSymbolEncoding(const char* mangled, char* out,
919                                 size_t out_size) {
920   return RustSymbolParser(mangled, out, out + out_size).Parse();
921 }
922 
923 }  // namespace debugging_internal
924 ABSL_NAMESPACE_END
925 }  // namespace absl
926