xref: /aosp_15_r20/external/abseil-cpp/absl/debugging/internal/decode_rust_punycode.cc (revision 9356374a3709195abf420251b3e825997ff56c0f)
1 // Copyright 2024 The Abseil Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/debugging/internal/decode_rust_punycode.h"
16 
17 #include <cstddef>
18 #include <cstdint>
19 #include <cstring>
20 
21 #include "absl/base/config.h"
22 #include "absl/base/nullability.h"
23 #include "absl/debugging/internal/bounded_utf8_length_sequence.h"
24 #include "absl/debugging/internal/utf8_for_code_point.h"
25 
26 namespace absl {
27 ABSL_NAMESPACE_BEGIN
28 namespace debugging_internal {
29 
30 namespace {
31 
32 // Decoding Punycode requires repeated random-access insertion into a stream of
33 // variable-length UTF-8 code-point encodings.  We need this to be tolerably
34 // fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data
35 // structures on the heap (async-signal-safety).
36 //
37 // It is pragmatic to impose a moderately low limit on the identifier length and
38 // bail out if we ever hit it.  Then BoundedUtf8LengthSequence efficiently
39 // determines where to insert the next code point, and memmove efficiently makes
40 // room for it.
41 //
42 // The chosen limit is a round number several times larger than identifiers
43 // expected in practice, yet still small enough that a memmove of this many
44 // UTF-8 characters is not much more expensive than the division and modulus
45 // operations that Punycode decoding requires.
46 constexpr uint32_t kMaxChars = 256;
47 
48 // Constants from RFC 3492 section 5.
49 constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700;
50 
51 constexpr uint32_t kMaxCodePoint = 0x10ffff;
52 
53 // Overflow threshold in DecodeRustPunycode's inner loop; see comments there.
54 constexpr uint32_t kMaxI = 1 << 30;
55 
56 // If punycode_begin .. punycode_end begins with a prefix matching the regular
57 // expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final
58 // underscore into out_begin .. out_end, sets num_ascii_chars to the number of
59 // bytes copied, and returns true.  (A prefix of this sort represents the
60 // nonempty subsequence of ASCII characters in the corresponding plaintext.)
61 //
62 // If punycode_begin .. punycode_end does not contain an underscore, sets
63 // num_ascii_chars to zero and returns true.  (The encoding of a plaintext
64 // without any ASCII characters does not carry such a prefix.)
65 //
66 // Returns false and zeroes num_ascii_chars on failure (either parse error or
67 // not enough space in the output buffer).
ConsumeOptionalAsciiPrefix(const char * & punycode_begin,const char * const punycode_end,char * const out_begin,char * const out_end,uint32_t & num_ascii_chars)68 bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin,
69                                 const char* const punycode_end,
70                                 char* const out_begin,
71                                 char* const out_end,
72                                 uint32_t& num_ascii_chars) {
73   num_ascii_chars = 0;
74 
75   // Remember the last underscore if any.  Also use the same string scan to
76   // reject any ASCII bytes that do not belong in an identifier, including NUL,
77   // as well as non-ASCII bytes, which should have been delta-encoded instead.
78   int last_underscore = -1;
79   for (int i = 0; i < punycode_end - punycode_begin; ++i) {
80     const char c = punycode_begin[i];
81     if (c == '_') {
82       last_underscore = i;
83       continue;
84     }
85     // We write out the meaning of absl::ascii_isalnum rather than call that
86     // function because its documentation does not promise it will remain
87     // async-signal-safe under future development.
88     if ('a' <= c && c <= 'z') continue;
89     if ('A' <= c && c <= 'Z') continue;
90     if ('0' <= c && c <= '9') continue;
91     return false;
92   }
93 
94   // If there was no underscore, that means there were no ASCII characters in
95   // the plaintext, so there is no prefix to consume.  Our work is done.
96   if (last_underscore < 0) return true;
97 
98   // Otherwise there will be an underscore delimiter somewhere.  It can't be
99   // initial because then there would be no ASCII characters to its left, and no
100   // delimiter would have been added in that case.
101   if (last_underscore == 0) return false;
102 
103   // Any other position is reasonable.  Make sure there's room in the buffer.
104   if (last_underscore + 1 > out_end - out_begin) return false;
105 
106   // Consume and write out the ASCII characters.
107   num_ascii_chars = static_cast<uint32_t>(last_underscore);
108   std::memcpy(out_begin, punycode_begin, num_ascii_chars);
109   out_begin[num_ascii_chars] = '\0';
110   punycode_begin += num_ascii_chars + 1;
111   return true;
112 }
113 
114 // Returns the value of `c` as a base-36 digit according to RFC 3492 section 5,
115 // or -1 if `c` is not such a digit.
DigitValue(char c)116 int DigitValue(char c) {
117   if ('0' <= c && c <= '9') return c - '0' + 26;
118   if ('a' <= c && c <= 'z') return c - 'a';
119   if ('A' <= c && c <= 'Z') return c - 'A';
120   return -1;
121 }
122 
123 // Consumes the next delta encoding from punycode_begin .. punycode_end,
124 // updating i accordingly.  Returns true on success.  Returns false on parse
125 // failure or arithmetic overflow.
ScanNextDelta(const char * & punycode_begin,const char * const punycode_end,uint32_t bias,uint32_t & i)126 bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end,
127                    uint32_t bias, uint32_t& i) {
128   uint64_t w = 1;  // 64 bits to prevent overflow in w *= kBase - t
129 
130   // "for k = base to infinity in steps of base do begin ... end" in RFC 3492
131   // section 6.2.  Each loop iteration scans one digit of the delta.
132   for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) {
133     const int digit_value = DigitValue(*punycode_begin++);
134     if (digit_value < 0) return false;
135 
136     // Compute this in 64-bit arithmetic so we can check for overflow afterward.
137     const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w;
138 
139     // Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but
140     // invalid input could encode an arbitrarily large delta.  Nip that in the
141     // bud here.
142     static_assert(
143         kMaxI >= kMaxChars * kMaxCodePoint,
144         "kMaxI is too small to prevent spurious failures on good input");
145     if (new_i > kMaxI) return false;
146 
147     static_assert(
148         kMaxI < (uint64_t{1} << 32),
149         "Make kMaxI smaller or i 64 bits wide to prevent silent wraparound");
150     i = static_cast<uint32_t>(new_i);
151 
152     // Compute the threshold that determines whether this is the last digit and
153     // (if not) what the next digit's place value will be.  This logic from RFC
154     // 3492 section 6.2 is explained in section 3.3.
155     uint32_t t;
156     if (k <= bias + kTMin) {
157       t = kTMin;
158     } else if (k >= bias + kTMax) {
159       t = kTMax;
160     } else {
161       t = k - bias;
162     }
163     if (static_cast<uint32_t>(digit_value) < t) return true;
164 
165     // If this gets too large, the range check on new_i in the next iteration
166     // will catch it.  We know this multiplication will not overwrap because w
167     // is 64 bits wide.
168     w *= kBase - t;
169   }
170   return false;
171 }
172 
173 }  // namespace
174 
DecodeRustPunycode(DecodeRustPunycodeOptions options)175 absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) {
176   const char* punycode_begin = options.punycode_begin;
177   const char* const punycode_end = options.punycode_end;
178   char* const out_begin = options.out_begin;
179   char* const out_end = options.out_end;
180 
181   // Write a NUL terminator first.  Later memcpy calls will keep bumping it
182   // along to its new right place.
183   const size_t out_size = static_cast<size_t>(out_end - out_begin);
184   if (out_size == 0) return nullptr;
185   *out_begin = '\0';
186 
187   // RFC 3492 section 6.2 begins here.  We retain the names of integer variables
188   // appearing in that text.
189   uint32_t n = 128, i = 0, bias = 72, num_chars = 0;
190 
191   // If there are any ASCII characters, consume them and their trailing
192   // underscore delimiter.
193   if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end,
194                                   out_begin, out_end, num_chars)) {
195     return nullptr;
196   }
197   uint32_t total_utf8_bytes = num_chars;
198 
199   BoundedUtf8LengthSequence<kMaxChars> utf8_lengths;
200 
201   // "while the input is not exhausted do begin ... end"
202   while (punycode_begin != punycode_end) {
203     if (num_chars >= kMaxChars) return nullptr;
204 
205     const uint32_t old_i = i;
206 
207     if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr;
208 
209     // Update bias as in RFC 3492 section 6.1.  (We have inlined adapt.)
210     uint32_t delta = i - old_i;
211     delta /= (old_i == 0 ? kDamp : 2);
212     delta += delta/(num_chars + 1);
213     bias = 0;
214     while (delta > ((kBase - kTMin) * kTMax)/2) {
215       delta /= kBase - kTMin;
216       bias += kBase;
217     }
218     bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew);
219 
220     // Back in section 6.2, compute the new code point and insertion index.
221     static_assert(
222         kMaxI + kMaxCodePoint < (uint64_t{1} << 32),
223         "Make kMaxI smaller or n 64 bits wide to prevent silent wraparound");
224     n += i/(num_chars + 1);
225     i %= num_chars + 1;
226 
227     // To actually insert, we need to convert the code point n to UTF-8 and the
228     // character index i to an index into the byte stream emitted so far.  First
229     // prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values,
230     // and anything that won't fit into the remaining output storage.
231     Utf8ForCodePoint utf8_for_code_point(n);
232     if (!utf8_for_code_point.ok()) return nullptr;
233     if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) {
234       return nullptr;
235     }
236 
237     // Now insert the new character into both our length map and the output.
238     uint32_t n_index =
239         utf8_lengths.InsertAndReturnSumOfPredecessors(
240             i, utf8_for_code_point.length);
241     std::memmove(
242         out_begin + n_index + utf8_for_code_point.length, out_begin + n_index,
243         total_utf8_bytes + 1 - n_index);
244     std::memcpy(out_begin + n_index, utf8_for_code_point.bytes,
245                 utf8_for_code_point.length);
246     total_utf8_bytes += utf8_for_code_point.length;
247     ++num_chars;
248 
249     // Finally, advance to the next state before continuing.
250     ++i;
251   }
252 
253   return out_begin + total_utf8_bytes;
254 }
255 
256 }  // namespace debugging_internal
257 ABSL_NAMESPACE_END
258 }  // namespace absl
259