1 // Copyright 2024 The Abseil Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/debugging/internal/decode_rust_punycode.h"
16
17 #include <cstddef>
18 #include <cstdint>
19 #include <cstring>
20
21 #include "absl/base/config.h"
22 #include "absl/base/nullability.h"
23 #include "absl/debugging/internal/bounded_utf8_length_sequence.h"
24 #include "absl/debugging/internal/utf8_for_code_point.h"
25
26 namespace absl {
27 ABSL_NAMESPACE_BEGIN
28 namespace debugging_internal {
29
30 namespace {
31
32 // Decoding Punycode requires repeated random-access insertion into a stream of
33 // variable-length UTF-8 code-point encodings. We need this to be tolerably
34 // fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data
35 // structures on the heap (async-signal-safety).
36 //
37 // It is pragmatic to impose a moderately low limit on the identifier length and
38 // bail out if we ever hit it. Then BoundedUtf8LengthSequence efficiently
39 // determines where to insert the next code point, and memmove efficiently makes
40 // room for it.
41 //
42 // The chosen limit is a round number several times larger than identifiers
43 // expected in practice, yet still small enough that a memmove of this many
44 // UTF-8 characters is not much more expensive than the division and modulus
45 // operations that Punycode decoding requires.
46 constexpr uint32_t kMaxChars = 256;
47
48 // Constants from RFC 3492 section 5.
49 constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700;
50
51 constexpr uint32_t kMaxCodePoint = 0x10ffff;
52
53 // Overflow threshold in DecodeRustPunycode's inner loop; see comments there.
54 constexpr uint32_t kMaxI = 1 << 30;
55
56 // If punycode_begin .. punycode_end begins with a prefix matching the regular
57 // expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final
58 // underscore into out_begin .. out_end, sets num_ascii_chars to the number of
59 // bytes copied, and returns true. (A prefix of this sort represents the
60 // nonempty subsequence of ASCII characters in the corresponding plaintext.)
61 //
62 // If punycode_begin .. punycode_end does not contain an underscore, sets
63 // num_ascii_chars to zero and returns true. (The encoding of a plaintext
64 // without any ASCII characters does not carry such a prefix.)
65 //
66 // Returns false and zeroes num_ascii_chars on failure (either parse error or
67 // not enough space in the output buffer).
ConsumeOptionalAsciiPrefix(const char * & punycode_begin,const char * const punycode_end,char * const out_begin,char * const out_end,uint32_t & num_ascii_chars)68 bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin,
69 const char* const punycode_end,
70 char* const out_begin,
71 char* const out_end,
72 uint32_t& num_ascii_chars) {
73 num_ascii_chars = 0;
74
75 // Remember the last underscore if any. Also use the same string scan to
76 // reject any ASCII bytes that do not belong in an identifier, including NUL,
77 // as well as non-ASCII bytes, which should have been delta-encoded instead.
78 int last_underscore = -1;
79 for (int i = 0; i < punycode_end - punycode_begin; ++i) {
80 const char c = punycode_begin[i];
81 if (c == '_') {
82 last_underscore = i;
83 continue;
84 }
85 // We write out the meaning of absl::ascii_isalnum rather than call that
86 // function because its documentation does not promise it will remain
87 // async-signal-safe under future development.
88 if ('a' <= c && c <= 'z') continue;
89 if ('A' <= c && c <= 'Z') continue;
90 if ('0' <= c && c <= '9') continue;
91 return false;
92 }
93
94 // If there was no underscore, that means there were no ASCII characters in
95 // the plaintext, so there is no prefix to consume. Our work is done.
96 if (last_underscore < 0) return true;
97
98 // Otherwise there will be an underscore delimiter somewhere. It can't be
99 // initial because then there would be no ASCII characters to its left, and no
100 // delimiter would have been added in that case.
101 if (last_underscore == 0) return false;
102
103 // Any other position is reasonable. Make sure there's room in the buffer.
104 if (last_underscore + 1 > out_end - out_begin) return false;
105
106 // Consume and write out the ASCII characters.
107 num_ascii_chars = static_cast<uint32_t>(last_underscore);
108 std::memcpy(out_begin, punycode_begin, num_ascii_chars);
109 out_begin[num_ascii_chars] = '\0';
110 punycode_begin += num_ascii_chars + 1;
111 return true;
112 }
113
114 // Returns the value of `c` as a base-36 digit according to RFC 3492 section 5,
115 // or -1 if `c` is not such a digit.
DigitValue(char c)116 int DigitValue(char c) {
117 if ('0' <= c && c <= '9') return c - '0' + 26;
118 if ('a' <= c && c <= 'z') return c - 'a';
119 if ('A' <= c && c <= 'Z') return c - 'A';
120 return -1;
121 }
122
123 // Consumes the next delta encoding from punycode_begin .. punycode_end,
124 // updating i accordingly. Returns true on success. Returns false on parse
125 // failure or arithmetic overflow.
ScanNextDelta(const char * & punycode_begin,const char * const punycode_end,uint32_t bias,uint32_t & i)126 bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end,
127 uint32_t bias, uint32_t& i) {
128 uint64_t w = 1; // 64 bits to prevent overflow in w *= kBase - t
129
130 // "for k = base to infinity in steps of base do begin ... end" in RFC 3492
131 // section 6.2. Each loop iteration scans one digit of the delta.
132 for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) {
133 const int digit_value = DigitValue(*punycode_begin++);
134 if (digit_value < 0) return false;
135
136 // Compute this in 64-bit arithmetic so we can check for overflow afterward.
137 const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w;
138
139 // Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but
140 // invalid input could encode an arbitrarily large delta. Nip that in the
141 // bud here.
142 static_assert(
143 kMaxI >= kMaxChars * kMaxCodePoint,
144 "kMaxI is too small to prevent spurious failures on good input");
145 if (new_i > kMaxI) return false;
146
147 static_assert(
148 kMaxI < (uint64_t{1} << 32),
149 "Make kMaxI smaller or i 64 bits wide to prevent silent wraparound");
150 i = static_cast<uint32_t>(new_i);
151
152 // Compute the threshold that determines whether this is the last digit and
153 // (if not) what the next digit's place value will be. This logic from RFC
154 // 3492 section 6.2 is explained in section 3.3.
155 uint32_t t;
156 if (k <= bias + kTMin) {
157 t = kTMin;
158 } else if (k >= bias + kTMax) {
159 t = kTMax;
160 } else {
161 t = k - bias;
162 }
163 if (static_cast<uint32_t>(digit_value) < t) return true;
164
165 // If this gets too large, the range check on new_i in the next iteration
166 // will catch it. We know this multiplication will not overwrap because w
167 // is 64 bits wide.
168 w *= kBase - t;
169 }
170 return false;
171 }
172
173 } // namespace
174
DecodeRustPunycode(DecodeRustPunycodeOptions options)175 absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) {
176 const char* punycode_begin = options.punycode_begin;
177 const char* const punycode_end = options.punycode_end;
178 char* const out_begin = options.out_begin;
179 char* const out_end = options.out_end;
180
181 // Write a NUL terminator first. Later memcpy calls will keep bumping it
182 // along to its new right place.
183 const size_t out_size = static_cast<size_t>(out_end - out_begin);
184 if (out_size == 0) return nullptr;
185 *out_begin = '\0';
186
187 // RFC 3492 section 6.2 begins here. We retain the names of integer variables
188 // appearing in that text.
189 uint32_t n = 128, i = 0, bias = 72, num_chars = 0;
190
191 // If there are any ASCII characters, consume them and their trailing
192 // underscore delimiter.
193 if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end,
194 out_begin, out_end, num_chars)) {
195 return nullptr;
196 }
197 uint32_t total_utf8_bytes = num_chars;
198
199 BoundedUtf8LengthSequence<kMaxChars> utf8_lengths;
200
201 // "while the input is not exhausted do begin ... end"
202 while (punycode_begin != punycode_end) {
203 if (num_chars >= kMaxChars) return nullptr;
204
205 const uint32_t old_i = i;
206
207 if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr;
208
209 // Update bias as in RFC 3492 section 6.1. (We have inlined adapt.)
210 uint32_t delta = i - old_i;
211 delta /= (old_i == 0 ? kDamp : 2);
212 delta += delta/(num_chars + 1);
213 bias = 0;
214 while (delta > ((kBase - kTMin) * kTMax)/2) {
215 delta /= kBase - kTMin;
216 bias += kBase;
217 }
218 bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew);
219
220 // Back in section 6.2, compute the new code point and insertion index.
221 static_assert(
222 kMaxI + kMaxCodePoint < (uint64_t{1} << 32),
223 "Make kMaxI smaller or n 64 bits wide to prevent silent wraparound");
224 n += i/(num_chars + 1);
225 i %= num_chars + 1;
226
227 // To actually insert, we need to convert the code point n to UTF-8 and the
228 // character index i to an index into the byte stream emitted so far. First
229 // prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values,
230 // and anything that won't fit into the remaining output storage.
231 Utf8ForCodePoint utf8_for_code_point(n);
232 if (!utf8_for_code_point.ok()) return nullptr;
233 if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) {
234 return nullptr;
235 }
236
237 // Now insert the new character into both our length map and the output.
238 uint32_t n_index =
239 utf8_lengths.InsertAndReturnSumOfPredecessors(
240 i, utf8_for_code_point.length);
241 std::memmove(
242 out_begin + n_index + utf8_for_code_point.length, out_begin + n_index,
243 total_utf8_bytes + 1 - n_index);
244 std::memcpy(out_begin + n_index, utf8_for_code_point.bytes,
245 utf8_for_code_point.length);
246 total_utf8_bytes += utf8_for_code_point.length;
247 ++num_chars;
248
249 // Finally, advance to the next state before continuing.
250 ++i;
251 }
252
253 return out_begin + total_utf8_bytes;
254 }
255
256 } // namespace debugging_internal
257 ABSL_NAMESPACE_END
258 } // namespace absl
259