1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/strings/escaping.h"
16
17 #include <algorithm>
18 #include <cassert>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstring>
22 #include <limits>
23 #include <string>
24 #include <utility>
25
26 #include "absl/base/config.h"
27 #include "absl/base/internal/raw_logging.h"
28 #include "absl/base/internal/unaligned_access.h"
29 #include "absl/base/nullability.h"
30 #include "absl/strings/ascii.h"
31 #include "absl/strings/charset.h"
32 #include "absl/strings/internal/escaping.h"
33 #include "absl/strings/internal/resize_uninitialized.h"
34 #include "absl/strings/internal/utf8.h"
35 #include "absl/strings/numbers.h"
36 #include "absl/strings/str_cat.h"
37 #include "absl/strings/string_view.h"
38
39 namespace absl {
40 ABSL_NAMESPACE_BEGIN
41 namespace {
42
// Value passed as the 'leave_nulls_escaped' argument of CUnescapeInternal().
// false: escape sequences whose value is 0 are decoded like any other escape
// instead of being copied to the output in their escaped form.
constexpr bool kUnescapeNulls = false;
45
// Returns true iff 'c' is one of the ASCII octal digits '0' through '7'.
inline bool is_octal_digit(char c) {
  // Anything below '0' yields a negative difference, which becomes a huge
  // unsigned value, so a single comparison covers both range ends.
  return static_cast<unsigned int>(c - '0') < 8u;
}
47
hex_digit_to_int(char c)48 inline unsigned int hex_digit_to_int(char c) {
49 static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
50 "Character set must be ASCII.");
51 assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
52 unsigned int x = static_cast<unsigned char>(c);
53 if (x > '9') {
54 x += 9;
55 }
56 return x & 0xf;
57 }
58
IsSurrogate(char32_t c,absl::string_view src,absl::Nullable<std::string * > error)59 inline bool IsSurrogate(char32_t c, absl::string_view src,
60 absl::Nullable<std::string*> error) {
61 if (c >= 0xD800 && c <= 0xDFFF) {
62 if (error) {
63 *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
64 src);
65 }
66 return true;
67 }
68 return false;
69 }
70
71 // ----------------------------------------------------------------------
72 // CUnescapeInternal()
73 // Implements both CUnescape() and CUnescapeForNullTerminatedString().
74 //
75 // Unescapes C escape sequences and is the reverse of CEscape().
76 //
77 // If 'source' is valid, stores the unescaped string and its size in
78 // 'dest' and 'dest_len' respectively, and returns true. Otherwise
79 // returns false and optionally stores the error description in
80 // 'error'. Set 'error' to nullptr to disable error reporting.
81 //
82 // 'dest' should point to a buffer that is at least as big as 'source'.
83 // 'source' and 'dest' may be the same.
84 //
85 // NOTE: any changes to this function must also be reflected in the older
86 // UnescapeCEscapeSequences().
87 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,absl::Nonnull<char * > dest,absl::Nonnull<ptrdiff_t * > dest_len,absl::Nullable<std::string * > error)88 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
89 absl::Nonnull<char*> dest,
90 absl::Nonnull<ptrdiff_t*> dest_len,
91 absl::Nullable<std::string*> error) {
92 char* d = dest;
93 const char* p = source.data();
94 const char* end = p + source.size();
95 const char* last_byte = end - 1;
96
97 // Small optimization for case where source = dest and there's no escaping
98 while (p == d && p < end && *p != '\\') p++, d++;
99
100 while (p < end) {
101 if (*p != '\\') {
102 *d++ = *p++;
103 } else {
104 if (++p > last_byte) { // skip past the '\\'
105 if (error) *error = "String cannot end with \\";
106 return false;
107 }
108 switch (*p) {
109 case 'a': *d++ = '\a'; break;
110 case 'b': *d++ = '\b'; break;
111 case 'f': *d++ = '\f'; break;
112 case 'n': *d++ = '\n'; break;
113 case 'r': *d++ = '\r'; break;
114 case 't': *d++ = '\t'; break;
115 case 'v': *d++ = '\v'; break;
116 case '\\': *d++ = '\\'; break;
117 case '?': *d++ = '\?'; break; // \? Who knew?
118 case '\'': *d++ = '\''; break;
119 case '"': *d++ = '\"'; break;
120 case '0':
121 case '1':
122 case '2':
123 case '3':
124 case '4':
125 case '5':
126 case '6':
127 case '7': {
128 // octal digit: 1 to 3 digits
129 const char* octal_start = p;
130 unsigned int ch = static_cast<unsigned int>(*p - '0'); // digit 1
131 if (p < last_byte && is_octal_digit(p[1]))
132 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 2
133 if (p < last_byte && is_octal_digit(p[1]))
134 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 3
135 if (ch > 0xff) {
136 if (error) {
137 *error = "Value of \\" +
138 std::string(octal_start,
139 static_cast<size_t>(p + 1 - octal_start)) +
140 " exceeds 0xff";
141 }
142 return false;
143 }
144 if ((ch == 0) && leave_nulls_escaped) {
145 // Copy the escape sequence for the null character
146 const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
147 *d++ = '\\';
148 memmove(d, octal_start, octal_size);
149 d += octal_size;
150 break;
151 }
152 *d++ = static_cast<char>(ch);
153 break;
154 }
155 case 'x':
156 case 'X': {
157 if (p >= last_byte) {
158 if (error) *error = "String cannot end with \\x";
159 return false;
160 } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
161 if (error) *error = "\\x cannot be followed by a non-hex digit";
162 return false;
163 }
164 unsigned int ch = 0;
165 const char* hex_start = p;
166 while (p < last_byte &&
167 absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
168 // Arbitrarily many hex digits
169 ch = (ch << 4) + hex_digit_to_int(*++p);
170 if (ch > 0xFF) {
171 if (error) {
172 *error = "Value of \\" +
173 std::string(hex_start,
174 static_cast<size_t>(p + 1 - hex_start)) +
175 " exceeds 0xff";
176 }
177 return false;
178 }
179 if ((ch == 0) && leave_nulls_escaped) {
180 // Copy the escape sequence for the null character
181 const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
182 *d++ = '\\';
183 memmove(d, hex_start, hex_size);
184 d += hex_size;
185 break;
186 }
187 *d++ = static_cast<char>(ch);
188 break;
189 }
190 case 'u': {
191 // \uhhhh => convert 4 hex digits to UTF-8
192 char32_t rune = 0;
193 const char* hex_start = p;
194 if (p + 4 >= end) {
195 if (error) {
196 *error = "\\u must be followed by 4 hex digits: \\" +
197 std::string(hex_start,
198 static_cast<size_t>(p + 1 - hex_start));
199 }
200 return false;
201 }
202 for (int i = 0; i < 4; ++i) {
203 // Look one char ahead.
204 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
205 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
206 } else {
207 if (error) {
208 *error = "\\u must be followed by 4 hex digits: \\" +
209 std::string(hex_start,
210 static_cast<size_t>(p + 1 - hex_start));
211 }
212 return false;
213 }
214 }
215 if ((rune == 0) && leave_nulls_escaped) {
216 // Copy the escape sequence for the null character
217 *d++ = '\\';
218 memmove(d, hex_start, 5); // u0000
219 d += 5;
220 break;
221 }
222 if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
223 return false;
224 }
225 d += strings_internal::EncodeUTF8Char(d, rune);
226 break;
227 }
228 case 'U': {
229 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
230 char32_t rune = 0;
231 const char* hex_start = p;
232 if (p + 8 >= end) {
233 if (error) {
234 *error = "\\U must be followed by 8 hex digits: \\" +
235 std::string(hex_start,
236 static_cast<size_t>(p + 1 - hex_start));
237 }
238 return false;
239 }
240 for (int i = 0; i < 8; ++i) {
241 // Look one char ahead.
242 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
243 // Don't change rune until we're sure this
244 // is within the Unicode limit, but do advance p.
245 uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
246 if (newrune > 0x10FFFF) {
247 if (error) {
248 *error = "Value of \\" +
249 std::string(hex_start,
250 static_cast<size_t>(p + 1 - hex_start)) +
251 " exceeds Unicode limit (0x10FFFF)";
252 }
253 return false;
254 } else {
255 rune = newrune;
256 }
257 } else {
258 if (error) {
259 *error = "\\U must be followed by 8 hex digits: \\" +
260 std::string(hex_start,
261 static_cast<size_t>(p + 1 - hex_start));
262 }
263 return false;
264 }
265 }
266 if ((rune == 0) && leave_nulls_escaped) {
267 // Copy the escape sequence for the null character
268 *d++ = '\\';
269 memmove(d, hex_start, 9); // U00000000
270 d += 9;
271 break;
272 }
273 if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
274 return false;
275 }
276 d += strings_internal::EncodeUTF8Char(d, rune);
277 break;
278 }
279 default: {
280 if (error) *error = std::string("Unknown escape sequence: \\") + *p;
281 return false;
282 }
283 }
284 p++; // read past letter we escaped
285 }
286 }
287 *dest_len = d - dest;
288 return true;
289 }
290
291 // ----------------------------------------------------------------------
292 // CUnescapeInternal()
293 //
294 // Same as above but uses a std::string for output. 'source' and 'dest'
295 // may be the same.
296 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,absl::Nonnull<std::string * > dest,absl::Nullable<std::string * > error)297 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
298 absl::Nonnull<std::string*> dest,
299 absl::Nullable<std::string*> error) {
300 strings_internal::STLStringResizeUninitialized(dest, source.size());
301
302 ptrdiff_t dest_size;
303 if (!CUnescapeInternal(source,
304 leave_nulls_escaped,
305 &(*dest)[0],
306 &dest_size,
307 error)) {
308 return false;
309 }
310 dest->erase(static_cast<size_t>(dest_size));
311 return true;
312 }
313
314 // ----------------------------------------------------------------------
315 // CEscape()
316 // CHexEscape()
317 // Utf8SafeCEscape()
318 // Utf8SafeCHexEscape()
319 // Escapes 'src' using C-style escape sequences. This is useful for
320 // preparing query flags. The 'Hex' version uses hexadecimal rather than
321 // octal sequences. The 'Utf8Safe' version does not touch UTF-8 bytes.
322 //
323 // Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
324 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)325 std::string CEscapeInternal(absl::string_view src, bool use_hex,
326 bool utf8_safe) {
327 std::string dest;
328 bool last_hex_escape = false; // true if last output char was \xNN.
329
330 for (char c : src) {
331 bool is_hex_escape = false;
332 switch (c) {
333 case '\n': dest.append("\\" "n"); break;
334 case '\r': dest.append("\\" "r"); break;
335 case '\t': dest.append("\\" "t"); break;
336 case '\"': dest.append("\\" "\""); break;
337 case '\'': dest.append("\\" "'"); break;
338 case '\\': dest.append("\\" "\\"); break;
339 default: {
340 // Note that if we emit \xNN and the src character after that is a hex
341 // digit then that digit must be escaped too to prevent it being
342 // interpreted as part of the character code by C.
343 const unsigned char uc = static_cast<unsigned char>(c);
344 if ((!utf8_safe || uc < 0x80) &&
345 (!absl::ascii_isprint(uc) ||
346 (last_hex_escape && absl::ascii_isxdigit(uc)))) {
347 if (use_hex) {
348 dest.append("\\" "x");
349 dest.push_back(numbers_internal::kHexChar[uc / 16]);
350 dest.push_back(numbers_internal::kHexChar[uc % 16]);
351 is_hex_escape = true;
352 } else {
353 dest.append("\\");
354 dest.push_back(numbers_internal::kHexChar[uc / 64]);
355 dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
356 dest.push_back(numbers_internal::kHexChar[uc % 8]);
357 }
358 } else {
359 dest.push_back(c);
360 break;
361 }
362 }
363 }
364 last_hex_escape = is_hex_escape;
365 }
366
367 return dest;
368 }
369
// kCEscapedLen[b] is the number of output characters used for byte 'b' by
// CEscapedLength() / CEscapeAndAppendInternal():
//   1 - the byte is copied unchanged,
//   2 - a backslash plus one character (\t, \n, \r, \", \', \\),
//   4 - a backslash plus three octal digits.
/* clang-format off */
constexpr unsigned char kCEscapedLen[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
/* clang-format on */
390
391 // Calculates the length of the C-style escaped version of 'src'.
392 // Assumes that non-printable characters are escaped using octal sequences, and
393 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)394 inline size_t CEscapedLength(absl::string_view src) {
395 size_t escaped_len = 0;
396 // The maximum value of kCEscapedLen[x] is 4, so we can escape any string of
397 // length size_t_max/4 without checking for overflow.
398 size_t unchecked_limit =
399 std::min<size_t>(src.size(), std::numeric_limits<size_t>::max() / 4);
400 size_t i = 0;
401 while (i < unchecked_limit) {
402 // Common case: No need to check for overflow.
403 escaped_len += kCEscapedLen[static_cast<unsigned char>(src[i++])];
404 }
405 while (i < src.size()) {
406 // Beyond unchecked_limit we need to check for overflow before adding.
407 size_t char_len = kCEscapedLen[static_cast<unsigned char>(src[i++])];
408 ABSL_INTERNAL_CHECK(
409 escaped_len <= std::numeric_limits<size_t>::max() - char_len,
410 "escaped_len overflow");
411 escaped_len += char_len;
412 }
413 return escaped_len;
414 }
415
CEscapeAndAppendInternal(absl::string_view src,absl::Nonnull<std::string * > dest)416 void CEscapeAndAppendInternal(absl::string_view src,
417 absl::Nonnull<std::string*> dest) {
418 size_t escaped_len = CEscapedLength(src);
419 if (escaped_len == src.size()) {
420 dest->append(src.data(), src.size());
421 return;
422 }
423
424 size_t cur_dest_len = dest->size();
425 ABSL_INTERNAL_CHECK(
426 cur_dest_len <= std::numeric_limits<size_t>::max() - escaped_len,
427 "std::string size overflow");
428 strings_internal::STLStringResizeUninitialized(dest,
429 cur_dest_len + escaped_len);
430 char* append_ptr = &(*dest)[cur_dest_len];
431
432 for (char c : src) {
433 size_t char_len = kCEscapedLen[static_cast<unsigned char>(c)];
434 if (char_len == 1) {
435 *append_ptr++ = c;
436 } else if (char_len == 2) {
437 switch (c) {
438 case '\n':
439 *append_ptr++ = '\\';
440 *append_ptr++ = 'n';
441 break;
442 case '\r':
443 *append_ptr++ = '\\';
444 *append_ptr++ = 'r';
445 break;
446 case '\t':
447 *append_ptr++ = '\\';
448 *append_ptr++ = 't';
449 break;
450 case '\"':
451 *append_ptr++ = '\\';
452 *append_ptr++ = '\"';
453 break;
454 case '\'':
455 *append_ptr++ = '\\';
456 *append_ptr++ = '\'';
457 break;
458 case '\\':
459 *append_ptr++ = '\\';
460 *append_ptr++ = '\\';
461 break;
462 }
463 } else {
464 *append_ptr++ = '\\';
465 *append_ptr++ = '0' + static_cast<unsigned char>(c) / 64;
466 *append_ptr++ = '0' + (static_cast<unsigned char>(c) % 64) / 8;
467 *append_ptr++ = '0' + static_cast<unsigned char>(c) % 8;
468 }
469 }
470 }
471
// Reverses the mapping in Base64EscapeInternal; see that method's
// documentation for details of the mapping.
//
// Decodes the 'szsrc' bytes starting at 'src_param'.  'unbase64' is indexed
// by unsigned byte value and yields the 6-bit value of a data character, or
// a negative number for any non-data character.  ASCII whitespace in the
// input is skipped.  The data may be followed by '=' or '.' pad characters;
// their count must be either zero or the number implied by the amount of
// leftover data (2 after two leftover characters, 1 after three).
//
// If 'dest' is non-null, decoded bytes are written to dest[0 .. szdest) and
// the function returns false if they do not fit.  If 'dest' is null, the
// input is only validated and the decoded length computed.
//
// Returns true and stores the decoded length in '*len' on success.  Returns
// false, leaving '*len' untouched, on malformed input.
bool Base64UnescapeInternal(absl::Nullable<const char*> src_param, size_t szsrc,
                            absl::Nullable<char*> dest, size_t szdest,
                            absl::Nonnull<const signed char*> unbase64,
                            absl::Nonnull<size_t*> len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced so far
  int decode = 0;         // table value of the most recently read character
  int state = 0;          // data characters consumed in the current group
  unsigned char ch = 0;   // most recently read input character
  unsigned int temp = 0;  // bit accumulator for the current group

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements.  Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
  //
  // When it stops, it records how many data characters of the current group
  // were already consumed (state = 4 - remain) and breaks out of the
  // enclosing while loop.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Validation-only copy of the loop above: same input consumption, but
    // nothing is stored and only the output length is tracked.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  // '*len' is only written when the input was fully valid.
  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
707
708 // The arrays below map base64-escaped characters back to their original values.
709 // For the inverse case, see k(WebSafe)Base64Chars in the internal
710 // escaping.cc.
711 // These arrays were generated by the following inversion code:
712 // #include <sys/time.h>
713 // #include <stdlib.h>
714 // #include <string.h>
715 // main()
716 // {
717 // static const char Base64[] =
718 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
719 // char* pos;
720 // int idx, i, j;
721 // printf(" ");
722 // for (i = 0; i < 255; i += 8) {
723 // for (j = i; j < i + 8; j++) {
724 // pos = strchr(Base64, j);
725 // if ((pos == nullptr) || (j == 0))
726 // idx = -1;
727 // else
728 // idx = pos - Base64;
729 // if (idx == -1)
730 // printf(" %2d, ", idx);
731 // else
732 // printf(" %2d/*%c*/,", idx, j);
733 // }
734 // printf("\n ");
735 // }
736 // }
737 //
738 // where the value of "Base64[]" was replaced by one of k(WebSafe)Base64Chars
739 // in the internal escaping.cc.
740 /* clang-format off */
// Decoding table for the standard base64 alphabet ('+' and '/' as the last
// two symbols).  Indexed by byte value (eight entries per row); holds the
// 6-bit value of a data character or -1 for any other byte.
constexpr signed char kUnBase64[] = {
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       62/*+*/,  -1,       -1,       -1,       63/*/ */,
    52/*0*/,  53/*1*/,  54/*2*/,  55/*3*/,  56/*4*/,  57/*5*/,  58/*6*/,  59/*7*/,
    60/*8*/,  61/*9*/,  -1,       -1,       -1,       -1,       -1,       -1,
    -1,        0/*A*/,   1/*B*/,   2/*C*/,   3/*D*/,   4/*E*/,   5/*F*/,   6/*G*/,
    07/*H*/,   8/*I*/,   9/*J*/,  10/*K*/,  11/*L*/,  12/*M*/,  13/*N*/,  14/*O*/,
    15/*P*/,  16/*Q*/,  17/*R*/,  18/*S*/,  19/*T*/,  20/*U*/,  21/*V*/,  22/*W*/,
    23/*X*/,  24/*Y*/,  25/*Z*/,  -1,       -1,       -1,       -1,       -1,
    -1,       26/*a*/,  27/*b*/,  28/*c*/,  29/*d*/,  30/*e*/,  31/*f*/,  32/*g*/,
    33/*h*/,  34/*i*/,  35/*j*/,  36/*k*/,  37/*l*/,  38/*m*/,  39/*n*/,  40/*o*/,
    41/*p*/,  42/*q*/,  43/*r*/,  44/*s*/,  45/*t*/,  46/*u*/,  47/*v*/,  48/*w*/,
    49/*x*/,  50/*y*/,  51/*z*/,  -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1
};
775
// Decoding table for the web-safe base64 alphabet ('-' and '_' as the last
// two symbols).  Indexed by byte value (eight entries per row); holds the
// 6-bit value of a data character or -1 for any other byte.
constexpr signed char kUnWebSafeBase64[] = {
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       62/*-*/,  -1,       -1,
    52/*0*/,  53/*1*/,  54/*2*/,  55/*3*/,  56/*4*/,  57/*5*/,  58/*6*/,  59/*7*/,
    60/*8*/,  61/*9*/,  -1,       -1,       -1,       -1,       -1,       -1,
    -1,        0/*A*/,   1/*B*/,   2/*C*/,   3/*D*/,   4/*E*/,   5/*F*/,   6/*G*/,
    07/*H*/,   8/*I*/,   9/*J*/,  10/*K*/,  11/*L*/,  12/*M*/,  13/*N*/,  14/*O*/,
    15/*P*/,  16/*Q*/,  17/*R*/,  18/*S*/,  19/*T*/,  20/*U*/,  21/*V*/,  22/*W*/,
    23/*X*/,  24/*Y*/,  25/*Z*/,  -1,       -1,       -1,       -1,       63/*_*/,
    -1,       26/*a*/,  27/*b*/,  28/*c*/,  29/*d*/,  30/*e*/,  31/*f*/,  32/*g*/,
    33/*h*/,  34/*i*/,  35/*j*/,  36/*k*/,  37/*l*/,  38/*m*/,  39/*n*/,  40/*o*/,
    41/*p*/,  42/*q*/,  43/*r*/,  44/*s*/,  45/*t*/,  46/*u*/,  47/*v*/,  48/*w*/,
    49/*x*/,  50/*y*/,  51/*z*/,  -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1
};
810 /* clang-format on */
811
812 template <typename String>
Base64UnescapeInternal(absl::Nullable<const char * > src,size_t slen,absl::Nonnull<String * > dest,absl::Nonnull<const signed char * > unbase64)813 bool Base64UnescapeInternal(absl::Nullable<const char*> src, size_t slen,
814 absl::Nonnull<String*> dest,
815 absl::Nonnull<const signed char*> unbase64) {
816 // Determine the size of the output string. Base64 encodes every 3 bytes into
817 // 4 characters. Any leftover chars are added directly for good measure.
818 const size_t dest_len = 3 * (slen / 4) + (slen % 4);
819
820 strings_internal::STLStringResizeUninitialized(dest, dest_len);
821
822 // We are getting the destination buffer by getting the beginning of the
823 // string and converting it into a char *.
824 size_t len;
825 const bool ok =
826 Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
827 if (!ok) {
828 dest->clear();
829 return false;
830 }
831
832 // could be shorter if there was padding
833 assert(len <= dest_len);
834 dest->erase(len);
835
836 return true;
837 }
838
839 /* clang-format off */
// Maps a byte to the numeric value of the hex digit it represents
// ('0'-'9', 'A'-'F', 'a'-'f').  Lenient: every non-hex byte maps to 0, so
// invalid input is indistinguishable from the digit '0'.
constexpr char kHexValueLenient[256] = {
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  1,  2,  3,  4,  5,  6,  7, 8, 9, 0, 0, 0, 0, 0, 0,  // '0'..'9'
    0, 10, 11, 12, 13, 14, 15,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'A'..'F'
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 10, 11, 12, 13, 14, 15,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'a'..'f'
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
};
858
// Maps a byte to the numeric value of the hex digit it represents
// ('0'-'9', 'A'-'F', 'a'-'f').  Strict: every non-hex byte maps to -1 so
// that callers can detect invalid input.
constexpr signed char kHexValueStrict[256] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,  // '0'..'9'
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 'A'..'F'
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 'a'..'f'
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
877 /* clang-format on */
878
879 // This is a templated function so that T can be either a char*
880 // or a string. This works because we use the [] operator to access
881 // individual characters at a time.
882 template <typename T>
HexStringToBytesInternal(absl::Nullable<const char * > from,T to,size_t num)883 void HexStringToBytesInternal(absl::Nullable<const char*> from, T to,
884 size_t num) {
885 for (size_t i = 0; i < num; i++) {
886 to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
887 (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
888 }
889 }
890
891 // This is a templated function so that T can be either a char* or a
892 // std::string.
893 template <typename T>
BytesToHexStringInternal(absl::Nullable<const unsigned char * > src,T dest,size_t num)894 void BytesToHexStringInternal(absl::Nullable<const unsigned char*> src, T dest,
895 size_t num) {
896 auto dest_ptr = &dest[0];
897 for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
898 const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
899 std::copy(hex_p, hex_p + 2, dest_ptr);
900 }
901 }
902
903 } // namespace
904
905 // ----------------------------------------------------------------------
906 // CUnescape()
907 //
908 // See CUnescapeInternal() for implementation details.
909 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,absl::Nonnull<std::string * > dest,absl::Nullable<std::string * > error)910 bool CUnescape(absl::string_view source, absl::Nonnull<std::string*> dest,
911 absl::Nullable<std::string*> error) {
912 return CUnescapeInternal(source, kUnescapeNulls, dest, error);
913 }
914
CEscape(absl::string_view src)915 std::string CEscape(absl::string_view src) {
916 std::string dest;
917 CEscapeAndAppendInternal(src, &dest);
918 return dest;
919 }
920
CHexEscape(absl::string_view src)921 std::string CHexEscape(absl::string_view src) {
922 return CEscapeInternal(src, true, false);
923 }
924
Utf8SafeCEscape(absl::string_view src)925 std::string Utf8SafeCEscape(absl::string_view src) {
926 return CEscapeInternal(src, false, true);
927 }
928
Utf8SafeCHexEscape(absl::string_view src)929 std::string Utf8SafeCHexEscape(absl::string_view src) {
930 return CEscapeInternal(src, true, true);
931 }
932
Base64Unescape(absl::string_view src,absl::Nonnull<std::string * > dest)933 bool Base64Unescape(absl::string_view src, absl::Nonnull<std::string*> dest) {
934 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
935 }
936
WebSafeBase64Unescape(absl::string_view src,absl::Nonnull<std::string * > dest)937 bool WebSafeBase64Unescape(absl::string_view src,
938 absl::Nonnull<std::string*> dest) {
939 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
940 }
941
Base64Escape(absl::string_view src,absl::Nonnull<std::string * > dest)942 void Base64Escape(absl::string_view src, absl::Nonnull<std::string*> dest) {
943 strings_internal::Base64EscapeInternal(
944 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
945 true, strings_internal::kBase64Chars);
946 }
947
WebSafeBase64Escape(absl::string_view src,absl::Nonnull<std::string * > dest)948 void WebSafeBase64Escape(absl::string_view src,
949 absl::Nonnull<std::string*> dest) {
950 strings_internal::Base64EscapeInternal(
951 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
952 false, strings_internal::kWebSafeBase64Chars);
953 }
954
Base64Escape(absl::string_view src)955 std::string Base64Escape(absl::string_view src) {
956 std::string dest;
957 strings_internal::Base64EscapeInternal(
958 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
959 true, strings_internal::kBase64Chars);
960 return dest;
961 }
962
WebSafeBase64Escape(absl::string_view src)963 std::string WebSafeBase64Escape(absl::string_view src) {
964 std::string dest;
965 strings_internal::Base64EscapeInternal(
966 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
967 false, strings_internal::kWebSafeBase64Chars);
968 return dest;
969 }
970
HexStringToBytes(absl::string_view hex,absl::Nonnull<std::string * > bytes)971 bool HexStringToBytes(absl::string_view hex,
972 absl::Nonnull<std::string*> bytes) {
973 std::string output;
974
975 size_t num_bytes = hex.size() / 2;
976 if (hex.size() != num_bytes * 2) {
977 return false;
978 }
979
980 absl::strings_internal::STLStringResizeUninitialized(&output, num_bytes);
981 auto hex_p = hex.cbegin();
982 for (std::string::iterator bin_p = output.begin(); bin_p != output.end();
983 ++bin_p) {
984 int h1 = absl::kHexValueStrict[static_cast<size_t>(*hex_p++)];
985 int h2 = absl::kHexValueStrict[static_cast<size_t>(*hex_p++)];
986 if (h1 == -1 || h2 == -1) {
987 output.resize(static_cast<size_t>(bin_p - output.begin()));
988 return false;
989 }
990 *bin_p = static_cast<char>((h1 << 4) + h2);
991 }
992
993 *bytes = std::move(output);
994 return true;
995 }
996
HexStringToBytes(absl::string_view from)997 std::string HexStringToBytes(absl::string_view from) {
998 std::string result;
999 const auto num = from.size() / 2;
1000 strings_internal::STLStringResizeUninitialized(&result, num);
1001 absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
1002 return result;
1003 }
1004
BytesToHexString(absl::string_view from)1005 std::string BytesToHexString(absl::string_view from) {
1006 std::string result;
1007 strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
1008 absl::BytesToHexStringInternal<std::string&>(
1009 reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
1010 return result;
1011 }
1012
1013 ABSL_NAMESPACE_END
1014 } // namespace absl
1015