1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/strings/escaping.h"
16
17 #include <algorithm>
18 #include <cassert>
19 #include <cstdint>
20 #include <cstring>
21 #include <iterator>
22 #include <limits>
23 #include <string>
24
25 #include "absl/base/internal/endian.h"
26 #include "absl/base/internal/raw_logging.h"
27 #include "absl/base/internal/unaligned_access.h"
28 #include "absl/strings/internal/char_map.h"
29 #include "absl/strings/internal/escaping.h"
30 #include "absl/strings/internal/resize_uninitialized.h"
31 #include "absl/strings/internal/utf8.h"
32 #include "absl/strings/str_cat.h"
33 #include "absl/strings/str_join.h"
34 #include "absl/strings/string_view.h"
35
36 namespace absl {
37 ABSL_NAMESPACE_BEGIN
38 namespace {
39
// Value passed as the leave_nulls_escaped argument of CUnescapeInternal().
// With false, an escape sequence that decodes to the value 0 is unescaped
// like any other character instead of being copied through still escaped.
constexpr bool kUnescapeNulls = false;
42
// Returns true iff `c` is one of the characters '0' through '7'.
inline bool is_octal_digit(char c) { return c >= '0' && c <= '7'; }
44
hex_digit_to_int(char c)45 inline unsigned int hex_digit_to_int(char c) {
46 static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
47 "Character set must be ASCII.");
48 assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
49 unsigned int x = static_cast<unsigned char>(c);
50 if (x > '9') {
51 x += 9;
52 }
53 return x & 0xf;
54 }
55
IsSurrogate(char32_t c,absl::string_view src,std::string * error)56 inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
57 if (c >= 0xD800 && c <= 0xDFFF) {
58 if (error) {
59 *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
60 src);
61 }
62 return true;
63 }
64 return false;
65 }
66
67 // ----------------------------------------------------------------------
68 // CUnescapeInternal()
69 // Implements both CUnescape() and CUnescapeForNullTerminatedString().
70 //
71 // Unescapes C escape sequences and is the reverse of CEscape().
72 //
73 // If 'source' is valid, stores the unescaped string and its size in
74 // 'dest' and 'dest_len' respectively, and returns true. Otherwise
75 // returns false and optionally stores the error description in
76 // 'error'. Set 'error' to nullptr to disable error reporting.
77 //
78 // 'dest' should point to a buffer that is at least as big as 'source'.
79 // 'source' and 'dest' may be the same.
80 //
81 // NOTE: any changes to this function must also be reflected in the older
82 // UnescapeCEscapeSequences().
83 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,char * dest,ptrdiff_t * dest_len,std::string * error)84 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
85 char* dest, ptrdiff_t* dest_len, std::string* error) {
86 char* d = dest;
87 const char* p = source.data();
88 const char* end = p + source.size();
89 const char* last_byte = end - 1;
90
91 // Small optimization for case where source = dest and there's no escaping
92 while (p == d && p < end && *p != '\\') p++, d++;
93
94 while (p < end) {
95 if (*p != '\\') {
96 *d++ = *p++;
97 } else {
98 if (++p > last_byte) { // skip past the '\\'
99 if (error) *error = "String cannot end with \\";
100 return false;
101 }
102 switch (*p) {
103 case 'a': *d++ = '\a'; break;
104 case 'b': *d++ = '\b'; break;
105 case 'f': *d++ = '\f'; break;
106 case 'n': *d++ = '\n'; break;
107 case 'r': *d++ = '\r'; break;
108 case 't': *d++ = '\t'; break;
109 case 'v': *d++ = '\v'; break;
110 case '\\': *d++ = '\\'; break;
111 case '?': *d++ = '\?'; break; // \? Who knew?
112 case '\'': *d++ = '\''; break;
113 case '"': *d++ = '\"'; break;
114 case '0':
115 case '1':
116 case '2':
117 case '3':
118 case '4':
119 case '5':
120 case '6':
121 case '7': {
122 // octal digit: 1 to 3 digits
123 const char* octal_start = p;
124 unsigned int ch = static_cast<unsigned int>(*p - '0'); // digit 1
125 if (p < last_byte && is_octal_digit(p[1]))
126 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 2
127 if (p < last_byte && is_octal_digit(p[1]))
128 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 3
129 if (ch > 0xff) {
130 if (error) {
131 *error = "Value of \\" +
132 std::string(octal_start,
133 static_cast<size_t>(p + 1 - octal_start)) +
134 " exceeds 0xff";
135 }
136 return false;
137 }
138 if ((ch == 0) && leave_nulls_escaped) {
139 // Copy the escape sequence for the null character
140 const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
141 *d++ = '\\';
142 memmove(d, octal_start, octal_size);
143 d += octal_size;
144 break;
145 }
146 *d++ = static_cast<char>(ch);
147 break;
148 }
149 case 'x':
150 case 'X': {
151 if (p >= last_byte) {
152 if (error) *error = "String cannot end with \\x";
153 return false;
154 } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
155 if (error) *error = "\\x cannot be followed by a non-hex digit";
156 return false;
157 }
158 unsigned int ch = 0;
159 const char* hex_start = p;
160 while (p < last_byte &&
161 absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
162 // Arbitrarily many hex digits
163 ch = (ch << 4) + hex_digit_to_int(*++p);
164 if (ch > 0xFF) {
165 if (error) {
166 *error = "Value of \\" +
167 std::string(hex_start,
168 static_cast<size_t>(p + 1 - hex_start)) +
169 " exceeds 0xff";
170 }
171 return false;
172 }
173 if ((ch == 0) && leave_nulls_escaped) {
174 // Copy the escape sequence for the null character
175 const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
176 *d++ = '\\';
177 memmove(d, hex_start, hex_size);
178 d += hex_size;
179 break;
180 }
181 *d++ = static_cast<char>(ch);
182 break;
183 }
184 case 'u': {
185 // \uhhhh => convert 4 hex digits to UTF-8
186 char32_t rune = 0;
187 const char* hex_start = p;
188 if (p + 4 >= end) {
189 if (error) {
190 *error = "\\u must be followed by 4 hex digits: \\" +
191 std::string(hex_start,
192 static_cast<size_t>(p + 1 - hex_start));
193 }
194 return false;
195 }
196 for (int i = 0; i < 4; ++i) {
197 // Look one char ahead.
198 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
199 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
200 } else {
201 if (error) {
202 *error = "\\u must be followed by 4 hex digits: \\" +
203 std::string(hex_start,
204 static_cast<size_t>(p + 1 - hex_start));
205 }
206 return false;
207 }
208 }
209 if ((rune == 0) && leave_nulls_escaped) {
210 // Copy the escape sequence for the null character
211 *d++ = '\\';
212 memmove(d, hex_start, 5); // u0000
213 d += 5;
214 break;
215 }
216 if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
217 return false;
218 }
219 d += strings_internal::EncodeUTF8Char(d, rune);
220 break;
221 }
222 case 'U': {
223 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
224 char32_t rune = 0;
225 const char* hex_start = p;
226 if (p + 8 >= end) {
227 if (error) {
228 *error = "\\U must be followed by 8 hex digits: \\" +
229 std::string(hex_start,
230 static_cast<size_t>(p + 1 - hex_start));
231 }
232 return false;
233 }
234 for (int i = 0; i < 8; ++i) {
235 // Look one char ahead.
236 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
237 // Don't change rune until we're sure this
238 // is within the Unicode limit, but do advance p.
239 uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
240 if (newrune > 0x10FFFF) {
241 if (error) {
242 *error = "Value of \\" +
243 std::string(hex_start,
244 static_cast<size_t>(p + 1 - hex_start)) +
245 " exceeds Unicode limit (0x10FFFF)";
246 }
247 return false;
248 } else {
249 rune = newrune;
250 }
251 } else {
252 if (error) {
253 *error = "\\U must be followed by 8 hex digits: \\" +
254 std::string(hex_start,
255 static_cast<size_t>(p + 1 - hex_start));
256 }
257 return false;
258 }
259 }
260 if ((rune == 0) && leave_nulls_escaped) {
261 // Copy the escape sequence for the null character
262 *d++ = '\\';
263 memmove(d, hex_start, 9); // U00000000
264 d += 9;
265 break;
266 }
267 if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
268 return false;
269 }
270 d += strings_internal::EncodeUTF8Char(d, rune);
271 break;
272 }
273 default: {
274 if (error) *error = std::string("Unknown escape sequence: \\") + *p;
275 return false;
276 }
277 }
278 p++; // read past letter we escaped
279 }
280 }
281 *dest_len = d - dest;
282 return true;
283 }
284
285 // ----------------------------------------------------------------------
286 // CUnescapeInternal()
287 //
288 // Same as above but uses a std::string for output. 'source' and 'dest'
289 // may be the same.
290 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,std::string * dest,std::string * error)291 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
292 std::string* dest, std::string* error) {
293 strings_internal::STLStringResizeUninitialized(dest, source.size());
294
295 ptrdiff_t dest_size;
296 if (!CUnescapeInternal(source,
297 leave_nulls_escaped,
298 &(*dest)[0],
299 &dest_size,
300 error)) {
301 return false;
302 }
303 dest->erase(static_cast<size_t>(dest_size));
304 return true;
305 }
306
307 // ----------------------------------------------------------------------
308 // CEscape()
309 // CHexEscape()
310 // Utf8SafeCEscape()
311 // Utf8SafeCHexEscape()
312 // Escapes 'src' using C-style escape sequences. This is useful for
313 // preparing query flags. The 'Hex' version uses hexadecimal rather than
314 // octal sequences. The 'Utf8Safe' version does not touch UTF-8 bytes.
315 //
316 // Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
317 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)318 std::string CEscapeInternal(absl::string_view src, bool use_hex,
319 bool utf8_safe) {
320 std::string dest;
321 bool last_hex_escape = false; // true if last output char was \xNN.
322
323 for (char c : src) {
324 bool is_hex_escape = false;
325 switch (c) {
326 case '\n': dest.append("\\" "n"); break;
327 case '\r': dest.append("\\" "r"); break;
328 case '\t': dest.append("\\" "t"); break;
329 case '\"': dest.append("\\" "\""); break;
330 case '\'': dest.append("\\" "'"); break;
331 case '\\': dest.append("\\" "\\"); break;
332 default: {
333 // Note that if we emit \xNN and the src character after that is a hex
334 // digit then that digit must be escaped too to prevent it being
335 // interpreted as part of the character code by C.
336 const unsigned char uc = static_cast<unsigned char>(c);
337 if ((!utf8_safe || uc < 0x80) &&
338 (!absl::ascii_isprint(uc) ||
339 (last_hex_escape && absl::ascii_isxdigit(uc)))) {
340 if (use_hex) {
341 dest.append("\\" "x");
342 dest.push_back(numbers_internal::kHexChar[uc / 16]);
343 dest.push_back(numbers_internal::kHexChar[uc % 16]);
344 is_hex_escape = true;
345 } else {
346 dest.append("\\");
347 dest.push_back(numbers_internal::kHexChar[uc / 64]);
348 dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
349 dest.push_back(numbers_internal::kHexChar[uc % 8]);
350 }
351 } else {
352 dest.push_back(c);
353 break;
354 }
355 }
356 }
357 last_hex_escape = is_hex_escape;
358 }
359
360 return dest;
361 }
362
/* clang-format off */
// c_escaped_len[b] is the number of characters that byte value b occupies in
// the octal-style C-escaped output:
//   1 - the byte is copied unchanged,
//   2 - a backslash followed by one character (\t, \n, \r, ", ', \),
//   4 - a backslash followed by three octal digits.
// Each row covers 16 consecutive byte values, starting at 0x00.
constexpr unsigned char c_escaped_len[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
/* clang-format on */
383
384 // Calculates the length of the C-style escaped version of 'src'.
385 // Assumes that non-printable characters are escaped using octal sequences, and
386 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)387 inline size_t CEscapedLength(absl::string_view src) {
388 size_t escaped_len = 0;
389 for (char c : src)
390 escaped_len += c_escaped_len[static_cast<unsigned char>(c)];
391 return escaped_len;
392 }
393
CEscapeAndAppendInternal(absl::string_view src,std::string * dest)394 void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
395 size_t escaped_len = CEscapedLength(src);
396 if (escaped_len == src.size()) {
397 dest->append(src.data(), src.size());
398 return;
399 }
400
401 size_t cur_dest_len = dest->size();
402 strings_internal::STLStringResizeUninitialized(dest,
403 cur_dest_len + escaped_len);
404 char* append_ptr = &(*dest)[cur_dest_len];
405
406 for (char c : src) {
407 size_t char_len = c_escaped_len[static_cast<unsigned char>(c)];
408 if (char_len == 1) {
409 *append_ptr++ = c;
410 } else if (char_len == 2) {
411 switch (c) {
412 case '\n':
413 *append_ptr++ = '\\';
414 *append_ptr++ = 'n';
415 break;
416 case '\r':
417 *append_ptr++ = '\\';
418 *append_ptr++ = 'r';
419 break;
420 case '\t':
421 *append_ptr++ = '\\';
422 *append_ptr++ = 't';
423 break;
424 case '\"':
425 *append_ptr++ = '\\';
426 *append_ptr++ = '\"';
427 break;
428 case '\'':
429 *append_ptr++ = '\\';
430 *append_ptr++ = '\'';
431 break;
432 case '\\':
433 *append_ptr++ = '\\';
434 *append_ptr++ = '\\';
435 break;
436 }
437 } else {
438 *append_ptr++ = '\\';
439 *append_ptr++ = '0' + static_cast<unsigned char>(c) / 64;
440 *append_ptr++ = '0' + (static_cast<unsigned char>(c) % 64) / 8;
441 *append_ptr++ = '0' + static_cast<unsigned char>(c) % 8;
442 }
443 }
444 }
445
// Decodes the 'szsrc' bytes at 'src_param' as base64, using 'unbase64' as the
// lookup table from byte value to 6-bit value (negative entries mark bytes
// that are not data characters).
//
// If 'dest' is non-null, decoded bytes are written to it; the function
// returns false if more than 'szdest' bytes would be needed.  If 'dest' is
// null, nothing is written and the input is only validated and measured.
//
// Whitespace (per absl::ascii_isspace) between data characters is skipped.
// '=' and '.' are both treated as padding.  After the data, only whitespace
// and padding may follow, and the number of padding characters must be either
// zero or exactly the number implied by the leftover data characters.
//
// On success returns true and stores the decoded length in '*len'.  On
// failure returns false and leaves '*len' unwritten.
bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
                            size_t szdest, const signed char* unbase64,
                            size_t* len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced (or counted)
  int decode = 0;         // table value of the most recently read byte
  int state = 0;          // data characters consumed in the current group
  unsigned char ch = 0;   // most recently read input byte
  unsigned int temp = 0;  // accumulated bits of the current group

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
  //
  // When it stops, it records in 'state' how many data characters of the
  // current group were already consumed (4 - remain) and breaks out of the
  // enclosing while loop.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Validation-only twin of the loop above: same input handling, but the
    // output is only counted, never stored.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
678
// The arrays below map a byte value to its 6-bit base64 value, or to -1 when
// the byte is not a character of the corresponding alphabet.  They were
// generated by the following code
// #include <sys/time.h>
// #include <stdlib.h>
// #include <string.h>
// main()
// {
//   static const char Base64[] =
//     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
//   char* pos;
//   int idx, i, j;
//   printf("    ");
//   for (i = 0; i < 255; i += 8) {
//     for (j = i; j < i + 8; j++) {
//       pos = strchr(Base64, j);
//       if ((pos == nullptr) || (j == 0))
//         idx = -1;
//       else
//         idx = pos - Base64;
//       if (idx == -1)
//         printf(" %2d,     ", idx);
//       else
//         printf(" %2d/*%c*/,", idx, j);
//     }
//     printf("\n    ");
//   }
// }
//
// where the value of "Base64[]" was replaced by one of the base-64 conversion
// tables from the functions below.
/* clang-format off */
// Decode table for the standard alphabet ('+' => 62, '/' => 63).  Each row
// covers 8 consecutive byte values, starting at 0.
constexpr signed char kUnBase64[] = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
    -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
    07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1,
    -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1
};
743
// Decode table for the web-safe alphabet: identical to the standard table
// except that '-' maps to 62 and '_' maps to 63, while '+' and '/' are not
// data characters (-1).  Each row covers 8 consecutive byte values.
constexpr signed char kUnWebSafeBase64[] = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, 62/*-*/, -1, -1,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
    -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
    07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/,
    -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1
};
/* clang-format on */
779
// The 64 characters of the web-safe base64 alphabet, indexed by 6-bit value.
// It uses '-' and '_' for the values 62 and 63.
constexpr char kWebSafeBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
782
783 template <typename String>
Base64UnescapeInternal(const char * src,size_t slen,String * dest,const signed char * unbase64)784 bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
785 const signed char* unbase64) {
786 // Determine the size of the output string. Base64 encodes every 3 bytes into
787 // 4 characters. any leftover chars are added directly for good measure.
788 // This is documented in the base64 RFC:
789 // https://datatracker.ietf.org/doc/html/rfc3548
790 const size_t dest_len = 3 * (slen / 4) + (slen % 4);
791
792 strings_internal::STLStringResizeUninitialized(dest, dest_len);
793
794 // We are getting the destination buffer by getting the beginning of the
795 // string and converting it into a char *.
796 size_t len;
797 const bool ok =
798 Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
799 if (!ok) {
800 dest->clear();
801 return false;
802 }
803
804 // could be shorter if there was padding
805 assert(len <= dest_len);
806 dest->erase(len);
807
808 return true;
809 }
810
/* clang-format off */
// kHexValueLenient[b] is the numeric value (0-15) of byte b when b is an
// ASCII hexadecimal digit.  It is "lenient" in that every other byte maps to
// 0 rather than to an error marker.  Each row covers 16 byte values.
constexpr char kHexValueLenient[256] = {
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0,  // '0'..'9'
    0, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 'A'..'F'
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 'a'..'f'
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
};

/* clang-format on */
832
833 // This is a templated function so that T can be either a char*
834 // or a string. This works because we use the [] operator to access
835 // individual characters at a time.
836 template <typename T>
HexStringToBytesInternal(const char * from,T to,size_t num)837 void HexStringToBytesInternal(const char* from, T to, size_t num) {
838 for (size_t i = 0; i < num; i++) {
839 to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
840 (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
841 }
842 }
843
844 // This is a templated function so that T can be either a char* or a
845 // std::string.
846 template <typename T>
BytesToHexStringInternal(const unsigned char * src,T dest,size_t num)847 void BytesToHexStringInternal(const unsigned char* src, T dest, size_t num) {
848 auto dest_ptr = &dest[0];
849 for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
850 const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
851 std::copy(hex_p, hex_p + 2, dest_ptr);
852 }
853 }
854
855 } // namespace
856
857 // ----------------------------------------------------------------------
858 // CUnescape()
859 //
860 // See CUnescapeInternal() for implementation details.
861 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,std::string * dest,std::string * error)862 bool CUnescape(absl::string_view source, std::string* dest,
863 std::string* error) {
864 return CUnescapeInternal(source, kUnescapeNulls, dest, error);
865 }
866
CEscape(absl::string_view src)867 std::string CEscape(absl::string_view src) {
868 std::string dest;
869 CEscapeAndAppendInternal(src, &dest);
870 return dest;
871 }
872
CHexEscape(absl::string_view src)873 std::string CHexEscape(absl::string_view src) {
874 return CEscapeInternal(src, true, false);
875 }
876
Utf8SafeCEscape(absl::string_view src)877 std::string Utf8SafeCEscape(absl::string_view src) {
878 return CEscapeInternal(src, false, true);
879 }
880
Utf8SafeCHexEscape(absl::string_view src)881 std::string Utf8SafeCHexEscape(absl::string_view src) {
882 return CEscapeInternal(src, true, true);
883 }
884
885 // ----------------------------------------------------------------------
886 // Base64Unescape() - base64 decoder
887 // Base64Escape() - base64 encoder
888 // WebSafeBase64Unescape() - Google's variation of base64 decoder
889 // WebSafeBase64Escape() - Google's variation of base64 encoder
890 //
891 // Check out
892 // https://datatracker.ietf.org/doc/html/rfc2045 for formal description, but
893 // what we care about is that...
894 // Take the encoded stuff in groups of 4 characters and turn each
895 // character into a code 0 to 63 thus:
896 // A-Z map to 0 to 25
897 // a-z map to 26 to 51
898 // 0-9 map to 52 to 61
899 // +(- for WebSafe) maps to 62
900 // /(_ for WebSafe) maps to 63
901 // There will be four numbers, all less than 64 which can be represented
902 // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
903 // Arrange the 6 digit binary numbers into three bytes as such:
904 // aaaaaabb bbbbcccc ccdddddd
905 // Equals signs (one or two) are used at the end of the encoded block to
906 // indicate that the text was not an integer multiple of three bytes long.
907 // ----------------------------------------------------------------------
908
Base64Unescape(absl::string_view src,std::string * dest)909 bool Base64Unescape(absl::string_view src, std::string* dest) {
910 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
911 }
912
WebSafeBase64Unescape(absl::string_view src,std::string * dest)913 bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
914 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
915 }
916
Base64Escape(absl::string_view src,std::string * dest)917 void Base64Escape(absl::string_view src, std::string* dest) {
918 strings_internal::Base64EscapeInternal(
919 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
920 true, strings_internal::kBase64Chars);
921 }
922
WebSafeBase64Escape(absl::string_view src,std::string * dest)923 void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
924 strings_internal::Base64EscapeInternal(
925 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
926 false, kWebSafeBase64Chars);
927 }
928
Base64Escape(absl::string_view src)929 std::string Base64Escape(absl::string_view src) {
930 std::string dest;
931 strings_internal::Base64EscapeInternal(
932 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
933 true, strings_internal::kBase64Chars);
934 return dest;
935 }
936
WebSafeBase64Escape(absl::string_view src)937 std::string WebSafeBase64Escape(absl::string_view src) {
938 std::string dest;
939 strings_internal::Base64EscapeInternal(
940 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
941 false, kWebSafeBase64Chars);
942 return dest;
943 }
944
HexStringToBytes(absl::string_view from)945 std::string HexStringToBytes(absl::string_view from) {
946 std::string result;
947 const auto num = from.size() / 2;
948 strings_internal::STLStringResizeUninitialized(&result, num);
949 absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
950 return result;
951 }
952
BytesToHexString(absl::string_view from)953 std::string BytesToHexString(absl::string_view from) {
954 std::string result;
955 strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
956 absl::BytesToHexStringInternal<std::string&>(
957 reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
958 return result;
959 }
960
961 ABSL_NAMESPACE_END
962 } // namespace absl
963