1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/strings/escaping.h"
16
17 #include <algorithm>
18 #include <array>
19 #include <cassert>
20 #include <cstddef>
21 #include <cstdint>
22 #include <cstring>
23 #include <limits>
24 #include <string>
25 #include <utility>
26
27 #include "absl/base/config.h"
28 #include "absl/base/internal/endian.h"
29 #include "absl/base/internal/raw_logging.h"
30 #include "absl/base/internal/unaligned_access.h"
31 #include "absl/base/nullability.h"
32 #include "absl/strings/ascii.h"
33 #include "absl/strings/charset.h"
34 #include "absl/strings/internal/escaping.h"
35 #include "absl/strings/internal/resize_uninitialized.h"
36 #include "absl/strings/internal/utf8.h"
37 #include "absl/strings/numbers.h"
38 #include "absl/strings/str_cat.h"
39 #include "absl/strings/string_view.h"
40
41 namespace absl {
42 ABSL_NAMESPACE_BEGIN
43 namespace {
44
// Value passed as the leave_nulls_escaped argument of CUnescapeInternal().
// With false, an escape sequence whose value is zero is decoded to an actual
// NUL byte instead of being copied to the output still escaped.
constexpr bool kUnescapeNulls = false;
47
// Returns true iff `c` is one of the octal digit characters '0' through '7'.
inline bool is_octal_digit(char c) {
  if (c < '0') return false;
  return c <= '7';
}
49
hex_digit_to_int(char c)50 inline unsigned int hex_digit_to_int(char c) {
51 static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
52 "Character set must be ASCII.");
53 assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
54 unsigned int x = static_cast<unsigned char>(c);
55 if (x > '9') {
56 x += 9;
57 }
58 return x & 0xf;
59 }
60
IsSurrogate(char32_t c,absl::string_view src,absl::Nullable<std::string * > error)61 inline bool IsSurrogate(char32_t c, absl::string_view src,
62 absl::Nullable<std::string*> error) {
63 if (c >= 0xD800 && c <= 0xDFFF) {
64 if (error) {
65 *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
66 src);
67 }
68 return true;
69 }
70 return false;
71 }
72
73 // ----------------------------------------------------------------------
74 // CUnescapeInternal()
75 // Implements both CUnescape() and CUnescapeForNullTerminatedString().
76 //
77 // Unescapes C escape sequences and is the reverse of CEscape().
78 //
79 // If 'source' is valid, stores the unescaped string and its size in
80 // 'dest' and 'dest_len' respectively, and returns true. Otherwise
81 // returns false and optionally stores the error description in
82 // 'error'. Set 'error' to nullptr to disable error reporting.
83 //
84 // 'dest' should point to a buffer that is at least as big as 'source'.
85 // 'source' and 'dest' may be the same.
86 //
87 // NOTE: any changes to this function must also be reflected in the older
88 // UnescapeCEscapeSequences().
89 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,absl::Nonnull<char * > dest,absl::Nonnull<ptrdiff_t * > dest_len,absl::Nullable<std::string * > error)90 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
91 absl::Nonnull<char*> dest,
92 absl::Nonnull<ptrdiff_t*> dest_len,
93 absl::Nullable<std::string*> error) {
94 char* d = dest;
95 const char* p = source.data();
96 const char* end = p + source.size();
97 const char* last_byte = end - 1;
98
99 // Small optimization for case where source = dest and there's no escaping
100 while (p == d && p < end && *p != '\\') p++, d++;
101
102 while (p < end) {
103 if (*p != '\\') {
104 *d++ = *p++;
105 } else {
106 if (++p > last_byte) { // skip past the '\\'
107 if (error) *error = "String cannot end with \\";
108 return false;
109 }
110 switch (*p) {
111 case 'a': *d++ = '\a'; break;
112 case 'b': *d++ = '\b'; break;
113 case 'f': *d++ = '\f'; break;
114 case 'n': *d++ = '\n'; break;
115 case 'r': *d++ = '\r'; break;
116 case 't': *d++ = '\t'; break;
117 case 'v': *d++ = '\v'; break;
118 case '\\': *d++ = '\\'; break;
119 case '?': *d++ = '\?'; break; // \? Who knew?
120 case '\'': *d++ = '\''; break;
121 case '"': *d++ = '\"'; break;
122 case '0':
123 case '1':
124 case '2':
125 case '3':
126 case '4':
127 case '5':
128 case '6':
129 case '7': {
130 // octal digit: 1 to 3 digits
131 const char* octal_start = p;
132 unsigned int ch = static_cast<unsigned int>(*p - '0'); // digit 1
133 if (p < last_byte && is_octal_digit(p[1]))
134 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 2
135 if (p < last_byte && is_octal_digit(p[1]))
136 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 3
137 if (ch > 0xff) {
138 if (error) {
139 *error = "Value of \\" +
140 std::string(octal_start,
141 static_cast<size_t>(p + 1 - octal_start)) +
142 " exceeds 0xff";
143 }
144 return false;
145 }
146 if ((ch == 0) && leave_nulls_escaped) {
147 // Copy the escape sequence for the null character
148 const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
149 *d++ = '\\';
150 memmove(d, octal_start, octal_size);
151 d += octal_size;
152 break;
153 }
154 *d++ = static_cast<char>(ch);
155 break;
156 }
157 case 'x':
158 case 'X': {
159 if (p >= last_byte) {
160 if (error) *error = "String cannot end with \\x";
161 return false;
162 } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
163 if (error) *error = "\\x cannot be followed by a non-hex digit";
164 return false;
165 }
166 unsigned int ch = 0;
167 const char* hex_start = p;
168 while (p < last_byte &&
169 absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
170 // Arbitrarily many hex digits
171 ch = (ch << 4) + hex_digit_to_int(*++p);
172 if (ch > 0xFF) {
173 if (error) {
174 *error = "Value of \\" +
175 std::string(hex_start,
176 static_cast<size_t>(p + 1 - hex_start)) +
177 " exceeds 0xff";
178 }
179 return false;
180 }
181 if ((ch == 0) && leave_nulls_escaped) {
182 // Copy the escape sequence for the null character
183 const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
184 *d++ = '\\';
185 memmove(d, hex_start, hex_size);
186 d += hex_size;
187 break;
188 }
189 *d++ = static_cast<char>(ch);
190 break;
191 }
192 case 'u': {
193 // \uhhhh => convert 4 hex digits to UTF-8
194 char32_t rune = 0;
195 const char* hex_start = p;
196 if (p + 4 >= end) {
197 if (error) {
198 *error = "\\u must be followed by 4 hex digits: \\" +
199 std::string(hex_start,
200 static_cast<size_t>(p + 1 - hex_start));
201 }
202 return false;
203 }
204 for (int i = 0; i < 4; ++i) {
205 // Look one char ahead.
206 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
207 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
208 } else {
209 if (error) {
210 *error = "\\u must be followed by 4 hex digits: \\" +
211 std::string(hex_start,
212 static_cast<size_t>(p + 1 - hex_start));
213 }
214 return false;
215 }
216 }
217 if ((rune == 0) && leave_nulls_escaped) {
218 // Copy the escape sequence for the null character
219 *d++ = '\\';
220 memmove(d, hex_start, 5); // u0000
221 d += 5;
222 break;
223 }
224 if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
225 return false;
226 }
227 d += strings_internal::EncodeUTF8Char(d, rune);
228 break;
229 }
230 case 'U': {
231 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
232 char32_t rune = 0;
233 const char* hex_start = p;
234 if (p + 8 >= end) {
235 if (error) {
236 *error = "\\U must be followed by 8 hex digits: \\" +
237 std::string(hex_start,
238 static_cast<size_t>(p + 1 - hex_start));
239 }
240 return false;
241 }
242 for (int i = 0; i < 8; ++i) {
243 // Look one char ahead.
244 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
245 // Don't change rune until we're sure this
246 // is within the Unicode limit, but do advance p.
247 uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
248 if (newrune > 0x10FFFF) {
249 if (error) {
250 *error = "Value of \\" +
251 std::string(hex_start,
252 static_cast<size_t>(p + 1 - hex_start)) +
253 " exceeds Unicode limit (0x10FFFF)";
254 }
255 return false;
256 } else {
257 rune = newrune;
258 }
259 } else {
260 if (error) {
261 *error = "\\U must be followed by 8 hex digits: \\" +
262 std::string(hex_start,
263 static_cast<size_t>(p + 1 - hex_start));
264 }
265 return false;
266 }
267 }
268 if ((rune == 0) && leave_nulls_escaped) {
269 // Copy the escape sequence for the null character
270 *d++ = '\\';
271 memmove(d, hex_start, 9); // U00000000
272 d += 9;
273 break;
274 }
275 if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
276 return false;
277 }
278 d += strings_internal::EncodeUTF8Char(d, rune);
279 break;
280 }
281 default: {
282 if (error) *error = std::string("Unknown escape sequence: \\") + *p;
283 return false;
284 }
285 }
286 p++; // read past letter we escaped
287 }
288 }
289 *dest_len = d - dest;
290 return true;
291 }
292
293 // ----------------------------------------------------------------------
294 // CUnescapeInternal()
295 //
296 // Same as above but uses a std::string for output. 'source' and 'dest'
297 // may be the same.
298 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,absl::Nonnull<std::string * > dest,absl::Nullable<std::string * > error)299 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
300 absl::Nonnull<std::string*> dest,
301 absl::Nullable<std::string*> error) {
302 strings_internal::STLStringResizeUninitialized(dest, source.size());
303
304 ptrdiff_t dest_size;
305 if (!CUnescapeInternal(source,
306 leave_nulls_escaped,
307 &(*dest)[0],
308 &dest_size,
309 error)) {
310 return false;
311 }
312 dest->erase(static_cast<size_t>(dest_size));
313 return true;
314 }
315
316 // ----------------------------------------------------------------------
317 // CEscape()
318 // CHexEscape()
319 // Utf8SafeCEscape()
320 // Utf8SafeCHexEscape()
321 // Escapes 'src' using C-style escape sequences. This is useful for
322 // preparing query flags. The 'Hex' version uses hexadecimal rather than
323 // octal sequences. The 'Utf8Safe' version does not touch UTF-8 bytes.
324 //
325 // Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
326 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)327 std::string CEscapeInternal(absl::string_view src, bool use_hex,
328 bool utf8_safe) {
329 std::string dest;
330 bool last_hex_escape = false; // true if last output char was \xNN.
331
332 for (char c : src) {
333 bool is_hex_escape = false;
334 switch (c) {
335 case '\n': dest.append("\\" "n"); break;
336 case '\r': dest.append("\\" "r"); break;
337 case '\t': dest.append("\\" "t"); break;
338 case '\"': dest.append("\\" "\""); break;
339 case '\'': dest.append("\\" "'"); break;
340 case '\\': dest.append("\\" "\\"); break;
341 default: {
342 // Note that if we emit \xNN and the src character after that is a hex
343 // digit then that digit must be escaped too to prevent it being
344 // interpreted as part of the character code by C.
345 const unsigned char uc = static_cast<unsigned char>(c);
346 if ((!utf8_safe || uc < 0x80) &&
347 (!absl::ascii_isprint(uc) ||
348 (last_hex_escape && absl::ascii_isxdigit(uc)))) {
349 if (use_hex) {
350 dest.append("\\" "x");
351 dest.push_back(numbers_internal::kHexChar[uc / 16]);
352 dest.push_back(numbers_internal::kHexChar[uc % 16]);
353 is_hex_escape = true;
354 } else {
355 dest.append("\\");
356 dest.push_back(numbers_internal::kHexChar[uc / 64]);
357 dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
358 dest.push_back(numbers_internal::kHexChar[uc % 8]);
359 }
360 } else {
361 dest.push_back(c);
362 break;
363 }
364 }
365 }
366 last_hex_escape = is_hex_escape;
367 }
368
369 return dest;
370 }
371
372 /* clang-format off */
// kCEscapedLen[b] is the number of output characters used for input byte b by
// the table-driven escaper below: 1 when the byte is copied unchanged, 2 when
// it becomes a backslash followed by one character, and 4 when it becomes a
// backslash followed by three octal digits.
constexpr unsigned char kCEscapedLen[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
391 /* clang-format on */
392
MakeCEscapedLittleEndianUint32(size_t c)393 constexpr uint32_t MakeCEscapedLittleEndianUint32(size_t c) {
394 size_t char_len = kCEscapedLen[c];
395 if (char_len == 1) {
396 return static_cast<uint32_t>(c);
397 }
398 if (char_len == 2) {
399 switch (c) {
400 case '\n':
401 return '\\' | (static_cast<uint32_t>('n') << 8);
402 case '\r':
403 return '\\' | (static_cast<uint32_t>('r') << 8);
404 case '\t':
405 return '\\' | (static_cast<uint32_t>('t') << 8);
406 case '\"':
407 return '\\' | (static_cast<uint32_t>('\"') << 8);
408 case '\'':
409 return '\\' | (static_cast<uint32_t>('\'') << 8);
410 case '\\':
411 return '\\' | (static_cast<uint32_t>('\\') << 8);
412 }
413 }
414 return static_cast<uint32_t>('\\' | (('0' + (c / 64)) << 8) |
415 (('0' + ((c % 64) / 8)) << 16) |
416 (('0' + (c % 8)) << 24));
417 }
418
419 template <size_t... indexes>
420 inline constexpr std::array<uint32_t, sizeof...(indexes)>
MakeCEscapedLittleEndianUint32Array(std::index_sequence<indexes...>)421 MakeCEscapedLittleEndianUint32Array(std::index_sequence<indexes...>) {
422 return {MakeCEscapedLittleEndianUint32(indexes)...};
423 }
424 constexpr std::array<uint32_t, 256> kCEscapedLittleEndianUint32Array =
425 MakeCEscapedLittleEndianUint32Array(std::make_index_sequence<256>());
426
427 // Calculates the length of the C-style escaped version of 'src'.
428 // Assumes that non-printable characters are escaped using octal sequences, and
429 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)430 inline size_t CEscapedLength(absl::string_view src) {
431 size_t escaped_len = 0;
432 // The maximum value of kCEscapedLen[x] is 4, so we can escape any string of
433 // length size_t_max/4 without checking for overflow.
434 size_t unchecked_limit =
435 std::min<size_t>(src.size(), std::numeric_limits<size_t>::max() / 4);
436 size_t i = 0;
437 while (i < unchecked_limit) {
438 // Common case: No need to check for overflow.
439 escaped_len += kCEscapedLen[static_cast<unsigned char>(src[i++])];
440 }
441 while (i < src.size()) {
442 // Beyond unchecked_limit we need to check for overflow before adding.
443 size_t char_len = kCEscapedLen[static_cast<unsigned char>(src[i++])];
444 ABSL_INTERNAL_CHECK(
445 escaped_len <= std::numeric_limits<size_t>::max() - char_len,
446 "escaped_len overflow");
447 escaped_len += char_len;
448 }
449 return escaped_len;
450 }
451
CEscapeAndAppendInternal(absl::string_view src,absl::Nonnull<std::string * > dest)452 void CEscapeAndAppendInternal(absl::string_view src,
453 absl::Nonnull<std::string*> dest) {
454 size_t escaped_len = CEscapedLength(src);
455 if (escaped_len == src.size()) {
456 dest->append(src.data(), src.size());
457 return;
458 }
459
460 // We keep 3 slop bytes so that we can call `little_endian::Store32`
461 // invariably regardless of the length of the escaped character.
462 constexpr size_t slop_bytes = 3;
463 size_t cur_dest_len = dest->size();
464 size_t new_dest_len = cur_dest_len + escaped_len + slop_bytes;
465 ABSL_INTERNAL_CHECK(new_dest_len > cur_dest_len, "std::string size overflow");
466 strings_internal::AppendUninitializedTraits<std::string>::Append(
467 dest, escaped_len + slop_bytes);
468 char* append_ptr = &(*dest)[cur_dest_len];
469
470 for (char c : src) {
471 unsigned char uc = static_cast<unsigned char>(c);
472 size_t char_len = kCEscapedLen[uc];
473 uint32_t little_endian_uint32 = kCEscapedLittleEndianUint32Array[uc];
474 little_endian::Store32(append_ptr, little_endian_uint32);
475 append_ptr += char_len;
476 }
477 dest->resize(new_dest_len - slop_bytes);
478 }
479
// Reverses the mapping in Base64EscapeInternal; see that method's
// documentation for details of the mapping.
//
// Parameters:
//   src_param, szsrc - the encoded input and its length in bytes.
//   dest, szdest     - output buffer and its capacity. 'dest' may be null, in
//                      which case the input is only validated and measured.
//   unbase64         - decoding table indexed by an input byte (as unsigned
//                      char); a negative entry marks a byte that is not part
//                      of the alphabet.
//   len              - on success receives the number of decoded bytes (also
//                      computed when 'dest' is null). Not written on failure.
//
// Returns false if the input contains an illegal character, leaves a single
// dangling 6-bit group, carries a wrong number of pad characters, or if the
// decoded data does not fit into 'szdest' bytes.
//
// ASCII whitespace is skipped anywhere in the input. Both '=' and '.' are
// accepted as pad characters, and padding may be omitted entirely.
bool Base64UnescapeInternal(absl::Nullable<const char*> src_param, size_t szsrc,
                            absl::Nullable<char*> dest, size_t szdest,
                            absl::Nonnull<const signed char*> unbase64,
                            absl::Nonnull<size_t*> len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced (or counted)
  int decode = 0;         // table value of the most recently read byte
  int state = 0;          // data characters collected in the current group
  unsigned char ch = 0;   // most recently read input byte
  unsigned int temp = 0;  // bit accumulator for the current group

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
  //
  // On a non-data character the macro records in 'state' how many data
  // characters of the current group were already read (4 - remain) and
  // 'break's out of the enclosing while loop.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Validation-only twin of the loop above: same input handling, but the
    // output bytes are merely counted.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;  // drop the 4 surplus low bits, keeping one full byte
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;  // drop the 2 surplus low bits, keeping two full bytes
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
715
716 // The arrays below map base64-escaped characters back to their original values.
717 // For the inverse case, see k(WebSafe)Base64Chars in the internal
718 // escaping.cc.
719 // These arrays were generated by the following inversion code:
720 // #include <sys/time.h>
721 // #include <stdlib.h>
722 // #include <string.h>
723 // main()
724 // {
725 // static const char Base64[] =
726 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
727 // char* pos;
728 // int idx, i, j;
729 // printf(" ");
730 // for (i = 0; i < 255; i += 8) {
731 // for (j = i; j < i + 8; j++) {
732 // pos = strchr(Base64, j);
733 // if ((pos == nullptr) || (j == 0))
734 // idx = -1;
735 // else
736 // idx = pos - Base64;
737 // if (idx == -1)
738 // printf(" %2d, ", idx);
739 // else
740 // printf(" %2d/*%c*/,", idx, j);
741 // }
742 // printf("\n ");
743 // }
744 // }
745 //
746 // where the value of "Base64[]" was replaced by one of k(WebSafe)Base64Chars
747 // in the internal escaping.cc.
748 /* clang-format off */
// Decoding table for the standard base64 alphabet ('A'-'Z', 'a'-'z', '0'-'9',
// '+', '/'). Indexed by input byte value; yields the 6-bit value of the
// character, or -1 for every byte that is not part of the alphabet.
constexpr signed char kUnBase64[] = {
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       62/*+*/,  -1,       -1,       -1,       63/*/ */,
    52/*0*/,  53/*1*/,  54/*2*/,  55/*3*/,  56/*4*/,  57/*5*/,  58/*6*/,  59/*7*/,
    60/*8*/,  61/*9*/,  -1,       -1,       -1,       -1,       -1,       -1,
    -1,        0/*A*/,   1/*B*/,   2/*C*/,   3/*D*/,   4/*E*/,   5/*F*/,   6/*G*/,
    07/*H*/,   8/*I*/,   9/*J*/,  10/*K*/,  11/*L*/,  12/*M*/,  13/*N*/,  14/*O*/,
    15/*P*/,  16/*Q*/,  17/*R*/,  18/*S*/,  19/*T*/,  20/*U*/,  21/*V*/,  22/*W*/,
    23/*X*/,  24/*Y*/,  25/*Z*/,  -1,       -1,       -1,       -1,       -1,
    -1,       26/*a*/,  27/*b*/,  28/*c*/,  29/*d*/,  30/*e*/,  31/*f*/,  32/*g*/,
    33/*h*/,  34/*i*/,  35/*j*/,  36/*k*/,  37/*l*/,  38/*m*/,  39/*n*/,  40/*o*/,
    41/*p*/,  42/*q*/,  43/*r*/,  44/*s*/,  45/*t*/,  46/*u*/,  47/*v*/,  48/*w*/,
    49/*x*/,  50/*y*/,  51/*z*/,  -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1
};
783
// Decoding table for the web-safe base64 alphabet, which differs from the
// standard one only in using '-' for value 62 and '_' for value 63. Indexed by
// input byte value; -1 marks a byte that is not part of the alphabet.
constexpr signed char kUnWebSafeBase64[] = {
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       62/*-*/,  -1,       -1,
    52/*0*/,  53/*1*/,  54/*2*/,  55/*3*/,  56/*4*/,  57/*5*/,  58/*6*/,  59/*7*/,
    60/*8*/,  61/*9*/,  -1,       -1,       -1,       -1,       -1,       -1,
    -1,        0/*A*/,   1/*B*/,   2/*C*/,   3/*D*/,   4/*E*/,   5/*F*/,   6/*G*/,
    07/*H*/,   8/*I*/,   9/*J*/,  10/*K*/,  11/*L*/,  12/*M*/,  13/*N*/,  14/*O*/,
    15/*P*/,  16/*Q*/,  17/*R*/,  18/*S*/,  19/*T*/,  20/*U*/,  21/*V*/,  22/*W*/,
    23/*X*/,  24/*Y*/,  25/*Z*/,  -1,       -1,       -1,       -1,       63/*_*/,
    -1,       26/*a*/,  27/*b*/,  28/*c*/,  29/*d*/,  30/*e*/,  31/*f*/,  32/*g*/,
    33/*h*/,  34/*i*/,  35/*j*/,  36/*k*/,  37/*l*/,  38/*m*/,  39/*n*/,  40/*o*/,
    41/*p*/,  42/*q*/,  43/*r*/,  44/*s*/,  45/*t*/,  46/*u*/,  47/*v*/,  48/*w*/,
    49/*x*/,  50/*y*/,  51/*z*/,  -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1,
    -1,       -1,       -1,       -1,       -1,       -1,       -1,       -1
};
818 /* clang-format on */
819
820 template <typename String>
Base64UnescapeInternal(absl::Nullable<const char * > src,size_t slen,absl::Nonnull<String * > dest,absl::Nonnull<const signed char * > unbase64)821 bool Base64UnescapeInternal(absl::Nullable<const char*> src, size_t slen,
822 absl::Nonnull<String*> dest,
823 absl::Nonnull<const signed char*> unbase64) {
824 // Determine the size of the output string. Base64 encodes every 3 bytes into
825 // 4 characters. Any leftover chars are added directly for good measure.
826 const size_t dest_len = 3 * (slen / 4) + (slen % 4);
827
828 strings_internal::STLStringResizeUninitialized(dest, dest_len);
829
830 // We are getting the destination buffer by getting the beginning of the
831 // string and converting it into a char *.
832 size_t len;
833 const bool ok =
834 Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
835 if (!ok) {
836 dest->clear();
837 return false;
838 }
839
840 // could be shorter if there was padding
841 assert(len <= dest_len);
842 dest->erase(len);
843
844 return true;
845 }
846
847 /* clang-format off */
// Maps a byte value to the numeric value of the hex digit it represents
// ('0'-'9', 'A'-'F', 'a'-'f'). Lenient: every other byte maps to 0, so a
// non-hex character is indistinguishable from '0'.
constexpr char kHexValueLenient[256] = {
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0,  // '0'..'9'
    0,  10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  // 'A'..'F'
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  // 'a'..'f'
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
};
866
// Maps a byte value to the numeric value of the hex digit it represents
// ('0'-'9', 'A'-'F', 'a'-'f'). Strict: every byte that is not a hex digit
// maps to -1 so that callers can detect invalid input.
constexpr signed char kHexValueStrict[256] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,  // '0'..'9'
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 'A'..'F'
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 'a'..'f'
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
885 /* clang-format on */
886
887 // This is a templated function so that T can be either a char*
888 // or a string. This works because we use the [] operator to access
889 // individual characters at a time.
890 template <typename T>
HexStringToBytesInternal(absl::Nullable<const char * > from,T to,size_t num)891 void HexStringToBytesInternal(absl::Nullable<const char*> from, T to,
892 size_t num) {
893 for (size_t i = 0; i < num; i++) {
894 to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
895 (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
896 }
897 }
898
899 // This is a templated function so that T can be either a char* or a
900 // std::string.
901 template <typename T>
BytesToHexStringInternal(absl::Nullable<const unsigned char * > src,T dest,size_t num)902 void BytesToHexStringInternal(absl::Nullable<const unsigned char*> src, T dest,
903 size_t num) {
904 auto dest_ptr = &dest[0];
905 for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
906 const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
907 std::copy(hex_p, hex_p + 2, dest_ptr);
908 }
909 }
910
911 } // namespace
912
913 // ----------------------------------------------------------------------
914 // CUnescape()
915 //
916 // See CUnescapeInternal() for implementation details.
917 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,absl::Nonnull<std::string * > dest,absl::Nullable<std::string * > error)918 bool CUnescape(absl::string_view source, absl::Nonnull<std::string*> dest,
919 absl::Nullable<std::string*> error) {
920 return CUnescapeInternal(source, kUnescapeNulls, dest, error);
921 }
922
CEscape(absl::string_view src)923 std::string CEscape(absl::string_view src) {
924 std::string dest;
925 CEscapeAndAppendInternal(src, &dest);
926 return dest;
927 }
928
CHexEscape(absl::string_view src)929 std::string CHexEscape(absl::string_view src) {
930 return CEscapeInternal(src, true, false);
931 }
932
Utf8SafeCEscape(absl::string_view src)933 std::string Utf8SafeCEscape(absl::string_view src) {
934 return CEscapeInternal(src, false, true);
935 }
936
Utf8SafeCHexEscape(absl::string_view src)937 std::string Utf8SafeCHexEscape(absl::string_view src) {
938 return CEscapeInternal(src, true, true);
939 }
940
Base64Unescape(absl::string_view src,absl::Nonnull<std::string * > dest)941 bool Base64Unescape(absl::string_view src, absl::Nonnull<std::string*> dest) {
942 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
943 }
944
WebSafeBase64Unescape(absl::string_view src,absl::Nonnull<std::string * > dest)945 bool WebSafeBase64Unescape(absl::string_view src,
946 absl::Nonnull<std::string*> dest) {
947 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
948 }
949
Base64Escape(absl::string_view src,absl::Nonnull<std::string * > dest)950 void Base64Escape(absl::string_view src, absl::Nonnull<std::string*> dest) {
951 strings_internal::Base64EscapeInternal(
952 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
953 true, strings_internal::kBase64Chars);
954 }
955
WebSafeBase64Escape(absl::string_view src,absl::Nonnull<std::string * > dest)956 void WebSafeBase64Escape(absl::string_view src,
957 absl::Nonnull<std::string*> dest) {
958 strings_internal::Base64EscapeInternal(
959 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
960 false, strings_internal::kWebSafeBase64Chars);
961 }
962
Base64Escape(absl::string_view src)963 std::string Base64Escape(absl::string_view src) {
964 std::string dest;
965 strings_internal::Base64EscapeInternal(
966 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
967 true, strings_internal::kBase64Chars);
968 return dest;
969 }
970
WebSafeBase64Escape(absl::string_view src)971 std::string WebSafeBase64Escape(absl::string_view src) {
972 std::string dest;
973 strings_internal::Base64EscapeInternal(
974 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
975 false, strings_internal::kWebSafeBase64Chars);
976 return dest;
977 }
978
HexStringToBytes(absl::string_view hex,absl::Nonnull<std::string * > bytes)979 bool HexStringToBytes(absl::string_view hex,
980 absl::Nonnull<std::string*> bytes) {
981 std::string output;
982
983 size_t num_bytes = hex.size() / 2;
984 if (hex.size() != num_bytes * 2) {
985 return false;
986 }
987
988 absl::strings_internal::STLStringResizeUninitialized(&output, num_bytes);
989 auto hex_p = hex.cbegin();
990 for (std::string::iterator bin_p = output.begin(); bin_p != output.end();
991 ++bin_p) {
992 int h1 = absl::kHexValueStrict[static_cast<size_t>(*hex_p++)];
993 int h2 = absl::kHexValueStrict[static_cast<size_t>(*hex_p++)];
994 if (h1 == -1 || h2 == -1) {
995 output.resize(static_cast<size_t>(bin_p - output.begin()));
996 return false;
997 }
998 *bin_p = static_cast<char>((h1 << 4) + h2);
999 }
1000
1001 *bytes = std::move(output);
1002 return true;
1003 }
1004
HexStringToBytes(absl::string_view from)1005 std::string HexStringToBytes(absl::string_view from) {
1006 std::string result;
1007 const auto num = from.size() / 2;
1008 strings_internal::STLStringResizeUninitialized(&result, num);
1009 absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
1010 return result;
1011 }
1012
BytesToHexString(absl::string_view from)1013 std::string BytesToHexString(absl::string_view from) {
1014 std::string result;
1015 strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
1016 absl::BytesToHexStringInternal<std::string&>(
1017 reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
1018 return result;
1019 }
1020
1021 ABSL_NAMESPACE_END
1022 } // namespace absl
1023