xref: /aosp_15_r20/external/abseil-cpp/absl/strings/escaping.cc (revision 9356374a3709195abf420251b3e825997ff56c0f)
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/escaping.h"
16 
17 #include <algorithm>
18 #include <cassert>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstring>
22 #include <limits>
23 #include <string>
24 #include <utility>
25 
26 #include "absl/base/config.h"
27 #include "absl/base/internal/raw_logging.h"
28 #include "absl/base/internal/unaligned_access.h"
29 #include "absl/base/nullability.h"
30 #include "absl/strings/ascii.h"
31 #include "absl/strings/charset.h"
32 #include "absl/strings/internal/escaping.h"
33 #include "absl/strings/internal/resize_uninitialized.h"
34 #include "absl/strings/internal/utf8.h"
35 #include "absl/strings/numbers.h"
36 #include "absl/strings/str_cat.h"
37 #include "absl/strings/string_view.h"
38 
39 namespace absl {
40 ABSL_NAMESPACE_BEGIN
41 namespace {
42 
// Value for the leave_nulls_escaped argument of CUnescapeInternal(): passing
// `false` makes escape sequences whose value is zero decode to a real NUL
// byte instead of being copied to the output in their escaped form.
constexpr bool kUnescapeNulls = false;
45 
// Returns true iff `c` is one of the ASCII characters '0' through '7'.
inline bool is_octal_digit(char c) {
  if (c < '0') return false;
  return c <= '7';
}
47 
hex_digit_to_int(char c)48 inline unsigned int hex_digit_to_int(char c) {
49   static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
50                 "Character set must be ASCII.");
51   assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
52   unsigned int x = static_cast<unsigned char>(c);
53   if (x > '9') {
54     x += 9;
55   }
56   return x & 0xf;
57 }
58 
IsSurrogate(char32_t c,absl::string_view src,absl::Nullable<std::string * > error)59 inline bool IsSurrogate(char32_t c, absl::string_view src,
60                         absl::Nullable<std::string*> error) {
61   if (c >= 0xD800 && c <= 0xDFFF) {
62     if (error) {
63       *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
64                             src);
65     }
66     return true;
67   }
68   return false;
69 }
70 
71 // ----------------------------------------------------------------------
72 // CUnescapeInternal()
73 //    Implements both CUnescape() and CUnescapeForNullTerminatedString().
74 //
75 //    Unescapes C escape sequences and is the reverse of CEscape().
76 //
77 //    If 'source' is valid, stores the unescaped string and its size in
78 //    'dest' and 'dest_len' respectively, and returns true. Otherwise
79 //    returns false and optionally stores the error description in
80 //    'error'. Set 'error' to nullptr to disable error reporting.
81 //
82 //    'dest' should point to a buffer that is at least as big as 'source'.
83 //    'source' and 'dest' may be the same.
84 //
85 //     NOTE: any changes to this function must also be reflected in the older
86 //     UnescapeCEscapeSequences().
87 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,absl::Nonnull<char * > dest,absl::Nonnull<ptrdiff_t * > dest_len,absl::Nullable<std::string * > error)88 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
89                        absl::Nonnull<char*> dest,
90                        absl::Nonnull<ptrdiff_t*> dest_len,
91                        absl::Nullable<std::string*> error) {
92   char* d = dest;
93   const char* p = source.data();
94   const char* end = p + source.size();
95   const char* last_byte = end - 1;
96 
97   // Small optimization for case where source = dest and there's no escaping
98   while (p == d && p < end && *p != '\\') p++, d++;
99 
100   while (p < end) {
101     if (*p != '\\') {
102       *d++ = *p++;
103     } else {
104       if (++p > last_byte) {  // skip past the '\\'
105         if (error) *error = "String cannot end with \\";
106         return false;
107       }
108       switch (*p) {
109         case 'a':  *d++ = '\a';  break;
110         case 'b':  *d++ = '\b';  break;
111         case 'f':  *d++ = '\f';  break;
112         case 'n':  *d++ = '\n';  break;
113         case 'r':  *d++ = '\r';  break;
114         case 't':  *d++ = '\t';  break;
115         case 'v':  *d++ = '\v';  break;
116         case '\\': *d++ = '\\';  break;
117         case '?':  *d++ = '\?';  break;    // \?  Who knew?
118         case '\'': *d++ = '\'';  break;
119         case '"':  *d++ = '\"';  break;
120         case '0':
121         case '1':
122         case '2':
123         case '3':
124         case '4':
125         case '5':
126         case '6':
127         case '7': {
128           // octal digit: 1 to 3 digits
129           const char* octal_start = p;
130           unsigned int ch = static_cast<unsigned int>(*p - '0');  // digit 1
131           if (p < last_byte && is_octal_digit(p[1]))
132             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 2
133           if (p < last_byte && is_octal_digit(p[1]))
134             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 3
135           if (ch > 0xff) {
136             if (error) {
137               *error = "Value of \\" +
138                        std::string(octal_start,
139                                    static_cast<size_t>(p + 1 - octal_start)) +
140                        " exceeds 0xff";
141             }
142             return false;
143           }
144           if ((ch == 0) && leave_nulls_escaped) {
145             // Copy the escape sequence for the null character
146             const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
147             *d++ = '\\';
148             memmove(d, octal_start, octal_size);
149             d += octal_size;
150             break;
151           }
152           *d++ = static_cast<char>(ch);
153           break;
154         }
155         case 'x':
156         case 'X': {
157           if (p >= last_byte) {
158             if (error) *error = "String cannot end with \\x";
159             return false;
160           } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
161             if (error) *error = "\\x cannot be followed by a non-hex digit";
162             return false;
163           }
164           unsigned int ch = 0;
165           const char* hex_start = p;
166           while (p < last_byte &&
167                  absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
168             // Arbitrarily many hex digits
169             ch = (ch << 4) + hex_digit_to_int(*++p);
170           if (ch > 0xFF) {
171             if (error) {
172               *error = "Value of \\" +
173                        std::string(hex_start,
174                                    static_cast<size_t>(p + 1 - hex_start)) +
175                        " exceeds 0xff";
176             }
177             return false;
178           }
179           if ((ch == 0) && leave_nulls_escaped) {
180             // Copy the escape sequence for the null character
181             const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
182             *d++ = '\\';
183             memmove(d, hex_start, hex_size);
184             d += hex_size;
185             break;
186           }
187           *d++ = static_cast<char>(ch);
188           break;
189         }
190         case 'u': {
191           // \uhhhh => convert 4 hex digits to UTF-8
192           char32_t rune = 0;
193           const char* hex_start = p;
194           if (p + 4 >= end) {
195             if (error) {
196               *error = "\\u must be followed by 4 hex digits: \\" +
197                        std::string(hex_start,
198                                    static_cast<size_t>(p + 1 - hex_start));
199             }
200             return false;
201           }
202           for (int i = 0; i < 4; ++i) {
203             // Look one char ahead.
204             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
205               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
206             } else {
207               if (error) {
208                 *error = "\\u must be followed by 4 hex digits: \\" +
209                          std::string(hex_start,
210                                      static_cast<size_t>(p + 1 - hex_start));
211               }
212               return false;
213             }
214           }
215           if ((rune == 0) && leave_nulls_escaped) {
216             // Copy the escape sequence for the null character
217             *d++ = '\\';
218             memmove(d, hex_start, 5);  // u0000
219             d += 5;
220             break;
221           }
222           if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
223             return false;
224           }
225           d += strings_internal::EncodeUTF8Char(d, rune);
226           break;
227         }
228         case 'U': {
229           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
230           char32_t rune = 0;
231           const char* hex_start = p;
232           if (p + 8 >= end) {
233             if (error) {
234               *error = "\\U must be followed by 8 hex digits: \\" +
235                        std::string(hex_start,
236                                    static_cast<size_t>(p + 1 - hex_start));
237             }
238             return false;
239           }
240           for (int i = 0; i < 8; ++i) {
241             // Look one char ahead.
242             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
243               // Don't change rune until we're sure this
244               // is within the Unicode limit, but do advance p.
245               uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
246               if (newrune > 0x10FFFF) {
247                 if (error) {
248                   *error = "Value of \\" +
249                            std::string(hex_start,
250                                        static_cast<size_t>(p + 1 - hex_start)) +
251                            " exceeds Unicode limit (0x10FFFF)";
252                 }
253                 return false;
254               } else {
255                 rune = newrune;
256               }
257             } else {
258               if (error) {
259                 *error = "\\U must be followed by 8 hex digits: \\" +
260                          std::string(hex_start,
261                                      static_cast<size_t>(p + 1 - hex_start));
262               }
263               return false;
264             }
265           }
266           if ((rune == 0) && leave_nulls_escaped) {
267             // Copy the escape sequence for the null character
268             *d++ = '\\';
269             memmove(d, hex_start, 9);  // U00000000
270             d += 9;
271             break;
272           }
273           if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
274             return false;
275           }
276           d += strings_internal::EncodeUTF8Char(d, rune);
277           break;
278         }
279         default: {
280           if (error) *error = std::string("Unknown escape sequence: \\") + *p;
281           return false;
282         }
283       }
284       p++;                                 // read past letter we escaped
285     }
286   }
287   *dest_len = d - dest;
288   return true;
289 }
290 
291 // ----------------------------------------------------------------------
292 // CUnescapeInternal()
293 //
294 //    Same as above but uses a std::string for output. 'source' and 'dest'
295 //    may be the same.
296 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,absl::Nonnull<std::string * > dest,absl::Nullable<std::string * > error)297 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
298                        absl::Nonnull<std::string*> dest,
299                        absl::Nullable<std::string*> error) {
300   strings_internal::STLStringResizeUninitialized(dest, source.size());
301 
302   ptrdiff_t dest_size;
303   if (!CUnescapeInternal(source,
304                          leave_nulls_escaped,
305                          &(*dest)[0],
306                          &dest_size,
307                          error)) {
308     return false;
309   }
310   dest->erase(static_cast<size_t>(dest_size));
311   return true;
312 }
313 
314 // ----------------------------------------------------------------------
315 // CEscape()
316 // CHexEscape()
317 // Utf8SafeCEscape()
318 // Utf8SafeCHexEscape()
319 //    Escapes 'src' using C-style escape sequences.  This is useful for
320 //    preparing query flags.  The 'Hex' version uses hexadecimal rather than
321 //    octal sequences.  The 'Utf8Safe' version does not touch UTF-8 bytes.
322 //
323 //    Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
324 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)325 std::string CEscapeInternal(absl::string_view src, bool use_hex,
326                             bool utf8_safe) {
327   std::string dest;
328   bool last_hex_escape = false;  // true if last output char was \xNN.
329 
330   for (char c : src) {
331     bool is_hex_escape = false;
332     switch (c) {
333       case '\n': dest.append("\\" "n"); break;
334       case '\r': dest.append("\\" "r"); break;
335       case '\t': dest.append("\\" "t"); break;
336       case '\"': dest.append("\\" "\""); break;
337       case '\'': dest.append("\\" "'"); break;
338       case '\\': dest.append("\\" "\\"); break;
339       default: {
340         // Note that if we emit \xNN and the src character after that is a hex
341         // digit then that digit must be escaped too to prevent it being
342         // interpreted as part of the character code by C.
343         const unsigned char uc = static_cast<unsigned char>(c);
344         if ((!utf8_safe || uc < 0x80) &&
345             (!absl::ascii_isprint(uc) ||
346              (last_hex_escape && absl::ascii_isxdigit(uc)))) {
347           if (use_hex) {
348             dest.append("\\" "x");
349             dest.push_back(numbers_internal::kHexChar[uc / 16]);
350             dest.push_back(numbers_internal::kHexChar[uc % 16]);
351             is_hex_escape = true;
352           } else {
353             dest.append("\\");
354             dest.push_back(numbers_internal::kHexChar[uc / 64]);
355             dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
356             dest.push_back(numbers_internal::kHexChar[uc % 8]);
357           }
358         } else {
359           dest.push_back(c);
360           break;
361         }
362       }
363     }
364     last_hex_escape = is_hex_escape;
365   }
366 
367   return dest;
368 }
369 
/* clang-format off */
// Indexed by byte value: the number of characters that byte occupies once
// C-escaped with octal sequences. 1 = copied unchanged, 2 = backslash plus one
// character, 4 = backslash plus three octal digits. The table has 256 entries,
// one per possible unsigned char value.
constexpr unsigned char kCEscapedLen[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
/* clang-format on */
390 
391 // Calculates the length of the C-style escaped version of 'src'.
392 // Assumes that non-printable characters are escaped using octal sequences, and
393 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)394 inline size_t CEscapedLength(absl::string_view src) {
395   size_t escaped_len = 0;
396   // The maximum value of kCEscapedLen[x] is 4, so we can escape any string of
397   // length size_t_max/4 without checking for overflow.
398   size_t unchecked_limit =
399       std::min<size_t>(src.size(), std::numeric_limits<size_t>::max() / 4);
400   size_t i = 0;
401   while (i < unchecked_limit) {
402     // Common case: No need to check for overflow.
403     escaped_len += kCEscapedLen[static_cast<unsigned char>(src[i++])];
404   }
405   while (i < src.size()) {
406     // Beyond unchecked_limit we need to check for overflow before adding.
407     size_t char_len = kCEscapedLen[static_cast<unsigned char>(src[i++])];
408     ABSL_INTERNAL_CHECK(
409         escaped_len <= std::numeric_limits<size_t>::max() - char_len,
410         "escaped_len overflow");
411     escaped_len += char_len;
412   }
413   return escaped_len;
414 }
415 
CEscapeAndAppendInternal(absl::string_view src,absl::Nonnull<std::string * > dest)416 void CEscapeAndAppendInternal(absl::string_view src,
417                               absl::Nonnull<std::string*> dest) {
418   size_t escaped_len = CEscapedLength(src);
419   if (escaped_len == src.size()) {
420     dest->append(src.data(), src.size());
421     return;
422   }
423 
424   size_t cur_dest_len = dest->size();
425   ABSL_INTERNAL_CHECK(
426       cur_dest_len <= std::numeric_limits<size_t>::max() - escaped_len,
427       "std::string size overflow");
428   strings_internal::STLStringResizeUninitialized(dest,
429                                                  cur_dest_len + escaped_len);
430   char* append_ptr = &(*dest)[cur_dest_len];
431 
432   for (char c : src) {
433     size_t char_len = kCEscapedLen[static_cast<unsigned char>(c)];
434     if (char_len == 1) {
435       *append_ptr++ = c;
436     } else if (char_len == 2) {
437       switch (c) {
438         case '\n':
439           *append_ptr++ = '\\';
440           *append_ptr++ = 'n';
441           break;
442         case '\r':
443           *append_ptr++ = '\\';
444           *append_ptr++ = 'r';
445           break;
446         case '\t':
447           *append_ptr++ = '\\';
448           *append_ptr++ = 't';
449           break;
450         case '\"':
451           *append_ptr++ = '\\';
452           *append_ptr++ = '\"';
453           break;
454         case '\'':
455           *append_ptr++ = '\\';
456           *append_ptr++ = '\'';
457           break;
458         case '\\':
459           *append_ptr++ = '\\';
460           *append_ptr++ = '\\';
461           break;
462       }
463     } else {
464       *append_ptr++ = '\\';
465       *append_ptr++ = '0' + static_cast<unsigned char>(c) / 64;
466       *append_ptr++ = '0' + (static_cast<unsigned char>(c) % 64) / 8;
467       *append_ptr++ = '0' + static_cast<unsigned char>(c) % 8;
468     }
469   }
470 }
471 
// Reverses the mapping in Base64EscapeInternal; see that method's
// documentation for details of the mapping.
//
// Decodes the 'szsrc' bytes at 'src_param' using the lookup table 'unbase64',
// which is indexed by byte value and yields that byte's 6-bit value, or a
// negative number for bytes that are not data characters.
//
// If 'dest' is non-null, decoded bytes are written to it and the function
// fails when more than 'szdest' bytes would be needed. If 'dest' is null the
// input is only validated and measured; nothing is written.
//
// Whitespace (per absl::ascii_isspace) is skipped. Trailing padding may use
// '=' or '.', and must consist of either no pad characters at all or exactly
// the number implied by the leftover data (two after 2 leftover data
// characters, one after 3).
//
// Returns true and stores the decoded length in '*len' on success. Returns
// false, without writing '*len', on malformed input or insufficient 'szdest'.
bool Base64UnescapeInternal(absl::Nullable<const char*> src_param, size_t szsrc,
                            absl::Nullable<char*> dest, size_t szdest,
                            absl::Nonnull<const signed char*> unbase64,
                            absl::Nonnull<size_t*> len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced (or counted)
  int decode = 0;         // table lookup result for the most recent byte
  int state = 0;          // data characters accumulated in 'temp', 0..3
  unsigned char ch = 0;   // most recent input byte
  unsigned int temp = 0;  // bit accumulator, 6 bits per data character

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
  //
  // When it stops, it records in 'state' how many data characters of the
  // current group were already consumed (4 - remain) and breaks out of the
  // enclosing while loop.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Validation-only twin of the loop above: same input consumption, but
    // output bytes are merely counted.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  // '*len' is only written when the padding count is acceptable.
  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
707 
708 // The arrays below map base64-escaped characters back to their original values.
709 // For the inverse case, see k(WebSafe)Base64Chars in the internal
710 // escaping.cc.
711 // These arrays were generated by the following inversion code:
712 // #include <sys/time.h>
713 // #include <stdlib.h>
714 // #include <string.h>
715 // main()
716 // {
717 //   static const char Base64[] =
718 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
719 //   char* pos;
720 //   int idx, i, j;
721 //   printf("    ");
722 //   for (i = 0; i < 255; i += 8) {
723 //     for (j = i; j < i + 8; j++) {
724 //       pos = strchr(Base64, j);
725 //       if ((pos == nullptr) || (j == 0))
726 //         idx = -1;
727 //       else
728 //         idx = pos - Base64;
729 //       if (idx == -1)
730 //         printf(" %2d,     ", idx);
731 //       else
732 //         printf(" %2d/*%c*/,", idx, j);
733 //     }
734 //     printf("\n    ");
735 //   }
736 // }
737 //
738 // where the value of "Base64[]" was replaced by one of k(WebSafe)Base64Chars
739 // in the internal escaping.cc.
/* clang-format off */
// Decoding table for the alphabet that uses '+' and '/': indexed by byte
// value, each of the 256 entries is that byte's 6-bit value (0..63), or -1 if
// the byte is not a data character of the alphabet.
constexpr signed char kUnBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
    07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
775 
// Reverse lookup for the web-safe Base64 alphabet ("A-Za-z0-9-_"), holding one
// entry per possible byte value. An entry is the 6-bit value of that
// character, or -1 when the byte is not part of the alphabet.
constexpr signed char kUnWebSafeBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
     7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
// The array bound is deduced from the initializer, so a dropped or duplicated
// row would otherwise go unnoticed and let a lookup by byte value run past
// the end of the table.
static_assert(sizeof(kUnWebSafeBase64) / sizeof(kUnWebSafeBase64[0]) == 256,
              "kUnWebSafeBase64 must have one entry per byte value");
810 /* clang-format on */
811 
812 template <typename String>
Base64UnescapeInternal(absl::Nullable<const char * > src,size_t slen,absl::Nonnull<String * > dest,absl::Nonnull<const signed char * > unbase64)813 bool Base64UnescapeInternal(absl::Nullable<const char*> src, size_t slen,
814                             absl::Nonnull<String*> dest,
815                             absl::Nonnull<const signed char*> unbase64) {
816   // Determine the size of the output string.  Base64 encodes every 3 bytes into
817   // 4 characters.  Any leftover chars are added directly for good measure.
818   const size_t dest_len = 3 * (slen / 4) + (slen % 4);
819 
820   strings_internal::STLStringResizeUninitialized(dest, dest_len);
821 
822   // We are getting the destination buffer by getting the beginning of the
823   // string and converting it into a char *.
824   size_t len;
825   const bool ok =
826       Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
827   if (!ok) {
828     dest->clear();
829     return false;
830   }
831 
832   // could be shorter if there was padding
833   assert(len <= dest_len);
834   dest->erase(len);
835 
836   return true;
837 }
838 
839 /* clang-format off */
// Lenient hex-digit lookup indexed by byte value: '0'..'9' give 0..9, and
// 'A'..'F' / 'a'..'f' give 10..15. Every other byte gives 0, so a character
// that is not a hex digit cannot be told apart from '0'.
constexpr char kHexValueLenient[256] = {
    /* 0x00 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x10 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x20 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x30 */ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0,  // '0'..'9'
    /* 0x40 */ 0, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 'A'..'F'
    /* 0x50 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x60 */ 0, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 'a'..'f'
    /* 0x70 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x80 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x90 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xA0 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xB0 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xC0 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xD0 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xE0 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xF0 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
};
858 
// Strict hex-digit lookup indexed by byte value: '0'..'9' give 0..9, and
// 'A'..'F' / 'a'..'f' give 10..15. Every other byte gives -1, which marks it
// as not being a hex digit.
constexpr signed char kHexValueStrict[256] = {
    /* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x30 */  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,  // '0'..'9'
    /* 0x40 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 'A'..'F'
    /* 0x50 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x60 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 'a'..'f'
    /* 0x70 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xA0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xB0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xC0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xD0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xE0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xF0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
877 /* clang-format on */
878 
879 // This is a templated function so that T can be either a char*
880 // or a string.  This works because we use the [] operator to access
881 // individual characters at a time.
882 template <typename T>
HexStringToBytesInternal(absl::Nullable<const char * > from,T to,size_t num)883 void HexStringToBytesInternal(absl::Nullable<const char*> from, T to,
884                               size_t num) {
885   for (size_t i = 0; i < num; i++) {
886     to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
887             (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
888   }
889 }
890 
891 // This is a templated function so that T can be either a char* or a
892 // std::string.
893 template <typename T>
BytesToHexStringInternal(absl::Nullable<const unsigned char * > src,T dest,size_t num)894 void BytesToHexStringInternal(absl::Nullable<const unsigned char*> src, T dest,
895                               size_t num) {
896   auto dest_ptr = &dest[0];
897   for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
898     const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
899     std::copy(hex_p, hex_p + 2, dest_ptr);
900   }
901 }
902 
903 }  // namespace
904 
905 // ----------------------------------------------------------------------
906 // CUnescape()
907 //
908 // See CUnescapeInternal() for implementation details.
909 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,absl::Nonnull<std::string * > dest,absl::Nullable<std::string * > error)910 bool CUnescape(absl::string_view source, absl::Nonnull<std::string*> dest,
911                absl::Nullable<std::string*> error) {
912   return CUnescapeInternal(source, kUnescapeNulls, dest, error);
913 }
914 
CEscape(absl::string_view src)915 std::string CEscape(absl::string_view src) {
916   std::string dest;
917   CEscapeAndAppendInternal(src, &dest);
918   return dest;
919 }
920 
CHexEscape(absl::string_view src)921 std::string CHexEscape(absl::string_view src) {
922   return CEscapeInternal(src, true, false);
923 }
924 
Utf8SafeCEscape(absl::string_view src)925 std::string Utf8SafeCEscape(absl::string_view src) {
926   return CEscapeInternal(src, false, true);
927 }
928 
Utf8SafeCHexEscape(absl::string_view src)929 std::string Utf8SafeCHexEscape(absl::string_view src) {
930   return CEscapeInternal(src, true, true);
931 }
932 
Base64Unescape(absl::string_view src,absl::Nonnull<std::string * > dest)933 bool Base64Unescape(absl::string_view src, absl::Nonnull<std::string*> dest) {
934   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
935 }
936 
WebSafeBase64Unescape(absl::string_view src,absl::Nonnull<std::string * > dest)937 bool WebSafeBase64Unescape(absl::string_view src,
938                            absl::Nonnull<std::string*> dest) {
939   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
940 }
941 
Base64Escape(absl::string_view src,absl::Nonnull<std::string * > dest)942 void Base64Escape(absl::string_view src, absl::Nonnull<std::string*> dest) {
943   strings_internal::Base64EscapeInternal(
944       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
945       true, strings_internal::kBase64Chars);
946 }
947 
WebSafeBase64Escape(absl::string_view src,absl::Nonnull<std::string * > dest)948 void WebSafeBase64Escape(absl::string_view src,
949                          absl::Nonnull<std::string*> dest) {
950   strings_internal::Base64EscapeInternal(
951       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
952       false, strings_internal::kWebSafeBase64Chars);
953 }
954 
Base64Escape(absl::string_view src)955 std::string Base64Escape(absl::string_view src) {
956   std::string dest;
957   strings_internal::Base64EscapeInternal(
958       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
959       true, strings_internal::kBase64Chars);
960   return dest;
961 }
962 
WebSafeBase64Escape(absl::string_view src)963 std::string WebSafeBase64Escape(absl::string_view src) {
964   std::string dest;
965   strings_internal::Base64EscapeInternal(
966       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
967       false, strings_internal::kWebSafeBase64Chars);
968   return dest;
969 }
970 
HexStringToBytes(absl::string_view hex,absl::Nonnull<std::string * > bytes)971 bool HexStringToBytes(absl::string_view hex,
972                       absl::Nonnull<std::string*> bytes) {
973   std::string output;
974 
975   size_t num_bytes = hex.size() / 2;
976   if (hex.size() != num_bytes * 2) {
977     return false;
978   }
979 
980   absl::strings_internal::STLStringResizeUninitialized(&output, num_bytes);
981   auto hex_p = hex.cbegin();
982   for (std::string::iterator bin_p = output.begin(); bin_p != output.end();
983        ++bin_p) {
984     int h1 = absl::kHexValueStrict[static_cast<size_t>(*hex_p++)];
985     int h2 = absl::kHexValueStrict[static_cast<size_t>(*hex_p++)];
986     if (h1 == -1 || h2 == -1) {
987       output.resize(static_cast<size_t>(bin_p - output.begin()));
988       return false;
989     }
990     *bin_p = static_cast<char>((h1 << 4) + h2);
991   }
992 
993   *bytes = std::move(output);
994   return true;
995 }
996 
HexStringToBytes(absl::string_view from)997 std::string HexStringToBytes(absl::string_view from) {
998   std::string result;
999   const auto num = from.size() / 2;
1000   strings_internal::STLStringResizeUninitialized(&result, num);
1001   absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
1002   return result;
1003 }
1004 
BytesToHexString(absl::string_view from)1005 std::string BytesToHexString(absl::string_view from) {
1006   std::string result;
1007   strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
1008   absl::BytesToHexStringInternal<std::string&>(
1009       reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
1010   return result;
1011 }
1012 
1013 ABSL_NAMESPACE_END
1014 }  // namespace absl
1015