xref: /aosp_15_r20/external/angle/third_party/abseil-cpp/absl/strings/escaping.cc (revision 8975f5c5ed3d1c378011245431ada316dfb6f244)
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/escaping.h"
16 
17 #include <algorithm>
18 #include <array>
19 #include <cassert>
20 #include <cstddef>
21 #include <cstdint>
22 #include <cstring>
23 #include <limits>
24 #include <string>
25 #include <utility>
26 
27 #include "absl/base/config.h"
28 #include "absl/base/internal/endian.h"
29 #include "absl/base/internal/raw_logging.h"
30 #include "absl/base/internal/unaligned_access.h"
31 #include "absl/base/nullability.h"
32 #include "absl/strings/ascii.h"
33 #include "absl/strings/charset.h"
34 #include "absl/strings/internal/escaping.h"
35 #include "absl/strings/internal/resize_uninitialized.h"
36 #include "absl/strings/internal/utf8.h"
37 #include "absl/strings/numbers.h"
38 #include "absl/strings/str_cat.h"
39 #include "absl/strings/string_view.h"
40 
41 namespace absl {
42 ABSL_NAMESPACE_BEGIN
43 namespace {
44 
// Value for the `leave_nulls_escaped` argument of CUnescapeInternal() that
// requests escape sequences evaluating to NUL be decoded like any other
// sequence rather than copied through still escaped.
constexpr bool kUnescapeNulls{false};
47 
// Returns true iff `c` is one of the octal digits '0' through '7'.
inline bool is_octal_digit(char c) {
  return c >= '0' && c <= '7';
}
49 
hex_digit_to_int(char c)50 inline unsigned int hex_digit_to_int(char c) {
51   static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
52                 "Character set must be ASCII.");
53   assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
54   unsigned int x = static_cast<unsigned char>(c);
55   if (x > '9') {
56     x += 9;
57   }
58   return x & 0xf;
59 }
60 
IsSurrogate(char32_t c,absl::string_view src,absl::Nullable<std::string * > error)61 inline bool IsSurrogate(char32_t c, absl::string_view src,
62                         absl::Nullable<std::string*> error) {
63   if (c >= 0xD800 && c <= 0xDFFF) {
64     if (error) {
65       *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
66                             src);
67     }
68     return true;
69   }
70   return false;
71 }
72 
73 // ----------------------------------------------------------------------
74 // CUnescapeInternal()
75 //    Implements both CUnescape() and CUnescapeForNullTerminatedString().
76 //
77 //    Unescapes C escape sequences and is the reverse of CEscape().
78 //
79 //    If 'source' is valid, stores the unescaped string and its size in
80 //    'dest' and 'dest_len' respectively, and returns true. Otherwise
81 //    returns false and optionally stores the error description in
82 //    'error'. Set 'error' to nullptr to disable error reporting.
83 //
84 //    'dest' should point to a buffer that is at least as big as 'source'.
85 //    'source' and 'dest' may be the same.
86 //
87 //     NOTE: any changes to this function must also be reflected in the older
88 //     UnescapeCEscapeSequences().
89 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,absl::Nonnull<char * > dest,absl::Nonnull<ptrdiff_t * > dest_len,absl::Nullable<std::string * > error)90 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
91                        absl::Nonnull<char*> dest,
92                        absl::Nonnull<ptrdiff_t*> dest_len,
93                        absl::Nullable<std::string*> error) {
94   char* d = dest;
95   const char* p = source.data();
96   const char* end = p + source.size();
97   const char* last_byte = end - 1;
98 
99   // Small optimization for case where source = dest and there's no escaping
100   while (p == d && p < end && *p != '\\') p++, d++;
101 
102   while (p < end) {
103     if (*p != '\\') {
104       *d++ = *p++;
105     } else {
106       if (++p > last_byte) {  // skip past the '\\'
107         if (error) *error = "String cannot end with \\";
108         return false;
109       }
110       switch (*p) {
111         case 'a':  *d++ = '\a';  break;
112         case 'b':  *d++ = '\b';  break;
113         case 'f':  *d++ = '\f';  break;
114         case 'n':  *d++ = '\n';  break;
115         case 'r':  *d++ = '\r';  break;
116         case 't':  *d++ = '\t';  break;
117         case 'v':  *d++ = '\v';  break;
118         case '\\': *d++ = '\\';  break;
119         case '?':  *d++ = '\?';  break;    // \?  Who knew?
120         case '\'': *d++ = '\'';  break;
121         case '"':  *d++ = '\"';  break;
122         case '0':
123         case '1':
124         case '2':
125         case '3':
126         case '4':
127         case '5':
128         case '6':
129         case '7': {
130           // octal digit: 1 to 3 digits
131           const char* octal_start = p;
132           unsigned int ch = static_cast<unsigned int>(*p - '0');  // digit 1
133           if (p < last_byte && is_octal_digit(p[1]))
134             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 2
135           if (p < last_byte && is_octal_digit(p[1]))
136             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 3
137           if (ch > 0xff) {
138             if (error) {
139               *error = "Value of \\" +
140                        std::string(octal_start,
141                                    static_cast<size_t>(p + 1 - octal_start)) +
142                        " exceeds 0xff";
143             }
144             return false;
145           }
146           if ((ch == 0) && leave_nulls_escaped) {
147             // Copy the escape sequence for the null character
148             const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
149             *d++ = '\\';
150             memmove(d, octal_start, octal_size);
151             d += octal_size;
152             break;
153           }
154           *d++ = static_cast<char>(ch);
155           break;
156         }
157         case 'x':
158         case 'X': {
159           if (p >= last_byte) {
160             if (error) *error = "String cannot end with \\x";
161             return false;
162           } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
163             if (error) *error = "\\x cannot be followed by a non-hex digit";
164             return false;
165           }
166           unsigned int ch = 0;
167           const char* hex_start = p;
168           while (p < last_byte &&
169                  absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
170             // Arbitrarily many hex digits
171             ch = (ch << 4) + hex_digit_to_int(*++p);
172           if (ch > 0xFF) {
173             if (error) {
174               *error = "Value of \\" +
175                        std::string(hex_start,
176                                    static_cast<size_t>(p + 1 - hex_start)) +
177                        " exceeds 0xff";
178             }
179             return false;
180           }
181           if ((ch == 0) && leave_nulls_escaped) {
182             // Copy the escape sequence for the null character
183             const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
184             *d++ = '\\';
185             memmove(d, hex_start, hex_size);
186             d += hex_size;
187             break;
188           }
189           *d++ = static_cast<char>(ch);
190           break;
191         }
192         case 'u': {
193           // \uhhhh => convert 4 hex digits to UTF-8
194           char32_t rune = 0;
195           const char* hex_start = p;
196           if (p + 4 >= end) {
197             if (error) {
198               *error = "\\u must be followed by 4 hex digits: \\" +
199                        std::string(hex_start,
200                                    static_cast<size_t>(p + 1 - hex_start));
201             }
202             return false;
203           }
204           for (int i = 0; i < 4; ++i) {
205             // Look one char ahead.
206             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
207               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
208             } else {
209               if (error) {
210                 *error = "\\u must be followed by 4 hex digits: \\" +
211                          std::string(hex_start,
212                                      static_cast<size_t>(p + 1 - hex_start));
213               }
214               return false;
215             }
216           }
217           if ((rune == 0) && leave_nulls_escaped) {
218             // Copy the escape sequence for the null character
219             *d++ = '\\';
220             memmove(d, hex_start, 5);  // u0000
221             d += 5;
222             break;
223           }
224           if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
225             return false;
226           }
227           d += strings_internal::EncodeUTF8Char(d, rune);
228           break;
229         }
230         case 'U': {
231           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
232           char32_t rune = 0;
233           const char* hex_start = p;
234           if (p + 8 >= end) {
235             if (error) {
236               *error = "\\U must be followed by 8 hex digits: \\" +
237                        std::string(hex_start,
238                                    static_cast<size_t>(p + 1 - hex_start));
239             }
240             return false;
241           }
242           for (int i = 0; i < 8; ++i) {
243             // Look one char ahead.
244             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
245               // Don't change rune until we're sure this
246               // is within the Unicode limit, but do advance p.
247               uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
248               if (newrune > 0x10FFFF) {
249                 if (error) {
250                   *error = "Value of \\" +
251                            std::string(hex_start,
252                                        static_cast<size_t>(p + 1 - hex_start)) +
253                            " exceeds Unicode limit (0x10FFFF)";
254                 }
255                 return false;
256               } else {
257                 rune = newrune;
258               }
259             } else {
260               if (error) {
261                 *error = "\\U must be followed by 8 hex digits: \\" +
262                          std::string(hex_start,
263                                      static_cast<size_t>(p + 1 - hex_start));
264               }
265               return false;
266             }
267           }
268           if ((rune == 0) && leave_nulls_escaped) {
269             // Copy the escape sequence for the null character
270             *d++ = '\\';
271             memmove(d, hex_start, 9);  // U00000000
272             d += 9;
273             break;
274           }
275           if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
276             return false;
277           }
278           d += strings_internal::EncodeUTF8Char(d, rune);
279           break;
280         }
281         default: {
282           if (error) *error = std::string("Unknown escape sequence: \\") + *p;
283           return false;
284         }
285       }
286       p++;                                 // read past letter we escaped
287     }
288   }
289   *dest_len = d - dest;
290   return true;
291 }
292 
293 // ----------------------------------------------------------------------
294 // CUnescapeInternal()
295 //
296 //    Same as above but uses a std::string for output. 'source' and 'dest'
297 //    may be the same.
298 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,absl::Nonnull<std::string * > dest,absl::Nullable<std::string * > error)299 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
300                        absl::Nonnull<std::string*> dest,
301                        absl::Nullable<std::string*> error) {
302   strings_internal::STLStringResizeUninitialized(dest, source.size());
303 
304   ptrdiff_t dest_size;
305   if (!CUnescapeInternal(source,
306                          leave_nulls_escaped,
307                          &(*dest)[0],
308                          &dest_size,
309                          error)) {
310     return false;
311   }
312   dest->erase(static_cast<size_t>(dest_size));
313   return true;
314 }
315 
316 // ----------------------------------------------------------------------
317 // CEscape()
318 // CHexEscape()
319 // Utf8SafeCEscape()
320 // Utf8SafeCHexEscape()
321 //    Escapes 'src' using C-style escape sequences.  This is useful for
322 //    preparing query flags.  The 'Hex' version uses hexadecimal rather than
323 //    octal sequences.  The 'Utf8Safe' version does not touch UTF-8 bytes.
324 //
325 //    Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
326 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)327 std::string CEscapeInternal(absl::string_view src, bool use_hex,
328                             bool utf8_safe) {
329   std::string dest;
330   bool last_hex_escape = false;  // true if last output char was \xNN.
331 
332   for (char c : src) {
333     bool is_hex_escape = false;
334     switch (c) {
335       case '\n': dest.append("\\" "n"); break;
336       case '\r': dest.append("\\" "r"); break;
337       case '\t': dest.append("\\" "t"); break;
338       case '\"': dest.append("\\" "\""); break;
339       case '\'': dest.append("\\" "'"); break;
340       case '\\': dest.append("\\" "\\"); break;
341       default: {
342         // Note that if we emit \xNN and the src character after that is a hex
343         // digit then that digit must be escaped too to prevent it being
344         // interpreted as part of the character code by C.
345         const unsigned char uc = static_cast<unsigned char>(c);
346         if ((!utf8_safe || uc < 0x80) &&
347             (!absl::ascii_isprint(uc) ||
348              (last_hex_escape && absl::ascii_isxdigit(uc)))) {
349           if (use_hex) {
350             dest.append("\\" "x");
351             dest.push_back(numbers_internal::kHexChar[uc / 16]);
352             dest.push_back(numbers_internal::kHexChar[uc % 16]);
353             is_hex_escape = true;
354           } else {
355             dest.append("\\");
356             dest.push_back(numbers_internal::kHexChar[uc / 64]);
357             dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
358             dest.push_back(numbers_internal::kHexChar[uc % 8]);
359           }
360         } else {
361           dest.push_back(c);
362           break;
363         }
364       }
365     }
366     last_hex_escape = is_hex_escape;
367   }
368 
369   return dest;
370 }
371 
/* clang-format off */
// kCEscapedLen[b] is the number of output characters used for byte value b
// when escaping with octal sequences:
//   1 - the byte is copied unchanged,
//   2 - backslash plus one character (\t \n \r \" \' \\),
//   4 - backslash plus three octal digits.
constexpr unsigned char kCEscapedLen[256] = {
    // 0x00 - 0x0F: 0x09 '\t', 0x0A '\n' and 0x0D '\r' take two characters.
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,
    // 0x10 - 0x1F
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    // 0x20 - 0x2F: 0x22 '"' and 0x27 '\'' take two characters.
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x30 - 0x3F: '0'..'9' and punctuation
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x40 - 0x4F: '@', 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x50 - 0x5F: 'P'..'Z'; 0x5C '\\' takes two characters.
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
    // 0x60 - 0x6F: '`', 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x70 - 0x7F: 'p'..'z'; 0x7F (DEL) takes four characters.
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,
    // 0x80 - 0xFF: every byte takes four characters.
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
/* clang-format on */
392 
MakeCEscapedLittleEndianUint32(size_t c)393 constexpr uint32_t MakeCEscapedLittleEndianUint32(size_t c) {
394   size_t char_len = kCEscapedLen[c];
395   if (char_len == 1) {
396     return static_cast<uint32_t>(c);
397   }
398   if (char_len == 2) {
399     switch (c) {
400       case '\n':
401         return '\\' | (static_cast<uint32_t>('n') << 8);
402       case '\r':
403         return '\\' | (static_cast<uint32_t>('r') << 8);
404       case '\t':
405         return '\\' | (static_cast<uint32_t>('t') << 8);
406       case '\"':
407         return '\\' | (static_cast<uint32_t>('\"') << 8);
408       case '\'':
409         return '\\' | (static_cast<uint32_t>('\'') << 8);
410       case '\\':
411         return '\\' | (static_cast<uint32_t>('\\') << 8);
412     }
413   }
414   return static_cast<uint32_t>('\\' | (('0' + (c / 64)) << 8) |
415                                (('0' + ((c % 64) / 8)) << 16) |
416                                (('0' + (c % 8)) << 24));
417 }
418 
419 template <size_t... indexes>
420 inline constexpr std::array<uint32_t, sizeof...(indexes)>
MakeCEscapedLittleEndianUint32Array(std::index_sequence<indexes...>)421 MakeCEscapedLittleEndianUint32Array(std::index_sequence<indexes...>) {
422   return {MakeCEscapedLittleEndianUint32(indexes)...};
423 }
424 constexpr std::array<uint32_t, 256> kCEscapedLittleEndianUint32Array =
425     MakeCEscapedLittleEndianUint32Array(std::make_index_sequence<256>());
426 
427 // Calculates the length of the C-style escaped version of 'src'.
428 // Assumes that non-printable characters are escaped using octal sequences, and
429 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)430 inline size_t CEscapedLength(absl::string_view src) {
431   size_t escaped_len = 0;
432   // The maximum value of kCEscapedLen[x] is 4, so we can escape any string of
433   // length size_t_max/4 without checking for overflow.
434   size_t unchecked_limit =
435       std::min<size_t>(src.size(), std::numeric_limits<size_t>::max() / 4);
436   size_t i = 0;
437   while (i < unchecked_limit) {
438     // Common case: No need to check for overflow.
439     escaped_len += kCEscapedLen[static_cast<unsigned char>(src[i++])];
440   }
441   while (i < src.size()) {
442     // Beyond unchecked_limit we need to check for overflow before adding.
443     size_t char_len = kCEscapedLen[static_cast<unsigned char>(src[i++])];
444     ABSL_INTERNAL_CHECK(
445         escaped_len <= std::numeric_limits<size_t>::max() - char_len,
446         "escaped_len overflow");
447     escaped_len += char_len;
448   }
449   return escaped_len;
450 }
451 
CEscapeAndAppendInternal(absl::string_view src,absl::Nonnull<std::string * > dest)452 void CEscapeAndAppendInternal(absl::string_view src,
453                               absl::Nonnull<std::string*> dest) {
454   size_t escaped_len = CEscapedLength(src);
455   if (escaped_len == src.size()) {
456     dest->append(src.data(), src.size());
457     return;
458   }
459 
460   // We keep 3 slop bytes so that we can call `little_endian::Store32`
461   // invariably regardless of the length of the escaped character.
462   constexpr size_t slop_bytes = 3;
463   size_t cur_dest_len = dest->size();
464   size_t new_dest_len = cur_dest_len + escaped_len + slop_bytes;
465   ABSL_INTERNAL_CHECK(new_dest_len > cur_dest_len, "std::string size overflow");
466   strings_internal::AppendUninitializedTraits<std::string>::Append(
467       dest, escaped_len + slop_bytes);
468   char* append_ptr = &(*dest)[cur_dest_len];
469 
470   for (char c : src) {
471     unsigned char uc = static_cast<unsigned char>(c);
472     size_t char_len = kCEscapedLen[uc];
473     uint32_t little_endian_uint32 = kCEscapedLittleEndianUint32Array[uc];
474     little_endian::Store32(append_ptr, little_endian_uint32);
475     append_ptr += char_len;
476   }
477   dest->resize(new_dest_len - slop_bytes);
478 }
479 
// Reverses the mapping in Base64EscapeInternal; see that method's
// documentation for details of the mapping.
//
// Decodes the 'szsrc' bytes starting at 'src_param'.
//
//   dest, szdest  Output buffer and its capacity in bytes.  If 'dest' is
//                 null nothing is written and the input is only validated
//                 (the decoded length is still counted).
//   unbase64      Lookup table indexed by input byte value: the 6-bit value
//                 of a data character, or a negative number for any byte
//                 that is not a data character.
//   len           On success receives the number of decoded bytes.  It is
//                 not modified on failure.
//
// ASCII whitespace in the input is skipped.  The data may be followed by
// padding made of '=' or '.' characters; either no padding characters or
// exactly the number implied by the amount of data is accepted.
//
// Returns false if the input contains a byte that is neither data,
// whitespace nor padding, if a single data character is left over at the
// end, if the number of padding characters is wrong, or if 'dest' is
// non-null and 'szdest' is too small.
bool Base64UnescapeInternal(absl::Nullable<const char*> src_param, size_t szsrc,
                            absl::Nullable<char*> dest, size_t szdest,
                            absl::Nonnull<const signed char*> unbase64,
                            absl::Nonnull<size_t*> len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // Number of output bytes produced (or counted).
  int decode = 0;         // unbase64[] value of the last byte read slowly.
  int state = 0;          // Data characters accumulated in 'temp' (0-3).
  unsigned char ch = 0;   // Last input byte read by the slow paths.
  unsigned int temp = 0;  // Accumulated bits, six per data character.

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
  //
  // Note that the 'break' inside the macro leaves the enclosing 'while'
  // loop, with 'state' recording how many data characters of the current
  // group had already been read.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Validation-only twin of the loop above: same input handling, but the
    // decoded bytes are only counted, never stored.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;  // Discard the 4 low bits that do not form a full byte.
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;  // Discard the 2 low bits that do not form a full byte.
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  // 'len' is only written when the whole input turned out to be valid.
  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
715 
716 // The arrays below map base64-escaped characters back to their original values.
717 // For the inverse case, see k(WebSafe)Base64Chars in the internal
718 // escaping.cc.
719 // These arrays were generated by the following inversion code:
720 // #include <sys/time.h>
721 // #include <stdlib.h>
722 // #include <string.h>
723 // main()
724 // {
725 //   static const char Base64[] =
726 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
727 //   char* pos;
728 //   int idx, i, j;
729 //   printf("    ");
730 //   for (i = 0; i < 255; i += 8) {
731 //     for (j = i; j < i + 8; j++) {
732 //       pos = strchr(Base64, j);
733 //       if ((pos == nullptr) || (j == 0))
734 //         idx = -1;
735 //       else
736 //         idx = pos - Base64;
737 //       if (idx == -1)
738 //         printf(" %2d,     ", idx);
739 //       else
740 //         printf(" %2d/*%c*/,", idx, j);
741 //     }
742 //     printf("\n    ");
743 //   }
744 // }
745 //
746 // where the value of "Base64[]" was replaced by one of k(WebSafe)Base64Chars
747 // in the internal escaping.cc.
/* clang-format off */
// Decoding table for the alphabet "A-Z a-z 0-9 + /": kUnBase64[b] is the
// 6-bit value (0..63) of input byte b, or -1 if b is not one of those 64
// characters.  The table has one entry for each of the 256 byte values; the
// last 16 rows cover bytes 0x80-0xFF, none of which is a valid character.
constexpr signed char kUnBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
    07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
783 
// Inverse of the web-safe base64 alphabet, indexed by byte value.
// Entry [c] is the 6-bit value (0..63) encoded by character c, or -1 when c is
// not part of the alphabet. Here '-' and '_' carry the values 62 and 63.
// Sixteen entries per row; the trailing comment gives the index of the first
// entry in the row.
constexpr signed char kUnWebSafeBase64[256] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x00
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x10
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1,  // 0x20: '-' = 62
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,  // 0x30: '0'..'9'
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  // 0x40: 'A'..'O'
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, 63,  // 0x50: 'P'..'Z', '_' = 63
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,  // 0x60: 'a'..'o'
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,  // 0x70: 'p'..'z'
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x80
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x90
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xA0
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xB0
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xC0
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xD0
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xE0
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xF0
};
/* clang-format on */
819 
820 template <typename String>
Base64UnescapeInternal(absl::Nullable<const char * > src,size_t slen,absl::Nonnull<String * > dest,absl::Nonnull<const signed char * > unbase64)821 bool Base64UnescapeInternal(absl::Nullable<const char*> src, size_t slen,
822                             absl::Nonnull<String*> dest,
823                             absl::Nonnull<const signed char*> unbase64) {
824   // Determine the size of the output string.  Base64 encodes every 3 bytes into
825   // 4 characters.  Any leftover chars are added directly for good measure.
826   const size_t dest_len = 3 * (slen / 4) + (slen % 4);
827 
828   strings_internal::STLStringResizeUninitialized(dest, dest_len);
829 
830   // We are getting the destination buffer by getting the beginning of the
831   // string and converting it into a char *.
832   size_t len;
833   const bool ok =
834       Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
835   if (!ok) {
836     dest->clear();
837     return false;
838   }
839 
840   // could be shorter if there was padding
841   assert(len <= dest_len);
842   dest->erase(len);
843 
844   return true;
845 }
846 
/* clang-format off */
// Lenient hex-digit lookup, indexed by byte value: '0'..'9', 'A'..'F' and
// 'a'..'f' map to 0..15; every other byte maps to 0 rather than to an error
// marker. The trailing comment gives the index of the first entry in the row.
constexpr char kHexValueLenient[256] = {
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0x00
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0x10
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0x20
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0,  // 0x30: '0'..'9'
    0, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0x40: 'A'..'F'
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0x50
    0, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0x60: 'a'..'f'
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0x70
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0x80
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0x90
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0xA0
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0xB0
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0xC0
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0xD0
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0xE0
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 0xF0
};
866 
// Strict hex-digit lookup, indexed by byte value: '0'..'9', 'A'..'F' and
// 'a'..'f' map to 0..15; every other byte maps to -1 so callers can detect
// invalid input. The trailing comment gives the index of the first entry in
// the row.
constexpr signed char kHexValueStrict[256] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x00
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x10
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x20
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,  // 0x30: '0'..'9'
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x40: 'A'..'F'
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x50
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x60: 'a'..'f'
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x70
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x80
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0x90
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xA0
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xB0
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xC0
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xD0
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xE0
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 0xF0
};
/* clang-format on */
886 
887 // This is a templated function so that T can be either a char*
888 // or a string.  This works because we use the [] operator to access
889 // individual characters at a time.
890 template <typename T>
HexStringToBytesInternal(absl::Nullable<const char * > from,T to,size_t num)891 void HexStringToBytesInternal(absl::Nullable<const char*> from, T to,
892                               size_t num) {
893   for (size_t i = 0; i < num; i++) {
894     to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
895             (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
896   }
897 }
898 
899 // This is a templated function so that T can be either a char* or a
900 // std::string.
901 template <typename T>
BytesToHexStringInternal(absl::Nullable<const unsigned char * > src,T dest,size_t num)902 void BytesToHexStringInternal(absl::Nullable<const unsigned char*> src, T dest,
903                               size_t num) {
904   auto dest_ptr = &dest[0];
905   for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
906     const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
907     std::copy(hex_p, hex_p + 2, dest_ptr);
908   }
909 }
910 
911 }  // namespace
912 
913 // ----------------------------------------------------------------------
914 // CUnescape()
915 //
916 // See CUnescapeInternal() for implementation details.
917 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,absl::Nonnull<std::string * > dest,absl::Nullable<std::string * > error)918 bool CUnescape(absl::string_view source, absl::Nonnull<std::string*> dest,
919                absl::Nullable<std::string*> error) {
920   return CUnescapeInternal(source, kUnescapeNulls, dest, error);
921 }
922 
CEscape(absl::string_view src)923 std::string CEscape(absl::string_view src) {
924   std::string dest;
925   CEscapeAndAppendInternal(src, &dest);
926   return dest;
927 }
928 
CHexEscape(absl::string_view src)929 std::string CHexEscape(absl::string_view src) {
930   return CEscapeInternal(src, true, false);
931 }
932 
Utf8SafeCEscape(absl::string_view src)933 std::string Utf8SafeCEscape(absl::string_view src) {
934   return CEscapeInternal(src, false, true);
935 }
936 
Utf8SafeCHexEscape(absl::string_view src)937 std::string Utf8SafeCHexEscape(absl::string_view src) {
938   return CEscapeInternal(src, true, true);
939 }
940 
Base64Unescape(absl::string_view src,absl::Nonnull<std::string * > dest)941 bool Base64Unescape(absl::string_view src, absl::Nonnull<std::string*> dest) {
942   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
943 }
944 
WebSafeBase64Unescape(absl::string_view src,absl::Nonnull<std::string * > dest)945 bool WebSafeBase64Unescape(absl::string_view src,
946                            absl::Nonnull<std::string*> dest) {
947   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
948 }
949 
Base64Escape(absl::string_view src,absl::Nonnull<std::string * > dest)950 void Base64Escape(absl::string_view src, absl::Nonnull<std::string*> dest) {
951   strings_internal::Base64EscapeInternal(
952       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
953       true, strings_internal::kBase64Chars);
954 }
955 
WebSafeBase64Escape(absl::string_view src,absl::Nonnull<std::string * > dest)956 void WebSafeBase64Escape(absl::string_view src,
957                          absl::Nonnull<std::string*> dest) {
958   strings_internal::Base64EscapeInternal(
959       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
960       false, strings_internal::kWebSafeBase64Chars);
961 }
962 
Base64Escape(absl::string_view src)963 std::string Base64Escape(absl::string_view src) {
964   std::string dest;
965   strings_internal::Base64EscapeInternal(
966       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
967       true, strings_internal::kBase64Chars);
968   return dest;
969 }
970 
WebSafeBase64Escape(absl::string_view src)971 std::string WebSafeBase64Escape(absl::string_view src) {
972   std::string dest;
973   strings_internal::Base64EscapeInternal(
974       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
975       false, strings_internal::kWebSafeBase64Chars);
976   return dest;
977 }
978 
HexStringToBytes(absl::string_view hex,absl::Nonnull<std::string * > bytes)979 bool HexStringToBytes(absl::string_view hex,
980                       absl::Nonnull<std::string*> bytes) {
981   std::string output;
982 
983   size_t num_bytes = hex.size() / 2;
984   if (hex.size() != num_bytes * 2) {
985     return false;
986   }
987 
988   absl::strings_internal::STLStringResizeUninitialized(&output, num_bytes);
989   auto hex_p = hex.cbegin();
990   for (std::string::iterator bin_p = output.begin(); bin_p != output.end();
991        ++bin_p) {
992     int h1 = absl::kHexValueStrict[static_cast<size_t>(*hex_p++)];
993     int h2 = absl::kHexValueStrict[static_cast<size_t>(*hex_p++)];
994     if (h1 == -1 || h2 == -1) {
995       output.resize(static_cast<size_t>(bin_p - output.begin()));
996       return false;
997     }
998     *bin_p = static_cast<char>((h1 << 4) + h2);
999   }
1000 
1001   *bytes = std::move(output);
1002   return true;
1003 }
1004 
HexStringToBytes(absl::string_view from)1005 std::string HexStringToBytes(absl::string_view from) {
1006   std::string result;
1007   const auto num = from.size() / 2;
1008   strings_internal::STLStringResizeUninitialized(&result, num);
1009   absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
1010   return result;
1011 }
1012 
BytesToHexString(absl::string_view from)1013 std::string BytesToHexString(absl::string_view from) {
1014   std::string result;
1015   strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
1016   absl::BytesToHexStringInternal<std::string&>(
1017       reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
1018   return result;
1019 }
1020 
1021 ABSL_NAMESPACE_END
1022 }  // namespace absl
1023