1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/escaping.h"
16 
17 #include <algorithm>
18 #include <cassert>
19 #include <cstdint>
20 #include <cstring>
21 #include <iterator>
22 #include <limits>
23 #include <string>
24 
25 #include "absl/base/internal/endian.h"
26 #include "absl/base/internal/raw_logging.h"
27 #include "absl/base/internal/unaligned_access.h"
28 #include "absl/strings/internal/char_map.h"
29 #include "absl/strings/internal/escaping.h"
30 #include "absl/strings/internal/resize_uninitialized.h"
31 #include "absl/strings/internal/utf8.h"
32 #include "absl/strings/str_cat.h"
33 #include "absl/strings/str_join.h"
34 #include "absl/strings/string_view.h"
35 
36 namespace absl {
37 ABSL_NAMESPACE_BEGIN
38 namespace {
39 
// Value passed as the leave_nulls_escaped argument of CUnescapeInternal().
// With false, an escape sequence that denotes the NUL character is decoded to
// a NUL byte instead of being copied through in its escaped form.
constexpr bool kUnescapeNulls = false;
42 
// Returns true iff `c` is one of the ASCII octal digits '0' through '7'.
inline bool is_octal_digit(char c) {
  return c >= '0' && c <= '7';
}
44 
hex_digit_to_int(char c)45 inline unsigned int hex_digit_to_int(char c) {
46   static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
47                 "Character set must be ASCII.");
48   assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
49   unsigned int x = static_cast<unsigned char>(c);
50   if (x > '9') {
51     x += 9;
52   }
53   return x & 0xf;
54 }
55 
IsSurrogate(char32_t c,absl::string_view src,std::string * error)56 inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
57   if (c >= 0xD800 && c <= 0xDFFF) {
58     if (error) {
59       *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
60                             src);
61     }
62     return true;
63   }
64   return false;
65 }
66 
67 // ----------------------------------------------------------------------
68 // CUnescapeInternal()
69 //    Implements both CUnescape() and CUnescapeForNullTerminatedString().
70 //
71 //    Unescapes C escape sequences and is the reverse of CEscape().
72 //
73 //    If 'source' is valid, stores the unescaped string and its size in
74 //    'dest' and 'dest_len' respectively, and returns true. Otherwise
75 //    returns false and optionally stores the error description in
76 //    'error'. Set 'error' to nullptr to disable error reporting.
77 //
78 //    'dest' should point to a buffer that is at least as big as 'source'.
79 //    'source' and 'dest' may be the same.
80 //
81 //     NOTE: any changes to this function must also be reflected in the older
82 //     UnescapeCEscapeSequences().
83 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,char * dest,ptrdiff_t * dest_len,std::string * error)84 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
85                        char* dest, ptrdiff_t* dest_len, std::string* error) {
86   char* d = dest;
87   const char* p = source.data();
88   const char* end = p + source.size();
89   const char* last_byte = end - 1;
90 
91   // Small optimization for case where source = dest and there's no escaping
92   while (p == d && p < end && *p != '\\') p++, d++;
93 
94   while (p < end) {
95     if (*p != '\\') {
96       *d++ = *p++;
97     } else {
98       if (++p > last_byte) {  // skip past the '\\'
99         if (error) *error = "String cannot end with \\";
100         return false;
101       }
102       switch (*p) {
103         case 'a':  *d++ = '\a';  break;
104         case 'b':  *d++ = '\b';  break;
105         case 'f':  *d++ = '\f';  break;
106         case 'n':  *d++ = '\n';  break;
107         case 'r':  *d++ = '\r';  break;
108         case 't':  *d++ = '\t';  break;
109         case 'v':  *d++ = '\v';  break;
110         case '\\': *d++ = '\\';  break;
111         case '?':  *d++ = '\?';  break;    // \?  Who knew?
112         case '\'': *d++ = '\'';  break;
113         case '"':  *d++ = '\"';  break;
114         case '0':
115         case '1':
116         case '2':
117         case '3':
118         case '4':
119         case '5':
120         case '6':
121         case '7': {
122           // octal digit: 1 to 3 digits
123           const char* octal_start = p;
124           unsigned int ch = static_cast<unsigned int>(*p - '0');  // digit 1
125           if (p < last_byte && is_octal_digit(p[1]))
126             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 2
127           if (p < last_byte && is_octal_digit(p[1]))
128             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 3
129           if (ch > 0xff) {
130             if (error) {
131               *error = "Value of \\" +
132                        std::string(octal_start,
133                                    static_cast<size_t>(p + 1 - octal_start)) +
134                        " exceeds 0xff";
135             }
136             return false;
137           }
138           if ((ch == 0) && leave_nulls_escaped) {
139             // Copy the escape sequence for the null character
140             const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
141             *d++ = '\\';
142             memmove(d, octal_start, octal_size);
143             d += octal_size;
144             break;
145           }
146           *d++ = static_cast<char>(ch);
147           break;
148         }
149         case 'x':
150         case 'X': {
151           if (p >= last_byte) {
152             if (error) *error = "String cannot end with \\x";
153             return false;
154           } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
155             if (error) *error = "\\x cannot be followed by a non-hex digit";
156             return false;
157           }
158           unsigned int ch = 0;
159           const char* hex_start = p;
160           while (p < last_byte &&
161                  absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
162             // Arbitrarily many hex digits
163             ch = (ch << 4) + hex_digit_to_int(*++p);
164           if (ch > 0xFF) {
165             if (error) {
166               *error = "Value of \\" +
167                        std::string(hex_start,
168                                    static_cast<size_t>(p + 1 - hex_start)) +
169                        " exceeds 0xff";
170             }
171             return false;
172           }
173           if ((ch == 0) && leave_nulls_escaped) {
174             // Copy the escape sequence for the null character
175             const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
176             *d++ = '\\';
177             memmove(d, hex_start, hex_size);
178             d += hex_size;
179             break;
180           }
181           *d++ = static_cast<char>(ch);
182           break;
183         }
184         case 'u': {
185           // \uhhhh => convert 4 hex digits to UTF-8
186           char32_t rune = 0;
187           const char* hex_start = p;
188           if (p + 4 >= end) {
189             if (error) {
190               *error = "\\u must be followed by 4 hex digits: \\" +
191                        std::string(hex_start,
192                                    static_cast<size_t>(p + 1 - hex_start));
193             }
194             return false;
195           }
196           for (int i = 0; i < 4; ++i) {
197             // Look one char ahead.
198             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
199               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
200             } else {
201               if (error) {
202                 *error = "\\u must be followed by 4 hex digits: \\" +
203                          std::string(hex_start,
204                                      static_cast<size_t>(p + 1 - hex_start));
205               }
206               return false;
207             }
208           }
209           if ((rune == 0) && leave_nulls_escaped) {
210             // Copy the escape sequence for the null character
211             *d++ = '\\';
212             memmove(d, hex_start, 5);  // u0000
213             d += 5;
214             break;
215           }
216           if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
217             return false;
218           }
219           d += strings_internal::EncodeUTF8Char(d, rune);
220           break;
221         }
222         case 'U': {
223           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
224           char32_t rune = 0;
225           const char* hex_start = p;
226           if (p + 8 >= end) {
227             if (error) {
228               *error = "\\U must be followed by 8 hex digits: \\" +
229                        std::string(hex_start,
230                                    static_cast<size_t>(p + 1 - hex_start));
231             }
232             return false;
233           }
234           for (int i = 0; i < 8; ++i) {
235             // Look one char ahead.
236             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
237               // Don't change rune until we're sure this
238               // is within the Unicode limit, but do advance p.
239               uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
240               if (newrune > 0x10FFFF) {
241                 if (error) {
242                   *error = "Value of \\" +
243                            std::string(hex_start,
244                                        static_cast<size_t>(p + 1 - hex_start)) +
245                            " exceeds Unicode limit (0x10FFFF)";
246                 }
247                 return false;
248               } else {
249                 rune = newrune;
250               }
251             } else {
252               if (error) {
253                 *error = "\\U must be followed by 8 hex digits: \\" +
254                          std::string(hex_start,
255                                      static_cast<size_t>(p + 1 - hex_start));
256               }
257               return false;
258             }
259           }
260           if ((rune == 0) && leave_nulls_escaped) {
261             // Copy the escape sequence for the null character
262             *d++ = '\\';
263             memmove(d, hex_start, 9);  // U00000000
264             d += 9;
265             break;
266           }
267           if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
268             return false;
269           }
270           d += strings_internal::EncodeUTF8Char(d, rune);
271           break;
272         }
273         default: {
274           if (error) *error = std::string("Unknown escape sequence: \\") + *p;
275           return false;
276         }
277       }
278       p++;                                 // read past letter we escaped
279     }
280   }
281   *dest_len = d - dest;
282   return true;
283 }
284 
285 // ----------------------------------------------------------------------
286 // CUnescapeInternal()
287 //
288 //    Same as above but uses a std::string for output. 'source' and 'dest'
289 //    may be the same.
290 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,std::string * dest,std::string * error)291 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
292                        std::string* dest, std::string* error) {
293   strings_internal::STLStringResizeUninitialized(dest, source.size());
294 
295   ptrdiff_t dest_size;
296   if (!CUnescapeInternal(source,
297                          leave_nulls_escaped,
298                          &(*dest)[0],
299                          &dest_size,
300                          error)) {
301     return false;
302   }
303   dest->erase(static_cast<size_t>(dest_size));
304   return true;
305 }
306 
307 // ----------------------------------------------------------------------
308 // CEscape()
309 // CHexEscape()
310 // Utf8SafeCEscape()
311 // Utf8SafeCHexEscape()
312 //    Escapes 'src' using C-style escape sequences.  This is useful for
313 //    preparing query flags.  The 'Hex' version uses hexadecimal rather than
314 //    octal sequences.  The 'Utf8Safe' version does not touch UTF-8 bytes.
315 //
316 //    Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
317 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)318 std::string CEscapeInternal(absl::string_view src, bool use_hex,
319                             bool utf8_safe) {
320   std::string dest;
321   bool last_hex_escape = false;  // true if last output char was \xNN.
322 
323   for (char c : src) {
324     bool is_hex_escape = false;
325     switch (c) {
326       case '\n': dest.append("\\" "n"); break;
327       case '\r': dest.append("\\" "r"); break;
328       case '\t': dest.append("\\" "t"); break;
329       case '\"': dest.append("\\" "\""); break;
330       case '\'': dest.append("\\" "'"); break;
331       case '\\': dest.append("\\" "\\"); break;
332       default: {
333         // Note that if we emit \xNN and the src character after that is a hex
334         // digit then that digit must be escaped too to prevent it being
335         // interpreted as part of the character code by C.
336         const unsigned char uc = static_cast<unsigned char>(c);
337         if ((!utf8_safe || uc < 0x80) &&
338             (!absl::ascii_isprint(uc) ||
339              (last_hex_escape && absl::ascii_isxdigit(uc)))) {
340           if (use_hex) {
341             dest.append("\\" "x");
342             dest.push_back(numbers_internal::kHexChar[uc / 16]);
343             dest.push_back(numbers_internal::kHexChar[uc % 16]);
344             is_hex_escape = true;
345           } else {
346             dest.append("\\");
347             dest.push_back(numbers_internal::kHexChar[uc / 64]);
348             dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
349             dest.push_back(numbers_internal::kHexChar[uc % 8]);
350           }
351         } else {
352           dest.push_back(c);
353           break;
354         }
355       }
356     }
357     last_hex_escape = is_hex_escape;
358   }
359 
360   return dest;
361 }
362 
363 /* clang-format off */
// Table indexed by byte value (0-255). Each entry is the number of output
// characters CEscapeAndAppendInternal() writes for that byte: 1 when the byte
// is copied as-is, 2 for a backslash followed by a single character
// (\t, \n, \r, \", \', \\), and 4 for a backslash followed by three octal
// digits (every other byte below 0x20, and all bytes from 0x7f upwards).
constexpr unsigned char c_escaped_len[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
382 /* clang-format on */
383 
384 // Calculates the length of the C-style escaped version of 'src'.
385 // Assumes that non-printable characters are escaped using octal sequences, and
386 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)387 inline size_t CEscapedLength(absl::string_view src) {
388   size_t escaped_len = 0;
389   for (char c : src)
390     escaped_len += c_escaped_len[static_cast<unsigned char>(c)];
391   return escaped_len;
392 }
393 
CEscapeAndAppendInternal(absl::string_view src,std::string * dest)394 void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
395   size_t escaped_len = CEscapedLength(src);
396   if (escaped_len == src.size()) {
397     dest->append(src.data(), src.size());
398     return;
399   }
400 
401   size_t cur_dest_len = dest->size();
402   strings_internal::STLStringResizeUninitialized(dest,
403                                                  cur_dest_len + escaped_len);
404   char* append_ptr = &(*dest)[cur_dest_len];
405 
406   for (char c : src) {
407     size_t char_len = c_escaped_len[static_cast<unsigned char>(c)];
408     if (char_len == 1) {
409       *append_ptr++ = c;
410     } else if (char_len == 2) {
411       switch (c) {
412         case '\n':
413           *append_ptr++ = '\\';
414           *append_ptr++ = 'n';
415           break;
416         case '\r':
417           *append_ptr++ = '\\';
418           *append_ptr++ = 'r';
419           break;
420         case '\t':
421           *append_ptr++ = '\\';
422           *append_ptr++ = 't';
423           break;
424         case '\"':
425           *append_ptr++ = '\\';
426           *append_ptr++ = '\"';
427           break;
428         case '\'':
429           *append_ptr++ = '\\';
430           *append_ptr++ = '\'';
431           break;
432         case '\\':
433           *append_ptr++ = '\\';
434           *append_ptr++ = '\\';
435           break;
436       }
437     } else {
438       *append_ptr++ = '\\';
439       *append_ptr++ = '0' + static_cast<unsigned char>(c) / 64;
440       *append_ptr++ = '0' + (static_cast<unsigned char>(c) % 64) / 8;
441       *append_ptr++ = '0' + static_cast<unsigned char>(c) % 8;
442     }
443   }
444 }
445 
// Decodes the base64 text in src_param[0, szsrc).
//
//   src_param, szsrc  Input characters and their count.
//   dest, szdest      Output buffer and its capacity. If 'dest' is null no
//                     bytes are written; the input is only checked and the
//                     decoded length is counted.
//   unbase64          Lookup table indexed by input byte (as unsigned char).
//                     It yields the byte's 6-bit value, or a negative value
//                     for a byte that is not a data character.
//   len               On success, receives the number of decoded bytes.
//                     Left untouched on failure.
//
// ASCII whitespace in the input is skipped. After the data, only whitespace
// and pad characters ('=' or '.') may follow, and the number of pad
// characters must be either zero or exactly what the leftover data calls for
// (2 after a trailing group of 2 data characters, 1 after a group of 3).
//
// Returns false if the input contains any other non-data character, if a
// single data character is left over at the end, if the pad count is wrong,
// or if 'dest' is non-null and the decoded bytes do not fit in 'szdest'.
bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
                            size_t szdest, const signed char* unbase64,
                            size_t* len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced (or counted)
  int decode = 0;         // table value of the most recently read character
  int state = 0;          // data characters accumulated in the current group
  unsigned char ch = 0;   // most recently read input character
  unsigned int temp = 0;  // bit accumulator for the current group

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
  //
  // On a non-data character the macro records how many data characters of
  // the current group were already read (state = 4 - remain) and uses
  // 'break' to leave the enclosing while loop.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Same scan as above without any writes; destidx still counts the bytes
    // that would have been produced.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
678 
679 // The arrays below were generated by the following code
680 // #include <sys/time.h>
681 // #include <stdlib.h>
682 // #include <string.h>
683 // main()
684 // {
685 //   static const char Base64[] =
686 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
687 //   char* pos;
688 //   int idx, i, j;
689 //   printf("    ");
690 //   for (i = 0; i < 255; i += 8) {
691 //     for (j = i; j < i + 8; j++) {
692 //       pos = strchr(Base64, j);
693 //       if ((pos == nullptr) || (j == 0))
694 //         idx = -1;
695 //       else
696 //         idx = pos - Base64;
697 //       if (idx == -1)
698 //         printf(" %2d,     ", idx);
699 //       else
700 //         printf(" %2d/*%c*/,", idx, j);
701 //     }
702 //     printf("\n    ");
703 //   }
704 // }
705 //
706 // where the value of "Base64[]" was replaced by one of the base-64 conversion
707 // tables from the functions below.
708 /* clang-format off */
// Decoding table for the alphabet that uses '+' and '/' for the values 62 and
// 63. Indexed by byte value (256 entries): each entry is the 6-bit value the
// byte stands for, or -1 if the byte is not part of the alphabet.
constexpr signed char kUnBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
    07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
743 
// Decoding table for the web-safe alphabet, which uses '-' and '_' for the
// values 62 and 63. Indexed by byte value (256 entries): each entry is the
// 6-bit value the byte stands for, or -1 if the byte is not part of the
// alphabet.
constexpr signed char kUnWebSafeBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
    07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
778 /* clang-format on */
779 
// Web-safe Base64 alphabet in code order: the character at position i is the
// one that represents the 6-bit value i.  It differs from the standard
// alphabet only in its last two characters, '-' and '_'.
constexpr char kWebSafeBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"  // values 0-25
    "abcdefghijklmnopqrstuvwxyz"  // values 26-51
    "0123456789"                  // values 52-61
    "-_";                         // values 62-63
782 
783 template <typename String>
Base64UnescapeInternal(const char * src,size_t slen,String * dest,const signed char * unbase64)784 bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
785                             const signed char* unbase64) {
786   // Determine the size of the output string.  Base64 encodes every 3 bytes into
787   // 4 characters.  Any leftover chars are added directly for good measure.
788   const size_t dest_len = 3 * (slen / 4) + (slen % 4);
789 
790   strings_internal::STLStringResizeUninitialized(dest, dest_len);
791 
792   // We are getting the destination buffer by getting the beginning of the
793   // string and converting it into a char *.
794   size_t len;
795   const bool ok =
796       Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
797   if (!ok) {
798     dest->clear();
799     return false;
800   }
801 
802   // could be shorter if there was padding
803   assert(len <= dest_len);
804   dest->erase(len);
805 
806   return true;
807 }
808 
809 /* clang-format off */
// Lenient hex-digit lookup table, indexed by byte value (16 entries per row).
// '0'..'9' give 0..9, 'A'..'F' and 'a'..'f' give 10..15, and every other byte
// gives 0 rather than an error marker.
constexpr char kHexValueLenient[256] = {
    // 0x00 - 0x2F: no hex digits.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x30 - 0x3F: '0'..'9', then six non-digits.
    0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0, 0, 0, 0, 0, 0,
    // 0x40 - 0x4F: '@', then 'A'..'F', then 'G'..'O'.
    0, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x50 - 0x5F: no hex digits.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x60 - 0x6F: '`', then 'a'..'f', then 'g'..'o'.
    0, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x70 - 0xFF: no hex digits.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
828 
829 /* clang-format on */
830 
831 // This is a templated function so that T can be either a char*
832 // or a string.  This works because we use the [] operator to access
833 // individual characters at a time.
834 template <typename T>
HexStringToBytesInternal(const char * from,T to,size_t num)835 void HexStringToBytesInternal(const char* from, T to, size_t num) {
836   for (size_t i = 0; i < num; i++) {
837     to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
838             (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
839   }
840 }
841 
842 // This is a templated function so that T can be either a char* or a
843 // std::string.
844 template <typename T>
BytesToHexStringInternal(const unsigned char * src,T dest,size_t num)845 void BytesToHexStringInternal(const unsigned char* src, T dest, size_t num) {
846   auto dest_ptr = &dest[0];
847   for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
848     const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
849     std::copy(hex_p, hex_p + 2, dest_ptr);
850   }
851 }
852 
853 }  // namespace
854 
855 // ----------------------------------------------------------------------
856 // CUnescape()
857 //
858 // See CUnescapeInternal() for implementation details.
859 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,std::string * dest,std::string * error)860 bool CUnescape(absl::string_view source, std::string* dest,
861                std::string* error) {
862   return CUnescapeInternal(source, kUnescapeNulls, dest, error);
863 }
864 
CEscape(absl::string_view src)865 std::string CEscape(absl::string_view src) {
866   std::string dest;
867   CEscapeAndAppendInternal(src, &dest);
868   return dest;
869 }
870 
CHexEscape(absl::string_view src)871 std::string CHexEscape(absl::string_view src) {
872   return CEscapeInternal(src, true, false);
873 }
874 
Utf8SafeCEscape(absl::string_view src)875 std::string Utf8SafeCEscape(absl::string_view src) {
876   return CEscapeInternal(src, false, true);
877 }
878 
Utf8SafeCHexEscape(absl::string_view src)879 std::string Utf8SafeCHexEscape(absl::string_view src) {
880   return CEscapeInternal(src, true, true);
881 }
882 
883 // ----------------------------------------------------------------------
884 // Base64Unescape() - base64 decoder
885 // Base64Escape() - base64 encoder
886 // WebSafeBase64Unescape() - Google's variation of base64 decoder
887 // WebSafeBase64Escape() - Google's variation of base64 encoder
888 //
// See
// https://datatracker.ietf.org/doc/html/rfc2045 for the formal description;
// what matters here is the following:
892 //   Take the encoded stuff in groups of 4 characters and turn each
893 //   character into a code 0 to 63 thus:
894 //           A-Z map to 0 to 25
895 //           a-z map to 26 to 51
896 //           0-9 map to 52 to 61
897 //           +(- for WebSafe) maps to 62
898 //           /(_ for WebSafe) maps to 63
899 //   There will be four numbers, all less than 64 which can be represented
900 //   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
901 //   Arrange the 6 digit binary numbers into three bytes as such:
902 //   aaaaaabb bbbbcccc ccdddddd
903 //   Equals signs (one or two) are used at the end of the encoded block to
904 //   indicate that the text was not an integer multiple of three bytes long.
905 // ----------------------------------------------------------------------
906 
Base64Unescape(absl::string_view src,std::string * dest)907 bool Base64Unescape(absl::string_view src, std::string* dest) {
908   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
909 }
910 
WebSafeBase64Unescape(absl::string_view src,std::string * dest)911 bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
912   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
913 }
914 
Base64Escape(absl::string_view src,std::string * dest)915 void Base64Escape(absl::string_view src, std::string* dest) {
916   strings_internal::Base64EscapeInternal(
917       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
918       true, strings_internal::kBase64Chars);
919 }
920 
WebSafeBase64Escape(absl::string_view src,std::string * dest)921 void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
922   strings_internal::Base64EscapeInternal(
923       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
924       false, kWebSafeBase64Chars);
925 }
926 
Base64Escape(absl::string_view src)927 std::string Base64Escape(absl::string_view src) {
928   std::string dest;
929   strings_internal::Base64EscapeInternal(
930       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
931       true, strings_internal::kBase64Chars);
932   return dest;
933 }
934 
WebSafeBase64Escape(absl::string_view src)935 std::string WebSafeBase64Escape(absl::string_view src) {
936   std::string dest;
937   strings_internal::Base64EscapeInternal(
938       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
939       false, kWebSafeBase64Chars);
940   return dest;
941 }
942 
HexStringToBytes(absl::string_view from)943 std::string HexStringToBytes(absl::string_view from) {
944   std::string result;
945   const auto num = from.size() / 2;
946   strings_internal::STLStringResizeUninitialized(&result, num);
947   absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
948   return result;
949 }
950 
BytesToHexString(absl::string_view from)951 std::string BytesToHexString(absl::string_view from) {
952   std::string result;
953   strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
954   absl::BytesToHexStringInternal<std::string&>(
955       reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
956   return result;
957 }
958 
959 ABSL_NAMESPACE_END
960 }  // namespace absl
961