xref: /aosp_15_r20/external/webrtc/third_party/abseil-cpp/absl/strings/escaping.cc (revision d9f758449e529ab9291ac668be2861e7a55c2422)
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/escaping.h"
16 
17 #include <algorithm>
18 #include <cassert>
19 #include <cstdint>
20 #include <cstring>
21 #include <iterator>
22 #include <limits>
23 #include <string>
24 
25 #include "absl/base/internal/endian.h"
26 #include "absl/base/internal/raw_logging.h"
27 #include "absl/base/internal/unaligned_access.h"
28 #include "absl/strings/internal/char_map.h"
29 #include "absl/strings/internal/escaping.h"
30 #include "absl/strings/internal/resize_uninitialized.h"
31 #include "absl/strings/internal/utf8.h"
32 #include "absl/strings/str_cat.h"
33 #include "absl/strings/str_join.h"
34 #include "absl/strings/string_view.h"
35 
36 namespace absl {
37 ABSL_NAMESPACE_BEGIN
38 namespace {
39 
// Value for the leave_nulls_escaped argument of CUnescapeInternal().
// Passing false means an escape sequence whose value is zero (for example
// "\0" or "\x00") is decoded into a real NUL byte instead of being copied
// to the output as the original escape sequence.
constexpr bool kUnescapeNulls = false;
42 
// Returns true iff `c` is one of the ASCII octal digits '0' through '7'.
inline bool is_octal_digit(char c) {
  return c >= '0' && c <= '7';
}
44 
hex_digit_to_int(char c)45 inline unsigned int hex_digit_to_int(char c) {
46   static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
47                 "Character set must be ASCII.");
48   assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
49   unsigned int x = static_cast<unsigned char>(c);
50   if (x > '9') {
51     x += 9;
52   }
53   return x & 0xf;
54 }
55 
IsSurrogate(char32_t c,absl::string_view src,std::string * error)56 inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
57   if (c >= 0xD800 && c <= 0xDFFF) {
58     if (error) {
59       *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
60                             src);
61     }
62     return true;
63   }
64   return false;
65 }
66 
67 // ----------------------------------------------------------------------
68 // CUnescapeInternal()
69 //    Implements both CUnescape() and CUnescapeForNullTerminatedString().
70 //
71 //    Unescapes C escape sequences and is the reverse of CEscape().
72 //
73 //    If 'source' is valid, stores the unescaped string and its size in
74 //    'dest' and 'dest_len' respectively, and returns true. Otherwise
75 //    returns false and optionally stores the error description in
76 //    'error'. Set 'error' to nullptr to disable error reporting.
77 //
78 //    'dest' should point to a buffer that is at least as big as 'source'.
79 //    'source' and 'dest' may be the same.
80 //
81 //     NOTE: any changes to this function must also be reflected in the older
82 //     UnescapeCEscapeSequences().
83 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,char * dest,ptrdiff_t * dest_len,std::string * error)84 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
85                        char* dest, ptrdiff_t* dest_len, std::string* error) {
86   char* d = dest;
87   const char* p = source.data();
88   const char* end = p + source.size();
89   const char* last_byte = end - 1;
90 
91   // Small optimization for case where source = dest and there's no escaping
92   while (p == d && p < end && *p != '\\') p++, d++;
93 
94   while (p < end) {
95     if (*p != '\\') {
96       *d++ = *p++;
97     } else {
98       if (++p > last_byte) {  // skip past the '\\'
99         if (error) *error = "String cannot end with \\";
100         return false;
101       }
102       switch (*p) {
103         case 'a':  *d++ = '\a';  break;
104         case 'b':  *d++ = '\b';  break;
105         case 'f':  *d++ = '\f';  break;
106         case 'n':  *d++ = '\n';  break;
107         case 'r':  *d++ = '\r';  break;
108         case 't':  *d++ = '\t';  break;
109         case 'v':  *d++ = '\v';  break;
110         case '\\': *d++ = '\\';  break;
111         case '?':  *d++ = '\?';  break;    // \?  Who knew?
112         case '\'': *d++ = '\'';  break;
113         case '"':  *d++ = '\"';  break;
114         case '0':
115         case '1':
116         case '2':
117         case '3':
118         case '4':
119         case '5':
120         case '6':
121         case '7': {
122           // octal digit: 1 to 3 digits
123           const char* octal_start = p;
124           unsigned int ch = static_cast<unsigned int>(*p - '0');  // digit 1
125           if (p < last_byte && is_octal_digit(p[1]))
126             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 2
127           if (p < last_byte && is_octal_digit(p[1]))
128             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 3
129           if (ch > 0xff) {
130             if (error) {
131               *error = "Value of \\" +
132                        std::string(octal_start,
133                                    static_cast<size_t>(p + 1 - octal_start)) +
134                        " exceeds 0xff";
135             }
136             return false;
137           }
138           if ((ch == 0) && leave_nulls_escaped) {
139             // Copy the escape sequence for the null character
140             const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
141             *d++ = '\\';
142             memmove(d, octal_start, octal_size);
143             d += octal_size;
144             break;
145           }
146           *d++ = static_cast<char>(ch);
147           break;
148         }
149         case 'x':
150         case 'X': {
151           if (p >= last_byte) {
152             if (error) *error = "String cannot end with \\x";
153             return false;
154           } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
155             if (error) *error = "\\x cannot be followed by a non-hex digit";
156             return false;
157           }
158           unsigned int ch = 0;
159           const char* hex_start = p;
160           while (p < last_byte &&
161                  absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
162             // Arbitrarily many hex digits
163             ch = (ch << 4) + hex_digit_to_int(*++p);
164           if (ch > 0xFF) {
165             if (error) {
166               *error = "Value of \\" +
167                        std::string(hex_start,
168                                    static_cast<size_t>(p + 1 - hex_start)) +
169                        " exceeds 0xff";
170             }
171             return false;
172           }
173           if ((ch == 0) && leave_nulls_escaped) {
174             // Copy the escape sequence for the null character
175             const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
176             *d++ = '\\';
177             memmove(d, hex_start, hex_size);
178             d += hex_size;
179             break;
180           }
181           *d++ = static_cast<char>(ch);
182           break;
183         }
184         case 'u': {
185           // \uhhhh => convert 4 hex digits to UTF-8
186           char32_t rune = 0;
187           const char* hex_start = p;
188           if (p + 4 >= end) {
189             if (error) {
190               *error = "\\u must be followed by 4 hex digits: \\" +
191                        std::string(hex_start,
192                                    static_cast<size_t>(p + 1 - hex_start));
193             }
194             return false;
195           }
196           for (int i = 0; i < 4; ++i) {
197             // Look one char ahead.
198             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
199               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
200             } else {
201               if (error) {
202                 *error = "\\u must be followed by 4 hex digits: \\" +
203                          std::string(hex_start,
204                                      static_cast<size_t>(p + 1 - hex_start));
205               }
206               return false;
207             }
208           }
209           if ((rune == 0) && leave_nulls_escaped) {
210             // Copy the escape sequence for the null character
211             *d++ = '\\';
212             memmove(d, hex_start, 5);  // u0000
213             d += 5;
214             break;
215           }
216           if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
217             return false;
218           }
219           d += strings_internal::EncodeUTF8Char(d, rune);
220           break;
221         }
222         case 'U': {
223           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
224           char32_t rune = 0;
225           const char* hex_start = p;
226           if (p + 8 >= end) {
227             if (error) {
228               *error = "\\U must be followed by 8 hex digits: \\" +
229                        std::string(hex_start,
230                                    static_cast<size_t>(p + 1 - hex_start));
231             }
232             return false;
233           }
234           for (int i = 0; i < 8; ++i) {
235             // Look one char ahead.
236             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
237               // Don't change rune until we're sure this
238               // is within the Unicode limit, but do advance p.
239               uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
240               if (newrune > 0x10FFFF) {
241                 if (error) {
242                   *error = "Value of \\" +
243                            std::string(hex_start,
244                                        static_cast<size_t>(p + 1 - hex_start)) +
245                            " exceeds Unicode limit (0x10FFFF)";
246                 }
247                 return false;
248               } else {
249                 rune = newrune;
250               }
251             } else {
252               if (error) {
253                 *error = "\\U must be followed by 8 hex digits: \\" +
254                          std::string(hex_start,
255                                      static_cast<size_t>(p + 1 - hex_start));
256               }
257               return false;
258             }
259           }
260           if ((rune == 0) && leave_nulls_escaped) {
261             // Copy the escape sequence for the null character
262             *d++ = '\\';
263             memmove(d, hex_start, 9);  // U00000000
264             d += 9;
265             break;
266           }
267           if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
268             return false;
269           }
270           d += strings_internal::EncodeUTF8Char(d, rune);
271           break;
272         }
273         default: {
274           if (error) *error = std::string("Unknown escape sequence: \\") + *p;
275           return false;
276         }
277       }
278       p++;                                 // read past letter we escaped
279     }
280   }
281   *dest_len = d - dest;
282   return true;
283 }
284 
285 // ----------------------------------------------------------------------
286 // CUnescapeInternal()
287 //
288 //    Same as above but uses a std::string for output. 'source' and 'dest'
289 //    may be the same.
290 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,std::string * dest,std::string * error)291 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
292                        std::string* dest, std::string* error) {
293   strings_internal::STLStringResizeUninitialized(dest, source.size());
294 
295   ptrdiff_t dest_size;
296   if (!CUnescapeInternal(source,
297                          leave_nulls_escaped,
298                          &(*dest)[0],
299                          &dest_size,
300                          error)) {
301     return false;
302   }
303   dest->erase(static_cast<size_t>(dest_size));
304   return true;
305 }
306 
307 // ----------------------------------------------------------------------
308 // CEscape()
309 // CHexEscape()
310 // Utf8SafeCEscape()
311 // Utf8SafeCHexEscape()
312 //    Escapes 'src' using C-style escape sequences.  This is useful for
313 //    preparing query flags.  The 'Hex' version uses hexadecimal rather than
314 //    octal sequences.  The 'Utf8Safe' version does not touch UTF-8 bytes.
315 //
316 //    Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
317 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)318 std::string CEscapeInternal(absl::string_view src, bool use_hex,
319                             bool utf8_safe) {
320   std::string dest;
321   bool last_hex_escape = false;  // true if last output char was \xNN.
322 
323   for (char c : src) {
324     bool is_hex_escape = false;
325     switch (c) {
326       case '\n': dest.append("\\" "n"); break;
327       case '\r': dest.append("\\" "r"); break;
328       case '\t': dest.append("\\" "t"); break;
329       case '\"': dest.append("\\" "\""); break;
330       case '\'': dest.append("\\" "'"); break;
331       case '\\': dest.append("\\" "\\"); break;
332       default: {
333         // Note that if we emit \xNN and the src character after that is a hex
334         // digit then that digit must be escaped too to prevent it being
335         // interpreted as part of the character code by C.
336         const unsigned char uc = static_cast<unsigned char>(c);
337         if ((!utf8_safe || uc < 0x80) &&
338             (!absl::ascii_isprint(uc) ||
339              (last_hex_escape && absl::ascii_isxdigit(uc)))) {
340           if (use_hex) {
341             dest.append("\\" "x");
342             dest.push_back(numbers_internal::kHexChar[uc / 16]);
343             dest.push_back(numbers_internal::kHexChar[uc % 16]);
344             is_hex_escape = true;
345           } else {
346             dest.append("\\");
347             dest.push_back(numbers_internal::kHexChar[uc / 64]);
348             dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
349             dest.push_back(numbers_internal::kHexChar[uc % 8]);
350           }
351         } else {
352           dest.push_back(c);
353           break;
354         }
355       }
356     }
357     last_hex_escape = is_hex_escape;
358   }
359 
360   return dest;
361 }
362 
363 /* clang-format off */
// Number of output characters produced when a byte is C-escaped with octal
// sequences, indexed by the byte's unsigned value:
//   1 - the byte is copied unchanged,
//   2 - the byte becomes a backslash followed by one character
//       (\t, \n, \r, \", \', \\),
//   4 - the byte becomes a backslash followed by three octal digits.
constexpr unsigned char c_escaped_len[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
382 /* clang-format on */
383 
384 // Calculates the length of the C-style escaped version of 'src'.
385 // Assumes that non-printable characters are escaped using octal sequences, and
386 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)387 inline size_t CEscapedLength(absl::string_view src) {
388   size_t escaped_len = 0;
389   for (char c : src)
390     escaped_len += c_escaped_len[static_cast<unsigned char>(c)];
391   return escaped_len;
392 }
393 
CEscapeAndAppendInternal(absl::string_view src,std::string * dest)394 void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
395   size_t escaped_len = CEscapedLength(src);
396   if (escaped_len == src.size()) {
397     dest->append(src.data(), src.size());
398     return;
399   }
400 
401   size_t cur_dest_len = dest->size();
402   strings_internal::STLStringResizeUninitialized(dest,
403                                                  cur_dest_len + escaped_len);
404   char* append_ptr = &(*dest)[cur_dest_len];
405 
406   for (char c : src) {
407     size_t char_len = c_escaped_len[static_cast<unsigned char>(c)];
408     if (char_len == 1) {
409       *append_ptr++ = c;
410     } else if (char_len == 2) {
411       switch (c) {
412         case '\n':
413           *append_ptr++ = '\\';
414           *append_ptr++ = 'n';
415           break;
416         case '\r':
417           *append_ptr++ = '\\';
418           *append_ptr++ = 'r';
419           break;
420         case '\t':
421           *append_ptr++ = '\\';
422           *append_ptr++ = 't';
423           break;
424         case '\"':
425           *append_ptr++ = '\\';
426           *append_ptr++ = '\"';
427           break;
428         case '\'':
429           *append_ptr++ = '\\';
430           *append_ptr++ = '\'';
431           break;
432         case '\\':
433           *append_ptr++ = '\\';
434           *append_ptr++ = '\\';
435           break;
436       }
437     } else {
438       *append_ptr++ = '\\';
439       *append_ptr++ = '0' + static_cast<unsigned char>(c) / 64;
440       *append_ptr++ = '0' + (static_cast<unsigned char>(c) % 64) / 8;
441       *append_ptr++ = '0' + static_cast<unsigned char>(c) % 8;
442     }
443   }
444 }
445 
// Decodes base64 text from a raw buffer.
//
//   src_param / szsrc - input bytes and their count.
//   dest / szdest     - output buffer and its capacity.  When `dest` is null
//                       nothing is written and the input is only validated;
//                       the decoded length is still counted.
//   unbase64          - decode table indexed by input byte value; entries are
//                       the 6-bit value of the byte, or negative for bytes
//                       that are not data characters.
//   len               - on success, receives the number of decoded bytes.
//
// Returns false when the input contains an illegal character, when `dest` is
// too small, when a single data character is left over at the end, or when
// the number of pad characters is neither zero nor the expected count.
// ASCII whitespace is skipped, and both '=' and '.' are accepted as padding.
bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
                            size_t szdest, const signed char* unbase64,
                            size_t* len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced (or counted)
  int decode = 0;         // table lookup result for the last byte read
  int state = 0;          // data characters accumulated in the current quad
  unsigned char ch = 0;   // last input byte read
  unsigned int temp = 0;  // bit accumulator for the current quad

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
  //
  // Note that the `break` inside the macro leaves the enclosing `while`
  // loop, with `state` recording how many data characters of the current
  // quad had already been read.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Validation-only copy of the loop above: the same input is consumed and
    // the output length is counted, but nothing is stored.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  // *len is written only on success.
  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
678 
679 // The arrays below were generated by the following code
680 // #include <sys/time.h>
681 // #include <stdlib.h>
682 // #include <string.h>
683 // main()
684 // {
685 //   static const char Base64[] =
686 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
687 //   char* pos;
688 //   int idx, i, j;
689 //   printf("    ");
690 //   for (i = 0; i < 255; i += 8) {
691 //     for (j = i; j < i + 8; j++) {
692 //       pos = strchr(Base64, j);
693 //       if ((pos == nullptr) || (j == 0))
694 //         idx = -1;
695 //       else
696 //         idx = pos - Base64;
697 //       if (idx == -1)
698 //         printf(" %2d,     ", idx);
699 //       else
700 //         printf(" %2d/*%c*/,", idx, j);
701 //     }
702 //     printf("\n    ");
703 //   }
704 // }
705 //
706 // where the value of "Base64[]" was replaced by one of the base-64 conversion
707 // tables from the functions below.
708 /* clang-format off */
// Decode table for the base64 alphabet "A-Z a-z 0-9 + /", indexed by input
// byte value (256 entries).  Each entry is the 6-bit value of that character,
// or -1 for every byte that is not part of the alphabet.
constexpr signed char kUnBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
    07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
743 
// Decode table for the web-safe base64 alphabet "A-Z a-z 0-9 - _", indexed
// by input byte value (256 entries).  It differs from kUnBase64 only in that
// '-' maps to 62 and '_' maps to 63, while '+' and '/' are rejected (-1).
constexpr signed char kUnWebSafeBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
    07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
778 /* clang-format on */
779 
// The 64-character web-safe base64 alphabet, in value order: upper-case
// letters, lower-case letters, digits, then '-' and '_'.
constexpr char kWebSafeBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-_";
782 
783 template <typename String>
Base64UnescapeInternal(const char * src,size_t slen,String * dest,const signed char * unbase64)784 bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
785                             const signed char* unbase64) {
786   // Determine the size of the output string.  Base64 encodes every 3 bytes into
787   // 4 characters.  any leftover chars are added directly for good measure.
788   // This is documented in the base64 RFC:
789   // https://datatracker.ietf.org/doc/html/rfc3548
790   const size_t dest_len = 3 * (slen / 4) + (slen % 4);
791 
792   strings_internal::STLStringResizeUninitialized(dest, dest_len);
793 
794   // We are getting the destination buffer by getting the beginning of the
795   // string and converting it into a char *.
796   size_t len;
797   const bool ok =
798       Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
799   if (!ok) {
800     dest->clear();
801     return false;
802   }
803 
804   // could be shorter if there was padding
805   assert(len <= dest_len);
806   dest->erase(len);
807 
808   return true;
809 }
810 
811 /* clang-format off */
// Maps a byte value to the numeric value of the hex digit it represents.
// Lenient: every byte that is not one of '0'-'9', 'A'-'F' or 'a'-'f' maps to 0
// rather than to an error marker.  One row per 16 byte values.
constexpr char kHexValueLenient[256] = {
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00-0x0F
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10-0x1F
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20-0x2F
    0, 1,  2,  3,  4,  5,  6,  7, 8, 9, 0, 0, 0, 0, 0, 0,  // 0x30: '0'..'9'
    0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40: 'A'..'F'
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50-0x5F
    0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60: 'a'..'f'
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70-0x7F
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80-0x8F
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90-0x9F
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0-0xAF
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0-0xBF
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0-0xCF
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0-0xDF
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0-0xEF
    0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0-0xFF
};
830 
831 /* clang-format on */
832 
833 // This is a templated function so that T can be either a char*
834 // or a string.  This works because we use the [] operator to access
835 // individual characters at a time.
836 template <typename T>
HexStringToBytesInternal(const char * from,T to,size_t num)837 void HexStringToBytesInternal(const char* from, T to, size_t num) {
838   for (size_t i = 0; i < num; i++) {
839     to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
840             (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
841   }
842 }
843 
844 // This is a templated function so that T can be either a char* or a
845 // std::string.
846 template <typename T>
BytesToHexStringInternal(const unsigned char * src,T dest,size_t num)847 void BytesToHexStringInternal(const unsigned char* src, T dest, size_t num) {
848   auto dest_ptr = &dest[0];
849   for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
850     const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
851     std::copy(hex_p, hex_p + 2, dest_ptr);
852   }
853 }
854 
855 }  // namespace
856 
857 // ----------------------------------------------------------------------
858 // CUnescape()
859 //
860 // See CUnescapeInternal() for implementation details.
861 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,std::string * dest,std::string * error)862 bool CUnescape(absl::string_view source, std::string* dest,
863                std::string* error) {
864   return CUnescapeInternal(source, kUnescapeNulls, dest, error);
865 }
866 
CEscape(absl::string_view src)867 std::string CEscape(absl::string_view src) {
868   std::string dest;
869   CEscapeAndAppendInternal(src, &dest);
870   return dest;
871 }
872 
CHexEscape(absl::string_view src)873 std::string CHexEscape(absl::string_view src) {
874   return CEscapeInternal(src, true, false);
875 }
876 
Utf8SafeCEscape(absl::string_view src)877 std::string Utf8SafeCEscape(absl::string_view src) {
878   return CEscapeInternal(src, false, true);
879 }
880 
Utf8SafeCHexEscape(absl::string_view src)881 std::string Utf8SafeCHexEscape(absl::string_view src) {
882   return CEscapeInternal(src, true, true);
883 }
884 
885 // ----------------------------------------------------------------------
886 // Base64Unescape() - base64 decoder
887 // Base64Escape() - base64 encoder
888 // WebSafeBase64Unescape() - Google's variation of base64 decoder
889 // WebSafeBase64Escape() - Google's variation of base64 encoder
890 //
891 // Check out
892 // https://datatracker.ietf.org/doc/html/rfc2045 for formal description, but
893 // what we care about is that...
894 //   Take the encoded stuff in groups of 4 characters and turn each
895 //   character into a code 0 to 63 thus:
896 //           A-Z map to 0 to 25
897 //           a-z map to 26 to 51
898 //           0-9 map to 52 to 61
899 //           +(- for WebSafe) maps to 62
900 //           /(_ for WebSafe) maps to 63
901 //   There will be four numbers, all less than 64 which can be represented
902 //   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
903 //   Arrange the 6 digit binary numbers into three bytes as such:
904 //   aaaaaabb bbbbcccc ccdddddd
905 //   Equals signs (one or two) are used at the end of the encoded block to
906 //   indicate that the text was not an integer multiple of three bytes long.
907 // ----------------------------------------------------------------------
908 
Base64Unescape(absl::string_view src,std::string * dest)909 bool Base64Unescape(absl::string_view src, std::string* dest) {
910   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
911 }
912 
WebSafeBase64Unescape(absl::string_view src,std::string * dest)913 bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
914   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
915 }
916 
Base64Escape(absl::string_view src,std::string * dest)917 void Base64Escape(absl::string_view src, std::string* dest) {
918   strings_internal::Base64EscapeInternal(
919       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
920       true, strings_internal::kBase64Chars);
921 }
922 
WebSafeBase64Escape(absl::string_view src,std::string * dest)923 void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
924   strings_internal::Base64EscapeInternal(
925       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
926       false, kWebSafeBase64Chars);
927 }
928 
Base64Escape(absl::string_view src)929 std::string Base64Escape(absl::string_view src) {
930   std::string dest;
931   strings_internal::Base64EscapeInternal(
932       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
933       true, strings_internal::kBase64Chars);
934   return dest;
935 }
936 
WebSafeBase64Escape(absl::string_view src)937 std::string WebSafeBase64Escape(absl::string_view src) {
938   std::string dest;
939   strings_internal::Base64EscapeInternal(
940       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
941       false, kWebSafeBase64Chars);
942   return dest;
943 }
944 
HexStringToBytes(absl::string_view from)945 std::string HexStringToBytes(absl::string_view from) {
946   std::string result;
947   const auto num = from.size() / 2;
948   strings_internal::STLStringResizeUninitialized(&result, num);
949   absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
950   return result;
951 }
952 
BytesToHexString(absl::string_view from)953 std::string BytesToHexString(absl::string_view from) {
954   std::string result;
955   strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
956   absl::BytesToHexStringInternal<std::string&>(
957       reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
958   return result;
959 }
960 
961 ABSL_NAMESPACE_END
962 }  // namespace absl
963