xref: /aosp_15_r20/external/cronet/base/strings/escape.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1*6777b538SAndroid Build Coastguard Worker // Copyright 2020 The Chromium Authors
2*6777b538SAndroid Build Coastguard Worker // Use of this source code is governed by a BSD-style license that can be
3*6777b538SAndroid Build Coastguard Worker // found in the LICENSE file.
4*6777b538SAndroid Build Coastguard Worker 
5*6777b538SAndroid Build Coastguard Worker #include "base/strings/escape.h"
6*6777b538SAndroid Build Coastguard Worker 
7*6777b538SAndroid Build Coastguard Worker #include <ostream>
8*6777b538SAndroid Build Coastguard Worker 
9*6777b538SAndroid Build Coastguard Worker #include "base/check_op.h"
10*6777b538SAndroid Build Coastguard Worker #include "base/feature_list.h"
11*6777b538SAndroid Build Coastguard Worker #include "base/features.h"
12*6777b538SAndroid Build Coastguard Worker #include "base/strings/string_number_conversions.h"
13*6777b538SAndroid Build Coastguard Worker #include "base/strings/string_piece.h"
14*6777b538SAndroid Build Coastguard Worker #include "base/strings/string_util.h"
15*6777b538SAndroid Build Coastguard Worker #include "base/strings/utf_string_conversion_utils.h"
16*6777b538SAndroid Build Coastguard Worker #include "base/strings/utf_string_conversions.h"
17*6777b538SAndroid Build Coastguard Worker #include "base/third_party/icu/icu_utf.h"
18*6777b538SAndroid Build Coastguard Worker 
19*6777b538SAndroid Build Coastguard Worker namespace base {
20*6777b538SAndroid Build Coastguard Worker 
21*6777b538SAndroid Build Coastguard Worker namespace {
22*6777b538SAndroid Build Coastguard Worker 
// A fast bit-vector set over byte values (0-255).
//
// Membership is stored as one bit per byte value in eight 32-bit words: byte
// |c| belongs to the set when bit (c & 31) of map[c >> 5] is set.
struct Charmap {
  // Returns true if |c| is a member of the set.
  bool Contains(unsigned char c) const {
    // Build the mask from an unsigned one so that selecting bit 31 never
    // shifts into the sign bit of a signed int.
    const uint32_t bit = uint32_t{1} << (c & 31);
    return (map[c >> 5] & bit) != 0;
  }

  uint32_t map[8];
};
34*6777b538SAndroid Build Coastguard Worker 
35*6777b538SAndroid Build Coastguard Worker // Given text to escape and a Charmap defining which values to escape,
36*6777b538SAndroid Build Coastguard Worker // return an escaped string.  If use_plus is true, spaces are converted
37*6777b538SAndroid Build Coastguard Worker // to +, otherwise, if spaces are in the charmap, they are converted to
38*6777b538SAndroid Build Coastguard Worker // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
39*6777b538SAndroid Build Coastguard Worker // '%' is in the charmap, it is converted to %25.
Escape(StringPiece text,const Charmap & charmap,bool use_plus,bool keep_escaped=false)40*6777b538SAndroid Build Coastguard Worker std::string Escape(StringPiece text,
41*6777b538SAndroid Build Coastguard Worker                    const Charmap& charmap,
42*6777b538SAndroid Build Coastguard Worker                    bool use_plus,
43*6777b538SAndroid Build Coastguard Worker                    bool keep_escaped = false) {
44*6777b538SAndroid Build Coastguard Worker   std::string escaped;
45*6777b538SAndroid Build Coastguard Worker   escaped.reserve(text.length() * 3);
46*6777b538SAndroid Build Coastguard Worker   for (size_t i = 0; i < text.length(); ++i) {
47*6777b538SAndroid Build Coastguard Worker     unsigned char c = static_cast<unsigned char>(text[i]);
48*6777b538SAndroid Build Coastguard Worker     if (use_plus && ' ' == c) {
49*6777b538SAndroid Build Coastguard Worker       escaped.push_back('+');
50*6777b538SAndroid Build Coastguard Worker     } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
51*6777b538SAndroid Build Coastguard Worker                IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) {
52*6777b538SAndroid Build Coastguard Worker       escaped.push_back('%');
53*6777b538SAndroid Build Coastguard Worker     } else if (charmap.Contains(c)) {
54*6777b538SAndroid Build Coastguard Worker       escaped.push_back('%');
55*6777b538SAndroid Build Coastguard Worker       AppendHexEncodedByte(c, escaped);
56*6777b538SAndroid Build Coastguard Worker     } else {
57*6777b538SAndroid Build Coastguard Worker       escaped.push_back(static_cast<char>(c));
58*6777b538SAndroid Build Coastguard Worker     }
59*6777b538SAndroid Build Coastguard Worker   }
60*6777b538SAndroid Build Coastguard Worker   return escaped;
61*6777b538SAndroid Build Coastguard Worker }
62*6777b538SAndroid Build Coastguard Worker 
// Appends |c| to |output| in a form that will not be mistaken for HTML: the
// characters < > & " ' are replaced by a character reference, and every other
// character is appended unchanged.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  const char* entity = nullptr;
  switch (c) {
    case '<':
      entity = "&lt;";
      break;
    case '>':
      entity = "&gt;";
      break;
    case '&':
      entity = "&amp;";
      break;
    case '"':
      entity = "&quot;";
      break;
    case '\'':
      entity = "&#39;";
      break;
    default:
      output->push_back(c);
      return;
  }
  // The iterator-range append converts each char of the replacement to the
  // output string's own character type.
  output->append(entity, entity + std::char_traits<char>::length(entity));
}
82*6777b538SAndroid Build Coastguard Worker 
// Returns a copy of |input| that will not be interpreted as HTML, built by
// passing every character through AppendEscapedCharForHTMLImpl().
template <typename T, typename CharT = typename T::value_type>
std::basic_string<CharT> EscapeForHTMLImpl(T input) {
  std::basic_string<CharT> escaped;
  // Size the buffer for the case where no character needs escaping.
  escaped.reserve(input.size());

  for (auto it = input.begin(); it != input.end(); ++it) {
    AppendEscapedCharForHTMLImpl(*it, &escaped);
  }

  return escaped;
}
95*6777b538SAndroid Build Coastguard Worker 
96*6777b538SAndroid Build Coastguard Worker // Everything except alphanumerics and -._~
97*6777b538SAndroid Build Coastguard Worker // See RFC 3986 for the list of unreserved characters.
98*6777b538SAndroid Build Coastguard Worker static const Charmap kUnreservedCharmap = {
99*6777b538SAndroid Build Coastguard Worker     {0xffffffffL, 0xfc009fffL, 0x78000001L, 0xb8000001L, 0xffffffffL,
100*6777b538SAndroid Build Coastguard Worker      0xffffffffL, 0xffffffffL, 0xffffffffL}};
101*6777b538SAndroid Build Coastguard Worker 
102*6777b538SAndroid Build Coastguard Worker // Everything except alphanumerics and !'()*-._~
103*6777b538SAndroid Build Coastguard Worker // See RFC 2396 for the list of reserved characters.
104*6777b538SAndroid Build Coastguard Worker static const Charmap kQueryCharmap = {{0xffffffffL, 0xfc00987dL, 0x78000001L,
105*6777b538SAndroid Build Coastguard Worker                                        0xb8000001L, 0xffffffffL, 0xffffffffL,
106*6777b538SAndroid Build Coastguard Worker                                        0xffffffffL, 0xffffffffL}};
107*6777b538SAndroid Build Coastguard Worker 
108*6777b538SAndroid Build Coastguard Worker // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
109*6777b538SAndroid Build Coastguard Worker static const Charmap kPathCharmap = {{0xffffffffL, 0xd400002dL, 0x78000000L,
110*6777b538SAndroid Build Coastguard Worker                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
111*6777b538SAndroid Build Coastguard Worker                                       0xffffffffL, 0xffffffffL}};
112*6777b538SAndroid Build Coastguard Worker 
113*6777b538SAndroid Build Coastguard Worker #if BUILDFLAG(IS_APPLE)
114*6777b538SAndroid Build Coastguard Worker // non-printable, non-7bit, and (including space)  "#%<>[\]^`{|}
115*6777b538SAndroid Build Coastguard Worker static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L,
116*6777b538SAndroid Build Coastguard Worker                                        0xb8000001L, 0xffffffffL, 0xffffffffL,
117*6777b538SAndroid Build Coastguard Worker                                        0xffffffffL, 0xffffffffL}};
118*6777b538SAndroid Build Coastguard Worker #endif  // BUILDFLAG(IS_APPLE)
119*6777b538SAndroid Build Coastguard Worker 
120*6777b538SAndroid Build Coastguard Worker // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
121*6777b538SAndroid Build Coastguard Worker static const Charmap kUrlEscape = {{0xffffffffL, 0xf80008fdL, 0x78000001L,
122*6777b538SAndroid Build Coastguard Worker                                     0xb8000001L, 0xffffffffL, 0xffffffffL,
123*6777b538SAndroid Build Coastguard Worker                                     0xffffffffL, 0xffffffffL}};
124*6777b538SAndroid Build Coastguard Worker 
125*6777b538SAndroid Build Coastguard Worker // non-7bit, as well as %.
126*6777b538SAndroid Build Coastguard Worker static const Charmap kNonASCIICharmapAndPercent = {
127*6777b538SAndroid Build Coastguard Worker     {0x00000000L, 0x00000020L, 0x00000000L, 0x00000000L, 0xffffffffL,
128*6777b538SAndroid Build Coastguard Worker      0xffffffffL, 0xffffffffL, 0xffffffffL}};
129*6777b538SAndroid Build Coastguard Worker 
130*6777b538SAndroid Build Coastguard Worker // non-7bit
131*6777b538SAndroid Build Coastguard Worker static const Charmap kNonASCIICharmap = {{0x00000000L, 0x00000000L, 0x00000000L,
132*6777b538SAndroid Build Coastguard Worker                                           0x00000000L, 0xffffffffL, 0xffffffffL,
133*6777b538SAndroid Build Coastguard Worker                                           0xffffffffL, 0xffffffffL}};
134*6777b538SAndroid Build Coastguard Worker 
135*6777b538SAndroid Build Coastguard Worker // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
136*6777b538SAndroid Build Coastguard Worker // !'()*-._~#[]
137*6777b538SAndroid Build Coastguard Worker static const Charmap kExternalHandlerCharmap = {
138*6777b538SAndroid Build Coastguard Worker     {0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, 0xffffffffL,
139*6777b538SAndroid Build Coastguard Worker      0xffffffffL, 0xffffffffL, 0xffffffffL}};
140*6777b538SAndroid Build Coastguard Worker 
// Indexed by ASCII code: an entry is nonzero when the corresponding character
// may be unescaped for normal URLs. The characters marked 0 are the ones that
// may change the parsing of a URL, so we don't want to unescape them
// sometimes. In many cases we won't want to unescape spaces, but that is
// controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing,
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. This means that unescaping will change the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The list of characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc.  Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not
// allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid url according to spec into an
// invalid one.
// clang-format off
const char kUrlUnescape[128] = {
    // 0x00 - 0x1F: NUL and the other control characters.
    0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
    0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
    // 0x20 - 0x2F:
    // ' ' !  "  #   $  %  &  '   (  )  *  +   ,  -  .  /
    0, 0, 1, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 1, 0, 0,
    // 0x30 - 0x3F:
    // 0  1  2  3   4  5  6  7   8  9  :  ;   <  =  >  ?
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 0, 0,  1, 0, 1, 0,
    // 0x40 - 0x4F:
    // @  A  B  C   D  E  F  G   H  I  J  K   L  M  N  O
    0, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
    // 0x50 - 0x5F:
    // P  Q  R  S   T  U  V  W   X  Y  Z  [   \  ]  ^  _
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 0,  0, 0, 0, 1,
    // 0x60 - 0x6F:
    // `  a  b  c   d  e  f  g   h  i  j  k   l  m  n  o
    0, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
    // 0x70 - 0x7F:
    // p  q  r  s   t  u  v  w   x  y  z  {   |  }  ~  DEL
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 0,  0, 0, 1, 0,
};
// clang-format on
180*6777b538SAndroid Build Coastguard Worker 
181*6777b538SAndroid Build Coastguard Worker // Attempts to unescape the sequence at |index| within |escaped_text|.  If
182*6777b538SAndroid Build Coastguard Worker // successful, sets |value| to the unescaped value.  Returns whether
183*6777b538SAndroid Build Coastguard Worker // unescaping succeeded.
UnescapeUnsignedByteAtIndex(StringPiece escaped_text,size_t index,unsigned char * value)184*6777b538SAndroid Build Coastguard Worker bool UnescapeUnsignedByteAtIndex(StringPiece escaped_text,
185*6777b538SAndroid Build Coastguard Worker                                  size_t index,
186*6777b538SAndroid Build Coastguard Worker                                  unsigned char* value) {
187*6777b538SAndroid Build Coastguard Worker   if ((index + 2) >= escaped_text.size())
188*6777b538SAndroid Build Coastguard Worker     return false;
189*6777b538SAndroid Build Coastguard Worker   if (escaped_text[index] != '%')
190*6777b538SAndroid Build Coastguard Worker     return false;
191*6777b538SAndroid Build Coastguard Worker   char most_sig_digit(escaped_text[index + 1]);
192*6777b538SAndroid Build Coastguard Worker   char least_sig_digit(escaped_text[index + 2]);
193*6777b538SAndroid Build Coastguard Worker   if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
194*6777b538SAndroid Build Coastguard Worker     *value = static_cast<unsigned char>(HexDigitToInt(most_sig_digit) * 16 +
195*6777b538SAndroid Build Coastguard Worker                                         HexDigitToInt(least_sig_digit));
196*6777b538SAndroid Build Coastguard Worker     return true;
197*6777b538SAndroid Build Coastguard Worker   }
198*6777b538SAndroid Build Coastguard Worker   return false;
199*6777b538SAndroid Build Coastguard Worker }
200*6777b538SAndroid Build Coastguard Worker 
201*6777b538SAndroid Build Coastguard Worker // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
202*6777b538SAndroid Build Coastguard Worker // the specified index. On success, returns true, sets |code_point_out| to be
203*6777b538SAndroid Build Coastguard Worker // the character's code point and |unescaped_out| to be the unescaped UTF-8
204*6777b538SAndroid Build Coastguard Worker // string. |unescaped_out| will always be 1/3rd the length of the substring of
205*6777b538SAndroid Build Coastguard Worker // |escaped_text| that corresponds to the unescaped character.
UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,size_t index,base_icu::UChar32 * code_point_out,std::string * unescaped_out)206*6777b538SAndroid Build Coastguard Worker bool UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,
207*6777b538SAndroid Build Coastguard Worker                                   size_t index,
208*6777b538SAndroid Build Coastguard Worker                                   base_icu::UChar32* code_point_out,
209*6777b538SAndroid Build Coastguard Worker                                   std::string* unescaped_out) {
210*6777b538SAndroid Build Coastguard Worker   DCHECK(unescaped_out->empty());
211*6777b538SAndroid Build Coastguard Worker 
212*6777b538SAndroid Build Coastguard Worker   unsigned char bytes[CBU8_MAX_LENGTH];
213*6777b538SAndroid Build Coastguard Worker   if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0]))
214*6777b538SAndroid Build Coastguard Worker     return false;
215*6777b538SAndroid Build Coastguard Worker 
216*6777b538SAndroid Build Coastguard Worker   size_t num_bytes = 1;
217*6777b538SAndroid Build Coastguard Worker 
218*6777b538SAndroid Build Coastguard Worker   // If this is a lead byte, need to collect trail bytes as well.
219*6777b538SAndroid Build Coastguard Worker   if (CBU8_IS_LEAD(bytes[0])) {
220*6777b538SAndroid Build Coastguard Worker     // Look for the last trail byte of the UTF-8 character.  Give up once
221*6777b538SAndroid Build Coastguard Worker     // reach max character length number of bytes, or hit an unescaped
222*6777b538SAndroid Build Coastguard Worker     // character. No need to check length of escaped_text, as
223*6777b538SAndroid Build Coastguard Worker     // UnescapeUnsignedByteAtIndex checks lengths.
224*6777b538SAndroid Build Coastguard Worker     while (num_bytes < std::size(bytes) &&
225*6777b538SAndroid Build Coastguard Worker            UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3,
226*6777b538SAndroid Build Coastguard Worker                                        &bytes[num_bytes]) &&
227*6777b538SAndroid Build Coastguard Worker            CBU8_IS_TRAIL(bytes[num_bytes])) {
228*6777b538SAndroid Build Coastguard Worker       ++num_bytes;
229*6777b538SAndroid Build Coastguard Worker     }
230*6777b538SAndroid Build Coastguard Worker   }
231*6777b538SAndroid Build Coastguard Worker 
232*6777b538SAndroid Build Coastguard Worker   size_t char_index = 0;
233*6777b538SAndroid Build Coastguard Worker   // Check if the unicode "character" that was just unescaped is valid.
234*6777b538SAndroid Build Coastguard Worker   if (!ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes,
235*6777b538SAndroid Build Coastguard Worker                             &char_index, code_point_out)) {
236*6777b538SAndroid Build Coastguard Worker     return false;
237*6777b538SAndroid Build Coastguard Worker   }
238*6777b538SAndroid Build Coastguard Worker 
239*6777b538SAndroid Build Coastguard Worker   // It's possible that a prefix of |bytes| forms a valid UTF-8 character,
240*6777b538SAndroid Build Coastguard Worker   // and the rest are not valid UTF-8, so need to update |num_bytes| based
241*6777b538SAndroid Build Coastguard Worker   // on the result of ReadUnicodeCharacter().
242*6777b538SAndroid Build Coastguard Worker   num_bytes = char_index + 1;
243*6777b538SAndroid Build Coastguard Worker   *unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes);
244*6777b538SAndroid Build Coastguard Worker   return true;
245*6777b538SAndroid Build Coastguard Worker }
246*6777b538SAndroid Build Coastguard Worker 
247*6777b538SAndroid Build Coastguard Worker // This method takes a Unicode code point and returns true if it should be
248*6777b538SAndroid Build Coastguard Worker // unescaped, based on |rules|.
ShouldUnescapeCodePoint(UnescapeRule::Type rules,base_icu::UChar32 code_point)249*6777b538SAndroid Build Coastguard Worker bool ShouldUnescapeCodePoint(UnescapeRule::Type rules,
250*6777b538SAndroid Build Coastguard Worker                              base_icu::UChar32 code_point) {
251*6777b538SAndroid Build Coastguard Worker   // If this is an ASCII character, use the lookup table.
252*6777b538SAndroid Build Coastguard Worker   if (code_point >= 0 && code_point < 0x80) {
253*6777b538SAndroid Build Coastguard Worker     return kUrlUnescape[static_cast<size_t>(code_point)] ||
254*6777b538SAndroid Build Coastguard Worker            // Allow some additional unescaping when flags are set.
255*6777b538SAndroid Build Coastguard Worker            (code_point == ' ' && (rules & UnescapeRule::SPACES)) ||
256*6777b538SAndroid Build Coastguard Worker            // Allow any of the prohibited but non-control characters when doing
257*6777b538SAndroid Build Coastguard Worker            // "special" chars.
258*6777b538SAndroid Build Coastguard Worker            ((code_point == '/' || code_point == '\\') &&
259*6777b538SAndroid Build Coastguard Worker             (rules & UnescapeRule::PATH_SEPARATORS)) ||
260*6777b538SAndroid Build Coastguard Worker            (code_point > ' ' && code_point != '/' && code_point != '\\' &&
261*6777b538SAndroid Build Coastguard Worker             (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS));
262*6777b538SAndroid Build Coastguard Worker   }
263*6777b538SAndroid Build Coastguard Worker 
264*6777b538SAndroid Build Coastguard Worker   // Compare the code point against a list of characters that can be used
265*6777b538SAndroid Build Coastguard Worker   // to spoof other URLs.
266*6777b538SAndroid Build Coastguard Worker   //
267*6777b538SAndroid Build Coastguard Worker   // Can't use icu to make this cleaner, because Cronet cannot depend on
268*6777b538SAndroid Build Coastguard Worker   // icu, and currently uses this file.
269*6777b538SAndroid Build Coastguard Worker   // TODO(https://crbug.com/829873): Try to make this use icu, both to
270*6777b538SAndroid Build Coastguard Worker   // protect against regressions as the Unicode standard is updated and to
271*6777b538SAndroid Build Coastguard Worker   // reduce the number of long lists of characters.
272*6777b538SAndroid Build Coastguard Worker   return !(
273*6777b538SAndroid Build Coastguard Worker       // Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
274*6777b538SAndroid Build Coastguard Worker       // control characters are not allowed to appear unescaped in URLs.
275*6777b538SAndroid Build Coastguard Worker       code_point == 0x200E ||  // LEFT-TO-RIGHT MARK         (%E2%80%8E)
276*6777b538SAndroid Build Coastguard Worker       code_point == 0x200F ||  // RIGHT-TO-LEFT MARK         (%E2%80%8F)
277*6777b538SAndroid Build Coastguard Worker       code_point == 0x202A ||  // LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
278*6777b538SAndroid Build Coastguard Worker       code_point == 0x202B ||  // RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
279*6777b538SAndroid Build Coastguard Worker       code_point == 0x202C ||  // POP DIRECTIONAL FORMATTING (%E2%80%AC)
280*6777b538SAndroid Build Coastguard Worker       code_point == 0x202D ||  // LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
281*6777b538SAndroid Build Coastguard Worker       code_point == 0x202E ||  // RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)
282*6777b538SAndroid Build Coastguard Worker 
283*6777b538SAndroid Build Coastguard Worker       // The Unicode Technical Report (TR9) as referenced by RFC 3987 above has
284*6777b538SAndroid Build Coastguard Worker       // since added some new BiDi control characters that are not safe to
285*6777b538SAndroid Build Coastguard Worker       // unescape. http://www.unicode.org/reports/tr9
286*6777b538SAndroid Build Coastguard Worker       code_point == 0x061C ||  // ARABIC LETTER MARK         (%D8%9C)
287*6777b538SAndroid Build Coastguard Worker       code_point == 0x2066 ||  // LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
288*6777b538SAndroid Build Coastguard Worker       code_point == 0x2067 ||  // RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
289*6777b538SAndroid Build Coastguard Worker       code_point == 0x2068 ||  // FIRST STRONG ISOLATE       (%E2%81%A8)
290*6777b538SAndroid Build Coastguard Worker       code_point == 0x2069 ||  // POP DIRECTIONAL ISOLATE    (%E2%81%A9)
291*6777b538SAndroid Build Coastguard Worker 
292*6777b538SAndroid Build Coastguard Worker       // The following spoofable characters are also banned in unescaped URLs,
293*6777b538SAndroid Build Coastguard Worker       // because they could be used to imitate parts of a web browser's UI.
294*6777b538SAndroid Build Coastguard Worker       code_point == 0x1F50F ||  // LOCK WITH INK PEN    (%F0%9F%94%8F)
295*6777b538SAndroid Build Coastguard Worker       code_point == 0x1F510 ||  // CLOSED LOCK WITH KEY (%F0%9F%94%90)
296*6777b538SAndroid Build Coastguard Worker       code_point == 0x1F512 ||  // LOCK                 (%F0%9F%94%92)
297*6777b538SAndroid Build Coastguard Worker       code_point == 0x1F513 ||  // OPEN LOCK            (%F0%9F%94%93)
298*6777b538SAndroid Build Coastguard Worker 
299*6777b538SAndroid Build Coastguard Worker       // Spaces are also banned, as they can be used to scroll text out of view.
300*6777b538SAndroid Build Coastguard Worker       code_point == 0x0085 ||  // NEXT LINE                  (%C2%85)
301*6777b538SAndroid Build Coastguard Worker       code_point == 0x00A0 ||  // NO-BREAK SPACE             (%C2%A0)
302*6777b538SAndroid Build Coastguard Worker       code_point == 0x1680 ||  // OGHAM SPACE MARK           (%E1%9A%80)
303*6777b538SAndroid Build Coastguard Worker       code_point == 0x2000 ||  // EN QUAD                    (%E2%80%80)
304*6777b538SAndroid Build Coastguard Worker       code_point == 0x2001 ||  // EM QUAD                    (%E2%80%81)
305*6777b538SAndroid Build Coastguard Worker       code_point == 0x2002 ||  // EN SPACE                   (%E2%80%82)
306*6777b538SAndroid Build Coastguard Worker       code_point == 0x2003 ||  // EM SPACE                   (%E2%80%83)
307*6777b538SAndroid Build Coastguard Worker       code_point == 0x2004 ||  // THREE-PER-EM SPACE         (%E2%80%84)
308*6777b538SAndroid Build Coastguard Worker       code_point == 0x2005 ||  // FOUR-PER-EM SPACE          (%E2%80%85)
309*6777b538SAndroid Build Coastguard Worker       code_point == 0x2006 ||  // SIX-PER-EM SPACE           (%E2%80%86)
310*6777b538SAndroid Build Coastguard Worker       code_point == 0x2007 ||  // FIGURE SPACE               (%E2%80%87)
311*6777b538SAndroid Build Coastguard Worker       code_point == 0x2008 ||  // PUNCTUATION SPACE          (%E2%80%88)
312*6777b538SAndroid Build Coastguard Worker       code_point == 0x2009 ||  // THIN SPACE                 (%E2%80%89)
313*6777b538SAndroid Build Coastguard Worker       code_point == 0x200A ||  // HAIR SPACE                 (%E2%80%8A)
314*6777b538SAndroid Build Coastguard Worker       code_point == 0x2028 ||  // LINE SEPARATOR             (%E2%80%A8)
315*6777b538SAndroid Build Coastguard Worker       code_point == 0x2029 ||  // PARAGRAPH SEPARATOR        (%E2%80%A9)
316*6777b538SAndroid Build Coastguard Worker       code_point == 0x202F ||  // NARROW NO-BREAK SPACE      (%E2%80%AF)
317*6777b538SAndroid Build Coastguard Worker       code_point == 0x205F ||  // MEDIUM MATHEMATICAL SPACE  (%E2%81%9F)
318*6777b538SAndroid Build Coastguard Worker       code_point == 0x3000 ||  // IDEOGRAPHIC SPACE          (%E3%80%80)
319*6777b538SAndroid Build Coastguard Worker       // U+2800 is rendered as a space, but is not considered whitespace (see
320*6777b538SAndroid Build Coastguard Worker       // crbug.com/1068531).
321*6777b538SAndroid Build Coastguard Worker       code_point == 0x2800 ||  // BRAILLE PATTERN BLANK      (%E2%A0%80)
322*6777b538SAndroid Build Coastguard Worker 
323*6777b538SAndroid Build Coastguard Worker       // Default Ignorable ([:Default_Ignorable_Code_Point=Yes:]) and Format
324*6777b538SAndroid Build Coastguard Worker       // characters ([:Cf:]) are also banned (see crbug.com/824715).
325*6777b538SAndroid Build Coastguard Worker       code_point == 0x00AD ||  // SOFT HYPHEN               (%C2%AD)
326*6777b538SAndroid Build Coastguard Worker       code_point == 0x034F ||  // COMBINING GRAPHEME JOINER (%CD%8F)
327*6777b538SAndroid Build Coastguard Worker       // Arabic number formatting
328*6777b538SAndroid Build Coastguard Worker       (code_point >= 0x0600 && code_point <= 0x0605) ||
329*6777b538SAndroid Build Coastguard Worker       // U+061C is already banned as a BiDi control character.
330*6777b538SAndroid Build Coastguard Worker       code_point == 0x06DD ||  // ARABIC END OF AYAH          (%DB%9D)
331*6777b538SAndroid Build Coastguard Worker       code_point == 0x070F ||  // SYRIAC ABBREVIATION MARK    (%DC%8F)
332*6777b538SAndroid Build Coastguard Worker       code_point == 0x08E2 ||  // ARABIC DISPUTED END OF AYAH (%E0%A3%A2)
333*6777b538SAndroid Build Coastguard Worker       code_point == 0x115F ||  // HANGUL CHOSEONG FILLER      (%E1%85%9F)
334*6777b538SAndroid Build Coastguard Worker       code_point == 0x1160 ||  // HANGUL JUNGSEONG FILLER     (%E1%85%A0)
335*6777b538SAndroid Build Coastguard Worker       code_point == 0x17B4 ||  // KHMER VOWEL INHERENT AQ     (%E1%9E%B4)
336*6777b538SAndroid Build Coastguard Worker       code_point == 0x17B5 ||  // KHMER VOWEL INHERENT AA     (%E1%9E%B5)
337*6777b538SAndroid Build Coastguard Worker       code_point == 0x180B ||  // MONGOLIAN FREE VARIATION SELECTOR ONE
338*6777b538SAndroid Build Coastguard Worker                                // (%E1%A0%8B)
339*6777b538SAndroid Build Coastguard Worker       code_point == 0x180C ||  // MONGOLIAN FREE VARIATION SELECTOR TWO
340*6777b538SAndroid Build Coastguard Worker                                // (%E1%A0%8C)
341*6777b538SAndroid Build Coastguard Worker       code_point == 0x180D ||  // MONGOLIAN FREE VARIATION SELECTOR THREE
342*6777b538SAndroid Build Coastguard Worker                                // (%E1%A0%8D)
343*6777b538SAndroid Build Coastguard Worker       code_point == 0x180E ||  // MONGOLIAN VOWEL SEPARATOR   (%E1%A0%8E)
344*6777b538SAndroid Build Coastguard Worker       code_point == 0x200B ||  // ZERO WIDTH SPACE            (%E2%80%8B)
345*6777b538SAndroid Build Coastguard Worker       code_point == 0x200C ||  // ZERO WIDTH SPACE NON-JOINER (%E2%80%8C)
346*6777b538SAndroid Build Coastguard Worker       code_point == 0x200D ||  // ZERO WIDTH JOINER           (%E2%80%8D)
347*6777b538SAndroid Build Coastguard Worker       // U+200E, U+200F, U+202A--202E, and U+2066--2069 are already banned as
348*6777b538SAndroid Build Coastguard Worker       // BiDi control characters.
349*6777b538SAndroid Build Coastguard Worker       code_point == 0x2060 ||  // WORD JOINER          (%E2%81%A0)
350*6777b538SAndroid Build Coastguard Worker       code_point == 0x2061 ||  // FUNCTION APPLICATION (%E2%81%A1)
351*6777b538SAndroid Build Coastguard Worker       code_point == 0x2062 ||  // INVISIBLE TIMES      (%E2%81%A2)
352*6777b538SAndroid Build Coastguard Worker       code_point == 0x2063 ||  // INVISIBLE SEPARATOR  (%E2%81%A3)
353*6777b538SAndroid Build Coastguard Worker       code_point == 0x2064 ||  // INVISIBLE PLUS       (%E2%81%A4)
354*6777b538SAndroid Build Coastguard Worker       code_point == 0x2065 ||  // null (%E2%81%A5)
      // 0x2066--0x2069 are already banned as BiDi control characters.
356*6777b538SAndroid Build Coastguard Worker       // General Punctuation - Deprecated (U+206A--206F)
357*6777b538SAndroid Build Coastguard Worker       (code_point >= 0x206A && code_point <= 0x206F) ||
358*6777b538SAndroid Build Coastguard Worker       code_point == 0x3164 ||  // HANGUL FILLER (%E3%85%A4)
359*6777b538SAndroid Build Coastguard Worker       (code_point >= 0xFFF0 && code_point <= 0xFFF8) ||  // null
360*6777b538SAndroid Build Coastguard Worker       // Variation selectors (%EF%B8%80 -- %EF%B8%8F)
361*6777b538SAndroid Build Coastguard Worker       (code_point >= 0xFE00 && code_point <= 0xFE0F) ||
362*6777b538SAndroid Build Coastguard Worker       code_point == 0xFEFF ||   // ZERO WIDTH NO-BREAK SPACE (%EF%BB%BF)
363*6777b538SAndroid Build Coastguard Worker       code_point == 0xFFA0 ||   // HALFWIDTH HANGUL FILLER (%EF%BE%A0)
364*6777b538SAndroid Build Coastguard Worker       code_point == 0xFFF9 ||   // INTERLINEAR ANNOTATION ANCHOR     (%EF%BF%B9)
365*6777b538SAndroid Build Coastguard Worker       code_point == 0xFFFA ||   // INTERLINEAR ANNOTATION SEPARATOR  (%EF%BF%BA)
366*6777b538SAndroid Build Coastguard Worker       code_point == 0xFFFB ||   // INTERLINEAR ANNOTATION TERMINATOR (%EF%BF%BB)
367*6777b538SAndroid Build Coastguard Worker       code_point == 0x110BD ||  // KAITHI NUMBER SIGN       (%F0%91%82%BD)
368*6777b538SAndroid Build Coastguard Worker       code_point == 0x110CD ||  // KAITHI NUMBER SIGN ABOVE (%F0%91%83%8D)
369*6777b538SAndroid Build Coastguard Worker       // Egyptian hieroglyph formatting (%F0%93%90%B0 -- %F0%93%90%B8)
370*6777b538SAndroid Build Coastguard Worker       (code_point >= 0x13430 && code_point <= 0x13438) ||
371*6777b538SAndroid Build Coastguard Worker       // Shorthand format controls (%F0%9B%B2%A0 -- %F0%9B%B2%A3)
372*6777b538SAndroid Build Coastguard Worker       (code_point >= 0x1BCA0 && code_point <= 0x1BCA3) ||
373*6777b538SAndroid Build Coastguard Worker       // Beams and slurs (%F0%9D%85%B3 -- %F0%9D%85%BA)
374*6777b538SAndroid Build Coastguard Worker       (code_point >= 0x1D173 && code_point <= 0x1D17A) ||
375*6777b538SAndroid Build Coastguard Worker       // Tags, Variation Selectors, nulls
376*6777b538SAndroid Build Coastguard Worker       (code_point >= 0xE0000 && code_point <= 0xE0FFF));
377*6777b538SAndroid Build Coastguard Worker }
378*6777b538SAndroid Build Coastguard Worker 
379*6777b538SAndroid Build Coastguard Worker // Unescapes |escaped_text| according to |rules|, returning the resulting
380*6777b538SAndroid Build Coastguard Worker // string.  Fills in an |adjustments| parameter, if non-nullptr, so it reflects
381*6777b538SAndroid Build Coastguard Worker // the alterations done to the string that are not one-character-to-one-
382*6777b538SAndroid Build Coastguard Worker // character.  The resulting |adjustments| will always be sorted by increasing
383*6777b538SAndroid Build Coastguard Worker // offset.
UnescapeURLWithAdjustmentsImpl(StringPiece escaped_text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)384*6777b538SAndroid Build Coastguard Worker std::string UnescapeURLWithAdjustmentsImpl(
385*6777b538SAndroid Build Coastguard Worker     StringPiece escaped_text,
386*6777b538SAndroid Build Coastguard Worker     UnescapeRule::Type rules,
387*6777b538SAndroid Build Coastguard Worker     OffsetAdjuster::Adjustments* adjustments) {
388*6777b538SAndroid Build Coastguard Worker   if (adjustments)
389*6777b538SAndroid Build Coastguard Worker     adjustments->clear();
390*6777b538SAndroid Build Coastguard Worker   // Do not unescape anything, return the |escaped_text| text.
391*6777b538SAndroid Build Coastguard Worker   if (rules == UnescapeRule::NONE)
392*6777b538SAndroid Build Coastguard Worker     return std::string(escaped_text);
393*6777b538SAndroid Build Coastguard Worker 
394*6777b538SAndroid Build Coastguard Worker   // The output of the unescaping is always smaller than the input, so we can
395*6777b538SAndroid Build Coastguard Worker   // reserve the input size to make sure we have enough buffer and don't have
396*6777b538SAndroid Build Coastguard Worker   // to allocate in the loop below.
397*6777b538SAndroid Build Coastguard Worker   std::string result;
398*6777b538SAndroid Build Coastguard Worker   result.reserve(escaped_text.length());
399*6777b538SAndroid Build Coastguard Worker 
400*6777b538SAndroid Build Coastguard Worker   // Locations of adjusted text.
401*6777b538SAndroid Build Coastguard Worker   for (size_t i = 0, max = escaped_text.size(); i < max;) {
402*6777b538SAndroid Build Coastguard Worker     // Try to unescape the character.
403*6777b538SAndroid Build Coastguard Worker     base_icu::UChar32 code_point;
404*6777b538SAndroid Build Coastguard Worker     std::string unescaped;
405*6777b538SAndroid Build Coastguard Worker     if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point,
406*6777b538SAndroid Build Coastguard Worker                                       &unescaped)) {
407*6777b538SAndroid Build Coastguard Worker       // Check if the next character can be unescaped, but not as a valid UTF-8
408*6777b538SAndroid Build Coastguard Worker       // character. In that case, just unescaped and write the non-sense
409*6777b538SAndroid Build Coastguard Worker       // character.
410*6777b538SAndroid Build Coastguard Worker       //
411*6777b538SAndroid Build Coastguard Worker       // TODO(https://crbug.com/829868): Do not unescape illegal UTF-8
412*6777b538SAndroid Build Coastguard Worker       // sequences.
413*6777b538SAndroid Build Coastguard Worker       unsigned char non_utf8_byte;
414*6777b538SAndroid Build Coastguard Worker       if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) {
415*6777b538SAndroid Build Coastguard Worker         result.push_back(static_cast<char>(non_utf8_byte));
416*6777b538SAndroid Build Coastguard Worker         if (adjustments)
417*6777b538SAndroid Build Coastguard Worker           adjustments->push_back(OffsetAdjuster::Adjustment(i, 3, 1));
418*6777b538SAndroid Build Coastguard Worker         i += 3;
419*6777b538SAndroid Build Coastguard Worker         continue;
420*6777b538SAndroid Build Coastguard Worker       }
421*6777b538SAndroid Build Coastguard Worker 
422*6777b538SAndroid Build Coastguard Worker       // Character is not escaped, so append as is, unless it's a '+' and
423*6777b538SAndroid Build Coastguard Worker       // REPLACE_PLUS_WITH_SPACE is being applied.
424*6777b538SAndroid Build Coastguard Worker       if (escaped_text[i] == '+' &&
425*6777b538SAndroid Build Coastguard Worker           (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) {
426*6777b538SAndroid Build Coastguard Worker         result.push_back(' ');
427*6777b538SAndroid Build Coastguard Worker       } else {
428*6777b538SAndroid Build Coastguard Worker         result.push_back(escaped_text[i]);
429*6777b538SAndroid Build Coastguard Worker       }
430*6777b538SAndroid Build Coastguard Worker       ++i;
431*6777b538SAndroid Build Coastguard Worker       continue;
432*6777b538SAndroid Build Coastguard Worker     }
433*6777b538SAndroid Build Coastguard Worker 
434*6777b538SAndroid Build Coastguard Worker     DCHECK(!unescaped.empty());
435*6777b538SAndroid Build Coastguard Worker 
436*6777b538SAndroid Build Coastguard Worker     if (!ShouldUnescapeCodePoint(rules, code_point)) {
437*6777b538SAndroid Build Coastguard Worker       // If it's a valid UTF-8 character, but not safe to unescape, copy all
438*6777b538SAndroid Build Coastguard Worker       // bytes directly.
439*6777b538SAndroid Build Coastguard Worker       result.append(escaped_text.substr(i, 3 * unescaped.length()));
440*6777b538SAndroid Build Coastguard Worker       i += unescaped.length() * 3;
441*6777b538SAndroid Build Coastguard Worker       continue;
442*6777b538SAndroid Build Coastguard Worker     }
443*6777b538SAndroid Build Coastguard Worker 
444*6777b538SAndroid Build Coastguard Worker     // If the code point is allowed, and append the entire unescaped character.
445*6777b538SAndroid Build Coastguard Worker     result.append(unescaped);
446*6777b538SAndroid Build Coastguard Worker     if (adjustments) {
447*6777b538SAndroid Build Coastguard Worker       for (size_t j = 0; j < unescaped.length(); ++j) {
448*6777b538SAndroid Build Coastguard Worker         adjustments->push_back(OffsetAdjuster::Adjustment(i + j * 3, 3, 1));
449*6777b538SAndroid Build Coastguard Worker       }
450*6777b538SAndroid Build Coastguard Worker     }
451*6777b538SAndroid Build Coastguard Worker     i += 3 * unescaped.length();
452*6777b538SAndroid Build Coastguard Worker   }
453*6777b538SAndroid Build Coastguard Worker 
454*6777b538SAndroid Build Coastguard Worker   return result;
455*6777b538SAndroid Build Coastguard Worker }
456*6777b538SAndroid Build Coastguard Worker 
457*6777b538SAndroid Build Coastguard Worker }  // namespace
458*6777b538SAndroid Build Coastguard Worker 
EscapeAllExceptUnreserved(StringPiece text)459*6777b538SAndroid Build Coastguard Worker std::string EscapeAllExceptUnreserved(StringPiece text) {
460*6777b538SAndroid Build Coastguard Worker   return Escape(text, kUnreservedCharmap, false);
461*6777b538SAndroid Build Coastguard Worker }
462*6777b538SAndroid Build Coastguard Worker 
EscapeQueryParamValue(StringPiece text,bool use_plus)463*6777b538SAndroid Build Coastguard Worker std::string EscapeQueryParamValue(StringPiece text, bool use_plus) {
464*6777b538SAndroid Build Coastguard Worker   return Escape(text, kQueryCharmap, use_plus);
465*6777b538SAndroid Build Coastguard Worker }
466*6777b538SAndroid Build Coastguard Worker 
EscapePath(StringPiece path)467*6777b538SAndroid Build Coastguard Worker std::string EscapePath(StringPiece path) {
468*6777b538SAndroid Build Coastguard Worker   return Escape(path, kPathCharmap, false);
469*6777b538SAndroid Build Coastguard Worker }
470*6777b538SAndroid Build Coastguard Worker 
471*6777b538SAndroid Build Coastguard Worker #if BUILDFLAG(IS_APPLE)
EscapeNSURLPrecursor(StringPiece precursor)472*6777b538SAndroid Build Coastguard Worker std::string EscapeNSURLPrecursor(StringPiece precursor) {
473*6777b538SAndroid Build Coastguard Worker   return Escape(precursor, kNSURLCharmap, false, true);
474*6777b538SAndroid Build Coastguard Worker }
475*6777b538SAndroid Build Coastguard Worker #endif  // BUILDFLAG(IS_APPLE)
476*6777b538SAndroid Build Coastguard Worker 
EscapeUrlEncodedData(StringPiece path,bool use_plus)477*6777b538SAndroid Build Coastguard Worker std::string EscapeUrlEncodedData(StringPiece path, bool use_plus) {
478*6777b538SAndroid Build Coastguard Worker   return Escape(path, kUrlEscape, use_plus);
479*6777b538SAndroid Build Coastguard Worker }
480*6777b538SAndroid Build Coastguard Worker 
EscapeNonASCIIAndPercent(StringPiece input)481*6777b538SAndroid Build Coastguard Worker std::string EscapeNonASCIIAndPercent(StringPiece input) {
482*6777b538SAndroid Build Coastguard Worker   return Escape(input, kNonASCIICharmapAndPercent, false);
483*6777b538SAndroid Build Coastguard Worker }
484*6777b538SAndroid Build Coastguard Worker 
EscapeNonASCII(StringPiece input)485*6777b538SAndroid Build Coastguard Worker std::string EscapeNonASCII(StringPiece input) {
486*6777b538SAndroid Build Coastguard Worker   return Escape(input, kNonASCIICharmap, false);
487*6777b538SAndroid Build Coastguard Worker }
488*6777b538SAndroid Build Coastguard Worker 
EscapeExternalHandlerValue(StringPiece text)489*6777b538SAndroid Build Coastguard Worker std::string EscapeExternalHandlerValue(StringPiece text) {
490*6777b538SAndroid Build Coastguard Worker   return Escape(text, kExternalHandlerCharmap, false, true);
491*6777b538SAndroid Build Coastguard Worker }
492*6777b538SAndroid Build Coastguard Worker 
AppendEscapedCharForHTML(char c,std::string * output)493*6777b538SAndroid Build Coastguard Worker void AppendEscapedCharForHTML(char c, std::string* output) {
494*6777b538SAndroid Build Coastguard Worker   AppendEscapedCharForHTMLImpl(c, output);
495*6777b538SAndroid Build Coastguard Worker }
496*6777b538SAndroid Build Coastguard Worker 
EscapeForHTML(StringPiece input)497*6777b538SAndroid Build Coastguard Worker std::string EscapeForHTML(StringPiece input) {
498*6777b538SAndroid Build Coastguard Worker   return EscapeForHTMLImpl(input);
499*6777b538SAndroid Build Coastguard Worker }
500*6777b538SAndroid Build Coastguard Worker 
EscapeForHTML(StringPiece16 input)501*6777b538SAndroid Build Coastguard Worker std::u16string EscapeForHTML(StringPiece16 input) {
502*6777b538SAndroid Build Coastguard Worker   return EscapeForHTMLImpl(input);
503*6777b538SAndroid Build Coastguard Worker }
504*6777b538SAndroid Build Coastguard Worker 
UnescapeURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)505*6777b538SAndroid Build Coastguard Worker std::string UnescapeURLComponent(StringPiece escaped_text,
506*6777b538SAndroid Build Coastguard Worker                                  UnescapeRule::Type rules) {
507*6777b538SAndroid Build Coastguard Worker   return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, nullptr);
508*6777b538SAndroid Build Coastguard Worker }
509*6777b538SAndroid Build Coastguard Worker 
UnescapeAndDecodeUTF8URLComponentWithAdjustments(StringPiece text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)510*6777b538SAndroid Build Coastguard Worker std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments(
511*6777b538SAndroid Build Coastguard Worker     StringPiece text,
512*6777b538SAndroid Build Coastguard Worker     UnescapeRule::Type rules,
513*6777b538SAndroid Build Coastguard Worker     OffsetAdjuster::Adjustments* adjustments) {
514*6777b538SAndroid Build Coastguard Worker   std::u16string result;
515*6777b538SAndroid Build Coastguard Worker   OffsetAdjuster::Adjustments unescape_adjustments;
516*6777b538SAndroid Build Coastguard Worker   std::string unescaped_url(
517*6777b538SAndroid Build Coastguard Worker       UnescapeURLWithAdjustmentsImpl(text, rules, &unescape_adjustments));
518*6777b538SAndroid Build Coastguard Worker   if (UTF8ToUTF16WithAdjustments(unescaped_url.data(), unescaped_url.length(),
519*6777b538SAndroid Build Coastguard Worker                                  &result, adjustments)) {
520*6777b538SAndroid Build Coastguard Worker     // Character set looks like it's valid.
521*6777b538SAndroid Build Coastguard Worker     if (adjustments) {
522*6777b538SAndroid Build Coastguard Worker       OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
523*6777b538SAndroid Build Coastguard Worker                                                  adjustments);
524*6777b538SAndroid Build Coastguard Worker     }
525*6777b538SAndroid Build Coastguard Worker     return result;
526*6777b538SAndroid Build Coastguard Worker   }
527*6777b538SAndroid Build Coastguard Worker   // Character set is not valid.  Return the escaped version.
528*6777b538SAndroid Build Coastguard Worker   return UTF8ToUTF16WithAdjustments(text, adjustments);
529*6777b538SAndroid Build Coastguard Worker }
530*6777b538SAndroid Build Coastguard Worker 
UnescapeBinaryURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)531*6777b538SAndroid Build Coastguard Worker std::string UnescapeBinaryURLComponent(StringPiece escaped_text,
532*6777b538SAndroid Build Coastguard Worker                                        UnescapeRule::Type rules) {
533*6777b538SAndroid Build Coastguard Worker   // Only NORMAL and REPLACE_PLUS_WITH_SPACE are supported.
534*6777b538SAndroid Build Coastguard Worker   DCHECK(rules != UnescapeRule::NONE);
535*6777b538SAndroid Build Coastguard Worker   DCHECK(!(rules &
536*6777b538SAndroid Build Coastguard Worker            ~(UnescapeRule::NORMAL | UnescapeRule::REPLACE_PLUS_WITH_SPACE)));
537*6777b538SAndroid Build Coastguard Worker 
538*6777b538SAndroid Build Coastguard Worker   // It is not possible to read the feature state when this function is invoked
539*6777b538SAndroid Build Coastguard Worker   // before FeatureList initialization. In that case, fallback to the feature's
540*6777b538SAndroid Build Coastguard Worker   // default state.
541*6777b538SAndroid Build Coastguard Worker   //
542*6777b538SAndroid Build Coastguard Worker   // TODO(crbug.com/1321924): Cleanup this feature.
543*6777b538SAndroid Build Coastguard Worker   const bool optimize_data_urls_feature_is_enabled =
544*6777b538SAndroid Build Coastguard Worker       base::FeatureList::GetInstance()
545*6777b538SAndroid Build Coastguard Worker           ? base::FeatureList::IsEnabled(features::kOptimizeDataUrls)
546*6777b538SAndroid Build Coastguard Worker           : features::kOptimizeDataUrls.default_state ==
547*6777b538SAndroid Build Coastguard Worker                 base::FEATURE_ENABLED_BY_DEFAULT;
548*6777b538SAndroid Build Coastguard Worker 
549*6777b538SAndroid Build Coastguard Worker   // If there are no '%' characters in the string, there will be nothing to
550*6777b538SAndroid Build Coastguard Worker   // unescape, so we can take the fast path.
551*6777b538SAndroid Build Coastguard Worker   if (optimize_data_urls_feature_is_enabled &&
552*6777b538SAndroid Build Coastguard Worker       escaped_text.find('%') == StringPiece::npos) {
553*6777b538SAndroid Build Coastguard Worker     std::string unescaped_text(escaped_text);
554*6777b538SAndroid Build Coastguard Worker     if (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)
555*6777b538SAndroid Build Coastguard Worker       std::replace(unescaped_text.begin(), unescaped_text.end(), '+', ' ');
556*6777b538SAndroid Build Coastguard Worker     return unescaped_text;
557*6777b538SAndroid Build Coastguard Worker   }
558*6777b538SAndroid Build Coastguard Worker 
559*6777b538SAndroid Build Coastguard Worker   std::string unescaped_text;
560*6777b538SAndroid Build Coastguard Worker 
561*6777b538SAndroid Build Coastguard Worker   // The output of the unescaping is always smaller than the input, so we can
562*6777b538SAndroid Build Coastguard Worker   // reserve the input size to make sure we have enough buffer and don't have
563*6777b538SAndroid Build Coastguard Worker   // to allocate in the loop below.
564*6777b538SAndroid Build Coastguard Worker   // Increase capacity before size, as just resizing can grow capacity
565*6777b538SAndroid Build Coastguard Worker   // needlessly beyond our requested size.
566*6777b538SAndroid Build Coastguard Worker   unescaped_text.reserve(escaped_text.size());
567*6777b538SAndroid Build Coastguard Worker   unescaped_text.resize(escaped_text.size());
568*6777b538SAndroid Build Coastguard Worker 
569*6777b538SAndroid Build Coastguard Worker   size_t output_index = 0;
570*6777b538SAndroid Build Coastguard Worker 
571*6777b538SAndroid Build Coastguard Worker   for (size_t i = 0, max = escaped_text.size(); i < max;) {
572*6777b538SAndroid Build Coastguard Worker     unsigned char byte;
573*6777b538SAndroid Build Coastguard Worker     // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
574*6777b538SAndroid Build Coastguard Worker     // to call.
575*6777b538SAndroid Build Coastguard Worker     if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
576*6777b538SAndroid Build Coastguard Worker       unescaped_text[output_index++] = static_cast<char>(byte);
577*6777b538SAndroid Build Coastguard Worker       i += 3;
578*6777b538SAndroid Build Coastguard Worker       continue;
579*6777b538SAndroid Build Coastguard Worker     }
580*6777b538SAndroid Build Coastguard Worker 
581*6777b538SAndroid Build Coastguard Worker     if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
582*6777b538SAndroid Build Coastguard Worker         escaped_text[i] == '+') {
583*6777b538SAndroid Build Coastguard Worker       unescaped_text[output_index++] = ' ';
584*6777b538SAndroid Build Coastguard Worker       ++i;
585*6777b538SAndroid Build Coastguard Worker       continue;
586*6777b538SAndroid Build Coastguard Worker     }
587*6777b538SAndroid Build Coastguard Worker 
588*6777b538SAndroid Build Coastguard Worker     unescaped_text[output_index++] = escaped_text[i++];
589*6777b538SAndroid Build Coastguard Worker   }
590*6777b538SAndroid Build Coastguard Worker 
591*6777b538SAndroid Build Coastguard Worker   DCHECK_LE(output_index, unescaped_text.size());
592*6777b538SAndroid Build Coastguard Worker   unescaped_text.resize(output_index);
593*6777b538SAndroid Build Coastguard Worker   return unescaped_text;
594*6777b538SAndroid Build Coastguard Worker }
595*6777b538SAndroid Build Coastguard Worker 
UnescapeBinaryURLComponentSafe(StringPiece escaped_text,bool fail_on_path_separators,std::string * unescaped_text)596*6777b538SAndroid Build Coastguard Worker bool UnescapeBinaryURLComponentSafe(StringPiece escaped_text,
597*6777b538SAndroid Build Coastguard Worker                                     bool fail_on_path_separators,
598*6777b538SAndroid Build Coastguard Worker                                     std::string* unescaped_text) {
599*6777b538SAndroid Build Coastguard Worker   unescaped_text->clear();
600*6777b538SAndroid Build Coastguard Worker 
601*6777b538SAndroid Build Coastguard Worker   std::set<unsigned char> illegal_encoded_bytes;
602*6777b538SAndroid Build Coastguard Worker   for (unsigned char c = '\x00'; c < '\x20'; ++c) {
603*6777b538SAndroid Build Coastguard Worker     illegal_encoded_bytes.insert(c);
604*6777b538SAndroid Build Coastguard Worker   }
605*6777b538SAndroid Build Coastguard Worker   if (fail_on_path_separators) {
606*6777b538SAndroid Build Coastguard Worker     illegal_encoded_bytes.insert('/');
607*6777b538SAndroid Build Coastguard Worker     illegal_encoded_bytes.insert('\\');
608*6777b538SAndroid Build Coastguard Worker   }
609*6777b538SAndroid Build Coastguard Worker   if (ContainsEncodedBytes(escaped_text, illegal_encoded_bytes))
610*6777b538SAndroid Build Coastguard Worker     return false;
611*6777b538SAndroid Build Coastguard Worker 
612*6777b538SAndroid Build Coastguard Worker   *unescaped_text = UnescapeBinaryURLComponent(escaped_text);
613*6777b538SAndroid Build Coastguard Worker   return true;
614*6777b538SAndroid Build Coastguard Worker }
615*6777b538SAndroid Build Coastguard Worker 
ContainsEncodedBytes(StringPiece escaped_text,const std::set<unsigned char> & bytes)616*6777b538SAndroid Build Coastguard Worker bool ContainsEncodedBytes(StringPiece escaped_text,
617*6777b538SAndroid Build Coastguard Worker                           const std::set<unsigned char>& bytes) {
618*6777b538SAndroid Build Coastguard Worker   for (size_t i = 0, max = escaped_text.size(); i < max;) {
619*6777b538SAndroid Build Coastguard Worker     unsigned char byte;
620*6777b538SAndroid Build Coastguard Worker     // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
621*6777b538SAndroid Build Coastguard Worker     // to call.
622*6777b538SAndroid Build Coastguard Worker     if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
623*6777b538SAndroid Build Coastguard Worker       if (bytes.find(byte) != bytes.end())
624*6777b538SAndroid Build Coastguard Worker         return true;
625*6777b538SAndroid Build Coastguard Worker 
626*6777b538SAndroid Build Coastguard Worker       i += 3;
627*6777b538SAndroid Build Coastguard Worker       continue;
628*6777b538SAndroid Build Coastguard Worker     }
629*6777b538SAndroid Build Coastguard Worker 
630*6777b538SAndroid Build Coastguard Worker     ++i;
631*6777b538SAndroid Build Coastguard Worker   }
632*6777b538SAndroid Build Coastguard Worker 
633*6777b538SAndroid Build Coastguard Worker   return false;
634*6777b538SAndroid Build Coastguard Worker }
635*6777b538SAndroid Build Coastguard Worker 
UnescapeForHTML(StringPiece16 input)636*6777b538SAndroid Build Coastguard Worker std::u16string UnescapeForHTML(StringPiece16 input) {
637*6777b538SAndroid Build Coastguard Worker   static const struct {
638*6777b538SAndroid Build Coastguard Worker     const char* ampersand_code;
639*6777b538SAndroid Build Coastguard Worker     const char16_t replacement;
640*6777b538SAndroid Build Coastguard Worker   } kEscapeToChars[] = {
641*6777b538SAndroid Build Coastguard Worker       {"&lt;", '<'},   {"&gt;", '>'},   {"&amp;", '&'},
642*6777b538SAndroid Build Coastguard Worker       {"&quot;", '"'}, {"&#39;", '\''},
643*6777b538SAndroid Build Coastguard Worker   };
644*6777b538SAndroid Build Coastguard Worker   constexpr size_t kEscapeToCharsCount = std::size(kEscapeToChars);
645*6777b538SAndroid Build Coastguard Worker 
646*6777b538SAndroid Build Coastguard Worker   if (input.find(u"&") == std::string::npos)
647*6777b538SAndroid Build Coastguard Worker     return std::u16string(input);
648*6777b538SAndroid Build Coastguard Worker 
649*6777b538SAndroid Build Coastguard Worker   std::u16string ampersand_chars[kEscapeToCharsCount];
650*6777b538SAndroid Build Coastguard Worker   std::u16string text(input);
651*6777b538SAndroid Build Coastguard Worker   for (std::u16string::iterator iter = text.begin(); iter != text.end();
652*6777b538SAndroid Build Coastguard Worker        ++iter) {
653*6777b538SAndroid Build Coastguard Worker     if (*iter == '&') {
654*6777b538SAndroid Build Coastguard Worker       // Potential ampersand encode char.
655*6777b538SAndroid Build Coastguard Worker       size_t index = static_cast<size_t>(iter - text.begin());
656*6777b538SAndroid Build Coastguard Worker       for (size_t i = 0; i < std::size(kEscapeToChars); i++) {
657*6777b538SAndroid Build Coastguard Worker         if (ampersand_chars[i].empty()) {
658*6777b538SAndroid Build Coastguard Worker           ampersand_chars[i] = ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
659*6777b538SAndroid Build Coastguard Worker         }
660*6777b538SAndroid Build Coastguard Worker         if (text.find(ampersand_chars[i], index) == index) {
661*6777b538SAndroid Build Coastguard Worker           text.replace(
662*6777b538SAndroid Build Coastguard Worker               iter, iter + static_cast<ptrdiff_t>(ampersand_chars[i].length()),
663*6777b538SAndroid Build Coastguard Worker               1, kEscapeToChars[i].replacement);
664*6777b538SAndroid Build Coastguard Worker           break;
665*6777b538SAndroid Build Coastguard Worker         }
666*6777b538SAndroid Build Coastguard Worker       }
667*6777b538SAndroid Build Coastguard Worker     }
668*6777b538SAndroid Build Coastguard Worker   }
669*6777b538SAndroid Build Coastguard Worker   return text;
670*6777b538SAndroid Build Coastguard Worker }
671*6777b538SAndroid Build Coastguard Worker 
672*6777b538SAndroid Build Coastguard Worker }  // namespace base
673