1*6777b538SAndroid Build Coastguard Worker // Copyright 2020 The Chromium Authors
2*6777b538SAndroid Build Coastguard Worker // Use of this source code is governed by a BSD-style license that can be
3*6777b538SAndroid Build Coastguard Worker // found in the LICENSE file.
4*6777b538SAndroid Build Coastguard Worker
5*6777b538SAndroid Build Coastguard Worker #include "base/strings/escape.h"
6*6777b538SAndroid Build Coastguard Worker
7*6777b538SAndroid Build Coastguard Worker #include <ostream>
8*6777b538SAndroid Build Coastguard Worker
9*6777b538SAndroid Build Coastguard Worker #include "base/check_op.h"
10*6777b538SAndroid Build Coastguard Worker #include "base/feature_list.h"
11*6777b538SAndroid Build Coastguard Worker #include "base/features.h"
12*6777b538SAndroid Build Coastguard Worker #include "base/strings/string_number_conversions.h"
13*6777b538SAndroid Build Coastguard Worker #include "base/strings/string_piece.h"
14*6777b538SAndroid Build Coastguard Worker #include "base/strings/string_util.h"
15*6777b538SAndroid Build Coastguard Worker #include "base/strings/utf_string_conversion_utils.h"
16*6777b538SAndroid Build Coastguard Worker #include "base/strings/utf_string_conversions.h"
17*6777b538SAndroid Build Coastguard Worker #include "base/third_party/icu/icu_utf.h"
18*6777b538SAndroid Build Coastguard Worker
19*6777b538SAndroid Build Coastguard Worker namespace base {
20*6777b538SAndroid Build Coastguard Worker
21*6777b538SAndroid Build Coastguard Worker namespace {
22*6777b538SAndroid Build Coastguard Worker
// A compact bit set over all 256 possible byte values.
//
// Bit (c & 31) of word map[c >> 5] is set when byte |c| belongs to the set, so
// a membership query costs one shift, one mask and one array load.
struct Charmap {
  // Returns true if byte |c| is a member of this set.
  bool Contains(unsigned char c) const {
    // Build the mask from an unsigned 32-bit one: for (c & 31) == 31 a plain
    // `1 << 31` would shift into the sign bit of a signed int before being
    // converted back to unsigned for the AND.
    const uint32_t mask = uint32_t{1} << (c & 31);
    return (map[c >> 5] & mask) != 0;
  }

  // Eight 32-bit words, 256 bits in total; word i covers bytes
  // [32 * i, 32 * i + 31].
  uint32_t map[8];
};
34*6777b538SAndroid Build Coastguard Worker
35*6777b538SAndroid Build Coastguard Worker // Given text to escape and a Charmap defining which values to escape,
36*6777b538SAndroid Build Coastguard Worker // return an escaped string. If use_plus is true, spaces are converted
37*6777b538SAndroid Build Coastguard Worker // to +, otherwise, if spaces are in the charmap, they are converted to
38*6777b538SAndroid Build Coastguard Worker // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
39*6777b538SAndroid Build Coastguard Worker // '%' is in the charmap, it is converted to %25.
Escape(StringPiece text,const Charmap & charmap,bool use_plus,bool keep_escaped=false)40*6777b538SAndroid Build Coastguard Worker std::string Escape(StringPiece text,
41*6777b538SAndroid Build Coastguard Worker const Charmap& charmap,
42*6777b538SAndroid Build Coastguard Worker bool use_plus,
43*6777b538SAndroid Build Coastguard Worker bool keep_escaped = false) {
44*6777b538SAndroid Build Coastguard Worker std::string escaped;
45*6777b538SAndroid Build Coastguard Worker escaped.reserve(text.length() * 3);
46*6777b538SAndroid Build Coastguard Worker for (size_t i = 0; i < text.length(); ++i) {
47*6777b538SAndroid Build Coastguard Worker unsigned char c = static_cast<unsigned char>(text[i]);
48*6777b538SAndroid Build Coastguard Worker if (use_plus && ' ' == c) {
49*6777b538SAndroid Build Coastguard Worker escaped.push_back('+');
50*6777b538SAndroid Build Coastguard Worker } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
51*6777b538SAndroid Build Coastguard Worker IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) {
52*6777b538SAndroid Build Coastguard Worker escaped.push_back('%');
53*6777b538SAndroid Build Coastguard Worker } else if (charmap.Contains(c)) {
54*6777b538SAndroid Build Coastguard Worker escaped.push_back('%');
55*6777b538SAndroid Build Coastguard Worker AppendHexEncodedByte(c, escaped);
56*6777b538SAndroid Build Coastguard Worker } else {
57*6777b538SAndroid Build Coastguard Worker escaped.push_back(static_cast<char>(c));
58*6777b538SAndroid Build Coastguard Worker }
59*6777b538SAndroid Build Coastguard Worker }
60*6777b538SAndroid Build Coastguard Worker return escaped;
61*6777b538SAndroid Build Coastguard Worker }
62*6777b538SAndroid Build Coastguard Worker
63*6777b538SAndroid Build Coastguard Worker // Convert a character |c| to a form that will not be mistaken as HTML.
64*6777b538SAndroid Build Coastguard Worker template <class str>
AppendEscapedCharForHTMLImpl(typename str::value_type c,str * output)65*6777b538SAndroid Build Coastguard Worker void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
66*6777b538SAndroid Build Coastguard Worker static constexpr struct {
67*6777b538SAndroid Build Coastguard Worker char key;
68*6777b538SAndroid Build Coastguard Worker StringPiece replacement;
69*6777b538SAndroid Build Coastguard Worker } kCharsToEscape[] = {
70*6777b538SAndroid Build Coastguard Worker {'<', "<"}, {'>', ">"}, {'&', "&"},
71*6777b538SAndroid Build Coastguard Worker {'"', """}, {'\'', "'"},
72*6777b538SAndroid Build Coastguard Worker };
73*6777b538SAndroid Build Coastguard Worker for (const auto& char_to_escape : kCharsToEscape) {
74*6777b538SAndroid Build Coastguard Worker if (c == char_to_escape.key) {
75*6777b538SAndroid Build Coastguard Worker output->append(std::begin(char_to_escape.replacement),
76*6777b538SAndroid Build Coastguard Worker std::end(char_to_escape.replacement));
77*6777b538SAndroid Build Coastguard Worker return;
78*6777b538SAndroid Build Coastguard Worker }
79*6777b538SAndroid Build Coastguard Worker }
80*6777b538SAndroid Build Coastguard Worker output->push_back(c);
81*6777b538SAndroid Build Coastguard Worker }
82*6777b538SAndroid Build Coastguard Worker
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl(), so that the result will not be interpreted
// as HTML. |T| is the string-like input type and |CharT| its character type.
template <typename T, typename CharT = typename T::value_type>
std::basic_string<CharT> EscapeForHTMLImpl(T input) {
  std::basic_string<CharT> escaped;
  // Sized for the case in which nothing needs escaping; the string grows on
  // demand otherwise.
  escaped.reserve(input.size());

  for (auto it = input.begin(); it != input.end(); ++it) {
    AppendEscapedCharForHTMLImpl(*it, &escaped);
  }

  return escaped;
}
95*6777b538SAndroid Build Coastguard Worker
// The tables below are the sets of bytes that Escape() percent-encodes for a
// given context. Layout (see Charmap::Contains()): word i covers bytes
// [32 * i, 32 * i + 31] and bit (c & 31) of that word is set when byte |c| must
// be escaped. Hence a first word of 0xffffffff means "all control characters
// 0x00-0x1F" and 0xffffffff in the last four words means "all non-7bit bytes
// 0x80-0xFF".

// Everything except alphanumerics and -._~
// See RFC 3986 for the list of unreserved characters.
static const Charmap kUnreservedCharmap = {
    {0xffffffffL, 0xfc009fffL, 0x78000001L, 0xb8000001L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};

// Everything except alphanumerics and !'()*-._~
// See RFC 2396 for the list of reserved characters.
static const Charmap kQueryCharmap = {{0xffffffffL, 0xfc00987dL, 0x78000001L,
                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
                                       0xffffffffL, 0xffffffffL}};

// non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|}
static const Charmap kPathCharmap = {{0xffffffffL, 0xd400002dL, 0x78000000L,
                                      0xb8000001L, 0xffffffffL, 0xffffffffL,
                                      0xffffffffL, 0xffffffffL}};

#if BUILDFLAG(IS_APPLE)
// non-printable, non-7bit, and (including space) "#%<>[\]^`{|}
static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L,
                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
                                       0xffffffffL, 0xffffffffL}};
#endif  // BUILDFLAG(IS_APPLE)

// non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
// NOTE(review): the table and the list above disagree in two places. The bit
// for '!' (bit 1 of 0xf80008fd) is clear, so '!' is NOT escaped, while the bit
// for '@' (bit 0 of 0x78000001) is set, so '@' IS escaped. Confirm which of
// the two is intended before relying on the comment.
static const Charmap kUrlEscape = {{0xffffffffL, 0xf80008fdL, 0x78000001L,
                                    0xb8000001L, 0xffffffffL, 0xffffffffL,
                                    0xffffffffL, 0xffffffffL}};

// non-7bit, as well as %.
// (0x00000020 in the second word is bit 5 of the 0x20-0x3F range, i.e. '%'.)
static const Charmap kNonASCIICharmapAndPercent = {
    {0x00000000L, 0x00000020L, 0x00000000L, 0x00000000L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};

// non-7bit
static const Charmap kNonASCIICharmap = {{0x00000000L, 0x00000000L, 0x00000000L,
                                          0x00000000L, 0xffffffffL, 0xffffffffL,
                                          0xffffffffL, 0xffffffffL}};

// Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
// !'()*-._~#[]
static const Charmap kExternalHandlerCharmap = {
    {0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};
140*6777b538SAndroid Build Coastguard Worker
// Lookup table indexed by ASCII code (0-127). An entry is nonzero when the
// corresponding character may be unescaped in normal URLs, and zero for the
// characters that may change the parsing of a URL, which we therefore don't
// want to unescape by default. In many cases we won't want to unescape spaces,
// but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. This means that unescaping will change the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The list of characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc. Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not
// allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid url according to spec into an
// invalid one.
//
// ShouldUnescapeCodePoint() consults this table first for ASCII code points;
// its |rules| flags can additionally allow characters that are zero here.
// clang-format off
const char kUrlUnescape[128] = {
    // Null, control chars...
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // ' ' ! " # $ % & ' ( ) * + , - . /
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
    // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
    // @ A B C D E F G H I J K L M N O
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // P Q R S T U V W X Y Z [ \ ] ^ _
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
    // ` a b c d e f g h i j k l m n o
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // p q r s t u v w x y z { | } ~ <DEL>
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
};
// clang-format on
180*6777b538SAndroid Build Coastguard Worker
181*6777b538SAndroid Build Coastguard Worker // Attempts to unescape the sequence at |index| within |escaped_text|. If
182*6777b538SAndroid Build Coastguard Worker // successful, sets |value| to the unescaped value. Returns whether
183*6777b538SAndroid Build Coastguard Worker // unescaping succeeded.
UnescapeUnsignedByteAtIndex(StringPiece escaped_text,size_t index,unsigned char * value)184*6777b538SAndroid Build Coastguard Worker bool UnescapeUnsignedByteAtIndex(StringPiece escaped_text,
185*6777b538SAndroid Build Coastguard Worker size_t index,
186*6777b538SAndroid Build Coastguard Worker unsigned char* value) {
187*6777b538SAndroid Build Coastguard Worker if ((index + 2) >= escaped_text.size())
188*6777b538SAndroid Build Coastguard Worker return false;
189*6777b538SAndroid Build Coastguard Worker if (escaped_text[index] != '%')
190*6777b538SAndroid Build Coastguard Worker return false;
191*6777b538SAndroid Build Coastguard Worker char most_sig_digit(escaped_text[index + 1]);
192*6777b538SAndroid Build Coastguard Worker char least_sig_digit(escaped_text[index + 2]);
193*6777b538SAndroid Build Coastguard Worker if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
194*6777b538SAndroid Build Coastguard Worker *value = static_cast<unsigned char>(HexDigitToInt(most_sig_digit) * 16 +
195*6777b538SAndroid Build Coastguard Worker HexDigitToInt(least_sig_digit));
196*6777b538SAndroid Build Coastguard Worker return true;
197*6777b538SAndroid Build Coastguard Worker }
198*6777b538SAndroid Build Coastguard Worker return false;
199*6777b538SAndroid Build Coastguard Worker }
200*6777b538SAndroid Build Coastguard Worker
201*6777b538SAndroid Build Coastguard Worker // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
202*6777b538SAndroid Build Coastguard Worker // the specified index. On success, returns true, sets |code_point_out| to be
203*6777b538SAndroid Build Coastguard Worker // the character's code point and |unescaped_out| to be the unescaped UTF-8
204*6777b538SAndroid Build Coastguard Worker // string. |unescaped_out| will always be 1/3rd the length of the substring of
205*6777b538SAndroid Build Coastguard Worker // |escaped_text| that corresponds to the unescaped character.
UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,size_t index,base_icu::UChar32 * code_point_out,std::string * unescaped_out)206*6777b538SAndroid Build Coastguard Worker bool UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,
207*6777b538SAndroid Build Coastguard Worker size_t index,
208*6777b538SAndroid Build Coastguard Worker base_icu::UChar32* code_point_out,
209*6777b538SAndroid Build Coastguard Worker std::string* unescaped_out) {
210*6777b538SAndroid Build Coastguard Worker DCHECK(unescaped_out->empty());
211*6777b538SAndroid Build Coastguard Worker
212*6777b538SAndroid Build Coastguard Worker unsigned char bytes[CBU8_MAX_LENGTH];
213*6777b538SAndroid Build Coastguard Worker if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0]))
214*6777b538SAndroid Build Coastguard Worker return false;
215*6777b538SAndroid Build Coastguard Worker
216*6777b538SAndroid Build Coastguard Worker size_t num_bytes = 1;
217*6777b538SAndroid Build Coastguard Worker
218*6777b538SAndroid Build Coastguard Worker // If this is a lead byte, need to collect trail bytes as well.
219*6777b538SAndroid Build Coastguard Worker if (CBU8_IS_LEAD(bytes[0])) {
220*6777b538SAndroid Build Coastguard Worker // Look for the last trail byte of the UTF-8 character. Give up once
221*6777b538SAndroid Build Coastguard Worker // reach max character length number of bytes, or hit an unescaped
222*6777b538SAndroid Build Coastguard Worker // character. No need to check length of escaped_text, as
223*6777b538SAndroid Build Coastguard Worker // UnescapeUnsignedByteAtIndex checks lengths.
224*6777b538SAndroid Build Coastguard Worker while (num_bytes < std::size(bytes) &&
225*6777b538SAndroid Build Coastguard Worker UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3,
226*6777b538SAndroid Build Coastguard Worker &bytes[num_bytes]) &&
227*6777b538SAndroid Build Coastguard Worker CBU8_IS_TRAIL(bytes[num_bytes])) {
228*6777b538SAndroid Build Coastguard Worker ++num_bytes;
229*6777b538SAndroid Build Coastguard Worker }
230*6777b538SAndroid Build Coastguard Worker }
231*6777b538SAndroid Build Coastguard Worker
232*6777b538SAndroid Build Coastguard Worker size_t char_index = 0;
233*6777b538SAndroid Build Coastguard Worker // Check if the unicode "character" that was just unescaped is valid.
234*6777b538SAndroid Build Coastguard Worker if (!ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes,
235*6777b538SAndroid Build Coastguard Worker &char_index, code_point_out)) {
236*6777b538SAndroid Build Coastguard Worker return false;
237*6777b538SAndroid Build Coastguard Worker }
238*6777b538SAndroid Build Coastguard Worker
239*6777b538SAndroid Build Coastguard Worker // It's possible that a prefix of |bytes| forms a valid UTF-8 character,
240*6777b538SAndroid Build Coastguard Worker // and the rest are not valid UTF-8, so need to update |num_bytes| based
241*6777b538SAndroid Build Coastguard Worker // on the result of ReadUnicodeCharacter().
242*6777b538SAndroid Build Coastguard Worker num_bytes = char_index + 1;
243*6777b538SAndroid Build Coastguard Worker *unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes);
244*6777b538SAndroid Build Coastguard Worker return true;
245*6777b538SAndroid Build Coastguard Worker }
246*6777b538SAndroid Build Coastguard Worker
// Takes a Unicode code point and returns true if it should be unescaped,
// based on |rules|.
//
// ASCII code points (0-0x7F) are unescaped when kUrlUnescape allows it, or
// when one of the following |rules| flags covers them:
//   - SPACES: ' '.
//   - PATH_SEPARATORS: '/' and '\'.
//   - URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS: any code point above ' ' other
//     than '/' and '\'.
// All other code points are unescaped unless they appear in the blocklist
// below; |rules| is not consulted for them.
bool ShouldUnescapeCodePoint(UnescapeRule::Type rules,
                             base_icu::UChar32 code_point) {
  // If this is an ASCII character, use the lookup table.
  if (code_point >= 0 && code_point < 0x80) {
    return kUrlUnescape[static_cast<size_t>(code_point)] ||
           // Allow some additional unescaping when flags are set.
           (code_point == ' ' && (rules & UnescapeRule::SPACES)) ||
           // Allow any of the prohibited but non-control characters when doing
           // "special" chars.
           ((code_point == '/' || code_point == '\\') &&
            (rules & UnescapeRule::PATH_SEPARATORS)) ||
           (code_point > ' ' && code_point != '/' && code_point != '\\' &&
            (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS));
  }

  // Compare the code point against a list of characters that can be used
  // to spoof other URLs.
  //
  // Can't use icu to make this cleaner, because Cronet cannot depend on
  // icu, and currently uses this file.
  // TODO(https://crbug.com/829873): Try to make this use icu, both to
  // protect against regressions as the Unicode standard is updated and to
  // reduce the number of long lists of characters.
  return !(
      // Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
      // control characters are not allowed to appear unescaped in URLs.
      code_point == 0x200E ||  // LEFT-TO-RIGHT MARK         (%E2%80%8E)
      code_point == 0x200F ||  // RIGHT-TO-LEFT MARK         (%E2%80%8F)
      code_point == 0x202A ||  // LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
      code_point == 0x202B ||  // RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
      code_point == 0x202C ||  // POP DIRECTIONAL FORMATTING (%E2%80%AC)
      code_point == 0x202D ||  // LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
      code_point == 0x202E ||  // RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)

      // The Unicode Technical Report (TR9) as referenced by RFC 3987 above has
      // since added some new BiDi control characters that are not safe to
      // unescape. http://www.unicode.org/reports/tr9
      code_point == 0x061C ||  // ARABIC LETTER MARK         (%D8%9C)
      code_point == 0x2066 ||  // LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
      code_point == 0x2067 ||  // RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
      code_point == 0x2068 ||  // FIRST STRONG ISOLATE       (%E2%81%A8)
      code_point == 0x2069 ||  // POP DIRECTIONAL ISOLATE    (%E2%81%A9)

      // The following spoofable characters are also banned in unescaped URLs,
      // because they could be used to imitate parts of a web browser's UI.
      code_point == 0x1F50F ||  // LOCK WITH INK PEN         (%F0%9F%94%8F)
      code_point == 0x1F510 ||  // CLOSED LOCK WITH KEY      (%F0%9F%94%90)
      code_point == 0x1F512 ||  // LOCK                      (%F0%9F%94%92)
      code_point == 0x1F513 ||  // OPEN LOCK                 (%F0%9F%94%93)

      // Spaces are also banned, as they can be used to scroll text out of view.
      code_point == 0x0085 ||  // NEXT LINE                  (%C2%85)
      code_point == 0x00A0 ||  // NO-BREAK SPACE             (%C2%A0)
      code_point == 0x1680 ||  // OGHAM SPACE MARK           (%E1%9A%80)
      code_point == 0x2000 ||  // EN QUAD                    (%E2%80%80)
      code_point == 0x2001 ||  // EM QUAD                    (%E2%80%81)
      code_point == 0x2002 ||  // EN SPACE                   (%E2%80%82)
      code_point == 0x2003 ||  // EM SPACE                   (%E2%80%83)
      code_point == 0x2004 ||  // THREE-PER-EM SPACE         (%E2%80%84)
      code_point == 0x2005 ||  // FOUR-PER-EM SPACE          (%E2%80%85)
      code_point == 0x2006 ||  // SIX-PER-EM SPACE           (%E2%80%86)
      code_point == 0x2007 ||  // FIGURE SPACE               (%E2%80%87)
      code_point == 0x2008 ||  // PUNCTUATION SPACE          (%E2%80%88)
      code_point == 0x2009 ||  // THIN SPACE                 (%E2%80%89)
      code_point == 0x200A ||  // HAIR SPACE                 (%E2%80%8A)
      code_point == 0x2028 ||  // LINE SEPARATOR             (%E2%80%A8)
      code_point == 0x2029 ||  // PARAGRAPH SEPARATOR        (%E2%80%A9)
      code_point == 0x202F ||  // NARROW NO-BREAK SPACE      (%E2%80%AF)
      code_point == 0x205F ||  // MEDIUM MATHEMATICAL SPACE  (%E2%81%9F)
      code_point == 0x3000 ||  // IDEOGRAPHIC SPACE          (%E3%80%80)
      // U+2800 is rendered as a space, but is not considered whitespace (see
      // crbug.com/1068531).
      code_point == 0x2800 ||  // BRAILLE PATTERN BLANK      (%E2%A0%80)

      // Default Ignorable ([:Default_Ignorable_Code_Point=Yes:]) and Format
      // characters ([:Cf:]) are also banned (see crbug.com/824715).
      code_point == 0x00AD ||  // SOFT HYPHEN               (%C2%AD)
      code_point == 0x034F ||  // COMBINING GRAPHEME JOINER (%CD%8F)
      // Arabic number formatting
      (code_point >= 0x0600 && code_point <= 0x0605) ||
      // U+061C is already banned as a BiDi control character.
      code_point == 0x06DD ||  // ARABIC END OF AYAH          (%DB%9D)
      code_point == 0x070F ||  // SYRIAC ABBREVIATION MARK    (%DC%8F)
      code_point == 0x08E2 ||  // ARABIC DISPUTED END OF AYAH (%E0%A3%A2)
      code_point == 0x115F ||  // HANGUL CHOSEONG FILLER      (%E1%85%9F)
      code_point == 0x1160 ||  // HANGUL JUNGSEONG FILLER     (%E1%85%A0)
      code_point == 0x17B4 ||  // KHMER VOWEL INHERENT AQ     (%E1%9E%B4)
      code_point == 0x17B5 ||  // KHMER VOWEL INHERENT AA     (%E1%9E%B5)
      code_point == 0x180B ||  // MONGOLIAN FREE VARIATION SELECTOR ONE
                               // (%E1%A0%8B)
      code_point == 0x180C ||  // MONGOLIAN FREE VARIATION SELECTOR TWO
                               // (%E1%A0%8C)
      code_point == 0x180D ||  // MONGOLIAN FREE VARIATION SELECTOR THREE
                               // (%E1%A0%8D)
      code_point == 0x180E ||  // MONGOLIAN VOWEL SEPARATOR   (%E1%A0%8E)
      code_point == 0x200B ||  // ZERO WIDTH SPACE            (%E2%80%8B)
      code_point == 0x200C ||  // ZERO WIDTH NON-JOINER       (%E2%80%8C)
      code_point == 0x200D ||  // ZERO WIDTH JOINER           (%E2%80%8D)
      // U+200E, U+200F, U+202A--202E, and U+2066--2069 are already banned as
      // BiDi control characters.
      code_point == 0x2060 ||  // WORD JOINER          (%E2%81%A0)
      code_point == 0x2061 ||  // FUNCTION APPLICATION (%E2%81%A1)
      code_point == 0x2062 ||  // INVISIBLE TIMES      (%E2%81%A2)
      code_point == 0x2063 ||  // INVISIBLE SEPARATOR  (%E2%81%A3)
      code_point == 0x2064 ||  // INVISIBLE PLUS       (%E2%81%A4)
      code_point == 0x2065 ||  // null (%E2%81%A5)
      // 0x2066--0x2069 are already banned as BiDi control characters.
      // General Punctuation - Deprecated (U+206A--206F)
      (code_point >= 0x206A && code_point <= 0x206F) ||
      code_point == 0x3164 ||                            // HANGUL FILLER (%E3%85%A4)
      (code_point >= 0xFFF0 && code_point <= 0xFFF8) ||  // null
      // Variation selectors (%EF%B8%80 -- %EF%B8%8F)
      (code_point >= 0xFE00 && code_point <= 0xFE0F) ||
      code_point == 0xFEFF ||   // ZERO WIDTH NO-BREAK SPACE (%EF%BB%BF)
      code_point == 0xFFA0 ||   // HALFWIDTH HANGUL FILLER (%EF%BE%A0)
      code_point == 0xFFF9 ||   // INTERLINEAR ANNOTATION ANCHOR     (%EF%BF%B9)
      code_point == 0xFFFA ||   // INTERLINEAR ANNOTATION SEPARATOR  (%EF%BF%BA)
      code_point == 0xFFFB ||   // INTERLINEAR ANNOTATION TERMINATOR (%EF%BF%BB)
      code_point == 0x110BD ||  // KAITHI NUMBER SIGN       (%F0%91%82%BD)
      code_point == 0x110CD ||  // KAITHI NUMBER SIGN ABOVE (%F0%91%83%8D)
      // Egyptian hieroglyph formatting (%F0%93%90%B0 -- %F0%93%90%B8)
      (code_point >= 0x13430 && code_point <= 0x13438) ||
      // Shorthand format controls (%F0%9B%B2%A0 -- %F0%9B%B2%A3)
      (code_point >= 0x1BCA0 && code_point <= 0x1BCA3) ||
      // Beams and slurs (%F0%9D%85%B3 -- %F0%9D%85%BA)
      (code_point >= 0x1D173 && code_point <= 0x1D17A) ||
      // Tags, Variation Selectors, nulls
      (code_point >= 0xE0000 && code_point <= 0xE0FFF));
}
378*6777b538SAndroid Build Coastguard Worker
379*6777b538SAndroid Build Coastguard Worker // Unescapes |escaped_text| according to |rules|, returning the resulting
380*6777b538SAndroid Build Coastguard Worker // string. Fills in an |adjustments| parameter, if non-nullptr, so it reflects
381*6777b538SAndroid Build Coastguard Worker // the alterations done to the string that are not one-character-to-one-
382*6777b538SAndroid Build Coastguard Worker // character. The resulting |adjustments| will always be sorted by increasing
383*6777b538SAndroid Build Coastguard Worker // offset.
UnescapeURLWithAdjustmentsImpl(StringPiece escaped_text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)384*6777b538SAndroid Build Coastguard Worker std::string UnescapeURLWithAdjustmentsImpl(
385*6777b538SAndroid Build Coastguard Worker StringPiece escaped_text,
386*6777b538SAndroid Build Coastguard Worker UnescapeRule::Type rules,
387*6777b538SAndroid Build Coastguard Worker OffsetAdjuster::Adjustments* adjustments) {
388*6777b538SAndroid Build Coastguard Worker if (adjustments)
389*6777b538SAndroid Build Coastguard Worker adjustments->clear();
390*6777b538SAndroid Build Coastguard Worker // Do not unescape anything, return the |escaped_text| text.
391*6777b538SAndroid Build Coastguard Worker if (rules == UnescapeRule::NONE)
392*6777b538SAndroid Build Coastguard Worker return std::string(escaped_text);
393*6777b538SAndroid Build Coastguard Worker
394*6777b538SAndroid Build Coastguard Worker // The output of the unescaping is always smaller than the input, so we can
395*6777b538SAndroid Build Coastguard Worker // reserve the input size to make sure we have enough buffer and don't have
396*6777b538SAndroid Build Coastguard Worker // to allocate in the loop below.
397*6777b538SAndroid Build Coastguard Worker std::string result;
398*6777b538SAndroid Build Coastguard Worker result.reserve(escaped_text.length());
399*6777b538SAndroid Build Coastguard Worker
400*6777b538SAndroid Build Coastguard Worker // Locations of adjusted text.
401*6777b538SAndroid Build Coastguard Worker for (size_t i = 0, max = escaped_text.size(); i < max;) {
402*6777b538SAndroid Build Coastguard Worker // Try to unescape the character.
403*6777b538SAndroid Build Coastguard Worker base_icu::UChar32 code_point;
404*6777b538SAndroid Build Coastguard Worker std::string unescaped;
405*6777b538SAndroid Build Coastguard Worker if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point,
406*6777b538SAndroid Build Coastguard Worker &unescaped)) {
407*6777b538SAndroid Build Coastguard Worker // Check if the next character can be unescaped, but not as a valid UTF-8
408*6777b538SAndroid Build Coastguard Worker // character. In that case, just unescaped and write the non-sense
409*6777b538SAndroid Build Coastguard Worker // character.
410*6777b538SAndroid Build Coastguard Worker //
411*6777b538SAndroid Build Coastguard Worker // TODO(https://crbug.com/829868): Do not unescape illegal UTF-8
412*6777b538SAndroid Build Coastguard Worker // sequences.
413*6777b538SAndroid Build Coastguard Worker unsigned char non_utf8_byte;
414*6777b538SAndroid Build Coastguard Worker if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) {
415*6777b538SAndroid Build Coastguard Worker result.push_back(static_cast<char>(non_utf8_byte));
416*6777b538SAndroid Build Coastguard Worker if (adjustments)
417*6777b538SAndroid Build Coastguard Worker adjustments->push_back(OffsetAdjuster::Adjustment(i, 3, 1));
418*6777b538SAndroid Build Coastguard Worker i += 3;
419*6777b538SAndroid Build Coastguard Worker continue;
420*6777b538SAndroid Build Coastguard Worker }
421*6777b538SAndroid Build Coastguard Worker
422*6777b538SAndroid Build Coastguard Worker // Character is not escaped, so append as is, unless it's a '+' and
423*6777b538SAndroid Build Coastguard Worker // REPLACE_PLUS_WITH_SPACE is being applied.
424*6777b538SAndroid Build Coastguard Worker if (escaped_text[i] == '+' &&
425*6777b538SAndroid Build Coastguard Worker (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) {
426*6777b538SAndroid Build Coastguard Worker result.push_back(' ');
427*6777b538SAndroid Build Coastguard Worker } else {
428*6777b538SAndroid Build Coastguard Worker result.push_back(escaped_text[i]);
429*6777b538SAndroid Build Coastguard Worker }
430*6777b538SAndroid Build Coastguard Worker ++i;
431*6777b538SAndroid Build Coastguard Worker continue;
432*6777b538SAndroid Build Coastguard Worker }
433*6777b538SAndroid Build Coastguard Worker
434*6777b538SAndroid Build Coastguard Worker DCHECK(!unescaped.empty());
435*6777b538SAndroid Build Coastguard Worker
436*6777b538SAndroid Build Coastguard Worker if (!ShouldUnescapeCodePoint(rules, code_point)) {
437*6777b538SAndroid Build Coastguard Worker // If it's a valid UTF-8 character, but not safe to unescape, copy all
438*6777b538SAndroid Build Coastguard Worker // bytes directly.
439*6777b538SAndroid Build Coastguard Worker result.append(escaped_text.substr(i, 3 * unescaped.length()));
440*6777b538SAndroid Build Coastguard Worker i += unescaped.length() * 3;
441*6777b538SAndroid Build Coastguard Worker continue;
442*6777b538SAndroid Build Coastguard Worker }
443*6777b538SAndroid Build Coastguard Worker
444*6777b538SAndroid Build Coastguard Worker // If the code point is allowed, and append the entire unescaped character.
445*6777b538SAndroid Build Coastguard Worker result.append(unescaped);
446*6777b538SAndroid Build Coastguard Worker if (adjustments) {
447*6777b538SAndroid Build Coastguard Worker for (size_t j = 0; j < unescaped.length(); ++j) {
448*6777b538SAndroid Build Coastguard Worker adjustments->push_back(OffsetAdjuster::Adjustment(i + j * 3, 3, 1));
449*6777b538SAndroid Build Coastguard Worker }
450*6777b538SAndroid Build Coastguard Worker }
451*6777b538SAndroid Build Coastguard Worker i += 3 * unescaped.length();
452*6777b538SAndroid Build Coastguard Worker }
453*6777b538SAndroid Build Coastguard Worker
454*6777b538SAndroid Build Coastguard Worker return result;
455*6777b538SAndroid Build Coastguard Worker }
456*6777b538SAndroid Build Coastguard Worker
457*6777b538SAndroid Build Coastguard Worker } // namespace
458*6777b538SAndroid Build Coastguard Worker
EscapeAllExceptUnreserved(StringPiece text)459*6777b538SAndroid Build Coastguard Worker std::string EscapeAllExceptUnreserved(StringPiece text) {
460*6777b538SAndroid Build Coastguard Worker return Escape(text, kUnreservedCharmap, false);
461*6777b538SAndroid Build Coastguard Worker }
462*6777b538SAndroid Build Coastguard Worker
EscapeQueryParamValue(StringPiece text,bool use_plus)463*6777b538SAndroid Build Coastguard Worker std::string EscapeQueryParamValue(StringPiece text, bool use_plus) {
464*6777b538SAndroid Build Coastguard Worker return Escape(text, kQueryCharmap, use_plus);
465*6777b538SAndroid Build Coastguard Worker }
466*6777b538SAndroid Build Coastguard Worker
EscapePath(StringPiece path)467*6777b538SAndroid Build Coastguard Worker std::string EscapePath(StringPiece path) {
468*6777b538SAndroid Build Coastguard Worker return Escape(path, kPathCharmap, false);
469*6777b538SAndroid Build Coastguard Worker }
470*6777b538SAndroid Build Coastguard Worker
471*6777b538SAndroid Build Coastguard Worker #if BUILDFLAG(IS_APPLE)
EscapeNSURLPrecursor(StringPiece precursor)472*6777b538SAndroid Build Coastguard Worker std::string EscapeNSURLPrecursor(StringPiece precursor) {
473*6777b538SAndroid Build Coastguard Worker return Escape(precursor, kNSURLCharmap, false, true);
474*6777b538SAndroid Build Coastguard Worker }
475*6777b538SAndroid Build Coastguard Worker #endif // BUILDFLAG(IS_APPLE)
476*6777b538SAndroid Build Coastguard Worker
EscapeUrlEncodedData(StringPiece path,bool use_plus)477*6777b538SAndroid Build Coastguard Worker std::string EscapeUrlEncodedData(StringPiece path, bool use_plus) {
478*6777b538SAndroid Build Coastguard Worker return Escape(path, kUrlEscape, use_plus);
479*6777b538SAndroid Build Coastguard Worker }
480*6777b538SAndroid Build Coastguard Worker
EscapeNonASCIIAndPercent(StringPiece input)481*6777b538SAndroid Build Coastguard Worker std::string EscapeNonASCIIAndPercent(StringPiece input) {
482*6777b538SAndroid Build Coastguard Worker return Escape(input, kNonASCIICharmapAndPercent, false);
483*6777b538SAndroid Build Coastguard Worker }
484*6777b538SAndroid Build Coastguard Worker
EscapeNonASCII(StringPiece input)485*6777b538SAndroid Build Coastguard Worker std::string EscapeNonASCII(StringPiece input) {
486*6777b538SAndroid Build Coastguard Worker return Escape(input, kNonASCIICharmap, false);
487*6777b538SAndroid Build Coastguard Worker }
488*6777b538SAndroid Build Coastguard Worker
EscapeExternalHandlerValue(StringPiece text)489*6777b538SAndroid Build Coastguard Worker std::string EscapeExternalHandlerValue(StringPiece text) {
490*6777b538SAndroid Build Coastguard Worker return Escape(text, kExternalHandlerCharmap, false, true);
491*6777b538SAndroid Build Coastguard Worker }
492*6777b538SAndroid Build Coastguard Worker
AppendEscapedCharForHTML(char c,std::string * output)493*6777b538SAndroid Build Coastguard Worker void AppendEscapedCharForHTML(char c, std::string* output) {
494*6777b538SAndroid Build Coastguard Worker AppendEscapedCharForHTMLImpl(c, output);
495*6777b538SAndroid Build Coastguard Worker }
496*6777b538SAndroid Build Coastguard Worker
EscapeForHTML(StringPiece input)497*6777b538SAndroid Build Coastguard Worker std::string EscapeForHTML(StringPiece input) {
498*6777b538SAndroid Build Coastguard Worker return EscapeForHTMLImpl(input);
499*6777b538SAndroid Build Coastguard Worker }
500*6777b538SAndroid Build Coastguard Worker
EscapeForHTML(StringPiece16 input)501*6777b538SAndroid Build Coastguard Worker std::u16string EscapeForHTML(StringPiece16 input) {
502*6777b538SAndroid Build Coastguard Worker return EscapeForHTMLImpl(input);
503*6777b538SAndroid Build Coastguard Worker }
504*6777b538SAndroid Build Coastguard Worker
UnescapeURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)505*6777b538SAndroid Build Coastguard Worker std::string UnescapeURLComponent(StringPiece escaped_text,
506*6777b538SAndroid Build Coastguard Worker UnescapeRule::Type rules) {
507*6777b538SAndroid Build Coastguard Worker return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, nullptr);
508*6777b538SAndroid Build Coastguard Worker }
509*6777b538SAndroid Build Coastguard Worker
UnescapeAndDecodeUTF8URLComponentWithAdjustments(StringPiece text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)510*6777b538SAndroid Build Coastguard Worker std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments(
511*6777b538SAndroid Build Coastguard Worker StringPiece text,
512*6777b538SAndroid Build Coastguard Worker UnescapeRule::Type rules,
513*6777b538SAndroid Build Coastguard Worker OffsetAdjuster::Adjustments* adjustments) {
514*6777b538SAndroid Build Coastguard Worker std::u16string result;
515*6777b538SAndroid Build Coastguard Worker OffsetAdjuster::Adjustments unescape_adjustments;
516*6777b538SAndroid Build Coastguard Worker std::string unescaped_url(
517*6777b538SAndroid Build Coastguard Worker UnescapeURLWithAdjustmentsImpl(text, rules, &unescape_adjustments));
518*6777b538SAndroid Build Coastguard Worker if (UTF8ToUTF16WithAdjustments(unescaped_url.data(), unescaped_url.length(),
519*6777b538SAndroid Build Coastguard Worker &result, adjustments)) {
520*6777b538SAndroid Build Coastguard Worker // Character set looks like it's valid.
521*6777b538SAndroid Build Coastguard Worker if (adjustments) {
522*6777b538SAndroid Build Coastguard Worker OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
523*6777b538SAndroid Build Coastguard Worker adjustments);
524*6777b538SAndroid Build Coastguard Worker }
525*6777b538SAndroid Build Coastguard Worker return result;
526*6777b538SAndroid Build Coastguard Worker }
527*6777b538SAndroid Build Coastguard Worker // Character set is not valid. Return the escaped version.
528*6777b538SAndroid Build Coastguard Worker return UTF8ToUTF16WithAdjustments(text, adjustments);
529*6777b538SAndroid Build Coastguard Worker }
530*6777b538SAndroid Build Coastguard Worker
UnescapeBinaryURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)531*6777b538SAndroid Build Coastguard Worker std::string UnescapeBinaryURLComponent(StringPiece escaped_text,
532*6777b538SAndroid Build Coastguard Worker UnescapeRule::Type rules) {
533*6777b538SAndroid Build Coastguard Worker // Only NORMAL and REPLACE_PLUS_WITH_SPACE are supported.
534*6777b538SAndroid Build Coastguard Worker DCHECK(rules != UnescapeRule::NONE);
535*6777b538SAndroid Build Coastguard Worker DCHECK(!(rules &
536*6777b538SAndroid Build Coastguard Worker ~(UnescapeRule::NORMAL | UnescapeRule::REPLACE_PLUS_WITH_SPACE)));
537*6777b538SAndroid Build Coastguard Worker
538*6777b538SAndroid Build Coastguard Worker // It is not possible to read the feature state when this function is invoked
539*6777b538SAndroid Build Coastguard Worker // before FeatureList initialization. In that case, fallback to the feature's
540*6777b538SAndroid Build Coastguard Worker // default state.
541*6777b538SAndroid Build Coastguard Worker //
542*6777b538SAndroid Build Coastguard Worker // TODO(crbug.com/1321924): Cleanup this feature.
543*6777b538SAndroid Build Coastguard Worker const bool optimize_data_urls_feature_is_enabled =
544*6777b538SAndroid Build Coastguard Worker base::FeatureList::GetInstance()
545*6777b538SAndroid Build Coastguard Worker ? base::FeatureList::IsEnabled(features::kOptimizeDataUrls)
546*6777b538SAndroid Build Coastguard Worker : features::kOptimizeDataUrls.default_state ==
547*6777b538SAndroid Build Coastguard Worker base::FEATURE_ENABLED_BY_DEFAULT;
548*6777b538SAndroid Build Coastguard Worker
549*6777b538SAndroid Build Coastguard Worker // If there are no '%' characters in the string, there will be nothing to
550*6777b538SAndroid Build Coastguard Worker // unescape, so we can take the fast path.
551*6777b538SAndroid Build Coastguard Worker if (optimize_data_urls_feature_is_enabled &&
552*6777b538SAndroid Build Coastguard Worker escaped_text.find('%') == StringPiece::npos) {
553*6777b538SAndroid Build Coastguard Worker std::string unescaped_text(escaped_text);
554*6777b538SAndroid Build Coastguard Worker if (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)
555*6777b538SAndroid Build Coastguard Worker std::replace(unescaped_text.begin(), unescaped_text.end(), '+', ' ');
556*6777b538SAndroid Build Coastguard Worker return unescaped_text;
557*6777b538SAndroid Build Coastguard Worker }
558*6777b538SAndroid Build Coastguard Worker
559*6777b538SAndroid Build Coastguard Worker std::string unescaped_text;
560*6777b538SAndroid Build Coastguard Worker
561*6777b538SAndroid Build Coastguard Worker // The output of the unescaping is always smaller than the input, so we can
562*6777b538SAndroid Build Coastguard Worker // reserve the input size to make sure we have enough buffer and don't have
563*6777b538SAndroid Build Coastguard Worker // to allocate in the loop below.
564*6777b538SAndroid Build Coastguard Worker // Increase capacity before size, as just resizing can grow capacity
565*6777b538SAndroid Build Coastguard Worker // needlessly beyond our requested size.
566*6777b538SAndroid Build Coastguard Worker unescaped_text.reserve(escaped_text.size());
567*6777b538SAndroid Build Coastguard Worker unescaped_text.resize(escaped_text.size());
568*6777b538SAndroid Build Coastguard Worker
569*6777b538SAndroid Build Coastguard Worker size_t output_index = 0;
570*6777b538SAndroid Build Coastguard Worker
571*6777b538SAndroid Build Coastguard Worker for (size_t i = 0, max = escaped_text.size(); i < max;) {
572*6777b538SAndroid Build Coastguard Worker unsigned char byte;
573*6777b538SAndroid Build Coastguard Worker // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
574*6777b538SAndroid Build Coastguard Worker // to call.
575*6777b538SAndroid Build Coastguard Worker if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
576*6777b538SAndroid Build Coastguard Worker unescaped_text[output_index++] = static_cast<char>(byte);
577*6777b538SAndroid Build Coastguard Worker i += 3;
578*6777b538SAndroid Build Coastguard Worker continue;
579*6777b538SAndroid Build Coastguard Worker }
580*6777b538SAndroid Build Coastguard Worker
581*6777b538SAndroid Build Coastguard Worker if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
582*6777b538SAndroid Build Coastguard Worker escaped_text[i] == '+') {
583*6777b538SAndroid Build Coastguard Worker unescaped_text[output_index++] = ' ';
584*6777b538SAndroid Build Coastguard Worker ++i;
585*6777b538SAndroid Build Coastguard Worker continue;
586*6777b538SAndroid Build Coastguard Worker }
587*6777b538SAndroid Build Coastguard Worker
588*6777b538SAndroid Build Coastguard Worker unescaped_text[output_index++] = escaped_text[i++];
589*6777b538SAndroid Build Coastguard Worker }
590*6777b538SAndroid Build Coastguard Worker
591*6777b538SAndroid Build Coastguard Worker DCHECK_LE(output_index, unescaped_text.size());
592*6777b538SAndroid Build Coastguard Worker unescaped_text.resize(output_index);
593*6777b538SAndroid Build Coastguard Worker return unescaped_text;
594*6777b538SAndroid Build Coastguard Worker }
595*6777b538SAndroid Build Coastguard Worker
UnescapeBinaryURLComponentSafe(StringPiece escaped_text,bool fail_on_path_separators,std::string * unescaped_text)596*6777b538SAndroid Build Coastguard Worker bool UnescapeBinaryURLComponentSafe(StringPiece escaped_text,
597*6777b538SAndroid Build Coastguard Worker bool fail_on_path_separators,
598*6777b538SAndroid Build Coastguard Worker std::string* unescaped_text) {
599*6777b538SAndroid Build Coastguard Worker unescaped_text->clear();
600*6777b538SAndroid Build Coastguard Worker
601*6777b538SAndroid Build Coastguard Worker std::set<unsigned char> illegal_encoded_bytes;
602*6777b538SAndroid Build Coastguard Worker for (unsigned char c = '\x00'; c < '\x20'; ++c) {
603*6777b538SAndroid Build Coastguard Worker illegal_encoded_bytes.insert(c);
604*6777b538SAndroid Build Coastguard Worker }
605*6777b538SAndroid Build Coastguard Worker if (fail_on_path_separators) {
606*6777b538SAndroid Build Coastguard Worker illegal_encoded_bytes.insert('/');
607*6777b538SAndroid Build Coastguard Worker illegal_encoded_bytes.insert('\\');
608*6777b538SAndroid Build Coastguard Worker }
609*6777b538SAndroid Build Coastguard Worker if (ContainsEncodedBytes(escaped_text, illegal_encoded_bytes))
610*6777b538SAndroid Build Coastguard Worker return false;
611*6777b538SAndroid Build Coastguard Worker
612*6777b538SAndroid Build Coastguard Worker *unescaped_text = UnescapeBinaryURLComponent(escaped_text);
613*6777b538SAndroid Build Coastguard Worker return true;
614*6777b538SAndroid Build Coastguard Worker }
615*6777b538SAndroid Build Coastguard Worker
ContainsEncodedBytes(StringPiece escaped_text,const std::set<unsigned char> & bytes)616*6777b538SAndroid Build Coastguard Worker bool ContainsEncodedBytes(StringPiece escaped_text,
617*6777b538SAndroid Build Coastguard Worker const std::set<unsigned char>& bytes) {
618*6777b538SAndroid Build Coastguard Worker for (size_t i = 0, max = escaped_text.size(); i < max;) {
619*6777b538SAndroid Build Coastguard Worker unsigned char byte;
620*6777b538SAndroid Build Coastguard Worker // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
621*6777b538SAndroid Build Coastguard Worker // to call.
622*6777b538SAndroid Build Coastguard Worker if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
623*6777b538SAndroid Build Coastguard Worker if (bytes.find(byte) != bytes.end())
624*6777b538SAndroid Build Coastguard Worker return true;
625*6777b538SAndroid Build Coastguard Worker
626*6777b538SAndroid Build Coastguard Worker i += 3;
627*6777b538SAndroid Build Coastguard Worker continue;
628*6777b538SAndroid Build Coastguard Worker }
629*6777b538SAndroid Build Coastguard Worker
630*6777b538SAndroid Build Coastguard Worker ++i;
631*6777b538SAndroid Build Coastguard Worker }
632*6777b538SAndroid Build Coastguard Worker
633*6777b538SAndroid Build Coastguard Worker return false;
634*6777b538SAndroid Build Coastguard Worker }
635*6777b538SAndroid Build Coastguard Worker
UnescapeForHTML(StringPiece16 input)636*6777b538SAndroid Build Coastguard Worker std::u16string UnescapeForHTML(StringPiece16 input) {
637*6777b538SAndroid Build Coastguard Worker static const struct {
638*6777b538SAndroid Build Coastguard Worker const char* ampersand_code;
639*6777b538SAndroid Build Coastguard Worker const char16_t replacement;
640*6777b538SAndroid Build Coastguard Worker } kEscapeToChars[] = {
641*6777b538SAndroid Build Coastguard Worker {"<", '<'}, {">", '>'}, {"&", '&'},
642*6777b538SAndroid Build Coastguard Worker {""", '"'}, {"'", '\''},
643*6777b538SAndroid Build Coastguard Worker };
644*6777b538SAndroid Build Coastguard Worker constexpr size_t kEscapeToCharsCount = std::size(kEscapeToChars);
645*6777b538SAndroid Build Coastguard Worker
646*6777b538SAndroid Build Coastguard Worker if (input.find(u"&") == std::string::npos)
647*6777b538SAndroid Build Coastguard Worker return std::u16string(input);
648*6777b538SAndroid Build Coastguard Worker
649*6777b538SAndroid Build Coastguard Worker std::u16string ampersand_chars[kEscapeToCharsCount];
650*6777b538SAndroid Build Coastguard Worker std::u16string text(input);
651*6777b538SAndroid Build Coastguard Worker for (std::u16string::iterator iter = text.begin(); iter != text.end();
652*6777b538SAndroid Build Coastguard Worker ++iter) {
653*6777b538SAndroid Build Coastguard Worker if (*iter == '&') {
654*6777b538SAndroid Build Coastguard Worker // Potential ampersand encode char.
655*6777b538SAndroid Build Coastguard Worker size_t index = static_cast<size_t>(iter - text.begin());
656*6777b538SAndroid Build Coastguard Worker for (size_t i = 0; i < std::size(kEscapeToChars); i++) {
657*6777b538SAndroid Build Coastguard Worker if (ampersand_chars[i].empty()) {
658*6777b538SAndroid Build Coastguard Worker ampersand_chars[i] = ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
659*6777b538SAndroid Build Coastguard Worker }
660*6777b538SAndroid Build Coastguard Worker if (text.find(ampersand_chars[i], index) == index) {
661*6777b538SAndroid Build Coastguard Worker text.replace(
662*6777b538SAndroid Build Coastguard Worker iter, iter + static_cast<ptrdiff_t>(ampersand_chars[i].length()),
663*6777b538SAndroid Build Coastguard Worker 1, kEscapeToChars[i].replacement);
664*6777b538SAndroid Build Coastguard Worker break;
665*6777b538SAndroid Build Coastguard Worker }
666*6777b538SAndroid Build Coastguard Worker }
667*6777b538SAndroid Build Coastguard Worker }
668*6777b538SAndroid Build Coastguard Worker }
669*6777b538SAndroid Build Coastguard Worker return text;
670*6777b538SAndroid Build Coastguard Worker }
671*6777b538SAndroid Build Coastguard Worker
672*6777b538SAndroid Build Coastguard Worker } // namespace base
673