xref: /aosp_15_r20/external/cronet/url/url_canon_etc.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4*6777b538SAndroid Build Coastguard Worker 
// Canonicalizers for random bits that aren't big enough for their own files.
6*6777b538SAndroid Build Coastguard Worker 
7*6777b538SAndroid Build Coastguard Worker #include <string.h>
8*6777b538SAndroid Build Coastguard Worker 
9*6777b538SAndroid Build Coastguard Worker #include "url/url_canon.h"
10*6777b538SAndroid Build Coastguard Worker #include "url/url_canon_internal.h"
11*6777b538SAndroid Build Coastguard Worker 
12*6777b538SAndroid Build Coastguard Worker namespace url {
13*6777b538SAndroid Build Coastguard Worker 
14*6777b538SAndroid Build Coastguard Worker namespace {
15*6777b538SAndroid Build Coastguard Worker 
// Returns true if |ch| is one of the characters that gets removed from the
// middle of a URL: carriage return, line feed, or horizontal tab.
inline bool IsRemovableURLWhitespace(int ch) {
  return ch == '\r' || ch == '\n' || ch == '\t';
}
21*6777b538SAndroid Build Coastguard Worker 
22*6777b538SAndroid Build Coastguard Worker // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
23*6777b538SAndroid Build Coastguard Worker // It sucks that we have to do this, since this takes about 13% of the total URL
24*6777b538SAndroid Build Coastguard Worker // canonicalization time.
25*6777b538SAndroid Build Coastguard Worker template <typename CHAR>
DoRemoveURLWhitespace(const CHAR * input,int input_len,CanonOutputT<CHAR> * buffer,int * output_len,bool * potentially_dangling_markup)26*6777b538SAndroid Build Coastguard Worker const CHAR* DoRemoveURLWhitespace(const CHAR* input,
27*6777b538SAndroid Build Coastguard Worker                                   int input_len,
28*6777b538SAndroid Build Coastguard Worker                                   CanonOutputT<CHAR>* buffer,
29*6777b538SAndroid Build Coastguard Worker                                   int* output_len,
30*6777b538SAndroid Build Coastguard Worker                                   bool* potentially_dangling_markup) {
31*6777b538SAndroid Build Coastguard Worker   // Fast verification that there's nothing that needs removal. This is the 99%
32*6777b538SAndroid Build Coastguard Worker   // case, so we want it to be fast and don't care about impacting the speed
33*6777b538SAndroid Build Coastguard Worker   // when we do find whitespace.
34*6777b538SAndroid Build Coastguard Worker   bool found_whitespace = false;
35*6777b538SAndroid Build Coastguard Worker   if (sizeof(*input) == 1 && input_len >= kMinimumLengthForSIMD) {
36*6777b538SAndroid Build Coastguard Worker     // For large strings, memchr is much faster than any scalar code we can
37*6777b538SAndroid Build Coastguard Worker     // write, even if we need to run it three times. (If this turns out to still
38*6777b538SAndroid Build Coastguard Worker     // be a bottleneck, we could write our own vector code, but given that
39*6777b538SAndroid Build Coastguard Worker     // memchr is so fast, it's unlikely to be relevant.)
40*6777b538SAndroid Build Coastguard Worker     found_whitespace = memchr(input, '\n', input_len) != nullptr ||
41*6777b538SAndroid Build Coastguard Worker                        memchr(input, '\r', input_len) != nullptr ||
42*6777b538SAndroid Build Coastguard Worker                        memchr(input, '\t', input_len) != nullptr;
43*6777b538SAndroid Build Coastguard Worker   } else {
44*6777b538SAndroid Build Coastguard Worker     for (int i = 0; i < input_len; i++) {
45*6777b538SAndroid Build Coastguard Worker       if (!IsRemovableURLWhitespace(input[i]))
46*6777b538SAndroid Build Coastguard Worker         continue;
47*6777b538SAndroid Build Coastguard Worker       found_whitespace = true;
48*6777b538SAndroid Build Coastguard Worker       break;
49*6777b538SAndroid Build Coastguard Worker     }
50*6777b538SAndroid Build Coastguard Worker   }
51*6777b538SAndroid Build Coastguard Worker 
52*6777b538SAndroid Build Coastguard Worker   if (!found_whitespace) {
53*6777b538SAndroid Build Coastguard Worker     // Didn't find any whitespace, we don't need to do anything. We can just
54*6777b538SAndroid Build Coastguard Worker     // return the input as the output.
55*6777b538SAndroid Build Coastguard Worker     *output_len = input_len;
56*6777b538SAndroid Build Coastguard Worker     return input;
57*6777b538SAndroid Build Coastguard Worker   }
58*6777b538SAndroid Build Coastguard Worker 
59*6777b538SAndroid Build Coastguard Worker   // Skip whitespace removal for `data:` URLs.
60*6777b538SAndroid Build Coastguard Worker   //
61*6777b538SAndroid Build Coastguard Worker   // TODO(mkwst): Ideally, this would use something like `base::StartsWith`, but
62*6777b538SAndroid Build Coastguard Worker   // that turns out to be difficult to do correctly given this function's
63*6777b538SAndroid Build Coastguard Worker   // character type templating.
64*6777b538SAndroid Build Coastguard Worker   if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' &&
65*6777b538SAndroid Build Coastguard Worker       input[3] == 'a' && input[4] == ':') {
66*6777b538SAndroid Build Coastguard Worker     *output_len = input_len;
67*6777b538SAndroid Build Coastguard Worker     return input;
68*6777b538SAndroid Build Coastguard Worker   }
69*6777b538SAndroid Build Coastguard Worker 
70*6777b538SAndroid Build Coastguard Worker   // Remove the whitespace into the new buffer and return it.
71*6777b538SAndroid Build Coastguard Worker   for (int i = 0; i < input_len; i++) {
72*6777b538SAndroid Build Coastguard Worker     if (!IsRemovableURLWhitespace(input[i])) {
73*6777b538SAndroid Build Coastguard Worker       if (potentially_dangling_markup && input[i] == 0x3C)
74*6777b538SAndroid Build Coastguard Worker         *potentially_dangling_markup = true;
75*6777b538SAndroid Build Coastguard Worker       buffer->push_back(input[i]);
76*6777b538SAndroid Build Coastguard Worker     }
77*6777b538SAndroid Build Coastguard Worker   }
78*6777b538SAndroid Build Coastguard Worker   *output_len = buffer->length();
79*6777b538SAndroid Build Coastguard Worker   return buffer->data();
80*6777b538SAndroid Build Coastguard Worker }
81*6777b538SAndroid Build Coastguard Worker 
// Contains the canonical version of each possible input letter in the scheme
// (basically, lower-cased). The corresponding entry will be 0 if the letter
// is not allowed in a scheme. The table covers ASCII only (0x00-0x7F), so
// callers must range-check before indexing.
// clang-format off
constexpr char kSchemeCanonical[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
// clang-format on
103*6777b538SAndroid Build Coastguard Worker 
// Returns true if |c| is an ASCII letter, the only kind of character accepted
// as the first character of a scheme.
//
// This could be a table lookup as well by setting the high bit for each
// valid character, but it's only called once per URL, and it makes the lookup
// table easier to read not having extra stuff in it.
inline bool IsSchemeFirstChar(unsigned char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
110*6777b538SAndroid Build Coastguard Worker 
111*6777b538SAndroid Build Coastguard Worker template <typename CHAR, typename UCHAR>
DoScheme(const CHAR * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)112*6777b538SAndroid Build Coastguard Worker bool DoScheme(const CHAR* spec,
113*6777b538SAndroid Build Coastguard Worker               const Component& scheme,
114*6777b538SAndroid Build Coastguard Worker               CanonOutput* output,
115*6777b538SAndroid Build Coastguard Worker               Component* out_scheme) {
116*6777b538SAndroid Build Coastguard Worker   if (scheme.is_empty()) {
117*6777b538SAndroid Build Coastguard Worker     // Scheme is unspecified or empty, convert to empty by appending a colon.
118*6777b538SAndroid Build Coastguard Worker     *out_scheme = Component(output->length(), 0);
119*6777b538SAndroid Build Coastguard Worker     output->push_back(':');
120*6777b538SAndroid Build Coastguard Worker     return false;
121*6777b538SAndroid Build Coastguard Worker   }
122*6777b538SAndroid Build Coastguard Worker 
123*6777b538SAndroid Build Coastguard Worker   // The output scheme starts from the current position.
124*6777b538SAndroid Build Coastguard Worker   out_scheme->begin = output->length();
125*6777b538SAndroid Build Coastguard Worker 
126*6777b538SAndroid Build Coastguard Worker   // Danger: it's important that this code does not strip any characters;
127*6777b538SAndroid Build Coastguard Worker   // it only emits the canonical version (be it valid or escaped) for each
128*6777b538SAndroid Build Coastguard Worker   // of the input characters. Stripping would put it out of sync with
129*6777b538SAndroid Build Coastguard Worker   // FindAndCompareScheme, which could cause some security checks on
130*6777b538SAndroid Build Coastguard Worker   // schemes to be incorrect.
131*6777b538SAndroid Build Coastguard Worker   bool success = true;
132*6777b538SAndroid Build Coastguard Worker   size_t begin = static_cast<size_t>(scheme.begin);
133*6777b538SAndroid Build Coastguard Worker   size_t end = static_cast<size_t>(scheme.end());
134*6777b538SAndroid Build Coastguard Worker   for (size_t i = begin; i < end; i++) {
135*6777b538SAndroid Build Coastguard Worker     UCHAR ch = static_cast<UCHAR>(spec[i]);
136*6777b538SAndroid Build Coastguard Worker     char replacement = 0;
137*6777b538SAndroid Build Coastguard Worker     if (ch < 0x80) {
138*6777b538SAndroid Build Coastguard Worker       if (i == begin) {
139*6777b538SAndroid Build Coastguard Worker         // Need to do a special check for the first letter of the scheme.
140*6777b538SAndroid Build Coastguard Worker         if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
141*6777b538SAndroid Build Coastguard Worker           replacement = kSchemeCanonical[ch];
142*6777b538SAndroid Build Coastguard Worker       } else {
143*6777b538SAndroid Build Coastguard Worker         replacement = kSchemeCanonical[ch];
144*6777b538SAndroid Build Coastguard Worker       }
145*6777b538SAndroid Build Coastguard Worker     }
146*6777b538SAndroid Build Coastguard Worker 
147*6777b538SAndroid Build Coastguard Worker     if (replacement) {
148*6777b538SAndroid Build Coastguard Worker       output->push_back(replacement);
149*6777b538SAndroid Build Coastguard Worker     } else if (ch == '%') {
150*6777b538SAndroid Build Coastguard Worker       // Canonicalizing the scheme multiple times should lead to the same
151*6777b538SAndroid Build Coastguard Worker       // result. Since invalid characters will be escaped, we need to preserve
152*6777b538SAndroid Build Coastguard Worker       // the percent to avoid multiple escaping. The scheme will be invalid.
153*6777b538SAndroid Build Coastguard Worker       success = false;
154*6777b538SAndroid Build Coastguard Worker       output->push_back('%');
155*6777b538SAndroid Build Coastguard Worker     } else {
156*6777b538SAndroid Build Coastguard Worker       // Invalid character, store it but mark this scheme as invalid.
157*6777b538SAndroid Build Coastguard Worker       success = false;
158*6777b538SAndroid Build Coastguard Worker 
159*6777b538SAndroid Build Coastguard Worker       // This will escape the output and also handle encoding issues.
160*6777b538SAndroid Build Coastguard Worker       // Ignore the return value since we already failed.
161*6777b538SAndroid Build Coastguard Worker       AppendUTF8EscapedChar(spec, &i, end, output);
162*6777b538SAndroid Build Coastguard Worker     }
163*6777b538SAndroid Build Coastguard Worker   }
164*6777b538SAndroid Build Coastguard Worker 
165*6777b538SAndroid Build Coastguard Worker   // The output scheme ends with the the current position, before appending
166*6777b538SAndroid Build Coastguard Worker   // the colon.
167*6777b538SAndroid Build Coastguard Worker   out_scheme->len = output->length() - out_scheme->begin;
168*6777b538SAndroid Build Coastguard Worker   output->push_back(':');
169*6777b538SAndroid Build Coastguard Worker   return success;
170*6777b538SAndroid Build Coastguard Worker }
171*6777b538SAndroid Build Coastguard Worker 
172*6777b538SAndroid Build Coastguard Worker // The username and password components reference ranges in the corresponding
173*6777b538SAndroid Build Coastguard Worker // *_spec strings. Typically, these specs will be the same (we're
174*6777b538SAndroid Build Coastguard Worker // canonicalizing a single source string), but may be different when
175*6777b538SAndroid Build Coastguard Worker // replacing components.
176*6777b538SAndroid Build Coastguard Worker template <typename CHAR, typename UCHAR>
DoUserInfo(const CHAR * username_spec,const Component & username,const CHAR * password_spec,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)177*6777b538SAndroid Build Coastguard Worker bool DoUserInfo(const CHAR* username_spec,
178*6777b538SAndroid Build Coastguard Worker                 const Component& username,
179*6777b538SAndroid Build Coastguard Worker                 const CHAR* password_spec,
180*6777b538SAndroid Build Coastguard Worker                 const Component& password,
181*6777b538SAndroid Build Coastguard Worker                 CanonOutput* output,
182*6777b538SAndroid Build Coastguard Worker                 Component* out_username,
183*6777b538SAndroid Build Coastguard Worker                 Component* out_password) {
184*6777b538SAndroid Build Coastguard Worker   if (username.is_empty() && password.is_empty()) {
185*6777b538SAndroid Build Coastguard Worker     // Common case: no user info. We strip empty username/passwords.
186*6777b538SAndroid Build Coastguard Worker     *out_username = Component();
187*6777b538SAndroid Build Coastguard Worker     *out_password = Component();
188*6777b538SAndroid Build Coastguard Worker     return true;
189*6777b538SAndroid Build Coastguard Worker   }
190*6777b538SAndroid Build Coastguard Worker 
191*6777b538SAndroid Build Coastguard Worker   // Write the username.
192*6777b538SAndroid Build Coastguard Worker   out_username->begin = output->length();
193*6777b538SAndroid Build Coastguard Worker   if (username.is_nonempty()) {
194*6777b538SAndroid Build Coastguard Worker     // This will escape characters not valid for the username.
195*6777b538SAndroid Build Coastguard Worker     AppendStringOfType(&username_spec[username.begin],
196*6777b538SAndroid Build Coastguard Worker                        static_cast<size_t>(username.len), CHAR_USERINFO,
197*6777b538SAndroid Build Coastguard Worker                        output);
198*6777b538SAndroid Build Coastguard Worker   }
199*6777b538SAndroid Build Coastguard Worker   out_username->len = output->length() - out_username->begin;
200*6777b538SAndroid Build Coastguard Worker 
201*6777b538SAndroid Build Coastguard Worker   // When there is a password, we need the separator. Note that we strip
202*6777b538SAndroid Build Coastguard Worker   // empty but specified passwords.
203*6777b538SAndroid Build Coastguard Worker   if (password.is_nonempty()) {
204*6777b538SAndroid Build Coastguard Worker     output->push_back(':');
205*6777b538SAndroid Build Coastguard Worker     out_password->begin = output->length();
206*6777b538SAndroid Build Coastguard Worker     AppendStringOfType(&password_spec[password.begin],
207*6777b538SAndroid Build Coastguard Worker                        static_cast<size_t>(password.len), CHAR_USERINFO,
208*6777b538SAndroid Build Coastguard Worker                        output);
209*6777b538SAndroid Build Coastguard Worker     out_password->len = output->length() - out_password->begin;
210*6777b538SAndroid Build Coastguard Worker   } else {
211*6777b538SAndroid Build Coastguard Worker     *out_password = Component();
212*6777b538SAndroid Build Coastguard Worker   }
213*6777b538SAndroid Build Coastguard Worker 
214*6777b538SAndroid Build Coastguard Worker   output->push_back('@');
215*6777b538SAndroid Build Coastguard Worker   return true;
216*6777b538SAndroid Build Coastguard Worker }
217*6777b538SAndroid Build Coastguard Worker 
218*6777b538SAndroid Build Coastguard Worker // Helper functions for converting port integers to strings.
WritePortInt(char * output,int output_len,int port)219*6777b538SAndroid Build Coastguard Worker inline void WritePortInt(char* output, int output_len, int port) {
220*6777b538SAndroid Build Coastguard Worker   _itoa_s(port, output, output_len, 10);
221*6777b538SAndroid Build Coastguard Worker }
222*6777b538SAndroid Build Coastguard Worker 
223*6777b538SAndroid Build Coastguard Worker // This function will prepend the colon if there will be a port.
224*6777b538SAndroid Build Coastguard Worker template <typename CHAR, typename UCHAR>
DoPort(const CHAR * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)225*6777b538SAndroid Build Coastguard Worker bool DoPort(const CHAR* spec,
226*6777b538SAndroid Build Coastguard Worker             const Component& port,
227*6777b538SAndroid Build Coastguard Worker             int default_port_for_scheme,
228*6777b538SAndroid Build Coastguard Worker             CanonOutput* output,
229*6777b538SAndroid Build Coastguard Worker             Component* out_port) {
230*6777b538SAndroid Build Coastguard Worker   int port_num = ParsePort(spec, port);
231*6777b538SAndroid Build Coastguard Worker   if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
232*6777b538SAndroid Build Coastguard Worker     *out_port = Component();
233*6777b538SAndroid Build Coastguard Worker     return true;  // Leave port empty.
234*6777b538SAndroid Build Coastguard Worker   }
235*6777b538SAndroid Build Coastguard Worker 
236*6777b538SAndroid Build Coastguard Worker   if (port_num == PORT_INVALID) {
237*6777b538SAndroid Build Coastguard Worker     // Invalid port: We'll copy the text from the input so the user can see
238*6777b538SAndroid Build Coastguard Worker     // what the error was, and mark the URL as invalid by returning false.
239*6777b538SAndroid Build Coastguard Worker     output->push_back(':');
240*6777b538SAndroid Build Coastguard Worker     out_port->begin = output->length();
241*6777b538SAndroid Build Coastguard Worker     AppendInvalidNarrowString(spec, static_cast<size_t>(port.begin),
242*6777b538SAndroid Build Coastguard Worker                               static_cast<size_t>(port.end()), output);
243*6777b538SAndroid Build Coastguard Worker     out_port->len = output->length() - out_port->begin;
244*6777b538SAndroid Build Coastguard Worker     return false;
245*6777b538SAndroid Build Coastguard Worker   }
246*6777b538SAndroid Build Coastguard Worker 
247*6777b538SAndroid Build Coastguard Worker   // Convert port number back to an integer. Max port value is 5 digits, and
248*6777b538SAndroid Build Coastguard Worker   // the Parsed::ExtractPort will have made sure the integer is in range.
249*6777b538SAndroid Build Coastguard Worker   const int buf_size = 6;
250*6777b538SAndroid Build Coastguard Worker   char buf[buf_size];
251*6777b538SAndroid Build Coastguard Worker   WritePortInt(buf, buf_size, port_num);
252*6777b538SAndroid Build Coastguard Worker 
253*6777b538SAndroid Build Coastguard Worker   // Append the port number to the output, preceded by a colon.
254*6777b538SAndroid Build Coastguard Worker   output->push_back(':');
255*6777b538SAndroid Build Coastguard Worker   out_port->begin = output->length();
256*6777b538SAndroid Build Coastguard Worker   for (int i = 0; i < buf_size && buf[i]; i++)
257*6777b538SAndroid Build Coastguard Worker     output->push_back(buf[i]);
258*6777b538SAndroid Build Coastguard Worker 
259*6777b538SAndroid Build Coastguard Worker   out_port->len = output->length() - out_port->begin;
260*6777b538SAndroid Build Coastguard Worker   return true;
261*6777b538SAndroid Build Coastguard Worker }
262*6777b538SAndroid Build Coastguard Worker 
// clang-format off
//   Percent-escape all characters from the fragment percent-encode set
//   https://url.spec.whatwg.org/#fragment-percent-encode-set
//
//   Indexed by ASCII code point (0x00-0x7F); true means the character must be
//   percent-escaped when it appears in a fragment.
constexpr bool kShouldEscapeCharInFragment[0x80] = {
//  Control characters (0x00-0x1F)
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
//  ' '    !      "      #      $      %      &      '
    true,  false, true,  false, false, false, false, false,
//  (      )      *      +      ,      -      .      /
    false, false, false, false, false, false, false, false,
//  0      1      2      3      4      5      6      7
    false, false, false, false, false, false, false, false,
//  8      9      :      ;      <      =      >      ?
    false, false, false, false, true,  false, true,  false,
//  @      A      B      C      D      E      F      G
    false, false, false, false, false, false, false, false,
//  H      I      J      K      L      M      N      O
    false, false, false, false, false, false, false, false,
//  P      Q      R      S      T      U      V      W
    false, false, false, false, false, false, false, false,
//  X      Y      Z      [      \      ]      ^      _
    false, false, false, false, false, false, false, false,
//  `      a      b      c      d      e      f      g
    true,  false, false, false, false, false, false, false,
//  h      i      j      k      l      m      n      o
    false, false, false, false, false, false, false, false,
//  p      q      r      s      t      u      v      w
    false, false, false, false, false, false, false, false,
//  x      y      z      {      |      }      ~      DELETE
    false, false, false, false, false, false, false, true
};
// clang-format on
298*6777b538SAndroid Build Coastguard Worker 
299*6777b538SAndroid Build Coastguard Worker template <typename CHAR, typename UCHAR>
DoCanonicalizeRef(const CHAR * spec,const Component & ref,CanonOutput * output,Component * out_ref)300*6777b538SAndroid Build Coastguard Worker void DoCanonicalizeRef(const CHAR* spec,
301*6777b538SAndroid Build Coastguard Worker                        const Component& ref,
302*6777b538SAndroid Build Coastguard Worker                        CanonOutput* output,
303*6777b538SAndroid Build Coastguard Worker                        Component* out_ref) {
304*6777b538SAndroid Build Coastguard Worker   if (!ref.is_valid()) {
305*6777b538SAndroid Build Coastguard Worker     // Common case of no ref.
306*6777b538SAndroid Build Coastguard Worker     *out_ref = Component();
307*6777b538SAndroid Build Coastguard Worker     return;
308*6777b538SAndroid Build Coastguard Worker   }
309*6777b538SAndroid Build Coastguard Worker 
310*6777b538SAndroid Build Coastguard Worker   // Append the ref separator. Note that we need to do this even when the ref
311*6777b538SAndroid Build Coastguard Worker   // is empty but present.
312*6777b538SAndroid Build Coastguard Worker   output->push_back('#');
313*6777b538SAndroid Build Coastguard Worker   out_ref->begin = output->length();
314*6777b538SAndroid Build Coastguard Worker 
315*6777b538SAndroid Build Coastguard Worker   // Now iterate through all the characters, converting to UTF-8 and validating.
316*6777b538SAndroid Build Coastguard Worker   size_t end = static_cast<size_t>(ref.end());
317*6777b538SAndroid Build Coastguard Worker   for (size_t i = static_cast<size_t>(ref.begin); i < end; i++) {
318*6777b538SAndroid Build Coastguard Worker     UCHAR current_char = static_cast<UCHAR>(spec[i]);
319*6777b538SAndroid Build Coastguard Worker     if (current_char < 0x80) {
320*6777b538SAndroid Build Coastguard Worker       if (kShouldEscapeCharInFragment[current_char])
321*6777b538SAndroid Build Coastguard Worker         AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
322*6777b538SAndroid Build Coastguard Worker       else
323*6777b538SAndroid Build Coastguard Worker         output->push_back(static_cast<char>(spec[i]));
324*6777b538SAndroid Build Coastguard Worker     } else {
325*6777b538SAndroid Build Coastguard Worker       AppendUTF8EscapedChar(spec, &i, end, output);
326*6777b538SAndroid Build Coastguard Worker     }
327*6777b538SAndroid Build Coastguard Worker   }
328*6777b538SAndroid Build Coastguard Worker 
329*6777b538SAndroid Build Coastguard Worker   out_ref->len = output->length() - out_ref->begin;
330*6777b538SAndroid Build Coastguard Worker }
331*6777b538SAndroid Build Coastguard Worker 
332*6777b538SAndroid Build Coastguard Worker }  // namespace
333*6777b538SAndroid Build Coastguard Worker 
RemoveURLWhitespace(const char * input,int input_len,CanonOutputT<char> * buffer,int * output_len,bool * potentially_dangling_markup)334*6777b538SAndroid Build Coastguard Worker const char* RemoveURLWhitespace(const char* input,
335*6777b538SAndroid Build Coastguard Worker                                 int input_len,
336*6777b538SAndroid Build Coastguard Worker                                 CanonOutputT<char>* buffer,
337*6777b538SAndroid Build Coastguard Worker                                 int* output_len,
338*6777b538SAndroid Build Coastguard Worker                                 bool* potentially_dangling_markup) {
339*6777b538SAndroid Build Coastguard Worker   return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
340*6777b538SAndroid Build Coastguard Worker                                potentially_dangling_markup);
341*6777b538SAndroid Build Coastguard Worker }
342*6777b538SAndroid Build Coastguard Worker 
RemoveURLWhitespace(const char16_t * input,int input_len,CanonOutputT<char16_t> * buffer,int * output_len,bool * potentially_dangling_markup)343*6777b538SAndroid Build Coastguard Worker const char16_t* RemoveURLWhitespace(const char16_t* input,
344*6777b538SAndroid Build Coastguard Worker                                     int input_len,
345*6777b538SAndroid Build Coastguard Worker                                     CanonOutputT<char16_t>* buffer,
346*6777b538SAndroid Build Coastguard Worker                                     int* output_len,
347*6777b538SAndroid Build Coastguard Worker                                     bool* potentially_dangling_markup) {
348*6777b538SAndroid Build Coastguard Worker   return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
349*6777b538SAndroid Build Coastguard Worker                                potentially_dangling_markup);
350*6777b538SAndroid Build Coastguard Worker }
351*6777b538SAndroid Build Coastguard Worker 
CanonicalSchemeChar(char16_t ch)352*6777b538SAndroid Build Coastguard Worker char CanonicalSchemeChar(char16_t ch) {
353*6777b538SAndroid Build Coastguard Worker   if (ch >= 0x80)
354*6777b538SAndroid Build Coastguard Worker     return 0;  // Non-ASCII is not supported by schemes.
355*6777b538SAndroid Build Coastguard Worker   return kSchemeCanonical[ch];
356*6777b538SAndroid Build Coastguard Worker }
357*6777b538SAndroid Build Coastguard Worker 
CanonicalizeScheme(const char * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)358*6777b538SAndroid Build Coastguard Worker bool CanonicalizeScheme(const char* spec,
359*6777b538SAndroid Build Coastguard Worker                         const Component& scheme,
360*6777b538SAndroid Build Coastguard Worker                         CanonOutput* output,
361*6777b538SAndroid Build Coastguard Worker                         Component* out_scheme) {
362*6777b538SAndroid Build Coastguard Worker   return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
363*6777b538SAndroid Build Coastguard Worker }
364*6777b538SAndroid Build Coastguard Worker 
CanonicalizeScheme(const char16_t * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)365*6777b538SAndroid Build Coastguard Worker bool CanonicalizeScheme(const char16_t* spec,
366*6777b538SAndroid Build Coastguard Worker                         const Component& scheme,
367*6777b538SAndroid Build Coastguard Worker                         CanonOutput* output,
368*6777b538SAndroid Build Coastguard Worker                         Component* out_scheme) {
369*6777b538SAndroid Build Coastguard Worker   return DoScheme<char16_t, char16_t>(spec, scheme, output, out_scheme);
370*6777b538SAndroid Build Coastguard Worker }
371*6777b538SAndroid Build Coastguard Worker 
CanonicalizeUserInfo(const char * username_source,const Component & username,const char * password_source,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)372*6777b538SAndroid Build Coastguard Worker bool CanonicalizeUserInfo(const char* username_source,
373*6777b538SAndroid Build Coastguard Worker                           const Component& username,
374*6777b538SAndroid Build Coastguard Worker                           const char* password_source,
375*6777b538SAndroid Build Coastguard Worker                           const Component& password,
376*6777b538SAndroid Build Coastguard Worker                           CanonOutput* output,
377*6777b538SAndroid Build Coastguard Worker                           Component* out_username,
378*6777b538SAndroid Build Coastguard Worker                           Component* out_password) {
379*6777b538SAndroid Build Coastguard Worker   return DoUserInfo<char, unsigned char>(username_source, username,
380*6777b538SAndroid Build Coastguard Worker                                          password_source, password, output,
381*6777b538SAndroid Build Coastguard Worker                                          out_username, out_password);
382*6777b538SAndroid Build Coastguard Worker }
383*6777b538SAndroid Build Coastguard Worker 
CanonicalizeUserInfo(const char16_t * username_source,const Component & username,const char16_t * password_source,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)384*6777b538SAndroid Build Coastguard Worker bool CanonicalizeUserInfo(const char16_t* username_source,
385*6777b538SAndroid Build Coastguard Worker                           const Component& username,
386*6777b538SAndroid Build Coastguard Worker                           const char16_t* password_source,
387*6777b538SAndroid Build Coastguard Worker                           const Component& password,
388*6777b538SAndroid Build Coastguard Worker                           CanonOutput* output,
389*6777b538SAndroid Build Coastguard Worker                           Component* out_username,
390*6777b538SAndroid Build Coastguard Worker                           Component* out_password) {
391*6777b538SAndroid Build Coastguard Worker   return DoUserInfo<char16_t, char16_t>(username_source, username,
392*6777b538SAndroid Build Coastguard Worker                                         password_source, password, output,
393*6777b538SAndroid Build Coastguard Worker                                         out_username, out_password);
394*6777b538SAndroid Build Coastguard Worker }
395*6777b538SAndroid Build Coastguard Worker 
CanonicalizePort(const char * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)396*6777b538SAndroid Build Coastguard Worker bool CanonicalizePort(const char* spec,
397*6777b538SAndroid Build Coastguard Worker                       const Component& port,
398*6777b538SAndroid Build Coastguard Worker                       int default_port_for_scheme,
399*6777b538SAndroid Build Coastguard Worker                       CanonOutput* output,
400*6777b538SAndroid Build Coastguard Worker                       Component* out_port) {
401*6777b538SAndroid Build Coastguard Worker   return DoPort<char, unsigned char>(spec, port, default_port_for_scheme,
402*6777b538SAndroid Build Coastguard Worker                                      output, out_port);
403*6777b538SAndroid Build Coastguard Worker }
404*6777b538SAndroid Build Coastguard Worker 
CanonicalizePort(const char16_t * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)405*6777b538SAndroid Build Coastguard Worker bool CanonicalizePort(const char16_t* spec,
406*6777b538SAndroid Build Coastguard Worker                       const Component& port,
407*6777b538SAndroid Build Coastguard Worker                       int default_port_for_scheme,
408*6777b538SAndroid Build Coastguard Worker                       CanonOutput* output,
409*6777b538SAndroid Build Coastguard Worker                       Component* out_port) {
410*6777b538SAndroid Build Coastguard Worker   return DoPort<char16_t, char16_t>(spec, port, default_port_for_scheme, output,
411*6777b538SAndroid Build Coastguard Worker                                     out_port);
412*6777b538SAndroid Build Coastguard Worker }
413*6777b538SAndroid Build Coastguard Worker 
CanonicalizeRef(const char * spec,const Component & ref,CanonOutput * output,Component * out_ref)414*6777b538SAndroid Build Coastguard Worker void CanonicalizeRef(const char* spec,
415*6777b538SAndroid Build Coastguard Worker                      const Component& ref,
416*6777b538SAndroid Build Coastguard Worker                      CanonOutput* output,
417*6777b538SAndroid Build Coastguard Worker                      Component* out_ref) {
418*6777b538SAndroid Build Coastguard Worker   DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
419*6777b538SAndroid Build Coastguard Worker }
420*6777b538SAndroid Build Coastguard Worker 
CanonicalizeRef(const char16_t * spec,const Component & ref,CanonOutput * output,Component * out_ref)421*6777b538SAndroid Build Coastguard Worker void CanonicalizeRef(const char16_t* spec,
422*6777b538SAndroid Build Coastguard Worker                      const Component& ref,
423*6777b538SAndroid Build Coastguard Worker                      CanonOutput* output,
424*6777b538SAndroid Build Coastguard Worker                      Component* out_ref) {
425*6777b538SAndroid Build Coastguard Worker   DoCanonicalizeRef<char16_t, char16_t>(spec, ref, output, out_ref);
426*6777b538SAndroid Build Coastguard Worker }
427*6777b538SAndroid Build Coastguard Worker 
428*6777b538SAndroid Build Coastguard Worker }  // namespace url
429