xref: /aosp_15_r20/external/cronet/url/url_canon_non_special_url.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2023 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Functions to canonicalize non-special URLs.
6 
7 #include "url/url_canon.h"
8 #include "url/url_canon_internal.h"
9 
10 namespace url {
11 
12 namespace {
13 
14 template <typename CHAR>
DoCanonicalizeNonSpecialURL(const URLComponentSource<CHAR> & source,const Parsed & parsed,CharsetConverter * query_converter,CanonOutput & output,Parsed & new_parsed)15 bool DoCanonicalizeNonSpecialURL(const URLComponentSource<CHAR>& source,
16                                  const Parsed& parsed,
17                                  CharsetConverter* query_converter,
18                                  CanonOutput& output,
19                                  Parsed& new_parsed) {
20   // The implementation is similar to `DoCanonicalizeStandardURL()`, but there
21   // are many subtle differences. So we have a different function for
22   // canonicalizing non-special URLs.
23   //
24   // Since canonicalization is also used from url::ReplaceComponents(),
25   // we have to handle an invalid URL replacement here, such as:
26   //
27   // > const url = "git:///";
28   // > url.username = "x";
29   // > url.href
30   // "git:///" (this should not be "git://x@").
31 
32   DCHECK(!parsed.has_opaque_path);
33 
34   // Scheme: this will append the colon.
35   bool success = CanonicalizeScheme(source.scheme, parsed.scheme, &output,
36                                     &new_parsed.scheme);
37   bool have_authority =
38       (parsed.username.is_valid() || parsed.password.is_valid() ||
39        parsed.host.is_valid() || parsed.port.is_valid());
40 
41   // Non-special URL examples which should be carefully handled:
42   //
43   // | URL      | parsed.user   | parsed.host   | have_authority | Valid URL? |
44   // |----------+---------------+---------------+----------------+------------|
45   // | git:/a   | invalid       | invalid       | false          | valid      |
46   // | git://@/ | valid (empty) | invalid       | true           | invalid    |
47   // | git:///  | invalid       | valid (empty) | true           | valid      |
48 
49   if (have_authority) {
50     // Only write the authority separators when we have a scheme.
51     if (parsed.scheme.is_valid()) {
52       output.push_back('/');
53       output.push_back('/');
54     }
55 
56     // Username and Password
57     //
58     // URL Standard:
59     // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port
60     // - https://url.spec.whatwg.org/#dom-url-username
61     // - https://url.spec.whatwg.org/#dom-url-password
62     if (parsed.host.is_nonempty()) {
63       // User info: the canonicalizer will handle the : and @.
64       success &= CanonicalizeUserInfo(
65           source.username, parsed.username, source.password, parsed.password,
66           &output, &new_parsed.username, &new_parsed.password);
67     } else {
68       new_parsed.username.reset();
69       new_parsed.password.reset();
70     }
71 
72     // Host
73     if (parsed.host.is_valid()) {
74       success &= CanonicalizeNonSpecialHost(source.host, parsed.host, output,
75                                             new_parsed.host);
76     } else {
77       new_parsed.host.reset();
78       // URL is invalid if `have_authority` is true, but `parsed.host` is
79       // invalid. Example: "git://@/".
80       success = false;
81     }
82 
83     // Port
84     //
85     // URL Standard:
86     // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port
87     // - https://url.spec.whatwg.org/#dom-url-port
88     if (parsed.host.is_nonempty()) {
89       success &= CanonicalizePort(source.port, parsed.port, PORT_UNSPECIFIED,
90                                   &output, &new_parsed.port);
91     } else {
92       new_parsed.port.reset();
93     }
94   } else {
95     // No authority, clear the components.
96     new_parsed.host.reset();
97     new_parsed.username.reset();
98     new_parsed.password.reset();
99     new_parsed.port.reset();
100   }
101 
102   // Path
103   if (parsed.path.is_valid()) {
104     if (!parsed.host.is_valid() && parsed.path.is_empty()) {
105       // Handle an edge case: Replacing non-special path-only URL's pathname
106       // with an empty path.
107       //
108       // Path-only non-special URLs cannot have their paths erased.
109       //
110       // Example:
111       //
112       // > const url = new URL("git:/a");
113       // > url.pathname = '';
114       // > url.href
115       // => The result should be "git:/", instead of "git:".
116       // > url.pathname
117       // => The result should be "/", instead of "".
118       //
119       // URL Standard is https://url.spec.whatwg.org/#dom-url-pathname, however,
120       // it would take some time to understand why url.pathname ends up as "/"
121       // in this case. Please read the URL Standard carefully to understand
122       // that.
123       new_parsed.path.begin = output.length();
124       output.push_back('/');
125       new_parsed.path.len = output.length() - new_parsed.path.begin;
126     } else {
127       success &=
128           CanonicalizePath(source.path, parsed.path, CanonMode::kNonSpecialURL,
129                            &output, &new_parsed.path);
130       if (!parsed.host.is_valid() && new_parsed.path.is_valid() &&
131           new_parsed.path.as_string_view_on(output.view().data())
132               .starts_with("//")) {
133         // To avoid path being treated as the host, prepend "/." to the path".
134         //
135         // Examples:
136         //
137         // > const url = new URL("git:/.//a");
138         // > url.href
139         // => The result should be "git:/.//a", instead of "git://a".
140         //
141         // > const url = new URL("git:/");
142         // > url.pathname = "/.//a"
143         // > url.href
144         // => The result should be "git:/.//a", instead of "git://a".
145         //
146         // URL Standard: https://url.spec.whatwg.org/#concept-url-serializer
147         //
148         // > 3. If url’s host is null, url does not have an opaque path, url’s
149         // > path’s size is greater than 1, and url’s path[0] is the empty
150         // > string, then append U+002F (/) followed by U+002E (.) to output.
151         //
152         // Since the path length is unknown in advance, we post-process the new
153         // path here. This case is likely to be infrequent, so the performance
154         // impact should be minimal.
155         size_t prior_output_length = output.length();
156         output.Insert(new_parsed.path.begin, "/.");
157         // Adjust path.
158         new_parsed.path.begin += output.length() - prior_output_length;
159       }
160     }
161   } else {
162     new_parsed.path.reset();
163   }
164 
165   // Query
166   CanonicalizeQuery(source.query, parsed.query, query_converter, &output,
167                     &new_parsed.query);
168 
169   // Ref: ignore failure for this, since the page can probably still be loaded.
170   CanonicalizeRef(source.ref, parsed.ref, &output, &new_parsed.ref);
171 
172   // Carry over the flag for potentially dangling markup:
173   if (parsed.potentially_dangling_markup) {
174     new_parsed.potentially_dangling_markup = true;
175   }
176 
177   return success;
178 }
179 
180 }  // namespace
181 
CanonicalizeNonSpecialURL(const char * spec,int spec_len,const Parsed & parsed,CharsetConverter * query_converter,CanonOutput & output,Parsed & new_parsed)182 bool CanonicalizeNonSpecialURL(const char* spec,
183                                int spec_len,
184                                const Parsed& parsed,
185                                CharsetConverter* query_converter,
186                                CanonOutput& output,
187                                Parsed& new_parsed) {
188   // Carry over the flag.
189   new_parsed.has_opaque_path = parsed.has_opaque_path;
190 
191   if (parsed.has_opaque_path) {
192     return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
193   }
194   return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
195                                      query_converter, output, new_parsed);
196 }
197 
CanonicalizeNonSpecialURL(const char16_t * spec,int spec_len,const Parsed & parsed,CharsetConverter * query_converter,CanonOutput & output,Parsed & new_parsed)198 bool CanonicalizeNonSpecialURL(const char16_t* spec,
199                                int spec_len,
200                                const Parsed& parsed,
201                                CharsetConverter* query_converter,
202                                CanonOutput& output,
203                                Parsed& new_parsed) {
204   // Carry over the flag.
205   new_parsed.has_opaque_path = parsed.has_opaque_path;
206 
207   if (parsed.has_opaque_path) {
208     return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
209   }
210   return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
211                                      query_converter, output, new_parsed);
212 }
213 
ReplaceNonSpecialURL(const char * base,const Parsed & base_parsed,const Replacements<char> & replacements,CharsetConverter * query_converter,CanonOutput & output,Parsed & new_parsed)214 bool ReplaceNonSpecialURL(const char* base,
215                           const Parsed& base_parsed,
216                           const Replacements<char>& replacements,
217                           CharsetConverter* query_converter,
218                           CanonOutput& output,
219                           Parsed& new_parsed) {
220   if (base_parsed.has_opaque_path) {
221     return ReplacePathURL(base, base_parsed, replacements, &output,
222                           &new_parsed);
223   }
224 
225   URLComponentSource<char> source(base);
226   Parsed parsed(base_parsed);
227   SetupOverrideComponents(base, replacements, &source, &parsed);
228   return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output,
229                                      new_parsed);
230 }
231 
232 // For 16-bit replacements, we turn all the replacements into UTF-8 so the
233 // regular code path can be used.
ReplaceNonSpecialURL(const char * base,const Parsed & base_parsed,const Replacements<char16_t> & replacements,CharsetConverter * query_converter,CanonOutput & output,Parsed & new_parsed)234 bool ReplaceNonSpecialURL(const char* base,
235                           const Parsed& base_parsed,
236                           const Replacements<char16_t>& replacements,
237                           CharsetConverter* query_converter,
238                           CanonOutput& output,
239                           Parsed& new_parsed) {
240   if (base_parsed.has_opaque_path) {
241     return ReplacePathURL(base, base_parsed, replacements, &output,
242                           &new_parsed);
243   }
244 
245   RawCanonOutput<1024> utf8;
246   URLComponentSource<char> source(base);
247   Parsed parsed(base_parsed);
248   SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
249   return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output,
250                                      new_parsed);
251 }
252 
253 }  // namespace url
254