1 // Copyright 2023 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Functions to canonicalize non-special URLs.
6
7 #include "url/url_canon.h"
8 #include "url/url_canon_internal.h"
9
10 namespace url {
11
12 namespace {
13
14 template <typename CHAR>
DoCanonicalizeNonSpecialURL(const URLComponentSource<CHAR> & source,const Parsed & parsed,CharsetConverter * query_converter,CanonOutput & output,Parsed & new_parsed)15 bool DoCanonicalizeNonSpecialURL(const URLComponentSource<CHAR>& source,
16 const Parsed& parsed,
17 CharsetConverter* query_converter,
18 CanonOutput& output,
19 Parsed& new_parsed) {
20 // The implementation is similar to `DoCanonicalizeStandardURL()`, but there
21 // are many subtle differences. So we have a different function for
22 // canonicalizing non-special URLs.
23 //
24 // Since canonicalization is also used from url::ReplaceComponents(),
25 // we have to handle an invalid URL replacement here, such as:
26 //
27 // > const url = "git:///";
28 // > url.username = "x";
29 // > url.href
30 // "git:///" (this should not be "git://x@").
31
32 DCHECK(!parsed.has_opaque_path);
33
34 // Scheme: this will append the colon.
35 bool success = CanonicalizeScheme(source.scheme, parsed.scheme, &output,
36 &new_parsed.scheme);
37 bool have_authority =
38 (parsed.username.is_valid() || parsed.password.is_valid() ||
39 parsed.host.is_valid() || parsed.port.is_valid());
40
41 // Non-special URL examples which should be carefully handled:
42 //
43 // | URL | parsed.user | parsed.host | have_authority | Valid URL? |
44 // |----------+---------------+---------------+----------------+------------|
45 // | git:/a | invalid | invalid | false | valid |
46 // | git://@/ | valid (empty) | invalid | true | invalid |
47 // | git:/// | invalid | valid (empty) | true | valid |
48
49 if (have_authority) {
50 // Only write the authority separators when we have a scheme.
51 if (parsed.scheme.is_valid()) {
52 output.push_back('/');
53 output.push_back('/');
54 }
55
56 // Username and Password
57 //
58 // URL Standard:
59 // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port
60 // - https://url.spec.whatwg.org/#dom-url-username
61 // - https://url.spec.whatwg.org/#dom-url-password
62 if (parsed.host.is_nonempty()) {
63 // User info: the canonicalizer will handle the : and @.
64 success &= CanonicalizeUserInfo(
65 source.username, parsed.username, source.password, parsed.password,
66 &output, &new_parsed.username, &new_parsed.password);
67 } else {
68 new_parsed.username.reset();
69 new_parsed.password.reset();
70 }
71
72 // Host
73 if (parsed.host.is_valid()) {
74 success &= CanonicalizeNonSpecialHost(source.host, parsed.host, output,
75 new_parsed.host);
76 } else {
77 new_parsed.host.reset();
78 // URL is invalid if `have_authority` is true, but `parsed.host` is
79 // invalid. Example: "git://@/".
80 success = false;
81 }
82
83 // Port
84 //
85 // URL Standard:
86 // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port
87 // - https://url.spec.whatwg.org/#dom-url-port
88 if (parsed.host.is_nonempty()) {
89 success &= CanonicalizePort(source.port, parsed.port, PORT_UNSPECIFIED,
90 &output, &new_parsed.port);
91 } else {
92 new_parsed.port.reset();
93 }
94 } else {
95 // No authority, clear the components.
96 new_parsed.host.reset();
97 new_parsed.username.reset();
98 new_parsed.password.reset();
99 new_parsed.port.reset();
100 }
101
102 // Path
103 if (parsed.path.is_valid()) {
104 if (!parsed.host.is_valid() && parsed.path.is_empty()) {
105 // Handle an edge case: Replacing non-special path-only URL's pathname
106 // with an empty path.
107 //
108 // Path-only non-special URLs cannot have their paths erased.
109 //
110 // Example:
111 //
112 // > const url = new URL("git:/a");
113 // > url.pathname = '';
114 // > url.href
115 // => The result should be "git:/", instead of "git:".
116 // > url.pathname
117 // => The result should be "/", instead of "".
118 //
119 // URL Standard is https://url.spec.whatwg.org/#dom-url-pathname, however,
120 // it would take some time to understand why url.pathname ends up as "/"
121 // in this case. Please read the URL Standard carefully to understand
122 // that.
123 new_parsed.path.begin = output.length();
124 output.push_back('/');
125 new_parsed.path.len = output.length() - new_parsed.path.begin;
126 } else {
127 success &=
128 CanonicalizePath(source.path, parsed.path, CanonMode::kNonSpecialURL,
129 &output, &new_parsed.path);
130 if (!parsed.host.is_valid() && new_parsed.path.is_valid() &&
131 new_parsed.path.as_string_view_on(output.view().data())
132 .starts_with("//")) {
133 // To avoid path being treated as the host, prepend "/." to the path".
134 //
135 // Examples:
136 //
137 // > const url = new URL("git:/.//a");
138 // > url.href
139 // => The result should be "git:/.//a", instead of "git://a".
140 //
141 // > const url = new URL("git:/");
142 // > url.pathname = "/.//a"
143 // > url.href
144 // => The result should be "git:/.//a", instead of "git://a".
145 //
146 // URL Standard: https://url.spec.whatwg.org/#concept-url-serializer
147 //
148 // > 3. If url’s host is null, url does not have an opaque path, url’s
149 // > path’s size is greater than 1, and url’s path[0] is the empty
150 // > string, then append U+002F (/) followed by U+002E (.) to output.
151 //
152 // Since the path length is unknown in advance, we post-process the new
153 // path here. This case is likely to be infrequent, so the performance
154 // impact should be minimal.
155 size_t prior_output_length = output.length();
156 output.Insert(new_parsed.path.begin, "/.");
157 // Adjust path.
158 new_parsed.path.begin += output.length() - prior_output_length;
159 }
160 }
161 } else {
162 new_parsed.path.reset();
163 }
164
165 // Query
166 CanonicalizeQuery(source.query, parsed.query, query_converter, &output,
167 &new_parsed.query);
168
169 // Ref: ignore failure for this, since the page can probably still be loaded.
170 CanonicalizeRef(source.ref, parsed.ref, &output, &new_parsed.ref);
171
172 // Carry over the flag for potentially dangling markup:
173 if (parsed.potentially_dangling_markup) {
174 new_parsed.potentially_dangling_markup = true;
175 }
176
177 return success;
178 }
179
180 } // namespace
181
CanonicalizeNonSpecialURL(const char * spec,int spec_len,const Parsed & parsed,CharsetConverter * query_converter,CanonOutput & output,Parsed & new_parsed)182 bool CanonicalizeNonSpecialURL(const char* spec,
183 int spec_len,
184 const Parsed& parsed,
185 CharsetConverter* query_converter,
186 CanonOutput& output,
187 Parsed& new_parsed) {
188 // Carry over the flag.
189 new_parsed.has_opaque_path = parsed.has_opaque_path;
190
191 if (parsed.has_opaque_path) {
192 return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
193 }
194 return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
195 query_converter, output, new_parsed);
196 }
197
CanonicalizeNonSpecialURL(const char16_t * spec,int spec_len,const Parsed & parsed,CharsetConverter * query_converter,CanonOutput & output,Parsed & new_parsed)198 bool CanonicalizeNonSpecialURL(const char16_t* spec,
199 int spec_len,
200 const Parsed& parsed,
201 CharsetConverter* query_converter,
202 CanonOutput& output,
203 Parsed& new_parsed) {
204 // Carry over the flag.
205 new_parsed.has_opaque_path = parsed.has_opaque_path;
206
207 if (parsed.has_opaque_path) {
208 return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
209 }
210 return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
211 query_converter, output, new_parsed);
212 }
213
ReplaceNonSpecialURL(const char * base,const Parsed & base_parsed,const Replacements<char> & replacements,CharsetConverter * query_converter,CanonOutput & output,Parsed & new_parsed)214 bool ReplaceNonSpecialURL(const char* base,
215 const Parsed& base_parsed,
216 const Replacements<char>& replacements,
217 CharsetConverter* query_converter,
218 CanonOutput& output,
219 Parsed& new_parsed) {
220 if (base_parsed.has_opaque_path) {
221 return ReplacePathURL(base, base_parsed, replacements, &output,
222 &new_parsed);
223 }
224
225 URLComponentSource<char> source(base);
226 Parsed parsed(base_parsed);
227 SetupOverrideComponents(base, replacements, &source, &parsed);
228 return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output,
229 new_parsed);
230 }
231
232 // For 16-bit replacements, we turn all the replacements into UTF-8 so the
233 // regular code path can be used.
ReplaceNonSpecialURL(const char * base,const Parsed & base_parsed,const Replacements<char16_t> & replacements,CharsetConverter * query_converter,CanonOutput & output,Parsed & new_parsed)234 bool ReplaceNonSpecialURL(const char* base,
235 const Parsed& base_parsed,
236 const Replacements<char16_t>& replacements,
237 CharsetConverter* query_converter,
238 CanonOutput& output,
239 Parsed& new_parsed) {
240 if (base_parsed.has_opaque_path) {
241 return ReplacePathURL(base, base_parsed, replacements, &output,
242 &new_parsed);
243 }
244
245 RawCanonOutput<1024> utf8;
246 URLComponentSource<char> source(base);
247 Parsed parsed(base_parsed);
248 SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
249 return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output,
250 new_parsed);
251 }
252
253 } // namespace url
254