1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Canonicalizers for random bits that aren't big enough for their own files.
6
7 #include <string.h>
8
9 #include "url/url_canon.h"
10 #include "url/url_canon_internal.h"
11
12 namespace url {
13
14 namespace {
15
// Reports whether |ch| is one of the three characters that get stripped from
// the interior of a URL: horizontal tab, line feed, or carriage return.
inline bool IsRemovableURLWhitespace(int ch) {
  switch (ch) {
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
21
22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
23 // It sucks that we have to do this, since this takes about 13% of the total URL
24 // canonicalization time.
25 template <typename CHAR>
DoRemoveURLWhitespace(const CHAR * input,int input_len,CanonOutputT<CHAR> * buffer,int * output_len,bool * potentially_dangling_markup)26 const CHAR* DoRemoveURLWhitespace(const CHAR* input,
27 int input_len,
28 CanonOutputT<CHAR>* buffer,
29 int* output_len,
30 bool* potentially_dangling_markup) {
31 // Fast verification that there's nothing that needs removal. This is the 99%
32 // case, so we want it to be fast and don't care about impacting the speed
33 // when we do find whitespace.
34 bool found_whitespace = false;
35 if (sizeof(*input) == 1 && input_len >= kMinimumLengthForSIMD) {
36 // For large strings, memchr is much faster than any scalar code we can
37 // write, even if we need to run it three times. (If this turns out to still
38 // be a bottleneck, we could write our own vector code, but given that
39 // memchr is so fast, it's unlikely to be relevant.)
40 found_whitespace = memchr(input, '\n', input_len) != nullptr ||
41 memchr(input, '\r', input_len) != nullptr ||
42 memchr(input, '\t', input_len) != nullptr;
43 } else {
44 for (int i = 0; i < input_len; i++) {
45 if (!IsRemovableURLWhitespace(input[i]))
46 continue;
47 found_whitespace = true;
48 break;
49 }
50 }
51
52 if (!found_whitespace) {
53 // Didn't find any whitespace, we don't need to do anything. We can just
54 // return the input as the output.
55 *output_len = input_len;
56 return input;
57 }
58
59 // Skip whitespace removal for `data:` URLs.
60 //
61 // TODO(mkwst): Ideally, this would use something like `base::StartsWith`, but
62 // that turns out to be difficult to do correctly given this function's
63 // character type templating.
64 if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' &&
65 input[3] == 'a' && input[4] == ':') {
66 *output_len = input_len;
67 return input;
68 }
69
70 // Remove the whitespace into the new buffer and return it.
71 for (int i = 0; i < input_len; i++) {
72 if (!IsRemovableURLWhitespace(input[i])) {
73 if (potentially_dangling_markup && input[i] == 0x3C)
74 *potentially_dangling_markup = true;
75 buffer->push_back(input[i]);
76 }
77 }
78 *output_len = buffer->length();
79 return buffer->data();
80 }
81
// Maps each 7-bit input character to the character emitted for it inside a
// canonical scheme. Letters map to their lower-case form; digits, '+', '-' and
// '.' map to themselves. A 0 entry means the character is not permitted in a
// scheme.
// clang-format off
constexpr char kSchemeCanonical[0x80] = {
    // 0x00-0x1F: control characters, all invalid.
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20-0x27: ' ' ! " # $ % & '
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x28-0x2F: ( ) * + , - . /
    0, 0, 0, '+', 0, '-', '.', 0,
    // 0x30-0x37: 0 1 2 3 4 5 6 7
    '0', '1', '2', '3', '4', '5', '6', '7',
    // 0x38-0x3F: 8 9 : ; < = > ?
    '8', '9', 0, 0, 0, 0, 0, 0,
    // 0x40-0x47: @ A B C D E F G
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x48-0x4F: H I J K L M N O
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50-0x57: P Q R S T U V W
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x58-0x5F: X Y Z [ \ ] ^ _
    'x', 'y', 'z', 0, 0, 0, 0, 0,
    // 0x60-0x67: ` a b c d e f g
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x68-0x6F: h i j k l m n o
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70-0x77: p q r s t u v w
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x78-0x7F: x y z { | } ~ DEL
    'x', 'y', 'z', 0, 0, 0, 0, 0,
};
// clang-format on
103
// Reports whether |c| may begin a scheme, i.e. whether it is an ASCII letter
// of either case. Kept as a plain range check rather than folded into the
// lookup table: it runs only once per URL, and the table stays easier to read
// without an extra flag bit in every entry.
inline bool IsSchemeFirstChar(unsigned char c) {
  const bool is_lower_alpha = 'a' <= c && c <= 'z';
  const bool is_upper_alpha = 'A' <= c && c <= 'Z';
  return is_lower_alpha || is_upper_alpha;
}
110
111 template <typename CHAR, typename UCHAR>
DoScheme(const CHAR * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)112 bool DoScheme(const CHAR* spec,
113 const Component& scheme,
114 CanonOutput* output,
115 Component* out_scheme) {
116 if (scheme.is_empty()) {
117 // Scheme is unspecified or empty, convert to empty by appending a colon.
118 *out_scheme = Component(output->length(), 0);
119 output->push_back(':');
120 return false;
121 }
122
123 // The output scheme starts from the current position.
124 out_scheme->begin = output->length();
125
126 // Danger: it's important that this code does not strip any characters;
127 // it only emits the canonical version (be it valid or escaped) for each
128 // of the input characters. Stripping would put it out of sync with
129 // FindAndCompareScheme, which could cause some security checks on
130 // schemes to be incorrect.
131 bool success = true;
132 size_t begin = static_cast<size_t>(scheme.begin);
133 size_t end = static_cast<size_t>(scheme.end());
134 for (size_t i = begin; i < end; i++) {
135 UCHAR ch = static_cast<UCHAR>(spec[i]);
136 char replacement = 0;
137 if (ch < 0x80) {
138 if (i == begin) {
139 // Need to do a special check for the first letter of the scheme.
140 if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
141 replacement = kSchemeCanonical[ch];
142 } else {
143 replacement = kSchemeCanonical[ch];
144 }
145 }
146
147 if (replacement) {
148 output->push_back(replacement);
149 } else if (ch == '%') {
150 // Canonicalizing the scheme multiple times should lead to the same
151 // result. Since invalid characters will be escaped, we need to preserve
152 // the percent to avoid multiple escaping. The scheme will be invalid.
153 success = false;
154 output->push_back('%');
155 } else {
156 // Invalid character, store it but mark this scheme as invalid.
157 success = false;
158
159 // This will escape the output and also handle encoding issues.
160 // Ignore the return value since we already failed.
161 AppendUTF8EscapedChar(spec, &i, end, output);
162 }
163 }
164
165 // The output scheme ends with the the current position, before appending
166 // the colon.
167 out_scheme->len = output->length() - out_scheme->begin;
168 output->push_back(':');
169 return success;
170 }
171
172 // The username and password components reference ranges in the corresponding
173 // *_spec strings. Typically, these specs will be the same (we're
174 // canonicalizing a single source string), but may be different when
175 // replacing components.
176 template <typename CHAR, typename UCHAR>
DoUserInfo(const CHAR * username_spec,const Component & username,const CHAR * password_spec,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)177 bool DoUserInfo(const CHAR* username_spec,
178 const Component& username,
179 const CHAR* password_spec,
180 const Component& password,
181 CanonOutput* output,
182 Component* out_username,
183 Component* out_password) {
184 if (username.is_empty() && password.is_empty()) {
185 // Common case: no user info. We strip empty username/passwords.
186 *out_username = Component();
187 *out_password = Component();
188 return true;
189 }
190
191 // Write the username.
192 out_username->begin = output->length();
193 if (username.is_nonempty()) {
194 // This will escape characters not valid for the username.
195 AppendStringOfType(&username_spec[username.begin],
196 static_cast<size_t>(username.len), CHAR_USERINFO,
197 output);
198 }
199 out_username->len = output->length() - out_username->begin;
200
201 // When there is a password, we need the separator. Note that we strip
202 // empty but specified passwords.
203 if (password.is_nonempty()) {
204 output->push_back(':');
205 out_password->begin = output->length();
206 AppendStringOfType(&password_spec[password.begin],
207 static_cast<size_t>(password.len), CHAR_USERINFO,
208 output);
209 out_password->len = output->length() - out_password->begin;
210 } else {
211 *out_password = Component();
212 }
213
214 output->push_back('@');
215 return true;
216 }
217
218 // Helper functions for converting port integers to strings.
WritePortInt(char * output,int output_len,int port)219 inline void WritePortInt(char* output, int output_len, int port) {
220 _itoa_s(port, output, output_len, 10);
221 }
222
223 // This function will prepend the colon if there will be a port.
224 template <typename CHAR, typename UCHAR>
DoPort(const CHAR * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)225 bool DoPort(const CHAR* spec,
226 const Component& port,
227 int default_port_for_scheme,
228 CanonOutput* output,
229 Component* out_port) {
230 int port_num = ParsePort(spec, port);
231 if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
232 *out_port = Component();
233 return true; // Leave port empty.
234 }
235
236 if (port_num == PORT_INVALID) {
237 // Invalid port: We'll copy the text from the input so the user can see
238 // what the error was, and mark the URL as invalid by returning false.
239 output->push_back(':');
240 out_port->begin = output->length();
241 AppendInvalidNarrowString(spec, static_cast<size_t>(port.begin),
242 static_cast<size_t>(port.end()), output);
243 out_port->len = output->length() - out_port->begin;
244 return false;
245 }
246
247 // Convert port number back to an integer. Max port value is 5 digits, and
248 // the Parsed::ExtractPort will have made sure the integer is in range.
249 const int buf_size = 6;
250 char buf[buf_size];
251 WritePortInt(buf, buf_size, port_num);
252
253 // Append the port number to the output, preceded by a colon.
254 output->push_back(':');
255 out_port->begin = output->length();
256 for (int i = 0; i < buf_size && buf[i]; i++)
257 output->push_back(buf[i]);
258
259 out_port->len = output->length() - out_port->begin;
260 return true;
261 }
262
// clang-format off
// Lookup table over the 7-bit range: an entry is true when the character
// belongs to the fragment percent-encode set and therefore has to be
// percent-escaped inside a fragment.
// https://url.spec.whatwg.org/#fragment-percent-encode-set
constexpr bool kShouldEscapeCharInFragment[0x80] = {
    // 0x00-0x1F: control characters, always escaped.
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    // 0x20-0x27: ' ' ! " # $ % & '
    true,  false, true,  false, false, false, false, false,
    // 0x28-0x2F: ( ) * + , - . /
    false, false, false, false, false, false, false, false,
    // 0x30-0x37: 0 1 2 3 4 5 6 7
    false, false, false, false, false, false, false, false,
    // 0x38-0x3F: 8 9 : ; < = > ?
    false, false, false, false, true,  false, true,  false,
    // 0x40-0x47: @ A B C D E F G
    false, false, false, false, false, false, false, false,
    // 0x48-0x4F: H I J K L M N O
    false, false, false, false, false, false, false, false,
    // 0x50-0x57: P Q R S T U V W
    false, false, false, false, false, false, false, false,
    // 0x58-0x5F: X Y Z [ \ ] ^ _
    false, false, false, false, false, false, false, false,
    // 0x60-0x67: ` a b c d e f g
    true,  false, false, false, false, false, false, false,
    // 0x68-0x6F: h i j k l m n o
    false, false, false, false, false, false, false, false,
    // 0x70-0x77: p q r s t u v w
    false, false, false, false, false, false, false, false,
    // 0x78-0x7F: x y z { | } ~ DEL
    false, false, false, false, false, false, false, true,
};
// clang-format on
298
299 template <typename CHAR, typename UCHAR>
DoCanonicalizeRef(const CHAR * spec,const Component & ref,CanonOutput * output,Component * out_ref)300 void DoCanonicalizeRef(const CHAR* spec,
301 const Component& ref,
302 CanonOutput* output,
303 Component* out_ref) {
304 if (!ref.is_valid()) {
305 // Common case of no ref.
306 *out_ref = Component();
307 return;
308 }
309
310 // Append the ref separator. Note that we need to do this even when the ref
311 // is empty but present.
312 output->push_back('#');
313 out_ref->begin = output->length();
314
315 // Now iterate through all the characters, converting to UTF-8 and validating.
316 size_t end = static_cast<size_t>(ref.end());
317 for (size_t i = static_cast<size_t>(ref.begin); i < end; i++) {
318 UCHAR current_char = static_cast<UCHAR>(spec[i]);
319 if (current_char < 0x80) {
320 if (kShouldEscapeCharInFragment[current_char])
321 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
322 else
323 output->push_back(static_cast<char>(spec[i]));
324 } else {
325 AppendUTF8EscapedChar(spec, &i, end, output);
326 }
327 }
328
329 out_ref->len = output->length() - out_ref->begin;
330 }
331
332 } // namespace
333
RemoveURLWhitespace(const char * input,int input_len,CanonOutputT<char> * buffer,int * output_len,bool * potentially_dangling_markup)334 const char* RemoveURLWhitespace(const char* input,
335 int input_len,
336 CanonOutputT<char>* buffer,
337 int* output_len,
338 bool* potentially_dangling_markup) {
339 return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
340 potentially_dangling_markup);
341 }
342
RemoveURLWhitespace(const char16_t * input,int input_len,CanonOutputT<char16_t> * buffer,int * output_len,bool * potentially_dangling_markup)343 const char16_t* RemoveURLWhitespace(const char16_t* input,
344 int input_len,
345 CanonOutputT<char16_t>* buffer,
346 int* output_len,
347 bool* potentially_dangling_markup) {
348 return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
349 potentially_dangling_markup);
350 }
351
CanonicalSchemeChar(char16_t ch)352 char CanonicalSchemeChar(char16_t ch) {
353 if (ch >= 0x80)
354 return 0; // Non-ASCII is not supported by schemes.
355 return kSchemeCanonical[ch];
356 }
357
CanonicalizeScheme(const char * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)358 bool CanonicalizeScheme(const char* spec,
359 const Component& scheme,
360 CanonOutput* output,
361 Component* out_scheme) {
362 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
363 }
364
CanonicalizeScheme(const char16_t * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)365 bool CanonicalizeScheme(const char16_t* spec,
366 const Component& scheme,
367 CanonOutput* output,
368 Component* out_scheme) {
369 return DoScheme<char16_t, char16_t>(spec, scheme, output, out_scheme);
370 }
371
CanonicalizeUserInfo(const char * username_source,const Component & username,const char * password_source,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)372 bool CanonicalizeUserInfo(const char* username_source,
373 const Component& username,
374 const char* password_source,
375 const Component& password,
376 CanonOutput* output,
377 Component* out_username,
378 Component* out_password) {
379 return DoUserInfo<char, unsigned char>(username_source, username,
380 password_source, password, output,
381 out_username, out_password);
382 }
383
CanonicalizeUserInfo(const char16_t * username_source,const Component & username,const char16_t * password_source,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)384 bool CanonicalizeUserInfo(const char16_t* username_source,
385 const Component& username,
386 const char16_t* password_source,
387 const Component& password,
388 CanonOutput* output,
389 Component* out_username,
390 Component* out_password) {
391 return DoUserInfo<char16_t, char16_t>(username_source, username,
392 password_source, password, output,
393 out_username, out_password);
394 }
395
CanonicalizePort(const char * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)396 bool CanonicalizePort(const char* spec,
397 const Component& port,
398 int default_port_for_scheme,
399 CanonOutput* output,
400 Component* out_port) {
401 return DoPort<char, unsigned char>(spec, port, default_port_for_scheme,
402 output, out_port);
403 }
404
CanonicalizePort(const char16_t * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)405 bool CanonicalizePort(const char16_t* spec,
406 const Component& port,
407 int default_port_for_scheme,
408 CanonOutput* output,
409 Component* out_port) {
410 return DoPort<char16_t, char16_t>(spec, port, default_port_for_scheme, output,
411 out_port);
412 }
413
CanonicalizeRef(const char * spec,const Component & ref,CanonOutput * output,Component * out_ref)414 void CanonicalizeRef(const char* spec,
415 const Component& ref,
416 CanonOutput* output,
417 Component* out_ref) {
418 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
419 }
420
CanonicalizeRef(const char16_t * spec,const Component & ref,CanonOutput * output,Component * out_ref)421 void CanonicalizeRef(const char16_t* spec,
422 const Component& ref,
423 CanonOutput* output,
424 Component* out_ref) {
425 DoCanonicalizeRef<char16_t, char16_t>(spec, ref, output, out_ref);
426 }
427
428 } // namespace url
429