1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Detecting mime types is a tricky business because we need to balance
6 // compatibility concerns with security issues. Here is a survey of how other
7 // browsers behave and then a description of how we intend to behave.
8 //
9 // HTML payload, no Content-Type header:
10 // * IE 7: Render as HTML
11 // * Firefox 2: Render as HTML
12 // * Safari 3: Render as HTML
13 // * Opera 9: Render as HTML
14 //
15 // Here the choice seems clear:
16 // => Chrome: Render as HTML
17 //
18 // HTML payload, Content-Type: "text/plain":
19 // * IE 7: Render as HTML
20 // * Firefox 2: Render as text
21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
22 // has an HTML extension)
23 // * Opera 9: Render as text
24 //
25 // Here we choose to follow the majority (and break some compatibility with IE).
26 // Many folks dislike IE's behavior here.
27 // => Chrome: Render as text
28 // We generalize this as follows. If the Content-Type header is text/plain
29 // we won't detect dangerous mime types (those that can execute script).
30 //
31 // HTML payload, Content-Type: "application/octet-stream":
32 // * IE 7: Render as HTML
33 // * Firefox 2: Download as application/octet-stream
34 // * Safari 3: Render as HTML
35 // * Opera 9: Render as HTML
36 //
37 // We follow Firefox.
38 // => Chrome: Download as application/octet-stream
39 // One factor in this decision is that IIS 4 and 5 will send
40 // application/octet-stream for .xhtml files (because they don't recognize
41 // the extension). We did some experiments and it looks like this doesn't occur
42 // very often on the web. We choose the more secure option.
43 //
44 // GIF payload, no Content-Type header:
45 // * IE 7: Render as GIF
46 // * Firefox 2: Render as GIF
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//             URL has a GIF extension)
49 // * Opera 9: Render as GIF
50 //
51 // The choice is clear.
52 // => Chrome: Render as GIF
53 // Once we decide to render HTML without a Content-Type header, there isn't much
54 // reason not to render GIFs.
55 //
56 // GIF payload, Content-Type: "text/plain":
57 // * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//             URL has a GIF extension)
62 // * Opera 9: Render as GIF
63 //
64 // Displaying as text/plain makes little sense as the content will look like
65 // gibberish. Here, we could change our minds and download.
66 // => Chrome: Render as GIF
67 //
68 // GIF payload, Content-Type: "application/octet-stream":
69 // * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//             URL has a GIF extension)
74 // * Opera 9: Render as GIF
75 //
76 // We used to render as GIF here, but the problem is that some sites want to
77 // trigger downloads by sending application/octet-stream (even though they
78 // should be sending Content-Disposition: attachment). Although it is safe
79 // to render as GIF from a security perspective, we actually get better
// compatibility if we don't sniff from application/octet-stream at all.
81 // => Chrome: Download as application/octet-stream
82 //
83 // Note that our definition of HTML payload is much stricter than IE's
84 // definition and roughly the same as Firefox's definition.
85
86 #include <stdint.h>
87 #include <string>
88
89 #include "net/base/mime_sniffer.h"
90
91 #include "base/check_op.h"
92 #include "base/containers/span.h"
93 #include "base/notreached.h"
94 #include "base/strings/string_util.h"
95 #include "build/build_config.h"
96 #include "url/gurl.h"
97
98 namespace net {
99
// Upper bound, in bytes, on the length of any pattern in the magic number
// tables. MatchMagicNumber() DCHECKs each entry against this limit, so raise
// it when adding a longer magic number.
static constexpr size_t kBytesRequiredForMagic = 42;
103
// One row of a sniffing table: a byte pattern plus the MIME type reported when
// the start of the content matches that pattern. Rows are built positionally
// by the MAGIC_* macros, so the member order must not change.
struct MagicNumber {
  // MIME type assigned to the result when this entry matches.
  const char* const mime_type;
  // Pattern compared against the leading bytes of the content. For entries
  // with |is_string| == false, a '.' byte in the pattern matches any byte.
  const std::string_view magic;
  // True if the pattern is matched as a case-insensitive ASCII prefix; false
  // if it is matched byte by byte.
  bool is_string;
  const char* const mask;  // if set, must have same length as |magic|
};
110
// Builds a MagicNumber entry that is matched byte by byte with no mask.
// |magic| is intended to be a string literal: its length is taken with
// sizeof, so embedded '\0' bytes are kept and only the terminator is dropped.
#define MAGIC_NUMBER(mime_type, magic) \
  { (mime_type), std::string_view((magic), sizeof(magic) - 1), false, nullptr }
113
// Compile-time size check: instantiating this template with two different
// sizes fails the build; otherwise SIZES yields the common size.
template <int MagicSize, int MaskSize>
struct VerifySizes {
  static_assert(MagicSize == MaskSize, "sizes must be equal");

  enum { SIZES = MagicSize };
};
121
// Evaluates to sizeof(magic) and fails to compile unless sizeof(mask) is the
// same. Intended for string literals; the value includes the terminating
// '\0'.
#define verified_sizeof(magic, mask) \
  VerifySizes<sizeof(magic), sizeof(mask)>::SIZES
124
// Builds a MagicNumber entry that is matched byte by byte after ANDing each
// content byte with the corresponding byte of |mask|. |magic| and |mask| are
// intended to be string literals and must have the same size (checked at
// compile time by verified_sizeof).
#define MAGIC_MASK(mime_type, magic, mask)                                    \
  {                                                                           \
    (mime_type), std::string_view((magic), verified_sizeof(magic, mask) - 1), \
        false, (mask)                                                         \
  }
130
// Builds a MagicNumber entry that is matched as a case-insensitive ASCII
// prefix. Magic strings must not include '\0' characters, and '.' is not a
// wildcard for these entries.
#define MAGIC_STRING(mime_type, magic) \
  { (mime_type), std::string_view((magic), sizeof(magic) - 1), true, nullptr }
134
// The main magic number table, consulted by SniffForMagicNumbers() and
// SniffMimeTypeFromLocalData(). Entries are tried in order and the first match
// wins. A '.' in a pattern matches any single byte (e.g. the RIFF chunk size
// in the WebP entry).
static const MagicNumber kMagicNumbers[] = {
    // Source: HTML 5 specification
    MAGIC_NUMBER("application/pdf", "%PDF-"),
    MAGIC_NUMBER("application/postscript", "%!PS-Adobe-"),
    MAGIC_NUMBER("image/gif", "GIF87a"),
    MAGIC_NUMBER("image/gif", "GIF89a"),
    MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A"),
    MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF"),
    MAGIC_NUMBER("image/bmp", "BM"),
    // Source: Mozilla
    MAGIC_NUMBER("text/plain", "#!"),  // Script
    MAGIC_NUMBER("text/plain", "%!"),  // Script, similar to PS
    MAGIC_NUMBER("text/plain", "From"),
    MAGIC_NUMBER("text/plain", ">From"),
    // Chrome specific
    MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08"),
    MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46"),
    MAGIC_NUMBER("video/x-ms-asf",
                 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C"),
    MAGIC_NUMBER("image/tiff", "I I"),
    MAGIC_NUMBER("image/tiff", "II*"),
    MAGIC_NUMBER("image/tiff", "MM\x00*"),
    MAGIC_NUMBER("audio/mpeg", "ID3"),
    MAGIC_NUMBER("image/webp", "RIFF....WEBPVP"),
    MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3"),
    MAGIC_NUMBER("application/zip", "PK\x03\x04"),
    MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00"),
    MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A"),
    MAGIC_NUMBER("application/octet-stream", "MZ"),  // EXE
    // Sniffing for Flash:
    //
    //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS"),
    //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV"),
    //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS"),
    //
    // Including these magic number for Flash is a trade off.
    //
    // Pros:
    //   * Flash is an important and popular file format
    //
    // Cons:
    //   * These patterns are fairly weak
    //   * If we mistakenly decide something is Flash, we will execute it
    //     in the origin of an unsuspecting site. This could be a security
    //     vulnerability if the site allows users to upload content.
    //
    // On balance, we do not include these patterns.
};
183
// How many leading content bytes the Microsoft Office sniffers look at; this
// is the length of the longest pattern in kOfficeMagicNumbers.
static constexpr size_t kBytesRequiredForOfficeMagic = 8;
187
// Container signatures used by Microsoft Office documents. The "mime type" of
// these entries is not a real MIME type but an internal tag ("CFB" or "OOXML")
// that SniffForOfficeDocs() combines with the URL extension to pick the final
// type. Note that the OOXML signature is the same as the zip signature.
static const MagicNumber kOfficeMagicNumbers[] = {
    MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"),
    MAGIC_NUMBER("OOXML", "PK\x03\x04"),
};
192
// The Office application a URL extension maps to. DOC_TYPE_NONE means the
// extension is not a recognized Office extension.
enum OfficeDocType {
  DOC_TYPE_WORD,
  DOC_TYPE_EXCEL,
  DOC_TYPE_POWERPOINT,
  DOC_TYPE_NONE
};
199
// Associates a file extension with the Office document type it denotes. Built
// positionally by OFFICE_EXTENSION, so the member order must not change.
struct OfficeExtensionType {
  // Document type implied by |extension|.
  OfficeDocType doc_type;
  // Extension including the leading dot, e.g. ".doc".
  const std::string_view extension;
};
204
// Builds an OfficeExtensionType entry. |extension| is intended to be a string
// literal; its length is taken with sizeof, minus the terminating '\0'.
#define OFFICE_EXTENSION(type, extension) \
  { (type), std::string_view((extension), sizeof(extension) - 1) }
207
// URL path suffixes recognized as Office documents. SniffForOfficeDocs()
// compares each one case-insensitively against the end of the URL path and
// uses the first that matches.
static const OfficeExtensionType kOfficeExtensionTypes[] = {
    OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc"),
    OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls"),
    OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt"),
    OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx"),
    OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx"),
    OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx"),
};
216
// Additional magic numbers. In this file they are consulted only by
// SniffMimeTypeFromLocalData(), which tries them before kMagicNumbers. Entries
// are tried in order and the first match wins, so more specific patterns
// (e.g. "....ftyp3g") must precede more general ones (e.g. "....ftyp").
// A '.' in a pattern matches any single byte.
static const MagicNumber kExtraMagicNumbers[] = {
    MAGIC_NUMBER("image/x-xbitmap", "#define"),
    MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00"),
    MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt "),
    MAGIC_NUMBER("video/avi", "RIFF....AVI LIST"),
    MAGIC_NUMBER("audio/ogg", "OggS\0"),
    MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0"),
    MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0"),
    MAGIC_NUMBER("video/3gpp", "....ftyp3g"),
    MAGIC_NUMBER("video/3gpp", "....ftypavcl"),
    MAGIC_NUMBER("video/mp4", "....ftyp"),
    MAGIC_NUMBER("video/quicktime", "....moov"),
    MAGIC_NUMBER("application/x-shockwave-flash", "CWS"),
    MAGIC_NUMBER("application/x-shockwave-flash", "FWS"),
    MAGIC_NUMBER("video/x-flv", "FLV"),
    MAGIC_NUMBER("audio/x-flac", "fLaC"),
    // Per https://tools.ietf.org/html/rfc3267#section-8.1
    MAGIC_NUMBER("audio/amr", "#!AMR\n"),

    // RAW image types.
    MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR"),
    MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR"),
    MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM"),
    MAGIC_NUMBER("image/x-olympus-orf", "MMOR"),  // big-endian
    MAGIC_NUMBER("image/x-olympus-orf", "IIRO"),  // little-endian
    MAGIC_NUMBER("image/x-olympus-orf", "IIRS"),  // little-endian
    MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW "),
    MAGIC_NUMBER("image/x-panasonic-raw",
                 "IIU\x00\x08\x00\x00\x00"),  // Panasonic .raw
    MAGIC_NUMBER("image/x-panasonic-raw",
                 "IIU\x00\x18\x00\x00\x00"),  // Panasonic .rw2
    MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw"),
    MAGIC_NUMBER("image/x-x3f", "FOVb"),
};
251
// Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will
// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
// HTML, but we will not.

// Builds a case-insensitive "text/html" entry for content that starts with
// "<" followed by |tag|. |tag| must be a string literal because it is joined
// to "<" by literal concatenation.
#define MAGIC_HTML_TAG(tag) \
  MAGIC_STRING("text/html", "<" tag)
258
// Prefixes that SniffForHTML() looks for after skipping leading whitespace.
// The HTML entries are case-insensitive prefix matches, so e.g. "<b" also
// matches any content that starts with "<body". The "<?xml" entry is matched
// case-sensitively. Entries are tried in order and the first match wins.
static const MagicNumber kSniffableTags[] = {
    // XML processing directive. Although this is not an HTML mime type, we sniff
    // for this in the HTML phase because text/xml is just as powerful as HTML and
    // we want to leverage our white space skipping technology.
    MAGIC_NUMBER("text/xml", "<?xml"),  // Mozilla
    // DOCTYPEs
    MAGIC_HTML_TAG("!DOCTYPE html"),  // HTML5 spec
    // Sniffable tags, ordered by how often they occur in sniffable documents.
    MAGIC_HTML_TAG("script"),  // HTML5 spec, Mozilla
    MAGIC_HTML_TAG("html"),    // HTML5 spec, Mozilla
    MAGIC_HTML_TAG("!--"),
    MAGIC_HTML_TAG("head"),    // HTML5 spec, Mozilla
    MAGIC_HTML_TAG("iframe"),  // Mozilla
    MAGIC_HTML_TAG("h1"),      // Mozilla
    MAGIC_HTML_TAG("div"),     // Mozilla
    MAGIC_HTML_TAG("font"),    // Mozilla
    MAGIC_HTML_TAG("table"),   // Mozilla
    MAGIC_HTML_TAG("a"),       // Mozilla
    MAGIC_HTML_TAG("style"),   // Mozilla
    MAGIC_HTML_TAG("title"),   // Mozilla
    MAGIC_HTML_TAG("b"),       // Mozilla
    MAGIC_HTML_TAG("body"),    // Mozilla
    MAGIC_HTML_TAG("br"),
    MAGIC_HTML_TAG("p"),  // Mozilla
};
284
285 // Compare content header to a magic number where magic_entry can contain '.'
286 // for single character of anything, allowing some bytes to be skipped.
MagicCmp(std::string_view content,std::string_view magic_entry)287 static bool MagicCmp(std::string_view content, std::string_view magic_entry) {
288 DCHECK_GE(content.length(), magic_entry.length());
289
290 for (size_t i = 0; i < magic_entry.length(); ++i) {
291 if (magic_entry[i] != '.' && magic_entry[i] != content[i])
292 return false;
293 }
294 return true;
295 }
296
297 // Like MagicCmp() except that it ANDs each byte with a mask before
298 // the comparison, because there are some bits we don't care about.
MagicMaskCmp(std::string_view content,std::string_view magic_entry,std::string_view magic_mask)299 static bool MagicMaskCmp(std::string_view content,
300 std::string_view magic_entry,
301 std::string_view magic_mask) {
302 DCHECK_GE(content.length(), magic_entry.length());
303
304 for (size_t i = 0; i < magic_entry.length(); ++i) {
305 if (magic_entry[i] != '.' && magic_entry[i] != (magic_mask[i] & content[i]))
306 return false;
307 }
308 return true;
309 }
310
MatchMagicNumber(std::string_view content,const MagicNumber & magic_entry,std::string * result)311 static bool MatchMagicNumber(std::string_view content,
312 const MagicNumber& magic_entry,
313 std::string* result) {
314 // Keep kBytesRequiredForMagic honest.
315 DCHECK_LE(magic_entry.magic.length(), kBytesRequiredForMagic);
316
317 bool match = false;
318 if (content.length() >= magic_entry.magic.length()) {
319 if (magic_entry.is_string) {
320 // Consistency check - string entries should have no embedded nulls.
321 DCHECK_EQ(std::string_view::npos, magic_entry.magic.find('\0'));
322
323 // Do a case-insensitive prefix comparison.
324 match = base::StartsWith(content, magic_entry.magic,
325 base::CompareCase::INSENSITIVE_ASCII);
326 } else if (!magic_entry.mask) {
327 match = MagicCmp(content, magic_entry.magic);
328 } else {
329 std::string_view magic_mask(magic_entry.mask, magic_entry.magic.length());
330 match = MagicMaskCmp(content, magic_entry.magic, magic_mask);
331 }
332 }
333
334 if (match) {
335 result->assign(magic_entry.mime_type);
336 return true;
337 }
338 return false;
339 }
340
CheckForMagicNumbers(std::string_view content,base::span<const MagicNumber> magic_numbers,std::string * result)341 static bool CheckForMagicNumbers(std::string_view content,
342 base::span<const MagicNumber> magic_numbers,
343 std::string* result) {
344 for (const MagicNumber& magic : magic_numbers) {
345 if (MatchMagicNumber(content, magic, result))
346 return true;
347 }
348 return false;
349 }
350
351 // Truncates |string_piece| to length |max_size| and returns true if
352 // |string_piece| is now exactly |max_size|.
TruncateStringPiece(const size_t max_size,std::string_view * string_piece)353 static bool TruncateStringPiece(const size_t max_size,
354 std::string_view* string_piece) {
355 // Keep kMaxBytesToSniff honest.
356 DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
357
358 *string_piece = string_piece->substr(0, max_size);
359 return string_piece->length() == max_size;
360 }
361
362 // Returns true and sets result if the content appears to be HTML.
363 // Clears have_enough_content if more data could possibly change the result.
SniffForHTML(std::string_view content,bool * have_enough_content,std::string * result)364 static bool SniffForHTML(std::string_view content,
365 bool* have_enough_content,
366 std::string* result) {
367 // For HTML, we are willing to consider up to 512 bytes. This may be overly
368 // conservative as IE only considers 256.
369 *have_enough_content &= TruncateStringPiece(512, &content);
370
371 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
372 // but with some modifications to better match the HTML5 spec.
373 std::string_view trimmed =
374 base::TrimWhitespaceASCII(content, base::TRIM_LEADING);
375
376 // |trimmed| now starts at first non-whitespace character (or is empty).
377 return CheckForMagicNumbers(trimmed, kSniffableTags, result);
378 }
379
380 // Returns true and sets result if the content matches any of kMagicNumbers.
381 // Clears have_enough_content if more data could possibly change the result.
SniffForMagicNumbers(std::string_view content,bool * have_enough_content,std::string * result)382 static bool SniffForMagicNumbers(std::string_view content,
383 bool* have_enough_content,
384 std::string* result) {
385 *have_enough_content &= TruncateStringPiece(kBytesRequiredForMagic, &content);
386
387 // Check our big table of Magic Numbers
388 return CheckForMagicNumbers(content, kMagicNumbers, result);
389 }
390
391 // Returns true and sets result if the content matches any of
392 // kOfficeMagicNumbers, and the URL has the proper extension.
393 // Clears |have_enough_content| if more data could possibly change the result.
SniffForOfficeDocs(std::string_view content,const GURL & url,bool * have_enough_content,std::string * result)394 static bool SniffForOfficeDocs(std::string_view content,
395 const GURL& url,
396 bool* have_enough_content,
397 std::string* result) {
398 *have_enough_content &=
399 TruncateStringPiece(kBytesRequiredForOfficeMagic, &content);
400
401 // Check our table of magic numbers for Office file types.
402 std::string office_version;
403 if (!CheckForMagicNumbers(content, kOfficeMagicNumbers, &office_version))
404 return false;
405
406 OfficeDocType type = DOC_TYPE_NONE;
407 std::string_view url_path = url.path_piece();
408 for (const auto& office_extension : kOfficeExtensionTypes) {
409 if (base::EndsWith(url_path, office_extension.extension,
410 base::CompareCase::INSENSITIVE_ASCII)) {
411 type = office_extension.doc_type;
412 break;
413 }
414 }
415
416 if (type == DOC_TYPE_NONE)
417 return false;
418
419 if (office_version == "CFB") {
420 switch (type) {
421 case DOC_TYPE_WORD:
422 *result = "application/msword";
423 return true;
424 case DOC_TYPE_EXCEL:
425 *result = "application/vnd.ms-excel";
426 return true;
427 case DOC_TYPE_POWERPOINT:
428 *result = "application/vnd.ms-powerpoint";
429 return true;
430 case DOC_TYPE_NONE:
431 NOTREACHED();
432 return false;
433 }
434 } else if (office_version == "OOXML") {
435 switch (type) {
436 case DOC_TYPE_WORD:
437 *result = "application/vnd.openxmlformats-officedocument."
438 "wordprocessingml.document";
439 return true;
440 case DOC_TYPE_EXCEL:
441 *result = "application/vnd.openxmlformats-officedocument."
442 "spreadsheetml.sheet";
443 return true;
444 case DOC_TYPE_POWERPOINT:
445 *result = "application/vnd.openxmlformats-officedocument."
446 "presentationml.presentation";
447 return true;
448 case DOC_TYPE_NONE:
449 NOTREACHED();
450 return false;
451 }
452 }
453
454 NOTREACHED();
455 return false;
456 }
457
// Returns true if |type_hint| is exactly (case-sensitively) one of the
// Microsoft Office MIME types that get the reduced, validation-only sniffing.
static bool IsOfficeType(const std::string& type_hint) {
  static const char* const kOfficeMimeTypes[] = {
      "application/msword",
      "application/vnd.ms-excel",
      "application/vnd.ms-powerpoint",
      "application/vnd.openxmlformats-officedocument."
      "wordprocessingml.document",
      "application/vnd.openxmlformats-officedocument."
      "spreadsheetml.sheet",
      "application/vnd.openxmlformats-officedocument."
      "presentationml.presentation",
      "application/vnd.ms-excel.sheet.macroenabled.12",
      "application/vnd.ms-word.document.macroenabled.12",
      "application/vnd.ms-powerpoint.presentation."
      "macroenabled.12",
      "application/mspowerpoint",
      "application/msexcel",
      "application/vnd.ms-word",
      "application/vnd.ms-word.document.12",
      "application/vnd.msword",
  };
  for (const char* const office_type : kOfficeMimeTypes) {
    if (type_hint == office_type)
      return true;
  }
  return false;
}
478
479 // This function checks for files that have a Microsoft Office MIME type
480 // set, but are not actually Office files.
481 //
482 // If this is not actually an Office file, |*result| is set to
483 // "application/octet-stream", otherwise it is not modified.
484 //
485 // Returns false if additional data is required to determine the file type, or
486 // true if there is enough data to make a decision.
SniffForInvalidOfficeDocs(std::string_view content,const GURL & url,std::string * result)487 static bool SniffForInvalidOfficeDocs(std::string_view content,
488 const GURL& url,
489 std::string* result) {
490 if (!TruncateStringPiece(kBytesRequiredForOfficeMagic, &content))
491 return false;
492
493 // Check our table of magic numbers for Office file types. If it does not
494 // match one, the MIME type was invalid. Set it instead to a safe value.
495 std::string office_version;
496 if (!CheckForMagicNumbers(content, kOfficeMagicNumbers, &office_version)) {
497 *result = "application/octet-stream";
498 }
499
500 // We have enough information to determine if this was a Microsoft Office
501 // document or not, so sniffing is completed.
502 return true;
503 }
504
// Opening tags that identify an XML document as a feed. SniffXML() matches
// them case-insensitively against the first tag that is not an XML or DOCTYPE
// declaration.
static const MagicNumber kMagicXML[] = {
    MAGIC_STRING("application/atom+xml", "<feed"),
    MAGIC_STRING("application/rss+xml", "<rss"),
};
510
511 // Returns true and sets result if the content appears to contain XHTML or a
512 // feed.
513 // Clears have_enough_content if more data could possibly change the result.
514 //
515 // TODO(evanm): this is similar but more conservative than what Safari does,
516 // while HTML5 has a different recommendation -- what should we do?
517 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
518 // of ASCII -- do we care?
SniffXML(std::string_view content,bool * have_enough_content,std::string * result)519 static bool SniffXML(std::string_view content,
520 bool* have_enough_content,
521 std::string* result) {
522 // We allow at most 300 bytes of content before we expect the opening tag.
523 *have_enough_content &= TruncateStringPiece(300, &content);
524
525 // This loop iterates through tag-looking offsets in the file.
526 // We want to skip XML processing instructions (of the form "<?xml ...")
527 // and stop at the first "plain" tag, then make a decision on the mime-type
528 // based on the name (or possibly attributes) of that tag.
529 const int kMaxTagIterations = 5;
530 size_t pos = 0;
531 for (size_t i = 0; i < kMaxTagIterations && pos < content.length(); ++i) {
532 pos = content.find('<', pos);
533 if (pos == std::string_view::npos) {
534 return false;
535 }
536
537 std::string_view current = content.substr(pos);
538
539 // Skip XML and DOCTYPE declarations.
540 static constexpr std::string_view kXmlPrefix("<?xml");
541 static constexpr std::string_view kDocTypePrefix("<!DOCTYPE");
542 if (base::StartsWith(current, kXmlPrefix,
543 base::CompareCase::INSENSITIVE_ASCII) ||
544 base::StartsWith(current, kDocTypePrefix,
545 base::CompareCase::INSENSITIVE_ASCII)) {
546 ++pos;
547 continue;
548 }
549
550 if (CheckForMagicNumbers(current, kMagicXML, result))
551 return true;
552
553 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
554 // to identify.
555
556 // If we get here, we've hit an initial tag that hasn't matched one of the
557 // above tests. Abort.
558 return true;
559 }
560
561 // We iterated too far without finding a start tag.
562 // If we have more content to look at, we aren't going to change our mind by
563 // seeing more bytes from the network.
564 return pos < content.length();
565 }
566
// Byte order marks. SniffBinary() treats content that starts with one of these
// as text without inspecting the remaining bytes; the "text/plain" value of
// the entries themselves is discarded by that caller.
static const MagicNumber kByteOrderMark[] = {
    MAGIC_NUMBER("text/plain", "\xFE\xFF"),      // UTF-16BE
    MAGIC_NUMBER("text/plain", "\xFF\xFE"),      // UTF-16LE
    MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF"),  // UTF-8
};
573
574 // Returns true and sets result to "application/octet-stream" if the content
575 // appears to be binary data. Otherwise, returns false and sets "text/plain".
576 // Clears have_enough_content if more data could possibly change the result.
SniffBinary(std::string_view content,bool * have_enough_content,std::string * result)577 static bool SniffBinary(std::string_view content,
578 bool* have_enough_content,
579 std::string* result) {
580 // There is no consensus about exactly how to sniff for binary content.
581 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
582 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
583 // Here, we side with FF, but with a smaller buffer. This size was chosen
584 // because it is small enough to comfortably fit into a single packet (after
585 // allowing for headers) and yet large enough to account for binary formats
586 // that have a significant amount of ASCII at the beginning (crbug.com/15314).
587 const bool is_truncated = TruncateStringPiece(kMaxBytesToSniff, &content);
588
589 // First, we look for a BOM.
590 std::string unused;
591 if (CheckForMagicNumbers(content, kByteOrderMark, &unused)) {
592 // If there is BOM, we think the buffer is not binary.
593 result->assign("text/plain");
594 return false;
595 }
596
597 // Next we look to see if any of the bytes "look binary."
598 if (LooksLikeBinary(content)) {
599 result->assign("application/octet-stream");
600 return true;
601 }
602
603 // No evidence either way. Default to non-binary and, if truncated, clear
604 // have_enough_content because there could be a binary looking byte in the
605 // truncated data.
606 *have_enough_content &= is_truncated;
607 result->assign("text/plain");
608 return false;
609 }
610
// Returns true if |mime_type| carries no useful information: it is empty, is
// one of a few well-known placeholder values, or does not contain a slash.
static bool IsUnknownMimeType(std::string_view mime_type) {
  // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
  // If we do, please be careful not to alter the semantics at all.
  static constexpr std::string_view kUnknownMimeTypes[] = {
      // Empty mime types are as unknown as they get.
      "",
      // The unknown/unknown type is popular and uninformative
      "unknown/unknown",
      // The second most popular unknown mime type is application/unknown
      "application/unknown",
      // Firefox rejects a mime type if it is exactly */*
      "*/*",
  };
  for (const std::string_view placeholder : kUnknownMimeTypes) {
    if (mime_type == placeholder)
      return true;
  }

  // Firefox rejects a mime type if it does not contain a slash
  return mime_type.find('/') == std::string_view::npos;
}
634
635 // Returns true and sets result if the content appears to be a crx (Chrome
636 // extension) file.
637 // Clears have_enough_content if more data could possibly change the result.
SniffCRX(std::string_view content,const GURL & url,bool * have_enough_content,std::string * result)638 static bool SniffCRX(std::string_view content,
639 const GURL& url,
640 bool* have_enough_content,
641 std::string* result) {
642 // Technically, the crx magic number is just Cr24, but the bytes after that
643 // are a version number which changes infrequently. Including it in the
644 // sniffing gives us less room for error. If the version number ever changes,
645 // we can just add an entry to this list.
646 static const struct MagicNumber kCRXMagicNumbers[] = {
647 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00"),
648 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x03\x00\x00\x00")};
649
650 // Only consider files that have the extension ".crx".
651 if (!url.path_piece().ends_with(".crx")) {
652 return false;
653 }
654
655 *have_enough_content &= TruncateStringPiece(kBytesRequiredForMagic, &content);
656 return CheckForMagicNumbers(content, kCRXMagicNumbers, result);
657 }
658
ShouldSniffMimeType(const GURL & url,std::string_view mime_type)659 bool ShouldSniffMimeType(const GURL& url, std::string_view mime_type) {
660 bool sniffable_scheme = url.is_empty() || url.SchemeIsHTTPOrHTTPS() ||
661 #if BUILDFLAG(IS_ANDROID)
662 url.SchemeIs("content") ||
663 #endif
664 url.SchemeIsFile() || url.SchemeIsFileSystem();
665 if (!sniffable_scheme)
666 return false;
667
668 static const char* const kSniffableTypes[] = {
669 // Many web servers are misconfigured to send text/plain for many
670 // different types of content.
671 "text/plain",
672 // We want to sniff application/octet-stream for
673 // application/x-chrome-extension, but nothing else.
674 "application/octet-stream",
675 // XHTML and Atom/RSS feeds are often served as plain xml instead of
676 // their more specific mime types.
677 "text/xml",
678 "application/xml",
679 // Check for false Microsoft Office MIME types.
680 "application/msword",
681 "application/vnd.ms-excel",
682 "application/vnd.ms-powerpoint",
683 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
684 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
685 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
686 "application/vnd.ms-excel.sheet.macroenabled.12",
687 "application/vnd.ms-word.document.macroenabled.12",
688 "application/vnd.ms-powerpoint.presentation.macroenabled.12",
689 "application/mspowerpoint",
690 "application/msexcel",
691 "application/vnd.ms-word",
692 "application/vnd.ms-word.document.12",
693 "application/vnd.msword",
694 };
695 for (const char* const sniffable_type : kSniffableTypes) {
696 if (mime_type == sniffable_type)
697 return true;
698 }
699 if (IsUnknownMimeType(mime_type)) {
700 // The web server didn't specify a content type or specified a mime
701 // type that we ignore.
702 return true;
703 }
704 return false;
705 }
706
// Sniffs |content| and writes the best guess for its MIME type to |*result|,
// which starts out as |type_hint| and is only overwritten by a sniffing step
// that has a better guess.
//
// |url| is used for scheme and extension checks (file URLs, ".crx", Office
// extensions). |force_sniff_file_url_for_html| controls whether content from
// file URLs may be sniffed as HTML.
//
// Returns true if enough content was available to make a final decision, and
// false if more content could change the result. The steps below run in a
// fixed order, and several of them return early.
bool SniffMimeType(std::string_view content,
                   const GURL& url,
                   const std::string& type_hint,
                   ForceSniffFileUrlsForHtml force_sniff_file_url_for_html,
                   std::string* result) {
  // Sanity check.
  DCHECK_LT(content.length(), 1000000U);
  DCHECK(result);

  // By default, we assume we have enough content.
  // Each sniff routine may unset this if it wasn't provided enough content.
  bool have_enough_content = true;

  // By default, we'll return the type hint.
  // Each sniff routine may modify this if it has a better guess.
  result->assign(type_hint);

  // If the file has a Microsoft Office MIME type, we should only check that it
  // is a valid Office file. Because this is the only reason we sniff files
  // with a Microsoft Office MIME type, we can return early.
  if (IsOfficeType(type_hint))
    return SniffForInvalidOfficeDocs(content, url, result);

  // Cache information about the type_hint
  bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);

  // First check for HTML, unless it's a file URL and
  // |force_sniff_file_url_for_html| is not kEnabled.
  if (hint_is_unknown_mime_type &&
      (!url.SchemeIsFile() ||
       force_sniff_file_url_for_html == ForceSniffFileUrlsForHtml::kEnabled)) {
    // We're only willing to sniff HTML if the server has not supplied a mime
    // type, or if the type it did supply indicates that it doesn't know what
    // the type should be.
    if (SniffForHTML(content, &have_enough_content, result))
      return true;  // We succeeded in sniffing HTML. No more content needed.
  }

  // We're only willing to sniff for binary in 3 cases:
  // 1. The server has not supplied a mime type.
  // 2. The type it did supply indicates that it doesn't know what the type
  //    should be.
  // 3. The type is "text/plain" which is the default on some web servers and
  //    could be indicative of a mis-configuration that we shield the user from.
  const bool hint_is_text_plain = (type_hint == "text/plain");
  if (hint_is_unknown_mime_type || hint_is_text_plain) {
    if (!SniffBinary(content, &have_enough_content, result)) {
      // If the server said the content was text/plain and it doesn't appear
      // to be binary, then we trust it.
      if (hint_is_text_plain) {
        return have_enough_content;
      }
    }
  }

  // If we have plain XML, sniff XML subtypes.
  if (type_hint == "text/xml" || type_hint == "application/xml") {
    // We're not interested in sniffing these types for images and the like.
    // Instead, we're looking explicitly for a feed. If we don't find one
    // we're done and return early.
    if (SniffXML(content, &have_enough_content, result))
      return true;
    return have_enough_content;
  }

  // CRX files (Chrome extensions) have a special sniffing algorithm. It is
  // tighter than the others because we don't have to match legacy behavior.
  if (SniffCRX(content, url, &have_enough_content, result))
    return true;

  // Check the file extension and magic numbers to see if this is an Office
  // document. This needs to be checked before the general magic numbers
  // because zip files and Office documents (OOXML) have the same magic number.
  if (SniffForOfficeDocs(content, url, &have_enough_content, result)) {
    return true;  // We've matched a magic number. No more content needed.
  }

  // We're not interested in sniffing for magic numbers when the type_hint
  // is application/octet-stream. Time to bail out.
  if (type_hint == "application/octet-stream")
    return have_enough_content;

  // Now we look in our large table of magic numbers to see if we can find
  // anything that matches the content.
  if (SniffForMagicNumbers(content, &have_enough_content, result))
    return true;  // We've matched a magic number. No more content needed.

  return have_enough_content;
}
796
SniffMimeTypeFromLocalData(std::string_view content,std::string * result)797 bool SniffMimeTypeFromLocalData(std::string_view content, std::string* result) {
798 // First check the extra table.
799 if (CheckForMagicNumbers(content, kExtraMagicNumbers, result))
800 return true;
801 // Finally check the original table.
802 return CheckForMagicNumbers(content, kMagicNumbers, result);
803 }
804
// Returns true if |content| contains at least one "binary data byte" as
// defined by https://mimesniff.spec.whatwg.org/#binary-data-byte. These are
// the bytes below 0x20 other than tab, line feed, form feed, carriage return
// and escape. Empty content is not binary.
bool LooksLikeBinary(std::string_view content) {
  // Bitmap of the control bytes that are still considered text: bit N is set
  // if byte value N (0x00..0x1F) is allowed.
  constexpr uint32_t kTextControlBits = (1u << '\t') | (1u << '\n') |
                                        (1u << '\r') | (1u << '\f') |
                                        (1u << '\x1b');
  for (const char c : content) {
    const uint8_t value = static_cast<uint8_t>(c);
    if (value >= 0x20)
      continue;  // Not a control byte, so never binary.
    if ((kTextControlBits >> value) & 1u)
      continue;  // One of the allowed control bytes.
    return true;
  }
  return false;
}
822
823 } // namespace net
824