xref: /aosp_15_r20/external/cronet/net/base/mime_sniffer.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Detecting mime types is a tricky business because we need to balance
6 // compatibility concerns with security issues.  Here is a survey of how other
7 // browsers behave and then a description of how we intend to behave.
8 //
9 // HTML payload, no Content-Type header:
10 // * IE 7: Render as HTML
11 // * Firefox 2: Render as HTML
12 // * Safari 3: Render as HTML
13 // * Opera 9: Render as HTML
14 //
15 // Here the choice seems clear:
16 // => Chrome: Render as HTML
17 //
18 // HTML payload, Content-Type: "text/plain":
19 // * IE 7: Render as HTML
20 // * Firefox 2: Render as text
21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
22 //                                   has an HTML extension)
23 // * Opera 9: Render as text
24 //
25 // Here we choose to follow the majority (and break some compatibility with IE).
26 // Many folks dislike IE's behavior here.
27 // => Chrome: Render as text
28 // We generalize this as follows.  If the Content-Type header is text/plain
29 // we won't detect dangerous mime types (those that can execute script).
30 //
31 // HTML payload, Content-Type: "application/octet-stream":
32 // * IE 7: Render as HTML
33 // * Firefox 2: Download as application/octet-stream
34 // * Safari 3: Render as HTML
35 // * Opera 9: Render as HTML
36 //
37 // We follow Firefox.
38 // => Chrome: Download as application/octet-stream
39 // One factor in this decision is that IIS 4 and 5 will send
40 // application/octet-stream for .xhtml files (because they don't recognize
41 // the extension).  We did some experiments and it looks like this doesn't occur
42 // very often on the web.  We choose the more secure option.
43 //
44 // GIF payload, no Content-Type header:
45 // * IE 7: Render as GIF
46 // * Firefox 2: Render as GIF
47 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
48 //                                        URL has a GIF extension)
49 // * Opera 9: Render as GIF
50 //
51 // The choice is clear.
52 // => Chrome: Render as GIF
53 // Once we decide to render HTML without a Content-Type header, there isn't much
54 // reason not to render GIFs.
55 //
56 // GIF payload, Content-Type: "text/plain":
57 // * IE 7: Render as GIF
58 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
59 //                              Download as GIF if the URL has a GIF extension)
60 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
61 //                                        URL has a GIF extension)
62 // * Opera 9: Render as GIF
63 //
64 // Displaying as text/plain makes little sense as the content will look like
65 // gibberish.  Here, we could change our minds and download.
66 // => Chrome: Render as GIF
67 //
68 // GIF payload, Content-Type: "application/octet-stream":
69 // * IE 7: Render as GIF
70 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
71 //                              Download as GIF if the URL has a GIF extension)
72 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
73 //                                        URL has a GIF extension)
74 // * Opera 9: Render as GIF
75 //
76 // We used to render as GIF here, but the problem is that some sites want to
77 // trigger downloads by sending application/octet-stream (even though they
78 // should be sending Content-Disposition: attachment).  Although it is safe
79 // to render as GIF from a security perspective, we actually get better
80 // compatibility if we don't sniff from application/octet-stream at all.
81 // => Chrome: Download as application/octet-stream
82 //
83 // Note that our definition of HTML payload is much stricter than IE's
84 // definition and roughly the same as Firefox's definition.
85 
86 #include <stdint.h>
87 #include <string>
88 
89 #include "net/base/mime_sniffer.h"
90 
91 #include "base/check_op.h"
92 #include "base/containers/span.h"
93 #include "base/notreached.h"
94 #include "base/strings/string_util.h"
95 #include "build/build_config.h"
96 #include "url/gurl.h"
97 
98 namespace net {
99 
// Upper bound on the length of any pattern in the magic-number tables.  The
// magic-number sniffers truncate their input to this many bytes before
// matching, and MatchMagicNumber() DCHECKs every pattern against it.  Feel
// free to raise it if you add a longer magic number.
static constexpr size_t kBytesRequiredForMagic = 42;
103 
// One row of a magic-number table: a byte pattern together with the mime type
// that is reported when content begins with that pattern.  Rows are built with
// the MAGIC_NUMBER / MAGIC_MASK / MAGIC_STRING macros, which rely on the field
// order below.
struct MagicNumber {
  // Mime type written to the sniffing result when this row matches.
  const char* const mime_type;

  // Pattern compared against the start of the content.
  const std::string_view magic;

  // True when |magic| is text that must be matched ASCII case-insensitively.
  bool is_string;

  // Optional per-byte mask that is ANDed with the content before comparing.
  // If set, must have same length as |magic|.
  const char* const mask;
};
110 
// Builds a MagicNumber row for a binary pattern with no mask.  |magic| must be
// a string literal: the pattern length is taken from sizeof() (minus the
// terminating NUL), so patterns may contain embedded '\0' bytes.
#define MAGIC_NUMBER(mime_type, magic)                         \
  {                                                            \
    (mime_type), std::string_view((magic), sizeof(magic) - 1), \
        false, nullptr                                         \
  }
113 
// Compile-time guard used by MAGIC_MASK.  Instantiation fails unless both
// sizes are equal; SIZES then exposes that common size.
template <int MagicSize, int MaskSize>
struct VerifySizes {
  static_assert(MagicSize == MaskSize, "sizes must be equal");

  enum { SIZES = MagicSize };
};

// Evaluates to sizeof(magic) after checking at compile time that sizeof(mask)
// is the same.
#define verified_sizeof(magic, mask) \
  VerifySizes<sizeof(magic), sizeof(mask)>::SIZES

// Builds a MagicNumber row for a binary pattern that is compared after ANDing
// the content with |mask|.  Both |magic| and |mask| must be string literals of
// identical length.
#define MAGIC_MASK(mime_type, magic, mask)                          \
  {                                                                 \
    (mime_type),                                                    \
        std::string_view((magic), verified_sizeof(magic, mask) - 1), \
        false, (mask)                                               \
  }
130 
// Builds a MagicNumber row for a text pattern.  Magic strings are matched
// case-insensitively and must not include '\0' characters.
#define MAGIC_STRING(mime_type, magic)                         \
  {                                                            \
    (mime_type), std::string_view((magic), sizeof(magic) - 1), \
        true, nullptr                                          \
  }
134 
135 static const MagicNumber kMagicNumbers[] = {
136   // Source: HTML 5 specification
137   MAGIC_NUMBER("application/pdf", "%PDF-"),
138   MAGIC_NUMBER("application/postscript", "%!PS-Adobe-"),
139   MAGIC_NUMBER("image/gif", "GIF87a"),
140   MAGIC_NUMBER("image/gif", "GIF89a"),
141   MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A"),
142   MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF"),
143   MAGIC_NUMBER("image/bmp", "BM"),
144   // Source: Mozilla
145   MAGIC_NUMBER("text/plain", "#!"),  // Script
146   MAGIC_NUMBER("text/plain", "%!"),  // Script, similar to PS
147   MAGIC_NUMBER("text/plain", "From"),
148   MAGIC_NUMBER("text/plain", ">From"),
149   // Chrome specific
150   MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08"),
151   MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46"),
152   MAGIC_NUMBER("video/x-ms-asf",
153       "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C"),
154   MAGIC_NUMBER("image/tiff", "I I"),
155   MAGIC_NUMBER("image/tiff", "II*"),
156   MAGIC_NUMBER("image/tiff", "MM\x00*"),
157   MAGIC_NUMBER("audio/mpeg", "ID3"),
158   MAGIC_NUMBER("image/webp", "RIFF....WEBPVP"),
159   MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3"),
160   MAGIC_NUMBER("application/zip", "PK\x03\x04"),
161   MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00"),
162   MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A"),
163   MAGIC_NUMBER("application/octet-stream", "MZ"),  // EXE
164   // Sniffing for Flash:
165   //
166   //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS"),
167   //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV"),
168   //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS"),
169   //
170   // Including these magic number for Flash is a trade off.
171   //
172   // Pros:
173   //   * Flash is an important and popular file format
174   //
175   // Cons:
176   //   * These patterns are fairly weak
177   //   * If we mistakenly decide something is Flash, we will execute it
178   //     in the origin of an unsuspecting site.  This could be a security
179   //     vulnerability if the site allows users to upload content.
180   //
181   // On balance, we do not include these patterns.
182 };
183 
// Number of leading content bytes needed to evaluate every Microsoft Office
// signature in kOfficeMagicNumbers.
static constexpr size_t kBytesRequiredForOfficeMagic = 8;
187 
188 static const MagicNumber kOfficeMagicNumbers[] = {
189   MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"),
190   MAGIC_NUMBER("OOXML", "PK\x03\x04"),
191 };
192 
// Kind of Office document implied by a URL's file extension.
enum OfficeDocType {
  DOC_TYPE_WORD = 0,
  DOC_TYPE_EXCEL = 1,
  DOC_TYPE_POWERPOINT = 2,
  // The extension is not a recognised Office extension.
  DOC_TYPE_NONE = 3
};
199 
200 struct OfficeExtensionType {
201   OfficeDocType doc_type;
202   const std::string_view extension;
203 };
204 
205 #define OFFICE_EXTENSION(type, extension) \
206   { (type), std::string_view((extension), sizeof(extension) - 1) }
207 
208 static const OfficeExtensionType kOfficeExtensionTypes[] = {
209   OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc"),
210   OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls"),
211   OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt"),
212   OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx"),
213   OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx"),
214   OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx"),
215 };
216 
217 static const MagicNumber kExtraMagicNumbers[] = {
218   MAGIC_NUMBER("image/x-xbitmap", "#define"),
219   MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00"),
220   MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt "),
221   MAGIC_NUMBER("video/avi", "RIFF....AVI LIST"),
222   MAGIC_NUMBER("audio/ogg", "OggS\0"),
223   MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0"),
224   MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0"),
225   MAGIC_NUMBER("video/3gpp", "....ftyp3g"),
226   MAGIC_NUMBER("video/3gpp", "....ftypavcl"),
227   MAGIC_NUMBER("video/mp4", "....ftyp"),
228   MAGIC_NUMBER("video/quicktime", "....moov"),
229   MAGIC_NUMBER("application/x-shockwave-flash", "CWS"),
230   MAGIC_NUMBER("application/x-shockwave-flash", "FWS"),
231   MAGIC_NUMBER("video/x-flv", "FLV"),
232   MAGIC_NUMBER("audio/x-flac", "fLaC"),
233   // Per https://tools.ietf.org/html/rfc3267#section-8.1
234   MAGIC_NUMBER("audio/amr", "#!AMR\n"),
235 
236   // RAW image types.
237   MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR"),
238   MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR"),
239   MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM"),
240   MAGIC_NUMBER("image/x-olympus-orf", "MMOR"),  // big-endian
241   MAGIC_NUMBER("image/x-olympus-orf", "IIRO"),  // little-endian
242   MAGIC_NUMBER("image/x-olympus-orf", "IIRS"),  // little-endian
243   MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW "),
244   MAGIC_NUMBER("image/x-panasonic-raw",
245                "IIU\x00\x08\x00\x00\x00"),  // Panasonic .raw
246   MAGIC_NUMBER("image/x-panasonic-raw",
247                "IIU\x00\x18\x00\x00\x00"),  // Panasonic .rw2
248   MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw"),
249   MAGIC_NUMBER("image/x-x3f", "FOVb"),
250 };
251 
// Our HTML sniffer differs slightly from Mozilla.  For example, Mozilla will
// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
// HTML, but we will not.

// Builds a case-insensitive "text/html" row for the opening of |tag|, i.e. the
// literal "<" immediately followed by |tag| (which must be a string literal).
#define MAGIC_HTML_TAG(tag) MAGIC_STRING("text/html", "<" tag)
258 
259 static const MagicNumber kSniffableTags[] = {
260   // XML processing directive.  Although this is not an HTML mime type, we sniff
261   // for this in the HTML phase because text/xml is just as powerful as HTML and
262   // we want to leverage our white space skipping technology.
263   MAGIC_NUMBER("text/xml", "<?xml"),  // Mozilla
264   // DOCTYPEs
265   MAGIC_HTML_TAG("!DOCTYPE html"),  // HTML5 spec
266   // Sniffable tags, ordered by how often they occur in sniffable documents.
267   MAGIC_HTML_TAG("script"),  // HTML5 spec, Mozilla
268   MAGIC_HTML_TAG("html"),  // HTML5 spec, Mozilla
269   MAGIC_HTML_TAG("!--"),
270   MAGIC_HTML_TAG("head"),  // HTML5 spec, Mozilla
271   MAGIC_HTML_TAG("iframe"),  // Mozilla
272   MAGIC_HTML_TAG("h1"),  // Mozilla
273   MAGIC_HTML_TAG("div"),  // Mozilla
274   MAGIC_HTML_TAG("font"),  // Mozilla
275   MAGIC_HTML_TAG("table"),  // Mozilla
276   MAGIC_HTML_TAG("a"),  // Mozilla
277   MAGIC_HTML_TAG("style"),  // Mozilla
278   MAGIC_HTML_TAG("title"),  // Mozilla
279   MAGIC_HTML_TAG("b"),  // Mozilla
280   MAGIC_HTML_TAG("body"),  // Mozilla
281   MAGIC_HTML_TAG("br"),
282   MAGIC_HTML_TAG("p"),  // Mozilla
283 };
284 
285 // Compare content header to a magic number where magic_entry can contain '.'
286 // for single character of anything, allowing some bytes to be skipped.
MagicCmp(std::string_view content,std::string_view magic_entry)287 static bool MagicCmp(std::string_view content, std::string_view magic_entry) {
288   DCHECK_GE(content.length(), magic_entry.length());
289 
290   for (size_t i = 0; i < magic_entry.length(); ++i) {
291     if (magic_entry[i] != '.' && magic_entry[i] != content[i])
292       return false;
293   }
294   return true;
295 }
296 
297 // Like MagicCmp() except that it ANDs each byte with a mask before
298 // the comparison, because there are some bits we don't care about.
MagicMaskCmp(std::string_view content,std::string_view magic_entry,std::string_view magic_mask)299 static bool MagicMaskCmp(std::string_view content,
300                          std::string_view magic_entry,
301                          std::string_view magic_mask) {
302   DCHECK_GE(content.length(), magic_entry.length());
303 
304   for (size_t i = 0; i < magic_entry.length(); ++i) {
305     if (magic_entry[i] != '.' && magic_entry[i] != (magic_mask[i] & content[i]))
306       return false;
307   }
308   return true;
309 }
310 
MatchMagicNumber(std::string_view content,const MagicNumber & magic_entry,std::string * result)311 static bool MatchMagicNumber(std::string_view content,
312                              const MagicNumber& magic_entry,
313                              std::string* result) {
314   // Keep kBytesRequiredForMagic honest.
315   DCHECK_LE(magic_entry.magic.length(), kBytesRequiredForMagic);
316 
317   bool match = false;
318   if (content.length() >= magic_entry.magic.length()) {
319     if (magic_entry.is_string) {
320       // Consistency check - string entries should have no embedded nulls.
321       DCHECK_EQ(std::string_view::npos, magic_entry.magic.find('\0'));
322 
323       // Do a case-insensitive prefix comparison.
324       match = base::StartsWith(content, magic_entry.magic,
325                                base::CompareCase::INSENSITIVE_ASCII);
326     } else if (!magic_entry.mask) {
327       match = MagicCmp(content, magic_entry.magic);
328     } else {
329       std::string_view magic_mask(magic_entry.mask, magic_entry.magic.length());
330       match = MagicMaskCmp(content, magic_entry.magic, magic_mask);
331     }
332   }
333 
334   if (match) {
335     result->assign(magic_entry.mime_type);
336     return true;
337   }
338   return false;
339 }
340 
CheckForMagicNumbers(std::string_view content,base::span<const MagicNumber> magic_numbers,std::string * result)341 static bool CheckForMagicNumbers(std::string_view content,
342                                  base::span<const MagicNumber> magic_numbers,
343                                  std::string* result) {
344   for (const MagicNumber& magic : magic_numbers) {
345     if (MatchMagicNumber(content, magic, result))
346       return true;
347   }
348   return false;
349 }
350 
351 // Truncates |string_piece| to length |max_size| and returns true if
352 // |string_piece| is now exactly |max_size|.
TruncateStringPiece(const size_t max_size,std::string_view * string_piece)353 static bool TruncateStringPiece(const size_t max_size,
354                                 std::string_view* string_piece) {
355   // Keep kMaxBytesToSniff honest.
356   DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
357 
358   *string_piece = string_piece->substr(0, max_size);
359   return string_piece->length() == max_size;
360 }
361 
362 // Returns true and sets result if the content appears to be HTML.
363 // Clears have_enough_content if more data could possibly change the result.
SniffForHTML(std::string_view content,bool * have_enough_content,std::string * result)364 static bool SniffForHTML(std::string_view content,
365                          bool* have_enough_content,
366                          std::string* result) {
367   // For HTML, we are willing to consider up to 512 bytes. This may be overly
368   // conservative as IE only considers 256.
369   *have_enough_content &= TruncateStringPiece(512, &content);
370 
371   // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
372   // but with some modifications to better match the HTML5 spec.
373   std::string_view trimmed =
374       base::TrimWhitespaceASCII(content, base::TRIM_LEADING);
375 
376   // |trimmed| now starts at first non-whitespace character (or is empty).
377   return CheckForMagicNumbers(trimmed, kSniffableTags, result);
378 }
379 
380 // Returns true and sets result if the content matches any of kMagicNumbers.
381 // Clears have_enough_content if more data could possibly change the result.
SniffForMagicNumbers(std::string_view content,bool * have_enough_content,std::string * result)382 static bool SniffForMagicNumbers(std::string_view content,
383                                  bool* have_enough_content,
384                                  std::string* result) {
385   *have_enough_content &= TruncateStringPiece(kBytesRequiredForMagic, &content);
386 
387   // Check our big table of Magic Numbers
388   return CheckForMagicNumbers(content, kMagicNumbers, result);
389 }
390 
391 // Returns true and sets result if the content matches any of
392 // kOfficeMagicNumbers, and the URL has the proper extension.
393 // Clears |have_enough_content| if more data could possibly change the result.
SniffForOfficeDocs(std::string_view content,const GURL & url,bool * have_enough_content,std::string * result)394 static bool SniffForOfficeDocs(std::string_view content,
395                                const GURL& url,
396                                bool* have_enough_content,
397                                std::string* result) {
398   *have_enough_content &=
399       TruncateStringPiece(kBytesRequiredForOfficeMagic, &content);
400 
401   // Check our table of magic numbers for Office file types.
402   std::string office_version;
403   if (!CheckForMagicNumbers(content, kOfficeMagicNumbers, &office_version))
404     return false;
405 
406   OfficeDocType type = DOC_TYPE_NONE;
407   std::string_view url_path = url.path_piece();
408   for (const auto& office_extension : kOfficeExtensionTypes) {
409     if (base::EndsWith(url_path, office_extension.extension,
410                        base::CompareCase::INSENSITIVE_ASCII)) {
411       type = office_extension.doc_type;
412       break;
413     }
414   }
415 
416   if (type == DOC_TYPE_NONE)
417     return false;
418 
419   if (office_version == "CFB") {
420     switch (type) {
421       case DOC_TYPE_WORD:
422         *result = "application/msword";
423         return true;
424       case DOC_TYPE_EXCEL:
425         *result = "application/vnd.ms-excel";
426         return true;
427       case DOC_TYPE_POWERPOINT:
428         *result = "application/vnd.ms-powerpoint";
429         return true;
430       case DOC_TYPE_NONE:
431         NOTREACHED();
432         return false;
433     }
434   } else if (office_version == "OOXML") {
435     switch (type) {
436       case DOC_TYPE_WORD:
437         *result = "application/vnd.openxmlformats-officedocument."
438                   "wordprocessingml.document";
439         return true;
440       case DOC_TYPE_EXCEL:
441         *result = "application/vnd.openxmlformats-officedocument."
442                   "spreadsheetml.sheet";
443         return true;
444       case DOC_TYPE_POWERPOINT:
445         *result = "application/vnd.openxmlformats-officedocument."
446                   "presentationml.presentation";
447         return true;
448       case DOC_TYPE_NONE:
449         NOTREACHED();
450         return false;
451     }
452   }
453 
454   NOTREACHED();
455   return false;
456 }
457 
// Returns true if |type_hint| is exactly (case-sensitively) one of the
// Microsoft Office mime types whose content we verify before trusting.
static bool IsOfficeType(const std::string& type_hint) {
  static const char* const kOfficeMimeTypes[] = {
      "application/msword",
      "application/vnd.ms-excel",
      "application/vnd.ms-powerpoint",
      "application/vnd.openxmlformats-officedocument."
      "wordprocessingml.document",
      "application/vnd.openxmlformats-officedocument."
      "spreadsheetml.sheet",
      "application/vnd.openxmlformats-officedocument."
      "presentationml.presentation",
      "application/vnd.ms-excel.sheet.macroenabled.12",
      "application/vnd.ms-word.document.macroenabled.12",
      "application/vnd.ms-powerpoint.presentation."
      "macroenabled.12",
      "application/mspowerpoint",
      "application/msexcel",
      "application/vnd.ms-word",
      "application/vnd.ms-word.document.12",
      "application/vnd.msword",
  };
  for (const char* const office_mime_type : kOfficeMimeTypes) {
    if (type_hint == office_mime_type)
      return true;
  }
  return false;
}
478 
479 // This function checks for files that have a Microsoft Office MIME type
480 // set, but are not actually Office files.
481 //
482 // If this is not actually an Office file, |*result| is set to
483 // "application/octet-stream", otherwise it is not modified.
484 //
485 // Returns false if additional data is required to determine the file type, or
486 // true if there is enough data to make a decision.
SniffForInvalidOfficeDocs(std::string_view content,const GURL & url,std::string * result)487 static bool SniffForInvalidOfficeDocs(std::string_view content,
488                                       const GURL& url,
489                                       std::string* result) {
490   if (!TruncateStringPiece(kBytesRequiredForOfficeMagic, &content))
491     return false;
492 
493   // Check our table of magic numbers for Office file types.  If it does not
494   // match one, the MIME type was invalid.  Set it instead to a safe value.
495   std::string office_version;
496   if (!CheckForMagicNumbers(content, kOfficeMagicNumbers, &office_version)) {
497     *result = "application/octet-stream";
498   }
499 
500   // We have enough information to determine if this was a Microsoft Office
501   // document or not, so sniffing is completed.
502   return true;
503 }
504 
505 // Tags that indicate the content is likely XML.
506 static const MagicNumber kMagicXML[] = {
507     MAGIC_STRING("application/atom+xml", "<feed"),
508     MAGIC_STRING("application/rss+xml", "<rss"),
509 };
510 
511 // Returns true and sets result if the content appears to contain XHTML or a
512 // feed.
513 // Clears have_enough_content if more data could possibly change the result.
514 //
515 // TODO(evanm): this is similar but more conservative than what Safari does,
516 // while HTML5 has a different recommendation -- what should we do?
517 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
518 // of ASCII -- do we care?
SniffXML(std::string_view content,bool * have_enough_content,std::string * result)519 static bool SniffXML(std::string_view content,
520                      bool* have_enough_content,
521                      std::string* result) {
522   // We allow at most 300 bytes of content before we expect the opening tag.
523   *have_enough_content &= TruncateStringPiece(300, &content);
524 
525   // This loop iterates through tag-looking offsets in the file.
526   // We want to skip XML processing instructions (of the form "<?xml ...")
527   // and stop at the first "plain" tag, then make a decision on the mime-type
528   // based on the name (or possibly attributes) of that tag.
529   const int kMaxTagIterations = 5;
530   size_t pos = 0;
531   for (size_t i = 0; i < kMaxTagIterations && pos < content.length(); ++i) {
532     pos = content.find('<', pos);
533     if (pos == std::string_view::npos) {
534       return false;
535     }
536 
537     std::string_view current = content.substr(pos);
538 
539     // Skip XML and DOCTYPE declarations.
540     static constexpr std::string_view kXmlPrefix("<?xml");
541     static constexpr std::string_view kDocTypePrefix("<!DOCTYPE");
542     if (base::StartsWith(current, kXmlPrefix,
543                          base::CompareCase::INSENSITIVE_ASCII) ||
544         base::StartsWith(current, kDocTypePrefix,
545                          base::CompareCase::INSENSITIVE_ASCII)) {
546       ++pos;
547       continue;
548     }
549 
550     if (CheckForMagicNumbers(current, kMagicXML, result))
551       return true;
552 
553     // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
554     // to identify.
555 
556     // If we get here, we've hit an initial tag that hasn't matched one of the
557     // above tests.  Abort.
558     return true;
559   }
560 
561   // We iterated too far without finding a start tag.
562   // If we have more content to look at, we aren't going to change our mind by
563   // seeing more bytes from the network.
564   return pos < content.length();
565 }
566 
567 // Byte order marks
568 static const MagicNumber kByteOrderMark[] = {
569   MAGIC_NUMBER("text/plain", "\xFE\xFF"),  // UTF-16BE
570   MAGIC_NUMBER("text/plain", "\xFF\xFE"),  // UTF-16LE
571   MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF"),  // UTF-8
572 };
573 
574 // Returns true and sets result to "application/octet-stream" if the content
575 // appears to be binary data. Otherwise, returns false and sets "text/plain".
576 // Clears have_enough_content if more data could possibly change the result.
SniffBinary(std::string_view content,bool * have_enough_content,std::string * result)577 static bool SniffBinary(std::string_view content,
578                         bool* have_enough_content,
579                         std::string* result) {
580   // There is no consensus about exactly how to sniff for binary content.
581   // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
582   // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
583   // Here, we side with FF, but with a smaller buffer. This size was chosen
584   // because it is small enough to comfortably fit into a single packet (after
585   // allowing for headers) and yet large enough to account for binary formats
586   // that have a significant amount of ASCII at the beginning (crbug.com/15314).
587   const bool is_truncated = TruncateStringPiece(kMaxBytesToSniff, &content);
588 
589   // First, we look for a BOM.
590   std::string unused;
591   if (CheckForMagicNumbers(content, kByteOrderMark, &unused)) {
592     // If there is BOM, we think the buffer is not binary.
593     result->assign("text/plain");
594     return false;
595   }
596 
597   // Next we look to see if any of the bytes "look binary."
598   if (LooksLikeBinary(content)) {
599     result->assign("application/octet-stream");
600     return true;
601   }
602 
603   // No evidence either way. Default to non-binary and, if truncated, clear
604   // have_enough_content because there could be a binary looking byte in the
605   // truncated data.
606   *have_enough_content &= is_truncated;
607   result->assign("text/plain");
608   return false;
609 }
610 
// Returns true if |mime_type| carries no usable information: it is empty, is
// one of a few well-known placeholder values, or does not contain a slash.
static bool IsUnknownMimeType(std::string_view mime_type) {
  // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
  // If we do, please be careful not to alter the semantics at all.

  // Firefox rejects a mime type if it does not contain a slash.  This also
  // covers the empty mime type, which is as unknown as they get.
  const bool has_slash = mime_type.find('/') != std::string_view::npos;
  if (!has_slash)
    return true;

  return
      // The unknown/unknown type is popular and uninformative
      mime_type == "unknown/unknown" ||
      // The second most popular unknown mime type is application/unknown
      mime_type == "application/unknown" ||
      // Firefox rejects a mime type if it is exactly */*
      mime_type == "*/*";
}
634 
635 // Returns true and sets result if the content appears to be a crx (Chrome
636 // extension) file.
637 // Clears have_enough_content if more data could possibly change the result.
SniffCRX(std::string_view content,const GURL & url,bool * have_enough_content,std::string * result)638 static bool SniffCRX(std::string_view content,
639                      const GURL& url,
640                      bool* have_enough_content,
641                      std::string* result) {
642   // Technically, the crx magic number is just Cr24, but the bytes after that
643   // are a version number which changes infrequently. Including it in the
644   // sniffing gives us less room for error. If the version number ever changes,
645   // we can just add an entry to this list.
646   static const struct MagicNumber kCRXMagicNumbers[] = {
647       MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00"),
648       MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x03\x00\x00\x00")};
649 
650   // Only consider files that have the extension ".crx".
651   if (!url.path_piece().ends_with(".crx")) {
652     return false;
653   }
654 
655   *have_enough_content &= TruncateStringPiece(kBytesRequiredForMagic, &content);
656   return CheckForMagicNumbers(content, kCRXMagicNumbers, result);
657 }
658 
ShouldSniffMimeType(const GURL & url,std::string_view mime_type)659 bool ShouldSniffMimeType(const GURL& url, std::string_view mime_type) {
660   bool sniffable_scheme = url.is_empty() || url.SchemeIsHTTPOrHTTPS() ||
661 #if BUILDFLAG(IS_ANDROID)
662                           url.SchemeIs("content") ||
663 #endif
664                           url.SchemeIsFile() || url.SchemeIsFileSystem();
665   if (!sniffable_scheme)
666     return false;
667 
668   static const char* const kSniffableTypes[] = {
669     // Many web servers are misconfigured to send text/plain for many
670     // different types of content.
671     "text/plain",
672     // We want to sniff application/octet-stream for
673     // application/x-chrome-extension, but nothing else.
674     "application/octet-stream",
675     // XHTML and Atom/RSS feeds are often served as plain xml instead of
676     // their more specific mime types.
677     "text/xml",
678     "application/xml",
679     // Check for false Microsoft Office MIME types.
680     "application/msword",
681     "application/vnd.ms-excel",
682     "application/vnd.ms-powerpoint",
683     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
684     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
685     "application/vnd.openxmlformats-officedocument.presentationml.presentation",
686     "application/vnd.ms-excel.sheet.macroenabled.12",
687     "application/vnd.ms-word.document.macroenabled.12",
688     "application/vnd.ms-powerpoint.presentation.macroenabled.12",
689     "application/mspowerpoint",
690     "application/msexcel",
691     "application/vnd.ms-word",
692     "application/vnd.ms-word.document.12",
693     "application/vnd.msword",
694   };
695   for (const char* const sniffable_type : kSniffableTypes) {
696     if (mime_type == sniffable_type)
697       return true;
698   }
699   if (IsUnknownMimeType(mime_type)) {
700     // The web server didn't specify a content type or specified a mime
701     // type that we ignore.
702     return true;
703   }
704   return false;
705 }
706 
SniffMimeType(std::string_view content,const GURL & url,const std::string & type_hint,ForceSniffFileUrlsForHtml force_sniff_file_url_for_html,std::string * result)707 bool SniffMimeType(std::string_view content,
708                    const GURL& url,
709                    const std::string& type_hint,
710                    ForceSniffFileUrlsForHtml force_sniff_file_url_for_html,
711                    std::string* result) {
712   // Sanity check.
713   DCHECK_LT(content.length(), 1000000U);
714   DCHECK(result);
715 
716   // By default, we assume we have enough content.
717   // Each sniff routine may unset this if it wasn't provided enough content.
718   bool have_enough_content = true;
719 
720   // By default, we'll return the type hint.
721   // Each sniff routine may modify this if it has a better guess..
722   result->assign(type_hint);
723 
724   // If the file has a Microsoft Office MIME type, we should only check that it
725   // is a valid Office file.  Because this is the only reason we sniff files
726   // with a Microsoft Office MIME type, we can return early.
727   if (IsOfficeType(type_hint))
728     return SniffForInvalidOfficeDocs(content, url, result);
729 
730   // Cache information about the type_hint
731   bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
732 
733   // First check for HTML, unless it's a file URL and
734   // |allow_sniffing_files_urls_as_html| is false.
735   if (hint_is_unknown_mime_type &&
736       (!url.SchemeIsFile() ||
737        force_sniff_file_url_for_html == ForceSniffFileUrlsForHtml::kEnabled)) {
738     // We're only willing to sniff HTML if the server has not supplied a mime
739     // type, or if the type it did supply indicates that it doesn't know what
740     // the type should be.
741     if (SniffForHTML(content, &have_enough_content, result))
742       return true;  // We succeeded in sniffing HTML.  No more content needed.
743   }
744 
745   // We're only willing to sniff for binary in 3 cases:
746   // 1. The server has not supplied a mime type.
747   // 2. The type it did supply indicates that it doesn't know what the type
748   //    should be.
749   // 3. The type is "text/plain" which is the default on some web servers and
750   //    could be indicative of a mis-configuration that we shield the user from.
751   const bool hint_is_text_plain = (type_hint == "text/plain");
752   if (hint_is_unknown_mime_type || hint_is_text_plain) {
753     if (!SniffBinary(content, &have_enough_content, result)) {
754       // If the server said the content was text/plain and it doesn't appear
755       // to be binary, then we trust it.
756       if (hint_is_text_plain) {
757         return have_enough_content;
758       }
759     }
760   }
761 
762   // If we have plain XML, sniff XML subtypes.
763   if (type_hint == "text/xml" || type_hint == "application/xml") {
764     // We're not interested in sniffing these types for images and the like.
765     // Instead, we're looking explicitly for a feed.  If we don't find one
766     // we're done and return early.
767     if (SniffXML(content, &have_enough_content, result))
768       return true;
769     return have_enough_content;
770   }
771 
772   // CRX files (Chrome extensions) have a special sniffing algorithm. It is
773   // tighter than the others because we don't have to match legacy behavior.
774   if (SniffCRX(content, url, &have_enough_content, result))
775     return true;
776 
777   // Check the file extension and magic numbers to see if this is an Office
778   // document.  This needs to be checked before the general magic numbers
779   // because zip files and Office documents (OOXML) have the same magic number.
780   if (SniffForOfficeDocs(content, url, &have_enough_content, result)) {
781     return true;  // We've matched a magic number.  No more content needed.
782   }
783 
784   // We're not interested in sniffing for magic numbers when the type_hint
785   // is application/octet-stream.  Time to bail out.
786   if (type_hint == "application/octet-stream")
787     return have_enough_content;
788 
789   // Now we look in our large table of magic numbers to see if we can find
790   // anything that matches the content.
791   if (SniffForMagicNumbers(content, &have_enough_content, result))
792     return true;  // We've matched a magic number.  No more content needed.
793 
794   return have_enough_content;
795 }
796 
SniffMimeTypeFromLocalData(std::string_view content,std::string * result)797 bool SniffMimeTypeFromLocalData(std::string_view content, std::string* result) {
798   // First check the extra table.
799   if (CheckForMagicNumbers(content, kExtraMagicNumbers, result))
800     return true;
801   // Finally check the original table.
802   return CheckForMagicNumbers(content, kMagicNumbers, result);
803 }
804 
// Returns true if |content| contains at least one "binary data byte" as
// defined by https://mimesniff.spec.whatwg.org/#binary-data-byte, and false
// otherwise (including for empty |content|).
bool LooksLikeBinary(std::string_view content) {
  // Every binary data byte is below 0x20.  Within that range only the five
  // control characters below count as text.  Bit N of the mask stands for
  // byte value N, so bit 0 is byte 0x00 and bit 31 is byte 0x1F.
  constexpr uint32_t kTextControlMask = (1u << 0x09) |  // '\t'
                                        (1u << 0x0A) |  // '\n'
                                        (1u << 0x0C) |  // '\f'
                                        (1u << 0x0D) |  // '\r'
                                        (1u << 0x1B);   // ESC
  for (const char ch : content) {
    const uint8_t value = static_cast<uint8_t>(ch);
    // Bytes at or above 0x20 are never binary data bytes.
    if (value >= 0x20)
      continue;
    const bool is_text_control = ((kTextControlMask >> value) & 1u) != 0;
    if (!is_text_control)
      return true;
  }
  return false;
}
822 
823 }  // namespace net
824