xref: /aosp_15_r20/external/cronet/net/http/http_content_disposition.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/http/http_content_disposition.h"
6 
7 #include <string_view>
8 
9 #include "base/base64.h"
10 #include "base/check_op.h"
11 #include "base/strings/escape.h"
12 #include "base/strings/string_tokenizer.h"
13 #include "base/strings/string_util.h"
14 #include "base/strings/sys_string_conversions.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "net/base/net_string_util.h"
17 #include "net/http/http_util.h"
18 
19 namespace net {
20 
21 namespace {
22 
// Selects which of the two RFC 2047 encoded-word encodings DecodeBQEncoding()
// should apply to the encoded text.
enum RFC2047EncodingType {
  // "Q" encoding; decoded by DecodeQEncoding().
  Q_ENCODING,
  // "B" encoding; decoded with base::Base64Decode().
  B_ENCODING
};
27 
28 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
29 // decoding a quoted-printable string.  Returns true if the input was valid.
DecodeQEncoding(std::string_view input,std::string * output)30 bool DecodeQEncoding(std::string_view input, std::string* output) {
31   std::string temp;
32   temp.reserve(input.size());
33   for (auto it = input.begin(); it != input.end(); ++it) {
34     if (*it == '_') {
35       temp.push_back(' ');
36     } else if (*it == '=') {
37       if ((input.end() - it < 3) ||
38           !base::IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
39           !base::IsHexDigit(static_cast<unsigned char>(*(it + 2))))
40         return false;
41       unsigned char ch =
42           base::HexDigitToInt(*(it + 1)) * 16 + base::HexDigitToInt(*(it + 2));
43       temp.push_back(static_cast<char>(ch));
44       ++it;
45       ++it;
46     } else if (0x20 < *it && *it < 0x7F && *it != '?') {
47       // In a Q-encoded word, only printable ASCII characters
48       // represent themselves. Besides, space, '=', '_' and '?' are
49       // not allowed, but they're already filtered out.
50       DCHECK_NE('=', *it);
51       DCHECK_NE('?', *it);
52       DCHECK_NE('_', *it);
53       temp.push_back(*it);
54     } else {
55       return false;
56     }
57   }
58   output->swap(temp);
59   return true;
60 }
61 
62 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
63 // type is specified in |enc_type|.
DecodeBQEncoding(std::string_view part,RFC2047EncodingType enc_type,const std::string & charset,std::string * output)64 bool DecodeBQEncoding(std::string_view part,
65                       RFC2047EncodingType enc_type,
66                       const std::string& charset,
67                       std::string* output) {
68   std::string decoded;
69   if (!((enc_type == B_ENCODING) ?
70         base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) {
71     return false;
72   }
73 
74   if (decoded.empty()) {
75     output->clear();
76     return true;
77   }
78 
79   return ConvertToUtf8(decoded, charset.c_str(), output);
80 }
81 
DecodeWord(std::string_view encoded_word,const std::string & referrer_charset,bool * is_rfc2047,std::string * output,int * parse_result_flags)82 bool DecodeWord(std::string_view encoded_word,
83                 const std::string& referrer_charset,
84                 bool* is_rfc2047,
85                 std::string* output,
86                 int* parse_result_flags) {
87   *is_rfc2047 = false;
88   output->clear();
89   if (encoded_word.empty())
90     return true;
91 
92   if (!base::IsStringASCII(encoded_word)) {
93     // Try UTF-8, referrer_charset and the native OS default charset in turn.
94     if (base::IsStringUTF8(encoded_word)) {
95       *output = std::string(encoded_word);
96     } else {
97       std::u16string utf16_output;
98       if (!referrer_charset.empty() &&
99           ConvertToUTF16(encoded_word, referrer_charset.c_str(),
100                          &utf16_output)) {
101         *output = base::UTF16ToUTF8(utf16_output);
102       } else {
103         *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word));
104       }
105     }
106 
107     *parse_result_flags |= HttpContentDisposition::HAS_NON_ASCII_STRINGS;
108     return true;
109   }
110 
111   // RFC 2047 : one of encoding methods supported by Firefox and relatively
112   // widely used by web servers.
113   // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
114   // We don't care about the length restriction (72 bytes) because
115   // many web servers generate encoded words longer than the limit.
116   std::string decoded_word;
117   *is_rfc2047 = true;
118   int part_index = 0;
119   std::string charset;
120   base::CStringTokenizer t(encoded_word.data(),
121                            encoded_word.data() + encoded_word.size(), "?");
122   RFC2047EncodingType enc_type = Q_ENCODING;
123   while (*is_rfc2047 && t.GetNext()) {
124     std::string_view part = t.token_piece();
125     switch (part_index) {
126       case 0:
127         if (part != "=") {
128           *is_rfc2047 = false;
129           break;
130         }
131         ++part_index;
132         break;
133       case 1:
134         // Do we need charset validity check here?
135         charset = std::string(part);
136         ++part_index;
137         break;
138       case 2:
139         if (part.size() > 1 ||
140             part.find_first_of("bBqQ") == std::string::npos) {
141           *is_rfc2047 = false;
142           break;
143         }
144         if (part[0] == 'b' || part[0] == 'B') {
145           enc_type = B_ENCODING;
146         }
147         ++part_index;
148         break;
149       case 3:
150         *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
151         if (!*is_rfc2047) {
152           // Last minute failure. Invalid B/Q encoding. Rather than
153           // passing it through, return now.
154           return false;
155         }
156         ++part_index;
157         break;
158       case 4:
159         if (part != "=") {
160           // Another last minute failure !
161           // Likely to be a case of two encoded-words in a row or
162           // an encoded word followed by a non-encoded word. We can be
163           // generous, but it does not help much in terms of compatibility,
164           // I believe. Return immediately.
165           *is_rfc2047 = false;
166           return false;
167         }
168         ++part_index;
169         break;
170       default:
171         *is_rfc2047 = false;
172         return false;
173     }
174   }
175 
176   if (*is_rfc2047) {
177     if (*(encoded_word.end() - 1) == '=') {
178       output->swap(decoded_word);
179       *parse_result_flags |=
180           HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
181       return true;
182     }
183     // encoded_word ending prematurelly with '?' or extra '?'
184     *is_rfc2047 = false;
185     return false;
186   }
187 
188   // We're not handling 'especial' characters quoted with '\', but
189   // it should be Ok because we're not an email client but a
190   // web browser.
191 
192   // What IE6/7 does: %-escaped UTF-8.
193   decoded_word = base::UnescapeBinaryURLComponent(encoded_word,
194                                                   base::UnescapeRule::NORMAL);
195   if (decoded_word != encoded_word)
196     *parse_result_flags |= HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
197   if (base::IsStringUTF8(decoded_word)) {
198     output->swap(decoded_word);
199     return true;
200     // We can try either the OS default charset or 'origin charset' here,
201     // As far as I can tell, IE does not support it. However, I've seen
202     // web servers emit %-escaped string in a legacy encoding (usually
203     // origin charset).
204     // TODO(jungshik) : Test IE further and consider adding a fallback here.
205   }
206   return false;
207 }
208 
209 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
210 // value is supposed to be of the form:
211 //
212 //   value                   = token | quoted-string
213 //
214 // However we currently also allow RFC 2047 encoding and non-ASCII
215 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
DecodeFilenameValue(const std::string & input,const std::string & referrer_charset,std::string * output,int * parse_result_flags)216 bool DecodeFilenameValue(const std::string& input,
217                          const std::string& referrer_charset,
218                          std::string* output,
219                          int* parse_result_flags) {
220   int current_parse_result_flags = 0;
221   std::string decoded_value;
222   bool is_previous_token_rfc2047 = true;
223 
224   // Tokenize with whitespace characters.
225   base::StringTokenizer t(input, " \t\n\r");
226   t.set_options(base::StringTokenizer::RETURN_DELIMS);
227   while (t.GetNext()) {
228     if (t.token_is_delim()) {
229       // If the previous non-delimeter token is not RFC2047-encoded,
230       // put in a space in its place. Otheriwse, skip over it.
231       if (!is_previous_token_rfc2047)
232         decoded_value.push_back(' ');
233       continue;
234     }
235     // We don't support a single multibyte character split into
236     // adjacent encoded words. Some broken mail clients emit headers
237     // with that problem, but most web servers usually encode a filename
238     // in a single encoded-word. Firefox/Thunderbird do not support
239     // it, either.
240     std::string decoded;
241     if (!DecodeWord(t.token_piece(), referrer_charset,
242                     &is_previous_token_rfc2047, &decoded,
243                     &current_parse_result_flags))
244       return false;
245     decoded_value.append(decoded);
246   }
247   output->swap(decoded_value);
248   if (parse_result_flags && !output->empty())
249     *parse_result_flags |= current_parse_result_flags;
250   return true;
251 }
252 
253 // Parses the charset and value-chars out of an ext-value string.
254 //
255 //  ext-value     = charset  "'" [ language ] "'" value-chars
ParseExtValueComponents(const std::string & input,std::string * charset,std::string * value_chars)256 bool ParseExtValueComponents(const std::string& input,
257                              std::string* charset,
258                              std::string* value_chars) {
259   base::StringTokenizer t(input, "'");
260   t.set_options(base::StringTokenizer::RETURN_DELIMS);
261   std::string_view temp_charset;
262   std::string_view temp_value;
263   int num_delims_seen = 0;
264   while (t.GetNext()) {
265     if (t.token_is_delim()) {
266       ++num_delims_seen;
267       continue;
268     } else {
269       switch (num_delims_seen) {
270         case 0:
271           temp_charset = t.token_piece();
272           break;
273         case 1:
274           // Language is ignored.
275           break;
276         case 2:
277           temp_value = t.token_piece();
278           break;
279         default:
280           return false;
281       }
282     }
283   }
284   if (num_delims_seen != 2)
285     return false;
286   if (temp_charset.empty() || temp_value.empty())
287     return false;
288   *charset = std::string(temp_charset);
289   *value_chars = std::string(temp_value);
290   return true;
291 }
292 
293 // http://tools.ietf.org/html/rfc5987#section-3.2
294 //
295 //  ext-value     = charset  "'" [ language ] "'" value-chars
296 //
297 //  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
298 //
299 //  mime-charset  = 1*mime-charsetc
300 //  mime-charsetc = ALPHA / DIGIT
301 //                 / "!" / "#" / "$" / "%" / "&"
302 //                 / "+" / "-" / "^" / "_" / "`"
303 //                 / "{" / "}" / "~"
304 //
305 //  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
306 //
307 //  value-chars   = *( pct-encoded / attr-char )
308 //
309 //  pct-encoded   = "%" HEXDIG HEXDIG
310 //
311 //  attr-char     = ALPHA / DIGIT
312 //                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
313 //                 / "^" / "_" / "`" / "|" / "~"
DecodeExtValue(const std::string & param_value,std::string * decoded)314 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
315   if (param_value.find('"') != std::string::npos)
316     return false;
317 
318   std::string charset;
319   std::string value;
320   if (!ParseExtValueComponents(param_value, &charset, &value))
321     return false;
322 
323   // RFC 5987 value should be ASCII-only.
324   if (!base::IsStringASCII(value)) {
325     decoded->clear();
326     return true;
327   }
328 
329   std::string unescaped =
330       base::UnescapeBinaryURLComponent(value, base::UnescapeRule::NORMAL);
331 
332   return ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded);
333 }
334 
335 } // namespace
336 
// Builds the object by immediately parsing |header|; both arguments are
// forwarded unchanged to Parse().
HttpContentDisposition::HttpContentDisposition(
    const std::string& header,
    const std::string& referrer_charset) {
  Parse(header, referrer_charset);
}
342 
// Uses the compiler-generated destructor.
HttpContentDisposition::~HttpContentDisposition() = default;
344 
ConsumeDispositionType(std::string::const_iterator begin,std::string::const_iterator end)345 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
346     std::string::const_iterator begin, std::string::const_iterator end) {
347   DCHECK(type_ == INLINE);
348   auto header = base::MakeStringPiece(begin, end);
349   size_t delimiter = header.find(';');
350   std::string_view type = header.substr(0, delimiter);
351   type = HttpUtil::TrimLWS(type);
352 
353   // If the disposition-type isn't a valid token the then the
354   // Content-Disposition header is malformed, and we treat the first bytes as
355   // a parameter rather than a disposition-type.
356   if (type.empty() || !HttpUtil::IsToken(type))
357     return begin;
358 
359   parse_result_flags_ |= HAS_DISPOSITION_TYPE;
360 
361   DCHECK(type.find('=') == std::string_view::npos);
362 
363   if (base::EqualsCaseInsensitiveASCII(type, "inline")) {
364     type_ = INLINE;
365   } else if (base::EqualsCaseInsensitiveASCII(type, "attachment")) {
366     type_ = ATTACHMENT;
367   } else {
368     parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
369     type_ = ATTACHMENT;
370   }
371   return begin + (type.data() + type.size() - header.data());
372 }
373 
374 // http://tools.ietf.org/html/rfc6266
375 //
376 //  content-disposition = "Content-Disposition" ":"
377 //                         disposition-type *( ";" disposition-parm )
378 //
379 //  disposition-type    = "inline" | "attachment" | disp-ext-type
380 //                      ; case-insensitive
381 //  disp-ext-type       = token
382 //
383 //  disposition-parm    = filename-parm | disp-ext-parm
384 //
385 //  filename-parm       = "filename" "=" value
386 //                      | "filename*" "=" ext-value
387 //
388 //  disp-ext-parm       = token "=" value
389 //                      | ext-token "=" ext-value
390 //  ext-token           = <the characters in token, followed by "*">
391 //
Parse(const std::string & header,const std::string & referrer_charset)392 void HttpContentDisposition::Parse(const std::string& header,
393                                    const std::string& referrer_charset) {
394   DCHECK(type_ == INLINE);
395   DCHECK(filename_.empty());
396 
397   std::string::const_iterator pos = header.begin();
398   std::string::const_iterator end = header.end();
399   pos = ConsumeDispositionType(pos, end);
400 
401   std::string filename;
402   std::string ext_filename;
403 
404   HttpUtil::NameValuePairsIterator iter(pos, end, ';');
405   while (iter.GetNext()) {
406     if (filename.empty() &&
407         base::EqualsCaseInsensitiveASCII(iter.name_piece(), "filename")) {
408       DecodeFilenameValue(iter.value(), referrer_charset, &filename,
409                           &parse_result_flags_);
410       if (!filename.empty()) {
411         parse_result_flags_ |= HAS_FILENAME;
412         if (filename[0] == '\'')
413           parse_result_flags_ |= HAS_SINGLE_QUOTED_FILENAME;
414       }
415     } else if (ext_filename.empty() && base::EqualsCaseInsensitiveASCII(
416                                            iter.name_piece(), "filename*")) {
417       DecodeExtValue(iter.raw_value(), &ext_filename);
418       if (!ext_filename.empty())
419         parse_result_flags_ |= HAS_EXT_FILENAME;
420     }
421   }
422 
423   if (!ext_filename.empty())
424     filename_ = ext_filename;
425   else
426     filename_ = filename;
427 
428   if (!filename.empty() && filename[0] == '\'')
429     parse_result_flags_ |= HAS_SINGLE_QUOTED_FILENAME;
430 }
431 
432 }  // namespace net
433