1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/http/http_content_disposition.h"
6
7 #include <string_view>
8
9 #include "base/base64.h"
10 #include "base/check_op.h"
11 #include "base/strings/escape.h"
12 #include "base/strings/string_tokenizer.h"
13 #include "base/strings/string_util.h"
14 #include "base/strings/sys_string_conversions.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "net/base/net_string_util.h"
17 #include "net/http/http_util.h"
18
19 namespace net {
20
21 namespace {
22
// Selects which RFC 2047 encoded-word encoding a piece of text uses.
enum RFC2047EncodingType {
  Q_ENCODING = 0,  // "Q" encoding, similar to quoted-printable.
  B_ENCODING = 1,  // "B" encoding, i.e. base64.
};
27
28 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
29 // decoding a quoted-printable string. Returns true if the input was valid.
DecodeQEncoding(std::string_view input,std::string * output)30 bool DecodeQEncoding(std::string_view input, std::string* output) {
31 std::string temp;
32 temp.reserve(input.size());
33 for (auto it = input.begin(); it != input.end(); ++it) {
34 if (*it == '_') {
35 temp.push_back(' ');
36 } else if (*it == '=') {
37 if ((input.end() - it < 3) ||
38 !base::IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
39 !base::IsHexDigit(static_cast<unsigned char>(*(it + 2))))
40 return false;
41 unsigned char ch =
42 base::HexDigitToInt(*(it + 1)) * 16 + base::HexDigitToInt(*(it + 2));
43 temp.push_back(static_cast<char>(ch));
44 ++it;
45 ++it;
46 } else if (0x20 < *it && *it < 0x7F && *it != '?') {
47 // In a Q-encoded word, only printable ASCII characters
48 // represent themselves. Besides, space, '=', '_' and '?' are
49 // not allowed, but they're already filtered out.
50 DCHECK_NE('=', *it);
51 DCHECK_NE('?', *it);
52 DCHECK_NE('_', *it);
53 temp.push_back(*it);
54 } else {
55 return false;
56 }
57 }
58 output->swap(temp);
59 return true;
60 }
61
62 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
63 // type is specified in |enc_type|.
DecodeBQEncoding(std::string_view part,RFC2047EncodingType enc_type,const std::string & charset,std::string * output)64 bool DecodeBQEncoding(std::string_view part,
65 RFC2047EncodingType enc_type,
66 const std::string& charset,
67 std::string* output) {
68 std::string decoded;
69 if (!((enc_type == B_ENCODING) ?
70 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) {
71 return false;
72 }
73
74 if (decoded.empty()) {
75 output->clear();
76 return true;
77 }
78
79 return ConvertToUtf8(decoded, charset.c_str(), output);
80 }
81
DecodeWord(std::string_view encoded_word,const std::string & referrer_charset,bool * is_rfc2047,std::string * output,int * parse_result_flags)82 bool DecodeWord(std::string_view encoded_word,
83 const std::string& referrer_charset,
84 bool* is_rfc2047,
85 std::string* output,
86 int* parse_result_flags) {
87 *is_rfc2047 = false;
88 output->clear();
89 if (encoded_word.empty())
90 return true;
91
92 if (!base::IsStringASCII(encoded_word)) {
93 // Try UTF-8, referrer_charset and the native OS default charset in turn.
94 if (base::IsStringUTF8(encoded_word)) {
95 *output = std::string(encoded_word);
96 } else {
97 std::u16string utf16_output;
98 if (!referrer_charset.empty() &&
99 ConvertToUTF16(encoded_word, referrer_charset.c_str(),
100 &utf16_output)) {
101 *output = base::UTF16ToUTF8(utf16_output);
102 } else {
103 *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word));
104 }
105 }
106
107 *parse_result_flags |= HttpContentDisposition::HAS_NON_ASCII_STRINGS;
108 return true;
109 }
110
111 // RFC 2047 : one of encoding methods supported by Firefox and relatively
112 // widely used by web servers.
113 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
114 // We don't care about the length restriction (72 bytes) because
115 // many web servers generate encoded words longer than the limit.
116 std::string decoded_word;
117 *is_rfc2047 = true;
118 int part_index = 0;
119 std::string charset;
120 base::CStringTokenizer t(encoded_word.data(),
121 encoded_word.data() + encoded_word.size(), "?");
122 RFC2047EncodingType enc_type = Q_ENCODING;
123 while (*is_rfc2047 && t.GetNext()) {
124 std::string_view part = t.token_piece();
125 switch (part_index) {
126 case 0:
127 if (part != "=") {
128 *is_rfc2047 = false;
129 break;
130 }
131 ++part_index;
132 break;
133 case 1:
134 // Do we need charset validity check here?
135 charset = std::string(part);
136 ++part_index;
137 break;
138 case 2:
139 if (part.size() > 1 ||
140 part.find_first_of("bBqQ") == std::string::npos) {
141 *is_rfc2047 = false;
142 break;
143 }
144 if (part[0] == 'b' || part[0] == 'B') {
145 enc_type = B_ENCODING;
146 }
147 ++part_index;
148 break;
149 case 3:
150 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
151 if (!*is_rfc2047) {
152 // Last minute failure. Invalid B/Q encoding. Rather than
153 // passing it through, return now.
154 return false;
155 }
156 ++part_index;
157 break;
158 case 4:
159 if (part != "=") {
160 // Another last minute failure !
161 // Likely to be a case of two encoded-words in a row or
162 // an encoded word followed by a non-encoded word. We can be
163 // generous, but it does not help much in terms of compatibility,
164 // I believe. Return immediately.
165 *is_rfc2047 = false;
166 return false;
167 }
168 ++part_index;
169 break;
170 default:
171 *is_rfc2047 = false;
172 return false;
173 }
174 }
175
176 if (*is_rfc2047) {
177 if (*(encoded_word.end() - 1) == '=') {
178 output->swap(decoded_word);
179 *parse_result_flags |=
180 HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
181 return true;
182 }
183 // encoded_word ending prematurelly with '?' or extra '?'
184 *is_rfc2047 = false;
185 return false;
186 }
187
188 // We're not handling 'especial' characters quoted with '\', but
189 // it should be Ok because we're not an email client but a
190 // web browser.
191
192 // What IE6/7 does: %-escaped UTF-8.
193 decoded_word = base::UnescapeBinaryURLComponent(encoded_word,
194 base::UnescapeRule::NORMAL);
195 if (decoded_word != encoded_word)
196 *parse_result_flags |= HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
197 if (base::IsStringUTF8(decoded_word)) {
198 output->swap(decoded_word);
199 return true;
200 // We can try either the OS default charset or 'origin charset' here,
201 // As far as I can tell, IE does not support it. However, I've seen
202 // web servers emit %-escaped string in a legacy encoding (usually
203 // origin charset).
204 // TODO(jungshik) : Test IE further and consider adding a fallback here.
205 }
206 return false;
207 }
208
209 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
210 // value is supposed to be of the form:
211 //
212 // value = token | quoted-string
213 //
214 // However we currently also allow RFC 2047 encoding and non-ASCII
215 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
DecodeFilenameValue(const std::string & input,const std::string & referrer_charset,std::string * output,int * parse_result_flags)216 bool DecodeFilenameValue(const std::string& input,
217 const std::string& referrer_charset,
218 std::string* output,
219 int* parse_result_flags) {
220 int current_parse_result_flags = 0;
221 std::string decoded_value;
222 bool is_previous_token_rfc2047 = true;
223
224 // Tokenize with whitespace characters.
225 base::StringTokenizer t(input, " \t\n\r");
226 t.set_options(base::StringTokenizer::RETURN_DELIMS);
227 while (t.GetNext()) {
228 if (t.token_is_delim()) {
229 // If the previous non-delimeter token is not RFC2047-encoded,
230 // put in a space in its place. Otheriwse, skip over it.
231 if (!is_previous_token_rfc2047)
232 decoded_value.push_back(' ');
233 continue;
234 }
235 // We don't support a single multibyte character split into
236 // adjacent encoded words. Some broken mail clients emit headers
237 // with that problem, but most web servers usually encode a filename
238 // in a single encoded-word. Firefox/Thunderbird do not support
239 // it, either.
240 std::string decoded;
241 if (!DecodeWord(t.token_piece(), referrer_charset,
242 &is_previous_token_rfc2047, &decoded,
243 ¤t_parse_result_flags))
244 return false;
245 decoded_value.append(decoded);
246 }
247 output->swap(decoded_value);
248 if (parse_result_flags && !output->empty())
249 *parse_result_flags |= current_parse_result_flags;
250 return true;
251 }
252
253 // Parses the charset and value-chars out of an ext-value string.
254 //
255 // ext-value = charset "'" [ language ] "'" value-chars
ParseExtValueComponents(const std::string & input,std::string * charset,std::string * value_chars)256 bool ParseExtValueComponents(const std::string& input,
257 std::string* charset,
258 std::string* value_chars) {
259 base::StringTokenizer t(input, "'");
260 t.set_options(base::StringTokenizer::RETURN_DELIMS);
261 std::string_view temp_charset;
262 std::string_view temp_value;
263 int num_delims_seen = 0;
264 while (t.GetNext()) {
265 if (t.token_is_delim()) {
266 ++num_delims_seen;
267 continue;
268 } else {
269 switch (num_delims_seen) {
270 case 0:
271 temp_charset = t.token_piece();
272 break;
273 case 1:
274 // Language is ignored.
275 break;
276 case 2:
277 temp_value = t.token_piece();
278 break;
279 default:
280 return false;
281 }
282 }
283 }
284 if (num_delims_seen != 2)
285 return false;
286 if (temp_charset.empty() || temp_value.empty())
287 return false;
288 *charset = std::string(temp_charset);
289 *value_chars = std::string(temp_value);
290 return true;
291 }
292
293 // http://tools.ietf.org/html/rfc5987#section-3.2
294 //
295 // ext-value = charset "'" [ language ] "'" value-chars
296 //
297 // charset = "UTF-8" / "ISO-8859-1" / mime-charset
298 //
299 // mime-charset = 1*mime-charsetc
300 // mime-charsetc = ALPHA / DIGIT
301 // / "!" / "#" / "$" / "%" / "&"
302 // / "+" / "-" / "^" / "_" / "`"
303 // / "{" / "}" / "~"
304 //
305 // language = <Language-Tag, defined in [RFC5646], Section 2.1>
306 //
307 // value-chars = *( pct-encoded / attr-char )
308 //
309 // pct-encoded = "%" HEXDIG HEXDIG
310 //
311 // attr-char = ALPHA / DIGIT
312 // / "!" / "#" / "$" / "&" / "+" / "-" / "."
313 // / "^" / "_" / "`" / "|" / "~"
DecodeExtValue(const std::string & param_value,std::string * decoded)314 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
315 if (param_value.find('"') != std::string::npos)
316 return false;
317
318 std::string charset;
319 std::string value;
320 if (!ParseExtValueComponents(param_value, &charset, &value))
321 return false;
322
323 // RFC 5987 value should be ASCII-only.
324 if (!base::IsStringASCII(value)) {
325 decoded->clear();
326 return true;
327 }
328
329 std::string unescaped =
330 base::UnescapeBinaryURLComponent(value, base::UnescapeRule::NORMAL);
331
332 return ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded);
333 }
334
335 } // namespace
336
HttpContentDisposition(const std::string & header,const std::string & referrer_charset)337 HttpContentDisposition::HttpContentDisposition(
338 const std::string& header,
339 const std::string& referrer_charset) {
340 Parse(header, referrer_charset);
341 }
342
343 HttpContentDisposition::~HttpContentDisposition() = default;
344
ConsumeDispositionType(std::string::const_iterator begin,std::string::const_iterator end)345 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
346 std::string::const_iterator begin, std::string::const_iterator end) {
347 DCHECK(type_ == INLINE);
348 auto header = base::MakeStringPiece(begin, end);
349 size_t delimiter = header.find(';');
350 std::string_view type = header.substr(0, delimiter);
351 type = HttpUtil::TrimLWS(type);
352
353 // If the disposition-type isn't a valid token the then the
354 // Content-Disposition header is malformed, and we treat the first bytes as
355 // a parameter rather than a disposition-type.
356 if (type.empty() || !HttpUtil::IsToken(type))
357 return begin;
358
359 parse_result_flags_ |= HAS_DISPOSITION_TYPE;
360
361 DCHECK(type.find('=') == std::string_view::npos);
362
363 if (base::EqualsCaseInsensitiveASCII(type, "inline")) {
364 type_ = INLINE;
365 } else if (base::EqualsCaseInsensitiveASCII(type, "attachment")) {
366 type_ = ATTACHMENT;
367 } else {
368 parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
369 type_ = ATTACHMENT;
370 }
371 return begin + (type.data() + type.size() - header.data());
372 }
373
374 // http://tools.ietf.org/html/rfc6266
375 //
376 // content-disposition = "Content-Disposition" ":"
377 // disposition-type *( ";" disposition-parm )
378 //
379 // disposition-type = "inline" | "attachment" | disp-ext-type
380 // ; case-insensitive
381 // disp-ext-type = token
382 //
383 // disposition-parm = filename-parm | disp-ext-parm
384 //
385 // filename-parm = "filename" "=" value
386 // | "filename*" "=" ext-value
387 //
388 // disp-ext-parm = token "=" value
389 // | ext-token "=" ext-value
390 // ext-token = <the characters in token, followed by "*">
391 //
Parse(const std::string & header,const std::string & referrer_charset)392 void HttpContentDisposition::Parse(const std::string& header,
393 const std::string& referrer_charset) {
394 DCHECK(type_ == INLINE);
395 DCHECK(filename_.empty());
396
397 std::string::const_iterator pos = header.begin();
398 std::string::const_iterator end = header.end();
399 pos = ConsumeDispositionType(pos, end);
400
401 std::string filename;
402 std::string ext_filename;
403
404 HttpUtil::NameValuePairsIterator iter(pos, end, ';');
405 while (iter.GetNext()) {
406 if (filename.empty() &&
407 base::EqualsCaseInsensitiveASCII(iter.name_piece(), "filename")) {
408 DecodeFilenameValue(iter.value(), referrer_charset, &filename,
409 &parse_result_flags_);
410 if (!filename.empty()) {
411 parse_result_flags_ |= HAS_FILENAME;
412 if (filename[0] == '\'')
413 parse_result_flags_ |= HAS_SINGLE_QUOTED_FILENAME;
414 }
415 } else if (ext_filename.empty() && base::EqualsCaseInsensitiveASCII(
416 iter.name_piece(), "filename*")) {
417 DecodeExtValue(iter.raw_value(), &ext_filename);
418 if (!ext_filename.empty())
419 parse_result_flags_ |= HAS_EXT_FILENAME;
420 }
421 }
422
423 if (!ext_filename.empty())
424 filename_ = ext_filename;
425 else
426 filename_ = filename;
427
428 if (!filename.empty() && filename[0] == '\'')
429 parse_result_flags_ |= HAS_SINGLE_QUOTED_FILENAME;
430 }
431
432 } // namespace net
433