# Copyright (C) 2001-2006 Python Software Foundation
# Author: Ben Gertzfield
# Contact: [email protected]

"""Quoted-printable content transfer encoding per RFCs 2045-2047.

This module handles the content transfer encoding method defined in RFC 2045
to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to
safely encode text that is in a character set similar to the 7-bit US ASCII
character set, but that includes some 8-bit characters that are normally not
allowed in email bodies or headers.

Quoted-printable is very space-inefficient for encoding binary files; use the
email.base64mime module for that instead.

This module provides an interface to encode and decode both headers and bodies
with quoted-printable encoding.

RFC 2047 defines a method for including character set information in an
`encoded-word' in a header.  This method is commonly used for 8-bit real names
in To:/From:/Cc: etc. fields, as well as Subject: lines.

This module does not do the line wrapping or end-of-line character
conversion necessary for proper internationalized headers; it only
does dumb encoding and decoding.  To deal with the various line
wrapping issues, use the email.header module.
"""

__all__ = [
    'body_decode',
    'body_encode',
    'body_length',
    'decode',
    'decodestring',
    'header_decode',
    'header_encode',
    'header_length',
    'quote',
    'unquote',
    ]

import re

from string import ascii_letters, digits, hexdigits

CRLF = '\r\n'
NL = '\n'
EMPTYSTRING = ''

# Build a mapping of octets to the expansion of that octet.  Since we're only
# going to have 256 of these things, this isn't terribly inefficient
# space-wise.  Remember that headers and bodies have different sets of safe
# characters.  Initialize both maps with the full expansion, and then override
# the safe bytes with the more compact form.
_QUOPRI_MAP = ['=%02X' % c for c in range(256)]
_QUOPRI_HEADER_MAP = _QUOPRI_MAP[:]
_QUOPRI_BODY_MAP = _QUOPRI_MAP[:]

# Safe header bytes which need no encoding.
for c in b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'):
    _QUOPRI_HEADER_MAP[c] = chr(c)
# Headers have one other special encoding; spaces become underscores.
_QUOPRI_HEADER_MAP[ord(' ')] = '_'

# Safe body bytes which need no encoding.
for c in (b' !"#$%&\'()*+,-./0123456789:;<>'
          b'?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
          b'abcdefghijklmnopqrstuvwxyz{|}~\t'):
    _QUOPRI_BODY_MAP[c] = chr(c)



# Helpers
def header_check(octet):
    """Return True if the octet should be escaped with header quopri."""
    return chr(octet) != _QUOPRI_HEADER_MAP[octet]


def body_check(octet):
    """Return True if the octet should be escaped with body quopri."""
    return chr(octet) != _QUOPRI_BODY_MAP[octet]


def header_length(bytearray):
    """Return a header quoted-printable encoding length.

    Note that this does not include any RFC 2047 chrome added by
    `header_encode()`.

    :param bytearray: An array of bytes (a.k.a. octets).
    :return: The length in bytes of the byte array when it is encoded with
        quoted-printable for headers.
    """
    return sum(len(_QUOPRI_HEADER_MAP[octet]) for octet in bytearray)


def body_length(bytearray):
    """Return a body quoted-printable encoding length.

    :param bytearray: An array of bytes (a.k.a. octets).
    :return: The length in bytes of the byte array when it is encoded with
        quoted-printable for bodies.
    """
    return sum(len(_QUOPRI_BODY_MAP[octet]) for octet in bytearray)


def _max_append(L, s, maxlen, extra=''):
    """Add s to the list of chunks L, respecting maxlen.

    If s is not a str it is converted with chr().  When L is empty, or when
    the last chunk plus s would be longer than maxlen, s (with leading
    whitespace stripped) becomes a new chunk.  Otherwise extra + s is
    appended to the last chunk; note that the length of extra is not counted
    in the maxlen check.
    """
    if not isinstance(s, str):
        s = chr(s)
    if not L:
        L.append(s.lstrip())
    elif len(L[-1]) + len(s) <= maxlen:
        L[-1] += extra + s
    else:
        L.append(s.lstrip())


def unquote(s):
    """Turn a string in the form =AB to the ASCII character with value 0xab"""
    return chr(int(s[1:3], 16))


def quote(c):
    """Return the three-character =XX escape for the single character c."""
    return _QUOPRI_MAP[ord(c)]


def header_encode(header_bytes, charset='iso-8859-1'):
    """Encode a single header line with quoted-printable (like) encoding.

    Defined in RFC 2047, this `Q' encoding is similar to quoted-printable, but
    used specifically for email header fields to allow charsets with mostly 7
    bit characters (and some 8 bit) to remain more or less readable in non-RFC
    2047 aware mail clients.

    charset names the character set to use in the RFC 2047 encoded-word.  It
    defaults to iso-8859-1.
    """
    # Return empty headers as an empty string.
    if not header_bytes:
        return ''
    # Decoding as latin1 maps every byte to the code point of the same value,
    # so the 256-entry header map can be applied in a single translate() pass.
    encoded = header_bytes.decode('latin1').translate(_QUOPRI_HEADER_MAP)
    # Now add the RFC chrome around the encoded text.
    return '=?%s?q?%s?=' % (charset, encoded)


# Same as the body map, except that CR and LF are passed through unescaped so
# that body_encode() can still split the translated text into lines.
_QUOPRI_BODY_ENCODE_MAP = _QUOPRI_BODY_MAP[:]
for c in b'\r\n':
    _QUOPRI_BODY_ENCODE_MAP[c] = chr(c)
del c

def body_encode(body, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    Each line of encoded text will end with eol, which defaults to "\\n".  Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.

    Each line will be wrapped at, at most, maxlinelen characters before the
    eol string (maxlinelen defaults to 76 characters, the maximum value
    permitted by RFC 2045).  Long lines will have the 'soft line break'
    quoted-printable character "=" appended to them, so the decoded text will
    be identical to the original text.

    The minimum maxlinelen is 4 to have room for a quoted character ("=XX")
    followed by a soft line break.  Smaller values will generate a
    ValueError.

    """

    if maxlinelen < 4:
        raise ValueError("maxlinelen must be at least 4")
    if not body:
        return body

    # quote special characters
    body = body.translate(_QUOPRI_BODY_ENCODE_MAP)

    soft_break = '=' + eol
    # leave space for the '=' at the end of a line
    maxlinelen1 = maxlinelen - 1

    encoded_body = []
    append = encoded_body.append

    for line in body.splitlines():
        # break up the line into pieces no longer than maxlinelen - 1
        start = 0
        laststart = len(line) - 1 - maxlinelen
        while start <= laststart:
            stop = start + maxlinelen1
            # make sure we don't break up an escape sequence
            if line[stop - 2] == '=':
                append(line[start:stop - 1])
                start = stop - 2
            elif line[stop - 1] == '=':
                append(line[start:stop])
                start = stop - 1
            else:
                append(line[start:stop] + '=')
                start = stop

        # handle rest of line, special case if line ends in whitespace
        if line and line[-1] in ' \t':
            room = start - laststart
            if room >= 3:
                # It's a whitespace character at end-of-line, and we have room
                # for the three-character quoted encoding.
                q = quote(line[-1])
            elif room == 2:
                # There's room for the whitespace character and a soft break.
                q = line[-1] + soft_break
            else:
                # There's room only for a soft break.  The quoted whitespace
                # will be the only content on the subsequent line.
                q = soft_break + quote(line[-1])
            append(line[start:-1] + q)
        else:
            append(line[start:])

    # add back final newline if present
    if body[-1] in CRLF:
        append('')

    return eol.join(encoded_body)



# BAW: I'm not sure if the intent was for the signature of this function to be
# the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
    """Decode a quoted-printable string.

    Lines are separated with eol, which defaults to \\n.

    Trailing whitespace on each encoded line is discarded, a lone "=" at the
    end of a line is treated as a soft line break (no eol is emitted for that
    line), "=XX" with two hex digits is replaced by the corresponding
    character, and any other "=" is passed through literally.  If the input
    does not end with a CR or LF, the eol added after the last line is removed
    again so that no line ending is invented.  A falsy input is returned
    unchanged.
    """
    if not encoded:
        return encoded
    # Collect the decoded fragments and join once at the end; repeated string
    # concatenation would be quadratic in the size of the input.
    pieces = []
    append = pieces.append

    for line in encoded.splitlines():
        line = line.rstrip()
        if not line:
            append(eol)
            continue

        i = 0
        n = len(line)
        while i < n:
            c = line[i]
            if c != '=':
                append(c)
                i += 1
            # Otherwise, c == "=".  Are we at the end of the line?  If so,
            # this is a soft line break: skip it and, because of the
            # `continue`, skip the eol that would otherwise be added below.
            elif i+1 == n:
                i += 1
                continue
            # Decode if in form =AB
            elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
                append(unquote(line[i:i+3]))
                i += 3
            # Otherwise, not in form =AB, pass literally
            else:
                append(c)
                i += 1

            if i == n:
                append(eol)

    decoded = EMPTYSTRING.join(pieces)
    # Special case if original string did not end with eol.  Strip the whole
    # eol sequence (it may be longer than one character, e.g. "\r\n"), and
    # strip nothing when eol is empty so that real data is never removed.
    if eol and encoded[-1] not in '\r\n' and decoded.endswith(eol):
        decoded = decoded[:-len(eol)]
    return decoded


# For convenience and backwards compatibility w/ standard base64 module
body_decode = decode
decodestring = decode



def _unquote_match(match):
    """Turn a match in the form =AB to the ASCII character with value 0xab"""
    s = match.group(0)
    return unquote(s)


# Header decoding is done a bit differently
def header_decode(s):
    """Decode a string encoded with RFC 2045 MIME header `Q' encoding.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    the high level email.header class for that functionality.
    """
    s = s.replace('_', ' ')
    return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s, flags=re.ASCII)