# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""

__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]

import calendar
import time

SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
# The list holds the 12 abbreviated names followed by the 12 full names, so
# an index >= 12 maps back onto the same month (see _parsedate_tz).
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Offsets are stored in +-HHMM form (e.g. -500 is five hours west of UTC);
# _parsedate_tz converts them to seconds.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple.

    The first nine items have the layout of a time tuple (year, month, day,
    hour, minute, second, weekday, yearday, dst); weekday, yearday and dst
    are always 0, 1 and -1 because they are not computed.  The tenth item is
    the timezone offset from UTC in seconds.

    Named zones are limited to the entries of _timezones; of the military
    zones only 'Z' is recognized.  An unknown zone, or an explicit -0000,
    yields an offset of 0.

    Returns None if the string cannot be parsed.
    """
    res = _parsedate_tz(data)
    if not res:
        return
    # _parsedate_tz uses None for "offset unknown"; the public API reports 0.
    if res[9] is None:
        res[9] = 0
    return tuple(res)

def _parsedate_tz(data):
    """Convert date to extended time tuple.

    Returns a 10-item list, or None if `data' cannot be parsed.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly disclaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.  The last element is also None when the
    zone is neither a known name nor an integer.

    """
    if not data:
        return None
    data = data.split()
    if not data:  # This happens for whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time ("19:12:08-0500"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not (dd and mm and yy):
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Nov 20" as well as "20 Nov".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # A full month name was matched; fold it onto 1..12.
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # The year and time fields are in the opposite order.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # The year and zone fields are in the opposite order.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = '0'
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        # "-0000" means "UTC, but the source zone is unknown".
        if tzoffset==0 and tz.startswith('-'):
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]


def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is the result of parsedate_tz() without the trailing timezone
    offset.  Returns None if the string cannot be parsed.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    else:
        return t


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    If the offset (item 9) is None the first eight items are interpreted as
    local time via time.mktime(); otherwise they are interpreted as UTC via
    calendar.timegm() and the offset is subtracted.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    else:
        t = calendar.timegm(data)
        return t - data[9]


def quote(str):
    """Prepare string to be used in a quoted string.

    Turns backslash and double quote characters into quoted pairs.  These
    are the only characters that need to be quoted inside a quoted string.
    Does not add the surrounding double quotes.
    """
    # Backslashes must be doubled first so the ones added for '"' survive.
    return str.replace('\\', '\\\\').replace('"', '\\"')


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (`pos') into `field'; every get*() method
    consumes text starting at the cursor and leaves the cursor after what it
    consumed.  Comments encountered on the way are collected in
    `commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string; CR and LF are
        skipped but not included.  Comment text is appended to
        self.commentlist.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list of (name, address) tuples.  Where getaddress()
        yields nothing, a ('', '') placeholder is recorded instead.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) tuples: one entry for a single
        address, several for a group, or an empty list if nothing usable
        was found.  A trailing comma separator is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far.  The phrase scan
        # below is only a look-ahead; if the text is re-parsed as an
        # addr-spec, comments found during the look-ahead must not be
        # reported a second time.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a '<', and '' if no addrspec
        is found before the closing '>'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Discard the domain of an "@domain" route element.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addrspec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the address as a string.  If no '@' follows the local part,
        only the local part is returned; if the domain is invalid, the empty
        string is returned.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace recorded just before the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # End of the local part; drop trailing whitespace.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Spaces and tabs are skipped and comments are moved to
        self.commentlist.  Returns the empty string if a second '@' is met.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the text between the delimiters.  A backslash is dropped and
        the character after it is taken literally; the text of an embedded
        comment is included without its parentheses.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal.

        The result is always wrapped in square brackets.
        """
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases).

        Returns the characters up to, but not including, the first
        delimiter; this is the empty string if the cursor is already on one.
        """
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; whitespace between them is discarded and
        comments are moved to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist

class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, address) tuples are kept in `addresslist'.  The
    +, +=, - and -= operators treat two lists as sets of such tuples.
    """
    def __init__(self, field):
        AddrlistClass.__init__(self, field)
        # A false `field' (None or '') gives an empty list without parsing.
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]