xref: /aosp_15_r20/prebuilts/build-tools/common/py3-stdlib/email/_parseaddr.py (revision cda5da8d549138a6648c5ee6d7a49cf8f4a657be)
1# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py.  This should eventually be rewritten.
7"""
8
# Names exported by a star-import of this module.  AddrlistClass and
# AddressList are defined further down but are deliberately not listed here.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
15
16import time, calendar
17
# Separator strings used with str.join() by the address parser below.
# NOTE(review): COMMASPACE has no user in the code visible in this file --
# confirm against external importers before removing it.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
# Month names recognised by _parsedate_tz(), which lower-cases the token
# before looking it up.  The three-letter abbreviations come first and the
# full names second, so a match in the second half of the list is folded back
# onto 1..12 by subtracting 12.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

# Day-of-week abbreviations; a leading token equal to one of these (after
# lower-casing) is dropped by _parsedate_tz().
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are signed HHMM integers (-500 stands for -05:00), the same form a
# numeric zone such as "-0500" takes after int(); _parsedate_tz() converts
# them to seconds.  Keys are upper case because the zone token is upper-cased
# before the lookup.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
43
44
def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    The result has the layout
    (year, month, day, hour, minute, second, 0, 1, -1, offset), where
    offset is the zone offset in seconds.  An offset that the low-level
    parser reports as unknown (None) is returned as 0.

    Returns None when the string cannot be parsed.
    """
    fields = _parsedate_tz(data)
    if fields:
        # Callers of this public wrapper always get a numeric offset.
        if fields[9] is None:
            fields[9] = 0
        return tuple(fields)
    return None
56
def _parsedate_tz(data):
    """Convert date to extended time tuple.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.

    The last element is also None when the zone field is missing, or is
    neither a key of _timezones nor something int() accepts.

    Returns a 10-element list
    [year, month, day, hour, minute, second, 0, 1, -1, tzoffset], or None
    when the input is empty or cannot be parsed.  The numeric fields are not
    range-checked here.

    """
    if not data:
        return None
    data = data.split()
    if not data:  # This happens for whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        # The day name may be glued to the next token (e.g. "Mon,25"): keep
        # only what follows the last comma.
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        # Split a dash-separated first token into three separate tokens.
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # Only four tokens: the zone is either fused onto the last token
        # behind a '+' or '-' sign, or it is absent.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    # Anything after the fifth token is ignored.
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not (dd and mm and yy):
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps the order is "month day" rather than "day month": swap the
        # two tokens and try again.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Matched a full month name in the second half of the table.
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    # A ':' inside the year slot means the time and year tokens are
    # transposed.
    i = yy.find(':')
    if i > 0:
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    # A year slot that does not start with a digit is taken to hold the zone
    # instead, so the two are swapped.
    if not yy[0].isdigit():
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        # hh:mm -- seconds default to zero.
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = 0
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            return None
    else:
        return None
    # Any field that is not an integer literal makes the whole date invalid.
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        # Not a known name: try a numeric +HHMM / -HHMM zone.  On failure
        # tzoffset simply stays None.
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        # "-0000" parses to 0 but means "zone unknown"; report it as None.
        if tzoffset==0 and tz.startswith('-'):
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        # Work on the magnitude so that // and % split hours from minutes
        # correctly, then put the sign back.
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
180
181
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is the result of parsedate_tz() with the trailing zone offset
    dropped.  A non-tuple result (None for unparseable input) is passed
    through unchanged.
    """
    parsed = parsedate_tz(data)
    return parsed[:9] if isinstance(parsed, tuple) else parsed
189
190
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data' is a sequence of at least ten items laid out as
    (year, month, day, hour, minute, second, wday, yday, isdst, offset),
    where offset is the zone offset in seconds or None.  Both tuples and
    lists are accepted.

    If the offset is None the fields are interpreted as local time through
    time.mktime(); otherwise they are interpreted as UTC through
    calendar.timegm() and the offset is subtracted.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        # time.mktime() wants a real tuple; normalising here lets a list be
        # passed as well, just like in the calendar.timegm() branch.  The DST
        # flag is forced to -1 (unknown).
        return time.mktime(tuple(data[:8]) + (-1,))
    return calendar.timegm(data) - data[9]
199
200
def quote(str):
    """Escape a string for use inside a quoted string.

    Every backslash and every double quote is replaced by a quoted pair,
    i.e. it gets a backslash in front of it.  No other character is
    touched, and the enclosing double quotes are not added.
    """
    # Backslashes go first, so that the ones inserted for the double quotes
    # in the second step are not escaped again.
    backslashes_escaped = str.replace('\\', '\\\\')
    return backslashes_escaped.replace('"', '\\"')
209
210
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a cursor over `field': `pos' is the index of the next
    unread character and the get*() methods advance it as they consume
    input.  Comment texts met on the way are collected in `commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped blanks and tabs joined into one string; CR and LF
        are skipped too but are not part of the return value.  The text of
        every comment that is skipped is appended to self.commentlist.
        """
        wslist = []
        skippable = self.LWS + '\n\r'
        while self.pos < len(self.field):
            char = self.field[self.pos]
            if char in skippable:
                if char not in '\n\r':
                    wslist.append(char)
                self.pos += 1
            elif char == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return ''.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (name, address) pair.  Input that yields no address is represented
        by ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: one entry for a single
        mailbox, any number of entries for a group, and an empty list when
        nothing usable was found.  A ',' following the address is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Take a copy: getphraselist() appends to self.commentlist in place,
        # so a plain reference would not preserve the state we may have to
        # go back to below.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            # Rewind the cursor and the comment list together; otherwise
            # comments seen while scanning the phrase would be collected a
            # second time by getaddrspec().
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so that the caller's loop makes
                # progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a '<'.
        """
        if self.field[self.pos] != '<':
            return None

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Domain of a source route: parsed only to be discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec (the
                # closing '>' in well-formed input).
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the address as a string.  Without an '@' only the local part
        is returned; with an '@' but no valid domain the empty string is
        returned.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # White space around a dot is dropped.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # End of the local part; drop trailing white space.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the domain as a string, or the empty string if a second '@'
        is met.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is true, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the text between the delimiters, with the backslash of each
        quoted pair removed.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True right after a backslash: the next character is taken
        # literally, even if it is an end character.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments met between them are appended to
        self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
511
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, address) pairs are kept in `addresslist'.  The
    operators +, +=, - and -= combine two address lists as described on the
    individual methods; len(), indexing, slicing and `in' operate on
    `addresslist'.
    """

    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        # Membership is tested against self only, so entries repeated inside
        # `other' are each appended.
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        # Build the surviving entries first and then store them through a
        # slice assignment.  This drops every occurrence of a matching entry
        # (as __sub__ does), never mutates a list while it is being iterated
        # (so `a -= a' empties the list), and keeps the same list object.
        self.addresslist[:] = [x for x in self.addresslist
                               if x not in other.addresslist]
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]
558