xref: /aosp_15_r20/prebuilts/build-tools/common/py3-stdlib/html/parser.py (revision cda5da8d549138a6648c5ee6d7a49cf8f4a657be)
1"""A parser for HTML and XHTML."""
2
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import re
12import _markupbase
13
14from html import unescape
15
16
17__all__ = ['HTMLParser']
18
19# Regular expressions used for parsing
20
# In normal (non-CDATA) mode the scanner stops at the next '&' or '<'.
interesting_normal = re.compile('[&<]')
# An '&' followed by a letter or '#': the beginning of something that may
# still turn into a reference once more input arrives.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference (group 1 is the name) plus the single character that ends
# it; that terminator is not necessarily ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal or hexadecimal numeric reference plus the single character that
# ends it; again the terminator need not be ';'.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by an ASCII letter, i.e. the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# End of a processing instruction.
piclose = re.compile('>')
# End of a comment; whitespace between '--' and '>' is tolerated.
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# Tag name (group 1) followed by any run of whitespace and slashes, except a
# slash that is directly followed by '>'.
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# One attribute: group 1 is the name, group 2 the whole '= value' part (absent
# for valueless attributes) and group 3 the value, still carrying its quotes.
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
# Matches a start tag from '<' up to, but not including, whatever closes it,
# so the caller can inspect the character that follows the match.
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
        \s*                          # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
# The '>' that terminates an end tag.
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# Well-formed end tag; group 1 is the tag name.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
59
60
61
62class HTMLParser(_markupbase.ParserBase):
63    """Find tags and other markup and call handler functions.
64
65    Usage:
66        p = HTMLParser()
67        p.feed(data)
68        ...
69        p.close()
70
71    Start tags are handled by calling self.handle_starttag() or
72    self.handle_startendtag(); end tags by self.handle_endtag().  The
73    data between tags is passed from the parser to the derived class
74    by calling self.handle_data() with the data as argument (the data
75    may be split up in arbitrary chunks).  If convert_charrefs is
76    True the character references are converted automatically to the
77    corresponding Unicode character (and self.handle_data() is no
78    longer split in chunks), otherwise they are passed by calling
79    self.handle_entityref() or self.handle_charref() with the string
80    containing respectively the named or numeric reference as the
81    argument.
82    """
83
84    CDATA_CONTENT_ELEMENTS = ("script", "style")
85
86    def __init__(self, *, convert_charrefs=True):
87        """Initialize and reset this instance.
88
89        If convert_charrefs is True (the default), all character references
90        are automatically converted to the corresponding Unicode characters.
91        """
92        self.convert_charrefs = convert_charrefs
93        self.reset()
94
95    def reset(self):
96        """Reset this instance.  Loses all unprocessed data."""
97        self.rawdata = ''
98        self.lasttag = '???'
99        self.interesting = interesting_normal
100        self.cdata_elem = None
101        _markupbase.ParserBase.reset(self)
102
103    def feed(self, data):
104        r"""Feed data to the parser.
105
106        Call this as often as you want, with as little or as much text
107        as you want (may include '\n').
108        """
109        self.rawdata = self.rawdata + data
110        self.goahead(0)
111
112    def close(self):
113        """Handle any buffered data."""
114        self.goahead(1)
115
116    __starttag_text = None
117
118    def get_starttag_text(self):
119        """Return full source of start tag: '<...>'."""
120        return self.__starttag_text
121
122    def set_cdata_mode(self, elem):
123        self.cdata_elem = elem.lower()
124        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
125
126    def clear_cdata_mode(self):
127        self.interesting = interesting_normal
128        self.cdata_elem = None
129
130    # Internal -- handle data as far as reasonable.  May leave state
131    # and data to be processed by a subsequent call.  If 'end' is
132    # true, force handling all data as if followed by EOF marker.
133    def goahead(self, end):
134        rawdata = self.rawdata
135        i = 0
136        n = len(rawdata)
137        while i < n:
138            if self.convert_charrefs and not self.cdata_elem:
139                j = rawdata.find('<', i)
140                if j < 0:
141                    # if we can't find the next <, either we are at the end
142                    # or there's more text incoming.  If the latter is True,
143                    # we can't pass the text to handle_data in case we have
144                    # a charref cut in half at end.  Try to determine if
145                    # this is the case before proceeding by looking for an
146                    # & near the end and see if it's followed by a space or ;.
147                    amppos = rawdata.rfind('&', max(i, n-34))
148                    if (amppos >= 0 and
149                        not re.compile(r'[\s;]').search(rawdata, amppos)):
150                        break  # wait till we get all the text
151                    j = n
152            else:
153                match = self.interesting.search(rawdata, i)  # < or &
154                if match:
155                    j = match.start()
156                else:
157                    if self.cdata_elem:
158                        break
159                    j = n
160            if i < j:
161                if self.convert_charrefs and not self.cdata_elem:
162                    self.handle_data(unescape(rawdata[i:j]))
163                else:
164                    self.handle_data(rawdata[i:j])
165            i = self.updatepos(i, j)
166            if i == n: break
167            startswith = rawdata.startswith
168            if startswith('<', i):
169                if starttagopen.match(rawdata, i): # < + letter
170                    k = self.parse_starttag(i)
171                elif startswith("</", i):
172                    k = self.parse_endtag(i)
173                elif startswith("<!--", i):
174                    k = self.parse_comment(i)
175                elif startswith("<?", i):
176                    k = self.parse_pi(i)
177                elif startswith("<!", i):
178                    k = self.parse_html_declaration(i)
179                elif (i + 1) < n:
180                    self.handle_data("<")
181                    k = i + 1
182                else:
183                    break
184                if k < 0:
185                    if not end:
186                        break
187                    k = rawdata.find('>', i + 1)
188                    if k < 0:
189                        k = rawdata.find('<', i + 1)
190                        if k < 0:
191                            k = i + 1
192                    else:
193                        k += 1
194                    if self.convert_charrefs and not self.cdata_elem:
195                        self.handle_data(unescape(rawdata[i:k]))
196                    else:
197                        self.handle_data(rawdata[i:k])
198                i = self.updatepos(i, k)
199            elif startswith("&#", i):
200                match = charref.match(rawdata, i)
201                if match:
202                    name = match.group()[2:-1]
203                    self.handle_charref(name)
204                    k = match.end()
205                    if not startswith(';', k-1):
206                        k = k - 1
207                    i = self.updatepos(i, k)
208                    continue
209                else:
210                    if ";" in rawdata[i:]:  # bail by consuming &#
211                        self.handle_data(rawdata[i:i+2])
212                        i = self.updatepos(i, i+2)
213                    break
214            elif startswith('&', i):
215                match = entityref.match(rawdata, i)
216                if match:
217                    name = match.group(1)
218                    self.handle_entityref(name)
219                    k = match.end()
220                    if not startswith(';', k-1):
221                        k = k - 1
222                    i = self.updatepos(i, k)
223                    continue
224                match = incomplete.match(rawdata, i)
225                if match:
226                    # match.group() will contain at least 2 chars
227                    if end and match.group() == rawdata[i:]:
228                        k = match.end()
229                        if k <= i:
230                            k = n
231                        i = self.updatepos(i, i + 1)
232                    # incomplete
233                    break
234                elif (i + 1) < n:
235                    # not the end of the buffer, and can't be confused
236                    # with some other construct
237                    self.handle_data("&")
238                    i = self.updatepos(i, i + 1)
239                else:
240                    break
241            else:
242                assert 0, "interesting.search() lied"
243        # end while
244        if end and i < n and not self.cdata_elem:
245            if self.convert_charrefs and not self.cdata_elem:
246                self.handle_data(unescape(rawdata[i:n]))
247            else:
248                self.handle_data(rawdata[i:n])
249            i = self.updatepos(i, n)
250        self.rawdata = rawdata[i:]
251
252    # Internal -- parse html declarations, return length or -1 if not terminated
253    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
254    # See also parse_declaration in _markupbase
255    def parse_html_declaration(self, i):
256        rawdata = self.rawdata
257        assert rawdata[i:i+2] == '<!', ('unexpected call to '
258                                        'parse_html_declaration()')
259        if rawdata[i:i+4] == '<!--':
260            # this case is actually already handled in goahead()
261            return self.parse_comment(i)
262        elif rawdata[i:i+3] == '<![':
263            return self.parse_marked_section(i)
264        elif rawdata[i:i+9].lower() == '<!doctype':
265            # find the closing >
266            gtpos = rawdata.find('>', i+9)
267            if gtpos == -1:
268                return -1
269            self.handle_decl(rawdata[i+2:gtpos])
270            return gtpos+1
271        else:
272            return self.parse_bogus_comment(i)
273
274    # Internal -- parse bogus comment, return length or -1 if not terminated
275    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
276    def parse_bogus_comment(self, i, report=1):
277        rawdata = self.rawdata
278        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
279                                                'parse_comment()')
280        pos = rawdata.find('>', i+2)
281        if pos == -1:
282            return -1
283        if report:
284            self.handle_comment(rawdata[i+2:pos])
285        return pos + 1
286
287    # Internal -- parse processing instr, return end or -1 if not terminated
288    def parse_pi(self, i):
289        rawdata = self.rawdata
290        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
291        match = piclose.search(rawdata, i+2) # >
292        if not match:
293            return -1
294        j = match.start()
295        self.handle_pi(rawdata[i+2: j])
296        j = match.end()
297        return j
298
299    # Internal -- handle starttag, return end or -1 if not terminated
300    def parse_starttag(self, i):
301        self.__starttag_text = None
302        endpos = self.check_for_whole_start_tag(i)
303        if endpos < 0:
304            return endpos
305        rawdata = self.rawdata
306        self.__starttag_text = rawdata[i:endpos]
307
308        # Now parse the data between i+1 and j into a tag and attrs
309        attrs = []
310        match = tagfind_tolerant.match(rawdata, i+1)
311        assert match, 'unexpected call to parse_starttag()'
312        k = match.end()
313        self.lasttag = tag = match.group(1).lower()
314        while k < endpos:
315            m = attrfind_tolerant.match(rawdata, k)
316            if not m:
317                break
318            attrname, rest, attrvalue = m.group(1, 2, 3)
319            if not rest:
320                attrvalue = None
321            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
322                 attrvalue[:1] == '"' == attrvalue[-1:]:
323                attrvalue = attrvalue[1:-1]
324            if attrvalue:
325                attrvalue = unescape(attrvalue)
326            attrs.append((attrname.lower(), attrvalue))
327            k = m.end()
328
329        end = rawdata[k:endpos].strip()
330        if end not in (">", "/>"):
331            self.handle_data(rawdata[i:endpos])
332            return endpos
333        if end.endswith('/>'):
334            # XHTML-style empty tag: <span attr="value" />
335            self.handle_startendtag(tag, attrs)
336        else:
337            self.handle_starttag(tag, attrs)
338            if tag in self.CDATA_CONTENT_ELEMENTS:
339                self.set_cdata_mode(tag)
340        return endpos
341
342    # Internal -- check to see if we have a complete starttag; return end
343    # or -1 if incomplete.
344    def check_for_whole_start_tag(self, i):
345        rawdata = self.rawdata
346        m = locatestarttagend_tolerant.match(rawdata, i)
347        if m:
348            j = m.end()
349            next = rawdata[j:j+1]
350            if next == ">":
351                return j + 1
352            if next == "/":
353                if rawdata.startswith("/>", j):
354                    return j + 2
355                if rawdata.startswith("/", j):
356                    # buffer boundary
357                    return -1
358                # else bogus input
359                if j > i:
360                    return j
361                else:
362                    return i + 1
363            if next == "":
364                # end of input
365                return -1
366            if next in ("abcdefghijklmnopqrstuvwxyz=/"
367                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
368                # end of input in or before attribute value, or we have the
369                # '/' from a '/>' ending
370                return -1
371            if j > i:
372                return j
373            else:
374                return i + 1
375        raise AssertionError("we should not get here!")
376
377    # Internal -- parse endtag, return end or -1 if incomplete
378    def parse_endtag(self, i):
379        rawdata = self.rawdata
380        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
381        match = endendtag.search(rawdata, i+1) # >
382        if not match:
383            return -1
384        gtpos = match.end()
385        match = endtagfind.match(rawdata, i) # </ + tag + >
386        if not match:
387            if self.cdata_elem is not None:
388                self.handle_data(rawdata[i:gtpos])
389                return gtpos
390            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
391            namematch = tagfind_tolerant.match(rawdata, i+2)
392            if not namematch:
393                # w3.org/TR/html5/tokenization.html#end-tag-open-state
394                if rawdata[i:i+3] == '</>':
395                    return i+3
396                else:
397                    return self.parse_bogus_comment(i)
398            tagname = namematch.group(1).lower()
399            # consume and ignore other stuff between the name and the >
400            # Note: this is not 100% correct, since we might have things like
401            # </tag attr=">">, but looking for > after the name should cover
402            # most of the cases and is much simpler
403            gtpos = rawdata.find('>', namematch.end())
404            self.handle_endtag(tagname)
405            return gtpos+1
406
407        elem = match.group(1).lower() # script or style
408        if self.cdata_elem is not None:
409            if elem != self.cdata_elem:
410                self.handle_data(rawdata[i:gtpos])
411                return gtpos
412
413        self.handle_endtag(elem)
414        self.clear_cdata_mode()
415        return gtpos
416
417    # Overridable -- finish processing of start+end tag: <tag.../>
418    def handle_startendtag(self, tag, attrs):
419        self.handle_starttag(tag, attrs)
420        self.handle_endtag(tag)
421
422    # Overridable -- handle start tag
423    def handle_starttag(self, tag, attrs):
424        pass
425
426    # Overridable -- handle end tag
427    def handle_endtag(self, tag):
428        pass
429
430    # Overridable -- handle character reference
431    def handle_charref(self, name):
432        pass
433
434    # Overridable -- handle entity reference
435    def handle_entityref(self, name):
436        pass
437
438    # Overridable -- handle data
439    def handle_data(self, data):
440        pass
441
442    # Overridable -- handle comment
443    def handle_comment(self, data):
444        pass
445
446    # Overridable -- handle declaration
447    def handle_decl(self, decl):
448        pass
449
450    # Overridable -- handle processing instruction
451    def handle_pi(self, data):
452        pass
453
454    def unknown_decl(self, data):
455        pass
456