"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import re
import _markupbase

from html import unescape


__all__ = ['HTMLParser']

# Regular expressions used for parsing

# Characters that interrupt a plain-data run: start of a reference or a tag.
interesting_normal = re.compile('[&<]')
# A '&' followed by a letter or '#' might be the start of a (still
# incomplete) entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference: '&name' terminated by the first non-name character
# (the terminator is consumed by the match but handled separately).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric reference, decimal or hex, terminated like entityref above.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#    explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         \s*                         # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')



class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is treated as CDATA: only the matching end
    # tag is special, everything else is raw data (see set_cdata_mode).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''          # buffered, not-yet-processed input
        self.lasttag = '???'       # most recent start tag name seen
        self.interesting = interesting_normal
        self.cdata_elem = None     # tag name while inside <script>/<style>
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        # end=1 forces processing of whatever remains in the buffer.
        self.goahead(1)

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        # Enter CDATA mode: until the matching end tag, the only
        # "interesting" pattern is that end tag itself.
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        # Leave CDATA mode and resume normal '&'/'<' scanning.
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        break
                    j = n
            if i < j:
                # Emit the plain text between position i and the next
                # interesting character (unescaped when converting refs).
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                # Dispatch on what follows the '<'.  Each parse_* helper
                # returns the index just past the construct, or -1 if the
                # construct is incomplete (wait for more data).
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    if not end:
                        # Incomplete construct and more data may come.
                        break
                    # EOF: flush the unterminated construct as data, up to
                    # the next '>' or '<' if any.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                # Numeric character reference (only reached when
                # convert_charrefs is false).
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # Terminator was not ';': leave it in the stream.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                # Named entity reference (only reached when
                # convert_charrefs is false).
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        k = match.end()
                        if k <= i:
                            k = n
                        i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            # EOF forces out whatever trailing text is left.
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            # marked section, e.g. <![CDATA[...]]> (handled in _markupbase)
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            # Incomplete tag -- wait for more data.
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without a value, e.g. <input disabled>.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip matching surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Malformed tail: report the whole tag text as data instead.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        # Return the index just past a complete start tag beginning at i,
        # -1 if the tag may be incomplete (buffer boundary), or a smaller
        # positive index marking how much to emit as plain data for
        # clearly bogus input.
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                if j > i:
                    return j
                else:
                    return i + 1
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            # No closing '>' yet -- wait for more data.
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # Inside <script>/<style>: anything that isn't the matching
                # end tag is literal data.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # End tag for a different element inside CDATA content:
                # treat it as literal data.
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Overridable -- handle unknown declaration (called from _markupbase)
    def unknown_decl(self, data):
        pass