"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the html.parser module.  It has no
documented public API and should not be used directly.

"""

import re

_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods below read ``self.rawdata`` (the text buffer being scanned)
    and call ``self.handle_comment`` and ``self.handle_decl``; none of those
    is defined here, so a subclass has to supply them.

    Scanning methods share one convention: they return the index just past
    the construct they consumed, or a negative value (-1) when the buffer
    ends before the construct is complete.  Syntax they cannot cope with is
    reported by raising AssertionError.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the ``<!...>`` declaration starting at rawdata[i].

        Returns the index just past the declaration, or -1 if the buffer
        does not yet hold a complete one.  Comments and marked sections are
        delegated to parse_comment() / parse_marked_section(); a DOCTYPE is
        reported through handle_decl(), any other named declaration through
        unknown_decl().  Raises AssertionError when rawdata[i:i+2] is not
        "<!" or when an unsupported character is met inside the declaration.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        # Explicit raise rather than ``assert`` so the check survives -O,
        # matching parse_comment().
        if rawdata[i:j] != "<!":
            raise AssertionError("unexpected call to parse_declaration")
        head = rawdata[j:j+2]
        if head[:1] == ">":
            # the empty comment <!>
            return j + 1
        if head == "--":  # comment
            # Locate --.*-- as the body of the comment.  This test has to
            # come before the single "-" test below, otherwise every
            # comment would be mistaken for a buffer boundary.
            return self.parse_comment(i)
        if head[:1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        # all other declaration elements
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                _, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in {"attlist", "linktype", "link", "element"}:
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    raise AssertionError(
                        "unsupported '[' char in %s declaration" % decltype)
                else:
                    raise AssertionError("unexpected '[' char in declaration")
            else:
                raise AssertionError(
                    "unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the ``<![...`` marked section starting at rawdata[i].

        Returns the index just past the closing delimiter, or -1 if the
        section is not terminated within the buffer.  When *report* is
        true, the text between "<![" and the closing delimiter is passed to
        unknown_decl().  Raises AssertionError for a wrong start or an
        unknown status keyword.
        """
        rawdata = self.rawdata
        # Explicit raise rather than ``assert`` so the check survives -O.
        if rawdata[i:i+3] != '<![':
            raise AssertionError("unexpected call to parse_marked_section()")
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in {"if", "else", "endif"}:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            raise AssertionError(
                'unknown status keyword %r in marked section' % rawdata[i+3:j]
            )
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the ``<!--`` comment starting at rawdata[i].

        Returns the index just past the comment close, or -1 if no close is
        found.  When *report* is true the comment body is passed to
        handle_comment().  Raises AssertionError if rawdata[i:i+4] is not
        "<!--".
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise AssertionError('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset whose content starts at i.

        *declstartpos* is the index of the enclosing declaration; it is only
        used for position bookkeeping before an error is raised.  Returns
        the index of the '>' that follows the closing ']', or -1 if the
        buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    raise AssertionError(
                        "unexpected char in internal subset (in %r)" % s
                    )
                if (j + 4) > n:
                    # end of buffer; incomplete (this also covers the case
                    # where "<!" is the very end of the buffer)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in {"attlist", "element", "entity", "notation"}:
                    self.updatepos(declstartpos, j + 2)
                    raise AssertionError(
                        "unknown declaration %r in internal subset" % name
                    )
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    raise AssertionError(
                        "unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                raise AssertionError(
                    "unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration; return index past '>' or -1."""
        _, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration; return index past '>' or -1."""
        rawdata = self.rawdata
        n = len(rawdata)
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            # buffer ended inside the element name
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            _, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    # buffer ended inside the type name
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # end of buffer
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration; return index past '>' or -1."""
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration; return index past '>' or -1."""
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            # parameter entity: skip the whitespace after the '%'
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        _, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new position,
    # or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token (plus trailing whitespace) at rawdata[i].

        Returns ``(lowercased_name, index_after_token)``.  Returns
        ``(None, -1)`` when i is the end of the buffer or the token runs up
        to the end of the buffer, since more input could still extend it.
        Raises AssertionError if no name starts at i.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            raise AssertionError(
                "expected name token at %r" % rawdata[declstartpos:declstartpos+20]
            )

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Hook called with the text of an unrecognised declaration; no-op."""
        pass