# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data by chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?

"""

# The accented character is written as an escape so the source stays pure
# ASCII and does not depend on how the file itself is encoded.
__author__ = "St\xe9phane Bidoul <[email protected]>"
__version__ = "0.3"

import sys
import codecs

if sys.version_info[0] < 3:
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode a UTF-8 byte string to unicode; None is passed through."""
        if s is None:
            return s
        else:
            return _decoder(s)[0]
else:
    StringTypes = str

    # s is Unicode `str` already
    def _d(s):
        """Return s unchanged."""
        return s

from xml.sax._exceptions import *
from xml.sax import xmlreader, saxutils
from xml.sax.handler import \
     feature_namespaces, \
     feature_namespace_prefixes, \
     feature_string_interning, \
     feature_validation, \
     feature_external_ges, \
     feature_external_pes, \
     property_lexical_handler, \
     property_declaration_handler, \
     property_dom_node, \
     property_xml_string

try:
    import libxml2
except ImportError:
    raise SAXReaderNotAvailable("libxml2 not available: " \
                                "import error was: %s" % sys.exc_info()[1])


class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator"""

    def __init__(self, locator):
        # wrapped locator object; queried lazily by the getters below
        self.__locator = locator

    def getColumnNumber(self):
        "Return the column number; always -1 (not read from the locator)."
        return -1

    def getLineNumber(self):
        "Return the line number reported by the wrapped locator."
        return self.__locator.LineNumber()

    def getPublicId(self):
        "Return the public identifier; always None."
        return None

    def getSystemId(self):
        "Return the base URI reported by the wrapped locator."
        return self.__locator.BaseURI()


class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader implemented on top of a libxml2 text reader.

    parse() pulls nodes from the libxml2 reader one at a time and
    translates each node type into the corresponding ContentHandler /
    lexical handler callback.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Error callback registered on the libxml2 reader by parse().

        Errors are not dispatched immediately; each one is stored as a
        (severity, SAXParseException) pair and reported later by
        _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward the accumulated errors to the error handler, then clear.

        fatal -- true when the parse is about to stop; the last
                 accumulated exception is then reported through
                 fatalError() instead of error() (unless its severity
                 is one of the warning severities).
        """
        for severity, exception in self.__errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            else:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                if fatal and exception is self.__errors[-1][1]:
                    self._err_handler.fatalError(exception)
                else:
                    self._err_handler.error(exception)
        self.__errors = None

    def parse(self, source):
        """Parse source and fire SAX events on the registered handlers.

        source -- either a string, which is handed to
                  libxml2.newTextReaderFilename(), or any object accepted
                  by saxutils.prepare_input_source().

        Raises SAXException when the reader returns a node type this
        driver does not know about. The libxml2 reader is closed and the
        parsing flag is reset even when a handler raises.
        """
        self.__parsing = 1
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                stream = source.getCharacterStream()
                if stream is None:
                    stream = source.getByteStream()
                # named input_buffer (not input) to avoid shadowing the builtin
                input_buffer = libxml2.inputBuffer(stream)
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while True:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break  # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break  # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # Only "xmlns" and "xmlns:<prefix>" declare a
                            # namespace; an ordinary attribute whose name
                            # merely begins with "xmlns" must be reported
                            # as a regular attribute.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue  # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no separate EndElement node will follow, so
                            # close the element and its mappings right away
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    if self.__lex_handler is not None:
                        # entity events belong to the lexical handler;
                        # the reader itself has no startEntity method
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says the driver is
                    # still "waiting for XmlReader.ResolveEntity" -- confirm
                    # the installed binding provides this method.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass  # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass  # TODO
                # Entity
                elif nodeType == 6:
                    pass  # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass  # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            # endDocument is only fired when the reader reached the end
            # of the input, not after a fatal read error
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            # close the reader on every exit path, including when a
            # handler callback raised, and always reset the parsing flag
            try:
                if reader is not None:
                    reader.Close()
            finally:
                self.__parsing = 0

    def setDTDHandler(self, handler):
        """Always raises SAXNotSupportedException (DTDHandler unsupported)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Always raises SAXNotSupportedException (resolver unsupported)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the SAX feature called name.

        Raises SAXNotRecognizedException for any feature other than
        namespaces, namespace_prefixes, validation, external_ges and
        external_pes.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set the SAX feature called name to state.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to turn external_ges off, and
        SAXNotRecognizedException for unknown feature names.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the lexical or declaration handler property.

        Raises SAXNotRecognizedException for any other property name.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set the lexical handler property.

        Raises SAXNotSupportedException for the declaration handler
        property and SAXNotRecognizedException for any other name.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: if/when libxml2 supports dtd events, replace this raise
            # with: self.__decl_handler = value
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)


def create_parser():
    """Return a new LibXml2Reader instance."""
    return LibXml2Reader()