xref: /aosp_15_r20/external/libxml2/python/drv_libxml2.py (revision 7c5688314b92172186c154356a6374bf7684c3ca)
1# -*- coding: iso-8859-1 -*-
2""" A SAX2 driver for libxml2, on top of its XmlReader API
3
4USAGE
5    # put this file (drv_libxml2.py) in PYTHONPATH
6    import xml.sax
7    reader = xml.sax.make_parser(["drv_libxml2"])
8    # ...and the rest is standard python sax.
9
10CAVEATS
11    - Lexical handlers are supported, except for start/endEntity
12      (waiting for XmlReader.ResolveEntity) and start/endDTD
13    - Error callbacks are not exactly synchronous, they tend
14      to be invoked before the corresponding content callback,
15      because the underlying reader interface parses
16      data by chunks of 512 bytes
17
18TODO
19    - search for TODO
20    - some ErrorHandler events (warning)
21    - some ContentHandler events (setDocumentLocator, skippedEntity)
22    - EntityResolver (using libxml2.?)
23    - DTDHandler (if/when libxml2 exposes such node types)
24    - DeclHandler (if/when libxml2 exposes such node types)
25    - property_xml_string?
26    - feature_string_interning?
27    - Incremental parser
28    - additional performance tuning:
29      - one might cache callbacks to avoid some name lookups
30      - one might implement a smarter way to pass attributes to startElement
31        (some kind of lazy evaluation?)
32      - there might be room for improvement in start/endPrefixMapping
33      - other?
34
35"""
36
37__author__  = "St�phane Bidoul <[email protected]>"
38__version__ = "0.3"
39
40import sys
41import codecs
42
if sys.version_info[0] >= 3:
    # Python 3: a single text type, and values are already Unicode `str`.
    StringTypes = str

    def _d(s):
        """Return *s* unchanged (it is already a Unicode `str`)."""
        return s
else:
    # Python 2: turn the escaped author string into a unicode object.
    __author__  = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode the UTF-8 byte string *s*; None is passed through."""
        if s is None:
            return s
        return _decoder(s)[0]
59
60from xml.sax._exceptions import *
61from xml.sax import xmlreader, saxutils
62from xml.sax.handler import \
63     feature_namespaces, \
64     feature_namespace_prefixes, \
65     feature_string_interning, \
66     feature_validation, \
67     feature_external_ges, \
68     feature_external_pes, \
69     property_lexical_handler, \
70     property_declaration_handler, \
71     property_dom_node, \
72     property_xml_string
73
# libxml2 is required by this driver: a failed import is re-raised as the
# SAX-level SAXReaderNotAvailable error, carrying the original import
# error message.
try:
    import libxml2
except ImportError:
    raise SAXReaderNotAvailable("libxml2 not available: " \
                                "import error was: %s" % sys.exc_info()[1])
79
class Locator(xmlreader.Locator):
    """Present a libxml2 text-reader locator through the SAX Locator API.

    Line number and system identifier are read from the wrapped object;
    column number and public identifier are reported as unavailable.
    """

    def __init__(self, locator):
        # The wrapped object must provide LineNumber() and BaseURI().
        self.__locator = locator

    def getSystemId(self):
        """Return the system identifier for the current event."""
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        """Return the public identifier for the current event (always None)."""
        return None

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        """Return the column number of the current event (always -1)."""
        return -1
101
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader implemented on top of the libxml2 XmlReader API.

    parse() pulls nodes one at a time with ``Read()`` and translates each
    node type into the matching ContentHandler / lexical handler callback.
    Messages received through the libxml2 error callback are queued and
    dispatched to the ErrorHandler after each ``Read()`` call.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator (None when nothing is pending)
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """libxml2 error callback: queue the message for later dispatch.

        The message is wrapped in a SAXParseException immediately, using a
        Locator built from the libxml2 locator passed to the callback.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Dispatch all queued errors to the ErrorHandler.

        Warnings go to warning(); everything else goes to error(), except
        that when *fatal* is true the last queued entry is sent to
        fatalError() (the parse is about to stop, so the last error
        reported is considered the fatal one).
        """
        # Detach the queue before dispatching: an ErrorHandler is allowed to
        # raise, and pending entries must not survive into the next parse.
        errors, self.__errors = self.__errors, None
        if not errors:
            return
        last = errors[-1][1]
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and exception is last:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def _configureReader(self, reader):
        """Apply the current feature settings to a newly created reader."""
        if self.__extparams:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
        else:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)

    def parse(self, source):
        """Parse *source* and fire SAX events on the registered handlers.

        *source* is either a string, which is handed to
        ``libxml2.newTextReaderFilename``, or anything accepted by
        ``saxutils.prepare_input_source``.

        endDocument() is only called when the document was read to its end.
        The reader is closed and the parsing flag cleared even when a
        handler raises.

        Raises SAXException when an unexpected node type is encountered;
        exceptions raised by handlers propagate unchanged.
        """
        self.__parsing = 1
        # drop anything left over from a previous, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                stream = source.getCharacterStream()
                if stream is None:
                    stream = source.getByteStream()
                inputBuf = libxml2.inputBuffer(stream)
                reader = inputBuf.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            self._configureReader(reader)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # stack with one list of declared prefixes per open element
            # (popped on EndElement to fire endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while True:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # Only "xmlns" and "xmlns:<prefix>" declare a
                            # namespace; a name that merely starts with
                            # "xmlns" is an ordinary attribute.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node will follow: close it now
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node will follow: close it now
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                             (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                             _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity boundaries are lexical events: they go to the
                    # lexical handler, not to the reader itself
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says entity events
                    # are still waiting for XmlReader.ResolveEntity; confirm
                    # that the reader object actually provides this method.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    pass # TODO startDTD (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Not handled on purpose: Attribute (2, never in this loop),
                # Document (9, not exposed), DocumentFragment (11, never
                # returned by XmlReader) and None (0).
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            try:
                # release the reader even when a handler raised
                if reader is not None:
                    reader.Close()
            finally:
                self.__parsing = 0

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of feature *name*.

        Raises SAXNotRecognizedException for unknown feature names.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to turn external general entities off, and
        SAXNotRecognizedException for unknown feature names.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotRecognizedException for unknown property names.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        Only the lexical handler can be set. The declaration handler is
        recognized but raises SAXNotSupportedException; any other name
        raises SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)
379
def create_parser():
    """Return a new LibXml2Reader instance."""
    parser = LibXml2Reader()
    return parser
382
383