1"""Lightweight XML support for Python. 2 3 XML is an inherently hierarchical data format, and the most natural way to 4 represent it is with a tree. This module has two classes for this purpose: 5 6 1. ElementTree represents the whole XML document as a tree and 7 8 2. Element represents a single node in this tree. 9 10 Interactions with the whole document (reading and writing to/from files) are 11 usually done on the ElementTree level. Interactions with a single XML element 12 and its sub-elements are done on the Element level. 13 14 Element is a flexible container object designed to store hierarchical data 15 structures in memory. It can be described as a cross between a list and a 16 dictionary. Each Element has a number of properties associated with it: 17 18 'tag' - a string containing the element's name. 19 20 'attributes' - a Python dictionary storing the element's attributes. 21 22 'text' - a string containing the element's text content. 23 24 'tail' - an optional string containing text after the element's end tag. 25 26 And a number of child elements stored in a Python sequence. 27 28 To create an element instance, use the Element constructor, 29 or the SubElement factory function. 30 31 You can also use the ElementTree class to wrap an element structure 32 and convert it to and from XML. 33 34""" 35 36#--------------------------------------------------------------------- 37# Licensed to PSF under a Contributor Agreement. 38# See https://www.python.org/psf/license for licensing details. 39# 40# ElementTree 41# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved. 
#
# fredrik@pythonware.com
# http://www.pythonware.com
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2008 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
71# -------------------------------------------------------------------- 72 73__all__ = [ 74 # public symbols 75 "Comment", 76 "dump", 77 "Element", "ElementTree", 78 "fromstring", "fromstringlist", 79 "indent", "iselement", "iterparse", 80 "parse", "ParseError", 81 "PI", "ProcessingInstruction", 82 "QName", 83 "SubElement", 84 "tostring", "tostringlist", 85 "TreeBuilder", 86 "VERSION", 87 "XML", "XMLID", 88 "XMLParser", "XMLPullParser", 89 "register_namespace", 90 "canonicalize", "C14NWriterTarget", 91 ] 92 93VERSION = "1.3.0" 94 95import sys 96import re 97import warnings 98import io 99import collections 100import collections.abc 101import contextlib 102 103from . import ElementPath 104 105 106class ParseError(SyntaxError): 107 """An error when parsing an XML document. 108 109 In addition to its exception value, a ParseError contains 110 two extra attributes: 111 'code' - the specific exception code 112 'position' - the line and column of the error 113 114 """ 115 pass 116 117# -------------------------------------------------------------------- 118 119 120def iselement(element): 121 """Return True if *element* appears to be an Element.""" 122 return hasattr(element, 'tag') 123 124 125class Element: 126 """An XML element. 127 128 This class is the reference implementation of the Element interface. 129 130 An element's length is its number of subelements. That means if you 131 want to check if an element is truly empty, you should check BOTH 132 its length AND its text attribute. 133 134 The element tag, attribute names, and attribute values can be either 135 bytes or strings. 136 137 *tag* is the element name. *attrib* is an optional dictionary containing 138 element attributes. *extra* are additional element attributes given as 139 keyword arguments. 
140 141 Example form: 142 <tag attrib>text<child/>...</tag>tail 143 144 """ 145 146 tag = None 147 """The element's name.""" 148 149 attrib = None 150 """Dictionary of the element's attributes.""" 151 152 text = None 153 """ 154 Text before first subelement. This is either a string or the value None. 155 Note that if there is no text, this attribute may be either 156 None or the empty string, depending on the parser. 157 158 """ 159 160 tail = None 161 """ 162 Text after this element's end tag, but before the next sibling element's 163 start tag. This is either a string or the value None. Note that if there 164 was no text, this attribute may be either None or an empty string, 165 depending on the parser. 166 167 """ 168 169 def __init__(self, tag, attrib={}, **extra): 170 if not isinstance(attrib, dict): 171 raise TypeError("attrib must be dict, not %s" % ( 172 attrib.__class__.__name__,)) 173 self.tag = tag 174 self.attrib = {**attrib, **extra} 175 self._children = [] 176 177 def __repr__(self): 178 return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self)) 179 180 def makeelement(self, tag, attrib): 181 """Create a new element with the same type. 182 183 *tag* is a string containing the element name. 184 *attrib* is a dictionary containing the element attributes. 185 186 Do not call this method, use the SubElement factory function instead. 187 188 """ 189 return self.__class__(tag, attrib) 190 191 def copy(self): 192 """Return copy of current element. 193 194 This creates a shallow copy. Subelements will be shared with the 195 original tree. 196 197 """ 198 warnings.warn( 199 "elem.copy() is deprecated. 
Use copy.copy(elem) instead.", 200 DeprecationWarning 201 ) 202 return self.__copy__() 203 204 def __copy__(self): 205 elem = self.makeelement(self.tag, self.attrib) 206 elem.text = self.text 207 elem.tail = self.tail 208 elem[:] = self 209 return elem 210 211 def __len__(self): 212 return len(self._children) 213 214 def __bool__(self): 215 warnings.warn( 216 "The behavior of this method will change in future versions. " 217 "Use specific 'len(elem)' or 'elem is not None' test instead.", 218 FutureWarning, stacklevel=2 219 ) 220 return len(self._children) != 0 # emulate old behaviour, for now 221 222 def __getitem__(self, index): 223 return self._children[index] 224 225 def __setitem__(self, index, element): 226 if isinstance(index, slice): 227 for elt in element: 228 self._assert_is_element(elt) 229 else: 230 self._assert_is_element(element) 231 self._children[index] = element 232 233 def __delitem__(self, index): 234 del self._children[index] 235 236 def append(self, subelement): 237 """Add *subelement* to the end of this element. 238 239 The new element will appear in document order after the last existing 240 subelement (or directly after the text, if it's the first subelement), 241 but before the end tag for this element. 242 243 """ 244 self._assert_is_element(subelement) 245 self._children.append(subelement) 246 247 def extend(self, elements): 248 """Append subelements from a sequence. 249 250 *elements* is a sequence with zero or more elements. 251 252 """ 253 for element in elements: 254 self._assert_is_element(element) 255 self._children.append(element) 256 257 def insert(self, index, subelement): 258 """Insert *subelement* at position *index*.""" 259 self._assert_is_element(subelement) 260 self._children.insert(index, subelement) 261 262 def _assert_is_element(self, e): 263 # Need to refer to the actual Python implementation, not the 264 # shadowing C implementation. 
265 if not isinstance(e, _Element_Py): 266 raise TypeError('expected an Element, not %s' % type(e).__name__) 267 268 def remove(self, subelement): 269 """Remove matching subelement. 270 271 Unlike the find methods, this method compares elements based on 272 identity, NOT ON tag value or contents. To remove subelements by 273 other means, the easiest way is to use a list comprehension to 274 select what elements to keep, and then use slice assignment to update 275 the parent element. 276 277 ValueError is raised if a matching element could not be found. 278 279 """ 280 # assert iselement(element) 281 self._children.remove(subelement) 282 283 def find(self, path, namespaces=None): 284 """Find first matching element by tag name or path. 285 286 *path* is a string having either an element tag or an XPath, 287 *namespaces* is an optional mapping from namespace prefix to full name. 288 289 Return the first matching element, or None if no element was found. 290 291 """ 292 return ElementPath.find(self, path, namespaces) 293 294 def findtext(self, path, default=None, namespaces=None): 295 """Find text for first matching element by tag name or path. 296 297 *path* is a string having either an element tag or an XPath, 298 *default* is the value to return if the element was not found, 299 *namespaces* is an optional mapping from namespace prefix to full name. 300 301 Return text content of first matching element, or default value if 302 none was found. Note that if an element is found having no text 303 content, the empty string is returned. 304 305 """ 306 return ElementPath.findtext(self, path, default, namespaces) 307 308 def findall(self, path, namespaces=None): 309 """Find all matching subelements by tag name or path. 310 311 *path* is a string having either an element tag or an XPath, 312 *namespaces* is an optional mapping from namespace prefix to full name. 313 314 Returns list containing all matching elements in document order. 
315 316 """ 317 return ElementPath.findall(self, path, namespaces) 318 319 def iterfind(self, path, namespaces=None): 320 """Find all matching subelements by tag name or path. 321 322 *path* is a string having either an element tag or an XPath, 323 *namespaces* is an optional mapping from namespace prefix to full name. 324 325 Return an iterable yielding all matching elements in document order. 326 327 """ 328 return ElementPath.iterfind(self, path, namespaces) 329 330 def clear(self): 331 """Reset element. 332 333 This function removes all subelements, clears all attributes, and sets 334 the text and tail attributes to None. 335 336 """ 337 self.attrib.clear() 338 self._children = [] 339 self.text = self.tail = None 340 341 def get(self, key, default=None): 342 """Get element attribute. 343 344 Equivalent to attrib.get, but some implementations may handle this a 345 bit more efficiently. *key* is what attribute to look for, and 346 *default* is what to return if the attribute was not found. 347 348 Returns a string containing the attribute value, or the default if 349 attribute was not found. 350 351 """ 352 return self.attrib.get(key, default) 353 354 def set(self, key, value): 355 """Set element attribute. 356 357 Equivalent to attrib[key] = value, but some implementations may handle 358 this a bit more efficiently. *key* is what attribute to set, and 359 *value* is the attribute value to set it to. 360 361 """ 362 self.attrib[key] = value 363 364 def keys(self): 365 """Get list of attribute names. 366 367 Names are returned in an arbitrary order, just like an ordinary 368 Python dict. Equivalent to attrib.keys() 369 370 """ 371 return self.attrib.keys() 372 373 def items(self): 374 """Get element attributes as a sequence. 375 376 The attributes are returned in arbitrary order. Equivalent to 377 attrib.items(). 378 379 Return a list of (name, value) tuples. 380 381 """ 382 return self.attrib.items() 383 384 def iter(self, tag=None): 385 """Create tree iterator. 
386 387 The iterator loops over the element and all subelements in document 388 order, returning all elements with a matching tag. 389 390 If the tree structure is modified during iteration, new or removed 391 elements may or may not be included. To get a stable set, use the 392 list() function on the iterator, and loop over the resulting list. 393 394 *tag* is what tags to look for (default is to return all elements) 395 396 Return an iterator containing all the matching elements. 397 398 """ 399 if tag == "*": 400 tag = None 401 if tag is None or self.tag == tag: 402 yield self 403 for e in self._children: 404 yield from e.iter(tag) 405 406 def itertext(self): 407 """Create text iterator. 408 409 The iterator loops over the element and all subelements in document 410 order, returning all inner text. 411 412 """ 413 tag = self.tag 414 if not isinstance(tag, str) and tag is not None: 415 return 416 t = self.text 417 if t: 418 yield t 419 for e in self: 420 yield from e.itertext() 421 t = e.tail 422 if t: 423 yield t 424 425 426def SubElement(parent, tag, attrib={}, **extra): 427 """Subelement factory which creates an element instance, and appends it 428 to an existing parent. 429 430 The element tag, attribute names, and attribute values can be either 431 bytes or Unicode strings. 432 433 *parent* is the parent element, *tag* is the subelements name, *attrib* is 434 an optional directory containing element attributes, *extra* are 435 additional attributes given as keyword arguments. 436 437 """ 438 attrib = {**attrib, **extra} 439 element = parent.makeelement(tag, attrib) 440 parent.append(element) 441 return element 442 443 444def Comment(text=None): 445 """Comment element factory. 446 447 This function creates a special element which the standard serializer 448 serializes as an XML comment. 449 450 *text* is a string containing the comment string. 
451 452 """ 453 element = Element(Comment) 454 element.text = text 455 return element 456 457 458def ProcessingInstruction(target, text=None): 459 """Processing Instruction element factory. 460 461 This function creates a special element which the standard serializer 462 serializes as an XML comment. 463 464 *target* is a string containing the processing instruction, *text* is a 465 string containing the processing instruction contents, if any. 466 467 """ 468 element = Element(ProcessingInstruction) 469 element.text = target 470 if text: 471 element.text = element.text + " " + text 472 return element 473 474PI = ProcessingInstruction 475 476 477class QName: 478 """Qualified name wrapper. 479 480 This class can be used to wrap a QName attribute value in order to get 481 proper namespace handing on output. 482 483 *text_or_uri* is a string containing the QName value either in the form 484 {uri}local, or if the tag argument is given, the URI part of a QName. 485 486 *tag* is an optional argument which if given, will make the first 487 argument (text_or_uri) be interpreted as a URI, and this argument (tag) 488 be interpreted as a local name. 
489 490 """ 491 def __init__(self, text_or_uri, tag=None): 492 if tag: 493 text_or_uri = "{%s}%s" % (text_or_uri, tag) 494 self.text = text_or_uri 495 def __str__(self): 496 return self.text 497 def __repr__(self): 498 return '<%s %r>' % (self.__class__.__name__, self.text) 499 def __hash__(self): 500 return hash(self.text) 501 def __le__(self, other): 502 if isinstance(other, QName): 503 return self.text <= other.text 504 return self.text <= other 505 def __lt__(self, other): 506 if isinstance(other, QName): 507 return self.text < other.text 508 return self.text < other 509 def __ge__(self, other): 510 if isinstance(other, QName): 511 return self.text >= other.text 512 return self.text >= other 513 def __gt__(self, other): 514 if isinstance(other, QName): 515 return self.text > other.text 516 return self.text > other 517 def __eq__(self, other): 518 if isinstance(other, QName): 519 return self.text == other.text 520 return self.text == other 521 522# -------------------------------------------------------------------- 523 524 525class ElementTree: 526 """An XML element hierarchy. 527 528 This class also provides support for serialization to and from 529 standard XML. 530 531 *element* is an optional root element node, 532 *file* is an optional file handle or file name of an XML file whose 533 contents will be used to initialize the tree with. 534 535 """ 536 def __init__(self, element=None, file=None): 537 # assert element is None or iselement(element) 538 self._root = element # first node 539 if file: 540 self.parse(file) 541 542 def getroot(self): 543 """Return root element of this tree.""" 544 return self._root 545 546 def _setroot(self, element): 547 """Replace root element of this tree. 548 549 This will discard the current contents of the tree and replace it 550 with the given element. Use with care! 
551 552 """ 553 # assert iselement(element) 554 self._root = element 555 556 def parse(self, source, parser=None): 557 """Load external XML document into element tree. 558 559 *source* is a file name or file object, *parser* is an optional parser 560 instance that defaults to XMLParser. 561 562 ParseError is raised if the parser fails to parse the document. 563 564 Returns the root element of the given source document. 565 566 """ 567 close_source = False 568 if not hasattr(source, "read"): 569 source = open(source, "rb") 570 close_source = True 571 try: 572 if parser is None: 573 # If no parser was specified, create a default XMLParser 574 parser = XMLParser() 575 if hasattr(parser, '_parse_whole'): 576 # The default XMLParser, when it comes from an accelerator, 577 # can define an internal _parse_whole API for efficiency. 578 # It can be used to parse the whole source without feeding 579 # it with chunks. 580 self._root = parser._parse_whole(source) 581 return self._root 582 while True: 583 data = source.read(65536) 584 if not data: 585 break 586 parser.feed(data) 587 self._root = parser.close() 588 return self._root 589 finally: 590 if close_source: 591 source.close() 592 593 def iter(self, tag=None): 594 """Create and return tree iterator for the root element. 595 596 The iterator loops over all elements in this tree, in document order. 597 598 *tag* is a string with the tag name to iterate over 599 (default is to return all elements). 600 601 """ 602 # assert self._root is not None 603 return self._root.iter(tag) 604 605 def find(self, path, namespaces=None): 606 """Find first matching element by tag name or path. 607 608 Same as getroot().find(path), which is Element.find() 609 610 *path* is a string having either an element tag or an XPath, 611 *namespaces* is an optional mapping from namespace prefix to full name. 612 613 Return the first matching element, or None if no element was found. 
614 615 """ 616 # assert self._root is not None 617 if path[:1] == "/": 618 path = "." + path 619 warnings.warn( 620 "This search is broken in 1.3 and earlier, and will be " 621 "fixed in a future version. If you rely on the current " 622 "behaviour, change it to %r" % path, 623 FutureWarning, stacklevel=2 624 ) 625 return self._root.find(path, namespaces) 626 627 def findtext(self, path, default=None, namespaces=None): 628 """Find first matching element by tag name or path. 629 630 Same as getroot().findtext(path), which is Element.findtext() 631 632 *path* is a string having either an element tag or an XPath, 633 *namespaces* is an optional mapping from namespace prefix to full name. 634 635 Return the first matching element, or None if no element was found. 636 637 """ 638 # assert self._root is not None 639 if path[:1] == "/": 640 path = "." + path 641 warnings.warn( 642 "This search is broken in 1.3 and earlier, and will be " 643 "fixed in a future version. If you rely on the current " 644 "behaviour, change it to %r" % path, 645 FutureWarning, stacklevel=2 646 ) 647 return self._root.findtext(path, default, namespaces) 648 649 def findall(self, path, namespaces=None): 650 """Find all matching subelements by tag name or path. 651 652 Same as getroot().findall(path), which is Element.findall(). 653 654 *path* is a string having either an element tag or an XPath, 655 *namespaces* is an optional mapping from namespace prefix to full name. 656 657 Return list containing all matching elements in document order. 658 659 """ 660 # assert self._root is not None 661 if path[:1] == "/": 662 path = "." + path 663 warnings.warn( 664 "This search is broken in 1.3 and earlier, and will be " 665 "fixed in a future version. 
If you rely on the current " 666 "behaviour, change it to %r" % path, 667 FutureWarning, stacklevel=2 668 ) 669 return self._root.findall(path, namespaces) 670 671 def iterfind(self, path, namespaces=None): 672 """Find all matching subelements by tag name or path. 673 674 Same as getroot().iterfind(path), which is element.iterfind() 675 676 *path* is a string having either an element tag or an XPath, 677 *namespaces* is an optional mapping from namespace prefix to full name. 678 679 Return an iterable yielding all matching elements in document order. 680 681 """ 682 # assert self._root is not None 683 if path[:1] == "/": 684 path = "." + path 685 warnings.warn( 686 "This search is broken in 1.3 and earlier, and will be " 687 "fixed in a future version. If you rely on the current " 688 "behaviour, change it to %r" % path, 689 FutureWarning, stacklevel=2 690 ) 691 return self._root.iterfind(path, namespaces) 692 693 def write(self, file_or_filename, 694 encoding=None, 695 xml_declaration=None, 696 default_namespace=None, 697 method=None, *, 698 short_empty_elements=True): 699 """Write element tree to a file as XML. 700 701 Arguments: 702 *file_or_filename* -- file name or a file object opened for writing 703 704 *encoding* -- the output encoding (default: US-ASCII) 705 706 *xml_declaration* -- bool indicating if an XML declaration should be 707 added to the output. If None, an XML declaration 708 is added if encoding IS NOT either of: 709 US-ASCII, UTF-8, or Unicode 710 711 *default_namespace* -- sets the default XML namespace (for "xmlns") 712 713 *method* -- either "xml" (default), "html, "text", or "c14n" 714 715 *short_empty_elements* -- controls the formatting of elements 716 that contain no content. 
If True (default) 717 they are emitted as a single self-closed 718 tag, otherwise they are emitted as a pair 719 of start/end tags 720 721 """ 722 if not method: 723 method = "xml" 724 elif method not in _serialize: 725 raise ValueError("unknown method %r" % method) 726 if not encoding: 727 if method == "c14n": 728 encoding = "utf-8" 729 else: 730 encoding = "us-ascii" 731 with _get_writer(file_or_filename, encoding) as (write, declared_encoding): 732 if method == "xml" and (xml_declaration or 733 (xml_declaration is None and 734 encoding.lower() != "unicode" and 735 declared_encoding.lower() not in ("utf-8", "us-ascii"))): 736 write("<?xml version='1.0' encoding='%s'?>\n" % ( 737 declared_encoding,)) 738 if method == "text": 739 _serialize_text(write, self._root) 740 else: 741 qnames, namespaces = _namespaces(self._root, default_namespace) 742 serialize = _serialize[method] 743 serialize(write, self._root, qnames, namespaces, 744 short_empty_elements=short_empty_elements) 745 746 def write_c14n(self, file): 747 # lxml.etree compatibility. 
use output method instead 748 return self.write(file, method="c14n") 749 750# -------------------------------------------------------------------- 751# serialization support 752 753@contextlib.contextmanager 754def _get_writer(file_or_filename, encoding): 755 # returns text write method and release all resources after using 756 try: 757 write = file_or_filename.write 758 except AttributeError: 759 # file_or_filename is a file name 760 if encoding.lower() == "unicode": 761 encoding="utf-8" 762 with open(file_or_filename, "w", encoding=encoding, 763 errors="xmlcharrefreplace") as file: 764 yield file.write, encoding 765 else: 766 # file_or_filename is a file-like object 767 # encoding determines if it is a text or binary writer 768 if encoding.lower() == "unicode": 769 # use a text writer as is 770 yield write, getattr(file_or_filename, "encoding", None) or "utf-8" 771 else: 772 # wrap a binary writer with TextIOWrapper 773 with contextlib.ExitStack() as stack: 774 if isinstance(file_or_filename, io.BufferedIOBase): 775 file = file_or_filename 776 elif isinstance(file_or_filename, io.RawIOBase): 777 file = io.BufferedWriter(file_or_filename) 778 # Keep the original file open when the BufferedWriter is 779 # destroyed 780 stack.callback(file.detach) 781 else: 782 # This is to handle passed objects that aren't in the 783 # IOBase hierarchy, but just have a write method 784 file = io.BufferedIOBase() 785 file.writable = lambda: True 786 file.write = write 787 try: 788 # TextIOWrapper uses this methods to determine 789 # if BOM (for UTF-16, etc) should be added 790 file.seekable = file_or_filename.seekable 791 file.tell = file_or_filename.tell 792 except AttributeError: 793 pass 794 file = io.TextIOWrapper(file, 795 encoding=encoding, 796 errors="xmlcharrefreplace", 797 newline="\n") 798 # Keep the original file open when the TextIOWrapper is 799 # destroyed 800 stack.callback(file.detach) 801 yield file.write, encoding 802 803def _namespaces(elem, 
default_namespace=None): 804 # identify namespaces used in this tree 805 806 # maps qnames to *encoded* prefix:local names 807 qnames = {None: None} 808 809 # maps uri:s to prefixes 810 namespaces = {} 811 if default_namespace: 812 namespaces[default_namespace] = "" 813 814 def add_qname(qname): 815 # calculate serialized qname representation 816 try: 817 if qname[:1] == "{": 818 uri, tag = qname[1:].rsplit("}", 1) 819 prefix = namespaces.get(uri) 820 if prefix is None: 821 prefix = _namespace_map.get(uri) 822 if prefix is None: 823 prefix = "ns%d" % len(namespaces) 824 if prefix != "xml": 825 namespaces[uri] = prefix 826 if prefix: 827 qnames[qname] = "%s:%s" % (prefix, tag) 828 else: 829 qnames[qname] = tag # default element 830 else: 831 if default_namespace: 832 # FIXME: can this be handled in XML 1.0? 833 raise ValueError( 834 "cannot use non-qualified names with " 835 "default_namespace option" 836 ) 837 qnames[qname] = qname 838 except TypeError: 839 _raise_serialization_error(qname) 840 841 # populate qname and namespaces table 842 for elem in elem.iter(): 843 tag = elem.tag 844 if isinstance(tag, QName): 845 if tag.text not in qnames: 846 add_qname(tag.text) 847 elif isinstance(tag, str): 848 if tag not in qnames: 849 add_qname(tag) 850 elif tag is not None and tag is not Comment and tag is not PI: 851 _raise_serialization_error(tag) 852 for key, value in elem.items(): 853 if isinstance(key, QName): 854 key = key.text 855 if key not in qnames: 856 add_qname(key) 857 if isinstance(value, QName) and value.text not in qnames: 858 add_qname(value.text) 859 text = elem.text 860 if isinstance(text, QName) and text.text not in qnames: 861 add_qname(text.text) 862 return qnames, namespaces 863 864def _serialize_xml(write, elem, qnames, namespaces, 865 short_empty_elements, **kwargs): 866 tag = elem.tag 867 text = elem.text 868 if tag is Comment: 869 write("<!--%s-->" % text) 870 elif tag is ProcessingInstruction: 871 write("<?%s?>" % text) 872 else: 873 tag = 
qnames[tag] 874 if tag is None: 875 if text: 876 write(_escape_cdata(text)) 877 for e in elem: 878 _serialize_xml(write, e, qnames, None, 879 short_empty_elements=short_empty_elements) 880 else: 881 write("<" + tag) 882 items = list(elem.items()) 883 if items or namespaces: 884 if namespaces: 885 for v, k in sorted(namespaces.items(), 886 key=lambda x: x[1]): # sort on prefix 887 if k: 888 k = ":" + k 889 write(" xmlns%s=\"%s\"" % ( 890 k, 891 _escape_attrib(v) 892 )) 893 for k, v in items: 894 if isinstance(k, QName): 895 k = k.text 896 if isinstance(v, QName): 897 v = qnames[v.text] 898 else: 899 v = _escape_attrib(v) 900 write(" %s=\"%s\"" % (qnames[k], v)) 901 if text or len(elem) or not short_empty_elements: 902 write(">") 903 if text: 904 write(_escape_cdata(text)) 905 for e in elem: 906 _serialize_xml(write, e, qnames, None, 907 short_empty_elements=short_empty_elements) 908 write("</" + tag + ">") 909 else: 910 write(" />") 911 if elem.tail: 912 write(_escape_cdata(elem.tail)) 913 914HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr", 915 "img", "input", "isindex", "link", "meta", "param", "source", 916 "track", "wbr"} 917 918def _serialize_html(write, elem, qnames, namespaces, **kwargs): 919 tag = elem.tag 920 text = elem.text 921 if tag is Comment: 922 write("<!--%s-->" % _escape_cdata(text)) 923 elif tag is ProcessingInstruction: 924 write("<?%s?>" % _escape_cdata(text)) 925 else: 926 tag = qnames[tag] 927 if tag is None: 928 if text: 929 write(_escape_cdata(text)) 930 for e in elem: 931 _serialize_html(write, e, qnames, None) 932 else: 933 write("<" + tag) 934 items = list(elem.items()) 935 if items or namespaces: 936 if namespaces: 937 for v, k in sorted(namespaces.items(), 938 key=lambda x: x[1]): # sort on prefix 939 if k: 940 k = ":" + k 941 write(" xmlns%s=\"%s\"" % ( 942 k, 943 _escape_attrib(v) 944 )) 945 for k, v in items: 946 if isinstance(k, QName): 947 k = k.text 948 if isinstance(v, QName): 949 v = qnames[v.text] 
950 else: 951 v = _escape_attrib_html(v) 952 # FIXME: handle boolean attributes 953 write(" %s=\"%s\"" % (qnames[k], v)) 954 write(">") 955 ltag = tag.lower() 956 if text: 957 if ltag == "script" or ltag == "style": 958 write(text) 959 else: 960 write(_escape_cdata(text)) 961 for e in elem: 962 _serialize_html(write, e, qnames, None) 963 if ltag not in HTML_EMPTY: 964 write("</" + tag + ">") 965 if elem.tail: 966 write(_escape_cdata(elem.tail)) 967 968def _serialize_text(write, elem): 969 for part in elem.itertext(): 970 write(part) 971 if elem.tail: 972 write(elem.tail) 973 974_serialize = { 975 "xml": _serialize_xml, 976 "html": _serialize_html, 977 "text": _serialize_text, 978# this optional method is imported at the end of the module 979# "c14n": _serialize_c14n, 980} 981 982 983def register_namespace(prefix, uri): 984 """Register a namespace prefix. 985 986 The registry is global, and any existing mapping for either the 987 given prefix or the namespace URI will be removed. 988 989 *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and 990 attributes in this namespace will be serialized with prefix if possible. 991 992 ValueError is raised if prefix is reserved or is invalid. 
993 994 """ 995 if re.match(r"ns\d+$", prefix): 996 raise ValueError("Prefix format reserved for internal use") 997 for k, v in list(_namespace_map.items()): 998 if k == uri or v == prefix: 999 del _namespace_map[k] 1000 _namespace_map[uri] = prefix 1001 1002_namespace_map = { 1003 # "well-known" namespace prefixes 1004 "http://www.w3.org/XML/1998/namespace": "xml", 1005 "http://www.w3.org/1999/xhtml": "html", 1006 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", 1007 "http://schemas.xmlsoap.org/wsdl/": "wsdl", 1008 # xml schema 1009 "http://www.w3.org/2001/XMLSchema": "xs", 1010 "http://www.w3.org/2001/XMLSchema-instance": "xsi", 1011 # dublin core 1012 "http://purl.org/dc/elements/1.1/": "dc", 1013} 1014# For tests and troubleshooting 1015register_namespace._namespace_map = _namespace_map 1016 1017def _raise_serialization_error(text): 1018 raise TypeError( 1019 "cannot serialize %r (type %s)" % (text, type(text).__name__) 1020 ) 1021 1022def _escape_cdata(text): 1023 # escape character data 1024 try: 1025 # it's worth avoiding do-nothing calls for strings that are 1026 # shorter than 500 characters, or so. assume that's, by far, 1027 # the most common case in most applications. 1028 if "&" in text: 1029 text = text.replace("&", "&") 1030 if "<" in text: 1031 text = text.replace("<", "<") 1032 if ">" in text: 1033 text = text.replace(">", ">") 1034 return text 1035 except (TypeError, AttributeError): 1036 _raise_serialization_error(text) 1037 1038def _escape_attrib(text): 1039 # escape attribute value 1040 try: 1041 if "&" in text: 1042 text = text.replace("&", "&") 1043 if "<" in text: 1044 text = text.replace("<", "<") 1045 if ">" in text: 1046 text = text.replace(">", ">") 1047 if "\"" in text: 1048 text = text.replace("\"", """) 1049 # Although section 2.11 of the XML specification states that CR or 1050 # CR LN should be replaced with just LN, it applies only to EOLNs 1051 # which take part of organizing file into lines. 
def _escape_attrib(text):
    """Return *text* escaped for use inside a double-quoted XML attribute.

    "&", "<", ">" and '"' become named entities; CR, LF and TAB become
    numeric character references.  Raises TypeError (via
    _raise_serialization_error) when *text* does not support the string
    operations used here.
    """
    # escape attribute value
    try:
        # "&" first, so entities added below are not escaped again.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        # Although section 2.11 of the XML specification states that CR or
        # CR LF should be replaced with just LF, it applies only to EOLs
        # which take part of organizing file into lines.  Within attributes,
        # we are replacing these with entity numbers, so they do not count.
        # http://www.w3.org/TR/REC-xml/#sec-line-ends
        # The current solution, contained in following six lines, was
        # discussed in issue 17582 and 39011.
        if "\r" in text:
            text = text.replace("\r", "&#13;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        if "\t" in text:
            text = text.replace("\t", "&#09;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

def _escape_attrib_html(text):
    """Return *text* escaped for use inside a double-quoted HTML attribute.

    Only "&", ">" and '"' are replaced; "<" is left untouched.  Raises
    TypeError (via _raise_serialization_error) when *text* does not support
    the string operations used here.
    """
    # escape attribute value
    try:
        if "&" in text:
            text = text.replace("&", "&amp;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Serialize *element* and its subelements and return the result.

    The element is wrapped in an ElementTree and written to an in-memory
    stream.  When *encoding* is "unicode" the stream is an io.StringIO and
    a str is returned; otherwise an io.BytesIO is used and bytes are
    returned.

    *method*, *xml_declaration*, *default_namespace* and
    *short_empty_elements* are forwarded unchanged to ElementTree.write().

    """
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    document = ElementTree(element)
    document.write(buffer, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return buffer.getvalue()

class _ListDataStream(io.BufferedIOBase):
    """Write-only stream that appends every written chunk to a list."""

    def __init__(self, lst):
        # The caller keeps its own reference to *lst* and reads the
        # accumulated chunks from it.
        self.lst = lst

    def writable(self):
        return True

    def seekable(self):
        return True

    def write(self, b):
        # Chunks are stored as given; nothing is copied or joined.
        self.lst.append(b)

    def tell(self):
        # Reports the number of chunks written so far, not a byte offset.
        return len(self.lst)

def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Serialize *element* and return the written chunks as a list.

    Takes the same arguments as tostring(); the chunks are whatever
    ElementTree.write() passes to the stream's write() method.

    """
    chunks = []
    sink = _ListDataStream(chunks)
    document = ElementTree(element)
    document.write(sink, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return chunks


def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    if isinstance(elem, ElementTree):
        tree = elem
    else:
        tree = ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    # Terminate the output with a newline unless the root's tail already
    # ends with one.
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
def indent(tree, space="  ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level. Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    ValueError is raised if *level* is negative.
    """
    if isinstance(tree, ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    # Nothing to do for an element without children.
    if not len(tree):
        return

    # Reduce the memory consumption by reusing indentation strings.
    # indentations[i] is the string for depth i relative to *tree*; entry 0
    # already accounts for the initial *level*.
    indentations = ["\n" + level * space]

    def _indent_children(elem, level):
        # Start a new indentation level for the first child.
        child_level = level + 1
        try:
            child_indentation = indentations[child_level]
        except IndexError:
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)

        # Only whitespace-only (or missing) text and tails are overwritten.
        if not elem.text or not elem.text.strip():
            elem.text = child_indentation

        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not child.tail or not child.tail.strip():
                child.tail = child_indentation

        # Dedent after the last child by overwriting the previous indentation.
        if not child.tail.strip():
            child.tail = indentations[level]

    _indent_children(tree, 0)
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    # Create an empty tree and let it load itself from *source*.
    tree = ElementTree()
    tree.parse(source, parser)
    return tree


def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is initialized with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  The iterator has a
    ``root`` attribute that is None until the input has been fully consumed
    and is then set to the value returned by the parser on close.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    def iterator(source):
        close_source = False
        try:
            # Anything without a read() method is treated as a file name;
            # only a file opened here is closed in the finally clause.
            if not hasattr(source, "read"):
                source = open(source, "rb")
                close_source = True
            # Placeholder value consumed by the next(it) call below, so the
            # source is opened before iterparse() returns.
            yield None
            while True:
                # Drain the events produced so far before reading more.
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            # *it* is the iterator object created below (closure variable).
            it.root = root
        finally:
            if close_source:
                source.close()

    # Wrap the generator in a collections.abc.Iterator subclass so the
    # returned object can carry the extra ``root`` attribute.
    class IterParseIterator(collections.abc.Iterator):
        __next__ = iterator(source).__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    # Run the generator up to its first yield (the None placeholder).
    next(it)
    return it
class XMLPullParser:
    """Non-blocking parser front end: feed data in, pull events out.

    *events* is the sequence of event names to report; when omitted only
    "end" events are reported.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See https://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        reported = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, reported)

    def feed(self, data):
        """Feed encoded data to parser."""
        parser = self._parser
        if parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            parser.feed(data)
        except SyntaxError as exc:
            # Queue the error so read_events() raises it in order, after
            # the events that preceded it.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        parser, self._parser = self._parser, None
        return parser.close()

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        queue = self._events_queue
        while queue:
            item = queue.popleft()
            if isinstance(item, Exception):
                raise item
            yield item


def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()


def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    root = active.close()
    by_id = {}
    for node in root.iter():
        ident = node.get("id")
        # Elements with a missing or empty id attribute are skipped.
        if ident:
            by_id[ident] = node
    return root, by_id

# Parse XML document from string constant.  Alias for XML().
fromstring = XML

def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence, *parser* is an optional parser
    instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active.feed(fragment)
    return active.close()
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """

    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = []    # character data collected since the last flush
        self._elem = []    # stack of currently open elements
        self._last = None  # node that receives the next text or tail
        self._root = None  # first element started at top level
        self._tail = None  # true if we're after an end tag
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self.insert_comments = insert_comments
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self.insert_pis = insert_pis
        self._factory = Element if element_factory is None else element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert not self._elem, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the collected character data to the last node: as its
        # tail when that node has already been closed, else as its text.
        if not self._data:
            return
        node = self._last
        if node is not None:
            text = "".join(self._data)
            if self._tail:
                assert node.tail is None, "internal error (tail)"
                node.tail = text
            else:
                assert node.text is None, "internal error (text)"
                node.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        stack = self._elem
        if stack:
            # Nested element: becomes a child of the innermost open one.
            stack[-1].append(elem)
        elif self._root is None:
            self._root = elem
        stack.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag, (
            "end tag mismatch (expected %s, got %s)" % (closed.tag, tag))
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Build a childless node; add it to the tree only when *insert*
        # is true.  The node is returned either way.
        node = factory(*args)
        if not insert:
            return node
        self._flush()
        self._last = node
        if self._elem:
            self._elem[-1].append(node)
        self._tail = 1
        return node
# also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    Only the callbacks the target actually implements (start, end, start_ns,
    end_ns, data, comment, pi) are registered with expat.

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is passed as the namespace separator, so namespaced names
        # arrive as "uri}local"; _fixname() adds the leading "{".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        # None outside a doctype declaration, else the list of doctype
        # tokens collected so far (see _default()).
        self._doctype = None
        # Entity name -> replacement text, consulted by _default() for
        # entity references it is handed.
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler below replaces the corresponding expat callback and
        # appends an (event_name, value) pair to the queue.  Loop values are
        # bound as default arguments so each closure keeps its own
        # event name.  Raises ValueError for an unknown event name.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, copying its error code and
        # its (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # Expand a namespaced name: names containing "}" get a leading "{".
        # Results are memoized in self._names.
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        # Forward to the target, replacing a false prefix/uri with ''.
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        # Forward to the target, replacing a false prefix with ''.
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            # Even indexes hold names, odd indexes the matching values.
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        # Handler for expat's EndElementHandler.
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Default expat handler.  Used here for two things: resolving entity
        # references through self.entity, and collecting the tokens of a
        # <!DOCTYPE ...> declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                # text is "&name;": strip the "&" and ";" to get the name.
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                # Collected tokens are: name, "PUBLIC", pubid, system
                # or: name, "SYSTEM", system.
                type = self._doctype[1]
                if type == "PUBLIC" and n == 4:
                    name, type, pubid, system = self._doctype
                    if pubid:
                        # drop the first and last character of the token
                        pubid = pubid[1:-1]
                elif type == "SYSTEM" and n == 3:
                    name, type, system = self._doctype
                    pubid = None
                else:
                    # not complete yet (or not a recognized form)
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored.  "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, False)
        except self._error as v:
            # convert the expat error into ParseError
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse(b"", True) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            # target has no close(): nothing to return
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    If *out* is provided, it must be a file or file-like object that receives
    the serialised canonical XML output (text, not bytes) through its ``.write()``
    method.  To write to a file, open it in text mode with encoding "utf-8".
    If *out* is not provided, this function returns the output as text string.

    Either *xml_data* (an XML string) or *from_file* (a file path or
    file-like object) must be provided as input.  ValueError is raised when
    both are None; when both are given, *xml_data* is used.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")
    # sio stays None when the caller supplied *out*; in that case the
    # function returns None.
    sio = None
    if out is None:
        sio = out = io.StringIO()

    parser = XMLParser(target=C14NWriterTarget(out.write, **options))

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    elif from_file is not None:
        parse(from_file, parser=parser)

    return sio.getvalue() if sio is not None else None


# Matches text of the form "prefix:name" (word characters on both sides).
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match


class C14NWriterTarget:
    """
    Canonicalization writer target for the XMLParser.

    Serialises parse events to XML C14N 2.0.

    The *write* function is used for writing out the resulting data stream
    as text (not bytes).  To write to a file, open it in text mode with encoding
    "utf-8" and pass its ``.write`` method.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised
    """
    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        self._data = []  # text chunks collected since the last flush
        self._with_comments = with_comments
        self._strip_text = strip_text
        # Empty or missing collections are stored as None so the parse
        # callbacks can skip the corresponding work with one test.
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        if qname_aware_tags:
            self._qname_aware_tags = set(qname_aware_tags)
        else:
            self._qname_aware_tags = None
        if qname_aware_attrs:
            # Bound method: called with the attrs dict, it returns the
            # qname-aware attribute names present on the element.
            self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
        else:
            self._find_qname_aware_attrs = None

        # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
        self._declared_ns_stack = [[
            ("http://www.w3.org/XML/1998/namespace", "xml"),
        ]]
        # Stack with user declared namespace prefixes as (uri, prefix) pairs.
        self._ns_stack = []
        if not rewrite_prefixes:
            self._ns_stack.append(list(_namespace_map.items()))
        self._ns_stack.append([])
        self._prefix_map = {}  # uri -> generated "n{number}" prefix
        self._preserve_space = [False]  # xml:space state, one entry per open element
        self._pending_start = None  # deferred _start() arguments, see start()
        self._root_seen = False  # set once the first start tag was written
        self._root_done = False  # set once the outermost element was closed
        self._ignored_depth = 0  # > 0 while inside an excluded element

    def _iter_namespaces(self, ns_stack, _reversed=reversed):
        # Yield (uri, prefix) pairs from the innermost stack level outwards.
        for namespaces in _reversed(ns_stack):
            if namespaces:  # almost no element declares new namespaces
                yield from namespaces

    def _resolve_prefix_name(self, prefixed_name):
        # Turn "prefix:name" into "{uri}name" using the user-declared
        # namespaces; raise ValueError when the prefix is unknown.
        prefix, name = prefixed_name.split(':', 1)
        for uri, p in self._iter_namespaces(self._ns_stack):
            if p == prefix:
                return f'{{{uri}}}{name}'
        raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

    def _qname(self, qname, uri=None):
        # Return a (serialised name, local name, uri) triple for *qname*,
        # recording a new namespace declaration on the current element
        # when one is needed.  Raises ValueError if no prefix can be found
        # for a non-empty namespace.
        if uri is None:
            uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
        else:
            tag = qname

        # A prefix seen earlier in the walk (inner levels come first)
        # must not be matched again at an outer level.
        prefixes_seen = set()
        for u, prefix in self._iter_namespaces(self._declared_ns_stack):
            if u == uri and prefix not in prefixes_seen:
                return f'{prefix}:{tag}' if prefix else tag, tag, uri
            prefixes_seen.add(prefix)

        # Not declared yet => add new declaration.
        if self._rewrite_prefixes:
            if uri in self._prefix_map:
                prefix = self._prefix_map[uri]
            else:
                prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
            self._declared_ns_stack[-1].append((uri, prefix))
            return f'{prefix}:{tag}', tag, uri

        if not uri and '' not in prefixes_seen:
            # No default namespace declared => no prefix needed.
            return tag, tag, uri

        for u, prefix in self._iter_namespaces(self._ns_stack):
            if u == uri:
                self._declared_ns_stack[-1].append((uri, prefix))
                return f'{prefix}:{tag}' if prefix else tag, tag, uri

        if not uri:
            # As soon as a default namespace is defined,
            # anything that has no namespace (and thus, no prefix) goes there.
            return tag, tag, uri

        raise ValueError(f'Namespace "{uri}" is not declared in scope')

    def data(self, data):
        # Collect character data unless inside an excluded element.
        if not self._ignored_depth:
            self._data.append(data)

    def _flush(self, _join_text=''.join):
        # Write out the collected text.  If a start tag was deferred (see
        # start()), emit it first; text that looks like "prefix:name" is
        # handed to _start() as qname text and written there instead.
        data = _join_text(self._data)
        del self._data[:]
        if self._strip_text and not self._preserve_space[-1]:
            data = data.strip()
        if self._pending_start is not None:
            args, self._pending_start = self._pending_start, None
            qname_text = data if data and _looks_like_prefix_name(data) else None
            self._start(*args, qname_text)
            if qname_text is not None:
                return
        # Text collected before the first start tag is dropped.
        if data and self._root_seen:
            self._write(_escape_cdata_c14n(data))

    def start_ns(self, prefix, uri):
        if self._ignored_depth:
            return
        # we may have to resolve qnames in text content
        if self._data:
            self._flush()
        self._ns_stack[-1].append((uri, prefix))

    def start(self, tag, attrs):
        # Excluded tags, and everything nested in them, only bump the
        # depth counter; end() undoes it.
        if self._exclude_tags is not None and (
                self._ignored_depth or tag in self._exclude_tags):
            self._ignored_depth += 1
            return
        if self._data:
            self._flush()

        new_namespaces = []
        self._declared_ns_stack.append(new_namespaces)

        if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
            # Need to parse text first to see if it requires a prefix declaration.
            self._pending_start = (tag, attrs, new_namespaces)
            return
        self._start(tag, attrs, new_namespaces)

    def _start(self, tag, attrs, new_namespaces, qname_text=None):
        # Write the start tag for *tag*: namespace declarations first, then
        # attributes.  *new_namespaces* is the list pushed on
        # _declared_ns_stack by start(); _qname() may append to it while
        # names are resolved below.
        if self._exclude_attrs is not None and attrs:
            attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

        qnames = {tag, *attrs}
        resolved_names = {}

        # Resolve prefixes in attribute and tag text.
        if qname_text is not None:
            qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
            qnames.add(qname)
        if self._find_qname_aware_attrs is not None and attrs:
            qattrs = self._find_qname_aware_attrs(attrs)
            if qattrs:
                for attr_name in qattrs:
                    value = attrs[attr_name]
                    if _looks_like_prefix_name(value):
                        qname = resolved_names[value] = self._resolve_prefix_name(value)
                        qnames.add(qname)
            else:
                qattrs = None
        else:
            qattrs = None

        # Assign prefixes in lexicographical order of used URIs.
        parse_qname = self._qname
        parsed_qnames = {n: parse_qname(n) for n in sorted(
            qnames, key=lambda n: n.split('}', 1))}

        # Write namespace declarations in prefix order ...
        if new_namespaces:
            attr_list = [
                ('xmlns:' + prefix if prefix else 'xmlns', uri)
                for uri, prefix in new_namespaces
            ]
            attr_list.sort()
        else:
            # almost always empty
            attr_list = []

        # ... followed by attributes in URI+name order
        if attrs:
            for k, v in sorted(attrs.items()):
                # Qname-aware attribute values are rewritten to the
                # serialised form of the name they refer to.
                if qattrs is not None and k in qattrs and v in resolved_names:
                    v = parsed_qnames[resolved_names[v]][0]
                attr_qname, attr_name, uri = parsed_qnames[k]
                # No prefix for attributes in default ('') namespace.
                attr_list.append((attr_qname if uri else attr_name, v))

        # Honour xml:space attributes.
        space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
        self._preserve_space.append(
            space_behaviour == 'preserve' if space_behaviour
            else self._preserve_space[-1])

        # Write the tag.
        write = self._write
        write('<' + parsed_qnames[tag][0])
        if attr_list:
            write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
        write('>')

        # Write the resolved qname text content.
        if qname_text is not None:
            write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

        self._root_seen = True
        self._ns_stack.append([])

    def end(self, tag):
        if self._ignored_depth:
            self._ignored_depth -= 1
            return
        if self._data:
            self._flush()
        self._write(f'</{self._qname(tag)[0]}>')
        self._preserve_space.pop()
        # Only the initial entry is left once the outermost element closes.
        self._root_done = len(self._preserve_space) == 1
        self._declared_ns_stack.pop()
        self._ns_stack.pop()

    def comment(self, text):
        if not self._with_comments:
            return
        if self._ignored_depth:
            return
        # A newline separates the comment from the root element when the
        # comment comes after it (written before) or ahead of it (after).
        if self._root_done:
            self._write('\n')
        elif self._root_seen and self._data:
            self._flush()
        self._write(f'<!--{_escape_cdata_c14n(text)}-->')
        if not self._root_seen:
            self._write('\n')

    def pi(self, target, data):
        if self._ignored_depth:
            return
        # Same newline placement as in comment().
        if self._root_done:
            self._write('\n')
        elif self._root_seen and self._data:
            self._flush()
        self._write(
            f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>')
        if not self._root_seen:
            self._write('\n')
def _escape_cdata_c14n(text):
    """Return *text* escaped as C14N character data.

    "&", "<" and ">" become named entities and CR becomes "&#xD;".  Raises
    TypeError (via _raise_serialization_error) when *text* does not support
    the string operations used here.
    """
    # escape character data
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 character, or so.  assume that's, by far,
        # the most common case in most applications.
        # "&" first, so entities added below are not escaped again.
        if '&' in text:
            text = text.replace('&', '&amp;')
        if '<' in text:
            text = text.replace('<', '&lt;')
        if '>' in text:
            text = text.replace('>', '&gt;')
        if '\r' in text:
            text = text.replace('\r', '&#xD;')
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)


def _escape_attrib_c14n(text):
    """Return *text* escaped as a C14N attribute value.

    "&", "<" and '"' become named entities; TAB, LF and CR become "&#x9;",
    "&#xA;" and "&#xD;".  ">" is left untouched.  Raises TypeError (via
    _raise_serialization_error) when *text* does not support the string
    operations used here.
    """
    # escape attribute value
    try:
        if '&' in text:
            text = text.replace('&', '&amp;')
        if '<' in text:
            text = text.replace('<', '&lt;')
        if '"' in text:
            text = text.replace('"', '&quot;')
        if '\t' in text:
            text = text.replace('\t', '&#x9;')
        if '\n' in text:
            text = text.replace('\n', '&#xA;')
        if '\r' in text:
            text = text.replace('\r', '&#xD;')
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
# --------------------------------------------------------------------

# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests)
    _Element_Py = Element

    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
    # The star import rebinds the public names; _set_factories is imported
    # explicitly on the next line.
    from _elementtree import *
    from _elementtree import _set_factories
except ImportError:
    # No C accelerator available: keep the pure-Python definitions.
    pass
else:
    # Pass the pure-Python Comment and ProcessingInstruction factories to
    # the C implementation.
    _set_factories(Comment, ProcessingInstruction)