#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl created
# 2003-05-28 fl added support for // etc
# 2003-08-27 fl fixed parsing of periods in element names
# 2007-09-10 fl new selection engine
# 2007-09-12 fl fixed parent selector
# 2007-09-13 fl added iterfind; changed findall to return a list
# 2007-11-30 fl added namespaces support
# 2009-10-30 fl added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
#
# [email protected]
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

# Licensed to PSF under a Contributor Agreement.
# See https://www.python.org/psf/license for licensing details.

##
# Implementation module for XPath support. There's usually no reason
# to import this module directly; the <b>ElementTree</b> does this for
# you, if needed.
##

import re

# Each match produces a 2-tuple (operator, tag): group 1 captures quoted
# strings and operator tokens, group 2 captures (optionally "{uri}"-qualified)
# names.  Whitespace matches neither group and comes out as ('', '').
xpath_tokenizer_re = re.compile(
    r"("
    r"'[^']*'|\"[^\"]*\"|"
    r"::|"
    r"//?|"
    r"\.\.|"
    r"\(\)|"
    r"!=|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@!=\s]+)|"
    r"\s+"
    )


def xpath_tokenizer(pattern, namespaces=None):
    """Yield (operator, tag) tokens for *pattern*.

    Names written as "prefix:name" are rewritten to "{uri}name" using the
    *namespaces* mapping.  When the mapping has an entry for the empty
    prefix, unprefixed names get that namespace, except for a name that
    directly follows an '@' token.

    Raises SyntaxError when a prefix is not present in *namespaces*.
    """
    default_namespace = namespaces.get('') if namespaces else None
    parsing_attribute = False
    for token in xpath_tokenizer_re.findall(pattern):
        ttype, tag = token
        if tag and tag[0] != "{":
            if ":" in tag:
                prefix, uri = tag.split(":", 1)
                try:
                    if not namespaces:
                        raise KeyError
                    yield ttype, "{%s}%s" % (namespaces[prefix], uri)
                except KeyError:
                    raise SyntaxError("prefix %r not found in prefix map" % prefix) from None
            elif default_namespace and not parsing_attribute:
                yield ttype, "{%s}%s" % (default_namespace, tag)
            else:
                yield token
            parsing_attribute = False
        else:
            yield token
            # Remember whether the next name is an attribute name.
            parsing_attribute = ttype == '@'


def get_parent_map(context):
    """Return the child -> parent mapping for context.root.

    The mapping is built on first use by walking context.root.iter() and is
    stored on *context* so later calls reuse it.
    """
    parent_map = context.parent_map
    if parent_map is None:
        context.parent_map = parent_map = {}
        for p in context.root.iter():
            for e in p:
                parent_map[e] = p
    return parent_map


def _is_wildcard_tag(tag):
    """Return True if *tag* starts with '{*}' or ends with '}*'."""
    return tag[:3] == '{*}' or tag[-2:] == '}*'


def _prepare_tag(tag):
    """Build a filter for a wildcard *tag* ('{*}*', '{}*', '{*}name', '{ns}*').

    The returned callable takes (context, result) and yields the elements
    of *result* whose tag matches.  Raises RuntimeError for any other tag.
    """
    _isinstance, _str = isinstance, str
    if tag == '{*}*':
        # Same as '*', but no comments or processing instructions.
        # It can be a surprise that '*' includes those, but there is no
        # justification for '{*}*' doing the same.
        def select(context, result):
            for elem in result:
                if _isinstance(elem.tag, _str):
                    yield elem
    elif tag == '{}*':
        # Any tag that is not in a namespace.
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if _isinstance(el_tag, _str) and el_tag[0] != '{':
                    yield elem
    elif tag[:3] == '{*}':
        # The tag in any (or no) namespace.
        suffix = tag[2:]  # '}name'
        no_ns = slice(-len(suffix), None)
        tag = tag[3:]
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix:
                    yield elem
    elif tag[-2:] == '}*':
        # Any tag in the given namespace.
        ns = tag[:-1]
        ns_only = slice(None, len(ns))
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if _isinstance(el_tag, _str) and el_tag[ns_only] == ns:
                    yield elem
    else:
        raise RuntimeError(f"internal parser error, got {tag}")
    return select


def prepare_child(next, token):
    """Build a selector yielding the direct children whose tag is token[1]."""
    tag = token[1]
    if _is_wildcard_tag(tag):
        select_tag = _prepare_tag(tag)
        def select(context, result):
            def select_child(result):
                for elem in result:
                    yield from elem
            return select_tag(context, select_child(result))
    else:
        if tag[:2] == '{}':
            tag = tag[2:]  # '{}tag' == 'tag'
        def select(context, result):
            for elem in result:
                for e in elem:
                    if e.tag == tag:
                        yield e
    return select


def prepare_star(next, token):
    """Build a selector yielding every direct child of each element."""
    def select(context, result):
        for elem in result:
            yield from elem
    return select


def prepare_self(next, token):
    """Build a selector that passes the current elements through unchanged."""
    def select(context, result):
        yield from result
    return select


def prepare_descendant(next, token):
    """Build a selector for '//': descendants matching the following token.

    Consumes one more token via *next*; it must be '*' or a name.
    Raises SyntaxError otherwise, or when there is no following token.
    """
    try:
        token = next()
    except StopIteration:
        # Previously this returned None, which is not a usable selector.
        raise SyntaxError("invalid descendant") from None
    if token[0] == "*":
        tag = "*"
    elif not token[0]:
        tag = token[1]
    else:
        raise SyntaxError("invalid descendant")

    if _is_wildcard_tag(tag):
        select_tag = _prepare_tag(tag)
        def select(context, result):
            def select_child(result):
                for elem in result:
                    for e in elem.iter():
                        if e is not elem:
                            yield e
            return select_tag(context, select_child(result))
    else:
        if tag[:2] == '{}':
            tag = tag[2:]  # '{}tag' == 'tag'
        def select(context, result):
            for elem in result:
                for e in elem.iter(tag):
                    # iter() includes the element itself; skip it.
                    if e is not elem:
                        yield e
    return select


def prepare_parent(next, token):
    """Build a selector for '..': each distinct parent, yielded once."""
    def select(context, result):
        # FIXME: raise error if .. is applied at toplevel?
        parent_map = get_parent_map(context)
        result_map = {}
        for elem in result:
            if elem in parent_map:
                parent = parent_map[elem]
                if parent not in result_map:
                    result_map[parent] = None
                    yield parent
    return select


def prepare_predicate(next, token):
    """Build a selector for a '[...]' predicate.

    Tokens are consumed via *next* up to the closing ']' and reduced to a
    signature string that decides which kind of predicate this is.

    Raises SyntaxError for an unsupported or unterminated predicate.
    """
    # FIXME: replace with real parser!!! refs:
    # http://javascript.crockford.com/tdop/tdop.html
    signature = []
    predicate = []
    while 1:
        try:
            token = next()
        except StopIteration:
            # Missing ']'.  Returning None here used to surface later as
            # "TypeError: 'NoneType' object is not callable".
            raise SyntaxError("invalid predicate") from None
        if token[0] == "]":
            break
        if token == ('', ''):
            # ignore whitespace
            continue
        if token[0] and token[0][:1] in "'\"":
            # Quoted string: normalise the type to "'" and strip the quotes.
            token = "'", token[0][1:-1]
        signature.append(token[0] or "-")
        predicate.append(token[1])
    signature = "".join(signature)
    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]
        def select(context, result):
            for elem in result:
                if elem.get(key) is not None:
                    yield elem
        return select
    if signature == "@-='" or signature == "@-!='":
        # [@attribute='value'] or [@attribute!='value']
        key = predicate[1]
        value = predicate[-1]
        def select(context, result):
            for elem in result:
                if elem.get(key) == value:
                    yield elem
        def select_negated(context, result):
            for elem in result:
                if (attr_value := elem.get(key)) is not None and attr_value != value:
                    yield elem
        return select_negated if '!=' in signature else select
    if signature == "-" and not re.match(r"\-?\d+$", predicate[0]):
        # [tag]
        tag = predicate[0]
        def select(context, result):
            for elem in result:
                if elem.find(tag) is not None:
                    yield elem
        return select
    if signature == ".='" or signature == ".!='" or (
            (signature == "-='" or signature == "-!='")
            and not re.match(r"\-?\d+$", predicate[0])):
        # [.='value'] or [tag='value'] or [.!='value'] or [tag!='value']
        tag = predicate[0]
        value = predicate[-1]
        if tag:
            def select(context, result):
                for elem in result:
                    for e in elem.findall(tag):
                        if "".join(e.itertext()) == value:
                            yield elem
                            break
            def select_negated(context, result):
                for elem in result:
                    for e in elem.iterfind(tag):
                        if "".join(e.itertext()) != value:
                            yield elem
                            break
        else:
            def select(context, result):
                for elem in result:
                    if "".join(elem.itertext()) == value:
                        yield elem
            def select_negated(context, result):
                for elem in result:
                    if "".join(elem.itertext()) != value:
                        yield elem
        return select_negated if '!=' in signature else select
    if signature == "-" or signature == "-()" or signature == "-()-":
        # [index] or [last()] or [last()-index]
        if signature == "-":
            # [index]
            index = int(predicate[0]) - 1
            if index < 0:
                raise SyntaxError("XPath position >= 1 expected")
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                try:
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression")
                if index > -2:
                    raise SyntaxError("XPath offset from last() must be negative")
            else:
                index = -1
        def select(context, result):
            parent_map = get_parent_map(context)
            for elem in result:
                try:
                    parent = parent_map[elem]
                    # FIXME: what if the selector is "*" ?
                    elems = list(parent.findall(elem.tag))
                    if elems[index] is elem:
                        yield elem
                except (IndexError, KeyError):
                    pass
        return select
    raise SyntaxError("invalid predicate")


# Maps a token's operator part to the function that builds its selector.
ops = {
    "": prepare_child,
    "*": prepare_star,
    ".": prepare_self,
    "..": prepare_parent,
    "//": prepare_descendant,
    "[": prepare_predicate,
    }

# Compiled selector lists, keyed by (path, *sorted namespace items).
_cache = {}


class _SelectorContext:
    # Filled in lazily by get_parent_map().
    parent_map = None

    def __init__(self, root):
        self.root = root

# --------------------------------------------------------------------

##
# Generate all matching objects.

def iterfind(elem, path, namespaces=None):
    """Return an iterable of the elements below *elem* that match *path*.

    Compiled selectors are cached per (path, namespaces).  An empty *path*
    matches nothing and gives an empty iterator.

    Raises SyntaxError for an absolute path or a malformed expression.
    """
    # compile selector pattern
    if path[-1:] == "/":
        path = path + "*"  # implicit all (FIXME: keep this?)

    cache_key = (path,)
    if namespaces:
        cache_key += tuple(sorted(namespaces.items()))

    try:
        selector = _cache[cache_key]
    except KeyError:
        if len(_cache) > 100:
            _cache.clear()
        if path[:1] == "/":
            raise SyntaxError("cannot use absolute path on element")
        next = iter(xpath_tokenizer(path, namespaces)).__next__
        try:
            token = next()
        except StopIteration:
            # No tokens at all (empty path).  A bare "return" here handed
            # None to find()/findall()/findtext(), which then failed with
            # TypeError; an empty iterator keeps them working.
            return iter(())
        selector = []
        while 1:
            try:
                selector.append(ops[token[0]](next, token))
            except StopIteration:
                raise SyntaxError("invalid path") from None
            try:
                token = next()
                if token[0] == "/":
                    token = next()
            except StopIteration:
                break
        _cache[cache_key] = selector
    # execute selector pattern
    result = [elem]
    context = _SelectorContext(elem)
    for select in selector:
        result = select(context, result)
    return result

##
# Find first matching object.

def find(elem, path, namespaces=None):
    """Return the first element matching *path*, or None if there is none."""
    return next(iterfind(elem, path, namespaces), None)

##
# Find all matching objects.

def findall(elem, path, namespaces=None):
    """Return a list of all elements matching *path*."""
    return list(iterfind(elem, path, namespaces))

##
# Find text for first matching object.

def findtext(elem, path, default=None, namespaces=None):
    """Return the text of the first element matching *path*.

    Gives "" when that element's text is None, and *default* when nothing
    matches.
    """
    try:
        elem = next(iterfind(elem, path, namespaces))
        if elem.text is None:
            return ""
        return elem.text
    except StopIteration:
        return default