1"""Tokenization help for Python programs. 2 3tokenize(readline) is a generator that breaks a stream of bytes into 4Python tokens. It decodes the bytes according to PEP-0263 for 5determining source file encoding. 6 7It accepts a readline-like method which is called repeatedly to get the 8next line of input (or b"" for EOF). It generates 5-tuples with these 9members: 10 11 the token type (see token.py) 12 the token (a string) 13 the starting (row, column) indices of the token (a 2-tuple of ints) 14 the ending (row, column) indices of the token (a 2-tuple of ints) 15 the original line (string) 16 17It is designed to match the working of the Python tokenizer exactly, except 18that it produces COMMENT tokens for comments and gives type OP for all 19operators. Additionally, all token lists start with an ENCODING token 20which tells you which encoding was used to decode the bytes stream. 21""" 22 23__author__ = 'Ka-Ping Yee <[email protected]>' 24__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 25 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' 26 'Michael Foord') 27from builtins import open as _builtin_open 28from codecs import lookup, BOM_UTF8 29import collections 30import functools 31from io import TextIOWrapper 32import itertools as _itertools 33import re 34import sys 35from token import * 36from token import EXACT_TOKEN_TYPES 37 38cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) 39blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) 40 41import token 42__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", 43 "untokenize", "TokenInfo"] 44del token 45 46class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): 47 def __repr__(self): 48 annotated_type = '%d (%s)' % (self.type, tok_name[self.type]) 49 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' % 50 self._replace(type=annotated_type)) 51 52 @property 53 def exact_type(self): 54 if self.type == OP and self.string in EXACT_TOKEN_TYPES: 55 return EXACT_TOKEN_TYPES[self.string] 56 else: 57 return self.type 58 59def group(*choices): return '(' + '|'.join(choices) + ')' 60def any(*choices): return group(*choices) + '*' 61def maybe(*choices): return group(*choices) + '?' 62 63# Note: we use unicode matching for names ("\w") but ascii matching for 64# number literals. 65Whitespace = r'[ \f\t]*' 66Comment = r'#[^\r\n]*' 67Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 68Name = r'\w+' 69 70Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' 71Binnumber = r'0[bB](?:_?[01])+' 72Octnumber = r'0[oO](?:_?[0-7])+' 73Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' 74Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) 75Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' 76Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', 77 r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) 78Expfloat = r'[0-9](?:_?[0-9])*' + Exponent 79Floatnumber = group(Pointfloat, Expfloat) 80Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') 81Number = group(Imagnumber, Floatnumber, Intnumber) 82 83# Return the empty string, plus all of the valid string prefixes. 84def _all_string_prefixes(): 85 # The valid string prefixes. Only contain the lower case versions, 86 # and don't contain any permutations (include 'fr', but not 87 # 'rf'). The various permutations will be generated. 88 _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr'] 89 # if we add binary f-strings, add: ['fb', 'fbr'] 90 result = {''} 91 for prefix in _valid_string_prefixes: 92 for t in _itertools.permutations(prefix): 93 # create a list with upper and lower versions of each 94 # character 95 for u in _itertools.product(*[(c, c.upper()) for c in t]): 96 result.add(''.join(u)) 97 return result 98 99@functools.lru_cache 100def _compile(expr): 101 return re.compile(expr, re.UNICODE) 102 103# Note that since _all_string_prefixes includes the empty string, 104# StringPrefix can be the empty string (making it optional). 105StringPrefix = group(*_all_string_prefixes()) 106 107# Tail end of ' string. 108Single = r"[^'\\]*(?:\\.[^'\\]*)*'" 109# Tail end of " string. 110Double = r'[^"\\]*(?:\\.[^"\\]*)*"' 111# Tail end of ''' string. 112Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" 113# Tail end of """ string. 114Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' 115Triple = group(StringPrefix + "'''", StringPrefix + '"""') 116# Single-line ' or " string. 117String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", 118 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"') 119 120# Sorting in reverse order puts the long operators before their prefixes. 121# Otherwise if = came before ==, == would get recognized as two instances 122# of =. 123Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True))) 124Funny = group(r'\r?\n', Special) 125 126PlainToken = group(Number, Funny, String, Name) 127Token = Ignore + PlainToken 128 129# First (or only) line of ' or " string. 130ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + 131 group("'", r'\\\r?\n'), 132 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + 133 group('"', r'\\\r?\n')) 134PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) 135PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) 136 137# For a given string prefix plus quotes, endpats maps it to a regex 138# to match the remainder of that string. _prefix can be empty, for 139# a normal single or triple quoted string (with no prefix). 140endpats = {} 141for _prefix in _all_string_prefixes(): 142 endpats[_prefix + "'"] = Single 143 endpats[_prefix + '"'] = Double 144 endpats[_prefix + "'''"] = Single3 145 endpats[_prefix + '"""'] = Double3 146del _prefix 147 148# A set of all of the single and triple quoted string prefixes, 149# including the opening quotes. 150single_quoted = set() 151triple_quoted = set() 152for t in _all_string_prefixes(): 153 for u in (t + '"', t + "'"): 154 single_quoted.add(u) 155 for u in (t + '"""', t + "'''"): 156 triple_quoted.add(u) 157del t, u 158 159tabsize = 8 160 161class TokenError(Exception): pass 162 163class StopTokenizing(Exception): pass 164 165 166class Untokenizer: 167 168 def __init__(self): 169 self.tokens = [] 170 self.prev_row = 1 171 self.prev_col = 0 172 self.encoding = None 173 174 def add_whitespace(self, start): 175 row, col = start 176 if row < self.prev_row or row == self.prev_row and col < self.prev_col: 177 raise ValueError("start ({},{}) precedes previous end ({},{})" 178 .format(row, col, self.prev_row, self.prev_col)) 179 row_offset = row - self.prev_row 180 if row_offset: 181 self.tokens.append("\\\n" * row_offset) 182 self.prev_col = 0 183 col_offset = col - self.prev_col 184 if col_offset: 185 self.tokens.append(" " * col_offset) 186 187 def untokenize(self, iterable): 188 it = iter(iterable) 189 indents = [] 190 startline = False 191 for t in it: 192 if len(t) == 2: 193 self.compat(t, it) 194 break 195 tok_type, token, start, end, line = t 196 if tok_type == ENCODING: 197 self.encoding = token 198 continue 199 if tok_type == ENDMARKER: 200 break 201 if tok_type == INDENT: 202 indents.append(token) 203 continue 204 elif tok_type == DEDENT: 205 indents.pop() 206 self.prev_row, self.prev_col = end 207 continue 208 elif tok_type in (NEWLINE, NL): 209 startline = True 210 elif startline and indents: 211 indent = indents[-1] 212 if start[1] >= len(indent): 213 self.tokens.append(indent) 214 self.prev_col = len(indent) 215 startline = False 216 self.add_whitespace(start) 217 self.tokens.append(token) 218 self.prev_row, self.prev_col = end 219 if tok_type in (NEWLINE, NL): 220 self.prev_row += 1 221 self.prev_col = 0 222 return "".join(self.tokens) 223 224 def compat(self, token, iterable): 225 indents = [] 226 toks_append = self.tokens.append 227 startline = token[0] in (NEWLINE, NL) 228 prevstring = False 229 230 for tok in _itertools.chain([token], iterable): 231 toknum, tokval = tok[:2] 232 if toknum == ENCODING: 233 self.encoding = tokval 234 continue 235 236 if toknum in (NAME, NUMBER): 237 tokval += ' ' 238 239 # Insert a space between two consecutive strings 240 if toknum == STRING: 241 if prevstring: 242 tokval = ' ' + tokval 243 prevstring = True 244 else: 245 prevstring = False 246 247 if toknum == INDENT: 248 indents.append(tokval) 249 continue 250 elif toknum == DEDENT: 251 indents.pop() 252 continue 253 elif toknum in (NEWLINE, NL): 254 startline = True 255 elif startline and indents: 256 toks_append(indents[-1]) 257 startline = False 258 toks_append(tokval) 259 260 261def untokenize(iterable): 262 """Transform tokens back into Python source code. 263 It returns a bytes object, encoded using the ENCODING 264 token, which is the first token sequence output by tokenize. 265 266 Each element returned by the iterable must be a token sequence 267 with at least two elements, a token number and token value. If 268 only two tokens are passed, the resulting output is poor. 269 270 Round-trip invariant for full input: 271 Untokenized source will match input source exactly 272 273 Round-trip invariant for limited input: 274 # Output bytes will tokenize back to the input 275 t1 = [tok[:2] for tok in tokenize(f.readline)] 276 newcode = untokenize(t1) 277 readline = BytesIO(newcode).readline 278 t2 = [tok[:2] for tok in tokenize(readline)] 279 assert t1 == t2 280 """ 281 ut = Untokenizer() 282 out = ut.untokenize(iterable) 283 if ut.encoding is not None: 284 out = out.encode(ut.encoding) 285 return out 286 287 288def _get_normal_name(orig_enc): 289 """Imitates get_normal_name in tokenizer.c.""" 290 # Only care about the first 12 characters. 291 enc = orig_enc[:12].lower().replace("_", "-") 292 if enc == "utf-8" or enc.startswith("utf-8-"): 293 return "utf-8" 294 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ 295 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): 296 return "iso-8859-1" 297 return orig_enc 298 299def detect_encoding(readline): 300 """ 301 The detect_encoding() function is used to detect the encoding that should 302 be used to decode a Python source file. It requires one argument, readline, 303 in the same way as the tokenize() generator. 304 305 It will call readline a maximum of twice, and return the encoding used 306 (as a string) and a list of any lines (left as bytes) it has read in. 307 308 It detects the encoding from the presence of a utf-8 bom or an encoding 309 cookie as specified in pep-0263. If both a bom and a cookie are present, 310 but disagree, a SyntaxError will be raised. If the encoding cookie is an 311 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found, 312 'utf-8-sig' is returned. 313 314 If no encoding is specified, then the default of 'utf-8' will be returned. 315 """ 316 try: 317 filename = readline.__self__.name 318 except AttributeError: 319 filename = None 320 bom_found = False 321 encoding = None 322 default = 'utf-8' 323 def read_or_stop(): 324 try: 325 return readline() 326 except StopIteration: 327 return b'' 328 329 def find_cookie(line): 330 try: 331 # Decode as UTF-8. Either the line is an encoding declaration, 332 # in which case it should be pure ASCII, or it must be UTF-8 333 # per default encoding. 334 line_string = line.decode('utf-8') 335 except UnicodeDecodeError: 336 msg = "invalid or missing encoding declaration" 337 if filename is not None: 338 msg = '{} for {!r}'.format(msg, filename) 339 raise SyntaxError(msg) 340 341 match = cookie_re.match(line_string) 342 if not match: 343 return None 344 encoding = _get_normal_name(match.group(1)) 345 try: 346 codec = lookup(encoding) 347 except LookupError: 348 # This behaviour mimics the Python interpreter 349 if filename is None: 350 msg = "unknown encoding: " + encoding 351 else: 352 msg = "unknown encoding for {!r}: {}".format(filename, 353 encoding) 354 raise SyntaxError(msg) 355 356 if bom_found: 357 if encoding != 'utf-8': 358 # This behaviour mimics the Python interpreter 359 if filename is None: 360 msg = 'encoding problem: utf-8' 361 else: 362 msg = 'encoding problem for {!r}: utf-8'.format(filename) 363 raise SyntaxError(msg) 364 encoding += '-sig' 365 return encoding 366 367 first = read_or_stop() 368 if first.startswith(BOM_UTF8): 369 bom_found = True 370 first = first[3:] 371 default = 'utf-8-sig' 372 if not first: 373 return default, [] 374 375 encoding = find_cookie(first) 376 if encoding: 377 return encoding, [first] 378 if not blank_re.match(first): 379 return default, [first] 380 381 second = read_or_stop() 382 if not second: 383 return default, [first] 384 385 encoding = find_cookie(second) 386 if encoding: 387 return encoding, [first, second] 388 389 return default, [first, second] 390 391 392def open(filename): 393 """Open a file in read only mode using the encoding detected by 394 detect_encoding(). 395 """ 396 buffer = _builtin_open(filename, 'rb') 397 try: 398 encoding, lines = detect_encoding(buffer.readline) 399 buffer.seek(0) 400 text = TextIOWrapper(buffer, encoding, line_buffering=True) 401 text.mode = 'r' 402 return text 403 except: 404 buffer.close() 405 raise 406 407 408def tokenize(readline): 409 """ 410 The tokenize() generator requires one argument, readline, which 411 must be a callable object which provides the same interface as the 412 readline() method of built-in file objects. Each call to the function 413 should return one line of input as bytes. Alternatively, readline 414 can be a callable function terminating with StopIteration: 415 readline = open(myfile, 'rb').__next__ # Example of alternate readline 416 417 The generator produces 5-tuples with these members: the token type; the 418 token string; a 2-tuple (srow, scol) of ints specifying the row and 419 column where the token begins in the source; a 2-tuple (erow, ecol) of 420 ints specifying the row and column where the token ends in the source; 421 and the line on which the token was found. The line passed is the 422 physical line. 423 424 The first token sequence will always be an ENCODING token 425 which tells you which encoding was used to decode the bytes stream. 426 """ 427 encoding, consumed = detect_encoding(readline) 428 empty = _itertools.repeat(b"") 429 rl_gen = _itertools.chain(consumed, iter(readline, b""), empty) 430 return _tokenize(rl_gen.__next__, encoding) 431 432 433def _tokenize(readline, encoding): 434 lnum = parenlev = continued = 0 435 numchars = '0123456789' 436 contstr, needcont = '', 0 437 contline = None 438 indents = [0] 439 440 if encoding is not None: 441 if encoding == "utf-8-sig": 442 # BOM will already have been stripped. 443 encoding = "utf-8" 444 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') 445 last_line = b'' 446 line = b'' 447 while True: # loop over lines in stream 448 try: 449 # We capture the value of the line variable here because 450 # readline uses the empty string '' to signal end of input, 451 # hence `line` itself will always be overwritten at the end 452 # of this loop. 453 last_line = line 454 line = readline() 455 except StopIteration: 456 line = b'' 457 458 if encoding is not None: 459 line = line.decode(encoding) 460 lnum += 1 461 pos, max = 0, len(line) 462 463 if contstr: # continued string 464 if not line: 465 raise TokenError("EOF in multi-line string", strstart) 466 endmatch = endprog.match(line) 467 if endmatch: 468 pos = end = endmatch.end(0) 469 yield TokenInfo(STRING, contstr + line[:end], 470 strstart, (lnum, end), contline + line) 471 contstr, needcont = '', 0 472 contline = None 473 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 474 yield TokenInfo(ERRORTOKEN, contstr + line, 475 strstart, (lnum, len(line)), contline) 476 contstr = '' 477 contline = None 478 continue 479 else: 480 contstr = contstr + line 481 contline = contline + line 482 continue 483 484 elif parenlev == 0 and not continued: # new statement 485 if not line: break 486 column = 0 487 while pos < max: # measure leading whitespace 488 if line[pos] == ' ': 489 column += 1 490 elif line[pos] == '\t': 491 column = (column//tabsize + 1)*tabsize 492 elif line[pos] == '\f': 493 column = 0 494 else: 495 break 496 pos += 1 497 if pos == max: 498 break 499 500 if line[pos] in '#\r\n': # skip comments or blank lines 501 if line[pos] == '#': 502 comment_token = line[pos:].rstrip('\r\n') 503 yield TokenInfo(COMMENT, comment_token, 504 (lnum, pos), (lnum, pos + len(comment_token)), line) 505 pos += len(comment_token) 506 507 yield TokenInfo(NL, line[pos:], 508 (lnum, pos), (lnum, len(line)), line) 509 continue 510 511 if column > indents[-1]: # count indents or dedents 512 indents.append(column) 513 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 514 while column < indents[-1]: 515 if column not in indents: 516 raise IndentationError( 517 "unindent does not match any outer indentation level", 518 ("<tokenize>", lnum, pos, line)) 519 indents = indents[:-1] 520 521 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) 522 523 else: # continued statement 524 if not line: 525 raise TokenError("EOF in multi-line statement", (lnum, 0)) 526 continued = 0 527 528 while pos < max: 529 pseudomatch = _compile(PseudoToken).match(line, pos) 530 if pseudomatch: # scan for tokens 531 start, end = pseudomatch.span(1) 532 spos, epos, pos = (lnum, start), (lnum, end), end 533 if start == end: 534 continue 535 token, initial = line[start:end], line[start] 536 537 if (initial in numchars or # ordinary number 538 (initial == '.' and token != '.' and token != '...')): 539 yield TokenInfo(NUMBER, token, spos, epos, line) 540 elif initial in '\r\n': 541 if parenlev > 0: 542 yield TokenInfo(NL, token, spos, epos, line) 543 else: 544 yield TokenInfo(NEWLINE, token, spos, epos, line) 545 546 elif initial == '#': 547 assert not token.endswith("\n") 548 yield TokenInfo(COMMENT, token, spos, epos, line) 549 550 elif token in triple_quoted: 551 endprog = _compile(endpats[token]) 552 endmatch = endprog.match(line, pos) 553 if endmatch: # all on one line 554 pos = endmatch.end(0) 555 token = line[start:pos] 556 yield TokenInfo(STRING, token, spos, (lnum, pos), line) 557 else: 558 strstart = (lnum, start) # multiple lines 559 contstr = line[start:] 560 contline = line 561 break 562 563 # Check up to the first 3 chars of the token to see if 564 # they're in the single_quoted set. If so, they start 565 # a string. 566 # We're using the first 3, because we're looking for 567 # "rb'" (for example) at the start of the token. If 568 # we switch to longer prefixes, this needs to be 569 # adjusted. 570 # Note that initial == token[:1]. 571 # Also note that single quote checking must come after 572 # triple quote checking (above). 573 elif (initial in single_quoted or 574 token[:2] in single_quoted or 575 token[:3] in single_quoted): 576 if token[-1] == '\n': # continued string 577 strstart = (lnum, start) 578 # Again, using the first 3 chars of the 579 # token. This is looking for the matching end 580 # regex for the correct type of quote 581 # character. So it's really looking for 582 # endpats["'"] or endpats['"'], by trying to 583 # skip string prefix characters, if any. 584 endprog = _compile(endpats.get(initial) or 585 endpats.get(token[1]) or 586 endpats.get(token[2])) 587 contstr, needcont = line[start:], 1 588 contline = line 589 break 590 else: # ordinary string 591 yield TokenInfo(STRING, token, spos, epos, line) 592 593 elif initial.isidentifier(): # ordinary name 594 yield TokenInfo(NAME, token, spos, epos, line) 595 elif initial == '\\': # continued stmt 596 continued = 1 597 else: 598 if initial in '([{': 599 parenlev += 1 600 elif initial in ')]}': 601 parenlev -= 1 602 yield TokenInfo(OP, token, spos, epos, line) 603 else: 604 yield TokenInfo(ERRORTOKEN, line[pos], 605 (lnum, pos), (lnum, pos+1), line) 606 pos += 1 607 608 # Add an implicit NEWLINE if the input doesn't end in one 609 if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): 610 yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') 611 for indent in indents[1:]: # pop remaining indent levels 612 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') 613 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') 614 615 616def generate_tokens(readline): 617 """Tokenize a source reading Python code as unicode strings. 618 619 This has the same API as tokenize(), except that it expects the *readline* 620 callable to return str objects instead of bytes. 621 """ 622 return _tokenize(readline, None) 623 624def main(): 625 import argparse 626 627 # Helper error handling routines 628 def perror(message): 629 sys.stderr.write(message) 630 sys.stderr.write('\n') 631 632 def error(message, filename=None, location=None): 633 if location: 634 args = (filename,) + location + (message,) 635 perror("%s:%d:%d: error: %s" % args) 636 elif filename: 637 perror("%s: error: %s" % (filename, message)) 638 else: 639 perror("error: %s" % message) 640 sys.exit(1) 641 642 # Parse the arguments and options 643 parser = argparse.ArgumentParser(prog='python -m tokenize') 644 parser.add_argument(dest='filename', nargs='?', 645 metavar='filename.py', 646 help='the file to tokenize; defaults to stdin') 647 parser.add_argument('-e', '--exact', dest='exact', action='store_true', 648 help='display token names using the exact type') 649 args = parser.parse_args() 650 651 try: 652 # Tokenize the input 653 if args.filename: 654 filename = args.filename 655 with _builtin_open(filename, 'rb') as f: 656 tokens = list(tokenize(f.readline)) 657 else: 658 filename = "<stdin>" 659 tokens = _tokenize(sys.stdin.readline, None) 660 661 # Output the tokenization 662 for token in tokens: 663 token_type = token.type 664 if args.exact: 665 token_type = token.exact_type 666 token_range = "%d,%d-%d,%d:" % (token.start + token.end) 667 print("%-20s%-15s%-15r" % 668 (token_range, tok_name[token_type], token.string)) 669 except IndentationError as err: 670 line, column = err.args[1][1:3] 671 error(err.args[0], filename, (line, column)) 672 except TokenError as err: 673 line, column = err.args[1] 674 error(err.args[0], filename, (line, column)) 675 except SyntaxError as err: 676 error(err, filename) 677 except OSError as err: 678 error(err) 679 except KeyboardInterrupt: 680 print("interrupted\n") 681 except Exception as err: 682 perror("unexpected error: %s" % err) 683 raise 684 685def _generate_tokens_from_c_tokenizer(source): 686 """Tokenize a source reading Python code as unicode strings using the internal C tokenizer""" 687 import _tokenize as c_tokenizer 688 for info in c_tokenizer.TokenizerIter(source): 689 tok, type, lineno, end_lineno, col_off, end_col_off, line = info 690 yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line) 691 692 693if __name__ == "__main__": 694 main() 695