1from test import support 2from test.support import os_helper 3from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, 4 STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, 5 open as tokenize_open, Untokenizer, generate_tokens, 6 NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT) 7from io import BytesIO, StringIO 8import unittest 9from textwrap import dedent 10from unittest import TestCase, mock 11from test.test_grammar import (VALID_UNDERSCORE_LITERALS, 12 INVALID_UNDERSCORE_LITERALS) 13from test.support import os_helper 14from test.support.script_helper import run_test_script, make_script 15import os 16import token 17 18# Converts a source string into a list of textual representation 19# of the tokens such as: 20# ` NAME 'if' (1, 0) (1, 2)` 21# to make writing tests easier. 22def stringify_tokens_from_source(token_generator, source_string): 23 result = [] 24 num_lines = len(source_string.splitlines()) 25 missing_trailing_nl = source_string[-1] not in '\r\n' 26 27 for type, token, start, end, line in token_generator: 28 if type == ENDMARKER: 29 break 30 # Ignore the new line on the last line if the input lacks one 31 if missing_trailing_nl and type == NEWLINE and end[0] == num_lines: 32 continue 33 type = tok_name[type] 34 result.append(f" {type:10} {token!r:13} {start} {end}") 35 36 return result 37 38class TokenizeTest(TestCase): 39 # Tests for the tokenize module. 40 41 # The tests can be really simple. Given a small fragment of source 42 # code, print out a table with tokens. The ENDMARKER, ENCODING and 43 # final NEWLINE are omitted for brevity. 44 45 def check_tokenize(self, s, expected): 46 # Format the tokens in s in a table format. 47 # The ENDMARKER and final NEWLINE are omitted. 48 f = BytesIO(s.encode('utf-8')) 49 result = stringify_tokens_from_source(tokenize(f.readline), s) 50 self.assertEqual(result, 51 [" ENCODING 'utf-8' (0, 0) (0, 0)"] + 52 expected.rstrip().splitlines()) 53 54 def test_implicit_newline(self): 55 # Make sure that the tokenizer puts in an implicit NEWLINE 56 # when the input lacks a trailing new line. 
57 f = BytesIO("x".encode('utf-8')) 58 tokens = list(tokenize(f.readline)) 59 self.assertEqual(tokens[-2].type, NEWLINE) 60 self.assertEqual(tokens[-1].type, ENDMARKER) 61 62 def test_basic(self): 63 self.check_tokenize("1 + 1", """\ 64 NUMBER '1' (1, 0) (1, 1) 65 OP '+' (1, 2) (1, 3) 66 NUMBER '1' (1, 4) (1, 5) 67 """) 68 self.check_tokenize("if False:\n" 69 " # NL\n" 70 " \n" 71 " True = False # NEWLINE\n", """\ 72 NAME 'if' (1, 0) (1, 2) 73 NAME 'False' (1, 3) (1, 8) 74 OP ':' (1, 8) (1, 9) 75 NEWLINE '\\n' (1, 9) (1, 10) 76 COMMENT '# NL' (2, 4) (2, 8) 77 NL '\\n' (2, 8) (2, 9) 78 NL '\\n' (3, 4) (3, 5) 79 INDENT ' ' (4, 0) (4, 4) 80 NAME 'True' (4, 4) (4, 8) 81 OP '=' (4, 9) (4, 10) 82 NAME 'False' (4, 11) (4, 16) 83 COMMENT '# NEWLINE' (4, 17) (4, 26) 84 NEWLINE '\\n' (4, 26) (4, 27) 85 DEDENT '' (5, 0) (5, 0) 86 """) 87 indent_error_file = b"""\ 88def k(x): 89 x += 2 90 x += 5 91""" 92 readline = BytesIO(indent_error_file).readline 93 with self.assertRaisesRegex(IndentationError, 94 "unindent does not match any " 95 "outer indentation level"): 96 for tok in tokenize(readline): 97 pass 98 99 def test_int(self): 100 # Ordinary integers and binary operators 101 self.check_tokenize("0xff <= 255", """\ 102 NUMBER '0xff' (1, 0) (1, 4) 103 OP '<=' (1, 5) (1, 7) 104 NUMBER '255' (1, 8) (1, 11) 105 """) 106 self.check_tokenize("0b10 <= 255", """\ 107 NUMBER '0b10' (1, 0) (1, 4) 108 OP '<=' (1, 5) (1, 7) 109 NUMBER '255' (1, 8) (1, 11) 110 """) 111 self.check_tokenize("0o123 <= 0O123", """\ 112 NUMBER '0o123' (1, 0) (1, 5) 113 OP '<=' (1, 6) (1, 8) 114 NUMBER '0O123' (1, 9) (1, 14) 115 """) 116 self.check_tokenize("1234567 > ~0x15", """\ 117 NUMBER '1234567' (1, 0) (1, 7) 118 OP '>' (1, 8) (1, 9) 119 OP '~' (1, 10) (1, 11) 120 NUMBER '0x15' (1, 11) (1, 15) 121 """) 122 self.check_tokenize("2134568 != 1231515", """\ 123 NUMBER '2134568' (1, 0) (1, 7) 124 OP '!=' (1, 8) (1, 10) 125 NUMBER '1231515' (1, 11) (1, 18) 126 """) 127 self.check_tokenize("(-124561-1) & 200000000", """\ 128 OP '(' (1, 0) (1, 1) 129 OP '-' (1, 1) (1, 2) 130 NUMBER '124561' (1, 2) (1, 8) 131 OP '-' (1, 8) (1, 9) 132 NUMBER '1' (1, 9) (1, 10) 133 OP ')' (1, 10) (1, 11) 134 OP '&' (1, 12) (1, 13) 135 NUMBER '200000000' (1, 14) (1, 23) 136 """) 137 self.check_tokenize("0xdeadbeef != -1", """\ 138 NUMBER '0xdeadbeef' (1, 0) (1, 10) 139 OP '!=' (1, 11) (1, 13) 140 OP '-' (1, 14) (1, 15) 141 NUMBER '1' (1, 15) (1, 16) 142 """) 143 self.check_tokenize("0xdeadc0de & 12345", """\ 144 NUMBER '0xdeadc0de' (1, 0) (1, 10) 145 OP '&' (1, 11) (1, 12) 146 NUMBER '12345' (1, 13) (1, 18) 147 """) 148 self.check_tokenize("0xFF & 0x15 | 1234", """\ 149 NUMBER '0xFF' (1, 0) (1, 4) 150 OP '&' (1, 5) (1, 6) 151 NUMBER '0x15' (1, 7) (1, 11) 152 OP '|' (1, 12) (1, 13) 153 NUMBER '1234' (1, 14) (1, 18) 154 """) 155 156 def test_long(self): 157 # Long integers 158 self.check_tokenize("x = 0", """\ 159 NAME 'x' (1, 0) (1, 1) 160 OP '=' (1, 2) (1, 3) 161 NUMBER '0' (1, 4) (1, 5) 162 """) 163 self.check_tokenize("x = 0xfffffffffff", """\ 164 NAME 'x' (1, 0) (1, 1) 165 OP '=' (1, 2) (1, 3) 166 NUMBER '0xfffffffffff' (1, 4) (1, 17) 167 """) 168 self.check_tokenize("x = 123141242151251616110", """\ 169 NAME 'x' (1, 0) (1, 1) 170 OP '=' (1, 2) (1, 3) 171 NUMBER '123141242151251616110' (1, 4) (1, 25) 172 """) 173 self.check_tokenize("x = -15921590215012591", """\ 174 NAME 'x' (1, 0) (1, 1) 175 OP '=' (1, 2) (1, 3) 176 OP '-' (1, 4) (1, 5) 177 NUMBER '15921590215012591' (1, 5) (1, 22) 178 """) 179 180 def test_float(self): 181 # Floating point numbers 182 
self.check_tokenize("x = 3.14159", """\ 183 NAME 'x' (1, 0) (1, 1) 184 OP '=' (1, 2) (1, 3) 185 NUMBER '3.14159' (1, 4) (1, 11) 186 """) 187 self.check_tokenize("x = 314159.", """\ 188 NAME 'x' (1, 0) (1, 1) 189 OP '=' (1, 2) (1, 3) 190 NUMBER '314159.' (1, 4) (1, 11) 191 """) 192 self.check_tokenize("x = .314159", """\ 193 NAME 'x' (1, 0) (1, 1) 194 OP '=' (1, 2) (1, 3) 195 NUMBER '.314159' (1, 4) (1, 11) 196 """) 197 self.check_tokenize("x = 3e14159", """\ 198 NAME 'x' (1, 0) (1, 1) 199 OP '=' (1, 2) (1, 3) 200 NUMBER '3e14159' (1, 4) (1, 11) 201 """) 202 self.check_tokenize("x = 3E123", """\ 203 NAME 'x' (1, 0) (1, 1) 204 OP '=' (1, 2) (1, 3) 205 NUMBER '3E123' (1, 4) (1, 9) 206 """) 207 self.check_tokenize("x+y = 3e-1230", """\ 208 NAME 'x' (1, 0) (1, 1) 209 OP '+' (1, 1) (1, 2) 210 NAME 'y' (1, 2) (1, 3) 211 OP '=' (1, 4) (1, 5) 212 NUMBER '3e-1230' (1, 6) (1, 13) 213 """) 214 self.check_tokenize("x = 3.14e159", """\ 215 NAME 'x' (1, 0) (1, 1) 216 OP '=' (1, 2) (1, 3) 217 NUMBER '3.14e159' (1, 4) (1, 12) 218 """) 219 220 def test_underscore_literals(self): 221 def number_token(s): 222 f = BytesIO(s.encode('utf-8')) 223 for toktype, token, start, end, line in tokenize(f.readline): 224 if toktype == NUMBER: 225 return token 226 return 'invalid token' 227 for lit in VALID_UNDERSCORE_LITERALS: 228 if '(' in lit: 229 # this won't work with compound complex inputs 230 continue 231 self.assertEqual(number_token(lit), lit) 232 for lit in INVALID_UNDERSCORE_LITERALS: 233 self.assertNotEqual(number_token(lit), lit) 234 235 def test_string(self): 236 # String literals 237 self.check_tokenize("x = ''; y = \"\"", """\ 238 NAME 'x' (1, 0) (1, 1) 239 OP '=' (1, 2) (1, 3) 240 STRING "''" (1, 4) (1, 6) 241 OP ';' (1, 6) (1, 7) 242 NAME 'y' (1, 8) (1, 9) 243 OP '=' (1, 10) (1, 11) 244 STRING '""' (1, 12) (1, 14) 245 """) 246 self.check_tokenize("x = '\"'; y = \"'\"", """\ 247 NAME 'x' (1, 0) (1, 1) 248 OP '=' (1, 2) (1, 3) 249 STRING '\\'"\\'' (1, 4) (1, 7) 250 OP ';' (1, 7) (1, 8) 251 NAME 'y' (1, 9) (1, 10) 252 OP '=' (1, 11) (1, 12) 253 STRING '"\\'"' (1, 13) (1, 16) 254 """) 255 self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\ 256 NAME 'x' (1, 0) (1, 1) 257 OP '=' (1, 2) (1, 3) 258 STRING '"doesn\\'t "' (1, 4) (1, 14) 259 NAME 'shrink' (1, 14) (1, 20) 260 STRING '", does it"' (1, 20) (1, 31) 261 """) 262 self.check_tokenize("x = 'abc' + 'ABC'", """\ 263 NAME 'x' (1, 0) (1, 1) 264 OP '=' (1, 2) (1, 3) 265 STRING "'abc'" (1, 4) (1, 9) 266 OP '+' (1, 10) (1, 11) 267 STRING "'ABC'" (1, 12) (1, 17) 268 """) 269 self.check_tokenize('y = "ABC" + "ABC"', """\ 270 NAME 'y' (1, 0) (1, 1) 271 OP '=' (1, 2) (1, 3) 272 STRING '"ABC"' (1, 4) (1, 9) 273 OP '+' (1, 10) (1, 11) 274 STRING '"ABC"' (1, 12) (1, 17) 275 """) 276 self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\ 277 NAME 'x' (1, 0) (1, 1) 278 OP '=' (1, 2) (1, 3) 279 STRING "r'abc'" (1, 4) (1, 10) 280 OP '+' (1, 11) (1, 12) 281 STRING "r'ABC'" (1, 13) (1, 19) 282 OP '+' (1, 20) (1, 21) 283 STRING "R'ABC'" (1, 22) (1, 28) 284 OP '+' (1, 29) (1, 30) 285 STRING "R'ABC'" (1, 31) (1, 37) 286 """) 287 self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\ 288 NAME 'y' (1, 0) (1, 1) 289 OP '=' (1, 2) (1, 3) 290 STRING 'r"abc"' (1, 4) (1, 10) 291 OP '+' (1, 11) (1, 12) 292 STRING 'r"ABC"' (1, 13) (1, 19) 293 OP '+' (1, 20) (1, 21) 294 STRING 'R"ABC"' (1, 22) (1, 28) 295 OP '+' (1, 29) (1, 30) 296 STRING 'R"ABC"' (1, 31) (1, 37) 297 """) 298 299 self.check_tokenize("u'abc' + U'abc'", """\ 300 STRING "u'abc'" (1, 0) (1, 
6) 301 OP '+' (1, 7) (1, 8) 302 STRING "U'abc'" (1, 9) (1, 15) 303 """) 304 self.check_tokenize('u"abc" + U"abc"', """\ 305 STRING 'u"abc"' (1, 0) (1, 6) 306 OP '+' (1, 7) (1, 8) 307 STRING 'U"abc"' (1, 9) (1, 15) 308 """) 309 310 self.check_tokenize("b'abc' + B'abc'", """\ 311 STRING "b'abc'" (1, 0) (1, 6) 312 OP '+' (1, 7) (1, 8) 313 STRING "B'abc'" (1, 9) (1, 15) 314 """) 315 self.check_tokenize('b"abc" + B"abc"', """\ 316 STRING 'b"abc"' (1, 0) (1, 6) 317 OP '+' (1, 7) (1, 8) 318 STRING 'B"abc"' (1, 9) (1, 15) 319 """) 320 self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\ 321 STRING "br'abc'" (1, 0) (1, 7) 322 OP '+' (1, 8) (1, 9) 323 STRING "bR'abc'" (1, 10) (1, 17) 324 OP '+' (1, 18) (1, 19) 325 STRING "Br'abc'" (1, 20) (1, 27) 326 OP '+' (1, 28) (1, 29) 327 STRING "BR'abc'" (1, 30) (1, 37) 328 """) 329 self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\ 330 STRING 'br"abc"' (1, 0) (1, 7) 331 OP '+' (1, 8) (1, 9) 332 STRING 'bR"abc"' (1, 10) (1, 17) 333 OP '+' (1, 18) (1, 19) 334 STRING 'Br"abc"' (1, 20) (1, 27) 335 OP '+' (1, 28) (1, 29) 336 STRING 'BR"abc"' (1, 30) (1, 37) 337 """) 338 self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\ 339 STRING "rb'abc'" (1, 0) (1, 7) 340 OP '+' (1, 8) (1, 9) 341 STRING "rB'abc'" (1, 10) (1, 17) 342 OP '+' (1, 18) (1, 19) 343 STRING "Rb'abc'" (1, 20) (1, 27) 344 OP '+' (1, 28) (1, 29) 345 STRING "RB'abc'" (1, 30) (1, 37) 346 """) 347 self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\ 348 STRING 'rb"abc"' (1, 0) (1, 7) 349 OP '+' (1, 8) (1, 9) 350 STRING 'rB"abc"' (1, 10) (1, 17) 351 OP '+' (1, 18) (1, 19) 352 STRING 'Rb"abc"' (1, 20) (1, 27) 353 OP '+' (1, 28) (1, 29) 354 STRING 'RB"abc"' (1, 30) (1, 37) 355 """) 356 # Check 0, 1, and 2 character string prefixes. 
357 self.check_tokenize(r'"a\ 358de\ 359fg"', """\ 360 STRING '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3) 361 """) 362 self.check_tokenize(r'u"a\ 363de"', """\ 364 STRING 'u"a\\\\\\nde"\' (1, 0) (2, 3) 365 """) 366 self.check_tokenize(r'rb"a\ 367d"', """\ 368 STRING 'rb"a\\\\\\nd"\' (1, 0) (2, 2) 369 """) 370 self.check_tokenize(r'"""a\ 371b"""', """\ 372 STRING '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4) 373 """) 374 self.check_tokenize(r'u"""a\ 375b"""', """\ 376 STRING 'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4) 377 """) 378 self.check_tokenize(r'rb"""a\ 379b\ 380c"""', """\ 381 STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) 382 """) 383 self.check_tokenize('f"abc"', """\ 384 STRING 'f"abc"' (1, 0) (1, 6) 385 """) 386 self.check_tokenize('fR"a{b}c"', """\ 387 STRING 'fR"a{b}c"' (1, 0) (1, 9) 388 """) 389 self.check_tokenize('f"""abc"""', """\ 390 STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10) 391 """) 392 self.check_tokenize(r'f"abc\ 393def"', """\ 394 STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4) 395 """) 396 self.check_tokenize(r'Rf"abc\ 397def"', """\ 398 STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4) 399 """) 400 401 def test_function(self): 402 self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\ 403 NAME 'def' (1, 0) (1, 3) 404 NAME 'd22' (1, 4) (1, 7) 405 OP '(' (1, 7) (1, 8) 406 NAME 'a' (1, 8) (1, 9) 407 OP ',' (1, 9) (1, 10) 408 NAME 'b' (1, 11) (1, 12) 409 OP ',' (1, 12) (1, 13) 410 NAME 'c' (1, 14) (1, 15) 411 OP '=' (1, 15) (1, 16) 412 NUMBER '2' (1, 16) (1, 17) 413 OP ',' (1, 17) (1, 18) 414 NAME 'd' (1, 19) (1, 20) 415 OP '=' (1, 20) (1, 21) 416 NUMBER '2' (1, 21) (1, 22) 417 OP ',' (1, 22) (1, 23) 418 OP '*' (1, 24) (1, 25) 419 NAME 'k' (1, 25) (1, 26) 420 OP ')' (1, 26) (1, 27) 421 OP ':' (1, 27) (1, 28) 422 NAME 'pass' (1, 29) (1, 33) 423 """) 424 self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\ 425 NAME 'def' (1, 0) (1, 3) 426 NAME 'd01v_' (1, 4) (1, 9) 427 OP '(' (1, 9) (1, 10) 428 NAME 'a' (1, 10) (1, 11) 429 OP '=' (1, 11) (1, 12) 430 NUMBER '1' (1, 12) (1, 13) 431 OP ',' (1, 13) (1, 14) 432 OP '*' (1, 15) (1, 16) 433 NAME 'k' (1, 16) (1, 17) 434 OP ',' (1, 17) (1, 18) 435 OP '**' (1, 19) (1, 21) 436 NAME 'w' (1, 21) (1, 22) 437 OP ')' (1, 22) (1, 23) 438 OP ':' (1, 23) (1, 24) 439 NAME 'pass' (1, 25) (1, 29) 440 """) 441 self.check_tokenize("def d23(a: str, b: int=3) -> int: pass", """\ 442 NAME 'def' (1, 0) (1, 3) 443 NAME 'd23' (1, 4) (1, 7) 444 OP '(' (1, 7) (1, 8) 445 NAME 'a' (1, 8) (1, 9) 446 OP ':' (1, 9) (1, 10) 447 NAME 'str' (1, 11) (1, 14) 448 OP ',' (1, 14) (1, 15) 449 NAME 'b' (1, 16) (1, 17) 450 OP ':' (1, 17) (1, 18) 451 NAME 'int' (1, 19) (1, 22) 452 OP '=' (1, 22) (1, 23) 453 NUMBER '3' (1, 23) (1, 24) 454 OP ')' (1, 24) (1, 25) 455 OP '->' (1, 26) (1, 28) 456 NAME 'int' (1, 29) (1, 32) 457 OP ':' (1, 32) (1, 33) 458 NAME 'pass' (1, 34) (1, 38) 459 """) 460 461 def test_comparison(self): 462 # Comparison 463 self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " 464 "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\ 465 NAME 'if' (1, 0) (1, 2) 466 NUMBER '1' (1, 3) (1, 4) 467 OP '<' (1, 5) (1, 6) 468 NUMBER '1' (1, 7) (1, 8) 469 OP '>' (1, 9) (1, 10) 470 NUMBER '1' (1, 11) (1, 12) 471 OP '==' (1, 13) (1, 15) 472 NUMBER '1' (1, 16) (1, 17) 473 OP '>=' (1, 18) (1, 20) 474 NUMBER '5' (1, 21) (1, 22) 475 OP '<=' (1, 23) (1, 25) 476 NUMBER '0x15' (1, 26) (1, 30) 477 OP '<=' (1, 31) (1, 33) 478 NUMBER '0x12' (1, 34) (1, 38) 479 OP '!=' (1, 39) (1, 41) 480 NUMBER '1' (1, 42) (1, 43) 481 NAME 'and' (1, 44) (1, 47) 482 NUMBER '5' (1, 48) (1, 49) 483 NAME 
'in' (1, 50) (1, 52) 484 NUMBER '1' (1, 53) (1, 54) 485 NAME 'not' (1, 55) (1, 58) 486 NAME 'in' (1, 59) (1, 61) 487 NUMBER '1' (1, 62) (1, 63) 488 NAME 'is' (1, 64) (1, 66) 489 NUMBER '1' (1, 67) (1, 68) 490 NAME 'or' (1, 69) (1, 71) 491 NUMBER '5' (1, 72) (1, 73) 492 NAME 'is' (1, 74) (1, 76) 493 NAME 'not' (1, 77) (1, 80) 494 NUMBER '1' (1, 81) (1, 82) 495 OP ':' (1, 82) (1, 83) 496 NAME 'pass' (1, 84) (1, 88) 497 """) 498 499 def test_shift(self): 500 # Shift 501 self.check_tokenize("x = 1 << 1 >> 5", """\ 502 NAME 'x' (1, 0) (1, 1) 503 OP '=' (1, 2) (1, 3) 504 NUMBER '1' (1, 4) (1, 5) 505 OP '<<' (1, 6) (1, 8) 506 NUMBER '1' (1, 9) (1, 10) 507 OP '>>' (1, 11) (1, 13) 508 NUMBER '5' (1, 14) (1, 15) 509 """) 510 511 def test_additive(self): 512 # Additive 513 self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\ 514 NAME 'x' (1, 0) (1, 1) 515 OP '=' (1, 2) (1, 3) 516 NUMBER '1' (1, 4) (1, 5) 517 OP '-' (1, 6) (1, 7) 518 NAME 'y' (1, 8) (1, 9) 519 OP '+' (1, 10) (1, 11) 520 NUMBER '15' (1, 12) (1, 14) 521 OP '-' (1, 15) (1, 16) 522 NUMBER '1' (1, 17) (1, 18) 523 OP '+' (1, 19) (1, 20) 524 NUMBER '0x124' (1, 21) (1, 26) 525 OP '+' (1, 27) (1, 28) 526 NAME 'z' (1, 29) (1, 30) 527 OP '+' (1, 31) (1, 32) 528 NAME 'a' (1, 33) (1, 34) 529 OP '[' (1, 34) (1, 35) 530 NUMBER '5' (1, 35) (1, 36) 531 OP ']' (1, 36) (1, 37) 532 """) 533 534 def test_multiplicative(self): 535 # Multiplicative 536 self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\ 537 NAME 'x' (1, 0) (1, 1) 538 OP '=' (1, 2) (1, 3) 539 NUMBER '1' (1, 4) (1, 5) 540 OP '//' (1, 5) (1, 7) 541 NUMBER '1' (1, 7) (1, 8) 542 OP '*' (1, 8) (1, 9) 543 NUMBER '1' (1, 9) (1, 10) 544 OP '/' (1, 10) (1, 11) 545 NUMBER '5' (1, 11) (1, 12) 546 OP '*' (1, 12) (1, 13) 547 NUMBER '12' (1, 13) (1, 15) 548 OP '%' (1, 15) (1, 16) 549 NUMBER '0x12' (1, 16) (1, 20) 550 OP '@' (1, 20) (1, 21) 551 NUMBER '42' (1, 21) (1, 23) 552 """) 553 554 def test_unary(self): 555 # Unary 556 self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\ 557 OP '~' (1, 0) (1, 1) 558 NUMBER '1' (1, 1) (1, 2) 559 OP '^' (1, 3) (1, 4) 560 NUMBER '1' (1, 5) (1, 6) 561 OP '&' (1, 7) (1, 8) 562 NUMBER '1' (1, 9) (1, 10) 563 OP '|' (1, 11) (1, 12) 564 NUMBER '1' (1, 12) (1, 13) 565 OP '^' (1, 14) (1, 15) 566 OP '-' (1, 16) (1, 17) 567 NUMBER '1' (1, 17) (1, 18) 568 """) 569 self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\ 570 OP '-' (1, 0) (1, 1) 571 NUMBER '1' (1, 1) (1, 2) 572 OP '*' (1, 2) (1, 3) 573 NUMBER '1' (1, 3) (1, 4) 574 OP '/' (1, 4) (1, 5) 575 NUMBER '1' (1, 5) (1, 6) 576 OP '+' (1, 6) (1, 7) 577 NUMBER '1' (1, 7) (1, 8) 578 OP '*' (1, 8) (1, 9) 579 NUMBER '1' (1, 9) (1, 10) 580 OP '//' (1, 10) (1, 12) 581 NUMBER '1' (1, 12) (1, 13) 582 OP '-' (1, 14) (1, 15) 583 OP '-' (1, 16) (1, 17) 584 OP '-' (1, 17) (1, 18) 585 OP '-' (1, 18) (1, 19) 586 NUMBER '1' (1, 19) (1, 20) 587 OP '**' (1, 20) (1, 22) 588 NUMBER '1' (1, 22) (1, 23) 589 """) 590 591 def test_selector(self): 592 # Selector 593 self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\ 594 NAME 'import' (1, 0) (1, 6) 595 NAME 'sys' (1, 7) (1, 10) 596 OP ',' (1, 10) (1, 11) 597 NAME 'time' (1, 12) (1, 16) 598 NEWLINE '\\n' (1, 16) (1, 17) 599 NAME 'x' (2, 0) (2, 1) 600 OP '=' (2, 2) (2, 3) 601 NAME 'sys' (2, 4) (2, 7) 602 OP '.' (2, 7) (2, 8) 603 NAME 'modules' (2, 8) (2, 15) 604 OP '[' (2, 15) (2, 16) 605 STRING "'time'" (2, 16) (2, 22) 606 OP ']' (2, 22) (2, 23) 607 OP '.' 
(2, 23) (2, 24) 608 NAME 'time' (2, 24) (2, 28) 609 OP '(' (2, 28) (2, 29) 610 OP ')' (2, 29) (2, 30) 611 """) 612 613 def test_method(self): 614 # Methods 615 self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\ 616 OP '@' (1, 0) (1, 1) 617 NAME 'staticmethod' (1, 1) (1, 13) 618 NEWLINE '\\n' (1, 13) (1, 14) 619 NAME 'def' (2, 0) (2, 3) 620 NAME 'foo' (2, 4) (2, 7) 621 OP '(' (2, 7) (2, 8) 622 NAME 'x' (2, 8) (2, 9) 623 OP ',' (2, 9) (2, 10) 624 NAME 'y' (2, 10) (2, 11) 625 OP ')' (2, 11) (2, 12) 626 OP ':' (2, 12) (2, 13) 627 NAME 'pass' (2, 14) (2, 18) 628 """) 629 630 def test_tabs(self): 631 # Evil tabs 632 self.check_tokenize("def f():\n" 633 "\tif x\n" 634 " \tpass", """\ 635 NAME 'def' (1, 0) (1, 3) 636 NAME 'f' (1, 4) (1, 5) 637 OP '(' (1, 5) (1, 6) 638 OP ')' (1, 6) (1, 7) 639 OP ':' (1, 7) (1, 8) 640 NEWLINE '\\n' (1, 8) (1, 9) 641 INDENT '\\t' (2, 0) (2, 1) 642 NAME 'if' (2, 1) (2, 3) 643 NAME 'x' (2, 4) (2, 5) 644 NEWLINE '\\n' (2, 5) (2, 6) 645 INDENT ' \\t' (3, 0) (3, 9) 646 NAME 'pass' (3, 9) (3, 13) 647 DEDENT '' (4, 0) (4, 0) 648 DEDENT '' (4, 0) (4, 0) 649 """) 650 651 def test_non_ascii_identifiers(self): 652 # Non-ascii identifiers 653 self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\ 654 NAME 'Örter' (1, 0) (1, 5) 655 OP '=' (1, 6) (1, 7) 656 STRING "'places'" (1, 8) (1, 16) 657 NEWLINE '\\n' (1, 16) (1, 17) 658 NAME 'grün' (2, 0) (2, 4) 659 OP '=' (2, 5) (2, 6) 660 STRING "'green'" (2, 7) (2, 14) 661 """) 662 663 def test_unicode(self): 664 # Legacy unicode literals: 665 self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\ 666 NAME 'Örter' (1, 0) (1, 5) 667 OP '=' (1, 6) (1, 7) 668 STRING "u'places'" (1, 8) (1, 17) 669 NEWLINE '\\n' (1, 17) (1, 18) 670 NAME 'grün' (2, 0) (2, 4) 671 OP '=' (2, 5) (2, 6) 672 STRING "U'green'" (2, 7) (2, 15) 673 """) 674 675 def test_async(self): 676 # Async/await extension: 677 self.check_tokenize("async = 1", """\ 678 NAME 'async' (1, 0) (1, 5) 679 OP '=' (1, 6) (1, 7) 680 NUMBER '1' (1, 8) (1, 9) 681 """) 682 683 self.check_tokenize("a = (async = 1)", """\ 684 NAME 'a' (1, 0) (1, 1) 685 OP '=' (1, 2) (1, 3) 686 OP '(' (1, 4) (1, 5) 687 NAME 'async' (1, 5) (1, 10) 688 OP '=' (1, 11) (1, 12) 689 NUMBER '1' (1, 13) (1, 14) 690 OP ')' (1, 14) (1, 15) 691 """) 692 693 self.check_tokenize("async()", """\ 694 NAME 'async' (1, 0) (1, 5) 695 OP '(' (1, 5) (1, 6) 696 OP ')' (1, 6) (1, 7) 697 """) 698 699 self.check_tokenize("class async(Bar):pass", """\ 700 NAME 'class' (1, 0) (1, 5) 701 NAME 'async' (1, 6) (1, 11) 702 OP '(' (1, 11) (1, 12) 703 NAME 'Bar' (1, 12) (1, 15) 704 OP ')' (1, 15) (1, 16) 705 OP ':' (1, 16) (1, 17) 706 NAME 'pass' (1, 17) (1, 21) 707 """) 708 709 self.check_tokenize("class async:pass", """\ 710 NAME 'class' (1, 0) (1, 5) 711 NAME 'async' (1, 6) (1, 11) 712 OP ':' (1, 11) (1, 12) 713 NAME 'pass' (1, 12) (1, 16) 714 """) 715 716 self.check_tokenize("await = 1", """\ 717 NAME 'await' (1, 0) (1, 5) 718 OP '=' (1, 6) (1, 7) 719 NUMBER '1' (1, 8) (1, 9) 720 """) 721 722 self.check_tokenize("foo.async", """\ 723 NAME 'foo' (1, 0) (1, 3) 724 OP '.' 
(1, 3) (1, 4) 725 NAME 'async' (1, 4) (1, 9) 726 """) 727 728 self.check_tokenize("async for a in b: pass", """\ 729 NAME 'async' (1, 0) (1, 5) 730 NAME 'for' (1, 6) (1, 9) 731 NAME 'a' (1, 10) (1, 11) 732 NAME 'in' (1, 12) (1, 14) 733 NAME 'b' (1, 15) (1, 16) 734 OP ':' (1, 16) (1, 17) 735 NAME 'pass' (1, 18) (1, 22) 736 """) 737 738 self.check_tokenize("async with a as b: pass", """\ 739 NAME 'async' (1, 0) (1, 5) 740 NAME 'with' (1, 6) (1, 10) 741 NAME 'a' (1, 11) (1, 12) 742 NAME 'as' (1, 13) (1, 15) 743 NAME 'b' (1, 16) (1, 17) 744 OP ':' (1, 17) (1, 18) 745 NAME 'pass' (1, 19) (1, 23) 746 """) 747 748 self.check_tokenize("async.foo", """\ 749 NAME 'async' (1, 0) (1, 5) 750 OP '.' (1, 5) (1, 6) 751 NAME 'foo' (1, 6) (1, 9) 752 """) 753 754 self.check_tokenize("async", """\ 755 NAME 'async' (1, 0) (1, 5) 756 """) 757 758 self.check_tokenize("async\n#comment\nawait", """\ 759 NAME 'async' (1, 0) (1, 5) 760 NEWLINE '\\n' (1, 5) (1, 6) 761 COMMENT '#comment' (2, 0) (2, 8) 762 NL '\\n' (2, 8) (2, 9) 763 NAME 'await' (3, 0) (3, 5) 764 """) 765 766 self.check_tokenize("async\n...\nawait", """\ 767 NAME 'async' (1, 0) (1, 5) 768 NEWLINE '\\n' (1, 5) (1, 6) 769 OP '...' (2, 0) (2, 3) 770 NEWLINE '\\n' (2, 3) (2, 4) 771 NAME 'await' (3, 0) (3, 5) 772 """) 773 774 self.check_tokenize("async\nawait", """\ 775 NAME 'async' (1, 0) (1, 5) 776 NEWLINE '\\n' (1, 5) (1, 6) 777 NAME 'await' (2, 0) (2, 5) 778 """) 779 780 self.check_tokenize("foo.async + 1", """\ 781 NAME 'foo' (1, 0) (1, 3) 782 OP '.' (1, 3) (1, 4) 783 NAME 'async' (1, 4) (1, 9) 784 OP '+' (1, 10) (1, 11) 785 NUMBER '1' (1, 12) (1, 13) 786 """) 787 788 self.check_tokenize("async def foo(): pass", """\ 789 NAME 'async' (1, 0) (1, 5) 790 NAME 'def' (1, 6) (1, 9) 791 NAME 'foo' (1, 10) (1, 13) 792 OP '(' (1, 13) (1, 14) 793 OP ')' (1, 14) (1, 15) 794 OP ':' (1, 15) (1, 16) 795 NAME 'pass' (1, 17) (1, 21) 796 """) 797 798 self.check_tokenize('''\ 799async def foo(): 800 def foo(await): 801 await = 1 802 if 1: 803 await 804async += 1 805''', """\ 806 NAME 'async' (1, 0) (1, 5) 807 NAME 'def' (1, 6) (1, 9) 808 NAME 'foo' (1, 10) (1, 13) 809 OP '(' (1, 13) (1, 14) 810 OP ')' (1, 14) (1, 15) 811 OP ':' (1, 15) (1, 16) 812 NEWLINE '\\n' (1, 16) (1, 17) 813 INDENT ' ' (2, 0) (2, 2) 814 NAME 'def' (2, 2) (2, 5) 815 NAME 'foo' (2, 6) (2, 9) 816 OP '(' (2, 9) (2, 10) 817 NAME 'await' (2, 10) (2, 15) 818 OP ')' (2, 15) (2, 16) 819 OP ':' (2, 16) (2, 17) 820 NEWLINE '\\n' (2, 17) (2, 18) 821 INDENT ' ' (3, 0) (3, 4) 822 NAME 'await' (3, 4) (3, 9) 823 OP '=' (3, 10) (3, 11) 824 NUMBER '1' (3, 12) (3, 13) 825 NEWLINE '\\n' (3, 13) (3, 14) 826 DEDENT '' (4, 2) (4, 2) 827 NAME 'if' (4, 2) (4, 4) 828 NUMBER '1' (4, 5) (4, 6) 829 OP ':' (4, 6) (4, 7) 830 NEWLINE '\\n' (4, 7) (4, 8) 831 INDENT ' ' (5, 0) (5, 4) 832 NAME 'await' (5, 4) (5, 9) 833 NEWLINE '\\n' (5, 9) (5, 10) 834 DEDENT '' (6, 0) (6, 0) 835 DEDENT '' (6, 0) (6, 0) 836 NAME 'async' (6, 0) (6, 5) 837 OP '+=' (6, 6) (6, 8) 838 NUMBER '1' (6, 9) (6, 10) 839 NEWLINE '\\n' (6, 10) (6, 11) 840 """) 841 842 self.check_tokenize('''\ 843async def foo(): 844 async for i in 1: pass''', """\ 845 NAME 'async' (1, 0) (1, 5) 846 NAME 'def' (1, 6) (1, 9) 847 NAME 'foo' (1, 10) (1, 13) 848 OP '(' (1, 13) (1, 14) 849 OP ')' (1, 14) (1, 15) 850 OP ':' (1, 15) (1, 16) 851 NEWLINE '\\n' (1, 16) (1, 17) 852 INDENT ' ' (2, 0) (2, 2) 853 NAME 'async' (2, 2) (2, 7) 854 NAME 'for' (2, 8) (2, 11) 855 NAME 'i' (2, 12) (2, 13) 856 NAME 'in' (2, 14) (2, 16) 857 NUMBER '1' (2, 17) (2, 18) 858 OP ':' (2, 18) (2, 19) 859 NAME 
'pass' (2, 20) (2, 24) 860 DEDENT '' (3, 0) (3, 0) 861 """) 862 863 self.check_tokenize('''async def foo(async): await''', """\ 864 NAME 'async' (1, 0) (1, 5) 865 NAME 'def' (1, 6) (1, 9) 866 NAME 'foo' (1, 10) (1, 13) 867 OP '(' (1, 13) (1, 14) 868 NAME 'async' (1, 14) (1, 19) 869 OP ')' (1, 19) (1, 20) 870 OP ':' (1, 20) (1, 21) 871 NAME 'await' (1, 22) (1, 27) 872 """) 873 874 self.check_tokenize('''\ 875def f(): 876 877 def baz(): pass 878 async def bar(): pass 879 880 await = 2''', """\ 881 NAME 'def' (1, 0) (1, 3) 882 NAME 'f' (1, 4) (1, 5) 883 OP '(' (1, 5) (1, 6) 884 OP ')' (1, 6) (1, 7) 885 OP ':' (1, 7) (1, 8) 886 NEWLINE '\\n' (1, 8) (1, 9) 887 NL '\\n' (2, 0) (2, 1) 888 INDENT ' ' (3, 0) (3, 2) 889 NAME 'def' (3, 2) (3, 5) 890 NAME 'baz' (3, 6) (3, 9) 891 OP '(' (3, 9) (3, 10) 892 OP ')' (3, 10) (3, 11) 893 OP ':' (3, 11) (3, 12) 894 NAME 'pass' (3, 13) (3, 17) 895 NEWLINE '\\n' (3, 17) (3, 18) 896 NAME 'async' (4, 2) (4, 7) 897 NAME 'def' (4, 8) (4, 11) 898 NAME 'bar' (4, 12) (4, 15) 899 OP '(' (4, 15) (4, 16) 900 OP ')' (4, 16) (4, 17) 901 OP ':' (4, 17) (4, 18) 902 NAME 'pass' (4, 19) (4, 23) 903 NEWLINE '\\n' (4, 23) (4, 24) 904 NL '\\n' (5, 0) (5, 1) 905 NAME 'await' (6, 2) (6, 7) 906 OP '=' (6, 8) (6, 9) 907 NUMBER '2' (6, 10) (6, 11) 908 DEDENT '' (7, 0) (7, 0) 909 """) 910 911 self.check_tokenize('''\ 912async def f(): 913 914 def baz(): pass 915 async def bar(): pass 916 917 await = 2''', """\ 918 NAME 'async' (1, 0) (1, 5) 919 NAME 'def' (1, 6) (1, 9) 920 NAME 'f' (1, 10) (1, 11) 921 OP '(' (1, 11) (1, 12) 922 OP ')' (1, 12) (1, 13) 923 OP ':' (1, 13) (1, 14) 924 NEWLINE '\\n' (1, 14) (1, 15) 925 NL '\\n' (2, 0) (2, 1) 926 INDENT ' ' (3, 0) (3, 2) 927 NAME 'def' (3, 2) (3, 5) 928 NAME 'baz' (3, 6) (3, 9) 929 OP '(' (3, 9) (3, 10) 930 OP ')' (3, 10) (3, 11) 931 OP ':' (3, 11) (3, 12) 932 NAME 'pass' (3, 13) (3, 17) 933 NEWLINE '\\n' (3, 17) (3, 18) 934 NAME 'async' (4, 2) (4, 7) 935 NAME 'def' (4, 8) (4, 11) 936 NAME 'bar' (4, 12) (4, 15) 937 OP '(' (4, 15) (4, 16) 938 OP ')' (4, 16) (4, 17) 939 OP ':' (4, 17) (4, 18) 940 NAME 'pass' (4, 19) (4, 23) 941 NEWLINE '\\n' (4, 23) (4, 24) 942 NL '\\n' (5, 0) (5, 1) 943 NAME 'await' (6, 2) (6, 7) 944 OP '=' (6, 8) (6, 9) 945 NUMBER '2' (6, 10) (6, 11) 946 DEDENT '' (7, 0) (7, 0) 947 """) 948 949class GenerateTokensTest(TokenizeTest): 950 def check_tokenize(self, s, expected): 951 # Format the tokens in s in a table format. 952 # The ENDMARKER and final NEWLINE are omitted. 953 f = StringIO(s) 954 result = stringify_tokens_from_source(generate_tokens(f.readline), s) 955 self.assertEqual(result, expected.rstrip().splitlines()) 956 957 958def decistmt(s): 959 result = [] 960 g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string 961 for toknum, tokval, _, _, _ in g: 962 if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens 963 result.extend([ 964 (NAME, 'Decimal'), 965 (OP, '('), 966 (STRING, repr(tokval)), 967 (OP, ')') 968 ]) 969 else: 970 result.append((toknum, tokval)) 971 return untokenize(result).decode('utf-8') 972 973class TestMisc(TestCase): 974 975 def test_decistmt(self): 976 # Substitute Decimals for floats in a string of statements. 977 # This is an example from the docs. 978 979 from decimal import Decimal 980 s = '+21.3e-5*-.1234/81.7' 981 self.assertEqual(decistmt(s), 982 "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')") 983 984 # The format of the exponent is inherited from the platform C library. 985 # Known cases are "e-007" (Windows) and "e-07" (not Windows). 
Since 986 # we're only showing 11 digits, and the 12th isn't close to 5, the 987 # rest of the output should be platform-independent. 988 self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7') 989 990 # Output from calculations with Decimal should be identical across all 991 # platforms. 992 self.assertEqual(eval(decistmt(s)), 993 Decimal('-3.217160342717258261933904529E-7')) 994 995 996class TestTokenizerAdheresToPep0263(TestCase): 997 """ 998 Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263. 999 """ 1000 1001 def _testFile(self, filename): 1002 path = os.path.join(os.path.dirname(__file__), filename) 1003 TestRoundtrip.check_roundtrip(self, open(path, 'rb')) 1004 1005 def test_utf8_coding_cookie_and_no_utf8_bom(self): 1006 f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt' 1007 self._testFile(f) 1008 1009 def test_latin1_coding_cookie_and_utf8_bom(self): 1010 """ 1011 As per PEP 0263, if a file starts with a utf-8 BOM signature, the only 1012 allowed encoding for the comment is 'utf-8'. The text file used in 1013 this test starts with a BOM signature, but specifies latin1 as the 1014 coding, so verify that a SyntaxError is raised, which matches the 1015 behaviour of the interpreter when it encounters a similar condition. 1016 """ 1017 f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt' 1018 self.assertRaises(SyntaxError, self._testFile, f) 1019 1020 def test_no_coding_cookie_and_utf8_bom(self): 1021 f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt' 1022 self._testFile(f) 1023 1024 def test_utf8_coding_cookie_and_utf8_bom(self): 1025 f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt' 1026 self._testFile(f) 1027 1028 def test_bad_coding_cookie(self): 1029 self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py') 1030 self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py') 1031 1032 1033class Test_Tokenize(TestCase): 1034 1035 def test__tokenize_decodes_with_specified_encoding(self): 1036 literal = '"ЉЊЈЁЂ"' 1037 line = literal.encode('utf-8') 1038 first = False 1039 def readline(): 1040 nonlocal first 1041 if not first: 1042 first = True 1043 return line 1044 else: 1045 return b'' 1046 1047 # skip the initial encoding token and the end tokens 1048 tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2] 1049 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] 1050 self.assertEqual(tokens, expected_tokens, 1051 "bytes not decoded with encoding") 1052 1053 def test__tokenize_does_not_decode_with_encoding_none(self): 1054 literal = '"ЉЊЈЁЂ"' 1055 first = False 1056 def readline(): 1057 nonlocal first 1058 if not first: 1059 first = True 1060 return literal 1061 else: 1062 return b'' 1063 1064 # skip the end tokens 1065 tokens = list(_tokenize(readline, encoding=None))[:-2] 1066 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] 1067 self.assertEqual(tokens, expected_tokens, 1068 "string not tokenized when encoding is None") 1069 1070 1071class TestDetectEncoding(TestCase): 1072 1073 def get_readline(self, lines): 1074 index = 0 1075 def readline(): 1076 nonlocal index 1077 if index == len(lines): 1078 raise StopIteration 1079 line = lines[index] 1080 index += 1 1081 return line 1082 return readline 1083 1084 def test_no_bom_no_encoding_cookie(self): 1085 lines = ( 1086 b'# something\n', 1087 b'print(something)\n', 1088 b'do_something(else)\n' 1089 ) 1090 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) 1091 self.assertEqual(encoding, 'utf-8') 1092 
self.assertEqual(consumed_lines, list(lines[:2])) 1093 1094 def test_bom_no_cookie(self): 1095 lines = ( 1096 b'\xef\xbb\xbf# something\n', 1097 b'print(something)\n', 1098 b'do_something(else)\n' 1099 ) 1100 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) 1101 self.assertEqual(encoding, 'utf-8-sig') 1102 self.assertEqual(consumed_lines, 1103 [b'# something\n', b'print(something)\n']) 1104 1105 def test_cookie_first_line_no_bom(self): 1106 lines = ( 1107 b'# -*- coding: latin-1 -*-\n', 1108 b'print(something)\n', 1109 b'do_something(else)\n' 1110 ) 1111 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) 1112 self.assertEqual(encoding, 'iso-8859-1') 1113 self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n']) 1114 1115 def test_matched_bom_and_cookie_first_line(self): 1116 lines = ( 1117 b'\xef\xbb\xbf# coding=utf-8\n', 1118 b'print(something)\n', 1119 b'do_something(else)\n' 1120 ) 1121 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) 1122 self.assertEqual(encoding, 'utf-8-sig') 1123 self.assertEqual(consumed_lines, [b'# coding=utf-8\n']) 1124 1125 def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self): 1126 lines = ( 1127 b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n', 1128 b'print(something)\n', 1129 b'do_something(else)\n' 1130 ) 1131 readline = self.get_readline(lines) 1132 self.assertRaises(SyntaxError, detect_encoding, readline) 1133 1134 def test_cookie_second_line_no_bom(self): 1135 lines = ( 1136 b'#! something\n', 1137 b'# vim: set fileencoding=ascii :\n', 1138 b'print(something)\n', 1139 b'do_something(else)\n' 1140 ) 1141 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) 1142 self.assertEqual(encoding, 'ascii') 1143 expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n'] 1144 self.assertEqual(consumed_lines, expected) 1145 1146 def test_matched_bom_and_cookie_second_line(self): 1147 lines = ( 1148 b'\xef\xbb\xbf#! something\n', 1149 b'f# coding=utf-8\n', 1150 b'print(something)\n', 1151 b'do_something(else)\n' 1152 ) 1153 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) 1154 self.assertEqual(encoding, 'utf-8-sig') 1155 self.assertEqual(consumed_lines, 1156 [b'#! something\n', b'f# coding=utf-8\n']) 1157 1158 def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self): 1159 lines = ( 1160 b'\xef\xbb\xbf#! 
something\n', 1161 b'# vim: set fileencoding=ascii :\n', 1162 b'print(something)\n', 1163 b'do_something(else)\n' 1164 ) 1165 readline = self.get_readline(lines) 1166 self.assertRaises(SyntaxError, detect_encoding, readline) 1167 1168 def test_cookie_second_line_noncommented_first_line(self): 1169 lines = ( 1170 b"print('\xc2\xa3')\n", 1171 b'# vim: set fileencoding=iso8859-15 :\n', 1172 b"print('\xe2\x82\xac')\n" 1173 ) 1174 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) 1175 self.assertEqual(encoding, 'utf-8') 1176 expected = [b"print('\xc2\xa3')\n"] 1177 self.assertEqual(consumed_lines, expected) 1178 1179 def test_cookie_second_line_commented_first_line(self): 1180 lines = ( 1181 b"#print('\xc2\xa3')\n", 1182 b'# vim: set fileencoding=iso8859-15 :\n', 1183 b"print('\xe2\x82\xac')\n" 1184 ) 1185 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) 1186 self.assertEqual(encoding, 'iso8859-15') 1187 expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n'] 1188 self.assertEqual(consumed_lines, expected) 1189 1190 def test_cookie_second_line_empty_first_line(self): 1191 lines = ( 1192 b'\n', 1193 b'# vim: set fileencoding=iso8859-15 :\n', 1194 b"print('\xe2\x82\xac')\n" 1195 ) 1196 encoding, consumed_lines = detect_encoding(self.get_readline(lines)) 1197 self.assertEqual(encoding, 'iso8859-15') 1198 expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n'] 1199 self.assertEqual(consumed_lines, expected) 1200 1201 def test_latin1_normalization(self): 1202 # See get_normal_name() in tokenizer.c. 1203 encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", 1204 "iso-8859-1-unix", "iso-latin-1-mac") 1205 for encoding in encodings: 1206 for rep in ("-", "_"): 1207 enc = encoding.replace("-", rep) 1208 lines = (b"#!/usr/bin/python\n", 1209 b"# coding: " + enc.encode("ascii") + b"\n", 1210 b"print(things)\n", 1211 b"do_something += 4\n") 1212 rl = self.get_readline(lines) 1213 found, consumed_lines = detect_encoding(rl) 1214 self.assertEqual(found, "iso-8859-1") 1215 1216 def test_syntaxerror_latin1(self): 1217 # Issue 14629: need to raise SyntaxError if the first 1218 # line(s) have non-UTF-8 characters 1219 lines = ( 1220 b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S 1221 ) 1222 readline = self.get_readline(lines) 1223 self.assertRaises(SyntaxError, detect_encoding, readline) 1224 1225 1226 def test_utf8_normalization(self): 1227 # See get_normal_name() in tokenizer.c. 
1228 encodings = ("utf-8", "utf-8-mac", "utf-8-unix") 1229 for encoding in encodings: 1230 for rep in ("-", "_"): 1231 enc = encoding.replace("-", rep) 1232 lines = (b"#!/usr/bin/python\n", 1233 b"# coding: " + enc.encode("ascii") + b"\n", 1234 b"1 + 3\n") 1235 rl = self.get_readline(lines) 1236 found, consumed_lines = detect_encoding(rl) 1237 self.assertEqual(found, "utf-8") 1238 1239 def test_short_files(self): 1240 readline = self.get_readline((b'print(something)\n',)) 1241 encoding, consumed_lines = detect_encoding(readline) 1242 self.assertEqual(encoding, 'utf-8') 1243 self.assertEqual(consumed_lines, [b'print(something)\n']) 1244 1245 encoding, consumed_lines = detect_encoding(self.get_readline(())) 1246 self.assertEqual(encoding, 'utf-8') 1247 self.assertEqual(consumed_lines, []) 1248 1249 readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',)) 1250 encoding, consumed_lines = detect_encoding(readline) 1251 self.assertEqual(encoding, 'utf-8-sig') 1252 self.assertEqual(consumed_lines, [b'print(something)\n']) 1253 1254 readline = self.get_readline((b'\xef\xbb\xbf',)) 1255 encoding, consumed_lines = detect_encoding(readline) 1256 self.assertEqual(encoding, 'utf-8-sig') 1257 self.assertEqual(consumed_lines, []) 1258 1259 readline = self.get_readline((b'# coding: bad\n',)) 1260 self.assertRaises(SyntaxError, detect_encoding, readline) 1261 1262 def test_false_encoding(self): 1263 # Issue 18873: "Encoding" detected in non-comment lines 1264 readline = self.get_readline((b'print("#coding=fake")',)) 1265 encoding, consumed_lines = detect_encoding(readline) 1266 self.assertEqual(encoding, 'utf-8') 1267 self.assertEqual(consumed_lines, [b'print("#coding=fake")']) 1268 1269 def test_open(self): 1270 filename = os_helper.TESTFN + '.py' 1271 self.addCleanup(os_helper.unlink, filename) 1272 1273 # test coding cookie 1274 for encoding in ('iso-8859-15', 'utf-8'): 1275 with open(filename, 'w', encoding=encoding) as fp: 1276 print("# coding: %s" % encoding, file=fp) 1277 print("print('euro:\u20ac')", file=fp) 1278 with tokenize_open(filename) as fp: 1279 self.assertEqual(fp.encoding, encoding) 1280 self.assertEqual(fp.mode, 'r') 1281 1282 # test BOM (no coding cookie) 1283 with open(filename, 'w', encoding='utf-8-sig') as fp: 1284 print("print('euro:\u20ac')", file=fp) 1285 with tokenize_open(filename) as fp: 1286 self.assertEqual(fp.encoding, 'utf-8-sig') 1287 self.assertEqual(fp.mode, 'r') 1288 1289 def test_filename_in_exception(self): 1290 # When possible, include the file name in the exception. 1291 path = 'some_file_path' 1292 lines = ( 1293 b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S 1294 ) 1295 class Bunk: 1296 def __init__(self, lines, path): 1297 self.name = path 1298 self._lines = lines 1299 self._index = 0 1300 1301 def readline(self): 1302 if self._index == len(lines): 1303 raise StopIteration 1304 line = lines[self._index] 1305 self._index += 1 1306 return line 1307 1308 with self.assertRaises(SyntaxError): 1309 ins = Bunk(lines, path) 1310 # Make sure lacking a name isn't an issue. 
1311 del ins.name 1312 detect_encoding(ins.readline) 1313 with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)): 1314 ins = Bunk(lines, path) 1315 detect_encoding(ins.readline) 1316 1317 def test_open_error(self): 1318 # Issue #23840: open() must close the binary file on error 1319 m = BytesIO(b'#coding:xxx') 1320 with mock.patch('tokenize._builtin_open', return_value=m): 1321 self.assertRaises(SyntaxError, tokenize_open, 'foobar') 1322 self.assertTrue(m.closed) 1323 1324 1325class TestTokenize(TestCase): 1326 1327 def test_tokenize(self): 1328 import tokenize as tokenize_module 1329 encoding = object() 1330 encoding_used = None 1331 def mock_detect_encoding(readline): 1332 return encoding, [b'first', b'second'] 1333 1334 def mock__tokenize(readline, encoding): 1335 nonlocal encoding_used 1336 encoding_used = encoding 1337 out = [] 1338 while True: 1339 next_line = readline() 1340 if next_line: 1341 out.append(next_line) 1342 continue 1343 return out 1344 1345 counter = 0 1346 def mock_readline(): 1347 nonlocal counter 1348 counter += 1 1349 if counter == 5: 1350 return b'' 1351 return str(counter).encode() 1352 1353 orig_detect_encoding = tokenize_module.detect_encoding 1354 orig__tokenize = tokenize_module._tokenize 1355 tokenize_module.detect_encoding = mock_detect_encoding 1356 tokenize_module._tokenize = mock__tokenize 1357 try: 1358 results = tokenize(mock_readline) 1359 self.assertEqual(list(results), 1360 [b'first', b'second', b'1', b'2', b'3', b'4']) 1361 finally: 1362 tokenize_module.detect_encoding = orig_detect_encoding 1363 tokenize_module._tokenize = orig__tokenize 1364 1365 self.assertEqual(encoding_used, encoding) 1366 1367 def test_oneline_defs(self): 1368 buf = [] 1369 for i in range(500): 1370 buf.append('def i{i}(): return {i}'.format(i=i)) 1371 buf.append('OK') 1372 buf = '\n'.join(buf) 1373 1374 # Test that 500 consequent, one-line defs is OK 1375 toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline)) 1376 self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER 1377 # [-2] is always NEWLINE 1378 1379 def assertExactTypeEqual(self, opstr, *optypes): 1380 tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline)) 1381 num_optypes = len(optypes) 1382 self.assertEqual(len(tokens), 3 + num_optypes) 1383 self.assertEqual(tok_name[tokens[0].exact_type], 1384 tok_name[ENCODING]) 1385 for i in range(num_optypes): 1386 self.assertEqual(tok_name[tokens[i + 1].exact_type], 1387 tok_name[optypes[i]]) 1388 self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type], 1389 tok_name[token.NEWLINE]) 1390 self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type], 1391 tok_name[token.ENDMARKER]) 1392 1393 def test_exact_type(self): 1394 self.assertExactTypeEqual('()', token.LPAR, token.RPAR) 1395 self.assertExactTypeEqual('[]', token.LSQB, token.RSQB) 1396 self.assertExactTypeEqual(':', token.COLON) 1397 self.assertExactTypeEqual(',', token.COMMA) 1398 self.assertExactTypeEqual(';', token.SEMI) 1399 self.assertExactTypeEqual('+', token.PLUS) 1400 self.assertExactTypeEqual('-', token.MINUS) 1401 self.assertExactTypeEqual('*', token.STAR) 1402 self.assertExactTypeEqual('/', token.SLASH) 1403 self.assertExactTypeEqual('|', token.VBAR) 1404 self.assertExactTypeEqual('&', token.AMPER) 1405 self.assertExactTypeEqual('<', token.LESS) 1406 self.assertExactTypeEqual('>', token.GREATER) 1407 self.assertExactTypeEqual('=', token.EQUAL) 1408 self.assertExactTypeEqual('.', token.DOT) 1409 self.assertExactTypeEqual('%', token.PERCENT) 1410 
self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE) 1411 self.assertExactTypeEqual('==', token.EQEQUAL) 1412 self.assertExactTypeEqual('!=', token.NOTEQUAL) 1413 self.assertExactTypeEqual('<=', token.LESSEQUAL) 1414 self.assertExactTypeEqual('>=', token.GREATEREQUAL) 1415 self.assertExactTypeEqual('~', token.TILDE) 1416 self.assertExactTypeEqual('^', token.CIRCUMFLEX) 1417 self.assertExactTypeEqual('<<', token.LEFTSHIFT) 1418 self.assertExactTypeEqual('>>', token.RIGHTSHIFT) 1419 self.assertExactTypeEqual('**', token.DOUBLESTAR) 1420 self.assertExactTypeEqual('+=', token.PLUSEQUAL) 1421 self.assertExactTypeEqual('-=', token.MINEQUAL) 1422 self.assertExactTypeEqual('*=', token.STAREQUAL) 1423 self.assertExactTypeEqual('/=', token.SLASHEQUAL) 1424 self.assertExactTypeEqual('%=', token.PERCENTEQUAL) 1425 self.assertExactTypeEqual('&=', token.AMPEREQUAL) 1426 self.assertExactTypeEqual('|=', token.VBAREQUAL) 1427 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL) 1428 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL) 1429 self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL) 1430 self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL) 1431 self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL) 1432 self.assertExactTypeEqual('//', token.DOUBLESLASH) 1433 self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL) 1434 self.assertExactTypeEqual(':=', token.COLONEQUAL) 1435 self.assertExactTypeEqual('...', token.ELLIPSIS) 1436 self.assertExactTypeEqual('->', token.RARROW) 1437 self.assertExactTypeEqual('@', token.AT) 1438 self.assertExactTypeEqual('@=', token.ATEQUAL) 1439 1440 self.assertExactTypeEqual('a**2+b**2==c**2', 1441 NAME, token.DOUBLESTAR, NUMBER, 1442 token.PLUS, 1443 NAME, token.DOUBLESTAR, NUMBER, 1444 token.EQEQUAL, 1445 NAME, token.DOUBLESTAR, NUMBER) 1446 self.assertExactTypeEqual('{1, 2, 3}', 1447 token.LBRACE, 1448 token.NUMBER, token.COMMA, 1449 token.NUMBER, token.COMMA, 1450 token.NUMBER, 1451 token.RBRACE) 1452 self.assertExactTypeEqual('^(x & 0x1)', 1453 token.CIRCUMFLEX, 1454 token.LPAR, 1455 token.NAME, token.AMPER, token.NUMBER, 1456 token.RPAR) 1457 1458 def test_pathological_trailing_whitespace(self): 1459 # See http://bugs.python.org/issue16152 1460 self.assertExactTypeEqual('@ ', token.AT) 1461 1462 def test_comment_at_the_end_of_the_source_without_newline(self): 1463 # See http://bugs.python.org/issue44667 1464 source = 'b = 1\n\n#test' 1465 expected_tokens = [token.NAME, token.EQUAL, token.NUMBER, token.NEWLINE, token.NL, token.COMMENT] 1466 1467 tokens = list(tokenize(BytesIO(source.encode('utf-8')).readline)) 1468 self.assertEqual(tok_name[tokens[0].exact_type], tok_name[ENCODING]) 1469 for i in range(6): 1470 self.assertEqual(tok_name[tokens[i + 1].exact_type], tok_name[expected_tokens[i]]) 1471 self.assertEqual(tok_name[tokens[-1].exact_type], tok_name[token.ENDMARKER]) 1472 1473class UntokenizeTest(TestCase): 1474 1475 def test_bad_input_order(self): 1476 # raise if previous row 1477 u = Untokenizer() 1478 u.prev_row = 2 1479 u.prev_col = 2 1480 with self.assertRaises(ValueError) as cm: 1481 u.add_whitespace((1,3)) 1482 self.assertEqual(cm.exception.args[0], 1483 'start (1,3) precedes previous end (2,2)') 1484 # raise if previous column in row 1485 self.assertRaises(ValueError, u.add_whitespace, (2,1)) 1486 1487 def test_backslash_continuation(self): 1488 # The problem is that <whitespace>\<newline> leaves no token 1489 u = Untokenizer() 1490 u.prev_row = 1 1491 u.prev_col = 1 1492 u.tokens = [] 1493 u.add_whitespace((2, 0)) 1494 
self.assertEqual(u.tokens, ['\\\n']) 1495 u.prev_row = 2 1496 u.add_whitespace((4, 4)) 1497 self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' ']) 1498 TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') 1499 1500 def test_iter_compat(self): 1501 u = Untokenizer() 1502 token = (NAME, 'Hello') 1503 tokens = [(ENCODING, 'utf-8'), token] 1504 u.compat(token, iter([])) 1505 self.assertEqual(u.tokens, ["Hello "]) 1506 u = Untokenizer() 1507 self.assertEqual(u.untokenize(iter([token])), 'Hello ') 1508 u = Untokenizer() 1509 self.assertEqual(u.untokenize(iter(tokens)), 'Hello ') 1510 self.assertEqual(u.encoding, 'utf-8') 1511 self.assertEqual(untokenize(iter(tokens)), b'Hello ') 1512 1513 1514class TestRoundtrip(TestCase): 1515 1516 def check_roundtrip(self, f): 1517 """ 1518 Test roundtrip for `untokenize`. `f` is an open file or a string. 1519 The source code in f is tokenized to both 5- and 2-tuples. 1520 Both sequences are converted back to source code via 1521 tokenize.untokenize(), and the latter tokenized again to 2-tuples. 1522 The test fails if the 3 pair tokenizations do not match. 1523 1524 When untokenize bugs are fixed, untokenize with 5-tuples should 1525 reproduce code that does not contain a backslash continuation 1526 following spaces. A proper test should test this. 1527 """ 1528 # Get source code and original tokenizations 1529 if isinstance(f, str): 1530 code = f.encode('utf-8') 1531 else: 1532 code = f.read() 1533 f.close() 1534 readline = iter(code.splitlines(keepends=True)).__next__ 1535 tokens5 = list(tokenize(readline)) 1536 tokens2 = [tok[:2] for tok in tokens5] 1537 # Reproduce tokens2 from pairs 1538 bytes_from2 = untokenize(tokens2) 1539 readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__ 1540 tokens2_from2 = [tok[:2] for tok in tokenize(readline2)] 1541 self.assertEqual(tokens2_from2, tokens2) 1542 # Reproduce tokens2 from 5-tuples 1543 bytes_from5 = untokenize(tokens5) 1544 readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__ 1545 tokens2_from5 = [tok[:2] for tok in tokenize(readline5)] 1546 self.assertEqual(tokens2_from5, tokens2) 1547 1548 def test_roundtrip(self): 1549 # There are some standard formatting practices that are easy to get right. 1550 1551 self.check_roundtrip("if x == 1:\n" 1552 " print(x)\n") 1553 self.check_roundtrip("# This is a comment\n" 1554 "# This also\n") 1555 1556 # Some people use different formatting conventions, which makes 1557 # untokenize a little trickier. Note that this test involves trailing 1558 # whitespace after the colon. Note that we use hex escapes to make the 1559 # two trailing blanks apparent in the expected output. 
1560 1561 self.check_roundtrip("if x == 1 : \n" 1562 " print(x)\n") 1563 fn = support.findfile("tokenize_tests.txt") 1564 with open(fn, 'rb') as f: 1565 self.check_roundtrip(f) 1566 self.check_roundtrip("if x == 1:\n" 1567 " # A comment by itself.\n" 1568 " print(x) # Comment here, too.\n" 1569 " # Another comment.\n" 1570 "after_if = True\n") 1571 self.check_roundtrip("if (x # The comments need to go in the right place\n" 1572 " == 1):\n" 1573 " print('x==1')\n") 1574 self.check_roundtrip("class Test: # A comment here\n" 1575 " # A comment with weird indent\n" 1576 " after_com = 5\n" 1577 " def x(m): return m*5 # a one liner\n" 1578 " def y(m): # A whitespace after the colon\n" 1579 " return y*4 # 3-space indent\n") 1580 1581 # Some error-handling code 1582 self.check_roundtrip("try: import somemodule\n" 1583 "except ImportError: # comment\n" 1584 " print('Can not import' # comment2\n)" 1585 "else: print('Loaded')\n") 1586 1587 def test_continuation(self): 1588 # Balancing continuation 1589 self.check_roundtrip("a = (3,4, \n" 1590 "5,6)\n" 1591 "y = [3, 4,\n" 1592 "5]\n" 1593 "z = {'a': 5,\n" 1594 "'b':15, 'c':True}\n" 1595 "x = len(y) + 5 - a[\n" 1596 "3] - a[2]\n" 1597 "+ len(z) - z[\n" 1598 "'b']\n") 1599 1600 def test_backslash_continuation(self): 1601 # Backslash means line continuation, except for comments 1602 self.check_roundtrip("x=1+\\\n" 1603 "1\n" 1604 "# This is a comment\\\n" 1605 "# This also\n") 1606 self.check_roundtrip("# Comment \\\n" 1607 "x = 0") 1608 1609 def test_string_concatenation(self): 1610 # Two string literals on the same line 1611 self.check_roundtrip("'' ''") 1612 1613 def test_random_files(self): 1614 # Test roundtrip on random python modules. 1615 # pass the '-ucpu' option to process the full directory. 1616 1617 import glob, random 1618 fn = support.findfile("tokenize_tests.txt") 1619 tempdir = os.path.dirname(fn) or os.curdir 1620 testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py")) 1621 1622 # Tokenize is broken on test_pep3131.py because regular expressions are 1623 # broken on the obscure unicode identifiers in it. *sigh* 1624 # With roundtrip extended to test the 5-tuple mode of untokenize, 1625 # 7 more testfiles fail. Remove them also until the failure is diagnosed. 1626 1627 testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py")) 1628 for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'): 1629 testfiles.remove(os.path.join(tempdir, "test_%s.py") % f) 1630 1631 if not support.is_resource_enabled("cpu"): 1632 testfiles = random.sample(testfiles, 10) 1633 1634 for testfile in testfiles: 1635 if support.verbose >= 2: 1636 print('tokenize', testfile) 1637 with open(testfile, 'rb') as f: 1638 with self.subTest(file=testfile): 1639 self.check_roundtrip(f) 1640 1641 1642 def roundtrip(self, code): 1643 if isinstance(code, str): 1644 code = code.encode('utf-8') 1645 return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8') 1646 1647 def test_indentation_semantics_retained(self): 1648 """ 1649 Ensure that although whitespace might be mutated in a roundtrip, 1650 the semantic meaning of the indentation remains consistent. 1651 """ 1652 code = "if False:\n\tx=3\n\tx=3\n" 1653 codelines = self.roundtrip(code).split('\n') 1654 self.assertEqual(codelines[1], codelines[2]) 1655 self.check_roundtrip(code) 1656 1657 1658class CTokenizeTest(TestCase): 1659 def check_tokenize(self, s, expected): 1660 # Format the tokens in s in a table format. 1661 # The ENDMARKER and final NEWLINE are omitted. 
1662 with self.subTest(source=s): 1663 result = stringify_tokens_from_source( 1664 _generate_tokens_from_c_tokenizer(s), s 1665 ) 1666 self.assertEqual(result, expected.rstrip().splitlines()) 1667 1668 def test_int(self): 1669 1670 self.check_tokenize('0xff <= 255', """\ 1671 NUMBER '0xff' (1, 0) (1, 4) 1672 LESSEQUAL '<=' (1, 5) (1, 7) 1673 NUMBER '255' (1, 8) (1, 11) 1674 """) 1675 1676 self.check_tokenize('0b10 <= 255', """\ 1677 NUMBER '0b10' (1, 0) (1, 4) 1678 LESSEQUAL '<=' (1, 5) (1, 7) 1679 NUMBER '255' (1, 8) (1, 11) 1680 """) 1681 1682 self.check_tokenize('0o123 <= 0O123', """\ 1683 NUMBER '0o123' (1, 0) (1, 5) 1684 LESSEQUAL '<=' (1, 6) (1, 8) 1685 NUMBER '0O123' (1, 9) (1, 14) 1686 """) 1687 1688 self.check_tokenize('1234567 > ~0x15', """\ 1689 NUMBER '1234567' (1, 0) (1, 7) 1690 GREATER '>' (1, 8) (1, 9) 1691 TILDE '~' (1, 10) (1, 11) 1692 NUMBER '0x15' (1, 11) (1, 15) 1693 """) 1694 1695 self.check_tokenize('2134568 != 1231515', """\ 1696 NUMBER '2134568' (1, 0) (1, 7) 1697 NOTEQUAL '!=' (1, 8) (1, 10) 1698 NUMBER '1231515' (1, 11) (1, 18) 1699 """) 1700 1701 self.check_tokenize('(-124561-1) & 200000000', """\ 1702 LPAR '(' (1, 0) (1, 1) 1703 MINUS '-' (1, 1) (1, 2) 1704 NUMBER '124561' (1, 2) (1, 8) 1705 MINUS '-' (1, 8) (1, 9) 1706 NUMBER '1' (1, 9) (1, 10) 1707 RPAR ')' (1, 10) (1, 11) 1708 AMPER '&' (1, 12) (1, 13) 1709 NUMBER '200000000' (1, 14) (1, 23) 1710 """) 1711 1712 self.check_tokenize('0xdeadbeef != -1', """\ 1713 NUMBER '0xdeadbeef' (1, 0) (1, 10) 1714 NOTEQUAL '!=' (1, 11) (1, 13) 1715 MINUS '-' (1, 14) (1, 15) 1716 NUMBER '1' (1, 15) (1, 16) 1717 """) 1718 1719 self.check_tokenize('0xdeadc0de & 12345', """\ 1720 NUMBER '0xdeadc0de' (1, 0) (1, 10) 1721 AMPER '&' (1, 11) (1, 12) 1722 NUMBER '12345' (1, 13) (1, 18) 1723 """) 1724 1725 self.check_tokenize('0xFF & 0x15 | 1234', """\ 1726 NUMBER '0xFF' (1, 0) (1, 4) 1727 AMPER '&' (1, 5) (1, 6) 1728 NUMBER '0x15' (1, 7) (1, 11) 1729 VBAR '|' (1, 12) (1, 13) 1730 NUMBER '1234' (1, 14) (1, 18) 1731 """) 1732 1733 def test_float(self): 1734 1735 self.check_tokenize('x = 3.14159', """\ 1736 NAME 'x' (1, 0) (1, 1) 1737 EQUAL '=' (1, 2) (1, 3) 1738 NUMBER '3.14159' (1, 4) (1, 11) 1739 """) 1740 1741 self.check_tokenize('x = 314159.', """\ 1742 NAME 'x' (1, 0) (1, 1) 1743 EQUAL '=' (1, 2) (1, 3) 1744 NUMBER '314159.' 
    """)

        self.check_tokenize('x = .314159', """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    NUMBER '.314159' (1, 4) (1, 11)
    """)

        self.check_tokenize('x = 3e14159', """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    NUMBER '3e14159' (1, 4) (1, 11)
    """)

        self.check_tokenize('x = 3E123', """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    NUMBER '3E123' (1, 4) (1, 9)
    """)

        self.check_tokenize('x+y = 3e-1230', """\
    NAME 'x' (1, 0) (1, 1)
    PLUS '+' (1, 1) (1, 2)
    NAME 'y' (1, 2) (1, 3)
    EQUAL '=' (1, 4) (1, 5)
    NUMBER '3e-1230' (1, 6) (1, 13)
    """)

        self.check_tokenize('x = 3.14e159', """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    NUMBER '3.14e159' (1, 4) (1, 12)
    """)

    def test_string(self):

        self.check_tokenize('x = \'\'; y = ""', """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    STRING "''" (1, 4) (1, 6)
    SEMI ';' (1, 6) (1, 7)
    NAME 'y' (1, 8) (1, 9)
    EQUAL '=' (1, 10) (1, 11)
    STRING '""' (1, 12) (1, 14)
    """)

        self.check_tokenize('x = \'"\'; y = "\'"', """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    STRING '\\'"\\'' (1, 4) (1, 7)
    SEMI ';' (1, 7) (1, 8)
    NAME 'y' (1, 9) (1, 10)
    EQUAL '=' (1, 11) (1, 12)
    STRING '"\\'"' (1, 13) (1, 16)
    """)

        self.check_tokenize('x = "doesn\'t "shrink", does it"', """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    STRING '"doesn\\'t "' (1, 4) (1, 14)
    NAME 'shrink' (1, 14) (1, 20)
    STRING '", does it"' (1, 20) (1, 31)
    """)

        self.check_tokenize("x = 'abc' + 'ABC'", """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    STRING "'abc'" (1, 4) (1, 9)
    PLUS '+' (1, 10) (1, 11)
    STRING "'ABC'" (1, 12) (1, 17)
    """)

        self.check_tokenize('y = "ABC" + "ABC"', """\
    NAME 'y' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    STRING '"ABC"' (1, 4) (1, 9)
    PLUS '+' (1, 10) (1, 11)
    STRING '"ABC"' (1, 12) (1, 17)
    """)

        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    STRING "r'abc'" (1, 4) (1, 10)
    PLUS '+' (1, 11) (1, 12)
    STRING "r'ABC'" (1, 13) (1, 19)
    PLUS '+' (1, 20) (1, 21)
    STRING "R'ABC'" (1, 22) (1, 28)
    PLUS '+' (1, 29) (1, 30)
    STRING "R'ABC'" (1, 31) (1, 37)
    """)

        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
    NAME 'y' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    STRING 'r"abc"' (1, 4) (1, 10)
    PLUS '+' (1, 11) (1, 12)
    STRING 'r"ABC"' (1, 13) (1, 19)
    PLUS '+' (1, 20) (1, 21)
    STRING 'R"ABC"' (1, 22) (1, 28)
    PLUS '+' (1, 29) (1, 30)
    STRING 'R"ABC"' (1, 31) (1, 37)
    """)

        self.check_tokenize("u'abc' + U'abc'", """\
    STRING "u'abc'" (1, 0) (1, 6)
    PLUS '+' (1, 7) (1, 8)
    STRING "U'abc'" (1, 9) (1, 15)
    """)

        self.check_tokenize('u"abc" + U"abc"', """\
    STRING 'u"abc"' (1, 0) (1, 6)
    PLUS '+' (1, 7) (1, 8)
    STRING 'U"abc"' (1, 9) (1, 15)
    """)

        self.check_tokenize("b'abc' + B'abc'", """\
    STRING "b'abc'" (1, 0) (1, 6)
    PLUS '+' (1, 7) (1, 8)
    STRING "B'abc'" (1, 9) (1, 15)
    """)

        self.check_tokenize('b"abc" + B"abc"', """\
    STRING 'b"abc"' (1, 0) (1, 6)
    PLUS '+' (1, 7) (1, 8)
    STRING 'B"abc"' (1, 9) (1, 15)
    """)

        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
    STRING "br'abc'" (1, 0) (1, 7)
    PLUS '+' (1, 8) (1, 9)
    STRING "bR'abc'" (1, 10) (1, 17)
    PLUS '+' (1, 18) (1, 19)
    STRING "Br'abc'" (1, 20) (1, 27)
    PLUS '+' (1, 28) (1, 29)
    STRING "BR'abc'" (1, 30) (1, 37)
    """)

        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
    STRING 'br"abc"' (1, 0) (1, 7)
    PLUS '+' (1, 8) (1, 9)
    STRING 'bR"abc"' (1, 10) (1, 17)
    PLUS '+' (1, 18) (1, 19)
    STRING 'Br"abc"' (1, 20) (1, 27)
    PLUS '+' (1, 28) (1, 29)
    STRING 'BR"abc"' (1, 30) (1, 37)
    """)

        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
    STRING "rb'abc'" (1, 0) (1, 7)
    PLUS '+' (1, 8) (1, 9)
    STRING "rB'abc'" (1, 10) (1, 17)
    PLUS '+' (1, 18) (1, 19)
    STRING "Rb'abc'" (1, 20) (1, 27)
    PLUS '+' (1, 28) (1, 29)
    STRING "RB'abc'" (1, 30) (1, 37)
    """)

        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
    STRING 'rb"abc"' (1, 0) (1, 7)
    PLUS '+' (1, 8) (1, 9)
    STRING 'rB"abc"' (1, 10) (1, 17)
    PLUS '+' (1, 18) (1, 19)
    STRING 'Rb"abc"' (1, 20) (1, 27)
    PLUS '+' (1, 28) (1, 29)
    STRING 'RB"abc"' (1, 30) (1, 37)
    """)

        self.check_tokenize('"a\\\nde\\\nfg"', """\
    STRING '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
    """)

        self.check_tokenize('u"a\\\nde"', """\
    STRING 'u"a\\\\\\nde"\' (1, 0) (2, 3)
    """)

        self.check_tokenize('rb"a\\\nd"', """\
    STRING 'rb"a\\\\\\nd"\' (1, 0) (2, 2)
    """)

        self.check_tokenize(r'"""a\
b"""', """\
    STRING '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'u"""a\
b"""', """\
    STRING 'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'rb"""a\
b\
c"""', """\
    STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
    """)

        self.check_tokenize('f"abc"', """\
    STRING 'f"abc"' (1, 0) (1, 6)
    """)

        self.check_tokenize('fR"a{b}c"', """\
    STRING 'fR"a{b}c"' (1, 0) (1, 9)
    """)

        self.check_tokenize('f"""abc"""', """\
    STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10)
    """)

        self.check_tokenize(r'f"abc\
def"', """\
    STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)

        self.check_tokenize(r'Rf"abc\
def"', """\
    STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)

    def test_function(self):

        self.check_tokenize('def d22(a, b, c=2, d=2, *k): pass', """\
    NAME 'def' (1, 0) (1, 3)
    NAME 'd22' (1, 4) (1, 7)
    LPAR '(' (1, 7) (1, 8)
    NAME 'a' (1, 8) (1, 9)
    COMMA ',' (1, 9) (1, 10)
    NAME 'b' (1, 11) (1, 12)
    COMMA ',' (1, 12) (1, 13)
    NAME 'c' (1, 14) (1, 15)
    EQUAL '=' (1, 15) (1, 16)
    NUMBER '2' (1, 16) (1, 17)
    COMMA ',' (1, 17) (1, 18)
    NAME 'd' (1, 19) (1, 20)
    EQUAL '=' (1, 20) (1, 21)
    NUMBER '2' (1, 21) (1, 22)
    COMMA ',' (1, 22) (1, 23)
    STAR '*' (1, 24) (1, 25)
    NAME 'k' (1, 25) (1, 26)
    RPAR ')' (1, 26) (1, 27)
    COLON ':' (1, 27) (1, 28)
    NAME 'pass' (1, 29) (1, 33)
    """)

        self.check_tokenize('def d01v_(a=1, *k, **w): pass', """\
    NAME 'def' (1, 0) (1, 3)
    NAME 'd01v_' (1, 4) (1, 9)
    LPAR '(' (1, 9) (1, 10)
    NAME 'a' (1, 10) (1, 11)
    EQUAL '=' (1, 11) (1, 12)
    NUMBER '1' (1, 12) (1, 13)
    COMMA ',' (1, 13) (1, 14)
    STAR '*' (1, 15) (1, 16)
    NAME 'k' (1, 16) (1, 17)
    COMMA ',' (1, 17) (1, 18)
    DOUBLESTAR '**' (1, 19) (1, 21)
    NAME 'w' (1, 21) (1, 22)
    RPAR ')' (1, 22) (1, 23)
    COLON ':' (1, 23) (1, 24)
    NAME 'pass' (1, 25) (1, 29)
    """)

        self.check_tokenize('def d23(a: str, b: int=3) -> int: pass', """\
    NAME 'def' (1, 0) (1, 3)
    NAME 'd23' (1, 4) (1, 7)
    LPAR '(' (1, 7) (1, 8)
    NAME 'a' (1, 8) (1, 9)
    COLON ':' (1, 9) (1, 10)
    NAME 'str' (1, 11) (1, 14)
    COMMA ',' (1, 14) (1, 15)
    NAME 'b' (1, 16) (1, 17)
    COLON ':' (1, 17) (1, 18)
    NAME 'int' (1, 19) (1, 22)
    EQUAL '=' (1, 22) (1, 23)
    NUMBER '3' (1, 23) (1, 24)
    RPAR ')' (1, 24) (1, 25)
    RARROW '->' (1, 26) (1, 28)
    NAME 'int' (1, 29) (1, 32)
    COLON ':' (1, 32) (1, 33)
    NAME 'pass' (1, 34) (1, 38)
    """)

    def test_comparison(self):

        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
    NAME 'if' (1, 0) (1, 2)
    NUMBER '1' (1, 3) (1, 4)
    LESS '<' (1, 5) (1, 6)
    NUMBER '1' (1, 7) (1, 8)
    GREATER '>' (1, 9) (1, 10)
    NUMBER '1' (1, 11) (1, 12)
    EQEQUAL '==' (1, 13) (1, 15)
    NUMBER '1' (1, 16) (1, 17)
    GREATEREQUAL '>=' (1, 18) (1, 20)
    NUMBER '5' (1, 21) (1, 22)
    LESSEQUAL '<=' (1, 23) (1, 25)
    NUMBER '0x15' (1, 26) (1, 30)
    LESSEQUAL '<=' (1, 31) (1, 33)
    NUMBER '0x12' (1, 34) (1, 38)
    NOTEQUAL '!=' (1, 39) (1, 41)
    NUMBER '1' (1, 42) (1, 43)
    NAME 'and' (1, 44) (1, 47)
    NUMBER '5' (1, 48) (1, 49)
    NAME 'in' (1, 50) (1, 52)
    NUMBER '1' (1, 53) (1, 54)
    NAME 'not' (1, 55) (1, 58)
    NAME 'in' (1, 59) (1, 61)
    NUMBER '1' (1, 62) (1, 63)
    NAME 'is' (1, 64) (1, 66)
    NUMBER '1' (1, 67) (1, 68)
    NAME 'or' (1, 69) (1, 71)
    NUMBER '5' (1, 72) (1, 73)
    NAME 'is' (1, 74) (1, 76)
    NAME 'not' (1, 77) (1, 80)
    NUMBER '1' (1, 81) (1, 82)
    COLON ':' (1, 82) (1, 83)
    NAME 'pass' (1, 84) (1, 88)
    """)

    def test_additive(self):

        self.check_tokenize('x = 1 - y + 15 - 1 + 0x124 + z + a[5]', """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    NUMBER '1' (1, 4) (1, 5)
    MINUS '-' (1, 6) (1, 7)
    NAME 'y' (1, 8) (1, 9)
    PLUS '+' (1, 10) (1, 11)
    NUMBER '15' (1, 12) (1, 14)
    MINUS '-' (1, 15) (1, 16)
    NUMBER '1' (1, 17) (1, 18)
    PLUS '+' (1, 19) (1, 20)
    NUMBER '0x124' (1, 21) (1, 26)
    PLUS '+' (1, 27) (1, 28)
    NAME 'z' (1, 29) (1, 30)
    PLUS '+' (1, 31) (1, 32)
    NAME 'a' (1, 33) (1, 34)
    LSQB '[' (1, 34) (1, 35)
    NUMBER '5' (1, 35) (1, 36)
    RSQB ']' (1, 36) (1, 37)
    """)

    def test_multiplicative(self):

        self.check_tokenize('x = 1//1*1/5*12%0x12@42', """\
    NAME 'x' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    NUMBER '1' (1, 4) (1, 5)
    DOUBLESLASH '//' (1, 5) (1, 7)
    NUMBER '1' (1, 7) (1, 8)
    STAR '*' (1, 8) (1, 9)
    NUMBER '1' (1, 9) (1, 10)
    SLASH '/' (1, 10) (1, 11)
    NUMBER '5' (1, 11) (1, 12)
    STAR '*' (1, 12) (1, 13)
    NUMBER '12' (1, 13) (1, 15)
    PERCENT '%' (1, 15) (1, 16)
    NUMBER '0x12' (1, 16) (1, 20)
    AT '@' (1, 20) (1, 21)
    NUMBER '42' (1, 21) (1, 23)
    """)

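    # The C tokenizer reports exact operator token types (TILDE, CIRCUMFLEX,
    # AMPER, VBAR) rather than a generic OP.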
    def test_unary(self):

        self.check_tokenize('~1 ^ 1 & 1 |1 ^ -1', """\
    TILDE '~' (1, 0) (1, 1)
    NUMBER '1' (1, 1) (1, 2)
    CIRCUMFLEX '^' (1, 3) (1, 4)
    NUMBER '1' (1, 5) (1, 6)
    AMPER '&' (1, 7) (1, 8)
    NUMBER '1' (1, 9) (1, 10)
    VBAR '|' (1, 11) (1, 12)
    NUMBER '1' (1, 12) (1, 13)
    CIRCUMFLEX '^' (1, 14) (1, 15)
    MINUS '-' (1, 16) (1, 17)
    NUMBER '1' (1, 17) (1, 18)
    """)

        self.check_tokenize('-1*1/1+1*1//1 - ---1**1', """\
    MINUS '-' (1, 0) (1, 1)
    NUMBER '1' (1, 1) (1, 2)
    STAR '*' (1, 2) (1, 3)
    NUMBER '1' (1, 3) (1, 4)
    SLASH '/' (1, 4) (1, 5)
    NUMBER '1' (1, 5) (1, 6)
    PLUS '+' (1, 6) (1, 7)
    NUMBER '1' (1, 7) (1, 8)
    STAR '*' (1, 8) (1, 9)
    NUMBER '1' (1, 9) (1, 10)
    DOUBLESLASH '//' (1, 10) (1, 12)
    NUMBER '1' (1, 12) (1, 13)
    MINUS '-' (1, 14) (1, 15)
    MINUS '-' (1, 16) (1, 17)
    MINUS '-' (1, 17) (1, 18)
    MINUS '-' (1, 18) (1, 19)
    NUMBER '1' (1, 19) (1, 20)
    DOUBLESTAR '**' (1, 20) (1, 22)
    NUMBER '1' (1, 22) (1, 23)
    """)

    def test_selector(self):

        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
    NAME 'import' (1, 0) (1, 6)
    NAME 'sys' (1, 7) (1, 10)
    COMMA ',' (1, 10) (1, 11)
    NAME 'time' (1, 12) (1, 16)
    NEWLINE '' (1, 16) (1, 16)
    NAME 'x' (2, 0) (2, 1)
    EQUAL '=' (2, 2) (2, 3)
    NAME 'sys' (2, 4) (2, 7)
    DOT '.' (2, 7) (2, 8)
    NAME 'modules' (2, 8) (2, 15)
    LSQB '[' (2, 15) (2, 16)
    STRING "'time'" (2, 16) (2, 22)
    RSQB ']' (2, 22) (2, 23)
    DOT '.' (2, 23) (2, 24)
    NAME 'time' (2, 24) (2, 28)
    LPAR '(' (2, 28) (2, 29)
    RPAR ')' (2, 29) (2, 30)
    """)

    def test_method(self):

        self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
    AT '@' (1, 0) (1, 1)
    NAME 'staticmethod' (1, 1) (1, 13)
    NEWLINE '' (1, 13) (1, 13)
    NAME 'def' (2, 0) (2, 3)
    NAME 'foo' (2, 4) (2, 7)
    LPAR '(' (2, 7) (2, 8)
    NAME 'x' (2, 8) (2, 9)
    COMMA ',' (2, 9) (2, 10)
    NAME 'y' (2, 10) (2, 11)
    RPAR ')' (2, 11) (2, 12)
    COLON ':' (2, 12) (2, 13)
    NAME 'pass' (2, 14) (2, 18)
    """)

    def test_tabs(self):

        self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
    AT '@' (1, 0) (1, 1)
    NAME 'staticmethod' (1, 1) (1, 13)
    NEWLINE '' (1, 13) (1, 13)
    NAME 'def' (2, 0) (2, 3)
    NAME 'foo' (2, 4) (2, 7)
    LPAR '(' (2, 7) (2, 8)
    NAME 'x' (2, 8) (2, 9)
    COMMA ',' (2, 9) (2, 10)
    NAME 'y' (2, 10) (2, 11)
    RPAR ')' (2, 11) (2, 12)
    COLON ':' (2, 12) (2, 13)
    NAME 'pass' (2, 14) (2, 18)
    """)

    def test_async(self):

        self.check_tokenize('async = 1', """\
    ASYNC 'async' (1, 0) (1, 5)
    EQUAL '=' (1, 6) (1, 7)
    NUMBER '1' (1, 8) (1, 9)
    """)

        self.check_tokenize('a = (async = 1)', """\
    NAME 'a' (1, 0) (1, 1)
    EQUAL '=' (1, 2) (1, 3)
    LPAR '(' (1, 4) (1, 5)
    ASYNC 'async' (1, 5) (1, 10)
    EQUAL '=' (1, 11) (1, 12)
    NUMBER '1' (1, 13) (1, 14)
    RPAR ')' (1, 14) (1, 15)
    """)

        self.check_tokenize('async()', """\
    ASYNC 'async' (1, 0) (1, 5)
    LPAR '(' (1, 5) (1, 6)
    RPAR ')' (1, 6) (1, 7)
    """)

        self.check_tokenize('class async(Bar):pass', """\
    NAME 'class' (1, 0) (1, 5)
    ASYNC 'async' (1, 6) (1, 11)
    LPAR '(' (1, 11) (1, 12)
    NAME 'Bar' (1, 12) (1, 15)
    RPAR ')' (1, 15) (1, 16)
    COLON ':' (1, 16) (1, 17)
    NAME 'pass' (1, 17) (1, 21)
    """)

        self.check_tokenize('class async:pass', """\
    NAME 'class' (1, 0) (1, 5)
    ASYNC 'async' (1, 6) (1, 11)
    COLON ':' (1, 11) (1, 12)
    NAME 'pass' (1, 12) (1, 16)
    """)

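        # 'await' used as an ordinary name is likewise reported with its own
        # AWAIT token type.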
        self.check_tokenize('await = 1', """\
    AWAIT 'await' (1, 0) (1, 5)
    EQUAL '=' (1, 6) (1, 7)
    NUMBER '1' (1, 8) (1, 9)
    """)

        self.check_tokenize('foo.async', """\
    NAME 'foo' (1, 0) (1, 3)
    DOT '.' (1, 3) (1, 4)
    ASYNC 'async' (1, 4) (1, 9)
    """)

        self.check_tokenize('async for a in b: pass', """\
    ASYNC 'async' (1, 0) (1, 5)
    NAME 'for' (1, 6) (1, 9)
    NAME 'a' (1, 10) (1, 11)
    NAME 'in' (1, 12) (1, 14)
    NAME 'b' (1, 15) (1, 16)
    COLON ':' (1, 16) (1, 17)
    NAME 'pass' (1, 18) (1, 22)
    """)

        self.check_tokenize('async with a as b: pass', """\
    ASYNC 'async' (1, 0) (1, 5)
    NAME 'with' (1, 6) (1, 10)
    NAME 'a' (1, 11) (1, 12)
    NAME 'as' (1, 13) (1, 15)
    NAME 'b' (1, 16) (1, 17)
    COLON ':' (1, 17) (1, 18)
    NAME 'pass' (1, 19) (1, 23)
    """)

        self.check_tokenize('async.foo', """\
    ASYNC 'async' (1, 0) (1, 5)
    DOT '.' (1, 5) (1, 6)
    NAME 'foo' (1, 6) (1, 9)
    """)

        self.check_tokenize('async', """\
    ASYNC 'async' (1, 0) (1, 5)
    """)

        self.check_tokenize('async\n#comment\nawait', """\
    ASYNC 'async' (1, 0) (1, 5)
    NEWLINE '' (1, 5) (1, 5)
    AWAIT 'await' (3, 0) (3, 5)
    """)

        self.check_tokenize('async\n...\nawait', """\
    ASYNC 'async' (1, 0) (1, 5)
    NEWLINE '' (1, 5) (1, 5)
    ELLIPSIS '...' (2, 0) (2, 3)
    NEWLINE '' (2, 3) (2, 3)
    AWAIT 'await' (3, 0) (3, 5)
    """)

        self.check_tokenize('async\nawait', """\
    ASYNC 'async' (1, 0) (1, 5)
    NEWLINE '' (1, 5) (1, 5)
    AWAIT 'await' (2, 0) (2, 5)
    """)

        self.check_tokenize('foo.async + 1', """\
    NAME 'foo' (1, 0) (1, 3)
    DOT '.' (1, 3) (1, 4)
    ASYNC 'async' (1, 4) (1, 9)
    PLUS '+' (1, 10) (1, 11)
    NUMBER '1' (1, 12) (1, 13)
    """)

        self.check_tokenize('async def foo(): pass', """\
    ASYNC 'async' (1, 0) (1, 5)
    NAME 'def' (1, 6) (1, 9)
    NAME 'foo' (1, 10) (1, 13)
    LPAR '(' (1, 13) (1, 14)
    RPAR ')' (1, 14) (1, 15)
    COLON ':' (1, 15) (1, 16)
    NAME 'pass' (1, 17) (1, 21)
    """)

        self.check_tokenize('''\
async def foo():
  def foo(await):
    await = 1
  if 1:
    await
async += 1
''', """\
    ASYNC 'async' (1, 0) (1, 5)
    NAME 'def' (1, 6) (1, 9)
    NAME 'foo' (1, 10) (1, 13)
    LPAR '(' (1, 13) (1, 14)
    RPAR ')' (1, 14) (1, 15)
    COLON ':' (1, 15) (1, 16)
    NEWLINE '' (1, 16) (1, 16)
    INDENT '' (2, -1) (2, -1)
    NAME 'def' (2, 2) (2, 5)
    NAME 'foo' (2, 6) (2, 9)
    LPAR '(' (2, 9) (2, 10)
    AWAIT 'await' (2, 10) (2, 15)
    RPAR ')' (2, 15) (2, 16)
    COLON ':' (2, 16) (2, 17)
    NEWLINE '' (2, 17) (2, 17)
    INDENT '' (3, -1) (3, -1)
    AWAIT 'await' (3, 4) (3, 9)
    EQUAL '=' (3, 10) (3, 11)
    NUMBER '1' (3, 12) (3, 13)
    NEWLINE '' (3, 13) (3, 13)
    DEDENT '' (4, -1) (4, -1)
    NAME 'if' (4, 2) (4, 4)
    NUMBER '1' (4, 5) (4, 6)
    COLON ':' (4, 6) (4, 7)
    NEWLINE '' (4, 7) (4, 7)
    INDENT '' (5, -1) (5, -1)
    AWAIT 'await' (5, 4) (5, 9)
    NEWLINE '' (5, 9) (5, 9)
    DEDENT '' (6, -1) (6, -1)
    DEDENT '' (6, -1) (6, -1)
    ASYNC 'async' (6, 0) (6, 5)
    PLUSEQUAL '+=' (6, 6) (6, 8)
    NUMBER '1' (6, 9) (6, 10)
    NEWLINE '' (6, 10) (6, 10)
    """)

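        # Inside an 'async def' body, 'async for' is tokenized as ASYNC
        # followed by NAME 'for'.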
        self.check_tokenize('async def foo():\n  async for i in 1: pass', """\
    ASYNC 'async' (1, 0) (1, 5)
    NAME 'def' (1, 6) (1, 9)
    NAME 'foo' (1, 10) (1, 13)
    LPAR '(' (1, 13) (1, 14)
    RPAR ')' (1, 14) (1, 15)
    COLON ':' (1, 15) (1, 16)
    NEWLINE '' (1, 16) (1, 16)
    INDENT '' (2, -1) (2, -1)
    ASYNC 'async' (2, 2) (2, 7)
    NAME 'for' (2, 8) (2, 11)
    NAME 'i' (2, 12) (2, 13)
    NAME 'in' (2, 14) (2, 16)
    NUMBER '1' (2, 17) (2, 18)
    COLON ':' (2, 18) (2, 19)
    NAME 'pass' (2, 20) (2, 24)
    DEDENT '' (2, -1) (2, -1)
    """)

        self.check_tokenize('async def foo(async): await', """\
    ASYNC 'async' (1, 0) (1, 5)
    NAME 'def' (1, 6) (1, 9)
    NAME 'foo' (1, 10) (1, 13)
    LPAR '(' (1, 13) (1, 14)
    ASYNC 'async' (1, 14) (1, 19)
    RPAR ')' (1, 19) (1, 20)
    COLON ':' (1, 20) (1, 21)
    AWAIT 'await' (1, 22) (1, 27)
    """)

        self.check_tokenize('''\
def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME 'def' (1, 0) (1, 3)
    NAME 'f' (1, 4) (1, 5)
    LPAR '(' (1, 5) (1, 6)
    RPAR ')' (1, 6) (1, 7)
    COLON ':' (1, 7) (1, 8)
    NEWLINE '' (1, 8) (1, 8)
    INDENT '' (3, -1) (3, -1)
    NAME 'def' (3, 2) (3, 5)
    NAME 'baz' (3, 6) (3, 9)
    LPAR '(' (3, 9) (3, 10)
    RPAR ')' (3, 10) (3, 11)
    COLON ':' (3, 11) (3, 12)
    NAME 'pass' (3, 13) (3, 17)
    NEWLINE '' (3, 17) (3, 17)
    ASYNC 'async' (4, 2) (4, 7)
    NAME 'def' (4, 8) (4, 11)
    NAME 'bar' (4, 12) (4, 15)
    LPAR '(' (4, 15) (4, 16)
    RPAR ')' (4, 16) (4, 17)
    COLON ':' (4, 17) (4, 18)
    NAME 'pass' (4, 19) (4, 23)
    NEWLINE '' (4, 23) (4, 23)
    AWAIT 'await' (6, 2) (6, 7)
    EQUAL '=' (6, 8) (6, 9)
    NUMBER '2' (6, 10) (6, 11)
    DEDENT '' (6, -1) (6, -1)
    """)

        self.check_tokenize('''\
async def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    ASYNC 'async' (1, 0) (1, 5)
    NAME 'def' (1, 6) (1, 9)
    NAME 'f' (1, 10) (1, 11)
    LPAR '(' (1, 11) (1, 12)
    RPAR ')' (1, 12) (1, 13)
    COLON ':' (1, 13) (1, 14)
    NEWLINE '' (1, 14) (1, 14)
    INDENT '' (3, -1) (3, -1)
    NAME 'def' (3, 2) (3, 5)
    NAME 'baz' (3, 6) (3, 9)
    LPAR '(' (3, 9) (3, 10)
    RPAR ')' (3, 10) (3, 11)
    COLON ':' (3, 11) (3, 12)
    NAME 'pass' (3, 13) (3, 17)
    NEWLINE '' (3, 17) (3, 17)
    ASYNC 'async' (4, 2) (4, 7)
    NAME 'def' (4, 8) (4, 11)
    NAME 'bar' (4, 12) (4, 15)
    LPAR '(' (4, 15) (4, 16)
    RPAR ')' (4, 16) (4, 17)
    COLON ':' (4, 17) (4, 18)
    NAME 'pass' (4, 19) (4, 23)
    NEWLINE '' (4, 23) (4, 23)
    AWAIT 'await' (6, 2) (6, 7)
    EQUAL '=' (6, 8) (6, 9)
    NUMBER '2' (6, 10) (6, 11)
    DEDENT '' (6, -1) (6, -1)
    """)

    def test_unicode(self):

        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
    NAME 'Örter' (1, 0) (1, 6)
    EQUAL '=' (1, 7) (1, 8)
    STRING "u'places'" (1, 9) (1, 18)
    NEWLINE '' (1, 18) (1, 18)
    NAME 'grün' (2, 0) (2, 5)
    EQUAL '=' (2, 6) (2, 7)
    STRING "U'green'" (2, 8) (2, 16)
    """)

    def test_invalid_syntax(self):
        def get_tokens(string):
            return list(_generate_tokens_from_c_tokenizer(string))

        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
        self.assertRaises(SyntaxError, get_tokens, "{1+2]")

        self.assertRaises(SyntaxError, get_tokens, "1_")
        self.assertRaises(SyntaxError, get_tokens, "1.2_")
        self.assertRaises(SyntaxError, get_tokens, "1e2_")
        self.assertRaises(SyntaxError, get_tokens, "1e+")

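        # Characters with no meaning in Python's lexical grammar are rejected
        # outright.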
        self.assertRaises(SyntaxError, get_tokens, "\xa0")
        self.assertRaises(SyntaxError, get_tokens, "€")

        self.assertRaises(SyntaxError, get_tokens, "0b12")
        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
        self.assertRaises(SyntaxError, get_tokens, "0b2")
        self.assertRaises(SyntaxError, get_tokens, "0b1_")
        self.assertRaises(SyntaxError, get_tokens, "0b")
        self.assertRaises(SyntaxError, get_tokens, "0o18")
        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
        self.assertRaises(SyntaxError, get_tokens, "0o8")
        self.assertRaises(SyntaxError, get_tokens, "0o1_")
        self.assertRaises(SyntaxError, get_tokens, "0o")
        self.assertRaises(SyntaxError, get_tokens, "0x1_")
        self.assertRaises(SyntaxError, get_tokens, "0x")
        self.assertRaises(SyntaxError, get_tokens, "1_")
        self.assertRaises(SyntaxError, get_tokens, "012")
        self.assertRaises(SyntaxError, get_tokens, "1.2_")
        self.assertRaises(SyntaxError, get_tokens, "1e2_")
        self.assertRaises(SyntaxError, get_tokens, "1e+")

        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")

        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
        self.assertRaises(SyntaxError, get_tokens, "]")

    def test_max_indent(self):
        MAXINDENT = 100

        def generate_source(indents):
            source = ''.join((' ' * x) + 'if True:\n' for x in range(indents))
            source += ' ' * indents + 'pass\n'
            return source

        valid = generate_source(MAXINDENT - 1)
        tokens = list(_generate_tokens_from_c_tokenizer(valid))
        self.assertEqual(tokens[-1].type, DEDENT)
        compile(valid, "<string>", "exec")

        invalid = generate_source(MAXINDENT)
        tokens = list(_generate_tokens_from_c_tokenizer(invalid))
        self.assertEqual(tokens[-1].type, NEWLINE)
        self.assertRaises(
            IndentationError, compile, invalid, "<string>", "exec"
        )

    def test_continuation_lines_indentation(self):
        def get_tokens(string):
            return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]

        code = dedent("""
            def fib(n):
                \\
            '''Print a Fibonacci series up to n.'''
                \\
            a, b = 0, 1
        """)

        self.check_tokenize(code, """\
    NAME 'def' (2, 0) (2, 3)
    NAME 'fib' (2, 4) (2, 7)
    LPAR '(' (2, 7) (2, 8)
    NAME 'n' (2, 8) (2, 9)
    RPAR ')' (2, 9) (2, 10)
    COLON ':' (2, 10) (2, 11)
    NEWLINE '' (2, 11) (2, 11)
    INDENT '' (4, -1) (4, -1)
    STRING "'''Print a Fibonacci series up to n.'''" (4, 0) (4, 39)
    NEWLINE '' (4, 39) (4, 39)
    NAME 'a' (6, 0) (6, 1)
    COMMA ',' (6, 1) (6, 2)
    NAME 'b' (6, 3) (6, 4)
    EQUAL '=' (6, 5) (6, 6)
    NUMBER '0' (6, 7) (6, 8)
    COMMA ',' (6, 8) (6, 9)
    NUMBER '1' (6, 10) (6, 11)
    NEWLINE '' (6, 11) (6, 11)
    DEDENT '' (6, -1) (6, -1)
    """)

        code_no_cont = dedent("""
            def fib(n):
                '''Print a Fibonacci series up to n.'''
                a, b = 0, 1
        """)

        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))

        code = dedent("""
            pass
                \\

            pass
        """)

        self.check_tokenize(code, """\
    NAME 'pass' (2, 0) (2, 4)
    NEWLINE '' (2, 4) (2, 4)
    NAME 'pass' (5, 0) (5, 4)
    NEWLINE '' (5, 4) (5, 4)
    """)

        code_no_cont = dedent("""
            pass
            pass
        """)

        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))

        code = dedent("""
            if x:
                y = 1
                \\
                \\
                \\
                \\
                foo = 1
        """)

        self.check_tokenize(code, """\
    NAME 'if' (2, 0) (2, 2)
    NAME 'x' (2, 3) (2, 4)
    COLON ':' (2, 4) (2, 5)
    NEWLINE '' (2, 5) (2, 5)
    INDENT '' (3, -1) (3, -1)
    NAME 'y' (3, 4) (3, 5)
    EQUAL '=' (3, 6) (3, 7)
    NUMBER '1' (3, 8) (3, 9)
    NEWLINE '' (3, 9) (3, 9)
    NAME 'foo' (8, 4) (8, 7)
    EQUAL '=' (8, 8) (8, 9)
    NUMBER '1' (8, 10) (8, 11)
    NEWLINE '' (8, 11) (8, 11)
    DEDENT '' (8, -1) (8, -1)
    """)

        code_no_cont = dedent("""
            if x:
                y = 1
                foo = 1
        """)

        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))


class CTokenizerBufferTests(unittest.TestCase):
    def test_newline_at_the_end_of_buffer(self):
        # See issue 99581: Make sure that if we need to add a newline at the
        # end of the buffer, we have enough space in the buffer, especially
        # when the current line is as long as the buffer space available.
        test_script = f"""\
        #coding: latin-1
        #{"a"*10000}
        #{"a"*10002}"""
        with os_helper.temp_dir() as temp_dir:
            file_name = make_script(temp_dir, 'foo', test_script)
            run_test_script(file_name)


if __name__ == "__main__":
    unittest.main()