1from test import support
2from test.support import os_helper
3from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
4                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
5                     open as tokenize_open, Untokenizer, generate_tokens,
6                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT)
7from io import BytesIO, StringIO
8import unittest
9from textwrap import dedent
10from unittest import TestCase, mock
11from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
12                               INVALID_UNDERSCORE_LITERALS)
14from test.support.script_helper import run_test_script, make_script
15import os
16import token
17
# Converts a source string into a list of textual representations of the
# tokens (to make writing tests easier), such as:
# `    NAME       'if'          (1, 0) (1, 2)`
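#
# A minimal usage sketch (mirroring check_tokenize() below): build the token
# stream from an in-memory readline, e.g.
#
#     stringify_tokens_from_source(
#         tokenize(BytesIO("1 + 1".encode('utf-8')).readline), "1 + 1")
#
# returns a list of such rows (including the leading ENCODING row).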
22def stringify_tokens_from_source(token_generator, source_string):
23    result = []
24    num_lines = len(source_string.splitlines())
25    missing_trailing_nl = source_string[-1] not in '\r\n'
26
27    for type, token, start, end, line in token_generator:
28        if type == ENDMARKER:
29            break
        # Ignore the NEWLINE on the last line if the input lacks a trailing newline
31        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
32            continue
33        type = tok_name[type]
34        result.append(f"    {type:10} {token!r:13} {start} {end}")
35
36    return result
37
38class TokenizeTest(TestCase):
39    # Tests for the tokenize module.
40
41    # The tests can be really simple. Given a small fragment of source
42    # code, print out a table with tokens. The ENDMARKER, ENCODING and
43    # final NEWLINE are omitted for brevity.
44
45    def check_tokenize(self, s, expected):
        # Format the tokens from s as a table.
        # The ENDMARKER and final NEWLINE are omitted.
48        f = BytesIO(s.encode('utf-8'))
49        result = stringify_tokens_from_source(tokenize(f.readline), s)
50        self.assertEqual(result,
51                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
52                         expected.rstrip().splitlines())
53
54    def test_implicit_newline(self):
        # Make sure that the tokenizer emits an implicit NEWLINE
        # when the input lacks a trailing newline.
57        f = BytesIO("x".encode('utf-8'))
58        tokens = list(tokenize(f.readline))
59        self.assertEqual(tokens[-2].type, NEWLINE)
60        self.assertEqual(tokens[-1].type, ENDMARKER)
61
62    def test_basic(self):
63        self.check_tokenize("1 + 1", """\
64    NUMBER     '1'           (1, 0) (1, 1)
65    OP         '+'           (1, 2) (1, 3)
66    NUMBER     '1'           (1, 4) (1, 5)
67    """)
68        self.check_tokenize("if False:\n"
69                            "    # NL\n"
70                            "    \n"
71                            "    True = False # NEWLINE\n", """\
72    NAME       'if'          (1, 0) (1, 2)
73    NAME       'False'       (1, 3) (1, 8)
74    OP         ':'           (1, 8) (1, 9)
75    NEWLINE    '\\n'          (1, 9) (1, 10)
76    COMMENT    '# NL'        (2, 4) (2, 8)
77    NL         '\\n'          (2, 8) (2, 9)
78    NL         '\\n'          (3, 4) (3, 5)
79    INDENT     '    '        (4, 0) (4, 4)
80    NAME       'True'        (4, 4) (4, 8)
81    OP         '='           (4, 9) (4, 10)
82    NAME       'False'       (4, 11) (4, 16)
83    COMMENT    '# NEWLINE'   (4, 17) (4, 26)
84    NEWLINE    '\\n'          (4, 26) (4, 27)
85    DEDENT     ''            (5, 0) (5, 0)
86    """)
87        indent_error_file = b"""\
88def k(x):
89    x += 2
90  x += 5
91"""
92        readline = BytesIO(indent_error_file).readline
93        with self.assertRaisesRegex(IndentationError,
94                                    "unindent does not match any "
95                                    "outer indentation level"):
96            for tok in tokenize(readline):
97                pass
98
99    def test_int(self):
100        # Ordinary integers and binary operators
101        self.check_tokenize("0xff <= 255", """\
102    NUMBER     '0xff'        (1, 0) (1, 4)
103    OP         '<='          (1, 5) (1, 7)
104    NUMBER     '255'         (1, 8) (1, 11)
105    """)
106        self.check_tokenize("0b10 <= 255", """\
107    NUMBER     '0b10'        (1, 0) (1, 4)
108    OP         '<='          (1, 5) (1, 7)
109    NUMBER     '255'         (1, 8) (1, 11)
110    """)
111        self.check_tokenize("0o123 <= 0O123", """\
112    NUMBER     '0o123'       (1, 0) (1, 5)
113    OP         '<='          (1, 6) (1, 8)
114    NUMBER     '0O123'       (1, 9) (1, 14)
115    """)
116        self.check_tokenize("1234567 > ~0x15", """\
117    NUMBER     '1234567'     (1, 0) (1, 7)
118    OP         '>'           (1, 8) (1, 9)
119    OP         '~'           (1, 10) (1, 11)
120    NUMBER     '0x15'        (1, 11) (1, 15)
121    """)
122        self.check_tokenize("2134568 != 1231515", """\
123    NUMBER     '2134568'     (1, 0) (1, 7)
124    OP         '!='          (1, 8) (1, 10)
125    NUMBER     '1231515'     (1, 11) (1, 18)
126    """)
127        self.check_tokenize("(-124561-1) & 200000000", """\
128    OP         '('           (1, 0) (1, 1)
129    OP         '-'           (1, 1) (1, 2)
130    NUMBER     '124561'      (1, 2) (1, 8)
131    OP         '-'           (1, 8) (1, 9)
132    NUMBER     '1'           (1, 9) (1, 10)
133    OP         ')'           (1, 10) (1, 11)
134    OP         '&'           (1, 12) (1, 13)
135    NUMBER     '200000000'   (1, 14) (1, 23)
136    """)
137        self.check_tokenize("0xdeadbeef != -1", """\
138    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
139    OP         '!='          (1, 11) (1, 13)
140    OP         '-'           (1, 14) (1, 15)
141    NUMBER     '1'           (1, 15) (1, 16)
142    """)
143        self.check_tokenize("0xdeadc0de & 12345", """\
144    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
145    OP         '&'           (1, 11) (1, 12)
146    NUMBER     '12345'       (1, 13) (1, 18)
147    """)
148        self.check_tokenize("0xFF & 0x15 | 1234", """\
149    NUMBER     '0xFF'        (1, 0) (1, 4)
150    OP         '&'           (1, 5) (1, 6)
151    NUMBER     '0x15'        (1, 7) (1, 11)
152    OP         '|'           (1, 12) (1, 13)
153    NUMBER     '1234'        (1, 14) (1, 18)
154    """)
155
156    def test_long(self):
157        # Long integers
158        self.check_tokenize("x = 0", """\
159    NAME       'x'           (1, 0) (1, 1)
160    OP         '='           (1, 2) (1, 3)
161    NUMBER     '0'           (1, 4) (1, 5)
162    """)
163        self.check_tokenize("x = 0xfffffffffff", """\
164    NAME       'x'           (1, 0) (1, 1)
165    OP         '='           (1, 2) (1, 3)
166    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
167    """)
168        self.check_tokenize("x = 123141242151251616110", """\
169    NAME       'x'           (1, 0) (1, 1)
170    OP         '='           (1, 2) (1, 3)
171    NUMBER     '123141242151251616110' (1, 4) (1, 25)
172    """)
173        self.check_tokenize("x = -15921590215012591", """\
174    NAME       'x'           (1, 0) (1, 1)
175    OP         '='           (1, 2) (1, 3)
176    OP         '-'           (1, 4) (1, 5)
177    NUMBER     '15921590215012591' (1, 5) (1, 22)
178    """)
179
180    def test_float(self):
181        # Floating point numbers
182        self.check_tokenize("x = 3.14159", """\
183    NAME       'x'           (1, 0) (1, 1)
184    OP         '='           (1, 2) (1, 3)
185    NUMBER     '3.14159'     (1, 4) (1, 11)
186    """)
187        self.check_tokenize("x = 314159.", """\
188    NAME       'x'           (1, 0) (1, 1)
189    OP         '='           (1, 2) (1, 3)
190    NUMBER     '314159.'     (1, 4) (1, 11)
191    """)
192        self.check_tokenize("x = .314159", """\
193    NAME       'x'           (1, 0) (1, 1)
194    OP         '='           (1, 2) (1, 3)
195    NUMBER     '.314159'     (1, 4) (1, 11)
196    """)
197        self.check_tokenize("x = 3e14159", """\
198    NAME       'x'           (1, 0) (1, 1)
199    OP         '='           (1, 2) (1, 3)
200    NUMBER     '3e14159'     (1, 4) (1, 11)
201    """)
202        self.check_tokenize("x = 3E123", """\
203    NAME       'x'           (1, 0) (1, 1)
204    OP         '='           (1, 2) (1, 3)
205    NUMBER     '3E123'       (1, 4) (1, 9)
206    """)
207        self.check_tokenize("x+y = 3e-1230", """\
208    NAME       'x'           (1, 0) (1, 1)
209    OP         '+'           (1, 1) (1, 2)
210    NAME       'y'           (1, 2) (1, 3)
211    OP         '='           (1, 4) (1, 5)
212    NUMBER     '3e-1230'     (1, 6) (1, 13)
213    """)
214        self.check_tokenize("x = 3.14e159", """\
215    NAME       'x'           (1, 0) (1, 1)
216    OP         '='           (1, 2) (1, 3)
217    NUMBER     '3.14e159'    (1, 4) (1, 12)
218    """)
219
220    def test_underscore_literals(self):
221        def number_token(s):
222            f = BytesIO(s.encode('utf-8'))
223            for toktype, token, start, end, line in tokenize(f.readline):
224                if toktype == NUMBER:
225                    return token
226            return 'invalid token'
227        for lit in VALID_UNDERSCORE_LITERALS:
228            if '(' in lit:
                # Compound inputs (e.g. parenthesized complex literals) tokenize
                # into more than one token, so skip them here.
230                continue
231            self.assertEqual(number_token(lit), lit)
232        for lit in INVALID_UNDERSCORE_LITERALS:
233            self.assertNotEqual(number_token(lit), lit)
234
235    def test_string(self):
236        # String literals
237        self.check_tokenize("x = ''; y = \"\"", """\
238    NAME       'x'           (1, 0) (1, 1)
239    OP         '='           (1, 2) (1, 3)
240    STRING     "''"          (1, 4) (1, 6)
241    OP         ';'           (1, 6) (1, 7)
242    NAME       'y'           (1, 8) (1, 9)
243    OP         '='           (1, 10) (1, 11)
244    STRING     '""'          (1, 12) (1, 14)
245    """)
246        self.check_tokenize("x = '\"'; y = \"'\"", """\
247    NAME       'x'           (1, 0) (1, 1)
248    OP         '='           (1, 2) (1, 3)
249    STRING     '\\'"\\''       (1, 4) (1, 7)
250    OP         ';'           (1, 7) (1, 8)
251    NAME       'y'           (1, 9) (1, 10)
252    OP         '='           (1, 11) (1, 12)
253    STRING     '"\\'"'        (1, 13) (1, 16)
254    """)
255        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
256    NAME       'x'           (1, 0) (1, 1)
257    OP         '='           (1, 2) (1, 3)
258    STRING     '"doesn\\'t "' (1, 4) (1, 14)
259    NAME       'shrink'      (1, 14) (1, 20)
260    STRING     '", does it"' (1, 20) (1, 31)
261    """)
262        self.check_tokenize("x = 'abc' + 'ABC'", """\
263    NAME       'x'           (1, 0) (1, 1)
264    OP         '='           (1, 2) (1, 3)
265    STRING     "'abc'"       (1, 4) (1, 9)
266    OP         '+'           (1, 10) (1, 11)
267    STRING     "'ABC'"       (1, 12) (1, 17)
268    """)
269        self.check_tokenize('y = "ABC" + "ABC"', """\
270    NAME       'y'           (1, 0) (1, 1)
271    OP         '='           (1, 2) (1, 3)
272    STRING     '"ABC"'       (1, 4) (1, 9)
273    OP         '+'           (1, 10) (1, 11)
274    STRING     '"ABC"'       (1, 12) (1, 17)
275    """)
276        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
277    NAME       'x'           (1, 0) (1, 1)
278    OP         '='           (1, 2) (1, 3)
279    STRING     "r'abc'"      (1, 4) (1, 10)
280    OP         '+'           (1, 11) (1, 12)
281    STRING     "r'ABC'"      (1, 13) (1, 19)
282    OP         '+'           (1, 20) (1, 21)
283    STRING     "R'ABC'"      (1, 22) (1, 28)
284    OP         '+'           (1, 29) (1, 30)
285    STRING     "R'ABC'"      (1, 31) (1, 37)
286    """)
287        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
288    NAME       'y'           (1, 0) (1, 1)
289    OP         '='           (1, 2) (1, 3)
290    STRING     'r"abc"'      (1, 4) (1, 10)
291    OP         '+'           (1, 11) (1, 12)
292    STRING     'r"ABC"'      (1, 13) (1, 19)
293    OP         '+'           (1, 20) (1, 21)
294    STRING     'R"ABC"'      (1, 22) (1, 28)
295    OP         '+'           (1, 29) (1, 30)
296    STRING     'R"ABC"'      (1, 31) (1, 37)
297    """)
298
299        self.check_tokenize("u'abc' + U'abc'", """\
300    STRING     "u'abc'"      (1, 0) (1, 6)
301    OP         '+'           (1, 7) (1, 8)
302    STRING     "U'abc'"      (1, 9) (1, 15)
303    """)
304        self.check_tokenize('u"abc" + U"abc"', """\
305    STRING     'u"abc"'      (1, 0) (1, 6)
306    OP         '+'           (1, 7) (1, 8)
307    STRING     'U"abc"'      (1, 9) (1, 15)
308    """)
309
310        self.check_tokenize("b'abc' + B'abc'", """\
311    STRING     "b'abc'"      (1, 0) (1, 6)
312    OP         '+'           (1, 7) (1, 8)
313    STRING     "B'abc'"      (1, 9) (1, 15)
314    """)
315        self.check_tokenize('b"abc" + B"abc"', """\
316    STRING     'b"abc"'      (1, 0) (1, 6)
317    OP         '+'           (1, 7) (1, 8)
318    STRING     'B"abc"'      (1, 9) (1, 15)
319    """)
320        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
321    STRING     "br'abc'"     (1, 0) (1, 7)
322    OP         '+'           (1, 8) (1, 9)
323    STRING     "bR'abc'"     (1, 10) (1, 17)
324    OP         '+'           (1, 18) (1, 19)
325    STRING     "Br'abc'"     (1, 20) (1, 27)
326    OP         '+'           (1, 28) (1, 29)
327    STRING     "BR'abc'"     (1, 30) (1, 37)
328    """)
329        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
330    STRING     'br"abc"'     (1, 0) (1, 7)
331    OP         '+'           (1, 8) (1, 9)
332    STRING     'bR"abc"'     (1, 10) (1, 17)
333    OP         '+'           (1, 18) (1, 19)
334    STRING     'Br"abc"'     (1, 20) (1, 27)
335    OP         '+'           (1, 28) (1, 29)
336    STRING     'BR"abc"'     (1, 30) (1, 37)
337    """)
338        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
339    STRING     "rb'abc'"     (1, 0) (1, 7)
340    OP         '+'           (1, 8) (1, 9)
341    STRING     "rB'abc'"     (1, 10) (1, 17)
342    OP         '+'           (1, 18) (1, 19)
343    STRING     "Rb'abc'"     (1, 20) (1, 27)
344    OP         '+'           (1, 28) (1, 29)
345    STRING     "RB'abc'"     (1, 30) (1, 37)
346    """)
347        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
348    STRING     'rb"abc"'     (1, 0) (1, 7)
349    OP         '+'           (1, 8) (1, 9)
350    STRING     'rB"abc"'     (1, 10) (1, 17)
351    OP         '+'           (1, 18) (1, 19)
352    STRING     'Rb"abc"'     (1, 20) (1, 27)
353    OP         '+'           (1, 28) (1, 29)
354    STRING     'RB"abc"'     (1, 30) (1, 37)
355    """)
356        # Check 0, 1, and 2 character string prefixes.
357        self.check_tokenize(r'"a\
358de\
359fg"', """\
360    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
361    """)
362        self.check_tokenize(r'u"a\
363de"', """\
364    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
365    """)
366        self.check_tokenize(r'rb"a\
367d"', """\
368    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
369    """)
370        self.check_tokenize(r'"""a\
371b"""', """\
372    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
373    """)
374        self.check_tokenize(r'u"""a\
375b"""', """\
376    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
377    """)
378        self.check_tokenize(r'rb"""a\
379b\
380c"""', """\
381    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
382    """)
383        self.check_tokenize('f"abc"', """\
384    STRING     'f"abc"'      (1, 0) (1, 6)
385    """)
386        self.check_tokenize('fR"a{b}c"', """\
387    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
388    """)
389        self.check_tokenize('f"""abc"""', """\
390    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
391    """)
392        self.check_tokenize(r'f"abc\
393def"', """\
394    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
395    """)
396        self.check_tokenize(r'Rf"abc\
397def"', """\
398    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
399    """)
400
401    def test_function(self):
402        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
403    NAME       'def'         (1, 0) (1, 3)
404    NAME       'd22'         (1, 4) (1, 7)
405    OP         '('           (1, 7) (1, 8)
406    NAME       'a'           (1, 8) (1, 9)
407    OP         ','           (1, 9) (1, 10)
408    NAME       'b'           (1, 11) (1, 12)
409    OP         ','           (1, 12) (1, 13)
410    NAME       'c'           (1, 14) (1, 15)
411    OP         '='           (1, 15) (1, 16)
412    NUMBER     '2'           (1, 16) (1, 17)
413    OP         ','           (1, 17) (1, 18)
414    NAME       'd'           (1, 19) (1, 20)
415    OP         '='           (1, 20) (1, 21)
416    NUMBER     '2'           (1, 21) (1, 22)
417    OP         ','           (1, 22) (1, 23)
418    OP         '*'           (1, 24) (1, 25)
419    NAME       'k'           (1, 25) (1, 26)
420    OP         ')'           (1, 26) (1, 27)
421    OP         ':'           (1, 27) (1, 28)
422    NAME       'pass'        (1, 29) (1, 33)
423    """)
424        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
425    NAME       'def'         (1, 0) (1, 3)
426    NAME       'd01v_'       (1, 4) (1, 9)
427    OP         '('           (1, 9) (1, 10)
428    NAME       'a'           (1, 10) (1, 11)
429    OP         '='           (1, 11) (1, 12)
430    NUMBER     '1'           (1, 12) (1, 13)
431    OP         ','           (1, 13) (1, 14)
432    OP         '*'           (1, 15) (1, 16)
433    NAME       'k'           (1, 16) (1, 17)
434    OP         ','           (1, 17) (1, 18)
435    OP         '**'          (1, 19) (1, 21)
436    NAME       'w'           (1, 21) (1, 22)
437    OP         ')'           (1, 22) (1, 23)
438    OP         ':'           (1, 23) (1, 24)
439    NAME       'pass'        (1, 25) (1, 29)
440    """)
441        self.check_tokenize("def d23(a: str, b: int=3) -> int: pass", """\
442    NAME       'def'         (1, 0) (1, 3)
443    NAME       'd23'         (1, 4) (1, 7)
444    OP         '('           (1, 7) (1, 8)
445    NAME       'a'           (1, 8) (1, 9)
446    OP         ':'           (1, 9) (1, 10)
447    NAME       'str'         (1, 11) (1, 14)
448    OP         ','           (1, 14) (1, 15)
449    NAME       'b'           (1, 16) (1, 17)
450    OP         ':'           (1, 17) (1, 18)
451    NAME       'int'         (1, 19) (1, 22)
452    OP         '='           (1, 22) (1, 23)
453    NUMBER     '3'           (1, 23) (1, 24)
454    OP         ')'           (1, 24) (1, 25)
455    OP         '->'          (1, 26) (1, 28)
456    NAME       'int'         (1, 29) (1, 32)
457    OP         ':'           (1, 32) (1, 33)
458    NAME       'pass'        (1, 34) (1, 38)
459    """)
460
461    def test_comparison(self):
462        # Comparison
463        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
464                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
465    NAME       'if'          (1, 0) (1, 2)
466    NUMBER     '1'           (1, 3) (1, 4)
467    OP         '<'           (1, 5) (1, 6)
468    NUMBER     '1'           (1, 7) (1, 8)
469    OP         '>'           (1, 9) (1, 10)
470    NUMBER     '1'           (1, 11) (1, 12)
471    OP         '=='          (1, 13) (1, 15)
472    NUMBER     '1'           (1, 16) (1, 17)
473    OP         '>='          (1, 18) (1, 20)
474    NUMBER     '5'           (1, 21) (1, 22)
475    OP         '<='          (1, 23) (1, 25)
476    NUMBER     '0x15'        (1, 26) (1, 30)
477    OP         '<='          (1, 31) (1, 33)
478    NUMBER     '0x12'        (1, 34) (1, 38)
479    OP         '!='          (1, 39) (1, 41)
480    NUMBER     '1'           (1, 42) (1, 43)
481    NAME       'and'         (1, 44) (1, 47)
482    NUMBER     '5'           (1, 48) (1, 49)
483    NAME       'in'          (1, 50) (1, 52)
484    NUMBER     '1'           (1, 53) (1, 54)
485    NAME       'not'         (1, 55) (1, 58)
486    NAME       'in'          (1, 59) (1, 61)
487    NUMBER     '1'           (1, 62) (1, 63)
488    NAME       'is'          (1, 64) (1, 66)
489    NUMBER     '1'           (1, 67) (1, 68)
490    NAME       'or'          (1, 69) (1, 71)
491    NUMBER     '5'           (1, 72) (1, 73)
492    NAME       'is'          (1, 74) (1, 76)
493    NAME       'not'         (1, 77) (1, 80)
494    NUMBER     '1'           (1, 81) (1, 82)
495    OP         ':'           (1, 82) (1, 83)
496    NAME       'pass'        (1, 84) (1, 88)
497    """)
498
499    def test_shift(self):
500        # Shift
501        self.check_tokenize("x = 1 << 1 >> 5", """\
502    NAME       'x'           (1, 0) (1, 1)
503    OP         '='           (1, 2) (1, 3)
504    NUMBER     '1'           (1, 4) (1, 5)
505    OP         '<<'          (1, 6) (1, 8)
506    NUMBER     '1'           (1, 9) (1, 10)
507    OP         '>>'          (1, 11) (1, 13)
508    NUMBER     '5'           (1, 14) (1, 15)
509    """)
510
511    def test_additive(self):
512        # Additive
513        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
514    NAME       'x'           (1, 0) (1, 1)
515    OP         '='           (1, 2) (1, 3)
516    NUMBER     '1'           (1, 4) (1, 5)
517    OP         '-'           (1, 6) (1, 7)
518    NAME       'y'           (1, 8) (1, 9)
519    OP         '+'           (1, 10) (1, 11)
520    NUMBER     '15'          (1, 12) (1, 14)
521    OP         '-'           (1, 15) (1, 16)
522    NUMBER     '1'           (1, 17) (1, 18)
523    OP         '+'           (1, 19) (1, 20)
524    NUMBER     '0x124'       (1, 21) (1, 26)
525    OP         '+'           (1, 27) (1, 28)
526    NAME       'z'           (1, 29) (1, 30)
527    OP         '+'           (1, 31) (1, 32)
528    NAME       'a'           (1, 33) (1, 34)
529    OP         '['           (1, 34) (1, 35)
530    NUMBER     '5'           (1, 35) (1, 36)
531    OP         ']'           (1, 36) (1, 37)
532    """)
533
534    def test_multiplicative(self):
535        # Multiplicative
536        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
537    NAME       'x'           (1, 0) (1, 1)
538    OP         '='           (1, 2) (1, 3)
539    NUMBER     '1'           (1, 4) (1, 5)
540    OP         '//'          (1, 5) (1, 7)
541    NUMBER     '1'           (1, 7) (1, 8)
542    OP         '*'           (1, 8) (1, 9)
543    NUMBER     '1'           (1, 9) (1, 10)
544    OP         '/'           (1, 10) (1, 11)
545    NUMBER     '5'           (1, 11) (1, 12)
546    OP         '*'           (1, 12) (1, 13)
547    NUMBER     '12'          (1, 13) (1, 15)
548    OP         '%'           (1, 15) (1, 16)
549    NUMBER     '0x12'        (1, 16) (1, 20)
550    OP         '@'           (1, 20) (1, 21)
551    NUMBER     '42'          (1, 21) (1, 23)
552    """)
553
554    def test_unary(self):
555        # Unary
556        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
557    OP         '~'           (1, 0) (1, 1)
558    NUMBER     '1'           (1, 1) (1, 2)
559    OP         '^'           (1, 3) (1, 4)
560    NUMBER     '1'           (1, 5) (1, 6)
561    OP         '&'           (1, 7) (1, 8)
562    NUMBER     '1'           (1, 9) (1, 10)
563    OP         '|'           (1, 11) (1, 12)
564    NUMBER     '1'           (1, 12) (1, 13)
565    OP         '^'           (1, 14) (1, 15)
566    OP         '-'           (1, 16) (1, 17)
567    NUMBER     '1'           (1, 17) (1, 18)
568    """)
569        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
570    OP         '-'           (1, 0) (1, 1)
571    NUMBER     '1'           (1, 1) (1, 2)
572    OP         '*'           (1, 2) (1, 3)
573    NUMBER     '1'           (1, 3) (1, 4)
574    OP         '/'           (1, 4) (1, 5)
575    NUMBER     '1'           (1, 5) (1, 6)
576    OP         '+'           (1, 6) (1, 7)
577    NUMBER     '1'           (1, 7) (1, 8)
578    OP         '*'           (1, 8) (1, 9)
579    NUMBER     '1'           (1, 9) (1, 10)
580    OP         '//'          (1, 10) (1, 12)
581    NUMBER     '1'           (1, 12) (1, 13)
582    OP         '-'           (1, 14) (1, 15)
583    OP         '-'           (1, 16) (1, 17)
584    OP         '-'           (1, 17) (1, 18)
585    OP         '-'           (1, 18) (1, 19)
586    NUMBER     '1'           (1, 19) (1, 20)
587    OP         '**'          (1, 20) (1, 22)
588    NUMBER     '1'           (1, 22) (1, 23)
589    """)
590
591    def test_selector(self):
592        # Selector
593        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
594    NAME       'import'      (1, 0) (1, 6)
595    NAME       'sys'         (1, 7) (1, 10)
596    OP         ','           (1, 10) (1, 11)
597    NAME       'time'        (1, 12) (1, 16)
598    NEWLINE    '\\n'          (1, 16) (1, 17)
599    NAME       'x'           (2, 0) (2, 1)
600    OP         '='           (2, 2) (2, 3)
601    NAME       'sys'         (2, 4) (2, 7)
602    OP         '.'           (2, 7) (2, 8)
603    NAME       'modules'     (2, 8) (2, 15)
604    OP         '['           (2, 15) (2, 16)
605    STRING     "'time'"      (2, 16) (2, 22)
606    OP         ']'           (2, 22) (2, 23)
607    OP         '.'           (2, 23) (2, 24)
608    NAME       'time'        (2, 24) (2, 28)
609    OP         '('           (2, 28) (2, 29)
610    OP         ')'           (2, 29) (2, 30)
611    """)
612
613    def test_method(self):
614        # Methods
615        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
616    OP         '@'           (1, 0) (1, 1)
617    NAME       'staticmethod' (1, 1) (1, 13)
618    NEWLINE    '\\n'          (1, 13) (1, 14)
619    NAME       'def'         (2, 0) (2, 3)
620    NAME       'foo'         (2, 4) (2, 7)
621    OP         '('           (2, 7) (2, 8)
622    NAME       'x'           (2, 8) (2, 9)
623    OP         ','           (2, 9) (2, 10)
624    NAME       'y'           (2, 10) (2, 11)
625    OP         ')'           (2, 11) (2, 12)
626    OP         ':'           (2, 12) (2, 13)
627    NAME       'pass'        (2, 14) (2, 18)
628    """)
629
630    def test_tabs(self):
631        # Evil tabs
632        self.check_tokenize("def f():\n"
633                            "\tif x\n"
634                            "        \tpass", """\
635    NAME       'def'         (1, 0) (1, 3)
636    NAME       'f'           (1, 4) (1, 5)
637    OP         '('           (1, 5) (1, 6)
638    OP         ')'           (1, 6) (1, 7)
639    OP         ':'           (1, 7) (1, 8)
640    NEWLINE    '\\n'          (1, 8) (1, 9)
641    INDENT     '\\t'          (2, 0) (2, 1)
642    NAME       'if'          (2, 1) (2, 3)
643    NAME       'x'           (2, 4) (2, 5)
644    NEWLINE    '\\n'          (2, 5) (2, 6)
645    INDENT     '        \\t'  (3, 0) (3, 9)
646    NAME       'pass'        (3, 9) (3, 13)
647    DEDENT     ''            (4, 0) (4, 0)
648    DEDENT     ''            (4, 0) (4, 0)
649    """)
650
651    def test_non_ascii_identifiers(self):
652        # Non-ascii identifiers
653        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
654    NAME       'Örter'       (1, 0) (1, 5)
655    OP         '='           (1, 6) (1, 7)
656    STRING     "'places'"    (1, 8) (1, 16)
657    NEWLINE    '\\n'          (1, 16) (1, 17)
658    NAME       'grün'        (2, 0) (2, 4)
659    OP         '='           (2, 5) (2, 6)
660    STRING     "'green'"     (2, 7) (2, 14)
661    """)
662
663    def test_unicode(self):
664        # Legacy unicode literals:
665        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
666    NAME       'Örter'       (1, 0) (1, 5)
667    OP         '='           (1, 6) (1, 7)
668    STRING     "u'places'"   (1, 8) (1, 17)
669    NEWLINE    '\\n'          (1, 17) (1, 18)
670    NAME       'grün'        (2, 0) (2, 4)
671    OP         '='           (2, 5) (2, 6)
672    STRING     "U'green'"    (2, 7) (2, 15)
673    """)
674
675    def test_async(self):
676        # Async/await extension:
677        self.check_tokenize("async = 1", """\
678    NAME       'async'       (1, 0) (1, 5)
679    OP         '='           (1, 6) (1, 7)
680    NUMBER     '1'           (1, 8) (1, 9)
681    """)
682
683        self.check_tokenize("a = (async = 1)", """\
684    NAME       'a'           (1, 0) (1, 1)
685    OP         '='           (1, 2) (1, 3)
686    OP         '('           (1, 4) (1, 5)
687    NAME       'async'       (1, 5) (1, 10)
688    OP         '='           (1, 11) (1, 12)
689    NUMBER     '1'           (1, 13) (1, 14)
690    OP         ')'           (1, 14) (1, 15)
691    """)
692
693        self.check_tokenize("async()", """\
694    NAME       'async'       (1, 0) (1, 5)
695    OP         '('           (1, 5) (1, 6)
696    OP         ')'           (1, 6) (1, 7)
697    """)
698
699        self.check_tokenize("class async(Bar):pass", """\
700    NAME       'class'       (1, 0) (1, 5)
701    NAME       'async'       (1, 6) (1, 11)
702    OP         '('           (1, 11) (1, 12)
703    NAME       'Bar'         (1, 12) (1, 15)
704    OP         ')'           (1, 15) (1, 16)
705    OP         ':'           (1, 16) (1, 17)
706    NAME       'pass'        (1, 17) (1, 21)
707    """)
708
709        self.check_tokenize("class async:pass", """\
710    NAME       'class'       (1, 0) (1, 5)
711    NAME       'async'       (1, 6) (1, 11)
712    OP         ':'           (1, 11) (1, 12)
713    NAME       'pass'        (1, 12) (1, 16)
714    """)
715
716        self.check_tokenize("await = 1", """\
717    NAME       'await'       (1, 0) (1, 5)
718    OP         '='           (1, 6) (1, 7)
719    NUMBER     '1'           (1, 8) (1, 9)
720    """)
721
722        self.check_tokenize("foo.async", """\
723    NAME       'foo'         (1, 0) (1, 3)
724    OP         '.'           (1, 3) (1, 4)
725    NAME       'async'       (1, 4) (1, 9)
726    """)
727
728        self.check_tokenize("async for a in b: pass", """\
729    NAME       'async'       (1, 0) (1, 5)
730    NAME       'for'         (1, 6) (1, 9)
731    NAME       'a'           (1, 10) (1, 11)
732    NAME       'in'          (1, 12) (1, 14)
733    NAME       'b'           (1, 15) (1, 16)
734    OP         ':'           (1, 16) (1, 17)
735    NAME       'pass'        (1, 18) (1, 22)
736    """)
737
738        self.check_tokenize("async with a as b: pass", """\
739    NAME       'async'       (1, 0) (1, 5)
740    NAME       'with'        (1, 6) (1, 10)
741    NAME       'a'           (1, 11) (1, 12)
742    NAME       'as'          (1, 13) (1, 15)
743    NAME       'b'           (1, 16) (1, 17)
744    OP         ':'           (1, 17) (1, 18)
745    NAME       'pass'        (1, 19) (1, 23)
746    """)
747
748        self.check_tokenize("async.foo", """\
749    NAME       'async'       (1, 0) (1, 5)
750    OP         '.'           (1, 5) (1, 6)
751    NAME       'foo'         (1, 6) (1, 9)
752    """)
753
754        self.check_tokenize("async", """\
755    NAME       'async'       (1, 0) (1, 5)
756    """)
757
758        self.check_tokenize("async\n#comment\nawait", """\
759    NAME       'async'       (1, 0) (1, 5)
760    NEWLINE    '\\n'          (1, 5) (1, 6)
761    COMMENT    '#comment'    (2, 0) (2, 8)
762    NL         '\\n'          (2, 8) (2, 9)
763    NAME       'await'       (3, 0) (3, 5)
764    """)
765
766        self.check_tokenize("async\n...\nawait", """\
767    NAME       'async'       (1, 0) (1, 5)
768    NEWLINE    '\\n'          (1, 5) (1, 6)
769    OP         '...'         (2, 0) (2, 3)
770    NEWLINE    '\\n'          (2, 3) (2, 4)
771    NAME       'await'       (3, 0) (3, 5)
772    """)
773
774        self.check_tokenize("async\nawait", """\
775    NAME       'async'       (1, 0) (1, 5)
776    NEWLINE    '\\n'          (1, 5) (1, 6)
777    NAME       'await'       (2, 0) (2, 5)
778    """)
779
780        self.check_tokenize("foo.async + 1", """\
781    NAME       'foo'         (1, 0) (1, 3)
782    OP         '.'           (1, 3) (1, 4)
783    NAME       'async'       (1, 4) (1, 9)
784    OP         '+'           (1, 10) (1, 11)
785    NUMBER     '1'           (1, 12) (1, 13)
786    """)
787
788        self.check_tokenize("async def foo(): pass", """\
789    NAME       'async'       (1, 0) (1, 5)
790    NAME       'def'         (1, 6) (1, 9)
791    NAME       'foo'         (1, 10) (1, 13)
792    OP         '('           (1, 13) (1, 14)
793    OP         ')'           (1, 14) (1, 15)
794    OP         ':'           (1, 15) (1, 16)
795    NAME       'pass'        (1, 17) (1, 21)
796    """)
797
798        self.check_tokenize('''\
799async def foo():
800  def foo(await):
801    await = 1
802  if 1:
803    await
804async += 1
805''', """\
806    NAME       'async'       (1, 0) (1, 5)
807    NAME       'def'         (1, 6) (1, 9)
808    NAME       'foo'         (1, 10) (1, 13)
809    OP         '('           (1, 13) (1, 14)
810    OP         ')'           (1, 14) (1, 15)
811    OP         ':'           (1, 15) (1, 16)
812    NEWLINE    '\\n'          (1, 16) (1, 17)
813    INDENT     '  '          (2, 0) (2, 2)
814    NAME       'def'         (2, 2) (2, 5)
815    NAME       'foo'         (2, 6) (2, 9)
816    OP         '('           (2, 9) (2, 10)
817    NAME       'await'       (2, 10) (2, 15)
818    OP         ')'           (2, 15) (2, 16)
819    OP         ':'           (2, 16) (2, 17)
820    NEWLINE    '\\n'          (2, 17) (2, 18)
821    INDENT     '    '        (3, 0) (3, 4)
822    NAME       'await'       (3, 4) (3, 9)
823    OP         '='           (3, 10) (3, 11)
824    NUMBER     '1'           (3, 12) (3, 13)
825    NEWLINE    '\\n'          (3, 13) (3, 14)
826    DEDENT     ''            (4, 2) (4, 2)
827    NAME       'if'          (4, 2) (4, 4)
828    NUMBER     '1'           (4, 5) (4, 6)
829    OP         ':'           (4, 6) (4, 7)
830    NEWLINE    '\\n'          (4, 7) (4, 8)
831    INDENT     '    '        (5, 0) (5, 4)
832    NAME       'await'       (5, 4) (5, 9)
833    NEWLINE    '\\n'          (5, 9) (5, 10)
834    DEDENT     ''            (6, 0) (6, 0)
835    DEDENT     ''            (6, 0) (6, 0)
836    NAME       'async'       (6, 0) (6, 5)
837    OP         '+='          (6, 6) (6, 8)
838    NUMBER     '1'           (6, 9) (6, 10)
839    NEWLINE    '\\n'          (6, 10) (6, 11)
840    """)
841
842        self.check_tokenize('''\
843async def foo():
844  async for i in 1: pass''', """\
845    NAME       'async'       (1, 0) (1, 5)
846    NAME       'def'         (1, 6) (1, 9)
847    NAME       'foo'         (1, 10) (1, 13)
848    OP         '('           (1, 13) (1, 14)
849    OP         ')'           (1, 14) (1, 15)
850    OP         ':'           (1, 15) (1, 16)
851    NEWLINE    '\\n'          (1, 16) (1, 17)
852    INDENT     '  '          (2, 0) (2, 2)
853    NAME       'async'       (2, 2) (2, 7)
854    NAME       'for'         (2, 8) (2, 11)
855    NAME       'i'           (2, 12) (2, 13)
856    NAME       'in'          (2, 14) (2, 16)
857    NUMBER     '1'           (2, 17) (2, 18)
858    OP         ':'           (2, 18) (2, 19)
859    NAME       'pass'        (2, 20) (2, 24)
860    DEDENT     ''            (3, 0) (3, 0)
861    """)
862
863        self.check_tokenize('''async def foo(async): await''', """\
864    NAME       'async'       (1, 0) (1, 5)
865    NAME       'def'         (1, 6) (1, 9)
866    NAME       'foo'         (1, 10) (1, 13)
867    OP         '('           (1, 13) (1, 14)
868    NAME       'async'       (1, 14) (1, 19)
869    OP         ')'           (1, 19) (1, 20)
870    OP         ':'           (1, 20) (1, 21)
871    NAME       'await'       (1, 22) (1, 27)
872    """)
873
874        self.check_tokenize('''\
875def f():
876
877  def baz(): pass
878  async def bar(): pass
879
880  await = 2''', """\
881    NAME       'def'         (1, 0) (1, 3)
882    NAME       'f'           (1, 4) (1, 5)
883    OP         '('           (1, 5) (1, 6)
884    OP         ')'           (1, 6) (1, 7)
885    OP         ':'           (1, 7) (1, 8)
886    NEWLINE    '\\n'          (1, 8) (1, 9)
887    NL         '\\n'          (2, 0) (2, 1)
888    INDENT     '  '          (3, 0) (3, 2)
889    NAME       'def'         (3, 2) (3, 5)
890    NAME       'baz'         (3, 6) (3, 9)
891    OP         '('           (3, 9) (3, 10)
892    OP         ')'           (3, 10) (3, 11)
893    OP         ':'           (3, 11) (3, 12)
894    NAME       'pass'        (3, 13) (3, 17)
895    NEWLINE    '\\n'          (3, 17) (3, 18)
896    NAME       'async'       (4, 2) (4, 7)
897    NAME       'def'         (4, 8) (4, 11)
898    NAME       'bar'         (4, 12) (4, 15)
899    OP         '('           (4, 15) (4, 16)
900    OP         ')'           (4, 16) (4, 17)
901    OP         ':'           (4, 17) (4, 18)
902    NAME       'pass'        (4, 19) (4, 23)
903    NEWLINE    '\\n'          (4, 23) (4, 24)
904    NL         '\\n'          (5, 0) (5, 1)
905    NAME       'await'       (6, 2) (6, 7)
906    OP         '='           (6, 8) (6, 9)
907    NUMBER     '2'           (6, 10) (6, 11)
908    DEDENT     ''            (7, 0) (7, 0)
909    """)
910
911        self.check_tokenize('''\
912async def f():
913
914  def baz(): pass
915  async def bar(): pass
916
917  await = 2''', """\
918    NAME       'async'       (1, 0) (1, 5)
919    NAME       'def'         (1, 6) (1, 9)
920    NAME       'f'           (1, 10) (1, 11)
921    OP         '('           (1, 11) (1, 12)
922    OP         ')'           (1, 12) (1, 13)
923    OP         ':'           (1, 13) (1, 14)
924    NEWLINE    '\\n'          (1, 14) (1, 15)
925    NL         '\\n'          (2, 0) (2, 1)
926    INDENT     '  '          (3, 0) (3, 2)
927    NAME       'def'         (3, 2) (3, 5)
928    NAME       'baz'         (3, 6) (3, 9)
929    OP         '('           (3, 9) (3, 10)
930    OP         ')'           (3, 10) (3, 11)
931    OP         ':'           (3, 11) (3, 12)
932    NAME       'pass'        (3, 13) (3, 17)
933    NEWLINE    '\\n'          (3, 17) (3, 18)
934    NAME       'async'       (4, 2) (4, 7)
935    NAME       'def'         (4, 8) (4, 11)
936    NAME       'bar'         (4, 12) (4, 15)
937    OP         '('           (4, 15) (4, 16)
938    OP         ')'           (4, 16) (4, 17)
939    OP         ':'           (4, 17) (4, 18)
940    NAME       'pass'        (4, 19) (4, 23)
941    NEWLINE    '\\n'          (4, 23) (4, 24)
942    NL         '\\n'          (5, 0) (5, 1)
943    NAME       'await'       (6, 2) (6, 7)
944    OP         '='           (6, 8) (6, 9)
945    NUMBER     '2'           (6, 10) (6, 11)
946    DEDENT     ''            (7, 0) (7, 0)
947    """)
948
949class GenerateTokensTest(TokenizeTest):
950    def check_tokenize(self, s, expected):
        # Format the tokens from s as a table.
        # The ENDMARKER and final NEWLINE are omitted.
953        f = StringIO(s)
954        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
955        self.assertEqual(result, expected.rstrip().splitlines())
956
957
958def decistmt(s):
959    result = []
960    g = tokenize(BytesIO(s.encode('utf-8')).readline)   # tokenize the string
961    for toknum, tokval, _, _, _  in g:
962        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
963            result.extend([
964                (NAME, 'Decimal'),
965                (OP, '('),
966                (STRING, repr(tokval)),
967                (OP, ')')
968            ])
969        else:
970            result.append((toknum, tokval))
971    return untokenize(result).decode('utf-8')
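# A small illustration of this recipe (it is the docs example asserted in
# TestMisc.test_decistmt below):
#     decistmt('+21.3e-5*-.1234/81.7')
# returns "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"; untokenize()
# is fed plain 2-tuples here, so it chooses the inter-token spacing itself.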
972
973class TestMisc(TestCase):
974
975    def test_decistmt(self):
976        # Substitute Decimals for floats in a string of statements.
977        # This is an example from the docs.
978
979        from decimal import Decimal
980        s = '+21.3e-5*-.1234/81.7'
981        self.assertEqual(decistmt(s),
982                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")
983
984        # The format of the exponent is inherited from the platform C library.
985        # Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
986        # we're only showing 11 digits, and the 12th isn't close to 5, the
987        # rest of the output should be platform-independent.
988        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')
989
990        # Output from calculations with Decimal should be identical across all
991        # platforms.
992        self.assertEqual(eval(decistmt(s)),
993                         Decimal('-3.217160342717258261933904529E-7'))
994
995
996class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that the tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """
1000
1001    def _testFile(self, filename):
1002        path = os.path.join(os.path.dirname(__file__), filename)
1003        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
1004
1005    def test_utf8_coding_cookie_and_no_utf8_bom(self):
1006        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
1007        self._testFile(f)
1008
1009    def test_latin1_coding_cookie_and_utf8_bom(self):
1010        """
1011        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
1012        allowed encoding for the comment is 'utf-8'.  The text file used in
1013        this test starts with a BOM signature, but specifies latin1 as the
1014        coding, so verify that a SyntaxError is raised, which matches the
1015        behaviour of the interpreter when it encounters a similar condition.
1016        """
1017        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
1018        self.assertRaises(SyntaxError, self._testFile, f)
1019
1020    def test_no_coding_cookie_and_utf8_bom(self):
1021        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
1022        self._testFile(f)
1023
1024    def test_utf8_coding_cookie_and_utf8_bom(self):
1025        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
1026        self._testFile(f)
1027
1028    def test_bad_coding_cookie(self):
1029        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
1030        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')
1031
1032
1033class Test_Tokenize(TestCase):
1034
1035    def test__tokenize_decodes_with_specified_encoding(self):
1036        literal = '"ЉЊЈЁЂ"'
1037        line = literal.encode('utf-8')
1038        first = False
1039        def readline():
1040            nonlocal first
1041            if not first:
1042                first = True
1043                return line
1044            else:
1045                return b''
1046
1047        # skip the initial encoding token and the end tokens
1048        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
1049        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
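        # 3 is token.STRING; the 5-tuple fields are (type, string, start, end, line).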
1050        self.assertEqual(tokens, expected_tokens,
1051                         "bytes not decoded with encoding")
1052
1053    def test__tokenize_does_not_decode_with_encoding_none(self):
1054        literal = '"ЉЊЈЁЂ"'
1055        first = False
1056        def readline():
1057            nonlocal first
1058            if not first:
1059                first = True
1060                return literal
1061            else:
1062                return b''
1063
1064        # skip the end tokens
1065        tokens = list(_tokenize(readline, encoding=None))[:-2]
1066        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
1067        self.assertEqual(tokens, expected_tokens,
1068                         "string not tokenized when encoding is None")
1069
1070
1071class TestDetectEncoding(TestCase):
1072
1073    def get_readline(self, lines):
1074        index = 0
1075        def readline():
1076            nonlocal index
1077            if index == len(lines):
1078                raise StopIteration
1079            line = lines[index]
1080            index += 1
1081            return line
1082        return readline
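    # detect_encoding() calls readline() at most twice, so the tests below also
    # assert exactly which (still undecoded) lines were consumed; a UTF-8 BOM is
    # stripped from those lines.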
1083
1084    def test_no_bom_no_encoding_cookie(self):
1085        lines = (
1086            b'# something\n',
1087            b'print(something)\n',
1088            b'do_something(else)\n'
1089        )
1090        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1091        self.assertEqual(encoding, 'utf-8')
1092        self.assertEqual(consumed_lines, list(lines[:2]))
1093
1094    def test_bom_no_cookie(self):
1095        lines = (
1096            b'\xef\xbb\xbf# something\n',
1097            b'print(something)\n',
1098            b'do_something(else)\n'
1099        )
1100        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1101        self.assertEqual(encoding, 'utf-8-sig')
1102        self.assertEqual(consumed_lines,
1103                         [b'# something\n', b'print(something)\n'])
1104
1105    def test_cookie_first_line_no_bom(self):
1106        lines = (
1107            b'# -*- coding: latin-1 -*-\n',
1108            b'print(something)\n',
1109            b'do_something(else)\n'
1110        )
1111        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1112        self.assertEqual(encoding, 'iso-8859-1')
1113        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
1114
1115    def test_matched_bom_and_cookie_first_line(self):
1116        lines = (
1117            b'\xef\xbb\xbf# coding=utf-8\n',
1118            b'print(something)\n',
1119            b'do_something(else)\n'
1120        )
1121        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1122        self.assertEqual(encoding, 'utf-8-sig')
1123        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
1124
1125    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
1126        lines = (
1127            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
1128            b'print(something)\n',
1129            b'do_something(else)\n'
1130        )
1131        readline = self.get_readline(lines)
1132        self.assertRaises(SyntaxError, detect_encoding, readline)
1133
1134    def test_cookie_second_line_no_bom(self):
1135        lines = (
1136            b'#! something\n',
1137            b'# vim: set fileencoding=ascii :\n',
1138            b'print(something)\n',
1139            b'do_something(else)\n'
1140        )
1141        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1142        self.assertEqual(encoding, 'ascii')
1143        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
1144        self.assertEqual(consumed_lines, expected)
1145
1146    def test_matched_bom_and_cookie_second_line(self):
1147        lines = (
1148            b'\xef\xbb\xbf#! something\n',
            b'# coding=utf-8\n',
1150            b'print(something)\n',
1151            b'do_something(else)\n'
1152        )
1153        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1154        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'# coding=utf-8\n'])
1157
1158    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
1159        lines = (
1160            b'\xef\xbb\xbf#! something\n',
1161            b'# vim: set fileencoding=ascii :\n',
1162            b'print(something)\n',
1163            b'do_something(else)\n'
1164        )
1165        readline = self.get_readline(lines)
1166        self.assertRaises(SyntaxError, detect_encoding, readline)
1167
1168    def test_cookie_second_line_noncommented_first_line(self):
1169        lines = (
1170            b"print('\xc2\xa3')\n",
1171            b'# vim: set fileencoding=iso8859-15 :\n',
1172            b"print('\xe2\x82\xac')\n"
1173        )
1174        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1175        self.assertEqual(encoding, 'utf-8')
1176        expected = [b"print('\xc2\xa3')\n"]
1177        self.assertEqual(consumed_lines, expected)
1178
1179    def test_cookie_second_line_commented_first_line(self):
1180        lines = (
1181            b"#print('\xc2\xa3')\n",
1182            b'# vim: set fileencoding=iso8859-15 :\n',
1183            b"print('\xe2\x82\xac')\n"
1184        )
1185        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1186        self.assertEqual(encoding, 'iso8859-15')
1187        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
1188        self.assertEqual(consumed_lines, expected)
1189
1190    def test_cookie_second_line_empty_first_line(self):
1191        lines = (
1192            b'\n',
1193            b'# vim: set fileencoding=iso8859-15 :\n',
1194            b"print('\xe2\x82\xac')\n"
1195        )
1196        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1197        self.assertEqual(encoding, 'iso8859-15')
1198        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
1199        self.assertEqual(consumed_lines, expected)
1200
1201    def test_latin1_normalization(self):
1202        # See get_normal_name() in tokenizer.c.
1203        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
1204                     "iso-8859-1-unix", "iso-latin-1-mac")
1205        for encoding in encodings:
1206            for rep in ("-", "_"):
1207                enc = encoding.replace("-", rep)
1208                lines = (b"#!/usr/bin/python\n",
1209                         b"# coding: " + enc.encode("ascii") + b"\n",
1210                         b"print(things)\n",
1211                         b"do_something += 4\n")
1212                rl = self.get_readline(lines)
1213                found, consumed_lines = detect_encoding(rl)
1214                self.assertEqual(found, "iso-8859-1")
1215
1216    def test_syntaxerror_latin1(self):
1217        # Issue 14629: need to raise SyntaxError if the first
1218        # line(s) have non-UTF-8 characters
1219        lines = (
1220            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1221            )
1222        readline = self.get_readline(lines)
1223        self.assertRaises(SyntaxError, detect_encoding, readline)
1224
1225
1226    def test_utf8_normalization(self):
1227        # See get_normal_name() in tokenizer.c.
1228        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
1229        for encoding in encodings:
1230            for rep in ("-", "_"):
1231                enc = encoding.replace("-", rep)
1232                lines = (b"#!/usr/bin/python\n",
1233                         b"# coding: " + enc.encode("ascii") + b"\n",
1234                         b"1 + 3\n")
1235                rl = self.get_readline(lines)
1236                found, consumed_lines = detect_encoding(rl)
1237                self.assertEqual(found, "utf-8")
1238
1239    def test_short_files(self):
1240        readline = self.get_readline((b'print(something)\n',))
1241        encoding, consumed_lines = detect_encoding(readline)
1242        self.assertEqual(encoding, 'utf-8')
1243        self.assertEqual(consumed_lines, [b'print(something)\n'])
1244
1245        encoding, consumed_lines = detect_encoding(self.get_readline(()))
1246        self.assertEqual(encoding, 'utf-8')
1247        self.assertEqual(consumed_lines, [])
1248
1249        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
1250        encoding, consumed_lines = detect_encoding(readline)
1251        self.assertEqual(encoding, 'utf-8-sig')
1252        self.assertEqual(consumed_lines, [b'print(something)\n'])
1253
1254        readline = self.get_readline((b'\xef\xbb\xbf',))
1255        encoding, consumed_lines = detect_encoding(readline)
1256        self.assertEqual(encoding, 'utf-8-sig')
1257        self.assertEqual(consumed_lines, [])
1258
1259        readline = self.get_readline((b'# coding: bad\n',))
1260        self.assertRaises(SyntaxError, detect_encoding, readline)
1261
1262    def test_false_encoding(self):
1263        # Issue 18873: "Encoding" detected in non-comment lines
1264        readline = self.get_readline((b'print("#coding=fake")',))
1265        encoding, consumed_lines = detect_encoding(readline)
1266        self.assertEqual(encoding, 'utf-8')
1267        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])
1268
1269    def test_open(self):
1270        filename = os_helper.TESTFN + '.py'
1271        self.addCleanup(os_helper.unlink, filename)
1272
1273        # test coding cookie
1274        for encoding in ('iso-8859-15', 'utf-8'):
1275            with open(filename, 'w', encoding=encoding) as fp:
1276                print("# coding: %s" % encoding, file=fp)
1277                print("print('euro:\u20ac')", file=fp)
1278            with tokenize_open(filename) as fp:
1279                self.assertEqual(fp.encoding, encoding)
1280                self.assertEqual(fp.mode, 'r')
1281
1282        # test BOM (no coding cookie)
1283        with open(filename, 'w', encoding='utf-8-sig') as fp:
1284            print("print('euro:\u20ac')", file=fp)
1285        with tokenize_open(filename) as fp:
1286            self.assertEqual(fp.encoding, 'utf-8-sig')
1287            self.assertEqual(fp.mode, 'r')
1288
1289    def test_filename_in_exception(self):
1290        # When possible, include the file name in the exception.
1291        path = 'some_file_path'
1292        lines = (
1293            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1294            )
1295        class Bunk:
1296            def __init__(self, lines, path):
1297                self.name = path
1298                self._lines = lines
1299                self._index = 0
1300
            def readline(self):
                if self._index == len(self._lines):
                    raise StopIteration
                line = self._lines[self._index]
                self._index += 1
                return line
1307
1308        with self.assertRaises(SyntaxError):
1309            ins = Bunk(lines, path)
1310            # Make sure lacking a name isn't an issue.
1311            del ins.name
1312            detect_encoding(ins.readline)
1313        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
1314            ins = Bunk(lines, path)
1315            detect_encoding(ins.readline)
1316
1317    def test_open_error(self):
1318        # Issue #23840: open() must close the binary file on error
1319        m = BytesIO(b'#coding:xxx')
1320        with mock.patch('tokenize._builtin_open', return_value=m):
1321            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
1322        self.assertTrue(m.closed)
1323
1324
1325class TestTokenize(TestCase):
1326
1327    def test_tokenize(self):
1328        import tokenize as tokenize_module
1329        encoding = object()
1330        encoding_used = None
1331        def mock_detect_encoding(readline):
1332            return encoding, [b'first', b'second']
1333
1334        def mock__tokenize(readline, encoding):
1335            nonlocal encoding_used
1336            encoding_used = encoding
1337            out = []
1338            while True:
1339                next_line = readline()
1340                if next_line:
1341                    out.append(next_line)
1342                    continue
1343                return out
1344
1345        counter = 0
1346        def mock_readline():
1347            nonlocal counter
1348            counter += 1
1349            if counter == 5:
1350                return b''
1351            return str(counter).encode()
1352
1353        orig_detect_encoding = tokenize_module.detect_encoding
1354        orig__tokenize = tokenize_module._tokenize
1355        tokenize_module.detect_encoding = mock_detect_encoding
1356        tokenize_module._tokenize = mock__tokenize
1357        try:
1358            results = tokenize(mock_readline)
1359            self.assertEqual(list(results),
1360                             [b'first', b'second', b'1', b'2', b'3', b'4'])
1361        finally:
1362            tokenize_module.detect_encoding = orig_detect_encoding
1363            tokenize_module._tokenize = orig__tokenize
1364
1365        self.assertEqual(encoding_used, encoding)
1366
1367    def test_oneline_defs(self):
1368        buf = []
1369        for i in range(500):
1370            buf.append('def i{i}(): return {i}'.format(i=i))
1371        buf.append('OK')
1372        buf = '\n'.join(buf)
1373
        # Test that 500 consecutive, one-line defs are OK
1375        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
1376        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
1377                                                # [-2] is always NEWLINE
1378
1379    def assertExactTypeEqual(self, opstr, *optypes):
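        # Tokenize opstr and compare the exact type of every token: ENCODING
        # first, then the expected operator types, then NEWLINE and ENDMARKER.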
1380        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
1381        num_optypes = len(optypes)
1382        self.assertEqual(len(tokens), 3 + num_optypes)
1383        self.assertEqual(tok_name[tokens[0].exact_type],
1384                         tok_name[ENCODING])
1385        for i in range(num_optypes):
1386            self.assertEqual(tok_name[tokens[i + 1].exact_type],
1387                             tok_name[optypes[i]])
1388        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
1389                         tok_name[token.NEWLINE])
1390        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
1391                         tok_name[token.ENDMARKER])
1392
1393    def test_exact_type(self):
1394        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
1395        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
1396        self.assertExactTypeEqual(':', token.COLON)
1397        self.assertExactTypeEqual(',', token.COMMA)
1398        self.assertExactTypeEqual(';', token.SEMI)
1399        self.assertExactTypeEqual('+', token.PLUS)
1400        self.assertExactTypeEqual('-', token.MINUS)
1401        self.assertExactTypeEqual('*', token.STAR)
1402        self.assertExactTypeEqual('/', token.SLASH)
1403        self.assertExactTypeEqual('|', token.VBAR)
1404        self.assertExactTypeEqual('&', token.AMPER)
1405        self.assertExactTypeEqual('<', token.LESS)
1406        self.assertExactTypeEqual('>', token.GREATER)
1407        self.assertExactTypeEqual('=', token.EQUAL)
1408        self.assertExactTypeEqual('.', token.DOT)
1409        self.assertExactTypeEqual('%', token.PERCENT)
1410        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
1411        self.assertExactTypeEqual('==', token.EQEQUAL)
1412        self.assertExactTypeEqual('!=', token.NOTEQUAL)
1413        self.assertExactTypeEqual('<=', token.LESSEQUAL)
1414        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
1415        self.assertExactTypeEqual('~', token.TILDE)
1416        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
1417        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
1418        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
1419        self.assertExactTypeEqual('**', token.DOUBLESTAR)
1420        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
1421        self.assertExactTypeEqual('-=', token.MINEQUAL)
1422        self.assertExactTypeEqual('*=', token.STAREQUAL)
1423        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
1424        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
1425        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
1426        self.assertExactTypeEqual('|=', token.VBAREQUAL)
1427        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1429        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
1430        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
1431        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
1432        self.assertExactTypeEqual('//', token.DOUBLESLASH)
1433        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
1434        self.assertExactTypeEqual(':=', token.COLONEQUAL)
1435        self.assertExactTypeEqual('...', token.ELLIPSIS)
1436        self.assertExactTypeEqual('->', token.RARROW)
1437        self.assertExactTypeEqual('@', token.AT)
1438        self.assertExactTypeEqual('@=', token.ATEQUAL)
1439
1440        self.assertExactTypeEqual('a**2+b**2==c**2',
1441                                  NAME, token.DOUBLESTAR, NUMBER,
1442                                  token.PLUS,
1443                                  NAME, token.DOUBLESTAR, NUMBER,
1444                                  token.EQEQUAL,
1445                                  NAME, token.DOUBLESTAR, NUMBER)
1446        self.assertExactTypeEqual('{1, 2, 3}',
1447                                  token.LBRACE,
1448                                  token.NUMBER, token.COMMA,
1449                                  token.NUMBER, token.COMMA,
1450                                  token.NUMBER,
1451                                  token.RBRACE)
1452        self.assertExactTypeEqual('^(x & 0x1)',
1453                                  token.CIRCUMFLEX,
1454                                  token.LPAR,
1455                                  token.NAME, token.AMPER, token.NUMBER,
1456                                  token.RPAR)
1457
1458    def test_pathological_trailing_whitespace(self):
1459        # See http://bugs.python.org/issue16152
1460        self.assertExactTypeEqual('@          ', token.AT)
1461
1462    def test_comment_at_the_end_of_the_source_without_newline(self):
1463        # See http://bugs.python.org/issue44667
1464        source = 'b = 1\n\n#test'
1465        expected_tokens = [token.NAME, token.EQUAL, token.NUMBER, token.NEWLINE, token.NL, token.COMMENT]
1466
1467        tokens = list(tokenize(BytesIO(source.encode('utf-8')).readline))
1468        self.assertEqual(tok_name[tokens[0].exact_type], tok_name[ENCODING])
1469        for i in range(6):
1470            self.assertEqual(tok_name[tokens[i + 1].exact_type], tok_name[expected_tokens[i]])
1471        self.assertEqual(tok_name[tokens[-1].exact_type], tok_name[token.ENDMARKER])
1472
1473class UntokenizeTest(TestCase):
1474
1475    def test_bad_input_order(self):
        # raise if the start row precedes the previous end row
1477        u = Untokenizer()
1478        u.prev_row = 2
1479        u.prev_col = 2
1480        with self.assertRaises(ValueError) as cm:
1481            u.add_whitespace((1,3))
1482        self.assertEqual(cm.exception.args[0],
1483                'start (1,3) precedes previous end (2,2)')
        # raise if the start column precedes the previous end column in the same row
1485        self.assertRaises(ValueError, u.add_whitespace, (2,1))
1486
1487    def test_backslash_continuation(self):
1488        # The problem is that <whitespace>\<newline> leaves no token
1489        u = Untokenizer()
1490        u.prev_row = 1
1491        u.prev_col =  1
1492        u.tokens = []
1493        u.add_whitespace((2, 0))
1494        self.assertEqual(u.tokens, ['\\\n'])
1495        u.prev_row = 2
1496        u.add_whitespace((4, 4))
1497        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
1498        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n')
1499
1500    def test_iter_compat(self):
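        # compat() is the Untokenizer fallback for plain (type, string)
        # 2-tuples, which carry no position information.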
1501        u = Untokenizer()
1502        token = (NAME, 'Hello')
1503        tokens = [(ENCODING, 'utf-8'), token]
1504        u.compat(token, iter([]))
1505        self.assertEqual(u.tokens, ["Hello "])
1506        u = Untokenizer()
1507        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
1508        u = Untokenizer()
1509        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
1510        self.assertEqual(u.encoding, 'utf-8')
1511        self.assertEqual(untokenize(iter(tokens)), b'Hello ')
1512
1513
1514class TestRoundtrip(TestCase):
1515
1516    def check_roundtrip(self, f):
1517        """
1518        Test roundtrip for `untokenize`. `f` is an open file or a string.
        The source code in f is tokenized to both 5- and 2-tuples.
        Both sequences are converted back to source code via
        tokenize.untokenize(), and each result is tokenized again to 2-tuples.
        The test fails if any of the three 2-tuple tokenizations differ.
1523
1524        When untokenize bugs are fixed, untokenize with 5-tuples should
1525        reproduce code that does not contain a backslash continuation
1526        following spaces.  A proper test should test this.
1527        """
1528        # Get source code and original tokenizations
1529        if isinstance(f, str):
1530            code = f.encode('utf-8')
1531        else:
1532            code = f.read()
1533            f.close()
1534        readline = iter(code.splitlines(keepends=True)).__next__
1535        tokens5 = list(tokenize(readline))
1536        tokens2 = [tok[:2] for tok in tokens5]
1537        # Reproduce tokens2 from pairs
1538        bytes_from2 = untokenize(tokens2)
1539        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
1540        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
1541        self.assertEqual(tokens2_from2, tokens2)
1542        # Reproduce tokens2 from 5-tuples
1543        bytes_from5 = untokenize(tokens5)
1544        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
1545        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
1546        self.assertEqual(tokens2_from5, tokens2)
1547
1548    def test_roundtrip(self):
1549        # There are some standard formatting practices that are easy to get right.
1550
1551        self.check_roundtrip("if x == 1:\n"
1552                             "    print(x)\n")
1553        self.check_roundtrip("# This is a comment\n"
1554                             "# This also\n")
1555
        # Some people use different formatting conventions, which makes
        # untokenize a little trickier. Note that this test involves trailing
        # whitespace after the colon, which is easy to overlook in the source
        # strings below.
1560
1561        self.check_roundtrip("if x == 1 : \n"
1562                             "  print(x)\n")
1563        fn = support.findfile("tokenize_tests.txt")
1564        with open(fn, 'rb') as f:
1565            self.check_roundtrip(f)
1566        self.check_roundtrip("if x == 1:\n"
1567                             "    # A comment by itself.\n"
1568                             "    print(x) # Comment here, too.\n"
1569                             "    # Another comment.\n"
1570                             "after_if = True\n")
1571        self.check_roundtrip("if (x # The comments need to go in the right place\n"
1572                             "    == 1):\n"
1573                             "    print('x==1')\n")
1574        self.check_roundtrip("class Test: # A comment here\n"
1575                             "  # A comment with weird indent\n"
1576                             "  after_com = 5\n"
1577                             "  def x(m): return m*5 # a one liner\n"
1578                             "  def y(m): # A whitespace after the colon\n"
1579                             "     return y*4 # 3-space indent\n")
1580
1581        # Some error-handling code
1582        self.check_roundtrip("try: import somemodule\n"
1583                             "except ImportError: # comment\n"
1584                             "    print('Can not import' # comment2\n)"
1585                             "else:   print('Loaded')\n")
1586
1587    def test_continuation(self):
        # Implicit line continuation inside balanced brackets
1589        self.check_roundtrip("a = (3,4, \n"
1590                             "5,6)\n"
1591                             "y = [3, 4,\n"
1592                             "5]\n"
1593                             "z = {'a': 5,\n"
1594                             "'b':15, 'c':True}\n"
1595                             "x = len(y) + 5 - a[\n"
1596                             "3] - a[2]\n"
1597                             "+ len(z) - z[\n"
1598                             "'b']\n")
1599
1600    def test_backslash_continuation(self):
1601        # Backslash means line continuation, except for comments
1602        self.check_roundtrip("x=1+\\\n"
1603                             "1\n"
1604                             "# This is a comment\\\n"
1605                             "# This also\n")
1606        self.check_roundtrip("# Comment \\\n"
1607                             "x = 0")
1608
1609    def test_string_concatenation(self):
1610        # Two string literals on the same line
1611        self.check_roundtrip("'' ''")
1612
1613    def test_random_files(self):
        # Test roundtrip on random python modules.
        # Pass the '-ucpu' option to process the full directory.
1616
1617        import glob, random
1618        fn = support.findfile("tokenize_tests.txt")
1619        tempdir = os.path.dirname(fn) or os.curdir
1620        testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py"))
1621
        # Tokenize is broken on test_unicode_identifiers.py (formerly
        # test_pep3131.py) because regular expressions are broken on the
        # obscure unicode identifiers in it. *sigh*
        # With roundtrip extended to test the 5-tuple mode of untokenize,
        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.
1626
1627        testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
1628        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
            testfiles.remove(os.path.join(tempdir, "test_%s.py" % f))
1630
1631        if not support.is_resource_enabled("cpu"):
1632            testfiles = random.sample(testfiles, 10)
1633
1634        for testfile in testfiles:
1635            if support.verbose >= 2:
1636                print('tokenize', testfile)
1637            with open(testfile, 'rb') as f:
1638                with self.subTest(file=testfile):
1639                    self.check_roundtrip(f)
1640
1641
1642    def roundtrip(self, code):
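        # Tokenize the code and immediately untokenize it, returning the
        # regenerated source as a str.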
1643        if isinstance(code, str):
1644            code = code.encode('utf-8')
1645        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
1646
1647    def test_indentation_semantics_retained(self):
1648        """
1649        Ensure that although whitespace might be mutated in a roundtrip,
1650        the semantic meaning of the indentation remains consistent.
1651        """
1652        code = "if False:\n\tx=3\n\tx=3\n"
1653        codelines = self.roundtrip(code).split('\n')
1654        self.assertEqual(codelines[1], codelines[2])
1655        self.check_roundtrip(code)
1656
1657
1658class CTokenizeTest(TestCase):
1659    def check_tokenize(self, s, expected):
1660        # Format the tokens in s in a table format.
1661        # The ENDMARKER and final NEWLINE are omitted.
1662        with self.subTest(source=s):
1663            result = stringify_tokens_from_source(
1664                _generate_tokens_from_c_tokenizer(s), s
1665            )
1666            self.assertEqual(result, expected.rstrip().splitlines())
1667
1668    def test_int(self):
1669
1670        self.check_tokenize('0xff <= 255', """\
1671    NUMBER     '0xff'        (1, 0) (1, 4)
1672    LESSEQUAL  '<='          (1, 5) (1, 7)
1673    NUMBER     '255'         (1, 8) (1, 11)
1674    """)
1675
1676        self.check_tokenize('0b10 <= 255', """\
1677    NUMBER     '0b10'        (1, 0) (1, 4)
1678    LESSEQUAL  '<='          (1, 5) (1, 7)
1679    NUMBER     '255'         (1, 8) (1, 11)
1680    """)
1681
1682        self.check_tokenize('0o123 <= 0O123', """\
1683    NUMBER     '0o123'       (1, 0) (1, 5)
1684    LESSEQUAL  '<='          (1, 6) (1, 8)
1685    NUMBER     '0O123'       (1, 9) (1, 14)
1686    """)
1687
1688        self.check_tokenize('1234567 > ~0x15', """\
1689    NUMBER     '1234567'     (1, 0) (1, 7)
1690    GREATER    '>'           (1, 8) (1, 9)
1691    TILDE      '~'           (1, 10) (1, 11)
1692    NUMBER     '0x15'        (1, 11) (1, 15)
1693    """)
1694
1695        self.check_tokenize('2134568 != 1231515', """\
1696    NUMBER     '2134568'     (1, 0) (1, 7)
1697    NOTEQUAL   '!='          (1, 8) (1, 10)
1698    NUMBER     '1231515'     (1, 11) (1, 18)
1699    """)
1700
1701        self.check_tokenize('(-124561-1) & 200000000', """\
1702    LPAR       '('           (1, 0) (1, 1)
1703    MINUS      '-'           (1, 1) (1, 2)
1704    NUMBER     '124561'      (1, 2) (1, 8)
1705    MINUS      '-'           (1, 8) (1, 9)
1706    NUMBER     '1'           (1, 9) (1, 10)
1707    RPAR       ')'           (1, 10) (1, 11)
1708    AMPER      '&'           (1, 12) (1, 13)
1709    NUMBER     '200000000'   (1, 14) (1, 23)
1710    """)
1711
1712        self.check_tokenize('0xdeadbeef != -1', """\
1713    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
1714    NOTEQUAL   '!='          (1, 11) (1, 13)
1715    MINUS      '-'           (1, 14) (1, 15)
1716    NUMBER     '1'           (1, 15) (1, 16)
1717    """)
1718
1719        self.check_tokenize('0xdeadc0de & 12345', """\
1720    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
1721    AMPER      '&'           (1, 11) (1, 12)
1722    NUMBER     '12345'       (1, 13) (1, 18)
1723    """)
1724
1725        self.check_tokenize('0xFF & 0x15 | 1234', """\
1726    NUMBER     '0xFF'        (1, 0) (1, 4)
1727    AMPER      '&'           (1, 5) (1, 6)
1728    NUMBER     '0x15'        (1, 7) (1, 11)
1729    VBAR       '|'           (1, 12) (1, 13)
1730    NUMBER     '1234'        (1, 14) (1, 18)
1731    """)
1732
1733    def test_float(self):
1734
1735        self.check_tokenize('x = 3.14159', """\
1736    NAME       'x'           (1, 0) (1, 1)
1737    EQUAL      '='           (1, 2) (1, 3)
1738    NUMBER     '3.14159'     (1, 4) (1, 11)
1739    """)
1740
1741        self.check_tokenize('x = 314159.', """\
1742    NAME       'x'           (1, 0) (1, 1)
1743    EQUAL      '='           (1, 2) (1, 3)
1744    NUMBER     '314159.'     (1, 4) (1, 11)
1745    """)
1746
1747        self.check_tokenize('x = .314159', """\
1748    NAME       'x'           (1, 0) (1, 1)
1749    EQUAL      '='           (1, 2) (1, 3)
1750    NUMBER     '.314159'     (1, 4) (1, 11)
1751    """)
1752
1753        self.check_tokenize('x = 3e14159', """\
1754    NAME       'x'           (1, 0) (1, 1)
1755    EQUAL      '='           (1, 2) (1, 3)
1756    NUMBER     '3e14159'     (1, 4) (1, 11)
1757    """)
1758
1759        self.check_tokenize('x = 3E123', """\
1760    NAME       'x'           (1, 0) (1, 1)
1761    EQUAL      '='           (1, 2) (1, 3)
1762    NUMBER     '3E123'       (1, 4) (1, 9)
1763    """)
1764
1765        self.check_tokenize('x+y = 3e-1230', """\
1766    NAME       'x'           (1, 0) (1, 1)
1767    PLUS       '+'           (1, 1) (1, 2)
1768    NAME       'y'           (1, 2) (1, 3)
1769    EQUAL      '='           (1, 4) (1, 5)
1770    NUMBER     '3e-1230'     (1, 6) (1, 13)
1771    """)
1772
1773        self.check_tokenize('x = 3.14e159', """\
1774    NAME       'x'           (1, 0) (1, 1)
1775    EQUAL      '='           (1, 2) (1, 3)
1776    NUMBER     '3.14e159'    (1, 4) (1, 12)
1777    """)
1778
1779    def test_string(self):
1780
1781        self.check_tokenize('x = \'\'; y = ""', """\
1782    NAME       'x'           (1, 0) (1, 1)
1783    EQUAL      '='           (1, 2) (1, 3)
1784    STRING     "''"          (1, 4) (1, 6)
1785    SEMI       ';'           (1, 6) (1, 7)
1786    NAME       'y'           (1, 8) (1, 9)
1787    EQUAL      '='           (1, 10) (1, 11)
1788    STRING     '""'          (1, 12) (1, 14)
1789    """)
1790
1791        self.check_tokenize('x = \'"\'; y = "\'"', """\
1792    NAME       'x'           (1, 0) (1, 1)
1793    EQUAL      '='           (1, 2) (1, 3)
1794    STRING     '\\'"\\''       (1, 4) (1, 7)
1795    SEMI       ';'           (1, 7) (1, 8)
1796    NAME       'y'           (1, 9) (1, 10)
1797    EQUAL      '='           (1, 11) (1, 12)
1798    STRING     '"\\'"'        (1, 13) (1, 16)
1799    """)
1800
1801        self.check_tokenize('x = "doesn\'t "shrink", does it"', """\
1802    NAME       'x'           (1, 0) (1, 1)
1803    EQUAL      '='           (1, 2) (1, 3)
1804    STRING     '"doesn\\'t "' (1, 4) (1, 14)
1805    NAME       'shrink'      (1, 14) (1, 20)
1806    STRING     '", does it"' (1, 20) (1, 31)
1807    """)
1808
1809        self.check_tokenize("x = 'abc' + 'ABC'", """\
1810    NAME       'x'           (1, 0) (1, 1)
1811    EQUAL      '='           (1, 2) (1, 3)
1812    STRING     "'abc'"       (1, 4) (1, 9)
1813    PLUS       '+'           (1, 10) (1, 11)
1814    STRING     "'ABC'"       (1, 12) (1, 17)
1815    """)
1816
1817        self.check_tokenize('y = "ABC" + "ABC"', """\
1818    NAME       'y'           (1, 0) (1, 1)
1819    EQUAL      '='           (1, 2) (1, 3)
1820    STRING     '"ABC"'       (1, 4) (1, 9)
1821    PLUS       '+'           (1, 10) (1, 11)
1822    STRING     '"ABC"'       (1, 12) (1, 17)
1823    """)
1824
1825        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
1826    NAME       'x'           (1, 0) (1, 1)
1827    EQUAL      '='           (1, 2) (1, 3)
1828    STRING     "r'abc'"      (1, 4) (1, 10)
1829    PLUS       '+'           (1, 11) (1, 12)
1830    STRING     "r'ABC'"      (1, 13) (1, 19)
1831    PLUS       '+'           (1, 20) (1, 21)
1832    STRING     "R'ABC'"      (1, 22) (1, 28)
1833    PLUS       '+'           (1, 29) (1, 30)
1834    STRING     "R'ABC'"      (1, 31) (1, 37)
1835    """)
1836
1837        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
1838    NAME       'y'           (1, 0) (1, 1)
1839    EQUAL      '='           (1, 2) (1, 3)
1840    STRING     'r"abc"'      (1, 4) (1, 10)
1841    PLUS       '+'           (1, 11) (1, 12)
1842    STRING     'r"ABC"'      (1, 13) (1, 19)
1843    PLUS       '+'           (1, 20) (1, 21)
1844    STRING     'R"ABC"'      (1, 22) (1, 28)
1845    PLUS       '+'           (1, 29) (1, 30)
1846    STRING     'R"ABC"'      (1, 31) (1, 37)
1847    """)
1848
1849        self.check_tokenize("u'abc' + U'abc'", """\
1850    STRING     "u'abc'"      (1, 0) (1, 6)
1851    PLUS       '+'           (1, 7) (1, 8)
1852    STRING     "U'abc'"      (1, 9) (1, 15)
1853    """)
1854
1855        self.check_tokenize('u"abc" + U"abc"', """\
1856    STRING     'u"abc"'      (1, 0) (1, 6)
1857    PLUS       '+'           (1, 7) (1, 8)
1858    STRING     'U"abc"'      (1, 9) (1, 15)
1859    """)
1860
1861        self.check_tokenize("b'abc' + B'abc'", """\
1862    STRING     "b'abc'"      (1, 0) (1, 6)
1863    PLUS       '+'           (1, 7) (1, 8)
1864    STRING     "B'abc'"      (1, 9) (1, 15)
1865    """)
1866
1867        self.check_tokenize('b"abc" + B"abc"', """\
1868    STRING     'b"abc"'      (1, 0) (1, 6)
1869    PLUS       '+'           (1, 7) (1, 8)
1870    STRING     'B"abc"'      (1, 9) (1, 15)
1871    """)
1872
1873        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
1874    STRING     "br'abc'"     (1, 0) (1, 7)
1875    PLUS       '+'           (1, 8) (1, 9)
1876    STRING     "bR'abc'"     (1, 10) (1, 17)
1877    PLUS       '+'           (1, 18) (1, 19)
1878    STRING     "Br'abc'"     (1, 20) (1, 27)
1879    PLUS       '+'           (1, 28) (1, 29)
1880    STRING     "BR'abc'"     (1, 30) (1, 37)
1881    """)
1882
1883        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
1884    STRING     'br"abc"'     (1, 0) (1, 7)
1885    PLUS       '+'           (1, 8) (1, 9)
1886    STRING     'bR"abc"'     (1, 10) (1, 17)
1887    PLUS       '+'           (1, 18) (1, 19)
1888    STRING     'Br"abc"'     (1, 20) (1, 27)
1889    PLUS       '+'           (1, 28) (1, 29)
1890    STRING     'BR"abc"'     (1, 30) (1, 37)
1891    """)
1892
1893        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
1894    STRING     "rb'abc'"     (1, 0) (1, 7)
1895    PLUS       '+'           (1, 8) (1, 9)
1896    STRING     "rB'abc'"     (1, 10) (1, 17)
1897    PLUS       '+'           (1, 18) (1, 19)
1898    STRING     "Rb'abc'"     (1, 20) (1, 27)
1899    PLUS       '+'           (1, 28) (1, 29)
1900    STRING     "RB'abc'"     (1, 30) (1, 37)
1901    """)
1902
1903        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
1904    STRING     'rb"abc"'     (1, 0) (1, 7)
1905    PLUS       '+'           (1, 8) (1, 9)
1906    STRING     'rB"abc"'     (1, 10) (1, 17)
1907    PLUS       '+'           (1, 18) (1, 19)
1908    STRING     'Rb"abc"'     (1, 20) (1, 27)
1909    PLUS       '+'           (1, 28) (1, 29)
1910    STRING     'RB"abc"'     (1, 30) (1, 37)
1911    """)
1912
1913        self.check_tokenize('"a\\\nde\\\nfg"', """\
1914    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
1915    """)
1916
1917        self.check_tokenize('u"a\\\nde"', """\
1918    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
1919    """)
1920
1921        self.check_tokenize('rb"a\\\nd"', """\
1922    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
1923    """)
1924
1925        self.check_tokenize(r'"""a\
1926b"""', """\
1927    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
1928    """)
1929        self.check_tokenize(r'u"""a\
1930b"""', """\
1931    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
1932    """)
1933        self.check_tokenize(r'rb"""a\
1934b\
1935c"""', """\
1936    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
1937    """)
1938
1939        self.check_tokenize('f"abc"', """\
1940    STRING     'f"abc"'      (1, 0) (1, 6)
1941    """)
1942
1943        self.check_tokenize('fR"a{b}c"', """\
1944    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
1945    """)
1946
1947        self.check_tokenize('f"""abc"""', """\
1948    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
1949    """)
1950
1951        self.check_tokenize(r'f"abc\
1952def"', """\
1953    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
1954    """)
1955
1956        self.check_tokenize(r'Rf"abc\
1957def"', """\
1958    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
1959    """)
1960
1961    def test_function(self):
1962
1963        self.check_tokenize('def d22(a, b, c=2, d=2, *k): pass', """\
1964    NAME       'def'         (1, 0) (1, 3)
1965    NAME       'd22'         (1, 4) (1, 7)
1966    LPAR       '('           (1, 7) (1, 8)
1967    NAME       'a'           (1, 8) (1, 9)
1968    COMMA      ','           (1, 9) (1, 10)
1969    NAME       'b'           (1, 11) (1, 12)
1970    COMMA      ','           (1, 12) (1, 13)
1971    NAME       'c'           (1, 14) (1, 15)
1972    EQUAL      '='           (1, 15) (1, 16)
1973    NUMBER     '2'           (1, 16) (1, 17)
1974    COMMA      ','           (1, 17) (1, 18)
1975    NAME       'd'           (1, 19) (1, 20)
1976    EQUAL      '='           (1, 20) (1, 21)
1977    NUMBER     '2'           (1, 21) (1, 22)
1978    COMMA      ','           (1, 22) (1, 23)
1979    STAR       '*'           (1, 24) (1, 25)
1980    NAME       'k'           (1, 25) (1, 26)
1981    RPAR       ')'           (1, 26) (1, 27)
1982    COLON      ':'           (1, 27) (1, 28)
1983    NAME       'pass'        (1, 29) (1, 33)
1984    """)
1985
1986        self.check_tokenize('def d01v_(a=1, *k, **w): pass', """\
1987    NAME       'def'         (1, 0) (1, 3)
1988    NAME       'd01v_'       (1, 4) (1, 9)
1989    LPAR       '('           (1, 9) (1, 10)
1990    NAME       'a'           (1, 10) (1, 11)
1991    EQUAL      '='           (1, 11) (1, 12)
1992    NUMBER     '1'           (1, 12) (1, 13)
1993    COMMA      ','           (1, 13) (1, 14)
1994    STAR       '*'           (1, 15) (1, 16)
1995    NAME       'k'           (1, 16) (1, 17)
1996    COMMA      ','           (1, 17) (1, 18)
1997    DOUBLESTAR '**'          (1, 19) (1, 21)
1998    NAME       'w'           (1, 21) (1, 22)
1999    RPAR       ')'           (1, 22) (1, 23)
2000    COLON      ':'           (1, 23) (1, 24)
2001    NAME       'pass'        (1, 25) (1, 29)
2002    """)
2003
2004        self.check_tokenize('def d23(a: str, b: int=3) -> int: pass', """\
2005    NAME       'def'         (1, 0) (1, 3)
2006    NAME       'd23'         (1, 4) (1, 7)
2007    LPAR       '('           (1, 7) (1, 8)
2008    NAME       'a'           (1, 8) (1, 9)
2009    COLON      ':'           (1, 9) (1, 10)
2010    NAME       'str'         (1, 11) (1, 14)
2011    COMMA      ','           (1, 14) (1, 15)
2012    NAME       'b'           (1, 16) (1, 17)
2013    COLON      ':'           (1, 17) (1, 18)
2014    NAME       'int'         (1, 19) (1, 22)
2015    EQUAL      '='           (1, 22) (1, 23)
2016    NUMBER     '3'           (1, 23) (1, 24)
2017    RPAR       ')'           (1, 24) (1, 25)
2018    RARROW     '->'          (1, 26) (1, 28)
2019    NAME       'int'         (1, 29) (1, 32)
2020    COLON      ':'           (1, 32) (1, 33)
2021    NAME       'pass'        (1, 34) (1, 38)
2022    """)
2023
2024    def test_comparison(self):
2025
2026        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
2027                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
2028    NAME       'if'          (1, 0) (1, 2)
2029    NUMBER     '1'           (1, 3) (1, 4)
2030    LESS       '<'           (1, 5) (1, 6)
2031    NUMBER     '1'           (1, 7) (1, 8)
2032    GREATER    '>'           (1, 9) (1, 10)
2033    NUMBER     '1'           (1, 11) (1, 12)
2034    EQEQUAL    '=='          (1, 13) (1, 15)
2035    NUMBER     '1'           (1, 16) (1, 17)
2036    GREATEREQUAL '>='          (1, 18) (1, 20)
2037    NUMBER     '5'           (1, 21) (1, 22)
2038    LESSEQUAL  '<='          (1, 23) (1, 25)
2039    NUMBER     '0x15'        (1, 26) (1, 30)
2040    LESSEQUAL  '<='          (1, 31) (1, 33)
2041    NUMBER     '0x12'        (1, 34) (1, 38)
2042    NOTEQUAL   '!='          (1, 39) (1, 41)
2043    NUMBER     '1'           (1, 42) (1, 43)
2044    NAME       'and'         (1, 44) (1, 47)
2045    NUMBER     '5'           (1, 48) (1, 49)
2046    NAME       'in'          (1, 50) (1, 52)
2047    NUMBER     '1'           (1, 53) (1, 54)
2048    NAME       'not'         (1, 55) (1, 58)
2049    NAME       'in'          (1, 59) (1, 61)
2050    NUMBER     '1'           (1, 62) (1, 63)
2051    NAME       'is'          (1, 64) (1, 66)
2052    NUMBER     '1'           (1, 67) (1, 68)
2053    NAME       'or'          (1, 69) (1, 71)
2054    NUMBER     '5'           (1, 72) (1, 73)
2055    NAME       'is'          (1, 74) (1, 76)
2056    NAME       'not'         (1, 77) (1, 80)
2057    NUMBER     '1'           (1, 81) (1, 82)
2058    COLON      ':'           (1, 82) (1, 83)
2059    NAME       'pass'        (1, 84) (1, 88)
2060    """)
2061
2062    def test_additive(self):
2063
2064        self.check_tokenize('x = 1 - y + 15 - 1 + 0x124 + z + a[5]', """\
2065    NAME       'x'           (1, 0) (1, 1)
2066    EQUAL      '='           (1, 2) (1, 3)
2067    NUMBER     '1'           (1, 4) (1, 5)
2068    MINUS      '-'           (1, 6) (1, 7)
2069    NAME       'y'           (1, 8) (1, 9)
2070    PLUS       '+'           (1, 10) (1, 11)
2071    NUMBER     '15'          (1, 12) (1, 14)
2072    MINUS      '-'           (1, 15) (1, 16)
2073    NUMBER     '1'           (1, 17) (1, 18)
2074    PLUS       '+'           (1, 19) (1, 20)
2075    NUMBER     '0x124'       (1, 21) (1, 26)
2076    PLUS       '+'           (1, 27) (1, 28)
2077    NAME       'z'           (1, 29) (1, 30)
2078    PLUS       '+'           (1, 31) (1, 32)
2079    NAME       'a'           (1, 33) (1, 34)
2080    LSQB       '['           (1, 34) (1, 35)
2081    NUMBER     '5'           (1, 35) (1, 36)
2082    RSQB       ']'           (1, 36) (1, 37)
2083    """)
2084
2085    def test_multiplicative(self):
2086
2087        self.check_tokenize('x = 1//1*1/5*12%0x12@42', """\
2088    NAME       'x'           (1, 0) (1, 1)
2089    EQUAL      '='           (1, 2) (1, 3)
2090    NUMBER     '1'           (1, 4) (1, 5)
2091    DOUBLESLASH '//'          (1, 5) (1, 7)
2092    NUMBER     '1'           (1, 7) (1, 8)
2093    STAR       '*'           (1, 8) (1, 9)
2094    NUMBER     '1'           (1, 9) (1, 10)
2095    SLASH      '/'           (1, 10) (1, 11)
2096    NUMBER     '5'           (1, 11) (1, 12)
2097    STAR       '*'           (1, 12) (1, 13)
2098    NUMBER     '12'          (1, 13) (1, 15)
2099    PERCENT    '%'           (1, 15) (1, 16)
2100    NUMBER     '0x12'        (1, 16) (1, 20)
2101    AT         '@'           (1, 20) (1, 21)
2102    NUMBER     '42'          (1, 21) (1, 23)
2103    """)
2104
2105    def test_unary(self):
2106
2107        self.check_tokenize('~1 ^ 1 & 1 |1 ^ -1', """\
2108    TILDE      '~'           (1, 0) (1, 1)
2109    NUMBER     '1'           (1, 1) (1, 2)
2110    CIRCUMFLEX '^'           (1, 3) (1, 4)
2111    NUMBER     '1'           (1, 5) (1, 6)
2112    AMPER      '&'           (1, 7) (1, 8)
2113    NUMBER     '1'           (1, 9) (1, 10)
2114    VBAR       '|'           (1, 11) (1, 12)
2115    NUMBER     '1'           (1, 12) (1, 13)
2116    CIRCUMFLEX '^'           (1, 14) (1, 15)
2117    MINUS      '-'           (1, 16) (1, 17)
2118    NUMBER     '1'           (1, 17) (1, 18)
2119    """)
2120
2121        self.check_tokenize('-1*1/1+1*1//1 - ---1**1', """\
2122    MINUS      '-'           (1, 0) (1, 1)
2123    NUMBER     '1'           (1, 1) (1, 2)
2124    STAR       '*'           (1, 2) (1, 3)
2125    NUMBER     '1'           (1, 3) (1, 4)
2126    SLASH      '/'           (1, 4) (1, 5)
2127    NUMBER     '1'           (1, 5) (1, 6)
2128    PLUS       '+'           (1, 6) (1, 7)
2129    NUMBER     '1'           (1, 7) (1, 8)
2130    STAR       '*'           (1, 8) (1, 9)
2131    NUMBER     '1'           (1, 9) (1, 10)
2132    DOUBLESLASH '//'          (1, 10) (1, 12)
2133    NUMBER     '1'           (1, 12) (1, 13)
2134    MINUS      '-'           (1, 14) (1, 15)
2135    MINUS      '-'           (1, 16) (1, 17)
2136    MINUS      '-'           (1, 17) (1, 18)
2137    MINUS      '-'           (1, 18) (1, 19)
2138    NUMBER     '1'           (1, 19) (1, 20)
2139    DOUBLESTAR '**'          (1, 20) (1, 22)
2140    NUMBER     '1'           (1, 22) (1, 23)
2141    """)
2142
2143    def test_selector(self):
2144
2145        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
2146    NAME       'import'      (1, 0) (1, 6)
2147    NAME       'sys'         (1, 7) (1, 10)
2148    COMMA      ','           (1, 10) (1, 11)
2149    NAME       'time'        (1, 12) (1, 16)
2150    NEWLINE    ''            (1, 16) (1, 16)
2151    NAME       'x'           (2, 0) (2, 1)
2152    EQUAL      '='           (2, 2) (2, 3)
2153    NAME       'sys'         (2, 4) (2, 7)
2154    DOT        '.'           (2, 7) (2, 8)
2155    NAME       'modules'     (2, 8) (2, 15)
2156    LSQB       '['           (2, 15) (2, 16)
2157    STRING     "'time'"      (2, 16) (2, 22)
2158    RSQB       ']'           (2, 22) (2, 23)
2159    DOT        '.'           (2, 23) (2, 24)
2160    NAME       'time'        (2, 24) (2, 28)
2161    LPAR       '('           (2, 28) (2, 29)
2162    RPAR       ')'           (2, 29) (2, 30)
2163    """)
2164
2165    def test_method(self):
2166
2167        self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
2168    AT         '@'           (1, 0) (1, 1)
2169    NAME       'staticmethod' (1, 1) (1, 13)
2170    NEWLINE    ''            (1, 13) (1, 13)
2171    NAME       'def'         (2, 0) (2, 3)
2172    NAME       'foo'         (2, 4) (2, 7)
2173    LPAR       '('           (2, 7) (2, 8)
2174    NAME       'x'           (2, 8) (2, 9)
2175    COMMA      ','           (2, 9) (2, 10)
2176    NAME       'y'           (2, 10) (2, 11)
2177    RPAR       ')'           (2, 11) (2, 12)
2178    COLON      ':'           (2, 12) (2, 13)
2179    NAME       'pass'        (2, 14) (2, 18)
2180    """)
2181
2182    def test_tabs(self):
2183
2184        self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
2185    AT         '@'           (1, 0) (1, 1)
2186    NAME       'staticmethod' (1, 1) (1, 13)
2187    NEWLINE    ''            (1, 13) (1, 13)
2188    NAME       'def'         (2, 0) (2, 3)
2189    NAME       'foo'         (2, 4) (2, 7)
2190    LPAR       '('           (2, 7) (2, 8)
2191    NAME       'x'           (2, 8) (2, 9)
2192    COMMA      ','           (2, 9) (2, 10)
2193    NAME       'y'           (2, 10) (2, 11)
2194    RPAR       ')'           (2, 11) (2, 12)
2195    COLON      ':'           (2, 12) (2, 13)
2196    NAME       'pass'        (2, 14) (2, 18)
2197    """)
2198
2199    def test_async(self):
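        # In this internal tokenizer mode 'async' and 'await' are reported
        # with the dedicated ASYNC/AWAIT exact types, as the tables below show.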
2200
2201        self.check_tokenize('async = 1', """\
2202    ASYNC      'async'       (1, 0) (1, 5)
2203    EQUAL      '='           (1, 6) (1, 7)
2204    NUMBER     '1'           (1, 8) (1, 9)
2205    """)
2206
2207        self.check_tokenize('a = (async = 1)', """\
2208    NAME       'a'           (1, 0) (1, 1)
2209    EQUAL      '='           (1, 2) (1, 3)
2210    LPAR       '('           (1, 4) (1, 5)
2211    ASYNC      'async'       (1, 5) (1, 10)
2212    EQUAL      '='           (1, 11) (1, 12)
2213    NUMBER     '1'           (1, 13) (1, 14)
2214    RPAR       ')'           (1, 14) (1, 15)
2215    """)
2216
2217        self.check_tokenize('async()', """\
2218    ASYNC      'async'       (1, 0) (1, 5)
2219    LPAR       '('           (1, 5) (1, 6)
2220    RPAR       ')'           (1, 6) (1, 7)
2221    """)
2222
2223        self.check_tokenize('class async(Bar):pass', """\
2224    NAME       'class'       (1, 0) (1, 5)
2225    ASYNC      'async'       (1, 6) (1, 11)
2226    LPAR       '('           (1, 11) (1, 12)
2227    NAME       'Bar'         (1, 12) (1, 15)
2228    RPAR       ')'           (1, 15) (1, 16)
2229    COLON      ':'           (1, 16) (1, 17)
2230    NAME       'pass'        (1, 17) (1, 21)
2231    """)
2232
2233        self.check_tokenize('class async:pass', """\
2234    NAME       'class'       (1, 0) (1, 5)
2235    ASYNC      'async'       (1, 6) (1, 11)
2236    COLON      ':'           (1, 11) (1, 12)
2237    NAME       'pass'        (1, 12) (1, 16)
2238    """)
2239
2240        self.check_tokenize('await = 1', """\
2241    AWAIT      'await'       (1, 0) (1, 5)
2242    EQUAL      '='           (1, 6) (1, 7)
2243    NUMBER     '1'           (1, 8) (1, 9)
2244    """)
2245
2246        self.check_tokenize('foo.async', """\
2247    NAME       'foo'         (1, 0) (1, 3)
2248    DOT        '.'           (1, 3) (1, 4)
2249    ASYNC      'async'       (1, 4) (1, 9)
2250    """)
2251
2252        self.check_tokenize('async for a in b: pass', """\
2253    ASYNC      'async'       (1, 0) (1, 5)
2254    NAME       'for'         (1, 6) (1, 9)
2255    NAME       'a'           (1, 10) (1, 11)
2256    NAME       'in'          (1, 12) (1, 14)
2257    NAME       'b'           (1, 15) (1, 16)
2258    COLON      ':'           (1, 16) (1, 17)
2259    NAME       'pass'        (1, 18) (1, 22)
2260    """)
2261
2262        self.check_tokenize('async with a as b: pass', """\
2263    ASYNC      'async'       (1, 0) (1, 5)
2264    NAME       'with'        (1, 6) (1, 10)
2265    NAME       'a'           (1, 11) (1, 12)
2266    NAME       'as'          (1, 13) (1, 15)
2267    NAME       'b'           (1, 16) (1, 17)
2268    COLON      ':'           (1, 17) (1, 18)
2269    NAME       'pass'        (1, 19) (1, 23)
2270    """)
2271
2272        self.check_tokenize('async.foo', """\
2273    ASYNC      'async'       (1, 0) (1, 5)
2274    DOT        '.'           (1, 5) (1, 6)
2275    NAME       'foo'         (1, 6) (1, 9)
2276    """)
2277
2278        self.check_tokenize('async', """\
2279    ASYNC      'async'       (1, 0) (1, 5)
2280    """)
2281
2282        self.check_tokenize('async\n#comment\nawait', """\
2283    ASYNC      'async'       (1, 0) (1, 5)
2284    NEWLINE    ''            (1, 5) (1, 5)
2285    AWAIT      'await'       (3, 0) (3, 5)
2286    """)
2287
2288        self.check_tokenize('async\n...\nawait', """\
2289    ASYNC      'async'       (1, 0) (1, 5)
2290    NEWLINE    ''            (1, 5) (1, 5)
2291    ELLIPSIS   '...'         (2, 0) (2, 3)
2292    NEWLINE    ''            (2, 3) (2, 3)
2293    AWAIT      'await'       (3, 0) (3, 5)
2294    """)
2295
2296        self.check_tokenize('async\nawait', """\
2297    ASYNC      'async'       (1, 0) (1, 5)
2298    NEWLINE    ''            (1, 5) (1, 5)
2299    AWAIT      'await'       (2, 0) (2, 5)
2300    """)
2301
2302        self.check_tokenize('foo.async + 1', """\
2303    NAME       'foo'         (1, 0) (1, 3)
2304    DOT        '.'           (1, 3) (1, 4)
2305    ASYNC      'async'       (1, 4) (1, 9)
2306    PLUS       '+'           (1, 10) (1, 11)
2307    NUMBER     '1'           (1, 12) (1, 13)
2308    """)
2309
2310        self.check_tokenize('async def foo(): pass', """\
2311    ASYNC      'async'       (1, 0) (1, 5)
2312    NAME       'def'         (1, 6) (1, 9)
2313    NAME       'foo'         (1, 10) (1, 13)
2314    LPAR       '('           (1, 13) (1, 14)
2315    RPAR       ')'           (1, 14) (1, 15)
2316    COLON      ':'           (1, 15) (1, 16)
2317    NAME       'pass'        (1, 17) (1, 21)
2318    """)
2319
2320        self.check_tokenize('''\
2321async def foo():
2322  def foo(await):
2323    await = 1
2324  if 1:
2325    await
2326async += 1
2327''', """\
2328    ASYNC      'async'       (1, 0) (1, 5)
2329    NAME       'def'         (1, 6) (1, 9)
2330    NAME       'foo'         (1, 10) (1, 13)
2331    LPAR       '('           (1, 13) (1, 14)
2332    RPAR       ')'           (1, 14) (1, 15)
2333    COLON      ':'           (1, 15) (1, 16)
2334    NEWLINE    ''            (1, 16) (1, 16)
2335    INDENT     ''            (2, -1) (2, -1)
2336    NAME       'def'         (2, 2) (2, 5)
2337    NAME       'foo'         (2, 6) (2, 9)
2338    LPAR       '('           (2, 9) (2, 10)
2339    AWAIT      'await'       (2, 10) (2, 15)
2340    RPAR       ')'           (2, 15) (2, 16)
2341    COLON      ':'           (2, 16) (2, 17)
2342    NEWLINE    ''            (2, 17) (2, 17)
2343    INDENT     ''            (3, -1) (3, -1)
2344    AWAIT      'await'       (3, 4) (3, 9)
2345    EQUAL      '='           (3, 10) (3, 11)
2346    NUMBER     '1'           (3, 12) (3, 13)
2347    NEWLINE    ''            (3, 13) (3, 13)
2348    DEDENT     ''            (4, -1) (4, -1)
2349    NAME       'if'          (4, 2) (4, 4)
2350    NUMBER     '1'           (4, 5) (4, 6)
2351    COLON      ':'           (4, 6) (4, 7)
2352    NEWLINE    ''            (4, 7) (4, 7)
2353    INDENT     ''            (5, -1) (5, -1)
2354    AWAIT      'await'       (5, 4) (5, 9)
2355    NEWLINE    ''            (5, 9) (5, 9)
2356    DEDENT     ''            (6, -1) (6, -1)
2357    DEDENT     ''            (6, -1) (6, -1)
2358    ASYNC      'async'       (6, 0) (6, 5)
2359    PLUSEQUAL  '+='          (6, 6) (6, 8)
2360    NUMBER     '1'           (6, 9) (6, 10)
2361    NEWLINE    ''            (6, 10) (6, 10)
2362    """)
2363
2364        self.check_tokenize('async def foo():\n  async for i in 1: pass', """\
2365    ASYNC      'async'       (1, 0) (1, 5)
2366    NAME       'def'         (1, 6) (1, 9)
2367    NAME       'foo'         (1, 10) (1, 13)
2368    LPAR       '('           (1, 13) (1, 14)
2369    RPAR       ')'           (1, 14) (1, 15)
2370    COLON      ':'           (1, 15) (1, 16)
2371    NEWLINE    ''            (1, 16) (1, 16)
2372    INDENT     ''            (2, -1) (2, -1)
2373    ASYNC      'async'       (2, 2) (2, 7)
2374    NAME       'for'         (2, 8) (2, 11)
2375    NAME       'i'           (2, 12) (2, 13)
2376    NAME       'in'          (2, 14) (2, 16)
2377    NUMBER     '1'           (2, 17) (2, 18)
2378    COLON      ':'           (2, 18) (2, 19)
2379    NAME       'pass'        (2, 20) (2, 24)
2380    DEDENT     ''            (2, -1) (2, -1)
2381    """)
2382
2383        self.check_tokenize('async def foo(async): await', """\
2384    ASYNC      'async'       (1, 0) (1, 5)
2385    NAME       'def'         (1, 6) (1, 9)
2386    NAME       'foo'         (1, 10) (1, 13)
2387    LPAR       '('           (1, 13) (1, 14)
2388    ASYNC      'async'       (1, 14) (1, 19)
2389    RPAR       ')'           (1, 19) (1, 20)
2390    COLON      ':'           (1, 20) (1, 21)
2391    AWAIT      'await'       (1, 22) (1, 27)
2392    """)
2393
2394        self.check_tokenize('''\
2395def f():
2396
2397  def baz(): pass
2398  async def bar(): pass
2399
2400  await = 2''', """\
2401    NAME       'def'         (1, 0) (1, 3)
2402    NAME       'f'           (1, 4) (1, 5)
2403    LPAR       '('           (1, 5) (1, 6)
2404    RPAR       ')'           (1, 6) (1, 7)
2405    COLON      ':'           (1, 7) (1, 8)
2406    NEWLINE    ''            (1, 8) (1, 8)
2407    INDENT     ''            (3, -1) (3, -1)
2408    NAME       'def'         (3, 2) (3, 5)
2409    NAME       'baz'         (3, 6) (3, 9)
2410    LPAR       '('           (3, 9) (3, 10)
2411    RPAR       ')'           (3, 10) (3, 11)
2412    COLON      ':'           (3, 11) (3, 12)
2413    NAME       'pass'        (3, 13) (3, 17)
2414    NEWLINE    ''            (3, 17) (3, 17)
2415    ASYNC      'async'       (4, 2) (4, 7)
2416    NAME       'def'         (4, 8) (4, 11)
2417    NAME       'bar'         (4, 12) (4, 15)
2418    LPAR       '('           (4, 15) (4, 16)
2419    RPAR       ')'           (4, 16) (4, 17)
2420    COLON      ':'           (4, 17) (4, 18)
2421    NAME       'pass'        (4, 19) (4, 23)
2422    NEWLINE    ''            (4, 23) (4, 23)
2423    AWAIT      'await'       (6, 2) (6, 7)
2424    EQUAL      '='           (6, 8) (6, 9)
2425    NUMBER     '2'           (6, 10) (6, 11)
2426    DEDENT     ''            (6, -1) (6, -1)
2427    """)
2428
2429        self.check_tokenize('''\
2430async def f():
2431
2432  def baz(): pass
2433  async def bar(): pass
2434
2435  await = 2''', """\
2436    ASYNC      'async'       (1, 0) (1, 5)
2437    NAME       'def'         (1, 6) (1, 9)
2438    NAME       'f'           (1, 10) (1, 11)
2439    LPAR       '('           (1, 11) (1, 12)
2440    RPAR       ')'           (1, 12) (1, 13)
2441    COLON      ':'           (1, 13) (1, 14)
2442    NEWLINE    ''            (1, 14) (1, 14)
2443    INDENT     ''            (3, -1) (3, -1)
2444    NAME       'def'         (3, 2) (3, 5)
2445    NAME       'baz'         (3, 6) (3, 9)
2446    LPAR       '('           (3, 9) (3, 10)
2447    RPAR       ')'           (3, 10) (3, 11)
2448    COLON      ':'           (3, 11) (3, 12)
2449    NAME       'pass'        (3, 13) (3, 17)
2450    NEWLINE    ''            (3, 17) (3, 17)
2451    ASYNC      'async'       (4, 2) (4, 7)
2452    NAME       'def'         (4, 8) (4, 11)
2453    NAME       'bar'         (4, 12) (4, 15)
2454    LPAR       '('           (4, 15) (4, 16)
2455    RPAR       ')'           (4, 16) (4, 17)
2456    COLON      ':'           (4, 17) (4, 18)
2457    NAME       'pass'        (4, 19) (4, 23)
2458    NEWLINE    ''            (4, 23) (4, 23)
2459    AWAIT      'await'       (6, 2) (6, 7)
2460    EQUAL      '='           (6, 8) (6, 9)
2461    NUMBER     '2'           (6, 10) (6, 11)
2462    DEDENT     ''            (6, -1) (6, -1)
2463    """)
2464
2465    def test_unicode(self):
2466
2467        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
2468    NAME       'Örter'       (1, 0) (1, 6)
2469    EQUAL      '='           (1, 7) (1, 8)
2470    STRING     "u'places'"   (1, 9) (1, 18)
2471    NEWLINE    ''            (1, 18) (1, 18)
2472    NAME       'grün'        (2, 0) (2, 5)
2473    EQUAL      '='           (2, 6) (2, 7)
2474    STRING     "U'green'"    (2, 8) (2, 16)
2475    """)
2476
2477    def test_invalid_syntax(self):
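        # The C tokenizer reports malformed input as SyntaxError: mismatched
        # brackets, bad numeric literals, unterminated strings, stray
        # non-ASCII characters, and overly deep nesting.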
2478        def get_tokens(string):
2479            return list(_generate_tokens_from_c_tokenizer(string))
2480
2481        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
2482        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
2483        self.assertRaises(SyntaxError, get_tokens, "{1+2]")
2484
2485        self.assertRaises(SyntaxError, get_tokens, "1_")
2486        self.assertRaises(SyntaxError, get_tokens, "1.2_")
2487        self.assertRaises(SyntaxError, get_tokens, "1e2_")
2488        self.assertRaises(SyntaxError, get_tokens, "1e+")
2489
2490        self.assertRaises(SyntaxError, get_tokens, "\xa0")
2491        self.assertRaises(SyntaxError, get_tokens, "€")
2492
2493        self.assertRaises(SyntaxError, get_tokens, "0b12")
2494        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
2495        self.assertRaises(SyntaxError, get_tokens, "0b2")
2496        self.assertRaises(SyntaxError, get_tokens, "0b1_")
2497        self.assertRaises(SyntaxError, get_tokens, "0b")
2498        self.assertRaises(SyntaxError, get_tokens, "0o18")
2499        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
2500        self.assertRaises(SyntaxError, get_tokens, "0o8")
2501        self.assertRaises(SyntaxError, get_tokens, "0o1_")
2502        self.assertRaises(SyntaxError, get_tokens, "0o")
2503        self.assertRaises(SyntaxError, get_tokens, "0x1_")
2504        self.assertRaises(SyntaxError, get_tokens, "0x")
2505        self.assertRaises(SyntaxError, get_tokens, "1_")
2506        self.assertRaises(SyntaxError, get_tokens, "012")
2507        self.assertRaises(SyntaxError, get_tokens, "1.2_")
2508        self.assertRaises(SyntaxError, get_tokens, "1e2_")
2509        self.assertRaises(SyntaxError, get_tokens, "1e+")
2510
2511        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
2512        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
2513
2514        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
2515        self.assertRaises(SyntaxError, get_tokens, "]")
2516
2517    def test_max_indent(self):
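        # 100 is assumed here to mirror the C tokenizer's maximum indentation
        # depth: one level below the limit still tokenizes and compiles, while
        # reaching it makes compile() raise IndentationError.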
2518        MAXINDENT = 100
2519
2520        def generate_source(indents):
2521            source = ''.join(('  ' * x) + 'if True:\n' for x in range(indents))
2522            source += '  ' * indents + 'pass\n'
2523            return source
2524
2525        valid = generate_source(MAXINDENT - 1)
2526        tokens = list(_generate_tokens_from_c_tokenizer(valid))
2527        self.assertEqual(tokens[-1].type, DEDENT)
2528        compile(valid, "<string>", "exec")
2529
2530        invalid = generate_source(MAXINDENT)
2531        tokens = list(_generate_tokens_from_c_tokenizer(invalid))
2532        self.assertEqual(tokens[-1].type, NEWLINE)
2533        self.assertRaises(
2534            IndentationError, compile, invalid, "<string>", "exec"
2535        )
2536
2537    def test_continuation_lines_indentation(self):
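        # Backslash continuation lines that contribute no tokens must not
        # change the indentation structure: each snippet has to produce the
        # same (type, string) pairs as its continuation-free counterpart.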
        def get_tokens(string):
            return [(kind, text) for (kind, text, *_)
                    in _generate_tokens_from_c_tokenizer(string)]
2540
2541        code = dedent("""
2542            def fib(n):
2543                \\
2544            '''Print a Fibonacci series up to n.'''
2545                \\
2546            a, b = 0, 1
2547        """)
2548
2549        self.check_tokenize(code, """\
2550    NAME       'def'         (2, 0) (2, 3)
2551    NAME       'fib'         (2, 4) (2, 7)
2552    LPAR       '('           (2, 7) (2, 8)
2553    NAME       'n'           (2, 8) (2, 9)
2554    RPAR       ')'           (2, 9) (2, 10)
2555    COLON      ':'           (2, 10) (2, 11)
2556    NEWLINE    ''            (2, 11) (2, 11)
2557    INDENT     ''            (4, -1) (4, -1)
2558    STRING     "'''Print a Fibonacci series up to n.'''" (4, 0) (4, 39)
2559    NEWLINE    ''            (4, 39) (4, 39)
2560    NAME       'a'           (6, 0) (6, 1)
2561    COMMA      ','           (6, 1) (6, 2)
2562    NAME       'b'           (6, 3) (6, 4)
2563    EQUAL      '='           (6, 5) (6, 6)
2564    NUMBER     '0'           (6, 7) (6, 8)
2565    COMMA      ','           (6, 8) (6, 9)
2566    NUMBER     '1'           (6, 10) (6, 11)
2567    NEWLINE    ''            (6, 11) (6, 11)
2568    DEDENT     ''            (6, -1) (6, -1)
2569        """)
2570
2571        code_no_cont = dedent("""
2572            def fib(n):
2573                '''Print a Fibonacci series up to n.'''
2574                a, b = 0, 1
2575        """)
2576
2577        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
2578
2579        code = dedent("""
2580            pass
2581                \\
2582
2583            pass
2584        """)
2585
2586        self.check_tokenize(code, """\
2587    NAME       'pass'        (2, 0) (2, 4)
2588    NEWLINE    ''            (2, 4) (2, 4)
2589    NAME       'pass'        (5, 0) (5, 4)
2590    NEWLINE    ''            (5, 4) (5, 4)
2591        """)
2592
2593        code_no_cont = dedent("""
2594            pass
2595            pass
2596        """)
2597
2598        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
2599
2600        code = dedent("""
2601            if x:
2602                y = 1
2603                \\
2604                        \\
2605                    \\
2606                \\
2607                foo = 1
2608        """)
2609
2610        self.check_tokenize(code, """\
2611    NAME       'if'          (2, 0) (2, 2)
2612    NAME       'x'           (2, 3) (2, 4)
2613    COLON      ':'           (2, 4) (2, 5)
2614    NEWLINE    ''            (2, 5) (2, 5)
2615    INDENT     ''            (3, -1) (3, -1)
2616    NAME       'y'           (3, 4) (3, 5)
2617    EQUAL      '='           (3, 6) (3, 7)
2618    NUMBER     '1'           (3, 8) (3, 9)
2619    NEWLINE    ''            (3, 9) (3, 9)
2620    NAME       'foo'         (8, 4) (8, 7)
2621    EQUAL      '='           (8, 8) (8, 9)
2622    NUMBER     '1'           (8, 10) (8, 11)
2623    NEWLINE    ''            (8, 11) (8, 11)
2624    DEDENT     ''            (8, -1) (8, -1)
2625        """)
2626
2627        code_no_cont = dedent("""
2628            if x:
2629                y = 1
2630                foo = 1
2631        """)
2632
2633        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
2634
2635
2636class CTokenizerBufferTests(unittest.TestCase):
2637    def test_newline_at_the_end_of_buffer(self):
        # See issue 99581: Make sure that if we need to add a new line at the
        # end of the buffer, we have enough space in the buffer, especially when
        # the current line is as long as the buffer space available.
2641        test_script = f"""\
2642        #coding: latin-1
2643        #{"a"*10000}
2644        #{"a"*10002}"""
2645        with os_helper.temp_dir() as temp_dir:
2646            file_name = make_script(temp_dir, 'foo', test_script)
2647            run_test_script(file_name)
2648
2649
2650if __name__ == "__main__":
2651    unittest.main()
2652