1# -*- coding: koi8-r -*- 2 3import unittest 4from test.support import script_helper, captured_stdout, requires_subprocess 5from test.support.os_helper import TESTFN, unlink, rmtree 6from test.support.import_helper import unload 7import importlib 8import os 9import sys 10import subprocess 11import tempfile 12 13class MiscSourceEncodingTest(unittest.TestCase): 14 15 def test_pep263(self): 16 self.assertEqual( 17 "�����".encode("utf-8"), 18 b'\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd' 19 ) 20 self.assertEqual( 21 "\�".encode("utf-8"), 22 b'\\\xd0\x9f' 23 ) 24 25 def test_compilestring(self): 26 # see #1882 27 c = compile(b"\n# coding: utf-8\nu = '\xc3\xb3'\n", "dummy", "exec") 28 d = {} 29 exec(c, d) 30 self.assertEqual(d['u'], '\xf3') 31 32 def test_issue2301(self): 33 try: 34 compile(b"# coding: cp932\nprint '\x94\x4e'", "dummy", "exec") 35 except SyntaxError as v: 36 self.assertEqual(v.text.rstrip('\n'), "print '\u5e74'") 37 else: 38 self.fail() 39 40 def test_issue4626(self): 41 c = compile("# coding=latin-1\n\u00c6 = '\u00c6'", "dummy", "exec") 42 d = {} 43 exec(c, d) 44 self.assertEqual(d['\xc6'], '\xc6') 45 46 def test_issue3297(self): 47 c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec") 48 d = {} 49 exec(c, d) 50 self.assertEqual(d['a'], d['b']) 51 self.assertEqual(len(d['a']), len(d['b'])) 52 self.assertEqual(ascii(d['a']), ascii(d['b'])) 53 54 def test_issue7820(self): 55 # Ensure that check_bom() restores all bytes in the right order if 56 # check_bom() fails in pydebug mode: a buffer starts with the first 57 # byte of a valid BOM, but next bytes are different 58 59 # one byte in common with the UTF-16-LE BOM 60 self.assertRaises(SyntaxError, eval, b'\xff\x20') 61 62 # one byte in common with the UTF-8 BOM 63 self.assertRaises(SyntaxError, eval, b'\xef\x20') 64 65 # two bytes in common with the UTF-8 BOM 66 self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20') 67 68 @requires_subprocess() 69 def test_20731(self): 70 sub = subprocess.Popen([sys.executable, 71 os.path.join(os.path.dirname(__file__), 72 'coding20731.py')], 73 stderr=subprocess.PIPE) 74 err = sub.communicate()[1] 75 self.assertEqual(sub.returncode, 0) 76 self.assertNotIn(b'SyntaxError', err) 77 78 def test_error_message(self): 79 compile(b'# -*- coding: iso-8859-15 -*-\n', 'dummy', 'exec') 80 compile(b'\xef\xbb\xbf\n', 'dummy', 'exec') 81 compile(b'\xef\xbb\xbf# -*- coding: utf-8 -*-\n', 'dummy', 'exec') 82 with self.assertRaisesRegex(SyntaxError, 'fake'): 83 compile(b'# -*- coding: fake -*-\n', 'dummy', 'exec') 84 with self.assertRaisesRegex(SyntaxError, 'iso-8859-15'): 85 compile(b'\xef\xbb\xbf# -*- coding: iso-8859-15 -*-\n', 86 'dummy', 'exec') 87 with self.assertRaisesRegex(SyntaxError, 'BOM'): 88 compile(b'\xef\xbb\xbf# -*- coding: iso-8859-15 -*-\n', 89 'dummy', 'exec') 90 with self.assertRaisesRegex(SyntaxError, 'fake'): 91 compile(b'\xef\xbb\xbf# -*- coding: fake -*-\n', 'dummy', 'exec') 92 with self.assertRaisesRegex(SyntaxError, 'BOM'): 93 compile(b'\xef\xbb\xbf# -*- coding: fake -*-\n', 'dummy', 'exec') 94 95 def test_bad_coding(self): 96 module_name = 'bad_coding' 97 self.verify_bad_module(module_name) 98 99 def test_bad_coding2(self): 100 module_name = 'bad_coding2' 101 self.verify_bad_module(module_name) 102 103 def verify_bad_module(self, module_name): 104 self.assertRaises(SyntaxError, __import__, 'test.' + module_name) 105 106 path = os.path.dirname(__file__) 107 filename = os.path.join(path, module_name + '.py') 108 with open(filename, "rb") as fp: 109 bytes = fp.read() 110 self.assertRaises(SyntaxError, compile, bytes, filename, 'exec') 111 112 def test_exec_valid_coding(self): 113 d = {} 114 exec(b'# coding: cp949\na = "\xaa\xa7"\n', d) 115 self.assertEqual(d['a'], '\u3047') 116 117 def test_file_parse(self): 118 # issue1134: all encodings outside latin-1 and utf-8 fail on 119 # multiline strings and long lines (>512 columns) 120 unload(TESTFN) 121 filename = TESTFN + ".py" 122 f = open(filename, "w", encoding="cp1252") 123 sys.path.insert(0, os.curdir) 124 try: 125 with f: 126 f.write("# -*- coding: cp1252 -*-\n") 127 f.write("'''A short string\n") 128 f.write("'''\n") 129 f.write("'A very long string %s'\n" % ("X" * 1000)) 130 131 importlib.invalidate_caches() 132 __import__(TESTFN) 133 finally: 134 del sys.path[0] 135 unlink(filename) 136 unlink(filename + "c") 137 unlink(filename + "o") 138 unload(TESTFN) 139 rmtree('__pycache__') 140 141 def test_error_from_string(self): 142 # See http://bugs.python.org/issue6289 143 input = "# coding: ascii\n\N{SNOWMAN}".encode('utf-8') 144 with self.assertRaises(SyntaxError) as c: 145 compile(input, "<string>", "exec") 146 expected = "'ascii' codec can't decode byte 0xe2 in position 16: " \ 147 "ordinal not in range(128)" 148 self.assertTrue(c.exception.args[0].startswith(expected), 149 msg=c.exception.args[0]) 150 151 def test_file_parse_error_multiline(self): 152 # gh96611: 153 with open(TESTFN, "wb") as fd: 154 fd.write(b'print("""\n\xb1""")\n') 155 156 try: 157 retcode, stdout, stderr = script_helper.assert_python_failure(TESTFN) 158 159 self.assertGreater(retcode, 0) 160 self.assertIn(b"Non-UTF-8 code starting with '\\xb1'", stderr) 161 finally: 162 os.unlink(TESTFN) 163 164 def test_tokenizer_fstring_warning_in_first_line(self): 165 source = "0b1and 2" 166 with open(TESTFN, "w") as fd: 167 fd.write("{}".format(source)) 168 try: 169 retcode, stdout, stderr = script_helper.assert_python_ok(TESTFN) 170 self.assertIn(b"SyntaxWarning: invalid binary litera", stderr) 171 self.assertEqual(stderr.count(source.encode()), 1) 172 finally: 173 os.unlink(TESTFN) 174 175 176class AbstractSourceEncodingTest: 177 178 def test_default_coding(self): 179 src = (b'print(ascii("\xc3\xa4"))\n') 180 self.check_script_output(src, br"'\xe4'") 181 182 def test_first_coding_line(self): 183 src = (b'#coding:iso8859-15\n' 184 b'print(ascii("\xc3\xa4"))\n') 185 self.check_script_output(src, br"'\xc3\u20ac'") 186 187 def test_second_coding_line(self): 188 src = (b'#\n' 189 b'#coding:iso8859-15\n' 190 b'print(ascii("\xc3\xa4"))\n') 191 self.check_script_output(src, br"'\xc3\u20ac'") 192 193 def test_third_coding_line(self): 194 # Only first two lines are tested for a magic comment. 195 src = (b'#\n' 196 b'#\n' 197 b'#coding:iso8859-15\n' 198 b'print(ascii("\xc3\xa4"))\n') 199 self.check_script_output(src, br"'\xe4'") 200 201 def test_double_coding_line(self): 202 # If the first line matches the second line is ignored. 203 src = (b'#coding:iso8859-15\n' 204 b'#coding:latin1\n' 205 b'print(ascii("\xc3\xa4"))\n') 206 self.check_script_output(src, br"'\xc3\u20ac'") 207 208 def test_double_coding_same_line(self): 209 src = (b'#coding:iso8859-15 coding:latin1\n' 210 b'print(ascii("\xc3\xa4"))\n') 211 self.check_script_output(src, br"'\xc3\u20ac'") 212 213 def test_first_non_utf8_coding_line(self): 214 src = (b'#coding:iso-8859-15 \xa4\n' 215 b'print(ascii("\xc3\xa4"))\n') 216 self.check_script_output(src, br"'\xc3\u20ac'") 217 218 def test_second_non_utf8_coding_line(self): 219 src = (b'\n' 220 b'#coding:iso-8859-15 \xa4\n' 221 b'print(ascii("\xc3\xa4"))\n') 222 self.check_script_output(src, br"'\xc3\u20ac'") 223 224 def test_utf8_bom(self): 225 src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n') 226 self.check_script_output(src, br"'\xe4'") 227 228 def test_utf8_bom_and_utf8_coding_line(self): 229 src = (b'\xef\xbb\xbf#coding:utf-8\n' 230 b'print(ascii("\xc3\xa4"))\n') 231 self.check_script_output(src, br"'\xe4'") 232 233 def test_crlf(self): 234 src = (b'print(ascii("""\r\n"""))\n') 235 out = self.check_script_output(src, br"'\n'") 236 237 def test_crcrlf(self): 238 src = (b'print(ascii("""\r\r\n"""))\n') 239 out = self.check_script_output(src, br"'\n\n'") 240 241 def test_crcrcrlf(self): 242 src = (b'print(ascii("""\r\r\r\n"""))\n') 243 out = self.check_script_output(src, br"'\n\n\n'") 244 245 def test_crcrcrlf2(self): 246 src = (b'#coding:iso-8859-1\n' 247 b'print(ascii("""\r\r\r\n"""))\n') 248 out = self.check_script_output(src, br"'\n\n\n'") 249 250 251class UTF8ValidatorTest(unittest.TestCase): 252 @unittest.skipIf(not sys.platform.startswith("linux"), 253 "Too slow to run on non-Linux platforms") 254 def test_invalid_utf8(self): 255 # This is a port of test_utf8_decode_invalid_sequences in 256 # test_unicode.py to exercise the separate utf8 validator in 257 # Parser/tokenizer.c used when reading source files. 258 259 # That file is written using low-level C file I/O, so the only way to 260 # test it is to write actual files to disk. 261 262 # Each example is put inside a string at the top of the file so 263 # it's an otherwise valid Python source file. Put some newlines 264 # beforehand so we can assert that the error is reported on the 265 # correct line. 266 template = b'\n\n\n"%s"\n' 267 268 fn = TESTFN 269 self.addCleanup(unlink, fn) 270 271 def check(content): 272 with open(fn, 'wb') as fp: 273 fp.write(template % content) 274 rc, stdout, stderr = script_helper.assert_python_failure(fn) 275 # We want to assert that the python subprocess failed gracefully, 276 # not via a signal. 277 self.assertGreaterEqual(rc, 1) 278 self.assertIn(b"Non-UTF-8 code starting with", stderr) 279 self.assertIn(b"on line 4", stderr) 280 281 # continuation bytes in a sequence of 2, 3, or 4 bytes 282 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] 283 # start bytes of a 2-byte sequence equivalent to code points < 0x7F 284 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)] 285 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF 286 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)] 287 invalid_start_bytes = ( 288 continuation_bytes + invalid_2B_seq_start_bytes + 289 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)] 290 ) 291 292 for byte in invalid_start_bytes: 293 check(byte) 294 295 for sb in invalid_2B_seq_start_bytes: 296 for cb in continuation_bytes: 297 check(sb + cb) 298 299 for sb in invalid_4B_seq_start_bytes: 300 for cb1 in continuation_bytes[:3]: 301 for cb3 in continuation_bytes[:3]: 302 check(sb+cb1+b'\x80'+cb3) 303 304 for cb in [bytes([x]) for x in range(0x80, 0xA0)]: 305 check(b'\xE0'+cb+b'\x80') 306 check(b'\xE0'+cb+b'\xBF') 307 # surrogates 308 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]: 309 check(b'\xED'+cb+b'\x80') 310 check(b'\xED'+cb+b'\xBF') 311 for cb in [bytes([x]) for x in range(0x80, 0x90)]: 312 check(b'\xF0'+cb+b'\x80\x80') 313 check(b'\xF0'+cb+b'\xBF\xBF') 314 for cb in [bytes([x]) for x in range(0x90, 0xC0)]: 315 check(b'\xF4'+cb+b'\x80\x80') 316 check(b'\xF4'+cb+b'\xBF\xBF') 317 318 319class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): 320 321 def check_script_output(self, src, expected): 322 with captured_stdout() as stdout: 323 exec(src) 324 out = stdout.getvalue().encode('latin1') 325 self.assertEqual(out.rstrip(), expected) 326 327 328class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): 329 330 def check_script_output(self, src, expected): 331 with tempfile.TemporaryDirectory() as tmpd: 332 fn = os.path.join(tmpd, 'test.py') 333 with open(fn, 'wb') as fp: 334 fp.write(src) 335 res = script_helper.assert_python_ok(fn) 336 self.assertEqual(res.out.rstrip(), expected) 337 338 339if __name__ == "__main__": 340 unittest.main() 341