1import codecs 2import contextlib 3import io 4import locale 5import sys 6import unittest 7import encodings 8from unittest import mock 9 10from test import support 11from test.support import os_helper 12from test.support import warnings_helper 13 14try: 15 import _testcapi 16except ImportError: 17 _testcapi = None 18try: 19 import _testinternalcapi 20except ImportError: 21 _testinternalcapi = None 22 23try: 24 import ctypes 25except ImportError: 26 ctypes = None 27 SIZEOF_WCHAR_T = -1 28else: 29 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) 30 31def coding_checker(self, coder): 32 def check(input, expect): 33 self.assertEqual(coder(input), (expect, len(input))) 34 return check 35 36# On small versions of Windows like Windows IoT or Windows Nano Server not all codepages are present 37def is_code_page_present(cp): 38 from ctypes import POINTER, WINFUNCTYPE, WinDLL 39 from ctypes.wintypes import BOOL, BYTE, WCHAR, UINT, DWORD 40 41 MAX_LEADBYTES = 12 # 5 ranges, 2 bytes ea., 0 term. 42 MAX_DEFAULTCHAR = 2 # single or double byte 43 MAX_PATH = 260 44 class CPINFOEXW(ctypes.Structure): 45 _fields_ = [("MaxCharSize", UINT), 46 ("DefaultChar", BYTE*MAX_DEFAULTCHAR), 47 ("LeadByte", BYTE*MAX_LEADBYTES), 48 ("UnicodeDefaultChar", WCHAR), 49 ("CodePage", UINT), 50 ("CodePageName", WCHAR*MAX_PATH)] 51 52 prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW)) 53 GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32"))) 54 info = CPINFOEXW() 55 return GetCPInfoEx(cp, 0, info) 56 57class Queue(object): 58 """ 59 queue: write bytes at one end, read bytes from the other end 60 """ 61 def __init__(self, buffer): 62 self._buffer = buffer 63 64 def write(self, chars): 65 self._buffer += chars 66 67 def read(self, size=-1): 68 if size<0: 69 s = self._buffer 70 self._buffer = self._buffer[:0] # make empty 71 return s 72 else: 73 s = self._buffer[:size] 74 self._buffer = self._buffer[size:] 75 return s 76 77 78class MixInCheckStateHandling: 79 def 
check_state_handling_decode(self, encoding, u, s): 80 for i in range(len(s)+1): 81 d = codecs.getincrementaldecoder(encoding)() 82 part1 = d.decode(s[:i]) 83 state = d.getstate() 84 self.assertIsInstance(state[1], int) 85 # Check that the condition stated in the documentation for 86 # IncrementalDecoder.getstate() holds 87 if not state[1]: 88 # reset decoder to the default state without anything buffered 89 d.setstate((state[0][:0], 0)) 90 # Feeding the previous input may not produce any output 91 self.assertTrue(not d.decode(state[0])) 92 # The decoder must return to the same state 93 self.assertEqual(state, d.getstate()) 94 # Create a new decoder and set it to the state 95 # we extracted from the old one 96 d = codecs.getincrementaldecoder(encoding)() 97 d.setstate(state) 98 part2 = d.decode(s[i:], True) 99 self.assertEqual(u, part1+part2) 100 101 def check_state_handling_encode(self, encoding, u, s): 102 for i in range(len(u)+1): 103 d = codecs.getincrementalencoder(encoding)() 104 part1 = d.encode(u[:i]) 105 state = d.getstate() 106 d = codecs.getincrementalencoder(encoding)() 107 d.setstate(state) 108 part2 = d.encode(u[i:], True) 109 self.assertEqual(s, part1+part2) 110 111 112class ReadTest(MixInCheckStateHandling): 113 def check_partial(self, input, partialresults): 114 # get a StreamReader for the encoding and feed the bytestring version 115 # of input to the reader byte by byte. Read everything available from 116 # the StreamReader and check that the results equal the appropriate 117 # entries from partialresults. 
118 q = Queue(b"") 119 r = codecs.getreader(self.encoding)(q) 120 result = "" 121 for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True): 122 q.write(bytes([c])) 123 result += r.read() 124 self.assertEqual(result, partialresult) 125 # check that there's nothing left in the buffers 126 self.assertEqual(r.read(), "") 127 self.assertEqual(r.bytebuffer, b"") 128 129 # do the check again, this time using an incremental decoder 130 d = codecs.getincrementaldecoder(self.encoding)() 131 result = "" 132 for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True): 133 result += d.decode(bytes([c])) 134 self.assertEqual(result, partialresult) 135 # check that there's nothing left in the buffers 136 self.assertEqual(d.decode(b"", True), "") 137 self.assertEqual(d.buffer, b"") 138 139 # Check whether the reset method works properly 140 d.reset() 141 result = "" 142 for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True): 143 result += d.decode(bytes([c])) 144 self.assertEqual(result, partialresult) 145 # check that there's nothing left in the buffers 146 self.assertEqual(d.decode(b"", True), "") 147 self.assertEqual(d.buffer, b"") 148 149 # check iterdecode() 150 encoded = input.encode(self.encoding) 151 self.assertEqual( 152 input, 153 "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding)) 154 ) 155 156 def test_readline(self): 157 def getreader(input): 158 stream = io.BytesIO(input.encode(self.encoding)) 159 return codecs.getreader(self.encoding)(stream) 160 161 def readalllines(input, keepends=True, size=None): 162 reader = getreader(input) 163 lines = [] 164 while True: 165 line = reader.readline(size=size, keepends=keepends) 166 if not line: 167 break 168 lines.append(line) 169 return "|".join(lines) 170 171 s = "foo\nbar\r\nbaz\rspam\u2028eggs" 172 sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs" 173 sexpectednoends = "foo|bar|baz|spam|eggs" 174 
self.assertEqual(readalllines(s, True), sexpected) 175 self.assertEqual(readalllines(s, False), sexpectednoends) 176 self.assertEqual(readalllines(s, True, 10), sexpected) 177 self.assertEqual(readalllines(s, False, 10), sexpectednoends) 178 179 lineends = ("\n", "\r\n", "\r", "\u2028") 180 # Test long lines (multiple calls to read() in readline()) 181 vw = [] 182 vwo = [] 183 for (i, lineend) in enumerate(lineends): 184 vw.append((i*200+200)*"\u3042" + lineend) 185 vwo.append((i*200+200)*"\u3042") 186 self.assertEqual(readalllines("".join(vw), True), "|".join(vw)) 187 self.assertEqual(readalllines("".join(vw), False), "|".join(vwo)) 188 189 # Test lines where the first read might end with \r, so the 190 # reader has to look ahead whether this is a lone \r or a \r\n 191 for size in range(80): 192 for lineend in lineends: 193 s = 10*(size*"a" + lineend + "xxx\n") 194 reader = getreader(s) 195 for i in range(10): 196 self.assertEqual( 197 reader.readline(keepends=True), 198 size*"a" + lineend, 199 ) 200 self.assertEqual( 201 reader.readline(keepends=True), 202 "xxx\n", 203 ) 204 reader = getreader(s) 205 for i in range(10): 206 self.assertEqual( 207 reader.readline(keepends=False), 208 size*"a", 209 ) 210 self.assertEqual( 211 reader.readline(keepends=False), 212 "xxx", 213 ) 214 215 def test_mixed_readline_and_read(self): 216 lines = ["Humpty Dumpty sat on a wall,\n", 217 "Humpty Dumpty had a great fall.\r\n", 218 "All the king's horses and all the king's men\r", 219 "Couldn't put Humpty together again."] 220 data = ''.join(lines) 221 def getreader(): 222 stream = io.BytesIO(data.encode(self.encoding)) 223 return codecs.getreader(self.encoding)(stream) 224 225 # Issue #8260: Test readline() followed by read() 226 f = getreader() 227 self.assertEqual(f.readline(), lines[0]) 228 self.assertEqual(f.read(), ''.join(lines[1:])) 229 self.assertEqual(f.read(), '') 230 231 # Issue #32110: Test readline() followed by read(n) 232 f = getreader() 233 
self.assertEqual(f.readline(), lines[0]) 234 self.assertEqual(f.read(1), lines[1][0]) 235 self.assertEqual(f.read(0), '') 236 self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100]) 237 238 # Issue #16636: Test readline() followed by readlines() 239 f = getreader() 240 self.assertEqual(f.readline(), lines[0]) 241 self.assertEqual(f.readlines(), lines[1:]) 242 self.assertEqual(f.read(), '') 243 244 # Test read(n) followed by read() 245 f = getreader() 246 self.assertEqual(f.read(size=40, chars=5), data[:5]) 247 self.assertEqual(f.read(), data[5:]) 248 self.assertEqual(f.read(), '') 249 250 # Issue #32110: Test read(n) followed by read(n) 251 f = getreader() 252 self.assertEqual(f.read(size=40, chars=5), data[:5]) 253 self.assertEqual(f.read(1), data[5]) 254 self.assertEqual(f.read(0), '') 255 self.assertEqual(f.read(100), data[6:106]) 256 257 # Issue #12446: Test read(n) followed by readlines() 258 f = getreader() 259 self.assertEqual(f.read(size=40, chars=5), data[:5]) 260 self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:]) 261 self.assertEqual(f.read(), '') 262 263 def test_bug1175396(self): 264 s = [ 265 '<%!--===================================================\r\n', 266 ' BLOG index page: show recent articles,\r\n', 267 ' today\'s articles, or articles of a specific date.\r\n', 268 '========================================================--%>\r\n', 269 '<%@inputencoding="ISO-8859-1"%>\r\n', 270 '<%@pagetemplate=TEMPLATE.y%>\r\n', 271 '<%@import=import frog.util, frog%>\r\n', 272 '<%@import=import frog.objects%>\r\n', 273 '<%@import=from frog.storageerrors import StorageError%>\r\n', 274 '<%\r\n', 275 '\r\n', 276 'import logging\r\n', 277 'log=logging.getLogger("Snakelets.logger")\r\n', 278 '\r\n', 279 '\r\n', 280 'user=self.SessionCtx.user\r\n', 281 'storageEngine=self.SessionCtx.storageEngine\r\n', 282 '\r\n', 283 '\r\n', 284 'def readArticlesFromDate(date, count=None):\r\n', 285 ' entryids=storageEngine.listBlogEntries(date)\r\n', 286 ' 
entryids.reverse() # descending\r\n', 287 ' if count:\r\n', 288 ' entryids=entryids[:count]\r\n', 289 ' try:\r\n', 290 ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n', 291 ' except StorageError,x:\r\n', 292 ' log.error("Error loading articles: "+str(x))\r\n', 293 ' self.abort("cannot load articles")\r\n', 294 '\r\n', 295 'showdate=None\r\n', 296 '\r\n', 297 'arg=self.Request.getArg()\r\n', 298 'if arg=="today":\r\n', 299 ' #-------------------- TODAY\'S ARTICLES\r\n', 300 ' self.write("<h2>Today\'s articles</h2>")\r\n', 301 ' showdate = frog.util.isodatestr() \r\n', 302 ' entries = readArticlesFromDate(showdate)\r\n', 303 'elif arg=="active":\r\n', 304 ' #-------------------- ACTIVE ARTICLES redirect\r\n', 305 ' self.Yredirect("active.y")\r\n', 306 'elif arg=="login":\r\n', 307 ' #-------------------- LOGIN PAGE redirect\r\n', 308 ' self.Yredirect("login.y")\r\n', 309 'elif arg=="date":\r\n', 310 ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n', 311 ' showdate = self.Request.getParameter("date")\r\n', 312 ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n', 313 ' entries = readArticlesFromDate(showdate)\r\n', 314 'else:\r\n', 315 ' #-------------------- RECENT ARTICLES\r\n', 316 ' self.write("<h2>Recent articles</h2>")\r\n', 317 ' dates=storageEngine.listBlogEntryDates()\r\n', 318 ' if dates:\r\n', 319 ' entries=[]\r\n', 320 ' SHOWAMOUNT=10\r\n', 321 ' for showdate in dates:\r\n', 322 ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n', 323 ' if len(entries)>=SHOWAMOUNT:\r\n', 324 ' break\r\n', 325 ' \r\n', 326 ] 327 stream = io.BytesIO("".join(s).encode(self.encoding)) 328 reader = codecs.getreader(self.encoding)(stream) 329 for (i, line) in enumerate(reader): 330 self.assertEqual(line, s[i]) 331 332 def test_readlinequeue(self): 333 q = Queue(b"") 334 writer = codecs.getwriter(self.encoding)(q) 335 reader = codecs.getreader(self.encoding)(q) 336 337 
# No lineends 338 writer.write("foo\r") 339 self.assertEqual(reader.readline(keepends=False), "foo") 340 writer.write("\nbar\r") 341 self.assertEqual(reader.readline(keepends=False), "") 342 self.assertEqual(reader.readline(keepends=False), "bar") 343 writer.write("baz") 344 self.assertEqual(reader.readline(keepends=False), "baz") 345 self.assertEqual(reader.readline(keepends=False), "") 346 347 # Lineends 348 writer.write("foo\r") 349 self.assertEqual(reader.readline(keepends=True), "foo\r") 350 writer.write("\nbar\r") 351 self.assertEqual(reader.readline(keepends=True), "\n") 352 self.assertEqual(reader.readline(keepends=True), "bar\r") 353 writer.write("baz") 354 self.assertEqual(reader.readline(keepends=True), "baz") 355 self.assertEqual(reader.readline(keepends=True), "") 356 writer.write("foo\r\n") 357 self.assertEqual(reader.readline(keepends=True), "foo\r\n") 358 359 def test_bug1098990_a(self): 360 s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n" 361 s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n" 362 s3 = "next line.\r\n" 363 364 s = (s1+s2+s3).encode(self.encoding) 365 stream = io.BytesIO(s) 366 reader = codecs.getreader(self.encoding)(stream) 367 self.assertEqual(reader.readline(), s1) 368 self.assertEqual(reader.readline(), s2) 369 self.assertEqual(reader.readline(), s3) 370 self.assertEqual(reader.readline(), "") 371 372 def test_bug1098990_b(self): 373 s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n" 374 s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n" 375 s3 = "stillokay:bbbbxx\r\n" 376 s4 = "broken!!!!badbad\r\n" 377 s5 = "againokay.\r\n" 378 379 s = (s1+s2+s3+s4+s5).encode(self.encoding) 380 stream = io.BytesIO(s) 381 reader = codecs.getreader(self.encoding)(stream) 382 self.assertEqual(reader.readline(), s1) 383 self.assertEqual(reader.readline(), s2) 384 self.assertEqual(reader.readline(), s3) 385 self.assertEqual(reader.readline(), s4) 386 self.assertEqual(reader.readline(), s5) 387 
self.assertEqual(reader.readline(), "") 388 389 ill_formed_sequence_replace = "\ufffd" 390 391 def test_lone_surrogates(self): 392 self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) 393 self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), 394 "[\\udc80]".encode(self.encoding)) 395 self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"), 396 "[\\udc80]".encode(self.encoding)) 397 self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), 398 "[�]".encode(self.encoding)) 399 self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), 400 "[]".encode(self.encoding)) 401 self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), 402 "[?]".encode(self.encoding)) 403 404 # sequential surrogate characters 405 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"), 406 "[]".encode(self.encoding)) 407 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"), 408 "[??]".encode(self.encoding)) 409 410 bom = "".encode(self.encoding) 411 for before, after in [("\U00010fff", "A"), ("[", "]"), 412 ("A", "\U00010fff")]: 413 before_sequence = before.encode(self.encoding)[len(bom):] 414 after_sequence = after.encode(self.encoding)[len(bom):] 415 test_string = before + "\uDC80" + after 416 test_sequence = (bom + before_sequence + 417 self.ill_formed_sequence + after_sequence) 418 self.assertRaises(UnicodeDecodeError, test_sequence.decode, 419 self.encoding) 420 self.assertEqual(test_string.encode(self.encoding, 421 "surrogatepass"), 422 test_sequence) 423 self.assertEqual(test_sequence.decode(self.encoding, 424 "surrogatepass"), 425 test_string) 426 self.assertEqual(test_sequence.decode(self.encoding, "ignore"), 427 before + after) 428 self.assertEqual(test_sequence.decode(self.encoding, "replace"), 429 before + self.ill_formed_sequence_replace + after) 430 backslashreplace = ''.join('\\x%02x' % b 431 for b in self.ill_formed_sequence) 432 self.assertEqual(test_sequence.decode(self.encoding, 
"backslashreplace"), 433 before + backslashreplace + after) 434 435 def test_incremental_surrogatepass(self): 436 # Test incremental decoder for surrogatepass handler: 437 # see issue #24214 438 # High surrogate 439 data = '\uD901'.encode(self.encoding, 'surrogatepass') 440 for i in range(1, len(data)): 441 dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass') 442 self.assertEqual(dec.decode(data[:i]), '') 443 self.assertEqual(dec.decode(data[i:], True), '\uD901') 444 # Low surrogate 445 data = '\uDC02'.encode(self.encoding, 'surrogatepass') 446 for i in range(1, len(data)): 447 dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass') 448 self.assertEqual(dec.decode(data[:i]), '') 449 self.assertEqual(dec.decode(data[i:]), '\uDC02') 450 451 452class UTF32Test(ReadTest, unittest.TestCase): 453 encoding = "utf-32" 454 if sys.byteorder == 'little': 455 ill_formed_sequence = b"\x80\xdc\x00\x00" 456 else: 457 ill_formed_sequence = b"\x00\x00\xdc\x80" 458 459 spamle = (b'\xff\xfe\x00\x00' 460 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' 461 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00') 462 spambe = (b'\x00\x00\xfe\xff' 463 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' 464 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') 465 466 def test_only_one_bom(self): 467 _,_,reader,writer = codecs.lookup(self.encoding) 468 # encode some stream 469 s = io.BytesIO() 470 f = writer(s) 471 f.write("spam") 472 f.write("spam") 473 d = s.getvalue() 474 # check whether there is exactly one BOM in it 475 self.assertTrue(d == self.spamle or d == self.spambe) 476 # try to read it back 477 s = io.BytesIO(d) 478 f = reader(s) 479 self.assertEqual(f.read(), "spamspam") 480 481 def test_badbom(self): 482 s = io.BytesIO(4*b"\xff") 483 f = codecs.getreader(self.encoding)(s) 484 self.assertRaises(UnicodeError, f.read) 485 486 s = io.BytesIO(8*b"\xff") 487 f = codecs.getreader(self.encoding)(s) 488 self.assertRaises(UnicodeError, f.read) 
489 490 def test_partial(self): 491 self.check_partial( 492 "\x00\xff\u0100\uffff\U00010000", 493 [ 494 "", # first byte of BOM read 495 "", # second byte of BOM read 496 "", # third byte of BOM read 497 "", # fourth byte of BOM read => byteorder known 498 "", 499 "", 500 "", 501 "\x00", 502 "\x00", 503 "\x00", 504 "\x00", 505 "\x00\xff", 506 "\x00\xff", 507 "\x00\xff", 508 "\x00\xff", 509 "\x00\xff\u0100", 510 "\x00\xff\u0100", 511 "\x00\xff\u0100", 512 "\x00\xff\u0100", 513 "\x00\xff\u0100\uffff", 514 "\x00\xff\u0100\uffff", 515 "\x00\xff\u0100\uffff", 516 "\x00\xff\u0100\uffff", 517 "\x00\xff\u0100\uffff\U00010000", 518 ] 519 ) 520 521 def test_handlers(self): 522 self.assertEqual(('\ufffd', 1), 523 codecs.utf_32_decode(b'\x01', 'replace', True)) 524 self.assertEqual(('', 1), 525 codecs.utf_32_decode(b'\x01', 'ignore', True)) 526 527 def test_errors(self): 528 self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, 529 b"\xff", "strict", True) 530 531 def test_decoder_state(self): 532 self.check_state_handling_decode(self.encoding, 533 "spamspam", self.spamle) 534 self.check_state_handling_decode(self.encoding, 535 "spamspam", self.spambe) 536 537 def test_issue8941(self): 538 # Issue #8941: insufficient result allocation when decoding into 539 # surrogate pairs on UCS-2 builds. 
540 encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024 541 self.assertEqual('\U00010000' * 1024, 542 codecs.utf_32_decode(encoded_le)[0]) 543 encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024 544 self.assertEqual('\U00010000' * 1024, 545 codecs.utf_32_decode(encoded_be)[0]) 546 547 548class UTF32LETest(ReadTest, unittest.TestCase): 549 encoding = "utf-32-le" 550 ill_formed_sequence = b"\x80\xdc\x00\x00" 551 552 def test_partial(self): 553 self.check_partial( 554 "\x00\xff\u0100\uffff\U00010000", 555 [ 556 "", 557 "", 558 "", 559 "\x00", 560 "\x00", 561 "\x00", 562 "\x00", 563 "\x00\xff", 564 "\x00\xff", 565 "\x00\xff", 566 "\x00\xff", 567 "\x00\xff\u0100", 568 "\x00\xff\u0100", 569 "\x00\xff\u0100", 570 "\x00\xff\u0100", 571 "\x00\xff\u0100\uffff", 572 "\x00\xff\u0100\uffff", 573 "\x00\xff\u0100\uffff", 574 "\x00\xff\u0100\uffff", 575 "\x00\xff\u0100\uffff\U00010000", 576 ] 577 ) 578 579 def test_simple(self): 580 self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00") 581 582 def test_errors(self): 583 self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, 584 b"\xff", "strict", True) 585 586 def test_issue8941(self): 587 # Issue #8941: insufficient result allocation when decoding into 588 # surrogate pairs on UCS-2 builds. 
589 encoded = b'\x00\x00\x01\x00' * 1024 590 self.assertEqual('\U00010000' * 1024, 591 codecs.utf_32_le_decode(encoded)[0]) 592 593 594class UTF32BETest(ReadTest, unittest.TestCase): 595 encoding = "utf-32-be" 596 ill_formed_sequence = b"\x00\x00\xdc\x80" 597 598 def test_partial(self): 599 self.check_partial( 600 "\x00\xff\u0100\uffff\U00010000", 601 [ 602 "", 603 "", 604 "", 605 "\x00", 606 "\x00", 607 "\x00", 608 "\x00", 609 "\x00\xff", 610 "\x00\xff", 611 "\x00\xff", 612 "\x00\xff", 613 "\x00\xff\u0100", 614 "\x00\xff\u0100", 615 "\x00\xff\u0100", 616 "\x00\xff\u0100", 617 "\x00\xff\u0100\uffff", 618 "\x00\xff\u0100\uffff", 619 "\x00\xff\u0100\uffff", 620 "\x00\xff\u0100\uffff", 621 "\x00\xff\u0100\uffff\U00010000", 622 ] 623 ) 624 625 def test_simple(self): 626 self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03") 627 628 def test_errors(self): 629 self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, 630 b"\xff", "strict", True) 631 632 def test_issue8941(self): 633 # Issue #8941: insufficient result allocation when decoding into 634 # surrogate pairs on UCS-2 builds. 
635 encoded = b'\x00\x01\x00\x00' * 1024 636 self.assertEqual('\U00010000' * 1024, 637 codecs.utf_32_be_decode(encoded)[0]) 638 639 640class UTF16Test(ReadTest, unittest.TestCase): 641 encoding = "utf-16" 642 if sys.byteorder == 'little': 643 ill_formed_sequence = b"\x80\xdc" 644 else: 645 ill_formed_sequence = b"\xdc\x80" 646 647 spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' 648 spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' 649 650 def test_only_one_bom(self): 651 _,_,reader,writer = codecs.lookup(self.encoding) 652 # encode some stream 653 s = io.BytesIO() 654 f = writer(s) 655 f.write("spam") 656 f.write("spam") 657 d = s.getvalue() 658 # check whether there is exactly one BOM in it 659 self.assertTrue(d == self.spamle or d == self.spambe) 660 # try to read it back 661 s = io.BytesIO(d) 662 f = reader(s) 663 self.assertEqual(f.read(), "spamspam") 664 665 def test_badbom(self): 666 s = io.BytesIO(b"\xff\xff") 667 f = codecs.getreader(self.encoding)(s) 668 self.assertRaises(UnicodeError, f.read) 669 670 s = io.BytesIO(b"\xff\xff\xff\xff") 671 f = codecs.getreader(self.encoding)(s) 672 self.assertRaises(UnicodeError, f.read) 673 674 def test_partial(self): 675 self.check_partial( 676 "\x00\xff\u0100\uffff\U00010000", 677 [ 678 "", # first byte of BOM read 679 "", # second byte of BOM read => byteorder known 680 "", 681 "\x00", 682 "\x00", 683 "\x00\xff", 684 "\x00\xff", 685 "\x00\xff\u0100", 686 "\x00\xff\u0100", 687 "\x00\xff\u0100\uffff", 688 "\x00\xff\u0100\uffff", 689 "\x00\xff\u0100\uffff", 690 "\x00\xff\u0100\uffff", 691 "\x00\xff\u0100\uffff\U00010000", 692 ] 693 ) 694 695 def test_handlers(self): 696 self.assertEqual(('\ufffd', 1), 697 codecs.utf_16_decode(b'\x01', 'replace', True)) 698 self.assertEqual(('', 1), 699 codecs.utf_16_decode(b'\x01', 'ignore', True)) 700 701 def test_errors(self): 702 self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, 703 b"\xff", "strict", True) 704 705 def test_decoder_state(self): 706 
self.check_state_handling_decode(self.encoding, 707 "spamspam", self.spamle) 708 self.check_state_handling_decode(self.encoding, 709 "spamspam", self.spambe) 710 711 def test_bug691291(self): 712 # If encoding is not None, then 713 # files are always opened in binary mode, even if no binary mode was 714 # specified. This means that no automatic conversion of '\n' is done 715 # on reading and writing. 716 s1 = 'Hello\r\nworld\r\n' 717 718 s = s1.encode(self.encoding) 719 self.addCleanup(os_helper.unlink, os_helper.TESTFN) 720 with open(os_helper.TESTFN, 'wb') as fp: 721 fp.write(s) 722 with codecs.open(os_helper.TESTFN, 'r', 723 encoding=self.encoding) as reader: 724 self.assertEqual(reader.read(), s1) 725 726 def test_invalid_modes(self): 727 for mode in ('U', 'rU', 'r+U'): 728 with self.assertRaises(ValueError) as cm: 729 codecs.open(os_helper.TESTFN, mode, encoding=self.encoding) 730 self.assertIn('invalid mode', str(cm.exception)) 731 732 for mode in ('rt', 'wt', 'at', 'r+t'): 733 with self.assertRaises(ValueError) as cm: 734 codecs.open(os_helper.TESTFN, mode, encoding=self.encoding) 735 self.assertIn("can't have text and binary mode at once", 736 str(cm.exception)) 737 738 739class UTF16LETest(ReadTest, unittest.TestCase): 740 encoding = "utf-16-le" 741 ill_formed_sequence = b"\x80\xdc" 742 743 def test_partial(self): 744 self.check_partial( 745 "\x00\xff\u0100\uffff\U00010000", 746 [ 747 "", 748 "\x00", 749 "\x00", 750 "\x00\xff", 751 "\x00\xff", 752 "\x00\xff\u0100", 753 "\x00\xff\u0100", 754 "\x00\xff\u0100\uffff", 755 "\x00\xff\u0100\uffff", 756 "\x00\xff\u0100\uffff", 757 "\x00\xff\u0100\uffff", 758 "\x00\xff\u0100\uffff\U00010000", 759 ] 760 ) 761 762 def test_errors(self): 763 tests = [ 764 (b'\xff', '\ufffd'), 765 (b'A\x00Z', 'A\ufffd'), 766 (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'), 767 (b'\x00\xd8', '\ufffd'), 768 (b'\x00\xd8A', '\ufffd'), 769 (b'\x00\xd8A\x00', '\ufffdA'), 770 (b'\x00\xdcA\x00', '\ufffdA'), 771 ] 772 for raw, expected in tests: 773 
self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, 774 raw, 'strict', True) 775 self.assertEqual(raw.decode('utf-16le', 'replace'), expected) 776 777 def test_nonbmp(self): 778 self.assertEqual("\U00010203".encode(self.encoding), 779 b'\x00\xd8\x03\xde') 780 self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding), 781 "\U00010203") 782 783class UTF16BETest(ReadTest, unittest.TestCase): 784 encoding = "utf-16-be" 785 ill_formed_sequence = b"\xdc\x80" 786 787 def test_partial(self): 788 self.check_partial( 789 "\x00\xff\u0100\uffff\U00010000", 790 [ 791 "", 792 "\x00", 793 "\x00", 794 "\x00\xff", 795 "\x00\xff", 796 "\x00\xff\u0100", 797 "\x00\xff\u0100", 798 "\x00\xff\u0100\uffff", 799 "\x00\xff\u0100\uffff", 800 "\x00\xff\u0100\uffff", 801 "\x00\xff\u0100\uffff", 802 "\x00\xff\u0100\uffff\U00010000", 803 ] 804 ) 805 806 def test_errors(self): 807 tests = [ 808 (b'\xff', '\ufffd'), 809 (b'\x00A\xff', 'A\ufffd'), 810 (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'), 811 (b'\xd8\x00', '\ufffd'), 812 (b'\xd8\x00\xdc', '\ufffd'), 813 (b'\xd8\x00\x00A', '\ufffdA'), 814 (b'\xdc\x00\x00A', '\ufffdA'), 815 ] 816 for raw, expected in tests: 817 self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, 818 raw, 'strict', True) 819 self.assertEqual(raw.decode('utf-16be', 'replace'), expected) 820 821 def test_nonbmp(self): 822 self.assertEqual("\U00010203".encode(self.encoding), 823 b'\xd8\x00\xde\x03') 824 self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding), 825 "\U00010203") 826 827class UTF8Test(ReadTest, unittest.TestCase): 828 encoding = "utf-8" 829 ill_formed_sequence = b"\xed\xb2\x80" 830 ill_formed_sequence_replace = "\ufffd" * 3 831 BOM = b'' 832 833 def test_partial(self): 834 self.check_partial( 835 "\x00\xff\u07ff\u0800\uffff\U00010000", 836 [ 837 "\x00", 838 "\x00", 839 "\x00\xff", 840 "\x00\xff", 841 "\x00\xff\u07ff", 842 "\x00\xff\u07ff", 843 "\x00\xff\u07ff", 844 "\x00\xff\u07ff\u0800", 845 "\x00\xff\u07ff\u0800", 846 "\x00\xff\u07ff\u0800", 
847 "\x00\xff\u07ff\u0800\uffff", 848 "\x00\xff\u07ff\u0800\uffff", 849 "\x00\xff\u07ff\u0800\uffff", 850 "\x00\xff\u07ff\u0800\uffff", 851 "\x00\xff\u07ff\u0800\uffff\U00010000", 852 ] 853 ) 854 855 def test_decoder_state(self): 856 u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff" 857 self.check_state_handling_decode(self.encoding, 858 u, u.encode(self.encoding)) 859 860 def test_decode_error(self): 861 for data, error_handler, expected in ( 862 (b'[\x80\xff]', 'ignore', '[]'), 863 (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), 864 (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), 865 (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), 866 ): 867 with self.subTest(data=data, error_handler=error_handler, 868 expected=expected): 869 self.assertEqual(data.decode(self.encoding, error_handler), 870 expected) 871 872 def test_lone_surrogates(self): 873 super().test_lone_surrogates() 874 # not sure if this is making sense for 875 # UTF-16 and UTF-32 876 self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"), 877 self.BOM + b'[\x80]') 878 879 with self.assertRaises(UnicodeEncodeError) as cm: 880 "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape") 881 exc = cm.exception 882 self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF') 883 884 def test_surrogatepass_handler(self): 885 self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"), 886 self.BOM + b"abc\xed\xa0\x80def") 887 self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"), 888 self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80") 889 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"), 890 self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]') 891 892 self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"), 893 "abc\ud800def") 894 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"), 895 "\U00010fff\uD800") 896 897 self.assertTrue(codecs.lookup_error("surrogatepass")) 898 with 
self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")

    def test_incremental_errors(self):
        # Test that the incremental decoder can fail with final=False.
        # See issue #24214
        # Start with single bytes that can never begin/continue a valid
        # UTF-8 sequence, then add every truncated-prefix + invalid-continuation
        # combination.
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        for prefix in (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                       b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                       b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF'):
            for suffix in b'\x7F', b'\xC0':
                cases.append(prefix + suffix)
        # Overlong encodings, surrogate ranges, and out-of-range prefixes.
        cases.extend((b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                      b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90'))

        for data in cases:
            with self.subTest(data=data):
                dec = codecs.getincrementaldecoder(self.encoding)()
                # Must fail immediately, even though final defaults to False.
                self.assertRaises(UnicodeDecodeError, dec.decode, data)


class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-7 codec (RFC 2152)."""
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # + is escaped as "+-"
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters must round-trip through a base64 run.
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # Expected decoder output after each additional input byte; repeated
        # entries mean the decoder is still buffering a base64 run.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # (malformed input, expected output with errors='replace') pairs;
        # each must also raise under errors='strict'.
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # Non-BMP characters are encoded as UTF-16 surrogate pairs inside
        # the base64 run; decoding accepts the run with or without the
        # terminating '-'.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # Lone surrogates survive decoding when the base64 run is otherwise
        # well formed; truncated/invalid runs become U+FFFD under 'replace'.
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)


class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() helper."""

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode() on buffer-like objects."""

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b",
b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for utf-8-sig: UTF-8 with a leading BOM that is skipped once."""
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # Expected decoder output after each additional input byte.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the BOM prefix.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # Stream reading with a BOM present, for many read chunk sizes.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Same as test_stream_bom, but with no BOM in the input.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)


class EscapeDecodeTest(unittest.TestCase):
    """Tests for the internal codecs.escape_decode() helper."""

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Any byte other than backslash passes through unchanged.
        decode = codecs.escape_decode
        for b in range(256):
            b = bytes([b])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", b"[\\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        # Octal escapes consume at most three digits.
        check(br"[\7]", b"[\x07]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
        # Unrecognized escapes are kept verbatim but warn.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, b"\\" + b)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + b.upper(), b"\\" + b.upper())
        with self.assertWarns(DeprecationWarning):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", b"\\\xfa")
        # Octal values above 0o377 wrap modulo 256 and warn.
        for i in range(0o400, 0o1000):
            with self.assertWarns(DeprecationWarning):
                check(rb'\%o' % i, bytes([i & 0o377]))

    def test_errors(self):
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))


# From RFC 3492
# (unicode string, expected punycode bytes) pairs, sample strings A-S.
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check: every test case must be a (unicode, bytes) pair.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))


class PunycodeTest(unittest.TestCase):
    """Tests for the punycode codec against the RFC 3492 sample strings."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decoding must also work when the input is a str-derived bytes.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))

    def test_decode_invalid(self):
        testcases = [
            (b"xn--w&", "strict", UnicodeError()),
            (b"xn--w&", "ignore", "xn-"),
        ]
        for puny, errors, expected in testcases:
            with self.subTest(puny=puny, errors=errors):
                if isinstance(expected, Exception):
                    self.assertRaises(UnicodeError, puny.decode, "punycode", errors)
                else:
                    self.assertEqual(puny.decode("punycode", errors), expected)


# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# (UTF-8 input bytes, expected UTF-8 output bytes) pairs; expected None means
# the input contains prohibited characters, (None, None) marks a skipped case.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    """Run the libidn nameprep test vectors through encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # NOTE(review): support.TestFailed is a legacy helper;
                    # the re-raise preserves the 1-based vector number.
                    raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))


class IDNACodecTest(unittest.TestCase):
    """Tests for the idna codec (RFC 3490 domain name encoding)."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_builtin_decode_length_limit(self):
        # Overlong labels must be rejected rather than processed.
        with self.assertRaisesRegex(UnicodeError, "too long"):
            (b"xn--016c"+b"a"*1100).decode("idna")
        with self.assertRaisesRegex(UnicodeError, "too long"):
            (b"xn--016c"+b"a"*70).decode("idna")

    def test_stream(self):
        r =
codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feeding one byte at a time must still produce whole labels.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is only emitted once its trailing dot has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # A label is only emitted once its trailing dot has been seen.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                b"python.org".decode, "idna", errors)


class CodecsModuleTest(unittest.TestCase):
    """Tests for the public API of the codecs module itself."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
                         '\xe4\xf6\xfc')
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_unregister(self):
        # A registered search function is consulted for unknown names;
        # after unregistering it must no longer be called.
        name = "nonexistent_codec_name"
        search_function = mock.Mock()
        codecs.register(search_function)
        self.assertRaises(TypeError, codecs.lookup, name)
        search_function.assert_called_with(name)
        search_function.reset_mock()

        codecs.unregister(search_function)
        self.assertRaises(LookupError, codecs.lookup, name)
        search_function.assert_not_called()

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        # codecs.__all__ must match this list exactly, and every listed
        # name must actually exist on the module.
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(api, codecs.__all__)
        for api in codecs.__all__:
            getattr(codecs, api)

    def test_open(self):
        self.addCleanup(os_helper.unlink, os_helper.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode), \
                    codecs.open(os_helper.TESTFN, mode, 'ascii') as file:
                self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            self.assertRaises(UnicodeError,
                codecs.encode, 'abc', 'undefined', errors)
            self.assertRaises(UnicodeError,
                codecs.decode, b'abc', 'undefined', errors)

    def test_file_closes_if_lookup_error_raised(self):
        # The underlying file must not leak when the encoding lookup fails.
        mock_open = mock.mock_open()
        with mock.patch('builtins.open', mock_open) as file:
            with self.assertRaises(LookupError):
                codecs.open(os_helper.TESTFN, 'wt', 'invalid-encoding')

            file().close.assert_called()

class StreamReaderTest(unittest.TestCase):
    """Tests for codecs.StreamReader behavior on a UTF-8 byte stream."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])


class EncodedFileTest(unittest.TestCase):
    """Tests for codecs.EncodedFile transcoding wrappers."""

    def test_basic(self):
        # Reading transcodes utf-8 file data to utf-16-le.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing transcodes utf-8 input to latin-1 file data.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')

# Every text encoding shipped with Python that the generic round-trip
# tests below should exercise.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# Windows-only codecs are appended when available.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = [
    "punycode",
]


class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic round-trip and state-handling checks over every encoding."""

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            # Skip the mbcs alias on Windows
            if name != "mbcs":
                self.assertEqual(encoding.replace("_", "-"),
                                 name.replace("_", "-"))

            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @support.cpython_only
    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
    def test_basics_capi(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))


class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode with string decoding maps."""

    def test_decode_with_string_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Bytes outside the map, or mapped to U+FFFE, fail under 'strict'.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab"),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab\ufffe"),
            ("ab\\x02", 3)
        )

        self.assertEqual(
    def test_decode_with_int2str_map(self):
        """charmap_decode with a dict mapping byte values to str.

        Covers multi-character and astral replacement strings, the empty
        string, missing/None/U+FFFE entries under strict/replace/
        backslashreplace/ignore, and out-of-range code point rejection.
        """
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # A map entry may expand to more than one character.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # An empty replacement string silently drops the byte.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: '\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b'}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\\x02", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

        # Out-of-range integer values in the map are a TypeError, not a
        # decode error.
        self.assertRaisesRegex(TypeError,
            "character mapping must be in range\\(0x110000\\)",
            codecs.charmap_decode,
            b"\x00\x01\x02", "strict", {0: "A", 1: 'Bb', 2: -2}
        )

        self.assertRaisesRegex(TypeError,
            "character mapping must be in range\\(0x110000\\)",
            codecs.charmap_decode,
            b"\x00\x01\x02", "strict", {0: "A", 1: 'Bb', 2: 999999999}
        )
class WithStmtTest(unittest.TestCase):
    """Codec stream wrappers must work as context managers."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as transcoder:
            # UTF-8 bytes in, latin-1 bytes out.
            self.assertEqual(transcoder.read(), b"\xfc")
        # Leaving the with-block closes the underlying stream as well.
        self.assertTrue(raw.closed)

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        codec_info = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, codec_info.streamreader, codec_info.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
class UnicodeEscapeTest(ReadTest, unittest.TestCase):
    """Tests for the ``unicode-escape`` codec."""
    encoding = "unicode-escape"

    # Shadow the inherited ReadTest lone-surrogate test with None so it
    # is not collected for this codec.
    test_lone_surrogates = None

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        """Printable ASCII (except backslash) encodes to itself."""
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        """Any byte other than backslash decodes to the same code point."""
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        """Non-printable and non-ASCII characters get escape sequences."""
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        """All recognized escape sequences, plus the deprecated invalid ones."""
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Unrecognized escapes are passed through, but warn.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")
        # Octal escapes above \377 also warn.
        for i in range(0o400, 0o1000):
            with self.assertWarns(DeprecationWarning):
                check(rb'\%o' % i, chr(i))

    def test_decode_errors(self):
        """Truncated \\x, \\u, \\U sequences and out-of-range code points."""
        decode = codecs.unicode_escape_decode
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))

    def test_partial(self):
        # One expected result per byte of the encoded input; runs of equal
        # entries correspond to multi-byte escape sequences still pending.
        self.check_partial(
            "\x00\t\n\r\\\xff\uffff\U00010000",
            [
                '',
                '',
                '',
                '\x00',
                '\x00',
                '\x00\t',
                '\x00\t',
                '\x00\t\n',
                '\x00\t\n',
                '\x00\t\n\r',
                '\x00\t\n\r',
                '\x00\t\n\r\\',
                '\x00\t\n\r\\',
                '\x00\t\n\r\\',
                '\x00\t\n\r\\',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff\U00010000',
            ]
        )
    def test_raw_encode(self):
        """Every Latin-1 character encodes to its own byte."""
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        """Every byte decodes to the same code point (backslash included)."""
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        """Only \\u and \\U sequences are produced; other escapes pass through."""
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        """Only \\u and \\U sequences are interpreted on decode."""
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        """Truncated \\u and \\U sequences and out-of-range code points."""
        decode = codecs.raw_unicode_escape_decode
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
class EscapeEncodeTest(unittest.TestCase):
    """Tests for the bytes-to-bytes codecs.escape_encode() helper."""

    def test_escape_encode(self):
        cases = (
            (b'', (b'', 0)),
            (b'foobar', (b'foobar', 6)),
            (b'spam\0eggs', (b'spam\\x00eggs', 9)),
            (b'a\'b', (b"a\\'b", 3)),
            (b'b\\c', (b'b\\\\c', 3)),
            (b'c\nd', (b'c\\nd', 3)),
            (b'd\re', (b'd\\re', 3)),
            (b'f\x7fg', (b'f\\x7fg', 3)),
        )
        for raw, expected in cases:
            with self.subTest(data=raw):
                self.assertEqual(codecs.escape_encode(raw), expected)
        # Only bytes input is accepted: str and bytearray are rejected.
        for bad_input in ('spam', bytearray(b'spam')):
            self.assertRaises(TypeError, codecs.escape_encode, bad_input)
class BomTest(unittest.TestCase):
    """BOM handling of seekable codecs.open() streams for UTF-16/32."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(os_helper.unlink, os_helper.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(os_helper.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(os_helper.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(os_helper.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(os_helper.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(os_helper.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)


# Binary-to-binary transform codecs exercised by TransformCodecTest.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# zlib and bz2 support is optional; register their transform codecs only
# when the modules are importable.
try:
    import zlib
except ImportError:
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
    def test_text_to_binary_denylists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                # The error should come from the lookup, not be chained.
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_denylists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_denylists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_denylists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)
tab\teol \n", "quopri-codec") 2854 self.assertEqual(encoded, b"space=20tab=09eol=20\n") 2855 # But should still support unescaped tabs and spaces 2856 unescaped = b"space tab eol\n" 2857 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped) 2858 2859 def test_uu_invalid(self): 2860 # Missing "begin" line 2861 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec") 2862 2863 2864# The codec system tries to wrap exceptions in order to ensure the error 2865# mentions the operation being performed and the codec involved. We 2866# currently *only* want this to happen for relatively stateless 2867# exceptions, where the only significant information they contain is their 2868# type and a single str argument. 2869 2870# Use a local codec registry to avoid appearing to leak objects when 2871# registering multiple search functions 2872_TEST_CODECS = {} 2873 2874def _get_test_codec(codec_name): 2875 return _TEST_CODECS.get(codec_name) 2876 2877 2878class ExceptionChainingTest(unittest.TestCase): 2879 2880 def setUp(self): 2881 self.codec_name = 'exception_chaining_test' 2882 codecs.register(_get_test_codec) 2883 self.addCleanup(codecs.unregister, _get_test_codec) 2884 2885 # We store the object to raise on the instance because of a bad 2886 # interaction between the codec caching (which means we can't 2887 # recreate the codec entry) and regrtest refleak hunting (which 2888 # runs the same test instance multiple times). 
This means we 2889 # need to ensure the codecs call back in to the instance to find 2890 # out which exception to raise rather than binding them in a 2891 # closure to an object that may change on the next run 2892 self.obj_to_raise = RuntimeError 2893 2894 def tearDown(self): 2895 _TEST_CODECS.pop(self.codec_name, None) 2896 # Issue #22166: Also pop from caches to avoid appearance of ref leaks 2897 encodings._cache.pop(self.codec_name, None) 2898 2899 def set_codec(self, encode, decode): 2900 codec_info = codecs.CodecInfo(encode, decode, 2901 name=self.codec_name) 2902 _TEST_CODECS[self.codec_name] = codec_info 2903 2904 @contextlib.contextmanager 2905 def assertWrapped(self, operation, exc_type, msg): 2906 full_msg = r"{} with {!r} codec failed \({}: {}\)".format( 2907 operation, self.codec_name, exc_type.__name__, msg) 2908 with self.assertRaisesRegex(exc_type, full_msg) as caught: 2909 yield caught 2910 self.assertIsInstance(caught.exception.__cause__, exc_type) 2911 self.assertIsNotNone(caught.exception.__cause__.__traceback__) 2912 2913 def raise_obj(self, *args, **kwds): 2914 # Helper to dynamically change the object raised by a test codec 2915 raise self.obj_to_raise 2916 2917 def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError): 2918 self.obj_to_raise = obj_to_raise 2919 self.set_codec(self.raise_obj, self.raise_obj) 2920 with self.assertWrapped("encoding", exc_type, msg): 2921 "str_input".encode(self.codec_name) 2922 with self.assertWrapped("encoding", exc_type, msg): 2923 codecs.encode("str_input", self.codec_name) 2924 with self.assertWrapped("decoding", exc_type, msg): 2925 b"bytes input".decode(self.codec_name) 2926 with self.assertWrapped("decoding", exc_type, msg): 2927 codecs.decode(b"bytes input", self.codec_name) 2928 2929 def test_raise_by_type(self): 2930 self.check_wrapped(RuntimeError, "") 2931 2932 def test_raise_by_value(self): 2933 msg = "This should be wrapped" 2934 self.check_wrapped(RuntimeError(msg), msg) 2935 2936 def 
test_raise_grandchild_subclass_exact_size(self): 2937 msg = "This should be wrapped" 2938 class MyRuntimeError(RuntimeError): 2939 __slots__ = () 2940 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError) 2941 2942 def test_raise_subclass_with_weakref_support(self): 2943 msg = "This should be wrapped" 2944 class MyRuntimeError(RuntimeError): 2945 pass 2946 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError) 2947 2948 def check_not_wrapped(self, obj_to_raise, msg): 2949 def raise_obj(*args, **kwds): 2950 raise obj_to_raise 2951 self.set_codec(raise_obj, raise_obj) 2952 with self.assertRaisesRegex(RuntimeError, msg): 2953 "str input".encode(self.codec_name) 2954 with self.assertRaisesRegex(RuntimeError, msg): 2955 codecs.encode("str input", self.codec_name) 2956 with self.assertRaisesRegex(RuntimeError, msg): 2957 b"bytes input".decode(self.codec_name) 2958 with self.assertRaisesRegex(RuntimeError, msg): 2959 codecs.decode(b"bytes input", self.codec_name) 2960 2961 def test_init_override_is_not_wrapped(self): 2962 class CustomInit(RuntimeError): 2963 def __init__(self): 2964 pass 2965 self.check_not_wrapped(CustomInit, "") 2966 2967 def test_new_override_is_not_wrapped(self): 2968 class CustomNew(RuntimeError): 2969 def __new__(cls): 2970 return super().__new__(cls) 2971 self.check_not_wrapped(CustomNew, "") 2972 2973 def test_instance_attribute_is_not_wrapped(self): 2974 msg = "This should NOT be wrapped" 2975 exc = RuntimeError(msg) 2976 exc.attr = 1 2977 self.check_not_wrapped(exc, "^{}$".format(msg)) 2978 2979 def test_non_str_arg_is_not_wrapped(self): 2980 self.check_not_wrapped(RuntimeError(1), "1") 2981 2982 def test_multiple_args_is_not_wrapped(self): 2983 msg_re = r"^\('a', 'b', 'c'\)$" 2984 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re) 2985 2986 # http://bugs.python.org/issue19609 2987 def test_codec_lookup_failure_not_wrapped(self): 2988 msg = "^unknown encoding: {}$".format(self.codec_name) 2989 # The initial codec lookup 
should not be wrapped 2990 with self.assertRaisesRegex(LookupError, msg): 2991 "str input".encode(self.codec_name) 2992 with self.assertRaisesRegex(LookupError, msg): 2993 codecs.encode("str input", self.codec_name) 2994 with self.assertRaisesRegex(LookupError, msg): 2995 b"bytes input".decode(self.codec_name) 2996 with self.assertRaisesRegex(LookupError, msg): 2997 codecs.decode(b"bytes input", self.codec_name) 2998 2999 def test_unflagged_non_text_codec_handling(self): 3000 # The stdlib non-text codecs are now marked so they're 3001 # pre-emptively skipped by the text model related methods 3002 # However, third party codecs won't be flagged, so we still make 3003 # sure the case where an inappropriate output type is produced is 3004 # handled appropriately 3005 def encode_to_str(*args, **kwds): 3006 return "not bytes!", 0 3007 def decode_to_bytes(*args, **kwds): 3008 return b"not str!", 0 3009 self.set_codec(encode_to_str, decode_to_bytes) 3010 # No input or output type checks on the codecs module functions 3011 encoded = codecs.encode(None, self.codec_name) 3012 self.assertEqual(encoded, "not bytes!") 3013 decoded = codecs.decode(None, self.codec_name) 3014 self.assertEqual(decoded, b"not str!") 3015 # Text model methods should complain 3016 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; " 3017 r"use codecs.encode\(\) to encode to arbitrary types$") 3018 msg = fmt.format(self.codec_name) 3019 with self.assertRaisesRegex(TypeError, msg): 3020 "str_input".encode(self.codec_name) 3021 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; " 3022 r"use codecs.decode\(\) to decode to arbitrary types$") 3023 msg = fmt.format(self.codec_name) 3024 with self.assertRaisesRegex(TypeError, msg): 3025 b"bytes input".decode(self.codec_name) 3026 3027 3028 3029@unittest.skipUnless(sys.platform == 'win32', 3030 'code pages are specific to Windows') 3031class CodePageTest(unittest.TestCase): 3032 CP_UTF8 = 65001 3033 3034 def test_invalid_code_page(self): 3035 
self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a') 3036 self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a') 3037 self.assertRaises(OSError, codecs.code_page_encode, 123, 'a') 3038 self.assertRaises(OSError, codecs.code_page_decode, 123, b'a') 3039 3040 def test_code_page_name(self): 3041 self.assertRaisesRegex(UnicodeEncodeError, 'cp932', 3042 codecs.code_page_encode, 932, '\xff') 3043 self.assertRaisesRegex(UnicodeDecodeError, 'cp932', 3044 codecs.code_page_decode, 932, b'\x81\x00', 'strict', True) 3045 self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8', 3046 codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True) 3047 3048 def check_decode(self, cp, tests): 3049 for raw, errors, expected in tests: 3050 if expected is not None: 3051 try: 3052 decoded = codecs.code_page_decode(cp, raw, errors, True) 3053 except UnicodeDecodeError as err: 3054 self.fail('Unable to decode %a from "cp%s" with ' 3055 'errors=%r: %s' % (raw, cp, errors, err)) 3056 self.assertEqual(decoded[0], expected, 3057 '%a.decode("cp%s", %r)=%a != %a' 3058 % (raw, cp, errors, decoded[0], expected)) 3059 # assert 0 <= decoded[1] <= len(raw) 3060 self.assertGreaterEqual(decoded[1], 0) 3061 self.assertLessEqual(decoded[1], len(raw)) 3062 else: 3063 self.assertRaises(UnicodeDecodeError, 3064 codecs.code_page_decode, cp, raw, errors, True) 3065 3066 def check_encode(self, cp, tests): 3067 for text, errors, expected in tests: 3068 if expected is not None: 3069 try: 3070 encoded = codecs.code_page_encode(cp, text, errors) 3071 except UnicodeEncodeError as err: 3072 self.fail('Unable to encode %a to "cp%s" with ' 3073 'errors=%r: %s' % (text, cp, errors, err)) 3074 self.assertEqual(encoded[0], expected, 3075 '%a.encode("cp%s", %r)=%a != %a' 3076 % (text, cp, errors, encoded[0], expected)) 3077 self.assertEqual(encoded[1], len(text)) 3078 else: 3079 self.assertRaises(UnicodeEncodeError, 3080 codecs.code_page_encode, cp, text, errors) 3081 3082 def test_cp932(self): 3083 
self.check_encode(932, ( 3084 ('abc', 'strict', b'abc'), 3085 ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'), 3086 # test error handlers 3087 ('\xff', 'strict', None), 3088 ('[\xff]', 'ignore', b'[]'), 3089 ('[\xff]', 'replace', b'[y]'), 3090 ('[\u20ac]', 'replace', b'[?]'), 3091 ('[\xff]', 'backslashreplace', b'[\\xff]'), 3092 ('[\xff]', 'namereplace', 3093 b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'), 3094 ('[\xff]', 'xmlcharrefreplace', b'[ÿ]'), 3095 ('\udcff', 'strict', None), 3096 ('[\udcff]', 'surrogateescape', b'[\xff]'), 3097 ('[\udcff]', 'surrogatepass', None), 3098 )) 3099 self.check_decode(932, ( 3100 (b'abc', 'strict', 'abc'), 3101 (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'), 3102 # invalid bytes 3103 (b'[\xff]', 'strict', None), 3104 (b'[\xff]', 'ignore', '[]'), 3105 (b'[\xff]', 'replace', '[\ufffd]'), 3106 (b'[\xff]', 'backslashreplace', '[\\xff]'), 3107 (b'[\xff]', 'surrogateescape', '[\udcff]'), 3108 (b'[\xff]', 'surrogatepass', None), 3109 (b'\x81\x00abc', 'strict', None), 3110 (b'\x81\x00abc', 'ignore', '\x00abc'), 3111 (b'\x81\x00abc', 'replace', '\ufffd\x00abc'), 3112 (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'), 3113 )) 3114 3115 def test_cp1252(self): 3116 self.check_encode(1252, ( 3117 ('abc', 'strict', b'abc'), 3118 ('\xe9\u20ac', 'strict', b'\xe9\x80'), 3119 ('\xff', 'strict', b'\xff'), 3120 # test error handlers 3121 ('\u0141', 'strict', None), 3122 ('\u0141', 'ignore', b''), 3123 ('\u0141', 'replace', b'L'), 3124 ('\udc98', 'surrogateescape', b'\x98'), 3125 ('\udc98', 'surrogatepass', None), 3126 )) 3127 self.check_decode(1252, ( 3128 (b'abc', 'strict', 'abc'), 3129 (b'\xe9\x80', 'strict', '\xe9\u20ac'), 3130 (b'\xff', 'strict', '\xff'), 3131 )) 3132 3133 def test_cp_utf7(self): 3134 cp = 65000 3135 self.check_encode(cp, ( 3136 ('abc', 'strict', b'abc'), 3137 ('\xe9\u20ac', 'strict', b'+AOkgrA-'), 3138 ('\U0010ffff', 'strict', b'+2//f/w-'), 3139 ('\udc80', 'strict', b'+3IA-'), 3140 ('\ufffd', 'strict', b'+//0-'), 3141 )) 
3142 self.check_decode(cp, ( 3143 (b'abc', 'strict', 'abc'), 3144 (b'+AOkgrA-', 'strict', '\xe9\u20ac'), 3145 (b'+2//f/w-', 'strict', '\U0010ffff'), 3146 (b'+3IA-', 'strict', '\udc80'), 3147 (b'+//0-', 'strict', '\ufffd'), 3148 # invalid bytes 3149 (b'[+/]', 'strict', '[]'), 3150 (b'[\xff]', 'strict', '[\xff]'), 3151 )) 3152 3153 def test_multibyte_encoding(self): 3154 self.check_decode(932, ( 3155 (b'\x84\xe9\x80', 'ignore', '\u9a3e'), 3156 (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'), 3157 )) 3158 self.check_decode(self.CP_UTF8, ( 3159 (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), 3160 (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), 3161 )) 3162 self.check_encode(self.CP_UTF8, ( 3163 ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), 3164 ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), 3165 )) 3166 3167 def test_code_page_decode_flags(self): 3168 # Issue #36312: For some code pages (e.g. UTF-7) flags for 3169 # MultiByteToWideChar() must be set to 0. 3170 if support.verbose: 3171 sys.stdout.write('\n') 3172 for cp in (50220, 50221, 50222, 50225, 50227, 50229, 3173 *range(57002, 57011+1), 65000): 3174 # On small versions of Windows like Windows IoT 3175 # not all codepages are present. 
3176 # A missing codepage causes an OSError exception 3177 # so check for the codepage before decoding 3178 if is_code_page_present(cp): 3179 self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3), f'cp{cp}') 3180 else: 3181 if support.verbose: 3182 print(f" skipping cp={cp}") 3183 self.assertEqual(codecs.code_page_decode(42, b'abc'), 3184 ('\uf061\uf062\uf063', 3)) 3185 3186 def test_incremental(self): 3187 decoded = codecs.code_page_decode(932, b'\x82', 'strict', False) 3188 self.assertEqual(decoded, ('', 0)) 3189 3190 decoded = codecs.code_page_decode(932, 3191 b'\xe9\x80\xe9', 'strict', 3192 False) 3193 self.assertEqual(decoded, ('\u9a3e', 2)) 3194 3195 decoded = codecs.code_page_decode(932, 3196 b'\xe9\x80\xe9\x80', 'strict', 3197 False) 3198 self.assertEqual(decoded, ('\u9a3e\u9a3e', 4)) 3199 3200 decoded = codecs.code_page_decode(932, 3201 b'abc', 'strict', 3202 False) 3203 self.assertEqual(decoded, ('abc', 3)) 3204 3205 def test_mbcs_alias(self): 3206 # Check that looking up our 'default' codepage will return 3207 # mbcs when we don't have a more specific one available 3208 code_page = 99_999 3209 name = f'cp{code_page}' 3210 with mock.patch('_winapi.GetACP', return_value=code_page): 3211 try: 3212 codec = codecs.lookup(name) 3213 self.assertEqual(codec.name, 'mbcs') 3214 finally: 3215 codecs.unregister(name) 3216 3217 @support.bigmemtest(size=2**31, memuse=7, dry_run=False) 3218 def test_large_input(self, size): 3219 # Test input longer than INT_MAX. 3220 # Input should contain undecodable bytes before and after 3221 # the INT_MAX limit. 
class ASCIITest(unittest.TestCase):
    """Tests for the built-in 'ascii' codec, including every error handler."""
    def test_encode(self):
        self.assertEqual('abc123'.encode('ascii'), b'abc123')

    def test_encode_error(self):
        for data, error_handler, expected in (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            # BUGFIX: the expected bytes had been HTML-entity-decoded to
            # the literal characters; xmlcharrefreplace emits numeric
            # character references ("&#128;" etc.), not the characters.
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('ascii', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be decoded, but not the second
            '\udc80\xff'.encode('ascii', 'surrogateescape')

    def test_decode(self):
        self.assertEqual(b'abc'.decode('ascii'), 'abc')

    def test_decode_error(self):
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode('ascii', error_handler),
                                 expected)


class Latin1Test(unittest.TestCase):
    """Tests for the built-in 'latin1' codec."""
    def test_encode(self):
        for data, expected in (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.encode('latin1'), expected)

    def test_encode_errors(self):
        for data, error_handler, expected in (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            # BUGFIX: restored the numeric character references that had
            # been HTML-entity-decoded away (&#8364; is U+20AC, &#56448;
            # is the lone surrogate U+DC80).
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('latin1', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be decoded, but not the second
            '\udc80\u20ac'.encode('latin1', 'surrogateescape')

    def test_decode(self):
        for data, expected in (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.decode('latin1'), expected)


class StreamRecoderTest(unittest.TestCase):
    """Tests for codecs.StreamRecoder / codecs.EncodedFile transcoding."""
    def test_writelines(self):
        bio = io.BytesIO()
        codec = codecs.lookup('ascii')
        sr = codecs.StreamRecoder(bio, codec.encode, codec.decode,
                                  encodings.ascii.StreamReader, encodings.ascii.StreamWriter)
        sr.writelines([b'a', b'b'])
        self.assertEqual(bio.getvalue(), b'ab')

    def test_write(self):
        bio = io.BytesIO()
        codec = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        sr = codecs.StreamRecoder(bio, codec.encode, codec.decode,
                                  encodings.utf_8.StreamReader, encodings.utf_8.StreamWriter)

        text = 'àñé'
        sr.write(text.encode('latin1'))
        self.assertEqual(bio.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        bio = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        sr = codecs.EncodedFile(bio, 'utf-8', 'utf-16-le')

        self.assertEqual(sr.readline(), b'line1\n')
        sr.seek(0)
        self.assertEqual(sr.readline(), b'line1\n')
        self.assertEqual(sr.readline(), b'line2\n')
        self.assertEqual(sr.readline(), b'line3\n')
        self.assertEqual(sr.readline(), b'')

    def test_seeking_write(self):
        bio = io.BytesIO('123456789\n'.encode('utf-16-le'))
        sr = codecs.EncodedFile(bio, 'utf-8', 'utf-16-le')

        # Test that seek() only resets its internal buffer when offset
        # and whence are zero.
        sr.seek(2)
        sr.write(b'\nabc\n')
        self.assertEqual(sr.readline(), b'789\n')
        sr.seek(0)
        self.assertEqual(sr.readline(), b'1\n')
        self.assertEqual(sr.readline(), b'abc\n')
        self.assertEqual(sr.readline(), b'789\n')
@unittest.skipIf(_testinternalcapi is None, 'need _testinternalcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
    """
    # Filesystem encoding of the host interpreter; the C helpers must
    # agree with str.encode()/bytes.decode() for this encoding.
    ENCODING = sys.getfilesystemencoding()
    # Sample texts covering ASCII, Latin-1, BMP + astral characters and
    # lone surrogates (the interesting cases for the locale codec).
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        # Thin wrapper over the C-level encoder under test.
        return _testinternalcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        # The C encoder must match str.encode() on success, and raise
        # RuntimeError with a position/reason message where it fails.
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.encode(text, errors)
                    errmsg = str(cm.exception)
                    self.assertRegex(errmsg, r"encode error: pos=[0-9]+, reason=")
                else:
                    encoded = self.encode(text, errors)
                    self.assertEqual(encoded, expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Not all locale encoders support surrogatepass; probe first and
        # skip rather than fail on such platforms.
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_encode_strings("surrogatepass")

    def test_encode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as cm:
            self.encode('', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')

    def decode(self, encoded, errors="strict"):
        # Thin wrapper over the C-level decoder under test.
        return _testinternalcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        # Build the byte-string corpus: the fixed BYTES_STRINGS plus each
        # STRINGS sample encoded with the filesystem encoding (and, for
        # UTF-8, the surrogatepass variant when it differs).
        is_utf8 = (self.ENCODING == "utf-8")
        if is_utf8:
            encode_errors = 'surrogateescape'
        else:
            encode_errors = 'strict'

        strings = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
                if encoded not in strings:
                    strings.append(encoded)
            except UnicodeEncodeError:
                encoded = None

            if is_utf8:
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded:
                    strings.append(encoded2)

        # The C decoder must match bytes.decode() on success, and raise
        # RuntimeError with a "decode error: " message where it fails.
        for encoded in strings:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.decode(encoded, errors)
                    errmsg = str(cm.exception)
                    self.assertTrue(errmsg.startswith("decode error: "), errmsg)
                else:
                    decoded = self.decode(encoded, errors)
                    self.assertEqual(decoded, expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Same probe-and-skip dance as test_encode_surrogatepass.
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_decode_strings("surrogatepass")

    def test_decode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as cm:
            self.decode(b'', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')


class Rot13Test(unittest.TestCase):
    """Test the educational ROT-13 codec."""
    def test_encode(self):
        ciphertext = codecs.encode("Caesar liked ciphers", 'rot-13')
        self.assertEqual(ciphertext, 'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        plaintext = codecs.decode('Rg gh, Oehgr?', 'rot-13')
        self.assertEqual(plaintext, 'Et tu, Brute?')

    def test_incremental_encode(self):
        encoder = codecs.getincrementalencoder('rot-13')()
        ciphertext = encoder.encode('ABBA nag Cheryl Baker')
        self.assertEqual(ciphertext, 'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        decoder = codecs.getincrementaldecoder('rot-13')()
        plaintext = decoder.decode('terra Ares envy tha')
        self.assertEqual(plaintext, 'green Nerf rail gun')
codecs.getincrementalencoder('rot-13')() 3498 ciphertext = encoder.encode('ABBA nag Cheryl Baker') 3499 self.assertEqual(ciphertext, 'NOON ant Purely Onxre') 3500 3501 def test_incremental_decode(self): 3502 decoder = codecs.getincrementaldecoder('rot-13')() 3503 plaintext = decoder.decode('terra Ares envy tha') 3504 self.assertEqual(plaintext, 'green Nerf rail gun') 3505 3506 3507class Rot13UtilTest(unittest.TestCase): 3508 """Test the ROT-13 codec via rot13 function, 3509 i.e. the user has done something like: 3510 $ echo "Hello World" | python -m encodings.rot_13 3511 """ 3512 def test_rot13_func(self): 3513 infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba') 3514 outfile = io.StringIO() 3515 encodings.rot_13.rot13(infile, outfile) 3516 outfile.seek(0) 3517 plain_text = outfile.read() 3518 self.assertEqual( 3519 plain_text, 3520 'To be, or not to be, that is the question') 3521 3522 3523class CodecNameNormalizationTest(unittest.TestCase): 3524 """Test codec name normalization""" 3525 def test_codecs_lookup(self): 3526 FOUND = (1, 2, 3, 4) 3527 NOT_FOUND = (None, None, None, None) 3528 def search_function(encoding): 3529 if encoding == "aaa_8": 3530 return FOUND 3531 else: 3532 return NOT_FOUND 3533 3534 codecs.register(search_function) 3535 self.addCleanup(codecs.unregister, search_function) 3536 self.assertEqual(FOUND, codecs.lookup('aaa_8')) 3537 self.assertEqual(FOUND, codecs.lookup('AAA-8')) 3538 self.assertEqual(FOUND, codecs.lookup('AAA---8')) 3539 self.assertEqual(FOUND, codecs.lookup('AAA 8')) 3540 self.assertEqual(FOUND, codecs.lookup('aaa\xe9\u20ac-8')) 3541 self.assertEqual(NOT_FOUND, codecs.lookup('AAA.8')) 3542 self.assertEqual(NOT_FOUND, codecs.lookup('AAA...8')) 3543 self.assertEqual(NOT_FOUND, codecs.lookup('BBB-8')) 3544 self.assertEqual(NOT_FOUND, codecs.lookup('BBB.8')) 3545 self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8')) 3546 3547 def test_encodings_normalize_encoding(self): 3548 # encodings.normalize_encoding() 
ignores non-ASCII characters. 3549 normalize = encodings.normalize_encoding 3550 self.assertEqual(normalize('utf_8'), 'utf_8') 3551 self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') 3552 self.assertEqual(normalize('utf 8'), 'utf_8') 3553 # encodings.normalize_encoding() doesn't convert 3554 # characters to lower case. 3555 self.assertEqual(normalize('UTF 8'), 'UTF_8') 3556 self.assertEqual(normalize('utf.8'), 'utf.8') 3557 self.assertEqual(normalize('utf...8'), 'utf...8') 3558 3559 3560if __name__ == "__main__": 3561 unittest.main() 3562