import codecs
import html.entities
import itertools
import sys
import unicodedata
import unittest


class PosReturn:
    """Error handler that returns a configurable restart position.

    ``handle`` always returns ``("<?>", pos)`` where ``pos`` is the value of
    ``self.pos`` at the time of the call.  Tests set ``pos`` before each
    encode/decode call.
    """

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        oldpos = self.pos
        realpos = oldpos
        if realpos < 0:
            # Negative positions are relative to the end of the input.
            realpos = len(exc.object) + realpos
        # if we don't advance this time, terminate on the next call
        # otherwise we'd get an endless loop
        if realpos <= exc.start:
            self.pos = len(exc.object)
        return ("<?>", oldpos)


class RepeatedPosReturn:
    """Error handler that restarts at ``self.pos`` for ``self.count`` calls.

    While ``count`` is positive each call decrements it and returns
    ``(repl, pos)``; afterwards it returns ``(repl, exc.end)``.
    """

    def __init__(self, repl="<?>"):
        self.repl = repl
        self.pos = 0
        self.count = 0

    def handle(self, exc):
        if self.count > 0:
            self.count -= 1
            return (self.repl, self.pos)
        return (self.repl, exc.end)


# A UnicodeEncodeError object with a bad start attribute
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
        UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
        self.start = []


# A UnicodeEncodeError object with a bad object attribute
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
        UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
        self.object = []


# A UnicodeDecodeError object without an end attribute
class NoEndUnicodeDecodeError(UnicodeDecodeError):
    def __init__(self):
        UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad")
        del self.end


# A UnicodeDecodeError object with a bad object attribute
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
    def __init__(self):
        UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad")
        self.object = []


# A UnicodeTranslateError object without a start attribute
class NoStartUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
        del self.start


# A UnicodeTranslateError object without an end attribute
class NoEndUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
        del self.end


# A UnicodeTranslateError object without an object attribute
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
        del self.object


class CodecCallbackTest(unittest.TestCase):

    def test_xmlcharrefreplace(self):
        # replace unencodable characters with numeric character entities.
        # For ascii, latin-1 and charmaps this is completely implemented
        # in C and should be reasonably fast.
        s = "\u30b9\u30d1\u30e2 \xe4nd eggs"
        self.assertEqual(
            s.encode("ascii", "xmlcharrefreplace"),
            b"&#12473;&#12497;&#12514; &#228;nd eggs"
        )
        self.assertEqual(
            s.encode("latin-1", "xmlcharrefreplace"),
            b"&#12473;&#12497;&#12514; \xe4nd eggs"
        )

    def test_xmlcharnamereplace(self):
        # This time use a named character entity for unencodable
        # characters, if one is available.

        def xmlcharnamereplace(exc):
            if not isinstance(exc, UnicodeEncodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = []
            for c in exc.object[exc.start:exc.end]:
                try:
                    l.append("&%s;" % html.entities.codepoint2name[ord(c)])
                except KeyError:
                    # No named entity: fall back to a numeric reference.
                    l.append("&#%d;" % ord(c))
            return ("".join(l), exc.end)

        codecs.register_error(
            "test.xmlcharnamereplace", xmlcharnamereplace)

        sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
        sout = b"&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
        self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
        sout = b"\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
        self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
        sout = b"\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
        self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)

    def test_uninamereplace(self):
        # We're using the names from the unicode database this time,
        # and we're doing "syntax highlighting" here, i.e. we include
        # the replaced text in ANSI escape sequences. For this it is
        # useful that the error handler is not called for every single
        # unencodable character, but for a complete sequence of
        # unencodable characters, otherwise we would output many
        # unnecessary escape sequences.

        def uninamereplace(exc):
            if not isinstance(exc, UnicodeEncodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = []
            for c in exc.object[exc.start:exc.end]:
                l.append(unicodedata.name(c, "0x%x" % ord(c)))
            return ("\033[1m%s\033[0m" % ", ".join(l), exc.end)

        codecs.register_error(
            "test.uninamereplace", uninamereplace)

        sin = "\xac\u1234\u20ac\u8000"
        sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)

        sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)

        sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
        self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)

    def test_backslashescape(self):
        # Does the same as the "unicode-escape" encoding, but with different
        # base encodings.
        sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
        sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
        self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)

        sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff"
        self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)

        sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
        self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)

    def test_nameescape(self):
        # Does the same as backslashescape, but prefers ``\N{...}`` escape
        # sequences.
        sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
        sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
        self.assertEqual(sin.encode("ascii", "namereplace"), sout)

        sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
        self.assertEqual(sin.encode("latin-1", "namereplace"), sout)

        sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
        self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)

    def test_decoding_callbacks(self):
        # This is a test for a decoding callback handler
        # that allows the decoding of the invalid sequence
        # "\xc0\x80" and returns "\x00" instead of raising an error.
        # All other illegal sequences will be handled strictly.
        def relaxedutf8(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            if exc.object[exc.start:exc.start+2] == b"\xc0\x80":
                return ("\x00", exc.start+2) # retry after two bytes
            else:
                raise exc

        codecs.register_error("test.relaxedutf8", relaxedutf8)

        # all the "\xc0\x80" will be decoded to "\x00"
        sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
        sout = "a\x00b\x00c\xfc\x00\x00"
        self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)

        # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
        sin = b"\xc0\x80\xc0\x81"
        self.assertRaises(UnicodeDecodeError, sin.decode,
                          "utf-8", "test.relaxedutf8")

    def test_charmapencode(self):
        # For charmap encodings the replacement string will be
        # mapped through the encoding again. This means, that
        # to be able to use e.g. the "replace" handler, the
        # charmap has to have a mapping for "?".
        charmap = {ord(c): bytes(2*c.upper(), 'ascii') for c in "abcdefgh"}
        sin = "abc"
        sout = b"AABBCC"
        self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)

        sin = "abcA"
        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)

        charmap[ord("?")] = b"XYZ"
        sin = "abcDEF"
        sout = b"AABBCCXYZXYZXYZ"
        self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)

        charmap[ord("?")] = "XYZ" # wrong type in mapping
        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)

    def test_callbacks(self):
        def handler1(exc):
            r = range(exc.start, exc.end)
            if isinstance(exc, UnicodeEncodeError):
                l = ["<%d>" % ord(exc.object[pos]) for pos in r]
            elif isinstance(exc, UnicodeDecodeError):
                l = ["<%d>" % exc.object[pos] for pos in r]
            else:
                raise TypeError("don't know how to handle %r" % exc)
            return ("[%s]" % "".join(l), exc.end)

        codecs.register_error("test.handler1", handler1)

        def handler2(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
            return ("[%s]" % "".join(l), exc.end+1) # skip one character

        codecs.register_error("test.handler2", handler2)

        s = b"\x00\x81\x7f\x80\xff"

        self.assertEqual(
            s.decode("ascii", "test.handler1"),
            "\x00[<129>]\x7f[<128>][<255>]"
        )
        self.assertEqual(
            s.decode("ascii", "test.handler2"),
            "\x00[<129>][<128>]"
        )

        self.assertEqual(
            b"\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"),
            "\u3042[<92><117><51>]xxx"
        )

        self.assertEqual(
            b"\\u3042\\u3xx".decode("unicode-escape", "test.handler1"),
            "\u3042[<92><117><51>]xx"
        )

        self.assertEqual(
            codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0],
            "z[<98>][<99>]"
        )

        self.assertEqual(
            "g\xfc\xdfrk".encode("ascii", "test.handler1"),
            b"g[<252><223>]rk"
        )

        self.assertEqual(
            "g\xfc\xdf".encode("ascii", "test.handler1"),
            b"g[<252><223>]"
        )

    def test_longstrings(self):
        # test long strings to check for memory overflow problems
        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
                   "backslashreplace", "namereplace"]
        # register the handlers under different names,
        # to prevent the codec from recognizing the name
        for err in errors:
            codecs.register_error("test." + err, codecs.lookup_error(err))
        l = 1000
        errors += [ "test." + err for err in errors ]
        for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
                        "utf-8", "utf-7", "utf-16", "utf-32"):
                for err in errors:
                    try:
                        uni.encode(enc, err)
                    except UnicodeError:
                        # Only crashes/overflows matter here; a regular
                        # encoding failure (e.g. "strict") is expected.
                        pass

    def check_exceptionobjectargs(self, exctype, args, msg):
        # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
        # check with one missing argument
        self.assertRaises(TypeError, exctype, *args[:-1])
        # check with one argument too much
        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
        # check with one argument of the wrong type
        wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ]
        for i in range(len(args)):
            for wrongarg in wrongargs:
                if type(wrongarg) is type(args[i]):
                    continue
                # build argument array
                # NOTE(review): the remaining slots are filled with args[i],
                # not args[j]; this looks unintended.  It is kept because
                # with args[j] a bytes "wrong" value in the bytearray slot
                # of UnicodeDecodeError is a valid call and raises nothing.
                callargs = []
                for j in range(len(args)):
                    if i==j:
                        callargs.append(wrongarg)
                    else:
                        callargs.append(args[i])
                self.assertRaises(TypeError, exctype, *callargs)

        # check with the correct number and type of arguments
        exc = exctype(*args)
        self.assertEqual(str(exc), msg)

    def test_unicodeencodeerror(self):
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", "g\xfcrk", 1, 2, "ouch"],
            "'ascii' codec can't encode character '\\xfc' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", "g\xfcrk", 1, 4, "ouch"],
            "'ascii' codec can't encode characters in position 1-3: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", "\xfcx", 0, 1, "ouch"],
            "'ascii' codec can't encode character '\\xfc' in position 0: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", "\u0100x", 0, 1, "ouch"],
            "'ascii' codec can't encode character '\\u0100' in position 0: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", "\uffffx", 0, 1, "ouch"],
            "'ascii' codec can't encode character '\\uffff' in position 0: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeEncodeError,
            ["ascii", "\U00010000x", 0, 1, "ouch"],
            "'ascii' codec can't encode character '\\U00010000' in position 0: ouch"
        )

    def test_unicodedecodeerror(self):
        self.check_exceptionobjectargs(
            UnicodeDecodeError,
            ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"],
            "'ascii' codec can't decode byte 0xfc in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeDecodeError,
            ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"],
            "'ascii' codec can't decode bytes in position 1-2: ouch"
        )

    def test_unicodetranslateerror(self):
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            ["g\xfcrk", 1, 2, "ouch"],
            "can't translate character '\\xfc' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            ["g\u0100rk", 1, 2, "ouch"],
            "can't translate character '\\u0100' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            ["g\uffffrk", 1, 2, "ouch"],
            "can't translate character '\\uffff' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            ["g\U00010000rk", 1, 2, "ouch"],
            "can't translate character '\\U00010000' in position 1: ouch"
        )
        self.check_exceptionobjectargs(
            UnicodeTranslateError,
            ["g\xfcrk", 1, 3, "ouch"],
            "can't translate characters in position 1-2: ouch"
        )

    def test_badandgoodstrictexceptions(self):
        # "strict" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.strict_errors,
            42
        )
        # "strict" complains about the wrong exception type
        self.assertRaises(
            Exception,
            codecs.strict_errors,
            Exception("ouch")
        )

        # If the correct exception is passed in, "strict" raises it
        self.assertRaises(
            UnicodeEncodeError,
            codecs.strict_errors,
            UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
        )
        self.assertRaises(
            UnicodeDecodeError,
            codecs.strict_errors,
            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
        )
        self.assertRaises(
            UnicodeTranslateError,
            codecs.strict_errors,
            UnicodeTranslateError("\u3042", 0, 1, "ouch")
        )

    def test_badandgoodignoreexceptions(self):
        # "ignore" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.ignore_errors,
            42
        )
        # "ignore" complains about the wrong exception type
        self.assertRaises(
            TypeError,
            codecs.ignore_errors,
            UnicodeError("ouch")
        )
        # If the correct exception is passed in, "ignore" returns an empty replacement
        self.assertEqual(
            codecs.ignore_errors(
                UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
            ("", 2)
        )
        self.assertEqual(
            codecs.ignore_errors(
                UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
            ("", 2)
        )
        self.assertEqual(
            codecs.ignore_errors(
                UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
            ("", 2)
        )

    def test_badandgoodreplaceexceptions(self):
        # "replace" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.replace_errors,
            42
        )
        # "replace" complains about the wrong exception type
        self.assertRaises(
            TypeError,
            codecs.replace_errors,
            UnicodeError("ouch")
        )
        self.assertRaises(
            TypeError,
            codecs.replace_errors,
            BadObjectUnicodeEncodeError()
        )
        self.assertRaises(
            TypeError,
            codecs.replace_errors,
            BadObjectUnicodeDecodeError()
        )
        # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
        self.assertEqual(
            codecs.replace_errors(
                UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
            ("?", 2)
        )
        self.assertEqual(
            codecs.replace_errors(
                UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
            ("\ufffd", 2)
        )
        self.assertEqual(
            codecs.replace_errors(
                UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
            ("\ufffd", 2)
        )

    def test_badandgoodxmlcharrefreplaceexceptions(self):
        # "xmlcharrefreplace" complains about a non-exception passed in
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            42
        )
        # "xmlcharrefreplace" complains about the wrong exception types
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            UnicodeError("ouch")
        )
        # "xmlcharrefreplace" can only be used for encoding
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
        )
        self.assertRaises(
            TypeError,
            codecs.xmlcharrefreplace_errors,
            UnicodeTranslateError("\u3042", 0, 1, "ouch")
        )
        # Use the correct exception
        cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000,
              999999, 1000000)
        cs += (0xd800, 0xdfff)
        s = "".join(chr(c) for c in cs)
        self.assertEqual(
            codecs.xmlcharrefreplace_errors(
                UnicodeEncodeError("ascii", "a" + s + "b",
                                   1, 1 + len(s), "ouch")
            ),
            ("".join("&#%d;" % c for c in cs), 1 + len(s))
        )

def test_badandgoodbackslashreplaceexceptions(self): 538 # "backslashreplace" complains about a non-exception passed in 539 self.assertRaises( 540 TypeError, 541 codecs.backslashreplace_errors, 542 42 543 ) 544 # "backslashreplace" complains about the wrong exception types 545 self.assertRaises( 546 TypeError, 547 codecs.backslashreplace_errors, 548 UnicodeError("ouch") 549 ) 550 # Use the correct exception 551 tests = [ 552 ("\u3042", "\\u3042"), 553 ("\n", "\\x0a"), 554 ("a", "\\x61"), 555 ("\x00", "\\x00"), 556 ("\xff", "\\xff"), 557 ("\u0100", "\\u0100"), 558 ("\uffff", "\\uffff"), 559 ("\U00010000", "\\U00010000"), 560 ("\U0010ffff", "\\U0010ffff"), 561 # Lone surrogates 562 ("\ud800", "\\ud800"), 563 ("\udfff", "\\udfff"), 564 ("\ud800\udfff", "\\ud800\\udfff"), 565 ] 566 for s, r in tests: 567 with self.subTest(str=s): 568 self.assertEqual( 569 codecs.backslashreplace_errors( 570 UnicodeEncodeError("ascii", "a" + s + "b", 571 1, 1 + len(s), "ouch")), 572 (r, 1 + len(s)) 573 ) 574 self.assertEqual( 575 codecs.backslashreplace_errors( 576 UnicodeTranslateError("a" + s + "b", 577 1, 1 + len(s), "ouch")), 578 (r, 1 + len(s)) 579 ) 580 tests = [ 581 (b"a", "\\x61"), 582 (b"\n", "\\x0a"), 583 (b"\x00", "\\x00"), 584 (b"\xff", "\\xff"), 585 ] 586 for b, r in tests: 587 with self.subTest(bytes=b): 588 self.assertEqual( 589 codecs.backslashreplace_errors( 590 UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"), 591 1, 2, "ouch")), 592 (r, 2) 593 ) 594 595 def test_badandgoodnamereplaceexceptions(self): 596 # "namereplace" complains about a non-exception passed in 597 self.assertRaises( 598 TypeError, 599 codecs.namereplace_errors, 600 42 601 ) 602 # "namereplace" complains about the wrong exception types 603 self.assertRaises( 604 TypeError, 605 codecs.namereplace_errors, 606 UnicodeError("ouch") 607 ) 608 # "namereplace" can only be used for encoding 609 self.assertRaises( 610 TypeError, 611 codecs.namereplace_errors, 612 UnicodeDecodeError("ascii", 
bytearray(b"\xff"), 0, 1, "ouch") 613 ) 614 self.assertRaises( 615 TypeError, 616 codecs.namereplace_errors, 617 UnicodeTranslateError("\u3042", 0, 1, "ouch") 618 ) 619 # Use the correct exception 620 tests = [ 621 ("\u3042", "\\N{HIRAGANA LETTER A}"), 622 ("\x00", "\\x00"), 623 ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH " 624 "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"), 625 ("\U000e007f", "\\N{CANCEL TAG}"), 626 ("\U0010ffff", "\\U0010ffff"), 627 # Lone surrogates 628 ("\ud800", "\\ud800"), 629 ("\udfff", "\\udfff"), 630 ("\ud800\udfff", "\\ud800\\udfff"), 631 ] 632 for s, r in tests: 633 with self.subTest(str=s): 634 self.assertEqual( 635 codecs.namereplace_errors( 636 UnicodeEncodeError("ascii", "a" + s + "b", 637 1, 1 + len(s), "ouch")), 638 (r, 1 + len(s)) 639 ) 640 641 def test_badandgoodsurrogateescapeexceptions(self): 642 surrogateescape_errors = codecs.lookup_error('surrogateescape') 643 # "surrogateescape" complains about a non-exception passed in 644 self.assertRaises( 645 TypeError, 646 surrogateescape_errors, 647 42 648 ) 649 # "surrogateescape" complains about the wrong exception types 650 self.assertRaises( 651 TypeError, 652 surrogateescape_errors, 653 UnicodeError("ouch") 654 ) 655 # "surrogateescape" can not be used for translating 656 self.assertRaises( 657 TypeError, 658 surrogateescape_errors, 659 UnicodeTranslateError("\udc80", 0, 1, "ouch") 660 ) 661 # Use the correct exception 662 for s in ("a", "\udc7f", "\udd00"): 663 with self.subTest(str=s): 664 self.assertRaises( 665 UnicodeEncodeError, 666 surrogateescape_errors, 667 UnicodeEncodeError("ascii", s, 0, 1, "ouch") 668 ) 669 self.assertEqual( 670 surrogateescape_errors( 671 UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")), 672 (b"\x80", 2) 673 ) 674 self.assertRaises( 675 UnicodeDecodeError, 676 surrogateescape_errors, 677 UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch") 678 ) 679 self.assertEqual( 680 surrogateescape_errors( 681 UnicodeDecodeError("ascii", 
bytearray(b"a\x80b"), 1, 2, "ouch")), 682 ("\udc80", 2) 683 ) 684 685 def test_badandgoodsurrogatepassexceptions(self): 686 surrogatepass_errors = codecs.lookup_error('surrogatepass') 687 # "surrogatepass" complains about a non-exception passed in 688 self.assertRaises( 689 TypeError, 690 surrogatepass_errors, 691 42 692 ) 693 # "surrogatepass" complains about the wrong exception types 694 self.assertRaises( 695 TypeError, 696 surrogatepass_errors, 697 UnicodeError("ouch") 698 ) 699 # "surrogatepass" can not be used for translating 700 self.assertRaises( 701 TypeError, 702 surrogatepass_errors, 703 UnicodeTranslateError("\ud800", 0, 1, "ouch") 704 ) 705 # Use the correct exception 706 for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"): 707 with self.subTest(encoding=enc): 708 self.assertRaises( 709 UnicodeEncodeError, 710 surrogatepass_errors, 711 UnicodeEncodeError(enc, "a", 0, 1, "ouch") 712 ) 713 self.assertRaises( 714 UnicodeDecodeError, 715 surrogatepass_errors, 716 UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch") 717 ) 718 for s in ("\ud800", "\udfff", "\ud800\udfff"): 719 with self.subTest(str=s): 720 self.assertRaises( 721 UnicodeEncodeError, 722 surrogatepass_errors, 723 UnicodeEncodeError("ascii", s, 0, len(s), "ouch") 724 ) 725 tests = [ 726 ("utf-8", "\ud800", b'\xed\xa0\x80', 3), 727 ("utf-16le", "\ud800", b'\x00\xd8', 2), 728 ("utf-16be", "\ud800", b'\xd8\x00', 2), 729 ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4), 730 ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4), 731 ("utf-8", "\udfff", b'\xed\xbf\xbf', 3), 732 ("utf-16le", "\udfff", b'\xff\xdf', 2), 733 ("utf-16be", "\udfff", b'\xdf\xff', 2), 734 ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4), 735 ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4), 736 ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3), 737 ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2), 738 ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2), 739 ("utf-32le", "\ud800\udfff", 
b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4), 740 ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4), 741 ] 742 for enc, s, b, n in tests: 743 with self.subTest(encoding=enc, str=s, bytes=b): 744 self.assertEqual( 745 surrogatepass_errors( 746 UnicodeEncodeError(enc, "a" + s + "b", 747 1, 1 + len(s), "ouch")), 748 (b, 1 + len(s)) 749 ) 750 self.assertEqual( 751 surrogatepass_errors( 752 UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"), 753 1, 1 + n, "ouch")), 754 (s[:1], 1 + n) 755 ) 756 757 def test_badhandlerresults(self): 758 results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) 759 encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15") 760 761 for res in results: 762 codecs.register_error("test.badhandler", lambda x: res) 763 for enc in encs: 764 self.assertRaises( 765 TypeError, 766 "\u3042".encode, 767 enc, 768 "test.badhandler" 769 ) 770 for (enc, bytes) in ( 771 ("ascii", b"\xff"), 772 ("utf-8", b"\xff"), 773 ("utf-7", b"+x-"), 774 ): 775 self.assertRaises( 776 TypeError, 777 bytes.decode, 778 enc, 779 "test.badhandler" 780 ) 781 782 def test_lookup(self): 783 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) 784 self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore")) 785 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) 786 self.assertEqual( 787 codecs.xmlcharrefreplace_errors, 788 codecs.lookup_error("xmlcharrefreplace") 789 ) 790 self.assertEqual( 791 codecs.backslashreplace_errors, 792 codecs.lookup_error("backslashreplace") 793 ) 794 self.assertEqual( 795 codecs.namereplace_errors, 796 codecs.lookup_error("namereplace") 797 ) 798 799 def test_encode_nonascii_replacement(self): 800 def handle(exc): 801 if isinstance(exc, UnicodeEncodeError): 802 return (repl, exc.end) 803 raise TypeError("don't know how to handle %r" % exc) 804 codecs.register_error("test.replacing", handle) 805 806 for enc, input, repl in ( 807 ("ascii", 
"[¤]", "abc"), 808 ("iso-8859-1", "[€]", "½¾"), 809 ("iso-8859-15", "[¤]", "œŸ"), 810 ): 811 res = input.encode(enc, "test.replacing") 812 self.assertEqual(res, ("[" + repl + "]").encode(enc)) 813 814 for enc, input, repl in ( 815 ("utf-8", "[\udc80]", "\U0001f40d"), 816 ("utf-16", "[\udc80]", "\U0001f40d"), 817 ("utf-32", "[\udc80]", "\U0001f40d"), 818 ): 819 with self.subTest(encoding=enc): 820 with self.assertRaises(UnicodeEncodeError) as cm: 821 input.encode(enc, "test.replacing") 822 exc = cm.exception 823 self.assertEqual(exc.start, 1) 824 self.assertEqual(exc.end, 2) 825 self.assertEqual(exc.object, input) 826 827 def test_encode_unencodable_replacement(self): 828 def unencrepl(exc): 829 if isinstance(exc, UnicodeEncodeError): 830 return (repl, exc.end) 831 else: 832 raise TypeError("don't know how to handle %r" % exc) 833 codecs.register_error("test.unencreplhandler", unencrepl) 834 835 for enc, input, repl in ( 836 ("ascii", "[¤]", "½"), 837 ("iso-8859-1", "[€]", "œ"), 838 ("iso-8859-15", "[¤]", "½"), 839 ("utf-8", "[\udc80]", "\udcff"), 840 ("utf-16", "[\udc80]", "\udcff"), 841 ("utf-32", "[\udc80]", "\udcff"), 842 ): 843 with self.subTest(encoding=enc): 844 with self.assertRaises(UnicodeEncodeError) as cm: 845 input.encode(enc, "test.unencreplhandler") 846 exc = cm.exception 847 self.assertEqual(exc.start, 1) 848 self.assertEqual(exc.end, 2) 849 self.assertEqual(exc.object, input) 850 851 def test_encode_bytes_replacement(self): 852 def handle(exc): 853 if isinstance(exc, UnicodeEncodeError): 854 return (repl, exc.end) 855 raise TypeError("don't know how to handle %r" % exc) 856 codecs.register_error("test.replacing", handle) 857 858 # It works even if the bytes sequence is not decodable. 
859 for enc, input, repl in ( 860 ("ascii", "[¤]", b"\xbd\xbe"), 861 ("iso-8859-1", "[€]", b"\xbd\xbe"), 862 ("iso-8859-15", "[¤]", b"\xbd\xbe"), 863 ("utf-8", "[\udc80]", b"\xbd\xbe"), 864 ("utf-16le", "[\udc80]", b"\xbd\xbe"), 865 ("utf-16be", "[\udc80]", b"\xbd\xbe"), 866 ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"), 867 ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"), 868 ): 869 with self.subTest(encoding=enc): 870 res = input.encode(enc, "test.replacing") 871 self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc)) 872 873 def test_encode_odd_bytes_replacement(self): 874 def handle(exc): 875 if isinstance(exc, UnicodeEncodeError): 876 return (repl, exc.end) 877 raise TypeError("don't know how to handle %r" % exc) 878 codecs.register_error("test.replacing", handle) 879 880 input = "[\udc80]" 881 # Tests in which the replacement bytestring contains not whole number 882 # of code units. 883 for enc, repl in ( 884 *itertools.product(("utf-16le", "utf-16be"), 885 [b"a", b"abc"]), 886 *itertools.product(("utf-32le", "utf-32be"), 887 [b"a", b"ab", b"abc", b"abcde"]), 888 ): 889 with self.subTest(encoding=enc, repl=repl): 890 with self.assertRaises(UnicodeEncodeError) as cm: 891 input.encode(enc, "test.replacing") 892 exc = cm.exception 893 self.assertEqual(exc.start, 1) 894 self.assertEqual(exc.end, 2) 895 self.assertEqual(exc.object, input) 896 self.assertEqual(exc.reason, "surrogates not allowed") 897 898 def test_badregistercall(self): 899 # enhance coverage of: 900 # Modules/_codecsmodule.c::register_error() 901 # Python/codecs.c::PyCodec_RegisterError() 902 self.assertRaises(TypeError, codecs.register_error, 42) 903 self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42) 904 905 def test_badlookupcall(self): 906 # enhance coverage of: 907 # Modules/_codecsmodule.c::lookup_error() 908 self.assertRaises(TypeError, codecs.lookup_error) 909 910 def test_unknownhandler(self): 911 # enhance coverage of: 912 # Modules/_codecsmodule.c::lookup_error() 
913 self.assertRaises(LookupError, codecs.lookup_error, "test.unknown") 914 915 def test_xmlcharrefvalues(self): 916 # enhance coverage of: 917 # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors() 918 # and inline implementations 919 v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000, 920 500000, 1000000) 921 s = "".join([chr(x) for x in v]) 922 codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors) 923 for enc in ("ascii", "iso-8859-15"): 924 for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"): 925 s.encode(enc, err) 926 927 def test_decodehelper(self): 928 # enhance coverage of: 929 # Objects/unicodeobject.c::unicode_decode_call_errorhandler() 930 # and callers 931 self.assertRaises(LookupError, b"\xff".decode, "ascii", "test.unknown") 932 933 def baddecodereturn1(exc): 934 return 42 935 codecs.register_error("test.baddecodereturn1", baddecodereturn1) 936 self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1") 937 self.assertRaises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1") 938 self.assertRaises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1") 939 self.assertRaises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1") 940 self.assertRaises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1") 941 self.assertRaises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1") 942 943 def baddecodereturn2(exc): 944 return ("?", None) 945 codecs.register_error("test.baddecodereturn2", baddecodereturn2) 946 self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn2") 947 948 handler = PosReturn() 949 codecs.register_error("test.posreturn", handler.handle) 950 951 # Valid negative position 952 handler.pos = -1 953 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0") 954 955 # Valid negative position 956 handler.pos = -2 957 self.assertEqual(b"\xff0".decode("ascii", 
"test.posreturn"), "<?><?>") 958 959 # Negative position out of bounds 960 handler.pos = -3 961 self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn") 962 963 # Valid positive position 964 handler.pos = 1 965 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0") 966 967 # Largest valid positive position (one beyond end of input) 968 handler.pos = 2 969 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>") 970 971 # Invalid positive position 972 handler.pos = 3 973 self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn") 974 975 # Restart at the "0" 976 handler.pos = 6 977 self.assertEqual(b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "<?>0") 978 979 class D(dict): 980 def __getitem__(self, key): 981 raise ValueError 982 self.assertRaises(UnicodeError, codecs.charmap_decode, b"\xff", "strict", {0xff: None}) 983 self.assertRaises(ValueError, codecs.charmap_decode, b"\xff", "strict", D()) 984 self.assertRaises(TypeError, codecs.charmap_decode, b"\xff", "strict", {0xff: sys.maxunicode+1}) 985 986 def test_encodehelper(self): 987 # enhance coverage of: 988 # Objects/unicodeobject.c::unicode_encode_call_errorhandler() 989 # and callers 990 self.assertRaises(LookupError, "\xff".encode, "ascii", "test.unknown") 991 992 def badencodereturn1(exc): 993 return 42 994 codecs.register_error("test.badencodereturn1", badencodereturn1) 995 self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn1") 996 997 def badencodereturn2(exc): 998 return ("?", None) 999 codecs.register_error("test.badencodereturn2", badencodereturn2) 1000 self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn2") 1001 1002 handler = PosReturn() 1003 codecs.register_error("test.posreturn", handler.handle) 1004 1005 # Valid negative position 1006 handler.pos = -1 1007 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0") 1008 1009 # Valid negative position 1010 handler.pos = -2 1011 
self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?><?>") 1012 1013 # Negative position out of bounds 1014 handler.pos = -3 1015 self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn") 1016 1017 # Valid positive position 1018 handler.pos = 1 1019 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0") 1020 1021 # Largest valid positive position (one beyond end of input 1022 handler.pos = 2 1023 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>") 1024 1025 # Invalid positive position 1026 handler.pos = 3 1027 self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn") 1028 1029 handler.pos = 0 1030 1031 class D(dict): 1032 def __getitem__(self, key): 1033 raise ValueError 1034 for err in ("strict", "replace", "xmlcharrefreplace", 1035 "backslashreplace", "namereplace", "test.posreturn"): 1036 self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None}) 1037 self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D()) 1038 self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300}) 1039 1040 def test_decodehelper_bug36819(self): 1041 handler = RepeatedPosReturn("x") 1042 codecs.register_error("test.bug36819", handler.handle) 1043 1044 testcases = [ 1045 ("ascii", b"\xff"), 1046 ("utf-8", b"\xff"), 1047 ("utf-16be", b'\xdc\x80'), 1048 ("utf-32be", b'\x00\x00\xdc\x80'), 1049 ("iso-8859-6", b"\xff"), 1050 ] 1051 for enc, bad in testcases: 1052 input = "abcd".encode(enc) + bad 1053 with self.subTest(encoding=enc): 1054 handler.count = 50 1055 decoded = input.decode(enc, "test.bug36819") 1056 self.assertEqual(decoded, 'abcdx' * 51) 1057 1058 def test_encodehelper_bug36819(self): 1059 handler = RepeatedPosReturn() 1060 codecs.register_error("test.bug36819", handler.handle) 1061 1062 input = "abcd\udc80" 1063 encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in 1064 encodings += ["iso-8859-15"] # charmap codec 1065 if sys.platform == 
'win32': 1066 encodings = ["mbcs", "oem"] # code page codecs 1067 1068 handler.repl = "\udcff" 1069 for enc in encodings: 1070 with self.subTest(encoding=enc): 1071 handler.count = 50 1072 with self.assertRaises(UnicodeEncodeError) as cm: 1073 input.encode(enc, "test.bug36819") 1074 exc = cm.exception 1075 self.assertEqual(exc.start, 4) 1076 self.assertEqual(exc.end, 5) 1077 self.assertEqual(exc.object, input) 1078 if sys.platform == "win32": 1079 handler.count = 50 1080 with self.assertRaises(UnicodeEncodeError) as cm: 1081 codecs.code_page_encode(437, input, "test.bug36819") 1082 exc = cm.exception 1083 self.assertEqual(exc.start, 4) 1084 self.assertEqual(exc.end, 5) 1085 self.assertEqual(exc.object, input) 1086 1087 handler.repl = "x" 1088 for enc in encodings: 1089 with self.subTest(encoding=enc): 1090 # The interpreter should segfault after a handful of attempts. 1091 # 50 was chosen to try to ensure a segfault without a fix, 1092 # but not OOM a machine with one. 1093 handler.count = 50 1094 encoded = input.encode(enc, "test.bug36819") 1095 self.assertEqual(encoded.decode(enc), "abcdx" * 51) 1096 if sys.platform == "win32": 1097 handler.count = 50 1098 encoded = codecs.code_page_encode(437, input, "test.bug36819") 1099 self.assertEqual(encoded[0].decode(), "abcdx" * 51) 1100 self.assertEqual(encoded[1], len(input)) 1101 1102 def test_translatehelper(self): 1103 # enhance coverage of: 1104 # Objects/unicodeobject.c::unicode_encode_call_errorhandler() 1105 # and callers 1106 # (Unfortunately the errors argument is not directly accessible 1107 # from Python, so we can't test that much) 1108 class D(dict): 1109 def __getitem__(self, key): 1110 raise ValueError 1111 #self.assertRaises(ValueError, "\xff".translate, D()) 1112 self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1}) 1113 self.assertRaises(TypeError, "\xff".translate, {0xff: ()}) 1114 1115 def test_bug828737(self): 1116 charmap = { 1117 ord("&"): "&", 1118 ord("<"): "<", 1119 
ord(">"): ">", 1120 ord('"'): """, 1121 } 1122 1123 for n in (1, 10, 100, 1000): 1124 text = 'abc<def>ghi'*n 1125 text.translate(charmap) 1126 1127 def test_mutatingdecodehandler(self): 1128 baddata = [ 1129 ("ascii", b"\xff"), 1130 ("utf-7", b"++"), 1131 ("utf-8", b"\xff"), 1132 ("utf-16", b"\xff"), 1133 ("utf-32", b"\xff"), 1134 ("unicode-escape", b"\\u123g"), 1135 ("raw-unicode-escape", b"\\u123g"), 1136 ] 1137 1138 def replacing(exc): 1139 if isinstance(exc, UnicodeDecodeError): 1140 exc.object = 42 1141 return ("\u4242", 0) 1142 else: 1143 raise TypeError("don't know how to handle %r" % exc) 1144 codecs.register_error("test.replacing", replacing) 1145 1146 for (encoding, data) in baddata: 1147 with self.assertRaises(TypeError): 1148 data.decode(encoding, "test.replacing") 1149 1150 def mutating(exc): 1151 if isinstance(exc, UnicodeDecodeError): 1152 exc.object = b"" 1153 return ("\u4242", 0) 1154 else: 1155 raise TypeError("don't know how to handle %r" % exc) 1156 codecs.register_error("test.mutating", mutating) 1157 # If the decoder doesn't pick up the modified input the following 1158 # will lead to an endless loop 1159 for (encoding, data) in baddata: 1160 self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242") 1161 1162 # issue32583 1163 def test_crashing_decode_handler(self): 1164 # better generating one more character to fill the extra space slot 1165 # so in debug build it can steadily fail 1166 def forward_shorter_than_end(exc): 1167 if isinstance(exc, UnicodeDecodeError): 1168 # size one character, 0 < forward < exc.end 1169 return ('\ufffd', exc.start+1) 1170 else: 1171 raise TypeError("don't know how to handle %r" % exc) 1172 codecs.register_error( 1173 "test.forward_shorter_than_end", forward_shorter_than_end) 1174 1175 self.assertEqual( 1176 b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode( 1177 'utf-16-le', 'test.forward_shorter_than_end'), 1178 '\ufffd\ufffd\ufffd\ufffd\xd8\x00' 1179 ) 1180 self.assertEqual( 1181 
b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode( 1182 'utf-16-be', 'test.forward_shorter_than_end'), 1183 '\ufffd\ufffd\ufffd\ufffd\xd8\x00' 1184 ) 1185 self.assertEqual( 1186 b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode( 1187 'utf-32-le', 'test.forward_shorter_than_end'), 1188 '\ufffd\ufffd\ufffd\u1111\x00' 1189 ) 1190 self.assertEqual( 1191 b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode( 1192 'utf-32-be', 'test.forward_shorter_than_end'), 1193 '\ufffd\ufffd\ufffd\u1111\x00' 1194 ) 1195 1196 def replace_with_long(exc): 1197 if isinstance(exc, UnicodeDecodeError): 1198 exc.object = b"\x00" * 8 1199 return ('\ufffd', exc.start) 1200 else: 1201 raise TypeError("don't know how to handle %r" % exc) 1202 codecs.register_error("test.replace_with_long", replace_with_long) 1203 1204 self.assertEqual( 1205 b'\x00'.decode('utf-16', 'test.replace_with_long'), 1206 '\ufffd\x00\x00\x00\x00' 1207 ) 1208 self.assertEqual( 1209 b'\x00'.decode('utf-32', 'test.replace_with_long'), 1210 '\ufffd\x00\x00' 1211 ) 1212 1213 1214 def test_fake_error_class(self): 1215 handlers = [ 1216 codecs.strict_errors, 1217 codecs.ignore_errors, 1218 codecs.replace_errors, 1219 codecs.backslashreplace_errors, 1220 codecs.namereplace_errors, 1221 codecs.xmlcharrefreplace_errors, 1222 codecs.lookup_error('surrogateescape'), 1223 codecs.lookup_error('surrogatepass'), 1224 ] 1225 for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError: 1226 class FakeUnicodeError(str): 1227 __class__ = cls 1228 for handler in handlers: 1229 with self.subTest(handler=handler, error_class=cls): 1230 self.assertRaises(TypeError, handler, FakeUnicodeError()) 1231 class FakeUnicodeError(Exception): 1232 __class__ = cls 1233 for handler in handlers: 1234 with self.subTest(handler=handler, error_class=cls): 1235 with self.assertRaises((TypeError, FakeUnicodeError)): 1236 handler(FakeUnicodeError()) 1237 1238 1239if __name__ == "__main__": 1240 unittest.main() 1241