1""" Test script for the Unicode implementation. 2 3Written by Marc-Andre Lemburg ([email protected]). 4 5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 6 7"""#" 8import sys 9import struct 10import codecs 11import unittest 12from test import test_support, string_tests 13 14# decorator to skip tests on narrow builds 15requires_wide_build = unittest.skipIf(sys.maxunicode == 65535, 16 'requires wide build') 17 18# Error handling (bad decoder return) 19def search_function(encoding): 20 def decode1(input, errors="strict"): 21 return 42 # not a tuple 22 def encode1(input, errors="strict"): 23 return 42 # not a tuple 24 def encode2(input, errors="strict"): 25 return (42, 42) # no unicode 26 def decode2(input, errors="strict"): 27 return (42, 42) # no unicode 28 if encoding=="test.unicode1": 29 return (encode1, decode1, None, None) 30 elif encoding=="test.unicode2": 31 return (encode2, decode2, None, None) 32 else: 33 return None 34codecs.register(search_function) 35 36class UnicodeSubclass(unicode): 37 pass 38 39class UnicodeTest( 40 string_tests.CommonTest, 41 string_tests.MixinStrUnicodeUserStringTest, 42 string_tests.MixinStrUnicodeTest, 43 ): 44 type2test = unicode 45 46 def assertEqual(self, first, second, msg=None): 47 # strict assertEqual method: reject implicit bytes/unicode equality 48 super(UnicodeTest, self).assertEqual(first, second, msg) 49 if isinstance(first, unicode) or isinstance(second, unicode): 50 self.assertIsInstance(first, unicode) 51 self.assertIsInstance(second, unicode) 52 elif isinstance(first, str) or isinstance(second, str): 53 self.assertIsInstance(first, str) 54 self.assertIsInstance(second, str) 55 56 def checkequalnofix(self, result, object, methodname, *args): 57 method = getattr(object, methodname) 58 realresult = method(*args) 59 self.assertEqual(realresult, result) 60 self.assertTrue(type(realresult) is type(result)) 61 62 # if the original is returned make sure that 63 # this doesn't happen with subclasses 64 if realresult is object: 65 class usub(unicode): 66 def __repr__(self): 67 return 'usub(%r)' % unicode.__repr__(self) 68 object = usub(object) 69 method = getattr(object, methodname) 70 realresult = method(*args) 71 self.assertEqual(realresult, result) 72 self.assertTrue(object is not realresult) 73 74 def test_literals(self): 75 self.assertEqual(u'\xff', u'\u00ff') 76 self.assertEqual(u'\uffff', u'\U0000ffff') 77 self.assertRaises(SyntaxError, eval, 'u\'\\Ufffffffe\'') 78 self.assertRaises(SyntaxError, eval, 'u\'\\Uffffffff\'') 79 self.assertRaises(SyntaxError, eval, 'u\'\\U%08x\'' % 0x110000) 80 81 def test_repr(self): 82 if not sys.platform.startswith('java'): 83 # Test basic sanity of repr() 84 self.assertEqual(repr(u'abc'), "u'abc'") 85 self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'") 86 self.assertEqual(repr(u'ab\\'), "u'ab\\\\'") 87 self.assertEqual(repr(u'\\c'), "u'\\\\c'") 88 self.assertEqual(repr(u'\\'), "u'\\\\'") 89 self.assertEqual(repr(u'\n'), "u'\\n'") 90 self.assertEqual(repr(u'\r'), "u'\\r'") 91 self.assertEqual(repr(u'\t'), "u'\\t'") 92 self.assertEqual(repr(u'\b'), "u'\\x08'") 93 self.assertEqual(repr(u"'\""), """u'\\'"'""") 94 self.assertEqual(repr(u"'\""), """u'\\'"'""") 95 self.assertEqual(repr(u"'"), '''u"'"''') 96 self.assertEqual(repr(u'"'), """u'"'""") 97 latin1repr = ( 98 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r" 99 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a" 100 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI" 101 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f" 102 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d" 103 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b" 104 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9" 105 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7" 106 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5" 107 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3" 108 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1" 109 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef" 110 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd" 111 "\\xfe\\xff'") 112 testrepr = repr(u''.join(map(unichr, xrange(256)))) 113 self.assertEqual(testrepr, latin1repr) 114 # Test repr works on wide unicode escapes without overflow. 115 self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096), 116 repr(u"\U00010000" * 39 + u"\uffff" * 4096)) 117 118 119 def test_count(self): 120 string_tests.CommonTest.test_count(self) 121 # check mixed argument types 122 self.checkequalnofix(3, 'aaa', 'count', u'a') 123 self.checkequalnofix(0, 'aaa', 'count', u'b') 124 self.checkequalnofix(3, u'aaa', 'count', 'a') 125 self.checkequalnofix(0, u'aaa', 'count', 'b') 126 self.checkequalnofix(0, u'aaa', 'count', 'b') 127 self.checkequalnofix(1, u'aaa', 'count', 'a', -1) 128 self.checkequalnofix(3, u'aaa', 'count', 'a', -10) 129 self.checkequalnofix(2, u'aaa', 'count', 'a', 0, -1) 130 self.checkequalnofix(0, u'aaa', 'count', 'a', 0, -10) 131 132 def test_find(self): 133 self.checkequalnofix(0, u'abcdefghiabc', 'find', u'abc') 134 self.checkequalnofix(9, u'abcdefghiabc', 'find', u'abc', 1) 135 self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4) 136 137 self.assertRaises(TypeError, u'hello'.find) 138 self.assertRaises(TypeError, u'hello'.find, 42) 139 140 def test_rfind(self): 141 string_tests.CommonTest.test_rfind(self) 142 # check mixed argument types 143 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', u'abc') 144 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', u'') 145 self.checkequalnofix(12, u'abcdefghiabc', 'rfind', '') 146 147 def test_index(self): 148 string_tests.CommonTest.test_index(self) 149 # check mixed argument types 150 for (t1, t2) in ((str, unicode), (unicode, str)): 151 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('')) 152 self.checkequalnofix(3, t1('abcdefghiabc'), 'index', t2('def')) 153 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('abc')) 154 self.checkequalnofix(9, t1('abcdefghiabc'), 'index', t2('abc'), 1) 155 self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib')) 156 self.assertRaises(ValueError, t1('abcdefghiab').index, t2('abc'), 1) 157 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), 8) 158 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), -1) 159 160 def test_rindex(self): 161 string_tests.CommonTest.test_rindex(self) 162 # check mixed argument types 163 for (t1, t2) in ((str, unicode), (unicode, str)): 164 self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex', t2('')) 165 self.checkequalnofix(3, t1('abcdefghiabc'), 'rindex', t2('def')) 166 self.checkequalnofix(9, t1('abcdefghiabc'), 'rindex', t2('abc')) 167 self.checkequalnofix(0, t1('abcdefghiabc'), 'rindex', t2('abc'), 0, -1) 168 169 self.assertRaises(ValueError, t1('abcdefghiabc').rindex, t2('hib')) 170 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('def'), 1) 171 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('abc'), 0, -1) 172 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, 8) 173 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, -1) 174 175 def test_translate(self): 176 self.checkequalnofix(u'bbbc', u'abababc', 'translate', {ord('a'):None}) 177 self.checkequalnofix(u'iiic', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')}) 178 self.checkequalnofix(u'iiix', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'}) 179 self.checkequalnofix(u'<i><i><i>c', u'abababc', 'translate', {ord('a'):None, ord('b'):u'<i>'}) 180 self.checkequalnofix(u'c', u'abababc', 'translate', {ord('a'):None, ord('b'):u''}) 181 self.checkequalnofix(u'xyyx', u'xzx', 'translate', {ord('z'):u'yy'}) 182 183 self.assertRaises(TypeError, u'hello'.translate) 184 self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''}) 185 186 def test_split(self): 187 string_tests.CommonTest.test_split(self) 188 189 # Mixed arguments 190 self.checkequalnofix([u'a', u'b', u'c', u'd'], u'a//b//c//d', 'split', '//') 191 self.checkequalnofix([u'a', u'b', u'c', u'd'], 'a//b//c//d', 'split', u'//') 192 self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test') 193 194 def test_join(self): 195 string_tests.MixinStrUnicodeUserStringTest.test_join(self) 196 197 # mixed arguments 198 self.checkequalnofix(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd']) 199 self.checkequalnofix(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd')) 200 self.checkequalnofix(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz')) 201 self.checkequalnofix(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd']) 202 self.checkequalnofix(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd']) 203 self.checkequalnofix(u'abcd', '', 'join', (u'a', u'b', u'c', u'd')) 204 self.checkequalnofix(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz')) 205 206 def test_strip(self): 207 string_tests.CommonTest.test_strip(self) 208 self.assertRaises(UnicodeError, u"hello".strip, "\xff") 209 210 def test_replace(self): 211 string_tests.CommonTest.test_replace(self) 212 213 # method call forwarded from str implementation because of unicode argument 214 self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1) 215 self.assertRaises(TypeError, 'replace'.replace, u"r", 42) 216 217 def test_comparison(self): 218 # Comparisons: 219 self.assertTrue(u'abc' == 'abc') 220 self.assertTrue('abc' == u'abc') 221 self.assertTrue(u'abc' == u'abc') 222 self.assertTrue(u'abcd' > 'abc') 223 self.assertTrue('abcd' > u'abc') 224 self.assertTrue(u'abcd' > u'abc') 225 self.assertTrue(u'abc' < 'abcd') 226 self.assertTrue('abc' < u'abcd') 227 self.assertTrue(u'abc' < u'abcd') 228 229 if 0: 230 # Move these tests to a Unicode collation module test... 231 # Testing UTF-16 code point order comparisons... 232 233 # No surrogates, no fixup required. 234 self.assertTrue(u'\u0061' < u'\u20ac') 235 # Non surrogate below surrogate value, no fixup required 236 self.assertTrue(u'\u0061' < u'\ud800\udc02') 237 238 # Non surrogate above surrogate value, fixup required 239 def test_lecmp(s, s2): 240 self.assertTrue(s < s2) 241 242 def test_fixup(s): 243 s2 = u'\ud800\udc01' 244 test_lecmp(s, s2) 245 s2 = u'\ud900\udc01' 246 test_lecmp(s, s2) 247 s2 = u'\uda00\udc01' 248 test_lecmp(s, s2) 249 s2 = u'\udb00\udc01' 250 test_lecmp(s, s2) 251 s2 = u'\ud800\udd01' 252 test_lecmp(s, s2) 253 s2 = u'\ud900\udd01' 254 test_lecmp(s, s2) 255 s2 = u'\uda00\udd01' 256 test_lecmp(s, s2) 257 s2 = u'\udb00\udd01' 258 test_lecmp(s, s2) 259 s2 = u'\ud800\ude01' 260 test_lecmp(s, s2) 261 s2 = u'\ud900\ude01' 262 test_lecmp(s, s2) 263 s2 = u'\uda00\ude01' 264 test_lecmp(s, s2) 265 s2 = u'\udb00\ude01' 266 test_lecmp(s, s2) 267 s2 = u'\ud800\udfff' 268 test_lecmp(s, s2) 269 s2 = u'\ud900\udfff' 270 test_lecmp(s, s2) 271 s2 = u'\uda00\udfff' 272 test_lecmp(s, s2) 273 s2 = u'\udb00\udfff' 274 test_lecmp(s, s2) 275 276 test_fixup(u'\ue000') 277 test_fixup(u'\uff61') 278 279 # Surrogates on both sides, no fixup required 280 self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56') 281 282 def test_capitalize(self): 283 string_tests.CommonTest.test_capitalize(self) 284 # check that titlecased chars are lowered correctly 285 # \u1ffc is the titlecased char 286 self.checkequal(u'\u1ffc\u1ff3\u1ff3\u1ff3', 287 u'\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize') 288 # check with cased non-letter chars 289 self.checkequal(u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd', 290 u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3', 'capitalize') 291 self.checkequal(u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd', 292 u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd', 'capitalize') 293 self.checkequal(u'\u2160\u2171\u2172', 294 u'\u2160\u2161\u2162', 'capitalize') 295 self.checkequal(u'\u2160\u2171\u2172', 296 u'\u2170\u2171\u2172', 'capitalize') 297 # check with Ll chars with no upper - nothing changes here 298 self.checkequal(u'\u019b\u1d00\u1d86\u0221\u1fb7', 299 u'\u019b\u1d00\u1d86\u0221\u1fb7', 'capitalize') 300 301 def test_islower(self): 302 string_tests.MixinStrUnicodeUserStringTest.test_islower(self) 303 self.checkequalnofix(False, u'\u1FFc', 'islower') 304 305 @requires_wide_build 306 def test_islower_non_bmp(self): 307 # non-BMP, uppercase 308 self.assertFalse(u'\U00010401'.islower()) 309 self.assertFalse(u'\U00010427'.islower()) 310 # non-BMP, lowercase 311 self.assertTrue(u'\U00010429'.islower()) 312 self.assertTrue(u'\U0001044E'.islower()) 313 # non-BMP, non-cased 314 self.assertFalse(u'\U0001F40D'.islower()) 315 self.assertFalse(u'\U0001F46F'.islower()) 316 317 def test_isupper(self): 318 string_tests.MixinStrUnicodeUserStringTest.test_isupper(self) 319 if not sys.platform.startswith('java'): 320 self.checkequalnofix(False, u'\u1FFc', 'isupper') 321 322 @requires_wide_build 323 def test_isupper_non_bmp(self): 324 # non-BMP, uppercase 325 self.assertTrue(u'\U00010401'.isupper()) 326 self.assertTrue(u'\U00010427'.isupper()) 327 # non-BMP, lowercase 328 self.assertFalse(u'\U00010429'.isupper()) 329 self.assertFalse(u'\U0001044E'.isupper()) 330 # non-BMP, non-cased 331 self.assertFalse(u'\U0001F40D'.isupper()) 332 self.assertFalse(u'\U0001F46F'.isupper()) 333 334 def test_istitle(self): 335 string_tests.MixinStrUnicodeUserStringTest.test_istitle(self) 336 self.checkequalnofix(True, u'\u1FFc', 'istitle') 337 self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle') 338 339 @requires_wide_build 340 def test_istitle_non_bmp(self): 341 # non-BMP, uppercase + lowercase 342 self.assertTrue(u'\U00010401\U00010429'.istitle()) 343 self.assertTrue(u'\U00010427\U0001044E'.istitle()) 344 # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6 345 for ch in [u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F']: 346 self.assertFalse(ch.istitle(), '{!r} is not title'.format(ch)) 347 348 def test_isspace(self): 349 string_tests.MixinStrUnicodeUserStringTest.test_isspace(self) 350 self.checkequalnofix(True, u'\u2000', 'isspace') 351 self.checkequalnofix(True, u'\u200a', 'isspace') 352 self.checkequalnofix(False, u'\u2014', 'isspace') 353 354 @requires_wide_build 355 def test_isspace_non_bmp(self): 356 # apparently there are no non-BMP spaces chars in Unicode 6 357 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E', 358 u'\U0001F40D', u'\U0001F46F']: 359 self.assertFalse(ch.isspace(), '{!r} is not space.'.format(ch)) 360 361 @requires_wide_build 362 def test_isalnum_non_bmp(self): 363 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E', 364 u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']: 365 self.assertTrue(ch.isalnum(), '{!r} is alnum.'.format(ch)) 366 367 def test_isalpha(self): 368 string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self) 369 self.checkequalnofix(True, u'\u1FFc', 'isalpha') 370 371 @requires_wide_build 372 def test_isalpha_non_bmp(self): 373 # non-BMP, cased 374 self.assertTrue(u'\U00010401'.isalpha()) 375 self.assertTrue(u'\U00010427'.isalpha()) 376 self.assertTrue(u'\U00010429'.isalpha()) 377 self.assertTrue(u'\U0001044E'.isalpha()) 378 # non-BMP, non-cased 379 self.assertFalse(u'\U0001F40D'.isalpha()) 380 self.assertFalse(u'\U0001F46F'.isalpha()) 381 382 def test_isdecimal(self): 383 self.checkequalnofix(False, u'', 'isdecimal') 384 self.checkequalnofix(False, u'a', 'isdecimal') 385 self.checkequalnofix(True, u'0', 'isdecimal') 386 self.checkequalnofix(False, u'\u2460', 'isdecimal') # CIRCLED DIGIT ONE 387 self.checkequalnofix(False, u'\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER 388 self.checkequalnofix(True, u'\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO 389 self.checkequalnofix(True, u'0123456789', 'isdecimal') 390 self.checkequalnofix(False, u'0123456789a', 'isdecimal') 391 392 self.checkraises(TypeError, 'abc', 'isdecimal', 42) 393 394 @requires_wide_build 395 def test_isdecimal_non_bmp(self): 396 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E', 397 u'\U0001F40D', u'\U0001F46F', u'\U00011065', u'\U0001F107']: 398 self.assertFalse(ch.isdecimal(), '{!r} is not decimal.'.format(ch)) 399 for ch in [u'\U0001D7F6', u'\U000104A0', u'\U000104A0']: 400 self.assertTrue(ch.isdecimal(), '{!r} is decimal.'.format(ch)) 401 402 def test_isdigit(self): 403 string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self) 404 self.checkequalnofix(True, u'\u2460', 'isdigit') 405 self.checkequalnofix(False, u'\xbc', 'isdigit') 406 self.checkequalnofix(True, u'\u0660', 'isdigit') 407 408 @requires_wide_build 409 def test_isdigit_non_bmp(self): 410 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E', 411 u'\U0001F40D', u'\U0001F46F', u'\U00011065']: 412 self.assertFalse(ch.isdigit(), '{!r} is not a digit.'.format(ch)) 413 for ch in [u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']: 414 self.assertTrue(ch.isdigit(), '{!r} is a digit.'.format(ch)) 415 416 def test_isnumeric(self): 417 self.checkequalnofix(False, u'', 'isnumeric') 418 self.checkequalnofix(False, u'a', 'isnumeric') 419 self.checkequalnofix(True, u'0', 'isnumeric') 420 self.checkequalnofix(True, u'\u2460', 'isnumeric') 421 self.checkequalnofix(True, u'\xbc', 'isnumeric') 422 self.checkequalnofix(True, u'\u0660', 'isnumeric') 423 self.checkequalnofix(True, u'0123456789', 'isnumeric') 424 self.checkequalnofix(False, u'0123456789a', 'isnumeric') 425 426 self.assertRaises(TypeError, u"abc".isnumeric, 42) 427 428 @requires_wide_build 429 def test_isnumeric_non_bmp(self): 430 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E', 431 u'\U0001F40D', u'\U0001F46F']: 432 self.assertFalse(ch.isnumeric(), '{!r} is not numeric.'.format(ch)) 433 for ch in [u'\U00010107', u'\U0001D7F6', u'\U00023b1b', 434 u'\U000104A0', u'\U0001F107']: 435 self.assertTrue(ch.isnumeric(), '{!r} is numeric.'.format(ch)) 436 437 @requires_wide_build 438 def test_surrogates(self): 439 # this test actually passes on narrow too, but it's just by accident. 440 # Surrogates are seen as non-cased chars, so u'X\uD800X' is as 441 # uppercase as 'X X' 442 for s in (u'a\uD800b\uDFFF', u'a\uDFFFb\uD800', 443 u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a'): 444 self.assertTrue(s.islower()) 445 self.assertFalse(s.isupper()) 446 self.assertFalse(s.istitle()) 447 for s in (u'A\uD800B\uDFFF', u'A\uDFFFB\uD800', 448 u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A'): 449 self.assertFalse(s.islower()) 450 self.assertTrue(s.isupper()) 451 self.assertTrue(s.istitle()) 452 453 for meth_name in ('islower', 'isupper', 'istitle'): 454 meth = getattr(unicode, meth_name) 455 for s in (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF'): 456 self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name)) 457 458 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace', 459 'isdecimal', 'isnumeric'): 460 meth = getattr(unicode, meth_name) 461 for s in (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF', 462 u'a\uD800b\uDFFF', u'a\uDFFFb\uD800', 463 u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a'): 464 self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name)) 465 466 467 @requires_wide_build 468 def test_lower(self): 469 string_tests.CommonTest.test_lower(self) 470 self.assertEqual(u'\U00010427'.lower(), u'\U0001044F') 471 self.assertEqual(u'\U00010427\U00010427'.lower(), 472 u'\U0001044F\U0001044F') 473 self.assertEqual(u'\U00010427\U0001044F'.lower(), 474 u'\U0001044F\U0001044F') 475 self.assertEqual(u'X\U00010427x\U0001044F'.lower(), 476 u'x\U0001044Fx\U0001044F') 477 478 @requires_wide_build 479 def test_upper(self): 480 string_tests.CommonTest.test_upper(self) 481 self.assertEqual(u'\U0001044F'.upper(), u'\U00010427') 482 self.assertEqual(u'\U0001044F\U0001044F'.upper(), 483 u'\U00010427\U00010427') 484 self.assertEqual(u'\U00010427\U0001044F'.upper(), 485 u'\U00010427\U00010427') 486 self.assertEqual(u'X\U00010427x\U0001044F'.upper(), 487 u'X\U00010427X\U00010427') 488 489 @requires_wide_build 490 def test_capitalize_wide_build(self): 491 string_tests.CommonTest.test_capitalize(self) 492 self.assertEqual(u'\U0001044F'.capitalize(), u'\U00010427') 493 self.assertEqual(u'\U0001044F\U0001044F'.capitalize(), 494 u'\U00010427\U0001044F') 495 self.assertEqual(u'\U00010427\U0001044F'.capitalize(), 496 u'\U00010427\U0001044F') 497 self.assertEqual(u'\U0001044F\U00010427'.capitalize(), 498 u'\U00010427\U0001044F') 499 self.assertEqual(u'X\U00010427x\U0001044F'.capitalize(), 500 u'X\U0001044Fx\U0001044F') 501 502 @requires_wide_build 503 def test_title(self): 504 string_tests.MixinStrUnicodeUserStringTest.test_title(self) 505 self.assertEqual(u'\U0001044F'.title(), u'\U00010427') 506 self.assertEqual(u'\U0001044F\U0001044F'.title(), 507 u'\U00010427\U0001044F') 508 self.assertEqual(u'\U0001044F\U0001044F \U0001044F\U0001044F'.title(), 509 u'\U00010427\U0001044F \U00010427\U0001044F') 510 self.assertEqual(u'\U00010427\U0001044F \U00010427\U0001044F'.title(), 511 u'\U00010427\U0001044F \U00010427\U0001044F') 512 self.assertEqual(u'\U0001044F\U00010427 \U0001044F\U00010427'.title(), 513 u'\U00010427\U0001044F \U00010427\U0001044F') 514 self.assertEqual(u'X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(), 515 u'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F') 516 517 @requires_wide_build 518 def test_swapcase(self): 519 string_tests.CommonTest.test_swapcase(self) 520 self.assertEqual(u'\U0001044F'.swapcase(), u'\U00010427') 521 self.assertEqual(u'\U00010427'.swapcase(), u'\U0001044F') 522 self.assertEqual(u'\U0001044F\U0001044F'.swapcase(), 523 u'\U00010427\U00010427') 524 self.assertEqual(u'\U00010427\U0001044F'.swapcase(), 525 u'\U0001044F\U00010427') 526 self.assertEqual(u'\U0001044F\U00010427'.swapcase(), 527 u'\U00010427\U0001044F') 528 self.assertEqual(u'X\U00010427x\U0001044F'.swapcase(), 529 u'x\U0001044FX\U00010427') 530 531 def test_contains(self): 532 # Testing Unicode contains method 533 self.assertIn('a', u'abdb') 534 self.assertIn('a', u'bdab') 535 self.assertIn('a', u'bdaba') 536 self.assertIn('a', u'bdba') 537 self.assertIn('a', u'bdba') 538 self.assertIn(u'a', u'bdba') 539 self.assertNotIn(u'a', u'bdb') 540 self.assertNotIn(u'a', 'bdb') 541 self.assertIn(u'a', 'bdba') 542 self.assertIn(u'a', ('a',1,None)) 543 self.assertIn(u'a', (1,None,'a')) 544 self.assertIn(u'a', (1,None,u'a')) 545 self.assertIn('a', ('a',1,None)) 546 self.assertIn('a', (1,None,'a')) 547 self.assertIn('a', (1,None,u'a')) 548 self.assertNotIn('a', ('x',1,u'y')) 549 self.assertNotIn('a', ('x',1,None)) 550 self.assertNotIn(u'abcd', u'abcxxxx') 551 self.assertIn(u'ab', u'abcd') 552 self.assertIn('ab', u'abc') 553 self.assertIn(u'ab', 'abc') 554 self.assertIn(u'ab', (1,None,u'ab')) 555 self.assertIn(u'', u'abc') 556 self.assertIn('', u'abc') 557 558 # If the following fails either 559 # the contains operator does not propagate UnicodeErrors or 560 # someone has changed the default encoding 561 self.assertRaises(UnicodeDecodeError, 'g\xe2teau'.__contains__, u'\xe2') 562 self.assertRaises(UnicodeDecodeError, u'g\xe2teau'.__contains__, '\xe2') 563 564 self.assertIn(u'', '') 565 self.assertIn('', u'') 566 self.assertIn(u'', u'') 567 self.assertIn(u'', 'abc') 568 self.assertIn('', u'abc') 569 self.assertIn(u'', u'abc') 570 self.assertNotIn(u'\0', 'abc') 571 self.assertNotIn('\0', u'abc') 572 self.assertNotIn(u'\0', u'abc') 573 self.assertIn(u'\0', '\0abc') 574 self.assertIn('\0', u'\0abc') 575 self.assertIn(u'\0', u'\0abc') 576 self.assertIn(u'\0', 'abc\0') 577 self.assertIn('\0', u'abc\0') 578 self.assertIn(u'\0', u'abc\0') 579 self.assertIn(u'a', '\0abc') 580 self.assertIn('a', u'\0abc') 581 self.assertIn(u'a', u'\0abc') 582 self.assertIn(u'asdf', 'asdf') 583 self.assertIn('asdf', u'asdf') 584 self.assertIn(u'asdf', u'asdf') 585 self.assertNotIn(u'asdf', 'asd') 586 self.assertNotIn('asdf', u'asd') 587 self.assertNotIn(u'asdf', u'asd') 588 self.assertNotIn(u'asdf', '') 589 self.assertNotIn('asdf', u'') 590 self.assertNotIn(u'asdf', u'') 591 592 self.assertRaises(TypeError, u"abc".__contains__) 593 self.assertRaises(TypeError, u"abc".__contains__, object()) 594 595 def test_formatting(self): 596 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self) 597 # Testing Unicode formatting strings... 598 self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc') 599 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000, 3.00') 600 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000, 3.00') 601 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000, 3.50') 602 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000, 3.57') 603 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57') 604 if not sys.platform.startswith('java'): 605 self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'") 606 self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def') 607 self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def') 608 609 self.assertEqual(u'%c' % 0x1234, u'\u1234') 610 self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,)) 611 self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3)) 612 613 for num in range(0x00,0x80): 614 char = chr(num) 615 self.assertEqual(u"%c" % char, unicode(char)) 616 self.assertEqual(u"%c" % num, unicode(char)) 617 self.assertTrue(char == u"%c" % char) 618 self.assertTrue(char == u"%c" % num) 619 # Issue 7649 620 for num in range(0x80,0x100): 621 uchar = unichr(num) 622 self.assertEqual(uchar, u"%c" % num) # works only with ints 623 self.assertEqual(uchar, u"%c" % uchar) # and unicode chars 624 # the implicit decoding should fail for non-ascii chars 625 self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(num)) 626 self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(num)) 627 628 # formatting jobs delegated from the string implementation: 629 self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...') 630 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 631 self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...') 632 self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...') 633 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...') 634 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...') 635 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...') 636 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...') 637 self.assertEqual('...%s...' % u"abc", u'...abc...') 638 self.assertEqual('%*s' % (5,u'abc',), u' abc') 639 self.assertEqual('%*s' % (-5,u'abc',), u'abc ') 640 self.assertEqual('%*.*s' % (5,2,u'abc',), u' ab') 641 self.assertEqual('%*.*s' % (5,3,u'abc',), u' abc') 642 self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10 abc') 643 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103 abc') 644 self.assertEqual('%c' % u'a', u'a') 645 class Wrapper: 646 def __str__(self): 647 return u'\u1234' 648 self.assertEqual('%s' % Wrapper(), u'\u1234') 649 650 def test_formatting_huge_precision(self): 651 format_string = u"%.{}f".format(sys.maxsize + 1) 652 with self.assertRaises(ValueError): 653 result = format_string % 2.34 654 655 @test_support.cpython_only 656 def test_formatting_huge_precision_c_limits(self): 657 from _testcapi import INT_MAX 658 format_string = u"%.{}f".format(INT_MAX + 1) 659 with self.assertRaises(ValueError): 660 result = format_string % 2.34 661 662 def test_formatting_huge_width(self): 663 format_string = u"%{}f".format(sys.maxsize + 1) 664 with self.assertRaises(ValueError): 665 result = format_string % 2.34 666 667 def test_startswith_endswith_errors(self): 668 for meth in (u'foo'.startswith, u'foo'.endswith): 669 with self.assertRaises(UnicodeDecodeError): 670 meth('\xff') 671 with self.assertRaises(TypeError) as cm: 672 meth(['f']) 673 exc = str(cm.exception) 674 self.assertIn('unicode', exc) 675 self.assertIn('str', exc) 676 self.assertIn('tuple', exc) 677 678 @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR') 679 def test_format_float(self): 680 # should not format with a comma, but always with C locale 681 self.assertEqual(u'1.0', u'%.1f' % 1.0) 682 683 def test_constructor(self): 684 # unicode(obj) tests (this maps to PyObject_Unicode() at C level) 685 686 self.assertEqual( 687 unicode(u'unicode remains unicode'), 688 u'unicode remains unicode' 689 ) 690 691 self.assertEqual( 692 unicode(UnicodeSubclass('unicode subclass becomes unicode')), 693 u'unicode subclass becomes unicode' 694 ) 695 696 self.assertEqual( 697 unicode('strings are converted to unicode'), 698 u'strings are converted to unicode' 699 ) 700 701 class UnicodeCompat: 702 def __init__(self, x): 703 self.x = x 704 def __unicode__(self): 705 return self.x 706 707 self.assertEqual( 708 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')), 709 u'__unicode__ compatible objects are recognized') 710 711 class StringCompat: 712 def __init__(self, x): 713 self.x = x 714 def __str__(self): 715 return self.x 716 717 self.assertEqual( 718 unicode(StringCompat('__str__ compatible objects are recognized')), 719 u'__str__ compatible objects are recognized' 720 ) 721 722 # unicode(obj) is compatible to str(): 723 724 o = StringCompat('unicode(obj) is compatible to str()') 725 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()') 726 self.assertEqual(str(o), 'unicode(obj) is compatible to str()') 727 728 # %-formatting and .__unicode__() 729 self.assertEqual(u'%s' % 730 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"), 731 u"u'%s' % obj uses obj.__unicode__()") 732 self.assertEqual(u'%s' % 733 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"), 734 u"u'%s' % obj falls back to obj.__str__()") 735 736 for obj in (123, 123.45, 123L): 737 self.assertEqual(unicode(obj), unicode(str(obj))) 738 739 # unicode(obj, encoding, error) tests (this maps to 740 # PyUnicode_FromEncodedObject() at C level) 741 742 if not sys.platform.startswith('java'): 743 self.assertRaises( 744 TypeError, 745 unicode, 746 u'decoding unicode is not supported', 747 'utf-8', 748 'strict' 749 ) 750 751 self.assertEqual( 752 unicode('strings are decoded to unicode', 'utf-8', 'strict'), 753 u'strings are decoded to unicode' 754 ) 755 756 if not sys.platform.startswith('java'): 757 with test_support.check_py3k_warnings(): 758 buf = buffer('character buffers are decoded to unicode') 759 self.assertEqual( 760 unicode( 761 buf, 762 'utf-8', 763 'strict' 764 ), 765 u'character buffers are decoded to unicode' 766 ) 767 768 self.assertRaises(TypeError, unicode, 42, 42, 42) 769 770 def test_codecs_utf7(self): 771 utfTests = [ 772 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example 773 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example 774 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example 775 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example 776 (u'+', '+-'), 777 (u'+-', '+--'), 778 (u'+?', '+-?'), 779 (u'\?', '+AFw?'), 780 (u'+?', '+-?'), 781 (ur'\\?', '+AFwAXA?'), 782 (ur'\\\?', '+AFwAXABc?'), 783 (ur'++--', '+-+---'), 784 (u'\U000abcde', '+2m/c3g-'), # surrogate pairs 785 (u'/', '/'), 786 ] 787 788 for (x, y) in utfTests: 789 self.assertEqual(x.encode('utf-7'), y) 790 791 # Unpaired surrogates are passed through 792 self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-') 793 self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x') 794 self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-') 795 self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x') 796 self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801') 797 self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x') 798 self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01') 799 self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x') 800 801 self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-') 802 self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde') 803 804 # Direct encoded characters 805 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" 806 # Optional direct characters 807 set_o = '!"#$%&*;<=>@[]^_`{|}' 808 for c in set_d: 809 self.assertEqual(c.encode('utf7'), c.encode('ascii')) 810 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c)) 811 self.assertTrue(c == c.encode('ascii').decode('utf7')) 812 for c in set_o: 813 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c)) 814 self.assertTrue(c == c.encode('ascii').decode('utf7')) 815 816 def test_codecs_utf8(self): 817 self.assertEqual(u''.encode('utf-8'), '') 818 self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac') 819 self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82') 820 self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96') 821 self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80') 822 self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80') 823 self.assertEqual( 824 (u'\ud800\udc02'*1000).encode('utf-8'), 825 '\xf0\x90\x80\x82'*1000 826 ) 827 self.assertEqual( 828 u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f' 829 u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00' 830 u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c' 831 u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067' 832 u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das' 833 u' Nunstuck git und'.encode('utf-8'), 834 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81' 835 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3' 836 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe' 837 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83' 838 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8' 839 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81' 840 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81' 841 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3' 842 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf' 843 '\xe3\x80\x8cWenn ist das Nunstuck git und' 844 ) 845 846 # UTF-8 specific decoding tests 847 self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456') 848 self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002') 849 self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac') 850 851 # Other possible utf-8 test cases: 852 # * strict decoding testing for all of the 853 # UTF8_ERROR cases in PyUnicode_DecodeUTF8 854 855 def test_utf8_decode_valid_sequences(self): 856 sequences = [ 857 # single byte 858 ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'), 859 # 2 bytes 860 ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'), 861 # 3 bytes 862 ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'), 863 ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'), 864 # 4 bytes 865 ('\xF0\x90\x80\x80', u'\U00010000'), 866 ('\xf4\x8f\xbf\xbf', u'\U0010FFFF') 867 ] 868 for seq, res in sequences: 869 self.assertEqual(seq.decode('utf-8'), res) 870 871 for ch in map(unichr, range(0, sys.maxunicode)): 872 self.assertEqual(ch, ch.encode('utf-8').decode('utf-8')) 873 874 def test_utf8_decode_invalid_sequences(self): 875 # continuation bytes in a sequence of 2, 3, or 4 bytes 876 continuation_bytes = map(chr, range(0x80, 0xC0)) 877 # start bytes of a 2-byte sequence equivalent to code points < 0x7F 878 invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2)) 879 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF 880 invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8)) 881 invalid_start_bytes = ( 882 continuation_bytes + invalid_2B_seq_start_bytes + 883 invalid_4B_seq_start_bytes + map(chr, range(0xF7, 0x100)) 884 ) 885 886 for byte in invalid_start_bytes: 887 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8') 888 889 for sb in invalid_2B_seq_start_bytes: 890 for cb in continuation_bytes: 891 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8') 892 893 for sb in invalid_4B_seq_start_bytes: 894 for cb1 in continuation_bytes[:3]: 895 for cb3 in continuation_bytes[:3]: 896 self.assertRaises(UnicodeDecodeError, 897 (sb+cb1+'\x80'+cb3).decode, 'utf-8') 898 899 for cb in map(chr, range(0x80, 0xA0)): 900 self.assertRaises(UnicodeDecodeError, 901 ('\xE0'+cb+'\x80').decode, 'utf-8') 902 self.assertRaises(UnicodeDecodeError, 903 ('\xE0'+cb+'\xBF').decode, 'utf-8') 904 # XXX: surrogates shouldn't be valid UTF-8! 905 # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 906 # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt 907 #for cb in map(chr, range(0xA0, 0xC0)): 908 #self.assertRaises(UnicodeDecodeError, 909 #('\xED'+cb+'\x80').decode, 'utf-8') 910 #self.assertRaises(UnicodeDecodeError, 911 #('\xED'+cb+'\xBF').decode, 'utf-8') 912 # but since they are valid on Python 2 add a test for that: 913 for cb, surrogate in zip(map(chr, range(0xA0, 0xC0)), 914 map(unichr, range(0xd800, 0xe000, 64))): 915 encoded = '\xED'+cb+'\x80' 916 self.assertEqual(encoded.decode('utf-8'), surrogate) 917 self.assertEqual(surrogate.encode('utf-8'), encoded) 918 919 for cb in map(chr, range(0x80, 0x90)): 920 self.assertRaises(UnicodeDecodeError, 921 ('\xF0'+cb+'\x80\x80').decode, 'utf-8') 922 self.assertRaises(UnicodeDecodeError, 923 ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8') 924 for cb in map(chr, range(0x90, 0xC0)): 925 self.assertRaises(UnicodeDecodeError, 926 ('\xF4'+cb+'\x80\x80').decode, 'utf-8') 927 self.assertRaises(UnicodeDecodeError, 928 ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8') 929 930 def test_issue8271(self): 931 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence, 932 # only the start byte and the continuation byte(s) are now considered 933 # invalid, instead of the number of bytes specified by the start byte. 934 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95, 935 # table 3-8, Row 2) for more information about the algorithm used. 936 FFFD = u'\ufffd' 937 sequences = [ 938 # invalid start bytes 939 ('\x80', FFFD), # continuation byte 940 ('\x80\x80', FFFD*2), # 2 continuation bytes 941 ('\xc0', FFFD), 942 ('\xc0\xc0', FFFD*2), 943 ('\xc1', FFFD), 944 ('\xc1\xc0', FFFD*2), 945 ('\xc0\xc1', FFFD*2), 946 # with start byte of a 2-byte sequence 947 ('\xc2', FFFD), # only the start byte 948 ('\xc2\xc2', FFFD*2), # 2 start bytes 949 ('\xc2\xc2\xc2', FFFD*3), # 2 start bytes 950 ('\xc2\x41', FFFD+'A'), # invalid continuation byte 951 # with start byte of a 3-byte sequence 952 ('\xe1', FFFD), # only the start byte 953 ('\xe1\xe1', FFFD*2), # 2 start bytes 954 ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes 955 ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes 956 ('\xe1\x80', FFFD), # only 1 continuation byte 957 ('\xe1\x41', FFFD+'A'), # invalid continuation byte 958 ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb 959 ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes 960 ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte 961 ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid 962 ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid 963 # with start byte of a 4-byte sequence 964 ('\xf1', FFFD), # only the start byte 965 ('\xf1\xf1', FFFD*2), # 2 start bytes 966 ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes 967 ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes 968 ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes 969 ('\xf1\x80', FFFD), # only 1 continuation bytes 970 ('\xf1\x80\x80', FFFD), # only 2 continuation bytes 971 ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid 972 ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid 973 ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid 974 ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid 975 ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid 976 ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid 977 ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid 978 ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD), 979 ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2), 980 ('\xf1\xf1\x80\x41', FFFD*2+'A'), 981 ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2), 982 # with invalid start byte of a 4-byte sequence (rfc2279) 983 ('\xf5', FFFD), # only the start byte 984 ('\xf5\xf5', FFFD*2), # 2 start bytes 985 ('\xf5\x80', FFFD*2), # only 1 continuation byte 986 ('\xf5\x80\x80', FFFD*3), # only 2 continuation byte 987 ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes 988 ('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid 989 ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD), 990 ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'), 991 # with invalid start byte of a 5-byte sequence (rfc2279) 992 ('\xf8', FFFD), # only the start byte 993 ('\xf8\xf8', FFFD*2), # 2 start bytes 994 ('\xf8\x80', FFFD*2), # only one continuation byte 995 ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid 996 ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes 997 # with invalid start byte of a 6-byte sequence (rfc2279) 998 ('\xfc', FFFD), # only the start byte 999 ('\xfc\xfc', FFFD*2), # 2 start bytes 1000 ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes 1001 ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes 1002 # invalid start byte 1003 ('\xfe', FFFD), 1004 ('\xfe\x80\x80', FFFD*3), 1005 # other sequences 1006 ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'), 1007 ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'), 1008 ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'), 1009 ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64', 1010 u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'), 1011 ] 1012 for n, (seq, res) in enumerate(sequences): 1013 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict') 1014 self.assertEqual(seq.decode('utf-8', 'replace'), res) 1015 self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b') 1016 self.assertEqual(seq.decode('utf-8', 'ignore'), 1017 res.replace(u'\uFFFD', '')) 1018 1019 def test_codecs_idna(self): 1020 # Test whether trailing dot is preserved 1021 self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.") 1022 1023 def test_codecs_errors(self): 1024 # Error handling (encoding) 1025 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii') 1026 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict') 1027 self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x") 1028 self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x") 1029 self.assertEqual(u'Andr\202 x'.encode('ascii', 'replace'), 1030 u'Andr\202 x'.encode('ascii', errors='replace')) 1031 self.assertEqual(u'Andr\202 x'.encode('ascii', 'ignore'), 1032 u'Andr\202 x'.encode(encoding='ascii', errors='ignore')) 1033 1034 # Error handling (decoding) 1035 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii') 1036 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict') 1037 self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x") 1038 self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x') 1039 self.assertEqual(unicode('\202 x', 'ascii', 'replace'), u'\uFFFD x') 1040 with test_support.check_py3k_warnings(): 1041 self.assertEqual(u'abcde'.decode('ascii', 'ignore'), 1042 u'abcde'.decode('ascii', errors='ignore')) 1043 with test_support.check_py3k_warnings(): 1044 self.assertEqual(u'abcde'.decode('ascii', 'replace'), 1045 u'abcde'.decode(encoding='ascii', errors='replace')) 1046 1047 # Error handling (unknown character names) 1048 self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx") 1049 1050 # Error handling (truncated escape sequence) 1051 self.assertRaises(UnicodeError, "\\".decode, "unicode-escape") 1052 1053 self.assertRaises(TypeError, "hello".decode, "test.unicode1") 1054 self.assertRaises(TypeError, unicode, "hello", "test.unicode2") 1055 self.assertRaises(TypeError, u"hello".encode, "test.unicode1") 1056 self.assertRaises(TypeError, u"hello".encode, "test.unicode2") 1057 # executes PyUnicode_Encode() 1058 import imp 1059 self.assertRaises( 1060 ImportError, 1061 imp.find_module, 1062 "non-existing module", 1063 [u"non-existing dir"] 1064 ) 1065 1066 # Error handling (wrong arguments) 1067 self.assertRaises(TypeError, u"hello".encode, 42, 42, 42) 1068 1069 # Error handling (PyUnicode_EncodeDecimal()) 1070 self.assertRaises(UnicodeError, int, u"\u0200") 1071 1072 def test_codecs(self): 1073 # Encoding 1074 self.assertEqual(u'hello'.encode('ascii'), 'hello') 1075 self.assertEqual(u'hello'.encode('utf-7'), 'hello') 1076 self.assertEqual(u'hello'.encode('utf-8'), 'hello') 1077 self.assertEqual(u'hello'.encode('utf8'), 'hello') 1078 self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000') 1079 self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o') 1080 self.assertEqual(u'hello'.encode('latin-1'), 'hello') 1081 1082 # Roundtrip safety for BMP (just the first 1024 chars) 1083 for c in xrange(1024): 1084 u = unichr(c) 1085 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 1086 'utf-16-be', 'raw_unicode_escape', 1087 'unicode_escape', 'unicode_internal'): 1088 self.assertEqual(unicode(u.encode(encoding),encoding), u) 1089 1090 # Roundtrip safety for BMP (just the first 256 chars) 1091 for c in xrange(256): 1092 u = unichr(c) 1093 for encoding in ('latin-1',): 1094 self.assertEqual(unicode(u.encode(encoding),encoding), u) 1095 1096 # Roundtrip safety for BMP (just the first 128 chars) 1097 for c in xrange(128): 1098 u = unichr(c) 1099 for encoding in ('ascii',): 1100 self.assertEqual(unicode(u.encode(encoding),encoding), u) 1101 1102 # Roundtrip safety for non-BMP (just a few chars) 1103 u = u'\U00010001\U00020002\U00030003\U00040004\U00050005' 1104 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', 1105 #'raw_unicode_escape', 1106 'unicode_escape', 'unicode_internal'): 1107 self.assertEqual(unicode(u.encode(encoding),encoding), u) 1108 1109 # UTF-8 must be roundtrip safe for all UCS-2 code points 1110 # This excludes surrogates: in the full range, there would be 1111 # a surrogate pair (\udbff\udc00), which gets converted back 1112 # to a non-BMP character (\U0010fc00) 1113 u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000))) 1114 for encoding in ('utf-8',): 1115 self.assertEqual(unicode(u.encode(encoding),encoding), u) 1116 1117 def test_codecs_charmap(self): 1118 # 0-127 1119 s = ''.join(map(chr, xrange(128))) 1120 for encoding in ( 1121 'cp037', 'cp1026', 1122 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 1123 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 1124 'cp863', 'cp865', 'cp866', 1125 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 1126 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 1127 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1', 1128 'mac_cyrillic', 'mac_latin2', 1129 1130 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 1131 'cp1256', 'cp1257', 'cp1258', 1132 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 1133 1134 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', 1135 'cp1006', 'iso8859_8', 1136 1137 ### These have undefined mappings: 1138 #'cp424', 1139 1140 ### These fail the round-trip: 1141 #'cp875' 1142 1143 ): 1144 self.assertEqual(unicode(s, encoding).encode(encoding), s) 1145 1146 # 128-255 1147 s = ''.join(map(chr, xrange(128, 256))) 1148 for encoding in ( 1149 'cp037', 'cp1026', 1150 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 1151 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 1152 'cp863', 'cp865', 'cp866', 1153 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 1154 'iso8859_2', 'iso8859_4', 'iso8859_5', 1155 'iso8859_9', 'koi8_r', 'latin_1', 1156 'mac_cyrillic', 'mac_latin2', 1157 1158 ### These have undefined mappings: 1159 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 1160 #'cp1256', 'cp1257', 'cp1258', 1161 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 1162 #'iso8859_3', 'iso8859_6', 'iso8859_7', 1163 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', 1164 1165 ### These fail the round-trip: 1166 #'cp1006', 'cp875', 'iso8859_8', 1167 1168 ): 1169 self.assertEqual(unicode(s, encoding).encode(encoding), s) 1170 1171 def test_concatenation(self): 1172 self.assertEqual((u"abc" u"def"), u"abcdef") 1173 self.assertEqual(("abc" u"def"), u"abcdef") 1174 self.assertEqual((u"abc" "def"), u"abcdef") 1175 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi") 1176 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi") 1177 1178 def test_printing(self): 1179 class BitBucket: 1180 def write(self, text): 1181 pass 1182 1183 out = BitBucket() 1184 print >>out, u'abc' 1185 print >>out, u'abc', u'def' 1186 print >>out, u'abc', 'def' 1187 print >>out, 'abc', u'def' 1188 print >>out, u'abc\n' 1189 print >>out, u'abc\n', 1190 print >>out, u'abc\n', 1191 print >>out, u'def\n' 1192 print >>out, u'def\n' 1193 1194 def test_ucs4(self): 1195 x = u'\U00100000' 1196 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") 1197 self.assertEqual(x, y) 1198 1199 y = r'\U00100000' 1200 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") 1201 self.assertEqual(x, y) 1202 y = r'\U00010000' 1203 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") 1204 self.assertEqual(x, y) 1205 1206 try: 1207 '\U11111111'.decode("raw-unicode-escape") 1208 except UnicodeDecodeError as e: 1209 self.assertEqual(e.start, 0) 1210 self.assertEqual(e.end, 10) 1211 else: 1212 self.fail("Should have raised UnicodeDecodeError") 1213 1214 def test_conversion(self): 1215 # Make sure __unicode__() works properly 1216 class Foo0: 1217 def __str__(self): 1218 return "foo" 1219 1220 class Foo1: 1221 def __unicode__(self): 1222 return u"foo" 1223 1224 class Foo2(object): 1225 def __unicode__(self): 1226 return u"foo" 1227 1228 class Foo3(object): 1229 def __unicode__(self): 1230 return "foo" 1231 1232 class Foo4(str): 1233 def __unicode__(self): 1234 return "foo" 1235 1236 class Foo5(unicode): 1237 def __unicode__(self): 1238 return "foo" 1239 1240 class Foo6(str): 1241 def __str__(self): 1242 return "foos" 1243 1244 def __unicode__(self): 1245 return u"foou" 1246 1247 class Foo7(unicode): 1248 def __str__(self): 1249 return "foos" 1250 def __unicode__(self): 1251 return u"foou" 1252 1253 class Foo8(unicode): 1254 def __new__(cls, content=""): 1255 return unicode.__new__(cls, 2*content) 1256 def __unicode__(self): 1257 return self 1258 1259 class Foo9(unicode): 1260 def __str__(self): 1261 return "string" 1262 def __unicode__(self): 1263 return "not unicode" 1264 1265 self.assertEqual(unicode(Foo0()), u"foo") 1266 self.assertEqual(unicode(Foo1()), u"foo") 1267 self.assertEqual(unicode(Foo2()), u"foo") 1268 self.assertEqual(unicode(Foo3()), u"foo") 1269 self.assertEqual(unicode(Foo4("bar")), u"foo") 1270 self.assertEqual(unicode(Foo5("bar")), u"foo") 1271 self.assertEqual(unicode(Foo6("bar")), u"foou") 1272 self.assertEqual(unicode(Foo7("bar")), u"foou") 1273 self.assertEqual(unicode(Foo8("foo")), u"foofoo") 1274 self.assertIs(type(unicode(Foo8("foo"))), Foo8) 1275 self.assertEqual(UnicodeSubclass(Foo8("foo")), u"foofoo") 1276 self.assertIs(type(UnicodeSubclass(Foo8("foo"))), UnicodeSubclass) 1277 self.assertEqual(str(Foo9("foo")), "string") 1278 self.assertEqual(unicode(Foo9("foo")), u"not unicode") 1279 1280 def test_unicode_repr(self): 1281 class s1: 1282 def __repr__(self): 1283 return '\\n' 1284 1285 class s2: 1286 def __repr__(self): 1287 return u'\\n' 1288 1289 self.assertEqual(repr(s1()), '\\n') 1290 self.assertEqual(repr(s2()), '\\n') 1291 1292 # This test only affects 32-bit platforms because expandtabs can only take 1293 # an int as the max value, not a 64-bit C long. If expandtabs is changed 1294 # to take a 64-bit long, this test should apply to all platforms. 1295 @unittest.skipIf(sys.maxint > (1 << 32) or struct.calcsize('P') != 4, 1296 'only applies to 32-bit platforms') 1297 def test_expandtabs_overflows_gracefully(self): 1298 self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint) 1299 1300 def test__format__(self): 1301 def test(value, format, expected): 1302 # test both with and without the trailing 's' 1303 self.assertEqual(value.__format__(format), expected) 1304 self.assertEqual(value.__format__(format + u's'), expected) 1305 1306 test(u'', u'', u'') 1307 test(u'abc', u'', u'abc') 1308 test(u'abc', u'.3', u'abc') 1309 test(u'ab', u'.3', u'ab') 1310 test(u'abcdef', u'.3', u'abc') 1311 test(u'abcdef', u'.0', u'') 1312 test(u'abc', u'3.3', u'abc') 1313 test(u'abc', u'2.3', u'abc') 1314 test(u'abc', u'2.2', u'ab') 1315 test(u'abc', u'3.2', u'ab ') 1316 test(u'result', u'x<0', u'result') 1317 test(u'result', u'x<5', u'result') 1318 test(u'result', u'x<6', u'result') 1319 test(u'result', u'x<7', u'resultx') 1320 test(u'result', u'x<8', u'resultxx') 1321 test(u'result', u' <7', u'result ') 1322 test(u'result', u'<7', u'result ') 1323 test(u'result', u'>7', u' result') 1324 test(u'result', u'>8', u' result') 1325 test(u'result', u'^8', u' result ') 1326 test(u'result', u'^9', u' result ') 1327 test(u'result', u'^10', u' result ') 1328 test(u'a', u'10000', u'a' + u' ' * 9999) 1329 test(u'', u'10000', u' ' * 10000) 1330 test(u'', u'10000000', u' ' * 10000000) 1331 1332 # test mixing unicode and str 1333 self.assertEqual(u'abc'.__format__('s'), u'abc') 1334 self.assertEqual(u'abc'.__format__('->10s'), u'-------abc') 1335 1336 def test_format(self): 1337 self.assertEqual(u''.format(), u'') 1338 self.assertEqual(u'a'.format(), u'a') 1339 self.assertEqual(u'ab'.format(), u'ab') 1340 self.assertEqual(u'a{{'.format(), u'a{') 1341 self.assertEqual(u'a}}'.format(), u'a}') 1342 self.assertEqual(u'{{b'.format(), u'{b') 1343 self.assertEqual(u'}}b'.format(), u'}b') 1344 self.assertEqual(u'a{{b'.format(), u'a{b') 1345 1346 # examples from the PEP: 1347 import datetime 1348 self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred") 1349 self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')), 1350 u"My name is Fred") 1351 self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'), 1352 u"My name is Fred :-{}") 1353 1354 # datetime.__format__ doesn't work with unicode 1355 #d = datetime.date(2007, 8, 18) 1356 #self.assertEqual("The year is {0.year}".format(d), 1357 # "The year is 2007") 1358 1359 # classes we'll use for testing 1360 class C: 1361 def __init__(self, x=100): 1362 self._x = x 1363 def __format__(self, spec): 1364 return spec 1365 1366 class D: 1367 def __init__(self, x): 1368 self.x = x 1369 def __format__(self, spec): 1370 return str(self.x) 1371 1372 # class with __str__, but no __format__ 1373 class E: 1374 def __init__(self, x): 1375 self.x = x 1376 def __str__(self): 1377 return u'E(' + self.x + u')' 1378 1379 # class with __repr__, but no __format__ or __str__ 1380 class F: 1381 def __init__(self, x): 1382 self.x = x 1383 def __repr__(self): 1384 return u'F(' + self.x + u')' 1385 1386 # class with __format__ that forwards to string, for some format_spec's 1387 class G: 1388 def __init__(self, x): 1389 self.x = x 1390 def __str__(self): 1391 return u"string is " + self.x 1392 def __format__(self, format_spec): 1393 if format_spec == 'd': 1394 return u'G(' + self.x + u')' 1395 return object.__format__(self, format_spec) 1396 1397 # class that returns a bad type from __format__ 1398 class H: 1399 def __format__(self, format_spec): 1400 return 1.0 1401 1402 class I(datetime.date): 1403 def __format__(self, format_spec): 1404 return self.strftime(format_spec) 1405 1406 class J(int): 1407 def __format__(self, format_spec): 1408 return int.__format__(self * 2, format_spec) 1409 1410 1411 self.assertEqual(u''.format(), u'') 1412 self.assertEqual(u'abc'.format(), u'abc') 1413 self.assertEqual(u'{0}'.format(u'abc'), u'abc') 1414 self.assertEqual(u'{0:}'.format(u'abc'), u'abc') 1415 self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc') 1416 self.assertEqual(u'{0}X'.format(u'abc'), u'abcX') 1417 self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY') 1418 self.assertEqual(u'{1}'.format(1, u'abc'), u'abc') 1419 self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc') 1420 self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX') 1421 self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY') 1422 self.assertEqual(u'{0}'.format(-15), u'-15') 1423 self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc') 1424 self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc') 1425 self.assertEqual(u'{{'.format(), u'{') 1426 self.assertEqual(u'}}'.format(), u'}') 1427 self.assertEqual(u'{{}}'.format(), u'{}') 1428 self.assertEqual(u'{{x}}'.format(), u'{x}') 1429 self.assertEqual(u'{{{0}}}'.format(123), u'{123}') 1430 self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}') 1431 self.assertEqual(u'}}{{'.format(), u'}{') 1432 self.assertEqual(u'}}x{{'.format(), u'}x{') 1433 1434 # weird field names 1435 self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz') 1436 self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz') 1437 self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3') 1438 1439 self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20') 1440 self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010') 1441 self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc') 1442 self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc') 1443 self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def') 1444 self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def') 1445 self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def') 1446 1447 # strings 1448 self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc') 1449 self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab') 1450 self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc') 1451 self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'') 1452 self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc') 1453 self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc') 1454 self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab') 1455 self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ') 1456 self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result') 1457 self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result') 1458 self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result') 1459 self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx') 1460 self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx') 1461 self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ') 1462 self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ') 1463 self.assertEqual(u'{0:>7s}'.format(u'result'), u' result') 1464 self.assertEqual(u'{0:>8s}'.format(u'result'), u' result') 1465 self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ') 1466 self.assertEqual(u'{0:^9s}'.format(u'result'), u' result ') 1467 self.assertEqual(u'{0:^10s}'.format(u'result'), u' result ') 1468 self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999) 1469 self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000) 1470 self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000) 1471 1472 # issue 12546: use \x00 as a fill character 1473 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00') 1474 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01') 1475 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00') 1476 self.assertEqual('{0:^6s}'.format('foo'), ' foo ') 1477 1478 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00') 1479 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01') 1480 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00') 1481 self.assertEqual('{0:<6}'.format(3), '3 ') 1482 1483 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00') 1484 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01') 1485 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00') 1486 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ') 1487 1488 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00') 1489 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01') 1490 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00') 1491 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ') 1492 1493 # format specifiers for user defined type 1494 self.assertEqual(u'{0:abc}'.format(C()), u'abc') 1495 1496 # !r and !s coercions 1497 self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello') 1498 self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello') 1499 self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello ') 1500 self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello ') 1501 self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'") 1502 self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'") 1503 self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)') 1504 1505 # test fallback to object.__format__ 1506 self.assertEqual(u'{0}'.format({}), u'{}') 1507 self.assertEqual(u'{0}'.format([]), u'[]') 1508 self.assertEqual(u'{0}'.format([1]), u'[1]') 1509 self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)') 1510 self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)') 1511 self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data') 1512 1513 msg = 'object.__format__ with a non-empty format string is deprecated' 1514 with test_support.check_warnings((msg, PendingDeprecationWarning)): 1515 self.assertEqual(u'{0:^10}'.format(E(u'data')), u' E(data) ') 1516 self.assertEqual(u'{0:^10s}'.format(E(u'data')), u' E(data) ') 1517 self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data') 1518 1519 self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007, 1520 month=8, 1521 day=27)), 1522 u"date: 2007-08-27") 1523 1524 # test deriving from a builtin type and overriding __format__ 1525 self.assertEqual(u"{0}".format(J(10)), u"20") 1526 1527 1528 # string format specifiers 1529 self.assertEqual(u'{0:}'.format('a'), u'a') 1530 1531 # computed format specifiers 1532 self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello') 1533 self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello') 1534 self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5), u'hello') 1535 self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width=10, precision=5), u'hello ') 1536 self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), u'hello ') 1537 1538 # test various errors 1539 self.assertRaises(ValueError, u'{'.format) 1540 self.assertRaises(ValueError, u'}'.format) 1541 self.assertRaises(ValueError, u'a{'.format) 1542 self.assertRaises(ValueError, u'a}'.format) 1543 self.assertRaises(ValueError, u'{a'.format) 1544 self.assertRaises(ValueError, u'}a'.format) 1545 self.assertRaises(IndexError, u'{0}'.format) 1546 self.assertRaises(IndexError, u'{1}'.format, u'abc') 1547 self.assertRaises(KeyError, u'{x}'.format) 1548 self.assertRaises(ValueError, u"}{".format) 1549 self.assertRaises(ValueError, u"{".format) 1550 self.assertRaises(ValueError, u"}".format) 1551 self.assertRaises(ValueError, u"abc{0:{}".format) 1552 self.assertRaises(ValueError, u"{0".format) 1553 self.assertRaises(IndexError, u"{0.}".format) 1554 self.assertRaises(ValueError, u"{0.}".format, 0) 1555 self.assertRaises(IndexError, u"{0[}".format) 1556 self.assertRaises(ValueError, u"{0[}".format, []) 1557 self.assertRaises(KeyError, u"{0]}".format) 1558 self.assertRaises(ValueError, u"{0.[]}".format, 0) 1559 self.assertRaises(ValueError, u"{0..foo}".format, 0) 1560 self.assertRaises(ValueError, u"{0[0}".format, 0) 1561 self.assertRaises(ValueError, u"{0[0:foo}".format, 0) 1562 self.assertRaises(KeyError, u"{c]}".format) 1563 self.assertRaises(ValueError, u"{{ {{{0}}".format, 0) 1564 self.assertRaises(ValueError, u"{0}}".format, 0) 1565 self.assertRaises(KeyError, u"{foo}".format, bar=3) 1566 self.assertRaises(ValueError, u"{0!x}".format, 3) 1567 self.assertRaises(ValueError, u"{0!}".format, 0) 1568 self.assertRaises(ValueError, u"{0!rs}".format, 0) 1569 self.assertRaises(ValueError, u"{!}".format) 1570 self.assertRaises(IndexError, u"{:}".format) 1571 self.assertRaises(IndexError, u"{:s}".format) 1572 self.assertRaises(IndexError, u"{}".format) 1573 big = u"23098475029384702983476098230754973209482573" 1574 self.assertRaises(ValueError, (u"{" + big + u"}").format) 1575 self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0]) 1576 1577 # issue 6089 1578 self.assertRaises(ValueError, u"{0[0]x}".format, [None]) 1579 self.assertRaises(ValueError, u"{0[0](10)}".format, [None]) 1580 1581 # can't have a replacement on the field name portion 1582 self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4) 1583 1584 # exceed maximum recursion depth 1585 self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'') 1586 self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format, 1587 0, 1, 2, 3, 4, 5, 6, 7) 1588 1589 # string format spec errors 1590 self.assertRaises(ValueError, u"{0:-s}".format, u'') 1591 self.assertRaises(ValueError, format, u"", u"-") 1592 self.assertRaises(ValueError, u"{0:=s}".format, u'') 1593 1594 # test combining string and unicode 1595 self.assertEqual(u"foo{0}".format('bar'), u'foobar') 1596 # This will try to convert the argument from unicode to str, which 1597 # will succeed 1598 self.assertEqual("foo{0}".format(u'bar'), 'foobar') 1599 # This will try to convert the argument from unicode to str, which 1600 # will fail 1601 self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar') 1602 1603 def test_format_huge_precision(self): 1604 format_string = u".{}f".format(sys.maxsize + 1) 1605 with self.assertRaises(ValueError): 1606 result = format(2.34, format_string) 1607 1608 def test_format_huge_width(self): 1609 format_string = u"{}f".format(sys.maxsize + 1) 1610 with self.assertRaises(ValueError): 1611 result = format(2.34, format_string) 1612 1613 def test_format_huge_item_number(self): 1614 format_string = u"{{{}:.6f}}".format(sys.maxsize + 1) 1615 with self.assertRaises(ValueError): 1616 result = format_string.format(2.34) 1617 1618 def test_format_auto_numbering(self): 1619 class C: 1620 def __init__(self, x=100): 1621 self._x = x 1622 def __format__(self, spec): 1623 return spec 1624 1625 self.assertEqual(u'{}'.format(10), u'10') 1626 self.assertEqual(u'{:5}'.format('s'), u's ') 1627 self.assertEqual(u'{!r}'.format('s'), u"'s'") 1628 self.assertEqual(u'{._x}'.format(C(10)), u'10') 1629 self.assertEqual(u'{[1]}'.format([1, 2]), u'2') 1630 self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4') 1631 self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c') 1632 1633 self.assertEqual(u'a{:{}}b'.format('x', '^10'), u'a x b') 1634 self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b') 1635 1636 # can't mix and match numbering and auto-numbering 1637 self.assertRaises(ValueError, u'{}{1}'.format, 1, 2) 1638 self.assertRaises(ValueError, u'{1}{}'.format, 1, 2) 1639 self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2) 1640 self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2) 1641 1642 # can mix and match auto-numbering and named 1643 self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4') 1644 self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test') 1645 self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3') 1646 self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g') 1647 1648 def test_raiseMemError(self): 1649 # Ensure that the freelist contains a consistent object, even 1650 # when a string allocation fails with a MemoryError. 1651 # This used to crash the interpreter, 1652 # or leak references when the number was smaller. 1653 charwidth = 4 if sys.maxunicode >= 0x10000 else 2 1654 # Note: sys.maxsize is half of the actual max allocation because of 1655 # the signedness of Py_ssize_t. 1656 alloc = lambda: u"a" * (sys.maxsize // charwidth * 2) 1657 self.assertRaises(MemoryError, alloc) 1658 self.assertRaises(MemoryError, alloc) 1659 1660 def test_format_subclass(self): 1661 class U(unicode): 1662 def __unicode__(self): 1663 return u'__unicode__ overridden' 1664 u = U(u'xxx') 1665 self.assertEqual("%s" % u, u'__unicode__ overridden') 1666 self.assertEqual("{}".format(u), '__unicode__ overridden') 1667 1668 def test_free_after_iterating(self): 1669 test_support.check_free_after_iterating(self, iter, unicode) 1670 test_support.check_free_after_iterating(self, reversed, unicode) 1671 1672 1673class CAPITest(unittest.TestCase): 1674 1675 # Test PyUnicode_FromFormat() 1676 def test_from_format(self): 1677 test_support.import_module('ctypes') 1678 from ctypes import ( 1679 pythonapi, py_object, sizeof, 1680 c_int, c_long, c_longlong, c_ssize_t, 1681 c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p) 1682 if sys.maxunicode == 0xffff: 1683 name = "PyUnicodeUCS2_FromFormat" 1684 else: 1685 name = "PyUnicodeUCS4_FromFormat" 1686 _PyUnicode_FromFormat = getattr(pythonapi, name) 1687 _PyUnicode_FromFormat.restype = py_object 1688 1689 def PyUnicode_FromFormat(format, *args): 1690 cargs = tuple( 1691 py_object(arg) if isinstance(arg, unicode) else arg 1692 for arg in args) 1693 return _PyUnicode_FromFormat(format, *cargs) 1694 1695 def check_format(expected, format, *args): 1696 text = PyUnicode_FromFormat(format, *args) 1697 self.assertEqual(expected, text) 1698 1699 # ascii format, non-ascii argument 1700 check_format(u'ascii\x7f=unicode\xe9', 1701 b'ascii\x7f=%U', u'unicode\xe9') 1702 1703 # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() 1704 # raises an error 1705 #self.assertRaisesRegex(ValueError, 1706 # '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' 1707 # 'string, got a non-ASCII byte: 0xe9$', 1708 # PyUnicode_FromFormat, b'unicode\xe9=%s', u'ascii') 1709 1710 # test "%c" 1711 check_format(u'\uabcd', 1712 b'%c', c_int(0xabcd)) 1713 if sys.maxunicode > 0xffff: 1714 check_format(u'\U0010ffff', 1715 b'%c', c_int(0x10ffff)) 1716 else: 1717 with self.assertRaises(OverflowError): 1718 PyUnicode_FromFormat(b'%c', c_int(0x10000)) 1719 with self.assertRaises(OverflowError): 1720 PyUnicode_FromFormat(b'%c', c_int(0x110000)) 1721 # Issue #18183 1722 if sys.maxunicode > 0xffff: 1723 check_format(u'\U00010000\U00100000', 1724 b'%c%c', c_int(0x10000), c_int(0x100000)) 1725 1726 # test "%" 1727 check_format(u'%', 1728 b'%') 1729 check_format(u'%', 1730 b'%%') 1731 check_format(u'%s', 1732 b'%%s') 1733 check_format(u'[%]', 1734 b'[%%]') 1735 check_format(u'%abc', 1736 b'%%%s', b'abc') 1737 1738 # test %S 1739 check_format(u"repr=abc", 1740 b'repr=%S', u'abc') 1741 1742 # test %R 1743 check_format(u"repr=u'abc'", 1744 b'repr=%R', u'abc') 1745 1746 # test integer formats (%i, %d, %u) 1747 check_format(u'010', 1748 b'%03i', c_int(10)) 1749 check_format(u'0010', 1750 b'%0.4i', c_int(10)) 1751 check_format(u'-123', 1752 b'%i', c_int(-123)) 1753 1754 check_format(u'-123', 1755 b'%d', c_int(-123)) 1756 check_format(u'-123', 1757 b'%ld', c_long(-123)) 1758 check_format(u'-123', 1759 b'%zd', c_ssize_t(-123)) 1760 1761 check_format(u'123', 1762 b'%u', c_uint(123)) 1763 check_format(u'123', 1764 b'%lu', c_ulong(123)) 1765 check_format(u'123', 1766 b'%zu', c_size_t(123)) 1767 1768 # test long output 1769 min_long = -(2 ** (8 * sizeof(c_long) - 1)) 1770 max_long = -min_long - 1 1771 check_format(unicode(min_long), 1772 b'%ld', c_long(min_long)) 1773 check_format(unicode(max_long), 1774 b'%ld', c_long(max_long)) 1775 max_ulong = 2 ** (8 * sizeof(c_ulong)) - 1 1776 check_format(unicode(max_ulong), 1777 b'%lu', c_ulong(max_ulong)) 1778 PyUnicode_FromFormat(b'%p', c_void_p(-1)) 1779 1780 # test padding (width and/or precision) 1781 check_format(u'123'.rjust(10, u'0'), 1782 b'%010i', c_int(123)) 1783 check_format(u'123'.rjust(100), 1784 b'%100i', c_int(123)) 1785 check_format(u'123'.rjust(100, u'0'), 1786 b'%.100i', c_int(123)) 1787 check_format(u'123'.rjust(80, u'0').rjust(100), 1788 b'%100.80i', c_int(123)) 1789 1790 check_format(u'123'.rjust(10, u'0'), 1791 b'%010u', c_uint(123)) 1792 check_format(u'123'.rjust(100), 1793 b'%100u', c_uint(123)) 1794 check_format(u'123'.rjust(100, u'0'), 1795 b'%.100u', c_uint(123)) 1796 check_format(u'123'.rjust(80, u'0').rjust(100), 1797 b'%100.80u', c_uint(123)) 1798 1799 check_format(u'123'.rjust(10, u'0'), 1800 b'%010x', c_int(0x123)) 1801 check_format(u'123'.rjust(100), 1802 b'%100x', c_int(0x123)) 1803 check_format(u'123'.rjust(100, u'0'), 1804 b'%.100x', c_int(0x123)) 1805 check_format(u'123'.rjust(80, u'0').rjust(100), 1806 b'%100.80x', c_int(0x123)) 1807 1808 # test %V 1809 check_format(u'repr=abc', 1810 b'repr=%V', u'abc', b'xyz') 1811 check_format(u'repr=\xe4\xba\xba\xe6\xb0\x91', 1812 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') 1813 check_format(u'repr=abc\xff', 1814 b'repr=%V', None, b'abc\xff') 1815 1816 # not supported: copy the raw format string. these tests are just here 1817 # to check for crashes and should not be considered as specifications 1818 check_format(u'%s', 1819 b'%1%s', b'abc') 1820 check_format(u'%1abc', 1821 b'%1abc') 1822 check_format(u'%+i', 1823 b'%+i', c_int(10)) 1824 check_format(u'%s', 1825 b'%.%s', b'abc') 1826 1827 @test_support.cpython_only 1828 def test_encode_decimal(self): 1829 from _testcapi import unicode_encodedecimal 1830 self.assertEqual(unicode_encodedecimal(u'123'), 1831 b'123') 1832 self.assertEqual(unicode_encodedecimal(u'\u0663.\u0661\u0664'), 1833 b'3.14') 1834 self.assertEqual(unicode_encodedecimal(u"\N{EM SPACE}3.14\N{EN SPACE}"), 1835 b' 3.14 ') 1836 self.assertRaises(UnicodeEncodeError, 1837 unicode_encodedecimal, u"123\u20ac", "strict") 1838 self.assertEqual(unicode_encodedecimal(u"123\u20ac", "replace"), 1839 b'123?') 1840 self.assertEqual(unicode_encodedecimal(u"123\u20ac", "ignore"), 1841 b'123') 1842 self.assertEqual(unicode_encodedecimal(u"123\u20ac", "xmlcharrefreplace"), 1843 b'123€') 1844 self.assertEqual(unicode_encodedecimal(u"123\u20ac", "backslashreplace"), 1845 b'123\\u20ac') 1846 self.assertEqual(unicode_encodedecimal(u"123\u20ac\N{EM SPACE}", "replace"), 1847 b'123? ') 1848 self.assertEqual(unicode_encodedecimal(u"123\u20ac\u20ac", "replace"), 1849 b'123??') 1850 self.assertEqual(unicode_encodedecimal(u"123\u20ac\u0660", "replace"), 1851 b'123?0') 1852 1853 @test_support.cpython_only 1854 def test_encode_decimal_with_surrogates(self): 1855 from _testcapi import unicode_encodedecimal 1856 tests = [(u'\U0001f49d', '💝'), 1857 (u'\ud83d', '�'), 1858 (u'\udc9d', '�'), 1859 ] 1860 if u'\ud83d\udc9d' != u'\U0001f49d': 1861 tests += [(u'\ud83d\udc9d', '��')] 1862 for s, exp in tests: 1863 self.assertEqual( 1864 unicode_encodedecimal(u"123" + s, "xmlcharrefreplace"), 1865 '123' + exp) 1866 1867def test_main(): 1868 test_support.run_unittest(__name__) 1869 1870if __name__ == "__main__": 1871 test_main() 1872