1import unittest 2import sys 3from test import support 4from test.support import import_helper 5 6try: 7 import _testcapi 8except ImportError: 9 _testcapi = None 10 11 12class CAPITest(unittest.TestCase): 13 14 # Test PyUnicode_FromFormat() 15 def test_from_format(self): 16 import_helper.import_module('ctypes') 17 from ctypes import ( 18 c_char_p, 19 pythonapi, py_object, sizeof, 20 c_int, c_long, c_longlong, c_ssize_t, 21 c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p) 22 name = "PyUnicode_FromFormat" 23 _PyUnicode_FromFormat = getattr(pythonapi, name) 24 _PyUnicode_FromFormat.argtypes = (c_char_p,) 25 _PyUnicode_FromFormat.restype = py_object 26 27 def PyUnicode_FromFormat(format, *args): 28 cargs = tuple( 29 py_object(arg) if isinstance(arg, str) else arg 30 for arg in args) 31 return _PyUnicode_FromFormat(format, *cargs) 32 33 def check_format(expected, format, *args): 34 text = PyUnicode_FromFormat(format, *args) 35 self.assertEqual(expected, text) 36 37 # ascii format, non-ascii argument 38 check_format('ascii\x7f=unicode\xe9', 39 b'ascii\x7f=%U', 'unicode\xe9') 40 41 # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() 42 # raises an error 43 self.assertRaisesRegex(ValueError, 44 r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' 45 'string, got a non-ASCII byte: 0xe9$', 46 PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') 47 48 # test "%c" 49 check_format('\uabcd', 50 b'%c', c_int(0xabcd)) 51 check_format('\U0010ffff', 52 b'%c', c_int(0x10ffff)) 53 with self.assertRaises(OverflowError): 54 PyUnicode_FromFormat(b'%c', c_int(0x110000)) 55 # Issue #18183 56 check_format('\U00010000\U00100000', 57 b'%c%c', c_int(0x10000), c_int(0x100000)) 58 59 # test "%" 60 check_format('%', 61 b'%') 62 check_format('%', 63 b'%%') 64 check_format('%s', 65 b'%%s') 66 check_format('[%]', 67 b'[%%]') 68 check_format('%abc', 69 b'%%%s', b'abc') 70 71 # truncated string 72 check_format('abc', 73 b'%.3s', b'abcdef') 74 check_format('abc[\ufffd', 75 b'%.5s', 'abc[\u20ac]'.encode('utf8')) 76 check_format("'\\u20acABC'", 77 b'%A', '\u20acABC') 78 check_format("'\\u20", 79 b'%.5A', '\u20acABCDEF') 80 check_format("'\u20acABC'", 81 b'%R', '\u20acABC') 82 check_format("'\u20acA", 83 b'%.3R', '\u20acABCDEF') 84 check_format('\u20acAB', 85 b'%.3S', '\u20acABCDEF') 86 check_format('\u20acAB', 87 b'%.3U', '\u20acABCDEF') 88 check_format('\u20acAB', 89 b'%.3V', '\u20acABCDEF', None) 90 check_format('abc[\ufffd', 91 b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) 92 93 # following tests comes from #7330 94 # test width modifier and precision modifier with %S 95 check_format("repr= abc", 96 b'repr=%5S', 'abc') 97 check_format("repr=ab", 98 b'repr=%.2S', 'abc') 99 check_format("repr= ab", 100 b'repr=%5.2S', 'abc') 101 102 # test width modifier and precision modifier with %R 103 check_format("repr= 'abc'", 104 b'repr=%8R', 'abc') 105 check_format("repr='ab", 106 b'repr=%.3R', 'abc') 107 check_format("repr= 'ab", 108 b'repr=%5.3R', 'abc') 109 110 # test width modifier and precision modifier with %A 111 check_format("repr= 'abc'", 112 b'repr=%8A', 'abc') 113 check_format("repr='ab", 114 b'repr=%.3A', 'abc') 115 check_format("repr= 'ab", 116 b'repr=%5.3A', 'abc') 117 118 # test width modifier and precision modifier with %s 119 check_format("repr= abc", 120 b'repr=%5s', b'abc') 121 check_format("repr=ab", 122 b'repr=%.2s', b'abc') 123 check_format("repr= ab", 124 b'repr=%5.2s', b'abc') 125 126 # test width modifier and precision modifier with %U 127 check_format("repr= abc", 128 b'repr=%5U', 'abc') 129 check_format("repr=ab", 130 b'repr=%.2U', 'abc') 131 check_format("repr= ab", 132 b'repr=%5.2U', 'abc') 133 134 # test width modifier and precision modifier with %V 135 check_format("repr= abc", 136 b'repr=%5V', 'abc', b'123') 137 check_format("repr=ab", 138 b'repr=%.2V', 'abc', b'123') 139 check_format("repr= ab", 140 b'repr=%5.2V', 'abc', b'123') 141 check_format("repr= 123", 142 b'repr=%5V', None, b'123') 143 check_format("repr=12", 144 b'repr=%.2V', None, b'123') 145 check_format("repr= 12", 146 b'repr=%5.2V', None, b'123') 147 148 # test integer formats (%i, %d, %u) 149 check_format('010', 150 b'%03i', c_int(10)) 151 check_format('0010', 152 b'%0.4i', c_int(10)) 153 check_format('-123', 154 b'%i', c_int(-123)) 155 check_format('-123', 156 b'%li', c_long(-123)) 157 check_format('-123', 158 b'%lli', c_longlong(-123)) 159 check_format('-123', 160 b'%zi', c_ssize_t(-123)) 161 162 check_format('-123', 163 b'%d', c_int(-123)) 164 check_format('-123', 165 b'%ld', c_long(-123)) 166 check_format('-123', 167 b'%lld', c_longlong(-123)) 168 check_format('-123', 169 b'%zd', c_ssize_t(-123)) 170 171 check_format('123', 172 b'%u', c_uint(123)) 173 check_format('123', 174 b'%lu', c_ulong(123)) 175 check_format('123', 176 b'%llu', c_ulonglong(123)) 177 check_format('123', 178 b'%zu', c_size_t(123)) 179 180 # test long output 181 min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1)) 182 max_longlong = -min_longlong - 1 183 check_format(str(min_longlong), 184 b'%lld', c_longlong(min_longlong)) 185 check_format(str(max_longlong), 186 b'%lld', c_longlong(max_longlong)) 187 max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1 188 check_format(str(max_ulonglong), 189 b'%llu', c_ulonglong(max_ulonglong)) 190 PyUnicode_FromFormat(b'%p', c_void_p(-1)) 191 192 # test padding (width and/or precision) 193 check_format('123'.rjust(10, '0'), 194 b'%010i', c_int(123)) 195 check_format('123'.rjust(100), 196 b'%100i', c_int(123)) 197 check_format('123'.rjust(100, '0'), 198 b'%.100i', c_int(123)) 199 check_format('123'.rjust(80, '0').rjust(100), 200 b'%100.80i', c_int(123)) 201 202 check_format('123'.rjust(10, '0'), 203 b'%010u', c_uint(123)) 204 check_format('123'.rjust(100), 205 b'%100u', c_uint(123)) 206 check_format('123'.rjust(100, '0'), 207 b'%.100u', c_uint(123)) 208 check_format('123'.rjust(80, '0').rjust(100), 209 b'%100.80u', c_uint(123)) 210 211 check_format('123'.rjust(10, '0'), 212 b'%010x', c_int(0x123)) 213 check_format('123'.rjust(100), 214 b'%100x', c_int(0x123)) 215 check_format('123'.rjust(100, '0'), 216 b'%.100x', c_int(0x123)) 217 check_format('123'.rjust(80, '0').rjust(100), 218 b'%100.80x', c_int(0x123)) 219 220 # test %A 221 check_format(r"%A:'abc\xe9\uabcd\U0010ffff'", 222 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') 223 224 # test %V 225 check_format('repr=abc', 226 b'repr=%V', 'abc', b'xyz') 227 228 # test %p 229 # We cannot test the exact result, 230 # because it returns a hex representation of a C pointer, 231 # which is going to be different each time. But, we can test the format. 232 p_format_regex = r'^0x[a-zA-Z0-9]{3,}$' 233 p_format1 = PyUnicode_FromFormat(b'%p', 'abc') 234 self.assertIsInstance(p_format1, str) 235 self.assertRegex(p_format1, p_format_regex) 236 237 p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz') 238 self.assertIsInstance(p_format2, str) 239 self.assertRegex(p_format2, 240 r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}') 241 242 # Extra args are ignored: 243 p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz') 244 self.assertIsInstance(p_format3, str) 245 self.assertRegex(p_format3, p_format_regex) 246 247 # Test string decode from parameter of %s using utf-8. 248 # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of 249 # '\u4eba\u6c11' 250 check_format('repr=\u4eba\u6c11', 251 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') 252 253 #Test replace error handler. 254 check_format('repr=abc\ufffd', 255 b'repr=%V', None, b'abc\xff') 256 257 # not supported: copy the raw format string. these tests are just here 258 # to check for crashes and should not be considered as specifications 259 check_format('%s', 260 b'%1%s', b'abc') 261 check_format('%1abc', 262 b'%1abc') 263 check_format('%+i', 264 b'%+i', c_int(10)) 265 check_format('%.%s', 266 b'%.%s', b'abc') 267 268 # Issue #33817: empty strings 269 check_format('', 270 b'') 271 check_format('', 272 b'%s', b'') 273 274 # Test PyUnicode_AsWideChar() 275 @support.cpython_only 276 @unittest.skipIf(_testcapi is None, 'need _testcapi module') 277 def test_aswidechar(self): 278 from _testcapi import unicode_aswidechar 279 import_helper.import_module('ctypes') 280 from ctypes import c_wchar, sizeof 281 282 wchar, size = unicode_aswidechar('abcdef', 2) 283 self.assertEqual(size, 2) 284 self.assertEqual(wchar, 'ab') 285 286 wchar, size = unicode_aswidechar('abc', 3) 287 self.assertEqual(size, 3) 288 self.assertEqual(wchar, 'abc') 289 290 wchar, size = unicode_aswidechar('abc', 4) 291 self.assertEqual(size, 3) 292 self.assertEqual(wchar, 'abc\0') 293 294 wchar, size = unicode_aswidechar('abc', 10) 295 self.assertEqual(size, 3) 296 self.assertEqual(wchar, 'abc\0') 297 298 wchar, size = unicode_aswidechar('abc\0def', 20) 299 self.assertEqual(size, 7) 300 self.assertEqual(wchar, 'abc\0def\0') 301 302 nonbmp = chr(0x10ffff) 303 if sizeof(c_wchar) == 2: 304 buflen = 3 305 nchar = 2 306 else: # sizeof(c_wchar) == 4 307 buflen = 2 308 nchar = 1 309 wchar, size = unicode_aswidechar(nonbmp, buflen) 310 self.assertEqual(size, nchar) 311 self.assertEqual(wchar, nonbmp + '\0') 312 313 # Test PyUnicode_AsWideCharString() 314 @support.cpython_only 315 @unittest.skipIf(_testcapi is None, 'need _testcapi module') 316 def test_aswidecharstring(self): 317 from _testcapi import unicode_aswidecharstring 318 import_helper.import_module('ctypes') 319 from ctypes import c_wchar, sizeof 320 321 wchar, size = unicode_aswidecharstring('abc') 322 self.assertEqual(size, 3) 323 self.assertEqual(wchar, 'abc\0') 324 325 wchar, size = unicode_aswidecharstring('abc\0def') 326 self.assertEqual(size, 7) 327 self.assertEqual(wchar, 'abc\0def\0') 328 329 nonbmp = chr(0x10ffff) 330 if sizeof(c_wchar) == 2: 331 nchar = 2 332 else: # sizeof(c_wchar) == 4 333 nchar = 1 334 wchar, size = unicode_aswidecharstring(nonbmp) 335 self.assertEqual(size, nchar) 336 self.assertEqual(wchar, nonbmp + '\0') 337 338 # Test PyUnicode_AsUCS4() 339 @support.cpython_only 340 @unittest.skipIf(_testcapi is None, 'need _testcapi module') 341 def test_asucs4(self): 342 from _testcapi import unicode_asucs4 343 for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600', 344 'a\ud800b\udfffc', '\ud834\udd1e']: 345 l = len(s) 346 self.assertEqual(unicode_asucs4(s, l, True), s+'\0') 347 self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff') 348 self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff') 349 self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff') 350 self.assertRaises(SystemError, unicode_asucs4, s, l-1, True) 351 self.assertRaises(SystemError, unicode_asucs4, s, l-2, False) 352 s = '\0'.join([s, s]) 353 self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0') 354 self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff') 355 356 # Test PyUnicode_AsUTF8() 357 @support.cpython_only 358 @unittest.skipIf(_testcapi is None, 'need _testcapi module') 359 def test_asutf8(self): 360 from _testcapi import unicode_asutf8 361 362 bmp = '\u0100' 363 bmp2 = '\uffff' 364 nonbmp = chr(0x10ffff) 365 366 self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80') 367 self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf') 368 self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf') 369 self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc') 370 371 # Test PyUnicode_AsUTF8AndSize() 372 @support.cpython_only 373 @unittest.skipIf(_testcapi is None, 'need _testcapi module') 374 def test_asutf8andsize(self): 375 from _testcapi import unicode_asutf8andsize 376 377 bmp = '\u0100' 378 bmp2 = '\uffff' 379 nonbmp = chr(0x10ffff) 380 381 self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2)) 382 self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3)) 383 self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4)) 384 self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc') 385 386 # Test PyUnicode_FindChar() 387 @support.cpython_only 388 @unittest.skipIf(_testcapi is None, 'need _testcapi module') 389 def test_findchar(self): 390 from _testcapi import unicode_findchar 391 392 for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1": 393 for i, ch in enumerate(str): 394 self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i) 395 self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i) 396 397 str = "!>_<!" 398 self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1) 399 self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1) 400 # start < end 401 self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4) 402 self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4) 403 # start >= end 404 self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1) 405 self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1) 406 # negative 407 self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0) 408 self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0) 409 410 # Test PyUnicode_CopyCharacters() 411 @support.cpython_only 412 @unittest.skipIf(_testcapi is None, 'need _testcapi module') 413 def test_copycharacters(self): 414 from _testcapi import unicode_copycharacters 415 416 strings = [ 417 'abcde', '\xa1\xa2\xa3\xa4\xa5', 418 '\u4f60\u597d\u4e16\u754c\uff01', 419 '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604' 420 ] 421 422 for idx, from_ in enumerate(strings): 423 # wide -> narrow: exceed maxchar limitation 424 for to in strings[:idx]: 425 self.assertRaises( 426 SystemError, 427 unicode_copycharacters, to, 0, from_, 0, 5 428 ) 429 # same kind 430 for from_start in range(5): 431 self.assertEqual( 432 unicode_copycharacters(from_, 0, from_, from_start, 5), 433 (from_[from_start:from_start+5].ljust(5, '\0'), 434 5-from_start) 435 ) 436 for to_start in range(5): 437 self.assertEqual( 438 unicode_copycharacters(from_, to_start, from_, to_start, 5), 439 (from_[to_start:to_start+5].rjust(5, '\0'), 440 5-to_start) 441 ) 442 # narrow -> wide 443 # Tests omitted since this creates invalid strings. 444 445 s = strings[0] 446 self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5) 447 self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5) 448 self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5) 449 self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5) 450 self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5) 451 self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1) 452 self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0) 453 454 @support.cpython_only 455 @unittest.skipIf(_testcapi is None, 'need _testcapi module') 456 def test_pep393_utf8_caching_bug(self): 457 # Issue #25709: Problem with string concatenation and utf-8 cache 458 from _testcapi import getargs_s_hash 459 for k in 0x24, 0xa4, 0x20ac, 0x1f40d: 460 s = '' 461 for i in range(5): 462 # Due to CPython specific optimization the 's' string can be 463 # resized in-place. 464 s += chr(k) 465 # Parsing with the "s#" format code calls indirectly 466 # PyUnicode_AsUTF8AndSize() which creates the UTF-8 467 # encoded string cached in the Unicode object. 468 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) 469 # Check that the second call returns the same result 470 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) 471 472 473if __name__ == "__main__": 474 unittest.main() 475