1import unittest
2import sys
3from test import support
4from test.support import import_helper
5
6try:
7    import _testcapi
8except ImportError:
9    _testcapi = None
10
11
12class CAPITest(unittest.TestCase):
13
14    # Test PyUnicode_FromFormat()
15    def test_from_format(self):
16        import_helper.import_module('ctypes')
17        from ctypes import (
18            c_char_p,
19            pythonapi, py_object, sizeof,
20            c_int, c_long, c_longlong, c_ssize_t,
21            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
22        name = "PyUnicode_FromFormat"
23        _PyUnicode_FromFormat = getattr(pythonapi, name)
24        _PyUnicode_FromFormat.argtypes = (c_char_p,)
25        _PyUnicode_FromFormat.restype = py_object
26
27        def PyUnicode_FromFormat(format, *args):
28            cargs = tuple(
29                py_object(arg) if isinstance(arg, str) else arg
30                for arg in args)
31            return _PyUnicode_FromFormat(format, *cargs)
32
33        def check_format(expected, format, *args):
34            text = PyUnicode_FromFormat(format, *args)
35            self.assertEqual(expected, text)
36
37        # ascii format, non-ascii argument
38        check_format('ascii\x7f=unicode\xe9',
39                     b'ascii\x7f=%U', 'unicode\xe9')
40
41        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
42        # raises an error
43        self.assertRaisesRegex(ValueError,
44            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
45            'string, got a non-ASCII byte: 0xe9$',
46            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
47
48        # test "%c"
49        check_format('\uabcd',
50                     b'%c', c_int(0xabcd))
51        check_format('\U0010ffff',
52                     b'%c', c_int(0x10ffff))
53        with self.assertRaises(OverflowError):
54            PyUnicode_FromFormat(b'%c', c_int(0x110000))
55        # Issue #18183
56        check_format('\U00010000\U00100000',
57                     b'%c%c', c_int(0x10000), c_int(0x100000))
58
59        # test "%"
60        check_format('%',
61                     b'%')
62        check_format('%',
63                     b'%%')
64        check_format('%s',
65                     b'%%s')
66        check_format('[%]',
67                     b'[%%]')
68        check_format('%abc',
69                     b'%%%s', b'abc')
70
71        # truncated string
72        check_format('abc',
73                     b'%.3s', b'abcdef')
74        check_format('abc[\ufffd',
75                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
76        check_format("'\\u20acABC'",
77                     b'%A', '\u20acABC')
78        check_format("'\\u20",
79                     b'%.5A', '\u20acABCDEF')
80        check_format("'\u20acABC'",
81                     b'%R', '\u20acABC')
82        check_format("'\u20acA",
83                     b'%.3R', '\u20acABCDEF')
84        check_format('\u20acAB',
85                     b'%.3S', '\u20acABCDEF')
86        check_format('\u20acAB',
87                     b'%.3U', '\u20acABCDEF')
88        check_format('\u20acAB',
89                     b'%.3V', '\u20acABCDEF', None)
90        check_format('abc[\ufffd',
91                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
92
93        # following tests comes from #7330
94        # test width modifier and precision modifier with %S
95        check_format("repr=  abc",
96                     b'repr=%5S', 'abc')
97        check_format("repr=ab",
98                     b'repr=%.2S', 'abc')
99        check_format("repr=   ab",
100                     b'repr=%5.2S', 'abc')
101
102        # test width modifier and precision modifier with %R
103        check_format("repr=   'abc'",
104                     b'repr=%8R', 'abc')
105        check_format("repr='ab",
106                     b'repr=%.3R', 'abc')
107        check_format("repr=  'ab",
108                     b'repr=%5.3R', 'abc')
109
110        # test width modifier and precision modifier with %A
111        check_format("repr=   'abc'",
112                     b'repr=%8A', 'abc')
113        check_format("repr='ab",
114                     b'repr=%.3A', 'abc')
115        check_format("repr=  'ab",
116                     b'repr=%5.3A', 'abc')
117
118        # test width modifier and precision modifier with %s
119        check_format("repr=  abc",
120                     b'repr=%5s', b'abc')
121        check_format("repr=ab",
122                     b'repr=%.2s', b'abc')
123        check_format("repr=   ab",
124                     b'repr=%5.2s', b'abc')
125
126        # test width modifier and precision modifier with %U
127        check_format("repr=  abc",
128                     b'repr=%5U', 'abc')
129        check_format("repr=ab",
130                     b'repr=%.2U', 'abc')
131        check_format("repr=   ab",
132                     b'repr=%5.2U', 'abc')
133
134        # test width modifier and precision modifier with %V
135        check_format("repr=  abc",
136                     b'repr=%5V', 'abc', b'123')
137        check_format("repr=ab",
138                     b'repr=%.2V', 'abc', b'123')
139        check_format("repr=   ab",
140                     b'repr=%5.2V', 'abc', b'123')
141        check_format("repr=  123",
142                     b'repr=%5V', None, b'123')
143        check_format("repr=12",
144                     b'repr=%.2V', None, b'123')
145        check_format("repr=   12",
146                     b'repr=%5.2V', None, b'123')
147
148        # test integer formats (%i, %d, %u)
149        check_format('010',
150                     b'%03i', c_int(10))
151        check_format('0010',
152                     b'%0.4i', c_int(10))
153        check_format('-123',
154                     b'%i', c_int(-123))
155        check_format('-123',
156                     b'%li', c_long(-123))
157        check_format('-123',
158                     b'%lli', c_longlong(-123))
159        check_format('-123',
160                     b'%zi', c_ssize_t(-123))
161
162        check_format('-123',
163                     b'%d', c_int(-123))
164        check_format('-123',
165                     b'%ld', c_long(-123))
166        check_format('-123',
167                     b'%lld', c_longlong(-123))
168        check_format('-123',
169                     b'%zd', c_ssize_t(-123))
170
171        check_format('123',
172                     b'%u', c_uint(123))
173        check_format('123',
174                     b'%lu', c_ulong(123))
175        check_format('123',
176                     b'%llu', c_ulonglong(123))
177        check_format('123',
178                     b'%zu', c_size_t(123))
179
180        # test long output
181        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
182        max_longlong = -min_longlong - 1
183        check_format(str(min_longlong),
184                     b'%lld', c_longlong(min_longlong))
185        check_format(str(max_longlong),
186                     b'%lld', c_longlong(max_longlong))
187        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
188        check_format(str(max_ulonglong),
189                     b'%llu', c_ulonglong(max_ulonglong))
190        PyUnicode_FromFormat(b'%p', c_void_p(-1))
191
192        # test padding (width and/or precision)
193        check_format('123'.rjust(10, '0'),
194                     b'%010i', c_int(123))
195        check_format('123'.rjust(100),
196                     b'%100i', c_int(123))
197        check_format('123'.rjust(100, '0'),
198                     b'%.100i', c_int(123))
199        check_format('123'.rjust(80, '0').rjust(100),
200                     b'%100.80i', c_int(123))
201
202        check_format('123'.rjust(10, '0'),
203                     b'%010u', c_uint(123))
204        check_format('123'.rjust(100),
205                     b'%100u', c_uint(123))
206        check_format('123'.rjust(100, '0'),
207                     b'%.100u', c_uint(123))
208        check_format('123'.rjust(80, '0').rjust(100),
209                     b'%100.80u', c_uint(123))
210
211        check_format('123'.rjust(10, '0'),
212                     b'%010x', c_int(0x123))
213        check_format('123'.rjust(100),
214                     b'%100x', c_int(0x123))
215        check_format('123'.rjust(100, '0'),
216                     b'%.100x', c_int(0x123))
217        check_format('123'.rjust(80, '0').rjust(100),
218                     b'%100.80x', c_int(0x123))
219
220        # test %A
221        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
222                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
223
224        # test %V
225        check_format('repr=abc',
226                     b'repr=%V', 'abc', b'xyz')
227
228        # test %p
229        # We cannot test the exact result,
230        # because it returns a hex representation of a C pointer,
231        # which is going to be different each time. But, we can test the format.
232        p_format_regex = r'^0x[a-zA-Z0-9]{3,}$'
233        p_format1 = PyUnicode_FromFormat(b'%p', 'abc')
234        self.assertIsInstance(p_format1, str)
235        self.assertRegex(p_format1, p_format_regex)
236
237        p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz')
238        self.assertIsInstance(p_format2, str)
239        self.assertRegex(p_format2,
240                         r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}')
241
242        # Extra args are ignored:
243        p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz')
244        self.assertIsInstance(p_format3, str)
245        self.assertRegex(p_format3, p_format_regex)
246
247        # Test string decode from parameter of %s using utf-8.
248        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
249        # '\u4eba\u6c11'
250        check_format('repr=\u4eba\u6c11',
251                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
252
253        #Test replace error handler.
254        check_format('repr=abc\ufffd',
255                     b'repr=%V', None, b'abc\xff')
256
257        # not supported: copy the raw format string. these tests are just here
258        # to check for crashes and should not be considered as specifications
259        check_format('%s',
260                     b'%1%s', b'abc')
261        check_format('%1abc',
262                     b'%1abc')
263        check_format('%+i',
264                     b'%+i', c_int(10))
265        check_format('%.%s',
266                     b'%.%s', b'abc')
267
268        # Issue #33817: empty strings
269        check_format('',
270                     b'')
271        check_format('',
272                     b'%s', b'')
273
274    # Test PyUnicode_AsWideChar()
275    @support.cpython_only
276    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
277    def test_aswidechar(self):
278        from _testcapi import unicode_aswidechar
279        import_helper.import_module('ctypes')
280        from ctypes import c_wchar, sizeof
281
282        wchar, size = unicode_aswidechar('abcdef', 2)
283        self.assertEqual(size, 2)
284        self.assertEqual(wchar, 'ab')
285
286        wchar, size = unicode_aswidechar('abc', 3)
287        self.assertEqual(size, 3)
288        self.assertEqual(wchar, 'abc')
289
290        wchar, size = unicode_aswidechar('abc', 4)
291        self.assertEqual(size, 3)
292        self.assertEqual(wchar, 'abc\0')
293
294        wchar, size = unicode_aswidechar('abc', 10)
295        self.assertEqual(size, 3)
296        self.assertEqual(wchar, 'abc\0')
297
298        wchar, size = unicode_aswidechar('abc\0def', 20)
299        self.assertEqual(size, 7)
300        self.assertEqual(wchar, 'abc\0def\0')
301
302        nonbmp = chr(0x10ffff)
303        if sizeof(c_wchar) == 2:
304            buflen = 3
305            nchar = 2
306        else: # sizeof(c_wchar) == 4
307            buflen = 2
308            nchar = 1
309        wchar, size = unicode_aswidechar(nonbmp, buflen)
310        self.assertEqual(size, nchar)
311        self.assertEqual(wchar, nonbmp + '\0')
312
313    # Test PyUnicode_AsWideCharString()
314    @support.cpython_only
315    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
316    def test_aswidecharstring(self):
317        from _testcapi import unicode_aswidecharstring
318        import_helper.import_module('ctypes')
319        from ctypes import c_wchar, sizeof
320
321        wchar, size = unicode_aswidecharstring('abc')
322        self.assertEqual(size, 3)
323        self.assertEqual(wchar, 'abc\0')
324
325        wchar, size = unicode_aswidecharstring('abc\0def')
326        self.assertEqual(size, 7)
327        self.assertEqual(wchar, 'abc\0def\0')
328
329        nonbmp = chr(0x10ffff)
330        if sizeof(c_wchar) == 2:
331            nchar = 2
332        else: # sizeof(c_wchar) == 4
333            nchar = 1
334        wchar, size = unicode_aswidecharstring(nonbmp)
335        self.assertEqual(size, nchar)
336        self.assertEqual(wchar, nonbmp + '\0')
337
338    # Test PyUnicode_AsUCS4()
339    @support.cpython_only
340    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
341    def test_asucs4(self):
342        from _testcapi import unicode_asucs4
343        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
344                  'a\ud800b\udfffc', '\ud834\udd1e']:
345            l = len(s)
346            self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
347            self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
348            self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
349            self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
350            self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
351            self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
352            s = '\0'.join([s, s])
353            self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
354            self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
355
356    # Test PyUnicode_AsUTF8()
357    @support.cpython_only
358    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
359    def test_asutf8(self):
360        from _testcapi import unicode_asutf8
361
362        bmp = '\u0100'
363        bmp2 = '\uffff'
364        nonbmp = chr(0x10ffff)
365
366        self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
367        self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
368        self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
369        self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
370
371    # Test PyUnicode_AsUTF8AndSize()
372    @support.cpython_only
373    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
374    def test_asutf8andsize(self):
375        from _testcapi import unicode_asutf8andsize
376
377        bmp = '\u0100'
378        bmp2 = '\uffff'
379        nonbmp = chr(0x10ffff)
380
381        self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
382        self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
383        self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
384        self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
385
386    # Test PyUnicode_FindChar()
387    @support.cpython_only
388    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
389    def test_findchar(self):
390        from _testcapi import unicode_findchar
391
392        for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
393            for i, ch in enumerate(str):
394                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
395                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
396
397        str = "!>_<!"
398        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
399        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
400        # start < end
401        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
402        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
403        # start >= end
404        self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
405        self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
406        # negative
407        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
408        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
409
410    # Test PyUnicode_CopyCharacters()
411    @support.cpython_only
412    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
413    def test_copycharacters(self):
414        from _testcapi import unicode_copycharacters
415
416        strings = [
417            'abcde', '\xa1\xa2\xa3\xa4\xa5',
418            '\u4f60\u597d\u4e16\u754c\uff01',
419            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
420        ]
421
422        for idx, from_ in enumerate(strings):
423            # wide -> narrow: exceed maxchar limitation
424            for to in strings[:idx]:
425                self.assertRaises(
426                    SystemError,
427                    unicode_copycharacters, to, 0, from_, 0, 5
428                )
429            # same kind
430            for from_start in range(5):
431                self.assertEqual(
432                    unicode_copycharacters(from_, 0, from_, from_start, 5),
433                    (from_[from_start:from_start+5].ljust(5, '\0'),
434                     5-from_start)
435                )
436            for to_start in range(5):
437                self.assertEqual(
438                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
439                    (from_[to_start:to_start+5].rjust(5, '\0'),
440                     5-to_start)
441                )
442            # narrow -> wide
443            # Tests omitted since this creates invalid strings.
444
445        s = strings[0]
446        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
447        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
448        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
449        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
450        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
451        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
452        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
453
454    @support.cpython_only
455    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
456    def test_pep393_utf8_caching_bug(self):
457        # Issue #25709: Problem with string concatenation and utf-8 cache
458        from _testcapi import getargs_s_hash
459        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
460            s = ''
461            for i in range(5):
462                # Due to CPython specific optimization the 's' string can be
463                # resized in-place.
464                s += chr(k)
465                # Parsing with the "s#" format code calls indirectly
466                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
467                # encoded string cached in the Unicode object.
468                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
469                # Check that the second call returns the same result
470                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
471
472
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
475