1""" Test script for the Unicode implementation.
2
Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""
8import _string
9import codecs
10import itertools
11import operator
12import pickle
13import struct
14import sys
15import textwrap
16import unicodedata
17import unittest
18import warnings
19from test.support import warnings_helper
20from test import support, string_tests
21from test.support.script_helper import assert_python_failure
22
23try:
24    import _testcapi
25except ImportError:
26    _testcapi = None
27
28# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function handing out deliberately broken codecs.

    "test.unicode1" yields an encoder/decoder pair that returns a bare
    int instead of a tuple; "test.unicode2" yields a pair that returns a
    tuple whose first item is not a string.  Every other name gives None.
    """
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None
44
def duplicate_string(text):
    """
    Return an equal copy of *text* obtained by an encode/decode round trip.

    The intent is a fresh object with a reference count of 1.  This is
    best-effort only: latin1 single letters and the empty string ('')
    are singletons and cannot be cloned.
    """
    encoded = text.encode()
    return encoded.decode()
54
class StrSubclass(str):
    """Plain subclass of str that adds no behavior of its own."""
    pass
57
58class UnicodeTest(string_tests.CommonTest,
59        string_tests.MixinStrUnicodeUserStringTest,
60        string_tests.MixinStrUnicodeTest,
61        unittest.TestCase):
62
63    type2test = str
64
    def setUp(self):
        """Register the module-level codec search function for the test.

        The matching ``codecs.unregister`` call is queued as a cleanup.
        """
        codecs.register(search_function)
        self.addCleanup(codecs.unregister, search_function)
68
69    def checkequalnofix(self, result, object, methodname, *args):
70        method = getattr(object, methodname)
71        realresult = method(*args)
72        self.assertEqual(realresult, result)
73        self.assertTrue(type(realresult) is type(result))
74
75        # if the original is returned make sure that
76        # this doesn't happen with subclasses
77        if realresult is object:
78            class usub(str):
79                def __repr__(self):
80                    return 'usub(%r)' % str.__repr__(self)
81            object = usub(object)
82            method = getattr(object, methodname)
83            realresult = method(*args)
84            self.assertEqual(realresult, result)
85            self.assertTrue(object is not realresult)
86
87    def test_literals(self):
88        self.assertEqual('\xff', '\u00ff')
89        self.assertEqual('\uffff', '\U0000ffff')
90        self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
91        self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
92        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
93        # raw strings should not have unicode escapes
94        self.assertNotEqual(r"\u0020", " ")
95
    def test_ascii(self):
        # Checks ascii() output for short strings, for all 256 latin-1
        # code points, for a long string of wide characters, and for an
        # object whose __repr__ returns bytes.
        if not sys.platform.startswith('java'):
            # Test basic sanity of repr()
            self.assertEqual(ascii('abc'), "'abc'")
            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
            self.assertEqual(ascii('\\c'), "'\\\\c'")
            self.assertEqual(ascii('\\'), "'\\\\'")
            self.assertEqual(ascii('\n'), "'\\n'")
            self.assertEqual(ascii('\r'), "'\\r'")
            self.assertEqual(ascii('\t'), "'\\t'")
            self.assertEqual(ascii('\b'), "'\\x08'")
            # NOTE: the next two assertions are identical in SOURCE.
            self.assertEqual(ascii("'\""), """'\\'"'""")
            self.assertEqual(ascii("'\""), """'\\'"'""")
            self.assertEqual(ascii("'"), '''"'"''')
            self.assertEqual(ascii('"'), """'"'""")
            # Expected ascii() of chr(0)..chr(255): everything from \x7f
            # upwards appears as a \xNN escape.
            latin1repr = (
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
                "\\xfe\\xff'")
            testrepr = ascii(''.join(map(chr, range(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test ascii works on wide unicode escapes without overflow.
            # (Both sides are the same expression; the point is that the
            # call completes.)
            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
                             ascii("\U00010000" * 39 + "\uffff" * 4096))

            # A __repr__ that returns bytes must make ascii() raise.
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, ascii, WrongRepr())
137
    def test_repr(self):
        # Checks repr() output for short strings, for all 256 latin-1
        # code points, for a long string of wide characters, and for an
        # object whose __repr__ returns bytes.
        if not sys.platform.startswith('java'):
            # Test basic sanity of repr()
            self.assertEqual(repr('abc'), "'abc'")
            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
            self.assertEqual(repr('ab\\'), "'ab\\\\'")
            self.assertEqual(repr('\\c'), "'\\\\c'")
            self.assertEqual(repr('\\'), "'\\\\'")
            self.assertEqual(repr('\n'), "'\\n'")
            self.assertEqual(repr('\r'), "'\\r'")
            self.assertEqual(repr('\t'), "'\\t'")
            self.assertEqual(repr('\b'), "'\\x08'")
            # NOTE: the next two assertions are identical in SOURCE.
            self.assertEqual(repr("'\""), """'\\'"'""")
            self.assertEqual(repr("'\""), """'\\'"'""")
            self.assertEqual(repr("'"), '''"'"''')
            self.assertEqual(repr('"'), """'"'""")
            # Expected repr() of chr(0)..chr(255).  Unlike the ascii()
            # table, most characters from \xa1 upwards are expected
            # literally; \xa0 and \xad are expected as escapes.
            latin1repr = (
                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
                "\xfe\xff'")
            testrepr = repr(''.join(map(chr, range(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test repr works on wide unicode escapes without overflow.
            # (Both sides are the same expression; the point is that the
            # call completes.)
            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
                             repr("\U00010000" * 39 + "\uffff" * 4096))

            # A __repr__ that returns bytes must make repr() raise.
            class WrongRepr:
                def __repr__(self):
                    return b'byte-repr'
            self.assertRaises(TypeError, repr, WrongRepr())
179
180    def test_iterators(self):
181        # Make sure unicode objects have an __iter__ method
182        it = "\u1111\u2222\u3333".__iter__()
183        self.assertEqual(next(it), "\u1111")
184        self.assertEqual(next(it), "\u2222")
185        self.assertEqual(next(it), "\u3333")
186        self.assertRaises(StopIteration, next, it)
187
188    def test_iterators_invocation(self):
189        cases = [type(iter('abc')), type(iter('��'))]
190        for cls in cases:
191            with self.subTest(cls=cls):
192                self.assertRaises(TypeError, cls)
193
194    def test_iteration(self):
195        cases = ['abc', '������', "\u1111\u2222\u3333"]
196        for case in cases:
197            with self.subTest(string=case):
198                self.assertEqual(case, "".join(iter(case)))
199
200    def test_exhausted_iterator(self):
201        cases = ['abc', '������', "\u1111\u2222\u3333"]
202        for case in cases:
203            with self.subTest(case=case):
204                iterator = iter(case)
205                tuple(iterator)
206                self.assertRaises(StopIteration, next, iterator)
207
208    def test_pickle_iterator(self):
209        cases = ['abc', '������', "\u1111\u2222\u3333"]
210        for case in cases:
211            with self.subTest(case=case):
212                for proto in range(pickle.HIGHEST_PROTOCOL + 1):
213                    it = iter(case)
214                    with self.subTest(proto=proto):
215                        pickled = "".join(pickle.loads(pickle.dumps(it, proto)))
216                        self.assertEqual(case, pickled)
217
218    def test_count(self):
219        string_tests.CommonTest.test_count(self)
220        # check mixed argument types
221        self.checkequalnofix(3,  'aaa', 'count', 'a')
222        self.checkequalnofix(0,  'aaa', 'count', 'b')
223        self.checkequalnofix(3, 'aaa', 'count',  'a')
224        self.checkequalnofix(0, 'aaa', 'count',  'b')
225        self.checkequalnofix(0, 'aaa', 'count',  'b')
226        self.checkequalnofix(1, 'aaa', 'count',  'a', -1)
227        self.checkequalnofix(3, 'aaa', 'count',  'a', -10)
228        self.checkequalnofix(2, 'aaa', 'count',  'a', 0, -1)
229        self.checkequalnofix(0, 'aaa', 'count',  'a', 0, -10)
230        # test mixed kinds
231        self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
232        self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
233        self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
234        self.checkequal(0, 'a' * 10, 'count', '\u0102')
235        self.checkequal(0, 'a' * 10, 'count', '\U00100304')
236        self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
237        self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
238        self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
239        self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
240        self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
241        self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
242        self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
243
    def test_find(self):
        # Runs the shared find() checks, then str-specific cases: needles
        # absent from mostly-ASCII haystacks, non-ASCII text, wrong
        # argument counts/types, and haystack/needle pairs built from
        # characters of different widths.
        string_tests.CommonTest.test_find(self)
        # test implementation details of the memchr fast path
        self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
        self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
        # check mixed argument types
        self.checkequalnofix(0,  'abcdefghiabc', 'find', 'abc')
        self.checkequalnofix(9,  'abcdefghiabc', 'find', 'abc', 1)
        self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)

        # test utf-8 non-ascii char
        # (the haystack is Cyrillic; look-alike Latin needles must not match)
        self.checkequal(0, 'тест', 'find', 'т')
        self.checkequal(3, 'тест', 'find', 'т', 1)
        self.checkequal(-1, 'тест', 'find', 'т', 1, 3)
        self.checkequal(-1, 'тест', 'find', 'e')  # english `e`
        # test utf-8 non-ascii slice
        self.checkequal(1, 'тест тест', 'find', 'ес')
        self.checkequal(1, 'тест тест', 'find', 'ес', 1)
        self.checkequal(1, 'тест тест', 'find', 'ес', 1, 3)
        self.checkequal(6, 'тест тест', 'find', 'ес', 2)
        self.checkequal(-1, 'тест тест', 'find', 'ес', 6, 7)
        self.checkequal(-1, 'тест тест', 'find', 'ес', 7)
        self.checkequal(-1, 'тест тест', 'find', 'ec')  # english `ec`

        # find() needs a needle, and the needle must not be an int
        self.assertRaises(TypeError, 'hello'.find)
        self.assertRaises(TypeError, 'hello'.find, 42)
        # test mixed kinds
        self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
        self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
        self.checkequal(-1, 'a' * 100, 'find', '\u0102')
        self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
        self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
        self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
        self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
        self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
        self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
        self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
288
    def test_rfind(self):
        # Runs the shared rfind() checks, then str-specific cases: needles
        # absent from mostly-ASCII haystacks, non-ASCII text, and
        # haystack/needle pairs built from characters of different widths.
        string_tests.CommonTest.test_rfind(self)
        # test implementation details of the memrchr fast path
        self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
        self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
        # check mixed argument types
        self.checkequalnofix(9,   'abcdefghiabc', 'rfind', 'abc')
        # NOTE: the next two assertions are identical apart from spacing.
        self.checkequalnofix(12,  'abcdefghiabc', 'rfind', '')
        self.checkequalnofix(12, 'abcdefghiabc', 'rfind',  '')
        # test utf-8 non-ascii char
        # (the haystack is Cyrillic; look-alike Latin needles must not match)
        self.checkequal(1, 'тест', 'rfind', 'е')
        self.checkequal(1, 'тест', 'rfind', 'е', 1)
        self.checkequal(-1, 'тест', 'rfind', 'е', 2)
        self.checkequal(-1, 'тест', 'rfind', 'e')  # english `e`
        # test utf-8 non-ascii slice
        self.checkequal(6, 'тест тест', 'rfind', 'ес')
        self.checkequal(6, 'тест тест', 'rfind', 'ес', 1)
        self.checkequal(1, 'тест тест', 'rfind', 'ес', 1, 3)
        self.checkequal(6, 'тест тест', 'rfind', 'ес', 2)
        self.checkequal(-1, 'тест тест', 'rfind', 'ес', 6, 7)
        self.checkequal(-1, 'тест тест', 'rfind', 'ес', 7)
        self.checkequal(-1, 'тест тест', 'rfind', 'ec')  # english `ec`
        # test mixed kinds
        self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
        self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
        self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
        self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
329
330    def test_index(self):
331        string_tests.CommonTest.test_index(self)
332        self.checkequalnofix(0, 'abcdefghiabc', 'index',  '')
333        self.checkequalnofix(3, 'abcdefghiabc', 'index',  'def')
334        self.checkequalnofix(0, 'abcdefghiabc', 'index',  'abc')
335        self.checkequalnofix(9, 'abcdefghiabc', 'index',  'abc', 1)
336        self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
337        self.assertRaises(ValueError, 'abcdefghiab'.index,  'abc', 1)
338        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', 8)
339        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', -1)
340        # test mixed kinds
341        self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
342        self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
343        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
344        self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
345        self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
346        self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
347        self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
348        self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
349        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
350        self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
351        self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
352        self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
353
354    def test_rindex(self):
355        string_tests.CommonTest.test_rindex(self)
356        self.checkequalnofix(12, 'abcdefghiabc', 'rindex',  '')
357        self.checkequalnofix(3,  'abcdefghiabc', 'rindex',  'def')
358        self.checkequalnofix(9,  'abcdefghiabc', 'rindex',  'abc')
359        self.checkequalnofix(0,  'abcdefghiabc', 'rindex',  'abc', 0, -1)
360
361        self.assertRaises(ValueError, 'abcdefghiabc'.rindex,  'hib')
362        self.assertRaises(ValueError, 'defghiabc'.rindex,  'def', 1)
363        self.assertRaises(ValueError, 'defghiabc'.rindex,  'abc', 0, -1)
364        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, 8)
365        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, -1)
366        # test mixed kinds
367        self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
368        self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
369        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
370        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
371        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
372        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
373        self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
374        self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
375        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
376        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
377        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
378        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
379
380    def test_maketrans_translate(self):
381        # these work with plain translate()
382        self.checkequalnofix('bbbc', 'abababc', 'translate',
383                             {ord('a'): None})
384        self.checkequalnofix('iiic', 'abababc', 'translate',
385                             {ord('a'): None, ord('b'): ord('i')})
386        self.checkequalnofix('iiix', 'abababc', 'translate',
387                             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
388        self.checkequalnofix('c', 'abababc', 'translate',
389                             {ord('a'): None, ord('b'): ''})
390        self.checkequalnofix('xyyx', 'xzx', 'translate',
391                             {ord('z'): 'yy'})
392
393        # this needs maketrans()
394        self.checkequalnofix('abababc', 'abababc', 'translate',
395                             {'b': '<i>'})
396        tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
397        self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
398        # test alternative way of calling maketrans()
399        tbl = self.type2test.maketrans('abc', 'xyz', 'd')
400        self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
401
402        # various tests switching from ASCII to latin1 or the opposite;
403        # same length, remove a letter, or replace with a longer string.
404        self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
405                         "[X]")
406        self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
407                         "[X]")
408        self.assertEqual("[a]".translate(str.maketrans({'a': None})),
409                         "[]")
410        self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
411                         "[XXX]")
412        self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
413                         "[\xe9]")
414        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
415                         "x123")
416        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
417                         "x\xe9")
418
419        # test non-ASCII (don't take the fast-path)
420        self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
421                         "[<\xe9>]")
422        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
423                         "[a]")
424        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
425                         "[]")
426        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
427                         "[123]")
428        self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
429                         "[<\u20ac>\xe9]")
430
431        # invalid Unicode characters
432        invalid_char = 0x10ffff+1
433        for before in "a\xe9\u20ac\U0010ffff":
434            mapping = str.maketrans({before: invalid_char})
435            text = "[%s]" % before
436            self.assertRaises(ValueError, text.translate, mapping)
437
438        # errors
439        self.assertRaises(TypeError, self.type2test.maketrans)
440        self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
441        self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
442        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
443        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
444        self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
445        self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
446
447        self.assertRaises(TypeError, 'hello'.translate)
448        self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
449
450    def test_split(self):
451        string_tests.CommonTest.test_split(self)
452
453        # test mixed kinds
454        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
455            left *= 9
456            right *= 9
457            for delim in ('c', '\u0102', '\U00010302'):
458                self.checkequal([left + right],
459                                left + right, 'split', delim)
460                self.checkequal([left, right],
461                                left + delim + right, 'split', delim)
462                self.checkequal([left + right],
463                                left + right, 'split', delim * 2)
464                self.checkequal([left, right],
465                                left + delim * 2 + right, 'split', delim *2)
466
467    def test_rsplit(self):
468        string_tests.CommonTest.test_rsplit(self)
469        # test mixed kinds
470        for left, right in ('ba', 'юё', '\u0101\u0100', '\U00010301\U00010300'):
471            left *= 9
472            right *= 9
473            for delim in ('c', 'ы', '\u0102', '\U00010302'):
474                self.checkequal([left + right],
475                                left + right, 'rsplit', delim)
476                self.checkequal([left, right],
477                                left + delim + right, 'rsplit', delim)
478                self.checkequal([left + right],
479                                left + right, 'rsplit', delim * 2)
480                self.checkequal([left, right],
481                                left + delim * 2 + right, 'rsplit', delim *2)
482
483            # Check `None` as well:
484            self.checkequal([left + right],
485                             left + right, 'rsplit', None)
486
487    def test_partition(self):
488        string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
489        # test mixed kinds
490        self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
491        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
492            left *= 9
493            right *= 9
494            for delim in ('c', '\u0102', '\U00010302'):
495                self.checkequal((left + right, '', ''),
496                                left + right, 'partition', delim)
497                self.checkequal((left, delim, right),
498                                left + delim + right, 'partition', delim)
499                self.checkequal((left + right, '', ''),
500                                left + right, 'partition', delim * 2)
501                self.checkequal((left, delim * 2, right),
502                                left + delim * 2 + right, 'partition', delim * 2)
503
504    def test_rpartition(self):
505        string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
506        # test mixed kinds
507        self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
508        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
509            left *= 9
510            right *= 9
511            for delim in ('c', '\u0102', '\U00010302'):
512                self.checkequal(('', '', left + right),
513                                left + right, 'rpartition', delim)
514                self.checkequal((left, delim, right),
515                                left + delim + right, 'rpartition', delim)
516                self.checkequal(('', '', left + right),
517                                left + right, 'rpartition', delim * 2)
518                self.checkequal((left, delim * 2, right),
519                                left + delim * 2 + right, 'rpartition', delim * 2)
520
521    def test_join(self):
522        string_tests.MixinStrUnicodeUserStringTest.test_join(self)
523
524        class MyWrapper:
525            def __init__(self, sval): self.sval = sval
526            def __str__(self): return self.sval
527
528        # mixed arguments
529        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
530        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
531        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
532        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
533        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
534        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
535        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
536        self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
537        self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
538        self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
539        self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
540
541    @unittest.skipIf(sys.maxsize > 2**32,
542        'needs too much memory on a 64-bit platform')
543    def test_join_overflow(self):
544        size = int(sys.maxsize**0.5) + 1
545        seq = ('A' * size,) * size
546        self.assertRaises(OverflowError, ''.join, seq)
547
548    def test_replace(self):
549        string_tests.CommonTest.test_replace(self)
550
551        # method call forwarded from str implementation because of unicode argument
552        self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
553        self.assertRaises(TypeError, 'replace'.replace, "r", 42)
554        # test mixed kinds
555        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
556            left *= 9
557            right *= 9
558            for delim in ('c', '\u0102', '\U00010302'):
559                for repl in ('d', '\u0103', '\U00010303'):
560                    self.checkequal(left + right,
561                                    left + right, 'replace', delim, repl)
562                    self.checkequal(left + repl + right,
563                                    left + delim + right,
564                                    'replace', delim, repl)
565                    self.checkequal(left + right,
566                                    left + right, 'replace', delim * 2, repl)
567                    self.checkequal(left + repl + right,
568                                    left + delim * 2 + right,
569                                    'replace', delim * 2, repl)
570
571    @support.cpython_only
572    def test_replace_id(self):
573        pattern = 'abc'
574        text = 'abc def'
575        self.assertIs(text.replace(pattern, pattern), text)
576
577    def test_repeat_id_preserving(self):
578        a = '123abc1@'
579        b = '456zyx-+'
580        self.assertEqual(id(a), id(a))
581        self.assertNotEqual(id(a), id(b))
582        self.assertNotEqual(id(a), id(a * -4))
583        self.assertNotEqual(id(a), id(a * 0))
584        self.assertEqual(id(a), id(a * 1))
585        self.assertEqual(id(a), id(1 * a))
586        self.assertNotEqual(id(a), id(a * 2))
587
588        class SubStr(str):
589            pass
590
591        s = SubStr('qwerty()')
592        self.assertEqual(id(s), id(s))
593        self.assertNotEqual(id(s), id(s * -4))
594        self.assertNotEqual(id(s), id(s * 0))
595        self.assertNotEqual(id(s), id(s * 1))
596        self.assertNotEqual(id(s), id(1 * s))
597        self.assertNotEqual(id(s), id(s * 2))
598
599    def test_bytes_comparison(self):
600        with warnings_helper.check_warnings():
601            warnings.simplefilter('ignore', BytesWarning)
602            self.assertEqual('abc' == b'abc', False)
603            self.assertEqual('abc' != b'abc', True)
604            self.assertEqual('abc' == bytearray(b'abc'), False)
605            self.assertEqual('abc' != bytearray(b'abc'), True)
606
    def test_comparison(self):
        # Checks equality and ordering of a few strings.  The large
        # `if 0:` section in the middle never runs.
        # Comparisons:
        self.assertEqual('abc', 'abc')
        self.assertTrue('abcd' > 'abc')
        self.assertTrue('abc' < 'abcd')

        # NOTE(review): everything under `if 0:` is dead code.  Note also
        # that the two test_fixup(...) calls at its end are indented
        # inside test_fixup's own body, so enabling the block as written
        # would never invoke the helper from outside itself.
        if 0:
            # Move these tests to a Unicode collation module test...
            # Testing UTF-16 code point order comparisons...

            # No surrogates, no fixup required.
            self.assertTrue('\u0061' < '\u20ac')
            # Non surrogate below surrogate value, no fixup required
            self.assertTrue('\u0061' < '\ud800\udc02')

            # Non surrogate above surrogate value, fixup required
            def test_lecmp(s, s2):
                self.assertTrue(s < s2)

            def test_fixup(s):
                s2 = '\ud800\udc01'
                test_lecmp(s, s2)
                s2 = '\ud900\udc01'
                test_lecmp(s, s2)
                s2 = '\uda00\udc01'
                test_lecmp(s, s2)
                s2 = '\udb00\udc01'
                test_lecmp(s, s2)
                s2 = '\ud800\udd01'
                test_lecmp(s, s2)
                s2 = '\ud900\udd01'
                test_lecmp(s, s2)
                s2 = '\uda00\udd01'
                test_lecmp(s, s2)
                s2 = '\udb00\udd01'
                test_lecmp(s, s2)
                s2 = '\ud800\ude01'
                test_lecmp(s, s2)
                s2 = '\ud900\ude01'
                test_lecmp(s, s2)
                s2 = '\uda00\ude01'
                test_lecmp(s, s2)
                s2 = '\udb00\ude01'
                test_lecmp(s, s2)
                s2 = '\ud800\udfff'
                test_lecmp(s, s2)
                s2 = '\ud900\udfff'
                test_lecmp(s, s2)
                s2 = '\uda00\udfff'
                test_lecmp(s, s2)
                s2 = '\udb00\udfff'
                test_lecmp(s, s2)

                test_fixup('\ue000')
                test_fixup('\uff61')

        # Surrogates on both sides, no fixup required
        self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
665
666    def test_islower(self):
667        super().test_islower()
668        self.checkequalnofix(False, '\u1FFc', 'islower')
669        self.assertFalse('\u2167'.islower())
670        self.assertTrue('\u2177'.islower())
671        # non-BMP, uppercase
672        self.assertFalse('\U00010401'.islower())
673        self.assertFalse('\U00010427'.islower())
674        # non-BMP, lowercase
675        self.assertTrue('\U00010429'.islower())
676        self.assertTrue('\U0001044E'.islower())
677        # non-BMP, non-cased
678        self.assertFalse('\U0001F40D'.islower())
679        self.assertFalse('\U0001F46F'.islower())
680
681    def test_isupper(self):
682        super().test_isupper()
683        if not sys.platform.startswith('java'):
684            self.checkequalnofix(False, '\u1FFc', 'isupper')
685        self.assertTrue('\u2167'.isupper())
686        self.assertFalse('\u2177'.isupper())
687        # non-BMP, uppercase
688        self.assertTrue('\U00010401'.isupper())
689        self.assertTrue('\U00010427'.isupper())
690        # non-BMP, lowercase
691        self.assertFalse('\U00010429'.isupper())
692        self.assertFalse('\U0001044E'.isupper())
693        # non-BMP, non-cased
694        self.assertFalse('\U0001F40D'.isupper())
695        self.assertFalse('\U0001F46F'.isupper())
696
697    def test_istitle(self):
698        super().test_istitle()
699        self.checkequalnofix(True, '\u1FFc', 'istitle')
700        self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
701
702        # non-BMP, uppercase + lowercase
703        self.assertTrue('\U00010401\U00010429'.istitle())
704        self.assertTrue('\U00010427\U0001044E'.istitle())
705        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
706        for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
707            self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
708
709    def test_isspace(self):
710        super().test_isspace()
711        self.checkequalnofix(True, '\u2000', 'isspace')
712        self.checkequalnofix(True, '\u200a', 'isspace')
713        self.checkequalnofix(False, '\u2014', 'isspace')
714        # There are no non-BMP whitespace chars as of Unicode 12.
715        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
716                   '\U0001F40D', '\U0001F46F']:
717            self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
718
719    @support.requires_resource('cpu')
720    def test_isspace_invariant(self):
721        for codepoint in range(sys.maxunicode + 1):
722            char = chr(codepoint)
723            bidirectional = unicodedata.bidirectional(char)
724            category = unicodedata.category(char)
725            self.assertEqual(char.isspace(),
726                             (bidirectional in ('WS', 'B', 'S')
727                              or category == 'Zs'))
728
729    def test_isalnum(self):
730        super().test_isalnum()
731        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
732                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
733            self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
734
735    def test_isalpha(self):
736        super().test_isalpha()
737        self.checkequalnofix(True, '\u1FFc', 'isalpha')
738        # non-BMP, cased
739        self.assertTrue('\U00010401'.isalpha())
740        self.assertTrue('\U00010427'.isalpha())
741        self.assertTrue('\U00010429'.isalpha())
742        self.assertTrue('\U0001044E'.isalpha())
743        # non-BMP, non-cased
744        self.assertFalse('\U0001F40D'.isalpha())
745        self.assertFalse('\U0001F46F'.isalpha())
746
747    def test_isascii(self):
748        super().test_isascii()
749        self.assertFalse("\u20ac".isascii())
750        self.assertFalse("\U0010ffff".isascii())
751
752    def test_isdecimal(self):
753        self.checkequalnofix(False, '', 'isdecimal')
754        self.checkequalnofix(False, 'a', 'isdecimal')
755        self.checkequalnofix(True, '0', 'isdecimal')
756        self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
757        self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
758        self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
759        self.checkequalnofix(True, '0123456789', 'isdecimal')
760        self.checkequalnofix(False, '0123456789a', 'isdecimal')
761
762        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
763
764        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
765                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
766            self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
767        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
768            self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
769
770    def test_isdigit(self):
771        super().test_isdigit()
772        self.checkequalnofix(True, '\u2460', 'isdigit')
773        self.checkequalnofix(False, '\xbc', 'isdigit')
774        self.checkequalnofix(True, '\u0660', 'isdigit')
775
776        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
777                   '\U0001F40D', '\U0001F46F', '\U00011065']:
778            self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
779        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
780            self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
781
782    def test_isnumeric(self):
783        self.checkequalnofix(False, '', 'isnumeric')
784        self.checkequalnofix(False, 'a', 'isnumeric')
785        self.checkequalnofix(True, '0', 'isnumeric')
786        self.checkequalnofix(True, '\u2460', 'isnumeric')
787        self.checkequalnofix(True, '\xbc', 'isnumeric')
788        self.checkequalnofix(True, '\u0660', 'isnumeric')
789        self.checkequalnofix(True, '0123456789', 'isnumeric')
790        self.checkequalnofix(False, '0123456789a', 'isnumeric')
791
792        self.assertRaises(TypeError, "abc".isnumeric, 42)
793
794        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
795                   '\U0001F40D', '\U0001F46F']:
796            self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
797        for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
798                   '\U000104A0', '\U0001F107']:
799            self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
800
801    def test_isidentifier(self):
802        self.assertTrue("a".isidentifier())
803        self.assertTrue("Z".isidentifier())
804        self.assertTrue("_".isidentifier())
805        self.assertTrue("b0".isidentifier())
806        self.assertTrue("bc".isidentifier())
807        self.assertTrue("b_".isidentifier())
808        self.assertTrue("µ".isidentifier())
809        self.assertTrue("��������������".isidentifier())
810
811        self.assertFalse(" ".isidentifier())
812        self.assertFalse("[".isidentifier())
813        self.assertFalse("©".isidentifier())
814        self.assertFalse("0".isidentifier())
815
816    @support.cpython_only
817    @support.requires_legacy_unicode_capi
818    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
819    def test_isidentifier_legacy(self):
820        u = '��������������'
821        self.assertTrue(u.isidentifier())
822        with warnings_helper.check_warnings():
823            warnings.simplefilter('ignore', DeprecationWarning)
824            self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
825
826    def test_isprintable(self):
827        self.assertTrue("".isprintable())
828        self.assertTrue(" ".isprintable())
829        self.assertTrue("abcdefg".isprintable())
830        self.assertFalse("abcdefg\n".isprintable())
831        # some defined Unicode character
832        self.assertTrue("\u0374".isprintable())
833        # undefined character
834        self.assertFalse("\u0378".isprintable())
835        # single surrogate character
836        self.assertFalse("\ud800".isprintable())
837
838        self.assertTrue('\U0001F46F'.isprintable())
839        self.assertFalse('\U000E0020'.isprintable())
840
841    def test_surrogates(self):
842        for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
843                  'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
844            self.assertTrue(s.islower())
845            self.assertFalse(s.isupper())
846            self.assertFalse(s.istitle())
847        for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
848                  'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
849            self.assertFalse(s.islower())
850            self.assertTrue(s.isupper())
851            self.assertTrue(s.istitle())
852
853        for meth_name in ('islower', 'isupper', 'istitle'):
854            meth = getattr(str, meth_name)
855            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
856                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
857
858        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
859                          'isdecimal', 'isnumeric',
860                          'isidentifier', 'isprintable'):
861            meth = getattr(str, meth_name)
862            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
863                      'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
864                      'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
865                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
866
867
868    def test_lower(self):
869        string_tests.CommonTest.test_lower(self)
870        self.assertEqual('\U00010427'.lower(), '\U0001044F')
871        self.assertEqual('\U00010427\U00010427'.lower(),
872                         '\U0001044F\U0001044F')
873        self.assertEqual('\U00010427\U0001044F'.lower(),
874                         '\U0001044F\U0001044F')
875        self.assertEqual('X\U00010427x\U0001044F'.lower(),
876                         'x\U0001044Fx\U0001044F')
877        self.assertEqual('fi'.lower(), 'fi')
878        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
879        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
880        self.assertEqual('\u03a3'.lower(), '\u03c3')
881        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
882        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
883        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
884        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
885        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
886        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
887        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
888        self.assertEqual('\u2177'.lower(), '\u2177')
889
890    def test_casefold(self):
891        self.assertEqual('hello'.casefold(), 'hello')
892        self.assertEqual('hELlo'.casefold(), 'hello')
893        self.assertEqual('ß'.casefold(), 'ss')
894        self.assertEqual('fi'.casefold(), 'fi')
895        self.assertEqual('\u03a3'.casefold(), '\u03c3')
896        self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
897        self.assertEqual('\u00b5'.casefold(), '\u03bc')
898
899    def test_upper(self):
900        string_tests.CommonTest.test_upper(self)
901        self.assertEqual('\U0001044F'.upper(), '\U00010427')
902        self.assertEqual('\U0001044F\U0001044F'.upper(),
903                         '\U00010427\U00010427')
904        self.assertEqual('\U00010427\U0001044F'.upper(),
905                         '\U00010427\U00010427')
906        self.assertEqual('X\U00010427x\U0001044F'.upper(),
907                         'X\U00010427X\U00010427')
908        self.assertEqual('fi'.upper(), 'FI')
909        self.assertEqual('\u0130'.upper(), '\u0130')
910        self.assertEqual('\u03a3'.upper(), '\u03a3')
911        self.assertEqual('ß'.upper(), 'SS')
912        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
913        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
914        self.assertEqual('\u2177'.upper(), '\u2167')
915
916    def test_capitalize(self):
917        string_tests.CommonTest.test_capitalize(self)
918        self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
919        self.assertEqual('\U0001044F\U0001044F'.capitalize(),
920                         '\U00010427\U0001044F')
921        self.assertEqual('\U00010427\U0001044F'.capitalize(),
922                         '\U00010427\U0001044F')
923        self.assertEqual('\U0001044F\U00010427'.capitalize(),
924                         '\U00010427\U0001044F')
925        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
926                         'X\U0001044Fx\U0001044F')
927        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
928        exp = '\u0399\u0308\u0300\u0069\u0307'
929        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
930        self.assertEqual('finnish'.capitalize(), 'Finnish')
931        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
932
933    def test_title(self):
934        super().test_title()
935        self.assertEqual('\U0001044F'.title(), '\U00010427')
936        self.assertEqual('\U0001044F\U0001044F'.title(),
937                         '\U00010427\U0001044F')
938        self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
939                         '\U00010427\U0001044F \U00010427\U0001044F')
940        self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
941                         '\U00010427\U0001044F \U00010427\U0001044F')
942        self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
943                         '\U00010427\U0001044F \U00010427\U0001044F')
944        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
945                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
946        self.assertEqual('fiNNISH'.title(), 'Finnish')
947        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
948        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
949
950    def test_swapcase(self):
951        string_tests.CommonTest.test_swapcase(self)
952        self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
953        self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
954        self.assertEqual('\U0001044F\U0001044F'.swapcase(),
955                         '\U00010427\U00010427')
956        self.assertEqual('\U00010427\U0001044F'.swapcase(),
957                         '\U0001044F\U00010427')
958        self.assertEqual('\U0001044F\U00010427'.swapcase(),
959                         '\U00010427\U0001044F')
960        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
961                         'x\U0001044FX\U00010427')
962        self.assertEqual('fi'.swapcase(), 'FI')
963        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
964        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
965        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
966        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
967        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
968        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
969        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
970        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
971        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
972        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
973        self.assertEqual('ß'.swapcase(), 'SS')
974        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
975
976    def test_center(self):
977        string_tests.CommonTest.test_center(self)
978        self.assertEqual('x'.center(2, '\U0010FFFF'),
979                         'x\U0010FFFF')
980        self.assertEqual('x'.center(3, '\U0010FFFF'),
981                         '\U0010FFFFx\U0010FFFF')
982        self.assertEqual('x'.center(4, '\U0010FFFF'),
983                         '\U0010FFFFx\U0010FFFF\U0010FFFF')
984
985    @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
986    @support.cpython_only
987    def test_case_operation_overflow(self):
988        # Issue #22643
989        size = 2**32//12 + 1
990        try:
991            s = "ü" * size
992        except MemoryError:
993            self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
994        try:
995            self.assertRaises(OverflowError, s.upper)
996        finally:
997            del s
998
999    def test_contains(self):
1000        # Testing Unicode contains method
1001        self.assertIn('a', 'abdb')
1002        self.assertIn('a', 'bdab')
1003        self.assertIn('a', 'bdaba')
1004        self.assertIn('a', 'bdba')
1005        self.assertNotIn('a', 'bdb')
1006        self.assertIn('a', 'bdba')
1007        self.assertIn('a', ('a',1,None))
1008        self.assertIn('a', (1,None,'a'))
1009        self.assertIn('a', ('a',1,None))
1010        self.assertIn('a', (1,None,'a'))
1011        self.assertNotIn('a', ('x',1,'y'))
1012        self.assertNotIn('a', ('x',1,None))
1013        self.assertNotIn('abcd', 'abcxxxx')
1014        self.assertIn('ab', 'abcd')
1015        self.assertIn('ab', 'abc')
1016        self.assertIn('ab', (1,None,'ab'))
1017        self.assertIn('', 'abc')
1018        self.assertIn('', '')
1019        self.assertIn('', 'abc')
1020        self.assertNotIn('\0', 'abc')
1021        self.assertIn('\0', '\0abc')
1022        self.assertIn('\0', 'abc\0')
1023        self.assertIn('a', '\0abc')
1024        self.assertIn('asdf', 'asdf')
1025        self.assertNotIn('asdf', 'asd')
1026        self.assertNotIn('asdf', '')
1027
1028        self.assertRaises(TypeError, "abc".__contains__)
1029        # test mixed kinds
1030        for fill in ('a', '\u0100', '\U00010300'):
1031            fill *= 9
1032            for delim in ('c', '\u0102', '\U00010302'):
1033                self.assertNotIn(delim, fill)
1034                self.assertIn(delim, fill + delim)
1035                self.assertNotIn(delim * 2, fill)
1036                self.assertIn(delim * 2, fill + delim * 2)
1037
1038    def test_issue18183(self):
1039        '\U00010000\U00100000'.lower()
1040        '\U00010000\U00100000'.casefold()
1041        '\U00010000\U00100000'.upper()
1042        '\U00010000\U00100000'.capitalize()
1043        '\U00010000\U00100000'.title()
1044        '\U00010000\U00100000'.swapcase()
1045        '\U00100000'.center(3, '\U00010000')
1046        '\U00100000'.ljust(3, '\U00010000')
1047        '\U00100000'.rjust(3, '\U00010000')
1048
1049    def test_format(self):
1050        self.assertEqual(''.format(), '')
1051        self.assertEqual('a'.format(), 'a')
1052        self.assertEqual('ab'.format(), 'ab')
1053        self.assertEqual('a{{'.format(), 'a{')
1054        self.assertEqual('a}}'.format(), 'a}')
1055        self.assertEqual('{{b'.format(), '{b')
1056        self.assertEqual('}}b'.format(), '}b')
1057        self.assertEqual('a{{b'.format(), 'a{b')
1058
1059        # examples from the PEP:
1060        import datetime
1061        self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
1062        self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
1063                         "My name is Fred")
1064        self.assertEqual("My name is {0} :-{{}}".format('Fred'),
1065                         "My name is Fred :-{}")
1066
1067        d = datetime.date(2007, 8, 18)
1068        self.assertEqual("The year is {0.year}".format(d),
1069                         "The year is 2007")
1070
1071        # classes we'll use for testing
1072        class C:
1073            def __init__(self, x=100):
1074                self._x = x
1075            def __format__(self, spec):
1076                return spec
1077
1078        class D:
1079            def __init__(self, x):
1080                self.x = x
1081            def __format__(self, spec):
1082                return str(self.x)
1083
1084        # class with __str__, but no __format__
1085        class E:
1086            def __init__(self, x):
1087                self.x = x
1088            def __str__(self):
1089                return 'E(' + self.x + ')'
1090
1091        # class with __repr__, but no __format__ or __str__
1092        class F:
1093            def __init__(self, x):
1094                self.x = x
1095            def __repr__(self):
1096                return 'F(' + self.x + ')'
1097
1098        # class with __format__ that forwards to string, for some format_spec's
1099        class G:
1100            def __init__(self, x):
1101                self.x = x
1102            def __str__(self):
1103                return "string is " + self.x
1104            def __format__(self, format_spec):
1105                if format_spec == 'd':
1106                    return 'G(' + self.x + ')'
1107                return object.__format__(self, format_spec)
1108
1109        class I(datetime.date):
1110            def __format__(self, format_spec):
1111                return self.strftime(format_spec)
1112
1113        class J(int):
1114            def __format__(self, format_spec):
1115                return int.__format__(self * 2, format_spec)
1116
1117        class M:
1118            def __init__(self, x):
1119                self.x = x
1120            def __repr__(self):
1121                return 'M(' + self.x + ')'
1122            __str__ = None
1123
1124        class N:
1125            def __init__(self, x):
1126                self.x = x
1127            def __repr__(self):
1128                return 'N(' + self.x + ')'
1129            __format__ = None
1130
1131        self.assertEqual(''.format(), '')
1132        self.assertEqual('abc'.format(), 'abc')
1133        self.assertEqual('{0}'.format('abc'), 'abc')
1134        self.assertEqual('{0:}'.format('abc'), 'abc')
1135#        self.assertEqual('{ 0 }'.format('abc'), 'abc')
1136        self.assertEqual('X{0}'.format('abc'), 'Xabc')
1137        self.assertEqual('{0}X'.format('abc'), 'abcX')
1138        self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
1139        self.assertEqual('{1}'.format(1, 'abc'), 'abc')
1140        self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
1141        self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
1142        self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
1143        self.assertEqual('{0}'.format(-15), '-15')
1144        self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
1145        self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
1146        self.assertEqual('{{'.format(), '{')
1147        self.assertEqual('}}'.format(), '}')
1148        self.assertEqual('{{}}'.format(), '{}')
1149        self.assertEqual('{{x}}'.format(), '{x}')
1150        self.assertEqual('{{{0}}}'.format(123), '{123}')
1151        self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1152        self.assertEqual('}}{{'.format(), '}{')
1153        self.assertEqual('}}x{{'.format(), '}x{')
1154
1155        # weird field names
1156        self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1157        self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
1158        self.assertEqual("{0[ ]}".format({' ':3}), '3')
1159
1160        self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1161        self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1162        self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1163        self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1164        self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1165        self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1166        self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1167
1168        # strings
1169        self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1170        self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1171        self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1172        self.assertEqual('{0:.0s}'.format('abcdef'), '')
1173        self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1174        self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1175        self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1176        self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1177        self.assertEqual('{0:x<0s}'.format('result'), 'result')
1178        self.assertEqual('{0:x<5s}'.format('result'), 'result')
1179        self.assertEqual('{0:x<6s}'.format('result'), 'result')
1180        self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1181        self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1182        self.assertEqual('{0: <7s}'.format('result'), 'result ')
1183        self.assertEqual('{0:<7s}'.format('result'), 'result ')
1184        self.assertEqual('{0:>7s}'.format('result'), ' result')
1185        self.assertEqual('{0:>8s}'.format('result'), '  result')
1186        self.assertEqual('{0:^8s}'.format('result'), ' result ')
1187        self.assertEqual('{0:^9s}'.format('result'), ' result  ')
1188        self.assertEqual('{0:^10s}'.format('result'), '  result  ')
1189        self.assertEqual('{0:8s}'.format('result'), 'result  ')
1190        self.assertEqual('{0:0s}'.format('result'), 'result')
1191        self.assertEqual('{0:08s}'.format('result'), 'result00')
1192        self.assertEqual('{0:<08s}'.format('result'), 'result00')
1193        self.assertEqual('{0:>08s}'.format('result'), '00result')
1194        self.assertEqual('{0:^08s}'.format('result'), '0result0')
1195        self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1196        self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1197        self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1198
1199        # issue 12546: use \x00 as a fill character
1200        self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1201        self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1202        self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1203        self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')
1204
1205        self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1206        self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1207        self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1208        self.assertEqual('{0:<6}'.format(3), '3     ')
1209
1210        self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1211        self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1212        self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1213        self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1214
1215        self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1216        self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1217        self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1218        self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')
1219
1220        # format specifiers for user defined type
1221        self.assertEqual('{0:abc}'.format(C()), 'abc')
1222
1223        # !r, !s and !a coercions
1224        self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1225        self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1226        self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
1227        self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
1228        self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1229        self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1230        self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
1231        self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
1232        self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
1233        self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
1234        self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
1235        self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
1236        self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
1237        self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1238        self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
1239        self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
1240
1241        # test fallback to object.__format__
1242        self.assertEqual('{0}'.format({}), '{}')
1243        self.assertEqual('{0}'.format([]), '[]')
1244        self.assertEqual('{0}'.format([1]), '[1]')
1245
1246        self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
1247        self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1248
1249        self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1250        self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1251        self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
1252
1253        self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1254                                                       month=8,
1255                                                       day=27)),
1256                         "date: 2007-08-27")
1257
1258        # test deriving from a builtin type and overriding __format__
1259        self.assertEqual("{0}".format(J(10)), "20")
1260
1261
1262        # string format specifiers
1263        self.assertEqual('{0:}'.format('a'), 'a')
1264
1265        # computed format specifiers
1266        self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1267        self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1268        self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1269        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
1270        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')
1271
1272        # test various errors
1273        self.assertRaises(ValueError, '{'.format)
1274        self.assertRaises(ValueError, '}'.format)
1275        self.assertRaises(ValueError, 'a{'.format)
1276        self.assertRaises(ValueError, 'a}'.format)
1277        self.assertRaises(ValueError, '{a'.format)
1278        self.assertRaises(ValueError, '}a'.format)
1279        self.assertRaises(IndexError, '{0}'.format)
1280        self.assertRaises(IndexError, '{1}'.format, 'abc')
1281        self.assertRaises(KeyError,   '{x}'.format)
1282        self.assertRaises(ValueError, "}{".format)
1283        self.assertRaises(ValueError, "abc{0:{}".format)
1284        self.assertRaises(ValueError, "{0".format)
1285        self.assertRaises(IndexError, "{0.}".format)
1286        self.assertRaises(ValueError, "{0.}".format, 0)
1287        self.assertRaises(ValueError, "{0[}".format)
1288        self.assertRaises(ValueError, "{0[}".format, [])
1289        self.assertRaises(KeyError,   "{0]}".format)
1290        self.assertRaises(ValueError, "{0.[]}".format, 0)
1291        self.assertRaises(ValueError, "{0..foo}".format, 0)
1292        self.assertRaises(ValueError, "{0[0}".format, 0)
1293        self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1294        self.assertRaises(KeyError,   "{c]}".format)
1295        self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1296        self.assertRaises(ValueError, "{0}}".format, 0)
1297        self.assertRaises(KeyError,   "{foo}".format, bar=3)
1298        self.assertRaises(ValueError, "{0!x}".format, 3)
1299        self.assertRaises(ValueError, "{0!}".format, 0)
1300        self.assertRaises(ValueError, "{0!rs}".format, 0)
1301        self.assertRaises(ValueError, "{!}".format)
1302        self.assertRaises(IndexError, "{:}".format)
1303        self.assertRaises(IndexError, "{:s}".format)
1304        self.assertRaises(IndexError, "{}".format)
1305        big = "23098475029384702983476098230754973209482573"
1306        self.assertRaises(ValueError, ("{" + big + "}").format)
1307        self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
1308
1309        # test number formatter errors:
1310        self.assertRaises(ValueError, '{0:x}'.format, 1j)
1311        self.assertRaises(ValueError, '{0:x}'.format, 1.0)
1312        self.assertRaises(ValueError, '{0:X}'.format, 1j)
1313        self.assertRaises(ValueError, '{0:X}'.format, 1.0)
1314        self.assertRaises(ValueError, '{0:o}'.format, 1j)
1315        self.assertRaises(ValueError, '{0:o}'.format, 1.0)
1316        self.assertRaises(ValueError, '{0:u}'.format, 1j)
1317        self.assertRaises(ValueError, '{0:u}'.format, 1.0)
1318        self.assertRaises(ValueError, '{0:i}'.format, 1j)
1319        self.assertRaises(ValueError, '{0:i}'.format, 1.0)
1320        self.assertRaises(ValueError, '{0:d}'.format, 1j)
1321        self.assertRaises(ValueError, '{0:d}'.format, 1.0)
1322
1323        # issue 6089
1324        self.assertRaises(ValueError, "{0[0]x}".format, [None])
1325        self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1326
1327        # can't have a replacement on the field name portion
1328        self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1329
1330        # exceed maximum recursion depth
1331        self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1332        self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1333                          0, 1, 2, 3, 4, 5, 6, 7)
1334
1335        # string format spec errors
1336        sign_msg = "Sign not allowed in string format specifier"
1337        self.assertRaisesRegex(ValueError, sign_msg, "{0:-s}".format, '')
1338        self.assertRaisesRegex(ValueError, sign_msg, format, "", "-")
1339        space_msg = "Space not allowed in string format specifier"
1340        self.assertRaisesRegex(ValueError, space_msg, "{: }".format, '')
1341        self.assertRaises(ValueError, "{0:=s}".format, '')
1342
1343        # Alternate formatting is not supported
1344        self.assertRaises(ValueError, format, '', '#')
1345        self.assertRaises(ValueError, format, '', '#20')
1346
1347        # Non-ASCII
1348        self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1349                         'ABC\u0410\u0411\u0412')
1350        self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1351                         'ABC')
1352        self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1353                         '')
1354
1355        self.assertEqual("{[{}]}".format({"{}": 5}), "5")
1356        self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1357        self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1358        self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1359        self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1360        self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1361        self.assertRaises(ValueError, "{a{}b}".format, 42)
1362        self.assertRaises(ValueError, "{a{b}".format, 42)
1363        self.assertRaises(ValueError, "{[}".format, 42)
1364
1365        self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
1366
1367        # Blocking fallback
1368        m = M('data')
1369        self.assertEqual("{!r}".format(m), 'M(data)')
1370        self.assertRaises(TypeError, "{!s}".format, m)
1371        self.assertRaises(TypeError, "{}".format, m)
1372        n = N('data')
1373        self.assertEqual("{!r}".format(n), 'N(data)')
1374        self.assertEqual("{!s}".format(n), 'N(data)')
1375        self.assertRaises(TypeError, "{}".format, n)
1376
1377    def test_format_map(self):
1378        self.assertEqual(''.format_map({}), '')
1379        self.assertEqual('a'.format_map({}), 'a')
1380        self.assertEqual('ab'.format_map({}), 'ab')
1381        self.assertEqual('a{{'.format_map({}), 'a{')
1382        self.assertEqual('a}}'.format_map({}), 'a}')
1383        self.assertEqual('{{b'.format_map({}), '{b')
1384        self.assertEqual('}}b'.format_map({}), '}b')
1385        self.assertEqual('a{{b'.format_map({}), 'a{b')
1386
1387        # using mappings
1388        class Mapping(dict):
1389            def __missing__(self, key):
1390                return key
1391        self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1392        self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1393
1394        class InternalMapping:
1395            def __init__(self):
1396                self.mapping = {'a': 'hello'}
1397            def __getitem__(self, key):
1398                return self.mapping[key]
1399        self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1400
1401
1402        class C:
1403            def __init__(self, x=100):
1404                self._x = x
1405            def __format__(self, spec):
1406                return spec
1407        self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1408
1409        # test various errors
1410        self.assertRaises(TypeError, ''.format_map)
1411        self.assertRaises(TypeError, 'a'.format_map)
1412
1413        self.assertRaises(ValueError, '{'.format_map, {})
1414        self.assertRaises(ValueError, '}'.format_map, {})
1415        self.assertRaises(ValueError, 'a{'.format_map, {})
1416        self.assertRaises(ValueError, 'a}'.format_map, {})
1417        self.assertRaises(ValueError, '{a'.format_map, {})
1418        self.assertRaises(ValueError, '}a'.format_map, {})
1419
1420        # issue #12579: can't supply positional params to format_map
1421        self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1422        self.assertRaises(ValueError, '{}'.format_map, 'a')
1423        self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1424
1425        class BadMapping:
1426            def __getitem__(self, key):
1427                return 1/0
1428        self.assertRaises(KeyError, '{a}'.format_map, {})
1429        self.assertRaises(TypeError, '{a}'.format_map, [])
1430        self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1431
1432    def test_format_huge_precision(self):
1433        format_string = ".{}f".format(sys.maxsize + 1)
1434        with self.assertRaises(ValueError):
1435            result = format(2.34, format_string)
1436
1437    def test_format_huge_width(self):
1438        format_string = "{}f".format(sys.maxsize + 1)
1439        with self.assertRaises(ValueError):
1440            result = format(2.34, format_string)
1441
1442    def test_format_huge_item_number(self):
1443        format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1444        with self.assertRaises(ValueError):
1445            result = format_string.format(2.34)
1446
1447    def test_format_auto_numbering(self):
1448        class C:
1449            def __init__(self, x=100):
1450                self._x = x
1451            def __format__(self, spec):
1452                return spec
1453
1454        self.assertEqual('{}'.format(10), '10')
1455        self.assertEqual('{:5}'.format('s'), 's    ')
1456        self.assertEqual('{!r}'.format('s'), "'s'")
1457        self.assertEqual('{._x}'.format(C(10)), '10')
1458        self.assertEqual('{[1]}'.format([1, 2]), '2')
1459        self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1460        self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1461
1462        self.assertEqual('a{:{}}b'.format('x', '^10'), 'a    x     b')
1463        self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1464
1465        # can't mix and match numbering and auto-numbering
1466        self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1467        self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1468        self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1469        self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1470
1471        # can mix and match auto-numbering and named
1472        self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1473        self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1474        self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1475        self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1476
1477    def test_formatting(self):
1478        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
1479        # Testing Unicode formatting strings...
1480        self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1481        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
1482        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
1483        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
1484        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
1485        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
1486        if not sys.platform.startswith('java'):
1487            self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
1488            self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1489            self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
1490        self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1491        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
1492
1493        self.assertEqual('%c' % 0x1234, '\u1234')
1494        self.assertEqual('%c' % 0x21483, '\U00021483')
1495        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1496        self.assertEqual('%c' % '\U00021483', '\U00021483')
1497        self.assertRaises(TypeError, "%c".__mod__, "aa")
1498        self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
1499        self.assertRaises(TypeError, "%i".__mod__, "aa")
1500
1501        # formatting jobs delegated from the string implementation:
1502        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1503        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1504        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1505        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1506        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123},  '...abc...')
1507        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1508        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1509        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1510        self.assertEqual('...%s...' % "abc", '...abc...')
1511        self.assertEqual('%*s' % (5,'abc',), '  abc')
1512        self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
1513        self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
1514        self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
1515        self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
1516        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
1517        self.assertEqual('%c' % 'a', 'a')
1518        class Wrapper:
1519            def __str__(self):
1520                return '\u1234'
1521        self.assertEqual('%s' % Wrapper(), '\u1234')
1522
1523        # issue 3382
1524        NAN = float('nan')
1525        INF = float('inf')
1526        self.assertEqual('%f' % NAN, 'nan')
1527        self.assertEqual('%F' % NAN, 'NAN')
1528        self.assertEqual('%f' % INF, 'inf')
1529        self.assertEqual('%F' % INF, 'INF')
1530
1531        # PEP 393
1532        self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1533        self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1534
1535        #issue 19995
1536        class PseudoInt:
1537            def __init__(self, value):
1538                self.value = int(value)
1539            def __int__(self):
1540                return self.value
1541            def __index__(self):
1542                return self.value
1543        class PseudoFloat:
1544            def __init__(self, value):
1545                self.value = float(value)
1546            def __int__(self):
1547                return int(self.value)
1548        pi = PseudoFloat(3.1415)
1549        letter_m = PseudoInt(109)
1550        self.assertEqual('%x' % 42, '2a')
1551        self.assertEqual('%X' % 15, 'F')
1552        self.assertEqual('%o' % 9, '11')
1553        self.assertEqual('%c' % 109, 'm')
1554        self.assertEqual('%x' % letter_m, '6d')
1555        self.assertEqual('%X' % letter_m, '6D')
1556        self.assertEqual('%o' % letter_m, '155')
1557        self.assertEqual('%c' % letter_m, 'm')
1558        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14)
1559        self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11)
1560        self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79)
1561        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi)
1562        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not complex', operator.mod, '%x', 3j)
1563        self.assertRaisesRegex(TypeError, '%X format: an integer is required, not complex', operator.mod, '%X', 2j)
1564        self.assertRaisesRegex(TypeError, '%o format: an integer is required, not complex', operator.mod, '%o', 1j)
1565        self.assertRaisesRegex(TypeError, '%u format: a real number is required, not complex', operator.mod, '%u', 3j)
1566        self.assertRaisesRegex(TypeError, '%i format: a real number is required, not complex', operator.mod, '%i', 2j)
1567        self.assertRaisesRegex(TypeError, '%d format: a real number is required, not complex', operator.mod, '%d', 1j)
1568        self.assertRaisesRegex(TypeError, '%c requires int or char', operator.mod, '%c', pi)
1569
1570        class RaisingNumber:
1571            def __int__(self):
1572                raise RuntimeError('int')  # should not be `TypeError`
1573            def __index__(self):
1574                raise RuntimeError('index')  # should not be `TypeError`
1575
1576        rn = RaisingNumber()
1577        self.assertRaisesRegex(RuntimeError, 'int', operator.mod, '%d', rn)
1578        self.assertRaisesRegex(RuntimeError, 'int', operator.mod, '%i', rn)
1579        self.assertRaisesRegex(RuntimeError, 'int', operator.mod, '%u', rn)
1580        self.assertRaisesRegex(RuntimeError, 'index', operator.mod, '%x', rn)
1581        self.assertRaisesRegex(RuntimeError, 'index', operator.mod, '%X', rn)
1582        self.assertRaisesRegex(RuntimeError, 'index', operator.mod, '%o', rn)
1583
1584    def test_formatting_with_enum(self):
1585        # issue18780
1586        import enum
1587        class Float(float, enum.Enum):
1588            # a mixed-in type will use the name for %s etc.
1589            PI = 3.1415926
1590        class Int(enum.IntEnum):
1591            # IntEnum uses the value and not the name for %s etc.
1592            IDES = 15
1593        class Str(enum.StrEnum):
1594            # StrEnum uses the value and not the name for %s etc.
1595            ABC = 'abc'
1596        # Testing Unicode formatting strings...
1597        self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1598                         'abc, abc')
1599        self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1600                        (Str.ABC, Str.ABC,
1601                         Int.IDES, Int.IDES, Int.IDES,
1602                         Float.PI, Float.PI),
1603                         'abc, abc, 15, 15, 15, 3.141593,  3.14')
1604
1605        # formatting jobs delegated from the string implementation:
1606        self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1607                         '...abc...')
1608        self.assertEqual('...%(foo)r...' % {'foo':Int.IDES},
1609                         '...<Int.IDES: 15>...')
1610        self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1611                         '...15...')
1612        self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1613                         '...15...')
1614        self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1615                         '...15...')
1616        self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1617                         '...15...')
1618        self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1619                         '...3.141593...')
1620
1621    def test_formatting_huge_precision(self):
1622        format_string = "%.{}f".format(sys.maxsize + 1)
1623        with self.assertRaises(ValueError):
1624            result = format_string % 2.34
1625
1626    def test_issue28598_strsubclass_rhs(self):
1627        # A subclass of str with an __rmod__ method should be able to hook
1628        # into the % operator
1629        class SubclassedStr(str):
1630            def __rmod__(self, other):
1631                return 'Success, self.__rmod__({!r}) was called'.format(other)
1632        self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1633                         "Success, self.__rmod__('lhs %% %r') was called")
1634
1635    @support.cpython_only
1636    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
1637    def test_formatting_huge_precision_c_limits(self):
1638        format_string = "%.{}f".format(_testcapi.INT_MAX + 1)
1639        with self.assertRaises(ValueError):
1640            result = format_string % 2.34
1641
1642    def test_formatting_huge_width(self):
1643        format_string = "%{}f".format(sys.maxsize + 1)
1644        with self.assertRaises(ValueError):
1645            result = format_string % 2.34
1646
1647    def test_startswith_endswith_errors(self):
1648        for meth in ('foo'.startswith, 'foo'.endswith):
1649            with self.assertRaises(TypeError) as cm:
1650                meth(['f'])
1651            exc = str(cm.exception)
1652            self.assertIn('str', exc)
1653            self.assertIn('tuple', exc)
1654
1655    @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
1656    def test_format_float(self):
1657        # should not format with a comma, but always with C locale
1658        self.assertEqual('1.0', '%.1f' % 1.0)
1659
1660    def test_constructor(self):
1661        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1662
1663        self.assertEqual(
1664            str('unicode remains unicode'),
1665            'unicode remains unicode'
1666        )
1667
1668        for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1669            subclass = StrSubclass(text)
1670            self.assertEqual(str(subclass), text)
1671            self.assertEqual(len(subclass), len(text))
1672            if text == 'ascii':
1673                self.assertEqual(subclass.encode('ascii'), b'ascii')
1674                self.assertEqual(subclass.encode('utf-8'), b'ascii')
1675
1676        self.assertEqual(
1677            str('strings are converted to unicode'),
1678            'strings are converted to unicode'
1679        )
1680
1681        class StringCompat:
1682            def __init__(self, x):
1683                self.x = x
1684            def __str__(self):
1685                return self.x
1686
1687        self.assertEqual(
1688            str(StringCompat('__str__ compatible objects are recognized')),
1689            '__str__ compatible objects are recognized'
1690        )
1691
1692        # unicode(obj) is compatible to str():
1693
1694        o = StringCompat('unicode(obj) is compatible to str()')
1695        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1696        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1697
1698        for obj in (123, 123.45, 123):
1699            self.assertEqual(str(obj), str(str(obj)))
1700
1701        # unicode(obj, encoding, error) tests (this maps to
1702        # PyUnicode_FromEncodedObject() at C level)
1703
1704        if not sys.platform.startswith('java'):
1705            self.assertRaises(
1706                TypeError,
1707                str,
1708                'decoding unicode is not supported',
1709                'utf-8',
1710                'strict'
1711            )
1712
1713        self.assertEqual(
1714            str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1715            'strings are decoded to unicode'
1716        )
1717
1718        if not sys.platform.startswith('java'):
1719            self.assertEqual(
1720                str(
1721                    memoryview(b'character buffers are decoded to unicode'),
1722                    'utf-8',
1723                    'strict'
1724                ),
1725                'character buffers are decoded to unicode'
1726            )
1727
1728        self.assertRaises(TypeError, str, 42, 42, 42)
1729
1730    def test_constructor_keyword_args(self):
1731        """Pass various keyword argument combinations to the constructor."""
1732        # The object argument can be passed as a keyword.
1733        self.assertEqual(str(object='foo'), 'foo')
1734        self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1735        # The errors argument without encoding triggers "decode" mode.
1736        self.assertEqual(str(b'foo', errors='strict'), 'foo')  # not "b'foo'"
1737        self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1738
1739    def test_constructor_defaults(self):
1740        """Check the constructor argument defaults."""
1741        # The object argument defaults to '' or b''.
1742        self.assertEqual(str(), '')
1743        self.assertEqual(str(errors='strict'), '')
1744        utf8_cent = '¢'.encode('utf-8')
1745        # The encoding argument defaults to utf-8.
1746        self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1747        # The errors argument defaults to strict.
1748        self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1749
1750    def test_codecs_utf7(self):
1751        utfTests = [
1752            ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
1753            ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
1754            ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
1755            ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1756            ('+', b'+-'),
1757            ('+-', b'+--'),
1758            ('+?', b'+-?'),
1759            (r'\?', b'+AFw?'),
1760            ('+?', b'+-?'),
1761            (r'\\?', b'+AFwAXA?'),
1762            (r'\\\?', b'+AFwAXABc?'),
1763            (r'++--', b'+-+---'),
1764            ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
1765            ('/', b'/'),
1766        ]
1767
1768        for (x, y) in utfTests:
1769            self.assertEqual(x.encode('utf-7'), y)
1770
1771        # Unpaired surrogates are passed through
1772        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1773        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1774        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1775        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1776        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1777        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1778        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1779        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1780
1781        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1782        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1783
1784        # Issue #2242: crash on some Windows/MSVC versions
1785        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
1786
1787        # Direct encoded characters
1788        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1789        # Optional direct characters
1790        set_o = '!"#$%&*;<=>@[]^_`{|}'
1791        for c in set_d:
1792            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1793            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1794        for c in set_o:
1795            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1796
1797        with self.assertRaisesRegex(UnicodeDecodeError,
1798                                    'ill-formed sequence'):
1799            b'+@'.decode('utf-7')
1800
1801    def test_codecs_utf8(self):
1802        self.assertEqual(''.encode('utf-8'), b'')
1803        self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1804        self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1805        self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1806        self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1807        self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1808        self.assertEqual(('\U00010002'*10).encode('utf-8'),
1809                         b'\xf0\x90\x80\x82'*10)
1810        self.assertEqual(
1811            '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1812            '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1813            '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1814            '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1815            '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1816            ' Nunstuck git und'.encode('utf-8'),
1817            b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1818            b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1819            b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1820            b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1821            b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1822            b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1823            b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1824            b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1825            b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1826            b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1827        )
1828
1829        # UTF-8 specific decoding tests
1830        self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1831        self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1832        self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1833
1834        # Other possible utf-8 test cases:
1835        # * strict decoding testing for all of the
1836        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
1837
1838    def test_utf8_decode_valid_sequences(self):
1839        sequences = [
1840            # single byte
1841            (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1842            # 2 bytes
1843            (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1844            # 3 bytes
1845            (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1846            (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1847            # 4 bytes
1848            (b'\xF0\x90\x80\x80', '\U00010000'),
1849            (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1850        ]
1851        for seq, res in sequences:
1852            self.assertEqual(seq.decode('utf-8'), res)
1853
1854
1855    def test_utf8_decode_invalid_sequences(self):
1856        # continuation bytes in a sequence of 2, 3, or 4 bytes
1857        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1858        # start bytes of a 2-byte sequence equivalent to code points < 0x7F
1859        invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1860        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
1861        invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1862        invalid_start_bytes = (
1863            continuation_bytes + invalid_2B_seq_start_bytes +
1864            invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1865        )
1866
1867        for byte in invalid_start_bytes:
1868            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1869
1870        for sb in invalid_2B_seq_start_bytes:
1871            for cb in continuation_bytes:
1872                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1873
1874        for sb in invalid_4B_seq_start_bytes:
1875            for cb1 in continuation_bytes[:3]:
1876                for cb3 in continuation_bytes[:3]:
1877                    self.assertRaises(UnicodeDecodeError,
1878                                      (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1879
1880        for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1881            self.assertRaises(UnicodeDecodeError,
1882                              (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1883            self.assertRaises(UnicodeDecodeError,
1884                              (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1885        # surrogates
1886        for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1887            self.assertRaises(UnicodeDecodeError,
1888                              (b'\xED'+cb+b'\x80').decode, 'utf-8')
1889            self.assertRaises(UnicodeDecodeError,
1890                              (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1891        for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1892            self.assertRaises(UnicodeDecodeError,
1893                              (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1894            self.assertRaises(UnicodeDecodeError,
1895                              (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1896        for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1897            self.assertRaises(UnicodeDecodeError,
1898                              (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1899            self.assertRaises(UnicodeDecodeError,
1900                              (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1901
1902    def test_issue8271(self):
1903        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1904        # only the start byte and the continuation byte(s) are now considered
1905        # invalid, instead of the number of bytes specified by the start byte.
1906        # See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1907        # table 3-8, Row 2) for more information about the algorithm used.
1908        FFFD = '\ufffd'
1909        sequences = [
1910            # invalid start bytes
1911            (b'\x80', FFFD), # continuation byte
1912            (b'\x80\x80', FFFD*2), # 2 continuation bytes
1913            (b'\xc0', FFFD),
1914            (b'\xc0\xc0', FFFD*2),
1915            (b'\xc1', FFFD),
1916            (b'\xc1\xc0', FFFD*2),
1917            (b'\xc0\xc1', FFFD*2),
1918            # with start byte of a 2-byte sequence
1919            (b'\xc2', FFFD), # only the start byte
1920            (b'\xc2\xc2', FFFD*2), # 2 start bytes
1921            (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1922            (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1923            # with start byte of a 3-byte sequence
1924            (b'\xe1', FFFD), # only the start byte
1925            (b'\xe1\xe1', FFFD*2), # 2 start bytes
1926            (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1927            (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1928            (b'\xe1\x80', FFFD), # only 1 continuation byte
1929            (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1930            (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1931            (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1932            (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1933            (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1934            (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1935            # with start byte of a 4-byte sequence
1936            (b'\xf1', FFFD), # only the start byte
1937            (b'\xf1\xf1', FFFD*2), # 2 start bytes
1938            (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1939            (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1940            (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1941            (b'\xf1\x80', FFFD), # only 1 continuation bytes
1942            (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1943            (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1944            (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1945            (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1946            (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1947            (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1948            (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1949            (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1950            (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1951            (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1952            (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1953            (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1954            # with invalid start byte of a 4-byte sequence (rfc2279)
1955            (b'\xf5', FFFD), # only the start byte
1956            (b'\xf5\xf5', FFFD*2), # 2 start bytes
1957            (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1958            (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1959            (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1960            (b'\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
1961            (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1962            (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1963            # with invalid start byte of a 5-byte sequence (rfc2279)
1964            (b'\xf8', FFFD), # only the start byte
1965            (b'\xf8\xf8', FFFD*2), # 2 start bytes
1966            (b'\xf8\x80', FFFD*2), # only one continuation byte
1967            (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1968            (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1969            # with invalid start byte of a 6-byte sequence (rfc2279)
1970            (b'\xfc', FFFD), # only the start byte
1971            (b'\xfc\xfc', FFFD*2), # 2 start bytes
1972            (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1973            (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1974            # invalid start byte
1975            (b'\xfe', FFFD),
1976            (b'\xfe\x80\x80', FFFD*3),
1977            # other sequences
1978            (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1979            (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1980            (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1981            (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1982             '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1983        ]
1984        for n, (seq, res) in enumerate(sequences):
1985            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1986            self.assertEqual(seq.decode('utf-8', 'replace'), res)
1987            self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1988            self.assertEqual(seq.decode('utf-8', 'ignore'),
1989                             res.replace('\uFFFD', ''))
1990
1991    def assertCorrectUTF8Decoding(self, seq, res, err):
1992        """
1993        Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when
1994        'strict' is used, returns res when 'replace' is used, and that doesn't
1995        return anything when 'ignore' is used.
1996        """
1997        with self.assertRaises(UnicodeDecodeError) as cm:
1998            seq.decode('utf-8')
1999        exc = cm.exception
2000
2001        self.assertIn(err, str(exc))
2002        self.assertEqual(seq.decode('utf-8', 'replace'), res)
2003        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
2004                         'aaaa' + res + 'bbbb')
2005        res = res.replace('\ufffd', '')
2006        self.assertEqual(seq.decode('utf-8', 'ignore'), res)
2007        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
2008                          'aaaa' + res + 'bbbb')
2009
2010    def test_invalid_start_byte(self):
2011        """
2012        Test that an 'invalid start byte' error is raised when the first byte
2013        is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
2014        4-bytes sequence. The invalid start byte is replaced with a single
2015        U+FFFD when errors='replace'.
2016        E.g. <80> is a continuation byte and can appear only after a start byte.
2017        """
2018        FFFD = '\ufffd'
2019        for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
2020            self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
2021                                           'invalid start byte')
2022
2023    def test_unexpected_end_of_data(self):
2024        """
2025        Test that an 'unexpected end of data' error is raised when the string
2026        ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
2027        enough continuation bytes.  The incomplete sequence is replaced with a
2028        single U+FFFD when errors='replace'.
2029        E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
2030        sequence, but it's followed by only 2 valid continuation bytes and the
2031        last continuation bytes is missing.
2032        Note: the continuation bytes must be all valid, if one of them is
2033        invalid another error will be raised.
2034        """
2035        sequences = [
2036            'C2', 'DF',
2037            'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
2038            'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
2039            'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
2040            'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
2041            'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
2042            'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
2043        ]
2044        FFFD = '\ufffd'
2045        for seq in sequences:
2046            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
2047                                           'unexpected end of data')
2048
2049    def test_invalid_cb_for_2bytes_seq(self):
2050        """
2051        Test that an 'invalid continuation byte' error is raised when the
2052        continuation byte of a 2-bytes sequence is invalid.  The start byte
2053        is replaced by a single U+FFFD and the second byte is handled
2054        separately when errors='replace'.
2055        E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
2056        sequence, but 41 is not a valid continuation byte because it's the
2057        ASCII letter 'A'.
2058        """
2059        FFFD = '\ufffd'
2060        FFFDx2 = FFFD * 2
2061        sequences = [
2062            ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
2063            ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
2064            ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
2065            ('DF C0', FFFDx2), ('DF FF', FFFDx2),
2066        ]
2067        for seq, res in sequences:
2068            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
2069                                           'invalid continuation byte')
2070
2071    def test_invalid_cb_for_3bytes_seq(self):
2072        """
2073        Test that an 'invalid continuation byte' error is raised when the
2074        continuation byte(s) of a 3-bytes sequence are invalid.  When
2075        errors='replace', if the first continuation byte is valid, the first
2076        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
2077        third byte is handled separately, otherwise only the start byte is
2078        replaced with a U+FFFD and the other continuation bytes are handled
2079        separately.
2080        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
2081        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
2082        because it's the ASCII letter 'A'.
2083        Note: when the start byte is E0 or ED, the valid ranges for the first
2084        continuation byte are limited to A0..BF and 80..9F respectively.
2085        Python 2 used to consider all the bytes in range 80..BF valid when the
2086        start byte was ED.  This is fixed in Python 3.
2087        """
2088        FFFD = '\ufffd'
2089        FFFDx2 = FFFD * 2
2090        sequences = [
2091            ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
2092            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
2093            ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
2094            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
2095            ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
2096            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
2097            ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
2098            ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
2099            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
2100            ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
2101            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
2102            ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
2103            ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
2104            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
2105            ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
2106            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
2107            ('ED 7F', FFFD+'\x7f'),
2108            ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
2109            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
2110            ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
2111            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
2112            ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
2113            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
2114            ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
2115            ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
2116            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
2117            ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
2118            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
2119            ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
2120            ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
2121            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
2122            ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
2123            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
2124        ]
2125        for seq, res in sequences:
2126            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
2127                                           'invalid continuation byte')
2128
2129    def test_invalid_cb_for_4bytes_seq(self):
2130        """
2131        Test that an 'invalid continuation byte' error is raised when the
2132        continuation byte(s) of a 4-bytes sequence are invalid.  When
2133        errors='replace',the start byte and all the following valid
2134        continuation bytes are replaced with a single U+FFFD, and all the bytes
2135        starting from the first invalid continuation bytes (included) are
2136        handled separately.
2137        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
2138        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
2139        because it's the ASCII letter 'A'.
2140        Note: when the start byte is E0 or ED, the valid ranges for the first
2141        continuation byte are limited to A0..BF and 80..9F respectively.
2142        However, when the start byte is ED, Python 2 considers all the bytes
2143        in range 80..BF valid.  This is fixed in Python 3.
2144        """
2145        FFFD = '\ufffd'
2146        FFFDx2 = FFFD * 2
2147        sequences = [
2148            ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
2149            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
2150            ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
2151            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
2152            ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
2153            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
2154            ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
2155            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
2156            ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
2157            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
2158            ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
2159            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
2160            ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
2161            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
2162            ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
2163            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
2164            ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
2165            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
2166            ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
2167            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
2168            ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
2169            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
2170            ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
2171            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
2172            ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
2173            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
2174            ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
2175            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
2176            ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
2177            ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
2178            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
2179            ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
2180            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
2181            ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
2182            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
2183            ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
2184            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
2185            ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
2186            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
2187            ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
2188            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
2189            ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
2190            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
2191            ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
2192            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
2193            ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
2194            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
2195            ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
2196            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
2197            ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
2198            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
2199            ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
2200            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
2201            ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
2202            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
2203        ]
2204        for seq, res in sequences:
2205            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
2206                                           'invalid continuation byte')
2207
2208    def test_codecs_idna(self):
2209        # Test whether trailing dot is preserved
2210        self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
2211
2212    def test_codecs_errors(self):
2213        # Error handling (encoding)
2214        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
2215        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
2216        self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
2217        self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
2218        self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
2219                         'Andr\202 x'.encode('ascii', errors='replace'))
2220        self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
2221                         'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
2222
2223        # Error handling (decoding)
2224        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
2225        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
2226        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
2227        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
2228        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
2229
2230        # Error handling (unknown character names)
2231        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
2232
2233        # Error handling (truncated escape sequence)
2234        self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
2235
2236        self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2237        self.assertRaises(TypeError, str, b"hello", "test.unicode2")
2238        self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2239        self.assertRaises(TypeError, "hello".encode, "test.unicode2")
2240
2241        # Error handling (wrong arguments)
2242        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
2243
2244        # Error handling (lone surrogate in
2245        # _PyUnicode_TransformDecimalAndSpaceToASCII())
2246        self.assertRaises(ValueError, int, "\ud800")
2247        self.assertRaises(ValueError, int, "\udf00")
2248        self.assertRaises(ValueError, float, "\ud800")
2249        self.assertRaises(ValueError, float, "\udf00")
2250        self.assertRaises(ValueError, complex, "\ud800")
2251        self.assertRaises(ValueError, complex, "\udf00")
2252
2253    def test_codecs(self):
2254        # Encoding
2255        self.assertEqual('hello'.encode('ascii'), b'hello')
2256        self.assertEqual('hello'.encode('utf-7'), b'hello')
2257        self.assertEqual('hello'.encode('utf-8'), b'hello')
2258        self.assertEqual('hello'.encode('utf-8'), b'hello')
2259        self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2260        self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2261        self.assertEqual('hello'.encode('latin-1'), b'hello')
2262
2263        # Default encoding is utf-8
2264        self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2265
2266        # Roundtrip safety for BMP (just the first 1024 chars)
2267        for c in range(1024):
2268            u = chr(c)
2269            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2270                             'utf-16-be', 'raw_unicode_escape',
2271                             'unicode_escape'):
2272                self.assertEqual(str(u.encode(encoding),encoding), u)
2273
2274        # Roundtrip safety for BMP (just the first 256 chars)
2275        for c in range(256):
2276            u = chr(c)
2277            for encoding in ('latin-1',):
2278                self.assertEqual(str(u.encode(encoding),encoding), u)
2279
2280        # Roundtrip safety for BMP (just the first 128 chars)
2281        for c in range(128):
2282            u = chr(c)
2283            for encoding in ('ascii',):
2284                self.assertEqual(str(u.encode(encoding),encoding), u)
2285
2286        # Roundtrip safety for non-BMP (just a few chars)
2287        with warnings.catch_warnings():
2288            u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2289            for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2290                             'raw_unicode_escape', 'unicode_escape'):
2291                self.assertEqual(str(u.encode(encoding),encoding), u)
2292
2293        # UTF-8 must be roundtrip safe for all code points
2294        # (except surrogates, which are forbidden).
2295        u = ''.join(map(chr, list(range(0, 0xd800)) +
2296                             list(range(0xe000, 0x110000))))
2297        for encoding in ('utf-8',):
2298            self.assertEqual(str(u.encode(encoding),encoding), u)
2299
2300    def test_codecs_charmap(self):
2301        # 0-127
2302        s = bytes(range(128))
2303        for encoding in (
2304            'cp037', 'cp1026', 'cp273',
2305            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2306            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2307            'cp863', 'cp865', 'cp866', 'cp1125',
2308            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2309            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2310            'iso8859_7', 'iso8859_9',
2311            'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
2312            'mac_cyrillic', 'mac_latin2',
2313
2314            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2315            'cp1256', 'cp1257', 'cp1258',
2316            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2317
2318            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2319            'cp1006', 'iso8859_8',
2320
2321            ### These have undefined mappings:
2322            #'cp424',
2323
2324            ### These fail the round-trip:
2325            #'cp875'
2326
2327            ):
2328            self.assertEqual(str(s, encoding).encode(encoding), s)
2329
2330        # 128-255
2331        s = bytes(range(128, 256))
2332        for encoding in (
2333            'cp037', 'cp1026', 'cp273',
2334            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2335            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2336            'cp863', 'cp865', 'cp866', 'cp1125',
2337            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2338            'iso8859_2', 'iso8859_4', 'iso8859_5',
2339            'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
2340            'mac_cyrillic', 'mac_latin2',
2341
2342            ### These have undefined mappings:
2343            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2344            #'cp1256', 'cp1257', 'cp1258',
2345            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2346            #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
2347            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2348
2349            ### These fail the round-trip:
2350            #'cp1006', 'cp875', 'iso8859_8',
2351
2352            ):
2353            self.assertEqual(str(s, encoding).encode(encoding), s)
2354
2355    def test_concatenation(self):
2356        self.assertEqual(("abc" "def"), "abcdef")
2357        self.assertEqual(("abc" "def"), "abcdef")
2358        self.assertEqual(("abc" "def"), "abcdef")
2359        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2360        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2361
2362    def test_ucs4(self):
2363        x = '\U00100000'
2364        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2365        self.assertEqual(x, y)
2366
2367        y = br'\U00100000'
2368        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2369        self.assertEqual(x, y)
2370        y = br'\U00010000'
2371        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2372        self.assertEqual(x, y)
2373
2374        try:
2375            br'\U11111111'.decode("raw-unicode-escape")
2376        except UnicodeDecodeError as e:
2377            self.assertEqual(e.start, 0)
2378            self.assertEqual(e.end, 10)
2379        else:
2380            self.fail("Should have raised UnicodeDecodeError")
2381
2382    def test_conversion(self):
2383        # Make sure __str__() works properly
2384        class ObjectToStr:
2385            def __str__(self):
2386                return "foo"
2387
2388        class StrSubclassToStr(str):
2389            def __str__(self):
2390                return "foo"
2391
2392        class StrSubclassToStrSubclass(str):
2393            def __new__(cls, content=""):
2394                return str.__new__(cls, 2*content)
2395            def __str__(self):
2396                return self
2397
2398        self.assertEqual(str(ObjectToStr()), "foo")
2399        self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2400        s = str(StrSubclassToStrSubclass("foo"))
2401        self.assertEqual(s, "foofoo")
2402        self.assertIs(type(s), StrSubclassToStrSubclass)
2403        s = StrSubclass(StrSubclassToStrSubclass("foo"))
2404        self.assertEqual(s, "foofoo")
2405        self.assertIs(type(s), StrSubclass)
2406
2407    def test_unicode_repr(self):
2408        class s1:
2409            def __repr__(self):
2410                return '\\n'
2411
2412        class s2:
2413            def __repr__(self):
2414                return '\\n'
2415
2416        self.assertEqual(repr(s1()), '\\n')
2417        self.assertEqual(repr(s2()), '\\n')
2418
2419    def test_printable_repr(self):
2420        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2421        self.assertEqual(repr('\U00014000'), "'\\U00014000'")     # nonprintable
2422
2423    # This test only affects 32-bit platforms because expandtabs can only take
2424    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
2425    # to take a 64-bit long, this test should apply to all platforms.
2426    @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2427                     'only applies to 32-bit platforms')
2428    def test_expandtabs_overflows_gracefully(self):
2429        self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
2430
2431    @support.cpython_only
2432    def test_expandtabs_optimization(self):
2433        s = 'abc'
2434        self.assertIs(s.expandtabs(), s)
2435
2436    def test_raiseMemError(self):
2437        asciifields = "nnbP"
2438        compactfields = asciifields + "nPn"
2439        ascii_struct_size = support.calcobjsize(asciifields)
2440        compact_struct_size = support.calcobjsize(compactfields)
2441
2442        for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2443            code = ord(char)
2444            if code < 0x80:
2445                char_size = 1  # sizeof(Py_UCS1)
2446                struct_size = ascii_struct_size
2447            elif code < 0x100:
2448                char_size = 1  # sizeof(Py_UCS1)
2449                struct_size = compact_struct_size
2450            elif code < 0x10000:
2451                char_size = 2  # sizeof(Py_UCS2)
2452                struct_size = compact_struct_size
2453            else:
2454                char_size = 4  # sizeof(Py_UCS4)
2455                struct_size = compact_struct_size
2456            # Note: sys.maxsize is half of the actual max allocation because of
2457            # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2458            # be allocatable, given enough memory.
2459            maxlen = ((sys.maxsize - struct_size) // char_size)
2460            alloc = lambda: char * maxlen
2461            with self.subTest(
2462                char=char,
2463                struct_size=struct_size,
2464                char_size=char_size
2465            ):
2466                # self-check
2467                self.assertEqual(
2468                    sys.getsizeof(char * 42),
2469                    struct_size + (char_size * (42 + 1))
2470                )
2471                self.assertRaises(MemoryError, alloc)
2472                self.assertRaises(MemoryError, alloc)
2473
2474    def test_format_subclass(self):
2475        class S(str):
2476            def __str__(self):
2477                return '__str__ overridden'
2478        s = S('xxx')
2479        self.assertEqual("%s" % s, '__str__ overridden')
2480        self.assertEqual("{}".format(s), '__str__ overridden')
2481
2482    def test_subclass_add(self):
2483        class S(str):
2484            def __add__(self, o):
2485                return "3"
2486        self.assertEqual(S("4") + S("5"), "3")
2487        class S(str):
2488            def __iadd__(self, o):
2489                return "3"
2490        s = S("1")
2491        s += "4"
2492        self.assertEqual(s, "3")
2493
2494    def test_getnewargs(self):
2495        text = 'abc'
2496        args = text.__getnewargs__()
2497        self.assertIsNot(args[0], text)
2498        self.assertEqual(args[0], text)
2499        self.assertEqual(len(args), 1)
2500
    # Checks that the value returned by _testcapi.getargs_u() follows a
    # string after it has been extended with +=.  Skipped when _testcapi
    # is unavailable.
    @support.cpython_only
    @support.requires_legacy_unicode_capi
    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
    def test_resize(self):
        # Try a spread of lengths (1, 8, 15, ... 99).
        for length in range(1, 100, 7):
            # generate a fresh string (refcount=1)
            text = 'a' * length + 'b'

            # fill wstr internal field
            # (the call is expected to emit a DeprecationWarning)
            with self.assertWarns(DeprecationWarning):
                abc = _testcapi.getargs_u(text)
            self.assertEqual(abc, text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            with self.assertWarns(DeprecationWarning):
                abcdef = _testcapi.getargs_u(text)
            # The second result must reflect the extended string, not the
            # value obtained before the resize.
            self.assertNotEqual(abc, abcdef)
            self.assertEqual(abcdef, text)
2520
2521    def test_compare(self):
2522        # Issue #17615
2523        N = 10
2524        ascii = 'a' * N
2525        ascii2 = 'z' * N
2526        latin = '\x80' * N
2527        latin2 = '\xff' * N
2528        bmp = '\u0100' * N
2529        bmp2 = '\uffff' * N
2530        astral = '\U00100000' * N
2531        astral2 = '\U0010ffff' * N
2532        strings = (
2533            ascii, ascii2,
2534            latin, latin2,
2535            bmp, bmp2,
2536            astral, astral2)
2537        for text1, text2 in itertools.combinations(strings, 2):
2538            equal = (text1 is text2)
2539            self.assertEqual(text1 == text2, equal)
2540            self.assertEqual(text1 != text2, not equal)
2541
2542            if equal:
2543                self.assertTrue(text1 <= text2)
2544                self.assertTrue(text1 >= text2)
2545
2546                # text1 is text2: duplicate strings to skip the "str1 == str2"
2547                # optimization in unicode_compare_eq() and really compare
2548                # character per character
2549                copy1 = duplicate_string(text1)
2550                copy2 = duplicate_string(text2)
2551                self.assertIsNot(copy1, copy2)
2552
2553                self.assertTrue(copy1 == copy2)
2554                self.assertFalse(copy1 != copy2)
2555
2556                self.assertTrue(copy1 <= copy2)
2557                self.assertTrue(copy2 >= copy2)
2558
2559        self.assertTrue(ascii < ascii2)
2560        self.assertTrue(ascii < latin)
2561        self.assertTrue(ascii < bmp)
2562        self.assertTrue(ascii < astral)
2563        self.assertFalse(ascii >= ascii2)
2564        self.assertFalse(ascii >= latin)
2565        self.assertFalse(ascii >= bmp)
2566        self.assertFalse(ascii >= astral)
2567
2568        self.assertFalse(latin < ascii)
2569        self.assertTrue(latin < latin2)
2570        self.assertTrue(latin < bmp)
2571        self.assertTrue(latin < astral)
2572        self.assertTrue(latin >= ascii)
2573        self.assertFalse(latin >= latin2)
2574        self.assertFalse(latin >= bmp)
2575        self.assertFalse(latin >= astral)
2576
2577        self.assertFalse(bmp < ascii)
2578        self.assertFalse(bmp < latin)
2579        self.assertTrue(bmp < bmp2)
2580        self.assertTrue(bmp < astral)
2581        self.assertTrue(bmp >= ascii)
2582        self.assertTrue(bmp >= latin)
2583        self.assertFalse(bmp >= bmp2)
2584        self.assertFalse(bmp >= astral)
2585
2586        self.assertFalse(astral < ascii)
2587        self.assertFalse(astral < latin)
2588        self.assertFalse(astral < bmp2)
2589        self.assertTrue(astral < astral2)
2590        self.assertTrue(astral >= ascii)
2591        self.assertTrue(astral >= latin)
2592        self.assertTrue(astral >= bmp2)
2593        self.assertFalse(astral >= astral2)
2594
2595    def test_free_after_iterating(self):
2596        support.check_free_after_iterating(self, iter, str)
2597        support.check_free_after_iterating(self, reversed, str)
2598
    def test_check_encoding_errors(self):
        # bpo-37388: str(bytes, ...) and str.encode() must check the encoding
        # and errors arguments in dev mode
        #
        # The checks run in a child interpreter started with "-X dev".  The
        # child script calls str() on bytes and str.encode() with a bogus
        # encoding or error handler name, on both empty and non-empty data.
        # If any such call fails to raise LookupError, the script exits with
        # a status in 21-24 identifying the failed group of checks (22 and 24
        # are each shared by two checks).  It exits with status 10 once every
        # check has passed.
        encodings = ('ascii', 'utf8', 'latin1')
        invalid = 'Boom, Shaka Laka, Boom!'
        code = textwrap.dedent(f'''
            import sys
            encodings = {encodings!r}

            for data in (b'', b'short string'):
                try:
                    str(data, encoding={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(21)

                try:
                    str(data, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(22)

                for encoding in encodings:
                    try:
                        str(data, encoding, errors={invalid!r})
                    except LookupError:
                        pass
                    else:
                        sys.exit(22)

            for data in ('', 'short string'):
                try:
                    data.encode(encoding={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(23)

                try:
                    data.encode(errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(24)

                for encoding in encodings:
                    try:
                        data.encode(encoding, errors={invalid!r})
                    except LookupError:
                        pass
                    else:
                        sys.exit(24)

            sys.exit(10)
        ''')
        # The script never exits with status 0, so it is run through
        # assert_python_failure() (presumably that helper requires a non-zero
        # exit status -- see test.support.script_helper).  Status 10 is the
        # success marker; proc is passed as the failure message.
        proc = assert_python_failure('-X', 'dev', '-c', code)
        self.assertEqual(proc.rc, 10, proc)
2658
2659
class StringModuleTest(unittest.TestCase):
    # Tests for the helpers exposed by the private _string module, plus
    # attribute access through str subclass names.

    def test_formatter_parser(self):
        # Each parsed item is a 4-tuple:
        # (literal text, field name, format spec, conversion).
        def parse(template):
            return [*_string.formatter_parser(template)]

        cases = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, expected in cases:
            parsed = parse(template)
            self.assertEqual(parsed, expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        # The helper returns the leading name and an iterable of
        # (is_attribute, key) pairs; materialise the latter for comparison.
        def split(field_name):
            first, rest = _string.formatter_field_name_split(field_name)
            return [first, list(rest)]

        cases = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                ]]),
        )
        for field_name, expected in cases:
            self.assertEqual(split(field_name), expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)

    def test_str_subclass_attr(self):
        # Attribute names given as str subclass instances must work with
        # setattr()/delattr() and interoperate with plain attribute syntax.
        first_key = StrSubclass("name")
        second_key = StrSubclass("name2")

        class Holder:
            pass

        holder = Holder()

        # Deleting a missing attribute fails; setting it then works.
        self.assertRaises(AttributeError, delattr, holder, first_key)
        setattr(holder, first_key, 1)
        self.assertEqual(holder.name, 1)
        holder.name = 2
        self.assertEqual(list(holder.__dict__), [first_key])

        # Same sequence for a second attribute, also checking "del".
        self.assertRaises(AttributeError, delattr, holder, second_key)
        with self.assertRaises(AttributeError):
            del holder.name2
        setattr(holder, second_key, 3)
        self.assertEqual(holder.name2, 3)
        holder.name2 = 4
        self.assertEqual(list(holder.__dict__), [first_key, second_key])
2733
2734
# Run the tests in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
2737