1from test.support import (gc_collect, bigmemtest, _2G,
2                          cpython_only, captured_stdout,
3                          check_disallow_instantiation, is_emscripten, is_wasi,
4                          SHORT_TIMEOUT)
5import locale
6import re
7import string
8import sys
9import time
10import unittest
11import warnings
12from re import Scanner
13from weakref import proxy
14
15# some platforms lack working multiprocessing
16try:
17    import _multiprocessing
18except ImportError:
19    multiprocessing = None
20else:
21    import multiprocessing
22
23# Misc tests from Tim Peters' re.doc
24
25# WARNING: Don't change details in these tests if you don't know
26# what you're doing. Some of these tests were carefully modeled to
27# cover most of the code.
28
class S(str):
    """str subclass whose item access re-wraps the result in S."""

    def __getitem__(self, index):
        # Delegate to the plain str lookup, then wrap so the subclass type
        # is preserved by indexing and slicing.
        piece = str.__getitem__(self, index)
        return S(piece)
32
class B(bytes):
    """bytes subclass whose item access passes the result to B()."""

    def __getitem__(self, index):
        # Whatever the plain bytes lookup returns is handed to the B
        # constructor, so slices come back as B instances.
        piece = bytes.__getitem__(self, index)
        return B(piece)
36
37class ReTests(unittest.TestCase):
38
39    def assertTypedEqual(self, actual, expect, msg=None):
40        self.assertEqual(actual, expect, msg)
41        def recurse(actual, expect):
42            if isinstance(expect, (tuple, list)):
43                for x, y in zip(actual, expect):
44                    recurse(x, y)
45            else:
46                self.assertIs(type(actual), type(expect), msg)
47        recurse(actual, expect)
48
49    def checkPatternError(self, pattern, errmsg, pos=None):
50        with self.assertRaises(re.error) as cm:
51            re.compile(pattern)
52        with self.subTest(pattern=pattern):
53            err = cm.exception
54            self.assertEqual(err.msg, errmsg)
55            if pos is not None:
56                self.assertEqual(err.pos, pos)
57
58    def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
59        with self.assertRaises(re.error) as cm:
60            re.sub(pattern, repl, string)
61        with self.subTest(pattern=pattern, repl=repl):
62            err = cm.exception
63            self.assertEqual(err.msg, errmsg)
64            if pos is not None:
65                self.assertEqual(err.pos, pos)
66
67    def test_keep_buffer(self):
68        # See bug 14212
69        b = bytearray(b'x')
70        it = re.finditer(b'a', b)
71        with self.assertRaises(BufferError):
72            b.extend(b'x'*400)
73        list(it)
74        del it
75        gc_collect()
76        b.extend(b'x'*400)
77
78    def test_weakref(self):
79        s = 'QabbbcR'
80        x = re.compile('ab+c')
81        y = proxy(x)
82        self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
83
84    def test_search_star_plus(self):
85        self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
86        self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
87        self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
88        self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
89        self.assertIsNone(re.search('x', 'aaa'))
90        self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
91        self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
92        self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
93        self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
94        self.assertIsNone(re.match('a+', 'xxx'))
95
96    def test_branching(self):
97        """Test Branching
98        Test expressions using the OR ('|') operator."""
99        self.assertEqual(re.match('(ab|ba)', 'ab').span(), (0, 2))
100        self.assertEqual(re.match('(ab|ba)', 'ba').span(), (0, 2))
101        self.assertEqual(re.match('(abc|bac|ca|cb)', 'abc').span(),
102                         (0, 3))
103        self.assertEqual(re.match('(abc|bac|ca|cb)', 'bac').span(),
104                         (0, 3))
105        self.assertEqual(re.match('(abc|bac|ca|cb)', 'ca').span(),
106                         (0, 2))
107        self.assertEqual(re.match('(abc|bac|ca|cb)', 'cb').span(),
108                         (0, 2))
109        self.assertEqual(re.match('((a)|(b)|(c))', 'a').span(), (0, 1))
110        self.assertEqual(re.match('((a)|(b)|(c))', 'b').span(), (0, 1))
111        self.assertEqual(re.match('((a)|(b)|(c))', 'c').span(), (0, 1))
112
113    def bump_num(self, matchobj):
114        int_value = int(matchobj.group(0))
115        return str(int_value + 1)
116
117    def test_basic_re_sub(self):
118        self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
119        self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
120        self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
121        self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
122        self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
123        self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
124        for y in ("\xe0", "\u0430", "\U0001d49c"):
125            self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')
126
127        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
128        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
129                         '9.3 -3 24x100y')
130        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
131                         '9.3 -3 23x99y')
132        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
133                         '9.3 -3 23x99y')
134
135        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
136        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
137
138        s = r"\1\1"
139        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
140        self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s)
141        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
142
143        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
144        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
145        self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
146        self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
147        self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')
148
149        self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
150        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
151        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
152                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
153        for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
154            with self.subTest(c):
155                with self.assertRaises(re.error):
156                    self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
157
158        self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
159
160    def test_bug_449964(self):
161        # fails for group followed by other escape
162        self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
163                         'xx\bxx\b')
164
165    def test_bug_449000(self):
166        # Test for sub() on escaped characters
167        self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
168                         'abc\ndef\n')
169        self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
170                         'abc\ndef\n')
171        self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
172                         'abc\ndef\n')
173        self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
174                         'abc\ndef\n')
175
176    def test_bug_1661(self):
177        # Verify that flags do not get silently ignored with compiled patterns
178        pattern = re.compile('.')
179        self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
180        self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
181        self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
182        self.assertRaises(ValueError, re.compile, pattern, re.I)
183
184    def test_bug_3629(self):
185        # A regex that triggered a bug in the sre-code validator
186        re.compile("(?P<quote>)(?(quote))")
187
188    def test_sub_template_numeric_escape(self):
189        # bug 776311 and friends
190        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
191        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
192        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
193        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
194        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
195        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
196        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
197        self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
198
199        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
200        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
201
202        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
203        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
204        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
205        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
206        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
207
208        self.checkTemplateError('x', r'\400', 'x',
209                                r'octal escape value \400 outside of '
210                                r'range 0-0o377', 0)
211        self.checkTemplateError('x', r'\777', 'x',
212                                r'octal escape value \777 outside of '
213                                r'range 0-0o377', 0)
214
215        self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1)
216        self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1)
217        self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1)
218        self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1)
219        self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1)
220        self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1)
221        self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1)
222        self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1)
223        self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1)
224        self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1)
225        self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1)
226        self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1)
227        self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1)
228
229        # in python2.3 (etc), these loop endlessly in sre_parser.py
230        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
231        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
232                         'xz8')
233        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
234                         'xza')
235
236    def test_qualified_re_sub(self):
237        self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
238        self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
239        self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
240
241    def test_bug_114660(self):
242        self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
243                         'hello there')
244
245    def test_symbolic_groups(self):
246        re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
247        re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
248        re.compile(r'(?P<a1>x)\1(?(1)y)')
249        re.compile(b'(?P<a1>x)(?P=a1)(?(a1)y)')
250        # New valid identifiers in Python 3
251        re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
252        re.compile('(?P<��������������>x)(?P=��������������)(?(��������������)y)')
253        # Support > 100 groups.
254        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
255        pat = '(?:%s)(?(200)z|t)' % pat
256        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
257
    def test_symbolic_groups_errors(self):
        # Each checkPatternError() call compiles an invalid pattern and
        # checks the re.error message and, where given, its position.
        # Redefined group name and reference to a still-open group.
        self.checkPatternError(r'(?P<a>)(?P<a>)',
                               "redefinition of group name 'a' as group 2; "
                               "was group 1")
        self.checkPatternError(r'(?P<a>(?P=a))',
                               "cannot refer to an open group", 10)
        self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
        # Malformed or unknown names in (?P=name) backreferences.
        self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11)
        self.checkPatternError(r'(?P=', 'missing group name', 4)
        self.checkPatternError(r'(?P=)', 'missing group name', 4)
        self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4)
        self.checkPatternError(r'(?P=a)', "unknown group name 'a'")
        self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'")
        self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4)
        # Malformed names in (?P<name>...) group definitions.
        self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4)
        self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4)
        self.checkPatternError(r'(?P<', 'missing group name', 4)
        self.checkPatternError(r'(?P<>)', 'missing group name', 4)
        self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
        self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
        # Malformed or unknown names in (?(name)...) conditionals.
        self.checkPatternError(r'(?(', 'missing group name', 3)
        self.checkPatternError(r'(?())', 'missing group name', 3)
        self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
        self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
        self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
        self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
        # A non-identifier character in a str pattern is an error.
        self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
        self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
        self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
        # Non-ASCII bytes in a group name of a bytes pattern are expected
        # to produce a DeprecationWarning here rather than an error.
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\\xc2\\xb5' "
                                   r"at position 4") as w:
            re.compile(b'(?P<\xc2\xb5>x)')
        # The warning must be attributed to this file (the caller).
        self.assertEqual(w.filename, __file__)
        # For references, the warning is expected in addition to the
        # "unknown group name" error checked by checkPatternError().
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\\xc2\\xb5' "
                                   r"at position 4"):
            self.checkPatternError(b'(?P=\xc2\xb5)',
                                   r"unknown group name '\xc2\xb5'", 4)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\\xc2\\xb5' "
                                   r"at position 3"):
            self.checkPatternError(b'(?(\xc2\xb5)y)',
                                   r"unknown group name '\xc2\xb5'", 3)
302
303    def test_symbolic_refs(self):
304        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
305        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
306        self.assertEqual(re.sub(b'(?P<a1>x)', br'\g<a1>', b'xx'), b'xx')
307        # New valid identifiers in Python 3
308        self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
309        self.assertEqual(re.sub('(?P<��������������>x)', r'\g<��������������>', 'xx'), 'xx')
310        # Support > 100 groups.
311        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
312        self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
313
    def test_symbolic_refs_errors(self):
        # Each checkTemplateError() call runs re.sub() with an invalid
        # replacement template and checks the re.error message and position.
        # Malformed \g<...> references.
        self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
                                'missing >, unterminated name', 3)
        self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
                                'missing group name', 3)
        self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
        self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
                                "bad character in group name 'a a'", 3)
        self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
                                'missing group name', 3)
        self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
                                "bad character in group name '1a1'", 3)
        # References to a group number the pattern does not define.
        self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
                                'invalid group reference 2', 3)
        self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
                                'invalid group reference 2', 1)
        # An unknown but well-formed name raises IndexError, not re.error.
        with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
            re.sub('(?P<a>x)', r'\g<ab>', 'xx')
        self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                                "bad character in group name '-1'", 3)
        # Numeric references with a sign, underscore or surrounding spaces
        # are expected to produce a DeprecationWarning here, not an error.
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\+1' "
                                   r"at position 3") as w:
            re.sub('(?P<a>x)', r'\g<+1>', 'xx')
        # The warning must be attributed to this file (the caller).
        self.assertEqual(w.filename, __file__)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '1_0' "
                                   r"at position 3"):
            re.sub('()'*10, r'\g<1_0>', 'xx')
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name ' 1 ' "
                                   r"at position 3"):
            re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
        # A non-identifier character in a str template is an error.
        self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                                "bad character in group name '©'", 3)
        # Non-ASCII bytes in a bytes template: the warning and the
        # IndexError for the unknown name are both expected.
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\\xc2\\xb5' "
                                   r"at position 3") as w:
            with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
                re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
        self.assertEqual(w.filename, __file__)
        # Non-ASCII numeric characters used as a group reference.
        self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
                                "bad character in group name '㊀'", 3)
        self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
                                "bad character in group name '¹'", 3)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '१' "
                                   r"at position 3"):
            re.sub('(?P<a>x)', r'\g<१>', 'xx')
363
364    def test_re_subn(self):
365        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
366        self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
367        self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
368        self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
369        self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
370        self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
371
372    def test_re_split(self):
373        for string in ":a:b::c", S(":a:b::c"):
374            self.assertTypedEqual(re.split(":", string),
375                                  ['', 'a', 'b', '', 'c'])
376            self.assertTypedEqual(re.split(":+", string),
377                                  ['', 'a', 'b', 'c'])
378            self.assertTypedEqual(re.split("(:+)", string),
379                                  ['', ':', 'a', ':', 'b', '::', 'c'])
380        for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
381                       memoryview(b":a:b::c")):
382            self.assertTypedEqual(re.split(b":", string),
383                                  [b'', b'a', b'b', b'', b'c'])
384            self.assertTypedEqual(re.split(b":+", string),
385                                  [b'', b'a', b'b', b'c'])
386            self.assertTypedEqual(re.split(b"(:+)", string),
387                                  [b'', b':', b'a', b':', b'b', b'::', b'c'])
388        for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
389                        "\U0001d49c\U0001d49e\U0001d4b5"):
390            string = ":%s:%s::%s" % (a, b, c)
391            self.assertEqual(re.split(":", string), ['', a, b, '', c])
392            self.assertEqual(re.split(":+", string), ['', a, b, c])
393            self.assertEqual(re.split("(:+)", string),
394                             ['', ':', a, ':', b, '::', c])
395
396        self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
397        self.assertEqual(re.split("(:)+", ":a:b::c"),
398                         ['', ':', 'a', ':', 'b', ':', 'c'])
399        self.assertEqual(re.split("([b:]+)", ":a:b::c"),
400                         ['', ':', 'a', ':b::', 'c'])
401        self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
402                         ['', None, ':', 'a', None, ':', '', 'b', None, '',
403                          None, '::', 'c'])
404        self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
405                         ['', 'a', '', '', 'c'])
406
407        for sep, expected in [
408            (':*', ['', '', 'a', '', 'b', '', 'c', '']),
409            ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
410            ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']),
411            ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']),
412        ]:
413            with self.subTest(sep=sep):
414                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
415
416        for sep, expected in [
417            ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
418            (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
419            (r'(?=:)', ['', ':a', ':b', ':', ':c']),
420            (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
421        ]:
422            with self.subTest(sep=sep):
423                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
424
425    def test_qualified_re_split(self):
426        self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
427        self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
428        self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
429        self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
430                         ['', ':', 'a', ':', 'b::c'])
431        self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
432                         ['', ':', 'a', ':', 'b::c'])
433        self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
434                         ['', ':', '', '', 'a:b::c'])
435
436    def test_re_findall(self):
437        self.assertEqual(re.findall(":+", "abc"), [])
438        for string in "a:b::c:::d", S("a:b::c:::d"):
439            self.assertTypedEqual(re.findall(":+", string),
440                                  [":", "::", ":::"])
441            self.assertTypedEqual(re.findall("(:+)", string),
442                                  [":", "::", ":::"])
443            self.assertTypedEqual(re.findall("(:)(:*)", string),
444                                  [(":", ""), (":", ":"), (":", "::")])
445        for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
446                       memoryview(b"a:b::c:::d")):
447            self.assertTypedEqual(re.findall(b":+", string),
448                                  [b":", b"::", b":::"])
449            self.assertTypedEqual(re.findall(b"(:+)", string),
450                                  [b":", b"::", b":::"])
451            self.assertTypedEqual(re.findall(b"(:)(:*)", string),
452                                  [(b":", b""), (b":", b":"), (b":", b"::")])
453        for x in ("\xe0", "\u0430", "\U0001d49c"):
454            xx = x * 2
455            xxx = x * 3
456            string = "a%sb%sc%sd" % (x, xx, xxx)
457            self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
458            self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
459            self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
460                             [(x, ""), (x, x), (x, xx)])
461
462    def test_bug_117612(self):
463        self.assertEqual(re.findall(r"(a|(b))", "aba"),
464                         [("a", ""),("b", "b"),("a", "")])
465
466    def test_re_match(self):
467        for string in 'a', S('a'):
468            self.assertEqual(re.match('a', string).groups(), ())
469            self.assertEqual(re.match('(a)', string).groups(), ('a',))
470            self.assertEqual(re.match('(a)', string).group(0), 'a')
471            self.assertEqual(re.match('(a)', string).group(1), 'a')
472            self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
473        for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
474            self.assertEqual(re.match(b'a', string).groups(), ())
475            self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
476            self.assertEqual(re.match(b'(a)', string).group(0), b'a')
477            self.assertEqual(re.match(b'(a)', string).group(1), b'a')
478            self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
479        for a in ("\xe0", "\u0430", "\U0001d49c"):
480            self.assertEqual(re.match(a, a).groups(), ())
481            self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
482            self.assertEqual(re.match('(%s)' % a, a).group(0), a)
483            self.assertEqual(re.match('(%s)' % a, a).group(1), a)
484            self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))
485
486        pat = re.compile('((a)|(b))(c)?')
487        self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
488        self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
489        self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
490        self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
491        self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
492
493        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
494        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
495        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
496                         (None, 'b', None))
497        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
498
499    def test_group(self):
500        class Index:
501            def __init__(self, value):
502                self.value = value
503            def __index__(self):
504                return self.value
505        # A single group
506        m = re.match('(a)(b)', 'ab')
507        self.assertEqual(m.group(), 'ab')
508        self.assertEqual(m.group(0), 'ab')
509        self.assertEqual(m.group(1), 'a')
510        self.assertEqual(m.group(Index(1)), 'a')
511        self.assertRaises(IndexError, m.group, -1)
512        self.assertRaises(IndexError, m.group, 3)
513        self.assertRaises(IndexError, m.group, 1<<1000)
514        self.assertRaises(IndexError, m.group, Index(1<<1000))
515        self.assertRaises(IndexError, m.group, 'x')
516        # Multiple groups
517        self.assertEqual(m.group(2, 1), ('b', 'a'))
518        self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
519
520    def test_match_getitem(self):
521        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
522
523        m = pat.match('a')
524        self.assertEqual(m['a1'], 'a')
525        self.assertEqual(m['b2'], None)
526        self.assertEqual(m['c3'], None)
527        self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
528        self.assertEqual(m[0], 'a')
529        self.assertEqual(m[1], 'a')
530        self.assertEqual(m[2], None)
531        self.assertEqual(m[3], None)
532        with self.assertRaisesRegex(IndexError, 'no such group'):
533            m['X']
534        with self.assertRaisesRegex(IndexError, 'no such group'):
535            m[-1]
536        with self.assertRaisesRegex(IndexError, 'no such group'):
537            m[4]
538        with self.assertRaisesRegex(IndexError, 'no such group'):
539            m[0, 1]
540        with self.assertRaisesRegex(IndexError, 'no such group'):
541            m[(0,)]
542        with self.assertRaisesRegex(IndexError, 'no such group'):
543            m[(0, 1)]
544        with self.assertRaisesRegex(IndexError, 'no such group'):
545            'a1={a2}'.format_map(m)
546
547        m = pat.match('ac')
548        self.assertEqual(m['a1'], 'a')
549        self.assertEqual(m['b2'], None)
550        self.assertEqual(m['c3'], 'c')
551        self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
552        self.assertEqual(m[0], 'ac')
553        self.assertEqual(m[1], 'a')
554        self.assertEqual(m[2], None)
555        self.assertEqual(m[3], 'c')
556
557        # Cannot assign.
558        with self.assertRaises(TypeError):
559            m[0] = 1
560
561        # No len().
562        self.assertRaises(TypeError, len, m)
563
564    def test_re_fullmatch(self):
565        # Issue 16203: Proposal: add re.fullmatch() method.
566        self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
567        for string in "ab", S("ab"):
568            self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
569        for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
570            self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
571        for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
572            r = r"%s|%s" % (a, a + b)
573            self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
574        self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
575        self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
576        self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
577        self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
578        self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
579        self.assertIsNone(re.fullmatch(r"a+", "ab"))
580        self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
581        self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
582        self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
583        self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
584        self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
585        self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))
586
587        self.assertEqual(
588            re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
589        self.assertEqual(
590            re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
591        self.assertEqual(
592            re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
593
594    def test_re_groupref_exists(self):
595        self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
596                         ('(', 'a'))
597        self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
598                         (None, 'a'))
599        self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
600        self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
601        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
602                         ('a', 'b'))
603        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
604                         (None, 'd'))
605        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
606                         (None, 'd'))
607        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(),
608                         ('a', ''))
609
610        # Tests for bug #1177831: exercise groups other than the first group
611        p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
612        self.assertEqual(p.match('abc').groups(),
613                         ('a', 'b', 'c'))
614        self.assertEqual(p.match('ad').groups(),
615                         ('a', None, 'd'))
616        self.assertIsNone(p.match('abd'))
617        self.assertIsNone(p.match('ac'))
618
619        # Support > 100 groups.
620        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
621        pat = '(?:%s)(?(200)z)' % pat
622        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
623
624    def test_re_groupref_exists_errors(self):
625        self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
626        self.checkPatternError(r'()(?(-1)a|b)',
627                               "bad character in group name '-1'", 5)
628        with self.assertWarnsRegex(DeprecationWarning,
629                                   r"bad character in group name '\+1' "
630                                   r"at position 5") as w:
631            re.compile(r'()(?(+1)a|b)')
632        self.assertEqual(w.filename, __file__)
633        with self.assertWarnsRegex(DeprecationWarning,
634                                   r"bad character in group name '1_0' "
635                                   r"at position 23"):
636            re.compile(r'()'*10 + r'(?(1_0)a|b)')
637        with self.assertWarnsRegex(DeprecationWarning,
638                                   r"bad character in group name ' 1 ' "
639                                   r"at position 5"):
640            re.compile(r'()(?( 1 )a|b)')
641        self.checkPatternError(r'()(?(㊀)a|b)',
642                               "bad character in group name '㊀'", 5)
643        self.checkPatternError(r'()(?(¹)a|b)',
644                               "bad character in group name '¹'", 5)
645        with self.assertWarnsRegex(DeprecationWarning,
646                                   r"bad character in group name '१' "
647                                   r"at position 5"):
648            re.compile(r'()(?(१)a|b)')
649        self.checkPatternError(r'()(?(1',
650                               "missing ), unterminated name", 5)
651        self.checkPatternError(r'()(?(1)a',
652                               "missing ), unterminated subpattern", 2)
653        self.checkPatternError(r'()(?(1)a|b',
654                               'missing ), unterminated subpattern', 2)
655        self.checkPatternError(r'()(?(1)a|b|c',
656                               'conditional backref with more than '
657                               'two branches', 10)
658        self.checkPatternError(r'()(?(1)a|b|c)',
659                               'conditional backref with more than '
660                               'two branches', 10)
661        self.checkPatternError(r'()(?(2)a)',
662                               "invalid group reference 2", 5)
663
664    def test_re_groupref_exists_validation_bug(self):
665        for i in range(256):
666            with self.subTest(code=i):
667                re.compile(r'()(?(1)\x%02x?)' % i)
668
669    def test_re_groupref_overflow(self):
670        from re._constants import MAXGROUPS
671        self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx',
672                                'invalid group reference %d' % MAXGROUPS, 3)
673        self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS,
674                               'invalid group reference %d' % MAXGROUPS, 10)
675
676    def test_re_groupref(self):
677        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
678                         ('|', 'a'))
679        self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
680                         (None, 'a'))
681        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
682        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
683        self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
684                         ('a', 'a'))
685        self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
686                         (None, None))
687
688        self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
689
690    def test_groupdict(self):
691        self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
692                                  'first second').groupdict(),
693                         {'first':'first', 'second':'second'})
694
695    def test_expand(self):
696        self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
697                                  "first second")
698                                  .expand(r"\2 \1 \g<second> \g<first>"),
699                         "second first second first")
700        self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
701                                  "first")
702                                  .expand(r"\2 \g<second>"),
703                         " ")
704
705    def test_repeat_minmax(self):
706        self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
707        self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
708        self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
709        self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))
710
711        self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
712        self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
713        self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
714        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
715        self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
716        self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
717        self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
718        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
719
720        self.assertIsNone(re.match(r"^x{1}$", "xxx"))
721        self.assertIsNone(re.match(r"^x{1}?$", "xxx"))
722        self.assertIsNone(re.match(r"^x{1,2}$", "xxx"))
723        self.assertIsNone(re.match(r"^x{1,2}?$", "xxx"))
724
725        self.assertTrue(re.match(r"^x{3}$", "xxx"))
726        self.assertTrue(re.match(r"^x{1,3}$", "xxx"))
727        self.assertTrue(re.match(r"^x{3,3}$", "xxx"))
728        self.assertTrue(re.match(r"^x{1,4}$", "xxx"))
729        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
730        self.assertTrue(re.match(r"^x{3}?$", "xxx"))
731        self.assertTrue(re.match(r"^x{1,3}?$", "xxx"))
732        self.assertTrue(re.match(r"^x{1,4}?$", "xxx"))
733        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
734
735        self.assertIsNone(re.match(r"^x{}$", "xxx"))
736        self.assertTrue(re.match(r"^x{}$", "x{}"))
737
738        self.checkPatternError(r'x{2,1}',
739                               'min repeat greater than max repeat', 2)
740
741    def test_getattr(self):
742        self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
743        self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
744        self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
745        self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
746        self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
747                         {'first': 1, 'other': 2})
748
749        self.assertEqual(re.match("(a)", "a").pos, 0)
750        self.assertEqual(re.match("(a)", "a").endpos, 1)
751        self.assertEqual(re.match("(a)", "a").string, "a")
752        self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
753        self.assertTrue(re.match("(a)", "a").re)
754
755        # Issue 14260. groupindex should be non-modifiable mapping.
756        p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
757        self.assertEqual(sorted(p.groupindex), ['first', 'other'])
758        self.assertEqual(p.groupindex['other'], 2)
759        with self.assertRaises(TypeError):
760            p.groupindex['other'] = 0
761        self.assertEqual(p.groupindex['other'], 2)
762
763    def test_special_escapes(self):
764        self.assertEqual(re.search(r"\b(b.)\b",
765                                   "abcd abc bcd bx").group(1), "bx")
766        self.assertEqual(re.search(r"\B(b.)\B",
767                                   "abc bcd bc abxd").group(1), "bx")
768        self.assertEqual(re.search(r"\b(b.)\b",
769                                   "abcd abc bcd bx", re.ASCII).group(1), "bx")
770        self.assertEqual(re.search(r"\B(b.)\B",
771                                   "abc bcd bc abxd", re.ASCII).group(1), "bx")
772        self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
773        self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
774        self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
775        self.assertEqual(re.search(br"\b(b.)\b",
776                                   b"abcd abc bcd bx").group(1), b"bx")
777        self.assertEqual(re.search(br"\B(b.)\B",
778                                   b"abc bcd bc abxd").group(1), b"bx")
779        self.assertEqual(re.search(br"\b(b.)\b",
780                                   b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
781        self.assertEqual(re.search(br"\B(b.)\B",
782                                   b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
783        self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
784        self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
785        self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
786        self.assertEqual(re.search(r"\d\D\w\W\s\S",
787                                   "1aa! a").group(0), "1aa! a")
788        self.assertEqual(re.search(br"\d\D\w\W\s\S",
789                                   b"1aa! a").group(0), b"1aa! a")
790        self.assertEqual(re.search(r"\d\D\w\W\s\S",
791                                   "1aa! a", re.ASCII).group(0), "1aa! a")
792        self.assertEqual(re.search(br"\d\D\w\W\s\S",
793                                   b"1aa! a", re.LOCALE).group(0), b"1aa! a")
794
795    def test_other_escapes(self):
796        self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
797        self.assertEqual(re.match(r"\(", '(').group(), '(')
798        self.assertIsNone(re.match(r"\(", ')'))
799        self.assertEqual(re.match(r"\\", '\\').group(), '\\')
800        self.assertEqual(re.match(r"[\]]", ']').group(), ']')
801        self.assertIsNone(re.match(r"[\]]", '['))
802        self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
803        self.assertIsNone(re.match(r"[a\-c]", 'b'))
804        self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
805        self.assertIsNone(re.match(r"[\^a]+", 'b'))
806        re.purge()  # for warnings
807        for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
808            with self.subTest(c):
809                self.assertRaises(re.error, re.compile, '\\%c' % c)
810        for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
811            with self.subTest(c):
812                self.assertRaises(re.error, re.compile, '[\\%c]' % c)
813
814    def test_named_unicode_escapes(self):
815        # test individual Unicode named escapes
816        self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<'))
817        self.assertTrue(re.match(r'\N{less-than sign}', '<'))
818        self.assertIsNone(re.match(r'\N{LESS-THAN SIGN}', '>'))
819        self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d'))
820        self.assertTrue(re.match(r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH '
821                                 r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}',
822                                 '\ufbf9'))
823        self.assertTrue(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
824                                 '='))
825        self.assertIsNone(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
826                                   ';'))
827
828        # test errors in \N{name} handling - only valid names should pass
829        self.checkPatternError(r'\N', 'missing {', 2)
830        self.checkPatternError(r'[\N]', 'missing {', 3)
831        self.checkPatternError(r'\N{', 'missing character name', 3)
832        self.checkPatternError(r'[\N{', 'missing character name', 4)
833        self.checkPatternError(r'\N{}', 'missing character name', 3)
834        self.checkPatternError(r'[\N{}]', 'missing character name', 4)
835        self.checkPatternError(r'\NSNAKE}', 'missing {', 2)
836        self.checkPatternError(r'[\NSNAKE}]', 'missing {', 3)
837        self.checkPatternError(r'\N{SNAKE',
838                               'missing }, unterminated name', 3)
839        self.checkPatternError(r'[\N{SNAKE]',
840                               'missing }, unterminated name', 4)
841        self.checkPatternError(r'[\N{SNAKE]}',
842                               "undefined character name 'SNAKE]'", 1)
843        self.checkPatternError(r'\N{SPAM}',
844                               "undefined character name 'SPAM'", 0)
845        self.checkPatternError(r'[\N{SPAM}]',
846                               "undefined character name 'SPAM'", 1)
847        self.checkPatternError(r'\N{KEYCAP NUMBER SIGN}',
848                            "undefined character name 'KEYCAP NUMBER SIGN'", 0)
849        self.checkPatternError(r'[\N{KEYCAP NUMBER SIGN}]',
850                            "undefined character name 'KEYCAP NUMBER SIGN'", 1)
851        self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0)
852        self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1)
853
854    def test_string_boundaries(self):
855        # See http://bugs.python.org/issue10713
856        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
857                         "abc")
858        # There's a word boundary at the start of a string.
859        self.assertTrue(re.match(r"\b", "abc"))
860        # A non-empty string includes a non-boundary zero-length match.
861        self.assertTrue(re.search(r"\B", "abc"))
862        # There is no non-boundary match at the start of a string.
863        self.assertFalse(re.match(r"\B", "abc"))
864        # However, an empty string contains no word boundaries, and also no
865        # non-boundaries.
866        self.assertIsNone(re.search(r"\B", ""))
867        # This one is questionable and different from the perlre behaviour,
868        # but describes current behavior.
869        self.assertIsNone(re.search(r"\b", ""))
870        # A single word-character string has two boundaries, but no
871        # non-boundary gaps.
872        self.assertEqual(len(re.findall(r"\b", "a")), 2)
873        self.assertEqual(len(re.findall(r"\B", "a")), 0)
874        # If there are no words, there are no boundaries
875        self.assertEqual(len(re.findall(r"\b", " ")), 0)
876        self.assertEqual(len(re.findall(r"\b", "   ")), 0)
877        # Can match around the whitespace.
878        self.assertEqual(len(re.findall(r"\B", " ")), 2)
879
880    def test_bigcharset(self):
881        self.assertEqual(re.match("([\u2222\u2223])",
882                                  "\u2222").group(1), "\u2222")
883        r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
884        self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
885
886    def test_big_codesize(self):
887        # Issue #1160
888        r = re.compile('|'.join(('%d'%x for x in range(10000))))
889        self.assertTrue(r.match('1000'))
890        self.assertTrue(r.match('9999'))
891
892    def test_anyall(self):
893        self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
894                         "a\nb")
895        self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
896                         "a\n\nb")
897
898    def test_lookahead(self):
899        self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
900        self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
901        self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
902        self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
903        self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
904        self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
905        self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
906
907        self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
908        self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
909        self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
910        self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
911
912        # Group reference.
913        self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
914        self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
915        # Conditional group reference.
916        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
917        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
918        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
919        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
920        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
921        # Group used before defined.
922        self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
923        self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
924        self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
925
926    def test_lookbehind(self):
927        self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
928        self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
929        self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
930        self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
931        # Group reference.
932        self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
933        self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
934        self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
935        self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
936        # Conditional group reference.
937        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
938        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
939        self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
940        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
941        self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
942        # Group used before defined.
943        self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
944        self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
945        self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
946        # Group defined in the same lookbehind pattern
947        self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
948        self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
949        self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
950        self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
951
952    def test_ignore_case(self):
953        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
954        self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
955        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
956        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
957        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
958        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
959        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
960        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
961        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
962        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
963
964        # Two different characters have the same lowercase.
965        assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
966        self.assertTrue(re.match(r'K', '\u212a', re.I))
967        self.assertTrue(re.match(r'k', '\u212a', re.I))
968        self.assertTrue(re.match(r'\u212a', 'K', re.I))
969        self.assertTrue(re.match(r'\u212a', 'k', re.I))
970
971        # Two different characters have the same uppercase.
972        assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ'
973        self.assertTrue(re.match(r'S', '\u017f', re.I))
974        self.assertTrue(re.match(r's', '\u017f', re.I))
975        self.assertTrue(re.match(r'\u017f', 'S', re.I))
976        self.assertTrue(re.match(r'\u017f', 's', re.I))
977
978        # Two different characters have the same uppercase. Unicode 9.0+.
979        assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В'
980        self.assertTrue(re.match(r'\u0412', '\u0432', re.I))
981        self.assertTrue(re.match(r'\u0412', '\u1c80', re.I))
982        self.assertTrue(re.match(r'\u0432', '\u0412', re.I))
983        self.assertTrue(re.match(r'\u0432', '\u1c80', re.I))
984        self.assertTrue(re.match(r'\u1c80', '\u0412', re.I))
985        self.assertTrue(re.match(r'\u1c80', '\u0432', re.I))
986
987        # Two different characters have the same multicharacter uppercase.
988        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
989        self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
990        self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
991
992    def test_ignore_case_set(self):
993        self.assertTrue(re.match(r'[19A]', 'A', re.I))
994        self.assertTrue(re.match(r'[19a]', 'a', re.I))
995        self.assertTrue(re.match(r'[19a]', 'A', re.I))
996        self.assertTrue(re.match(r'[19A]', 'a', re.I))
997        self.assertTrue(re.match(br'[19A]', b'A', re.I))
998        self.assertTrue(re.match(br'[19a]', b'a', re.I))
999        self.assertTrue(re.match(br'[19a]', b'A', re.I))
1000        self.assertTrue(re.match(br'[19A]', b'a', re.I))
1001
1002        # Two different characters have the same lowercase.
1003        assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
1004        self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
1005        self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
1006        self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
1007        self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
1008
1009        # Two different characters have the same uppercase.
1010        assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ'
1011        self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
1012        self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
1013        self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
1014        self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
1015
1016        # Two different characters have the same uppercase. Unicode 9.0+.
1017        assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В'
1018        self.assertTrue(re.match(r'[19\u0412]', '\u0432', re.I))
1019        self.assertTrue(re.match(r'[19\u0412]', '\u1c80', re.I))
1020        self.assertTrue(re.match(r'[19\u0432]', '\u0412', re.I))
1021        self.assertTrue(re.match(r'[19\u0432]', '\u1c80', re.I))
1022        self.assertTrue(re.match(r'[19\u1c80]', '\u0412', re.I))
1023        self.assertTrue(re.match(r'[19\u1c80]', '\u0432', re.I))
1024
1025        # Two different characters have the same multicharacter uppercase.
1026        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
1027        self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
1028        self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
1029
1030    def test_ignore_case_range(self):
1031        # Issues #3511, #17381.
1032        self.assertTrue(re.match(r'[9-a]', '_', re.I))
1033        self.assertIsNone(re.match(r'[9-A]', '_', re.I))
1034        self.assertTrue(re.match(br'[9-a]', b'_', re.I))
1035        self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
1036        self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
1037        self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
1038        self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
1039        self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
1040        self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
1041        self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
1042        self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
1043        self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
1044        self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
1045        self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
1046        self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
1047        self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
1048
1049        # Two different characters have the same lowercase.
1050        assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
1051        self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
1052        self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
1053        self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
1054        self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
1055
1056        # Two different characters have the same uppercase.
1057        assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ'
1058        self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
1059        self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
1060        self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
1061        self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
1062
1063        # Two different characters have the same uppercase. Unicode 9.0+.
1064        assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В'
1065        self.assertTrue(re.match(r'[\u0411-\u0413]', '\u0432', re.I))
1066        self.assertTrue(re.match(r'[\u0411-\u0413]', '\u1c80', re.I))
1067        self.assertTrue(re.match(r'[\u0431-\u0433]', '\u0412', re.I))
1068        self.assertTrue(re.match(r'[\u0431-\u0433]', '\u1c80', re.I))
1069        self.assertTrue(re.match(r'[\u1c80-\u1c82]', '\u0412', re.I))
1070        self.assertTrue(re.match(r'[\u1c80-\u1c82]', '\u0432', re.I))
1071
1072        # Two different characters have the same multicharacter uppercase.
1073        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
1074        self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
1075        self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
1076
1077    def test_category(self):
1078        self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
1079
1080    @cpython_only
1081    def test_case_helpers(self):
1082        import _sre
1083        for i in range(128):
1084            c = chr(i)
1085            lo = ord(c.lower())
1086            self.assertEqual(_sre.ascii_tolower(i), lo)
1087            self.assertEqual(_sre.unicode_tolower(i), lo)
1088            iscased = c in string.ascii_letters
1089            self.assertEqual(_sre.ascii_iscased(i), iscased)
1090            self.assertEqual(_sre.unicode_iscased(i), iscased)
1091
1092        for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
1093            c = chr(i)
1094            self.assertEqual(_sre.ascii_tolower(i), i)
1095            if i != 0x0130:
1096                self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
1097            iscased = c != c.lower() or c != c.upper()
1098            self.assertFalse(_sre.ascii_iscased(i))
1099            self.assertEqual(_sre.unicode_iscased(i),
1100                             c != c.lower() or c != c.upper())
1101
1102        self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
1103        self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
1104        self.assertFalse(_sre.ascii_iscased(0x0130))
1105        self.assertTrue(_sre.unicode_iscased(0x0130))
1106
1107    def test_not_literal(self):
1108        self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
1109        self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
1110
1111    def test_possible_set_operations(self):
1112        s = bytes(range(128)).decode()
1113        with self.assertWarns(FutureWarning):
1114            p = re.compile(r'[0-9--1]')
1115        self.assertEqual(p.findall(s), list('-./0123456789'))
1116        self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
1117        with self.assertWarns(FutureWarning):
1118            p = re.compile(r'[%--1]')
1119        self.assertEqual(p.findall(s), list("%&'()*+,-1"))
1120        with self.assertWarns(FutureWarning):
1121            p = re.compile(r'[%--]')
1122        self.assertEqual(p.findall(s), list("%&'()*+,-"))
1123
1124        with self.assertWarns(FutureWarning):
1125            p = re.compile(r'[0-9&&1]')
1126        self.assertEqual(p.findall(s), list('&0123456789'))
1127        with self.assertWarns(FutureWarning):
1128            p = re.compile(r'[\d&&1]')
1129        self.assertEqual(p.findall(s), list('&0123456789'))
1130        self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
1131
1132        with self.assertWarns(FutureWarning):
1133            p = re.compile(r'[0-9||a]')
1134        self.assertEqual(p.findall(s), list('0123456789a|'))
1135        with self.assertWarns(FutureWarning):
1136            p = re.compile(r'[\d||a]')
1137        self.assertEqual(p.findall(s), list('0123456789a|'))
1138        self.assertEqual(re.findall(r'[||1]', s), list('1|'))
1139
1140        with self.assertWarns(FutureWarning):
1141            p = re.compile(r'[0-9~~1]')
1142        self.assertEqual(p.findall(s), list('0123456789~'))
1143        with self.assertWarns(FutureWarning):
1144            p = re.compile(r'[\d~~1]')
1145        self.assertEqual(p.findall(s), list('0123456789~'))
1146        self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
1147
1148        with self.assertWarns(FutureWarning):
1149            p = re.compile(r'[[0-9]|]')
1150        self.assertEqual(p.findall(s), list('0123456789[]'))
1151
1152        with self.assertWarns(FutureWarning):
1153            p = re.compile(r'[[:digit:]|]')
1154        self.assertEqual(p.findall(s), list(':[]dgit'))
1155
1156    def test_search_coverage(self):
1157        self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
1158        self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
1159
1160    def assertMatch(self, pattern, text, match=None, span=None,
1161                    matcher=re.fullmatch):
1162        if match is None and span is None:
1163            # the pattern matches the whole text
1164            match = text
1165            span = (0, len(text))
1166        elif match is None or span is None:
1167            raise ValueError('If match is not None, span should be specified '
1168                             '(and vice versa).')
1169        m = matcher(pattern, text)
1170        self.assertTrue(m)
1171        self.assertEqual(m.group(), match)
1172        self.assertEqual(m.span(), span)
1173
    # ASCII characters that re.escape() is expected to return unchanged;
    # test_re_escape and test_re_escape_bytes assert exactly that.
    LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`'
1175
1176    def test_re_escape(self):
1177        p = ''.join(chr(i) for i in range(256))
1178        for c in p:
1179            self.assertMatch(re.escape(c), c)
1180            self.assertMatch('[' + re.escape(c) + ']', c)
1181            self.assertMatch('(?x)' + re.escape(c), c)
1182        self.assertMatch(re.escape(p), p)
1183        for c in '-.]{}':
1184            self.assertEqual(re.escape(c)[:1], '\\')
1185        literal_chars = self.LITERAL_CHARS
1186        self.assertEqual(re.escape(literal_chars), literal_chars)
1187
1188    def test_re_escape_bytes(self):
1189        p = bytes(range(256))
1190        for i in p:
1191            b = bytes([i])
1192            self.assertMatch(re.escape(b), b)
1193            self.assertMatch(b'[' + re.escape(b) + b']', b)
1194            self.assertMatch(b'(?x)' + re.escape(b), b)
1195        self.assertMatch(re.escape(p), p)
1196        for i in b'-.]{}':
1197            b = bytes([i])
1198            self.assertEqual(re.escape(b)[:1], b'\\')
1199        literal_chars = self.LITERAL_CHARS.encode('ascii')
1200        self.assertEqual(re.escape(literal_chars), literal_chars)
1201
1202    def test_re_escape_non_ascii(self):
1203        s = 'xxx\u2620\u2620\u2620xxx'
1204        s_escaped = re.escape(s)
1205        self.assertEqual(s_escaped, s)
1206        self.assertMatch(s_escaped, s)
1207        self.assertMatch('.%s+.' % re.escape('\u2620'), s,
1208                         'x\u2620\u2620\u2620x', (2, 7), re.search)
1209
1210    def test_re_escape_non_ascii_bytes(self):
1211        b = 'y\u2620y\u2620y'.encode('utf-8')
1212        b_escaped = re.escape(b)
1213        self.assertEqual(b_escaped, b)
1214        self.assertMatch(b_escaped, b)
1215        res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
1216        self.assertEqual(len(res), 2)
1217
1218    def test_pickling(self):
1219        import pickle
1220        oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
1221        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
1222            pickled = pickle.dumps(oldpat, proto)
1223            newpat = pickle.loads(pickled)
1224            self.assertEqual(newpat, oldpat)
1225        # current pickle expects the _compile() reconstructor in re module
1226        from re import _compile
1227
1228    def test_copying(self):
1229        import copy
1230        p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?')
1231        self.assertIs(copy.copy(p), p)
1232        self.assertIs(copy.deepcopy(p), p)
1233        m = p.match('12.34')
1234        self.assertIs(copy.copy(m), m)
1235        self.assertIs(copy.deepcopy(m), m)
1236
1237    def test_constants(self):
1238        self.assertEqual(re.I, re.IGNORECASE)
1239        self.assertEqual(re.L, re.LOCALE)
1240        self.assertEqual(re.M, re.MULTILINE)
1241        self.assertEqual(re.S, re.DOTALL)
1242        self.assertEqual(re.X, re.VERBOSE)
1243
1244    def test_flags(self):
1245        for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
1246            self.assertTrue(re.compile('^pattern$', flag))
1247        for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
1248            self.assertTrue(re.compile(b'^pattern$', flag))
1249
1250    def test_sre_character_literals(self):
1251        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
1252            if i < 256:
1253                self.assertTrue(re.match(r"\%03o" % i, chr(i)))
1254                self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
1255                self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
1256                self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
1257                self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
1258                self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
1259            if i < 0x10000:
1260                self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
1261                self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
1262                self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
1263            self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
1264            self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
1265            self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
1266        self.assertTrue(re.match(r"\0", "\000"))
1267        self.assertTrue(re.match(r"\08", "\0008"))
1268        self.assertTrue(re.match(r"\01", "\001"))
1269        self.assertTrue(re.match(r"\018", "\0018"))
1270        self.checkPatternError(r"\567",
1271                               r'octal escape value \567 outside of '
1272                               r'range 0-0o377', 0)
1273        self.checkPatternError(r"\911", 'invalid group reference 91', 1)
1274        self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
1275        self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
1276        self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
1277        self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
1278        self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
1279        self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
1280        self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
1281
1282    def test_sre_character_class_literals(self):
1283        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
1284            if i < 256:
1285                self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
1286                self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
1287                self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
1288                self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
1289                self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
1290                self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
1291                self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
1292                self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
1293            if i < 0x10000:
1294                self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
1295                self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
1296                self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
1297            self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
1298            self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
1299            self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
1300        self.checkPatternError(r"[\567]",
1301                               r'octal escape value \567 outside of '
1302                               r'range 0-0o377', 1)
1303        self.checkPatternError(r"[\911]", r'bad escape \9', 1)
1304        self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
1305        self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
1306        self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
1307        self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
1308        self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
1309
1310    def test_sre_byte_literals(self):
1311        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
1312            self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
1313            self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
1314            self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
1315            self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
1316            self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
1317            self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
1318        self.assertRaises(re.error, re.compile, br"\u1234")
1319        self.assertRaises(re.error, re.compile, br"\U00012345")
1320        self.assertTrue(re.match(br"\0", b"\000"))
1321        self.assertTrue(re.match(br"\08", b"\0008"))
1322        self.assertTrue(re.match(br"\01", b"\001"))
1323        self.assertTrue(re.match(br"\018", b"\0018"))
1324        self.checkPatternError(br"\567",
1325                               r'octal escape value \567 outside of '
1326                               r'range 0-0o377', 0)
1327        self.checkPatternError(br"\911", 'invalid group reference 91', 1)
1328        self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
1329        self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
1330
1331    def test_sre_byte_class_literals(self):
1332        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
1333            self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
1334            self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
1335            self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
1336            self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
1337            self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
1338            self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
1339            self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
1340            self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
1341        self.assertRaises(re.error, re.compile, br"[\u1234]")
1342        self.assertRaises(re.error, re.compile, br"[\U00012345]")
1343        self.checkPatternError(br"[\567]",
1344                               r'octal escape value \567 outside of '
1345                               r'range 0-0o377', 1)
1346        self.checkPatternError(br"[\911]", r'bad escape \9', 1)
1347        self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1348
1349    def test_character_set_errors(self):
1350        self.checkPatternError(r'[', 'unterminated character set', 0)
1351        self.checkPatternError(r'[^', 'unterminated character set', 0)
1352        self.checkPatternError(r'[a', 'unterminated character set', 0)
1353        # bug 545855 -- This pattern failed to cause a compile error as it
1354        # should, instead provoking a TypeError.
1355        self.checkPatternError(r"[a-", 'unterminated character set', 0)
1356        self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
1357        self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
1358        self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
1359
1360    def test_bug_113254(self):
1361        self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1362        self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1363        self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1364
1365    def test_bug_527371(self):
1366        # bug described in patches 527371/672491
1367        self.assertIsNone(re.match(r'(a)?a','a').lastindex)
1368        self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1369        self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
1370        self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
1371        self.assertEqual(re.match(r"((a))", "a").lastindex, 1)
1372
1373    def test_bug_418626(self):
1374        # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1375        # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1376        # pattern '*?' on a long string.
1377        self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1378        self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1379                         20003)
1380        self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
1381        # non-simple '*?' still used to hit the recursion limit, before the
1382        # non-recursive scheme was implemented.
1383        self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
1384
1385    def test_bug_612074(self):
1386        pat="["+re.escape("\u2039")+"]"
1387        self.assertEqual(re.compile(pat) and 1, 1)
1388
1389    def test_stack_overflow(self):
1390        # nasty cases that used to overflow the straightforward recursive
1391        # implementation of repeated groups.
1392        self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1393        self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1394        self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
1395
1396    def test_nothing_to_repeat(self):
1397        for reps in '*', '+', '?', '{1,2}':
1398            for mod in '', '?':
1399                self.checkPatternError('%s%s' % (reps, mod),
1400                                       'nothing to repeat', 0)
1401                self.checkPatternError('(?:%s%s)' % (reps, mod),
1402                                       'nothing to repeat', 3)
1403
1404    def test_multiple_repeat(self):
1405        for outer_reps in '*', '+', '?', '{1,2}':
1406            for outer_mod in '', '?', '+':
1407                outer_op = outer_reps + outer_mod
1408                for inner_reps in '*', '+', '?', '{1,2}':
1409                    for inner_mod in '', '?', '+':
1410                        if inner_mod + outer_reps in ('?', '+'):
1411                            continue
1412                        inner_op = inner_reps + inner_mod
1413                        self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
1414                                'multiple repeat', 1 + len(inner_op))
1415
1416    def test_unlimited_zero_width_repeat(self):
1417        # Issue #9669
1418        self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1419        self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1420        self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1421        self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1422        self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1423        self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1424
1425    def test_scanner(self):
1426        def s_ident(scanner, token): return token
1427        def s_operator(scanner, token): return "op%s" % token
1428        def s_float(scanner, token): return float(token)
1429        def s_int(scanner, token): return int(token)
1430
1431        scanner = Scanner([
1432            (r"[a-zA-Z_]\w*", s_ident),
1433            (r"\d+\.\d*", s_float),
1434            (r"\d+", s_int),
1435            (r"=|\+|-|\*|/", s_operator),
1436            (r"\s+", None),
1437            ])
1438
1439        self.assertTrue(scanner.scanner.scanner("").pattern)
1440
1441        self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1442                         (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1443                           'op+', 'bar'], ''))
1444
1445    def test_bug_448951(self):
1446        # bug 448951 (similar to 429357, but with single char match)
1447        # (Also test greedy matches.)
1448        for op in '','?','*':
1449            self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1450                             (None, None))
1451            self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1452                             ('a:', 'a'))
1453
1454    def test_bug_725106(self):
1455        # capturing groups in alternatives in repeats
1456        self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1457                         ('b', 'a'))
1458        self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1459                         ('c', 'b'))
1460        self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1461                         ('b', None))
1462        self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1463                         ('b', None))
1464        self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1465                         ('b', 'a'))
1466        self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1467                         ('c', 'b'))
1468        self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1469                         ('b', None))
1470        self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1471                         ('b', None))
1472
1473    def test_bug_725149(self):
1474        # mark_stack_base restoring before restoring marks
1475        self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1476                         ('a', None))
1477        self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1478                         ('a', None, None))
1479
1480    def test_bug_764548(self):
1481        # bug 764548, re.compile() barfs on str/unicode subclasses
1482        class my_unicode(str): pass
1483        pat = re.compile(my_unicode("abc"))
1484        self.assertIsNone(pat.match("xyz"))
1485
1486    def test_finditer(self):
1487        iter = re.finditer(r":+", "a:b::c:::d")
1488        self.assertEqual([item.group(0) for item in iter],
1489                         [":", "::", ":::"])
1490
1491        pat = re.compile(r":+")
1492        iter = pat.finditer("a:b::c:::d", 1, 10)
1493        self.assertEqual([item.group(0) for item in iter],
1494                         [":", "::", ":::"])
1495
1496        pat = re.compile(r":+")
1497        iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1498        self.assertEqual([item.group(0) for item in iter],
1499                         [":", "::", ":::"])
1500
1501        pat = re.compile(r":+")
1502        iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1503        self.assertEqual([item.group(0) for item in iter],
1504                         [":", "::", ":::"])
1505
1506        pat = re.compile(r":+")
1507        iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1508        self.assertEqual([item.group(0) for item in iter],
1509                         ["::", "::"])
1510
1511    def test_bug_926075(self):
1512        self.assertIsNot(re.compile('bug_926075'),
1513                         re.compile(b'bug_926075'))
1514
1515    def test_bug_931848(self):
1516        pattern = "[\u002E\u3002\uFF0E\uFF61]"
1517        self.assertEqual(re.compile(pattern).split("a.b.c"),
1518                         ['a','b','c'])
1519
1520    def test_bug_581080(self):
1521        iter = re.finditer(r"\s", "a b")
1522        self.assertEqual(next(iter).span(), (1,2))
1523        self.assertRaises(StopIteration, next, iter)
1524
1525        scanner = re.compile(r"\s").scanner("a b")
1526        self.assertEqual(scanner.search().span(), (1, 2))
1527        self.assertIsNone(scanner.search())
1528
1529    def test_bug_817234(self):
1530        iter = re.finditer(r".*", "asdf")
1531        self.assertEqual(next(iter).span(), (0, 4))
1532        self.assertEqual(next(iter).span(), (4, 4))
1533        self.assertRaises(StopIteration, next, iter)
1534
1535    def test_bug_6561(self):
1536        # '\d' should match characters in Unicode category 'Nd'
1537        # (Number, Decimal Digit), but not those in 'Nl' (Number,
1538        # Letter) or 'No' (Number, Other).
1539        decimal_digits = [
1540            '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1541            '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1542            '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1543            ]
1544        for x in decimal_digits:
1545            self.assertEqual(re.match(r'^\d$', x).group(0), x)
1546
1547        not_decimal_digits = [
1548            '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1549            '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1550            '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1551            '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1552            ]
1553        for x in not_decimal_digits:
1554            self.assertIsNone(re.match(r'^\d$', x))
1555
1556    def test_empty_array(self):
1557        # SF buf 1647541
1558        import array
1559        for typecode in 'bBuhHiIlLfd':
1560            a = array.array(typecode)
1561            self.assertIsNone(re.compile(b"bla").match(a))
1562            self.assertEqual(re.compile(b"").match(a).groups(), ())
1563
1564    def test_inline_flags(self):
1565        # Bug #1700
1566        upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1567        lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
1568
1569        p = re.compile('.' + upper_char, re.I | re.S)
1570        q = p.match('\n' + lower_char)
1571        self.assertTrue(q)
1572
1573        p = re.compile('.' + lower_char, re.I | re.S)
1574        q = p.match('\n' + upper_char)
1575        self.assertTrue(q)
1576
1577        p = re.compile('(?i).' + upper_char, re.S)
1578        q = p.match('\n' + lower_char)
1579        self.assertTrue(q)
1580
1581        p = re.compile('(?i).' + lower_char, re.S)
1582        q = p.match('\n' + upper_char)
1583        self.assertTrue(q)
1584
1585        p = re.compile('(?is).' + upper_char)
1586        q = p.match('\n' + lower_char)
1587        self.assertTrue(q)
1588
1589        p = re.compile('(?is).' + lower_char)
1590        q = p.match('\n' + upper_char)
1591        self.assertTrue(q)
1592
1593        p = re.compile('(?s)(?i).' + upper_char)
1594        q = p.match('\n' + lower_char)
1595        self.assertTrue(q)
1596
1597        p = re.compile('(?s)(?i).' + lower_char)
1598        q = p.match('\n' + upper_char)
1599        self.assertTrue(q)
1600
1601        self.assertTrue(re.match('(?ix) ' + upper_char, lower_char))
1602        self.assertTrue(re.match('(?ix) ' + lower_char, upper_char))
1603        self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X))
1604        self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char))
1605        self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X))
1606
1607        msg = "global flags not at the start of the expression"
1608        self.checkPatternError(upper_char + '(?i)', msg, 1)
1609
1610        # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning
1611        with warnings.catch_warnings():
1612            warnings.simplefilter('error', BytesWarning)
1613            self.checkPatternError(b'A(?i)', msg, 1)
1614
1615        self.checkPatternError('(?s).(?i)' + upper_char, msg, 5)
1616        self.checkPatternError('(?i) ' + upper_char + ' (?x)', msg, 7)
1617        self.checkPatternError(' (?x) (?i) ' + upper_char, msg, 1)
1618        self.checkPatternError('^(?i)' + upper_char, msg, 1)
1619        self.checkPatternError('$|(?i)' + upper_char, msg, 2)
1620        self.checkPatternError('(?:(?i)' + upper_char + ')', msg, 3)
1621        self.checkPatternError('(^)?(?(1)(?i)' + upper_char + ')', msg, 9)
1622        self.checkPatternError('($)?(?(1)|(?i)' + upper_char + ')', msg, 10)
1623
1624
1625    def test_dollar_matches_twice(self):
1626        r"""Test that $ does not include \n
1627        $ matches the end of string, and just before the terminating \n"""
1628        pattern = re.compile('$')
1629        self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1630        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1631        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1632
1633        pattern = re.compile('$', re.MULTILINE)
1634        self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1635        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1636        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1637
1638    def test_bytes_str_mixing(self):
1639        # Mixing str and bytes is disallowed
1640        pat = re.compile('.')
1641        bpat = re.compile(b'.')
1642        self.assertRaises(TypeError, pat.match, b'b')
1643        self.assertRaises(TypeError, bpat.match, 'b')
1644        self.assertRaises(TypeError, pat.sub, b'b', 'c')
1645        self.assertRaises(TypeError, pat.sub, 'b', b'c')
1646        self.assertRaises(TypeError, pat.sub, b'b', b'c')
1647        self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1648        self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1649        self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1650
1651    def test_ascii_and_unicode_flag(self):
1652        # String patterns
1653        for flags in (0, re.UNICODE):
1654            pat = re.compile('\xc0', flags | re.IGNORECASE)
1655            self.assertTrue(pat.match('\xe0'))
1656            pat = re.compile(r'\w', flags)
1657            self.assertTrue(pat.match('\xe0'))
1658        pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
1659        self.assertIsNone(pat.match('\xe0'))
1660        pat = re.compile('(?a)\xc0', re.IGNORECASE)
1661        self.assertIsNone(pat.match('\xe0'))
1662        pat = re.compile(r'\w', re.ASCII)
1663        self.assertIsNone(pat.match('\xe0'))
1664        pat = re.compile(r'(?a)\w')
1665        self.assertIsNone(pat.match('\xe0'))
1666        # Bytes patterns
1667        for flags in (0, re.ASCII):
1668            pat = re.compile(b'\xc0', flags | re.IGNORECASE)
1669            self.assertIsNone(pat.match(b'\xe0'))
1670            pat = re.compile(br'\w', flags)
1671            self.assertIsNone(pat.match(b'\xe0'))
1672        # Incompatibilities
1673        self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
1674        self.assertRaises(re.error, re.compile, br'(?u)\w')
1675        self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
1676        self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
1677        self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
1678        self.assertRaises(re.error, re.compile, r'(?au)\w')
1679
    def test_locale_flag(self):
        # Check re.LOCALE / (?L) on bytes patterns, using a non-ASCII letter
        # of the preferred encoding when one can be found.
        enc = locale.getpreferredencoding()
        # Search non-ASCII letter
        # Look for a byte in 128..255 that decodes to a character whose
        # lowercase form differs from it, encodes to a single byte, and
        # round-trips through the encoding.
        for i in range(128, 256):
            try:
                c = bytes([i]).decode(enc)
                sletter = c.lower()
                if sletter == c: continue
                bletter = sletter.encode(enc)
                if len(bletter) != 1: continue
                if bletter.decode(enc) != sletter: continue
                bpat = re.escape(bytes([i]))
                break
            except (UnicodeError, TypeError):
                # This byte cannot be decoded/encoded here; try the next one.
                pass
        else:
            # No suitable byte found: the patterns below are still compiled
            # (with an ASCII letter) but the match assertions are skipped.
            bletter = None
            bpat = b'A'
        # Bytes patterns
        # With LOCALE (flag or inline) the lowercase byte matches
        # case-insensitively and counts as \w; without LOCALE it does not.
        pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
        if bletter:
            self.assertTrue(pat.match(bletter))
        pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
        if bletter:
            self.assertTrue(pat.match(bletter))
        pat = re.compile(bpat, re.IGNORECASE)
        if bletter:
            self.assertIsNone(pat.match(bletter))
        pat = re.compile(br'\w', re.LOCALE)
        if bletter:
            self.assertTrue(pat.match(bletter))
        pat = re.compile(br'(?L)\w')
        if bletter:
            self.assertTrue(pat.match(bletter))
        pat = re.compile(br'\w')
        if bletter:
            self.assertIsNone(pat.match(bletter))
        # Incompatibilities
        # LOCALE is rejected for str patterns and in combination with ASCII.
        self.assertRaises(ValueError, re.compile, '', re.LOCALE)
        self.assertRaises(re.error, re.compile, '(?L)')
        self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
        self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
        self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
        self.assertRaises(re.error, re.compile, b'(?aL)')
1724
1725    def test_scoped_flags(self):
1726        self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
1727        self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
1728        self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
1729        self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
1730        self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
1731        self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))
1732
1733        self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0'))
1734        self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0'))
1735        self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII))
1736
1737        self.checkPatternError(r'(?a)(?-a:\w)',
1738                "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8)
1739        self.checkPatternError(r'(?i-i:a)',
1740                'bad inline flags: flag turned on and off', 5)
1741        self.checkPatternError(r'(?au:a)',
1742                "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
1743        self.checkPatternError(br'(?aL:a)',
1744                "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
1745
1746        self.checkPatternError(r'(?-', 'missing flag', 3)
1747        self.checkPatternError(r'(?-+', 'missing flag', 3)
1748        self.checkPatternError(r'(?-z', 'unknown flag', 3)
1749        self.checkPatternError(r'(?-i', 'missing :', 4)
1750        self.checkPatternError(r'(?-i)', 'missing :', 4)
1751        self.checkPatternError(r'(?-i+', 'missing :', 4)
1752        self.checkPatternError(r'(?-iz', 'unknown flag', 4)
1753        self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0)
1754        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
1755        self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
1756        self.checkPatternError(r'(?iz', 'unknown flag', 3)
1757
1758    def test_ignore_spaces(self):
1759        for space in " \t\n\r\v\f":
1760            self.assertTrue(re.fullmatch(space + 'a', 'a', re.VERBOSE))
1761        for space in b" ", b"\t", b"\n", b"\r", b"\v", b"\f":
1762            self.assertTrue(re.fullmatch(space + b'a', b'a', re.VERBOSE))
1763        self.assertTrue(re.fullmatch('(?x) a', 'a'))
1764        self.assertTrue(re.fullmatch(' (?x) a', 'a', re.VERBOSE))
1765        self.assertTrue(re.fullmatch('(?x) (?x) a', 'a'))
1766        self.assertTrue(re.fullmatch(' a(?x: b) c', ' ab c'))
1767        self.assertTrue(re.fullmatch(' a(?-x: b) c', 'a bc', re.VERBOSE))
1768        self.assertTrue(re.fullmatch('(?x) a(?-x: b) c', 'a bc'))
1769        self.assertTrue(re.fullmatch('(?x) a| b', 'a'))
1770        self.assertTrue(re.fullmatch('(?x) a| b', 'b'))
1771
1772    def test_comments(self):
1773        self.assertTrue(re.fullmatch('#x\na', 'a', re.VERBOSE))
1774        self.assertTrue(re.fullmatch(b'#x\na', b'a', re.VERBOSE))
1775        self.assertTrue(re.fullmatch('(?x)#x\na', 'a'))
1776        self.assertTrue(re.fullmatch('#x\n(?x)#y\na', 'a', re.VERBOSE))
1777        self.assertTrue(re.fullmatch('(?x)#x\n(?x)#y\na', 'a'))
1778        self.assertTrue(re.fullmatch('#x\na(?x:#y\nb)#z\nc', '#x\nab#z\nc'))
1779        self.assertTrue(re.fullmatch('#x\na(?-x:#y\nb)#z\nc', 'a#y\nbc',
1780                                     re.VERBOSE))
1781        self.assertTrue(re.fullmatch('(?x)#x\na(?-x:#y\nb)#z\nc', 'a#y\nbc'))
1782        self.assertTrue(re.fullmatch('(?x)#x\na|#y\nb', 'a'))
1783        self.assertTrue(re.fullmatch('(?x)#x\na|#y\nb', 'b'))
1784
1785    def test_bug_6509(self):
1786        # Replacement strings of both types must parse properly.
1787        # all strings
1788        pat = re.compile(r'a(\w)')
1789        self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1790        pat = re.compile('a(.)')
1791        self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1792        pat = re.compile('..')
1793        self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1794
1795        # all bytes
1796        pat = re.compile(br'a(\w)')
1797        self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1798        pat = re.compile(b'a(.)')
1799        self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1800        pat = re.compile(b'..')
1801        self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1802
1803    def test_dealloc(self):
1804        # issue 3299: check for segfault in debug build
1805        import _sre
1806        # the overflow limit is different on wide and narrow builds and it
1807        # depends on the definition of SRE_CODE (see sre.h).
1808        # 2**128 should be big enough to overflow on both. For smaller values
1809        # a RuntimeError is raised instead of OverflowError.
1810        long_overflow = 2**128
1811        self.assertRaises(TypeError, re.finditer, "a", {})
1812        with self.assertRaises(OverflowError):
1813            _sre.compile("abc", 0, [long_overflow], 0, {}, ())
1814        with self.assertRaises(TypeError):
1815            _sre.compile({}, 0, [], 0, [], [])
1816
1817    def test_search_dot_unicode(self):
1818        self.assertTrue(re.search("123.*-", '123abc-'))
1819        self.assertTrue(re.search("123.*-", '123\xe9-'))
1820        self.assertTrue(re.search("123.*-", '123\u20ac-'))
1821        self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1822        self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
1823
1824    def test_compile(self):
1825        # Test return value when given string and pattern as parameter
1826        pattern = re.compile('random pattern')
1827        self.assertIsInstance(pattern, re.Pattern)
1828        same_pattern = re.compile(pattern)
1829        self.assertIsInstance(same_pattern, re.Pattern)
1830        self.assertIs(same_pattern, pattern)
1831        # Test behaviour when not given a string or pattern as parameter
1832        self.assertRaises(TypeError, re.compile, 0)
1833
1834    @bigmemtest(size=_2G, memuse=1)
1835    def test_large_search(self, size):
1836        # Issue #10182: indices were 32-bit-truncated.
1837        s = 'a' * size
1838        m = re.search('$', s)
1839        self.assertIsNotNone(m)
1840        self.assertEqual(m.start(), size)
1841        self.assertEqual(m.end(), size)
1842
1843    # The huge memuse is because of re.sub() using a list and a join()
1844    # to create the replacement result.
1845    @bigmemtest(size=_2G, memuse=16 + 2)
1846    def test_large_subn(self, size):
1847        # Issue #10182: indices were 32-bit-truncated.
1848        s = 'a' * size
1849        r, n = re.subn('', '', s)
1850        self.assertEqual(r, s)
1851        self.assertEqual(n, size + 1)
1852
1853    def test_bug_16688(self):
1854        # Issue 16688: Backreferences make case-insensitive regex fail on
1855        # non-ASCII strings.
1856        self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1857        self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
1858
1859    def test_repeat_minmax_overflow(self):
1860        # Issue #13169
1861        string = "x" * 100000
1862        self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1863        self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1864        self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1865        self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1866        self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1867        self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1868        # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1869        self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1870        self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1871        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1872        self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1873
1874    @cpython_only
1875    def test_repeat_minmax_overflow_maxrepeat(self):
1876        try:
1877            from _sre import MAXREPEAT
1878        except ImportError:
1879            self.skipTest('requires _sre.MAXREPEAT constant')
1880        string = "x" * 100000
1881        self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1882        self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1883                         (0, 100000))
1884        self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1885        self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1886        self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1887        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1888
1889    def test_backref_group_name_in_exception(self):
1890        # Issue 17341: Poor error message when compiling invalid regex
1891        self.checkPatternError('(?P=<foo>)',
1892                               "bad character in group name '<foo>'", 4)
1893
1894    def test_group_name_in_exception(self):
1895        # Issue 17341: Poor error message when compiling invalid regex
1896        self.checkPatternError('(?P<?foo>)',
1897                               "bad character in group name '?foo'", 4)
1898
1899    def test_issue17998(self):
1900        for reps in '*', '+', '?', '{1}':
1901            for mod in '', '?':
1902                pattern = '.' + reps + mod + 'yz'
1903                self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1904                                 ['xyz'], msg=pattern)
1905                pattern = pattern.encode()
1906                self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1907                                 [b'xyz'], msg=pattern)
1908
1909    def test_match_repr(self):
1910        for string in '[abracadabra]', S('[abracadabra]'):
1911            m = re.search(r'(.+)(.*?)\1', string)
1912            pattern = r"<(%s\.)?%s object; span=\(1, 12\), match='abracadabra'>" % (
1913                type(m).__module__, type(m).__qualname__
1914            )
1915            self.assertRegex(repr(m), pattern)
1916        for string in (b'[abracadabra]', B(b'[abracadabra]'),
1917                       bytearray(b'[abracadabra]'),
1918                       memoryview(b'[abracadabra]')):
1919            m = re.search(br'(.+)(.*?)\1', string)
1920            pattern = r"<(%s\.)?%s object; span=\(1, 12\), match=b'abracadabra'>" % (
1921                type(m).__module__, type(m).__qualname__
1922            )
1923            self.assertRegex(repr(m), pattern)
1924
1925        first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
1926        pattern = r"<(%s\.)?%s object; span=\(0, 2\), match='aa'>" % (
1927            type(second).__module__, type(second).__qualname__
1928        )
1929        self.assertRegex(repr(first), pattern)
1930        pattern = r"<(%s\.)?%s object; span=\(3, 5\), match='bb'>" % (
1931            type(second).__module__, type(second).__qualname__
1932        )
1933        self.assertRegex(repr(second), pattern)
1934
1935    def test_zerowidth(self):
1936        # Issues 852532, 1647489, 3262, 25054.
1937        self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
1938        self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', ''])
1939        self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc'])
1940        self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
1941
1942        self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
1943        self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-')
1944        self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]')
1945
1946        self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
1947        self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
1948                         ['', 'a', '', '', 'bc', ''])
1949
1950        self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
1951                         [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
1952        self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
1953                         [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])
1954
1955    def test_bug_2537(self):
1956        # issue 2537: empty submatches
1957        for outer_op in ('{0,}', '*', '+', '{1,187}'):
1958            for inner_op in ('{0,}', '*', '?'):
1959                r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1960                m = r.match("xyyzy")
1961                self.assertEqual(m.group(0), "xyy")
1962                self.assertEqual(m.group(1), "")
1963                self.assertEqual(m.group(2), "y")
1964
1965    def test_keyword_parameters(self):
1966        # Issue #20283: Accepting the string keyword parameter.
1967        pat = re.compile(r'(ab)')
1968        self.assertEqual(
1969            pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1970        self.assertEqual(
1971            pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1972        self.assertEqual(
1973            pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1974        self.assertEqual(
1975            pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1976        self.assertEqual(
1977            pat.split(string='abracadabra', maxsplit=1),
1978            ['', 'ab', 'racadabra'])
1979        self.assertEqual(
1980            pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1981            (7, 9))
1982
1983    def test_bug_20998(self):
1984        # Issue #20998: Fullmatch of repeated single character pattern
1985        # with ignore case.
1986        self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1987
1988    @unittest.skipIf(
1989        is_emscripten or is_wasi,
1990        "musl libc issue on Emscripten/WASI, bpo-46390"
1991    )
1992    def test_locale_caching(self):
1993        # Issue #22410
1994        oldlocale = locale.setlocale(locale.LC_CTYPE)
1995        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1996        for loc in 'en_US.iso88591', 'en_US.utf8':
1997            try:
1998                locale.setlocale(locale.LC_CTYPE, loc)
1999            except locale.Error:
2000                # Unsupported locale on this system
2001                self.skipTest('test needs %s locale' % loc)
2002
2003        re.purge()
2004        self.check_en_US_iso88591()
2005        self.check_en_US_utf8()
2006        re.purge()
2007        self.check_en_US_utf8()
2008        self.check_en_US_iso88591()
2009
2010    def check_en_US_iso88591(self):
2011        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
2012        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
2013        self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
2014        self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
2015        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
2016        self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
2017        self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
2018
2019    def check_en_US_utf8(self):
2020        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
2021        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
2022        self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
2023        self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
2024        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
2025        self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
2026        self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
2027
2028    @unittest.skipIf(
2029        is_emscripten or is_wasi,
2030        "musl libc issue on Emscripten/WASI, bpo-46390"
2031    )
2032    def test_locale_compiled(self):
2033        oldlocale = locale.setlocale(locale.LC_CTYPE)
2034        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
2035        for loc in 'en_US.iso88591', 'en_US.utf8':
2036            try:
2037                locale.setlocale(locale.LC_CTYPE, loc)
2038            except locale.Error:
2039                # Unsupported locale on this system
2040                self.skipTest('test needs %s locale' % loc)
2041
2042        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
2043        p1 = re.compile(b'\xc5\xe5', re.L|re.I)
2044        p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
2045        p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
2046        p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
2047        for p in p1, p2, p3:
2048            self.assertTrue(p.match(b'\xc5\xe5'))
2049            self.assertTrue(p.match(b'\xe5\xe5'))
2050            self.assertTrue(p.match(b'\xc5\xc5'))
2051        self.assertIsNone(p4.match(b'\xe5\xc5'))
2052        self.assertIsNone(p4.match(b'\xe5\xe5'))
2053        self.assertIsNone(p4.match(b'\xc5\xc5'))
2054
2055        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
2056        for p in p1, p2, p3:
2057            self.assertTrue(p.match(b'\xc5\xe5'))
2058            self.assertIsNone(p.match(b'\xe5\xe5'))
2059            self.assertIsNone(p.match(b'\xc5\xc5'))
2060        self.assertTrue(p4.match(b'\xe5\xc5'))
2061        self.assertIsNone(p4.match(b'\xe5\xe5'))
2062        self.assertIsNone(p4.match(b'\xc5\xc5'))
2063
2064    def test_error(self):
2065        with self.assertRaises(re.error) as cm:
2066            re.compile('(\u20ac))')
2067        err = cm.exception
2068        self.assertIsInstance(err.pattern, str)
2069        self.assertEqual(err.pattern, '(\u20ac))')
2070        self.assertEqual(err.pos, 3)
2071        self.assertEqual(err.lineno, 1)
2072        self.assertEqual(err.colno, 4)
2073        self.assertIn(err.msg, str(err))
2074        self.assertIn(' at position 3', str(err))
2075        self.assertNotIn(' at position 3', err.msg)
2076        # Bytes pattern
2077        with self.assertRaises(re.error) as cm:
2078            re.compile(b'(\xa4))')
2079        err = cm.exception
2080        self.assertIsInstance(err.pattern, bytes)
2081        self.assertEqual(err.pattern, b'(\xa4))')
2082        self.assertEqual(err.pos, 3)
2083        # Multiline pattern
2084        with self.assertRaises(re.error) as cm:
2085            re.compile("""
2086                (
2087                    abc
2088                )
2089                )
2090                (
2091                """, re.VERBOSE)
2092        err = cm.exception
2093        self.assertEqual(err.pos, 77)
2094        self.assertEqual(err.lineno, 5)
2095        self.assertEqual(err.colno, 17)
2096        self.assertIn(err.msg, str(err))
2097        self.assertIn(' at position 77', str(err))
2098        self.assertIn('(line 5, column 17)', str(err))
2099
2100    def test_misc_errors(self):
2101        self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
2102        self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
2103        self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
2104        self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
2105        self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
2106        self.checkPatternError(r'(?iz)', 'unknown flag', 3)
2107        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
2108        self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
2109        self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
2110        self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
2111        self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
2112
2113    def test_enum(self):
2114        # Issue #28082: Check that str(flag) returns a human readable string
2115        # instead of an integer
2116        self.assertIn('ASCII', str(re.A))
2117        self.assertIn('DOTALL', str(re.S))
2118
2119    def test_pattern_compare(self):
2120        pattern1 = re.compile('abc', re.IGNORECASE)
2121
2122        # equal to itself
2123        self.assertEqual(pattern1, pattern1)
2124        self.assertFalse(pattern1 != pattern1)
2125
2126        # equal
2127        re.purge()
2128        pattern2 = re.compile('abc', re.IGNORECASE)
2129        self.assertEqual(hash(pattern2), hash(pattern1))
2130        self.assertEqual(pattern2, pattern1)
2131
2132        # not equal: different pattern
2133        re.purge()
2134        pattern3 = re.compile('XYZ', re.IGNORECASE)
2135        # Don't test hash(pattern3) != hash(pattern1) because there is no
2136        # warranty that hash values are different
2137        self.assertNotEqual(pattern3, pattern1)
2138
2139        # not equal: different flag (flags=0)
2140        re.purge()
2141        pattern4 = re.compile('abc')
2142        self.assertNotEqual(pattern4, pattern1)
2143
2144        # only == and != comparison operators are supported
2145        with self.assertRaises(TypeError):
2146            pattern1 < pattern2
2147
2148    def test_pattern_compare_bytes(self):
2149        pattern1 = re.compile(b'abc')
2150
2151        # equal: test bytes patterns
2152        re.purge()
2153        pattern2 = re.compile(b'abc')
2154        self.assertEqual(hash(pattern2), hash(pattern1))
2155        self.assertEqual(pattern2, pattern1)
2156
2157        # not equal: pattern of a different types (str vs bytes),
2158        # comparison must not raise a BytesWarning
2159        re.purge()
2160        pattern3 = re.compile('abc')
2161        with warnings.catch_warnings():
2162            warnings.simplefilter('error', BytesWarning)
2163            self.assertNotEqual(pattern3, pattern1)
2164
2165    def test_bug_29444(self):
2166        s = bytearray(b'abcdefgh')
2167        m = re.search(b'[a-h]+', s)
2168        m2 = re.search(b'[e-h]+', s)
2169        self.assertEqual(m.group(), b'abcdefgh')
2170        self.assertEqual(m2.group(), b'efgh')
2171        s[:] = b'xyz'
2172        self.assertEqual(m.group(), b'xyz')
2173        self.assertEqual(m2.group(), b'')
2174
2175    def test_bug_34294(self):
2176        # Issue 34294: wrong capturing groups
2177
2178        # exists since Python 2
2179        s = "a\tx"
2180        p = r"\b(?=(\t)|(x))x"
2181        self.assertEqual(re.search(p, s).groups(), (None, 'x'))
2182
2183        # introduced in Python 3.7.0
2184        s = "ab"
2185        p = r"(?=(.)(.)?)"
2186        self.assertEqual(re.findall(p, s),
2187                         [('a', 'b'), ('b', '')])
2188        self.assertEqual([m.groups() for m in re.finditer(p, s)],
2189                         [('a', 'b'), ('b', None)])
2190
2191        # test-cases provided by issue34294, introduced in Python 3.7.0
2192        p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
2193        s = "<test><foo2/></test>"
2194        self.assertEqual(re.findall(p, s),
2195                         [('test', '<foo2/>'), ('foo2', '')])
2196        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2197                         [{'tag': 'test', 'text': '<foo2/>'},
2198                          {'tag': 'foo2', 'text': None}])
2199        s = "<test>Hello</test><foo/>"
2200        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2201                         [{'tag': 'test', 'text': 'Hello'},
2202                          {'tag': 'foo', 'text': None}])
2203        s = "<test>Hello</test><foo/><foo/>"
2204        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2205                         [{'tag': 'test', 'text': 'Hello'},
2206                          {'tag': 'foo', 'text': None},
2207                          {'tag': 'foo', 'text': None}])
2208
2209    def test_MARK_PUSH_macro_bug(self):
2210        # issue35859, MARK_PUSH() macro didn't protect MARK-0 if it
2211        # was the only available mark.
2212        self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',))
2213        self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',))
2214        self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',))
2215        self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',))
2216
2217    def test_MIN_UNTIL_mark_bug(self):
2218        # Fixed in issue35859, reported in issue9134.
2219        # JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
2220        s = 'axxzbcz'
2221        p = r'(?:(?:a|bc)*?(xx)??z)*'
2222        self.assertEqual(re.match(p, s).groups(), ('xx',))
2223
2224        # test-case provided by issue9134
2225        s = 'xtcxyzxc'
2226        p = r'((x|yz)+?(t)??c)*'
2227        m = re.match(p, s)
2228        self.assertEqual(m.span(), (0, 8))
2229        self.assertEqual(m.span(2), (6, 7))
2230        self.assertEqual(m.groups(), ('xyzxc', 'x', 't'))
2231
2232    def test_REPEAT_ONE_mark_bug(self):
2233        # issue35859
2234        # JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat
2235        s = 'aabaab'
2236        p = r'(?:[^b]*a(?=(b)|(a))ab)*'
2237        m = re.match(p, s)
2238        self.assertEqual(m.span(), (0, 6))
2239        self.assertEqual(m.span(2), (4, 5))
2240        self.assertEqual(m.groups(), (None, 'a'))
2241
2242        # JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat
2243        s = 'abab'
2244        p = r'(?:[^b]*(?=(b)|(a))ab)*'
2245        m = re.match(p, s)
2246        self.assertEqual(m.span(), (0, 4))
2247        self.assertEqual(m.span(2), (2, 3))
2248        self.assertEqual(m.groups(), (None, 'a'))
2249
2250        self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',))
2251
2252    def test_MIN_REPEAT_ONE_mark_bug(self):
2253        # issue35859
2254        # JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
2255        s = 'abab'
2256        p = r'(?:.*?(?=(a)|(b))b)*'
2257        m = re.match(p, s)
2258        self.assertEqual(m.span(), (0, 4))
2259        self.assertEqual(m.span(2), (3, 4))
2260        self.assertEqual(m.groups(), (None, 'b'))
2261
2262        s = 'axxzaz'
2263        p = r'(?:a*?(xx)??z)*'
2264        self.assertEqual(re.match(p, s).groups(), ('xx',))
2265
2266    def test_ASSERT_NOT_mark_bug(self):
2267        # Fixed in issue35859, reported in issue725149.
2268        # JUMP_ASSERT_NOT should LASTMARK_SAVE()
2269        self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,))
2270
2271        # JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
2272        m = re.match(r'((?!(ab)c)(.))*', 'abab')
2273        self.assertEqual(m.span(), (0, 4))
2274        self.assertEqual(m.span(1), (3, 4))
2275        self.assertEqual(m.span(3), (3, 4))
2276        self.assertEqual(m.groups(), ('b', None, 'b'))
2277
2278    def test_bug_40736(self):
2279        with self.assertRaisesRegex(TypeError, "got 'int'"):
2280            re.search("x*", 5)
2281        with self.assertRaisesRegex(TypeError, "got 'type'"):
2282            re.search("x*", type)
2283
2284    def test_search_anchor_at_beginning(self):
2285        s = 'x'*10**7
2286        start = time.perf_counter()
2287        for p in r'\Ay', r'^y':
2288            self.assertIsNone(re.search(p, s))
2289            self.assertEqual(re.split(p, s), [s])
2290            self.assertEqual(re.findall(p, s), [])
2291            self.assertEqual(list(re.finditer(p, s)), [])
2292            self.assertEqual(re.sub(p, '', s), s)
2293        t = time.perf_counter() - start
2294        # Without optimization it takes 1 second on my computer.
2295        # With optimization -- 0.0003 seconds.
2296        self.assertLess(t, 0.1)
2297
2298    def test_possessive_quantifiers(self):
2299        """Test Possessive Quantifiers
2300        Test quantifiers of the form @+ for some repetition operator @,
2301        e.g. x{3,5}+ meaning match from 3 to 5 greadily and proceed
2302        without creating a stack frame for rolling the stack back and
2303        trying 1 or more fewer matches."""
2304        self.assertIsNone(re.match('e*+e', 'eeee'))
2305        self.assertEqual(re.match('e++a', 'eeea').group(0), 'eeea')
2306        self.assertEqual(re.match('e?+a', 'ea').group(0), 'ea')
2307        self.assertEqual(re.match('e{2,4}+a', 'eeea').group(0), 'eeea')
2308        self.assertIsNone(re.match('(.)++.', 'ee'))
2309        self.assertEqual(re.match('(ae)*+a', 'aea').groups(), ('ae',))
2310        self.assertEqual(re.match('([ae][ae])?+a', 'aea').groups(),
2311                         ('ae',))
2312        self.assertEqual(re.match('(e?){2,4}+a', 'eeea').groups(),
2313                         ('',))
2314        self.assertEqual(re.match('()*+a', 'a').groups(), ('',))
2315        self.assertEqual(re.search('x*+', 'axx').span(), (0, 0))
2316        self.assertEqual(re.search('x++', 'axx').span(), (1, 3))
2317        self.assertEqual(re.match('a*+', 'xxx').span(), (0, 0))
2318        self.assertEqual(re.match('x*+', 'xxxa').span(), (0, 3))
2319        self.assertIsNone(re.match('a++', 'xxx'))
2320        self.assertIsNone(re.match(r"^(\w){1}+$", "abc"))
2321        self.assertIsNone(re.match(r"^(\w){1,2}+$", "abc"))
2322
2323        self.assertEqual(re.match(r"^(\w){3}+$", "abc").group(1), "c")
2324        self.assertEqual(re.match(r"^(\w){1,3}+$", "abc").group(1), "c")
2325        self.assertEqual(re.match(r"^(\w){1,4}+$", "abc").group(1), "c")
2326
2327        self.assertIsNone(re.match("^x{1}+$", "xxx"))
2328        self.assertIsNone(re.match("^x{1,2}+$", "xxx"))
2329
2330        self.assertTrue(re.match("^x{3}+$", "xxx"))
2331        self.assertTrue(re.match("^x{1,3}+$", "xxx"))
2332        self.assertTrue(re.match("^x{1,4}+$", "xxx"))
2333
2334        self.assertIsNone(re.match("^x{}+$", "xxx"))
2335        self.assertTrue(re.match("^x{}+$", "x{}"))
2336
2337    def test_fullmatch_possessive_quantifiers(self):
2338        self.assertTrue(re.fullmatch(r'a++', 'a'))
2339        self.assertTrue(re.fullmatch(r'a*+', 'a'))
2340        self.assertTrue(re.fullmatch(r'a?+', 'a'))
2341        self.assertTrue(re.fullmatch(r'a{1,3}+', 'a'))
2342        self.assertIsNone(re.fullmatch(r'a++', 'ab'))
2343        self.assertIsNone(re.fullmatch(r'a*+', 'ab'))
2344        self.assertIsNone(re.fullmatch(r'a?+', 'ab'))
2345        self.assertIsNone(re.fullmatch(r'a{1,3}+', 'ab'))
2346        self.assertTrue(re.fullmatch(r'a++b', 'ab'))
2347        self.assertTrue(re.fullmatch(r'a*+b', 'ab'))
2348        self.assertTrue(re.fullmatch(r'a?+b', 'ab'))
2349        self.assertTrue(re.fullmatch(r'a{1,3}+b', 'ab'))
2350
2351        self.assertTrue(re.fullmatch(r'(?:ab)++', 'ab'))
2352        self.assertTrue(re.fullmatch(r'(?:ab)*+', 'ab'))
2353        self.assertTrue(re.fullmatch(r'(?:ab)?+', 'ab'))
2354        self.assertTrue(re.fullmatch(r'(?:ab){1,3}+', 'ab'))
2355        self.assertIsNone(re.fullmatch(r'(?:ab)++', 'abc'))
2356        self.assertIsNone(re.fullmatch(r'(?:ab)*+', 'abc'))
2357        self.assertIsNone(re.fullmatch(r'(?:ab)?+', 'abc'))
2358        self.assertIsNone(re.fullmatch(r'(?:ab){1,3}+', 'abc'))
2359        self.assertTrue(re.fullmatch(r'(?:ab)++c', 'abc'))
2360        self.assertTrue(re.fullmatch(r'(?:ab)*+c', 'abc'))
2361        self.assertTrue(re.fullmatch(r'(?:ab)?+c', 'abc'))
2362        self.assertTrue(re.fullmatch(r'(?:ab){1,3}+c', 'abc'))
2363
2364    def test_findall_possessive_quantifiers(self):
2365        self.assertEqual(re.findall(r'a++', 'aab'), ['aa'])
2366        self.assertEqual(re.findall(r'a*+', 'aab'), ['aa', '', ''])
2367        self.assertEqual(re.findall(r'a?+', 'aab'), ['a', 'a', '', ''])
2368        self.assertEqual(re.findall(r'a{1,3}+', 'aab'), ['aa'])
2369
2370        self.assertEqual(re.findall(r'(?:ab)++', 'ababc'), ['abab'])
2371        self.assertEqual(re.findall(r'(?:ab)*+', 'ababc'), ['abab', '', ''])
2372        self.assertEqual(re.findall(r'(?:ab)?+', 'ababc'), ['ab', 'ab', '', ''])
2373        self.assertEqual(re.findall(r'(?:ab){1,3}+', 'ababc'), ['abab'])
2374
2375    def test_atomic_grouping(self):
2376        """Test Atomic Grouping
2377        Test non-capturing groups of the form (?>...), which does
2378        not maintain any stack point created within the group once the
2379        group is finished being evaluated."""
2380        pattern1 = re.compile(r'a(?>bc|b)c')
2381        self.assertIsNone(pattern1.match('abc'))
2382        self.assertTrue(pattern1.match('abcc'))
2383        self.assertIsNone(re.match(r'(?>.*).', 'abc'))
2384        self.assertTrue(re.match(r'(?>x)++', 'xxx'))
2385        self.assertTrue(re.match(r'(?>x++)', 'xxx'))
2386        self.assertIsNone(re.match(r'(?>x)++x', 'xxx'))
2387        self.assertIsNone(re.match(r'(?>x++)x', 'xxx'))
2388
2389    def test_fullmatch_atomic_grouping(self):
2390        self.assertTrue(re.fullmatch(r'(?>a+)', 'a'))
2391        self.assertTrue(re.fullmatch(r'(?>a*)', 'a'))
2392        self.assertTrue(re.fullmatch(r'(?>a?)', 'a'))
2393        self.assertTrue(re.fullmatch(r'(?>a{1,3})', 'a'))
2394        self.assertIsNone(re.fullmatch(r'(?>a+)', 'ab'))
2395        self.assertIsNone(re.fullmatch(r'(?>a*)', 'ab'))
2396        self.assertIsNone(re.fullmatch(r'(?>a?)', 'ab'))
2397        self.assertIsNone(re.fullmatch(r'(?>a{1,3})', 'ab'))
2398        self.assertTrue(re.fullmatch(r'(?>a+)b', 'ab'))
2399        self.assertTrue(re.fullmatch(r'(?>a*)b', 'ab'))
2400        self.assertTrue(re.fullmatch(r'(?>a?)b', 'ab'))
2401        self.assertTrue(re.fullmatch(r'(?>a{1,3})b', 'ab'))
2402
2403        self.assertTrue(re.fullmatch(r'(?>(?:ab)+)', 'ab'))
2404        self.assertTrue(re.fullmatch(r'(?>(?:ab)*)', 'ab'))
2405        self.assertTrue(re.fullmatch(r'(?>(?:ab)?)', 'ab'))
2406        self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})', 'ab'))
2407        self.assertIsNone(re.fullmatch(r'(?>(?:ab)+)', 'abc'))
2408        self.assertIsNone(re.fullmatch(r'(?>(?:ab)*)', 'abc'))
2409        self.assertIsNone(re.fullmatch(r'(?>(?:ab)?)', 'abc'))
2410        self.assertIsNone(re.fullmatch(r'(?>(?:ab){1,3})', 'abc'))
2411        self.assertTrue(re.fullmatch(r'(?>(?:ab)+)c', 'abc'))
2412        self.assertTrue(re.fullmatch(r'(?>(?:ab)*)c', 'abc'))
2413        self.assertTrue(re.fullmatch(r'(?>(?:ab)?)c', 'abc'))
2414        self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})c', 'abc'))
2415
2416    def test_findall_atomic_grouping(self):
2417        self.assertEqual(re.findall(r'(?>a+)', 'aab'), ['aa'])
2418        self.assertEqual(re.findall(r'(?>a*)', 'aab'), ['aa', '', ''])
2419        self.assertEqual(re.findall(r'(?>a?)', 'aab'), ['a', 'a', '', ''])
2420        self.assertEqual(re.findall(r'(?>a{1,3})', 'aab'), ['aa'])
2421
2422        self.assertEqual(re.findall(r'(?>(?:ab)+)', 'ababc'), ['abab'])
2423        self.assertEqual(re.findall(r'(?>(?:ab)*)', 'ababc'), ['abab', '', ''])
2424        self.assertEqual(re.findall(r'(?>(?:ab)?)', 'ababc'), ['ab', 'ab', '', ''])
2425        self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab'])
2426
2427    def test_bug_gh91616(self):
2428        self.assertTrue(re.fullmatch(r'(?s:(?>.*?\.).*)\Z', "a.txt")) # reproducer
2429        self.assertTrue(re.fullmatch(r'(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\Z', "a.txt"))
2430
2431    def test_template_function_and_flag_is_deprecated(self):
2432        with self.assertWarns(DeprecationWarning) as cm:
2433            template_re1 = re.template(r'a')
2434        self.assertIn('re.template()', str(cm.warning))
2435        self.assertIn('is deprecated', str(cm.warning))
2436        self.assertIn('function', str(cm.warning))
2437        self.assertNotIn('flag', str(cm.warning))
2438
2439        with self.assertWarns(DeprecationWarning) as cm:
2440            # we deliberately use more flags here to test that that still
2441            # triggers the warning
2442            # if paranoid, we could test multiple different combinations,
2443            # but it's probably not worth it
2444            template_re2 = re.compile(r'a', flags=re.TEMPLATE|re.UNICODE)
2445        self.assertIn('re.TEMPLATE', str(cm.warning))
2446        self.assertIn('is deprecated', str(cm.warning))
2447        self.assertIn('flag', str(cm.warning))
2448        self.assertNotIn('function', str(cm.warning))
2449
2450        # while deprecated, is should still function
2451        self.assertEqual(template_re1, template_re2)
2452        self.assertTrue(template_re1.match('ahoy'))
2453        self.assertFalse(template_re1.match('nope'))
2454
2455    @unittest.skipIf(multiprocessing is None, 'test requires multiprocessing')
2456    def test_regression_gh94675(self):
2457        pattern = re.compile(r'(?<=[({}])(((//[^\n]*)?[\n])([\000-\040])*)*'
2458                             r'((/[^/\[\n]*(([^\n]|(\[\n]*(]*)*\]))'
2459                             r'[^/\[]*)*/))((((//[^\n]*)?[\n])'
2460                             r'([\000-\040]|(/\*[^*]*\*+'
2461                             r'([^/*]\*+)*/))*)+(?=[^\000-\040);\]}]))')
2462        input_js = '''a(function() {
2463            ///////////////////////////////////////////////////////////////////
2464        });'''
2465        p = multiprocessing.Process(target=pattern.sub, args=('', input_js))
2466        p.start()
2467        p.join(SHORT_TIMEOUT)
2468        try:
2469            self.assertFalse(p.is_alive(), 'pattern.sub() timed out')
2470        finally:
2471            if p.is_alive():
2472                p.terminate()
2473                p.join()
2474
2475
def get_debug_out(pat):
    """Compile *pat* with re.DEBUG and return the text written to stdout."""
    with captured_stdout() as stream:
        re.compile(pat, flags=re.DEBUG)
    debug_text = stream.getvalue()
    return debug_text
2480
2481
@cpython_only
class DebugTests(unittest.TestCase):
    # Every test compares the complete re.DEBUG output of a pattern with a
    # literal dump, so do not limit the length of the diff shown on failure.
    maxDiff = None

    def test_debug_flag(self):
        expected = '''\
SUBPATTERN 1 0 0
  LITERAL 46
BRANCH
  IN
    LITERAL 99
    LITERAL 104
OR
  LITERAL 112
  LITERAL 121
GROUPREF_EXISTS 1
  AT AT_END
ELSE
  LITERAL 58
  LITERAL 32

 0. INFO 8 0b1 2 5 (to 9)
      prefix_skip 0
      prefix [0x2e] ('.')
      overlap [0]
 9: MARK 0
11. LITERAL 0x2e ('.')
13. MARK 1
15. BRANCH 10 (to 26)
17.   IN 6 (to 24)
19.     LITERAL 0x63 ('c')
21.     LITERAL 0x68 ('h')
23.     FAILURE
24:   JUMP 9 (to 34)
26: branch 7 (to 33)
27.   LITERAL 0x70 ('p')
29.   LITERAL 0x79 ('y')
31.   JUMP 2 (to 34)
33: FAILURE
34: GROUPREF_EXISTS 0 6 (to 41)
37. AT END
39. JUMP 5 (to 45)
41: LITERAL 0x3a (':')
43. LITERAL 0x20 (' ')
45: SUCCESS
'''
        source = r'(\.)(?:[ch]|py)(?(1)$|: )'
        # Checked twice on purpose: the debug output must be produced again
        # the second time, bypassing the cache (issue #20426).
        for _ in range(2):
            self.assertEqual(get_debug_out(source), expected)

    def test_atomic_group(self):
        actual = get_debug_out(r'(?>ab?)')
        expected = '''\
ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))]

 0. INFO 4 0b0 1 2 (to 5)
 5: ATOMIC_GROUP 11 (to 17)
 7.   LITERAL 0x61 ('a')
 9.   REPEAT_ONE 6 0 1 (to 16)
13.     LITERAL 0x62 ('b')
15.     SUCCESS
16:   SUCCESS
17: SUCCESS
'''
        self.assertEqual(actual, expected)

    def test_possesive_repeat_one(self):
        actual = get_debug_out(r'a?+')
        expected = '''\
POSSESSIVE_REPEAT 0 1
  LITERAL 97

 0. INFO 4 0b0 0 1 (to 5)
 5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12)
 9.   LITERAL 0x61 ('a')
11.   SUCCESS
12: SUCCESS
'''
        self.assertEqual(actual, expected)

    def test_possesive_repeat(self):
        actual = get_debug_out(r'(?:ab)?+')
        expected = '''\
POSSESSIVE_REPEAT 0 1
  LITERAL 97
  LITERAL 98

 0. INFO 4 0b0 0 2 (to 5)
 5: POSSESSIVE_REPEAT 7 0 1 (to 13)
 9.   LITERAL 0x61 ('a')
11.   LITERAL 0x62 ('b')
13: SUCCESS
14. SUCCESS
'''
        self.assertEqual(actual, expected)
2573
2574
2575class PatternReprTests(unittest.TestCase):
2576    def check(self, pattern, expected):
2577        self.assertEqual(repr(re.compile(pattern)), expected)
2578
2579    def check_flags(self, pattern, flags, expected):
2580        self.assertEqual(repr(re.compile(pattern, flags)), expected)
2581
2582    def test_without_flags(self):
2583        self.check('random pattern',
2584                   "re.compile('random pattern')")
2585
2586    def test_single_flag(self):
2587        self.check_flags('random pattern', re.IGNORECASE,
2588            "re.compile('random pattern', re.IGNORECASE)")
2589
2590    def test_multiple_flags(self):
2591        self.check_flags('random pattern', re.I|re.S|re.X,
2592            "re.compile('random pattern', "
2593            "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
2594
2595    def test_unicode_flag(self):
2596        self.check_flags('random pattern', re.U,
2597                         "re.compile('random pattern')")
2598        self.check_flags('random pattern', re.I|re.S|re.U,
2599                         "re.compile('random pattern', "
2600                         "re.IGNORECASE|re.DOTALL)")
2601
2602    def test_inline_flags(self):
2603        self.check('(?i)pattern',
2604                   "re.compile('(?i)pattern', re.IGNORECASE)")
2605
2606    def test_unknown_flags(self):
2607        self.check_flags('random pattern', 0x123000,
2608                         "re.compile('random pattern', 0x123000)")
2609        self.check_flags('random pattern', 0x123000|re.I,
2610            "re.compile('random pattern', re.IGNORECASE|0x123000)")
2611
2612    def test_bytes(self):
2613        self.check(b'bytes pattern',
2614                   "re.compile(b'bytes pattern')")
2615        self.check_flags(b'bytes pattern', re.A,
2616                         "re.compile(b'bytes pattern', re.ASCII)")
2617
2618    def test_locale(self):
2619        self.check_flags(b'bytes pattern', re.L,
2620                         "re.compile(b'bytes pattern', re.LOCALE)")
2621
2622    def test_quotes(self):
2623        self.check('random "double quoted" pattern',
2624            '''re.compile('random "double quoted" pattern')''')
2625        self.check("random 'single quoted' pattern",
2626            '''re.compile("random 'single quoted' pattern")''')
2627        self.check('''both 'single' and "double" quotes''',
2628            '''re.compile('both \\'single\\' and "double" quotes')''')
2629
2630    def test_long_pattern(self):
2631        pattern = 'Very %spattern' % ('long ' * 1000)
2632        r = repr(re.compile(pattern))
2633        self.assertLess(len(r), 300)
2634        self.assertEqual(r[:30], "re.compile('Very long long lon")
2635        r = repr(re.compile(pattern, re.I))
2636        self.assertLess(len(r), 300)
2637        self.assertEqual(r[:30], "re.compile('Very long long lon")
2638        self.assertEqual(r[-16:], ", re.IGNORECASE)")
2639
2640    def test_flags_repr(self):
2641        self.assertEqual(repr(re.I), "re.IGNORECASE")
2642        self.assertEqual(repr(re.I|re.S|re.X),
2643                         "re.IGNORECASE|re.DOTALL|re.VERBOSE")
2644        self.assertEqual(repr(re.I|re.S|re.X|(1<<20)),
2645                         "re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000")
2646        self.assertEqual(
2647                repr(~re.I),
2648                "re.ASCII|re.LOCALE|re.UNICODE|re.MULTILINE|re.DOTALL|re.VERBOSE|re.TEMPLATE|re.DEBUG")
2649        self.assertEqual(repr(~(re.I|re.S|re.X)),
2650                         "re.ASCII|re.LOCALE|re.UNICODE|re.MULTILINE|re.TEMPLATE|re.DEBUG")
2651        self.assertEqual(repr(~(re.I|re.S|re.X|(1<<20))),
2652                         "re.ASCII|re.LOCALE|re.UNICODE|re.MULTILINE|re.TEMPLATE|re.DEBUG|0xffe00")
2653
2654
2655class ImplementationTest(unittest.TestCase):
2656    """
2657    Test implementation details of the re module.
2658    """
2659
2660    @cpython_only
2661    def test_immutable(self):
2662        # bpo-43908: check that re types are immutable
2663        with self.assertRaises(TypeError):
2664            re.Match.foo = 1
2665        with self.assertRaises(TypeError):
2666            re.Pattern.foo = 1
2667        with self.assertRaises(TypeError):
2668            pat = re.compile("")
2669            tp = type(pat.scanner(""))
2670            tp.foo = 1
2671
2672    def test_overlap_table(self):
2673        f = re._compiler._generate_overlap_table
2674        self.assertEqual(f(""), [])
2675        self.assertEqual(f("a"), [0])
2676        self.assertEqual(f("abcd"), [0, 0, 0, 0])
2677        self.assertEqual(f("aaaa"), [0, 1, 2, 3])
2678        self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1])
2679        self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0])
2680
2681    def test_signedness(self):
2682        self.assertGreaterEqual(re._compiler.MAXREPEAT, 0)
2683        self.assertGreaterEqual(re._compiler.MAXGROUPS, 0)
2684
2685    @cpython_only
2686    def test_disallow_instantiation(self):
2687        # Ensure that the type disallows instantiation (bpo-43916)
2688        check_disallow_instantiation(self, re.Match)
2689        check_disallow_instantiation(self, re.Pattern)
2690        pat = re.compile("")
2691        check_disallow_instantiation(self, type(pat.scanner("")))
2692
2693    def test_deprecated_modules(self):
2694        deprecated = {
2695            'sre_compile': ['compile', 'error',
2696                            'SRE_FLAG_IGNORECASE', 'SUBPATTERN',
2697                            '_compile_info'],
2698            'sre_constants': ['error', 'SRE_FLAG_IGNORECASE', 'SUBPATTERN',
2699                              '_NamedIntConstant'],
2700            'sre_parse': ['SubPattern', 'parse',
2701                          'SRE_FLAG_IGNORECASE', 'SUBPATTERN',
2702                          '_parse_sub'],
2703        }
2704        for name in deprecated:
2705            with self.subTest(module=name):
2706                sys.modules.pop(name, None)
2707                with self.assertWarns(DeprecationWarning) as w:
2708                    __import__(name)
2709                self.assertEqual(str(w.warning),
2710                                 f"module {name!r} is deprecated")
2711                self.assertEqual(w.filename, __file__)
2712                self.assertIn(name, sys.modules)
2713                mod = sys.modules[name]
2714                self.assertEqual(mod.__name__, name)
2715                self.assertEqual(mod.__package__, '')
2716                for attr in deprecated[name]:
2717                    self.assertTrue(hasattr(mod, attr))
2718                del sys.modules[name]
2719
# Data-driven tests: the patterns and inputs come from the tables defined in
# test.re_tests rather than from this file.
class ExternalTests(unittest.TestCase):

    def test_re_benchmarks(self):
        're_tests benchmarks'
        from test.re_tests import benchmarks
        # Each entry unpacks into a pattern and a string; the checks below
        # require that search(), match() and fullmatch() all succeed on it.
        for pattern, s in benchmarks:
            with self.subTest(pattern=pattern, string=s):
                p = re.compile(pattern)
                self.assertTrue(p.search(s))
                self.assertTrue(p.match(s))
                self.assertTrue(p.fullmatch(s))
                # Same checks with the string surrounded by 10000 spaces on
                # each side, using the pos/endpos arguments to select it.
                s2 = ' '*10000 + s + ' '*10000
                self.assertTrue(p.search(s2))
                self.assertTrue(p.match(s2, 10000))
                self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
                self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))

    def test_re_tests(self):
        're_tests test suite'
        from test.re_tests import tests, FAIL, SYNTAX_ERROR
        for t in tests:
            # A test tuple is either (pattern, s, outcome) or
            # (pattern, s, outcome, repl, expected); missing fields stay None.
            pattern = s = outcome = repl = expected = None
            if len(t) == 5:
                pattern, s, outcome, repl, expected = t
            elif len(t) == 3:
                pattern, s, outcome = t
            else:
                raise ValueError('Test tuples should have 3 or 5 fields', t)

            with self.subTest(pattern=pattern, string=s):
                if outcome == SYNTAX_ERROR:  # Expected a syntax error
                    with self.assertRaises(re.error):
                        re.compile(pattern)
                    continue

                obj = re.compile(pattern)
                result = obj.search(s)
                if outcome == FAIL:
                    self.assertIsNone(result, 'Succeeded incorrectly')
                    continue

                with self.subTest():
                    self.assertTrue(result, 'Failed incorrectly')
                    # Matched, as expected, so now we compute the
                    # result string and compare it to our expected result.
                    # NOTE: start and end are assigned only here, but are
                    # read again by the range-limited check further down.
                    start, end = result.span(0)
                    vardict = {'found': result.group(0),
                               'groups': result.group(),
                               'flags': result.re.flags}
                    # Expose groups 1..99 as g1..g99; numbers the pattern
                    # does not define are recorded as the string "Error".
                    for i in range(1, 100):
                        try:
                            gi = result.group(i)
                            # Special hack because else the string concat fails:
                            if gi is None:
                                gi = "None"
                        except IndexError:
                            gi = "Error"
                        vardict['g%d' % i] = gi
                    # Expose named groups under their own names as well.
                    for i in result.re.groupindex.keys():
                        try:
                            gi = result.group(i)
                            if gi is None:
                                gi = "None"
                        except IndexError:
                            gi = "Error"
                        vardict[i] = gi
                    # NOTE(review): repl is evaluated with eval(), using
                    # vardict as its globals.  It comes from the
                    # test.re_tests table, not from external input.
                    self.assertEqual(eval(repl, vardict), expected,
                                     'grouping error')

                # Try the match with both pattern and string converted to
                # bytes, and check that it still succeeds.
                try:
                    bpat = bytes(pattern, "ascii")
                    bs = bytes(s, "ascii")
                except UnicodeEncodeError:
                    # skip non-ascii tests
                    pass
                else:
                    with self.subTest('bytes pattern match'):
                        obj = re.compile(bpat)
                        self.assertTrue(obj.search(bs))

                    # Try the match with LOCALE enabled, and check that it
                    # still succeeds.
                    # NOTE: a failure here is only printed, not asserted, and
                    # `result` is rebound to the outcome of this bytes
                    # search; the range-limited check below reads it.
                    with self.subTest('locale-sensitive match'):
                        obj = re.compile(bpat, re.LOCALE)
                        result = obj.search(bs)
                        if result is None:
                            print('=== Fails on locale-sensitive match', t)

                # Try the match with the search area limited to the extent
                # of the match and see if it still succeeds.  \B will
                # break (because it won't match at the end or start of a
                # string), so we'll ignore patterns that feature it.
                if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
                            and result is not None):
                    with self.subTest('range-limited match'):
                        obj = re.compile(pattern)
                        self.assertTrue(obj.search(s, start, end + 1))

                # Try the match with IGNORECASE enabled, and check that it
                # still succeeds.
                with self.subTest('case-insensitive match'):
                    obj = re.compile(pattern, re.IGNORECASE)
                    self.assertTrue(obj.search(s))

                # Try the match with UNICODE locale enabled, and check
                # that it still succeeds.
                with self.subTest('unicode-sensitive match'):
                    obj = re.compile(pattern, re.UNICODE)
                    self.assertTrue(obj.search(s))
2831
2832
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
2835