1from test.support import (gc_collect, bigmemtest, _2G, 2 cpython_only, captured_stdout, 3 check_disallow_instantiation, is_emscripten, is_wasi, 4 SHORT_TIMEOUT) 5import locale 6import re 7import string 8import sys 9import time 10import unittest 11import warnings 12from re import Scanner 13from weakref import proxy 14 15# some platforms lack working multiprocessing 16try: 17 import _multiprocessing 18except ImportError: 19 multiprocessing = None 20else: 21 import multiprocessing 22 23# Misc tests from Tim Peters' re.doc 24 25# WARNING: Don't change details in these tests if you don't know 26# what you're doing. Some of these tests were carefully modeled to 27# cover most of the code. 28 29class S(str): 30 def __getitem__(self, index): 31 return S(super().__getitem__(index)) 32 33class B(bytes): 34 def __getitem__(self, index): 35 return B(super().__getitem__(index)) 36 37class ReTests(unittest.TestCase): 38 39 def assertTypedEqual(self, actual, expect, msg=None): 40 self.assertEqual(actual, expect, msg) 41 def recurse(actual, expect): 42 if isinstance(expect, (tuple, list)): 43 for x, y in zip(actual, expect): 44 recurse(x, y) 45 else: 46 self.assertIs(type(actual), type(expect), msg) 47 recurse(actual, expect) 48 49 def checkPatternError(self, pattern, errmsg, pos=None): 50 with self.assertRaises(re.error) as cm: 51 re.compile(pattern) 52 with self.subTest(pattern=pattern): 53 err = cm.exception 54 self.assertEqual(err.msg, errmsg) 55 if pos is not None: 56 self.assertEqual(err.pos, pos) 57 58 def checkTemplateError(self, pattern, repl, string, errmsg, pos=None): 59 with self.assertRaises(re.error) as cm: 60 re.sub(pattern, repl, string) 61 with self.subTest(pattern=pattern, repl=repl): 62 err = cm.exception 63 self.assertEqual(err.msg, errmsg) 64 if pos is not None: 65 self.assertEqual(err.pos, pos) 66 67 def test_keep_buffer(self): 68 # See bug 14212 69 b = bytearray(b'x') 70 it = re.finditer(b'a', b) 71 with self.assertRaises(BufferError): 72 b.extend(b'x'*400) 
73 list(it) 74 del it 75 gc_collect() 76 b.extend(b'x'*400) 77 78 def test_weakref(self): 79 s = 'QabbbcR' 80 x = re.compile('ab+c') 81 y = proxy(x) 82 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR')) 83 84 def test_search_star_plus(self): 85 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0)) 86 self.assertEqual(re.search('x*', 'axx').span(), (0, 0)) 87 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3)) 88 self.assertEqual(re.search('x+', 'axx').span(), (1, 3)) 89 self.assertIsNone(re.search('x', 'aaa')) 90 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0)) 91 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0)) 92 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3)) 93 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) 94 self.assertIsNone(re.match('a+', 'xxx')) 95 96 def test_branching(self): 97 """Test Branching 98 Test expressions using the OR ('|') operator.""" 99 self.assertEqual(re.match('(ab|ba)', 'ab').span(), (0, 2)) 100 self.assertEqual(re.match('(ab|ba)', 'ba').span(), (0, 2)) 101 self.assertEqual(re.match('(abc|bac|ca|cb)', 'abc').span(), 102 (0, 3)) 103 self.assertEqual(re.match('(abc|bac|ca|cb)', 'bac').span(), 104 (0, 3)) 105 self.assertEqual(re.match('(abc|bac|ca|cb)', 'ca').span(), 106 (0, 2)) 107 self.assertEqual(re.match('(abc|bac|ca|cb)', 'cb').span(), 108 (0, 2)) 109 self.assertEqual(re.match('((a)|(b)|(c))', 'a').span(), (0, 1)) 110 self.assertEqual(re.match('((a)|(b)|(c))', 'b').span(), (0, 1)) 111 self.assertEqual(re.match('((a)|(b)|(c))', 'c').span(), (0, 1)) 112 113 def bump_num(self, matchobj): 114 int_value = int(matchobj.group(0)) 115 return str(int_value + 1) 116 117 def test_basic_re_sub(self): 118 self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz') 119 self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz') 120 self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz') 121 self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz') 122 self.assertTypedEqual(re.sub(b'y', 
bytearray(b'a'), bytearray(b'xyz')), b'xaz') 123 self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz') 124 for y in ("\xe0", "\u0430", "\U0001d49c"): 125 self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz') 126 127 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') 128 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), 129 '9.3 -3 24x100y') 130 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), 131 '9.3 -3 23x99y') 132 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3), 133 '9.3 -3 23x99y') 134 135 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') 136 self.assertEqual(re.sub('.', r"\n", 'x'), '\n') 137 138 s = r"\1\1" 139 self.assertEqual(re.sub('(.)', s, 'x'), 'xx') 140 self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s) 141 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) 142 143 self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx') 144 self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx') 145 self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx') 146 self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx') 147 self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx') 148 149 self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') 150 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') 151 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), 152 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))) 153 for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': 154 with self.subTest(c): 155 with self.assertRaises(re.error): 156 self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c) 157 158 self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest') 159 160 def test_bug_449964(self): 161 # fails for group followed by other escape 162 self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'), 163 'xx\bxx\b') 164 165 def test_bug_449000(self): 166 # Test for sub() 
on escaped characters 167 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), 168 'abc\ndef\n') 169 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), 170 'abc\ndef\n') 171 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), 172 'abc\ndef\n') 173 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 174 'abc\ndef\n') 175 176 def test_bug_1661(self): 177 # Verify that flags do not get silently ignored with compiled patterns 178 pattern = re.compile('.') 179 self.assertRaises(ValueError, re.match, pattern, 'A', re.I) 180 self.assertRaises(ValueError, re.search, pattern, 'A', re.I) 181 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I) 182 self.assertRaises(ValueError, re.compile, pattern, re.I) 183 184 def test_bug_3629(self): 185 # A regex that triggered a bug in the sre-code validator 186 re.compile("(?P<quote>)(?(quote))") 187 188 def test_sub_template_numeric_escape(self): 189 # bug 776311 and friends 190 self.assertEqual(re.sub('x', r'\0', 'x'), '\0') 191 self.assertEqual(re.sub('x', r'\000', 'x'), '\000') 192 self.assertEqual(re.sub('x', r'\001', 'x'), '\001') 193 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8') 194 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') 195 self.assertEqual(re.sub('x', r'\111', 'x'), '\111') 196 self.assertEqual(re.sub('x', r'\117', 'x'), '\117') 197 self.assertEqual(re.sub('x', r'\377', 'x'), '\377') 198 199 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') 200 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') 201 202 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00') 203 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07') 204 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8') 205 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') 206 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') 207 208 self.checkTemplateError('x', r'\400', 'x', 209 r'octal escape value \400 outside of ' 210 r'range 0-0o377', 0) 211 self.checkTemplateError('x', r'\777', 'x', 212 r'octal 
escape value \777 outside of ' 213 r'range 0-0o377', 0) 214 215 self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1) 216 self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1) 217 self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1) 218 self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1) 219 self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1) 220 self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1) 221 self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1) 222 self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1) 223 self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1) 224 self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1) 225 self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1) 226 self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1) 227 self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1) 228 229 # in python2.3 (etc), these loop endlessly in sre_parser.py 230 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') 231 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), 232 'xz8') 233 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), 234 'xza') 235 236 def test_qualified_re_sub(self): 237 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') 238 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') 239 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa') 240 241 def test_bug_114660(self): 242 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 243 'hello there') 244 245 def test_symbolic_groups(self): 246 re.compile(r'(?P<a>x)(?P=a)(?(a)y)') 247 re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)') 248 re.compile(r'(?P<a1>x)\1(?(1)y)') 249 re.compile(b'(?P<a1>x)(?P=a1)(?(a1)y)') 250 # New valid identifiers in Python 3 251 
re.compile('(?P<µ>x)(?P=µ)(?(µ)y)') 252 re.compile('(?P<>x)(?P=)(?()y)') 253 # Support > 100 groups. 254 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 255 pat = '(?:%s)(?(200)z|t)' % pat 256 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) 257 258 def test_symbolic_groups_errors(self): 259 self.checkPatternError(r'(?P<a>)(?P<a>)', 260 "redefinition of group name 'a' as group 2; " 261 "was group 1") 262 self.checkPatternError(r'(?P<a>(?P=a))', 263 "cannot refer to an open group", 10) 264 self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px') 265 self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11) 266 self.checkPatternError(r'(?P=', 'missing group name', 4) 267 self.checkPatternError(r'(?P=)', 'missing group name', 4) 268 self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4) 269 self.checkPatternError(r'(?P=a)', "unknown group name 'a'") 270 self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'") 271 self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4) 272 self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4) 273 self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4) 274 self.checkPatternError(r'(?P<', 'missing group name', 4) 275 self.checkPatternError(r'(?P<>)', 'missing group name', 4) 276 self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4) 277 self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4) 278 self.checkPatternError(r'(?(', 'missing group name', 3) 279 self.checkPatternError(r'(?())', 'missing group name', 3) 280 self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3) 281 self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3) 282 self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3) 283 self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3) 284 self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4) 285 
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4) 286 self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3) 287 with self.assertWarnsRegex(DeprecationWarning, 288 r"bad character in group name '\\xc2\\xb5' " 289 r"at position 4") as w: 290 re.compile(b'(?P<\xc2\xb5>x)') 291 self.assertEqual(w.filename, __file__) 292 with self.assertWarnsRegex(DeprecationWarning, 293 r"bad character in group name '\\xc2\\xb5' " 294 r"at position 4"): 295 self.checkPatternError(b'(?P=\xc2\xb5)', 296 r"unknown group name '\xc2\xb5'", 4) 297 with self.assertWarnsRegex(DeprecationWarning, 298 r"bad character in group name '\\xc2\\xb5' " 299 r"at position 3"): 300 self.checkPatternError(b'(?(\xc2\xb5)y)', 301 r"unknown group name '\xc2\xb5'", 3) 302 303 def test_symbolic_refs(self): 304 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '') 305 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '') 306 self.assertEqual(re.sub(b'(?P<a1>x)', br'\g<a1>', b'xx'), b'xx') 307 # New valid identifiers in Python 3 308 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx') 309 self.assertEqual(re.sub('(?P<>x)', r'\g<>', 'xx'), 'xx') 310 # Support > 100 groups. 
311 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 312 self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8') 313 314 def test_symbolic_refs_errors(self): 315 self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx', 316 'missing >, unterminated name', 3) 317 self.checkTemplateError('(?P<a>x)', r'\g<', 'xx', 318 'missing group name', 3) 319 self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2) 320 self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx', 321 "bad character in group name 'a a'", 3) 322 self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx', 323 'missing group name', 3) 324 self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx', 325 "bad character in group name '1a1'", 3) 326 self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx', 327 'invalid group reference 2', 3) 328 self.checkTemplateError('(?P<a>x)', r'\2', 'xx', 329 'invalid group reference 2', 1) 330 with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"): 331 re.sub('(?P<a>x)', r'\g<ab>', 'xx') 332 self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx', 333 "bad character in group name '-1'", 3) 334 with self.assertWarnsRegex(DeprecationWarning, 335 r"bad character in group name '\+1' " 336 r"at position 3") as w: 337 re.sub('(?P<a>x)', r'\g<+1>', 'xx') 338 self.assertEqual(w.filename, __file__) 339 with self.assertWarnsRegex(DeprecationWarning, 340 r"bad character in group name '1_0' " 341 r"at position 3"): 342 re.sub('()'*10, r'\g<1_0>', 'xx') 343 with self.assertWarnsRegex(DeprecationWarning, 344 r"bad character in group name ' 1 ' " 345 r"at position 3"): 346 re.sub('(?P<a>x)', r'\g< 1 >', 'xx') 347 self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx', 348 "bad character in group name '©'", 3) 349 with self.assertWarnsRegex(DeprecationWarning, 350 r"bad character in group name '\\xc2\\xb5' " 351 r"at position 3") as w: 352 with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"): 353 re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx') 354 
self.assertEqual(w.filename, __file__) 355 self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx', 356 "bad character in group name '㊀'", 3) 357 self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx', 358 "bad character in group name '¹'", 3) 359 with self.assertWarnsRegex(DeprecationWarning, 360 r"bad character in group name '१' " 361 r"at position 3"): 362 re.sub('(?P<a>x)', r'\g<१>', 'xx') 363 364 def test_re_subn(self): 365 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) 366 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) 367 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) 368 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) 369 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) 370 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2)) 371 372 def test_re_split(self): 373 for string in ":a:b::c", S(":a:b::c"): 374 self.assertTypedEqual(re.split(":", string), 375 ['', 'a', 'b', '', 'c']) 376 self.assertTypedEqual(re.split(":+", string), 377 ['', 'a', 'b', 'c']) 378 self.assertTypedEqual(re.split("(:+)", string), 379 ['', ':', 'a', ':', 'b', '::', 'c']) 380 for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"), 381 memoryview(b":a:b::c")): 382 self.assertTypedEqual(re.split(b":", string), 383 [b'', b'a', b'b', b'', b'c']) 384 self.assertTypedEqual(re.split(b":+", string), 385 [b'', b'a', b'b', b'c']) 386 self.assertTypedEqual(re.split(b"(:+)", string), 387 [b'', b':', b'a', b':', b'b', b'::', b'c']) 388 for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", 389 "\U0001d49c\U0001d49e\U0001d4b5"): 390 string = ":%s:%s::%s" % (a, b, c) 391 self.assertEqual(re.split(":", string), ['', a, b, '', c]) 392 self.assertEqual(re.split(":+", string), ['', a, b, c]) 393 self.assertEqual(re.split("(:+)", string), 394 ['', ':', a, ':', b, '::', c]) 395 396 self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c']) 397 self.assertEqual(re.split("(:)+", ":a:b::c"), 398 ['', ':', 'a', ':', 'b', 
':', 'c']) 399 self.assertEqual(re.split("([b:]+)", ":a:b::c"), 400 ['', ':', 'a', ':b::', 'c']) 401 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), 402 ['', None, ':', 'a', None, ':', '', 'b', None, '', 403 None, '::', 'c']) 404 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), 405 ['', 'a', '', '', 'c']) 406 407 for sep, expected in [ 408 (':*', ['', '', 'a', '', 'b', '', 'c', '']), 409 ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']), 410 ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']), 411 ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']), 412 ]: 413 with self.subTest(sep=sep): 414 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) 415 416 for sep, expected in [ 417 ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']), 418 (r'\b', [':', 'a', ':', 'b', '::', 'c', '']), 419 (r'(?=:)', ['', ':a', ':b', ':', ':c']), 420 (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']), 421 ]: 422 with self.subTest(sep=sep): 423 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) 424 425 def test_qualified_re_split(self): 426 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) 427 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c']) 428 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d']) 429 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2), 430 ['', ':', 'a', ':', 'b::c']) 431 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2), 432 ['', ':', 'a', ':', 'b::c']) 433 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), 434 ['', ':', '', '', 'a:b::c']) 435 436 def test_re_findall(self): 437 self.assertEqual(re.findall(":+", "abc"), []) 438 for string in "a:b::c:::d", S("a:b::c:::d"): 439 self.assertTypedEqual(re.findall(":+", string), 440 [":", "::", ":::"]) 441 self.assertTypedEqual(re.findall("(:+)", string), 442 [":", "::", ":::"]) 443 self.assertTypedEqual(re.findall("(:)(:*)", string), 444 [(":", ""), (":", ":"), (":", "::")]) 445 for 
string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"), 446 memoryview(b"a:b::c:::d")): 447 self.assertTypedEqual(re.findall(b":+", string), 448 [b":", b"::", b":::"]) 449 self.assertTypedEqual(re.findall(b"(:+)", string), 450 [b":", b"::", b":::"]) 451 self.assertTypedEqual(re.findall(b"(:)(:*)", string), 452 [(b":", b""), (b":", b":"), (b":", b"::")]) 453 for x in ("\xe0", "\u0430", "\U0001d49c"): 454 xx = x * 2 455 xxx = x * 3 456 string = "a%sb%sc%sd" % (x, xx, xxx) 457 self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx]) 458 self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx]) 459 self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string), 460 [(x, ""), (x, x), (x, xx)]) 461 462 def test_bug_117612(self): 463 self.assertEqual(re.findall(r"(a|(b))", "aba"), 464 [("a", ""),("b", "b"),("a", "")]) 465 466 def test_re_match(self): 467 for string in 'a', S('a'): 468 self.assertEqual(re.match('a', string).groups(), ()) 469 self.assertEqual(re.match('(a)', string).groups(), ('a',)) 470 self.assertEqual(re.match('(a)', string).group(0), 'a') 471 self.assertEqual(re.match('(a)', string).group(1), 'a') 472 self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a')) 473 for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'): 474 self.assertEqual(re.match(b'a', string).groups(), ()) 475 self.assertEqual(re.match(b'(a)', string).groups(), (b'a',)) 476 self.assertEqual(re.match(b'(a)', string).group(0), b'a') 477 self.assertEqual(re.match(b'(a)', string).group(1), b'a') 478 self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a')) 479 for a in ("\xe0", "\u0430", "\U0001d49c"): 480 self.assertEqual(re.match(a, a).groups(), ()) 481 self.assertEqual(re.match('(%s)' % a, a).groups(), (a,)) 482 self.assertEqual(re.match('(%s)' % a, a).group(0), a) 483 self.assertEqual(re.match('(%s)' % a, a).group(1), a) 484 self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a)) 485 486 pat = re.compile('((a)|(b))(c)?') 487 
self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None)) 488 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None)) 489 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c')) 490 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c')) 491 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c')) 492 493 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 494 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) 495 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), 496 (None, 'b', None)) 497 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) 498 499 def test_group(self): 500 class Index: 501 def __init__(self, value): 502 self.value = value 503 def __index__(self): 504 return self.value 505 # A single group 506 m = re.match('(a)(b)', 'ab') 507 self.assertEqual(m.group(), 'ab') 508 self.assertEqual(m.group(0), 'ab') 509 self.assertEqual(m.group(1), 'a') 510 self.assertEqual(m.group(Index(1)), 'a') 511 self.assertRaises(IndexError, m.group, -1) 512 self.assertRaises(IndexError, m.group, 3) 513 self.assertRaises(IndexError, m.group, 1<<1000) 514 self.assertRaises(IndexError, m.group, Index(1<<1000)) 515 self.assertRaises(IndexError, m.group, 'x') 516 # Multiple groups 517 self.assertEqual(m.group(2, 1), ('b', 'a')) 518 self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a')) 519 520 def test_match_getitem(self): 521 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 522 523 m = pat.match('a') 524 self.assertEqual(m['a1'], 'a') 525 self.assertEqual(m['b2'], None) 526 self.assertEqual(m['c3'], None) 527 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None') 528 self.assertEqual(m[0], 'a') 529 self.assertEqual(m[1], 'a') 530 self.assertEqual(m[2], None) 531 self.assertEqual(m[3], None) 532 with self.assertRaisesRegex(IndexError, 'no such group'): 533 m['X'] 534 with self.assertRaisesRegex(IndexError, 'no such group'): 535 m[-1] 536 with 
self.assertRaisesRegex(IndexError, 'no such group'): 537 m[4] 538 with self.assertRaisesRegex(IndexError, 'no such group'): 539 m[0, 1] 540 with self.assertRaisesRegex(IndexError, 'no such group'): 541 m[(0,)] 542 with self.assertRaisesRegex(IndexError, 'no such group'): 543 m[(0, 1)] 544 with self.assertRaisesRegex(IndexError, 'no such group'): 545 'a1={a2}'.format_map(m) 546 547 m = pat.match('ac') 548 self.assertEqual(m['a1'], 'a') 549 self.assertEqual(m['b2'], None) 550 self.assertEqual(m['c3'], 'c') 551 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c') 552 self.assertEqual(m[0], 'ac') 553 self.assertEqual(m[1], 'a') 554 self.assertEqual(m[2], None) 555 self.assertEqual(m[3], 'c') 556 557 # Cannot assign. 558 with self.assertRaises(TypeError): 559 m[0] = 1 560 561 # No len(). 562 self.assertRaises(TypeError, len, m) 563 564 def test_re_fullmatch(self): 565 # Issue 16203: Proposal: add re.fullmatch() method. 566 self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1)) 567 for string in "ab", S("ab"): 568 self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2)) 569 for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"): 570 self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2)) 571 for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e": 572 r = r"%s|%s" % (a, a + b) 573 self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2)) 574 self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3)) 575 self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3)) 576 self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2)) 577 self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3)) 578 self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4)) 579 self.assertIsNone(re.fullmatch(r"a+", "ab")) 580 self.assertIsNone(re.fullmatch(r"abc$", "abc\n")) 581 self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n")) 582 self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n")) 583 
self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4)) 584 self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4)) 585 self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2)) 586 587 self.assertEqual( 588 re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 589 self.assertEqual( 590 re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 591 self.assertEqual( 592 re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 593 594 def test_re_groupref_exists(self): 595 self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(), 596 ('(', 'a')) 597 self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(), 598 (None, 'a')) 599 self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)')) 600 self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a')) 601 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(), 602 ('a', 'b')) 603 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(), 604 (None, 'd')) 605 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(), 606 (None, 'd')) 607 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(), 608 ('a', '')) 609 610 # Tests for bug #1177831: exercise groups other than the first group 611 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))') 612 self.assertEqual(p.match('abc').groups(), 613 ('a', 'b', 'c')) 614 self.assertEqual(p.match('ad').groups(), 615 ('a', None, 'd')) 616 self.assertIsNone(p.match('abd')) 617 self.assertIsNone(p.match('ac')) 618 619 # Support > 100 groups. 
620 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 621 pat = '(?:%s)(?(200)z)' % pat 622 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) 623 624 def test_re_groupref_exists_errors(self): 625 self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10) 626 self.checkPatternError(r'()(?(-1)a|b)', 627 "bad character in group name '-1'", 5) 628 with self.assertWarnsRegex(DeprecationWarning, 629 r"bad character in group name '\+1' " 630 r"at position 5") as w: 631 re.compile(r'()(?(+1)a|b)') 632 self.assertEqual(w.filename, __file__) 633 with self.assertWarnsRegex(DeprecationWarning, 634 r"bad character in group name '1_0' " 635 r"at position 23"): 636 re.compile(r'()'*10 + r'(?(1_0)a|b)') 637 with self.assertWarnsRegex(DeprecationWarning, 638 r"bad character in group name ' 1 ' " 639 r"at position 5"): 640 re.compile(r'()(?( 1 )a|b)') 641 self.checkPatternError(r'()(?(㊀)a|b)', 642 "bad character in group name '㊀'", 5) 643 self.checkPatternError(r'()(?(¹)a|b)', 644 "bad character in group name '¹'", 5) 645 with self.assertWarnsRegex(DeprecationWarning, 646 r"bad character in group name '१' " 647 r"at position 5"): 648 re.compile(r'()(?(१)a|b)') 649 self.checkPatternError(r'()(?(1', 650 "missing ), unterminated name", 5) 651 self.checkPatternError(r'()(?(1)a', 652 "missing ), unterminated subpattern", 2) 653 self.checkPatternError(r'()(?(1)a|b', 654 'missing ), unterminated subpattern', 2) 655 self.checkPatternError(r'()(?(1)a|b|c', 656 'conditional backref with more than ' 657 'two branches', 10) 658 self.checkPatternError(r'()(?(1)a|b|c)', 659 'conditional backref with more than ' 660 'two branches', 10) 661 self.checkPatternError(r'()(?(2)a)', 662 "invalid group reference 2", 5) 663 664 def test_re_groupref_exists_validation_bug(self): 665 for i in range(256): 666 with self.subTest(code=i): 667 re.compile(r'()(?(1)\x%02x?)' % i) 668 669 def test_re_groupref_overflow(self): 670 from re._constants import MAXGROUPS 671 
self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx', 672 'invalid group reference %d' % MAXGROUPS, 3) 673 self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS, 674 'invalid group reference %d' % MAXGROUPS, 10) 675 676 def test_re_groupref(self): 677 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), 678 ('|', 'a')) 679 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), 680 (None, 'a')) 681 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|')) 682 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a')) 683 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), 684 ('a', 'a')) 685 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), 686 (None, None)) 687 688 self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4) 689 690 def test_groupdict(self): 691 self.assertEqual(re.match('(?P<first>first) (?P<second>second)', 692 'first second').groupdict(), 693 {'first':'first', 'second':'second'}) 694 695 def test_expand(self): 696 self.assertEqual(re.match("(?P<first>first) (?P<second>second)", 697 "first second") 698 .expand(r"\2 \1 \g<second> \g<first>"), 699 "second first second first") 700 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)", 701 "first") 702 .expand(r"\2 \g<second>"), 703 " ") 704 705 def test_repeat_minmax(self): 706 self.assertIsNone(re.match(r"^(\w){1}$", "abc")) 707 self.assertIsNone(re.match(r"^(\w){1}?$", "abc")) 708 self.assertIsNone(re.match(r"^(\w){1,2}$", "abc")) 709 self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc")) 710 711 self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c") 712 self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c") 713 self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c") 714 self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c") 715 self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c") 716 self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c") 717 self.assertEqual(re.match(r"^(\w){1,4}?$", 
"abc").group(1), "c") 718 self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c") 719 720 self.assertIsNone(re.match(r"^x{1}$", "xxx")) 721 self.assertIsNone(re.match(r"^x{1}?$", "xxx")) 722 self.assertIsNone(re.match(r"^x{1,2}$", "xxx")) 723 self.assertIsNone(re.match(r"^x{1,2}?$", "xxx")) 724 725 self.assertTrue(re.match(r"^x{3}$", "xxx")) 726 self.assertTrue(re.match(r"^x{1,3}$", "xxx")) 727 self.assertTrue(re.match(r"^x{3,3}$", "xxx")) 728 self.assertTrue(re.match(r"^x{1,4}$", "xxx")) 729 self.assertTrue(re.match(r"^x{3,4}?$", "xxx")) 730 self.assertTrue(re.match(r"^x{3}?$", "xxx")) 731 self.assertTrue(re.match(r"^x{1,3}?$", "xxx")) 732 self.assertTrue(re.match(r"^x{1,4}?$", "xxx")) 733 self.assertTrue(re.match(r"^x{3,4}?$", "xxx")) 734 735 self.assertIsNone(re.match(r"^x{}$", "xxx")) 736 self.assertTrue(re.match(r"^x{}$", "x{}")) 737 738 self.checkPatternError(r'x{2,1}', 739 'min repeat greater than max repeat', 2) 740 741 def test_getattr(self): 742 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") 743 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) 744 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2) 745 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {}) 746 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex, 747 {'first': 1, 'other': 2}) 748 749 self.assertEqual(re.match("(a)", "a").pos, 0) 750 self.assertEqual(re.match("(a)", "a").endpos, 1) 751 self.assertEqual(re.match("(a)", "a").string, "a") 752 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) 753 self.assertTrue(re.match("(a)", "a").re) 754 755 # Issue 14260. groupindex should be non-modifiable mapping. 
756 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)') 757 self.assertEqual(sorted(p.groupindex), ['first', 'other']) 758 self.assertEqual(p.groupindex['other'], 2) 759 with self.assertRaises(TypeError): 760 p.groupindex['other'] = 0 761 self.assertEqual(p.groupindex['other'], 2) 762 763 def test_special_escapes(self): 764 self.assertEqual(re.search(r"\b(b.)\b", 765 "abcd abc bcd bx").group(1), "bx") 766 self.assertEqual(re.search(r"\B(b.)\B", 767 "abc bcd bc abxd").group(1), "bx") 768 self.assertEqual(re.search(r"\b(b.)\b", 769 "abcd abc bcd bx", re.ASCII).group(1), "bx") 770 self.assertEqual(re.search(r"\B(b.)\B", 771 "abc bcd bc abxd", re.ASCII).group(1), "bx") 772 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") 773 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") 774 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M)) 775 self.assertEqual(re.search(br"\b(b.)\b", 776 b"abcd abc bcd bx").group(1), b"bx") 777 self.assertEqual(re.search(br"\B(b.)\B", 778 b"abc bcd bc abxd").group(1), b"bx") 779 self.assertEqual(re.search(br"\b(b.)\b", 780 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx") 781 self.assertEqual(re.search(br"\B(b.)\B", 782 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx") 783 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc") 784 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc") 785 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M)) 786 self.assertEqual(re.search(r"\d\D\w\W\s\S", 787 "1aa! a").group(0), "1aa! a") 788 self.assertEqual(re.search(br"\d\D\w\W\s\S", 789 b"1aa! a").group(0), b"1aa! a") 790 self.assertEqual(re.search(r"\d\D\w\W\s\S", 791 "1aa! a", re.ASCII).group(0), "1aa! a") 792 self.assertEqual(re.search(br"\d\D\w\W\s\S", 793 b"1aa! a", re.LOCALE).group(0), b"1aa! 
a") 794 795 def test_other_escapes(self): 796 self.checkPatternError("\\", 'bad escape (end of pattern)', 0) 797 self.assertEqual(re.match(r"\(", '(').group(), '(') 798 self.assertIsNone(re.match(r"\(", ')')) 799 self.assertEqual(re.match(r"\\", '\\').group(), '\\') 800 self.assertEqual(re.match(r"[\]]", ']').group(), ']') 801 self.assertIsNone(re.match(r"[\]]", '[')) 802 self.assertEqual(re.match(r"[a\-c]", '-').group(), '-') 803 self.assertIsNone(re.match(r"[a\-c]", 'b')) 804 self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') 805 self.assertIsNone(re.match(r"[\^a]+", 'b')) 806 re.purge() # for warnings 807 for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY': 808 with self.subTest(c): 809 self.assertRaises(re.error, re.compile, '\\%c' % c) 810 for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ': 811 with self.subTest(c): 812 self.assertRaises(re.error, re.compile, '[\\%c]' % c) 813 814 def test_named_unicode_escapes(self): 815 # test individual Unicode named escapes 816 self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<')) 817 self.assertTrue(re.match(r'\N{less-than sign}', '<')) 818 self.assertIsNone(re.match(r'\N{LESS-THAN SIGN}', '>')) 819 self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d')) 820 self.assertTrue(re.match(r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH ' 821 r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}', 822 '\ufbf9')) 823 self.assertTrue(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]', 824 '=')) 825 self.assertIsNone(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]', 826 ';')) 827 828 # test errors in \N{name} handling - only valid names should pass 829 self.checkPatternError(r'\N', 'missing {', 2) 830 self.checkPatternError(r'[\N]', 'missing {', 3) 831 self.checkPatternError(r'\N{', 'missing character name', 3) 832 self.checkPatternError(r'[\N{', 'missing character name', 4) 833 self.checkPatternError(r'\N{}', 'missing character name', 3) 834 self.checkPatternError(r'[\N{}]', 'missing character name', 4) 835 
self.checkPatternError(r'\NSNAKE}', 'missing {', 2) 836 self.checkPatternError(r'[\NSNAKE}]', 'missing {', 3) 837 self.checkPatternError(r'\N{SNAKE', 838 'missing }, unterminated name', 3) 839 self.checkPatternError(r'[\N{SNAKE]', 840 'missing }, unterminated name', 4) 841 self.checkPatternError(r'[\N{SNAKE]}', 842 "undefined character name 'SNAKE]'", 1) 843 self.checkPatternError(r'\N{SPAM}', 844 "undefined character name 'SPAM'", 0) 845 self.checkPatternError(r'[\N{SPAM}]', 846 "undefined character name 'SPAM'", 1) 847 self.checkPatternError(r'\N{KEYCAP NUMBER SIGN}', 848 "undefined character name 'KEYCAP NUMBER SIGN'", 0) 849 self.checkPatternError(r'[\N{KEYCAP NUMBER SIGN}]', 850 "undefined character name 'KEYCAP NUMBER SIGN'", 1) 851 self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0) 852 self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1) 853 854 def test_string_boundaries(self): 855 # See http://bugs.python.org/issue10713 856 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), 857 "abc") 858 # There's a word boundary at the start of a string. 859 self.assertTrue(re.match(r"\b", "abc")) 860 # A non-empty string includes a non-boundary zero-length match. 861 self.assertTrue(re.search(r"\B", "abc")) 862 # There is no non-boundary match at the start of a string. 863 self.assertFalse(re.match(r"\B", "abc")) 864 # However, an empty string contains no word boundaries, and also no 865 # non-boundaries. 866 self.assertIsNone(re.search(r"\B", "")) 867 # This one is questionable and different from the perlre behaviour, 868 # but describes current behavior. 869 self.assertIsNone(re.search(r"\b", "")) 870 # A single word-character string has two boundaries, but no 871 # non-boundary gaps. 
872 self.assertEqual(len(re.findall(r"\b", "a")), 2) 873 self.assertEqual(len(re.findall(r"\B", "a")), 0) 874 # If there are no words, there are no boundaries 875 self.assertEqual(len(re.findall(r"\b", " ")), 0) 876 self.assertEqual(len(re.findall(r"\b", " ")), 0) 877 # Can match around the whitespace. 878 self.assertEqual(len(re.findall(r"\B", " ")), 2) 879 880 def test_bigcharset(self): 881 self.assertEqual(re.match("([\u2222\u2223])", 882 "\u2222").group(1), "\u2222") 883 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255))) 884 self.assertEqual(re.match(r, "\uff01").group(), "\uff01") 885 886 def test_big_codesize(self): 887 # Issue #1160 888 r = re.compile('|'.join(('%d'%x for x in range(10000)))) 889 self.assertTrue(r.match('1000')) 890 self.assertTrue(r.match('9999')) 891 892 def test_anyall(self): 893 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0), 894 "a\nb") 895 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), 896 "a\n\nb") 897 898 def test_lookahead(self): 899 self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a") 900 self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a") 901 self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a") 902 self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a") 903 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a") 904 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a") 905 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a") 906 907 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a") 908 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a") 909 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") 910 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") 911 912 # Group reference. 913 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba')) 914 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac')) 915 # Conditional group reference. 
916 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) 917 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc')) 918 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) 919 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc')) 920 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc')) 921 # Group used before defined. 922 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc')) 923 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc')) 924 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc')) 925 926 def test_lookbehind(self): 927 self.assertTrue(re.match(r'ab(?<=b)c', 'abc')) 928 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc')) 929 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc')) 930 self.assertTrue(re.match(r'ab(?<!c)c', 'abc')) 931 # Group reference. 932 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac')) 933 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa')) 934 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac')) 935 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa')) 936 # Conditional group reference. 937 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc')) 938 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc')) 939 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc')) 940 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc')) 941 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc')) 942 # Group used before defined. 
943 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)') 944 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc')) 945 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc')) 946 # Group defined in the same lookbehind pattern 947 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)') 948 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)') 949 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)') 950 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)') 951 952 def test_ignore_case(self): 953 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 954 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") 955 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") 956 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") 957 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") 958 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") 959 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") 960 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") 961 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") 962 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") 963 964 # Two different characters have the same lowercase. 965 assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K' 966 self.assertTrue(re.match(r'K', '\u212a', re.I)) 967 self.assertTrue(re.match(r'k', '\u212a', re.I)) 968 self.assertTrue(re.match(r'\u212a', 'K', re.I)) 969 self.assertTrue(re.match(r'\u212a', 'k', re.I)) 970 971 # Two different characters have the same uppercase. 
972 assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ' 973 self.assertTrue(re.match(r'S', '\u017f', re.I)) 974 self.assertTrue(re.match(r's', '\u017f', re.I)) 975 self.assertTrue(re.match(r'\u017f', 'S', re.I)) 976 self.assertTrue(re.match(r'\u017f', 's', re.I)) 977 978 # Two different characters have the same uppercase. Unicode 9.0+. 979 assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В' 980 self.assertTrue(re.match(r'\u0412', '\u0432', re.I)) 981 self.assertTrue(re.match(r'\u0412', '\u1c80', re.I)) 982 self.assertTrue(re.match(r'\u0432', '\u0412', re.I)) 983 self.assertTrue(re.match(r'\u0432', '\u1c80', re.I)) 984 self.assertTrue(re.match(r'\u1c80', '\u0412', re.I)) 985 self.assertTrue(re.match(r'\u1c80', '\u0432', re.I)) 986 987 # Two different characters have the same multicharacter uppercase. 988 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 989 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I)) 990 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I)) 991 992 def test_ignore_case_set(self): 993 self.assertTrue(re.match(r'[19A]', 'A', re.I)) 994 self.assertTrue(re.match(r'[19a]', 'a', re.I)) 995 self.assertTrue(re.match(r'[19a]', 'A', re.I)) 996 self.assertTrue(re.match(r'[19A]', 'a', re.I)) 997 self.assertTrue(re.match(br'[19A]', b'A', re.I)) 998 self.assertTrue(re.match(br'[19a]', b'a', re.I)) 999 self.assertTrue(re.match(br'[19a]', b'A', re.I)) 1000 self.assertTrue(re.match(br'[19A]', b'a', re.I)) 1001 1002 # Two different characters have the same lowercase. 1003 assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K' 1004 self.assertTrue(re.match(r'[19K]', '\u212a', re.I)) 1005 self.assertTrue(re.match(r'[19k]', '\u212a', re.I)) 1006 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I)) 1007 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I)) 1008 1009 # Two different characters have the same uppercase. 
1010 assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ' 1011 self.assertTrue(re.match(r'[19S]', '\u017f', re.I)) 1012 self.assertTrue(re.match(r'[19s]', '\u017f', re.I)) 1013 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I)) 1014 self.assertTrue(re.match(r'[19\u017f]', 's', re.I)) 1015 1016 # Two different characters have the same uppercase. Unicode 9.0+. 1017 assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В' 1018 self.assertTrue(re.match(r'[19\u0412]', '\u0432', re.I)) 1019 self.assertTrue(re.match(r'[19\u0412]', '\u1c80', re.I)) 1020 self.assertTrue(re.match(r'[19\u0432]', '\u0412', re.I)) 1021 self.assertTrue(re.match(r'[19\u0432]', '\u1c80', re.I)) 1022 self.assertTrue(re.match(r'[19\u1c80]', '\u0412', re.I)) 1023 self.assertTrue(re.match(r'[19\u1c80]', '\u0432', re.I)) 1024 1025 # Two different characters have the same multicharacter uppercase. 1026 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 1027 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I)) 1028 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I)) 1029 1030 def test_ignore_case_range(self): 1031 # Issues #3511, #17381. 
1032 self.assertTrue(re.match(r'[9-a]', '_', re.I)) 1033 self.assertIsNone(re.match(r'[9-A]', '_', re.I)) 1034 self.assertTrue(re.match(br'[9-a]', b'_', re.I)) 1035 self.assertIsNone(re.match(br'[9-A]', b'_', re.I)) 1036 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I)) 1037 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I)) 1038 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I)) 1039 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I)) 1040 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I)) 1041 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I)) 1042 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I)) 1043 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I)) 1044 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I)) 1045 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I)) 1046 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I)) 1047 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I)) 1048 1049 # Two different characters have the same lowercase. 1050 assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K' 1051 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I)) 1052 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I)) 1053 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I)) 1054 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I)) 1055 1056 # Two different characters have the same uppercase. 1057 assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ' 1058 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I)) 1059 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I)) 1060 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I)) 1061 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I)) 1062 1063 # Two different characters have the same uppercase. Unicode 9.0+. 
1064 assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В' 1065 self.assertTrue(re.match(r'[\u0411-\u0413]', '\u0432', re.I)) 1066 self.assertTrue(re.match(r'[\u0411-\u0413]', '\u1c80', re.I)) 1067 self.assertTrue(re.match(r'[\u0431-\u0433]', '\u0412', re.I)) 1068 self.assertTrue(re.match(r'[\u0431-\u0433]', '\u1c80', re.I)) 1069 self.assertTrue(re.match(r'[\u1c80-\u1c82]', '\u0412', re.I)) 1070 self.assertTrue(re.match(r'[\u1c80-\u1c82]', '\u0432', re.I)) 1071 1072 # Two different characters have the same multicharacter uppercase. 1073 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 1074 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I)) 1075 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I)) 1076 1077 def test_category(self): 1078 self.assertEqual(re.match(r"(\s)", " ").group(1), " ") 1079 1080 @cpython_only 1081 def test_case_helpers(self): 1082 import _sre 1083 for i in range(128): 1084 c = chr(i) 1085 lo = ord(c.lower()) 1086 self.assertEqual(_sre.ascii_tolower(i), lo) 1087 self.assertEqual(_sre.unicode_tolower(i), lo) 1088 iscased = c in string.ascii_letters 1089 self.assertEqual(_sre.ascii_iscased(i), iscased) 1090 self.assertEqual(_sre.unicode_iscased(i), iscased) 1091 1092 for i in list(range(128, 0x1000)) + [0x10400, 0x10428]: 1093 c = chr(i) 1094 self.assertEqual(_sre.ascii_tolower(i), i) 1095 if i != 0x0130: 1096 self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) 1097 iscased = c != c.lower() or c != c.upper() 1098 self.assertFalse(_sre.ascii_iscased(i)) 1099 self.assertEqual(_sre.unicode_iscased(i), 1100 c != c.lower() or c != c.upper()) 1101 1102 self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130) 1103 self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) 1104 self.assertFalse(_sre.ascii_iscased(0x0130)) 1105 self.assertTrue(_sre.unicode_iscased(0x0130)) 1106 1107 def test_not_literal(self): 1108 self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") 1109 
self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb") 1110 1111 def test_possible_set_operations(self): 1112 s = bytes(range(128)).decode() 1113 with self.assertWarns(FutureWarning): 1114 p = re.compile(r'[0-9--1]') 1115 self.assertEqual(p.findall(s), list('-./0123456789')) 1116 self.assertEqual(re.findall(r'[--1]', s), list('-./01')) 1117 with self.assertWarns(FutureWarning): 1118 p = re.compile(r'[%--1]') 1119 self.assertEqual(p.findall(s), list("%&'()*+,-1")) 1120 with self.assertWarns(FutureWarning): 1121 p = re.compile(r'[%--]') 1122 self.assertEqual(p.findall(s), list("%&'()*+,-")) 1123 1124 with self.assertWarns(FutureWarning): 1125 p = re.compile(r'[0-9&&1]') 1126 self.assertEqual(p.findall(s), list('&0123456789')) 1127 with self.assertWarns(FutureWarning): 1128 p = re.compile(r'[\d&&1]') 1129 self.assertEqual(p.findall(s), list('&0123456789')) 1130 self.assertEqual(re.findall(r'[&&1]', s), list('&1')) 1131 1132 with self.assertWarns(FutureWarning): 1133 p = re.compile(r'[0-9||a]') 1134 self.assertEqual(p.findall(s), list('0123456789a|')) 1135 with self.assertWarns(FutureWarning): 1136 p = re.compile(r'[\d||a]') 1137 self.assertEqual(p.findall(s), list('0123456789a|')) 1138 self.assertEqual(re.findall(r'[||1]', s), list('1|')) 1139 1140 with self.assertWarns(FutureWarning): 1141 p = re.compile(r'[0-9~~1]') 1142 self.assertEqual(p.findall(s), list('0123456789~')) 1143 with self.assertWarns(FutureWarning): 1144 p = re.compile(r'[\d~~1]') 1145 self.assertEqual(p.findall(s), list('0123456789~')) 1146 self.assertEqual(re.findall(r'[~~1]', s), list('1~')) 1147 1148 with self.assertWarns(FutureWarning): 1149 p = re.compile(r'[[0-9]|]') 1150 self.assertEqual(p.findall(s), list('0123456789[]')) 1151 1152 with self.assertWarns(FutureWarning): 1153 p = re.compile(r'[[:digit:]|]') 1154 self.assertEqual(p.findall(s), list(':[]dgit')) 1155 1156 def test_search_coverage(self): 1157 self.assertEqual(re.search(r"\s(b)", " b").group(1), "b") 1158 
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.fullmatch):
    # Assert that matcher(pattern, text) succeeds and that the match has
    # the expected text and span.  With neither `match` nor `span` given,
    # the pattern is expected to match the whole of `text`; giving only
    # one of the two is a usage error.
    omitted = (match is None) + (span is None)
    if omitted == 1:
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if omitted == 2:
        match, span = text, (0, len(text))

    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
% re.escape('\u2620'), s, 1208 'x\u2620\u2620\u2620x', (2, 7), re.search) 1209 1210 def test_re_escape_non_ascii_bytes(self): 1211 b = 'y\u2620y\u2620y'.encode('utf-8') 1212 b_escaped = re.escape(b) 1213 self.assertEqual(b_escaped, b) 1214 self.assertMatch(b_escaped, b) 1215 res = re.findall(re.escape('\u2620'.encode('utf-8')), b) 1216 self.assertEqual(len(res), 2) 1217 1218 def test_pickling(self): 1219 import pickle 1220 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE) 1221 for proto in range(pickle.HIGHEST_PROTOCOL + 1): 1222 pickled = pickle.dumps(oldpat, proto) 1223 newpat = pickle.loads(pickled) 1224 self.assertEqual(newpat, oldpat) 1225 # current pickle expects the _compile() reconstructor in re module 1226 from re import _compile 1227 1228 def test_copying(self): 1229 import copy 1230 p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?') 1231 self.assertIs(copy.copy(p), p) 1232 self.assertIs(copy.deepcopy(p), p) 1233 m = p.match('12.34') 1234 self.assertIs(copy.copy(m), m) 1235 self.assertIs(copy.deepcopy(m), m) 1236 1237 def test_constants(self): 1238 self.assertEqual(re.I, re.IGNORECASE) 1239 self.assertEqual(re.L, re.LOCALE) 1240 self.assertEqual(re.M, re.MULTILINE) 1241 self.assertEqual(re.S, re.DOTALL) 1242 self.assertEqual(re.X, re.VERBOSE) 1243 1244 def test_flags(self): 1245 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]: 1246 self.assertTrue(re.compile('^pattern$', flag)) 1247 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]: 1248 self.assertTrue(re.compile(b'^pattern$', flag)) 1249 1250 def test_sre_character_literals(self): 1251 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: 1252 if i < 256: 1253 self.assertTrue(re.match(r"\%03o" % i, chr(i))) 1254 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0")) 1255 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8")) 1256 self.assertTrue(re.match(r"\x%02x" % i, chr(i))) 1257 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0")) 1258 
self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z")) 1259 if i < 0x10000: 1260 self.assertTrue(re.match(r"\u%04x" % i, chr(i))) 1261 self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0")) 1262 self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z")) 1263 self.assertTrue(re.match(r"\U%08x" % i, chr(i))) 1264 self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0")) 1265 self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z")) 1266 self.assertTrue(re.match(r"\0", "\000")) 1267 self.assertTrue(re.match(r"\08", "\0008")) 1268 self.assertTrue(re.match(r"\01", "\001")) 1269 self.assertTrue(re.match(r"\018", "\0018")) 1270 self.checkPatternError(r"\567", 1271 r'octal escape value \567 outside of ' 1272 r'range 0-0o377', 0) 1273 self.checkPatternError(r"\911", 'invalid group reference 91', 1) 1274 self.checkPatternError(r"\x1", r'incomplete escape \x1', 0) 1275 self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0) 1276 self.checkPatternError(r"\u123", r'incomplete escape \u123', 0) 1277 self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0) 1278 self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0) 1279 self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0) 1280 self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0) 1281 1282 def test_sre_character_class_literals(self): 1283 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: 1284 if i < 256: 1285 self.assertTrue(re.match(r"[\%o]" % i, chr(i))) 1286 self.assertTrue(re.match(r"[\%o8]" % i, chr(i))) 1287 self.assertTrue(re.match(r"[\%03o]" % i, chr(i))) 1288 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i))) 1289 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i))) 1290 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i))) 1291 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i))) 1292 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i))) 1293 if i < 0x10000: 1294 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i))) 1295 self.assertTrue(re.match(r"[\u%04x0]" 
% i, chr(i))) 1296 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i))) 1297 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i))) 1298 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0")) 1299 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z")) 1300 self.checkPatternError(r"[\567]", 1301 r'octal escape value \567 outside of ' 1302 r'range 0-0o377', 1) 1303 self.checkPatternError(r"[\911]", r'bad escape \9', 1) 1304 self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1) 1305 self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1) 1306 self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1) 1307 self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1) 1308 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e")) 1309 1310 def test_sre_byte_literals(self): 1311 for i in [0, 8, 16, 32, 64, 127, 128, 255]: 1312 self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i]))) 1313 self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0")) 1314 self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8")) 1315 self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i]))) 1316 self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0")) 1317 self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z")) 1318 self.assertRaises(re.error, re.compile, br"\u1234") 1319 self.assertRaises(re.error, re.compile, br"\U00012345") 1320 self.assertTrue(re.match(br"\0", b"\000")) 1321 self.assertTrue(re.match(br"\08", b"\0008")) 1322 self.assertTrue(re.match(br"\01", b"\001")) 1323 self.assertTrue(re.match(br"\018", b"\0018")) 1324 self.checkPatternError(br"\567", 1325 r'octal escape value \567 outside of ' 1326 r'range 0-0o377', 0) 1327 self.checkPatternError(br"\911", 'invalid group reference 91', 1) 1328 self.checkPatternError(br"\x1", r'incomplete escape \x1', 0) 1329 self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0) 1330 1331 def test_sre_byte_class_literals(self): 1332 for 
i in [0, 8, 16, 32, 64, 127, 128, 255]: 1333 self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i]))) 1334 self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i]))) 1335 self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i]))) 1336 self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i]))) 1337 self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i]))) 1338 self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i]))) 1339 self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i]))) 1340 self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i]))) 1341 self.assertRaises(re.error, re.compile, br"[\u1234]") 1342 self.assertRaises(re.error, re.compile, br"[\U00012345]") 1343 self.checkPatternError(br"[\567]", 1344 r'octal escape value \567 outside of ' 1345 r'range 0-0o377', 1) 1346 self.checkPatternError(br"[\911]", r'bad escape \9', 1) 1347 self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1) 1348 1349 def test_character_set_errors(self): 1350 self.checkPatternError(r'[', 'unterminated character set', 0) 1351 self.checkPatternError(r'[^', 'unterminated character set', 0) 1352 self.checkPatternError(r'[a', 'unterminated character set', 0) 1353 # bug 545855 -- This pattern failed to cause a compile error as it 1354 # should, instead provoking a TypeError. 
# Remaining checks of a character-range error test that begins earlier in the file.
        self.checkPatternError(r"[a-", 'unterminated character set', 0)
        self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
        self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
        self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)

    def test_bug_113254(self):
        # A group that did not take part in the match reports -1 from
        # start() and end(), and (-1, -1) from span().
        self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
        self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
        self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))

    def test_bug_527371(self):
        # bug described in patches 527371/672491
        # lastindex/lastgroup must skip groups that did not participate and,
        # for nested groups, report the enclosing (outer) one.
        self.assertIsNone(re.match(r'(a)?a','a').lastindex)
        self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
        self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
        self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
        self.assertEqual(re.match(r"((a))", "a").lastindex, 1)

    def test_bug_418626(self):
        # bugs 418626 et al. -- Testing Greg Chapman's addition of op code
        # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
        # pattern '*?' on a long string.
        self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
        self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
                         20003)
        self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
        # non-simple '*?' still used to hit the recursion limit, before the
        # non-recursive scheme was implemented.
        self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)

    def test_bug_612074(self):
        # An escaped non-ASCII character inside a character set must compile;
        # `compiled and 1` evaluates to 1 when compile() returns a truthy object.
        pat="["+re.escape("\u2039")+"]"
        self.assertEqual(re.compile(pat) and 1, 1)

    def test_stack_overflow(self):
        # nasty cases that used to overflow the straightforward recursive
        # implementation of repeated groups.
        self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
        self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
        self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')

    def test_nothing_to_repeat(self):
        # A quantifier (plain or with a '?' suffix) that has nothing before
        # it is rejected, both at the pattern start (position 0) and at the
        # start of a non-capturing group (position 3).
        for reps in '*', '+', '?', '{1,2}':
            for mod in '', '?':
                self.checkPatternError('%s%s' % (reps, mod),
                                       'nothing to repeat', 0)
                self.checkPatternError('(?:%s%s)' % (reps, mod),
                                       'nothing to repeat', 3)

    def test_multiple_repeat(self):
        # Stacking one quantifier directly on another is an error reported at
        # the position of the outer quantifier (1 + len(inner_op)).
        for outer_reps in '*', '+', '?', '{1,2}':
            for outer_mod in '', '?', '+':
                outer_op = outer_reps + outer_mod
                for inner_reps in '*', '+', '?', '{1,2}':
                    for inner_mod in '', '?', '+':
                        # Skip the cases where the inner quantifier has no
                        # suffix and the outer one is '?' or '+': there the
                        # outer character reads as the inner quantifier's
                        # own lazy/possessive suffix instead.
                        if inner_mod + outer_reps in ('?', '+'):
                            continue
                        inner_op = inner_reps + inner_mod
                        self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
                                'multiple repeat', 1 + len(inner_op))

    def test_unlimited_zero_width_repeat(self):
        # Issue #9669
        # Unbounded repeats (greedy and lazy) of a subpattern that can match
        # the empty string must still report "no match" for 'z'.
        self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
        self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
        self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
        self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
        self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
        self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))

    def test_scanner(self):
        # Token callbacks: each receives the scanner and the matched text and
        # returns the value that ends up in the result list.
        def s_ident(scanner, token): return token
        def s_operator(scanner, token): return "op%s" % token
        def s_float(scanner, token): return float(token)
        def s_int(scanner, token): return int(token)

        # The whitespace rule maps to None, and no whitespace token shows up
        # in the expected result below.
        scanner = Scanner([
            (r"[a-zA-Z_]\w*", s_ident),
            (r"\d+\.\d*", s_float),
            (r"\d+", s_int),
            (r"=|\+|-|\*|/", s_operator),
            (r"\s+", None),
            ])

        self.assertTrue(scanner.scanner.scanner("").pattern)

        # scan() returns (list of callback results, unmatched remainder).
        self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
                         (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
                           'op+', 'bar'], ''))

    def test_bug_448951(self):
        # bug 448951 (similar to 429357, but with single char match)
        # (Also test greedy matches.)
        for op in '','?','*':
            self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
                             (None, None))
            self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
                             ('a:', 'a'))

    def test_bug_725106(self):
        # capturing groups in alternatives in repeats
        # First four cases use a greedy '*', the last four the lazy '*?'.
        self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
                         ('b', 'a'))
        self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
                         ('c', 'b'))
        self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
                         ('b', None))
        self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
                         ('b', None))
        self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
                         ('b', 'a'))
        self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
                         ('c', 'b'))
        self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
                         ('b', None))
        self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
                         ('b', None))

    def test_bug_725149(self):
        # mark_stack_base restoring before restoring marks
        self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
                         ('a', None))
        self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
                         ('a', None, None))

    def test_bug_764548(self):
        # bug 764548, re.compile() barfs on str/unicode subclasses
        class my_unicode(str): pass
        pat = re.compile(my_unicode("abc"))
        self.assertIsNone(pat.match("xyz"))

    def test_finditer(self):
        # Module-level finditer over the whole string.
        iter = re.finditer(r":+", "a:b::c:::d")
        self.assertEqual([item.group(0) for item in iter],
                         [":", "::", ":::"])

        # pos/endpos given positionally.
        pat = re.compile(r":+")
        iter = pat.finditer("a:b::c:::d", 1, 10)
        self.assertEqual([item.group(0) for item in iter],
                         [":", "::", ":::"])

        # pos/endpos given as keywords.
        pat = re.compile(r":+")
        iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
        self.assertEqual([item.group(0) for item in iter],
                         [":", "::", ":::"])

        # Keyword order must not matter.
        pat = re.compile(r":+")
        iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
        self.assertEqual([item.group(0) for item in iter],
                         [":", "::", ":::"])

        # A narrower window: the trailing ':::' is cut to '::' by endpos=8.
        pat = re.compile(r":+")
        iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
        self.assertEqual([item.group(0) for item in iter],
                         ["::", "::"])

    def test_bug_926075(self):
        # A str pattern and a bytes pattern with the same text must not
        # yield the same compiled object.
        self.assertIsNot(re.compile('bug_926075'),
                         re.compile(b'bug_926075'))

    def test_bug_931848(self):
        # A character set mixing ASCII '.' (\u002E) with non-ASCII code
        # points must still split on the plain '.'.
        pattern = "[\u002E\u3002\uFF0E\uFF61]"
        self.assertEqual(re.compile(pattern).split("a.b.c"),
                         ['a','b','c'])

    def test_bug_581080(self):
        # finditer() is exhausted after the single whitespace match ...
        iter = re.finditer(r"\s", "a b")
        self.assertEqual(next(iter).span(), (1,2))
        self.assertRaises(StopIteration, next, iter)

        # ... and so is a scanner object: the second search() returns None.
        scanner = re.compile(r"\s").scanner("a b")
        self.assertEqual(scanner.search().span(), (1, 2))
        self.assertIsNone(scanner.search())

    def test_bug_817234(self):
        # '.*' yields the whole string, then one empty match at the end,
        # and then stops.
        iter = re.finditer(r".*", "asdf")
        self.assertEqual(next(iter).span(), (0, 4))
        self.assertEqual(next(iter).span(), (4, 4))
        self.assertRaises(StopIteration, next, iter)

    def test_bug_6561(self):
        # '\d' should match characters in Unicode category 'Nd'
        # (Number, Decimal Digit), but not those in 'Nl' (Number,
        # Letter) or 'No' (Number, Other).
        decimal_digits = [
            '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
            '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
            '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
            ]
        for x in decimal_digits:
            self.assertEqual(re.match(r'^\d$', x).group(0), x)

        not_decimal_digits = [
            '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
            '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
            '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
            '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
            ]
        for x in not_decimal_digits:
            self.assertIsNone(re.match(r'^\d$', x))

    def test_empty_array(self):
        # SF bug 1647541
        # Bytes patterns are matched against an empty array of each typecode:
        # a non-empty pattern finds nothing, the empty pattern matches.
        import array
        for typecode in 'bBuhHiIlLfd':
            a = array.array(typecode)
            self.assertIsNone(re.compile(b"bla").match(a))
            self.assertEqual(re.compile(b"").match(a).groups(), ())

    def test_inline_flags(self):
        # Bug #1700
        upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
        lower_char = '\u1ea1' # Latin Small Letter A with Dot Below

        # Flags passed as arguments only.
        p = re.compile('.' + upper_char, re.I | re.S)
        q = p.match('\n' + lower_char)
        self.assertTrue(q)

        p = re.compile('.' + lower_char, re.I | re.S)
        q = p.match('\n' + upper_char)
        self.assertTrue(q)

        # Inline (?i) combined with re.S passed as an argument.
        p = re.compile('(?i).' + upper_char, re.S)
        q = p.match('\n' + lower_char)
        self.assertTrue(q)

        p = re.compile('(?i).' + lower_char, re.S)
        q = p.match('\n' + upper_char)
        self.assertTrue(q)

        # Both flags inline, in a single group.
        p = re.compile('(?is).' + upper_char)
        q = p.match('\n' + lower_char)
        self.assertTrue(q)

        p = re.compile('(?is).' + lower_char)
        q = p.match('\n' + upper_char)
        self.assertTrue(q)

        # Both flags inline, in two consecutive groups.
        p = re.compile('(?s)(?i).' + upper_char)
        q = p.match('\n' + lower_char)
        self.assertTrue(q)

        p = re.compile('(?s)(?i).' + lower_char)
        q = p.match('\n' + upper_char)
        self.assertTrue(q)

        # Inline flags interacting with verbose mode and ignorable spaces.
        self.assertTrue(re.match('(?ix) ' + upper_char, lower_char))
        self.assertTrue(re.match('(?ix) ' + lower_char, upper_char))
        self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X))
        self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char))
        self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X))

        # Global inline flags anywhere but the start are an error.
        msg = "global flags not at the start of the expression"
        self.checkPatternError(upper_char + '(?i)', msg, 1)

        # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning
        with warnings.catch_warnings():
            warnings.simplefilter('error', BytesWarning)
            self.checkPatternError(b'A(?i)', msg, 1)

        self.checkPatternError('(?s).(?i)' + upper_char, msg, 5)
        self.checkPatternError('(?i) ' + upper_char + ' (?x)', msg, 7)
        self.checkPatternError(' (?x) (?i) ' + upper_char, msg, 1)
        self.checkPatternError('^(?i)' + upper_char, msg, 1)
        self.checkPatternError('$|(?i)' + upper_char, msg, 2)
        self.checkPatternError('(?:(?i)' + upper_char + ')', msg, 3)
        self.checkPatternError('(^)?(?(1)(?i)' + upper_char + ')', msg, 9)
        self.checkPatternError('($)?(?(1)|(?i)' + upper_char + ')', msg, 10)


    def test_dollar_matches_twice(self):
        r"""Test that $ does not include \n
        $ matches the end of string, and just before the terminating \n"""
        pattern = re.compile('$')
        self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
        self.assertEqual(pattern.sub('#', '\n'), '#\n#')

        # With MULTILINE, '$' additionally matches before every newline.
        pattern = re.compile('$', re.MULTILINE)
        self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
        self.assertEqual(pattern.sub('#', '\n'), '#\n#')

    def test_bytes_str_mixing(self):
        # Mixing str and bytes is disallowed
        pat = re.compile('.')
        bpat = re.compile(b'.')
        self.assertRaises(TypeError, pat.match, b'b')
        self.assertRaises(TypeError, bpat.match, 'b')
        self.assertRaises(TypeError, pat.sub, b'b', 'c')
        self.assertRaises(TypeError, pat.sub, 'b', b'c')
        self.assertRaises(TypeError, pat.sub, b'b', b'c')
        self.assertRaises(TypeError, bpat.sub, b'b', 'c')
        self.assertRaises(TypeError, bpat.sub, 'b', b'c')
        self.assertRaises(TypeError, bpat.sub, 'b', 'c')

    def test_ascii_and_unicode_flag(self):
        # String patterns
        for flags in (0, re.UNICODE):
            pat = re.compile('\xc0', flags | re.IGNORECASE)
            self.assertTrue(pat.match('\xe0'))
            pat = re.compile(r'\w', flags)
            self.assertTrue(pat.match('\xe0'))
        pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
        self.assertIsNone(pat.match('\xe0'))
        pat = re.compile('(?a)\xc0', re.IGNORECASE)
        self.assertIsNone(pat.match('\xe0'))
        pat = re.compile(r'\w', re.ASCII)
        self.assertIsNone(pat.match('\xe0'))
        pat = re.compile(r'(?a)\w')
        self.assertIsNone(pat.match('\xe0'))
        # Bytes patterns
        for flags in (0, re.ASCII):
            pat = re.compile(b'\xc0', flags | re.IGNORECASE)
            self.assertIsNone(pat.match(b'\xe0'))
            pat = re.compile(br'\w', flags)
            self.assertIsNone(pat.match(b'\xe0'))
        # Incompatibilities
        self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
        self.assertRaises(re.error, re.compile, br'(?u)\w')
        self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
        self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
        self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
        self.assertRaises(re.error, re.compile, r'(?au)\w')

    def test_locale_flag(self):
        enc = locale.getpreferredencoding()
        # Search non-ASCII letter
        # Looks for a byte >= 128 that decodes to a character whose lowercase
        # form differs and round-trips through `enc` as a single byte.
        for i in range(128, 256):
            try:
                c = bytes([i]).decode(enc)
                sletter = c.lower()
                if sletter == c: continue
                bletter = sletter.encode(enc)
                if len(bletter) != 1: continue
                if bletter.decode(enc) != sletter: continue
                bpat = re.escape(bytes([i]))
                break
            except (UnicodeError, TypeError):
                pass
        else:
            # No suitable letter found: the match assertions below are
            # skipped (bletter is falsy) and only compilation is exercised.
            bletter = None
            bpat = b'A'
        # Bytes patterns
        pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
        if bletter:
            self.assertTrue(pat.match(bletter))
        pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
        if bletter:
            self.assertTrue(pat.match(bletter))
        pat = re.compile(bpat, re.IGNORECASE)
        if bletter:
            self.assertIsNone(pat.match(bletter))
        pat = re.compile(br'\w', re.LOCALE)
        if bletter:
            self.assertTrue(pat.match(bletter))
        pat = re.compile(br'(?L)\w')
        if bletter:
            self.assertTrue(pat.match(bletter))
        pat = re.compile(br'\w')
        if bletter:
            self.assertIsNone(pat.match(bletter))
        # Incompatibilities
        self.assertRaises(ValueError, re.compile, '', re.LOCALE)
        self.assertRaises(re.error, re.compile, '(?L)')
        self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
        self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
        self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
        self.assertRaises(re.error, re.compile, b'(?aL)')

    def test_scoped_flags(self):
        # (?i:...) / (?-i:...) switch case-insensitivity on or off for the
        # enclosed subpattern only.
        self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
        self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
        self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
        self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
        self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
        self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))

        # Scoped ASCII / Unicode mode for \w and \W.
        self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0'))
        self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0'))
        self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII))

        # Invalid flag combinations.
        self.checkPatternError(r'(?a)(?-a:\w)',
                "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8)
        self.checkPatternError(r'(?i-i:a)',
                'bad inline flags: flag turned on and off', 5)
        self.checkPatternError(r'(?au:a)',
                "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
        self.checkPatternError(br'(?aL:a)',
                "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)

        # Malformed or truncated flag groups.
        self.checkPatternError(r'(?-', 'missing flag', 3)
        self.checkPatternError(r'(?-+', 'missing flag', 3)
        self.checkPatternError(r'(?-z', 'unknown flag', 3)
        self.checkPatternError(r'(?-i', 'missing :', 4)
        self.checkPatternError(r'(?-i)', 'missing :', 4)
        self.checkPatternError(r'(?-i+', 'missing :', 4)
        self.checkPatternError(r'(?-iz', 'unknown flag', 4)
        self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0)
        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
        self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
        self.checkPatternError(r'(?iz', 'unknown flag', 3)

    def test_ignore_spaces(self):
        # In verbose mode every whitespace character in the pattern is
        # ignored, for both str and bytes patterns.
        for space in " \t\n\r\v\f":
            self.assertTrue(re.fullmatch(space + 'a', 'a', re.VERBOSE))
        for space in b" ", b"\t", b"\n", b"\r", b"\v", b"\f":
            self.assertTrue(re.fullmatch(space + b'a', b'a', re.VERBOSE))
        self.assertTrue(re.fullmatch('(?x) a', 'a'))
        self.assertTrue(re.fullmatch(' (?x) a', 'a', re.VERBOSE))
        self.assertTrue(re.fullmatch('(?x) (?x) a', 'a'))
        # Scoped (?x:...) / (?-x:...) only affect the enclosed subpattern.
        self.assertTrue(re.fullmatch(' a(?x: b) c', ' ab c'))
        self.assertTrue(re.fullmatch(' a(?-x: b) c', 'a bc', re.VERBOSE))
        self.assertTrue(re.fullmatch('(?x) a(?-x: b) c', 'a bc'))
        self.assertTrue(re.fullmatch('(?x) a| b', 'a'))
        self.assertTrue(re.fullmatch('(?x) a| b', 'b'))

    def test_comments(self):
        # In verbose mode '#' starts a comment that runs to the end of the
        # line; outside verbose scope '#...' is matched literally.
        self.assertTrue(re.fullmatch('#x\na', 'a', re.VERBOSE))
        self.assertTrue(re.fullmatch(b'#x\na', b'a', re.VERBOSE))
        self.assertTrue(re.fullmatch('(?x)#x\na', 'a'))
        self.assertTrue(re.fullmatch('#x\n(?x)#y\na', 'a', re.VERBOSE))
        self.assertTrue(re.fullmatch('(?x)#x\n(?x)#y\na', 'a'))
        self.assertTrue(re.fullmatch('#x\na(?x:#y\nb)#z\nc', '#x\nab#z\nc'))
        self.assertTrue(re.fullmatch('#x\na(?-x:#y\nb)#z\nc', 'a#y\nbc',
                                     re.VERBOSE))
        self.assertTrue(re.fullmatch('(?x)#x\na(?-x:#y\nb)#z\nc', 'a#y\nbc'))
        self.assertTrue(re.fullmatch('(?x)#x\na|#y\nb', 'a'))
        self.assertTrue(re.fullmatch('(?x)#x\na|#y\nb', 'b'))

    def test_bug_6509(self):
        # Replacement strings of both types must parse properly.
        # all strings
        pat = re.compile(r'a(\w)')
        self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
        pat = re.compile('a(.)')
        self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
        pat = re.compile('..')
        self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')

        # all bytes
        pat = re.compile(br'a(\w)')
        self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
        pat = re.compile(b'a(.)')
        self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
        pat = re.compile(b'..')
        self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')

    def test_dealloc(self):
        # issue 3299: check for segfault in debug build
        import _sre
        # the overflow limit is different on wide and narrow builds and it
        # depends on the definition of SRE_CODE (see sre.h).
        # 2**128 should be big enough to overflow on both. For smaller values
        # a RuntimeError is raised instead of OverflowError.
        long_overflow = 2**128
        self.assertRaises(TypeError, re.finditer, "a", {})
        with self.assertRaises(OverflowError):
            _sre.compile("abc", 0, [long_overflow], 0, {}, ())
        with self.assertRaises(TypeError):
            _sre.compile({}, 0, [], 0, [], [])

    def test_search_dot_unicode(self):
        # '.*' must span characters from every code point range used here:
        # ASCII, \xe9, \u20ac and \U0010ffff.
        self.assertTrue(re.search("123.*-", '123abc-'))
        self.assertTrue(re.search("123.*-", '123\xe9-'))
        self.assertTrue(re.search("123.*-", '123\u20ac-'))
        self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
        self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))

    def test_compile(self):
        # Test return value when given string and pattern as parameter
        pattern = re.compile('random pattern')
        self.assertIsInstance(pattern, re.Pattern)
        same_pattern = re.compile(pattern)
        self.assertIsInstance(same_pattern, re.Pattern)
        self.assertIs(same_pattern, pattern)
        # Test behaviour when not given a string or pattern as parameter
        self.assertRaises(TypeError, re.compile, 0)

    @bigmemtest(size=_2G, memuse=1)
    def test_large_search(self, size):
        # Issue #10182: indices were 32-bit-truncated.
        s = 'a' * size
        m = re.search('$', s)
        self.assertIsNotNone(m)
        self.assertEqual(m.start(), size)
        self.assertEqual(m.end(), size)

    # The huge memuse is because of re.sub() using a list and a join()
    # to create the replacement result.
    @bigmemtest(size=_2G, memuse=16 + 2)
    def test_large_subn(self, size):
        # Issue #10182: indices were 32-bit-truncated.
        s = 'a' * size
        r, n = re.subn('', '', s)
        self.assertEqual(r, s)
        self.assertEqual(n, size + 1)

    def test_bug_16688(self):
        # Issue 16688: Backreferences make case-insensitive regex fail on
        # non-ASCII strings.
        self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
        self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))

    def test_repeat_minmax_overflow(self):
        # Issue #13169
        # Repeat counts on both sides of 65535/65536 must work.
        string = "x" * 100000
        self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
        self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
        self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
        self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
        self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
        self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
        # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
        self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
        self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
        self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))

    @cpython_only
    def test_repeat_minmax_overflow_maxrepeat(self):
        # Counts of MAXREPEAT - 1 compile; a count of MAXREPEAT itself
        # raises OverflowError.
        try:
            from _sre import MAXREPEAT
        except ImportError:
            self.skipTest('requires _sre.MAXREPEAT constant')
        string = "x" * 100000
        self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
        self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
                         (0, 100000))
        self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
        self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
        self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)

    def test_backref_group_name_in_exception(self):
        # Issue 17341: Poor error message when compiling invalid regex
        self.checkPatternError('(?P=<foo>)',
                               "bad character in group name '<foo>'", 4)

    def test_group_name_in_exception(self):
        # Issue 17341: Poor error message when compiling invalid regex
        self.checkPatternError('(?P<?foo>)',
                               "bad character in group name '?foo'", 4)

    def test_issue17998(self):
        # Every quantified '.' followed by a literal, with DOTALL, for both
        # str and bytes patterns; the pattern is passed as the failure msg.
        for reps in '*', '+', '?', '{1}':
            for mod in '', '?':
                pattern = '.' + reps + mod + 'yz'
                self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
                                 ['xyz'], msg=pattern)
                pattern = pattern.encode()
                self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
                                 [b'xyz'], msg=pattern)

    def test_match_repr(self):
        # repr() of a match shows the (optionally module-qualified) type
        # name, the span and the matched text, for str-like subjects ...
        for string in '[abracadabra]', S('[abracadabra]'):
            m = re.search(r'(.+)(.*?)\1', string)
            pattern = r"<(%s\.)?%s object; span=\(1, 12\), match='abracadabra'>" % (
                type(m).__module__, type(m).__qualname__
            )
            self.assertRegex(repr(m), pattern)
        # ... and for bytes-like subjects.
        for string in (b'[abracadabra]', B(b'[abracadabra]'),
                       bytearray(b'[abracadabra]'),
                       memoryview(b'[abracadabra]')):
            m = re.search(br'(.+)(.*?)\1', string)
            pattern = r"<(%s\.)?%s object; span=\(1, 12\), match=b'abracadabra'>" % (
                type(m).__module__, type(m).__qualname__
            )
            self.assertRegex(repr(m), pattern)

        # Two matches from one finditer() call each report their own span.
        first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
        pattern = r"<(%s\.)?%s object; span=\(0, 2\), match='aa'>" % (
            type(second).__module__, type(second).__qualname__
        )
        self.assertRegex(repr(first), pattern)
        pattern = r"<(%s\.)?%s object; span=\(3, 5\), match='bb'>" % (
            type(second).__module__, type(second).__qualname__
        )
        self.assertRegex(repr(second), pattern)

    def test_zerowidth(self):
        # Issues 852532, 1647489, 3262, 25054.
        # Zero-width matches in split(), sub(), findall() and finditer().
        self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
        self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', ''])
        self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc'])
        self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])

        self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
        self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-')
        self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]')

        self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
        self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
                         ['', 'a', '', '', 'bc', ''])

        self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
                         [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
        self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
                         [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])

    def test_bug_2537(self):
        # issue 2537: empty submatches
        for outer_op in ('{0,}', '*', '+', '{1,187}'):
            for inner_op in ('{0,}', '*', '?'):
                r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
                m = r.match("xyyzy")
                self.assertEqual(m.group(0), "xyy")
                self.assertEqual(m.group(1), "")
                self.assertEqual(m.group(2), "y")

    def test_keyword_parameters(self):
        # Issue #20283: Accepting the string keyword parameter.
        pat = re.compile(r'(ab)')
        self.assertEqual(
            pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
        self.assertEqual(
            pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
        self.assertEqual(
            pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
        self.assertEqual(
            pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
        self.assertEqual(
            pat.split(string='abracadabra', maxsplit=1),
            ['', 'ab', 'racadabra'])
        self.assertEqual(
            pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
            (7, 9))

    def test_bug_20998(self):
        # Issue #20998: Fullmatch of repeated single character pattern
        # with ignore case.
        self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))

    @unittest.skipIf(
        is_emscripten or is_wasi,
        "musl libc issue on Emscripten/WASI, bpo-46390"
    )
    def test_locale_caching(self):
        # Issue #22410
        # The original LC_CTYPE is restored on cleanup; the test is skipped
        # unless both locales can be set.
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        for loc in 'en_US.iso88591', 'en_US.utf8':
            try:
                locale.setlocale(locale.LC_CTYPE, loc)
            except locale.Error:
                # Unsupported locale on this system
                self.skipTest('test needs %s locale' % loc)

        # Run the two locale checks in both orders, purging the re cache
        # only before each pair.
        re.purge()
        self.check_en_US_iso88591()
        self.check_en_US_utf8()
        re.purge()
        self.check_en_US_utf8()
        self.check_en_US_iso88591()

    def check_en_US_iso88591(self):
        # Helper: under en_US.iso88591, b'\xc5' and b'\xe5' match each other
        # case-insensitively with the LOCALE flag (argument or inline).
        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
        self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
        self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
        self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
        self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))

    def check_en_US_utf8(self):
        # Helper: under en_US.utf8 the same two bytes only match themselves.
        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
        self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
        self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
        self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
        self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))

    @unittest.skipIf(
        is_emscripten or is_wasi,
        "musl libc issue on Emscripten/WASI, bpo-46390"
    )
    def test_locale_compiled(self):
        # The original LC_CTYPE is restored on cleanup; the test is skipped
        # unless both locales can be set.
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        for loc in 'en_US.iso88591', 'en_US.utf8':
            try:
                locale.setlocale(locale.LC_CTYPE, loc)
            except locale.Error:
                # Unsupported locale on this system
                self.skipTest('test needs %s locale' % loc)

        # The patterns are compiled once, under en_US.iso88591 ...
        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
        p1 = re.compile(b'\xc5\xe5', re.L|re.I)
        p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
        p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
        p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
        for p in p1, p2, p3:
            self.assertTrue(p.match(b'\xc5\xe5'))
            self.assertTrue(p.match(b'\xe5\xe5'))
            self.assertTrue(p.match(b'\xc5\xc5'))
        self.assertIsNone(p4.match(b'\xe5\xc5'))
        self.assertIsNone(p4.match(b'\xe5\xe5'))
        self.assertIsNone(p4.match(b'\xc5\xc5'))

        # ... and the same compiled objects are re-checked after switching
        # to en_US.utf8, where the expected results differ.
        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
        for p in p1, p2, p3:
            self.assertTrue(p.match(b'\xc5\xe5'))
            self.assertIsNone(p.match(b'\xe5\xe5'))
            self.assertIsNone(p.match(b'\xc5\xc5'))
        self.assertTrue(p4.match(b'\xe5\xc5'))
        self.assertIsNone(p4.match(b'\xe5\xe5'))
        self.assertIsNone(p4.match(b'\xc5\xc5'))

    def test_error(self):
        # re.error exposes pattern, pos, lineno, colno and msg; the position
        # text appears in str(err) but not in err.msg.
        with self.assertRaises(re.error) as cm:
            re.compile('(\u20ac))')
        err = cm.exception
        self.assertIsInstance(err.pattern, str)
        self.assertEqual(err.pattern, '(\u20ac))')
        self.assertEqual(err.pos, 3)
        self.assertEqual(err.lineno, 1)
        self.assertEqual(err.colno, 4)
        self.assertIn(err.msg, str(err))
        self.assertIn(' at position 3', str(err))
        self.assertNotIn(' at position 3', err.msg)
        # Bytes pattern
        with self.assertRaises(re.error) as cm:
            re.compile(b'(\xa4))')
        err = cm.exception
        self.assertIsInstance(err.pattern, bytes)
        self.assertEqual(err.pattern, b'(\xa4))')
        self.assertEqual(err.pos, 3)
        # Multiline pattern
        # NOTE: the indentation inside this literal is significant -- the
        # asserted pos (77) and colno (17) count its leading spaces.
        with self.assertRaises(re.error) as cm:
            re.compile("""
                (
                    abc
                )
                )
                (
                """, re.VERBOSE)
        err = cm.exception
        self.assertEqual(err.pos, 77)
        self.assertEqual(err.lineno, 5)
        self.assertEqual(err.colno, 17)
        self.assertIn(err.msg, str(err))
        self.assertIn(' at position 77', str(err))
        self.assertIn('(line 5, column 17)', str(err))

    def test_misc_errors(self):
        # Assorted malformed patterns with their expected message and
        # error position.
        self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
        self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
        self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
        self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
        self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
        self.checkPatternError(r'(?iz)', 'unknown flag', 3)
        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
        self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
        self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
        self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
        self.checkPatternError(r'(?', 'unexpected end of pattern', 2)

    def test_enum(self):
        # Issue #28082: Check that str(flag) returns a human readable string
        # instead of an integer
        self.assertIn('ASCII', str(re.A))
        self.assertIn('DOTALL', str(re.S))

    def test_pattern_compare(self):
        pattern1 = re.compile('abc', re.IGNORECASE)

        # equal to itself
        self.assertEqual(pattern1, pattern1)
        self.assertFalse(pattern1 != pattern1)

        # equal
        # (re.purge() presumably forces a fresh compile rather than
        # returning the cached pattern1 object)
        re.purge()
        pattern2 = re.compile('abc', re.IGNORECASE)
        self.assertEqual(hash(pattern2), hash(pattern1))
        self.assertEqual(pattern2, pattern1)

        # not equal: different pattern
        re.purge()
        pattern3 = re.compile('XYZ', re.IGNORECASE)
        # Don't test hash(pattern3) != hash(pattern1) because there is no
        # guarantee that hash values are different
        self.assertNotEqual(pattern3, pattern1)

        # not equal: different flag (flags=0)
        re.purge()
        pattern4 = re.compile('abc')
        self.assertNotEqual(pattern4, pattern1)

        # only == and != comparison operators are supported
        with self.assertRaises(TypeError):
            pattern1 < pattern2

    def test_pattern_compare_bytes(self):
        pattern1 = re.compile(b'abc')

        # equal: test bytes patterns
        re.purge()
        pattern2 = re.compile(b'abc')
        self.assertEqual(hash(pattern2), hash(pattern1))
        self.assertEqual(pattern2, pattern1)

        # not equal: patterns of different types (str vs bytes),
        # comparison must not raise a BytesWarning
        re.purge()
        pattern3 = re.compile('abc')
        with warnings.catch_warnings():
            warnings.simplefilter('error', BytesWarning)
            self.assertNotEqual(pattern3, pattern1)

    def test_bug_29444(self):
        # Match objects created on a bytearray must stay usable after the
        # buffer is replaced with shorter content: group() then reflects the
        # new contents (b'xyz' and b'').
        s = bytearray(b'abcdefgh')
        m = re.search(b'[a-h]+', s)
        m2 = re.search(b'[e-h]+', s)
        self.assertEqual(m.group(), b'abcdefgh')
        self.assertEqual(m2.group(), b'efgh')
        s[:] = b'xyz'
        self.assertEqual(m.group(), b'xyz')
        self.assertEqual(m2.group(), b'')

    def test_bug_34294(self):
        # Issue 34294: wrong capturing groups

        # exists since Python 2
        s = "a\tx"
        p = r"\b(?=(\t)|(x))x"
        self.assertEqual(re.search(p, s).groups(), (None, 'x'))

        # introduced in Python 3.7.0
        # (findall() reports an unmatched group as '', finditer().groups()
        # as None)
        s = "ab"
        p = r"(?=(.)(.)?)"
        self.assertEqual(re.findall(p, s),
                         [('a', 'b'), ('b', '')])
        self.assertEqual([m.groups() for m in re.finditer(p, s)],
                         [('a', 'b'), ('b', None)])

        # test-cases provided by issue34294, introduced in Python 3.7.0
        p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
        s = "<test><foo2/></test>"
        self.assertEqual(re.findall(p, s),
                         [('test', '<foo2/>'), ('foo2', '')])
        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
                         [{'tag': 'test', 'text': '<foo2/>'},
                          {'tag': 'foo2', 'text': None}])
        s = "<test>Hello</test><foo/>"
        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
                         [{'tag': 'test', 'text': 'Hello'},
                          {'tag': 'foo', 'text': None}])
        s = "<test>Hello</test><foo/><foo/>"
        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
                         [{'tag': 'test', 'text': 'Hello'},
                          {'tag': 'foo', 'text': None},
                          {'tag': 'foo', 'text': None}])

    def test_MARK_PUSH_macro_bug(self):
        # issue35859, MARK_PUSH() macro didn't protect MARK-0 if it
        # was the only available mark.
        self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',))
        self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',))
        self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',))
        self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',))

    def test_MIN_UNTIL_mark_bug(self):
        # Fixed in issue35859, reported in issue9134.
        # JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
        s = 'axxzbcz'
        p = r'(?:(?:a|bc)*?(xx)??z)*'
        self.assertEqual(re.match(p, s).groups(), ('xx',))

        # test-case provided by issue9134
        s = 'xtcxyzxc'
        p = r'((x|yz)+?(t)??c)*'
        m = re.match(p, s)
        self.assertEqual(m.span(), (0, 8))
        self.assertEqual(m.span(2), (6, 7))
        self.assertEqual(m.groups(), ('xyzxc', 'x', 't'))

    def test_REPEAT_ONE_mark_bug(self):
        # issue35859
        # JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat
        s = 'aabaab'
        p = r'(?:[^b]*a(?=(b)|(a))ab)*'
        m = re.match(p, s)
        self.assertEqual(m.span(), (0, 6))
        self.assertEqual(m.span(2), (4, 5))
        self.assertEqual(m.groups(), (None, 'a'))

        # JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat
        s = 'abab'
        p = r'(?:[^b]*(?=(b)|(a))ab)*'
        m = re.match(p, s)
        self.assertEqual(m.span(), (0, 4))
        self.assertEqual(m.span(2), (2, 3))
        self.assertEqual(m.groups(), (None, 'a'))

        self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',))

    def test_MIN_REPEAT_ONE_mark_bug(self):
        # issue35859
        # JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
        s = 'abab'
        p = r'(?:.*?(?=(a)|(b))b)*'
        m = re.match(p, s)
        self.assertEqual(m.span(), (0, 4))
        self.assertEqual(m.span(2), (3, 4))
        self.assertEqual(m.groups(), (None, 'b'))

        s = 'axxzaz'
        p = r'(?:a*?(xx)??z)*'
        self.assertEqual(re.match(p, s).groups(), ('xx',))

    def test_ASSERT_NOT_mark_bug(self):
        # Fixed in issue35859, reported in issue725149.
        # JUMP_ASSERT_NOT should LASTMARK_SAVE()
        self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,))

        # JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
        m = re.match(r'((?!(ab)c)(.))*', 'abab')
        self.assertEqual(m.span(), (0, 4))
        self.assertEqual(m.span(1), (3, 4))
        self.assertEqual(m.span(3), (3, 4))
        self.assertEqual(m.groups(), ('b', None, 'b'))

    def test_bug_40736(self):
        # The TypeError raised for a non-string subject names the type
        # that was actually passed.
        with self.assertRaisesRegex(TypeError, "got 'int'"):
            re.search("x*", 5)
        with self.assertRaisesRegex(TypeError, "got 'type'"):
            re.search("x*", type)

    def test_search_anchor_at_beginning(self):
        # A pattern anchored at the start that cannot match must fail fast
        # on a 10**7-character string instead of being retried at every
        # position. All calls for both patterns are timed together.
        s = 'x'*10**7
        start = time.perf_counter()
        for p in r'\Ay', r'^y':
            self.assertIsNone(re.search(p, s))
            self.assertEqual(re.split(p, s), [s])
            self.assertEqual(re.findall(p, s), [])
            self.assertEqual(list(re.finditer(p, s)), [])
            self.assertEqual(re.sub(p, '', s), s)
        t = time.perf_counter() - start
        # Without optimization it takes 1 second on my computer.
        # With optimization -- 0.0003 seconds.
        self.assertLess(t, 0.1)

    def test_possessive_quantifiers(self):
        """Test Possessive Quantifiers
        Test quantifiers of the form @+ for some repetition operator @,
        e.g. x{3,5}+ meaning match from 3 to 5 greedily and proceed
        without creating a stack frame for rolling the stack back and
        trying 1 or more fewer matches."""
        self.assertIsNone(re.match('e*+e', 'eeee'))
        self.assertEqual(re.match('e++a', 'eeea').group(0), 'eeea')
        self.assertEqual(re.match('e?+a', 'ea').group(0), 'ea')
        self.assertEqual(re.match('e{2,4}+a', 'eeea').group(0), 'eeea')
        self.assertIsNone(re.match('(.)++.', 'ee'))
        self.assertEqual(re.match('(ae)*+a', 'aea').groups(), ('ae',))
        self.assertEqual(re.match('([ae][ae])?+a', 'aea').groups(),
                         ('ae',))
        self.assertEqual(re.match('(e?){2,4}+a', 'eeea').groups(),
                         ('',))
        self.assertEqual(re.match('()*+a', 'a').groups(), ('',))
        self.assertEqual(re.search('x*+', 'axx').span(), (0, 0))
        self.assertEqual(re.search('x++', 'axx').span(), (1, 3))
        self.assertEqual(re.match('a*+', 'xxx').span(), (0, 0))
        self.assertEqual(re.match('x*+', 'xxxa').span(), (0, 3))
        self.assertIsNone(re.match('a++', 'xxx'))
        self.assertIsNone(re.match(r"^(\w){1}+$", "abc"))
        self.assertIsNone(re.match(r"^(\w){1,2}+$", "abc"))

        self.assertEqual(re.match(r"^(\w){3}+$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){1,3}+$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){1,4}+$", "abc").group(1), "c")

        self.assertIsNone(re.match("^x{1}+$", "xxx"))
        self.assertIsNone(re.match("^x{1,2}+$", "xxx"))

        self.assertTrue(re.match("^x{3}+$", "xxx"))
        self.assertTrue(re.match("^x{1,3}+$", "xxx"))
        self.assertTrue(re.match("^x{1,4}+$", "xxx"))

        # '{}' is not a repeat count: 'x{}+' matches the literal text 'x{}'.
        self.assertIsNone(re.match("^x{}+$", "xxx"))
        self.assertTrue(re.match("^x{}+$", "x{}"))

    def test_fullmatch_possessive_quantifiers(self):
        self.assertTrue(re.fullmatch(r'a++', 'a'))
        self.assertTrue(re.fullmatch(r'a*+', 'a'))
        self.assertTrue(re.fullmatch(r'a?+', 'a'))
        self.assertTrue(re.fullmatch(r'a{1,3}+', 'a'))

self.assertIsNone(re.fullmatch(r'a++', 'ab')) 2343 self.assertIsNone(re.fullmatch(r'a*+', 'ab')) 2344 self.assertIsNone(re.fullmatch(r'a?+', 'ab')) 2345 self.assertIsNone(re.fullmatch(r'a{1,3}+', 'ab')) 2346 self.assertTrue(re.fullmatch(r'a++b', 'ab')) 2347 self.assertTrue(re.fullmatch(r'a*+b', 'ab')) 2348 self.assertTrue(re.fullmatch(r'a?+b', 'ab')) 2349 self.assertTrue(re.fullmatch(r'a{1,3}+b', 'ab')) 2350 2351 self.assertTrue(re.fullmatch(r'(?:ab)++', 'ab')) 2352 self.assertTrue(re.fullmatch(r'(?:ab)*+', 'ab')) 2353 self.assertTrue(re.fullmatch(r'(?:ab)?+', 'ab')) 2354 self.assertTrue(re.fullmatch(r'(?:ab){1,3}+', 'ab')) 2355 self.assertIsNone(re.fullmatch(r'(?:ab)++', 'abc')) 2356 self.assertIsNone(re.fullmatch(r'(?:ab)*+', 'abc')) 2357 self.assertIsNone(re.fullmatch(r'(?:ab)?+', 'abc')) 2358 self.assertIsNone(re.fullmatch(r'(?:ab){1,3}+', 'abc')) 2359 self.assertTrue(re.fullmatch(r'(?:ab)++c', 'abc')) 2360 self.assertTrue(re.fullmatch(r'(?:ab)*+c', 'abc')) 2361 self.assertTrue(re.fullmatch(r'(?:ab)?+c', 'abc')) 2362 self.assertTrue(re.fullmatch(r'(?:ab){1,3}+c', 'abc')) 2363 2364 def test_findall_possessive_quantifiers(self): 2365 self.assertEqual(re.findall(r'a++', 'aab'), ['aa']) 2366 self.assertEqual(re.findall(r'a*+', 'aab'), ['aa', '', '']) 2367 self.assertEqual(re.findall(r'a?+', 'aab'), ['a', 'a', '', '']) 2368 self.assertEqual(re.findall(r'a{1,3}+', 'aab'), ['aa']) 2369 2370 self.assertEqual(re.findall(r'(?:ab)++', 'ababc'), ['abab']) 2371 self.assertEqual(re.findall(r'(?:ab)*+', 'ababc'), ['abab', '', '']) 2372 self.assertEqual(re.findall(r'(?:ab)?+', 'ababc'), ['ab', 'ab', '', '']) 2373 self.assertEqual(re.findall(r'(?:ab){1,3}+', 'ababc'), ['abab']) 2374 2375 def test_atomic_grouping(self): 2376 """Test Atomic Grouping 2377 Test non-capturing groups of the form (?>...), which does 2378 not maintain any stack point created within the group once the 2379 group is finished being evaluated.""" 2380 pattern1 = re.compile(r'a(?>bc|b)c') 2381 
self.assertIsNone(pattern1.match('abc')) 2382 self.assertTrue(pattern1.match('abcc')) 2383 self.assertIsNone(re.match(r'(?>.*).', 'abc')) 2384 self.assertTrue(re.match(r'(?>x)++', 'xxx')) 2385 self.assertTrue(re.match(r'(?>x++)', 'xxx')) 2386 self.assertIsNone(re.match(r'(?>x)++x', 'xxx')) 2387 self.assertIsNone(re.match(r'(?>x++)x', 'xxx')) 2388 2389 def test_fullmatch_atomic_grouping(self): 2390 self.assertTrue(re.fullmatch(r'(?>a+)', 'a')) 2391 self.assertTrue(re.fullmatch(r'(?>a*)', 'a')) 2392 self.assertTrue(re.fullmatch(r'(?>a?)', 'a')) 2393 self.assertTrue(re.fullmatch(r'(?>a{1,3})', 'a')) 2394 self.assertIsNone(re.fullmatch(r'(?>a+)', 'ab')) 2395 self.assertIsNone(re.fullmatch(r'(?>a*)', 'ab')) 2396 self.assertIsNone(re.fullmatch(r'(?>a?)', 'ab')) 2397 self.assertIsNone(re.fullmatch(r'(?>a{1,3})', 'ab')) 2398 self.assertTrue(re.fullmatch(r'(?>a+)b', 'ab')) 2399 self.assertTrue(re.fullmatch(r'(?>a*)b', 'ab')) 2400 self.assertTrue(re.fullmatch(r'(?>a?)b', 'ab')) 2401 self.assertTrue(re.fullmatch(r'(?>a{1,3})b', 'ab')) 2402 2403 self.assertTrue(re.fullmatch(r'(?>(?:ab)+)', 'ab')) 2404 self.assertTrue(re.fullmatch(r'(?>(?:ab)*)', 'ab')) 2405 self.assertTrue(re.fullmatch(r'(?>(?:ab)?)', 'ab')) 2406 self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})', 'ab')) 2407 self.assertIsNone(re.fullmatch(r'(?>(?:ab)+)', 'abc')) 2408 self.assertIsNone(re.fullmatch(r'(?>(?:ab)*)', 'abc')) 2409 self.assertIsNone(re.fullmatch(r'(?>(?:ab)?)', 'abc')) 2410 self.assertIsNone(re.fullmatch(r'(?>(?:ab){1,3})', 'abc')) 2411 self.assertTrue(re.fullmatch(r'(?>(?:ab)+)c', 'abc')) 2412 self.assertTrue(re.fullmatch(r'(?>(?:ab)*)c', 'abc')) 2413 self.assertTrue(re.fullmatch(r'(?>(?:ab)?)c', 'abc')) 2414 self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})c', 'abc')) 2415 2416 def test_findall_atomic_grouping(self): 2417 self.assertEqual(re.findall(r'(?>a+)', 'aab'), ['aa']) 2418 self.assertEqual(re.findall(r'(?>a*)', 'aab'), ['aa', '', '']) 2419 self.assertEqual(re.findall(r'(?>a?)', 'aab'), ['a', 
'a', '', '']) 2420 self.assertEqual(re.findall(r'(?>a{1,3})', 'aab'), ['aa']) 2421 2422 self.assertEqual(re.findall(r'(?>(?:ab)+)', 'ababc'), ['abab']) 2423 self.assertEqual(re.findall(r'(?>(?:ab)*)', 'ababc'), ['abab', '', '']) 2424 self.assertEqual(re.findall(r'(?>(?:ab)?)', 'ababc'), ['ab', 'ab', '', '']) 2425 self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab']) 2426 2427 def test_bug_gh91616(self): 2428 self.assertTrue(re.fullmatch(r'(?s:(?>.*?\.).*)\Z', "a.txt")) # reproducer 2429 self.assertTrue(re.fullmatch(r'(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\Z', "a.txt")) 2430 2431 def test_template_function_and_flag_is_deprecated(self): 2432 with self.assertWarns(DeprecationWarning) as cm: 2433 template_re1 = re.template(r'a') 2434 self.assertIn('re.template()', str(cm.warning)) 2435 self.assertIn('is deprecated', str(cm.warning)) 2436 self.assertIn('function', str(cm.warning)) 2437 self.assertNotIn('flag', str(cm.warning)) 2438 2439 with self.assertWarns(DeprecationWarning) as cm: 2440 # we deliberately use more flags here to test that that still 2441 # triggers the warning 2442 # if paranoid, we could test multiple different combinations, 2443 # but it's probably not worth it 2444 template_re2 = re.compile(r'a', flags=re.TEMPLATE|re.UNICODE) 2445 self.assertIn('re.TEMPLATE', str(cm.warning)) 2446 self.assertIn('is deprecated', str(cm.warning)) 2447 self.assertIn('flag', str(cm.warning)) 2448 self.assertNotIn('function', str(cm.warning)) 2449 2450 # while deprecated, is should still function 2451 self.assertEqual(template_re1, template_re2) 2452 self.assertTrue(template_re1.match('ahoy')) 2453 self.assertFalse(template_re1.match('nope')) 2454 2455 @unittest.skipIf(multiprocessing is None, 'test requires multiprocessing') 2456 def test_regression_gh94675(self): 2457 pattern = re.compile(r'(?<=[({}])(((//[^\n]*)?[\n])([\000-\040])*)*' 2458 r'((/[^/\[\n]*(([^\n]|(\[\n]*(]*)*\]))' 2459 r'[^/\[]*)*/))((((//[^\n]*)?[\n])' 2460 r'([\000-\040]|(/\*[^*]*\*+' 2461 
r'([^/*]\*+)*/))*)+(?=[^\000-\040);\]}]))') 2462 input_js = '''a(function() { 2463 /////////////////////////////////////////////////////////////////// 2464 });''' 2465 p = multiprocessing.Process(target=pattern.sub, args=('', input_js)) 2466 p.start() 2467 p.join(SHORT_TIMEOUT) 2468 try: 2469 self.assertFalse(p.is_alive(), 'pattern.sub() timed out') 2470 finally: 2471 if p.is_alive(): 2472 p.terminate() 2473 p.join() 2474 2475 2476def get_debug_out(pat): 2477 with captured_stdout() as out: 2478 re.compile(pat, re.DEBUG) 2479 return out.getvalue() 2480 2481 2482@cpython_only 2483class DebugTests(unittest.TestCase): 2484 maxDiff = None 2485 2486 def test_debug_flag(self): 2487 pat = r'(\.)(?:[ch]|py)(?(1)$|: )' 2488 dump = '''\ 2489SUBPATTERN 1 0 0 2490 LITERAL 46 2491BRANCH 2492 IN 2493 LITERAL 99 2494 LITERAL 104 2495OR 2496 LITERAL 112 2497 LITERAL 121 2498GROUPREF_EXISTS 1 2499 AT AT_END 2500ELSE 2501 LITERAL 58 2502 LITERAL 32 2503 2504 0. INFO 8 0b1 2 5 (to 9) 2505 prefix_skip 0 2506 prefix [0x2e] ('.') 2507 overlap [0] 2508 9: MARK 0 250911. LITERAL 0x2e ('.') 251013. MARK 1 251115. BRANCH 10 (to 26) 251217. IN 6 (to 24) 251319. LITERAL 0x63 ('c') 251421. LITERAL 0x68 ('h') 251523. FAILURE 251624: JUMP 9 (to 34) 251726: branch 7 (to 33) 251827. LITERAL 0x70 ('p') 251929. LITERAL 0x79 ('y') 252031. JUMP 2 (to 34) 252133: FAILURE 252234: GROUPREF_EXISTS 0 6 (to 41) 252337. AT END 252439. JUMP 5 (to 45) 252541: LITERAL 0x3a (':') 252643. LITERAL 0x20 (' ') 252745: SUCCESS 2528''' 2529 self.assertEqual(get_debug_out(pat), dump) 2530 # Debug output is output again even a second time (bypassing 2531 # the cache -- issue #20426). 2532 self.assertEqual(get_debug_out(pat), dump) 2533 2534 def test_atomic_group(self): 2535 self.assertEqual(get_debug_out(r'(?>ab?)'), '''\ 2536ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))] 2537 2538 0. INFO 4 0b0 1 2 (to 5) 2539 5: ATOMIC_GROUP 11 (to 17) 2540 7. LITERAL 0x61 ('a') 2541 9. 
REPEAT_ONE 6 0 1 (to 16) 254213. LITERAL 0x62 ('b') 254315. SUCCESS 254416: SUCCESS 254517: SUCCESS 2546''') 2547 2548 def test_possesive_repeat_one(self): 2549 self.assertEqual(get_debug_out(r'a?+'), '''\ 2550POSSESSIVE_REPEAT 0 1 2551 LITERAL 97 2552 2553 0. INFO 4 0b0 0 1 (to 5) 2554 5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12) 2555 9. LITERAL 0x61 ('a') 255611. SUCCESS 255712: SUCCESS 2558''') 2559 2560 def test_possesive_repeat(self): 2561 self.assertEqual(get_debug_out(r'(?:ab)?+'), '''\ 2562POSSESSIVE_REPEAT 0 1 2563 LITERAL 97 2564 LITERAL 98 2565 2566 0. INFO 4 0b0 0 2 (to 5) 2567 5: POSSESSIVE_REPEAT 7 0 1 (to 13) 2568 9. LITERAL 0x61 ('a') 256911. LITERAL 0x62 ('b') 257013: SUCCESS 257114. SUCCESS 2572''') 2573 2574 2575class PatternReprTests(unittest.TestCase): 2576 def check(self, pattern, expected): 2577 self.assertEqual(repr(re.compile(pattern)), expected) 2578 2579 def check_flags(self, pattern, flags, expected): 2580 self.assertEqual(repr(re.compile(pattern, flags)), expected) 2581 2582 def test_without_flags(self): 2583 self.check('random pattern', 2584 "re.compile('random pattern')") 2585 2586 def test_single_flag(self): 2587 self.check_flags('random pattern', re.IGNORECASE, 2588 "re.compile('random pattern', re.IGNORECASE)") 2589 2590 def test_multiple_flags(self): 2591 self.check_flags('random pattern', re.I|re.S|re.X, 2592 "re.compile('random pattern', " 2593 "re.IGNORECASE|re.DOTALL|re.VERBOSE)") 2594 2595 def test_unicode_flag(self): 2596 self.check_flags('random pattern', re.U, 2597 "re.compile('random pattern')") 2598 self.check_flags('random pattern', re.I|re.S|re.U, 2599 "re.compile('random pattern', " 2600 "re.IGNORECASE|re.DOTALL)") 2601 2602 def test_inline_flags(self): 2603 self.check('(?i)pattern', 2604 "re.compile('(?i)pattern', re.IGNORECASE)") 2605 2606 def test_unknown_flags(self): 2607 self.check_flags('random pattern', 0x123000, 2608 "re.compile('random pattern', 0x123000)") 2609 self.check_flags('random pattern', 0x123000|re.I, 
2610 "re.compile('random pattern', re.IGNORECASE|0x123000)") 2611 2612 def test_bytes(self): 2613 self.check(b'bytes pattern', 2614 "re.compile(b'bytes pattern')") 2615 self.check_flags(b'bytes pattern', re.A, 2616 "re.compile(b'bytes pattern', re.ASCII)") 2617 2618 def test_locale(self): 2619 self.check_flags(b'bytes pattern', re.L, 2620 "re.compile(b'bytes pattern', re.LOCALE)") 2621 2622 def test_quotes(self): 2623 self.check('random "double quoted" pattern', 2624 '''re.compile('random "double quoted" pattern')''') 2625 self.check("random 'single quoted' pattern", 2626 '''re.compile("random 'single quoted' pattern")''') 2627 self.check('''both 'single' and "double" quotes''', 2628 '''re.compile('both \\'single\\' and "double" quotes')''') 2629 2630 def test_long_pattern(self): 2631 pattern = 'Very %spattern' % ('long ' * 1000) 2632 r = repr(re.compile(pattern)) 2633 self.assertLess(len(r), 300) 2634 self.assertEqual(r[:30], "re.compile('Very long long lon") 2635 r = repr(re.compile(pattern, re.I)) 2636 self.assertLess(len(r), 300) 2637 self.assertEqual(r[:30], "re.compile('Very long long lon") 2638 self.assertEqual(r[-16:], ", re.IGNORECASE)") 2639 2640 def test_flags_repr(self): 2641 self.assertEqual(repr(re.I), "re.IGNORECASE") 2642 self.assertEqual(repr(re.I|re.S|re.X), 2643 "re.IGNORECASE|re.DOTALL|re.VERBOSE") 2644 self.assertEqual(repr(re.I|re.S|re.X|(1<<20)), 2645 "re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000") 2646 self.assertEqual( 2647 repr(~re.I), 2648 "re.ASCII|re.LOCALE|re.UNICODE|re.MULTILINE|re.DOTALL|re.VERBOSE|re.TEMPLATE|re.DEBUG") 2649 self.assertEqual(repr(~(re.I|re.S|re.X)), 2650 "re.ASCII|re.LOCALE|re.UNICODE|re.MULTILINE|re.TEMPLATE|re.DEBUG") 2651 self.assertEqual(repr(~(re.I|re.S|re.X|(1<<20))), 2652 "re.ASCII|re.LOCALE|re.UNICODE|re.MULTILINE|re.TEMPLATE|re.DEBUG|0xffe00") 2653 2654 2655class ImplementationTest(unittest.TestCase): 2656 """ 2657 Test implementation details of the re module. 
2658 """ 2659 2660 @cpython_only 2661 def test_immutable(self): 2662 # bpo-43908: check that re types are immutable 2663 with self.assertRaises(TypeError): 2664 re.Match.foo = 1 2665 with self.assertRaises(TypeError): 2666 re.Pattern.foo = 1 2667 with self.assertRaises(TypeError): 2668 pat = re.compile("") 2669 tp = type(pat.scanner("")) 2670 tp.foo = 1 2671 2672 def test_overlap_table(self): 2673 f = re._compiler._generate_overlap_table 2674 self.assertEqual(f(""), []) 2675 self.assertEqual(f("a"), [0]) 2676 self.assertEqual(f("abcd"), [0, 0, 0, 0]) 2677 self.assertEqual(f("aaaa"), [0, 1, 2, 3]) 2678 self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1]) 2679 self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0]) 2680 2681 def test_signedness(self): 2682 self.assertGreaterEqual(re._compiler.MAXREPEAT, 0) 2683 self.assertGreaterEqual(re._compiler.MAXGROUPS, 0) 2684 2685 @cpython_only 2686 def test_disallow_instantiation(self): 2687 # Ensure that the type disallows instantiation (bpo-43916) 2688 check_disallow_instantiation(self, re.Match) 2689 check_disallow_instantiation(self, re.Pattern) 2690 pat = re.compile("") 2691 check_disallow_instantiation(self, type(pat.scanner(""))) 2692 2693 def test_deprecated_modules(self): 2694 deprecated = { 2695 'sre_compile': ['compile', 'error', 2696 'SRE_FLAG_IGNORECASE', 'SUBPATTERN', 2697 '_compile_info'], 2698 'sre_constants': ['error', 'SRE_FLAG_IGNORECASE', 'SUBPATTERN', 2699 '_NamedIntConstant'], 2700 'sre_parse': ['SubPattern', 'parse', 2701 'SRE_FLAG_IGNORECASE', 'SUBPATTERN', 2702 '_parse_sub'], 2703 } 2704 for name in deprecated: 2705 with self.subTest(module=name): 2706 sys.modules.pop(name, None) 2707 with self.assertWarns(DeprecationWarning) as w: 2708 __import__(name) 2709 self.assertEqual(str(w.warning), 2710 f"module {name!r} is deprecated") 2711 self.assertEqual(w.filename, __file__) 2712 self.assertIn(name, sys.modules) 2713 mod = sys.modules[name] 2714 self.assertEqual(mod.__name__, name) 2715 
self.assertEqual(mod.__package__, '') 2716 for attr in deprecated[name]: 2717 self.assertTrue(hasattr(mod, attr)) 2718 del sys.modules[name] 2719 2720class ExternalTests(unittest.TestCase): 2721 2722 def test_re_benchmarks(self): 2723 're_tests benchmarks' 2724 from test.re_tests import benchmarks 2725 for pattern, s in benchmarks: 2726 with self.subTest(pattern=pattern, string=s): 2727 p = re.compile(pattern) 2728 self.assertTrue(p.search(s)) 2729 self.assertTrue(p.match(s)) 2730 self.assertTrue(p.fullmatch(s)) 2731 s2 = ' '*10000 + s + ' '*10000 2732 self.assertTrue(p.search(s2)) 2733 self.assertTrue(p.match(s2, 10000)) 2734 self.assertTrue(p.match(s2, 10000, 10000 + len(s))) 2735 self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s))) 2736 2737 def test_re_tests(self): 2738 're_tests test suite' 2739 from test.re_tests import tests, FAIL, SYNTAX_ERROR 2740 for t in tests: 2741 pattern = s = outcome = repl = expected = None 2742 if len(t) == 5: 2743 pattern, s, outcome, repl, expected = t 2744 elif len(t) == 3: 2745 pattern, s, outcome = t 2746 else: 2747 raise ValueError('Test tuples should have 3 or 5 fields', t) 2748 2749 with self.subTest(pattern=pattern, string=s): 2750 if outcome == SYNTAX_ERROR: # Expected a syntax error 2751 with self.assertRaises(re.error): 2752 re.compile(pattern) 2753 continue 2754 2755 obj = re.compile(pattern) 2756 result = obj.search(s) 2757 if outcome == FAIL: 2758 self.assertIsNone(result, 'Succeeded incorrectly') 2759 continue 2760 2761 with self.subTest(): 2762 self.assertTrue(result, 'Failed incorrectly') 2763 # Matched, as expected, so now we compute the 2764 # result string and compare it to our expected result. 
2765 start, end = result.span(0) 2766 vardict = {'found': result.group(0), 2767 'groups': result.group(), 2768 'flags': result.re.flags} 2769 for i in range(1, 100): 2770 try: 2771 gi = result.group(i) 2772 # Special hack because else the string concat fails: 2773 if gi is None: 2774 gi = "None" 2775 except IndexError: 2776 gi = "Error" 2777 vardict['g%d' % i] = gi 2778 for i in result.re.groupindex.keys(): 2779 try: 2780 gi = result.group(i) 2781 if gi is None: 2782 gi = "None" 2783 except IndexError: 2784 gi = "Error" 2785 vardict[i] = gi 2786 self.assertEqual(eval(repl, vardict), expected, 2787 'grouping error') 2788 2789 # Try the match with both pattern and string converted to 2790 # bytes, and check that it still succeeds. 2791 try: 2792 bpat = bytes(pattern, "ascii") 2793 bs = bytes(s, "ascii") 2794 except UnicodeEncodeError: 2795 # skip non-ascii tests 2796 pass 2797 else: 2798 with self.subTest('bytes pattern match'): 2799 obj = re.compile(bpat) 2800 self.assertTrue(obj.search(bs)) 2801 2802 # Try the match with LOCALE enabled, and check that it 2803 # still succeeds. 2804 with self.subTest('locale-sensitive match'): 2805 obj = re.compile(bpat, re.LOCALE) 2806 result = obj.search(bs) 2807 if result is None: 2808 print('=== Fails on locale-sensitive match', t) 2809 2810 # Try the match with the search area limited to the extent 2811 # of the match and see if it still succeeds. \B will 2812 # break (because it won't match at the end or start of a 2813 # string), so we'll ignore patterns that feature it. 2814 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B' 2815 and result is not None): 2816 with self.subTest('range-limited match'): 2817 obj = re.compile(pattern) 2818 self.assertTrue(obj.search(s, start, end + 1)) 2819 2820 # Try the match with IGNORECASE enabled, and check that it 2821 # still succeeds. 
2822 with self.subTest('case-insensitive match'): 2823 obj = re.compile(pattern, re.IGNORECASE) 2824 self.assertTrue(obj.search(s)) 2825 2826 # Try the match with UNICODE locale enabled, and check 2827 # that it still succeeds. 2828 with self.subTest('unicode-sensitive match'): 2829 obj = re.compile(pattern, re.UNICODE) 2830 self.assertTrue(obj.search(s)) 2831 2832 2833if __name__ == "__main__": 2834 unittest.main() 2835