1 /*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
7 *
8 * See the sre.c file for information on usage and redistribution.
9 */
10
11 /* String matching engine */
12
13 /* This file is included three times, with different character settings */
14
15 LOCAL(int)
SRE(at)16 SRE(at)(SRE_STATE* state, const SRE_CHAR* ptr, SRE_CODE at)
17 {
18 /* check if pointer is at given position */
19
20 Py_ssize_t thisp, thatp;
21
22 switch (at) {
23
24 case SRE_AT_BEGINNING:
25 case SRE_AT_BEGINNING_STRING:
26 return ((void*) ptr == state->beginning);
27
28 case SRE_AT_BEGINNING_LINE:
29 return ((void*) ptr == state->beginning ||
30 SRE_IS_LINEBREAK((int) ptr[-1]));
31
32 case SRE_AT_END:
33 return (((SRE_CHAR *)state->end - ptr == 1 &&
34 SRE_IS_LINEBREAK((int) ptr[0])) ||
35 ((void*) ptr == state->end));
36
37 case SRE_AT_END_LINE:
38 return ((void*) ptr == state->end ||
39 SRE_IS_LINEBREAK((int) ptr[0]));
40
41 case SRE_AT_END_STRING:
42 return ((void*) ptr == state->end);
43
44 case SRE_AT_BOUNDARY:
45 if (state->beginning == state->end)
46 return 0;
47 thatp = ((void*) ptr > state->beginning) ?
48 SRE_IS_WORD((int) ptr[-1]) : 0;
49 thisp = ((void*) ptr < state->end) ?
50 SRE_IS_WORD((int) ptr[0]) : 0;
51 return thisp != thatp;
52
53 case SRE_AT_NON_BOUNDARY:
54 if (state->beginning == state->end)
55 return 0;
56 thatp = ((void*) ptr > state->beginning) ?
57 SRE_IS_WORD((int) ptr[-1]) : 0;
58 thisp = ((void*) ptr < state->end) ?
59 SRE_IS_WORD((int) ptr[0]) : 0;
60 return thisp == thatp;
61
62 case SRE_AT_LOC_BOUNDARY:
63 if (state->beginning == state->end)
64 return 0;
65 thatp = ((void*) ptr > state->beginning) ?
66 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
67 thisp = ((void*) ptr < state->end) ?
68 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
69 return thisp != thatp;
70
71 case SRE_AT_LOC_NON_BOUNDARY:
72 if (state->beginning == state->end)
73 return 0;
74 thatp = ((void*) ptr > state->beginning) ?
75 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
76 thisp = ((void*) ptr < state->end) ?
77 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
78 return thisp == thatp;
79
80 case SRE_AT_UNI_BOUNDARY:
81 if (state->beginning == state->end)
82 return 0;
83 thatp = ((void*) ptr > state->beginning) ?
84 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
85 thisp = ((void*) ptr < state->end) ?
86 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
87 return thisp != thatp;
88
89 case SRE_AT_UNI_NON_BOUNDARY:
90 if (state->beginning == state->end)
91 return 0;
92 thatp = ((void*) ptr > state->beginning) ?
93 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
94 thisp = ((void*) ptr < state->end) ?
95 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
96 return thisp == thatp;
97
98 }
99
100 return 0;
101 }
102
103 LOCAL(int)
SRE(charset)104 SRE(charset)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch)
105 {
106 /* check if character is a member of the given set */
107
108 int ok = 1;
109
110 for (;;) {
111 switch (*set++) {
112
113 case SRE_OP_FAILURE:
114 return !ok;
115
116 case SRE_OP_LITERAL:
117 /* <LITERAL> <code> */
118 if (ch == set[0])
119 return ok;
120 set++;
121 break;
122
123 case SRE_OP_CATEGORY:
124 /* <CATEGORY> <code> */
125 if (sre_category(set[0], (int) ch))
126 return ok;
127 set++;
128 break;
129
130 case SRE_OP_CHARSET:
131 /* <CHARSET> <bitmap> */
132 if (ch < 256 &&
133 (set[ch/SRE_CODE_BITS] & (1u << (ch & (SRE_CODE_BITS-1)))))
134 return ok;
135 set += 256/SRE_CODE_BITS;
136 break;
137
138 case SRE_OP_RANGE:
139 /* <RANGE> <lower> <upper> */
140 if (set[0] <= ch && ch <= set[1])
141 return ok;
142 set += 2;
143 break;
144
145 case SRE_OP_RANGE_UNI_IGNORE:
146 /* <RANGE_UNI_IGNORE> <lower> <upper> */
147 {
148 SRE_CODE uch;
149 /* ch is already lower cased */
150 if (set[0] <= ch && ch <= set[1])
151 return ok;
152 uch = sre_upper_unicode(ch);
153 if (set[0] <= uch && uch <= set[1])
154 return ok;
155 set += 2;
156 break;
157 }
158
159 case SRE_OP_NEGATE:
160 ok = !ok;
161 break;
162
163 case SRE_OP_BIGCHARSET:
164 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
165 {
166 Py_ssize_t count, block;
167 count = *(set++);
168
169 if (ch < 0x10000u)
170 block = ((unsigned char*)set)[ch >> 8];
171 else
172 block = -1;
173 set += 256/sizeof(SRE_CODE);
174 if (block >=0 &&
175 (set[(block * 256 + (ch & 255))/SRE_CODE_BITS] &
176 (1u << (ch & (SRE_CODE_BITS-1)))))
177 return ok;
178 set += count * (256/SRE_CODE_BITS);
179 break;
180 }
181
182 default:
183 /* internal error -- there's not much we can do about it
184 here, so let's just pretend it didn't match... */
185 return 0;
186 }
187 }
188 }
189
190 LOCAL(int)
SRE(charset_loc_ignore)191 SRE(charset_loc_ignore)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch)
192 {
193 SRE_CODE lo, up;
194 lo = sre_lower_locale(ch);
195 if (SRE(charset)(state, set, lo))
196 return 1;
197
198 up = sre_upper_locale(ch);
199 return up != lo && SRE(charset)(state, set, up);
200 }
201
202 LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel);
203
204 LOCAL(Py_ssize_t)
SRE(count)205 SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount)
206 {
207 SRE_CODE chr;
208 SRE_CHAR c;
209 const SRE_CHAR* ptr = (const SRE_CHAR *)state->ptr;
210 const SRE_CHAR* end = (const SRE_CHAR *)state->end;
211 Py_ssize_t i;
212
213 /* adjust end */
214 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
215 end = ptr + maxcount;
216
217 switch (pattern[0]) {
218
219 case SRE_OP_IN:
220 /* repeated set */
221 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
222 while (ptr < end && SRE(charset)(state, pattern + 2, *ptr))
223 ptr++;
224 break;
225
226 case SRE_OP_ANY:
227 /* repeated dot wildcard. */
228 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
229 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
230 ptr++;
231 break;
232
233 case SRE_OP_ANY_ALL:
234 /* repeated dot wildcard. skip to the end of the target
235 string, and backtrack from there */
236 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
237 ptr = end;
238 break;
239
240 case SRE_OP_LITERAL:
241 /* repeated literal */
242 chr = pattern[1];
243 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
244 c = (SRE_CHAR) chr;
245 #if SIZEOF_SRE_CHAR < 4
246 if ((SRE_CODE) c != chr)
247 ; /* literal can't match: doesn't fit in char width */
248 else
249 #endif
250 while (ptr < end && *ptr == c)
251 ptr++;
252 break;
253
254 case SRE_OP_LITERAL_IGNORE:
255 /* repeated literal */
256 chr = pattern[1];
257 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
258 while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr)
259 ptr++;
260 break;
261
262 case SRE_OP_LITERAL_UNI_IGNORE:
263 /* repeated literal */
264 chr = pattern[1];
265 TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
266 while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr)
267 ptr++;
268 break;
269
270 case SRE_OP_LITERAL_LOC_IGNORE:
271 /* repeated literal */
272 chr = pattern[1];
273 TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
274 while (ptr < end && char_loc_ignore(chr, *ptr))
275 ptr++;
276 break;
277
278 case SRE_OP_NOT_LITERAL:
279 /* repeated non-literal */
280 chr = pattern[1];
281 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
282 c = (SRE_CHAR) chr;
283 #if SIZEOF_SRE_CHAR < 4
284 if ((SRE_CODE) c != chr)
285 ptr = end; /* literal can't match: doesn't fit in char width */
286 else
287 #endif
288 while (ptr < end && *ptr != c)
289 ptr++;
290 break;
291
292 case SRE_OP_NOT_LITERAL_IGNORE:
293 /* repeated non-literal */
294 chr = pattern[1];
295 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
296 while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr)
297 ptr++;
298 break;
299
300 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
301 /* repeated non-literal */
302 chr = pattern[1];
303 TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr));
304 while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr)
305 ptr++;
306 break;
307
308 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
309 /* repeated non-literal */
310 chr = pattern[1];
311 TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
312 while (ptr < end && !char_loc_ignore(chr, *ptr))
313 ptr++;
314 break;
315
316 default:
317 /* repeated single character pattern */
318 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
319 while ((SRE_CHAR*) state->ptr < end) {
320 i = SRE(match)(state, pattern, 0);
321 if (i < 0)
322 return i;
323 if (!i)
324 break;
325 }
326 TRACE(("|%p|%p|COUNT %zd\n", pattern, ptr,
327 (SRE_CHAR*) state->ptr - ptr));
328 return (SRE_CHAR*) state->ptr - ptr;
329 }
330
331 TRACE(("|%p|%p|COUNT %zd\n", pattern, ptr,
332 ptr - (SRE_CHAR*) state->ptr));
333 return ptr - (SRE_CHAR*) state->ptr;
334 }
335
336 /* The macros below should be used to protect recursive SRE(match)()
337 * calls that *failed* and do *not* return immediately (IOW, those
338 * that will backtrack). Explaining:
339 *
340 * - Recursive SRE(match)() returned true: that's usually a success
341 * (besides atypical cases like ASSERT_NOT), therefore there's no
342 * reason to restore lastmark;
343 *
344 * - Recursive SRE(match)() returned false but the current SRE(match)()
345 * is returning to the caller: If the current SRE(match)() is the
346 * top function of the recursion, returning false will be a matching
347 * failure, and it doesn't matter where lastmark is pointing to.
348 * If it's *not* the top function, it will be a recursive SRE(match)()
349 * failure by itself, and the calling SRE(match)() will have to deal
350 * with the failure by the same rules explained here (it will restore
351 * lastmark by itself if necessary);
352 *
353 * - Recursive SRE(match)() returned false, and will continue the
354 * outside 'for' loop: must be protected when breaking, since the next
355 * OP could potentially depend on lastmark;
356 *
357 * - Recursive SRE(match)() returned false, and will be called again
358 * inside a local for/while loop: must be protected between each
359 * loop iteration, since the recursive SRE(match)() could do anything,
360 * and could potentially depend on lastmark.
361 *
362 * For more information, check the discussion at SF patch #712900.
363 */
364 #define LASTMARK_SAVE() \
365 do { \
366 ctx->lastmark = state->lastmark; \
367 ctx->lastindex = state->lastindex; \
368 } while (0)
369 #define LASTMARK_RESTORE() \
370 do { \
371 state->lastmark = ctx->lastmark; \
372 state->lastindex = ctx->lastindex; \
373 } while (0)
374
375 #define RETURN_ERROR(i) do { return i; } while(0)
376 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
377 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
378
379 #define RETURN_ON_ERROR(i) \
380 do { if (i < 0) RETURN_ERROR(i); } while (0)
381 #define RETURN_ON_SUCCESS(i) \
382 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
383 #define RETURN_ON_FAILURE(i) \
384 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
385
386 #define DATA_STACK_ALLOC(state, type, ptr) \
387 do { \
388 alloc_pos = state->data_stack_base; \
389 TRACE(("allocating %s in %zd (%zd)\n", \
390 Py_STRINGIFY(type), alloc_pos, sizeof(type))); \
391 if (sizeof(type) > state->data_stack_size - alloc_pos) { \
392 int j = data_stack_grow(state, sizeof(type)); \
393 if (j < 0) return j; \
394 if (ctx_pos != -1) \
395 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \
396 } \
397 ptr = (type*)(state->data_stack+alloc_pos); \
398 state->data_stack_base += sizeof(type); \
399 } while (0)
400
401 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
402 do { \
403 TRACE(("looking up %s at %zd\n", Py_STRINGIFY(type), pos)); \
404 ptr = (type*)(state->data_stack+pos); \
405 } while (0)
406
407 #define DATA_STACK_PUSH(state, data, size) \
408 do { \
409 TRACE(("copy data in %p to %zd (%zd)\n", \
410 data, state->data_stack_base, size)); \
411 if (size > state->data_stack_size - state->data_stack_base) { \
412 int j = data_stack_grow(state, size); \
413 if (j < 0) return j; \
414 if (ctx_pos != -1) \
415 DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \
416 } \
417 memcpy(state->data_stack+state->data_stack_base, data, size); \
418 state->data_stack_base += size; \
419 } while (0)
420
421 /* We add an explicit cast to memcpy here because MSVC has a bug when
422 compiling C code where it believes that `const void**` cannot be
423 safely casted to `void*`, see bpo-39943 for details. */
424 #define DATA_STACK_POP(state, data, size, discard) \
425 do { \
426 TRACE(("copy data to %p from %zd (%zd)\n", \
427 data, state->data_stack_base-size, size)); \
428 memcpy((void*) data, state->data_stack+state->data_stack_base-size, size); \
429 if (discard) \
430 state->data_stack_base -= size; \
431 } while (0)
432
433 #define DATA_STACK_POP_DISCARD(state, size) \
434 do { \
435 TRACE(("discard data from %zd (%zd)\n", \
436 state->data_stack_base-size, size)); \
437 state->data_stack_base -= size; \
438 } while(0)
439
440 #define DATA_PUSH(x) \
441 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
442 #define DATA_POP(x) \
443 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
444 #define DATA_POP_DISCARD(x) \
445 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
446 #define DATA_ALLOC(t,p) \
447 DATA_STACK_ALLOC(state, t, p)
448 #define DATA_LOOKUP_AT(t,p,pos) \
449 DATA_STACK_LOOKUP_AT(state,t,p,pos)
450
451 #define MARK_PUSH(lastmark) \
452 do if (lastmark >= 0) { \
453 size_t _marks_size = (lastmark+1) * sizeof(void*); \
454 DATA_STACK_PUSH(state, state->mark, _marks_size); \
455 } while (0)
456 #define MARK_POP(lastmark) \
457 do if (lastmark >= 0) { \
458 size_t _marks_size = (lastmark+1) * sizeof(void*); \
459 DATA_STACK_POP(state, state->mark, _marks_size, 1); \
460 } while (0)
461 #define MARK_POP_KEEP(lastmark) \
462 do if (lastmark >= 0) { \
463 size_t _marks_size = (lastmark+1) * sizeof(void*); \
464 DATA_STACK_POP(state, state->mark, _marks_size, 0); \
465 } while (0)
466 #define MARK_POP_DISCARD(lastmark) \
467 do if (lastmark >= 0) { \
468 size_t _marks_size = (lastmark+1) * sizeof(void*); \
469 DATA_STACK_POP_DISCARD(state, _marks_size); \
470 } while (0)
471
472 #define JUMP_NONE 0
473 #define JUMP_MAX_UNTIL_1 1
474 #define JUMP_MAX_UNTIL_2 2
475 #define JUMP_MAX_UNTIL_3 3
476 #define JUMP_MIN_UNTIL_1 4
477 #define JUMP_MIN_UNTIL_2 5
478 #define JUMP_MIN_UNTIL_3 6
479 #define JUMP_REPEAT 7
480 #define JUMP_REPEAT_ONE_1 8
481 #define JUMP_REPEAT_ONE_2 9
482 #define JUMP_MIN_REPEAT_ONE 10
483 #define JUMP_BRANCH 11
484 #define JUMP_ASSERT 12
485 #define JUMP_ASSERT_NOT 13
486 #define JUMP_POSS_REPEAT_1 14
487 #define JUMP_POSS_REPEAT_2 15
488 #define JUMP_ATOMIC_GROUP 16
489
490 #define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
491 ctx->pattern = pattern; \
492 ctx->ptr = ptr; \
493 DATA_ALLOC(SRE(match_context), nextctx); \
494 nextctx->pattern = nextpattern; \
495 nextctx->toplevel = toplevel_; \
496 nextctx->jump = jumpvalue; \
497 nextctx->last_ctx_pos = ctx_pos; \
498 pattern = nextpattern; \
499 ctx_pos = alloc_pos; \
500 ctx = nextctx; \
501 goto entrance; \
502 jumplabel: \
503 pattern = ctx->pattern; \
504 ptr = ctx->ptr;
505
506 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
507 DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel)
508
509 #define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \
510 DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
511
512 typedef struct {
513 Py_ssize_t count;
514 union {
515 SRE_CODE chr;
516 SRE_REPEAT* rep;
517 } u;
518 int lastmark;
519 int lastindex;
520 const SRE_CODE* pattern;
521 const SRE_CHAR* ptr;
522 int toplevel;
523 int jump;
524 Py_ssize_t last_ctx_pos;
525 } SRE(match_context);
526
527 #define MAYBE_CHECK_SIGNALS \
528 do { \
529 if ((0 == (++sigcount & 0xfff)) && PyErr_CheckSignals()) { \
530 RETURN_ERROR(SRE_ERROR_INTERRUPTED); \
531 } \
532 } while (0)
533
534 #ifdef HAVE_COMPUTED_GOTOS
535 #ifndef USE_COMPUTED_GOTOS
536 #define USE_COMPUTED_GOTOS 1
537 #endif
538 #elif defined(USE_COMPUTED_GOTOS) && USE_COMPUTED_GOTOS
539 #error "Computed gotos are not supported on this compiler."
540 #else
541 #undef USE_COMPUTED_GOTOS
542 #define USE_COMPUTED_GOTOS 0
543 #endif
544
545 #if USE_COMPUTED_GOTOS
546 #define TARGET(OP) TARGET_ ## OP
547 #define DISPATCH \
548 do { \
549 MAYBE_CHECK_SIGNALS; \
550 goto *sre_targets[*pattern++]; \
551 } while (0)
552 #else
553 #define TARGET(OP) case OP
554 #define DISPATCH goto dispatch
555 #endif
556
557 /* check if string matches the given pattern. returns <0 for
558 error, 0 for failure, and 1 for success */
559 LOCAL(Py_ssize_t)
SRE(match)560 SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
561 {
562 const SRE_CHAR* end = (const SRE_CHAR *)state->end;
563 Py_ssize_t alloc_pos, ctx_pos = -1;
564 Py_ssize_t ret = 0;
565 int jump;
566 unsigned int sigcount=0;
567
568 SRE(match_context)* ctx;
569 SRE(match_context)* nextctx;
570
571 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
572
573 DATA_ALLOC(SRE(match_context), ctx);
574 ctx->last_ctx_pos = -1;
575 ctx->jump = JUMP_NONE;
576 ctx->toplevel = toplevel;
577 ctx_pos = alloc_pos;
578
579 #if USE_COMPUTED_GOTOS
580 #include "sre_targets.h"
581 #endif
582
583 entrance:
584
585 ; // Fashion statement.
586 const SRE_CHAR *ptr = (SRE_CHAR *)state->ptr;
587
588 if (pattern[0] == SRE_OP_INFO) {
589 /* optimization info block */
590 /* <INFO> <1=skip> <2=flags> <3=min> ... */
591 if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
592 TRACE(("reject (got %zd chars, need %zd)\n",
593 end - ptr, (Py_ssize_t) pattern[3]));
594 RETURN_FAILURE;
595 }
596 pattern += pattern[1] + 1;
597 }
598
599 #if USE_COMPUTED_GOTOS
600 DISPATCH;
601 #else
602 dispatch:
603 MAYBE_CHECK_SIGNALS;
604 switch (*pattern++)
605 #endif
606 {
607
608 TARGET(SRE_OP_MARK):
609 /* set mark */
610 /* <MARK> <gid> */
611 TRACE(("|%p|%p|MARK %d\n", pattern,
612 ptr, pattern[0]));
613 {
614 int i = pattern[0];
615 if (i & 1)
616 state->lastindex = i/2 + 1;
617 if (i > state->lastmark) {
618 /* state->lastmark is the highest valid index in the
619 state->mark array. If it is increased by more than 1,
620 the intervening marks must be set to NULL to signal
621 that these marks have not been encountered. */
622 int j = state->lastmark + 1;
623 while (j < i)
624 state->mark[j++] = NULL;
625 state->lastmark = i;
626 }
627 state->mark[i] = ptr;
628 }
629 pattern++;
630 DISPATCH;
631
632 TARGET(SRE_OP_LITERAL):
633 /* match literal string */
634 /* <LITERAL> <code> */
635 TRACE(("|%p|%p|LITERAL %d\n", pattern,
636 ptr, *pattern));
637 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
638 RETURN_FAILURE;
639 pattern++;
640 ptr++;
641 DISPATCH;
642
643 TARGET(SRE_OP_NOT_LITERAL):
644 /* match anything that is not literal character */
645 /* <NOT_LITERAL> <code> */
646 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern,
647 ptr, *pattern));
648 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
649 RETURN_FAILURE;
650 pattern++;
651 ptr++;
652 DISPATCH;
653
654 TARGET(SRE_OP_SUCCESS):
655 /* end of pattern */
656 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
657 if (ctx->toplevel &&
658 ((state->match_all && ptr != state->end) ||
659 (state->must_advance && ptr == state->start)))
660 {
661 RETURN_FAILURE;
662 }
663 state->ptr = ptr;
664 RETURN_SUCCESS;
665
666 TARGET(SRE_OP_AT):
667 /* match at given position */
668 /* <AT> <code> */
669 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
670 if (!SRE(at)(state, ptr, *pattern))
671 RETURN_FAILURE;
672 pattern++;
673 DISPATCH;
674
675 TARGET(SRE_OP_CATEGORY):
676 /* match at given category */
677 /* <CATEGORY> <code> */
678 TRACE(("|%p|%p|CATEGORY %d\n", pattern,
679 ptr, *pattern));
680 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
681 RETURN_FAILURE;
682 pattern++;
683 ptr++;
684 DISPATCH;
685
686 TARGET(SRE_OP_ANY):
687 /* match anything (except a newline) */
688 /* <ANY> */
689 TRACE(("|%p|%p|ANY\n", pattern, ptr));
690 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
691 RETURN_FAILURE;
692 ptr++;
693 DISPATCH;
694
695 TARGET(SRE_OP_ANY_ALL):
696 /* match anything */
697 /* <ANY_ALL> */
698 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
699 if (ptr >= end)
700 RETURN_FAILURE;
701 ptr++;
702 DISPATCH;
703
704 TARGET(SRE_OP_IN):
705 /* match set member (or non_member) */
706 /* <IN> <skip> <set> */
707 TRACE(("|%p|%p|IN\n", pattern, ptr));
708 if (ptr >= end ||
709 !SRE(charset)(state, pattern + 1, *ptr))
710 RETURN_FAILURE;
711 pattern += pattern[0];
712 ptr++;
713 DISPATCH;
714
715 TARGET(SRE_OP_LITERAL_IGNORE):
716 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
717 pattern, ptr, pattern[0]));
718 if (ptr >= end ||
719 sre_lower_ascii(*ptr) != *pattern)
720 RETURN_FAILURE;
721 pattern++;
722 ptr++;
723 DISPATCH;
724
725 TARGET(SRE_OP_LITERAL_UNI_IGNORE):
726 TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n",
727 pattern, ptr, pattern[0]));
728 if (ptr >= end ||
729 sre_lower_unicode(*ptr) != *pattern)
730 RETURN_FAILURE;
731 pattern++;
732 ptr++;
733 DISPATCH;
734
735 TARGET(SRE_OP_LITERAL_LOC_IGNORE):
736 TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
737 pattern, ptr, pattern[0]));
738 if (ptr >= end
739 || !char_loc_ignore(*pattern, *ptr))
740 RETURN_FAILURE;
741 pattern++;
742 ptr++;
743 DISPATCH;
744
745 TARGET(SRE_OP_NOT_LITERAL_IGNORE):
746 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
747 pattern, ptr, *pattern));
748 if (ptr >= end ||
749 sre_lower_ascii(*ptr) == *pattern)
750 RETURN_FAILURE;
751 pattern++;
752 ptr++;
753 DISPATCH;
754
755 TARGET(SRE_OP_NOT_LITERAL_UNI_IGNORE):
756 TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n",
757 pattern, ptr, *pattern));
758 if (ptr >= end ||
759 sre_lower_unicode(*ptr) == *pattern)
760 RETURN_FAILURE;
761 pattern++;
762 ptr++;
763 DISPATCH;
764
765 TARGET(SRE_OP_NOT_LITERAL_LOC_IGNORE):
766 TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
767 pattern, ptr, *pattern));
768 if (ptr >= end
769 || char_loc_ignore(*pattern, *ptr))
770 RETURN_FAILURE;
771 pattern++;
772 ptr++;
773 DISPATCH;
774
775 TARGET(SRE_OP_IN_IGNORE):
776 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
777 if (ptr >= end
778 || !SRE(charset)(state, pattern+1,
779 (SRE_CODE)sre_lower_ascii(*ptr)))
780 RETURN_FAILURE;
781 pattern += pattern[0];
782 ptr++;
783 DISPATCH;
784
785 TARGET(SRE_OP_IN_UNI_IGNORE):
786 TRACE(("|%p|%p|IN_UNI_IGNORE\n", pattern, ptr));
787 if (ptr >= end
788 || !SRE(charset)(state, pattern+1,
789 (SRE_CODE)sre_lower_unicode(*ptr)))
790 RETURN_FAILURE;
791 pattern += pattern[0];
792 ptr++;
793 DISPATCH;
794
795 TARGET(SRE_OP_IN_LOC_IGNORE):
796 TRACE(("|%p|%p|IN_LOC_IGNORE\n", pattern, ptr));
797 if (ptr >= end
798 || !SRE(charset_loc_ignore)(state, pattern+1, *ptr))
799 RETURN_FAILURE;
800 pattern += pattern[0];
801 ptr++;
802 DISPATCH;
803
804 TARGET(SRE_OP_JUMP):
805 TARGET(SRE_OP_INFO):
806 /* jump forward */
807 /* <JUMP> <offset> */
808 TRACE(("|%p|%p|JUMP %d\n", pattern,
809 ptr, pattern[0]));
810 pattern += pattern[0];
811 DISPATCH;
812
813 TARGET(SRE_OP_BRANCH):
814 /* alternation */
815 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
816 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
817 LASTMARK_SAVE();
818 if (state->repeat)
819 MARK_PUSH(ctx->lastmark);
820 for (; pattern[0]; pattern += pattern[0]) {
821 if (pattern[1] == SRE_OP_LITERAL &&
822 (ptr >= end ||
823 (SRE_CODE) *ptr != pattern[2]))
824 continue;
825 if (pattern[1] == SRE_OP_IN &&
826 (ptr >= end ||
827 !SRE(charset)(state, pattern + 3,
828 (SRE_CODE) *ptr)))
829 continue;
830 state->ptr = ptr;
831 DO_JUMP(JUMP_BRANCH, jump_branch, pattern+1);
832 if (ret) {
833 if (state->repeat)
834 MARK_POP_DISCARD(ctx->lastmark);
835 RETURN_ON_ERROR(ret);
836 RETURN_SUCCESS;
837 }
838 if (state->repeat)
839 MARK_POP_KEEP(ctx->lastmark);
840 LASTMARK_RESTORE();
841 }
842 if (state->repeat)
843 MARK_POP_DISCARD(ctx->lastmark);
844 RETURN_FAILURE;
845
846 TARGET(SRE_OP_REPEAT_ONE):
847 /* match repeated sequence (maximizing regexp) */
848
849 /* this operator only works if the repeated item is
850 exactly one character wide, and we're not already
851 collecting backtracking points. for other cases,
852 use the MAX_REPEAT operator */
853
854 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
855
856 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
857 pattern[1], pattern[2]));
858
859 if ((Py_ssize_t) pattern[1] > end - ptr)
860 RETURN_FAILURE; /* cannot match */
861
862 state->ptr = ptr;
863
864 ret = SRE(count)(state, pattern+3, pattern[2]);
865 RETURN_ON_ERROR(ret);
866 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
867 ctx->count = ret;
868 ptr += ctx->count;
869
870 /* when we arrive here, count contains the number of
871 matches, and ptr points to the tail of the target
872 string. check if the rest of the pattern matches,
873 and backtrack if not. */
874
875 if (ctx->count < (Py_ssize_t) pattern[1])
876 RETURN_FAILURE;
877
878 if (pattern[pattern[0]] == SRE_OP_SUCCESS &&
879 ptr == state->end &&
880 !(ctx->toplevel && state->must_advance && ptr == state->start))
881 {
882 /* tail is empty. we're finished */
883 state->ptr = ptr;
884 RETURN_SUCCESS;
885 }
886
887 LASTMARK_SAVE();
888 if (state->repeat)
889 MARK_PUSH(ctx->lastmark);
890
891 if (pattern[pattern[0]] == SRE_OP_LITERAL) {
892 /* tail starts with a literal. skip positions where
893 the rest of the pattern cannot possibly match */
894 ctx->u.chr = pattern[pattern[0]+1];
895 for (;;) {
896 while (ctx->count >= (Py_ssize_t) pattern[1] &&
897 (ptr >= end || *ptr != ctx->u.chr)) {
898 ptr--;
899 ctx->count--;
900 }
901 if (ctx->count < (Py_ssize_t) pattern[1])
902 break;
903 state->ptr = ptr;
904 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
905 pattern+pattern[0]);
906 if (ret) {
907 if (state->repeat)
908 MARK_POP_DISCARD(ctx->lastmark);
909 RETURN_ON_ERROR(ret);
910 RETURN_SUCCESS;
911 }
912 if (state->repeat)
913 MARK_POP_KEEP(ctx->lastmark);
914 LASTMARK_RESTORE();
915
916 ptr--;
917 ctx->count--;
918 }
919 if (state->repeat)
920 MARK_POP_DISCARD(ctx->lastmark);
921 } else {
922 /* general case */
923 while (ctx->count >= (Py_ssize_t) pattern[1]) {
924 state->ptr = ptr;
925 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
926 pattern+pattern[0]);
927 if (ret) {
928 if (state->repeat)
929 MARK_POP_DISCARD(ctx->lastmark);
930 RETURN_ON_ERROR(ret);
931 RETURN_SUCCESS;
932 }
933 if (state->repeat)
934 MARK_POP_KEEP(ctx->lastmark);
935 LASTMARK_RESTORE();
936
937 ptr--;
938 ctx->count--;
939 }
940 if (state->repeat)
941 MARK_POP_DISCARD(ctx->lastmark);
942 }
943 RETURN_FAILURE;
944
945 TARGET(SRE_OP_MIN_REPEAT_ONE):
946 /* match repeated sequence (minimizing regexp) */
947
948 /* this operator only works if the repeated item is
949 exactly one character wide, and we're not already
950 collecting backtracking points. for other cases,
951 use the MIN_REPEAT operator */
952
953 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
954
955 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", pattern, ptr,
956 pattern[1], pattern[2]));
957
958 if ((Py_ssize_t) pattern[1] > end - ptr)
959 RETURN_FAILURE; /* cannot match */
960
961 state->ptr = ptr;
962
963 if (pattern[1] == 0)
964 ctx->count = 0;
965 else {
966 /* count using pattern min as the maximum */
967 ret = SRE(count)(state, pattern+3, pattern[1]);
968 RETURN_ON_ERROR(ret);
969 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
970 if (ret < (Py_ssize_t) pattern[1])
971 /* didn't match minimum number of times */
972 RETURN_FAILURE;
973 /* advance past minimum matches of repeat */
974 ctx->count = ret;
975 ptr += ctx->count;
976 }
977
978 if (pattern[pattern[0]] == SRE_OP_SUCCESS &&
979 !(ctx->toplevel &&
980 ((state->match_all && ptr != state->end) ||
981 (state->must_advance && ptr == state->start))))
982 {
983 /* tail is empty. we're finished */
984 state->ptr = ptr;
985 RETURN_SUCCESS;
986
987 } else {
988 /* general case */
989 LASTMARK_SAVE();
990 if (state->repeat)
991 MARK_PUSH(ctx->lastmark);
992
993 while ((Py_ssize_t)pattern[2] == SRE_MAXREPEAT
994 || ctx->count <= (Py_ssize_t)pattern[2]) {
995 state->ptr = ptr;
996 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
997 pattern+pattern[0]);
998 if (ret) {
999 if (state->repeat)
1000 MARK_POP_DISCARD(ctx->lastmark);
1001 RETURN_ON_ERROR(ret);
1002 RETURN_SUCCESS;
1003 }
1004 if (state->repeat)
1005 MARK_POP_KEEP(ctx->lastmark);
1006 LASTMARK_RESTORE();
1007
1008 state->ptr = ptr;
1009 ret = SRE(count)(state, pattern+3, 1);
1010 RETURN_ON_ERROR(ret);
1011 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
1012 if (ret == 0)
1013 break;
1014 assert(ret == 1);
1015 ptr++;
1016 ctx->count++;
1017 }
1018 if (state->repeat)
1019 MARK_POP_DISCARD(ctx->lastmark);
1020 }
1021 RETURN_FAILURE;
1022
1023 TARGET(SRE_OP_POSSESSIVE_REPEAT_ONE):
1024 /* match repeated sequence (maximizing regexp) without
1025 backtracking */
1026
1027 /* this operator only works if the repeated item is
1028 exactly one character wide, and we're not already
1029 collecting backtracking points. for other cases,
1030 use the MAX_REPEAT operator */
1031
1032 /* <POSSESSIVE_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS>
1033 tail */
1034
1035 TRACE(("|%p|%p|POSSESSIVE_REPEAT_ONE %d %d\n", pattern,
1036 ptr, pattern[1], pattern[2]));
1037
1038 if (ptr + pattern[1] > end) {
1039 RETURN_FAILURE; /* cannot match */
1040 }
1041
1042 state->ptr = ptr;
1043
1044 ret = SRE(count)(state, pattern + 3, pattern[2]);
1045 RETURN_ON_ERROR(ret);
1046 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
1047 ctx->count = ret;
1048 ptr += ctx->count;
1049
1050 /* when we arrive here, count contains the number of
1051 matches, and ptr points to the tail of the target
1052 string. check if the rest of the pattern matches,
1053 and fail if not. */
1054
1055 /* Test for not enough repetitions in match */
1056 if (ctx->count < (Py_ssize_t) pattern[1]) {
1057 RETURN_FAILURE;
1058 }
1059
1060 /* Update the pattern to point to the next op code */
1061 pattern += pattern[0];
1062
1063 /* Let the tail be evaluated separately and consider this
1064 match successful. */
1065 if (*pattern == SRE_OP_SUCCESS &&
1066 ptr == state->end &&
1067 !(ctx->toplevel && state->must_advance && ptr == state->start))
1068 {
1069 /* tail is empty. we're finished */
1070 state->ptr = ptr;
1071 RETURN_SUCCESS;
1072 }
1073
1074 /* Attempt to match the rest of the string */
1075 DISPATCH;
1076
1077 TARGET(SRE_OP_REPEAT):
1078 /* create repeat context. all the hard work is done
1079 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1080 /* <REPEAT> <skip> <1=min> <2=max>
1081 <3=repeat_index> item <UNTIL> tail */
1082 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
1083 pattern[1], pattern[2]));
1084
1085 /* install new repeat context */
1086 /* TODO(https://github.com/python/cpython/issues/67877): Fix this
1087 * potential memory leak. */
1088 ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep));
1089 if (!ctx->u.rep) {
1090 PyErr_NoMemory();
1091 RETURN_FAILURE;
1092 }
1093 ctx->u.rep->count = -1;
1094 ctx->u.rep->pattern = pattern;
1095 ctx->u.rep->prev = state->repeat;
1096 ctx->u.rep->last_ptr = NULL;
1097 state->repeat = ctx->u.rep;
1098
1099 state->ptr = ptr;
1100 DO_JUMP(JUMP_REPEAT, jump_repeat, pattern+pattern[0]);
1101 state->repeat = ctx->u.rep->prev;
1102 PyObject_Free(ctx->u.rep);
1103
1104 if (ret) {
1105 RETURN_ON_ERROR(ret);
1106 RETURN_SUCCESS;
1107 }
1108 RETURN_FAILURE;
1109
1110 TARGET(SRE_OP_MAX_UNTIL):
1111 /* maximizing repeat */
1112 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1113
1114 /* FIXME: we probably need to deal with zero-width
1115 matches in here... */
1116
1117 ctx->u.rep = state->repeat;
1118 if (!ctx->u.rep)
1119 RETURN_ERROR(SRE_ERROR_STATE);
1120
1121 state->ptr = ptr;
1122
1123 ctx->count = ctx->u.rep->count+1;
1124
1125 TRACE(("|%p|%p|MAX_UNTIL %zd\n", pattern,
1126 ptr, ctx->count));
1127
1128 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1129 /* not enough matches */
1130 ctx->u.rep->count = ctx->count;
1131 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1132 ctx->u.rep->pattern+3);
1133 if (ret) {
1134 RETURN_ON_ERROR(ret);
1135 RETURN_SUCCESS;
1136 }
1137 ctx->u.rep->count = ctx->count-1;
1138 state->ptr = ptr;
1139 RETURN_FAILURE;
1140 }
1141
1142 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
1143 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
1144 state->ptr != ctx->u.rep->last_ptr) {
1145 /* we may have enough matches, but if we can
1146 match another item, do so */
1147 ctx->u.rep->count = ctx->count;
1148 LASTMARK_SAVE();
1149 MARK_PUSH(ctx->lastmark);
1150 /* zero-width match protection */
1151 DATA_PUSH(&ctx->u.rep->last_ptr);
1152 ctx->u.rep->last_ptr = state->ptr;
1153 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1154 ctx->u.rep->pattern+3);
1155 DATA_POP(&ctx->u.rep->last_ptr);
1156 if (ret) {
1157 MARK_POP_DISCARD(ctx->lastmark);
1158 RETURN_ON_ERROR(ret);
1159 RETURN_SUCCESS;
1160 }
1161 MARK_POP(ctx->lastmark);
1162 LASTMARK_RESTORE();
1163 ctx->u.rep->count = ctx->count-1;
1164 state->ptr = ptr;
1165 }
1166
1167 /* cannot match more repeated items here. make sure the
1168 tail matches */
1169 state->repeat = ctx->u.rep->prev;
1170 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, pattern);
1171 state->repeat = ctx->u.rep; // restore repeat before return
1172
1173 RETURN_ON_SUCCESS(ret);
1174 state->ptr = ptr;
1175 RETURN_FAILURE;
1176
1177 TARGET(SRE_OP_MIN_UNTIL):
1178 /* minimizing repeat */
1179 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1180
1181 ctx->u.rep = state->repeat;
1182 if (!ctx->u.rep)
1183 RETURN_ERROR(SRE_ERROR_STATE);
1184
1185 state->ptr = ptr;
1186
1187 ctx->count = ctx->u.rep->count+1;
1188
1189 TRACE(("|%p|%p|MIN_UNTIL %zd %p\n", pattern,
1190 ptr, ctx->count, ctx->u.rep->pattern));
1191
1192 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1193 /* not enough matches */
1194 ctx->u.rep->count = ctx->count;
1195 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1196 ctx->u.rep->pattern+3);
1197 if (ret) {
1198 RETURN_ON_ERROR(ret);
1199 RETURN_SUCCESS;
1200 }
1201 ctx->u.rep->count = ctx->count-1;
1202 state->ptr = ptr;
1203 RETURN_FAILURE;
1204 }
1205
1206 /* see if the tail matches */
1207 state->repeat = ctx->u.rep->prev;
1208
1209 LASTMARK_SAVE();
1210 if (state->repeat)
1211 MARK_PUSH(ctx->lastmark);
1212
1213 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, pattern);
1214 SRE_REPEAT *repeat_of_tail = state->repeat;
1215 state->repeat = ctx->u.rep; // restore repeat before return
1216
1217 if (ret) {
1218 if (repeat_of_tail)
1219 MARK_POP_DISCARD(ctx->lastmark);
1220 RETURN_ON_ERROR(ret);
1221 RETURN_SUCCESS;
1222 }
1223 if (repeat_of_tail)
1224 MARK_POP(ctx->lastmark);
1225 LASTMARK_RESTORE();
1226
1227 state->ptr = ptr;
1228
1229 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
1230 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1231 state->ptr == ctx->u.rep->last_ptr)
1232 RETURN_FAILURE;
1233
1234 ctx->u.rep->count = ctx->count;
1235 /* zero-width match protection */
1236 DATA_PUSH(&ctx->u.rep->last_ptr);
1237 ctx->u.rep->last_ptr = state->ptr;
1238 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1239 ctx->u.rep->pattern+3);
1240 DATA_POP(&ctx->u.rep->last_ptr);
1241 if (ret) {
1242 RETURN_ON_ERROR(ret);
1243 RETURN_SUCCESS;
1244 }
1245 ctx->u.rep->count = ctx->count-1;
1246 state->ptr = ptr;
1247 RETURN_FAILURE;
1248
1249 TARGET(SRE_OP_POSSESSIVE_REPEAT):
1250 /* create possessive repeat contexts. */
1251 /* <POSSESSIVE_REPEAT> <skip> <1=min> <2=max> pattern
1252 <SUCCESS> tail */
1253 TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", pattern,
1254 ptr, pattern[1], pattern[2]));
1255
1256 /* Set the global Input pointer to this context's Input
1257 pointer */
1258 state->ptr = ptr;
1259
1260 /* Initialize Count to 0 */
1261 ctx->count = 0;
1262
1263 /* Check for minimum required matches. */
1264 while (ctx->count < (Py_ssize_t)pattern[1]) {
1265 /* not enough matches */
1266 DO_JUMP0(JUMP_POSS_REPEAT_1, jump_poss_repeat_1,
1267 &pattern[3]);
1268 if (ret) {
1269 RETURN_ON_ERROR(ret);
1270 ctx->count++;
1271 }
1272 else {
1273 state->ptr = ptr;
1274 RETURN_FAILURE;
1275 }
1276 }
1277
1278 /* Clear the context's Input stream pointer so that it
1279 doesn't match the global state so that the while loop can
1280 be entered. */
1281 ptr = NULL;
1282
1283 /* Keep trying to parse the <pattern> sub-pattern until the
1284 end is reached, creating a new context each time. */
1285 while ((ctx->count < (Py_ssize_t)pattern[2] ||
1286 (Py_ssize_t)pattern[2] == SRE_MAXREPEAT) &&
1287 state->ptr != ptr) {
1288 /* Save the Capture Group Marker state into the current
1289 Context and back up the current highest number
1290 Capture Group marker. */
1291 LASTMARK_SAVE();
1292 MARK_PUSH(ctx->lastmark);
1293
1294 /* zero-width match protection */
1295 /* Set the context's Input Stream pointer to be the
1296 current Input Stream pointer from the global
1297 state. When the loop reaches the next iteration,
1298 the context will then store the last known good
1299 position with the global state holding the Input
1300 Input Stream position that has been updated with
1301 the most recent match. Thus, if state's Input
1302 stream remains the same as the one stored in the
1303 current Context, we know we have successfully
1304 matched an empty string and that all subsequent
1305 matches will also be the empty string until the
1306 maximum number of matches are counted, and because
1307 of this, we could immediately stop at that point and
1308 consider this match successful. */
1309 ptr = state->ptr;
1310
1311 /* We have not reached the maximin matches, so try to
1312 match once more. */
1313 DO_JUMP0(JUMP_POSS_REPEAT_2, jump_poss_repeat_2,
1314 &pattern[3]);
1315
1316 /* Check to see if the last attempted match
1317 succeeded. */
1318 if (ret) {
1319 /* Drop the saved highest number Capture Group
1320 marker saved above and use the newly updated
1321 value. */
1322 MARK_POP_DISCARD(ctx->lastmark);
1323 RETURN_ON_ERROR(ret);
1324
1325 /* Success, increment the count. */
1326 ctx->count++;
1327 }
1328 /* Last attempted match failed. */
1329 else {
1330 /* Restore the previously saved highest number
1331 Capture Group marker since the last iteration
1332 did not match, then restore that to the global
1333 state. */
1334 MARK_POP(ctx->lastmark);
1335 LASTMARK_RESTORE();
1336
1337 /* We have sufficient matches, so exit loop. */
1338 break;
1339 }
1340 }
1341
1342 /* Evaluate Tail */
1343 /* Jump to end of pattern indicated by skip, and then skip
1344 the SUCCESS op code that follows it. */
1345 pattern += pattern[0] + 1;
1346 ptr = state->ptr;
1347 DISPATCH;
1348
1349 TARGET(SRE_OP_ATOMIC_GROUP):
1350 /* Atomic Group Sub Pattern */
1351 /* <ATOMIC_GROUP> <skip> pattern <SUCCESS> tail */
1352 TRACE(("|%p|%p|ATOMIC_GROUP\n", pattern, ptr));
1353
1354 /* Set the global Input pointer to this context's Input
1355 pointer */
1356 state->ptr = ptr;
1357
1358 /* Evaluate the Atomic Group in a new context, terminating
1359 when the end of the group, represented by a SUCCESS op
1360 code, is reached. */
1361 /* Group Pattern begins at an offset of 1 code. */
1362 DO_JUMP0(JUMP_ATOMIC_GROUP, jump_atomic_group,
1363 &pattern[1]);
1364
1365 /* Test Exit Condition */
1366 RETURN_ON_ERROR(ret);
1367
1368 if (ret == 0) {
1369 /* Atomic Group failed to Match. */
1370 state->ptr = ptr;
1371 RETURN_FAILURE;
1372 }
1373
1374 /* Evaluate Tail */
1375 /* Jump to end of pattern indicated by skip, and then skip
1376 the SUCCESS op code that follows it. */
1377 pattern += pattern[0];
1378 ptr = state->ptr;
1379 DISPATCH;
1380
1381 TARGET(SRE_OP_GROUPREF):
1382 /* match backreference */
1383 TRACE(("|%p|%p|GROUPREF %d\n", pattern,
1384 ptr, pattern[0]));
1385 {
1386 int groupref = pattern[0] * 2;
1387 if (groupref >= state->lastmark) {
1388 RETURN_FAILURE;
1389 } else {
1390 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1391 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1392 if (!p || !e || e < p)
1393 RETURN_FAILURE;
1394 while (p < e) {
1395 if (ptr >= end || *ptr != *p)
1396 RETURN_FAILURE;
1397 p++;
1398 ptr++;
1399 }
1400 }
1401 }
1402 pattern++;
1403 DISPATCH;
1404
1405 TARGET(SRE_OP_GROUPREF_IGNORE):
1406 /* match backreference */
1407 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern,
1408 ptr, pattern[0]));
1409 {
1410 int groupref = pattern[0] * 2;
1411 if (groupref >= state->lastmark) {
1412 RETURN_FAILURE;
1413 } else {
1414 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1415 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1416 if (!p || !e || e < p)
1417 RETURN_FAILURE;
1418 while (p < e) {
1419 if (ptr >= end ||
1420 sre_lower_ascii(*ptr) != sre_lower_ascii(*p))
1421 RETURN_FAILURE;
1422 p++;
1423 ptr++;
1424 }
1425 }
1426 }
1427 pattern++;
1428 DISPATCH;
1429
1430 TARGET(SRE_OP_GROUPREF_UNI_IGNORE):
1431 /* match backreference */
1432 TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", pattern,
1433 ptr, pattern[0]));
1434 {
1435 int groupref = pattern[0] * 2;
1436 if (groupref >= state->lastmark) {
1437 RETURN_FAILURE;
1438 } else {
1439 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1440 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1441 if (!p || !e || e < p)
1442 RETURN_FAILURE;
1443 while (p < e) {
1444 if (ptr >= end ||
1445 sre_lower_unicode(*ptr) != sre_lower_unicode(*p))
1446 RETURN_FAILURE;
1447 p++;
1448 ptr++;
1449 }
1450 }
1451 }
1452 pattern++;
1453 DISPATCH;
1454
1455 TARGET(SRE_OP_GROUPREF_LOC_IGNORE):
1456 /* match backreference */
1457 TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", pattern,
1458 ptr, pattern[0]));
1459 {
1460 int groupref = pattern[0] * 2;
1461 if (groupref >= state->lastmark) {
1462 RETURN_FAILURE;
1463 } else {
1464 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1465 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1466 if (!p || !e || e < p)
1467 RETURN_FAILURE;
1468 while (p < e) {
1469 if (ptr >= end ||
1470 sre_lower_locale(*ptr) != sre_lower_locale(*p))
1471 RETURN_FAILURE;
1472 p++;
1473 ptr++;
1474 }
1475 }
1476 }
1477 pattern++;
1478 DISPATCH;
1479
1480 TARGET(SRE_OP_GROUPREF_EXISTS):
1481 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", pattern,
1482 ptr, pattern[0]));
1483 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1484 {
1485 int groupref = pattern[0] * 2;
1486 if (groupref >= state->lastmark) {
1487 pattern += pattern[1];
1488 DISPATCH;
1489 } else {
1490 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1491 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1492 if (!p || !e || e < p) {
1493 pattern += pattern[1];
1494 DISPATCH;
1495 }
1496 }
1497 }
1498 pattern += 2;
1499 DISPATCH;
1500
1501 TARGET(SRE_OP_ASSERT):
1502 /* assert subpattern */
1503 /* <ASSERT> <skip> <back> <pattern> */
1504 TRACE(("|%p|%p|ASSERT %d\n", pattern,
1505 ptr, pattern[1]));
1506 if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1])
1507 RETURN_FAILURE;
1508 state->ptr = ptr - pattern[1];
1509 DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
1510 RETURN_ON_FAILURE(ret);
1511 pattern += pattern[0];
1512 DISPATCH;
1513
1514 TARGET(SRE_OP_ASSERT_NOT):
1515 /* assert not subpattern */
1516 /* <ASSERT_NOT> <skip> <back> <pattern> */
1517 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
1518 ptr, pattern[1]));
1519 if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) {
1520 state->ptr = ptr - pattern[1];
1521 LASTMARK_SAVE();
1522 if (state->repeat)
1523 MARK_PUSH(ctx->lastmark);
1524
1525 DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, pattern+2);
1526 if (ret) {
1527 if (state->repeat)
1528 MARK_POP_DISCARD(ctx->lastmark);
1529 RETURN_ON_ERROR(ret);
1530 RETURN_FAILURE;
1531 }
1532 if (state->repeat)
1533 MARK_POP(ctx->lastmark);
1534 LASTMARK_RESTORE();
1535 }
1536 pattern += pattern[0];
1537 DISPATCH;
1538
1539 TARGET(SRE_OP_FAILURE):
1540 /* immediate failure */
1541 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
1542 RETURN_FAILURE;
1543
1544 #if !USE_COMPUTED_GOTOS
1545 default:
1546 #endif
1547 // Also any unused opcodes:
1548 TARGET(SRE_OP_RANGE_UNI_IGNORE):
1549 TARGET(SRE_OP_SUBPATTERN):
1550 TARGET(SRE_OP_RANGE):
1551 TARGET(SRE_OP_NEGATE):
1552 TARGET(SRE_OP_BIGCHARSET):
1553 TARGET(SRE_OP_CHARSET):
1554 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr,
1555 pattern[-1]));
1556 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1557
1558 }
1559
1560 exit:
1561 ctx_pos = ctx->last_ctx_pos;
1562 jump = ctx->jump;
1563 DATA_POP_DISCARD(ctx);
1564 if (ctx_pos == -1)
1565 return ret;
1566 DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos);
1567
1568 switch (jump) {
1569 case JUMP_MAX_UNTIL_2:
1570 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", pattern, ptr));
1571 goto jump_max_until_2;
1572 case JUMP_MAX_UNTIL_3:
1573 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", pattern, ptr));
1574 goto jump_max_until_3;
1575 case JUMP_MIN_UNTIL_2:
1576 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", pattern, ptr));
1577 goto jump_min_until_2;
1578 case JUMP_MIN_UNTIL_3:
1579 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", pattern, ptr));
1580 goto jump_min_until_3;
1581 case JUMP_BRANCH:
1582 TRACE(("|%p|%p|JUMP_BRANCH\n", pattern, ptr));
1583 goto jump_branch;
1584 case JUMP_MAX_UNTIL_1:
1585 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", pattern, ptr));
1586 goto jump_max_until_1;
1587 case JUMP_MIN_UNTIL_1:
1588 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", pattern, ptr));
1589 goto jump_min_until_1;
1590 case JUMP_POSS_REPEAT_1:
1591 TRACE(("|%p|%p|JUMP_POSS_REPEAT_1\n", pattern, ptr));
1592 goto jump_poss_repeat_1;
1593 case JUMP_POSS_REPEAT_2:
1594 TRACE(("|%p|%p|JUMP_POSS_REPEAT_2\n", pattern, ptr));
1595 goto jump_poss_repeat_2;
1596 case JUMP_REPEAT:
1597 TRACE(("|%p|%p|JUMP_REPEAT\n", pattern, ptr));
1598 goto jump_repeat;
1599 case JUMP_REPEAT_ONE_1:
1600 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", pattern, ptr));
1601 goto jump_repeat_one_1;
1602 case JUMP_REPEAT_ONE_2:
1603 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", pattern, ptr));
1604 goto jump_repeat_one_2;
1605 case JUMP_MIN_REPEAT_ONE:
1606 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", pattern, ptr));
1607 goto jump_min_repeat_one;
1608 case JUMP_ATOMIC_GROUP:
1609 TRACE(("|%p|%p|JUMP_ATOMIC_GROUP\n", pattern, ptr));
1610 goto jump_atomic_group;
1611 case JUMP_ASSERT:
1612 TRACE(("|%p|%p|JUMP_ASSERT\n", pattern, ptr));
1613 goto jump_assert;
1614 case JUMP_ASSERT_NOT:
1615 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", pattern, ptr));
1616 goto jump_assert_not;
1617 case JUMP_NONE:
1618 TRACE(("|%p|%p|RETURN %zd\n", pattern,
1619 ptr, ret));
1620 break;
1621 }
1622
1623 return ret; /* should never get here */
1624 }
1625
1626 /* need to reset capturing groups between two SRE(match) callings in loops */
1627 #define RESET_CAPTURE_GROUP() \
1628 do { state->lastmark = state->lastindex = -1; } while (0)
1629
1630 LOCAL(Py_ssize_t)
SRE(search)1631 SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
1632 {
1633 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1634 SRE_CHAR* end = (SRE_CHAR *)state->end;
1635 Py_ssize_t status = 0;
1636 Py_ssize_t prefix_len = 0;
1637 Py_ssize_t prefix_skip = 0;
1638 SRE_CODE* prefix = NULL;
1639 SRE_CODE* charset = NULL;
1640 SRE_CODE* overlap = NULL;
1641 int flags = 0;
1642
1643 if (ptr > end)
1644 return 0;
1645
1646 if (pattern[0] == SRE_OP_INFO) {
1647 /* optimization info block */
1648 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1649
1650 flags = pattern[2];
1651
1652 if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1653 TRACE(("reject (got %u chars, need %u)\n",
1654 (unsigned int)(end - ptr), pattern[3]));
1655 return 0;
1656 }
1657 if (pattern[3] > 1) {
1658 /* adjust end point (but make sure we leave at least one
1659 character in there, so literal search will work) */
1660 end -= pattern[3] - 1;
1661 if (end <= ptr)
1662 end = ptr;
1663 }
1664
1665 if (flags & SRE_INFO_PREFIX) {
1666 /* pattern starts with a known prefix */
1667 /* <length> <skip> <prefix data> <overlap data> */
1668 prefix_len = pattern[5];
1669 prefix_skip = pattern[6];
1670 prefix = pattern + 7;
1671 overlap = prefix + prefix_len - 1;
1672 } else if (flags & SRE_INFO_CHARSET)
1673 /* pattern starts with a character from a known set */
1674 /* <charset> */
1675 charset = pattern + 5;
1676
1677 pattern += 1 + pattern[1];
1678 }
1679
1680 TRACE(("prefix = %p %zd %zd\n",
1681 prefix, prefix_len, prefix_skip));
1682 TRACE(("charset = %p\n", charset));
1683
1684 if (prefix_len == 1) {
1685 /* pattern starts with a literal character */
1686 SRE_CHAR c = (SRE_CHAR) prefix[0];
1687 #if SIZEOF_SRE_CHAR < 4
1688 if ((SRE_CODE) c != prefix[0])
1689 return 0; /* literal can't match: doesn't fit in char width */
1690 #endif
1691 end = (SRE_CHAR *)state->end;
1692 state->must_advance = 0;
1693 while (ptr < end) {
1694 while (*ptr != c) {
1695 if (++ptr >= end)
1696 return 0;
1697 }
1698 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1699 state->start = ptr;
1700 state->ptr = ptr + prefix_skip;
1701 if (flags & SRE_INFO_LITERAL)
1702 return 1; /* we got all of it */
1703 status = SRE(match)(state, pattern + 2*prefix_skip, 0);
1704 if (status != 0)
1705 return status;
1706 ++ptr;
1707 RESET_CAPTURE_GROUP();
1708 }
1709 return 0;
1710 }
1711
1712 if (prefix_len > 1) {
1713 /* pattern starts with a known prefix. use the overlap
1714 table to skip forward as fast as we possibly can */
1715 Py_ssize_t i = 0;
1716
1717 end = (SRE_CHAR *)state->end;
1718 if (prefix_len > end - ptr)
1719 return 0;
1720 #if SIZEOF_SRE_CHAR < 4
1721 for (i = 0; i < prefix_len; i++)
1722 if ((SRE_CODE)(SRE_CHAR) prefix[i] != prefix[i])
1723 return 0; /* literal can't match: doesn't fit in char width */
1724 #endif
1725 while (ptr < end) {
1726 SRE_CHAR c = (SRE_CHAR) prefix[0];
1727 while (*ptr++ != c) {
1728 if (ptr >= end)
1729 return 0;
1730 }
1731 if (ptr >= end)
1732 return 0;
1733
1734 i = 1;
1735 state->must_advance = 0;
1736 do {
1737 if (*ptr == (SRE_CHAR) prefix[i]) {
1738 if (++i != prefix_len) {
1739 if (++ptr >= end)
1740 return 0;
1741 continue;
1742 }
1743 /* found a potential match */
1744 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1745 state->start = ptr - (prefix_len - 1);
1746 state->ptr = ptr - (prefix_len - prefix_skip - 1);
1747 if (flags & SRE_INFO_LITERAL)
1748 return 1; /* we got all of it */
1749 status = SRE(match)(state, pattern + 2*prefix_skip, 0);
1750 if (status != 0)
1751 return status;
1752 /* close but no cigar -- try again */
1753 if (++ptr >= end)
1754 return 0;
1755 RESET_CAPTURE_GROUP();
1756 }
1757 i = overlap[i];
1758 } while (i != 0);
1759 }
1760 return 0;
1761 }
1762
1763 if (charset) {
1764 /* pattern starts with a character from a known set */
1765 end = (SRE_CHAR *)state->end;
1766 state->must_advance = 0;
1767 for (;;) {
1768 while (ptr < end && !SRE(charset)(state, charset, *ptr))
1769 ptr++;
1770 if (ptr >= end)
1771 return 0;
1772 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1773 state->start = ptr;
1774 state->ptr = ptr;
1775 status = SRE(match)(state, pattern, 0);
1776 if (status != 0)
1777 break;
1778 ptr++;
1779 RESET_CAPTURE_GROUP();
1780 }
1781 } else {
1782 /* general case */
1783 assert(ptr <= end);
1784 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1785 state->start = state->ptr = ptr;
1786 status = SRE(match)(state, pattern, 1);
1787 state->must_advance = 0;
1788 if (status == 0 && pattern[0] == SRE_OP_AT &&
1789 (pattern[1] == SRE_AT_BEGINNING ||
1790 pattern[1] == SRE_AT_BEGINNING_STRING))
1791 {
1792 state->start = state->ptr = ptr = end;
1793 return 0;
1794 }
1795 while (status == 0 && ptr < end) {
1796 ptr++;
1797 RESET_CAPTURE_GROUP();
1798 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1799 state->start = state->ptr = ptr;
1800 status = SRE(match)(state, pattern, 0);
1801 }
1802 }
1803
1804 return status;
1805 }
1806
1807 #undef SRE_CHAR
1808 #undef SIZEOF_SRE_CHAR
1809 #undef SRE
1810
1811 /* vim:ts=4:sw=4:et
1812 */
1813