xref: /aosp_15_r20/prebuilts/build-tools/common/py3-stdlib/re/_compiler.py (revision cda5da8d549138a6648c5ee6d7a49cf8f4a657be)
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert template to internal format
5#
6# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the __init__.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13import _sre
14from . import _parser
15from ._constants import *
16from ._casefix import _EXTRA_CASES
17
18assert _sre.MAGIC == MAGIC, "SRE module mismatch"
19
20_LITERAL_CODES = {LITERAL, NOT_LITERAL}
21_SUCCESS_CODES = {SUCCESS, FAILURE}
22_ASSERT_CODES = {ASSERT, ASSERT_NOT}
23_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
24
25_REPEATING_CODES = {
26    MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
27    MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
28    POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
29}
30
31def _combine_flags(flags, add_flags, del_flags,
32                   TYPE_FLAGS=_parser.TYPE_FLAGS):
33    if add_flags & TYPE_FLAGS:
34        flags &= ~TYPE_FLAGS
35    return (flags | add_flags) & ~del_flags
36
37def _compile(code, pattern, flags):
38    # internal: compile a (sub)pattern
39    emit = code.append
40    _len = len
41    LITERAL_CODES = _LITERAL_CODES
42    REPEATING_CODES = _REPEATING_CODES
43    SUCCESS_CODES = _SUCCESS_CODES
44    ASSERT_CODES = _ASSERT_CODES
45    iscased = None
46    tolower = None
47    fixes = None
48    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
49        if flags & SRE_FLAG_UNICODE:
50            iscased = _sre.unicode_iscased
51            tolower = _sre.unicode_tolower
52            fixes = _EXTRA_CASES
53        else:
54            iscased = _sre.ascii_iscased
55            tolower = _sre.ascii_tolower
56    for op, av in pattern:
57        if op in LITERAL_CODES:
58            if not flags & SRE_FLAG_IGNORECASE:
59                emit(op)
60                emit(av)
61            elif flags & SRE_FLAG_LOCALE:
62                emit(OP_LOCALE_IGNORE[op])
63                emit(av)
64            elif not iscased(av):
65                emit(op)
66                emit(av)
67            else:
68                lo = tolower(av)
69                if not fixes:  # ascii
70                    emit(OP_IGNORE[op])
71                    emit(lo)
72                elif lo not in fixes:
73                    emit(OP_UNICODE_IGNORE[op])
74                    emit(lo)
75                else:
76                    emit(IN_UNI_IGNORE)
77                    skip = _len(code); emit(0)
78                    if op is NOT_LITERAL:
79                        emit(NEGATE)
80                    for k in (lo,) + fixes[lo]:
81                        emit(LITERAL)
82                        emit(k)
83                    emit(FAILURE)
84                    code[skip] = _len(code) - skip
85        elif op is IN:
86            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
87            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
88                emit(IN_LOC_IGNORE)
89            elif not hascased:
90                emit(IN)
91            elif not fixes:  # ascii
92                emit(IN_IGNORE)
93            else:
94                emit(IN_UNI_IGNORE)
95            skip = _len(code); emit(0)
96            _compile_charset(charset, flags, code)
97            code[skip] = _len(code) - skip
98        elif op is ANY:
99            if flags & SRE_FLAG_DOTALL:
100                emit(ANY_ALL)
101            else:
102                emit(ANY)
103        elif op in REPEATING_CODES:
104            if flags & SRE_FLAG_TEMPLATE:
105                raise error("internal: unsupported template operator %r" % (op,))
106            if _simple(av[2]):
107                emit(REPEATING_CODES[op][2])
108                skip = _len(code); emit(0)
109                emit(av[0])
110                emit(av[1])
111                _compile(code, av[2], flags)
112                emit(SUCCESS)
113                code[skip] = _len(code) - skip
114            else:
115                emit(REPEATING_CODES[op][0])
116                skip = _len(code); emit(0)
117                emit(av[0])
118                emit(av[1])
119                _compile(code, av[2], flags)
120                code[skip] = _len(code) - skip
121                emit(REPEATING_CODES[op][1])
122        elif op is SUBPATTERN:
123            group, add_flags, del_flags, p = av
124            if group:
125                emit(MARK)
126                emit((group-1)*2)
127            # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
128            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
129            if group:
130                emit(MARK)
131                emit((group-1)*2+1)
132        elif op is ATOMIC_GROUP:
133            # Atomic Groups are handled by starting with an Atomic
134            # Group op code, then putting in the atomic group pattern
135            # and finally a success op code to tell any repeat
136            # operations within the Atomic Group to stop eating and
137            # pop their stack if they reach it
138            emit(ATOMIC_GROUP)
139            skip = _len(code); emit(0)
140            _compile(code, av, flags)
141            emit(SUCCESS)
142            code[skip] = _len(code) - skip
143        elif op in SUCCESS_CODES:
144            emit(op)
145        elif op in ASSERT_CODES:
146            emit(op)
147            skip = _len(code); emit(0)
148            if av[0] >= 0:
149                emit(0) # look ahead
150            else:
151                lo, hi = av[1].getwidth()
152                if lo != hi:
153                    raise error("look-behind requires fixed-width pattern")
154                emit(lo) # look behind
155            _compile(code, av[1], flags)
156            emit(SUCCESS)
157            code[skip] = _len(code) - skip
158        elif op is AT:
159            emit(op)
160            if flags & SRE_FLAG_MULTILINE:
161                av = AT_MULTILINE.get(av, av)
162            if flags & SRE_FLAG_LOCALE:
163                av = AT_LOCALE.get(av, av)
164            elif flags & SRE_FLAG_UNICODE:
165                av = AT_UNICODE.get(av, av)
166            emit(av)
167        elif op is BRANCH:
168            emit(op)
169            tail = []
170            tailappend = tail.append
171            for av in av[1]:
172                skip = _len(code); emit(0)
173                # _compile_info(code, av, flags)
174                _compile(code, av, flags)
175                emit(JUMP)
176                tailappend(_len(code)); emit(0)
177                code[skip] = _len(code) - skip
178            emit(FAILURE) # end of branch
179            for tail in tail:
180                code[tail] = _len(code) - tail
181        elif op is CATEGORY:
182            emit(op)
183            if flags & SRE_FLAG_LOCALE:
184                av = CH_LOCALE[av]
185            elif flags & SRE_FLAG_UNICODE:
186                av = CH_UNICODE[av]
187            emit(av)
188        elif op is GROUPREF:
189            if not flags & SRE_FLAG_IGNORECASE:
190                emit(op)
191            elif flags & SRE_FLAG_LOCALE:
192                emit(GROUPREF_LOC_IGNORE)
193            elif not fixes:  # ascii
194                emit(GROUPREF_IGNORE)
195            else:
196                emit(GROUPREF_UNI_IGNORE)
197            emit(av-1)
198        elif op is GROUPREF_EXISTS:
199            emit(op)
200            emit(av[0]-1)
201            skipyes = _len(code); emit(0)
202            _compile(code, av[1], flags)
203            if av[2]:
204                emit(JUMP)
205                skipno = _len(code); emit(0)
206                code[skipyes] = _len(code) - skipyes + 1
207                _compile(code, av[2], flags)
208                code[skipno] = _len(code) - skipno
209            else:
210                code[skipyes] = _len(code) - skipyes + 1
211        else:
212            raise error("internal: unsupported operand type %r" % (op,))
213
214def _compile_charset(charset, flags, code):
215    # compile charset subprogram
216    emit = code.append
217    for op, av in charset:
218        emit(op)
219        if op is NEGATE:
220            pass
221        elif op is LITERAL:
222            emit(av)
223        elif op is RANGE or op is RANGE_UNI_IGNORE:
224            emit(av[0])
225            emit(av[1])
226        elif op is CHARSET:
227            code.extend(av)
228        elif op is BIGCHARSET:
229            code.extend(av)
230        elif op is CATEGORY:
231            if flags & SRE_FLAG_LOCALE:
232                emit(CH_LOCALE[av])
233            elif flags & SRE_FLAG_UNICODE:
234                emit(CH_UNICODE[av])
235            else:
236                emit(av)
237        else:
238            raise error("internal: unsupported set operator %r" % (op,))
239    emit(FAILURE)
240
241def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
242    # internal: optimize character set
243    out = []
244    tail = []
245    charmap = bytearray(256)
246    hascased = False
247    for op, av in charset:
248        while True:
249            try:
250                if op is LITERAL:
251                    if fixup:
252                        lo = fixup(av)
253                        charmap[lo] = 1
254                        if fixes and lo in fixes:
255                            for k in fixes[lo]:
256                                charmap[k] = 1
257                        if not hascased and iscased(av):
258                            hascased = True
259                    else:
260                        charmap[av] = 1
261                elif op is RANGE:
262                    r = range(av[0], av[1]+1)
263                    if fixup:
264                        if fixes:
265                            for i in map(fixup, r):
266                                charmap[i] = 1
267                                if i in fixes:
268                                    for k in fixes[i]:
269                                        charmap[k] = 1
270                        else:
271                            for i in map(fixup, r):
272                                charmap[i] = 1
273                        if not hascased:
274                            hascased = any(map(iscased, r))
275                    else:
276                        for i in r:
277                            charmap[i] = 1
278                elif op is NEGATE:
279                    out.append((op, av))
280                else:
281                    tail.append((op, av))
282            except IndexError:
283                if len(charmap) == 256:
284                    # character set contains non-UCS1 character codes
285                    charmap += b'\0' * 0xff00
286                    continue
287                # Character set contains non-BMP character codes.
288                # For range, all BMP characters in the range are already
289                # proceeded.
290                if fixup:
291                    hascased = True
292                    # For now, IN_UNI_IGNORE+LITERAL and
293                    # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
294                    # characters, because two characters (at least one of
295                    # which is not in the BMP) match case-insensitively
296                    # if and only if:
297                    # 1) c1.lower() == c2.lower()
298                    # 2) c1.lower() == c2 or c1.lower().upper() == c2
299                    # Also, both c.lower() and c.lower().upper() are single
300                    # characters for every non-BMP character.
301                    if op is RANGE:
302                        op = RANGE_UNI_IGNORE
303                tail.append((op, av))
304            break
305
306    # compress character map
307    runs = []
308    q = 0
309    while True:
310        p = charmap.find(1, q)
311        if p < 0:
312            break
313        if len(runs) >= 2:
314            runs = None
315            break
316        q = charmap.find(0, p)
317        if q < 0:
318            runs.append((p, len(charmap)))
319            break
320        runs.append((p, q))
321    if runs is not None:
322        # use literal/range
323        for p, q in runs:
324            if q - p == 1:
325                out.append((LITERAL, p))
326            else:
327                out.append((RANGE, (p, q - 1)))
328        out += tail
329        # if the case was changed or new representation is more compact
330        if hascased or len(out) < len(charset):
331            return out, hascased
332        # else original character set is good enough
333        return charset, hascased
334
335    # use bitmap
336    if len(charmap) == 256:
337        data = _mk_bitmap(charmap)
338        out.append((CHARSET, data))
339        out += tail
340        return out, hascased
341
342    # To represent a big charset, first a bitmap of all characters in the
343    # set is constructed. Then, this bitmap is sliced into chunks of 256
344    # characters, duplicate chunks are eliminated, and each chunk is
345    # given a number. In the compiled expression, the charset is
346    # represented by a 32-bit word sequence, consisting of one word for
347    # the number of different chunks, a sequence of 256 bytes (64 words)
348    # of chunk numbers indexed by their original chunk position, and a
349    # sequence of 256-bit chunks (8 words each).
350
351    # Compression is normally good: in a typical charset, large ranges of
352    # Unicode will be either completely excluded (e.g. if only cyrillic
353    # letters are to be matched), or completely included (e.g. if large
354    # subranges of Kanji match). These ranges will be represented by
355    # chunks of all one-bits or all zero-bits.
356
357    # Matching can be also done efficiently: the more significant byte of
358    # the Unicode character is an index into the chunk number, and the
359    # less significant byte is a bit index in the chunk (just like the
360    # CHARSET matching).
361
362    charmap = bytes(charmap) # should be hashable
363    comps = {}
364    mapping = bytearray(256)
365    block = 0
366    data = bytearray()
367    for i in range(0, 65536, 256):
368        chunk = charmap[i: i + 256]
369        if chunk in comps:
370            mapping[i // 256] = comps[chunk]
371        else:
372            mapping[i // 256] = comps[chunk] = block
373            block += 1
374            data += chunk
375    data = _mk_bitmap(data)
376    data[0:0] = [block] + _bytes_to_codes(mapping)
377    out.append((BIGCHARSET, data))
378    out += tail
379    return out, hascased
380
381_CODEBITS = _sre.CODESIZE * 8
382MAXCODE = (1 << _CODEBITS) - 1
383_BITS_TRANS = b'0' + b'1' * 255
384def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
385    s = bits.translate(_BITS_TRANS)[::-1]
386    return [_int(s[i - _CODEBITS: i], 2)
387            for i in range(len(s), 0, -_CODEBITS)]
388
389def _bytes_to_codes(b):
390    # Convert block indices to word array
391    a = memoryview(b).cast('I')
392    assert a.itemsize == _sre.CODESIZE
393    assert len(a) * a.itemsize == len(b)
394    return a.tolist()
395
396def _simple(p):
397    # check if this subpattern is a "simple" operator
398    if len(p) != 1:
399        return False
400    op, av = p[0]
401    if op is SUBPATTERN:
402        return av[0] is None and _simple(av[-1])
403    return op in _UNIT_CODES
404
405def _generate_overlap_table(prefix):
406    """
407    Generate an overlap table for the following prefix.
408    An overlap table is a table of the same size as the prefix which
409    informs about the potential self-overlap for each index in the prefix:
410    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
411    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
412      prefix[0:k]
413    """
414    table = [0] * len(prefix)
415    for i in range(1, len(prefix)):
416        idx = table[i - 1]
417        while prefix[i] != prefix[idx]:
418            if idx == 0:
419                table[i] = 0
420                break
421            idx = table[idx - 1]
422        else:
423            table[i] = idx + 1
424    return table
425
426def _get_iscased(flags):
427    if not flags & SRE_FLAG_IGNORECASE:
428        return None
429    elif flags & SRE_FLAG_UNICODE:
430        return _sre.unicode_iscased
431    else:
432        return _sre.ascii_iscased
433
434def _get_literal_prefix(pattern, flags):
435    # look for literal prefix
436    prefix = []
437    prefixappend = prefix.append
438    prefix_skip = None
439    iscased = _get_iscased(flags)
440    for op, av in pattern.data:
441        if op is LITERAL:
442            if iscased and iscased(av):
443                break
444            prefixappend(av)
445        elif op is SUBPATTERN:
446            group, add_flags, del_flags, p = av
447            flags1 = _combine_flags(flags, add_flags, del_flags)
448            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
449                break
450            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
451            if prefix_skip is None:
452                if group is not None:
453                    prefix_skip = len(prefix)
454                elif prefix_skip1 is not None:
455                    prefix_skip = len(prefix) + prefix_skip1
456            prefix.extend(prefix1)
457            if not got_all:
458                break
459        else:
460            break
461    else:
462        return prefix, prefix_skip, True
463    return prefix, prefix_skip, False
464
465def _get_charset_prefix(pattern, flags):
466    while True:
467        if not pattern.data:
468            return None
469        op, av = pattern.data[0]
470        if op is not SUBPATTERN:
471            break
472        group, add_flags, del_flags, pattern = av
473        flags = _combine_flags(flags, add_flags, del_flags)
474        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
475            return None
476
477    iscased = _get_iscased(flags)
478    if op is LITERAL:
479        if iscased and iscased(av):
480            return None
481        return [(op, av)]
482    elif op is BRANCH:
483        charset = []
484        charsetappend = charset.append
485        for p in av[1]:
486            if not p:
487                return None
488            op, av = p[0]
489            if op is LITERAL and not (iscased and iscased(av)):
490                charsetappend((op, av))
491            else:
492                return None
493        return charset
494    elif op is IN:
495        charset = av
496        if iscased:
497            for op, av in charset:
498                if op is LITERAL:
499                    if iscased(av):
500                        return None
501                elif op is RANGE:
502                    if av[1] > 0xffff:
503                        return None
504                    if any(map(iscased, range(av[0], av[1]+1))):
505                        return None
506        return charset
507    return None
508
509def _compile_info(code, pattern, flags):
510    # internal: compile an info block.  in the current version,
511    # this contains min/max pattern width, and an optional literal
512    # prefix or a character map
513    lo, hi = pattern.getwidth()
514    if hi > MAXCODE:
515        hi = MAXCODE
516    if lo == 0:
517        code.extend([INFO, 4, 0, lo, hi])
518        return
519    # look for a literal prefix
520    prefix = []
521    prefix_skip = 0
522    charset = [] # not used
523    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
524        # look for literal prefix
525        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
526        # if no prefix, look for charset prefix
527        if not prefix:
528            charset = _get_charset_prefix(pattern, flags)
529##     if prefix:
530##         print("*** PREFIX", prefix, prefix_skip)
531##     if charset:
532##         print("*** CHARSET", charset)
533    # add an info block
534    emit = code.append
535    emit(INFO)
536    skip = len(code); emit(0)
537    # literal flag
538    mask = 0
539    if prefix:
540        mask = SRE_INFO_PREFIX
541        if prefix_skip is None and got_all:
542            mask = mask | SRE_INFO_LITERAL
543    elif charset:
544        mask = mask | SRE_INFO_CHARSET
545    emit(mask)
546    # pattern length
547    if lo < MAXCODE:
548        emit(lo)
549    else:
550        emit(MAXCODE)
551        prefix = prefix[:MAXCODE]
552    emit(min(hi, MAXCODE))
553    # add literal prefix
554    if prefix:
555        emit(len(prefix)) # length
556        if prefix_skip is None:
557            prefix_skip =  len(prefix)
558        emit(prefix_skip) # skip
559        code.extend(prefix)
560        # generate overlap table
561        code.extend(_generate_overlap_table(prefix))
562    elif charset:
563        charset, hascased = _optimize_charset(charset)
564        assert not hascased
565        _compile_charset(charset, flags, code)
566    code[skip] = len(code) - skip
567
568def isstring(obj):
569    return isinstance(obj, (str, bytes))
570
571def _code(p, flags):
572
573    flags = p.state.flags | flags
574    code = []
575
576    # compile info block
577    _compile_info(code, p, flags)
578
579    # compile the pattern
580    _compile(code, p.data, flags)
581
582    code.append(SUCCESS)
583
584    return code
585
586def _hex_code(code):
587    return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
588
589def dis(code):
590    import sys
591
592    labels = set()
593    level = 0
594    offset_width = len(str(len(code) - 1))
595
596    def dis_(start, end):
597        def print_(*args, to=None):
598            if to is not None:
599                labels.add(to)
600                args += ('(to %d)' % (to,),)
601            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
602                  end='  '*(level-1))
603            print(*args)
604
605        def print_2(*args):
606            print(end=' '*(offset_width + 2*level))
607            print(*args)
608
609        nonlocal level
610        level += 1
611        i = start
612        while i < end:
613            start = i
614            op = code[i]
615            i += 1
616            op = OPCODES[op]
617            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
618                      MAX_UNTIL, MIN_UNTIL, NEGATE):
619                print_(op)
620            elif op in (LITERAL, NOT_LITERAL,
621                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
622                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
623                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
624                arg = code[i]
625                i += 1
626                print_(op, '%#02x (%r)' % (arg, chr(arg)))
627            elif op is AT:
628                arg = code[i]
629                i += 1
630                arg = str(ATCODES[arg])
631                assert arg[:3] == 'AT_'
632                print_(op, arg[3:])
633            elif op is CATEGORY:
634                arg = code[i]
635                i += 1
636                arg = str(CHCODES[arg])
637                assert arg[:9] == 'CATEGORY_'
638                print_(op, arg[9:])
639            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
640                skip = code[i]
641                print_(op, skip, to=i+skip)
642                dis_(i+1, i+skip)
643                i += skip
644            elif op in (RANGE, RANGE_UNI_IGNORE):
645                lo, hi = code[i: i+2]
646                i += 2
647                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
648            elif op is CHARSET:
649                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
650                i += 256//_CODEBITS
651            elif op is BIGCHARSET:
652                arg = code[i]
653                i += 1
654                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
655                                        for x in code[i: i + 256//_sre.CODESIZE]))
656                print_(op, arg, mapping)
657                i += 256//_sre.CODESIZE
658                level += 1
659                for j in range(arg):
660                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
661                    i += 256//_CODEBITS
662                level -= 1
663            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
664                        GROUPREF_LOC_IGNORE):
665                arg = code[i]
666                i += 1
667                print_(op, arg)
668            elif op is JUMP:
669                skip = code[i]
670                print_(op, skip, to=i+skip)
671                i += 1
672            elif op is BRANCH:
673                skip = code[i]
674                print_(op, skip, to=i+skip)
675                while skip:
676                    dis_(i+1, i+skip)
677                    i += skip
678                    start = i
679                    skip = code[i]
680                    if skip:
681                        print_('branch', skip, to=i+skip)
682                    else:
683                        print_(FAILURE)
684                i += 1
685            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
686                        POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
687                skip, min, max = code[i: i+3]
688                if max == MAXREPEAT:
689                    max = 'MAXREPEAT'
690                print_(op, skip, min, max, to=i+skip)
691                dis_(i+3, i+skip)
692                i += skip
693            elif op is GROUPREF_EXISTS:
694                arg, skip = code[i: i+2]
695                print_(op, arg, skip, to=i+skip)
696                i += 2
697            elif op in (ASSERT, ASSERT_NOT):
698                skip, arg = code[i: i+2]
699                print_(op, skip, arg, to=i+skip)
700                dis_(i+2, i+skip)
701                i += skip
702            elif op is ATOMIC_GROUP:
703                skip = code[i]
704                print_(op, skip, to=i+skip)
705                dis_(i+1, i+skip)
706                i += skip
707            elif op is INFO:
708                skip, flags, min, max = code[i: i+4]
709                if max == MAXREPEAT:
710                    max = 'MAXREPEAT'
711                print_(op, skip, bin(flags), min, max, to=i+skip)
712                start = i+4
713                if flags & SRE_INFO_PREFIX:
714                    prefix_len, prefix_skip = code[i+4: i+6]
715                    print_2('  prefix_skip', prefix_skip)
716                    start = i + 6
717                    prefix = code[start: start+prefix_len]
718                    print_2('  prefix',
719                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
720                            '(%r)' % ''.join(map(chr, prefix)))
721                    start += prefix_len
722                    print_2('  overlap', code[start: start+prefix_len])
723                    start += prefix_len
724                if flags & SRE_INFO_CHARSET:
725                    level += 1
726                    print_2('in')
727                    dis_(start, i+skip)
728                    level -= 1
729                i += skip
730            else:
731                raise ValueError(op)
732
733        level -= 1
734
735    dis_(0, len(code))
736
737
738def compile(p, flags=0):
739    # internal: convert pattern list to internal format
740
741    if isstring(p):
742        pattern = p
743        p = _parser.parse(p, flags)
744    else:
745        pattern = None
746
747    code = _code(p, flags)
748
749    if flags & SRE_FLAG_DEBUG:
750        print()
751        dis(code)
752
753    # map in either direction
754    groupindex = p.state.groupdict
755    indexgroup = [None] * p.state.groups
756    for k, i in groupindex.items():
757        indexgroup[i] = k
758
759    return _sre.compile(
760        pattern, flags | p.state.flags, code,
761        p.state.groups-1,
762        groupindex, tuple(indexgroup)
763        )
764