1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi * Perl-Compatible Regular Expressions *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi
8*22dc650dSSadaf Ebrahimi Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi New API code Copyright (c) 2016-2024 University of Cambridge
11*22dc650dSSadaf Ebrahimi
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi
16*22dc650dSSadaf Ebrahimi * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi
19*22dc650dSSadaf Ebrahimi * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi
23*22dc650dSSadaf Ebrahimi * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi
41*22dc650dSSadaf Ebrahimi
42*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
43*22dc650dSSadaf Ebrahimi #include "config.h"
44*22dc650dSSadaf Ebrahimi #endif
45*22dc650dSSadaf Ebrahimi
46*22dc650dSSadaf Ebrahimi #define NLBLOCK cb /* Name of the block containing newline information; defined before pcre2_internal.h is included */
47*22dc650dSSadaf Ebrahimi #define PSSTART start_pattern /* Name of the field containing the processed string start */
48*22dc650dSSadaf Ebrahimi #define PSEND end_pattern /* Name of the field containing the processed string end */
49*22dc650dSSadaf Ebrahimi
50*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
51*22dc650dSSadaf Ebrahimi
52*22dc650dSSadaf Ebrahimi /* In rare error cases debugging might require calling pcre2_printint(). */
53*22dc650dSSadaf Ebrahimi
54*22dc650dSSadaf Ebrahimi #if 0 /* Disabled: change to 1 to compile in everything up to the matching #endif */
55*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
56*22dc650dSSadaf Ebrahimi #define PRINTABLE(c) ((c) >= 64 && (c) < 255) /* EBCDIC build: values 64..254 */
57*22dc650dSSadaf Ebrahimi #else
58*22dc650dSSadaf Ebrahimi #define PRINTABLE(c) ((c) >= 32 && (c) < 127) /* Non-EBCDIC build: values 32..126 */
59*22dc650dSSadaf Ebrahimi #endif /* EBCDIC */
60*22dc650dSSadaf Ebrahimi #include "pcre2_printint.c" /* Textual inclusion of a .c file, after PRINTABLE is defined */
61*22dc650dSSadaf Ebrahimi #define DEBUG_CALL_PRINTINT /* Presumably tested further down this file to enable the debug calls - confirm */
62*22dc650dSSadaf Ebrahimi #endif /* 0 */
63*22dc650dSSadaf Ebrahimi
64*22dc650dSSadaf Ebrahimi /* Other debugging code can be enabled by these defines. */
65*22dc650dSSadaf Ebrahimi
66*22dc650dSSadaf Ebrahimi /* #define DEBUG_SHOW_CAPTURES */
67*22dc650dSSadaf Ebrahimi /* #define DEBUG_SHOW_PARSED */
68*22dc650dSSadaf Ebrahimi
69*22dc650dSSadaf Ebrahimi /* There are a few things that vary with different code unit sizes. Handle them
70*22dc650dSSadaf Ebrahimi by defining macros in order to minimize #if usage. */
71*22dc650dSSadaf Ebrahimi
72*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
73*22dc650dSSadaf Ebrahimi #define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5 /* Expands to TWO comma-separated items: a string and a count (presumably its length - confirm) */
74*22dc650dSSadaf Ebrahimi #define XDIGIT(c) xdigitab[c] /* Direct table lookup with no range check on c */
75*22dc650dSSadaf Ebrahimi
76*22dc650dSSadaf Ebrahimi #else /* Either 16-bit or 32-bit */
77*22dc650dSSadaf Ebrahimi #define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff) /* Table lookup only when MAX_255(c) holds; otherwise 0xff, the table's non-hex marker */
78*22dc650dSSadaf Ebrahimi
79*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 16
80*22dc650dSSadaf Ebrahimi #define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6 /* Two items, as in the 8-bit case */
81*22dc650dSSadaf Ebrahimi
82*22dc650dSSadaf Ebrahimi #else /* 32-bit */
83*22dc650dSSadaf Ebrahimi #define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6 /* Two items, as in the 8-bit case */
84*22dc650dSSadaf Ebrahimi #endif /* PCRE2_CODE_UNIT_WIDTH == 16 */
85*22dc650dSSadaf Ebrahimi #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
86*22dc650dSSadaf Ebrahimi
87*22dc650dSSadaf Ebrahimi /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88*22dc650dSSadaf Ebrahimi consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89*22dc650dSSadaf Ebrahimi them will be able to (i.e. assume a 64-bit world). */
90*22dc650dSSadaf Ebrahimi
91*22dc650dSSadaf Ebrahimi #if PCRE2_SIZE_MAX <= UINT32_MAX /* One uint32_t element holds a whole PCRE2_SIZE */
92*22dc650dSSadaf Ebrahimi #define PUTOFFSET(s,p) *p++ = s /* Store s at *p, then advance p */
93*22dc650dSSadaf Ebrahimi #define GETOFFSET(s,p) s = *p++ /* Load s from *p, then advance p */
94*22dc650dSSadaf Ebrahimi #define GETPLUSOFFSET(s,p) s = *(++p) /* Advance p first, then load s; p is left on the offset element */
95*22dc650dSSadaf Ebrahimi #define READPLUSOFFSET(s,p) s = p[1] /* Load s from the element after p; p is not moved */
96*22dc650dSSadaf Ebrahimi #define SKIPOFFSET(p) p++ /* Step p over one stored offset */
97*22dc650dSSadaf Ebrahimi #define SIZEOFFSET 1 /* Number of uint32_t elements per stored offset */
98*22dc650dSSadaf Ebrahimi #else /* Two elements per offset: high 32 bits first, then low 32 bits */
99*22dc650dSSadaf Ebrahimi #define PUTOFFSET(s,p) \
100*22dc650dSSadaf Ebrahimi { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); } /* NOTE(review): brace block, not do{}while(0); s and p are unparenthesized and evaluated more than once */
101*22dc650dSSadaf Ebrahimi #define GETOFFSET(s,p) \
102*22dc650dSSadaf Ebrahimi { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; } /* Load from p[0..1], then advance p past both */
103*22dc650dSSadaf Ebrahimi #define GETPLUSOFFSET(s,p) \
104*22dc650dSSadaf Ebrahimi { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; } /* Load from p[1..2]; p is left on the last element read */
105*22dc650dSSadaf Ebrahimi #define READPLUSOFFSET(s,p) \
106*22dc650dSSadaf Ebrahimi { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; } /* Load from p[1..2]; p is not moved */
107*22dc650dSSadaf Ebrahimi #define SKIPOFFSET(p) p += 2 /* Step p over one stored offset */
108*22dc650dSSadaf Ebrahimi #define SIZEOFFSET 2 /* Number of uint32_t elements per stored offset */
109*22dc650dSSadaf Ebrahimi #endif /* PCRE2_SIZE_MAX <= UINT32_MAX */
110*22dc650dSSadaf Ebrahimi
111*22dc650dSSadaf Ebrahimi /* Macros for manipulating elements of the parsed pattern vector. */
112*22dc650dSSadaf Ebrahimi
113*22dc650dSSadaf Ebrahimi #define META_CODE(x) (x & 0xffff0000u) /* Keep the top 16 bits: the item's code */
114*22dc650dSSadaf Ebrahimi #define META_DATA(x) (x & 0x0000ffffu) /* Keep the low 16 bits: the item's additional data */
115*22dc650dSSadaf Ebrahimi #define META_DIFF(x,y) ((x-y)>>16) /* Difference of two elements, shifted down 16 bits; arguments are not parenthesized */
116*22dc650dSSadaf Ebrahimi
117*22dc650dSSadaf Ebrahimi /* Forward declarations to allow mutual recursion */
118*22dc650dSSadaf Ebrahimi
119*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE /* This prototype is declared only when SUPPORT_UNICODE is defined */
120*22dc650dSSadaf Ebrahimi static unsigned int
121*22dc650dSSadaf Ebrahimi add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t,
122*22dc650dSSadaf Ebrahimi compile_block *, const uint32_t *, unsigned int);
123*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
124*22dc650dSSadaf Ebrahimi
125*22dc650dSSadaf Ebrahimi static int
126*22dc650dSSadaf Ebrahimi compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
127*22dc650dSSadaf Ebrahimi uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
128*22dc650dSSadaf Ebrahimi open_capitem *, compile_block *, PCRE2_SIZE *);
129*22dc650dSSadaf Ebrahimi
130*22dc650dSSadaf Ebrahimi static int
131*22dc650dSSadaf Ebrahimi get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
132*22dc650dSSadaf Ebrahimi compile_block *);
133*22dc650dSSadaf Ebrahimi
134*22dc650dSSadaf Ebrahimi static BOOL
135*22dc650dSSadaf Ebrahimi set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136*22dc650dSSadaf Ebrahimi compile_block *);
137*22dc650dSSadaf Ebrahimi
138*22dc650dSSadaf Ebrahimi static int
139*22dc650dSSadaf Ebrahimi check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140*22dc650dSSadaf Ebrahimi compile_block *, int *); /* NOTE(review): all prototypes above are static with unnamed parameters; see each definition for parameter meanings */
141*22dc650dSSadaf Ebrahimi
142*22dc650dSSadaf Ebrahimi
143*22dc650dSSadaf Ebrahimi /*************************************************
144*22dc650dSSadaf Ebrahimi * Code parameters and static tables *
145*22dc650dSSadaf Ebrahimi *************************************************/
146*22dc650dSSadaf Ebrahimi
147*22dc650dSSadaf Ebrahimi #define MAX_GROUP_NUMBER 65535u /* 0xffff */
148*22dc650dSSadaf Ebrahimi #define MAX_REPEAT_COUNT 65535u /* 0xffff */
149*22dc650dSSadaf Ebrahimi #define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1) /* 65536: one above the largest explicit count */
150*22dc650dSSadaf Ebrahimi
151*22dc650dSSadaf Ebrahimi /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152*22dc650dSSadaf Ebrahimi different ways in the different pattern scans. The parsing and group-
153*22dc650dSSadaf Ebrahimi identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154*22dc650dSSadaf Ebrahimi aligned for this. Having defined the size in code units, we set up
155*22dc650dSSadaf Ebrahimi C16_WORK_SIZE as the number of elements in the 16-bit vector.
156*22dc650dSSadaf Ebrahimi
157*22dc650dSSadaf Ebrahimi During the first compiling phase, when determining how much memory is required,
158*22dc650dSSadaf Ebrahimi the regex is partly compiled into this space, but the compiled parts are
159*22dc650dSSadaf Ebrahimi discarded as soon as they can be, so that hopefully there will never be an
160*22dc650dSSadaf Ebrahimi overrun. The code does, however, check for an overrun, which can occur for
161*22dc650dSSadaf Ebrahimi pathological patterns. The size of the workspace depends on LINK_SIZE because
162*22dc650dSSadaf Ebrahimi the length of compiled items varies with this.
163*22dc650dSSadaf Ebrahimi
164*22dc650dSSadaf Ebrahimi In the real compile phase, this workspace is not currently used. */
165*22dc650dSSadaf Ebrahimi
166*22dc650dSSadaf Ebrahimi #define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units; scales with LINK_SIZE */
167*22dc650dSSadaf Ebrahimi
168*22dc650dSSadaf Ebrahimi #define C16_WORK_SIZE \
169*22dc650dSSadaf Ebrahimi ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t)) /* The same space counted in uint16_t elements */
170*22dc650dSSadaf Ebrahimi
171*22dc650dSSadaf Ebrahimi /* A uint32_t vector is used for caching information about the size of
172*22dc650dSSadaf Ebrahimi capturing groups, to improve performance. A default is created on the stack of
173*22dc650dSSadaf Ebrahimi this size. */
174*22dc650dSSadaf Ebrahimi
175*22dc650dSSadaf Ebrahimi #define GROUPINFO_DEFAULT_SIZE 256 /* Size of the default on-stack uint32_t vector */
176*22dc650dSSadaf Ebrahimi
177*22dc650dSSadaf Ebrahimi /* The overrun tests check for a slightly smaller size so that they detect the
178*22dc650dSSadaf Ebrahimi overrun before it actually does run off the end of the data block. */
179*22dc650dSSadaf Ebrahimi
180*22dc650dSSadaf Ebrahimi #define WORK_SIZE_SAFETY_MARGIN (100) /* Margin used by the overrun tests; presumably in code units - confirm at the use sites */
181*22dc650dSSadaf Ebrahimi
182*22dc650dSSadaf Ebrahimi /* This value determines the size of the initial vector that is used for
183*22dc650dSSadaf Ebrahimi remembering named groups during the pre-compile. It is allocated on the stack,
184*22dc650dSSadaf Ebrahimi but if it is too small, it is expanded, in a similar way to the workspace. The
185*22dc650dSSadaf Ebrahimi value is the number of slots in the list. */
186*22dc650dSSadaf Ebrahimi
187*22dc650dSSadaf Ebrahimi #define NAMED_GROUP_LIST_SIZE 20 /* Initial number of slots in the named-group list */
188*22dc650dSSadaf Ebrahimi
189*22dc650dSSadaf Ebrahimi /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190*22dc650dSSadaf Ebrahimi of uint32_t. For short patterns this lives on the stack, with this size. Heap
191*22dc650dSSadaf Ebrahimi memory is used for longer patterns. */
192*22dc650dSSadaf Ebrahimi
193*22dc650dSSadaf Ebrahimi #define PARSED_PATTERN_DEFAULT_SIZE 1024 /* Size of the on-stack uint32_t parsed-pattern vector */
194*22dc650dSSadaf Ebrahimi
195*22dc650dSSadaf Ebrahimi /* Maximum length value to check against when making sure that the variable
196*22dc650dSSadaf Ebrahimi that holds the compiled pattern length does not overflow. We make it a bit less
197*22dc650dSSadaf Ebrahimi than INT_MAX to allow for adding in group terminating code units, so that we
198*22dc650dSSadaf Ebrahimi don't have to check them every time. */
199*22dc650dSSadaf Ebrahimi
200*22dc650dSSadaf Ebrahimi #define OFLOW_MAX (INT_MAX - 20) /* Leaves headroom of 20 below INT_MAX */
201*22dc650dSSadaf Ebrahimi
202*22dc650dSSadaf Ebrahimi /* Code values for parsed patterns, which are stored in a vector of 32-bit
203*22dc650dSSadaf Ebrahimi unsigned ints. Values less than META_END are literal data values. The coding
204*22dc650dSSadaf Ebrahimi for identifying the item is in the top 16-bits, leaving 16 bits for the
205*22dc650dSSadaf Ebrahimi additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206*22dc650dSSadaf Ebrahimi macros are used to manipulate parsed pattern elements.
207*22dc650dSSadaf Ebrahimi
208*22dc650dSSadaf Ebrahimi NOTE: When these definitions are changed, the table of extra lengths for each
209*22dc650dSSadaf Ebrahimi code (meta_extra_lengths, just below) must be updated to remain in step. */
210*22dc650dSSadaf Ebrahimi
211*22dc650dSSadaf Ebrahimi #define META_END 0x80000000u /* End of pattern */
212*22dc650dSSadaf Ebrahimi
213*22dc650dSSadaf Ebrahimi #define META_ALT 0x80010000u /* alternation */
214*22dc650dSSadaf Ebrahimi #define META_ATOMIC 0x80020000u /* atomic group */
215*22dc650dSSadaf Ebrahimi #define META_BACKREF 0x80030000u /* Back ref */
216*22dc650dSSadaf Ebrahimi #define META_BACKREF_BYNAME 0x80040000u /* \k'name' */
217*22dc650dSSadaf Ebrahimi #define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */
218*22dc650dSSadaf Ebrahimi #define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */
219*22dc650dSSadaf Ebrahimi #define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */
220*22dc650dSSadaf Ebrahimi #define META_CAPTURE 0x80080000u /* Capturing parenthesis */
221*22dc650dSSadaf Ebrahimi #define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */
222*22dc650dSSadaf Ebrahimi #define META_CLASS 0x800a0000u /* start non-empty class */
223*22dc650dSSadaf Ebrahimi #define META_CLASS_EMPTY 0x800b0000u /* empty class */
224*22dc650dSSadaf Ebrahimi #define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */
225*22dc650dSSadaf Ebrahimi #define META_CLASS_END 0x800d0000u /* end of non-empty class */
226*22dc650dSSadaf Ebrahimi #define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */
227*22dc650dSSadaf Ebrahimi #define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */
228*22dc650dSSadaf Ebrahimi #define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */
229*22dc650dSSadaf Ebrahimi #define META_COND_NAME 0x80110000u /* (?(<name>)... */
230*22dc650dSSadaf Ebrahimi #define META_COND_NUMBER 0x80120000u /* (?(digits)... */
231*22dc650dSSadaf Ebrahimi #define META_COND_RNAME 0x80130000u /* (?(R&name)... */
232*22dc650dSSadaf Ebrahimi #define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */
233*22dc650dSSadaf Ebrahimi #define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */
234*22dc650dSSadaf Ebrahimi #define META_DOLLAR 0x80160000u /* $ metacharacter */
235*22dc650dSSadaf Ebrahimi #define META_DOT 0x80170000u /* . metacharacter */
236*22dc650dSSadaf Ebrahimi #define META_ESCAPE 0x80180000u /* \d and friends */
237*22dc650dSSadaf Ebrahimi #define META_KET 0x80190000u /* closing parenthesis */
238*22dc650dSSadaf Ebrahimi #define META_NOCAPTURE 0x801a0000u /* no capture parens */
239*22dc650dSSadaf Ebrahimi #define META_OPTIONS 0x801b0000u /* (?i) and friends */
240*22dc650dSSadaf Ebrahimi #define META_POSIX 0x801c0000u /* POSIX class item */
241*22dc650dSSadaf Ebrahimi #define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */
242*22dc650dSSadaf Ebrahimi #define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */
243*22dc650dSSadaf Ebrahimi #define META_RANGE_LITERAL 0x801f0000u /* range defined literally */
244*22dc650dSSadaf Ebrahimi #define META_RECURSE 0x80200000u /* Recursion */
245*22dc650dSSadaf Ebrahimi #define META_RECURSE_BYNAME 0x80210000u /* (?&name) */
246*22dc650dSSadaf Ebrahimi #define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */
247*22dc650dSSadaf Ebrahimi
248*22dc650dSSadaf Ebrahimi /* These must be kept together to make it easy to check that an assertion
249*22dc650dSSadaf Ebrahimi is present where expected in a conditional group. */
250*22dc650dSSadaf Ebrahimi
251*22dc650dSSadaf Ebrahimi #define META_LOOKAHEAD 0x80230000u /* (?= */
252*22dc650dSSadaf Ebrahimi #define META_LOOKAHEADNOT 0x80240000u /* (?! */
253*22dc650dSSadaf Ebrahimi #define META_LOOKBEHIND 0x80250000u /* (?<= */
254*22dc650dSSadaf Ebrahimi #define META_LOOKBEHINDNOT 0x80260000u /* (?<! */
255*22dc650dSSadaf Ebrahimi
256*22dc650dSSadaf Ebrahimi /* These cannot be conditions */
257*22dc650dSSadaf Ebrahimi
258*22dc650dSSadaf Ebrahimi #define META_LOOKAHEAD_NA 0x80270000u /* (*napla: */
259*22dc650dSSadaf Ebrahimi #define META_LOOKBEHIND_NA 0x80280000u /* (*naplb: */
260*22dc650dSSadaf Ebrahimi
261*22dc650dSSadaf Ebrahimi /* These must be kept in this order, with consecutive values, and the _ARG
262*22dc650dSSadaf Ebrahimi versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263*22dc650dSSadaf Ebrahimi versions. */
264*22dc650dSSadaf Ebrahimi
265*22dc650dSSadaf Ebrahimi #define META_MARK 0x80290000u /* (*MARK) */
266*22dc650dSSadaf Ebrahimi #define META_ACCEPT 0x802a0000u /* (*ACCEPT) */
267*22dc650dSSadaf Ebrahimi #define META_FAIL 0x802b0000u /* (*FAIL) */
268*22dc650dSSadaf Ebrahimi #define META_COMMIT 0x802c0000u /* These */
269*22dc650dSSadaf Ebrahimi #define META_COMMIT_ARG 0x802d0000u /* pairs */
270*22dc650dSSadaf Ebrahimi #define META_PRUNE 0x802e0000u /* must */
271*22dc650dSSadaf Ebrahimi #define META_PRUNE_ARG 0x802f0000u /* be */
272*22dc650dSSadaf Ebrahimi #define META_SKIP 0x80300000u /* kept */
273*22dc650dSSadaf Ebrahimi #define META_SKIP_ARG 0x80310000u /* in */
274*22dc650dSSadaf Ebrahimi #define META_THEN 0x80320000u /* this */
275*22dc650dSSadaf Ebrahimi #define META_THEN_ARG 0x80330000u /* order */
276*22dc650dSSadaf Ebrahimi
277*22dc650dSSadaf Ebrahimi /* These must be kept in groups of 3 adjacent values, and all together. */
278*22dc650dSSadaf Ebrahimi
279*22dc650dSSadaf Ebrahimi #define META_ASTERISK 0x80340000u /* * */
280*22dc650dSSadaf Ebrahimi #define META_ASTERISK_PLUS 0x80350000u /* *+ */
281*22dc650dSSadaf Ebrahimi #define META_ASTERISK_QUERY 0x80360000u /* *? */
282*22dc650dSSadaf Ebrahimi #define META_PLUS 0x80370000u /* + */
283*22dc650dSSadaf Ebrahimi #define META_PLUS_PLUS 0x80380000u /* ++ */
284*22dc650dSSadaf Ebrahimi #define META_PLUS_QUERY 0x80390000u /* +? */
285*22dc650dSSadaf Ebrahimi #define META_QUERY 0x803a0000u /* ? */
286*22dc650dSSadaf Ebrahimi #define META_QUERY_PLUS 0x803b0000u /* ?+ */
287*22dc650dSSadaf Ebrahimi #define META_QUERY_QUERY 0x803c0000u /* ?? */
288*22dc650dSSadaf Ebrahimi #define META_MINMAX 0x803d0000u /* {n,m} repeat */
289*22dc650dSSadaf Ebrahimi #define META_MINMAX_PLUS 0x803e0000u /* {n,m}+ repeat */
290*22dc650dSSadaf Ebrahimi #define META_MINMAX_QUERY 0x803f0000u /* {n,m}? repeat */
291*22dc650dSSadaf Ebrahimi
292*22dc650dSSadaf Ebrahimi #define META_FIRST_QUANTIFIER META_ASTERISK /* lowest-valued quantifier code */
293*22dc650dSSadaf Ebrahimi #define META_LAST_QUANTIFIER META_MINMAX_QUERY /* highest-valued quantifier code */
294*22dc650dSSadaf Ebrahimi
295*22dc650dSSadaf Ebrahimi /* This is a special "meta code" that is used only to distinguish (*asr: from
296*22dc650dSSadaf Ebrahimi (*sr: in the table of alphabetic assertions. It is never stored in the parsed
297*22dc650dSSadaf Ebrahimi pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298*22dc650dSSadaf Ebrahimi therefore no need for it to have a length entry, so use a high value. */
299*22dc650dSSadaf Ebrahimi
300*22dc650dSSadaf Ebrahimi #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u /* well above META_MINMAX_QUERY, the last code with a length entry */
301*22dc650dSSadaf Ebrahimi
302*22dc650dSSadaf Ebrahimi /* Table of extra lengths for each of the meta codes. Must be kept in step with
303*22dc650dSSadaf Ebrahimi the definitions above. For some items these values are a basic length to which
304*22dc650dSSadaf Ebrahimi a variable amount has to be added. */
305*22dc650dSSadaf Ebrahimi
306*22dc650dSSadaf Ebrahimi static unsigned char meta_extra_lengths[] = { /* One entry per META_ code, listed in definition order. NOTE(review): not declared const - confirm nothing writes to it */
307*22dc650dSSadaf Ebrahimi 0, /* META_END */
308*22dc650dSSadaf Ebrahimi 0, /* META_ALT */
309*22dc650dSSadaf Ebrahimi 0, /* META_ATOMIC */
310*22dc650dSSadaf Ebrahimi 0, /* META_BACKREF - more if group is >= 10 */
311*22dc650dSSadaf Ebrahimi 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
312*22dc650dSSadaf Ebrahimi 1, /* META_BIGVALUE */
313*22dc650dSSadaf Ebrahimi 3, /* META_CALLOUT_NUMBER */
314*22dc650dSSadaf Ebrahimi 3+SIZEOFFSET, /* META_CALLOUT_STRING */
315*22dc650dSSadaf Ebrahimi 0, /* META_CAPTURE */
316*22dc650dSSadaf Ebrahimi 0, /* META_CIRCUMFLEX */
317*22dc650dSSadaf Ebrahimi 0, /* META_CLASS */
318*22dc650dSSadaf Ebrahimi 0, /* META_CLASS_EMPTY */
319*22dc650dSSadaf Ebrahimi 0, /* META_CLASS_EMPTY_NOT */
320*22dc650dSSadaf Ebrahimi 0, /* META_CLASS_END */
321*22dc650dSSadaf Ebrahimi 0, /* META_CLASS_NOT */
322*22dc650dSSadaf Ebrahimi 0, /* META_COND_ASSERT */
323*22dc650dSSadaf Ebrahimi SIZEOFFSET, /* META_COND_DEFINE */
324*22dc650dSSadaf Ebrahimi 1+SIZEOFFSET, /* META_COND_NAME */
325*22dc650dSSadaf Ebrahimi 1+SIZEOFFSET, /* META_COND_NUMBER */
326*22dc650dSSadaf Ebrahimi 1+SIZEOFFSET, /* META_COND_RNAME */
327*22dc650dSSadaf Ebrahimi 1+SIZEOFFSET, /* META_COND_RNUMBER */
328*22dc650dSSadaf Ebrahimi 3, /* META_COND_VERSION */
329*22dc650dSSadaf Ebrahimi 0, /* META_DOLLAR */
330*22dc650dSSadaf Ebrahimi 0, /* META_DOT */
331*22dc650dSSadaf Ebrahimi 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332*22dc650dSSadaf Ebrahimi 0, /* META_KET */
333*22dc650dSSadaf Ebrahimi 0, /* META_NOCAPTURE */
334*22dc650dSSadaf Ebrahimi 1, /* META_OPTIONS */
335*22dc650dSSadaf Ebrahimi 1, /* META_POSIX */
336*22dc650dSSadaf Ebrahimi 1, /* META_POSIX_NEG */
337*22dc650dSSadaf Ebrahimi 0, /* META_RANGE_ESCAPED */
338*22dc650dSSadaf Ebrahimi 0, /* META_RANGE_LITERAL */
339*22dc650dSSadaf Ebrahimi SIZEOFFSET, /* META_RECURSE */
340*22dc650dSSadaf Ebrahimi 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
341*22dc650dSSadaf Ebrahimi 0, /* META_SCRIPT_RUN */
342*22dc650dSSadaf Ebrahimi 0, /* META_LOOKAHEAD */
343*22dc650dSSadaf Ebrahimi 0, /* META_LOOKAHEADNOT */
344*22dc650dSSadaf Ebrahimi SIZEOFFSET, /* META_LOOKBEHIND */
345*22dc650dSSadaf Ebrahimi SIZEOFFSET, /* META_LOOKBEHINDNOT */
346*22dc650dSSadaf Ebrahimi 0, /* META_LOOKAHEAD_NA */
347*22dc650dSSadaf Ebrahimi SIZEOFFSET, /* META_LOOKBEHIND_NA */
348*22dc650dSSadaf Ebrahimi 1, /* META_MARK - plus the string length */
349*22dc650dSSadaf Ebrahimi 0, /* META_ACCEPT */
350*22dc650dSSadaf Ebrahimi 0, /* META_FAIL */
351*22dc650dSSadaf Ebrahimi 0, /* META_COMMIT */
352*22dc650dSSadaf Ebrahimi 1, /* META_COMMIT_ARG - plus the string length */
353*22dc650dSSadaf Ebrahimi 0, /* META_PRUNE */
354*22dc650dSSadaf Ebrahimi 1, /* META_PRUNE_ARG - plus the string length */
355*22dc650dSSadaf Ebrahimi 0, /* META_SKIP */
356*22dc650dSSadaf Ebrahimi 1, /* META_SKIP_ARG - plus the string length */
357*22dc650dSSadaf Ebrahimi 0, /* META_THEN */
358*22dc650dSSadaf Ebrahimi 1, /* META_THEN_ARG - plus the string length */
359*22dc650dSSadaf Ebrahimi 0, /* META_ASTERISK */
360*22dc650dSSadaf Ebrahimi 0, /* META_ASTERISK_PLUS */
361*22dc650dSSadaf Ebrahimi 0, /* META_ASTERISK_QUERY */
362*22dc650dSSadaf Ebrahimi 0, /* META_PLUS */
363*22dc650dSSadaf Ebrahimi 0, /* META_PLUS_PLUS */
364*22dc650dSSadaf Ebrahimi 0, /* META_PLUS_QUERY */
365*22dc650dSSadaf Ebrahimi 0, /* META_QUERY */
366*22dc650dSSadaf Ebrahimi 0, /* META_QUERY_PLUS */
367*22dc650dSSadaf Ebrahimi 0, /* META_QUERY_QUERY */
368*22dc650dSSadaf Ebrahimi 2, /* META_MINMAX */
369*22dc650dSSadaf Ebrahimi 2, /* META_MINMAX_PLUS */
370*22dc650dSSadaf Ebrahimi 2 /* META_MINMAX_QUERY */
371*22dc650dSSadaf Ebrahimi }; /* 64 entries: META_END (0x8000....) through META_MINMAX_QUERY (0x803f....) */
372*22dc650dSSadaf Ebrahimi
373*22dc650dSSadaf Ebrahimi /* Types for skipping parts of a parsed pattern. */
374*22dc650dSSadaf Ebrahimi
375*22dc650dSSadaf Ebrahimi enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET }; /* Values 0, 1, 2; presumably name the item at which a skip stops - confirm at the use sites */
376*22dc650dSSadaf Ebrahimi
377*22dc650dSSadaf Ebrahimi /* Macro for setting individual bits in class bitmaps. It took some
378*22dc650dSSadaf Ebrahimi experimenting to figure out how to stop gcc 5.3.0 from warning with
379*22dc650dSSadaf Ebrahimi -Wconversion. This version gets a warning:
380*22dc650dSSadaf Ebrahimi
381*22dc650dSSadaf Ebrahimi #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382*22dc650dSSadaf Ebrahimi
383*22dc650dSSadaf Ebrahimi Let's hope the apparently less efficient version isn't actually so bad if the
384*22dc650dSSadaf Ebrahimi compiler is clever with identical subexpressions. */
385*22dc650dSSadaf Ebrahimi
386*22dc650dSSadaf Ebrahimi #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7))) /* Sets bit (b&7) of byte a[b/8]; a is evaluated twice and b three times, so pass side-effect-free arguments */
387*22dc650dSSadaf Ebrahimi
388*22dc650dSSadaf Ebrahimi /* Values and flags for the unsigned xxcuflags variables that accompany xxcu
389*22dc650dSSadaf Ebrahimi variables, which are concerned with first and required code units. A value
390*22dc650dSSadaf Ebrahimi greater than or equal to REQ_NONE means "no code unit set"; otherwise the
391*22dc650dSSadaf Ebrahimi matching xxcu variable is set, and the low valued bits are relevant. */
392*22dc650dSSadaf Ebrahimi
393*22dc650dSSadaf Ebrahimi #define REQ_UNSET 0xffffffffu /* Not yet found anything; also compares >= REQ_NONE */
394*22dc650dSSadaf Ebrahimi #define REQ_NONE 0xfffffffeu /* Found not fixed character */
395*22dc650dSSadaf Ebrahimi #define REQ_CASELESS 0x00000001u /* Code unit in xxcu is caseless */
396*22dc650dSSadaf Ebrahimi #define REQ_VARY 0x00000002u /* Code unit is followed by non-literal */
397*22dc650dSSadaf Ebrahimi
398*22dc650dSSadaf Ebrahimi /* These flags are used in the groupinfo vector. */
399*22dc650dSSadaf Ebrahimi
400*22dc650dSSadaf Ebrahimi #define GI_SET_FIXED_LENGTH 0x80000000u /* Bit 31 */
401*22dc650dSSadaf Ebrahimi #define GI_NOT_FIXED_LENGTH 0x40000000u /* Bit 30 */
402*22dc650dSSadaf Ebrahimi #define GI_FIXED_LENGTH_MASK 0x0000ffffu /* Low 16 bits */
403*22dc650dSSadaf Ebrahimi
404*22dc650dSSadaf Ebrahimi /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
405*22dc650dSSadaf Ebrahimi and is fast (a good compiler can turn it into a subtraction and unsigned
406*22dc650dSSadaf Ebrahimi comparison). */
407*22dc650dSSadaf Ebrahimi
408*22dc650dSSadaf Ebrahimi #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) /* x is evaluated twice; pass a side-effect-free expression */
409*22dc650dSSadaf Ebrahimi
410*22dc650dSSadaf Ebrahimi /* Table to identify hex digits. The tables in chartables are dependent on the
411*22dc650dSSadaf Ebrahimi locale, and may mark arbitrary characters as digits. We want to recognize only
412*22dc650dSSadaf Ebrahimi 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
413*22dc650dSSadaf Ebrahimi costs 256 bytes, but it is a lot faster than doing character value tests (at
414*22dc650dSSadaf Ebrahimi least in some simple cases I timed), and in some applications one wants PCRE2
415*22dc650dSSadaf Ebrahimi to compile efficiently as well as match efficiently. The value in the table is
416*22dc650dSSadaf Ebrahimi the binary hex digit value, or 0xff for non-hex digits. */
417*22dc650dSSadaf Ebrahimi
418*22dc650dSSadaf Ebrahimi /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
419*22dc650dSSadaf Ebrahimi UTF-8 mode. */
420*22dc650dSSadaf Ebrahimi
421*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
422*22dc650dSSadaf Ebrahimi static const uint8_t xdigitab[] = /* 32 rows of 8: 256 entries, indexed by code unit value */
423*22dc650dSSadaf Ebrahimi {
424*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */
425*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
426*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */
427*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
428*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */
429*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */
430*22dc650dSSadaf Ebrahimi 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */
431*22dc650dSSadaf Ebrahimi 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */
432*22dc650dSSadaf Ebrahimi 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */
433*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */
434*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */
435*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */
436*22dc650dSSadaf Ebrahimi 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */
437*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */
438*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */
439*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */
440*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
441*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
442*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
443*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
444*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
445*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
446*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
447*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
448*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
449*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
450*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
451*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
452*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
453*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
454*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
455*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
456*22dc650dSSadaf Ebrahimi
457*22dc650dSSadaf Ebrahimi #else
458*22dc650dSSadaf Ebrahimi
459*22dc650dSSadaf Ebrahimi /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
460*22dc650dSSadaf Ebrahimi
461*22dc650dSSadaf Ebrahimi static const uint8_t xdigitab[] = /* 32 rows of 8: 256 entries, indexed by code unit value */
462*22dc650dSSadaf Ebrahimi {
463*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */
464*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
465*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */
466*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
467*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */
468*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */
469*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */
470*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */
471*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */
472*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */
473*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */
474*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */
475*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */
476*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */
477*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
478*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */
479*22dc650dSSadaf Ebrahimi 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */
480*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */
481*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */
482*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */
483*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */
484*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */
485*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */
486*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
487*22dc650dSSadaf Ebrahimi 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */
488*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */
489*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */
490*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */
491*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */
492*22dc650dSSadaf Ebrahimi 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */
493*22dc650dSSadaf Ebrahimi 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */
494*22dc650dSSadaf Ebrahimi 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */
495*22dc650dSSadaf Ebrahimi #endif /* EBCDIC */
496*22dc650dSSadaf Ebrahimi
497*22dc650dSSadaf Ebrahimi
498*22dc650dSSadaf Ebrahimi /* Table for handling alphanumeric escaped characters. Positive returns are
499*22dc650dSSadaf Ebrahimi simple data values; negative values are for special things like \d and so on.
500*22dc650dSSadaf Ebrahimi Zero means further processing is needed (for things like \x), or the escape is
501*22dc650dSSadaf Ebrahimi invalid. */
502*22dc650dSSadaf Ebrahimi
503*22dc650dSSadaf Ebrahimi /* This is the "normal" table for ASCII systems or for EBCDIC systems running
504*22dc650dSSadaf Ebrahimi in UTF-8 mode. It runs from '0' to 'z'. */
505*22dc650dSSadaf Ebrahimi
506*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
/* Index bounds of the escapes[] table below, and the lower-case to upper-case
conversion, for the non-EBCDIC build. The macro argument is parenthesized so
that an expression argument is converted as a whole rather than having the
subtraction bind to its last operand. */

#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z
#define UPPER_CASE(c)       ((c)-32)
510*22dc650dSSadaf Ebrahimi
511*22dc650dSSadaf Ebrahimi static const short int escapes[] = {
512*22dc650dSSadaf Ebrahimi 0, 0,
513*22dc650dSSadaf Ebrahimi 0, 0,
514*22dc650dSSadaf Ebrahimi 0, 0,
515*22dc650dSSadaf Ebrahimi 0, 0,
516*22dc650dSSadaf Ebrahimi 0, 0,
517*22dc650dSSadaf Ebrahimi CHAR_COLON, CHAR_SEMICOLON,
518*22dc650dSSadaf Ebrahimi CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
519*22dc650dSSadaf Ebrahimi CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
520*22dc650dSSadaf Ebrahimi CHAR_COMMERCIAL_AT, -ESC_A,
521*22dc650dSSadaf Ebrahimi -ESC_B, -ESC_C,
522*22dc650dSSadaf Ebrahimi -ESC_D, -ESC_E,
523*22dc650dSSadaf Ebrahimi 0, -ESC_G,
524*22dc650dSSadaf Ebrahimi -ESC_H, 0,
525*22dc650dSSadaf Ebrahimi 0, -ESC_K,
526*22dc650dSSadaf Ebrahimi 0, 0,
527*22dc650dSSadaf Ebrahimi -ESC_N, 0,
528*22dc650dSSadaf Ebrahimi -ESC_P, -ESC_Q,
529*22dc650dSSadaf Ebrahimi -ESC_R, -ESC_S,
530*22dc650dSSadaf Ebrahimi 0, 0,
531*22dc650dSSadaf Ebrahimi -ESC_V, -ESC_W,
532*22dc650dSSadaf Ebrahimi -ESC_X, 0,
533*22dc650dSSadaf Ebrahimi -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
534*22dc650dSSadaf Ebrahimi CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
535*22dc650dSSadaf Ebrahimi CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
536*22dc650dSSadaf Ebrahimi CHAR_GRAVE_ACCENT, CHAR_BEL,
537*22dc650dSSadaf Ebrahimi -ESC_b, 0,
538*22dc650dSSadaf Ebrahimi -ESC_d, CHAR_ESC,
539*22dc650dSSadaf Ebrahimi CHAR_FF, 0,
540*22dc650dSSadaf Ebrahimi -ESC_h, 0,
541*22dc650dSSadaf Ebrahimi 0, -ESC_k,
542*22dc650dSSadaf Ebrahimi 0, 0,
543*22dc650dSSadaf Ebrahimi CHAR_LF, 0,
544*22dc650dSSadaf Ebrahimi -ESC_p, 0,
545*22dc650dSSadaf Ebrahimi CHAR_CR, -ESC_s,
546*22dc650dSSadaf Ebrahimi CHAR_HT, 0,
547*22dc650dSSadaf Ebrahimi -ESC_v, -ESC_w,
548*22dc650dSSadaf Ebrahimi 0, 0,
549*22dc650dSSadaf Ebrahimi -ESC_z
550*22dc650dSSadaf Ebrahimi };
551*22dc650dSSadaf Ebrahimi
552*22dc650dSSadaf Ebrahimi #else
553*22dc650dSSadaf Ebrahimi
554*22dc650dSSadaf Ebrahimi /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555*22dc650dSSadaf Ebrahimi It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556*22dc650dSSadaf Ebrahimi is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557*22dc650dSSadaf Ebrahimi because it is defined as 'a', which of course picks up the ASCII value. */
558*22dc650dSSadaf Ebrahimi
/* Index bounds of the EBCDIC escapes[] table and the lower-case to upper-case
conversion. The macro argument is parenthesized in both branches so that an
expression argument is converted as a whole. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)  ((c)-32)
#endif
568*22dc650dSSadaf Ebrahimi
569*22dc650dSSadaf Ebrahimi static const short int escapes[] = {
570*22dc650dSSadaf Ebrahimi /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
571*22dc650dSSadaf Ebrahimi /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
572*22dc650dSSadaf Ebrahimi /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
573*22dc650dSSadaf Ebrahimi /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
574*22dc650dSSadaf Ebrahimi /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
575*22dc650dSSadaf Ebrahimi /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
576*22dc650dSSadaf Ebrahimi /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
577*22dc650dSSadaf Ebrahimi /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
578*22dc650dSSadaf Ebrahimi /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
579*22dc650dSSadaf Ebrahimi /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
580*22dc650dSSadaf Ebrahimi /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
581*22dc650dSSadaf Ebrahimi /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
582*22dc650dSSadaf Ebrahimi /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
583*22dc650dSSadaf Ebrahimi /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
584*22dc650dSSadaf Ebrahimi /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
585*22dc650dSSadaf Ebrahimi /* F8 */ 0, 0
586*22dc650dSSadaf Ebrahimi };
587*22dc650dSSadaf Ebrahimi
588*22dc650dSSadaf Ebrahimi /* We also need a table of characters that may follow \c in an EBCDIC
589*22dc650dSSadaf Ebrahimi environment for characters 0-31. */
590*22dc650dSSadaf Ebrahimi
/* 32 candidate characters followed by the terminating zero; written as two
adjacent literals of 16 characters each, which the compiler concatenates. */

static unsigned char ebcdic_escape_c[] =
  "@ABCDEFGHIJKLMNO"
  "PQRSTUVWXYZ[\\]^_";
592*22dc650dSSadaf Ebrahimi
593*22dc650dSSadaf Ebrahimi #endif /* EBCDIC */
594*22dc650dSSadaf Ebrahimi
595*22dc650dSSadaf Ebrahimi
596*22dc650dSSadaf Ebrahimi /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597*22dc650dSSadaf Ebrahimi searched linearly. Put all the names into a single string, in order to reduce
598*22dc650dSSadaf Ebrahimi the number of relocations when a shared library is dynamically linked. The
599*22dc650dSSadaf Ebrahimi string is built from string macros so that it works in UTF-8 mode on EBCDIC
600*22dc650dSSadaf Ebrahimi platforms. */
601*22dc650dSSadaf Ebrahimi
/* One entry of the verbs[] table. */

typedef struct verbitem {
  unsigned int len;     /* Number of characters in the verb name */
  uint32_t meta;        /* Base META_ code for the verb */
  int has_arg;          /* Argument requirement; the meaning of positive,
                           negative and zero is noted in the verbs[] table */
} verbitem;
607*22dc650dSSadaf Ebrahimi
608*22dc650dSSadaf Ebrahimi static const char verbnames[] =
609*22dc650dSSadaf Ebrahimi "\0" /* Empty name is a shorthand for MARK */
610*22dc650dSSadaf Ebrahimi STRING_MARK0
611*22dc650dSSadaf Ebrahimi STRING_ACCEPT0
612*22dc650dSSadaf Ebrahimi STRING_F0
613*22dc650dSSadaf Ebrahimi STRING_FAIL0
614*22dc650dSSadaf Ebrahimi STRING_COMMIT0
615*22dc650dSSadaf Ebrahimi STRING_PRUNE0
616*22dc650dSSadaf Ebrahimi STRING_SKIP0
617*22dc650dSSadaf Ebrahimi STRING_THEN;
618*22dc650dSSadaf Ebrahimi
619*22dc650dSSadaf Ebrahimi static const verbitem verbs[] = {
620*22dc650dSSadaf Ebrahimi { 0, META_MARK, +1 }, /* > 0 => must have an argument */
621*22dc650dSSadaf Ebrahimi { 4, META_MARK, +1 },
622*22dc650dSSadaf Ebrahimi { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
623*22dc650dSSadaf Ebrahimi { 1, META_FAIL, -1 },
624*22dc650dSSadaf Ebrahimi { 4, META_FAIL, -1 },
625*22dc650dSSadaf Ebrahimi { 6, META_COMMIT, 0 },
626*22dc650dSSadaf Ebrahimi { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
627*22dc650dSSadaf Ebrahimi { 4, META_SKIP, 0 },
628*22dc650dSSadaf Ebrahimi { 4, META_THEN, 0 }
629*22dc650dSSadaf Ebrahimi };
630*22dc650dSSadaf Ebrahimi
631*22dc650dSSadaf Ebrahimi static const int verbcount = sizeof(verbs)/sizeof(verbitem);
632*22dc650dSSadaf Ebrahimi
633*22dc650dSSadaf Ebrahimi /* Verb opcodes, indexed by their META code offset from META_MARK. */
634*22dc650dSSadaf Ebrahimi
635*22dc650dSSadaf Ebrahimi static const uint32_t verbops[] = {
636*22dc650dSSadaf Ebrahimi OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637*22dc650dSSadaf Ebrahimi OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638*22dc650dSSadaf Ebrahimi
639*22dc650dSSadaf Ebrahimi /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
640*22dc650dSSadaf Ebrahimi
/* One entry of the alasmeta[] table. */

typedef struct alasitem {
  unsigned int len;     /* Number of characters in the name */
  uint32_t meta;        /* Base META_ code for the assertion */
} alasitem;
645*22dc650dSSadaf Ebrahimi
646*22dc650dSSadaf Ebrahimi static const char alasnames[] =
647*22dc650dSSadaf Ebrahimi STRING_pla0
648*22dc650dSSadaf Ebrahimi STRING_plb0
649*22dc650dSSadaf Ebrahimi STRING_napla0
650*22dc650dSSadaf Ebrahimi STRING_naplb0
651*22dc650dSSadaf Ebrahimi STRING_nla0
652*22dc650dSSadaf Ebrahimi STRING_nlb0
653*22dc650dSSadaf Ebrahimi STRING_positive_lookahead0
654*22dc650dSSadaf Ebrahimi STRING_positive_lookbehind0
655*22dc650dSSadaf Ebrahimi STRING_non_atomic_positive_lookahead0
656*22dc650dSSadaf Ebrahimi STRING_non_atomic_positive_lookbehind0
657*22dc650dSSadaf Ebrahimi STRING_negative_lookahead0
658*22dc650dSSadaf Ebrahimi STRING_negative_lookbehind0
659*22dc650dSSadaf Ebrahimi STRING_atomic0
660*22dc650dSSadaf Ebrahimi STRING_sr0
661*22dc650dSSadaf Ebrahimi STRING_asr0
662*22dc650dSSadaf Ebrahimi STRING_script_run0
663*22dc650dSSadaf Ebrahimi STRING_atomic_script_run;
664*22dc650dSSadaf Ebrahimi
665*22dc650dSSadaf Ebrahimi static const alasitem alasmeta[] = {
666*22dc650dSSadaf Ebrahimi { 3, META_LOOKAHEAD },
667*22dc650dSSadaf Ebrahimi { 3, META_LOOKBEHIND },
668*22dc650dSSadaf Ebrahimi { 5, META_LOOKAHEAD_NA },
669*22dc650dSSadaf Ebrahimi { 5, META_LOOKBEHIND_NA },
670*22dc650dSSadaf Ebrahimi { 3, META_LOOKAHEADNOT },
671*22dc650dSSadaf Ebrahimi { 3, META_LOOKBEHINDNOT },
672*22dc650dSSadaf Ebrahimi { 18, META_LOOKAHEAD },
673*22dc650dSSadaf Ebrahimi { 19, META_LOOKBEHIND },
674*22dc650dSSadaf Ebrahimi { 29, META_LOOKAHEAD_NA },
675*22dc650dSSadaf Ebrahimi { 30, META_LOOKBEHIND_NA },
676*22dc650dSSadaf Ebrahimi { 18, META_LOOKAHEADNOT },
677*22dc650dSSadaf Ebrahimi { 19, META_LOOKBEHINDNOT },
678*22dc650dSSadaf Ebrahimi { 6, META_ATOMIC },
679*22dc650dSSadaf Ebrahimi { 2, META_SCRIPT_RUN }, /* sr = script run */
680*22dc650dSSadaf Ebrahimi { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681*22dc650dSSadaf Ebrahimi { 10, META_SCRIPT_RUN }, /* script run */
682*22dc650dSSadaf Ebrahimi { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
683*22dc650dSSadaf Ebrahimi };
684*22dc650dSSadaf Ebrahimi
685*22dc650dSSadaf Ebrahimi static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
686*22dc650dSSadaf Ebrahimi
687*22dc650dSSadaf Ebrahimi /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
688*22dc650dSSadaf Ebrahimi
689*22dc650dSSadaf Ebrahimi static uint32_t chartypeoffset[] = {
690*22dc650dSSadaf Ebrahimi OP_STAR - OP_STAR, OP_STARI - OP_STAR,
691*22dc650dSSadaf Ebrahimi OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692*22dc650dSSadaf Ebrahimi
693*22dc650dSSadaf Ebrahimi /* Tables of names of POSIX character classes and their lengths. The names are
694*22dc650dSSadaf Ebrahimi now all in a single string, to reduce the number of relocations when a shared
695*22dc650dSSadaf Ebrahimi library is dynamically loaded. The list of lengths is terminated by a zero
696*22dc650dSSadaf Ebrahimi length entry. The first three must be alpha, lower, upper, as this is assumed
697*22dc650dSSadaf Ebrahimi for handling case independence. The indices for several classes are needed, so
698*22dc650dSSadaf Ebrahimi identify them. */
699*22dc650dSSadaf Ebrahimi
700*22dc650dSSadaf Ebrahimi static const char posix_names[] =
701*22dc650dSSadaf Ebrahimi STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
702*22dc650dSSadaf Ebrahimi STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
703*22dc650dSSadaf Ebrahimi STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
704*22dc650dSSadaf Ebrahimi STRING_word0 STRING_xdigit;
705*22dc650dSSadaf Ebrahimi
/* Length of each name in posix_names[], in the same order, followed by the
zero entry that terminates the list. */

static const uint8_t posix_name_lengths[] = {
  5,    /* alpha */
  5,    /* lower */
  5,    /* upper */
  5,    /* alnum */
  5,    /* ascii */
  5,    /* blank */
  5,    /* cntrl */
  5,    /* digit */
  5,    /* graph */
  5,    /* print */
  5,    /* punct */
  5,    /* space */
  4,    /* word */
  6,    /* xdigit */
  0     /* end of list */
};

/* Positions of individual classes within the lists above. */

#define PC_DIGIT   7
#define PC_GRAPH   8
#define PC_PRINT   9
#define PC_PUNCT  10
#define PC_XDIGIT 13
714*22dc650dSSadaf Ebrahimi
715*22dc650dSSadaf Ebrahimi /* Table of class bit maps for each POSIX class. Each class is formed from a
716*22dc650dSSadaf Ebrahimi base map, with an optional addition or removal of another map. Then, for some
717*22dc650dSSadaf Ebrahimi classes, there is some additional tweaking: for [:blank:] the vertical space
718*22dc650dSSadaf Ebrahimi characters are removed, and for [:alpha:] and [:alnum:] the underscore
719*22dc650dSSadaf Ebrahimi character is removed. The triples in the table consist of the base map offset,
720*22dc650dSSadaf Ebrahimi second map offset or -1 if no second map, and a non-negative value for map
721*22dc650dSSadaf Ebrahimi addition or a negative value for map subtraction (if there are two maps). The
722*22dc650dSSadaf Ebrahimi absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
723*22dc650dSSadaf Ebrahimi remove vertical space characters, 2 => remove underscore. */
724*22dc650dSSadaf Ebrahimi
725*22dc650dSSadaf Ebrahimi static const int posix_class_maps[] = {
726*22dc650dSSadaf Ebrahimi cbit_word, cbit_digit, -2, /* alpha */
727*22dc650dSSadaf Ebrahimi cbit_lower, -1, 0, /* lower */
728*22dc650dSSadaf Ebrahimi cbit_upper, -1, 0, /* upper */
729*22dc650dSSadaf Ebrahimi cbit_word, -1, 2, /* alnum - word without underscore */
730*22dc650dSSadaf Ebrahimi cbit_print, cbit_cntrl, 0, /* ascii */
731*22dc650dSSadaf Ebrahimi cbit_space, -1, 1, /* blank - a GNU extension */
732*22dc650dSSadaf Ebrahimi cbit_cntrl, -1, 0, /* cntrl */
733*22dc650dSSadaf Ebrahimi cbit_digit, -1, 0, /* digit */
734*22dc650dSSadaf Ebrahimi cbit_graph, -1, 0, /* graph */
735*22dc650dSSadaf Ebrahimi cbit_print, -1, 0, /* print */
736*22dc650dSSadaf Ebrahimi cbit_punct, -1, 0, /* punct */
737*22dc650dSSadaf Ebrahimi cbit_space, -1, 0, /* space */
738*22dc650dSSadaf Ebrahimi cbit_word, -1, 0, /* word - a Perl extension */
739*22dc650dSSadaf Ebrahimi cbit_xdigit, -1, 0 /* xdigit */
740*22dc650dSSadaf Ebrahimi };
741*22dc650dSSadaf Ebrahimi
742*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
743*22dc650dSSadaf Ebrahimi
744*22dc650dSSadaf Ebrahimi /* The POSIX class Unicode property substitutes that are used in UCP mode must
745*22dc650dSSadaf Ebrahimi be in the order of the POSIX class names, defined above. */
746*22dc650dSSadaf Ebrahimi
747*22dc650dSSadaf Ebrahimi static int posix_substitutes[] = {
748*22dc650dSSadaf Ebrahimi PT_GC, ucp_L, /* alpha */
749*22dc650dSSadaf Ebrahimi PT_PC, ucp_Ll, /* lower */
750*22dc650dSSadaf Ebrahimi PT_PC, ucp_Lu, /* upper */
751*22dc650dSSadaf Ebrahimi PT_ALNUM, 0, /* alnum */
752*22dc650dSSadaf Ebrahimi -1, 0, /* ascii, treat as non-UCP */
753*22dc650dSSadaf Ebrahimi -1, 1, /* blank, treat as \h */
754*22dc650dSSadaf Ebrahimi PT_PC, ucp_Cc, /* cntrl */
755*22dc650dSSadaf Ebrahimi PT_PC, ucp_Nd, /* digit */
756*22dc650dSSadaf Ebrahimi PT_PXGRAPH, 0, /* graph */
757*22dc650dSSadaf Ebrahimi PT_PXPRINT, 0, /* print */
758*22dc650dSSadaf Ebrahimi PT_PXPUNCT, 0, /* punct */
759*22dc650dSSadaf Ebrahimi PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */
760*22dc650dSSadaf Ebrahimi PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */
761*22dc650dSSadaf Ebrahimi PT_PXXDIGIT, 0 /* xdigit */ /* Perl has additional hex digits */
762*22dc650dSSadaf Ebrahimi };
763*22dc650dSSadaf Ebrahimi #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
764*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
765*22dc650dSSadaf Ebrahimi
766*22dc650dSSadaf Ebrahimi /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
767*22dc650dSSadaf Ebrahimi are allowed. */
768*22dc650dSSadaf Ebrahimi
/* Option bits in the literal subset. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED| \
   PCRE2_AUTO_CALLOUT| \
   PCRE2_CASELESS| \
   PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE| \
   PCRE2_LITERAL| \
   PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE| \
   PCRE2_NO_UTF_CHECK| \
   PCRE2_USE_OFFSET_LIMIT| \
   PCRE2_UTF)

/* The full set of option bits: the literal subset plus the rest. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS| \
   PCRE2_ALT_BSUX| \
   PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES| \
   PCRE2_DOLLAR_ENDONLY| \
   PCRE2_DOTALL| \
   PCRE2_DUPNAMES| \
   PCRE2_EXTENDED| \
   PCRE2_EXTENDED_MORE| \
   PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE| \
   PCRE2_NEVER_BACKSLASH_C| \
   PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF| \
   PCRE2_NO_AUTO_CAPTURE| \
   PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR| \
   PCRE2_UCP| \
   PCRE2_UNGREEDY)

/* Extra option bits in the literal subset. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE| \
   PCRE2_EXTRA_MATCH_WORD| \
   PCRE2_EXTRA_CASELESS_RESTRICT)

/* The full set of extra option bits: the literal subset plus the rest. */

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES| \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
   PCRE2_EXTRA_ESCAPED_CR_IS_LF| \
   PCRE2_EXTRA_ALT_BSUX| \
   PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK| \
   PCRE2_EXTRA_ASCII_BSD| \
   PCRE2_EXTRA_ASCII_BSS| \
   PCRE2_EXTRA_ASCII_BSW| \
   PCRE2_EXTRA_ASCII_POSIX| \
   PCRE2_EXTRA_ASCII_DIGIT)
793*22dc650dSSadaf Ebrahimi
794*22dc650dSSadaf Ebrahimi /* Compile time error code numbers. They are given names so that they can more
795*22dc650dSSadaf Ebrahimi easily be tracked. When a new number is added, the tables called eint1 and
796*22dc650dSSadaf Ebrahimi eint2 in pcre2posix.c may need to be updated, and a new error text must be
797*22dc650dSSadaf Ebrahimi added to compile_error_texts in pcre2_error.c. Also, the error codes in
798*22dc650dSSadaf Ebrahimi pcre2.h.in must be updated - their values are exactly 100 greater than these
799*22dc650dSSadaf Ebrahimi values. */
800*22dc650dSSadaf Ebrahimi
801*22dc650dSSadaf Ebrahimi enum { ERR0 = COMPILE_ERROR_BASE,
802*22dc650dSSadaf Ebrahimi ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
803*22dc650dSSadaf Ebrahimi ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
804*22dc650dSSadaf Ebrahimi ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
805*22dc650dSSadaf Ebrahimi ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
806*22dc650dSSadaf Ebrahimi ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
807*22dc650dSSadaf Ebrahimi ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
808*22dc650dSSadaf Ebrahimi ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
809*22dc650dSSadaf Ebrahimi ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
810*22dc650dSSadaf Ebrahimi ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
811*22dc650dSSadaf Ebrahimi ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99, ERR100,
812*22dc650dSSadaf Ebrahimi ERR101 };
813*22dc650dSSadaf Ebrahimi
814*22dc650dSSadaf Ebrahimi /* This is a table of start-of-pattern options such as (*UTF) and settings such
815*22dc650dSSadaf Ebrahimi as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
816*22dc650dSSadaf Ebrahimi compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
817*22dc650dSSadaf Ebrahimi generic and always supported. */
818*22dc650dSSadaf Ebrahimi
/* Kinds of start-of-pattern item; stored in the type field of a pso entry. */

enum {
  PSO_OPT,     /* Value is an option bit */
  PSO_FLG,     /* Value is a flag bit */
  PSO_NL,      /* Value is a newline type */
  PSO_BSR,     /* Value is a \R type */
  PSO_LIMH,    /* Read integer value for heap limit */
  PSO_LIMM,    /* Read integer value for match limit */
  PSO_LIMD     /* Read integer value for depth limit */
};
827*22dc650dSSadaf Ebrahimi
/* One entry of the pso_list[] table. */

typedef struct pso {
  const uint8_t *name;    /* Text of the item */
  uint16_t length;        /* Length of that text */
  uint16_t type;          /* One of the PSO_ values */
  uint32_t value;         /* Interpreted according to type */
} pso;
834*22dc650dSSadaf Ebrahimi
835*22dc650dSSadaf Ebrahimi /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
836*22dc650dSSadaf Ebrahimi
837*22dc650dSSadaf Ebrahimi static const pso pso_list[] = {
838*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
839*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
840*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
841*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
842*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
843*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
844*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
845*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
846*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
847*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
848*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
849*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
850*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },
851*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
852*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
853*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
854*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
855*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },
856*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
857*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
858*22dc650dSSadaf Ebrahimi { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
859*22dc650dSSadaf Ebrahimi };
860*22dc650dSSadaf Ebrahimi
861*22dc650dSSadaf Ebrahimi /* This table is used when converting repeating opcodes into possessified
862*22dc650dSSadaf Ebrahimi versions as a result of an explicit possessive quantifier such as ++. A zero
863*22dc650dSSadaf Ebrahimi value means there is no possessified version - in those cases the item in
864*22dc650dSSadaf Ebrahimi question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
865*22dc650dSSadaf Ebrahimi because all relevant opcodes are less than that. */
866*22dc650dSSadaf Ebrahimi
867*22dc650dSSadaf Ebrahimi static const uint8_t opcode_possessify[] = {
868*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
869*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
870*22dc650dSSadaf Ebrahimi
871*22dc650dSSadaf Ebrahimi 0, /* NOTI */
872*22dc650dSSadaf Ebrahimi OP_POSSTAR, 0, /* STAR, MINSTAR */
873*22dc650dSSadaf Ebrahimi OP_POSPLUS, 0, /* PLUS, MINPLUS */
874*22dc650dSSadaf Ebrahimi OP_POSQUERY, 0, /* QUERY, MINQUERY */
875*22dc650dSSadaf Ebrahimi OP_POSUPTO, 0, /* UPTO, MINUPTO */
876*22dc650dSSadaf Ebrahimi 0, /* EXACT */
877*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
878*22dc650dSSadaf Ebrahimi
879*22dc650dSSadaf Ebrahimi OP_POSSTARI, 0, /* STARI, MINSTARI */
880*22dc650dSSadaf Ebrahimi OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
881*22dc650dSSadaf Ebrahimi OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
882*22dc650dSSadaf Ebrahimi OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
883*22dc650dSSadaf Ebrahimi 0, /* EXACTI */
884*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
885*22dc650dSSadaf Ebrahimi
886*22dc650dSSadaf Ebrahimi OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
887*22dc650dSSadaf Ebrahimi OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
888*22dc650dSSadaf Ebrahimi OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
889*22dc650dSSadaf Ebrahimi OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
890*22dc650dSSadaf Ebrahimi 0, /* NOTEXACT */
891*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
892*22dc650dSSadaf Ebrahimi
893*22dc650dSSadaf Ebrahimi OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
894*22dc650dSSadaf Ebrahimi OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
895*22dc650dSSadaf Ebrahimi OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
896*22dc650dSSadaf Ebrahimi OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
897*22dc650dSSadaf Ebrahimi 0, /* NOTEXACTI */
898*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
899*22dc650dSSadaf Ebrahimi
900*22dc650dSSadaf Ebrahimi OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
901*22dc650dSSadaf Ebrahimi OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
902*22dc650dSSadaf Ebrahimi OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
903*22dc650dSSadaf Ebrahimi OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
904*22dc650dSSadaf Ebrahimi 0, /* TYPEEXACT */
905*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
906*22dc650dSSadaf Ebrahimi
907*22dc650dSSadaf Ebrahimi OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
908*22dc650dSSadaf Ebrahimi OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
909*22dc650dSSadaf Ebrahimi OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
910*22dc650dSSadaf Ebrahimi OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
911*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
912*22dc650dSSadaf Ebrahimi
913*22dc650dSSadaf Ebrahimi 0, 0, 0, /* CLASS, NCLASS, XCLASS */
914*22dc650dSSadaf Ebrahimi 0, 0, /* REF, REFI */
915*22dc650dSSadaf Ebrahimi 0, 0, /* DNREF, DNREFI */
916*22dc650dSSadaf Ebrahimi 0, 0 /* RECURSE, CALLOUT */
917*22dc650dSSadaf Ebrahimi };
918*22dc650dSSadaf Ebrahimi
919*22dc650dSSadaf Ebrahimi
920*22dc650dSSadaf Ebrahimi #ifdef DEBUG_SHOW_PARSED
921*22dc650dSSadaf Ebrahimi /*************************************************
922*22dc650dSSadaf Ebrahimi * Show the parsed pattern for debugging *
923*22dc650dSSadaf Ebrahimi *************************************************/
924*22dc650dSSadaf Ebrahimi
925*22dc650dSSadaf Ebrahimi /* For debugging the pre-scan, this code, which outputs the parsed data vector,
926*22dc650dSSadaf Ebrahimi can be enabled. */
927*22dc650dSSadaf Ebrahimi
show_parsed(compile_block * cb)928*22dc650dSSadaf Ebrahimi static void show_parsed(compile_block *cb)
929*22dc650dSSadaf Ebrahimi {
930*22dc650dSSadaf Ebrahimi uint32_t *pptr = cb->parsed_pattern;
931*22dc650dSSadaf Ebrahimi
932*22dc650dSSadaf Ebrahimi for (;;)
933*22dc650dSSadaf Ebrahimi {
934*22dc650dSSadaf Ebrahimi int max, min;
935*22dc650dSSadaf Ebrahimi PCRE2_SIZE offset;
936*22dc650dSSadaf Ebrahimi uint32_t i;
937*22dc650dSSadaf Ebrahimi uint32_t length;
938*22dc650dSSadaf Ebrahimi uint32_t meta_arg = META_DATA(*pptr);
939*22dc650dSSadaf Ebrahimi
940*22dc650dSSadaf Ebrahimi fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);
941*22dc650dSSadaf Ebrahimi
942*22dc650dSSadaf Ebrahimi if (*pptr < META_END)
943*22dc650dSSadaf Ebrahimi {
944*22dc650dSSadaf Ebrahimi if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
945*22dc650dSSadaf Ebrahimi pptr++;
946*22dc650dSSadaf Ebrahimi }
947*22dc650dSSadaf Ebrahimi
948*22dc650dSSadaf Ebrahimi else switch (META_CODE(*pptr++))
949*22dc650dSSadaf Ebrahimi {
950*22dc650dSSadaf Ebrahimi default:
951*22dc650dSSadaf Ebrahimi fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
952*22dc650dSSadaf Ebrahimi return;
953*22dc650dSSadaf Ebrahimi
954*22dc650dSSadaf Ebrahimi case META_END:
955*22dc650dSSadaf Ebrahimi fprintf(stderr, "META_END\n");
956*22dc650dSSadaf Ebrahimi return;
957*22dc650dSSadaf Ebrahimi
958*22dc650dSSadaf Ebrahimi case META_CAPTURE:
959*22dc650dSSadaf Ebrahimi fprintf(stderr, "META_CAPTURE %d", meta_arg);
960*22dc650dSSadaf Ebrahimi break;
961*22dc650dSSadaf Ebrahimi
962*22dc650dSSadaf Ebrahimi case META_RECURSE:
963*22dc650dSSadaf Ebrahimi GETOFFSET(offset, pptr);
964*22dc650dSSadaf Ebrahimi fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
965*22dc650dSSadaf Ebrahimi break;
966*22dc650dSSadaf Ebrahimi
967*22dc650dSSadaf Ebrahimi case META_BACKREF:
968*22dc650dSSadaf Ebrahimi if (meta_arg < 10)
969*22dc650dSSadaf Ebrahimi offset = cb->small_ref_offset[meta_arg];
970*22dc650dSSadaf Ebrahimi else
971*22dc650dSSadaf Ebrahimi GETOFFSET(offset, pptr);
972*22dc650dSSadaf Ebrahimi fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
973*22dc650dSSadaf Ebrahimi break;
974*22dc650dSSadaf Ebrahimi
975*22dc650dSSadaf Ebrahimi case META_ESCAPE:
976*22dc650dSSadaf Ebrahimi if (meta_arg == ESC_P || meta_arg == ESC_p)
977*22dc650dSSadaf Ebrahimi {
978*22dc650dSSadaf Ebrahimi uint32_t ptype = *pptr >> 16;
979*22dc650dSSadaf Ebrahimi uint32_t pvalue = *pptr++ & 0xffff;
980*22dc650dSSadaf Ebrahimi fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
981*22dc650dSSadaf Ebrahimi ptype, pvalue);
982*22dc650dSSadaf Ebrahimi }
983*22dc650dSSadaf Ebrahimi else
984*22dc650dSSadaf Ebrahimi {
985*22dc650dSSadaf Ebrahimi uint32_t cc;
986*22dc650dSSadaf Ebrahimi /* There's just one escape we might have here that isn't negated in the
987*22dc650dSSadaf Ebrahimi escapes table. */
988*22dc650dSSadaf Ebrahimi if (meta_arg == ESC_g) cc = CHAR_g;
989*22dc650dSSadaf Ebrahimi else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
990*22dc650dSSadaf Ebrahimi {
991*22dc650dSSadaf Ebrahimi if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
992*22dc650dSSadaf Ebrahimi }
993*22dc650dSSadaf Ebrahimi if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
994*22dc650dSSadaf Ebrahimi fprintf(stderr, "META \\%c", cc);
995*22dc650dSSadaf Ebrahimi }
996*22dc650dSSadaf Ebrahimi break;
997*22dc650dSSadaf Ebrahimi
998*22dc650dSSadaf Ebrahimi case META_MINMAX:
999*22dc650dSSadaf Ebrahimi min = *pptr++;
1000*22dc650dSSadaf Ebrahimi max = *pptr++;
1001*22dc650dSSadaf Ebrahimi if (max != REPEAT_UNLIMITED)
1002*22dc650dSSadaf Ebrahimi fprintf(stderr, "META {%d,%d}", min, max);
1003*22dc650dSSadaf Ebrahimi else
1004*22dc650dSSadaf Ebrahimi fprintf(stderr, "META {%d,}", min);
1005*22dc650dSSadaf Ebrahimi break;
1006*22dc650dSSadaf Ebrahimi
1007*22dc650dSSadaf Ebrahimi case META_MINMAX_QUERY:
1008*22dc650dSSadaf Ebrahimi min = *pptr++;
1009*22dc650dSSadaf Ebrahimi max = *pptr++;
1010*22dc650dSSadaf Ebrahimi if (max != REPEAT_UNLIMITED)
1011*22dc650dSSadaf Ebrahimi fprintf(stderr, "META {%d,%d}?", min, max);
1012*22dc650dSSadaf Ebrahimi else
1013*22dc650dSSadaf Ebrahimi fprintf(stderr, "META {%d,}?", min);
1014*22dc650dSSadaf Ebrahimi break;
1015*22dc650dSSadaf Ebrahimi
1016*22dc650dSSadaf Ebrahimi case META_MINMAX_PLUS:
1017*22dc650dSSadaf Ebrahimi min = *pptr++;
1018*22dc650dSSadaf Ebrahimi max = *pptr++;
1019*22dc650dSSadaf Ebrahimi if (max != REPEAT_UNLIMITED)
1020*22dc650dSSadaf Ebrahimi fprintf(stderr, "META {%d,%d}+", min, max);
1021*22dc650dSSadaf Ebrahimi else
1022*22dc650dSSadaf Ebrahimi fprintf(stderr, "META {%d,}+", min);
1023*22dc650dSSadaf Ebrahimi break;
1024*22dc650dSSadaf Ebrahimi
1025*22dc650dSSadaf Ebrahimi case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
1026*22dc650dSSadaf Ebrahimi case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
1027*22dc650dSSadaf Ebrahimi case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
1028*22dc650dSSadaf Ebrahimi case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
1029*22dc650dSSadaf Ebrahimi case META_DOT: fprintf(stderr, "META_DOT"); break;
1030*22dc650dSSadaf Ebrahimi case META_ASTERISK: fprintf(stderr, "META *"); break;
1031*22dc650dSSadaf Ebrahimi case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
1032*22dc650dSSadaf Ebrahimi case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
1033*22dc650dSSadaf Ebrahimi case META_PLUS: fprintf(stderr, "META +"); break;
1034*22dc650dSSadaf Ebrahimi case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
1035*22dc650dSSadaf Ebrahimi case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
1036*22dc650dSSadaf Ebrahimi case META_QUERY: fprintf(stderr, "META ?"); break;
1037*22dc650dSSadaf Ebrahimi case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
1038*22dc650dSSadaf Ebrahimi case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;
1039*22dc650dSSadaf Ebrahimi
1040*22dc650dSSadaf Ebrahimi case META_ATOMIC: fprintf(stderr, "META (?>"); break;
1041*22dc650dSSadaf Ebrahimi case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
1042*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
1043*22dc650dSSadaf Ebrahimi case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
1044*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
1045*22dc650dSSadaf Ebrahimi case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
1046*22dc650dSSadaf Ebrahimi case META_KET: fprintf(stderr, "META )"); break;
1047*22dc650dSSadaf Ebrahimi case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
1048*22dc650dSSadaf Ebrahimi
1049*22dc650dSSadaf Ebrahimi case META_CLASS: fprintf(stderr, "META ["); break;
1050*22dc650dSSadaf Ebrahimi case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
1051*22dc650dSSadaf Ebrahimi case META_CLASS_END: fprintf(stderr, "META ]"); break;
1052*22dc650dSSadaf Ebrahimi case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
1053*22dc650dSSadaf Ebrahimi case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;
1054*22dc650dSSadaf Ebrahimi
1055*22dc650dSSadaf Ebrahimi case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
1056*22dc650dSSadaf Ebrahimi case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;
1057*22dc650dSSadaf Ebrahimi
1058*22dc650dSSadaf Ebrahimi case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
1059*22dc650dSSadaf Ebrahimi case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;
1060*22dc650dSSadaf Ebrahimi
1061*22dc650dSSadaf Ebrahimi case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
1062*22dc650dSSadaf Ebrahimi case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
1063*22dc650dSSadaf Ebrahimi case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
1064*22dc650dSSadaf Ebrahimi case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
1065*22dc650dSSadaf Ebrahimi case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
1066*22dc650dSSadaf Ebrahimi case META_THEN: fprintf(stderr, "META (*THEN)"); break;
1067*22dc650dSSadaf Ebrahimi
1068*22dc650dSSadaf Ebrahimi case META_OPTIONS:
1069*22dc650dSSadaf Ebrahimi fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
1070*22dc650dSSadaf Ebrahimi pptr += 2;
1071*22dc650dSSadaf Ebrahimi break;
1072*22dc650dSSadaf Ebrahimi
1073*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND:
1074*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (?<= %d %d", meta_arg, *pptr);
1075*22dc650dSSadaf Ebrahimi pptr += 2;
1076*22dc650dSSadaf Ebrahimi break;
1077*22dc650dSSadaf Ebrahimi
1078*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND_NA:
1079*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (*naplb: %d %d", meta_arg, *pptr);
1080*22dc650dSSadaf Ebrahimi pptr += 2;
1081*22dc650dSSadaf Ebrahimi break;
1082*22dc650dSSadaf Ebrahimi
1083*22dc650dSSadaf Ebrahimi case META_LOOKBEHINDNOT:
1084*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (?<! %d %d", meta_arg, *pptr);
1085*22dc650dSSadaf Ebrahimi pptr += 2;
1086*22dc650dSSadaf Ebrahimi break;
1087*22dc650dSSadaf Ebrahimi
1088*22dc650dSSadaf Ebrahimi case META_CALLOUT_NUMBER:
1089*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
1090*22dc650dSSadaf Ebrahimi pptr[1]);
1091*22dc650dSSadaf Ebrahimi pptr += 3;
1092*22dc650dSSadaf Ebrahimi break;
1093*22dc650dSSadaf Ebrahimi
1094*22dc650dSSadaf Ebrahimi case META_CALLOUT_STRING:
1095*22dc650dSSadaf Ebrahimi {
1096*22dc650dSSadaf Ebrahimi uint32_t patoffset = *pptr++; /* Offset of next pattern item */
1097*22dc650dSSadaf Ebrahimi uint32_t patlength = *pptr++; /* Length of next pattern item */
1098*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
1099*22dc650dSSadaf Ebrahimi GETOFFSET(offset, pptr);
1100*22dc650dSSadaf Ebrahimi fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
1101*22dc650dSSadaf Ebrahimi }
1102*22dc650dSSadaf Ebrahimi break;
1103*22dc650dSSadaf Ebrahimi
1104*22dc650dSSadaf Ebrahimi case META_RECURSE_BYNAME:
1105*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
1106*22dc650dSSadaf Ebrahimi GETOFFSET(offset, pptr);
1107*22dc650dSSadaf Ebrahimi fprintf(stderr, "%zd", offset);
1108*22dc650dSSadaf Ebrahimi break;
1109*22dc650dSSadaf Ebrahimi
1110*22dc650dSSadaf Ebrahimi case META_BACKREF_BYNAME:
1111*22dc650dSSadaf Ebrahimi fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
1112*22dc650dSSadaf Ebrahimi GETOFFSET(offset, pptr);
1113*22dc650dSSadaf Ebrahimi fprintf(stderr, "%zd", offset);
1114*22dc650dSSadaf Ebrahimi break;
1115*22dc650dSSadaf Ebrahimi
1116*22dc650dSSadaf Ebrahimi case META_COND_NUMBER:
1117*22dc650dSSadaf Ebrahimi fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
1118*22dc650dSSadaf Ebrahimi GETOFFSET(offset, pptr);
1119*22dc650dSSadaf Ebrahimi fprintf(stderr, "%zd", offset);
1120*22dc650dSSadaf Ebrahimi pptr++;
1121*22dc650dSSadaf Ebrahimi break;
1122*22dc650dSSadaf Ebrahimi
1123*22dc650dSSadaf Ebrahimi case META_COND_DEFINE:
1124*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (?(DEFINE) offset=");
1125*22dc650dSSadaf Ebrahimi GETOFFSET(offset, pptr);
1126*22dc650dSSadaf Ebrahimi fprintf(stderr, "%zd", offset);
1127*22dc650dSSadaf Ebrahimi break;
1128*22dc650dSSadaf Ebrahimi
1129*22dc650dSSadaf Ebrahimi case META_COND_VERSION:
1130*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
1131*22dc650dSSadaf Ebrahimi fprintf(stderr, "%d.", *pptr++);
1132*22dc650dSSadaf Ebrahimi fprintf(stderr, "%d)", *pptr++);
1133*22dc650dSSadaf Ebrahimi break;
1134*22dc650dSSadaf Ebrahimi
1135*22dc650dSSadaf Ebrahimi case META_COND_NAME:
1136*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
1137*22dc650dSSadaf Ebrahimi GETOFFSET(offset, pptr);
1138*22dc650dSSadaf Ebrahimi fprintf(stderr, "%zd", offset);
1139*22dc650dSSadaf Ebrahimi break;
1140*22dc650dSSadaf Ebrahimi
1141*22dc650dSSadaf Ebrahimi case META_COND_RNAME:
1142*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
1143*22dc650dSSadaf Ebrahimi GETOFFSET(offset, pptr);
1144*22dc650dSSadaf Ebrahimi fprintf(stderr, "%zd", offset);
1145*22dc650dSSadaf Ebrahimi break;
1146*22dc650dSSadaf Ebrahimi
1147*22dc650dSSadaf Ebrahimi /* This is kept as a name, because it might be. */
1148*22dc650dSSadaf Ebrahimi
1149*22dc650dSSadaf Ebrahimi case META_COND_RNUMBER:
1150*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
1151*22dc650dSSadaf Ebrahimi GETOFFSET(offset, pptr);
1152*22dc650dSSadaf Ebrahimi fprintf(stderr, "%zd", offset);
1153*22dc650dSSadaf Ebrahimi break;
1154*22dc650dSSadaf Ebrahimi
1155*22dc650dSSadaf Ebrahimi case META_MARK:
1156*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (*MARK:");
1157*22dc650dSSadaf Ebrahimi goto SHOWARG;
1158*22dc650dSSadaf Ebrahimi
1159*22dc650dSSadaf Ebrahimi case META_COMMIT_ARG:
1160*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (*COMMIT:");
1161*22dc650dSSadaf Ebrahimi goto SHOWARG;
1162*22dc650dSSadaf Ebrahimi
1163*22dc650dSSadaf Ebrahimi case META_PRUNE_ARG:
1164*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (*PRUNE:");
1165*22dc650dSSadaf Ebrahimi goto SHOWARG;
1166*22dc650dSSadaf Ebrahimi
1167*22dc650dSSadaf Ebrahimi case META_SKIP_ARG:
1168*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (*SKIP:");
1169*22dc650dSSadaf Ebrahimi goto SHOWARG;
1170*22dc650dSSadaf Ebrahimi
1171*22dc650dSSadaf Ebrahimi case META_THEN_ARG:
1172*22dc650dSSadaf Ebrahimi fprintf(stderr, "META (*THEN:");
1173*22dc650dSSadaf Ebrahimi SHOWARG:
1174*22dc650dSSadaf Ebrahimi length = *pptr++;
1175*22dc650dSSadaf Ebrahimi for (i = 0; i < length; i++)
1176*22dc650dSSadaf Ebrahimi {
1177*22dc650dSSadaf Ebrahimi uint32_t cc = *pptr++;
1178*22dc650dSSadaf Ebrahimi if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
1179*22dc650dSSadaf Ebrahimi else fprintf(stderr, "\\x{%x}", cc);
1180*22dc650dSSadaf Ebrahimi }
1181*22dc650dSSadaf Ebrahimi fprintf(stderr, ") length=%u", length);
1182*22dc650dSSadaf Ebrahimi break;
1183*22dc650dSSadaf Ebrahimi }
1184*22dc650dSSadaf Ebrahimi fprintf(stderr, "\n");
1185*22dc650dSSadaf Ebrahimi }
1186*22dc650dSSadaf Ebrahimi return;
1187*22dc650dSSadaf Ebrahimi }
1188*22dc650dSSadaf Ebrahimi #endif /* DEBUG_SHOW_PARSED */
1189*22dc650dSSadaf Ebrahimi
1190*22dc650dSSadaf Ebrahimi
1191*22dc650dSSadaf Ebrahimi
1192*22dc650dSSadaf Ebrahimi /*************************************************
1193*22dc650dSSadaf Ebrahimi * Copy compiled code *
1194*22dc650dSSadaf Ebrahimi *************************************************/
1195*22dc650dSSadaf Ebrahimi
1196*22dc650dSSadaf Ebrahimi /* Compiled JIT code cannot be copied, so the new compiled block has no
1197*22dc650dSSadaf Ebrahimi associated JIT data. */
1198*22dc650dSSadaf Ebrahimi
1199*22dc650dSSadaf Ebrahimi PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1200*22dc650dSSadaf Ebrahimi pcre2_code_copy(const pcre2_code *code)
1201*22dc650dSSadaf Ebrahimi {
1202*22dc650dSSadaf Ebrahimi PCRE2_SIZE* ref_count;
1203*22dc650dSSadaf Ebrahimi pcre2_code *newcode;
1204*22dc650dSSadaf Ebrahimi
1205*22dc650dSSadaf Ebrahimi if (code == NULL) return NULL;
1206*22dc650dSSadaf Ebrahimi newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1207*22dc650dSSadaf Ebrahimi if (newcode == NULL) return NULL;
1208*22dc650dSSadaf Ebrahimi memcpy(newcode, code, code->blocksize);
1209*22dc650dSSadaf Ebrahimi newcode->executable_jit = NULL;
1210*22dc650dSSadaf Ebrahimi
1211*22dc650dSSadaf Ebrahimi /* If the code is one that has been deserialized, increment the reference count
1212*22dc650dSSadaf Ebrahimi in the decoded tables. */
1213*22dc650dSSadaf Ebrahimi
1214*22dc650dSSadaf Ebrahimi if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1215*22dc650dSSadaf Ebrahimi {
1216*22dc650dSSadaf Ebrahimi ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1217*22dc650dSSadaf Ebrahimi (*ref_count)++;
1218*22dc650dSSadaf Ebrahimi }
1219*22dc650dSSadaf Ebrahimi
1220*22dc650dSSadaf Ebrahimi return newcode;
1221*22dc650dSSadaf Ebrahimi }
1222*22dc650dSSadaf Ebrahimi
1223*22dc650dSSadaf Ebrahimi
1224*22dc650dSSadaf Ebrahimi
1225*22dc650dSSadaf Ebrahimi /*************************************************
1226*22dc650dSSadaf Ebrahimi * Copy compiled code and character tables *
1227*22dc650dSSadaf Ebrahimi *************************************************/
1228*22dc650dSSadaf Ebrahimi
1229*22dc650dSSadaf Ebrahimi /* Compiled JIT code cannot be copied, so the new compiled block has no
1230*22dc650dSSadaf Ebrahimi associated JIT data. This version of code_copy also makes a separate copy of
1231*22dc650dSSadaf Ebrahimi the character tables. */
1232*22dc650dSSadaf Ebrahimi
1233*22dc650dSSadaf Ebrahimi PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1234*22dc650dSSadaf Ebrahimi pcre2_code_copy_with_tables(const pcre2_code *code)
1235*22dc650dSSadaf Ebrahimi {
1236*22dc650dSSadaf Ebrahimi PCRE2_SIZE* ref_count;
1237*22dc650dSSadaf Ebrahimi pcre2_code *newcode;
1238*22dc650dSSadaf Ebrahimi uint8_t *newtables;
1239*22dc650dSSadaf Ebrahimi
1240*22dc650dSSadaf Ebrahimi if (code == NULL) return NULL;
1241*22dc650dSSadaf Ebrahimi newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1242*22dc650dSSadaf Ebrahimi if (newcode == NULL) return NULL;
1243*22dc650dSSadaf Ebrahimi memcpy(newcode, code, code->blocksize);
1244*22dc650dSSadaf Ebrahimi newcode->executable_jit = NULL;
1245*22dc650dSSadaf Ebrahimi
1246*22dc650dSSadaf Ebrahimi newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1247*22dc650dSSadaf Ebrahimi code->memctl.memory_data);
1248*22dc650dSSadaf Ebrahimi if (newtables == NULL)
1249*22dc650dSSadaf Ebrahimi {
1250*22dc650dSSadaf Ebrahimi code->memctl.free((void *)newcode, code->memctl.memory_data);
1251*22dc650dSSadaf Ebrahimi return NULL;
1252*22dc650dSSadaf Ebrahimi }
1253*22dc650dSSadaf Ebrahimi memcpy(newtables, code->tables, TABLES_LENGTH);
1254*22dc650dSSadaf Ebrahimi ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1255*22dc650dSSadaf Ebrahimi *ref_count = 1;
1256*22dc650dSSadaf Ebrahimi
1257*22dc650dSSadaf Ebrahimi newcode->tables = newtables;
1258*22dc650dSSadaf Ebrahimi newcode->flags |= PCRE2_DEREF_TABLES;
1259*22dc650dSSadaf Ebrahimi return newcode;
1260*22dc650dSSadaf Ebrahimi }
1261*22dc650dSSadaf Ebrahimi
1262*22dc650dSSadaf Ebrahimi
1263*22dc650dSSadaf Ebrahimi
1264*22dc650dSSadaf Ebrahimi /*************************************************
1265*22dc650dSSadaf Ebrahimi * Free compiled code *
1266*22dc650dSSadaf Ebrahimi *************************************************/
1267*22dc650dSSadaf Ebrahimi
1268*22dc650dSSadaf Ebrahimi PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1269*22dc650dSSadaf Ebrahimi pcre2_code_free(pcre2_code *code)
1270*22dc650dSSadaf Ebrahimi {
1271*22dc650dSSadaf Ebrahimi PCRE2_SIZE* ref_count;
1272*22dc650dSSadaf Ebrahimi
1273*22dc650dSSadaf Ebrahimi if (code != NULL)
1274*22dc650dSSadaf Ebrahimi {
1275*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_JIT
1276*22dc650dSSadaf Ebrahimi if (code->executable_jit != NULL)
1277*22dc650dSSadaf Ebrahimi PRIV(jit_free)(code->executable_jit, &code->memctl);
1278*22dc650dSSadaf Ebrahimi #endif
1279*22dc650dSSadaf Ebrahimi
1280*22dc650dSSadaf Ebrahimi if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1281*22dc650dSSadaf Ebrahimi {
1282*22dc650dSSadaf Ebrahimi /* Decoded tables belong to the codes after deserialization, and they must
1283*22dc650dSSadaf Ebrahimi be freed when there are no more references to them. The *ref_count should
1284*22dc650dSSadaf Ebrahimi always be > 0. */
1285*22dc650dSSadaf Ebrahimi
1286*22dc650dSSadaf Ebrahimi ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1287*22dc650dSSadaf Ebrahimi if (*ref_count > 0)
1288*22dc650dSSadaf Ebrahimi {
1289*22dc650dSSadaf Ebrahimi (*ref_count)--;
1290*22dc650dSSadaf Ebrahimi if (*ref_count == 0)
1291*22dc650dSSadaf Ebrahimi code->memctl.free((void *)code->tables, code->memctl.memory_data);
1292*22dc650dSSadaf Ebrahimi }
1293*22dc650dSSadaf Ebrahimi }
1294*22dc650dSSadaf Ebrahimi
1295*22dc650dSSadaf Ebrahimi code->memctl.free(code, code->memctl.memory_data);
1296*22dc650dSSadaf Ebrahimi }
1297*22dc650dSSadaf Ebrahimi }
1298*22dc650dSSadaf Ebrahimi
1299*22dc650dSSadaf Ebrahimi
1300*22dc650dSSadaf Ebrahimi
1301*22dc650dSSadaf Ebrahimi /*************************************************
1302*22dc650dSSadaf Ebrahimi * Read a number, possibly signed *
1303*22dc650dSSadaf Ebrahimi *************************************************/
1304*22dc650dSSadaf Ebrahimi
1305*22dc650dSSadaf Ebrahimi /* This function is used to read numbers in the pattern. The initial pointer
1306*22dc650dSSadaf Ebrahimi must be at the sign or first digit of the number. When relative values
1307*22dc650dSSadaf Ebrahimi (introduced by + or -) are allowed, they are relative group numbers, and the
1308*22dc650dSSadaf Ebrahimi result must be greater than zero.
1309*22dc650dSSadaf Ebrahimi
1310*22dc650dSSadaf Ebrahimi Arguments:
1311*22dc650dSSadaf Ebrahimi ptrptr points to the character pointer variable
1312*22dc650dSSadaf Ebrahimi ptrend points to the end of the input string
1313*22dc650dSSadaf Ebrahimi allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1314*22dc650dSSadaf Ebrahimi max_value the largest number allowed
1315*22dc650dSSadaf Ebrahimi max_error the error to give for an over-large number
1316*22dc650dSSadaf Ebrahimi intptr where to put the result
1317*22dc650dSSadaf Ebrahimi errcodeptr where to put an error code
1318*22dc650dSSadaf Ebrahimi
1319*22dc650dSSadaf Ebrahimi Returns: TRUE - a number was read
1320*22dc650dSSadaf Ebrahimi FALSE - errorcode == 0 => no number was found
1321*22dc650dSSadaf Ebrahimi errorcode != 0 => an error occurred
1322*22dc650dSSadaf Ebrahimi */
1323*22dc650dSSadaf Ebrahimi
1324*22dc650dSSadaf Ebrahimi static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1325*22dc650dSSadaf Ebrahimi read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1326*22dc650dSSadaf Ebrahimi uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1327*22dc650dSSadaf Ebrahimi {
1328*22dc650dSSadaf Ebrahimi int sign = 0;
1329*22dc650dSSadaf Ebrahimi uint32_t n = 0;
1330*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr = *ptrptr;
1331*22dc650dSSadaf Ebrahimi BOOL yield = FALSE;
1332*22dc650dSSadaf Ebrahimi
1333*22dc650dSSadaf Ebrahimi *errorcodeptr = 0;
1334*22dc650dSSadaf Ebrahimi
1335*22dc650dSSadaf Ebrahimi if (allow_sign >= 0 && ptr < ptrend)
1336*22dc650dSSadaf Ebrahimi {
1337*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_PLUS)
1338*22dc650dSSadaf Ebrahimi {
1339*22dc650dSSadaf Ebrahimi sign = +1;
1340*22dc650dSSadaf Ebrahimi max_value -= allow_sign;
1341*22dc650dSSadaf Ebrahimi ptr++;
1342*22dc650dSSadaf Ebrahimi }
1343*22dc650dSSadaf Ebrahimi else if (*ptr == CHAR_MINUS)
1344*22dc650dSSadaf Ebrahimi {
1345*22dc650dSSadaf Ebrahimi sign = -1;
1346*22dc650dSSadaf Ebrahimi ptr++;
1347*22dc650dSSadaf Ebrahimi }
1348*22dc650dSSadaf Ebrahimi }
1349*22dc650dSSadaf Ebrahimi
1350*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1351*22dc650dSSadaf Ebrahimi while (ptr < ptrend && IS_DIGIT(*ptr))
1352*22dc650dSSadaf Ebrahimi {
1353*22dc650dSSadaf Ebrahimi n = n * 10 + *ptr++ - CHAR_0;
1354*22dc650dSSadaf Ebrahimi if (n > max_value)
1355*22dc650dSSadaf Ebrahimi {
1356*22dc650dSSadaf Ebrahimi *errorcodeptr = max_error;
1357*22dc650dSSadaf Ebrahimi goto EXIT;
1358*22dc650dSSadaf Ebrahimi }
1359*22dc650dSSadaf Ebrahimi }
1360*22dc650dSSadaf Ebrahimi
1361*22dc650dSSadaf Ebrahimi if (allow_sign >= 0 && sign != 0)
1362*22dc650dSSadaf Ebrahimi {
1363*22dc650dSSadaf Ebrahimi if (n == 0)
1364*22dc650dSSadaf Ebrahimi {
1365*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1366*22dc650dSSadaf Ebrahimi goto EXIT;
1367*22dc650dSSadaf Ebrahimi }
1368*22dc650dSSadaf Ebrahimi
1369*22dc650dSSadaf Ebrahimi if (sign > 0) n += allow_sign;
1370*22dc650dSSadaf Ebrahimi else if ((int)n > allow_sign)
1371*22dc650dSSadaf Ebrahimi {
1372*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR15; /* Non-existent subpattern */
1373*22dc650dSSadaf Ebrahimi goto EXIT;
1374*22dc650dSSadaf Ebrahimi }
1375*22dc650dSSadaf Ebrahimi else n = allow_sign + 1 - n;
1376*22dc650dSSadaf Ebrahimi }
1377*22dc650dSSadaf Ebrahimi
1378*22dc650dSSadaf Ebrahimi yield = TRUE;
1379*22dc650dSSadaf Ebrahimi
1380*22dc650dSSadaf Ebrahimi EXIT:
1381*22dc650dSSadaf Ebrahimi *intptr = n;
1382*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
1383*22dc650dSSadaf Ebrahimi return yield;
1384*22dc650dSSadaf Ebrahimi }
1385*22dc650dSSadaf Ebrahimi
1386*22dc650dSSadaf Ebrahimi
1387*22dc650dSSadaf Ebrahimi
1388*22dc650dSSadaf Ebrahimi /*************************************************
1389*22dc650dSSadaf Ebrahimi * Read repeat counts *
1390*22dc650dSSadaf Ebrahimi *************************************************/
1391*22dc650dSSadaf Ebrahimi
1392*22dc650dSSadaf Ebrahimi /* Read an item of the form {n,m} and return the values when non-NULL pointers
1393*22dc650dSSadaf Ebrahimi are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1394*22dc650dSSadaf Ebrahimi larger value is used for "unlimited". We have to use signed arguments for
1395*22dc650dSSadaf Ebrahimi read_number() because it is capable of returning a signed value. As of Perl
1396*22dc650dSSadaf Ebrahimi 5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1397*22dc650dSSadaf Ebrahimi tabs after { and before } and between the numbers and the comma, so we do too.
1398*22dc650dSSadaf Ebrahimi
1399*22dc650dSSadaf Ebrahimi Arguments:
1400*22dc650dSSadaf Ebrahimi ptrptr points to pointer to character after '{'
1401*22dc650dSSadaf Ebrahimi ptrend pointer to end of input
1402*22dc650dSSadaf Ebrahimi minp if not NULL, pointer to int for min
1403*22dc650dSSadaf Ebrahimi maxp if not NULL, pointer to int for max
1404*22dc650dSSadaf Ebrahimi errorcodeptr points to error code variable
1405*22dc650dSSadaf Ebrahimi
1406*22dc650dSSadaf Ebrahimi Returns: FALSE if not a repeat quantifier, errorcode set zero
1407*22dc650dSSadaf Ebrahimi FALSE on error, with errorcode set non-zero
1408*22dc650dSSadaf Ebrahimi TRUE on success, with pointer updated to point after '}'
1409*22dc650dSSadaf Ebrahimi */
1410*22dc650dSSadaf Ebrahimi
1411*22dc650dSSadaf Ebrahimi static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1412*22dc650dSSadaf Ebrahimi read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1413*22dc650dSSadaf Ebrahimi uint32_t *maxp, int *errorcodeptr)
1414*22dc650dSSadaf Ebrahimi {
1415*22dc650dSSadaf Ebrahimi PCRE2_SPTR p = *ptrptr;
1416*22dc650dSSadaf Ebrahimi PCRE2_SPTR pp;
1417*22dc650dSSadaf Ebrahimi BOOL yield = FALSE;
1418*22dc650dSSadaf Ebrahimi BOOL had_minimum = FALSE;
1419*22dc650dSSadaf Ebrahimi int32_t min = 0;
1420*22dc650dSSadaf Ebrahimi int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1421*22dc650dSSadaf Ebrahimi
1422*22dc650dSSadaf Ebrahimi *errorcodeptr = 0;
1423*22dc650dSSadaf Ebrahimi while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1424*22dc650dSSadaf Ebrahimi
1425*22dc650dSSadaf Ebrahimi /* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1426*22dc650dSSadaf Ebrahimi such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1427*22dc650dSSadaf Ebrahimi error. */
1428*22dc650dSSadaf Ebrahimi
1429*22dc650dSSadaf Ebrahimi pp = p;
1430*22dc650dSSadaf Ebrahimi if (pp < ptrend && IS_DIGIT(*pp))
1431*22dc650dSSadaf Ebrahimi {
1432*22dc650dSSadaf Ebrahimi had_minimum = TRUE;
1433*22dc650dSSadaf Ebrahimi while (++pp < ptrend && IS_DIGIT(*pp)) {}
1434*22dc650dSSadaf Ebrahimi }
1435*22dc650dSSadaf Ebrahimi
1436*22dc650dSSadaf Ebrahimi while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1437*22dc650dSSadaf Ebrahimi if (pp >= ptrend) return FALSE;
1438*22dc650dSSadaf Ebrahimi
1439*22dc650dSSadaf Ebrahimi if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1440*22dc650dSSadaf Ebrahimi {
1441*22dc650dSSadaf Ebrahimi if (!had_minimum) return FALSE;
1442*22dc650dSSadaf Ebrahimi }
1443*22dc650dSSadaf Ebrahimi else
1444*22dc650dSSadaf Ebrahimi {
1445*22dc650dSSadaf Ebrahimi if (*pp++ != CHAR_COMMA) return FALSE;
1446*22dc650dSSadaf Ebrahimi while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1447*22dc650dSSadaf Ebrahimi if (pp >= ptrend) return FALSE;
1448*22dc650dSSadaf Ebrahimi if (IS_DIGIT(*pp))
1449*22dc650dSSadaf Ebrahimi {
1450*22dc650dSSadaf Ebrahimi while (++pp < ptrend && IS_DIGIT(*pp)) {}
1451*22dc650dSSadaf Ebrahimi }
1452*22dc650dSSadaf Ebrahimi else if (!had_minimum) return FALSE;
1453*22dc650dSSadaf Ebrahimi while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1454*22dc650dSSadaf Ebrahimi if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1455*22dc650dSSadaf Ebrahimi }
1456*22dc650dSSadaf Ebrahimi
1457*22dc650dSSadaf Ebrahimi /* Now process the quantifier for real. We know it must be {n} or (n,} or {,m}
1458*22dc650dSSadaf Ebrahimi or {n,m}. The only error that read_number() can return is for a number that is
1459*22dc650dSSadaf Ebrahimi too big. If *errorcodeptr is returned as zero it means no number was found. */
1460*22dc650dSSadaf Ebrahimi
1461*22dc650dSSadaf Ebrahimi /* Deal with {,m} or n too big. If we successfully read m there is no need to
1462*22dc650dSSadaf Ebrahimi check m >= n because n defaults to zero. */
1463*22dc650dSSadaf Ebrahimi
1464*22dc650dSSadaf Ebrahimi if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1465*22dc650dSSadaf Ebrahimi {
1466*22dc650dSSadaf Ebrahimi if (*errorcodeptr != 0) goto EXIT; /* n too big */
1467*22dc650dSSadaf Ebrahimi p++; /* Skip comma and subsequent spaces */
1468*22dc650dSSadaf Ebrahimi while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1469*22dc650dSSadaf Ebrahimi if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1470*22dc650dSSadaf Ebrahimi {
1471*22dc650dSSadaf Ebrahimi if (*errorcodeptr != 0) goto EXIT; /* m too big */
1472*22dc650dSSadaf Ebrahimi }
1473*22dc650dSSadaf Ebrahimi }
1474*22dc650dSSadaf Ebrahimi
1475*22dc650dSSadaf Ebrahimi /* Have read one number. Deal with {n} or {n,} or {n,m} */
1476*22dc650dSSadaf Ebrahimi
1477*22dc650dSSadaf Ebrahimi else
1478*22dc650dSSadaf Ebrahimi {
1479*22dc650dSSadaf Ebrahimi while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1480*22dc650dSSadaf Ebrahimi if (*p == CHAR_RIGHT_CURLY_BRACKET)
1481*22dc650dSSadaf Ebrahimi {
1482*22dc650dSSadaf Ebrahimi max = min;
1483*22dc650dSSadaf Ebrahimi }
1484*22dc650dSSadaf Ebrahimi else /* Handle {n,} or {n,m} */
1485*22dc650dSSadaf Ebrahimi {
1486*22dc650dSSadaf Ebrahimi p++; /* Skip comma and subsequent spaces */
1487*22dc650dSSadaf Ebrahimi while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1488*22dc650dSSadaf Ebrahimi if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1489*22dc650dSSadaf Ebrahimi {
1490*22dc650dSSadaf Ebrahimi if (*errorcodeptr != 0) goto EXIT; /* m too big */
1491*22dc650dSSadaf Ebrahimi }
1492*22dc650dSSadaf Ebrahimi
1493*22dc650dSSadaf Ebrahimi if (max < min)
1494*22dc650dSSadaf Ebrahimi {
1495*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR4;
1496*22dc650dSSadaf Ebrahimi goto EXIT;
1497*22dc650dSSadaf Ebrahimi }
1498*22dc650dSSadaf Ebrahimi }
1499*22dc650dSSadaf Ebrahimi }
1500*22dc650dSSadaf Ebrahimi
1501*22dc650dSSadaf Ebrahimi /* Valid quantifier exists */
1502*22dc650dSSadaf Ebrahimi
1503*22dc650dSSadaf Ebrahimi while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1504*22dc650dSSadaf Ebrahimi p++;
1505*22dc650dSSadaf Ebrahimi yield = TRUE;
1506*22dc650dSSadaf Ebrahimi if (minp != NULL) *minp = (uint32_t)min;
1507*22dc650dSSadaf Ebrahimi if (maxp != NULL) *maxp = (uint32_t)max;
1508*22dc650dSSadaf Ebrahimi
1509*22dc650dSSadaf Ebrahimi /* Update the pattern pointer */
1510*22dc650dSSadaf Ebrahimi
1511*22dc650dSSadaf Ebrahimi EXIT:
1512*22dc650dSSadaf Ebrahimi *ptrptr = p;
1513*22dc650dSSadaf Ebrahimi return yield;
1514*22dc650dSSadaf Ebrahimi }
1515*22dc650dSSadaf Ebrahimi
1516*22dc650dSSadaf Ebrahimi
1517*22dc650dSSadaf Ebrahimi
1518*22dc650dSSadaf Ebrahimi /*************************************************
1519*22dc650dSSadaf Ebrahimi * Handle escapes *
1520*22dc650dSSadaf Ebrahimi *************************************************/
1521*22dc650dSSadaf Ebrahimi
1522*22dc650dSSadaf Ebrahimi /* This function is called when a \ has been encountered. It either returns a
1523*22dc650dSSadaf Ebrahimi positive value for a simple escape such as \d, or 0 for a data character, which
1524*22dc650dSSadaf Ebrahimi is placed in chptr. A backreference to group n is returned as negative n. On
1525*22dc650dSSadaf Ebrahimi entry, ptr is pointing at the character after \. On exit, it points after the
1526*22dc650dSSadaf Ebrahimi final code unit of the escape sequence.
1527*22dc650dSSadaf Ebrahimi
1528*22dc650dSSadaf Ebrahimi This function is also called from pcre2_substitute() to handle escape sequences
1529*22dc650dSSadaf Ebrahimi in replacement strings. In this case, the cb argument is NULL, and in the case
1530*22dc650dSSadaf Ebrahimi of escapes that have further processing, only sequences that define a data
1531*22dc650dSSadaf Ebrahimi character are recognised. The isclass argument is not relevant; the options
1532*22dc650dSSadaf Ebrahimi argument is the final value of the compiled pattern's options.
1533*22dc650dSSadaf Ebrahimi
1534*22dc650dSSadaf Ebrahimi Arguments:
1535*22dc650dSSadaf Ebrahimi ptrptr points to the input position pointer
1536*22dc650dSSadaf Ebrahimi ptrend points to the end of the input
1537*22dc650dSSadaf Ebrahimi chptr points to a returned data character
1538*22dc650dSSadaf Ebrahimi errorcodeptr points to the errorcode variable (containing zero)
1539*22dc650dSSadaf Ebrahimi options the current options bits
1540*22dc650dSSadaf Ebrahimi xoptions the current extra options bits
1541*22dc650dSSadaf Ebrahimi isclass TRUE if inside a character class
1542*22dc650dSSadaf Ebrahimi cb compile data block or NULL when called from pcre2_substitute()
1543*22dc650dSSadaf Ebrahimi
1544*22dc650dSSadaf Ebrahimi Returns: zero => a data character
1545*22dc650dSSadaf Ebrahimi positive => a special escape sequence
1546*22dc650dSSadaf Ebrahimi negative => a numerical back reference
1547*22dc650dSSadaf Ebrahimi on error, errorcodeptr is set non-zero
1548*22dc650dSSadaf Ebrahimi */
1549*22dc650dSSadaf Ebrahimi
1550*22dc650dSSadaf Ebrahimi int
PRIV(check_escape)1551*22dc650dSSadaf Ebrahimi PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1552*22dc650dSSadaf Ebrahimi int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass,
1553*22dc650dSSadaf Ebrahimi compile_block *cb)
1554*22dc650dSSadaf Ebrahimi {
1555*22dc650dSSadaf Ebrahimi BOOL utf = (options & PCRE2_UTF) != 0;
1556*22dc650dSSadaf Ebrahimi BOOL alt_bsux =
1557*22dc650dSSadaf Ebrahimi ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1558*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr = *ptrptr;
1559*22dc650dSSadaf Ebrahimi uint32_t c, cc;
1560*22dc650dSSadaf Ebrahimi int escape = 0;
1561*22dc650dSSadaf Ebrahimi int i;
1562*22dc650dSSadaf Ebrahimi
1563*22dc650dSSadaf Ebrahimi /* If backslash is at the end of the string, it's an error. */
1564*22dc650dSSadaf Ebrahimi
1565*22dc650dSSadaf Ebrahimi if (ptr >= ptrend)
1566*22dc650dSSadaf Ebrahimi {
1567*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR1;
1568*22dc650dSSadaf Ebrahimi return 0;
1569*22dc650dSSadaf Ebrahimi }
1570*22dc650dSSadaf Ebrahimi
1571*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1572*22dc650dSSadaf Ebrahimi *errorcodeptr = 0; /* Be optimistic */
1573*22dc650dSSadaf Ebrahimi
1574*22dc650dSSadaf Ebrahimi /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1575*22dc650dSSadaf Ebrahimi value test saves a memory lookup for code points outside the alphanumeric
1576*22dc650dSSadaf Ebrahimi range. */
1577*22dc650dSSadaf Ebrahimi
1578*22dc650dSSadaf Ebrahimi if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1579*22dc650dSSadaf Ebrahimi
1580*22dc650dSSadaf Ebrahimi /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1581*22dc650dSSadaf Ebrahimi positive value is a literal value for something like \n. A negative value is
1582*22dc650dSSadaf Ebrahimi the negation of one of the ESC_ macros that is passed back for handling by the
1583*22dc650dSSadaf Ebrahimi calling function. Some extra checking is needed for \N because only \N{U+dddd}
1584*22dc650dSSadaf Ebrahimi is supported. If the value is zero, further processing is handled below. */
1585*22dc650dSSadaf Ebrahimi
1586*22dc650dSSadaf Ebrahimi else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1587*22dc650dSSadaf Ebrahimi {
1588*22dc650dSSadaf Ebrahimi if (i > 0)
1589*22dc650dSSadaf Ebrahimi {
1590*22dc650dSSadaf Ebrahimi c = (uint32_t)i;
1591*22dc650dSSadaf Ebrahimi if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1592*22dc650dSSadaf Ebrahimi c = CHAR_LF;
1593*22dc650dSSadaf Ebrahimi }
1594*22dc650dSSadaf Ebrahimi else /* Negative table entry */
1595*22dc650dSSadaf Ebrahimi {
1596*22dc650dSSadaf Ebrahimi escape = -i; /* Else return a special escape */
1597*22dc650dSSadaf Ebrahimi if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1598*22dc650dSSadaf Ebrahimi cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1599*22dc650dSSadaf Ebrahimi
1600*22dc650dSSadaf Ebrahimi /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1601*22dc650dSSadaf Ebrahimi Unicode code points, as well as plain \N for "not newline". PCRE does not
1602*22dc650dSSadaf Ebrahimi support \N{name}. However, it does support quantification such as \N{2,3},
1603*22dc650dSSadaf Ebrahimi so if \N{ is not followed by U+dddd we check for a quantifier. */
1604*22dc650dSSadaf Ebrahimi
1605*22dc650dSSadaf Ebrahimi if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1606*22dc650dSSadaf Ebrahimi {
1607*22dc650dSSadaf Ebrahimi PCRE2_SPTR p = ptr + 1;
1608*22dc650dSSadaf Ebrahimi
1609*22dc650dSSadaf Ebrahimi /* Perl ignores spaces and tabs after { */
1610*22dc650dSSadaf Ebrahimi
1611*22dc650dSSadaf Ebrahimi while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1612*22dc650dSSadaf Ebrahimi
1613*22dc650dSSadaf Ebrahimi /* \N{U+ can be handled by the \x{ code. However, this construction is
1614*22dc650dSSadaf Ebrahimi not valid in EBCDIC environments because it specifies a Unicode
1615*22dc650dSSadaf Ebrahimi character, not a codepoint in the local code. For example \N{U+0041}
1616*22dc650dSSadaf Ebrahimi must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1617*22dc650dSSadaf Ebrahimi casing semantics for the entire pattern, so allow it only in UTF (i.e.
1618*22dc650dSSadaf Ebrahimi Unicode) mode. */
1619*22dc650dSSadaf Ebrahimi
1620*22dc650dSSadaf Ebrahimi if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1621*22dc650dSSadaf Ebrahimi {
1622*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
1623*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR93;
1624*22dc650dSSadaf Ebrahimi #else
1625*22dc650dSSadaf Ebrahimi if (utf)
1626*22dc650dSSadaf Ebrahimi {
1627*22dc650dSSadaf Ebrahimi ptr = p + 2;
1628*22dc650dSSadaf Ebrahimi escape = 0; /* Not a fancy escape after all */
1629*22dc650dSSadaf Ebrahimi goto COME_FROM_NU;
1630*22dc650dSSadaf Ebrahimi }
1631*22dc650dSSadaf Ebrahimi else *errorcodeptr = ERR93;
1632*22dc650dSSadaf Ebrahimi #endif
1633*22dc650dSSadaf Ebrahimi }
1634*22dc650dSSadaf Ebrahimi
1635*22dc650dSSadaf Ebrahimi /* Give an error if what follows is not a quantifier, but don't override
1636*22dc650dSSadaf Ebrahimi an error set by the quantifier reader (e.g. number overflow). */
1637*22dc650dSSadaf Ebrahimi
1638*22dc650dSSadaf Ebrahimi else
1639*22dc650dSSadaf Ebrahimi {
1640*22dc650dSSadaf Ebrahimi if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1641*22dc650dSSadaf Ebrahimi *errorcodeptr == 0)
1642*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR37;
1643*22dc650dSSadaf Ebrahimi }
1644*22dc650dSSadaf Ebrahimi }
1645*22dc650dSSadaf Ebrahimi }
1646*22dc650dSSadaf Ebrahimi }
1647*22dc650dSSadaf Ebrahimi
1648*22dc650dSSadaf Ebrahimi /* Escapes that need further processing, including those that are unknown, have
1649*22dc650dSSadaf Ebrahimi a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1650*22dc650dSSadaf Ebrahimi \o, and \x are recognized (\u and \U can never appear as they are used for case
1651*22dc650dSSadaf Ebrahimi forcing). */
1652*22dc650dSSadaf Ebrahimi
1653*22dc650dSSadaf Ebrahimi else
1654*22dc650dSSadaf Ebrahimi {
1655*22dc650dSSadaf Ebrahimi int s;
1656*22dc650dSSadaf Ebrahimi PCRE2_SPTR oldptr;
1657*22dc650dSSadaf Ebrahimi BOOL overflow;
1658*22dc650dSSadaf Ebrahimi
1659*22dc650dSSadaf Ebrahimi /* Filter calls from pcre2_substitute(). */
1660*22dc650dSSadaf Ebrahimi
1661*22dc650dSSadaf Ebrahimi if (cb == NULL)
1662*22dc650dSSadaf Ebrahimi {
1663*22dc650dSSadaf Ebrahimi if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1664*22dc650dSSadaf Ebrahimi {
1665*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR3;
1666*22dc650dSSadaf Ebrahimi return 0;
1667*22dc650dSSadaf Ebrahimi }
1668*22dc650dSSadaf Ebrahimi alt_bsux = FALSE; /* Do not modify \x handling */
1669*22dc650dSSadaf Ebrahimi }
1670*22dc650dSSadaf Ebrahimi
1671*22dc650dSSadaf Ebrahimi switch (c)
1672*22dc650dSSadaf Ebrahimi {
1673*22dc650dSSadaf Ebrahimi /* A number of Perl escapes are not handled by PCRE. We give an explicit
1674*22dc650dSSadaf Ebrahimi error. */
1675*22dc650dSSadaf Ebrahimi
1676*22dc650dSSadaf Ebrahimi case CHAR_F:
1677*22dc650dSSadaf Ebrahimi case CHAR_l:
1678*22dc650dSSadaf Ebrahimi case CHAR_L:
1679*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR37;
1680*22dc650dSSadaf Ebrahimi break;
1681*22dc650dSSadaf Ebrahimi
1682*22dc650dSSadaf Ebrahimi /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1683*22dc650dSSadaf Ebrahimi is set. Otherwise, \u must be followed by exactly four hex digits or, if
1684*22dc650dSSadaf Ebrahimi PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1685*22dc650dSSadaf Ebrahimi Otherwise it is a lowercase u letter. This gives some compatibility with
1686*22dc650dSSadaf Ebrahimi ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1687*22dc650dSSadaf Ebrahimi allowed. When \u{ is not followed by hex digits, a special return is given
1688*22dc650dSSadaf Ebrahimi because otherwise \u{ 12} (for example) would be treated as u{12}. */
1689*22dc650dSSadaf Ebrahimi
1690*22dc650dSSadaf Ebrahimi case CHAR_u:
1691*22dc650dSSadaf Ebrahimi if (!alt_bsux) *errorcodeptr = ERR37; else
1692*22dc650dSSadaf Ebrahimi {
1693*22dc650dSSadaf Ebrahimi uint32_t xc;
1694*22dc650dSSadaf Ebrahimi
1695*22dc650dSSadaf Ebrahimi if (ptr >= ptrend) break;
1696*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1697*22dc650dSSadaf Ebrahimi (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1698*22dc650dSSadaf Ebrahimi {
1699*22dc650dSSadaf Ebrahimi PCRE2_SPTR hptr = ptr + 1;
1700*22dc650dSSadaf Ebrahimi
1701*22dc650dSSadaf Ebrahimi cc = 0;
1702*22dc650dSSadaf Ebrahimi while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1703*22dc650dSSadaf Ebrahimi {
1704*22dc650dSSadaf Ebrahimi if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
1705*22dc650dSSadaf Ebrahimi {
1706*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR77;
1707*22dc650dSSadaf Ebrahimi ptr = hptr; /* Show where */
1708*22dc650dSSadaf Ebrahimi break; /* *hptr != } will cause another break below */
1709*22dc650dSSadaf Ebrahimi }
1710*22dc650dSSadaf Ebrahimi cc = (cc << 4) | xc;
1711*22dc650dSSadaf Ebrahimi hptr++;
1712*22dc650dSSadaf Ebrahimi }
1713*22dc650dSSadaf Ebrahimi
1714*22dc650dSSadaf Ebrahimi if (hptr == ptr + 1 || /* No hex digits */
1715*22dc650dSSadaf Ebrahimi hptr >= ptrend || /* Hit end of input */
1716*22dc650dSSadaf Ebrahimi *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
1717*22dc650dSSadaf Ebrahimi {
1718*22dc650dSSadaf Ebrahimi escape = ESC_ub; /* Special return */
1719*22dc650dSSadaf Ebrahimi ptr++; /* Skip { */
1720*22dc650dSSadaf Ebrahimi break; /* Hex escape not recognized */
1721*22dc650dSSadaf Ebrahimi }
1722*22dc650dSSadaf Ebrahimi
1723*22dc650dSSadaf Ebrahimi c = cc; /* Accept the code point */
1724*22dc650dSSadaf Ebrahimi ptr = hptr + 1;
1725*22dc650dSSadaf Ebrahimi }
1726*22dc650dSSadaf Ebrahimi
1727*22dc650dSSadaf Ebrahimi else /* Must be exactly 4 hex digits */
1728*22dc650dSSadaf Ebrahimi {
1729*22dc650dSSadaf Ebrahimi if (ptrend - ptr < 4) break; /* Less than 4 chars */
1730*22dc650dSSadaf Ebrahimi if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1731*22dc650dSSadaf Ebrahimi if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1732*22dc650dSSadaf Ebrahimi cc = (cc << 4) | xc;
1733*22dc650dSSadaf Ebrahimi if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1734*22dc650dSSadaf Ebrahimi cc = (cc << 4) | xc;
1735*22dc650dSSadaf Ebrahimi if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1736*22dc650dSSadaf Ebrahimi c = (cc << 4) | xc;
1737*22dc650dSSadaf Ebrahimi ptr += 4;
1738*22dc650dSSadaf Ebrahimi }
1739*22dc650dSSadaf Ebrahimi
1740*22dc650dSSadaf Ebrahimi if (utf)
1741*22dc650dSSadaf Ebrahimi {
1742*22dc650dSSadaf Ebrahimi if (c > 0x10ffffU) *errorcodeptr = ERR77;
1743*22dc650dSSadaf Ebrahimi else
1744*22dc650dSSadaf Ebrahimi if (c >= 0xd800 && c <= 0xdfff &&
1745*22dc650dSSadaf Ebrahimi (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1746*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR73;
1747*22dc650dSSadaf Ebrahimi }
1748*22dc650dSSadaf Ebrahimi else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1749*22dc650dSSadaf Ebrahimi }
1750*22dc650dSSadaf Ebrahimi break;
1751*22dc650dSSadaf Ebrahimi
1752*22dc650dSSadaf Ebrahimi /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1753*22dc650dSSadaf Ebrahimi in which case it is an upper case letter. */
1754*22dc650dSSadaf Ebrahimi
1755*22dc650dSSadaf Ebrahimi case CHAR_U:
1756*22dc650dSSadaf Ebrahimi if (!alt_bsux) *errorcodeptr = ERR37;
1757*22dc650dSSadaf Ebrahimi break;
1758*22dc650dSSadaf Ebrahimi
1759*22dc650dSSadaf Ebrahimi /* In a character class, \g is just a literal "g". Outside a character
1760*22dc650dSSadaf Ebrahimi class, \g must be followed by one of a number of specific things:
1761*22dc650dSSadaf Ebrahimi
1762*22dc650dSSadaf Ebrahimi (1) A number, either plain or braced. If positive, it is an absolute
1763*22dc650dSSadaf Ebrahimi backreference. If negative, it is a relative backreference. This is a Perl
1764*22dc650dSSadaf Ebrahimi 5.10 feature.
1765*22dc650dSSadaf Ebrahimi
1766*22dc650dSSadaf Ebrahimi (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1767*22dc650dSSadaf Ebrahimi is part of Perl's movement towards a unified syntax for back references. As
1768*22dc650dSSadaf Ebrahimi this is synonymous with \k{name}, we fudge it up by pretending it really
1769*22dc650dSSadaf Ebrahimi was \k{name}.
1770*22dc650dSSadaf Ebrahimi
1771*22dc650dSSadaf Ebrahimi (3) For Oniguruma compatibility we also support \g followed by a name or a
1772*22dc650dSSadaf Ebrahimi number either in angle brackets or in single quotes. However, these are
1773*22dc650dSSadaf Ebrahimi (possibly recursive) subroutine calls, _not_ backreferences. We return
1774*22dc650dSSadaf Ebrahimi the ESC_g code.
1775*22dc650dSSadaf Ebrahimi
1776*22dc650dSSadaf Ebrahimi Summary: Return a negative number for a numerical back reference, ESC_k for
1777*22dc650dSSadaf Ebrahimi a named back reference, and ESC_g for a named or numbered subroutine call.
1778*22dc650dSSadaf Ebrahimi */
1779*22dc650dSSadaf Ebrahimi
1780*22dc650dSSadaf Ebrahimi case CHAR_g:
1781*22dc650dSSadaf Ebrahimi if (isclass) break;
1782*22dc650dSSadaf Ebrahimi
1783*22dc650dSSadaf Ebrahimi if (ptr >= ptrend)
1784*22dc650dSSadaf Ebrahimi {
1785*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR57;
1786*22dc650dSSadaf Ebrahimi break;
1787*22dc650dSSadaf Ebrahimi }
1788*22dc650dSSadaf Ebrahimi
1789*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790*22dc650dSSadaf Ebrahimi {
1791*22dc650dSSadaf Ebrahimi escape = ESC_g;
1792*22dc650dSSadaf Ebrahimi break;
1793*22dc650dSSadaf Ebrahimi }
1794*22dc650dSSadaf Ebrahimi
1795*22dc650dSSadaf Ebrahimi /* If there is a brace delimiter, try to read a numerical reference. If
1796*22dc650dSSadaf Ebrahimi there isn't one, assume we have a name and treat it as \k. */
1797*22dc650dSSadaf Ebrahimi
1798*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799*22dc650dSSadaf Ebrahimi {
1800*22dc650dSSadaf Ebrahimi PCRE2_SPTR p = ptr + 1;
1801*22dc650dSSadaf Ebrahimi
1802*22dc650dSSadaf Ebrahimi while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803*22dc650dSSadaf Ebrahimi if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804*22dc650dSSadaf Ebrahimi errorcodeptr))
1805*22dc650dSSadaf Ebrahimi {
1806*22dc650dSSadaf Ebrahimi if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1807*22dc650dSSadaf Ebrahimi break;
1808*22dc650dSSadaf Ebrahimi }
1809*22dc650dSSadaf Ebrahimi while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810*22dc650dSSadaf Ebrahimi
1811*22dc650dSSadaf Ebrahimi if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812*22dc650dSSadaf Ebrahimi {
1813*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR57;
1814*22dc650dSSadaf Ebrahimi break;
1815*22dc650dSSadaf Ebrahimi }
1816*22dc650dSSadaf Ebrahimi ptr = p + 1;
1817*22dc650dSSadaf Ebrahimi }
1818*22dc650dSSadaf Ebrahimi
1819*22dc650dSSadaf Ebrahimi /* Read an undelimited number */
1820*22dc650dSSadaf Ebrahimi
1821*22dc650dSSadaf Ebrahimi else
1822*22dc650dSSadaf Ebrahimi {
1823*22dc650dSSadaf Ebrahimi if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1824*22dc650dSSadaf Ebrahimi errorcodeptr))
1825*22dc650dSSadaf Ebrahimi {
1826*22dc650dSSadaf Ebrahimi if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1827*22dc650dSSadaf Ebrahimi break;
1828*22dc650dSSadaf Ebrahimi }
1829*22dc650dSSadaf Ebrahimi }
1830*22dc650dSSadaf Ebrahimi
1831*22dc650dSSadaf Ebrahimi if (s <= 0)
1832*22dc650dSSadaf Ebrahimi {
1833*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR15;
1834*22dc650dSSadaf Ebrahimi break;
1835*22dc650dSSadaf Ebrahimi }
1836*22dc650dSSadaf Ebrahimi
1837*22dc650dSSadaf Ebrahimi escape = -s;
1838*22dc650dSSadaf Ebrahimi break;
1839*22dc650dSSadaf Ebrahimi
1840*22dc650dSSadaf Ebrahimi /* The handling of escape sequences consisting of a string of digits
1841*22dc650dSSadaf Ebrahimi starting with one that is not zero is not straightforward. Perl has changed
1842*22dc650dSSadaf Ebrahimi over the years. Nowadays \g{} for backreferences and \o{} for octal are
1843*22dc650dSSadaf Ebrahimi recommended to avoid the ambiguities in the old syntax.
1844*22dc650dSSadaf Ebrahimi
1845*22dc650dSSadaf Ebrahimi Outside a character class, the digits are read as a decimal number. If the
1846*22dc650dSSadaf Ebrahimi number is less than 10, or if there are that many previous extracting left
1847*22dc650dSSadaf Ebrahimi brackets, it is a back reference. Otherwise, up to three octal digits are
1848*22dc650dSSadaf Ebrahimi read to form an escaped character code. Thus \123 is likely to be octal 123
1849*22dc650dSSadaf Ebrahimi (cf \0123, which is octal 012 followed by the literal 3).
1850*22dc650dSSadaf Ebrahimi
1851*22dc650dSSadaf Ebrahimi Inside a character class, \ followed by a digit is always either a literal
1852*22dc650dSSadaf Ebrahimi 8 or 9 or an octal number. */
1853*22dc650dSSadaf Ebrahimi
1854*22dc650dSSadaf Ebrahimi case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1855*22dc650dSSadaf Ebrahimi case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1856*22dc650dSSadaf Ebrahimi
1857*22dc650dSSadaf Ebrahimi if (!isclass)
1858*22dc650dSSadaf Ebrahimi {
1859*22dc650dSSadaf Ebrahimi oldptr = ptr;
1860*22dc650dSSadaf Ebrahimi ptr--; /* Back to the digit */
1861*22dc650dSSadaf Ebrahimi
1862*22dc650dSSadaf Ebrahimi /* As we know we are at a digit, the only possible error from
1863*22dc650dSSadaf Ebrahimi read_number() is a number that is too large to be a group number. In this
1864*22dc650dSSadaf Ebrahimi case we fall through handle this as not a group reference. If we have
1865*22dc650dSSadaf Ebrahimi read a small enough number, check for a back reference.
1866*22dc650dSSadaf Ebrahimi
1867*22dc650dSSadaf Ebrahimi \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1868*22dc650dSSadaf Ebrahimi are octal escapes if there are not that many previous captures. */
1869*22dc650dSSadaf Ebrahimi
1870*22dc650dSSadaf Ebrahimi if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1871*22dc650dSSadaf Ebrahimi (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1872*22dc650dSSadaf Ebrahimi {
1873*22dc650dSSadaf Ebrahimi if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1874*22dc650dSSadaf Ebrahimi else escape = -s; /* Indicates a back reference */
1875*22dc650dSSadaf Ebrahimi break;
1876*22dc650dSSadaf Ebrahimi }
1877*22dc650dSSadaf Ebrahimi
1878*22dc650dSSadaf Ebrahimi ptr = oldptr; /* Put the pointer back and fall through */
1879*22dc650dSSadaf Ebrahimi }
1880*22dc650dSSadaf Ebrahimi
1881*22dc650dSSadaf Ebrahimi /* Handle a digit following \ when the number is not a back reference, or
1882*22dc650dSSadaf Ebrahimi we are within a character class. If the first digit is 8 or 9, Perl used to
1883*22dc650dSSadaf Ebrahimi generate a binary zero and then treat the digit as a following literal. At
1884*22dc650dSSadaf Ebrahimi least by Perl 5.18 this changed so as not to insert the binary zero. */
1885*22dc650dSSadaf Ebrahimi
1886*22dc650dSSadaf Ebrahimi if (c >= CHAR_8) break;
1887*22dc650dSSadaf Ebrahimi
1888*22dc650dSSadaf Ebrahimi /* Fall through */
1889*22dc650dSSadaf Ebrahimi
1890*22dc650dSSadaf Ebrahimi /* \0 always starts an octal number, but we may drop through to here with a
1891*22dc650dSSadaf Ebrahimi larger first octal digit. The original code used just to take the least
1892*22dc650dSSadaf Ebrahimi significant 8 bits of octal numbers (I think this is what early Perls used
1893*22dc650dSSadaf Ebrahimi to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1894*22dc650dSSadaf Ebrahimi but no more than 3 octal digits. */
1895*22dc650dSSadaf Ebrahimi
1896*22dc650dSSadaf Ebrahimi case CHAR_0:
1897*22dc650dSSadaf Ebrahimi c -= CHAR_0;
1898*22dc650dSSadaf Ebrahimi while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1899*22dc650dSSadaf Ebrahimi c = c * 8 + *ptr++ - CHAR_0;
1900*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
1901*22dc650dSSadaf Ebrahimi if (!utf && c > 0xff) *errorcodeptr = ERR51;
1902*22dc650dSSadaf Ebrahimi #endif
1903*22dc650dSSadaf Ebrahimi break;
1904*22dc650dSSadaf Ebrahimi
1905*22dc650dSSadaf Ebrahimi /* \o is a relatively new Perl feature, supporting a more general way of
1906*22dc650dSSadaf Ebrahimi specifying character codes in octal. The only supported form is \o{ddd},
1907*22dc650dSSadaf Ebrahimi with optional spaces or tabs after { and before }. */
1908*22dc650dSSadaf Ebrahimi
1909*22dc650dSSadaf Ebrahimi case CHAR_o:
1910*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1911*22dc650dSSadaf Ebrahimi {
1912*22dc650dSSadaf Ebrahimi ptr--;
1913*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR55;
1914*22dc650dSSadaf Ebrahimi break;
1915*22dc650dSSadaf Ebrahimi }
1916*22dc650dSSadaf Ebrahimi
1917*22dc650dSSadaf Ebrahimi while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1918*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1919*22dc650dSSadaf Ebrahimi {
1920*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR78;
1921*22dc650dSSadaf Ebrahimi break;
1922*22dc650dSSadaf Ebrahimi }
1923*22dc650dSSadaf Ebrahimi
1924*22dc650dSSadaf Ebrahimi c = 0;
1925*22dc650dSSadaf Ebrahimi overflow = FALSE;
1926*22dc650dSSadaf Ebrahimi while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1927*22dc650dSSadaf Ebrahimi {
1928*22dc650dSSadaf Ebrahimi cc = *ptr++;
1929*22dc650dSSadaf Ebrahimi if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1930*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
1931*22dc650dSSadaf Ebrahimi if (c >= 0x20000000l) { overflow = TRUE; break; }
1932*22dc650dSSadaf Ebrahimi #endif
1933*22dc650dSSadaf Ebrahimi c = (c << 3) + (cc - CHAR_0);
1934*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
1935*22dc650dSSadaf Ebrahimi if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1936*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
1937*22dc650dSSadaf Ebrahimi if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1938*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 32
1939*22dc650dSSadaf Ebrahimi if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1940*22dc650dSSadaf Ebrahimi #endif
1941*22dc650dSSadaf Ebrahimi }
1942*22dc650dSSadaf Ebrahimi
1943*22dc650dSSadaf Ebrahimi while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1944*22dc650dSSadaf Ebrahimi
1945*22dc650dSSadaf Ebrahimi if (overflow)
1946*22dc650dSSadaf Ebrahimi {
1947*22dc650dSSadaf Ebrahimi while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1948*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR34;
1949*22dc650dSSadaf Ebrahimi }
1950*22dc650dSSadaf Ebrahimi else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1951*22dc650dSSadaf Ebrahimi {
1952*22dc650dSSadaf Ebrahimi if (utf && c >= 0xd800 && c <= 0xdfff &&
1953*22dc650dSSadaf Ebrahimi (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1954*22dc650dSSadaf Ebrahimi {
1955*22dc650dSSadaf Ebrahimi ptr--;
1956*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR73;
1957*22dc650dSSadaf Ebrahimi }
1958*22dc650dSSadaf Ebrahimi }
1959*22dc650dSSadaf Ebrahimi else
1960*22dc650dSSadaf Ebrahimi {
1961*22dc650dSSadaf Ebrahimi ptr--;
1962*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR64;
1963*22dc650dSSadaf Ebrahimi }
1964*22dc650dSSadaf Ebrahimi break;
1965*22dc650dSSadaf Ebrahimi
1966*22dc650dSSadaf Ebrahimi /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1967*22dc650dSSadaf Ebrahimi by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1968*22dc650dSSadaf Ebrahimi
1969*22dc650dSSadaf Ebrahimi case CHAR_x:
1970*22dc650dSSadaf Ebrahimi if (alt_bsux)
1971*22dc650dSSadaf Ebrahimi {
1972*22dc650dSSadaf Ebrahimi uint32_t xc;
1973*22dc650dSSadaf Ebrahimi if (ptrend - ptr < 2) break; /* Less than 2 characters */
1974*22dc650dSSadaf Ebrahimi if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1975*22dc650dSSadaf Ebrahimi if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1976*22dc650dSSadaf Ebrahimi c = (cc << 4) | xc;
1977*22dc650dSSadaf Ebrahimi ptr += 2;
1978*22dc650dSSadaf Ebrahimi }
1979*22dc650dSSadaf Ebrahimi
1980*22dc650dSSadaf Ebrahimi /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1981*22dc650dSSadaf Ebrahimi greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1982*22dc650dSSadaf Ebrahimi digits. If not, { used to be treated as a data character. However, Perl
1983*22dc650dSSadaf Ebrahimi seems to read hex digits up to the first non-such, and ignore the rest, so
1984*22dc650dSSadaf Ebrahimi that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1985*22dc650dSSadaf Ebrahimi now gives an error. */
1986*22dc650dSSadaf Ebrahimi
1987*22dc650dSSadaf Ebrahimi else
1988*22dc650dSSadaf Ebrahimi {
1989*22dc650dSSadaf Ebrahimi if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1990*22dc650dSSadaf Ebrahimi {
1991*22dc650dSSadaf Ebrahimi ptr++;
1992*22dc650dSSadaf Ebrahimi while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1993*22dc650dSSadaf Ebrahimi
1994*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
1995*22dc650dSSadaf Ebrahimi COME_FROM_NU:
1996*22dc650dSSadaf Ebrahimi #endif
1997*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1998*22dc650dSSadaf Ebrahimi {
1999*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR78;
2000*22dc650dSSadaf Ebrahimi break;
2001*22dc650dSSadaf Ebrahimi }
2002*22dc650dSSadaf Ebrahimi c = 0;
2003*22dc650dSSadaf Ebrahimi overflow = FALSE;
2004*22dc650dSSadaf Ebrahimi
2005*22dc650dSSadaf Ebrahimi while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2006*22dc650dSSadaf Ebrahimi {
2007*22dc650dSSadaf Ebrahimi ptr++;
2008*22dc650dSSadaf Ebrahimi if (c == 0 && cc == 0) continue; /* Leading zeroes */
2009*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
2010*22dc650dSSadaf Ebrahimi if (c >= 0x10000000l) { overflow = TRUE; break; }
2011*22dc650dSSadaf Ebrahimi #endif
2012*22dc650dSSadaf Ebrahimi c = (c << 4) | cc;
2013*22dc650dSSadaf Ebrahimi if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2014*22dc650dSSadaf Ebrahimi {
2015*22dc650dSSadaf Ebrahimi overflow = TRUE;
2016*22dc650dSSadaf Ebrahimi break;
2017*22dc650dSSadaf Ebrahimi }
2018*22dc650dSSadaf Ebrahimi }
2019*22dc650dSSadaf Ebrahimi
2020*22dc650dSSadaf Ebrahimi /* Perl ignores spaces and tabs before } */
2021*22dc650dSSadaf Ebrahimi
2022*22dc650dSSadaf Ebrahimi while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2023*22dc650dSSadaf Ebrahimi
2024*22dc650dSSadaf Ebrahimi /* On overflow, skip remaining hex digits */
2025*22dc650dSSadaf Ebrahimi
2026*22dc650dSSadaf Ebrahimi if (overflow)
2027*22dc650dSSadaf Ebrahimi {
2028*22dc650dSSadaf Ebrahimi while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2029*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR34;
2030*22dc650dSSadaf Ebrahimi }
2031*22dc650dSSadaf Ebrahimi else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2032*22dc650dSSadaf Ebrahimi {
2033*22dc650dSSadaf Ebrahimi if (utf && c >= 0xd800 && c <= 0xdfff &&
2034*22dc650dSSadaf Ebrahimi (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2035*22dc650dSSadaf Ebrahimi {
2036*22dc650dSSadaf Ebrahimi ptr--;
2037*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR73;
2038*22dc650dSSadaf Ebrahimi }
2039*22dc650dSSadaf Ebrahimi }
2040*22dc650dSSadaf Ebrahimi
2041*22dc650dSSadaf Ebrahimi /* If the sequence of hex digits (followed by optional space) does not
2042*22dc650dSSadaf Ebrahimi end with '}', give an error. We used just to recognize this construct
2043*22dc650dSSadaf Ebrahimi and fall through to the normal \x handling, but nowadays Perl gives an
2044*22dc650dSSadaf Ebrahimi error, which seems much more sensible, so we do too. */
2045*22dc650dSSadaf Ebrahimi
2046*22dc650dSSadaf Ebrahimi else
2047*22dc650dSSadaf Ebrahimi {
2048*22dc650dSSadaf Ebrahimi ptr--;
2049*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR67;
2050*22dc650dSSadaf Ebrahimi }
2051*22dc650dSSadaf Ebrahimi } /* End of \x{} processing */
2052*22dc650dSSadaf Ebrahimi
2053*22dc650dSSadaf Ebrahimi /* Read a up to two hex digits after \x */
2054*22dc650dSSadaf Ebrahimi
2055*22dc650dSSadaf Ebrahimi else
2056*22dc650dSSadaf Ebrahimi {
2057*22dc650dSSadaf Ebrahimi c = 0;
2058*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
2059*22dc650dSSadaf Ebrahimi ptr++;
2060*22dc650dSSadaf Ebrahimi c = cc;
2061*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
2062*22dc650dSSadaf Ebrahimi ptr++;
2063*22dc650dSSadaf Ebrahimi c = (c << 4) | cc;
2064*22dc650dSSadaf Ebrahimi } /* End of \xdd handling */
2065*22dc650dSSadaf Ebrahimi } /* End of Perl-style \x handling */
2066*22dc650dSSadaf Ebrahimi break;
2067*22dc650dSSadaf Ebrahimi
2068*22dc650dSSadaf Ebrahimi /* The handling of \c is different in ASCII and EBCDIC environments. In an
2069*22dc650dSSadaf Ebrahimi ASCII (or Unicode) environment, an error is given if the character
2070*22dc650dSSadaf Ebrahimi following \c is not a printable ASCII character. Otherwise, the following
2071*22dc650dSSadaf Ebrahimi character is upper-cased if it is a letter, and after that the 0x40 bit is
2072*22dc650dSSadaf Ebrahimi flipped. The result is the value of the escape.
2073*22dc650dSSadaf Ebrahimi
2074*22dc650dSSadaf Ebrahimi In an EBCDIC environment the handling of \c is compatible with the
2075*22dc650dSSadaf Ebrahimi specification in the perlebcdic document. The following character must be
2076*22dc650dSSadaf Ebrahimi a letter or one of small number of special characters. These provide a
2077*22dc650dSSadaf Ebrahimi means of defining the character values 0-31.
2078*22dc650dSSadaf Ebrahimi
2079*22dc650dSSadaf Ebrahimi For testing the EBCDIC handling of \c in an ASCII environment, recognize
2080*22dc650dSSadaf Ebrahimi the EBCDIC value of 'c' explicitly. */
2081*22dc650dSSadaf Ebrahimi
2082*22dc650dSSadaf Ebrahimi #if defined EBCDIC && 'a' != 0x81
2083*22dc650dSSadaf Ebrahimi case 0x83:
2084*22dc650dSSadaf Ebrahimi #else
2085*22dc650dSSadaf Ebrahimi case CHAR_c:
2086*22dc650dSSadaf Ebrahimi #endif
2087*22dc650dSSadaf Ebrahimi if (ptr >= ptrend)
2088*22dc650dSSadaf Ebrahimi {
2089*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR2;
2090*22dc650dSSadaf Ebrahimi break;
2091*22dc650dSSadaf Ebrahimi }
2092*22dc650dSSadaf Ebrahimi c = *ptr;
2093*22dc650dSSadaf Ebrahimi if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2094*22dc650dSSadaf Ebrahimi
2095*22dc650dSSadaf Ebrahimi /* Handle \c in an ASCII/Unicode environment. */
2096*22dc650dSSadaf Ebrahimi
2097*22dc650dSSadaf Ebrahimi #ifndef EBCDIC /* ASCII/UTF-8 coding */
2098*22dc650dSSadaf Ebrahimi if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
2099*22dc650dSSadaf Ebrahimi {
2100*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR68;
2101*22dc650dSSadaf Ebrahimi break;
2102*22dc650dSSadaf Ebrahimi }
2103*22dc650dSSadaf Ebrahimi c ^= 0x40;
2104*22dc650dSSadaf Ebrahimi
2105*22dc650dSSadaf Ebrahimi /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2106*22dc650dSSadaf Ebrahimi 255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2107*22dc650dSSadaf Ebrahimi POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2108*22dc650dSSadaf Ebrahimi The other valid sequences correspond to a list of specific characters. */
2109*22dc650dSSadaf Ebrahimi
2110*22dc650dSSadaf Ebrahimi #else
2111*22dc650dSSadaf Ebrahimi if (c == CHAR_QUESTION_MARK)
2112*22dc650dSSadaf Ebrahimi c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2113*22dc650dSSadaf Ebrahimi else
2114*22dc650dSSadaf Ebrahimi {
2115*22dc650dSSadaf Ebrahimi for (i = 0; i < 32; i++)
2116*22dc650dSSadaf Ebrahimi {
2117*22dc650dSSadaf Ebrahimi if (c == ebcdic_escape_c[i]) break;
2118*22dc650dSSadaf Ebrahimi }
2119*22dc650dSSadaf Ebrahimi if (i < 32) c = i; else *errorcodeptr = ERR68;
2120*22dc650dSSadaf Ebrahimi }
2121*22dc650dSSadaf Ebrahimi #endif /* EBCDIC */
2122*22dc650dSSadaf Ebrahimi
2123*22dc650dSSadaf Ebrahimi ptr++;
2124*22dc650dSSadaf Ebrahimi break;
2125*22dc650dSSadaf Ebrahimi
2126*22dc650dSSadaf Ebrahimi /* Any other alphanumeric following \ is an error. Perl gives an error only
2127*22dc650dSSadaf Ebrahimi if in warning mode, but PCRE doesn't have a warning mode. */
2128*22dc650dSSadaf Ebrahimi
2129*22dc650dSSadaf Ebrahimi default:
2130*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR3;
2131*22dc650dSSadaf Ebrahimi *ptrptr = ptr - 1; /* Point to the character at fault */
2132*22dc650dSSadaf Ebrahimi return 0;
2133*22dc650dSSadaf Ebrahimi }
2134*22dc650dSSadaf Ebrahimi }
2135*22dc650dSSadaf Ebrahimi
2136*22dc650dSSadaf Ebrahimi /* Set the pointer to the next character before returning. */
2137*22dc650dSSadaf Ebrahimi
2138*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
2139*22dc650dSSadaf Ebrahimi *chptr = c;
2140*22dc650dSSadaf Ebrahimi return escape;
2141*22dc650dSSadaf Ebrahimi }
2142*22dc650dSSadaf Ebrahimi
2143*22dc650dSSadaf Ebrahimi
2144*22dc650dSSadaf Ebrahimi
2145*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2146*22dc650dSSadaf Ebrahimi /*************************************************
2147*22dc650dSSadaf Ebrahimi * Handle \P and \p *
2148*22dc650dSSadaf Ebrahimi *************************************************/
2149*22dc650dSSadaf Ebrahimi
2150*22dc650dSSadaf Ebrahimi /* This function is called after \P or \p has been encountered, provided that
2151*22dc650dSSadaf Ebrahimi PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2152*22dc650dSSadaf Ebrahimi contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2153*22dc650dSSadaf Ebrahimi after the final code unit of the escape sequence.
2154*22dc650dSSadaf Ebrahimi
2155*22dc650dSSadaf Ebrahimi Arguments:
2156*22dc650dSSadaf Ebrahimi ptrptr the pattern position pointer
2157*22dc650dSSadaf Ebrahimi negptr a boolean that is set TRUE for negation else FALSE
2158*22dc650dSSadaf Ebrahimi ptypeptr an unsigned int that is set to the type value
2159*22dc650dSSadaf Ebrahimi pdataptr an unsigned int that is set to the detailed property value
2160*22dc650dSSadaf Ebrahimi errorcodeptr the error code variable
2161*22dc650dSSadaf Ebrahimi cb the compile data
2162*22dc650dSSadaf Ebrahimi
2163*22dc650dSSadaf Ebrahimi Returns: TRUE if the type value was found, or FALSE for an invalid type
2164*22dc650dSSadaf Ebrahimi */
2165*22dc650dSSadaf Ebrahimi
2166*22dc650dSSadaf Ebrahimi static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2167*22dc650dSSadaf Ebrahimi get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2168*22dc650dSSadaf Ebrahimi uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2169*22dc650dSSadaf Ebrahimi {
2170*22dc650dSSadaf Ebrahimi PCRE2_UCHAR c;
2171*22dc650dSSadaf Ebrahimi PCRE2_SIZE i, bot, top;
2172*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr = *ptrptr;
2173*22dc650dSSadaf Ebrahimi PCRE2_UCHAR name[50];
2174*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *vptr = NULL;
2175*22dc650dSSadaf Ebrahimi uint16_t ptscript = PT_NOTSCRIPT;
2176*22dc650dSSadaf Ebrahimi
2177*22dc650dSSadaf Ebrahimi if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2178*22dc650dSSadaf Ebrahimi c = *ptr++;
2179*22dc650dSSadaf Ebrahimi *negptr = FALSE;
2180*22dc650dSSadaf Ebrahimi
2181*22dc650dSSadaf Ebrahimi /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2182*22dc650dSSadaf Ebrahimi negation. */
2183*22dc650dSSadaf Ebrahimi
2184*22dc650dSSadaf Ebrahimi if (c == CHAR_LEFT_CURLY_BRACKET)
2185*22dc650dSSadaf Ebrahimi {
2186*22dc650dSSadaf Ebrahimi if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2187*22dc650dSSadaf Ebrahimi
2188*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2189*22dc650dSSadaf Ebrahimi {
2190*22dc650dSSadaf Ebrahimi *negptr = TRUE;
2191*22dc650dSSadaf Ebrahimi ptr++;
2192*22dc650dSSadaf Ebrahimi }
2193*22dc650dSSadaf Ebrahimi
2194*22dc650dSSadaf Ebrahimi for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2195*22dc650dSSadaf Ebrahimi {
2196*22dc650dSSadaf Ebrahimi if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2197*22dc650dSSadaf Ebrahimi c = *ptr++;
2198*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
2199*22dc650dSSadaf Ebrahimi while (c == '_' || c == '-' || (c <= 0xff && isspace(c)))
2200*22dc650dSSadaf Ebrahimi #else
2201*22dc650dSSadaf Ebrahimi while (c == '_' || c == '-' || isspace(c))
2202*22dc650dSSadaf Ebrahimi #endif
2203*22dc650dSSadaf Ebrahimi {
2204*22dc650dSSadaf Ebrahimi if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2205*22dc650dSSadaf Ebrahimi c = *ptr++;
2206*22dc650dSSadaf Ebrahimi }
2207*22dc650dSSadaf Ebrahimi if (c == CHAR_NUL) goto ERROR_RETURN;
2208*22dc650dSSadaf Ebrahimi if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2209*22dc650dSSadaf Ebrahimi name[i] = tolower(c);
2210*22dc650dSSadaf Ebrahimi if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
2211*22dc650dSSadaf Ebrahimi }
2212*22dc650dSSadaf Ebrahimi
2213*22dc650dSSadaf Ebrahimi if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2214*22dc650dSSadaf Ebrahimi name[i] = 0;
2215*22dc650dSSadaf Ebrahimi }
2216*22dc650dSSadaf Ebrahimi
2217*22dc650dSSadaf Ebrahimi /* If { doesn't follow \p or \P there is just one following character, which
2218*22dc650dSSadaf Ebrahimi must be an ASCII letter. */
2219*22dc650dSSadaf Ebrahimi
2220*22dc650dSSadaf Ebrahimi else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2221*22dc650dSSadaf Ebrahimi {
2222*22dc650dSSadaf Ebrahimi name[0] = tolower(c);
2223*22dc650dSSadaf Ebrahimi name[1] = 0;
2224*22dc650dSSadaf Ebrahimi }
2225*22dc650dSSadaf Ebrahimi else goto ERROR_RETURN;
2226*22dc650dSSadaf Ebrahimi
2227*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
2228*22dc650dSSadaf Ebrahimi
2229*22dc650dSSadaf Ebrahimi /* If the property contains ':' or '=' we have class name and value separately
2230*22dc650dSSadaf Ebrahimi specified. The following are supported:
2231*22dc650dSSadaf Ebrahimi
2232*22dc650dSSadaf Ebrahimi . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2233*22dc650dSSadaf Ebrahimi . Script (synonym sc) for which the property name is the script name
2234*22dc650dSSadaf Ebrahimi . Script_Extensions (synonym scx), ditto
2235*22dc650dSSadaf Ebrahimi
2236*22dc650dSSadaf Ebrahimi As this is a small number, we currently just check the names directly. If this
2237*22dc650dSSadaf Ebrahimi grows, a sorted table and a switch will be neater.
2238*22dc650dSSadaf Ebrahimi
2239*22dc650dSSadaf Ebrahimi For both the script properties, set a PT_xxx value so that (1) they can be
2240*22dc650dSSadaf Ebrahimi distinguished and (2) invalid script names that happen to be the name of
2241*22dc650dSSadaf Ebrahimi another property can be diagnosed. */
2242*22dc650dSSadaf Ebrahimi
2243*22dc650dSSadaf Ebrahimi if (vptr != NULL)
2244*22dc650dSSadaf Ebrahimi {
2245*22dc650dSSadaf Ebrahimi int offset = 0;
2246*22dc650dSSadaf Ebrahimi PCRE2_UCHAR sname[8];
2247*22dc650dSSadaf Ebrahimi
2248*22dc650dSSadaf Ebrahimi *vptr = 0; /* Terminate property name */
2249*22dc650dSSadaf Ebrahimi if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2250*22dc650dSSadaf Ebrahimi PRIV(strcmp_c8)(name, STRING_bc) == 0)
2251*22dc650dSSadaf Ebrahimi {
2252*22dc650dSSadaf Ebrahimi offset = 4;
2253*22dc650dSSadaf Ebrahimi sname[0] = CHAR_b;
2254*22dc650dSSadaf Ebrahimi sname[1] = CHAR_i; /* There is no strcpy_c8 function */
2255*22dc650dSSadaf Ebrahimi sname[2] = CHAR_d;
2256*22dc650dSSadaf Ebrahimi sname[3] = CHAR_i;
2257*22dc650dSSadaf Ebrahimi }
2258*22dc650dSSadaf Ebrahimi
2259*22dc650dSSadaf Ebrahimi else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2260*22dc650dSSadaf Ebrahimi PRIV(strcmp_c8)(name, STRING_sc) == 0)
2261*22dc650dSSadaf Ebrahimi ptscript = PT_SC;
2262*22dc650dSSadaf Ebrahimi
2263*22dc650dSSadaf Ebrahimi else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2264*22dc650dSSadaf Ebrahimi PRIV(strcmp_c8)(name, STRING_scx) == 0)
2265*22dc650dSSadaf Ebrahimi ptscript = PT_SCX;
2266*22dc650dSSadaf Ebrahimi
2267*22dc650dSSadaf Ebrahimi else
2268*22dc650dSSadaf Ebrahimi {
2269*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR47;
2270*22dc650dSSadaf Ebrahimi return FALSE;
2271*22dc650dSSadaf Ebrahimi }
2272*22dc650dSSadaf Ebrahimi
2273*22dc650dSSadaf Ebrahimi /* Adjust the string in name[] as needed */
2274*22dc650dSSadaf Ebrahimi
2275*22dc650dSSadaf Ebrahimi memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2276*22dc650dSSadaf Ebrahimi if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2277*22dc650dSSadaf Ebrahimi }
2278*22dc650dSSadaf Ebrahimi
2279*22dc650dSSadaf Ebrahimi /* Search for a recognized property using binary chop. */
2280*22dc650dSSadaf Ebrahimi
2281*22dc650dSSadaf Ebrahimi bot = 0;
2282*22dc650dSSadaf Ebrahimi top = PRIV(utt_size);
2283*22dc650dSSadaf Ebrahimi
2284*22dc650dSSadaf Ebrahimi while (bot < top)
2285*22dc650dSSadaf Ebrahimi {
2286*22dc650dSSadaf Ebrahimi int r;
2287*22dc650dSSadaf Ebrahimi i = (bot + top) >> 1;
2288*22dc650dSSadaf Ebrahimi r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2289*22dc650dSSadaf Ebrahimi
2290*22dc650dSSadaf Ebrahimi /* When a matching property is found, some extra checking is needed when the
2291*22dc650dSSadaf Ebrahimi \p{xx:yy} syntax is used and xx is either sc or scx. */
2292*22dc650dSSadaf Ebrahimi
2293*22dc650dSSadaf Ebrahimi if (r == 0)
2294*22dc650dSSadaf Ebrahimi {
2295*22dc650dSSadaf Ebrahimi *pdataptr = PRIV(utt)[i].value;
2296*22dc650dSSadaf Ebrahimi if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2297*22dc650dSSadaf Ebrahimi {
2298*22dc650dSSadaf Ebrahimi *ptypeptr = PRIV(utt)[i].type;
2299*22dc650dSSadaf Ebrahimi return TRUE;
2300*22dc650dSSadaf Ebrahimi }
2301*22dc650dSSadaf Ebrahimi
2302*22dc650dSSadaf Ebrahimi switch (PRIV(utt)[i].type)
2303*22dc650dSSadaf Ebrahimi {
2304*22dc650dSSadaf Ebrahimi case PT_SC:
2305*22dc650dSSadaf Ebrahimi *ptypeptr = PT_SC;
2306*22dc650dSSadaf Ebrahimi return TRUE;
2307*22dc650dSSadaf Ebrahimi
2308*22dc650dSSadaf Ebrahimi case PT_SCX:
2309*22dc650dSSadaf Ebrahimi *ptypeptr = ptscript;
2310*22dc650dSSadaf Ebrahimi return TRUE;
2311*22dc650dSSadaf Ebrahimi }
2312*22dc650dSSadaf Ebrahimi
2313*22dc650dSSadaf Ebrahimi break; /* Non-script found */
2314*22dc650dSSadaf Ebrahimi }
2315*22dc650dSSadaf Ebrahimi
2316*22dc650dSSadaf Ebrahimi if (r > 0) bot = i + 1; else top = i;
2317*22dc650dSSadaf Ebrahimi }
2318*22dc650dSSadaf Ebrahimi
2319*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR47; /* Unrecognized property */
2320*22dc650dSSadaf Ebrahimi return FALSE;
2321*22dc650dSSadaf Ebrahimi
2322*22dc650dSSadaf Ebrahimi ERROR_RETURN: /* Malformed \P or \p */
2323*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR46;
2324*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
2325*22dc650dSSadaf Ebrahimi return FALSE;
2326*22dc650dSSadaf Ebrahimi }
2327*22dc650dSSadaf Ebrahimi #endif
2328*22dc650dSSadaf Ebrahimi
2329*22dc650dSSadaf Ebrahimi
2330*22dc650dSSadaf Ebrahimi
2331*22dc650dSSadaf Ebrahimi /*************************************************
2332*22dc650dSSadaf Ebrahimi * Check for POSIX class syntax *
2333*22dc650dSSadaf Ebrahimi *************************************************/
2334*22dc650dSSadaf Ebrahimi
2335*22dc650dSSadaf Ebrahimi /* This function is called when the sequence "[:" or "[." or "[=" is
2336*22dc650dSSadaf Ebrahimi encountered in a character class. It checks whether this is followed by a
2337*22dc650dSSadaf Ebrahimi sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2338*22dc650dSSadaf Ebrahimi reach an unescaped ']' without the special preceding character, return FALSE.
2339*22dc650dSSadaf Ebrahimi
2340*22dc650dSSadaf Ebrahimi Originally, this function only recognized a sequence of letters between the
2341*22dc650dSSadaf Ebrahimi terminators, but it seems that Perl recognizes any sequence of characters,
2342*22dc650dSSadaf Ebrahimi though of course unknown POSIX names are subsequently rejected. Perl gives an
2343*22dc650dSSadaf Ebrahimi "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2344*22dc650dSSadaf Ebrahimi didn't consider this to be a POSIX class. Likewise for [:1234:].
2345*22dc650dSSadaf Ebrahimi
2346*22dc650dSSadaf Ebrahimi The problem in trying to be exactly like Perl is in the handling of escapes. We
2347*22dc650dSSadaf Ebrahimi have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2348*22dc650dSSadaf Ebrahimi class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2349*22dc650dSSadaf Ebrahimi below handles the special cases \\ and \], but does not try to do any other
2350*22dc650dSSadaf Ebrahimi escape processing. This makes it different from Perl for cases such as
2351*22dc650dSSadaf Ebrahimi [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2352*22dc650dSSadaf Ebrahimi not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2353*22dc650dSSadaf Ebrahimi when Perl does, I think.
2354*22dc650dSSadaf Ebrahimi
2355*22dc650dSSadaf Ebrahimi A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2356*22dc650dSSadaf Ebrahimi It seems that the appearance of a nested POSIX class supersedes an apparent
2357*22dc650dSSadaf Ebrahimi external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2358*22dc650dSSadaf Ebrahimi a digit. This is handled by returning FALSE if the start of a new group with
2359*22dc650dSSadaf Ebrahimi the same terminator is encountered, since the next closing sequence must close
2360*22dc650dSSadaf Ebrahimi the nested group, not the outer one.
2361*22dc650dSSadaf Ebrahimi
2362*22dc650dSSadaf Ebrahimi In Perl, unescaped square brackets may also appear as part of class names. For
2363*22dc650dSSadaf Ebrahimi example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2364*22dc650dSSadaf Ebrahimi [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2365*22dc650dSSadaf Ebrahimi seem right at all. PCRE does not allow closing square brackets in POSIX class
2366*22dc650dSSadaf Ebrahimi names.
2367*22dc650dSSadaf Ebrahimi
2368*22dc650dSSadaf Ebrahimi Arguments:
2369*22dc650dSSadaf Ebrahimi ptr pointer to the character after the initial [ (colon, dot, equals)
2370*22dc650dSSadaf Ebrahimi ptrend pointer to the end of the pattern
2371*22dc650dSSadaf Ebrahimi endptr where to return a pointer to the terminating ':', '.', or '='
2372*22dc650dSSadaf Ebrahimi
2373*22dc650dSSadaf Ebrahimi Returns: TRUE or FALSE
2374*22dc650dSSadaf Ebrahimi */
2375*22dc650dSSadaf Ebrahimi
2376*22dc650dSSadaf Ebrahimi static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2377*22dc650dSSadaf Ebrahimi check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2378*22dc650dSSadaf Ebrahimi {
2379*22dc650dSSadaf Ebrahimi PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2380*22dc650dSSadaf Ebrahimi terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2381*22dc650dSSadaf Ebrahimi
2382*22dc650dSSadaf Ebrahimi for (; ptrend - ptr >= 2; ptr++)
2383*22dc650dSSadaf Ebrahimi {
2384*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_BACKSLASH &&
2385*22dc650dSSadaf Ebrahimi (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2386*22dc650dSSadaf Ebrahimi ptr++;
2387*22dc650dSSadaf Ebrahimi
2388*22dc650dSSadaf Ebrahimi else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2389*22dc650dSSadaf Ebrahimi *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2390*22dc650dSSadaf Ebrahimi
2391*22dc650dSSadaf Ebrahimi else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2392*22dc650dSSadaf Ebrahimi {
2393*22dc650dSSadaf Ebrahimi *endptr = ptr;
2394*22dc650dSSadaf Ebrahimi return TRUE;
2395*22dc650dSSadaf Ebrahimi }
2396*22dc650dSSadaf Ebrahimi }
2397*22dc650dSSadaf Ebrahimi
2398*22dc650dSSadaf Ebrahimi return FALSE;
2399*22dc650dSSadaf Ebrahimi }
2400*22dc650dSSadaf Ebrahimi
2401*22dc650dSSadaf Ebrahimi
2402*22dc650dSSadaf Ebrahimi
2403*22dc650dSSadaf Ebrahimi /*************************************************
2404*22dc650dSSadaf Ebrahimi * Check POSIX class name *
2405*22dc650dSSadaf Ebrahimi *************************************************/
2406*22dc650dSSadaf Ebrahimi
2407*22dc650dSSadaf Ebrahimi /* This function is called to check the name given in a POSIX-style class entry
2408*22dc650dSSadaf Ebrahimi such as [:alnum:].
2409*22dc650dSSadaf Ebrahimi
2410*22dc650dSSadaf Ebrahimi Arguments:
2411*22dc650dSSadaf Ebrahimi ptr points to the first letter
2412*22dc650dSSadaf Ebrahimi len the length of the name
2413*22dc650dSSadaf Ebrahimi
2414*22dc650dSSadaf Ebrahimi Returns: a value representing the name, or -1 if unknown
2415*22dc650dSSadaf Ebrahimi */
2416*22dc650dSSadaf Ebrahimi
2417*22dc650dSSadaf Ebrahimi static int
check_posix_name(PCRE2_SPTR ptr,int len)2418*22dc650dSSadaf Ebrahimi check_posix_name(PCRE2_SPTR ptr, int len)
2419*22dc650dSSadaf Ebrahimi {
2420*22dc650dSSadaf Ebrahimi const char *pn = posix_names;
2421*22dc650dSSadaf Ebrahimi int yield = 0;
2422*22dc650dSSadaf Ebrahimi while (posix_name_lengths[yield] != 0)
2423*22dc650dSSadaf Ebrahimi {
2424*22dc650dSSadaf Ebrahimi if (len == posix_name_lengths[yield] &&
2425*22dc650dSSadaf Ebrahimi PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2426*22dc650dSSadaf Ebrahimi pn += posix_name_lengths[yield] + 1;
2427*22dc650dSSadaf Ebrahimi yield++;
2428*22dc650dSSadaf Ebrahimi }
2429*22dc650dSSadaf Ebrahimi return -1;
2430*22dc650dSSadaf Ebrahimi }
2431*22dc650dSSadaf Ebrahimi
2432*22dc650dSSadaf Ebrahimi
2433*22dc650dSSadaf Ebrahimi
2434*22dc650dSSadaf Ebrahimi /*************************************************
2435*22dc650dSSadaf Ebrahimi * Read a subpattern or VERB name *
2436*22dc650dSSadaf Ebrahimi *************************************************/
2437*22dc650dSSadaf Ebrahimi
2438*22dc650dSSadaf Ebrahimi /* This function is called from parse_regex() below whenever it needs to read
2439*22dc650dSSadaf Ebrahimi the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2440*22dc650dSSadaf Ebrahimi pointer must be to the preceding character. If that character is '*' we are
2441*22dc650dSSadaf Ebrahimi reading a verb or alpha assertion name. The pointer is updated to point after
2442*22dc650dSSadaf Ebrahimi the name, for a VERB or alpha assertion name, or after the name's terminator
2443*22dc650dSSadaf Ebrahimi for a subpattern name. Returning both the offset and the name pointer is
2444*22dc650dSSadaf Ebrahimi redundant information, but some callers use one and some the other, so it is
2445*22dc650dSSadaf Ebrahimi simplest just to return both. When the name is in braces, spaces and tabs are
2446*22dc650dSSadaf Ebrahimi allowed (and ignored) at either end.
2447*22dc650dSSadaf Ebrahimi
2448*22dc650dSSadaf Ebrahimi Arguments:
2449*22dc650dSSadaf Ebrahimi ptrptr points to the character pointer variable
2450*22dc650dSSadaf Ebrahimi ptrend points to the end of the input string
2451*22dc650dSSadaf Ebrahimi utf true if the input is UTF-encoded
2452*22dc650dSSadaf Ebrahimi terminator the terminator of a subpattern name must be this
2453*22dc650dSSadaf Ebrahimi offsetptr where to put the offset from the start of the pattern
2454*22dc650dSSadaf Ebrahimi nameptr where to put a pointer to the name in the input
2455*22dc650dSSadaf Ebrahimi namelenptr where to put the length of the name
2456*22dc650dSSadaf Ebrahimi errorcodeptr where to put an error code
2457*22dc650dSSadaf Ebrahimi cb pointer to the compile data block
2458*22dc650dSSadaf Ebrahimi
2459*22dc650dSSadaf Ebrahimi Returns: TRUE if a name was read
2460*22dc650dSSadaf Ebrahimi FALSE otherwise, with error code set
2461*22dc650dSSadaf Ebrahimi */
2462*22dc650dSSadaf Ebrahimi
2463*22dc650dSSadaf Ebrahimi static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2464*22dc650dSSadaf Ebrahimi read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2465*22dc650dSSadaf Ebrahimi PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2466*22dc650dSSadaf Ebrahimi int *errorcodeptr, compile_block *cb)
2467*22dc650dSSadaf Ebrahimi {
2468*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr = *ptrptr;
2469*22dc650dSSadaf Ebrahimi BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2470*22dc650dSSadaf Ebrahimi BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2471*22dc650dSSadaf Ebrahimi
2472*22dc650dSSadaf Ebrahimi if (is_braced)
2473*22dc650dSSadaf Ebrahimi while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2474*22dc650dSSadaf Ebrahimi
2475*22dc650dSSadaf Ebrahimi if (ptr >= ptrend) /* No characters in name */
2476*22dc650dSSadaf Ebrahimi {
2477*22dc650dSSadaf Ebrahimi *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2478*22dc650dSSadaf Ebrahimi ERR60; /* Verb not recognized or malformed */
2479*22dc650dSSadaf Ebrahimi goto FAILED;
2480*22dc650dSSadaf Ebrahimi }
2481*22dc650dSSadaf Ebrahimi
2482*22dc650dSSadaf Ebrahimi *nameptr = ptr;
2483*22dc650dSSadaf Ebrahimi *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2484*22dc650dSSadaf Ebrahimi
2485*22dc650dSSadaf Ebrahimi /* In UTF mode, a group name may contain letters and decimal digits as defined
2486*22dc650dSSadaf Ebrahimi by Unicode properties, and underscores, but must not start with a digit. */
2487*22dc650dSSadaf Ebrahimi
2488*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2489*22dc650dSSadaf Ebrahimi if (utf && is_group)
2490*22dc650dSSadaf Ebrahimi {
2491*22dc650dSSadaf Ebrahimi uint32_t c, type;
2492*22dc650dSSadaf Ebrahimi
2493*22dc650dSSadaf Ebrahimi GETCHAR(c, ptr);
2494*22dc650dSSadaf Ebrahimi type = UCD_CHARTYPE(c);
2495*22dc650dSSadaf Ebrahimi
2496*22dc650dSSadaf Ebrahimi if (type == ucp_Nd)
2497*22dc650dSSadaf Ebrahimi {
2498*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR44;
2499*22dc650dSSadaf Ebrahimi goto FAILED;
2500*22dc650dSSadaf Ebrahimi }
2501*22dc650dSSadaf Ebrahimi
2502*22dc650dSSadaf Ebrahimi for(;;)
2503*22dc650dSSadaf Ebrahimi {
2504*22dc650dSSadaf Ebrahimi if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2505*22dc650dSSadaf Ebrahimi c != CHAR_UNDERSCORE) break;
2506*22dc650dSSadaf Ebrahimi ptr++;
2507*22dc650dSSadaf Ebrahimi FORWARDCHARTEST(ptr, ptrend);
2508*22dc650dSSadaf Ebrahimi if (ptr >= ptrend) break;
2509*22dc650dSSadaf Ebrahimi GETCHAR(c, ptr);
2510*22dc650dSSadaf Ebrahimi type = UCD_CHARTYPE(c);
2511*22dc650dSSadaf Ebrahimi }
2512*22dc650dSSadaf Ebrahimi }
2513*22dc650dSSadaf Ebrahimi else
2514*22dc650dSSadaf Ebrahimi #else
2515*22dc650dSSadaf Ebrahimi (void)utf; /* Avoid compiler warning */
2516*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
2517*22dc650dSSadaf Ebrahimi
2518*22dc650dSSadaf Ebrahimi /* Handle non-group names and group names in non-UTF modes. A group name must
2519*22dc650dSSadaf Ebrahimi not start with a digit. If either of the others start with a digit it just
2520*22dc650dSSadaf Ebrahimi won't be recognized. */
2521*22dc650dSSadaf Ebrahimi
2522*22dc650dSSadaf Ebrahimi {
2523*22dc650dSSadaf Ebrahimi if (is_group && IS_DIGIT(*ptr))
2524*22dc650dSSadaf Ebrahimi {
2525*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR44;
2526*22dc650dSSadaf Ebrahimi goto FAILED;
2527*22dc650dSSadaf Ebrahimi }
2528*22dc650dSSadaf Ebrahimi
2529*22dc650dSSadaf Ebrahimi while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2530*22dc650dSSadaf Ebrahimi {
2531*22dc650dSSadaf Ebrahimi ptr++;
2532*22dc650dSSadaf Ebrahimi }
2533*22dc650dSSadaf Ebrahimi }
2534*22dc650dSSadaf Ebrahimi
2535*22dc650dSSadaf Ebrahimi /* Check name length */
2536*22dc650dSSadaf Ebrahimi
2537*22dc650dSSadaf Ebrahimi if (ptr > *nameptr + MAX_NAME_SIZE)
2538*22dc650dSSadaf Ebrahimi {
2539*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR48;
2540*22dc650dSSadaf Ebrahimi goto FAILED;
2541*22dc650dSSadaf Ebrahimi }
2542*22dc650dSSadaf Ebrahimi *namelenptr = (uint32_t)(ptr - *nameptr);
2543*22dc650dSSadaf Ebrahimi
2544*22dc650dSSadaf Ebrahimi /* Subpattern names must not be empty, and their terminator is checked here.
2545*22dc650dSSadaf Ebrahimi (What follows a verb or alpha assertion name is checked separately.) */
2546*22dc650dSSadaf Ebrahimi
2547*22dc650dSSadaf Ebrahimi if (is_group)
2548*22dc650dSSadaf Ebrahimi {
2549*22dc650dSSadaf Ebrahimi if (ptr == *nameptr)
2550*22dc650dSSadaf Ebrahimi {
2551*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR62; /* Subpattern name expected */
2552*22dc650dSSadaf Ebrahimi goto FAILED;
2553*22dc650dSSadaf Ebrahimi }
2554*22dc650dSSadaf Ebrahimi if (is_braced)
2555*22dc650dSSadaf Ebrahimi while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2556*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2557*22dc650dSSadaf Ebrahimi {
2558*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR42;
2559*22dc650dSSadaf Ebrahimi goto FAILED;
2560*22dc650dSSadaf Ebrahimi }
2561*22dc650dSSadaf Ebrahimi ptr++;
2562*22dc650dSSadaf Ebrahimi }
2563*22dc650dSSadaf Ebrahimi
2564*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
2565*22dc650dSSadaf Ebrahimi return TRUE;
2566*22dc650dSSadaf Ebrahimi
2567*22dc650dSSadaf Ebrahimi FAILED:
2568*22dc650dSSadaf Ebrahimi *ptrptr = ptr;
2569*22dc650dSSadaf Ebrahimi return FALSE;
2570*22dc650dSSadaf Ebrahimi }
2571*22dc650dSSadaf Ebrahimi
2572*22dc650dSSadaf Ebrahimi
2573*22dc650dSSadaf Ebrahimi
2574*22dc650dSSadaf Ebrahimi /*************************************************
2575*22dc650dSSadaf Ebrahimi * Manage callouts at start of cycle *
2576*22dc650dSSadaf Ebrahimi *************************************************/
2577*22dc650dSSadaf Ebrahimi
2578*22dc650dSSadaf Ebrahimi /* At the start of a new item in parse_regex() we are able to record the
2579*22dc650dSSadaf Ebrahimi details of the previous item in a prior callout, and also to set up an
2580*22dc650dSSadaf Ebrahimi automatic callout if enabled. Avoid having two adjacent automatic callouts,
2581*22dc650dSSadaf Ebrahimi which would otherwise happen for items such as \Q that contribute nothing to
2582*22dc650dSSadaf Ebrahimi the parsed pattern.
2583*22dc650dSSadaf Ebrahimi
2584*22dc650dSSadaf Ebrahimi Arguments:
2585*22dc650dSSadaf Ebrahimi ptr current pattern pointer
2586*22dc650dSSadaf Ebrahimi pcalloutptr points to a pointer to previous callout, or NULL
2587*22dc650dSSadaf Ebrahimi auto_callout TRUE if auto_callouts are enabled
2588*22dc650dSSadaf Ebrahimi parsed_pattern the parsed pattern pointer
2589*22dc650dSSadaf Ebrahimi cb compile block
2590*22dc650dSSadaf Ebrahimi
2591*22dc650dSSadaf Ebrahimi Returns: possibly updated parsed_pattern pointer.
2592*22dc650dSSadaf Ebrahimi */
2593*22dc650dSSadaf Ebrahimi
2594*22dc650dSSadaf Ebrahimi static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2595*22dc650dSSadaf Ebrahimi manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2596*22dc650dSSadaf Ebrahimi uint32_t *parsed_pattern, compile_block *cb)
2597*22dc650dSSadaf Ebrahimi {
2598*22dc650dSSadaf Ebrahimi uint32_t *previous_callout = *pcalloutptr;
2599*22dc650dSSadaf Ebrahimi
2600*22dc650dSSadaf Ebrahimi if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2601*22dc650dSSadaf Ebrahimi cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2602*22dc650dSSadaf Ebrahimi
2603*22dc650dSSadaf Ebrahimi if (!auto_callout) previous_callout = NULL; else
2604*22dc650dSSadaf Ebrahimi {
2605*22dc650dSSadaf Ebrahimi if (previous_callout == NULL ||
2606*22dc650dSSadaf Ebrahimi previous_callout != parsed_pattern - 4 ||
2607*22dc650dSSadaf Ebrahimi previous_callout[3] != 255)
2608*22dc650dSSadaf Ebrahimi {
2609*22dc650dSSadaf Ebrahimi previous_callout = parsed_pattern; /* Set up new automatic callout */
2610*22dc650dSSadaf Ebrahimi parsed_pattern += 4;
2611*22dc650dSSadaf Ebrahimi previous_callout[0] = META_CALLOUT_NUMBER;
2612*22dc650dSSadaf Ebrahimi previous_callout[2] = 0;
2613*22dc650dSSadaf Ebrahimi previous_callout[3] = 255;
2614*22dc650dSSadaf Ebrahimi }
2615*22dc650dSSadaf Ebrahimi previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2616*22dc650dSSadaf Ebrahimi }
2617*22dc650dSSadaf Ebrahimi
2618*22dc650dSSadaf Ebrahimi *pcalloutptr = previous_callout;
2619*22dc650dSSadaf Ebrahimi return parsed_pattern;
2620*22dc650dSSadaf Ebrahimi }
2621*22dc650dSSadaf Ebrahimi
2622*22dc650dSSadaf Ebrahimi
2623*22dc650dSSadaf Ebrahimi
2624*22dc650dSSadaf Ebrahimi /*************************************************
2625*22dc650dSSadaf Ebrahimi * Handle \d, \D, \s, \S, \w, \W *
2626*22dc650dSSadaf Ebrahimi *************************************************/
2627*22dc650dSSadaf Ebrahimi
2628*22dc650dSSadaf Ebrahimi /* This function is called from parse_regex() below, both for freestanding
2629*22dc650dSSadaf Ebrahimi escapes, and those within classes, to handle those escapes that may change when
2630*22dc650dSSadaf Ebrahimi Unicode property support is requested. Note that PCRE2_UCP will never be set
2631*22dc650dSSadaf Ebrahimi without Unicode support because that is checked when pcre2_compile() is called.
2632*22dc650dSSadaf Ebrahimi
2633*22dc650dSSadaf Ebrahimi Arguments:
2634*22dc650dSSadaf Ebrahimi escape the ESC_... value
2635*22dc650dSSadaf Ebrahimi parsed_pattern where to add the code
2636*22dc650dSSadaf Ebrahimi options options bits
2637*22dc650dSSadaf Ebrahimi xoptions extra options bits
2638*22dc650dSSadaf Ebrahimi
2639*22dc650dSSadaf Ebrahimi Returns: updated value of parsed_pattern
2640*22dc650dSSadaf Ebrahimi */
2641*22dc650dSSadaf Ebrahimi static uint32_t *
handle_escdsw(int escape,uint32_t * parsed_pattern,uint32_t options,uint32_t xoptions)2642*22dc650dSSadaf Ebrahimi handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2643*22dc650dSSadaf Ebrahimi uint32_t xoptions)
2644*22dc650dSSadaf Ebrahimi {
2645*22dc650dSSadaf Ebrahimi uint32_t ascii_option = 0;
2646*22dc650dSSadaf Ebrahimi uint32_t prop = ESC_p;
2647*22dc650dSSadaf Ebrahimi
2648*22dc650dSSadaf Ebrahimi switch(escape)
2649*22dc650dSSadaf Ebrahimi {
2650*22dc650dSSadaf Ebrahimi case ESC_D:
2651*22dc650dSSadaf Ebrahimi prop = ESC_P;
2652*22dc650dSSadaf Ebrahimi /* Fall through */
2653*22dc650dSSadaf Ebrahimi case ESC_d:
2654*22dc650dSSadaf Ebrahimi ascii_option = PCRE2_EXTRA_ASCII_BSD;
2655*22dc650dSSadaf Ebrahimi break;
2656*22dc650dSSadaf Ebrahimi
2657*22dc650dSSadaf Ebrahimi case ESC_S:
2658*22dc650dSSadaf Ebrahimi prop = ESC_P;
2659*22dc650dSSadaf Ebrahimi /* Fall through */
2660*22dc650dSSadaf Ebrahimi case ESC_s:
2661*22dc650dSSadaf Ebrahimi ascii_option = PCRE2_EXTRA_ASCII_BSS;
2662*22dc650dSSadaf Ebrahimi break;
2663*22dc650dSSadaf Ebrahimi
2664*22dc650dSSadaf Ebrahimi case ESC_W:
2665*22dc650dSSadaf Ebrahimi prop = ESC_P;
2666*22dc650dSSadaf Ebrahimi /* Fall through */
2667*22dc650dSSadaf Ebrahimi case ESC_w:
2668*22dc650dSSadaf Ebrahimi ascii_option = PCRE2_EXTRA_ASCII_BSW;
2669*22dc650dSSadaf Ebrahimi break;
2670*22dc650dSSadaf Ebrahimi }
2671*22dc650dSSadaf Ebrahimi
2672*22dc650dSSadaf Ebrahimi if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2673*22dc650dSSadaf Ebrahimi {
2674*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + escape;
2675*22dc650dSSadaf Ebrahimi }
2676*22dc650dSSadaf Ebrahimi else
2677*22dc650dSSadaf Ebrahimi {
2678*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + prop;
2679*22dc650dSSadaf Ebrahimi switch(escape)
2680*22dc650dSSadaf Ebrahimi {
2681*22dc650dSSadaf Ebrahimi case ESC_d:
2682*22dc650dSSadaf Ebrahimi case ESC_D:
2683*22dc650dSSadaf Ebrahimi *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2684*22dc650dSSadaf Ebrahimi break;
2685*22dc650dSSadaf Ebrahimi
2686*22dc650dSSadaf Ebrahimi case ESC_s:
2687*22dc650dSSadaf Ebrahimi case ESC_S:
2688*22dc650dSSadaf Ebrahimi *parsed_pattern++ = PT_SPACE << 16;
2689*22dc650dSSadaf Ebrahimi break;
2690*22dc650dSSadaf Ebrahimi
2691*22dc650dSSadaf Ebrahimi case ESC_w:
2692*22dc650dSSadaf Ebrahimi case ESC_W:
2693*22dc650dSSadaf Ebrahimi *parsed_pattern++ = PT_WORD << 16;
2694*22dc650dSSadaf Ebrahimi break;
2695*22dc650dSSadaf Ebrahimi }
2696*22dc650dSSadaf Ebrahimi }
2697*22dc650dSSadaf Ebrahimi
2698*22dc650dSSadaf Ebrahimi return parsed_pattern;
2699*22dc650dSSadaf Ebrahimi }
2700*22dc650dSSadaf Ebrahimi
2701*22dc650dSSadaf Ebrahimi
2702*22dc650dSSadaf Ebrahimi
2703*22dc650dSSadaf Ebrahimi /*************************************************
2704*22dc650dSSadaf Ebrahimi * Parse regex and identify named groups *
2705*22dc650dSSadaf Ebrahimi *************************************************/
2706*22dc650dSSadaf Ebrahimi
2707*22dc650dSSadaf Ebrahimi /* This function is called first of all. It scans the pattern and does two
2708*22dc650dSSadaf Ebrahimi things: (1) It identifies capturing groups and makes a table of named capturing
2709*22dc650dSSadaf Ebrahimi groups so that information about them is fully available to both the compiling
2710*22dc650dSSadaf Ebrahimi scans. (2) It writes a parsed version of the pattern with comments omitted and
2711*22dc650dSSadaf Ebrahimi escapes processed into the parsed_pattern vector.
2712*22dc650dSSadaf Ebrahimi
2713*22dc650dSSadaf Ebrahimi Arguments:
2714*22dc650dSSadaf Ebrahimi ptr points to the start of the pattern
2715*22dc650dSSadaf Ebrahimi options compiling dynamic options (may change during the scan)
2716*22dc650dSSadaf Ebrahimi has_lookbehind points to a boolean, set TRUE if a lookbehind is found
2717*22dc650dSSadaf Ebrahimi cb pointer to the compile data block
2718*22dc650dSSadaf Ebrahimi
2719*22dc650dSSadaf Ebrahimi Returns: zero on success or a non-zero error code, with the
2720*22dc650dSSadaf Ebrahimi error offset placed in the cb field
2721*22dc650dSSadaf Ebrahimi */
2722*22dc650dSSadaf Ebrahimi
/* Book-keeping for nested groups. parse_regex() declares pointers to
nest_save records (top_nest, end_nests) and bounds them by the end of the
compile workspace. The NSF_ values are the flag bits that go with the
structure (presumably kept in its flags field). The per-field notes below are
inferred from the field names and should be confirmed against the code that
fills the records in. */

struct nest_save {
  uint16_t nest_depth;     /* nesting depth the record refers to (inferred) */
  uint16_t reset_group;    /* group number noted for a reset (inferred) */
  uint16_t max_group;      /* highest group number noted (inferred) */
  uint16_t flags;          /* NSF_ bits (inferred) */
  uint32_t options;        /* saved main options (inferred) */
  uint32_t xoptions;       /* saved extra options (inferred) */
};

typedef struct nest_save nest_save;

#define NSF_RESET       0x0001u
#define NSF_CONDASSERT  0x0002u
#define NSF_ATOMICSR    0x0004u
/* Options that can be changed inside a pattern have to be followed while
parsing. A few of them (PCRE2_EXTENDED, for instance) are acted on entirely by
the parser, but every one is tracked so that META_OPTIONS items carry the
right values into the main compiling phase. */

#define PARSE_TRACKED_OPTIONS \
  (PCRE2_CASELESS        | \
   PCRE2_DOTALL          | \
   PCRE2_DUPNAMES        | \
   PCRE2_EXTENDED        | \
   PCRE2_EXTENDED_MORE   | \
   PCRE2_MULTILINE       | \
   PCRE2_NO_AUTO_CAPTURE | \
   PCRE2_UNGREEDY)

/* The same, for the bits held in the extra options word. */

#define PARSE_TRACKED_EXTRA_OPTIONS \
  (PCRE2_EXTRA_CASELESS_RESTRICT | \
   PCRE2_EXTRA_ASCII_BSD         | \
   PCRE2_EXTRA_ASCII_BSS         | \
   PCRE2_EXTRA_ASCII_BSW         | \
   PCRE2_EXTRA_ASCII_DIGIT       | \
   PCRE2_EXTRA_ASCII_POSIX)
2750*22dc650dSSadaf Ebrahimi
/* The states used while analyzing ranges in character classes. The two OK
values must remain the last two in the list. */

enum {
  RANGE_NO,
  RANGE_STARTED,
  RANGE_OK_ESCAPED,
  RANGE_OK_LITERAL
};
2755*22dc650dSSadaf Ebrahimi
/* PARSED_LITERAL(c, p) stores the literal value c at *p, advances p, and sets
okquantifier to TRUE, because a literal in the main parsed pattern can always
be quantified. Only in 32-bit mode can a literal be >= META_END; there it is
preceded by META_BIGVALUE.

Arguments:
  c           the literal value; it is expanded more than once in 32-bit mode,
                so it must not have side effects
  p           a modifiable pointer into the parsed pattern vector

A variable called okquantifier must be in scope wherever the macro is used.
Each expansion is a single statement (do ... while (0)), so the macro is safe
as the body of an unbraced if or else; follow it with a semicolon. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  do \
    { \
    if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
    *(p)++ = (c); \
    okquantifier = TRUE; \
    } \
  while (0)
#else
#define PARSED_LITERAL(c, p) \
  do \
    { \
    *(p)++ = (c); \
    okquantifier = TRUE; \
    } \
  while (0)
#endif
2770*22dc650dSSadaf Ebrahimi
2771*22dc650dSSadaf Ebrahimi /* Here's the actual function. */
2772*22dc650dSSadaf Ebrahimi
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2773*22dc650dSSadaf Ebrahimi static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2774*22dc650dSSadaf Ebrahimi compile_block *cb)
2775*22dc650dSSadaf Ebrahimi {
2776*22dc650dSSadaf Ebrahimi uint32_t c;
2777*22dc650dSSadaf Ebrahimi uint32_t delimiter;
2778*22dc650dSSadaf Ebrahimi uint32_t namelen;
2779*22dc650dSSadaf Ebrahimi uint32_t class_range_state;
2780*22dc650dSSadaf Ebrahimi uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2781*22dc650dSSadaf Ebrahimi uint32_t *verbstartptr = NULL;
2782*22dc650dSSadaf Ebrahimi uint32_t *previous_callout = NULL;
2783*22dc650dSSadaf Ebrahimi uint32_t *parsed_pattern = cb->parsed_pattern;
2784*22dc650dSSadaf Ebrahimi uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2785*22dc650dSSadaf Ebrahimi uint32_t *this_parsed_item = NULL;
2786*22dc650dSSadaf Ebrahimi uint32_t *prev_parsed_item = NULL;
2787*22dc650dSSadaf Ebrahimi uint32_t meta_quantifier = 0;
2788*22dc650dSSadaf Ebrahimi uint32_t add_after_mark = 0;
2789*22dc650dSSadaf Ebrahimi uint32_t xoptions = cb->cx->extra_options;
2790*22dc650dSSadaf Ebrahimi uint16_t nest_depth = 0;
2791*22dc650dSSadaf Ebrahimi int after_manual_callout = 0;
2792*22dc650dSSadaf Ebrahimi int expect_cond_assert = 0;
2793*22dc650dSSadaf Ebrahimi int errorcode = 0;
2794*22dc650dSSadaf Ebrahimi int escape;
2795*22dc650dSSadaf Ebrahimi int i;
2796*22dc650dSSadaf Ebrahimi BOOL inescq = FALSE;
2797*22dc650dSSadaf Ebrahimi BOOL inverbname = FALSE;
2798*22dc650dSSadaf Ebrahimi BOOL utf = (options & PCRE2_UTF) != 0;
2799*22dc650dSSadaf Ebrahimi BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2800*22dc650dSSadaf Ebrahimi BOOL isdupname;
2801*22dc650dSSadaf Ebrahimi BOOL negate_class;
2802*22dc650dSSadaf Ebrahimi BOOL okquantifier = FALSE;
2803*22dc650dSSadaf Ebrahimi PCRE2_SPTR thisptr;
2804*22dc650dSSadaf Ebrahimi PCRE2_SPTR name;
2805*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptrend = cb->end_pattern;
2806*22dc650dSSadaf Ebrahimi PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2807*22dc650dSSadaf Ebrahimi named_group *ng;
2808*22dc650dSSadaf Ebrahimi nest_save *top_nest, *end_nests;
2809*22dc650dSSadaf Ebrahimi
2810*22dc650dSSadaf Ebrahimi /* Insert leading items for word and line matching (features provided for the
2811*22dc650dSSadaf Ebrahimi benefit of pcre2grep). */
2812*22dc650dSSadaf Ebrahimi
2813*22dc650dSSadaf Ebrahimi if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2814*22dc650dSSadaf Ebrahimi {
2815*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_CIRCUMFLEX;
2816*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_NOCAPTURE;
2817*22dc650dSSadaf Ebrahimi }
2818*22dc650dSSadaf Ebrahimi else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2819*22dc650dSSadaf Ebrahimi {
2820*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + ESC_b;
2821*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_NOCAPTURE;
2822*22dc650dSSadaf Ebrahimi }
2823*22dc650dSSadaf Ebrahimi
2824*22dc650dSSadaf Ebrahimi /* If the pattern is actually a literal string, process it separately to avoid
2825*22dc650dSSadaf Ebrahimi cluttering up the main loop. */
2826*22dc650dSSadaf Ebrahimi
2827*22dc650dSSadaf Ebrahimi if ((options & PCRE2_LITERAL) != 0)
2828*22dc650dSSadaf Ebrahimi {
2829*22dc650dSSadaf Ebrahimi while (ptr < ptrend)
2830*22dc650dSSadaf Ebrahimi {
2831*22dc650dSSadaf Ebrahimi if (parsed_pattern >= parsed_pattern_end)
2832*22dc650dSSadaf Ebrahimi {
2833*22dc650dSSadaf Ebrahimi errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2834*22dc650dSSadaf Ebrahimi goto FAILED;
2835*22dc650dSSadaf Ebrahimi }
2836*22dc650dSSadaf Ebrahimi thisptr = ptr;
2837*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr);
2838*22dc650dSSadaf Ebrahimi if (auto_callout)
2839*22dc650dSSadaf Ebrahimi parsed_pattern = manage_callouts(thisptr, &previous_callout,
2840*22dc650dSSadaf Ebrahimi auto_callout, parsed_pattern, cb);
2841*22dc650dSSadaf Ebrahimi PARSED_LITERAL(c, parsed_pattern);
2842*22dc650dSSadaf Ebrahimi }
2843*22dc650dSSadaf Ebrahimi goto PARSED_END;
2844*22dc650dSSadaf Ebrahimi }
2845*22dc650dSSadaf Ebrahimi
2846*22dc650dSSadaf Ebrahimi /* Process a real regex which may contain meta-characters. */
2847*22dc650dSSadaf Ebrahimi
2848*22dc650dSSadaf Ebrahimi top_nest = NULL;
2849*22dc650dSSadaf Ebrahimi end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2850*22dc650dSSadaf Ebrahimi
2851*22dc650dSSadaf Ebrahimi /* The size of the nest_save structure might not be a factor of the size of the
2852*22dc650dSSadaf Ebrahimi workspace. Therefore we must round down end_nests so as to correctly avoid
2853*22dc650dSSadaf Ebrahimi creating a nest_save that spans the end of the workspace. */
2854*22dc650dSSadaf Ebrahimi
2855*22dc650dSSadaf Ebrahimi end_nests = (nest_save *)((char *)end_nests -
2856*22dc650dSSadaf Ebrahimi ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2857*22dc650dSSadaf Ebrahimi
2858*22dc650dSSadaf Ebrahimi /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2859*22dc650dSSadaf Ebrahimi
2860*22dc650dSSadaf Ebrahimi if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2861*22dc650dSSadaf Ebrahimi
2862*22dc650dSSadaf Ebrahimi /* Now scan the pattern */
2863*22dc650dSSadaf Ebrahimi
2864*22dc650dSSadaf Ebrahimi while (ptr < ptrend)
2865*22dc650dSSadaf Ebrahimi {
2866*22dc650dSSadaf Ebrahimi int prev_expect_cond_assert;
2867*22dc650dSSadaf Ebrahimi uint32_t min_repeat = 0, max_repeat = 0;
2868*22dc650dSSadaf Ebrahimi uint32_t set, unset, *optset;
2869*22dc650dSSadaf Ebrahimi uint32_t xset, xunset, *xoptset;
2870*22dc650dSSadaf Ebrahimi uint32_t terminator;
2871*22dc650dSSadaf Ebrahimi uint32_t prev_meta_quantifier;
2872*22dc650dSSadaf Ebrahimi BOOL prev_okquantifier;
2873*22dc650dSSadaf Ebrahimi PCRE2_SPTR tempptr;
2874*22dc650dSSadaf Ebrahimi PCRE2_SIZE offset;
2875*22dc650dSSadaf Ebrahimi
2876*22dc650dSSadaf Ebrahimi if (parsed_pattern >= parsed_pattern_end)
2877*22dc650dSSadaf Ebrahimi {
2878*22dc650dSSadaf Ebrahimi errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2879*22dc650dSSadaf Ebrahimi goto FAILED;
2880*22dc650dSSadaf Ebrahimi }
2881*22dc650dSSadaf Ebrahimi
2882*22dc650dSSadaf Ebrahimi if (nest_depth > cb->cx->parens_nest_limit)
2883*22dc650dSSadaf Ebrahimi {
2884*22dc650dSSadaf Ebrahimi errorcode = ERR19;
2885*22dc650dSSadaf Ebrahimi goto FAILED; /* Parentheses too deeply nested */
2886*22dc650dSSadaf Ebrahimi }
2887*22dc650dSSadaf Ebrahimi
2888*22dc650dSSadaf Ebrahimi /* If the last time round this loop something was added, parsed_pattern will
2889*22dc650dSSadaf Ebrahimi no longer be equal to this_parsed_item. Remember where the previous item
2890*22dc650dSSadaf Ebrahimi started and reset for the next item. Note that sometimes round the loop,
2891*22dc650dSSadaf Ebrahimi nothing gets added (e.g. for ignored white space). */
2892*22dc650dSSadaf Ebrahimi
2893*22dc650dSSadaf Ebrahimi if (this_parsed_item != parsed_pattern)
2894*22dc650dSSadaf Ebrahimi {
2895*22dc650dSSadaf Ebrahimi prev_parsed_item = this_parsed_item;
2896*22dc650dSSadaf Ebrahimi this_parsed_item = parsed_pattern;
2897*22dc650dSSadaf Ebrahimi }
2898*22dc650dSSadaf Ebrahimi
2899*22dc650dSSadaf Ebrahimi /* Get next input character, save its position for callout handling. */
2900*22dc650dSSadaf Ebrahimi
2901*22dc650dSSadaf Ebrahimi thisptr = ptr;
2902*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr);
2903*22dc650dSSadaf Ebrahimi
2904*22dc650dSSadaf Ebrahimi /* Copy quoted literals until \E, allowing for the possibility of automatic
2905*22dc650dSSadaf Ebrahimi callouts, except when processing a (*VERB) "name". */
2906*22dc650dSSadaf Ebrahimi
2907*22dc650dSSadaf Ebrahimi if (inescq)
2908*22dc650dSSadaf Ebrahimi {
2909*22dc650dSSadaf Ebrahimi if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2910*22dc650dSSadaf Ebrahimi {
2911*22dc650dSSadaf Ebrahimi inescq = FALSE;
2912*22dc650dSSadaf Ebrahimi ptr++; /* Skip E */
2913*22dc650dSSadaf Ebrahimi }
2914*22dc650dSSadaf Ebrahimi else
2915*22dc650dSSadaf Ebrahimi {
2916*22dc650dSSadaf Ebrahimi if (expect_cond_assert > 0) /* A literal is not allowed if we are */
2917*22dc650dSSadaf Ebrahimi { /* expecting a conditional assertion, */
2918*22dc650dSSadaf Ebrahimi ptr--; /* but an empty \Q\E sequence is OK. */
2919*22dc650dSSadaf Ebrahimi errorcode = ERR28;
2920*22dc650dSSadaf Ebrahimi goto FAILED;
2921*22dc650dSSadaf Ebrahimi }
2922*22dc650dSSadaf Ebrahimi if (inverbname)
2923*22dc650dSSadaf Ebrahimi { /* Don't use PARSED_LITERAL() because it */
2924*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2925*22dc650dSSadaf Ebrahimi if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2926*22dc650dSSadaf Ebrahimi #endif
2927*22dc650dSSadaf Ebrahimi *parsed_pattern++ = c;
2928*22dc650dSSadaf Ebrahimi }
2929*22dc650dSSadaf Ebrahimi else
2930*22dc650dSSadaf Ebrahimi {
2931*22dc650dSSadaf Ebrahimi if (after_manual_callout-- <= 0)
2932*22dc650dSSadaf Ebrahimi parsed_pattern = manage_callouts(thisptr, &previous_callout,
2933*22dc650dSSadaf Ebrahimi auto_callout, parsed_pattern, cb);
2934*22dc650dSSadaf Ebrahimi PARSED_LITERAL(c, parsed_pattern);
2935*22dc650dSSadaf Ebrahimi }
2936*22dc650dSSadaf Ebrahimi meta_quantifier = 0;
2937*22dc650dSSadaf Ebrahimi }
2938*22dc650dSSadaf Ebrahimi continue; /* Next character */
2939*22dc650dSSadaf Ebrahimi }
2940*22dc650dSSadaf Ebrahimi
2941*22dc650dSSadaf Ebrahimi /* If we are processing the "name" part of a (*VERB:NAME) item, all
2942*22dc650dSSadaf Ebrahimi characters up to the closing parenthesis are literals except when
2943*22dc650dSSadaf Ebrahimi PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2944*22dc650dSSadaf Ebrahimi and \E and escaped characters are allowed (no character types such as \d). If
2945*22dc650dSSadaf Ebrahimi PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2946*22dc650dSSadaf Ebrahimi this by not entering the special (*VERB:NAME) processing - they are then
2947*22dc650dSSadaf Ebrahimi picked up below. Note that c is a character, not a code unit, so we must not
2948*22dc650dSSadaf Ebrahimi use MAX_255 to test its size because MAX_255 tests code units and is assumed
2949*22dc650dSSadaf Ebrahimi TRUE in 8-bit mode. */
2950*22dc650dSSadaf Ebrahimi
2951*22dc650dSSadaf Ebrahimi if (inverbname &&
2952*22dc650dSSadaf Ebrahimi (
2953*22dc650dSSadaf Ebrahimi /* EITHER: not both options set */
2954*22dc650dSSadaf Ebrahimi ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2955*22dc650dSSadaf Ebrahimi (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2956*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2957*22dc650dSSadaf Ebrahimi /* OR: character > 255 AND not Unicode Pattern White Space */
2958*22dc650dSSadaf Ebrahimi (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2959*22dc650dSSadaf Ebrahimi #endif
2960*22dc650dSSadaf Ebrahimi /* OR: not a # comment or isspace() white space */
2961*22dc650dSSadaf Ebrahimi (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2962*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2963*22dc650dSSadaf Ebrahimi /* and not CHAR_NEL when Unicode is supported */
2964*22dc650dSSadaf Ebrahimi && c != CHAR_NEL
2965*22dc650dSSadaf Ebrahimi #endif
2966*22dc650dSSadaf Ebrahimi )))
2967*22dc650dSSadaf Ebrahimi {
2968*22dc650dSSadaf Ebrahimi PCRE2_SIZE verbnamelength;
2969*22dc650dSSadaf Ebrahimi
2970*22dc650dSSadaf Ebrahimi switch(c)
2971*22dc650dSSadaf Ebrahimi {
2972*22dc650dSSadaf Ebrahimi default: /* Don't use PARSED_LITERAL() because it */
2973*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2974*22dc650dSSadaf Ebrahimi if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2975*22dc650dSSadaf Ebrahimi #endif
2976*22dc650dSSadaf Ebrahimi *parsed_pattern++ = c;
2977*22dc650dSSadaf Ebrahimi break;
2978*22dc650dSSadaf Ebrahimi
2979*22dc650dSSadaf Ebrahimi case CHAR_RIGHT_PARENTHESIS:
2980*22dc650dSSadaf Ebrahimi inverbname = FALSE;
2981*22dc650dSSadaf Ebrahimi /* This is the length in characters */
2982*22dc650dSSadaf Ebrahimi verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2983*22dc650dSSadaf Ebrahimi /* But the limit on the length is in code units */
2984*22dc650dSSadaf Ebrahimi if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2985*22dc650dSSadaf Ebrahimi {
2986*22dc650dSSadaf Ebrahimi ptr--;
2987*22dc650dSSadaf Ebrahimi errorcode = ERR76;
2988*22dc650dSSadaf Ebrahimi goto FAILED;
2989*22dc650dSSadaf Ebrahimi }
2990*22dc650dSSadaf Ebrahimi *verblengthptr = (uint32_t)verbnamelength;
2991*22dc650dSSadaf Ebrahimi
2992*22dc650dSSadaf Ebrahimi /* If this name was on a verb such as (*ACCEPT) which does not continue,
2993*22dc650dSSadaf Ebrahimi a (*MARK) was generated for the name. We now add the original verb as the
2994*22dc650dSSadaf Ebrahimi next item. */
2995*22dc650dSSadaf Ebrahimi
2996*22dc650dSSadaf Ebrahimi if (add_after_mark != 0)
2997*22dc650dSSadaf Ebrahimi {
2998*22dc650dSSadaf Ebrahimi *parsed_pattern++ = add_after_mark;
2999*22dc650dSSadaf Ebrahimi add_after_mark = 0;
3000*22dc650dSSadaf Ebrahimi }
3001*22dc650dSSadaf Ebrahimi break;
3002*22dc650dSSadaf Ebrahimi
3003*22dc650dSSadaf Ebrahimi case CHAR_BACKSLASH:
3004*22dc650dSSadaf Ebrahimi if ((options & PCRE2_ALT_VERBNAMES) != 0)
3005*22dc650dSSadaf Ebrahimi {
3006*22dc650dSSadaf Ebrahimi escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3007*22dc650dSSadaf Ebrahimi xoptions, FALSE, cb);
3008*22dc650dSSadaf Ebrahimi if (errorcode != 0) goto FAILED;
3009*22dc650dSSadaf Ebrahimi }
3010*22dc650dSSadaf Ebrahimi else escape = 0; /* Treat all as literal */
3011*22dc650dSSadaf Ebrahimi
3012*22dc650dSSadaf Ebrahimi switch(escape)
3013*22dc650dSSadaf Ebrahimi {
3014*22dc650dSSadaf Ebrahimi case 0: /* Don't use PARSED_LITERAL() because it */
3015*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
3016*22dc650dSSadaf Ebrahimi if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3017*22dc650dSSadaf Ebrahimi #endif
3018*22dc650dSSadaf Ebrahimi *parsed_pattern++ = c;
3019*22dc650dSSadaf Ebrahimi break;
3020*22dc650dSSadaf Ebrahimi
3021*22dc650dSSadaf Ebrahimi case ESC_ub:
3022*22dc650dSSadaf Ebrahimi *parsed_pattern++ = CHAR_u;
3023*22dc650dSSadaf Ebrahimi PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3024*22dc650dSSadaf Ebrahimi break;
3025*22dc650dSSadaf Ebrahimi
3026*22dc650dSSadaf Ebrahimi case ESC_Q:
3027*22dc650dSSadaf Ebrahimi inescq = TRUE;
3028*22dc650dSSadaf Ebrahimi break;
3029*22dc650dSSadaf Ebrahimi
3030*22dc650dSSadaf Ebrahimi case ESC_E: /* Ignore */
3031*22dc650dSSadaf Ebrahimi break;
3032*22dc650dSSadaf Ebrahimi
3033*22dc650dSSadaf Ebrahimi default:
3034*22dc650dSSadaf Ebrahimi errorcode = ERR40; /* Invalid in verb name */
3035*22dc650dSSadaf Ebrahimi goto FAILED;
3036*22dc650dSSadaf Ebrahimi }
3037*22dc650dSSadaf Ebrahimi }
3038*22dc650dSSadaf Ebrahimi continue; /* Next character in pattern */
3039*22dc650dSSadaf Ebrahimi }
3040*22dc650dSSadaf Ebrahimi
3041*22dc650dSSadaf Ebrahimi /* Not a verb name character. At this point we must process everything that
3042*22dc650dSSadaf Ebrahimi must not change the quantification state. This is mainly comments, but we
3043*22dc650dSSadaf Ebrahimi handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3044*22dc650dSSadaf Ebrahimi A+, as in Perl. An isolated \E is ignored. */
3045*22dc650dSSadaf Ebrahimi
3046*22dc650dSSadaf Ebrahimi if (c == CHAR_BACKSLASH && ptr < ptrend)
3047*22dc650dSSadaf Ebrahimi {
3048*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_Q || *ptr == CHAR_E)
3049*22dc650dSSadaf Ebrahimi {
3050*22dc650dSSadaf Ebrahimi inescq = *ptr == CHAR_Q;
3051*22dc650dSSadaf Ebrahimi ptr++;
3052*22dc650dSSadaf Ebrahimi continue;
3053*22dc650dSSadaf Ebrahimi }
3054*22dc650dSSadaf Ebrahimi }
3055*22dc650dSSadaf Ebrahimi
3056*22dc650dSSadaf Ebrahimi /* Skip over whitespace and # comments in extended mode. Note that c is a
3057*22dc650dSSadaf Ebrahimi character, not a code unit, so we must not use MAX_255 to test its size
3058*22dc650dSSadaf Ebrahimi because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3059*22dc650dSSadaf Ebrahimi whitespace characters are those designated as "Pattern White Space" by
3060*22dc650dSSadaf Ebrahimi Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3061*22dc650dSSadaf Ebrahimi U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3062*22dc650dSSadaf Ebrahimi subset of space characters that match \h and \v. */
3063*22dc650dSSadaf Ebrahimi
3064*22dc650dSSadaf Ebrahimi if ((options & PCRE2_EXTENDED) != 0)
3065*22dc650dSSadaf Ebrahimi {
3066*22dc650dSSadaf Ebrahimi if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3067*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3068*22dc650dSSadaf Ebrahimi if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3069*22dc650dSSadaf Ebrahimi #endif
3070*22dc650dSSadaf Ebrahimi if (c == CHAR_NUMBER_SIGN)
3071*22dc650dSSadaf Ebrahimi {
3072*22dc650dSSadaf Ebrahimi while (ptr < ptrend)
3073*22dc650dSSadaf Ebrahimi {
3074*22dc650dSSadaf Ebrahimi if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
3075*22dc650dSSadaf Ebrahimi { /* IS_NEWLINE sets cb->nllen. */
3076*22dc650dSSadaf Ebrahimi ptr += cb->nllen;
3077*22dc650dSSadaf Ebrahimi break;
3078*22dc650dSSadaf Ebrahimi }
3079*22dc650dSSadaf Ebrahimi ptr++;
3080*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3081*22dc650dSSadaf Ebrahimi if (utf) FORWARDCHARTEST(ptr, ptrend);
3082*22dc650dSSadaf Ebrahimi #endif
3083*22dc650dSSadaf Ebrahimi }
3084*22dc650dSSadaf Ebrahimi continue; /* Next character in pattern */
3085*22dc650dSSadaf Ebrahimi }
3086*22dc650dSSadaf Ebrahimi }
3087*22dc650dSSadaf Ebrahimi
3088*22dc650dSSadaf Ebrahimi /* Skip over bracketed comments */
3089*22dc650dSSadaf Ebrahimi
3090*22dc650dSSadaf Ebrahimi if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3091*22dc650dSSadaf Ebrahimi ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3092*22dc650dSSadaf Ebrahimi {
3093*22dc650dSSadaf Ebrahimi while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3094*22dc650dSSadaf Ebrahimi if (ptr >= ptrend)
3095*22dc650dSSadaf Ebrahimi {
3096*22dc650dSSadaf Ebrahimi errorcode = ERR18; /* A special error for missing ) in a comment */
3097*22dc650dSSadaf Ebrahimi goto FAILED; /* to make it easier to debug. */
3098*22dc650dSSadaf Ebrahimi }
3099*22dc650dSSadaf Ebrahimi ptr++;
3100*22dc650dSSadaf Ebrahimi continue; /* Next character in pattern */
3101*22dc650dSSadaf Ebrahimi }
3102*22dc650dSSadaf Ebrahimi
3103*22dc650dSSadaf Ebrahimi /* If the next item is not a quantifier, fill in length of any previous
3104*22dc650dSSadaf Ebrahimi callout and create an auto callout if required. */
3105*22dc650dSSadaf Ebrahimi
3106*22dc650dSSadaf Ebrahimi if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3107*22dc650dSSadaf Ebrahimi (c != CHAR_LEFT_CURLY_BRACKET ||
3108*22dc650dSSadaf Ebrahimi (tempptr = ptr,
3109*22dc650dSSadaf Ebrahimi !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3110*22dc650dSSadaf Ebrahimi {
3111*22dc650dSSadaf Ebrahimi if (after_manual_callout-- <= 0)
3112*22dc650dSSadaf Ebrahimi {
3113*22dc650dSSadaf Ebrahimi parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3114*22dc650dSSadaf Ebrahimi parsed_pattern, cb);
3115*22dc650dSSadaf Ebrahimi this_parsed_item = parsed_pattern; /* New start for current item */
3116*22dc650dSSadaf Ebrahimi }
3117*22dc650dSSadaf Ebrahimi }
3118*22dc650dSSadaf Ebrahimi
3119*22dc650dSSadaf Ebrahimi /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3120*22dc650dSSadaf Ebrahimi assertion, possibly preceded by a callout. If the value is 1, we have just
3121*22dc650dSSadaf Ebrahimi had the callout and expect an assertion. There must be at least 3 more
3122*22dc650dSSadaf Ebrahimi characters in all cases. When expect_cond_assert is 2, we know that the
3123*22dc650dSSadaf Ebrahimi current character is an opening parenthesis, as otherwise we wouldn't be
3124*22dc650dSSadaf Ebrahimi here. However, when it is 1, we need to check, and it's easiest just to check
3125*22dc650dSSadaf Ebrahimi always. Note that expect_cond_assert may be negative, since all callouts just
3126*22dc650dSSadaf Ebrahimi decrement it. */
3127*22dc650dSSadaf Ebrahimi
3128*22dc650dSSadaf Ebrahimi if (expect_cond_assert > 0)
3129*22dc650dSSadaf Ebrahimi {
3130*22dc650dSSadaf Ebrahimi BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3131*22dc650dSSadaf Ebrahimi (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3132*22dc650dSSadaf Ebrahimi if (ok)
3133*22dc650dSSadaf Ebrahimi {
3134*22dc650dSSadaf Ebrahimi if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
3135*22dc650dSSadaf Ebrahimi {
3136*22dc650dSSadaf Ebrahimi ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3137*22dc650dSSadaf Ebrahimi }
3138*22dc650dSSadaf Ebrahimi else switch(ptr[1]) /* Traditional symbolic format */
3139*22dc650dSSadaf Ebrahimi {
3140*22dc650dSSadaf Ebrahimi case CHAR_C:
3141*22dc650dSSadaf Ebrahimi ok = expect_cond_assert == 2;
3142*22dc650dSSadaf Ebrahimi break;
3143*22dc650dSSadaf Ebrahimi
3144*22dc650dSSadaf Ebrahimi case CHAR_EQUALS_SIGN:
3145*22dc650dSSadaf Ebrahimi case CHAR_EXCLAMATION_MARK:
3146*22dc650dSSadaf Ebrahimi break;
3147*22dc650dSSadaf Ebrahimi
3148*22dc650dSSadaf Ebrahimi case CHAR_LESS_THAN_SIGN:
3149*22dc650dSSadaf Ebrahimi ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3150*22dc650dSSadaf Ebrahimi break;
3151*22dc650dSSadaf Ebrahimi
3152*22dc650dSSadaf Ebrahimi default:
3153*22dc650dSSadaf Ebrahimi ok = FALSE;
3154*22dc650dSSadaf Ebrahimi }
3155*22dc650dSSadaf Ebrahimi }
3156*22dc650dSSadaf Ebrahimi
3157*22dc650dSSadaf Ebrahimi if (!ok)
3158*22dc650dSSadaf Ebrahimi {
3159*22dc650dSSadaf Ebrahimi ptr--; /* Adjust error offset */
3160*22dc650dSSadaf Ebrahimi errorcode = ERR28;
3161*22dc650dSSadaf Ebrahimi goto FAILED;
3162*22dc650dSSadaf Ebrahimi }
3163*22dc650dSSadaf Ebrahimi }
3164*22dc650dSSadaf Ebrahimi
3165*22dc650dSSadaf Ebrahimi /* Remember whether we are expecting a conditional assertion, and set the
3166*22dc650dSSadaf Ebrahimi default for this item. */
3167*22dc650dSSadaf Ebrahimi
3168*22dc650dSSadaf Ebrahimi prev_expect_cond_assert = expect_cond_assert;
3169*22dc650dSSadaf Ebrahimi expect_cond_assert = 0;
3170*22dc650dSSadaf Ebrahimi
3171*22dc650dSSadaf Ebrahimi /* Remember quantification status for the previous significant item, then set
3172*22dc650dSSadaf Ebrahimi default for this item. */
3173*22dc650dSSadaf Ebrahimi
3174*22dc650dSSadaf Ebrahimi prev_okquantifier = okquantifier;
3175*22dc650dSSadaf Ebrahimi prev_meta_quantifier = meta_quantifier;
3176*22dc650dSSadaf Ebrahimi okquantifier = FALSE;
3177*22dc650dSSadaf Ebrahimi meta_quantifier = 0;
3178*22dc650dSSadaf Ebrahimi
3179*22dc650dSSadaf Ebrahimi /* If the previous significant item was a quantifier, adjust the parsed code
3180*22dc650dSSadaf Ebrahimi if there is a following modifier. The base meta value is always followed by
3181*22dc650dSSadaf Ebrahimi the PLUS and QUERY values, in that order. We do this here rather than after
3182*22dc650dSSadaf Ebrahimi reading a quantifier so that intervening comments and /x whitespace can be
3183*22dc650dSSadaf Ebrahimi ignored without having to replicate code. */
3184*22dc650dSSadaf Ebrahimi
3185*22dc650dSSadaf Ebrahimi if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3186*22dc650dSSadaf Ebrahimi {
3187*22dc650dSSadaf Ebrahimi parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3188*22dc650dSSadaf Ebrahimi prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3189*22dc650dSSadaf Ebrahimi 0x00020000u : 0x00010000u);
3190*22dc650dSSadaf Ebrahimi continue; /* Next character in pattern */
3191*22dc650dSSadaf Ebrahimi }
3192*22dc650dSSadaf Ebrahimi
3193*22dc650dSSadaf Ebrahimi /* Process the next item in the main part of a pattern. */
3194*22dc650dSSadaf Ebrahimi
3195*22dc650dSSadaf Ebrahimi switch(c)
3196*22dc650dSSadaf Ebrahimi {
3197*22dc650dSSadaf Ebrahimi default: /* Non-special character */
3198*22dc650dSSadaf Ebrahimi PARSED_LITERAL(c, parsed_pattern);
3199*22dc650dSSadaf Ebrahimi break;
3200*22dc650dSSadaf Ebrahimi
3201*22dc650dSSadaf Ebrahimi
3202*22dc650dSSadaf Ebrahimi /* ---- Escape sequence ---- */
3203*22dc650dSSadaf Ebrahimi
3204*22dc650dSSadaf Ebrahimi case CHAR_BACKSLASH:
3205*22dc650dSSadaf Ebrahimi tempptr = ptr;
3206*22dc650dSSadaf Ebrahimi escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3207*22dc650dSSadaf Ebrahimi xoptions, FALSE, cb);
3208*22dc650dSSadaf Ebrahimi if (errorcode != 0)
3209*22dc650dSSadaf Ebrahimi {
3210*22dc650dSSadaf Ebrahimi ESCAPE_FAILED:
3211*22dc650dSSadaf Ebrahimi if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3212*22dc650dSSadaf Ebrahimi goto FAILED;
3213*22dc650dSSadaf Ebrahimi ptr = tempptr;
3214*22dc650dSSadaf Ebrahimi if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3215*22dc650dSSadaf Ebrahimi {
3216*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3217*22dc650dSSadaf Ebrahimi }
3218*22dc650dSSadaf Ebrahimi escape = 0; /* Treat as literal character */
3219*22dc650dSSadaf Ebrahimi }
3220*22dc650dSSadaf Ebrahimi
3221*22dc650dSSadaf Ebrahimi /* The escape was a data escape or literal character. */
3222*22dc650dSSadaf Ebrahimi
3223*22dc650dSSadaf Ebrahimi if (escape == 0)
3224*22dc650dSSadaf Ebrahimi {
3225*22dc650dSSadaf Ebrahimi PARSED_LITERAL(c, parsed_pattern);
3226*22dc650dSSadaf Ebrahimi }
3227*22dc650dSSadaf Ebrahimi
3228*22dc650dSSadaf Ebrahimi /* The escape was a back (or forward) reference. We keep the offset in
3229*22dc650dSSadaf Ebrahimi order to give a more useful diagnostic for a bad forward reference. For
3230*22dc650dSSadaf Ebrahimi references to groups numbered less than 10 we can't use more than two items
3231*22dc650dSSadaf Ebrahimi in parsed_pattern because they may be just two characters in the input (and
3232*22dc650dSSadaf Ebrahimi in a 64-bit world an offset may need two elements). So for them, the offset
      of the first occurrence is held in a special vector. */
3234*22dc650dSSadaf Ebrahimi
3235*22dc650dSSadaf Ebrahimi else if (escape < 0)
3236*22dc650dSSadaf Ebrahimi {
3237*22dc650dSSadaf Ebrahimi offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3238*22dc650dSSadaf Ebrahimi escape = -escape;
3239*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3240*22dc650dSSadaf Ebrahimi if (escape < 10)
3241*22dc650dSSadaf Ebrahimi {
3242*22dc650dSSadaf Ebrahimi if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3243*22dc650dSSadaf Ebrahimi cb->small_ref_offset[escape] = offset;
3244*22dc650dSSadaf Ebrahimi }
3245*22dc650dSSadaf Ebrahimi else
3246*22dc650dSSadaf Ebrahimi {
3247*22dc650dSSadaf Ebrahimi PUTOFFSET(offset, parsed_pattern);
3248*22dc650dSSadaf Ebrahimi }
3249*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
3250*22dc650dSSadaf Ebrahimi }
3251*22dc650dSSadaf Ebrahimi
3252*22dc650dSSadaf Ebrahimi /* The escape was a character class such as \d etc. or other special
3253*22dc650dSSadaf Ebrahimi escape indicator such as \A or \X. Most of them generate just a single
3254*22dc650dSSadaf Ebrahimi parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3255*22dc650dSSadaf Ebrahimi value. They are supported only when Unicode is available. The type and
3256*22dc650dSSadaf Ebrahimi value are packed into a single 32-bit value so that the whole sequence
3257*22dc650dSSadaf Ebrahimi uses only two elements in the parsed_vector. This is because the same
3258*22dc650dSSadaf Ebrahimi coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3259*22dc650dSSadaf Ebrahimi set.
3260*22dc650dSSadaf Ebrahimi
3261*22dc650dSSadaf Ebrahimi There are also some cases where the escape sequence is followed by a name:
3262*22dc650dSSadaf Ebrahimi \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3263*22dc650dSSadaf Ebrahimi and \g'name' are subroutine calls by name; \g{name} is a synonym for
3264*22dc650dSSadaf Ebrahimi \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3265*22dc650dSSadaf Ebrahimi and returned as a negative value (handled above). A name is coded as an
3266*22dc650dSSadaf Ebrahimi offset into the pattern and a length. */
3267*22dc650dSSadaf Ebrahimi
3268*22dc650dSSadaf Ebrahimi else switch (escape)
3269*22dc650dSSadaf Ebrahimi {
3270*22dc650dSSadaf Ebrahimi case ESC_C:
3271*22dc650dSSadaf Ebrahimi #ifdef NEVER_BACKSLASH_C
3272*22dc650dSSadaf Ebrahimi errorcode = ERR85;
3273*22dc650dSSadaf Ebrahimi goto ESCAPE_FAILED;
3274*22dc650dSSadaf Ebrahimi #else
3275*22dc650dSSadaf Ebrahimi if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3276*22dc650dSSadaf Ebrahimi {
3277*22dc650dSSadaf Ebrahimi errorcode = ERR83;
3278*22dc650dSSadaf Ebrahimi goto ESCAPE_FAILED;
3279*22dc650dSSadaf Ebrahimi }
3280*22dc650dSSadaf Ebrahimi #endif
3281*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
3282*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + escape;
3283*22dc650dSSadaf Ebrahimi break;
3284*22dc650dSSadaf Ebrahimi
3285*22dc650dSSadaf Ebrahimi /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3286*22dc650dSSadaf Ebrahimi when \u{ is not followed by hex digits and }. It requests two literal
3287*22dc650dSSadaf Ebrahimi characters, u and { and we need this, as otherwise \u{ 12} (for example)
3288*22dc650dSSadaf Ebrahimi would be treated as u{12} now that spaces are allowed in quantifiers. */
3289*22dc650dSSadaf Ebrahimi
3290*22dc650dSSadaf Ebrahimi case ESC_ub:
3291*22dc650dSSadaf Ebrahimi *parsed_pattern++ = CHAR_u;
3292*22dc650dSSadaf Ebrahimi PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3293*22dc650dSSadaf Ebrahimi break;
3294*22dc650dSSadaf Ebrahimi
3295*22dc650dSSadaf Ebrahimi case ESC_X:
3296*22dc650dSSadaf Ebrahimi #ifndef SUPPORT_UNICODE
3297*22dc650dSSadaf Ebrahimi errorcode = ERR45; /* Supported only with Unicode support */
3298*22dc650dSSadaf Ebrahimi goto ESCAPE_FAILED;
3299*22dc650dSSadaf Ebrahimi #endif
3300*22dc650dSSadaf Ebrahimi case ESC_H:
3301*22dc650dSSadaf Ebrahimi case ESC_h:
3302*22dc650dSSadaf Ebrahimi case ESC_N:
3303*22dc650dSSadaf Ebrahimi case ESC_R:
3304*22dc650dSSadaf Ebrahimi case ESC_V:
3305*22dc650dSSadaf Ebrahimi case ESC_v:
3306*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
3307*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + escape;
3308*22dc650dSSadaf Ebrahimi break;
3309*22dc650dSSadaf Ebrahimi
3310*22dc650dSSadaf Ebrahimi default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3311*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + escape;
3312*22dc650dSSadaf Ebrahimi break;
3313*22dc650dSSadaf Ebrahimi
3314*22dc650dSSadaf Ebrahimi /* Escapes that may change in UCP mode. */
3315*22dc650dSSadaf Ebrahimi
3316*22dc650dSSadaf Ebrahimi case ESC_d:
3317*22dc650dSSadaf Ebrahimi case ESC_D:
3318*22dc650dSSadaf Ebrahimi case ESC_s:
3319*22dc650dSSadaf Ebrahimi case ESC_S:
3320*22dc650dSSadaf Ebrahimi case ESC_w:
3321*22dc650dSSadaf Ebrahimi case ESC_W:
3322*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
3323*22dc650dSSadaf Ebrahimi parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3324*22dc650dSSadaf Ebrahimi xoptions);
3325*22dc650dSSadaf Ebrahimi break;
3326*22dc650dSSadaf Ebrahimi
3327*22dc650dSSadaf Ebrahimi /* Unicode property matching */
3328*22dc650dSSadaf Ebrahimi
3329*22dc650dSSadaf Ebrahimi case ESC_P:
3330*22dc650dSSadaf Ebrahimi case ESC_p:
3331*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3332*22dc650dSSadaf Ebrahimi {
3333*22dc650dSSadaf Ebrahimi BOOL negated;
3334*22dc650dSSadaf Ebrahimi uint16_t ptype = 0, pdata = 0;
3335*22dc650dSSadaf Ebrahimi if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3336*22dc650dSSadaf Ebrahimi goto ESCAPE_FAILED;
3337*22dc650dSSadaf Ebrahimi if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3338*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + escape;
3339*22dc650dSSadaf Ebrahimi *parsed_pattern++ = (ptype << 16) | pdata;
3340*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
3341*22dc650dSSadaf Ebrahimi }
3342*22dc650dSSadaf Ebrahimi #else
3343*22dc650dSSadaf Ebrahimi errorcode = ERR45;
3344*22dc650dSSadaf Ebrahimi goto ESCAPE_FAILED;
3345*22dc650dSSadaf Ebrahimi #endif
3346*22dc650dSSadaf Ebrahimi break; /* End \P and \p */
3347*22dc650dSSadaf Ebrahimi
3348*22dc650dSSadaf Ebrahimi /* When \g is used with quotes or angle brackets as delimiters, it is a
3349*22dc650dSSadaf Ebrahimi numerical or named subroutine call, and control comes here. When used
3350*22dc650dSSadaf Ebrahimi with brace delimiters it is a numerical back reference and does not come
3351*22dc650dSSadaf Ebrahimi here because check_escape() returns it directly as a reference. \k is
3352*22dc650dSSadaf Ebrahimi always a named back reference. */
3353*22dc650dSSadaf Ebrahimi
3354*22dc650dSSadaf Ebrahimi case ESC_g:
3355*22dc650dSSadaf Ebrahimi case ESC_k:
3356*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3357*22dc650dSSadaf Ebrahimi *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3358*22dc650dSSadaf Ebrahimi {
3359*22dc650dSSadaf Ebrahimi errorcode = (escape == ESC_g)? ERR57 : ERR69;
3360*22dc650dSSadaf Ebrahimi goto ESCAPE_FAILED;
3361*22dc650dSSadaf Ebrahimi }
3362*22dc650dSSadaf Ebrahimi terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3363*22dc650dSSadaf Ebrahimi CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3364*22dc650dSSadaf Ebrahimi CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3365*22dc650dSSadaf Ebrahimi
3366*22dc650dSSadaf Ebrahimi /* For a non-braced \g, check for a numerical recursion. */
3367*22dc650dSSadaf Ebrahimi
3368*22dc650dSSadaf Ebrahimi if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3369*22dc650dSSadaf Ebrahimi {
3370*22dc650dSSadaf Ebrahimi PCRE2_SPTR p = ptr + 1;
3371*22dc650dSSadaf Ebrahimi
3372*22dc650dSSadaf Ebrahimi if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3373*22dc650dSSadaf Ebrahimi &errorcode))
3374*22dc650dSSadaf Ebrahimi {
3375*22dc650dSSadaf Ebrahimi if (p >= ptrend || *p != terminator)
3376*22dc650dSSadaf Ebrahimi {
3377*22dc650dSSadaf Ebrahimi errorcode = ERR57;
3378*22dc650dSSadaf Ebrahimi goto ESCAPE_FAILED;
3379*22dc650dSSadaf Ebrahimi }
3380*22dc650dSSadaf Ebrahimi ptr = p;
3381*22dc650dSSadaf Ebrahimi goto SET_RECURSION;
3382*22dc650dSSadaf Ebrahimi }
3383*22dc650dSSadaf Ebrahimi if (errorcode != 0) goto ESCAPE_FAILED;
3384*22dc650dSSadaf Ebrahimi }
3385*22dc650dSSadaf Ebrahimi
3386*22dc650dSSadaf Ebrahimi /* Not a numerical recursion. Perl allows spaces and tabs after { and
3387*22dc650dSSadaf Ebrahimi before } but not for other delimiters. */
3388*22dc650dSSadaf Ebrahimi
3389*22dc650dSSadaf Ebrahimi if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3390*22dc650dSSadaf Ebrahimi &errorcode, cb)) goto ESCAPE_FAILED;
3391*22dc650dSSadaf Ebrahimi
3392*22dc650dSSadaf Ebrahimi /* \k and \g when used with braces are back references, whereas \g used
3393*22dc650dSSadaf Ebrahimi with quotes or angle brackets is a recursion */
3394*22dc650dSSadaf Ebrahimi
3395*22dc650dSSadaf Ebrahimi *parsed_pattern++ =
3396*22dc650dSSadaf Ebrahimi (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3397*22dc650dSSadaf Ebrahimi META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3398*22dc650dSSadaf Ebrahimi *parsed_pattern++ = namelen;
3399*22dc650dSSadaf Ebrahimi
3400*22dc650dSSadaf Ebrahimi PUTOFFSET(offset, parsed_pattern);
3401*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
3402*22dc650dSSadaf Ebrahimi break; /* End special escape processing */
3403*22dc650dSSadaf Ebrahimi }
3404*22dc650dSSadaf Ebrahimi break; /* End escape sequence processing */
3405*22dc650dSSadaf Ebrahimi
3406*22dc650dSSadaf Ebrahimi
3407*22dc650dSSadaf Ebrahimi /* ---- Single-character special items ---- */
3408*22dc650dSSadaf Ebrahimi
3409*22dc650dSSadaf Ebrahimi case CHAR_CIRCUMFLEX_ACCENT:
3410*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_CIRCUMFLEX;
3411*22dc650dSSadaf Ebrahimi break;
3412*22dc650dSSadaf Ebrahimi
3413*22dc650dSSadaf Ebrahimi case CHAR_DOLLAR_SIGN:
3414*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_DOLLAR;
3415*22dc650dSSadaf Ebrahimi break;
3416*22dc650dSSadaf Ebrahimi
3417*22dc650dSSadaf Ebrahimi case CHAR_DOT:
3418*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_DOT;
3419*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
3420*22dc650dSSadaf Ebrahimi break;
3421*22dc650dSSadaf Ebrahimi
3422*22dc650dSSadaf Ebrahimi
3423*22dc650dSSadaf Ebrahimi /* ---- Single-character quantifiers ---- */
3424*22dc650dSSadaf Ebrahimi
3425*22dc650dSSadaf Ebrahimi case CHAR_ASTERISK:
3426*22dc650dSSadaf Ebrahimi meta_quantifier = META_ASTERISK;
3427*22dc650dSSadaf Ebrahimi goto CHECK_QUANTIFIER;
3428*22dc650dSSadaf Ebrahimi
3429*22dc650dSSadaf Ebrahimi case CHAR_PLUS:
3430*22dc650dSSadaf Ebrahimi meta_quantifier = META_PLUS;
3431*22dc650dSSadaf Ebrahimi goto CHECK_QUANTIFIER;
3432*22dc650dSSadaf Ebrahimi
3433*22dc650dSSadaf Ebrahimi case CHAR_QUESTION_MARK:
3434*22dc650dSSadaf Ebrahimi meta_quantifier = META_QUERY;
3435*22dc650dSSadaf Ebrahimi goto CHECK_QUANTIFIER;
3436*22dc650dSSadaf Ebrahimi
3437*22dc650dSSadaf Ebrahimi
3438*22dc650dSSadaf Ebrahimi /* ---- Potential {n,m} quantifier ---- */
3439*22dc650dSSadaf Ebrahimi
3440*22dc650dSSadaf Ebrahimi case CHAR_LEFT_CURLY_BRACKET:
3441*22dc650dSSadaf Ebrahimi if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3442*22dc650dSSadaf Ebrahimi &errorcode))
3443*22dc650dSSadaf Ebrahimi {
3444*22dc650dSSadaf Ebrahimi if (errorcode != 0) goto FAILED; /* Error in quantifier. */
3445*22dc650dSSadaf Ebrahimi PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
3446*22dc650dSSadaf Ebrahimi break; /* No more quantifier processing */
3447*22dc650dSSadaf Ebrahimi }
3448*22dc650dSSadaf Ebrahimi meta_quantifier = META_MINMAX;
3449*22dc650dSSadaf Ebrahimi /* Fall through */
3450*22dc650dSSadaf Ebrahimi
3451*22dc650dSSadaf Ebrahimi
3452*22dc650dSSadaf Ebrahimi /* ---- Quantifier post-processing ---- */
3453*22dc650dSSadaf Ebrahimi
3454*22dc650dSSadaf Ebrahimi /* Check that a quantifier is allowed after the previous item. This
3455*22dc650dSSadaf Ebrahimi guarantees that there is a previous item. */
3456*22dc650dSSadaf Ebrahimi
3457*22dc650dSSadaf Ebrahimi CHECK_QUANTIFIER:
3458*22dc650dSSadaf Ebrahimi if (!prev_okquantifier)
3459*22dc650dSSadaf Ebrahimi {
3460*22dc650dSSadaf Ebrahimi errorcode = ERR9;
3461*22dc650dSSadaf Ebrahimi goto FAILED_BACK;
3462*22dc650dSSadaf Ebrahimi }
3463*22dc650dSSadaf Ebrahimi
3464*22dc650dSSadaf Ebrahimi /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3465*22dc650dSSadaf Ebrahimi quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3466*22dc650dSSadaf Ebrahimi sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3467*22dc650dSSadaf Ebrahimi wrapping it in non-capturing brackets, but we have to allow for a preceding
3468*22dc650dSSadaf Ebrahimi (*MARK) for when (*ACCEPT) has an argument. */
3469*22dc650dSSadaf Ebrahimi
3470*22dc650dSSadaf Ebrahimi if (*prev_parsed_item == META_ACCEPT)
3471*22dc650dSSadaf Ebrahimi {
3472*22dc650dSSadaf Ebrahimi uint32_t *p;
3473*22dc650dSSadaf Ebrahimi for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3474*22dc650dSSadaf Ebrahimi *verbstartptr = META_NOCAPTURE;
3475*22dc650dSSadaf Ebrahimi parsed_pattern[1] = META_KET;
3476*22dc650dSSadaf Ebrahimi parsed_pattern += 2;
3477*22dc650dSSadaf Ebrahimi }
3478*22dc650dSSadaf Ebrahimi
3479*22dc650dSSadaf Ebrahimi /* Now we can put the quantifier into the parsed pattern vector. At this
3480*22dc650dSSadaf Ebrahimi stage, we have only the basic quantifier. The check for a following + or ?
3481*22dc650dSSadaf Ebrahimi modifier happens at the top of the loop, after any intervening comments
3482*22dc650dSSadaf Ebrahimi have been removed. */
3483*22dc650dSSadaf Ebrahimi
3484*22dc650dSSadaf Ebrahimi *parsed_pattern++ = meta_quantifier;
3485*22dc650dSSadaf Ebrahimi if (c == CHAR_LEFT_CURLY_BRACKET)
3486*22dc650dSSadaf Ebrahimi {
3487*22dc650dSSadaf Ebrahimi *parsed_pattern++ = min_repeat;
3488*22dc650dSSadaf Ebrahimi *parsed_pattern++ = max_repeat;
3489*22dc650dSSadaf Ebrahimi }
3490*22dc650dSSadaf Ebrahimi break;
3491*22dc650dSSadaf Ebrahimi
3492*22dc650dSSadaf Ebrahimi
3493*22dc650dSSadaf Ebrahimi /* ---- Character class ---- */
3494*22dc650dSSadaf Ebrahimi
3495*22dc650dSSadaf Ebrahimi case CHAR_LEFT_SQUARE_BRACKET:
3496*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
3497*22dc650dSSadaf Ebrahimi
3498*22dc650dSSadaf Ebrahimi /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3499*22dc650dSSadaf Ebrahimi used for "start of word" and "end of word". As these are otherwise illegal
3500*22dc650dSSadaf Ebrahimi sequences, we don't break anything by recognizing them. They are replaced
3501*22dc650dSSadaf Ebrahimi by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3502*22dc650dSSadaf Ebrahimi erroneous and are handled by the normal code below. */
3503*22dc650dSSadaf Ebrahimi
3504*22dc650dSSadaf Ebrahimi if (ptrend - ptr >= 6 &&
3505*22dc650dSSadaf Ebrahimi (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3506*22dc650dSSadaf Ebrahimi PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3507*22dc650dSSadaf Ebrahimi {
3508*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + ESC_b;
3509*22dc650dSSadaf Ebrahimi
3510*22dc650dSSadaf Ebrahimi if (ptr[2] == CHAR_LESS_THAN_SIGN)
3511*22dc650dSSadaf Ebrahimi {
3512*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_LOOKAHEAD;
3513*22dc650dSSadaf Ebrahimi }
3514*22dc650dSSadaf Ebrahimi else
3515*22dc650dSSadaf Ebrahimi {
3516*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_LOOKBEHIND;
3517*22dc650dSSadaf Ebrahimi *has_lookbehind = TRUE;
3518*22dc650dSSadaf Ebrahimi
3519*22dc650dSSadaf Ebrahimi /* The offset is used only for the "non-fixed length" error; this won't
3520*22dc650dSSadaf Ebrahimi occur here, so just store zero. */
3521*22dc650dSSadaf Ebrahimi
3522*22dc650dSSadaf Ebrahimi PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3523*22dc650dSSadaf Ebrahimi }
3524*22dc650dSSadaf Ebrahimi
3525*22dc650dSSadaf Ebrahimi if ((options & PCRE2_UCP) == 0)
3526*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + ESC_w;
3527*22dc650dSSadaf Ebrahimi else
3528*22dc650dSSadaf Ebrahimi {
3529*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + ESC_p;
3530*22dc650dSSadaf Ebrahimi *parsed_pattern++ = PT_WORD << 16;
3531*22dc650dSSadaf Ebrahimi }
3532*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_KET;
3533*22dc650dSSadaf Ebrahimi ptr += 6;
3534*22dc650dSSadaf Ebrahimi break;
3535*22dc650dSSadaf Ebrahimi }
3536*22dc650dSSadaf Ebrahimi
3537*22dc650dSSadaf Ebrahimi /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3538*22dc650dSSadaf Ebrahimi they are encountered at the top level, so we'll do that too. */
3539*22dc650dSSadaf Ebrahimi
3540*22dc650dSSadaf Ebrahimi if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3541*22dc650dSSadaf Ebrahimi *ptr == CHAR_EQUALS_SIGN) &&
3542*22dc650dSSadaf Ebrahimi check_posix_syntax(ptr, ptrend, &tempptr))
3543*22dc650dSSadaf Ebrahimi {
3544*22dc650dSSadaf Ebrahimi errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3545*22dc650dSSadaf Ebrahimi goto FAILED;
3546*22dc650dSSadaf Ebrahimi }
3547*22dc650dSSadaf Ebrahimi
3548*22dc650dSSadaf Ebrahimi /* Process a regular character class. If the first character is '^', set
3549*22dc650dSSadaf Ebrahimi the negation flag. If the first few characters (either before or after ^)
3550*22dc650dSSadaf Ebrahimi are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3551*22dc650dSSadaf Ebrahimi This makes for compatibility with Perl. */
3552*22dc650dSSadaf Ebrahimi
3553*22dc650dSSadaf Ebrahimi negate_class = FALSE;
3554*22dc650dSSadaf Ebrahimi while (ptr < ptrend)
3555*22dc650dSSadaf Ebrahimi {
3556*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr);
3557*22dc650dSSadaf Ebrahimi if (c == CHAR_BACKSLASH)
3558*22dc650dSSadaf Ebrahimi {
3559*22dc650dSSadaf Ebrahimi if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3560*22dc650dSSadaf Ebrahimi else if (ptrend - ptr >= 3 &&
3561*22dc650dSSadaf Ebrahimi PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3562*22dc650dSSadaf Ebrahimi ptr += 3;
3563*22dc650dSSadaf Ebrahimi else
3564*22dc650dSSadaf Ebrahimi break;
3565*22dc650dSSadaf Ebrahimi }
3566*22dc650dSSadaf Ebrahimi else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3567*22dc650dSSadaf Ebrahimi (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */
3568*22dc650dSSadaf Ebrahimi continue;
3569*22dc650dSSadaf Ebrahimi else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3570*22dc650dSSadaf Ebrahimi negate_class = TRUE;
3571*22dc650dSSadaf Ebrahimi else break;
3572*22dc650dSSadaf Ebrahimi }
3573*22dc650dSSadaf Ebrahimi
3574*22dc650dSSadaf Ebrahimi /* Now the real contents of the class; c has the first "real" character.
3575*22dc650dSSadaf Ebrahimi Empty classes are permitted only if the option is set. */
3576*22dc650dSSadaf Ebrahimi
3577*22dc650dSSadaf Ebrahimi if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3578*22dc650dSSadaf Ebrahimi (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3579*22dc650dSSadaf Ebrahimi {
3580*22dc650dSSadaf Ebrahimi *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3581*22dc650dSSadaf Ebrahimi break; /* End of class processing */
3582*22dc650dSSadaf Ebrahimi }
3583*22dc650dSSadaf Ebrahimi
3584*22dc650dSSadaf Ebrahimi /* Process a non-empty class. */
3585*22dc650dSSadaf Ebrahimi
3586*22dc650dSSadaf Ebrahimi *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3587*22dc650dSSadaf Ebrahimi class_range_state = RANGE_NO;
3588*22dc650dSSadaf Ebrahimi
3589*22dc650dSSadaf Ebrahimi /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3590*22dc650dSSadaf Ebrahimi because there are holes in the encoding, and simply using the range A-Z
3591*22dc650dSSadaf Ebrahimi (for example) would include the characters in the holes. This applies only
3592*22dc650dSSadaf Ebrahimi to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3593*22dc650dSSadaf Ebrahimi in this respect. In order to accommodate this, we keep track of whether
3594*22dc650dSSadaf Ebrahimi character values are literal or not, and a state variable for handling
3595*22dc650dSSadaf Ebrahimi ranges. */
3596*22dc650dSSadaf Ebrahimi
3597*22dc650dSSadaf Ebrahimi /* Loop for the contents of the class */
3598*22dc650dSSadaf Ebrahimi
3599*22dc650dSSadaf Ebrahimi for (;;)
3600*22dc650dSSadaf Ebrahimi {
3601*22dc650dSSadaf Ebrahimi BOOL char_is_literal = TRUE;
3602*22dc650dSSadaf Ebrahimi
3603*22dc650dSSadaf Ebrahimi /* Inside \Q...\E everything is literal except \E */
3604*22dc650dSSadaf Ebrahimi
3605*22dc650dSSadaf Ebrahimi if (inescq)
3606*22dc650dSSadaf Ebrahimi {
3607*22dc650dSSadaf Ebrahimi if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3608*22dc650dSSadaf Ebrahimi {
3609*22dc650dSSadaf Ebrahimi inescq = FALSE; /* Reset literal state */
3610*22dc650dSSadaf Ebrahimi ptr++; /* Skip the 'E' */
3611*22dc650dSSadaf Ebrahimi goto CLASS_CONTINUE;
3612*22dc650dSSadaf Ebrahimi }
3613*22dc650dSSadaf Ebrahimi goto CLASS_LITERAL;
3614*22dc650dSSadaf Ebrahimi }
3615*22dc650dSSadaf Ebrahimi
3616*22dc650dSSadaf Ebrahimi /* Skip over space and tab (only) in extended-more mode. */
3617*22dc650dSSadaf Ebrahimi
3618*22dc650dSSadaf Ebrahimi if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3619*22dc650dSSadaf Ebrahimi (c == CHAR_SPACE || c == CHAR_HT))
3620*22dc650dSSadaf Ebrahimi goto CLASS_CONTINUE;
3621*22dc650dSSadaf Ebrahimi
3622*22dc650dSSadaf Ebrahimi /* Handle POSIX class names. Perl allows a negation extension of the
3623*22dc650dSSadaf Ebrahimi form [:^name:]. A square bracket that doesn't match the syntax is
3624*22dc650dSSadaf Ebrahimi treated as a literal. We also recognize the POSIX constructions
3625*22dc650dSSadaf Ebrahimi [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3626*22dc650dSSadaf Ebrahimi 5.6 and 5.8 do. */
3627*22dc650dSSadaf Ebrahimi
3628*22dc650dSSadaf Ebrahimi if (c == CHAR_LEFT_SQUARE_BRACKET &&
3629*22dc650dSSadaf Ebrahimi ptrend - ptr >= 3 &&
3630*22dc650dSSadaf Ebrahimi (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3631*22dc650dSSadaf Ebrahimi *ptr == CHAR_EQUALS_SIGN) &&
3632*22dc650dSSadaf Ebrahimi check_posix_syntax(ptr, ptrend, &tempptr))
3633*22dc650dSSadaf Ebrahimi {
3634*22dc650dSSadaf Ebrahimi BOOL posix_negate = FALSE;
3635*22dc650dSSadaf Ebrahimi int posix_class;
3636*22dc650dSSadaf Ebrahimi
3637*22dc650dSSadaf Ebrahimi /* Perl treats a hyphen before a POSIX class as a literal, not the
3638*22dc650dSSadaf Ebrahimi start of a range. However, it gives a warning in its warning mode. PCRE
3639*22dc650dSSadaf Ebrahimi does not have a warning mode, so we give an error, because this is
3640*22dc650dSSadaf Ebrahimi likely an error on the user's part. */
3641*22dc650dSSadaf Ebrahimi
3642*22dc650dSSadaf Ebrahimi if (class_range_state == RANGE_STARTED)
3643*22dc650dSSadaf Ebrahimi {
3644*22dc650dSSadaf Ebrahimi errorcode = ERR50;
3645*22dc650dSSadaf Ebrahimi goto FAILED;
3646*22dc650dSSadaf Ebrahimi }
3647*22dc650dSSadaf Ebrahimi
3648*22dc650dSSadaf Ebrahimi if (*ptr != CHAR_COLON)
3649*22dc650dSSadaf Ebrahimi {
3650*22dc650dSSadaf Ebrahimi errorcode = ERR13;
3651*22dc650dSSadaf Ebrahimi goto FAILED_BACK;
3652*22dc650dSSadaf Ebrahimi }
3653*22dc650dSSadaf Ebrahimi
3654*22dc650dSSadaf Ebrahimi if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3655*22dc650dSSadaf Ebrahimi {
3656*22dc650dSSadaf Ebrahimi posix_negate = TRUE;
3657*22dc650dSSadaf Ebrahimi ptr++;
3658*22dc650dSSadaf Ebrahimi }
3659*22dc650dSSadaf Ebrahimi
3660*22dc650dSSadaf Ebrahimi posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3661*22dc650dSSadaf Ebrahimi if (posix_class < 0)
3662*22dc650dSSadaf Ebrahimi {
3663*22dc650dSSadaf Ebrahimi errorcode = ERR30;
3664*22dc650dSSadaf Ebrahimi goto FAILED;
3665*22dc650dSSadaf Ebrahimi }
3666*22dc650dSSadaf Ebrahimi ptr = tempptr + 2;
3667*22dc650dSSadaf Ebrahimi
3668*22dc650dSSadaf Ebrahimi /* Perl treats a hyphen after a POSIX class as a literal, not the
3669*22dc650dSSadaf Ebrahimi start of a range. However, it gives a warning in its warning mode
3670*22dc650dSSadaf Ebrahimi unless the hyphen is the last character in the class. PCRE does not
3671*22dc650dSSadaf Ebrahimi have a warning mode, so we give an error, because this is likely an
3672*22dc650dSSadaf Ebrahimi error on the user's part. */
3673*22dc650dSSadaf Ebrahimi
3674*22dc650dSSadaf Ebrahimi if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3675*22dc650dSSadaf Ebrahimi ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3676*22dc650dSSadaf Ebrahimi {
3677*22dc650dSSadaf Ebrahimi errorcode = ERR50;
3678*22dc650dSSadaf Ebrahimi goto FAILED;
3679*22dc650dSSadaf Ebrahimi }
3680*22dc650dSSadaf Ebrahimi
3681*22dc650dSSadaf Ebrahimi /* Set "a hyphen is not the start of a range" for the -] case, and also
3682*22dc650dSSadaf Ebrahimi in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3683*22dc650dSSadaf Ebrahimi fuzzers do that kind of thing) and *then* a hyphen. This causes that
3684*22dc650dSSadaf Ebrahimi hyphen to be treated as a literal. I don't think it's worth setting up
3685*22dc650dSSadaf Ebrahimi special apparatus to do otherwise. */
3686*22dc650dSSadaf Ebrahimi
3687*22dc650dSSadaf Ebrahimi class_range_state = RANGE_NO;
3688*22dc650dSSadaf Ebrahimi
3689*22dc650dSSadaf Ebrahimi /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3690*22dc650dSSadaf Ebrahimi of the POSIX classes are converted to use Unicode properties \p or \P
3691*22dc650dSSadaf Ebrahimi or, in one case, \h or \H. The substitutes table has two values per
3692*22dc650dSSadaf Ebrahimi class, containing the type and value of a \p or \P item. The special
3693*22dc650dSSadaf Ebrahimi cases are specified with a negative type: a non-zero value causes \h or
3694*22dc650dSSadaf Ebrahimi \H to be used, and a zero value falls through to behave like a non-UCP
3695*22dc650dSSadaf Ebrahimi POSIX class. There are now also some extra options that force ASCII for
3696*22dc650dSSadaf Ebrahimi some classes. */
3697*22dc650dSSadaf Ebrahimi
3698*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3699*22dc650dSSadaf Ebrahimi if ((options & PCRE2_UCP) != 0 &&
3700*22dc650dSSadaf Ebrahimi (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3701*22dc650dSSadaf Ebrahimi !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3702*22dc650dSSadaf Ebrahimi (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3703*22dc650dSSadaf Ebrahimi {
3704*22dc650dSSadaf Ebrahimi int ptype = posix_substitutes[2*posix_class];
3705*22dc650dSSadaf Ebrahimi int pvalue = posix_substitutes[2*posix_class + 1];
3706*22dc650dSSadaf Ebrahimi
3707*22dc650dSSadaf Ebrahimi if (ptype >= 0)
3708*22dc650dSSadaf Ebrahimi {
3709*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3710*22dc650dSSadaf Ebrahimi *parsed_pattern++ = (ptype << 16) | pvalue;
3711*22dc650dSSadaf Ebrahimi goto CLASS_CONTINUE;
3712*22dc650dSSadaf Ebrahimi }
3713*22dc650dSSadaf Ebrahimi
3714*22dc650dSSadaf Ebrahimi if (pvalue != 0)
3715*22dc650dSSadaf Ebrahimi {
3716*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3717*22dc650dSSadaf Ebrahimi goto CLASS_CONTINUE;
3718*22dc650dSSadaf Ebrahimi }
3719*22dc650dSSadaf Ebrahimi
3720*22dc650dSSadaf Ebrahimi /* Fall through */
3721*22dc650dSSadaf Ebrahimi }
3722*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
3723*22dc650dSSadaf Ebrahimi
3724*22dc650dSSadaf Ebrahimi /* Non-UCP POSIX class */
3725*22dc650dSSadaf Ebrahimi
3726*22dc650dSSadaf Ebrahimi *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3727*22dc650dSSadaf Ebrahimi *parsed_pattern++ = posix_class;
3728*22dc650dSSadaf Ebrahimi }
3729*22dc650dSSadaf Ebrahimi
3730*22dc650dSSadaf Ebrahimi /* Handle potential start of range */
3731*22dc650dSSadaf Ebrahimi
3732*22dc650dSSadaf Ebrahimi else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3733*22dc650dSSadaf Ebrahimi {
3734*22dc650dSSadaf Ebrahimi *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3735*22dc650dSSadaf Ebrahimi META_RANGE_LITERAL : META_RANGE_ESCAPED;
3736*22dc650dSSadaf Ebrahimi class_range_state = RANGE_STARTED;
3737*22dc650dSSadaf Ebrahimi }
3738*22dc650dSSadaf Ebrahimi
3739*22dc650dSSadaf Ebrahimi /* Handle a literal character */
3740*22dc650dSSadaf Ebrahimi
3741*22dc650dSSadaf Ebrahimi else if (c != CHAR_BACKSLASH)
3742*22dc650dSSadaf Ebrahimi {
3743*22dc650dSSadaf Ebrahimi CLASS_LITERAL:
3744*22dc650dSSadaf Ebrahimi if (class_range_state == RANGE_STARTED)
3745*22dc650dSSadaf Ebrahimi {
3746*22dc650dSSadaf Ebrahimi if (c == parsed_pattern[-2]) /* Optimize one-char range */
3747*22dc650dSSadaf Ebrahimi parsed_pattern--;
3748*22dc650dSSadaf Ebrahimi else if (parsed_pattern[-2] > c) /* Check range is in order */
3749*22dc650dSSadaf Ebrahimi {
3750*22dc650dSSadaf Ebrahimi errorcode = ERR8;
3751*22dc650dSSadaf Ebrahimi goto FAILED_BACK;
3752*22dc650dSSadaf Ebrahimi }
3753*22dc650dSSadaf Ebrahimi else
3754*22dc650dSSadaf Ebrahimi {
3755*22dc650dSSadaf Ebrahimi if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3756*22dc650dSSadaf Ebrahimi parsed_pattern[-1] = META_RANGE_ESCAPED;
3757*22dc650dSSadaf Ebrahimi PARSED_LITERAL(c, parsed_pattern);
3758*22dc650dSSadaf Ebrahimi }
3759*22dc650dSSadaf Ebrahimi class_range_state = RANGE_NO;
3760*22dc650dSSadaf Ebrahimi }
3761*22dc650dSSadaf Ebrahimi else /* Potential start of range */
3762*22dc650dSSadaf Ebrahimi {
3763*22dc650dSSadaf Ebrahimi class_range_state = char_is_literal?
3764*22dc650dSSadaf Ebrahimi RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3765*22dc650dSSadaf Ebrahimi PARSED_LITERAL(c, parsed_pattern);
3766*22dc650dSSadaf Ebrahimi }
3767*22dc650dSSadaf Ebrahimi }
3768*22dc650dSSadaf Ebrahimi
3769*22dc650dSSadaf Ebrahimi /* Handle escapes in a class */
3770*22dc650dSSadaf Ebrahimi
3771*22dc650dSSadaf Ebrahimi else
3772*22dc650dSSadaf Ebrahimi {
3773*22dc650dSSadaf Ebrahimi tempptr = ptr;
3774*22dc650dSSadaf Ebrahimi escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3775*22dc650dSSadaf Ebrahimi xoptions, TRUE, cb);
3776*22dc650dSSadaf Ebrahimi
3777*22dc650dSSadaf Ebrahimi if (errorcode != 0)
3778*22dc650dSSadaf Ebrahimi {
3779*22dc650dSSadaf Ebrahimi if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3780*22dc650dSSadaf Ebrahimi goto FAILED;
3781*22dc650dSSadaf Ebrahimi ptr = tempptr;
3782*22dc650dSSadaf Ebrahimi if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3783*22dc650dSSadaf Ebrahimi {
3784*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3785*22dc650dSSadaf Ebrahimi }
3786*22dc650dSSadaf Ebrahimi escape = 0; /* Treat as literal character */
3787*22dc650dSSadaf Ebrahimi }
3788*22dc650dSSadaf Ebrahimi
3789*22dc650dSSadaf Ebrahimi switch(escape)
3790*22dc650dSSadaf Ebrahimi {
3791*22dc650dSSadaf Ebrahimi case 0: /* Escaped character code point is in c */
3792*22dc650dSSadaf Ebrahimi char_is_literal = FALSE;
3793*22dc650dSSadaf Ebrahimi goto CLASS_LITERAL; /* (a few lines above) */
3794*22dc650dSSadaf Ebrahimi
3795*22dc650dSSadaf Ebrahimi case ESC_b:
3796*22dc650dSSadaf Ebrahimi c = CHAR_BS; /* \b is backspace in a class */
3797*22dc650dSSadaf Ebrahimi char_is_literal = FALSE;
3798*22dc650dSSadaf Ebrahimi goto CLASS_LITERAL;
3799*22dc650dSSadaf Ebrahimi
3800*22dc650dSSadaf Ebrahimi case ESC_Q:
3801*22dc650dSSadaf Ebrahimi inescq = TRUE; /* Enter literal mode */
3802*22dc650dSSadaf Ebrahimi goto CLASS_CONTINUE;
3803*22dc650dSSadaf Ebrahimi
3804*22dc650dSSadaf Ebrahimi case ESC_E: /* Ignore orphan \E */
3805*22dc650dSSadaf Ebrahimi goto CLASS_CONTINUE;
3806*22dc650dSSadaf Ebrahimi
3807*22dc650dSSadaf Ebrahimi case ESC_B: /* Always an error in a class */
3808*22dc650dSSadaf Ebrahimi case ESC_R:
3809*22dc650dSSadaf Ebrahimi case ESC_X:
3810*22dc650dSSadaf Ebrahimi errorcode = ERR7;
3811*22dc650dSSadaf Ebrahimi ptr--;
3812*22dc650dSSadaf Ebrahimi goto FAILED;
3813*22dc650dSSadaf Ebrahimi }
3814*22dc650dSSadaf Ebrahimi
3815*22dc650dSSadaf Ebrahimi /* The second part of a range can be a single-character escape
3816*22dc650dSSadaf Ebrahimi sequence (detected above), but not any of the other escapes. Perl
3817*22dc650dSSadaf Ebrahimi treats a hyphen as a literal in such circumstances. However, in Perl's
3818*22dc650dSSadaf Ebrahimi warning mode, a warning is given, so PCRE now faults it, as it is
3819*22dc650dSSadaf Ebrahimi almost certainly a mistake on the user's part. */
3820*22dc650dSSadaf Ebrahimi
3821*22dc650dSSadaf Ebrahimi if (class_range_state == RANGE_STARTED)
3822*22dc650dSSadaf Ebrahimi {
3823*22dc650dSSadaf Ebrahimi errorcode = ERR50;
3824*22dc650dSSadaf Ebrahimi goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
3825*22dc650dSSadaf Ebrahimi }
3826*22dc650dSSadaf Ebrahimi
3827*22dc650dSSadaf Ebrahimi /* Of the remaining escapes, only those that define characters are
3828*22dc650dSSadaf Ebrahimi allowed in a class. None may start a range. */
3829*22dc650dSSadaf Ebrahimi
3830*22dc650dSSadaf Ebrahimi class_range_state = RANGE_NO;
3831*22dc650dSSadaf Ebrahimi switch(escape)
3832*22dc650dSSadaf Ebrahimi {
3833*22dc650dSSadaf Ebrahimi case ESC_N:
3834*22dc650dSSadaf Ebrahimi errorcode = ERR71;
3835*22dc650dSSadaf Ebrahimi goto FAILED;
3836*22dc650dSSadaf Ebrahimi
3837*22dc650dSSadaf Ebrahimi case ESC_H:
3838*22dc650dSSadaf Ebrahimi case ESC_h:
3839*22dc650dSSadaf Ebrahimi case ESC_V:
3840*22dc650dSSadaf Ebrahimi case ESC_v:
3841*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + escape;
3842*22dc650dSSadaf Ebrahimi break;
3843*22dc650dSSadaf Ebrahimi
3844*22dc650dSSadaf Ebrahimi /* These escapes may be converted to Unicode property tests when
3845*22dc650dSSadaf Ebrahimi PCRE2_UCP is set. */
3846*22dc650dSSadaf Ebrahimi
3847*22dc650dSSadaf Ebrahimi case ESC_d:
3848*22dc650dSSadaf Ebrahimi case ESC_D:
3849*22dc650dSSadaf Ebrahimi case ESC_s:
3850*22dc650dSSadaf Ebrahimi case ESC_S:
3851*22dc650dSSadaf Ebrahimi case ESC_w:
3852*22dc650dSSadaf Ebrahimi case ESC_W:
3853*22dc650dSSadaf Ebrahimi parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3854*22dc650dSSadaf Ebrahimi xoptions);
3855*22dc650dSSadaf Ebrahimi break;
3856*22dc650dSSadaf Ebrahimi
3857*22dc650dSSadaf Ebrahimi /* Explicit Unicode property matching */
3858*22dc650dSSadaf Ebrahimi
3859*22dc650dSSadaf Ebrahimi case ESC_P:
3860*22dc650dSSadaf Ebrahimi case ESC_p:
3861*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3862*22dc650dSSadaf Ebrahimi {
3863*22dc650dSSadaf Ebrahimi BOOL negated;
3864*22dc650dSSadaf Ebrahimi uint16_t ptype = 0, pdata = 0;
3865*22dc650dSSadaf Ebrahimi if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3866*22dc650dSSadaf Ebrahimi goto FAILED;
3867*22dc650dSSadaf Ebrahimi if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3868*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + escape;
3869*22dc650dSSadaf Ebrahimi *parsed_pattern++ = (ptype << 16) | pdata;
3870*22dc650dSSadaf Ebrahimi }
3871*22dc650dSSadaf Ebrahimi #else
3872*22dc650dSSadaf Ebrahimi errorcode = ERR45;
3873*22dc650dSSadaf Ebrahimi goto FAILED;
3874*22dc650dSSadaf Ebrahimi #endif
3875*22dc650dSSadaf Ebrahimi break; /* End \P and \p */
3876*22dc650dSSadaf Ebrahimi
3877*22dc650dSSadaf Ebrahimi default: /* All others are not allowed in a class */
3878*22dc650dSSadaf Ebrahimi errorcode = ERR7;
3879*22dc650dSSadaf Ebrahimi ptr--;
3880*22dc650dSSadaf Ebrahimi goto FAILED;
3881*22dc650dSSadaf Ebrahimi }
3882*22dc650dSSadaf Ebrahimi
3883*22dc650dSSadaf Ebrahimi /* Perl gives a warning unless a following hyphen is the last character
3884*22dc650dSSadaf Ebrahimi in the class. PCRE throws an error. */
3885*22dc650dSSadaf Ebrahimi
3886*22dc650dSSadaf Ebrahimi if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3887*22dc650dSSadaf Ebrahimi ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3888*22dc650dSSadaf Ebrahimi {
3889*22dc650dSSadaf Ebrahimi errorcode = ERR50;
3890*22dc650dSSadaf Ebrahimi goto FAILED;
3891*22dc650dSSadaf Ebrahimi }
3892*22dc650dSSadaf Ebrahimi }
3893*22dc650dSSadaf Ebrahimi
3894*22dc650dSSadaf Ebrahimi /* Proceed to next thing in the class. */
3895*22dc650dSSadaf Ebrahimi
3896*22dc650dSSadaf Ebrahimi CLASS_CONTINUE:
3897*22dc650dSSadaf Ebrahimi if (ptr >= ptrend)
3898*22dc650dSSadaf Ebrahimi {
3899*22dc650dSSadaf Ebrahimi errorcode = ERR6; /* Missing terminating ']' */
3900*22dc650dSSadaf Ebrahimi goto FAILED;
3901*22dc650dSSadaf Ebrahimi }
3902*22dc650dSSadaf Ebrahimi GETCHARINCTEST(c, ptr);
3903*22dc650dSSadaf Ebrahimi if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3904*22dc650dSSadaf Ebrahimi } /* End of class-processing loop */
3905*22dc650dSSadaf Ebrahimi
3906*22dc650dSSadaf Ebrahimi /* -] at the end of a class is a literal '-' */
3907*22dc650dSSadaf Ebrahimi
3908*22dc650dSSadaf Ebrahimi if (class_range_state == RANGE_STARTED)
3909*22dc650dSSadaf Ebrahimi {
3910*22dc650dSSadaf Ebrahimi parsed_pattern[-1] = CHAR_MINUS;
3911*22dc650dSSadaf Ebrahimi class_range_state = RANGE_NO;
3912*22dc650dSSadaf Ebrahimi }
3913*22dc650dSSadaf Ebrahimi
3914*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_CLASS_END;
3915*22dc650dSSadaf Ebrahimi break; /* End of character class */
3916*22dc650dSSadaf Ebrahimi
3917*22dc650dSSadaf Ebrahimi
3918*22dc650dSSadaf Ebrahimi /* ---- Opening parenthesis ---- */
3919*22dc650dSSadaf Ebrahimi
3920*22dc650dSSadaf Ebrahimi case CHAR_LEFT_PARENTHESIS:
3921*22dc650dSSadaf Ebrahimi if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3922*22dc650dSSadaf Ebrahimi
3923*22dc650dSSadaf Ebrahimi /* If ( is not followed by ? it is either a capture or a special verb or an
3924*22dc650dSSadaf Ebrahimi alpha assertion or a positive non-atomic lookahead. */
3925*22dc650dSSadaf Ebrahimi
3926*22dc650dSSadaf Ebrahimi if (*ptr != CHAR_QUESTION_MARK)
3927*22dc650dSSadaf Ebrahimi {
3928*22dc650dSSadaf Ebrahimi const char *vn;
3929*22dc650dSSadaf Ebrahimi
3930*22dc650dSSadaf Ebrahimi /* Handle capturing brackets (or non-capturing if auto-capture is turned
3931*22dc650dSSadaf Ebrahimi off). */
3932*22dc650dSSadaf Ebrahimi
3933*22dc650dSSadaf Ebrahimi if (*ptr != CHAR_ASTERISK)
3934*22dc650dSSadaf Ebrahimi {
3935*22dc650dSSadaf Ebrahimi nest_depth++;
3936*22dc650dSSadaf Ebrahimi if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3937*22dc650dSSadaf Ebrahimi {
3938*22dc650dSSadaf Ebrahimi if (cb->bracount >= MAX_GROUP_NUMBER)
3939*22dc650dSSadaf Ebrahimi {
3940*22dc650dSSadaf Ebrahimi errorcode = ERR97;
3941*22dc650dSSadaf Ebrahimi goto FAILED;
3942*22dc650dSSadaf Ebrahimi }
3943*22dc650dSSadaf Ebrahimi cb->bracount++;
3944*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_CAPTURE | cb->bracount;
3945*22dc650dSSadaf Ebrahimi }
3946*22dc650dSSadaf Ebrahimi else *parsed_pattern++ = META_NOCAPTURE;
3947*22dc650dSSadaf Ebrahimi }
3948*22dc650dSSadaf Ebrahimi
3949*22dc650dSSadaf Ebrahimi /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3950*22dc650dSSadaf Ebrahimi quantifier" error rather than "(*MARK) must have an argument". */
3951*22dc650dSSadaf Ebrahimi
3952*22dc650dSSadaf Ebrahimi else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3953*22dc650dSSadaf Ebrahimi break;
3954*22dc650dSSadaf Ebrahimi
3955*22dc650dSSadaf Ebrahimi /* Handle "alpha assertions" such as (*pla:...). Most of these are
3956*22dc650dSSadaf Ebrahimi synonyms for the historical symbolic assertions, but the script run and
3957*22dc650dSSadaf Ebrahimi non-atomic lookaround ones are new. They are distinguished by starting
3958*22dc650dSSadaf Ebrahimi with a lower case letter. Checking both ends of the alphabet makes this
3959*22dc650dSSadaf Ebrahimi work in all character codes. */
3960*22dc650dSSadaf Ebrahimi
3961*22dc650dSSadaf Ebrahimi else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3962*22dc650dSSadaf Ebrahimi {
3963*22dc650dSSadaf Ebrahimi uint32_t meta;
3964*22dc650dSSadaf Ebrahimi
3965*22dc650dSSadaf Ebrahimi vn = alasnames;
3966*22dc650dSSadaf Ebrahimi if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3967*22dc650dSSadaf Ebrahimi &errorcode, cb)) goto FAILED;
3968*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || *ptr != CHAR_COLON)
3969*22dc650dSSadaf Ebrahimi {
3970*22dc650dSSadaf Ebrahimi errorcode = ERR95; /* Malformed */
3971*22dc650dSSadaf Ebrahimi goto FAILED;
3972*22dc650dSSadaf Ebrahimi }
3973*22dc650dSSadaf Ebrahimi
3974*22dc650dSSadaf Ebrahimi /* Scan the table of alpha assertion names */
3975*22dc650dSSadaf Ebrahimi
3976*22dc650dSSadaf Ebrahimi for (i = 0; i < alascount; i++)
3977*22dc650dSSadaf Ebrahimi {
3978*22dc650dSSadaf Ebrahimi if (namelen == alasmeta[i].len &&
3979*22dc650dSSadaf Ebrahimi PRIV(strncmp_c8)(name, vn, namelen) == 0)
3980*22dc650dSSadaf Ebrahimi break;
3981*22dc650dSSadaf Ebrahimi vn += alasmeta[i].len + 1;
3982*22dc650dSSadaf Ebrahimi }
3983*22dc650dSSadaf Ebrahimi
3984*22dc650dSSadaf Ebrahimi if (i >= alascount)
3985*22dc650dSSadaf Ebrahimi {
3986*22dc650dSSadaf Ebrahimi errorcode = ERR95; /* Alpha assertion not recognized */
3987*22dc650dSSadaf Ebrahimi goto FAILED;
3988*22dc650dSSadaf Ebrahimi }
3989*22dc650dSSadaf Ebrahimi
3990*22dc650dSSadaf Ebrahimi /* Check for expecting an assertion condition. If so, only atomic
3991*22dc650dSSadaf Ebrahimi lookaround assertions are valid. */
3992*22dc650dSSadaf Ebrahimi
3993*22dc650dSSadaf Ebrahimi meta = alasmeta[i].meta;
3994*22dc650dSSadaf Ebrahimi if (prev_expect_cond_assert > 0 &&
3995*22dc650dSSadaf Ebrahimi (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3996*22dc650dSSadaf Ebrahimi {
3997*22dc650dSSadaf Ebrahimi errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3998*22dc650dSSadaf Ebrahimi ERR98 : ERR28; /* (Atomic) assertion expected */
3999*22dc650dSSadaf Ebrahimi goto FAILED;
4000*22dc650dSSadaf Ebrahimi }
4001*22dc650dSSadaf Ebrahimi
4002*22dc650dSSadaf Ebrahimi /* The lookaround alphabetic synonyms can mostly be handled by jumping
4003*22dc650dSSadaf Ebrahimi to the code that handles the traditional symbolic forms. */
4004*22dc650dSSadaf Ebrahimi
4005*22dc650dSSadaf Ebrahimi switch(meta)
4006*22dc650dSSadaf Ebrahimi {
4007*22dc650dSSadaf Ebrahimi default:
4008*22dc650dSSadaf Ebrahimi errorcode = ERR89; /* Unknown code; should never occur because */
4009*22dc650dSSadaf Ebrahimi goto FAILED; /* the meta values come from a table above. */
4010*22dc650dSSadaf Ebrahimi
4011*22dc650dSSadaf Ebrahimi case META_ATOMIC:
4012*22dc650dSSadaf Ebrahimi goto ATOMIC_GROUP;
4013*22dc650dSSadaf Ebrahimi
4014*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD:
4015*22dc650dSSadaf Ebrahimi goto POSITIVE_LOOK_AHEAD;
4016*22dc650dSSadaf Ebrahimi
4017*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD_NA:
4018*22dc650dSSadaf Ebrahimi goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4019*22dc650dSSadaf Ebrahimi
4020*22dc650dSSadaf Ebrahimi case META_LOOKAHEADNOT:
4021*22dc650dSSadaf Ebrahimi goto NEGATIVE_LOOK_AHEAD;
4022*22dc650dSSadaf Ebrahimi
4023*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND:
4024*22dc650dSSadaf Ebrahimi case META_LOOKBEHINDNOT:
4025*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND_NA:
4026*22dc650dSSadaf Ebrahimi *parsed_pattern++ = meta;
4027*22dc650dSSadaf Ebrahimi ptr--;
4028*22dc650dSSadaf Ebrahimi goto POST_LOOKBEHIND;
4029*22dc650dSSadaf Ebrahimi
4030*22dc650dSSadaf Ebrahimi /* The script run facilities are handled here. Unicode support is
4031*22dc650dSSadaf Ebrahimi required (give an error if not, as this is a security issue). Always
4032*22dc650dSSadaf Ebrahimi record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4033*22dc650dSSadaf Ebrahimi META_ATOMIC and remember that we need two META_KETs at the end. */
4034*22dc650dSSadaf Ebrahimi
4035*22dc650dSSadaf Ebrahimi case META_SCRIPT_RUN:
4036*22dc650dSSadaf Ebrahimi case META_ATOMIC_SCRIPT_RUN:
4037*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
4038*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_SCRIPT_RUN;
4039*22dc650dSSadaf Ebrahimi nest_depth++;
4040*22dc650dSSadaf Ebrahimi ptr++;
4041*22dc650dSSadaf Ebrahimi if (meta == META_ATOMIC_SCRIPT_RUN)
4042*22dc650dSSadaf Ebrahimi {
4043*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ATOMIC;
4044*22dc650dSSadaf Ebrahimi if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4045*22dc650dSSadaf Ebrahimi else if (++top_nest >= end_nests)
4046*22dc650dSSadaf Ebrahimi {
4047*22dc650dSSadaf Ebrahimi errorcode = ERR84;
4048*22dc650dSSadaf Ebrahimi goto FAILED;
4049*22dc650dSSadaf Ebrahimi }
4050*22dc650dSSadaf Ebrahimi top_nest->nest_depth = nest_depth;
4051*22dc650dSSadaf Ebrahimi top_nest->flags = NSF_ATOMICSR;
4052*22dc650dSSadaf Ebrahimi top_nest->options = options & PARSE_TRACKED_OPTIONS;
4053*22dc650dSSadaf Ebrahimi top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4054*22dc650dSSadaf Ebrahimi }
4055*22dc650dSSadaf Ebrahimi break;
4056*22dc650dSSadaf Ebrahimi #else /* SUPPORT_UNICODE */
4057*22dc650dSSadaf Ebrahimi errorcode = ERR96;
4058*22dc650dSSadaf Ebrahimi goto FAILED;
4059*22dc650dSSadaf Ebrahimi #endif
4060*22dc650dSSadaf Ebrahimi }
4061*22dc650dSSadaf Ebrahimi }
4062*22dc650dSSadaf Ebrahimi
4063*22dc650dSSadaf Ebrahimi
4064*22dc650dSSadaf Ebrahimi /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4065*22dc650dSSadaf Ebrahimi
4066*22dc650dSSadaf Ebrahimi else
4067*22dc650dSSadaf Ebrahimi {
4068*22dc650dSSadaf Ebrahimi vn = verbnames;
4069*22dc650dSSadaf Ebrahimi if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4070*22dc650dSSadaf Ebrahimi &errorcode, cb)) goto FAILED;
4071*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4072*22dc650dSSadaf Ebrahimi *ptr != CHAR_RIGHT_PARENTHESIS))
4073*22dc650dSSadaf Ebrahimi {
4074*22dc650dSSadaf Ebrahimi errorcode = ERR60; /* Malformed */
4075*22dc650dSSadaf Ebrahimi goto FAILED;
4076*22dc650dSSadaf Ebrahimi }
4077*22dc650dSSadaf Ebrahimi
4078*22dc650dSSadaf Ebrahimi /* Scan the table of verb names */
4079*22dc650dSSadaf Ebrahimi
4080*22dc650dSSadaf Ebrahimi for (i = 0; i < verbcount; i++)
4081*22dc650dSSadaf Ebrahimi {
4082*22dc650dSSadaf Ebrahimi if (namelen == verbs[i].len &&
4083*22dc650dSSadaf Ebrahimi PRIV(strncmp_c8)(name, vn, namelen) == 0)
4084*22dc650dSSadaf Ebrahimi break;
4085*22dc650dSSadaf Ebrahimi vn += verbs[i].len + 1;
4086*22dc650dSSadaf Ebrahimi }
4087*22dc650dSSadaf Ebrahimi
4088*22dc650dSSadaf Ebrahimi if (i >= verbcount)
4089*22dc650dSSadaf Ebrahimi {
4090*22dc650dSSadaf Ebrahimi errorcode = ERR60; /* Verb not recognized */
4091*22dc650dSSadaf Ebrahimi goto FAILED;
4092*22dc650dSSadaf Ebrahimi }
4093*22dc650dSSadaf Ebrahimi
4094*22dc650dSSadaf Ebrahimi /* An empty argument is treated as no argument. */
4095*22dc650dSSadaf Ebrahimi
4096*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4097*22dc650dSSadaf Ebrahimi ptr[1] == CHAR_RIGHT_PARENTHESIS)
4098*22dc650dSSadaf Ebrahimi ptr++; /* Advance to the closing parens */
4099*22dc650dSSadaf Ebrahimi
4100*22dc650dSSadaf Ebrahimi /* Check for mandatory non-empty argument; this is (*MARK) */
4101*22dc650dSSadaf Ebrahimi
4102*22dc650dSSadaf Ebrahimi if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4103*22dc650dSSadaf Ebrahimi {
4104*22dc650dSSadaf Ebrahimi errorcode = ERR66;
4105*22dc650dSSadaf Ebrahimi goto FAILED;
4106*22dc650dSSadaf Ebrahimi }
4107*22dc650dSSadaf Ebrahimi
4108*22dc650dSSadaf Ebrahimi /* Remember where this verb, possibly with a preceding (*MARK), starts,
4109*22dc650dSSadaf Ebrahimi for handling quantified (*ACCEPT). */
4110*22dc650dSSadaf Ebrahimi
4111*22dc650dSSadaf Ebrahimi verbstartptr = parsed_pattern;
4112*22dc650dSSadaf Ebrahimi okquantifier = (verbs[i].meta == META_ACCEPT);
4113*22dc650dSSadaf Ebrahimi
4114*22dc650dSSadaf Ebrahimi /* It appears that Perl allows any characters whatsoever, other than a
4115*22dc650dSSadaf Ebrahimi closing parenthesis, to appear in arguments ("names"), so we no longer
4116*22dc650dSSadaf Ebrahimi insist on letters, digits, and underscores. Perl does not, however, do
4117*22dc650dSSadaf Ebrahimi any interpretation within arguments, and has no means of including a
4118*22dc650dSSadaf Ebrahimi closing parenthesis. PCRE supports escape processing but only when it
4119*22dc650dSSadaf Ebrahimi is requested by an option. We set inverbname TRUE here, and let the
4120*22dc650dSSadaf Ebrahimi main loop take care of this so that escape and \x processing is done by
4121*22dc650dSSadaf Ebrahimi the main code above. */
4122*22dc650dSSadaf Ebrahimi
4123*22dc650dSSadaf Ebrahimi if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
4124*22dc650dSSadaf Ebrahimi {
4125*22dc650dSSadaf Ebrahimi /* Some optional arguments can be treated as a preceding (*MARK) */
4126*22dc650dSSadaf Ebrahimi
4127*22dc650dSSadaf Ebrahimi if (verbs[i].has_arg < 0)
4128*22dc650dSSadaf Ebrahimi {
4129*22dc650dSSadaf Ebrahimi add_after_mark = verbs[i].meta;
4130*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_MARK;
4131*22dc650dSSadaf Ebrahimi }
4132*22dc650dSSadaf Ebrahimi
4133*22dc650dSSadaf Ebrahimi /* The remaining verbs with arguments (except *MARK) need a different
4134*22dc650dSSadaf Ebrahimi opcode. */
4135*22dc650dSSadaf Ebrahimi
4136*22dc650dSSadaf Ebrahimi else
4137*22dc650dSSadaf Ebrahimi {
4138*22dc650dSSadaf Ebrahimi *parsed_pattern++ = verbs[i].meta +
4139*22dc650dSSadaf Ebrahimi ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4140*22dc650dSSadaf Ebrahimi }
4141*22dc650dSSadaf Ebrahimi
4142*22dc650dSSadaf Ebrahimi /* Set up for reading the name in the main loop. */
4143*22dc650dSSadaf Ebrahimi
4144*22dc650dSSadaf Ebrahimi verblengthptr = parsed_pattern++;
4145*22dc650dSSadaf Ebrahimi verbnamestart = ptr;
4146*22dc650dSSadaf Ebrahimi inverbname = TRUE;
4147*22dc650dSSadaf Ebrahimi }
4148*22dc650dSSadaf Ebrahimi else /* No verb "name" argument */
4149*22dc650dSSadaf Ebrahimi {
4150*22dc650dSSadaf Ebrahimi *parsed_pattern++ = verbs[i].meta;
4151*22dc650dSSadaf Ebrahimi }
4152*22dc650dSSadaf Ebrahimi } /* End of (*VERB) handling */
4153*22dc650dSSadaf Ebrahimi break; /* Done with this parenthesis */
4154*22dc650dSSadaf Ebrahimi } /* End of groups that don't start with (? */
4155*22dc650dSSadaf Ebrahimi
4156*22dc650dSSadaf Ebrahimi
4157*22dc650dSSadaf Ebrahimi /* ---- Items starting (? ---- */
4158*22dc650dSSadaf Ebrahimi
4159*22dc650dSSadaf Ebrahimi /* The type of item is determined by what follows (?. Handle (?| and option
4160*22dc650dSSadaf Ebrahimi changes under "default" because both need a new block on the nest stack.
4161*22dc650dSSadaf Ebrahimi Comments starting with (?# are handled above. Note that there is some
4162*22dc650dSSadaf Ebrahimi ambiguity about the sequence (?- because if a digit follows it's a relative
4163*22dc650dSSadaf Ebrahimi recursion or subroutine call whereas otherwise it's an option unsetting. */
4164*22dc650dSSadaf Ebrahimi
4165*22dc650dSSadaf Ebrahimi if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4166*22dc650dSSadaf Ebrahimi
4167*22dc650dSSadaf Ebrahimi switch(*ptr)
4168*22dc650dSSadaf Ebrahimi {
4169*22dc650dSSadaf Ebrahimi default:
4170*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4171*22dc650dSSadaf Ebrahimi goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
4172*22dc650dSSadaf Ebrahimi
4173*22dc650dSSadaf Ebrahimi /* We now have either (?| or a (possibly empty) option setting,
4174*22dc650dSSadaf Ebrahimi optionally followed by a non-capturing group. */
4175*22dc650dSSadaf Ebrahimi
4176*22dc650dSSadaf Ebrahimi nest_depth++;
4177*22dc650dSSadaf Ebrahimi if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4178*22dc650dSSadaf Ebrahimi else if (++top_nest >= end_nests)
4179*22dc650dSSadaf Ebrahimi {
4180*22dc650dSSadaf Ebrahimi errorcode = ERR84;
4181*22dc650dSSadaf Ebrahimi goto FAILED;
4182*22dc650dSSadaf Ebrahimi }
4183*22dc650dSSadaf Ebrahimi top_nest->nest_depth = nest_depth;
4184*22dc650dSSadaf Ebrahimi top_nest->flags = 0;
4185*22dc650dSSadaf Ebrahimi top_nest->options = options & PARSE_TRACKED_OPTIONS;
4186*22dc650dSSadaf Ebrahimi top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4187*22dc650dSSadaf Ebrahimi
4188*22dc650dSSadaf Ebrahimi /* Start of non-capturing group that resets the capture count for each
4189*22dc650dSSadaf Ebrahimi branch. */
4190*22dc650dSSadaf Ebrahimi
4191*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_VERTICAL_LINE)
4192*22dc650dSSadaf Ebrahimi {
4193*22dc650dSSadaf Ebrahimi top_nest->reset_group = (uint16_t)cb->bracount;
4194*22dc650dSSadaf Ebrahimi top_nest->max_group = (uint16_t)cb->bracount;
4195*22dc650dSSadaf Ebrahimi top_nest->flags |= NSF_RESET;
4196*22dc650dSSadaf Ebrahimi cb->external_flags |= PCRE2_DUPCAPUSED;
4197*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_NOCAPTURE;
4198*22dc650dSSadaf Ebrahimi ptr++;
4199*22dc650dSSadaf Ebrahimi }
4200*22dc650dSSadaf Ebrahimi
4201*22dc650dSSadaf Ebrahimi /* Scan for options aimnrsxJU (including aD, aP, aS, aT, aW and xx) to be set or unset. */
4202*22dc650dSSadaf Ebrahimi
4203*22dc650dSSadaf Ebrahimi else
4204*22dc650dSSadaf Ebrahimi {
4205*22dc650dSSadaf Ebrahimi BOOL hyphenok = TRUE;
4206*22dc650dSSadaf Ebrahimi uint32_t oldoptions = options;
4207*22dc650dSSadaf Ebrahimi uint32_t oldxoptions = xoptions;
4208*22dc650dSSadaf Ebrahimi
4209*22dc650dSSadaf Ebrahimi top_nest->reset_group = 0;
4210*22dc650dSSadaf Ebrahimi top_nest->max_group = 0;
4211*22dc650dSSadaf Ebrahimi set = unset = 0;
4212*22dc650dSSadaf Ebrahimi optset = &set;
4213*22dc650dSSadaf Ebrahimi xset = xunset = 0;
4214*22dc650dSSadaf Ebrahimi xoptset = &xset;
4215*22dc650dSSadaf Ebrahimi
4216*22dc650dSSadaf Ebrahimi /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4217*22dc650dSSadaf Ebrahimi
4218*22dc650dSSadaf Ebrahimi if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4219*22dc650dSSadaf Ebrahimi {
4220*22dc650dSSadaf Ebrahimi options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4221*22dc650dSSadaf Ebrahimi PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4222*22dc650dSSadaf Ebrahimi xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4223*22dc650dSSadaf Ebrahimi hyphenok = FALSE;
4224*22dc650dSSadaf Ebrahimi ptr++;
4225*22dc650dSSadaf Ebrahimi }
4226*22dc650dSSadaf Ebrahimi
4227*22dc650dSSadaf Ebrahimi while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4228*22dc650dSSadaf Ebrahimi *ptr != CHAR_COLON)
4229*22dc650dSSadaf Ebrahimi {
4230*22dc650dSSadaf Ebrahimi switch (*ptr++)
4231*22dc650dSSadaf Ebrahimi {
4232*22dc650dSSadaf Ebrahimi case CHAR_MINUS:
4233*22dc650dSSadaf Ebrahimi if (!hyphenok)
4234*22dc650dSSadaf Ebrahimi {
4235*22dc650dSSadaf Ebrahimi errorcode = ERR94;
4236*22dc650dSSadaf Ebrahimi ptr--; /* Correct the offset */
4237*22dc650dSSadaf Ebrahimi goto FAILED;
4238*22dc650dSSadaf Ebrahimi }
4239*22dc650dSSadaf Ebrahimi optset = &unset;
4240*22dc650dSSadaf Ebrahimi xoptset = &xunset;
4241*22dc650dSSadaf Ebrahimi hyphenok = FALSE;
4242*22dc650dSSadaf Ebrahimi break;
4243*22dc650dSSadaf Ebrahimi
4244*22dc650dSSadaf Ebrahimi /* There are some two-character sequences that start with 'a'. */
4245*22dc650dSSadaf Ebrahimi
4246*22dc650dSSadaf Ebrahimi case CHAR_a:
4247*22dc650dSSadaf Ebrahimi if (ptr < ptrend)
4248*22dc650dSSadaf Ebrahimi {
4249*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_D)
4250*22dc650dSSadaf Ebrahimi {
4251*22dc650dSSadaf Ebrahimi *xoptset |= PCRE2_EXTRA_ASCII_BSD;
4252*22dc650dSSadaf Ebrahimi ptr++;
4253*22dc650dSSadaf Ebrahimi break;
4254*22dc650dSSadaf Ebrahimi }
4255*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_P)
4256*22dc650dSSadaf Ebrahimi {
4257*22dc650dSSadaf Ebrahimi *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4258*22dc650dSSadaf Ebrahimi ptr++;
4259*22dc650dSSadaf Ebrahimi break;
4260*22dc650dSSadaf Ebrahimi }
4261*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_S)
4262*22dc650dSSadaf Ebrahimi {
4263*22dc650dSSadaf Ebrahimi *xoptset |= PCRE2_EXTRA_ASCII_BSS;
4264*22dc650dSSadaf Ebrahimi ptr++;
4265*22dc650dSSadaf Ebrahimi break;
4266*22dc650dSSadaf Ebrahimi }
4267*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_T)
4268*22dc650dSSadaf Ebrahimi {
4269*22dc650dSSadaf Ebrahimi *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4270*22dc650dSSadaf Ebrahimi ptr++;
4271*22dc650dSSadaf Ebrahimi break;
4272*22dc650dSSadaf Ebrahimi }
4273*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_W)
4274*22dc650dSSadaf Ebrahimi {
4275*22dc650dSSadaf Ebrahimi *xoptset |= PCRE2_EXTRA_ASCII_BSW;
4276*22dc650dSSadaf Ebrahimi ptr++;
4277*22dc650dSSadaf Ebrahimi break;
4278*22dc650dSSadaf Ebrahimi }
4279*22dc650dSSadaf Ebrahimi }
4280*22dc650dSSadaf Ebrahimi *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4281*22dc650dSSadaf Ebrahimi PCRE2_EXTRA_ASCII_BSW|
4282*22dc650dSSadaf Ebrahimi PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4283*22dc650dSSadaf Ebrahimi break;
4284*22dc650dSSadaf Ebrahimi
4285*22dc650dSSadaf Ebrahimi case CHAR_J: /* Record that it changed in the external options */
4286*22dc650dSSadaf Ebrahimi *optset |= PCRE2_DUPNAMES;
4287*22dc650dSSadaf Ebrahimi cb->external_flags |= PCRE2_JCHANGED;
4288*22dc650dSSadaf Ebrahimi break;
4289*22dc650dSSadaf Ebrahimi
4290*22dc650dSSadaf Ebrahimi case CHAR_i: *optset |= PCRE2_CASELESS; break;
4291*22dc650dSSadaf Ebrahimi case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4292*22dc650dSSadaf Ebrahimi case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4293*22dc650dSSadaf Ebrahimi case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4294*22dc650dSSadaf Ebrahimi case CHAR_s: *optset |= PCRE2_DOTALL; break;
4295*22dc650dSSadaf Ebrahimi case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4296*22dc650dSSadaf Ebrahimi
4297*22dc650dSSadaf Ebrahimi /* If x appears twice it sets the extended extended option. */
4298*22dc650dSSadaf Ebrahimi
4299*22dc650dSSadaf Ebrahimi case CHAR_x:
4300*22dc650dSSadaf Ebrahimi *optset |= PCRE2_EXTENDED;
4301*22dc650dSSadaf Ebrahimi if (ptr < ptrend && *ptr == CHAR_x)
4302*22dc650dSSadaf Ebrahimi {
4303*22dc650dSSadaf Ebrahimi *optset |= PCRE2_EXTENDED_MORE;
4304*22dc650dSSadaf Ebrahimi ptr++;
4305*22dc650dSSadaf Ebrahimi }
4306*22dc650dSSadaf Ebrahimi break;
4307*22dc650dSSadaf Ebrahimi
4308*22dc650dSSadaf Ebrahimi default:
4309*22dc650dSSadaf Ebrahimi errorcode = ERR11;
4310*22dc650dSSadaf Ebrahimi ptr--; /* Correct the offset */
4311*22dc650dSSadaf Ebrahimi goto FAILED;
4312*22dc650dSSadaf Ebrahimi }
4313*22dc650dSSadaf Ebrahimi }
4314*22dc650dSSadaf Ebrahimi
4315*22dc650dSSadaf Ebrahimi /* If we are setting extended without extended-more, ensure that any
4316*22dc650dSSadaf Ebrahimi existing extended-more gets unset. Also, unsetting extended must also
4317*22dc650dSSadaf Ebrahimi unset extended-more. */
4318*22dc650dSSadaf Ebrahimi
4319*22dc650dSSadaf Ebrahimi if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4320*22dc650dSSadaf Ebrahimi (unset & PCRE2_EXTENDED) != 0)
4321*22dc650dSSadaf Ebrahimi unset |= PCRE2_EXTENDED_MORE;
4322*22dc650dSSadaf Ebrahimi
4323*22dc650dSSadaf Ebrahimi options = (options | set) & (~unset);
4324*22dc650dSSadaf Ebrahimi xoptions = (xoptions | xset) & (~xunset);
4325*22dc650dSSadaf Ebrahimi
4326*22dc650dSSadaf Ebrahimi /* If the options ended with ')' this is not the start of a nested
4327*22dc650dSSadaf Ebrahimi group with option changes, so the options change at this level.
4328*22dc650dSSadaf Ebrahimi In this case, if the previous level set up a nest block, discard the
4329*22dc650dSSadaf Ebrahimi one we have just created. Otherwise adjust it for the previous level.
4330*22dc650dSSadaf Ebrahimi If the options ended with ':' we are starting a non-capturing group,
4331*22dc650dSSadaf Ebrahimi possibly with an options setting. */
4332*22dc650dSSadaf Ebrahimi
4333*22dc650dSSadaf Ebrahimi if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4334*22dc650dSSadaf Ebrahimi if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4335*22dc650dSSadaf Ebrahimi {
4336*22dc650dSSadaf Ebrahimi nest_depth--; /* This is not a nested group after all. */
4337*22dc650dSSadaf Ebrahimi if (top_nest > (nest_save *)(cb->start_workspace) &&
4338*22dc650dSSadaf Ebrahimi (top_nest-1)->nest_depth == nest_depth) top_nest--;
4339*22dc650dSSadaf Ebrahimi else top_nest->nest_depth = nest_depth;
4340*22dc650dSSadaf Ebrahimi }
4341*22dc650dSSadaf Ebrahimi else *parsed_pattern++ = META_NOCAPTURE;
4342*22dc650dSSadaf Ebrahimi
4343*22dc650dSSadaf Ebrahimi /* If nothing changed, no need to record. */
4344*22dc650dSSadaf Ebrahimi
4345*22dc650dSSadaf Ebrahimi if (options != oldoptions || xoptions != oldxoptions)
4346*22dc650dSSadaf Ebrahimi {
4347*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_OPTIONS;
4348*22dc650dSSadaf Ebrahimi *parsed_pattern++ = options;
4349*22dc650dSSadaf Ebrahimi *parsed_pattern++ = xoptions;
4350*22dc650dSSadaf Ebrahimi }
4351*22dc650dSSadaf Ebrahimi } /* End options processing */
4352*22dc650dSSadaf Ebrahimi break; /* End default case after (? */
4353*22dc650dSSadaf Ebrahimi
4354*22dc650dSSadaf Ebrahimi
4355*22dc650dSSadaf Ebrahimi /* ---- Python syntax support ---- */
4356*22dc650dSSadaf Ebrahimi
4357*22dc650dSSadaf Ebrahimi case CHAR_P:
4358*22dc650dSSadaf Ebrahimi if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4359*22dc650dSSadaf Ebrahimi
4360*22dc650dSSadaf Ebrahimi /* (?P<name> is the same as (?<name>, which defines a named group. */
4361*22dc650dSSadaf Ebrahimi
4362*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_LESS_THAN_SIGN)
4363*22dc650dSSadaf Ebrahimi {
4364*22dc650dSSadaf Ebrahimi terminator = CHAR_GREATER_THAN_SIGN;
4365*22dc650dSSadaf Ebrahimi goto DEFINE_NAME;
4366*22dc650dSSadaf Ebrahimi }
4367*22dc650dSSadaf Ebrahimi
4368*22dc650dSSadaf Ebrahimi /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4369*22dc650dSSadaf Ebrahimi call. */
4370*22dc650dSSadaf Ebrahimi
4371*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4372*22dc650dSSadaf Ebrahimi
4373*22dc650dSSadaf Ebrahimi /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4374*22dc650dSSadaf Ebrahimi else after (?P is an error. */
4375*22dc650dSSadaf Ebrahimi
4376*22dc650dSSadaf Ebrahimi if (*ptr != CHAR_EQUALS_SIGN)
4377*22dc650dSSadaf Ebrahimi {
4378*22dc650dSSadaf Ebrahimi errorcode = ERR41;
4379*22dc650dSSadaf Ebrahimi goto FAILED;
4380*22dc650dSSadaf Ebrahimi }
4381*22dc650dSSadaf Ebrahimi if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4382*22dc650dSSadaf Ebrahimi &namelen, &errorcode, cb)) goto FAILED;
4383*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_BACKREF_BYNAME;
4384*22dc650dSSadaf Ebrahimi *parsed_pattern++ = namelen;
4385*22dc650dSSadaf Ebrahimi PUTOFFSET(offset, parsed_pattern);
4386*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
4387*22dc650dSSadaf Ebrahimi break; /* End of (?P processing */
4388*22dc650dSSadaf Ebrahimi
4389*22dc650dSSadaf Ebrahimi
4390*22dc650dSSadaf Ebrahimi /* ---- Recursion/subroutine calls by number ---- */
4391*22dc650dSSadaf Ebrahimi
4392*22dc650dSSadaf Ebrahimi case CHAR_R:
4393*22dc650dSSadaf Ebrahimi i = 0; /* (?R) == (?R0) */
4394*22dc650dSSadaf Ebrahimi ptr++;
4395*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4396*22dc650dSSadaf Ebrahimi {
4397*22dc650dSSadaf Ebrahimi errorcode = ERR58;
4398*22dc650dSSadaf Ebrahimi goto FAILED;
4399*22dc650dSSadaf Ebrahimi }
4400*22dc650dSSadaf Ebrahimi goto SET_RECURSION;
4401*22dc650dSSadaf Ebrahimi
4402*22dc650dSSadaf Ebrahimi /* An item starting (?- followed by a digit comes here via the "default"
4403*22dc650dSSadaf Ebrahimi case because (?- followed by a non-digit is an options setting. */
4404*22dc650dSSadaf Ebrahimi
4405*22dc650dSSadaf Ebrahimi case CHAR_PLUS:
4406*22dc650dSSadaf Ebrahimi if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4407*22dc650dSSadaf Ebrahimi {
4408*22dc650dSSadaf Ebrahimi errorcode = ERR29; /* Missing number */
4409*22dc650dSSadaf Ebrahimi goto FAILED;
4410*22dc650dSSadaf Ebrahimi }
4411*22dc650dSSadaf Ebrahimi /* Fall through */
4412*22dc650dSSadaf Ebrahimi
4413*22dc650dSSadaf Ebrahimi case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4414*22dc650dSSadaf Ebrahimi case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4415*22dc650dSSadaf Ebrahimi RECURSION_BYNUMBER:
4416*22dc650dSSadaf Ebrahimi if (!read_number(&ptr, ptrend,
4417*22dc650dSSadaf Ebrahimi (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4418*22dc650dSSadaf Ebrahimi MAX_GROUP_NUMBER, ERR61,
4419*22dc650dSSadaf Ebrahimi &i, &errorcode)) goto FAILED;
4420*22dc650dSSadaf Ebrahimi if (i < 0) /* NB (?0) is permitted */
4421*22dc650dSSadaf Ebrahimi {
4422*22dc650dSSadaf Ebrahimi errorcode = ERR15; /* Unknown group */
4423*22dc650dSSadaf Ebrahimi goto FAILED_BACK;
4424*22dc650dSSadaf Ebrahimi }
4425*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4426*22dc650dSSadaf Ebrahimi goto UNCLOSED_PARENTHESIS;
4427*22dc650dSSadaf Ebrahimi
4428*22dc650dSSadaf Ebrahimi SET_RECURSION:
4429*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4430*22dc650dSSadaf Ebrahimi offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4431*22dc650dSSadaf Ebrahimi ptr++;
4432*22dc650dSSadaf Ebrahimi PUTOFFSET(offset, parsed_pattern);
4433*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
4434*22dc650dSSadaf Ebrahimi break; /* End of recursive call by number handling */
4435*22dc650dSSadaf Ebrahimi
4436*22dc650dSSadaf Ebrahimi
4437*22dc650dSSadaf Ebrahimi /* ---- Recursion/subroutine calls by name ---- */
4438*22dc650dSSadaf Ebrahimi
4439*22dc650dSSadaf Ebrahimi case CHAR_AMPERSAND:
4440*22dc650dSSadaf Ebrahimi RECURSE_BY_NAME:
4441*22dc650dSSadaf Ebrahimi if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4442*22dc650dSSadaf Ebrahimi &namelen, &errorcode, cb)) goto FAILED;
4443*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_RECURSE_BYNAME;
4444*22dc650dSSadaf Ebrahimi *parsed_pattern++ = namelen;
4445*22dc650dSSadaf Ebrahimi PUTOFFSET(offset, parsed_pattern);
4446*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
4447*22dc650dSSadaf Ebrahimi break;
4448*22dc650dSSadaf Ebrahimi
4449*22dc650dSSadaf Ebrahimi /* ---- Callout with numerical or string argument ---- */
4450*22dc650dSSadaf Ebrahimi
4451*22dc650dSSadaf Ebrahimi case CHAR_C:
4452*22dc650dSSadaf Ebrahimi if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4453*22dc650dSSadaf Ebrahimi
4454*22dc650dSSadaf Ebrahimi /* If the previous item was a condition starting (?(? an assertion,
4455*22dc650dSSadaf Ebrahimi optionally preceded by a callout, is expected. This is checked later on,
4456*22dc650dSSadaf Ebrahimi during actual compilation. However we need to identify this kind of
4457*22dc650dSSadaf Ebrahimi assertion in this pass because it must not be quantified. The value of
4458*22dc650dSSadaf Ebrahimi expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4459*22dc650dSSadaf Ebrahimi for a callout - still leaving a positive value that identifies the
4460*22dc650dSSadaf Ebrahimi assertion. Multiple callouts or any other items will make it zero or
4461*22dc650dSSadaf Ebrahimi less, which doesn't matter because they will cause an error later. */
4462*22dc650dSSadaf Ebrahimi
4463*22dc650dSSadaf Ebrahimi expect_cond_assert = prev_expect_cond_assert - 1;
4464*22dc650dSSadaf Ebrahimi
4465*22dc650dSSadaf Ebrahimi /* If previous_callout is not NULL, it means this follows a previous
4466*22dc650dSSadaf Ebrahimi callout. If it was a manual callout, do nothing; this means its "length
4467*22dc650dSSadaf Ebrahimi of next pattern item" field will remain zero. If it was an automatic
4468*22dc650dSSadaf Ebrahimi callout, abolish it. */
4469*22dc650dSSadaf Ebrahimi
4470*22dc650dSSadaf Ebrahimi if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4471*22dc650dSSadaf Ebrahimi previous_callout == parsed_pattern - 4 &&
4472*22dc650dSSadaf Ebrahimi parsed_pattern[-1] == 255)
4473*22dc650dSSadaf Ebrahimi parsed_pattern = previous_callout;
4474*22dc650dSSadaf Ebrahimi
4475*22dc650dSSadaf Ebrahimi /* Save for updating next pattern item length, and skip one item before
4476*22dc650dSSadaf Ebrahimi completing. */
4477*22dc650dSSadaf Ebrahimi
4478*22dc650dSSadaf Ebrahimi previous_callout = parsed_pattern;
4479*22dc650dSSadaf Ebrahimi after_manual_callout = 1;
4480*22dc650dSSadaf Ebrahimi
4481*22dc650dSSadaf Ebrahimi /* Handle a string argument; specific delimiter is required. */
4482*22dc650dSSadaf Ebrahimi
4483*22dc650dSSadaf Ebrahimi if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4484*22dc650dSSadaf Ebrahimi {
4485*22dc650dSSadaf Ebrahimi PCRE2_SIZE calloutlength;
4486*22dc650dSSadaf Ebrahimi PCRE2_SPTR startptr = ptr;
4487*22dc650dSSadaf Ebrahimi
4488*22dc650dSSadaf Ebrahimi delimiter = 0;
4489*22dc650dSSadaf Ebrahimi for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4490*22dc650dSSadaf Ebrahimi {
4491*22dc650dSSadaf Ebrahimi if (*ptr == PRIV(callout_start_delims)[i])
4492*22dc650dSSadaf Ebrahimi {
4493*22dc650dSSadaf Ebrahimi delimiter = PRIV(callout_end_delims)[i];
4494*22dc650dSSadaf Ebrahimi break;
4495*22dc650dSSadaf Ebrahimi }
4496*22dc650dSSadaf Ebrahimi }
4497*22dc650dSSadaf Ebrahimi if (delimiter == 0)
4498*22dc650dSSadaf Ebrahimi {
4499*22dc650dSSadaf Ebrahimi errorcode = ERR82;
4500*22dc650dSSadaf Ebrahimi goto FAILED;
4501*22dc650dSSadaf Ebrahimi }
4502*22dc650dSSadaf Ebrahimi
4503*22dc650dSSadaf Ebrahimi *parsed_pattern = META_CALLOUT_STRING;
4504*22dc650dSSadaf Ebrahimi parsed_pattern += 3; /* Skip pattern info */
4505*22dc650dSSadaf Ebrahimi
4506*22dc650dSSadaf Ebrahimi for (;;)
4507*22dc650dSSadaf Ebrahimi {
4508*22dc650dSSadaf Ebrahimi if (++ptr >= ptrend)
4509*22dc650dSSadaf Ebrahimi {
4510*22dc650dSSadaf Ebrahimi errorcode = ERR81;
4511*22dc650dSSadaf Ebrahimi ptr = startptr; /* To give a more useful message */
4512*22dc650dSSadaf Ebrahimi goto FAILED;
4513*22dc650dSSadaf Ebrahimi }
4514*22dc650dSSadaf Ebrahimi if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4515*22dc650dSSadaf Ebrahimi break;
4516*22dc650dSSadaf Ebrahimi }
4517*22dc650dSSadaf Ebrahimi
4518*22dc650dSSadaf Ebrahimi calloutlength = (PCRE2_SIZE)(ptr - startptr);
4519*22dc650dSSadaf Ebrahimi if (calloutlength > UINT32_MAX)
4520*22dc650dSSadaf Ebrahimi {
4521*22dc650dSSadaf Ebrahimi errorcode = ERR72;
4522*22dc650dSSadaf Ebrahimi goto FAILED;
4523*22dc650dSSadaf Ebrahimi }
4524*22dc650dSSadaf Ebrahimi *parsed_pattern++ = (uint32_t)calloutlength;
4525*22dc650dSSadaf Ebrahimi offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4526*22dc650dSSadaf Ebrahimi PUTOFFSET(offset, parsed_pattern);
4527*22dc650dSSadaf Ebrahimi }
4528*22dc650dSSadaf Ebrahimi
4529*22dc650dSSadaf Ebrahimi /* Handle a callout with an optional numerical argument, which must be
4530*22dc650dSSadaf Ebrahimi less than or equal to 255. A missing argument gives 0. */
4531*22dc650dSSadaf Ebrahimi
4532*22dc650dSSadaf Ebrahimi else
4533*22dc650dSSadaf Ebrahimi {
4534*22dc650dSSadaf Ebrahimi int n = 0;
4535*22dc650dSSadaf Ebrahimi *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
4536*22dc650dSSadaf Ebrahimi parsed_pattern += 3; /* Skip pattern info */
4537*22dc650dSSadaf Ebrahimi while (ptr < ptrend && IS_DIGIT(*ptr))
4538*22dc650dSSadaf Ebrahimi {
4539*22dc650dSSadaf Ebrahimi n = n * 10 + *ptr++ - CHAR_0;
4540*22dc650dSSadaf Ebrahimi if (n > 255)
4541*22dc650dSSadaf Ebrahimi {
4542*22dc650dSSadaf Ebrahimi errorcode = ERR38;
4543*22dc650dSSadaf Ebrahimi goto FAILED;
4544*22dc650dSSadaf Ebrahimi }
4545*22dc650dSSadaf Ebrahimi }
4546*22dc650dSSadaf Ebrahimi *parsed_pattern++ = n;
4547*22dc650dSSadaf Ebrahimi }
4548*22dc650dSSadaf Ebrahimi
4549*22dc650dSSadaf Ebrahimi /* Both formats must have a closing parenthesis */
4550*22dc650dSSadaf Ebrahimi
4551*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4552*22dc650dSSadaf Ebrahimi {
4553*22dc650dSSadaf Ebrahimi errorcode = ERR39;
4554*22dc650dSSadaf Ebrahimi goto FAILED;
4555*22dc650dSSadaf Ebrahimi }
4556*22dc650dSSadaf Ebrahimi ptr++;
4557*22dc650dSSadaf Ebrahimi
4558*22dc650dSSadaf Ebrahimi /* Remember the offset to the next item in the pattern, and set a default
4559*22dc650dSSadaf Ebrahimi length. This should get updated after the next item is read. */
4560*22dc650dSSadaf Ebrahimi
4561*22dc650dSSadaf Ebrahimi previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4562*22dc650dSSadaf Ebrahimi previous_callout[2] = 0;
4563*22dc650dSSadaf Ebrahimi break; /* End callout */
4564*22dc650dSSadaf Ebrahimi
4565*22dc650dSSadaf Ebrahimi
4566*22dc650dSSadaf Ebrahimi /* ---- Conditional group ---- */
4567*22dc650dSSadaf Ebrahimi
4568*22dc650dSSadaf Ebrahimi /* A condition can be an assertion, a number (referring to a numbered
4569*22dc650dSSadaf Ebrahimi group's having been set), a name (referring to a named group), or 'R',
4570*22dc650dSSadaf Ebrahimi referring to overall recursion. R<digits> and R&name are also permitted
4571*22dc650dSSadaf Ebrahimi for recursion state tests. Numbers may be preceded by + or - to specify a
4572*22dc650dSSadaf Ebrahimi relative group number.
4573*22dc650dSSadaf Ebrahimi
4574*22dc650dSSadaf Ebrahimi There are several syntaxes for testing a named group: (?(name)) is used
4575*22dc650dSSadaf Ebrahimi by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4576*22dc650dSSadaf Ebrahimi
4577*22dc650dSSadaf Ebrahimi There are two unfortunate ambiguities. 'R' can be the recursive thing or
4578*22dc650dSSadaf Ebrahimi the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4579*22dc650dSSadaf Ebrahimi the Perl DEFINE feature or the Python named test. We look for a name
4580*22dc650dSSadaf Ebrahimi first; if not found, we try the other case.
4581*22dc650dSSadaf Ebrahimi
4582*22dc650dSSadaf Ebrahimi For compatibility with auto-callouts, we allow a callout to be specified
4583*22dc650dSSadaf Ebrahimi before a condition that is an assertion. */
4584*22dc650dSSadaf Ebrahimi
4585*22dc650dSSadaf Ebrahimi case CHAR_LEFT_PARENTHESIS:
4586*22dc650dSSadaf Ebrahimi if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4587*22dc650dSSadaf Ebrahimi nest_depth++;
4588*22dc650dSSadaf Ebrahimi
4589*22dc650dSSadaf Ebrahimi /* If the next character is ? or * there must be an assertion next
4590*22dc650dSSadaf Ebrahimi (optionally preceded by a callout). We do not check this here, but
4591*22dc650dSSadaf Ebrahimi instead we set expect_cond_assert to 2. If this is still greater than
4592*22dc650dSSadaf Ebrahimi zero (callouts decrement it) when the next assertion is read, it will be
4593*22dc650dSSadaf Ebrahimi marked as a condition that must not be repeated. A value greater than
4594*22dc650dSSadaf Ebrahimi zero also causes checking that an assertion (possibly with callout)
4595*22dc650dSSadaf Ebrahimi follows. */
4596*22dc650dSSadaf Ebrahimi
4597*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4598*22dc650dSSadaf Ebrahimi {
4599*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_COND_ASSERT;
4600*22dc650dSSadaf Ebrahimi ptr--; /* Pull pointer back to the opening parenthesis. */
4601*22dc650dSSadaf Ebrahimi expect_cond_assert = 2;
4602*22dc650dSSadaf Ebrahimi break; /* End of conditional */
4603*22dc650dSSadaf Ebrahimi }
4604*22dc650dSSadaf Ebrahimi
4605*22dc650dSSadaf Ebrahimi /* Handle (?([+-]number)... */
4606*22dc650dSSadaf Ebrahimi
4607*22dc650dSSadaf Ebrahimi if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4608*22dc650dSSadaf Ebrahimi &errorcode))
4609*22dc650dSSadaf Ebrahimi {
4610*22dc650dSSadaf Ebrahimi if (i <= 0)
4611*22dc650dSSadaf Ebrahimi {
4612*22dc650dSSadaf Ebrahimi errorcode = ERR15;
4613*22dc650dSSadaf Ebrahimi goto FAILED;
4614*22dc650dSSadaf Ebrahimi }
4615*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_COND_NUMBER;
4616*22dc650dSSadaf Ebrahimi offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4617*22dc650dSSadaf Ebrahimi PUTOFFSET(offset, parsed_pattern);
4618*22dc650dSSadaf Ebrahimi *parsed_pattern++ = i;
4619*22dc650dSSadaf Ebrahimi }
4620*22dc650dSSadaf Ebrahimi else if (errorcode != 0) goto FAILED; /* Number too big */
4621*22dc650dSSadaf Ebrahimi
4622*22dc650dSSadaf Ebrahimi /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4623*22dc650dSSadaf Ebrahimi
4624*22dc650dSSadaf Ebrahimi else if (ptrend - ptr >= 10 &&
4625*22dc650dSSadaf Ebrahimi PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4626*22dc650dSSadaf Ebrahimi ptr[7] != CHAR_RIGHT_PARENTHESIS)
4627*22dc650dSSadaf Ebrahimi {
4628*22dc650dSSadaf Ebrahimi uint32_t ge = 0;
4629*22dc650dSSadaf Ebrahimi int major = 0;
4630*22dc650dSSadaf Ebrahimi int minor = 0;
4631*22dc650dSSadaf Ebrahimi
4632*22dc650dSSadaf Ebrahimi ptr += 7;
4633*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_GREATER_THAN_SIGN)
4634*22dc650dSSadaf Ebrahimi {
4635*22dc650dSSadaf Ebrahimi ge = 1;
4636*22dc650dSSadaf Ebrahimi ptr++;
4637*22dc650dSSadaf Ebrahimi }
4638*22dc650dSSadaf Ebrahimi
4639*22dc650dSSadaf Ebrahimi /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4640*22dc650dSSadaf Ebrahimi references its argument twice. */
4641*22dc650dSSadaf Ebrahimi
4642*22dc650dSSadaf Ebrahimi if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4643*22dc650dSSadaf Ebrahimi goto BAD_VERSION_CONDITION;
4644*22dc650dSSadaf Ebrahimi
4645*22dc650dSSadaf Ebrahimi if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4646*22dc650dSSadaf Ebrahimi goto FAILED;
4647*22dc650dSSadaf Ebrahimi
4648*22dc650dSSadaf Ebrahimi if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4649*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_DOT)
4650*22dc650dSSadaf Ebrahimi {
4651*22dc650dSSadaf Ebrahimi if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4652*22dc650dSSadaf Ebrahimi minor = (*ptr++ - CHAR_0) * 10;
4653*22dc650dSSadaf Ebrahimi if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4654*22dc650dSSadaf Ebrahimi if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4655*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4656*22dc650dSSadaf Ebrahimi goto BAD_VERSION_CONDITION;
4657*22dc650dSSadaf Ebrahimi }
4658*22dc650dSSadaf Ebrahimi
4659*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_COND_VERSION;
4660*22dc650dSSadaf Ebrahimi *parsed_pattern++ = ge;
4661*22dc650dSSadaf Ebrahimi *parsed_pattern++ = major;
4662*22dc650dSSadaf Ebrahimi *parsed_pattern++ = minor;
4663*22dc650dSSadaf Ebrahimi }
4664*22dc650dSSadaf Ebrahimi
4665*22dc650dSSadaf Ebrahimi /* All the remaining cases now require us to read a name. We cannot at
4666*22dc650dSSadaf Ebrahimi this stage distinguish ambiguous cases such as (?(R12) which might be a
4667*22dc650dSSadaf Ebrahimi recursion test by number or a name, because the named groups have not yet
4668*22dc650dSSadaf Ebrahimi all been identified. Those cases are treated as names, but given a
4669*22dc650dSSadaf Ebrahimi different META code. */
4670*22dc650dSSadaf Ebrahimi
4671*22dc650dSSadaf Ebrahimi else
4672*22dc650dSSadaf Ebrahimi {
4673*22dc650dSSadaf Ebrahimi BOOL was_r_ampersand = FALSE;
4674*22dc650dSSadaf Ebrahimi
4675*22dc650dSSadaf Ebrahimi if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4676*22dc650dSSadaf Ebrahimi {
4677*22dc650dSSadaf Ebrahimi terminator = CHAR_RIGHT_PARENTHESIS;
4678*22dc650dSSadaf Ebrahimi was_r_ampersand = TRUE;
4679*22dc650dSSadaf Ebrahimi ptr++;
4680*22dc650dSSadaf Ebrahimi }
4681*22dc650dSSadaf Ebrahimi else if (*ptr == CHAR_LESS_THAN_SIGN)
4682*22dc650dSSadaf Ebrahimi terminator = CHAR_GREATER_THAN_SIGN;
4683*22dc650dSSadaf Ebrahimi else if (*ptr == CHAR_APOSTROPHE)
4684*22dc650dSSadaf Ebrahimi terminator = CHAR_APOSTROPHE;
4685*22dc650dSSadaf Ebrahimi else
4686*22dc650dSSadaf Ebrahimi {
4687*22dc650dSSadaf Ebrahimi terminator = CHAR_RIGHT_PARENTHESIS;
4688*22dc650dSSadaf Ebrahimi ptr--; /* Point to char before name */
4689*22dc650dSSadaf Ebrahimi }
4690*22dc650dSSadaf Ebrahimi if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4691*22dc650dSSadaf Ebrahimi &errorcode, cb)) goto FAILED;
4692*22dc650dSSadaf Ebrahimi
4693*22dc650dSSadaf Ebrahimi /* Handle (?(R&name) */
4694*22dc650dSSadaf Ebrahimi
4695*22dc650dSSadaf Ebrahimi if (was_r_ampersand)
4696*22dc650dSSadaf Ebrahimi {
4697*22dc650dSSadaf Ebrahimi *parsed_pattern = META_COND_RNAME;
4698*22dc650dSSadaf Ebrahimi ptr--; /* Back to closing parens */
4699*22dc650dSSadaf Ebrahimi }
4700*22dc650dSSadaf Ebrahimi
4701*22dc650dSSadaf Ebrahimi /* Handle (?(name). If the name is "DEFINE" we identify it with a
4702*22dc650dSSadaf Ebrahimi special code. Likewise if the name consists of R followed only by
4703*22dc650dSSadaf Ebrahimi digits. Otherwise, handle it like a quoted name. */
4704*22dc650dSSadaf Ebrahimi
4705*22dc650dSSadaf Ebrahimi else if (terminator == CHAR_RIGHT_PARENTHESIS)
4706*22dc650dSSadaf Ebrahimi {
4707*22dc650dSSadaf Ebrahimi if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4708*22dc650dSSadaf Ebrahimi *parsed_pattern = META_COND_DEFINE;
4709*22dc650dSSadaf Ebrahimi else
4710*22dc650dSSadaf Ebrahimi {
4711*22dc650dSSadaf Ebrahimi for (i = 1; i < (int)namelen; i++)
4712*22dc650dSSadaf Ebrahimi if (!IS_DIGIT(name[i])) break;
4713*22dc650dSSadaf Ebrahimi *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4714*22dc650dSSadaf Ebrahimi META_COND_RNUMBER : META_COND_NAME;
4715*22dc650dSSadaf Ebrahimi }
4716*22dc650dSSadaf Ebrahimi ptr--; /* Back to closing parens */
4717*22dc650dSSadaf Ebrahimi }
4718*22dc650dSSadaf Ebrahimi
4719*22dc650dSSadaf Ebrahimi /* Handle (?('name') or (?(<name>) */
4720*22dc650dSSadaf Ebrahimi
4721*22dc650dSSadaf Ebrahimi else *parsed_pattern = META_COND_NAME;
4722*22dc650dSSadaf Ebrahimi
4723*22dc650dSSadaf Ebrahimi /* All these cases except DEFINE end with the name length and offset;
4724*22dc650dSSadaf Ebrahimi DEFINE just has an offset (for the "too many branches" error). */
4725*22dc650dSSadaf Ebrahimi
4726*22dc650dSSadaf Ebrahimi if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4727*22dc650dSSadaf Ebrahimi PUTOFFSET(offset, parsed_pattern);
4728*22dc650dSSadaf Ebrahimi } /* End cases that read a name */
4729*22dc650dSSadaf Ebrahimi
4730*22dc650dSSadaf Ebrahimi /* Check the closing parenthesis of the condition */
4731*22dc650dSSadaf Ebrahimi
4732*22dc650dSSadaf Ebrahimi if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4733*22dc650dSSadaf Ebrahimi {
4734*22dc650dSSadaf Ebrahimi errorcode = ERR24;
4735*22dc650dSSadaf Ebrahimi goto FAILED;
4736*22dc650dSSadaf Ebrahimi }
4737*22dc650dSSadaf Ebrahimi ptr++;
4738*22dc650dSSadaf Ebrahimi break; /* End of condition processing */
4739*22dc650dSSadaf Ebrahimi
4740*22dc650dSSadaf Ebrahimi
4741*22dc650dSSadaf Ebrahimi /* ---- Atomic group ---- */
4742*22dc650dSSadaf Ebrahimi
4743*22dc650dSSadaf Ebrahimi case CHAR_GREATER_THAN_SIGN:
4744*22dc650dSSadaf Ebrahimi ATOMIC_GROUP: /* Come from (*atomic: */
4745*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ATOMIC;
4746*22dc650dSSadaf Ebrahimi nest_depth++;
4747*22dc650dSSadaf Ebrahimi ptr++;
4748*22dc650dSSadaf Ebrahimi break;
4749*22dc650dSSadaf Ebrahimi
4750*22dc650dSSadaf Ebrahimi
4751*22dc650dSSadaf Ebrahimi /* ---- Lookahead assertions ---- */
4752*22dc650dSSadaf Ebrahimi
4753*22dc650dSSadaf Ebrahimi case CHAR_EQUALS_SIGN:
4754*22dc650dSSadaf Ebrahimi POSITIVE_LOOK_AHEAD: /* Come from (*pla: */
4755*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_LOOKAHEAD;
4756*22dc650dSSadaf Ebrahimi ptr++;
4757*22dc650dSSadaf Ebrahimi goto POST_ASSERTION;
4758*22dc650dSSadaf Ebrahimi
4759*22dc650dSSadaf Ebrahimi case CHAR_ASTERISK:
4760*22dc650dSSadaf Ebrahimi POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */
4761*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_LOOKAHEAD_NA;
4762*22dc650dSSadaf Ebrahimi ptr++;
4763*22dc650dSSadaf Ebrahimi goto POST_ASSERTION;
4764*22dc650dSSadaf Ebrahimi
4765*22dc650dSSadaf Ebrahimi case CHAR_EXCLAMATION_MARK:
4766*22dc650dSSadaf Ebrahimi NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
4767*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_LOOKAHEADNOT;
4768*22dc650dSSadaf Ebrahimi ptr++;
4769*22dc650dSSadaf Ebrahimi goto POST_ASSERTION;
4770*22dc650dSSadaf Ebrahimi
4771*22dc650dSSadaf Ebrahimi
4772*22dc650dSSadaf Ebrahimi /* ---- Lookbehind assertions ---- */
4773*22dc650dSSadaf Ebrahimi
4774*22dc650dSSadaf Ebrahimi /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4775*22dc650dSSadaf Ebrahimi is the start of the name of a capturing group. */
4776*22dc650dSSadaf Ebrahimi
4777*22dc650dSSadaf Ebrahimi case CHAR_LESS_THAN_SIGN:
4778*22dc650dSSadaf Ebrahimi if (ptrend - ptr <= 1 ||
4779*22dc650dSSadaf Ebrahimi (ptr[1] != CHAR_EQUALS_SIGN &&
4780*22dc650dSSadaf Ebrahimi ptr[1] != CHAR_EXCLAMATION_MARK &&
4781*22dc650dSSadaf Ebrahimi ptr[1] != CHAR_ASTERISK))
4782*22dc650dSSadaf Ebrahimi {
4783*22dc650dSSadaf Ebrahimi terminator = CHAR_GREATER_THAN_SIGN;
4784*22dc650dSSadaf Ebrahimi goto DEFINE_NAME;
4785*22dc650dSSadaf Ebrahimi }
4786*22dc650dSSadaf Ebrahimi *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4787*22dc650dSSadaf Ebrahimi META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4788*22dc650dSSadaf Ebrahimi META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4789*22dc650dSSadaf Ebrahimi
4790*22dc650dSSadaf Ebrahimi POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
4791*22dc650dSSadaf Ebrahimi *has_lookbehind = TRUE;
4792*22dc650dSSadaf Ebrahimi offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4793*22dc650dSSadaf Ebrahimi PUTOFFSET(offset, parsed_pattern);
4794*22dc650dSSadaf Ebrahimi ptr += 2;
4795*22dc650dSSadaf Ebrahimi /* Fall through */
4796*22dc650dSSadaf Ebrahimi
4797*22dc650dSSadaf Ebrahimi /* If the previous item was a condition starting (?(? an assertion,
4798*22dc650dSSadaf Ebrahimi optionally preceded by a callout, is expected. This is checked later on,
4799*22dc650dSSadaf Ebrahimi during actual compilation. However we need to identify this kind of
4800*22dc650dSSadaf Ebrahimi assertion in this pass because it must not be quantified. The value of
4801*22dc650dSSadaf Ebrahimi expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4802*22dc650dSSadaf Ebrahimi for a callout - still leaving a positive value that identifies the
4803*22dc650dSSadaf Ebrahimi assertion. Multiple callouts or any other items will make it zero or
4804*22dc650dSSadaf Ebrahimi less, which doesn't matter because they will cause an error later. */
4805*22dc650dSSadaf Ebrahimi
4806*22dc650dSSadaf Ebrahimi POST_ASSERTION:
4807*22dc650dSSadaf Ebrahimi nest_depth++;
4808*22dc650dSSadaf Ebrahimi if (prev_expect_cond_assert > 0)
4809*22dc650dSSadaf Ebrahimi {
4810*22dc650dSSadaf Ebrahimi if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4811*22dc650dSSadaf Ebrahimi else if (++top_nest >= end_nests)
4812*22dc650dSSadaf Ebrahimi {
4813*22dc650dSSadaf Ebrahimi errorcode = ERR84;
4814*22dc650dSSadaf Ebrahimi goto FAILED;
4815*22dc650dSSadaf Ebrahimi }
4816*22dc650dSSadaf Ebrahimi top_nest->nest_depth = nest_depth;
4817*22dc650dSSadaf Ebrahimi top_nest->flags = NSF_CONDASSERT;
4818*22dc650dSSadaf Ebrahimi top_nest->options = options & PARSE_TRACKED_OPTIONS;
4819*22dc650dSSadaf Ebrahimi top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4820*22dc650dSSadaf Ebrahimi }
4821*22dc650dSSadaf Ebrahimi break;
4822*22dc650dSSadaf Ebrahimi
4823*22dc650dSSadaf Ebrahimi
4824*22dc650dSSadaf Ebrahimi /* ---- Define a named group ---- */
4825*22dc650dSSadaf Ebrahimi
4826*22dc650dSSadaf Ebrahimi /* A named group may be defined as (?'name') or (?<name>). In the latter
4827*22dc650dSSadaf Ebrahimi case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4828*22dc650dSSadaf Ebrahimi terminator set to '>'. */
4829*22dc650dSSadaf Ebrahimi
4830*22dc650dSSadaf Ebrahimi case CHAR_APOSTROPHE:
4831*22dc650dSSadaf Ebrahimi terminator = CHAR_APOSTROPHE; /* Terminator */
4832*22dc650dSSadaf Ebrahimi
4833*22dc650dSSadaf Ebrahimi DEFINE_NAME:
4834*22dc650dSSadaf Ebrahimi if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4835*22dc650dSSadaf Ebrahimi &errorcode, cb)) goto FAILED;
4836*22dc650dSSadaf Ebrahimi
4837*22dc650dSSadaf Ebrahimi /* We have a name for this capturing group. It is also assigned a number,
4838*22dc650dSSadaf Ebrahimi which is its primary means of identification. */
4839*22dc650dSSadaf Ebrahimi
4840*22dc650dSSadaf Ebrahimi if (cb->bracount >= MAX_GROUP_NUMBER)
4841*22dc650dSSadaf Ebrahimi {
4842*22dc650dSSadaf Ebrahimi errorcode = ERR97;
4843*22dc650dSSadaf Ebrahimi goto FAILED;
4844*22dc650dSSadaf Ebrahimi }
4845*22dc650dSSadaf Ebrahimi cb->bracount++;
4846*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_CAPTURE | cb->bracount;
4847*22dc650dSSadaf Ebrahimi nest_depth++;
4848*22dc650dSSadaf Ebrahimi
4849*22dc650dSSadaf Ebrahimi /* Check not too many names */
4850*22dc650dSSadaf Ebrahimi
4851*22dc650dSSadaf Ebrahimi if (cb->names_found >= MAX_NAME_COUNT)
4852*22dc650dSSadaf Ebrahimi {
4853*22dc650dSSadaf Ebrahimi errorcode = ERR49;
4854*22dc650dSSadaf Ebrahimi goto FAILED;
4855*22dc650dSSadaf Ebrahimi }
4856*22dc650dSSadaf Ebrahimi
4857*22dc650dSSadaf Ebrahimi /* Adjust the entry size to accommodate the longest name found. */
4858*22dc650dSSadaf Ebrahimi
4859*22dc650dSSadaf Ebrahimi if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4860*22dc650dSSadaf Ebrahimi cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4861*22dc650dSSadaf Ebrahimi
4862*22dc650dSSadaf Ebrahimi /* Scan the list to check for duplicates. For duplicate names, if the
4863*22dc650dSSadaf Ebrahimi number is the same, break the loop, which causes the name to be
4864*22dc650dSSadaf Ebrahimi discarded; otherwise, if DUPNAMES is not set, give an error.
4865*22dc650dSSadaf Ebrahimi If it is set, allow the name with a different number, but continue
4866*22dc650dSSadaf Ebrahimi scanning in case this is a duplicate with the same number. For
4867*22dc650dSSadaf Ebrahimi non-duplicate names, give an error if the number is duplicated. */
4868*22dc650dSSadaf Ebrahimi
4869*22dc650dSSadaf Ebrahimi isdupname = FALSE;
4870*22dc650dSSadaf Ebrahimi ng = cb->named_groups;
4871*22dc650dSSadaf Ebrahimi for (i = 0; i < cb->names_found; i++, ng++)
4872*22dc650dSSadaf Ebrahimi {
4873*22dc650dSSadaf Ebrahimi if (namelen == ng->length &&
4874*22dc650dSSadaf Ebrahimi PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4875*22dc650dSSadaf Ebrahimi {
4876*22dc650dSSadaf Ebrahimi if (ng->number == cb->bracount) break;
4877*22dc650dSSadaf Ebrahimi if ((options & PCRE2_DUPNAMES) == 0)
4878*22dc650dSSadaf Ebrahimi {
4879*22dc650dSSadaf Ebrahimi errorcode = ERR43;
4880*22dc650dSSadaf Ebrahimi goto FAILED;
4881*22dc650dSSadaf Ebrahimi }
4882*22dc650dSSadaf Ebrahimi isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
4883*22dc650dSSadaf Ebrahimi cb->dupnames = TRUE; /* Duplicate names exist */
4884*22dc650dSSadaf Ebrahimi }
4885*22dc650dSSadaf Ebrahimi else if (ng->number == cb->bracount)
4886*22dc650dSSadaf Ebrahimi {
4887*22dc650dSSadaf Ebrahimi errorcode = ERR65;
4888*22dc650dSSadaf Ebrahimi goto FAILED;
4889*22dc650dSSadaf Ebrahimi }
4890*22dc650dSSadaf Ebrahimi }
4891*22dc650dSSadaf Ebrahimi
4892*22dc650dSSadaf Ebrahimi if (i < cb->names_found) break; /* Ignore duplicate with same number */
4893*22dc650dSSadaf Ebrahimi
4894*22dc650dSSadaf Ebrahimi /* Increase the list size if necessary */
4895*22dc650dSSadaf Ebrahimi
4896*22dc650dSSadaf Ebrahimi if (cb->names_found >= cb->named_group_list_size)
4897*22dc650dSSadaf Ebrahimi {
4898*22dc650dSSadaf Ebrahimi uint32_t newsize = cb->named_group_list_size * 2;
4899*22dc650dSSadaf Ebrahimi named_group *newspace =
4900*22dc650dSSadaf Ebrahimi cb->cx->memctl.malloc(newsize * sizeof(named_group),
4901*22dc650dSSadaf Ebrahimi cb->cx->memctl.memory_data);
4902*22dc650dSSadaf Ebrahimi if (newspace == NULL)
4903*22dc650dSSadaf Ebrahimi {
4904*22dc650dSSadaf Ebrahimi errorcode = ERR21;
4905*22dc650dSSadaf Ebrahimi goto FAILED;
4906*22dc650dSSadaf Ebrahimi }
4907*22dc650dSSadaf Ebrahimi
4908*22dc650dSSadaf Ebrahimi memcpy(newspace, cb->named_groups,
4909*22dc650dSSadaf Ebrahimi cb->named_group_list_size * sizeof(named_group));
4910*22dc650dSSadaf Ebrahimi if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4911*22dc650dSSadaf Ebrahimi cb->cx->memctl.free((void *)cb->named_groups,
4912*22dc650dSSadaf Ebrahimi cb->cx->memctl.memory_data);
4913*22dc650dSSadaf Ebrahimi cb->named_groups = newspace;
4914*22dc650dSSadaf Ebrahimi cb->named_group_list_size = newsize;
4915*22dc650dSSadaf Ebrahimi }
4916*22dc650dSSadaf Ebrahimi
4917*22dc650dSSadaf Ebrahimi /* Add this name to the list */
4918*22dc650dSSadaf Ebrahimi
4919*22dc650dSSadaf Ebrahimi cb->named_groups[cb->names_found].name = name;
4920*22dc650dSSadaf Ebrahimi cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4921*22dc650dSSadaf Ebrahimi cb->named_groups[cb->names_found].number = cb->bracount;
4922*22dc650dSSadaf Ebrahimi cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4923*22dc650dSSadaf Ebrahimi cb->names_found++;
4924*22dc650dSSadaf Ebrahimi break;
4925*22dc650dSSadaf Ebrahimi } /* End of (? switch */
4926*22dc650dSSadaf Ebrahimi break; /* End of ( handling */
4927*22dc650dSSadaf Ebrahimi
4928*22dc650dSSadaf Ebrahimi
4929*22dc650dSSadaf Ebrahimi /* ---- Branch terminators ---- */
4930*22dc650dSSadaf Ebrahimi
4931*22dc650dSSadaf Ebrahimi /* Alternation: reset the capture count if we are in a (?| group. */
4932*22dc650dSSadaf Ebrahimi
4933*22dc650dSSadaf Ebrahimi case CHAR_VERTICAL_LINE:
4934*22dc650dSSadaf Ebrahimi if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4935*22dc650dSSadaf Ebrahimi (top_nest->flags & NSF_RESET) != 0)
4936*22dc650dSSadaf Ebrahimi {
4937*22dc650dSSadaf Ebrahimi if (cb->bracount > top_nest->max_group)
4938*22dc650dSSadaf Ebrahimi top_nest->max_group = (uint16_t)cb->bracount;
4939*22dc650dSSadaf Ebrahimi cb->bracount = top_nest->reset_group;
4940*22dc650dSSadaf Ebrahimi }
4941*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ALT;
4942*22dc650dSSadaf Ebrahimi break;
4943*22dc650dSSadaf Ebrahimi
4944*22dc650dSSadaf Ebrahimi /* End of group; reset the capture count to the maximum if we are in a (?|
4945*22dc650dSSadaf Ebrahimi group and/or reset the options that are tracked during parsing. Disallow
4946*22dc650dSSadaf Ebrahimi quantifier for a condition that is an assertion. */
4947*22dc650dSSadaf Ebrahimi
4948*22dc650dSSadaf Ebrahimi case CHAR_RIGHT_PARENTHESIS:
4949*22dc650dSSadaf Ebrahimi okquantifier = TRUE;
4950*22dc650dSSadaf Ebrahimi if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4951*22dc650dSSadaf Ebrahimi {
4952*22dc650dSSadaf Ebrahimi options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4953*22dc650dSSadaf Ebrahimi xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
4954*22dc650dSSadaf Ebrahimi if ((top_nest->flags & NSF_RESET) != 0 &&
4955*22dc650dSSadaf Ebrahimi top_nest->max_group > cb->bracount)
4956*22dc650dSSadaf Ebrahimi cb->bracount = top_nest->max_group;
4957*22dc650dSSadaf Ebrahimi if ((top_nest->flags & NSF_CONDASSERT) != 0)
4958*22dc650dSSadaf Ebrahimi okquantifier = FALSE;
4959*22dc650dSSadaf Ebrahimi
4960*22dc650dSSadaf Ebrahimi if ((top_nest->flags & NSF_ATOMICSR) != 0)
4961*22dc650dSSadaf Ebrahimi {
4962*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_KET;
4963*22dc650dSSadaf Ebrahimi }
4964*22dc650dSSadaf Ebrahimi
4965*22dc650dSSadaf Ebrahimi if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4966*22dc650dSSadaf Ebrahimi else top_nest--;
4967*22dc650dSSadaf Ebrahimi }
4968*22dc650dSSadaf Ebrahimi if (nest_depth == 0) /* Unmatched closing parenthesis */
4969*22dc650dSSadaf Ebrahimi {
4970*22dc650dSSadaf Ebrahimi errorcode = ERR22;
4971*22dc650dSSadaf Ebrahimi goto FAILED_BACK;
4972*22dc650dSSadaf Ebrahimi }
4973*22dc650dSSadaf Ebrahimi nest_depth--;
4974*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_KET;
4975*22dc650dSSadaf Ebrahimi break;
4976*22dc650dSSadaf Ebrahimi } /* End of switch on pattern character */
4977*22dc650dSSadaf Ebrahimi } /* End of main character scan loop */
4978*22dc650dSSadaf Ebrahimi
4979*22dc650dSSadaf Ebrahimi /* End of pattern reached. Check for missing ) at the end of a verb name. */
4980*22dc650dSSadaf Ebrahimi
4981*22dc650dSSadaf Ebrahimi if (inverbname && ptr >= ptrend)
4982*22dc650dSSadaf Ebrahimi {
4983*22dc650dSSadaf Ebrahimi errorcode = ERR60;
4984*22dc650dSSadaf Ebrahimi goto FAILED;
4985*22dc650dSSadaf Ebrahimi }
4986*22dc650dSSadaf Ebrahimi
4987*22dc650dSSadaf Ebrahimi /* Manage callout for the final item */
4988*22dc650dSSadaf Ebrahimi
4989*22dc650dSSadaf Ebrahimi PARSED_END:
4990*22dc650dSSadaf Ebrahimi parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4991*22dc650dSSadaf Ebrahimi parsed_pattern, cb);
4992*22dc650dSSadaf Ebrahimi
4993*22dc650dSSadaf Ebrahimi /* Insert trailing items for word and line matching (features provided for the
4994*22dc650dSSadaf Ebrahimi benefit of pcre2grep). */
4995*22dc650dSSadaf Ebrahimi
4996*22dc650dSSadaf Ebrahimi if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
4997*22dc650dSSadaf Ebrahimi {
4998*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_KET;
4999*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_DOLLAR;
5000*22dc650dSSadaf Ebrahimi }
5001*22dc650dSSadaf Ebrahimi else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5002*22dc650dSSadaf Ebrahimi {
5003*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_KET;
5004*22dc650dSSadaf Ebrahimi *parsed_pattern++ = META_ESCAPE + ESC_b;
5005*22dc650dSSadaf Ebrahimi }
5006*22dc650dSSadaf Ebrahimi
5007*22dc650dSSadaf Ebrahimi /* Terminate the parsed pattern, then return success if all groups are closed.
5008*22dc650dSSadaf Ebrahimi Otherwise we have unclosed parentheses. */
5009*22dc650dSSadaf Ebrahimi
5010*22dc650dSSadaf Ebrahimi if (parsed_pattern >= parsed_pattern_end)
5011*22dc650dSSadaf Ebrahimi {
5012*22dc650dSSadaf Ebrahimi errorcode = ERR63; /* Internal error (parsed pattern overflow) */
5013*22dc650dSSadaf Ebrahimi goto FAILED;
5014*22dc650dSSadaf Ebrahimi }
5015*22dc650dSSadaf Ebrahimi
5016*22dc650dSSadaf Ebrahimi *parsed_pattern = META_END;
5017*22dc650dSSadaf Ebrahimi if (nest_depth == 0) return 0;
5018*22dc650dSSadaf Ebrahimi
5019*22dc650dSSadaf Ebrahimi UNCLOSED_PARENTHESIS:
5020*22dc650dSSadaf Ebrahimi errorcode = ERR14;
5021*22dc650dSSadaf Ebrahimi
5022*22dc650dSSadaf Ebrahimi /* Come here for all failures. */
5023*22dc650dSSadaf Ebrahimi
5024*22dc650dSSadaf Ebrahimi FAILED:
5025*22dc650dSSadaf Ebrahimi cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5026*22dc650dSSadaf Ebrahimi return errorcode;
5027*22dc650dSSadaf Ebrahimi
5028*22dc650dSSadaf Ebrahimi /* Some errors need to indicate the previous character. */
5029*22dc650dSSadaf Ebrahimi
5030*22dc650dSSadaf Ebrahimi FAILED_BACK:
5031*22dc650dSSadaf Ebrahimi ptr--;
5032*22dc650dSSadaf Ebrahimi goto FAILED;
5033*22dc650dSSadaf Ebrahimi
5034*22dc650dSSadaf Ebrahimi /* This failure happens several times. */
5035*22dc650dSSadaf Ebrahimi
5036*22dc650dSSadaf Ebrahimi BAD_VERSION_CONDITION:
5037*22dc650dSSadaf Ebrahimi errorcode = ERR79;
5038*22dc650dSSadaf Ebrahimi goto FAILED;
5039*22dc650dSSadaf Ebrahimi }
5040*22dc650dSSadaf Ebrahimi
5041*22dc650dSSadaf Ebrahimi
5042*22dc650dSSadaf Ebrahimi
5043*22dc650dSSadaf Ebrahimi /*************************************************
5044*22dc650dSSadaf Ebrahimi * Find first significant opcode *
5045*22dc650dSSadaf Ebrahimi *************************************************/
5046*22dc650dSSadaf Ebrahimi
5047*22dc650dSSadaf Ebrahimi /* This is called by several functions that scan a compiled expression looking
5048*22dc650dSSadaf Ebrahimi for a fixed first character, or an anchoring opcode etc. It skips over things
5049*22dc650dSSadaf Ebrahimi that do not influence this. For some calls, it makes sense to skip negative
5050*22dc650dSSadaf Ebrahimi forward and all backward assertions, and also the \b assertion; for others it
5051*22dc650dSSadaf Ebrahimi does not.
5052*22dc650dSSadaf Ebrahimi
5053*22dc650dSSadaf Ebrahimi Arguments:
5054*22dc650dSSadaf Ebrahimi code pointer to the start of the group
5055*22dc650dSSadaf Ebrahimi skipassert TRUE if certain assertions are to be skipped
5056*22dc650dSSadaf Ebrahimi
5057*22dc650dSSadaf Ebrahimi Returns: pointer to the first significant opcode
5058*22dc650dSSadaf Ebrahimi */
5059*22dc650dSSadaf Ebrahimi
5060*22dc650dSSadaf Ebrahimi static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)5061*22dc650dSSadaf Ebrahimi first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5062*22dc650dSSadaf Ebrahimi {
5063*22dc650dSSadaf Ebrahimi for (;;)
5064*22dc650dSSadaf Ebrahimi {
5065*22dc650dSSadaf Ebrahimi switch ((int)*code)
5066*22dc650dSSadaf Ebrahimi {
5067*22dc650dSSadaf Ebrahimi case OP_ASSERT_NOT:
5068*22dc650dSSadaf Ebrahimi case OP_ASSERTBACK:
5069*22dc650dSSadaf Ebrahimi case OP_ASSERTBACK_NOT:
5070*22dc650dSSadaf Ebrahimi case OP_ASSERTBACK_NA:
5071*22dc650dSSadaf Ebrahimi if (!skipassert) return code;
5072*22dc650dSSadaf Ebrahimi do code += GET(code, 1); while (*code == OP_ALT);
5073*22dc650dSSadaf Ebrahimi code += PRIV(OP_lengths)[*code];
5074*22dc650dSSadaf Ebrahimi break;
5075*22dc650dSSadaf Ebrahimi
5076*22dc650dSSadaf Ebrahimi case OP_WORD_BOUNDARY:
5077*22dc650dSSadaf Ebrahimi case OP_NOT_WORD_BOUNDARY:
5078*22dc650dSSadaf Ebrahimi case OP_UCP_WORD_BOUNDARY:
5079*22dc650dSSadaf Ebrahimi case OP_NOT_UCP_WORD_BOUNDARY:
5080*22dc650dSSadaf Ebrahimi if (!skipassert) return code;
5081*22dc650dSSadaf Ebrahimi /* Fall through */
5082*22dc650dSSadaf Ebrahimi
5083*22dc650dSSadaf Ebrahimi case OP_CALLOUT:
5084*22dc650dSSadaf Ebrahimi case OP_CREF:
5085*22dc650dSSadaf Ebrahimi case OP_DNCREF:
5086*22dc650dSSadaf Ebrahimi case OP_RREF:
5087*22dc650dSSadaf Ebrahimi case OP_DNRREF:
5088*22dc650dSSadaf Ebrahimi case OP_FALSE:
5089*22dc650dSSadaf Ebrahimi case OP_TRUE:
5090*22dc650dSSadaf Ebrahimi code += PRIV(OP_lengths)[*code];
5091*22dc650dSSadaf Ebrahimi break;
5092*22dc650dSSadaf Ebrahimi
5093*22dc650dSSadaf Ebrahimi case OP_CALLOUT_STR:
5094*22dc650dSSadaf Ebrahimi code += GET(code, 1 + 2*LINK_SIZE);
5095*22dc650dSSadaf Ebrahimi break;
5096*22dc650dSSadaf Ebrahimi
5097*22dc650dSSadaf Ebrahimi case OP_SKIPZERO:
5098*22dc650dSSadaf Ebrahimi code += 2 + GET(code, 2) + LINK_SIZE;
5099*22dc650dSSadaf Ebrahimi break;
5100*22dc650dSSadaf Ebrahimi
5101*22dc650dSSadaf Ebrahimi case OP_COND:
5102*22dc650dSSadaf Ebrahimi case OP_SCOND:
5103*22dc650dSSadaf Ebrahimi if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
5104*22dc650dSSadaf Ebrahimi code[GET(code, 1)] != OP_KET) /* More than one branch */
5105*22dc650dSSadaf Ebrahimi return code;
5106*22dc650dSSadaf Ebrahimi code += GET(code, 1) + 1 + LINK_SIZE;
5107*22dc650dSSadaf Ebrahimi break;
5108*22dc650dSSadaf Ebrahimi
5109*22dc650dSSadaf Ebrahimi case OP_MARK:
5110*22dc650dSSadaf Ebrahimi case OP_COMMIT_ARG:
5111*22dc650dSSadaf Ebrahimi case OP_PRUNE_ARG:
5112*22dc650dSSadaf Ebrahimi case OP_SKIP_ARG:
5113*22dc650dSSadaf Ebrahimi case OP_THEN_ARG:
5114*22dc650dSSadaf Ebrahimi code += code[1] + PRIV(OP_lengths)[*code];
5115*22dc650dSSadaf Ebrahimi break;
5116*22dc650dSSadaf Ebrahimi
5117*22dc650dSSadaf Ebrahimi default:
5118*22dc650dSSadaf Ebrahimi return code;
5119*22dc650dSSadaf Ebrahimi }
5120*22dc650dSSadaf Ebrahimi }
5121*22dc650dSSadaf Ebrahimi /* Control never reaches here */
5122*22dc650dSSadaf Ebrahimi }
5123*22dc650dSSadaf Ebrahimi
5124*22dc650dSSadaf Ebrahimi
5125*22dc650dSSadaf Ebrahimi
#ifdef SUPPORT_UNICODE
/*************************************************
*             Get othercase range                *
*************************************************/

/* Given the unprocessed part of a class range, find the next piece of it that
has other-case equivalents. The scan starts at *cptr and never goes beyond d.
Each call reports one result and advances *cptr past the characters it has
dealt with, so the function can be called repeatedly until it yields -1.

There are two kinds of result. A character that belongs to a caseless set (and
is not excluded by "restricted") is reported alone, with the set's offset as
the return value. Otherwise, a run of consecutive characters whose single
other cases are also consecutive is reported as the range *ocptr to *odptr.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range
  restricted  TRUE if caseless restriction applies

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases;
                   for this return, *ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr, BOOL restricted)
{
uint32_t ch, other, expected;
unsigned int caseset;

/* Phase 1: look for the first character that has any other case. The scan
gives up with -1 when it runs off the end of the input range or, in 32-bit
mode, when it passes the Unicode maximum. */

for (ch = *cptr; ; ch++)
  {
  if (ch > d) return -1;                 /* Reached end of range */

#if PCRE2_CODE_UNIT_WIDTH == 32
  if (ch > MAX_UTF_CODE_POINT) return -1;
#endif

  /* A character in a caseless set is returned on its own, unless the
  restriction is in force and the set begins with an ASCII value. */

  caseset = UCD_CASESET(ch);
  if (caseset != 0 &&
      (!restricted || PRIV(ucd_caseless_sets)[caseset] > 127))
    {
    *ocptr = ch;                         /* Character that has the set */
    *cptr = ch + 1;                      /* Rest of input range */
    return (int)caseset;
    }

  /* Otherwise stop at the first character whose single other case differs
  from itself. "restricted" is not tested here: the original code notes that
  the affected non-ASCII characters have no different other case. */

  other = UCD_OTHERCASE(ch);
  if (other != ch) break;
  }

/* Phase 2: ch has exactly one other case. Extend the run for as long as the
following characters are inside the input range, are not in a caseless set,
and map to the next consecutive other-case value. */

*ocptr = other;
expected = other + 1;
ch++;

while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;                   /* End of othercase range */
*cptr = ch;                              /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UNICODE */
5203*22dc650dSSadaf Ebrahimi
5204*22dc650dSSadaf Ebrahimi
5205*22dc650dSSadaf Ebrahimi
5206*22dc650dSSadaf Ebrahimi /*************************************************
5207*22dc650dSSadaf Ebrahimi * Add a character or range to a class (internal) *
5208*22dc650dSSadaf Ebrahimi *************************************************/
5209*22dc650dSSadaf Ebrahimi
5210*22dc650dSSadaf Ebrahimi /* This function packages up the logic of adding a character or range of
5211*22dc650dSSadaf Ebrahimi characters to a class. The character values in the arguments will be within the
5212*22dc650dSSadaf Ebrahimi valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
5213*22dc650dSSadaf Ebrahimi called only from within the "add to class" group of functions, some of which
5214*22dc650dSSadaf Ebrahimi are recursive and mutually recursive. The external entry point is
5215*22dc650dSSadaf Ebrahimi add_to_class().
5216*22dc650dSSadaf Ebrahimi
5217*22dc650dSSadaf Ebrahimi Arguments:
5218*22dc650dSSadaf Ebrahimi classbits the bit map for characters < 256
5219*22dc650dSSadaf Ebrahimi uchardptr points to the pointer for extra data
5220*22dc650dSSadaf Ebrahimi options the options bits
5221*22dc650dSSadaf Ebrahimi xoptions the extra options bits
5222*22dc650dSSadaf Ebrahimi cb compile data
5223*22dc650dSSadaf Ebrahimi start start of range character
5224*22dc650dSSadaf Ebrahimi end end of range character
5225*22dc650dSSadaf Ebrahimi
5226*22dc650dSSadaf Ebrahimi Returns: the number of < 256 characters added
5227*22dc650dSSadaf Ebrahimi the pointer to extra data is updated
5228*22dc650dSSadaf Ebrahimi */
5229*22dc650dSSadaf Ebrahimi
5230*22dc650dSSadaf Ebrahimi static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5231*22dc650dSSadaf Ebrahimi add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5232*22dc650dSSadaf Ebrahimi uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start,
5233*22dc650dSSadaf Ebrahimi uint32_t end)
5234*22dc650dSSadaf Ebrahimi {
5235*22dc650dSSadaf Ebrahimi uint32_t c;
5236*22dc650dSSadaf Ebrahimi uint32_t classbits_end = (end <= 0xff ? end : 0xff);
5237*22dc650dSSadaf Ebrahimi unsigned int n8 = 0;
5238*22dc650dSSadaf Ebrahimi
5239*22dc650dSSadaf Ebrahimi /* If caseless matching is required, scan the range and process alternate
5240*22dc650dSSadaf Ebrahimi cases. In Unicode, there are 8-bit characters that have alternate cases that
5241*22dc650dSSadaf Ebrahimi are greater than 255 and vice-versa (though these may be ignored if caseless
5242*22dc650dSSadaf Ebrahimi restriction is in force). Sometimes we can just extend the original range. */
5243*22dc650dSSadaf Ebrahimi
5244*22dc650dSSadaf Ebrahimi if ((options & PCRE2_CASELESS) != 0)
5245*22dc650dSSadaf Ebrahimi {
5246*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5247*22dc650dSSadaf Ebrahimi if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
5248*22dc650dSSadaf Ebrahimi {
5249*22dc650dSSadaf Ebrahimi int rc;
5250*22dc650dSSadaf Ebrahimi uint32_t oc, od;
5251*22dc650dSSadaf Ebrahimi
5252*22dc650dSSadaf Ebrahimi options &= ~PCRE2_CASELESS; /* Remove for recursive calls */
5253*22dc650dSSadaf Ebrahimi c = start;
5254*22dc650dSSadaf Ebrahimi
5255*22dc650dSSadaf Ebrahimi while ((rc = get_othercase_range(&c, end, &oc, &od,
5256*22dc650dSSadaf Ebrahimi (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)) >= 0)
5257*22dc650dSSadaf Ebrahimi {
5258*22dc650dSSadaf Ebrahimi /* Handle a single character that has more than one other case. */
5259*22dc650dSSadaf Ebrahimi
5260*22dc650dSSadaf Ebrahimi if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr,
5261*22dc650dSSadaf Ebrahimi options, xoptions, cb, PRIV(ucd_caseless_sets) + rc, oc);
5262*22dc650dSSadaf Ebrahimi
5263*22dc650dSSadaf Ebrahimi /* Do nothing if the other case range is within the original range. */
5264*22dc650dSSadaf Ebrahimi
5265*22dc650dSSadaf Ebrahimi else if (oc >= cb->class_range_start && od <= cb->class_range_end)
5266*22dc650dSSadaf Ebrahimi continue;
5267*22dc650dSSadaf Ebrahimi
5268*22dc650dSSadaf Ebrahimi /* Extend the original range if there is overlap, noting that if oc < c,
5269*22dc650dSSadaf Ebrahimi we can't have od > end because a subrange is always shorter than the
5270*22dc650dSSadaf Ebrahimi basic range. Otherwise, use a recursive call to add the additional range.
5271*22dc650dSSadaf Ebrahimi */
5272*22dc650dSSadaf Ebrahimi
5273*22dc650dSSadaf Ebrahimi else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
5274*22dc650dSSadaf Ebrahimi else if (od > end && oc <= end + 1)
5275*22dc650dSSadaf Ebrahimi {
5276*22dc650dSSadaf Ebrahimi end = od; /* Extend upwards */
5277*22dc650dSSadaf Ebrahimi if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
5278*22dc650dSSadaf Ebrahimi }
5279*22dc650dSSadaf Ebrahimi else n8 += add_to_class_internal(classbits, uchardptr, options, xoptions,
5280*22dc650dSSadaf Ebrahimi cb, oc, od);
5281*22dc650dSSadaf Ebrahimi }
5282*22dc650dSSadaf Ebrahimi }
5283*22dc650dSSadaf Ebrahimi else
5284*22dc650dSSadaf Ebrahimi #else
5285*22dc650dSSadaf Ebrahimi (void)xoptions; /* Avoid compiler warning */
5286*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
5287*22dc650dSSadaf Ebrahimi
5288*22dc650dSSadaf Ebrahimi /* Not UTF mode */
5289*22dc650dSSadaf Ebrahimi
5290*22dc650dSSadaf Ebrahimi for (c = start; c <= classbits_end; c++)
5291*22dc650dSSadaf Ebrahimi {
5292*22dc650dSSadaf Ebrahimi SETBIT(classbits, cb->fcc[c]);
5293*22dc650dSSadaf Ebrahimi n8++;
5294*22dc650dSSadaf Ebrahimi }
5295*22dc650dSSadaf Ebrahimi }
5296*22dc650dSSadaf Ebrahimi
5297*22dc650dSSadaf Ebrahimi /* Now handle the originally supplied range. Adjust the final value according
5298*22dc650dSSadaf Ebrahimi to the bit length - this means that the same lists of (e.g.) horizontal spaces
5299*22dc650dSSadaf Ebrahimi can be used in all cases. */
5300*22dc650dSSadaf Ebrahimi
5301*22dc650dSSadaf Ebrahimi if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
5302*22dc650dSSadaf Ebrahimi end = MAX_NON_UTF_CHAR;
5303*22dc650dSSadaf Ebrahimi
5304*22dc650dSSadaf Ebrahimi if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5305*22dc650dSSadaf Ebrahimi
5306*22dc650dSSadaf Ebrahimi /* Use the bitmap for characters < 256. Otherwise use extra data.*/
5307*22dc650dSSadaf Ebrahimi
5308*22dc650dSSadaf Ebrahimi for (c = start; c <= classbits_end; c++)
5309*22dc650dSSadaf Ebrahimi {
5310*22dc650dSSadaf Ebrahimi /* Regardless of start, c will always be <= 255. */
5311*22dc650dSSadaf Ebrahimi SETBIT(classbits, c);
5312*22dc650dSSadaf Ebrahimi n8++;
5313*22dc650dSSadaf Ebrahimi }
5314*22dc650dSSadaf Ebrahimi
5315*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
5316*22dc650dSSadaf Ebrahimi if (start <= 0xff) start = 0xff + 1;
5317*22dc650dSSadaf Ebrahimi
5318*22dc650dSSadaf Ebrahimi if (end >= start)
5319*22dc650dSSadaf Ebrahimi {
5320*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *uchardata = *uchardptr;
5321*22dc650dSSadaf Ebrahimi
5322*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5323*22dc650dSSadaf Ebrahimi if ((options & PCRE2_UTF) != 0)
5324*22dc650dSSadaf Ebrahimi {
5325*22dc650dSSadaf Ebrahimi if (start < end)
5326*22dc650dSSadaf Ebrahimi {
5327*22dc650dSSadaf Ebrahimi *uchardata++ = XCL_RANGE;
5328*22dc650dSSadaf Ebrahimi uchardata += PRIV(ord2utf)(start, uchardata);
5329*22dc650dSSadaf Ebrahimi uchardata += PRIV(ord2utf)(end, uchardata);
5330*22dc650dSSadaf Ebrahimi }
5331*22dc650dSSadaf Ebrahimi else if (start == end)
5332*22dc650dSSadaf Ebrahimi {
5333*22dc650dSSadaf Ebrahimi *uchardata++ = XCL_SINGLE;
5334*22dc650dSSadaf Ebrahimi uchardata += PRIV(ord2utf)(start, uchardata);
5335*22dc650dSSadaf Ebrahimi }
5336*22dc650dSSadaf Ebrahimi }
5337*22dc650dSSadaf Ebrahimi else
5338*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
5339*22dc650dSSadaf Ebrahimi
5340*22dc650dSSadaf Ebrahimi /* Without UTF support, character values are constrained by the bit length,
5341*22dc650dSSadaf Ebrahimi and can only be > 256 for 16-bit and 32-bit libraries. */
5342*22dc650dSSadaf Ebrahimi
5343*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
5344*22dc650dSSadaf Ebrahimi {}
5345*22dc650dSSadaf Ebrahimi #else
5346*22dc650dSSadaf Ebrahimi if (start < end)
5347*22dc650dSSadaf Ebrahimi {
5348*22dc650dSSadaf Ebrahimi *uchardata++ = XCL_RANGE;
5349*22dc650dSSadaf Ebrahimi *uchardata++ = start;
5350*22dc650dSSadaf Ebrahimi *uchardata++ = end;
5351*22dc650dSSadaf Ebrahimi }
5352*22dc650dSSadaf Ebrahimi else if (start == end)
5353*22dc650dSSadaf Ebrahimi {
5354*22dc650dSSadaf Ebrahimi *uchardata++ = XCL_SINGLE;
5355*22dc650dSSadaf Ebrahimi *uchardata++ = start;
5356*22dc650dSSadaf Ebrahimi }
5357*22dc650dSSadaf Ebrahimi #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
5358*22dc650dSSadaf Ebrahimi *uchardptr = uchardata; /* Updata extra data pointer */
5359*22dc650dSSadaf Ebrahimi }
5360*22dc650dSSadaf Ebrahimi #else /* SUPPORT_WIDE_CHARS */
5361*22dc650dSSadaf Ebrahimi (void)uchardptr; /* Avoid compiler warning */
5362*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_WIDE_CHARS */
5363*22dc650dSSadaf Ebrahimi
5364*22dc650dSSadaf Ebrahimi return n8; /* Number of 8-bit characters */
5365*22dc650dSSadaf Ebrahimi }
5366*22dc650dSSadaf Ebrahimi
5367*22dc650dSSadaf Ebrahimi
5368*22dc650dSSadaf Ebrahimi
#ifdef SUPPORT_UNICODE
/*************************************************
*   Add a list of characters to a class (internal) *
*************************************************/

/* Add each entry of an ascending list of character values to a class, merging
runs of consecutive values so that each run is passed to
add_to_class_internal() as one range. A run that starts with the "except"
value is left out altogether. This function and add_to_class_internal() are
mutually recursive; it is used for the lists of case-equivalent characters.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options bits
  xoptions      the extra options bits
  cb            contains pointers to tables etc.
  p             points to row of 32-bit values, terminated by NOTACHAR
  except        character to omit; this is used when adding lists of
                  case-equivalent characters to avoid including the one we
                  already know about

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p,
  unsigned int except)
{
unsigned int total = 0;
const uint32_t *first = p;

while (*first < NOTACHAR)
  {
  const uint32_t *last = first;

  if (*first != except)
    {
    /* Advance "last" to the final member of the run of consecutive values. */

    while (last[1] == last[0] + 1) last++;
    total += add_to_class_internal(classbits, uchardptr, options, xoptions,
      cb, *first, *last);
    }

  first = last + 1;
  }

return total;
}
#endif
5413*22dc650dSSadaf Ebrahimi
5414*22dc650dSSadaf Ebrahimi
5415*22dc650dSSadaf Ebrahimi
5416*22dc650dSSadaf Ebrahimi /*************************************************
5417*22dc650dSSadaf Ebrahimi * External entry point for add range to class *
5418*22dc650dSSadaf Ebrahimi *************************************************/
5419*22dc650dSSadaf Ebrahimi
5420*22dc650dSSadaf Ebrahimi /* This function sets the overall range so that the internal functions can try
5421*22dc650dSSadaf Ebrahimi to avoid duplication when handling case-independence.
5422*22dc650dSSadaf Ebrahimi
5423*22dc650dSSadaf Ebrahimi Arguments:
5424*22dc650dSSadaf Ebrahimi classbits the bit map for characters < 256
5425*22dc650dSSadaf Ebrahimi uchardptr points to the pointer for extra data
5426*22dc650dSSadaf Ebrahimi options the options bits
5427*22dc650dSSadaf Ebrahimi xoptions the extra options bits
5428*22dc650dSSadaf Ebrahimi cb compile data
5429*22dc650dSSadaf Ebrahimi start start of range character
5430*22dc650dSSadaf Ebrahimi end end of range character
5431*22dc650dSSadaf Ebrahimi
5432*22dc650dSSadaf Ebrahimi Returns: the number of < 256 characters added
5433*22dc650dSSadaf Ebrahimi the pointer to extra data is updated
5434*22dc650dSSadaf Ebrahimi */
5435*22dc650dSSadaf Ebrahimi
5436*22dc650dSSadaf Ebrahimi static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5437*22dc650dSSadaf Ebrahimi add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5438*22dc650dSSadaf Ebrahimi uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end)
5439*22dc650dSSadaf Ebrahimi {
5440*22dc650dSSadaf Ebrahimi cb->class_range_start = start;
5441*22dc650dSSadaf Ebrahimi cb->class_range_end = end;
5442*22dc650dSSadaf Ebrahimi return add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5443*22dc650dSSadaf Ebrahimi start, end);
5444*22dc650dSSadaf Ebrahimi }
5445*22dc650dSSadaf Ebrahimi
5446*22dc650dSSadaf Ebrahimi
5447*22dc650dSSadaf Ebrahimi /*************************************************
5448*22dc650dSSadaf Ebrahimi * External entry point for add list to class *
5449*22dc650dSSadaf Ebrahimi *************************************************/
5450*22dc650dSSadaf Ebrahimi
5451*22dc650dSSadaf Ebrahimi /* This function is used for adding a list of horizontal or vertical whitespace
5452*22dc650dSSadaf Ebrahimi characters to a class. The list must be in order so that ranges of characters
5453*22dc650dSSadaf Ebrahimi can be detected and handled appropriately. This function sets the overall range
5454*22dc650dSSadaf Ebrahimi so that the internal functions can try to avoid duplication when handling
5455*22dc650dSSadaf Ebrahimi case-independence.
5456*22dc650dSSadaf Ebrahimi
5457*22dc650dSSadaf Ebrahimi Arguments:
5458*22dc650dSSadaf Ebrahimi classbits the bit map for characters < 256
5459*22dc650dSSadaf Ebrahimi uchardptr points to the pointer for extra data
5460*22dc650dSSadaf Ebrahimi options the options bits
5461*22dc650dSSadaf Ebrahimi xoptions the extra options bits
5462*22dc650dSSadaf Ebrahimi cb contains pointers to tables etc.
5463*22dc650dSSadaf Ebrahimi p points to row of 32-bit values, terminated by NOTACHAR
5464*22dc650dSSadaf Ebrahimi except character to omit; this is used when adding lists of
5465*22dc650dSSadaf Ebrahimi case-equivalent characters to avoid including the one we
5466*22dc650dSSadaf Ebrahimi already know about
5467*22dc650dSSadaf Ebrahimi
5468*22dc650dSSadaf Ebrahimi Returns: the number of < 256 characters added
5469*22dc650dSSadaf Ebrahimi the pointer to extra data is updated
5470*22dc650dSSadaf Ebrahimi */
5471*22dc650dSSadaf Ebrahimi
5472*22dc650dSSadaf Ebrahimi static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p,unsigned int except)5473*22dc650dSSadaf Ebrahimi add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5474*22dc650dSSadaf Ebrahimi uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except)
5475*22dc650dSSadaf Ebrahimi {
5476*22dc650dSSadaf Ebrahimi unsigned int n8 = 0;
5477*22dc650dSSadaf Ebrahimi while (p[0] < NOTACHAR)
5478*22dc650dSSadaf Ebrahimi {
5479*22dc650dSSadaf Ebrahimi unsigned int n = 0;
5480*22dc650dSSadaf Ebrahimi if (p[0] != except)
5481*22dc650dSSadaf Ebrahimi {
5482*22dc650dSSadaf Ebrahimi while(p[n+1] == p[0] + n + 1) n++;
5483*22dc650dSSadaf Ebrahimi cb->class_range_start = p[0];
5484*22dc650dSSadaf Ebrahimi cb->class_range_end = p[n];
5485*22dc650dSSadaf Ebrahimi n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5486*22dc650dSSadaf Ebrahimi p[0], p[n]);
5487*22dc650dSSadaf Ebrahimi }
5488*22dc650dSSadaf Ebrahimi p += n + 1;
5489*22dc650dSSadaf Ebrahimi }
5490*22dc650dSSadaf Ebrahimi return n8;
5491*22dc650dSSadaf Ebrahimi }
5492*22dc650dSSadaf Ebrahimi
5493*22dc650dSSadaf Ebrahimi
5494*22dc650dSSadaf Ebrahimi
5495*22dc650dSSadaf Ebrahimi /*************************************************
5496*22dc650dSSadaf Ebrahimi * Add characters not in a list to a class *
5497*22dc650dSSadaf Ebrahimi *************************************************/
5498*22dc650dSSadaf Ebrahimi
5499*22dc650dSSadaf Ebrahimi /* This function is used for adding the complement of a list of horizontal or
5500*22dc650dSSadaf Ebrahimi vertical whitespace to a class. The list must be in order.
5501*22dc650dSSadaf Ebrahimi
5502*22dc650dSSadaf Ebrahimi Arguments:
5503*22dc650dSSadaf Ebrahimi classbits the bit map for characters < 256
5504*22dc650dSSadaf Ebrahimi uchardptr points to the pointer for extra data
5505*22dc650dSSadaf Ebrahimi options the options bits
5506*22dc650dSSadaf Ebrahimi xoptions the extra options bits
5507*22dc650dSSadaf Ebrahimi cb contains pointers to tables etc.
5508*22dc650dSSadaf Ebrahimi p points to row of 32-bit values, terminated by NOTACHAR
5509*22dc650dSSadaf Ebrahimi
5510*22dc650dSSadaf Ebrahimi Returns: the number of < 256 characters added
5511*22dc650dSSadaf Ebrahimi the pointer to extra data is updated
5512*22dc650dSSadaf Ebrahimi */
5513*22dc650dSSadaf Ebrahimi
5514*22dc650dSSadaf Ebrahimi static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p)5515*22dc650dSSadaf Ebrahimi add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5516*22dc650dSSadaf Ebrahimi uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p)
5517*22dc650dSSadaf Ebrahimi {
5518*22dc650dSSadaf Ebrahimi BOOL utf = (options & PCRE2_UTF) != 0;
5519*22dc650dSSadaf Ebrahimi unsigned int n8 = 0;
5520*22dc650dSSadaf Ebrahimi if (p[0] > 0)
5521*22dc650dSSadaf Ebrahimi n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, 0, p[0] - 1);
5522*22dc650dSSadaf Ebrahimi while (p[0] < NOTACHAR)
5523*22dc650dSSadaf Ebrahimi {
5524*22dc650dSSadaf Ebrahimi while (p[1] == p[0] + 1) p++;
5525*22dc650dSSadaf Ebrahimi n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, p[0] + 1,
5526*22dc650dSSadaf Ebrahimi (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5527*22dc650dSSadaf Ebrahimi p++;
5528*22dc650dSSadaf Ebrahimi }
5529*22dc650dSSadaf Ebrahimi return n8;
5530*22dc650dSSadaf Ebrahimi }
5531*22dc650dSSadaf Ebrahimi
5532*22dc650dSSadaf Ebrahimi
5533*22dc650dSSadaf Ebrahimi
5534*22dc650dSSadaf Ebrahimi /*************************************************
5535*22dc650dSSadaf Ebrahimi * Find details of duplicate group names *
5536*22dc650dSSadaf Ebrahimi *************************************************/
5537*22dc650dSSadaf Ebrahimi
5538*22dc650dSSadaf Ebrahimi /* This is called from compile_branch() when it needs to know the index and
5539*22dc650dSSadaf Ebrahimi count of duplicates in the names table when processing named backreferences,
5540*22dc650dSSadaf Ebrahimi either directly, or as conditions.
5541*22dc650dSSadaf Ebrahimi
5542*22dc650dSSadaf Ebrahimi Arguments:
5543*22dc650dSSadaf Ebrahimi name points to the name
5544*22dc650dSSadaf Ebrahimi length the length of the name
5545*22dc650dSSadaf Ebrahimi indexptr where to put the index
5546*22dc650dSSadaf Ebrahimi countptr where to put the count of duplicates
5547*22dc650dSSadaf Ebrahimi errorcodeptr where to put an error code
5548*22dc650dSSadaf Ebrahimi cb the compile block
5549*22dc650dSSadaf Ebrahimi
5550*22dc650dSSadaf Ebrahimi Returns: TRUE if OK, FALSE if not, error code set
5551*22dc650dSSadaf Ebrahimi */
5552*22dc650dSSadaf Ebrahimi
5553*22dc650dSSadaf Ebrahimi static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5554*22dc650dSSadaf Ebrahimi find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5555*22dc650dSSadaf Ebrahimi int *countptr, int *errorcodeptr, compile_block *cb)
5556*22dc650dSSadaf Ebrahimi {
5557*22dc650dSSadaf Ebrahimi uint32_t i, groupnumber;
5558*22dc650dSSadaf Ebrahimi int count;
5559*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *slot = cb->name_table;
5560*22dc650dSSadaf Ebrahimi
5561*22dc650dSSadaf Ebrahimi /* Find the first entry in the table */
5562*22dc650dSSadaf Ebrahimi
5563*22dc650dSSadaf Ebrahimi for (i = 0; i < cb->names_found; i++)
5564*22dc650dSSadaf Ebrahimi {
5565*22dc650dSSadaf Ebrahimi if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5566*22dc650dSSadaf Ebrahimi slot[IMM2_SIZE+length] == 0) break;
5567*22dc650dSSadaf Ebrahimi slot += cb->name_entry_size;
5568*22dc650dSSadaf Ebrahimi }
5569*22dc650dSSadaf Ebrahimi
5570*22dc650dSSadaf Ebrahimi /* This should not occur, because this function is called only when we know we
5571*22dc650dSSadaf Ebrahimi have duplicate names. Give an internal error. */
5572*22dc650dSSadaf Ebrahimi
5573*22dc650dSSadaf Ebrahimi if (i >= cb->names_found)
5574*22dc650dSSadaf Ebrahimi {
5575*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR53;
5576*22dc650dSSadaf Ebrahimi cb->erroroffset = name - cb->start_pattern;
5577*22dc650dSSadaf Ebrahimi return FALSE;
5578*22dc650dSSadaf Ebrahimi }
5579*22dc650dSSadaf Ebrahimi
5580*22dc650dSSadaf Ebrahimi /* Record the index and then see how many duplicates there are, updating the
5581*22dc650dSSadaf Ebrahimi backref map and maximum back reference as we do. */
5582*22dc650dSSadaf Ebrahimi
5583*22dc650dSSadaf Ebrahimi *indexptr = i;
5584*22dc650dSSadaf Ebrahimi count = 0;
5585*22dc650dSSadaf Ebrahimi
5586*22dc650dSSadaf Ebrahimi for (;;)
5587*22dc650dSSadaf Ebrahimi {
5588*22dc650dSSadaf Ebrahimi count++;
5589*22dc650dSSadaf Ebrahimi groupnumber = GET2(slot,0);
5590*22dc650dSSadaf Ebrahimi cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5591*22dc650dSSadaf Ebrahimi if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5592*22dc650dSSadaf Ebrahimi if (++i >= cb->names_found) break;
5593*22dc650dSSadaf Ebrahimi slot += cb->name_entry_size;
5594*22dc650dSSadaf Ebrahimi if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5595*22dc650dSSadaf Ebrahimi (slot+IMM2_SIZE)[length] != 0) break;
5596*22dc650dSSadaf Ebrahimi }
5597*22dc650dSSadaf Ebrahimi
5598*22dc650dSSadaf Ebrahimi *countptr = count;
5599*22dc650dSSadaf Ebrahimi return TRUE;
5600*22dc650dSSadaf Ebrahimi }
5601*22dc650dSSadaf Ebrahimi
5602*22dc650dSSadaf Ebrahimi
5603*22dc650dSSadaf Ebrahimi
5604*22dc650dSSadaf Ebrahimi /*************************************************
5605*22dc650dSSadaf Ebrahimi * Compile one branch *
5606*22dc650dSSadaf Ebrahimi *************************************************/
5607*22dc650dSSadaf Ebrahimi
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5609*22dc650dSSadaf Ebrahimi the options are changed during the branch, the pointer is used to change the
5610*22dc650dSSadaf Ebrahimi external options bits. This function is used during the pre-compile phase when
5611*22dc650dSSadaf Ebrahimi we are trying to find out the amount of memory needed, as well as during the
5612*22dc650dSSadaf Ebrahimi real compile phase. The value of lengthptr distinguishes the two phases.
5613*22dc650dSSadaf Ebrahimi
5614*22dc650dSSadaf Ebrahimi Arguments:
5615*22dc650dSSadaf Ebrahimi optionsptr pointer to the option bits
5616*22dc650dSSadaf Ebrahimi xoptionsptr pointer to the extra option bits
5617*22dc650dSSadaf Ebrahimi codeptr points to the pointer to the current code point
5618*22dc650dSSadaf Ebrahimi pptrptr points to the current parsed pattern pointer
5619*22dc650dSSadaf Ebrahimi errorcodeptr points to error code variable
5620*22dc650dSSadaf Ebrahimi firstcuptr place to put the first required code unit
5621*22dc650dSSadaf Ebrahimi firstcuflagsptr place to put the first code unit flags
5622*22dc650dSSadaf Ebrahimi reqcuptr place to put the last required code unit
5623*22dc650dSSadaf Ebrahimi reqcuflagsptr place to put the last required code unit flags
5624*22dc650dSSadaf Ebrahimi bcptr points to current branch chain
5625*22dc650dSSadaf Ebrahimi open_caps points to current capitem
5626*22dc650dSSadaf Ebrahimi cb contains pointers to tables etc.
5627*22dc650dSSadaf Ebrahimi lengthptr NULL during the real compile phase
5628*22dc650dSSadaf Ebrahimi points to length accumulator during pre-compile phase
5629*22dc650dSSadaf Ebrahimi
5630*22dc650dSSadaf Ebrahimi Returns: 0 There's been an error, *errorcodeptr is non-zero
5631*22dc650dSSadaf Ebrahimi +1 Success, this branch must match at least one character
5632*22dc650dSSadaf Ebrahimi -1 Success, this branch may match an empty string
5633*22dc650dSSadaf Ebrahimi */
5634*22dc650dSSadaf Ebrahimi
5635*22dc650dSSadaf Ebrahimi static int
compile_branch(uint32_t * optionsptr,uint32_t * xoptionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)5636*22dc650dSSadaf Ebrahimi compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5637*22dc650dSSadaf Ebrahimi PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5638*22dc650dSSadaf Ebrahimi uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5639*22dc650dSSadaf Ebrahimi uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5640*22dc650dSSadaf Ebrahimi compile_block *cb, PCRE2_SIZE *lengthptr)
5641*22dc650dSSadaf Ebrahimi {
5642*22dc650dSSadaf Ebrahimi int bravalue = 0;
5643*22dc650dSSadaf Ebrahimi int okreturn = -1;
5644*22dc650dSSadaf Ebrahimi int group_return = 0;
5645*22dc650dSSadaf Ebrahimi uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
5646*22dc650dSSadaf Ebrahimi uint32_t greedy_default, greedy_non_default;
5647*22dc650dSSadaf Ebrahimi uint32_t repeat_type, op_type;
5648*22dc650dSSadaf Ebrahimi uint32_t options = *optionsptr; /* May change dynamically */
5649*22dc650dSSadaf Ebrahimi uint32_t xoptions = *xoptionsptr; /* May change dynamically */
5650*22dc650dSSadaf Ebrahimi uint32_t firstcu, reqcu;
5651*22dc650dSSadaf Ebrahimi uint32_t zeroreqcu, zerofirstcu;
5652*22dc650dSSadaf Ebrahimi uint32_t escape;
5653*22dc650dSSadaf Ebrahimi uint32_t *pptr = *pptrptr;
5654*22dc650dSSadaf Ebrahimi uint32_t meta, meta_arg;
5655*22dc650dSSadaf Ebrahimi uint32_t firstcuflags, reqcuflags;
5656*22dc650dSSadaf Ebrahimi uint32_t zeroreqcuflags, zerofirstcuflags;
5657*22dc650dSSadaf Ebrahimi uint32_t req_caseopt, reqvary, tempreqvary;
5658*22dc650dSSadaf Ebrahimi PCRE2_SIZE offset = 0;
5659*22dc650dSSadaf Ebrahimi PCRE2_SIZE length_prevgroup = 0;
5660*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *code = *codeptr;
5661*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *last_code = code;
5662*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *orig_code = code;
5663*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *tempcode;
5664*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *previous = NULL;
5665*22dc650dSSadaf Ebrahimi PCRE2_UCHAR op_previous;
5666*22dc650dSSadaf Ebrahimi BOOL groupsetfirstcu = FALSE;
5667*22dc650dSSadaf Ebrahimi BOOL had_accept = FALSE;
5668*22dc650dSSadaf Ebrahimi BOOL matched_char = FALSE;
5669*22dc650dSSadaf Ebrahimi BOOL previous_matched_char = FALSE;
5670*22dc650dSSadaf Ebrahimi BOOL reset_caseful = FALSE;
5671*22dc650dSSadaf Ebrahimi const uint8_t *cbits = cb->cbits;
5672*22dc650dSSadaf Ebrahimi uint8_t classbits[32];
5673*22dc650dSSadaf Ebrahimi
5674*22dc650dSSadaf Ebrahimi /* We can fish out the UTF setting once and for all into a BOOL, but we must
5675*22dc650dSSadaf Ebrahimi not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5676*22dc650dSSadaf Ebrahimi as we process the pattern. */
5677*22dc650dSSadaf Ebrahimi
5678*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5679*22dc650dSSadaf Ebrahimi BOOL utf = (options & PCRE2_UTF) != 0;
5680*22dc650dSSadaf Ebrahimi BOOL ucp = (options & PCRE2_UCP) != 0;
5681*22dc650dSSadaf Ebrahimi #else /* No Unicode support */
5682*22dc650dSSadaf Ebrahimi BOOL utf = FALSE;
5683*22dc650dSSadaf Ebrahimi #endif
5684*22dc650dSSadaf Ebrahimi
5685*22dc650dSSadaf Ebrahimi /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5686*22dc650dSSadaf Ebrahimi class_uchardata always so that it can be passed to add_to_class() always,
5687*22dc650dSSadaf Ebrahimi though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5688*22dc650dSSadaf Ebrahimi alternative calls for the different cases. */
5689*22dc650dSSadaf Ebrahimi
5690*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *class_uchardata;
5691*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
5692*22dc650dSSadaf Ebrahimi BOOL xclass;
5693*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *class_uchardata_base;
5694*22dc650dSSadaf Ebrahimi #endif
5695*22dc650dSSadaf Ebrahimi
5696*22dc650dSSadaf Ebrahimi /* Set up the default and non-default settings for greediness */
5697*22dc650dSSadaf Ebrahimi
5698*22dc650dSSadaf Ebrahimi greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5699*22dc650dSSadaf Ebrahimi greedy_non_default = greedy_default ^ 1;
5700*22dc650dSSadaf Ebrahimi
5701*22dc650dSSadaf Ebrahimi /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5702*22dc650dSSadaf Ebrahimi matching encountered yet". It gets changed to REQ_NONE if we hit something that
5703*22dc650dSSadaf Ebrahimi matches a non-fixed first unit; reqcu just remains unset if we never find one.
5704*22dc650dSSadaf Ebrahimi
5705*22dc650dSSadaf Ebrahimi When we hit a repeat whose minimum is zero, we may have to adjust these values
5706*22dc650dSSadaf Ebrahimi to take the zero repeat into account. This is implemented by setting them to
5707*22dc650dSSadaf Ebrahimi zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5708*22dc650dSSadaf Ebrahimi item types that can be repeated set these backoff variables appropriately. */
5709*22dc650dSSadaf Ebrahimi
5710*22dc650dSSadaf Ebrahimi firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5711*22dc650dSSadaf Ebrahimi firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5712*22dc650dSSadaf Ebrahimi
5713*22dc650dSSadaf Ebrahimi /* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
5714*22dc650dSSadaf Ebrahimi according to the current setting of the caseless flag. The REQ_CASELESS value
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
5716*22dc650dSSadaf Ebrahimi to record the case status of the value. This is used only for ASCII characters.
5717*22dc650dSSadaf Ebrahimi */
5718*22dc650dSSadaf Ebrahimi
5719*22dc650dSSadaf Ebrahimi req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5720*22dc650dSSadaf Ebrahimi
5721*22dc650dSSadaf Ebrahimi /* Switch on next META item until the end of the branch */
5722*22dc650dSSadaf Ebrahimi
5723*22dc650dSSadaf Ebrahimi for (;; pptr++)
5724*22dc650dSSadaf Ebrahimi {
5725*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
5726*22dc650dSSadaf Ebrahimi BOOL xclass_has_prop;
5727*22dc650dSSadaf Ebrahimi #endif
5728*22dc650dSSadaf Ebrahimi BOOL negate_class;
5729*22dc650dSSadaf Ebrahimi BOOL should_flip_negation;
5730*22dc650dSSadaf Ebrahimi BOOL match_all_or_no_wide_chars;
5731*22dc650dSSadaf Ebrahimi BOOL possessive_quantifier;
5732*22dc650dSSadaf Ebrahimi BOOL note_group_empty;
5733*22dc650dSSadaf Ebrahimi int class_has_8bitchar;
5734*22dc650dSSadaf Ebrahimi uint32_t mclength;
5735*22dc650dSSadaf Ebrahimi uint32_t skipunits;
5736*22dc650dSSadaf Ebrahimi uint32_t subreqcu, subfirstcu;
5737*22dc650dSSadaf Ebrahimi uint32_t groupnumber;
5738*22dc650dSSadaf Ebrahimi uint32_t verbarglen, verbculen;
5739*22dc650dSSadaf Ebrahimi uint32_t subreqcuflags, subfirstcuflags;
5740*22dc650dSSadaf Ebrahimi open_capitem *oc;
5741*22dc650dSSadaf Ebrahimi PCRE2_UCHAR mcbuffer[8];
5742*22dc650dSSadaf Ebrahimi
5743*22dc650dSSadaf Ebrahimi /* Get next META item in the pattern and its potential argument. */
5744*22dc650dSSadaf Ebrahimi
5745*22dc650dSSadaf Ebrahimi meta = META_CODE(*pptr);
5746*22dc650dSSadaf Ebrahimi meta_arg = META_DATA(*pptr);
5747*22dc650dSSadaf Ebrahimi
5748*22dc650dSSadaf Ebrahimi /* If we are in the pre-compile phase, accumulate the length used for the
5749*22dc650dSSadaf Ebrahimi previous cycle of this loop, unless the next item is a quantifier. */
5750*22dc650dSSadaf Ebrahimi
5751*22dc650dSSadaf Ebrahimi if (lengthptr != NULL)
5752*22dc650dSSadaf Ebrahimi {
5753*22dc650dSSadaf Ebrahimi if (code > cb->start_workspace + cb->workspace_size -
5754*22dc650dSSadaf Ebrahimi WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
5755*22dc650dSSadaf Ebrahimi {
5756*22dc650dSSadaf Ebrahimi *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5757*22dc650dSSadaf Ebrahimi ERR52 : ERR86;
5758*22dc650dSSadaf Ebrahimi return 0;
5759*22dc650dSSadaf Ebrahimi }
5760*22dc650dSSadaf Ebrahimi
5761*22dc650dSSadaf Ebrahimi /* There is at least one situation where code goes backwards: this is the
5762*22dc650dSSadaf Ebrahimi case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5763*22dc650dSSadaf Ebrahimi is processed, the whole class is eliminated. However, it is created first,
5764*22dc650dSSadaf Ebrahimi so we have to allow memory for it. Therefore, don't ever reduce the length
5765*22dc650dSSadaf Ebrahimi at this point. */
5766*22dc650dSSadaf Ebrahimi
5767*22dc650dSSadaf Ebrahimi if (code < last_code) code = last_code;
5768*22dc650dSSadaf Ebrahimi
5769*22dc650dSSadaf Ebrahimi /* If the next thing is not a quantifier, we add the length of the previous
5770*22dc650dSSadaf Ebrahimi item into the total, and reset the code pointer to the start of the
5771*22dc650dSSadaf Ebrahimi workspace. Otherwise leave the previous item available to be quantified. */
5772*22dc650dSSadaf Ebrahimi
5773*22dc650dSSadaf Ebrahimi if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5774*22dc650dSSadaf Ebrahimi {
5775*22dc650dSSadaf Ebrahimi if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5776*22dc650dSSadaf Ebrahimi {
5777*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR20; /* Integer overflow */
5778*22dc650dSSadaf Ebrahimi return 0;
5779*22dc650dSSadaf Ebrahimi }
5780*22dc650dSSadaf Ebrahimi *lengthptr += (PCRE2_SIZE)(code - orig_code);
5781*22dc650dSSadaf Ebrahimi if (*lengthptr > MAX_PATTERN_SIZE)
5782*22dc650dSSadaf Ebrahimi {
5783*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR20; /* Pattern is too large */
5784*22dc650dSSadaf Ebrahimi return 0;
5785*22dc650dSSadaf Ebrahimi }
5786*22dc650dSSadaf Ebrahimi code = orig_code;
5787*22dc650dSSadaf Ebrahimi }
5788*22dc650dSSadaf Ebrahimi
5789*22dc650dSSadaf Ebrahimi /* Remember where this code item starts so we can catch the "backwards"
5790*22dc650dSSadaf Ebrahimi case above next time round. */
5791*22dc650dSSadaf Ebrahimi
5792*22dc650dSSadaf Ebrahimi last_code = code;
5793*22dc650dSSadaf Ebrahimi }
5794*22dc650dSSadaf Ebrahimi
5795*22dc650dSSadaf Ebrahimi /* Process the next parsed pattern item. If it is not a quantifier, remember
5796*22dc650dSSadaf Ebrahimi where it starts so that it can be quantified when a quantifier follows.
5797*22dc650dSSadaf Ebrahimi Checking for the legality of quantifiers happens in parse_regex(), except for
5798*22dc650dSSadaf Ebrahimi a quantifier after an assertion that is a condition. */
5799*22dc650dSSadaf Ebrahimi
5800*22dc650dSSadaf Ebrahimi if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5801*22dc650dSSadaf Ebrahimi {
5802*22dc650dSSadaf Ebrahimi previous = code;
5803*22dc650dSSadaf Ebrahimi if (matched_char && !had_accept) okreturn = 1;
5804*22dc650dSSadaf Ebrahimi }
5805*22dc650dSSadaf Ebrahimi
5806*22dc650dSSadaf Ebrahimi previous_matched_char = matched_char;
5807*22dc650dSSadaf Ebrahimi matched_char = FALSE;
5808*22dc650dSSadaf Ebrahimi note_group_empty = FALSE;
5809*22dc650dSSadaf Ebrahimi skipunits = 0; /* Default value for most subgroups */
5810*22dc650dSSadaf Ebrahimi
5811*22dc650dSSadaf Ebrahimi switch(meta)
5812*22dc650dSSadaf Ebrahimi {
5813*22dc650dSSadaf Ebrahimi /* ===================================================================*/
5814*22dc650dSSadaf Ebrahimi /* The branch terminates at pattern end or | or ) */
5815*22dc650dSSadaf Ebrahimi
5816*22dc650dSSadaf Ebrahimi case META_END:
5817*22dc650dSSadaf Ebrahimi case META_ALT:
5818*22dc650dSSadaf Ebrahimi case META_KET:
5819*22dc650dSSadaf Ebrahimi *firstcuptr = firstcu;
5820*22dc650dSSadaf Ebrahimi *firstcuflagsptr = firstcuflags;
5821*22dc650dSSadaf Ebrahimi *reqcuptr = reqcu;
5822*22dc650dSSadaf Ebrahimi *reqcuflagsptr = reqcuflags;
5823*22dc650dSSadaf Ebrahimi *codeptr = code;
5824*22dc650dSSadaf Ebrahimi *pptrptr = pptr;
5825*22dc650dSSadaf Ebrahimi return okreturn;
5826*22dc650dSSadaf Ebrahimi
5827*22dc650dSSadaf Ebrahimi
5828*22dc650dSSadaf Ebrahimi /* ===================================================================*/
5829*22dc650dSSadaf Ebrahimi /* Handle single-character metacharacters. In multiline mode, ^ disables
5830*22dc650dSSadaf Ebrahimi the setting of any following char as a first character. */
5831*22dc650dSSadaf Ebrahimi
5832*22dc650dSSadaf Ebrahimi case META_CIRCUMFLEX:
5833*22dc650dSSadaf Ebrahimi if ((options & PCRE2_MULTILINE) != 0)
5834*22dc650dSSadaf Ebrahimi {
5835*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET)
5836*22dc650dSSadaf Ebrahimi zerofirstcuflags = firstcuflags = REQ_NONE;
5837*22dc650dSSadaf Ebrahimi *code++ = OP_CIRCM;
5838*22dc650dSSadaf Ebrahimi }
5839*22dc650dSSadaf Ebrahimi else *code++ = OP_CIRC;
5840*22dc650dSSadaf Ebrahimi break;
5841*22dc650dSSadaf Ebrahimi
5842*22dc650dSSadaf Ebrahimi case META_DOLLAR:
5843*22dc650dSSadaf Ebrahimi *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5844*22dc650dSSadaf Ebrahimi break;
5845*22dc650dSSadaf Ebrahimi
5846*22dc650dSSadaf Ebrahimi /* There can never be a first char if '.' is first, whatever happens about
5847*22dc650dSSadaf Ebrahimi repeats. The value of reqcu doesn't change either. */
5848*22dc650dSSadaf Ebrahimi
5849*22dc650dSSadaf Ebrahimi case META_DOT:
5850*22dc650dSSadaf Ebrahimi matched_char = TRUE;
5851*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5852*22dc650dSSadaf Ebrahimi zerofirstcu = firstcu;
5853*22dc650dSSadaf Ebrahimi zerofirstcuflags = firstcuflags;
5854*22dc650dSSadaf Ebrahimi zeroreqcu = reqcu;
5855*22dc650dSSadaf Ebrahimi zeroreqcuflags = reqcuflags;
5856*22dc650dSSadaf Ebrahimi *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5857*22dc650dSSadaf Ebrahimi break;
5858*22dc650dSSadaf Ebrahimi
5859*22dc650dSSadaf Ebrahimi
5860*22dc650dSSadaf Ebrahimi /* ===================================================================*/
5861*22dc650dSSadaf Ebrahimi /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5862*22dc650dSSadaf Ebrahimi Otherwise, an initial ']' is taken as a data character. When empty classes
5863*22dc650dSSadaf Ebrahimi are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5864*22dc650dSSadaf Ebrahimi match any character, so generate OP_ALLANY. */
5865*22dc650dSSadaf Ebrahimi
5866*22dc650dSSadaf Ebrahimi case META_CLASS_EMPTY:
5867*22dc650dSSadaf Ebrahimi case META_CLASS_EMPTY_NOT:
5868*22dc650dSSadaf Ebrahimi matched_char = TRUE;
5869*22dc650dSSadaf Ebrahimi *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5870*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5871*22dc650dSSadaf Ebrahimi zerofirstcu = firstcu;
5872*22dc650dSSadaf Ebrahimi zerofirstcuflags = firstcuflags;
5873*22dc650dSSadaf Ebrahimi break;
5874*22dc650dSSadaf Ebrahimi
5875*22dc650dSSadaf Ebrahimi
5876*22dc650dSSadaf Ebrahimi /* ===================================================================*/
5877*22dc650dSSadaf Ebrahimi /* Non-empty character class. If the included characters are all < 256, we
5878*22dc650dSSadaf Ebrahimi build a 32-byte bitmap of the permitted characters, except in the special
5879*22dc650dSSadaf Ebrahimi case where there is only one such character. For negated classes, we build
5880*22dc650dSSadaf Ebrahimi the map as usual, then invert it at the end. However, we use a different
5881*22dc650dSSadaf Ebrahimi opcode so that data characters > 255 can be handled correctly.
5882*22dc650dSSadaf Ebrahimi
5883*22dc650dSSadaf Ebrahimi If the class contains characters outside the 0-255 range, a different
5884*22dc650dSSadaf Ebrahimi opcode is compiled. It may optionally have a bit map for characters < 256,
5885*22dc650dSSadaf Ebrahimi but those above are explicitly listed afterwards. A flag code unit tells
5886*22dc650dSSadaf Ebrahimi whether the bitmap is present, and whether this is a negated class or
5887*22dc650dSSadaf Ebrahimi not. */
5888*22dc650dSSadaf Ebrahimi
5889*22dc650dSSadaf Ebrahimi case META_CLASS_NOT:
5890*22dc650dSSadaf Ebrahimi case META_CLASS:
5891*22dc650dSSadaf Ebrahimi matched_char = TRUE;
5892*22dc650dSSadaf Ebrahimi negate_class = meta == META_CLASS_NOT;
5893*22dc650dSSadaf Ebrahimi
5894*22dc650dSSadaf Ebrahimi /* We can optimize the case of a single character in a class by generating
5895*22dc650dSSadaf Ebrahimi OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5896*22dc650dSSadaf Ebrahimi negative. In the negative case there can be no first char if this item is
5897*22dc650dSSadaf Ebrahimi first, whatever repeat count may follow. In the case of reqcu, save the
5898*22dc650dSSadaf Ebrahimi previous value for reinstating. */
5899*22dc650dSSadaf Ebrahimi
5900*22dc650dSSadaf Ebrahimi /* NOTE: at present this optimization is not effective if the only
5901*22dc650dSSadaf Ebrahimi character in a class in 32-bit, non-UCP mode has its top bit set. */
5902*22dc650dSSadaf Ebrahimi
5903*22dc650dSSadaf Ebrahimi if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5904*22dc650dSSadaf Ebrahimi {
5905*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5906*22dc650dSSadaf Ebrahimi uint32_t d;
5907*22dc650dSSadaf Ebrahimi #endif
5908*22dc650dSSadaf Ebrahimi uint32_t c = pptr[1];
5909*22dc650dSSadaf Ebrahimi
5910*22dc650dSSadaf Ebrahimi pptr += 2; /* Move on to class end */
5911*22dc650dSSadaf Ebrahimi if (meta == META_CLASS) /* A positive one-char class can be */
5912*22dc650dSSadaf Ebrahimi { /* handled as a normal literal character. */
5913*22dc650dSSadaf Ebrahimi meta = c; /* Set up the character */
5914*22dc650dSSadaf Ebrahimi goto NORMAL_CHAR_SET;
5915*22dc650dSSadaf Ebrahimi }
5916*22dc650dSSadaf Ebrahimi
5917*22dc650dSSadaf Ebrahimi /* Handle a negative one-character class */
5918*22dc650dSSadaf Ebrahimi
5919*22dc650dSSadaf Ebrahimi zeroreqcu = reqcu;
5920*22dc650dSSadaf Ebrahimi zeroreqcuflags = reqcuflags;
5921*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5922*22dc650dSSadaf Ebrahimi zerofirstcu = firstcu;
5923*22dc650dSSadaf Ebrahimi zerofirstcuflags = firstcuflags;
5924*22dc650dSSadaf Ebrahimi
5925*22dc650dSSadaf Ebrahimi /* For caseless UTF or UCP mode, check whether this character has more
5926*22dc650dSSadaf Ebrahimi than one other case. If so, generate a special OP_NOTPROP item instead of
5927*22dc650dSSadaf Ebrahimi OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
5928*22dc650dSSadaf Ebrahimi caseless set that starts with an ASCII character. */
5929*22dc650dSSadaf Ebrahimi
5930*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5931*22dc650dSSadaf Ebrahimi if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5932*22dc650dSSadaf Ebrahimi (d = UCD_CASESET(c)) != 0 &&
5933*22dc650dSSadaf Ebrahimi ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
5934*22dc650dSSadaf Ebrahimi PRIV(ucd_caseless_sets)[d] > 127))
5935*22dc650dSSadaf Ebrahimi {
5936*22dc650dSSadaf Ebrahimi *code++ = OP_NOTPROP;
5937*22dc650dSSadaf Ebrahimi *code++ = PT_CLIST;
5938*22dc650dSSadaf Ebrahimi *code++ = d;
5939*22dc650dSSadaf Ebrahimi break; /* We are finished with this class */
5940*22dc650dSSadaf Ebrahimi }
5941*22dc650dSSadaf Ebrahimi #endif
5942*22dc650dSSadaf Ebrahimi /* Char has only one other (usable) case, or UCP not available */
5943*22dc650dSSadaf Ebrahimi
5944*22dc650dSSadaf Ebrahimi *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5945*22dc650dSSadaf Ebrahimi code += PUTCHAR(c, code);
5946*22dc650dSSadaf Ebrahimi break; /* We are finished with this class */
5947*22dc650dSSadaf Ebrahimi } /* End of 1-char optimization */
5948*22dc650dSSadaf Ebrahimi
5949*22dc650dSSadaf Ebrahimi /* Handle character classes that contain more than just one literal
5950*22dc650dSSadaf Ebrahimi character. If there are exactly two characters in a positive class, see if
5951*22dc650dSSadaf Ebrahimi they are case partners. This can be optimized to generate a caseless single
5952*22dc650dSSadaf Ebrahimi character match (which also sets first/required code units if relevant).
5953*22dc650dSSadaf Ebrahimi When casing restrictions apply, ignore a caseless set if both characters
5954*22dc650dSSadaf Ebrahimi are ASCII. */
5955*22dc650dSSadaf Ebrahimi
5956*22dc650dSSadaf Ebrahimi if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5957*22dc650dSSadaf Ebrahimi pptr[3] == META_CLASS_END)
5958*22dc650dSSadaf Ebrahimi {
5959*22dc650dSSadaf Ebrahimi uint32_t c = pptr[1];
5960*22dc650dSSadaf Ebrahimi
5961*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5962*22dc650dSSadaf Ebrahimi if (UCD_CASESET(c) == 0 ||
5963*22dc650dSSadaf Ebrahimi ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
5964*22dc650dSSadaf Ebrahimi c < 128 && pptr[2] < 128))
5965*22dc650dSSadaf Ebrahimi #endif
5966*22dc650dSSadaf Ebrahimi {
5967*22dc650dSSadaf Ebrahimi uint32_t d;
5968*22dc650dSSadaf Ebrahimi
5969*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
5970*22dc650dSSadaf Ebrahimi if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5971*22dc650dSSadaf Ebrahimi #endif
5972*22dc650dSSadaf Ebrahimi {
5973*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
5974*22dc650dSSadaf Ebrahimi if (c > 255) d = c; else
5975*22dc650dSSadaf Ebrahimi #endif
5976*22dc650dSSadaf Ebrahimi d = TABLE_GET(c, cb->fcc, c);
5977*22dc650dSSadaf Ebrahimi }
5978*22dc650dSSadaf Ebrahimi
5979*22dc650dSSadaf Ebrahimi if (c != d && pptr[2] == d)
5980*22dc650dSSadaf Ebrahimi {
5981*22dc650dSSadaf Ebrahimi pptr += 3; /* Move on to class end */
5982*22dc650dSSadaf Ebrahimi meta = c;
5983*22dc650dSSadaf Ebrahimi if ((options & PCRE2_CASELESS) == 0)
5984*22dc650dSSadaf Ebrahimi {
5985*22dc650dSSadaf Ebrahimi reset_caseful = TRUE;
5986*22dc650dSSadaf Ebrahimi options |= PCRE2_CASELESS;
5987*22dc650dSSadaf Ebrahimi req_caseopt = REQ_CASELESS;
5988*22dc650dSSadaf Ebrahimi }
5989*22dc650dSSadaf Ebrahimi goto CLASS_CASELESS_CHAR;
5990*22dc650dSSadaf Ebrahimi }
5991*22dc650dSSadaf Ebrahimi }
5992*22dc650dSSadaf Ebrahimi }
5993*22dc650dSSadaf Ebrahimi
5994*22dc650dSSadaf Ebrahimi /* If a non-extended class contains a negative special such as \S, we need
5995*22dc650dSSadaf Ebrahimi to flip the negation flag at the end, so that support for characters > 255
5996*22dc650dSSadaf Ebrahimi works correctly (they are all included in the class). An extended class may
5997*22dc650dSSadaf Ebrahimi need to insert specific matching or non-matching code for wide characters.
5998*22dc650dSSadaf Ebrahimi */
5999*22dc650dSSadaf Ebrahimi
6000*22dc650dSSadaf Ebrahimi should_flip_negation = match_all_or_no_wide_chars = FALSE;
6001*22dc650dSSadaf Ebrahimi
6002*22dc650dSSadaf Ebrahimi /* Extended class (xclass) will be used when characters > 255
6003*22dc650dSSadaf Ebrahimi might match. */
6004*22dc650dSSadaf Ebrahimi
6005*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
6006*22dc650dSSadaf Ebrahimi xclass = FALSE;
6007*22dc650dSSadaf Ebrahimi class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
6008*22dc650dSSadaf Ebrahimi class_uchardata_base = class_uchardata; /* Save the start */
6009*22dc650dSSadaf Ebrahimi #endif
6010*22dc650dSSadaf Ebrahimi
6011*22dc650dSSadaf Ebrahimi /* For optimization purposes, we track some properties of the class:
6012*22dc650dSSadaf Ebrahimi class_has_8bitchar will be non-zero if the class contains at least one
6013*22dc650dSSadaf Ebrahimi character with a code point less than 256; xclass_has_prop will be TRUE if
6014*22dc650dSSadaf Ebrahimi Unicode property checks are present in the class. */
6015*22dc650dSSadaf Ebrahimi
6016*22dc650dSSadaf Ebrahimi class_has_8bitchar = 0;
6017*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
6018*22dc650dSSadaf Ebrahimi xclass_has_prop = FALSE;
6019*22dc650dSSadaf Ebrahimi #endif
6020*22dc650dSSadaf Ebrahimi
6021*22dc650dSSadaf Ebrahimi /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
6022*22dc650dSSadaf Ebrahimi in a temporary bit of memory, in case the class contains fewer than two
6023*22dc650dSSadaf Ebrahimi 8-bit characters because in that case the compiled code doesn't use the bit
6024*22dc650dSSadaf Ebrahimi map. */
6025*22dc650dSSadaf Ebrahimi
6026*22dc650dSSadaf Ebrahimi memset(classbits, 0, 32 * sizeof(uint8_t));
6027*22dc650dSSadaf Ebrahimi
6028*22dc650dSSadaf Ebrahimi /* Process items until META_CLASS_END is reached. */
6029*22dc650dSSadaf Ebrahimi
6030*22dc650dSSadaf Ebrahimi while ((meta = *(++pptr)) != META_CLASS_END)
6031*22dc650dSSadaf Ebrahimi {
6032*22dc650dSSadaf Ebrahimi /* Handle POSIX classes such as [:alpha:] etc. */
6033*22dc650dSSadaf Ebrahimi
6034*22dc650dSSadaf Ebrahimi if (meta == META_POSIX || meta == META_POSIX_NEG)
6035*22dc650dSSadaf Ebrahimi {
6036*22dc650dSSadaf Ebrahimi BOOL local_negate = (meta == META_POSIX_NEG);
6037*22dc650dSSadaf Ebrahimi int posix_class = *(++pptr);
6038*22dc650dSSadaf Ebrahimi int taboffset, tabopt;
6039*22dc650dSSadaf Ebrahimi uint8_t pbits[32];
6040*22dc650dSSadaf Ebrahimi
6041*22dc650dSSadaf Ebrahimi should_flip_negation = local_negate; /* Note negative special */
6042*22dc650dSSadaf Ebrahimi
6043*22dc650dSSadaf Ebrahimi /* If matching is caseless, upper and lower are converted to alpha.
6044*22dc650dSSadaf Ebrahimi This relies on the fact that the class table starts with alpha,
6045*22dc650dSSadaf Ebrahimi lower, upper as the first 3 entries. */
6046*22dc650dSSadaf Ebrahimi
6047*22dc650dSSadaf Ebrahimi if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
6048*22dc650dSSadaf Ebrahimi posix_class = 0;
6049*22dc650dSSadaf Ebrahimi
6050*22dc650dSSadaf Ebrahimi /* When PCRE2_UCP is set, some of the POSIX classes are converted to
6051*22dc650dSSadaf Ebrahimi different escape sequences that use Unicode properties \p or \P.
6052*22dc650dSSadaf Ebrahimi Others that are not available via \p or \P have to generate
6053*22dc650dSSadaf Ebrahimi XCL_PROP/XCL_NOTPROP directly, which is done here. */
6054*22dc650dSSadaf Ebrahimi
6055*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
6056*22dc650dSSadaf Ebrahimi if ((options & PCRE2_UCP) != 0 &&
6057*22dc650dSSadaf Ebrahimi (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
6058*22dc650dSSadaf Ebrahimi {
6059*22dc650dSSadaf Ebrahimi switch(posix_class)
6060*22dc650dSSadaf Ebrahimi {
6061*22dc650dSSadaf Ebrahimi case PC_GRAPH:
6062*22dc650dSSadaf Ebrahimi case PC_PRINT:
6063*22dc650dSSadaf Ebrahimi case PC_PUNCT:
6064*22dc650dSSadaf Ebrahimi *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
6065*22dc650dSSadaf Ebrahimi *class_uchardata++ = (PCRE2_UCHAR)
6066*22dc650dSSadaf Ebrahimi ((posix_class == PC_GRAPH)? PT_PXGRAPH :
6067*22dc650dSSadaf Ebrahimi (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
6068*22dc650dSSadaf Ebrahimi *class_uchardata++ = 0;
6069*22dc650dSSadaf Ebrahimi xclass_has_prop = TRUE;
6070*22dc650dSSadaf Ebrahimi goto CONTINUE_CLASS;
6071*22dc650dSSadaf Ebrahimi
6072*22dc650dSSadaf Ebrahimi /* For the other POSIX classes (ex: ascii) we are going to
6073*22dc650dSSadaf Ebrahimi fall through to the non-UCP case and build a bit map for
6074*22dc650dSSadaf Ebrahimi characters with code points less than 256. However, if we are in
6075*22dc650dSSadaf Ebrahimi a negated POSIX class, characters with code points greater than
6076*22dc650dSSadaf Ebrahimi 255 must either all match or all not match, depending on whether
6077*22dc650dSSadaf Ebrahimi the whole class is not or is negated. For example, for
6078*22dc650dSSadaf Ebrahimi [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
6079*22dc650dSSadaf Ebrahimi they must not.
6080*22dc650dSSadaf Ebrahimi
6081*22dc650dSSadaf Ebrahimi In the special case where there are no xclass items, this is
6082*22dc650dSSadaf Ebrahimi automatically handled by the use of OP_CLASS or OP_NCLASS, but an
6083*22dc650dSSadaf Ebrahimi explicit range is needed for OP_XCLASS. Setting a flag here
6084*22dc650dSSadaf Ebrahimi causes the range to be generated later when it is known that
6085*22dc650dSSadaf Ebrahimi OP_XCLASS is required. In the 8-bit library this is relevant only in
6086*22dc650dSSadaf Ebrahimi utf mode, since no wide characters can exist otherwise. */
6087*22dc650dSSadaf Ebrahimi
6088*22dc650dSSadaf Ebrahimi default:
6089*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
6090*22dc650dSSadaf Ebrahimi if (utf)
6091*22dc650dSSadaf Ebrahimi #endif
6092*22dc650dSSadaf Ebrahimi match_all_or_no_wide_chars |= local_negate;
6093*22dc650dSSadaf Ebrahimi break;
6094*22dc650dSSadaf Ebrahimi }
6095*22dc650dSSadaf Ebrahimi }
6096*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
6097*22dc650dSSadaf Ebrahimi
6098*22dc650dSSadaf Ebrahimi /* In the non-UCP case, or when UCP makes no difference, we build the
6099*22dc650dSSadaf Ebrahimi bit map for the POSIX class in a chunk of local store because we may
6100*22dc650dSSadaf Ebrahimi be adding and subtracting from it, and we don't want to subtract bits
6101*22dc650dSSadaf Ebrahimi that may be in the main map already. At the end we or the result into
6102*22dc650dSSadaf Ebrahimi the bit map that is being built. */
6103*22dc650dSSadaf Ebrahimi
6104*22dc650dSSadaf Ebrahimi posix_class *= 3;
6105*22dc650dSSadaf Ebrahimi
6106*22dc650dSSadaf Ebrahimi /* Copy in the first table (always present) */
6107*22dc650dSSadaf Ebrahimi
6108*22dc650dSSadaf Ebrahimi memcpy(pbits, cbits + posix_class_maps[posix_class],
6109*22dc650dSSadaf Ebrahimi 32 * sizeof(uint8_t));
6110*22dc650dSSadaf Ebrahimi
6111*22dc650dSSadaf Ebrahimi /* If there is a second table, add or remove it as required. */
6112*22dc650dSSadaf Ebrahimi
6113*22dc650dSSadaf Ebrahimi taboffset = posix_class_maps[posix_class + 1];
6114*22dc650dSSadaf Ebrahimi tabopt = posix_class_maps[posix_class + 2];
6115*22dc650dSSadaf Ebrahimi
6116*22dc650dSSadaf Ebrahimi if (taboffset >= 0)
6117*22dc650dSSadaf Ebrahimi {
6118*22dc650dSSadaf Ebrahimi if (tabopt >= 0)
6119*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
6120*22dc650dSSadaf Ebrahimi else
6121*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
6122*22dc650dSSadaf Ebrahimi }
6123*22dc650dSSadaf Ebrahimi
6124*22dc650dSSadaf Ebrahimi /* Now see if we need to remove any special characters. An option
6125*22dc650dSSadaf Ebrahimi value of 1 removes vertical space and 2 removes underscore. */
6126*22dc650dSSadaf Ebrahimi
6127*22dc650dSSadaf Ebrahimi if (tabopt < 0) tabopt = -tabopt;
6128*22dc650dSSadaf Ebrahimi if (tabopt == 1) pbits[1] &= ~0x3c;
6129*22dc650dSSadaf Ebrahimi else if (tabopt == 2) pbits[11] &= 0x7f;
6130*22dc650dSSadaf Ebrahimi
6131*22dc650dSSadaf Ebrahimi /* Add the POSIX table or its complement into the main table that is
6132*22dc650dSSadaf Ebrahimi being built and we are done. */
6133*22dc650dSSadaf Ebrahimi
6134*22dc650dSSadaf Ebrahimi if (local_negate)
6135*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
6136*22dc650dSSadaf Ebrahimi else
6137*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
6138*22dc650dSSadaf Ebrahimi
6139*22dc650dSSadaf Ebrahimi /* Every class contains at least one < 256 character. */
6140*22dc650dSSadaf Ebrahimi
6141*22dc650dSSadaf Ebrahimi class_has_8bitchar = 1;
6142*22dc650dSSadaf Ebrahimi goto CONTINUE_CLASS; /* End of POSIX handling */
6143*22dc650dSSadaf Ebrahimi }
6144*22dc650dSSadaf Ebrahimi
6145*22dc650dSSadaf Ebrahimi /* Other than POSIX classes, the only items we should encounter are
6146*22dc650dSSadaf Ebrahimi \d-type escapes and literal characters (possibly as ranges). */
6147*22dc650dSSadaf Ebrahimi
6148*22dc650dSSadaf Ebrahimi if (meta == META_BIGVALUE)
6149*22dc650dSSadaf Ebrahimi {
6150*22dc650dSSadaf Ebrahimi meta = *(++pptr);
6151*22dc650dSSadaf Ebrahimi goto CLASS_LITERAL;
6152*22dc650dSSadaf Ebrahimi }
6153*22dc650dSSadaf Ebrahimi
6154*22dc650dSSadaf Ebrahimi /* Any other non-literal must be an escape */
6155*22dc650dSSadaf Ebrahimi
6156*22dc650dSSadaf Ebrahimi if (meta >= META_END)
6157*22dc650dSSadaf Ebrahimi {
6158*22dc650dSSadaf Ebrahimi if (META_CODE(meta) != META_ESCAPE)
6159*22dc650dSSadaf Ebrahimi {
6160*22dc650dSSadaf Ebrahimi #ifdef DEBUG_SHOW_PARSED
6161*22dc650dSSadaf Ebrahimi fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
6162*22dc650dSSadaf Ebrahimi "in character class\n", meta);
6163*22dc650dSSadaf Ebrahimi #endif
6164*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR89; /* Internal error - unrecognized. */
6165*22dc650dSSadaf Ebrahimi return 0;
6166*22dc650dSSadaf Ebrahimi }
6167*22dc650dSSadaf Ebrahimi escape = META_DATA(meta);
6168*22dc650dSSadaf Ebrahimi
6169*22dc650dSSadaf Ebrahimi /* Every class contains at least one < 256 character. */
6170*22dc650dSSadaf Ebrahimi
6171*22dc650dSSadaf Ebrahimi class_has_8bitchar++;
6172*22dc650dSSadaf Ebrahimi
6173*22dc650dSSadaf Ebrahimi switch(escape)
6174*22dc650dSSadaf Ebrahimi {
6175*22dc650dSSadaf Ebrahimi case ESC_d:
6176*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
6177*22dc650dSSadaf Ebrahimi break;
6178*22dc650dSSadaf Ebrahimi
6179*22dc650dSSadaf Ebrahimi case ESC_D:
6180*22dc650dSSadaf Ebrahimi should_flip_negation = TRUE;
6181*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++)
6182*22dc650dSSadaf Ebrahimi classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
6183*22dc650dSSadaf Ebrahimi break;
6184*22dc650dSSadaf Ebrahimi
6185*22dc650dSSadaf Ebrahimi case ESC_w:
6186*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
6187*22dc650dSSadaf Ebrahimi break;
6188*22dc650dSSadaf Ebrahimi
6189*22dc650dSSadaf Ebrahimi case ESC_W:
6190*22dc650dSSadaf Ebrahimi should_flip_negation = TRUE;
6191*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++)
6192*22dc650dSSadaf Ebrahimi classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
6193*22dc650dSSadaf Ebrahimi break;
6194*22dc650dSSadaf Ebrahimi
6195*22dc650dSSadaf Ebrahimi /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
6196*22dc650dSSadaf Ebrahimi 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
6197*22dc650dSSadaf Ebrahimi previously set by something earlier in the character class.
6198*22dc650dSSadaf Ebrahimi Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
6199*22dc650dSSadaf Ebrahimi we could just adjust the appropriate bit. From PCRE 8.34 we no
6200*22dc650dSSadaf Ebrahimi longer treat \s and \S specially. */
6201*22dc650dSSadaf Ebrahimi
6202*22dc650dSSadaf Ebrahimi case ESC_s:
6203*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
6204*22dc650dSSadaf Ebrahimi break;
6205*22dc650dSSadaf Ebrahimi
6206*22dc650dSSadaf Ebrahimi case ESC_S:
6207*22dc650dSSadaf Ebrahimi should_flip_negation = TRUE;
6208*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++)
6209*22dc650dSSadaf Ebrahimi classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
6210*22dc650dSSadaf Ebrahimi break;
6211*22dc650dSSadaf Ebrahimi
6212*22dc650dSSadaf Ebrahimi /* When adding the horizontal or vertical space lists to a class, or
6213*22dc650dSSadaf Ebrahimi their complements, disable PCRE2_CASELESS, because it just wastes
6214*22dc650dSSadaf Ebrahimi time, and in the "not-x" UTF cases can create unwanted duplicates in
6215*22dc650dSSadaf Ebrahimi the XCLASS list (provoked by characters that have more than one other
6216*22dc650dSSadaf Ebrahimi case and by both cases being in the same "not-x" sublist). */
6217*22dc650dSSadaf Ebrahimi
6218*22dc650dSSadaf Ebrahimi case ESC_h:
6219*22dc650dSSadaf Ebrahimi (void)add_list_to_class(classbits, &class_uchardata,
6220*22dc650dSSadaf Ebrahimi options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list),
6221*22dc650dSSadaf Ebrahimi NOTACHAR);
6222*22dc650dSSadaf Ebrahimi break;
6223*22dc650dSSadaf Ebrahimi
6224*22dc650dSSadaf Ebrahimi case ESC_H:
6225*22dc650dSSadaf Ebrahimi (void)add_not_list_to_class(classbits, &class_uchardata,
6226*22dc650dSSadaf Ebrahimi options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list));
6227*22dc650dSSadaf Ebrahimi break;
6228*22dc650dSSadaf Ebrahimi
6229*22dc650dSSadaf Ebrahimi case ESC_v:
6230*22dc650dSSadaf Ebrahimi (void)add_list_to_class(classbits, &class_uchardata,
6231*22dc650dSSadaf Ebrahimi options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list),
6232*22dc650dSSadaf Ebrahimi NOTACHAR);
6233*22dc650dSSadaf Ebrahimi break;
6234*22dc650dSSadaf Ebrahimi
6235*22dc650dSSadaf Ebrahimi case ESC_V:
6236*22dc650dSSadaf Ebrahimi (void)add_not_list_to_class(classbits, &class_uchardata,
6237*22dc650dSSadaf Ebrahimi options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list));
6238*22dc650dSSadaf Ebrahimi break;
6239*22dc650dSSadaf Ebrahimi
6240*22dc650dSSadaf Ebrahimi /* If Unicode is not supported, \P and \p are not allowed and are
6241*22dc650dSSadaf Ebrahimi faulted at parse time, so will never appear here. */
6242*22dc650dSSadaf Ebrahimi
6243*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
6244*22dc650dSSadaf Ebrahimi case ESC_p:
6245*22dc650dSSadaf Ebrahimi case ESC_P:
6246*22dc650dSSadaf Ebrahimi {
6247*22dc650dSSadaf Ebrahimi uint32_t ptype = *(++pptr) >> 16;
6248*22dc650dSSadaf Ebrahimi uint32_t pdata = *pptr & 0xffff;
6249*22dc650dSSadaf Ebrahimi *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
6250*22dc650dSSadaf Ebrahimi *class_uchardata++ = ptype;
6251*22dc650dSSadaf Ebrahimi *class_uchardata++ = pdata;
6252*22dc650dSSadaf Ebrahimi xclass_has_prop = TRUE;
6253*22dc650dSSadaf Ebrahimi class_has_8bitchar--; /* Undo! */
6254*22dc650dSSadaf Ebrahimi }
6255*22dc650dSSadaf Ebrahimi break;
6256*22dc650dSSadaf Ebrahimi #endif
6257*22dc650dSSadaf Ebrahimi }
6258*22dc650dSSadaf Ebrahimi
6259*22dc650dSSadaf Ebrahimi goto CONTINUE_CLASS;
6260*22dc650dSSadaf Ebrahimi } /* End handling \d-type escapes */
6261*22dc650dSSadaf Ebrahimi
6262*22dc650dSSadaf Ebrahimi /* A literal character may be followed by a range meta. At parse time
6263*22dc650dSSadaf Ebrahimi there are checks for out-of-order characters, for ranges where the two
6264*22dc650dSSadaf Ebrahimi characters are equal, and for hyphens that cannot indicate a range. At
6265*22dc650dSSadaf Ebrahimi this point, therefore, no checking is needed. */
6266*22dc650dSSadaf Ebrahimi
6267*22dc650dSSadaf Ebrahimi else
6268*22dc650dSSadaf Ebrahimi {
6269*22dc650dSSadaf Ebrahimi uint32_t c, d;
6270*22dc650dSSadaf Ebrahimi
6271*22dc650dSSadaf Ebrahimi CLASS_LITERAL:
6272*22dc650dSSadaf Ebrahimi c = d = meta;
6273*22dc650dSSadaf Ebrahimi
6274*22dc650dSSadaf Ebrahimi /* Remember if \r or \n were explicitly used */
6275*22dc650dSSadaf Ebrahimi
6276*22dc650dSSadaf Ebrahimi if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6277*22dc650dSSadaf Ebrahimi
6278*22dc650dSSadaf Ebrahimi /* Process a character range */
6279*22dc650dSSadaf Ebrahimi
6280*22dc650dSSadaf Ebrahimi if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6281*22dc650dSSadaf Ebrahimi {
6282*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
6283*22dc650dSSadaf Ebrahimi BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6284*22dc650dSSadaf Ebrahimi #endif
6285*22dc650dSSadaf Ebrahimi pptr += 2;
6286*22dc650dSSadaf Ebrahimi d = *pptr;
6287*22dc650dSSadaf Ebrahimi if (d == META_BIGVALUE) d = *(++pptr);
6288*22dc650dSSadaf Ebrahimi
6289*22dc650dSSadaf Ebrahimi /* Remember an explicit \r or \n, and add the range to the class. */
6290*22dc650dSSadaf Ebrahimi
6291*22dc650dSSadaf Ebrahimi if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6292*22dc650dSSadaf Ebrahimi
6293*22dc650dSSadaf Ebrahimi /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6294*22dc650dSSadaf Ebrahimi because there are holes in the encoding, and simply using the range
6295*22dc650dSSadaf Ebrahimi A-Z (for example) would include the characters in the holes. This
6296*22dc650dSSadaf Ebrahimi applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6297*22dc650dSSadaf Ebrahimi
6298*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
6299*22dc650dSSadaf Ebrahimi if (range_is_literal &&
6300*22dc650dSSadaf Ebrahimi (cb->ctypes[c] & ctype_letter) != 0 &&
6301*22dc650dSSadaf Ebrahimi (cb->ctypes[d] & ctype_letter) != 0 &&
6302*22dc650dSSadaf Ebrahimi (c <= CHAR_z) == (d <= CHAR_z))
6303*22dc650dSSadaf Ebrahimi {
6304*22dc650dSSadaf Ebrahimi uint32_t uc = (d <= CHAR_z)? 0 : 64;
6305*22dc650dSSadaf Ebrahimi uint32_t C = c - uc;
6306*22dc650dSSadaf Ebrahimi uint32_t D = d - uc;
6307*22dc650dSSadaf Ebrahimi
6308*22dc650dSSadaf Ebrahimi if (C <= CHAR_i)
6309*22dc650dSSadaf Ebrahimi {
6310*22dc650dSSadaf Ebrahimi class_has_8bitchar +=
6311*22dc650dSSadaf Ebrahimi add_to_class(classbits, &class_uchardata, options, xoptions,
6312*22dc650dSSadaf Ebrahimi cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc);
6313*22dc650dSSadaf Ebrahimi C = CHAR_j;
6314*22dc650dSSadaf Ebrahimi }
6315*22dc650dSSadaf Ebrahimi
6316*22dc650dSSadaf Ebrahimi if (C <= D && C <= CHAR_r)
6317*22dc650dSSadaf Ebrahimi {
6318*22dc650dSSadaf Ebrahimi class_has_8bitchar +=
6319*22dc650dSSadaf Ebrahimi add_to_class(classbits, &class_uchardata, options, xoptions,
6320*22dc650dSSadaf Ebrahimi cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc);
6321*22dc650dSSadaf Ebrahimi C = CHAR_s;
6322*22dc650dSSadaf Ebrahimi }
6323*22dc650dSSadaf Ebrahimi
6324*22dc650dSSadaf Ebrahimi if (C <= D)
6325*22dc650dSSadaf Ebrahimi {
6326*22dc650dSSadaf Ebrahimi class_has_8bitchar +=
6327*22dc650dSSadaf Ebrahimi add_to_class(classbits, &class_uchardata, options, xoptions,
6328*22dc650dSSadaf Ebrahimi cb, C + uc, D + uc);
6329*22dc650dSSadaf Ebrahimi }
6330*22dc650dSSadaf Ebrahimi }
6331*22dc650dSSadaf Ebrahimi else
6332*22dc650dSSadaf Ebrahimi #endif
6333*22dc650dSSadaf Ebrahimi /* Not an EBCDIC special range */
6334*22dc650dSSadaf Ebrahimi
6335*22dc650dSSadaf Ebrahimi class_has_8bitchar += add_to_class(classbits, &class_uchardata,
6336*22dc650dSSadaf Ebrahimi options, xoptions, cb, c, d);
6337*22dc650dSSadaf Ebrahimi goto CONTINUE_CLASS; /* Go get the next char in the class */
6338*22dc650dSSadaf Ebrahimi } /* End of range handling */
6339*22dc650dSSadaf Ebrahimi
6340*22dc650dSSadaf Ebrahimi
6341*22dc650dSSadaf Ebrahimi /* Handle a single character. */
6342*22dc650dSSadaf Ebrahimi
6343*22dc650dSSadaf Ebrahimi class_has_8bitchar +=
6344*22dc650dSSadaf Ebrahimi add_to_class(classbits, &class_uchardata, options, xoptions, cb,
6345*22dc650dSSadaf Ebrahimi meta, meta);
6346*22dc650dSSadaf Ebrahimi }
6347*22dc650dSSadaf Ebrahimi
6348*22dc650dSSadaf Ebrahimi /* Continue to the next item in the class. */
6349*22dc650dSSadaf Ebrahimi
6350*22dc650dSSadaf Ebrahimi CONTINUE_CLASS:
6351*22dc650dSSadaf Ebrahimi
6352*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
6353*22dc650dSSadaf Ebrahimi /* If any wide characters or Unicode properties have been encountered,
6354*22dc650dSSadaf Ebrahimi set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6355*22dc650dSSadaf Ebrahimi of the extra data and reset the pointer. This is so that very large
6356*22dc650dSSadaf Ebrahimi classes that contain a zillion wide characters or Unicode property tests
6357*22dc650dSSadaf Ebrahimi do not overwrite the workspace (which is on the stack). */
6358*22dc650dSSadaf Ebrahimi
6359*22dc650dSSadaf Ebrahimi if (class_uchardata > class_uchardata_base)
6360*22dc650dSSadaf Ebrahimi {
6361*22dc650dSSadaf Ebrahimi xclass = TRUE;
6362*22dc650dSSadaf Ebrahimi if (lengthptr != NULL)
6363*22dc650dSSadaf Ebrahimi {
6364*22dc650dSSadaf Ebrahimi *lengthptr += class_uchardata - class_uchardata_base;
6365*22dc650dSSadaf Ebrahimi class_uchardata = class_uchardata_base;
6366*22dc650dSSadaf Ebrahimi }
6367*22dc650dSSadaf Ebrahimi }
6368*22dc650dSSadaf Ebrahimi #endif
6369*22dc650dSSadaf Ebrahimi
6370*22dc650dSSadaf Ebrahimi continue; /* Needed to avoid error when not supporting wide chars */
6371*22dc650dSSadaf Ebrahimi } /* End of main class-processing loop */
6372*22dc650dSSadaf Ebrahimi
6373*22dc650dSSadaf Ebrahimi /* If this class is the first thing in the branch, there can be no first
6374*22dc650dSSadaf Ebrahimi char setting, whatever the repeat count. Any reqcu setting must remain
6375*22dc650dSSadaf Ebrahimi unchanged after any kind of repeat. */
6376*22dc650dSSadaf Ebrahimi
6377*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6378*22dc650dSSadaf Ebrahimi zerofirstcu = firstcu;
6379*22dc650dSSadaf Ebrahimi zerofirstcuflags = firstcuflags;
6380*22dc650dSSadaf Ebrahimi zeroreqcu = reqcu;
6381*22dc650dSSadaf Ebrahimi zeroreqcuflags = reqcuflags;
6382*22dc650dSSadaf Ebrahimi
6383*22dc650dSSadaf Ebrahimi /* If there are characters with values > 255, or Unicode property settings
6384*22dc650dSSadaf Ebrahimi (\p or \P), we have to compile an extended class, with its own opcode,
6385*22dc650dSSadaf Ebrahimi unless there were no property settings and there was a negated special such
6386*22dc650dSSadaf Ebrahimi as \S in the class, and PCRE2_UCP is not set, because in that case all
6387*22dc650dSSadaf Ebrahimi characters > 255 are in or not in the class, so any that were explicitly
6388*22dc650dSSadaf Ebrahimi given as well can be ignored.
6389*22dc650dSSadaf Ebrahimi
6390*22dc650dSSadaf Ebrahimi In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
6391*22dc650dSSadaf Ebrahimi present in a class, we either have to match or not match all wide
6392*22dc650dSSadaf Ebrahimi characters (depending on whether the whole class is or is not negated).
6393*22dc650dSSadaf Ebrahimi This requirement is indicated by match_all_or_no_wide_chars being true.
6394*22dc650dSSadaf Ebrahimi We do this by including an explicit range, which works in both cases.
6395*22dc650dSSadaf Ebrahimi This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6396*22dc650dSSadaf Ebrahimi cannot be any wide characters in 8-bit non-UTF mode.
6397*22dc650dSSadaf Ebrahimi
6398*22dc650dSSadaf Ebrahimi When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6399*22dc650dSSadaf Ebrahimi class where \S etc is present without PCRE2_UCP, causing an extended class
6400*22dc650dSSadaf Ebrahimi to be compiled, we make sure that all characters > 255 are included by
6401*22dc650dSSadaf Ebrahimi forcing match_all_or_no_wide_chars to be true.
6402*22dc650dSSadaf Ebrahimi
6403*22dc650dSSadaf Ebrahimi If, when generating an xclass, there are no characters < 256, we can omit
6404*22dc650dSSadaf Ebrahimi the bitmap in the actual compiled code. */
6405*22dc650dSSadaf Ebrahimi
6406*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
6407*22dc650dSSadaf Ebrahimi if (xclass && (
6408*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
6409*22dc650dSSadaf Ebrahimi (options & PCRE2_UCP) != 0 ||
6410*22dc650dSSadaf Ebrahimi #endif
6411*22dc650dSSadaf Ebrahimi xclass_has_prop || !should_flip_negation))
6412*22dc650dSSadaf Ebrahimi {
6413*22dc650dSSadaf Ebrahimi if (match_all_or_no_wide_chars || (
6414*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
6415*22dc650dSSadaf Ebrahimi utf &&
6416*22dc650dSSadaf Ebrahimi #endif
6417*22dc650dSSadaf Ebrahimi should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6418*22dc650dSSadaf Ebrahimi {
6419*22dc650dSSadaf Ebrahimi *class_uchardata++ = XCL_RANGE;
6420*22dc650dSSadaf Ebrahimi if (utf) /* Will always be utf in the 8-bit library */
6421*22dc650dSSadaf Ebrahimi {
6422*22dc650dSSadaf Ebrahimi class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6423*22dc650dSSadaf Ebrahimi class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6424*22dc650dSSadaf Ebrahimi }
6425*22dc650dSSadaf Ebrahimi else /* Can only happen for the 16-bit & 32-bit libraries */
6426*22dc650dSSadaf Ebrahimi {
6427*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 16
6428*22dc650dSSadaf Ebrahimi *class_uchardata++ = 0x100;
6429*22dc650dSSadaf Ebrahimi *class_uchardata++ = 0xffffu;
6430*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 32
6431*22dc650dSSadaf Ebrahimi *class_uchardata++ = 0x100;
6432*22dc650dSSadaf Ebrahimi *class_uchardata++ = 0xffffffffu;
6433*22dc650dSSadaf Ebrahimi #endif
6434*22dc650dSSadaf Ebrahimi }
6435*22dc650dSSadaf Ebrahimi }
6436*22dc650dSSadaf Ebrahimi *class_uchardata++ = XCL_END; /* Marks the end of extra data */
6437*22dc650dSSadaf Ebrahimi *code++ = OP_XCLASS;
6438*22dc650dSSadaf Ebrahimi code += LINK_SIZE;
6439*22dc650dSSadaf Ebrahimi *code = negate_class? XCL_NOT:0;
6440*22dc650dSSadaf Ebrahimi if (xclass_has_prop) *code |= XCL_HASPROP;
6441*22dc650dSSadaf Ebrahimi
6442*22dc650dSSadaf Ebrahimi /* If the map is required, move up the extra data to make room for it;
6443*22dc650dSSadaf Ebrahimi otherwise just move the code pointer to the end of the extra data. */
6444*22dc650dSSadaf Ebrahimi
6445*22dc650dSSadaf Ebrahimi if (class_has_8bitchar > 0)
6446*22dc650dSSadaf Ebrahimi {
6447*22dc650dSSadaf Ebrahimi *code++ |= XCL_MAP;
6448*22dc650dSSadaf Ebrahimi (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6449*22dc650dSSadaf Ebrahimi CU2BYTES(class_uchardata - code));
6450*22dc650dSSadaf Ebrahimi if (negate_class && !xclass_has_prop)
6451*22dc650dSSadaf Ebrahimi {
6452*22dc650dSSadaf Ebrahimi /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6453*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6454*22dc650dSSadaf Ebrahimi }
6455*22dc650dSSadaf Ebrahimi memcpy(code, classbits, 32);
6456*22dc650dSSadaf Ebrahimi code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6457*22dc650dSSadaf Ebrahimi }
6458*22dc650dSSadaf Ebrahimi else code = class_uchardata;
6459*22dc650dSSadaf Ebrahimi
6460*22dc650dSSadaf Ebrahimi /* Now fill in the complete length of the item */
6461*22dc650dSSadaf Ebrahimi
6462*22dc650dSSadaf Ebrahimi PUT(previous, 1, (int)(code - previous));
6463*22dc650dSSadaf Ebrahimi break; /* End of class handling */
6464*22dc650dSSadaf Ebrahimi }
6465*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_WIDE_CHARS */
6466*22dc650dSSadaf Ebrahimi
6467*22dc650dSSadaf Ebrahimi /* If there are no characters > 255, or they are all to be included or
6468*22dc650dSSadaf Ebrahimi excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6469*22dc650dSSadaf Ebrahimi whole class was negated and whether there were negative specials such as \S
6470*22dc650dSSadaf Ebrahimi (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6471*22dc650dSSadaf Ebrahimi negating it if necessary. */
6472*22dc650dSSadaf Ebrahimi
6473*22dc650dSSadaf Ebrahimi *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6474*22dc650dSSadaf Ebrahimi if (lengthptr == NULL) /* Save time in the pre-compile phase */
6475*22dc650dSSadaf Ebrahimi {
6476*22dc650dSSadaf Ebrahimi if (negate_class)
6477*22dc650dSSadaf Ebrahimi {
6478*22dc650dSSadaf Ebrahimi /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6479*22dc650dSSadaf Ebrahimi for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6480*22dc650dSSadaf Ebrahimi }
6481*22dc650dSSadaf Ebrahimi memcpy(code, classbits, 32);
6482*22dc650dSSadaf Ebrahimi }
6483*22dc650dSSadaf Ebrahimi code += 32 / sizeof(PCRE2_UCHAR);
6484*22dc650dSSadaf Ebrahimi break; /* End of class processing */
6485*22dc650dSSadaf Ebrahimi
6486*22dc650dSSadaf Ebrahimi
6487*22dc650dSSadaf Ebrahimi /* ===================================================================*/
6488*22dc650dSSadaf Ebrahimi /* Deal with (*VERB)s. */
6489*22dc650dSSadaf Ebrahimi
6490*22dc650dSSadaf Ebrahimi /* Check for open captures before ACCEPT and close those that are within
6491*22dc650dSSadaf Ebrahimi the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6492*22dc650dSSadaf Ebrahimi assertion. In the first pass, just accumulate the length required;
6493*22dc650dSSadaf Ebrahimi otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6494*22dc650dSSadaf Ebrahimi workspace overflow. Do not set firstcu after *ACCEPT. */
6495*22dc650dSSadaf Ebrahimi
6496*22dc650dSSadaf Ebrahimi case META_ACCEPT:
6497*22dc650dSSadaf Ebrahimi cb->had_accept = had_accept = TRUE;
6498*22dc650dSSadaf Ebrahimi for (oc = open_caps;
6499*22dc650dSSadaf Ebrahimi oc != NULL && oc->assert_depth >= cb->assert_depth;
6500*22dc650dSSadaf Ebrahimi oc = oc->next)
6501*22dc650dSSadaf Ebrahimi {
6502*22dc650dSSadaf Ebrahimi if (lengthptr != NULL)
6503*22dc650dSSadaf Ebrahimi {
6504*22dc650dSSadaf Ebrahimi *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6505*22dc650dSSadaf Ebrahimi }
6506*22dc650dSSadaf Ebrahimi else
6507*22dc650dSSadaf Ebrahimi {
6508*22dc650dSSadaf Ebrahimi *code++ = OP_CLOSE;
6509*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, oc->number);
6510*22dc650dSSadaf Ebrahimi }
6511*22dc650dSSadaf Ebrahimi }
6512*22dc650dSSadaf Ebrahimi *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6513*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6514*22dc650dSSadaf Ebrahimi break;
6515*22dc650dSSadaf Ebrahimi
6516*22dc650dSSadaf Ebrahimi case META_PRUNE:
6517*22dc650dSSadaf Ebrahimi case META_SKIP:
6518*22dc650dSSadaf Ebrahimi cb->had_pruneorskip = TRUE;
6519*22dc650dSSadaf Ebrahimi /* Fall through */
6520*22dc650dSSadaf Ebrahimi case META_COMMIT:
6521*22dc650dSSadaf Ebrahimi case META_FAIL:
6522*22dc650dSSadaf Ebrahimi *code++ = verbops[(meta - META_MARK) >> 16];
6523*22dc650dSSadaf Ebrahimi break;
6524*22dc650dSSadaf Ebrahimi
6525*22dc650dSSadaf Ebrahimi case META_THEN:
6526*22dc650dSSadaf Ebrahimi cb->external_flags |= PCRE2_HASTHEN;
6527*22dc650dSSadaf Ebrahimi *code++ = OP_THEN;
6528*22dc650dSSadaf Ebrahimi break;
6529*22dc650dSSadaf Ebrahimi
6530*22dc650dSSadaf Ebrahimi /* Handle verbs with arguments. Arguments can be very long, especially in
6531*22dc650dSSadaf Ebrahimi 16- and 32-bit modes, and can overflow the workspace in the first pass.
6532*22dc650dSSadaf Ebrahimi However, the argument length is constrained to be small enough to fit in
6533*22dc650dSSadaf Ebrahimi one code unit. This check happens in parse_regex(). In the first pass,
6534*22dc650dSSadaf Ebrahimi instead of putting the argument into memory, we just update the length
6535*22dc650dSSadaf Ebrahimi counter and set up an empty argument. */
6536*22dc650dSSadaf Ebrahimi
6537*22dc650dSSadaf Ebrahimi case META_THEN_ARG:
6538*22dc650dSSadaf Ebrahimi cb->external_flags |= PCRE2_HASTHEN;
6539*22dc650dSSadaf Ebrahimi goto VERB_ARG;
6540*22dc650dSSadaf Ebrahimi
6541*22dc650dSSadaf Ebrahimi case META_PRUNE_ARG:
6542*22dc650dSSadaf Ebrahimi case META_SKIP_ARG:
6543*22dc650dSSadaf Ebrahimi cb->had_pruneorskip = TRUE;
6544*22dc650dSSadaf Ebrahimi /* Fall through */
6545*22dc650dSSadaf Ebrahimi case META_MARK:
6546*22dc650dSSadaf Ebrahimi case META_COMMIT_ARG:
6547*22dc650dSSadaf Ebrahimi VERB_ARG:
6548*22dc650dSSadaf Ebrahimi *code++ = verbops[(meta - META_MARK) >> 16];
6549*22dc650dSSadaf Ebrahimi /* The length is in characters. */
6550*22dc650dSSadaf Ebrahimi verbarglen = *(++pptr);
6551*22dc650dSSadaf Ebrahimi verbculen = 0;
6552*22dc650dSSadaf Ebrahimi tempcode = code++;
6553*22dc650dSSadaf Ebrahimi for (int i = 0; i < (int)verbarglen; i++)
6554*22dc650dSSadaf Ebrahimi {
6555*22dc650dSSadaf Ebrahimi meta = *(++pptr);
6556*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
6557*22dc650dSSadaf Ebrahimi if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6558*22dc650dSSadaf Ebrahimi #endif
6559*22dc650dSSadaf Ebrahimi {
6560*22dc650dSSadaf Ebrahimi mclength = 1;
6561*22dc650dSSadaf Ebrahimi mcbuffer[0] = meta;
6562*22dc650dSSadaf Ebrahimi }
6563*22dc650dSSadaf Ebrahimi if (lengthptr != NULL) *lengthptr += mclength; else
6564*22dc650dSSadaf Ebrahimi {
6565*22dc650dSSadaf Ebrahimi memcpy(code, mcbuffer, CU2BYTES(mclength));
6566*22dc650dSSadaf Ebrahimi code += mclength;
6567*22dc650dSSadaf Ebrahimi verbculen += mclength;
6568*22dc650dSSadaf Ebrahimi }
6569*22dc650dSSadaf Ebrahimi }
6570*22dc650dSSadaf Ebrahimi
6571*22dc650dSSadaf Ebrahimi *tempcode = verbculen; /* Fill in the code unit length */
6572*22dc650dSSadaf Ebrahimi *code++ = 0; /* Terminating zero */
6573*22dc650dSSadaf Ebrahimi break;
6574*22dc650dSSadaf Ebrahimi
6575*22dc650dSSadaf Ebrahimi
6576*22dc650dSSadaf Ebrahimi /* ===================================================================*/
6577*22dc650dSSadaf Ebrahimi /* Handle options change. The new setting must be passed back for use in
6578*22dc650dSSadaf Ebrahimi subsequent branches. Reset the greedy defaults and the case value for
6579*22dc650dSSadaf Ebrahimi firstcu and reqcu. */
6580*22dc650dSSadaf Ebrahimi
6581*22dc650dSSadaf Ebrahimi case META_OPTIONS:
6582*22dc650dSSadaf Ebrahimi *optionsptr = options = *(++pptr);
6583*22dc650dSSadaf Ebrahimi *xoptionsptr = xoptions = *(++pptr);
6584*22dc650dSSadaf Ebrahimi greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6585*22dc650dSSadaf Ebrahimi greedy_non_default = greedy_default ^ 1;
6586*22dc650dSSadaf Ebrahimi req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6587*22dc650dSSadaf Ebrahimi break;
6588*22dc650dSSadaf Ebrahimi
6589*22dc650dSSadaf Ebrahimi
6590*22dc650dSSadaf Ebrahimi /* ===================================================================*/
6591*22dc650dSSadaf Ebrahimi /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6592*22dc650dSSadaf Ebrahimi because it could be a numerical check on recursion, or a name check on a
6593*22dc650dSSadaf Ebrahimi group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6594*22dc650dSSadaf Ebrahimi we can handle it either way. We first try for a name; if not found, process
6595*22dc650dSSadaf Ebrahimi the number. */
6596*22dc650dSSadaf Ebrahimi
6597*22dc650dSSadaf Ebrahimi case META_COND_RNUMBER: /* (?(Rdigits) */
6598*22dc650dSSadaf Ebrahimi case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
6599*22dc650dSSadaf Ebrahimi case META_COND_RNAME: /* (?(R&name) - test for recursion */
6600*22dc650dSSadaf Ebrahimi bravalue = OP_COND;
6601*22dc650dSSadaf Ebrahimi {
6602*22dc650dSSadaf Ebrahimi int count, index;
6603*22dc650dSSadaf Ebrahimi unsigned int i;
6604*22dc650dSSadaf Ebrahimi PCRE2_SPTR name;
6605*22dc650dSSadaf Ebrahimi named_group *ng = cb->named_groups;
6606*22dc650dSSadaf Ebrahimi uint32_t length = *(++pptr);
6607*22dc650dSSadaf Ebrahimi
6608*22dc650dSSadaf Ebrahimi GETPLUSOFFSET(offset, pptr);
6609*22dc650dSSadaf Ebrahimi name = cb->start_pattern + offset;
6610*22dc650dSSadaf Ebrahimi
6611*22dc650dSSadaf Ebrahimi /* In the first pass, the names generated in the pre-pass are available,
6612*22dc650dSSadaf Ebrahimi but the main name table has not yet been created. Scan the list of names
6613*22dc650dSSadaf Ebrahimi generated in the pre-pass in order to get a number and whether or not
6614*22dc650dSSadaf Ebrahimi this name is duplicated. If it is not duplicated, we can handle it as a
6615*22dc650dSSadaf Ebrahimi numerical group. */
6616*22dc650dSSadaf Ebrahimi
6617*22dc650dSSadaf Ebrahimi for (i = 0; i < cb->names_found; i++, ng++)
6618*22dc650dSSadaf Ebrahimi {
6619*22dc650dSSadaf Ebrahimi if (length == ng->length &&
6620*22dc650dSSadaf Ebrahimi PRIV(strncmp)(name, ng->name, length) == 0)
6621*22dc650dSSadaf Ebrahimi {
6622*22dc650dSSadaf Ebrahimi if (!ng->isdup)
6623*22dc650dSSadaf Ebrahimi {
6624*22dc650dSSadaf Ebrahimi code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6625*22dc650dSSadaf Ebrahimi PUT2(code, 2+LINK_SIZE, ng->number);
6626*22dc650dSSadaf Ebrahimi if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6627*22dc650dSSadaf Ebrahimi skipunits = 1+IMM2_SIZE;
6628*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS_NOTE_EMPTY;
6629*22dc650dSSadaf Ebrahimi }
6630*22dc650dSSadaf Ebrahimi break; /* Found a duplicated name */
6631*22dc650dSSadaf Ebrahimi }
6632*22dc650dSSadaf Ebrahimi }
6633*22dc650dSSadaf Ebrahimi
6634*22dc650dSSadaf Ebrahimi /* If the name was not found we have a bad reference, unless we are
6635*22dc650dSSadaf Ebrahimi dealing with R<digits>, which is treated as a recursion test by number.
6636*22dc650dSSadaf Ebrahimi */
6637*22dc650dSSadaf Ebrahimi
6638*22dc650dSSadaf Ebrahimi if (i >= cb->names_found)
6639*22dc650dSSadaf Ebrahimi {
6640*22dc650dSSadaf Ebrahimi groupnumber = 0;
6641*22dc650dSSadaf Ebrahimi if (meta == META_COND_RNUMBER)
6642*22dc650dSSadaf Ebrahimi {
6643*22dc650dSSadaf Ebrahimi for (i = 1; i < length; i++)
6644*22dc650dSSadaf Ebrahimi {
6645*22dc650dSSadaf Ebrahimi groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6646*22dc650dSSadaf Ebrahimi if (groupnumber > MAX_GROUP_NUMBER)
6647*22dc650dSSadaf Ebrahimi {
6648*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR61;
6649*22dc650dSSadaf Ebrahimi cb->erroroffset = offset + i;
6650*22dc650dSSadaf Ebrahimi return 0;
6651*22dc650dSSadaf Ebrahimi }
6652*22dc650dSSadaf Ebrahimi }
6653*22dc650dSSadaf Ebrahimi }
6654*22dc650dSSadaf Ebrahimi
6655*22dc650dSSadaf Ebrahimi if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6656*22dc650dSSadaf Ebrahimi {
6657*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR15;
6658*22dc650dSSadaf Ebrahimi cb->erroroffset = offset;
6659*22dc650dSSadaf Ebrahimi return 0;
6660*22dc650dSSadaf Ebrahimi }
6661*22dc650dSSadaf Ebrahimi
6662*22dc650dSSadaf Ebrahimi /* (?(Rdigits) is treated as a recursion test by number. A value of
6663*22dc650dSSadaf Ebrahimi zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6664*22dc650dSSadaf Ebrahimi translated into RREF_ANY (which is 0xffff). */
6665*22dc650dSSadaf Ebrahimi
6666*22dc650dSSadaf Ebrahimi if (groupnumber == 0) groupnumber = RREF_ANY;
6667*22dc650dSSadaf Ebrahimi code[1+LINK_SIZE] = OP_RREF;
6668*22dc650dSSadaf Ebrahimi PUT2(code, 2+LINK_SIZE, groupnumber);
6669*22dc650dSSadaf Ebrahimi skipunits = 1+IMM2_SIZE;
6670*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS_NOTE_EMPTY;
6671*22dc650dSSadaf Ebrahimi }
6672*22dc650dSSadaf Ebrahimi
6673*22dc650dSSadaf Ebrahimi /* A duplicated name was found. Note that if an R<digits> name is found
6674*22dc650dSSadaf Ebrahimi (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6675*22dc650dSSadaf Ebrahimi
6676*22dc650dSSadaf Ebrahimi code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6677*22dc650dSSadaf Ebrahimi
6678*22dc650dSSadaf Ebrahimi /* We have a duplicated name. In the compile pass we have to search the
6679*22dc650dSSadaf Ebrahimi main table in order to get the index and count values. */
6680*22dc650dSSadaf Ebrahimi
6681*22dc650dSSadaf Ebrahimi count = 0; /* Values for first pass (avoids compiler warning) */
6682*22dc650dSSadaf Ebrahimi index = 0;
6683*22dc650dSSadaf Ebrahimi if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6684*22dc650dSSadaf Ebrahimi &count, errorcodeptr, cb)) return 0;
6685*22dc650dSSadaf Ebrahimi
6686*22dc650dSSadaf Ebrahimi /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6687*22dc650dSSadaf Ebrahimi insert appropriate data values. */
6688*22dc650dSSadaf Ebrahimi
6689*22dc650dSSadaf Ebrahimi code[1+LINK_SIZE]++;
6690*22dc650dSSadaf Ebrahimi skipunits = 1+2*IMM2_SIZE;
6691*22dc650dSSadaf Ebrahimi PUT2(code, 2+LINK_SIZE, index);
6692*22dc650dSSadaf Ebrahimi PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6693*22dc650dSSadaf Ebrahimi }
6694*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS_NOTE_EMPTY;
6695*22dc650dSSadaf Ebrahimi
6696*22dc650dSSadaf Ebrahimi /* The DEFINE condition is always false. Its internal groups may never
6697*22dc650dSSadaf Ebrahimi be called, so matched_char must remain false, hence the jump to
6698*22dc650dSSadaf Ebrahimi GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6699*22dc650dSSadaf Ebrahimi
6700*22dc650dSSadaf Ebrahimi case META_COND_DEFINE:
6701*22dc650dSSadaf Ebrahimi bravalue = OP_COND;
6702*22dc650dSSadaf Ebrahimi GETPLUSOFFSET(offset, pptr);
6703*22dc650dSSadaf Ebrahimi code[1+LINK_SIZE] = OP_DEFINE;
6704*22dc650dSSadaf Ebrahimi skipunits = 1;
6705*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS;
6706*22dc650dSSadaf Ebrahimi
6707*22dc650dSSadaf Ebrahimi /* Conditional test of a group's being set. */
6708*22dc650dSSadaf Ebrahimi
6709*22dc650dSSadaf Ebrahimi case META_COND_NUMBER:
6710*22dc650dSSadaf Ebrahimi bravalue = OP_COND;
6711*22dc650dSSadaf Ebrahimi GETPLUSOFFSET(offset, pptr);
6712*22dc650dSSadaf Ebrahimi groupnumber = *(++pptr);
6713*22dc650dSSadaf Ebrahimi if (groupnumber > cb->bracount)
6714*22dc650dSSadaf Ebrahimi {
6715*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR15;
6716*22dc650dSSadaf Ebrahimi cb->erroroffset = offset;
6717*22dc650dSSadaf Ebrahimi return 0;
6718*22dc650dSSadaf Ebrahimi }
6719*22dc650dSSadaf Ebrahimi if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6720*22dc650dSSadaf Ebrahimi offset -= 2; /* Point at initial ( for too many branches error */
6721*22dc650dSSadaf Ebrahimi code[1+LINK_SIZE] = OP_CREF;
6722*22dc650dSSadaf Ebrahimi skipunits = 1+IMM2_SIZE;
6723*22dc650dSSadaf Ebrahimi PUT2(code, 2+LINK_SIZE, groupnumber);
6724*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS_NOTE_EMPTY;
6725*22dc650dSSadaf Ebrahimi
6726*22dc650dSSadaf Ebrahimi /* Test for the PCRE2 version. */
6727*22dc650dSSadaf Ebrahimi
6728*22dc650dSSadaf Ebrahimi case META_COND_VERSION:
6729*22dc650dSSadaf Ebrahimi bravalue = OP_COND;
6730*22dc650dSSadaf Ebrahimi if (pptr[1] > 0)
6731*22dc650dSSadaf Ebrahimi code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6732*22dc650dSSadaf Ebrahimi (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6733*22dc650dSSadaf Ebrahimi OP_TRUE : OP_FALSE;
6734*22dc650dSSadaf Ebrahimi else
6735*22dc650dSSadaf Ebrahimi code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6736*22dc650dSSadaf Ebrahimi OP_TRUE : OP_FALSE;
6737*22dc650dSSadaf Ebrahimi skipunits = 1;
6738*22dc650dSSadaf Ebrahimi pptr += 3;
6739*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS_NOTE_EMPTY;
6740*22dc650dSSadaf Ebrahimi
6741*22dc650dSSadaf Ebrahimi /* The condition is an assertion, possibly preceded by a callout. */
6742*22dc650dSSadaf Ebrahimi
6743*22dc650dSSadaf Ebrahimi case META_COND_ASSERT:
6744*22dc650dSSadaf Ebrahimi bravalue = OP_COND;
6745*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS_NOTE_EMPTY;
6746*22dc650dSSadaf Ebrahimi
6747*22dc650dSSadaf Ebrahimi
6748*22dc650dSSadaf Ebrahimi /* ===================================================================*/
6749*22dc650dSSadaf Ebrahimi /* Handle all kinds of nested bracketed groups. The non-capturing,
6750*22dc650dSSadaf Ebrahimi non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6751*22dc650dSSadaf Ebrahimi
6752*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD:
6753*22dc650dSSadaf Ebrahimi bravalue = OP_ASSERT;
6754*22dc650dSSadaf Ebrahimi cb->assert_depth += 1;
6755*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS;
6756*22dc650dSSadaf Ebrahimi
6757*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD_NA:
6758*22dc650dSSadaf Ebrahimi bravalue = OP_ASSERT_NA;
6759*22dc650dSSadaf Ebrahimi cb->assert_depth += 1;
6760*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS;
6761*22dc650dSSadaf Ebrahimi
6762*22dc650dSSadaf Ebrahimi /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6763*22dc650dSSadaf Ebrahimi thing to do, but Perl allows all assertions to be quantified, and when
6764*22dc650dSSadaf Ebrahimi they contain capturing parentheses there may be a potential use for
6765*22dc650dSSadaf Ebrahimi this feature. Not that that applies to a quantified (?!) but we allow
6766*22dc650dSSadaf Ebrahimi it for uniformity. */
6767*22dc650dSSadaf Ebrahimi
6768*22dc650dSSadaf Ebrahimi case META_LOOKAHEADNOT:
6769*22dc650dSSadaf Ebrahimi if (pptr[1] == META_KET &&
6770*22dc650dSSadaf Ebrahimi (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6771*22dc650dSSadaf Ebrahimi {
6772*22dc650dSSadaf Ebrahimi *code++ = OP_FAIL;
6773*22dc650dSSadaf Ebrahimi pptr++;
6774*22dc650dSSadaf Ebrahimi }
6775*22dc650dSSadaf Ebrahimi else
6776*22dc650dSSadaf Ebrahimi {
6777*22dc650dSSadaf Ebrahimi bravalue = OP_ASSERT_NOT;
6778*22dc650dSSadaf Ebrahimi cb->assert_depth += 1;
6779*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS;
6780*22dc650dSSadaf Ebrahimi }
6781*22dc650dSSadaf Ebrahimi break;
6782*22dc650dSSadaf Ebrahimi
6783*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND:
6784*22dc650dSSadaf Ebrahimi bravalue = OP_ASSERTBACK;
6785*22dc650dSSadaf Ebrahimi cb->assert_depth += 1;
6786*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS;
6787*22dc650dSSadaf Ebrahimi
6788*22dc650dSSadaf Ebrahimi case META_LOOKBEHINDNOT:
6789*22dc650dSSadaf Ebrahimi bravalue = OP_ASSERTBACK_NOT;
6790*22dc650dSSadaf Ebrahimi cb->assert_depth += 1;
6791*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS;
6792*22dc650dSSadaf Ebrahimi
6793*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND_NA:
6794*22dc650dSSadaf Ebrahimi bravalue = OP_ASSERTBACK_NA;
6795*22dc650dSSadaf Ebrahimi cb->assert_depth += 1;
6796*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS;
6797*22dc650dSSadaf Ebrahimi
6798*22dc650dSSadaf Ebrahimi case META_ATOMIC:
6799*22dc650dSSadaf Ebrahimi bravalue = OP_ONCE;
6800*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS_NOTE_EMPTY;
6801*22dc650dSSadaf Ebrahimi
6802*22dc650dSSadaf Ebrahimi case META_SCRIPT_RUN:
6803*22dc650dSSadaf Ebrahimi bravalue = OP_SCRIPT_RUN;
6804*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS_NOTE_EMPTY;
6805*22dc650dSSadaf Ebrahimi
6806*22dc650dSSadaf Ebrahimi case META_NOCAPTURE:
6807*22dc650dSSadaf Ebrahimi bravalue = OP_BRA;
6808*22dc650dSSadaf Ebrahimi /* Fall through */
6809*22dc650dSSadaf Ebrahimi
6810*22dc650dSSadaf Ebrahimi /* Process nested bracketed regex. The nesting depth is maintained for the
6811*22dc650dSSadaf Ebrahimi benefit of the stackguard function. The test for too deep nesting is now
6812*22dc650dSSadaf Ebrahimi done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6813*22dc650dSSadaf Ebrahimi others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6814*22dc650dSSadaf Ebrahimi note of whether or not they may match an empty string. */
6815*22dc650dSSadaf Ebrahimi
6816*22dc650dSSadaf Ebrahimi GROUP_PROCESS_NOTE_EMPTY:
6817*22dc650dSSadaf Ebrahimi note_group_empty = TRUE;
6818*22dc650dSSadaf Ebrahimi
6819*22dc650dSSadaf Ebrahimi GROUP_PROCESS:
6820*22dc650dSSadaf Ebrahimi cb->parens_depth += 1;
6821*22dc650dSSadaf Ebrahimi *code = bravalue;
6822*22dc650dSSadaf Ebrahimi pptr++;
6823*22dc650dSSadaf Ebrahimi tempcode = code;
6824*22dc650dSSadaf Ebrahimi tempreqvary = cb->req_varyopt; /* Save value before group */
6825*22dc650dSSadaf Ebrahimi length_prevgroup = 0; /* Initialize for pre-compile phase */
6826*22dc650dSSadaf Ebrahimi
6827*22dc650dSSadaf Ebrahimi if ((group_return =
6828*22dc650dSSadaf Ebrahimi compile_regex(
6829*22dc650dSSadaf Ebrahimi options, /* The options state */
6830*22dc650dSSadaf Ebrahimi xoptions, /* The extra options state */
6831*22dc650dSSadaf Ebrahimi &tempcode, /* Where to put code (updated) */
6832*22dc650dSSadaf Ebrahimi &pptr, /* Input pointer (updated) */
6833*22dc650dSSadaf Ebrahimi errorcodeptr, /* Where to put an error message */
6834*22dc650dSSadaf Ebrahimi skipunits, /* Skip over bracket number */
6835*22dc650dSSadaf Ebrahimi &subfirstcu, /* For possible first char */
6836*22dc650dSSadaf Ebrahimi &subfirstcuflags,
6837*22dc650dSSadaf Ebrahimi &subreqcu, /* For possible last char */
6838*22dc650dSSadaf Ebrahimi &subreqcuflags,
6839*22dc650dSSadaf Ebrahimi bcptr, /* Current branch chain */
6840*22dc650dSSadaf Ebrahimi open_caps, /* Pointer to capture stack */
6841*22dc650dSSadaf Ebrahimi cb, /* Compile data block */
6842*22dc650dSSadaf Ebrahimi (lengthptr == NULL)? NULL : /* Actual compile phase */
6843*22dc650dSSadaf Ebrahimi &length_prevgroup /* Pre-compile phase */
6844*22dc650dSSadaf Ebrahimi )) == 0)
6845*22dc650dSSadaf Ebrahimi return 0; /* Error */
6846*22dc650dSSadaf Ebrahimi
6847*22dc650dSSadaf Ebrahimi cb->parens_depth -= 1;
6848*22dc650dSSadaf Ebrahimi
6849*22dc650dSSadaf Ebrahimi /* If that was a non-conditional significant group (not an assertion, not a
6850*22dc650dSSadaf Ebrahimi DEFINE) that matches at least one character, then the current item matches
6851*22dc650dSSadaf Ebrahimi a character. Conditionals are handled below. */
6852*22dc650dSSadaf Ebrahimi
6853*22dc650dSSadaf Ebrahimi if (note_group_empty && bravalue != OP_COND && group_return > 0)
6854*22dc650dSSadaf Ebrahimi matched_char = TRUE;
6855*22dc650dSSadaf Ebrahimi
6856*22dc650dSSadaf Ebrahimi /* If we've just compiled an assertion, pop the assert depth. */
6857*22dc650dSSadaf Ebrahimi
6858*22dc650dSSadaf Ebrahimi if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6859*22dc650dSSadaf Ebrahimi cb->assert_depth -= 1;
6860*22dc650dSSadaf Ebrahimi
6861*22dc650dSSadaf Ebrahimi /* At the end of compiling, code is still pointing to the start of the
6862*22dc650dSSadaf Ebrahimi group, while tempcode has been updated to point past the end of the group.
6863*22dc650dSSadaf Ebrahimi The parsed pattern pointer (pptr) is on the closing META_KET.
6864*22dc650dSSadaf Ebrahimi
6865*22dc650dSSadaf Ebrahimi If this is a conditional bracket, check that there are no more than
6866*22dc650dSSadaf Ebrahimi two branches in the group, or just one if it's a DEFINE group. We do this
6867*22dc650dSSadaf Ebrahimi in the real compile phase, not in the pre-pass, where the whole group may
6868*22dc650dSSadaf Ebrahimi not be available. */
6869*22dc650dSSadaf Ebrahimi
6870*22dc650dSSadaf Ebrahimi if (bravalue == OP_COND && lengthptr == NULL)
6871*22dc650dSSadaf Ebrahimi {
6872*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *tc = code;
6873*22dc650dSSadaf Ebrahimi int condcount = 0;
6874*22dc650dSSadaf Ebrahimi
6875*22dc650dSSadaf Ebrahimi do {
6876*22dc650dSSadaf Ebrahimi condcount++;
6877*22dc650dSSadaf Ebrahimi tc += GET(tc,1);
6878*22dc650dSSadaf Ebrahimi }
6879*22dc650dSSadaf Ebrahimi while (*tc != OP_KET);
6880*22dc650dSSadaf Ebrahimi
6881*22dc650dSSadaf Ebrahimi /* A DEFINE group is never obeyed inline (the "condition" is always
6882*22dc650dSSadaf Ebrahimi false). It must have only one branch. Having checked this, change the
6883*22dc650dSSadaf Ebrahimi opcode to OP_FALSE. */
6884*22dc650dSSadaf Ebrahimi
6885*22dc650dSSadaf Ebrahimi if (code[LINK_SIZE+1] == OP_DEFINE)
6886*22dc650dSSadaf Ebrahimi {
6887*22dc650dSSadaf Ebrahimi if (condcount > 1)
6888*22dc650dSSadaf Ebrahimi {
6889*22dc650dSSadaf Ebrahimi cb->erroroffset = offset;
6890*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR54;
6891*22dc650dSSadaf Ebrahimi return 0;
6892*22dc650dSSadaf Ebrahimi }
6893*22dc650dSSadaf Ebrahimi code[LINK_SIZE+1] = OP_FALSE;
6894*22dc650dSSadaf Ebrahimi bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6895*22dc650dSSadaf Ebrahimi }
6896*22dc650dSSadaf Ebrahimi
6897*22dc650dSSadaf Ebrahimi /* A "normal" conditional group. If there is just one branch, we must not
6898*22dc650dSSadaf Ebrahimi make use of its firstcu or reqcu, because this is equivalent to an
6899*22dc650dSSadaf Ebrahimi empty second branch. Also, it may match an empty string. If there are two
6900*22dc650dSSadaf Ebrahimi branches, this item must match a character if the group must. */
6901*22dc650dSSadaf Ebrahimi
6902*22dc650dSSadaf Ebrahimi else
6903*22dc650dSSadaf Ebrahimi {
6904*22dc650dSSadaf Ebrahimi if (condcount > 2)
6905*22dc650dSSadaf Ebrahimi {
6906*22dc650dSSadaf Ebrahimi cb->erroroffset = offset;
6907*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR27;
6908*22dc650dSSadaf Ebrahimi return 0;
6909*22dc650dSSadaf Ebrahimi }
6910*22dc650dSSadaf Ebrahimi if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6911*22dc650dSSadaf Ebrahimi else if (group_return > 0) matched_char = TRUE;
6912*22dc650dSSadaf Ebrahimi }
6913*22dc650dSSadaf Ebrahimi }
6914*22dc650dSSadaf Ebrahimi
6915*22dc650dSSadaf Ebrahimi /* In the pre-compile phase, update the length by the length of the group,
6916*22dc650dSSadaf Ebrahimi less the brackets at either end. Then reduce the compiled code to just a
6917*22dc650dSSadaf Ebrahimi set of non-capturing brackets so that it doesn't use much memory if it is
6918*22dc650dSSadaf Ebrahimi duplicated by a quantifier.*/
6919*22dc650dSSadaf Ebrahimi
6920*22dc650dSSadaf Ebrahimi if (lengthptr != NULL)
6921*22dc650dSSadaf Ebrahimi {
6922*22dc650dSSadaf Ebrahimi if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6923*22dc650dSSadaf Ebrahimi {
6924*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR20;
6925*22dc650dSSadaf Ebrahimi return 0;
6926*22dc650dSSadaf Ebrahimi }
6927*22dc650dSSadaf Ebrahimi *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6928*22dc650dSSadaf Ebrahimi code++; /* This already contains bravalue */
6929*22dc650dSSadaf Ebrahimi PUTINC(code, 0, 1 + LINK_SIZE);
6930*22dc650dSSadaf Ebrahimi *code++ = OP_KET;
6931*22dc650dSSadaf Ebrahimi PUTINC(code, 0, 1 + LINK_SIZE);
6932*22dc650dSSadaf Ebrahimi break; /* No need to waste time with special character handling */
6933*22dc650dSSadaf Ebrahimi }
6934*22dc650dSSadaf Ebrahimi
6935*22dc650dSSadaf Ebrahimi /* Otherwise update the main code pointer to the end of the group. */
6936*22dc650dSSadaf Ebrahimi
6937*22dc650dSSadaf Ebrahimi code = tempcode;
6938*22dc650dSSadaf Ebrahimi
6939*22dc650dSSadaf Ebrahimi /* For a DEFINE group, required and first character settings are not
6940*22dc650dSSadaf Ebrahimi relevant. */
6941*22dc650dSSadaf Ebrahimi
6942*22dc650dSSadaf Ebrahimi if (bravalue == OP_DEFINE) break;
6943*22dc650dSSadaf Ebrahimi
6944*22dc650dSSadaf Ebrahimi /* Handle updating of the required and first code units for other types of
6945*22dc650dSSadaf Ebrahimi group. Update for normal brackets of all kinds, and conditions with two
6946*22dc650dSSadaf Ebrahimi branches (see code above). If the bracket is followed by a quantifier with
6947*22dc650dSSadaf Ebrahimi zero repeat, we have to back off. Hence the definition of zeroreqcu and
6948*22dc650dSSadaf Ebrahimi zerofirstcu outside the main loop so that they can be accessed for the back
6949*22dc650dSSadaf Ebrahimi off. */
6950*22dc650dSSadaf Ebrahimi
6951*22dc650dSSadaf Ebrahimi zeroreqcu = reqcu;
6952*22dc650dSSadaf Ebrahimi zeroreqcuflags = reqcuflags;
6953*22dc650dSSadaf Ebrahimi zerofirstcu = firstcu;
6954*22dc650dSSadaf Ebrahimi zerofirstcuflags = firstcuflags;
6955*22dc650dSSadaf Ebrahimi groupsetfirstcu = FALSE;
6956*22dc650dSSadaf Ebrahimi
6957*22dc650dSSadaf Ebrahimi if (bravalue >= OP_ONCE) /* Not an assertion */
6958*22dc650dSSadaf Ebrahimi {
6959*22dc650dSSadaf Ebrahimi /* If we have not yet set a firstcu in this branch, take it from the
6960*22dc650dSSadaf Ebrahimi subpattern, remembering that it was set here so that a repeat of more
6961*22dc650dSSadaf Ebrahimi than one can replicate it as reqcu if necessary. If the subpattern has
6962*22dc650dSSadaf Ebrahimi no firstcu, set "none" for the whole branch. In both cases, a zero
6963*22dc650dSSadaf Ebrahimi repeat forces firstcu to "none". */
6964*22dc650dSSadaf Ebrahimi
6965*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6966*22dc650dSSadaf Ebrahimi {
6967*22dc650dSSadaf Ebrahimi if (subfirstcuflags < REQ_NONE)
6968*22dc650dSSadaf Ebrahimi {
6969*22dc650dSSadaf Ebrahimi firstcu = subfirstcu;
6970*22dc650dSSadaf Ebrahimi firstcuflags = subfirstcuflags;
6971*22dc650dSSadaf Ebrahimi groupsetfirstcu = TRUE;
6972*22dc650dSSadaf Ebrahimi }
6973*22dc650dSSadaf Ebrahimi else firstcuflags = REQ_NONE;
6974*22dc650dSSadaf Ebrahimi zerofirstcuflags = REQ_NONE;
6975*22dc650dSSadaf Ebrahimi }
6976*22dc650dSSadaf Ebrahimi
6977*22dc650dSSadaf Ebrahimi /* If firstcu was previously set, convert the subpattern's firstcu
6978*22dc650dSSadaf Ebrahimi into reqcu if there wasn't one, using the vary flag that was in
6979*22dc650dSSadaf Ebrahimi existence beforehand. */
6980*22dc650dSSadaf Ebrahimi
6981*22dc650dSSadaf Ebrahimi else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6982*22dc650dSSadaf Ebrahimi {
6983*22dc650dSSadaf Ebrahimi subreqcu = subfirstcu;
6984*22dc650dSSadaf Ebrahimi subreqcuflags = subfirstcuflags | tempreqvary;
6985*22dc650dSSadaf Ebrahimi }
6986*22dc650dSSadaf Ebrahimi
6987*22dc650dSSadaf Ebrahimi /* If the subpattern set a required code unit (or set a first code unit
6988*22dc650dSSadaf Ebrahimi that isn't really the first code unit - see above), set it. */
6989*22dc650dSSadaf Ebrahimi
6990*22dc650dSSadaf Ebrahimi if (subreqcuflags < REQ_NONE)
6991*22dc650dSSadaf Ebrahimi {
6992*22dc650dSSadaf Ebrahimi reqcu = subreqcu;
6993*22dc650dSSadaf Ebrahimi reqcuflags = subreqcuflags;
6994*22dc650dSSadaf Ebrahimi }
6995*22dc650dSSadaf Ebrahimi }
6996*22dc650dSSadaf Ebrahimi
6997*22dc650dSSadaf Ebrahimi /* For a forward assertion, we take the reqcu, if set, provided that the
6998*22dc650dSSadaf Ebrahimi group has also set a firstcu. This can be helpful if the pattern that
6999*22dc650dSSadaf Ebrahimi follows the assertion doesn't set a different char. For example, it's
7000*22dc650dSSadaf Ebrahimi useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7001*22dc650dSSadaf Ebrahimi because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7002*22dc650dSSadaf Ebrahimi the "real" "a" would then become a reqcu instead of a firstcu. This is
7003*22dc650dSSadaf Ebrahimi overcome by a scan at the end if there's no firstcu, looking for an
7004*22dc650dSSadaf Ebrahimi asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7005*22dc650dSSadaf Ebrahimi we must only take the reqcu when the group also set a firstcu. Otherwise,
7006*22dc650dSSadaf Ebrahimi in that example, 'X' ends up set for both. */
7007*22dc650dSSadaf Ebrahimi
7008*22dc650dSSadaf Ebrahimi else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7009*22dc650dSSadaf Ebrahimi subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7010*22dc650dSSadaf Ebrahimi {
7011*22dc650dSSadaf Ebrahimi reqcu = subreqcu;
7012*22dc650dSSadaf Ebrahimi reqcuflags = subreqcuflags;
7013*22dc650dSSadaf Ebrahimi }
7014*22dc650dSSadaf Ebrahimi
7015*22dc650dSSadaf Ebrahimi break; /* End of nested group handling */
7016*22dc650dSSadaf Ebrahimi
7017*22dc650dSSadaf Ebrahimi
7018*22dc650dSSadaf Ebrahimi /* ===================================================================*/
7019*22dc650dSSadaf Ebrahimi /* Handle named backreferences and recursions. */
7020*22dc650dSSadaf Ebrahimi
7021*22dc650dSSadaf Ebrahimi case META_BACKREF_BYNAME:
7022*22dc650dSSadaf Ebrahimi case META_RECURSE_BYNAME:
7023*22dc650dSSadaf Ebrahimi {
7024*22dc650dSSadaf Ebrahimi int count, index;
7025*22dc650dSSadaf Ebrahimi PCRE2_SPTR name;
7026*22dc650dSSadaf Ebrahimi BOOL is_dupname = FALSE;
7027*22dc650dSSadaf Ebrahimi named_group *ng = cb->named_groups;
7028*22dc650dSSadaf Ebrahimi uint32_t length = *(++pptr);
7029*22dc650dSSadaf Ebrahimi
7030*22dc650dSSadaf Ebrahimi GETPLUSOFFSET(offset, pptr);
7031*22dc650dSSadaf Ebrahimi name = cb->start_pattern + offset;
7032*22dc650dSSadaf Ebrahimi
7033*22dc650dSSadaf Ebrahimi /* In the first pass, the names generated in the pre-pass are available,
7034*22dc650dSSadaf Ebrahimi but the main name table has not yet been created. Scan the list of names
7035*22dc650dSSadaf Ebrahimi generated in the pre-pass in order to get a number and whether or not
7036*22dc650dSSadaf Ebrahimi this name is duplicated. */
7037*22dc650dSSadaf Ebrahimi
7038*22dc650dSSadaf Ebrahimi groupnumber = 0;
7039*22dc650dSSadaf Ebrahimi for (unsigned int i = 0; i < cb->names_found; i++, ng++)
7040*22dc650dSSadaf Ebrahimi {
7041*22dc650dSSadaf Ebrahimi if (length == ng->length &&
7042*22dc650dSSadaf Ebrahimi PRIV(strncmp)(name, ng->name, length) == 0)
7043*22dc650dSSadaf Ebrahimi {
7044*22dc650dSSadaf Ebrahimi is_dupname = ng->isdup;
7045*22dc650dSSadaf Ebrahimi groupnumber = ng->number;
7046*22dc650dSSadaf Ebrahimi
7047*22dc650dSSadaf Ebrahimi /* For a recursion, that's all that is needed. We can now go to
7048*22dc650dSSadaf Ebrahimi the code that handles numerical recursion, applying it to the first
7049*22dc650dSSadaf Ebrahimi group with the given name. */
7050*22dc650dSSadaf Ebrahimi
7051*22dc650dSSadaf Ebrahimi if (meta == META_RECURSE_BYNAME)
7052*22dc650dSSadaf Ebrahimi {
7053*22dc650dSSadaf Ebrahimi meta_arg = groupnumber;
7054*22dc650dSSadaf Ebrahimi goto HANDLE_NUMERICAL_RECURSION;
7055*22dc650dSSadaf Ebrahimi }
7056*22dc650dSSadaf Ebrahimi
7057*22dc650dSSadaf Ebrahimi /* For a back reference, update the back reference map and the
7058*22dc650dSSadaf Ebrahimi maximum back reference. */
7059*22dc650dSSadaf Ebrahimi
7060*22dc650dSSadaf Ebrahimi cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7061*22dc650dSSadaf Ebrahimi if (groupnumber > cb->top_backref)
7062*22dc650dSSadaf Ebrahimi cb->top_backref = groupnumber;
7063*22dc650dSSadaf Ebrahimi }
7064*22dc650dSSadaf Ebrahimi }
7065*22dc650dSSadaf Ebrahimi
7066*22dc650dSSadaf Ebrahimi /* If the name was not found we have a bad reference. */
7067*22dc650dSSadaf Ebrahimi
7068*22dc650dSSadaf Ebrahimi if (groupnumber == 0)
7069*22dc650dSSadaf Ebrahimi {
7070*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR15;
7071*22dc650dSSadaf Ebrahimi cb->erroroffset = offset;
7072*22dc650dSSadaf Ebrahimi return 0;
7073*22dc650dSSadaf Ebrahimi }
7074*22dc650dSSadaf Ebrahimi
7075*22dc650dSSadaf Ebrahimi /* If a back reference name is not duplicated, we can handle it as
7076*22dc650dSSadaf Ebrahimi a numerical reference. */
7077*22dc650dSSadaf Ebrahimi
7078*22dc650dSSadaf Ebrahimi if (!is_dupname)
7079*22dc650dSSadaf Ebrahimi {
7080*22dc650dSSadaf Ebrahimi meta_arg = groupnumber;
7081*22dc650dSSadaf Ebrahimi goto HANDLE_SINGLE_REFERENCE;
7082*22dc650dSSadaf Ebrahimi }
7083*22dc650dSSadaf Ebrahimi
7084*22dc650dSSadaf Ebrahimi /* If a back reference name is duplicated, we generate a different
7085*22dc650dSSadaf Ebrahimi opcode to a numerical back reference. In the second pass we must
7086*22dc650dSSadaf Ebrahimi search for the index and count in the final name table. */
7087*22dc650dSSadaf Ebrahimi
7088*22dc650dSSadaf Ebrahimi count = 0; /* Values for first pass (avoids compiler warning) */
7089*22dc650dSSadaf Ebrahimi index = 0;
7090*22dc650dSSadaf Ebrahimi if (lengthptr == NULL && !find_dupname_details(name, length, &index,
7091*22dc650dSSadaf Ebrahimi &count, errorcodeptr, cb)) return 0;
7092*22dc650dSSadaf Ebrahimi
7093*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7094*22dc650dSSadaf Ebrahimi *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7095*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, index);
7096*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, count);
7097*22dc650dSSadaf Ebrahimi }
7098*22dc650dSSadaf Ebrahimi break;
7099*22dc650dSSadaf Ebrahimi
7100*22dc650dSSadaf Ebrahimi
7101*22dc650dSSadaf Ebrahimi /* ===================================================================*/
7102*22dc650dSSadaf Ebrahimi /* Handle a numerical callout. */
7103*22dc650dSSadaf Ebrahimi
7104*22dc650dSSadaf Ebrahimi case META_CALLOUT_NUMBER:
7105*22dc650dSSadaf Ebrahimi code[0] = OP_CALLOUT;
7106*22dc650dSSadaf Ebrahimi PUT(code, 1, pptr[1]); /* Offset to next pattern item */
7107*22dc650dSSadaf Ebrahimi PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
7108*22dc650dSSadaf Ebrahimi code[1 + 2*LINK_SIZE] = pptr[3];
7109*22dc650dSSadaf Ebrahimi pptr += 3;
7110*22dc650dSSadaf Ebrahimi code += PRIV(OP_lengths)[OP_CALLOUT];
7111*22dc650dSSadaf Ebrahimi break;
7112*22dc650dSSadaf Ebrahimi
7113*22dc650dSSadaf Ebrahimi
7114*22dc650dSSadaf Ebrahimi /* ===================================================================*/
7115*22dc650dSSadaf Ebrahimi /* Handle a callout with a string argument. In the pre-pass we just compute
7116*22dc650dSSadaf Ebrahimi the length without generating anything. The length in pptr[3] includes both
7117*22dc650dSSadaf Ebrahimi delimiters; in the actual compile only the first one is copied, but a
7118*22dc650dSSadaf Ebrahimi terminating zero is added. Any doubled delimiters within the string make
7119*22dc650dSSadaf Ebrahimi this an overestimate, but it is not worth bothering about. */
7120*22dc650dSSadaf Ebrahimi
7121*22dc650dSSadaf Ebrahimi case META_CALLOUT_STRING:
7122*22dc650dSSadaf Ebrahimi if (lengthptr != NULL)
7123*22dc650dSSadaf Ebrahimi {
7124*22dc650dSSadaf Ebrahimi *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7125*22dc650dSSadaf Ebrahimi pptr += 3;
7126*22dc650dSSadaf Ebrahimi SKIPOFFSET(pptr);
7127*22dc650dSSadaf Ebrahimi }
7128*22dc650dSSadaf Ebrahimi
7129*22dc650dSSadaf Ebrahimi /* In the real compile we can copy the string. The starting delimiter is
7130*22dc650dSSadaf Ebrahimi included so that the client can discover it if they want. We also pass the
7131*22dc650dSSadaf Ebrahimi start offset to help a script language give better error messages. */
7132*22dc650dSSadaf Ebrahimi
7133*22dc650dSSadaf Ebrahimi else
7134*22dc650dSSadaf Ebrahimi {
7135*22dc650dSSadaf Ebrahimi PCRE2_SPTR pp;
7136*22dc650dSSadaf Ebrahimi uint32_t delimiter;
7137*22dc650dSSadaf Ebrahimi uint32_t length = pptr[3];
7138*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7139*22dc650dSSadaf Ebrahimi
7140*22dc650dSSadaf Ebrahimi code[0] = OP_CALLOUT_STR;
7141*22dc650dSSadaf Ebrahimi PUT(code, 1, pptr[1]); /* Offset to next pattern item */
7142*22dc650dSSadaf Ebrahimi PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
7143*22dc650dSSadaf Ebrahimi
7144*22dc650dSSadaf Ebrahimi pptr += 3;
7145*22dc650dSSadaf Ebrahimi GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
7146*22dc650dSSadaf Ebrahimi pp = cb->start_pattern + offset;
7147*22dc650dSSadaf Ebrahimi delimiter = *callout_string++ = *pp++;
7148*22dc650dSSadaf Ebrahimi if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7149*22dc650dSSadaf Ebrahimi delimiter = CHAR_RIGHT_CURLY_BRACKET;
7150*22dc650dSSadaf Ebrahimi PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
7151*22dc650dSSadaf Ebrahimi
7152*22dc650dSSadaf Ebrahimi /* The syntax of the pattern was checked in the parsing scan. The length
7153*22dc650dSSadaf Ebrahimi includes both delimiters, but we have passed the opening one just above,
7154*22dc650dSSadaf Ebrahimi so we reduce length before testing it. The test is for > 1 because we do
7155*22dc650dSSadaf Ebrahimi not want to copy the final delimiter. This also ensures that pp[1] is
7156*22dc650dSSadaf Ebrahimi accessible. */
7157*22dc650dSSadaf Ebrahimi
7158*22dc650dSSadaf Ebrahimi while (--length > 1)
7159*22dc650dSSadaf Ebrahimi {
7160*22dc650dSSadaf Ebrahimi if (*pp == delimiter && pp[1] == delimiter)
7161*22dc650dSSadaf Ebrahimi {
7162*22dc650dSSadaf Ebrahimi *callout_string++ = delimiter;
7163*22dc650dSSadaf Ebrahimi pp += 2;
7164*22dc650dSSadaf Ebrahimi length--;
7165*22dc650dSSadaf Ebrahimi }
7166*22dc650dSSadaf Ebrahimi else *callout_string++ = *pp++;
7167*22dc650dSSadaf Ebrahimi }
7168*22dc650dSSadaf Ebrahimi *callout_string++ = CHAR_NUL;
7169*22dc650dSSadaf Ebrahimi
7170*22dc650dSSadaf Ebrahimi /* Set the length of the entire item, then advance to its end. */
7171*22dc650dSSadaf Ebrahimi
7172*22dc650dSSadaf Ebrahimi PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7173*22dc650dSSadaf Ebrahimi code = callout_string;
7174*22dc650dSSadaf Ebrahimi }
7175*22dc650dSSadaf Ebrahimi break;
7176*22dc650dSSadaf Ebrahimi
7177*22dc650dSSadaf Ebrahimi
7178*22dc650dSSadaf Ebrahimi /* ===================================================================*/
7179*22dc650dSSadaf Ebrahimi /* Handle repetition. The different types are all sorted out in the parsing
7180*22dc650dSSadaf Ebrahimi pass. */
7181*22dc650dSSadaf Ebrahimi
7182*22dc650dSSadaf Ebrahimi case META_MINMAX_PLUS:
7183*22dc650dSSadaf Ebrahimi case META_MINMAX_QUERY:
7184*22dc650dSSadaf Ebrahimi case META_MINMAX:
7185*22dc650dSSadaf Ebrahimi repeat_min = *(++pptr);
7186*22dc650dSSadaf Ebrahimi repeat_max = *(++pptr);
7187*22dc650dSSadaf Ebrahimi goto REPEAT;
7188*22dc650dSSadaf Ebrahimi
7189*22dc650dSSadaf Ebrahimi case META_ASTERISK:
7190*22dc650dSSadaf Ebrahimi case META_ASTERISK_PLUS:
7191*22dc650dSSadaf Ebrahimi case META_ASTERISK_QUERY:
7192*22dc650dSSadaf Ebrahimi repeat_min = 0;
7193*22dc650dSSadaf Ebrahimi repeat_max = REPEAT_UNLIMITED;
7194*22dc650dSSadaf Ebrahimi goto REPEAT;
7195*22dc650dSSadaf Ebrahimi
7196*22dc650dSSadaf Ebrahimi case META_PLUS:
7197*22dc650dSSadaf Ebrahimi case META_PLUS_PLUS:
7198*22dc650dSSadaf Ebrahimi case META_PLUS_QUERY:
7199*22dc650dSSadaf Ebrahimi repeat_min = 1;
7200*22dc650dSSadaf Ebrahimi repeat_max = REPEAT_UNLIMITED;
7201*22dc650dSSadaf Ebrahimi goto REPEAT;
7202*22dc650dSSadaf Ebrahimi
7203*22dc650dSSadaf Ebrahimi case META_QUERY:
7204*22dc650dSSadaf Ebrahimi case META_QUERY_PLUS:
7205*22dc650dSSadaf Ebrahimi case META_QUERY_QUERY:
7206*22dc650dSSadaf Ebrahimi repeat_min = 0;
7207*22dc650dSSadaf Ebrahimi repeat_max = 1;
7208*22dc650dSSadaf Ebrahimi
7209*22dc650dSSadaf Ebrahimi REPEAT:
7210*22dc650dSSadaf Ebrahimi if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7211*22dc650dSSadaf Ebrahimi
7212*22dc650dSSadaf Ebrahimi /* Remember whether this is a variable length repeat, and default to
7213*22dc650dSSadaf Ebrahimi single-char opcodes. */
7214*22dc650dSSadaf Ebrahimi
7215*22dc650dSSadaf Ebrahimi reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7216*22dc650dSSadaf Ebrahimi op_type = 0;
7217*22dc650dSSadaf Ebrahimi
7218*22dc650dSSadaf Ebrahimi /* Adjust first and required code units for a zero repeat. */
7219*22dc650dSSadaf Ebrahimi
7220*22dc650dSSadaf Ebrahimi if (repeat_min == 0)
7221*22dc650dSSadaf Ebrahimi {
7222*22dc650dSSadaf Ebrahimi firstcu = zerofirstcu;
7223*22dc650dSSadaf Ebrahimi firstcuflags = zerofirstcuflags;
7224*22dc650dSSadaf Ebrahimi reqcu = zeroreqcu;
7225*22dc650dSSadaf Ebrahimi reqcuflags = zeroreqcuflags;
7226*22dc650dSSadaf Ebrahimi }
7227*22dc650dSSadaf Ebrahimi
7228*22dc650dSSadaf Ebrahimi /* Note the greediness and possessiveness. */
7229*22dc650dSSadaf Ebrahimi
7230*22dc650dSSadaf Ebrahimi switch (meta)
7231*22dc650dSSadaf Ebrahimi {
7232*22dc650dSSadaf Ebrahimi case META_MINMAX_PLUS:
7233*22dc650dSSadaf Ebrahimi case META_ASTERISK_PLUS:
7234*22dc650dSSadaf Ebrahimi case META_PLUS_PLUS:
7235*22dc650dSSadaf Ebrahimi case META_QUERY_PLUS:
7236*22dc650dSSadaf Ebrahimi repeat_type = 0; /* Force greedy */
7237*22dc650dSSadaf Ebrahimi possessive_quantifier = TRUE;
7238*22dc650dSSadaf Ebrahimi break;
7239*22dc650dSSadaf Ebrahimi
7240*22dc650dSSadaf Ebrahimi case META_MINMAX_QUERY:
7241*22dc650dSSadaf Ebrahimi case META_ASTERISK_QUERY:
7242*22dc650dSSadaf Ebrahimi case META_PLUS_QUERY:
7243*22dc650dSSadaf Ebrahimi case META_QUERY_QUERY:
7244*22dc650dSSadaf Ebrahimi repeat_type = greedy_non_default;
7245*22dc650dSSadaf Ebrahimi possessive_quantifier = FALSE;
7246*22dc650dSSadaf Ebrahimi break;
7247*22dc650dSSadaf Ebrahimi
7248*22dc650dSSadaf Ebrahimi default:
7249*22dc650dSSadaf Ebrahimi repeat_type = greedy_default;
7250*22dc650dSSadaf Ebrahimi possessive_quantifier = FALSE;
7251*22dc650dSSadaf Ebrahimi break;
7252*22dc650dSSadaf Ebrahimi }
7253*22dc650dSSadaf Ebrahimi
7254*22dc650dSSadaf Ebrahimi /* Save start of previous item, in case we have to move it up in order to
7255*22dc650dSSadaf Ebrahimi insert something before it, and remember what it was. */
7256*22dc650dSSadaf Ebrahimi
7257*22dc650dSSadaf Ebrahimi tempcode = previous;
7258*22dc650dSSadaf Ebrahimi op_previous = *previous;
7259*22dc650dSSadaf Ebrahimi
7260*22dc650dSSadaf Ebrahimi /* Now handle repetition for the different types of item. If the repeat
7261*22dc650dSSadaf Ebrahimi minimum and the repeat maximum are both 1, we can ignore the quantifier for
7262*22dc650dSSadaf Ebrahimi non-parenthesized items, as they have only one alternative. For anything in
7263*22dc650dSSadaf Ebrahimi parentheses, we must not ignore if {1} is possessive. */
7264*22dc650dSSadaf Ebrahimi
7265*22dc650dSSadaf Ebrahimi switch (op_previous)
7266*22dc650dSSadaf Ebrahimi {
7267*22dc650dSSadaf Ebrahimi /* If previous was a character or negated character match, abolish the
7268*22dc650dSSadaf Ebrahimi item and generate a repeat item instead. If a char item has a minimum of
7269*22dc650dSSadaf Ebrahimi more than one, ensure that it is set in reqcu - it might not be if a
7270*22dc650dSSadaf Ebrahimi sequence such as x{3} is the first thing in a branch because the x will
7271*22dc650dSSadaf Ebrahimi have gone into firstcu instead. */
7272*22dc650dSSadaf Ebrahimi
7273*22dc650dSSadaf Ebrahimi case OP_CHAR:
7274*22dc650dSSadaf Ebrahimi case OP_CHARI:
7275*22dc650dSSadaf Ebrahimi case OP_NOT:
7276*22dc650dSSadaf Ebrahimi case OP_NOTI:
7277*22dc650dSSadaf Ebrahimi if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7278*22dc650dSSadaf Ebrahimi op_type = chartypeoffset[op_previous - OP_CHAR];
7279*22dc650dSSadaf Ebrahimi
7280*22dc650dSSadaf Ebrahimi /* Deal with UTF characters that take up more than one code unit. */
7281*22dc650dSSadaf Ebrahimi
7282*22dc650dSSadaf Ebrahimi #ifdef MAYBE_UTF_MULTI
7283*22dc650dSSadaf Ebrahimi if (utf && NOT_FIRSTCU(code[-1]))
7284*22dc650dSSadaf Ebrahimi {
7285*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *lastchar = code - 1;
7286*22dc650dSSadaf Ebrahimi BACKCHAR(lastchar);
7287*22dc650dSSadaf Ebrahimi mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
7288*22dc650dSSadaf Ebrahimi memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
7289*22dc650dSSadaf Ebrahimi }
7290*22dc650dSSadaf Ebrahimi else
7291*22dc650dSSadaf Ebrahimi #endif /* MAYBE_UTF_MULTI */
7292*22dc650dSSadaf Ebrahimi
7293*22dc650dSSadaf Ebrahimi /* Handle the case of a single code unit - either with no UTF support, or
7294*22dc650dSSadaf Ebrahimi with UTF disabled, or for a single-code-unit UTF character. In the latter
7295*22dc650dSSadaf Ebrahimi case, for a repeated positive match, get the caseless flag for the
7296*22dc650dSSadaf Ebrahimi required code unit from the previous character, because a class like [Aa]
7297*22dc650dSSadaf Ebrahimi sets a caseless A but by now the req_caseopt flag has been reset. */
7298*22dc650dSSadaf Ebrahimi
7299*22dc650dSSadaf Ebrahimi {
7300*22dc650dSSadaf Ebrahimi mcbuffer[0] = code[-1];
7301*22dc650dSSadaf Ebrahimi mclength = 1;
7302*22dc650dSSadaf Ebrahimi if (op_previous <= OP_CHARI && repeat_min > 1)
7303*22dc650dSSadaf Ebrahimi {
7304*22dc650dSSadaf Ebrahimi reqcu = mcbuffer[0];
7305*22dc650dSSadaf Ebrahimi reqcuflags = cb->req_varyopt;
7306*22dc650dSSadaf Ebrahimi if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7307*22dc650dSSadaf Ebrahimi }
7308*22dc650dSSadaf Ebrahimi }
7309*22dc650dSSadaf Ebrahimi goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
7310*22dc650dSSadaf Ebrahimi
7311*22dc650dSSadaf Ebrahimi /* If previous was a character class or a back reference, we put the
7312*22dc650dSSadaf Ebrahimi repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7313*22dc650dSSadaf Ebrahimi
7314*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
7315*22dc650dSSadaf Ebrahimi case OP_XCLASS:
7316*22dc650dSSadaf Ebrahimi #endif
7317*22dc650dSSadaf Ebrahimi case OP_CLASS:
7318*22dc650dSSadaf Ebrahimi case OP_NCLASS:
7319*22dc650dSSadaf Ebrahimi case OP_REF:
7320*22dc650dSSadaf Ebrahimi case OP_REFI:
7321*22dc650dSSadaf Ebrahimi case OP_DNREF:
7322*22dc650dSSadaf Ebrahimi case OP_DNREFI:
7323*22dc650dSSadaf Ebrahimi
7324*22dc650dSSadaf Ebrahimi if (repeat_max == 0)
7325*22dc650dSSadaf Ebrahimi {
7326*22dc650dSSadaf Ebrahimi code = previous;
7327*22dc650dSSadaf Ebrahimi goto END_REPEAT;
7328*22dc650dSSadaf Ebrahimi }
7329*22dc650dSSadaf Ebrahimi if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7330*22dc650dSSadaf Ebrahimi
7331*22dc650dSSadaf Ebrahimi if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7332*22dc650dSSadaf Ebrahimi *code++ = OP_CRSTAR + repeat_type;
7333*22dc650dSSadaf Ebrahimi else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7334*22dc650dSSadaf Ebrahimi *code++ = OP_CRPLUS + repeat_type;
7335*22dc650dSSadaf Ebrahimi else if (repeat_min == 0 && repeat_max == 1)
7336*22dc650dSSadaf Ebrahimi *code++ = OP_CRQUERY + repeat_type;
7337*22dc650dSSadaf Ebrahimi else
7338*22dc650dSSadaf Ebrahimi {
7339*22dc650dSSadaf Ebrahimi *code++ = OP_CRRANGE + repeat_type;
7340*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, repeat_min);
7341*22dc650dSSadaf Ebrahimi if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
7342*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, repeat_max);
7343*22dc650dSSadaf Ebrahimi }
7344*22dc650dSSadaf Ebrahimi break;
7345*22dc650dSSadaf Ebrahimi
7346*22dc650dSSadaf Ebrahimi /* If previous is OP_FAIL, it was generated by an empty class []
7347*22dc650dSSadaf Ebrahimi (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7348*22dc650dSSadaf Ebrahimi generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7349*22dc650dSSadaf Ebrahimi time. We can just ignore this repeat. */
7350*22dc650dSSadaf Ebrahimi
7351*22dc650dSSadaf Ebrahimi case OP_FAIL:
7352*22dc650dSSadaf Ebrahimi goto END_REPEAT;
7353*22dc650dSSadaf Ebrahimi
7354*22dc650dSSadaf Ebrahimi /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7355*22dc650dSSadaf Ebrahimi because pcre2_match() could not handle backtracking into recursively
7356*22dc650dSSadaf Ebrahimi called groups. Now that this backtracking is available, we no longer need
7357*22dc650dSSadaf Ebrahimi to do this. However, we still need to replicate recursions as we do for
7358*22dc650dSSadaf Ebrahimi groups so as to have independent backtracking points. We can replicate
7359*22dc650dSSadaf Ebrahimi for the minimum number of repeats directly. For optional repeats we now
7360*22dc650dSSadaf Ebrahimi wrap the recursion in OP_BRA brackets and make use of the bracket
7361*22dc650dSSadaf Ebrahimi repetition. */
7362*22dc650dSSadaf Ebrahimi
7363*22dc650dSSadaf Ebrahimi case OP_RECURSE:
7364*22dc650dSSadaf Ebrahimi if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7365*22dc650dSSadaf Ebrahimi goto END_REPEAT;
7366*22dc650dSSadaf Ebrahimi
7367*22dc650dSSadaf Ebrahimi /* Generate unwrapped repeats for a non-zero minimum, except when the
7368*22dc650dSSadaf Ebrahimi minimum is 1 and the maximum unlimited, because that can be handled with
7369*22dc650dSSadaf Ebrahimi OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7370*22dc650dSSadaf Ebrahimi minimum, we just need to generate the appropriate additional copies.
7371*22dc650dSSadaf Ebrahimi Otherwise we need to generate one more, to simulate the situation when
7372*22dc650dSSadaf Ebrahimi the minimum is zero. */
7373*22dc650dSSadaf Ebrahimi
7374*22dc650dSSadaf Ebrahimi if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7375*22dc650dSSadaf Ebrahimi {
7376*22dc650dSSadaf Ebrahimi int replicate = repeat_min;
7377*22dc650dSSadaf Ebrahimi if (repeat_min == repeat_max) replicate--;
7378*22dc650dSSadaf Ebrahimi
7379*22dc650dSSadaf Ebrahimi /* In the pre-compile phase, we don't actually do the replication. We
7380*22dc650dSSadaf Ebrahimi just adjust the length as if we had. Do some paranoid checks for
7381*22dc650dSSadaf Ebrahimi potential integer overflow. */
7382*22dc650dSSadaf Ebrahimi
7383*22dc650dSSadaf Ebrahimi if (lengthptr != NULL)
7384*22dc650dSSadaf Ebrahimi {
7385*22dc650dSSadaf Ebrahimi PCRE2_SIZE delta;
7386*22dc650dSSadaf Ebrahimi if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7387*22dc650dSSadaf Ebrahimi OFLOW_MAX - *lengthptr < delta)
7388*22dc650dSSadaf Ebrahimi {
7389*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR20;
7390*22dc650dSSadaf Ebrahimi return 0;
7391*22dc650dSSadaf Ebrahimi }
7392*22dc650dSSadaf Ebrahimi *lengthptr += delta;
7393*22dc650dSSadaf Ebrahimi }
7394*22dc650dSSadaf Ebrahimi
7395*22dc650dSSadaf Ebrahimi else for (int i = 0; i < replicate; i++)
7396*22dc650dSSadaf Ebrahimi {
7397*22dc650dSSadaf Ebrahimi memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7398*22dc650dSSadaf Ebrahimi previous = code;
7399*22dc650dSSadaf Ebrahimi code += 1 + LINK_SIZE;
7400*22dc650dSSadaf Ebrahimi }
7401*22dc650dSSadaf Ebrahimi
7402*22dc650dSSadaf Ebrahimi /* If the number of repeats is fixed, we are done. Otherwise, adjust
7403*22dc650dSSadaf Ebrahimi the counts and fall through. */
7404*22dc650dSSadaf Ebrahimi
7405*22dc650dSSadaf Ebrahimi if (repeat_min == repeat_max) break;
7406*22dc650dSSadaf Ebrahimi if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7407*22dc650dSSadaf Ebrahimi repeat_min = 0;
7408*22dc650dSSadaf Ebrahimi }
7409*22dc650dSSadaf Ebrahimi
7410*22dc650dSSadaf Ebrahimi /* Wrap the recursion call in OP_BRA brackets. */
7411*22dc650dSSadaf Ebrahimi
7412*22dc650dSSadaf Ebrahimi (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7413*22dc650dSSadaf Ebrahimi op_previous = *previous = OP_BRA;
7414*22dc650dSSadaf Ebrahimi PUT(previous, 1, 2 + 2*LINK_SIZE);
7415*22dc650dSSadaf Ebrahimi previous[2 + 2*LINK_SIZE] = OP_KET;
7416*22dc650dSSadaf Ebrahimi PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7417*22dc650dSSadaf Ebrahimi code += 2 + 2 * LINK_SIZE;
7418*22dc650dSSadaf Ebrahimi length_prevgroup = 3 + 3*LINK_SIZE;
7419*22dc650dSSadaf Ebrahimi group_return = -1; /* Set "may match empty string" */
7420*22dc650dSSadaf Ebrahimi
7421*22dc650dSSadaf Ebrahimi /* Now treat as a repeated OP_BRA. */
7422*22dc650dSSadaf Ebrahimi /* Fall through */
7423*22dc650dSSadaf Ebrahimi
7424*22dc650dSSadaf Ebrahimi /* If previous was a bracket group, we may have to replicate it in
7425*22dc650dSSadaf Ebrahimi certain cases. Note that at this point we can encounter only the "basic"
7426*22dc650dSSadaf Ebrahimi bracket opcodes such as BRA and CBRA, as this is the place where they get
7427*22dc650dSSadaf Ebrahimi converted into the more special varieties such as BRAPOS and SBRA.
7428*22dc650dSSadaf Ebrahimi Originally, PCRE did not allow repetition of assertions, but now it does,
7429*22dc650dSSadaf Ebrahimi for Perl compatibility. */
7430*22dc650dSSadaf Ebrahimi
7431*22dc650dSSadaf Ebrahimi case OP_ASSERT:
7432*22dc650dSSadaf Ebrahimi case OP_ASSERT_NOT:
7433*22dc650dSSadaf Ebrahimi case OP_ASSERT_NA:
7434*22dc650dSSadaf Ebrahimi case OP_ASSERTBACK:
7435*22dc650dSSadaf Ebrahimi case OP_ASSERTBACK_NOT:
7436*22dc650dSSadaf Ebrahimi case OP_ASSERTBACK_NA:
7437*22dc650dSSadaf Ebrahimi case OP_ONCE:
7438*22dc650dSSadaf Ebrahimi case OP_SCRIPT_RUN:
7439*22dc650dSSadaf Ebrahimi case OP_BRA:
7440*22dc650dSSadaf Ebrahimi case OP_CBRA:
7441*22dc650dSSadaf Ebrahimi case OP_COND:
7442*22dc650dSSadaf Ebrahimi {
7443*22dc650dSSadaf Ebrahimi int len = (int)(code - previous);
7444*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *bralink = NULL;
7445*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *brazeroptr = NULL;
7446*22dc650dSSadaf Ebrahimi
7447*22dc650dSSadaf Ebrahimi if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7448*22dc650dSSadaf Ebrahimi goto END_REPEAT;
7449*22dc650dSSadaf Ebrahimi
7450*22dc650dSSadaf Ebrahimi /* Repeating a DEFINE group (or any group where the condition is always
7451*22dc650dSSadaf Ebrahimi FALSE and there is only one branch) is pointless, but Perl allows the
7452*22dc650dSSadaf Ebrahimi syntax, so we just ignore the repeat. */
7453*22dc650dSSadaf Ebrahimi
7454*22dc650dSSadaf Ebrahimi if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7455*22dc650dSSadaf Ebrahimi previous[GET(previous, 1)] != OP_ALT)
7456*22dc650dSSadaf Ebrahimi goto END_REPEAT;
7457*22dc650dSSadaf Ebrahimi
7458*22dc650dSSadaf Ebrahimi /* Perl allows all assertions to be quantified, and when they contain
7459*22dc650dSSadaf Ebrahimi capturing parentheses and/or are optional there are potential uses for
7460*22dc650dSSadaf Ebrahimi this feature. PCRE2 used to force the maximum quantifier to 1 on the
7461*22dc650dSSadaf Ebrahimi invalid grounds that further repetition was never useful. This was
7462*22dc650dSSadaf Ebrahimi always a bit pointless, since an assertion could be wrapped with a
7463*22dc650dSSadaf Ebrahimi repeated group to achieve the effect. General repetition is now
7464*22dc650dSSadaf Ebrahimi permitted, but if the maximum is unlimited it is set to one more than
7465*22dc650dSSadaf Ebrahimi the minimum. */
7466*22dc650dSSadaf Ebrahimi
7467*22dc650dSSadaf Ebrahimi if (op_previous < OP_ONCE) /* Assertion */
7468*22dc650dSSadaf Ebrahimi {
7469*22dc650dSSadaf Ebrahimi if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7470*22dc650dSSadaf Ebrahimi }
7471*22dc650dSSadaf Ebrahimi
7472*22dc650dSSadaf Ebrahimi /* The case of a zero minimum is special because of the need to stick
7473*22dc650dSSadaf Ebrahimi OP_BRAZERO in front of it, and because the group appears once in the
7474*22dc650dSSadaf Ebrahimi data, whereas in other cases it appears the minimum number of times. For
7475*22dc650dSSadaf Ebrahimi this reason, it is simplest to treat this case separately, as otherwise
7476*22dc650dSSadaf Ebrahimi the code gets far too messy. There are several special subcases when the
7477*22dc650dSSadaf Ebrahimi minimum is zero. */
7478*22dc650dSSadaf Ebrahimi
7479*22dc650dSSadaf Ebrahimi if (repeat_min == 0)
7480*22dc650dSSadaf Ebrahimi {
7481*22dc650dSSadaf Ebrahimi /* If the maximum is also zero, we used to just omit the group from
7482*22dc650dSSadaf Ebrahimi the output altogether, like this:
7483*22dc650dSSadaf Ebrahimi
7484*22dc650dSSadaf Ebrahimi ** if (repeat_max == 0)
7485*22dc650dSSadaf Ebrahimi ** {
7486*22dc650dSSadaf Ebrahimi ** code = previous;
7487*22dc650dSSadaf Ebrahimi ** goto END_REPEAT;
7488*22dc650dSSadaf Ebrahimi ** }
7489*22dc650dSSadaf Ebrahimi
7490*22dc650dSSadaf Ebrahimi However, that fails when a group or a subgroup within it is
7491*22dc650dSSadaf Ebrahimi referenced as a subroutine from elsewhere in the pattern, so now we
7492*22dc650dSSadaf Ebrahimi stick in OP_SKIPZERO in front of it so that it is skipped on
7493*22dc650dSSadaf Ebrahimi execution. As we don't have a list of which groups are referenced, we
7494*22dc650dSSadaf Ebrahimi cannot do this selectively.
7495*22dc650dSSadaf Ebrahimi
7496*22dc650dSSadaf Ebrahimi If the maximum is 1 or unlimited, we just have to stick in the
7497*22dc650dSSadaf Ebrahimi BRAZERO and do no more at this point. */
7498*22dc650dSSadaf Ebrahimi
7499*22dc650dSSadaf Ebrahimi if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7500*22dc650dSSadaf Ebrahimi {
7501*22dc650dSSadaf Ebrahimi (void)memmove(previous + 1, previous, CU2BYTES(len));
7502*22dc650dSSadaf Ebrahimi code++;
7503*22dc650dSSadaf Ebrahimi if (repeat_max == 0)
7504*22dc650dSSadaf Ebrahimi {
7505*22dc650dSSadaf Ebrahimi *previous++ = OP_SKIPZERO;
7506*22dc650dSSadaf Ebrahimi goto END_REPEAT;
7507*22dc650dSSadaf Ebrahimi }
7508*22dc650dSSadaf Ebrahimi brazeroptr = previous; /* Save for possessive optimizing */
7509*22dc650dSSadaf Ebrahimi *previous++ = OP_BRAZERO + repeat_type;
7510*22dc650dSSadaf Ebrahimi }
7511*22dc650dSSadaf Ebrahimi
7512*22dc650dSSadaf Ebrahimi /* If the maximum is greater than 1 and limited, we have to replicate
7513*22dc650dSSadaf Ebrahimi in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7514*22dc650dSSadaf Ebrahimi The first one has to be handled carefully because it's the original
7515*22dc650dSSadaf Ebrahimi copy, which has to be moved up. The remainder can be handled by code
7516*22dc650dSSadaf Ebrahimi that is common with the non-zero minimum case below. We have to
7517*22dc650dSSadaf Ebrahimi adjust the value of repeat_max, since one less copy is required. */
7518*22dc650dSSadaf Ebrahimi
7519*22dc650dSSadaf Ebrahimi else
7520*22dc650dSSadaf Ebrahimi {
7521*22dc650dSSadaf Ebrahimi int linkoffset;
7522*22dc650dSSadaf Ebrahimi (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7523*22dc650dSSadaf Ebrahimi code += 2 + LINK_SIZE;
7524*22dc650dSSadaf Ebrahimi *previous++ = OP_BRAZERO + repeat_type;
7525*22dc650dSSadaf Ebrahimi *previous++ = OP_BRA;
7526*22dc650dSSadaf Ebrahimi
7527*22dc650dSSadaf Ebrahimi /* We chain together the bracket link offset fields that have to be
7528*22dc650dSSadaf Ebrahimi filled in later when the ends of the brackets are reached. */
7529*22dc650dSSadaf Ebrahimi
7530*22dc650dSSadaf Ebrahimi linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7531*22dc650dSSadaf Ebrahimi bralink = previous;
7532*22dc650dSSadaf Ebrahimi PUTINC(previous, 0, linkoffset);
7533*22dc650dSSadaf Ebrahimi }
7534*22dc650dSSadaf Ebrahimi
7535*22dc650dSSadaf Ebrahimi if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7536*22dc650dSSadaf Ebrahimi }
7537*22dc650dSSadaf Ebrahimi
7538*22dc650dSSadaf Ebrahimi /* If the minimum is greater than zero, replicate the group as many
7539*22dc650dSSadaf Ebrahimi times as necessary, and adjust the maximum to the number of subsequent
7540*22dc650dSSadaf Ebrahimi copies that we need. */
7541*22dc650dSSadaf Ebrahimi
7542*22dc650dSSadaf Ebrahimi else
7543*22dc650dSSadaf Ebrahimi {
7544*22dc650dSSadaf Ebrahimi if (repeat_min > 1)
7545*22dc650dSSadaf Ebrahimi {
7546*22dc650dSSadaf Ebrahimi /* In the pre-compile phase, we don't actually do the replication.
7547*22dc650dSSadaf Ebrahimi We just adjust the length as if we had. Do some paranoid checks for
7548*22dc650dSSadaf Ebrahimi potential integer overflow. */
7549*22dc650dSSadaf Ebrahimi
7550*22dc650dSSadaf Ebrahimi if (lengthptr != NULL)
7551*22dc650dSSadaf Ebrahimi {
7552*22dc650dSSadaf Ebrahimi PCRE2_SIZE delta;
7553*22dc650dSSadaf Ebrahimi if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7554*22dc650dSSadaf Ebrahimi (int)length_prevgroup) ||
7555*22dc650dSSadaf Ebrahimi OFLOW_MAX - *lengthptr < delta)
7556*22dc650dSSadaf Ebrahimi {
7557*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR20;
7558*22dc650dSSadaf Ebrahimi return 0;
7559*22dc650dSSadaf Ebrahimi }
7560*22dc650dSSadaf Ebrahimi *lengthptr += delta;
7561*22dc650dSSadaf Ebrahimi }
7562*22dc650dSSadaf Ebrahimi
7563*22dc650dSSadaf Ebrahimi /* This is compiling for real. If there is a set first code unit
7564*22dc650dSSadaf Ebrahimi for the group, and we have not yet set a "required code unit", set
7565*22dc650dSSadaf Ebrahimi it. */
7566*22dc650dSSadaf Ebrahimi
7567*22dc650dSSadaf Ebrahimi else
7568*22dc650dSSadaf Ebrahimi {
7569*22dc650dSSadaf Ebrahimi if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7570*22dc650dSSadaf Ebrahimi {
7571*22dc650dSSadaf Ebrahimi reqcu = firstcu;
7572*22dc650dSSadaf Ebrahimi reqcuflags = firstcuflags;
7573*22dc650dSSadaf Ebrahimi }
7574*22dc650dSSadaf Ebrahimi for (uint32_t i = 1; i < repeat_min; i++)
7575*22dc650dSSadaf Ebrahimi {
7576*22dc650dSSadaf Ebrahimi memcpy(code, previous, CU2BYTES(len));
7577*22dc650dSSadaf Ebrahimi code += len;
7578*22dc650dSSadaf Ebrahimi }
7579*22dc650dSSadaf Ebrahimi }
7580*22dc650dSSadaf Ebrahimi }
7581*22dc650dSSadaf Ebrahimi
7582*22dc650dSSadaf Ebrahimi if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7583*22dc650dSSadaf Ebrahimi }
7584*22dc650dSSadaf Ebrahimi
7585*22dc650dSSadaf Ebrahimi /* This code is common to both the zero and non-zero minimum cases. If
7586*22dc650dSSadaf Ebrahimi the maximum is limited, it replicates the group in a nested fashion,
7587*22dc650dSSadaf Ebrahimi remembering the bracket starts on a stack. In the case of a zero
7588*22dc650dSSadaf Ebrahimi minimum, the first one was set up above. In all cases the repeat_max
7589*22dc650dSSadaf Ebrahimi now specifies the number of additional copies needed. Again, we must
7590*22dc650dSSadaf Ebrahimi remember to replicate entries on the forward reference list. */
7591*22dc650dSSadaf Ebrahimi
7592*22dc650dSSadaf Ebrahimi if (repeat_max != REPEAT_UNLIMITED)
7593*22dc650dSSadaf Ebrahimi {
7594*22dc650dSSadaf Ebrahimi /* In the pre-compile phase, we don't actually do the replication. We
7595*22dc650dSSadaf Ebrahimi just adjust the length as if we had. For each repetition we must add
7596*22dc650dSSadaf Ebrahimi 1 to the length for BRAZERO and for all but the last repetition we
7597*22dc650dSSadaf Ebrahimi must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7598*22dc650dSSadaf Ebrahimi paranoid checks to avoid integer overflow. */
7599*22dc650dSSadaf Ebrahimi
7600*22dc650dSSadaf Ebrahimi if (lengthptr != NULL && repeat_max > 0)
7601*22dc650dSSadaf Ebrahimi {
7602*22dc650dSSadaf Ebrahimi PCRE2_SIZE delta;
7603*22dc650dSSadaf Ebrahimi if (PRIV(ckd_smul)(&delta, repeat_max,
7604*22dc650dSSadaf Ebrahimi (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7605*22dc650dSSadaf Ebrahimi OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7606*22dc650dSSadaf Ebrahimi {
7607*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR20;
7608*22dc650dSSadaf Ebrahimi return 0;
7609*22dc650dSSadaf Ebrahimi }
7610*22dc650dSSadaf Ebrahimi delta -= (2 + 2*LINK_SIZE); /* Last one doesn't nest */
7611*22dc650dSSadaf Ebrahimi *lengthptr += delta;
7612*22dc650dSSadaf Ebrahimi }
7613*22dc650dSSadaf Ebrahimi
7614*22dc650dSSadaf Ebrahimi /* This is compiling for real */
7615*22dc650dSSadaf Ebrahimi
7616*22dc650dSSadaf Ebrahimi else for (uint32_t i = repeat_max; i >= 1; i--)
7617*22dc650dSSadaf Ebrahimi {
7618*22dc650dSSadaf Ebrahimi *code++ = OP_BRAZERO + repeat_type;
7619*22dc650dSSadaf Ebrahimi
7620*22dc650dSSadaf Ebrahimi /* All but the final copy start a new nesting, maintaining the
7621*22dc650dSSadaf Ebrahimi chain of brackets outstanding. */
7622*22dc650dSSadaf Ebrahimi
7623*22dc650dSSadaf Ebrahimi if (i != 1)
7624*22dc650dSSadaf Ebrahimi {
7625*22dc650dSSadaf Ebrahimi int linkoffset;
7626*22dc650dSSadaf Ebrahimi *code++ = OP_BRA;
7627*22dc650dSSadaf Ebrahimi linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7628*22dc650dSSadaf Ebrahimi bralink = code;
7629*22dc650dSSadaf Ebrahimi PUTINC(code, 0, linkoffset);
7630*22dc650dSSadaf Ebrahimi }
7631*22dc650dSSadaf Ebrahimi
7632*22dc650dSSadaf Ebrahimi memcpy(code, previous, CU2BYTES(len));
7633*22dc650dSSadaf Ebrahimi code += len;
7634*22dc650dSSadaf Ebrahimi }
7635*22dc650dSSadaf Ebrahimi
7636*22dc650dSSadaf Ebrahimi /* Now chain through the pending brackets, and fill in their length
7637*22dc650dSSadaf Ebrahimi fields (which are holding the chain links pro tem). */
7638*22dc650dSSadaf Ebrahimi
7639*22dc650dSSadaf Ebrahimi while (bralink != NULL)
7640*22dc650dSSadaf Ebrahimi {
7641*22dc650dSSadaf Ebrahimi int oldlinkoffset;
7642*22dc650dSSadaf Ebrahimi int linkoffset = (int)(code - bralink + 1);
7643*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *bra = code - linkoffset;
7644*22dc650dSSadaf Ebrahimi oldlinkoffset = GET(bra, 1);
7645*22dc650dSSadaf Ebrahimi bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7646*22dc650dSSadaf Ebrahimi *code++ = OP_KET;
7647*22dc650dSSadaf Ebrahimi PUTINC(code, 0, linkoffset);
7648*22dc650dSSadaf Ebrahimi PUT(bra, 1, linkoffset);
7649*22dc650dSSadaf Ebrahimi }
7650*22dc650dSSadaf Ebrahimi }
7651*22dc650dSSadaf Ebrahimi
7652*22dc650dSSadaf Ebrahimi /* If the maximum is unlimited, set a repeater in the final copy. For
7653*22dc650dSSadaf Ebrahimi SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7654*22dc650dSSadaf Ebrahimi possessively repeated ONCE brackets can be converted into non-capturing
7655*22dc650dSSadaf Ebrahimi brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7656*22dc650dSSadaf Ebrahimi saves having to deal with possessive ONCEs specially.
7657*22dc650dSSadaf Ebrahimi
7658*22dc650dSSadaf Ebrahimi Otherwise, when we are doing the actual compile phase, check to see
7659*22dc650dSSadaf Ebrahimi whether this group is one that could match an empty string. If so,
7660*22dc650dSSadaf Ebrahimi convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7661*22dc650dSSadaf Ebrahimi that runtime checking can be done. [This check is also applied to ONCE
7662*22dc650dSSadaf Ebrahimi and SCRIPT_RUN groups at runtime, but in a different way.]
7663*22dc650dSSadaf Ebrahimi
7664*22dc650dSSadaf Ebrahimi Then, if the quantifier was possessive and the bracket is not a
7665*22dc650dSSadaf Ebrahimi conditional, we convert the BRA code to the POS form, and the KET code
7666*22dc650dSSadaf Ebrahimi to KETRPOS. (It turns out to be convenient at runtime to detect this
7667*22dc650dSSadaf Ebrahimi kind of subpattern at both the start and at the end.) The use of
7668*22dc650dSSadaf Ebrahimi special opcodes makes it possible to reduce greatly the stack usage in
7669*22dc650dSSadaf Ebrahimi pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7670*22dc650dSSadaf Ebrahimi OP_BRAPOSZERO.
7671*22dc650dSSadaf Ebrahimi
7672*22dc650dSSadaf Ebrahimi Then, if the minimum number of matches is 1 or 0, cancel the possessive
7673*22dc650dSSadaf Ebrahimi flag so that the default action below, of wrapping everything inside
7674*22dc650dSSadaf Ebrahimi atomic brackets, does not happen. When the minimum is greater than 1,
7675*22dc650dSSadaf Ebrahimi there will be earlier copies of the group, and so we still have to wrap
7676*22dc650dSSadaf Ebrahimi the whole thing. */
7677*22dc650dSSadaf Ebrahimi
7678*22dc650dSSadaf Ebrahimi else
7679*22dc650dSSadaf Ebrahimi {
7680*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7681*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7682*22dc650dSSadaf Ebrahimi
7683*22dc650dSSadaf Ebrahimi /* Convert possessive ONCE brackets to non-capturing */
7684*22dc650dSSadaf Ebrahimi
7685*22dc650dSSadaf Ebrahimi if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7686*22dc650dSSadaf Ebrahimi
7687*22dc650dSSadaf Ebrahimi /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7688*22dc650dSSadaf Ebrahimi to do is to set the KET. */
7689*22dc650dSSadaf Ebrahimi
7690*22dc650dSSadaf Ebrahimi if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7691*22dc650dSSadaf Ebrahimi *ketcode = OP_KETRMAX + repeat_type;
7692*22dc650dSSadaf Ebrahimi
7693*22dc650dSSadaf Ebrahimi /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7694*22dc650dSSadaf Ebrahimi (which have been converted to non-capturing above). */
7695*22dc650dSSadaf Ebrahimi
7696*22dc650dSSadaf Ebrahimi else
7697*22dc650dSSadaf Ebrahimi {
7698*22dc650dSSadaf Ebrahimi /* In the compile phase, adjust the opcode if the group can match
7699*22dc650dSSadaf Ebrahimi an empty string. For a conditional group with only one branch, the
7700*22dc650dSSadaf Ebrahimi value of group_return will not show "could be empty", so we must
7701*22dc650dSSadaf Ebrahimi check that separately. */
7702*22dc650dSSadaf Ebrahimi
7703*22dc650dSSadaf Ebrahimi if (lengthptr == NULL)
7704*22dc650dSSadaf Ebrahimi {
7705*22dc650dSSadaf Ebrahimi if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7706*22dc650dSSadaf Ebrahimi if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7707*22dc650dSSadaf Ebrahimi *bracode = OP_SCOND;
7708*22dc650dSSadaf Ebrahimi }
7709*22dc650dSSadaf Ebrahimi
7710*22dc650dSSadaf Ebrahimi /* Handle possessive quantifiers. */
7711*22dc650dSSadaf Ebrahimi
7712*22dc650dSSadaf Ebrahimi if (possessive_quantifier)
7713*22dc650dSSadaf Ebrahimi {
7714*22dc650dSSadaf Ebrahimi /* For COND brackets, we wrap the whole thing in a possessively
7715*22dc650dSSadaf Ebrahimi repeated non-capturing bracket, because we have not invented POS
7716*22dc650dSSadaf Ebrahimi versions of the COND opcodes. */
7717*22dc650dSSadaf Ebrahimi
7718*22dc650dSSadaf Ebrahimi if (*bracode == OP_COND || *bracode == OP_SCOND)
7719*22dc650dSSadaf Ebrahimi {
7720*22dc650dSSadaf Ebrahimi int nlen = (int)(code - bracode);
7721*22dc650dSSadaf Ebrahimi (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7722*22dc650dSSadaf Ebrahimi code += 1 + LINK_SIZE;
7723*22dc650dSSadaf Ebrahimi nlen += 1 + LINK_SIZE;
7724*22dc650dSSadaf Ebrahimi *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7725*22dc650dSSadaf Ebrahimi *code++ = OP_KETRPOS;
7726*22dc650dSSadaf Ebrahimi PUTINC(code, 0, nlen);
7727*22dc650dSSadaf Ebrahimi PUT(bracode, 1, nlen);
7728*22dc650dSSadaf Ebrahimi }
7729*22dc650dSSadaf Ebrahimi
7730*22dc650dSSadaf Ebrahimi /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7731*22dc650dSSadaf Ebrahimi
7732*22dc650dSSadaf Ebrahimi else
7733*22dc650dSSadaf Ebrahimi {
7734*22dc650dSSadaf Ebrahimi *bracode += 1; /* Switch to xxxPOS opcodes */
7735*22dc650dSSadaf Ebrahimi *ketcode = OP_KETRPOS;
7736*22dc650dSSadaf Ebrahimi }
7737*22dc650dSSadaf Ebrahimi
7738*22dc650dSSadaf Ebrahimi /* If the minimum is zero, mark it as possessive, then unset the
7739*22dc650dSSadaf Ebrahimi possessive flag when the minimum is 0 or 1. */
7740*22dc650dSSadaf Ebrahimi
7741*22dc650dSSadaf Ebrahimi if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7742*22dc650dSSadaf Ebrahimi if (repeat_min < 2) possessive_quantifier = FALSE;
7743*22dc650dSSadaf Ebrahimi }
7744*22dc650dSSadaf Ebrahimi
7745*22dc650dSSadaf Ebrahimi /* Non-possessive quantifier */
7746*22dc650dSSadaf Ebrahimi
7747*22dc650dSSadaf Ebrahimi else *ketcode = OP_KETRMAX + repeat_type;
7748*22dc650dSSadaf Ebrahimi }
7749*22dc650dSSadaf Ebrahimi }
7750*22dc650dSSadaf Ebrahimi }
7751*22dc650dSSadaf Ebrahimi break;
7752*22dc650dSSadaf Ebrahimi
7753*22dc650dSSadaf Ebrahimi /* If previous was a character type match (\d or similar), abolish it and
7754*22dc650dSSadaf Ebrahimi create a suitable repeat item. The code is shared with single-character
7755*22dc650dSSadaf Ebrahimi repeats by setting op_type to add a suitable offset into repeat_type.
7756*22dc650dSSadaf Ebrahimi Note that the Unicode property types will be present only when
7757*22dc650dSSadaf Ebrahimi SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7758*22dc650dSSadaf Ebrahimi here because it just makes it horribly messy. */
7759*22dc650dSSadaf Ebrahimi
7760*22dc650dSSadaf Ebrahimi default:
7761*22dc650dSSadaf Ebrahimi if (op_previous >= OP_EODN) /* Not a character type - internal error */
7762*22dc650dSSadaf Ebrahimi {
7763*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR10;
7764*22dc650dSSadaf Ebrahimi return 0;
7765*22dc650dSSadaf Ebrahimi }
7766*22dc650dSSadaf Ebrahimi else
7767*22dc650dSSadaf Ebrahimi {
7768*22dc650dSSadaf Ebrahimi int prop_type, prop_value;
7769*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *oldcode;
7770*22dc650dSSadaf Ebrahimi
7771*22dc650dSSadaf Ebrahimi if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7772*22dc650dSSadaf Ebrahimi
7773*22dc650dSSadaf Ebrahimi op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
7774*22dc650dSSadaf Ebrahimi mclength = 0; /* Not a character */
7775*22dc650dSSadaf Ebrahimi
7776*22dc650dSSadaf Ebrahimi if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7777*22dc650dSSadaf Ebrahimi {
7778*22dc650dSSadaf Ebrahimi prop_type = previous[1];
7779*22dc650dSSadaf Ebrahimi prop_value = previous[2];
7780*22dc650dSSadaf Ebrahimi }
7781*22dc650dSSadaf Ebrahimi else
7782*22dc650dSSadaf Ebrahimi {
7783*22dc650dSSadaf Ebrahimi /* Come here from just above with a character in mcbuffer/mclength. */
7784*22dc650dSSadaf Ebrahimi OUTPUT_SINGLE_REPEAT:
7785*22dc650dSSadaf Ebrahimi prop_type = prop_value = -1;
7786*22dc650dSSadaf Ebrahimi }
7787*22dc650dSSadaf Ebrahimi
7788*22dc650dSSadaf Ebrahimi /* At this point, if prop_type == prop_value == -1 we either have a
7789*22dc650dSSadaf Ebrahimi character in mcbuffer when mclength is greater than zero, or we have
7790*22dc650dSSadaf Ebrahimi mclength zero, in which case there is a non-property character type in
7791*22dc650dSSadaf Ebrahimi op_previous. If prop_type/value are not negative, we have a property
7792*22dc650dSSadaf Ebrahimi character type in op_previous. */
7793*22dc650dSSadaf Ebrahimi
7794*22dc650dSSadaf Ebrahimi oldcode = code; /* Save where we were */
7795*22dc650dSSadaf Ebrahimi code = previous; /* Usually overwrite previous item */
7796*22dc650dSSadaf Ebrahimi
7797*22dc650dSSadaf Ebrahimi /* If the maximum is zero then the minimum must also be zero; Perl allows
7798*22dc650dSSadaf Ebrahimi this case, so we do too - by simply omitting the item altogether. */
7799*22dc650dSSadaf Ebrahimi
7800*22dc650dSSadaf Ebrahimi if (repeat_max == 0) goto END_REPEAT;
7801*22dc650dSSadaf Ebrahimi
7802*22dc650dSSadaf Ebrahimi /* Combine the op_type with the repeat_type */
7803*22dc650dSSadaf Ebrahimi
7804*22dc650dSSadaf Ebrahimi repeat_type += op_type;
7805*22dc650dSSadaf Ebrahimi
7806*22dc650dSSadaf Ebrahimi /* A minimum of zero is handled either as the special case * or ?, or as
7807*22dc650dSSadaf Ebrahimi an UPTO, with the maximum given. */
7808*22dc650dSSadaf Ebrahimi
7809*22dc650dSSadaf Ebrahimi if (repeat_min == 0)
7810*22dc650dSSadaf Ebrahimi {
7811*22dc650dSSadaf Ebrahimi if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7812*22dc650dSSadaf Ebrahimi else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7813*22dc650dSSadaf Ebrahimi else
7814*22dc650dSSadaf Ebrahimi {
7815*22dc650dSSadaf Ebrahimi *code++ = OP_UPTO + repeat_type;
7816*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, repeat_max);
7817*22dc650dSSadaf Ebrahimi }
7818*22dc650dSSadaf Ebrahimi }
7819*22dc650dSSadaf Ebrahimi
7820*22dc650dSSadaf Ebrahimi /* A repeat minimum of 1 is optimized into some special cases. If the
7821*22dc650dSSadaf Ebrahimi maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7822*22dc650dSSadaf Ebrahimi left in place and, if the maximum is greater than 1, we use OP_UPTO with
7823*22dc650dSSadaf Ebrahimi one less than the maximum. */
7824*22dc650dSSadaf Ebrahimi
7825*22dc650dSSadaf Ebrahimi else if (repeat_min == 1)
7826*22dc650dSSadaf Ebrahimi {
7827*22dc650dSSadaf Ebrahimi if (repeat_max == REPEAT_UNLIMITED)
7828*22dc650dSSadaf Ebrahimi *code++ = OP_PLUS + repeat_type;
7829*22dc650dSSadaf Ebrahimi else
7830*22dc650dSSadaf Ebrahimi {
7831*22dc650dSSadaf Ebrahimi code = oldcode; /* Leave previous item in place */
7832*22dc650dSSadaf Ebrahimi if (repeat_max == 1) goto END_REPEAT;
7833*22dc650dSSadaf Ebrahimi *code++ = OP_UPTO + repeat_type;
7834*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, repeat_max - 1);
7835*22dc650dSSadaf Ebrahimi }
7836*22dc650dSSadaf Ebrahimi }
7837*22dc650dSSadaf Ebrahimi
7838*22dc650dSSadaf Ebrahimi /* The case {n,n} is just an EXACT, while the general case {n,m} is
7839*22dc650dSSadaf Ebrahimi handled as an EXACT followed by an UPTO or STAR or QUERY. */
7840*22dc650dSSadaf Ebrahimi
7841*22dc650dSSadaf Ebrahimi else
7842*22dc650dSSadaf Ebrahimi {
7843*22dc650dSSadaf Ebrahimi *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7844*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, repeat_min);
7845*22dc650dSSadaf Ebrahimi
7846*22dc650dSSadaf Ebrahimi /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7847*22dc650dSSadaf Ebrahimi and then generate the second opcode. For a repeated Unicode property
7848*22dc650dSSadaf Ebrahimi match, there are two extra values that define the required property,
7849*22dc650dSSadaf Ebrahimi and mclength is set zero to indicate this. */
7850*22dc650dSSadaf Ebrahimi
7851*22dc650dSSadaf Ebrahimi if (repeat_max != repeat_min)
7852*22dc650dSSadaf Ebrahimi {
7853*22dc650dSSadaf Ebrahimi if (mclength > 0)
7854*22dc650dSSadaf Ebrahimi {
7855*22dc650dSSadaf Ebrahimi memcpy(code, mcbuffer, CU2BYTES(mclength));
7856*22dc650dSSadaf Ebrahimi code += mclength;
7857*22dc650dSSadaf Ebrahimi }
7858*22dc650dSSadaf Ebrahimi else
7859*22dc650dSSadaf Ebrahimi {
7860*22dc650dSSadaf Ebrahimi *code++ = op_previous;
7861*22dc650dSSadaf Ebrahimi if (prop_type >= 0)
7862*22dc650dSSadaf Ebrahimi {
7863*22dc650dSSadaf Ebrahimi *code++ = prop_type;
7864*22dc650dSSadaf Ebrahimi *code++ = prop_value;
7865*22dc650dSSadaf Ebrahimi }
7866*22dc650dSSadaf Ebrahimi }
7867*22dc650dSSadaf Ebrahimi
7868*22dc650dSSadaf Ebrahimi /* Now set up the following opcode */
7869*22dc650dSSadaf Ebrahimi
7870*22dc650dSSadaf Ebrahimi if (repeat_max == REPEAT_UNLIMITED)
7871*22dc650dSSadaf Ebrahimi *code++ = OP_STAR + repeat_type;
7872*22dc650dSSadaf Ebrahimi else
7873*22dc650dSSadaf Ebrahimi {
7874*22dc650dSSadaf Ebrahimi repeat_max -= repeat_min;
7875*22dc650dSSadaf Ebrahimi if (repeat_max == 1)
7876*22dc650dSSadaf Ebrahimi {
7877*22dc650dSSadaf Ebrahimi *code++ = OP_QUERY + repeat_type;
7878*22dc650dSSadaf Ebrahimi }
7879*22dc650dSSadaf Ebrahimi else
7880*22dc650dSSadaf Ebrahimi {
7881*22dc650dSSadaf Ebrahimi *code++ = OP_UPTO + repeat_type;
7882*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, repeat_max);
7883*22dc650dSSadaf Ebrahimi }
7884*22dc650dSSadaf Ebrahimi }
7885*22dc650dSSadaf Ebrahimi }
7886*22dc650dSSadaf Ebrahimi }
7887*22dc650dSSadaf Ebrahimi
7888*22dc650dSSadaf Ebrahimi /* Fill in the character or character type for the final opcode. */
7889*22dc650dSSadaf Ebrahimi
7890*22dc650dSSadaf Ebrahimi if (mclength > 0)
7891*22dc650dSSadaf Ebrahimi {
7892*22dc650dSSadaf Ebrahimi memcpy(code, mcbuffer, CU2BYTES(mclength));
7893*22dc650dSSadaf Ebrahimi code += mclength;
7894*22dc650dSSadaf Ebrahimi }
7895*22dc650dSSadaf Ebrahimi else
7896*22dc650dSSadaf Ebrahimi {
7897*22dc650dSSadaf Ebrahimi *code++ = op_previous;
7898*22dc650dSSadaf Ebrahimi if (prop_type >= 0)
7899*22dc650dSSadaf Ebrahimi {
7900*22dc650dSSadaf Ebrahimi *code++ = prop_type;
7901*22dc650dSSadaf Ebrahimi *code++ = prop_value;
7902*22dc650dSSadaf Ebrahimi }
7903*22dc650dSSadaf Ebrahimi }
7904*22dc650dSSadaf Ebrahimi }
7905*22dc650dSSadaf Ebrahimi break;
7906*22dc650dSSadaf Ebrahimi } /* End of switch on different op_previous values */
7907*22dc650dSSadaf Ebrahimi
7908*22dc650dSSadaf Ebrahimi
7909*22dc650dSSadaf Ebrahimi /* If the character following a repeat is '+', possessive_quantifier is
7910*22dc650dSSadaf Ebrahimi TRUE. For some opcodes, there are special alternative opcodes for this
7911*22dc650dSSadaf Ebrahimi case. For anything else, we wrap the entire repeated item inside OP_ONCE
7912*22dc650dSSadaf Ebrahimi brackets. Logically, the '+' notation is just syntactic sugar, taken from
7913*22dc650dSSadaf Ebrahimi Sun's Java package, but the special opcodes can optimize it.
7914*22dc650dSSadaf Ebrahimi
7915*22dc650dSSadaf Ebrahimi Some (but not all) possessively repeated subpatterns have already been
7916*22dc650dSSadaf Ebrahimi completely handled in the code just above. For them, possessive_quantifier
7917*22dc650dSSadaf Ebrahimi is always FALSE at this stage. Note that the repeated item starts at
7918*22dc650dSSadaf Ebrahimi tempcode, not at previous, which might be the first part of a string whose
7919*22dc650dSSadaf Ebrahimi (former) last char we repeated. */
7920*22dc650dSSadaf Ebrahimi
7921*22dc650dSSadaf Ebrahimi if (possessive_quantifier)
7922*22dc650dSSadaf Ebrahimi {
7923*22dc650dSSadaf Ebrahimi int len;
7924*22dc650dSSadaf Ebrahimi
7925*22dc650dSSadaf Ebrahimi /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7926*22dc650dSSadaf Ebrahimi However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7927*22dc650dSSadaf Ebrahimi {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7928*22dc650dSSadaf Ebrahimi remains is greater than zero, there's a further opcode that can be
7929*22dc650dSSadaf Ebrahimi handled. If not, do nothing, leaving the EXACT alone. */
7930*22dc650dSSadaf Ebrahimi
7931*22dc650dSSadaf Ebrahimi switch(*tempcode)
7932*22dc650dSSadaf Ebrahimi {
7933*22dc650dSSadaf Ebrahimi case OP_TYPEEXACT:
7934*22dc650dSSadaf Ebrahimi tempcode += PRIV(OP_lengths)[*tempcode] +
7935*22dc650dSSadaf Ebrahimi ((tempcode[1 + IMM2_SIZE] == OP_PROP
7936*22dc650dSSadaf Ebrahimi || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7937*22dc650dSSadaf Ebrahimi break;
7938*22dc650dSSadaf Ebrahimi
7939*22dc650dSSadaf Ebrahimi /* CHAR opcodes are used for exacts whose count is 1. */
7940*22dc650dSSadaf Ebrahimi
7941*22dc650dSSadaf Ebrahimi case OP_CHAR:
7942*22dc650dSSadaf Ebrahimi case OP_CHARI:
7943*22dc650dSSadaf Ebrahimi case OP_NOT:
7944*22dc650dSSadaf Ebrahimi case OP_NOTI:
7945*22dc650dSSadaf Ebrahimi case OP_EXACT:
7946*22dc650dSSadaf Ebrahimi case OP_EXACTI:
7947*22dc650dSSadaf Ebrahimi case OP_NOTEXACT:
7948*22dc650dSSadaf Ebrahimi case OP_NOTEXACTI:
7949*22dc650dSSadaf Ebrahimi tempcode += PRIV(OP_lengths)[*tempcode];
7950*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
7951*22dc650dSSadaf Ebrahimi if (utf && HAS_EXTRALEN(tempcode[-1]))
7952*22dc650dSSadaf Ebrahimi tempcode += GET_EXTRALEN(tempcode[-1]);
7953*22dc650dSSadaf Ebrahimi #endif
7954*22dc650dSSadaf Ebrahimi break;
7955*22dc650dSSadaf Ebrahimi
7956*22dc650dSSadaf Ebrahimi /* For the class opcodes, the repeat operator appears at the end;
7957*22dc650dSSadaf Ebrahimi adjust tempcode to point to it. */
7958*22dc650dSSadaf Ebrahimi
7959*22dc650dSSadaf Ebrahimi case OP_CLASS:
7960*22dc650dSSadaf Ebrahimi case OP_NCLASS:
7961*22dc650dSSadaf Ebrahimi tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7962*22dc650dSSadaf Ebrahimi break;
7963*22dc650dSSadaf Ebrahimi
7964*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
7965*22dc650dSSadaf Ebrahimi case OP_XCLASS:
7966*22dc650dSSadaf Ebrahimi tempcode += GET(tempcode, 1);
7967*22dc650dSSadaf Ebrahimi break;
7968*22dc650dSSadaf Ebrahimi #endif
7969*22dc650dSSadaf Ebrahimi }
7970*22dc650dSSadaf Ebrahimi
7971*22dc650dSSadaf Ebrahimi /* If tempcode is equal to code (which points to the end of the repeated
7972*22dc650dSSadaf Ebrahimi item), it means we have skipped an EXACT item but there is no following
7973*22dc650dSSadaf Ebrahimi QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7974*22dc650dSSadaf Ebrahimi all other cases, tempcode will be pointing to the repeat opcode, and will
7975*22dc650dSSadaf Ebrahimi be less than code, so the value of len will be greater than 0. */
7976*22dc650dSSadaf Ebrahimi
7977*22dc650dSSadaf Ebrahimi len = (int)(code - tempcode);
7978*22dc650dSSadaf Ebrahimi if (len > 0)
7979*22dc650dSSadaf Ebrahimi {
7980*22dc650dSSadaf Ebrahimi unsigned int repcode = *tempcode;
7981*22dc650dSSadaf Ebrahimi
7982*22dc650dSSadaf Ebrahimi /* There is a table for possessifying opcodes, all of which are less
7983*22dc650dSSadaf Ebrahimi than OP_CALLOUT. A zero entry means there is no possessified version.
7984*22dc650dSSadaf Ebrahimi */
7985*22dc650dSSadaf Ebrahimi
7986*22dc650dSSadaf Ebrahimi if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7987*22dc650dSSadaf Ebrahimi *tempcode = opcode_possessify[repcode];
7988*22dc650dSSadaf Ebrahimi
7989*22dc650dSSadaf Ebrahimi /* For opcode without a special possessified version, wrap the item in
7990*22dc650dSSadaf Ebrahimi ONCE brackets. */
7991*22dc650dSSadaf Ebrahimi
7992*22dc650dSSadaf Ebrahimi else
7993*22dc650dSSadaf Ebrahimi {
7994*22dc650dSSadaf Ebrahimi (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7995*22dc650dSSadaf Ebrahimi code += 1 + LINK_SIZE;
7996*22dc650dSSadaf Ebrahimi len += 1 + LINK_SIZE;
7997*22dc650dSSadaf Ebrahimi tempcode[0] = OP_ONCE;
7998*22dc650dSSadaf Ebrahimi *code++ = OP_KET;
7999*22dc650dSSadaf Ebrahimi PUTINC(code, 0, len);
8000*22dc650dSSadaf Ebrahimi PUT(tempcode, 1, len);
8001*22dc650dSSadaf Ebrahimi }
8002*22dc650dSSadaf Ebrahimi }
8003*22dc650dSSadaf Ebrahimi }
8004*22dc650dSSadaf Ebrahimi
8005*22dc650dSSadaf Ebrahimi /* We set the "follows varying string" flag for subsequently encountered
8006*22dc650dSSadaf Ebrahimi reqcus if it isn't already set and we have just passed a varying length
8007*22dc650dSSadaf Ebrahimi item. */
8008*22dc650dSSadaf Ebrahimi
8009*22dc650dSSadaf Ebrahimi END_REPEAT:
8010*22dc650dSSadaf Ebrahimi cb->req_varyopt |= reqvary;
8011*22dc650dSSadaf Ebrahimi break;
8012*22dc650dSSadaf Ebrahimi
8013*22dc650dSSadaf Ebrahimi
8014*22dc650dSSadaf Ebrahimi /* ===================================================================*/
8015*22dc650dSSadaf Ebrahimi /* Handle a 32-bit data character with a value greater than META_END. */
8016*22dc650dSSadaf Ebrahimi
8017*22dc650dSSadaf Ebrahimi case META_BIGVALUE:
8018*22dc650dSSadaf Ebrahimi pptr++;
8019*22dc650dSSadaf Ebrahimi goto NORMAL_CHAR;
8020*22dc650dSSadaf Ebrahimi
8021*22dc650dSSadaf Ebrahimi
8022*22dc650dSSadaf Ebrahimi /* ===============================================================*/
8023*22dc650dSSadaf Ebrahimi /* Handle a back reference by number, which is the meta argument. The
8024*22dc650dSSadaf Ebrahimi pattern offsets for back references to group numbers less than 10 are held
8025*22dc650dSSadaf Ebrahimi in a special vector, to avoid using more than two parsed pattern elements
8026*22dc650dSSadaf Ebrahimi in 64-bit environments. We only need the offset to the first occurrence,
8027*22dc650dSSadaf Ebrahimi because if that doesn't fail, subsequent ones will also be OK. */
8028*22dc650dSSadaf Ebrahimi
8029*22dc650dSSadaf Ebrahimi case META_BACKREF:
8030*22dc650dSSadaf Ebrahimi if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8031*22dc650dSSadaf Ebrahimi else GETPLUSOFFSET(offset, pptr);
8032*22dc650dSSadaf Ebrahimi
8033*22dc650dSSadaf Ebrahimi if (meta_arg > cb->bracount)
8034*22dc650dSSadaf Ebrahimi {
8035*22dc650dSSadaf Ebrahimi cb->erroroffset = offset;
8036*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR15; /* Non-existent subpattern */
8037*22dc650dSSadaf Ebrahimi return 0;
8038*22dc650dSSadaf Ebrahimi }
8039*22dc650dSSadaf Ebrahimi
8040*22dc650dSSadaf Ebrahimi /* Come here from named backref handling when the reference is to a
8041*22dc650dSSadaf Ebrahimi single group (that is, not to a duplicated name). The back reference
8042*22dc650dSSadaf Ebrahimi data will have already been updated. We must disable firstcu if not
8043*22dc650dSSadaf Ebrahimi set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8044*22dc650dSSadaf Ebrahimi later. */
8045*22dc650dSSadaf Ebrahimi
8046*22dc650dSSadaf Ebrahimi HANDLE_SINGLE_REFERENCE:
8047*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8048*22dc650dSSadaf Ebrahimi *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8049*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, meta_arg);
8050*22dc650dSSadaf Ebrahimi
8051*22dc650dSSadaf Ebrahimi /* Update the map of back references, and keep the highest one. We
8052*22dc650dSSadaf Ebrahimi could do this in parse_regex() for numerical back references, but not
8053*22dc650dSSadaf Ebrahimi for named back references, because we don't know the numbers to which
8054*22dc650dSSadaf Ebrahimi named back references refer. So we do it all in this function. */
8055*22dc650dSSadaf Ebrahimi
8056*22dc650dSSadaf Ebrahimi cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8057*22dc650dSSadaf Ebrahimi if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8058*22dc650dSSadaf Ebrahimi break;
8059*22dc650dSSadaf Ebrahimi
8060*22dc650dSSadaf Ebrahimi
8061*22dc650dSSadaf Ebrahimi /* ===============================================================*/
8062*22dc650dSSadaf Ebrahimi /* Handle recursion by inserting the number of the called group (which is
8063*22dc650dSSadaf Ebrahimi the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8064*22dc650dSSadaf Ebrahimi scanned and these numbers are replaced by offsets within the pattern. It is
8065*22dc650dSSadaf Ebrahimi done like this to avoid problems with forward references and adjusting
8066*22dc650dSSadaf Ebrahimi offsets when groups are duplicated and moved (as discovered in previous
8067*22dc650dSSadaf Ebrahimi implementations). Note that a recursion does not have a set first
8068*22dc650dSSadaf Ebrahimi character. */
8069*22dc650dSSadaf Ebrahimi
8070*22dc650dSSadaf Ebrahimi case META_RECURSE:
8071*22dc650dSSadaf Ebrahimi GETPLUSOFFSET(offset, pptr);
8072*22dc650dSSadaf Ebrahimi if (meta_arg > cb->bracount)
8073*22dc650dSSadaf Ebrahimi {
8074*22dc650dSSadaf Ebrahimi cb->erroroffset = offset;
8075*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR15; /* Non-existent subpattern */
8076*22dc650dSSadaf Ebrahimi return 0;
8077*22dc650dSSadaf Ebrahimi }
8078*22dc650dSSadaf Ebrahimi HANDLE_NUMERICAL_RECURSION:
8079*22dc650dSSadaf Ebrahimi *code = OP_RECURSE;
8080*22dc650dSSadaf Ebrahimi PUT(code, 1, meta_arg);
8081*22dc650dSSadaf Ebrahimi code += 1 + LINK_SIZE;
8082*22dc650dSSadaf Ebrahimi groupsetfirstcu = FALSE;
8083*22dc650dSSadaf Ebrahimi cb->had_recurse = TRUE;
8084*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8085*22dc650dSSadaf Ebrahimi zerofirstcu = firstcu;
8086*22dc650dSSadaf Ebrahimi zerofirstcuflags = firstcuflags;
8087*22dc650dSSadaf Ebrahimi break;
8088*22dc650dSSadaf Ebrahimi
8089*22dc650dSSadaf Ebrahimi
8090*22dc650dSSadaf Ebrahimi /* ===============================================================*/
8091*22dc650dSSadaf Ebrahimi /* Handle capturing parentheses; the number is the meta argument. */
8092*22dc650dSSadaf Ebrahimi
8093*22dc650dSSadaf Ebrahimi case META_CAPTURE:
8094*22dc650dSSadaf Ebrahimi bravalue = OP_CBRA;
8095*22dc650dSSadaf Ebrahimi skipunits = IMM2_SIZE;
8096*22dc650dSSadaf Ebrahimi PUT2(code, 1+LINK_SIZE, meta_arg);
8097*22dc650dSSadaf Ebrahimi cb->lastcapture = meta_arg;
8098*22dc650dSSadaf Ebrahimi goto GROUP_PROCESS_NOTE_EMPTY;
8099*22dc650dSSadaf Ebrahimi
8100*22dc650dSSadaf Ebrahimi
8101*22dc650dSSadaf Ebrahimi /* ===============================================================*/
8102*22dc650dSSadaf Ebrahimi /* Handle escape sequence items. For ones like \d, the ESC_values are
8103*22dc650dSSadaf Ebrahimi arranged to be the same as the corresponding OP_values in the default case
8104*22dc650dSSadaf Ebrahimi when PCRE2_UCP is not set (which is the only case in which they will appear
8105*22dc650dSSadaf Ebrahimi here).
8106*22dc650dSSadaf Ebrahimi
8107*22dc650dSSadaf Ebrahimi Note: \Q and \E are never seen here, as they were dealt with in
8108*22dc650dSSadaf Ebrahimi parse_regex(). Neither are numerical back references or recursions, which
8109*22dc650dSSadaf Ebrahimi were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8110*22dc650dSSadaf Ebrahimi \g, when followed by names, are turned into META_BACKREF_BYNAME or
8111*22dc650dSSadaf Ebrahimi META_RECURSE_BYNAME. */
8112*22dc650dSSadaf Ebrahimi
8113*22dc650dSSadaf Ebrahimi case META_ESCAPE:
8114*22dc650dSSadaf Ebrahimi
8115*22dc650dSSadaf Ebrahimi /* We can test for escape sequences that consume a character because their
8116*22dc650dSSadaf Ebrahimi values lie between ESC_b and ESC_Z; this may have to change if any new ones
8117*22dc650dSSadaf Ebrahimi are ever created. For these sequences, we disable the setting of a first
8118*22dc650dSSadaf Ebrahimi character if it hasn't already been set. */
8119*22dc650dSSadaf Ebrahimi
8120*22dc650dSSadaf Ebrahimi if (meta_arg > ESC_b && meta_arg < ESC_Z)
8121*22dc650dSSadaf Ebrahimi {
8122*22dc650dSSadaf Ebrahimi matched_char = TRUE;
8123*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8124*22dc650dSSadaf Ebrahimi }
8125*22dc650dSSadaf Ebrahimi
8126*22dc650dSSadaf Ebrahimi /* Set values to reset to if this is followed by a zero repeat. */
8127*22dc650dSSadaf Ebrahimi
8128*22dc650dSSadaf Ebrahimi zerofirstcu = firstcu;
8129*22dc650dSSadaf Ebrahimi zerofirstcuflags = firstcuflags;
8130*22dc650dSSadaf Ebrahimi zeroreqcu = reqcu;
8131*22dc650dSSadaf Ebrahimi zeroreqcuflags = reqcuflags;
8132*22dc650dSSadaf Ebrahimi
8133*22dc650dSSadaf Ebrahimi /* If Unicode is not supported, \P and \p are not allowed and are
8134*22dc650dSSadaf Ebrahimi faulted at parse time, so will never appear here. */
8135*22dc650dSSadaf Ebrahimi
8136*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
8137*22dc650dSSadaf Ebrahimi if (meta_arg == ESC_P || meta_arg == ESC_p)
8138*22dc650dSSadaf Ebrahimi {
8139*22dc650dSSadaf Ebrahimi uint32_t ptype = *(++pptr) >> 16;
8140*22dc650dSSadaf Ebrahimi uint32_t pdata = *pptr & 0xffff;
8141*22dc650dSSadaf Ebrahimi
8142*22dc650dSSadaf Ebrahimi /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
8143*22dc650dSSadaf Ebrahimi from the auto-anchoring code. */
8144*22dc650dSSadaf Ebrahimi
8145*22dc650dSSadaf Ebrahimi if (meta_arg == ESC_p && ptype == PT_ANY)
8146*22dc650dSSadaf Ebrahimi {
8147*22dc650dSSadaf Ebrahimi *code++ = OP_ALLANY;
8148*22dc650dSSadaf Ebrahimi }
8149*22dc650dSSadaf Ebrahimi else
8150*22dc650dSSadaf Ebrahimi {
8151*22dc650dSSadaf Ebrahimi *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8152*22dc650dSSadaf Ebrahimi *code++ = ptype;
8153*22dc650dSSadaf Ebrahimi *code++ = pdata;
8154*22dc650dSSadaf Ebrahimi }
8155*22dc650dSSadaf Ebrahimi break; /* End META_ESCAPE */
8156*22dc650dSSadaf Ebrahimi }
8157*22dc650dSSadaf Ebrahimi #endif
8158*22dc650dSSadaf Ebrahimi
8159*22dc650dSSadaf Ebrahimi /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8160*22dc650dSSadaf Ebrahimi done. However, there's an option, in case anyone was relying on it. */
8161*22dc650dSSadaf Ebrahimi
8162*22dc650dSSadaf Ebrahimi if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8163*22dc650dSSadaf Ebrahimi (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8164*22dc650dSSadaf Ebrahimi {
8165*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR99;
8166*22dc650dSSadaf Ebrahimi return 0;
8167*22dc650dSSadaf Ebrahimi }
8168*22dc650dSSadaf Ebrahimi
8169*22dc650dSSadaf Ebrahimi /* For the rest (including \X when Unicode is supported - if not it's
8170*22dc650dSSadaf Ebrahimi faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8171*22dc650dSSadaf Ebrahimi not set; if it is set, most of them do not show up here because they are
8172*22dc650dSSadaf Ebrahimi converted into Unicode property tests in parse_regex().
8173*22dc650dSSadaf Ebrahimi
8174*22dc650dSSadaf Ebrahimi In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8175*22dc650dSSadaf Ebrahimi instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8176*22dc650dSSadaf Ebrahimi There are special UCP codes for \B and \b which are used in UCP mode unless
8177*22dc650dSSadaf Ebrahimi "word" matching is being forced to ASCII.
8178*22dc650dSSadaf Ebrahimi
8179*22dc650dSSadaf Ebrahimi Note that \b and \B do a one-character lookbehind, and \A also behaves as
8180*22dc650dSSadaf Ebrahimi if it does. */
8181*22dc650dSSadaf Ebrahimi
8182*22dc650dSSadaf Ebrahimi switch(meta_arg)
8183*22dc650dSSadaf Ebrahimi {
8184*22dc650dSSadaf Ebrahimi case ESC_C:
8185*22dc650dSSadaf Ebrahimi cb->external_flags |= PCRE2_HASBKC; /* Record */
8186*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
8187*22dc650dSSadaf Ebrahimi meta_arg = OP_ALLANY;
8188*22dc650dSSadaf Ebrahimi #else
8189*22dc650dSSadaf Ebrahimi if (!utf) meta_arg = OP_ALLANY;
8190*22dc650dSSadaf Ebrahimi #endif
8191*22dc650dSSadaf Ebrahimi break;
8192*22dc650dSSadaf Ebrahimi
8193*22dc650dSSadaf Ebrahimi case ESC_B:
8194*22dc650dSSadaf Ebrahimi case ESC_b:
8195*22dc650dSSadaf Ebrahimi if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8196*22dc650dSSadaf Ebrahimi meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8197*22dc650dSSadaf Ebrahimi OP_UCP_WORD_BOUNDARY;
8198*22dc650dSSadaf Ebrahimi /* Fall through */
8199*22dc650dSSadaf Ebrahimi
8200*22dc650dSSadaf Ebrahimi case ESC_A:
8201*22dc650dSSadaf Ebrahimi if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8202*22dc650dSSadaf Ebrahimi break;
8203*22dc650dSSadaf Ebrahimi }
8204*22dc650dSSadaf Ebrahimi
8205*22dc650dSSadaf Ebrahimi *code++ = meta_arg;
8206*22dc650dSSadaf Ebrahimi break; /* End META_ESCAPE */
8207*22dc650dSSadaf Ebrahimi
8208*22dc650dSSadaf Ebrahimi
8209*22dc650dSSadaf Ebrahimi /* ===================================================================*/
8210*22dc650dSSadaf Ebrahimi /* Handle an unrecognized meta value. A parsed pattern value less than
8211*22dc650dSSadaf Ebrahimi META_END is a literal. Otherwise we have a problem. */
8212*22dc650dSSadaf Ebrahimi
8213*22dc650dSSadaf Ebrahimi default:
8214*22dc650dSSadaf Ebrahimi if (meta >= META_END)
8215*22dc650dSSadaf Ebrahimi {
8216*22dc650dSSadaf Ebrahimi #ifdef DEBUG_SHOW_PARSED
8217*22dc650dSSadaf Ebrahimi fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
8218*22dc650dSSadaf Ebrahimi #endif
8219*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR89; /* Internal error - unrecognized. */
8220*22dc650dSSadaf Ebrahimi return 0;
8221*22dc650dSSadaf Ebrahimi }
8222*22dc650dSSadaf Ebrahimi
8223*22dc650dSSadaf Ebrahimi /* Handle a literal character. We come here by goto in the case of a
8224*22dc650dSSadaf Ebrahimi 32-bit, non-UTF character whose value is greater than META_END. */
8225*22dc650dSSadaf Ebrahimi
8226*22dc650dSSadaf Ebrahimi NORMAL_CHAR:
8227*22dc650dSSadaf Ebrahimi meta = *pptr; /* Get the full 32 bits */
8228*22dc650dSSadaf Ebrahimi NORMAL_CHAR_SET: /* Character is already in meta */
8229*22dc650dSSadaf Ebrahimi matched_char = TRUE;
8230*22dc650dSSadaf Ebrahimi
8231*22dc650dSSadaf Ebrahimi /* For caseless UTF or UCP mode, check whether this character has more than
8232*22dc650dSSadaf Ebrahimi one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8233*22dc650dSSadaf Ebrahimi When casing restrictions apply, ignore caseless sets that start with an
8234*22dc650dSSadaf Ebrahimi ASCII character. */
8235*22dc650dSSadaf Ebrahimi
8236*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
8237*22dc650dSSadaf Ebrahimi if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8238*22dc650dSSadaf Ebrahimi {
8239*22dc650dSSadaf Ebrahimi uint32_t caseset = UCD_CASESET(meta);
8240*22dc650dSSadaf Ebrahimi if (caseset != 0 &&
8241*22dc650dSSadaf Ebrahimi ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
8242*22dc650dSSadaf Ebrahimi PRIV(ucd_caseless_sets)[caseset] > 127))
8243*22dc650dSSadaf Ebrahimi {
8244*22dc650dSSadaf Ebrahimi *code++ = OP_PROP;
8245*22dc650dSSadaf Ebrahimi *code++ = PT_CLIST;
8246*22dc650dSSadaf Ebrahimi *code++ = caseset;
8247*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET)
8248*22dc650dSSadaf Ebrahimi firstcuflags = zerofirstcuflags = REQ_NONE;
8249*22dc650dSSadaf Ebrahimi break; /* End handling this meta item */
8250*22dc650dSSadaf Ebrahimi }
8251*22dc650dSSadaf Ebrahimi }
8252*22dc650dSSadaf Ebrahimi #endif
8253*22dc650dSSadaf Ebrahimi
8254*22dc650dSSadaf Ebrahimi /* Caseful matches, or caseless and not one of the multicase characters. We
8255*22dc650dSSadaf Ebrahimi come here by goto in the case of a positive class that contains only
8256*22dc650dSSadaf Ebrahimi case-partners of a character with just two cases; matched_char has already
8257*22dc650dSSadaf Ebrahimi been set TRUE and options fudged if necessary. */
8258*22dc650dSSadaf Ebrahimi
8259*22dc650dSSadaf Ebrahimi CLASS_CASELESS_CHAR:
8260*22dc650dSSadaf Ebrahimi
8261*22dc650dSSadaf Ebrahimi /* Get the character's code units into mcbuffer, with the length in
8262*22dc650dSSadaf Ebrahimi mclength. When not in UTF mode, the length is always 1. */
8263*22dc650dSSadaf Ebrahimi
8264*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
8265*22dc650dSSadaf Ebrahimi if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8266*22dc650dSSadaf Ebrahimi #endif
8267*22dc650dSSadaf Ebrahimi {
8268*22dc650dSSadaf Ebrahimi mclength = 1;
8269*22dc650dSSadaf Ebrahimi mcbuffer[0] = meta;
8270*22dc650dSSadaf Ebrahimi }
8271*22dc650dSSadaf Ebrahimi
8272*22dc650dSSadaf Ebrahimi /* Generate the appropriate code */
8273*22dc650dSSadaf Ebrahimi
8274*22dc650dSSadaf Ebrahimi *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8275*22dc650dSSadaf Ebrahimi memcpy(code, mcbuffer, CU2BYTES(mclength));
8276*22dc650dSSadaf Ebrahimi code += mclength;
8277*22dc650dSSadaf Ebrahimi
8278*22dc650dSSadaf Ebrahimi /* Remember if \r or \n were seen */
8279*22dc650dSSadaf Ebrahimi
8280*22dc650dSSadaf Ebrahimi if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8281*22dc650dSSadaf Ebrahimi cb->external_flags |= PCRE2_HASCRORLF;
8282*22dc650dSSadaf Ebrahimi
8283*22dc650dSSadaf Ebrahimi /* Set the first and required code units appropriately. If no previous
8284*22dc650dSSadaf Ebrahimi first code unit, set it from this character, but revert to none on a zero
8285*22dc650dSSadaf Ebrahimi repeat. Otherwise, leave the firstcu value alone, and don't change it on
8286*22dc650dSSadaf Ebrahimi a zero repeat. */
8287*22dc650dSSadaf Ebrahimi
8288*22dc650dSSadaf Ebrahimi if (firstcuflags == REQ_UNSET)
8289*22dc650dSSadaf Ebrahimi {
8290*22dc650dSSadaf Ebrahimi zerofirstcuflags = REQ_NONE;
8291*22dc650dSSadaf Ebrahimi zeroreqcu = reqcu;
8292*22dc650dSSadaf Ebrahimi zeroreqcuflags = reqcuflags;
8293*22dc650dSSadaf Ebrahimi
8294*22dc650dSSadaf Ebrahimi /* If the character is more than one code unit long, we can set a single
8295*22dc650dSSadaf Ebrahimi firstcu only if it is not to be matched caselessly. Multiple possible
8296*22dc650dSSadaf Ebrahimi starting code units may be picked up later in the studying code. */
8297*22dc650dSSadaf Ebrahimi
8298*22dc650dSSadaf Ebrahimi if (mclength == 1 || req_caseopt == 0)
8299*22dc650dSSadaf Ebrahimi {
8300*22dc650dSSadaf Ebrahimi firstcu = mcbuffer[0];
8301*22dc650dSSadaf Ebrahimi firstcuflags = req_caseopt;
8302*22dc650dSSadaf Ebrahimi if (mclength != 1)
8303*22dc650dSSadaf Ebrahimi {
8304*22dc650dSSadaf Ebrahimi reqcu = code[-1];
8305*22dc650dSSadaf Ebrahimi reqcuflags = cb->req_varyopt;
8306*22dc650dSSadaf Ebrahimi }
8307*22dc650dSSadaf Ebrahimi }
8308*22dc650dSSadaf Ebrahimi else firstcuflags = reqcuflags = REQ_NONE;
8309*22dc650dSSadaf Ebrahimi }
8310*22dc650dSSadaf Ebrahimi
8311*22dc650dSSadaf Ebrahimi /* firstcu was previously set; we can set reqcu only if the length is
8312*22dc650dSSadaf Ebrahimi 1 or the matching is caseful. */
8313*22dc650dSSadaf Ebrahimi
8314*22dc650dSSadaf Ebrahimi else
8315*22dc650dSSadaf Ebrahimi {
8316*22dc650dSSadaf Ebrahimi zerofirstcu = firstcu;
8317*22dc650dSSadaf Ebrahimi zerofirstcuflags = firstcuflags;
8318*22dc650dSSadaf Ebrahimi zeroreqcu = reqcu;
8319*22dc650dSSadaf Ebrahimi zeroreqcuflags = reqcuflags;
8320*22dc650dSSadaf Ebrahimi if (mclength == 1 || req_caseopt == 0)
8321*22dc650dSSadaf Ebrahimi {
8322*22dc650dSSadaf Ebrahimi reqcu = code[-1];
8323*22dc650dSSadaf Ebrahimi reqcuflags = req_caseopt | cb->req_varyopt;
8324*22dc650dSSadaf Ebrahimi }
8325*22dc650dSSadaf Ebrahimi }
8326*22dc650dSSadaf Ebrahimi
8327*22dc650dSSadaf Ebrahimi /* If caselessness was temporarily instated, reset it. */
8328*22dc650dSSadaf Ebrahimi
8329*22dc650dSSadaf Ebrahimi if (reset_caseful)
8330*22dc650dSSadaf Ebrahimi {
8331*22dc650dSSadaf Ebrahimi options &= ~PCRE2_CASELESS;
8332*22dc650dSSadaf Ebrahimi req_caseopt = 0;
8333*22dc650dSSadaf Ebrahimi reset_caseful = FALSE;
8334*22dc650dSSadaf Ebrahimi }
8335*22dc650dSSadaf Ebrahimi
8336*22dc650dSSadaf Ebrahimi break; /* End literal character handling */
8337*22dc650dSSadaf Ebrahimi } /* End of big switch */
8338*22dc650dSSadaf Ebrahimi } /* End of big loop */
8339*22dc650dSSadaf Ebrahimi
8340*22dc650dSSadaf Ebrahimi /* Control never reaches here. */
8341*22dc650dSSadaf Ebrahimi }
8342*22dc650dSSadaf Ebrahimi
8343*22dc650dSSadaf Ebrahimi
8344*22dc650dSSadaf Ebrahimi
8345*22dc650dSSadaf Ebrahimi /*************************************************
8346*22dc650dSSadaf Ebrahimi * Compile regex: a sequence of alternatives *
8347*22dc650dSSadaf Ebrahimi *************************************************/
8348*22dc650dSSadaf Ebrahimi
8349*22dc650dSSadaf Ebrahimi /* On entry, pptr is pointing past the bracket meta, but on return it points to
8350*22dc650dSSadaf Ebrahimi the closing bracket or META_END. The code variable is pointing at the code unit
8351*22dc650dSSadaf Ebrahimi into which the BRA operator has been stored. This function is used during the
8352*22dc650dSSadaf Ebrahimi pre-compile phase when we are trying to find out the amount of memory needed,
8353*22dc650dSSadaf Ebrahimi as well as during the real compile phase. The value of lengthptr distinguishes
8354*22dc650dSSadaf Ebrahimi the two phases.
8355*22dc650dSSadaf Ebrahimi
8356*22dc650dSSadaf Ebrahimi Arguments:
8357*22dc650dSSadaf Ebrahimi options option bits, including any changes for this subpattern
8358*22dc650dSSadaf Ebrahimi xoptions extra option bits, ditto
8359*22dc650dSSadaf Ebrahimi codeptr -> the address of the current code pointer
8360*22dc650dSSadaf Ebrahimi pptrptr -> the address of the current parsed pattern pointer
8361*22dc650dSSadaf Ebrahimi errorcodeptr -> pointer to error code variable
8362*22dc650dSSadaf Ebrahimi skipunits skip this many code units at start (for brackets and OP_COND)
8363*22dc650dSSadaf Ebrahimi firstcuptr place to put the first required code unit
8364*22dc650dSSadaf Ebrahimi firstcuflagsptr place to put the first code unit flags
8365*22dc650dSSadaf Ebrahimi reqcuptr place to put the last required code unit
8366*22dc650dSSadaf Ebrahimi reqcuflagsptr place to put the last required code unit flags
8367*22dc650dSSadaf Ebrahimi bcptr pointer to the chain of currently open branches
8368*22dc650dSSadaf Ebrahimi cb points to the data block with tables pointers etc.
8369*22dc650dSSadaf Ebrahimi lengthptr NULL during the real compile phase
8370*22dc650dSSadaf Ebrahimi points to length accumulator during pre-compile phase
8371*22dc650dSSadaf Ebrahimi
8372*22dc650dSSadaf Ebrahimi Returns: 0 There has been an error
8373*22dc650dSSadaf Ebrahimi +1 Success, this group must match at least one character
8374*22dc650dSSadaf Ebrahimi -1 Success, this group may match an empty string
8375*22dc650dSSadaf Ebrahimi */
8376*22dc650dSSadaf Ebrahimi
8377*22dc650dSSadaf Ebrahimi static int
compile_regex(uint32_t options,uint32_t xoptions,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)8378*22dc650dSSadaf Ebrahimi compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8379*22dc650dSSadaf Ebrahimi uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8380*22dc650dSSadaf Ebrahimi uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8381*22dc650dSSadaf Ebrahimi uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8382*22dc650dSSadaf Ebrahimi compile_block *cb, PCRE2_SIZE *lengthptr)
8383*22dc650dSSadaf Ebrahimi {
8384*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *code = *codeptr;
8385*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *last_branch = code;
8386*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *start_bracket = code;
8387*22dc650dSSadaf Ebrahimi BOOL lookbehind;
8388*22dc650dSSadaf Ebrahimi open_capitem capitem;
8389*22dc650dSSadaf Ebrahimi int capnumber = 0;
8390*22dc650dSSadaf Ebrahimi int okreturn = 1;
8391*22dc650dSSadaf Ebrahimi uint32_t *pptr = *pptrptr;
8392*22dc650dSSadaf Ebrahimi uint32_t firstcu, reqcu;
8393*22dc650dSSadaf Ebrahimi uint32_t lookbehindlength;
8394*22dc650dSSadaf Ebrahimi uint32_t lookbehindminlength;
8395*22dc650dSSadaf Ebrahimi uint32_t firstcuflags, reqcuflags;
8396*22dc650dSSadaf Ebrahimi uint32_t branchfirstcu, branchreqcu;
8397*22dc650dSSadaf Ebrahimi uint32_t branchfirstcuflags, branchreqcuflags;
8398*22dc650dSSadaf Ebrahimi PCRE2_SIZE length;
8399*22dc650dSSadaf Ebrahimi branch_chain bc;
8400*22dc650dSSadaf Ebrahimi
8401*22dc650dSSadaf Ebrahimi /* If set, call the external function that checks for stack availability. */
8402*22dc650dSSadaf Ebrahimi
8403*22dc650dSSadaf Ebrahimi if (cb->cx->stack_guard != NULL &&
8404*22dc650dSSadaf Ebrahimi cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8405*22dc650dSSadaf Ebrahimi {
8406*22dc650dSSadaf Ebrahimi *errorcodeptr= ERR33;
8407*22dc650dSSadaf Ebrahimi return 0;
8408*22dc650dSSadaf Ebrahimi }
8409*22dc650dSSadaf Ebrahimi
8410*22dc650dSSadaf Ebrahimi /* Miscellaneous initialization */
8411*22dc650dSSadaf Ebrahimi
8412*22dc650dSSadaf Ebrahimi bc.outer = bcptr;
8413*22dc650dSSadaf Ebrahimi bc.current_branch = code;
8414*22dc650dSSadaf Ebrahimi
8415*22dc650dSSadaf Ebrahimi firstcu = reqcu = 0;
8416*22dc650dSSadaf Ebrahimi firstcuflags = reqcuflags = REQ_UNSET;
8417*22dc650dSSadaf Ebrahimi
8418*22dc650dSSadaf Ebrahimi /* Accumulate the length for use in the pre-compile phase. Start with the
8419*22dc650dSSadaf Ebrahimi length of the BRA and KET and any extra code units that are required at the
8420*22dc650dSSadaf Ebrahimi beginning. We accumulate in a local variable to save frequent testing of
8421*22dc650dSSadaf Ebrahimi lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8422*22dc650dSSadaf Ebrahimi start and end of each alternative, because compiled items are discarded during
8423*22dc650dSSadaf Ebrahimi the pre-compile phase so that the workspace is not exceeded. */
8424*22dc650dSSadaf Ebrahimi
8425*22dc650dSSadaf Ebrahimi length = 2 + 2*LINK_SIZE + skipunits;
8426*22dc650dSSadaf Ebrahimi
8427*22dc650dSSadaf Ebrahimi /* Remember if this is a lookbehind assertion, and if it is, save its length
8428*22dc650dSSadaf Ebrahimi and skip over the pattern offset. */
8429*22dc650dSSadaf Ebrahimi
8430*22dc650dSSadaf Ebrahimi lookbehind = *code == OP_ASSERTBACK ||
8431*22dc650dSSadaf Ebrahimi *code == OP_ASSERTBACK_NOT ||
8432*22dc650dSSadaf Ebrahimi *code == OP_ASSERTBACK_NA;
8433*22dc650dSSadaf Ebrahimi
8434*22dc650dSSadaf Ebrahimi if (lookbehind)
8435*22dc650dSSadaf Ebrahimi {
8436*22dc650dSSadaf Ebrahimi lookbehindlength = META_DATA(pptr[-1]);
8437*22dc650dSSadaf Ebrahimi lookbehindminlength = *pptr;
8438*22dc650dSSadaf Ebrahimi pptr += SIZEOFFSET;
8439*22dc650dSSadaf Ebrahimi }
8440*22dc650dSSadaf Ebrahimi else lookbehindlength = lookbehindminlength = 0;
8441*22dc650dSSadaf Ebrahimi
8442*22dc650dSSadaf Ebrahimi /* If this is a capturing subpattern, add to the chain of open capturing items
8443*22dc650dSSadaf Ebrahimi so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8444*22dc650dSSadaf Ebrahimi need be tested here; changing this opcode to one of its variants, e.g.
8445*22dc650dSSadaf Ebrahimi OP_SCBRAPOS, happens later, after the group has been compiled. */
8446*22dc650dSSadaf Ebrahimi
8447*22dc650dSSadaf Ebrahimi if (*code == OP_CBRA)
8448*22dc650dSSadaf Ebrahimi {
8449*22dc650dSSadaf Ebrahimi capnumber = GET2(code, 1 + LINK_SIZE);
8450*22dc650dSSadaf Ebrahimi capitem.number = capnumber;
8451*22dc650dSSadaf Ebrahimi capitem.next = open_caps;
8452*22dc650dSSadaf Ebrahimi capitem.assert_depth = cb->assert_depth;
8453*22dc650dSSadaf Ebrahimi open_caps = &capitem;
8454*22dc650dSSadaf Ebrahimi }
8455*22dc650dSSadaf Ebrahimi
8456*22dc650dSSadaf Ebrahimi /* Offset is set zero to mark that this bracket is still open */
8457*22dc650dSSadaf Ebrahimi
8458*22dc650dSSadaf Ebrahimi PUT(code, 1, 0);
8459*22dc650dSSadaf Ebrahimi code += 1 + LINK_SIZE + skipunits;
8460*22dc650dSSadaf Ebrahimi
8461*22dc650dSSadaf Ebrahimi /* Loop for each alternative branch */
8462*22dc650dSSadaf Ebrahimi
8463*22dc650dSSadaf Ebrahimi for (;;)
8464*22dc650dSSadaf Ebrahimi {
8465*22dc650dSSadaf Ebrahimi int branch_return;
8466*22dc650dSSadaf Ebrahimi
8467*22dc650dSSadaf Ebrahimi /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8468*22dc650dSSadaf Ebrahimi is only a single mimimum length for the whole assertion. When the mimimum
8469*22dc650dSSadaf Ebrahimi length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8470*22dc650dSSadaf Ebrahimi though not necessarily the same length. In this case, the original OP_REVERSE
8471*22dc650dSSadaf Ebrahimi can be used. It can also be used if a branch in a variable length lookbehind
8472*22dc650dSSadaf Ebrahimi has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8473*22dc650dSSadaf Ebrahimi maximum and minimum values. */
8474*22dc650dSSadaf Ebrahimi
8475*22dc650dSSadaf Ebrahimi if (lookbehind && lookbehindlength > 0)
8476*22dc650dSSadaf Ebrahimi {
8477*22dc650dSSadaf Ebrahimi if (lookbehindminlength == LOOKBEHIND_MAX ||
8478*22dc650dSSadaf Ebrahimi lookbehindminlength == lookbehindlength)
8479*22dc650dSSadaf Ebrahimi {
8480*22dc650dSSadaf Ebrahimi *code++ = OP_REVERSE;
8481*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, lookbehindlength);
8482*22dc650dSSadaf Ebrahimi length += 1 + IMM2_SIZE;
8483*22dc650dSSadaf Ebrahimi }
8484*22dc650dSSadaf Ebrahimi else
8485*22dc650dSSadaf Ebrahimi {
8486*22dc650dSSadaf Ebrahimi *code++ = OP_VREVERSE;
8487*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, lookbehindminlength);
8488*22dc650dSSadaf Ebrahimi PUT2INC(code, 0, lookbehindlength);
8489*22dc650dSSadaf Ebrahimi length += 1 + 2*IMM2_SIZE;
8490*22dc650dSSadaf Ebrahimi }
8491*22dc650dSSadaf Ebrahimi }
8492*22dc650dSSadaf Ebrahimi
8493*22dc650dSSadaf Ebrahimi /* Now compile the branch; in the pre-compile phase its length gets added
8494*22dc650dSSadaf Ebrahimi into the length. */
8495*22dc650dSSadaf Ebrahimi
8496*22dc650dSSadaf Ebrahimi if ((branch_return =
8497*22dc650dSSadaf Ebrahimi compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8498*22dc650dSSadaf Ebrahimi &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8499*22dc650dSSadaf Ebrahimi &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8500*22dc650dSSadaf Ebrahimi return 0;
8501*22dc650dSSadaf Ebrahimi
8502*22dc650dSSadaf Ebrahimi /* If a branch can match an empty string, so can the whole group. */
8503*22dc650dSSadaf Ebrahimi
8504*22dc650dSSadaf Ebrahimi if (branch_return < 0) okreturn = -1;
8505*22dc650dSSadaf Ebrahimi
8506*22dc650dSSadaf Ebrahimi /* In the real compile phase, there is some post-processing to be done. */
8507*22dc650dSSadaf Ebrahimi
8508*22dc650dSSadaf Ebrahimi if (lengthptr == NULL)
8509*22dc650dSSadaf Ebrahimi {
8510*22dc650dSSadaf Ebrahimi /* If this is the first branch, the firstcu and reqcu values for the
8511*22dc650dSSadaf Ebrahimi branch become the values for the regex. */
8512*22dc650dSSadaf Ebrahimi
8513*22dc650dSSadaf Ebrahimi if (*last_branch != OP_ALT)
8514*22dc650dSSadaf Ebrahimi {
8515*22dc650dSSadaf Ebrahimi firstcu = branchfirstcu;
8516*22dc650dSSadaf Ebrahimi firstcuflags = branchfirstcuflags;
8517*22dc650dSSadaf Ebrahimi reqcu = branchreqcu;
8518*22dc650dSSadaf Ebrahimi reqcuflags = branchreqcuflags;
8519*22dc650dSSadaf Ebrahimi }
8520*22dc650dSSadaf Ebrahimi
8521*22dc650dSSadaf Ebrahimi /* If this is not the first branch, the first char and reqcu have to
8522*22dc650dSSadaf Ebrahimi match the values from all the previous branches, except that if the
8523*22dc650dSSadaf Ebrahimi previous value for reqcu didn't have REQ_VARY set, it can still match,
8524*22dc650dSSadaf Ebrahimi and we set REQ_VARY for the group from this branch's value. */
8525*22dc650dSSadaf Ebrahimi
8526*22dc650dSSadaf Ebrahimi else
8527*22dc650dSSadaf Ebrahimi {
8528*22dc650dSSadaf Ebrahimi /* If we previously had a firstcu, but it doesn't match the new branch,
8529*22dc650dSSadaf Ebrahimi we have to abandon the firstcu for the regex, but if there was
8530*22dc650dSSadaf Ebrahimi previously no reqcu, it takes on the value of the old firstcu. */
8531*22dc650dSSadaf Ebrahimi
8532*22dc650dSSadaf Ebrahimi if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8533*22dc650dSSadaf Ebrahimi {
8534*22dc650dSSadaf Ebrahimi if (firstcuflags < REQ_NONE)
8535*22dc650dSSadaf Ebrahimi {
8536*22dc650dSSadaf Ebrahimi if (reqcuflags >= REQ_NONE)
8537*22dc650dSSadaf Ebrahimi {
8538*22dc650dSSadaf Ebrahimi reqcu = firstcu;
8539*22dc650dSSadaf Ebrahimi reqcuflags = firstcuflags;
8540*22dc650dSSadaf Ebrahimi }
8541*22dc650dSSadaf Ebrahimi }
8542*22dc650dSSadaf Ebrahimi firstcuflags = REQ_NONE;
8543*22dc650dSSadaf Ebrahimi }
8544*22dc650dSSadaf Ebrahimi
8545*22dc650dSSadaf Ebrahimi /* If we (now or from before) have no firstcu, a firstcu from the
8546*22dc650dSSadaf Ebrahimi branch becomes a reqcu if there isn't a branch reqcu. */
8547*22dc650dSSadaf Ebrahimi
8548*22dc650dSSadaf Ebrahimi if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8549*22dc650dSSadaf Ebrahimi branchreqcuflags >= REQ_NONE)
8550*22dc650dSSadaf Ebrahimi {
8551*22dc650dSSadaf Ebrahimi branchreqcu = branchfirstcu;
8552*22dc650dSSadaf Ebrahimi branchreqcuflags = branchfirstcuflags;
8553*22dc650dSSadaf Ebrahimi }
8554*22dc650dSSadaf Ebrahimi
8555*22dc650dSSadaf Ebrahimi /* Now ensure that the reqcus match */
8556*22dc650dSSadaf Ebrahimi
8557*22dc650dSSadaf Ebrahimi if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8558*22dc650dSSadaf Ebrahimi reqcu != branchreqcu)
8559*22dc650dSSadaf Ebrahimi reqcuflags = REQ_NONE;
8560*22dc650dSSadaf Ebrahimi else
8561*22dc650dSSadaf Ebrahimi {
8562*22dc650dSSadaf Ebrahimi reqcu = branchreqcu;
8563*22dc650dSSadaf Ebrahimi reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8564*22dc650dSSadaf Ebrahimi }
8565*22dc650dSSadaf Ebrahimi }
8566*22dc650dSSadaf Ebrahimi }
8567*22dc650dSSadaf Ebrahimi
8568*22dc650dSSadaf Ebrahimi /* Handle reaching the end of the expression, either ')' or end of pattern.
8569*22dc650dSSadaf Ebrahimi In the real compile phase, go back through the alternative branches and
8570*22dc650dSSadaf Ebrahimi reverse the chain of offsets, with the field in the BRA item now becoming an
8571*22dc650dSSadaf Ebrahimi offset to the first alternative. If there are no alternatives, it points to
8572*22dc650dSSadaf Ebrahimi the end of the group. The length in the terminating ket is always the length
8573*22dc650dSSadaf Ebrahimi of the whole bracketed item. Return leaving the pointer at the terminating
8574*22dc650dSSadaf Ebrahimi char. */
8575*22dc650dSSadaf Ebrahimi
8576*22dc650dSSadaf Ebrahimi if (META_CODE(*pptr) != META_ALT)
8577*22dc650dSSadaf Ebrahimi {
8578*22dc650dSSadaf Ebrahimi if (lengthptr == NULL)
8579*22dc650dSSadaf Ebrahimi {
8580*22dc650dSSadaf Ebrahimi PCRE2_SIZE branch_length = code - last_branch;
8581*22dc650dSSadaf Ebrahimi do
8582*22dc650dSSadaf Ebrahimi {
8583*22dc650dSSadaf Ebrahimi PCRE2_SIZE prev_length = GET(last_branch, 1);
8584*22dc650dSSadaf Ebrahimi PUT(last_branch, 1, branch_length);
8585*22dc650dSSadaf Ebrahimi branch_length = prev_length;
8586*22dc650dSSadaf Ebrahimi last_branch -= branch_length;
8587*22dc650dSSadaf Ebrahimi }
8588*22dc650dSSadaf Ebrahimi while (branch_length > 0);
8589*22dc650dSSadaf Ebrahimi }
8590*22dc650dSSadaf Ebrahimi
8591*22dc650dSSadaf Ebrahimi /* Fill in the ket */
8592*22dc650dSSadaf Ebrahimi
8593*22dc650dSSadaf Ebrahimi *code = OP_KET;
8594*22dc650dSSadaf Ebrahimi PUT(code, 1, (int)(code - start_bracket));
8595*22dc650dSSadaf Ebrahimi code += 1 + LINK_SIZE;
8596*22dc650dSSadaf Ebrahimi
8597*22dc650dSSadaf Ebrahimi /* Set values to pass back */
8598*22dc650dSSadaf Ebrahimi
8599*22dc650dSSadaf Ebrahimi *codeptr = code;
8600*22dc650dSSadaf Ebrahimi *pptrptr = pptr;
8601*22dc650dSSadaf Ebrahimi *firstcuptr = firstcu;
8602*22dc650dSSadaf Ebrahimi *firstcuflagsptr = firstcuflags;
8603*22dc650dSSadaf Ebrahimi *reqcuptr = reqcu;
8604*22dc650dSSadaf Ebrahimi *reqcuflagsptr = reqcuflags;
8605*22dc650dSSadaf Ebrahimi if (lengthptr != NULL)
8606*22dc650dSSadaf Ebrahimi {
8607*22dc650dSSadaf Ebrahimi if (OFLOW_MAX - *lengthptr < length)
8608*22dc650dSSadaf Ebrahimi {
8609*22dc650dSSadaf Ebrahimi *errorcodeptr = ERR20;
8610*22dc650dSSadaf Ebrahimi return 0;
8611*22dc650dSSadaf Ebrahimi }
8612*22dc650dSSadaf Ebrahimi *lengthptr += length;
8613*22dc650dSSadaf Ebrahimi }
8614*22dc650dSSadaf Ebrahimi return okreturn;
8615*22dc650dSSadaf Ebrahimi }
8616*22dc650dSSadaf Ebrahimi
8617*22dc650dSSadaf Ebrahimi /* Another branch follows. In the pre-compile phase, we can move the code
8618*22dc650dSSadaf Ebrahimi pointer back to where it was for the start of the first branch. (That is,
8619*22dc650dSSadaf Ebrahimi pretend that each branch is the only one.)
8620*22dc650dSSadaf Ebrahimi
8621*22dc650dSSadaf Ebrahimi In the real compile phase, insert an ALT node. Its length field points back
8622*22dc650dSSadaf Ebrahimi to the previous branch while the bracket remains open. At the end the chain
8623*22dc650dSSadaf Ebrahimi is reversed. It's done like this so that the start of the bracket has a
8624*22dc650dSSadaf Ebrahimi zero offset until it is closed, making it possible to detect recursion. */
8625*22dc650dSSadaf Ebrahimi
8626*22dc650dSSadaf Ebrahimi if (lengthptr != NULL)
8627*22dc650dSSadaf Ebrahimi {
8628*22dc650dSSadaf Ebrahimi code = *codeptr + 1 + LINK_SIZE + skipunits;
8629*22dc650dSSadaf Ebrahimi length += 1 + LINK_SIZE;
8630*22dc650dSSadaf Ebrahimi }
8631*22dc650dSSadaf Ebrahimi else
8632*22dc650dSSadaf Ebrahimi {
8633*22dc650dSSadaf Ebrahimi *code = OP_ALT;
8634*22dc650dSSadaf Ebrahimi PUT(code, 1, (int)(code - last_branch));
8635*22dc650dSSadaf Ebrahimi bc.current_branch = last_branch = code;
8636*22dc650dSSadaf Ebrahimi code += 1 + LINK_SIZE;
8637*22dc650dSSadaf Ebrahimi }
8638*22dc650dSSadaf Ebrahimi
8639*22dc650dSSadaf Ebrahimi /* Set the maximum lookbehind length for the next branch (if not in a
8640*22dc650dSSadaf Ebrahimi lookbehind the value will be zero) and then advance past the vertical bar. */
8641*22dc650dSSadaf Ebrahimi
8642*22dc650dSSadaf Ebrahimi lookbehindlength = META_DATA(*pptr);
8643*22dc650dSSadaf Ebrahimi pptr++;
8644*22dc650dSSadaf Ebrahimi }
8645*22dc650dSSadaf Ebrahimi /* Control never reaches here */
8646*22dc650dSSadaf Ebrahimi }
8647*22dc650dSSadaf Ebrahimi
8648*22dc650dSSadaf Ebrahimi
8649*22dc650dSSadaf Ebrahimi
8650*22dc650dSSadaf Ebrahimi /*************************************************
8651*22dc650dSSadaf Ebrahimi * Check for anchored pattern *
8652*22dc650dSSadaf Ebrahimi *************************************************/
8653*22dc650dSSadaf Ebrahimi
8654*22dc650dSSadaf Ebrahimi /* Try to find out if this is an anchored regular expression. Consider each
8655*22dc650dSSadaf Ebrahimi alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8656*22dc650dSSadaf Ebrahimi all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8657*22dc650dSSadaf Ebrahimi it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8658*22dc650dSSadaf Ebrahimi be found, because ^ generates OP_CIRCM in that mode.
8659*22dc650dSSadaf Ebrahimi
8660*22dc650dSSadaf Ebrahimi We can also consider a regex to be anchored if OP_SOM starts all its branches.
8661*22dc650dSSadaf Ebrahimi This is the code for \G, which means "match at start of match position, taking
8662*22dc650dSSadaf Ebrahimi into account the match offset".
8663*22dc650dSSadaf Ebrahimi
8664*22dc650dSSadaf Ebrahimi A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8665*22dc650dSSadaf Ebrahimi because that will try the rest of the pattern at all possible matching points,
8666*22dc650dSSadaf Ebrahimi so there is no point trying again.... er ....
8667*22dc650dSSadaf Ebrahimi
8668*22dc650dSSadaf Ebrahimi .... except when the .* appears inside capturing parentheses, and there is a
8669*22dc650dSSadaf Ebrahimi subsequent back reference to those parentheses. We haven't enough information
8670*22dc650dSSadaf Ebrahimi to catch that case precisely.
8671*22dc650dSSadaf Ebrahimi
8672*22dc650dSSadaf Ebrahimi At first, the best we could do was to detect when .* was in capturing brackets
8673*22dc650dSSadaf Ebrahimi and the highest back reference was greater than or equal to that level.
8674*22dc650dSSadaf Ebrahimi However, by keeping a bitmap of the first 31 back references, we can catch some
8675*22dc650dSSadaf Ebrahimi of the more common cases more precisely.
8676*22dc650dSSadaf Ebrahimi
8677*22dc650dSSadaf Ebrahimi ... A second exception is when the .* appears inside an atomic group, because
8678*22dc650dSSadaf Ebrahimi this prevents the number of characters it matches from being adjusted.
8679*22dc650dSSadaf Ebrahimi
8680*22dc650dSSadaf Ebrahimi Arguments:
8681*22dc650dSSadaf Ebrahimi code points to start of the compiled pattern
8682*22dc650dSSadaf Ebrahimi bracket_map a bitmap of which brackets we are inside while testing; this
8683*22dc650dSSadaf Ebrahimi handles up to substring 31; after that we just have to take
8684*22dc650dSSadaf Ebrahimi the less precise approach
8685*22dc650dSSadaf Ebrahimi cb points to the compile data block
8686*22dc650dSSadaf Ebrahimi atomcount atomic group level
8687*22dc650dSSadaf Ebrahimi inassert TRUE if in an assertion
8688*22dc650dSSadaf Ebrahimi
8689*22dc650dSSadaf Ebrahimi Returns: TRUE or FALSE
8690*22dc650dSSadaf Ebrahimi */
8691*22dc650dSSadaf Ebrahimi
8692*22dc650dSSadaf Ebrahimi static BOOL
is_anchored(PCRE2_SPTR code,uint32_t bracket_map,compile_block * cb,int atomcount,BOOL inassert)8693*22dc650dSSadaf Ebrahimi is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8694*22dc650dSSadaf Ebrahimi int atomcount, BOOL inassert)
8695*22dc650dSSadaf Ebrahimi {
8696*22dc650dSSadaf Ebrahimi do {
8697*22dc650dSSadaf Ebrahimi PCRE2_SPTR scode = first_significant_code(
8698*22dc650dSSadaf Ebrahimi code + PRIV(OP_lengths)[*code], FALSE);
8699*22dc650dSSadaf Ebrahimi int op = *scode;
8700*22dc650dSSadaf Ebrahimi
8701*22dc650dSSadaf Ebrahimi /* Non-capturing brackets */
8702*22dc650dSSadaf Ebrahimi
8703*22dc650dSSadaf Ebrahimi if (op == OP_BRA || op == OP_BRAPOS ||
8704*22dc650dSSadaf Ebrahimi op == OP_SBRA || op == OP_SBRAPOS)
8705*22dc650dSSadaf Ebrahimi {
8706*22dc650dSSadaf Ebrahimi if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8707*22dc650dSSadaf Ebrahimi return FALSE;
8708*22dc650dSSadaf Ebrahimi }
8709*22dc650dSSadaf Ebrahimi
8710*22dc650dSSadaf Ebrahimi /* Capturing brackets */
8711*22dc650dSSadaf Ebrahimi
8712*22dc650dSSadaf Ebrahimi else if (op == OP_CBRA || op == OP_CBRAPOS ||
8713*22dc650dSSadaf Ebrahimi op == OP_SCBRA || op == OP_SCBRAPOS)
8714*22dc650dSSadaf Ebrahimi {
8715*22dc650dSSadaf Ebrahimi int n = GET2(scode, 1+LINK_SIZE);
8716*22dc650dSSadaf Ebrahimi uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8717*22dc650dSSadaf Ebrahimi if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8718*22dc650dSSadaf Ebrahimi }
8719*22dc650dSSadaf Ebrahimi
8720*22dc650dSSadaf Ebrahimi /* Positive forward assertion */
8721*22dc650dSSadaf Ebrahimi
8722*22dc650dSSadaf Ebrahimi else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8723*22dc650dSSadaf Ebrahimi {
8724*22dc650dSSadaf Ebrahimi if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8725*22dc650dSSadaf Ebrahimi }
8726*22dc650dSSadaf Ebrahimi
8727*22dc650dSSadaf Ebrahimi /* Condition. If there is no second branch, it can't be anchored. */
8728*22dc650dSSadaf Ebrahimi
8729*22dc650dSSadaf Ebrahimi else if (op == OP_COND || op == OP_SCOND)
8730*22dc650dSSadaf Ebrahimi {
8731*22dc650dSSadaf Ebrahimi if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8732*22dc650dSSadaf Ebrahimi if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8733*22dc650dSSadaf Ebrahimi return FALSE;
8734*22dc650dSSadaf Ebrahimi }
8735*22dc650dSSadaf Ebrahimi
8736*22dc650dSSadaf Ebrahimi /* Atomic groups */
8737*22dc650dSSadaf Ebrahimi
8738*22dc650dSSadaf Ebrahimi else if (op == OP_ONCE)
8739*22dc650dSSadaf Ebrahimi {
8740*22dc650dSSadaf Ebrahimi if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8741*22dc650dSSadaf Ebrahimi return FALSE;
8742*22dc650dSSadaf Ebrahimi }
8743*22dc650dSSadaf Ebrahimi
8744*22dc650dSSadaf Ebrahimi /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8745*22dc650dSSadaf Ebrahimi it isn't in brackets that are or may be referenced or inside an atomic
8746*22dc650dSSadaf Ebrahimi group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8747*22dc650dSSadaf Ebrahimi because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8748*22dc650dSSadaf Ebrahimi with the subject "aab", which matches "b", i.e. not at the start of a line.
8749*22dc650dSSadaf Ebrahimi There is also an option that disables auto-anchoring. */
8750*22dc650dSSadaf Ebrahimi
8751*22dc650dSSadaf Ebrahimi else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8752*22dc650dSSadaf Ebrahimi op == OP_TYPEPOSSTAR))
8753*22dc650dSSadaf Ebrahimi {
8754*22dc650dSSadaf Ebrahimi if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8755*22dc650dSSadaf Ebrahimi atomcount > 0 || cb->had_pruneorskip || inassert ||
8756*22dc650dSSadaf Ebrahimi (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8757*22dc650dSSadaf Ebrahimi return FALSE;
8758*22dc650dSSadaf Ebrahimi }
8759*22dc650dSSadaf Ebrahimi
8760*22dc650dSSadaf Ebrahimi /* Check for explicit anchoring */
8761*22dc650dSSadaf Ebrahimi
8762*22dc650dSSadaf Ebrahimi else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8763*22dc650dSSadaf Ebrahimi
8764*22dc650dSSadaf Ebrahimi code += GET(code, 1);
8765*22dc650dSSadaf Ebrahimi }
8766*22dc650dSSadaf Ebrahimi while (*code == OP_ALT); /* Loop for each alternative */
8767*22dc650dSSadaf Ebrahimi return TRUE;
8768*22dc650dSSadaf Ebrahimi }
8769*22dc650dSSadaf Ebrahimi
8770*22dc650dSSadaf Ebrahimi
8771*22dc650dSSadaf Ebrahimi
8772*22dc650dSSadaf Ebrahimi /*************************************************
8773*22dc650dSSadaf Ebrahimi * Check for starting with ^ or .* *
8774*22dc650dSSadaf Ebrahimi *************************************************/
8775*22dc650dSSadaf Ebrahimi
8776*22dc650dSSadaf Ebrahimi /* This is called to find out if every branch starts with ^ or .* so that
8777*22dc650dSSadaf Ebrahimi "first char" processing can be done to speed things up in multiline
8778*22dc650dSSadaf Ebrahimi matching and for non-DOTALL patterns that start with .* (which must start at
8779*22dc650dSSadaf Ebrahimi the beginning or after \n). As in the case of is_anchored() (see above), we
8780*22dc650dSSadaf Ebrahimi have to take account of back references to capturing brackets that contain .*
8781*22dc650dSSadaf Ebrahimi because in that case we can't make the assumption. Also, the appearance of .*
8782*22dc650dSSadaf Ebrahimi inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8783*22dc650dSSadaf Ebrahimi or *SKIP does not count, because once again the assumption no longer holds.
8784*22dc650dSSadaf Ebrahimi
8785*22dc650dSSadaf Ebrahimi Arguments:
8786*22dc650dSSadaf Ebrahimi code points to start of the compiled pattern or a group
8787*22dc650dSSadaf Ebrahimi bracket_map a bitmap of which brackets we are inside while testing; this
8788*22dc650dSSadaf Ebrahimi handles up to substring 31; after that we just have to take
8789*22dc650dSSadaf Ebrahimi the less precise approach
8790*22dc650dSSadaf Ebrahimi cb points to the compile data
8791*22dc650dSSadaf Ebrahimi atomcount atomic group level
8792*22dc650dSSadaf Ebrahimi inassert TRUE if in an assertion
8793*22dc650dSSadaf Ebrahimi
8794*22dc650dSSadaf Ebrahimi Returns: TRUE or FALSE
8795*22dc650dSSadaf Ebrahimi */
8796*22dc650dSSadaf Ebrahimi
8797*22dc650dSSadaf Ebrahimi static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8798*22dc650dSSadaf Ebrahimi is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8799*22dc650dSSadaf Ebrahimi int atomcount, BOOL inassert)
8800*22dc650dSSadaf Ebrahimi {
8801*22dc650dSSadaf Ebrahimi do {
8802*22dc650dSSadaf Ebrahimi PCRE2_SPTR scode = first_significant_code(
8803*22dc650dSSadaf Ebrahimi code + PRIV(OP_lengths)[*code], FALSE);
8804*22dc650dSSadaf Ebrahimi int op = *scode;
8805*22dc650dSSadaf Ebrahimi
8806*22dc650dSSadaf Ebrahimi /* If we are at the start of a conditional assertion group, *both* the
8807*22dc650dSSadaf Ebrahimi conditional assertion *and* what follows the condition must satisfy the test
8808*22dc650dSSadaf Ebrahimi for start of line. Other kinds of condition fail. Note that there may be an
8809*22dc650dSSadaf Ebrahimi auto-callout at the start of a condition. */
8810*22dc650dSSadaf Ebrahimi
8811*22dc650dSSadaf Ebrahimi if (op == OP_COND)
8812*22dc650dSSadaf Ebrahimi {
8813*22dc650dSSadaf Ebrahimi scode += 1 + LINK_SIZE;
8814*22dc650dSSadaf Ebrahimi
8815*22dc650dSSadaf Ebrahimi if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8816*22dc650dSSadaf Ebrahimi else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8817*22dc650dSSadaf Ebrahimi
8818*22dc650dSSadaf Ebrahimi switch (*scode)
8819*22dc650dSSadaf Ebrahimi {
8820*22dc650dSSadaf Ebrahimi case OP_CREF:
8821*22dc650dSSadaf Ebrahimi case OP_DNCREF:
8822*22dc650dSSadaf Ebrahimi case OP_RREF:
8823*22dc650dSSadaf Ebrahimi case OP_DNRREF:
8824*22dc650dSSadaf Ebrahimi case OP_FAIL:
8825*22dc650dSSadaf Ebrahimi case OP_FALSE:
8826*22dc650dSSadaf Ebrahimi case OP_TRUE:
8827*22dc650dSSadaf Ebrahimi return FALSE;
8828*22dc650dSSadaf Ebrahimi
8829*22dc650dSSadaf Ebrahimi default: /* Assertion */
8830*22dc650dSSadaf Ebrahimi if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8831*22dc650dSSadaf Ebrahimi do scode += GET(scode, 1); while (*scode == OP_ALT);
8832*22dc650dSSadaf Ebrahimi scode += 1 + LINK_SIZE;
8833*22dc650dSSadaf Ebrahimi break;
8834*22dc650dSSadaf Ebrahimi }
8835*22dc650dSSadaf Ebrahimi scode = first_significant_code(scode, FALSE);
8836*22dc650dSSadaf Ebrahimi op = *scode;
8837*22dc650dSSadaf Ebrahimi }
8838*22dc650dSSadaf Ebrahimi
8839*22dc650dSSadaf Ebrahimi /* Non-capturing brackets */
8840*22dc650dSSadaf Ebrahimi
8841*22dc650dSSadaf Ebrahimi if (op == OP_BRA || op == OP_BRAPOS ||
8842*22dc650dSSadaf Ebrahimi op == OP_SBRA || op == OP_SBRAPOS)
8843*22dc650dSSadaf Ebrahimi {
8844*22dc650dSSadaf Ebrahimi if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8845*22dc650dSSadaf Ebrahimi return FALSE;
8846*22dc650dSSadaf Ebrahimi }
8847*22dc650dSSadaf Ebrahimi
8848*22dc650dSSadaf Ebrahimi /* Capturing brackets */
8849*22dc650dSSadaf Ebrahimi
8850*22dc650dSSadaf Ebrahimi else if (op == OP_CBRA || op == OP_CBRAPOS ||
8851*22dc650dSSadaf Ebrahimi op == OP_SCBRA || op == OP_SCBRAPOS)
8852*22dc650dSSadaf Ebrahimi {
8853*22dc650dSSadaf Ebrahimi int n = GET2(scode, 1+LINK_SIZE);
8854*22dc650dSSadaf Ebrahimi unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8855*22dc650dSSadaf Ebrahimi if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8856*22dc650dSSadaf Ebrahimi }
8857*22dc650dSSadaf Ebrahimi
8858*22dc650dSSadaf Ebrahimi /* Positive forward assertions */
8859*22dc650dSSadaf Ebrahimi
8860*22dc650dSSadaf Ebrahimi else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8861*22dc650dSSadaf Ebrahimi {
8862*22dc650dSSadaf Ebrahimi if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8863*22dc650dSSadaf Ebrahimi return FALSE;
8864*22dc650dSSadaf Ebrahimi }
8865*22dc650dSSadaf Ebrahimi
8866*22dc650dSSadaf Ebrahimi /* Atomic brackets */
8867*22dc650dSSadaf Ebrahimi
8868*22dc650dSSadaf Ebrahimi else if (op == OP_ONCE)
8869*22dc650dSSadaf Ebrahimi {
8870*22dc650dSSadaf Ebrahimi if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8871*22dc650dSSadaf Ebrahimi return FALSE;
8872*22dc650dSSadaf Ebrahimi }
8873*22dc650dSSadaf Ebrahimi
8874*22dc650dSSadaf Ebrahimi /* .* means "start at start or after \n" if it isn't in atomic brackets or
8875*22dc650dSSadaf Ebrahimi brackets that may be referenced or an assertion, and as long as the pattern
8876*22dc650dSSadaf Ebrahimi does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8877*22dc650dSSadaf Ebrahimi for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8878*22dc650dSSadaf Ebrahimi i.e. not at the start of a line. There is also an option that disables this
8879*22dc650dSSadaf Ebrahimi optimization. */
8880*22dc650dSSadaf Ebrahimi
8881*22dc650dSSadaf Ebrahimi else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8882*22dc650dSSadaf Ebrahimi {
8883*22dc650dSSadaf Ebrahimi if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8884*22dc650dSSadaf Ebrahimi atomcount > 0 || cb->had_pruneorskip || inassert ||
8885*22dc650dSSadaf Ebrahimi (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8886*22dc650dSSadaf Ebrahimi return FALSE;
8887*22dc650dSSadaf Ebrahimi }
8888*22dc650dSSadaf Ebrahimi
8889*22dc650dSSadaf Ebrahimi /* Check for explicit circumflex; anything else gives a FALSE result. Note
8890*22dc650dSSadaf Ebrahimi in particular that this includes atomic brackets OP_ONCE because the number
8891*22dc650dSSadaf Ebrahimi of characters matched by .* cannot be adjusted inside them. */
8892*22dc650dSSadaf Ebrahimi
8893*22dc650dSSadaf Ebrahimi else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8894*22dc650dSSadaf Ebrahimi
8895*22dc650dSSadaf Ebrahimi /* Move on to the next alternative */
8896*22dc650dSSadaf Ebrahimi
8897*22dc650dSSadaf Ebrahimi code += GET(code, 1);
8898*22dc650dSSadaf Ebrahimi }
8899*22dc650dSSadaf Ebrahimi while (*code == OP_ALT); /* Loop for each alternative */
8900*22dc650dSSadaf Ebrahimi return TRUE;
8901*22dc650dSSadaf Ebrahimi }
8902*22dc650dSSadaf Ebrahimi
8903*22dc650dSSadaf Ebrahimi
8904*22dc650dSSadaf Ebrahimi
/*************************************************
*    Scan compiled regex for recursion reference *
*************************************************/

/* This function scans through a compiled pattern until it finds an instance of
OP_RECURSE.

Arguments:
  code        points to start of expression
  utf         TRUE in UTF mode

Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
*/
8918*22dc650dSSadaf Ebrahimi
8919*22dc650dSSadaf Ebrahimi static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8920*22dc650dSSadaf Ebrahimi find_recurse(PCRE2_SPTR code, BOOL utf)
8921*22dc650dSSadaf Ebrahimi {
8922*22dc650dSSadaf Ebrahimi for (;;)
8923*22dc650dSSadaf Ebrahimi {
8924*22dc650dSSadaf Ebrahimi PCRE2_UCHAR c = *code;
8925*22dc650dSSadaf Ebrahimi if (c == OP_END) return NULL;
8926*22dc650dSSadaf Ebrahimi if (c == OP_RECURSE) return code;
8927*22dc650dSSadaf Ebrahimi
8928*22dc650dSSadaf Ebrahimi /* XCLASS is used for classes that cannot be represented just by a bit map.
8929*22dc650dSSadaf Ebrahimi This includes negated single high-valued characters. CALLOUT_STR is used for
8930*22dc650dSSadaf Ebrahimi callouts with string arguments. In both cases the length in the table is
8931*22dc650dSSadaf Ebrahimi zero; the actual length is stored in the compiled code. */
8932*22dc650dSSadaf Ebrahimi
8933*22dc650dSSadaf Ebrahimi if (c == OP_XCLASS) code += GET(code, 1);
8934*22dc650dSSadaf Ebrahimi else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8935*22dc650dSSadaf Ebrahimi
8936*22dc650dSSadaf Ebrahimi /* Otherwise, we can get the item's length from the table, except that for
8937*22dc650dSSadaf Ebrahimi repeated character types, we have to test for \p and \P, which have an extra
8938*22dc650dSSadaf Ebrahimi two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8939*22dc650dSSadaf Ebrahimi we must add in its length. */
8940*22dc650dSSadaf Ebrahimi
8941*22dc650dSSadaf Ebrahimi else
8942*22dc650dSSadaf Ebrahimi {
8943*22dc650dSSadaf Ebrahimi switch(c)
8944*22dc650dSSadaf Ebrahimi {
8945*22dc650dSSadaf Ebrahimi case OP_TYPESTAR:
8946*22dc650dSSadaf Ebrahimi case OP_TYPEMINSTAR:
8947*22dc650dSSadaf Ebrahimi case OP_TYPEPLUS:
8948*22dc650dSSadaf Ebrahimi case OP_TYPEMINPLUS:
8949*22dc650dSSadaf Ebrahimi case OP_TYPEQUERY:
8950*22dc650dSSadaf Ebrahimi case OP_TYPEMINQUERY:
8951*22dc650dSSadaf Ebrahimi case OP_TYPEPOSSTAR:
8952*22dc650dSSadaf Ebrahimi case OP_TYPEPOSPLUS:
8953*22dc650dSSadaf Ebrahimi case OP_TYPEPOSQUERY:
8954*22dc650dSSadaf Ebrahimi if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8955*22dc650dSSadaf Ebrahimi break;
8956*22dc650dSSadaf Ebrahimi
8957*22dc650dSSadaf Ebrahimi case OP_TYPEPOSUPTO:
8958*22dc650dSSadaf Ebrahimi case OP_TYPEUPTO:
8959*22dc650dSSadaf Ebrahimi case OP_TYPEMINUPTO:
8960*22dc650dSSadaf Ebrahimi case OP_TYPEEXACT:
8961*22dc650dSSadaf Ebrahimi if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8962*22dc650dSSadaf Ebrahimi code += 2;
8963*22dc650dSSadaf Ebrahimi break;
8964*22dc650dSSadaf Ebrahimi
8965*22dc650dSSadaf Ebrahimi case OP_MARK:
8966*22dc650dSSadaf Ebrahimi case OP_COMMIT_ARG:
8967*22dc650dSSadaf Ebrahimi case OP_PRUNE_ARG:
8968*22dc650dSSadaf Ebrahimi case OP_SKIP_ARG:
8969*22dc650dSSadaf Ebrahimi case OP_THEN_ARG:
8970*22dc650dSSadaf Ebrahimi code += code[1];
8971*22dc650dSSadaf Ebrahimi break;
8972*22dc650dSSadaf Ebrahimi }
8973*22dc650dSSadaf Ebrahimi
8974*22dc650dSSadaf Ebrahimi /* Add in the fixed length from the table */
8975*22dc650dSSadaf Ebrahimi
8976*22dc650dSSadaf Ebrahimi code += PRIV(OP_lengths)[c];
8977*22dc650dSSadaf Ebrahimi
8978*22dc650dSSadaf Ebrahimi /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8979*22dc650dSSadaf Ebrahimi be followed by a multi-unit character. The length in the table is a
8980*22dc650dSSadaf Ebrahimi minimum, so we have to arrange to skip the extra units. */
8981*22dc650dSSadaf Ebrahimi
8982*22dc650dSSadaf Ebrahimi #ifdef MAYBE_UTF_MULTI
8983*22dc650dSSadaf Ebrahimi if (utf) switch(c)
8984*22dc650dSSadaf Ebrahimi {
8985*22dc650dSSadaf Ebrahimi case OP_CHAR:
8986*22dc650dSSadaf Ebrahimi case OP_CHARI:
8987*22dc650dSSadaf Ebrahimi case OP_NOT:
8988*22dc650dSSadaf Ebrahimi case OP_NOTI:
8989*22dc650dSSadaf Ebrahimi case OP_EXACT:
8990*22dc650dSSadaf Ebrahimi case OP_EXACTI:
8991*22dc650dSSadaf Ebrahimi case OP_NOTEXACT:
8992*22dc650dSSadaf Ebrahimi case OP_NOTEXACTI:
8993*22dc650dSSadaf Ebrahimi case OP_UPTO:
8994*22dc650dSSadaf Ebrahimi case OP_UPTOI:
8995*22dc650dSSadaf Ebrahimi case OP_NOTUPTO:
8996*22dc650dSSadaf Ebrahimi case OP_NOTUPTOI:
8997*22dc650dSSadaf Ebrahimi case OP_MINUPTO:
8998*22dc650dSSadaf Ebrahimi case OP_MINUPTOI:
8999*22dc650dSSadaf Ebrahimi case OP_NOTMINUPTO:
9000*22dc650dSSadaf Ebrahimi case OP_NOTMINUPTOI:
9001*22dc650dSSadaf Ebrahimi case OP_POSUPTO:
9002*22dc650dSSadaf Ebrahimi case OP_POSUPTOI:
9003*22dc650dSSadaf Ebrahimi case OP_NOTPOSUPTO:
9004*22dc650dSSadaf Ebrahimi case OP_NOTPOSUPTOI:
9005*22dc650dSSadaf Ebrahimi case OP_STAR:
9006*22dc650dSSadaf Ebrahimi case OP_STARI:
9007*22dc650dSSadaf Ebrahimi case OP_NOTSTAR:
9008*22dc650dSSadaf Ebrahimi case OP_NOTSTARI:
9009*22dc650dSSadaf Ebrahimi case OP_MINSTAR:
9010*22dc650dSSadaf Ebrahimi case OP_MINSTARI:
9011*22dc650dSSadaf Ebrahimi case OP_NOTMINSTAR:
9012*22dc650dSSadaf Ebrahimi case OP_NOTMINSTARI:
9013*22dc650dSSadaf Ebrahimi case OP_POSSTAR:
9014*22dc650dSSadaf Ebrahimi case OP_POSSTARI:
9015*22dc650dSSadaf Ebrahimi case OP_NOTPOSSTAR:
9016*22dc650dSSadaf Ebrahimi case OP_NOTPOSSTARI:
9017*22dc650dSSadaf Ebrahimi case OP_PLUS:
9018*22dc650dSSadaf Ebrahimi case OP_PLUSI:
9019*22dc650dSSadaf Ebrahimi case OP_NOTPLUS:
9020*22dc650dSSadaf Ebrahimi case OP_NOTPLUSI:
9021*22dc650dSSadaf Ebrahimi case OP_MINPLUS:
9022*22dc650dSSadaf Ebrahimi case OP_MINPLUSI:
9023*22dc650dSSadaf Ebrahimi case OP_NOTMINPLUS:
9024*22dc650dSSadaf Ebrahimi case OP_NOTMINPLUSI:
9025*22dc650dSSadaf Ebrahimi case OP_POSPLUS:
9026*22dc650dSSadaf Ebrahimi case OP_POSPLUSI:
9027*22dc650dSSadaf Ebrahimi case OP_NOTPOSPLUS:
9028*22dc650dSSadaf Ebrahimi case OP_NOTPOSPLUSI:
9029*22dc650dSSadaf Ebrahimi case OP_QUERY:
9030*22dc650dSSadaf Ebrahimi case OP_QUERYI:
9031*22dc650dSSadaf Ebrahimi case OP_NOTQUERY:
9032*22dc650dSSadaf Ebrahimi case OP_NOTQUERYI:
9033*22dc650dSSadaf Ebrahimi case OP_MINQUERY:
9034*22dc650dSSadaf Ebrahimi case OP_MINQUERYI:
9035*22dc650dSSadaf Ebrahimi case OP_NOTMINQUERY:
9036*22dc650dSSadaf Ebrahimi case OP_NOTMINQUERYI:
9037*22dc650dSSadaf Ebrahimi case OP_POSQUERY:
9038*22dc650dSSadaf Ebrahimi case OP_POSQUERYI:
9039*22dc650dSSadaf Ebrahimi case OP_NOTPOSQUERY:
9040*22dc650dSSadaf Ebrahimi case OP_NOTPOSQUERYI:
9041*22dc650dSSadaf Ebrahimi if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9042*22dc650dSSadaf Ebrahimi break;
9043*22dc650dSSadaf Ebrahimi }
9044*22dc650dSSadaf Ebrahimi #else
9045*22dc650dSSadaf Ebrahimi (void)(utf); /* Keep compiler happy by referencing function argument */
9046*22dc650dSSadaf Ebrahimi #endif /* MAYBE_UTF_MULTI */
9047*22dc650dSSadaf Ebrahimi }
9048*22dc650dSSadaf Ebrahimi }
9049*22dc650dSSadaf Ebrahimi }
9050*22dc650dSSadaf Ebrahimi
9051*22dc650dSSadaf Ebrahimi
9052*22dc650dSSadaf Ebrahimi
/*************************************************
*    Check for asserted fixed first code unit    *
*************************************************/

/* During compilation, the "first code unit" settings from forward assertions
are discarded, because they can cause conflicts with actual literals that
follow. However, if we end up without a first code unit setting for an
unanchored pattern, it is worth scanning the regex to see if there is an
initial asserted first code unit. If all branches start with the same asserted
code unit, or with a non-conditional bracket all of whose alternatives start
with the same asserted code unit (recurse ad lib), then we return that code
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
REQ_NONE in the flags.

Arguments:
  code       points to start of compiled pattern
  flags      points to the first code unit flags
  inassert   non-zero if in an assertion

Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
*/
9074*22dc650dSSadaf Ebrahimi
9075*22dc650dSSadaf Ebrahimi static uint32_t
find_firstassertedcu(PCRE2_SPTR code,uint32_t * flags,uint32_t inassert)9076*22dc650dSSadaf Ebrahimi find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9077*22dc650dSSadaf Ebrahimi {
9078*22dc650dSSadaf Ebrahimi uint32_t c = 0;
9079*22dc650dSSadaf Ebrahimi uint32_t cflags = REQ_NONE;
9080*22dc650dSSadaf Ebrahimi
9081*22dc650dSSadaf Ebrahimi *flags = REQ_NONE;
9082*22dc650dSSadaf Ebrahimi do {
9083*22dc650dSSadaf Ebrahimi uint32_t d;
9084*22dc650dSSadaf Ebrahimi uint32_t dflags;
9085*22dc650dSSadaf Ebrahimi int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9086*22dc650dSSadaf Ebrahimi *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9087*22dc650dSSadaf Ebrahimi PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9088*22dc650dSSadaf Ebrahimi PCRE2_UCHAR op = *scode;
9089*22dc650dSSadaf Ebrahimi
9090*22dc650dSSadaf Ebrahimi switch(op)
9091*22dc650dSSadaf Ebrahimi {
9092*22dc650dSSadaf Ebrahimi default:
9093*22dc650dSSadaf Ebrahimi return 0;
9094*22dc650dSSadaf Ebrahimi
9095*22dc650dSSadaf Ebrahimi case OP_BRA:
9096*22dc650dSSadaf Ebrahimi case OP_BRAPOS:
9097*22dc650dSSadaf Ebrahimi case OP_CBRA:
9098*22dc650dSSadaf Ebrahimi case OP_SCBRA:
9099*22dc650dSSadaf Ebrahimi case OP_CBRAPOS:
9100*22dc650dSSadaf Ebrahimi case OP_SCBRAPOS:
9101*22dc650dSSadaf Ebrahimi case OP_ASSERT:
9102*22dc650dSSadaf Ebrahimi case OP_ASSERT_NA:
9103*22dc650dSSadaf Ebrahimi case OP_ONCE:
9104*22dc650dSSadaf Ebrahimi case OP_SCRIPT_RUN:
9105*22dc650dSSadaf Ebrahimi d = find_firstassertedcu(scode, &dflags, inassert +
9106*22dc650dSSadaf Ebrahimi ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9107*22dc650dSSadaf Ebrahimi if (dflags >= REQ_NONE) return 0;
9108*22dc650dSSadaf Ebrahimi if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9109*22dc650dSSadaf Ebrahimi else if (c != d || cflags != dflags) return 0;
9110*22dc650dSSadaf Ebrahimi break;
9111*22dc650dSSadaf Ebrahimi
9112*22dc650dSSadaf Ebrahimi case OP_EXACT:
9113*22dc650dSSadaf Ebrahimi scode += IMM2_SIZE;
9114*22dc650dSSadaf Ebrahimi /* Fall through */
9115*22dc650dSSadaf Ebrahimi
9116*22dc650dSSadaf Ebrahimi case OP_CHAR:
9117*22dc650dSSadaf Ebrahimi case OP_PLUS:
9118*22dc650dSSadaf Ebrahimi case OP_MINPLUS:
9119*22dc650dSSadaf Ebrahimi case OP_POSPLUS:
9120*22dc650dSSadaf Ebrahimi if (inassert == 0) return 0;
9121*22dc650dSSadaf Ebrahimi if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9122*22dc650dSSadaf Ebrahimi else if (c != scode[1]) return 0;
9123*22dc650dSSadaf Ebrahimi break;
9124*22dc650dSSadaf Ebrahimi
9125*22dc650dSSadaf Ebrahimi case OP_EXACTI:
9126*22dc650dSSadaf Ebrahimi scode += IMM2_SIZE;
9127*22dc650dSSadaf Ebrahimi /* Fall through */
9128*22dc650dSSadaf Ebrahimi
9129*22dc650dSSadaf Ebrahimi case OP_CHARI:
9130*22dc650dSSadaf Ebrahimi case OP_PLUSI:
9131*22dc650dSSadaf Ebrahimi case OP_MINPLUSI:
9132*22dc650dSSadaf Ebrahimi case OP_POSPLUSI:
9133*22dc650dSSadaf Ebrahimi if (inassert == 0) return 0;
9134*22dc650dSSadaf Ebrahimi
9135*22dc650dSSadaf Ebrahimi /* If the character is more than one code unit long, we cannot set its
9136*22dc650dSSadaf Ebrahimi first code unit when matching caselessly. Later scanning may pick up
9137*22dc650dSSadaf Ebrahimi multiple code units. */
9138*22dc650dSSadaf Ebrahimi
9139*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
9140*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
9141*22dc650dSSadaf Ebrahimi if (scode[1] >= 0x80) return 0;
9142*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 16
9143*22dc650dSSadaf Ebrahimi if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9144*22dc650dSSadaf Ebrahimi #endif
9145*22dc650dSSadaf Ebrahimi #endif
9146*22dc650dSSadaf Ebrahimi
9147*22dc650dSSadaf Ebrahimi if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9148*22dc650dSSadaf Ebrahimi else if (c != scode[1]) return 0;
9149*22dc650dSSadaf Ebrahimi break;
9150*22dc650dSSadaf Ebrahimi }
9151*22dc650dSSadaf Ebrahimi
9152*22dc650dSSadaf Ebrahimi code += GET(code, 1);
9153*22dc650dSSadaf Ebrahimi }
9154*22dc650dSSadaf Ebrahimi while (*code == OP_ALT);
9155*22dc650dSSadaf Ebrahimi
9156*22dc650dSSadaf Ebrahimi *flags = cflags;
9157*22dc650dSSadaf Ebrahimi return c;
9158*22dc650dSSadaf Ebrahimi }
9159*22dc650dSSadaf Ebrahimi
9160*22dc650dSSadaf Ebrahimi
9161*22dc650dSSadaf Ebrahimi
/*************************************************
*     Add an entry to the name/number table      *
*************************************************/

/* This function is called between compiling passes to add an entry to the
name/number table, maintaining alphabetical order. Checking for permitted
and forbidden duplicates has already been done.

Arguments:
  cb           the compile data block
  name         the name to add
  length       the length of the name
  groupno      the group number
  tablecount   the count of names in the table so far

Returns:       nothing
*/
9179*22dc650dSSadaf Ebrahimi
9180*22dc650dSSadaf Ebrahimi static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)9181*22dc650dSSadaf Ebrahimi add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9182*22dc650dSSadaf Ebrahimi unsigned int groupno, uint32_t tablecount)
9183*22dc650dSSadaf Ebrahimi {
9184*22dc650dSSadaf Ebrahimi uint32_t i;
9185*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *slot = cb->name_table;
9186*22dc650dSSadaf Ebrahimi
9187*22dc650dSSadaf Ebrahimi for (i = 0; i < tablecount; i++)
9188*22dc650dSSadaf Ebrahimi {
9189*22dc650dSSadaf Ebrahimi int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9190*22dc650dSSadaf Ebrahimi if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9191*22dc650dSSadaf Ebrahimi crc = -1; /* Current name is a substring */
9192*22dc650dSSadaf Ebrahimi
9193*22dc650dSSadaf Ebrahimi /* Make space in the table and break the loop for an earlier name. For a
9194*22dc650dSSadaf Ebrahimi duplicate or later name, carry on. We do this for duplicates so that in the
9195*22dc650dSSadaf Ebrahimi simple case (when ?(| is not used) they are in order of their numbers. In all
9196*22dc650dSSadaf Ebrahimi cases they are in the order in which they appear in the pattern. */
9197*22dc650dSSadaf Ebrahimi
9198*22dc650dSSadaf Ebrahimi if (crc < 0)
9199*22dc650dSSadaf Ebrahimi {
9200*22dc650dSSadaf Ebrahimi (void)memmove(slot + cb->name_entry_size, slot,
9201*22dc650dSSadaf Ebrahimi CU2BYTES((tablecount - i) * cb->name_entry_size));
9202*22dc650dSSadaf Ebrahimi break;
9203*22dc650dSSadaf Ebrahimi }
9204*22dc650dSSadaf Ebrahimi
9205*22dc650dSSadaf Ebrahimi /* Continue the loop for a later or duplicate name */
9206*22dc650dSSadaf Ebrahimi
9207*22dc650dSSadaf Ebrahimi slot += cb->name_entry_size;
9208*22dc650dSSadaf Ebrahimi }
9209*22dc650dSSadaf Ebrahimi
9210*22dc650dSSadaf Ebrahimi PUT2(slot, 0, groupno);
9211*22dc650dSSadaf Ebrahimi memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9212*22dc650dSSadaf Ebrahimi
9213*22dc650dSSadaf Ebrahimi /* Add a terminating zero and fill the rest of the slot with zeroes so that
9214*22dc650dSSadaf Ebrahimi the memory is all initialized. Otherwise valgrind moans about uninitialized
9215*22dc650dSSadaf Ebrahimi memory when saving serialized compiled patterns. */
9216*22dc650dSSadaf Ebrahimi
9217*22dc650dSSadaf Ebrahimi memset(slot + IMM2_SIZE + length, 0,
9218*22dc650dSSadaf Ebrahimi CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9219*22dc650dSSadaf Ebrahimi }
9220*22dc650dSSadaf Ebrahimi
9221*22dc650dSSadaf Ebrahimi
9222*22dc650dSSadaf Ebrahimi
/*************************************************
*             Skip in parsed pattern             *
*************************************************/

/* This function is called to skip parts of the parsed pattern when finding the
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
the end of the branch, it is called to skip over an internal lookaround or
(DEFINE) group, and it is also called to skip to the end of a class, during
which it will never encounter nested groups (but there's no need to have
special code for that).

When called to find the end of a branch or group, pptr must point to the first
meta code inside the branch, not the branch-starting code. In other cases it
can point to the item that causes the function to be called.

Arguments:
  pptr       current pointer to skip from
  skiptype   PSKIP_CLASS when skipping to end of class
             PSKIP_ALT when META_ALT ends the skip
             PSKIP_KET when only META_KET ends the skip

Returns:     new value of pptr
             NULL if META_END is reached - should never occur
               or for an unknown meta value - likewise
*/
9248*22dc650dSSadaf Ebrahimi
9249*22dc650dSSadaf Ebrahimi static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)9250*22dc650dSSadaf Ebrahimi parsed_skip(uint32_t *pptr, uint32_t skiptype)
9251*22dc650dSSadaf Ebrahimi {
9252*22dc650dSSadaf Ebrahimi uint32_t nestlevel = 0;
9253*22dc650dSSadaf Ebrahimi
9254*22dc650dSSadaf Ebrahimi for (;; pptr++)
9255*22dc650dSSadaf Ebrahimi {
9256*22dc650dSSadaf Ebrahimi uint32_t meta = META_CODE(*pptr);
9257*22dc650dSSadaf Ebrahimi
9258*22dc650dSSadaf Ebrahimi switch(meta)
9259*22dc650dSSadaf Ebrahimi {
9260*22dc650dSSadaf Ebrahimi default: /* Just skip over most items */
9261*22dc650dSSadaf Ebrahimi if (meta < META_END) continue; /* Literal */
9262*22dc650dSSadaf Ebrahimi break;
9263*22dc650dSSadaf Ebrahimi
9264*22dc650dSSadaf Ebrahimi /* This should never occur. */
9265*22dc650dSSadaf Ebrahimi
9266*22dc650dSSadaf Ebrahimi case META_END:
9267*22dc650dSSadaf Ebrahimi return NULL;
9268*22dc650dSSadaf Ebrahimi
9269*22dc650dSSadaf Ebrahimi /* The data for these items is variable in length. */
9270*22dc650dSSadaf Ebrahimi
9271*22dc650dSSadaf Ebrahimi case META_BACKREF: /* Offset is present only if group >= 10 */
9272*22dc650dSSadaf Ebrahimi if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9273*22dc650dSSadaf Ebrahimi break;
9274*22dc650dSSadaf Ebrahimi
9275*22dc650dSSadaf Ebrahimi case META_ESCAPE: /* A few escapes are followed by data items. */
9276*22dc650dSSadaf Ebrahimi switch (META_DATA(*pptr))
9277*22dc650dSSadaf Ebrahimi {
9278*22dc650dSSadaf Ebrahimi case ESC_P:
9279*22dc650dSSadaf Ebrahimi case ESC_p:
9280*22dc650dSSadaf Ebrahimi pptr += 1;
9281*22dc650dSSadaf Ebrahimi break;
9282*22dc650dSSadaf Ebrahimi
9283*22dc650dSSadaf Ebrahimi case ESC_g:
9284*22dc650dSSadaf Ebrahimi case ESC_k:
9285*22dc650dSSadaf Ebrahimi pptr += 1 + SIZEOFFSET;
9286*22dc650dSSadaf Ebrahimi break;
9287*22dc650dSSadaf Ebrahimi }
9288*22dc650dSSadaf Ebrahimi break;
9289*22dc650dSSadaf Ebrahimi
9290*22dc650dSSadaf Ebrahimi case META_MARK: /* Add the length of the name. */
9291*22dc650dSSadaf Ebrahimi case META_COMMIT_ARG:
9292*22dc650dSSadaf Ebrahimi case META_PRUNE_ARG:
9293*22dc650dSSadaf Ebrahimi case META_SKIP_ARG:
9294*22dc650dSSadaf Ebrahimi case META_THEN_ARG:
9295*22dc650dSSadaf Ebrahimi pptr += pptr[1];
9296*22dc650dSSadaf Ebrahimi break;
9297*22dc650dSSadaf Ebrahimi
9298*22dc650dSSadaf Ebrahimi /* These are the "active" items in this loop. */
9299*22dc650dSSadaf Ebrahimi
9300*22dc650dSSadaf Ebrahimi case META_CLASS_END:
9301*22dc650dSSadaf Ebrahimi if (skiptype == PSKIP_CLASS) return pptr;
9302*22dc650dSSadaf Ebrahimi break;
9303*22dc650dSSadaf Ebrahimi
9304*22dc650dSSadaf Ebrahimi case META_ATOMIC:
9305*22dc650dSSadaf Ebrahimi case META_CAPTURE:
9306*22dc650dSSadaf Ebrahimi case META_COND_ASSERT:
9307*22dc650dSSadaf Ebrahimi case META_COND_DEFINE:
9308*22dc650dSSadaf Ebrahimi case META_COND_NAME:
9309*22dc650dSSadaf Ebrahimi case META_COND_NUMBER:
9310*22dc650dSSadaf Ebrahimi case META_COND_RNAME:
9311*22dc650dSSadaf Ebrahimi case META_COND_RNUMBER:
9312*22dc650dSSadaf Ebrahimi case META_COND_VERSION:
9313*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD:
9314*22dc650dSSadaf Ebrahimi case META_LOOKAHEADNOT:
9315*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD_NA:
9316*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND:
9317*22dc650dSSadaf Ebrahimi case META_LOOKBEHINDNOT:
9318*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND_NA:
9319*22dc650dSSadaf Ebrahimi case META_NOCAPTURE:
9320*22dc650dSSadaf Ebrahimi case META_SCRIPT_RUN:
9321*22dc650dSSadaf Ebrahimi nestlevel++;
9322*22dc650dSSadaf Ebrahimi break;
9323*22dc650dSSadaf Ebrahimi
9324*22dc650dSSadaf Ebrahimi case META_ALT:
9325*22dc650dSSadaf Ebrahimi if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9326*22dc650dSSadaf Ebrahimi break;
9327*22dc650dSSadaf Ebrahimi
9328*22dc650dSSadaf Ebrahimi case META_KET:
9329*22dc650dSSadaf Ebrahimi if (nestlevel == 0) return pptr;
9330*22dc650dSSadaf Ebrahimi nestlevel--;
9331*22dc650dSSadaf Ebrahimi break;
9332*22dc650dSSadaf Ebrahimi }
9333*22dc650dSSadaf Ebrahimi
9334*22dc650dSSadaf Ebrahimi /* The extra data item length for each meta is in a table. */
9335*22dc650dSSadaf Ebrahimi
9336*22dc650dSSadaf Ebrahimi meta = (meta >> 16) & 0x7fff;
9337*22dc650dSSadaf Ebrahimi if (meta >= sizeof(meta_extra_lengths)) return NULL;
9338*22dc650dSSadaf Ebrahimi pptr += meta_extra_lengths[meta];
9339*22dc650dSSadaf Ebrahimi }
9340*22dc650dSSadaf Ebrahimi /* Control never reaches here */
9341*22dc650dSSadaf Ebrahimi return pptr;
9342*22dc650dSSadaf Ebrahimi }
9343*22dc650dSSadaf Ebrahimi
9344*22dc650dSSadaf Ebrahimi
9345*22dc650dSSadaf Ebrahimi
/*************************************************
*      Find length of a parsed group             *
*************************************************/

/* This is called for nested groups within a branch of a lookbehind whose
length is being computed. On entry, the pointer must be at the first element
after the group initializing code. On exit it points to META_KET. Caching is
used to improve processing speed when the same capturing group occurs many
times.

Arguments:
  pptrptr     pointer to pointer in the parsed pattern
  minptr      where to return the minimum length
  isinline    FALSE if a reference or recursion; TRUE for inline group
  errcodeptr  pointer to the errorcode
  lcptr       pointer to the loop counter
  group       number of captured group or -1 for a non-capturing group
  recurses    chain of recurse_check to catch mutual recursion
  cb          pointer to the compile data

Returns:      the maximum group length or a negative number
*/
9367*22dc650dSSadaf Ebrahimi
9368*22dc650dSSadaf Ebrahimi static int
get_grouplength(uint32_t ** pptrptr,int * minptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)9369*22dc650dSSadaf Ebrahimi get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9370*22dc650dSSadaf Ebrahimi int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9371*22dc650dSSadaf Ebrahimi {
9372*22dc650dSSadaf Ebrahimi uint32_t *gi = cb->groupinfo + 2 * group;
9373*22dc650dSSadaf Ebrahimi int branchlength, branchminlength;
9374*22dc650dSSadaf Ebrahimi int grouplength = -1;
9375*22dc650dSSadaf Ebrahimi int groupminlength = INT_MAX;
9376*22dc650dSSadaf Ebrahimi
9377*22dc650dSSadaf Ebrahimi /* The cache can be used only if there is no possibility of there being two
9378*22dc650dSSadaf Ebrahimi groups with the same number. We do not need to set the end pointer for a group
9379*22dc650dSSadaf Ebrahimi that is being processed as a back reference or recursion, but we must do so for
9380*22dc650dSSadaf Ebrahimi an inline group. */
9381*22dc650dSSadaf Ebrahimi
9382*22dc650dSSadaf Ebrahimi if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9383*22dc650dSSadaf Ebrahimi {
9384*22dc650dSSadaf Ebrahimi uint32_t groupinfo = gi[0];
9385*22dc650dSSadaf Ebrahimi if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9386*22dc650dSSadaf Ebrahimi if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9387*22dc650dSSadaf Ebrahimi {
9388*22dc650dSSadaf Ebrahimi if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9389*22dc650dSSadaf Ebrahimi *minptr = gi[1];
9390*22dc650dSSadaf Ebrahimi return groupinfo & GI_FIXED_LENGTH_MASK;
9391*22dc650dSSadaf Ebrahimi }
9392*22dc650dSSadaf Ebrahimi }
9393*22dc650dSSadaf Ebrahimi
9394*22dc650dSSadaf Ebrahimi /* Scan the group. In this case we find the end pointer of necessity. */
9395*22dc650dSSadaf Ebrahimi
9396*22dc650dSSadaf Ebrahimi for(;;)
9397*22dc650dSSadaf Ebrahimi {
9398*22dc650dSSadaf Ebrahimi branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9399*22dc650dSSadaf Ebrahimi recurses, cb);
9400*22dc650dSSadaf Ebrahimi if (branchlength < 0) goto ISNOTFIXED;
9401*22dc650dSSadaf Ebrahimi if (branchlength > grouplength) grouplength = branchlength;
9402*22dc650dSSadaf Ebrahimi if (branchminlength < groupminlength) groupminlength = branchminlength;
9403*22dc650dSSadaf Ebrahimi if (**pptrptr == META_KET) break;
9404*22dc650dSSadaf Ebrahimi *pptrptr += 1; /* Skip META_ALT */
9405*22dc650dSSadaf Ebrahimi }
9406*22dc650dSSadaf Ebrahimi
9407*22dc650dSSadaf Ebrahimi if (group > 0)
9408*22dc650dSSadaf Ebrahimi {
9409*22dc650dSSadaf Ebrahimi gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9410*22dc650dSSadaf Ebrahimi gi[1] = groupminlength;
9411*22dc650dSSadaf Ebrahimi }
9412*22dc650dSSadaf Ebrahimi
9413*22dc650dSSadaf Ebrahimi *minptr = groupminlength;
9414*22dc650dSSadaf Ebrahimi return grouplength;
9415*22dc650dSSadaf Ebrahimi
9416*22dc650dSSadaf Ebrahimi ISNOTFIXED:
9417*22dc650dSSadaf Ebrahimi if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9418*22dc650dSSadaf Ebrahimi return -1;
9419*22dc650dSSadaf Ebrahimi }
9420*22dc650dSSadaf Ebrahimi
9421*22dc650dSSadaf Ebrahimi
9422*22dc650dSSadaf Ebrahimi
9423*22dc650dSSadaf Ebrahimi /*************************************************
9424*22dc650dSSadaf Ebrahimi * Find length of a parsed branch *
9425*22dc650dSSadaf Ebrahimi *************************************************/
9426*22dc650dSSadaf Ebrahimi
9427*22dc650dSSadaf Ebrahimi /* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9428*22dc650dSSadaf Ebrahimi giving an error if the length is not limited. On entry, *pptrptr points to the
9429*22dc650dSSadaf Ebrahimi first element inside the branch. On exit it is set to point to the ALT or KET.
9430*22dc650dSSadaf Ebrahimi
9431*22dc650dSSadaf Ebrahimi Arguments:
9432*22dc650dSSadaf Ebrahimi pptrptr pointer to pointer in the parsed pattern
9433*22dc650dSSadaf Ebrahimi minptr where to return the minimum length
9434*22dc650dSSadaf Ebrahimi errcodeptr pointer to error code
9435*22dc650dSSadaf Ebrahimi lcptr pointer to loop counter
9436*22dc650dSSadaf Ebrahimi recurses chain of recurse_check to catch mutual recursion
9437*22dc650dSSadaf Ebrahimi cb pointer to compile block
9438*22dc650dSSadaf Ebrahimi
9439*22dc650dSSadaf Ebrahimi Returns: the maximum length, or a negative value on error
9440*22dc650dSSadaf Ebrahimi */
9441*22dc650dSSadaf Ebrahimi
9442*22dc650dSSadaf Ebrahimi static int
get_branchlength(uint32_t ** pptrptr,int * minptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9443*22dc650dSSadaf Ebrahimi get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9444*22dc650dSSadaf Ebrahimi parsed_recurse_check *recurses, compile_block *cb)
9445*22dc650dSSadaf Ebrahimi {
9446*22dc650dSSadaf Ebrahimi int branchlength = 0;
9447*22dc650dSSadaf Ebrahimi int branchminlength = 0;
9448*22dc650dSSadaf Ebrahimi int grouplength, groupminlength;
9449*22dc650dSSadaf Ebrahimi uint32_t lastitemlength = 0;
9450*22dc650dSSadaf Ebrahimi uint32_t lastitemminlength = 0;
9451*22dc650dSSadaf Ebrahimi uint32_t *pptr = *pptrptr;
9452*22dc650dSSadaf Ebrahimi PCRE2_SIZE offset;
9453*22dc650dSSadaf Ebrahimi parsed_recurse_check this_recurse;
9454*22dc650dSSadaf Ebrahimi
9455*22dc650dSSadaf Ebrahimi /* A large and/or complex regex can take too long to process. This can happen
9456*22dc650dSSadaf Ebrahimi more often when (?| groups are present in the pattern because their length
9457*22dc650dSSadaf Ebrahimi cannot be cached. */
9458*22dc650dSSadaf Ebrahimi
9459*22dc650dSSadaf Ebrahimi if ((*lcptr)++ > 2000)
9460*22dc650dSSadaf Ebrahimi {
9461*22dc650dSSadaf Ebrahimi *errcodeptr = ERR35; /* Lookbehind is too complicated */
9462*22dc650dSSadaf Ebrahimi return -1;
9463*22dc650dSSadaf Ebrahimi }
9464*22dc650dSSadaf Ebrahimi
9465*22dc650dSSadaf Ebrahimi /* Scan the branch, accumulating the length. */
9466*22dc650dSSadaf Ebrahimi
9467*22dc650dSSadaf Ebrahimi for (;; pptr++)
9468*22dc650dSSadaf Ebrahimi {
9469*22dc650dSSadaf Ebrahimi parsed_recurse_check *r;
9470*22dc650dSSadaf Ebrahimi uint32_t *gptr, *gptrend;
9471*22dc650dSSadaf Ebrahimi uint32_t escape;
9472*22dc650dSSadaf Ebrahimi uint32_t group = 0;
9473*22dc650dSSadaf Ebrahimi uint32_t itemlength = 0;
9474*22dc650dSSadaf Ebrahimi uint32_t itemminlength = 0;
9475*22dc650dSSadaf Ebrahimi uint32_t min, max;
9476*22dc650dSSadaf Ebrahimi
9477*22dc650dSSadaf Ebrahimi if (*pptr < META_END)
9478*22dc650dSSadaf Ebrahimi {
9479*22dc650dSSadaf Ebrahimi itemlength = itemminlength = 1;
9480*22dc650dSSadaf Ebrahimi }
9481*22dc650dSSadaf Ebrahimi
9482*22dc650dSSadaf Ebrahimi else switch (META_CODE(*pptr))
9483*22dc650dSSadaf Ebrahimi {
9484*22dc650dSSadaf Ebrahimi case META_KET:
9485*22dc650dSSadaf Ebrahimi case META_ALT:
9486*22dc650dSSadaf Ebrahimi goto EXIT;
9487*22dc650dSSadaf Ebrahimi
9488*22dc650dSSadaf Ebrahimi /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9489*22dc650dSSadaf Ebrahimi actual termination. */
9490*22dc650dSSadaf Ebrahimi
9491*22dc650dSSadaf Ebrahimi case META_ACCEPT:
9492*22dc650dSSadaf Ebrahimi case META_FAIL:
9493*22dc650dSSadaf Ebrahimi pptr = parsed_skip(pptr, PSKIP_ALT);
9494*22dc650dSSadaf Ebrahimi if (pptr == NULL) goto PARSED_SKIP_FAILED;
9495*22dc650dSSadaf Ebrahimi goto EXIT;
9496*22dc650dSSadaf Ebrahimi
9497*22dc650dSSadaf Ebrahimi case META_MARK:
9498*22dc650dSSadaf Ebrahimi case META_COMMIT_ARG:
9499*22dc650dSSadaf Ebrahimi case META_PRUNE_ARG:
9500*22dc650dSSadaf Ebrahimi case META_SKIP_ARG:
9501*22dc650dSSadaf Ebrahimi case META_THEN_ARG:
9502*22dc650dSSadaf Ebrahimi pptr += pptr[1] + 1;
9503*22dc650dSSadaf Ebrahimi break;
9504*22dc650dSSadaf Ebrahimi
9505*22dc650dSSadaf Ebrahimi case META_CIRCUMFLEX:
9506*22dc650dSSadaf Ebrahimi case META_COMMIT:
9507*22dc650dSSadaf Ebrahimi case META_DOLLAR:
9508*22dc650dSSadaf Ebrahimi case META_PRUNE:
9509*22dc650dSSadaf Ebrahimi case META_SKIP:
9510*22dc650dSSadaf Ebrahimi case META_THEN:
9511*22dc650dSSadaf Ebrahimi break;
9512*22dc650dSSadaf Ebrahimi
9513*22dc650dSSadaf Ebrahimi case META_OPTIONS:
9514*22dc650dSSadaf Ebrahimi pptr += 2;
9515*22dc650dSSadaf Ebrahimi break;
9516*22dc650dSSadaf Ebrahimi
9517*22dc650dSSadaf Ebrahimi case META_BIGVALUE:
9518*22dc650dSSadaf Ebrahimi itemlength = itemminlength = 1;
9519*22dc650dSSadaf Ebrahimi pptr += 1;
9520*22dc650dSSadaf Ebrahimi break;
9521*22dc650dSSadaf Ebrahimi
9522*22dc650dSSadaf Ebrahimi case META_CLASS:
9523*22dc650dSSadaf Ebrahimi case META_CLASS_NOT:
9524*22dc650dSSadaf Ebrahimi itemlength = itemminlength = 1;
9525*22dc650dSSadaf Ebrahimi pptr = parsed_skip(pptr, PSKIP_CLASS);
9526*22dc650dSSadaf Ebrahimi if (pptr == NULL) goto PARSED_SKIP_FAILED;
9527*22dc650dSSadaf Ebrahimi break;
9528*22dc650dSSadaf Ebrahimi
9529*22dc650dSSadaf Ebrahimi case META_CLASS_EMPTY_NOT:
9530*22dc650dSSadaf Ebrahimi case META_DOT:
9531*22dc650dSSadaf Ebrahimi itemlength = itemminlength = 1;
9532*22dc650dSSadaf Ebrahimi break;
9533*22dc650dSSadaf Ebrahimi
9534*22dc650dSSadaf Ebrahimi case META_CALLOUT_NUMBER:
9535*22dc650dSSadaf Ebrahimi pptr += 3;
9536*22dc650dSSadaf Ebrahimi break;
9537*22dc650dSSadaf Ebrahimi
9538*22dc650dSSadaf Ebrahimi case META_CALLOUT_STRING:
9539*22dc650dSSadaf Ebrahimi pptr += 3 + SIZEOFFSET;
9540*22dc650dSSadaf Ebrahimi break;
9541*22dc650dSSadaf Ebrahimi
9542*22dc650dSSadaf Ebrahimi /* Only some escapes consume a character. Of those, \R can match one or two
9543*22dc650dSSadaf Ebrahimi characters, but \X is never allowed because it matches an unknown number of
9544*22dc650dSSadaf Ebrahimi characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9545*22dc650dSSadaf Ebrahimi
9546*22dc650dSSadaf Ebrahimi case META_ESCAPE:
9547*22dc650dSSadaf Ebrahimi escape = META_DATA(*pptr);
9548*22dc650dSSadaf Ebrahimi if (escape == ESC_X) return -1;
9549*22dc650dSSadaf Ebrahimi if (escape == ESC_R)
9550*22dc650dSSadaf Ebrahimi {
9551*22dc650dSSadaf Ebrahimi itemminlength = 1;
9552*22dc650dSSadaf Ebrahimi itemlength = 2;
9553*22dc650dSSadaf Ebrahimi }
9554*22dc650dSSadaf Ebrahimi else if (escape > ESC_b && escape < ESC_Z)
9555*22dc650dSSadaf Ebrahimi {
9556*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 32
9557*22dc650dSSadaf Ebrahimi if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9558*22dc650dSSadaf Ebrahimi {
9559*22dc650dSSadaf Ebrahimi *errcodeptr = ERR36;
9560*22dc650dSSadaf Ebrahimi return -1;
9561*22dc650dSSadaf Ebrahimi }
9562*22dc650dSSadaf Ebrahimi #endif
9563*22dc650dSSadaf Ebrahimi itemlength = itemminlength = 1;
9564*22dc650dSSadaf Ebrahimi if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
9565*22dc650dSSadaf Ebrahimi }
9566*22dc650dSSadaf Ebrahimi break;
9567*22dc650dSSadaf Ebrahimi
9568*22dc650dSSadaf Ebrahimi /* Lookaheads do not contribute to the length of this branch, but they may
9569*22dc650dSSadaf Ebrahimi contain lookbehinds within them whose lengths need to be set. */
9570*22dc650dSSadaf Ebrahimi
9571*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD:
9572*22dc650dSSadaf Ebrahimi case META_LOOKAHEADNOT:
9573*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD_NA:
9574*22dc650dSSadaf Ebrahimi *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9575*22dc650dSSadaf Ebrahimi if (*errcodeptr != 0) return -1;
9576*22dc650dSSadaf Ebrahimi
9577*22dc650dSSadaf Ebrahimi /* Ignore any qualifiers that follow a lookahead assertion. */
9578*22dc650dSSadaf Ebrahimi
9579*22dc650dSSadaf Ebrahimi switch (pptr[1])
9580*22dc650dSSadaf Ebrahimi {
9581*22dc650dSSadaf Ebrahimi case META_ASTERISK:
9582*22dc650dSSadaf Ebrahimi case META_ASTERISK_PLUS:
9583*22dc650dSSadaf Ebrahimi case META_ASTERISK_QUERY:
9584*22dc650dSSadaf Ebrahimi case META_PLUS:
9585*22dc650dSSadaf Ebrahimi case META_PLUS_PLUS:
9586*22dc650dSSadaf Ebrahimi case META_PLUS_QUERY:
9587*22dc650dSSadaf Ebrahimi case META_QUERY:
9588*22dc650dSSadaf Ebrahimi case META_QUERY_PLUS:
9589*22dc650dSSadaf Ebrahimi case META_QUERY_QUERY:
9590*22dc650dSSadaf Ebrahimi pptr++;
9591*22dc650dSSadaf Ebrahimi break;
9592*22dc650dSSadaf Ebrahimi
9593*22dc650dSSadaf Ebrahimi case META_MINMAX:
9594*22dc650dSSadaf Ebrahimi case META_MINMAX_PLUS:
9595*22dc650dSSadaf Ebrahimi case META_MINMAX_QUERY:
9596*22dc650dSSadaf Ebrahimi pptr += 3;
9597*22dc650dSSadaf Ebrahimi break;
9598*22dc650dSSadaf Ebrahimi
9599*22dc650dSSadaf Ebrahimi default:
9600*22dc650dSSadaf Ebrahimi break;
9601*22dc650dSSadaf Ebrahimi }
9602*22dc650dSSadaf Ebrahimi break;
9603*22dc650dSSadaf Ebrahimi
9604*22dc650dSSadaf Ebrahimi /* A nested lookbehind does not contribute any length to this lookbehind,
9605*22dc650dSSadaf Ebrahimi but must itself be checked and have its lengths set. */
9606*22dc650dSSadaf Ebrahimi
9607*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND:
9608*22dc650dSSadaf Ebrahimi case META_LOOKBEHINDNOT:
9609*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND_NA:
9610*22dc650dSSadaf Ebrahimi if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9611*22dc650dSSadaf Ebrahimi return -1;
9612*22dc650dSSadaf Ebrahimi break;
9613*22dc650dSSadaf Ebrahimi
9614*22dc650dSSadaf Ebrahimi /* Back references and recursions are handled by very similar code. At this
9615*22dc650dSSadaf Ebrahimi stage, the names generated in the parsing pass are available, but the main
9616*22dc650dSSadaf Ebrahimi name table has not yet been created. So for the named varieties, scan the
9617*22dc650dSSadaf Ebrahimi list of names in order to get the number of the first one in the pattern,
9618*22dc650dSSadaf Ebrahimi and whether or not this name is duplicated. */
9619*22dc650dSSadaf Ebrahimi
9620*22dc650dSSadaf Ebrahimi case META_BACKREF_BYNAME:
9621*22dc650dSSadaf Ebrahimi if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9622*22dc650dSSadaf Ebrahimi goto ISNOTFIXED;
9623*22dc650dSSadaf Ebrahimi /* Fall through */
9624*22dc650dSSadaf Ebrahimi
9625*22dc650dSSadaf Ebrahimi case META_RECURSE_BYNAME:
9626*22dc650dSSadaf Ebrahimi {
9627*22dc650dSSadaf Ebrahimi int i;
9628*22dc650dSSadaf Ebrahimi PCRE2_SPTR name;
9629*22dc650dSSadaf Ebrahimi BOOL is_dupname = FALSE;
9630*22dc650dSSadaf Ebrahimi named_group *ng = cb->named_groups;
9631*22dc650dSSadaf Ebrahimi uint32_t meta_code = META_CODE(*pptr);
9632*22dc650dSSadaf Ebrahimi uint32_t length = *(++pptr);
9633*22dc650dSSadaf Ebrahimi
9634*22dc650dSSadaf Ebrahimi GETPLUSOFFSET(offset, pptr);
9635*22dc650dSSadaf Ebrahimi name = cb->start_pattern + offset;
9636*22dc650dSSadaf Ebrahimi for (i = 0; i < cb->names_found; i++, ng++)
9637*22dc650dSSadaf Ebrahimi {
9638*22dc650dSSadaf Ebrahimi if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9639*22dc650dSSadaf Ebrahimi {
9640*22dc650dSSadaf Ebrahimi group = ng->number;
9641*22dc650dSSadaf Ebrahimi is_dupname = ng->isdup;
9642*22dc650dSSadaf Ebrahimi break;
9643*22dc650dSSadaf Ebrahimi }
9644*22dc650dSSadaf Ebrahimi }
9645*22dc650dSSadaf Ebrahimi
9646*22dc650dSSadaf Ebrahimi if (group == 0)
9647*22dc650dSSadaf Ebrahimi {
9648*22dc650dSSadaf Ebrahimi *errcodeptr = ERR15; /* Non-existent subpattern */
9649*22dc650dSSadaf Ebrahimi cb->erroroffset = offset;
9650*22dc650dSSadaf Ebrahimi return -1;
9651*22dc650dSSadaf Ebrahimi }
9652*22dc650dSSadaf Ebrahimi
9653*22dc650dSSadaf Ebrahimi /* A numerical back reference can be fixed length if duplicate capturing
9654*22dc650dSSadaf Ebrahimi groups are not being used. A non-duplicate named back reference can also
9655*22dc650dSSadaf Ebrahimi be handled. */
9656*22dc650dSSadaf Ebrahimi
9657*22dc650dSSadaf Ebrahimi if (meta_code == META_RECURSE_BYNAME ||
9658*22dc650dSSadaf Ebrahimi (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9659*22dc650dSSadaf Ebrahimi goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
9660*22dc650dSSadaf Ebrahimi }
9661*22dc650dSSadaf Ebrahimi goto ISNOTFIXED; /* Duplicate name or number */
9662*22dc650dSSadaf Ebrahimi
9663*22dc650dSSadaf Ebrahimi /* The offset values for back references < 10 are in a separate vector
9664*22dc650dSSadaf Ebrahimi because otherwise they would use more than two parsed pattern elements on
9665*22dc650dSSadaf Ebrahimi 64-bit systems. */
9666*22dc650dSSadaf Ebrahimi
9667*22dc650dSSadaf Ebrahimi case META_BACKREF:
9668*22dc650dSSadaf Ebrahimi if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9669*22dc650dSSadaf Ebrahimi (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9670*22dc650dSSadaf Ebrahimi goto ISNOTFIXED;
9671*22dc650dSSadaf Ebrahimi group = META_DATA(*pptr);
9672*22dc650dSSadaf Ebrahimi if (group < 10)
9673*22dc650dSSadaf Ebrahimi {
9674*22dc650dSSadaf Ebrahimi offset = cb->small_ref_offset[group];
9675*22dc650dSSadaf Ebrahimi goto RECURSE_OR_BACKREF_LENGTH;
9676*22dc650dSSadaf Ebrahimi }
9677*22dc650dSSadaf Ebrahimi
9678*22dc650dSSadaf Ebrahimi /* Fall through */
9679*22dc650dSSadaf Ebrahimi /* For groups >= 10 - picking up group twice does no harm. */
9680*22dc650dSSadaf Ebrahimi
9681*22dc650dSSadaf Ebrahimi /* A true recursion implies not fixed length, but a subroutine call may
9682*22dc650dSSadaf Ebrahimi be OK. Back reference "recursions" are also failed. */
9683*22dc650dSSadaf Ebrahimi
9684*22dc650dSSadaf Ebrahimi case META_RECURSE:
9685*22dc650dSSadaf Ebrahimi group = META_DATA(*pptr);
9686*22dc650dSSadaf Ebrahimi GETPLUSOFFSET(offset, pptr);
9687*22dc650dSSadaf Ebrahimi
9688*22dc650dSSadaf Ebrahimi RECURSE_OR_BACKREF_LENGTH:
9689*22dc650dSSadaf Ebrahimi if (group > cb->bracount)
9690*22dc650dSSadaf Ebrahimi {
9691*22dc650dSSadaf Ebrahimi cb->erroroffset = offset;
9692*22dc650dSSadaf Ebrahimi *errcodeptr = ERR15; /* Non-existent subpattern */
9693*22dc650dSSadaf Ebrahimi return -1;
9694*22dc650dSSadaf Ebrahimi }
9695*22dc650dSSadaf Ebrahimi if (group == 0) goto ISNOTFIXED; /* Local recursion */
9696*22dc650dSSadaf Ebrahimi for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9697*22dc650dSSadaf Ebrahimi {
9698*22dc650dSSadaf Ebrahimi if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9699*22dc650dSSadaf Ebrahimi else if (*gptr == (META_CAPTURE | group)) break;
9700*22dc650dSSadaf Ebrahimi }
9701*22dc650dSSadaf Ebrahimi
9702*22dc650dSSadaf Ebrahimi /* We must start the search for the end of the group at the first meta code
9703*22dc650dSSadaf Ebrahimi inside the group. Otherwise it will be treated as an enclosed group. */
9704*22dc650dSSadaf Ebrahimi
9705*22dc650dSSadaf Ebrahimi gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9706*22dc650dSSadaf Ebrahimi if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9707*22dc650dSSadaf Ebrahimi if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
9708*22dc650dSSadaf Ebrahimi for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9709*22dc650dSSadaf Ebrahimi if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
9710*22dc650dSSadaf Ebrahimi this_recurse.prev = recurses;
9711*22dc650dSSadaf Ebrahimi this_recurse.groupptr = gptr;
9712*22dc650dSSadaf Ebrahimi
9713*22dc650dSSadaf Ebrahimi /* We do not need to know the position of the end of the group, that is,
9714*22dc650dSSadaf Ebrahimi gptr is not used after the call to get_grouplength(). Setting the second
9715*22dc650dSSadaf Ebrahimi argument FALSE stops it scanning for the end when the length can be found
9716*22dc650dSSadaf Ebrahimi in the cache. */
9717*22dc650dSSadaf Ebrahimi
9718*22dc650dSSadaf Ebrahimi gptr++;
9719*22dc650dSSadaf Ebrahimi grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9720*22dc650dSSadaf Ebrahimi lcptr, group, &this_recurse, cb);
9721*22dc650dSSadaf Ebrahimi if (grouplength < 0)
9722*22dc650dSSadaf Ebrahimi {
9723*22dc650dSSadaf Ebrahimi if (*errcodeptr == 0) goto ISNOTFIXED;
9724*22dc650dSSadaf Ebrahimi return -1; /* Error already set */
9725*22dc650dSSadaf Ebrahimi }
9726*22dc650dSSadaf Ebrahimi itemlength = grouplength;
9727*22dc650dSSadaf Ebrahimi itemminlength = groupminlength;
9728*22dc650dSSadaf Ebrahimi break;
9729*22dc650dSSadaf Ebrahimi
9730*22dc650dSSadaf Ebrahimi /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9731*22dc650dSSadaf Ebrahimi the length of this branch. Skip from the following item to the next
9732*22dc650dSSadaf Ebrahimi unpaired ket. */
9733*22dc650dSSadaf Ebrahimi
9734*22dc650dSSadaf Ebrahimi case META_COND_DEFINE:
9735*22dc650dSSadaf Ebrahimi pptr = parsed_skip(pptr + 1, PSKIP_KET);
9736*22dc650dSSadaf Ebrahimi break;
9737*22dc650dSSadaf Ebrahimi
9738*22dc650dSSadaf Ebrahimi /* Check other nested groups - advance past the initial data for each type
9739*22dc650dSSadaf Ebrahimi and then seek a fixed length with get_grouplength(). */
9740*22dc650dSSadaf Ebrahimi
9741*22dc650dSSadaf Ebrahimi case META_COND_NAME:
9742*22dc650dSSadaf Ebrahimi case META_COND_NUMBER:
9743*22dc650dSSadaf Ebrahimi case META_COND_RNAME:
9744*22dc650dSSadaf Ebrahimi case META_COND_RNUMBER:
9745*22dc650dSSadaf Ebrahimi pptr += 2 + SIZEOFFSET;
9746*22dc650dSSadaf Ebrahimi goto CHECK_GROUP;
9747*22dc650dSSadaf Ebrahimi
9748*22dc650dSSadaf Ebrahimi case META_COND_ASSERT:
9749*22dc650dSSadaf Ebrahimi pptr += 1;
9750*22dc650dSSadaf Ebrahimi goto CHECK_GROUP;
9751*22dc650dSSadaf Ebrahimi
9752*22dc650dSSadaf Ebrahimi case META_COND_VERSION:
9753*22dc650dSSadaf Ebrahimi pptr += 4;
9754*22dc650dSSadaf Ebrahimi goto CHECK_GROUP;
9755*22dc650dSSadaf Ebrahimi
9756*22dc650dSSadaf Ebrahimi case META_CAPTURE:
9757*22dc650dSSadaf Ebrahimi group = META_DATA(*pptr);
9758*22dc650dSSadaf Ebrahimi /* Fall through */
9759*22dc650dSSadaf Ebrahimi
9760*22dc650dSSadaf Ebrahimi case META_ATOMIC:
9761*22dc650dSSadaf Ebrahimi case META_NOCAPTURE:
9762*22dc650dSSadaf Ebrahimi case META_SCRIPT_RUN:
9763*22dc650dSSadaf Ebrahimi pptr++;
9764*22dc650dSSadaf Ebrahimi CHECK_GROUP:
9765*22dc650dSSadaf Ebrahimi grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9766*22dc650dSSadaf Ebrahimi lcptr, group, recurses, cb);
9767*22dc650dSSadaf Ebrahimi if (grouplength < 0) return -1;
9768*22dc650dSSadaf Ebrahimi itemlength = grouplength;
9769*22dc650dSSadaf Ebrahimi itemminlength = groupminlength;
9770*22dc650dSSadaf Ebrahimi break;
9771*22dc650dSSadaf Ebrahimi
9772*22dc650dSSadaf Ebrahimi case META_QUERY:
9773*22dc650dSSadaf Ebrahimi case META_QUERY_PLUS:
9774*22dc650dSSadaf Ebrahimi case META_QUERY_QUERY:
9775*22dc650dSSadaf Ebrahimi min = 0;
9776*22dc650dSSadaf Ebrahimi max = 1;
9777*22dc650dSSadaf Ebrahimi goto REPETITION;
9778*22dc650dSSadaf Ebrahimi
9779*22dc650dSSadaf Ebrahimi /* Exact repetition is OK; variable repetition is not. A repetition of zero
9780*22dc650dSSadaf Ebrahimi must subtract the length that has already been added. */
9781*22dc650dSSadaf Ebrahimi
9782*22dc650dSSadaf Ebrahimi case META_MINMAX:
9783*22dc650dSSadaf Ebrahimi case META_MINMAX_PLUS:
9784*22dc650dSSadaf Ebrahimi case META_MINMAX_QUERY:
9785*22dc650dSSadaf Ebrahimi min = pptr[1];
9786*22dc650dSSadaf Ebrahimi max = pptr[2];
9787*22dc650dSSadaf Ebrahimi pptr += 2;
9788*22dc650dSSadaf Ebrahimi
9789*22dc650dSSadaf Ebrahimi REPETITION:
9790*22dc650dSSadaf Ebrahimi if (max != REPEAT_UNLIMITED)
9791*22dc650dSSadaf Ebrahimi {
9792*22dc650dSSadaf Ebrahimi if (lastitemlength != 0 && /* Should not occur, but just in case */
9793*22dc650dSSadaf Ebrahimi max != 0 &&
9794*22dc650dSSadaf Ebrahimi (INT_MAX - branchlength)/lastitemlength < max - 1)
9795*22dc650dSSadaf Ebrahimi {
9796*22dc650dSSadaf Ebrahimi *errcodeptr = ERR87; /* Integer overflow; lookbehind too big */
9797*22dc650dSSadaf Ebrahimi return -1;
9798*22dc650dSSadaf Ebrahimi }
9799*22dc650dSSadaf Ebrahimi if (min == 0) branchminlength -= lastitemminlength;
9800*22dc650dSSadaf Ebrahimi else itemminlength = (min - 1) * lastitemminlength;
9801*22dc650dSSadaf Ebrahimi if (max == 0) branchlength -= lastitemlength;
9802*22dc650dSSadaf Ebrahimi else itemlength = (max - 1) * lastitemlength;
9803*22dc650dSSadaf Ebrahimi break;
9804*22dc650dSSadaf Ebrahimi }
9805*22dc650dSSadaf Ebrahimi /* Fall through */
9806*22dc650dSSadaf Ebrahimi
9807*22dc650dSSadaf Ebrahimi /* Any other item means this branch does not have a fixed length. */
9808*22dc650dSSadaf Ebrahimi
9809*22dc650dSSadaf Ebrahimi default:
9810*22dc650dSSadaf Ebrahimi ISNOTFIXED:
9811*22dc650dSSadaf Ebrahimi *errcodeptr = ERR25; /* Not fixed length */
9812*22dc650dSSadaf Ebrahimi return -1;
9813*22dc650dSSadaf Ebrahimi }
9814*22dc650dSSadaf Ebrahimi
9815*22dc650dSSadaf Ebrahimi /* Add the item length to the branchlength, checking for integer overflow and
9816*22dc650dSSadaf Ebrahimi for the branch length exceeding the overall limit. Later, if there is at
9817*22dc650dSSadaf Ebrahimi least one variable-length branch in the group, there is a test for the
9818*22dc650dSSadaf Ebrahimi (smaller) variable-length branch length limit. */
9819*22dc650dSSadaf Ebrahimi
9820*22dc650dSSadaf Ebrahimi if (INT_MAX - branchlength < (int)itemlength ||
9821*22dc650dSSadaf Ebrahimi (branchlength += itemlength) > LOOKBEHIND_MAX)
9822*22dc650dSSadaf Ebrahimi {
9823*22dc650dSSadaf Ebrahimi *errcodeptr = ERR87;
9824*22dc650dSSadaf Ebrahimi return -1;
9825*22dc650dSSadaf Ebrahimi }
9826*22dc650dSSadaf Ebrahimi
9827*22dc650dSSadaf Ebrahimi branchminlength += itemminlength;
9828*22dc650dSSadaf Ebrahimi
9829*22dc650dSSadaf Ebrahimi /* Save this item length for use if the next item is a quantifier. */
9830*22dc650dSSadaf Ebrahimi
9831*22dc650dSSadaf Ebrahimi lastitemlength = itemlength;
9832*22dc650dSSadaf Ebrahimi lastitemminlength = itemminlength;
9833*22dc650dSSadaf Ebrahimi }
9834*22dc650dSSadaf Ebrahimi
9835*22dc650dSSadaf Ebrahimi EXIT:
9836*22dc650dSSadaf Ebrahimi *pptrptr = pptr;
9837*22dc650dSSadaf Ebrahimi *minptr = branchminlength;
9838*22dc650dSSadaf Ebrahimi return branchlength;
9839*22dc650dSSadaf Ebrahimi
9840*22dc650dSSadaf Ebrahimi PARSED_SKIP_FAILED:
9841*22dc650dSSadaf Ebrahimi *errcodeptr = ERR90;
9842*22dc650dSSadaf Ebrahimi return -1;
9843*22dc650dSSadaf Ebrahimi }
9844*22dc650dSSadaf Ebrahimi
9845*22dc650dSSadaf Ebrahimi
9846*22dc650dSSadaf Ebrahimi
9847*22dc650dSSadaf Ebrahimi /*************************************************
9848*22dc650dSSadaf Ebrahimi * Set lengths in a lookbehind *
9849*22dc650dSSadaf Ebrahimi *************************************************/
9850*22dc650dSSadaf Ebrahimi
9851*22dc650dSSadaf Ebrahimi /* This function is called for each lookbehind, to set the lengths in its
9852*22dc650dSSadaf Ebrahimi branches. An error occurs if any branch does not have a limited maximum length
9853*22dc650dSSadaf Ebrahimi that is less than the limit (65535). On exit, the pointer must be left on the
9854*22dc650dSSadaf Ebrahimi final ket.
9855*22dc650dSSadaf Ebrahimi
9856*22dc650dSSadaf Ebrahimi The function also maintains the max_lookbehind value. Any lookbehind branch
9857*22dc650dSSadaf Ebrahimi that contains a nested lookbehind may actually look further back than the
9858*22dc650dSSadaf Ebrahimi length of the branch. The additional amount is passed back from
9859*22dc650dSSadaf Ebrahimi get_branchlength() as an "extra" value.
9860*22dc650dSSadaf Ebrahimi
9861*22dc650dSSadaf Ebrahimi Arguments:
9862*22dc650dSSadaf Ebrahimi pptrptr pointer to pointer in the parsed pattern
9863*22dc650dSSadaf Ebrahimi errcodeptr pointer to error code
9864*22dc650dSSadaf Ebrahimi lcptr pointer to loop counter
9865*22dc650dSSadaf Ebrahimi recurses chain of recurse_check to catch mutual recursion
9866*22dc650dSSadaf Ebrahimi cb pointer to compile block
9867*22dc650dSSadaf Ebrahimi
9868*22dc650dSSadaf Ebrahimi Returns: TRUE if all is well
9869*22dc650dSSadaf Ebrahimi FALSE otherwise, with error code and offset set
9870*22dc650dSSadaf Ebrahimi */
9871*22dc650dSSadaf Ebrahimi
9872*22dc650dSSadaf Ebrahimi static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9873*22dc650dSSadaf Ebrahimi set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9874*22dc650dSSadaf Ebrahimi parsed_recurse_check *recurses, compile_block *cb)
9875*22dc650dSSadaf Ebrahimi {
9876*22dc650dSSadaf Ebrahimi PCRE2_SIZE offset;
9877*22dc650dSSadaf Ebrahimi uint32_t *bptr = *pptrptr;
9878*22dc650dSSadaf Ebrahimi uint32_t *gbptr = bptr;
9879*22dc650dSSadaf Ebrahimi int maxlength = 0;
9880*22dc650dSSadaf Ebrahimi int minlength = INT_MAX;
9881*22dc650dSSadaf Ebrahimi BOOL variable = FALSE;
9882*22dc650dSSadaf Ebrahimi
9883*22dc650dSSadaf Ebrahimi READPLUSOFFSET(offset, bptr); /* Offset for error messages */
9884*22dc650dSSadaf Ebrahimi *pptrptr += SIZEOFFSET;
9885*22dc650dSSadaf Ebrahimi
9886*22dc650dSSadaf Ebrahimi /* Each branch can have a different maximum length, but we can keep only a
9887*22dc650dSSadaf Ebrahimi single minimum for the whole group, because there's nowhere to save individual
9888*22dc650dSSadaf Ebrahimi values in the META_ALT item. */
9889*22dc650dSSadaf Ebrahimi
9890*22dc650dSSadaf Ebrahimi do
9891*22dc650dSSadaf Ebrahimi {
9892*22dc650dSSadaf Ebrahimi int branchlength, branchminlength;
9893*22dc650dSSadaf Ebrahimi
9894*22dc650dSSadaf Ebrahimi *pptrptr += 1;
9895*22dc650dSSadaf Ebrahimi branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9896*22dc650dSSadaf Ebrahimi recurses, cb);
9897*22dc650dSSadaf Ebrahimi
9898*22dc650dSSadaf Ebrahimi if (branchlength < 0)
9899*22dc650dSSadaf Ebrahimi {
9900*22dc650dSSadaf Ebrahimi /* The errorcode and offset may already be set from a nested lookbehind. */
9901*22dc650dSSadaf Ebrahimi if (*errcodeptr == 0) *errcodeptr = ERR25;
9902*22dc650dSSadaf Ebrahimi if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9903*22dc650dSSadaf Ebrahimi return FALSE;
9904*22dc650dSSadaf Ebrahimi }
9905*22dc650dSSadaf Ebrahimi
9906*22dc650dSSadaf Ebrahimi if (branchlength != branchminlength) variable = TRUE;
9907*22dc650dSSadaf Ebrahimi if (branchminlength < minlength) minlength = branchminlength;
9908*22dc650dSSadaf Ebrahimi if (branchlength > maxlength) maxlength = branchlength;
9909*22dc650dSSadaf Ebrahimi if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9910*22dc650dSSadaf Ebrahimi *bptr |= branchlength; /* branchlength never more than 65535 */
9911*22dc650dSSadaf Ebrahimi bptr = *pptrptr;
9912*22dc650dSSadaf Ebrahimi }
9913*22dc650dSSadaf Ebrahimi while (META_CODE(*bptr) == META_ALT);
9914*22dc650dSSadaf Ebrahimi
9915*22dc650dSSadaf Ebrahimi /* If any branch is of variable length, the whole lookbehind is of variable
9916*22dc650dSSadaf Ebrahimi length. If the maximum length of any branch exceeds the maximum for variable
9917*22dc650dSSadaf Ebrahimi lookbehinds, give an error. Otherwise, the minimum length is set in the word
9918*22dc650dSSadaf Ebrahimi that follows the original group META value. For a fixed-length lookbehind, this
9919*22dc650dSSadaf Ebrahimi is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9920*22dc650dSSadaf Ebrahimi possibly different) length. */
9921*22dc650dSSadaf Ebrahimi
9922*22dc650dSSadaf Ebrahimi if (variable)
9923*22dc650dSSadaf Ebrahimi {
9924*22dc650dSSadaf Ebrahimi gbptr[1] = minlength;
9925*22dc650dSSadaf Ebrahimi if ((uint32_t)maxlength > cb->max_varlookbehind)
9926*22dc650dSSadaf Ebrahimi {
9927*22dc650dSSadaf Ebrahimi *errcodeptr = ERR100;
9928*22dc650dSSadaf Ebrahimi cb->erroroffset = offset;
9929*22dc650dSSadaf Ebrahimi return FALSE;
9930*22dc650dSSadaf Ebrahimi }
9931*22dc650dSSadaf Ebrahimi }
9932*22dc650dSSadaf Ebrahimi else gbptr[1] = LOOKBEHIND_MAX;
9933*22dc650dSSadaf Ebrahimi
9934*22dc650dSSadaf Ebrahimi
9935*22dc650dSSadaf Ebrahimi gbptr[1] = variable? minlength : LOOKBEHIND_MAX;
9936*22dc650dSSadaf Ebrahimi return TRUE;
9937*22dc650dSSadaf Ebrahimi }
9938*22dc650dSSadaf Ebrahimi
9939*22dc650dSSadaf Ebrahimi
9940*22dc650dSSadaf Ebrahimi
9941*22dc650dSSadaf Ebrahimi /*************************************************
9942*22dc650dSSadaf Ebrahimi * Check parsed pattern lookbehinds *
9943*22dc650dSSadaf Ebrahimi *************************************************/
9944*22dc650dSSadaf Ebrahimi
/* This function is called at the end of parsing a pattern if any lookbehinds
were encountered. It scans the parsed pattern for them, calling
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
the error offset is marked unset. This enables the functions above not to
override settings from deeper nestings.
9950*22dc650dSSadaf Ebrahimi
9951*22dc650dSSadaf Ebrahimi This function is called recursively from get_branchlength() for lookaheads in
9952*22dc650dSSadaf Ebrahimi order to process any lookbehinds that they may contain. It stops when it hits a
9953*22dc650dSSadaf Ebrahimi non-nested closing parenthesis in this case, returning a pointer to it.
9954*22dc650dSSadaf Ebrahimi
9955*22dc650dSSadaf Ebrahimi Arguments
9956*22dc650dSSadaf Ebrahimi pptr points to where to start (start of pattern or start of lookahead)
9957*22dc650dSSadaf Ebrahimi retptr if not NULL, return the ket pointer here
9958*22dc650dSSadaf Ebrahimi recurses chain of recurse_check to catch mutual recursion
9959*22dc650dSSadaf Ebrahimi cb points to the compile block
9960*22dc650dSSadaf Ebrahimi lcptr points to loop counter
9961*22dc650dSSadaf Ebrahimi
9962*22dc650dSSadaf Ebrahimi Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
9963*22dc650dSSadaf Ebrahimi */
9964*22dc650dSSadaf Ebrahimi
9965*22dc650dSSadaf Ebrahimi static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb,int * lcptr)9966*22dc650dSSadaf Ebrahimi check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9967*22dc650dSSadaf Ebrahimi parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9968*22dc650dSSadaf Ebrahimi {
9969*22dc650dSSadaf Ebrahimi int errorcode = 0;
9970*22dc650dSSadaf Ebrahimi int nestlevel = 0;
9971*22dc650dSSadaf Ebrahimi
9972*22dc650dSSadaf Ebrahimi cb->erroroffset = PCRE2_UNSET;
9973*22dc650dSSadaf Ebrahimi
9974*22dc650dSSadaf Ebrahimi for (; *pptr != META_END; pptr++)
9975*22dc650dSSadaf Ebrahimi {
9976*22dc650dSSadaf Ebrahimi if (*pptr < META_END) continue; /* Literal */
9977*22dc650dSSadaf Ebrahimi
9978*22dc650dSSadaf Ebrahimi switch (META_CODE(*pptr))
9979*22dc650dSSadaf Ebrahimi {
9980*22dc650dSSadaf Ebrahimi default:
9981*22dc650dSSadaf Ebrahimi return ERR70; /* Unrecognized meta code */
9982*22dc650dSSadaf Ebrahimi
9983*22dc650dSSadaf Ebrahimi case META_ESCAPE:
9984*22dc650dSSadaf Ebrahimi if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9985*22dc650dSSadaf Ebrahimi pptr += 1;
9986*22dc650dSSadaf Ebrahimi break;
9987*22dc650dSSadaf Ebrahimi
9988*22dc650dSSadaf Ebrahimi case META_KET:
9989*22dc650dSSadaf Ebrahimi if (--nestlevel < 0)
9990*22dc650dSSadaf Ebrahimi {
9991*22dc650dSSadaf Ebrahimi if (retptr != NULL) *retptr = pptr;
9992*22dc650dSSadaf Ebrahimi return 0;
9993*22dc650dSSadaf Ebrahimi }
9994*22dc650dSSadaf Ebrahimi break;
9995*22dc650dSSadaf Ebrahimi
9996*22dc650dSSadaf Ebrahimi case META_ATOMIC:
9997*22dc650dSSadaf Ebrahimi case META_CAPTURE:
9998*22dc650dSSadaf Ebrahimi case META_COND_ASSERT:
9999*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD:
10000*22dc650dSSadaf Ebrahimi case META_LOOKAHEADNOT:
10001*22dc650dSSadaf Ebrahimi case META_LOOKAHEAD_NA:
10002*22dc650dSSadaf Ebrahimi case META_NOCAPTURE:
10003*22dc650dSSadaf Ebrahimi case META_SCRIPT_RUN:
10004*22dc650dSSadaf Ebrahimi nestlevel++;
10005*22dc650dSSadaf Ebrahimi break;
10006*22dc650dSSadaf Ebrahimi
10007*22dc650dSSadaf Ebrahimi case META_ACCEPT:
10008*22dc650dSSadaf Ebrahimi case META_ALT:
10009*22dc650dSSadaf Ebrahimi case META_ASTERISK:
10010*22dc650dSSadaf Ebrahimi case META_ASTERISK_PLUS:
10011*22dc650dSSadaf Ebrahimi case META_ASTERISK_QUERY:
10012*22dc650dSSadaf Ebrahimi case META_BACKREF:
10013*22dc650dSSadaf Ebrahimi case META_CIRCUMFLEX:
10014*22dc650dSSadaf Ebrahimi case META_CLASS:
10015*22dc650dSSadaf Ebrahimi case META_CLASS_EMPTY:
10016*22dc650dSSadaf Ebrahimi case META_CLASS_EMPTY_NOT:
10017*22dc650dSSadaf Ebrahimi case META_CLASS_END:
10018*22dc650dSSadaf Ebrahimi case META_CLASS_NOT:
10019*22dc650dSSadaf Ebrahimi case META_COMMIT:
10020*22dc650dSSadaf Ebrahimi case META_DOLLAR:
10021*22dc650dSSadaf Ebrahimi case META_DOT:
10022*22dc650dSSadaf Ebrahimi case META_FAIL:
10023*22dc650dSSadaf Ebrahimi case META_PLUS:
10024*22dc650dSSadaf Ebrahimi case META_PLUS_PLUS:
10025*22dc650dSSadaf Ebrahimi case META_PLUS_QUERY:
10026*22dc650dSSadaf Ebrahimi case META_PRUNE:
10027*22dc650dSSadaf Ebrahimi case META_QUERY:
10028*22dc650dSSadaf Ebrahimi case META_QUERY_PLUS:
10029*22dc650dSSadaf Ebrahimi case META_QUERY_QUERY:
10030*22dc650dSSadaf Ebrahimi case META_RANGE_ESCAPED:
10031*22dc650dSSadaf Ebrahimi case META_RANGE_LITERAL:
10032*22dc650dSSadaf Ebrahimi case META_SKIP:
10033*22dc650dSSadaf Ebrahimi case META_THEN:
10034*22dc650dSSadaf Ebrahimi break;
10035*22dc650dSSadaf Ebrahimi
10036*22dc650dSSadaf Ebrahimi case META_RECURSE:
10037*22dc650dSSadaf Ebrahimi pptr += SIZEOFFSET;
10038*22dc650dSSadaf Ebrahimi break;
10039*22dc650dSSadaf Ebrahimi
10040*22dc650dSSadaf Ebrahimi case META_BACKREF_BYNAME:
10041*22dc650dSSadaf Ebrahimi case META_RECURSE_BYNAME:
10042*22dc650dSSadaf Ebrahimi pptr += 1 + SIZEOFFSET;
10043*22dc650dSSadaf Ebrahimi break;
10044*22dc650dSSadaf Ebrahimi
10045*22dc650dSSadaf Ebrahimi case META_COND_DEFINE:
10046*22dc650dSSadaf Ebrahimi pptr += SIZEOFFSET;
10047*22dc650dSSadaf Ebrahimi nestlevel++;
10048*22dc650dSSadaf Ebrahimi break;
10049*22dc650dSSadaf Ebrahimi
10050*22dc650dSSadaf Ebrahimi case META_COND_NAME:
10051*22dc650dSSadaf Ebrahimi case META_COND_NUMBER:
10052*22dc650dSSadaf Ebrahimi case META_COND_RNAME:
10053*22dc650dSSadaf Ebrahimi case META_COND_RNUMBER:
10054*22dc650dSSadaf Ebrahimi pptr += 1 + SIZEOFFSET;
10055*22dc650dSSadaf Ebrahimi nestlevel++;
10056*22dc650dSSadaf Ebrahimi break;
10057*22dc650dSSadaf Ebrahimi
10058*22dc650dSSadaf Ebrahimi case META_COND_VERSION:
10059*22dc650dSSadaf Ebrahimi pptr += 3;
10060*22dc650dSSadaf Ebrahimi nestlevel++;
10061*22dc650dSSadaf Ebrahimi break;
10062*22dc650dSSadaf Ebrahimi
10063*22dc650dSSadaf Ebrahimi case META_CALLOUT_STRING:
10064*22dc650dSSadaf Ebrahimi pptr += 3 + SIZEOFFSET;
10065*22dc650dSSadaf Ebrahimi break;
10066*22dc650dSSadaf Ebrahimi
10067*22dc650dSSadaf Ebrahimi case META_BIGVALUE:
10068*22dc650dSSadaf Ebrahimi case META_POSIX:
10069*22dc650dSSadaf Ebrahimi case META_POSIX_NEG:
10070*22dc650dSSadaf Ebrahimi pptr += 1;
10071*22dc650dSSadaf Ebrahimi break;
10072*22dc650dSSadaf Ebrahimi
10073*22dc650dSSadaf Ebrahimi case META_MINMAX:
10074*22dc650dSSadaf Ebrahimi case META_MINMAX_QUERY:
10075*22dc650dSSadaf Ebrahimi case META_MINMAX_PLUS:
10076*22dc650dSSadaf Ebrahimi case META_OPTIONS:
10077*22dc650dSSadaf Ebrahimi pptr += 2;
10078*22dc650dSSadaf Ebrahimi break;
10079*22dc650dSSadaf Ebrahimi
10080*22dc650dSSadaf Ebrahimi case META_CALLOUT_NUMBER:
10081*22dc650dSSadaf Ebrahimi pptr += 3;
10082*22dc650dSSadaf Ebrahimi break;
10083*22dc650dSSadaf Ebrahimi
10084*22dc650dSSadaf Ebrahimi case META_MARK:
10085*22dc650dSSadaf Ebrahimi case META_COMMIT_ARG:
10086*22dc650dSSadaf Ebrahimi case META_PRUNE_ARG:
10087*22dc650dSSadaf Ebrahimi case META_SKIP_ARG:
10088*22dc650dSSadaf Ebrahimi case META_THEN_ARG:
10089*22dc650dSSadaf Ebrahimi pptr += 1 + pptr[1];
10090*22dc650dSSadaf Ebrahimi break;
10091*22dc650dSSadaf Ebrahimi
10092*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND:
10093*22dc650dSSadaf Ebrahimi case META_LOOKBEHINDNOT:
10094*22dc650dSSadaf Ebrahimi case META_LOOKBEHIND_NA:
10095*22dc650dSSadaf Ebrahimi if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10096*22dc650dSSadaf Ebrahimi return errorcode;
10097*22dc650dSSadaf Ebrahimi break;
10098*22dc650dSSadaf Ebrahimi }
10099*22dc650dSSadaf Ebrahimi }
10100*22dc650dSSadaf Ebrahimi
10101*22dc650dSSadaf Ebrahimi return 0;
10102*22dc650dSSadaf Ebrahimi }
10103*22dc650dSSadaf Ebrahimi
10104*22dc650dSSadaf Ebrahimi
10105*22dc650dSSadaf Ebrahimi
10106*22dc650dSSadaf Ebrahimi /*************************************************
10107*22dc650dSSadaf Ebrahimi * External function to compile a pattern *
10108*22dc650dSSadaf Ebrahimi *************************************************/
10109*22dc650dSSadaf Ebrahimi
10110*22dc650dSSadaf Ebrahimi /* This function reads a regular expression in the form of a string and returns
10111*22dc650dSSadaf Ebrahimi a pointer to a block of store holding a compiled version of the expression.
10112*22dc650dSSadaf Ebrahimi
10113*22dc650dSSadaf Ebrahimi Arguments:
10114*22dc650dSSadaf Ebrahimi pattern the regular expression
10115*22dc650dSSadaf Ebrahimi patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
10116*22dc650dSSadaf Ebrahimi options option bits
10117*22dc650dSSadaf Ebrahimi errorptr pointer to errorcode
10118*22dc650dSSadaf Ebrahimi erroroffset pointer to error offset
10119*22dc650dSSadaf Ebrahimi ccontext points to a compile context or is NULL
10120*22dc650dSSadaf Ebrahimi
10121*22dc650dSSadaf Ebrahimi Returns: pointer to compiled data block, or NULL on error,
10122*22dc650dSSadaf Ebrahimi with errorcode and erroroffset set
10123*22dc650dSSadaf Ebrahimi */
10124*22dc650dSSadaf Ebrahimi
10125*22dc650dSSadaf Ebrahimi PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)10126*22dc650dSSadaf Ebrahimi pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10127*22dc650dSSadaf Ebrahimi int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10128*22dc650dSSadaf Ebrahimi {
10129*22dc650dSSadaf Ebrahimi BOOL utf; /* Set TRUE for UTF mode */
10130*22dc650dSSadaf Ebrahimi BOOL ucp; /* Set TRUE for UCP mode */
10131*22dc650dSSadaf Ebrahimi BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
10132*22dc650dSSadaf Ebrahimi BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
10133*22dc650dSSadaf Ebrahimi pcre2_real_code *re = NULL; /* What we will return */
10134*22dc650dSSadaf Ebrahimi compile_block cb; /* "Static" compile-time data */
10135*22dc650dSSadaf Ebrahimi const uint8_t *tables; /* Char tables base pointer */
10136*22dc650dSSadaf Ebrahimi
10137*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *code; /* Current pointer in compiled code */
10138*22dc650dSSadaf Ebrahimi PCRE2_SPTR codestart; /* Start of compiled code */
10139*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr; /* Current pointer in pattern */
10140*22dc650dSSadaf Ebrahimi uint32_t *pptr; /* Current pointer in parsed pattern */
10141*22dc650dSSadaf Ebrahimi
10142*22dc650dSSadaf Ebrahimi PCRE2_SIZE length = 1; /* Allow for final END opcode */
10143*22dc650dSSadaf Ebrahimi PCRE2_SIZE usedlength; /* Actual length used */
10144*22dc650dSSadaf Ebrahimi PCRE2_SIZE re_blocksize; /* Size of memory block */
10145*22dc650dSSadaf Ebrahimi PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
10146*22dc650dSSadaf Ebrahimi PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
10147*22dc650dSSadaf Ebrahimi
10148*22dc650dSSadaf Ebrahimi uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
10149*22dc650dSSadaf Ebrahimi uint32_t firstcu, reqcu; /* Value of first/req code unit */
10150*22dc650dSSadaf Ebrahimi uint32_t setflags = 0; /* NL and BSR set flags */
10151*22dc650dSSadaf Ebrahimi
10152*22dc650dSSadaf Ebrahimi uint32_t skipatstart; /* When checking (*UTF) etc */
10153*22dc650dSSadaf Ebrahimi uint32_t limit_heap = UINT32_MAX;
10154*22dc650dSSadaf Ebrahimi uint32_t limit_match = UINT32_MAX; /* Unset match limits */
10155*22dc650dSSadaf Ebrahimi uint32_t limit_depth = UINT32_MAX;
10156*22dc650dSSadaf Ebrahimi
10157*22dc650dSSadaf Ebrahimi int newline = 0; /* Unset; can be set by the pattern */
10158*22dc650dSSadaf Ebrahimi int bsr = 0; /* Unset; can be set by the pattern */
10159*22dc650dSSadaf Ebrahimi int errorcode = 0; /* Initialize to avoid compiler warn */
10160*22dc650dSSadaf Ebrahimi int regexrc; /* Return from compile */
10161*22dc650dSSadaf Ebrahimi
10162*22dc650dSSadaf Ebrahimi uint32_t i; /* Local loop counter */
10163*22dc650dSSadaf Ebrahimi
10164*22dc650dSSadaf Ebrahimi /* Comments at the head of this file explain about these variables. */
10165*22dc650dSSadaf Ebrahimi
10166*22dc650dSSadaf Ebrahimi uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10167*22dc650dSSadaf Ebrahimi uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10168*22dc650dSSadaf Ebrahimi named_group named_groups[NAMED_GROUP_LIST_SIZE];
10169*22dc650dSSadaf Ebrahimi
10170*22dc650dSSadaf Ebrahimi /* The workspace is used in different ways in the different compiling phases.
10171*22dc650dSSadaf Ebrahimi It needs to be 16-bit aligned for the preliminary parsing scan. */
10172*22dc650dSSadaf Ebrahimi
10173*22dc650dSSadaf Ebrahimi uint32_t c16workspace[C16_WORK_SIZE];
10174*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10175*22dc650dSSadaf Ebrahimi
10176*22dc650dSSadaf Ebrahimi
10177*22dc650dSSadaf Ebrahimi /* -------------- Check arguments and set up the pattern ----------------- */
10178*22dc650dSSadaf Ebrahimi
10179*22dc650dSSadaf Ebrahimi /* There must be error code and offset pointers. */
10180*22dc650dSSadaf Ebrahimi
10181*22dc650dSSadaf Ebrahimi if (errorptr == NULL || erroroffset == NULL) return NULL;
10182*22dc650dSSadaf Ebrahimi *errorptr = ERR0;
10183*22dc650dSSadaf Ebrahimi *erroroffset = 0;
10184*22dc650dSSadaf Ebrahimi
10185*22dc650dSSadaf Ebrahimi /* There must be a pattern, but NULL is allowed with zero length. */
10186*22dc650dSSadaf Ebrahimi
10187*22dc650dSSadaf Ebrahimi if (pattern == NULL)
10188*22dc650dSSadaf Ebrahimi {
10189*22dc650dSSadaf Ebrahimi if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10190*22dc650dSSadaf Ebrahimi {
10191*22dc650dSSadaf Ebrahimi *errorptr = ERR16;
10192*22dc650dSSadaf Ebrahimi return NULL;
10193*22dc650dSSadaf Ebrahimi }
10194*22dc650dSSadaf Ebrahimi }
10195*22dc650dSSadaf Ebrahimi
10196*22dc650dSSadaf Ebrahimi /* A NULL compile context means "use a default context" */
10197*22dc650dSSadaf Ebrahimi
10198*22dc650dSSadaf Ebrahimi if (ccontext == NULL)
10199*22dc650dSSadaf Ebrahimi ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10200*22dc650dSSadaf Ebrahimi
10201*22dc650dSSadaf Ebrahimi /* PCRE2_MATCH_INVALID_UTF implies UTF */
10202*22dc650dSSadaf Ebrahimi
10203*22dc650dSSadaf Ebrahimi if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10204*22dc650dSSadaf Ebrahimi
10205*22dc650dSSadaf Ebrahimi /* Check that all undefined public option bits are zero. */
10206*22dc650dSSadaf Ebrahimi
10207*22dc650dSSadaf Ebrahimi if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10208*22dc650dSSadaf Ebrahimi (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10209*22dc650dSSadaf Ebrahimi {
10210*22dc650dSSadaf Ebrahimi *errorptr = ERR17;
10211*22dc650dSSadaf Ebrahimi return NULL;
10212*22dc650dSSadaf Ebrahimi }
10213*22dc650dSSadaf Ebrahimi
10214*22dc650dSSadaf Ebrahimi if ((options & PCRE2_LITERAL) != 0 &&
10215*22dc650dSSadaf Ebrahimi ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10216*22dc650dSSadaf Ebrahimi (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10217*22dc650dSSadaf Ebrahimi {
10218*22dc650dSSadaf Ebrahimi *errorptr = ERR92;
10219*22dc650dSSadaf Ebrahimi return NULL;
10220*22dc650dSSadaf Ebrahimi }
10221*22dc650dSSadaf Ebrahimi
10222*22dc650dSSadaf Ebrahimi /* A zero-terminated pattern is indicated by the special length value
10223*22dc650dSSadaf Ebrahimi PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10224*22dc650dSSadaf Ebrahimi
10225*22dc650dSSadaf Ebrahimi if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10226*22dc650dSSadaf Ebrahimi patlen = PRIV(strlen)(pattern);
10227*22dc650dSSadaf Ebrahimi
10228*22dc650dSSadaf Ebrahimi if (patlen > ccontext->max_pattern_length)
10229*22dc650dSSadaf Ebrahimi {
10230*22dc650dSSadaf Ebrahimi *errorptr = ERR88;
10231*22dc650dSSadaf Ebrahimi return NULL;
10232*22dc650dSSadaf Ebrahimi }
10233*22dc650dSSadaf Ebrahimi
10234*22dc650dSSadaf Ebrahimi /* From here on, all returns from this function should end up going via the
10235*22dc650dSSadaf Ebrahimi EXIT label. */
10236*22dc650dSSadaf Ebrahimi
10237*22dc650dSSadaf Ebrahimi
10238*22dc650dSSadaf Ebrahimi /* ------------ Initialize the "static" compile data -------------- */
10239*22dc650dSSadaf Ebrahimi
10240*22dc650dSSadaf Ebrahimi tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10241*22dc650dSSadaf Ebrahimi
10242*22dc650dSSadaf Ebrahimi cb.lcc = tables + lcc_offset; /* Individual */
10243*22dc650dSSadaf Ebrahimi cb.fcc = tables + fcc_offset; /* character */
10244*22dc650dSSadaf Ebrahimi cb.cbits = tables + cbits_offset; /* tables */
10245*22dc650dSSadaf Ebrahimi cb.ctypes = tables + ctypes_offset;
10246*22dc650dSSadaf Ebrahimi
10247*22dc650dSSadaf Ebrahimi cb.assert_depth = 0;
10248*22dc650dSSadaf Ebrahimi cb.bracount = 0;
10249*22dc650dSSadaf Ebrahimi cb.cx = ccontext;
10250*22dc650dSSadaf Ebrahimi cb.dupnames = FALSE;
10251*22dc650dSSadaf Ebrahimi cb.end_pattern = pattern + patlen;
10252*22dc650dSSadaf Ebrahimi cb.erroroffset = 0;
10253*22dc650dSSadaf Ebrahimi cb.external_flags = 0;
10254*22dc650dSSadaf Ebrahimi cb.external_options = options;
10255*22dc650dSSadaf Ebrahimi cb.groupinfo = stack_groupinfo;
10256*22dc650dSSadaf Ebrahimi cb.had_recurse = FALSE;
10257*22dc650dSSadaf Ebrahimi cb.lastcapture = 0;
10258*22dc650dSSadaf Ebrahimi cb.max_lookbehind = 0; /* Max encountered */
10259*22dc650dSSadaf Ebrahimi cb.max_varlookbehind = ccontext->max_varlookbehind; /* Limit */
10260*22dc650dSSadaf Ebrahimi cb.name_entry_size = 0;
10261*22dc650dSSadaf Ebrahimi cb.name_table = NULL;
10262*22dc650dSSadaf Ebrahimi cb.named_groups = named_groups;
10263*22dc650dSSadaf Ebrahimi cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10264*22dc650dSSadaf Ebrahimi cb.names_found = 0;
10265*22dc650dSSadaf Ebrahimi cb.parens_depth = 0;
10266*22dc650dSSadaf Ebrahimi cb.parsed_pattern = stack_parsed_pattern;
10267*22dc650dSSadaf Ebrahimi cb.req_varyopt = 0;
10268*22dc650dSSadaf Ebrahimi cb.start_code = cworkspace;
10269*22dc650dSSadaf Ebrahimi cb.start_pattern = pattern;
10270*22dc650dSSadaf Ebrahimi cb.start_workspace = cworkspace;
10271*22dc650dSSadaf Ebrahimi cb.workspace_size = COMPILE_WORK_SIZE;
10272*22dc650dSSadaf Ebrahimi
10273*22dc650dSSadaf Ebrahimi /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10274*22dc650dSSadaf Ebrahimi references to help in deciding whether (.*) can be treated as anchored or not.
10275*22dc650dSSadaf Ebrahimi */
10276*22dc650dSSadaf Ebrahimi
10277*22dc650dSSadaf Ebrahimi cb.top_backref = 0;
10278*22dc650dSSadaf Ebrahimi cb.backref_map = 0;
10279*22dc650dSSadaf Ebrahimi
10280*22dc650dSSadaf Ebrahimi /* Escape sequences \1 to \9 are always back references, but as they are only
10281*22dc650dSSadaf Ebrahimi two characters long, only two elements can be used in the parsed_pattern
10282*22dc650dSSadaf Ebrahimi vector. The first contains the reference, and we'd like to use the second to
10283*22dc650dSSadaf Ebrahimi record the offset in the pattern, so that forward references to non-existent
10284*22dc650dSSadaf Ebrahimi groups can be diagnosed later with an offset. However, on 64-bit systems,
10285*22dc650dSSadaf Ebrahimi PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10286*22dc650dSSadaf Ebrahimi occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10287*22dc650dSSadaf Ebrahimi references have enough space for the offset to be put into the parsed pattern.
10288*22dc650dSSadaf Ebrahimi */
10289*22dc650dSSadaf Ebrahimi
10290*22dc650dSSadaf Ebrahimi for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10291*22dc650dSSadaf Ebrahimi
10292*22dc650dSSadaf Ebrahimi
10293*22dc650dSSadaf Ebrahimi /* --------------- Start looking at the pattern --------------- */
10294*22dc650dSSadaf Ebrahimi
10295*22dc650dSSadaf Ebrahimi /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10296*22dc650dSSadaf Ebrahimi the start of the pattern, and remember the offset to the actual regex. With
10297*22dc650dSSadaf Ebrahimi valgrind support, make the terminator of a zero-terminated pattern
10298*22dc650dSSadaf Ebrahimi inaccessible. This catches bugs that would otherwise only show up for
10299*22dc650dSSadaf Ebrahimi non-zero-terminated patterns. */
10300*22dc650dSSadaf Ebrahimi
10301*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_VALGRIND
10302*22dc650dSSadaf Ebrahimi if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10303*22dc650dSSadaf Ebrahimi #endif
10304*22dc650dSSadaf Ebrahimi
10305*22dc650dSSadaf Ebrahimi ptr = pattern;
10306*22dc650dSSadaf Ebrahimi skipatstart = 0;
10307*22dc650dSSadaf Ebrahimi
10308*22dc650dSSadaf Ebrahimi if ((options & PCRE2_LITERAL) == 0)
10309*22dc650dSSadaf Ebrahimi {
10310*22dc650dSSadaf Ebrahimi while (patlen - skipatstart >= 2 &&
10311*22dc650dSSadaf Ebrahimi ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10312*22dc650dSSadaf Ebrahimi ptr[skipatstart+1] == CHAR_ASTERISK)
10313*22dc650dSSadaf Ebrahimi {
10314*22dc650dSSadaf Ebrahimi for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10315*22dc650dSSadaf Ebrahimi {
10316*22dc650dSSadaf Ebrahimi uint32_t c, pp;
10317*22dc650dSSadaf Ebrahimi const pso *p = pso_list + i;
10318*22dc650dSSadaf Ebrahimi
10319*22dc650dSSadaf Ebrahimi if (patlen - skipatstart - 2 >= p->length &&
10320*22dc650dSSadaf Ebrahimi PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
10321*22dc650dSSadaf Ebrahimi p->length) == 0)
10322*22dc650dSSadaf Ebrahimi {
10323*22dc650dSSadaf Ebrahimi skipatstart += p->length + 2;
10324*22dc650dSSadaf Ebrahimi switch(p->type)
10325*22dc650dSSadaf Ebrahimi {
10326*22dc650dSSadaf Ebrahimi case PSO_OPT:
10327*22dc650dSSadaf Ebrahimi cb.external_options |= p->value;
10328*22dc650dSSadaf Ebrahimi break;
10329*22dc650dSSadaf Ebrahimi
10330*22dc650dSSadaf Ebrahimi case PSO_FLG:
10331*22dc650dSSadaf Ebrahimi setflags |= p->value;
10332*22dc650dSSadaf Ebrahimi break;
10333*22dc650dSSadaf Ebrahimi
10334*22dc650dSSadaf Ebrahimi case PSO_NL:
10335*22dc650dSSadaf Ebrahimi newline = p->value;
10336*22dc650dSSadaf Ebrahimi setflags |= PCRE2_NL_SET;
10337*22dc650dSSadaf Ebrahimi break;
10338*22dc650dSSadaf Ebrahimi
10339*22dc650dSSadaf Ebrahimi case PSO_BSR:
10340*22dc650dSSadaf Ebrahimi bsr = p->value;
10341*22dc650dSSadaf Ebrahimi setflags |= PCRE2_BSR_SET;
10342*22dc650dSSadaf Ebrahimi break;
10343*22dc650dSSadaf Ebrahimi
10344*22dc650dSSadaf Ebrahimi case PSO_LIMM:
10345*22dc650dSSadaf Ebrahimi case PSO_LIMD:
10346*22dc650dSSadaf Ebrahimi case PSO_LIMH:
10347*22dc650dSSadaf Ebrahimi c = 0;
10348*22dc650dSSadaf Ebrahimi pp = skipatstart;
10349*22dc650dSSadaf Ebrahimi if (!IS_DIGIT(ptr[pp]))
10350*22dc650dSSadaf Ebrahimi {
10351*22dc650dSSadaf Ebrahimi errorcode = ERR60;
10352*22dc650dSSadaf Ebrahimi ptr += pp;
10353*22dc650dSSadaf Ebrahimi goto HAD_EARLY_ERROR;
10354*22dc650dSSadaf Ebrahimi }
10355*22dc650dSSadaf Ebrahimi while (IS_DIGIT(ptr[pp]))
10356*22dc650dSSadaf Ebrahimi {
10357*22dc650dSSadaf Ebrahimi if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
10358*22dc650dSSadaf Ebrahimi c = c*10 + (ptr[pp++] - CHAR_0);
10359*22dc650dSSadaf Ebrahimi }
10360*22dc650dSSadaf Ebrahimi if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
10361*22dc650dSSadaf Ebrahimi {
10362*22dc650dSSadaf Ebrahimi errorcode = ERR60;
10363*22dc650dSSadaf Ebrahimi ptr += pp;
10364*22dc650dSSadaf Ebrahimi goto HAD_EARLY_ERROR;
10365*22dc650dSSadaf Ebrahimi }
10366*22dc650dSSadaf Ebrahimi if (p->type == PSO_LIMH) limit_heap = c;
10367*22dc650dSSadaf Ebrahimi else if (p->type == PSO_LIMM) limit_match = c;
10368*22dc650dSSadaf Ebrahimi else limit_depth = c;
10369*22dc650dSSadaf Ebrahimi skipatstart += pp - skipatstart;
10370*22dc650dSSadaf Ebrahimi break;
10371*22dc650dSSadaf Ebrahimi }
10372*22dc650dSSadaf Ebrahimi break; /* Out of the table scan loop */
10373*22dc650dSSadaf Ebrahimi }
10374*22dc650dSSadaf Ebrahimi }
10375*22dc650dSSadaf Ebrahimi if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
10376*22dc650dSSadaf Ebrahimi }
10377*22dc650dSSadaf Ebrahimi }
10378*22dc650dSSadaf Ebrahimi
10379*22dc650dSSadaf Ebrahimi /* End of pattern-start options; advance to start of real regex. */
10380*22dc650dSSadaf Ebrahimi
10381*22dc650dSSadaf Ebrahimi ptr += skipatstart;
10382*22dc650dSSadaf Ebrahimi
10383*22dc650dSSadaf Ebrahimi /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10384*22dc650dSSadaf Ebrahimi
10385*22dc650dSSadaf Ebrahimi #ifndef SUPPORT_UNICODE
10386*22dc650dSSadaf Ebrahimi if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10387*22dc650dSSadaf Ebrahimi {
10388*22dc650dSSadaf Ebrahimi errorcode = ERR32;
10389*22dc650dSSadaf Ebrahimi goto HAD_EARLY_ERROR;
10390*22dc650dSSadaf Ebrahimi }
10391*22dc650dSSadaf Ebrahimi #endif
10392*22dc650dSSadaf Ebrahimi
10393*22dc650dSSadaf Ebrahimi /* Check UTF. We have the original options in 'options', with that value as
10394*22dc650dSSadaf Ebrahimi modified by (*UTF) etc in cb->external_options. The extra option
10395*22dc650dSSadaf Ebrahimi PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10396*22dc650dSSadaf Ebrahimi surrogate code points cannot be represented in UTF-16. */
10397*22dc650dSSadaf Ebrahimi
10398*22dc650dSSadaf Ebrahimi utf = (cb.external_options & PCRE2_UTF) != 0;
10399*22dc650dSSadaf Ebrahimi if (utf)
10400*22dc650dSSadaf Ebrahimi {
10401*22dc650dSSadaf Ebrahimi if ((options & PCRE2_NEVER_UTF) != 0)
10402*22dc650dSSadaf Ebrahimi {
10403*22dc650dSSadaf Ebrahimi errorcode = ERR74;
10404*22dc650dSSadaf Ebrahimi goto HAD_EARLY_ERROR;
10405*22dc650dSSadaf Ebrahimi }
10406*22dc650dSSadaf Ebrahimi if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10407*22dc650dSSadaf Ebrahimi (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10408*22dc650dSSadaf Ebrahimi goto HAD_ERROR; /* Offset was set by valid_utf() */
10409*22dc650dSSadaf Ebrahimi
10410*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 16
10411*22dc650dSSadaf Ebrahimi if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10412*22dc650dSSadaf Ebrahimi {
10413*22dc650dSSadaf Ebrahimi errorcode = ERR91;
10414*22dc650dSSadaf Ebrahimi goto HAD_EARLY_ERROR;
10415*22dc650dSSadaf Ebrahimi }
10416*22dc650dSSadaf Ebrahimi #endif
10417*22dc650dSSadaf Ebrahimi }
10418*22dc650dSSadaf Ebrahimi
10419*22dc650dSSadaf Ebrahimi /* Check UCP lockout. */
10420*22dc650dSSadaf Ebrahimi
10421*22dc650dSSadaf Ebrahimi ucp = (cb.external_options & PCRE2_UCP) != 0;
10422*22dc650dSSadaf Ebrahimi if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10423*22dc650dSSadaf Ebrahimi {
10424*22dc650dSSadaf Ebrahimi errorcode = ERR75;
10425*22dc650dSSadaf Ebrahimi goto HAD_EARLY_ERROR;
10426*22dc650dSSadaf Ebrahimi }
10427*22dc650dSSadaf Ebrahimi
10428*22dc650dSSadaf Ebrahimi /* Process the BSR setting. */
10429*22dc650dSSadaf Ebrahimi
10430*22dc650dSSadaf Ebrahimi if (bsr == 0) bsr = ccontext->bsr_convention;
10431*22dc650dSSadaf Ebrahimi
10432*22dc650dSSadaf Ebrahimi /* Process the newline setting. */
10433*22dc650dSSadaf Ebrahimi
10434*22dc650dSSadaf Ebrahimi if (newline == 0) newline = ccontext->newline_convention;
10435*22dc650dSSadaf Ebrahimi cb.nltype = NLTYPE_FIXED;
10436*22dc650dSSadaf Ebrahimi switch(newline)
10437*22dc650dSSadaf Ebrahimi {
10438*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_CR:
10439*22dc650dSSadaf Ebrahimi cb.nllen = 1;
10440*22dc650dSSadaf Ebrahimi cb.nl[0] = CHAR_CR;
10441*22dc650dSSadaf Ebrahimi break;
10442*22dc650dSSadaf Ebrahimi
10443*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_LF:
10444*22dc650dSSadaf Ebrahimi cb.nllen = 1;
10445*22dc650dSSadaf Ebrahimi cb.nl[0] = CHAR_NL;
10446*22dc650dSSadaf Ebrahimi break;
10447*22dc650dSSadaf Ebrahimi
10448*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_NUL:
10449*22dc650dSSadaf Ebrahimi cb.nllen = 1;
10450*22dc650dSSadaf Ebrahimi cb.nl[0] = CHAR_NUL;
10451*22dc650dSSadaf Ebrahimi break;
10452*22dc650dSSadaf Ebrahimi
10453*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_CRLF:
10454*22dc650dSSadaf Ebrahimi cb.nllen = 2;
10455*22dc650dSSadaf Ebrahimi cb.nl[0] = CHAR_CR;
10456*22dc650dSSadaf Ebrahimi cb.nl[1] = CHAR_NL;
10457*22dc650dSSadaf Ebrahimi break;
10458*22dc650dSSadaf Ebrahimi
10459*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_ANY:
10460*22dc650dSSadaf Ebrahimi cb.nltype = NLTYPE_ANY;
10461*22dc650dSSadaf Ebrahimi break;
10462*22dc650dSSadaf Ebrahimi
10463*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_ANYCRLF:
10464*22dc650dSSadaf Ebrahimi cb.nltype = NLTYPE_ANYCRLF;
10465*22dc650dSSadaf Ebrahimi break;
10466*22dc650dSSadaf Ebrahimi
10467*22dc650dSSadaf Ebrahimi default:
10468*22dc650dSSadaf Ebrahimi errorcode = ERR56;
10469*22dc650dSSadaf Ebrahimi goto HAD_EARLY_ERROR;
10470*22dc650dSSadaf Ebrahimi }
10471*22dc650dSSadaf Ebrahimi
10472*22dc650dSSadaf Ebrahimi /* Pre-scan the pattern to do two things: (1) Discover the named groups and
10473*22dc650dSSadaf Ebrahimi their numerical equivalents, so that this information is always available for
10474*22dc650dSSadaf Ebrahimi the remaining processing. (2) At the same time, parse the pattern and put a
10475*22dc650dSSadaf Ebrahimi processed version into the parsed_pattern vector. This has escapes interpreted
10476*22dc650dSSadaf Ebrahimi and comments removed (amongst other things).
10477*22dc650dSSadaf Ebrahimi
10478*22dc650dSSadaf Ebrahimi In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10479*22dc650dSSadaf Ebrahimi 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10480*22dc650dSSadaf Ebrahimi one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
10481*22dc650dSSadaf Ebrahimi set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
10482*22dc650dSSadaf Ebrahimi characters greater than META_END (0x80000000) have to be coded as two units. In
10483*22dc650dSSadaf Ebrahimi this case, therefore, we scan the pattern to check for such values. */
10484*22dc650dSSadaf Ebrahimi
10485*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
10486*22dc650dSSadaf Ebrahimi if (!utf)
10487*22dc650dSSadaf Ebrahimi {
10488*22dc650dSSadaf Ebrahimi PCRE2_SPTR p;
10489*22dc650dSSadaf Ebrahimi for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10490*22dc650dSSadaf Ebrahimi }
10491*22dc650dSSadaf Ebrahimi #endif
10492*22dc650dSSadaf Ebrahimi
10493*22dc650dSSadaf Ebrahimi /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10494*22dc650dSSadaf Ebrahimi is set we have to assume a numerical callout (4 elements) for each character
10495*22dc650dSSadaf Ebrahimi plus one at the end. This is overkill, but memory is plentiful these days. For
10496*22dc650dSSadaf Ebrahimi many smaller patterns the vector on the stack (which was set up above) can be
10497*22dc650dSSadaf Ebrahimi used. */
10498*22dc650dSSadaf Ebrahimi
10499*22dc650dSSadaf Ebrahimi parsed_size_needed = patlen - skipatstart + big32count;
10500*22dc650dSSadaf Ebrahimi
10501*22dc650dSSadaf Ebrahimi if ((ccontext->extra_options &
10502*22dc650dSSadaf Ebrahimi (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10503*22dc650dSSadaf Ebrahimi parsed_size_needed += 4;
10504*22dc650dSSadaf Ebrahimi
10505*22dc650dSSadaf Ebrahimi if ((options & PCRE2_AUTO_CALLOUT) != 0)
10506*22dc650dSSadaf Ebrahimi parsed_size_needed = (parsed_size_needed + 1) * 5;
10507*22dc650dSSadaf Ebrahimi
10508*22dc650dSSadaf Ebrahimi if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10509*22dc650dSSadaf Ebrahimi {
10510*22dc650dSSadaf Ebrahimi uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10511*22dc650dSSadaf Ebrahimi (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10512*22dc650dSSadaf Ebrahimi if (heap_parsed_pattern == NULL)
10513*22dc650dSSadaf Ebrahimi {
10514*22dc650dSSadaf Ebrahimi *errorptr = ERR21;
10515*22dc650dSSadaf Ebrahimi goto EXIT;
10516*22dc650dSSadaf Ebrahimi }
10517*22dc650dSSadaf Ebrahimi cb.parsed_pattern = heap_parsed_pattern;
10518*22dc650dSSadaf Ebrahimi }
10519*22dc650dSSadaf Ebrahimi cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10520*22dc650dSSadaf Ebrahimi
10521*22dc650dSSadaf Ebrahimi /* Do the parsing scan. */
10522*22dc650dSSadaf Ebrahimi
10523*22dc650dSSadaf Ebrahimi errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10524*22dc650dSSadaf Ebrahimi if (errorcode != 0) goto HAD_CB_ERROR;
10525*22dc650dSSadaf Ebrahimi
10526*22dc650dSSadaf Ebrahimi /* If there are any lookbehinds, scan the parsed pattern to figure out their
10527*22dc650dSSadaf Ebrahimi lengths. Workspace is needed to remember whether numbered groups are or are not
10528*22dc650dSSadaf Ebrahimi of limited length, and if limited, what the minimum and maximum lengths are.
10529*22dc650dSSadaf Ebrahimi This caching saves re-computing the length of any group that is referenced more
10530*22dc650dSSadaf Ebrahimi than once, which is particularly relevant when recursion is involved.
10531*22dc650dSSadaf Ebrahimi Unnumbered groups do not have this exposure because they cannot be referenced.
10532*22dc650dSSadaf Ebrahimi If there are sufficiently few groups, the default index vector on the stack, as
10533*22dc650dSSadaf Ebrahimi set up above, can be used. Otherwise we have to get/free some heap memory. The
10534*22dc650dSSadaf Ebrahimi vector must be initialized to zero. */
10535*22dc650dSSadaf Ebrahimi
10536*22dc650dSSadaf Ebrahimi if (has_lookbehind)
10537*22dc650dSSadaf Ebrahimi {
10538*22dc650dSSadaf Ebrahimi int loopcount = 0;
10539*22dc650dSSadaf Ebrahimi if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10540*22dc650dSSadaf Ebrahimi {
10541*22dc650dSSadaf Ebrahimi cb.groupinfo = ccontext->memctl.malloc(
10542*22dc650dSSadaf Ebrahimi (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10543*22dc650dSSadaf Ebrahimi if (cb.groupinfo == NULL)
10544*22dc650dSSadaf Ebrahimi {
10545*22dc650dSSadaf Ebrahimi errorcode = ERR21;
10546*22dc650dSSadaf Ebrahimi cb.erroroffset = 0;
10547*22dc650dSSadaf Ebrahimi goto HAD_CB_ERROR;
10548*22dc650dSSadaf Ebrahimi }
10549*22dc650dSSadaf Ebrahimi }
10550*22dc650dSSadaf Ebrahimi memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10551*22dc650dSSadaf Ebrahimi errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10552*22dc650dSSadaf Ebrahimi if (errorcode != 0) goto HAD_CB_ERROR;
10553*22dc650dSSadaf Ebrahimi }
10554*22dc650dSSadaf Ebrahimi
10555*22dc650dSSadaf Ebrahimi /* For debugging, there is a function that shows the parsed pattern vector. */
10556*22dc650dSSadaf Ebrahimi
10557*22dc650dSSadaf Ebrahimi #ifdef DEBUG_SHOW_PARSED
10558*22dc650dSSadaf Ebrahimi fprintf(stderr, "+++ Pre-scan complete:\n");
10559*22dc650dSSadaf Ebrahimi show_parsed(&cb);
10560*22dc650dSSadaf Ebrahimi #endif
10561*22dc650dSSadaf Ebrahimi
10562*22dc650dSSadaf Ebrahimi /* For debugging capturing information this code can be enabled. */
10563*22dc650dSSadaf Ebrahimi
10564*22dc650dSSadaf Ebrahimi #ifdef DEBUG_SHOW_CAPTURES
10565*22dc650dSSadaf Ebrahimi {
10566*22dc650dSSadaf Ebrahimi named_group *ng = cb.named_groups;
10567*22dc650dSSadaf Ebrahimi fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10568*22dc650dSSadaf Ebrahimi for (i = 0; i < cb.names_found; i++, ng++)
10569*22dc650dSSadaf Ebrahimi {
10570*22dc650dSSadaf Ebrahimi fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10571*22dc650dSSadaf Ebrahimi }
10572*22dc650dSSadaf Ebrahimi }
10573*22dc650dSSadaf Ebrahimi #endif
10574*22dc650dSSadaf Ebrahimi
10575*22dc650dSSadaf Ebrahimi /* Pretend to compile the pattern while actually just accumulating the amount
10576*22dc650dSSadaf Ebrahimi of memory required in the 'length' variable. This behaviour is triggered by
10577*22dc650dSSadaf Ebrahimi passing a non-NULL final argument to compile_regex(). We pass a block of
10578*22dc650dSSadaf Ebrahimi workspace (cworkspace) for it to compile parts of the pattern into; the
10579*22dc650dSSadaf Ebrahimi compiled code is discarded when it is no longer needed, so hopefully this
10580*22dc650dSSadaf Ebrahimi workspace will never overflow, though there is a test for its doing so.
10581*22dc650dSSadaf Ebrahimi
10582*22dc650dSSadaf Ebrahimi On error, errorcode will be set non-zero, so we don't need to look at the
10583*22dc650dSSadaf Ebrahimi result of the function. The initial options have been put into the cb block,
10584*22dc650dSSadaf Ebrahimi but we still have to pass a separate options variable (the first argument)
10585*22dc650dSSadaf Ebrahimi because the options may change as the pattern is processed. */
10586*22dc650dSSadaf Ebrahimi
10587*22dc650dSSadaf Ebrahimi cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
10588*22dc650dSSadaf Ebrahimi pptr = cb.parsed_pattern;
10589*22dc650dSSadaf Ebrahimi code = cworkspace;
10590*22dc650dSSadaf Ebrahimi *code = OP_BRA;
10591*22dc650dSSadaf Ebrahimi
10592*22dc650dSSadaf Ebrahimi (void)compile_regex(cb.external_options, ccontext->extra_options, &code, &pptr,
10593*22dc650dSSadaf Ebrahimi &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10594*22dc650dSSadaf Ebrahimi &cb, &length);
10595*22dc650dSSadaf Ebrahimi
10596*22dc650dSSadaf Ebrahimi if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
10597*22dc650dSSadaf Ebrahimi
10598*22dc650dSSadaf Ebrahimi /* This should be caught in compile_regex(), but just in case... */
10599*22dc650dSSadaf Ebrahimi
10600*22dc650dSSadaf Ebrahimi if (length > MAX_PATTERN_SIZE)
10601*22dc650dSSadaf Ebrahimi {
10602*22dc650dSSadaf Ebrahimi errorcode = ERR20;
10603*22dc650dSSadaf Ebrahimi goto HAD_CB_ERROR;
10604*22dc650dSSadaf Ebrahimi }
10605*22dc650dSSadaf Ebrahimi
10606*22dc650dSSadaf Ebrahimi /* Compute the size of, then, if not too large, get and initialize the data
10607*22dc650dSSadaf Ebrahimi block for storing the compiled pattern and names table. Integer overflow should
10608*22dc650dSSadaf Ebrahimi no longer be possible because nowadays we limit the maximum value of
10609*22dc650dSSadaf Ebrahimi cb.names_found and cb.name_entry_size. */
10610*22dc650dSSadaf Ebrahimi
10611*22dc650dSSadaf Ebrahimi re_blocksize = sizeof(pcre2_real_code) +
10612*22dc650dSSadaf Ebrahimi CU2BYTES(length +
10613*22dc650dSSadaf Ebrahimi (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10614*22dc650dSSadaf Ebrahimi
10615*22dc650dSSadaf Ebrahimi if (re_blocksize > ccontext->max_pattern_compiled_length)
10616*22dc650dSSadaf Ebrahimi {
10617*22dc650dSSadaf Ebrahimi errorcode = ERR101;
10618*22dc650dSSadaf Ebrahimi goto HAD_CB_ERROR;
10619*22dc650dSSadaf Ebrahimi }
10620*22dc650dSSadaf Ebrahimi
10621*22dc650dSSadaf Ebrahimi re = (pcre2_real_code *)
10622*22dc650dSSadaf Ebrahimi ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10623*22dc650dSSadaf Ebrahimi if (re == NULL)
10624*22dc650dSSadaf Ebrahimi {
10625*22dc650dSSadaf Ebrahimi errorcode = ERR21;
10626*22dc650dSSadaf Ebrahimi goto HAD_CB_ERROR;
10627*22dc650dSSadaf Ebrahimi }
10628*22dc650dSSadaf Ebrahimi
10629*22dc650dSSadaf Ebrahimi /* The compiler may put padding at the end of the pcre2_real_code structure in
10630*22dc650dSSadaf Ebrahimi order to round it up to a multiple of 4 or 8 bytes. This means that when a
10631*22dc650dSSadaf Ebrahimi compiled pattern is copied (for example, when serialized) undefined bytes are
10632*22dc650dSSadaf Ebrahimi read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10633*22dc650dSSadaf Ebrahimi write to the last 8 bytes of the structure before setting the fields. */
10634*22dc650dSSadaf Ebrahimi
10635*22dc650dSSadaf Ebrahimi memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10636*22dc650dSSadaf Ebrahimi re->memctl = ccontext->memctl;
10637*22dc650dSSadaf Ebrahimi re->tables = tables;
10638*22dc650dSSadaf Ebrahimi re->executable_jit = NULL;
10639*22dc650dSSadaf Ebrahimi memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10640*22dc650dSSadaf Ebrahimi re->blocksize = re_blocksize;
10641*22dc650dSSadaf Ebrahimi re->magic_number = MAGIC_NUMBER;
10642*22dc650dSSadaf Ebrahimi re->compile_options = options;
10643*22dc650dSSadaf Ebrahimi re->overall_options = cb.external_options;
10644*22dc650dSSadaf Ebrahimi re->extra_options = ccontext->extra_options;
10645*22dc650dSSadaf Ebrahimi re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10646*22dc650dSSadaf Ebrahimi re->limit_heap = limit_heap;
10647*22dc650dSSadaf Ebrahimi re->limit_match = limit_match;
10648*22dc650dSSadaf Ebrahimi re->limit_depth = limit_depth;
10649*22dc650dSSadaf Ebrahimi re->first_codeunit = 0;
10650*22dc650dSSadaf Ebrahimi re->last_codeunit = 0;
10651*22dc650dSSadaf Ebrahimi re->bsr_convention = bsr;
10652*22dc650dSSadaf Ebrahimi re->newline_convention = newline;
10653*22dc650dSSadaf Ebrahimi re->max_lookbehind = 0;
10654*22dc650dSSadaf Ebrahimi re->minlength = 0;
10655*22dc650dSSadaf Ebrahimi re->top_bracket = 0;
10656*22dc650dSSadaf Ebrahimi re->top_backref = 0;
10657*22dc650dSSadaf Ebrahimi re->name_entry_size = cb.name_entry_size;
10658*22dc650dSSadaf Ebrahimi re->name_count = cb.names_found;
10659*22dc650dSSadaf Ebrahimi
10660*22dc650dSSadaf Ebrahimi /* The basic block is immediately followed by the name table, and the compiled
10661*22dc650dSSadaf Ebrahimi code follows after that. */
10662*22dc650dSSadaf Ebrahimi
10663*22dc650dSSadaf Ebrahimi codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10664*22dc650dSSadaf Ebrahimi re->name_entry_size * re->name_count;
10665*22dc650dSSadaf Ebrahimi
10666*22dc650dSSadaf Ebrahimi /* Update the compile data block for the actual compile. The starting points of
10667*22dc650dSSadaf Ebrahimi the name/number translation table and of the code are passed around in the
10668*22dc650dSSadaf Ebrahimi compile data block. The start/end pattern and initial options are already set
10669*22dc650dSSadaf Ebrahimi from the pre-compile phase, as is the name_entry_size field. */
10670*22dc650dSSadaf Ebrahimi
10671*22dc650dSSadaf Ebrahimi cb.parens_depth = 0;
10672*22dc650dSSadaf Ebrahimi cb.assert_depth = 0;
10673*22dc650dSSadaf Ebrahimi cb.lastcapture = 0;
10674*22dc650dSSadaf Ebrahimi cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10675*22dc650dSSadaf Ebrahimi cb.start_code = codestart;
10676*22dc650dSSadaf Ebrahimi cb.req_varyopt = 0;
10677*22dc650dSSadaf Ebrahimi cb.had_accept = FALSE;
10678*22dc650dSSadaf Ebrahimi cb.had_pruneorskip = FALSE;
10679*22dc650dSSadaf Ebrahimi
10680*22dc650dSSadaf Ebrahimi /* If any named groups were found, create the name/number table from the list
10681*22dc650dSSadaf Ebrahimi created in the pre-pass. */
10682*22dc650dSSadaf Ebrahimi
10683*22dc650dSSadaf Ebrahimi if (cb.names_found > 0)
10684*22dc650dSSadaf Ebrahimi {
10685*22dc650dSSadaf Ebrahimi named_group *ng = cb.named_groups;
10686*22dc650dSSadaf Ebrahimi for (i = 0; i < cb.names_found; i++, ng++)
10687*22dc650dSSadaf Ebrahimi add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10688*22dc650dSSadaf Ebrahimi }
10689*22dc650dSSadaf Ebrahimi
10690*22dc650dSSadaf Ebrahimi /* Set up a starting, non-extracting bracket, then compile the expression. On
10691*22dc650dSSadaf Ebrahimi error, errorcode will be set non-zero, so we don't need to look at the result
10692*22dc650dSSadaf Ebrahimi of the function here. */
10693*22dc650dSSadaf Ebrahimi
10694*22dc650dSSadaf Ebrahimi pptr = cb.parsed_pattern;
10695*22dc650dSSadaf Ebrahimi code = (PCRE2_UCHAR *)codestart;
10696*22dc650dSSadaf Ebrahimi *code = OP_BRA;
10697*22dc650dSSadaf Ebrahimi regexrc = compile_regex(re->overall_options, ccontext->extra_options, &code,
10698*22dc650dSSadaf Ebrahimi &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10699*22dc650dSSadaf Ebrahimi NULL, &cb, NULL);
10700*22dc650dSSadaf Ebrahimi if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10701*22dc650dSSadaf Ebrahimi re->top_bracket = cb.bracount;
10702*22dc650dSSadaf Ebrahimi re->top_backref = cb.top_backref;
10703*22dc650dSSadaf Ebrahimi re->max_lookbehind = cb.max_lookbehind;
10704*22dc650dSSadaf Ebrahimi
10705*22dc650dSSadaf Ebrahimi if (cb.had_accept)
10706*22dc650dSSadaf Ebrahimi {
10707*22dc650dSSadaf Ebrahimi reqcu = 0; /* Must disable after (*ACCEPT) */
10708*22dc650dSSadaf Ebrahimi reqcuflags = REQ_NONE;
10709*22dc650dSSadaf Ebrahimi re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
10710*22dc650dSSadaf Ebrahimi }
10711*22dc650dSSadaf Ebrahimi
10712*22dc650dSSadaf Ebrahimi /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10713*22dc650dSSadaf Ebrahimi but the estimated length exceeds the really used length, adjust the value of
10714*22dc650dSSadaf Ebrahimi re->blocksize, and if valgrind support is configured, mark the extra allocated
10715*22dc650dSSadaf Ebrahimi memory as unaddressable, so that any out-of-bound reads can be detected. */
10716*22dc650dSSadaf Ebrahimi
10717*22dc650dSSadaf Ebrahimi *code++ = OP_END;
10718*22dc650dSSadaf Ebrahimi usedlength = code - codestart;
10719*22dc650dSSadaf Ebrahimi if (usedlength > length) errorcode = ERR23; else
10720*22dc650dSSadaf Ebrahimi {
10721*22dc650dSSadaf Ebrahimi re->blocksize -= CU2BYTES(length - usedlength);
10722*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_VALGRIND
10723*22dc650dSSadaf Ebrahimi VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10724*22dc650dSSadaf Ebrahimi #endif
10725*22dc650dSSadaf Ebrahimi }
10726*22dc650dSSadaf Ebrahimi
10727*22dc650dSSadaf Ebrahimi /* Scan the pattern for recursion/subroutine calls and convert the group
10728*22dc650dSSadaf Ebrahimi numbers into offsets. Maintain a small cache so that repeated groups containing
10729*22dc650dSSadaf Ebrahimi recursions are efficiently handled. */
10730*22dc650dSSadaf Ebrahimi
10731*22dc650dSSadaf Ebrahimi #define RSCAN_CACHE_SIZE 8
10732*22dc650dSSadaf Ebrahimi
10733*22dc650dSSadaf Ebrahimi if (errorcode == 0 && cb.had_recurse)
10734*22dc650dSSadaf Ebrahimi {
10735*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *rcode;
10736*22dc650dSSadaf Ebrahimi PCRE2_SPTR rgroup;
10737*22dc650dSSadaf Ebrahimi unsigned int ccount = 0;
10738*22dc650dSSadaf Ebrahimi int start = RSCAN_CACHE_SIZE;
10739*22dc650dSSadaf Ebrahimi recurse_cache rc[RSCAN_CACHE_SIZE];
10740*22dc650dSSadaf Ebrahimi
10741*22dc650dSSadaf Ebrahimi for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10742*22dc650dSSadaf Ebrahimi rcode != NULL;
10743*22dc650dSSadaf Ebrahimi rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10744*22dc650dSSadaf Ebrahimi {
10745*22dc650dSSadaf Ebrahimi int p, groupnumber;
10746*22dc650dSSadaf Ebrahimi
10747*22dc650dSSadaf Ebrahimi groupnumber = (int)GET(rcode, 1);
10748*22dc650dSSadaf Ebrahimi if (groupnumber == 0) rgroup = codestart; else
10749*22dc650dSSadaf Ebrahimi {
10750*22dc650dSSadaf Ebrahimi PCRE2_SPTR search_from = codestart;
10751*22dc650dSSadaf Ebrahimi rgroup = NULL;
10752*22dc650dSSadaf Ebrahimi for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10753*22dc650dSSadaf Ebrahimi {
10754*22dc650dSSadaf Ebrahimi if (groupnumber == rc[p].groupnumber)
10755*22dc650dSSadaf Ebrahimi {
10756*22dc650dSSadaf Ebrahimi rgroup = rc[p].group;
10757*22dc650dSSadaf Ebrahimi break;
10758*22dc650dSSadaf Ebrahimi }
10759*22dc650dSSadaf Ebrahimi
10760*22dc650dSSadaf Ebrahimi /* Group n+1 must always start to the right of group n, so we can save
10761*22dc650dSSadaf Ebrahimi search time below when the new group number is greater than any of the
10762*22dc650dSSadaf Ebrahimi previously found groups. */
10763*22dc650dSSadaf Ebrahimi
10764*22dc650dSSadaf Ebrahimi if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10765*22dc650dSSadaf Ebrahimi }
10766*22dc650dSSadaf Ebrahimi
10767*22dc650dSSadaf Ebrahimi if (rgroup == NULL)
10768*22dc650dSSadaf Ebrahimi {
10769*22dc650dSSadaf Ebrahimi rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10770*22dc650dSSadaf Ebrahimi if (rgroup == NULL)
10771*22dc650dSSadaf Ebrahimi {
10772*22dc650dSSadaf Ebrahimi errorcode = ERR53;
10773*22dc650dSSadaf Ebrahimi break;
10774*22dc650dSSadaf Ebrahimi }
10775*22dc650dSSadaf Ebrahimi if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10776*22dc650dSSadaf Ebrahimi rc[start].groupnumber = groupnumber;
10777*22dc650dSSadaf Ebrahimi rc[start].group = rgroup;
10778*22dc650dSSadaf Ebrahimi if (ccount < RSCAN_CACHE_SIZE) ccount++;
10779*22dc650dSSadaf Ebrahimi }
10780*22dc650dSSadaf Ebrahimi }
10781*22dc650dSSadaf Ebrahimi
10782*22dc650dSSadaf Ebrahimi PUT(rcode, 1, rgroup - codestart);
10783*22dc650dSSadaf Ebrahimi }
10784*22dc650dSSadaf Ebrahimi }
10785*22dc650dSSadaf Ebrahimi
10786*22dc650dSSadaf Ebrahimi /* In rare debugging situations we sometimes need to look at the compiled code
10787*22dc650dSSadaf Ebrahimi at this stage. */
10788*22dc650dSSadaf Ebrahimi
10789*22dc650dSSadaf Ebrahimi #ifdef DEBUG_CALL_PRINTINT
10790*22dc650dSSadaf Ebrahimi pcre2_printint(re, stderr, TRUE);
10791*22dc650dSSadaf Ebrahimi fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10792*22dc650dSSadaf Ebrahimi #endif
10793*22dc650dSSadaf Ebrahimi
10794*22dc650dSSadaf Ebrahimi /* Unless disabled, check whether any single character iterators can be
10795*22dc650dSSadaf Ebrahimi auto-possessified. The function overwrites the appropriate opcode values, so
10796*22dc650dSSadaf Ebrahimi the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10797*22dc650dSSadaf Ebrahimi used in this code because at least one compiler gives a warning about loss of
10798*22dc650dSSadaf Ebrahimi "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10799*22dc650dSSadaf Ebrahimi function call. */
10800*22dc650dSSadaf Ebrahimi
10801*22dc650dSSadaf Ebrahimi if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10802*22dc650dSSadaf Ebrahimi {
10803*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10804*22dc650dSSadaf Ebrahimi if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10805*22dc650dSSadaf Ebrahimi }
10806*22dc650dSSadaf Ebrahimi
10807*22dc650dSSadaf Ebrahimi /* Failed to compile, or error while post-processing. */
10808*22dc650dSSadaf Ebrahimi
10809*22dc650dSSadaf Ebrahimi if (errorcode != 0) goto HAD_CB_ERROR;
10810*22dc650dSSadaf Ebrahimi
10811*22dc650dSSadaf Ebrahimi /* Successful compile. If the anchored option was not passed, set it if
10812*22dc650dSSadaf Ebrahimi we can determine that the pattern is anchored by virtue of ^ characters or \A
10813*22dc650dSSadaf Ebrahimi or anything else, such as starting with non-atomic .* when DOTALL is set and
10814*22dc650dSSadaf Ebrahimi there are no occurrences of *PRUNE or *SKIP (though there is an option to
10815*22dc650dSSadaf Ebrahimi disable this case). */
10816*22dc650dSSadaf Ebrahimi
10817*22dc650dSSadaf Ebrahimi if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10818*22dc650dSSadaf Ebrahimi is_anchored(codestart, 0, &cb, 0, FALSE))
10819*22dc650dSSadaf Ebrahimi re->overall_options |= PCRE2_ANCHORED;
10820*22dc650dSSadaf Ebrahimi
10821*22dc650dSSadaf Ebrahimi /* Set up the first code unit or startline flag, the required code unit, and
10822*22dc650dSSadaf Ebrahimi then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10823*22dc650dSSadaf Ebrahimi is set, as the data it would create will not be used. Note that a first code
10824*22dc650dSSadaf Ebrahimi unit (but not the startline flag) is useful for anchored patterns because it
10825*22dc650dSSadaf Ebrahimi can still give a quick "no match" and also avoid searching for a last code
10826*22dc650dSSadaf Ebrahimi unit. */
10827*22dc650dSSadaf Ebrahimi
10828*22dc650dSSadaf Ebrahimi if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10829*22dc650dSSadaf Ebrahimi {
10830*22dc650dSSadaf Ebrahimi int minminlength = 0; /* For minimal minlength from first/required CU */
10831*22dc650dSSadaf Ebrahimi
10832*22dc650dSSadaf Ebrahimi /* If we do not have a first code unit, see if there is one that is asserted
10833*22dc650dSSadaf Ebrahimi (these are not saved during the compile because they can cause conflicts with
10834*22dc650dSSadaf Ebrahimi actual literals that follow). */
10835*22dc650dSSadaf Ebrahimi
10836*22dc650dSSadaf Ebrahimi if (firstcuflags >= REQ_NONE)
10837*22dc650dSSadaf Ebrahimi firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10838*22dc650dSSadaf Ebrahimi
10839*22dc650dSSadaf Ebrahimi /* Save the data for a first code unit. The existence of one means the
10840*22dc650dSSadaf Ebrahimi minimum length must be at least 1. */
10841*22dc650dSSadaf Ebrahimi
10842*22dc650dSSadaf Ebrahimi if (firstcuflags < REQ_NONE)
10843*22dc650dSSadaf Ebrahimi {
10844*22dc650dSSadaf Ebrahimi re->first_codeunit = firstcu;
10845*22dc650dSSadaf Ebrahimi re->flags |= PCRE2_FIRSTSET;
10846*22dc650dSSadaf Ebrahimi minminlength++;
10847*22dc650dSSadaf Ebrahimi
10848*22dc650dSSadaf Ebrahimi /* Handle caseless first code units. */
10849*22dc650dSSadaf Ebrahimi
10850*22dc650dSSadaf Ebrahimi if ((firstcuflags & REQ_CASELESS) != 0)
10851*22dc650dSSadaf Ebrahimi {
10852*22dc650dSSadaf Ebrahimi if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10853*22dc650dSSadaf Ebrahimi {
10854*22dc650dSSadaf Ebrahimi if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10855*22dc650dSSadaf Ebrahimi }
10856*22dc650dSSadaf Ebrahimi
10857*22dc650dSSadaf Ebrahimi /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
10858*22dc650dSSadaf Ebrahimi In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10859*22dc650dSSadaf Ebrahimi points and cannot have another case, but if UCP is set they may do. */
10860*22dc650dSSadaf Ebrahimi
10861*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
10862*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
10863*22dc650dSSadaf Ebrahimi else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10864*22dc650dSSadaf Ebrahimi re->flags |= PCRE2_FIRSTCASELESS;
10865*22dc650dSSadaf Ebrahimi #else
10866*22dc650dSSadaf Ebrahimi else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10867*22dc650dSSadaf Ebrahimi UCD_OTHERCASE(firstcu) != firstcu)
10868*22dc650dSSadaf Ebrahimi re->flags |= PCRE2_FIRSTCASELESS;
10869*22dc650dSSadaf Ebrahimi #endif
10870*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
10871*22dc650dSSadaf Ebrahimi }
10872*22dc650dSSadaf Ebrahimi }
10873*22dc650dSSadaf Ebrahimi
10874*22dc650dSSadaf Ebrahimi /* When there is no first code unit, for non-anchored patterns, see if we can
10875*22dc650dSSadaf Ebrahimi set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10876*22dc650dSSadaf Ebrahimi branches start with ^ and also when all branches start with non-atomic .* for
10877*22dc650dSSadaf Ebrahimi non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
10878*22dc650dSSadaf Ebrahimi that disables this case.) */
10879*22dc650dSSadaf Ebrahimi
10880*22dc650dSSadaf Ebrahimi else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10881*22dc650dSSadaf Ebrahimi is_startline(codestart, 0, &cb, 0, FALSE))
10882*22dc650dSSadaf Ebrahimi re->flags |= PCRE2_STARTLINE;
10883*22dc650dSSadaf Ebrahimi
10884*22dc650dSSadaf Ebrahimi /* Handle the "required code unit", if one is set. In the UTF case we can
10885*22dc650dSSadaf Ebrahimi increment the minimum minimum length only if we are sure this really is a
10886*22dc650dSSadaf Ebrahimi different character and not a non-starting code unit of the first character,
10887*22dc650dSSadaf Ebrahimi because the minimum length count is in characters, not code units. */
10888*22dc650dSSadaf Ebrahimi
10889*22dc650dSSadaf Ebrahimi if (reqcuflags < REQ_NONE)
10890*22dc650dSSadaf Ebrahimi {
10891*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 16
10892*22dc650dSSadaf Ebrahimi if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10893*22dc650dSSadaf Ebrahimi firstcuflags >= REQ_NONE || /* First not set */
10894*22dc650dSSadaf Ebrahimi (firstcu & 0xf800) != 0xd800 || /* First not surrogate */
10895*22dc650dSSadaf Ebrahimi (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
10896*22dc650dSSadaf Ebrahimi #elif PCRE2_CODE_UNIT_WIDTH == 8
10897*22dc650dSSadaf Ebrahimi if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10898*22dc650dSSadaf Ebrahimi firstcuflags >= REQ_NONE || /* First not set */
10899*22dc650dSSadaf Ebrahimi (firstcu & 0x80) == 0 || /* First is ASCII */
10900*22dc650dSSadaf Ebrahimi (reqcu & 0x80) == 0) /* Req is ASCII */
10901*22dc650dSSadaf Ebrahimi #endif
10902*22dc650dSSadaf Ebrahimi {
10903*22dc650dSSadaf Ebrahimi minminlength++;
10904*22dc650dSSadaf Ebrahimi }
10905*22dc650dSSadaf Ebrahimi
10906*22dc650dSSadaf Ebrahimi /* In the case of an anchored pattern, set up the value only if it follows
10907*22dc650dSSadaf Ebrahimi a variable length item in the pattern. */
10908*22dc650dSSadaf Ebrahimi
10909*22dc650dSSadaf Ebrahimi if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10910*22dc650dSSadaf Ebrahimi (reqcuflags & REQ_VARY) != 0)
10911*22dc650dSSadaf Ebrahimi {
10912*22dc650dSSadaf Ebrahimi re->last_codeunit = reqcu;
10913*22dc650dSSadaf Ebrahimi re->flags |= PCRE2_LASTSET;
10914*22dc650dSSadaf Ebrahimi
10915*22dc650dSSadaf Ebrahimi /* Handle caseless required code units as for first code units (above). */
10916*22dc650dSSadaf Ebrahimi
10917*22dc650dSSadaf Ebrahimi if ((reqcuflags & REQ_CASELESS) != 0)
10918*22dc650dSSadaf Ebrahimi {
10919*22dc650dSSadaf Ebrahimi if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10920*22dc650dSSadaf Ebrahimi {
10921*22dc650dSSadaf Ebrahimi if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10922*22dc650dSSadaf Ebrahimi }
10923*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
10924*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
10925*22dc650dSSadaf Ebrahimi else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10926*22dc650dSSadaf Ebrahimi re->flags |= PCRE2_LASTCASELESS;
10927*22dc650dSSadaf Ebrahimi #else
10928*22dc650dSSadaf Ebrahimi else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10929*22dc650dSSadaf Ebrahimi UCD_OTHERCASE(reqcu) != reqcu)
10930*22dc650dSSadaf Ebrahimi re->flags |= PCRE2_LASTCASELESS;
10931*22dc650dSSadaf Ebrahimi #endif
10932*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
10933*22dc650dSSadaf Ebrahimi }
10934*22dc650dSSadaf Ebrahimi }
10935*22dc650dSSadaf Ebrahimi }
10936*22dc650dSSadaf Ebrahimi
10937*22dc650dSSadaf Ebrahimi /* Study the compiled pattern to set up information such as a bitmap of
10938*22dc650dSSadaf Ebrahimi starting code units and a minimum matching length. */
10939*22dc650dSSadaf Ebrahimi
10940*22dc650dSSadaf Ebrahimi if (PRIV(study)(re) != 0)
10941*22dc650dSSadaf Ebrahimi {
10942*22dc650dSSadaf Ebrahimi errorcode = ERR31;
10943*22dc650dSSadaf Ebrahimi goto HAD_CB_ERROR;
10944*22dc650dSSadaf Ebrahimi }
10945*22dc650dSSadaf Ebrahimi
10946*22dc650dSSadaf Ebrahimi /* If study() set a bitmap of starting code units, it implies a minimum
10947*22dc650dSSadaf Ebrahimi length of at least one. */
10948*22dc650dSSadaf Ebrahimi
10949*22dc650dSSadaf Ebrahimi if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10950*22dc650dSSadaf Ebrahimi minminlength = 1;
10951*22dc650dSSadaf Ebrahimi
10952*22dc650dSSadaf Ebrahimi /* If the minimum length set (or not set) by study() is less than the minimum
10953*22dc650dSSadaf Ebrahimi implied by required code units, override it. */
10954*22dc650dSSadaf Ebrahimi
10955*22dc650dSSadaf Ebrahimi if (re->minlength < minminlength) re->minlength = minminlength;
10956*22dc650dSSadaf Ebrahimi } /* End of start-of-match optimizations. */
10957*22dc650dSSadaf Ebrahimi
10958*22dc650dSSadaf Ebrahimi /* Control ends up here in all cases. When running under valgrind, make a
10959*22dc650dSSadaf Ebrahimi pattern's terminating zero defined again. If memory was obtained for the parsed
10960*22dc650dSSadaf Ebrahimi version of the pattern, free it before returning. Also free the list of named
10961*22dc650dSSadaf Ebrahimi groups if a larger one had to be obtained, and likewise the group information
10962*22dc650dSSadaf Ebrahimi vector. */
10963*22dc650dSSadaf Ebrahimi
10964*22dc650dSSadaf Ebrahimi EXIT:
10965*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_VALGRIND
10966*22dc650dSSadaf Ebrahimi if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10967*22dc650dSSadaf Ebrahimi #endif
10968*22dc650dSSadaf Ebrahimi if (cb.parsed_pattern != stack_parsed_pattern)
10969*22dc650dSSadaf Ebrahimi ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10970*22dc650dSSadaf Ebrahimi if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10971*22dc650dSSadaf Ebrahimi ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10972*22dc650dSSadaf Ebrahimi if (cb.groupinfo != stack_groupinfo)
10973*22dc650dSSadaf Ebrahimi ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10974*22dc650dSSadaf Ebrahimi return re; /* Will be NULL after an error */
10975*22dc650dSSadaf Ebrahimi
10976*22dc650dSSadaf Ebrahimi /* Errors discovered in parse_regex() set the offset value in the compile
10977*22dc650dSSadaf Ebrahimi block. Errors discovered before it is called must compute it from the ptr
10978*22dc650dSSadaf Ebrahimi value. After parse_regex() is called, the offset in the compile block is set to
10979*22dc650dSSadaf Ebrahimi the end of the pattern, but certain errors in compile_regex() may reset it if
10980*22dc650dSSadaf Ebrahimi an offset is available in the parsed pattern. */
10981*22dc650dSSadaf Ebrahimi
10982*22dc650dSSadaf Ebrahimi HAD_CB_ERROR:
10983*22dc650dSSadaf Ebrahimi ptr = pattern + cb.erroroffset;
10984*22dc650dSSadaf Ebrahimi
10985*22dc650dSSadaf Ebrahimi HAD_EARLY_ERROR:
10986*22dc650dSSadaf Ebrahimi *erroroffset = ptr - pattern;
10987*22dc650dSSadaf Ebrahimi
10988*22dc650dSSadaf Ebrahimi HAD_ERROR:
10989*22dc650dSSadaf Ebrahimi *errorptr = errorcode;
10990*22dc650dSSadaf Ebrahimi pcre2_code_free(re);
10991*22dc650dSSadaf Ebrahimi re = NULL;
10992*22dc650dSSadaf Ebrahimi goto EXIT;
10993*22dc650dSSadaf Ebrahimi }
10994*22dc650dSSadaf Ebrahimi
10995*22dc650dSSadaf Ebrahimi /* These #undefs are here to enable unity builds with CMake. */
10996*22dc650dSSadaf Ebrahimi
10997*22dc650dSSadaf Ebrahimi #undef NLBLOCK /* Block containing newline information */
10998*22dc650dSSadaf Ebrahimi #undef PSSTART /* Field containing processed string start */
10999*22dc650dSSadaf Ebrahimi #undef PSEND /* Field containing processed string end */
11000*22dc650dSSadaf Ebrahimi
11001*22dc650dSSadaf Ebrahimi /* End of pcre2_compile.c */
11002