xref: /aosp_15_r20/external/pcre/src/pcre2_auto_possess.c (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi *      Perl-Compatible Regular Expressions       *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi 
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi 
8*22dc650dSSadaf Ebrahimi                        Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi      Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi           New API code Copyright (c) 2016-2022 University of Cambridge
11*22dc650dSSadaf Ebrahimi 
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi 
16*22dc650dSSadaf Ebrahimi     * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi       this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi 
19*22dc650dSSadaf Ebrahimi     * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi       notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi       documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi 
23*22dc650dSSadaf Ebrahimi     * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi       contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi       this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi 
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi 
41*22dc650dSSadaf Ebrahimi /* This module contains functions that scan a compiled pattern and change
42*22dc650dSSadaf Ebrahimi repeats into possessive repeats where possible. */
43*22dc650dSSadaf Ebrahimi 
44*22dc650dSSadaf Ebrahimi 
45*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
46*22dc650dSSadaf Ebrahimi #include "config.h"
47*22dc650dSSadaf Ebrahimi #endif
48*22dc650dSSadaf Ebrahimi 
49*22dc650dSSadaf Ebrahimi 
50*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
51*22dc650dSSadaf Ebrahimi 
52*22dc650dSSadaf Ebrahimi 
53*22dc650dSSadaf Ebrahimi /*************************************************
54*22dc650dSSadaf Ebrahimi *        Tables for auto-possessification        *
55*22dc650dSSadaf Ebrahimi *************************************************/
56*22dc650dSSadaf Ebrahimi 
57*22dc650dSSadaf Ebrahimi /* This table is used to check whether auto-possessification is possible
58*22dc650dSSadaf Ebrahimi between adjacent character-type opcodes. The left-hand (repeated) opcode is
59*22dc650dSSadaf Ebrahimi used to select the row, and the right-hand opcode is used to select the column.
60*22dc650dSSadaf Ebrahimi A value of 1 means that auto-possessification is OK. For example, the second
61*22dc650dSSadaf Ebrahimi value in the first row means that \D+\d can be turned into \D++\d.
62*22dc650dSSadaf Ebrahimi 
63*22dc650dSSadaf Ebrahimi The Unicode property types (\P and \p) have to be present to fill out the table
64*22dc650dSSadaf Ebrahimi because of what their opcode values are, but the table values should always be
65*22dc650dSSadaf Ebrahimi zero because property types are handled separately in the code. The last four
66*22dc650dSSadaf Ebrahimi columns apply to items that cannot be repeated, so there is no need to have
67*22dc650dSSadaf Ebrahimi rows for them. Note that OP_DIGIT etc. are generated only when PCRE2_UCP is
68*22dc650dSSadaf Ebrahimi *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
69*22dc650dSSadaf Ebrahimi 
/* Table dimensions. Both are counted from FIRST_AUTOTAB_OP: the row count
runs up to LAST_AUTOTAB_LEFT_OP and the column count up to
LAST_AUTOTAB_RIGHT_OP, so the table has more columns than rows. */
70*22dc650dSSadaf Ebrahimi #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
71*22dc650dSSadaf Ebrahimi #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
72*22dc650dSSadaf Ebrahimi 
/* Each row is labelled on the right with its left-hand (repeated) item and the
header line labels the columns with the right-hand item. An entry of 1 permits
auto-possessification of that pair; 0 forbids it. The \P and \p rows and
columns are all zero. NOTE(review): presumably indexed by opcode value minus
FIRST_AUTOTAB_OP - confirm against the lookup in compare_opcodes(). */
73*22dc650dSSadaf Ebrahimi static const uint8_t autoposstab[APTROWS][APTCOLS] = {
74*22dc650dSSadaf Ebrahimi /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
75*22dc650dSSadaf Ebrahimi   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
76*22dc650dSSadaf Ebrahimi   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
77*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
78*22dc650dSSadaf Ebrahimi   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
79*22dc650dSSadaf Ebrahimi   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
80*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
81*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
82*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
83*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
84*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
85*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
86*22dc650dSSadaf Ebrahimi   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
87*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
88*22dc650dSSadaf Ebrahimi   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
89*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
90*22dc650dSSadaf Ebrahimi   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
91*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
92*22dc650dSSadaf Ebrahimi };
93*22dc650dSSadaf Ebrahimi 
94*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
95*22dc650dSSadaf Ebrahimi /* This table is used to check whether auto-possessification is possible
96*22dc650dSSadaf Ebrahimi between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
97*22dc650dSSadaf Ebrahimi left-hand (repeated) opcode is used to select the row, and the right-hand
98*22dc650dSSadaf Ebrahimi opcode is used to select the column. The values are as follows:
99*22dc650dSSadaf Ebrahimi 
100*22dc650dSSadaf Ebrahimi   0   Always return FALSE (never auto-possessify)
101*22dc650dSSadaf Ebrahimi   1   Character groups are distinct (possessify if both are OP_PROP)
102*22dc650dSSadaf Ebrahimi   2   Check character categories in the same group (general or particular)
103*22dc650dSSadaf Ebrahimi   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
104*22dc650dSSadaf Ebrahimi 
105*22dc650dSSadaf Ebrahimi   4   Check left general category vs right particular category
106*22dc650dSSadaf Ebrahimi   5   Check right general category vs left particular category
107*22dc650dSSadaf Ebrahimi 
108*22dc650dSSadaf Ebrahimi   6   Left alphanum vs right general category
109*22dc650dSSadaf Ebrahimi   7   Left space vs right general category
110*22dc650dSSadaf Ebrahimi   8   Left word vs right general category
111*22dc650dSSadaf Ebrahimi 
112*22dc650dSSadaf Ebrahimi   9   Right alphanum vs left general category
113*22dc650dSSadaf Ebrahimi  10   Right space vs left general category
114*22dc650dSSadaf Ebrahimi  11   Right word vs left general category
115*22dc650dSSadaf Ebrahimi 
116*22dc650dSSadaf Ebrahimi  12   Left alphanum vs right particular category
117*22dc650dSSadaf Ebrahimi  13   Left space vs right particular category
118*22dc650dSSadaf Ebrahimi  14   Left word vs right particular category
119*22dc650dSSadaf Ebrahimi 
120*22dc650dSSadaf Ebrahimi  15   Right alphanum vs left particular category
121*22dc650dSSadaf Ebrahimi  16   Right space vs left particular category
122*22dc650dSSadaf Ebrahimi  17   Right word vs left particular category
123*22dc650dSSadaf Ebrahimi */
124*22dc650dSSadaf Ebrahimi 
/* Square table: both dimensions are PT_TABSIZE and the rows follow the same
property-type order as the column header line. Each entry is one of the action
codes 0-17 listed in the comment above. The PT_ANY, PT_CLIST, PT_BIDICL and
PT_BOOL rows are entirely zero, i.e. never auto-possessify. */
125*22dc650dSSadaf Ebrahimi static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
126*22dc650dSSadaf Ebrahimi /* ANY LAMP GC  PC  SC  SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */
127*22dc650dSSadaf Ebrahimi   { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_ANY */
128*22dc650dSSadaf Ebrahimi   { 0,  3,  0,  0,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_LAMP */
129*22dc650dSSadaf Ebrahimi   { 0,  0,  2,  4,  0,   0,    9,   10,     10,  11,    0,   0,    0,    0 },  /* PT_GC */
130*22dc650dSSadaf Ebrahimi   { 0,  0,  5,  2,  0,   0,   15,   16,     16,  17,    0,   0,    0,    0 },  /* PT_PC */
131*22dc650dSSadaf Ebrahimi   { 0,  0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SC */
132*22dc650dSSadaf Ebrahimi   { 0,  0,  0,  0,  2,   2,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_SCX */
133*22dc650dSSadaf Ebrahimi   { 0,  3,  6, 12,  0,   0,    3,    1,      1,   0,    0,   0,    0,    0 },  /* PT_ALNUM */
134*22dc650dSSadaf Ebrahimi   { 0,  1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_SPACE */
135*22dc650dSSadaf Ebrahimi   { 0,  1,  7, 13,  0,   0,    1,    3,      3,   1,    0,   0,    0,    0 },  /* PT_PXSPACE */
136*22dc650dSSadaf Ebrahimi   { 0,  0,  8, 14,  0,   0,    0,    1,      1,   3,    0,   0,    0,    0 },  /* PT_WORD */
137*22dc650dSSadaf Ebrahimi   { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_CLIST */
138*22dc650dSSadaf Ebrahimi   { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   3,    0,    0 },  /* PT_UCNC */
139*22dc650dSSadaf Ebrahimi   { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 },  /* PT_BIDICL */
140*22dc650dSSadaf Ebrahimi   { 0,  0,  0,  0,  0,   0,    0,    0,      0,   0,    0,   0,    0,    0 }   /* PT_BOOL */
141*22dc650dSSadaf Ebrahimi };
142*22dc650dSSadaf Ebrahimi 
143*22dc650dSSadaf Ebrahimi /* This table is used to check whether auto-possessification is possible
144*22dc650dSSadaf Ebrahimi between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
145*22dc650dSSadaf Ebrahimi specifies a general category and the other specifies a particular category. The
146*22dc650dSSadaf Ebrahimi row is selected by the general category and the column by the particular
147*22dc650dSSadaf Ebrahimi category. The value is 1 if the particular category is not part of the general
148*22dc650dSSadaf Ebrahimi category. */
149*22dc650dSSadaf Ebrahimi 
/* 7 rows, one per general category (C, L, M, N, P, S, Z as labelled on the
right), by 30 columns, one per particular category in the order of the header
line. In each row the zero entries are exactly the particular categories whose
two-letter name starts with that row's letter. NOTE(review): the row and column
order presumably matches the numeric values of the ucp_ general and particular
category constants - confirm before reordering anything. */
150*22dc650dSSadaf Ebrahimi static const uint8_t catposstab[7][30] = {
151*22dc650dSSadaf Ebrahimi /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
152*22dc650dSSadaf Ebrahimi   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
153*22dc650dSSadaf Ebrahimi   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
154*22dc650dSSadaf Ebrahimi   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
155*22dc650dSSadaf Ebrahimi   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
156*22dc650dSSadaf Ebrahimi   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
157*22dc650dSSadaf Ebrahimi   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
158*22dc650dSSadaf Ebrahimi   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
159*22dc650dSSadaf Ebrahimi };
160*22dc650dSSadaf Ebrahimi 
161*22dc650dSSadaf Ebrahimi /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
162*22dc650dSSadaf Ebrahimi a general or particular category. The properties in each row are those
163*22dc650dSSadaf Ebrahimi that apply to the character set in question. Duplication means that a little
164*22dc650dSSadaf Ebrahimi unnecessary work is done when checking, but this keeps things much simpler
165*22dc650dSSadaf Ebrahimi because they can all use the same code. For more details see the comment where
166*22dc650dSSadaf Ebrahimi this table is used.
167*22dc650dSSadaf Ebrahimi 
168*22dc650dSSadaf Ebrahimi Note: SPACE and PXSPACE used to be different because Perl excluded VT from
169*22dc650dSSadaf Ebrahimi "space", but from Perl 5.18 it's included, so both categories are treated the
170*22dc650dSSadaf Ebrahimi same here. */
171*22dc650dSSadaf Ebrahimi 
/* Three rows of four ucp_ values each, one row per character set as labelled
on the right (SPACE and PXSPACE share a row). In every row the first three
entries are single-letter (general) category constants and the fourth is a
two-letter (particular) category constant. NOTE(review): the row index is
presumably derived from the PT_ALNUM/PT_SPACE/PT_WORD action codes at the point
of use - confirm there. */
172*22dc650dSSadaf Ebrahimi static const uint8_t posspropstab[3][4] = {
173*22dc650dSSadaf Ebrahimi   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
174*22dc650dSSadaf Ebrahimi   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
175*22dc650dSSadaf Ebrahimi   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
176*22dc650dSSadaf Ebrahimi };
177*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
178*22dc650dSSadaf Ebrahimi 
179*22dc650dSSadaf Ebrahimi 
180*22dc650dSSadaf Ebrahimi 
181*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
182*22dc650dSSadaf Ebrahimi /*************************************************
183*22dc650dSSadaf Ebrahimi *        Check a character and a property        *
184*22dc650dSSadaf Ebrahimi *************************************************/
185*22dc650dSSadaf Ebrahimi 
186*22dc650dSSadaf Ebrahimi /* This function is called by compare_opcodes() when a property item is
187*22dc650dSSadaf Ebrahimi adjacent to a fixed character.
188*22dc650dSSadaf Ebrahimi 
189*22dc650dSSadaf Ebrahimi Arguments:
190*22dc650dSSadaf Ebrahimi   c            the character
191*22dc650dSSadaf Ebrahimi   ptype        the property type
192*22dc650dSSadaf Ebrahimi   pdata        the data for the type
193*22dc650dSSadaf Ebrahimi   negated      TRUE if it's a negated property (\P or \p{^)
194*22dc650dSSadaf Ebrahimi 
195*22dc650dSSadaf Ebrahimi Returns:       TRUE if auto-possessifying is OK
196*22dc650dSSadaf Ebrahimi */
197*22dc650dSSadaf Ebrahimi 
198*22dc650dSSadaf Ebrahimi static BOOL
check_char_prop(uint32_t c,unsigned int ptype,unsigned int pdata,BOOL negated)199*22dc650dSSadaf Ebrahimi check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
200*22dc650dSSadaf Ebrahimi   BOOL negated)
201*22dc650dSSadaf Ebrahimi {
202*22dc650dSSadaf Ebrahimi BOOL ok;
203*22dc650dSSadaf Ebrahimi const uint32_t *p;
204*22dc650dSSadaf Ebrahimi const ucd_record *prop = GET_UCD(c);
205*22dc650dSSadaf Ebrahimi 
206*22dc650dSSadaf Ebrahimi switch(ptype)
207*22dc650dSSadaf Ebrahimi   {
208*22dc650dSSadaf Ebrahimi   case PT_LAMP:
209*22dc650dSSadaf Ebrahimi   return (prop->chartype == ucp_Lu ||
210*22dc650dSSadaf Ebrahimi           prop->chartype == ucp_Ll ||
211*22dc650dSSadaf Ebrahimi           prop->chartype == ucp_Lt) == negated;
212*22dc650dSSadaf Ebrahimi 
213*22dc650dSSadaf Ebrahimi   case PT_GC:
214*22dc650dSSadaf Ebrahimi   return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
215*22dc650dSSadaf Ebrahimi 
216*22dc650dSSadaf Ebrahimi   case PT_PC:
217*22dc650dSSadaf Ebrahimi   return (pdata == prop->chartype) == negated;
218*22dc650dSSadaf Ebrahimi 
219*22dc650dSSadaf Ebrahimi   case PT_SC:
220*22dc650dSSadaf Ebrahimi   return (pdata == prop->script) == negated;
221*22dc650dSSadaf Ebrahimi 
222*22dc650dSSadaf Ebrahimi   case PT_SCX:
223*22dc650dSSadaf Ebrahimi   ok = (pdata == prop->script
224*22dc650dSSadaf Ebrahimi         || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
225*22dc650dSSadaf Ebrahimi   return ok == negated;
226*22dc650dSSadaf Ebrahimi 
227*22dc650dSSadaf Ebrahimi   /* These are specials */
228*22dc650dSSadaf Ebrahimi 
229*22dc650dSSadaf Ebrahimi   case PT_ALNUM:
230*22dc650dSSadaf Ebrahimi   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
231*22dc650dSSadaf Ebrahimi           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
232*22dc650dSSadaf Ebrahimi 
233*22dc650dSSadaf Ebrahimi   /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
234*22dc650dSSadaf Ebrahimi   means that Perl space and POSIX space are now identical. PCRE was changed
235*22dc650dSSadaf Ebrahimi   at release 8.34. */
236*22dc650dSSadaf Ebrahimi 
237*22dc650dSSadaf Ebrahimi   case PT_SPACE:    /* Perl space */
238*22dc650dSSadaf Ebrahimi   case PT_PXSPACE:  /* POSIX space */
239*22dc650dSSadaf Ebrahimi   switch(c)
240*22dc650dSSadaf Ebrahimi     {
241*22dc650dSSadaf Ebrahimi     HSPACE_CASES:
242*22dc650dSSadaf Ebrahimi     VSPACE_CASES:
243*22dc650dSSadaf Ebrahimi     return negated;
244*22dc650dSSadaf Ebrahimi 
245*22dc650dSSadaf Ebrahimi     default:
246*22dc650dSSadaf Ebrahimi     return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
247*22dc650dSSadaf Ebrahimi     }
248*22dc650dSSadaf Ebrahimi   break;  /* Control never reaches here */
249*22dc650dSSadaf Ebrahimi 
250*22dc650dSSadaf Ebrahimi   case PT_WORD:
251*22dc650dSSadaf Ebrahimi   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
252*22dc650dSSadaf Ebrahimi           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
253*22dc650dSSadaf Ebrahimi           c == CHAR_UNDERSCORE) == negated;
254*22dc650dSSadaf Ebrahimi 
255*22dc650dSSadaf Ebrahimi   case PT_CLIST:
256*22dc650dSSadaf Ebrahimi   p = PRIV(ucd_caseless_sets) + prop->caseset;
257*22dc650dSSadaf Ebrahimi   for (;;)
258*22dc650dSSadaf Ebrahimi     {
259*22dc650dSSadaf Ebrahimi     if (c < *p) return !negated;
260*22dc650dSSadaf Ebrahimi     if (c == *p++) return negated;
261*22dc650dSSadaf Ebrahimi     }
262*22dc650dSSadaf Ebrahimi   break;  /* Control never reaches here */
263*22dc650dSSadaf Ebrahimi 
264*22dc650dSSadaf Ebrahimi   /* Haven't yet thought these through. */
265*22dc650dSSadaf Ebrahimi 
266*22dc650dSSadaf Ebrahimi   case PT_BIDICL:
267*22dc650dSSadaf Ebrahimi   return FALSE;
268*22dc650dSSadaf Ebrahimi 
269*22dc650dSSadaf Ebrahimi   case PT_BOOL:
270*22dc650dSSadaf Ebrahimi   return FALSE;
271*22dc650dSSadaf Ebrahimi   }
272*22dc650dSSadaf Ebrahimi 
273*22dc650dSSadaf Ebrahimi return FALSE;
274*22dc650dSSadaf Ebrahimi }
275*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
276*22dc650dSSadaf Ebrahimi 
277*22dc650dSSadaf Ebrahimi 
278*22dc650dSSadaf Ebrahimi 
279*22dc650dSSadaf Ebrahimi /*************************************************
280*22dc650dSSadaf Ebrahimi *        Base opcode of repeated opcodes         *
281*22dc650dSSadaf Ebrahimi *************************************************/
282*22dc650dSSadaf Ebrahimi 
283*22dc650dSSadaf Ebrahimi /* Returns the base opcode for repeated single character type opcodes. If the
284*22dc650dSSadaf Ebrahimi opcode is not a repeated character type, it returns with the original value.
285*22dc650dSSadaf Ebrahimi 
286*22dc650dSSadaf Ebrahimi Arguments:  c opcode
287*22dc650dSSadaf Ebrahimi Returns:    base opcode for the type
288*22dc650dSSadaf Ebrahimi */
289*22dc650dSSadaf Ebrahimi 
290*22dc650dSSadaf Ebrahimi static PCRE2_UCHAR
get_repeat_base(PCRE2_UCHAR c)291*22dc650dSSadaf Ebrahimi get_repeat_base(PCRE2_UCHAR c)
292*22dc650dSSadaf Ebrahimi {
293*22dc650dSSadaf Ebrahimi return (c > OP_TYPEPOSUPTO)? c :
294*22dc650dSSadaf Ebrahimi        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
295*22dc650dSSadaf Ebrahimi        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
296*22dc650dSSadaf Ebrahimi        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
297*22dc650dSSadaf Ebrahimi        (c >= OP_STARI)?      OP_STARI :
298*22dc650dSSadaf Ebrahimi                              OP_STAR;
299*22dc650dSSadaf Ebrahimi }
300*22dc650dSSadaf Ebrahimi 
301*22dc650dSSadaf Ebrahimi 
302*22dc650dSSadaf Ebrahimi /*************************************************
303*22dc650dSSadaf Ebrahimi *        Fill the character property list        *
304*22dc650dSSadaf Ebrahimi *************************************************/
305*22dc650dSSadaf Ebrahimi 
306*22dc650dSSadaf Ebrahimi /* Checks whether the code points to an opcode that can take part in auto-
307*22dc650dSSadaf Ebrahimi possessification, and if so, fills a list with its properties.
308*22dc650dSSadaf Ebrahimi 
309*22dc650dSSadaf Ebrahimi Arguments:
310*22dc650dSSadaf Ebrahimi   code        points to start of expression
311*22dc650dSSadaf Ebrahimi   utf         TRUE if in UTF mode
312*22dc650dSSadaf Ebrahimi   ucp         TRUE if in UCP mode
313*22dc650dSSadaf Ebrahimi   fcc         points to the case-flipping table
314*22dc650dSSadaf Ebrahimi   list        points to output list
315*22dc650dSSadaf Ebrahimi               list[0] will be filled with the opcode
316*22dc650dSSadaf Ebrahimi               list[1] will be non-zero if this opcode
317*22dc650dSSadaf Ebrahimi                 can match an empty character string
318*22dc650dSSadaf Ebrahimi               list[2..7] depends on the opcode
319*22dc650dSSadaf Ebrahimi 
320*22dc650dSSadaf Ebrahimi Returns:      points to the start of the next opcode if *code is accepted
321*22dc650dSSadaf Ebrahimi               NULL if *code is not accepted
322*22dc650dSSadaf Ebrahimi */
323*22dc650dSSadaf Ebrahimi 
324*22dc650dSSadaf Ebrahimi static PCRE2_SPTR
get_chr_property_list(PCRE2_SPTR code,BOOL utf,BOOL ucp,const uint8_t * fcc,uint32_t * list)325*22dc650dSSadaf Ebrahimi get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
326*22dc650dSSadaf Ebrahimi   uint32_t *list)
327*22dc650dSSadaf Ebrahimi {
328*22dc650dSSadaf Ebrahimi PCRE2_UCHAR c = *code;
329*22dc650dSSadaf Ebrahimi PCRE2_UCHAR base;
330*22dc650dSSadaf Ebrahimi PCRE2_SPTR end;
331*22dc650dSSadaf Ebrahimi uint32_t chr;
332*22dc650dSSadaf Ebrahimi 
333*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
334*22dc650dSSadaf Ebrahimi uint32_t *clist_dest;
335*22dc650dSSadaf Ebrahimi const uint32_t *clist_src;
336*22dc650dSSadaf Ebrahimi #else
337*22dc650dSSadaf Ebrahimi (void)utf;    /* Suppress "unused parameter" compiler warnings */
338*22dc650dSSadaf Ebrahimi (void)ucp;
339*22dc650dSSadaf Ebrahimi #endif
340*22dc650dSSadaf Ebrahimi 
341*22dc650dSSadaf Ebrahimi list[0] = c;
342*22dc650dSSadaf Ebrahimi list[1] = FALSE;
343*22dc650dSSadaf Ebrahimi code++;
344*22dc650dSSadaf Ebrahimi 
345*22dc650dSSadaf Ebrahimi if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
346*22dc650dSSadaf Ebrahimi   {
347*22dc650dSSadaf Ebrahimi   base = get_repeat_base(c);
348*22dc650dSSadaf Ebrahimi   c -= (base - OP_STAR);
349*22dc650dSSadaf Ebrahimi 
350*22dc650dSSadaf Ebrahimi   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
351*22dc650dSSadaf Ebrahimi     code += IMM2_SIZE;
352*22dc650dSSadaf Ebrahimi 
353*22dc650dSSadaf Ebrahimi   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&
354*22dc650dSSadaf Ebrahimi              c != OP_POSPLUS);
355*22dc650dSSadaf Ebrahimi 
356*22dc650dSSadaf Ebrahimi   switch(base)
357*22dc650dSSadaf Ebrahimi     {
358*22dc650dSSadaf Ebrahimi     case OP_STAR:
359*22dc650dSSadaf Ebrahimi     list[0] = OP_CHAR;
360*22dc650dSSadaf Ebrahimi     break;
361*22dc650dSSadaf Ebrahimi 
362*22dc650dSSadaf Ebrahimi     case OP_STARI:
363*22dc650dSSadaf Ebrahimi     list[0] = OP_CHARI;
364*22dc650dSSadaf Ebrahimi     break;
365*22dc650dSSadaf Ebrahimi 
366*22dc650dSSadaf Ebrahimi     case OP_NOTSTAR:
367*22dc650dSSadaf Ebrahimi     list[0] = OP_NOT;
368*22dc650dSSadaf Ebrahimi     break;
369*22dc650dSSadaf Ebrahimi 
370*22dc650dSSadaf Ebrahimi     case OP_NOTSTARI:
371*22dc650dSSadaf Ebrahimi     list[0] = OP_NOTI;
372*22dc650dSSadaf Ebrahimi     break;
373*22dc650dSSadaf Ebrahimi 
374*22dc650dSSadaf Ebrahimi     case OP_TYPESTAR:
375*22dc650dSSadaf Ebrahimi     list[0] = *code;
376*22dc650dSSadaf Ebrahimi     code++;
377*22dc650dSSadaf Ebrahimi     break;
378*22dc650dSSadaf Ebrahimi     }
379*22dc650dSSadaf Ebrahimi   c = list[0];
380*22dc650dSSadaf Ebrahimi   }
381*22dc650dSSadaf Ebrahimi 
382*22dc650dSSadaf Ebrahimi switch(c)
383*22dc650dSSadaf Ebrahimi   {
384*22dc650dSSadaf Ebrahimi   case OP_NOT_DIGIT:
385*22dc650dSSadaf Ebrahimi   case OP_DIGIT:
386*22dc650dSSadaf Ebrahimi   case OP_NOT_WHITESPACE:
387*22dc650dSSadaf Ebrahimi   case OP_WHITESPACE:
388*22dc650dSSadaf Ebrahimi   case OP_NOT_WORDCHAR:
389*22dc650dSSadaf Ebrahimi   case OP_WORDCHAR:
390*22dc650dSSadaf Ebrahimi   case OP_ANY:
391*22dc650dSSadaf Ebrahimi   case OP_ALLANY:
392*22dc650dSSadaf Ebrahimi   case OP_ANYNL:
393*22dc650dSSadaf Ebrahimi   case OP_NOT_HSPACE:
394*22dc650dSSadaf Ebrahimi   case OP_HSPACE:
395*22dc650dSSadaf Ebrahimi   case OP_NOT_VSPACE:
396*22dc650dSSadaf Ebrahimi   case OP_VSPACE:
397*22dc650dSSadaf Ebrahimi   case OP_EXTUNI:
398*22dc650dSSadaf Ebrahimi   case OP_EODN:
399*22dc650dSSadaf Ebrahimi   case OP_EOD:
400*22dc650dSSadaf Ebrahimi   case OP_DOLL:
401*22dc650dSSadaf Ebrahimi   case OP_DOLLM:
402*22dc650dSSadaf Ebrahimi   return code;
403*22dc650dSSadaf Ebrahimi 
404*22dc650dSSadaf Ebrahimi   case OP_CHAR:
405*22dc650dSSadaf Ebrahimi   case OP_NOT:
406*22dc650dSSadaf Ebrahimi   GETCHARINCTEST(chr, code);
407*22dc650dSSadaf Ebrahimi   list[2] = chr;
408*22dc650dSSadaf Ebrahimi   list[3] = NOTACHAR;
409*22dc650dSSadaf Ebrahimi   return code;
410*22dc650dSSadaf Ebrahimi 
411*22dc650dSSadaf Ebrahimi   case OP_CHARI:
412*22dc650dSSadaf Ebrahimi   case OP_NOTI:
413*22dc650dSSadaf Ebrahimi   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
414*22dc650dSSadaf Ebrahimi   GETCHARINCTEST(chr, code);
415*22dc650dSSadaf Ebrahimi   list[2] = chr;
416*22dc650dSSadaf Ebrahimi 
417*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
418*22dc650dSSadaf Ebrahimi   if (chr < 128 || (chr < 256 && !utf && !ucp))
419*22dc650dSSadaf Ebrahimi     list[3] = fcc[chr];
420*22dc650dSSadaf Ebrahimi   else
421*22dc650dSSadaf Ebrahimi     list[3] = UCD_OTHERCASE(chr);
422*22dc650dSSadaf Ebrahimi #elif defined SUPPORT_WIDE_CHARS
423*22dc650dSSadaf Ebrahimi   list[3] = (chr < 256) ? fcc[chr] : chr;
424*22dc650dSSadaf Ebrahimi #else
425*22dc650dSSadaf Ebrahimi   list[3] = fcc[chr];
426*22dc650dSSadaf Ebrahimi #endif
427*22dc650dSSadaf Ebrahimi 
428*22dc650dSSadaf Ebrahimi   /* The othercase might be the same value. */
429*22dc650dSSadaf Ebrahimi 
430*22dc650dSSadaf Ebrahimi   if (chr == list[3])
431*22dc650dSSadaf Ebrahimi     list[3] = NOTACHAR;
432*22dc650dSSadaf Ebrahimi   else
433*22dc650dSSadaf Ebrahimi     list[4] = NOTACHAR;
434*22dc650dSSadaf Ebrahimi   return code;
435*22dc650dSSadaf Ebrahimi 
436*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
437*22dc650dSSadaf Ebrahimi   case OP_PROP:
438*22dc650dSSadaf Ebrahimi   case OP_NOTPROP:
439*22dc650dSSadaf Ebrahimi   if (code[0] != PT_CLIST)
440*22dc650dSSadaf Ebrahimi     {
441*22dc650dSSadaf Ebrahimi     list[2] = code[0];
442*22dc650dSSadaf Ebrahimi     list[3] = code[1];
443*22dc650dSSadaf Ebrahimi     return code + 2;
444*22dc650dSSadaf Ebrahimi     }
445*22dc650dSSadaf Ebrahimi 
446*22dc650dSSadaf Ebrahimi   /* Convert only if we have enough space. */
447*22dc650dSSadaf Ebrahimi 
448*22dc650dSSadaf Ebrahimi   clist_src = PRIV(ucd_caseless_sets) + code[1];
449*22dc650dSSadaf Ebrahimi   clist_dest = list + 2;
450*22dc650dSSadaf Ebrahimi   code += 2;
451*22dc650dSSadaf Ebrahimi 
452*22dc650dSSadaf Ebrahimi   do {
453*22dc650dSSadaf Ebrahimi      if (clist_dest >= list + 8)
454*22dc650dSSadaf Ebrahimi        {
455*22dc650dSSadaf Ebrahimi        /* Early return if there is not enough space. This should never
456*22dc650dSSadaf Ebrahimi        happen, since all clists are shorter than 5 character now. */
457*22dc650dSSadaf Ebrahimi        list[2] = code[0];
458*22dc650dSSadaf Ebrahimi        list[3] = code[1];
459*22dc650dSSadaf Ebrahimi        return code;
460*22dc650dSSadaf Ebrahimi        }
461*22dc650dSSadaf Ebrahimi      *clist_dest++ = *clist_src;
462*22dc650dSSadaf Ebrahimi      }
463*22dc650dSSadaf Ebrahimi   while(*clist_src++ != NOTACHAR);
464*22dc650dSSadaf Ebrahimi 
465*22dc650dSSadaf Ebrahimi   /* All characters are stored. The terminating NOTACHAR is copied from the
466*22dc650dSSadaf Ebrahimi   clist itself. */
467*22dc650dSSadaf Ebrahimi 
468*22dc650dSSadaf Ebrahimi   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
469*22dc650dSSadaf Ebrahimi   return code;
470*22dc650dSSadaf Ebrahimi #endif
471*22dc650dSSadaf Ebrahimi 
472*22dc650dSSadaf Ebrahimi   case OP_NCLASS:
473*22dc650dSSadaf Ebrahimi   case OP_CLASS:
474*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
475*22dc650dSSadaf Ebrahimi   case OP_XCLASS:
476*22dc650dSSadaf Ebrahimi   if (c == OP_XCLASS)
477*22dc650dSSadaf Ebrahimi     end = code + GET(code, 0) - 1;
478*22dc650dSSadaf Ebrahimi   else
479*22dc650dSSadaf Ebrahimi #endif
480*22dc650dSSadaf Ebrahimi     end = code + 32 / sizeof(PCRE2_UCHAR);
481*22dc650dSSadaf Ebrahimi 
482*22dc650dSSadaf Ebrahimi   switch(*end)
483*22dc650dSSadaf Ebrahimi     {
484*22dc650dSSadaf Ebrahimi     case OP_CRSTAR:
485*22dc650dSSadaf Ebrahimi     case OP_CRMINSTAR:
486*22dc650dSSadaf Ebrahimi     case OP_CRQUERY:
487*22dc650dSSadaf Ebrahimi     case OP_CRMINQUERY:
488*22dc650dSSadaf Ebrahimi     case OP_CRPOSSTAR:
489*22dc650dSSadaf Ebrahimi     case OP_CRPOSQUERY:
490*22dc650dSSadaf Ebrahimi     list[1] = TRUE;
491*22dc650dSSadaf Ebrahimi     end++;
492*22dc650dSSadaf Ebrahimi     break;
493*22dc650dSSadaf Ebrahimi 
494*22dc650dSSadaf Ebrahimi     case OP_CRPLUS:
495*22dc650dSSadaf Ebrahimi     case OP_CRMINPLUS:
496*22dc650dSSadaf Ebrahimi     case OP_CRPOSPLUS:
497*22dc650dSSadaf Ebrahimi     end++;
498*22dc650dSSadaf Ebrahimi     break;
499*22dc650dSSadaf Ebrahimi 
500*22dc650dSSadaf Ebrahimi     case OP_CRRANGE:
501*22dc650dSSadaf Ebrahimi     case OP_CRMINRANGE:
502*22dc650dSSadaf Ebrahimi     case OP_CRPOSRANGE:
503*22dc650dSSadaf Ebrahimi     list[1] = (GET2(end, 1) == 0);
504*22dc650dSSadaf Ebrahimi     end += 1 + 2 * IMM2_SIZE;
505*22dc650dSSadaf Ebrahimi     break;
506*22dc650dSSadaf Ebrahimi     }
507*22dc650dSSadaf Ebrahimi   list[2] = (uint32_t)(end - code);
508*22dc650dSSadaf Ebrahimi   return end;
509*22dc650dSSadaf Ebrahimi   }
510*22dc650dSSadaf Ebrahimi 
511*22dc650dSSadaf Ebrahimi return NULL;    /* Opcode not accepted */
512*22dc650dSSadaf Ebrahimi }
513*22dc650dSSadaf Ebrahimi 
514*22dc650dSSadaf Ebrahimi 
515*22dc650dSSadaf Ebrahimi 
516*22dc650dSSadaf Ebrahimi /*************************************************
517*22dc650dSSadaf Ebrahimi *    Scan further character sets for match       *
518*22dc650dSSadaf Ebrahimi *************************************************/
519*22dc650dSSadaf Ebrahimi 
520*22dc650dSSadaf Ebrahimi /* Checks whether the base and the current opcode have a common character, in
521*22dc650dSSadaf Ebrahimi which case the base cannot be possessified.
522*22dc650dSSadaf Ebrahimi 
523*22dc650dSSadaf Ebrahimi Arguments:
524*22dc650dSSadaf Ebrahimi   code        points to the byte code
525*22dc650dSSadaf Ebrahimi   utf         TRUE in UTF mode
526*22dc650dSSadaf Ebrahimi   ucp         TRUE in UCP mode
527*22dc650dSSadaf Ebrahimi   cb          compile data block
528*22dc650dSSadaf Ebrahimi   base_list   the data list of the base opcode
529*22dc650dSSadaf Ebrahimi   base_end    the end of the base opcode
530*22dc650dSSadaf Ebrahimi   rec_limit   points to recursion depth counter
531*22dc650dSSadaf Ebrahimi 
532*22dc650dSSadaf Ebrahimi Returns:      TRUE if the auto-possessification is possible
533*22dc650dSSadaf Ebrahimi */
534*22dc650dSSadaf Ebrahimi 
535*22dc650dSSadaf Ebrahimi static BOOL
compare_opcodes(PCRE2_SPTR code,BOOL utf,BOOL ucp,const compile_block * cb,const uint32_t * base_list,PCRE2_SPTR base_end,int * rec_limit)536*22dc650dSSadaf Ebrahimi compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
537*22dc650dSSadaf Ebrahimi   const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
538*22dc650dSSadaf Ebrahimi {
539*22dc650dSSadaf Ebrahimi PCRE2_UCHAR c;
540*22dc650dSSadaf Ebrahimi uint32_t list[8];
541*22dc650dSSadaf Ebrahimi const uint32_t *chr_ptr;
542*22dc650dSSadaf Ebrahimi const uint32_t *ochr_ptr;
543*22dc650dSSadaf Ebrahimi const uint32_t *list_ptr;
544*22dc650dSSadaf Ebrahimi PCRE2_SPTR next_code;
545*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
546*22dc650dSSadaf Ebrahimi PCRE2_SPTR xclass_flags;
547*22dc650dSSadaf Ebrahimi #endif
548*22dc650dSSadaf Ebrahimi const uint8_t *class_bitset;
549*22dc650dSSadaf Ebrahimi const uint8_t *set1, *set2, *set_end;
550*22dc650dSSadaf Ebrahimi uint32_t chr;
551*22dc650dSSadaf Ebrahimi BOOL accepted, invert_bits;
552*22dc650dSSadaf Ebrahimi BOOL entered_a_group = FALSE;
553*22dc650dSSadaf Ebrahimi 
554*22dc650dSSadaf Ebrahimi if (--(*rec_limit) <= 0) return FALSE;  /* Recursion has gone too deep */
555*22dc650dSSadaf Ebrahimi 
556*22dc650dSSadaf Ebrahimi /* Note: the base_list[1] contains whether the current opcode has a greedy
557*22dc650dSSadaf Ebrahimi (represented by a non-zero value) quantifier. This is a different from
558*22dc650dSSadaf Ebrahimi other character type lists, which store here that the character iterator
559*22dc650dSSadaf Ebrahimi matches to an empty string (also represented by a non-zero value). */
560*22dc650dSSadaf Ebrahimi 
561*22dc650dSSadaf Ebrahimi for(;;)
562*22dc650dSSadaf Ebrahimi   {
563*22dc650dSSadaf Ebrahimi   PCRE2_SPTR bracode;
564*22dc650dSSadaf Ebrahimi 
565*22dc650dSSadaf Ebrahimi   /* All operations move the code pointer forward.
566*22dc650dSSadaf Ebrahimi   Therefore infinite recursions are not possible. */
567*22dc650dSSadaf Ebrahimi 
568*22dc650dSSadaf Ebrahimi   c = *code;
569*22dc650dSSadaf Ebrahimi 
570*22dc650dSSadaf Ebrahimi   /* Skip over callouts */
571*22dc650dSSadaf Ebrahimi 
572*22dc650dSSadaf Ebrahimi   if (c == OP_CALLOUT)
573*22dc650dSSadaf Ebrahimi     {
574*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[c];
575*22dc650dSSadaf Ebrahimi     continue;
576*22dc650dSSadaf Ebrahimi     }
577*22dc650dSSadaf Ebrahimi 
578*22dc650dSSadaf Ebrahimi   if (c == OP_CALLOUT_STR)
579*22dc650dSSadaf Ebrahimi     {
580*22dc650dSSadaf Ebrahimi     code += GET(code, 1 + 2*LINK_SIZE);
581*22dc650dSSadaf Ebrahimi     continue;
582*22dc650dSSadaf Ebrahimi     }
583*22dc650dSSadaf Ebrahimi 
584*22dc650dSSadaf Ebrahimi   /* At the end of a branch, skip to the end of the group. */
585*22dc650dSSadaf Ebrahimi 
586*22dc650dSSadaf Ebrahimi   if (c == OP_ALT)
587*22dc650dSSadaf Ebrahimi     {
588*22dc650dSSadaf Ebrahimi     do code += GET(code, 1); while (*code == OP_ALT);
589*22dc650dSSadaf Ebrahimi     c = *code;
590*22dc650dSSadaf Ebrahimi     }
591*22dc650dSSadaf Ebrahimi 
592*22dc650dSSadaf Ebrahimi   /* Inspect the next opcode. */
593*22dc650dSSadaf Ebrahimi 
594*22dc650dSSadaf Ebrahimi   switch(c)
595*22dc650dSSadaf Ebrahimi     {
596*22dc650dSSadaf Ebrahimi     /* We can always possessify a greedy iterator at the end of the pattern,
597*22dc650dSSadaf Ebrahimi     which is reached after skipping over the final OP_KET. A non-greedy
598*22dc650dSSadaf Ebrahimi     iterator must never be possessified. */
599*22dc650dSSadaf Ebrahimi 
600*22dc650dSSadaf Ebrahimi     case OP_END:
601*22dc650dSSadaf Ebrahimi     return base_list[1] != 0;
602*22dc650dSSadaf Ebrahimi 
603*22dc650dSSadaf Ebrahimi     /* When an iterator is at the end of certain kinds of group we can inspect
604*22dc650dSSadaf Ebrahimi     what follows the group by skipping over the closing ket. Note that this
605*22dc650dSSadaf Ebrahimi     does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
606*22dc650dSSadaf Ebrahimi     iteration is variable (could be another iteration or could be the next
607*22dc650dSSadaf Ebrahimi     item). As these two opcodes are not listed in the next switch, they will
608*22dc650dSSadaf Ebrahimi     end up as the next code to inspect, and return FALSE by virtue of being
609*22dc650dSSadaf Ebrahimi     unsupported. */
610*22dc650dSSadaf Ebrahimi 
611*22dc650dSSadaf Ebrahimi     case OP_KET:
612*22dc650dSSadaf Ebrahimi     case OP_KETRPOS:
613*22dc650dSSadaf Ebrahimi     /* The non-greedy case cannot be converted to a possessive form. */
614*22dc650dSSadaf Ebrahimi 
615*22dc650dSSadaf Ebrahimi     if (base_list[1] == 0) return FALSE;
616*22dc650dSSadaf Ebrahimi 
617*22dc650dSSadaf Ebrahimi     /* If the bracket is capturing it might be referenced by an OP_RECURSE
618*22dc650dSSadaf Ebrahimi     so its last iterator can never be possessified if the pattern contains
619*22dc650dSSadaf Ebrahimi     recursions. (This could be improved by keeping a list of group numbers that
620*22dc650dSSadaf Ebrahimi     are called by recursion.) */
621*22dc650dSSadaf Ebrahimi 
622*22dc650dSSadaf Ebrahimi     bracode = code - GET(code, 1);
623*22dc650dSSadaf Ebrahimi     switch(*bracode)
624*22dc650dSSadaf Ebrahimi       {
625*22dc650dSSadaf Ebrahimi       case OP_CBRA:
626*22dc650dSSadaf Ebrahimi       case OP_SCBRA:
627*22dc650dSSadaf Ebrahimi       case OP_CBRAPOS:
628*22dc650dSSadaf Ebrahimi       case OP_SCBRAPOS:
629*22dc650dSSadaf Ebrahimi       if (cb->had_recurse) return FALSE;
630*22dc650dSSadaf Ebrahimi       break;
631*22dc650dSSadaf Ebrahimi 
632*22dc650dSSadaf Ebrahimi       /* A script run might have to backtrack if the iterated item can match
633*22dc650dSSadaf Ebrahimi       characters from more than one script. So give up unless repeating an
634*22dc650dSSadaf Ebrahimi       explicit character. */
635*22dc650dSSadaf Ebrahimi 
636*22dc650dSSadaf Ebrahimi       case OP_SCRIPT_RUN:
637*22dc650dSSadaf Ebrahimi       if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)
638*22dc650dSSadaf Ebrahimi         return FALSE;
639*22dc650dSSadaf Ebrahimi       break;
640*22dc650dSSadaf Ebrahimi 
641*22dc650dSSadaf Ebrahimi       /* Atomic sub-patterns and assertions can always auto-possessify their
642*22dc650dSSadaf Ebrahimi       last iterator except for variable length lookbehinds. However, if the
643*22dc650dSSadaf Ebrahimi       group was entered as a result of checking a previous iterator, this is
644*22dc650dSSadaf Ebrahimi       not possible. */
645*22dc650dSSadaf Ebrahimi 
646*22dc650dSSadaf Ebrahimi       case OP_ASSERT:
647*22dc650dSSadaf Ebrahimi       case OP_ASSERT_NOT:
648*22dc650dSSadaf Ebrahimi       case OP_ONCE:
649*22dc650dSSadaf Ebrahimi       return !entered_a_group;
650*22dc650dSSadaf Ebrahimi 
651*22dc650dSSadaf Ebrahimi       case OP_ASSERTBACK:
652*22dc650dSSadaf Ebrahimi       case OP_ASSERTBACK_NOT:
653*22dc650dSSadaf Ebrahimi       return (bracode[1+LINK_SIZE] == OP_VREVERSE)? FALSE : !entered_a_group;
654*22dc650dSSadaf Ebrahimi 
655*22dc650dSSadaf Ebrahimi       /* Non-atomic assertions - don't possessify last iterator. This needs
656*22dc650dSSadaf Ebrahimi       more thought. */
657*22dc650dSSadaf Ebrahimi 
658*22dc650dSSadaf Ebrahimi       case OP_ASSERT_NA:
659*22dc650dSSadaf Ebrahimi       case OP_ASSERTBACK_NA:
660*22dc650dSSadaf Ebrahimi       return FALSE;
661*22dc650dSSadaf Ebrahimi       }
662*22dc650dSSadaf Ebrahimi 
663*22dc650dSSadaf Ebrahimi     /* Skip over the bracket and inspect what comes next. */
664*22dc650dSSadaf Ebrahimi 
665*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[c];
666*22dc650dSSadaf Ebrahimi     continue;
667*22dc650dSSadaf Ebrahimi 
668*22dc650dSSadaf Ebrahimi     /* Handle cases where the next item is a group. */
669*22dc650dSSadaf Ebrahimi 
670*22dc650dSSadaf Ebrahimi     case OP_ONCE:
671*22dc650dSSadaf Ebrahimi     case OP_BRA:
672*22dc650dSSadaf Ebrahimi     case OP_CBRA:
673*22dc650dSSadaf Ebrahimi     next_code = code + GET(code, 1);
674*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[c];
675*22dc650dSSadaf Ebrahimi 
676*22dc650dSSadaf Ebrahimi     /* Check each branch. We have to recurse a level for all but the last
677*22dc650dSSadaf Ebrahimi     branch. */
678*22dc650dSSadaf Ebrahimi 
679*22dc650dSSadaf Ebrahimi     while (*next_code == OP_ALT)
680*22dc650dSSadaf Ebrahimi       {
681*22dc650dSSadaf Ebrahimi       if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
682*22dc650dSSadaf Ebrahimi         return FALSE;
683*22dc650dSSadaf Ebrahimi       code = next_code + 1 + LINK_SIZE;
684*22dc650dSSadaf Ebrahimi       next_code += GET(next_code, 1);
685*22dc650dSSadaf Ebrahimi       }
686*22dc650dSSadaf Ebrahimi 
687*22dc650dSSadaf Ebrahimi     entered_a_group = TRUE;
688*22dc650dSSadaf Ebrahimi     continue;
689*22dc650dSSadaf Ebrahimi 
690*22dc650dSSadaf Ebrahimi     case OP_BRAZERO:
691*22dc650dSSadaf Ebrahimi     case OP_BRAMINZERO:
692*22dc650dSSadaf Ebrahimi 
693*22dc650dSSadaf Ebrahimi     next_code = code + 1;
694*22dc650dSSadaf Ebrahimi     if (*next_code != OP_BRA && *next_code != OP_CBRA &&
695*22dc650dSSadaf Ebrahimi         *next_code != OP_ONCE) return FALSE;
696*22dc650dSSadaf Ebrahimi 
697*22dc650dSSadaf Ebrahimi     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
698*22dc650dSSadaf Ebrahimi 
699*22dc650dSSadaf Ebrahimi     /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
700*22dc650dSSadaf Ebrahimi 
701*22dc650dSSadaf Ebrahimi     next_code += 1 + LINK_SIZE;
702*22dc650dSSadaf Ebrahimi     if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
703*22dc650dSSadaf Ebrahimi          rec_limit))
704*22dc650dSSadaf Ebrahimi       return FALSE;
705*22dc650dSSadaf Ebrahimi 
706*22dc650dSSadaf Ebrahimi     code += PRIV(OP_lengths)[c];
707*22dc650dSSadaf Ebrahimi     continue;
708*22dc650dSSadaf Ebrahimi 
709*22dc650dSSadaf Ebrahimi     /* The next opcode does not need special handling; fall through and use it
710*22dc650dSSadaf Ebrahimi     to see if the base can be possessified. */
711*22dc650dSSadaf Ebrahimi 
712*22dc650dSSadaf Ebrahimi     default:
713*22dc650dSSadaf Ebrahimi     break;
714*22dc650dSSadaf Ebrahimi     }
715*22dc650dSSadaf Ebrahimi 
716*22dc650dSSadaf Ebrahimi   /* We now have the next appropriate opcode to compare with the base. Check
717*22dc650dSSadaf Ebrahimi   for a supported opcode, and load its properties. */
718*22dc650dSSadaf Ebrahimi 
719*22dc650dSSadaf Ebrahimi   code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
720*22dc650dSSadaf Ebrahimi   if (code == NULL) return FALSE;    /* Unsupported */
721*22dc650dSSadaf Ebrahimi 
722*22dc650dSSadaf Ebrahimi   /* If either opcode is a small character list, set pointers for comparing
723*22dc650dSSadaf Ebrahimi   characters from that list with another list, or with a property. */
724*22dc650dSSadaf Ebrahimi 
725*22dc650dSSadaf Ebrahimi   if (base_list[0] == OP_CHAR)
726*22dc650dSSadaf Ebrahimi     {
727*22dc650dSSadaf Ebrahimi     chr_ptr = base_list + 2;
728*22dc650dSSadaf Ebrahimi     list_ptr = list;
729*22dc650dSSadaf Ebrahimi     }
730*22dc650dSSadaf Ebrahimi   else if (list[0] == OP_CHAR)
731*22dc650dSSadaf Ebrahimi     {
732*22dc650dSSadaf Ebrahimi     chr_ptr = list + 2;
733*22dc650dSSadaf Ebrahimi     list_ptr = base_list;
734*22dc650dSSadaf Ebrahimi     }
735*22dc650dSSadaf Ebrahimi 
736*22dc650dSSadaf Ebrahimi   /* Character bitsets can also be compared to certain opcodes. */
737*22dc650dSSadaf Ebrahimi 
738*22dc650dSSadaf Ebrahimi   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
739*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
740*22dc650dSSadaf Ebrahimi       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
741*22dc650dSSadaf Ebrahimi       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
742*22dc650dSSadaf Ebrahimi #endif
743*22dc650dSSadaf Ebrahimi       )
744*22dc650dSSadaf Ebrahimi     {
745*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
746*22dc650dSSadaf Ebrahimi     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
747*22dc650dSSadaf Ebrahimi #else
748*22dc650dSSadaf Ebrahimi     if (base_list[0] == OP_CLASS)
749*22dc650dSSadaf Ebrahimi #endif
750*22dc650dSSadaf Ebrahimi       {
751*22dc650dSSadaf Ebrahimi       set1 = (uint8_t *)(base_end - base_list[2]);
752*22dc650dSSadaf Ebrahimi       list_ptr = list;
753*22dc650dSSadaf Ebrahimi       }
754*22dc650dSSadaf Ebrahimi     else
755*22dc650dSSadaf Ebrahimi       {
756*22dc650dSSadaf Ebrahimi       set1 = (uint8_t *)(code - list[2]);
757*22dc650dSSadaf Ebrahimi       list_ptr = base_list;
758*22dc650dSSadaf Ebrahimi       }
759*22dc650dSSadaf Ebrahimi 
760*22dc650dSSadaf Ebrahimi     invert_bits = FALSE;
761*22dc650dSSadaf Ebrahimi     switch(list_ptr[0])
762*22dc650dSSadaf Ebrahimi       {
763*22dc650dSSadaf Ebrahimi       case OP_CLASS:
764*22dc650dSSadaf Ebrahimi       case OP_NCLASS:
765*22dc650dSSadaf Ebrahimi       set2 = (uint8_t *)
766*22dc650dSSadaf Ebrahimi         ((list_ptr == list ? code : base_end) - list_ptr[2]);
767*22dc650dSSadaf Ebrahimi       break;
768*22dc650dSSadaf Ebrahimi 
769*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
770*22dc650dSSadaf Ebrahimi       case OP_XCLASS:
771*22dc650dSSadaf Ebrahimi       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
772*22dc650dSSadaf Ebrahimi       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
773*22dc650dSSadaf Ebrahimi       if ((*xclass_flags & XCL_MAP) == 0)
774*22dc650dSSadaf Ebrahimi         {
775*22dc650dSSadaf Ebrahimi         /* No bits are set for characters < 256. */
776*22dc650dSSadaf Ebrahimi         if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
777*22dc650dSSadaf Ebrahimi         /* Might be an empty repeat. */
778*22dc650dSSadaf Ebrahimi         continue;
779*22dc650dSSadaf Ebrahimi         }
780*22dc650dSSadaf Ebrahimi       set2 = (uint8_t *)(xclass_flags + 1);
781*22dc650dSSadaf Ebrahimi       break;
782*22dc650dSSadaf Ebrahimi #endif
783*22dc650dSSadaf Ebrahimi 
784*22dc650dSSadaf Ebrahimi       case OP_NOT_DIGIT:
785*22dc650dSSadaf Ebrahimi       invert_bits = TRUE;
786*22dc650dSSadaf Ebrahimi       /* Fall through */
787*22dc650dSSadaf Ebrahimi       case OP_DIGIT:
788*22dc650dSSadaf Ebrahimi       set2 = (uint8_t *)(cb->cbits + cbit_digit);
789*22dc650dSSadaf Ebrahimi       break;
790*22dc650dSSadaf Ebrahimi 
791*22dc650dSSadaf Ebrahimi       case OP_NOT_WHITESPACE:
792*22dc650dSSadaf Ebrahimi       invert_bits = TRUE;
793*22dc650dSSadaf Ebrahimi       /* Fall through */
794*22dc650dSSadaf Ebrahimi       case OP_WHITESPACE:
795*22dc650dSSadaf Ebrahimi       set2 = (uint8_t *)(cb->cbits + cbit_space);
796*22dc650dSSadaf Ebrahimi       break;
797*22dc650dSSadaf Ebrahimi 
798*22dc650dSSadaf Ebrahimi       case OP_NOT_WORDCHAR:
799*22dc650dSSadaf Ebrahimi       invert_bits = TRUE;
800*22dc650dSSadaf Ebrahimi       /* Fall through */
801*22dc650dSSadaf Ebrahimi       case OP_WORDCHAR:
802*22dc650dSSadaf Ebrahimi       set2 = (uint8_t *)(cb->cbits + cbit_word);
803*22dc650dSSadaf Ebrahimi       break;
804*22dc650dSSadaf Ebrahimi 
805*22dc650dSSadaf Ebrahimi       default:
806*22dc650dSSadaf Ebrahimi       return FALSE;
807*22dc650dSSadaf Ebrahimi       }
808*22dc650dSSadaf Ebrahimi 
809*22dc650dSSadaf Ebrahimi     /* Because the bit sets are unaligned bytes, we need to perform byte
810*22dc650dSSadaf Ebrahimi     comparison here. */
811*22dc650dSSadaf Ebrahimi 
812*22dc650dSSadaf Ebrahimi     set_end = set1 + 32;
813*22dc650dSSadaf Ebrahimi     if (invert_bits)
814*22dc650dSSadaf Ebrahimi       {
815*22dc650dSSadaf Ebrahimi       do
816*22dc650dSSadaf Ebrahimi         {
817*22dc650dSSadaf Ebrahimi         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
818*22dc650dSSadaf Ebrahimi         }
819*22dc650dSSadaf Ebrahimi       while (set1 < set_end);
820*22dc650dSSadaf Ebrahimi       }
821*22dc650dSSadaf Ebrahimi     else
822*22dc650dSSadaf Ebrahimi       {
823*22dc650dSSadaf Ebrahimi       do
824*22dc650dSSadaf Ebrahimi         {
825*22dc650dSSadaf Ebrahimi         if ((*set1++ & *set2++) != 0) return FALSE;
826*22dc650dSSadaf Ebrahimi         }
827*22dc650dSSadaf Ebrahimi       while (set1 < set_end);
828*22dc650dSSadaf Ebrahimi       }
829*22dc650dSSadaf Ebrahimi 
830*22dc650dSSadaf Ebrahimi     if (list[1] == 0) return TRUE;
831*22dc650dSSadaf Ebrahimi     /* Might be an empty repeat. */
832*22dc650dSSadaf Ebrahimi     continue;
833*22dc650dSSadaf Ebrahimi     }
834*22dc650dSSadaf Ebrahimi 
835*22dc650dSSadaf Ebrahimi   /* Some property combinations also acceptable. Unicode property opcodes are
836*22dc650dSSadaf Ebrahimi   processed specially; the rest can be handled with a lookup table. */
837*22dc650dSSadaf Ebrahimi 
838*22dc650dSSadaf Ebrahimi   else
839*22dc650dSSadaf Ebrahimi     {
840*22dc650dSSadaf Ebrahimi     uint32_t leftop, rightop;
841*22dc650dSSadaf Ebrahimi 
842*22dc650dSSadaf Ebrahimi     leftop = base_list[0];
843*22dc650dSSadaf Ebrahimi     rightop = list[0];
844*22dc650dSSadaf Ebrahimi 
845*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
846*22dc650dSSadaf Ebrahimi     accepted = FALSE; /* Always set in non-unicode case. */
847*22dc650dSSadaf Ebrahimi     if (leftop == OP_PROP || leftop == OP_NOTPROP)
848*22dc650dSSadaf Ebrahimi       {
849*22dc650dSSadaf Ebrahimi       if (rightop == OP_EOD)
850*22dc650dSSadaf Ebrahimi         accepted = TRUE;
851*22dc650dSSadaf Ebrahimi       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
852*22dc650dSSadaf Ebrahimi         {
853*22dc650dSSadaf Ebrahimi         int n;
854*22dc650dSSadaf Ebrahimi         const uint8_t *p;
855*22dc650dSSadaf Ebrahimi         BOOL same = leftop == rightop;
856*22dc650dSSadaf Ebrahimi         BOOL lisprop = leftop == OP_PROP;
857*22dc650dSSadaf Ebrahimi         BOOL risprop = rightop == OP_PROP;
858*22dc650dSSadaf Ebrahimi         BOOL bothprop = lisprop && risprop;
859*22dc650dSSadaf Ebrahimi 
860*22dc650dSSadaf Ebrahimi         /* There's a table that specifies how each combination is to be
861*22dc650dSSadaf Ebrahimi         processed:
862*22dc650dSSadaf Ebrahimi           0   Always return FALSE (never auto-possessify)
863*22dc650dSSadaf Ebrahimi           1   Character groups are distinct (possessify if both are OP_PROP)
864*22dc650dSSadaf Ebrahimi           2   Check character categories in the same group (general or particular)
865*22dc650dSSadaf Ebrahimi           3   Return TRUE if the two opcodes are not the same
866*22dc650dSSadaf Ebrahimi           ... see comments below
867*22dc650dSSadaf Ebrahimi         */
868*22dc650dSSadaf Ebrahimi 
869*22dc650dSSadaf Ebrahimi         n = propposstab[base_list[2]][list[2]];
870*22dc650dSSadaf Ebrahimi         switch(n)
871*22dc650dSSadaf Ebrahimi           {
872*22dc650dSSadaf Ebrahimi           case 0: break;
873*22dc650dSSadaf Ebrahimi           case 1: accepted = bothprop; break;
874*22dc650dSSadaf Ebrahimi           case 2: accepted = (base_list[3] == list[3]) != same; break;
875*22dc650dSSadaf Ebrahimi           case 3: accepted = !same; break;
876*22dc650dSSadaf Ebrahimi 
877*22dc650dSSadaf Ebrahimi           case 4:  /* Left general category, right particular category */
878*22dc650dSSadaf Ebrahimi           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
879*22dc650dSSadaf Ebrahimi           break;
880*22dc650dSSadaf Ebrahimi 
881*22dc650dSSadaf Ebrahimi           case 5:  /* Right general category, left particular category */
882*22dc650dSSadaf Ebrahimi           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
883*22dc650dSSadaf Ebrahimi           break;
884*22dc650dSSadaf Ebrahimi 
885*22dc650dSSadaf Ebrahimi           /* This code is logically tricky. Think hard before fiddling with it.
886*22dc650dSSadaf Ebrahimi           The posspropstab table has four entries per row. Each row relates to
887*22dc650dSSadaf Ebrahimi           one of PCRE's special properties such as ALNUM or SPACE or WORD.
888*22dc650dSSadaf Ebrahimi           Only WORD actually needs all four entries, but using repeats for the
889*22dc650dSSadaf Ebrahimi           others means they can all use the same code below.
890*22dc650dSSadaf Ebrahimi 
891*22dc650dSSadaf Ebrahimi           The first two entries in each row are Unicode general categories, and
892*22dc650dSSadaf Ebrahimi           apply always, because all the characters they include are part of the
893*22dc650dSSadaf Ebrahimi           PCRE character set. The third and fourth entries are a general and a
894*22dc650dSSadaf Ebrahimi           particular category, respectively, that include one or more relevant
895*22dc650dSSadaf Ebrahimi           characters. One or the other is used, depending on whether the check
896*22dc650dSSadaf Ebrahimi           is for a general or a particular category. However, in both cases the
897*22dc650dSSadaf Ebrahimi           category contains more characters than the specials that are defined
898*22dc650dSSadaf Ebrahimi           for the property being tested against. Therefore, it cannot be used
899*22dc650dSSadaf Ebrahimi           in a NOTPROP case.
900*22dc650dSSadaf Ebrahimi 
901*22dc650dSSadaf Ebrahimi           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
902*22dc650dSSadaf Ebrahimi           Underscore is covered by ucp_P or ucp_Po. */
903*22dc650dSSadaf Ebrahimi 
904*22dc650dSSadaf Ebrahimi           case 6:  /* Left alphanum vs right general category */
905*22dc650dSSadaf Ebrahimi           case 7:  /* Left space vs right general category */
906*22dc650dSSadaf Ebrahimi           case 8:  /* Left word vs right general category */
907*22dc650dSSadaf Ebrahimi           p = posspropstab[n-6];
908*22dc650dSSadaf Ebrahimi           accepted = risprop && lisprop ==
909*22dc650dSSadaf Ebrahimi             (list[3] != p[0] &&
910*22dc650dSSadaf Ebrahimi              list[3] != p[1] &&
911*22dc650dSSadaf Ebrahimi             (list[3] != p[2] || !lisprop));
912*22dc650dSSadaf Ebrahimi           break;
913*22dc650dSSadaf Ebrahimi 
914*22dc650dSSadaf Ebrahimi           case 9:   /* Right alphanum vs left general category */
915*22dc650dSSadaf Ebrahimi           case 10:  /* Right space vs left general category */
916*22dc650dSSadaf Ebrahimi           case 11:  /* Right word vs left general category */
917*22dc650dSSadaf Ebrahimi           p = posspropstab[n-9];
918*22dc650dSSadaf Ebrahimi           accepted = lisprop && risprop ==
919*22dc650dSSadaf Ebrahimi             (base_list[3] != p[0] &&
920*22dc650dSSadaf Ebrahimi              base_list[3] != p[1] &&
921*22dc650dSSadaf Ebrahimi             (base_list[3] != p[2] || !risprop));
922*22dc650dSSadaf Ebrahimi           break;
923*22dc650dSSadaf Ebrahimi 
924*22dc650dSSadaf Ebrahimi           case 12:  /* Left alphanum vs right particular category */
925*22dc650dSSadaf Ebrahimi           case 13:  /* Left space vs right particular category */
926*22dc650dSSadaf Ebrahimi           case 14:  /* Left word vs right particular category */
927*22dc650dSSadaf Ebrahimi           p = posspropstab[n-12];
928*22dc650dSSadaf Ebrahimi           accepted = risprop && lisprop ==
929*22dc650dSSadaf Ebrahimi             (catposstab[p[0]][list[3]] &&
930*22dc650dSSadaf Ebrahimi              catposstab[p[1]][list[3]] &&
931*22dc650dSSadaf Ebrahimi             (list[3] != p[3] || !lisprop));
932*22dc650dSSadaf Ebrahimi           break;
933*22dc650dSSadaf Ebrahimi 
934*22dc650dSSadaf Ebrahimi           case 15:  /* Right alphanum vs left particular category */
935*22dc650dSSadaf Ebrahimi           case 16:  /* Right space vs left particular category */
936*22dc650dSSadaf Ebrahimi           case 17:  /* Right word vs left particular category */
937*22dc650dSSadaf Ebrahimi           p = posspropstab[n-15];
938*22dc650dSSadaf Ebrahimi           accepted = lisprop && risprop ==
939*22dc650dSSadaf Ebrahimi             (catposstab[p[0]][base_list[3]] &&
940*22dc650dSSadaf Ebrahimi              catposstab[p[1]][base_list[3]] &&
941*22dc650dSSadaf Ebrahimi             (base_list[3] != p[3] || !risprop));
942*22dc650dSSadaf Ebrahimi           break;
943*22dc650dSSadaf Ebrahimi           }
944*22dc650dSSadaf Ebrahimi         }
945*22dc650dSSadaf Ebrahimi       }
946*22dc650dSSadaf Ebrahimi 
947*22dc650dSSadaf Ebrahimi     else
948*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
949*22dc650dSSadaf Ebrahimi 
950*22dc650dSSadaf Ebrahimi     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
951*22dc650dSSadaf Ebrahimi            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
952*22dc650dSSadaf Ebrahimi            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
953*22dc650dSSadaf Ebrahimi 
954*22dc650dSSadaf Ebrahimi     if (!accepted) return FALSE;
955*22dc650dSSadaf Ebrahimi 
956*22dc650dSSadaf Ebrahimi     if (list[1] == 0) return TRUE;
957*22dc650dSSadaf Ebrahimi     /* Might be an empty repeat. */
958*22dc650dSSadaf Ebrahimi     continue;
959*22dc650dSSadaf Ebrahimi     }
960*22dc650dSSadaf Ebrahimi 
961*22dc650dSSadaf Ebrahimi   /* Control reaches here only if one of the items is a small character list.
962*22dc650dSSadaf Ebrahimi   All characters are checked against the other side. */
963*22dc650dSSadaf Ebrahimi 
964*22dc650dSSadaf Ebrahimi   do
965*22dc650dSSadaf Ebrahimi     {
966*22dc650dSSadaf Ebrahimi     chr = *chr_ptr;
967*22dc650dSSadaf Ebrahimi 
968*22dc650dSSadaf Ebrahimi     switch(list_ptr[0])
969*22dc650dSSadaf Ebrahimi       {
970*22dc650dSSadaf Ebrahimi       case OP_CHAR:
971*22dc650dSSadaf Ebrahimi       ochr_ptr = list_ptr + 2;
972*22dc650dSSadaf Ebrahimi       do
973*22dc650dSSadaf Ebrahimi         {
974*22dc650dSSadaf Ebrahimi         if (chr == *ochr_ptr) return FALSE;
975*22dc650dSSadaf Ebrahimi         ochr_ptr++;
976*22dc650dSSadaf Ebrahimi         }
977*22dc650dSSadaf Ebrahimi       while(*ochr_ptr != NOTACHAR);
978*22dc650dSSadaf Ebrahimi       break;
979*22dc650dSSadaf Ebrahimi 
980*22dc650dSSadaf Ebrahimi       case OP_NOT:
981*22dc650dSSadaf Ebrahimi       ochr_ptr = list_ptr + 2;
982*22dc650dSSadaf Ebrahimi       do
983*22dc650dSSadaf Ebrahimi         {
984*22dc650dSSadaf Ebrahimi         if (chr == *ochr_ptr)
985*22dc650dSSadaf Ebrahimi           break;
986*22dc650dSSadaf Ebrahimi         ochr_ptr++;
987*22dc650dSSadaf Ebrahimi         }
988*22dc650dSSadaf Ebrahimi       while(*ochr_ptr != NOTACHAR);
989*22dc650dSSadaf Ebrahimi       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
990*22dc650dSSadaf Ebrahimi       break;
991*22dc650dSSadaf Ebrahimi 
992*22dc650dSSadaf Ebrahimi       /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
993*22dc650dSSadaf Ebrahimi       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
994*22dc650dSSadaf Ebrahimi 
995*22dc650dSSadaf Ebrahimi       case OP_DIGIT:
996*22dc650dSSadaf Ebrahimi       if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;
997*22dc650dSSadaf Ebrahimi       break;
998*22dc650dSSadaf Ebrahimi 
999*22dc650dSSadaf Ebrahimi       case OP_NOT_DIGIT:
1000*22dc650dSSadaf Ebrahimi       if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;
1001*22dc650dSSadaf Ebrahimi       break;
1002*22dc650dSSadaf Ebrahimi 
1003*22dc650dSSadaf Ebrahimi       case OP_WHITESPACE:
1004*22dc650dSSadaf Ebrahimi       if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;
1005*22dc650dSSadaf Ebrahimi       break;
1006*22dc650dSSadaf Ebrahimi 
1007*22dc650dSSadaf Ebrahimi       case OP_NOT_WHITESPACE:
1008*22dc650dSSadaf Ebrahimi       if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;
1009*22dc650dSSadaf Ebrahimi       break;
1010*22dc650dSSadaf Ebrahimi 
1011*22dc650dSSadaf Ebrahimi       case OP_WORDCHAR:
1012*22dc650dSSadaf Ebrahimi       if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;
1013*22dc650dSSadaf Ebrahimi       break;
1014*22dc650dSSadaf Ebrahimi 
1015*22dc650dSSadaf Ebrahimi       case OP_NOT_WORDCHAR:
1016*22dc650dSSadaf Ebrahimi       if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;
1017*22dc650dSSadaf Ebrahimi       break;
1018*22dc650dSSadaf Ebrahimi 
1019*22dc650dSSadaf Ebrahimi       case OP_HSPACE:
1020*22dc650dSSadaf Ebrahimi       switch(chr)
1021*22dc650dSSadaf Ebrahimi         {
1022*22dc650dSSadaf Ebrahimi         HSPACE_CASES: return FALSE;
1023*22dc650dSSadaf Ebrahimi         default: break;
1024*22dc650dSSadaf Ebrahimi         }
1025*22dc650dSSadaf Ebrahimi       break;
1026*22dc650dSSadaf Ebrahimi 
1027*22dc650dSSadaf Ebrahimi       case OP_NOT_HSPACE:
1028*22dc650dSSadaf Ebrahimi       switch(chr)
1029*22dc650dSSadaf Ebrahimi         {
1030*22dc650dSSadaf Ebrahimi         HSPACE_CASES: break;
1031*22dc650dSSadaf Ebrahimi         default: return FALSE;
1032*22dc650dSSadaf Ebrahimi         }
1033*22dc650dSSadaf Ebrahimi       break;
1034*22dc650dSSadaf Ebrahimi 
1035*22dc650dSSadaf Ebrahimi       case OP_ANYNL:
1036*22dc650dSSadaf Ebrahimi       case OP_VSPACE:
1037*22dc650dSSadaf Ebrahimi       switch(chr)
1038*22dc650dSSadaf Ebrahimi         {
1039*22dc650dSSadaf Ebrahimi         VSPACE_CASES: return FALSE;
1040*22dc650dSSadaf Ebrahimi         default: break;
1041*22dc650dSSadaf Ebrahimi         }
1042*22dc650dSSadaf Ebrahimi       break;
1043*22dc650dSSadaf Ebrahimi 
1044*22dc650dSSadaf Ebrahimi       case OP_NOT_VSPACE:
1045*22dc650dSSadaf Ebrahimi       switch(chr)
1046*22dc650dSSadaf Ebrahimi         {
1047*22dc650dSSadaf Ebrahimi         VSPACE_CASES: break;
1048*22dc650dSSadaf Ebrahimi         default: return FALSE;
1049*22dc650dSSadaf Ebrahimi         }
1050*22dc650dSSadaf Ebrahimi       break;
1051*22dc650dSSadaf Ebrahimi 
1052*22dc650dSSadaf Ebrahimi       case OP_DOLL:
1053*22dc650dSSadaf Ebrahimi       case OP_EODN:
1054*22dc650dSSadaf Ebrahimi       switch (chr)
1055*22dc650dSSadaf Ebrahimi         {
1056*22dc650dSSadaf Ebrahimi         case CHAR_CR:
1057*22dc650dSSadaf Ebrahimi         case CHAR_LF:
1058*22dc650dSSadaf Ebrahimi         case CHAR_VT:
1059*22dc650dSSadaf Ebrahimi         case CHAR_FF:
1060*22dc650dSSadaf Ebrahimi         case CHAR_NEL:
1061*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
1062*22dc650dSSadaf Ebrahimi         case 0x2028:
1063*22dc650dSSadaf Ebrahimi         case 0x2029:
1064*22dc650dSSadaf Ebrahimi #endif  /* Not EBCDIC */
1065*22dc650dSSadaf Ebrahimi         return FALSE;
1066*22dc650dSSadaf Ebrahimi         }
1067*22dc650dSSadaf Ebrahimi       break;
1068*22dc650dSSadaf Ebrahimi 
1069*22dc650dSSadaf Ebrahimi       case OP_EOD:    /* Can always possessify before \z */
1070*22dc650dSSadaf Ebrahimi       break;
1071*22dc650dSSadaf Ebrahimi 
1072*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1073*22dc650dSSadaf Ebrahimi       case OP_PROP:
1074*22dc650dSSadaf Ebrahimi       case OP_NOTPROP:
1075*22dc650dSSadaf Ebrahimi       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
1076*22dc650dSSadaf Ebrahimi             list_ptr[0] == OP_NOTPROP))
1077*22dc650dSSadaf Ebrahimi         return FALSE;
1078*22dc650dSSadaf Ebrahimi       break;
1079*22dc650dSSadaf Ebrahimi #endif
1080*22dc650dSSadaf Ebrahimi 
1081*22dc650dSSadaf Ebrahimi       case OP_NCLASS:
1082*22dc650dSSadaf Ebrahimi       if (chr > 255) return FALSE;
1083*22dc650dSSadaf Ebrahimi       /* Fall through */
1084*22dc650dSSadaf Ebrahimi 
1085*22dc650dSSadaf Ebrahimi       case OP_CLASS:
1086*22dc650dSSadaf Ebrahimi       if (chr > 255) break;
1087*22dc650dSSadaf Ebrahimi       class_bitset = (uint8_t *)
1088*22dc650dSSadaf Ebrahimi         ((list_ptr == list ? code : base_end) - list_ptr[2]);
1089*22dc650dSSadaf Ebrahimi       if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE;
1090*22dc650dSSadaf Ebrahimi       break;
1091*22dc650dSSadaf Ebrahimi 
1092*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
1093*22dc650dSSadaf Ebrahimi       case OP_XCLASS:
1094*22dc650dSSadaf Ebrahimi       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
1095*22dc650dSSadaf Ebrahimi           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
1096*22dc650dSSadaf Ebrahimi       break;
1097*22dc650dSSadaf Ebrahimi #endif
1098*22dc650dSSadaf Ebrahimi 
1099*22dc650dSSadaf Ebrahimi       default:
1100*22dc650dSSadaf Ebrahimi       return FALSE;
1101*22dc650dSSadaf Ebrahimi       }
1102*22dc650dSSadaf Ebrahimi 
1103*22dc650dSSadaf Ebrahimi     chr_ptr++;
1104*22dc650dSSadaf Ebrahimi     }
1105*22dc650dSSadaf Ebrahimi   while(*chr_ptr != NOTACHAR);
1106*22dc650dSSadaf Ebrahimi 
1107*22dc650dSSadaf Ebrahimi   /* At least one character must be matched from this opcode. */
1108*22dc650dSSadaf Ebrahimi 
1109*22dc650dSSadaf Ebrahimi   if (list[1] == 0) return TRUE;
1110*22dc650dSSadaf Ebrahimi   }
1111*22dc650dSSadaf Ebrahimi 
1112*22dc650dSSadaf Ebrahimi /* Control never reaches here. There used to be a fail-safe return FALSE; here,
1113*22dc650dSSadaf Ebrahimi but some compilers complain about an unreachable statement. */
1114*22dc650dSSadaf Ebrahimi }
1115*22dc650dSSadaf Ebrahimi 
1116*22dc650dSSadaf Ebrahimi 
1117*22dc650dSSadaf Ebrahimi 
1118*22dc650dSSadaf Ebrahimi /*************************************************
1119*22dc650dSSadaf Ebrahimi *    Scan compiled regex for auto-possession     *
1120*22dc650dSSadaf Ebrahimi *************************************************/
1121*22dc650dSSadaf Ebrahimi 
1122*22dc650dSSadaf Ebrahimi /* Replaces single character iterations with their possessive alternatives
1123*22dc650dSSadaf Ebrahimi if appropriate. This function modifies the compiled opcode! Hitting a
1124*22dc650dSSadaf Ebrahimi non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a
1125*22dc650dSSadaf Ebrahimi bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches
1126*22dc650dSSadaf Ebrahimi overly complicated or large patterns. In these cases, the check just stops,
1127*22dc650dSSadaf Ebrahimi leaving the remainder of the pattern unpossessified.
1128*22dc650dSSadaf Ebrahimi 
1129*22dc650dSSadaf Ebrahimi Arguments:
1130*22dc650dSSadaf Ebrahimi   code        points to start of the byte code
1131*22dc650dSSadaf Ebrahimi   cb          compile data block
1132*22dc650dSSadaf Ebrahimi 
1133*22dc650dSSadaf Ebrahimi Returns:      0 for success
1134*22dc650dSSadaf Ebrahimi               -1 if a non-existent opcode is encountered
1135*22dc650dSSadaf Ebrahimi */
1136*22dc650dSSadaf Ebrahimi 
1137*22dc650dSSadaf Ebrahimi int
PRIV(auto_possessify)1138*22dc650dSSadaf Ebrahimi PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
1139*22dc650dSSadaf Ebrahimi {
/* Walks the compiled code one item at a time. For each repeat that qualifies,
the repeat opcode is overwritten in place with its possessive counterpart; the
walk ends with 0 at OP_END or with -1 on an opcode value outside the table. */
/* c: opcode under examination. It is normalized while testing repeats and is
always re-read from *code before it is used to index the length table. */
1140*22dc650dSSadaf Ebrahimi PCRE2_UCHAR c;
/* end: value returned by get_chr_property_list(); NULL means "do not try to
possessify this item". It is also what is handed to compare_opcodes(). */
1141*22dc650dSSadaf Ebrahimi PCRE2_SPTR end;
/* repeat_opcode: address of the repeat opcode that follows a character
class. */
1142*22dc650dSSadaf Ebrahimi PCRE2_UCHAR *repeat_opcode;
/* list: filled in by get_chr_property_list(); list[1] is then set here. */
1143*22dc650dSSadaf Ebrahimi uint32_t list[8];
/* rec_limit is passed by address to every compare_opcodes() call made during
this scan, so a single budget is shared by the whole pattern. */
1144*22dc650dSSadaf Ebrahimi int rec_limit = 1000;  /* Was 10,000 but clang+ASAN uses a lot of stack. */
1145*22dc650dSSadaf Ebrahimi BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
1146*22dc650dSSadaf Ebrahimi BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
1147*22dc650dSSadaf Ebrahimi 
/* Each pass handles one item: optionally possessify it, then step over it. */
1148*22dc650dSSadaf Ebrahimi for (;;)
1149*22dc650dSSadaf Ebrahimi   {
1150*22dc650dSSadaf Ebrahimi   c = *code;
1151*22dc650dSSadaf Ebrahimi 
1152*22dc650dSSadaf Ebrahimi   if (c >= OP_TABLE_LENGTH) return -1;   /* Something gone wrong */
1153*22dc650dSSadaf Ebrahimi 
  /* Case 1: the opcode lies in the range OP_STAR .. OP_TYPEPOSUPTO, i.e. it is
  itself a repeat opcode. */
1154*22dc650dSSadaf Ebrahimi   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
1155*22dc650dSSadaf Ebrahimi     {
    /* Map c onto the OP_STAR-based group so that one set of comparisons can
    serve every group. NOTE(review): this presumes get_repeat_base() returns
    the first opcode of the group containing c - confirm against its
    definition. */
1156*22dc650dSSadaf Ebrahimi     c -= get_repeat_base(c) - OP_STAR;
    /* Only normalized opcodes up to OP_MINUPTO are candidates; for anything
    above that, end stays NULL and the item is left untouched. */
1157*22dc650dSSadaf Ebrahimi     end = (c <= OP_MINUPTO) ?
1158*22dc650dSSadaf Ebrahimi       get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
    /* list[1] becomes 1 for the four non-MIN forms tested here and 0 for
    everything else. */
1159*22dc650dSSadaf Ebrahimi     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
1160*22dc650dSSadaf Ebrahimi 
1161*22dc650dSSadaf Ebrahimi     if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
1162*22dc650dSSadaf Ebrahimi         &rec_limit))
1163*22dc650dSSadaf Ebrahimi       {
      /* The offset is added to the real opcode in *code rather than to the
      normalized c, so the opcode stays within its own group. NOTE(review):
      this relies on all groups being laid out in the same order as the
      OP_STAR group - confirm against the opcode table. */
1164*22dc650dSSadaf Ebrahimi       switch(c)
1165*22dc650dSSadaf Ebrahimi         {
1166*22dc650dSSadaf Ebrahimi         case OP_STAR:
1167*22dc650dSSadaf Ebrahimi         *code += OP_POSSTAR - OP_STAR;
1168*22dc650dSSadaf Ebrahimi         break;
1169*22dc650dSSadaf Ebrahimi 
1170*22dc650dSSadaf Ebrahimi         case OP_MINSTAR:
1171*22dc650dSSadaf Ebrahimi         *code += OP_POSSTAR - OP_MINSTAR;
1172*22dc650dSSadaf Ebrahimi         break;
1173*22dc650dSSadaf Ebrahimi 
1174*22dc650dSSadaf Ebrahimi         case OP_PLUS:
1175*22dc650dSSadaf Ebrahimi         *code += OP_POSPLUS - OP_PLUS;
1176*22dc650dSSadaf Ebrahimi         break;
1177*22dc650dSSadaf Ebrahimi 
1178*22dc650dSSadaf Ebrahimi         case OP_MINPLUS:
1179*22dc650dSSadaf Ebrahimi         *code += OP_POSPLUS - OP_MINPLUS;
1180*22dc650dSSadaf Ebrahimi         break;
1181*22dc650dSSadaf Ebrahimi 
1182*22dc650dSSadaf Ebrahimi         case OP_QUERY:
1183*22dc650dSSadaf Ebrahimi         *code += OP_POSQUERY - OP_QUERY;
1184*22dc650dSSadaf Ebrahimi         break;
1185*22dc650dSSadaf Ebrahimi 
1186*22dc650dSSadaf Ebrahimi         case OP_MINQUERY:
1187*22dc650dSSadaf Ebrahimi         *code += OP_POSQUERY - OP_MINQUERY;
1188*22dc650dSSadaf Ebrahimi         break;
1189*22dc650dSSadaf Ebrahimi 
1190*22dc650dSSadaf Ebrahimi         case OP_UPTO:
1191*22dc650dSSadaf Ebrahimi         *code += OP_POSUPTO - OP_UPTO;
1192*22dc650dSSadaf Ebrahimi         break;
1193*22dc650dSSadaf Ebrahimi 
1194*22dc650dSSadaf Ebrahimi         case OP_MINUPTO:
1195*22dc650dSSadaf Ebrahimi         *code += OP_POSUPTO - OP_MINUPTO;
1196*22dc650dSSadaf Ebrahimi         break;
1197*22dc650dSSadaf Ebrahimi         }
1198*22dc650dSSadaf Ebrahimi       }
    /* Discard the normalized value: from here on c must be the opcode that is
    actually stored (possibly just rewritten), because it selects the skip
    case and the table length below. */
1199*22dc650dSSadaf Ebrahimi     c = *code;
1200*22dc650dSSadaf Ebrahimi     }
  /* Case 2: a character class; the repeat, if any, is a separate opcode that
  follows the class data. */
1201*22dc650dSSadaf Ebrahimi   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
1202*22dc650dSSadaf Ebrahimi     {
    /* Find the opcode after the class: OP_XCLASS stores its own length at
    offset 1; the other two are followed by 32 bytes of data, expressed here
    in code units. */
1203*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
1204*22dc650dSSadaf Ebrahimi     if (c == OP_XCLASS)
1205*22dc650dSSadaf Ebrahimi       repeat_opcode = code + GET(code, 1);
1206*22dc650dSSadaf Ebrahimi     else
1207*22dc650dSSadaf Ebrahimi #endif
1208*22dc650dSSadaf Ebrahimi       repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
1209*22dc650dSSadaf Ebrahimi 
1210*22dc650dSSadaf Ebrahimi     c = *repeat_opcode;
1211*22dc650dSSadaf Ebrahimi     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
1212*22dc650dSSadaf Ebrahimi       {
1213*22dc650dSSadaf Ebrahimi       /* The return from get_chr_property_list() will never be NULL when
1214*22dc650dSSadaf Ebrahimi       *code (aka c) is one of the three class opcodes. However, gcc with
1215*22dc650dSSadaf Ebrahimi       -fanalyzer notes that a NULL return is possible, and grumbles. Hence we
1216*22dc650dSSadaf Ebrahimi       put in a check. */
1217*22dc650dSSadaf Ebrahimi 
1218*22dc650dSSadaf Ebrahimi       end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
      /* list[1] is 1 when the repeat opcode value is even, otherwise 0.
      NOTE(review): presumably the non-MIN class repeats have even opcode
      values - confirm against the opcode table. */
1219*22dc650dSSadaf Ebrahimi       list[1] = (c & 1) == 0;
1220*22dc650dSSadaf Ebrahimi 
1221*22dc650dSSadaf Ebrahimi       if (end != NULL &&
1222*22dc650dSSadaf Ebrahimi           compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
1223*22dc650dSSadaf Ebrahimi         {
        /* Both forms of each class repeat are replaced by the same possessive
        opcode, written over the repeat opcode rather than over *code. */
1224*22dc650dSSadaf Ebrahimi         switch (c)
1225*22dc650dSSadaf Ebrahimi           {
1226*22dc650dSSadaf Ebrahimi           case OP_CRSTAR:
1227*22dc650dSSadaf Ebrahimi           case OP_CRMINSTAR:
1228*22dc650dSSadaf Ebrahimi           *repeat_opcode = OP_CRPOSSTAR;
1229*22dc650dSSadaf Ebrahimi           break;
1230*22dc650dSSadaf Ebrahimi 
1231*22dc650dSSadaf Ebrahimi           case OP_CRPLUS:
1232*22dc650dSSadaf Ebrahimi           case OP_CRMINPLUS:
1233*22dc650dSSadaf Ebrahimi           *repeat_opcode = OP_CRPOSPLUS;
1234*22dc650dSSadaf Ebrahimi           break;
1235*22dc650dSSadaf Ebrahimi 
1236*22dc650dSSadaf Ebrahimi           case OP_CRQUERY:
1237*22dc650dSSadaf Ebrahimi           case OP_CRMINQUERY:
1238*22dc650dSSadaf Ebrahimi           *repeat_opcode = OP_CRPOSQUERY;
1239*22dc650dSSadaf Ebrahimi           break;
1240*22dc650dSSadaf Ebrahimi 
1241*22dc650dSSadaf Ebrahimi           case OP_CRRANGE:
1242*22dc650dSSadaf Ebrahimi           case OP_CRMINRANGE:
1243*22dc650dSSadaf Ebrahimi           *repeat_opcode = OP_CRPOSRANGE;
1244*22dc650dSSadaf Ebrahimi           break;
1245*22dc650dSSadaf Ebrahimi           }
1246*22dc650dSSadaf Ebrahimi         }
1247*22dc650dSSadaf Ebrahimi       }
    /* c was reused for the repeat opcode; restore the class opcode so that the
    skip logic below steps over the class itself. */
1248*22dc650dSSadaf Ebrahimi     c = *code;
1249*22dc650dSSadaf Ebrahimi     }
1250*22dc650dSSadaf Ebrahimi 
  /* Step over the variable-length part of the current item, if it has one.
  Opcodes not listed here need only the fixed length added afterwards. */
1251*22dc650dSSadaf Ebrahimi   switch(c)
1252*22dc650dSSadaf Ebrahimi     {
1253*22dc650dSSadaf Ebrahimi     case OP_END:
1254*22dc650dSSadaf Ebrahimi     return 0;
1255*22dc650dSSadaf Ebrahimi 
    /* Type repeats without a count: when the repeated type is OP_PROP or
    OP_NOTPROP, two further code units are skipped (presumably the property
    type and value - TODO confirm). */
1256*22dc650dSSadaf Ebrahimi     case OP_TYPESTAR:
1257*22dc650dSSadaf Ebrahimi     case OP_TYPEMINSTAR:
1258*22dc650dSSadaf Ebrahimi     case OP_TYPEPLUS:
1259*22dc650dSSadaf Ebrahimi     case OP_TYPEMINPLUS:
1260*22dc650dSSadaf Ebrahimi     case OP_TYPEQUERY:
1261*22dc650dSSadaf Ebrahimi     case OP_TYPEMINQUERY:
1262*22dc650dSSadaf Ebrahimi     case OP_TYPEPOSSTAR:
1263*22dc650dSSadaf Ebrahimi     case OP_TYPEPOSPLUS:
1264*22dc650dSSadaf Ebrahimi     case OP_TYPEPOSQUERY:
1265*22dc650dSSadaf Ebrahimi     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1266*22dc650dSSadaf Ebrahimi     break;
1267*22dc650dSSadaf Ebrahimi 
    /* Type repeats with a count: same test, but the type opcode is looked for
    IMM2_SIZE code units further on. */
1268*22dc650dSSadaf Ebrahimi     case OP_TYPEUPTO:
1269*22dc650dSSadaf Ebrahimi     case OP_TYPEMINUPTO:
1270*22dc650dSSadaf Ebrahimi     case OP_TYPEEXACT:
1271*22dc650dSSadaf Ebrahimi     case OP_TYPEPOSUPTO:
1272*22dc650dSSadaf Ebrahimi     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1273*22dc650dSSadaf Ebrahimi       code += 2;
1274*22dc650dSSadaf Ebrahimi     break;
1275*22dc650dSSadaf Ebrahimi 
    /* The amount to skip is read from the item at offset 1 + 2*LINK_SIZE. */
1276*22dc650dSSadaf Ebrahimi     case OP_CALLOUT_STR:
1277*22dc650dSSadaf Ebrahimi     code += GET(code, 1 + 2*LINK_SIZE);
1278*22dc650dSSadaf Ebrahimi     break;
1279*22dc650dSSadaf Ebrahimi 
    /* The amount to skip is read from the item at offset 1, the same value
    used above to locate the repeat opcode. */
1280*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_WIDE_CHARS
1281*22dc650dSSadaf Ebrahimi     case OP_XCLASS:
1282*22dc650dSSadaf Ebrahimi     code += GET(code, 1);
1283*22dc650dSSadaf Ebrahimi     break;
1284*22dc650dSSadaf Ebrahimi #endif
1285*22dc650dSSadaf Ebrahimi 
    /* The code unit at offset 1 is added on top of the fixed length
    (presumably the length of the argument string - TODO confirm). */
1286*22dc650dSSadaf Ebrahimi     case OP_MARK:
1287*22dc650dSSadaf Ebrahimi     case OP_COMMIT_ARG:
1288*22dc650dSSadaf Ebrahimi     case OP_PRUNE_ARG:
1289*22dc650dSSadaf Ebrahimi     case OP_SKIP_ARG:
1290*22dc650dSSadaf Ebrahimi     case OP_THEN_ARG:
1291*22dc650dSSadaf Ebrahimi     code += code[1];
1292*22dc650dSSadaf Ebrahimi     break;
1293*22dc650dSSadaf Ebrahimi     }
1294*22dc650dSSadaf Ebrahimi 
1295*22dc650dSSadaf Ebrahimi   /* Add in the fixed length from the table */
1296*22dc650dSSadaf Ebrahimi 
1297*22dc650dSSadaf Ebrahimi   code += PRIV(OP_lengths)[c];
1298*22dc650dSSadaf Ebrahimi 
1299*22dc650dSSadaf Ebrahimi   /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
1300*22dc650dSSadaf Ebrahimi   followed by a multi-byte character. The length in the table is a minimum, so
1301*22dc650dSSadaf Ebrahimi   we have to arrange to skip the extra code units. */
1302*22dc650dSSadaf Ebrahimi 
  /* At this point code[-1] is the last code unit covered by the table length;
  for the opcodes listed it is presumably the first code unit of the
  character, which is what HAS_EXTRALEN/GET_EXTRALEN are applied to. */
1303*22dc650dSSadaf Ebrahimi #ifdef MAYBE_UTF_MULTI
1304*22dc650dSSadaf Ebrahimi   if (utf) switch(c)
1305*22dc650dSSadaf Ebrahimi     {
1306*22dc650dSSadaf Ebrahimi     case OP_CHAR:
1307*22dc650dSSadaf Ebrahimi     case OP_CHARI:
1308*22dc650dSSadaf Ebrahimi     case OP_NOT:
1309*22dc650dSSadaf Ebrahimi     case OP_NOTI:
1310*22dc650dSSadaf Ebrahimi     case OP_STAR:
1311*22dc650dSSadaf Ebrahimi     case OP_MINSTAR:
1312*22dc650dSSadaf Ebrahimi     case OP_PLUS:
1313*22dc650dSSadaf Ebrahimi     case OP_MINPLUS:
1314*22dc650dSSadaf Ebrahimi     case OP_QUERY:
1315*22dc650dSSadaf Ebrahimi     case OP_MINQUERY:
1316*22dc650dSSadaf Ebrahimi     case OP_UPTO:
1317*22dc650dSSadaf Ebrahimi     case OP_MINUPTO:
1318*22dc650dSSadaf Ebrahimi     case OP_EXACT:
1319*22dc650dSSadaf Ebrahimi     case OP_POSSTAR:
1320*22dc650dSSadaf Ebrahimi     case OP_POSPLUS:
1321*22dc650dSSadaf Ebrahimi     case OP_POSQUERY:
1322*22dc650dSSadaf Ebrahimi     case OP_POSUPTO:
1323*22dc650dSSadaf Ebrahimi     case OP_STARI:
1324*22dc650dSSadaf Ebrahimi     case OP_MINSTARI:
1325*22dc650dSSadaf Ebrahimi     case OP_PLUSI:
1326*22dc650dSSadaf Ebrahimi     case OP_MINPLUSI:
1327*22dc650dSSadaf Ebrahimi     case OP_QUERYI:
1328*22dc650dSSadaf Ebrahimi     case OP_MINQUERYI:
1329*22dc650dSSadaf Ebrahimi     case OP_UPTOI:
1330*22dc650dSSadaf Ebrahimi     case OP_MINUPTOI:
1331*22dc650dSSadaf Ebrahimi     case OP_EXACTI:
1332*22dc650dSSadaf Ebrahimi     case OP_POSSTARI:
1333*22dc650dSSadaf Ebrahimi     case OP_POSPLUSI:
1334*22dc650dSSadaf Ebrahimi     case OP_POSQUERYI:
1335*22dc650dSSadaf Ebrahimi     case OP_POSUPTOI:
1336*22dc650dSSadaf Ebrahimi     case OP_NOTSTAR:
1337*22dc650dSSadaf Ebrahimi     case OP_NOTMINSTAR:
1338*22dc650dSSadaf Ebrahimi     case OP_NOTPLUS:
1339*22dc650dSSadaf Ebrahimi     case OP_NOTMINPLUS:
1340*22dc650dSSadaf Ebrahimi     case OP_NOTQUERY:
1341*22dc650dSSadaf Ebrahimi     case OP_NOTMINQUERY:
1342*22dc650dSSadaf Ebrahimi     case OP_NOTUPTO:
1343*22dc650dSSadaf Ebrahimi     case OP_NOTMINUPTO:
1344*22dc650dSSadaf Ebrahimi     case OP_NOTEXACT:
1345*22dc650dSSadaf Ebrahimi     case OP_NOTPOSSTAR:
1346*22dc650dSSadaf Ebrahimi     case OP_NOTPOSPLUS:
1347*22dc650dSSadaf Ebrahimi     case OP_NOTPOSQUERY:
1348*22dc650dSSadaf Ebrahimi     case OP_NOTPOSUPTO:
1349*22dc650dSSadaf Ebrahimi     case OP_NOTSTARI:
1350*22dc650dSSadaf Ebrahimi     case OP_NOTMINSTARI:
1351*22dc650dSSadaf Ebrahimi     case OP_NOTPLUSI:
1352*22dc650dSSadaf Ebrahimi     case OP_NOTMINPLUSI:
1353*22dc650dSSadaf Ebrahimi     case OP_NOTQUERYI:
1354*22dc650dSSadaf Ebrahimi     case OP_NOTMINQUERYI:
1355*22dc650dSSadaf Ebrahimi     case OP_NOTUPTOI:
1356*22dc650dSSadaf Ebrahimi     case OP_NOTMINUPTOI:
1357*22dc650dSSadaf Ebrahimi     case OP_NOTEXACTI:
1358*22dc650dSSadaf Ebrahimi     case OP_NOTPOSSTARI:
1359*22dc650dSSadaf Ebrahimi     case OP_NOTPOSPLUSI:
1360*22dc650dSSadaf Ebrahimi     case OP_NOTPOSQUERYI:
1361*22dc650dSSadaf Ebrahimi     case OP_NOTPOSUPTOI:
1362*22dc650dSSadaf Ebrahimi     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1363*22dc650dSSadaf Ebrahimi     break;
1364*22dc650dSSadaf Ebrahimi     }
1365*22dc650dSSadaf Ebrahimi #else
1366*22dc650dSSadaf Ebrahimi   (void)(utf);  /* Keep compiler happy by referencing the variable */
1367*22dc650dSSadaf Ebrahimi #endif  /* MAYBE_UTF_MULTI */
1368*22dc650dSSadaf Ebrahimi   }
1369*22dc650dSSadaf Ebrahimi }
1370*22dc650dSSadaf Ebrahimi 
1371*22dc650dSSadaf Ebrahimi /* End of pcre2_auto_possess.c */
1372