1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi * Perl-Compatible Regular Expressions *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi
8*22dc650dSSadaf Ebrahimi Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi New API code Copyright (c) 2016-2023 University of Cambridge
11*22dc650dSSadaf Ebrahimi
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi
16*22dc650dSSadaf Ebrahimi * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi
19*22dc650dSSadaf Ebrahimi * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi
23*22dc650dSSadaf Ebrahimi * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi
41*22dc650dSSadaf Ebrahimi
/* This module contains the external function pcre2_dfa_match(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
46*22dc650dSSadaf Ebrahimi
47*22dc650dSSadaf Ebrahimi
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
the performance of his patterns greatly. I could not use it as it stood, as it
was not thread safe, and made assumptions about pattern sizes. Also, it caused
test 7 to loop, and test 9 to crash with a segfault.

The issue is the check for duplicate states, which is done by a simple linear
search up the state list. (Grep for "duplicate" below to find the code.) For
many patterns, there will never be many states active at one time, so a simple
linear search is fine. In patterns that have many active states, it might be a
bottleneck. The suggested code used an indexing scheme to remember which states
had previously been used for each character, and avoided the linear search when
it knew there was no chance of a duplicate. This was implemented when adding
states to the state lists.

I wrote some thread-safe, not-limited code to try something similar at the time
of checking for duplicates (instead of when adding states), using index vectors
on the stack. It did give a 13% improvement with one specially constructed
pattern for certain subject strings, but on other strings and on many of the
simpler patterns in the test suite it did worse. The major problem, I think,
was the extra time to initialize the index. This had to be done for each call
of internal_dfa_match(). (The supplied patch used a static vector, initialized
only once - I suspect this was the cause of the problems with the tests.)

Overall, I concluded that the gains in some cases did not outweigh the losses
in others, so I abandoned this code. */
73*22dc650dSSadaf Ebrahimi
74*22dc650dSSadaf Ebrahimi
75*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
76*22dc650dSSadaf Ebrahimi #include "config.h"
77*22dc650dSSadaf Ebrahimi #endif
78*22dc650dSSadaf Ebrahimi
79*22dc650dSSadaf Ebrahimi #define NLBLOCK mb /* Block containing newline information */
80*22dc650dSSadaf Ebrahimi #define PSSTART start_subject /* Field containing processed string start */
81*22dc650dSSadaf Ebrahimi #define PSEND end_subject /* Field containing processed string end */
82*22dc650dSSadaf Ebrahimi
83*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
84*22dc650dSSadaf Ebrahimi
/* Mask built by OR-ing together the PCRE2_xxx option bits listed below.
NOTE(review): presumably this is the set of options that pcre2_dfa_match()
accepts from its caller - confirm at the point of use, which is outside this
part of the file. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \
   PCRE2_COPY_MATCHED_SUBJECT)
90*22dc650dSSadaf Ebrahimi
91*22dc650dSSadaf Ebrahimi
/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106*22dc650dSSadaf Ebrahimi
107*22dc650dSSadaf Ebrahimi
108*22dc650dSSadaf Ebrahimi /* This table identifies those opcodes that are followed immediately by a
109*22dc650dSSadaf Ebrahimi character that is to be tested in some way. This makes it possible to
110*22dc650dSSadaf Ebrahimi centralize the loading of these characters. In the case of Type * etc, the
111*22dc650dSSadaf Ebrahimi "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112*22dc650dSSadaf Ebrahimi small value. Non-zero values in the table are the offsets from the opcode where
113*22dc650dSSadaf Ebrahimi the character is to be found. ***NOTE*** If the start of this table is
114*22dc650dSSadaf Ebrahimi modified, the three tables that follow must also be modified. */
115*22dc650dSSadaf Ebrahimi
116*22dc650dSSadaf Ebrahimi static const uint8_t coptable[] = {
117*22dc650dSSadaf Ebrahimi 0, /* End */
118*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120*22dc650dSSadaf Ebrahimi 0, 0, 0, /* Any, AllAny, Anybyte */
121*22dc650dSSadaf Ebrahimi 0, 0, /* \P, \p */
122*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123*22dc650dSSadaf Ebrahimi 0, /* \X */
124*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
125*22dc650dSSadaf Ebrahimi 1, /* Char */
126*22dc650dSSadaf Ebrahimi 1, /* Chari */
127*22dc650dSSadaf Ebrahimi 1, /* not */
128*22dc650dSSadaf Ebrahimi 1, /* noti */
129*22dc650dSSadaf Ebrahimi /* Positive single-char repeats */
130*22dc650dSSadaf Ebrahimi 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131*22dc650dSSadaf Ebrahimi 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
132*22dc650dSSadaf Ebrahimi 1+IMM2_SIZE, /* exact */
133*22dc650dSSadaf Ebrahimi 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
134*22dc650dSSadaf Ebrahimi 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
135*22dc650dSSadaf Ebrahimi 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
136*22dc650dSSadaf Ebrahimi 1+IMM2_SIZE, /* exact I */
137*22dc650dSSadaf Ebrahimi 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
138*22dc650dSSadaf Ebrahimi /* Negative single-char repeats - only for chars < 256 */
139*22dc650dSSadaf Ebrahimi 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
140*22dc650dSSadaf Ebrahimi 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
141*22dc650dSSadaf Ebrahimi 1+IMM2_SIZE, /* NOT exact */
142*22dc650dSSadaf Ebrahimi 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
143*22dc650dSSadaf Ebrahimi 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
144*22dc650dSSadaf Ebrahimi 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
145*22dc650dSSadaf Ebrahimi 1+IMM2_SIZE, /* NOT exact I */
146*22dc650dSSadaf Ebrahimi 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
147*22dc650dSSadaf Ebrahimi /* Positive type repeats */
148*22dc650dSSadaf Ebrahimi 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
149*22dc650dSSadaf Ebrahimi 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
150*22dc650dSSadaf Ebrahimi 1+IMM2_SIZE, /* Type exact */
151*22dc650dSSadaf Ebrahimi 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
152*22dc650dSSadaf Ebrahimi /* Character class & ref repeats */
153*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154*22dc650dSSadaf Ebrahimi 0, 0, /* CRRANGE, CRMINRANGE */
155*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
156*22dc650dSSadaf Ebrahimi 0, /* CLASS */
157*22dc650dSSadaf Ebrahimi 0, /* NCLASS */
158*22dc650dSSadaf Ebrahimi 0, /* XCLASS - variable length */
159*22dc650dSSadaf Ebrahimi 0, /* REF */
160*22dc650dSSadaf Ebrahimi 0, /* REFI */
161*22dc650dSSadaf Ebrahimi 0, /* DNREF */
162*22dc650dSSadaf Ebrahimi 0, /* DNREFI */
163*22dc650dSSadaf Ebrahimi 0, /* RECURSE */
164*22dc650dSSadaf Ebrahimi 0, /* CALLOUT */
165*22dc650dSSadaf Ebrahimi 0, /* CALLOUT_STR */
166*22dc650dSSadaf Ebrahimi 0, /* Alt */
167*22dc650dSSadaf Ebrahimi 0, /* Ket */
168*22dc650dSSadaf Ebrahimi 0, /* KetRmax */
169*22dc650dSSadaf Ebrahimi 0, /* KetRmin */
170*22dc650dSSadaf Ebrahimi 0, /* KetRpos */
171*22dc650dSSadaf Ebrahimi 0, 0, /* Reverse, Vreverse */
172*22dc650dSSadaf Ebrahimi 0, /* Assert */
173*22dc650dSSadaf Ebrahimi 0, /* Assert not */
174*22dc650dSSadaf Ebrahimi 0, /* Assert behind */
175*22dc650dSSadaf Ebrahimi 0, /* Assert behind not */
176*22dc650dSSadaf Ebrahimi 0, /* NA assert */
177*22dc650dSSadaf Ebrahimi 0, /* NA assert behind */
178*22dc650dSSadaf Ebrahimi 0, /* ONCE */
179*22dc650dSSadaf Ebrahimi 0, /* SCRIPT_RUN */
180*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
181*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
182*22dc650dSSadaf Ebrahimi 0, 0, /* CREF, DNCREF */
183*22dc650dSSadaf Ebrahimi 0, 0, /* RREF, DNRREF */
184*22dc650dSSadaf Ebrahimi 0, 0, /* FALSE, TRUE */
185*22dc650dSSadaf Ebrahimi 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
186*22dc650dSSadaf Ebrahimi 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
187*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
188*22dc650dSSadaf Ebrahimi 0, 0, /* COMMIT, COMMIT_ARG */
189*22dc650dSSadaf Ebrahimi 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
190*22dc650dSSadaf Ebrahimi 0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
191*22dc650dSSadaf Ebrahimi 0, 0 /* \B and \b in UCP mode */
192*22dc650dSSadaf Ebrahimi };
193*22dc650dSSadaf Ebrahimi
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. An entry is 1 for an opcode that inspects a character
and 0 otherwise; there is one entry per opcode, in opcode order. ***NOTE*** If
the start of this table is modified, the two tables that follow must also be
modified. */

static const uint8_t poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0, 0,                          /* Reverse, Vreverse                      */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* NA assert                              */
  0,                             /* NA assert behind                       */
  0,                             /* ONCE                                   */
  0,                             /* SCRIPT_RUN                             */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE                */
  1, 1                           /* \B and \b in UCP mode                  */
};
271*22dc650dSSadaf Ebrahimi
272*22dc650dSSadaf Ebrahimi /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
273*22dc650dSSadaf Ebrahimi and \w */
274*22dc650dSSadaf Ebrahimi
275*22dc650dSSadaf Ebrahimi static const uint8_t toptable1[] = {
276*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, 0,
277*22dc650dSSadaf Ebrahimi ctype_digit, ctype_digit,
278*22dc650dSSadaf Ebrahimi ctype_space, ctype_space,
279*22dc650dSSadaf Ebrahimi ctype_word, ctype_word,
280*22dc650dSSadaf Ebrahimi 0, 0 /* OP_ANY, OP_ALLANY */
281*22dc650dSSadaf Ebrahimi };
282*22dc650dSSadaf Ebrahimi
283*22dc650dSSadaf Ebrahimi static const uint8_t toptable2[] = {
284*22dc650dSSadaf Ebrahimi 0, 0, 0, 0, 0, 0,
285*22dc650dSSadaf Ebrahimi ctype_digit, 0,
286*22dc650dSSadaf Ebrahimi ctype_space, 0,
287*22dc650dSSadaf Ebrahimi ctype_word, 0,
288*22dc650dSSadaf Ebrahimi 1, 1 /* OP_ANY, OP_ALLANY */
289*22dc650dSSadaf Ebrahimi };
290*22dc650dSSadaf Ebrahimi
291*22dc650dSSadaf Ebrahimi
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints of workspace that one stateblock occupies. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
304*22dc650dSSadaf Ebrahimi
305*22dc650dSSadaf Ebrahimi
/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
local working space and output vectors that were created on the stack. This has
caused issues for some patterns, especially in small-stack environments such as
Windows. A new scheme is now in use which sets up a vector on the stack, but if
this is too small, heap memory is used, up to the heap_limit. The main
parameters are all numbers of ints because the workspace is a vector of ints.

The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
defined in pcre2_internal.h so as to be available to pcre2test when it is
finding the minimum heap requirement for a match. */

#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))   /* ints per PCRE2_SIZE */

#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
#define RWS_RSIZE       1000                    /* Work size for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases */
323*22dc650dSSadaf Ebrahimi
/* This structure is at the start of each workspace block. The blocks form a
singly linked list through the next field. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;        /* Next block in the chain, or NULL */
  uint32_t size;                  /* Number of ints */
  uint32_t free;                  /* Number of ints */
} RWS_anchor;

/* Number of ints taken up by the anchor at the start of a block. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
333*22dc650dSSadaf Ebrahimi
334*22dc650dSSadaf Ebrahimi
335*22dc650dSSadaf Ebrahimi
336*22dc650dSSadaf Ebrahimi /*************************************************
337*22dc650dSSadaf Ebrahimi * Process a callout *
338*22dc650dSSadaf Ebrahimi *************************************************/
339*22dc650dSSadaf Ebrahimi
340*22dc650dSSadaf Ebrahimi /* This function is called to perform a callout.
341*22dc650dSSadaf Ebrahimi
342*22dc650dSSadaf Ebrahimi Arguments:
343*22dc650dSSadaf Ebrahimi code current code pointer
344*22dc650dSSadaf Ebrahimi offsets points to current capture offsets
345*22dc650dSSadaf Ebrahimi current_subject start of current subject match
346*22dc650dSSadaf Ebrahimi ptr current position in subject
347*22dc650dSSadaf Ebrahimi mb the match block
348*22dc650dSSadaf Ebrahimi extracode extra code offset when called from condition
349*22dc650dSSadaf Ebrahimi lengthptr where to return the callout length
350*22dc650dSSadaf Ebrahimi
351*22dc650dSSadaf Ebrahimi Returns: the return from the callout
352*22dc650dSSadaf Ebrahimi */
353*22dc650dSSadaf Ebrahimi
354*22dc650dSSadaf Ebrahimi static int
do_callout_dfa(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)355*22dc650dSSadaf Ebrahimi do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
356*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
357*22dc650dSSadaf Ebrahimi PCRE2_SIZE *lengthptr)
358*22dc650dSSadaf Ebrahimi {
359*22dc650dSSadaf Ebrahimi pcre2_callout_block *cb = mb->cb;
360*22dc650dSSadaf Ebrahimi
361*22dc650dSSadaf Ebrahimi *lengthptr = (code[extracode] == OP_CALLOUT)?
362*22dc650dSSadaf Ebrahimi (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
363*22dc650dSSadaf Ebrahimi (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
364*22dc650dSSadaf Ebrahimi
365*22dc650dSSadaf Ebrahimi if (mb->callout == NULL) return 0; /* No callout provided */
366*22dc650dSSadaf Ebrahimi
367*22dc650dSSadaf Ebrahimi /* Fixed fields in the callout block are set once and for all at the start of
368*22dc650dSSadaf Ebrahimi matching. */
369*22dc650dSSadaf Ebrahimi
370*22dc650dSSadaf Ebrahimi cb->offset_vector = offsets;
371*22dc650dSSadaf Ebrahimi cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
372*22dc650dSSadaf Ebrahimi cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
373*22dc650dSSadaf Ebrahimi cb->pattern_position = GET(code, 1 + extracode);
374*22dc650dSSadaf Ebrahimi cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
375*22dc650dSSadaf Ebrahimi
376*22dc650dSSadaf Ebrahimi if (code[extracode] == OP_CALLOUT)
377*22dc650dSSadaf Ebrahimi {
378*22dc650dSSadaf Ebrahimi cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
379*22dc650dSSadaf Ebrahimi cb->callout_string_offset = 0;
380*22dc650dSSadaf Ebrahimi cb->callout_string = NULL;
381*22dc650dSSadaf Ebrahimi cb->callout_string_length = 0;
382*22dc650dSSadaf Ebrahimi }
383*22dc650dSSadaf Ebrahimi else
384*22dc650dSSadaf Ebrahimi {
385*22dc650dSSadaf Ebrahimi cb->callout_number = 0;
386*22dc650dSSadaf Ebrahimi cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
387*22dc650dSSadaf Ebrahimi cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
388*22dc650dSSadaf Ebrahimi cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
389*22dc650dSSadaf Ebrahimi }
390*22dc650dSSadaf Ebrahimi
391*22dc650dSSadaf Ebrahimi return (mb->callout)(cb, mb->callout_data);
392*22dc650dSSadaf Ebrahimi }
393*22dc650dSSadaf Ebrahimi
394*22dc650dSSadaf Ebrahimi
395*22dc650dSSadaf Ebrahimi
396*22dc650dSSadaf Ebrahimi /*************************************************
397*22dc650dSSadaf Ebrahimi * Expand local workspace memory *
398*22dc650dSSadaf Ebrahimi *************************************************/
399*22dc650dSSadaf Ebrahimi
400*22dc650dSSadaf Ebrahimi /* This function is called when internal_dfa_match() is about to be called
401*22dc650dSSadaf Ebrahimi recursively and there is insufficient working space left in the current
402*22dc650dSSadaf Ebrahimi workspace block. If there's an existing next block, use it; otherwise get a new
403*22dc650dSSadaf Ebrahimi block unless the heap limit is reached.
404*22dc650dSSadaf Ebrahimi
405*22dc650dSSadaf Ebrahimi Arguments:
406*22dc650dSSadaf Ebrahimi rwsptr pointer to block pointer (updated)
407*22dc650dSSadaf Ebrahimi ovecsize space needed for an ovector
408*22dc650dSSadaf Ebrahimi mb the match block
409*22dc650dSSadaf Ebrahimi
410*22dc650dSSadaf Ebrahimi Returns: 0 rwsptr has been updated
411*22dc650dSSadaf Ebrahimi !0 an error code
412*22dc650dSSadaf Ebrahimi */
413*22dc650dSSadaf Ebrahimi
414*22dc650dSSadaf Ebrahimi static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)415*22dc650dSSadaf Ebrahimi more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
416*22dc650dSSadaf Ebrahimi {
417*22dc650dSSadaf Ebrahimi RWS_anchor *rws = *rwsptr;
418*22dc650dSSadaf Ebrahimi RWS_anchor *new;
419*22dc650dSSadaf Ebrahimi
420*22dc650dSSadaf Ebrahimi if (rws->next != NULL)
421*22dc650dSSadaf Ebrahimi {
422*22dc650dSSadaf Ebrahimi new = rws->next;
423*22dc650dSSadaf Ebrahimi }
424*22dc650dSSadaf Ebrahimi
425*22dc650dSSadaf Ebrahimi /* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
426*22dc650dSSadaf Ebrahimi mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
427*22dc650dSSadaf Ebrahimi overflow. */
428*22dc650dSSadaf Ebrahimi
429*22dc650dSSadaf Ebrahimi else
430*22dc650dSSadaf Ebrahimi {
431*22dc650dSSadaf Ebrahimi uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
432*22dc650dSSadaf Ebrahimi uint32_t newsizeK = newsize/(1024/sizeof(int));
433*22dc650dSSadaf Ebrahimi
434*22dc650dSSadaf Ebrahimi if (newsizeK + mb->heap_used > mb->heap_limit)
435*22dc650dSSadaf Ebrahimi newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
436*22dc650dSSadaf Ebrahimi newsize = newsizeK*(1024/sizeof(int));
437*22dc650dSSadaf Ebrahimi
438*22dc650dSSadaf Ebrahimi if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
439*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_HEAPLIMIT;
440*22dc650dSSadaf Ebrahimi new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
441*22dc650dSSadaf Ebrahimi if (new == NULL) return PCRE2_ERROR_NOMEMORY;
442*22dc650dSSadaf Ebrahimi mb->heap_used += newsizeK;
443*22dc650dSSadaf Ebrahimi new->next = NULL;
444*22dc650dSSadaf Ebrahimi new->size = newsize;
445*22dc650dSSadaf Ebrahimi rws->next = new;
446*22dc650dSSadaf Ebrahimi }
447*22dc650dSSadaf Ebrahimi
448*22dc650dSSadaf Ebrahimi new->free = new->size - RWS_ANCHOR_SIZE;
449*22dc650dSSadaf Ebrahimi *rwsptr = new;
450*22dc650dSSadaf Ebrahimi return 0;
451*22dc650dSSadaf Ebrahimi }
452*22dc650dSSadaf Ebrahimi
453*22dc650dSSadaf Ebrahimi
454*22dc650dSSadaf Ebrahimi
/*************************************************
*     Match a Regular Expression - DFA engine    *
*************************************************/

/* This internal function applies a compiled pattern to a subject string,
starting at a given point, using a DFA engine. This function is called from the
external one, possibly multiple times if the pattern is not anchored. The
function calls itself recursively for some kinds of subpattern.

Arguments:
  mb                the match_data block with fixed information
  this_start_code   the opening bracket of this subexpression's code
  current_subject   where we currently are in the subject string
  start_offset      start offset in the subject string
  offsets           vector to contain the matching string offsets
  offsetcount       size of same
  workspace         vector of workspace
  wscount           size of same
  rlevel            function call recursion level

Returns:            > 0 => number of match offset pairs placed in offsets
                    = 0 => offsets overflowed; longest matches are present
                     -1 => failed to match
                   < -1 => some kind of unexpected problem

The following macros are used for adding states to the two state vectors (one
for the current character, one for the following character). */
482*22dc650dSSadaf Ebrahimi
483*22dc650dSSadaf Ebrahimi #define ADD_ACTIVE(x,y) \
484*22dc650dSSadaf Ebrahimi if (active_count++ < wscount) \
485*22dc650dSSadaf Ebrahimi { \
486*22dc650dSSadaf Ebrahimi next_active_state->offset = (x); \
487*22dc650dSSadaf Ebrahimi next_active_state->count = (y); \
488*22dc650dSSadaf Ebrahimi next_active_state++; \
489*22dc650dSSadaf Ebrahimi } \
490*22dc650dSSadaf Ebrahimi else return PCRE2_ERROR_DFA_WSSIZE
491*22dc650dSSadaf Ebrahimi
492*22dc650dSSadaf Ebrahimi #define ADD_ACTIVE_DATA(x,y,z) \
493*22dc650dSSadaf Ebrahimi if (active_count++ < wscount) \
494*22dc650dSSadaf Ebrahimi { \
495*22dc650dSSadaf Ebrahimi next_active_state->offset = (x); \
496*22dc650dSSadaf Ebrahimi next_active_state->count = (y); \
497*22dc650dSSadaf Ebrahimi next_active_state->data = (z); \
498*22dc650dSSadaf Ebrahimi next_active_state++; \
499*22dc650dSSadaf Ebrahimi } \
500*22dc650dSSadaf Ebrahimi else return PCRE2_ERROR_DFA_WSSIZE
501*22dc650dSSadaf Ebrahimi
502*22dc650dSSadaf Ebrahimi #define ADD_NEW(x,y) \
503*22dc650dSSadaf Ebrahimi if (new_count++ < wscount) \
504*22dc650dSSadaf Ebrahimi { \
505*22dc650dSSadaf Ebrahimi next_new_state->offset = (x); \
506*22dc650dSSadaf Ebrahimi next_new_state->count = (y); \
507*22dc650dSSadaf Ebrahimi next_new_state++; \
508*22dc650dSSadaf Ebrahimi } \
509*22dc650dSSadaf Ebrahimi else return PCRE2_ERROR_DFA_WSSIZE
510*22dc650dSSadaf Ebrahimi
/* ADD_NEW_DATA(x,y,z): append a state, including its data field, to the
new-state vector.

If the post-incremented new_count is still below wscount, the slot at
next_new_state gets offset = x, count = y and data = z, and next_new_state is
advanced by one. Otherwise the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE.

The macro relies on new_count, wscount and next_new_state being in scope at the
point of use. The expansion ends in "else return ..." with no semicolon; the
semicolon written after the invocation completes the statement. */
511*22dc650dSSadaf Ebrahimi #define ADD_NEW_DATA(x,y,z) \
512*22dc650dSSadaf Ebrahimi if (new_count++ < wscount) \
513*22dc650dSSadaf Ebrahimi { \
514*22dc650dSSadaf Ebrahimi next_new_state->offset = (x); \
515*22dc650dSSadaf Ebrahimi next_new_state->count = (y); \
516*22dc650dSSadaf Ebrahimi next_new_state->data = (z); \
517*22dc650dSSadaf Ebrahimi next_new_state++; \
518*22dc650dSSadaf Ebrahimi } \
519*22dc650dSSadaf Ebrahimi else return PCRE2_ERROR_DFA_WSSIZE
520*22dc650dSSadaf Ebrahimi
521*22dc650dSSadaf Ebrahimi /* And now, here is the code */
522*22dc650dSSadaf Ebrahimi
523*22dc650dSSadaf Ebrahimi static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)524*22dc650dSSadaf Ebrahimi internal_dfa_match(
525*22dc650dSSadaf Ebrahimi dfa_match_block *mb,
526*22dc650dSSadaf Ebrahimi PCRE2_SPTR this_start_code,
527*22dc650dSSadaf Ebrahimi PCRE2_SPTR current_subject,
528*22dc650dSSadaf Ebrahimi PCRE2_SIZE start_offset,
529*22dc650dSSadaf Ebrahimi PCRE2_SIZE *offsets,
530*22dc650dSSadaf Ebrahimi uint32_t offsetcount,
531*22dc650dSSadaf Ebrahimi int *workspace,
532*22dc650dSSadaf Ebrahimi int wscount,
533*22dc650dSSadaf Ebrahimi uint32_t rlevel,
534*22dc650dSSadaf Ebrahimi int *RWS)
535*22dc650dSSadaf Ebrahimi {
536*22dc650dSSadaf Ebrahimi stateblock *active_states, *new_states, *temp_states;
537*22dc650dSSadaf Ebrahimi stateblock *next_active_state, *next_new_state;
538*22dc650dSSadaf Ebrahimi const uint8_t *ctypes, *lcc, *fcc;
539*22dc650dSSadaf Ebrahimi PCRE2_SPTR ptr;
540*22dc650dSSadaf Ebrahimi PCRE2_SPTR end_code;
541*22dc650dSSadaf Ebrahimi dfa_recursion_info new_recursive;
542*22dc650dSSadaf Ebrahimi int active_count, new_count, match_count;
543*22dc650dSSadaf Ebrahimi
544*22dc650dSSadaf Ebrahimi /* Some fields in the mb block are frequently referenced, so we load them into
545*22dc650dSSadaf Ebrahimi independent variables in the hope that this will perform better. */
546*22dc650dSSadaf Ebrahimi
547*22dc650dSSadaf Ebrahimi PCRE2_SPTR start_subject = mb->start_subject;
548*22dc650dSSadaf Ebrahimi PCRE2_SPTR end_subject = mb->end_subject;
549*22dc650dSSadaf Ebrahimi PCRE2_SPTR start_code = mb->start_code;
550*22dc650dSSadaf Ebrahimi
551*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
552*22dc650dSSadaf Ebrahimi BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
553*22dc650dSSadaf Ebrahimi BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
554*22dc650dSSadaf Ebrahimi #else
555*22dc650dSSadaf Ebrahimi BOOL utf = FALSE;
556*22dc650dSSadaf Ebrahimi #endif
557*22dc650dSSadaf Ebrahimi
558*22dc650dSSadaf Ebrahimi BOOL reset_could_continue = FALSE;
559*22dc650dSSadaf Ebrahimi
560*22dc650dSSadaf Ebrahimi if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
561*22dc650dSSadaf Ebrahimi if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
562*22dc650dSSadaf Ebrahimi offsetcount &= (uint32_t)(-2); /* Round down */
563*22dc650dSSadaf Ebrahimi
564*22dc650dSSadaf Ebrahimi wscount -= 2;
565*22dc650dSSadaf Ebrahimi wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
566*22dc650dSSadaf Ebrahimi (2 * INTS_PER_STATEBLOCK);
567*22dc650dSSadaf Ebrahimi
568*22dc650dSSadaf Ebrahimi ctypes = mb->tables + ctypes_offset;
569*22dc650dSSadaf Ebrahimi lcc = mb->tables + lcc_offset;
570*22dc650dSSadaf Ebrahimi fcc = mb->tables + fcc_offset;
571*22dc650dSSadaf Ebrahimi
572*22dc650dSSadaf Ebrahimi match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
573*22dc650dSSadaf Ebrahimi
574*22dc650dSSadaf Ebrahimi active_states = (stateblock *)(workspace + 2);
575*22dc650dSSadaf Ebrahimi next_new_state = new_states = active_states + wscount;
576*22dc650dSSadaf Ebrahimi new_count = 0;
577*22dc650dSSadaf Ebrahimi
578*22dc650dSSadaf Ebrahimi /* The first thing in any (sub) pattern is a bracket of some sort. Push all
579*22dc650dSSadaf Ebrahimi the alternative states onto the list, and find out where the end is. This
580*22dc650dSSadaf Ebrahimi makes it possible to use this function recursively, when we want to stop at a
581*22dc650dSSadaf Ebrahimi matching internal ket rather than at the end.
582*22dc650dSSadaf Ebrahimi
583*22dc650dSSadaf Ebrahimi If we are dealing with a backward assertion we have to find out the maximum
584*22dc650dSSadaf Ebrahimi amount to move back, and set up each alternative appropriately. */
585*22dc650dSSadaf Ebrahimi
586*22dc650dSSadaf Ebrahimi if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
587*22dc650dSSadaf Ebrahimi {
588*22dc650dSSadaf Ebrahimi size_t max_back = 0;
589*22dc650dSSadaf Ebrahimi size_t gone_back;
590*22dc650dSSadaf Ebrahimi
591*22dc650dSSadaf Ebrahimi end_code = this_start_code;
592*22dc650dSSadaf Ebrahimi do
593*22dc650dSSadaf Ebrahimi {
594*22dc650dSSadaf Ebrahimi size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
595*22dc650dSSadaf Ebrahimi if (back > max_back) max_back = back;
596*22dc650dSSadaf Ebrahimi end_code += GET(end_code, 1);
597*22dc650dSSadaf Ebrahimi }
598*22dc650dSSadaf Ebrahimi while (*end_code == OP_ALT);
599*22dc650dSSadaf Ebrahimi
600*22dc650dSSadaf Ebrahimi /* If we can't go back the amount required for the longest lookbehind
601*22dc650dSSadaf Ebrahimi pattern, go back as far as we can; some alternatives may still be viable. */
602*22dc650dSSadaf Ebrahimi
603*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
604*22dc650dSSadaf Ebrahimi /* In character mode we have to step back character by character */
605*22dc650dSSadaf Ebrahimi
606*22dc650dSSadaf Ebrahimi if (utf)
607*22dc650dSSadaf Ebrahimi {
608*22dc650dSSadaf Ebrahimi for (gone_back = 0; gone_back < max_back; gone_back++)
609*22dc650dSSadaf Ebrahimi {
610*22dc650dSSadaf Ebrahimi if (current_subject <= start_subject) break;
611*22dc650dSSadaf Ebrahimi current_subject--;
612*22dc650dSSadaf Ebrahimi ACROSSCHAR(current_subject > start_subject, current_subject,
613*22dc650dSSadaf Ebrahimi current_subject--);
614*22dc650dSSadaf Ebrahimi }
615*22dc650dSSadaf Ebrahimi }
616*22dc650dSSadaf Ebrahimi else
617*22dc650dSSadaf Ebrahimi #endif
618*22dc650dSSadaf Ebrahimi
619*22dc650dSSadaf Ebrahimi /* In byte-mode we can do this quickly. */
620*22dc650dSSadaf Ebrahimi
621*22dc650dSSadaf Ebrahimi {
622*22dc650dSSadaf Ebrahimi size_t current_offset = (size_t)(current_subject - start_subject);
623*22dc650dSSadaf Ebrahimi gone_back = (current_offset < max_back)? current_offset : max_back;
624*22dc650dSSadaf Ebrahimi current_subject -= gone_back;
625*22dc650dSSadaf Ebrahimi }
626*22dc650dSSadaf Ebrahimi
627*22dc650dSSadaf Ebrahimi /* Save the earliest consulted character */
628*22dc650dSSadaf Ebrahimi
629*22dc650dSSadaf Ebrahimi if (current_subject < mb->start_used_ptr)
630*22dc650dSSadaf Ebrahimi mb->start_used_ptr = current_subject;
631*22dc650dSSadaf Ebrahimi
632*22dc650dSSadaf Ebrahimi /* Now we can process the individual branches. There will be an OP_REVERSE at
633*22dc650dSSadaf Ebrahimi the start of each branch, except when the length of the branch is zero. */
634*22dc650dSSadaf Ebrahimi
635*22dc650dSSadaf Ebrahimi end_code = this_start_code;
636*22dc650dSSadaf Ebrahimi do
637*22dc650dSSadaf Ebrahimi {
638*22dc650dSSadaf Ebrahimi uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
639*22dc650dSSadaf Ebrahimi size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
640*22dc650dSSadaf Ebrahimi if (back <= gone_back)
641*22dc650dSSadaf Ebrahimi {
642*22dc650dSSadaf Ebrahimi int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
643*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
644*22dc650dSSadaf Ebrahimi }
645*22dc650dSSadaf Ebrahimi end_code += GET(end_code, 1);
646*22dc650dSSadaf Ebrahimi }
647*22dc650dSSadaf Ebrahimi while (*end_code == OP_ALT);
648*22dc650dSSadaf Ebrahimi }
649*22dc650dSSadaf Ebrahimi
650*22dc650dSSadaf Ebrahimi /* This is the code for a "normal" subpattern (not a backward assertion). The
651*22dc650dSSadaf Ebrahimi start of a whole pattern is always one of these. If we are at the top level,
652*22dc650dSSadaf Ebrahimi we may be asked to restart matching from the same point that we reached for a
653*22dc650dSSadaf Ebrahimi previous partial match. We still have to scan through the top-level branches to
654*22dc650dSSadaf Ebrahimi find the end state. */
655*22dc650dSSadaf Ebrahimi
656*22dc650dSSadaf Ebrahimi else
657*22dc650dSSadaf Ebrahimi {
658*22dc650dSSadaf Ebrahimi end_code = this_start_code;
659*22dc650dSSadaf Ebrahimi
660*22dc650dSSadaf Ebrahimi /* Restarting */
661*22dc650dSSadaf Ebrahimi
662*22dc650dSSadaf Ebrahimi if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
663*22dc650dSSadaf Ebrahimi {
664*22dc650dSSadaf Ebrahimi do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
665*22dc650dSSadaf Ebrahimi new_count = workspace[1];
666*22dc650dSSadaf Ebrahimi if (!workspace[0])
667*22dc650dSSadaf Ebrahimi memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
668*22dc650dSSadaf Ebrahimi }
669*22dc650dSSadaf Ebrahimi
670*22dc650dSSadaf Ebrahimi /* Not restarting */
671*22dc650dSSadaf Ebrahimi
672*22dc650dSSadaf Ebrahimi else
673*22dc650dSSadaf Ebrahimi {
674*22dc650dSSadaf Ebrahimi int length = 1 + LINK_SIZE +
675*22dc650dSSadaf Ebrahimi ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
676*22dc650dSSadaf Ebrahimi *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
677*22dc650dSSadaf Ebrahimi ? IMM2_SIZE:0);
678*22dc650dSSadaf Ebrahimi do
679*22dc650dSSadaf Ebrahimi {
680*22dc650dSSadaf Ebrahimi ADD_NEW((int)(end_code - start_code + length), 0);
681*22dc650dSSadaf Ebrahimi end_code += GET(end_code, 1);
682*22dc650dSSadaf Ebrahimi length = 1 + LINK_SIZE;
683*22dc650dSSadaf Ebrahimi }
684*22dc650dSSadaf Ebrahimi while (*end_code == OP_ALT);
685*22dc650dSSadaf Ebrahimi }
686*22dc650dSSadaf Ebrahimi }
687*22dc650dSSadaf Ebrahimi
688*22dc650dSSadaf Ebrahimi workspace[0] = 0; /* Bit indicating which vector is current */
689*22dc650dSSadaf Ebrahimi
690*22dc650dSSadaf Ebrahimi /* Loop for scanning the subject */
691*22dc650dSSadaf Ebrahimi
692*22dc650dSSadaf Ebrahimi ptr = current_subject;
693*22dc650dSSadaf Ebrahimi for (;;)
694*22dc650dSSadaf Ebrahimi {
695*22dc650dSSadaf Ebrahimi int i, j;
696*22dc650dSSadaf Ebrahimi int clen, dlen;
697*22dc650dSSadaf Ebrahimi uint32_t c, d;
698*22dc650dSSadaf Ebrahimi int forced_fail = 0;
699*22dc650dSSadaf Ebrahimi BOOL partial_newline = FALSE;
700*22dc650dSSadaf Ebrahimi BOOL could_continue = reset_could_continue;
701*22dc650dSSadaf Ebrahimi reset_could_continue = FALSE;
702*22dc650dSSadaf Ebrahimi
703*22dc650dSSadaf Ebrahimi if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
704*22dc650dSSadaf Ebrahimi
705*22dc650dSSadaf Ebrahimi /* Make the new state list into the active state list and empty the
706*22dc650dSSadaf Ebrahimi new state list. */
707*22dc650dSSadaf Ebrahimi
708*22dc650dSSadaf Ebrahimi temp_states = active_states;
709*22dc650dSSadaf Ebrahimi active_states = new_states;
710*22dc650dSSadaf Ebrahimi new_states = temp_states;
711*22dc650dSSadaf Ebrahimi active_count = new_count;
712*22dc650dSSadaf Ebrahimi new_count = 0;
713*22dc650dSSadaf Ebrahimi
714*22dc650dSSadaf Ebrahimi workspace[0] ^= 1; /* Remember for the restarting feature */
715*22dc650dSSadaf Ebrahimi workspace[1] = active_count;
716*22dc650dSSadaf Ebrahimi
717*22dc650dSSadaf Ebrahimi /* Set the pointers for adding new states */
718*22dc650dSSadaf Ebrahimi
719*22dc650dSSadaf Ebrahimi next_active_state = active_states + active_count;
720*22dc650dSSadaf Ebrahimi next_new_state = new_states;
721*22dc650dSSadaf Ebrahimi
722*22dc650dSSadaf Ebrahimi /* Load the current character from the subject outside the loop, as many
723*22dc650dSSadaf Ebrahimi different states may want to look at it, and we assume that at least one
724*22dc650dSSadaf Ebrahimi will. */
725*22dc650dSSadaf Ebrahimi
726*22dc650dSSadaf Ebrahimi if (ptr < end_subject)
727*22dc650dSSadaf Ebrahimi {
728*22dc650dSSadaf Ebrahimi clen = 1; /* Number of data items in the character */
729*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
730*22dc650dSSadaf Ebrahimi GETCHARLENTEST(c, ptr, clen);
731*22dc650dSSadaf Ebrahimi #else
732*22dc650dSSadaf Ebrahimi c = *ptr;
733*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
734*22dc650dSSadaf Ebrahimi }
735*22dc650dSSadaf Ebrahimi else
736*22dc650dSSadaf Ebrahimi {
737*22dc650dSSadaf Ebrahimi clen = 0; /* This indicates the end of the subject */
738*22dc650dSSadaf Ebrahimi c = NOTACHAR; /* This value should never actually be used */
739*22dc650dSSadaf Ebrahimi }
740*22dc650dSSadaf Ebrahimi
741*22dc650dSSadaf Ebrahimi /* Scan up the active states and act on each one. The result of an action
742*22dc650dSSadaf Ebrahimi may be to add more states to the currently active list (e.g. on hitting a
743*22dc650dSSadaf Ebrahimi parenthesis) or it may be to put states on the new list, for considering
744*22dc650dSSadaf Ebrahimi when we move the character pointer on. */
745*22dc650dSSadaf Ebrahimi
746*22dc650dSSadaf Ebrahimi for (i = 0; i < active_count; i++)
747*22dc650dSSadaf Ebrahimi {
748*22dc650dSSadaf Ebrahimi stateblock *current_state = active_states + i;
749*22dc650dSSadaf Ebrahimi BOOL caseless = FALSE;
750*22dc650dSSadaf Ebrahimi PCRE2_SPTR code;
751*22dc650dSSadaf Ebrahimi uint32_t codevalue;
752*22dc650dSSadaf Ebrahimi int state_offset = current_state->offset;
753*22dc650dSSadaf Ebrahimi int rrc;
754*22dc650dSSadaf Ebrahimi int count;
755*22dc650dSSadaf Ebrahimi
756*22dc650dSSadaf Ebrahimi /* A negative offset is a special case meaning "hold off going to this
757*22dc650dSSadaf Ebrahimi (negated) state until the number of characters in the data field have
758*22dc650dSSadaf Ebrahimi been skipped". If the could_continue flag was passed over from a previous
759*22dc650dSSadaf Ebrahimi state, arrange for it to be passed on. */
760*22dc650dSSadaf Ebrahimi
761*22dc650dSSadaf Ebrahimi if (state_offset < 0)
762*22dc650dSSadaf Ebrahimi {
763*22dc650dSSadaf Ebrahimi if (current_state->data > 0)
764*22dc650dSSadaf Ebrahimi {
765*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(state_offset, current_state->count,
766*22dc650dSSadaf Ebrahimi current_state->data - 1);
767*22dc650dSSadaf Ebrahimi if (could_continue) reset_could_continue = TRUE;
768*22dc650dSSadaf Ebrahimi continue;
769*22dc650dSSadaf Ebrahimi }
770*22dc650dSSadaf Ebrahimi else
771*22dc650dSSadaf Ebrahimi {
772*22dc650dSSadaf Ebrahimi current_state->offset = state_offset = -state_offset;
773*22dc650dSSadaf Ebrahimi }
774*22dc650dSSadaf Ebrahimi }
775*22dc650dSSadaf Ebrahimi
776*22dc650dSSadaf Ebrahimi /* Check for a duplicate state with the same count, and skip if found.
777*22dc650dSSadaf Ebrahimi See the note at the head of this module about the possibility of improving
778*22dc650dSSadaf Ebrahimi performance here. */
779*22dc650dSSadaf Ebrahimi
780*22dc650dSSadaf Ebrahimi for (j = 0; j < i; j++)
781*22dc650dSSadaf Ebrahimi {
782*22dc650dSSadaf Ebrahimi if (active_states[j].offset == state_offset &&
783*22dc650dSSadaf Ebrahimi active_states[j].count == current_state->count)
784*22dc650dSSadaf Ebrahimi goto NEXT_ACTIVE_STATE;
785*22dc650dSSadaf Ebrahimi }
786*22dc650dSSadaf Ebrahimi
787*22dc650dSSadaf Ebrahimi /* The state offset is the offset to the opcode */
788*22dc650dSSadaf Ebrahimi
789*22dc650dSSadaf Ebrahimi code = start_code + state_offset;
790*22dc650dSSadaf Ebrahimi codevalue = *code;
791*22dc650dSSadaf Ebrahimi
792*22dc650dSSadaf Ebrahimi /* If this opcode inspects a character, but we are at the end of the
793*22dc650dSSadaf Ebrahimi subject, remember the fact for use when testing for a partial match. */
794*22dc650dSSadaf Ebrahimi
795*22dc650dSSadaf Ebrahimi if (clen == 0 && poptable[codevalue] != 0)
796*22dc650dSSadaf Ebrahimi could_continue = TRUE;
797*22dc650dSSadaf Ebrahimi
798*22dc650dSSadaf Ebrahimi /* If this opcode is followed by an inline character, load it. It is
799*22dc650dSSadaf Ebrahimi tempting to test for the presence of a subject character here, but that
800*22dc650dSSadaf Ebrahimi is wrong, because sometimes zero repetitions of the subject are
801*22dc650dSSadaf Ebrahimi permitted.
802*22dc650dSSadaf Ebrahimi
803*22dc650dSSadaf Ebrahimi We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
804*22dc650dSSadaf Ebrahimi argument that is not a data character - but is always one byte long because
805*22dc650dSSadaf Ebrahimi the values are small. We have to take special action to deal with \P, \p,
806*22dc650dSSadaf Ebrahimi \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
807*22dc650dSSadaf Ebrahimi these ones to new opcodes. */
808*22dc650dSSadaf Ebrahimi
809*22dc650dSSadaf Ebrahimi if (coptable[codevalue] > 0)
810*22dc650dSSadaf Ebrahimi {
811*22dc650dSSadaf Ebrahimi dlen = 1;
812*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
813*22dc650dSSadaf Ebrahimi if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
814*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
815*22dc650dSSadaf Ebrahimi d = code[coptable[codevalue]];
816*22dc650dSSadaf Ebrahimi if (codevalue >= OP_TYPESTAR)
817*22dc650dSSadaf Ebrahimi {
818*22dc650dSSadaf Ebrahimi switch(d)
819*22dc650dSSadaf Ebrahimi {
820*22dc650dSSadaf Ebrahimi case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
821*22dc650dSSadaf Ebrahimi case OP_NOTPROP:
822*22dc650dSSadaf Ebrahimi case OP_PROP: codevalue += OP_PROP_EXTRA; break;
823*22dc650dSSadaf Ebrahimi case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
824*22dc650dSSadaf Ebrahimi case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
825*22dc650dSSadaf Ebrahimi case OP_NOT_HSPACE:
826*22dc650dSSadaf Ebrahimi case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
827*22dc650dSSadaf Ebrahimi case OP_NOT_VSPACE:
828*22dc650dSSadaf Ebrahimi case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
829*22dc650dSSadaf Ebrahimi default: break;
830*22dc650dSSadaf Ebrahimi }
831*22dc650dSSadaf Ebrahimi }
832*22dc650dSSadaf Ebrahimi }
833*22dc650dSSadaf Ebrahimi else
834*22dc650dSSadaf Ebrahimi {
835*22dc650dSSadaf Ebrahimi dlen = 0; /* Not strictly necessary, but compilers moan */
836*22dc650dSSadaf Ebrahimi d = NOTACHAR; /* if these variables are not set. */
837*22dc650dSSadaf Ebrahimi }
838*22dc650dSSadaf Ebrahimi
839*22dc650dSSadaf Ebrahimi
840*22dc650dSSadaf Ebrahimi /* Now process the individual opcodes */
841*22dc650dSSadaf Ebrahimi
842*22dc650dSSadaf Ebrahimi switch (codevalue)
843*22dc650dSSadaf Ebrahimi {
844*22dc650dSSadaf Ebrahimi /* ========================================================================== */
845*22dc650dSSadaf Ebrahimi /* These cases are never obeyed. This is a fudge that causes a compile-
846*22dc650dSSadaf Ebrahimi time error if the vectors coptable or poptable, which are indexed by
847*22dc650dSSadaf Ebrahimi opcode, are not the correct length. It seems to be the only way to do
848*22dc650dSSadaf Ebrahimi such a check at compile time, as the sizeof() operator does not work
849*22dc650dSSadaf Ebrahimi in the C preprocessor. */
850*22dc650dSSadaf Ebrahimi
851*22dc650dSSadaf Ebrahimi case OP_TABLE_LENGTH:
852*22dc650dSSadaf Ebrahimi case OP_TABLE_LENGTH +
853*22dc650dSSadaf Ebrahimi ((sizeof(coptable) == OP_TABLE_LENGTH) &&
854*22dc650dSSadaf Ebrahimi (sizeof(poptable) == OP_TABLE_LENGTH)):
855*22dc650dSSadaf Ebrahimi return 0;
856*22dc650dSSadaf Ebrahimi
857*22dc650dSSadaf Ebrahimi /* ========================================================================== */
858*22dc650dSSadaf Ebrahimi /* Reached a closing bracket. If not at the end of the pattern, carry
859*22dc650dSSadaf Ebrahimi on with the next opcode. For repeating opcodes, also add the repeat
860*22dc650dSSadaf Ebrahimi state. Note that KETRPOS will always be encountered at the end of the
861*22dc650dSSadaf Ebrahimi subpattern, because the possessive subpattern repeats are always handled
862*22dc650dSSadaf Ebrahimi using recursive calls. Thus, it never adds any new states.
863*22dc650dSSadaf Ebrahimi
864*22dc650dSSadaf Ebrahimi At the end of the (sub)pattern, unless we have an empty string and
865*22dc650dSSadaf Ebrahimi PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
866*22dc650dSSadaf Ebrahimi start of the subject, save the match data, shifting up all previous
867*22dc650dSSadaf Ebrahimi matches so we always have the longest first. */
868*22dc650dSSadaf Ebrahimi
869*22dc650dSSadaf Ebrahimi case OP_KET:
870*22dc650dSSadaf Ebrahimi case OP_KETRMIN:
871*22dc650dSSadaf Ebrahimi case OP_KETRMAX:
872*22dc650dSSadaf Ebrahimi case OP_KETRPOS:
873*22dc650dSSadaf Ebrahimi if (code != end_code)
874*22dc650dSSadaf Ebrahimi {
875*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
876*22dc650dSSadaf Ebrahimi if (codevalue != OP_KET)
877*22dc650dSSadaf Ebrahimi {
878*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
879*22dc650dSSadaf Ebrahimi }
880*22dc650dSSadaf Ebrahimi }
881*22dc650dSSadaf Ebrahimi else
882*22dc650dSSadaf Ebrahimi {
883*22dc650dSSadaf Ebrahimi if (ptr > current_subject ||
884*22dc650dSSadaf Ebrahimi ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
885*22dc650dSSadaf Ebrahimi ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
886*22dc650dSSadaf Ebrahimi current_subject > start_subject + mb->start_offset)))
887*22dc650dSSadaf Ebrahimi {
888*22dc650dSSadaf Ebrahimi if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
889*22dc650dSSadaf Ebrahimi else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
890*22dc650dSSadaf Ebrahimi match_count = 0;
891*22dc650dSSadaf Ebrahimi count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
892*22dc650dSSadaf Ebrahimi if (count > 0) (void)memmove(offsets + 2, offsets,
893*22dc650dSSadaf Ebrahimi (size_t)count * sizeof(PCRE2_SIZE));
894*22dc650dSSadaf Ebrahimi if (offsetcount >= 2)
895*22dc650dSSadaf Ebrahimi {
896*22dc650dSSadaf Ebrahimi offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
897*22dc650dSSadaf Ebrahimi offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
898*22dc650dSSadaf Ebrahimi }
899*22dc650dSSadaf Ebrahimi if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
900*22dc650dSSadaf Ebrahimi }
901*22dc650dSSadaf Ebrahimi }
902*22dc650dSSadaf Ebrahimi break;
903*22dc650dSSadaf Ebrahimi
904*22dc650dSSadaf Ebrahimi /* ========================================================================== */
905*22dc650dSSadaf Ebrahimi /* These opcodes add to the current list of states without looking
906*22dc650dSSadaf Ebrahimi at the current character. */
907*22dc650dSSadaf Ebrahimi
908*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
909*22dc650dSSadaf Ebrahimi case OP_ALT:
910*22dc650dSSadaf Ebrahimi do { code += GET(code, 1); } while (*code == OP_ALT);
911*22dc650dSSadaf Ebrahimi ADD_ACTIVE((int)(code - start_code), 0);
912*22dc650dSSadaf Ebrahimi break;
913*22dc650dSSadaf Ebrahimi
914*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
915*22dc650dSSadaf Ebrahimi case OP_BRA:
916*22dc650dSSadaf Ebrahimi case OP_SBRA:
917*22dc650dSSadaf Ebrahimi do
918*22dc650dSSadaf Ebrahimi {
919*22dc650dSSadaf Ebrahimi ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
920*22dc650dSSadaf Ebrahimi code += GET(code, 1);
921*22dc650dSSadaf Ebrahimi }
922*22dc650dSSadaf Ebrahimi while (*code == OP_ALT);
923*22dc650dSSadaf Ebrahimi break;
924*22dc650dSSadaf Ebrahimi
925*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
926*22dc650dSSadaf Ebrahimi case OP_CBRA:
927*22dc650dSSadaf Ebrahimi case OP_SCBRA:
928*22dc650dSSadaf Ebrahimi ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
929*22dc650dSSadaf Ebrahimi code += GET(code, 1);
930*22dc650dSSadaf Ebrahimi while (*code == OP_ALT)
931*22dc650dSSadaf Ebrahimi {
932*22dc650dSSadaf Ebrahimi ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
933*22dc650dSSadaf Ebrahimi code += GET(code, 1);
934*22dc650dSSadaf Ebrahimi }
935*22dc650dSSadaf Ebrahimi break;
936*22dc650dSSadaf Ebrahimi
937*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
938*22dc650dSSadaf Ebrahimi case OP_BRAZERO:
939*22dc650dSSadaf Ebrahimi case OP_BRAMINZERO:
940*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 1, 0);
941*22dc650dSSadaf Ebrahimi code += 1 + GET(code, 2);
942*22dc650dSSadaf Ebrahimi while (*code == OP_ALT) code += GET(code, 1);
943*22dc650dSSadaf Ebrahimi ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
944*22dc650dSSadaf Ebrahimi break;
945*22dc650dSSadaf Ebrahimi
946*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
947*22dc650dSSadaf Ebrahimi case OP_SKIPZERO:
948*22dc650dSSadaf Ebrahimi code += 1 + GET(code, 2);
949*22dc650dSSadaf Ebrahimi while (*code == OP_ALT) code += GET(code, 1);
950*22dc650dSSadaf Ebrahimi ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
951*22dc650dSSadaf Ebrahimi break;
952*22dc650dSSadaf Ebrahimi
953*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
954*22dc650dSSadaf Ebrahimi case OP_CIRC:
955*22dc650dSSadaf Ebrahimi if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
956*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 1, 0); }
957*22dc650dSSadaf Ebrahimi break;
958*22dc650dSSadaf Ebrahimi
959*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
960*22dc650dSSadaf Ebrahimi case OP_CIRCM:
961*22dc650dSSadaf Ebrahimi if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
962*22dc650dSSadaf Ebrahimi ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
963*22dc650dSSadaf Ebrahimi && WAS_NEWLINE(ptr)))
964*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 1, 0); }
965*22dc650dSSadaf Ebrahimi break;
966*22dc650dSSadaf Ebrahimi
967*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
968*22dc650dSSadaf Ebrahimi case OP_EOD:
969*22dc650dSSadaf Ebrahimi if (ptr >= end_subject)
970*22dc650dSSadaf Ebrahimi {
971*22dc650dSSadaf Ebrahimi if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
972*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_PARTIAL;
973*22dc650dSSadaf Ebrahimi else { ADD_ACTIVE(state_offset + 1, 0); }
974*22dc650dSSadaf Ebrahimi }
975*22dc650dSSadaf Ebrahimi break;
976*22dc650dSSadaf Ebrahimi
977*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
978*22dc650dSSadaf Ebrahimi case OP_SOD:
979*22dc650dSSadaf Ebrahimi if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
980*22dc650dSSadaf Ebrahimi break;
981*22dc650dSSadaf Ebrahimi
982*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
983*22dc650dSSadaf Ebrahimi case OP_SOM:
984*22dc650dSSadaf Ebrahimi if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
985*22dc650dSSadaf Ebrahimi break;
986*22dc650dSSadaf Ebrahimi
987*22dc650dSSadaf Ebrahimi
988*22dc650dSSadaf Ebrahimi /* ========================================================================== */
989*22dc650dSSadaf Ebrahimi /* These opcodes inspect the next subject character, and sometimes
990*22dc650dSSadaf Ebrahimi the previous one as well, but do not have an argument. The variable
991*22dc650dSSadaf Ebrahimi clen contains the length of the current character and is zero if we are
992*22dc650dSSadaf Ebrahimi at the end of the subject. */
993*22dc650dSSadaf Ebrahimi
994*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
995*22dc650dSSadaf Ebrahimi case OP_ANY:
996*22dc650dSSadaf Ebrahimi if (clen > 0 && !IS_NEWLINE(ptr))
997*22dc650dSSadaf Ebrahimi {
998*22dc650dSSadaf Ebrahimi if (ptr + 1 >= mb->end_subject &&
999*22dc650dSSadaf Ebrahimi (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1000*22dc650dSSadaf Ebrahimi NLBLOCK->nltype == NLTYPE_FIXED &&
1001*22dc650dSSadaf Ebrahimi NLBLOCK->nllen == 2 &&
1002*22dc650dSSadaf Ebrahimi c == NLBLOCK->nl[0])
1003*22dc650dSSadaf Ebrahimi {
1004*22dc650dSSadaf Ebrahimi could_continue = partial_newline = TRUE;
1005*22dc650dSSadaf Ebrahimi }
1006*22dc650dSSadaf Ebrahimi else
1007*22dc650dSSadaf Ebrahimi {
1008*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + 1, 0);
1009*22dc650dSSadaf Ebrahimi }
1010*22dc650dSSadaf Ebrahimi }
1011*22dc650dSSadaf Ebrahimi break;
1012*22dc650dSSadaf Ebrahimi
1013*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1014*22dc650dSSadaf Ebrahimi case OP_ALLANY:
1015*22dc650dSSadaf Ebrahimi if (clen > 0)
1016*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset + 1, 0); }
1017*22dc650dSSadaf Ebrahimi break;
1018*22dc650dSSadaf Ebrahimi
1019*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1020*22dc650dSSadaf Ebrahimi case OP_EODN:
1021*22dc650dSSadaf Ebrahimi if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1022*22dc650dSSadaf Ebrahimi {
1023*22dc650dSSadaf Ebrahimi if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1024*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_PARTIAL;
1025*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 1, 0);
1026*22dc650dSSadaf Ebrahimi }
1027*22dc650dSSadaf Ebrahimi break;
1028*22dc650dSSadaf Ebrahimi
1029*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1030*22dc650dSSadaf Ebrahimi case OP_DOLL:
1031*22dc650dSSadaf Ebrahimi if ((mb->moptions & PCRE2_NOTEOL) == 0)
1032*22dc650dSSadaf Ebrahimi {
1033*22dc650dSSadaf Ebrahimi if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1034*22dc650dSSadaf Ebrahimi could_continue = TRUE;
1035*22dc650dSSadaf Ebrahimi else if (clen == 0 ||
1036*22dc650dSSadaf Ebrahimi ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1037*22dc650dSSadaf Ebrahimi (ptr == end_subject - mb->nllen)
1038*22dc650dSSadaf Ebrahimi ))
1039*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 1, 0); }
1040*22dc650dSSadaf Ebrahimi else if (ptr + 1 >= mb->end_subject &&
1041*22dc650dSSadaf Ebrahimi (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1042*22dc650dSSadaf Ebrahimi NLBLOCK->nltype == NLTYPE_FIXED &&
1043*22dc650dSSadaf Ebrahimi NLBLOCK->nllen == 2 &&
1044*22dc650dSSadaf Ebrahimi c == NLBLOCK->nl[0])
1045*22dc650dSSadaf Ebrahimi {
1046*22dc650dSSadaf Ebrahimi if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1047*22dc650dSSadaf Ebrahimi {
1048*22dc650dSSadaf Ebrahimi reset_could_continue = TRUE;
1049*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1050*22dc650dSSadaf Ebrahimi }
1051*22dc650dSSadaf Ebrahimi else could_continue = partial_newline = TRUE;
1052*22dc650dSSadaf Ebrahimi }
1053*22dc650dSSadaf Ebrahimi }
1054*22dc650dSSadaf Ebrahimi break;
1055*22dc650dSSadaf Ebrahimi
1056*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1057*22dc650dSSadaf Ebrahimi case OP_DOLLM:
1058*22dc650dSSadaf Ebrahimi if ((mb->moptions & PCRE2_NOTEOL) == 0)
1059*22dc650dSSadaf Ebrahimi {
1060*22dc650dSSadaf Ebrahimi if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1061*22dc650dSSadaf Ebrahimi could_continue = TRUE;
1062*22dc650dSSadaf Ebrahimi else if (clen == 0 ||
1063*22dc650dSSadaf Ebrahimi ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1064*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 1, 0); }
1065*22dc650dSSadaf Ebrahimi else if (ptr + 1 >= mb->end_subject &&
1066*22dc650dSSadaf Ebrahimi (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1067*22dc650dSSadaf Ebrahimi NLBLOCK->nltype == NLTYPE_FIXED &&
1068*22dc650dSSadaf Ebrahimi NLBLOCK->nllen == 2 &&
1069*22dc650dSSadaf Ebrahimi c == NLBLOCK->nl[0])
1070*22dc650dSSadaf Ebrahimi {
1071*22dc650dSSadaf Ebrahimi if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1072*22dc650dSSadaf Ebrahimi {
1073*22dc650dSSadaf Ebrahimi reset_could_continue = TRUE;
1074*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1075*22dc650dSSadaf Ebrahimi }
1076*22dc650dSSadaf Ebrahimi else could_continue = partial_newline = TRUE;
1077*22dc650dSSadaf Ebrahimi }
1078*22dc650dSSadaf Ebrahimi }
1079*22dc650dSSadaf Ebrahimi else if (IS_NEWLINE(ptr))
1080*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 1, 0); }
1081*22dc650dSSadaf Ebrahimi break;
1082*22dc650dSSadaf Ebrahimi
1083*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1084*22dc650dSSadaf Ebrahimi
1085*22dc650dSSadaf Ebrahimi case OP_DIGIT:
1086*22dc650dSSadaf Ebrahimi case OP_WHITESPACE:
1087*22dc650dSSadaf Ebrahimi case OP_WORDCHAR:
1088*22dc650dSSadaf Ebrahimi if (clen > 0 && c < 256 &&
1089*22dc650dSSadaf Ebrahimi ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1090*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset + 1, 0); }
1091*22dc650dSSadaf Ebrahimi break;
1092*22dc650dSSadaf Ebrahimi
1093*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1094*22dc650dSSadaf Ebrahimi case OP_NOT_DIGIT:
1095*22dc650dSSadaf Ebrahimi case OP_NOT_WHITESPACE:
1096*22dc650dSSadaf Ebrahimi case OP_NOT_WORDCHAR:
1097*22dc650dSSadaf Ebrahimi if (clen > 0 && (c >= 256 ||
1098*22dc650dSSadaf Ebrahimi ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1099*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset + 1, 0); }
1100*22dc650dSSadaf Ebrahimi break;
1101*22dc650dSSadaf Ebrahimi
1102*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1103*22dc650dSSadaf Ebrahimi case OP_WORD_BOUNDARY:
1104*22dc650dSSadaf Ebrahimi case OP_NOT_WORD_BOUNDARY:
1105*22dc650dSSadaf Ebrahimi case OP_NOT_UCP_WORD_BOUNDARY:
1106*22dc650dSSadaf Ebrahimi case OP_UCP_WORD_BOUNDARY:
1107*22dc650dSSadaf Ebrahimi {
1108*22dc650dSSadaf Ebrahimi int left_word, right_word;
1109*22dc650dSSadaf Ebrahimi
1110*22dc650dSSadaf Ebrahimi if (ptr > start_subject)
1111*22dc650dSSadaf Ebrahimi {
1112*22dc650dSSadaf Ebrahimi PCRE2_SPTR temp = ptr - 1;
1113*22dc650dSSadaf Ebrahimi if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1114*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1115*22dc650dSSadaf Ebrahimi if (utf) { BACKCHAR(temp); }
1116*22dc650dSSadaf Ebrahimi #endif
1117*22dc650dSSadaf Ebrahimi GETCHARTEST(d, temp);
1118*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1119*22dc650dSSadaf Ebrahimi if (codevalue == OP_UCP_WORD_BOUNDARY ||
1120*22dc650dSSadaf Ebrahimi codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1121*22dc650dSSadaf Ebrahimi {
1122*22dc650dSSadaf Ebrahimi int chartype = UCD_CHARTYPE(d);
1123*22dc650dSSadaf Ebrahimi int category = PRIV(ucp_gentype)[chartype];
1124*22dc650dSSadaf Ebrahimi left_word = (category == ucp_L || category == ucp_N ||
1125*22dc650dSSadaf Ebrahimi chartype == ucp_Mn || chartype == ucp_Pc);
1126*22dc650dSSadaf Ebrahimi }
1127*22dc650dSSadaf Ebrahimi else
1128*22dc650dSSadaf Ebrahimi #endif
1129*22dc650dSSadaf Ebrahimi left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1130*22dc650dSSadaf Ebrahimi }
1131*22dc650dSSadaf Ebrahimi else left_word = FALSE;
1132*22dc650dSSadaf Ebrahimi
1133*22dc650dSSadaf Ebrahimi if (clen > 0)
1134*22dc650dSSadaf Ebrahimi {
1135*22dc650dSSadaf Ebrahimi if (ptr >= mb->last_used_ptr)
1136*22dc650dSSadaf Ebrahimi {
1137*22dc650dSSadaf Ebrahimi PCRE2_SPTR temp = ptr + 1;
1138*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1139*22dc650dSSadaf Ebrahimi if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1140*22dc650dSSadaf Ebrahimi #endif
1141*22dc650dSSadaf Ebrahimi mb->last_used_ptr = temp;
1142*22dc650dSSadaf Ebrahimi }
1143*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1144*22dc650dSSadaf Ebrahimi if (codevalue == OP_UCP_WORD_BOUNDARY ||
1145*22dc650dSSadaf Ebrahimi codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1146*22dc650dSSadaf Ebrahimi {
1147*22dc650dSSadaf Ebrahimi int chartype = UCD_CHARTYPE(c);
1148*22dc650dSSadaf Ebrahimi int category = PRIV(ucp_gentype)[chartype];
1149*22dc650dSSadaf Ebrahimi right_word = (category == ucp_L || category == ucp_N ||
1150*22dc650dSSadaf Ebrahimi chartype == ucp_Mn || chartype == ucp_Pc);
1151*22dc650dSSadaf Ebrahimi }
1152*22dc650dSSadaf Ebrahimi else
1153*22dc650dSSadaf Ebrahimi #endif
1154*22dc650dSSadaf Ebrahimi right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1155*22dc650dSSadaf Ebrahimi }
1156*22dc650dSSadaf Ebrahimi else right_word = FALSE;
1157*22dc650dSSadaf Ebrahimi
1158*22dc650dSSadaf Ebrahimi if ((left_word == right_word) ==
1159*22dc650dSSadaf Ebrahimi (codevalue == OP_NOT_WORD_BOUNDARY ||
1160*22dc650dSSadaf Ebrahimi codevalue == OP_NOT_UCP_WORD_BOUNDARY))
1161*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 1, 0); }
1162*22dc650dSSadaf Ebrahimi }
1163*22dc650dSSadaf Ebrahimi break;
1164*22dc650dSSadaf Ebrahimi
1165*22dc650dSSadaf Ebrahimi
1166*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1167*22dc650dSSadaf Ebrahimi /* Check the next character by Unicode property. We will get here only
1168*22dc650dSSadaf Ebrahimi if the support is in the binary; otherwise a compile-time error occurs.
1169*22dc650dSSadaf Ebrahimi */
1170*22dc650dSSadaf Ebrahimi
1171*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1172*22dc650dSSadaf Ebrahimi case OP_PROP:
1173*22dc650dSSadaf Ebrahimi case OP_NOTPROP:
1174*22dc650dSSadaf Ebrahimi if (clen > 0)
1175*22dc650dSSadaf Ebrahimi {
1176*22dc650dSSadaf Ebrahimi BOOL OK;
1177*22dc650dSSadaf Ebrahimi int chartype;
1178*22dc650dSSadaf Ebrahimi const uint32_t *cp;
1179*22dc650dSSadaf Ebrahimi const ucd_record * prop = GET_UCD(c);
1180*22dc650dSSadaf Ebrahimi switch(code[1])
1181*22dc650dSSadaf Ebrahimi {
1182*22dc650dSSadaf Ebrahimi case PT_ANY:
1183*22dc650dSSadaf Ebrahimi OK = TRUE;
1184*22dc650dSSadaf Ebrahimi break;
1185*22dc650dSSadaf Ebrahimi
1186*22dc650dSSadaf Ebrahimi case PT_LAMP:
1187*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
1188*22dc650dSSadaf Ebrahimi OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1189*22dc650dSSadaf Ebrahimi chartype == ucp_Lt;
1190*22dc650dSSadaf Ebrahimi break;
1191*22dc650dSSadaf Ebrahimi
1192*22dc650dSSadaf Ebrahimi case PT_GC:
1193*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1194*22dc650dSSadaf Ebrahimi break;
1195*22dc650dSSadaf Ebrahimi
1196*22dc650dSSadaf Ebrahimi case PT_PC:
1197*22dc650dSSadaf Ebrahimi OK = prop->chartype == code[2];
1198*22dc650dSSadaf Ebrahimi break;
1199*22dc650dSSadaf Ebrahimi
1200*22dc650dSSadaf Ebrahimi case PT_SC:
1201*22dc650dSSadaf Ebrahimi OK = prop->script == code[2];
1202*22dc650dSSadaf Ebrahimi break;
1203*22dc650dSSadaf Ebrahimi
1204*22dc650dSSadaf Ebrahimi case PT_SCX:
1205*22dc650dSSadaf Ebrahimi OK = (prop->script == code[2] ||
1206*22dc650dSSadaf Ebrahimi MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
1207*22dc650dSSadaf Ebrahimi break;
1208*22dc650dSSadaf Ebrahimi
1209*22dc650dSSadaf Ebrahimi /* These are specials for combination cases. */
1210*22dc650dSSadaf Ebrahimi
1211*22dc650dSSadaf Ebrahimi case PT_ALNUM:
1212*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
1213*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1214*22dc650dSSadaf Ebrahimi PRIV(ucp_gentype)[chartype] == ucp_N;
1215*22dc650dSSadaf Ebrahimi break;
1216*22dc650dSSadaf Ebrahimi
1217*22dc650dSSadaf Ebrahimi /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1218*22dc650dSSadaf Ebrahimi which means that Perl space and POSIX space are now identical. PCRE
1219*22dc650dSSadaf Ebrahimi was changed at release 8.34. */
1220*22dc650dSSadaf Ebrahimi
1221*22dc650dSSadaf Ebrahimi case PT_SPACE: /* Perl space */
1222*22dc650dSSadaf Ebrahimi case PT_PXSPACE: /* POSIX space */
1223*22dc650dSSadaf Ebrahimi switch(c)
1224*22dc650dSSadaf Ebrahimi {
1225*22dc650dSSadaf Ebrahimi HSPACE_CASES:
1226*22dc650dSSadaf Ebrahimi VSPACE_CASES:
1227*22dc650dSSadaf Ebrahimi OK = TRUE;
1228*22dc650dSSadaf Ebrahimi break;
1229*22dc650dSSadaf Ebrahimi
1230*22dc650dSSadaf Ebrahimi default:
1231*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1232*22dc650dSSadaf Ebrahimi break;
1233*22dc650dSSadaf Ebrahimi }
1234*22dc650dSSadaf Ebrahimi break;
1235*22dc650dSSadaf Ebrahimi
1236*22dc650dSSadaf Ebrahimi case PT_WORD:
1237*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
1238*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1239*22dc650dSSadaf Ebrahimi PRIV(ucp_gentype)[chartype] == ucp_N ||
1240*22dc650dSSadaf Ebrahimi chartype == ucp_Mn || chartype == ucp_Pc;
1241*22dc650dSSadaf Ebrahimi break;
1242*22dc650dSSadaf Ebrahimi
1243*22dc650dSSadaf Ebrahimi case PT_CLIST:
1244*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
1245*22dc650dSSadaf Ebrahimi if (c > MAX_UTF_CODE_POINT)
1246*22dc650dSSadaf Ebrahimi {
1247*22dc650dSSadaf Ebrahimi OK = FALSE;
1248*22dc650dSSadaf Ebrahimi break;
1249*22dc650dSSadaf Ebrahimi }
1250*22dc650dSSadaf Ebrahimi #endif
1251*22dc650dSSadaf Ebrahimi cp = PRIV(ucd_caseless_sets) + code[2];
1252*22dc650dSSadaf Ebrahimi for (;;)
1253*22dc650dSSadaf Ebrahimi {
1254*22dc650dSSadaf Ebrahimi if (c < *cp) { OK = FALSE; break; }
1255*22dc650dSSadaf Ebrahimi if (c == *cp++) { OK = TRUE; break; }
1256*22dc650dSSadaf Ebrahimi }
1257*22dc650dSSadaf Ebrahimi break;
1258*22dc650dSSadaf Ebrahimi
1259*22dc650dSSadaf Ebrahimi case PT_UCNC:
1260*22dc650dSSadaf Ebrahimi OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1261*22dc650dSSadaf Ebrahimi c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1262*22dc650dSSadaf Ebrahimi c >= 0xe000;
1263*22dc650dSSadaf Ebrahimi break;
1264*22dc650dSSadaf Ebrahimi
1265*22dc650dSSadaf Ebrahimi case PT_BIDICL:
1266*22dc650dSSadaf Ebrahimi OK = UCD_BIDICLASS(c) == code[2];
1267*22dc650dSSadaf Ebrahimi break;
1268*22dc650dSSadaf Ebrahimi
1269*22dc650dSSadaf Ebrahimi case PT_BOOL:
1270*22dc650dSSadaf Ebrahimi OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1271*22dc650dSSadaf Ebrahimi UCD_BPROPS_PROP(prop), code[2]) != 0;
1272*22dc650dSSadaf Ebrahimi break;
1273*22dc650dSSadaf Ebrahimi
1274*22dc650dSSadaf Ebrahimi /* Should never occur, but keep compilers from grumbling. */
1275*22dc650dSSadaf Ebrahimi
1276*22dc650dSSadaf Ebrahimi default:
1277*22dc650dSSadaf Ebrahimi OK = codevalue != OP_PROP;
1278*22dc650dSSadaf Ebrahimi break;
1279*22dc650dSSadaf Ebrahimi }
1280*22dc650dSSadaf Ebrahimi
1281*22dc650dSSadaf Ebrahimi if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1282*22dc650dSSadaf Ebrahimi }
1283*22dc650dSSadaf Ebrahimi break;
1284*22dc650dSSadaf Ebrahimi #endif
1285*22dc650dSSadaf Ebrahimi
1286*22dc650dSSadaf Ebrahimi
1287*22dc650dSSadaf Ebrahimi
1288*22dc650dSSadaf Ebrahimi /* ========================================================================== */
1289*22dc650dSSadaf Ebrahimi /* These opcodes likewise inspect the subject character, but have an
1290*22dc650dSSadaf Ebrahimi argument that is not a data character. It is one of these opcodes:
1291*22dc650dSSadaf Ebrahimi OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1292*22dc650dSSadaf Ebrahimi OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1293*22dc650dSSadaf Ebrahimi
1294*22dc650dSSadaf Ebrahimi case OP_TYPEPLUS:
1295*22dc650dSSadaf Ebrahimi case OP_TYPEMINPLUS:
1296*22dc650dSSadaf Ebrahimi case OP_TYPEPOSPLUS:
1297*22dc650dSSadaf Ebrahimi count = current_state->count; /* Already matched */
1298*22dc650dSSadaf Ebrahimi if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1299*22dc650dSSadaf Ebrahimi if (clen > 0)
1300*22dc650dSSadaf Ebrahimi {
1301*22dc650dSSadaf Ebrahimi if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1302*22dc650dSSadaf Ebrahimi (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1303*22dc650dSSadaf Ebrahimi NLBLOCK->nltype == NLTYPE_FIXED &&
1304*22dc650dSSadaf Ebrahimi NLBLOCK->nllen == 2 &&
1305*22dc650dSSadaf Ebrahimi c == NLBLOCK->nl[0])
1306*22dc650dSSadaf Ebrahimi {
1307*22dc650dSSadaf Ebrahimi could_continue = partial_newline = TRUE;
1308*22dc650dSSadaf Ebrahimi }
1309*22dc650dSSadaf Ebrahimi else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1310*22dc650dSSadaf Ebrahimi (c < 256 &&
1311*22dc650dSSadaf Ebrahimi (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1312*22dc650dSSadaf Ebrahimi ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1313*22dc650dSSadaf Ebrahimi {
1314*22dc650dSSadaf Ebrahimi if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1315*22dc650dSSadaf Ebrahimi {
1316*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1317*22dc650dSSadaf Ebrahimi next_active_state--;
1318*22dc650dSSadaf Ebrahimi }
1319*22dc650dSSadaf Ebrahimi count++;
1320*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset, count);
1321*22dc650dSSadaf Ebrahimi }
1322*22dc650dSSadaf Ebrahimi }
1323*22dc650dSSadaf Ebrahimi break;
1324*22dc650dSSadaf Ebrahimi
1325*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1326*22dc650dSSadaf Ebrahimi case OP_TYPEQUERY:
1327*22dc650dSSadaf Ebrahimi case OP_TYPEMINQUERY:
1328*22dc650dSSadaf Ebrahimi case OP_TYPEPOSQUERY:
1329*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 2, 0);
1330*22dc650dSSadaf Ebrahimi if (clen > 0)
1331*22dc650dSSadaf Ebrahimi {
1332*22dc650dSSadaf Ebrahimi if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1333*22dc650dSSadaf Ebrahimi (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1334*22dc650dSSadaf Ebrahimi NLBLOCK->nltype == NLTYPE_FIXED &&
1335*22dc650dSSadaf Ebrahimi NLBLOCK->nllen == 2 &&
1336*22dc650dSSadaf Ebrahimi c == NLBLOCK->nl[0])
1337*22dc650dSSadaf Ebrahimi {
1338*22dc650dSSadaf Ebrahimi could_continue = partial_newline = TRUE;
1339*22dc650dSSadaf Ebrahimi }
1340*22dc650dSSadaf Ebrahimi else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1341*22dc650dSSadaf Ebrahimi (c < 256 &&
1342*22dc650dSSadaf Ebrahimi (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1343*22dc650dSSadaf Ebrahimi ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1344*22dc650dSSadaf Ebrahimi {
1345*22dc650dSSadaf Ebrahimi if (codevalue == OP_TYPEPOSQUERY)
1346*22dc650dSSadaf Ebrahimi {
1347*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1348*22dc650dSSadaf Ebrahimi next_active_state--;
1349*22dc650dSSadaf Ebrahimi }
1350*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + 2, 0);
1351*22dc650dSSadaf Ebrahimi }
1352*22dc650dSSadaf Ebrahimi }
1353*22dc650dSSadaf Ebrahimi break;
1354*22dc650dSSadaf Ebrahimi
1355*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1356*22dc650dSSadaf Ebrahimi case OP_TYPESTAR:
1357*22dc650dSSadaf Ebrahimi case OP_TYPEMINSTAR:
1358*22dc650dSSadaf Ebrahimi case OP_TYPEPOSSTAR:
1359*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 2, 0);
1360*22dc650dSSadaf Ebrahimi if (clen > 0)
1361*22dc650dSSadaf Ebrahimi {
1362*22dc650dSSadaf Ebrahimi if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1363*22dc650dSSadaf Ebrahimi (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1364*22dc650dSSadaf Ebrahimi NLBLOCK->nltype == NLTYPE_FIXED &&
1365*22dc650dSSadaf Ebrahimi NLBLOCK->nllen == 2 &&
1366*22dc650dSSadaf Ebrahimi c == NLBLOCK->nl[0])
1367*22dc650dSSadaf Ebrahimi {
1368*22dc650dSSadaf Ebrahimi could_continue = partial_newline = TRUE;
1369*22dc650dSSadaf Ebrahimi }
1370*22dc650dSSadaf Ebrahimi else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1371*22dc650dSSadaf Ebrahimi (c < 256 &&
1372*22dc650dSSadaf Ebrahimi (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1373*22dc650dSSadaf Ebrahimi ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1374*22dc650dSSadaf Ebrahimi {
1375*22dc650dSSadaf Ebrahimi if (codevalue == OP_TYPEPOSSTAR)
1376*22dc650dSSadaf Ebrahimi {
1377*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1378*22dc650dSSadaf Ebrahimi next_active_state--;
1379*22dc650dSSadaf Ebrahimi }
1380*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset, 0);
1381*22dc650dSSadaf Ebrahimi }
1382*22dc650dSSadaf Ebrahimi }
1383*22dc650dSSadaf Ebrahimi break;
1384*22dc650dSSadaf Ebrahimi
1385*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1386*22dc650dSSadaf Ebrahimi case OP_TYPEEXACT:
1387*22dc650dSSadaf Ebrahimi count = current_state->count; /* Number already matched */
1388*22dc650dSSadaf Ebrahimi if (clen > 0)
1389*22dc650dSSadaf Ebrahimi {
1390*22dc650dSSadaf Ebrahimi if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1391*22dc650dSSadaf Ebrahimi (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1392*22dc650dSSadaf Ebrahimi NLBLOCK->nltype == NLTYPE_FIXED &&
1393*22dc650dSSadaf Ebrahimi NLBLOCK->nllen == 2 &&
1394*22dc650dSSadaf Ebrahimi c == NLBLOCK->nl[0])
1395*22dc650dSSadaf Ebrahimi {
1396*22dc650dSSadaf Ebrahimi could_continue = partial_newline = TRUE;
1397*22dc650dSSadaf Ebrahimi }
1398*22dc650dSSadaf Ebrahimi else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1399*22dc650dSSadaf Ebrahimi (c < 256 &&
1400*22dc650dSSadaf Ebrahimi (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1401*22dc650dSSadaf Ebrahimi ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1402*22dc650dSSadaf Ebrahimi {
1403*22dc650dSSadaf Ebrahimi if (++count >= (int)GET2(code, 1))
1404*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1405*22dc650dSSadaf Ebrahimi else
1406*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset, count); }
1407*22dc650dSSadaf Ebrahimi }
1408*22dc650dSSadaf Ebrahimi }
1409*22dc650dSSadaf Ebrahimi break;
1410*22dc650dSSadaf Ebrahimi
1411*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1412*22dc650dSSadaf Ebrahimi case OP_TYPEUPTO:
1413*22dc650dSSadaf Ebrahimi case OP_TYPEMINUPTO:
1414*22dc650dSSadaf Ebrahimi case OP_TYPEPOSUPTO:
1415*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1416*22dc650dSSadaf Ebrahimi count = current_state->count; /* Number already matched */
1417*22dc650dSSadaf Ebrahimi if (clen > 0)
1418*22dc650dSSadaf Ebrahimi {
1419*22dc650dSSadaf Ebrahimi if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1420*22dc650dSSadaf Ebrahimi (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1421*22dc650dSSadaf Ebrahimi NLBLOCK->nltype == NLTYPE_FIXED &&
1422*22dc650dSSadaf Ebrahimi NLBLOCK->nllen == 2 &&
1423*22dc650dSSadaf Ebrahimi c == NLBLOCK->nl[0])
1424*22dc650dSSadaf Ebrahimi {
1425*22dc650dSSadaf Ebrahimi could_continue = partial_newline = TRUE;
1426*22dc650dSSadaf Ebrahimi }
1427*22dc650dSSadaf Ebrahimi else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1428*22dc650dSSadaf Ebrahimi (c < 256 &&
1429*22dc650dSSadaf Ebrahimi (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1430*22dc650dSSadaf Ebrahimi ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1431*22dc650dSSadaf Ebrahimi {
1432*22dc650dSSadaf Ebrahimi if (codevalue == OP_TYPEPOSUPTO)
1433*22dc650dSSadaf Ebrahimi {
1434*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1435*22dc650dSSadaf Ebrahimi next_active_state--;
1436*22dc650dSSadaf Ebrahimi }
1437*22dc650dSSadaf Ebrahimi if (++count >= (int)GET2(code, 1))
1438*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1439*22dc650dSSadaf Ebrahimi else
1440*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset, count); }
1441*22dc650dSSadaf Ebrahimi }
1442*22dc650dSSadaf Ebrahimi }
1443*22dc650dSSadaf Ebrahimi break;
1444*22dc650dSSadaf Ebrahimi
1445*22dc650dSSadaf Ebrahimi /* ========================================================================== */
1446*22dc650dSSadaf Ebrahimi /* These are virtual opcodes that are used when something like
1447*22dc650dSSadaf Ebrahimi OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or
1448*22dc650dSSadaf Ebrahimi OP_HSPACE as its argument. It keeps the code above fast for the other
1449*22dc650dSSadaf Ebrahimi cases. The argument is in the d variable. */
1450*22dc650dSSadaf Ebrahimi
1451*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1452*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEPLUS:
1453*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1454*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1455*22dc650dSSadaf Ebrahimi count = current_state->count; /* Already matched */
1456*22dc650dSSadaf Ebrahimi if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1457*22dc650dSSadaf Ebrahimi if (clen > 0)
1458*22dc650dSSadaf Ebrahimi {
1459*22dc650dSSadaf Ebrahimi BOOL OK;
1460*22dc650dSSadaf Ebrahimi int chartype;
1461*22dc650dSSadaf Ebrahimi const uint32_t *cp;
1462*22dc650dSSadaf Ebrahimi const ucd_record * prop = GET_UCD(c);
1463*22dc650dSSadaf Ebrahimi switch(code[2])
1464*22dc650dSSadaf Ebrahimi {
1465*22dc650dSSadaf Ebrahimi case PT_ANY:
1466*22dc650dSSadaf Ebrahimi OK = TRUE;
1467*22dc650dSSadaf Ebrahimi break;
1468*22dc650dSSadaf Ebrahimi
1469*22dc650dSSadaf Ebrahimi case PT_LAMP:
1470*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
1471*22dc650dSSadaf Ebrahimi OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1472*22dc650dSSadaf Ebrahimi break;
1473*22dc650dSSadaf Ebrahimi
1474*22dc650dSSadaf Ebrahimi case PT_GC:
1475*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1476*22dc650dSSadaf Ebrahimi break;
1477*22dc650dSSadaf Ebrahimi
1478*22dc650dSSadaf Ebrahimi case PT_PC:
1479*22dc650dSSadaf Ebrahimi OK = prop->chartype == code[3];
1480*22dc650dSSadaf Ebrahimi break;
1481*22dc650dSSadaf Ebrahimi
1482*22dc650dSSadaf Ebrahimi case PT_SC:
1483*22dc650dSSadaf Ebrahimi OK = prop->script == code[3];
1484*22dc650dSSadaf Ebrahimi break;
1485*22dc650dSSadaf Ebrahimi
1486*22dc650dSSadaf Ebrahimi case PT_SCX:
1487*22dc650dSSadaf Ebrahimi OK = (prop->script == code[3] ||
1488*22dc650dSSadaf Ebrahimi MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1489*22dc650dSSadaf Ebrahimi break;
1490*22dc650dSSadaf Ebrahimi
1491*22dc650dSSadaf Ebrahimi /* These are specials for combination cases. */
1492*22dc650dSSadaf Ebrahimi
1493*22dc650dSSadaf Ebrahimi case PT_ALNUM:
1494*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
1495*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1496*22dc650dSSadaf Ebrahimi PRIV(ucp_gentype)[chartype] == ucp_N;
1497*22dc650dSSadaf Ebrahimi break;
1498*22dc650dSSadaf Ebrahimi
1499*22dc650dSSadaf Ebrahimi /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1500*22dc650dSSadaf Ebrahimi which means that Perl space and POSIX space are now identical. PCRE
1501*22dc650dSSadaf Ebrahimi was changed at release 8.34. */
1502*22dc650dSSadaf Ebrahimi
1503*22dc650dSSadaf Ebrahimi case PT_SPACE: /* Perl space */
1504*22dc650dSSadaf Ebrahimi case PT_PXSPACE: /* POSIX space */
1505*22dc650dSSadaf Ebrahimi switch(c)
1506*22dc650dSSadaf Ebrahimi {
1507*22dc650dSSadaf Ebrahimi HSPACE_CASES:
1508*22dc650dSSadaf Ebrahimi VSPACE_CASES:
1509*22dc650dSSadaf Ebrahimi OK = TRUE;
1510*22dc650dSSadaf Ebrahimi break;
1511*22dc650dSSadaf Ebrahimi
1512*22dc650dSSadaf Ebrahimi default:
1513*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1514*22dc650dSSadaf Ebrahimi break;
1515*22dc650dSSadaf Ebrahimi }
1516*22dc650dSSadaf Ebrahimi break;
1517*22dc650dSSadaf Ebrahimi
1518*22dc650dSSadaf Ebrahimi case PT_WORD:
1519*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
1520*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1521*22dc650dSSadaf Ebrahimi PRIV(ucp_gentype)[chartype] == ucp_N ||
1522*22dc650dSSadaf Ebrahimi chartype == ucp_Mn || chartype == ucp_Pc;
1523*22dc650dSSadaf Ebrahimi break;
1524*22dc650dSSadaf Ebrahimi
1525*22dc650dSSadaf Ebrahimi case PT_CLIST:
1526*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
1527*22dc650dSSadaf Ebrahimi if (c > MAX_UTF_CODE_POINT)
1528*22dc650dSSadaf Ebrahimi {
1529*22dc650dSSadaf Ebrahimi OK = FALSE;
1530*22dc650dSSadaf Ebrahimi break;
1531*22dc650dSSadaf Ebrahimi }
1532*22dc650dSSadaf Ebrahimi #endif
1533*22dc650dSSadaf Ebrahimi cp = PRIV(ucd_caseless_sets) + code[3];
1534*22dc650dSSadaf Ebrahimi for (;;)
1535*22dc650dSSadaf Ebrahimi {
1536*22dc650dSSadaf Ebrahimi if (c < *cp) { OK = FALSE; break; }
1537*22dc650dSSadaf Ebrahimi if (c == *cp++) { OK = TRUE; break; }
1538*22dc650dSSadaf Ebrahimi }
1539*22dc650dSSadaf Ebrahimi break;
1540*22dc650dSSadaf Ebrahimi
1541*22dc650dSSadaf Ebrahimi case PT_UCNC:
1542*22dc650dSSadaf Ebrahimi OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1543*22dc650dSSadaf Ebrahimi c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1544*22dc650dSSadaf Ebrahimi c >= 0xe000;
1545*22dc650dSSadaf Ebrahimi break;
1546*22dc650dSSadaf Ebrahimi
1547*22dc650dSSadaf Ebrahimi case PT_BIDICL:
1548*22dc650dSSadaf Ebrahimi OK = UCD_BIDICLASS(c) == code[3];
1549*22dc650dSSadaf Ebrahimi break;
1550*22dc650dSSadaf Ebrahimi
1551*22dc650dSSadaf Ebrahimi case PT_BOOL:
1552*22dc650dSSadaf Ebrahimi OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1553*22dc650dSSadaf Ebrahimi UCD_BPROPS_PROP(prop), code[3]) != 0;
1554*22dc650dSSadaf Ebrahimi break;
1555*22dc650dSSadaf Ebrahimi
1556*22dc650dSSadaf Ebrahimi /* Should never occur, but keep compilers from grumbling. */
1557*22dc650dSSadaf Ebrahimi
1558*22dc650dSSadaf Ebrahimi default:
1559*22dc650dSSadaf Ebrahimi OK = codevalue != OP_PROP;
1560*22dc650dSSadaf Ebrahimi break;
1561*22dc650dSSadaf Ebrahimi }
1562*22dc650dSSadaf Ebrahimi
1563*22dc650dSSadaf Ebrahimi if (OK == (d == OP_PROP))
1564*22dc650dSSadaf Ebrahimi {
1565*22dc650dSSadaf Ebrahimi if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1566*22dc650dSSadaf Ebrahimi {
1567*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1568*22dc650dSSadaf Ebrahimi next_active_state--;
1569*22dc650dSSadaf Ebrahimi }
1570*22dc650dSSadaf Ebrahimi count++;
1571*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset, count);
1572*22dc650dSSadaf Ebrahimi }
1573*22dc650dSSadaf Ebrahimi }
1574*22dc650dSSadaf Ebrahimi break;
1575*22dc650dSSadaf Ebrahimi
1576*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1577*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1578*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1579*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1580*22dc650dSSadaf Ebrahimi count = current_state->count; /* Already matched */
1581*22dc650dSSadaf Ebrahimi if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1582*22dc650dSSadaf Ebrahimi if (clen > 0)
1583*22dc650dSSadaf Ebrahimi {
1584*22dc650dSSadaf Ebrahimi int ncount = 0;
1585*22dc650dSSadaf Ebrahimi if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1586*22dc650dSSadaf Ebrahimi {
1587*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1588*22dc650dSSadaf Ebrahimi next_active_state--;
1589*22dc650dSSadaf Ebrahimi }
1590*22dc650dSSadaf Ebrahimi (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1591*22dc650dSSadaf Ebrahimi &ncount);
1592*22dc650dSSadaf Ebrahimi count++;
1593*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-state_offset, count, ncount);
1594*22dc650dSSadaf Ebrahimi }
1595*22dc650dSSadaf Ebrahimi break;
1596*22dc650dSSadaf Ebrahimi #endif
1597*22dc650dSSadaf Ebrahimi
1598*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1599*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1600*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1601*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1602*22dc650dSSadaf Ebrahimi count = current_state->count; /* Already matched */
1603*22dc650dSSadaf Ebrahimi if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1604*22dc650dSSadaf Ebrahimi if (clen > 0)
1605*22dc650dSSadaf Ebrahimi {
1606*22dc650dSSadaf Ebrahimi int ncount = 0;
1607*22dc650dSSadaf Ebrahimi switch (c)
1608*22dc650dSSadaf Ebrahimi {
1609*22dc650dSSadaf Ebrahimi case CHAR_VT:
1610*22dc650dSSadaf Ebrahimi case CHAR_FF:
1611*22dc650dSSadaf Ebrahimi case CHAR_NEL:
1612*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
1613*22dc650dSSadaf Ebrahimi case 0x2028:
1614*22dc650dSSadaf Ebrahimi case 0x2029:
1615*22dc650dSSadaf Ebrahimi #endif /* Not EBCDIC */
1616*22dc650dSSadaf Ebrahimi if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1617*22dc650dSSadaf Ebrahimi goto ANYNL01;
1618*22dc650dSSadaf Ebrahimi
1619*22dc650dSSadaf Ebrahimi case CHAR_CR:
1620*22dc650dSSadaf Ebrahimi if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1621*22dc650dSSadaf Ebrahimi /* Fall through */
1622*22dc650dSSadaf Ebrahimi
1623*22dc650dSSadaf Ebrahimi ANYNL01:
1624*22dc650dSSadaf Ebrahimi case CHAR_LF:
1625*22dc650dSSadaf Ebrahimi if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1626*22dc650dSSadaf Ebrahimi {
1627*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1628*22dc650dSSadaf Ebrahimi next_active_state--;
1629*22dc650dSSadaf Ebrahimi }
1630*22dc650dSSadaf Ebrahimi count++;
1631*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-state_offset, count, ncount);
1632*22dc650dSSadaf Ebrahimi break;
1633*22dc650dSSadaf Ebrahimi
1634*22dc650dSSadaf Ebrahimi default:
1635*22dc650dSSadaf Ebrahimi break;
1636*22dc650dSSadaf Ebrahimi }
1637*22dc650dSSadaf Ebrahimi }
1638*22dc650dSSadaf Ebrahimi break;
1639*22dc650dSSadaf Ebrahimi
1640*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1641*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1642*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1643*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1644*22dc650dSSadaf Ebrahimi count = current_state->count; /* Already matched */
1645*22dc650dSSadaf Ebrahimi if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1646*22dc650dSSadaf Ebrahimi if (clen > 0)
1647*22dc650dSSadaf Ebrahimi {
1648*22dc650dSSadaf Ebrahimi BOOL OK;
1649*22dc650dSSadaf Ebrahimi switch (c)
1650*22dc650dSSadaf Ebrahimi {
1651*22dc650dSSadaf Ebrahimi VSPACE_CASES:
1652*22dc650dSSadaf Ebrahimi OK = TRUE;
1653*22dc650dSSadaf Ebrahimi break;
1654*22dc650dSSadaf Ebrahimi
1655*22dc650dSSadaf Ebrahimi default:
1656*22dc650dSSadaf Ebrahimi OK = FALSE;
1657*22dc650dSSadaf Ebrahimi break;
1658*22dc650dSSadaf Ebrahimi }
1659*22dc650dSSadaf Ebrahimi
1660*22dc650dSSadaf Ebrahimi if (OK == (d == OP_VSPACE))
1661*22dc650dSSadaf Ebrahimi {
1662*22dc650dSSadaf Ebrahimi if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1663*22dc650dSSadaf Ebrahimi {
1664*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1665*22dc650dSSadaf Ebrahimi next_active_state--;
1666*22dc650dSSadaf Ebrahimi }
1667*22dc650dSSadaf Ebrahimi count++;
1668*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-state_offset, count, 0);
1669*22dc650dSSadaf Ebrahimi }
1670*22dc650dSSadaf Ebrahimi }
1671*22dc650dSSadaf Ebrahimi break;
1672*22dc650dSSadaf Ebrahimi
1673*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1674*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1675*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1676*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1677*22dc650dSSadaf Ebrahimi count = current_state->count; /* Already matched */
1678*22dc650dSSadaf Ebrahimi if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1679*22dc650dSSadaf Ebrahimi if (clen > 0)
1680*22dc650dSSadaf Ebrahimi {
1681*22dc650dSSadaf Ebrahimi BOOL OK;
1682*22dc650dSSadaf Ebrahimi switch (c)
1683*22dc650dSSadaf Ebrahimi {
1684*22dc650dSSadaf Ebrahimi HSPACE_CASES:
1685*22dc650dSSadaf Ebrahimi OK = TRUE;
1686*22dc650dSSadaf Ebrahimi break;
1687*22dc650dSSadaf Ebrahimi
1688*22dc650dSSadaf Ebrahimi default:
1689*22dc650dSSadaf Ebrahimi OK = FALSE;
1690*22dc650dSSadaf Ebrahimi break;
1691*22dc650dSSadaf Ebrahimi }
1692*22dc650dSSadaf Ebrahimi
1693*22dc650dSSadaf Ebrahimi if (OK == (d == OP_HSPACE))
1694*22dc650dSSadaf Ebrahimi {
1695*22dc650dSSadaf Ebrahimi if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1696*22dc650dSSadaf Ebrahimi {
1697*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1698*22dc650dSSadaf Ebrahimi next_active_state--;
1699*22dc650dSSadaf Ebrahimi }
1700*22dc650dSSadaf Ebrahimi count++;
1701*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-state_offset, count, 0);
1702*22dc650dSSadaf Ebrahimi }
1703*22dc650dSSadaf Ebrahimi }
1704*22dc650dSSadaf Ebrahimi break;
1705*22dc650dSSadaf Ebrahimi
1706*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1707*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
1708*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEQUERY:
1709*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1710*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1711*22dc650dSSadaf Ebrahimi count = 4;
1712*22dc650dSSadaf Ebrahimi goto QS1;
1713*22dc650dSSadaf Ebrahimi
1714*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPESTAR:
1715*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1716*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1717*22dc650dSSadaf Ebrahimi count = 0;
1718*22dc650dSSadaf Ebrahimi
1719*22dc650dSSadaf Ebrahimi QS1:
1720*22dc650dSSadaf Ebrahimi
1721*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 4, 0);
1722*22dc650dSSadaf Ebrahimi if (clen > 0)
1723*22dc650dSSadaf Ebrahimi {
1724*22dc650dSSadaf Ebrahimi BOOL OK;
1725*22dc650dSSadaf Ebrahimi int chartype;
1726*22dc650dSSadaf Ebrahimi const uint32_t *cp;
1727*22dc650dSSadaf Ebrahimi const ucd_record * prop = GET_UCD(c);
1728*22dc650dSSadaf Ebrahimi switch(code[2])
1729*22dc650dSSadaf Ebrahimi {
1730*22dc650dSSadaf Ebrahimi case PT_ANY:
1731*22dc650dSSadaf Ebrahimi OK = TRUE;
1732*22dc650dSSadaf Ebrahimi break;
1733*22dc650dSSadaf Ebrahimi
1734*22dc650dSSadaf Ebrahimi case PT_LAMP:
1735*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
1736*22dc650dSSadaf Ebrahimi OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1737*22dc650dSSadaf Ebrahimi break;
1738*22dc650dSSadaf Ebrahimi
1739*22dc650dSSadaf Ebrahimi case PT_GC:
1740*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1741*22dc650dSSadaf Ebrahimi break;
1742*22dc650dSSadaf Ebrahimi
1743*22dc650dSSadaf Ebrahimi case PT_PC:
1744*22dc650dSSadaf Ebrahimi OK = prop->chartype == code[3];
1745*22dc650dSSadaf Ebrahimi break;
1746*22dc650dSSadaf Ebrahimi
1747*22dc650dSSadaf Ebrahimi case PT_SC:
1748*22dc650dSSadaf Ebrahimi OK = prop->script == code[3];
1749*22dc650dSSadaf Ebrahimi break;
1750*22dc650dSSadaf Ebrahimi
1751*22dc650dSSadaf Ebrahimi case PT_SCX:
1752*22dc650dSSadaf Ebrahimi OK = (prop->script == code[3] ||
1753*22dc650dSSadaf Ebrahimi MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1754*22dc650dSSadaf Ebrahimi break;
1755*22dc650dSSadaf Ebrahimi
1756*22dc650dSSadaf Ebrahimi /* These are specials for combination cases. */
1757*22dc650dSSadaf Ebrahimi
1758*22dc650dSSadaf Ebrahimi case PT_ALNUM:
1759*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
1760*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1761*22dc650dSSadaf Ebrahimi PRIV(ucp_gentype)[chartype] == ucp_N;
1762*22dc650dSSadaf Ebrahimi break;
1763*22dc650dSSadaf Ebrahimi
1764*22dc650dSSadaf Ebrahimi /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1765*22dc650dSSadaf Ebrahimi which means that Perl space and POSIX space are now identical. PCRE
1766*22dc650dSSadaf Ebrahimi was changed at release 8.34. */
1767*22dc650dSSadaf Ebrahimi
1768*22dc650dSSadaf Ebrahimi case PT_SPACE: /* Perl space */
1769*22dc650dSSadaf Ebrahimi case PT_PXSPACE: /* POSIX space */
1770*22dc650dSSadaf Ebrahimi switch(c)
1771*22dc650dSSadaf Ebrahimi {
1772*22dc650dSSadaf Ebrahimi HSPACE_CASES:
1773*22dc650dSSadaf Ebrahimi VSPACE_CASES:
1774*22dc650dSSadaf Ebrahimi OK = TRUE;
1775*22dc650dSSadaf Ebrahimi break;
1776*22dc650dSSadaf Ebrahimi
1777*22dc650dSSadaf Ebrahimi default:
1778*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1779*22dc650dSSadaf Ebrahimi break;
1780*22dc650dSSadaf Ebrahimi }
1781*22dc650dSSadaf Ebrahimi break;
1782*22dc650dSSadaf Ebrahimi
1783*22dc650dSSadaf Ebrahimi case PT_WORD:
1784*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
1785*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1786*22dc650dSSadaf Ebrahimi PRIV(ucp_gentype)[chartype] == ucp_N ||
1787*22dc650dSSadaf Ebrahimi chartype == ucp_Mn || chartype == ucp_Pc;
1788*22dc650dSSadaf Ebrahimi break;
1789*22dc650dSSadaf Ebrahimi
1790*22dc650dSSadaf Ebrahimi case PT_CLIST:
1791*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
1792*22dc650dSSadaf Ebrahimi if (c > MAX_UTF_CODE_POINT)
1793*22dc650dSSadaf Ebrahimi {
1794*22dc650dSSadaf Ebrahimi OK = FALSE;
1795*22dc650dSSadaf Ebrahimi break;
1796*22dc650dSSadaf Ebrahimi }
1797*22dc650dSSadaf Ebrahimi #endif
1798*22dc650dSSadaf Ebrahimi cp = PRIV(ucd_caseless_sets) + code[3];
1799*22dc650dSSadaf Ebrahimi for (;;)
1800*22dc650dSSadaf Ebrahimi {
1801*22dc650dSSadaf Ebrahimi if (c < *cp) { OK = FALSE; break; }
1802*22dc650dSSadaf Ebrahimi if (c == *cp++) { OK = TRUE; break; }
1803*22dc650dSSadaf Ebrahimi }
1804*22dc650dSSadaf Ebrahimi break;
1805*22dc650dSSadaf Ebrahimi
1806*22dc650dSSadaf Ebrahimi case PT_UCNC:
1807*22dc650dSSadaf Ebrahimi OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1808*22dc650dSSadaf Ebrahimi c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1809*22dc650dSSadaf Ebrahimi c >= 0xe000;
1810*22dc650dSSadaf Ebrahimi break;
1811*22dc650dSSadaf Ebrahimi
1812*22dc650dSSadaf Ebrahimi case PT_BIDICL:
1813*22dc650dSSadaf Ebrahimi OK = UCD_BIDICLASS(c) == code[3];
1814*22dc650dSSadaf Ebrahimi break;
1815*22dc650dSSadaf Ebrahimi
1816*22dc650dSSadaf Ebrahimi case PT_BOOL:
1817*22dc650dSSadaf Ebrahimi OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1818*22dc650dSSadaf Ebrahimi UCD_BPROPS_PROP(prop), code[3]) != 0;
1819*22dc650dSSadaf Ebrahimi break;
1820*22dc650dSSadaf Ebrahimi
1821*22dc650dSSadaf Ebrahimi /* Should never occur, but keep compilers from grumbling. */
1822*22dc650dSSadaf Ebrahimi
1823*22dc650dSSadaf Ebrahimi default:
1824*22dc650dSSadaf Ebrahimi OK = codevalue != OP_PROP;
1825*22dc650dSSadaf Ebrahimi break;
1826*22dc650dSSadaf Ebrahimi }
1827*22dc650dSSadaf Ebrahimi
1828*22dc650dSSadaf Ebrahimi if (OK == (d == OP_PROP))
1829*22dc650dSSadaf Ebrahimi {
1830*22dc650dSSadaf Ebrahimi if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1831*22dc650dSSadaf Ebrahimi codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1832*22dc650dSSadaf Ebrahimi {
1833*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1834*22dc650dSSadaf Ebrahimi next_active_state--;
1835*22dc650dSSadaf Ebrahimi }
1836*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + count, 0);
1837*22dc650dSSadaf Ebrahimi }
1838*22dc650dSSadaf Ebrahimi }
1839*22dc650dSSadaf Ebrahimi break;
1840*22dc650dSSadaf Ebrahimi
1841*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1842*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1843*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1844*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1845*22dc650dSSadaf Ebrahimi count = 2;
1846*22dc650dSSadaf Ebrahimi goto QS2;
1847*22dc650dSSadaf Ebrahimi
1848*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1849*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1850*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1851*22dc650dSSadaf Ebrahimi count = 0;
1852*22dc650dSSadaf Ebrahimi
1853*22dc650dSSadaf Ebrahimi QS2:
1854*22dc650dSSadaf Ebrahimi
1855*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 2, 0);
1856*22dc650dSSadaf Ebrahimi if (clen > 0)
1857*22dc650dSSadaf Ebrahimi {
1858*22dc650dSSadaf Ebrahimi int ncount = 0;
1859*22dc650dSSadaf Ebrahimi if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1860*22dc650dSSadaf Ebrahimi codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1861*22dc650dSSadaf Ebrahimi {
1862*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1863*22dc650dSSadaf Ebrahimi next_active_state--;
1864*22dc650dSSadaf Ebrahimi }
1865*22dc650dSSadaf Ebrahimi (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1866*22dc650dSSadaf Ebrahimi &ncount);
1867*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1868*22dc650dSSadaf Ebrahimi }
1869*22dc650dSSadaf Ebrahimi break;
1870*22dc650dSSadaf Ebrahimi #endif
1871*22dc650dSSadaf Ebrahimi
1872*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1873*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1874*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1875*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1876*22dc650dSSadaf Ebrahimi count = 2;
1877*22dc650dSSadaf Ebrahimi goto QS3;
1878*22dc650dSSadaf Ebrahimi
1879*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPESTAR:
1880*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1881*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1882*22dc650dSSadaf Ebrahimi count = 0;
1883*22dc650dSSadaf Ebrahimi
1884*22dc650dSSadaf Ebrahimi QS3:
1885*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 2, 0);
1886*22dc650dSSadaf Ebrahimi if (clen > 0)
1887*22dc650dSSadaf Ebrahimi {
1888*22dc650dSSadaf Ebrahimi int ncount = 0;
1889*22dc650dSSadaf Ebrahimi switch (c)
1890*22dc650dSSadaf Ebrahimi {
1891*22dc650dSSadaf Ebrahimi case CHAR_VT:
1892*22dc650dSSadaf Ebrahimi case CHAR_FF:
1893*22dc650dSSadaf Ebrahimi case CHAR_NEL:
1894*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
1895*22dc650dSSadaf Ebrahimi case 0x2028:
1896*22dc650dSSadaf Ebrahimi case 0x2029:
1897*22dc650dSSadaf Ebrahimi #endif /* Not EBCDIC */
1898*22dc650dSSadaf Ebrahimi if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1899*22dc650dSSadaf Ebrahimi goto ANYNL02;
1900*22dc650dSSadaf Ebrahimi
1901*22dc650dSSadaf Ebrahimi case CHAR_CR:
1902*22dc650dSSadaf Ebrahimi if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1903*22dc650dSSadaf Ebrahimi /* Fall through */
1904*22dc650dSSadaf Ebrahimi
1905*22dc650dSSadaf Ebrahimi ANYNL02:
1906*22dc650dSSadaf Ebrahimi case CHAR_LF:
1907*22dc650dSSadaf Ebrahimi if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1908*22dc650dSSadaf Ebrahimi codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1909*22dc650dSSadaf Ebrahimi {
1910*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1911*22dc650dSSadaf Ebrahimi next_active_state--;
1912*22dc650dSSadaf Ebrahimi }
1913*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1914*22dc650dSSadaf Ebrahimi break;
1915*22dc650dSSadaf Ebrahimi
1916*22dc650dSSadaf Ebrahimi default:
1917*22dc650dSSadaf Ebrahimi break;
1918*22dc650dSSadaf Ebrahimi }
1919*22dc650dSSadaf Ebrahimi }
1920*22dc650dSSadaf Ebrahimi break;
1921*22dc650dSSadaf Ebrahimi
1922*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1923*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1924*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1925*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1926*22dc650dSSadaf Ebrahimi count = 2;
1927*22dc650dSSadaf Ebrahimi goto QS4;
1928*22dc650dSSadaf Ebrahimi
1929*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPESTAR:
1930*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1931*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1932*22dc650dSSadaf Ebrahimi count = 0;
1933*22dc650dSSadaf Ebrahimi
1934*22dc650dSSadaf Ebrahimi QS4:
1935*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 2, 0);
1936*22dc650dSSadaf Ebrahimi if (clen > 0)
1937*22dc650dSSadaf Ebrahimi {
1938*22dc650dSSadaf Ebrahimi BOOL OK;
1939*22dc650dSSadaf Ebrahimi switch (c)
1940*22dc650dSSadaf Ebrahimi {
1941*22dc650dSSadaf Ebrahimi VSPACE_CASES:
1942*22dc650dSSadaf Ebrahimi OK = TRUE;
1943*22dc650dSSadaf Ebrahimi break;
1944*22dc650dSSadaf Ebrahimi
1945*22dc650dSSadaf Ebrahimi default:
1946*22dc650dSSadaf Ebrahimi OK = FALSE;
1947*22dc650dSSadaf Ebrahimi break;
1948*22dc650dSSadaf Ebrahimi }
1949*22dc650dSSadaf Ebrahimi if (OK == (d == OP_VSPACE))
1950*22dc650dSSadaf Ebrahimi {
1951*22dc650dSSadaf Ebrahimi if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1952*22dc650dSSadaf Ebrahimi codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1953*22dc650dSSadaf Ebrahimi {
1954*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1955*22dc650dSSadaf Ebrahimi next_active_state--;
1956*22dc650dSSadaf Ebrahimi }
1957*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1958*22dc650dSSadaf Ebrahimi }
1959*22dc650dSSadaf Ebrahimi }
1960*22dc650dSSadaf Ebrahimi break;
1961*22dc650dSSadaf Ebrahimi
1962*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
1963*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1964*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1965*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1966*22dc650dSSadaf Ebrahimi count = 2;
1967*22dc650dSSadaf Ebrahimi goto QS5;
1968*22dc650dSSadaf Ebrahimi
1969*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPESTAR:
1970*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1971*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1972*22dc650dSSadaf Ebrahimi count = 0;
1973*22dc650dSSadaf Ebrahimi
1974*22dc650dSSadaf Ebrahimi QS5:
1975*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + 2, 0);
1976*22dc650dSSadaf Ebrahimi if (clen > 0)
1977*22dc650dSSadaf Ebrahimi {
1978*22dc650dSSadaf Ebrahimi BOOL OK;
1979*22dc650dSSadaf Ebrahimi switch (c)
1980*22dc650dSSadaf Ebrahimi {
1981*22dc650dSSadaf Ebrahimi HSPACE_CASES:
1982*22dc650dSSadaf Ebrahimi OK = TRUE;
1983*22dc650dSSadaf Ebrahimi break;
1984*22dc650dSSadaf Ebrahimi
1985*22dc650dSSadaf Ebrahimi default:
1986*22dc650dSSadaf Ebrahimi OK = FALSE;
1987*22dc650dSSadaf Ebrahimi break;
1988*22dc650dSSadaf Ebrahimi }
1989*22dc650dSSadaf Ebrahimi
1990*22dc650dSSadaf Ebrahimi if (OK == (d == OP_HSPACE))
1991*22dc650dSSadaf Ebrahimi {
1992*22dc650dSSadaf Ebrahimi if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1993*22dc650dSSadaf Ebrahimi codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1994*22dc650dSSadaf Ebrahimi {
1995*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
1996*22dc650dSSadaf Ebrahimi next_active_state--;
1997*22dc650dSSadaf Ebrahimi }
1998*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1999*22dc650dSSadaf Ebrahimi }
2000*22dc650dSSadaf Ebrahimi }
2001*22dc650dSSadaf Ebrahimi break;
2002*22dc650dSSadaf Ebrahimi
2003*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2004*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2005*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEEXACT:
2006*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEUPTO:
2007*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEMINUPTO:
2008*22dc650dSSadaf Ebrahimi case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
2009*22dc650dSSadaf Ebrahimi if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
2010*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
2011*22dc650dSSadaf Ebrahimi count = current_state->count; /* Number already matched */
2012*22dc650dSSadaf Ebrahimi if (clen > 0)
2013*22dc650dSSadaf Ebrahimi {
2014*22dc650dSSadaf Ebrahimi BOOL OK;
2015*22dc650dSSadaf Ebrahimi int chartype;
2016*22dc650dSSadaf Ebrahimi const uint32_t *cp;
2017*22dc650dSSadaf Ebrahimi const ucd_record * prop = GET_UCD(c);
2018*22dc650dSSadaf Ebrahimi switch(code[1 + IMM2_SIZE + 1])
2019*22dc650dSSadaf Ebrahimi {
2020*22dc650dSSadaf Ebrahimi case PT_ANY:
2021*22dc650dSSadaf Ebrahimi OK = TRUE;
2022*22dc650dSSadaf Ebrahimi break;
2023*22dc650dSSadaf Ebrahimi
2024*22dc650dSSadaf Ebrahimi case PT_LAMP:
2025*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
2026*22dc650dSSadaf Ebrahimi OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
2027*22dc650dSSadaf Ebrahimi break;
2028*22dc650dSSadaf Ebrahimi
2029*22dc650dSSadaf Ebrahimi case PT_GC:
2030*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
2031*22dc650dSSadaf Ebrahimi break;
2032*22dc650dSSadaf Ebrahimi
2033*22dc650dSSadaf Ebrahimi case PT_PC:
2034*22dc650dSSadaf Ebrahimi OK = prop->chartype == code[1 + IMM2_SIZE + 2];
2035*22dc650dSSadaf Ebrahimi break;
2036*22dc650dSSadaf Ebrahimi
2037*22dc650dSSadaf Ebrahimi case PT_SC:
2038*22dc650dSSadaf Ebrahimi OK = prop->script == code[1 + IMM2_SIZE + 2];
2039*22dc650dSSadaf Ebrahimi break;
2040*22dc650dSSadaf Ebrahimi
2041*22dc650dSSadaf Ebrahimi case PT_SCX:
2042*22dc650dSSadaf Ebrahimi OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
2043*22dc650dSSadaf Ebrahimi MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2044*22dc650dSSadaf Ebrahimi code[1 + IMM2_SIZE + 2]) != 0);
2045*22dc650dSSadaf Ebrahimi break;
2046*22dc650dSSadaf Ebrahimi
2047*22dc650dSSadaf Ebrahimi /* These are specials for combination cases. */
2048*22dc650dSSadaf Ebrahimi
2049*22dc650dSSadaf Ebrahimi case PT_ALNUM:
2050*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
2051*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2052*22dc650dSSadaf Ebrahimi PRIV(ucp_gentype)[chartype] == ucp_N;
2053*22dc650dSSadaf Ebrahimi break;
2054*22dc650dSSadaf Ebrahimi
2055*22dc650dSSadaf Ebrahimi /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2056*22dc650dSSadaf Ebrahimi which means that Perl space and POSIX space are now identical. PCRE
2057*22dc650dSSadaf Ebrahimi was changed at release 8.34. */
2058*22dc650dSSadaf Ebrahimi
2059*22dc650dSSadaf Ebrahimi case PT_SPACE: /* Perl space */
2060*22dc650dSSadaf Ebrahimi case PT_PXSPACE: /* POSIX space */
2061*22dc650dSSadaf Ebrahimi switch(c)
2062*22dc650dSSadaf Ebrahimi {
2063*22dc650dSSadaf Ebrahimi HSPACE_CASES:
2064*22dc650dSSadaf Ebrahimi VSPACE_CASES:
2065*22dc650dSSadaf Ebrahimi OK = TRUE;
2066*22dc650dSSadaf Ebrahimi break;
2067*22dc650dSSadaf Ebrahimi
2068*22dc650dSSadaf Ebrahimi default:
2069*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2070*22dc650dSSadaf Ebrahimi break;
2071*22dc650dSSadaf Ebrahimi }
2072*22dc650dSSadaf Ebrahimi break;
2073*22dc650dSSadaf Ebrahimi
2074*22dc650dSSadaf Ebrahimi case PT_WORD:
2075*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
2076*22dc650dSSadaf Ebrahimi OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2077*22dc650dSSadaf Ebrahimi PRIV(ucp_gentype)[chartype] == ucp_N ||
2078*22dc650dSSadaf Ebrahimi chartype == ucp_Mn || chartype == ucp_Pc;
2079*22dc650dSSadaf Ebrahimi break;
2080*22dc650dSSadaf Ebrahimi
2081*22dc650dSSadaf Ebrahimi case PT_CLIST:
2082*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 32
2083*22dc650dSSadaf Ebrahimi if (c > MAX_UTF_CODE_POINT)
2084*22dc650dSSadaf Ebrahimi {
2085*22dc650dSSadaf Ebrahimi OK = FALSE;
2086*22dc650dSSadaf Ebrahimi break;
2087*22dc650dSSadaf Ebrahimi }
2088*22dc650dSSadaf Ebrahimi #endif
2089*22dc650dSSadaf Ebrahimi cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
2090*22dc650dSSadaf Ebrahimi for (;;)
2091*22dc650dSSadaf Ebrahimi {
2092*22dc650dSSadaf Ebrahimi if (c < *cp) { OK = FALSE; break; }
2093*22dc650dSSadaf Ebrahimi if (c == *cp++) { OK = TRUE; break; }
2094*22dc650dSSadaf Ebrahimi }
2095*22dc650dSSadaf Ebrahimi break;
2096*22dc650dSSadaf Ebrahimi
2097*22dc650dSSadaf Ebrahimi case PT_UCNC:
2098*22dc650dSSadaf Ebrahimi OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2099*22dc650dSSadaf Ebrahimi c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2100*22dc650dSSadaf Ebrahimi c >= 0xe000;
2101*22dc650dSSadaf Ebrahimi break;
2102*22dc650dSSadaf Ebrahimi
2103*22dc650dSSadaf Ebrahimi case PT_BIDICL:
2104*22dc650dSSadaf Ebrahimi OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
2105*22dc650dSSadaf Ebrahimi break;
2106*22dc650dSSadaf Ebrahimi
2107*22dc650dSSadaf Ebrahimi case PT_BOOL:
2108*22dc650dSSadaf Ebrahimi OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2109*22dc650dSSadaf Ebrahimi UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
2110*22dc650dSSadaf Ebrahimi break;
2111*22dc650dSSadaf Ebrahimi
2112*22dc650dSSadaf Ebrahimi /* Should never occur, but keep compilers from grumbling. */
2113*22dc650dSSadaf Ebrahimi
2114*22dc650dSSadaf Ebrahimi default:
2115*22dc650dSSadaf Ebrahimi OK = codevalue != OP_PROP;
2116*22dc650dSSadaf Ebrahimi break;
2117*22dc650dSSadaf Ebrahimi }
2118*22dc650dSSadaf Ebrahimi
2119*22dc650dSSadaf Ebrahimi if (OK == (d == OP_PROP))
2120*22dc650dSSadaf Ebrahimi {
2121*22dc650dSSadaf Ebrahimi if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2122*22dc650dSSadaf Ebrahimi {
2123*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2124*22dc650dSSadaf Ebrahimi next_active_state--;
2125*22dc650dSSadaf Ebrahimi }
2126*22dc650dSSadaf Ebrahimi if (++count >= (int)GET2(code, 1))
2127*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2128*22dc650dSSadaf Ebrahimi else
2129*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset, count); }
2130*22dc650dSSadaf Ebrahimi }
2131*22dc650dSSadaf Ebrahimi }
2132*22dc650dSSadaf Ebrahimi break;
2133*22dc650dSSadaf Ebrahimi
2134*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2135*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2136*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2137*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2138*22dc650dSSadaf Ebrahimi case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2139*22dc650dSSadaf Ebrahimi if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2140*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2141*22dc650dSSadaf Ebrahimi count = current_state->count; /* Number already matched */
2142*22dc650dSSadaf Ebrahimi if (clen > 0)
2143*22dc650dSSadaf Ebrahimi {
2144*22dc650dSSadaf Ebrahimi PCRE2_SPTR nptr;
2145*22dc650dSSadaf Ebrahimi int ncount = 0;
2146*22dc650dSSadaf Ebrahimi if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2147*22dc650dSSadaf Ebrahimi {
2148*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2149*22dc650dSSadaf Ebrahimi next_active_state--;
2150*22dc650dSSadaf Ebrahimi }
2151*22dc650dSSadaf Ebrahimi nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2152*22dc650dSSadaf Ebrahimi &ncount);
2153*22dc650dSSadaf Ebrahimi if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2154*22dc650dSSadaf Ebrahimi reset_could_continue = TRUE;
2155*22dc650dSSadaf Ebrahimi if (++count >= (int)GET2(code, 1))
2156*22dc650dSSadaf Ebrahimi { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2157*22dc650dSSadaf Ebrahimi else
2158*22dc650dSSadaf Ebrahimi { ADD_NEW_DATA(-state_offset, count, ncount); }
2159*22dc650dSSadaf Ebrahimi }
2160*22dc650dSSadaf Ebrahimi break;
2161*22dc650dSSadaf Ebrahimi #endif
2162*22dc650dSSadaf Ebrahimi
2163*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2164*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2165*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2166*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2167*22dc650dSSadaf Ebrahimi case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2168*22dc650dSSadaf Ebrahimi if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2169*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2170*22dc650dSSadaf Ebrahimi count = current_state->count; /* Number already matched */
2171*22dc650dSSadaf Ebrahimi if (clen > 0)
2172*22dc650dSSadaf Ebrahimi {
2173*22dc650dSSadaf Ebrahimi int ncount = 0;
2174*22dc650dSSadaf Ebrahimi switch (c)
2175*22dc650dSSadaf Ebrahimi {
2176*22dc650dSSadaf Ebrahimi case CHAR_VT:
2177*22dc650dSSadaf Ebrahimi case CHAR_FF:
2178*22dc650dSSadaf Ebrahimi case CHAR_NEL:
2179*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
2180*22dc650dSSadaf Ebrahimi case 0x2028:
2181*22dc650dSSadaf Ebrahimi case 0x2029:
2182*22dc650dSSadaf Ebrahimi #endif /* Not EBCDIC */
2183*22dc650dSSadaf Ebrahimi if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2184*22dc650dSSadaf Ebrahimi goto ANYNL03;
2185*22dc650dSSadaf Ebrahimi
2186*22dc650dSSadaf Ebrahimi case CHAR_CR:
2187*22dc650dSSadaf Ebrahimi if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2188*22dc650dSSadaf Ebrahimi /* Fall through */
2189*22dc650dSSadaf Ebrahimi
2190*22dc650dSSadaf Ebrahimi ANYNL03:
2191*22dc650dSSadaf Ebrahimi case CHAR_LF:
2192*22dc650dSSadaf Ebrahimi if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2193*22dc650dSSadaf Ebrahimi {
2194*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2195*22dc650dSSadaf Ebrahimi next_active_state--;
2196*22dc650dSSadaf Ebrahimi }
2197*22dc650dSSadaf Ebrahimi if (++count >= (int)GET2(code, 1))
2198*22dc650dSSadaf Ebrahimi { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2199*22dc650dSSadaf Ebrahimi else
2200*22dc650dSSadaf Ebrahimi { ADD_NEW_DATA(-state_offset, count, ncount); }
2201*22dc650dSSadaf Ebrahimi break;
2202*22dc650dSSadaf Ebrahimi
2203*22dc650dSSadaf Ebrahimi default:
2204*22dc650dSSadaf Ebrahimi break;
2205*22dc650dSSadaf Ebrahimi }
2206*22dc650dSSadaf Ebrahimi }
2207*22dc650dSSadaf Ebrahimi break;
2208*22dc650dSSadaf Ebrahimi
2209*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2210*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2211*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2212*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2213*22dc650dSSadaf Ebrahimi case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2214*22dc650dSSadaf Ebrahimi if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2215*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2216*22dc650dSSadaf Ebrahimi count = current_state->count; /* Number already matched */
2217*22dc650dSSadaf Ebrahimi if (clen > 0)
2218*22dc650dSSadaf Ebrahimi {
2219*22dc650dSSadaf Ebrahimi BOOL OK;
2220*22dc650dSSadaf Ebrahimi switch (c)
2221*22dc650dSSadaf Ebrahimi {
2222*22dc650dSSadaf Ebrahimi VSPACE_CASES:
2223*22dc650dSSadaf Ebrahimi OK = TRUE;
2224*22dc650dSSadaf Ebrahimi break;
2225*22dc650dSSadaf Ebrahimi
2226*22dc650dSSadaf Ebrahimi default:
2227*22dc650dSSadaf Ebrahimi OK = FALSE;
2228*22dc650dSSadaf Ebrahimi }
2229*22dc650dSSadaf Ebrahimi
2230*22dc650dSSadaf Ebrahimi if (OK == (d == OP_VSPACE))
2231*22dc650dSSadaf Ebrahimi {
2232*22dc650dSSadaf Ebrahimi if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2233*22dc650dSSadaf Ebrahimi {
2234*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2235*22dc650dSSadaf Ebrahimi next_active_state--;
2236*22dc650dSSadaf Ebrahimi }
2237*22dc650dSSadaf Ebrahimi if (++count >= (int)GET2(code, 1))
2238*22dc650dSSadaf Ebrahimi { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2239*22dc650dSSadaf Ebrahimi else
2240*22dc650dSSadaf Ebrahimi { ADD_NEW_DATA(-state_offset, count, 0); }
2241*22dc650dSSadaf Ebrahimi }
2242*22dc650dSSadaf Ebrahimi }
2243*22dc650dSSadaf Ebrahimi break;
2244*22dc650dSSadaf Ebrahimi
2245*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2246*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2247*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2248*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2249*22dc650dSSadaf Ebrahimi case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2250*22dc650dSSadaf Ebrahimi if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2251*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2252*22dc650dSSadaf Ebrahimi count = current_state->count; /* Number already matched */
2253*22dc650dSSadaf Ebrahimi if (clen > 0)
2254*22dc650dSSadaf Ebrahimi {
2255*22dc650dSSadaf Ebrahimi BOOL OK;
2256*22dc650dSSadaf Ebrahimi switch (c)
2257*22dc650dSSadaf Ebrahimi {
2258*22dc650dSSadaf Ebrahimi HSPACE_CASES:
2259*22dc650dSSadaf Ebrahimi OK = TRUE;
2260*22dc650dSSadaf Ebrahimi break;
2261*22dc650dSSadaf Ebrahimi
2262*22dc650dSSadaf Ebrahimi default:
2263*22dc650dSSadaf Ebrahimi OK = FALSE;
2264*22dc650dSSadaf Ebrahimi break;
2265*22dc650dSSadaf Ebrahimi }
2266*22dc650dSSadaf Ebrahimi
2267*22dc650dSSadaf Ebrahimi if (OK == (d == OP_HSPACE))
2268*22dc650dSSadaf Ebrahimi {
2269*22dc650dSSadaf Ebrahimi if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2270*22dc650dSSadaf Ebrahimi {
2271*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2272*22dc650dSSadaf Ebrahimi next_active_state--;
2273*22dc650dSSadaf Ebrahimi }
2274*22dc650dSSadaf Ebrahimi if (++count >= (int)GET2(code, 1))
2275*22dc650dSSadaf Ebrahimi { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2276*22dc650dSSadaf Ebrahimi else
2277*22dc650dSSadaf Ebrahimi { ADD_NEW_DATA(-state_offset, count, 0); }
2278*22dc650dSSadaf Ebrahimi }
2279*22dc650dSSadaf Ebrahimi }
2280*22dc650dSSadaf Ebrahimi break;
2281*22dc650dSSadaf Ebrahimi
2282*22dc650dSSadaf Ebrahimi /* ========================================================================== */
2283*22dc650dSSadaf Ebrahimi /* These opcodes are followed by a character that is usually compared
2284*22dc650dSSadaf Ebrahimi to the current subject character; it is loaded into d. We still get
2285*22dc650dSSadaf Ebrahimi here even if there is no subject character, because in some cases zero
2286*22dc650dSSadaf Ebrahimi repetitions are permitted. */
2287*22dc650dSSadaf Ebrahimi
2288*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2289*22dc650dSSadaf Ebrahimi case OP_CHAR:
2290*22dc650dSSadaf Ebrahimi if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2291*22dc650dSSadaf Ebrahimi break;
2292*22dc650dSSadaf Ebrahimi
2293*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2294*22dc650dSSadaf Ebrahimi case OP_CHARI:
2295*22dc650dSSadaf Ebrahimi if (clen == 0) break;
2296*22dc650dSSadaf Ebrahimi
2297*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2298*22dc650dSSadaf Ebrahimi if (utf_or_ucp)
2299*22dc650dSSadaf Ebrahimi {
2300*22dc650dSSadaf Ebrahimi if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2301*22dc650dSSadaf Ebrahimi {
2302*22dc650dSSadaf Ebrahimi unsigned int othercase;
2303*22dc650dSSadaf Ebrahimi if (c < 128)
2304*22dc650dSSadaf Ebrahimi othercase = fcc[c];
2305*22dc650dSSadaf Ebrahimi else
2306*22dc650dSSadaf Ebrahimi othercase = UCD_OTHERCASE(c);
2307*22dc650dSSadaf Ebrahimi if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2308*22dc650dSSadaf Ebrahimi }
2309*22dc650dSSadaf Ebrahimi }
2310*22dc650dSSadaf Ebrahimi else
2311*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
2312*22dc650dSSadaf Ebrahimi /* Not UTF or UCP mode */
2313*22dc650dSSadaf Ebrahimi {
2314*22dc650dSSadaf Ebrahimi if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2315*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset + 2, 0); }
2316*22dc650dSSadaf Ebrahimi }
2317*22dc650dSSadaf Ebrahimi break;
2318*22dc650dSSadaf Ebrahimi
2319*22dc650dSSadaf Ebrahimi
2320*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2321*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2322*22dc650dSSadaf Ebrahimi /* This is a tricky one because it can match more than one character.
2323*22dc650dSSadaf Ebrahimi Find out how many characters to skip, and then set up a negative state
2324*22dc650dSSadaf Ebrahimi to wait for them to pass before continuing. */
2325*22dc650dSSadaf Ebrahimi
2326*22dc650dSSadaf Ebrahimi case OP_EXTUNI:
2327*22dc650dSSadaf Ebrahimi if (clen > 0)
2328*22dc650dSSadaf Ebrahimi {
2329*22dc650dSSadaf Ebrahimi int ncount = 0;
2330*22dc650dSSadaf Ebrahimi PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2331*22dc650dSSadaf Ebrahimi end_subject, utf, &ncount);
2332*22dc650dSSadaf Ebrahimi if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2333*22dc650dSSadaf Ebrahimi reset_could_continue = TRUE;
2334*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2335*22dc650dSSadaf Ebrahimi }
2336*22dc650dSSadaf Ebrahimi break;
2337*22dc650dSSadaf Ebrahimi #endif
2338*22dc650dSSadaf Ebrahimi
2339*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2340*22dc650dSSadaf Ebrahimi /* This is tricky like EXTUNI because it too can match more than one
2341*22dc650dSSadaf Ebrahimi character (when CR is followed by LF). In this case, set up a negative
2342*22dc650dSSadaf Ebrahimi state to wait for one character to pass before continuing. */
2343*22dc650dSSadaf Ebrahimi
2344*22dc650dSSadaf Ebrahimi case OP_ANYNL:
2345*22dc650dSSadaf Ebrahimi if (clen > 0) switch(c)
2346*22dc650dSSadaf Ebrahimi {
2347*22dc650dSSadaf Ebrahimi case CHAR_VT:
2348*22dc650dSSadaf Ebrahimi case CHAR_FF:
2349*22dc650dSSadaf Ebrahimi case CHAR_NEL:
2350*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
2351*22dc650dSSadaf Ebrahimi case 0x2028:
2352*22dc650dSSadaf Ebrahimi case 0x2029:
2353*22dc650dSSadaf Ebrahimi #endif /* Not EBCDIC */
2354*22dc650dSSadaf Ebrahimi if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2355*22dc650dSSadaf Ebrahimi /* Fall through */
2356*22dc650dSSadaf Ebrahimi
2357*22dc650dSSadaf Ebrahimi case CHAR_LF:
2358*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + 1, 0);
2359*22dc650dSSadaf Ebrahimi break;
2360*22dc650dSSadaf Ebrahimi
2361*22dc650dSSadaf Ebrahimi case CHAR_CR:
2362*22dc650dSSadaf Ebrahimi if (ptr + 1 >= end_subject)
2363*22dc650dSSadaf Ebrahimi {
2364*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + 1, 0);
2365*22dc650dSSadaf Ebrahimi if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2366*22dc650dSSadaf Ebrahimi reset_could_continue = TRUE;
2367*22dc650dSSadaf Ebrahimi }
2368*22dc650dSSadaf Ebrahimi else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2369*22dc650dSSadaf Ebrahimi {
2370*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2371*22dc650dSSadaf Ebrahimi }
2372*22dc650dSSadaf Ebrahimi else
2373*22dc650dSSadaf Ebrahimi {
2374*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + 1, 0);
2375*22dc650dSSadaf Ebrahimi }
2376*22dc650dSSadaf Ebrahimi break;
2377*22dc650dSSadaf Ebrahimi }
2378*22dc650dSSadaf Ebrahimi break;
2379*22dc650dSSadaf Ebrahimi
2380*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2381*22dc650dSSadaf Ebrahimi case OP_NOT_VSPACE:
2382*22dc650dSSadaf Ebrahimi if (clen > 0) switch(c)
2383*22dc650dSSadaf Ebrahimi {
2384*22dc650dSSadaf Ebrahimi VSPACE_CASES:
2385*22dc650dSSadaf Ebrahimi break;
2386*22dc650dSSadaf Ebrahimi
2387*22dc650dSSadaf Ebrahimi default:
2388*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + 1, 0);
2389*22dc650dSSadaf Ebrahimi break;
2390*22dc650dSSadaf Ebrahimi }
2391*22dc650dSSadaf Ebrahimi break;
2392*22dc650dSSadaf Ebrahimi
2393*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2394*22dc650dSSadaf Ebrahimi case OP_VSPACE:
2395*22dc650dSSadaf Ebrahimi if (clen > 0) switch(c)
2396*22dc650dSSadaf Ebrahimi {
2397*22dc650dSSadaf Ebrahimi VSPACE_CASES:
2398*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + 1, 0);
2399*22dc650dSSadaf Ebrahimi break;
2400*22dc650dSSadaf Ebrahimi
2401*22dc650dSSadaf Ebrahimi default:
2402*22dc650dSSadaf Ebrahimi break;
2403*22dc650dSSadaf Ebrahimi }
2404*22dc650dSSadaf Ebrahimi break;
2405*22dc650dSSadaf Ebrahimi
2406*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2407*22dc650dSSadaf Ebrahimi case OP_NOT_HSPACE:
2408*22dc650dSSadaf Ebrahimi if (clen > 0) switch(c)
2409*22dc650dSSadaf Ebrahimi {
2410*22dc650dSSadaf Ebrahimi HSPACE_CASES:
2411*22dc650dSSadaf Ebrahimi break;
2412*22dc650dSSadaf Ebrahimi
2413*22dc650dSSadaf Ebrahimi default:
2414*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + 1, 0);
2415*22dc650dSSadaf Ebrahimi break;
2416*22dc650dSSadaf Ebrahimi }
2417*22dc650dSSadaf Ebrahimi break;
2418*22dc650dSSadaf Ebrahimi
2419*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2420*22dc650dSSadaf Ebrahimi case OP_HSPACE:
2421*22dc650dSSadaf Ebrahimi if (clen > 0) switch(c)
2422*22dc650dSSadaf Ebrahimi {
2423*22dc650dSSadaf Ebrahimi HSPACE_CASES:
2424*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + 1, 0);
2425*22dc650dSSadaf Ebrahimi break;
2426*22dc650dSSadaf Ebrahimi
2427*22dc650dSSadaf Ebrahimi default:
2428*22dc650dSSadaf Ebrahimi break;
2429*22dc650dSSadaf Ebrahimi }
2430*22dc650dSSadaf Ebrahimi break;
2431*22dc650dSSadaf Ebrahimi
2432*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2433*22dc650dSSadaf Ebrahimi /* Match a negated single character casefully. */
2434*22dc650dSSadaf Ebrahimi
2435*22dc650dSSadaf Ebrahimi case OP_NOT:
2436*22dc650dSSadaf Ebrahimi if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2437*22dc650dSSadaf Ebrahimi break;
2438*22dc650dSSadaf Ebrahimi
2439*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2440*22dc650dSSadaf Ebrahimi /* Match a negated single character caselessly. */
2441*22dc650dSSadaf Ebrahimi
2442*22dc650dSSadaf Ebrahimi case OP_NOTI:
2443*22dc650dSSadaf Ebrahimi if (clen > 0)
2444*22dc650dSSadaf Ebrahimi {
2445*22dc650dSSadaf Ebrahimi uint32_t otherd;
2446*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2447*22dc650dSSadaf Ebrahimi if (utf_or_ucp && d >= 128)
2448*22dc650dSSadaf Ebrahimi otherd = UCD_OTHERCASE(d);
2449*22dc650dSSadaf Ebrahimi else
2450*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
2451*22dc650dSSadaf Ebrahimi otherd = TABLE_GET(d, fcc, d);
2452*22dc650dSSadaf Ebrahimi if (c != d && c != otherd)
2453*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset + dlen + 1, 0); }
2454*22dc650dSSadaf Ebrahimi }
2455*22dc650dSSadaf Ebrahimi break;
2456*22dc650dSSadaf Ebrahimi
2457*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2458*22dc650dSSadaf Ebrahimi case OP_PLUSI:
2459*22dc650dSSadaf Ebrahimi case OP_MINPLUSI:
2460*22dc650dSSadaf Ebrahimi case OP_POSPLUSI:
2461*22dc650dSSadaf Ebrahimi case OP_NOTPLUSI:
2462*22dc650dSSadaf Ebrahimi case OP_NOTMINPLUSI:
2463*22dc650dSSadaf Ebrahimi case OP_NOTPOSPLUSI:
2464*22dc650dSSadaf Ebrahimi caseless = TRUE;
2465*22dc650dSSadaf Ebrahimi codevalue -= OP_STARI - OP_STAR;
2466*22dc650dSSadaf Ebrahimi
2467*22dc650dSSadaf Ebrahimi /* Fall through */
2468*22dc650dSSadaf Ebrahimi case OP_PLUS:
2469*22dc650dSSadaf Ebrahimi case OP_MINPLUS:
2470*22dc650dSSadaf Ebrahimi case OP_POSPLUS:
2471*22dc650dSSadaf Ebrahimi case OP_NOTPLUS:
2472*22dc650dSSadaf Ebrahimi case OP_NOTMINPLUS:
2473*22dc650dSSadaf Ebrahimi case OP_NOTPOSPLUS:
2474*22dc650dSSadaf Ebrahimi count = current_state->count; /* Already matched */
2475*22dc650dSSadaf Ebrahimi if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2476*22dc650dSSadaf Ebrahimi if (clen > 0)
2477*22dc650dSSadaf Ebrahimi {
2478*22dc650dSSadaf Ebrahimi uint32_t otherd = NOTACHAR;
2479*22dc650dSSadaf Ebrahimi if (caseless)
2480*22dc650dSSadaf Ebrahimi {
2481*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2482*22dc650dSSadaf Ebrahimi if (utf_or_ucp && d >= 128)
2483*22dc650dSSadaf Ebrahimi otherd = UCD_OTHERCASE(d);
2484*22dc650dSSadaf Ebrahimi else
2485*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
2486*22dc650dSSadaf Ebrahimi otherd = TABLE_GET(d, fcc, d);
2487*22dc650dSSadaf Ebrahimi }
2488*22dc650dSSadaf Ebrahimi if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2489*22dc650dSSadaf Ebrahimi {
2490*22dc650dSSadaf Ebrahimi if (count > 0 &&
2491*22dc650dSSadaf Ebrahimi (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2492*22dc650dSSadaf Ebrahimi {
2493*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2494*22dc650dSSadaf Ebrahimi next_active_state--;
2495*22dc650dSSadaf Ebrahimi }
2496*22dc650dSSadaf Ebrahimi count++;
2497*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset, count);
2498*22dc650dSSadaf Ebrahimi }
2499*22dc650dSSadaf Ebrahimi }
2500*22dc650dSSadaf Ebrahimi break;
2501*22dc650dSSadaf Ebrahimi
2502*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2503*22dc650dSSadaf Ebrahimi case OP_QUERYI:
2504*22dc650dSSadaf Ebrahimi case OP_MINQUERYI:
2505*22dc650dSSadaf Ebrahimi case OP_POSQUERYI:
2506*22dc650dSSadaf Ebrahimi case OP_NOTQUERYI:
2507*22dc650dSSadaf Ebrahimi case OP_NOTMINQUERYI:
2508*22dc650dSSadaf Ebrahimi case OP_NOTPOSQUERYI:
2509*22dc650dSSadaf Ebrahimi caseless = TRUE;
2510*22dc650dSSadaf Ebrahimi codevalue -= OP_STARI - OP_STAR;
2511*22dc650dSSadaf Ebrahimi /* Fall through */
2512*22dc650dSSadaf Ebrahimi case OP_QUERY:
2513*22dc650dSSadaf Ebrahimi case OP_MINQUERY:
2514*22dc650dSSadaf Ebrahimi case OP_POSQUERY:
2515*22dc650dSSadaf Ebrahimi case OP_NOTQUERY:
2516*22dc650dSSadaf Ebrahimi case OP_NOTMINQUERY:
2517*22dc650dSSadaf Ebrahimi case OP_NOTPOSQUERY:
2518*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + dlen + 1, 0);
2519*22dc650dSSadaf Ebrahimi if (clen > 0)
2520*22dc650dSSadaf Ebrahimi {
2521*22dc650dSSadaf Ebrahimi uint32_t otherd = NOTACHAR;
2522*22dc650dSSadaf Ebrahimi if (caseless)
2523*22dc650dSSadaf Ebrahimi {
2524*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2525*22dc650dSSadaf Ebrahimi if (utf_or_ucp && d >= 128)
2526*22dc650dSSadaf Ebrahimi otherd = UCD_OTHERCASE(d);
2527*22dc650dSSadaf Ebrahimi else
2528*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
2529*22dc650dSSadaf Ebrahimi otherd = TABLE_GET(d, fcc, d);
2530*22dc650dSSadaf Ebrahimi }
2531*22dc650dSSadaf Ebrahimi if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2532*22dc650dSSadaf Ebrahimi {
2533*22dc650dSSadaf Ebrahimi if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2534*22dc650dSSadaf Ebrahimi {
2535*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2536*22dc650dSSadaf Ebrahimi next_active_state--;
2537*22dc650dSSadaf Ebrahimi }
2538*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset + dlen + 1, 0);
2539*22dc650dSSadaf Ebrahimi }
2540*22dc650dSSadaf Ebrahimi }
2541*22dc650dSSadaf Ebrahimi break;
2542*22dc650dSSadaf Ebrahimi
2543*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2544*22dc650dSSadaf Ebrahimi case OP_STARI:
2545*22dc650dSSadaf Ebrahimi case OP_MINSTARI:
2546*22dc650dSSadaf Ebrahimi case OP_POSSTARI:
2547*22dc650dSSadaf Ebrahimi case OP_NOTSTARI:
2548*22dc650dSSadaf Ebrahimi case OP_NOTMINSTARI:
2549*22dc650dSSadaf Ebrahimi case OP_NOTPOSSTARI:
2550*22dc650dSSadaf Ebrahimi caseless = TRUE;
2551*22dc650dSSadaf Ebrahimi codevalue -= OP_STARI - OP_STAR;
2552*22dc650dSSadaf Ebrahimi /* Fall through */
2553*22dc650dSSadaf Ebrahimi case OP_STAR:
2554*22dc650dSSadaf Ebrahimi case OP_MINSTAR:
2555*22dc650dSSadaf Ebrahimi case OP_POSSTAR:
2556*22dc650dSSadaf Ebrahimi case OP_NOTSTAR:
2557*22dc650dSSadaf Ebrahimi case OP_NOTMINSTAR:
2558*22dc650dSSadaf Ebrahimi case OP_NOTPOSSTAR:
2559*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + dlen + 1, 0);
2560*22dc650dSSadaf Ebrahimi if (clen > 0)
2561*22dc650dSSadaf Ebrahimi {
2562*22dc650dSSadaf Ebrahimi uint32_t otherd = NOTACHAR;
2563*22dc650dSSadaf Ebrahimi if (caseless)
2564*22dc650dSSadaf Ebrahimi {
2565*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2566*22dc650dSSadaf Ebrahimi if (utf_or_ucp && d >= 128)
2567*22dc650dSSadaf Ebrahimi otherd = UCD_OTHERCASE(d);
2568*22dc650dSSadaf Ebrahimi else
2569*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
2570*22dc650dSSadaf Ebrahimi otherd = TABLE_GET(d, fcc, d);
2571*22dc650dSSadaf Ebrahimi }
2572*22dc650dSSadaf Ebrahimi if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2573*22dc650dSSadaf Ebrahimi {
2574*22dc650dSSadaf Ebrahimi if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2575*22dc650dSSadaf Ebrahimi {
2576*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2577*22dc650dSSadaf Ebrahimi next_active_state--;
2578*22dc650dSSadaf Ebrahimi }
2579*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset, 0);
2580*22dc650dSSadaf Ebrahimi }
2581*22dc650dSSadaf Ebrahimi }
2582*22dc650dSSadaf Ebrahimi break;
2583*22dc650dSSadaf Ebrahimi
2584*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2585*22dc650dSSadaf Ebrahimi case OP_EXACTI:
2586*22dc650dSSadaf Ebrahimi case OP_NOTEXACTI:
2587*22dc650dSSadaf Ebrahimi caseless = TRUE;
2588*22dc650dSSadaf Ebrahimi codevalue -= OP_STARI - OP_STAR;
2589*22dc650dSSadaf Ebrahimi /* Fall through */
2590*22dc650dSSadaf Ebrahimi case OP_EXACT:
2591*22dc650dSSadaf Ebrahimi case OP_NOTEXACT:
2592*22dc650dSSadaf Ebrahimi count = current_state->count; /* Number already matched */
2593*22dc650dSSadaf Ebrahimi if (clen > 0)
2594*22dc650dSSadaf Ebrahimi {
2595*22dc650dSSadaf Ebrahimi uint32_t otherd = NOTACHAR;
2596*22dc650dSSadaf Ebrahimi if (caseless)
2597*22dc650dSSadaf Ebrahimi {
2598*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2599*22dc650dSSadaf Ebrahimi if (utf_or_ucp && d >= 128)
2600*22dc650dSSadaf Ebrahimi otherd = UCD_OTHERCASE(d);
2601*22dc650dSSadaf Ebrahimi else
2602*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
2603*22dc650dSSadaf Ebrahimi otherd = TABLE_GET(d, fcc, d);
2604*22dc650dSSadaf Ebrahimi }
2605*22dc650dSSadaf Ebrahimi if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2606*22dc650dSSadaf Ebrahimi {
2607*22dc650dSSadaf Ebrahimi if (++count >= (int)GET2(code, 1))
2608*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2609*22dc650dSSadaf Ebrahimi else
2610*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset, count); }
2611*22dc650dSSadaf Ebrahimi }
2612*22dc650dSSadaf Ebrahimi }
2613*22dc650dSSadaf Ebrahimi break;
2614*22dc650dSSadaf Ebrahimi
2615*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2616*22dc650dSSadaf Ebrahimi case OP_UPTOI:
2617*22dc650dSSadaf Ebrahimi case OP_MINUPTOI:
2618*22dc650dSSadaf Ebrahimi case OP_POSUPTOI:
2619*22dc650dSSadaf Ebrahimi case OP_NOTUPTOI:
2620*22dc650dSSadaf Ebrahimi case OP_NOTMINUPTOI:
2621*22dc650dSSadaf Ebrahimi case OP_NOTPOSUPTOI:
2622*22dc650dSSadaf Ebrahimi caseless = TRUE;
2623*22dc650dSSadaf Ebrahimi codevalue -= OP_STARI - OP_STAR;
2624*22dc650dSSadaf Ebrahimi /* Fall through */
2625*22dc650dSSadaf Ebrahimi case OP_UPTO:
2626*22dc650dSSadaf Ebrahimi case OP_MINUPTO:
2627*22dc650dSSadaf Ebrahimi case OP_POSUPTO:
2628*22dc650dSSadaf Ebrahimi case OP_NOTUPTO:
2629*22dc650dSSadaf Ebrahimi case OP_NOTMINUPTO:
2630*22dc650dSSadaf Ebrahimi case OP_NOTPOSUPTO:
2631*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2632*22dc650dSSadaf Ebrahimi count = current_state->count; /* Number already matched */
2633*22dc650dSSadaf Ebrahimi if (clen > 0)
2634*22dc650dSSadaf Ebrahimi {
2635*22dc650dSSadaf Ebrahimi uint32_t otherd = NOTACHAR;
2636*22dc650dSSadaf Ebrahimi if (caseless)
2637*22dc650dSSadaf Ebrahimi {
2638*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
2639*22dc650dSSadaf Ebrahimi if (utf_or_ucp && d >= 128)
2640*22dc650dSSadaf Ebrahimi otherd = UCD_OTHERCASE(d);
2641*22dc650dSSadaf Ebrahimi else
2642*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
2643*22dc650dSSadaf Ebrahimi otherd = TABLE_GET(d, fcc, d);
2644*22dc650dSSadaf Ebrahimi }
2645*22dc650dSSadaf Ebrahimi if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2646*22dc650dSSadaf Ebrahimi {
2647*22dc650dSSadaf Ebrahimi if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2648*22dc650dSSadaf Ebrahimi {
2649*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2650*22dc650dSSadaf Ebrahimi next_active_state--;
2651*22dc650dSSadaf Ebrahimi }
2652*22dc650dSSadaf Ebrahimi if (++count >= (int)GET2(code, 1))
2653*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2654*22dc650dSSadaf Ebrahimi else
2655*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset, count); }
2656*22dc650dSSadaf Ebrahimi }
2657*22dc650dSSadaf Ebrahimi }
2658*22dc650dSSadaf Ebrahimi break;
2659*22dc650dSSadaf Ebrahimi
2660*22dc650dSSadaf Ebrahimi
2661*22dc650dSSadaf Ebrahimi /* ========================================================================== */
2662*22dc650dSSadaf Ebrahimi /* These are the class-handling opcodes */
2663*22dc650dSSadaf Ebrahimi
2664*22dc650dSSadaf Ebrahimi case OP_CLASS:
2665*22dc650dSSadaf Ebrahimi case OP_NCLASS:
2666*22dc650dSSadaf Ebrahimi case OP_XCLASS:
2667*22dc650dSSadaf Ebrahimi {
2668*22dc650dSSadaf Ebrahimi BOOL isinclass = FALSE;
2669*22dc650dSSadaf Ebrahimi int next_state_offset;
2670*22dc650dSSadaf Ebrahimi PCRE2_SPTR ecode;
2671*22dc650dSSadaf Ebrahimi
2672*22dc650dSSadaf Ebrahimi /* For a simple class, there is always just a 32-byte table, and we
2673*22dc650dSSadaf Ebrahimi can set isinclass from it. */
2674*22dc650dSSadaf Ebrahimi
2675*22dc650dSSadaf Ebrahimi if (codevalue != OP_XCLASS)
2676*22dc650dSSadaf Ebrahimi {
2677*22dc650dSSadaf Ebrahimi ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2678*22dc650dSSadaf Ebrahimi if (clen > 0)
2679*22dc650dSSadaf Ebrahimi {
2680*22dc650dSSadaf Ebrahimi isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2681*22dc650dSSadaf Ebrahimi ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2682*22dc650dSSadaf Ebrahimi }
2683*22dc650dSSadaf Ebrahimi }
2684*22dc650dSSadaf Ebrahimi
2685*22dc650dSSadaf Ebrahimi /* An extended class may have a table or a list of single characters,
2686*22dc650dSSadaf Ebrahimi ranges, or both, and it may be positive or negative. There's a
2687*22dc650dSSadaf Ebrahimi function that sorts all this out. */
2688*22dc650dSSadaf Ebrahimi
2689*22dc650dSSadaf Ebrahimi else
2690*22dc650dSSadaf Ebrahimi {
2691*22dc650dSSadaf Ebrahimi ecode = code + GET(code, 1);
2692*22dc650dSSadaf Ebrahimi if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2693*22dc650dSSadaf Ebrahimi }
2694*22dc650dSSadaf Ebrahimi
2695*22dc650dSSadaf Ebrahimi /* At this point, isinclass is set for all kinds of class, and ecode
2696*22dc650dSSadaf Ebrahimi points to the byte after the end of the class. If there is a
2697*22dc650dSSadaf Ebrahimi quantifier, this is where it will be. */
2698*22dc650dSSadaf Ebrahimi
2699*22dc650dSSadaf Ebrahimi next_state_offset = (int)(ecode - start_code);
2700*22dc650dSSadaf Ebrahimi
2701*22dc650dSSadaf Ebrahimi switch (*ecode)
2702*22dc650dSSadaf Ebrahimi {
2703*22dc650dSSadaf Ebrahimi case OP_CRSTAR:
2704*22dc650dSSadaf Ebrahimi case OP_CRMINSTAR:
2705*22dc650dSSadaf Ebrahimi case OP_CRPOSSTAR:
2706*22dc650dSSadaf Ebrahimi ADD_ACTIVE(next_state_offset + 1, 0);
2707*22dc650dSSadaf Ebrahimi if (isinclass)
2708*22dc650dSSadaf Ebrahimi {
2709*22dc650dSSadaf Ebrahimi if (*ecode == OP_CRPOSSTAR)
2710*22dc650dSSadaf Ebrahimi {
2711*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2712*22dc650dSSadaf Ebrahimi next_active_state--;
2713*22dc650dSSadaf Ebrahimi }
2714*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset, 0);
2715*22dc650dSSadaf Ebrahimi }
2716*22dc650dSSadaf Ebrahimi break;
2717*22dc650dSSadaf Ebrahimi
2718*22dc650dSSadaf Ebrahimi case OP_CRPLUS:
2719*22dc650dSSadaf Ebrahimi case OP_CRMINPLUS:
2720*22dc650dSSadaf Ebrahimi case OP_CRPOSPLUS:
2721*22dc650dSSadaf Ebrahimi count = current_state->count; /* Already matched */
2722*22dc650dSSadaf Ebrahimi if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2723*22dc650dSSadaf Ebrahimi if (isinclass)
2724*22dc650dSSadaf Ebrahimi {
2725*22dc650dSSadaf Ebrahimi if (count > 0 && *ecode == OP_CRPOSPLUS)
2726*22dc650dSSadaf Ebrahimi {
2727*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2728*22dc650dSSadaf Ebrahimi next_active_state--;
2729*22dc650dSSadaf Ebrahimi }
2730*22dc650dSSadaf Ebrahimi count++;
2731*22dc650dSSadaf Ebrahimi ADD_NEW(state_offset, count);
2732*22dc650dSSadaf Ebrahimi }
2733*22dc650dSSadaf Ebrahimi break;
2734*22dc650dSSadaf Ebrahimi
2735*22dc650dSSadaf Ebrahimi case OP_CRQUERY:
2736*22dc650dSSadaf Ebrahimi case OP_CRMINQUERY:
2737*22dc650dSSadaf Ebrahimi case OP_CRPOSQUERY:
2738*22dc650dSSadaf Ebrahimi ADD_ACTIVE(next_state_offset + 1, 0);
2739*22dc650dSSadaf Ebrahimi if (isinclass)
2740*22dc650dSSadaf Ebrahimi {
2741*22dc650dSSadaf Ebrahimi if (*ecode == OP_CRPOSQUERY)
2742*22dc650dSSadaf Ebrahimi {
2743*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2744*22dc650dSSadaf Ebrahimi next_active_state--;
2745*22dc650dSSadaf Ebrahimi }
2746*22dc650dSSadaf Ebrahimi ADD_NEW(next_state_offset + 1, 0);
2747*22dc650dSSadaf Ebrahimi }
2748*22dc650dSSadaf Ebrahimi break;
2749*22dc650dSSadaf Ebrahimi
2750*22dc650dSSadaf Ebrahimi case OP_CRRANGE:
2751*22dc650dSSadaf Ebrahimi case OP_CRMINRANGE:
2752*22dc650dSSadaf Ebrahimi case OP_CRPOSRANGE:
2753*22dc650dSSadaf Ebrahimi count = current_state->count; /* Already matched */
2754*22dc650dSSadaf Ebrahimi if (count >= (int)GET2(ecode, 1))
2755*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2756*22dc650dSSadaf Ebrahimi if (isinclass)
2757*22dc650dSSadaf Ebrahimi {
2758*22dc650dSSadaf Ebrahimi int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2759*22dc650dSSadaf Ebrahimi
2760*22dc650dSSadaf Ebrahimi if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2761*22dc650dSSadaf Ebrahimi {
2762*22dc650dSSadaf Ebrahimi active_count--; /* Remove non-match possibility */
2763*22dc650dSSadaf Ebrahimi next_active_state--;
2764*22dc650dSSadaf Ebrahimi }
2765*22dc650dSSadaf Ebrahimi
2766*22dc650dSSadaf Ebrahimi if (++count >= max && max != 0) /* Max 0 => no limit */
2767*22dc650dSSadaf Ebrahimi { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2768*22dc650dSSadaf Ebrahimi else
2769*22dc650dSSadaf Ebrahimi { ADD_NEW(state_offset, count); }
2770*22dc650dSSadaf Ebrahimi }
2771*22dc650dSSadaf Ebrahimi break;
2772*22dc650dSSadaf Ebrahimi
2773*22dc650dSSadaf Ebrahimi default:
2774*22dc650dSSadaf Ebrahimi if (isinclass) { ADD_NEW(next_state_offset, 0); }
2775*22dc650dSSadaf Ebrahimi break;
2776*22dc650dSSadaf Ebrahimi }
2777*22dc650dSSadaf Ebrahimi }
2778*22dc650dSSadaf Ebrahimi break;
2779*22dc650dSSadaf Ebrahimi
2780*22dc650dSSadaf Ebrahimi /* ========================================================================== */
2781*22dc650dSSadaf Ebrahimi /* These are the opcodes for fancy brackets of various kinds. We have
2782*22dc650dSSadaf Ebrahimi to use recursion in order to handle them. The "always failing" assertion
2783*22dc650dSSadaf Ebrahimi (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2784*22dc650dSSadaf Ebrahimi though the other "backtracking verbs" are not supported. */
2785*22dc650dSSadaf Ebrahimi
2786*22dc650dSSadaf Ebrahimi case OP_FAIL:
2787*22dc650dSSadaf Ebrahimi forced_fail++; /* Count FAILs for multiple states */
2788*22dc650dSSadaf Ebrahimi break;
2789*22dc650dSSadaf Ebrahimi
2790*22dc650dSSadaf Ebrahimi case OP_ASSERT:
2791*22dc650dSSadaf Ebrahimi case OP_ASSERT_NOT:
2792*22dc650dSSadaf Ebrahimi case OP_ASSERTBACK:
2793*22dc650dSSadaf Ebrahimi case OP_ASSERTBACK_NOT:
2794*22dc650dSSadaf Ebrahimi {
2795*22dc650dSSadaf Ebrahimi int rc;
2796*22dc650dSSadaf Ebrahimi int *local_workspace;
2797*22dc650dSSadaf Ebrahimi PCRE2_SIZE *local_offsets;
2798*22dc650dSSadaf Ebrahimi PCRE2_SPTR endasscode = code + GET(code, 1);
2799*22dc650dSSadaf Ebrahimi RWS_anchor *rws = (RWS_anchor *)RWS;
2800*22dc650dSSadaf Ebrahimi
2801*22dc650dSSadaf Ebrahimi if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2802*22dc650dSSadaf Ebrahimi {
2803*22dc650dSSadaf Ebrahimi rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2804*22dc650dSSadaf Ebrahimi if (rc != 0) return rc;
2805*22dc650dSSadaf Ebrahimi RWS = (int *)rws;
2806*22dc650dSSadaf Ebrahimi }
2807*22dc650dSSadaf Ebrahimi
2808*22dc650dSSadaf Ebrahimi local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2809*22dc650dSSadaf Ebrahimi local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2810*22dc650dSSadaf Ebrahimi rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2811*22dc650dSSadaf Ebrahimi
2812*22dc650dSSadaf Ebrahimi while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2813*22dc650dSSadaf Ebrahimi
2814*22dc650dSSadaf Ebrahimi rc = internal_dfa_match(
2815*22dc650dSSadaf Ebrahimi mb, /* static match data */
2816*22dc650dSSadaf Ebrahimi code, /* this subexpression's code */
2817*22dc650dSSadaf Ebrahimi ptr, /* where we currently are */
2818*22dc650dSSadaf Ebrahimi (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2819*22dc650dSSadaf Ebrahimi local_offsets, /* offset vector */
2820*22dc650dSSadaf Ebrahimi RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2821*22dc650dSSadaf Ebrahimi local_workspace, /* workspace vector */
2822*22dc650dSSadaf Ebrahimi RWS_RSIZE, /* size of same */
2823*22dc650dSSadaf Ebrahimi rlevel, /* function recursion level */
2824*22dc650dSSadaf Ebrahimi RWS); /* recursion workspace */
2825*22dc650dSSadaf Ebrahimi
2826*22dc650dSSadaf Ebrahimi rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2827*22dc650dSSadaf Ebrahimi
2828*22dc650dSSadaf Ebrahimi if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2829*22dc650dSSadaf Ebrahimi if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2830*22dc650dSSadaf Ebrahimi { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2831*22dc650dSSadaf Ebrahimi }
2832*22dc650dSSadaf Ebrahimi break;
2833*22dc650dSSadaf Ebrahimi
2834*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2835*22dc650dSSadaf Ebrahimi case OP_COND:
2836*22dc650dSSadaf Ebrahimi case OP_SCOND:
2837*22dc650dSSadaf Ebrahimi {
2838*22dc650dSSadaf Ebrahimi int codelink = (int)GET(code, 1);
2839*22dc650dSSadaf Ebrahimi PCRE2_UCHAR condcode;
2840*22dc650dSSadaf Ebrahimi
2841*22dc650dSSadaf Ebrahimi /* Because of the way auto-callout works during compile, a callout item
2842*22dc650dSSadaf Ebrahimi is inserted between OP_COND and an assertion condition. This does not
2843*22dc650dSSadaf Ebrahimi happen for the other conditions. */
2844*22dc650dSSadaf Ebrahimi
2845*22dc650dSSadaf Ebrahimi if (code[LINK_SIZE + 1] == OP_CALLOUT
2846*22dc650dSSadaf Ebrahimi || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2847*22dc650dSSadaf Ebrahimi {
2848*22dc650dSSadaf Ebrahimi PCRE2_SIZE callout_length;
2849*22dc650dSSadaf Ebrahimi rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2850*22dc650dSSadaf Ebrahimi 1 + LINK_SIZE, &callout_length);
2851*22dc650dSSadaf Ebrahimi if (rrc < 0) return rrc; /* Abandon */
2852*22dc650dSSadaf Ebrahimi if (rrc > 0) break; /* Fail this thread */
2853*22dc650dSSadaf Ebrahimi code += callout_length; /* Skip callout data */
2854*22dc650dSSadaf Ebrahimi }
2855*22dc650dSSadaf Ebrahimi
2856*22dc650dSSadaf Ebrahimi condcode = code[LINK_SIZE+1];
2857*22dc650dSSadaf Ebrahimi
2858*22dc650dSSadaf Ebrahimi /* Back reference conditions and duplicate named recursion conditions
2859*22dc650dSSadaf Ebrahimi are not supported */
2860*22dc650dSSadaf Ebrahimi
2861*22dc650dSSadaf Ebrahimi if (condcode == OP_CREF || condcode == OP_DNCREF ||
2862*22dc650dSSadaf Ebrahimi condcode == OP_DNRREF)
2863*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_DFA_UCOND;
2864*22dc650dSSadaf Ebrahimi
2865*22dc650dSSadaf Ebrahimi /* The DEFINE condition is always false, and the assertion (?!) is
2866*22dc650dSSadaf Ebrahimi converted to OP_FAIL. */
2867*22dc650dSSadaf Ebrahimi
2868*22dc650dSSadaf Ebrahimi if (condcode == OP_FALSE || condcode == OP_FAIL)
2869*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2870*22dc650dSSadaf Ebrahimi
2871*22dc650dSSadaf Ebrahimi /* There is also an always-true condition */
2872*22dc650dSSadaf Ebrahimi
2873*22dc650dSSadaf Ebrahimi else if (condcode == OP_TRUE)
2874*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2875*22dc650dSSadaf Ebrahimi
2876*22dc650dSSadaf Ebrahimi /* The only supported version of OP_RREF is for the value RREF_ANY,
2877*22dc650dSSadaf Ebrahimi which means "test if in any recursion". We can't test for specifically
2878*22dc650dSSadaf Ebrahimi recursed groups. */
2879*22dc650dSSadaf Ebrahimi
2880*22dc650dSSadaf Ebrahimi else if (condcode == OP_RREF)
2881*22dc650dSSadaf Ebrahimi {
2882*22dc650dSSadaf Ebrahimi unsigned int value = GET2(code, LINK_SIZE + 2);
2883*22dc650dSSadaf Ebrahimi if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2884*22dc650dSSadaf Ebrahimi if (mb->recursive != NULL)
2885*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2886*22dc650dSSadaf Ebrahimi else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2887*22dc650dSSadaf Ebrahimi }
2888*22dc650dSSadaf Ebrahimi
2889*22dc650dSSadaf Ebrahimi /* Otherwise, the condition is an assertion */
2890*22dc650dSSadaf Ebrahimi
2891*22dc650dSSadaf Ebrahimi else
2892*22dc650dSSadaf Ebrahimi {
2893*22dc650dSSadaf Ebrahimi int rc;
2894*22dc650dSSadaf Ebrahimi int *local_workspace;
2895*22dc650dSSadaf Ebrahimi PCRE2_SIZE *local_offsets;
2896*22dc650dSSadaf Ebrahimi PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2897*22dc650dSSadaf Ebrahimi PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2898*22dc650dSSadaf Ebrahimi RWS_anchor *rws = (RWS_anchor *)RWS;
2899*22dc650dSSadaf Ebrahimi
2900*22dc650dSSadaf Ebrahimi if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2901*22dc650dSSadaf Ebrahimi {
2902*22dc650dSSadaf Ebrahimi rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2903*22dc650dSSadaf Ebrahimi if (rc != 0) return rc;
2904*22dc650dSSadaf Ebrahimi RWS = (int *)rws;
2905*22dc650dSSadaf Ebrahimi }
2906*22dc650dSSadaf Ebrahimi
2907*22dc650dSSadaf Ebrahimi local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2908*22dc650dSSadaf Ebrahimi local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2909*22dc650dSSadaf Ebrahimi rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2910*22dc650dSSadaf Ebrahimi
2911*22dc650dSSadaf Ebrahimi while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2912*22dc650dSSadaf Ebrahimi
2913*22dc650dSSadaf Ebrahimi rc = internal_dfa_match(
2914*22dc650dSSadaf Ebrahimi mb, /* fixed match data */
2915*22dc650dSSadaf Ebrahimi asscode, /* this subexpression's code */
2916*22dc650dSSadaf Ebrahimi ptr, /* where we currently are */
2917*22dc650dSSadaf Ebrahimi (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2918*22dc650dSSadaf Ebrahimi local_offsets, /* offset vector */
2919*22dc650dSSadaf Ebrahimi RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2920*22dc650dSSadaf Ebrahimi local_workspace, /* workspace vector */
2921*22dc650dSSadaf Ebrahimi RWS_RSIZE, /* size of same */
2922*22dc650dSSadaf Ebrahimi rlevel, /* function recursion level */
2923*22dc650dSSadaf Ebrahimi RWS); /* recursion workspace */
2924*22dc650dSSadaf Ebrahimi
2925*22dc650dSSadaf Ebrahimi rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2926*22dc650dSSadaf Ebrahimi
2927*22dc650dSSadaf Ebrahimi if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2928*22dc650dSSadaf Ebrahimi if ((rc >= 0) ==
2929*22dc650dSSadaf Ebrahimi (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2930*22dc650dSSadaf Ebrahimi { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2931*22dc650dSSadaf Ebrahimi else
2932*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2933*22dc650dSSadaf Ebrahimi }
2934*22dc650dSSadaf Ebrahimi }
2935*22dc650dSSadaf Ebrahimi break;
2936*22dc650dSSadaf Ebrahimi
2937*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
2938*22dc650dSSadaf Ebrahimi case OP_RECURSE:
2939*22dc650dSSadaf Ebrahimi {
2940*22dc650dSSadaf Ebrahimi int rc;
2941*22dc650dSSadaf Ebrahimi int *local_workspace;
2942*22dc650dSSadaf Ebrahimi PCRE2_SIZE *local_offsets;
2943*22dc650dSSadaf Ebrahimi RWS_anchor *rws = (RWS_anchor *)RWS;
2944*22dc650dSSadaf Ebrahimi PCRE2_SPTR callpat = start_code + GET(code, 1);
2945*22dc650dSSadaf Ebrahimi uint32_t recno = (callpat == mb->start_code)? 0 :
2946*22dc650dSSadaf Ebrahimi GET2(callpat, 1 + LINK_SIZE);
2947*22dc650dSSadaf Ebrahimi
2948*22dc650dSSadaf Ebrahimi if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2949*22dc650dSSadaf Ebrahimi {
2950*22dc650dSSadaf Ebrahimi rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2951*22dc650dSSadaf Ebrahimi if (rc != 0) return rc;
2952*22dc650dSSadaf Ebrahimi RWS = (int *)rws;
2953*22dc650dSSadaf Ebrahimi }
2954*22dc650dSSadaf Ebrahimi
2955*22dc650dSSadaf Ebrahimi local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2956*22dc650dSSadaf Ebrahimi local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2957*22dc650dSSadaf Ebrahimi rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2958*22dc650dSSadaf Ebrahimi
2959*22dc650dSSadaf Ebrahimi /* Check for repeating a recursion without advancing the subject
2960*22dc650dSSadaf Ebrahimi pointer or last used character. This should catch convoluted mutual
2961*22dc650dSSadaf Ebrahimi recursions. (Some simple cases are caught at compile time.) */
2962*22dc650dSSadaf Ebrahimi
2963*22dc650dSSadaf Ebrahimi for (dfa_recursion_info *ri = mb->recursive;
2964*22dc650dSSadaf Ebrahimi ri != NULL;
2965*22dc650dSSadaf Ebrahimi ri = ri->prevrec)
2966*22dc650dSSadaf Ebrahimi {
2967*22dc650dSSadaf Ebrahimi if (recno == ri->group_num && ptr == ri->subject_position &&
2968*22dc650dSSadaf Ebrahimi mb->last_used_ptr == ri->last_used_ptr)
2969*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_RECURSELOOP;
2970*22dc650dSSadaf Ebrahimi }
2971*22dc650dSSadaf Ebrahimi
2972*22dc650dSSadaf Ebrahimi /* Remember this recursion and where we started it so as to
2973*22dc650dSSadaf Ebrahimi catch infinite loops. */
2974*22dc650dSSadaf Ebrahimi
2975*22dc650dSSadaf Ebrahimi new_recursive.group_num = recno;
2976*22dc650dSSadaf Ebrahimi new_recursive.subject_position = ptr;
2977*22dc650dSSadaf Ebrahimi new_recursive.last_used_ptr = mb->last_used_ptr;
2978*22dc650dSSadaf Ebrahimi new_recursive.prevrec = mb->recursive;
2979*22dc650dSSadaf Ebrahimi mb->recursive = &new_recursive;
2980*22dc650dSSadaf Ebrahimi
2981*22dc650dSSadaf Ebrahimi rc = internal_dfa_match(
2982*22dc650dSSadaf Ebrahimi mb, /* fixed match data */
2983*22dc650dSSadaf Ebrahimi callpat, /* this subexpression's code */
2984*22dc650dSSadaf Ebrahimi ptr, /* where we currently are */
2985*22dc650dSSadaf Ebrahimi (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2986*22dc650dSSadaf Ebrahimi local_offsets, /* offset vector */
2987*22dc650dSSadaf Ebrahimi RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
2988*22dc650dSSadaf Ebrahimi local_workspace, /* workspace vector */
2989*22dc650dSSadaf Ebrahimi RWS_RSIZE, /* size of same */
2990*22dc650dSSadaf Ebrahimi rlevel, /* function recursion level */
2991*22dc650dSSadaf Ebrahimi RWS); /* recursion workspace */
2992*22dc650dSSadaf Ebrahimi
2993*22dc650dSSadaf Ebrahimi rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2994*22dc650dSSadaf Ebrahimi mb->recursive = new_recursive.prevrec; /* Done this recursion */
2995*22dc650dSSadaf Ebrahimi
2996*22dc650dSSadaf Ebrahimi /* Ran out of internal offsets */
2997*22dc650dSSadaf Ebrahimi
2998*22dc650dSSadaf Ebrahimi if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2999*22dc650dSSadaf Ebrahimi
3000*22dc650dSSadaf Ebrahimi /* For each successful matched substring, set up the next state with a
3001*22dc650dSSadaf Ebrahimi count of characters to skip before trying it. Note that the count is in
3002*22dc650dSSadaf Ebrahimi characters, not bytes. */
3003*22dc650dSSadaf Ebrahimi
3004*22dc650dSSadaf Ebrahimi if (rc > 0)
3005*22dc650dSSadaf Ebrahimi {
3006*22dc650dSSadaf Ebrahimi for (rc = rc*2 - 2; rc >= 0; rc -= 2)
3007*22dc650dSSadaf Ebrahimi {
3008*22dc650dSSadaf Ebrahimi PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
3009*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3010*22dc650dSSadaf Ebrahimi if (utf)
3011*22dc650dSSadaf Ebrahimi {
3012*22dc650dSSadaf Ebrahimi PCRE2_SPTR p = start_subject + local_offsets[rc];
3013*22dc650dSSadaf Ebrahimi PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
3014*22dc650dSSadaf Ebrahimi while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3015*22dc650dSSadaf Ebrahimi }
3016*22dc650dSSadaf Ebrahimi #endif
3017*22dc650dSSadaf Ebrahimi if (charcount > 0)
3018*22dc650dSSadaf Ebrahimi {
3019*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
3020*22dc650dSSadaf Ebrahimi (int)(charcount - 1));
3021*22dc650dSSadaf Ebrahimi }
3022*22dc650dSSadaf Ebrahimi else
3023*22dc650dSSadaf Ebrahimi {
3024*22dc650dSSadaf Ebrahimi ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
3025*22dc650dSSadaf Ebrahimi }
3026*22dc650dSSadaf Ebrahimi }
3027*22dc650dSSadaf Ebrahimi }
3028*22dc650dSSadaf Ebrahimi else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3029*22dc650dSSadaf Ebrahimi }
3030*22dc650dSSadaf Ebrahimi break;
3031*22dc650dSSadaf Ebrahimi
3032*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
3033*22dc650dSSadaf Ebrahimi case OP_BRAPOS:
3034*22dc650dSSadaf Ebrahimi case OP_SBRAPOS:
3035*22dc650dSSadaf Ebrahimi case OP_CBRAPOS:
3036*22dc650dSSadaf Ebrahimi case OP_SCBRAPOS:
3037*22dc650dSSadaf Ebrahimi case OP_BRAPOSZERO:
3038*22dc650dSSadaf Ebrahimi {
3039*22dc650dSSadaf Ebrahimi int rc;
3040*22dc650dSSadaf Ebrahimi int *local_workspace;
3041*22dc650dSSadaf Ebrahimi PCRE2_SIZE *local_offsets;
3042*22dc650dSSadaf Ebrahimi PCRE2_SIZE charcount, matched_count;
3043*22dc650dSSadaf Ebrahimi PCRE2_SPTR local_ptr = ptr;
3044*22dc650dSSadaf Ebrahimi RWS_anchor *rws = (RWS_anchor *)RWS;
3045*22dc650dSSadaf Ebrahimi BOOL allow_zero;
3046*22dc650dSSadaf Ebrahimi
3047*22dc650dSSadaf Ebrahimi if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3048*22dc650dSSadaf Ebrahimi {
3049*22dc650dSSadaf Ebrahimi rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3050*22dc650dSSadaf Ebrahimi if (rc != 0) return rc;
3051*22dc650dSSadaf Ebrahimi RWS = (int *)rws;
3052*22dc650dSSadaf Ebrahimi }
3053*22dc650dSSadaf Ebrahimi
3054*22dc650dSSadaf Ebrahimi local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3055*22dc650dSSadaf Ebrahimi local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3056*22dc650dSSadaf Ebrahimi rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3057*22dc650dSSadaf Ebrahimi
3058*22dc650dSSadaf Ebrahimi if (codevalue == OP_BRAPOSZERO)
3059*22dc650dSSadaf Ebrahimi {
3060*22dc650dSSadaf Ebrahimi allow_zero = TRUE;
3061*22dc650dSSadaf Ebrahimi codevalue = *(++code); /* Codevalue will be one of above BRAs */
3062*22dc650dSSadaf Ebrahimi }
3063*22dc650dSSadaf Ebrahimi else allow_zero = FALSE;
3064*22dc650dSSadaf Ebrahimi
3065*22dc650dSSadaf Ebrahimi /* Loop to match the subpattern as many times as possible as if it were
3066*22dc650dSSadaf Ebrahimi a complete pattern. */
3067*22dc650dSSadaf Ebrahimi
3068*22dc650dSSadaf Ebrahimi for (matched_count = 0;; matched_count++)
3069*22dc650dSSadaf Ebrahimi {
3070*22dc650dSSadaf Ebrahimi rc = internal_dfa_match(
3071*22dc650dSSadaf Ebrahimi mb, /* fixed match data */
3072*22dc650dSSadaf Ebrahimi code, /* this subexpression's code */
3073*22dc650dSSadaf Ebrahimi local_ptr, /* where we currently are */
3074*22dc650dSSadaf Ebrahimi (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3075*22dc650dSSadaf Ebrahimi local_offsets, /* offset vector */
3076*22dc650dSSadaf Ebrahimi RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3077*22dc650dSSadaf Ebrahimi local_workspace, /* workspace vector */
3078*22dc650dSSadaf Ebrahimi RWS_RSIZE, /* size of same */
3079*22dc650dSSadaf Ebrahimi rlevel, /* function recursion level */
3080*22dc650dSSadaf Ebrahimi RWS); /* recursion workspace */
3081*22dc650dSSadaf Ebrahimi
3082*22dc650dSSadaf Ebrahimi /* Failed to match */
3083*22dc650dSSadaf Ebrahimi
3084*22dc650dSSadaf Ebrahimi if (rc < 0)
3085*22dc650dSSadaf Ebrahimi {
3086*22dc650dSSadaf Ebrahimi if (rc != PCRE2_ERROR_NOMATCH) return rc;
3087*22dc650dSSadaf Ebrahimi break;
3088*22dc650dSSadaf Ebrahimi }
3089*22dc650dSSadaf Ebrahimi
3090*22dc650dSSadaf Ebrahimi /* Matched: break the loop if zero characters matched. */
3091*22dc650dSSadaf Ebrahimi
3092*22dc650dSSadaf Ebrahimi charcount = local_offsets[1] - local_offsets[0];
3093*22dc650dSSadaf Ebrahimi if (charcount == 0) break;
3094*22dc650dSSadaf Ebrahimi local_ptr += charcount; /* Advance temporary position ptr */
3095*22dc650dSSadaf Ebrahimi }
3096*22dc650dSSadaf Ebrahimi
3097*22dc650dSSadaf Ebrahimi rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3098*22dc650dSSadaf Ebrahimi
3099*22dc650dSSadaf Ebrahimi /* At this point we have matched the subpattern matched_count
3100*22dc650dSSadaf Ebrahimi times, and local_ptr is pointing to the character after the end of the
3101*22dc650dSSadaf Ebrahimi last match. */
3102*22dc650dSSadaf Ebrahimi
3103*22dc650dSSadaf Ebrahimi if (matched_count > 0 || allow_zero)
3104*22dc650dSSadaf Ebrahimi {
3105*22dc650dSSadaf Ebrahimi PCRE2_SPTR end_subpattern = code;
3106*22dc650dSSadaf Ebrahimi int next_state_offset;
3107*22dc650dSSadaf Ebrahimi
3108*22dc650dSSadaf Ebrahimi do { end_subpattern += GET(end_subpattern, 1); }
3109*22dc650dSSadaf Ebrahimi while (*end_subpattern == OP_ALT);
3110*22dc650dSSadaf Ebrahimi next_state_offset =
3111*22dc650dSSadaf Ebrahimi (int)(end_subpattern - start_code + LINK_SIZE + 1);
3112*22dc650dSSadaf Ebrahimi
3113*22dc650dSSadaf Ebrahimi /* Optimization: if there are no more active states, and there
3114*22dc650dSSadaf Ebrahimi are no new states yet set up, then skip over the subject string
3115*22dc650dSSadaf Ebrahimi right here, to save looping. Otherwise, set up the new state to swing
3116*22dc650dSSadaf Ebrahimi into action when the end of the matched substring is reached. */
3117*22dc650dSSadaf Ebrahimi
3118*22dc650dSSadaf Ebrahimi if (i + 1 >= active_count && new_count == 0)
3119*22dc650dSSadaf Ebrahimi {
3120*22dc650dSSadaf Ebrahimi ptr = local_ptr;
3121*22dc650dSSadaf Ebrahimi clen = 0;
3122*22dc650dSSadaf Ebrahimi ADD_NEW(next_state_offset, 0);
3123*22dc650dSSadaf Ebrahimi }
3124*22dc650dSSadaf Ebrahimi else
3125*22dc650dSSadaf Ebrahimi {
3126*22dc650dSSadaf Ebrahimi PCRE2_SPTR p = ptr;
3127*22dc650dSSadaf Ebrahimi PCRE2_SPTR pp = local_ptr;
3128*22dc650dSSadaf Ebrahimi charcount = (PCRE2_SIZE)(pp - p);
3129*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3130*22dc650dSSadaf Ebrahimi if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3131*22dc650dSSadaf Ebrahimi #endif
3132*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3133*22dc650dSSadaf Ebrahimi }
3134*22dc650dSSadaf Ebrahimi }
3135*22dc650dSSadaf Ebrahimi }
3136*22dc650dSSadaf Ebrahimi break;
3137*22dc650dSSadaf Ebrahimi
3138*22dc650dSSadaf Ebrahimi /*-----------------------------------------------------------------*/
3139*22dc650dSSadaf Ebrahimi case OP_ONCE:
3140*22dc650dSSadaf Ebrahimi {
3141*22dc650dSSadaf Ebrahimi int rc;
3142*22dc650dSSadaf Ebrahimi int *local_workspace;
3143*22dc650dSSadaf Ebrahimi PCRE2_SIZE *local_offsets;
3144*22dc650dSSadaf Ebrahimi RWS_anchor *rws = (RWS_anchor *)RWS;
3145*22dc650dSSadaf Ebrahimi
3146*22dc650dSSadaf Ebrahimi if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3147*22dc650dSSadaf Ebrahimi {
3148*22dc650dSSadaf Ebrahimi rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3149*22dc650dSSadaf Ebrahimi if (rc != 0) return rc;
3150*22dc650dSSadaf Ebrahimi RWS = (int *)rws;
3151*22dc650dSSadaf Ebrahimi }
3152*22dc650dSSadaf Ebrahimi
3153*22dc650dSSadaf Ebrahimi local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3154*22dc650dSSadaf Ebrahimi local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3155*22dc650dSSadaf Ebrahimi rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3156*22dc650dSSadaf Ebrahimi
3157*22dc650dSSadaf Ebrahimi rc = internal_dfa_match(
3158*22dc650dSSadaf Ebrahimi mb, /* fixed match data */
3159*22dc650dSSadaf Ebrahimi code, /* this subexpression's code */
3160*22dc650dSSadaf Ebrahimi ptr, /* where we currently are */
3161*22dc650dSSadaf Ebrahimi (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3162*22dc650dSSadaf Ebrahimi local_offsets, /* offset vector */
3163*22dc650dSSadaf Ebrahimi RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3164*22dc650dSSadaf Ebrahimi local_workspace, /* workspace vector */
3165*22dc650dSSadaf Ebrahimi RWS_RSIZE, /* size of same */
3166*22dc650dSSadaf Ebrahimi rlevel, /* function recursion level */
3167*22dc650dSSadaf Ebrahimi RWS); /* recursion workspace */
3168*22dc650dSSadaf Ebrahimi
3169*22dc650dSSadaf Ebrahimi rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3170*22dc650dSSadaf Ebrahimi
3171*22dc650dSSadaf Ebrahimi if (rc >= 0)
3172*22dc650dSSadaf Ebrahimi {
3173*22dc650dSSadaf Ebrahimi PCRE2_SPTR end_subpattern = code;
3174*22dc650dSSadaf Ebrahimi PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3175*22dc650dSSadaf Ebrahimi int next_state_offset, repeat_state_offset;
3176*22dc650dSSadaf Ebrahimi
3177*22dc650dSSadaf Ebrahimi do { end_subpattern += GET(end_subpattern, 1); }
3178*22dc650dSSadaf Ebrahimi while (*end_subpattern == OP_ALT);
3179*22dc650dSSadaf Ebrahimi next_state_offset =
3180*22dc650dSSadaf Ebrahimi (int)(end_subpattern - start_code + LINK_SIZE + 1);
3181*22dc650dSSadaf Ebrahimi
3182*22dc650dSSadaf Ebrahimi /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3183*22dc650dSSadaf Ebrahimi arrange for the repeat state also to be added to the relevant list.
3184*22dc650dSSadaf Ebrahimi Calculate the offset, or set -1 for no repeat. */
3185*22dc650dSSadaf Ebrahimi
3186*22dc650dSSadaf Ebrahimi repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3187*22dc650dSSadaf Ebrahimi *end_subpattern == OP_KETRMIN)?
3188*22dc650dSSadaf Ebrahimi (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3189*22dc650dSSadaf Ebrahimi
3190*22dc650dSSadaf Ebrahimi /* If we have matched an empty string, add the next state at the
3191*22dc650dSSadaf Ebrahimi current character pointer. This is important so that the duplicate
3192*22dc650dSSadaf Ebrahimi checking kicks in, which is what breaks infinite loops that match an
3193*22dc650dSSadaf Ebrahimi empty string. */
3194*22dc650dSSadaf Ebrahimi
3195*22dc650dSSadaf Ebrahimi if (charcount == 0)
3196*22dc650dSSadaf Ebrahimi {
3197*22dc650dSSadaf Ebrahimi ADD_ACTIVE(next_state_offset, 0);
3198*22dc650dSSadaf Ebrahimi }
3199*22dc650dSSadaf Ebrahimi
3200*22dc650dSSadaf Ebrahimi /* Optimization: if there are no more active states, and there
3201*22dc650dSSadaf Ebrahimi are no new states yet set up, then skip over the subject string
3202*22dc650dSSadaf Ebrahimi right here, to save looping. Otherwise, set up the new state to swing
3203*22dc650dSSadaf Ebrahimi into action when the end of the matched substring is reached. */
3204*22dc650dSSadaf Ebrahimi
3205*22dc650dSSadaf Ebrahimi else if (i + 1 >= active_count && new_count == 0)
3206*22dc650dSSadaf Ebrahimi {
3207*22dc650dSSadaf Ebrahimi ptr += charcount;
3208*22dc650dSSadaf Ebrahimi clen = 0;
3209*22dc650dSSadaf Ebrahimi ADD_NEW(next_state_offset, 0);
3210*22dc650dSSadaf Ebrahimi
3211*22dc650dSSadaf Ebrahimi /* If we are adding a repeat state at the new character position,
3212*22dc650dSSadaf Ebrahimi we must fudge things so that it is the only current state.
3213*22dc650dSSadaf Ebrahimi Otherwise, it might be a duplicate of one we processed before, and
3214*22dc650dSSadaf Ebrahimi that would cause it to be skipped. */
3215*22dc650dSSadaf Ebrahimi
3216*22dc650dSSadaf Ebrahimi if (repeat_state_offset >= 0)
3217*22dc650dSSadaf Ebrahimi {
3218*22dc650dSSadaf Ebrahimi next_active_state = active_states;
3219*22dc650dSSadaf Ebrahimi active_count = 0;
3220*22dc650dSSadaf Ebrahimi i = -1;
3221*22dc650dSSadaf Ebrahimi ADD_ACTIVE(repeat_state_offset, 0);
3222*22dc650dSSadaf Ebrahimi }
3223*22dc650dSSadaf Ebrahimi }
3224*22dc650dSSadaf Ebrahimi else
3225*22dc650dSSadaf Ebrahimi {
3226*22dc650dSSadaf Ebrahimi #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3227*22dc650dSSadaf Ebrahimi if (utf)
3228*22dc650dSSadaf Ebrahimi {
3229*22dc650dSSadaf Ebrahimi PCRE2_SPTR p = start_subject + local_offsets[0];
3230*22dc650dSSadaf Ebrahimi PCRE2_SPTR pp = start_subject + local_offsets[1];
3231*22dc650dSSadaf Ebrahimi while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3232*22dc650dSSadaf Ebrahimi }
3233*22dc650dSSadaf Ebrahimi #endif
3234*22dc650dSSadaf Ebrahimi ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3235*22dc650dSSadaf Ebrahimi if (repeat_state_offset >= 0)
3236*22dc650dSSadaf Ebrahimi { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3237*22dc650dSSadaf Ebrahimi }
3238*22dc650dSSadaf Ebrahimi }
3239*22dc650dSSadaf Ebrahimi else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3240*22dc650dSSadaf Ebrahimi }
3241*22dc650dSSadaf Ebrahimi break;
3242*22dc650dSSadaf Ebrahimi
3243*22dc650dSSadaf Ebrahimi
3244*22dc650dSSadaf Ebrahimi /* ========================================================================== */
3245*22dc650dSSadaf Ebrahimi /* Handle callouts */
3246*22dc650dSSadaf Ebrahimi
3247*22dc650dSSadaf Ebrahimi case OP_CALLOUT:
3248*22dc650dSSadaf Ebrahimi case OP_CALLOUT_STR:
3249*22dc650dSSadaf Ebrahimi {
3250*22dc650dSSadaf Ebrahimi PCRE2_SIZE callout_length;
3251*22dc650dSSadaf Ebrahimi rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
3252*22dc650dSSadaf Ebrahimi &callout_length);
3253*22dc650dSSadaf Ebrahimi if (rrc < 0) return rrc; /* Abandon */
3254*22dc650dSSadaf Ebrahimi if (rrc == 0)
3255*22dc650dSSadaf Ebrahimi { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3256*22dc650dSSadaf Ebrahimi }
3257*22dc650dSSadaf Ebrahimi break;
3258*22dc650dSSadaf Ebrahimi
3259*22dc650dSSadaf Ebrahimi
3260*22dc650dSSadaf Ebrahimi /* ========================================================================== */
3261*22dc650dSSadaf Ebrahimi default: /* Unsupported opcode */
3262*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_DFA_UITEM;
3263*22dc650dSSadaf Ebrahimi }
3264*22dc650dSSadaf Ebrahimi
3265*22dc650dSSadaf Ebrahimi NEXT_ACTIVE_STATE: continue;
3266*22dc650dSSadaf Ebrahimi
3267*22dc650dSSadaf Ebrahimi } /* End of loop scanning active states */
3268*22dc650dSSadaf Ebrahimi
3269*22dc650dSSadaf Ebrahimi /* We have finished the processing at the current subject character. If no
3270*22dc650dSSadaf Ebrahimi new states have been set for the next character, we have found all the
3271*22dc650dSSadaf Ebrahimi matches that we are going to find. If partial matching has been requested,
3272*22dc650dSSadaf Ebrahimi check for appropriate conditions.
3273*22dc650dSSadaf Ebrahimi
3274*22dc650dSSadaf Ebrahimi The "forced_fail" variable counts the number of (*F) encountered for the
3275*22dc650dSSadaf Ebrahimi character. If it is equal to the original active_count (saved in
3276*22dc650dSSadaf Ebrahimi workspace[1]) it means that (*F) was found on every active state. In this
3277*22dc650dSSadaf Ebrahimi case we don't want to give a partial match.
3278*22dc650dSSadaf Ebrahimi
3279*22dc650dSSadaf Ebrahimi The "could_continue" variable is true if a state could have continued but
3280*22dc650dSSadaf Ebrahimi for the fact that the end of the subject was reached. */
3281*22dc650dSSadaf Ebrahimi
3282*22dc650dSSadaf Ebrahimi if (new_count <= 0)
3283*22dc650dSSadaf Ebrahimi {
3284*22dc650dSSadaf Ebrahimi if (could_continue && /* Some could go on, and */
3285*22dc650dSSadaf Ebrahimi forced_fail != workspace[1] && /* Not all forced fail & */
3286*22dc650dSSadaf Ebrahimi ( /* either... */
3287*22dc650dSSadaf Ebrahimi (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3288*22dc650dSSadaf Ebrahimi || /* or... */
3289*22dc650dSSadaf Ebrahimi ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3290*22dc650dSSadaf Ebrahimi match_count < 0) /* no matches */
3291*22dc650dSSadaf Ebrahimi ) && /* And... */
3292*22dc650dSSadaf Ebrahimi (
3293*22dc650dSSadaf Ebrahimi partial_newline || /* Either partial NL */
3294*22dc650dSSadaf Ebrahimi ( /* or ... */
3295*22dc650dSSadaf Ebrahimi ptr >= end_subject && /* End of subject and */
3296*22dc650dSSadaf Ebrahimi ( /* either */
3297*22dc650dSSadaf Ebrahimi ptr > mb->start_used_ptr || /* Inspected non-empty string */
3298*22dc650dSSadaf Ebrahimi mb->allowemptypartial /* or pattern has lookbehind */
3299*22dc650dSSadaf Ebrahimi ) /* or could match empty */
3300*22dc650dSSadaf Ebrahimi )
3301*22dc650dSSadaf Ebrahimi ))
3302*22dc650dSSadaf Ebrahimi match_count = PCRE2_ERROR_PARTIAL;
3303*22dc650dSSadaf Ebrahimi break; /* Exit from loop along the subject string */
3304*22dc650dSSadaf Ebrahimi }
3305*22dc650dSSadaf Ebrahimi
3306*22dc650dSSadaf Ebrahimi /* One or more states are active for the next character. */
3307*22dc650dSSadaf Ebrahimi
3308*22dc650dSSadaf Ebrahimi ptr += clen; /* Advance to next subject character */
3309*22dc650dSSadaf Ebrahimi } /* Loop to move along the subject string */
3310*22dc650dSSadaf Ebrahimi
3311*22dc650dSSadaf Ebrahimi /* Control gets here from "break" a few lines above. If we have a match and
3312*22dc650dSSadaf Ebrahimi PCRE2_ENDANCHORED is set, the match fails if ptr is before the end of the subject. */
3313*22dc650dSSadaf Ebrahimi
3314*22dc650dSSadaf Ebrahimi if (match_count >= 0 &&
3315*22dc650dSSadaf Ebrahimi ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3316*22dc650dSSadaf Ebrahimi ptr < end_subject)
3317*22dc650dSSadaf Ebrahimi match_count = PCRE2_ERROR_NOMATCH;
3318*22dc650dSSadaf Ebrahimi
3319*22dc650dSSadaf Ebrahimi return match_count;
3320*22dc650dSSadaf Ebrahimi }
3321*22dc650dSSadaf Ebrahimi
3322*22dc650dSSadaf Ebrahimi
3323*22dc650dSSadaf Ebrahimi
3324*22dc650dSSadaf Ebrahimi /*************************************************
3325*22dc650dSSadaf Ebrahimi * Match a pattern using the DFA algorithm *
3326*22dc650dSSadaf Ebrahimi *************************************************/
3327*22dc650dSSadaf Ebrahimi
3328*22dc650dSSadaf Ebrahimi /* This function matches a compiled pattern to a subject string, using the
3329*22dc650dSSadaf Ebrahimi alternate matching algorithm that finds all matches at once.
3330*22dc650dSSadaf Ebrahimi
3331*22dc650dSSadaf Ebrahimi Arguments:
3332*22dc650dSSadaf Ebrahimi code points to the compiled pattern
3333*22dc650dSSadaf Ebrahimi subject subject string
3334*22dc650dSSadaf Ebrahimi length length of subject string
3335*22dc650dSSadaf Ebrahimi start_offset where to start matching in the subject
3336*22dc650dSSadaf Ebrahimi options option bits
3337*22dc650dSSadaf Ebrahimi match_data points to a match data structure
3338*22dc650dSSadaf Ebrahimi mcontext points to a match context
3339*22dc650dSSadaf Ebrahimi workspace pointer to workspace
3340*22dc650dSSadaf Ebrahimi wscount size of workspace
3341*22dc650dSSadaf Ebrahimi
3342*22dc650dSSadaf Ebrahimi Returns: > 0 => number of match offset pairs placed in offsets
3343*22dc650dSSadaf Ebrahimi = 0 => offsets overflowed; longest matches are present
3344*22dc650dSSadaf Ebrahimi -1 => failed to match
3345*22dc650dSSadaf Ebrahimi < -1 => some kind of unexpected problem
3346*22dc650dSSadaf Ebrahimi */
3347*22dc650dSSadaf Ebrahimi
3348*22dc650dSSadaf Ebrahimi PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3349*22dc650dSSadaf Ebrahimi pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3350*22dc650dSSadaf Ebrahimi PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3351*22dc650dSSadaf Ebrahimi pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3352*22dc650dSSadaf Ebrahimi {
3353*22dc650dSSadaf Ebrahimi int rc;
3354*22dc650dSSadaf Ebrahimi int was_zero_terminated = 0;
3355*22dc650dSSadaf Ebrahimi
3356*22dc650dSSadaf Ebrahimi const pcre2_real_code *re = (const pcre2_real_code *)code;
3357*22dc650dSSadaf Ebrahimi
3358*22dc650dSSadaf Ebrahimi PCRE2_SPTR start_match;
3359*22dc650dSSadaf Ebrahimi PCRE2_SPTR end_subject;
3360*22dc650dSSadaf Ebrahimi PCRE2_SPTR bumpalong_limit;
3361*22dc650dSSadaf Ebrahimi PCRE2_SPTR req_cu_ptr;
3362*22dc650dSSadaf Ebrahimi
3363*22dc650dSSadaf Ebrahimi BOOL utf, anchored, startline, firstline;
3364*22dc650dSSadaf Ebrahimi BOOL has_first_cu = FALSE;
3365*22dc650dSSadaf Ebrahimi BOOL has_req_cu = FALSE;
3366*22dc650dSSadaf Ebrahimi
3367*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
3368*22dc650dSSadaf Ebrahimi PCRE2_SPTR memchr_found_first_cu = NULL;
3369*22dc650dSSadaf Ebrahimi PCRE2_SPTR memchr_found_first_cu2 = NULL;
3370*22dc650dSSadaf Ebrahimi #endif
3371*22dc650dSSadaf Ebrahimi
3372*22dc650dSSadaf Ebrahimi PCRE2_UCHAR first_cu = 0;
3373*22dc650dSSadaf Ebrahimi PCRE2_UCHAR first_cu2 = 0;
3374*22dc650dSSadaf Ebrahimi PCRE2_UCHAR req_cu = 0;
3375*22dc650dSSadaf Ebrahimi PCRE2_UCHAR req_cu2 = 0;
3376*22dc650dSSadaf Ebrahimi
3377*22dc650dSSadaf Ebrahimi const uint8_t *start_bits = NULL;
3378*22dc650dSSadaf Ebrahimi
3379*22dc650dSSadaf Ebrahimi /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3380*22dc650dSSadaf Ebrahimi is used below, and it expects NLBLOCK to be defined as a pointer. */
3381*22dc650dSSadaf Ebrahimi
3382*22dc650dSSadaf Ebrahimi pcre2_callout_block cb;
3383*22dc650dSSadaf Ebrahimi dfa_match_block actual_match_block;
3384*22dc650dSSadaf Ebrahimi dfa_match_block *mb = &actual_match_block;
3385*22dc650dSSadaf Ebrahimi
3386*22dc650dSSadaf Ebrahimi /* Set up a starting block of memory for use during recursive calls to
3387*22dc650dSSadaf Ebrahimi internal_dfa_match(). By putting this on the stack, it minimizes resource use
3388*22dc650dSSadaf Ebrahimi in the case when it is not needed. If this is too small, more memory is
3389*22dc650dSSadaf Ebrahimi obtained from the heap. At the start of each block is an anchor structure.*/
3390*22dc650dSSadaf Ebrahimi
3391*22dc650dSSadaf Ebrahimi int base_recursion_workspace[RWS_BASE_SIZE];
3392*22dc650dSSadaf Ebrahimi RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3393*22dc650dSSadaf Ebrahimi rws->next = NULL;
3394*22dc650dSSadaf Ebrahimi rws->size = RWS_BASE_SIZE;
3395*22dc650dSSadaf Ebrahimi rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3396*22dc650dSSadaf Ebrahimi
3397*22dc650dSSadaf Ebrahimi /* Recognize NULL, length 0 as an empty string. */
3398*22dc650dSSadaf Ebrahimi
3399*22dc650dSSadaf Ebrahimi if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
3400*22dc650dSSadaf Ebrahimi
3401*22dc650dSSadaf Ebrahimi /* Plausibility checks */
3402*22dc650dSSadaf Ebrahimi
3403*22dc650dSSadaf Ebrahimi if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3404*22dc650dSSadaf Ebrahimi if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3405*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_NULL;
3406*22dc650dSSadaf Ebrahimi
3407*22dc650dSSadaf Ebrahimi if (length == PCRE2_ZERO_TERMINATED)
3408*22dc650dSSadaf Ebrahimi {
3409*22dc650dSSadaf Ebrahimi length = PRIV(strlen)(subject);
3410*22dc650dSSadaf Ebrahimi was_zero_terminated = 1;
3411*22dc650dSSadaf Ebrahimi }
3412*22dc650dSSadaf Ebrahimi
3413*22dc650dSSadaf Ebrahimi if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3414*22dc650dSSadaf Ebrahimi if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3415*22dc650dSSadaf Ebrahimi
3416*22dc650dSSadaf Ebrahimi /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3417*22dc650dSSadaf Ebrahimi time. */
3418*22dc650dSSadaf Ebrahimi
3419*22dc650dSSadaf Ebrahimi if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3420*22dc650dSSadaf Ebrahimi ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3421*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_BADOPTION;
3422*22dc650dSSadaf Ebrahimi
3423*22dc650dSSadaf Ebrahimi /* Invalid UTF support is not available for DFA matching. */
3424*22dc650dSSadaf Ebrahimi
3425*22dc650dSSadaf Ebrahimi if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3426*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_DFA_UINVALID_UTF;
3427*22dc650dSSadaf Ebrahimi
3428*22dc650dSSadaf Ebrahimi /* Check that the first field in the block is the magic number. If it is not,
3429*22dc650dSSadaf Ebrahimi return with PCRE2_ERROR_BADMAGIC. */
3430*22dc650dSSadaf Ebrahimi
3431*22dc650dSSadaf Ebrahimi if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3432*22dc650dSSadaf Ebrahimi
3433*22dc650dSSadaf Ebrahimi /* Check the code unit width. */
3434*22dc650dSSadaf Ebrahimi
3435*22dc650dSSadaf Ebrahimi if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3436*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_BADMODE;
3437*22dc650dSSadaf Ebrahimi
3438*22dc650dSSadaf Ebrahimi /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3439*22dc650dSSadaf Ebrahimi options variable for this function. Users of PCRE2 who are not calling the
3440*22dc650dSSadaf Ebrahimi function directly would like to have a way of setting these flags, in the same
3441*22dc650dSSadaf Ebrahimi way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3442*22dc650dSSadaf Ebrahimi constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3443*22dc650dSSadaf Ebrahimi (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
3444*22dc650dSSadaf Ebrahimi transferred to the options for this function. The bits are guaranteed to be
3445*22dc650dSSadaf Ebrahimi adjacent, but do not have the same values. This bit of Boolean trickery assumes
3446*22dc650dSSadaf Ebrahimi that the match-time bits are not more significant than the flag bits. If by
3447*22dc650dSSadaf Ebrahimi accident this is not the case, a compile-time division by zero error will
3448*22dc650dSSadaf Ebrahimi occur. */
3449*22dc650dSSadaf Ebrahimi
3450*22dc650dSSadaf Ebrahimi #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3451*22dc650dSSadaf Ebrahimi #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3452*22dc650dSSadaf Ebrahimi options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3453*22dc650dSSadaf Ebrahimi #undef FF
3454*22dc650dSSadaf Ebrahimi #undef OO
3455*22dc650dSSadaf Ebrahimi
3456*22dc650dSSadaf Ebrahimi /* If restarting after a partial match, do some sanity checks on the contents
3457*22dc650dSSadaf Ebrahimi of the workspace. */
3458*22dc650dSSadaf Ebrahimi
3459*22dc650dSSadaf Ebrahimi if ((options & PCRE2_DFA_RESTART) != 0)
3460*22dc650dSSadaf Ebrahimi {
3461*22dc650dSSadaf Ebrahimi if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3462*22dc650dSSadaf Ebrahimi workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3463*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_DFA_BADRESTART;
3464*22dc650dSSadaf Ebrahimi }
3465*22dc650dSSadaf Ebrahimi
3466*22dc650dSSadaf Ebrahimi /* Set some local values */
3467*22dc650dSSadaf Ebrahimi
3468*22dc650dSSadaf Ebrahimi utf = (re->overall_options & PCRE2_UTF) != 0;
3469*22dc650dSSadaf Ebrahimi start_match = subject + start_offset;
3470*22dc650dSSadaf Ebrahimi end_subject = subject + length;
3471*22dc650dSSadaf Ebrahimi req_cu_ptr = start_match - 1;
3472*22dc650dSSadaf Ebrahimi anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3473*22dc650dSSadaf Ebrahimi (re->overall_options & PCRE2_ANCHORED) != 0;
3474*22dc650dSSadaf Ebrahimi
3475*22dc650dSSadaf Ebrahimi /* The "must be at the start of a line" flags are used in a loop when finding
3476*22dc650dSSadaf Ebrahimi where to start. */
3477*22dc650dSSadaf Ebrahimi
3478*22dc650dSSadaf Ebrahimi startline = (re->flags & PCRE2_STARTLINE) != 0;
3479*22dc650dSSadaf Ebrahimi firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
3480*22dc650dSSadaf Ebrahimi bumpalong_limit = end_subject;
3481*22dc650dSSadaf Ebrahimi
3482*22dc650dSSadaf Ebrahimi /* Initialize and set up the fixed fields in the callout block, with a pointer
3483*22dc650dSSadaf Ebrahimi in the match block. */
3484*22dc650dSSadaf Ebrahimi
3485*22dc650dSSadaf Ebrahimi mb->cb = &cb;
3486*22dc650dSSadaf Ebrahimi cb.version = 2;
3487*22dc650dSSadaf Ebrahimi cb.subject = subject;
3488*22dc650dSSadaf Ebrahimi cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3489*22dc650dSSadaf Ebrahimi cb.callout_flags = 0;
3490*22dc650dSSadaf Ebrahimi cb.capture_top = 1; /* No capture support */
3491*22dc650dSSadaf Ebrahimi cb.capture_last = 0;
3492*22dc650dSSadaf Ebrahimi cb.mark = NULL; /* No (*MARK) support */
3493*22dc650dSSadaf Ebrahimi
3494*22dc650dSSadaf Ebrahimi /* Get data from the match context, if present, and fill in the remaining
3495*22dc650dSSadaf Ebrahimi fields in the match block. It is an error to set an offset limit without
3496*22dc650dSSadaf Ebrahimi setting the flag at compile time. */
3497*22dc650dSSadaf Ebrahimi
3498*22dc650dSSadaf Ebrahimi if (mcontext == NULL)
3499*22dc650dSSadaf Ebrahimi {
3500*22dc650dSSadaf Ebrahimi mb->callout = NULL;
3501*22dc650dSSadaf Ebrahimi mb->memctl = re->memctl;
3502*22dc650dSSadaf Ebrahimi mb->match_limit = PRIV(default_match_context).match_limit;
3503*22dc650dSSadaf Ebrahimi mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3504*22dc650dSSadaf Ebrahimi mb->heap_limit = PRIV(default_match_context).heap_limit;
3505*22dc650dSSadaf Ebrahimi }
3506*22dc650dSSadaf Ebrahimi else
3507*22dc650dSSadaf Ebrahimi {
3508*22dc650dSSadaf Ebrahimi if (mcontext->offset_limit != PCRE2_UNSET)
3509*22dc650dSSadaf Ebrahimi {
3510*22dc650dSSadaf Ebrahimi if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3511*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_BADOFFSETLIMIT;
3512*22dc650dSSadaf Ebrahimi bumpalong_limit = subject + mcontext->offset_limit;
3513*22dc650dSSadaf Ebrahimi }
3514*22dc650dSSadaf Ebrahimi mb->callout = mcontext->callout;
3515*22dc650dSSadaf Ebrahimi mb->callout_data = mcontext->callout_data;
3516*22dc650dSSadaf Ebrahimi mb->memctl = mcontext->memctl;
3517*22dc650dSSadaf Ebrahimi mb->match_limit = mcontext->match_limit;
3518*22dc650dSSadaf Ebrahimi mb->match_limit_depth = mcontext->depth_limit;
3519*22dc650dSSadaf Ebrahimi mb->heap_limit = mcontext->heap_limit;
3520*22dc650dSSadaf Ebrahimi }
3521*22dc650dSSadaf Ebrahimi
3522*22dc650dSSadaf Ebrahimi if (mb->match_limit > re->limit_match)
3523*22dc650dSSadaf Ebrahimi mb->match_limit = re->limit_match;
3524*22dc650dSSadaf Ebrahimi
3525*22dc650dSSadaf Ebrahimi if (mb->match_limit_depth > re->limit_depth)
3526*22dc650dSSadaf Ebrahimi mb->match_limit_depth = re->limit_depth;
3527*22dc650dSSadaf Ebrahimi
3528*22dc650dSSadaf Ebrahimi if (mb->heap_limit > re->limit_heap)
3529*22dc650dSSadaf Ebrahimi mb->heap_limit = re->limit_heap;
3530*22dc650dSSadaf Ebrahimi
3531*22dc650dSSadaf Ebrahimi mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3532*22dc650dSSadaf Ebrahimi re->name_count * re->name_entry_size;
3533*22dc650dSSadaf Ebrahimi mb->tables = re->tables;
3534*22dc650dSSadaf Ebrahimi mb->start_subject = subject;
3535*22dc650dSSadaf Ebrahimi mb->end_subject = end_subject;
3536*22dc650dSSadaf Ebrahimi mb->start_offset = start_offset;
3537*22dc650dSSadaf Ebrahimi mb->allowemptypartial = (re->max_lookbehind > 0) ||
3538*22dc650dSSadaf Ebrahimi (re->flags & PCRE2_MATCH_EMPTY) != 0;
3539*22dc650dSSadaf Ebrahimi mb->moptions = options;
3540*22dc650dSSadaf Ebrahimi mb->poptions = re->overall_options;
3541*22dc650dSSadaf Ebrahimi mb->match_call_count = 0;
3542*22dc650dSSadaf Ebrahimi mb->heap_used = 0;
3543*22dc650dSSadaf Ebrahimi
3544*22dc650dSSadaf Ebrahimi /* Process the \R and newline settings. */
3545*22dc650dSSadaf Ebrahimi
3546*22dc650dSSadaf Ebrahimi mb->bsr_convention = re->bsr_convention;
3547*22dc650dSSadaf Ebrahimi mb->nltype = NLTYPE_FIXED;
3548*22dc650dSSadaf Ebrahimi switch(re->newline_convention)
3549*22dc650dSSadaf Ebrahimi {
3550*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_CR:
3551*22dc650dSSadaf Ebrahimi mb->nllen = 1;
3552*22dc650dSSadaf Ebrahimi mb->nl[0] = CHAR_CR;
3553*22dc650dSSadaf Ebrahimi break;
3554*22dc650dSSadaf Ebrahimi
3555*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_LF:
3556*22dc650dSSadaf Ebrahimi mb->nllen = 1;
3557*22dc650dSSadaf Ebrahimi mb->nl[0] = CHAR_NL;
3558*22dc650dSSadaf Ebrahimi break;
3559*22dc650dSSadaf Ebrahimi
3560*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_NUL:
3561*22dc650dSSadaf Ebrahimi mb->nllen = 1;
3562*22dc650dSSadaf Ebrahimi mb->nl[0] = CHAR_NUL;
3563*22dc650dSSadaf Ebrahimi break;
3564*22dc650dSSadaf Ebrahimi
3565*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_CRLF:
3566*22dc650dSSadaf Ebrahimi mb->nllen = 2;
3567*22dc650dSSadaf Ebrahimi mb->nl[0] = CHAR_CR;
3568*22dc650dSSadaf Ebrahimi mb->nl[1] = CHAR_NL;
3569*22dc650dSSadaf Ebrahimi break;
3570*22dc650dSSadaf Ebrahimi
3571*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_ANY:
3572*22dc650dSSadaf Ebrahimi mb->nltype = NLTYPE_ANY;
3573*22dc650dSSadaf Ebrahimi break;
3574*22dc650dSSadaf Ebrahimi
3575*22dc650dSSadaf Ebrahimi case PCRE2_NEWLINE_ANYCRLF:
3576*22dc650dSSadaf Ebrahimi mb->nltype = NLTYPE_ANYCRLF;
3577*22dc650dSSadaf Ebrahimi break;
3578*22dc650dSSadaf Ebrahimi
3579*22dc650dSSadaf Ebrahimi default: return PCRE2_ERROR_INTERNAL;
3580*22dc650dSSadaf Ebrahimi }
3581*22dc650dSSadaf Ebrahimi
3582*22dc650dSSadaf Ebrahimi /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3583*22dc650dSSadaf Ebrahimi we must also check that a starting offset does not point into the middle of a
3584*22dc650dSSadaf Ebrahimi multiunit character. We check only the portion of the subject that is going to
3585*22dc650dSSadaf Ebrahimi be inspected during matching - from the offset minus the maximum lookbehind
3586*22dc650dSSadaf Ebrahimi to the given length. This saves time when a small part of a large subject is
3587*22dc650dSSadaf Ebrahimi being matched by the use of a starting offset. Note that the maximum lookbehind
3588*22dc650dSSadaf Ebrahimi is a number of characters, not code units. */
3589*22dc650dSSadaf Ebrahimi
3590*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3591*22dc650dSSadaf Ebrahimi if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3592*22dc650dSSadaf Ebrahimi {
3593*22dc650dSSadaf Ebrahimi PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3594*22dc650dSSadaf Ebrahimi
3595*22dc650dSSadaf Ebrahimi if (start_offset > 0)
3596*22dc650dSSadaf Ebrahimi {
3597*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 32
3598*22dc650dSSadaf Ebrahimi unsigned int i;
3599*22dc650dSSadaf Ebrahimi if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3600*22dc650dSSadaf Ebrahimi return PCRE2_ERROR_BADUTFOFFSET;
3601*22dc650dSSadaf Ebrahimi for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3602*22dc650dSSadaf Ebrahimi {
3603*22dc650dSSadaf Ebrahimi check_subject--;
3604*22dc650dSSadaf Ebrahimi while (check_subject > subject &&
3605*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
3606*22dc650dSSadaf Ebrahimi (*check_subject & 0xc0) == 0x80)
3607*22dc650dSSadaf Ebrahimi #else /* 16-bit */
3608*22dc650dSSadaf Ebrahimi (*check_subject & 0xfc00) == 0xdc00)
3609*22dc650dSSadaf Ebrahimi #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3610*22dc650dSSadaf Ebrahimi check_subject--;
3611*22dc650dSSadaf Ebrahimi }
3612*22dc650dSSadaf Ebrahimi #else /* In the 32-bit library, one code unit equals one character. */
3613*22dc650dSSadaf Ebrahimi check_subject -= re->max_lookbehind;
3614*22dc650dSSadaf Ebrahimi if (check_subject < subject) check_subject = subject;
3615*22dc650dSSadaf Ebrahimi #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3616*22dc650dSSadaf Ebrahimi }
3617*22dc650dSSadaf Ebrahimi
3618*22dc650dSSadaf Ebrahimi /* Validate the relevant portion of the subject. After an error, adjust the
3619*22dc650dSSadaf Ebrahimi offset to be an absolute offset in the whole string. */
3620*22dc650dSSadaf Ebrahimi
3621*22dc650dSSadaf Ebrahimi match_data->rc = PRIV(valid_utf)(check_subject,
3622*22dc650dSSadaf Ebrahimi length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3623*22dc650dSSadaf Ebrahimi if (match_data->rc != 0)
3624*22dc650dSSadaf Ebrahimi {
3625*22dc650dSSadaf Ebrahimi match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3626*22dc650dSSadaf Ebrahimi return match_data->rc;
3627*22dc650dSSadaf Ebrahimi }
3628*22dc650dSSadaf Ebrahimi }
3629*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
3630*22dc650dSSadaf Ebrahimi
3631*22dc650dSSadaf Ebrahimi /* Set up the first code unit to match, if available. If there's no first code
3632*22dc650dSSadaf Ebrahimi unit there may be a bitmap of possible first characters. */
3633*22dc650dSSadaf Ebrahimi
3634*22dc650dSSadaf Ebrahimi if ((re->flags & PCRE2_FIRSTSET) != 0)
3635*22dc650dSSadaf Ebrahimi {
3636*22dc650dSSadaf Ebrahimi has_first_cu = TRUE;
3637*22dc650dSSadaf Ebrahimi first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3638*22dc650dSSadaf Ebrahimi if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3639*22dc650dSSadaf Ebrahimi {
3640*22dc650dSSadaf Ebrahimi first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3641*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3642*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
3643*22dc650dSSadaf Ebrahimi if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3644*22dc650dSSadaf Ebrahimi first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3645*22dc650dSSadaf Ebrahimi #else
3646*22dc650dSSadaf Ebrahimi if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3647*22dc650dSSadaf Ebrahimi first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3648*22dc650dSSadaf Ebrahimi #endif
3649*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
3650*22dc650dSSadaf Ebrahimi }
3651*22dc650dSSadaf Ebrahimi }
3652*22dc650dSSadaf Ebrahimi else
3653*22dc650dSSadaf Ebrahimi if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3654*22dc650dSSadaf Ebrahimi start_bits = re->start_bitmap;
3655*22dc650dSSadaf Ebrahimi
3656*22dc650dSSadaf Ebrahimi /* There may be a "last known required code unit" set. */
3657*22dc650dSSadaf Ebrahimi
3658*22dc650dSSadaf Ebrahimi if ((re->flags & PCRE2_LASTSET) != 0)
3659*22dc650dSSadaf Ebrahimi {
3660*22dc650dSSadaf Ebrahimi has_req_cu = TRUE;
3661*22dc650dSSadaf Ebrahimi req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3662*22dc650dSSadaf Ebrahimi if ((re->flags & PCRE2_LASTCASELESS) != 0)
3663*22dc650dSSadaf Ebrahimi {
3664*22dc650dSSadaf Ebrahimi req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3665*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3666*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
3667*22dc650dSSadaf Ebrahimi if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3668*22dc650dSSadaf Ebrahimi req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3669*22dc650dSSadaf Ebrahimi #else
3670*22dc650dSSadaf Ebrahimi if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3671*22dc650dSSadaf Ebrahimi req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3672*22dc650dSSadaf Ebrahimi #endif
3673*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
3674*22dc650dSSadaf Ebrahimi }
3675*22dc650dSSadaf Ebrahimi }
3676*22dc650dSSadaf Ebrahimi
3677*22dc650dSSadaf Ebrahimi /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3678*22dc650dSSadaf Ebrahimi free the memory that was obtained. */
3679*22dc650dSSadaf Ebrahimi
3680*22dc650dSSadaf Ebrahimi if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3681*22dc650dSSadaf Ebrahimi {
3682*22dc650dSSadaf Ebrahimi match_data->memctl.free((void *)match_data->subject,
3683*22dc650dSSadaf Ebrahimi match_data->memctl.memory_data);
3684*22dc650dSSadaf Ebrahimi match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3685*22dc650dSSadaf Ebrahimi }
3686*22dc650dSSadaf Ebrahimi
3687*22dc650dSSadaf Ebrahimi /* Fill in fields that are always returned in the match data. */
3688*22dc650dSSadaf Ebrahimi
3689*22dc650dSSadaf Ebrahimi match_data->code = re;
3690*22dc650dSSadaf Ebrahimi match_data->subject = NULL; /* Default for no match */
3691*22dc650dSSadaf Ebrahimi match_data->mark = NULL;
3692*22dc650dSSadaf Ebrahimi match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3693*22dc650dSSadaf Ebrahimi
3694*22dc650dSSadaf Ebrahimi /* Call the main matching function, looping for a non-anchored regex after a
3695*22dc650dSSadaf Ebrahimi failed match. If not restarting, perform certain optimizations at the start of
3696*22dc650dSSadaf Ebrahimi a match. */
3697*22dc650dSSadaf Ebrahimi
3698*22dc650dSSadaf Ebrahimi for (;;)
3699*22dc650dSSadaf Ebrahimi {
3700*22dc650dSSadaf Ebrahimi /* ----------------- Start of match optimizations ---------------- */
3701*22dc650dSSadaf Ebrahimi
3702*22dc650dSSadaf Ebrahimi /* There are some optimizations that avoid running the match if a known
3703*22dc650dSSadaf Ebrahimi starting point is not found, or if a known later code unit is not present.
3704*22dc650dSSadaf Ebrahimi However, there is an option (settable at compile time) that disables
3705*22dc650dSSadaf Ebrahimi these, for testing and for ensuring that all callouts do actually occur.
3706*22dc650dSSadaf Ebrahimi The optimizations must also be avoided when restarting a DFA match. */
3707*22dc650dSSadaf Ebrahimi
3708*22dc650dSSadaf Ebrahimi if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3709*22dc650dSSadaf Ebrahimi (options & PCRE2_DFA_RESTART) == 0)
3710*22dc650dSSadaf Ebrahimi {
3711*22dc650dSSadaf Ebrahimi /* If firstline is TRUE, the start of the match is constrained to the first
3712*22dc650dSSadaf Ebrahimi line of a multiline string. That is, the match must be before or at the
3713*22dc650dSSadaf Ebrahimi first newline following the start of matching. Temporarily adjust
3714*22dc650dSSadaf Ebrahimi end_subject so that we stop the optimization scans for a first code unit
3715*22dc650dSSadaf Ebrahimi immediately after the first character of a newline (the first code unit can
3716*22dc650dSSadaf Ebrahimi legitimately be a newline). If the match fails at the newline, later code
3717*22dc650dSSadaf Ebrahimi breaks this loop. */
3718*22dc650dSSadaf Ebrahimi
3719*22dc650dSSadaf Ebrahimi if (firstline)
3720*22dc650dSSadaf Ebrahimi {
3721*22dc650dSSadaf Ebrahimi PCRE2_SPTR t = start_match;
3722*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3723*22dc650dSSadaf Ebrahimi if (utf)
3724*22dc650dSSadaf Ebrahimi {
3725*22dc650dSSadaf Ebrahimi while (t < end_subject && !IS_NEWLINE(t))
3726*22dc650dSSadaf Ebrahimi {
3727*22dc650dSSadaf Ebrahimi t++;
3728*22dc650dSSadaf Ebrahimi ACROSSCHAR(t < end_subject, t, t++);
3729*22dc650dSSadaf Ebrahimi }
3730*22dc650dSSadaf Ebrahimi }
3731*22dc650dSSadaf Ebrahimi else
3732*22dc650dSSadaf Ebrahimi #endif
3733*22dc650dSSadaf Ebrahimi while (t < end_subject && !IS_NEWLINE(t)) t++;
3734*22dc650dSSadaf Ebrahimi end_subject = t;
3735*22dc650dSSadaf Ebrahimi }
3736*22dc650dSSadaf Ebrahimi
3737*22dc650dSSadaf Ebrahimi /* Anchored: check the first code unit if one is recorded. This may seem
3738*22dc650dSSadaf Ebrahimi pointless but it can help in detecting a no match case without scanning for
3739*22dc650dSSadaf Ebrahimi the required code unit. */
3740*22dc650dSSadaf Ebrahimi
3741*22dc650dSSadaf Ebrahimi if (anchored)
3742*22dc650dSSadaf Ebrahimi {
3743*22dc650dSSadaf Ebrahimi if (has_first_cu || start_bits != NULL)
3744*22dc650dSSadaf Ebrahimi {
3745*22dc650dSSadaf Ebrahimi BOOL ok = start_match < end_subject;
3746*22dc650dSSadaf Ebrahimi if (ok)
3747*22dc650dSSadaf Ebrahimi {
3748*22dc650dSSadaf Ebrahimi PCRE2_UCHAR c = UCHAR21TEST(start_match);
3749*22dc650dSSadaf Ebrahimi ok = has_first_cu && (c == first_cu || c == first_cu2);
3750*22dc650dSSadaf Ebrahimi if (!ok && start_bits != NULL)
3751*22dc650dSSadaf Ebrahimi {
3752*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
3753*22dc650dSSadaf Ebrahimi if (c > 255) c = 255;
3754*22dc650dSSadaf Ebrahimi #endif
3755*22dc650dSSadaf Ebrahimi ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3756*22dc650dSSadaf Ebrahimi }
3757*22dc650dSSadaf Ebrahimi }
3758*22dc650dSSadaf Ebrahimi if (!ok) break;
3759*22dc650dSSadaf Ebrahimi }
3760*22dc650dSSadaf Ebrahimi }
3761*22dc650dSSadaf Ebrahimi
3762*22dc650dSSadaf Ebrahimi /* Not anchored. Advance to a unique first code unit if there is one. */
3763*22dc650dSSadaf Ebrahimi
3764*22dc650dSSadaf Ebrahimi else
3765*22dc650dSSadaf Ebrahimi {
3766*22dc650dSSadaf Ebrahimi if (has_first_cu)
3767*22dc650dSSadaf Ebrahimi {
3768*22dc650dSSadaf Ebrahimi if (first_cu != first_cu2) /* Caseless */
3769*22dc650dSSadaf Ebrahimi {
3770*22dc650dSSadaf Ebrahimi /* In 16-bit and 32-bit modes we have to do our own search, so can
3771*22dc650dSSadaf Ebrahimi look for both cases at once. */
3772*22dc650dSSadaf Ebrahimi
3773*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
3774*22dc650dSSadaf Ebrahimi PCRE2_UCHAR smc;
3775*22dc650dSSadaf Ebrahimi while (start_match < end_subject &&
3776*22dc650dSSadaf Ebrahimi (smc = UCHAR21TEST(start_match)) != first_cu &&
3777*22dc650dSSadaf Ebrahimi smc != first_cu2)
3778*22dc650dSSadaf Ebrahimi start_match++;
3779*22dc650dSSadaf Ebrahimi #else
3780*22dc650dSSadaf Ebrahimi /* In 8-bit mode, the use of memchr() gives a big speed up, even
3781*22dc650dSSadaf Ebrahimi though we have to call it twice in order to find the earliest
3782*22dc650dSSadaf Ebrahimi occurrence of the code unit in either of its cases. Caching is used
3783*22dc650dSSadaf Ebrahimi to remember the positions of previously found code units. This can
3784*22dc650dSSadaf Ebrahimi make a huge difference when the strings are very long and only one
3785*22dc650dSSadaf Ebrahimi case is actually present. */
3786*22dc650dSSadaf Ebrahimi
3787*22dc650dSSadaf Ebrahimi PCRE2_SPTR pp1 = NULL;
3788*22dc650dSSadaf Ebrahimi PCRE2_SPTR pp2 = NULL;
3789*22dc650dSSadaf Ebrahimi PCRE2_SIZE searchlength = end_subject - start_match;
3790*22dc650dSSadaf Ebrahimi
3791*22dc650dSSadaf Ebrahimi /* If we haven't got a previously found position for first_cu, or if
3792*22dc650dSSadaf Ebrahimi the current starting position is later, we need to do a search. If
3793*22dc650dSSadaf Ebrahimi the code unit is not found, set it to the end. */
3794*22dc650dSSadaf Ebrahimi
3795*22dc650dSSadaf Ebrahimi if (memchr_found_first_cu == NULL ||
3796*22dc650dSSadaf Ebrahimi start_match > memchr_found_first_cu)
3797*22dc650dSSadaf Ebrahimi {
3798*22dc650dSSadaf Ebrahimi pp1 = memchr(start_match, first_cu, searchlength);
3799*22dc650dSSadaf Ebrahimi memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3800*22dc650dSSadaf Ebrahimi }
3801*22dc650dSSadaf Ebrahimi
3802*22dc650dSSadaf Ebrahimi /* If the start is before a previously found position, use the
3803*22dc650dSSadaf Ebrahimi previous position, or NULL if a previous search failed. */
3804*22dc650dSSadaf Ebrahimi
3805*22dc650dSSadaf Ebrahimi else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3806*22dc650dSSadaf Ebrahimi memchr_found_first_cu;
3807*22dc650dSSadaf Ebrahimi
3808*22dc650dSSadaf Ebrahimi /* Do the same thing for the other case. */
3809*22dc650dSSadaf Ebrahimi
3810*22dc650dSSadaf Ebrahimi if (memchr_found_first_cu2 == NULL ||
3811*22dc650dSSadaf Ebrahimi start_match > memchr_found_first_cu2)
3812*22dc650dSSadaf Ebrahimi {
3813*22dc650dSSadaf Ebrahimi pp2 = memchr(start_match, first_cu2, searchlength);
3814*22dc650dSSadaf Ebrahimi memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3815*22dc650dSSadaf Ebrahimi }
3816*22dc650dSSadaf Ebrahimi
3817*22dc650dSSadaf Ebrahimi else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3818*22dc650dSSadaf Ebrahimi memchr_found_first_cu2;
3819*22dc650dSSadaf Ebrahimi
3820*22dc650dSSadaf Ebrahimi /* Set the start to the end of the subject if neither case was found.
3821*22dc650dSSadaf Ebrahimi Otherwise, use the earlier found point. */
3822*22dc650dSSadaf Ebrahimi
3823*22dc650dSSadaf Ebrahimi if (pp1 == NULL)
3824*22dc650dSSadaf Ebrahimi start_match = (pp2 == NULL)? end_subject : pp2;
3825*22dc650dSSadaf Ebrahimi else
3826*22dc650dSSadaf Ebrahimi start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3827*22dc650dSSadaf Ebrahimi
3828*22dc650dSSadaf Ebrahimi #endif /* 8-bit handling */
3829*22dc650dSSadaf Ebrahimi }
3830*22dc650dSSadaf Ebrahimi
3831*22dc650dSSadaf Ebrahimi /* The caseful case is much simpler. */
3832*22dc650dSSadaf Ebrahimi
3833*22dc650dSSadaf Ebrahimi else
3834*22dc650dSSadaf Ebrahimi {
3835*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
3836*22dc650dSSadaf Ebrahimi while (start_match < end_subject && UCHAR21TEST(start_match) !=
3837*22dc650dSSadaf Ebrahimi first_cu)
3838*22dc650dSSadaf Ebrahimi start_match++;
3839*22dc650dSSadaf Ebrahimi #else /* 8-bit code units */
3840*22dc650dSSadaf Ebrahimi start_match = memchr(start_match, first_cu, end_subject - start_match);
3841*22dc650dSSadaf Ebrahimi if (start_match == NULL) start_match = end_subject;
3842*22dc650dSSadaf Ebrahimi #endif
3843*22dc650dSSadaf Ebrahimi }
3844*22dc650dSSadaf Ebrahimi
3845*22dc650dSSadaf Ebrahimi /* If we can't find the required first code unit, having reached the true end
3846*22dc650dSSadaf Ebrahimi of the subject, break the bumpalong loop, to force a match failure,
3847*22dc650dSSadaf Ebrahimi except when doing partial matching, when we let the next cycle run at
3848*22dc650dSSadaf Ebrahimi the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3849*22dc650dSSadaf Ebrahimi which partially matches "abc", even though the string does not contain
3850*22dc650dSSadaf Ebrahimi the starting character "d". If we have not reached the true end of the
3851*22dc650dSSadaf Ebrahimi subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3852*22dc650dSSadaf Ebrahimi we also let the cycle run, because the matching string is legitimately
3853*22dc650dSSadaf Ebrahimi allowed to start with the first code unit of a newline. */
3854*22dc650dSSadaf Ebrahimi
3855*22dc650dSSadaf Ebrahimi if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3856*22dc650dSSadaf Ebrahimi start_match >= mb->end_subject)
3857*22dc650dSSadaf Ebrahimi break;
3858*22dc650dSSadaf Ebrahimi }
3859*22dc650dSSadaf Ebrahimi
3860*22dc650dSSadaf Ebrahimi /* If there's no first code unit, advance to just after a linebreak for a
3861*22dc650dSSadaf Ebrahimi multiline match if required. */
3862*22dc650dSSadaf Ebrahimi
3863*22dc650dSSadaf Ebrahimi else if (startline)
3864*22dc650dSSadaf Ebrahimi {
3865*22dc650dSSadaf Ebrahimi if (start_match > mb->start_subject + start_offset)
3866*22dc650dSSadaf Ebrahimi {
3867*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
3868*22dc650dSSadaf Ebrahimi if (utf)
3869*22dc650dSSadaf Ebrahimi {
3870*22dc650dSSadaf Ebrahimi while (start_match < end_subject && !WAS_NEWLINE(start_match))
3871*22dc650dSSadaf Ebrahimi {
3872*22dc650dSSadaf Ebrahimi start_match++;
3873*22dc650dSSadaf Ebrahimi ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3874*22dc650dSSadaf Ebrahimi }
3875*22dc650dSSadaf Ebrahimi }
3876*22dc650dSSadaf Ebrahimi else
3877*22dc650dSSadaf Ebrahimi #endif
3878*22dc650dSSadaf Ebrahimi while (start_match < end_subject && !WAS_NEWLINE(start_match))
3879*22dc650dSSadaf Ebrahimi start_match++;
3880*22dc650dSSadaf Ebrahimi
3881*22dc650dSSadaf Ebrahimi /* If we have just passed a CR and the newline option is ANY or
3882*22dc650dSSadaf Ebrahimi ANYCRLF, and we are now at a LF, advance the match position by one
3883*22dc650dSSadaf Ebrahimi more code unit. */
3884*22dc650dSSadaf Ebrahimi
3885*22dc650dSSadaf Ebrahimi if (start_match[-1] == CHAR_CR &&
3886*22dc650dSSadaf Ebrahimi (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3887*22dc650dSSadaf Ebrahimi start_match < end_subject &&
3888*22dc650dSSadaf Ebrahimi UCHAR21TEST(start_match) == CHAR_NL)
3889*22dc650dSSadaf Ebrahimi start_match++;
3890*22dc650dSSadaf Ebrahimi }
3891*22dc650dSSadaf Ebrahimi }
3892*22dc650dSSadaf Ebrahimi
3893*22dc650dSSadaf Ebrahimi /* If there's no first code unit or a requirement for a multiline line
3894*22dc650dSSadaf Ebrahimi start, advance to a non-unique first code unit if any have been
3895*22dc650dSSadaf Ebrahimi identified. The bitmap contains only 256 bits. When code units are 16 or
3896*22dc650dSSadaf Ebrahimi 32 bits wide, all code units greater than 254 set the 255 bit. */
3897*22dc650dSSadaf Ebrahimi
3898*22dc650dSSadaf Ebrahimi else if (start_bits != NULL)
3899*22dc650dSSadaf Ebrahimi {
3900*22dc650dSSadaf Ebrahimi while (start_match < end_subject)
3901*22dc650dSSadaf Ebrahimi {
3902*22dc650dSSadaf Ebrahimi uint32_t c = UCHAR21TEST(start_match);
3903*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
3904*22dc650dSSadaf Ebrahimi if (c > 255) c = 255;
3905*22dc650dSSadaf Ebrahimi #endif
3906*22dc650dSSadaf Ebrahimi if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3907*22dc650dSSadaf Ebrahimi start_match++;
3908*22dc650dSSadaf Ebrahimi }
3909*22dc650dSSadaf Ebrahimi
3910*22dc650dSSadaf Ebrahimi /* See comment above in first_cu checking about the next line. */
3911*22dc650dSSadaf Ebrahimi
3912*22dc650dSSadaf Ebrahimi if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3913*22dc650dSSadaf Ebrahimi start_match >= mb->end_subject)
3914*22dc650dSSadaf Ebrahimi break;
3915*22dc650dSSadaf Ebrahimi }
3916*22dc650dSSadaf Ebrahimi } /* End of first code unit handling */
3917*22dc650dSSadaf Ebrahimi
3918*22dc650dSSadaf Ebrahimi /* Restore fudged end_subject */
3919*22dc650dSSadaf Ebrahimi
3920*22dc650dSSadaf Ebrahimi end_subject = mb->end_subject;
3921*22dc650dSSadaf Ebrahimi
3922*22dc650dSSadaf Ebrahimi /* The following two optimizations are disabled for partial matching. */
3923*22dc650dSSadaf Ebrahimi
3924*22dc650dSSadaf Ebrahimi if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3925*22dc650dSSadaf Ebrahimi {
3926*22dc650dSSadaf Ebrahimi PCRE2_SPTR p;
3927*22dc650dSSadaf Ebrahimi
3928*22dc650dSSadaf Ebrahimi /* The minimum matching length is a lower bound; no actual string of that
3929*22dc650dSSadaf Ebrahimi length may actually match the pattern. Although the value is, strictly,
3930*22dc650dSSadaf Ebrahimi in characters, we treat it as code units to avoid spending too much time
3931*22dc650dSSadaf Ebrahimi in this optimization. */
3932*22dc650dSSadaf Ebrahimi
3933*22dc650dSSadaf Ebrahimi if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3934*22dc650dSSadaf Ebrahimi
3935*22dc650dSSadaf Ebrahimi /* If req_cu is set, we know that that code unit must appear in the
3936*22dc650dSSadaf Ebrahimi subject for the match to succeed. If the first code unit is set, req_cu
3937*22dc650dSSadaf Ebrahimi must be later in the subject; otherwise the test starts at the match
3938*22dc650dSSadaf Ebrahimi point. This optimization can save a huge amount of backtracking in
3939*22dc650dSSadaf Ebrahimi patterns with nested unlimited repeats that aren't going to match.
3940*22dc650dSSadaf Ebrahimi Writing separate code for cased/caseless versions makes it go faster, as
3941*22dc650dSSadaf Ebrahimi does using an autoincrement and backing off on a match. As in the case of
3942*22dc650dSSadaf Ebrahimi the first code unit, using memchr() in the 8-bit library gives a big
3943*22dc650dSSadaf Ebrahimi speed up. Unlike the first_cu check above, we do not need to call
3944*22dc650dSSadaf Ebrahimi memchr() twice in the caseless case because we only need to check for the
3945*22dc650dSSadaf Ebrahimi presence of the character in either case, not find the first occurrence.
3946*22dc650dSSadaf Ebrahimi
3947*22dc650dSSadaf Ebrahimi The search can be skipped if the code unit was found later than the
3948*22dc650dSSadaf Ebrahimi current starting point in a previous iteration of the bumpalong loop.
3949*22dc650dSSadaf Ebrahimi
3950*22dc650dSSadaf Ebrahimi HOWEVER: when the subject string is very, very long, searching to its end
3951*22dc650dSSadaf Ebrahimi can take a long time, and give bad performance on quite ordinary
3952*22dc650dSSadaf Ebrahimi patterns. This showed up when somebody was matching something like
3953*22dc650dSSadaf Ebrahimi /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3954*22dc650dSSadaf Ebrahimi sufficiently long, but it's worth searching a lot more for unanchored
3955*22dc650dSSadaf Ebrahimi patterns. */
3956*22dc650dSSadaf Ebrahimi
3957*22dc650dSSadaf Ebrahimi p = start_match + (has_first_cu? 1:0);
3958*22dc650dSSadaf Ebrahimi if (has_req_cu && p > req_cu_ptr)
3959*22dc650dSSadaf Ebrahimi {
3960*22dc650dSSadaf Ebrahimi PCRE2_SIZE check_length = end_subject - start_match;
3961*22dc650dSSadaf Ebrahimi
3962*22dc650dSSadaf Ebrahimi if (check_length < REQ_CU_MAX ||
3963*22dc650dSSadaf Ebrahimi (!anchored && check_length < REQ_CU_MAX * 1000))
3964*22dc650dSSadaf Ebrahimi {
3965*22dc650dSSadaf Ebrahimi if (req_cu != req_cu2) /* Caseless */
3966*22dc650dSSadaf Ebrahimi {
3967*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
3968*22dc650dSSadaf Ebrahimi while (p < end_subject)
3969*22dc650dSSadaf Ebrahimi {
3970*22dc650dSSadaf Ebrahimi uint32_t pp = UCHAR21INCTEST(p);
3971*22dc650dSSadaf Ebrahimi if (pp == req_cu || pp == req_cu2) { p--; break; }
3972*22dc650dSSadaf Ebrahimi }
3973*22dc650dSSadaf Ebrahimi #else /* 8-bit code units */
3974*22dc650dSSadaf Ebrahimi PCRE2_SPTR pp = p;
3975*22dc650dSSadaf Ebrahimi p = memchr(pp, req_cu, end_subject - pp);
3976*22dc650dSSadaf Ebrahimi if (p == NULL)
3977*22dc650dSSadaf Ebrahimi {
3978*22dc650dSSadaf Ebrahimi p = memchr(pp, req_cu2, end_subject - pp);
3979*22dc650dSSadaf Ebrahimi if (p == NULL) p = end_subject;
3980*22dc650dSSadaf Ebrahimi }
3981*22dc650dSSadaf Ebrahimi #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3982*22dc650dSSadaf Ebrahimi }
3983*22dc650dSSadaf Ebrahimi
3984*22dc650dSSadaf Ebrahimi /* The caseful case */
3985*22dc650dSSadaf Ebrahimi
3986*22dc650dSSadaf Ebrahimi else
3987*22dc650dSSadaf Ebrahimi {
3988*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH != 8
3989*22dc650dSSadaf Ebrahimi while (p < end_subject)
3990*22dc650dSSadaf Ebrahimi {
3991*22dc650dSSadaf Ebrahimi if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3992*22dc650dSSadaf Ebrahimi }
3993*22dc650dSSadaf Ebrahimi
3994*22dc650dSSadaf Ebrahimi #else /* 8-bit code units */
3995*22dc650dSSadaf Ebrahimi p = memchr(p, req_cu, end_subject - p);
3996*22dc650dSSadaf Ebrahimi if (p == NULL) p = end_subject;
3997*22dc650dSSadaf Ebrahimi #endif
3998*22dc650dSSadaf Ebrahimi }
3999*22dc650dSSadaf Ebrahimi
4000*22dc650dSSadaf Ebrahimi /* If we can't find the required code unit, break the matching loop,
4001*22dc650dSSadaf Ebrahimi forcing a match failure. */
4002*22dc650dSSadaf Ebrahimi
4003*22dc650dSSadaf Ebrahimi if (p >= end_subject) break;
4004*22dc650dSSadaf Ebrahimi
4005*22dc650dSSadaf Ebrahimi /* If we have found the required code unit, save the point where we
4006*22dc650dSSadaf Ebrahimi found it, so that we don't search again next time round the loop if
4007*22dc650dSSadaf Ebrahimi the start hasn't passed this code unit yet. */
4008*22dc650dSSadaf Ebrahimi
4009*22dc650dSSadaf Ebrahimi req_cu_ptr = p;
4010*22dc650dSSadaf Ebrahimi }
4011*22dc650dSSadaf Ebrahimi }
4012*22dc650dSSadaf Ebrahimi }
4013*22dc650dSSadaf Ebrahimi }
4014*22dc650dSSadaf Ebrahimi
4015*22dc650dSSadaf Ebrahimi /* ------------ End of start of match optimizations ------------ */
4016*22dc650dSSadaf Ebrahimi
4017*22dc650dSSadaf Ebrahimi /* Give no match if we have passed the bumpalong limit. */
4018*22dc650dSSadaf Ebrahimi
4019*22dc650dSSadaf Ebrahimi if (start_match > bumpalong_limit) break;
4020*22dc650dSSadaf Ebrahimi
4021*22dc650dSSadaf Ebrahimi /* OK, now we can do the business */
4022*22dc650dSSadaf Ebrahimi
4023*22dc650dSSadaf Ebrahimi mb->start_used_ptr = start_match;
4024*22dc650dSSadaf Ebrahimi mb->last_used_ptr = start_match;
4025*22dc650dSSadaf Ebrahimi mb->recursive = NULL;
4026*22dc650dSSadaf Ebrahimi
4027*22dc650dSSadaf Ebrahimi rc = internal_dfa_match(
4028*22dc650dSSadaf Ebrahimi mb, /* fixed match data */
4029*22dc650dSSadaf Ebrahimi mb->start_code, /* this subexpression's code */
4030*22dc650dSSadaf Ebrahimi start_match, /* where we currently are */
4031*22dc650dSSadaf Ebrahimi start_offset, /* start offset in subject */
4032*22dc650dSSadaf Ebrahimi match_data->ovector, /* offset vector */
4033*22dc650dSSadaf Ebrahimi (uint32_t)match_data->oveccount * 2, /* actual size of same */
4034*22dc650dSSadaf Ebrahimi workspace, /* workspace vector */
4035*22dc650dSSadaf Ebrahimi (int)wscount, /* size of same */
4036*22dc650dSSadaf Ebrahimi 0, /* function recurse level */
4037*22dc650dSSadaf Ebrahimi base_recursion_workspace); /* initial workspace for recursion */
4038*22dc650dSSadaf Ebrahimi
4039*22dc650dSSadaf Ebrahimi /* Anything other than "no match" means we are done, always; otherwise, carry
4040*22dc650dSSadaf Ebrahimi on only if not anchored. */
4041*22dc650dSSadaf Ebrahimi
4042*22dc650dSSadaf Ebrahimi if (rc != PCRE2_ERROR_NOMATCH || anchored)
4043*22dc650dSSadaf Ebrahimi {
4044*22dc650dSSadaf Ebrahimi if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
4045*22dc650dSSadaf Ebrahimi {
4046*22dc650dSSadaf Ebrahimi match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
4047*22dc650dSSadaf Ebrahimi match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
4048*22dc650dSSadaf Ebrahimi }
4049*22dc650dSSadaf Ebrahimi match_data->subject_length = length;
4050*22dc650dSSadaf Ebrahimi match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
4051*22dc650dSSadaf Ebrahimi match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
4052*22dc650dSSadaf Ebrahimi match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4053*22dc650dSSadaf Ebrahimi match_data->rc = rc;
4054*22dc650dSSadaf Ebrahimi
4055*22dc650dSSadaf Ebrahimi if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
4056*22dc650dSSadaf Ebrahimi {
4057*22dc650dSSadaf Ebrahimi length = CU2BYTES(length + was_zero_terminated);
4058*22dc650dSSadaf Ebrahimi match_data->subject = match_data->memctl.malloc(length,
4059*22dc650dSSadaf Ebrahimi match_data->memctl.memory_data);
4060*22dc650dSSadaf Ebrahimi if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4061*22dc650dSSadaf Ebrahimi memcpy((void *)match_data->subject, subject, length);
4062*22dc650dSSadaf Ebrahimi match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
4063*22dc650dSSadaf Ebrahimi }
4064*22dc650dSSadaf Ebrahimi else
4065*22dc650dSSadaf Ebrahimi {
4066*22dc650dSSadaf Ebrahimi if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4067*22dc650dSSadaf Ebrahimi }
4068*22dc650dSSadaf Ebrahimi goto EXIT;
4069*22dc650dSSadaf Ebrahimi }
4070*22dc650dSSadaf Ebrahimi
4071*22dc650dSSadaf Ebrahimi /* Advance to the next subject character unless we are at the end of a line
4072*22dc650dSSadaf Ebrahimi and firstline is set. */
4073*22dc650dSSadaf Ebrahimi
4074*22dc650dSSadaf Ebrahimi if (firstline && IS_NEWLINE(start_match)) break;
4075*22dc650dSSadaf Ebrahimi start_match++;
4076*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
4077*22dc650dSSadaf Ebrahimi if (utf)
4078*22dc650dSSadaf Ebrahimi {
4079*22dc650dSSadaf Ebrahimi ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4080*22dc650dSSadaf Ebrahimi }
4081*22dc650dSSadaf Ebrahimi #endif
4082*22dc650dSSadaf Ebrahimi if (start_match > end_subject) break;
4083*22dc650dSSadaf Ebrahimi
4084*22dc650dSSadaf Ebrahimi /* If we have just passed a CR and we are now at a LF, and the pattern does
4085*22dc650dSSadaf Ebrahimi not contain any explicit matches for \r or \n, and the newline option is CRLF
4086*22dc650dSSadaf Ebrahimi or ANY or ANYCRLF, advance the match position by one more character. */
4087*22dc650dSSadaf Ebrahimi
4088*22dc650dSSadaf Ebrahimi if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
4089*22dc650dSSadaf Ebrahimi start_match < end_subject &&
4090*22dc650dSSadaf Ebrahimi UCHAR21TEST(start_match) == CHAR_NL &&
4091*22dc650dSSadaf Ebrahimi (re->flags & PCRE2_HASCRORLF) == 0 &&
4092*22dc650dSSadaf Ebrahimi (mb->nltype == NLTYPE_ANY ||
4093*22dc650dSSadaf Ebrahimi mb->nltype == NLTYPE_ANYCRLF ||
4094*22dc650dSSadaf Ebrahimi mb->nllen == 2))
4095*22dc650dSSadaf Ebrahimi start_match++;
4096*22dc650dSSadaf Ebrahimi
4097*22dc650dSSadaf Ebrahimi } /* "Bumpalong" loop */
4098*22dc650dSSadaf Ebrahimi
4099*22dc650dSSadaf Ebrahimi NOMATCH_EXIT:
4100*22dc650dSSadaf Ebrahimi rc = PCRE2_ERROR_NOMATCH;
4101*22dc650dSSadaf Ebrahimi
4102*22dc650dSSadaf Ebrahimi EXIT:
4103*22dc650dSSadaf Ebrahimi while (rws->next != NULL)
4104*22dc650dSSadaf Ebrahimi {
4105*22dc650dSSadaf Ebrahimi RWS_anchor *next = rws->next;
4106*22dc650dSSadaf Ebrahimi rws->next = next->next;
4107*22dc650dSSadaf Ebrahimi mb->memctl.free(next, mb->memctl.memory_data);
4108*22dc650dSSadaf Ebrahimi }
4109*22dc650dSSadaf Ebrahimi
4110*22dc650dSSadaf Ebrahimi return rc;
4111*22dc650dSSadaf Ebrahimi }
4112*22dc650dSSadaf Ebrahimi
4113*22dc650dSSadaf Ebrahimi /* These #undefs are here to enable unity builds with CMake: when several source files are combined into a single translation unit, macro names left defined at the end of this file would otherwise still be visible to the code that follows it. */
4114*22dc650dSSadaf Ebrahimi
4115*22dc650dSSadaf Ebrahimi #undef NLBLOCK /* Block containing newline information */
4116*22dc650dSSadaf Ebrahimi #undef PSSTART /* Field containing processed string start */
4117*22dc650dSSadaf Ebrahimi #undef PSEND /* Field containing processed string end */
4118*22dc650dSSadaf Ebrahimi
4119*22dc650dSSadaf Ebrahimi /* End of pcre2_dfa_match.c */
4120