1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi * Perl-Compatible Regular Expressions *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi
8*22dc650dSSadaf Ebrahimi Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi New API code Copyright (c) 2016-2023 University of Cambridge
11*22dc650dSSadaf Ebrahimi
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi
16*22dc650dSSadaf Ebrahimi * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi
19*22dc650dSSadaf Ebrahimi * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi
23*22dc650dSSadaf Ebrahimi * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi
/* This module contains an internal function that is used to match an extended
class. It is used by pcre2_auto_possessify() and by both pcre2_match() and
pcre2_dfa_match(). */
44*22dc650dSSadaf Ebrahimi
45*22dc650dSSadaf Ebrahimi
46*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
47*22dc650dSSadaf Ebrahimi #include "config.h"
48*22dc650dSSadaf Ebrahimi #endif
49*22dc650dSSadaf Ebrahimi
50*22dc650dSSadaf Ebrahimi
51*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
52*22dc650dSSadaf Ebrahimi
53*22dc650dSSadaf Ebrahimi /*************************************************
54*22dc650dSSadaf Ebrahimi * Match character against an XCLASS *
55*22dc650dSSadaf Ebrahimi *************************************************/
56*22dc650dSSadaf Ebrahimi
57*22dc650dSSadaf Ebrahimi /* This function is called to match a character against an extended class that
58*22dc650dSSadaf Ebrahimi might contain codepoints above 255 and/or Unicode properties.
59*22dc650dSSadaf Ebrahimi
60*22dc650dSSadaf Ebrahimi Arguments:
61*22dc650dSSadaf Ebrahimi c the character
62*22dc650dSSadaf Ebrahimi data points to the flag code unit of the XCLASS data
63*22dc650dSSadaf Ebrahimi utf TRUE if in UTF mode
64*22dc650dSSadaf Ebrahimi
65*22dc650dSSadaf Ebrahimi Returns: TRUE if character matches, else FALSE
66*22dc650dSSadaf Ebrahimi */
67*22dc650dSSadaf Ebrahimi
68*22dc650dSSadaf Ebrahimi BOOL
PRIV(xclass)69*22dc650dSSadaf Ebrahimi PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf)
70*22dc650dSSadaf Ebrahimi {
71*22dc650dSSadaf Ebrahimi PCRE2_UCHAR t;
72*22dc650dSSadaf Ebrahimi BOOL negated = (*data & XCL_NOT) != 0;
73*22dc650dSSadaf Ebrahimi
74*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
75*22dc650dSSadaf Ebrahimi /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
76*22dc650dSSadaf Ebrahimi utf = TRUE;
77*22dc650dSSadaf Ebrahimi #endif
78*22dc650dSSadaf Ebrahimi
79*22dc650dSSadaf Ebrahimi /* Code points < 256 are matched against a bitmap, if one is present. If not,
80*22dc650dSSadaf Ebrahimi we still carry on, because there may be ranges that start below 256 in the
81*22dc650dSSadaf Ebrahimi additional data. */
82*22dc650dSSadaf Ebrahimi
83*22dc650dSSadaf Ebrahimi if (c < 256)
84*22dc650dSSadaf Ebrahimi {
85*22dc650dSSadaf Ebrahimi if ((*data & XCL_HASPROP) == 0)
86*22dc650dSSadaf Ebrahimi {
87*22dc650dSSadaf Ebrahimi if ((*data & XCL_MAP) == 0) return negated;
88*22dc650dSSadaf Ebrahimi return (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0;
89*22dc650dSSadaf Ebrahimi }
90*22dc650dSSadaf Ebrahimi if ((*data & XCL_MAP) != 0 &&
91*22dc650dSSadaf Ebrahimi (((uint8_t *)(data + 1))[c/8] & (1u << (c&7))) != 0)
92*22dc650dSSadaf Ebrahimi return !negated; /* char found */
93*22dc650dSSadaf Ebrahimi }
94*22dc650dSSadaf Ebrahimi
95*22dc650dSSadaf Ebrahimi /* First skip the bit map if present. Then match against the list of Unicode
96*22dc650dSSadaf Ebrahimi properties or large chars or ranges that end with a large char. We won't ever
97*22dc650dSSadaf Ebrahimi encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */
98*22dc650dSSadaf Ebrahimi
99*22dc650dSSadaf Ebrahimi if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(PCRE2_UCHAR);
100*22dc650dSSadaf Ebrahimi
101*22dc650dSSadaf Ebrahimi while ((t = *data++) != XCL_END)
102*22dc650dSSadaf Ebrahimi {
103*22dc650dSSadaf Ebrahimi uint32_t x, y;
104*22dc650dSSadaf Ebrahimi if (t == XCL_SINGLE)
105*22dc650dSSadaf Ebrahimi {
106*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
107*22dc650dSSadaf Ebrahimi if (utf)
108*22dc650dSSadaf Ebrahimi {
109*22dc650dSSadaf Ebrahimi GETCHARINC(x, data); /* macro generates multiple statements */
110*22dc650dSSadaf Ebrahimi }
111*22dc650dSSadaf Ebrahimi else
112*22dc650dSSadaf Ebrahimi #endif
113*22dc650dSSadaf Ebrahimi x = *data++;
114*22dc650dSSadaf Ebrahimi if (c == x) return !negated;
115*22dc650dSSadaf Ebrahimi }
116*22dc650dSSadaf Ebrahimi else if (t == XCL_RANGE)
117*22dc650dSSadaf Ebrahimi {
118*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
119*22dc650dSSadaf Ebrahimi if (utf)
120*22dc650dSSadaf Ebrahimi {
121*22dc650dSSadaf Ebrahimi GETCHARINC(x, data); /* macro generates multiple statements */
122*22dc650dSSadaf Ebrahimi GETCHARINC(y, data); /* macro generates multiple statements */
123*22dc650dSSadaf Ebrahimi }
124*22dc650dSSadaf Ebrahimi else
125*22dc650dSSadaf Ebrahimi #endif
126*22dc650dSSadaf Ebrahimi {
127*22dc650dSSadaf Ebrahimi x = *data++;
128*22dc650dSSadaf Ebrahimi y = *data++;
129*22dc650dSSadaf Ebrahimi }
130*22dc650dSSadaf Ebrahimi if (c >= x && c <= y) return !negated;
131*22dc650dSSadaf Ebrahimi }
132*22dc650dSSadaf Ebrahimi
133*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
134*22dc650dSSadaf Ebrahimi else /* XCL_PROP & XCL_NOTPROP */
135*22dc650dSSadaf Ebrahimi {
136*22dc650dSSadaf Ebrahimi int chartype;
137*22dc650dSSadaf Ebrahimi const ucd_record *prop = GET_UCD(c);
138*22dc650dSSadaf Ebrahimi BOOL isprop = t == XCL_PROP;
139*22dc650dSSadaf Ebrahimi BOOL ok;
140*22dc650dSSadaf Ebrahimi
141*22dc650dSSadaf Ebrahimi switch(*data)
142*22dc650dSSadaf Ebrahimi {
143*22dc650dSSadaf Ebrahimi case PT_ANY:
144*22dc650dSSadaf Ebrahimi if (isprop) return !negated;
145*22dc650dSSadaf Ebrahimi break;
146*22dc650dSSadaf Ebrahimi
147*22dc650dSSadaf Ebrahimi case PT_LAMP:
148*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
149*22dc650dSSadaf Ebrahimi if ((chartype == ucp_Lu || chartype == ucp_Ll ||
150*22dc650dSSadaf Ebrahimi chartype == ucp_Lt) == isprop) return !negated;
151*22dc650dSSadaf Ebrahimi break;
152*22dc650dSSadaf Ebrahimi
153*22dc650dSSadaf Ebrahimi case PT_GC:
154*22dc650dSSadaf Ebrahimi if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
155*22dc650dSSadaf Ebrahimi return !negated;
156*22dc650dSSadaf Ebrahimi break;
157*22dc650dSSadaf Ebrahimi
158*22dc650dSSadaf Ebrahimi case PT_PC:
159*22dc650dSSadaf Ebrahimi if ((data[1] == prop->chartype) == isprop) return !negated;
160*22dc650dSSadaf Ebrahimi break;
161*22dc650dSSadaf Ebrahimi
162*22dc650dSSadaf Ebrahimi case PT_SC:
163*22dc650dSSadaf Ebrahimi if ((data[1] == prop->script) == isprop) return !negated;
164*22dc650dSSadaf Ebrahimi break;
165*22dc650dSSadaf Ebrahimi
166*22dc650dSSadaf Ebrahimi case PT_SCX:
167*22dc650dSSadaf Ebrahimi ok = (data[1] == prop->script ||
168*22dc650dSSadaf Ebrahimi MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), data[1]) != 0);
169*22dc650dSSadaf Ebrahimi if (ok == isprop) return !negated;
170*22dc650dSSadaf Ebrahimi break;
171*22dc650dSSadaf Ebrahimi
172*22dc650dSSadaf Ebrahimi case PT_ALNUM:
173*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
174*22dc650dSSadaf Ebrahimi if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
175*22dc650dSSadaf Ebrahimi PRIV(ucp_gentype)[chartype] == ucp_N) == isprop)
176*22dc650dSSadaf Ebrahimi return !negated;
177*22dc650dSSadaf Ebrahimi break;
178*22dc650dSSadaf Ebrahimi
179*22dc650dSSadaf Ebrahimi /* Perl space used to exclude VT, but from Perl 5.18 it is included,
180*22dc650dSSadaf Ebrahimi which means that Perl space and POSIX space are now identical. PCRE
181*22dc650dSSadaf Ebrahimi was changed at release 8.34. */
182*22dc650dSSadaf Ebrahimi
183*22dc650dSSadaf Ebrahimi case PT_SPACE: /* Perl space */
184*22dc650dSSadaf Ebrahimi case PT_PXSPACE: /* POSIX space */
185*22dc650dSSadaf Ebrahimi switch(c)
186*22dc650dSSadaf Ebrahimi {
187*22dc650dSSadaf Ebrahimi HSPACE_CASES:
188*22dc650dSSadaf Ebrahimi VSPACE_CASES:
189*22dc650dSSadaf Ebrahimi if (isprop) return !negated;
190*22dc650dSSadaf Ebrahimi break;
191*22dc650dSSadaf Ebrahimi
192*22dc650dSSadaf Ebrahimi default:
193*22dc650dSSadaf Ebrahimi if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
194*22dc650dSSadaf Ebrahimi return !negated;
195*22dc650dSSadaf Ebrahimi break;
196*22dc650dSSadaf Ebrahimi }
197*22dc650dSSadaf Ebrahimi break;
198*22dc650dSSadaf Ebrahimi
199*22dc650dSSadaf Ebrahimi case PT_WORD:
200*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
201*22dc650dSSadaf Ebrahimi if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
202*22dc650dSSadaf Ebrahimi PRIV(ucp_gentype)[chartype] == ucp_N ||
203*22dc650dSSadaf Ebrahimi chartype == ucp_Mn || chartype == ucp_Pc) == isprop)
204*22dc650dSSadaf Ebrahimi return !negated;
205*22dc650dSSadaf Ebrahimi break;
206*22dc650dSSadaf Ebrahimi
207*22dc650dSSadaf Ebrahimi case PT_UCNC:
208*22dc650dSSadaf Ebrahimi if (c < 0xa0)
209*22dc650dSSadaf Ebrahimi {
210*22dc650dSSadaf Ebrahimi if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
211*22dc650dSSadaf Ebrahimi c == CHAR_GRAVE_ACCENT) == isprop)
212*22dc650dSSadaf Ebrahimi return !negated;
213*22dc650dSSadaf Ebrahimi }
214*22dc650dSSadaf Ebrahimi else
215*22dc650dSSadaf Ebrahimi {
216*22dc650dSSadaf Ebrahimi if ((c < 0xd800 || c > 0xdfff) == isprop)
217*22dc650dSSadaf Ebrahimi return !negated;
218*22dc650dSSadaf Ebrahimi }
219*22dc650dSSadaf Ebrahimi break;
220*22dc650dSSadaf Ebrahimi
221*22dc650dSSadaf Ebrahimi case PT_BIDICL:
222*22dc650dSSadaf Ebrahimi if ((UCD_BIDICLASS_PROP(prop) == data[1]) == isprop)
223*22dc650dSSadaf Ebrahimi return !negated;
224*22dc650dSSadaf Ebrahimi break;
225*22dc650dSSadaf Ebrahimi
226*22dc650dSSadaf Ebrahimi case PT_BOOL:
227*22dc650dSSadaf Ebrahimi ok = MAPBIT(PRIV(ucd_boolprop_sets) +
228*22dc650dSSadaf Ebrahimi UCD_BPROPS_PROP(prop), data[1]) != 0;
229*22dc650dSSadaf Ebrahimi if (ok == isprop) return !negated;
230*22dc650dSSadaf Ebrahimi break;
231*22dc650dSSadaf Ebrahimi
232*22dc650dSSadaf Ebrahimi /* The following three properties can occur only in an XCLASS, as there
233*22dc650dSSadaf Ebrahimi is no \p or \P coding for them. */
234*22dc650dSSadaf Ebrahimi
235*22dc650dSSadaf Ebrahimi /* Graphic character. Implement this as not Z (space or separator) and
236*22dc650dSSadaf Ebrahimi not C (other), except for Cf (format) with a few exceptions. This seems
237*22dc650dSSadaf Ebrahimi to be what Perl does. The exceptional characters are:
238*22dc650dSSadaf Ebrahimi
239*22dc650dSSadaf Ebrahimi U+061C Arabic Letter Mark
240*22dc650dSSadaf Ebrahimi U+180E Mongolian Vowel Separator
241*22dc650dSSadaf Ebrahimi U+2066 - U+2069 Various "isolate"s
242*22dc650dSSadaf Ebrahimi */
243*22dc650dSSadaf Ebrahimi
244*22dc650dSSadaf Ebrahimi case PT_PXGRAPH:
245*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
246*22dc650dSSadaf Ebrahimi if ((PRIV(ucp_gentype)[chartype] != ucp_Z &&
247*22dc650dSSadaf Ebrahimi (PRIV(ucp_gentype)[chartype] != ucp_C ||
248*22dc650dSSadaf Ebrahimi (chartype == ucp_Cf &&
249*22dc650dSSadaf Ebrahimi c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
250*22dc650dSSadaf Ebrahimi )) == isprop)
251*22dc650dSSadaf Ebrahimi return !negated;
252*22dc650dSSadaf Ebrahimi break;
253*22dc650dSSadaf Ebrahimi
254*22dc650dSSadaf Ebrahimi /* Printable character: same as graphic, with the addition of Zs, i.e.
255*22dc650dSSadaf Ebrahimi not Zl and not Zp, and U+180E. */
256*22dc650dSSadaf Ebrahimi
257*22dc650dSSadaf Ebrahimi case PT_PXPRINT:
258*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
259*22dc650dSSadaf Ebrahimi if ((chartype != ucp_Zl &&
260*22dc650dSSadaf Ebrahimi chartype != ucp_Zp &&
261*22dc650dSSadaf Ebrahimi (PRIV(ucp_gentype)[chartype] != ucp_C ||
262*22dc650dSSadaf Ebrahimi (chartype == ucp_Cf &&
263*22dc650dSSadaf Ebrahimi c != 0x061c && (c < 0x2066 || c > 0x2069))
264*22dc650dSSadaf Ebrahimi )) == isprop)
265*22dc650dSSadaf Ebrahimi return !negated;
266*22dc650dSSadaf Ebrahimi break;
267*22dc650dSSadaf Ebrahimi
268*22dc650dSSadaf Ebrahimi /* Punctuation: all Unicode punctuation, plus ASCII characters that
269*22dc650dSSadaf Ebrahimi Unicode treats as symbols rather than punctuation, for Perl
270*22dc650dSSadaf Ebrahimi compatibility (these are $+<=>^`|~). */
271*22dc650dSSadaf Ebrahimi
272*22dc650dSSadaf Ebrahimi case PT_PXPUNCT:
273*22dc650dSSadaf Ebrahimi chartype = prop->chartype;
274*22dc650dSSadaf Ebrahimi if ((PRIV(ucp_gentype)[chartype] == ucp_P ||
275*22dc650dSSadaf Ebrahimi (c < 128 && PRIV(ucp_gentype)[chartype] == ucp_S)) == isprop)
276*22dc650dSSadaf Ebrahimi return !negated;
277*22dc650dSSadaf Ebrahimi break;
278*22dc650dSSadaf Ebrahimi
279*22dc650dSSadaf Ebrahimi /* Perl has two sets of hex digits */
280*22dc650dSSadaf Ebrahimi
281*22dc650dSSadaf Ebrahimi case PT_PXXDIGIT:
282*22dc650dSSadaf Ebrahimi if (((c >= CHAR_0 && c <= CHAR_9) ||
283*22dc650dSSadaf Ebrahimi (c >= CHAR_A && c <= CHAR_F) ||
284*22dc650dSSadaf Ebrahimi (c >= CHAR_a && c <= CHAR_f) ||
285*22dc650dSSadaf Ebrahimi (c >= 0xff10 && c <= 0xff19) || /* Fullwidth digits */
286*22dc650dSSadaf Ebrahimi (c >= 0xff21 && c <= 0xff26) || /* Fullwidth letters */
287*22dc650dSSadaf Ebrahimi (c >= 0xff41 && c <= 0xff46)) == isprop)
288*22dc650dSSadaf Ebrahimi return !negated;
289*22dc650dSSadaf Ebrahimi break;
290*22dc650dSSadaf Ebrahimi
291*22dc650dSSadaf Ebrahimi /* This should never occur, but compilers may mutter if there is no
292*22dc650dSSadaf Ebrahimi default. */
293*22dc650dSSadaf Ebrahimi
294*22dc650dSSadaf Ebrahimi default:
295*22dc650dSSadaf Ebrahimi return FALSE;
296*22dc650dSSadaf Ebrahimi }
297*22dc650dSSadaf Ebrahimi
298*22dc650dSSadaf Ebrahimi data += 2;
299*22dc650dSSadaf Ebrahimi }
300*22dc650dSSadaf Ebrahimi #else
301*22dc650dSSadaf Ebrahimi (void)utf; /* Avoid compiler warning */
302*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
303*22dc650dSSadaf Ebrahimi }
304*22dc650dSSadaf Ebrahimi
305*22dc650dSSadaf Ebrahimi return negated; /* char did not match */
306*22dc650dSSadaf Ebrahimi }
307*22dc650dSSadaf Ebrahimi
308*22dc650dSSadaf Ebrahimi /* End of pcre2_xclass.c */
309