/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
         New API code Copyright (c) 2016 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


/* This module contains internal functions for testing newlines when more than
one kind of newline is to be recognized. When a newline is found, its length is
returned. In principle, we could implement several newline "types", each
referring to a different set of newline characters. At present, PCRE2 supports
only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF,
and NLTYPE_ANY. The full list of Unicode newline characters is taken from
http://unicode.org/unicode/reports/tr18/. */


51*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
52*22dc650dSSadaf Ebrahimi #include "config.h"
53*22dc650dSSadaf Ebrahimi #endif
54*22dc650dSSadaf Ebrahimi
55*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"



/*************************************************
*      Check for newline at given position       *
*************************************************/

63*22dc650dSSadaf Ebrahimi /* This function is called only via the IS_NEWLINE macro, which does so only
64*22dc650dSSadaf Ebrahimi when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed
65*22dc650dSSadaf Ebrahimi newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the code unit
66*22dc650dSSadaf Ebrahimi pointed to by ptr is less than the end of the string.
67*22dc650dSSadaf Ebrahimi
68*22dc650dSSadaf Ebrahimi Arguments:
69*22dc650dSSadaf Ebrahimi ptr pointer to possible newline
70*22dc650dSSadaf Ebrahimi type the newline type
71*22dc650dSSadaf Ebrahimi endptr pointer to the end of the string
72*22dc650dSSadaf Ebrahimi lenptr where to return the length
73*22dc650dSSadaf Ebrahimi utf TRUE if in utf mode
74*22dc650dSSadaf Ebrahimi
75*22dc650dSSadaf Ebrahimi Returns: TRUE or FALSE
76*22dc650dSSadaf Ebrahimi */
77*22dc650dSSadaf Ebrahimi
78*22dc650dSSadaf Ebrahimi BOOL
PRIV(is_newline)79*22dc650dSSadaf Ebrahimi PRIV(is_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR endptr,
80*22dc650dSSadaf Ebrahimi uint32_t *lenptr, BOOL utf)
81*22dc650dSSadaf Ebrahimi {
82*22dc650dSSadaf Ebrahimi uint32_t c;
83*22dc650dSSadaf Ebrahimi
84*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
85*22dc650dSSadaf Ebrahimi if (utf) { GETCHAR(c, ptr); } else c = *ptr;
86*22dc650dSSadaf Ebrahimi #else
87*22dc650dSSadaf Ebrahimi (void)utf;
88*22dc650dSSadaf Ebrahimi c = *ptr;
89*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
90*22dc650dSSadaf Ebrahimi
91*22dc650dSSadaf Ebrahimi if (type == NLTYPE_ANYCRLF) switch(c)
92*22dc650dSSadaf Ebrahimi {
93*22dc650dSSadaf Ebrahimi case CHAR_LF:
94*22dc650dSSadaf Ebrahimi *lenptr = 1;
95*22dc650dSSadaf Ebrahimi return TRUE;
96*22dc650dSSadaf Ebrahimi
97*22dc650dSSadaf Ebrahimi case CHAR_CR:
98*22dc650dSSadaf Ebrahimi *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1;
99*22dc650dSSadaf Ebrahimi return TRUE;
100*22dc650dSSadaf Ebrahimi
101*22dc650dSSadaf Ebrahimi default:
102*22dc650dSSadaf Ebrahimi return FALSE;
103*22dc650dSSadaf Ebrahimi }
104*22dc650dSSadaf Ebrahimi
105*22dc650dSSadaf Ebrahimi /* NLTYPE_ANY */
106*22dc650dSSadaf Ebrahimi
107*22dc650dSSadaf Ebrahimi else switch(c)
108*22dc650dSSadaf Ebrahimi {
109*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
110*22dc650dSSadaf Ebrahimi case CHAR_NEL:
111*22dc650dSSadaf Ebrahimi #endif
112*22dc650dSSadaf Ebrahimi case CHAR_LF:
113*22dc650dSSadaf Ebrahimi case CHAR_VT:
114*22dc650dSSadaf Ebrahimi case CHAR_FF:
115*22dc650dSSadaf Ebrahimi *lenptr = 1;
116*22dc650dSSadaf Ebrahimi return TRUE;
117*22dc650dSSadaf Ebrahimi
118*22dc650dSSadaf Ebrahimi case CHAR_CR:
119*22dc650dSSadaf Ebrahimi *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1;
120*22dc650dSSadaf Ebrahimi return TRUE;
121*22dc650dSSadaf Ebrahimi
122*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
123*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
124*22dc650dSSadaf Ebrahimi case CHAR_NEL:
125*22dc650dSSadaf Ebrahimi *lenptr = utf? 2 : 1;
126*22dc650dSSadaf Ebrahimi return TRUE;
127*22dc650dSSadaf Ebrahimi
128*22dc650dSSadaf Ebrahimi case 0x2028: /* LS */
129*22dc650dSSadaf Ebrahimi case 0x2029: /* PS */
130*22dc650dSSadaf Ebrahimi *lenptr = 3;
131*22dc650dSSadaf Ebrahimi return TRUE;
132*22dc650dSSadaf Ebrahimi
133*22dc650dSSadaf Ebrahimi #else /* 16-bit or 32-bit code units */
134*22dc650dSSadaf Ebrahimi case CHAR_NEL:
135*22dc650dSSadaf Ebrahimi case 0x2028: /* LS */
136*22dc650dSSadaf Ebrahimi case 0x2029: /* PS */
137*22dc650dSSadaf Ebrahimi *lenptr = 1;
138*22dc650dSSadaf Ebrahimi return TRUE;
139*22dc650dSSadaf Ebrahimi #endif
140*22dc650dSSadaf Ebrahimi #endif /* Not EBCDIC */
141*22dc650dSSadaf Ebrahimi
142*22dc650dSSadaf Ebrahimi default:
143*22dc650dSSadaf Ebrahimi return FALSE;
144*22dc650dSSadaf Ebrahimi }
145*22dc650dSSadaf Ebrahimi }



/*************************************************
*     Check for newline at previous position     *
*************************************************/

153*22dc650dSSadaf Ebrahimi /* This function is called only via the WAS_NEWLINE macro, which does so only
154*22dc650dSSadaf Ebrahimi when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed
155*22dc650dSSadaf Ebrahimi newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the initial
156*22dc650dSSadaf Ebrahimi value of ptr is greater than the start of the string that is being processed.
157*22dc650dSSadaf Ebrahimi
158*22dc650dSSadaf Ebrahimi Arguments:
159*22dc650dSSadaf Ebrahimi ptr pointer to possible newline
160*22dc650dSSadaf Ebrahimi type the newline type
161*22dc650dSSadaf Ebrahimi startptr pointer to the start of the string
162*22dc650dSSadaf Ebrahimi lenptr where to return the length
163*22dc650dSSadaf Ebrahimi utf TRUE if in utf mode
164*22dc650dSSadaf Ebrahimi
165*22dc650dSSadaf Ebrahimi Returns: TRUE or FALSE
166*22dc650dSSadaf Ebrahimi */
167*22dc650dSSadaf Ebrahimi
168*22dc650dSSadaf Ebrahimi BOOL
PRIV(was_newline)169*22dc650dSSadaf Ebrahimi PRIV(was_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR startptr,
170*22dc650dSSadaf Ebrahimi uint32_t *lenptr, BOOL utf)
171*22dc650dSSadaf Ebrahimi {
172*22dc650dSSadaf Ebrahimi uint32_t c;
173*22dc650dSSadaf Ebrahimi ptr--;
174*22dc650dSSadaf Ebrahimi
175*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
176*22dc650dSSadaf Ebrahimi if (utf)
177*22dc650dSSadaf Ebrahimi {
178*22dc650dSSadaf Ebrahimi BACKCHAR(ptr);
179*22dc650dSSadaf Ebrahimi GETCHAR(c, ptr);
180*22dc650dSSadaf Ebrahimi }
181*22dc650dSSadaf Ebrahimi else c = *ptr;
182*22dc650dSSadaf Ebrahimi #else
183*22dc650dSSadaf Ebrahimi (void)utf;
184*22dc650dSSadaf Ebrahimi c = *ptr;
185*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */
186*22dc650dSSadaf Ebrahimi
187*22dc650dSSadaf Ebrahimi if (type == NLTYPE_ANYCRLF) switch(c)
188*22dc650dSSadaf Ebrahimi {
189*22dc650dSSadaf Ebrahimi case CHAR_LF:
190*22dc650dSSadaf Ebrahimi *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1;
191*22dc650dSSadaf Ebrahimi return TRUE;
192*22dc650dSSadaf Ebrahimi
193*22dc650dSSadaf Ebrahimi case CHAR_CR:
194*22dc650dSSadaf Ebrahimi *lenptr = 1;
195*22dc650dSSadaf Ebrahimi return TRUE;
196*22dc650dSSadaf Ebrahimi
197*22dc650dSSadaf Ebrahimi default:
198*22dc650dSSadaf Ebrahimi return FALSE;
199*22dc650dSSadaf Ebrahimi }
200*22dc650dSSadaf Ebrahimi
201*22dc650dSSadaf Ebrahimi /* NLTYPE_ANY */
202*22dc650dSSadaf Ebrahimi
203*22dc650dSSadaf Ebrahimi else switch(c)
204*22dc650dSSadaf Ebrahimi {
205*22dc650dSSadaf Ebrahimi case CHAR_LF:
206*22dc650dSSadaf Ebrahimi *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1;
207*22dc650dSSadaf Ebrahimi return TRUE;
208*22dc650dSSadaf Ebrahimi
209*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
210*22dc650dSSadaf Ebrahimi case CHAR_NEL:
211*22dc650dSSadaf Ebrahimi #endif
212*22dc650dSSadaf Ebrahimi case CHAR_VT:
213*22dc650dSSadaf Ebrahimi case CHAR_FF:
214*22dc650dSSadaf Ebrahimi case CHAR_CR:
215*22dc650dSSadaf Ebrahimi *lenptr = 1;
216*22dc650dSSadaf Ebrahimi return TRUE;
217*22dc650dSSadaf Ebrahimi
218*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
219*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
220*22dc650dSSadaf Ebrahimi case CHAR_NEL:
221*22dc650dSSadaf Ebrahimi *lenptr = utf? 2 : 1;
222*22dc650dSSadaf Ebrahimi return TRUE;
223*22dc650dSSadaf Ebrahimi
224*22dc650dSSadaf Ebrahimi case 0x2028: /* LS */
225*22dc650dSSadaf Ebrahimi case 0x2029: /* PS */
226*22dc650dSSadaf Ebrahimi *lenptr = 3;
227*22dc650dSSadaf Ebrahimi return TRUE;
228*22dc650dSSadaf Ebrahimi
229*22dc650dSSadaf Ebrahimi #else /* 16-bit or 32-bit code units */
230*22dc650dSSadaf Ebrahimi case CHAR_NEL:
231*22dc650dSSadaf Ebrahimi case 0x2028: /* LS */
232*22dc650dSSadaf Ebrahimi case 0x2029: /* PS */
233*22dc650dSSadaf Ebrahimi *lenptr = 1;
234*22dc650dSSadaf Ebrahimi return TRUE;
235*22dc650dSSadaf Ebrahimi #endif
236*22dc650dSSadaf Ebrahimi #endif /* Not EBCDIC */
237*22dc650dSSadaf Ebrahimi
238*22dc650dSSadaf Ebrahimi default:
239*22dc650dSSadaf Ebrahimi return FALSE;
240*22dc650dSSadaf Ebrahimi }
241*22dc650dSSadaf Ebrahimi }

/* End of pcre2_newline.c */