xref: /aosp_15_r20/external/pcre/src/pcre2_newline.c (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi *      Perl-Compatible Regular Expressions       *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi 
5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax
6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language.
7*22dc650dSSadaf Ebrahimi 
8*22dc650dSSadaf Ebrahimi                        Written by Philip Hazel
9*22dc650dSSadaf Ebrahimi      Original API code Copyright (c) 1997-2012 University of Cambridge
10*22dc650dSSadaf Ebrahimi          New API code Copyright (c) 2016 University of Cambridge
11*22dc650dSSadaf Ebrahimi 
12*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without
14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met:
15*22dc650dSSadaf Ebrahimi 
16*22dc650dSSadaf Ebrahimi     * Redistributions of source code must retain the above copyright notice,
17*22dc650dSSadaf Ebrahimi       this list of conditions and the following disclaimer.
18*22dc650dSSadaf Ebrahimi 
19*22dc650dSSadaf Ebrahimi     * Redistributions in binary form must reproduce the above copyright
20*22dc650dSSadaf Ebrahimi       notice, this list of conditions and the following disclaimer in the
21*22dc650dSSadaf Ebrahimi       documentation and/or other materials provided with the distribution.
22*22dc650dSSadaf Ebrahimi 
23*22dc650dSSadaf Ebrahimi     * Neither the name of the University of Cambridge nor the names of its
24*22dc650dSSadaf Ebrahimi       contributors may be used to endorse or promote products derived from
25*22dc650dSSadaf Ebrahimi       this software without specific prior written permission.
26*22dc650dSSadaf Ebrahimi 
27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE.
38*22dc650dSSadaf Ebrahimi -----------------------------------------------------------------------------
39*22dc650dSSadaf Ebrahimi */
40*22dc650dSSadaf Ebrahimi 
41*22dc650dSSadaf Ebrahimi 
42*22dc650dSSadaf Ebrahimi /* This module contains internal functions for testing newlines when more than
43*22dc650dSSadaf Ebrahimi one kind of newline is to be recognized. When a newline is found, its length is
44*22dc650dSSadaf Ebrahimi returned. In principle, we could implement several newline "types", each
45*22dc650dSSadaf Ebrahimi referring to a different set of newline characters. At present, PCRE2 supports
46*22dc650dSSadaf Ebrahimi only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF,
47*22dc650dSSadaf Ebrahimi and NLTYPE_ANY. The full list of Unicode newline characters is taken from
48*22dc650dSSadaf Ebrahimi http://unicode.org/unicode/reports/tr18/. */
49*22dc650dSSadaf Ebrahimi 
50*22dc650dSSadaf Ebrahimi 
51*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H
52*22dc650dSSadaf Ebrahimi #include "config.h"
53*22dc650dSSadaf Ebrahimi #endif
54*22dc650dSSadaf Ebrahimi 
55*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h"
56*22dc650dSSadaf Ebrahimi 
57*22dc650dSSadaf Ebrahimi 
58*22dc650dSSadaf Ebrahimi 
59*22dc650dSSadaf Ebrahimi /*************************************************
60*22dc650dSSadaf Ebrahimi *      Check for newline at given position       *
61*22dc650dSSadaf Ebrahimi *************************************************/
62*22dc650dSSadaf Ebrahimi 
63*22dc650dSSadaf Ebrahimi /* This function is called only via the IS_NEWLINE macro, which does so only
64*22dc650dSSadaf Ebrahimi when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed
65*22dc650dSSadaf Ebrahimi newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the code unit
66*22dc650dSSadaf Ebrahimi pointed to by ptr is less than the end of the string.
67*22dc650dSSadaf Ebrahimi 
68*22dc650dSSadaf Ebrahimi Arguments:
69*22dc650dSSadaf Ebrahimi   ptr          pointer to possible newline
70*22dc650dSSadaf Ebrahimi   type         the newline type
71*22dc650dSSadaf Ebrahimi   endptr       pointer to the end of the string
72*22dc650dSSadaf Ebrahimi   lenptr       where to return the length
73*22dc650dSSadaf Ebrahimi   utf          TRUE if in utf mode
74*22dc650dSSadaf Ebrahimi 
75*22dc650dSSadaf Ebrahimi Returns:       TRUE or FALSE
76*22dc650dSSadaf Ebrahimi */
77*22dc650dSSadaf Ebrahimi 
78*22dc650dSSadaf Ebrahimi BOOL
PRIV(is_newline)79*22dc650dSSadaf Ebrahimi PRIV(is_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR endptr,
80*22dc650dSSadaf Ebrahimi   uint32_t *lenptr, BOOL utf)
81*22dc650dSSadaf Ebrahimi {
82*22dc650dSSadaf Ebrahimi uint32_t c;
83*22dc650dSSadaf Ebrahimi 
84*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
85*22dc650dSSadaf Ebrahimi if (utf) { GETCHAR(c, ptr); } else c = *ptr;
86*22dc650dSSadaf Ebrahimi #else
87*22dc650dSSadaf Ebrahimi (void)utf;
88*22dc650dSSadaf Ebrahimi c = *ptr;
89*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
90*22dc650dSSadaf Ebrahimi 
91*22dc650dSSadaf Ebrahimi if (type == NLTYPE_ANYCRLF) switch(c)
92*22dc650dSSadaf Ebrahimi   {
93*22dc650dSSadaf Ebrahimi   case CHAR_LF:
94*22dc650dSSadaf Ebrahimi   *lenptr = 1;
95*22dc650dSSadaf Ebrahimi   return TRUE;
96*22dc650dSSadaf Ebrahimi 
97*22dc650dSSadaf Ebrahimi   case CHAR_CR:
98*22dc650dSSadaf Ebrahimi   *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1;
99*22dc650dSSadaf Ebrahimi   return TRUE;
100*22dc650dSSadaf Ebrahimi 
101*22dc650dSSadaf Ebrahimi   default:
102*22dc650dSSadaf Ebrahimi   return FALSE;
103*22dc650dSSadaf Ebrahimi   }
104*22dc650dSSadaf Ebrahimi 
105*22dc650dSSadaf Ebrahimi /* NLTYPE_ANY */
106*22dc650dSSadaf Ebrahimi 
107*22dc650dSSadaf Ebrahimi else switch(c)
108*22dc650dSSadaf Ebrahimi   {
109*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
110*22dc650dSSadaf Ebrahimi   case CHAR_NEL:
111*22dc650dSSadaf Ebrahimi #endif
112*22dc650dSSadaf Ebrahimi   case CHAR_LF:
113*22dc650dSSadaf Ebrahimi   case CHAR_VT:
114*22dc650dSSadaf Ebrahimi   case CHAR_FF:
115*22dc650dSSadaf Ebrahimi   *lenptr = 1;
116*22dc650dSSadaf Ebrahimi   return TRUE;
117*22dc650dSSadaf Ebrahimi 
118*22dc650dSSadaf Ebrahimi   case CHAR_CR:
119*22dc650dSSadaf Ebrahimi   *lenptr = (ptr < endptr - 1 && ptr[1] == CHAR_LF)? 2 : 1;
120*22dc650dSSadaf Ebrahimi   return TRUE;
121*22dc650dSSadaf Ebrahimi 
122*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
123*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
124*22dc650dSSadaf Ebrahimi   case CHAR_NEL:
125*22dc650dSSadaf Ebrahimi   *lenptr = utf? 2 : 1;
126*22dc650dSSadaf Ebrahimi   return TRUE;
127*22dc650dSSadaf Ebrahimi 
128*22dc650dSSadaf Ebrahimi   case 0x2028:   /* LS */
129*22dc650dSSadaf Ebrahimi   case 0x2029:   /* PS */
130*22dc650dSSadaf Ebrahimi   *lenptr = 3;
131*22dc650dSSadaf Ebrahimi   return TRUE;
132*22dc650dSSadaf Ebrahimi 
133*22dc650dSSadaf Ebrahimi #else  /* 16-bit or 32-bit code units */
134*22dc650dSSadaf Ebrahimi   case CHAR_NEL:
135*22dc650dSSadaf Ebrahimi   case 0x2028:   /* LS */
136*22dc650dSSadaf Ebrahimi   case 0x2029:   /* PS */
137*22dc650dSSadaf Ebrahimi   *lenptr = 1;
138*22dc650dSSadaf Ebrahimi   return TRUE;
139*22dc650dSSadaf Ebrahimi #endif
140*22dc650dSSadaf Ebrahimi #endif /* Not EBCDIC */
141*22dc650dSSadaf Ebrahimi 
142*22dc650dSSadaf Ebrahimi   default:
143*22dc650dSSadaf Ebrahimi   return FALSE;
144*22dc650dSSadaf Ebrahimi   }
145*22dc650dSSadaf Ebrahimi }
146*22dc650dSSadaf Ebrahimi 
147*22dc650dSSadaf Ebrahimi 
148*22dc650dSSadaf Ebrahimi 
149*22dc650dSSadaf Ebrahimi /*************************************************
150*22dc650dSSadaf Ebrahimi *     Check for newline at previous position     *
151*22dc650dSSadaf Ebrahimi *************************************************/
152*22dc650dSSadaf Ebrahimi 
153*22dc650dSSadaf Ebrahimi /* This function is called only via the WAS_NEWLINE macro, which does so only
154*22dc650dSSadaf Ebrahimi when the newline type is NLTYPE_ANY or NLTYPE_ANYCRLF. The case of a fixed
155*22dc650dSSadaf Ebrahimi newline (NLTYPE_FIXED) is handled inline. It is guaranteed that the initial
156*22dc650dSSadaf Ebrahimi value of ptr is greater than the start of the string that is being processed.
157*22dc650dSSadaf Ebrahimi 
158*22dc650dSSadaf Ebrahimi Arguments:
159*22dc650dSSadaf Ebrahimi   ptr          pointer to possible newline
160*22dc650dSSadaf Ebrahimi   type         the newline type
161*22dc650dSSadaf Ebrahimi   startptr     pointer to the start of the string
162*22dc650dSSadaf Ebrahimi   lenptr       where to return the length
163*22dc650dSSadaf Ebrahimi   utf          TRUE if in utf mode
164*22dc650dSSadaf Ebrahimi 
165*22dc650dSSadaf Ebrahimi Returns:       TRUE or FALSE
166*22dc650dSSadaf Ebrahimi */
167*22dc650dSSadaf Ebrahimi 
168*22dc650dSSadaf Ebrahimi BOOL
PRIV(was_newline)169*22dc650dSSadaf Ebrahimi PRIV(was_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR startptr,
170*22dc650dSSadaf Ebrahimi   uint32_t *lenptr, BOOL utf)
171*22dc650dSSadaf Ebrahimi {
172*22dc650dSSadaf Ebrahimi uint32_t c;
173*22dc650dSSadaf Ebrahimi ptr--;
174*22dc650dSSadaf Ebrahimi 
175*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE
176*22dc650dSSadaf Ebrahimi if (utf)
177*22dc650dSSadaf Ebrahimi   {
178*22dc650dSSadaf Ebrahimi   BACKCHAR(ptr);
179*22dc650dSSadaf Ebrahimi   GETCHAR(c, ptr);
180*22dc650dSSadaf Ebrahimi   }
181*22dc650dSSadaf Ebrahimi else c = *ptr;
182*22dc650dSSadaf Ebrahimi #else
183*22dc650dSSadaf Ebrahimi (void)utf;
184*22dc650dSSadaf Ebrahimi c = *ptr;
185*22dc650dSSadaf Ebrahimi #endif  /* SUPPORT_UNICODE */
186*22dc650dSSadaf Ebrahimi 
187*22dc650dSSadaf Ebrahimi if (type == NLTYPE_ANYCRLF) switch(c)
188*22dc650dSSadaf Ebrahimi   {
189*22dc650dSSadaf Ebrahimi   case CHAR_LF:
190*22dc650dSSadaf Ebrahimi   *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1;
191*22dc650dSSadaf Ebrahimi   return TRUE;
192*22dc650dSSadaf Ebrahimi 
193*22dc650dSSadaf Ebrahimi   case CHAR_CR:
194*22dc650dSSadaf Ebrahimi   *lenptr = 1;
195*22dc650dSSadaf Ebrahimi   return TRUE;
196*22dc650dSSadaf Ebrahimi 
197*22dc650dSSadaf Ebrahimi   default:
198*22dc650dSSadaf Ebrahimi   return FALSE;
199*22dc650dSSadaf Ebrahimi   }
200*22dc650dSSadaf Ebrahimi 
201*22dc650dSSadaf Ebrahimi /* NLTYPE_ANY */
202*22dc650dSSadaf Ebrahimi 
203*22dc650dSSadaf Ebrahimi else switch(c)
204*22dc650dSSadaf Ebrahimi   {
205*22dc650dSSadaf Ebrahimi   case CHAR_LF:
206*22dc650dSSadaf Ebrahimi   *lenptr = (ptr > startptr && ptr[-1] == CHAR_CR)? 2 : 1;
207*22dc650dSSadaf Ebrahimi   return TRUE;
208*22dc650dSSadaf Ebrahimi 
209*22dc650dSSadaf Ebrahimi #ifdef EBCDIC
210*22dc650dSSadaf Ebrahimi   case CHAR_NEL:
211*22dc650dSSadaf Ebrahimi #endif
212*22dc650dSSadaf Ebrahimi   case CHAR_VT:
213*22dc650dSSadaf Ebrahimi   case CHAR_FF:
214*22dc650dSSadaf Ebrahimi   case CHAR_CR:
215*22dc650dSSadaf Ebrahimi   *lenptr = 1;
216*22dc650dSSadaf Ebrahimi   return TRUE;
217*22dc650dSSadaf Ebrahimi 
218*22dc650dSSadaf Ebrahimi #ifndef EBCDIC
219*22dc650dSSadaf Ebrahimi #if PCRE2_CODE_UNIT_WIDTH == 8
220*22dc650dSSadaf Ebrahimi   case CHAR_NEL:
221*22dc650dSSadaf Ebrahimi   *lenptr = utf? 2 : 1;
222*22dc650dSSadaf Ebrahimi   return TRUE;
223*22dc650dSSadaf Ebrahimi 
224*22dc650dSSadaf Ebrahimi   case 0x2028:   /* LS */
225*22dc650dSSadaf Ebrahimi   case 0x2029:   /* PS */
226*22dc650dSSadaf Ebrahimi   *lenptr = 3;
227*22dc650dSSadaf Ebrahimi   return TRUE;
228*22dc650dSSadaf Ebrahimi 
229*22dc650dSSadaf Ebrahimi #else /* 16-bit or 32-bit code units */
230*22dc650dSSadaf Ebrahimi   case CHAR_NEL:
231*22dc650dSSadaf Ebrahimi   case 0x2028:   /* LS */
232*22dc650dSSadaf Ebrahimi   case 0x2029:   /* PS */
233*22dc650dSSadaf Ebrahimi   *lenptr = 1;
234*22dc650dSSadaf Ebrahimi   return TRUE;
235*22dc650dSSadaf Ebrahimi #endif
236*22dc650dSSadaf Ebrahimi #endif /* Not EBCDIC */
237*22dc650dSSadaf Ebrahimi 
238*22dc650dSSadaf Ebrahimi   default:
239*22dc650dSSadaf Ebrahimi   return FALSE;
240*22dc650dSSadaf Ebrahimi   }
241*22dc650dSSadaf Ebrahimi }
242*22dc650dSSadaf Ebrahimi 
243*22dc650dSSadaf Ebrahimi /* End of pcre2_newline.c */
244