1*22dc650dSSadaf Ebrahimi /************************************************* 2*22dc650dSSadaf Ebrahimi * Perl-Compatible Regular Expressions * 3*22dc650dSSadaf Ebrahimi *************************************************/ 4*22dc650dSSadaf Ebrahimi 5*22dc650dSSadaf Ebrahimi /* PCRE is a library of functions to support regular expressions whose syntax 6*22dc650dSSadaf Ebrahimi and semantics are as close as possible to those of the Perl 5 language. 7*22dc650dSSadaf Ebrahimi 8*22dc650dSSadaf Ebrahimi Written by Philip Hazel 9*22dc650dSSadaf Ebrahimi Original API code Copyright (c) 1997-2012 University of Cambridge 10*22dc650dSSadaf Ebrahimi New API code Copyright (c) 2016-2024 University of Cambridge 11*22dc650dSSadaf Ebrahimi 12*22dc650dSSadaf Ebrahimi ----------------------------------------------------------------------------- 13*22dc650dSSadaf Ebrahimi Redistribution and use in source and binary forms, with or without 14*22dc650dSSadaf Ebrahimi modification, are permitted provided that the following conditions are met: 15*22dc650dSSadaf Ebrahimi 16*22dc650dSSadaf Ebrahimi * Redistributions of source code must retain the above copyright notice, 17*22dc650dSSadaf Ebrahimi this list of conditions and the following disclaimer. 18*22dc650dSSadaf Ebrahimi 19*22dc650dSSadaf Ebrahimi * Redistributions in binary form must reproduce the above copyright 20*22dc650dSSadaf Ebrahimi notice, this list of conditions and the following disclaimer in the 21*22dc650dSSadaf Ebrahimi documentation and/or other materials provided with the distribution. 22*22dc650dSSadaf Ebrahimi 23*22dc650dSSadaf Ebrahimi * Neither the name of the University of Cambridge nor the names of its 24*22dc650dSSadaf Ebrahimi contributors may be used to endorse or promote products derived from 25*22dc650dSSadaf Ebrahimi this software without specific prior written permission. 26*22dc650dSSadaf Ebrahimi 27*22dc650dSSadaf Ebrahimi THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28*22dc650dSSadaf Ebrahimi AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29*22dc650dSSadaf Ebrahimi IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30*22dc650dSSadaf Ebrahimi ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31*22dc650dSSadaf Ebrahimi LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32*22dc650dSSadaf Ebrahimi CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33*22dc650dSSadaf Ebrahimi SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34*22dc650dSSadaf Ebrahimi INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35*22dc650dSSadaf Ebrahimi CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36*22dc650dSSadaf Ebrahimi ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37*22dc650dSSadaf Ebrahimi POSSIBILITY OF SUCH DAMAGE. 38*22dc650dSSadaf Ebrahimi ----------------------------------------------------------------------------- 39*22dc650dSSadaf Ebrahimi */ 40*22dc650dSSadaf Ebrahimi 41*22dc650dSSadaf Ebrahimi /* This module contains some fixed tables that are used by more than one of the 42*22dc650dSSadaf Ebrahimi PCRE2 code modules. The tables are also #included by the pcre2test program, 43*22dc650dSSadaf Ebrahimi which uses macros to change their names from _pcre2_xxx to xxxx, thereby 44*22dc650dSSadaf Ebrahimi avoiding name clashes with the library. In this case, PCRE2_PCRE2TEST is 45*22dc650dSSadaf Ebrahimi defined. */ 46*22dc650dSSadaf Ebrahimi 47*22dc650dSSadaf Ebrahimi #ifndef PCRE2_PCRE2TEST /* We're compiling the library */ 48*22dc650dSSadaf Ebrahimi #ifdef HAVE_CONFIG_H 49*22dc650dSSadaf Ebrahimi #include "config.h" 50*22dc650dSSadaf Ebrahimi #endif 51*22dc650dSSadaf Ebrahimi #include "pcre2_internal.h" 52*22dc650dSSadaf Ebrahimi #endif /* PCRE2_PCRE2TEST */ 53*22dc650dSSadaf Ebrahimi 54*22dc650dSSadaf Ebrahimi /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that 55*22dc650dSSadaf Ebrahimi the definition is next to the definition of the opcodes in pcre2_internal.h. 56*22dc650dSSadaf Ebrahimi This is mode-dependent, so it is skipped when this file is included by 57*22dc650dSSadaf Ebrahimi pcre2test. */ 58*22dc650dSSadaf Ebrahimi 59*22dc650dSSadaf Ebrahimi #ifndef PCRE2_PCRE2TEST 60*22dc650dSSadaf Ebrahimi const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS }; 61*22dc650dSSadaf Ebrahimi #endif 62*22dc650dSSadaf Ebrahimi 63*22dc650dSSadaf Ebrahimi /* Tables of horizontal and vertical whitespace characters, suitable for 64*22dc650dSSadaf Ebrahimi adding to classes. */ 65*22dc650dSSadaf Ebrahimi 66*22dc650dSSadaf Ebrahimi const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST }; 67*22dc650dSSadaf Ebrahimi const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST }; 68*22dc650dSSadaf Ebrahimi 69*22dc650dSSadaf Ebrahimi /* These tables are the pairs of delimiters that are valid for callout string 70*22dc650dSSadaf Ebrahimi arguments. For each starting delimiter there must be a matching ending 71*22dc650dSSadaf Ebrahimi delimiter, which in fact is different only for bracket-like delimiters. */ 72*22dc650dSSadaf Ebrahimi 73*22dc650dSSadaf Ebrahimi const uint32_t PRIV(callout_start_delims)[] = { 74*22dc650dSSadaf Ebrahimi CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK, 75*22dc650dSSadaf Ebrahimi CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN, 76*22dc650dSSadaf Ebrahimi CHAR_DOLLAR_SIGN, CHAR_LEFT_CURLY_BRACKET, 0 }; 77*22dc650dSSadaf Ebrahimi 78*22dc650dSSadaf Ebrahimi const uint32_t PRIV(callout_end_delims[]) = { 79*22dc650dSSadaf Ebrahimi CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK, 80*22dc650dSSadaf Ebrahimi CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN, 81*22dc650dSSadaf Ebrahimi CHAR_DOLLAR_SIGN, CHAR_RIGHT_CURLY_BRACKET, 0 }; 82*22dc650dSSadaf Ebrahimi 83*22dc650dSSadaf Ebrahimi 84*22dc650dSSadaf Ebrahimi /************************************************* 85*22dc650dSSadaf Ebrahimi * Tables for UTF-8 support * 86*22dc650dSSadaf Ebrahimi *************************************************/ 87*22dc650dSSadaf Ebrahimi 88*22dc650dSSadaf Ebrahimi /* These tables are required by pcre2test in 16- or 32-bit mode, as well 89*22dc650dSSadaf Ebrahimi as for the library in 8-bit mode, because pcre2test uses UTF-8 internally for 90*22dc650dSSadaf Ebrahimi handling wide characters. */ 91*22dc650dSSadaf Ebrahimi 92*22dc650dSSadaf Ebrahimi #if defined PCRE2_PCRE2TEST || \ 93*22dc650dSSadaf Ebrahimi (defined SUPPORT_UNICODE && \ 94*22dc650dSSadaf Ebrahimi defined PCRE2_CODE_UNIT_WIDTH && \ 95*22dc650dSSadaf Ebrahimi PCRE2_CODE_UNIT_WIDTH == 8) 96*22dc650dSSadaf Ebrahimi 97*22dc650dSSadaf Ebrahimi /* These are the breakpoints for different numbers of bytes in a UTF-8 98*22dc650dSSadaf Ebrahimi character. */ 99*22dc650dSSadaf Ebrahimi 100*22dc650dSSadaf Ebrahimi const int PRIV(utf8_table1)[] = 101*22dc650dSSadaf Ebrahimi { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; 102*22dc650dSSadaf Ebrahimi 103*22dc650dSSadaf Ebrahimi const int PRIV(utf8_table1_size) = sizeof(PRIV(utf8_table1)) / sizeof(int); 104*22dc650dSSadaf Ebrahimi 105*22dc650dSSadaf Ebrahimi /* These are the indicator bits and the mask for the data bits to set in the 106*22dc650dSSadaf Ebrahimi first byte of a character, indexed by the number of additional bytes. */ 107*22dc650dSSadaf Ebrahimi 108*22dc650dSSadaf Ebrahimi const int PRIV(utf8_table2)[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; 109*22dc650dSSadaf Ebrahimi const int PRIV(utf8_table3)[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; 110*22dc650dSSadaf Ebrahimi 111*22dc650dSSadaf Ebrahimi /* Table of the number of extra bytes, indexed by the first byte masked with 112*22dc650dSSadaf Ebrahimi 0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ 113*22dc650dSSadaf Ebrahimi 114*22dc650dSSadaf Ebrahimi const uint8_t PRIV(utf8_table4)[] = { 115*22dc650dSSadaf Ebrahimi 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 116*22dc650dSSadaf Ebrahimi 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 117*22dc650dSSadaf Ebrahimi 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 118*22dc650dSSadaf Ebrahimi 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; 119*22dc650dSSadaf Ebrahimi 120*22dc650dSSadaf Ebrahimi #endif /* UTF-8 support needed */ 121*22dc650dSSadaf Ebrahimi 122*22dc650dSSadaf Ebrahimi /* Tables concerned with Unicode properties are relevant only when Unicode 123*22dc650dSSadaf Ebrahimi support is enabled. See also the pcre2_ucptables.c file, which is generated by 124*22dc650dSSadaf Ebrahimi a Python script from Unicode data files. */ 125*22dc650dSSadaf Ebrahimi 126*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_UNICODE 127*22dc650dSSadaf Ebrahimi 128*22dc650dSSadaf Ebrahimi /* Table to translate from particular type value to the general value. */ 129*22dc650dSSadaf Ebrahimi 130*22dc650dSSadaf Ebrahimi const uint32_t PRIV(ucp_gentype)[] = { 131*22dc650dSSadaf Ebrahimi ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */ 132*22dc650dSSadaf Ebrahimi ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */ 133*22dc650dSSadaf Ebrahimi ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */ 134*22dc650dSSadaf Ebrahimi ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */ 135*22dc650dSSadaf Ebrahimi ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */ 136*22dc650dSSadaf Ebrahimi ucp_P, ucp_P, /* Ps, Po */ 137*22dc650dSSadaf Ebrahimi ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */ 138*22dc650dSSadaf Ebrahimi ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */ 139*22dc650dSSadaf Ebrahimi }; 140*22dc650dSSadaf Ebrahimi 141*22dc650dSSadaf Ebrahimi /* This table encodes the rules for finding the end of an extended grapheme 142*22dc650dSSadaf Ebrahimi cluster. Every code point has a grapheme break property which is one of the 143*22dc650dSSadaf Ebrahimi ucp_gbXX values defined in pcre2_ucp.h. These changed between Unicode versions 144*22dc650dSSadaf Ebrahimi 10 and 11. The 2-dimensional table is indexed by the properties of two adjacent 145*22dc650dSSadaf Ebrahimi code points. The left property selects a word from the table, and the right 146*22dc650dSSadaf Ebrahimi property selects a bit from that word like this: 147*22dc650dSSadaf Ebrahimi 148*22dc650dSSadaf Ebrahimi PRIV(ucp_gbtable)[left-property] & (1u << right-property) 149*22dc650dSSadaf Ebrahimi 150*22dc650dSSadaf Ebrahimi The value is non-zero if a grapheme break is NOT permitted between the relevant 151*22dc650dSSadaf Ebrahimi two code points. The breaking rules are as follows: 152*22dc650dSSadaf Ebrahimi 153*22dc650dSSadaf Ebrahimi 1. Break at the start and end of text (pretty obviously). 154*22dc650dSSadaf Ebrahimi 155*22dc650dSSadaf Ebrahimi 2. Do not break between a CR and LF; otherwise, break before and after 156*22dc650dSSadaf Ebrahimi controls. 157*22dc650dSSadaf Ebrahimi 158*22dc650dSSadaf Ebrahimi 3. Do not break Hangul syllable sequences, the rules for which are: 159*22dc650dSSadaf Ebrahimi 160*22dc650dSSadaf Ebrahimi L may be followed by L, V, LV or LVT 161*22dc650dSSadaf Ebrahimi LV or V may be followed by V or T 162*22dc650dSSadaf Ebrahimi LVT or T may be followed by T 163*22dc650dSSadaf Ebrahimi 164*22dc650dSSadaf Ebrahimi 4. Do not break before extending characters or zero-width-joiner (ZWJ). 165*22dc650dSSadaf Ebrahimi 166*22dc650dSSadaf Ebrahimi The following rules are only for extended grapheme clusters (but that's what we 167*22dc650dSSadaf Ebrahimi are implementing). 168*22dc650dSSadaf Ebrahimi 169*22dc650dSSadaf Ebrahimi 5. Do not break before SpacingMarks. 170*22dc650dSSadaf Ebrahimi 171*22dc650dSSadaf Ebrahimi 6. Do not break after Prepend characters. 172*22dc650dSSadaf Ebrahimi 173*22dc650dSSadaf Ebrahimi 7. Do not break within emoji modifier sequences or emoji zwj sequences. That 174*22dc650dSSadaf Ebrahimi is, do not break between characters with the Extended_Pictographic property 175*22dc650dSSadaf Ebrahimi if a ZWJ intervenes. Extend characters are allowed between the characters; 176*22dc650dSSadaf Ebrahimi this cannot be represented in this table, the code has to deal with it. 177*22dc650dSSadaf Ebrahimi 178*22dc650dSSadaf Ebrahimi 8. Do not break within emoji flag sequences. That is, do not break between 179*22dc650dSSadaf Ebrahimi regional indicator (RI) symbols if there are an odd number of RI characters 180*22dc650dSSadaf Ebrahimi before the break point. This table encodes "join RI characters"; the code 181*22dc650dSSadaf Ebrahimi has to deal with checking for previous adjoining RIs. 182*22dc650dSSadaf Ebrahimi 183*22dc650dSSadaf Ebrahimi 9. Otherwise, break everywhere. 184*22dc650dSSadaf Ebrahimi */ 185*22dc650dSSadaf Ebrahimi 186*22dc650dSSadaf Ebrahimi #define ESZ (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbZWJ) 187*22dc650dSSadaf Ebrahimi 188*22dc650dSSadaf Ebrahimi const uint32_t PRIV(ucp_gbtable)[] = { 189*22dc650dSSadaf Ebrahimi (1u<<ucp_gbLF), /* 0 CR */ 190*22dc650dSSadaf Ebrahimi 0, /* 1 LF */ 191*22dc650dSSadaf Ebrahimi 0, /* 2 Control */ 192*22dc650dSSadaf Ebrahimi ESZ, /* 3 Extend */ 193*22dc650dSSadaf Ebrahimi ESZ|(1u<<ucp_gbPrepend)| /* 4 Prepend */ 194*22dc650dSSadaf Ebrahimi (1u<<ucp_gbL)|(1u<<ucp_gbV)|(1u<<ucp_gbT)| 195*22dc650dSSadaf Ebrahimi (1u<<ucp_gbLV)|(1u<<ucp_gbLVT)|(1u<<ucp_gbOther)| 196*22dc650dSSadaf Ebrahimi (1u<<ucp_gbRegional_Indicator), 197*22dc650dSSadaf Ebrahimi ESZ, /* 5 SpacingMark */ 198*22dc650dSSadaf Ebrahimi ESZ|(1u<<ucp_gbL)|(1u<<ucp_gbV)|(1u<<ucp_gbLV)| /* 6 L */ 199*22dc650dSSadaf Ebrahimi (1u<<ucp_gbLVT), 200*22dc650dSSadaf Ebrahimi ESZ|(1u<<ucp_gbV)|(1u<<ucp_gbT), /* 7 V */ 201*22dc650dSSadaf Ebrahimi ESZ|(1u<<ucp_gbT), /* 8 T */ 202*22dc650dSSadaf Ebrahimi ESZ|(1u<<ucp_gbV)|(1u<<ucp_gbT), /* 9 LV */ 203*22dc650dSSadaf Ebrahimi ESZ|(1u<<ucp_gbT), /* 10 LVT */ 204*22dc650dSSadaf Ebrahimi (1u<<ucp_gbRegional_Indicator), /* 11 Regional Indicator */ 205*22dc650dSSadaf Ebrahimi ESZ, /* 12 Other */ 206*22dc650dSSadaf Ebrahimi ESZ|(1u<<ucp_gbExtended_Pictographic), /* 13 ZWJ */ 207*22dc650dSSadaf Ebrahimi ESZ /* 14 Extended Pictographic */ 208*22dc650dSSadaf Ebrahimi }; 209*22dc650dSSadaf Ebrahimi 210*22dc650dSSadaf Ebrahimi #undef ESZ 211*22dc650dSSadaf Ebrahimi 212*22dc650dSSadaf Ebrahimi #ifdef SUPPORT_JIT 213*22dc650dSSadaf Ebrahimi /* This table reverses PRIV(ucp_gentype). We can save the cost 214*22dc650dSSadaf Ebrahimi of a memory load. */ 215*22dc650dSSadaf Ebrahimi 216*22dc650dSSadaf Ebrahimi const int PRIV(ucp_typerange)[] = { 217*22dc650dSSadaf Ebrahimi ucp_Cc, ucp_Cs, 218*22dc650dSSadaf Ebrahimi ucp_Ll, ucp_Lu, 219*22dc650dSSadaf Ebrahimi ucp_Mc, ucp_Mn, 220*22dc650dSSadaf Ebrahimi ucp_Nd, ucp_No, 221*22dc650dSSadaf Ebrahimi ucp_Pc, ucp_Ps, 222*22dc650dSSadaf Ebrahimi ucp_Sc, ucp_So, 223*22dc650dSSadaf Ebrahimi ucp_Zl, ucp_Zs, 224*22dc650dSSadaf Ebrahimi }; 225*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_JIT */ 226*22dc650dSSadaf Ebrahimi 227*22dc650dSSadaf Ebrahimi /* Finally, include the tables that are auto-generated from the Unicode data 228*22dc650dSSadaf Ebrahimi files. */ 229*22dc650dSSadaf Ebrahimi 230*22dc650dSSadaf Ebrahimi #include "pcre2_ucptables.c" 231*22dc650dSSadaf Ebrahimi 232*22dc650dSSadaf Ebrahimi #endif /* SUPPORT_UNICODE */ 233*22dc650dSSadaf Ebrahimi 234*22dc650dSSadaf Ebrahimi /* End of pcre2_tables.c */ 235