1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker // 4*0e209d39SAndroid Build Coastguard Worker // regexcmp.h 5*0e209d39SAndroid Build Coastguard Worker // 6*0e209d39SAndroid Build Coastguard Worker // Copyright (C) 2002-2016, International Business Machines Corporation and others. 7*0e209d39SAndroid Build Coastguard Worker // All Rights Reserved. 8*0e209d39SAndroid Build Coastguard Worker // 9*0e209d39SAndroid Build Coastguard Worker // This file contains declarations for the class RegexCompile 10*0e209d39SAndroid Build Coastguard Worker // 11*0e209d39SAndroid Build Coastguard Worker // This class is internal to the regular expression implementation. 12*0e209d39SAndroid Build Coastguard Worker // For the public Regular Expression API, see the file "unicode/regex.h" 13*0e209d39SAndroid Build Coastguard Worker // 14*0e209d39SAndroid Build Coastguard Worker 15*0e209d39SAndroid Build Coastguard Worker 16*0e209d39SAndroid Build Coastguard Worker #ifndef RBBISCAN_H 17*0e209d39SAndroid Build Coastguard Worker #define RBBISCAN_H 18*0e209d39SAndroid Build Coastguard Worker 19*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 20*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_REGULAR_EXPRESSIONS 21*0e209d39SAndroid Build Coastguard Worker 22*0e209d39SAndroid Build Coastguard Worker #include "unicode/parseerr.h" 23*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h" 24*0e209d39SAndroid Build Coastguard Worker #include "unicode/uobject.h" 25*0e209d39SAndroid Build Coastguard Worker #include "unicode/utext.h" 26*0e209d39SAndroid Build Coastguard Worker #include "uhash.h" 27*0e209d39SAndroid Build Coastguard Worker #include "uvector.h" 28*0e209d39SAndroid Build Coastguard Worker #include "uvectr32.h" 29*0e209d39SAndroid Build Coastguard Worker 30*0e209d39SAndroid Build Coastguard Worker 31*0e209d39SAndroid Build Coastguard Worker 32*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 33*0e209d39SAndroid Build Coastguard Worker 34*0e209d39SAndroid Build Coastguard Worker 35*0e209d39SAndroid Build Coastguard Worker //-------------------------------------------------------------------------------- 36*0e209d39SAndroid Build Coastguard Worker // 37*0e209d39SAndroid Build Coastguard Worker // class RegexCompile Contains the regular expression compiler. 38*0e209d39SAndroid Build Coastguard Worker // 39*0e209d39SAndroid Build Coastguard Worker //-------------------------------------------------------------------------------- 40*0e209d39SAndroid Build Coastguard Worker class RegexPattern; 41*0e209d39SAndroid Build Coastguard Worker 42*0e209d39SAndroid Build Coastguard Worker 43*0e209d39SAndroid Build Coastguard Worker class U_I18N_API RegexCompile : public UMemory { 44*0e209d39SAndroid Build Coastguard Worker public: 45*0e209d39SAndroid Build Coastguard Worker 46*0e209d39SAndroid Build Coastguard Worker enum { 47*0e209d39SAndroid Build Coastguard Worker kStackSize = 100 // The size of the state stack for 48*0e209d39SAndroid Build Coastguard Worker }; // pattern parsing. Corresponds roughly 49*0e209d39SAndroid Build Coastguard Worker // to the depth of parentheses nesting 50*0e209d39SAndroid Build Coastguard Worker // that is allowed in the rules. 51*0e209d39SAndroid Build Coastguard Worker 52*0e209d39SAndroid Build Coastguard Worker struct RegexPatternChar { 53*0e209d39SAndroid Build Coastguard Worker UChar32 fChar; 54*0e209d39SAndroid Build Coastguard Worker UBool fQuoted; 55*0e209d39SAndroid Build Coastguard Worker }; 56*0e209d39SAndroid Build Coastguard Worker 57*0e209d39SAndroid Build Coastguard Worker RegexCompile(RegexPattern *rp, UErrorCode &e); 58*0e209d39SAndroid Build Coastguard Worker 59*0e209d39SAndroid Build Coastguard Worker void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); 60*0e209d39SAndroid Build Coastguard Worker void compile(UText *pat, UParseError &pp, UErrorCode &e); 61*0e209d39SAndroid Build Coastguard Worker 62*0e209d39SAndroid Build Coastguard Worker 63*0e209d39SAndroid Build Coastguard Worker virtual ~RegexCompile(); 64*0e209d39SAndroid Build Coastguard Worker 65*0e209d39SAndroid Build Coastguard Worker void nextChar(RegexPatternChar &c); // Get the next char from the input stream. 66*0e209d39SAndroid Build Coastguard Worker 67*0e209d39SAndroid Build Coastguard Worker 68*0e209d39SAndroid Build Coastguard Worker // Categories of parentheses in pattern. 69*0e209d39SAndroid Build Coastguard Worker // The category is saved in the compile-time parentheses stack frame, and 70*0e209d39SAndroid Build Coastguard Worker // determines the code to be generated when the matching close ) is encountered. 71*0e209d39SAndroid Build Coastguard Worker enum EParenClass { 72*0e209d39SAndroid Build Coastguard Worker plain = -1, // No special handling 73*0e209d39SAndroid Build Coastguard Worker capturing = -2, 74*0e209d39SAndroid Build Coastguard Worker atomic = -3, 75*0e209d39SAndroid Build Coastguard Worker lookAhead = -4, 76*0e209d39SAndroid Build Coastguard Worker negLookAhead = -5, 77*0e209d39SAndroid Build Coastguard Worker flags = -6, 78*0e209d39SAndroid Build Coastguard Worker lookBehind = -7, 79*0e209d39SAndroid Build Coastguard Worker lookBehindN = -8 80*0e209d39SAndroid Build Coastguard Worker }; 81*0e209d39SAndroid Build Coastguard Worker 82*0e209d39SAndroid Build Coastguard Worker private: 83*0e209d39SAndroid Build Coastguard Worker 84*0e209d39SAndroid Build Coastguard Worker 85*0e209d39SAndroid Build Coastguard Worker UBool doParseActions(int32_t a); 86*0e209d39SAndroid Build Coastguard Worker void error(UErrorCode e); // error reporting convenience function. 87*0e209d39SAndroid Build Coastguard Worker 88*0e209d39SAndroid Build Coastguard Worker UChar32 nextCharLL(); 89*0e209d39SAndroid Build Coastguard Worker UChar32 peekCharLL(); 90*0e209d39SAndroid Build Coastguard Worker UnicodeSet *scanProp(); 91*0e209d39SAndroid Build Coastguard Worker UnicodeSet *scanPosixProp(); 92*0e209d39SAndroid Build Coastguard Worker void handleCloseParen(); 93*0e209d39SAndroid Build Coastguard Worker int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern 94*0e209d39SAndroid Build Coastguard Worker // at the top of the just completed block 95*0e209d39SAndroid Build Coastguard Worker // or operation, and optionally ensure that 96*0e209d39SAndroid Build Coastguard Worker // there is space to add an opcode there. 97*0e209d39SAndroid Build Coastguard Worker void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for 98*0e209d39SAndroid Build Coastguard Worker // a reference to a UnicodeSet. 99*0e209d39SAndroid Build Coastguard Worker void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. 100*0e209d39SAndroid Build Coastguard Worker int32_t LoopOp); 101*0e209d39SAndroid Build Coastguard Worker UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier 102*0e209d39SAndroid Build Coastguard Worker void literalChar(UChar32 c); // Compile a literal char 103*0e209d39SAndroid Build Coastguard Worker void fixLiterals(UBool split=false); // Generate code for pending literal characters. 104*0e209d39SAndroid Build Coastguard Worker void insertOp(int32_t where); // Open up a slot for a new op in the 105*0e209d39SAndroid Build Coastguard Worker // generated code at the specified location. 106*0e209d39SAndroid Build Coastguard Worker void appendOp(int32_t op); // Append a new op to the compiled pattern. 107*0e209d39SAndroid Build Coastguard Worker void appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern. 108*0e209d39SAndroid Build Coastguard Worker int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode instruction. 109*0e209d39SAndroid Build Coastguard Worker int32_t allocateData(int32_t size); // Allocate space in the matcher data area. 110*0e209d39SAndroid Build Coastguard Worker // Return index of the newly allocated data. 111*0e209d39SAndroid Build Coastguard Worker int32_t allocateStackData(int32_t size); // Allocate space in the match back-track stack frame. 112*0e209d39SAndroid Build Coastguard Worker // Return offset index in the frame. 113*0e209d39SAndroid Build Coastguard Worker int32_t minMatchLength(int32_t start, 114*0e209d39SAndroid Build Coastguard Worker int32_t end); 115*0e209d39SAndroid Build Coastguard Worker int32_t maxMatchLength(int32_t start, 116*0e209d39SAndroid Build Coastguard Worker int32_t end); 117*0e209d39SAndroid Build Coastguard Worker void matchStartType(); 118*0e209d39SAndroid Build Coastguard Worker void stripNOPs(); 119*0e209d39SAndroid Build Coastguard Worker 120*0e209d39SAndroid Build Coastguard Worker void setEval(int32_t op); 121*0e209d39SAndroid Build Coastguard Worker void setPushOp(int32_t op); 122*0e209d39SAndroid Build Coastguard Worker UChar32 scanNamedChar(); 123*0e209d39SAndroid Build Coastguard Worker UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); 124*0e209d39SAndroid Build Coastguard Worker 125*0e209d39SAndroid Build Coastguard Worker public: // Public for testing only. 126*0e209d39SAndroid Build Coastguard Worker static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars); 127*0e209d39SAndroid Build Coastguard Worker private: 128*0e209d39SAndroid Build Coastguard Worker 129*0e209d39SAndroid Build Coastguard Worker 130*0e209d39SAndroid Build Coastguard Worker UErrorCode *fStatus; 131*0e209d39SAndroid Build Coastguard Worker RegexPattern *fRXPat; 132*0e209d39SAndroid Build Coastguard Worker UParseError *fParseErr; 133*0e209d39SAndroid Build Coastguard Worker 134*0e209d39SAndroid Build Coastguard Worker // 135*0e209d39SAndroid Build Coastguard Worker // Data associated with low level character scanning 136*0e209d39SAndroid Build Coastguard Worker // 137*0e209d39SAndroid Build Coastguard Worker int64_t fScanIndex; // Index of current character being processed 138*0e209d39SAndroid Build Coastguard Worker // in the rule input string. 139*0e209d39SAndroid Build Coastguard Worker UBool fQuoteMode; // Scan is in a \Q...\E quoted region 140*0e209d39SAndroid Build Coastguard Worker UBool fInBackslashQuote; // Scan is between a '\' and the following char. 141*0e209d39SAndroid Build Coastguard Worker UBool fEOLComments; // When scan is just after '(?', inhibit #... to 142*0e209d39SAndroid Build Coastguard Worker // end of line comments, in favor of (?#...) comments. 143*0e209d39SAndroid Build Coastguard Worker int64_t fLineNum; // Line number in input file. 144*0e209d39SAndroid Build Coastguard Worker int64_t fCharNum; // Char position within the line. 145*0e209d39SAndroid Build Coastguard Worker UChar32 fLastChar; // Previous char, needed to count CR-LF 146*0e209d39SAndroid Build Coastguard Worker // as a single line, not two. 147*0e209d39SAndroid Build Coastguard Worker UChar32 fPeekChar; // Saved char, if we've scanned ahead. 148*0e209d39SAndroid Build Coastguard Worker 149*0e209d39SAndroid Build Coastguard Worker 150*0e209d39SAndroid Build Coastguard Worker RegexPatternChar fC; // Current char for parse state machine 151*0e209d39SAndroid Build Coastguard Worker // processing. 152*0e209d39SAndroid Build Coastguard Worker 153*0e209d39SAndroid Build Coastguard Worker uint16_t fStack[kStackSize]; // State stack, holds state pushes 154*0e209d39SAndroid Build Coastguard Worker int32_t fStackPtr; // and pops as specified in the state 155*0e209d39SAndroid Build Coastguard Worker // transition rules. 156*0e209d39SAndroid Build Coastguard Worker 157*0e209d39SAndroid Build Coastguard Worker // 158*0e209d39SAndroid Build Coastguard Worker // Data associated with the generation of the pcode for the match engine 159*0e209d39SAndroid Build Coastguard Worker // 160*0e209d39SAndroid Build Coastguard Worker int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) 161*0e209d39SAndroid Build Coastguard Worker // Always has high bit (31) set so that flag values 162*0e209d39SAndroid Build Coastguard Worker // on the paren stack are distinguished from relocatable 163*0e209d39SAndroid Build Coastguard Worker // pcode addresses. 164*0e209d39SAndroid Build Coastguard Worker int32_t fNewModeFlags; // New flags, while compiling (?i, holds state 165*0e209d39SAndroid Build Coastguard Worker // until last flag is scanned. 166*0e209d39SAndroid Build Coastguard Worker UBool fSetModeFlag; // true for (?ismx, false for (?-ismx 167*0e209d39SAndroid Build Coastguard Worker 168*0e209d39SAndroid Build Coastguard Worker UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. 169*0e209d39SAndroid Build Coastguard Worker // Once completed, meaning that some non-literal pattern 170*0e209d39SAndroid Build Coastguard Worker // construct is encountered, the appropriate opcodes 171*0e209d39SAndroid Build Coastguard Worker // to match the literal will be generated, and this 172*0e209d39SAndroid Build Coastguard Worker // string will be cleared. 173*0e209d39SAndroid Build Coastguard Worker 174*0e209d39SAndroid Build Coastguard Worker int64_t fPatternLength; // Length of the input pattern string. 175*0e209d39SAndroid Build Coastguard Worker 176*0e209d39SAndroid Build Coastguard Worker UVector32 fParenStack; // parentheses stack. Each frame consists of 177*0e209d39SAndroid Build Coastguard Worker // the positions of compiled pattern operations 178*0e209d39SAndroid Build Coastguard Worker // needing fixup, followed by negative value. The 179*0e209d39SAndroid Build Coastguard Worker // first entry in each frame is the position of the 180*0e209d39SAndroid Build Coastguard Worker // spot reserved for use when a quantifier 181*0e209d39SAndroid Build Coastguard Worker // needs to add a SAVE at the start of a (block) 182*0e209d39SAndroid Build Coastguard Worker // The negative value (-1, -2,...) indicates 183*0e209d39SAndroid Build Coastguard Worker // the kind of paren that opened the frame. Some 184*0e209d39SAndroid Build Coastguard Worker // need special handling on close. 185*0e209d39SAndroid Build Coastguard Worker 186*0e209d39SAndroid Build Coastguard Worker 187*0e209d39SAndroid Build Coastguard Worker int32_t fMatchOpenParen; // The position in the compiled pattern 188*0e209d39SAndroid Build Coastguard Worker // of the slot reserved for a state save 189*0e209d39SAndroid Build Coastguard Worker // at the start of the most recently processed 190*0e209d39SAndroid Build Coastguard Worker // parenthesized block. Updated when processing 191*0e209d39SAndroid Build Coastguard Worker // a close to the location for the corresponding open. 192*0e209d39SAndroid Build Coastguard Worker 193*0e209d39SAndroid Build Coastguard Worker int32_t fMatchCloseParen; // The position in the pattern of the first 194*0e209d39SAndroid Build Coastguard Worker // location after the most recently processed 195*0e209d39SAndroid Build Coastguard Worker // parenthesized block. 196*0e209d39SAndroid Build Coastguard Worker 197*0e209d39SAndroid Build Coastguard Worker int32_t fIntervalLow; // {lower, upper} interval quantifier values. 198*0e209d39SAndroid Build Coastguard Worker int32_t fIntervalUpper; // Placed here temporarily, when pattern is 199*0e209d39SAndroid Build Coastguard Worker // initially scanned. Each new interval 200*0e209d39SAndroid Build Coastguard Worker // encountered overwrites these values. 201*0e209d39SAndroid Build Coastguard Worker // -1 for the upper interval value means none 202*0e209d39SAndroid Build Coastguard Worker // was specified (unlimited occurrences.) 203*0e209d39SAndroid Build Coastguard Worker 204*0e209d39SAndroid Build Coastguard Worker UStack fSetStack; // Stack of UnicodeSets, used while evaluating 205*0e209d39SAndroid Build Coastguard Worker // (at compile time) set expressions within 206*0e209d39SAndroid Build Coastguard Worker // the pattern. 207*0e209d39SAndroid Build Coastguard Worker UStack fSetOpStack; // Stack of pending set operators (&&, --, union) 208*0e209d39SAndroid Build Coastguard Worker 209*0e209d39SAndroid Build Coastguard Worker UChar32 fLastSetLiteral; // The last single code point added to a set. 210*0e209d39SAndroid Build Coastguard Worker // needed when "-y" is scanned, and we need 211*0e209d39SAndroid Build Coastguard Worker // to turn "x-y" into a range. 212*0e209d39SAndroid Build Coastguard Worker 213*0e209d39SAndroid Build Coastguard Worker UnicodeString *fCaptureName; // Named Capture, the group name is built up 214*0e209d39SAndroid Build Coastguard Worker // in this string while being scanned. 215*0e209d39SAndroid Build Coastguard Worker }; 216*0e209d39SAndroid Build Coastguard Worker 217*0e209d39SAndroid Build Coastguard Worker // Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions] 218*0e209d39SAndroid Build Coastguard Worker // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. 219*0e209d39SAndroid Build Coastguard Worker 220*0e209d39SAndroid Build Coastguard Worker enum SetOperations { 221*0e209d39SAndroid Build Coastguard Worker setStart = 0 << 16 | 1, 222*0e209d39SAndroid Build Coastguard Worker setEnd = 1 << 16 | 2, 223*0e209d39SAndroid Build Coastguard Worker setNegation = 2 << 16 | 3, 224*0e209d39SAndroid Build Coastguard Worker setCaseClose = 2 << 16 | 9, 225*0e209d39SAndroid Build Coastguard Worker setDifference2 = 3 << 16 | 4, // '--' set difference operator 226*0e209d39SAndroid Build Coastguard Worker setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator 227*0e209d39SAndroid Build Coastguard Worker setUnion = 4 << 16 | 6, // implicit union of adjacent items 228*0e209d39SAndroid Build Coastguard Worker setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. 229*0e209d39SAndroid Build Coastguard Worker setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. 230*0e209d39SAndroid Build Coastguard Worker }; 231*0e209d39SAndroid Build Coastguard Worker 232*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 233*0e209d39SAndroid Build Coastguard Worker #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 234*0e209d39SAndroid Build Coastguard Worker #endif // RBBISCAN_H 235