1*ccdc9c3eSSadaf Ebrahimi // Copyright 2006 The RE2 Authors. All Rights Reserved. 2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style 3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file. 4*ccdc9c3eSSadaf Ebrahimi 5*ccdc9c3eSSadaf Ebrahimi #ifndef RE2_REGEXP_H_ 6*ccdc9c3eSSadaf Ebrahimi #define RE2_REGEXP_H_ 7*ccdc9c3eSSadaf Ebrahimi 8*ccdc9c3eSSadaf Ebrahimi // --- SPONSORED LINK -------------------------------------------------- 9*ccdc9c3eSSadaf Ebrahimi // If you want to use this library for regular expression matching, 10*ccdc9c3eSSadaf Ebrahimi // you should use re2/re2.h, which provides a class RE2 that 11*ccdc9c3eSSadaf Ebrahimi // mimics the PCRE interface provided by PCRE's C++ wrappers. 12*ccdc9c3eSSadaf Ebrahimi // This header describes the low-level interface used to implement RE2 13*ccdc9c3eSSadaf Ebrahimi // and may change in backwards-incompatible ways from time to time. 14*ccdc9c3eSSadaf Ebrahimi // In contrast, RE2's interface will not. 15*ccdc9c3eSSadaf Ebrahimi // --------------------------------------------------------------------- 16*ccdc9c3eSSadaf Ebrahimi 17*ccdc9c3eSSadaf Ebrahimi // Regular expression library: parsing, execution, and manipulation 18*ccdc9c3eSSadaf Ebrahimi // of regular expressions. 19*ccdc9c3eSSadaf Ebrahimi // 20*ccdc9c3eSSadaf Ebrahimi // Any operation that traverses the Regexp structures should be written 21*ccdc9c3eSSadaf Ebrahimi // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested 22*ccdc9c3eSSadaf Ebrahimi // regular expressions such as x++++++++++++++++++++... might cause recursive 23*ccdc9c3eSSadaf Ebrahimi // traversals to overflow the stack. 24*ccdc9c3eSSadaf Ebrahimi // 25*ccdc9c3eSSadaf Ebrahimi // It is the caller's responsibility to provide appropriate mutual exclusion 26*ccdc9c3eSSadaf Ebrahimi // around manipulation of the regexps. RE2 does this. 27*ccdc9c3eSSadaf Ebrahimi // 28*ccdc9c3eSSadaf Ebrahimi // PARSING 29*ccdc9c3eSSadaf Ebrahimi // 30*ccdc9c3eSSadaf Ebrahimi // Regexp::Parse parses regular expressions encoded in UTF-8. 31*ccdc9c3eSSadaf Ebrahimi // The default syntax is POSIX extended regular expressions, 32*ccdc9c3eSSadaf Ebrahimi // with the following changes: 33*ccdc9c3eSSadaf Ebrahimi // 34*ccdc9c3eSSadaf Ebrahimi // 1. Backreferences (optional in POSIX EREs) are not supported. 35*ccdc9c3eSSadaf Ebrahimi // (Supporting them precludes the use of DFA-based 36*ccdc9c3eSSadaf Ebrahimi // matching engines.) 37*ccdc9c3eSSadaf Ebrahimi // 38*ccdc9c3eSSadaf Ebrahimi // 2. Collating elements and collation classes are not supported. 39*ccdc9c3eSSadaf Ebrahimi // (No one has needed or wanted them.) 40*ccdc9c3eSSadaf Ebrahimi // 41*ccdc9c3eSSadaf Ebrahimi // The exact syntax accepted can be modified by passing flags to 42*ccdc9c3eSSadaf Ebrahimi // Regexp::Parse. In particular, many of the basic Perl additions 43*ccdc9c3eSSadaf Ebrahimi // are available. The flags are documented below (search for LikePerl). 44*ccdc9c3eSSadaf Ebrahimi // 45*ccdc9c3eSSadaf Ebrahimi // If parsed with the flag Regexp::Latin1, both the regular expression 46*ccdc9c3eSSadaf Ebrahimi // and the input to the matching routines are assumed to be encoded in 47*ccdc9c3eSSadaf Ebrahimi // Latin-1, not UTF-8. 48*ccdc9c3eSSadaf Ebrahimi // 49*ccdc9c3eSSadaf Ebrahimi // EXECUTION 50*ccdc9c3eSSadaf Ebrahimi // 51*ccdc9c3eSSadaf Ebrahimi // Once Regexp has parsed a regular expression, it provides methods 52*ccdc9c3eSSadaf Ebrahimi // to search text using that regular expression. These methods are 53*ccdc9c3eSSadaf Ebrahimi // implemented via calling out to other regular expression libraries. 54*ccdc9c3eSSadaf Ebrahimi // (Let's call them the sublibraries.) 55*ccdc9c3eSSadaf Ebrahimi // 56*ccdc9c3eSSadaf Ebrahimi // To call a sublibrary, Regexp does not simply prepare a 57*ccdc9c3eSSadaf Ebrahimi // string version of the regular expression and hand it to the 58*ccdc9c3eSSadaf Ebrahimi // sublibrary. Instead, Regexp prepares, from its own parsed form, the 59*ccdc9c3eSSadaf Ebrahimi // corresponding internal representation used by the sublibrary. 60*ccdc9c3eSSadaf Ebrahimi // This has the drawback of needing to know the internal representation 61*ccdc9c3eSSadaf Ebrahimi // used by the sublibrary, but it has two important benefits: 62*ccdc9c3eSSadaf Ebrahimi // 63*ccdc9c3eSSadaf Ebrahimi // 1. The syntax and meaning of regular expressions is guaranteed 64*ccdc9c3eSSadaf Ebrahimi // to be that used by Regexp's parser, not the syntax expected 65*ccdc9c3eSSadaf Ebrahimi // by the sublibrary. Regexp might accept a restricted or 66*ccdc9c3eSSadaf Ebrahimi // expanded syntax for regular expressions as compared with 67*ccdc9c3eSSadaf Ebrahimi // the sublibrary. As long as Regexp can translate from its 68*ccdc9c3eSSadaf Ebrahimi // internal form into the sublibrary's, clients need not know 69*ccdc9c3eSSadaf Ebrahimi // exactly which sublibrary they are using. 70*ccdc9c3eSSadaf Ebrahimi // 71*ccdc9c3eSSadaf Ebrahimi // 2. The sublibrary parsers are bypassed. For whatever reason, 72*ccdc9c3eSSadaf Ebrahimi // sublibrary regular expression parsers often have security 73*ccdc9c3eSSadaf Ebrahimi // problems. For example, plan9grep's regular expression parser 74*ccdc9c3eSSadaf Ebrahimi // has a buffer overflow in its handling of large character 75*ccdc9c3eSSadaf Ebrahimi // classes, and PCRE's parser has had buffer overflow problems 76*ccdc9c3eSSadaf Ebrahimi // in the past. Security-team requires sandboxing of sublibrary 77*ccdc9c3eSSadaf Ebrahimi // regular expression parsers. Avoiding the sublibrary parsers 78*ccdc9c3eSSadaf Ebrahimi // avoids the sandbox. 79*ccdc9c3eSSadaf Ebrahimi // 80*ccdc9c3eSSadaf Ebrahimi // The execution methods we use now are provided by the compiled form, 81*ccdc9c3eSSadaf Ebrahimi // Prog, described in prog.h 82*ccdc9c3eSSadaf Ebrahimi // 83*ccdc9c3eSSadaf Ebrahimi // MANIPULATION 84*ccdc9c3eSSadaf Ebrahimi // 85*ccdc9c3eSSadaf Ebrahimi // Unlike other regular expression libraries, Regexp makes its parsed 86*ccdc9c3eSSadaf Ebrahimi // form accessible to clients, so that client code can analyze the 87*ccdc9c3eSSadaf Ebrahimi // parsed regular expressions. 88*ccdc9c3eSSadaf Ebrahimi 89*ccdc9c3eSSadaf Ebrahimi #include <stdint.h> 90*ccdc9c3eSSadaf Ebrahimi #include <map> 91*ccdc9c3eSSadaf Ebrahimi #include <set> 92*ccdc9c3eSSadaf Ebrahimi #include <string> 93*ccdc9c3eSSadaf Ebrahimi 94*ccdc9c3eSSadaf Ebrahimi #include "util/util.h" 95*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h" 96*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h" 97*ccdc9c3eSSadaf Ebrahimi #include "re2/stringpiece.h" 98*ccdc9c3eSSadaf Ebrahimi 99*ccdc9c3eSSadaf Ebrahimi namespace re2 { 100*ccdc9c3eSSadaf Ebrahimi 101*ccdc9c3eSSadaf Ebrahimi // Keep in sync with string list kOpcodeNames[] in testing/dump.cc 102*ccdc9c3eSSadaf Ebrahimi enum RegexpOp { 103*ccdc9c3eSSadaf Ebrahimi // Matches no strings. 104*ccdc9c3eSSadaf Ebrahimi kRegexpNoMatch = 1, 105*ccdc9c3eSSadaf Ebrahimi 106*ccdc9c3eSSadaf Ebrahimi // Matches empty string. 107*ccdc9c3eSSadaf Ebrahimi kRegexpEmptyMatch, 108*ccdc9c3eSSadaf Ebrahimi 109*ccdc9c3eSSadaf Ebrahimi // Matches rune_. 110*ccdc9c3eSSadaf Ebrahimi kRegexpLiteral, 111*ccdc9c3eSSadaf Ebrahimi 112*ccdc9c3eSSadaf Ebrahimi // Matches runes_. 113*ccdc9c3eSSadaf Ebrahimi kRegexpLiteralString, 114*ccdc9c3eSSadaf Ebrahimi 115*ccdc9c3eSSadaf Ebrahimi // Matches concatenation of sub_[0..nsub-1]. 116*ccdc9c3eSSadaf Ebrahimi kRegexpConcat, 117*ccdc9c3eSSadaf Ebrahimi // Matches union of sub_[0..nsub-1]. 118*ccdc9c3eSSadaf Ebrahimi kRegexpAlternate, 119*ccdc9c3eSSadaf Ebrahimi 120*ccdc9c3eSSadaf Ebrahimi // Matches sub_[0] zero or more times. 121*ccdc9c3eSSadaf Ebrahimi kRegexpStar, 122*ccdc9c3eSSadaf Ebrahimi // Matches sub_[0] one or more times. 123*ccdc9c3eSSadaf Ebrahimi kRegexpPlus, 124*ccdc9c3eSSadaf Ebrahimi // Matches sub_[0] zero or one times. 125*ccdc9c3eSSadaf Ebrahimi kRegexpQuest, 126*ccdc9c3eSSadaf Ebrahimi 127*ccdc9c3eSSadaf Ebrahimi // Matches sub_[0] at least min_ times, at most max_ times. 128*ccdc9c3eSSadaf Ebrahimi // max_ == -1 means no upper limit. 129*ccdc9c3eSSadaf Ebrahimi kRegexpRepeat, 130*ccdc9c3eSSadaf Ebrahimi 131*ccdc9c3eSSadaf Ebrahimi // Parenthesized (capturing) subexpression. Index is cap_. 132*ccdc9c3eSSadaf Ebrahimi // Optionally, capturing name is name_. 133*ccdc9c3eSSadaf Ebrahimi kRegexpCapture, 134*ccdc9c3eSSadaf Ebrahimi 135*ccdc9c3eSSadaf Ebrahimi // Matches any character. 136*ccdc9c3eSSadaf Ebrahimi kRegexpAnyChar, 137*ccdc9c3eSSadaf Ebrahimi 138*ccdc9c3eSSadaf Ebrahimi // Matches any byte [sic]. 139*ccdc9c3eSSadaf Ebrahimi kRegexpAnyByte, 140*ccdc9c3eSSadaf Ebrahimi 141*ccdc9c3eSSadaf Ebrahimi // Matches empty string at beginning of line. 142*ccdc9c3eSSadaf Ebrahimi kRegexpBeginLine, 143*ccdc9c3eSSadaf Ebrahimi // Matches empty string at end of line. 144*ccdc9c3eSSadaf Ebrahimi kRegexpEndLine, 145*ccdc9c3eSSadaf Ebrahimi 146*ccdc9c3eSSadaf Ebrahimi // Matches word boundary "\b". 147*ccdc9c3eSSadaf Ebrahimi kRegexpWordBoundary, 148*ccdc9c3eSSadaf Ebrahimi // Matches not-a-word boundary "\B". 149*ccdc9c3eSSadaf Ebrahimi kRegexpNoWordBoundary, 150*ccdc9c3eSSadaf Ebrahimi 151*ccdc9c3eSSadaf Ebrahimi // Matches empty string at beginning of text. 152*ccdc9c3eSSadaf Ebrahimi kRegexpBeginText, 153*ccdc9c3eSSadaf Ebrahimi // Matches empty string at end of text. 154*ccdc9c3eSSadaf Ebrahimi kRegexpEndText, 155*ccdc9c3eSSadaf Ebrahimi 156*ccdc9c3eSSadaf Ebrahimi // Matches character class given by cc_. 157*ccdc9c3eSSadaf Ebrahimi kRegexpCharClass, 158*ccdc9c3eSSadaf Ebrahimi 159*ccdc9c3eSSadaf Ebrahimi // Forces match of entire expression right now, 160*ccdc9c3eSSadaf Ebrahimi // with match ID match_id_ (used by RE2::Set). 161*ccdc9c3eSSadaf Ebrahimi kRegexpHaveMatch, 162*ccdc9c3eSSadaf Ebrahimi 163*ccdc9c3eSSadaf Ebrahimi kMaxRegexpOp = kRegexpHaveMatch, 164*ccdc9c3eSSadaf Ebrahimi }; 165*ccdc9c3eSSadaf Ebrahimi 166*ccdc9c3eSSadaf Ebrahimi // Keep in sync with string list in regexp.cc 167*ccdc9c3eSSadaf Ebrahimi enum RegexpStatusCode { 168*ccdc9c3eSSadaf Ebrahimi // No error 169*ccdc9c3eSSadaf Ebrahimi kRegexpSuccess = 0, 170*ccdc9c3eSSadaf Ebrahimi 171*ccdc9c3eSSadaf Ebrahimi // Unexpected error 172*ccdc9c3eSSadaf Ebrahimi kRegexpInternalError, 173*ccdc9c3eSSadaf Ebrahimi 174*ccdc9c3eSSadaf Ebrahimi // Parse errors 175*ccdc9c3eSSadaf Ebrahimi kRegexpBadEscape, // bad escape sequence 176*ccdc9c3eSSadaf Ebrahimi kRegexpBadCharClass, // bad character class 177*ccdc9c3eSSadaf Ebrahimi kRegexpBadCharRange, // bad character class range 178*ccdc9c3eSSadaf Ebrahimi kRegexpMissingBracket, // missing closing ] 179*ccdc9c3eSSadaf Ebrahimi kRegexpMissingParen, // missing closing ) 180*ccdc9c3eSSadaf Ebrahimi kRegexpTrailingBackslash, // at end of regexp 181*ccdc9c3eSSadaf Ebrahimi kRegexpRepeatArgument, // repeat argument missing, e.g. "*" 182*ccdc9c3eSSadaf Ebrahimi kRegexpRepeatSize, // bad repetition argument 183*ccdc9c3eSSadaf Ebrahimi kRegexpRepeatOp, // bad repetition operator 184*ccdc9c3eSSadaf Ebrahimi kRegexpBadPerlOp, // bad perl operator 185*ccdc9c3eSSadaf Ebrahimi kRegexpBadUTF8, // invalid UTF-8 in regexp 186*ccdc9c3eSSadaf Ebrahimi kRegexpBadNamedCapture, // bad named capture 187*ccdc9c3eSSadaf Ebrahimi }; 188*ccdc9c3eSSadaf Ebrahimi 189*ccdc9c3eSSadaf Ebrahimi // Error status for certain operations. 190*ccdc9c3eSSadaf Ebrahimi class RegexpStatus { 191*ccdc9c3eSSadaf Ebrahimi public: RegexpStatus()192*ccdc9c3eSSadaf Ebrahimi RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {} ~RegexpStatus()193*ccdc9c3eSSadaf Ebrahimi ~RegexpStatus() { delete tmp_; } 194*ccdc9c3eSSadaf Ebrahimi set_code(RegexpStatusCode code)195*ccdc9c3eSSadaf Ebrahimi void set_code(RegexpStatusCode code) { code_ = code; } set_error_arg(const StringPiece & error_arg)196*ccdc9c3eSSadaf Ebrahimi void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } set_tmp(string * tmp)197*ccdc9c3eSSadaf Ebrahimi void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; } code()198*ccdc9c3eSSadaf Ebrahimi RegexpStatusCode code() const { return code_; } error_arg()199*ccdc9c3eSSadaf Ebrahimi const StringPiece& error_arg() const { return error_arg_; } ok()200*ccdc9c3eSSadaf Ebrahimi bool ok() const { return code() == kRegexpSuccess; } 201*ccdc9c3eSSadaf Ebrahimi 202*ccdc9c3eSSadaf Ebrahimi // Copies state from status. 203*ccdc9c3eSSadaf Ebrahimi void Copy(const RegexpStatus& status); 204*ccdc9c3eSSadaf Ebrahimi 205*ccdc9c3eSSadaf Ebrahimi // Returns text equivalent of code, e.g.: 206*ccdc9c3eSSadaf Ebrahimi // "Bad character class" 207*ccdc9c3eSSadaf Ebrahimi static string CodeText(RegexpStatusCode code); 208*ccdc9c3eSSadaf Ebrahimi 209*ccdc9c3eSSadaf Ebrahimi // Returns text describing error, e.g.: 210*ccdc9c3eSSadaf Ebrahimi // "Bad character class: [z-a]" 211*ccdc9c3eSSadaf Ebrahimi string Text() const; 212*ccdc9c3eSSadaf Ebrahimi 213*ccdc9c3eSSadaf Ebrahimi private: 214*ccdc9c3eSSadaf Ebrahimi RegexpStatusCode code_; // Kind of error 215*ccdc9c3eSSadaf Ebrahimi StringPiece error_arg_; // Piece of regexp containing syntax error. 216*ccdc9c3eSSadaf Ebrahimi string* tmp_; // Temporary storage, possibly where error_arg_ is. 217*ccdc9c3eSSadaf Ebrahimi 218*ccdc9c3eSSadaf Ebrahimi RegexpStatus(const RegexpStatus&) = delete; 219*ccdc9c3eSSadaf Ebrahimi RegexpStatus& operator=(const RegexpStatus&) = delete; 220*ccdc9c3eSSadaf Ebrahimi }; 221*ccdc9c3eSSadaf Ebrahimi 222*ccdc9c3eSSadaf Ebrahimi // Compiled form; see prog.h 223*ccdc9c3eSSadaf Ebrahimi class Prog; 224*ccdc9c3eSSadaf Ebrahimi 225*ccdc9c3eSSadaf Ebrahimi struct RuneRange { RuneRangeRuneRange226*ccdc9c3eSSadaf Ebrahimi RuneRange() : lo(0), hi(0) { } RuneRangeRuneRange227*ccdc9c3eSSadaf Ebrahimi RuneRange(int l, int h) : lo(l), hi(h) { } 228*ccdc9c3eSSadaf Ebrahimi Rune lo; 229*ccdc9c3eSSadaf Ebrahimi Rune hi; 230*ccdc9c3eSSadaf Ebrahimi }; 231*ccdc9c3eSSadaf Ebrahimi 232*ccdc9c3eSSadaf Ebrahimi // Less-than on RuneRanges treats a == b if they overlap at all. 233*ccdc9c3eSSadaf Ebrahimi // This lets us look in a set to find the range covering a particular Rune. 234*ccdc9c3eSSadaf Ebrahimi struct RuneRangeLess { operatorRuneRangeLess235*ccdc9c3eSSadaf Ebrahimi bool operator()(const RuneRange& a, const RuneRange& b) const { 236*ccdc9c3eSSadaf Ebrahimi return a.hi < b.lo; 237*ccdc9c3eSSadaf Ebrahimi } 238*ccdc9c3eSSadaf Ebrahimi }; 239*ccdc9c3eSSadaf Ebrahimi 240*ccdc9c3eSSadaf Ebrahimi class CharClassBuilder; 241*ccdc9c3eSSadaf Ebrahimi 242*ccdc9c3eSSadaf Ebrahimi class CharClass { 243*ccdc9c3eSSadaf Ebrahimi public: 244*ccdc9c3eSSadaf Ebrahimi void Delete(); 245*ccdc9c3eSSadaf Ebrahimi 246*ccdc9c3eSSadaf Ebrahimi typedef RuneRange* iterator; begin()247*ccdc9c3eSSadaf Ebrahimi iterator begin() { return ranges_; } end()248*ccdc9c3eSSadaf Ebrahimi iterator end() { return ranges_ + nranges_; } 249*ccdc9c3eSSadaf Ebrahimi size()250*ccdc9c3eSSadaf Ebrahimi int size() { return nrunes_; } empty()251*ccdc9c3eSSadaf Ebrahimi bool empty() { return nrunes_ == 0; } full()252*ccdc9c3eSSadaf Ebrahimi bool full() { return nrunes_ == Runemax+1; } FoldsASCII()253*ccdc9c3eSSadaf Ebrahimi bool FoldsASCII() { return folds_ascii_; } 254*ccdc9c3eSSadaf Ebrahimi 255*ccdc9c3eSSadaf Ebrahimi bool Contains(Rune r); 256*ccdc9c3eSSadaf Ebrahimi CharClass* Negate(); 257*ccdc9c3eSSadaf Ebrahimi 258*ccdc9c3eSSadaf Ebrahimi private: 259*ccdc9c3eSSadaf Ebrahimi CharClass(); // not implemented 260*ccdc9c3eSSadaf Ebrahimi ~CharClass(); // not implemented 261*ccdc9c3eSSadaf Ebrahimi static CharClass* New(int maxranges); 262*ccdc9c3eSSadaf Ebrahimi 263*ccdc9c3eSSadaf Ebrahimi friend class CharClassBuilder; 264*ccdc9c3eSSadaf Ebrahimi 265*ccdc9c3eSSadaf Ebrahimi bool folds_ascii_; 266*ccdc9c3eSSadaf Ebrahimi int nrunes_; 267*ccdc9c3eSSadaf Ebrahimi RuneRange *ranges_; 268*ccdc9c3eSSadaf Ebrahimi int nranges_; 269*ccdc9c3eSSadaf Ebrahimi 270*ccdc9c3eSSadaf Ebrahimi CharClass(const CharClass&) = delete; 271*ccdc9c3eSSadaf Ebrahimi CharClass& operator=(const CharClass&) = delete; 272*ccdc9c3eSSadaf Ebrahimi }; 273*ccdc9c3eSSadaf Ebrahimi 274*ccdc9c3eSSadaf Ebrahimi class Regexp { 275*ccdc9c3eSSadaf Ebrahimi public: 276*ccdc9c3eSSadaf Ebrahimi 277*ccdc9c3eSSadaf Ebrahimi // Flags for parsing. Can be ORed together. 278*ccdc9c3eSSadaf Ebrahimi enum ParseFlags { 279*ccdc9c3eSSadaf Ebrahimi NoParseFlags = 0, 280*ccdc9c3eSSadaf Ebrahimi FoldCase = 1<<0, // Fold case during matching (case-insensitive). 281*ccdc9c3eSSadaf Ebrahimi Literal = 1<<1, // Treat s as literal string instead of a regexp. 282*ccdc9c3eSSadaf Ebrahimi ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s 283*ccdc9c3eSSadaf Ebrahimi // and [[:space:]] to match newline. 284*ccdc9c3eSSadaf Ebrahimi DotNL = 1<<3, // Allow . to match newline. 285*ccdc9c3eSSadaf Ebrahimi MatchNL = ClassNL | DotNL, 286*ccdc9c3eSSadaf Ebrahimi OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and 287*ccdc9c3eSSadaf Ebrahimi // end of text, not around embedded newlines. 288*ccdc9c3eSSadaf Ebrahimi // (Perl's default) 289*ccdc9c3eSSadaf Ebrahimi Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8. 290*ccdc9c3eSSadaf Ebrahimi NonGreedy = 1<<6, // Repetition operators are non-greedy by default. 291*ccdc9c3eSSadaf Ebrahimi PerlClasses = 1<<7, // Allow Perl character classes like \d. 292*ccdc9c3eSSadaf Ebrahimi PerlB = 1<<8, // Allow Perl's \b and \B. 293*ccdc9c3eSSadaf Ebrahimi PerlX = 1<<9, // Perl extensions: 294*ccdc9c3eSSadaf Ebrahimi // non-capturing parens - (?: ) 295*ccdc9c3eSSadaf Ebrahimi // non-greedy operators - *? +? ?? {}? 296*ccdc9c3eSSadaf Ebrahimi // flag edits - (?i) (?-i) (?i: ) 297*ccdc9c3eSSadaf Ebrahimi // i - FoldCase 298*ccdc9c3eSSadaf Ebrahimi // m - !OneLine 299*ccdc9c3eSSadaf Ebrahimi // s - DotNL 300*ccdc9c3eSSadaf Ebrahimi // U - NonGreedy 301*ccdc9c3eSSadaf Ebrahimi // line ends: \A \z 302*ccdc9c3eSSadaf Ebrahimi // \Q and \E to disable/enable metacharacters 303*ccdc9c3eSSadaf Ebrahimi // (?P<name>expr) for named captures 304*ccdc9c3eSSadaf Ebrahimi // \C to match any single byte 305*ccdc9c3eSSadaf Ebrahimi UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group 306*ccdc9c3eSSadaf Ebrahimi // and \P{Han} for its negation. 307*ccdc9c3eSSadaf Ebrahimi NeverNL = 1<<11, // Never match NL, even if the regexp mentions 308*ccdc9c3eSSadaf Ebrahimi // it explicitly. 309*ccdc9c3eSSadaf Ebrahimi NeverCapture = 1<<12, // Parse all parens as non-capturing. 310*ccdc9c3eSSadaf Ebrahimi 311*ccdc9c3eSSadaf Ebrahimi // As close to Perl as we can get. 312*ccdc9c3eSSadaf Ebrahimi LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | 313*ccdc9c3eSSadaf Ebrahimi UnicodeGroups, 314*ccdc9c3eSSadaf Ebrahimi 315*ccdc9c3eSSadaf Ebrahimi // Internal use only. 316*ccdc9c3eSSadaf Ebrahimi WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text 317*ccdc9c3eSSadaf Ebrahimi AllParseFlags = (1<<14)-1, 318*ccdc9c3eSSadaf Ebrahimi }; 319*ccdc9c3eSSadaf Ebrahimi 320*ccdc9c3eSSadaf Ebrahimi // Get. No set, Regexps are logically immutable once created. op()321*ccdc9c3eSSadaf Ebrahimi RegexpOp op() { return static_cast<RegexpOp>(op_); } nsub()322*ccdc9c3eSSadaf Ebrahimi int nsub() { return nsub_; } simple()323*ccdc9c3eSSadaf Ebrahimi bool simple() { return simple_ != 0; } parse_flags()324*ccdc9c3eSSadaf Ebrahimi ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); } 325*ccdc9c3eSSadaf Ebrahimi int Ref(); // For testing. 326*ccdc9c3eSSadaf Ebrahimi sub()327*ccdc9c3eSSadaf Ebrahimi Regexp** sub() { 328*ccdc9c3eSSadaf Ebrahimi if(nsub_ <= 1) 329*ccdc9c3eSSadaf Ebrahimi return &subone_; 330*ccdc9c3eSSadaf Ebrahimi else 331*ccdc9c3eSSadaf Ebrahimi return submany_; 332*ccdc9c3eSSadaf Ebrahimi } 333*ccdc9c3eSSadaf Ebrahimi min()334*ccdc9c3eSSadaf Ebrahimi int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; } max()335*ccdc9c3eSSadaf Ebrahimi int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; } rune()336*ccdc9c3eSSadaf Ebrahimi Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; } cc()337*ccdc9c3eSSadaf Ebrahimi CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; } cap()338*ccdc9c3eSSadaf Ebrahimi int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; } name()339*ccdc9c3eSSadaf Ebrahimi const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; } runes()340*ccdc9c3eSSadaf Ebrahimi Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; } nrunes()341*ccdc9c3eSSadaf Ebrahimi int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; } match_id()342*ccdc9c3eSSadaf Ebrahimi int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; } 343*ccdc9c3eSSadaf Ebrahimi 344*ccdc9c3eSSadaf Ebrahimi // Increments reference count, returns object as convenience. 345*ccdc9c3eSSadaf Ebrahimi Regexp* Incref(); 346*ccdc9c3eSSadaf Ebrahimi 347*ccdc9c3eSSadaf Ebrahimi // Decrements reference count and deletes this object if count reaches 0. 348*ccdc9c3eSSadaf Ebrahimi void Decref(); 349*ccdc9c3eSSadaf Ebrahimi 350*ccdc9c3eSSadaf Ebrahimi // Parses string s to produce regular expression, returned. 351*ccdc9c3eSSadaf Ebrahimi // Caller must release return value with re->Decref(). 352*ccdc9c3eSSadaf Ebrahimi // On failure, sets *status (if status != NULL) and returns NULL. 353*ccdc9c3eSSadaf Ebrahimi static Regexp* Parse(const StringPiece& s, ParseFlags flags, 354*ccdc9c3eSSadaf Ebrahimi RegexpStatus* status); 355*ccdc9c3eSSadaf Ebrahimi 356*ccdc9c3eSSadaf Ebrahimi // Returns a _new_ simplified version of the current regexp. 357*ccdc9c3eSSadaf Ebrahimi // Does not edit the current regexp. 358*ccdc9c3eSSadaf Ebrahimi // Caller must release return value with re->Decref(). 359*ccdc9c3eSSadaf Ebrahimi // Simplified means that counted repetition has been rewritten 360*ccdc9c3eSSadaf Ebrahimi // into simpler terms and all Perl/POSIX features have been 361*ccdc9c3eSSadaf Ebrahimi // removed. The result will capture exactly the same 362*ccdc9c3eSSadaf Ebrahimi // subexpressions the original did, unless formatted with ToString. 363*ccdc9c3eSSadaf Ebrahimi Regexp* Simplify(); 364*ccdc9c3eSSadaf Ebrahimi friend class CoalesceWalker; 365*ccdc9c3eSSadaf Ebrahimi friend class SimplifyWalker; 366*ccdc9c3eSSadaf Ebrahimi 367*ccdc9c3eSSadaf Ebrahimi // Parses the regexp src and then simplifies it and sets *dst to the 368*ccdc9c3eSSadaf Ebrahimi // string representation of the simplified form. Returns true on success. 369*ccdc9c3eSSadaf Ebrahimi // Returns false and sets *status (if status != NULL) on parse error. 370*ccdc9c3eSSadaf Ebrahimi static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags, 371*ccdc9c3eSSadaf Ebrahimi string* dst, 372*ccdc9c3eSSadaf Ebrahimi RegexpStatus* status); 373*ccdc9c3eSSadaf Ebrahimi 374*ccdc9c3eSSadaf Ebrahimi // Returns the number of capturing groups in the regexp. 375*ccdc9c3eSSadaf Ebrahimi int NumCaptures(); 376*ccdc9c3eSSadaf Ebrahimi friend class NumCapturesWalker; 377*ccdc9c3eSSadaf Ebrahimi 378*ccdc9c3eSSadaf Ebrahimi // Returns a map from names to capturing group indices, 379*ccdc9c3eSSadaf Ebrahimi // or NULL if the regexp contains no named capture groups. 380*ccdc9c3eSSadaf Ebrahimi // The caller is responsible for deleting the map. 381*ccdc9c3eSSadaf Ebrahimi std::map<string, int>* NamedCaptures(); 382*ccdc9c3eSSadaf Ebrahimi 383*ccdc9c3eSSadaf Ebrahimi // Returns a map from capturing group indices to capturing group 384*ccdc9c3eSSadaf Ebrahimi // names or NULL if the regexp contains no named capture groups. The 385*ccdc9c3eSSadaf Ebrahimi // caller is responsible for deleting the map. 386*ccdc9c3eSSadaf Ebrahimi std::map<int, string>* CaptureNames(); 387*ccdc9c3eSSadaf Ebrahimi 388*ccdc9c3eSSadaf Ebrahimi // Returns a string representation of the current regexp, 389*ccdc9c3eSSadaf Ebrahimi // using as few parentheses as possible. 390*ccdc9c3eSSadaf Ebrahimi string ToString(); 391*ccdc9c3eSSadaf Ebrahimi 392*ccdc9c3eSSadaf Ebrahimi // Convenience functions. They consume the passed reference, 393*ccdc9c3eSSadaf Ebrahimi // so in many cases you should use, e.g., Plus(re->Incref(), flags). 394*ccdc9c3eSSadaf Ebrahimi // They do not consume allocated arrays like subs or runes. 395*ccdc9c3eSSadaf Ebrahimi static Regexp* Plus(Regexp* sub, ParseFlags flags); 396*ccdc9c3eSSadaf Ebrahimi static Regexp* Star(Regexp* sub, ParseFlags flags); 397*ccdc9c3eSSadaf Ebrahimi static Regexp* Quest(Regexp* sub, ParseFlags flags); 398*ccdc9c3eSSadaf Ebrahimi static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags); 399*ccdc9c3eSSadaf Ebrahimi static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags); 400*ccdc9c3eSSadaf Ebrahimi static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap); 401*ccdc9c3eSSadaf Ebrahimi static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max); 402*ccdc9c3eSSadaf Ebrahimi static Regexp* NewLiteral(Rune rune, ParseFlags flags); 403*ccdc9c3eSSadaf Ebrahimi static Regexp* NewCharClass(CharClass* cc, ParseFlags flags); 404*ccdc9c3eSSadaf Ebrahimi static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags); 405*ccdc9c3eSSadaf Ebrahimi static Regexp* HaveMatch(int match_id, ParseFlags flags); 406*ccdc9c3eSSadaf Ebrahimi 407*ccdc9c3eSSadaf Ebrahimi // Like Alternate but does not factor out common prefixes. 408*ccdc9c3eSSadaf Ebrahimi static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags); 409*ccdc9c3eSSadaf Ebrahimi 410*ccdc9c3eSSadaf Ebrahimi // Debugging function. Returns string format for regexp 411*ccdc9c3eSSadaf Ebrahimi // that makes structure clear. Does NOT use regexp syntax. 412*ccdc9c3eSSadaf Ebrahimi string Dump(); 413*ccdc9c3eSSadaf Ebrahimi 414*ccdc9c3eSSadaf Ebrahimi // Helper traversal class, defined fully in walker-inl.h. 415*ccdc9c3eSSadaf Ebrahimi template<typename T> class Walker; 416*ccdc9c3eSSadaf Ebrahimi 417*ccdc9c3eSSadaf Ebrahimi // Compile to Prog. See prog.h 418*ccdc9c3eSSadaf Ebrahimi // Reverse prog expects to be run over text backward. 419*ccdc9c3eSSadaf Ebrahimi // Construction and execution of prog will 420*ccdc9c3eSSadaf Ebrahimi // stay within approximately max_mem bytes of memory. 421*ccdc9c3eSSadaf Ebrahimi // If max_mem <= 0, a reasonable default is used. 422*ccdc9c3eSSadaf Ebrahimi Prog* CompileToProg(int64_t max_mem); 423*ccdc9c3eSSadaf Ebrahimi Prog* CompileToReverseProg(int64_t max_mem); 424*ccdc9c3eSSadaf Ebrahimi 425*ccdc9c3eSSadaf Ebrahimi // Whether to expect this library to find exactly the same answer as PCRE 426*ccdc9c3eSSadaf Ebrahimi // when running this regexp. Most regexps do mimic PCRE exactly, but a few 427*ccdc9c3eSSadaf Ebrahimi // obscure cases behave differently. Technically this is more a property 428*ccdc9c3eSSadaf Ebrahimi // of the Prog than the Regexp, but the computation is much easier to do 429*ccdc9c3eSSadaf Ebrahimi // on the Regexp. See mimics_pcre.cc for the exact conditions. 430*ccdc9c3eSSadaf Ebrahimi bool MimicsPCRE(); 431*ccdc9c3eSSadaf Ebrahimi 432*ccdc9c3eSSadaf Ebrahimi // Benchmarking function. 433*ccdc9c3eSSadaf Ebrahimi void NullWalk(); 434*ccdc9c3eSSadaf Ebrahimi 435*ccdc9c3eSSadaf Ebrahimi // Whether every match of this regexp must be anchored and 436*ccdc9c3eSSadaf Ebrahimi // begin with a non-empty fixed string (perhaps after ASCII 437*ccdc9c3eSSadaf Ebrahimi // case-folding). If so, returns the prefix and the sub-regexp that 438*ccdc9c3eSSadaf Ebrahimi // follows it. 439*ccdc9c3eSSadaf Ebrahimi // Callers should expect *prefix, *foldcase and *suffix to be "zeroed" 440*ccdc9c3eSSadaf Ebrahimi // regardless of the return value. 441*ccdc9c3eSSadaf Ebrahimi bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix); 442*ccdc9c3eSSadaf Ebrahimi 443*ccdc9c3eSSadaf Ebrahimi private: 444*ccdc9c3eSSadaf Ebrahimi // Constructor allocates vectors as appropriate for operator. 445*ccdc9c3eSSadaf Ebrahimi explicit Regexp(RegexpOp op, ParseFlags parse_flags); 446*ccdc9c3eSSadaf Ebrahimi 447*ccdc9c3eSSadaf Ebrahimi // Use Decref() instead of delete to release Regexps. 448*ccdc9c3eSSadaf Ebrahimi // This is private to catch deletes at compile time. 449*ccdc9c3eSSadaf Ebrahimi ~Regexp(); 450*ccdc9c3eSSadaf Ebrahimi void Destroy(); 451*ccdc9c3eSSadaf Ebrahimi bool QuickDestroy(); 452*ccdc9c3eSSadaf Ebrahimi 453*ccdc9c3eSSadaf Ebrahimi // Helpers for Parse. Listed here so they can edit Regexps. 454*ccdc9c3eSSadaf Ebrahimi class ParseState; 455*ccdc9c3eSSadaf Ebrahimi 456*ccdc9c3eSSadaf Ebrahimi friend class ParseState; 457*ccdc9c3eSSadaf Ebrahimi friend bool ParseCharClass(StringPiece* s, Regexp** out_re, 458*ccdc9c3eSSadaf Ebrahimi RegexpStatus* status); 459*ccdc9c3eSSadaf Ebrahimi 460*ccdc9c3eSSadaf Ebrahimi // Helper for testing [sic]. 461*ccdc9c3eSSadaf Ebrahimi friend bool RegexpEqualTestingOnly(Regexp*, Regexp*); 462*ccdc9c3eSSadaf Ebrahimi 463*ccdc9c3eSSadaf Ebrahimi // Computes whether Regexp is already simple. 464*ccdc9c3eSSadaf Ebrahimi bool ComputeSimple(); 465*ccdc9c3eSSadaf Ebrahimi 466*ccdc9c3eSSadaf Ebrahimi // Constructor that generates a Star, Plus or Quest, 467*ccdc9c3eSSadaf Ebrahimi // squashing the pair if sub is also a Star, Plus or Quest. 468*ccdc9c3eSSadaf Ebrahimi static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags); 469*ccdc9c3eSSadaf Ebrahimi 470*ccdc9c3eSSadaf Ebrahimi // Constructor that generates a concatenation or alternation, 471*ccdc9c3eSSadaf Ebrahimi // enforcing the limit on the number of subexpressions for 472*ccdc9c3eSSadaf Ebrahimi // a particular Regexp. 473*ccdc9c3eSSadaf Ebrahimi static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, 474*ccdc9c3eSSadaf Ebrahimi ParseFlags flags, bool can_factor); 475*ccdc9c3eSSadaf Ebrahimi 476*ccdc9c3eSSadaf Ebrahimi // Returns the leading string that re starts with. 477*ccdc9c3eSSadaf Ebrahimi // The returned Rune* points into a piece of re, 478*ccdc9c3eSSadaf Ebrahimi // so it must not be used after the caller calls re->Decref(). 479*ccdc9c3eSSadaf Ebrahimi static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags); 480*ccdc9c3eSSadaf Ebrahimi 481*ccdc9c3eSSadaf Ebrahimi // Removes the first n leading runes from the beginning of re. 482*ccdc9c3eSSadaf Ebrahimi // Edits re in place. 483*ccdc9c3eSSadaf Ebrahimi static void RemoveLeadingString(Regexp* re, int n); 484*ccdc9c3eSSadaf Ebrahimi 485*ccdc9c3eSSadaf Ebrahimi // Returns the leading regexp in re's top-level concatenation. 486*ccdc9c3eSSadaf Ebrahimi // The returned Regexp* points at re or a sub-expression of re, 487*ccdc9c3eSSadaf Ebrahimi // so it must not be used after the caller calls re->Decref(). 488*ccdc9c3eSSadaf Ebrahimi static Regexp* LeadingRegexp(Regexp* re); 489*ccdc9c3eSSadaf Ebrahimi 490*ccdc9c3eSSadaf Ebrahimi // Removes LeadingRegexp(re) from re and returns the remainder. 491*ccdc9c3eSSadaf Ebrahimi // Might edit re in place. 492*ccdc9c3eSSadaf Ebrahimi static Regexp* RemoveLeadingRegexp(Regexp* re); 493*ccdc9c3eSSadaf Ebrahimi 494*ccdc9c3eSSadaf Ebrahimi // Simplifies an alternation of literal strings by factoring out 495*ccdc9c3eSSadaf Ebrahimi // common prefixes. 496*ccdc9c3eSSadaf Ebrahimi static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); 497*ccdc9c3eSSadaf Ebrahimi friend class FactorAlternationImpl; 498*ccdc9c3eSSadaf Ebrahimi 499*ccdc9c3eSSadaf Ebrahimi // Is a == b? Only efficient on regexps that have not been through 500*ccdc9c3eSSadaf Ebrahimi // Simplify yet - the expansion of a kRegexpRepeat will make this 501*ccdc9c3eSSadaf Ebrahimi // take a long time. Do not call on such regexps, hence private. 502*ccdc9c3eSSadaf Ebrahimi static bool Equal(Regexp* a, Regexp* b); 503*ccdc9c3eSSadaf Ebrahimi 504*ccdc9c3eSSadaf Ebrahimi // Allocate space for n sub-regexps. AllocSub(int n)505*ccdc9c3eSSadaf Ebrahimi void AllocSub(int n) { 506*ccdc9c3eSSadaf Ebrahimi DCHECK(n >= 0 && static_cast<uint16_t>(n) == n); 507*ccdc9c3eSSadaf Ebrahimi if (n > 1) 508*ccdc9c3eSSadaf Ebrahimi submany_ = new Regexp*[n]; 509*ccdc9c3eSSadaf Ebrahimi nsub_ = static_cast<uint16_t>(n); 510*ccdc9c3eSSadaf Ebrahimi } 511*ccdc9c3eSSadaf Ebrahimi 512*ccdc9c3eSSadaf Ebrahimi // Add Rune to LiteralString 513*ccdc9c3eSSadaf Ebrahimi void AddRuneToString(Rune r); 514*ccdc9c3eSSadaf Ebrahimi 515*ccdc9c3eSSadaf Ebrahimi // Swaps this with that, in place. 516*ccdc9c3eSSadaf Ebrahimi void Swap(Regexp *that); 517*ccdc9c3eSSadaf Ebrahimi 518*ccdc9c3eSSadaf Ebrahimi // Operator. See description of operators above. 519*ccdc9c3eSSadaf Ebrahimi // uint8_t instead of RegexpOp to control space usage. 520*ccdc9c3eSSadaf Ebrahimi uint8_t op_; 521*ccdc9c3eSSadaf Ebrahimi 522*ccdc9c3eSSadaf Ebrahimi // Is this regexp structure already simple 523*ccdc9c3eSSadaf Ebrahimi // (has it been returned by Simplify)? 524*ccdc9c3eSSadaf Ebrahimi // uint8_t instead of bool to control space usage. 525*ccdc9c3eSSadaf Ebrahimi uint8_t simple_; 526*ccdc9c3eSSadaf Ebrahimi 527*ccdc9c3eSSadaf Ebrahimi // Flags saved from parsing and used during execution. 528*ccdc9c3eSSadaf Ebrahimi // (Only FoldCase is used.) 529*ccdc9c3eSSadaf Ebrahimi // uint16_t instead of ParseFlags to control space usage. 530*ccdc9c3eSSadaf Ebrahimi uint16_t parse_flags_; 531*ccdc9c3eSSadaf Ebrahimi 532*ccdc9c3eSSadaf Ebrahimi // Reference count. Exists so that SimplifyRegexp can build 533*ccdc9c3eSSadaf Ebrahimi // regexp structures that are dags rather than trees to avoid 534*ccdc9c3eSSadaf Ebrahimi // exponential blowup in space requirements. 535*ccdc9c3eSSadaf Ebrahimi // uint16_t to control space usage. 536*ccdc9c3eSSadaf Ebrahimi // The standard regexp routines will never generate a 537*ccdc9c3eSSadaf Ebrahimi // ref greater than the maximum repeat count (kMaxRepeat), 538*ccdc9c3eSSadaf Ebrahimi // but even so, Incref and Decref consult an overflow map 539*ccdc9c3eSSadaf Ebrahimi // when ref_ reaches kMaxRef. 540*ccdc9c3eSSadaf Ebrahimi uint16_t ref_; 541*ccdc9c3eSSadaf Ebrahimi static const uint16_t kMaxRef = 0xffff; 542*ccdc9c3eSSadaf Ebrahimi 543*ccdc9c3eSSadaf Ebrahimi // Subexpressions. 544*ccdc9c3eSSadaf Ebrahimi // uint16_t to control space usage. 545*ccdc9c3eSSadaf Ebrahimi // Concat and Alternate handle larger numbers of subexpressions 546*ccdc9c3eSSadaf Ebrahimi // by building concatenation or alternation trees. 547*ccdc9c3eSSadaf Ebrahimi // Other routines should call Concat or Alternate instead of 548*ccdc9c3eSSadaf Ebrahimi // filling in sub() by hand. 549*ccdc9c3eSSadaf Ebrahimi uint16_t nsub_; 550*ccdc9c3eSSadaf Ebrahimi static const uint16_t kMaxNsub = 0xffff; 551*ccdc9c3eSSadaf Ebrahimi union { 552*ccdc9c3eSSadaf Ebrahimi Regexp** submany_; // if nsub_ > 1 553*ccdc9c3eSSadaf Ebrahimi Regexp* subone_; // if nsub_ == 1 554*ccdc9c3eSSadaf Ebrahimi }; 555*ccdc9c3eSSadaf Ebrahimi 556*ccdc9c3eSSadaf Ebrahimi // Extra space for parse and teardown stacks. 557*ccdc9c3eSSadaf Ebrahimi Regexp* down_; 558*ccdc9c3eSSadaf Ebrahimi 559*ccdc9c3eSSadaf Ebrahimi // Arguments to operator. See description of operators above. 560*ccdc9c3eSSadaf Ebrahimi union { 561*ccdc9c3eSSadaf Ebrahimi struct { // Repeat 562*ccdc9c3eSSadaf Ebrahimi int max_; 563*ccdc9c3eSSadaf Ebrahimi int min_; 564*ccdc9c3eSSadaf Ebrahimi }; 565*ccdc9c3eSSadaf Ebrahimi struct { // Capture 566*ccdc9c3eSSadaf Ebrahimi int cap_; 567*ccdc9c3eSSadaf Ebrahimi string* name_; 568*ccdc9c3eSSadaf Ebrahimi }; 569*ccdc9c3eSSadaf Ebrahimi struct { // LiteralString 570*ccdc9c3eSSadaf Ebrahimi int nrunes_; 571*ccdc9c3eSSadaf Ebrahimi Rune* runes_; 572*ccdc9c3eSSadaf Ebrahimi }; 573*ccdc9c3eSSadaf Ebrahimi struct { // CharClass 574*ccdc9c3eSSadaf Ebrahimi // These two could be in separate union members, 575*ccdc9c3eSSadaf Ebrahimi // but it wouldn't save any space (there are other two-word structs) 576*ccdc9c3eSSadaf Ebrahimi // and keeping them separate avoids confusion during parsing. 577*ccdc9c3eSSadaf Ebrahimi CharClass* cc_; 578*ccdc9c3eSSadaf Ebrahimi CharClassBuilder* ccb_; 579*ccdc9c3eSSadaf Ebrahimi }; 580*ccdc9c3eSSadaf Ebrahimi Rune rune_; // Literal 581*ccdc9c3eSSadaf Ebrahimi int match_id_; // HaveMatch 582*ccdc9c3eSSadaf Ebrahimi void *the_union_[2]; // as big as any other element, for memset 583*ccdc9c3eSSadaf Ebrahimi }; 584*ccdc9c3eSSadaf Ebrahimi 585*ccdc9c3eSSadaf Ebrahimi Regexp(const Regexp&) = delete; 586*ccdc9c3eSSadaf Ebrahimi Regexp& operator=(const Regexp&) = delete; 587*ccdc9c3eSSadaf Ebrahimi }; 588*ccdc9c3eSSadaf Ebrahimi 589*ccdc9c3eSSadaf Ebrahimi // Character class set: contains non-overlapping, non-abutting RuneRanges. 590*ccdc9c3eSSadaf Ebrahimi typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet; 591*ccdc9c3eSSadaf Ebrahimi 592*ccdc9c3eSSadaf Ebrahimi class CharClassBuilder { 593*ccdc9c3eSSadaf Ebrahimi public: 594*ccdc9c3eSSadaf Ebrahimi CharClassBuilder(); 595*ccdc9c3eSSadaf Ebrahimi 596*ccdc9c3eSSadaf Ebrahimi typedef RuneRangeSet::iterator iterator; begin()597*ccdc9c3eSSadaf Ebrahimi iterator begin() { return ranges_.begin(); } end()598*ccdc9c3eSSadaf Ebrahimi iterator end() { return ranges_.end(); } 599*ccdc9c3eSSadaf Ebrahimi size()600*ccdc9c3eSSadaf Ebrahimi int size() { return nrunes_; } empty()601*ccdc9c3eSSadaf Ebrahimi bool empty() { return nrunes_ == 0; } full()602*ccdc9c3eSSadaf Ebrahimi bool full() { return nrunes_ == Runemax+1; } 603*ccdc9c3eSSadaf Ebrahimi 604*ccdc9c3eSSadaf Ebrahimi bool Contains(Rune r); 605*ccdc9c3eSSadaf Ebrahimi bool FoldsASCII(); 606*ccdc9c3eSSadaf Ebrahimi bool AddRange(Rune lo, Rune hi); // returns whether class changed 607*ccdc9c3eSSadaf Ebrahimi CharClassBuilder* Copy(); 608*ccdc9c3eSSadaf Ebrahimi void AddCharClass(CharClassBuilder* cc); 609*ccdc9c3eSSadaf Ebrahimi void Negate(); 610*ccdc9c3eSSadaf Ebrahimi void RemoveAbove(Rune r); 611*ccdc9c3eSSadaf Ebrahimi CharClass* GetCharClass(); 612*ccdc9c3eSSadaf Ebrahimi void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags); 613*ccdc9c3eSSadaf Ebrahimi 614*ccdc9c3eSSadaf Ebrahimi private: 615*ccdc9c3eSSadaf Ebrahimi static const uint32_t AlphaMask = (1<<26) - 1; 616*ccdc9c3eSSadaf Ebrahimi uint32_t upper_; // bitmap of A-Z 617*ccdc9c3eSSadaf Ebrahimi uint32_t lower_; // bitmap of a-z 618*ccdc9c3eSSadaf Ebrahimi int nrunes_; 619*ccdc9c3eSSadaf Ebrahimi RuneRangeSet ranges_; 620*ccdc9c3eSSadaf Ebrahimi 621*ccdc9c3eSSadaf Ebrahimi CharClassBuilder(const CharClassBuilder&) = delete; 622*ccdc9c3eSSadaf Ebrahimi CharClassBuilder& operator=(const CharClassBuilder&) = delete; 623*ccdc9c3eSSadaf Ebrahimi }; 624*ccdc9c3eSSadaf Ebrahimi 625*ccdc9c3eSSadaf Ebrahimi // Bitwise ops on ParseFlags produce ParseFlags. 626*ccdc9c3eSSadaf Ebrahimi inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, 627*ccdc9c3eSSadaf Ebrahimi Regexp::ParseFlags b) { 628*ccdc9c3eSSadaf Ebrahimi return static_cast<Regexp::ParseFlags>( 629*ccdc9c3eSSadaf Ebrahimi static_cast<int>(a) | static_cast<int>(b)); 630*ccdc9c3eSSadaf Ebrahimi } 631*ccdc9c3eSSadaf Ebrahimi 632*ccdc9c3eSSadaf Ebrahimi inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, 633*ccdc9c3eSSadaf Ebrahimi Regexp::ParseFlags b) { 634*ccdc9c3eSSadaf Ebrahimi return static_cast<Regexp::ParseFlags>( 635*ccdc9c3eSSadaf Ebrahimi static_cast<int>(a) ^ static_cast<int>(b)); 636*ccdc9c3eSSadaf Ebrahimi } 637*ccdc9c3eSSadaf Ebrahimi 638*ccdc9c3eSSadaf Ebrahimi inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, 639*ccdc9c3eSSadaf Ebrahimi Regexp::ParseFlags b) { 640*ccdc9c3eSSadaf Ebrahimi return static_cast<Regexp::ParseFlags>( 641*ccdc9c3eSSadaf Ebrahimi static_cast<int>(a) & static_cast<int>(b)); 642*ccdc9c3eSSadaf Ebrahimi } 643*ccdc9c3eSSadaf Ebrahimi 644*ccdc9c3eSSadaf Ebrahimi inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) { 645*ccdc9c3eSSadaf Ebrahimi // Attempting to produce a value out of enum's range has undefined behaviour. 646*ccdc9c3eSSadaf Ebrahimi return static_cast<Regexp::ParseFlags>( 647*ccdc9c3eSSadaf Ebrahimi ~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags)); 648*ccdc9c3eSSadaf Ebrahimi } 649*ccdc9c3eSSadaf Ebrahimi 650*ccdc9c3eSSadaf Ebrahimi } // namespace re2 651*ccdc9c3eSSadaf Ebrahimi 652*ccdc9c3eSSadaf Ebrahimi #endif // RE2_REGEXP_H_ 653