xref: /aosp_15_r20/external/regex-re2/re2/regexp.h (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi #ifndef RE2_REGEXP_H_
6*ccdc9c3eSSadaf Ebrahimi #define RE2_REGEXP_H_
7*ccdc9c3eSSadaf Ebrahimi 
8*ccdc9c3eSSadaf Ebrahimi // --- SPONSORED LINK --------------------------------------------------
9*ccdc9c3eSSadaf Ebrahimi // If you want to use this library for regular expression matching,
10*ccdc9c3eSSadaf Ebrahimi // you should use re2/re2.h, which provides a class RE2 that
11*ccdc9c3eSSadaf Ebrahimi // mimics the PCRE interface provided by PCRE's C++ wrappers.
12*ccdc9c3eSSadaf Ebrahimi // This header describes the low-level interface used to implement RE2
13*ccdc9c3eSSadaf Ebrahimi // and may change in backwards-incompatible ways from time to time.
14*ccdc9c3eSSadaf Ebrahimi // In contrast, RE2's interface will not.
15*ccdc9c3eSSadaf Ebrahimi // ---------------------------------------------------------------------
16*ccdc9c3eSSadaf Ebrahimi 
17*ccdc9c3eSSadaf Ebrahimi // Regular expression library: parsing, execution, and manipulation
18*ccdc9c3eSSadaf Ebrahimi // of regular expressions.
19*ccdc9c3eSSadaf Ebrahimi //
20*ccdc9c3eSSadaf Ebrahimi // Any operation that traverses the Regexp structures should be written
21*ccdc9c3eSSadaf Ebrahimi // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
22*ccdc9c3eSSadaf Ebrahimi // regular expressions such as x++++++++++++++++++++... might cause recursive
23*ccdc9c3eSSadaf Ebrahimi // traversals to overflow the stack.
24*ccdc9c3eSSadaf Ebrahimi //
25*ccdc9c3eSSadaf Ebrahimi // It is the caller's responsibility to provide appropriate mutual exclusion
26*ccdc9c3eSSadaf Ebrahimi // around manipulation of the regexps.  RE2 does this.
27*ccdc9c3eSSadaf Ebrahimi //
28*ccdc9c3eSSadaf Ebrahimi // PARSING
29*ccdc9c3eSSadaf Ebrahimi //
30*ccdc9c3eSSadaf Ebrahimi // Regexp::Parse parses regular expressions encoded in UTF-8.
31*ccdc9c3eSSadaf Ebrahimi // The default syntax is POSIX extended regular expressions,
32*ccdc9c3eSSadaf Ebrahimi // with the following changes:
33*ccdc9c3eSSadaf Ebrahimi //
34*ccdc9c3eSSadaf Ebrahimi //   1.  Backreferences (optional in POSIX EREs) are not supported.
35*ccdc9c3eSSadaf Ebrahimi //         (Supporting them precludes the use of DFA-based
36*ccdc9c3eSSadaf Ebrahimi //          matching engines.)
37*ccdc9c3eSSadaf Ebrahimi //
38*ccdc9c3eSSadaf Ebrahimi //   2.  Collating elements and collation classes are not supported.
39*ccdc9c3eSSadaf Ebrahimi //         (No one has needed or wanted them.)
40*ccdc9c3eSSadaf Ebrahimi //
41*ccdc9c3eSSadaf Ebrahimi // The exact syntax accepted can be modified by passing flags to
42*ccdc9c3eSSadaf Ebrahimi // Regexp::Parse.  In particular, many of the basic Perl additions
43*ccdc9c3eSSadaf Ebrahimi // are available.  The flags are documented below (search for LikePerl).
44*ccdc9c3eSSadaf Ebrahimi //
45*ccdc9c3eSSadaf Ebrahimi // If parsed with the flag Regexp::Latin1, both the regular expression
46*ccdc9c3eSSadaf Ebrahimi // and the input to the matching routines are assumed to be encoded in
47*ccdc9c3eSSadaf Ebrahimi // Latin-1, not UTF-8.
48*ccdc9c3eSSadaf Ebrahimi //
49*ccdc9c3eSSadaf Ebrahimi // EXECUTION
50*ccdc9c3eSSadaf Ebrahimi //
51*ccdc9c3eSSadaf Ebrahimi // Once Regexp has parsed a regular expression, it provides methods
52*ccdc9c3eSSadaf Ebrahimi // to search text using that regular expression.  These methods are
53*ccdc9c3eSSadaf Ebrahimi // implemented via calling out to other regular expression libraries.
54*ccdc9c3eSSadaf Ebrahimi // (Let's call them the sublibraries.)
55*ccdc9c3eSSadaf Ebrahimi //
56*ccdc9c3eSSadaf Ebrahimi // To call a sublibrary, Regexp does not simply prepare a
57*ccdc9c3eSSadaf Ebrahimi // string version of the regular expression and hand it to the
58*ccdc9c3eSSadaf Ebrahimi // sublibrary.  Instead, Regexp prepares, from its own parsed form, the
59*ccdc9c3eSSadaf Ebrahimi // corresponding internal representation used by the sublibrary.
60*ccdc9c3eSSadaf Ebrahimi // This has the drawback of needing to know the internal representation
61*ccdc9c3eSSadaf Ebrahimi // used by the sublibrary, but it has two important benefits:
62*ccdc9c3eSSadaf Ebrahimi //
63*ccdc9c3eSSadaf Ebrahimi //   1. The syntax and meaning of regular expressions is guaranteed
64*ccdc9c3eSSadaf Ebrahimi //      to be that used by Regexp's parser, not the syntax expected
65*ccdc9c3eSSadaf Ebrahimi //      by the sublibrary.  Regexp might accept a restricted or
66*ccdc9c3eSSadaf Ebrahimi //      expanded syntax for regular expressions as compared with
67*ccdc9c3eSSadaf Ebrahimi //      the sublibrary.  As long as Regexp can translate from its
68*ccdc9c3eSSadaf Ebrahimi //      internal form into the sublibrary's, clients need not know
69*ccdc9c3eSSadaf Ebrahimi //      exactly which sublibrary they are using.
70*ccdc9c3eSSadaf Ebrahimi //
71*ccdc9c3eSSadaf Ebrahimi //   2. The sublibrary parsers are bypassed.  For whatever reason,
72*ccdc9c3eSSadaf Ebrahimi //      sublibrary regular expression parsers often have security
73*ccdc9c3eSSadaf Ebrahimi //      problems.  For example, plan9grep's regular expression parser
74*ccdc9c3eSSadaf Ebrahimi //      has a buffer overflow in its handling of large character
75*ccdc9c3eSSadaf Ebrahimi //      classes, and PCRE's parser has had buffer overflow problems
76*ccdc9c3eSSadaf Ebrahimi //      in the past.  Security-team requires sandboxing of sublibrary
77*ccdc9c3eSSadaf Ebrahimi //      regular expression parsers.  Avoiding the sublibrary parsers
78*ccdc9c3eSSadaf Ebrahimi //      avoids the sandbox.
79*ccdc9c3eSSadaf Ebrahimi //
80*ccdc9c3eSSadaf Ebrahimi // The execution methods we use now are provided by the compiled form,
81*ccdc9c3eSSadaf Ebrahimi // Prog, described in prog.h
82*ccdc9c3eSSadaf Ebrahimi //
83*ccdc9c3eSSadaf Ebrahimi // MANIPULATION
84*ccdc9c3eSSadaf Ebrahimi //
85*ccdc9c3eSSadaf Ebrahimi // Unlike other regular expression libraries, Regexp makes its parsed
86*ccdc9c3eSSadaf Ebrahimi // form accessible to clients, so that client code can analyze the
87*ccdc9c3eSSadaf Ebrahimi // parsed regular expressions.
88*ccdc9c3eSSadaf Ebrahimi 
89*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
90*ccdc9c3eSSadaf Ebrahimi #include <map>
91*ccdc9c3eSSadaf Ebrahimi #include <set>
92*ccdc9c3eSSadaf Ebrahimi #include <string>
93*ccdc9c3eSSadaf Ebrahimi 
94*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
95*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
96*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
97*ccdc9c3eSSadaf Ebrahimi #include "re2/stringpiece.h"
98*ccdc9c3eSSadaf Ebrahimi 
99*ccdc9c3eSSadaf Ebrahimi namespace re2 {
100*ccdc9c3eSSadaf Ebrahimi 
101*ccdc9c3eSSadaf Ebrahimi // Keep in sync with string list kOpcodeNames[] in testing/dump.cc
102*ccdc9c3eSSadaf Ebrahimi enum RegexpOp {
103*ccdc9c3eSSadaf Ebrahimi   // Matches no strings.
104*ccdc9c3eSSadaf Ebrahimi   kRegexpNoMatch = 1,
105*ccdc9c3eSSadaf Ebrahimi 
106*ccdc9c3eSSadaf Ebrahimi   // Matches empty string.
107*ccdc9c3eSSadaf Ebrahimi   kRegexpEmptyMatch,
108*ccdc9c3eSSadaf Ebrahimi 
109*ccdc9c3eSSadaf Ebrahimi   // Matches rune_.
110*ccdc9c3eSSadaf Ebrahimi   kRegexpLiteral,
111*ccdc9c3eSSadaf Ebrahimi 
112*ccdc9c3eSSadaf Ebrahimi   // Matches runes_.
113*ccdc9c3eSSadaf Ebrahimi   kRegexpLiteralString,
114*ccdc9c3eSSadaf Ebrahimi 
115*ccdc9c3eSSadaf Ebrahimi   // Matches concatenation of sub_[0..nsub-1].
116*ccdc9c3eSSadaf Ebrahimi   kRegexpConcat,
117*ccdc9c3eSSadaf Ebrahimi   // Matches union of sub_[0..nsub-1].
118*ccdc9c3eSSadaf Ebrahimi   kRegexpAlternate,
119*ccdc9c3eSSadaf Ebrahimi 
120*ccdc9c3eSSadaf Ebrahimi   // Matches sub_[0] zero or more times.
121*ccdc9c3eSSadaf Ebrahimi   kRegexpStar,
122*ccdc9c3eSSadaf Ebrahimi   // Matches sub_[0] one or more times.
123*ccdc9c3eSSadaf Ebrahimi   kRegexpPlus,
124*ccdc9c3eSSadaf Ebrahimi   // Matches sub_[0] zero or one times.
125*ccdc9c3eSSadaf Ebrahimi   kRegexpQuest,
126*ccdc9c3eSSadaf Ebrahimi 
127*ccdc9c3eSSadaf Ebrahimi   // Matches sub_[0] at least min_ times, at most max_ times.
128*ccdc9c3eSSadaf Ebrahimi   // max_ == -1 means no upper limit.
129*ccdc9c3eSSadaf Ebrahimi   kRegexpRepeat,
130*ccdc9c3eSSadaf Ebrahimi 
131*ccdc9c3eSSadaf Ebrahimi   // Parenthesized (capturing) subexpression.  Index is cap_.
132*ccdc9c3eSSadaf Ebrahimi   // Optionally, capturing name is name_.
133*ccdc9c3eSSadaf Ebrahimi   kRegexpCapture,
134*ccdc9c3eSSadaf Ebrahimi 
135*ccdc9c3eSSadaf Ebrahimi   // Matches any character.
136*ccdc9c3eSSadaf Ebrahimi   kRegexpAnyChar,
137*ccdc9c3eSSadaf Ebrahimi 
138*ccdc9c3eSSadaf Ebrahimi   // Matches any byte [sic].
139*ccdc9c3eSSadaf Ebrahimi   kRegexpAnyByte,
140*ccdc9c3eSSadaf Ebrahimi 
141*ccdc9c3eSSadaf Ebrahimi   // Matches empty string at beginning of line.
142*ccdc9c3eSSadaf Ebrahimi   kRegexpBeginLine,
143*ccdc9c3eSSadaf Ebrahimi   // Matches empty string at end of line.
144*ccdc9c3eSSadaf Ebrahimi   kRegexpEndLine,
145*ccdc9c3eSSadaf Ebrahimi 
146*ccdc9c3eSSadaf Ebrahimi   // Matches word boundary "\b".
147*ccdc9c3eSSadaf Ebrahimi   kRegexpWordBoundary,
148*ccdc9c3eSSadaf Ebrahimi   // Matches not-a-word boundary "\B".
149*ccdc9c3eSSadaf Ebrahimi   kRegexpNoWordBoundary,
150*ccdc9c3eSSadaf Ebrahimi 
151*ccdc9c3eSSadaf Ebrahimi   // Matches empty string at beginning of text.
152*ccdc9c3eSSadaf Ebrahimi   kRegexpBeginText,
153*ccdc9c3eSSadaf Ebrahimi   // Matches empty string at end of text.
154*ccdc9c3eSSadaf Ebrahimi   kRegexpEndText,
155*ccdc9c3eSSadaf Ebrahimi 
156*ccdc9c3eSSadaf Ebrahimi   // Matches character class given by cc_.
157*ccdc9c3eSSadaf Ebrahimi   kRegexpCharClass,
158*ccdc9c3eSSadaf Ebrahimi 
159*ccdc9c3eSSadaf Ebrahimi   // Forces match of entire expression right now,
160*ccdc9c3eSSadaf Ebrahimi   // with match ID match_id_ (used by RE2::Set).
161*ccdc9c3eSSadaf Ebrahimi   kRegexpHaveMatch,
162*ccdc9c3eSSadaf Ebrahimi 
163*ccdc9c3eSSadaf Ebrahimi   kMaxRegexpOp = kRegexpHaveMatch,
164*ccdc9c3eSSadaf Ebrahimi };
165*ccdc9c3eSSadaf Ebrahimi 
166*ccdc9c3eSSadaf Ebrahimi // Keep in sync with string list in regexp.cc
167*ccdc9c3eSSadaf Ebrahimi enum RegexpStatusCode {
168*ccdc9c3eSSadaf Ebrahimi   // No error
169*ccdc9c3eSSadaf Ebrahimi   kRegexpSuccess = 0,
170*ccdc9c3eSSadaf Ebrahimi 
171*ccdc9c3eSSadaf Ebrahimi   // Unexpected error
172*ccdc9c3eSSadaf Ebrahimi   kRegexpInternalError,
173*ccdc9c3eSSadaf Ebrahimi 
174*ccdc9c3eSSadaf Ebrahimi   // Parse errors
175*ccdc9c3eSSadaf Ebrahimi   kRegexpBadEscape,          // bad escape sequence
176*ccdc9c3eSSadaf Ebrahimi   kRegexpBadCharClass,       // bad character class
177*ccdc9c3eSSadaf Ebrahimi   kRegexpBadCharRange,       // bad character class range
178*ccdc9c3eSSadaf Ebrahimi   kRegexpMissingBracket,     // missing closing ]
179*ccdc9c3eSSadaf Ebrahimi   kRegexpMissingParen,       // missing closing )
180*ccdc9c3eSSadaf Ebrahimi   kRegexpTrailingBackslash,  // at end of regexp
181*ccdc9c3eSSadaf Ebrahimi   kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
182*ccdc9c3eSSadaf Ebrahimi   kRegexpRepeatSize,         // bad repetition argument
183*ccdc9c3eSSadaf Ebrahimi   kRegexpRepeatOp,           // bad repetition operator
184*ccdc9c3eSSadaf Ebrahimi   kRegexpBadPerlOp,          // bad perl operator
185*ccdc9c3eSSadaf Ebrahimi   kRegexpBadUTF8,            // invalid UTF-8 in regexp
186*ccdc9c3eSSadaf Ebrahimi   kRegexpBadNamedCapture,    // bad named capture
187*ccdc9c3eSSadaf Ebrahimi };
188*ccdc9c3eSSadaf Ebrahimi 
189*ccdc9c3eSSadaf Ebrahimi // Error status for certain operations.
190*ccdc9c3eSSadaf Ebrahimi class RegexpStatus {
191*ccdc9c3eSSadaf Ebrahimi  public:
RegexpStatus()192*ccdc9c3eSSadaf Ebrahimi   RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
~RegexpStatus()193*ccdc9c3eSSadaf Ebrahimi   ~RegexpStatus() { delete tmp_; }
194*ccdc9c3eSSadaf Ebrahimi 
set_code(RegexpStatusCode code)195*ccdc9c3eSSadaf Ebrahimi   void set_code(RegexpStatusCode code) { code_ = code; }
set_error_arg(const StringPiece & error_arg)196*ccdc9c3eSSadaf Ebrahimi   void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
set_tmp(string * tmp)197*ccdc9c3eSSadaf Ebrahimi   void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
code()198*ccdc9c3eSSadaf Ebrahimi   RegexpStatusCode code() const { return code_; }
error_arg()199*ccdc9c3eSSadaf Ebrahimi   const StringPiece& error_arg() const { return error_arg_; }
ok()200*ccdc9c3eSSadaf Ebrahimi   bool ok() const { return code() == kRegexpSuccess; }
201*ccdc9c3eSSadaf Ebrahimi 
202*ccdc9c3eSSadaf Ebrahimi   // Copies state from status.
203*ccdc9c3eSSadaf Ebrahimi   void Copy(const RegexpStatus& status);
204*ccdc9c3eSSadaf Ebrahimi 
205*ccdc9c3eSSadaf Ebrahimi   // Returns text equivalent of code, e.g.:
206*ccdc9c3eSSadaf Ebrahimi   //   "Bad character class"
207*ccdc9c3eSSadaf Ebrahimi   static string CodeText(RegexpStatusCode code);
208*ccdc9c3eSSadaf Ebrahimi 
209*ccdc9c3eSSadaf Ebrahimi   // Returns text describing error, e.g.:
210*ccdc9c3eSSadaf Ebrahimi   //   "Bad character class: [z-a]"
211*ccdc9c3eSSadaf Ebrahimi   string Text() const;
212*ccdc9c3eSSadaf Ebrahimi 
213*ccdc9c3eSSadaf Ebrahimi  private:
214*ccdc9c3eSSadaf Ebrahimi   RegexpStatusCode code_;  // Kind of error
215*ccdc9c3eSSadaf Ebrahimi   StringPiece error_arg_;       // Piece of regexp containing syntax error.
216*ccdc9c3eSSadaf Ebrahimi   string* tmp_;                 // Temporary storage, possibly where error_arg_ is.
217*ccdc9c3eSSadaf Ebrahimi 
218*ccdc9c3eSSadaf Ebrahimi   RegexpStatus(const RegexpStatus&) = delete;
219*ccdc9c3eSSadaf Ebrahimi   RegexpStatus& operator=(const RegexpStatus&) = delete;
220*ccdc9c3eSSadaf Ebrahimi };
221*ccdc9c3eSSadaf Ebrahimi 
222*ccdc9c3eSSadaf Ebrahimi // Compiled form; see prog.h
223*ccdc9c3eSSadaf Ebrahimi class Prog;
224*ccdc9c3eSSadaf Ebrahimi 
225*ccdc9c3eSSadaf Ebrahimi struct RuneRange {
RuneRangeRuneRange226*ccdc9c3eSSadaf Ebrahimi   RuneRange() : lo(0), hi(0) { }
RuneRangeRuneRange227*ccdc9c3eSSadaf Ebrahimi   RuneRange(int l, int h) : lo(l), hi(h) { }
228*ccdc9c3eSSadaf Ebrahimi   Rune lo;
229*ccdc9c3eSSadaf Ebrahimi   Rune hi;
230*ccdc9c3eSSadaf Ebrahimi };
231*ccdc9c3eSSadaf Ebrahimi 
232*ccdc9c3eSSadaf Ebrahimi // Less-than on RuneRanges treats a == b if they overlap at all.
233*ccdc9c3eSSadaf Ebrahimi // This lets us look in a set to find the range covering a particular Rune.
234*ccdc9c3eSSadaf Ebrahimi struct RuneRangeLess {
operatorRuneRangeLess235*ccdc9c3eSSadaf Ebrahimi   bool operator()(const RuneRange& a, const RuneRange& b) const {
236*ccdc9c3eSSadaf Ebrahimi     return a.hi < b.lo;
237*ccdc9c3eSSadaf Ebrahimi   }
238*ccdc9c3eSSadaf Ebrahimi };
239*ccdc9c3eSSadaf Ebrahimi 
240*ccdc9c3eSSadaf Ebrahimi class CharClassBuilder;
241*ccdc9c3eSSadaf Ebrahimi 
242*ccdc9c3eSSadaf Ebrahimi class CharClass {
243*ccdc9c3eSSadaf Ebrahimi  public:
244*ccdc9c3eSSadaf Ebrahimi   void Delete();
245*ccdc9c3eSSadaf Ebrahimi 
246*ccdc9c3eSSadaf Ebrahimi   typedef RuneRange* iterator;
begin()247*ccdc9c3eSSadaf Ebrahimi   iterator begin() { return ranges_; }
end()248*ccdc9c3eSSadaf Ebrahimi   iterator end() { return ranges_ + nranges_; }
249*ccdc9c3eSSadaf Ebrahimi 
size()250*ccdc9c3eSSadaf Ebrahimi   int size() { return nrunes_; }
empty()251*ccdc9c3eSSadaf Ebrahimi   bool empty() { return nrunes_ == 0; }
full()252*ccdc9c3eSSadaf Ebrahimi   bool full() { return nrunes_ == Runemax+1; }
FoldsASCII()253*ccdc9c3eSSadaf Ebrahimi   bool FoldsASCII() { return folds_ascii_; }
254*ccdc9c3eSSadaf Ebrahimi 
255*ccdc9c3eSSadaf Ebrahimi   bool Contains(Rune r);
256*ccdc9c3eSSadaf Ebrahimi   CharClass* Negate();
257*ccdc9c3eSSadaf Ebrahimi 
258*ccdc9c3eSSadaf Ebrahimi  private:
259*ccdc9c3eSSadaf Ebrahimi   CharClass();  // not implemented
260*ccdc9c3eSSadaf Ebrahimi   ~CharClass();  // not implemented
261*ccdc9c3eSSadaf Ebrahimi   static CharClass* New(int maxranges);
262*ccdc9c3eSSadaf Ebrahimi 
263*ccdc9c3eSSadaf Ebrahimi   friend class CharClassBuilder;
264*ccdc9c3eSSadaf Ebrahimi 
265*ccdc9c3eSSadaf Ebrahimi   bool folds_ascii_;
266*ccdc9c3eSSadaf Ebrahimi   int nrunes_;
267*ccdc9c3eSSadaf Ebrahimi   RuneRange *ranges_;
268*ccdc9c3eSSadaf Ebrahimi   int nranges_;
269*ccdc9c3eSSadaf Ebrahimi 
270*ccdc9c3eSSadaf Ebrahimi   CharClass(const CharClass&) = delete;
271*ccdc9c3eSSadaf Ebrahimi   CharClass& operator=(const CharClass&) = delete;
272*ccdc9c3eSSadaf Ebrahimi };
273*ccdc9c3eSSadaf Ebrahimi 
274*ccdc9c3eSSadaf Ebrahimi class Regexp {
275*ccdc9c3eSSadaf Ebrahimi  public:
276*ccdc9c3eSSadaf Ebrahimi 
277*ccdc9c3eSSadaf Ebrahimi   // Flags for parsing.  Can be ORed together.
278*ccdc9c3eSSadaf Ebrahimi   enum ParseFlags {
279*ccdc9c3eSSadaf Ebrahimi     NoParseFlags  = 0,
280*ccdc9c3eSSadaf Ebrahimi     FoldCase      = 1<<0,   // Fold case during matching (case-insensitive).
281*ccdc9c3eSSadaf Ebrahimi     Literal       = 1<<1,   // Treat s as literal string instead of a regexp.
282*ccdc9c3eSSadaf Ebrahimi     ClassNL       = 1<<2,   // Allow char classes like [^a-z] and \D and \s
283*ccdc9c3eSSadaf Ebrahimi                             // and [[:space:]] to match newline.
284*ccdc9c3eSSadaf Ebrahimi     DotNL         = 1<<3,   // Allow . to match newline.
285*ccdc9c3eSSadaf Ebrahimi     MatchNL       = ClassNL | DotNL,
286*ccdc9c3eSSadaf Ebrahimi     OneLine       = 1<<4,   // Treat ^ and $ as only matching at beginning and
287*ccdc9c3eSSadaf Ebrahimi                             // end of text, not around embedded newlines.
288*ccdc9c3eSSadaf Ebrahimi                             // (Perl's default)
289*ccdc9c3eSSadaf Ebrahimi     Latin1        = 1<<5,   // Regexp and text are in Latin1, not UTF-8.
290*ccdc9c3eSSadaf Ebrahimi     NonGreedy     = 1<<6,   // Repetition operators are non-greedy by default.
291*ccdc9c3eSSadaf Ebrahimi     PerlClasses   = 1<<7,   // Allow Perl character classes like \d.
292*ccdc9c3eSSadaf Ebrahimi     PerlB         = 1<<8,   // Allow Perl's \b and \B.
293*ccdc9c3eSSadaf Ebrahimi     PerlX         = 1<<9,   // Perl extensions:
294*ccdc9c3eSSadaf Ebrahimi                             //   non-capturing parens - (?: )
295*ccdc9c3eSSadaf Ebrahimi                             //   non-greedy operators - *? +? ?? {}?
296*ccdc9c3eSSadaf Ebrahimi                             //   flag edits - (?i) (?-i) (?i: )
297*ccdc9c3eSSadaf Ebrahimi                             //     i - FoldCase
298*ccdc9c3eSSadaf Ebrahimi                             //     m - !OneLine
299*ccdc9c3eSSadaf Ebrahimi                             //     s - DotNL
300*ccdc9c3eSSadaf Ebrahimi                             //     U - NonGreedy
301*ccdc9c3eSSadaf Ebrahimi                             //   line ends: \A \z
302*ccdc9c3eSSadaf Ebrahimi                             //   \Q and \E to disable/enable metacharacters
303*ccdc9c3eSSadaf Ebrahimi                             //   (?P<name>expr) for named captures
304*ccdc9c3eSSadaf Ebrahimi                             //   \C to match any single byte
305*ccdc9c3eSSadaf Ebrahimi     UnicodeGroups = 1<<10,  // Allow \p{Han} for Unicode Han group
306*ccdc9c3eSSadaf Ebrahimi                             //   and \P{Han} for its negation.
307*ccdc9c3eSSadaf Ebrahimi     NeverNL       = 1<<11,  // Never match NL, even if the regexp mentions
308*ccdc9c3eSSadaf Ebrahimi                             //   it explicitly.
309*ccdc9c3eSSadaf Ebrahimi     NeverCapture  = 1<<12,  // Parse all parens as non-capturing.
310*ccdc9c3eSSadaf Ebrahimi 
311*ccdc9c3eSSadaf Ebrahimi     // As close to Perl as we can get.
312*ccdc9c3eSSadaf Ebrahimi     LikePerl      = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
313*ccdc9c3eSSadaf Ebrahimi                     UnicodeGroups,
314*ccdc9c3eSSadaf Ebrahimi 
315*ccdc9c3eSSadaf Ebrahimi     // Internal use only.
316*ccdc9c3eSSadaf Ebrahimi     WasDollar     = 1<<13,  // on kRegexpEndText: was $ in regexp text
317*ccdc9c3eSSadaf Ebrahimi     AllParseFlags = (1<<14)-1,
318*ccdc9c3eSSadaf Ebrahimi   };
319*ccdc9c3eSSadaf Ebrahimi 
320*ccdc9c3eSSadaf Ebrahimi   // Get.  No set, Regexps are logically immutable once created.
op()321*ccdc9c3eSSadaf Ebrahimi   RegexpOp op() { return static_cast<RegexpOp>(op_); }
nsub()322*ccdc9c3eSSadaf Ebrahimi   int nsub() { return nsub_; }
simple()323*ccdc9c3eSSadaf Ebrahimi   bool simple() { return simple_ != 0; }
parse_flags()324*ccdc9c3eSSadaf Ebrahimi   ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
325*ccdc9c3eSSadaf Ebrahimi   int Ref();  // For testing.
326*ccdc9c3eSSadaf Ebrahimi 
sub()327*ccdc9c3eSSadaf Ebrahimi   Regexp** sub() {
328*ccdc9c3eSSadaf Ebrahimi     if(nsub_ <= 1)
329*ccdc9c3eSSadaf Ebrahimi       return &subone_;
330*ccdc9c3eSSadaf Ebrahimi     else
331*ccdc9c3eSSadaf Ebrahimi       return submany_;
332*ccdc9c3eSSadaf Ebrahimi   }
333*ccdc9c3eSSadaf Ebrahimi 
min()334*ccdc9c3eSSadaf Ebrahimi   int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
max()335*ccdc9c3eSSadaf Ebrahimi   int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
rune()336*ccdc9c3eSSadaf Ebrahimi   Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
cc()337*ccdc9c3eSSadaf Ebrahimi   CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
cap()338*ccdc9c3eSSadaf Ebrahimi   int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
name()339*ccdc9c3eSSadaf Ebrahimi   const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
runes()340*ccdc9c3eSSadaf Ebrahimi   Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
nrunes()341*ccdc9c3eSSadaf Ebrahimi   int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
match_id()342*ccdc9c3eSSadaf Ebrahimi   int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
343*ccdc9c3eSSadaf Ebrahimi 
344*ccdc9c3eSSadaf Ebrahimi   // Increments reference count, returns object as convenience.
345*ccdc9c3eSSadaf Ebrahimi   Regexp* Incref();
346*ccdc9c3eSSadaf Ebrahimi 
347*ccdc9c3eSSadaf Ebrahimi   // Decrements reference count and deletes this object if count reaches 0.
348*ccdc9c3eSSadaf Ebrahimi   void Decref();
349*ccdc9c3eSSadaf Ebrahimi 
350*ccdc9c3eSSadaf Ebrahimi   // Parses string s to produce regular expression, returned.
351*ccdc9c3eSSadaf Ebrahimi   // Caller must release return value with re->Decref().
352*ccdc9c3eSSadaf Ebrahimi   // On failure, sets *status (if status != NULL) and returns NULL.
353*ccdc9c3eSSadaf Ebrahimi   static Regexp* Parse(const StringPiece& s, ParseFlags flags,
354*ccdc9c3eSSadaf Ebrahimi                        RegexpStatus* status);
355*ccdc9c3eSSadaf Ebrahimi 
356*ccdc9c3eSSadaf Ebrahimi   // Returns a _new_ simplified version of the current regexp.
357*ccdc9c3eSSadaf Ebrahimi   // Does not edit the current regexp.
358*ccdc9c3eSSadaf Ebrahimi   // Caller must release return value with re->Decref().
359*ccdc9c3eSSadaf Ebrahimi   // Simplified means that counted repetition has been rewritten
360*ccdc9c3eSSadaf Ebrahimi   // into simpler terms and all Perl/POSIX features have been
361*ccdc9c3eSSadaf Ebrahimi   // removed.  The result will capture exactly the same
362*ccdc9c3eSSadaf Ebrahimi   // subexpressions the original did, unless formatted with ToString.
363*ccdc9c3eSSadaf Ebrahimi   Regexp* Simplify();
364*ccdc9c3eSSadaf Ebrahimi   friend class CoalesceWalker;
365*ccdc9c3eSSadaf Ebrahimi   friend class SimplifyWalker;
366*ccdc9c3eSSadaf Ebrahimi 
367*ccdc9c3eSSadaf Ebrahimi   // Parses the regexp src and then simplifies it and sets *dst to the
368*ccdc9c3eSSadaf Ebrahimi   // string representation of the simplified form.  Returns true on success.
369*ccdc9c3eSSadaf Ebrahimi   // Returns false and sets *status (if status != NULL) on parse error.
370*ccdc9c3eSSadaf Ebrahimi   static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
371*ccdc9c3eSSadaf Ebrahimi                              string* dst,
372*ccdc9c3eSSadaf Ebrahimi                              RegexpStatus* status);
373*ccdc9c3eSSadaf Ebrahimi 
374*ccdc9c3eSSadaf Ebrahimi   // Returns the number of capturing groups in the regexp.
375*ccdc9c3eSSadaf Ebrahimi   int NumCaptures();
376*ccdc9c3eSSadaf Ebrahimi   friend class NumCapturesWalker;
377*ccdc9c3eSSadaf Ebrahimi 
378*ccdc9c3eSSadaf Ebrahimi   // Returns a map from names to capturing group indices,
379*ccdc9c3eSSadaf Ebrahimi   // or NULL if the regexp contains no named capture groups.
380*ccdc9c3eSSadaf Ebrahimi   // The caller is responsible for deleting the map.
381*ccdc9c3eSSadaf Ebrahimi   std::map<string, int>* NamedCaptures();
382*ccdc9c3eSSadaf Ebrahimi 
383*ccdc9c3eSSadaf Ebrahimi   // Returns a map from capturing group indices to capturing group
384*ccdc9c3eSSadaf Ebrahimi   // names or NULL if the regexp contains no named capture groups. The
385*ccdc9c3eSSadaf Ebrahimi   // caller is responsible for deleting the map.
386*ccdc9c3eSSadaf Ebrahimi   std::map<int, string>* CaptureNames();
387*ccdc9c3eSSadaf Ebrahimi 
388*ccdc9c3eSSadaf Ebrahimi   // Returns a string representation of the current regexp,
389*ccdc9c3eSSadaf Ebrahimi   // using as few parentheses as possible.
390*ccdc9c3eSSadaf Ebrahimi   string ToString();
391*ccdc9c3eSSadaf Ebrahimi 
392*ccdc9c3eSSadaf Ebrahimi   // Convenience functions.  They consume the passed reference,
393*ccdc9c3eSSadaf Ebrahimi   // so in many cases you should use, e.g., Plus(re->Incref(), flags).
394*ccdc9c3eSSadaf Ebrahimi   // They do not consume allocated arrays like subs or runes.
395*ccdc9c3eSSadaf Ebrahimi   static Regexp* Plus(Regexp* sub, ParseFlags flags);
396*ccdc9c3eSSadaf Ebrahimi   static Regexp* Star(Regexp* sub, ParseFlags flags);
397*ccdc9c3eSSadaf Ebrahimi   static Regexp* Quest(Regexp* sub, ParseFlags flags);
398*ccdc9c3eSSadaf Ebrahimi   static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
399*ccdc9c3eSSadaf Ebrahimi   static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
400*ccdc9c3eSSadaf Ebrahimi   static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
401*ccdc9c3eSSadaf Ebrahimi   static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
402*ccdc9c3eSSadaf Ebrahimi   static Regexp* NewLiteral(Rune rune, ParseFlags flags);
403*ccdc9c3eSSadaf Ebrahimi   static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
404*ccdc9c3eSSadaf Ebrahimi   static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
405*ccdc9c3eSSadaf Ebrahimi   static Regexp* HaveMatch(int match_id, ParseFlags flags);
406*ccdc9c3eSSadaf Ebrahimi 
407*ccdc9c3eSSadaf Ebrahimi   // Like Alternate but does not factor out common prefixes.
408*ccdc9c3eSSadaf Ebrahimi   static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
409*ccdc9c3eSSadaf Ebrahimi 
410*ccdc9c3eSSadaf Ebrahimi   // Debugging function.  Returns string format for regexp
411*ccdc9c3eSSadaf Ebrahimi   // that makes structure clear.  Does NOT use regexp syntax.
412*ccdc9c3eSSadaf Ebrahimi   string Dump();
413*ccdc9c3eSSadaf Ebrahimi 
414*ccdc9c3eSSadaf Ebrahimi   // Helper traversal class, defined fully in walker-inl.h.
415*ccdc9c3eSSadaf Ebrahimi   template<typename T> class Walker;
416*ccdc9c3eSSadaf Ebrahimi 
417*ccdc9c3eSSadaf Ebrahimi   // Compile to Prog.  See prog.h
418*ccdc9c3eSSadaf Ebrahimi   // Reverse prog expects to be run over text backward.
419*ccdc9c3eSSadaf Ebrahimi   // Construction and execution of prog will
420*ccdc9c3eSSadaf Ebrahimi   // stay within approximately max_mem bytes of memory.
421*ccdc9c3eSSadaf Ebrahimi   // If max_mem <= 0, a reasonable default is used.
422*ccdc9c3eSSadaf Ebrahimi   Prog* CompileToProg(int64_t max_mem);
423*ccdc9c3eSSadaf Ebrahimi   Prog* CompileToReverseProg(int64_t max_mem);
424*ccdc9c3eSSadaf Ebrahimi 
425*ccdc9c3eSSadaf Ebrahimi   // Whether to expect this library to find exactly the same answer as PCRE
426*ccdc9c3eSSadaf Ebrahimi   // when running this regexp.  Most regexps do mimic PCRE exactly, but a few
427*ccdc9c3eSSadaf Ebrahimi   // obscure cases behave differently.  Technically this is more a property
428*ccdc9c3eSSadaf Ebrahimi   // of the Prog than the Regexp, but the computation is much easier to do
429*ccdc9c3eSSadaf Ebrahimi   // on the Regexp.  See mimics_pcre.cc for the exact conditions.
430*ccdc9c3eSSadaf Ebrahimi   bool MimicsPCRE();
431*ccdc9c3eSSadaf Ebrahimi 
432*ccdc9c3eSSadaf Ebrahimi   // Benchmarking function.
433*ccdc9c3eSSadaf Ebrahimi   void NullWalk();
434*ccdc9c3eSSadaf Ebrahimi 
435*ccdc9c3eSSadaf Ebrahimi   // Whether every match of this regexp must be anchored and
436*ccdc9c3eSSadaf Ebrahimi   // begin with a non-empty fixed string (perhaps after ASCII
437*ccdc9c3eSSadaf Ebrahimi   // case-folding).  If so, returns the prefix and the sub-regexp that
438*ccdc9c3eSSadaf Ebrahimi   // follows it.
439*ccdc9c3eSSadaf Ebrahimi   // Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
440*ccdc9c3eSSadaf Ebrahimi   // regardless of the return value.
441*ccdc9c3eSSadaf Ebrahimi   bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix);
442*ccdc9c3eSSadaf Ebrahimi 
443*ccdc9c3eSSadaf Ebrahimi  private:
444*ccdc9c3eSSadaf Ebrahimi   // Constructor allocates vectors as appropriate for operator.
445*ccdc9c3eSSadaf Ebrahimi   explicit Regexp(RegexpOp op, ParseFlags parse_flags);
446*ccdc9c3eSSadaf Ebrahimi 
447*ccdc9c3eSSadaf Ebrahimi   // Use Decref() instead of delete to release Regexps.
448*ccdc9c3eSSadaf Ebrahimi   // This is private to catch deletes at compile time.
449*ccdc9c3eSSadaf Ebrahimi   ~Regexp();
450*ccdc9c3eSSadaf Ebrahimi   void Destroy();
451*ccdc9c3eSSadaf Ebrahimi   bool QuickDestroy();
452*ccdc9c3eSSadaf Ebrahimi 
453*ccdc9c3eSSadaf Ebrahimi   // Helpers for Parse.  Listed here so they can edit Regexps.
454*ccdc9c3eSSadaf Ebrahimi   class ParseState;
455*ccdc9c3eSSadaf Ebrahimi 
456*ccdc9c3eSSadaf Ebrahimi   friend class ParseState;
457*ccdc9c3eSSadaf Ebrahimi   friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
458*ccdc9c3eSSadaf Ebrahimi                              RegexpStatus* status);
459*ccdc9c3eSSadaf Ebrahimi 
460*ccdc9c3eSSadaf Ebrahimi   // Helper for testing [sic].
461*ccdc9c3eSSadaf Ebrahimi   friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
462*ccdc9c3eSSadaf Ebrahimi 
463*ccdc9c3eSSadaf Ebrahimi   // Computes whether Regexp is already simple.
464*ccdc9c3eSSadaf Ebrahimi   bool ComputeSimple();
465*ccdc9c3eSSadaf Ebrahimi 
466*ccdc9c3eSSadaf Ebrahimi   // Constructor that generates a Star, Plus or Quest,
467*ccdc9c3eSSadaf Ebrahimi   // squashing the pair if sub is also a Star, Plus or Quest.
468*ccdc9c3eSSadaf Ebrahimi   static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
469*ccdc9c3eSSadaf Ebrahimi 
470*ccdc9c3eSSadaf Ebrahimi   // Constructor that generates a concatenation or alternation,
471*ccdc9c3eSSadaf Ebrahimi   // enforcing the limit on the number of subexpressions for
472*ccdc9c3eSSadaf Ebrahimi   // a particular Regexp.
473*ccdc9c3eSSadaf Ebrahimi   static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
474*ccdc9c3eSSadaf Ebrahimi                                    ParseFlags flags, bool can_factor);
475*ccdc9c3eSSadaf Ebrahimi 
476*ccdc9c3eSSadaf Ebrahimi   // Returns the leading string that re starts with.
477*ccdc9c3eSSadaf Ebrahimi   // The returned Rune* points into a piece of re,
478*ccdc9c3eSSadaf Ebrahimi   // so it must not be used after the caller calls re->Decref().
479*ccdc9c3eSSadaf Ebrahimi   static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
480*ccdc9c3eSSadaf Ebrahimi 
481*ccdc9c3eSSadaf Ebrahimi   // Removes the first n leading runes from the beginning of re.
482*ccdc9c3eSSadaf Ebrahimi   // Edits re in place.
483*ccdc9c3eSSadaf Ebrahimi   static void RemoveLeadingString(Regexp* re, int n);
484*ccdc9c3eSSadaf Ebrahimi 
485*ccdc9c3eSSadaf Ebrahimi   // Returns the leading regexp in re's top-level concatenation.
486*ccdc9c3eSSadaf Ebrahimi   // The returned Regexp* points at re or a sub-expression of re,
487*ccdc9c3eSSadaf Ebrahimi   // so it must not be used after the caller calls re->Decref().
488*ccdc9c3eSSadaf Ebrahimi   static Regexp* LeadingRegexp(Regexp* re);
489*ccdc9c3eSSadaf Ebrahimi 
490*ccdc9c3eSSadaf Ebrahimi   // Removes LeadingRegexp(re) from re and returns the remainder.
491*ccdc9c3eSSadaf Ebrahimi   // Might edit re in place.
492*ccdc9c3eSSadaf Ebrahimi   static Regexp* RemoveLeadingRegexp(Regexp* re);
493*ccdc9c3eSSadaf Ebrahimi 
494*ccdc9c3eSSadaf Ebrahimi   // Simplifies an alternation of literal strings by factoring out
495*ccdc9c3eSSadaf Ebrahimi   // common prefixes.
496*ccdc9c3eSSadaf Ebrahimi   static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
497*ccdc9c3eSSadaf Ebrahimi   friend class FactorAlternationImpl;
498*ccdc9c3eSSadaf Ebrahimi 
499*ccdc9c3eSSadaf Ebrahimi   // Is a == b?  Only efficient on regexps that have not been through
500*ccdc9c3eSSadaf Ebrahimi   // Simplify yet - the expansion of a kRegexpRepeat will make this
501*ccdc9c3eSSadaf Ebrahimi   // take a long time.  Do not call on such regexps, hence private.
502*ccdc9c3eSSadaf Ebrahimi   static bool Equal(Regexp* a, Regexp* b);
503*ccdc9c3eSSadaf Ebrahimi 
504*ccdc9c3eSSadaf Ebrahimi   // Allocate space for n sub-regexps.
AllocSub(int n)505*ccdc9c3eSSadaf Ebrahimi   void AllocSub(int n) {
506*ccdc9c3eSSadaf Ebrahimi     DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
507*ccdc9c3eSSadaf Ebrahimi     if (n > 1)
508*ccdc9c3eSSadaf Ebrahimi       submany_ = new Regexp*[n];
509*ccdc9c3eSSadaf Ebrahimi     nsub_ = static_cast<uint16_t>(n);
510*ccdc9c3eSSadaf Ebrahimi   }
511*ccdc9c3eSSadaf Ebrahimi 
512*ccdc9c3eSSadaf Ebrahimi   // Add Rune to LiteralString
513*ccdc9c3eSSadaf Ebrahimi   void AddRuneToString(Rune r);
514*ccdc9c3eSSadaf Ebrahimi 
515*ccdc9c3eSSadaf Ebrahimi   // Swaps this with that, in place.
516*ccdc9c3eSSadaf Ebrahimi   void Swap(Regexp *that);
517*ccdc9c3eSSadaf Ebrahimi 
518*ccdc9c3eSSadaf Ebrahimi   // Operator.  See description of operators above.
519*ccdc9c3eSSadaf Ebrahimi   // uint8_t instead of RegexpOp to control space usage.
520*ccdc9c3eSSadaf Ebrahimi   uint8_t op_;
521*ccdc9c3eSSadaf Ebrahimi 
522*ccdc9c3eSSadaf Ebrahimi   // Is this regexp structure already simple
523*ccdc9c3eSSadaf Ebrahimi   // (has it been returned by Simplify)?
524*ccdc9c3eSSadaf Ebrahimi   // uint8_t instead of bool to control space usage.
525*ccdc9c3eSSadaf Ebrahimi   uint8_t simple_;
526*ccdc9c3eSSadaf Ebrahimi 
527*ccdc9c3eSSadaf Ebrahimi   // Flags saved from parsing and used during execution.
528*ccdc9c3eSSadaf Ebrahimi   // (Only FoldCase is used.)
529*ccdc9c3eSSadaf Ebrahimi   // uint16_t instead of ParseFlags to control space usage.
530*ccdc9c3eSSadaf Ebrahimi   uint16_t parse_flags_;
531*ccdc9c3eSSadaf Ebrahimi 
532*ccdc9c3eSSadaf Ebrahimi   // Reference count.  Exists so that SimplifyRegexp can build
533*ccdc9c3eSSadaf Ebrahimi   // regexp structures that are dags rather than trees to avoid
534*ccdc9c3eSSadaf Ebrahimi   // exponential blowup in space requirements.
535*ccdc9c3eSSadaf Ebrahimi   // uint16_t to control space usage.
536*ccdc9c3eSSadaf Ebrahimi   // The standard regexp routines will never generate a
537*ccdc9c3eSSadaf Ebrahimi   // ref greater than the maximum repeat count (kMaxRepeat),
538*ccdc9c3eSSadaf Ebrahimi   // but even so, Incref and Decref consult an overflow map
539*ccdc9c3eSSadaf Ebrahimi   // when ref_ reaches kMaxRef.
540*ccdc9c3eSSadaf Ebrahimi   uint16_t ref_;
541*ccdc9c3eSSadaf Ebrahimi   static const uint16_t kMaxRef = 0xffff;
542*ccdc9c3eSSadaf Ebrahimi 
543*ccdc9c3eSSadaf Ebrahimi   // Subexpressions.
544*ccdc9c3eSSadaf Ebrahimi   // uint16_t to control space usage.
545*ccdc9c3eSSadaf Ebrahimi   // Concat and Alternate handle larger numbers of subexpressions
546*ccdc9c3eSSadaf Ebrahimi   // by building concatenation or alternation trees.
547*ccdc9c3eSSadaf Ebrahimi   // Other routines should call Concat or Alternate instead of
548*ccdc9c3eSSadaf Ebrahimi   // filling in sub() by hand.
549*ccdc9c3eSSadaf Ebrahimi   uint16_t nsub_;
550*ccdc9c3eSSadaf Ebrahimi   static const uint16_t kMaxNsub = 0xffff;
551*ccdc9c3eSSadaf Ebrahimi   union {
552*ccdc9c3eSSadaf Ebrahimi     Regexp** submany_;  // if nsub_ > 1
553*ccdc9c3eSSadaf Ebrahimi     Regexp* subone_;  // if nsub_ == 1
554*ccdc9c3eSSadaf Ebrahimi   };
555*ccdc9c3eSSadaf Ebrahimi 
556*ccdc9c3eSSadaf Ebrahimi   // Extra space for parse and teardown stacks.
557*ccdc9c3eSSadaf Ebrahimi   Regexp* down_;
558*ccdc9c3eSSadaf Ebrahimi 
559*ccdc9c3eSSadaf Ebrahimi   // Arguments to operator.  See description of operators above.
560*ccdc9c3eSSadaf Ebrahimi   union {
561*ccdc9c3eSSadaf Ebrahimi     struct {  // Repeat
562*ccdc9c3eSSadaf Ebrahimi       int max_;
563*ccdc9c3eSSadaf Ebrahimi       int min_;
564*ccdc9c3eSSadaf Ebrahimi     };
565*ccdc9c3eSSadaf Ebrahimi     struct {  // Capture
566*ccdc9c3eSSadaf Ebrahimi       int cap_;
567*ccdc9c3eSSadaf Ebrahimi       string* name_;
568*ccdc9c3eSSadaf Ebrahimi     };
569*ccdc9c3eSSadaf Ebrahimi     struct {  // LiteralString
570*ccdc9c3eSSadaf Ebrahimi       int nrunes_;
571*ccdc9c3eSSadaf Ebrahimi       Rune* runes_;
572*ccdc9c3eSSadaf Ebrahimi     };
573*ccdc9c3eSSadaf Ebrahimi     struct {  // CharClass
574*ccdc9c3eSSadaf Ebrahimi       // These two could be in separate union members,
575*ccdc9c3eSSadaf Ebrahimi       // but it wouldn't save any space (there are other two-word structs)
576*ccdc9c3eSSadaf Ebrahimi       // and keeping them separate avoids confusion during parsing.
577*ccdc9c3eSSadaf Ebrahimi       CharClass* cc_;
578*ccdc9c3eSSadaf Ebrahimi       CharClassBuilder* ccb_;
579*ccdc9c3eSSadaf Ebrahimi     };
580*ccdc9c3eSSadaf Ebrahimi     Rune rune_;  // Literal
581*ccdc9c3eSSadaf Ebrahimi     int match_id_;  // HaveMatch
582*ccdc9c3eSSadaf Ebrahimi     void *the_union_[2];  // as big as any other element, for memset
583*ccdc9c3eSSadaf Ebrahimi   };
584*ccdc9c3eSSadaf Ebrahimi 
585*ccdc9c3eSSadaf Ebrahimi   Regexp(const Regexp&) = delete;
586*ccdc9c3eSSadaf Ebrahimi   Regexp& operator=(const Regexp&) = delete;
587*ccdc9c3eSSadaf Ebrahimi };
588*ccdc9c3eSSadaf Ebrahimi 
589*ccdc9c3eSSadaf Ebrahimi // Character class set: contains non-overlapping, non-abutting RuneRanges.
590*ccdc9c3eSSadaf Ebrahimi typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
591*ccdc9c3eSSadaf Ebrahimi 
592*ccdc9c3eSSadaf Ebrahimi class CharClassBuilder {
593*ccdc9c3eSSadaf Ebrahimi  public:
594*ccdc9c3eSSadaf Ebrahimi   CharClassBuilder();
595*ccdc9c3eSSadaf Ebrahimi 
596*ccdc9c3eSSadaf Ebrahimi   typedef RuneRangeSet::iterator iterator;
begin()597*ccdc9c3eSSadaf Ebrahimi   iterator begin() { return ranges_.begin(); }
end()598*ccdc9c3eSSadaf Ebrahimi   iterator end() { return ranges_.end(); }
599*ccdc9c3eSSadaf Ebrahimi 
size()600*ccdc9c3eSSadaf Ebrahimi   int size() { return nrunes_; }
empty()601*ccdc9c3eSSadaf Ebrahimi   bool empty() { return nrunes_ == 0; }
full()602*ccdc9c3eSSadaf Ebrahimi   bool full() { return nrunes_ == Runemax+1; }
603*ccdc9c3eSSadaf Ebrahimi 
604*ccdc9c3eSSadaf Ebrahimi   bool Contains(Rune r);
605*ccdc9c3eSSadaf Ebrahimi   bool FoldsASCII();
606*ccdc9c3eSSadaf Ebrahimi   bool AddRange(Rune lo, Rune hi);  // returns whether class changed
607*ccdc9c3eSSadaf Ebrahimi   CharClassBuilder* Copy();
608*ccdc9c3eSSadaf Ebrahimi   void AddCharClass(CharClassBuilder* cc);
609*ccdc9c3eSSadaf Ebrahimi   void Negate();
610*ccdc9c3eSSadaf Ebrahimi   void RemoveAbove(Rune r);
611*ccdc9c3eSSadaf Ebrahimi   CharClass* GetCharClass();
612*ccdc9c3eSSadaf Ebrahimi   void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
613*ccdc9c3eSSadaf Ebrahimi 
614*ccdc9c3eSSadaf Ebrahimi  private:
615*ccdc9c3eSSadaf Ebrahimi   static const uint32_t AlphaMask = (1<<26) - 1;
616*ccdc9c3eSSadaf Ebrahimi   uint32_t upper_;  // bitmap of A-Z
617*ccdc9c3eSSadaf Ebrahimi   uint32_t lower_;  // bitmap of a-z
618*ccdc9c3eSSadaf Ebrahimi   int nrunes_;
619*ccdc9c3eSSadaf Ebrahimi   RuneRangeSet ranges_;
620*ccdc9c3eSSadaf Ebrahimi 
621*ccdc9c3eSSadaf Ebrahimi   CharClassBuilder(const CharClassBuilder&) = delete;
622*ccdc9c3eSSadaf Ebrahimi   CharClassBuilder& operator=(const CharClassBuilder&) = delete;
623*ccdc9c3eSSadaf Ebrahimi };
624*ccdc9c3eSSadaf Ebrahimi 
625*ccdc9c3eSSadaf Ebrahimi // Bitwise ops on ParseFlags produce ParseFlags.
626*ccdc9c3eSSadaf Ebrahimi inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
627*ccdc9c3eSSadaf Ebrahimi                                     Regexp::ParseFlags b) {
628*ccdc9c3eSSadaf Ebrahimi   return static_cast<Regexp::ParseFlags>(
629*ccdc9c3eSSadaf Ebrahimi       static_cast<int>(a) | static_cast<int>(b));
630*ccdc9c3eSSadaf Ebrahimi }
631*ccdc9c3eSSadaf Ebrahimi 
632*ccdc9c3eSSadaf Ebrahimi inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
633*ccdc9c3eSSadaf Ebrahimi                                     Regexp::ParseFlags b) {
634*ccdc9c3eSSadaf Ebrahimi   return static_cast<Regexp::ParseFlags>(
635*ccdc9c3eSSadaf Ebrahimi       static_cast<int>(a) ^ static_cast<int>(b));
636*ccdc9c3eSSadaf Ebrahimi }
637*ccdc9c3eSSadaf Ebrahimi 
638*ccdc9c3eSSadaf Ebrahimi inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
639*ccdc9c3eSSadaf Ebrahimi                                     Regexp::ParseFlags b) {
640*ccdc9c3eSSadaf Ebrahimi   return static_cast<Regexp::ParseFlags>(
641*ccdc9c3eSSadaf Ebrahimi       static_cast<int>(a) & static_cast<int>(b));
642*ccdc9c3eSSadaf Ebrahimi }
643*ccdc9c3eSSadaf Ebrahimi 
644*ccdc9c3eSSadaf Ebrahimi inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
645*ccdc9c3eSSadaf Ebrahimi   // Attempting to produce a value out of enum's range has undefined behaviour.
646*ccdc9c3eSSadaf Ebrahimi   return static_cast<Regexp::ParseFlags>(
647*ccdc9c3eSSadaf Ebrahimi       ~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));
648*ccdc9c3eSSadaf Ebrahimi }
649*ccdc9c3eSSadaf Ebrahimi 
650*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
651*ccdc9c3eSSadaf Ebrahimi 
652*ccdc9c3eSSadaf Ebrahimi #endif  // RE2_REGEXP_H_
653