1*ccdc9c3eSSadaf Ebrahimi // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi
5*ccdc9c3eSSadaf Ebrahimi #ifndef RE2_RE2_H_
6*ccdc9c3eSSadaf Ebrahimi #define RE2_RE2_H_
7*ccdc9c3eSSadaf Ebrahimi
8*ccdc9c3eSSadaf Ebrahimi // C++ interface to the re2 regular-expression library.
9*ccdc9c3eSSadaf Ebrahimi // RE2 supports Perl-style regular expressions (with extensions like
10*ccdc9c3eSSadaf Ebrahimi // \d, \w, \s, ...).
11*ccdc9c3eSSadaf Ebrahimi //
12*ccdc9c3eSSadaf Ebrahimi // -----------------------------------------------------------------------
13*ccdc9c3eSSadaf Ebrahimi // REGEXP SYNTAX:
14*ccdc9c3eSSadaf Ebrahimi //
15*ccdc9c3eSSadaf Ebrahimi // This module uses the re2 library and hence supports
16*ccdc9c3eSSadaf Ebrahimi // its syntax for regular expressions, which is similar to Perl's with
17*ccdc9c3eSSadaf Ebrahimi // some of the more complicated things thrown away. In particular,
18*ccdc9c3eSSadaf Ebrahimi // backreferences and generalized assertions are not available, nor is \Z.
19*ccdc9c3eSSadaf Ebrahimi //
20*ccdc9c3eSSadaf Ebrahimi // See https://github.com/google/re2/wiki/Syntax for the syntax
21*ccdc9c3eSSadaf Ebrahimi // supported by RE2, and a comparison with PCRE and PERL regexps.
22*ccdc9c3eSSadaf Ebrahimi //
23*ccdc9c3eSSadaf Ebrahimi // For those not familiar with Perl's regular expressions,
24*ccdc9c3eSSadaf Ebrahimi // here are some examples of the most commonly used extensions:
25*ccdc9c3eSSadaf Ebrahimi //
26*ccdc9c3eSSadaf Ebrahimi // "hello (\\w+) world" -- \w matches a "word" character
27*ccdc9c3eSSadaf Ebrahimi // "version (\\d+)" -- \d matches a digit
28*ccdc9c3eSSadaf Ebrahimi // "hello\\s+world" -- \s matches any whitespace character
//   "\\b(\\w+)\\b"       -- \b matches the empty string at a word boundary
30*ccdc9c3eSSadaf Ebrahimi // "(?i)hello" -- (?i) turns on case-insensitive matching
31*ccdc9c3eSSadaf Ebrahimi // "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
32*ccdc9c3eSSadaf Ebrahimi //
33*ccdc9c3eSSadaf Ebrahimi // -----------------------------------------------------------------------
34*ccdc9c3eSSadaf Ebrahimi // MATCHING INTERFACE:
35*ccdc9c3eSSadaf Ebrahimi //
36*ccdc9c3eSSadaf Ebrahimi // The "FullMatch" operation checks that supplied text matches a
37*ccdc9c3eSSadaf Ebrahimi // supplied pattern exactly.
38*ccdc9c3eSSadaf Ebrahimi //
39*ccdc9c3eSSadaf Ebrahimi // Example: successful match
40*ccdc9c3eSSadaf Ebrahimi // CHECK(RE2::FullMatch("hello", "h.*o"));
41*ccdc9c3eSSadaf Ebrahimi //
42*ccdc9c3eSSadaf Ebrahimi // Example: unsuccessful match (requires full match):
43*ccdc9c3eSSadaf Ebrahimi // CHECK(!RE2::FullMatch("hello", "e"));
44*ccdc9c3eSSadaf Ebrahimi //
45*ccdc9c3eSSadaf Ebrahimi // -----------------------------------------------------------------------
46*ccdc9c3eSSadaf Ebrahimi // UTF-8 AND THE MATCHING INTERFACE:
47*ccdc9c3eSSadaf Ebrahimi //
48*ccdc9c3eSSadaf Ebrahimi // By default, the pattern and input text are interpreted as UTF-8.
49*ccdc9c3eSSadaf Ebrahimi // The RE2::Latin1 option causes them to be interpreted as Latin-1.
50*ccdc9c3eSSadaf Ebrahimi //
51*ccdc9c3eSSadaf Ebrahimi // Example:
52*ccdc9c3eSSadaf Ebrahimi // CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
53*ccdc9c3eSSadaf Ebrahimi // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
54*ccdc9c3eSSadaf Ebrahimi //
55*ccdc9c3eSSadaf Ebrahimi // -----------------------------------------------------------------------
56*ccdc9c3eSSadaf Ebrahimi // MATCHING WITH SUBSTRING EXTRACTION:
57*ccdc9c3eSSadaf Ebrahimi //
58*ccdc9c3eSSadaf Ebrahimi // You can supply extra pointer arguments to extract matched substrings.
59*ccdc9c3eSSadaf Ebrahimi // On match failure, none of the pointees will have been modified.
60*ccdc9c3eSSadaf Ebrahimi // On match success, the substrings will be converted (as necessary) and
61*ccdc9c3eSSadaf Ebrahimi // their values will be assigned to their pointees until all conversions
62*ccdc9c3eSSadaf Ebrahimi // have succeeded or one conversion has failed.
63*ccdc9c3eSSadaf Ebrahimi // On conversion failure, the pointees will be in an indeterminate state
64*ccdc9c3eSSadaf Ebrahimi // because the caller has no way of knowing which conversion failed.
65*ccdc9c3eSSadaf Ebrahimi // However, conversion cannot fail for types like string and StringPiece
66*ccdc9c3eSSadaf Ebrahimi // that do not inspect the substring contents. Hence, in the common case
67*ccdc9c3eSSadaf Ebrahimi // where all of the pointees are of such types, failure is always due to
68*ccdc9c3eSSadaf Ebrahimi // match failure and thus none of the pointees will have been modified.
69*ccdc9c3eSSadaf Ebrahimi //
70*ccdc9c3eSSadaf Ebrahimi // Example: extracts "ruby" into "s" and 1234 into "i"
71*ccdc9c3eSSadaf Ebrahimi // int i;
72*ccdc9c3eSSadaf Ebrahimi // string s;
73*ccdc9c3eSSadaf Ebrahimi // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
74*ccdc9c3eSSadaf Ebrahimi //
75*ccdc9c3eSSadaf Ebrahimi // Example: fails because string cannot be stored in integer
76*ccdc9c3eSSadaf Ebrahimi // CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
77*ccdc9c3eSSadaf Ebrahimi //
78*ccdc9c3eSSadaf Ebrahimi // Example: fails because there aren't enough sub-patterns
79*ccdc9c3eSSadaf Ebrahimi // CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
80*ccdc9c3eSSadaf Ebrahimi //
81*ccdc9c3eSSadaf Ebrahimi // Example: does not try to extract any extra sub-patterns
82*ccdc9c3eSSadaf Ebrahimi // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
83*ccdc9c3eSSadaf Ebrahimi //
84*ccdc9c3eSSadaf Ebrahimi // Example: does not try to extract into NULL
85*ccdc9c3eSSadaf Ebrahimi // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
86*ccdc9c3eSSadaf Ebrahimi //
87*ccdc9c3eSSadaf Ebrahimi // Example: integer overflow causes failure
88*ccdc9c3eSSadaf Ebrahimi // CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
89*ccdc9c3eSSadaf Ebrahimi //
90*ccdc9c3eSSadaf Ebrahimi // NOTE(rsc): Asking for substrings slows successful matches quite a bit.
91*ccdc9c3eSSadaf Ebrahimi // This may get a little faster in the future, but right now is slower
92*ccdc9c3eSSadaf Ebrahimi // than PCRE. On the other hand, failed matches run *very* fast (faster
93*ccdc9c3eSSadaf Ebrahimi // than PCRE), as do matches without substring extraction.
94*ccdc9c3eSSadaf Ebrahimi //
95*ccdc9c3eSSadaf Ebrahimi // -----------------------------------------------------------------------
96*ccdc9c3eSSadaf Ebrahimi // PARTIAL MATCHES
97*ccdc9c3eSSadaf Ebrahimi //
98*ccdc9c3eSSadaf Ebrahimi // You can use the "PartialMatch" operation when you want the pattern
99*ccdc9c3eSSadaf Ebrahimi // to match any substring of the text.
100*ccdc9c3eSSadaf Ebrahimi //
101*ccdc9c3eSSadaf Ebrahimi // Example: simple search for a string:
102*ccdc9c3eSSadaf Ebrahimi // CHECK(RE2::PartialMatch("hello", "ell"));
103*ccdc9c3eSSadaf Ebrahimi //
104*ccdc9c3eSSadaf Ebrahimi // Example: find first number in a string
105*ccdc9c3eSSadaf Ebrahimi // int number;
106*ccdc9c3eSSadaf Ebrahimi // CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
107*ccdc9c3eSSadaf Ebrahimi // CHECK_EQ(number, 100);
108*ccdc9c3eSSadaf Ebrahimi //
109*ccdc9c3eSSadaf Ebrahimi // -----------------------------------------------------------------------
110*ccdc9c3eSSadaf Ebrahimi // PRE-COMPILED REGULAR EXPRESSIONS
111*ccdc9c3eSSadaf Ebrahimi //
112*ccdc9c3eSSadaf Ebrahimi // RE2 makes it easy to use any string as a regular expression, without
113*ccdc9c3eSSadaf Ebrahimi // requiring a separate compilation step.
114*ccdc9c3eSSadaf Ebrahimi //
115*ccdc9c3eSSadaf Ebrahimi // If speed is of the essence, you can create a pre-compiled "RE2"
116*ccdc9c3eSSadaf Ebrahimi // object from the pattern and use it multiple times. If you do so,
117*ccdc9c3eSSadaf Ebrahimi // you can typically parse text faster than with sscanf.
118*ccdc9c3eSSadaf Ebrahimi //
119*ccdc9c3eSSadaf Ebrahimi // Example: precompile pattern for faster matching:
120*ccdc9c3eSSadaf Ebrahimi // RE2 pattern("h.*o");
121*ccdc9c3eSSadaf Ebrahimi // while (ReadLine(&str)) {
122*ccdc9c3eSSadaf Ebrahimi // if (RE2::FullMatch(str, pattern)) ...;
123*ccdc9c3eSSadaf Ebrahimi // }
124*ccdc9c3eSSadaf Ebrahimi //
125*ccdc9c3eSSadaf Ebrahimi // -----------------------------------------------------------------------
126*ccdc9c3eSSadaf Ebrahimi // SCANNING TEXT INCREMENTALLY
127*ccdc9c3eSSadaf Ebrahimi //
128*ccdc9c3eSSadaf Ebrahimi // The "Consume" operation may be useful if you want to repeatedly
129*ccdc9c3eSSadaf Ebrahimi // match regular expressions at the front of a string and skip over
130*ccdc9c3eSSadaf Ebrahimi // them as they match. This requires use of the "StringPiece" type,
131*ccdc9c3eSSadaf Ebrahimi // which represents a sub-range of a real string.
132*ccdc9c3eSSadaf Ebrahimi //
133*ccdc9c3eSSadaf Ebrahimi // Example: read lines of the form "var = value" from a string.
134*ccdc9c3eSSadaf Ebrahimi // string contents = ...; // Fill string somehow
135*ccdc9c3eSSadaf Ebrahimi // StringPiece input(contents); // Wrap a StringPiece around it
136*ccdc9c3eSSadaf Ebrahimi //
137*ccdc9c3eSSadaf Ebrahimi // string var;
138*ccdc9c3eSSadaf Ebrahimi // int value;
139*ccdc9c3eSSadaf Ebrahimi // while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
140*ccdc9c3eSSadaf Ebrahimi // ...;
141*ccdc9c3eSSadaf Ebrahimi // }
142*ccdc9c3eSSadaf Ebrahimi //
143*ccdc9c3eSSadaf Ebrahimi // Each successful call to "Consume" will set "var/value", and also
144*ccdc9c3eSSadaf Ebrahimi // advance "input" so it points past the matched text. Note that if the
145*ccdc9c3eSSadaf Ebrahimi // regular expression matches an empty string, input will advance
146*ccdc9c3eSSadaf Ebrahimi // by 0 bytes. If the regular expression being used might match
147*ccdc9c3eSSadaf Ebrahimi // an empty string, the loop body must check for this case and either
148*ccdc9c3eSSadaf Ebrahimi // advance the string or break out of the loop.
149*ccdc9c3eSSadaf Ebrahimi //
150*ccdc9c3eSSadaf Ebrahimi // The "FindAndConsume" operation is similar to "Consume" but does not
151*ccdc9c3eSSadaf Ebrahimi // anchor your match at the beginning of the string. For example, you
152*ccdc9c3eSSadaf Ebrahimi // could extract all words from a string by repeatedly calling
153*ccdc9c3eSSadaf Ebrahimi // RE2::FindAndConsume(&input, "(\\w+)", &word)
154*ccdc9c3eSSadaf Ebrahimi //
155*ccdc9c3eSSadaf Ebrahimi // -----------------------------------------------------------------------
156*ccdc9c3eSSadaf Ebrahimi // USING VARIABLE NUMBER OF ARGUMENTS
157*ccdc9c3eSSadaf Ebrahimi //
158*ccdc9c3eSSadaf Ebrahimi // The above operations require you to know the number of arguments
159*ccdc9c3eSSadaf Ebrahimi // when you write the code. This is not always possible or easy (for
160*ccdc9c3eSSadaf Ebrahimi // example, the regular expression may be calculated at run time).
// You can use the "N" version of the operations when the number of
// match arguments is determined at run time.
163*ccdc9c3eSSadaf Ebrahimi //
164*ccdc9c3eSSadaf Ebrahimi // Example:
165*ccdc9c3eSSadaf Ebrahimi // const RE2::Arg* args[10];
166*ccdc9c3eSSadaf Ebrahimi // int n;
167*ccdc9c3eSSadaf Ebrahimi // // ... populate args with pointers to RE2::Arg values ...
168*ccdc9c3eSSadaf Ebrahimi // // ... set n to the number of RE2::Arg objects ...
169*ccdc9c3eSSadaf Ebrahimi // bool match = RE2::FullMatchN(input, pattern, args, n);
170*ccdc9c3eSSadaf Ebrahimi //
171*ccdc9c3eSSadaf Ebrahimi // The last statement is equivalent to
172*ccdc9c3eSSadaf Ebrahimi //
173*ccdc9c3eSSadaf Ebrahimi // bool match = RE2::FullMatch(input, pattern,
174*ccdc9c3eSSadaf Ebrahimi // *args[0], *args[1], ..., *args[n - 1]);
175*ccdc9c3eSSadaf Ebrahimi //
176*ccdc9c3eSSadaf Ebrahimi // -----------------------------------------------------------------------
177*ccdc9c3eSSadaf Ebrahimi // PARSING HEX/OCTAL/C-RADIX NUMBERS
178*ccdc9c3eSSadaf Ebrahimi //
179*ccdc9c3eSSadaf Ebrahimi // By default, if you pass a pointer to a numeric value, the
180*ccdc9c3eSSadaf Ebrahimi // corresponding text is interpreted as a base-10 number. You can
181*ccdc9c3eSSadaf Ebrahimi // instead wrap the pointer with a call to one of the operators Hex(),
182*ccdc9c3eSSadaf Ebrahimi // Octal(), or CRadix() to interpret the text in another base. The
183*ccdc9c3eSSadaf Ebrahimi // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
184*ccdc9c3eSSadaf Ebrahimi // prefixes, but defaults to base-10.
185*ccdc9c3eSSadaf Ebrahimi //
186*ccdc9c3eSSadaf Ebrahimi // Example:
187*ccdc9c3eSSadaf Ebrahimi // int a, b, c, d;
//   CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
//         RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
// will leave 64 in a, b, c, and d.
191*ccdc9c3eSSadaf Ebrahimi
192*ccdc9c3eSSadaf Ebrahimi #include <stddef.h>
193*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
194*ccdc9c3eSSadaf Ebrahimi #include <algorithm>
195*ccdc9c3eSSadaf Ebrahimi #include <map>
196*ccdc9c3eSSadaf Ebrahimi #include <mutex>
197*ccdc9c3eSSadaf Ebrahimi #include <string>
198*ccdc9c3eSSadaf Ebrahimi
199*ccdc9c3eSSadaf Ebrahimi #include "re2/stringpiece.h"
200*ccdc9c3eSSadaf Ebrahimi
201*ccdc9c3eSSadaf Ebrahimi namespace re2 {
202*ccdc9c3eSSadaf Ebrahimi class Prog;
203*ccdc9c3eSSadaf Ebrahimi class Regexp;
204*ccdc9c3eSSadaf Ebrahimi } // namespace re2
205*ccdc9c3eSSadaf Ebrahimi
206*ccdc9c3eSSadaf Ebrahimi namespace re2 {
207*ccdc9c3eSSadaf Ebrahimi
208*ccdc9c3eSSadaf Ebrahimi // TODO(junyer): Get rid of this.
209*ccdc9c3eSSadaf Ebrahimi using std::string;
210*ccdc9c3eSSadaf Ebrahimi
211*ccdc9c3eSSadaf Ebrahimi // Interface for regular expression matching. Also corresponds to a
212*ccdc9c3eSSadaf Ebrahimi // pre-compiled regular expression. An "RE2" object is safe for
213*ccdc9c3eSSadaf Ebrahimi // concurrent use by multiple threads.
214*ccdc9c3eSSadaf Ebrahimi class RE2 {
215*ccdc9c3eSSadaf Ebrahimi public:
216*ccdc9c3eSSadaf Ebrahimi // We convert user-passed pointers into special Arg objects
217*ccdc9c3eSSadaf Ebrahimi class Arg;
218*ccdc9c3eSSadaf Ebrahimi class Options;
219*ccdc9c3eSSadaf Ebrahimi
220*ccdc9c3eSSadaf Ebrahimi // Defined in set.h.
221*ccdc9c3eSSadaf Ebrahimi class Set;
222*ccdc9c3eSSadaf Ebrahimi
// Status codes describing why an RE2 could not be created.
// NoError (== 0) means the regexp was created properly; see error_code().
enum ErrorCode {
  NoError = 0,

  // Unexpected error
  ErrorInternal,

  // Parse errors
  ErrorBadEscape,          // bad escape sequence
  ErrorBadCharClass,       // bad character class
  ErrorBadCharRange,       // bad character class range
  ErrorMissingBracket,     // missing closing ]
  ErrorMissingParen,       // missing closing )
  ErrorTrailingBackslash,  // trailing \ at end of regexp
  ErrorRepeatArgument,     // repeat argument missing, e.g. "*"
  ErrorRepeatSize,         // bad repetition argument
  ErrorRepeatOp,           // bad repetition operator
  ErrorBadPerlOp,          // bad perl operator
  ErrorBadUTF8,            // invalid UTF-8 in regexp
  ErrorBadNamedCapture,    // bad named capture group
  ErrorPatternTooLarge     // pattern too large (compile failed)
};
244*ccdc9c3eSSadaf Ebrahimi
// Predefined common options.
// If you need more complicated things, instantiate
// an Options class, possibly passing one of these to
// the Options constructor, change the settings, and pass that
// Options object to the RE2 constructor.
enum CannedOptions {
  DefaultOptions = 0,
  Latin1,  // treat input as Latin-1 (default UTF-8)
  POSIX,   // POSIX syntax, leftmost-longest match
  Quiet    // do not log about regexp parse errors
};
256*ccdc9c3eSSadaf Ebrahimi
257*ccdc9c3eSSadaf Ebrahimi // Need to have the const char* and const string& forms for implicit
258*ccdc9c3eSSadaf Ebrahimi // conversions when passing string literals to FullMatch and PartialMatch.
259*ccdc9c3eSSadaf Ebrahimi // Otherwise the StringPiece form would be sufficient.
260*ccdc9c3eSSadaf Ebrahimi #ifndef SWIG
261*ccdc9c3eSSadaf Ebrahimi RE2(const char* pattern);
262*ccdc9c3eSSadaf Ebrahimi RE2(const string& pattern);
263*ccdc9c3eSSadaf Ebrahimi #endif
264*ccdc9c3eSSadaf Ebrahimi RE2(const StringPiece& pattern);
265*ccdc9c3eSSadaf Ebrahimi RE2(const StringPiece& pattern, const Options& options);
266*ccdc9c3eSSadaf Ebrahimi ~RE2();
267*ccdc9c3eSSadaf Ebrahimi
268*ccdc9c3eSSadaf Ebrahimi // Returns whether RE2 was created properly.
ok()269*ccdc9c3eSSadaf Ebrahimi bool ok() const { return error_code() == NoError; }
270*ccdc9c3eSSadaf Ebrahimi
271*ccdc9c3eSSadaf Ebrahimi // The string specification for this RE2. E.g.
272*ccdc9c3eSSadaf Ebrahimi // RE2 re("ab*c?d+");
273*ccdc9c3eSSadaf Ebrahimi // re.pattern(); // "ab*c?d+"
pattern()274*ccdc9c3eSSadaf Ebrahimi const string& pattern() const { return pattern_; }
275*ccdc9c3eSSadaf Ebrahimi
276*ccdc9c3eSSadaf Ebrahimi // If RE2 could not be created properly, returns an error string.
277*ccdc9c3eSSadaf Ebrahimi // Else returns the empty string.
error()278*ccdc9c3eSSadaf Ebrahimi const string& error() const { return *error_; }
279*ccdc9c3eSSadaf Ebrahimi
280*ccdc9c3eSSadaf Ebrahimi // If RE2 could not be created properly, returns an error code.
281*ccdc9c3eSSadaf Ebrahimi // Else returns RE2::NoError (== 0).
error_code()282*ccdc9c3eSSadaf Ebrahimi ErrorCode error_code() const { return error_code_; }
283*ccdc9c3eSSadaf Ebrahimi
284*ccdc9c3eSSadaf Ebrahimi // If RE2 could not be created properly, returns the offending
285*ccdc9c3eSSadaf Ebrahimi // portion of the regexp.
error_arg()286*ccdc9c3eSSadaf Ebrahimi const string& error_arg() const { return error_arg_; }
287*ccdc9c3eSSadaf Ebrahimi
288*ccdc9c3eSSadaf Ebrahimi // Returns the program size, a very approximate measure of a regexp's "cost".
289*ccdc9c3eSSadaf Ebrahimi // Larger numbers are more expensive than smaller numbers.
290*ccdc9c3eSSadaf Ebrahimi int ProgramSize() const;
291*ccdc9c3eSSadaf Ebrahimi int ReverseProgramSize() const;
292*ccdc9c3eSSadaf Ebrahimi
293*ccdc9c3eSSadaf Ebrahimi // EXPERIMENTAL! SUBJECT TO CHANGE!
294*ccdc9c3eSSadaf Ebrahimi // Outputs the program fanout as a histogram bucketed by powers of 2.
295*ccdc9c3eSSadaf Ebrahimi // Returns the number of the largest non-empty bucket.
296*ccdc9c3eSSadaf Ebrahimi int ProgramFanout(std::map<int, int>* histogram) const;
297*ccdc9c3eSSadaf Ebrahimi int ReverseProgramFanout(std::map<int, int>* histogram) const;
298*ccdc9c3eSSadaf Ebrahimi
299*ccdc9c3eSSadaf Ebrahimi // Returns the underlying Regexp; not for general use.
300*ccdc9c3eSSadaf Ebrahimi // Returns entire_regexp_ so that callers don't need
301*ccdc9c3eSSadaf Ebrahimi // to know about prefix_ and prefix_foldcase_.
Regexp()302*ccdc9c3eSSadaf Ebrahimi re2::Regexp* Regexp() const { return entire_regexp_; }
303*ccdc9c3eSSadaf Ebrahimi
304*ccdc9c3eSSadaf Ebrahimi /***** The array-based matching interface ******/
305*ccdc9c3eSSadaf Ebrahimi
306*ccdc9c3eSSadaf Ebrahimi // The functions here have names ending in 'N' and are used to implement
307*ccdc9c3eSSadaf Ebrahimi // the functions whose names are the prefix before the 'N'. It is sometimes
308*ccdc9c3eSSadaf Ebrahimi // useful to invoke them directly, but the syntax is awkward, so the 'N'-less
309*ccdc9c3eSSadaf Ebrahimi // versions should be preferred.
310*ccdc9c3eSSadaf Ebrahimi static bool FullMatchN(const StringPiece& text, const RE2& re,
311*ccdc9c3eSSadaf Ebrahimi const Arg* const args[], int argc);
312*ccdc9c3eSSadaf Ebrahimi static bool PartialMatchN(const StringPiece& text, const RE2& re,
313*ccdc9c3eSSadaf Ebrahimi const Arg* const args[], int argc);
314*ccdc9c3eSSadaf Ebrahimi static bool ConsumeN(StringPiece* input, const RE2& re,
315*ccdc9c3eSSadaf Ebrahimi const Arg* const args[], int argc);
316*ccdc9c3eSSadaf Ebrahimi static bool FindAndConsumeN(StringPiece* input, const RE2& re,
317*ccdc9c3eSSadaf Ebrahimi const Arg* const args[], int argc);
318*ccdc9c3eSSadaf Ebrahimi
319*ccdc9c3eSSadaf Ebrahimi #ifndef SWIG
320*ccdc9c3eSSadaf Ebrahimi private:
321*ccdc9c3eSSadaf Ebrahimi template <typename F, typename SP>
Apply(F f,SP sp,const RE2 & re)322*ccdc9c3eSSadaf Ebrahimi static inline bool Apply(F f, SP sp, const RE2& re) {
323*ccdc9c3eSSadaf Ebrahimi return f(sp, re, NULL, 0);
324*ccdc9c3eSSadaf Ebrahimi }
325*ccdc9c3eSSadaf Ebrahimi
326*ccdc9c3eSSadaf Ebrahimi template <typename F, typename SP, typename... A>
Apply(F f,SP sp,const RE2 & re,const A &...a)327*ccdc9c3eSSadaf Ebrahimi static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) {
328*ccdc9c3eSSadaf Ebrahimi const Arg* const args[] = {&a...};
329*ccdc9c3eSSadaf Ebrahimi const int argc = sizeof...(a);
330*ccdc9c3eSSadaf Ebrahimi return f(sp, re, args, argc);
331*ccdc9c3eSSadaf Ebrahimi }
332*ccdc9c3eSSadaf Ebrahimi
333*ccdc9c3eSSadaf Ebrahimi public:
334*ccdc9c3eSSadaf Ebrahimi // In order to allow FullMatch() et al. to be called with a varying number
335*ccdc9c3eSSadaf Ebrahimi // of arguments of varying types, we use two layers of variadic templates.
336*ccdc9c3eSSadaf Ebrahimi // The first layer constructs the temporary Arg objects. The second layer
337*ccdc9c3eSSadaf Ebrahimi // (above) constructs the array of pointers to the temporary Arg objects.
338*ccdc9c3eSSadaf Ebrahimi
339*ccdc9c3eSSadaf Ebrahimi /***** The useful part: the matching interface *****/
340*ccdc9c3eSSadaf Ebrahimi
341*ccdc9c3eSSadaf Ebrahimi // Matches "text" against "re". If pointer arguments are
342*ccdc9c3eSSadaf Ebrahimi // supplied, copies matched sub-patterns into them.
343*ccdc9c3eSSadaf Ebrahimi //
344*ccdc9c3eSSadaf Ebrahimi // You can pass in a "const char*" or a "string" for "text".
345*ccdc9c3eSSadaf Ebrahimi // You can pass in a "const char*" or a "string" or a "RE2" for "re".
346*ccdc9c3eSSadaf Ebrahimi //
347*ccdc9c3eSSadaf Ebrahimi // The provided pointer arguments can be pointers to any scalar numeric
348*ccdc9c3eSSadaf Ebrahimi // type, or one of:
349*ccdc9c3eSSadaf Ebrahimi // string (matched piece is copied to string)
350*ccdc9c3eSSadaf Ebrahimi // StringPiece (StringPiece is mutated to point to matched piece)
351*ccdc9c3eSSadaf Ebrahimi // T (where "bool T::ParseFrom(const char*, size_t)" exists)
352*ccdc9c3eSSadaf Ebrahimi // (void*)NULL (the corresponding matched sub-pattern is not copied)
353*ccdc9c3eSSadaf Ebrahimi //
354*ccdc9c3eSSadaf Ebrahimi // Returns true iff all of the following conditions are satisfied:
355*ccdc9c3eSSadaf Ebrahimi // a. "text" matches "re" exactly
356*ccdc9c3eSSadaf Ebrahimi // b. The number of matched sub-patterns is >= number of supplied pointers
357*ccdc9c3eSSadaf Ebrahimi // c. The "i"th argument has a suitable type for holding the
358*ccdc9c3eSSadaf Ebrahimi // string captured as the "i"th sub-pattern. If you pass in
359*ccdc9c3eSSadaf Ebrahimi // NULL for the "i"th argument, or pass fewer arguments than
360*ccdc9c3eSSadaf Ebrahimi // number of sub-patterns, "i"th captured sub-pattern is
361*ccdc9c3eSSadaf Ebrahimi // ignored.
362*ccdc9c3eSSadaf Ebrahimi //
363*ccdc9c3eSSadaf Ebrahimi // CAVEAT: An optional sub-pattern that does not exist in the
364*ccdc9c3eSSadaf Ebrahimi // matched string is assigned the empty string. Therefore, the
365*ccdc9c3eSSadaf Ebrahimi // following will return false (because the empty string is not a
366*ccdc9c3eSSadaf Ebrahimi // valid number):
367*ccdc9c3eSSadaf Ebrahimi // int number;
368*ccdc9c3eSSadaf Ebrahimi // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
369*ccdc9c3eSSadaf Ebrahimi template <typename... A>
FullMatch(const StringPiece & text,const RE2 & re,A &&...a)370*ccdc9c3eSSadaf Ebrahimi static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) {
371*ccdc9c3eSSadaf Ebrahimi return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
372*ccdc9c3eSSadaf Ebrahimi }
373*ccdc9c3eSSadaf Ebrahimi
374*ccdc9c3eSSadaf Ebrahimi // Exactly like FullMatch(), except that "re" is allowed to match
375*ccdc9c3eSSadaf Ebrahimi // a substring of "text".
376*ccdc9c3eSSadaf Ebrahimi template <typename... A>
PartialMatch(const StringPiece & text,const RE2 & re,A &&...a)377*ccdc9c3eSSadaf Ebrahimi static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
378*ccdc9c3eSSadaf Ebrahimi return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
379*ccdc9c3eSSadaf Ebrahimi }
380*ccdc9c3eSSadaf Ebrahimi
381*ccdc9c3eSSadaf Ebrahimi // Like FullMatch() and PartialMatch(), except that "re" has to match
382*ccdc9c3eSSadaf Ebrahimi // a prefix of the text, and "input" is advanced past the matched
383*ccdc9c3eSSadaf Ebrahimi // text. Note: "input" is modified iff this routine returns true.
384*ccdc9c3eSSadaf Ebrahimi template <typename... A>
Consume(StringPiece * input,const RE2 & re,A &&...a)385*ccdc9c3eSSadaf Ebrahimi static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
386*ccdc9c3eSSadaf Ebrahimi return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
387*ccdc9c3eSSadaf Ebrahimi }
388*ccdc9c3eSSadaf Ebrahimi
389*ccdc9c3eSSadaf Ebrahimi // Like Consume(), but does not anchor the match at the beginning of
390*ccdc9c3eSSadaf Ebrahimi // the text. That is, "re" need not start its match at the beginning
391*ccdc9c3eSSadaf Ebrahimi // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds
392*ccdc9c3eSSadaf Ebrahimi // the next word in "s" and stores it in "word".
393*ccdc9c3eSSadaf Ebrahimi template <typename... A>
FindAndConsume(StringPiece * input,const RE2 & re,A &&...a)394*ccdc9c3eSSadaf Ebrahimi static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
395*ccdc9c3eSSadaf Ebrahimi return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
396*ccdc9c3eSSadaf Ebrahimi }
397*ccdc9c3eSSadaf Ebrahimi #endif
398*ccdc9c3eSSadaf Ebrahimi
399*ccdc9c3eSSadaf Ebrahimi // Replace the first match of "re" in "str" with "rewrite".
400*ccdc9c3eSSadaf Ebrahimi // Within "rewrite", backslash-escaped digits (\1 to \9) can be
401*ccdc9c3eSSadaf Ebrahimi // used to insert text matching corresponding parenthesized group
402*ccdc9c3eSSadaf Ebrahimi // from the pattern. \0 in "rewrite" refers to the entire matching
403*ccdc9c3eSSadaf Ebrahimi // text. E.g.,
404*ccdc9c3eSSadaf Ebrahimi //
405*ccdc9c3eSSadaf Ebrahimi // string s = "yabba dabba doo";
406*ccdc9c3eSSadaf Ebrahimi // CHECK(RE2::Replace(&s, "b+", "d"));
407*ccdc9c3eSSadaf Ebrahimi //
408*ccdc9c3eSSadaf Ebrahimi // will leave "s" containing "yada dabba doo"
409*ccdc9c3eSSadaf Ebrahimi //
410*ccdc9c3eSSadaf Ebrahimi // Returns true if the pattern matches and a replacement occurs,
411*ccdc9c3eSSadaf Ebrahimi // false otherwise.
412*ccdc9c3eSSadaf Ebrahimi static bool Replace(string* str,
413*ccdc9c3eSSadaf Ebrahimi const RE2& re,
414*ccdc9c3eSSadaf Ebrahimi const StringPiece& rewrite);
415*ccdc9c3eSSadaf Ebrahimi
416*ccdc9c3eSSadaf Ebrahimi // Like Replace(), except replaces successive non-overlapping occurrences
417*ccdc9c3eSSadaf Ebrahimi // of the pattern in the string with the rewrite. E.g.
418*ccdc9c3eSSadaf Ebrahimi //
419*ccdc9c3eSSadaf Ebrahimi // string s = "yabba dabba doo";
420*ccdc9c3eSSadaf Ebrahimi // CHECK(RE2::GlobalReplace(&s, "b+", "d"));
421*ccdc9c3eSSadaf Ebrahimi //
422*ccdc9c3eSSadaf Ebrahimi // will leave "s" containing "yada dada doo"
423*ccdc9c3eSSadaf Ebrahimi // Replacements are not subject to re-matching.
424*ccdc9c3eSSadaf Ebrahimi //
425*ccdc9c3eSSadaf Ebrahimi // Because GlobalReplace only replaces non-overlapping matches,
426*ccdc9c3eSSadaf Ebrahimi // replacing "ana" within "banana" makes only one replacement, not two.
427*ccdc9c3eSSadaf Ebrahimi //
428*ccdc9c3eSSadaf Ebrahimi // Returns the number of replacements made.
429*ccdc9c3eSSadaf Ebrahimi static int GlobalReplace(string* str,
430*ccdc9c3eSSadaf Ebrahimi const RE2& re,
431*ccdc9c3eSSadaf Ebrahimi const StringPiece& rewrite);
432*ccdc9c3eSSadaf Ebrahimi
433*ccdc9c3eSSadaf Ebrahimi // Like Replace, except that if the pattern matches, "rewrite"
434*ccdc9c3eSSadaf Ebrahimi // is copied into "out" with substitutions. The non-matching
435*ccdc9c3eSSadaf Ebrahimi // portions of "text" are ignored.
436*ccdc9c3eSSadaf Ebrahimi //
437*ccdc9c3eSSadaf Ebrahimi // Returns true iff a match occurred and the extraction happened
438*ccdc9c3eSSadaf Ebrahimi // successfully; if no match occurs, the string is left unaffected.
439*ccdc9c3eSSadaf Ebrahimi //
440*ccdc9c3eSSadaf Ebrahimi // REQUIRES: "text" must not alias any part of "*out".
441*ccdc9c3eSSadaf Ebrahimi static bool Extract(const StringPiece& text,
442*ccdc9c3eSSadaf Ebrahimi const RE2& re,
443*ccdc9c3eSSadaf Ebrahimi const StringPiece& rewrite,
444*ccdc9c3eSSadaf Ebrahimi string* out);
445*ccdc9c3eSSadaf Ebrahimi
446*ccdc9c3eSSadaf Ebrahimi // Escapes all potentially meaningful regexp characters in
447*ccdc9c3eSSadaf Ebrahimi // 'unquoted'. The returned string, used as a regular expression,
448*ccdc9c3eSSadaf Ebrahimi // will exactly match the original string. For example,
449*ccdc9c3eSSadaf Ebrahimi // 1.5-2.0?
450*ccdc9c3eSSadaf Ebrahimi // may become:
451*ccdc9c3eSSadaf Ebrahimi // 1\.5\-2\.0\?
452*ccdc9c3eSSadaf Ebrahimi static string QuoteMeta(const StringPiece& unquoted);
453*ccdc9c3eSSadaf Ebrahimi
454*ccdc9c3eSSadaf Ebrahimi // Computes range for any strings matching regexp. The min and max can in
455*ccdc9c3eSSadaf Ebrahimi // some cases be arbitrarily precise, so the caller gets to specify the
456*ccdc9c3eSSadaf Ebrahimi // maximum desired length of string returned.
457*ccdc9c3eSSadaf Ebrahimi //
458*ccdc9c3eSSadaf Ebrahimi // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
459*ccdc9c3eSSadaf Ebrahimi // string s that is an anchored match for this regexp satisfies
460*ccdc9c3eSSadaf Ebrahimi // min <= s && s <= max.
461*ccdc9c3eSSadaf Ebrahimi //
462*ccdc9c3eSSadaf Ebrahimi // Note that PossibleMatchRange() will only consider the first copy of an
463*ccdc9c3eSSadaf Ebrahimi // infinitely repeated element (i.e., any regexp element followed by a '*' or
464*ccdc9c3eSSadaf Ebrahimi // '+' operator). Regexps with "{N}" constructions are not affected, as those
465*ccdc9c3eSSadaf Ebrahimi // do not compile down to infinite repetitions.
466*ccdc9c3eSSadaf Ebrahimi //
467*ccdc9c3eSSadaf Ebrahimi // Returns true on success, false on error.
468*ccdc9c3eSSadaf Ebrahimi bool PossibleMatchRange(string* min, string* max, int maxlen) const;
469*ccdc9c3eSSadaf Ebrahimi
470*ccdc9c3eSSadaf Ebrahimi // Generic matching interface
471*ccdc9c3eSSadaf Ebrahimi
// Type of match: which ends of the text the match must be anchored to.
enum Anchor {
  UNANCHORED,    // No anchoring
  ANCHOR_START,  // Anchor at start only
  ANCHOR_BOTH    // Anchor at start and end
};
478*ccdc9c3eSSadaf Ebrahimi
479*ccdc9c3eSSadaf Ebrahimi // Return the number of capturing subpatterns, or -1 if the
480*ccdc9c3eSSadaf Ebrahimi // regexp wasn't valid on construction. The overall match ($0)
481*ccdc9c3eSSadaf Ebrahimi // does not count: if the regexp is "(a)(b)", returns 2.
NumberOfCapturingGroups()482*ccdc9c3eSSadaf Ebrahimi int NumberOfCapturingGroups() const { return num_captures_; }
483*ccdc9c3eSSadaf Ebrahimi
484*ccdc9c3eSSadaf Ebrahimi // Return a map from names to capturing indices.
485*ccdc9c3eSSadaf Ebrahimi // The map records the index of the leftmost group
486*ccdc9c3eSSadaf Ebrahimi // with the given name.
487*ccdc9c3eSSadaf Ebrahimi // Only valid until the re is deleted.
488*ccdc9c3eSSadaf Ebrahimi const std::map<string, int>& NamedCapturingGroups() const;
489*ccdc9c3eSSadaf Ebrahimi
490*ccdc9c3eSSadaf Ebrahimi // Return a map from capturing indices to names.
491*ccdc9c3eSSadaf Ebrahimi // The map has no entries for unnamed groups.
492*ccdc9c3eSSadaf Ebrahimi // Only valid until the re is deleted.
493*ccdc9c3eSSadaf Ebrahimi const std::map<int, string>& CapturingGroupNames() const;
494*ccdc9c3eSSadaf Ebrahimi
495*ccdc9c3eSSadaf Ebrahimi // General matching routine.
496*ccdc9c3eSSadaf Ebrahimi // Match against text starting at offset startpos
497*ccdc9c3eSSadaf Ebrahimi // and stopping the search at offset endpos.
498*ccdc9c3eSSadaf Ebrahimi // Returns true if match found, false if not.
499*ccdc9c3eSSadaf Ebrahimi // On a successful match, fills in submatch[] (up to nsubmatch entries)
500*ccdc9c3eSSadaf Ebrahimi // with information about submatches.
501*ccdc9c3eSSadaf Ebrahimi // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with
502*ccdc9c3eSSadaf Ebrahimi // submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar",
503*ccdc9c3eSSadaf Ebrahimi // submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL.
504*ccdc9c3eSSadaf Ebrahimi // Caveat: submatch[] may be clobbered even on match failure.
505*ccdc9c3eSSadaf Ebrahimi //
506*ccdc9c3eSSadaf Ebrahimi // Don't ask for more match information than you will use:
507*ccdc9c3eSSadaf Ebrahimi // runs much faster with nsubmatch == 1 than nsubmatch > 1, and
508*ccdc9c3eSSadaf Ebrahimi // runs even faster if nsubmatch == 0.
509*ccdc9c3eSSadaf Ebrahimi // Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(),
510*ccdc9c3eSSadaf Ebrahimi // but will be handled correctly.
511*ccdc9c3eSSadaf Ebrahimi //
512*ccdc9c3eSSadaf Ebrahimi // Passing text == StringPiece(NULL, 0) will be handled like any other
513*ccdc9c3eSSadaf Ebrahimi // empty string, but note that on return, it will not be possible to tell
514*ccdc9c3eSSadaf Ebrahimi // whether submatch i matched the empty string or did not match:
515*ccdc9c3eSSadaf Ebrahimi // either way, submatch[i].data() == NULL.
516*ccdc9c3eSSadaf Ebrahimi bool Match(const StringPiece& text,
517*ccdc9c3eSSadaf Ebrahimi size_t startpos,
518*ccdc9c3eSSadaf Ebrahimi size_t endpos,
519*ccdc9c3eSSadaf Ebrahimi Anchor re_anchor,
520*ccdc9c3eSSadaf Ebrahimi StringPiece* submatch,
521*ccdc9c3eSSadaf Ebrahimi int nsubmatch) const;
522*ccdc9c3eSSadaf Ebrahimi
523*ccdc9c3eSSadaf Ebrahimi // Check that the given rewrite string is suitable for use with this
524*ccdc9c3eSSadaf Ebrahimi // regular expression. It checks that:
525*ccdc9c3eSSadaf Ebrahimi // * The regular expression has enough parenthesized subexpressions
526*ccdc9c3eSSadaf Ebrahimi // to satisfy all of the \N tokens in rewrite
527*ccdc9c3eSSadaf Ebrahimi // * The rewrite string doesn't have any syntax errors. E.g.,
528*ccdc9c3eSSadaf Ebrahimi // '\' followed by anything other than a digit or '\'.
529*ccdc9c3eSSadaf Ebrahimi // A true return value guarantees that Replace() and Extract() won't
530*ccdc9c3eSSadaf Ebrahimi // fail because of a bad rewrite string.
531*ccdc9c3eSSadaf Ebrahimi bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
532*ccdc9c3eSSadaf Ebrahimi
533*ccdc9c3eSSadaf Ebrahimi // Returns the maximum submatch needed for the rewrite to be done by
534*ccdc9c3eSSadaf Ebrahimi // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
535*ccdc9c3eSSadaf Ebrahimi static int MaxSubmatch(const StringPiece& rewrite);
536*ccdc9c3eSSadaf Ebrahimi
537*ccdc9c3eSSadaf Ebrahimi // Append the "rewrite" string, with backslash subsitutions from "vec",
538*ccdc9c3eSSadaf Ebrahimi // to string "out".
539*ccdc9c3eSSadaf Ebrahimi // Returns true on success. This method can fail because of a malformed
540*ccdc9c3eSSadaf Ebrahimi // rewrite string. CheckRewriteString guarantees that the rewrite will
541*ccdc9c3eSSadaf Ebrahimi // be sucessful.
542*ccdc9c3eSSadaf Ebrahimi bool Rewrite(string* out,
543*ccdc9c3eSSadaf Ebrahimi const StringPiece& rewrite,
544*ccdc9c3eSSadaf Ebrahimi const StringPiece* vec,
545*ccdc9c3eSSadaf Ebrahimi int veclen) const;
546*ccdc9c3eSSadaf Ebrahimi
547*ccdc9c3eSSadaf Ebrahimi // Constructor options
548*ccdc9c3eSSadaf Ebrahimi class Options {
549*ccdc9c3eSSadaf Ebrahimi public:
550*ccdc9c3eSSadaf Ebrahimi // The options are (defaults in parentheses):
551*ccdc9c3eSSadaf Ebrahimi //
552*ccdc9c3eSSadaf Ebrahimi // utf8 (true) text and pattern are UTF-8; otherwise Latin-1
553*ccdc9c3eSSadaf Ebrahimi // posix_syntax (false) restrict regexps to POSIX egrep syntax
554*ccdc9c3eSSadaf Ebrahimi // longest_match (false) search for longest match, not first match
555*ccdc9c3eSSadaf Ebrahimi // log_errors (true) log syntax and execution errors to ERROR
556*ccdc9c3eSSadaf Ebrahimi // max_mem (see below) approx. max memory footprint of RE2
557*ccdc9c3eSSadaf Ebrahimi // literal (false) interpret string as literal, not regexp
558*ccdc9c3eSSadaf Ebrahimi // never_nl (false) never match \n, even if it is in regexp
559*ccdc9c3eSSadaf Ebrahimi // dot_nl (false) dot matches everything including new line
560*ccdc9c3eSSadaf Ebrahimi // never_capture (false) parse all parens as non-capturing
561*ccdc9c3eSSadaf Ebrahimi // case_sensitive (true) match is case-sensitive (regexp can override
562*ccdc9c3eSSadaf Ebrahimi // with (?i) unless in posix_syntax mode)
563*ccdc9c3eSSadaf Ebrahimi //
564*ccdc9c3eSSadaf Ebrahimi // The following options are only consulted when posix_syntax == true.
565*ccdc9c3eSSadaf Ebrahimi // When posix_syntax == false, these features are always enabled and
566*ccdc9c3eSSadaf Ebrahimi // cannot be turned off; to perform multi-line matching in that case,
567*ccdc9c3eSSadaf Ebrahimi // begin the regexp with (?m).
568*ccdc9c3eSSadaf Ebrahimi // perl_classes (false) allow Perl's \d \s \w \D \S \W
569*ccdc9c3eSSadaf Ebrahimi // word_boundary (false) allow Perl's \b \B (word boundary and not)
570*ccdc9c3eSSadaf Ebrahimi // one_line (false) ^ and $ only match beginning and end of text
571*ccdc9c3eSSadaf Ebrahimi //
572*ccdc9c3eSSadaf Ebrahimi // The max_mem option controls how much memory can be used
573*ccdc9c3eSSadaf Ebrahimi // to hold the compiled form of the regexp (the Prog) and
574*ccdc9c3eSSadaf Ebrahimi // its cached DFA graphs. Code Search placed limits on the number
575*ccdc9c3eSSadaf Ebrahimi // of Prog instructions and DFA states: 10,000 for both.
576*ccdc9c3eSSadaf Ebrahimi // In RE2, those limits would translate to about 240 KB per Prog
577*ccdc9c3eSSadaf Ebrahimi // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
578*ccdc9c3eSSadaf Ebrahimi // better job of keeping them small than Code Search did).
579*ccdc9c3eSSadaf Ebrahimi // Each RE2 has two Progs (one forward, one reverse), and each Prog
580*ccdc9c3eSSadaf Ebrahimi // can have two DFAs (one first match, one longest match).
581*ccdc9c3eSSadaf Ebrahimi // That makes 4 DFAs:
582*ccdc9c3eSSadaf Ebrahimi //
583*ccdc9c3eSSadaf Ebrahimi // forward, first-match - used for UNANCHORED or ANCHOR_START searches
584*ccdc9c3eSSadaf Ebrahimi // if opt.longest_match() == false
585*ccdc9c3eSSadaf Ebrahimi // forward, longest-match - used for all ANCHOR_BOTH searches,
586*ccdc9c3eSSadaf Ebrahimi // and the other two kinds if
587*ccdc9c3eSSadaf Ebrahimi // opt.longest_match() == true
588*ccdc9c3eSSadaf Ebrahimi // reverse, first-match - never used
589*ccdc9c3eSSadaf Ebrahimi // reverse, longest-match - used as second phase for unanchored searches
590*ccdc9c3eSSadaf Ebrahimi //
591*ccdc9c3eSSadaf Ebrahimi // The RE2 memory budget is statically divided between the two
592*ccdc9c3eSSadaf Ebrahimi // Progs and then the DFAs: two thirds to the forward Prog
593*ccdc9c3eSSadaf Ebrahimi // and one third to the reverse Prog. The forward Prog gives half
594*ccdc9c3eSSadaf Ebrahimi // of what it has left over to each of its DFAs. The reverse Prog
595*ccdc9c3eSSadaf Ebrahimi // gives it all to its longest-match DFA.
596*ccdc9c3eSSadaf Ebrahimi //
597*ccdc9c3eSSadaf Ebrahimi // Once a DFA fills its budget, it flushes its cache and starts over.
598*ccdc9c3eSSadaf Ebrahimi // If this happens too often, RE2 falls back on the NFA implementation.
599*ccdc9c3eSSadaf Ebrahimi
600*ccdc9c3eSSadaf Ebrahimi // For now, make the default budget something close to Code Search.
601*ccdc9c3eSSadaf Ebrahimi static const int kDefaultMaxMem = 8<<20;
602*ccdc9c3eSSadaf Ebrahimi
603*ccdc9c3eSSadaf Ebrahimi enum Encoding {
604*ccdc9c3eSSadaf Ebrahimi EncodingUTF8 = 1,
605*ccdc9c3eSSadaf Ebrahimi EncodingLatin1
606*ccdc9c3eSSadaf Ebrahimi };
607*ccdc9c3eSSadaf Ebrahimi
Options()608*ccdc9c3eSSadaf Ebrahimi Options() :
609*ccdc9c3eSSadaf Ebrahimi encoding_(EncodingUTF8),
610*ccdc9c3eSSadaf Ebrahimi posix_syntax_(false),
611*ccdc9c3eSSadaf Ebrahimi longest_match_(false),
612*ccdc9c3eSSadaf Ebrahimi log_errors_(true),
613*ccdc9c3eSSadaf Ebrahimi max_mem_(kDefaultMaxMem),
614*ccdc9c3eSSadaf Ebrahimi literal_(false),
615*ccdc9c3eSSadaf Ebrahimi never_nl_(false),
616*ccdc9c3eSSadaf Ebrahimi dot_nl_(false),
617*ccdc9c3eSSadaf Ebrahimi never_capture_(false),
618*ccdc9c3eSSadaf Ebrahimi case_sensitive_(true),
619*ccdc9c3eSSadaf Ebrahimi perl_classes_(false),
620*ccdc9c3eSSadaf Ebrahimi word_boundary_(false),
621*ccdc9c3eSSadaf Ebrahimi one_line_(false) {
622*ccdc9c3eSSadaf Ebrahimi }
623*ccdc9c3eSSadaf Ebrahimi
624*ccdc9c3eSSadaf Ebrahimi /*implicit*/ Options(CannedOptions);
625*ccdc9c3eSSadaf Ebrahimi
encoding()626*ccdc9c3eSSadaf Ebrahimi Encoding encoding() const { return encoding_; }
set_encoding(Encoding encoding)627*ccdc9c3eSSadaf Ebrahimi void set_encoding(Encoding encoding) { encoding_ = encoding; }
628*ccdc9c3eSSadaf Ebrahimi
629*ccdc9c3eSSadaf Ebrahimi // Legacy interface to encoding.
630*ccdc9c3eSSadaf Ebrahimi // TODO(rsc): Remove once clients have been converted.
utf8()631*ccdc9c3eSSadaf Ebrahimi bool utf8() const { return encoding_ == EncodingUTF8; }
set_utf8(bool b)632*ccdc9c3eSSadaf Ebrahimi void set_utf8(bool b) {
633*ccdc9c3eSSadaf Ebrahimi if (b) {
634*ccdc9c3eSSadaf Ebrahimi encoding_ = EncodingUTF8;
635*ccdc9c3eSSadaf Ebrahimi } else {
636*ccdc9c3eSSadaf Ebrahimi encoding_ = EncodingLatin1;
637*ccdc9c3eSSadaf Ebrahimi }
638*ccdc9c3eSSadaf Ebrahimi }
639*ccdc9c3eSSadaf Ebrahimi
posix_syntax()640*ccdc9c3eSSadaf Ebrahimi bool posix_syntax() const { return posix_syntax_; }
set_posix_syntax(bool b)641*ccdc9c3eSSadaf Ebrahimi void set_posix_syntax(bool b) { posix_syntax_ = b; }
642*ccdc9c3eSSadaf Ebrahimi
longest_match()643*ccdc9c3eSSadaf Ebrahimi bool longest_match() const { return longest_match_; }
set_longest_match(bool b)644*ccdc9c3eSSadaf Ebrahimi void set_longest_match(bool b) { longest_match_ = b; }
645*ccdc9c3eSSadaf Ebrahimi
log_errors()646*ccdc9c3eSSadaf Ebrahimi bool log_errors() const { return log_errors_; }
set_log_errors(bool b)647*ccdc9c3eSSadaf Ebrahimi void set_log_errors(bool b) { log_errors_ = b; }
648*ccdc9c3eSSadaf Ebrahimi
max_mem()649*ccdc9c3eSSadaf Ebrahimi int64_t max_mem() const { return max_mem_; }
set_max_mem(int64_t m)650*ccdc9c3eSSadaf Ebrahimi void set_max_mem(int64_t m) { max_mem_ = m; }
651*ccdc9c3eSSadaf Ebrahimi
literal()652*ccdc9c3eSSadaf Ebrahimi bool literal() const { return literal_; }
set_literal(bool b)653*ccdc9c3eSSadaf Ebrahimi void set_literal(bool b) { literal_ = b; }
654*ccdc9c3eSSadaf Ebrahimi
never_nl()655*ccdc9c3eSSadaf Ebrahimi bool never_nl() const { return never_nl_; }
set_never_nl(bool b)656*ccdc9c3eSSadaf Ebrahimi void set_never_nl(bool b) { never_nl_ = b; }
657*ccdc9c3eSSadaf Ebrahimi
dot_nl()658*ccdc9c3eSSadaf Ebrahimi bool dot_nl() const { return dot_nl_; }
set_dot_nl(bool b)659*ccdc9c3eSSadaf Ebrahimi void set_dot_nl(bool b) { dot_nl_ = b; }
660*ccdc9c3eSSadaf Ebrahimi
never_capture()661*ccdc9c3eSSadaf Ebrahimi bool never_capture() const { return never_capture_; }
set_never_capture(bool b)662*ccdc9c3eSSadaf Ebrahimi void set_never_capture(bool b) { never_capture_ = b; }
663*ccdc9c3eSSadaf Ebrahimi
case_sensitive()664*ccdc9c3eSSadaf Ebrahimi bool case_sensitive() const { return case_sensitive_; }
set_case_sensitive(bool b)665*ccdc9c3eSSadaf Ebrahimi void set_case_sensitive(bool b) { case_sensitive_ = b; }
666*ccdc9c3eSSadaf Ebrahimi
perl_classes()667*ccdc9c3eSSadaf Ebrahimi bool perl_classes() const { return perl_classes_; }
set_perl_classes(bool b)668*ccdc9c3eSSadaf Ebrahimi void set_perl_classes(bool b) { perl_classes_ = b; }
669*ccdc9c3eSSadaf Ebrahimi
word_boundary()670*ccdc9c3eSSadaf Ebrahimi bool word_boundary() const { return word_boundary_; }
set_word_boundary(bool b)671*ccdc9c3eSSadaf Ebrahimi void set_word_boundary(bool b) { word_boundary_ = b; }
672*ccdc9c3eSSadaf Ebrahimi
one_line()673*ccdc9c3eSSadaf Ebrahimi bool one_line() const { return one_line_; }
set_one_line(bool b)674*ccdc9c3eSSadaf Ebrahimi void set_one_line(bool b) { one_line_ = b; }
675*ccdc9c3eSSadaf Ebrahimi
Copy(const Options & src)676*ccdc9c3eSSadaf Ebrahimi void Copy(const Options& src) {
677*ccdc9c3eSSadaf Ebrahimi *this = src;
678*ccdc9c3eSSadaf Ebrahimi }
679*ccdc9c3eSSadaf Ebrahimi
680*ccdc9c3eSSadaf Ebrahimi int ParseFlags() const;
681*ccdc9c3eSSadaf Ebrahimi
682*ccdc9c3eSSadaf Ebrahimi private:
683*ccdc9c3eSSadaf Ebrahimi Encoding encoding_;
684*ccdc9c3eSSadaf Ebrahimi bool posix_syntax_;
685*ccdc9c3eSSadaf Ebrahimi bool longest_match_;
686*ccdc9c3eSSadaf Ebrahimi bool log_errors_;
687*ccdc9c3eSSadaf Ebrahimi int64_t max_mem_;
688*ccdc9c3eSSadaf Ebrahimi bool literal_;
689*ccdc9c3eSSadaf Ebrahimi bool never_nl_;
690*ccdc9c3eSSadaf Ebrahimi bool dot_nl_;
691*ccdc9c3eSSadaf Ebrahimi bool never_capture_;
692*ccdc9c3eSSadaf Ebrahimi bool case_sensitive_;
693*ccdc9c3eSSadaf Ebrahimi bool perl_classes_;
694*ccdc9c3eSSadaf Ebrahimi bool word_boundary_;
695*ccdc9c3eSSadaf Ebrahimi bool one_line_;
696*ccdc9c3eSSadaf Ebrahimi };
697*ccdc9c3eSSadaf Ebrahimi
698*ccdc9c3eSSadaf Ebrahimi // Returns the options set in the constructor.
options()699*ccdc9c3eSSadaf Ebrahimi const Options& options() const { return options_; }
700*ccdc9c3eSSadaf Ebrahimi
701*ccdc9c3eSSadaf Ebrahimi // Argument converters; see below.
702*ccdc9c3eSSadaf Ebrahimi static inline Arg CRadix(short* x);
703*ccdc9c3eSSadaf Ebrahimi static inline Arg CRadix(unsigned short* x);
704*ccdc9c3eSSadaf Ebrahimi static inline Arg CRadix(int* x);
705*ccdc9c3eSSadaf Ebrahimi static inline Arg CRadix(unsigned int* x);
706*ccdc9c3eSSadaf Ebrahimi static inline Arg CRadix(long* x);
707*ccdc9c3eSSadaf Ebrahimi static inline Arg CRadix(unsigned long* x);
708*ccdc9c3eSSadaf Ebrahimi static inline Arg CRadix(long long* x);
709*ccdc9c3eSSadaf Ebrahimi static inline Arg CRadix(unsigned long long* x);
710*ccdc9c3eSSadaf Ebrahimi
711*ccdc9c3eSSadaf Ebrahimi static inline Arg Hex(short* x);
712*ccdc9c3eSSadaf Ebrahimi static inline Arg Hex(unsigned short* x);
713*ccdc9c3eSSadaf Ebrahimi static inline Arg Hex(int* x);
714*ccdc9c3eSSadaf Ebrahimi static inline Arg Hex(unsigned int* x);
715*ccdc9c3eSSadaf Ebrahimi static inline Arg Hex(long* x);
716*ccdc9c3eSSadaf Ebrahimi static inline Arg Hex(unsigned long* x);
717*ccdc9c3eSSadaf Ebrahimi static inline Arg Hex(long long* x);
718*ccdc9c3eSSadaf Ebrahimi static inline Arg Hex(unsigned long long* x);
719*ccdc9c3eSSadaf Ebrahimi
720*ccdc9c3eSSadaf Ebrahimi static inline Arg Octal(short* x);
721*ccdc9c3eSSadaf Ebrahimi static inline Arg Octal(unsigned short* x);
722*ccdc9c3eSSadaf Ebrahimi static inline Arg Octal(int* x);
723*ccdc9c3eSSadaf Ebrahimi static inline Arg Octal(unsigned int* x);
724*ccdc9c3eSSadaf Ebrahimi static inline Arg Octal(long* x);
725*ccdc9c3eSSadaf Ebrahimi static inline Arg Octal(unsigned long* x);
726*ccdc9c3eSSadaf Ebrahimi static inline Arg Octal(long long* x);
727*ccdc9c3eSSadaf Ebrahimi static inline Arg Octal(unsigned long long* x);
728*ccdc9c3eSSadaf Ebrahimi
729*ccdc9c3eSSadaf Ebrahimi private:
730*ccdc9c3eSSadaf Ebrahimi void Init(const StringPiece& pattern, const Options& options);
731*ccdc9c3eSSadaf Ebrahimi
732*ccdc9c3eSSadaf Ebrahimi bool DoMatch(const StringPiece& text,
733*ccdc9c3eSSadaf Ebrahimi Anchor re_anchor,
734*ccdc9c3eSSadaf Ebrahimi size_t* consumed,
735*ccdc9c3eSSadaf Ebrahimi const Arg* const args[],
736*ccdc9c3eSSadaf Ebrahimi int n) const;
737*ccdc9c3eSSadaf Ebrahimi
738*ccdc9c3eSSadaf Ebrahimi re2::Prog* ReverseProg() const;
739*ccdc9c3eSSadaf Ebrahimi
740*ccdc9c3eSSadaf Ebrahimi string pattern_; // string regular expression
741*ccdc9c3eSSadaf Ebrahimi Options options_; // option flags
742*ccdc9c3eSSadaf Ebrahimi string prefix_; // required prefix (before regexp_)
743*ccdc9c3eSSadaf Ebrahimi bool prefix_foldcase_; // prefix is ASCII case-insensitive
744*ccdc9c3eSSadaf Ebrahimi re2::Regexp* entire_regexp_; // parsed regular expression
745*ccdc9c3eSSadaf Ebrahimi re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
746*ccdc9c3eSSadaf Ebrahimi re2::Prog* prog_; // compiled program for regexp
747*ccdc9c3eSSadaf Ebrahimi int num_captures_; // Number of capturing groups
748*ccdc9c3eSSadaf Ebrahimi bool is_one_pass_; // can use prog_->SearchOnePass?
749*ccdc9c3eSSadaf Ebrahimi
750*ccdc9c3eSSadaf Ebrahimi mutable re2::Prog* rprog_; // reverse program for regexp
751*ccdc9c3eSSadaf Ebrahimi mutable const string* error_; // Error indicator
752*ccdc9c3eSSadaf Ebrahimi // (or points to empty string)
753*ccdc9c3eSSadaf Ebrahimi mutable ErrorCode error_code_; // Error code
754*ccdc9c3eSSadaf Ebrahimi mutable string error_arg_; // Fragment of regexp showing error
755*ccdc9c3eSSadaf Ebrahimi
756*ccdc9c3eSSadaf Ebrahimi // Map from capture names to indices
757*ccdc9c3eSSadaf Ebrahimi mutable const std::map<string, int>* named_groups_;
758*ccdc9c3eSSadaf Ebrahimi
759*ccdc9c3eSSadaf Ebrahimi // Map from capture indices to names
760*ccdc9c3eSSadaf Ebrahimi mutable const std::map<int, string>* group_names_;
761*ccdc9c3eSSadaf Ebrahimi
762*ccdc9c3eSSadaf Ebrahimi // Onces for lazy computations.
763*ccdc9c3eSSadaf Ebrahimi mutable std::once_flag rprog_once_;
764*ccdc9c3eSSadaf Ebrahimi mutable std::once_flag named_groups_once_;
765*ccdc9c3eSSadaf Ebrahimi mutable std::once_flag group_names_once_;
766*ccdc9c3eSSadaf Ebrahimi
767*ccdc9c3eSSadaf Ebrahimi RE2(const RE2&) = delete;
768*ccdc9c3eSSadaf Ebrahimi RE2& operator=(const RE2&) = delete;
769*ccdc9c3eSSadaf Ebrahimi };
770*ccdc9c3eSSadaf Ebrahimi
/***** Implementation details *****/

// Hex/Octal/Binary?
774*ccdc9c3eSSadaf Ebrahimi
// Special class for parsing into objects that define a ParseFrom() method.
// Parse() has the signature of RE2::Arg::Parser, so it can serve as the
// parser for any destination type T that provides
//   bool ParseFrom(const char* str, size_t n);
template <class T>
class _RE2_MatchObject {
 public:
  // Forwards the n bytes at str to dest->ParseFrom() and returns its result.
  // A null dest means "nothing to store" and counts as success.
  static inline bool Parse(const char* str, size_t n, void* dest) {
    if (dest == nullptr) return true;
    // dest was originally a T* erased to void*; static_cast is the
    // conversion intended for undoing that.
    T* object = static_cast<T*>(dest);
    return object->ParseFrom(str, n);
  }
};
785*ccdc9c3eSSadaf Ebrahimi
786*ccdc9c3eSSadaf Ebrahimi class RE2::Arg {
787*ccdc9c3eSSadaf Ebrahimi public:
788*ccdc9c3eSSadaf Ebrahimi // Empty constructor so we can declare arrays of RE2::Arg
789*ccdc9c3eSSadaf Ebrahimi Arg();
790*ccdc9c3eSSadaf Ebrahimi
791*ccdc9c3eSSadaf Ebrahimi // Constructor specially designed for NULL arguments
792*ccdc9c3eSSadaf Ebrahimi Arg(void*);
793*ccdc9c3eSSadaf Ebrahimi Arg(std::nullptr_t);
794*ccdc9c3eSSadaf Ebrahimi
795*ccdc9c3eSSadaf Ebrahimi typedef bool (*Parser)(const char* str, size_t n, void* dest);
796*ccdc9c3eSSadaf Ebrahimi
797*ccdc9c3eSSadaf Ebrahimi // Type-specific parsers
798*ccdc9c3eSSadaf Ebrahimi #define MAKE_PARSER(type, name) \
799*ccdc9c3eSSadaf Ebrahimi Arg(type* p) : arg_(p), parser_(name) {} \
800*ccdc9c3eSSadaf Ebrahimi Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
801*ccdc9c3eSSadaf Ebrahimi
MAKE_PARSER(char,parse_char)802*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(char, parse_char)
803*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(signed char, parse_schar)
804*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(unsigned char, parse_uchar)
805*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(float, parse_float)
806*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(double, parse_double)
807*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(string, parse_string)
808*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(StringPiece, parse_stringpiece)
809*ccdc9c3eSSadaf Ebrahimi
810*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(short, parse_short)
811*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(unsigned short, parse_ushort)
812*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(int, parse_int)
813*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(unsigned int, parse_uint)
814*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(long, parse_long)
815*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(unsigned long, parse_ulong)
816*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(long long, parse_longlong)
817*ccdc9c3eSSadaf Ebrahimi MAKE_PARSER(unsigned long long, parse_ulonglong)
818*ccdc9c3eSSadaf Ebrahimi
819*ccdc9c3eSSadaf Ebrahimi #undef MAKE_PARSER
820*ccdc9c3eSSadaf Ebrahimi
821*ccdc9c3eSSadaf Ebrahimi // Generic constructor templates
822*ccdc9c3eSSadaf Ebrahimi template <class T> Arg(T* p)
823*ccdc9c3eSSadaf Ebrahimi : arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
Arg(T * p,Parser parser)824*ccdc9c3eSSadaf Ebrahimi template <class T> Arg(T* p, Parser parser)
825*ccdc9c3eSSadaf Ebrahimi : arg_(p), parser_(parser) { }
826*ccdc9c3eSSadaf Ebrahimi
827*ccdc9c3eSSadaf Ebrahimi // Parse the data
828*ccdc9c3eSSadaf Ebrahimi bool Parse(const char* str, size_t n) const;
829*ccdc9c3eSSadaf Ebrahimi
830*ccdc9c3eSSadaf Ebrahimi private:
831*ccdc9c3eSSadaf Ebrahimi void* arg_;
832*ccdc9c3eSSadaf Ebrahimi Parser parser_;
833*ccdc9c3eSSadaf Ebrahimi
834*ccdc9c3eSSadaf Ebrahimi static bool parse_null (const char* str, size_t n, void* dest);
835*ccdc9c3eSSadaf Ebrahimi static bool parse_char (const char* str, size_t n, void* dest);
836*ccdc9c3eSSadaf Ebrahimi static bool parse_schar (const char* str, size_t n, void* dest);
837*ccdc9c3eSSadaf Ebrahimi static bool parse_uchar (const char* str, size_t n, void* dest);
838*ccdc9c3eSSadaf Ebrahimi static bool parse_float (const char* str, size_t n, void* dest);
839*ccdc9c3eSSadaf Ebrahimi static bool parse_double (const char* str, size_t n, void* dest);
840*ccdc9c3eSSadaf Ebrahimi static bool parse_string (const char* str, size_t n, void* dest);
841*ccdc9c3eSSadaf Ebrahimi static bool parse_stringpiece (const char* str, size_t n, void* dest);
842*ccdc9c3eSSadaf Ebrahimi
843*ccdc9c3eSSadaf Ebrahimi #define DECLARE_INTEGER_PARSER(name) \
844*ccdc9c3eSSadaf Ebrahimi private: \
845*ccdc9c3eSSadaf Ebrahimi static bool parse_##name(const char* str, size_t n, void* dest); \
846*ccdc9c3eSSadaf Ebrahimi static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
847*ccdc9c3eSSadaf Ebrahimi int radix); \
848*ccdc9c3eSSadaf Ebrahimi \
849*ccdc9c3eSSadaf Ebrahimi public: \
850*ccdc9c3eSSadaf Ebrahimi static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
851*ccdc9c3eSSadaf Ebrahimi static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
852*ccdc9c3eSSadaf Ebrahimi static bool parse_##name##_cradix(const char* str, size_t n, void* dest);
853*ccdc9c3eSSadaf Ebrahimi
854*ccdc9c3eSSadaf Ebrahimi DECLARE_INTEGER_PARSER(short)
855*ccdc9c3eSSadaf Ebrahimi DECLARE_INTEGER_PARSER(ushort)
856*ccdc9c3eSSadaf Ebrahimi DECLARE_INTEGER_PARSER(int)
857*ccdc9c3eSSadaf Ebrahimi DECLARE_INTEGER_PARSER(uint)
858*ccdc9c3eSSadaf Ebrahimi DECLARE_INTEGER_PARSER(long)
859*ccdc9c3eSSadaf Ebrahimi DECLARE_INTEGER_PARSER(ulong)
860*ccdc9c3eSSadaf Ebrahimi DECLARE_INTEGER_PARSER(longlong)
861*ccdc9c3eSSadaf Ebrahimi DECLARE_INTEGER_PARSER(ulonglong)
862*ccdc9c3eSSadaf Ebrahimi
863*ccdc9c3eSSadaf Ebrahimi #undef DECLARE_INTEGER_PARSER
864*ccdc9c3eSSadaf Ebrahimi
865*ccdc9c3eSSadaf Ebrahimi };
866*ccdc9c3eSSadaf Ebrahimi
Arg()867*ccdc9c3eSSadaf Ebrahimi inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
Arg(void * p)868*ccdc9c3eSSadaf Ebrahimi inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
Arg(std::nullptr_t p)869*ccdc9c3eSSadaf Ebrahimi inline RE2::Arg::Arg(std::nullptr_t p) : arg_(p), parser_(parse_null) { }
870*ccdc9c3eSSadaf Ebrahimi
Parse(const char * str,size_t n)871*ccdc9c3eSSadaf Ebrahimi inline bool RE2::Arg::Parse(const char* str, size_t n) const {
872*ccdc9c3eSSadaf Ebrahimi return (*parser_)(str, n, arg_);
873*ccdc9c3eSSadaf Ebrahimi }
874*ccdc9c3eSSadaf Ebrahimi
875*ccdc9c3eSSadaf Ebrahimi // This part of the parser, appropriate only for ints, deals with bases
876*ccdc9c3eSSadaf Ebrahimi #define MAKE_INTEGER_PARSER(type, name) \
877*ccdc9c3eSSadaf Ebrahimi inline RE2::Arg RE2::Hex(type* ptr) { \
878*ccdc9c3eSSadaf Ebrahimi return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \
879*ccdc9c3eSSadaf Ebrahimi } \
880*ccdc9c3eSSadaf Ebrahimi inline RE2::Arg RE2::Octal(type* ptr) { \
881*ccdc9c3eSSadaf Ebrahimi return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \
882*ccdc9c3eSSadaf Ebrahimi } \
883*ccdc9c3eSSadaf Ebrahimi inline RE2::Arg RE2::CRadix(type* ptr) { \
884*ccdc9c3eSSadaf Ebrahimi return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \
885*ccdc9c3eSSadaf Ebrahimi }
886*ccdc9c3eSSadaf Ebrahimi
MAKE_INTEGER_PARSER(short,short)887*ccdc9c3eSSadaf Ebrahimi MAKE_INTEGER_PARSER(short, short)
888*ccdc9c3eSSadaf Ebrahimi MAKE_INTEGER_PARSER(unsigned short, ushort)
889*ccdc9c3eSSadaf Ebrahimi MAKE_INTEGER_PARSER(int, int)
890*ccdc9c3eSSadaf Ebrahimi MAKE_INTEGER_PARSER(unsigned int, uint)
891*ccdc9c3eSSadaf Ebrahimi MAKE_INTEGER_PARSER(long, long)
892*ccdc9c3eSSadaf Ebrahimi MAKE_INTEGER_PARSER(unsigned long, ulong)
893*ccdc9c3eSSadaf Ebrahimi MAKE_INTEGER_PARSER(long long, longlong)
894*ccdc9c3eSSadaf Ebrahimi MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
895*ccdc9c3eSSadaf Ebrahimi
896*ccdc9c3eSSadaf Ebrahimi #undef MAKE_INTEGER_PARSER
897*ccdc9c3eSSadaf Ebrahimi
898*ccdc9c3eSSadaf Ebrahimi #ifndef SWIG
899*ccdc9c3eSSadaf Ebrahimi
900*ccdc9c3eSSadaf Ebrahimi // Silence warnings about missing initializers for members of LazyRE2.
901*ccdc9c3eSSadaf Ebrahimi // Note that we test for Clang first because it defines __GNUC__ as well.
902*ccdc9c3eSSadaf Ebrahimi #if defined(__clang__)
903*ccdc9c3eSSadaf Ebrahimi #elif defined(__GNUC__) && __GNUC__ >= 6
904*ccdc9c3eSSadaf Ebrahimi #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
905*ccdc9c3eSSadaf Ebrahimi #endif
906*ccdc9c3eSSadaf Ebrahimi
907*ccdc9c3eSSadaf Ebrahimi // Helper for writing global or static RE2s safely.
908*ccdc9c3eSSadaf Ebrahimi // Write
909*ccdc9c3eSSadaf Ebrahimi // static LazyRE2 re = {".*"};
910*ccdc9c3eSSadaf Ebrahimi // and then use *re instead of writing
911*ccdc9c3eSSadaf Ebrahimi // static RE2 re(".*");
912*ccdc9c3eSSadaf Ebrahimi // The former is more careful about multithreaded
913*ccdc9c3eSSadaf Ebrahimi // situations than the latter.
914*ccdc9c3eSSadaf Ebrahimi //
915*ccdc9c3eSSadaf Ebrahimi // N.B. This class never deletes the RE2 object that
916*ccdc9c3eSSadaf Ebrahimi // it constructs: that's a feature, so that it can be used
917*ccdc9c3eSSadaf Ebrahimi // for global and function static variables.
918*ccdc9c3eSSadaf Ebrahimi class LazyRE2 {
919*ccdc9c3eSSadaf Ebrahimi private:
920*ccdc9c3eSSadaf Ebrahimi struct NoArg {};
921*ccdc9c3eSSadaf Ebrahimi
922*ccdc9c3eSSadaf Ebrahimi public:
923*ccdc9c3eSSadaf Ebrahimi typedef RE2 element_type; // support std::pointer_traits
924*ccdc9c3eSSadaf Ebrahimi
925*ccdc9c3eSSadaf Ebrahimi // Constructor omitted to preserve braced initialization in C++98.
926*ccdc9c3eSSadaf Ebrahimi
927*ccdc9c3eSSadaf Ebrahimi // Pretend to be a pointer to Type (never NULL due to on-demand creation):
928*ccdc9c3eSSadaf Ebrahimi RE2& operator*() const { return *get(); }
929*ccdc9c3eSSadaf Ebrahimi RE2* operator->() const { return get(); }
930*ccdc9c3eSSadaf Ebrahimi
931*ccdc9c3eSSadaf Ebrahimi // Named accessor/initializer:
932*ccdc9c3eSSadaf Ebrahimi RE2* get() const {
933*ccdc9c3eSSadaf Ebrahimi std::call_once(once_, &LazyRE2::Init, this);
934*ccdc9c3eSSadaf Ebrahimi return ptr_;
935*ccdc9c3eSSadaf Ebrahimi }
936*ccdc9c3eSSadaf Ebrahimi
937*ccdc9c3eSSadaf Ebrahimi // All data fields must be public to support {"foo"} initialization.
938*ccdc9c3eSSadaf Ebrahimi const char* pattern_;
939*ccdc9c3eSSadaf Ebrahimi RE2::CannedOptions options_;
940*ccdc9c3eSSadaf Ebrahimi NoArg barrier_against_excess_initializers_;
941*ccdc9c3eSSadaf Ebrahimi
942*ccdc9c3eSSadaf Ebrahimi mutable RE2* ptr_;
943*ccdc9c3eSSadaf Ebrahimi mutable std::once_flag once_;
944*ccdc9c3eSSadaf Ebrahimi
945*ccdc9c3eSSadaf Ebrahimi private:
946*ccdc9c3eSSadaf Ebrahimi static void Init(const LazyRE2* lazy_re2) {
947*ccdc9c3eSSadaf Ebrahimi lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_);
948*ccdc9c3eSSadaf Ebrahimi }
949*ccdc9c3eSSadaf Ebrahimi
950*ccdc9c3eSSadaf Ebrahimi void operator=(const LazyRE2&); // disallowed
951*ccdc9c3eSSadaf Ebrahimi };
952*ccdc9c3eSSadaf Ebrahimi #endif // SWIG
953*ccdc9c3eSSadaf Ebrahimi
954*ccdc9c3eSSadaf Ebrahimi } // namespace re2
955*ccdc9c3eSSadaf Ebrahimi
956*ccdc9c3eSSadaf Ebrahimi using re2::RE2;
957*ccdc9c3eSSadaf Ebrahimi using re2::LazyRE2;
958*ccdc9c3eSSadaf Ebrahimi
959*ccdc9c3eSSadaf Ebrahimi #endif // RE2_RE2_H_
960