xref: /aosp_15_r20/external/regex-re2/re2/prog.h (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi #ifndef RE2_PROG_H_
6*ccdc9c3eSSadaf Ebrahimi #define RE2_PROG_H_
7*ccdc9c3eSSadaf Ebrahimi 
8*ccdc9c3eSSadaf Ebrahimi // Compiled representation of regular expressions.
9*ccdc9c3eSSadaf Ebrahimi // See regexp.h for the Regexp class, which represents a regular
10*ccdc9c3eSSadaf Ebrahimi // expression symbolically.
11*ccdc9c3eSSadaf Ebrahimi 
12*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
13*ccdc9c3eSSadaf Ebrahimi #include <functional>
14*ccdc9c3eSSadaf Ebrahimi #include <mutex>
15*ccdc9c3eSSadaf Ebrahimi #include <string>
16*ccdc9c3eSSadaf Ebrahimi #include <vector>
17*ccdc9c3eSSadaf Ebrahimi #include <type_traits>
18*ccdc9c3eSSadaf Ebrahimi 
19*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
20*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
21*ccdc9c3eSSadaf Ebrahimi #include "util/pod_array.h"
22*ccdc9c3eSSadaf Ebrahimi #include "util/sparse_array.h"
23*ccdc9c3eSSadaf Ebrahimi #include "util/sparse_set.h"
24*ccdc9c3eSSadaf Ebrahimi #include "re2/re2.h"
25*ccdc9c3eSSadaf Ebrahimi 
26*ccdc9c3eSSadaf Ebrahimi namespace re2 {
27*ccdc9c3eSSadaf Ebrahimi 
28*ccdc9c3eSSadaf Ebrahimi // Opcodes for Inst
29*ccdc9c3eSSadaf Ebrahimi enum InstOp {
30*ccdc9c3eSSadaf Ebrahimi   kInstAlt = 0,      // choose between out_ and out1_
31*ccdc9c3eSSadaf Ebrahimi   kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
32*ccdc9c3eSSadaf Ebrahimi   kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
33*ccdc9c3eSSadaf Ebrahimi   kInstCapture,      // capturing parenthesis number cap_
34*ccdc9c3eSSadaf Ebrahimi   kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
35*ccdc9c3eSSadaf Ebrahimi   kInstMatch,        // found a match!
36*ccdc9c3eSSadaf Ebrahimi   kInstNop,          // no-op; occasionally unavoidable
37*ccdc9c3eSSadaf Ebrahimi   kInstFail,         // never match; occasionally unavoidable
38*ccdc9c3eSSadaf Ebrahimi   kNumInst,
39*ccdc9c3eSSadaf Ebrahimi };
40*ccdc9c3eSSadaf Ebrahimi 
41*ccdc9c3eSSadaf Ebrahimi // Bit flags for empty-width specials
42*ccdc9c3eSSadaf Ebrahimi enum EmptyOp {
43*ccdc9c3eSSadaf Ebrahimi   kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
44*ccdc9c3eSSadaf Ebrahimi   kEmptyEndLine          = 1<<1,      // $ - end of line
45*ccdc9c3eSSadaf Ebrahimi   kEmptyBeginText        = 1<<2,      // \A - beginning of text
46*ccdc9c3eSSadaf Ebrahimi   kEmptyEndText          = 1<<3,      // \z - end of text
47*ccdc9c3eSSadaf Ebrahimi   kEmptyWordBoundary     = 1<<4,      // \b - word boundary
48*ccdc9c3eSSadaf Ebrahimi   kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
49*ccdc9c3eSSadaf Ebrahimi   kEmptyAllFlags         = (1<<6)-1,
50*ccdc9c3eSSadaf Ebrahimi };
51*ccdc9c3eSSadaf Ebrahimi 
52*ccdc9c3eSSadaf Ebrahimi class DFA;
53*ccdc9c3eSSadaf Ebrahimi class Regexp;
54*ccdc9c3eSSadaf Ebrahimi 
55*ccdc9c3eSSadaf Ebrahimi // Compiled form of regexp program.
56*ccdc9c3eSSadaf Ebrahimi class Prog {
57*ccdc9c3eSSadaf Ebrahimi  public:
58*ccdc9c3eSSadaf Ebrahimi   Prog();
59*ccdc9c3eSSadaf Ebrahimi   ~Prog();
60*ccdc9c3eSSadaf Ebrahimi 
61*ccdc9c3eSSadaf Ebrahimi   // Single instruction in regexp program.
62*ccdc9c3eSSadaf Ebrahimi   class Inst {
63*ccdc9c3eSSadaf Ebrahimi    public:
64*ccdc9c3eSSadaf Ebrahimi     // See the assertion below for why this is so.
65*ccdc9c3eSSadaf Ebrahimi     Inst() = default;
66*ccdc9c3eSSadaf Ebrahimi 
67*ccdc9c3eSSadaf Ebrahimi     // Copyable.
68*ccdc9c3eSSadaf Ebrahimi     Inst(const Inst&) = default;
69*ccdc9c3eSSadaf Ebrahimi     Inst& operator=(const Inst&) = default;
70*ccdc9c3eSSadaf Ebrahimi 
71*ccdc9c3eSSadaf Ebrahimi     // Constructors per opcode
72*ccdc9c3eSSadaf Ebrahimi     void InitAlt(uint32_t out, uint32_t out1);
73*ccdc9c3eSSadaf Ebrahimi     void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
74*ccdc9c3eSSadaf Ebrahimi     void InitCapture(int cap, uint32_t out);
75*ccdc9c3eSSadaf Ebrahimi     void InitEmptyWidth(EmptyOp empty, uint32_t out);
76*ccdc9c3eSSadaf Ebrahimi     void InitMatch(int id);
77*ccdc9c3eSSadaf Ebrahimi     void InitNop(uint32_t out);
78*ccdc9c3eSSadaf Ebrahimi     void InitFail();
79*ccdc9c3eSSadaf Ebrahimi 
80*ccdc9c3eSSadaf Ebrahimi     // Getters
id(Prog * p)81*ccdc9c3eSSadaf Ebrahimi     int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
opcode()82*ccdc9c3eSSadaf Ebrahimi     InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
last()83*ccdc9c3eSSadaf Ebrahimi     int last()      { return (out_opcode_>>3)&1; }
out()84*ccdc9c3eSSadaf Ebrahimi     int out()       { return out_opcode_>>4; }
out1()85*ccdc9c3eSSadaf Ebrahimi     int out1()      { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
cap()86*ccdc9c3eSSadaf Ebrahimi     int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
lo()87*ccdc9c3eSSadaf Ebrahimi     int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
hi()88*ccdc9c3eSSadaf Ebrahimi     int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
foldcase()89*ccdc9c3eSSadaf Ebrahimi     int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
match_id()90*ccdc9c3eSSadaf Ebrahimi     int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
empty()91*ccdc9c3eSSadaf Ebrahimi     EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
92*ccdc9c3eSSadaf Ebrahimi 
greedy(Prog * p)93*ccdc9c3eSSadaf Ebrahimi     bool greedy(Prog* p) {
94*ccdc9c3eSSadaf Ebrahimi       DCHECK_EQ(opcode(), kInstAltMatch);
95*ccdc9c3eSSadaf Ebrahimi       return p->inst(out())->opcode() == kInstByteRange ||
96*ccdc9c3eSSadaf Ebrahimi              (p->inst(out())->opcode() == kInstNop &&
97*ccdc9c3eSSadaf Ebrahimi               p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
98*ccdc9c3eSSadaf Ebrahimi     }
99*ccdc9c3eSSadaf Ebrahimi 
100*ccdc9c3eSSadaf Ebrahimi     // Does this inst (an kInstByteRange) match c?
Matches(int c)101*ccdc9c3eSSadaf Ebrahimi     inline bool Matches(int c) {
102*ccdc9c3eSSadaf Ebrahimi       DCHECK_EQ(opcode(), kInstByteRange);
103*ccdc9c3eSSadaf Ebrahimi       if (foldcase_ && 'A' <= c && c <= 'Z')
104*ccdc9c3eSSadaf Ebrahimi         c += 'a' - 'A';
105*ccdc9c3eSSadaf Ebrahimi       return lo_ <= c && c <= hi_;
106*ccdc9c3eSSadaf Ebrahimi     }
107*ccdc9c3eSSadaf Ebrahimi 
108*ccdc9c3eSSadaf Ebrahimi     // Returns string representation for debugging.
109*ccdc9c3eSSadaf Ebrahimi     string Dump();
110*ccdc9c3eSSadaf Ebrahimi 
111*ccdc9c3eSSadaf Ebrahimi     // Maximum instruction id.
112*ccdc9c3eSSadaf Ebrahimi     // (Must fit in out_opcode_. PatchList/last steal another bit.)
113*ccdc9c3eSSadaf Ebrahimi     static const int kMaxInst = (1<<28) - 1;
114*ccdc9c3eSSadaf Ebrahimi 
115*ccdc9c3eSSadaf Ebrahimi    private:
set_opcode(InstOp opcode)116*ccdc9c3eSSadaf Ebrahimi     void set_opcode(InstOp opcode) {
117*ccdc9c3eSSadaf Ebrahimi       out_opcode_ = (out()<<4) | (last()<<3) | opcode;
118*ccdc9c3eSSadaf Ebrahimi     }
119*ccdc9c3eSSadaf Ebrahimi 
set_last()120*ccdc9c3eSSadaf Ebrahimi     void set_last() {
121*ccdc9c3eSSadaf Ebrahimi       out_opcode_ = (out()<<4) | (1<<3) | opcode();
122*ccdc9c3eSSadaf Ebrahimi     }
123*ccdc9c3eSSadaf Ebrahimi 
set_out(int out)124*ccdc9c3eSSadaf Ebrahimi     void set_out(int out) {
125*ccdc9c3eSSadaf Ebrahimi       out_opcode_ = (out<<4) | (last()<<3) | opcode();
126*ccdc9c3eSSadaf Ebrahimi     }
127*ccdc9c3eSSadaf Ebrahimi 
set_out_opcode(int out,InstOp opcode)128*ccdc9c3eSSadaf Ebrahimi     void set_out_opcode(int out, InstOp opcode) {
129*ccdc9c3eSSadaf Ebrahimi       out_opcode_ = (out<<4) | (last()<<3) | opcode;
130*ccdc9c3eSSadaf Ebrahimi     }
131*ccdc9c3eSSadaf Ebrahimi 
132*ccdc9c3eSSadaf Ebrahimi     uint32_t out_opcode_;   // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
133*ccdc9c3eSSadaf Ebrahimi     union {                 // additional instruction arguments:
134*ccdc9c3eSSadaf Ebrahimi       uint32_t out1_;       // opcode == kInstAlt
135*ccdc9c3eSSadaf Ebrahimi                             //   alternate next instruction
136*ccdc9c3eSSadaf Ebrahimi 
137*ccdc9c3eSSadaf Ebrahimi       int32_t cap_;         // opcode == kInstCapture
138*ccdc9c3eSSadaf Ebrahimi                             //   Index of capture register (holds text
139*ccdc9c3eSSadaf Ebrahimi                             //   position recorded by capturing parentheses).
140*ccdc9c3eSSadaf Ebrahimi                             //   For \n (the submatch for the nth parentheses),
141*ccdc9c3eSSadaf Ebrahimi                             //   the left parenthesis captures into register 2*n
142*ccdc9c3eSSadaf Ebrahimi                             //   and the right one captures into register 2*n+1.
143*ccdc9c3eSSadaf Ebrahimi 
144*ccdc9c3eSSadaf Ebrahimi       int32_t match_id_;    // opcode == kInstMatch
145*ccdc9c3eSSadaf Ebrahimi                             //   Match ID to identify this match (for re2::Set).
146*ccdc9c3eSSadaf Ebrahimi 
147*ccdc9c3eSSadaf Ebrahimi       struct {              // opcode == kInstByteRange
148*ccdc9c3eSSadaf Ebrahimi         uint8_t lo_;        //   byte range is lo_-hi_ inclusive
149*ccdc9c3eSSadaf Ebrahimi         uint8_t hi_;        //
150*ccdc9c3eSSadaf Ebrahimi         uint8_t foldcase_;  //   convert A-Z to a-z before checking range.
151*ccdc9c3eSSadaf Ebrahimi       };
152*ccdc9c3eSSadaf Ebrahimi 
153*ccdc9c3eSSadaf Ebrahimi       EmptyOp empty_;       // opcode == kInstEmptyWidth
154*ccdc9c3eSSadaf Ebrahimi                             //   empty_ is bitwise OR of kEmpty* flags above.
155*ccdc9c3eSSadaf Ebrahimi     };
156*ccdc9c3eSSadaf Ebrahimi 
157*ccdc9c3eSSadaf Ebrahimi     friend class Compiler;
158*ccdc9c3eSSadaf Ebrahimi     friend struct PatchList;
159*ccdc9c3eSSadaf Ebrahimi     friend class Prog;
160*ccdc9c3eSSadaf Ebrahimi   };
161*ccdc9c3eSSadaf Ebrahimi 
162*ccdc9c3eSSadaf Ebrahimi   // Inst must be trivial so that we can freely clear it with memset(3).
163*ccdc9c3eSSadaf Ebrahimi   // Arrays of Inst are initialised by copying the initial elements with
164*ccdc9c3eSSadaf Ebrahimi   // memmove(3) and then clearing any remaining elements with memset(3).
165*ccdc9c3eSSadaf Ebrahimi   static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
166*ccdc9c3eSSadaf Ebrahimi 
167*ccdc9c3eSSadaf Ebrahimi   // Whether to anchor the search.
168*ccdc9c3eSSadaf Ebrahimi   enum Anchor {
169*ccdc9c3eSSadaf Ebrahimi     kUnanchored,  // match anywhere
170*ccdc9c3eSSadaf Ebrahimi     kAnchored,    // match only starting at beginning of text
171*ccdc9c3eSSadaf Ebrahimi   };
172*ccdc9c3eSSadaf Ebrahimi 
173*ccdc9c3eSSadaf Ebrahimi   // Kind of match to look for (for anchor != kFullMatch)
174*ccdc9c3eSSadaf Ebrahimi   //
175*ccdc9c3eSSadaf Ebrahimi   // kLongestMatch mode finds the overall longest
176*ccdc9c3eSSadaf Ebrahimi   // match but still makes its submatch choices the way
177*ccdc9c3eSSadaf Ebrahimi   // Perl would, not in the way prescribed by POSIX.
178*ccdc9c3eSSadaf Ebrahimi   // The POSIX rules are much more expensive to implement,
179*ccdc9c3eSSadaf Ebrahimi   // and no one has needed them.
180*ccdc9c3eSSadaf Ebrahimi   //
181*ccdc9c3eSSadaf Ebrahimi   // kFullMatch is not strictly necessary -- we could use
182*ccdc9c3eSSadaf Ebrahimi   // kLongestMatch and then check the length of the match -- but
183*ccdc9c3eSSadaf Ebrahimi   // the matching code can run faster if it knows to consider only
184*ccdc9c3eSSadaf Ebrahimi   // full matches.
185*ccdc9c3eSSadaf Ebrahimi   enum MatchKind {
186*ccdc9c3eSSadaf Ebrahimi     kFirstMatch,     // like Perl, PCRE
187*ccdc9c3eSSadaf Ebrahimi     kLongestMatch,   // like egrep or POSIX
188*ccdc9c3eSSadaf Ebrahimi     kFullMatch,      // match only entire text; implies anchor==kAnchored
189*ccdc9c3eSSadaf Ebrahimi     kManyMatch       // for SearchDFA, records set of matches
190*ccdc9c3eSSadaf Ebrahimi   };
191*ccdc9c3eSSadaf Ebrahimi 
inst(int id)192*ccdc9c3eSSadaf Ebrahimi   Inst *inst(int id) { return &inst_[id]; }
start()193*ccdc9c3eSSadaf Ebrahimi   int start() { return start_; }
start_unanchored()194*ccdc9c3eSSadaf Ebrahimi   int start_unanchored() { return start_unanchored_; }
set_start(int start)195*ccdc9c3eSSadaf Ebrahimi   void set_start(int start) { start_ = start; }
set_start_unanchored(int start)196*ccdc9c3eSSadaf Ebrahimi   void set_start_unanchored(int start) { start_unanchored_ = start; }
size()197*ccdc9c3eSSadaf Ebrahimi   int size() { return size_; }
reversed()198*ccdc9c3eSSadaf Ebrahimi   bool reversed() { return reversed_; }
set_reversed(bool reversed)199*ccdc9c3eSSadaf Ebrahimi   void set_reversed(bool reversed) { reversed_ = reversed; }
list_count()200*ccdc9c3eSSadaf Ebrahimi   int list_count() { return list_count_; }
inst_count(InstOp op)201*ccdc9c3eSSadaf Ebrahimi   int inst_count(InstOp op) { return inst_count_[op]; }
set_dfa_mem(int64_t dfa_mem)202*ccdc9c3eSSadaf Ebrahimi   void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
dfa_mem()203*ccdc9c3eSSadaf Ebrahimi   int64_t dfa_mem() { return dfa_mem_; }
flags()204*ccdc9c3eSSadaf Ebrahimi   int flags() { return flags_; }
set_flags(int flags)205*ccdc9c3eSSadaf Ebrahimi   void set_flags(int flags) { flags_ = flags; }
anchor_start()206*ccdc9c3eSSadaf Ebrahimi   bool anchor_start() { return anchor_start_; }
set_anchor_start(bool b)207*ccdc9c3eSSadaf Ebrahimi   void set_anchor_start(bool b) { anchor_start_ = b; }
anchor_end()208*ccdc9c3eSSadaf Ebrahimi   bool anchor_end() { return anchor_end_; }
set_anchor_end(bool b)209*ccdc9c3eSSadaf Ebrahimi   void set_anchor_end(bool b) { anchor_end_ = b; }
bytemap_range()210*ccdc9c3eSSadaf Ebrahimi   int bytemap_range() { return bytemap_range_; }
bytemap()211*ccdc9c3eSSadaf Ebrahimi   const uint8_t* bytemap() { return bytemap_; }
212*ccdc9c3eSSadaf Ebrahimi 
213*ccdc9c3eSSadaf Ebrahimi   // Lazily computed.
214*ccdc9c3eSSadaf Ebrahimi   int first_byte();
215*ccdc9c3eSSadaf Ebrahimi 
216*ccdc9c3eSSadaf Ebrahimi   // Returns string representation of program for debugging.
217*ccdc9c3eSSadaf Ebrahimi   string Dump();
218*ccdc9c3eSSadaf Ebrahimi   string DumpUnanchored();
219*ccdc9c3eSSadaf Ebrahimi   string DumpByteMap();
220*ccdc9c3eSSadaf Ebrahimi 
221*ccdc9c3eSSadaf Ebrahimi   // Returns the set of kEmpty flags that are in effect at
222*ccdc9c3eSSadaf Ebrahimi   // position p within context.
223*ccdc9c3eSSadaf Ebrahimi   static uint32_t EmptyFlags(const StringPiece& context, const char* p);
224*ccdc9c3eSSadaf Ebrahimi 
225*ccdc9c3eSSadaf Ebrahimi   // Returns whether byte c is a word character: ASCII only.
226*ccdc9c3eSSadaf Ebrahimi   // Used by the implementation of \b and \B.
227*ccdc9c3eSSadaf Ebrahimi   // This is not right for Unicode, but:
228*ccdc9c3eSSadaf Ebrahimi   //   - it's hard to get right in a byte-at-a-time matching world
229*ccdc9c3eSSadaf Ebrahimi   //     (the DFA has only one-byte lookahead).
230*ccdc9c3eSSadaf Ebrahimi   //   - even if the lookahead were possible, the Progs would be huge.
231*ccdc9c3eSSadaf Ebrahimi   // This crude approximation is the same one PCRE uses.
IsWordChar(uint8_t c)232*ccdc9c3eSSadaf Ebrahimi   static bool IsWordChar(uint8_t c) {
233*ccdc9c3eSSadaf Ebrahimi     return ('A' <= c && c <= 'Z') ||
234*ccdc9c3eSSadaf Ebrahimi            ('a' <= c && c <= 'z') ||
235*ccdc9c3eSSadaf Ebrahimi            ('0' <= c && c <= '9') ||
236*ccdc9c3eSSadaf Ebrahimi            c == '_';
237*ccdc9c3eSSadaf Ebrahimi   }
238*ccdc9c3eSSadaf Ebrahimi 
239*ccdc9c3eSSadaf Ebrahimi   // Execution engines.  They all search for the regexp (run the prog)
240*ccdc9c3eSSadaf Ebrahimi   // in text, which is in the larger context (used for ^ $ \b etc).
241*ccdc9c3eSSadaf Ebrahimi   // Anchor and kind control the kind of search.
242*ccdc9c3eSSadaf Ebrahimi   // Returns true if match found, false if not.
243*ccdc9c3eSSadaf Ebrahimi   // If match found, fills match[0..nmatch-1] with submatch info.
244*ccdc9c3eSSadaf Ebrahimi   // match[0] is overall match, match[1] is first set of parens, etc.
245*ccdc9c3eSSadaf Ebrahimi   // If a particular submatch is not matched during the regexp match,
246*ccdc9c3eSSadaf Ebrahimi   // it is set to NULL.
247*ccdc9c3eSSadaf Ebrahimi   //
248*ccdc9c3eSSadaf Ebrahimi   // Matching text == StringPiece(NULL, 0) is treated as any other empty
249*ccdc9c3eSSadaf Ebrahimi   // string, but note that on return, it will not be possible to distinguish
250*ccdc9c3eSSadaf Ebrahimi   // submatches that matched that empty string from submatches that didn't
251*ccdc9c3eSSadaf Ebrahimi   // match anything.  Either way, match[i] == NULL.
252*ccdc9c3eSSadaf Ebrahimi 
253*ccdc9c3eSSadaf Ebrahimi   // Search using NFA: can find submatches but kind of slow.
254*ccdc9c3eSSadaf Ebrahimi   bool SearchNFA(const StringPiece& text, const StringPiece& context,
255*ccdc9c3eSSadaf Ebrahimi                  Anchor anchor, MatchKind kind,
256*ccdc9c3eSSadaf Ebrahimi                  StringPiece* match, int nmatch);
257*ccdc9c3eSSadaf Ebrahimi 
258*ccdc9c3eSSadaf Ebrahimi   // Search using DFA: much faster than NFA but only finds
259*ccdc9c3eSSadaf Ebrahimi   // end of match and can use a lot more memory.
260*ccdc9c3eSSadaf Ebrahimi   // Returns whether a match was found.
261*ccdc9c3eSSadaf Ebrahimi   // If the DFA runs out of memory, sets *failed to true and returns false.
262*ccdc9c3eSSadaf Ebrahimi   // If matches != NULL and kind == kManyMatch and there is a match,
263*ccdc9c3eSSadaf Ebrahimi   // SearchDFA fills matches with the match IDs of the final matching state.
264*ccdc9c3eSSadaf Ebrahimi   bool SearchDFA(const StringPiece& text, const StringPiece& context,
265*ccdc9c3eSSadaf Ebrahimi                  Anchor anchor, MatchKind kind, StringPiece* match0,
266*ccdc9c3eSSadaf Ebrahimi                  bool* failed, SparseSet* matches);
267*ccdc9c3eSSadaf Ebrahimi 
268*ccdc9c3eSSadaf Ebrahimi   // The callback issued after building each DFA state with BuildEntireDFA().
269*ccdc9c3eSSadaf Ebrahimi   // If next is null, then the memory budget has been exhausted and building
270*ccdc9c3eSSadaf Ebrahimi   // will halt. Otherwise, the state has been built and next points to an array
271*ccdc9c3eSSadaf Ebrahimi   // of bytemap_range()+1 slots holding the next states as per the bytemap and
272*ccdc9c3eSSadaf Ebrahimi   // kByteEndText. The number of the state is implied by the callback sequence:
273*ccdc9c3eSSadaf Ebrahimi   // the first callback is for state 0, the second callback is for state 1, ...
274*ccdc9c3eSSadaf Ebrahimi   // match indicates whether the state is a matching state.
275*ccdc9c3eSSadaf Ebrahimi   using DFAStateCallback = std::function<void(const int* next, bool match)>;
276*ccdc9c3eSSadaf Ebrahimi 
277*ccdc9c3eSSadaf Ebrahimi   // Build the entire DFA for the given match kind.
278*ccdc9c3eSSadaf Ebrahimi   // Usually the DFA is built out incrementally, as needed, which
279*ccdc9c3eSSadaf Ebrahimi   // avoids lots of unnecessary work.
280*ccdc9c3eSSadaf Ebrahimi   // If cb is not empty, it receives one callback per state built.
281*ccdc9c3eSSadaf Ebrahimi   // Returns the number of states built.
282*ccdc9c3eSSadaf Ebrahimi   // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
283*ccdc9c3eSSadaf Ebrahimi   int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
284*ccdc9c3eSSadaf Ebrahimi 
285*ccdc9c3eSSadaf Ebrahimi   // Controls whether the DFA should bail out early if the NFA would be faster.
286*ccdc9c3eSSadaf Ebrahimi   // FOR TESTING ONLY.
287*ccdc9c3eSSadaf Ebrahimi   static void TEST_dfa_should_bail_when_slow(bool b);
288*ccdc9c3eSSadaf Ebrahimi 
289*ccdc9c3eSSadaf Ebrahimi   // Compute bytemap.
290*ccdc9c3eSSadaf Ebrahimi   void ComputeByteMap();
291*ccdc9c3eSSadaf Ebrahimi 
292*ccdc9c3eSSadaf Ebrahimi   // Computes whether all matches must begin with the same first
293*ccdc9c3eSSadaf Ebrahimi   // byte, and if so, returns that byte.  If not, returns -1.
294*ccdc9c3eSSadaf Ebrahimi   int ComputeFirstByte();
295*ccdc9c3eSSadaf Ebrahimi 
296*ccdc9c3eSSadaf Ebrahimi   // Run peep-hole optimizer on program.
297*ccdc9c3eSSadaf Ebrahimi   void Optimize();
298*ccdc9c3eSSadaf Ebrahimi 
299*ccdc9c3eSSadaf Ebrahimi   // One-pass NFA: only correct if IsOnePass() is true,
300*ccdc9c3eSSadaf Ebrahimi   // but much faster than NFA (competitive with PCRE)
301*ccdc9c3eSSadaf Ebrahimi   // for those expressions.
302*ccdc9c3eSSadaf Ebrahimi   bool IsOnePass();
303*ccdc9c3eSSadaf Ebrahimi   bool SearchOnePass(const StringPiece& text, const StringPiece& context,
304*ccdc9c3eSSadaf Ebrahimi                      Anchor anchor, MatchKind kind,
305*ccdc9c3eSSadaf Ebrahimi                      StringPiece* match, int nmatch);
306*ccdc9c3eSSadaf Ebrahimi 
307*ccdc9c3eSSadaf Ebrahimi   // Bit-state backtracking.  Fast on small cases but uses memory
308*ccdc9c3eSSadaf Ebrahimi   // proportional to the product of the program size and the text size.
309*ccdc9c3eSSadaf Ebrahimi   bool SearchBitState(const StringPiece& text, const StringPiece& context,
310*ccdc9c3eSSadaf Ebrahimi                       Anchor anchor, MatchKind kind,
311*ccdc9c3eSSadaf Ebrahimi                       StringPiece* match, int nmatch);
312*ccdc9c3eSSadaf Ebrahimi 
313*ccdc9c3eSSadaf Ebrahimi   static const int kMaxOnePassCapture = 5;  // $0 through $4
314*ccdc9c3eSSadaf Ebrahimi 
315*ccdc9c3eSSadaf Ebrahimi   // Backtracking search: the gold standard against which the other
316*ccdc9c3eSSadaf Ebrahimi   // implementations are checked.  FOR TESTING ONLY.
317*ccdc9c3eSSadaf Ebrahimi   // It allocates a ton of memory to avoid running forever.
318*ccdc9c3eSSadaf Ebrahimi   // It is also recursive, so can't use in production (will overflow stacks).
319*ccdc9c3eSSadaf Ebrahimi   // The name "Unsafe" here is supposed to be a flag that
320*ccdc9c3eSSadaf Ebrahimi   // you should not be using this function.
321*ccdc9c3eSSadaf Ebrahimi   bool UnsafeSearchBacktrack(const StringPiece& text,
322*ccdc9c3eSSadaf Ebrahimi                              const StringPiece& context,
323*ccdc9c3eSSadaf Ebrahimi                              Anchor anchor, MatchKind kind,
324*ccdc9c3eSSadaf Ebrahimi                              StringPiece* match, int nmatch);
325*ccdc9c3eSSadaf Ebrahimi 
326*ccdc9c3eSSadaf Ebrahimi   // Computes range for any strings matching regexp. The min and max can in
327*ccdc9c3eSSadaf Ebrahimi   // some cases be arbitrarily precise, so the caller gets to specify the
328*ccdc9c3eSSadaf Ebrahimi   // maximum desired length of string returned.
329*ccdc9c3eSSadaf Ebrahimi   //
330*ccdc9c3eSSadaf Ebrahimi   // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
331*ccdc9c3eSSadaf Ebrahimi   // string s that is an anchored match for this regexp satisfies
332*ccdc9c3eSSadaf Ebrahimi   //   min <= s && s <= max.
333*ccdc9c3eSSadaf Ebrahimi   //
334*ccdc9c3eSSadaf Ebrahimi   // Note that PossibleMatchRange() will only consider the first copy of an
335*ccdc9c3eSSadaf Ebrahimi   // infinitely repeated element (i.e., any regexp element followed by a '*' or
336*ccdc9c3eSSadaf Ebrahimi   // '+' operator). Regexps with "{N}" constructions are not affected, as those
337*ccdc9c3eSSadaf Ebrahimi   // do not compile down to infinite repetitions.
338*ccdc9c3eSSadaf Ebrahimi   //
339*ccdc9c3eSSadaf Ebrahimi   // Returns true on success, false on error.
340*ccdc9c3eSSadaf Ebrahimi   bool PossibleMatchRange(string* min, string* max, int maxlen);
341*ccdc9c3eSSadaf Ebrahimi 
342*ccdc9c3eSSadaf Ebrahimi   // EXPERIMENTAL! SUBJECT TO CHANGE!
343*ccdc9c3eSSadaf Ebrahimi   // Outputs the program fanout into the given sparse array.
344*ccdc9c3eSSadaf Ebrahimi   void Fanout(SparseArray<int>* fanout);
345*ccdc9c3eSSadaf Ebrahimi 
346*ccdc9c3eSSadaf Ebrahimi   // Compiles a collection of regexps to Prog.  Each regexp will have
347*ccdc9c3eSSadaf Ebrahimi   // its own Match instruction recording the index in the output vector.
348*ccdc9c3eSSadaf Ebrahimi   static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
349*ccdc9c3eSSadaf Ebrahimi 
350*ccdc9c3eSSadaf Ebrahimi   // Flattens the Prog from "tree" form to "list" form. This is an in-place
351*ccdc9c3eSSadaf Ebrahimi   // operation in the sense that the old instructions are lost.
352*ccdc9c3eSSadaf Ebrahimi   void Flatten();
353*ccdc9c3eSSadaf Ebrahimi 
354*ccdc9c3eSSadaf Ebrahimi   // Walks the Prog; the "successor roots" or predecessors of the reachable
355*ccdc9c3eSSadaf Ebrahimi   // instructions are marked in rootmap or predmap/predvec, respectively.
356*ccdc9c3eSSadaf Ebrahimi   // reachable and stk are preallocated scratch structures.
357*ccdc9c3eSSadaf Ebrahimi   void MarkSuccessors(SparseArray<int>* rootmap,
358*ccdc9c3eSSadaf Ebrahimi                       SparseArray<int>* predmap,
359*ccdc9c3eSSadaf Ebrahimi                       std::vector<std::vector<int>>* predvec,
360*ccdc9c3eSSadaf Ebrahimi                       SparseSet* reachable, std::vector<int>* stk);
361*ccdc9c3eSSadaf Ebrahimi 
362*ccdc9c3eSSadaf Ebrahimi   // Walks the Prog from the given "root" instruction; the "dominator root"
363*ccdc9c3eSSadaf Ebrahimi   // of the reachable instructions (if such exists) is marked in rootmap.
364*ccdc9c3eSSadaf Ebrahimi   // reachable and stk are preallocated scratch structures.
365*ccdc9c3eSSadaf Ebrahimi   void MarkDominator(int root, SparseArray<int>* rootmap,
366*ccdc9c3eSSadaf Ebrahimi                      SparseArray<int>* predmap,
367*ccdc9c3eSSadaf Ebrahimi                      std::vector<std::vector<int>>* predvec,
368*ccdc9c3eSSadaf Ebrahimi                      SparseSet* reachable, std::vector<int>* stk);
369*ccdc9c3eSSadaf Ebrahimi 
370*ccdc9c3eSSadaf Ebrahimi   // Walks the Prog from the given "root" instruction; the reachable
371*ccdc9c3eSSadaf Ebrahimi   // instructions are emitted in "list" form and appended to flat.
372*ccdc9c3eSSadaf Ebrahimi   // reachable and stk are preallocated scratch structures.
373*ccdc9c3eSSadaf Ebrahimi   void EmitList(int root, SparseArray<int>* rootmap,
374*ccdc9c3eSSadaf Ebrahimi                 std::vector<Inst>* flat,
375*ccdc9c3eSSadaf Ebrahimi                 SparseSet* reachable, std::vector<int>* stk);
376*ccdc9c3eSSadaf Ebrahimi 
377*ccdc9c3eSSadaf Ebrahimi  private:
378*ccdc9c3eSSadaf Ebrahimi   friend class Compiler;
379*ccdc9c3eSSadaf Ebrahimi 
380*ccdc9c3eSSadaf Ebrahimi   DFA* GetDFA(MatchKind kind);
381*ccdc9c3eSSadaf Ebrahimi   void DeleteDFA(DFA* dfa);
382*ccdc9c3eSSadaf Ebrahimi 
383*ccdc9c3eSSadaf Ebrahimi   bool anchor_start_;       // regexp has explicit start anchor
384*ccdc9c3eSSadaf Ebrahimi   bool anchor_end_;         // regexp has explicit end anchor
385*ccdc9c3eSSadaf Ebrahimi   bool reversed_;           // whether program runs backward over input
386*ccdc9c3eSSadaf Ebrahimi   bool did_flatten_;        // has Flatten been called?
387*ccdc9c3eSSadaf Ebrahimi   bool did_onepass_;        // has IsOnePass been called?
388*ccdc9c3eSSadaf Ebrahimi 
389*ccdc9c3eSSadaf Ebrahimi   int start_;               // entry point for program
390*ccdc9c3eSSadaf Ebrahimi   int start_unanchored_;    // unanchored entry point for program
391*ccdc9c3eSSadaf Ebrahimi   int size_;                // number of instructions
392*ccdc9c3eSSadaf Ebrahimi   int bytemap_range_;       // bytemap_[x] < bytemap_range_
393*ccdc9c3eSSadaf Ebrahimi   int first_byte_;          // required first byte for match, or -1 if none
394*ccdc9c3eSSadaf Ebrahimi   int flags_;               // regexp parse flags
395*ccdc9c3eSSadaf Ebrahimi 
396*ccdc9c3eSSadaf Ebrahimi   int list_count_;            // count of lists (see above)
397*ccdc9c3eSSadaf Ebrahimi   int inst_count_[kNumInst];  // count of instructions by opcode
398*ccdc9c3eSSadaf Ebrahimi 
399*ccdc9c3eSSadaf Ebrahimi   PODArray<Inst> inst_;     // pointer to instruction array
400*ccdc9c3eSSadaf Ebrahimi   uint8_t* onepass_nodes_;  // data for OnePass nodes
401*ccdc9c3eSSadaf Ebrahimi 
402*ccdc9c3eSSadaf Ebrahimi   int64_t dfa_mem_;         // Maximum memory for DFAs.
403*ccdc9c3eSSadaf Ebrahimi   DFA* dfa_first_;          // DFA cached for kFirstMatch/kManyMatch
404*ccdc9c3eSSadaf Ebrahimi   DFA* dfa_longest_;        // DFA cached for kLongestMatch/kFullMatch
405*ccdc9c3eSSadaf Ebrahimi 
406*ccdc9c3eSSadaf Ebrahimi   uint8_t bytemap_[256];    // map from input bytes to byte classes
407*ccdc9c3eSSadaf Ebrahimi 
408*ccdc9c3eSSadaf Ebrahimi   std::once_flag first_byte_once_;
409*ccdc9c3eSSadaf Ebrahimi   std::once_flag dfa_first_once_;
410*ccdc9c3eSSadaf Ebrahimi   std::once_flag dfa_longest_once_;
411*ccdc9c3eSSadaf Ebrahimi 
412*ccdc9c3eSSadaf Ebrahimi   Prog(const Prog&) = delete;
413*ccdc9c3eSSadaf Ebrahimi   Prog& operator=(const Prog&) = delete;
414*ccdc9c3eSSadaf Ebrahimi };
415*ccdc9c3eSSadaf Ebrahimi 
416*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
417*ccdc9c3eSSadaf Ebrahimi 
418*ccdc9c3eSSadaf Ebrahimi #endif  // RE2_PROG_H_
419