xref: /aosp_15_r20/external/regex-re2/re2/prog.h (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #ifndef RE2_PROG_H_
6 #define RE2_PROG_H_
7 
8 // Compiled representation of regular expressions.
9 // See regexp.h for the Regexp class, which represents a regular
10 // expression symbolically.
11 
12 #include <stdint.h>
13 #include <functional>
14 #include <mutex>
15 #include <string>
16 #include <vector>
17 #include <type_traits>
18 
19 #include "util/util.h"
20 #include "util/logging.h"
21 #include "util/pod_array.h"
22 #include "util/sparse_array.h"
23 #include "util/sparse_set.h"
24 #include "re2/re2.h"
25 
26 namespace re2 {
27 
28 // Opcodes for Inst
29 enum InstOp {
30   kInstAlt = 0,      // choose between out_ and out1_
31   kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
32   kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
33   kInstCapture,      // capturing parenthesis number cap_
34   kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
35   kInstMatch,        // found a match!
36   kInstNop,          // no-op; occasionally unavoidable
37   kInstFail,         // never match; occasionally unavoidable
38   kNumInst,
39 };
40 
41 // Bit flags for empty-width specials
42 enum EmptyOp {
43   kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
44   kEmptyEndLine          = 1<<1,      // $ - end of line
45   kEmptyBeginText        = 1<<2,      // \A - beginning of text
46   kEmptyEndText          = 1<<3,      // \z - end of text
47   kEmptyWordBoundary     = 1<<4,      // \b - word boundary
48   kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
49   kEmptyAllFlags         = (1<<6)-1,
50 };
51 
52 class DFA;
53 class Regexp;
54 
55 // Compiled form of regexp program.
56 class Prog {
57  public:
58   Prog();
59   ~Prog();
60 
61   // Single instruction in regexp program.
62   class Inst {
63    public:
64     // See the assertion below for why this is so.
65     Inst() = default;
66 
67     // Copyable.
68     Inst(const Inst&) = default;
69     Inst& operator=(const Inst&) = default;
70 
71     // Constructors per opcode
72     void InitAlt(uint32_t out, uint32_t out1);
73     void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
74     void InitCapture(int cap, uint32_t out);
75     void InitEmptyWidth(EmptyOp empty, uint32_t out);
76     void InitMatch(int id);
77     void InitNop(uint32_t out);
78     void InitFail();
79 
80     // Getters
id(Prog * p)81     int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
opcode()82     InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
last()83     int last()      { return (out_opcode_>>3)&1; }
out()84     int out()       { return out_opcode_>>4; }
out1()85     int out1()      { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
cap()86     int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
lo()87     int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
hi()88     int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
foldcase()89     int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
match_id()90     int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
empty()91     EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
92 
greedy(Prog * p)93     bool greedy(Prog* p) {
94       DCHECK_EQ(opcode(), kInstAltMatch);
95       return p->inst(out())->opcode() == kInstByteRange ||
96              (p->inst(out())->opcode() == kInstNop &&
97               p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
98     }
99 
100     // Does this inst (an kInstByteRange) match c?
Matches(int c)101     inline bool Matches(int c) {
102       DCHECK_EQ(opcode(), kInstByteRange);
103       if (foldcase_ && 'A' <= c && c <= 'Z')
104         c += 'a' - 'A';
105       return lo_ <= c && c <= hi_;
106     }
107 
108     // Returns string representation for debugging.
109     string Dump();
110 
111     // Maximum instruction id.
112     // (Must fit in out_opcode_. PatchList/last steal another bit.)
113     static const int kMaxInst = (1<<28) - 1;
114 
115    private:
set_opcode(InstOp opcode)116     void set_opcode(InstOp opcode) {
117       out_opcode_ = (out()<<4) | (last()<<3) | opcode;
118     }
119 
set_last()120     void set_last() {
121       out_opcode_ = (out()<<4) | (1<<3) | opcode();
122     }
123 
set_out(int out)124     void set_out(int out) {
125       out_opcode_ = (out<<4) | (last()<<3) | opcode();
126     }
127 
set_out_opcode(int out,InstOp opcode)128     void set_out_opcode(int out, InstOp opcode) {
129       out_opcode_ = (out<<4) | (last()<<3) | opcode;
130     }
131 
132     uint32_t out_opcode_;   // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
133     union {                 // additional instruction arguments:
134       uint32_t out1_;       // opcode == kInstAlt
135                             //   alternate next instruction
136 
137       int32_t cap_;         // opcode == kInstCapture
138                             //   Index of capture register (holds text
139                             //   position recorded by capturing parentheses).
140                             //   For \n (the submatch for the nth parentheses),
141                             //   the left parenthesis captures into register 2*n
142                             //   and the right one captures into register 2*n+1.
143 
144       int32_t match_id_;    // opcode == kInstMatch
145                             //   Match ID to identify this match (for re2::Set).
146 
147       struct {              // opcode == kInstByteRange
148         uint8_t lo_;        //   byte range is lo_-hi_ inclusive
149         uint8_t hi_;        //
150         uint8_t foldcase_;  //   convert A-Z to a-z before checking range.
151       };
152 
153       EmptyOp empty_;       // opcode == kInstEmptyWidth
154                             //   empty_ is bitwise OR of kEmpty* flags above.
155     };
156 
157     friend class Compiler;
158     friend struct PatchList;
159     friend class Prog;
160   };
161 
162   // Inst must be trivial so that we can freely clear it with memset(3).
163   // Arrays of Inst are initialised by copying the initial elements with
164   // memmove(3) and then clearing any remaining elements with memset(3).
165   static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
166 
167   // Whether to anchor the search.
168   enum Anchor {
169     kUnanchored,  // match anywhere
170     kAnchored,    // match only starting at beginning of text
171   };
172 
173   // Kind of match to look for (for anchor != kFullMatch)
174   //
175   // kLongestMatch mode finds the overall longest
176   // match but still makes its submatch choices the way
177   // Perl would, not in the way prescribed by POSIX.
178   // The POSIX rules are much more expensive to implement,
179   // and no one has needed them.
180   //
181   // kFullMatch is not strictly necessary -- we could use
182   // kLongestMatch and then check the length of the match -- but
183   // the matching code can run faster if it knows to consider only
184   // full matches.
185   enum MatchKind {
186     kFirstMatch,     // like Perl, PCRE
187     kLongestMatch,   // like egrep or POSIX
188     kFullMatch,      // match only entire text; implies anchor==kAnchored
189     kManyMatch       // for SearchDFA, records set of matches
190   };
191 
inst(int id)192   Inst *inst(int id) { return &inst_[id]; }
start()193   int start() { return start_; }
start_unanchored()194   int start_unanchored() { return start_unanchored_; }
set_start(int start)195   void set_start(int start) { start_ = start; }
set_start_unanchored(int start)196   void set_start_unanchored(int start) { start_unanchored_ = start; }
size()197   int size() { return size_; }
reversed()198   bool reversed() { return reversed_; }
set_reversed(bool reversed)199   void set_reversed(bool reversed) { reversed_ = reversed; }
list_count()200   int list_count() { return list_count_; }
inst_count(InstOp op)201   int inst_count(InstOp op) { return inst_count_[op]; }
set_dfa_mem(int64_t dfa_mem)202   void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
dfa_mem()203   int64_t dfa_mem() { return dfa_mem_; }
flags()204   int flags() { return flags_; }
set_flags(int flags)205   void set_flags(int flags) { flags_ = flags; }
anchor_start()206   bool anchor_start() { return anchor_start_; }
set_anchor_start(bool b)207   void set_anchor_start(bool b) { anchor_start_ = b; }
anchor_end()208   bool anchor_end() { return anchor_end_; }
set_anchor_end(bool b)209   void set_anchor_end(bool b) { anchor_end_ = b; }
bytemap_range()210   int bytemap_range() { return bytemap_range_; }
bytemap()211   const uint8_t* bytemap() { return bytemap_; }
212 
213   // Lazily computed.
214   int first_byte();
215 
216   // Returns string representation of program for debugging.
217   string Dump();
218   string DumpUnanchored();
219   string DumpByteMap();
220 
221   // Returns the set of kEmpty flags that are in effect at
222   // position p within context.
223   static uint32_t EmptyFlags(const StringPiece& context, const char* p);
224 
225   // Returns whether byte c is a word character: ASCII only.
226   // Used by the implementation of \b and \B.
227   // This is not right for Unicode, but:
228   //   - it's hard to get right in a byte-at-a-time matching world
229   //     (the DFA has only one-byte lookahead).
230   //   - even if the lookahead were possible, the Progs would be huge.
231   // This crude approximation is the same one PCRE uses.
IsWordChar(uint8_t c)232   static bool IsWordChar(uint8_t c) {
233     return ('A' <= c && c <= 'Z') ||
234            ('a' <= c && c <= 'z') ||
235            ('0' <= c && c <= '9') ||
236            c == '_';
237   }
238 
239   // Execution engines.  They all search for the regexp (run the prog)
240   // in text, which is in the larger context (used for ^ $ \b etc).
241   // Anchor and kind control the kind of search.
242   // Returns true if match found, false if not.
243   // If match found, fills match[0..nmatch-1] with submatch info.
244   // match[0] is overall match, match[1] is first set of parens, etc.
245   // If a particular submatch is not matched during the regexp match,
246   // it is set to NULL.
247   //
248   // Matching text == StringPiece(NULL, 0) is treated as any other empty
249   // string, but note that on return, it will not be possible to distinguish
250   // submatches that matched that empty string from submatches that didn't
251   // match anything.  Either way, match[i] == NULL.
252 
253   // Search using NFA: can find submatches but kind of slow.
254   bool SearchNFA(const StringPiece& text, const StringPiece& context,
255                  Anchor anchor, MatchKind kind,
256                  StringPiece* match, int nmatch);
257 
258   // Search using DFA: much faster than NFA but only finds
259   // end of match and can use a lot more memory.
260   // Returns whether a match was found.
261   // If the DFA runs out of memory, sets *failed to true and returns false.
262   // If matches != NULL and kind == kManyMatch and there is a match,
263   // SearchDFA fills matches with the match IDs of the final matching state.
264   bool SearchDFA(const StringPiece& text, const StringPiece& context,
265                  Anchor anchor, MatchKind kind, StringPiece* match0,
266                  bool* failed, SparseSet* matches);
267 
268   // The callback issued after building each DFA state with BuildEntireDFA().
269   // If next is null, then the memory budget has been exhausted and building
270   // will halt. Otherwise, the state has been built and next points to an array
271   // of bytemap_range()+1 slots holding the next states as per the bytemap and
272   // kByteEndText. The number of the state is implied by the callback sequence:
273   // the first callback is for state 0, the second callback is for state 1, ...
274   // match indicates whether the state is a matching state.
275   using DFAStateCallback = std::function<void(const int* next, bool match)>;
276 
277   // Build the entire DFA for the given match kind.
278   // Usually the DFA is built out incrementally, as needed, which
279   // avoids lots of unnecessary work.
280   // If cb is not empty, it receives one callback per state built.
281   // Returns the number of states built.
282   // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
283   int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
284 
285   // Controls whether the DFA should bail out early if the NFA would be faster.
286   // FOR TESTING ONLY.
287   static void TEST_dfa_should_bail_when_slow(bool b);
288 
289   // Compute bytemap.
290   void ComputeByteMap();
291 
292   // Computes whether all matches must begin with the same first
293   // byte, and if so, returns that byte.  If not, returns -1.
294   int ComputeFirstByte();
295 
296   // Run peep-hole optimizer on program.
297   void Optimize();
298 
299   // One-pass NFA: only correct if IsOnePass() is true,
300   // but much faster than NFA (competitive with PCRE)
301   // for those expressions.
302   bool IsOnePass();
303   bool SearchOnePass(const StringPiece& text, const StringPiece& context,
304                      Anchor anchor, MatchKind kind,
305                      StringPiece* match, int nmatch);
306 
307   // Bit-state backtracking.  Fast on small cases but uses memory
308   // proportional to the product of the program size and the text size.
309   bool SearchBitState(const StringPiece& text, const StringPiece& context,
310                       Anchor anchor, MatchKind kind,
311                       StringPiece* match, int nmatch);
312 
313   static const int kMaxOnePassCapture = 5;  // $0 through $4
314 
315   // Backtracking search: the gold standard against which the other
316   // implementations are checked.  FOR TESTING ONLY.
317   // It allocates a ton of memory to avoid running forever.
318   // It is also recursive, so can't use in production (will overflow stacks).
319   // The name "Unsafe" here is supposed to be a flag that
320   // you should not be using this function.
321   bool UnsafeSearchBacktrack(const StringPiece& text,
322                              const StringPiece& context,
323                              Anchor anchor, MatchKind kind,
324                              StringPiece* match, int nmatch);
325 
326   // Computes range for any strings matching regexp. The min and max can in
327   // some cases be arbitrarily precise, so the caller gets to specify the
328   // maximum desired length of string returned.
329   //
330   // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
331   // string s that is an anchored match for this regexp satisfies
332   //   min <= s && s <= max.
333   //
334   // Note that PossibleMatchRange() will only consider the first copy of an
335   // infinitely repeated element (i.e., any regexp element followed by a '*' or
336   // '+' operator). Regexps with "{N}" constructions are not affected, as those
337   // do not compile down to infinite repetitions.
338   //
339   // Returns true on success, false on error.
340   bool PossibleMatchRange(string* min, string* max, int maxlen);
341 
342   // EXPERIMENTAL! SUBJECT TO CHANGE!
343   // Outputs the program fanout into the given sparse array.
344   void Fanout(SparseArray<int>* fanout);
345 
346   // Compiles a collection of regexps to Prog.  Each regexp will have
347   // its own Match instruction recording the index in the output vector.
348   static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
349 
350   // Flattens the Prog from "tree" form to "list" form. This is an in-place
351   // operation in the sense that the old instructions are lost.
352   void Flatten();
353 
354   // Walks the Prog; the "successor roots" or predecessors of the reachable
355   // instructions are marked in rootmap or predmap/predvec, respectively.
356   // reachable and stk are preallocated scratch structures.
357   void MarkSuccessors(SparseArray<int>* rootmap,
358                       SparseArray<int>* predmap,
359                       std::vector<std::vector<int>>* predvec,
360                       SparseSet* reachable, std::vector<int>* stk);
361 
362   // Walks the Prog from the given "root" instruction; the "dominator root"
363   // of the reachable instructions (if such exists) is marked in rootmap.
364   // reachable and stk are preallocated scratch structures.
365   void MarkDominator(int root, SparseArray<int>* rootmap,
366                      SparseArray<int>* predmap,
367                      std::vector<std::vector<int>>* predvec,
368                      SparseSet* reachable, std::vector<int>* stk);
369 
370   // Walks the Prog from the given "root" instruction; the reachable
371   // instructions are emitted in "list" form and appended to flat.
372   // reachable and stk are preallocated scratch structures.
373   void EmitList(int root, SparseArray<int>* rootmap,
374                 std::vector<Inst>* flat,
375                 SparseSet* reachable, std::vector<int>* stk);
376 
377  private:
378   friend class Compiler;
379 
380   DFA* GetDFA(MatchKind kind);
381   void DeleteDFA(DFA* dfa);
382 
383   bool anchor_start_;       // regexp has explicit start anchor
384   bool anchor_end_;         // regexp has explicit end anchor
385   bool reversed_;           // whether program runs backward over input
386   bool did_flatten_;        // has Flatten been called?
387   bool did_onepass_;        // has IsOnePass been called?
388 
389   int start_;               // entry point for program
390   int start_unanchored_;    // unanchored entry point for program
391   int size_;                // number of instructions
392   int bytemap_range_;       // bytemap_[x] < bytemap_range_
393   int first_byte_;          // required first byte for match, or -1 if none
394   int flags_;               // regexp parse flags
395 
396   int list_count_;            // count of lists (see above)
397   int inst_count_[kNumInst];  // count of instructions by opcode
398 
399   PODArray<Inst> inst_;     // pointer to instruction array
400   uint8_t* onepass_nodes_;  // data for OnePass nodes
401 
402   int64_t dfa_mem_;         // Maximum memory for DFAs.
403   DFA* dfa_first_;          // DFA cached for kFirstMatch/kManyMatch
404   DFA* dfa_longest_;        // DFA cached for kLongestMatch/kFullMatch
405 
406   uint8_t bytemap_[256];    // map from input bytes to byte classes
407 
408   std::once_flag first_byte_once_;
409   std::once_flag dfa_first_once_;
410   std::once_flag dfa_longest_once_;
411 
412   Prog(const Prog&) = delete;
413   Prog& operator=(const Prog&) = delete;
414 };
415 
416 }  // namespace re2
417 
418 #endif  // RE2_PROG_H_
419