xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/compile.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compile regular expression to Prog.
6 //
7 // Prog and Inst are defined in prog.h.
8 // This file's external interface is just Regexp::CompileToProg.
9 // The Compiler class defined in this file is private.
10 
11 #include <stdint.h>
12 #include <string.h>
13 #include <utility>
14 
15 #include "absl/base/macros.h"
16 #include "absl/container/flat_hash_map.h"
17 #include "util/logging.h"
18 #include "util/utf.h"
19 #include "re2/pod_array.h"
20 #include "re2/prog.h"
21 #include "re2/re2.h"
22 #include "re2/regexp.h"
23 #include "re2/walker-inl.h"
24 
25 namespace re2 {
26 
27 // List of pointers to Inst* that need to be filled in (patched).
28 // Because the Inst* haven't been filled in yet,
29 // we can use the Inst* word to hold the list's "next" pointer.
30 // It's kind of sleazy, but it works well in practice.
31 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
32 //
33 // Because the out and out1 fields in Inst are no longer pointers,
34 // we can't use pointers directly here either.  Instead, head refers
35 // to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
36 // head == 0 represents the NULL list.  This is okay because instruction #0
37 // is always the fail instruction, which never appears on a list.
38 struct PatchList {
39   // Returns patch list containing just p.
Mkre2::PatchList40   static PatchList Mk(uint32_t p) {
41     return {p, p};
42   }
43 
44   // Patches all the entries on l to have value p.
45   // Caller must not ever use patch list again.
Patchre2::PatchList46   static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
47     while (l.head != 0) {
48       Prog::Inst* ip = &inst0[l.head>>1];
49       if (l.head&1) {
50         l.head = ip->out1();
51         ip->out1_ = p;
52       } else {
53         l.head = ip->out();
54         ip->set_out(p);
55       }
56     }
57   }
58 
59   // Appends two patch lists and returns result.
Appendre2::PatchList60   static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
61     if (l1.head == 0)
62       return l2;
63     if (l2.head == 0)
64       return l1;
65     Prog::Inst* ip = &inst0[l1.tail>>1];
66     if (l1.tail&1)
67       ip->out1_ = l2.head;
68     else
69       ip->set_out(l2.head);
70     return {l1.head, l2.tail};
71   }
72 
73   uint32_t head;
74   uint32_t tail;  // for constant-time append
75 };
76 
77 static const PatchList kNullPatchList = {0, 0};
78 
79 // Compiled program fragment.
80 struct Frag {
81   uint32_t begin;
82   PatchList end;
83   bool nullable;
84 
Fragre2::Frag85   Frag() : begin(0), end(kNullPatchList), nullable(false) {}
Fragre2::Frag86   Frag(uint32_t begin, PatchList end, bool nullable)
87       : begin(begin), end(end), nullable(nullable) {}
88 };
89 
90 // Input encodings.
91 enum Encoding {
92   kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
93   kEncodingLatin1,    // Latin-1 (0-FF)
94 };
95 
96 class Compiler : public Regexp::Walker<Frag> {
97  public:
98   explicit Compiler();
99   ~Compiler();
100 
101   // Compiles Regexp to a new Prog.
102   // Caller is responsible for deleting Prog when finished with it.
103   // If reversed is true, compiles for walking over the input
104   // string backward (reverses all concatenations).
105   static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
106 
107   // Compiles alternation of all the re to a new Prog.
108   // Each re has a match with an id equal to its index in the vector.
109   static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
110 
111   // Interface for Regexp::Walker, which helps traverse the Regexp.
112   // The walk is purely post-recursive: given the machines for the
113   // children, PostVisit combines them to create the machine for
114   // the current node.  The child_args are Frags.
115   // The Compiler traverses the Regexp parse tree, visiting
116   // each node in depth-first order.  It invokes PreVisit before
117   // visiting the node's children and PostVisit after visiting
118   // the children.
119   Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
120   Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
121                  int nchild_args);
122   Frag ShortVisit(Regexp* re, Frag parent_arg);
123   Frag Copy(Frag arg);
124 
125   // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
126   Frag Plus(Frag a, bool nongreedy);
127   Frag Star(Frag a, bool nongreedy);
128   Frag Quest(Frag a, bool nongreedy);
129 
130   // Given fragment a, returns (a) capturing as \n.
131   Frag Capture(Frag a, int n);
132 
133   // Given fragments a and b, returns ab; a|b
134   Frag Cat(Frag a, Frag b);
135   Frag Alt(Frag a, Frag b);
136 
137   // Returns a fragment that can't match anything.
138   Frag NoMatch();
139 
140   // Returns a fragment that matches the empty string.
141   Frag Match(int32_t id);
142 
143   // Returns a no-op fragment.
144   Frag Nop();
145 
146   // Returns a fragment matching the byte range lo-hi.
147   Frag ByteRange(int lo, int hi, bool foldcase);
148 
149   // Returns a fragment matching an empty-width special op.
150   Frag EmptyWidth(EmptyOp op);
151 
152   // Adds n instructions to the program.
153   // Returns the index of the first one.
154   // Returns -1 if no more instructions are available.
155   int AllocInst(int n);
156 
157   // Rune range compiler.
158 
159   // Begins a new alternation.
160   void BeginRange();
161 
162   // Adds a fragment matching the rune range lo-hi.
163   void AddRuneRange(Rune lo, Rune hi, bool foldcase);
164   void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
165   void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
166   void Add_80_10ffff();
167 
168   // New suffix that matches the byte range lo-hi, then goes to next.
169   int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
170   int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
171 
172   // Returns true iff the suffix is cached.
173   bool IsCachedRuneByteSuffix(int id);
174 
175   // Adds a suffix to alternation.
176   void AddSuffix(int id);
177 
178   // Adds a suffix to the trie starting from the given root node.
179   // Returns zero iff allocating an instruction fails. Otherwise, returns
180   // the current root node, which might be different from what was given.
181   int AddSuffixRecursive(int root, int id);
182 
183   // Finds the trie node for the given suffix. Returns a Frag in order to
184   // distinguish between pointing at the root node directly (end.head == 0)
185   // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively).
186   Frag FindByteRange(int root, int id);
187 
188   // Compares two ByteRanges and returns true iff they are equal.
189   bool ByteRangeEqual(int id1, int id2);
190 
191   // Returns the alternation of all the added suffixes.
192   Frag EndRange();
193 
194   // Single rune.
195   Frag Literal(Rune r, bool foldcase);
196 
197   void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
198   Prog* Finish(Regexp* re);
199 
200   // Returns .* where dot = any byte
201   Frag DotStar();
202 
203  private:
204   Prog* prog_;         // Program being built.
205   bool failed_;        // Did we give up compiling?
206   Encoding encoding_;  // Input encoding
207   bool reversed_;      // Should program run backward over text?
208 
209   PODArray<Prog::Inst> inst_;
210   int ninst_;          // Number of instructions used.
211   int max_ninst_;      // Maximum number of instructions.
212 
213   int64_t max_mem_;    // Total memory budget.
214 
215   absl::flat_hash_map<uint64_t, int> rune_cache_;
216   Frag rune_range_;
217 
218   RE2::Anchor anchor_;  // anchor mode for RE2::Set
219 
220   Compiler(const Compiler&) = delete;
221   Compiler& operator=(const Compiler&) = delete;
222 };
223 
Compiler()224 Compiler::Compiler() {
225   prog_ = new Prog();
226   failed_ = false;
227   encoding_ = kEncodingUTF8;
228   reversed_ = false;
229   ninst_ = 0;
230   max_ninst_ = 1;  // make AllocInst for fail instruction okay
231   max_mem_ = 0;
232   int fail = AllocInst(1);
233   inst_[fail].InitFail();
234   max_ninst_ = 0;  // Caller must change
235 }
236 
~Compiler()237 Compiler::~Compiler() {
238   delete prog_;
239 }
240 
AllocInst(int n)241 int Compiler::AllocInst(int n) {
242   if (failed_ || ninst_ + n > max_ninst_) {
243     failed_ = true;
244     return -1;
245   }
246 
247   if (ninst_ + n > inst_.size()) {
248     int cap = inst_.size();
249     if (cap == 0)
250       cap = 8;
251     while (ninst_ + n > cap)
252       cap *= 2;
253     PODArray<Prog::Inst> inst(cap);
254     if (inst_.data() != NULL)
255       memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
256     memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
257     inst_ = std::move(inst);
258   }
259   int id = ninst_;
260   ninst_ += n;
261   return id;
262 }
263 
264 // These routines are somewhat hard to visualize in text --
265 // see http://swtch.com/~rsc/regexp/regexp1.html for
266 // pictures explaining what is going on here.
267 
268 // Returns an unmatchable fragment.
NoMatch()269 Frag Compiler::NoMatch() {
270   return Frag();
271 }
272 
273 // Is a an unmatchable fragment?
IsNoMatch(Frag a)274 static bool IsNoMatch(Frag a) {
275   return a.begin == 0;
276 }
277 
278 // Given fragments a and b, returns fragment for ab.
Cat(Frag a,Frag b)279 Frag Compiler::Cat(Frag a, Frag b) {
280   if (IsNoMatch(a) || IsNoMatch(b))
281     return NoMatch();
282 
283   // Elide no-op.
284   Prog::Inst* begin = &inst_[a.begin];
285   if (begin->opcode() == kInstNop &&
286       a.end.head == (a.begin << 1) &&
287       begin->out() == 0) {
288     // in case refs to a somewhere
289     PatchList::Patch(inst_.data(), a.end, b.begin);
290     return b;
291   }
292 
293   // To run backward over string, reverse all concatenations.
294   if (reversed_) {
295     PatchList::Patch(inst_.data(), b.end, a.begin);
296     return Frag(b.begin, a.end, b.nullable && a.nullable);
297   }
298 
299   PatchList::Patch(inst_.data(), a.end, b.begin);
300   return Frag(a.begin, b.end, a.nullable && b.nullable);
301 }
302 
303 // Given fragments for a and b, returns fragment for a|b.
Alt(Frag a,Frag b)304 Frag Compiler::Alt(Frag a, Frag b) {
305   // Special case for convenience in loops.
306   if (IsNoMatch(a))
307     return b;
308   if (IsNoMatch(b))
309     return a;
310 
311   int id = AllocInst(1);
312   if (id < 0)
313     return NoMatch();
314 
315   inst_[id].InitAlt(a.begin, b.begin);
316   return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
317               a.nullable || b.nullable);
318 }
319 
320 // When capturing submatches in like-Perl mode, a kOpAlt Inst
321 // treats out_ as the first choice, out1_ as the second.
322 //
323 // For *, +, and ?, if out_ causes another repetition,
324 // then the operator is greedy.  If out1_ is the repetition
325 // (and out_ moves forward), then the operator is non-greedy.
326 
327 // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Plus(Frag a,bool nongreedy)328 Frag Compiler::Plus(Frag a, bool nongreedy) {
329   int id = AllocInst(1);
330   if (id < 0)
331     return NoMatch();
332   PatchList pl;
333   if (nongreedy) {
334     inst_[id].InitAlt(0, a.begin);
335     pl = PatchList::Mk(id << 1);
336   } else {
337     inst_[id].InitAlt(a.begin, 0);
338     pl = PatchList::Mk((id << 1) | 1);
339   }
340   PatchList::Patch(inst_.data(), a.end, id);
341   return Frag(a.begin, pl, a.nullable);
342 }
343 
344 // Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
Star(Frag a,bool nongreedy)345 Frag Compiler::Star(Frag a, bool nongreedy) {
346   // When the subexpression is nullable, one Alt isn't enough to guarantee
347   // correct priority ordering within the transitive closure. The simplest
348   // solution is to handle it as (a+)? instead, which adds the second Alt.
349   if (a.nullable)
350     return Quest(Plus(a, nongreedy), nongreedy);
351 
352   int id = AllocInst(1);
353   if (id < 0)
354     return NoMatch();
355   PatchList pl;
356   if (nongreedy) {
357     inst_[id].InitAlt(0, a.begin);
358     pl = PatchList::Mk(id << 1);
359   } else {
360     inst_[id].InitAlt(a.begin, 0);
361     pl = PatchList::Mk((id << 1) | 1);
362   }
363   PatchList::Patch(inst_.data(), a.end, id);
364   return Frag(id, pl, true);
365 }
366 
367 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Quest(Frag a,bool nongreedy)368 Frag Compiler::Quest(Frag a, bool nongreedy) {
369   if (IsNoMatch(a))
370     return Nop();
371   int id = AllocInst(1);
372   if (id < 0)
373     return NoMatch();
374   PatchList pl;
375   if (nongreedy) {
376     inst_[id].InitAlt(0, a.begin);
377     pl = PatchList::Mk(id << 1);
378   } else {
379     inst_[id].InitAlt(a.begin, 0);
380     pl = PatchList::Mk((id << 1) | 1);
381   }
382   return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
383 }
384 
385 // Returns a fragment for the byte range lo-hi.
ByteRange(int lo,int hi,bool foldcase)386 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
387   int id = AllocInst(1);
388   if (id < 0)
389     return NoMatch();
390   inst_[id].InitByteRange(lo, hi, foldcase, 0);
391   return Frag(id, PatchList::Mk(id << 1), false);
392 }
393 
394 // Returns a no-op fragment.  Sometimes unavoidable.
Nop()395 Frag Compiler::Nop() {
396   int id = AllocInst(1);
397   if (id < 0)
398     return NoMatch();
399   inst_[id].InitNop(0);
400   return Frag(id, PatchList::Mk(id << 1), true);
401 }
402 
403 // Returns a fragment that signals a match.
Match(int32_t match_id)404 Frag Compiler::Match(int32_t match_id) {
405   int id = AllocInst(1);
406   if (id < 0)
407     return NoMatch();
408   inst_[id].InitMatch(match_id);
409   return Frag(id, kNullPatchList, false);
410 }
411 
412 // Returns a fragment matching a particular empty-width op (like ^ or $)
EmptyWidth(EmptyOp empty)413 Frag Compiler::EmptyWidth(EmptyOp empty) {
414   int id = AllocInst(1);
415   if (id < 0)
416     return NoMatch();
417   inst_[id].InitEmptyWidth(empty, 0);
418   return Frag(id, PatchList::Mk(id << 1), true);
419 }
420 
421 // Given a fragment a, returns a fragment with capturing parens around a.
Capture(Frag a,int n)422 Frag Compiler::Capture(Frag a, int n) {
423   if (IsNoMatch(a))
424     return NoMatch();
425   int id = AllocInst(2);
426   if (id < 0)
427     return NoMatch();
428   inst_[id].InitCapture(2*n, a.begin);
429   inst_[id+1].InitCapture(2*n+1, 0);
430   PatchList::Patch(inst_.data(), a.end, id+1);
431 
432   return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
433 }
434 
435 // A Rune is a name for a Unicode code point.
436 // Returns maximum rune encoded by UTF-8 sequence of length len.
MaxRune(int len)437 static int MaxRune(int len) {
438   int b;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
439   if (len == 1)
440     b = 7;
441   else
442     b = 8-(len+1) + 6*(len-1);
443   return (1<<b) - 1;   // maximum Rune for b bits.
444 }
445 
446 // The rune range compiler caches common suffix fragments,
447 // which are very common in UTF-8 (e.g., [80-bf]).
448 // The fragment suffixes are identified by their start
449 // instructions.  NULL denotes the eventual end match.
450 // The Frag accumulates in rune_range_.  Caching common
451 // suffixes reduces the UTF-8 "." from 32 to 24 instructions,
452 // and it reduces the corresponding one-pass NFA from 16 nodes to 8.
453 
BeginRange()454 void Compiler::BeginRange() {
455   rune_cache_.clear();
456   rune_range_.begin = 0;
457   rune_range_.end = kNullPatchList;
458 }
459 
UncachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)460 int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
461                                      int next) {
462   Frag f = ByteRange(lo, hi, foldcase);
463   if (next != 0) {
464     PatchList::Patch(inst_.data(), f.end, next);
465   } else {
466     rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
467   }
468   return f.begin;
469 }
470 
MakeRuneCacheKey(uint8_t lo,uint8_t hi,bool foldcase,int next)471 static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
472                                  int next) {
473   return (uint64_t)next << 17 |
474          (uint64_t)lo   <<  9 |
475          (uint64_t)hi   <<  1 |
476          (uint64_t)foldcase;
477 }
478 
CachedRuneByteSuffix(uint8_t lo,uint8_t hi,bool foldcase,int next)479 int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
480                                    int next) {
481   uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
482   absl::flat_hash_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
483   if (it != rune_cache_.end())
484     return it->second;
485   int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
486   rune_cache_[key] = id;
487   return id;
488 }
489 
IsCachedRuneByteSuffix(int id)490 bool Compiler::IsCachedRuneByteSuffix(int id) {
491   uint8_t lo = inst_[id].lo_;
492   uint8_t hi = inst_[id].hi_;
493   bool foldcase = inst_[id].foldcase() != 0;
494   int next = inst_[id].out();
495 
496   uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
497   return rune_cache_.find(key) != rune_cache_.end();
498 }
499 
AddSuffix(int id)500 void Compiler::AddSuffix(int id) {
501   if (failed_)
502     return;
503 
504   if (rune_range_.begin == 0) {
505     rune_range_.begin = id;
506     return;
507   }
508 
509   if (encoding_ == kEncodingUTF8) {
510     // Build a trie in order to reduce fanout.
511     rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
512     return;
513   }
514 
515   int alt = AllocInst(1);
516   if (alt < 0) {
517     rune_range_.begin = 0;
518     return;
519   }
520   inst_[alt].InitAlt(rune_range_.begin, id);
521   rune_range_.begin = alt;
522 }
523 
AddSuffixRecursive(int root,int id)524 int Compiler::AddSuffixRecursive(int root, int id) {
525   DCHECK(inst_[root].opcode() == kInstAlt ||
526          inst_[root].opcode() == kInstByteRange);
527 
528   Frag f = FindByteRange(root, id);
529   if (IsNoMatch(f)) {
530     int alt = AllocInst(1);
531     if (alt < 0)
532       return 0;
533     inst_[alt].InitAlt(root, id);
534     return alt;
535   }
536 
537   int br;
538   if (f.end.head == 0)
539     br = root;
540   else if (f.end.head&1)
541     br = inst_[f.begin].out1();
542   else
543     br = inst_[f.begin].out();
544 
545   if (IsCachedRuneByteSuffix(br)) {
546     // We can't fiddle with cached suffixes, so make a clone of the head.
547     int byterange = AllocInst(1);
548     if (byterange < 0)
549       return 0;
550     inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
551                                    inst_[br].foldcase(), inst_[br].out());
552 
553     // Ensure that the parent points to the clone, not to the original.
554     // Note that this could leave the head unreachable except via the cache.
555     br = byterange;
556     if (f.end.head == 0)
557       root = br;
558     else if (f.end.head&1)
559       inst_[f.begin].out1_ = br;
560     else
561       inst_[f.begin].set_out(br);
562   }
563 
564   int out = inst_[id].out();
565   if (!IsCachedRuneByteSuffix(id)) {
566     // The head should be the instruction most recently allocated, so free it
567     // instead of leaving it unreachable.
568     DCHECK_EQ(id, ninst_-1);
569     inst_[id].out_opcode_ = 0;
570     inst_[id].out1_ = 0;
571     ninst_--;
572   }
573 
574   out = AddSuffixRecursive(inst_[br].out(), out);
575   if (out == 0)
576     return 0;
577 
578   inst_[br].set_out(out);
579   return root;
580 }
581 
ByteRangeEqual(int id1,int id2)582 bool Compiler::ByteRangeEqual(int id1, int id2) {
583   return inst_[id1].lo() == inst_[id2].lo() &&
584          inst_[id1].hi() == inst_[id2].hi() &&
585          inst_[id1].foldcase() == inst_[id2].foldcase();
586 }
587 
FindByteRange(int root,int id)588 Frag Compiler::FindByteRange(int root, int id) {
589   if (inst_[root].opcode() == kInstByteRange) {
590     if (ByteRangeEqual(root, id))
591       return Frag(root, kNullPatchList, false);
592     else
593       return NoMatch();
594   }
595 
596   while (inst_[root].opcode() == kInstAlt) {
597     int out1 = inst_[root].out1();
598     if (ByteRangeEqual(out1, id))
599       return Frag(root, PatchList::Mk((root << 1) | 1), false);
600 
601     // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
602     // what we're looking for, then we can stop immediately. Unfortunately, we
603     // can't short-circuit the search in reverse mode.
604     if (!reversed_)
605       return NoMatch();
606 
607     int out = inst_[root].out();
608     if (inst_[out].opcode() == kInstAlt)
609       root = out;
610     else if (ByteRangeEqual(out, id))
611       return Frag(root, PatchList::Mk(root << 1), false);
612     else
613       return NoMatch();
614   }
615 
616   LOG(DFATAL) << "should never happen";
617   return NoMatch();
618 }
619 
EndRange()620 Frag Compiler::EndRange() {
621   return rune_range_;
622 }
623 
624 // Converts rune range lo-hi into a fragment that recognizes
625 // the bytes that would make up those runes in the current
626 // encoding (Latin 1 or UTF-8).
627 // This lets the machine work byte-by-byte even when
628 // using multibyte encodings.
629 
AddRuneRange(Rune lo,Rune hi,bool foldcase)630 void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
631   switch (encoding_) {
632     default:
633     case kEncodingUTF8:
634       AddRuneRangeUTF8(lo, hi, foldcase);
635       break;
636     case kEncodingLatin1:
637       AddRuneRangeLatin1(lo, hi, foldcase);
638       break;
639   }
640 }
641 
AddRuneRangeLatin1(Rune lo,Rune hi,bool foldcase)642 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
643   // Latin-1 is easy: runes *are* bytes.
644   if (lo > hi || lo > 0xFF)
645     return;
646   if (hi > 0xFF)
647     hi = 0xFF;
648   AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
649                                    static_cast<uint8_t>(hi), foldcase, 0));
650 }
651 
Add_80_10ffff()652 void Compiler::Add_80_10ffff() {
653   // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
654   // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
655   // permitting overlong encodings in E0 and F0 sequences and code points
656   // over 10FFFF in F4 sequences, the size of the bytecode and the number
657   // of equivalence classes are reduced significantly.
658   int id;
659   if (reversed_) {
660     // Prefix factoring matters, but we don't have to handle it here
661     // because the rune range trie logic takes care of that already.
662     id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
663     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
664     AddSuffix(id);
665 
666     id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
667     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
668     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
669     AddSuffix(id);
670 
671     id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
672     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
673     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
674     id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
675     AddSuffix(id);
676   } else {
677     // Suffix factoring matters - and we do have to handle it here.
678     int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
679     id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
680     AddSuffix(id);
681 
682     int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
683     id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
684     AddSuffix(id);
685 
686     int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
687     id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
688     AddSuffix(id);
689   }
690 }
691 
AddRuneRangeUTF8(Rune lo,Rune hi,bool foldcase)692 void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
693   if (lo > hi)
694     return;
695 
696   // Pick off 80-10FFFF as a common special case.
697   if (lo == 0x80 && hi == 0x10ffff) {
698     Add_80_10ffff();
699     return;
700   }
701 
702   // Split range into same-length sized ranges.
703   for (int i = 1; i < UTFmax; i++) {
704     Rune max = MaxRune(i);
705     if (lo <= max && max < hi) {
706       AddRuneRangeUTF8(lo, max, foldcase);
707       AddRuneRangeUTF8(max+1, hi, foldcase);
708       return;
709     }
710   }
711 
712   // ASCII range is always a special case.
713   if (hi < Runeself) {
714     AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
715                                      static_cast<uint8_t>(hi), foldcase, 0));
716     return;
717   }
718 
719   // Split range into sections that agree on leading bytes.
720   for (int i = 1; i < UTFmax; i++) {
721     uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
722     if ((lo & ~m) != (hi & ~m)) {
723       if ((lo & m) != 0) {
724         AddRuneRangeUTF8(lo, lo|m, foldcase);
725         AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
726         return;
727       }
728       if ((hi & m) != m) {
729         AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
730         AddRuneRangeUTF8(hi&~m, hi, foldcase);
731         return;
732       }
733     }
734   }
735 
736   // Finally.  Generate byte matching equivalent for lo-hi.
737   uint8_t ulo[UTFmax], uhi[UTFmax];
738   int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
739   int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
740   (void)m;  // USED(m)
741   DCHECK_EQ(n, m);
742 
743   // The logic below encodes this thinking:
744   //
745   // 1. When we have built the whole suffix, we know that it cannot
746   // possibly be a suffix of anything longer: in forward mode, nothing
747   // else can occur before the leading byte; in reverse mode, nothing
748   // else can occur after the last continuation byte or else the leading
749   // byte would have to change. Thus, there is no benefit to caching
750   // the first byte of the suffix whereas there is a cost involved in
751   // cloning it if it begins a common prefix, which is fairly likely.
752   //
753   // 2. Conversely, the last byte of the suffix cannot possibly be a
754   // prefix of anything because next == 0, so we will never want to
755   // clone it, but it is fairly likely to be a common suffix. Perhaps
756   // more so in reverse mode than in forward mode because the former is
757   // "converging" towards lower entropy, but caching is still worthwhile
758   // for the latter in cases such as 80-BF.
759   //
760   // 3. Handling the bytes between the first and the last is less
761   // straightforward and, again, the approach depends on whether we are
762   // "converging" towards lower entropy: in forward mode, a single byte
763   // is unlikely to be part of a common suffix whereas a byte range
764   // is more likely so; in reverse mode, a byte range is unlikely to
765   // be part of a common suffix whereas a single byte is more likely
766   // so. The same benefit versus cost argument applies here.
767   int id = 0;
768   if (reversed_) {
769     for (int i = 0; i < n; i++) {
770       // In reverse UTF-8 mode: cache the leading byte; don't cache the last
771       // continuation byte; cache anything else iff it's a single byte (XX-XX).
772       if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
773         id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
774       else
775         id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
776     }
777   } else {
778     for (int i = n-1; i >= 0; i--) {
779       // In forward UTF-8 mode: don't cache the leading byte; cache the last
780       // continuation byte; cache anything else iff it's a byte range (XX-YY).
781       if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
782         id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
783       else
784         id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
785     }
786   }
787   AddSuffix(id);
788 }
789 
790 // Should not be called.
Copy(Frag arg)791 Frag Compiler::Copy(Frag arg) {
792   // We're using WalkExponential; there should be no copying.
793   failed_ = true;
794   LOG(DFATAL) << "Compiler::Copy called!";
795   return NoMatch();
796 }
797 
798 // Visits a node quickly; called once WalkExponential has
799 // decided to cut this walk short.
ShortVisit(Regexp * re,Frag)800 Frag Compiler::ShortVisit(Regexp* re, Frag) {
801   failed_ = true;
802   return NoMatch();
803 }
804 
805 // Called before traversing a node's children during the walk.
PreVisit(Regexp * re,Frag,bool * stop)806 Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
807   // Cut off walk if we've already failed.
808   if (failed_)
809     *stop = true;
810 
811   return Frag();  // not used by caller
812 }
813 
Literal(Rune r,bool foldcase)814 Frag Compiler::Literal(Rune r, bool foldcase) {
815   switch (encoding_) {
816     default:
817       return Frag();
818 
819     case kEncodingLatin1:
820       return ByteRange(r, r, foldcase);
821 
822     case kEncodingUTF8: {
823       if (r < Runeself)  // Make common case fast.
824         return ByteRange(r, r, foldcase);
825       uint8_t buf[UTFmax];
826       int n = runetochar(reinterpret_cast<char*>(buf), &r);
827       Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
828       for (int i = 1; i < n; i++)
829         f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
830       return f;
831     }
832   }
833 }
834 
835 // Called after traversing the node's children during the walk.
836 // Given their frags, build and return the frag for this re.
PostVisit(Regexp * re,Frag,Frag,Frag * child_frags,int nchild_frags)837 Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
838                          int nchild_frags) {
839   // If a child failed, don't bother going forward, especially
840   // since the child_frags might contain Frags with NULLs in them.
841   if (failed_)
842     return NoMatch();
843 
844   // Given the child fragments, return the fragment for this node.
845   switch (re->op()) {
846     case kRegexpRepeat:
847       // Should not see; code at bottom of function will print error
848       break;
849 
850     case kRegexpNoMatch:
851       return NoMatch();
852 
853     case kRegexpEmptyMatch:
854       return Nop();
855 
856     case kRegexpHaveMatch: {
857       Frag f = Match(re->match_id());
858       if (anchor_ == RE2::ANCHOR_BOTH) {
859         // Append \z or else the subexpression will effectively be unanchored.
860         // Complemented by the UNANCHORED case in CompileSet().
861         f = Cat(EmptyWidth(kEmptyEndText), f);
862       }
863       return f;
864     }
865 
866     case kRegexpConcat: {
867       Frag f = child_frags[0];
868       for (int i = 1; i < nchild_frags; i++)
869         f = Cat(f, child_frags[i]);
870       return f;
871     }
872 
873     case kRegexpAlternate: {
874       Frag f = child_frags[0];
875       for (int i = 1; i < nchild_frags; i++)
876         f = Alt(f, child_frags[i]);
877       return f;
878     }
879 
880     case kRegexpStar:
881       return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
882 
883     case kRegexpPlus:
884       return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
885 
886     case kRegexpQuest:
887       return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
888 
889     case kRegexpLiteral:
890       return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
891 
892     case kRegexpLiteralString: {
893       // Concatenation of literals.
894       if (re->nrunes() == 0)
895         return Nop();
896       Frag f;
897       for (int i = 0; i < re->nrunes(); i++) {
898         Frag f1 = Literal(re->runes()[i],
899                           (re->parse_flags()&Regexp::FoldCase) != 0);
900         if (i == 0)
901           f = f1;
902         else
903           f = Cat(f, f1);
904       }
905       return f;
906     }
907 
908     case kRegexpAnyChar:
909       BeginRange();
910       AddRuneRange(0, Runemax, false);
911       return EndRange();
912 
913     case kRegexpAnyByte:
914       return ByteRange(0x00, 0xFF, false);
915 
916     case kRegexpCharClass: {
917       CharClass* cc = re->cc();
918       if (cc->empty()) {
919         // This can't happen.
920         failed_ = true;
921         LOG(DFATAL) << "No ranges in char class";
922         return NoMatch();
923       }
924 
925       // ASCII case-folding optimization: if the char class
926       // behaves the same on A-Z as it does on a-z,
927       // discard any ranges wholly contained in A-Z
928       // and mark the other ranges as foldascii.
929       // This reduces the size of a program for
930       // (?i)abc from 3 insts per letter to 1 per letter.
931       bool foldascii = cc->FoldsASCII();
932 
933       // Character class is just a big OR of the different
934       // character ranges in the class.
935       BeginRange();
936       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
937         // ASCII case-folding optimization (see above).
938         if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
939           continue;
940 
941         // If this range contains all of A-Za-z or none of it,
942         // the fold flag is unnecessary; don't bother.
943         bool fold = foldascii;
944         if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
945             ('Z' < i->lo && i->hi < 'a'))
946           fold = false;
947 
948         AddRuneRange(i->lo, i->hi, fold);
949       }
950       return EndRange();
951     }
952 
953     case kRegexpCapture:
954       // If this is a non-capturing parenthesis -- (?:foo) --
955       // just use the inner expression.
956       if (re->cap() < 0)
957         return child_frags[0];
958       return Capture(child_frags[0], re->cap());
959 
960     case kRegexpBeginLine:
961       return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
962 
963     case kRegexpEndLine:
964       return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
965 
966     case kRegexpBeginText:
967       return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
968 
969     case kRegexpEndText:
970       return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
971 
972     case kRegexpWordBoundary:
973       return EmptyWidth(kEmptyWordBoundary);
974 
975     case kRegexpNoWordBoundary:
976       return EmptyWidth(kEmptyNonWordBoundary);
977   }
978   failed_ = true;
979   LOG(DFATAL) << "Missing case in Compiler: " << re->op();
980   return NoMatch();
981 }
982 
983 // Is this regexp required to start at the beginning of the text?
984 // Only approximate; can return false for complicated regexps like (\Aa|\Ab),
985 // but handles (\A(a|b)).  Could use the Walker to write a more exact one.
IsAnchorStart(Regexp ** pre,int depth)986 static bool IsAnchorStart(Regexp** pre, int depth) {
987   Regexp* re = *pre;
988   Regexp* sub;
989   // The depth limit makes sure that we don't overflow
990   // the stack on a deeply nested regexp.  As the comment
991   // above says, IsAnchorStart is conservative, so returning
992   // a false negative is okay.  The exact limit is somewhat arbitrary.
993   if (re == NULL || depth >= 4)
994     return false;
995   switch (re->op()) {
996     default:
997       break;
998     case kRegexpConcat:
999       if (re->nsub() > 0) {
1000         sub = re->sub()[0]->Incref();
1001         if (IsAnchorStart(&sub, depth+1)) {
1002           PODArray<Regexp*> subcopy(re->nsub());
1003           subcopy[0] = sub;  // already have reference
1004           for (int i = 1; i < re->nsub(); i++)
1005             subcopy[i] = re->sub()[i]->Incref();
1006           *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1007           re->Decref();
1008           return true;
1009         }
1010         sub->Decref();
1011       }
1012       break;
1013     case kRegexpCapture:
1014       sub = re->sub()[0]->Incref();
1015       if (IsAnchorStart(&sub, depth+1)) {
1016         *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1017         re->Decref();
1018         return true;
1019       }
1020       sub->Decref();
1021       break;
1022     case kRegexpBeginText:
1023       *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1024       re->Decref();
1025       return true;
1026   }
1027   return false;
1028 }
1029 
1030 // Is this regexp required to start at the end of the text?
1031 // Only approximate; can return false for complicated regexps like (a\z|b\z),
1032 // but handles ((a|b)\z).  Could use the Walker to write a more exact one.
IsAnchorEnd(Regexp ** pre,int depth)1033 static bool IsAnchorEnd(Regexp** pre, int depth) {
1034   Regexp* re = *pre;
1035   Regexp* sub;
1036   // The depth limit makes sure that we don't overflow
1037   // the stack on a deeply nested regexp.  As the comment
1038   // above says, IsAnchorEnd is conservative, so returning
1039   // a false negative is okay.  The exact limit is somewhat arbitrary.
1040   if (re == NULL || depth >= 4)
1041     return false;
1042   switch (re->op()) {
1043     default:
1044       break;
1045     case kRegexpConcat:
1046       if (re->nsub() > 0) {
1047         sub = re->sub()[re->nsub() - 1]->Incref();
1048         if (IsAnchorEnd(&sub, depth+1)) {
1049           PODArray<Regexp*> subcopy(re->nsub());
1050           subcopy[re->nsub() - 1] = sub;  // already have reference
1051           for (int i = 0; i < re->nsub() - 1; i++)
1052             subcopy[i] = re->sub()[i]->Incref();
1053           *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1054           re->Decref();
1055           return true;
1056         }
1057         sub->Decref();
1058       }
1059       break;
1060     case kRegexpCapture:
1061       sub = re->sub()[0]->Incref();
1062       if (IsAnchorEnd(&sub, depth+1)) {
1063         *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1064         re->Decref();
1065         return true;
1066       }
1067       sub->Decref();
1068       break;
1069     case kRegexpEndText:
1070       *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1071       re->Decref();
1072       return true;
1073   }
1074   return false;
1075 }
1076 
Setup(Regexp::ParseFlags flags,int64_t max_mem,RE2::Anchor anchor)1077 void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
1078                      RE2::Anchor anchor) {
1079   if (flags & Regexp::Latin1)
1080     encoding_ = kEncodingLatin1;
1081   max_mem_ = max_mem;
1082   if (max_mem <= 0) {
1083     max_ninst_ = 100000;  // more than enough
1084   } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
1085     // No room for anything.
1086     max_ninst_ = 0;
1087   } else {
1088     int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
1089     // Limit instruction count so that inst->id() fits nicely in an int.
1090     // SparseArray also assumes that the indices (inst->id()) are ints.
1091     // The call to WalkExponential uses 2*max_ninst_ below,
1092     // and other places in the code use 2 or 3 * prog->size().
1093     // Limiting to 2^24 should avoid overflow in those places.
1094     // (The point of allowing more than 32 bits of memory is to
1095     // have plenty of room for the DFA states, not to use it up
1096     // on the program.)
1097     if (m >= 1<<24)
1098       m = 1<<24;
1099     // Inst imposes its own limit (currently bigger than 2^24 but be safe).
1100     if (m > Prog::Inst::kMaxInst)
1101       m = Prog::Inst::kMaxInst;
1102     max_ninst_ = static_cast<int>(m);
1103   }
1104   anchor_ = anchor;
1105 }
1106 
1107 // Compiles re, returning program.
1108 // Caller is responsible for deleting prog_.
1109 // If reversed is true, compiles a program that expects
1110 // to run over the input string backward (reverses all concatenations).
1111 // The reversed flag is also recorded in the returned program.
Compile(Regexp * re,bool reversed,int64_t max_mem)1112 Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
1113   Compiler c;
1114   c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
1115   c.reversed_ = reversed;
1116 
1117   // Simplify to remove things like counted repetitions
1118   // and character classes like \d.
1119   Regexp* sre = re->Simplify();
1120   if (sre == NULL)
1121     return NULL;
1122 
1123   // Record whether prog is anchored, removing the anchors.
1124   // (They get in the way of other optimizations.)
1125   bool is_anchor_start = IsAnchorStart(&sre, 0);
1126   bool is_anchor_end = IsAnchorEnd(&sre, 0);
1127 
1128   // Generate fragment for entire regexp.
1129   Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1130   sre->Decref();
1131   if (c.failed_)
1132     return NULL;
1133 
1134   // Success!  Finish by putting Match node at end, and record start.
1135   // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1136   // to behave normally.
1137   c.reversed_ = false;
1138   all = c.Cat(all, c.Match(0));
1139 
1140   c.prog_->set_reversed(reversed);
1141   if (c.prog_->reversed()) {
1142     c.prog_->set_anchor_start(is_anchor_end);
1143     c.prog_->set_anchor_end(is_anchor_start);
1144   } else {
1145     c.prog_->set_anchor_start(is_anchor_start);
1146     c.prog_->set_anchor_end(is_anchor_end);
1147   }
1148 
1149   c.prog_->set_start(all.begin);
1150   if (!c.prog_->anchor_start()) {
1151     // Also create unanchored version, which starts with a .*? loop.
1152     all = c.Cat(c.DotStar(), all);
1153   }
1154   c.prog_->set_start_unanchored(all.begin);
1155 
1156   // Hand ownership of prog_ to caller.
1157   return c.Finish(re);
1158 }
1159 
Finish(Regexp * re)1160 Prog* Compiler::Finish(Regexp* re) {
1161   if (failed_)
1162     return NULL;
1163 
1164   if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1165     // No possible matches; keep Fail instruction only.
1166     ninst_ = 1;
1167   }
1168 
1169   // Hand off the array to Prog.
1170   prog_->inst_ = std::move(inst_);
1171   prog_->size_ = ninst_;
1172 
1173   prog_->Optimize();
1174   prog_->Flatten();
1175   prog_->ComputeByteMap();
1176 
1177   if (!prog_->reversed()) {
1178     std::string prefix;
1179     bool prefix_foldcase;
1180     if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase))
1181       prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
1182   }
1183 
1184   // Record remaining memory for DFA.
1185   if (max_mem_ <= 0) {
1186     prog_->set_dfa_mem(1<<20);
1187   } else {
1188     int64_t m = max_mem_ - sizeof(Prog);
1189     m -= prog_->size_*sizeof(Prog::Inst);  // account for inst_
1190     if (prog_->CanBitState())
1191       m -= prog_->size_*sizeof(uint16_t);  // account for list_heads_
1192     if (m < 0)
1193       m = 0;
1194     prog_->set_dfa_mem(m);
1195   }
1196 
1197   Prog* p = prog_;
1198   prog_ = NULL;
1199   return p;
1200 }
1201 
1202 // Converts Regexp to Prog.
CompileToProg(int64_t max_mem)1203 Prog* Regexp::CompileToProg(int64_t max_mem) {
1204   return Compiler::Compile(this, false, max_mem);
1205 }
1206 
CompileToReverseProg(int64_t max_mem)1207 Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
1208   return Compiler::Compile(this, true, max_mem);
1209 }
1210 
DotStar()1211 Frag Compiler::DotStar() {
1212   return Star(ByteRange(0x00, 0xff, false), true);
1213 }
1214 
1215 // Compiles RE set to Prog.
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1216 Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1217   Compiler c;
1218   c.Setup(re->parse_flags(), max_mem, anchor);
1219 
1220   Regexp* sre = re->Simplify();
1221   if (sre == NULL)
1222     return NULL;
1223 
1224   Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1225   sre->Decref();
1226   if (c.failed_)
1227     return NULL;
1228 
1229   c.prog_->set_anchor_start(true);
1230   c.prog_->set_anchor_end(true);
1231 
1232   if (anchor == RE2::UNANCHORED) {
1233     // Prepend .* or else the expression will effectively be anchored.
1234     // Complemented by the ANCHOR_BOTH case in PostVisit().
1235     all = c.Cat(c.DotStar(), all);
1236   }
1237   c.prog_->set_start(all.begin);
1238   c.prog_->set_start_unanchored(all.begin);
1239 
1240   Prog* prog = c.Finish(re);
1241   if (prog == NULL)
1242     return NULL;
1243 
1244   // Make sure DFA has enough memory to operate,
1245   // since we're not going to fall back to the NFA.
1246   bool dfa_failed = false;
1247   absl::string_view sp = "hello, world";
1248   prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1249                   NULL, &dfa_failed, NULL);
1250   if (dfa_failed) {
1251     delete prog;
1252     return NULL;
1253   }
1254 
1255   return prog;
1256 }
1257 
CompileSet(Regexp * re,RE2::Anchor anchor,int64_t max_mem)1258 Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1259   return Compiler::CompileSet(re, anchor, max_mem);
1260 }
1261 
1262 }  // namespace re2
1263