xref: /aosp_15_r20/external/regex-re2/re2/prog.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compiled regular expression representation.
6 // Tested by compile_test.cc
7 
8 #include "re2/prog.h"
9 
10 #include <stdint.h>
11 #include <string.h>
12 #include <algorithm>
13 #include <memory>
14 #include <utility>
15 
16 #include "util/util.h"
17 #include "util/logging.h"
18 #include "util/strutil.h"
19 #include "re2/bitmap256.h"
20 #include "re2/stringpiece.h"
21 
22 namespace re2 {
23 
24 // Constructors per Inst opcode
25 
InitAlt(uint32_t out,uint32_t out1)26 void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
27   DCHECK_EQ(out_opcode_, 0);
28   set_out_opcode(out, kInstAlt);
29   out1_ = out1;
30 }
31 
InitByteRange(int lo,int hi,int foldcase,uint32_t out)32 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
33   DCHECK_EQ(out_opcode_, 0);
34   set_out_opcode(out, kInstByteRange);
35   lo_ = lo & 0xFF;
36   hi_ = hi & 0xFF;
37   foldcase_ = foldcase & 0xFF;
38 }
39 
InitCapture(int cap,uint32_t out)40 void Prog::Inst::InitCapture(int cap, uint32_t out) {
41   DCHECK_EQ(out_opcode_, 0);
42   set_out_opcode(out, kInstCapture);
43   cap_ = cap;
44 }
45 
InitEmptyWidth(EmptyOp empty,uint32_t out)46 void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
47   DCHECK_EQ(out_opcode_, 0);
48   set_out_opcode(out, kInstEmptyWidth);
49   empty_ = empty;
50 }
51 
InitMatch(int32_t id)52 void Prog::Inst::InitMatch(int32_t id) {
53   DCHECK_EQ(out_opcode_, 0);
54   set_opcode(kInstMatch);
55   match_id_ = id;
56 }
57 
InitNop(uint32_t out)58 void Prog::Inst::InitNop(uint32_t out) {
59   DCHECK_EQ(out_opcode_, 0);
60   set_opcode(kInstNop);
61 }
62 
InitFail()63 void Prog::Inst::InitFail() {
64   DCHECK_EQ(out_opcode_, 0);
65   set_opcode(kInstFail);
66 }
67 
Dump()68 string Prog::Inst::Dump() {
69   switch (opcode()) {
70     default:
71       return StringPrintf("opcode %d", static_cast<int>(opcode()));
72 
73     case kInstAlt:
74       return StringPrintf("alt -> %d | %d", out(), out1_);
75 
76     case kInstAltMatch:
77       return StringPrintf("altmatch -> %d | %d", out(), out1_);
78 
79     case kInstByteRange:
80       return StringPrintf("byte%s [%02x-%02x] -> %d",
81                           foldcase_ ? "/i" : "",
82                           lo_, hi_, out());
83 
84     case kInstCapture:
85       return StringPrintf("capture %d -> %d", cap_, out());
86 
87     case kInstEmptyWidth:
88       return StringPrintf("emptywidth %#x -> %d",
89                           static_cast<int>(empty_), out());
90 
91     case kInstMatch:
92       return StringPrintf("match! %d", match_id());
93 
94     case kInstNop:
95       return StringPrintf("nop -> %d", out());
96 
97     case kInstFail:
98       return StringPrintf("fail");
99   }
100 }
101 
Prog()102 Prog::Prog()
103   : anchor_start_(false),
104     anchor_end_(false),
105     reversed_(false),
106     did_flatten_(false),
107     did_onepass_(false),
108     start_(0),
109     start_unanchored_(0),
110     size_(0),
111     bytemap_range_(0),
112     first_byte_(-1),
113     flags_(0),
114     list_count_(0),
115     onepass_nodes_(NULL),
116     dfa_mem_(0),
117     dfa_first_(NULL),
118     dfa_longest_(NULL) {
119 }
120 
~Prog()121 Prog::~Prog() {
122   DeleteDFA(dfa_longest_);
123   DeleteDFA(dfa_first_);
124   delete[] onepass_nodes_;
125 }
126 
127 typedef SparseSet Workq;
128 
AddToQueue(Workq * q,int id)129 static inline void AddToQueue(Workq* q, int id) {
130   if (id != 0)
131     q->insert(id);
132 }
133 
ProgToString(Prog * prog,Workq * q)134 static string ProgToString(Prog* prog, Workq* q) {
135   string s;
136   for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
137     int id = *i;
138     Prog::Inst* ip = prog->inst(id);
139     StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
140     AddToQueue(q, ip->out());
141     if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
142       AddToQueue(q, ip->out1());
143   }
144   return s;
145 }
146 
FlattenedProgToString(Prog * prog,int start)147 static string FlattenedProgToString(Prog* prog, int start) {
148   string s;
149   for (int id = start; id < prog->size(); id++) {
150     Prog::Inst* ip = prog->inst(id);
151     if (ip->last())
152       StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
153     else
154       StringAppendF(&s, "%d+ %s\n", id, ip->Dump().c_str());
155   }
156   return s;
157 }
158 
Dump()159 string Prog::Dump() {
160   if (did_flatten_)
161     return FlattenedProgToString(this, start_);
162 
163   Workq q(size_);
164   AddToQueue(&q, start_);
165   return ProgToString(this, &q);
166 }
167 
DumpUnanchored()168 string Prog::DumpUnanchored() {
169   if (did_flatten_)
170     return FlattenedProgToString(this, start_unanchored_);
171 
172   Workq q(size_);
173   AddToQueue(&q, start_unanchored_);
174   return ProgToString(this, &q);
175 }
176 
DumpByteMap()177 string Prog::DumpByteMap() {
178   string map;
179   for (int c = 0; c < 256; c++) {
180     int b = bytemap_[c];
181     int lo = c;
182     while (c < 256-1 && bytemap_[c+1] == b)
183       c++;
184     int hi = c;
185     StringAppendF(&map, "[%02x-%02x] -> %d\n", lo, hi, b);
186   }
187   return map;
188 }
189 
first_byte()190 int Prog::first_byte() {
191   std::call_once(first_byte_once_, [](Prog* prog) {
192     prog->first_byte_ = prog->ComputeFirstByte();
193   }, this);
194   return first_byte_;
195 }
196 
197 static bool IsMatch(Prog*, Prog::Inst*);
198 
199 // Peep-hole optimizer.
Optimize()200 void Prog::Optimize() {
201   Workq q(size_);
202 
203   // Eliminate nops.  Most are taken out during compilation
204   // but a few are hard to avoid.
205   q.clear();
206   AddToQueue(&q, start_);
207   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
208     int id = *i;
209 
210     Inst* ip = inst(id);
211     int j = ip->out();
212     Inst* jp;
213     while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
214       j = jp->out();
215     }
216     ip->set_out(j);
217     AddToQueue(&q, ip->out());
218 
219     if (ip->opcode() == kInstAlt) {
220       j = ip->out1();
221       while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
222         j = jp->out();
223       }
224       ip->out1_ = j;
225       AddToQueue(&q, ip->out1());
226     }
227   }
228 
229   // Insert kInstAltMatch instructions
230   // Look for
231   //   ip: Alt -> j | k
232   //	  j: ByteRange [00-FF] -> ip
233   //    k: Match
234   // or the reverse (the above is the greedy one).
235   // Rewrite Alt to AltMatch.
236   q.clear();
237   AddToQueue(&q, start_);
238   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
239     int id = *i;
240     Inst* ip = inst(id);
241     AddToQueue(&q, ip->out());
242     if (ip->opcode() == kInstAlt)
243       AddToQueue(&q, ip->out1());
244 
245     if (ip->opcode() == kInstAlt) {
246       Inst* j = inst(ip->out());
247       Inst* k = inst(ip->out1());
248       if (j->opcode() == kInstByteRange && j->out() == id &&
249           j->lo() == 0x00 && j->hi() == 0xFF &&
250           IsMatch(this, k)) {
251         ip->set_opcode(kInstAltMatch);
252         continue;
253       }
254       if (IsMatch(this, j) &&
255           k->opcode() == kInstByteRange && k->out() == id &&
256           k->lo() == 0x00 && k->hi() == 0xFF) {
257         ip->set_opcode(kInstAltMatch);
258       }
259     }
260   }
261 }
262 
263 // Is ip a guaranteed match at end of text, perhaps after some capturing?
IsMatch(Prog * prog,Prog::Inst * ip)264 static bool IsMatch(Prog* prog, Prog::Inst* ip) {
265   for (;;) {
266     switch (ip->opcode()) {
267       default:
268         LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
269         return false;
270 
271       case kInstAlt:
272       case kInstAltMatch:
273       case kInstByteRange:
274       case kInstFail:
275       case kInstEmptyWidth:
276         return false;
277 
278       case kInstCapture:
279       case kInstNop:
280         ip = prog->inst(ip->out());
281         break;
282 
283       case kInstMatch:
284         return true;
285     }
286   }
287 }
288 
EmptyFlags(const StringPiece & text,const char * p)289 uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
290   int flags = 0;
291 
292   // ^ and \A
293   if (p == text.begin())
294     flags |= kEmptyBeginText | kEmptyBeginLine;
295   else if (p[-1] == '\n')
296     flags |= kEmptyBeginLine;
297 
298   // $ and \z
299   if (p == text.end())
300     flags |= kEmptyEndText | kEmptyEndLine;
301   else if (p < text.end() && p[0] == '\n')
302     flags |= kEmptyEndLine;
303 
304   // \b and \B
305   if (p == text.begin() && p == text.end()) {
306     // no word boundary here
307   } else if (p == text.begin()) {
308     if (IsWordChar(p[0]))
309       flags |= kEmptyWordBoundary;
310   } else if (p == text.end()) {
311     if (IsWordChar(p[-1]))
312       flags |= kEmptyWordBoundary;
313   } else {
314     if (IsWordChar(p[-1]) != IsWordChar(p[0]))
315       flags |= kEmptyWordBoundary;
316   }
317   if (!(flags & kEmptyWordBoundary))
318     flags |= kEmptyNonWordBoundary;
319 
320   return flags;
321 }
322 
323 // ByteMapBuilder implements a coloring algorithm.
324 //
325 // The first phase is a series of "mark and merge" batches: we mark one or more
326 // [lo-hi] ranges, then merge them into our internal state. Batching is not for
327 // performance; rather, it means that the ranges are treated indistinguishably.
328 //
329 // Internally, the ranges are represented using a bitmap that stores the splits
330 // and a vector that stores the colors; both of them are indexed by the ranges'
331 // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
332 // hi (if not already split), then recolor each range in between. The color map
333 // (i.e. from the old color to the new color) is maintained for the lifetime of
334 // the batch and so underpins this somewhat obscure approach to set operations.
335 //
336 // The second phase builds the bytemap from our internal state: we recolor each
337 // range, then store the new color (which is now the byte class) in each of the
338 // corresponding array elements. Finally, we output the number of byte classes.
339 class ByteMapBuilder {
340  public:
ByteMapBuilder()341   ByteMapBuilder() {
342     // Initial state: the [0-255] range has color 256.
343     // This will avoid problems during the second phase,
344     // in which we assign byte classes numbered from 0.
345     splits_.Set(255);
346     colors_.resize(256);
347     colors_[255] = 256;
348     nextcolor_ = 257;
349   }
350 
351   void Mark(int lo, int hi);
352   void Merge();
353   void Build(uint8_t* bytemap, int* bytemap_range);
354 
355  private:
356   int Recolor(int oldcolor);
357 
358   Bitmap256 splits_;
359   std::vector<int> colors_;
360   int nextcolor_;
361   std::vector<std::pair<int, int>> colormap_;
362   std::vector<std::pair<int, int>> ranges_;
363 
364   ByteMapBuilder(const ByteMapBuilder&) = delete;
365   ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
366 };
367 
Mark(int lo,int hi)368 void ByteMapBuilder::Mark(int lo, int hi) {
369   DCHECK_GE(lo, 0);
370   DCHECK_GE(hi, 0);
371   DCHECK_LE(lo, 255);
372   DCHECK_LE(hi, 255);
373   DCHECK_LE(lo, hi);
374 
375   // Ignore any [0-255] ranges. They cause us to recolor every range, which
376   // has no effect on the eventual result and is therefore a waste of time.
377   if (lo == 0 && hi == 255)
378     return;
379 
380   ranges_.emplace_back(lo, hi);
381 }
382 
Merge()383 void ByteMapBuilder::Merge() {
384   for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
385        it != ranges_.end();
386        ++it) {
387     int lo = it->first-1;
388     int hi = it->second;
389 
390     if (0 <= lo && !splits_.Test(lo)) {
391       splits_.Set(lo);
392       int next = splits_.FindNextSetBit(lo+1);
393       colors_[lo] = colors_[next];
394     }
395     if (!splits_.Test(hi)) {
396       splits_.Set(hi);
397       int next = splits_.FindNextSetBit(hi+1);
398       colors_[hi] = colors_[next];
399     }
400 
401     int c = lo+1;
402     while (c < 256) {
403       int next = splits_.FindNextSetBit(c);
404       colors_[next] = Recolor(colors_[next]);
405       if (next == hi)
406         break;
407       c = next+1;
408     }
409   }
410   colormap_.clear();
411   ranges_.clear();
412 }
413 
Build(uint8_t * bytemap,int * bytemap_range)414 void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
415   // Assign byte classes numbered from 0.
416   nextcolor_ = 0;
417 
418   int c = 0;
419   while (c < 256) {
420     int next = splits_.FindNextSetBit(c);
421     uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
422     while (c <= next) {
423       bytemap[c] = b;
424       c++;
425     }
426   }
427 
428   *bytemap_range = nextcolor_;
429 }
430 
Recolor(int oldcolor)431 int ByteMapBuilder::Recolor(int oldcolor) {
432   // Yes, this is a linear search. There can be at most 256
433   // colors and there will typically be far fewer than that.
434   // Also, we need to consider keys *and* values in order to
435   // avoid recoloring a given range more than once per batch.
436   std::vector<std::pair<int, int>>::const_iterator it =
437       std::find_if(colormap_.begin(), colormap_.end(),
438                    [=](const std::pair<int, int>& kv) -> bool {
439                      return kv.first == oldcolor || kv.second == oldcolor;
440                    });
441   if (it != colormap_.end())
442     return it->second;
443   int newcolor = nextcolor_;
444   nextcolor_++;
445   colormap_.emplace_back(oldcolor, newcolor);
446   return newcolor;
447 }
448 
ComputeByteMap()449 void Prog::ComputeByteMap() {
450   // Fill in bytemap with byte classes for the program.
451   // Ranges of bytes that are treated indistinguishably
452   // will be mapped to a single byte class.
453   ByteMapBuilder builder;
454 
455   // Don't repeat the work for ^ and $.
456   bool marked_line_boundaries = false;
457   // Don't repeat the work for \b and \B.
458   bool marked_word_boundaries = false;
459 
460   for (int id = 0; id < size(); id++) {
461     Inst* ip = inst(id);
462     if (ip->opcode() == kInstByteRange) {
463       int lo = ip->lo();
464       int hi = ip->hi();
465       builder.Mark(lo, hi);
466       if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
467         int foldlo = lo;
468         int foldhi = hi;
469         if (foldlo < 'a')
470           foldlo = 'a';
471         if (foldhi > 'z')
472           foldhi = 'z';
473         if (foldlo <= foldhi)
474           builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a');
475       }
476       // If this Inst is not the last Inst in its list AND the next Inst is
477       // also a ByteRange AND the Insts have the same out, defer the merge.
478       if (!ip->last() &&
479           inst(id+1)->opcode() == kInstByteRange &&
480           ip->out() == inst(id+1)->out())
481         continue;
482       builder.Merge();
483     } else if (ip->opcode() == kInstEmptyWidth) {
484       if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
485           !marked_line_boundaries) {
486         builder.Mark('\n', '\n');
487         builder.Merge();
488         marked_line_boundaries = true;
489       }
490       if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
491           !marked_word_boundaries) {
492         // We require two batches here: the first for ranges that are word
493         // characters, the second for ranges that are not word characters.
494         for (bool isword : {true, false}) {
495           int j;
496           for (int i = 0; i < 256; i = j) {
497             for (j = i + 1; j < 256 &&
498                             Prog::IsWordChar(static_cast<uint8_t>(i)) ==
499                                 Prog::IsWordChar(static_cast<uint8_t>(j));
500                  j++)
501               ;
502             if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
503               builder.Mark(i, j - 1);
504           }
505           builder.Merge();
506         }
507         marked_word_boundaries = true;
508       }
509     }
510   }
511 
512   builder.Build(bytemap_, &bytemap_range_);
513 
514   if (0) {  // For debugging, use trivial bytemap.
515     LOG(ERROR) << "Using trivial bytemap.";
516     for (int i = 0; i < 256; i++)
517       bytemap_[i] = static_cast<uint8_t>(i);
518     bytemap_range_ = 256;
519   }
520 }
521 
522 // Prog::Flatten() implements a graph rewriting algorithm.
523 //
524 // The overall process is similar to epsilon removal, but retains some epsilon
525 // transitions: those from Capture and EmptyWidth instructions; and those from
526 // nullable subexpressions. (The latter avoids quadratic blowup in transitions
527 // in the worst case.) It might be best thought of as Alt instruction elision.
528 //
529 // In conceptual terms, it divides the Prog into "trees" of instructions, then
530 // traverses the "trees" in order to produce "lists" of instructions. A "tree"
531 // is one or more instructions that grow from one "root" instruction to one or
532 // more "leaf" instructions; if a "tree" has exactly one instruction, then the
533 // "root" is also the "leaf". In most cases, a "root" is the successor of some
534 // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
535 // and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
536 // EmptyWidth or Match instruction. However, this is insufficient for handling
537 // nested nullable subexpressions correctly, so in some cases, a "root" is the
538 // dominator of the instructions reachable from some "successor root" (i.e. it
539 // has an unreachable predecessor) and is considered a "dominator root". Since
540 // only Alt instructions can be "dominator roots" (other instructions would be
541 // "leaves"), only Alt instructions are required to be marked as predecessors.
542 //
543 // Dividing the Prog into "trees" comprises two passes: marking the "successor
544 // roots" and the predecessors; and marking the "dominator roots". Sorting the
545 // "successor roots" by their bytecode offsets enables iteration in order from
546 // greatest to least during the second pass; by working backwards in this case
547 // and flooding the graph no further than "leaves" and already marked "roots",
548 // it becomes possible to mark "dominator roots" without doing excessive work.
549 //
550 // Traversing the "trees" is just iterating over the "roots" in order of their
551 // marking and flooding the graph no further than "leaves" and "roots". When a
552 // "leaf" is reached, the instruction is copied with its successor remapped to
553 // its "root" number. When a "root" is reached, a Nop instruction is generated
554 // with its successor remapped similarly. As each "list" is produced, its last
555 // instruction is marked as such. After all of the "lists" have been produced,
556 // a pass over their instructions remaps their successors to bytecode offsets.
Flatten()557 void Prog::Flatten() {
558   if (did_flatten_)
559     return;
560   did_flatten_ = true;
561 
562   // Scratch structures. It's important that these are reused by functions
563   // that we call in loops because they would thrash the heap otherwise.
564   SparseSet reachable(size());
565   std::vector<int> stk;
566   stk.reserve(size());
567 
568   // First pass: Marks "successor roots" and predecessors.
569   // Builds the mapping from inst-ids to root-ids.
570   SparseArray<int> rootmap(size());
571   SparseArray<int> predmap(size());
572   std::vector<std::vector<int>> predvec;
573   MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
574 
575   // Second pass: Marks "dominator roots".
576   SparseArray<int> sorted(rootmap);
577   std::sort(sorted.begin(), sorted.end(), sorted.less);
578   for (SparseArray<int>::const_iterator i = sorted.end() - 1;
579        i != sorted.begin();
580        --i) {
581     if (i->index() != start_unanchored() && i->index() != start())
582       MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
583   }
584 
585   // Third pass: Emits "lists". Remaps outs to root-ids.
586   // Builds the mapping from root-ids to flat-ids.
587   std::vector<int> flatmap(rootmap.size());
588   std::vector<Inst> flat;
589   flat.reserve(size());
590   for (SparseArray<int>::const_iterator i = rootmap.begin();
591        i != rootmap.end();
592        ++i) {
593     flatmap[i->value()] = static_cast<int>(flat.size());
594     EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
595     flat.back().set_last();
596   }
597 
598   list_count_ = static_cast<int>(flatmap.size());
599   for (int i = 0; i < kNumInst; i++)
600     inst_count_[i] = 0;
601 
602   // Fourth pass: Remaps outs to flat-ids.
603   // Counts instructions by opcode.
604   for (int id = 0; id < static_cast<int>(flat.size()); id++) {
605     Inst* ip = &flat[id];
606     if (ip->opcode() != kInstAltMatch)  // handled in EmitList()
607       ip->set_out(flatmap[ip->out()]);
608     inst_count_[ip->opcode()]++;
609   }
610 
611   int total = 0;
612   for (int i = 0; i < kNumInst; i++)
613     total += inst_count_[i];
614   DCHECK_EQ(total, static_cast<int>(flat.size()));
615 
616   // Remap start_unanchored and start.
617   if (start_unanchored() == 0) {
618     DCHECK_EQ(start(), 0);
619   } else if (start_unanchored() == start()) {
620     set_start_unanchored(flatmap[1]);
621     set_start(flatmap[1]);
622   } else {
623     set_start_unanchored(flatmap[1]);
624     set_start(flatmap[2]);
625   }
626 
627   // Finally, replace the old instructions with the new instructions.
628   size_ = static_cast<int>(flat.size());
629   inst_ = PODArray<Inst>(size_);
630   memmove(inst_.data(), flat.data(), size_*sizeof(inst_[0]));
631 }
632 
MarkSuccessors(SparseArray<int> * rootmap,SparseArray<int> * predmap,std::vector<std::vector<int>> * predvec,SparseSet * reachable,std::vector<int> * stk)633 void Prog::MarkSuccessors(SparseArray<int>* rootmap,
634                           SparseArray<int>* predmap,
635                           std::vector<std::vector<int>>* predvec,
636                           SparseSet* reachable, std::vector<int>* stk) {
637   // Mark the kInstFail instruction.
638   rootmap->set_new(0, rootmap->size());
639 
640   // Mark the start_unanchored and start instructions.
641   if (!rootmap->has_index(start_unanchored()))
642     rootmap->set_new(start_unanchored(), rootmap->size());
643   if (!rootmap->has_index(start()))
644     rootmap->set_new(start(), rootmap->size());
645 
646   reachable->clear();
647   stk->clear();
648   stk->push_back(start_unanchored());
649   while (!stk->empty()) {
650     int id = stk->back();
651     stk->pop_back();
652   Loop:
653     if (reachable->contains(id))
654       continue;
655     reachable->insert_new(id);
656 
657     Inst* ip = inst(id);
658     switch (ip->opcode()) {
659       default:
660         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
661         break;
662 
663       case kInstAltMatch:
664       case kInstAlt:
665         // Mark this instruction as a predecessor of each out.
666         for (int out : {ip->out(), ip->out1()}) {
667           if (!predmap->has_index(out)) {
668             predmap->set_new(out, static_cast<int>(predvec->size()));
669             predvec->emplace_back();
670           }
671           (*predvec)[predmap->get_existing(out)].emplace_back(id);
672         }
673         stk->push_back(ip->out1());
674         id = ip->out();
675         goto Loop;
676 
677       case kInstByteRange:
678       case kInstCapture:
679       case kInstEmptyWidth:
680         // Mark the out of this instruction as a "root".
681         if (!rootmap->has_index(ip->out()))
682           rootmap->set_new(ip->out(), rootmap->size());
683         id = ip->out();
684         goto Loop;
685 
686       case kInstNop:
687         id = ip->out();
688         goto Loop;
689 
690       case kInstMatch:
691       case kInstFail:
692         break;
693     }
694   }
695 }
696 
MarkDominator(int root,SparseArray<int> * rootmap,SparseArray<int> * predmap,std::vector<std::vector<int>> * predvec,SparseSet * reachable,std::vector<int> * stk)697 void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
698                          SparseArray<int>* predmap,
699                          std::vector<std::vector<int>>* predvec,
700                          SparseSet* reachable, std::vector<int>* stk) {
701   reachable->clear();
702   stk->clear();
703   stk->push_back(root);
704   while (!stk->empty()) {
705     int id = stk->back();
706     stk->pop_back();
707   Loop:
708     if (reachable->contains(id))
709       continue;
710     reachable->insert_new(id);
711 
712     if (id != root && rootmap->has_index(id)) {
713       // We reached another "tree" via epsilon transition.
714       continue;
715     }
716 
717     Inst* ip = inst(id);
718     switch (ip->opcode()) {
719       default:
720         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
721         break;
722 
723       case kInstAltMatch:
724       case kInstAlt:
725         stk->push_back(ip->out1());
726         id = ip->out();
727         goto Loop;
728 
729       case kInstByteRange:
730       case kInstCapture:
731       case kInstEmptyWidth:
732         break;
733 
734       case kInstNop:
735         id = ip->out();
736         goto Loop;
737 
738       case kInstMatch:
739       case kInstFail:
740         break;
741     }
742   }
743 
744   for (SparseSet::const_iterator i = reachable->begin();
745        i != reachable->end();
746        ++i) {
747     int id = *i;
748     if (predmap->has_index(id)) {
749       for (int pred : (*predvec)[predmap->get_existing(id)]) {
750         if (!reachable->contains(pred)) {
751           // id has a predecessor that cannot be reached from root!
752           // Therefore, id must be a "root" too - mark it as such.
753           if (!rootmap->has_index(id))
754             rootmap->set_new(id, rootmap->size());
755         }
756       }
757     }
758   }
759 }
760 
EmitList(int root,SparseArray<int> * rootmap,std::vector<Inst> * flat,SparseSet * reachable,std::vector<int> * stk)761 void Prog::EmitList(int root, SparseArray<int>* rootmap,
762                     std::vector<Inst>* flat,
763                     SparseSet* reachable, std::vector<int>* stk) {
764   reachable->clear();
765   stk->clear();
766   stk->push_back(root);
767   while (!stk->empty()) {
768     int id = stk->back();
769     stk->pop_back();
770   Loop:
771     if (reachable->contains(id))
772       continue;
773     reachable->insert_new(id);
774 
775     if (id != root && rootmap->has_index(id)) {
776       // We reached another "tree" via epsilon transition. Emit a kInstNop
777       // instruction so that the Prog does not become quadratically larger.
778       flat->emplace_back();
779       flat->back().set_opcode(kInstNop);
780       flat->back().set_out(rootmap->get_existing(id));
781       continue;
782     }
783 
784     Inst* ip = inst(id);
785     switch (ip->opcode()) {
786       default:
787         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
788         break;
789 
790       case kInstAltMatch:
791         flat->emplace_back();
792         flat->back().set_opcode(kInstAltMatch);
793         flat->back().set_out(static_cast<int>(flat->size()));
794         flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
795         FALLTHROUGH_INTENDED;
796 
797       case kInstAlt:
798         stk->push_back(ip->out1());
799         id = ip->out();
800         goto Loop;
801 
802       case kInstByteRange:
803       case kInstCapture:
804       case kInstEmptyWidth:
805         flat->emplace_back();
806         memmove(&flat->back(), ip, sizeof *ip);
807         flat->back().set_out(rootmap->get_existing(ip->out()));
808         break;
809 
810       case kInstNop:
811         id = ip->out();
812         goto Loop;
813 
814       case kInstMatch:
815       case kInstFail:
816         flat->emplace_back();
817         memmove(&flat->back(), ip, sizeof *ip);
818         break;
819     }
820   }
821 }
822 
823 }  // namespace re2
824