xref: /aosp_15_r20/external/regex-re2/re2/prog.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi // Compiled regular expression representation.
6*ccdc9c3eSSadaf Ebrahimi // Tested by compile_test.cc
7*ccdc9c3eSSadaf Ebrahimi 
8*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
9*ccdc9c3eSSadaf Ebrahimi 
10*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
11*ccdc9c3eSSadaf Ebrahimi #include <string.h>
12*ccdc9c3eSSadaf Ebrahimi #include <algorithm>
13*ccdc9c3eSSadaf Ebrahimi #include <memory>
14*ccdc9c3eSSadaf Ebrahimi #include <utility>
15*ccdc9c3eSSadaf Ebrahimi 
16*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
17*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
18*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
19*ccdc9c3eSSadaf Ebrahimi #include "re2/bitmap256.h"
20*ccdc9c3eSSadaf Ebrahimi #include "re2/stringpiece.h"
21*ccdc9c3eSSadaf Ebrahimi 
22*ccdc9c3eSSadaf Ebrahimi namespace re2 {
23*ccdc9c3eSSadaf Ebrahimi 
24*ccdc9c3eSSadaf Ebrahimi // Constructors per Inst opcode
25*ccdc9c3eSSadaf Ebrahimi 
InitAlt(uint32_t out,uint32_t out1)26*ccdc9c3eSSadaf Ebrahimi void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
27*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(out_opcode_, 0);
28*ccdc9c3eSSadaf Ebrahimi   set_out_opcode(out, kInstAlt);
29*ccdc9c3eSSadaf Ebrahimi   out1_ = out1;
30*ccdc9c3eSSadaf Ebrahimi }
31*ccdc9c3eSSadaf Ebrahimi 
InitByteRange(int lo,int hi,int foldcase,uint32_t out)32*ccdc9c3eSSadaf Ebrahimi void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
33*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(out_opcode_, 0);
34*ccdc9c3eSSadaf Ebrahimi   set_out_opcode(out, kInstByteRange);
35*ccdc9c3eSSadaf Ebrahimi   lo_ = lo & 0xFF;
36*ccdc9c3eSSadaf Ebrahimi   hi_ = hi & 0xFF;
37*ccdc9c3eSSadaf Ebrahimi   foldcase_ = foldcase & 0xFF;
38*ccdc9c3eSSadaf Ebrahimi }
39*ccdc9c3eSSadaf Ebrahimi 
InitCapture(int cap,uint32_t out)40*ccdc9c3eSSadaf Ebrahimi void Prog::Inst::InitCapture(int cap, uint32_t out) {
41*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(out_opcode_, 0);
42*ccdc9c3eSSadaf Ebrahimi   set_out_opcode(out, kInstCapture);
43*ccdc9c3eSSadaf Ebrahimi   cap_ = cap;
44*ccdc9c3eSSadaf Ebrahimi }
45*ccdc9c3eSSadaf Ebrahimi 
InitEmptyWidth(EmptyOp empty,uint32_t out)46*ccdc9c3eSSadaf Ebrahimi void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
47*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(out_opcode_, 0);
48*ccdc9c3eSSadaf Ebrahimi   set_out_opcode(out, kInstEmptyWidth);
49*ccdc9c3eSSadaf Ebrahimi   empty_ = empty;
50*ccdc9c3eSSadaf Ebrahimi }
51*ccdc9c3eSSadaf Ebrahimi 
InitMatch(int32_t id)52*ccdc9c3eSSadaf Ebrahimi void Prog::Inst::InitMatch(int32_t id) {
53*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(out_opcode_, 0);
54*ccdc9c3eSSadaf Ebrahimi   set_opcode(kInstMatch);
55*ccdc9c3eSSadaf Ebrahimi   match_id_ = id;
56*ccdc9c3eSSadaf Ebrahimi }
57*ccdc9c3eSSadaf Ebrahimi 
InitNop(uint32_t out)58*ccdc9c3eSSadaf Ebrahimi void Prog::Inst::InitNop(uint32_t out) {
59*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(out_opcode_, 0);
60*ccdc9c3eSSadaf Ebrahimi   set_opcode(kInstNop);
61*ccdc9c3eSSadaf Ebrahimi }
62*ccdc9c3eSSadaf Ebrahimi 
InitFail()63*ccdc9c3eSSadaf Ebrahimi void Prog::Inst::InitFail() {
64*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(out_opcode_, 0);
65*ccdc9c3eSSadaf Ebrahimi   set_opcode(kInstFail);
66*ccdc9c3eSSadaf Ebrahimi }
67*ccdc9c3eSSadaf Ebrahimi 
Dump()68*ccdc9c3eSSadaf Ebrahimi string Prog::Inst::Dump() {
69*ccdc9c3eSSadaf Ebrahimi   switch (opcode()) {
70*ccdc9c3eSSadaf Ebrahimi     default:
71*ccdc9c3eSSadaf Ebrahimi       return StringPrintf("opcode %d", static_cast<int>(opcode()));
72*ccdc9c3eSSadaf Ebrahimi 
73*ccdc9c3eSSadaf Ebrahimi     case kInstAlt:
74*ccdc9c3eSSadaf Ebrahimi       return StringPrintf("alt -> %d | %d", out(), out1_);
75*ccdc9c3eSSadaf Ebrahimi 
76*ccdc9c3eSSadaf Ebrahimi     case kInstAltMatch:
77*ccdc9c3eSSadaf Ebrahimi       return StringPrintf("altmatch -> %d | %d", out(), out1_);
78*ccdc9c3eSSadaf Ebrahimi 
79*ccdc9c3eSSadaf Ebrahimi     case kInstByteRange:
80*ccdc9c3eSSadaf Ebrahimi       return StringPrintf("byte%s [%02x-%02x] -> %d",
81*ccdc9c3eSSadaf Ebrahimi                           foldcase_ ? "/i" : "",
82*ccdc9c3eSSadaf Ebrahimi                           lo_, hi_, out());
83*ccdc9c3eSSadaf Ebrahimi 
84*ccdc9c3eSSadaf Ebrahimi     case kInstCapture:
85*ccdc9c3eSSadaf Ebrahimi       return StringPrintf("capture %d -> %d", cap_, out());
86*ccdc9c3eSSadaf Ebrahimi 
87*ccdc9c3eSSadaf Ebrahimi     case kInstEmptyWidth:
88*ccdc9c3eSSadaf Ebrahimi       return StringPrintf("emptywidth %#x -> %d",
89*ccdc9c3eSSadaf Ebrahimi                           static_cast<int>(empty_), out());
90*ccdc9c3eSSadaf Ebrahimi 
91*ccdc9c3eSSadaf Ebrahimi     case kInstMatch:
92*ccdc9c3eSSadaf Ebrahimi       return StringPrintf("match! %d", match_id());
93*ccdc9c3eSSadaf Ebrahimi 
94*ccdc9c3eSSadaf Ebrahimi     case kInstNop:
95*ccdc9c3eSSadaf Ebrahimi       return StringPrintf("nop -> %d", out());
96*ccdc9c3eSSadaf Ebrahimi 
97*ccdc9c3eSSadaf Ebrahimi     case kInstFail:
98*ccdc9c3eSSadaf Ebrahimi       return StringPrintf("fail");
99*ccdc9c3eSSadaf Ebrahimi   }
100*ccdc9c3eSSadaf Ebrahimi }
101*ccdc9c3eSSadaf Ebrahimi 
Prog()102*ccdc9c3eSSadaf Ebrahimi Prog::Prog()
103*ccdc9c3eSSadaf Ebrahimi   : anchor_start_(false),
104*ccdc9c3eSSadaf Ebrahimi     anchor_end_(false),
105*ccdc9c3eSSadaf Ebrahimi     reversed_(false),
106*ccdc9c3eSSadaf Ebrahimi     did_flatten_(false),
107*ccdc9c3eSSadaf Ebrahimi     did_onepass_(false),
108*ccdc9c3eSSadaf Ebrahimi     start_(0),
109*ccdc9c3eSSadaf Ebrahimi     start_unanchored_(0),
110*ccdc9c3eSSadaf Ebrahimi     size_(0),
111*ccdc9c3eSSadaf Ebrahimi     bytemap_range_(0),
112*ccdc9c3eSSadaf Ebrahimi     first_byte_(-1),
113*ccdc9c3eSSadaf Ebrahimi     flags_(0),
114*ccdc9c3eSSadaf Ebrahimi     list_count_(0),
115*ccdc9c3eSSadaf Ebrahimi     onepass_nodes_(NULL),
116*ccdc9c3eSSadaf Ebrahimi     dfa_mem_(0),
117*ccdc9c3eSSadaf Ebrahimi     dfa_first_(NULL),
118*ccdc9c3eSSadaf Ebrahimi     dfa_longest_(NULL) {
119*ccdc9c3eSSadaf Ebrahimi }
120*ccdc9c3eSSadaf Ebrahimi 
~Prog()121*ccdc9c3eSSadaf Ebrahimi Prog::~Prog() {
122*ccdc9c3eSSadaf Ebrahimi   DeleteDFA(dfa_longest_);
123*ccdc9c3eSSadaf Ebrahimi   DeleteDFA(dfa_first_);
124*ccdc9c3eSSadaf Ebrahimi   delete[] onepass_nodes_;
125*ccdc9c3eSSadaf Ebrahimi }
126*ccdc9c3eSSadaf Ebrahimi 
127*ccdc9c3eSSadaf Ebrahimi typedef SparseSet Workq;
128*ccdc9c3eSSadaf Ebrahimi 
AddToQueue(Workq * q,int id)129*ccdc9c3eSSadaf Ebrahimi static inline void AddToQueue(Workq* q, int id) {
130*ccdc9c3eSSadaf Ebrahimi   if (id != 0)
131*ccdc9c3eSSadaf Ebrahimi     q->insert(id);
132*ccdc9c3eSSadaf Ebrahimi }
133*ccdc9c3eSSadaf Ebrahimi 
ProgToString(Prog * prog,Workq * q)134*ccdc9c3eSSadaf Ebrahimi static string ProgToString(Prog* prog, Workq* q) {
135*ccdc9c3eSSadaf Ebrahimi   string s;
136*ccdc9c3eSSadaf Ebrahimi   for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
137*ccdc9c3eSSadaf Ebrahimi     int id = *i;
138*ccdc9c3eSSadaf Ebrahimi     Prog::Inst* ip = prog->inst(id);
139*ccdc9c3eSSadaf Ebrahimi     StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
140*ccdc9c3eSSadaf Ebrahimi     AddToQueue(q, ip->out());
141*ccdc9c3eSSadaf Ebrahimi     if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
142*ccdc9c3eSSadaf Ebrahimi       AddToQueue(q, ip->out1());
143*ccdc9c3eSSadaf Ebrahimi   }
144*ccdc9c3eSSadaf Ebrahimi   return s;
145*ccdc9c3eSSadaf Ebrahimi }
146*ccdc9c3eSSadaf Ebrahimi 
FlattenedProgToString(Prog * prog,int start)147*ccdc9c3eSSadaf Ebrahimi static string FlattenedProgToString(Prog* prog, int start) {
148*ccdc9c3eSSadaf Ebrahimi   string s;
149*ccdc9c3eSSadaf Ebrahimi   for (int id = start; id < prog->size(); id++) {
150*ccdc9c3eSSadaf Ebrahimi     Prog::Inst* ip = prog->inst(id);
151*ccdc9c3eSSadaf Ebrahimi     if (ip->last())
152*ccdc9c3eSSadaf Ebrahimi       StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
153*ccdc9c3eSSadaf Ebrahimi     else
154*ccdc9c3eSSadaf Ebrahimi       StringAppendF(&s, "%d+ %s\n", id, ip->Dump().c_str());
155*ccdc9c3eSSadaf Ebrahimi   }
156*ccdc9c3eSSadaf Ebrahimi   return s;
157*ccdc9c3eSSadaf Ebrahimi }
158*ccdc9c3eSSadaf Ebrahimi 
Dump()159*ccdc9c3eSSadaf Ebrahimi string Prog::Dump() {
160*ccdc9c3eSSadaf Ebrahimi   if (did_flatten_)
161*ccdc9c3eSSadaf Ebrahimi     return FlattenedProgToString(this, start_);
162*ccdc9c3eSSadaf Ebrahimi 
163*ccdc9c3eSSadaf Ebrahimi   Workq q(size_);
164*ccdc9c3eSSadaf Ebrahimi   AddToQueue(&q, start_);
165*ccdc9c3eSSadaf Ebrahimi   return ProgToString(this, &q);
166*ccdc9c3eSSadaf Ebrahimi }
167*ccdc9c3eSSadaf Ebrahimi 
DumpUnanchored()168*ccdc9c3eSSadaf Ebrahimi string Prog::DumpUnanchored() {
169*ccdc9c3eSSadaf Ebrahimi   if (did_flatten_)
170*ccdc9c3eSSadaf Ebrahimi     return FlattenedProgToString(this, start_unanchored_);
171*ccdc9c3eSSadaf Ebrahimi 
172*ccdc9c3eSSadaf Ebrahimi   Workq q(size_);
173*ccdc9c3eSSadaf Ebrahimi   AddToQueue(&q, start_unanchored_);
174*ccdc9c3eSSadaf Ebrahimi   return ProgToString(this, &q);
175*ccdc9c3eSSadaf Ebrahimi }
176*ccdc9c3eSSadaf Ebrahimi 
DumpByteMap()177*ccdc9c3eSSadaf Ebrahimi string Prog::DumpByteMap() {
178*ccdc9c3eSSadaf Ebrahimi   string map;
179*ccdc9c3eSSadaf Ebrahimi   for (int c = 0; c < 256; c++) {
180*ccdc9c3eSSadaf Ebrahimi     int b = bytemap_[c];
181*ccdc9c3eSSadaf Ebrahimi     int lo = c;
182*ccdc9c3eSSadaf Ebrahimi     while (c < 256-1 && bytemap_[c+1] == b)
183*ccdc9c3eSSadaf Ebrahimi       c++;
184*ccdc9c3eSSadaf Ebrahimi     int hi = c;
185*ccdc9c3eSSadaf Ebrahimi     StringAppendF(&map, "[%02x-%02x] -> %d\n", lo, hi, b);
186*ccdc9c3eSSadaf Ebrahimi   }
187*ccdc9c3eSSadaf Ebrahimi   return map;
188*ccdc9c3eSSadaf Ebrahimi }
189*ccdc9c3eSSadaf Ebrahimi 
first_byte()190*ccdc9c3eSSadaf Ebrahimi int Prog::first_byte() {
191*ccdc9c3eSSadaf Ebrahimi   std::call_once(first_byte_once_, [](Prog* prog) {
192*ccdc9c3eSSadaf Ebrahimi     prog->first_byte_ = prog->ComputeFirstByte();
193*ccdc9c3eSSadaf Ebrahimi   }, this);
194*ccdc9c3eSSadaf Ebrahimi   return first_byte_;
195*ccdc9c3eSSadaf Ebrahimi }
196*ccdc9c3eSSadaf Ebrahimi 
197*ccdc9c3eSSadaf Ebrahimi static bool IsMatch(Prog*, Prog::Inst*);
198*ccdc9c3eSSadaf Ebrahimi 
199*ccdc9c3eSSadaf Ebrahimi // Peep-hole optimizer.
Optimize()200*ccdc9c3eSSadaf Ebrahimi void Prog::Optimize() {
201*ccdc9c3eSSadaf Ebrahimi   Workq q(size_);
202*ccdc9c3eSSadaf Ebrahimi 
203*ccdc9c3eSSadaf Ebrahimi   // Eliminate nops.  Most are taken out during compilation
204*ccdc9c3eSSadaf Ebrahimi   // but a few are hard to avoid.
205*ccdc9c3eSSadaf Ebrahimi   q.clear();
206*ccdc9c3eSSadaf Ebrahimi   AddToQueue(&q, start_);
207*ccdc9c3eSSadaf Ebrahimi   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
208*ccdc9c3eSSadaf Ebrahimi     int id = *i;
209*ccdc9c3eSSadaf Ebrahimi 
210*ccdc9c3eSSadaf Ebrahimi     Inst* ip = inst(id);
211*ccdc9c3eSSadaf Ebrahimi     int j = ip->out();
212*ccdc9c3eSSadaf Ebrahimi     Inst* jp;
213*ccdc9c3eSSadaf Ebrahimi     while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
214*ccdc9c3eSSadaf Ebrahimi       j = jp->out();
215*ccdc9c3eSSadaf Ebrahimi     }
216*ccdc9c3eSSadaf Ebrahimi     ip->set_out(j);
217*ccdc9c3eSSadaf Ebrahimi     AddToQueue(&q, ip->out());
218*ccdc9c3eSSadaf Ebrahimi 
219*ccdc9c3eSSadaf Ebrahimi     if (ip->opcode() == kInstAlt) {
220*ccdc9c3eSSadaf Ebrahimi       j = ip->out1();
221*ccdc9c3eSSadaf Ebrahimi       while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
222*ccdc9c3eSSadaf Ebrahimi         j = jp->out();
223*ccdc9c3eSSadaf Ebrahimi       }
224*ccdc9c3eSSadaf Ebrahimi       ip->out1_ = j;
225*ccdc9c3eSSadaf Ebrahimi       AddToQueue(&q, ip->out1());
226*ccdc9c3eSSadaf Ebrahimi     }
227*ccdc9c3eSSadaf Ebrahimi   }
228*ccdc9c3eSSadaf Ebrahimi 
229*ccdc9c3eSSadaf Ebrahimi   // Insert kInstAltMatch instructions
230*ccdc9c3eSSadaf Ebrahimi   // Look for
231*ccdc9c3eSSadaf Ebrahimi   //   ip: Alt -> j | k
232*ccdc9c3eSSadaf Ebrahimi   //	  j: ByteRange [00-FF] -> ip
233*ccdc9c3eSSadaf Ebrahimi   //    k: Match
234*ccdc9c3eSSadaf Ebrahimi   // or the reverse (the above is the greedy one).
235*ccdc9c3eSSadaf Ebrahimi   // Rewrite Alt to AltMatch.
236*ccdc9c3eSSadaf Ebrahimi   q.clear();
237*ccdc9c3eSSadaf Ebrahimi   AddToQueue(&q, start_);
238*ccdc9c3eSSadaf Ebrahimi   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
239*ccdc9c3eSSadaf Ebrahimi     int id = *i;
240*ccdc9c3eSSadaf Ebrahimi     Inst* ip = inst(id);
241*ccdc9c3eSSadaf Ebrahimi     AddToQueue(&q, ip->out());
242*ccdc9c3eSSadaf Ebrahimi     if (ip->opcode() == kInstAlt)
243*ccdc9c3eSSadaf Ebrahimi       AddToQueue(&q, ip->out1());
244*ccdc9c3eSSadaf Ebrahimi 
245*ccdc9c3eSSadaf Ebrahimi     if (ip->opcode() == kInstAlt) {
246*ccdc9c3eSSadaf Ebrahimi       Inst* j = inst(ip->out());
247*ccdc9c3eSSadaf Ebrahimi       Inst* k = inst(ip->out1());
248*ccdc9c3eSSadaf Ebrahimi       if (j->opcode() == kInstByteRange && j->out() == id &&
249*ccdc9c3eSSadaf Ebrahimi           j->lo() == 0x00 && j->hi() == 0xFF &&
250*ccdc9c3eSSadaf Ebrahimi           IsMatch(this, k)) {
251*ccdc9c3eSSadaf Ebrahimi         ip->set_opcode(kInstAltMatch);
252*ccdc9c3eSSadaf Ebrahimi         continue;
253*ccdc9c3eSSadaf Ebrahimi       }
254*ccdc9c3eSSadaf Ebrahimi       if (IsMatch(this, j) &&
255*ccdc9c3eSSadaf Ebrahimi           k->opcode() == kInstByteRange && k->out() == id &&
256*ccdc9c3eSSadaf Ebrahimi           k->lo() == 0x00 && k->hi() == 0xFF) {
257*ccdc9c3eSSadaf Ebrahimi         ip->set_opcode(kInstAltMatch);
258*ccdc9c3eSSadaf Ebrahimi       }
259*ccdc9c3eSSadaf Ebrahimi     }
260*ccdc9c3eSSadaf Ebrahimi   }
261*ccdc9c3eSSadaf Ebrahimi }
262*ccdc9c3eSSadaf Ebrahimi 
263*ccdc9c3eSSadaf Ebrahimi // Is ip a guaranteed match at end of text, perhaps after some capturing?
IsMatch(Prog * prog,Prog::Inst * ip)264*ccdc9c3eSSadaf Ebrahimi static bool IsMatch(Prog* prog, Prog::Inst* ip) {
265*ccdc9c3eSSadaf Ebrahimi   for (;;) {
266*ccdc9c3eSSadaf Ebrahimi     switch (ip->opcode()) {
267*ccdc9c3eSSadaf Ebrahimi       default:
268*ccdc9c3eSSadaf Ebrahimi         LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
269*ccdc9c3eSSadaf Ebrahimi         return false;
270*ccdc9c3eSSadaf Ebrahimi 
271*ccdc9c3eSSadaf Ebrahimi       case kInstAlt:
272*ccdc9c3eSSadaf Ebrahimi       case kInstAltMatch:
273*ccdc9c3eSSadaf Ebrahimi       case kInstByteRange:
274*ccdc9c3eSSadaf Ebrahimi       case kInstFail:
275*ccdc9c3eSSadaf Ebrahimi       case kInstEmptyWidth:
276*ccdc9c3eSSadaf Ebrahimi         return false;
277*ccdc9c3eSSadaf Ebrahimi 
278*ccdc9c3eSSadaf Ebrahimi       case kInstCapture:
279*ccdc9c3eSSadaf Ebrahimi       case kInstNop:
280*ccdc9c3eSSadaf Ebrahimi         ip = prog->inst(ip->out());
281*ccdc9c3eSSadaf Ebrahimi         break;
282*ccdc9c3eSSadaf Ebrahimi 
283*ccdc9c3eSSadaf Ebrahimi       case kInstMatch:
284*ccdc9c3eSSadaf Ebrahimi         return true;
285*ccdc9c3eSSadaf Ebrahimi     }
286*ccdc9c3eSSadaf Ebrahimi   }
287*ccdc9c3eSSadaf Ebrahimi }
288*ccdc9c3eSSadaf Ebrahimi 
EmptyFlags(const StringPiece & text,const char * p)289*ccdc9c3eSSadaf Ebrahimi uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
290*ccdc9c3eSSadaf Ebrahimi   int flags = 0;
291*ccdc9c3eSSadaf Ebrahimi 
292*ccdc9c3eSSadaf Ebrahimi   // ^ and \A
293*ccdc9c3eSSadaf Ebrahimi   if (p == text.begin())
294*ccdc9c3eSSadaf Ebrahimi     flags |= kEmptyBeginText | kEmptyBeginLine;
295*ccdc9c3eSSadaf Ebrahimi   else if (p[-1] == '\n')
296*ccdc9c3eSSadaf Ebrahimi     flags |= kEmptyBeginLine;
297*ccdc9c3eSSadaf Ebrahimi 
298*ccdc9c3eSSadaf Ebrahimi   // $ and \z
299*ccdc9c3eSSadaf Ebrahimi   if (p == text.end())
300*ccdc9c3eSSadaf Ebrahimi     flags |= kEmptyEndText | kEmptyEndLine;
301*ccdc9c3eSSadaf Ebrahimi   else if (p < text.end() && p[0] == '\n')
302*ccdc9c3eSSadaf Ebrahimi     flags |= kEmptyEndLine;
303*ccdc9c3eSSadaf Ebrahimi 
304*ccdc9c3eSSadaf Ebrahimi   // \b and \B
305*ccdc9c3eSSadaf Ebrahimi   if (p == text.begin() && p == text.end()) {
306*ccdc9c3eSSadaf Ebrahimi     // no word boundary here
307*ccdc9c3eSSadaf Ebrahimi   } else if (p == text.begin()) {
308*ccdc9c3eSSadaf Ebrahimi     if (IsWordChar(p[0]))
309*ccdc9c3eSSadaf Ebrahimi       flags |= kEmptyWordBoundary;
310*ccdc9c3eSSadaf Ebrahimi   } else if (p == text.end()) {
311*ccdc9c3eSSadaf Ebrahimi     if (IsWordChar(p[-1]))
312*ccdc9c3eSSadaf Ebrahimi       flags |= kEmptyWordBoundary;
313*ccdc9c3eSSadaf Ebrahimi   } else {
314*ccdc9c3eSSadaf Ebrahimi     if (IsWordChar(p[-1]) != IsWordChar(p[0]))
315*ccdc9c3eSSadaf Ebrahimi       flags |= kEmptyWordBoundary;
316*ccdc9c3eSSadaf Ebrahimi   }
317*ccdc9c3eSSadaf Ebrahimi   if (!(flags & kEmptyWordBoundary))
318*ccdc9c3eSSadaf Ebrahimi     flags |= kEmptyNonWordBoundary;
319*ccdc9c3eSSadaf Ebrahimi 
320*ccdc9c3eSSadaf Ebrahimi   return flags;
321*ccdc9c3eSSadaf Ebrahimi }
322*ccdc9c3eSSadaf Ebrahimi 
323*ccdc9c3eSSadaf Ebrahimi // ByteMapBuilder implements a coloring algorithm.
324*ccdc9c3eSSadaf Ebrahimi //
325*ccdc9c3eSSadaf Ebrahimi // The first phase is a series of "mark and merge" batches: we mark one or more
326*ccdc9c3eSSadaf Ebrahimi // [lo-hi] ranges, then merge them into our internal state. Batching is not for
327*ccdc9c3eSSadaf Ebrahimi // performance; rather, it means that the ranges are treated indistinguishably.
328*ccdc9c3eSSadaf Ebrahimi //
329*ccdc9c3eSSadaf Ebrahimi // Internally, the ranges are represented using a bitmap that stores the splits
330*ccdc9c3eSSadaf Ebrahimi // and a vector that stores the colors; both of them are indexed by the ranges'
331*ccdc9c3eSSadaf Ebrahimi // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
332*ccdc9c3eSSadaf Ebrahimi // hi (if not already split), then recolor each range in between. The color map
333*ccdc9c3eSSadaf Ebrahimi // (i.e. from the old color to the new color) is maintained for the lifetime of
334*ccdc9c3eSSadaf Ebrahimi // the batch and so underpins this somewhat obscure approach to set operations.
335*ccdc9c3eSSadaf Ebrahimi //
336*ccdc9c3eSSadaf Ebrahimi // The second phase builds the bytemap from our internal state: we recolor each
337*ccdc9c3eSSadaf Ebrahimi // range, then store the new color (which is now the byte class) in each of the
338*ccdc9c3eSSadaf Ebrahimi // corresponding array elements. Finally, we output the number of byte classes.
339*ccdc9c3eSSadaf Ebrahimi class ByteMapBuilder {
340*ccdc9c3eSSadaf Ebrahimi  public:
ByteMapBuilder()341*ccdc9c3eSSadaf Ebrahimi   ByteMapBuilder() {
342*ccdc9c3eSSadaf Ebrahimi     // Initial state: the [0-255] range has color 256.
343*ccdc9c3eSSadaf Ebrahimi     // This will avoid problems during the second phase,
344*ccdc9c3eSSadaf Ebrahimi     // in which we assign byte classes numbered from 0.
345*ccdc9c3eSSadaf Ebrahimi     splits_.Set(255);
346*ccdc9c3eSSadaf Ebrahimi     colors_.resize(256);
347*ccdc9c3eSSadaf Ebrahimi     colors_[255] = 256;
348*ccdc9c3eSSadaf Ebrahimi     nextcolor_ = 257;
349*ccdc9c3eSSadaf Ebrahimi   }
350*ccdc9c3eSSadaf Ebrahimi 
351*ccdc9c3eSSadaf Ebrahimi   void Mark(int lo, int hi);
352*ccdc9c3eSSadaf Ebrahimi   void Merge();
353*ccdc9c3eSSadaf Ebrahimi   void Build(uint8_t* bytemap, int* bytemap_range);
354*ccdc9c3eSSadaf Ebrahimi 
355*ccdc9c3eSSadaf Ebrahimi  private:
356*ccdc9c3eSSadaf Ebrahimi   int Recolor(int oldcolor);
357*ccdc9c3eSSadaf Ebrahimi 
358*ccdc9c3eSSadaf Ebrahimi   Bitmap256 splits_;
359*ccdc9c3eSSadaf Ebrahimi   std::vector<int> colors_;
360*ccdc9c3eSSadaf Ebrahimi   int nextcolor_;
361*ccdc9c3eSSadaf Ebrahimi   std::vector<std::pair<int, int>> colormap_;
362*ccdc9c3eSSadaf Ebrahimi   std::vector<std::pair<int, int>> ranges_;
363*ccdc9c3eSSadaf Ebrahimi 
364*ccdc9c3eSSadaf Ebrahimi   ByteMapBuilder(const ByteMapBuilder&) = delete;
365*ccdc9c3eSSadaf Ebrahimi   ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
366*ccdc9c3eSSadaf Ebrahimi };
367*ccdc9c3eSSadaf Ebrahimi 
Mark(int lo,int hi)368*ccdc9c3eSSadaf Ebrahimi void ByteMapBuilder::Mark(int lo, int hi) {
369*ccdc9c3eSSadaf Ebrahimi   DCHECK_GE(lo, 0);
370*ccdc9c3eSSadaf Ebrahimi   DCHECK_GE(hi, 0);
371*ccdc9c3eSSadaf Ebrahimi   DCHECK_LE(lo, 255);
372*ccdc9c3eSSadaf Ebrahimi   DCHECK_LE(hi, 255);
373*ccdc9c3eSSadaf Ebrahimi   DCHECK_LE(lo, hi);
374*ccdc9c3eSSadaf Ebrahimi 
375*ccdc9c3eSSadaf Ebrahimi   // Ignore any [0-255] ranges. They cause us to recolor every range, which
376*ccdc9c3eSSadaf Ebrahimi   // has no effect on the eventual result and is therefore a waste of time.
377*ccdc9c3eSSadaf Ebrahimi   if (lo == 0 && hi == 255)
378*ccdc9c3eSSadaf Ebrahimi     return;
379*ccdc9c3eSSadaf Ebrahimi 
380*ccdc9c3eSSadaf Ebrahimi   ranges_.emplace_back(lo, hi);
381*ccdc9c3eSSadaf Ebrahimi }
382*ccdc9c3eSSadaf Ebrahimi 
Merge()383*ccdc9c3eSSadaf Ebrahimi void ByteMapBuilder::Merge() {
384*ccdc9c3eSSadaf Ebrahimi   for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
385*ccdc9c3eSSadaf Ebrahimi        it != ranges_.end();
386*ccdc9c3eSSadaf Ebrahimi        ++it) {
387*ccdc9c3eSSadaf Ebrahimi     int lo = it->first-1;
388*ccdc9c3eSSadaf Ebrahimi     int hi = it->second;
389*ccdc9c3eSSadaf Ebrahimi 
390*ccdc9c3eSSadaf Ebrahimi     if (0 <= lo && !splits_.Test(lo)) {
391*ccdc9c3eSSadaf Ebrahimi       splits_.Set(lo);
392*ccdc9c3eSSadaf Ebrahimi       int next = splits_.FindNextSetBit(lo+1);
393*ccdc9c3eSSadaf Ebrahimi       colors_[lo] = colors_[next];
394*ccdc9c3eSSadaf Ebrahimi     }
395*ccdc9c3eSSadaf Ebrahimi     if (!splits_.Test(hi)) {
396*ccdc9c3eSSadaf Ebrahimi       splits_.Set(hi);
397*ccdc9c3eSSadaf Ebrahimi       int next = splits_.FindNextSetBit(hi+1);
398*ccdc9c3eSSadaf Ebrahimi       colors_[hi] = colors_[next];
399*ccdc9c3eSSadaf Ebrahimi     }
400*ccdc9c3eSSadaf Ebrahimi 
401*ccdc9c3eSSadaf Ebrahimi     int c = lo+1;
402*ccdc9c3eSSadaf Ebrahimi     while (c < 256) {
403*ccdc9c3eSSadaf Ebrahimi       int next = splits_.FindNextSetBit(c);
404*ccdc9c3eSSadaf Ebrahimi       colors_[next] = Recolor(colors_[next]);
405*ccdc9c3eSSadaf Ebrahimi       if (next == hi)
406*ccdc9c3eSSadaf Ebrahimi         break;
407*ccdc9c3eSSadaf Ebrahimi       c = next+1;
408*ccdc9c3eSSadaf Ebrahimi     }
409*ccdc9c3eSSadaf Ebrahimi   }
410*ccdc9c3eSSadaf Ebrahimi   colormap_.clear();
411*ccdc9c3eSSadaf Ebrahimi   ranges_.clear();
412*ccdc9c3eSSadaf Ebrahimi }
413*ccdc9c3eSSadaf Ebrahimi 
Build(uint8_t * bytemap,int * bytemap_range)414*ccdc9c3eSSadaf Ebrahimi void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
415*ccdc9c3eSSadaf Ebrahimi   // Assign byte classes numbered from 0.
416*ccdc9c3eSSadaf Ebrahimi   nextcolor_ = 0;
417*ccdc9c3eSSadaf Ebrahimi 
418*ccdc9c3eSSadaf Ebrahimi   int c = 0;
419*ccdc9c3eSSadaf Ebrahimi   while (c < 256) {
420*ccdc9c3eSSadaf Ebrahimi     int next = splits_.FindNextSetBit(c);
421*ccdc9c3eSSadaf Ebrahimi     uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
422*ccdc9c3eSSadaf Ebrahimi     while (c <= next) {
423*ccdc9c3eSSadaf Ebrahimi       bytemap[c] = b;
424*ccdc9c3eSSadaf Ebrahimi       c++;
425*ccdc9c3eSSadaf Ebrahimi     }
426*ccdc9c3eSSadaf Ebrahimi   }
427*ccdc9c3eSSadaf Ebrahimi 
428*ccdc9c3eSSadaf Ebrahimi   *bytemap_range = nextcolor_;
429*ccdc9c3eSSadaf Ebrahimi }
430*ccdc9c3eSSadaf Ebrahimi 
Recolor(int oldcolor)431*ccdc9c3eSSadaf Ebrahimi int ByteMapBuilder::Recolor(int oldcolor) {
432*ccdc9c3eSSadaf Ebrahimi   // Yes, this is a linear search. There can be at most 256
433*ccdc9c3eSSadaf Ebrahimi   // colors and there will typically be far fewer than that.
434*ccdc9c3eSSadaf Ebrahimi   // Also, we need to consider keys *and* values in order to
435*ccdc9c3eSSadaf Ebrahimi   // avoid recoloring a given range more than once per batch.
436*ccdc9c3eSSadaf Ebrahimi   std::vector<std::pair<int, int>>::const_iterator it =
437*ccdc9c3eSSadaf Ebrahimi       std::find_if(colormap_.begin(), colormap_.end(),
438*ccdc9c3eSSadaf Ebrahimi                    [=](const std::pair<int, int>& kv) -> bool {
439*ccdc9c3eSSadaf Ebrahimi                      return kv.first == oldcolor || kv.second == oldcolor;
440*ccdc9c3eSSadaf Ebrahimi                    });
441*ccdc9c3eSSadaf Ebrahimi   if (it != colormap_.end())
442*ccdc9c3eSSadaf Ebrahimi     return it->second;
443*ccdc9c3eSSadaf Ebrahimi   int newcolor = nextcolor_;
444*ccdc9c3eSSadaf Ebrahimi   nextcolor_++;
445*ccdc9c3eSSadaf Ebrahimi   colormap_.emplace_back(oldcolor, newcolor);
446*ccdc9c3eSSadaf Ebrahimi   return newcolor;
447*ccdc9c3eSSadaf Ebrahimi }
448*ccdc9c3eSSadaf Ebrahimi 
ComputeByteMap()449*ccdc9c3eSSadaf Ebrahimi void Prog::ComputeByteMap() {
450*ccdc9c3eSSadaf Ebrahimi   // Fill in bytemap with byte classes for the program.
451*ccdc9c3eSSadaf Ebrahimi   // Ranges of bytes that are treated indistinguishably
452*ccdc9c3eSSadaf Ebrahimi   // will be mapped to a single byte class.
453*ccdc9c3eSSadaf Ebrahimi   ByteMapBuilder builder;
454*ccdc9c3eSSadaf Ebrahimi 
455*ccdc9c3eSSadaf Ebrahimi   // Don't repeat the work for ^ and $.
456*ccdc9c3eSSadaf Ebrahimi   bool marked_line_boundaries = false;
457*ccdc9c3eSSadaf Ebrahimi   // Don't repeat the work for \b and \B.
458*ccdc9c3eSSadaf Ebrahimi   bool marked_word_boundaries = false;
459*ccdc9c3eSSadaf Ebrahimi 
460*ccdc9c3eSSadaf Ebrahimi   for (int id = 0; id < size(); id++) {
461*ccdc9c3eSSadaf Ebrahimi     Inst* ip = inst(id);
462*ccdc9c3eSSadaf Ebrahimi     if (ip->opcode() == kInstByteRange) {
463*ccdc9c3eSSadaf Ebrahimi       int lo = ip->lo();
464*ccdc9c3eSSadaf Ebrahimi       int hi = ip->hi();
465*ccdc9c3eSSadaf Ebrahimi       builder.Mark(lo, hi);
466*ccdc9c3eSSadaf Ebrahimi       if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
467*ccdc9c3eSSadaf Ebrahimi         int foldlo = lo;
468*ccdc9c3eSSadaf Ebrahimi         int foldhi = hi;
469*ccdc9c3eSSadaf Ebrahimi         if (foldlo < 'a')
470*ccdc9c3eSSadaf Ebrahimi           foldlo = 'a';
471*ccdc9c3eSSadaf Ebrahimi         if (foldhi > 'z')
472*ccdc9c3eSSadaf Ebrahimi           foldhi = 'z';
473*ccdc9c3eSSadaf Ebrahimi         if (foldlo <= foldhi)
474*ccdc9c3eSSadaf Ebrahimi           builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a');
475*ccdc9c3eSSadaf Ebrahimi       }
476*ccdc9c3eSSadaf Ebrahimi       // If this Inst is not the last Inst in its list AND the next Inst is
477*ccdc9c3eSSadaf Ebrahimi       // also a ByteRange AND the Insts have the same out, defer the merge.
478*ccdc9c3eSSadaf Ebrahimi       if (!ip->last() &&
479*ccdc9c3eSSadaf Ebrahimi           inst(id+1)->opcode() == kInstByteRange &&
480*ccdc9c3eSSadaf Ebrahimi           ip->out() == inst(id+1)->out())
481*ccdc9c3eSSadaf Ebrahimi         continue;
482*ccdc9c3eSSadaf Ebrahimi       builder.Merge();
483*ccdc9c3eSSadaf Ebrahimi     } else if (ip->opcode() == kInstEmptyWidth) {
484*ccdc9c3eSSadaf Ebrahimi       if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
485*ccdc9c3eSSadaf Ebrahimi           !marked_line_boundaries) {
486*ccdc9c3eSSadaf Ebrahimi         builder.Mark('\n', '\n');
487*ccdc9c3eSSadaf Ebrahimi         builder.Merge();
488*ccdc9c3eSSadaf Ebrahimi         marked_line_boundaries = true;
489*ccdc9c3eSSadaf Ebrahimi       }
490*ccdc9c3eSSadaf Ebrahimi       if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
491*ccdc9c3eSSadaf Ebrahimi           !marked_word_boundaries) {
492*ccdc9c3eSSadaf Ebrahimi         // We require two batches here: the first for ranges that are word
493*ccdc9c3eSSadaf Ebrahimi         // characters, the second for ranges that are not word characters.
494*ccdc9c3eSSadaf Ebrahimi         for (bool isword : {true, false}) {
495*ccdc9c3eSSadaf Ebrahimi           int j;
496*ccdc9c3eSSadaf Ebrahimi           for (int i = 0; i < 256; i = j) {
497*ccdc9c3eSSadaf Ebrahimi             for (j = i + 1; j < 256 &&
498*ccdc9c3eSSadaf Ebrahimi                             Prog::IsWordChar(static_cast<uint8_t>(i)) ==
499*ccdc9c3eSSadaf Ebrahimi                                 Prog::IsWordChar(static_cast<uint8_t>(j));
500*ccdc9c3eSSadaf Ebrahimi                  j++)
501*ccdc9c3eSSadaf Ebrahimi               ;
502*ccdc9c3eSSadaf Ebrahimi             if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
503*ccdc9c3eSSadaf Ebrahimi               builder.Mark(i, j - 1);
504*ccdc9c3eSSadaf Ebrahimi           }
505*ccdc9c3eSSadaf Ebrahimi           builder.Merge();
506*ccdc9c3eSSadaf Ebrahimi         }
507*ccdc9c3eSSadaf Ebrahimi         marked_word_boundaries = true;
508*ccdc9c3eSSadaf Ebrahimi       }
509*ccdc9c3eSSadaf Ebrahimi     }
510*ccdc9c3eSSadaf Ebrahimi   }
511*ccdc9c3eSSadaf Ebrahimi 
512*ccdc9c3eSSadaf Ebrahimi   builder.Build(bytemap_, &bytemap_range_);
513*ccdc9c3eSSadaf Ebrahimi 
514*ccdc9c3eSSadaf Ebrahimi   if (0) {  // For debugging, use trivial bytemap.
515*ccdc9c3eSSadaf Ebrahimi     LOG(ERROR) << "Using trivial bytemap.";
516*ccdc9c3eSSadaf Ebrahimi     for (int i = 0; i < 256; i++)
517*ccdc9c3eSSadaf Ebrahimi       bytemap_[i] = static_cast<uint8_t>(i);
518*ccdc9c3eSSadaf Ebrahimi     bytemap_range_ = 256;
519*ccdc9c3eSSadaf Ebrahimi   }
520*ccdc9c3eSSadaf Ebrahimi }
521*ccdc9c3eSSadaf Ebrahimi 
522*ccdc9c3eSSadaf Ebrahimi // Prog::Flatten() implements a graph rewriting algorithm.
523*ccdc9c3eSSadaf Ebrahimi //
524*ccdc9c3eSSadaf Ebrahimi // The overall process is similar to epsilon removal, but retains some epsilon
525*ccdc9c3eSSadaf Ebrahimi // transitions: those from Capture and EmptyWidth instructions; and those from
526*ccdc9c3eSSadaf Ebrahimi // nullable subexpressions. (The latter avoids quadratic blowup in transitions
527*ccdc9c3eSSadaf Ebrahimi // in the worst case.) It might be best thought of as Alt instruction elision.
528*ccdc9c3eSSadaf Ebrahimi //
529*ccdc9c3eSSadaf Ebrahimi // In conceptual terms, it divides the Prog into "trees" of instructions, then
530*ccdc9c3eSSadaf Ebrahimi // traverses the "trees" in order to produce "lists" of instructions. A "tree"
531*ccdc9c3eSSadaf Ebrahimi // is one or more instructions that grow from one "root" instruction to one or
532*ccdc9c3eSSadaf Ebrahimi // more "leaf" instructions; if a "tree" has exactly one instruction, then the
533*ccdc9c3eSSadaf Ebrahimi // "root" is also the "leaf". In most cases, a "root" is the successor of some
534*ccdc9c3eSSadaf Ebrahimi // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
535*ccdc9c3eSSadaf Ebrahimi // and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
536*ccdc9c3eSSadaf Ebrahimi // EmptyWidth or Match instruction. However, this is insufficient for handling
537*ccdc9c3eSSadaf Ebrahimi // nested nullable subexpressions correctly, so in some cases, a "root" is the
538*ccdc9c3eSSadaf Ebrahimi // dominator of the instructions reachable from some "successor root" (i.e. it
539*ccdc9c3eSSadaf Ebrahimi // has an unreachable predecessor) and is considered a "dominator root". Since
540*ccdc9c3eSSadaf Ebrahimi // only Alt instructions can be "dominator roots" (other instructions would be
541*ccdc9c3eSSadaf Ebrahimi // "leaves"), only Alt instructions are required to be marked as predecessors.
542*ccdc9c3eSSadaf Ebrahimi //
543*ccdc9c3eSSadaf Ebrahimi // Dividing the Prog into "trees" comprises two passes: marking the "successor
544*ccdc9c3eSSadaf Ebrahimi // roots" and the predecessors; and marking the "dominator roots". Sorting the
545*ccdc9c3eSSadaf Ebrahimi // "successor roots" by their bytecode offsets enables iteration in order from
546*ccdc9c3eSSadaf Ebrahimi // greatest to least during the second pass; by working backwards in this case
547*ccdc9c3eSSadaf Ebrahimi // and flooding the graph no further than "leaves" and already marked "roots",
548*ccdc9c3eSSadaf Ebrahimi // it becomes possible to mark "dominator roots" without doing excessive work.
549*ccdc9c3eSSadaf Ebrahimi //
550*ccdc9c3eSSadaf Ebrahimi // Traversing the "trees" is just iterating over the "roots" in order of their
551*ccdc9c3eSSadaf Ebrahimi // marking and flooding the graph no further than "leaves" and "roots". When a
552*ccdc9c3eSSadaf Ebrahimi // "leaf" is reached, the instruction is copied with its successor remapped to
553*ccdc9c3eSSadaf Ebrahimi // its "root" number. When a "root" is reached, a Nop instruction is generated
554*ccdc9c3eSSadaf Ebrahimi // with its successor remapped similarly. As each "list" is produced, its last
555*ccdc9c3eSSadaf Ebrahimi // instruction is marked as such. After all of the "lists" have been produced,
556*ccdc9c3eSSadaf Ebrahimi // a pass over their instructions remaps their successors to bytecode offsets.
Flatten()557*ccdc9c3eSSadaf Ebrahimi void Prog::Flatten() {
558*ccdc9c3eSSadaf Ebrahimi   if (did_flatten_)
559*ccdc9c3eSSadaf Ebrahimi     return;
560*ccdc9c3eSSadaf Ebrahimi   did_flatten_ = true;
561*ccdc9c3eSSadaf Ebrahimi 
562*ccdc9c3eSSadaf Ebrahimi   // Scratch structures. It's important that these are reused by functions
563*ccdc9c3eSSadaf Ebrahimi   // that we call in loops because they would thrash the heap otherwise.
564*ccdc9c3eSSadaf Ebrahimi   SparseSet reachable(size());
565*ccdc9c3eSSadaf Ebrahimi   std::vector<int> stk;
566*ccdc9c3eSSadaf Ebrahimi   stk.reserve(size());
567*ccdc9c3eSSadaf Ebrahimi 
568*ccdc9c3eSSadaf Ebrahimi   // First pass: Marks "successor roots" and predecessors.
569*ccdc9c3eSSadaf Ebrahimi   // Builds the mapping from inst-ids to root-ids.
570*ccdc9c3eSSadaf Ebrahimi   SparseArray<int> rootmap(size());
571*ccdc9c3eSSadaf Ebrahimi   SparseArray<int> predmap(size());
572*ccdc9c3eSSadaf Ebrahimi   std::vector<std::vector<int>> predvec;
573*ccdc9c3eSSadaf Ebrahimi   MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
574*ccdc9c3eSSadaf Ebrahimi 
575*ccdc9c3eSSadaf Ebrahimi   // Second pass: Marks "dominator roots".
576*ccdc9c3eSSadaf Ebrahimi   SparseArray<int> sorted(rootmap);
577*ccdc9c3eSSadaf Ebrahimi   std::sort(sorted.begin(), sorted.end(), sorted.less);
578*ccdc9c3eSSadaf Ebrahimi   for (SparseArray<int>::const_iterator i = sorted.end() - 1;
579*ccdc9c3eSSadaf Ebrahimi        i != sorted.begin();
580*ccdc9c3eSSadaf Ebrahimi        --i) {
581*ccdc9c3eSSadaf Ebrahimi     if (i->index() != start_unanchored() && i->index() != start())
582*ccdc9c3eSSadaf Ebrahimi       MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
583*ccdc9c3eSSadaf Ebrahimi   }
584*ccdc9c3eSSadaf Ebrahimi 
585*ccdc9c3eSSadaf Ebrahimi   // Third pass: Emits "lists". Remaps outs to root-ids.
586*ccdc9c3eSSadaf Ebrahimi   // Builds the mapping from root-ids to flat-ids.
587*ccdc9c3eSSadaf Ebrahimi   std::vector<int> flatmap(rootmap.size());
588*ccdc9c3eSSadaf Ebrahimi   std::vector<Inst> flat;
589*ccdc9c3eSSadaf Ebrahimi   flat.reserve(size());
590*ccdc9c3eSSadaf Ebrahimi   for (SparseArray<int>::const_iterator i = rootmap.begin();
591*ccdc9c3eSSadaf Ebrahimi        i != rootmap.end();
592*ccdc9c3eSSadaf Ebrahimi        ++i) {
593*ccdc9c3eSSadaf Ebrahimi     flatmap[i->value()] = static_cast<int>(flat.size());
594*ccdc9c3eSSadaf Ebrahimi     EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
595*ccdc9c3eSSadaf Ebrahimi     flat.back().set_last();
596*ccdc9c3eSSadaf Ebrahimi   }
597*ccdc9c3eSSadaf Ebrahimi 
598*ccdc9c3eSSadaf Ebrahimi   list_count_ = static_cast<int>(flatmap.size());
599*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < kNumInst; i++)
600*ccdc9c3eSSadaf Ebrahimi     inst_count_[i] = 0;
601*ccdc9c3eSSadaf Ebrahimi 
602*ccdc9c3eSSadaf Ebrahimi   // Fourth pass: Remaps outs to flat-ids.
603*ccdc9c3eSSadaf Ebrahimi   // Counts instructions by opcode.
604*ccdc9c3eSSadaf Ebrahimi   for (int id = 0; id < static_cast<int>(flat.size()); id++) {
605*ccdc9c3eSSadaf Ebrahimi     Inst* ip = &flat[id];
606*ccdc9c3eSSadaf Ebrahimi     if (ip->opcode() != kInstAltMatch)  // handled in EmitList()
607*ccdc9c3eSSadaf Ebrahimi       ip->set_out(flatmap[ip->out()]);
608*ccdc9c3eSSadaf Ebrahimi     inst_count_[ip->opcode()]++;
609*ccdc9c3eSSadaf Ebrahimi   }
610*ccdc9c3eSSadaf Ebrahimi 
611*ccdc9c3eSSadaf Ebrahimi   int total = 0;
612*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < kNumInst; i++)
613*ccdc9c3eSSadaf Ebrahimi     total += inst_count_[i];
614*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(total, static_cast<int>(flat.size()));
615*ccdc9c3eSSadaf Ebrahimi 
616*ccdc9c3eSSadaf Ebrahimi   // Remap start_unanchored and start.
617*ccdc9c3eSSadaf Ebrahimi   if (start_unanchored() == 0) {
618*ccdc9c3eSSadaf Ebrahimi     DCHECK_EQ(start(), 0);
619*ccdc9c3eSSadaf Ebrahimi   } else if (start_unanchored() == start()) {
620*ccdc9c3eSSadaf Ebrahimi     set_start_unanchored(flatmap[1]);
621*ccdc9c3eSSadaf Ebrahimi     set_start(flatmap[1]);
622*ccdc9c3eSSadaf Ebrahimi   } else {
623*ccdc9c3eSSadaf Ebrahimi     set_start_unanchored(flatmap[1]);
624*ccdc9c3eSSadaf Ebrahimi     set_start(flatmap[2]);
625*ccdc9c3eSSadaf Ebrahimi   }
626*ccdc9c3eSSadaf Ebrahimi 
627*ccdc9c3eSSadaf Ebrahimi   // Finally, replace the old instructions with the new instructions.
628*ccdc9c3eSSadaf Ebrahimi   size_ = static_cast<int>(flat.size());
629*ccdc9c3eSSadaf Ebrahimi   inst_ = PODArray<Inst>(size_);
630*ccdc9c3eSSadaf Ebrahimi   memmove(inst_.data(), flat.data(), size_*sizeof(inst_[0]));
631*ccdc9c3eSSadaf Ebrahimi }
632*ccdc9c3eSSadaf Ebrahimi 
MarkSuccessors(SparseArray<int> * rootmap,SparseArray<int> * predmap,std::vector<std::vector<int>> * predvec,SparseSet * reachable,std::vector<int> * stk)633*ccdc9c3eSSadaf Ebrahimi void Prog::MarkSuccessors(SparseArray<int>* rootmap,
634*ccdc9c3eSSadaf Ebrahimi                           SparseArray<int>* predmap,
635*ccdc9c3eSSadaf Ebrahimi                           std::vector<std::vector<int>>* predvec,
636*ccdc9c3eSSadaf Ebrahimi                           SparseSet* reachable, std::vector<int>* stk) {
637*ccdc9c3eSSadaf Ebrahimi   // Mark the kInstFail instruction.
638*ccdc9c3eSSadaf Ebrahimi   rootmap->set_new(0, rootmap->size());
639*ccdc9c3eSSadaf Ebrahimi 
640*ccdc9c3eSSadaf Ebrahimi   // Mark the start_unanchored and start instructions.
641*ccdc9c3eSSadaf Ebrahimi   if (!rootmap->has_index(start_unanchored()))
642*ccdc9c3eSSadaf Ebrahimi     rootmap->set_new(start_unanchored(), rootmap->size());
643*ccdc9c3eSSadaf Ebrahimi   if (!rootmap->has_index(start()))
644*ccdc9c3eSSadaf Ebrahimi     rootmap->set_new(start(), rootmap->size());
645*ccdc9c3eSSadaf Ebrahimi 
646*ccdc9c3eSSadaf Ebrahimi   reachable->clear();
647*ccdc9c3eSSadaf Ebrahimi   stk->clear();
648*ccdc9c3eSSadaf Ebrahimi   stk->push_back(start_unanchored());
649*ccdc9c3eSSadaf Ebrahimi   while (!stk->empty()) {
650*ccdc9c3eSSadaf Ebrahimi     int id = stk->back();
651*ccdc9c3eSSadaf Ebrahimi     stk->pop_back();
652*ccdc9c3eSSadaf Ebrahimi   Loop:
653*ccdc9c3eSSadaf Ebrahimi     if (reachable->contains(id))
654*ccdc9c3eSSadaf Ebrahimi       continue;
655*ccdc9c3eSSadaf Ebrahimi     reachable->insert_new(id);
656*ccdc9c3eSSadaf Ebrahimi 
657*ccdc9c3eSSadaf Ebrahimi     Inst* ip = inst(id);
658*ccdc9c3eSSadaf Ebrahimi     switch (ip->opcode()) {
659*ccdc9c3eSSadaf Ebrahimi       default:
660*ccdc9c3eSSadaf Ebrahimi         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
661*ccdc9c3eSSadaf Ebrahimi         break;
662*ccdc9c3eSSadaf Ebrahimi 
663*ccdc9c3eSSadaf Ebrahimi       case kInstAltMatch:
664*ccdc9c3eSSadaf Ebrahimi       case kInstAlt:
665*ccdc9c3eSSadaf Ebrahimi         // Mark this instruction as a predecessor of each out.
666*ccdc9c3eSSadaf Ebrahimi         for (int out : {ip->out(), ip->out1()}) {
667*ccdc9c3eSSadaf Ebrahimi           if (!predmap->has_index(out)) {
668*ccdc9c3eSSadaf Ebrahimi             predmap->set_new(out, static_cast<int>(predvec->size()));
669*ccdc9c3eSSadaf Ebrahimi             predvec->emplace_back();
670*ccdc9c3eSSadaf Ebrahimi           }
671*ccdc9c3eSSadaf Ebrahimi           (*predvec)[predmap->get_existing(out)].emplace_back(id);
672*ccdc9c3eSSadaf Ebrahimi         }
673*ccdc9c3eSSadaf Ebrahimi         stk->push_back(ip->out1());
674*ccdc9c3eSSadaf Ebrahimi         id = ip->out();
675*ccdc9c3eSSadaf Ebrahimi         goto Loop;
676*ccdc9c3eSSadaf Ebrahimi 
677*ccdc9c3eSSadaf Ebrahimi       case kInstByteRange:
678*ccdc9c3eSSadaf Ebrahimi       case kInstCapture:
679*ccdc9c3eSSadaf Ebrahimi       case kInstEmptyWidth:
680*ccdc9c3eSSadaf Ebrahimi         // Mark the out of this instruction as a "root".
681*ccdc9c3eSSadaf Ebrahimi         if (!rootmap->has_index(ip->out()))
682*ccdc9c3eSSadaf Ebrahimi           rootmap->set_new(ip->out(), rootmap->size());
683*ccdc9c3eSSadaf Ebrahimi         id = ip->out();
684*ccdc9c3eSSadaf Ebrahimi         goto Loop;
685*ccdc9c3eSSadaf Ebrahimi 
686*ccdc9c3eSSadaf Ebrahimi       case kInstNop:
687*ccdc9c3eSSadaf Ebrahimi         id = ip->out();
688*ccdc9c3eSSadaf Ebrahimi         goto Loop;
689*ccdc9c3eSSadaf Ebrahimi 
690*ccdc9c3eSSadaf Ebrahimi       case kInstMatch:
691*ccdc9c3eSSadaf Ebrahimi       case kInstFail:
692*ccdc9c3eSSadaf Ebrahimi         break;
693*ccdc9c3eSSadaf Ebrahimi     }
694*ccdc9c3eSSadaf Ebrahimi   }
695*ccdc9c3eSSadaf Ebrahimi }
696*ccdc9c3eSSadaf Ebrahimi 
MarkDominator(int root,SparseArray<int> * rootmap,SparseArray<int> * predmap,std::vector<std::vector<int>> * predvec,SparseSet * reachable,std::vector<int> * stk)697*ccdc9c3eSSadaf Ebrahimi void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
698*ccdc9c3eSSadaf Ebrahimi                          SparseArray<int>* predmap,
699*ccdc9c3eSSadaf Ebrahimi                          std::vector<std::vector<int>>* predvec,
700*ccdc9c3eSSadaf Ebrahimi                          SparseSet* reachable, std::vector<int>* stk) {
701*ccdc9c3eSSadaf Ebrahimi   reachable->clear();
702*ccdc9c3eSSadaf Ebrahimi   stk->clear();
703*ccdc9c3eSSadaf Ebrahimi   stk->push_back(root);
704*ccdc9c3eSSadaf Ebrahimi   while (!stk->empty()) {
705*ccdc9c3eSSadaf Ebrahimi     int id = stk->back();
706*ccdc9c3eSSadaf Ebrahimi     stk->pop_back();
707*ccdc9c3eSSadaf Ebrahimi   Loop:
708*ccdc9c3eSSadaf Ebrahimi     if (reachable->contains(id))
709*ccdc9c3eSSadaf Ebrahimi       continue;
710*ccdc9c3eSSadaf Ebrahimi     reachable->insert_new(id);
711*ccdc9c3eSSadaf Ebrahimi 
712*ccdc9c3eSSadaf Ebrahimi     if (id != root && rootmap->has_index(id)) {
713*ccdc9c3eSSadaf Ebrahimi       // We reached another "tree" via epsilon transition.
714*ccdc9c3eSSadaf Ebrahimi       continue;
715*ccdc9c3eSSadaf Ebrahimi     }
716*ccdc9c3eSSadaf Ebrahimi 
717*ccdc9c3eSSadaf Ebrahimi     Inst* ip = inst(id);
718*ccdc9c3eSSadaf Ebrahimi     switch (ip->opcode()) {
719*ccdc9c3eSSadaf Ebrahimi       default:
720*ccdc9c3eSSadaf Ebrahimi         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
721*ccdc9c3eSSadaf Ebrahimi         break;
722*ccdc9c3eSSadaf Ebrahimi 
723*ccdc9c3eSSadaf Ebrahimi       case kInstAltMatch:
724*ccdc9c3eSSadaf Ebrahimi       case kInstAlt:
725*ccdc9c3eSSadaf Ebrahimi         stk->push_back(ip->out1());
726*ccdc9c3eSSadaf Ebrahimi         id = ip->out();
727*ccdc9c3eSSadaf Ebrahimi         goto Loop;
728*ccdc9c3eSSadaf Ebrahimi 
729*ccdc9c3eSSadaf Ebrahimi       case kInstByteRange:
730*ccdc9c3eSSadaf Ebrahimi       case kInstCapture:
731*ccdc9c3eSSadaf Ebrahimi       case kInstEmptyWidth:
732*ccdc9c3eSSadaf Ebrahimi         break;
733*ccdc9c3eSSadaf Ebrahimi 
734*ccdc9c3eSSadaf Ebrahimi       case kInstNop:
735*ccdc9c3eSSadaf Ebrahimi         id = ip->out();
736*ccdc9c3eSSadaf Ebrahimi         goto Loop;
737*ccdc9c3eSSadaf Ebrahimi 
738*ccdc9c3eSSadaf Ebrahimi       case kInstMatch:
739*ccdc9c3eSSadaf Ebrahimi       case kInstFail:
740*ccdc9c3eSSadaf Ebrahimi         break;
741*ccdc9c3eSSadaf Ebrahimi     }
742*ccdc9c3eSSadaf Ebrahimi   }
743*ccdc9c3eSSadaf Ebrahimi 
744*ccdc9c3eSSadaf Ebrahimi   for (SparseSet::const_iterator i = reachable->begin();
745*ccdc9c3eSSadaf Ebrahimi        i != reachable->end();
746*ccdc9c3eSSadaf Ebrahimi        ++i) {
747*ccdc9c3eSSadaf Ebrahimi     int id = *i;
748*ccdc9c3eSSadaf Ebrahimi     if (predmap->has_index(id)) {
749*ccdc9c3eSSadaf Ebrahimi       for (int pred : (*predvec)[predmap->get_existing(id)]) {
750*ccdc9c3eSSadaf Ebrahimi         if (!reachable->contains(pred)) {
751*ccdc9c3eSSadaf Ebrahimi           // id has a predecessor that cannot be reached from root!
752*ccdc9c3eSSadaf Ebrahimi           // Therefore, id must be a "root" too - mark it as such.
753*ccdc9c3eSSadaf Ebrahimi           if (!rootmap->has_index(id))
754*ccdc9c3eSSadaf Ebrahimi             rootmap->set_new(id, rootmap->size());
755*ccdc9c3eSSadaf Ebrahimi         }
756*ccdc9c3eSSadaf Ebrahimi       }
757*ccdc9c3eSSadaf Ebrahimi     }
758*ccdc9c3eSSadaf Ebrahimi   }
759*ccdc9c3eSSadaf Ebrahimi }
760*ccdc9c3eSSadaf Ebrahimi 
EmitList(int root,SparseArray<int> * rootmap,std::vector<Inst> * flat,SparseSet * reachable,std::vector<int> * stk)761*ccdc9c3eSSadaf Ebrahimi void Prog::EmitList(int root, SparseArray<int>* rootmap,
762*ccdc9c3eSSadaf Ebrahimi                     std::vector<Inst>* flat,
763*ccdc9c3eSSadaf Ebrahimi                     SparseSet* reachable, std::vector<int>* stk) {
764*ccdc9c3eSSadaf Ebrahimi   reachable->clear();
765*ccdc9c3eSSadaf Ebrahimi   stk->clear();
766*ccdc9c3eSSadaf Ebrahimi   stk->push_back(root);
767*ccdc9c3eSSadaf Ebrahimi   while (!stk->empty()) {
768*ccdc9c3eSSadaf Ebrahimi     int id = stk->back();
769*ccdc9c3eSSadaf Ebrahimi     stk->pop_back();
770*ccdc9c3eSSadaf Ebrahimi   Loop:
771*ccdc9c3eSSadaf Ebrahimi     if (reachable->contains(id))
772*ccdc9c3eSSadaf Ebrahimi       continue;
773*ccdc9c3eSSadaf Ebrahimi     reachable->insert_new(id);
774*ccdc9c3eSSadaf Ebrahimi 
775*ccdc9c3eSSadaf Ebrahimi     if (id != root && rootmap->has_index(id)) {
776*ccdc9c3eSSadaf Ebrahimi       // We reached another "tree" via epsilon transition. Emit a kInstNop
777*ccdc9c3eSSadaf Ebrahimi       // instruction so that the Prog does not become quadratically larger.
778*ccdc9c3eSSadaf Ebrahimi       flat->emplace_back();
779*ccdc9c3eSSadaf Ebrahimi       flat->back().set_opcode(kInstNop);
780*ccdc9c3eSSadaf Ebrahimi       flat->back().set_out(rootmap->get_existing(id));
781*ccdc9c3eSSadaf Ebrahimi       continue;
782*ccdc9c3eSSadaf Ebrahimi     }
783*ccdc9c3eSSadaf Ebrahimi 
784*ccdc9c3eSSadaf Ebrahimi     Inst* ip = inst(id);
785*ccdc9c3eSSadaf Ebrahimi     switch (ip->opcode()) {
786*ccdc9c3eSSadaf Ebrahimi       default:
787*ccdc9c3eSSadaf Ebrahimi         LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
788*ccdc9c3eSSadaf Ebrahimi         break;
789*ccdc9c3eSSadaf Ebrahimi 
790*ccdc9c3eSSadaf Ebrahimi       case kInstAltMatch:
791*ccdc9c3eSSadaf Ebrahimi         flat->emplace_back();
792*ccdc9c3eSSadaf Ebrahimi         flat->back().set_opcode(kInstAltMatch);
793*ccdc9c3eSSadaf Ebrahimi         flat->back().set_out(static_cast<int>(flat->size()));
794*ccdc9c3eSSadaf Ebrahimi         flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
795*ccdc9c3eSSadaf Ebrahimi         FALLTHROUGH_INTENDED;
796*ccdc9c3eSSadaf Ebrahimi 
797*ccdc9c3eSSadaf Ebrahimi       case kInstAlt:
798*ccdc9c3eSSadaf Ebrahimi         stk->push_back(ip->out1());
799*ccdc9c3eSSadaf Ebrahimi         id = ip->out();
800*ccdc9c3eSSadaf Ebrahimi         goto Loop;
801*ccdc9c3eSSadaf Ebrahimi 
802*ccdc9c3eSSadaf Ebrahimi       case kInstByteRange:
803*ccdc9c3eSSadaf Ebrahimi       case kInstCapture:
804*ccdc9c3eSSadaf Ebrahimi       case kInstEmptyWidth:
805*ccdc9c3eSSadaf Ebrahimi         flat->emplace_back();
806*ccdc9c3eSSadaf Ebrahimi         memmove(&flat->back(), ip, sizeof *ip);
807*ccdc9c3eSSadaf Ebrahimi         flat->back().set_out(rootmap->get_existing(ip->out()));
808*ccdc9c3eSSadaf Ebrahimi         break;
809*ccdc9c3eSSadaf Ebrahimi 
810*ccdc9c3eSSadaf Ebrahimi       case kInstNop:
811*ccdc9c3eSSadaf Ebrahimi         id = ip->out();
812*ccdc9c3eSSadaf Ebrahimi         goto Loop;
813*ccdc9c3eSSadaf Ebrahimi 
814*ccdc9c3eSSadaf Ebrahimi       case kInstMatch:
815*ccdc9c3eSSadaf Ebrahimi       case kInstFail:
816*ccdc9c3eSSadaf Ebrahimi         flat->emplace_back();
817*ccdc9c3eSSadaf Ebrahimi         memmove(&flat->back(), ip, sizeof *ip);
818*ccdc9c3eSSadaf Ebrahimi         break;
819*ccdc9c3eSSadaf Ebrahimi     }
820*ccdc9c3eSSadaf Ebrahimi   }
821*ccdc9c3eSSadaf Ebrahimi }
822*ccdc9c3eSSadaf Ebrahimi 
823*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
824