xref: /aosp_15_r20/external/regex-re2/re2/nfa.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi // Tested by search_test.cc.
6*ccdc9c3eSSadaf Ebrahimi //
7*ccdc9c3eSSadaf Ebrahimi // Prog::SearchNFA, an NFA search.
8*ccdc9c3eSSadaf Ebrahimi // This is an actual NFA like the theorists talk about,
9*ccdc9c3eSSadaf Ebrahimi // not the pseudo-NFA found in backtracking regexp implementations.
10*ccdc9c3eSSadaf Ebrahimi //
11*ccdc9c3eSSadaf Ebrahimi // IMPLEMENTATION
12*ccdc9c3eSSadaf Ebrahimi //
13*ccdc9c3eSSadaf Ebrahimi // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14*ccdc9c3eSSadaf Ebrahimi // which is a variant of the one described in Thompson's 1968 CACM paper.
15*ccdc9c3eSSadaf Ebrahimi // See http://swtch.com/~rsc/regexp/ for various history.  The main feature
16*ccdc9c3eSSadaf Ebrahimi // over the DFA implementation is that it tracks submatch boundaries.
17*ccdc9c3eSSadaf Ebrahimi //
18*ccdc9c3eSSadaf Ebrahimi // When the choice of submatch boundaries is ambiguous, this particular
19*ccdc9c3eSSadaf Ebrahimi // implementation makes the same choices that traditional backtracking
20*ccdc9c3eSSadaf Ebrahimi // implementations (in particular, Perl and PCRE) do.
21*ccdc9c3eSSadaf Ebrahimi // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22*ccdc9c3eSSadaf Ebrahimi // time in the length of the input.
23*ccdc9c3eSSadaf Ebrahimi //
24*ccdc9c3eSSadaf Ebrahimi // Like Thompson's original machine and like the DFA implementation, this
25*ccdc9c3eSSadaf Ebrahimi // implementation notices a match only once it is one byte past it.
26*ccdc9c3eSSadaf Ebrahimi 
27*ccdc9c3eSSadaf Ebrahimi #include <stdio.h>
28*ccdc9c3eSSadaf Ebrahimi #include <string.h>
29*ccdc9c3eSSadaf Ebrahimi #include <algorithm>
30*ccdc9c3eSSadaf Ebrahimi #include <string>
31*ccdc9c3eSSadaf Ebrahimi #include <utility>
32*ccdc9c3eSSadaf Ebrahimi #include <vector>
33*ccdc9c3eSSadaf Ebrahimi 
34*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
35*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
36*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
37*ccdc9c3eSSadaf Ebrahimi #include "util/pod_array.h"
38*ccdc9c3eSSadaf Ebrahimi #include "util/sparse_array.h"
39*ccdc9c3eSSadaf Ebrahimi #include "util/sparse_set.h"
40*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
41*ccdc9c3eSSadaf Ebrahimi 
42*ccdc9c3eSSadaf Ebrahimi namespace re2 {
43*ccdc9c3eSSadaf Ebrahimi 
44*ccdc9c3eSSadaf Ebrahimi static const bool ExtraDebug = false;
45*ccdc9c3eSSadaf Ebrahimi 
46*ccdc9c3eSSadaf Ebrahimi class NFA {
47*ccdc9c3eSSadaf Ebrahimi  public:
48*ccdc9c3eSSadaf Ebrahimi   NFA(Prog* prog);
49*ccdc9c3eSSadaf Ebrahimi   ~NFA();
50*ccdc9c3eSSadaf Ebrahimi 
51*ccdc9c3eSSadaf Ebrahimi   // Searches for a matching string.
52*ccdc9c3eSSadaf Ebrahimi   //   * If anchored is true, only considers matches starting at offset.
53*ccdc9c3eSSadaf Ebrahimi   //     Otherwise finds lefmost match at or after offset.
54*ccdc9c3eSSadaf Ebrahimi   //   * If longest is true, returns the longest match starting
55*ccdc9c3eSSadaf Ebrahimi   //     at the chosen start point.  Otherwise returns the so-called
56*ccdc9c3eSSadaf Ebrahimi   //     left-biased match, the one traditional backtracking engines
57*ccdc9c3eSSadaf Ebrahimi   //     (like Perl and PCRE) find.
58*ccdc9c3eSSadaf Ebrahimi   // Records submatch boundaries in submatch[1..nsubmatch-1].
59*ccdc9c3eSSadaf Ebrahimi   // Submatch[0] is the entire match.  When there is a choice in
60*ccdc9c3eSSadaf Ebrahimi   // which text matches each subexpression, the submatch boundaries
61*ccdc9c3eSSadaf Ebrahimi   // are chosen to match what a backtracking implementation would choose.
62*ccdc9c3eSSadaf Ebrahimi   bool Search(const StringPiece& text, const StringPiece& context,
63*ccdc9c3eSSadaf Ebrahimi               bool anchored, bool longest,
64*ccdc9c3eSSadaf Ebrahimi               StringPiece* submatch, int nsubmatch);
65*ccdc9c3eSSadaf Ebrahimi 
66*ccdc9c3eSSadaf Ebrahimi  private:
67*ccdc9c3eSSadaf Ebrahimi   struct Thread {
68*ccdc9c3eSSadaf Ebrahimi     union {
69*ccdc9c3eSSadaf Ebrahimi       int ref;
70*ccdc9c3eSSadaf Ebrahimi       Thread* next;  // when on free list
71*ccdc9c3eSSadaf Ebrahimi     };
72*ccdc9c3eSSadaf Ebrahimi     const char** capture;
73*ccdc9c3eSSadaf Ebrahimi   };
74*ccdc9c3eSSadaf Ebrahimi 
75*ccdc9c3eSSadaf Ebrahimi   // State for explicit stack in AddToThreadq.
76*ccdc9c3eSSadaf Ebrahimi   struct AddState {
77*ccdc9c3eSSadaf Ebrahimi     int id;     // Inst to process
78*ccdc9c3eSSadaf Ebrahimi     Thread* t;  // if not null, set t0 = t before processing id
79*ccdc9c3eSSadaf Ebrahimi   };
80*ccdc9c3eSSadaf Ebrahimi 
81*ccdc9c3eSSadaf Ebrahimi   // Threadq is a list of threads.  The list is sorted by the order
82*ccdc9c3eSSadaf Ebrahimi   // in which Perl would explore that particular state -- the earlier
83*ccdc9c3eSSadaf Ebrahimi   // choices appear earlier in the list.
84*ccdc9c3eSSadaf Ebrahimi   typedef SparseArray<Thread*> Threadq;
85*ccdc9c3eSSadaf Ebrahimi 
86*ccdc9c3eSSadaf Ebrahimi   inline Thread* AllocThread();
87*ccdc9c3eSSadaf Ebrahimi   inline Thread* Incref(Thread* t);
88*ccdc9c3eSSadaf Ebrahimi   inline void Decref(Thread* t);
89*ccdc9c3eSSadaf Ebrahimi 
90*ccdc9c3eSSadaf Ebrahimi   // Follows all empty arrows from id0 and enqueues all the states reached.
91*ccdc9c3eSSadaf Ebrahimi   // Enqueues only the ByteRange instructions that match byte c.
92*ccdc9c3eSSadaf Ebrahimi   // context is used (with p) for evaluating empty-width specials.
93*ccdc9c3eSSadaf Ebrahimi   // p is the current input position, and t0 is the current thread.
94*ccdc9c3eSSadaf Ebrahimi   void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
95*ccdc9c3eSSadaf Ebrahimi                     const char* p, Thread* t0);
96*ccdc9c3eSSadaf Ebrahimi 
97*ccdc9c3eSSadaf Ebrahimi   // Run runq on byte c, appending new states to nextq.
98*ccdc9c3eSSadaf Ebrahimi   // Updates matched_ and match_ as new, better matches are found.
99*ccdc9c3eSSadaf Ebrahimi   // context is used (with p) for evaluating empty-width specials.
100*ccdc9c3eSSadaf Ebrahimi   // p is the position of byte c in the input string for AddToThreadq;
101*ccdc9c3eSSadaf Ebrahimi   // p-1 will be used when processing Match instructions.
102*ccdc9c3eSSadaf Ebrahimi   // Frees all the threads on runq.
103*ccdc9c3eSSadaf Ebrahimi   // If there is a shortcut to the end, returns that shortcut.
104*ccdc9c3eSSadaf Ebrahimi   int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
105*ccdc9c3eSSadaf Ebrahimi            const char* p);
106*ccdc9c3eSSadaf Ebrahimi 
107*ccdc9c3eSSadaf Ebrahimi   // Returns text version of capture information, for debugging.
108*ccdc9c3eSSadaf Ebrahimi   string FormatCapture(const char** capture);
109*ccdc9c3eSSadaf Ebrahimi 
110*ccdc9c3eSSadaf Ebrahimi   inline void CopyCapture(const char** dst, const char** src);
111*ccdc9c3eSSadaf Ebrahimi 
112*ccdc9c3eSSadaf Ebrahimi   Prog* prog_;                // underlying program
113*ccdc9c3eSSadaf Ebrahimi   int start_;                 // start instruction in program
114*ccdc9c3eSSadaf Ebrahimi   int ncapture_;              // number of submatches to track
115*ccdc9c3eSSadaf Ebrahimi   bool longest_;              // whether searching for longest match
116*ccdc9c3eSSadaf Ebrahimi   bool endmatch_;             // whether match must end at text.end()
117*ccdc9c3eSSadaf Ebrahimi   const char* btext_;         // beginning of text being matched (for FormatSubmatch)
118*ccdc9c3eSSadaf Ebrahimi   const char* etext_;         // end of text being matched (for endmatch_)
119*ccdc9c3eSSadaf Ebrahimi   Threadq q0_, q1_;           // pre-allocated for Search.
120*ccdc9c3eSSadaf Ebrahimi   PODArray<AddState> stack_;  // pre-allocated for AddToThreadq
121*ccdc9c3eSSadaf Ebrahimi   Thread* free_threads_;      // free list
122*ccdc9c3eSSadaf Ebrahimi   const char** match_;        // best match so far
123*ccdc9c3eSSadaf Ebrahimi   bool matched_;              // any match so far?
124*ccdc9c3eSSadaf Ebrahimi 
125*ccdc9c3eSSadaf Ebrahimi   NFA(const NFA&) = delete;
126*ccdc9c3eSSadaf Ebrahimi   NFA& operator=(const NFA&) = delete;
127*ccdc9c3eSSadaf Ebrahimi };
128*ccdc9c3eSSadaf Ebrahimi 
NFA(Prog * prog)129*ccdc9c3eSSadaf Ebrahimi NFA::NFA(Prog* prog) {
130*ccdc9c3eSSadaf Ebrahimi   prog_ = prog;
131*ccdc9c3eSSadaf Ebrahimi   start_ = prog_->start();
132*ccdc9c3eSSadaf Ebrahimi   ncapture_ = 0;
133*ccdc9c3eSSadaf Ebrahimi   longest_ = false;
134*ccdc9c3eSSadaf Ebrahimi   endmatch_ = false;
135*ccdc9c3eSSadaf Ebrahimi   btext_ = NULL;
136*ccdc9c3eSSadaf Ebrahimi   etext_ = NULL;
137*ccdc9c3eSSadaf Ebrahimi   q0_.resize(prog_->size());
138*ccdc9c3eSSadaf Ebrahimi   q1_.resize(prog_->size());
139*ccdc9c3eSSadaf Ebrahimi   // See NFA::AddToThreadq() for why this is so.
140*ccdc9c3eSSadaf Ebrahimi   int nstack = 2*prog_->inst_count(kInstCapture) +
141*ccdc9c3eSSadaf Ebrahimi                prog_->inst_count(kInstEmptyWidth) +
142*ccdc9c3eSSadaf Ebrahimi                prog_->inst_count(kInstNop) + 1;  // + 1 for start inst
143*ccdc9c3eSSadaf Ebrahimi   stack_ = PODArray<AddState>(nstack);
144*ccdc9c3eSSadaf Ebrahimi   free_threads_ = NULL;
145*ccdc9c3eSSadaf Ebrahimi   match_ = NULL;
146*ccdc9c3eSSadaf Ebrahimi   matched_ = false;
147*ccdc9c3eSSadaf Ebrahimi }
148*ccdc9c3eSSadaf Ebrahimi 
~NFA()149*ccdc9c3eSSadaf Ebrahimi NFA::~NFA() {
150*ccdc9c3eSSadaf Ebrahimi   delete[] match_;
151*ccdc9c3eSSadaf Ebrahimi   Thread* next;
152*ccdc9c3eSSadaf Ebrahimi   for (Thread* t = free_threads_; t; t = next) {
153*ccdc9c3eSSadaf Ebrahimi     next = t->next;
154*ccdc9c3eSSadaf Ebrahimi     delete[] t->capture;
155*ccdc9c3eSSadaf Ebrahimi     delete t;
156*ccdc9c3eSSadaf Ebrahimi   }
157*ccdc9c3eSSadaf Ebrahimi }
158*ccdc9c3eSSadaf Ebrahimi 
AllocThread()159*ccdc9c3eSSadaf Ebrahimi NFA::Thread* NFA::AllocThread() {
160*ccdc9c3eSSadaf Ebrahimi   Thread* t = free_threads_;
161*ccdc9c3eSSadaf Ebrahimi   if (t == NULL) {
162*ccdc9c3eSSadaf Ebrahimi     t = new Thread;
163*ccdc9c3eSSadaf Ebrahimi     t->ref = 1;
164*ccdc9c3eSSadaf Ebrahimi     t->capture = new const char*[ncapture_];
165*ccdc9c3eSSadaf Ebrahimi     return t;
166*ccdc9c3eSSadaf Ebrahimi   }
167*ccdc9c3eSSadaf Ebrahimi   free_threads_ = t->next;
168*ccdc9c3eSSadaf Ebrahimi   t->ref = 1;
169*ccdc9c3eSSadaf Ebrahimi   return t;
170*ccdc9c3eSSadaf Ebrahimi }
171*ccdc9c3eSSadaf Ebrahimi 
Incref(Thread * t)172*ccdc9c3eSSadaf Ebrahimi NFA::Thread* NFA::Incref(Thread* t) {
173*ccdc9c3eSSadaf Ebrahimi   DCHECK(t != NULL);
174*ccdc9c3eSSadaf Ebrahimi   t->ref++;
175*ccdc9c3eSSadaf Ebrahimi   return t;
176*ccdc9c3eSSadaf Ebrahimi }
177*ccdc9c3eSSadaf Ebrahimi 
Decref(Thread * t)178*ccdc9c3eSSadaf Ebrahimi void NFA::Decref(Thread* t) {
179*ccdc9c3eSSadaf Ebrahimi   if (t == NULL)
180*ccdc9c3eSSadaf Ebrahimi     return;
181*ccdc9c3eSSadaf Ebrahimi   t->ref--;
182*ccdc9c3eSSadaf Ebrahimi   if (t->ref > 0)
183*ccdc9c3eSSadaf Ebrahimi     return;
184*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(t->ref, 0);
185*ccdc9c3eSSadaf Ebrahimi   t->next = free_threads_;
186*ccdc9c3eSSadaf Ebrahimi   free_threads_ = t;
187*ccdc9c3eSSadaf Ebrahimi }
188*ccdc9c3eSSadaf Ebrahimi 
CopyCapture(const char ** dst,const char ** src)189*ccdc9c3eSSadaf Ebrahimi void NFA::CopyCapture(const char** dst, const char** src) {
190*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < ncapture_; i+=2) {
191*ccdc9c3eSSadaf Ebrahimi     dst[i] = src[i];
192*ccdc9c3eSSadaf Ebrahimi     dst[i+1] = src[i+1];
193*ccdc9c3eSSadaf Ebrahimi   }
194*ccdc9c3eSSadaf Ebrahimi }
195*ccdc9c3eSSadaf Ebrahimi 
196*ccdc9c3eSSadaf Ebrahimi // Follows all empty arrows from id0 and enqueues all the states reached.
197*ccdc9c3eSSadaf Ebrahimi // Enqueues only the ByteRange instructions that match byte c.
198*ccdc9c3eSSadaf Ebrahimi // context is used (with p) for evaluating empty-width specials.
199*ccdc9c3eSSadaf Ebrahimi // p is the current input position, and t0 is the current thread.
AddToThreadq(Threadq * q,int id0,int c,const StringPiece & context,const char * p,Thread * t0)200*ccdc9c3eSSadaf Ebrahimi void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
201*ccdc9c3eSSadaf Ebrahimi                        const char* p, Thread* t0) {
202*ccdc9c3eSSadaf Ebrahimi   if (id0 == 0)
203*ccdc9c3eSSadaf Ebrahimi     return;
204*ccdc9c3eSSadaf Ebrahimi 
205*ccdc9c3eSSadaf Ebrahimi   // Use stack_ to hold our stack of instructions yet to process.
206*ccdc9c3eSSadaf Ebrahimi   // It was preallocated as follows:
207*ccdc9c3eSSadaf Ebrahimi   //   two entries per Capture;
208*ccdc9c3eSSadaf Ebrahimi   //   one entry per EmptyWidth; and
209*ccdc9c3eSSadaf Ebrahimi   //   one entry per Nop.
210*ccdc9c3eSSadaf Ebrahimi   // This reflects the maximum number of stack pushes that each can
211*ccdc9c3eSSadaf Ebrahimi   // perform. (Each instruction can be processed at most once.)
212*ccdc9c3eSSadaf Ebrahimi   AddState* stk = stack_.data();
213*ccdc9c3eSSadaf Ebrahimi   int nstk = 0;
214*ccdc9c3eSSadaf Ebrahimi 
215*ccdc9c3eSSadaf Ebrahimi   stk[nstk++] = {id0, NULL};
216*ccdc9c3eSSadaf Ebrahimi   while (nstk > 0) {
217*ccdc9c3eSSadaf Ebrahimi     DCHECK_LE(nstk, stack_.size());
218*ccdc9c3eSSadaf Ebrahimi     AddState a = stk[--nstk];
219*ccdc9c3eSSadaf Ebrahimi 
220*ccdc9c3eSSadaf Ebrahimi   Loop:
221*ccdc9c3eSSadaf Ebrahimi     if (a.t != NULL) {
222*ccdc9c3eSSadaf Ebrahimi       // t0 was a thread that we allocated and copied in order to
223*ccdc9c3eSSadaf Ebrahimi       // record the capture, so we must now decref it.
224*ccdc9c3eSSadaf Ebrahimi       Decref(t0);
225*ccdc9c3eSSadaf Ebrahimi       t0 = a.t;
226*ccdc9c3eSSadaf Ebrahimi     }
227*ccdc9c3eSSadaf Ebrahimi 
228*ccdc9c3eSSadaf Ebrahimi     int id = a.id;
229*ccdc9c3eSSadaf Ebrahimi     if (id == 0)
230*ccdc9c3eSSadaf Ebrahimi       continue;
231*ccdc9c3eSSadaf Ebrahimi     if (q->has_index(id)) {
232*ccdc9c3eSSadaf Ebrahimi       if (ExtraDebug)
233*ccdc9c3eSSadaf Ebrahimi         fprintf(stderr, "  [%d%s]\n", id, FormatCapture(t0->capture).c_str());
234*ccdc9c3eSSadaf Ebrahimi       continue;
235*ccdc9c3eSSadaf Ebrahimi     }
236*ccdc9c3eSSadaf Ebrahimi 
237*ccdc9c3eSSadaf Ebrahimi     // Create entry in q no matter what.  We might fill it in below,
238*ccdc9c3eSSadaf Ebrahimi     // or we might not.  Even if not, it is necessary to have it,
239*ccdc9c3eSSadaf Ebrahimi     // so that we don't revisit id0 during the recursion.
240*ccdc9c3eSSadaf Ebrahimi     q->set_new(id, NULL);
241*ccdc9c3eSSadaf Ebrahimi     Thread** tp = &q->get_existing(id);
242*ccdc9c3eSSadaf Ebrahimi     int j;
243*ccdc9c3eSSadaf Ebrahimi     Thread* t;
244*ccdc9c3eSSadaf Ebrahimi     Prog::Inst* ip = prog_->inst(id);
245*ccdc9c3eSSadaf Ebrahimi     switch (ip->opcode()) {
246*ccdc9c3eSSadaf Ebrahimi     default:
247*ccdc9c3eSSadaf Ebrahimi       LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
248*ccdc9c3eSSadaf Ebrahimi       break;
249*ccdc9c3eSSadaf Ebrahimi 
250*ccdc9c3eSSadaf Ebrahimi     case kInstFail:
251*ccdc9c3eSSadaf Ebrahimi       break;
252*ccdc9c3eSSadaf Ebrahimi 
253*ccdc9c3eSSadaf Ebrahimi     case kInstAltMatch:
254*ccdc9c3eSSadaf Ebrahimi       // Save state; will pick up at next byte.
255*ccdc9c3eSSadaf Ebrahimi       t = Incref(t0);
256*ccdc9c3eSSadaf Ebrahimi       *tp = t;
257*ccdc9c3eSSadaf Ebrahimi 
258*ccdc9c3eSSadaf Ebrahimi       DCHECK(!ip->last());
259*ccdc9c3eSSadaf Ebrahimi       a = {id+1, NULL};
260*ccdc9c3eSSadaf Ebrahimi       goto Loop;
261*ccdc9c3eSSadaf Ebrahimi 
262*ccdc9c3eSSadaf Ebrahimi     case kInstNop:
263*ccdc9c3eSSadaf Ebrahimi       if (!ip->last())
264*ccdc9c3eSSadaf Ebrahimi         stk[nstk++] = {id+1, NULL};
265*ccdc9c3eSSadaf Ebrahimi 
266*ccdc9c3eSSadaf Ebrahimi       // Continue on.
267*ccdc9c3eSSadaf Ebrahimi       a = {ip->out(), NULL};
268*ccdc9c3eSSadaf Ebrahimi       goto Loop;
269*ccdc9c3eSSadaf Ebrahimi 
270*ccdc9c3eSSadaf Ebrahimi     case kInstCapture:
271*ccdc9c3eSSadaf Ebrahimi       if (!ip->last())
272*ccdc9c3eSSadaf Ebrahimi         stk[nstk++] = {id+1, NULL};
273*ccdc9c3eSSadaf Ebrahimi 
274*ccdc9c3eSSadaf Ebrahimi       if ((j=ip->cap()) < ncapture_) {
275*ccdc9c3eSSadaf Ebrahimi         // Push a dummy whose only job is to restore t0
276*ccdc9c3eSSadaf Ebrahimi         // once we finish exploring this possibility.
277*ccdc9c3eSSadaf Ebrahimi         stk[nstk++] = {0, t0};
278*ccdc9c3eSSadaf Ebrahimi 
279*ccdc9c3eSSadaf Ebrahimi         // Record capture.
280*ccdc9c3eSSadaf Ebrahimi         t = AllocThread();
281*ccdc9c3eSSadaf Ebrahimi         CopyCapture(t->capture, t0->capture);
282*ccdc9c3eSSadaf Ebrahimi         t->capture[j] = p;
283*ccdc9c3eSSadaf Ebrahimi         t0 = t;
284*ccdc9c3eSSadaf Ebrahimi       }
285*ccdc9c3eSSadaf Ebrahimi       a = {ip->out(), NULL};
286*ccdc9c3eSSadaf Ebrahimi       goto Loop;
287*ccdc9c3eSSadaf Ebrahimi 
288*ccdc9c3eSSadaf Ebrahimi     case kInstByteRange:
289*ccdc9c3eSSadaf Ebrahimi       if (!ip->Matches(c))
290*ccdc9c3eSSadaf Ebrahimi         goto Next;
291*ccdc9c3eSSadaf Ebrahimi       FALLTHROUGH_INTENDED;
292*ccdc9c3eSSadaf Ebrahimi 
293*ccdc9c3eSSadaf Ebrahimi     case kInstMatch:
294*ccdc9c3eSSadaf Ebrahimi       // Save state; will pick up at next byte.
295*ccdc9c3eSSadaf Ebrahimi       t = Incref(t0);
296*ccdc9c3eSSadaf Ebrahimi       *tp = t;
297*ccdc9c3eSSadaf Ebrahimi       if (ExtraDebug)
298*ccdc9c3eSSadaf Ebrahimi         fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
299*ccdc9c3eSSadaf Ebrahimi 
300*ccdc9c3eSSadaf Ebrahimi     Next:
301*ccdc9c3eSSadaf Ebrahimi       if (ip->last())
302*ccdc9c3eSSadaf Ebrahimi         break;
303*ccdc9c3eSSadaf Ebrahimi       a = {id+1, NULL};
304*ccdc9c3eSSadaf Ebrahimi       goto Loop;
305*ccdc9c3eSSadaf Ebrahimi 
306*ccdc9c3eSSadaf Ebrahimi     case kInstEmptyWidth:
307*ccdc9c3eSSadaf Ebrahimi       if (!ip->last())
308*ccdc9c3eSSadaf Ebrahimi         stk[nstk++] = {id+1, NULL};
309*ccdc9c3eSSadaf Ebrahimi 
310*ccdc9c3eSSadaf Ebrahimi       // Continue on if we have all the right flag bits.
311*ccdc9c3eSSadaf Ebrahimi       if (ip->empty() & ~Prog::EmptyFlags(context, p))
312*ccdc9c3eSSadaf Ebrahimi         break;
313*ccdc9c3eSSadaf Ebrahimi       a = {ip->out(), NULL};
314*ccdc9c3eSSadaf Ebrahimi       goto Loop;
315*ccdc9c3eSSadaf Ebrahimi     }
316*ccdc9c3eSSadaf Ebrahimi   }
317*ccdc9c3eSSadaf Ebrahimi }
318*ccdc9c3eSSadaf Ebrahimi 
319*ccdc9c3eSSadaf Ebrahimi // Run runq on byte c, appending new states to nextq.
320*ccdc9c3eSSadaf Ebrahimi // Updates matched_ and match_ as new, better matches are found.
321*ccdc9c3eSSadaf Ebrahimi // context is used (with p) for evaluating empty-width specials.
322*ccdc9c3eSSadaf Ebrahimi // p is the position of byte c in the input string for AddToThreadq;
323*ccdc9c3eSSadaf Ebrahimi // p-1 will be used when processing Match instructions.
324*ccdc9c3eSSadaf Ebrahimi // Frees all the threads on runq.
325*ccdc9c3eSSadaf Ebrahimi // If there is a shortcut to the end, returns that shortcut.
Step(Threadq * runq,Threadq * nextq,int c,const StringPiece & context,const char * p)326*ccdc9c3eSSadaf Ebrahimi int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
327*ccdc9c3eSSadaf Ebrahimi               const char* p) {
328*ccdc9c3eSSadaf Ebrahimi   nextq->clear();
329*ccdc9c3eSSadaf Ebrahimi 
330*ccdc9c3eSSadaf Ebrahimi   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
331*ccdc9c3eSSadaf Ebrahimi     Thread* t = i->value();
332*ccdc9c3eSSadaf Ebrahimi     if (t == NULL)
333*ccdc9c3eSSadaf Ebrahimi       continue;
334*ccdc9c3eSSadaf Ebrahimi 
335*ccdc9c3eSSadaf Ebrahimi     if (longest_) {
336*ccdc9c3eSSadaf Ebrahimi       // Can skip any threads started after our current best match.
337*ccdc9c3eSSadaf Ebrahimi       if (matched_ && match_[0] < t->capture[0]) {
338*ccdc9c3eSSadaf Ebrahimi         Decref(t);
339*ccdc9c3eSSadaf Ebrahimi         continue;
340*ccdc9c3eSSadaf Ebrahimi       }
341*ccdc9c3eSSadaf Ebrahimi     }
342*ccdc9c3eSSadaf Ebrahimi 
343*ccdc9c3eSSadaf Ebrahimi     int id = i->index();
344*ccdc9c3eSSadaf Ebrahimi     Prog::Inst* ip = prog_->inst(id);
345*ccdc9c3eSSadaf Ebrahimi 
346*ccdc9c3eSSadaf Ebrahimi     switch (ip->opcode()) {
347*ccdc9c3eSSadaf Ebrahimi       default:
348*ccdc9c3eSSadaf Ebrahimi         // Should only see the values handled below.
349*ccdc9c3eSSadaf Ebrahimi         LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
350*ccdc9c3eSSadaf Ebrahimi         break;
351*ccdc9c3eSSadaf Ebrahimi 
352*ccdc9c3eSSadaf Ebrahimi       case kInstByteRange:
353*ccdc9c3eSSadaf Ebrahimi         AddToThreadq(nextq, ip->out(), c, context, p, t);
354*ccdc9c3eSSadaf Ebrahimi         break;
355*ccdc9c3eSSadaf Ebrahimi 
356*ccdc9c3eSSadaf Ebrahimi       case kInstAltMatch:
357*ccdc9c3eSSadaf Ebrahimi         if (i != runq->begin())
358*ccdc9c3eSSadaf Ebrahimi           break;
359*ccdc9c3eSSadaf Ebrahimi         // The match is ours if we want it.
360*ccdc9c3eSSadaf Ebrahimi         if (ip->greedy(prog_) || longest_) {
361*ccdc9c3eSSadaf Ebrahimi           CopyCapture(match_, t->capture);
362*ccdc9c3eSSadaf Ebrahimi           matched_ = true;
363*ccdc9c3eSSadaf Ebrahimi 
364*ccdc9c3eSSadaf Ebrahimi           Decref(t);
365*ccdc9c3eSSadaf Ebrahimi           for (++i; i != runq->end(); ++i)
366*ccdc9c3eSSadaf Ebrahimi             Decref(i->value());
367*ccdc9c3eSSadaf Ebrahimi           runq->clear();
368*ccdc9c3eSSadaf Ebrahimi           if (ip->greedy(prog_))
369*ccdc9c3eSSadaf Ebrahimi             return ip->out1();
370*ccdc9c3eSSadaf Ebrahimi           return ip->out();
371*ccdc9c3eSSadaf Ebrahimi         }
372*ccdc9c3eSSadaf Ebrahimi         break;
373*ccdc9c3eSSadaf Ebrahimi 
374*ccdc9c3eSSadaf Ebrahimi       case kInstMatch: {
375*ccdc9c3eSSadaf Ebrahimi         // Avoid invoking undefined behavior when p happens
376*ccdc9c3eSSadaf Ebrahimi         // to be null - and p-1 would be meaningless anyway.
377*ccdc9c3eSSadaf Ebrahimi         if (p == NULL)
378*ccdc9c3eSSadaf Ebrahimi           break;
379*ccdc9c3eSSadaf Ebrahimi 
380*ccdc9c3eSSadaf Ebrahimi         if (endmatch_ && p-1 != etext_)
381*ccdc9c3eSSadaf Ebrahimi           break;
382*ccdc9c3eSSadaf Ebrahimi 
383*ccdc9c3eSSadaf Ebrahimi         if (longest_) {
384*ccdc9c3eSSadaf Ebrahimi           // Leftmost-longest mode: save this match only if
385*ccdc9c3eSSadaf Ebrahimi           // it is either farther to the left or at the same
386*ccdc9c3eSSadaf Ebrahimi           // point but longer than an existing match.
387*ccdc9c3eSSadaf Ebrahimi           if (!matched_ || t->capture[0] < match_[0] ||
388*ccdc9c3eSSadaf Ebrahimi               (t->capture[0] == match_[0] && p-1 > match_[1])) {
389*ccdc9c3eSSadaf Ebrahimi             CopyCapture(match_, t->capture);
390*ccdc9c3eSSadaf Ebrahimi             match_[1] = p-1;
391*ccdc9c3eSSadaf Ebrahimi             matched_ = true;
392*ccdc9c3eSSadaf Ebrahimi           }
393*ccdc9c3eSSadaf Ebrahimi         } else {
394*ccdc9c3eSSadaf Ebrahimi           // Leftmost-biased mode: this match is by definition
395*ccdc9c3eSSadaf Ebrahimi           // better than what we've already found (see next line).
396*ccdc9c3eSSadaf Ebrahimi           CopyCapture(match_, t->capture);
397*ccdc9c3eSSadaf Ebrahimi           match_[1] = p-1;
398*ccdc9c3eSSadaf Ebrahimi           matched_ = true;
399*ccdc9c3eSSadaf Ebrahimi 
400*ccdc9c3eSSadaf Ebrahimi           // Cut off the threads that can only find matches
401*ccdc9c3eSSadaf Ebrahimi           // worse than the one we just found: don't run the
402*ccdc9c3eSSadaf Ebrahimi           // rest of the current Threadq.
403*ccdc9c3eSSadaf Ebrahimi           Decref(t);
404*ccdc9c3eSSadaf Ebrahimi           for (++i; i != runq->end(); ++i)
405*ccdc9c3eSSadaf Ebrahimi             Decref(i->value());
406*ccdc9c3eSSadaf Ebrahimi           runq->clear();
407*ccdc9c3eSSadaf Ebrahimi           return 0;
408*ccdc9c3eSSadaf Ebrahimi         }
409*ccdc9c3eSSadaf Ebrahimi         break;
410*ccdc9c3eSSadaf Ebrahimi       }
411*ccdc9c3eSSadaf Ebrahimi     }
412*ccdc9c3eSSadaf Ebrahimi     Decref(t);
413*ccdc9c3eSSadaf Ebrahimi   }
414*ccdc9c3eSSadaf Ebrahimi   runq->clear();
415*ccdc9c3eSSadaf Ebrahimi   return 0;
416*ccdc9c3eSSadaf Ebrahimi }
417*ccdc9c3eSSadaf Ebrahimi 
FormatCapture(const char ** capture)418*ccdc9c3eSSadaf Ebrahimi string NFA::FormatCapture(const char** capture) {
419*ccdc9c3eSSadaf Ebrahimi   string s;
420*ccdc9c3eSSadaf Ebrahimi 
421*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < ncapture_; i+=2) {
422*ccdc9c3eSSadaf Ebrahimi     if (capture[i] == NULL)
423*ccdc9c3eSSadaf Ebrahimi       StringAppendF(&s, "(?,?)");
424*ccdc9c3eSSadaf Ebrahimi     else if (capture[i+1] == NULL)
425*ccdc9c3eSSadaf Ebrahimi       StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
426*ccdc9c3eSSadaf Ebrahimi     else
427*ccdc9c3eSSadaf Ebrahimi       StringAppendF(&s, "(%d,%d)",
428*ccdc9c3eSSadaf Ebrahimi                     (int)(capture[i] - btext_),
429*ccdc9c3eSSadaf Ebrahimi                     (int)(capture[i+1] - btext_));
430*ccdc9c3eSSadaf Ebrahimi   }
431*ccdc9c3eSSadaf Ebrahimi   return s;
432*ccdc9c3eSSadaf Ebrahimi }
433*ccdc9c3eSSadaf Ebrahimi 
Search(const StringPiece & text,const StringPiece & const_context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)434*ccdc9c3eSSadaf Ebrahimi bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
435*ccdc9c3eSSadaf Ebrahimi             bool anchored, bool longest,
436*ccdc9c3eSSadaf Ebrahimi             StringPiece* submatch, int nsubmatch) {
437*ccdc9c3eSSadaf Ebrahimi   if (start_ == 0)
438*ccdc9c3eSSadaf Ebrahimi     return false;
439*ccdc9c3eSSadaf Ebrahimi 
440*ccdc9c3eSSadaf Ebrahimi   StringPiece context = const_context;
441*ccdc9c3eSSadaf Ebrahimi   if (context.begin() == NULL)
442*ccdc9c3eSSadaf Ebrahimi     context = text;
443*ccdc9c3eSSadaf Ebrahimi 
444*ccdc9c3eSSadaf Ebrahimi   // Sanity check: make sure that text lies within context.
445*ccdc9c3eSSadaf Ebrahimi   if (text.begin() < context.begin() || text.end() > context.end()) {
446*ccdc9c3eSSadaf Ebrahimi     LOG(DFATAL) << "context does not contain text";
447*ccdc9c3eSSadaf Ebrahimi     return false;
448*ccdc9c3eSSadaf Ebrahimi   }
449*ccdc9c3eSSadaf Ebrahimi 
450*ccdc9c3eSSadaf Ebrahimi   if (prog_->anchor_start() && context.begin() != text.begin())
451*ccdc9c3eSSadaf Ebrahimi     return false;
452*ccdc9c3eSSadaf Ebrahimi   if (prog_->anchor_end() && context.end() != text.end())
453*ccdc9c3eSSadaf Ebrahimi     return false;
454*ccdc9c3eSSadaf Ebrahimi   anchored |= prog_->anchor_start();
455*ccdc9c3eSSadaf Ebrahimi   if (prog_->anchor_end()) {
456*ccdc9c3eSSadaf Ebrahimi     longest = true;
457*ccdc9c3eSSadaf Ebrahimi     endmatch_ = true;
458*ccdc9c3eSSadaf Ebrahimi     etext_ = text.end();
459*ccdc9c3eSSadaf Ebrahimi   }
460*ccdc9c3eSSadaf Ebrahimi 
461*ccdc9c3eSSadaf Ebrahimi   if (nsubmatch < 0) {
462*ccdc9c3eSSadaf Ebrahimi     LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
463*ccdc9c3eSSadaf Ebrahimi     return false;
464*ccdc9c3eSSadaf Ebrahimi   }
465*ccdc9c3eSSadaf Ebrahimi 
466*ccdc9c3eSSadaf Ebrahimi   // Save search parameters.
467*ccdc9c3eSSadaf Ebrahimi   ncapture_ = 2*nsubmatch;
468*ccdc9c3eSSadaf Ebrahimi   longest_ = longest;
469*ccdc9c3eSSadaf Ebrahimi 
470*ccdc9c3eSSadaf Ebrahimi   if (nsubmatch == 0) {
471*ccdc9c3eSSadaf Ebrahimi     // We need to maintain match[0], both to distinguish the
472*ccdc9c3eSSadaf Ebrahimi     // longest match (if longest is true) and also to tell
473*ccdc9c3eSSadaf Ebrahimi     // whether we've seen any matches at all.
474*ccdc9c3eSSadaf Ebrahimi     ncapture_ = 2;
475*ccdc9c3eSSadaf Ebrahimi   }
476*ccdc9c3eSSadaf Ebrahimi 
477*ccdc9c3eSSadaf Ebrahimi   match_ = new const char*[ncapture_];
478*ccdc9c3eSSadaf Ebrahimi   matched_ = false;
479*ccdc9c3eSSadaf Ebrahimi 
480*ccdc9c3eSSadaf Ebrahimi   // For debugging prints.
481*ccdc9c3eSSadaf Ebrahimi   btext_ = context.begin();
482*ccdc9c3eSSadaf Ebrahimi 
483*ccdc9c3eSSadaf Ebrahimi   if (ExtraDebug)
484*ccdc9c3eSSadaf Ebrahimi     fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
485*ccdc9c3eSSadaf Ebrahimi             string(text).c_str(), string(context).c_str(), anchored, longest);
486*ccdc9c3eSSadaf Ebrahimi 
487*ccdc9c3eSSadaf Ebrahimi   // Set up search.
488*ccdc9c3eSSadaf Ebrahimi   Threadq* runq = &q0_;
489*ccdc9c3eSSadaf Ebrahimi   Threadq* nextq = &q1_;
490*ccdc9c3eSSadaf Ebrahimi   runq->clear();
491*ccdc9c3eSSadaf Ebrahimi   nextq->clear();
492*ccdc9c3eSSadaf Ebrahimi   memset(&match_[0], 0, ncapture_*sizeof match_[0]);
493*ccdc9c3eSSadaf Ebrahimi 
494*ccdc9c3eSSadaf Ebrahimi   // Loop over the text, stepping the machine.
495*ccdc9c3eSSadaf Ebrahimi   for (const char* p = text.begin();; p++) {
496*ccdc9c3eSSadaf Ebrahimi     if (ExtraDebug) {
497*ccdc9c3eSSadaf Ebrahimi       int c = 0;
498*ccdc9c3eSSadaf Ebrahimi       if (p == context.begin())
499*ccdc9c3eSSadaf Ebrahimi         c = '^';
500*ccdc9c3eSSadaf Ebrahimi       else if (p > text.end())
501*ccdc9c3eSSadaf Ebrahimi         c = '$';
502*ccdc9c3eSSadaf Ebrahimi       else if (p < text.end())
503*ccdc9c3eSSadaf Ebrahimi         c = p[0] & 0xFF;
504*ccdc9c3eSSadaf Ebrahimi 
505*ccdc9c3eSSadaf Ebrahimi       fprintf(stderr, "%c:", c);
506*ccdc9c3eSSadaf Ebrahimi       for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
507*ccdc9c3eSSadaf Ebrahimi         Thread* t = i->value();
508*ccdc9c3eSSadaf Ebrahimi         if (t == NULL)
509*ccdc9c3eSSadaf Ebrahimi           continue;
510*ccdc9c3eSSadaf Ebrahimi         fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
511*ccdc9c3eSSadaf Ebrahimi       }
512*ccdc9c3eSSadaf Ebrahimi       fprintf(stderr, "\n");
513*ccdc9c3eSSadaf Ebrahimi     }
514*ccdc9c3eSSadaf Ebrahimi 
515*ccdc9c3eSSadaf Ebrahimi     // This is a no-op the first time around the loop because runq is empty.
516*ccdc9c3eSSadaf Ebrahimi     int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p);
517*ccdc9c3eSSadaf Ebrahimi     DCHECK_EQ(runq->size(), 0);
518*ccdc9c3eSSadaf Ebrahimi     using std::swap;
519*ccdc9c3eSSadaf Ebrahimi     swap(nextq, runq);
520*ccdc9c3eSSadaf Ebrahimi     nextq->clear();
521*ccdc9c3eSSadaf Ebrahimi     if (id != 0) {
522*ccdc9c3eSSadaf Ebrahimi       // We're done: full match ahead.
523*ccdc9c3eSSadaf Ebrahimi       p = text.end();
524*ccdc9c3eSSadaf Ebrahimi       for (;;) {
525*ccdc9c3eSSadaf Ebrahimi         Prog::Inst* ip = prog_->inst(id);
526*ccdc9c3eSSadaf Ebrahimi         switch (ip->opcode()) {
527*ccdc9c3eSSadaf Ebrahimi           default:
528*ccdc9c3eSSadaf Ebrahimi             LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
529*ccdc9c3eSSadaf Ebrahimi             break;
530*ccdc9c3eSSadaf Ebrahimi 
531*ccdc9c3eSSadaf Ebrahimi           case kInstCapture:
532*ccdc9c3eSSadaf Ebrahimi             if (ip->cap() < ncapture_)
533*ccdc9c3eSSadaf Ebrahimi               match_[ip->cap()] = p;
534*ccdc9c3eSSadaf Ebrahimi             id = ip->out();
535*ccdc9c3eSSadaf Ebrahimi             continue;
536*ccdc9c3eSSadaf Ebrahimi 
537*ccdc9c3eSSadaf Ebrahimi           case kInstNop:
538*ccdc9c3eSSadaf Ebrahimi             id = ip->out();
539*ccdc9c3eSSadaf Ebrahimi             continue;
540*ccdc9c3eSSadaf Ebrahimi 
541*ccdc9c3eSSadaf Ebrahimi           case kInstMatch:
542*ccdc9c3eSSadaf Ebrahimi             match_[1] = p;
543*ccdc9c3eSSadaf Ebrahimi             matched_ = true;
544*ccdc9c3eSSadaf Ebrahimi             break;
545*ccdc9c3eSSadaf Ebrahimi         }
546*ccdc9c3eSSadaf Ebrahimi         break;
547*ccdc9c3eSSadaf Ebrahimi       }
548*ccdc9c3eSSadaf Ebrahimi       break;
549*ccdc9c3eSSadaf Ebrahimi     }
550*ccdc9c3eSSadaf Ebrahimi 
551*ccdc9c3eSSadaf Ebrahimi     if (p > text.end())
552*ccdc9c3eSSadaf Ebrahimi       break;
553*ccdc9c3eSSadaf Ebrahimi 
554*ccdc9c3eSSadaf Ebrahimi     // Start a new thread if there have not been any matches.
555*ccdc9c3eSSadaf Ebrahimi     // (No point in starting a new thread if there have been
556*ccdc9c3eSSadaf Ebrahimi     // matches, since it would be to the right of the match
557*ccdc9c3eSSadaf Ebrahimi     // we already found.)
558*ccdc9c3eSSadaf Ebrahimi     if (!matched_ && (!anchored || p == text.begin())) {
559*ccdc9c3eSSadaf Ebrahimi       // If there's a required first byte for an unanchored search
560*ccdc9c3eSSadaf Ebrahimi       // and we're not in the middle of any possible matches,
561*ccdc9c3eSSadaf Ebrahimi       // use memchr to search for the byte quickly.
562*ccdc9c3eSSadaf Ebrahimi       int fb = prog_->first_byte();
563*ccdc9c3eSSadaf Ebrahimi       if (!anchored && runq->size() == 0 &&
564*ccdc9c3eSSadaf Ebrahimi           fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
565*ccdc9c3eSSadaf Ebrahimi         p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
566*ccdc9c3eSSadaf Ebrahimi         if (p == NULL) {
567*ccdc9c3eSSadaf Ebrahimi           p = text.end();
568*ccdc9c3eSSadaf Ebrahimi         }
569*ccdc9c3eSSadaf Ebrahimi       }
570*ccdc9c3eSSadaf Ebrahimi 
571*ccdc9c3eSSadaf Ebrahimi       Thread* t = AllocThread();
572*ccdc9c3eSSadaf Ebrahimi       CopyCapture(t->capture, match_);
573*ccdc9c3eSSadaf Ebrahimi       t->capture[0] = p;
574*ccdc9c3eSSadaf Ebrahimi       AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p,
575*ccdc9c3eSSadaf Ebrahimi                    t);
576*ccdc9c3eSSadaf Ebrahimi       Decref(t);
577*ccdc9c3eSSadaf Ebrahimi     }
578*ccdc9c3eSSadaf Ebrahimi 
579*ccdc9c3eSSadaf Ebrahimi     // If all the threads have died, stop early.
580*ccdc9c3eSSadaf Ebrahimi     if (runq->size() == 0) {
581*ccdc9c3eSSadaf Ebrahimi       if (ExtraDebug)
582*ccdc9c3eSSadaf Ebrahimi         fprintf(stderr, "dead\n");
583*ccdc9c3eSSadaf Ebrahimi       break;
584*ccdc9c3eSSadaf Ebrahimi     }
585*ccdc9c3eSSadaf Ebrahimi   }
586*ccdc9c3eSSadaf Ebrahimi 
587*ccdc9c3eSSadaf Ebrahimi   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
588*ccdc9c3eSSadaf Ebrahimi     Decref(i->value());
589*ccdc9c3eSSadaf Ebrahimi 
590*ccdc9c3eSSadaf Ebrahimi   if (matched_) {
591*ccdc9c3eSSadaf Ebrahimi     for (int i = 0; i < nsubmatch; i++)
592*ccdc9c3eSSadaf Ebrahimi       submatch[i] =
593*ccdc9c3eSSadaf Ebrahimi           StringPiece(match_[2 * i],
594*ccdc9c3eSSadaf Ebrahimi                       static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
595*ccdc9c3eSSadaf Ebrahimi     if (ExtraDebug)
596*ccdc9c3eSSadaf Ebrahimi       fprintf(stderr, "match (%td,%td)\n",
597*ccdc9c3eSSadaf Ebrahimi               match_[0] - btext_, match_[1] - btext_);
598*ccdc9c3eSSadaf Ebrahimi     return true;
599*ccdc9c3eSSadaf Ebrahimi   }
600*ccdc9c3eSSadaf Ebrahimi   return false;
601*ccdc9c3eSSadaf Ebrahimi }
602*ccdc9c3eSSadaf Ebrahimi 
603*ccdc9c3eSSadaf Ebrahimi // Computes whether all successful matches have a common first byte,
604*ccdc9c3eSSadaf Ebrahimi // and if so, returns that byte.  If not, returns -1.
ComputeFirstByte()605*ccdc9c3eSSadaf Ebrahimi int Prog::ComputeFirstByte() {
606*ccdc9c3eSSadaf Ebrahimi   int b = -1;
607*ccdc9c3eSSadaf Ebrahimi   SparseSet q(size());
608*ccdc9c3eSSadaf Ebrahimi   q.insert(start());
609*ccdc9c3eSSadaf Ebrahimi   for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
610*ccdc9c3eSSadaf Ebrahimi     int id = *it;
611*ccdc9c3eSSadaf Ebrahimi     Prog::Inst* ip = inst(id);
612*ccdc9c3eSSadaf Ebrahimi     switch (ip->opcode()) {
613*ccdc9c3eSSadaf Ebrahimi       default:
614*ccdc9c3eSSadaf Ebrahimi         LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
615*ccdc9c3eSSadaf Ebrahimi         break;
616*ccdc9c3eSSadaf Ebrahimi 
617*ccdc9c3eSSadaf Ebrahimi       case kInstMatch:
618*ccdc9c3eSSadaf Ebrahimi         // The empty string matches: no first byte.
619*ccdc9c3eSSadaf Ebrahimi         return -1;
620*ccdc9c3eSSadaf Ebrahimi 
621*ccdc9c3eSSadaf Ebrahimi       case kInstByteRange:
622*ccdc9c3eSSadaf Ebrahimi         if (!ip->last())
623*ccdc9c3eSSadaf Ebrahimi           q.insert(id+1);
624*ccdc9c3eSSadaf Ebrahimi 
625*ccdc9c3eSSadaf Ebrahimi         // Must match only a single byte
626*ccdc9c3eSSadaf Ebrahimi         if (ip->lo() != ip->hi())
627*ccdc9c3eSSadaf Ebrahimi           return -1;
628*ccdc9c3eSSadaf Ebrahimi         if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
629*ccdc9c3eSSadaf Ebrahimi           return -1;
630*ccdc9c3eSSadaf Ebrahimi         // If we haven't seen any bytes yet, record it;
631*ccdc9c3eSSadaf Ebrahimi         // otherwise must match the one we saw before.
632*ccdc9c3eSSadaf Ebrahimi         if (b == -1)
633*ccdc9c3eSSadaf Ebrahimi           b = ip->lo();
634*ccdc9c3eSSadaf Ebrahimi         else if (b != ip->lo())
635*ccdc9c3eSSadaf Ebrahimi           return -1;
636*ccdc9c3eSSadaf Ebrahimi         break;
637*ccdc9c3eSSadaf Ebrahimi 
638*ccdc9c3eSSadaf Ebrahimi       case kInstNop:
639*ccdc9c3eSSadaf Ebrahimi       case kInstCapture:
640*ccdc9c3eSSadaf Ebrahimi       case kInstEmptyWidth:
641*ccdc9c3eSSadaf Ebrahimi         if (!ip->last())
642*ccdc9c3eSSadaf Ebrahimi           q.insert(id+1);
643*ccdc9c3eSSadaf Ebrahimi 
644*ccdc9c3eSSadaf Ebrahimi         // Continue on.
645*ccdc9c3eSSadaf Ebrahimi         // Ignore ip->empty() flags for kInstEmptyWidth
646*ccdc9c3eSSadaf Ebrahimi         // in order to be as conservative as possible
647*ccdc9c3eSSadaf Ebrahimi         // (assume all possible empty-width flags are true).
648*ccdc9c3eSSadaf Ebrahimi         if (ip->out())
649*ccdc9c3eSSadaf Ebrahimi           q.insert(ip->out());
650*ccdc9c3eSSadaf Ebrahimi         break;
651*ccdc9c3eSSadaf Ebrahimi 
652*ccdc9c3eSSadaf Ebrahimi       case kInstAltMatch:
653*ccdc9c3eSSadaf Ebrahimi         DCHECK(!ip->last());
654*ccdc9c3eSSadaf Ebrahimi         q.insert(id+1);
655*ccdc9c3eSSadaf Ebrahimi         break;
656*ccdc9c3eSSadaf Ebrahimi 
657*ccdc9c3eSSadaf Ebrahimi       case kInstFail:
658*ccdc9c3eSSadaf Ebrahimi         break;
659*ccdc9c3eSSadaf Ebrahimi     }
660*ccdc9c3eSSadaf Ebrahimi   }
661*ccdc9c3eSSadaf Ebrahimi   return b;
662*ccdc9c3eSSadaf Ebrahimi }
663*ccdc9c3eSSadaf Ebrahimi 
664*ccdc9c3eSSadaf Ebrahimi bool
SearchNFA(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)665*ccdc9c3eSSadaf Ebrahimi Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
666*ccdc9c3eSSadaf Ebrahimi                 Anchor anchor, MatchKind kind,
667*ccdc9c3eSSadaf Ebrahimi                 StringPiece* match, int nmatch) {
668*ccdc9c3eSSadaf Ebrahimi   if (ExtraDebug)
669*ccdc9c3eSSadaf Ebrahimi     Dump();
670*ccdc9c3eSSadaf Ebrahimi 
671*ccdc9c3eSSadaf Ebrahimi   NFA nfa(this);
672*ccdc9c3eSSadaf Ebrahimi   StringPiece sp;
673*ccdc9c3eSSadaf Ebrahimi   if (kind == kFullMatch) {
674*ccdc9c3eSSadaf Ebrahimi     anchor = kAnchored;
675*ccdc9c3eSSadaf Ebrahimi     if (nmatch == 0) {
676*ccdc9c3eSSadaf Ebrahimi       match = &sp;
677*ccdc9c3eSSadaf Ebrahimi       nmatch = 1;
678*ccdc9c3eSSadaf Ebrahimi     }
679*ccdc9c3eSSadaf Ebrahimi   }
680*ccdc9c3eSSadaf Ebrahimi   if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
681*ccdc9c3eSSadaf Ebrahimi     return false;
682*ccdc9c3eSSadaf Ebrahimi   if (kind == kFullMatch && match[0].end() != text.end())
683*ccdc9c3eSSadaf Ebrahimi     return false;
684*ccdc9c3eSSadaf Ebrahimi   return true;
685*ccdc9c3eSSadaf Ebrahimi }
686*ccdc9c3eSSadaf Ebrahimi 
687*ccdc9c3eSSadaf Ebrahimi // For each instruction i in the program reachable from the start, compute the
688*ccdc9c3eSSadaf Ebrahimi // number of instructions reachable from i by following only empty transitions
689*ccdc9c3eSSadaf Ebrahimi // and record that count as fanout[i].
690*ccdc9c3eSSadaf Ebrahimi //
691*ccdc9c3eSSadaf Ebrahimi // fanout holds the results and is also the work queue for the outer iteration.
692*ccdc9c3eSSadaf Ebrahimi // reachable holds the reached nodes for the inner iteration.
Fanout(SparseArray<int> * fanout)693*ccdc9c3eSSadaf Ebrahimi void Prog::Fanout(SparseArray<int>* fanout) {
694*ccdc9c3eSSadaf Ebrahimi   DCHECK_EQ(fanout->max_size(), size());
695*ccdc9c3eSSadaf Ebrahimi   SparseSet reachable(size());
696*ccdc9c3eSSadaf Ebrahimi   fanout->clear();
697*ccdc9c3eSSadaf Ebrahimi   fanout->set_new(start(), 0);
698*ccdc9c3eSSadaf Ebrahimi   for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
699*ccdc9c3eSSadaf Ebrahimi     int* count = &i->value();
700*ccdc9c3eSSadaf Ebrahimi     reachable.clear();
701*ccdc9c3eSSadaf Ebrahimi     reachable.insert(i->index());
702*ccdc9c3eSSadaf Ebrahimi     for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
703*ccdc9c3eSSadaf Ebrahimi       int id = *j;
704*ccdc9c3eSSadaf Ebrahimi       Prog::Inst* ip = inst(id);
705*ccdc9c3eSSadaf Ebrahimi       switch (ip->opcode()) {
706*ccdc9c3eSSadaf Ebrahimi         default:
707*ccdc9c3eSSadaf Ebrahimi           LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
708*ccdc9c3eSSadaf Ebrahimi           break;
709*ccdc9c3eSSadaf Ebrahimi 
710*ccdc9c3eSSadaf Ebrahimi         case kInstByteRange:
711*ccdc9c3eSSadaf Ebrahimi           if (!ip->last())
712*ccdc9c3eSSadaf Ebrahimi             reachable.insert(id+1);
713*ccdc9c3eSSadaf Ebrahimi 
714*ccdc9c3eSSadaf Ebrahimi           (*count)++;
715*ccdc9c3eSSadaf Ebrahimi           if (!fanout->has_index(ip->out())) {
716*ccdc9c3eSSadaf Ebrahimi             fanout->set_new(ip->out(), 0);
717*ccdc9c3eSSadaf Ebrahimi           }
718*ccdc9c3eSSadaf Ebrahimi           break;
719*ccdc9c3eSSadaf Ebrahimi 
720*ccdc9c3eSSadaf Ebrahimi         case kInstAltMatch:
721*ccdc9c3eSSadaf Ebrahimi           DCHECK(!ip->last());
722*ccdc9c3eSSadaf Ebrahimi           reachable.insert(id+1);
723*ccdc9c3eSSadaf Ebrahimi           break;
724*ccdc9c3eSSadaf Ebrahimi 
725*ccdc9c3eSSadaf Ebrahimi         case kInstCapture:
726*ccdc9c3eSSadaf Ebrahimi         case kInstEmptyWidth:
727*ccdc9c3eSSadaf Ebrahimi         case kInstNop:
728*ccdc9c3eSSadaf Ebrahimi           if (!ip->last())
729*ccdc9c3eSSadaf Ebrahimi             reachable.insert(id+1);
730*ccdc9c3eSSadaf Ebrahimi 
731*ccdc9c3eSSadaf Ebrahimi           reachable.insert(ip->out());
732*ccdc9c3eSSadaf Ebrahimi           break;
733*ccdc9c3eSSadaf Ebrahimi 
734*ccdc9c3eSSadaf Ebrahimi         case kInstMatch:
735*ccdc9c3eSSadaf Ebrahimi           if (!ip->last())
736*ccdc9c3eSSadaf Ebrahimi             reachable.insert(id+1);
737*ccdc9c3eSSadaf Ebrahimi           break;
738*ccdc9c3eSSadaf Ebrahimi 
739*ccdc9c3eSSadaf Ebrahimi         case kInstFail:
740*ccdc9c3eSSadaf Ebrahimi           break;
741*ccdc9c3eSSadaf Ebrahimi       }
742*ccdc9c3eSSadaf Ebrahimi     }
743*ccdc9c3eSSadaf Ebrahimi   }
744*ccdc9c3eSSadaf Ebrahimi }
745*ccdc9c3eSSadaf Ebrahimi 
746*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
747