1*ccdc9c3eSSadaf Ebrahimi // Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi
5*ccdc9c3eSSadaf Ebrahimi // Tested by search_test.cc.
6*ccdc9c3eSSadaf Ebrahimi //
7*ccdc9c3eSSadaf Ebrahimi // Prog::SearchNFA, an NFA search.
8*ccdc9c3eSSadaf Ebrahimi // This is an actual NFA like the theorists talk about,
9*ccdc9c3eSSadaf Ebrahimi // not the pseudo-NFA found in backtracking regexp implementations.
10*ccdc9c3eSSadaf Ebrahimi //
11*ccdc9c3eSSadaf Ebrahimi // IMPLEMENTATION
12*ccdc9c3eSSadaf Ebrahimi //
13*ccdc9c3eSSadaf Ebrahimi // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14*ccdc9c3eSSadaf Ebrahimi // which is a variant of the one described in Thompson's 1968 CACM paper.
15*ccdc9c3eSSadaf Ebrahimi // See http://swtch.com/~rsc/regexp/ for various history. The main feature
16*ccdc9c3eSSadaf Ebrahimi // over the DFA implementation is that it tracks submatch boundaries.
17*ccdc9c3eSSadaf Ebrahimi //
18*ccdc9c3eSSadaf Ebrahimi // When the choice of submatch boundaries is ambiguous, this particular
19*ccdc9c3eSSadaf Ebrahimi // implementation makes the same choices that traditional backtracking
20*ccdc9c3eSSadaf Ebrahimi // implementations (in particular, Perl and PCRE) do.
21*ccdc9c3eSSadaf Ebrahimi // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22*ccdc9c3eSSadaf Ebrahimi // time in the length of the input.
23*ccdc9c3eSSadaf Ebrahimi //
24*ccdc9c3eSSadaf Ebrahimi // Like Thompson's original machine and like the DFA implementation, this
25*ccdc9c3eSSadaf Ebrahimi // implementation notices a match only once it is one byte past it.
26*ccdc9c3eSSadaf Ebrahimi
27*ccdc9c3eSSadaf Ebrahimi #include <stdio.h>
28*ccdc9c3eSSadaf Ebrahimi #include <string.h>
29*ccdc9c3eSSadaf Ebrahimi #include <algorithm>
30*ccdc9c3eSSadaf Ebrahimi #include <string>
31*ccdc9c3eSSadaf Ebrahimi #include <utility>
32*ccdc9c3eSSadaf Ebrahimi #include <vector>
33*ccdc9c3eSSadaf Ebrahimi
34*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
35*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
36*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
37*ccdc9c3eSSadaf Ebrahimi #include "util/pod_array.h"
38*ccdc9c3eSSadaf Ebrahimi #include "util/sparse_array.h"
39*ccdc9c3eSSadaf Ebrahimi #include "util/sparse_set.h"
40*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
41*ccdc9c3eSSadaf Ebrahimi
42*ccdc9c3eSSadaf Ebrahimi namespace re2 {
43*ccdc9c3eSSadaf Ebrahimi
44*ccdc9c3eSSadaf Ebrahimi static const bool ExtraDebug = false;
45*ccdc9c3eSSadaf Ebrahimi
46*ccdc9c3eSSadaf Ebrahimi class NFA {
47*ccdc9c3eSSadaf Ebrahimi public:
48*ccdc9c3eSSadaf Ebrahimi NFA(Prog* prog);
49*ccdc9c3eSSadaf Ebrahimi ~NFA();
50*ccdc9c3eSSadaf Ebrahimi
51*ccdc9c3eSSadaf Ebrahimi // Searches for a matching string.
52*ccdc9c3eSSadaf Ebrahimi // * If anchored is true, only considers matches starting at offset.
53*ccdc9c3eSSadaf Ebrahimi // Otherwise finds lefmost match at or after offset.
54*ccdc9c3eSSadaf Ebrahimi // * If longest is true, returns the longest match starting
55*ccdc9c3eSSadaf Ebrahimi // at the chosen start point. Otherwise returns the so-called
56*ccdc9c3eSSadaf Ebrahimi // left-biased match, the one traditional backtracking engines
57*ccdc9c3eSSadaf Ebrahimi // (like Perl and PCRE) find.
58*ccdc9c3eSSadaf Ebrahimi // Records submatch boundaries in submatch[1..nsubmatch-1].
59*ccdc9c3eSSadaf Ebrahimi // Submatch[0] is the entire match. When there is a choice in
60*ccdc9c3eSSadaf Ebrahimi // which text matches each subexpression, the submatch boundaries
61*ccdc9c3eSSadaf Ebrahimi // are chosen to match what a backtracking implementation would choose.
62*ccdc9c3eSSadaf Ebrahimi bool Search(const StringPiece& text, const StringPiece& context,
63*ccdc9c3eSSadaf Ebrahimi bool anchored, bool longest,
64*ccdc9c3eSSadaf Ebrahimi StringPiece* submatch, int nsubmatch);
65*ccdc9c3eSSadaf Ebrahimi
66*ccdc9c3eSSadaf Ebrahimi private:
67*ccdc9c3eSSadaf Ebrahimi struct Thread {
68*ccdc9c3eSSadaf Ebrahimi union {
69*ccdc9c3eSSadaf Ebrahimi int ref;
70*ccdc9c3eSSadaf Ebrahimi Thread* next; // when on free list
71*ccdc9c3eSSadaf Ebrahimi };
72*ccdc9c3eSSadaf Ebrahimi const char** capture;
73*ccdc9c3eSSadaf Ebrahimi };
74*ccdc9c3eSSadaf Ebrahimi
75*ccdc9c3eSSadaf Ebrahimi // State for explicit stack in AddToThreadq.
76*ccdc9c3eSSadaf Ebrahimi struct AddState {
77*ccdc9c3eSSadaf Ebrahimi int id; // Inst to process
78*ccdc9c3eSSadaf Ebrahimi Thread* t; // if not null, set t0 = t before processing id
79*ccdc9c3eSSadaf Ebrahimi };
80*ccdc9c3eSSadaf Ebrahimi
81*ccdc9c3eSSadaf Ebrahimi // Threadq is a list of threads. The list is sorted by the order
82*ccdc9c3eSSadaf Ebrahimi // in which Perl would explore that particular state -- the earlier
83*ccdc9c3eSSadaf Ebrahimi // choices appear earlier in the list.
84*ccdc9c3eSSadaf Ebrahimi typedef SparseArray<Thread*> Threadq;
85*ccdc9c3eSSadaf Ebrahimi
86*ccdc9c3eSSadaf Ebrahimi inline Thread* AllocThread();
87*ccdc9c3eSSadaf Ebrahimi inline Thread* Incref(Thread* t);
88*ccdc9c3eSSadaf Ebrahimi inline void Decref(Thread* t);
89*ccdc9c3eSSadaf Ebrahimi
90*ccdc9c3eSSadaf Ebrahimi // Follows all empty arrows from id0 and enqueues all the states reached.
91*ccdc9c3eSSadaf Ebrahimi // Enqueues only the ByteRange instructions that match byte c.
92*ccdc9c3eSSadaf Ebrahimi // context is used (with p) for evaluating empty-width specials.
93*ccdc9c3eSSadaf Ebrahimi // p is the current input position, and t0 is the current thread.
94*ccdc9c3eSSadaf Ebrahimi void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
95*ccdc9c3eSSadaf Ebrahimi const char* p, Thread* t0);
96*ccdc9c3eSSadaf Ebrahimi
97*ccdc9c3eSSadaf Ebrahimi // Run runq on byte c, appending new states to nextq.
98*ccdc9c3eSSadaf Ebrahimi // Updates matched_ and match_ as new, better matches are found.
99*ccdc9c3eSSadaf Ebrahimi // context is used (with p) for evaluating empty-width specials.
100*ccdc9c3eSSadaf Ebrahimi // p is the position of byte c in the input string for AddToThreadq;
101*ccdc9c3eSSadaf Ebrahimi // p-1 will be used when processing Match instructions.
102*ccdc9c3eSSadaf Ebrahimi // Frees all the threads on runq.
103*ccdc9c3eSSadaf Ebrahimi // If there is a shortcut to the end, returns that shortcut.
104*ccdc9c3eSSadaf Ebrahimi int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
105*ccdc9c3eSSadaf Ebrahimi const char* p);
106*ccdc9c3eSSadaf Ebrahimi
107*ccdc9c3eSSadaf Ebrahimi // Returns text version of capture information, for debugging.
108*ccdc9c3eSSadaf Ebrahimi string FormatCapture(const char** capture);
109*ccdc9c3eSSadaf Ebrahimi
110*ccdc9c3eSSadaf Ebrahimi inline void CopyCapture(const char** dst, const char** src);
111*ccdc9c3eSSadaf Ebrahimi
112*ccdc9c3eSSadaf Ebrahimi Prog* prog_; // underlying program
113*ccdc9c3eSSadaf Ebrahimi int start_; // start instruction in program
114*ccdc9c3eSSadaf Ebrahimi int ncapture_; // number of submatches to track
115*ccdc9c3eSSadaf Ebrahimi bool longest_; // whether searching for longest match
116*ccdc9c3eSSadaf Ebrahimi bool endmatch_; // whether match must end at text.end()
117*ccdc9c3eSSadaf Ebrahimi const char* btext_; // beginning of text being matched (for FormatSubmatch)
118*ccdc9c3eSSadaf Ebrahimi const char* etext_; // end of text being matched (for endmatch_)
119*ccdc9c3eSSadaf Ebrahimi Threadq q0_, q1_; // pre-allocated for Search.
120*ccdc9c3eSSadaf Ebrahimi PODArray<AddState> stack_; // pre-allocated for AddToThreadq
121*ccdc9c3eSSadaf Ebrahimi Thread* free_threads_; // free list
122*ccdc9c3eSSadaf Ebrahimi const char** match_; // best match so far
123*ccdc9c3eSSadaf Ebrahimi bool matched_; // any match so far?
124*ccdc9c3eSSadaf Ebrahimi
125*ccdc9c3eSSadaf Ebrahimi NFA(const NFA&) = delete;
126*ccdc9c3eSSadaf Ebrahimi NFA& operator=(const NFA&) = delete;
127*ccdc9c3eSSadaf Ebrahimi };
128*ccdc9c3eSSadaf Ebrahimi
NFA(Prog * prog)129*ccdc9c3eSSadaf Ebrahimi NFA::NFA(Prog* prog) {
130*ccdc9c3eSSadaf Ebrahimi prog_ = prog;
131*ccdc9c3eSSadaf Ebrahimi start_ = prog_->start();
132*ccdc9c3eSSadaf Ebrahimi ncapture_ = 0;
133*ccdc9c3eSSadaf Ebrahimi longest_ = false;
134*ccdc9c3eSSadaf Ebrahimi endmatch_ = false;
135*ccdc9c3eSSadaf Ebrahimi btext_ = NULL;
136*ccdc9c3eSSadaf Ebrahimi etext_ = NULL;
137*ccdc9c3eSSadaf Ebrahimi q0_.resize(prog_->size());
138*ccdc9c3eSSadaf Ebrahimi q1_.resize(prog_->size());
139*ccdc9c3eSSadaf Ebrahimi // See NFA::AddToThreadq() for why this is so.
140*ccdc9c3eSSadaf Ebrahimi int nstack = 2*prog_->inst_count(kInstCapture) +
141*ccdc9c3eSSadaf Ebrahimi prog_->inst_count(kInstEmptyWidth) +
142*ccdc9c3eSSadaf Ebrahimi prog_->inst_count(kInstNop) + 1; // + 1 for start inst
143*ccdc9c3eSSadaf Ebrahimi stack_ = PODArray<AddState>(nstack);
144*ccdc9c3eSSadaf Ebrahimi free_threads_ = NULL;
145*ccdc9c3eSSadaf Ebrahimi match_ = NULL;
146*ccdc9c3eSSadaf Ebrahimi matched_ = false;
147*ccdc9c3eSSadaf Ebrahimi }
148*ccdc9c3eSSadaf Ebrahimi
~NFA()149*ccdc9c3eSSadaf Ebrahimi NFA::~NFA() {
150*ccdc9c3eSSadaf Ebrahimi delete[] match_;
151*ccdc9c3eSSadaf Ebrahimi Thread* next;
152*ccdc9c3eSSadaf Ebrahimi for (Thread* t = free_threads_; t; t = next) {
153*ccdc9c3eSSadaf Ebrahimi next = t->next;
154*ccdc9c3eSSadaf Ebrahimi delete[] t->capture;
155*ccdc9c3eSSadaf Ebrahimi delete t;
156*ccdc9c3eSSadaf Ebrahimi }
157*ccdc9c3eSSadaf Ebrahimi }
158*ccdc9c3eSSadaf Ebrahimi
AllocThread()159*ccdc9c3eSSadaf Ebrahimi NFA::Thread* NFA::AllocThread() {
160*ccdc9c3eSSadaf Ebrahimi Thread* t = free_threads_;
161*ccdc9c3eSSadaf Ebrahimi if (t == NULL) {
162*ccdc9c3eSSadaf Ebrahimi t = new Thread;
163*ccdc9c3eSSadaf Ebrahimi t->ref = 1;
164*ccdc9c3eSSadaf Ebrahimi t->capture = new const char*[ncapture_];
165*ccdc9c3eSSadaf Ebrahimi return t;
166*ccdc9c3eSSadaf Ebrahimi }
167*ccdc9c3eSSadaf Ebrahimi free_threads_ = t->next;
168*ccdc9c3eSSadaf Ebrahimi t->ref = 1;
169*ccdc9c3eSSadaf Ebrahimi return t;
170*ccdc9c3eSSadaf Ebrahimi }
171*ccdc9c3eSSadaf Ebrahimi
Incref(Thread * t)172*ccdc9c3eSSadaf Ebrahimi NFA::Thread* NFA::Incref(Thread* t) {
173*ccdc9c3eSSadaf Ebrahimi DCHECK(t != NULL);
174*ccdc9c3eSSadaf Ebrahimi t->ref++;
175*ccdc9c3eSSadaf Ebrahimi return t;
176*ccdc9c3eSSadaf Ebrahimi }
177*ccdc9c3eSSadaf Ebrahimi
Decref(Thread * t)178*ccdc9c3eSSadaf Ebrahimi void NFA::Decref(Thread* t) {
179*ccdc9c3eSSadaf Ebrahimi if (t == NULL)
180*ccdc9c3eSSadaf Ebrahimi return;
181*ccdc9c3eSSadaf Ebrahimi t->ref--;
182*ccdc9c3eSSadaf Ebrahimi if (t->ref > 0)
183*ccdc9c3eSSadaf Ebrahimi return;
184*ccdc9c3eSSadaf Ebrahimi DCHECK_EQ(t->ref, 0);
185*ccdc9c3eSSadaf Ebrahimi t->next = free_threads_;
186*ccdc9c3eSSadaf Ebrahimi free_threads_ = t;
187*ccdc9c3eSSadaf Ebrahimi }
188*ccdc9c3eSSadaf Ebrahimi
CopyCapture(const char ** dst,const char ** src)189*ccdc9c3eSSadaf Ebrahimi void NFA::CopyCapture(const char** dst, const char** src) {
190*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < ncapture_; i+=2) {
191*ccdc9c3eSSadaf Ebrahimi dst[i] = src[i];
192*ccdc9c3eSSadaf Ebrahimi dst[i+1] = src[i+1];
193*ccdc9c3eSSadaf Ebrahimi }
194*ccdc9c3eSSadaf Ebrahimi }
195*ccdc9c3eSSadaf Ebrahimi
196*ccdc9c3eSSadaf Ebrahimi // Follows all empty arrows from id0 and enqueues all the states reached.
197*ccdc9c3eSSadaf Ebrahimi // Enqueues only the ByteRange instructions that match byte c.
198*ccdc9c3eSSadaf Ebrahimi // context is used (with p) for evaluating empty-width specials.
199*ccdc9c3eSSadaf Ebrahimi // p is the current input position, and t0 is the current thread.
AddToThreadq(Threadq * q,int id0,int c,const StringPiece & context,const char * p,Thread * t0)200*ccdc9c3eSSadaf Ebrahimi void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
201*ccdc9c3eSSadaf Ebrahimi const char* p, Thread* t0) {
202*ccdc9c3eSSadaf Ebrahimi if (id0 == 0)
203*ccdc9c3eSSadaf Ebrahimi return;
204*ccdc9c3eSSadaf Ebrahimi
205*ccdc9c3eSSadaf Ebrahimi // Use stack_ to hold our stack of instructions yet to process.
206*ccdc9c3eSSadaf Ebrahimi // It was preallocated as follows:
207*ccdc9c3eSSadaf Ebrahimi // two entries per Capture;
208*ccdc9c3eSSadaf Ebrahimi // one entry per EmptyWidth; and
209*ccdc9c3eSSadaf Ebrahimi // one entry per Nop.
210*ccdc9c3eSSadaf Ebrahimi // This reflects the maximum number of stack pushes that each can
211*ccdc9c3eSSadaf Ebrahimi // perform. (Each instruction can be processed at most once.)
212*ccdc9c3eSSadaf Ebrahimi AddState* stk = stack_.data();
213*ccdc9c3eSSadaf Ebrahimi int nstk = 0;
214*ccdc9c3eSSadaf Ebrahimi
215*ccdc9c3eSSadaf Ebrahimi stk[nstk++] = {id0, NULL};
216*ccdc9c3eSSadaf Ebrahimi while (nstk > 0) {
217*ccdc9c3eSSadaf Ebrahimi DCHECK_LE(nstk, stack_.size());
218*ccdc9c3eSSadaf Ebrahimi AddState a = stk[--nstk];
219*ccdc9c3eSSadaf Ebrahimi
220*ccdc9c3eSSadaf Ebrahimi Loop:
221*ccdc9c3eSSadaf Ebrahimi if (a.t != NULL) {
222*ccdc9c3eSSadaf Ebrahimi // t0 was a thread that we allocated and copied in order to
223*ccdc9c3eSSadaf Ebrahimi // record the capture, so we must now decref it.
224*ccdc9c3eSSadaf Ebrahimi Decref(t0);
225*ccdc9c3eSSadaf Ebrahimi t0 = a.t;
226*ccdc9c3eSSadaf Ebrahimi }
227*ccdc9c3eSSadaf Ebrahimi
228*ccdc9c3eSSadaf Ebrahimi int id = a.id;
229*ccdc9c3eSSadaf Ebrahimi if (id == 0)
230*ccdc9c3eSSadaf Ebrahimi continue;
231*ccdc9c3eSSadaf Ebrahimi if (q->has_index(id)) {
232*ccdc9c3eSSadaf Ebrahimi if (ExtraDebug)
233*ccdc9c3eSSadaf Ebrahimi fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
234*ccdc9c3eSSadaf Ebrahimi continue;
235*ccdc9c3eSSadaf Ebrahimi }
236*ccdc9c3eSSadaf Ebrahimi
237*ccdc9c3eSSadaf Ebrahimi // Create entry in q no matter what. We might fill it in below,
238*ccdc9c3eSSadaf Ebrahimi // or we might not. Even if not, it is necessary to have it,
239*ccdc9c3eSSadaf Ebrahimi // so that we don't revisit id0 during the recursion.
240*ccdc9c3eSSadaf Ebrahimi q->set_new(id, NULL);
241*ccdc9c3eSSadaf Ebrahimi Thread** tp = &q->get_existing(id);
242*ccdc9c3eSSadaf Ebrahimi int j;
243*ccdc9c3eSSadaf Ebrahimi Thread* t;
244*ccdc9c3eSSadaf Ebrahimi Prog::Inst* ip = prog_->inst(id);
245*ccdc9c3eSSadaf Ebrahimi switch (ip->opcode()) {
246*ccdc9c3eSSadaf Ebrahimi default:
247*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
248*ccdc9c3eSSadaf Ebrahimi break;
249*ccdc9c3eSSadaf Ebrahimi
250*ccdc9c3eSSadaf Ebrahimi case kInstFail:
251*ccdc9c3eSSadaf Ebrahimi break;
252*ccdc9c3eSSadaf Ebrahimi
253*ccdc9c3eSSadaf Ebrahimi case kInstAltMatch:
254*ccdc9c3eSSadaf Ebrahimi // Save state; will pick up at next byte.
255*ccdc9c3eSSadaf Ebrahimi t = Incref(t0);
256*ccdc9c3eSSadaf Ebrahimi *tp = t;
257*ccdc9c3eSSadaf Ebrahimi
258*ccdc9c3eSSadaf Ebrahimi DCHECK(!ip->last());
259*ccdc9c3eSSadaf Ebrahimi a = {id+1, NULL};
260*ccdc9c3eSSadaf Ebrahimi goto Loop;
261*ccdc9c3eSSadaf Ebrahimi
262*ccdc9c3eSSadaf Ebrahimi case kInstNop:
263*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
264*ccdc9c3eSSadaf Ebrahimi stk[nstk++] = {id+1, NULL};
265*ccdc9c3eSSadaf Ebrahimi
266*ccdc9c3eSSadaf Ebrahimi // Continue on.
267*ccdc9c3eSSadaf Ebrahimi a = {ip->out(), NULL};
268*ccdc9c3eSSadaf Ebrahimi goto Loop;
269*ccdc9c3eSSadaf Ebrahimi
270*ccdc9c3eSSadaf Ebrahimi case kInstCapture:
271*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
272*ccdc9c3eSSadaf Ebrahimi stk[nstk++] = {id+1, NULL};
273*ccdc9c3eSSadaf Ebrahimi
274*ccdc9c3eSSadaf Ebrahimi if ((j=ip->cap()) < ncapture_) {
275*ccdc9c3eSSadaf Ebrahimi // Push a dummy whose only job is to restore t0
276*ccdc9c3eSSadaf Ebrahimi // once we finish exploring this possibility.
277*ccdc9c3eSSadaf Ebrahimi stk[nstk++] = {0, t0};
278*ccdc9c3eSSadaf Ebrahimi
279*ccdc9c3eSSadaf Ebrahimi // Record capture.
280*ccdc9c3eSSadaf Ebrahimi t = AllocThread();
281*ccdc9c3eSSadaf Ebrahimi CopyCapture(t->capture, t0->capture);
282*ccdc9c3eSSadaf Ebrahimi t->capture[j] = p;
283*ccdc9c3eSSadaf Ebrahimi t0 = t;
284*ccdc9c3eSSadaf Ebrahimi }
285*ccdc9c3eSSadaf Ebrahimi a = {ip->out(), NULL};
286*ccdc9c3eSSadaf Ebrahimi goto Loop;
287*ccdc9c3eSSadaf Ebrahimi
288*ccdc9c3eSSadaf Ebrahimi case kInstByteRange:
289*ccdc9c3eSSadaf Ebrahimi if (!ip->Matches(c))
290*ccdc9c3eSSadaf Ebrahimi goto Next;
291*ccdc9c3eSSadaf Ebrahimi FALLTHROUGH_INTENDED;
292*ccdc9c3eSSadaf Ebrahimi
293*ccdc9c3eSSadaf Ebrahimi case kInstMatch:
294*ccdc9c3eSSadaf Ebrahimi // Save state; will pick up at next byte.
295*ccdc9c3eSSadaf Ebrahimi t = Incref(t0);
296*ccdc9c3eSSadaf Ebrahimi *tp = t;
297*ccdc9c3eSSadaf Ebrahimi if (ExtraDebug)
298*ccdc9c3eSSadaf Ebrahimi fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
299*ccdc9c3eSSadaf Ebrahimi
300*ccdc9c3eSSadaf Ebrahimi Next:
301*ccdc9c3eSSadaf Ebrahimi if (ip->last())
302*ccdc9c3eSSadaf Ebrahimi break;
303*ccdc9c3eSSadaf Ebrahimi a = {id+1, NULL};
304*ccdc9c3eSSadaf Ebrahimi goto Loop;
305*ccdc9c3eSSadaf Ebrahimi
306*ccdc9c3eSSadaf Ebrahimi case kInstEmptyWidth:
307*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
308*ccdc9c3eSSadaf Ebrahimi stk[nstk++] = {id+1, NULL};
309*ccdc9c3eSSadaf Ebrahimi
310*ccdc9c3eSSadaf Ebrahimi // Continue on if we have all the right flag bits.
311*ccdc9c3eSSadaf Ebrahimi if (ip->empty() & ~Prog::EmptyFlags(context, p))
312*ccdc9c3eSSadaf Ebrahimi break;
313*ccdc9c3eSSadaf Ebrahimi a = {ip->out(), NULL};
314*ccdc9c3eSSadaf Ebrahimi goto Loop;
315*ccdc9c3eSSadaf Ebrahimi }
316*ccdc9c3eSSadaf Ebrahimi }
317*ccdc9c3eSSadaf Ebrahimi }
318*ccdc9c3eSSadaf Ebrahimi
319*ccdc9c3eSSadaf Ebrahimi // Run runq on byte c, appending new states to nextq.
320*ccdc9c3eSSadaf Ebrahimi // Updates matched_ and match_ as new, better matches are found.
321*ccdc9c3eSSadaf Ebrahimi // context is used (with p) for evaluating empty-width specials.
322*ccdc9c3eSSadaf Ebrahimi // p is the position of byte c in the input string for AddToThreadq;
323*ccdc9c3eSSadaf Ebrahimi // p-1 will be used when processing Match instructions.
324*ccdc9c3eSSadaf Ebrahimi // Frees all the threads on runq.
325*ccdc9c3eSSadaf Ebrahimi // If there is a shortcut to the end, returns that shortcut.
Step(Threadq * runq,Threadq * nextq,int c,const StringPiece & context,const char * p)326*ccdc9c3eSSadaf Ebrahimi int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
327*ccdc9c3eSSadaf Ebrahimi const char* p) {
328*ccdc9c3eSSadaf Ebrahimi nextq->clear();
329*ccdc9c3eSSadaf Ebrahimi
330*ccdc9c3eSSadaf Ebrahimi for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
331*ccdc9c3eSSadaf Ebrahimi Thread* t = i->value();
332*ccdc9c3eSSadaf Ebrahimi if (t == NULL)
333*ccdc9c3eSSadaf Ebrahimi continue;
334*ccdc9c3eSSadaf Ebrahimi
335*ccdc9c3eSSadaf Ebrahimi if (longest_) {
336*ccdc9c3eSSadaf Ebrahimi // Can skip any threads started after our current best match.
337*ccdc9c3eSSadaf Ebrahimi if (matched_ && match_[0] < t->capture[0]) {
338*ccdc9c3eSSadaf Ebrahimi Decref(t);
339*ccdc9c3eSSadaf Ebrahimi continue;
340*ccdc9c3eSSadaf Ebrahimi }
341*ccdc9c3eSSadaf Ebrahimi }
342*ccdc9c3eSSadaf Ebrahimi
343*ccdc9c3eSSadaf Ebrahimi int id = i->index();
344*ccdc9c3eSSadaf Ebrahimi Prog::Inst* ip = prog_->inst(id);
345*ccdc9c3eSSadaf Ebrahimi
346*ccdc9c3eSSadaf Ebrahimi switch (ip->opcode()) {
347*ccdc9c3eSSadaf Ebrahimi default:
348*ccdc9c3eSSadaf Ebrahimi // Should only see the values handled below.
349*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
350*ccdc9c3eSSadaf Ebrahimi break;
351*ccdc9c3eSSadaf Ebrahimi
352*ccdc9c3eSSadaf Ebrahimi case kInstByteRange:
353*ccdc9c3eSSadaf Ebrahimi AddToThreadq(nextq, ip->out(), c, context, p, t);
354*ccdc9c3eSSadaf Ebrahimi break;
355*ccdc9c3eSSadaf Ebrahimi
356*ccdc9c3eSSadaf Ebrahimi case kInstAltMatch:
357*ccdc9c3eSSadaf Ebrahimi if (i != runq->begin())
358*ccdc9c3eSSadaf Ebrahimi break;
359*ccdc9c3eSSadaf Ebrahimi // The match is ours if we want it.
360*ccdc9c3eSSadaf Ebrahimi if (ip->greedy(prog_) || longest_) {
361*ccdc9c3eSSadaf Ebrahimi CopyCapture(match_, t->capture);
362*ccdc9c3eSSadaf Ebrahimi matched_ = true;
363*ccdc9c3eSSadaf Ebrahimi
364*ccdc9c3eSSadaf Ebrahimi Decref(t);
365*ccdc9c3eSSadaf Ebrahimi for (++i; i != runq->end(); ++i)
366*ccdc9c3eSSadaf Ebrahimi Decref(i->value());
367*ccdc9c3eSSadaf Ebrahimi runq->clear();
368*ccdc9c3eSSadaf Ebrahimi if (ip->greedy(prog_))
369*ccdc9c3eSSadaf Ebrahimi return ip->out1();
370*ccdc9c3eSSadaf Ebrahimi return ip->out();
371*ccdc9c3eSSadaf Ebrahimi }
372*ccdc9c3eSSadaf Ebrahimi break;
373*ccdc9c3eSSadaf Ebrahimi
374*ccdc9c3eSSadaf Ebrahimi case kInstMatch: {
375*ccdc9c3eSSadaf Ebrahimi // Avoid invoking undefined behavior when p happens
376*ccdc9c3eSSadaf Ebrahimi // to be null - and p-1 would be meaningless anyway.
377*ccdc9c3eSSadaf Ebrahimi if (p == NULL)
378*ccdc9c3eSSadaf Ebrahimi break;
379*ccdc9c3eSSadaf Ebrahimi
380*ccdc9c3eSSadaf Ebrahimi if (endmatch_ && p-1 != etext_)
381*ccdc9c3eSSadaf Ebrahimi break;
382*ccdc9c3eSSadaf Ebrahimi
383*ccdc9c3eSSadaf Ebrahimi if (longest_) {
384*ccdc9c3eSSadaf Ebrahimi // Leftmost-longest mode: save this match only if
385*ccdc9c3eSSadaf Ebrahimi // it is either farther to the left or at the same
386*ccdc9c3eSSadaf Ebrahimi // point but longer than an existing match.
387*ccdc9c3eSSadaf Ebrahimi if (!matched_ || t->capture[0] < match_[0] ||
388*ccdc9c3eSSadaf Ebrahimi (t->capture[0] == match_[0] && p-1 > match_[1])) {
389*ccdc9c3eSSadaf Ebrahimi CopyCapture(match_, t->capture);
390*ccdc9c3eSSadaf Ebrahimi match_[1] = p-1;
391*ccdc9c3eSSadaf Ebrahimi matched_ = true;
392*ccdc9c3eSSadaf Ebrahimi }
393*ccdc9c3eSSadaf Ebrahimi } else {
394*ccdc9c3eSSadaf Ebrahimi // Leftmost-biased mode: this match is by definition
395*ccdc9c3eSSadaf Ebrahimi // better than what we've already found (see next line).
396*ccdc9c3eSSadaf Ebrahimi CopyCapture(match_, t->capture);
397*ccdc9c3eSSadaf Ebrahimi match_[1] = p-1;
398*ccdc9c3eSSadaf Ebrahimi matched_ = true;
399*ccdc9c3eSSadaf Ebrahimi
400*ccdc9c3eSSadaf Ebrahimi // Cut off the threads that can only find matches
401*ccdc9c3eSSadaf Ebrahimi // worse than the one we just found: don't run the
402*ccdc9c3eSSadaf Ebrahimi // rest of the current Threadq.
403*ccdc9c3eSSadaf Ebrahimi Decref(t);
404*ccdc9c3eSSadaf Ebrahimi for (++i; i != runq->end(); ++i)
405*ccdc9c3eSSadaf Ebrahimi Decref(i->value());
406*ccdc9c3eSSadaf Ebrahimi runq->clear();
407*ccdc9c3eSSadaf Ebrahimi return 0;
408*ccdc9c3eSSadaf Ebrahimi }
409*ccdc9c3eSSadaf Ebrahimi break;
410*ccdc9c3eSSadaf Ebrahimi }
411*ccdc9c3eSSadaf Ebrahimi }
412*ccdc9c3eSSadaf Ebrahimi Decref(t);
413*ccdc9c3eSSadaf Ebrahimi }
414*ccdc9c3eSSadaf Ebrahimi runq->clear();
415*ccdc9c3eSSadaf Ebrahimi return 0;
416*ccdc9c3eSSadaf Ebrahimi }
417*ccdc9c3eSSadaf Ebrahimi
FormatCapture(const char ** capture)418*ccdc9c3eSSadaf Ebrahimi string NFA::FormatCapture(const char** capture) {
419*ccdc9c3eSSadaf Ebrahimi string s;
420*ccdc9c3eSSadaf Ebrahimi
421*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < ncapture_; i+=2) {
422*ccdc9c3eSSadaf Ebrahimi if (capture[i] == NULL)
423*ccdc9c3eSSadaf Ebrahimi StringAppendF(&s, "(?,?)");
424*ccdc9c3eSSadaf Ebrahimi else if (capture[i+1] == NULL)
425*ccdc9c3eSSadaf Ebrahimi StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
426*ccdc9c3eSSadaf Ebrahimi else
427*ccdc9c3eSSadaf Ebrahimi StringAppendF(&s, "(%d,%d)",
428*ccdc9c3eSSadaf Ebrahimi (int)(capture[i] - btext_),
429*ccdc9c3eSSadaf Ebrahimi (int)(capture[i+1] - btext_));
430*ccdc9c3eSSadaf Ebrahimi }
431*ccdc9c3eSSadaf Ebrahimi return s;
432*ccdc9c3eSSadaf Ebrahimi }
433*ccdc9c3eSSadaf Ebrahimi
Search(const StringPiece & text,const StringPiece & const_context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)434*ccdc9c3eSSadaf Ebrahimi bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
435*ccdc9c3eSSadaf Ebrahimi bool anchored, bool longest,
436*ccdc9c3eSSadaf Ebrahimi StringPiece* submatch, int nsubmatch) {
437*ccdc9c3eSSadaf Ebrahimi if (start_ == 0)
438*ccdc9c3eSSadaf Ebrahimi return false;
439*ccdc9c3eSSadaf Ebrahimi
440*ccdc9c3eSSadaf Ebrahimi StringPiece context = const_context;
441*ccdc9c3eSSadaf Ebrahimi if (context.begin() == NULL)
442*ccdc9c3eSSadaf Ebrahimi context = text;
443*ccdc9c3eSSadaf Ebrahimi
444*ccdc9c3eSSadaf Ebrahimi // Sanity check: make sure that text lies within context.
445*ccdc9c3eSSadaf Ebrahimi if (text.begin() < context.begin() || text.end() > context.end()) {
446*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "context does not contain text";
447*ccdc9c3eSSadaf Ebrahimi return false;
448*ccdc9c3eSSadaf Ebrahimi }
449*ccdc9c3eSSadaf Ebrahimi
450*ccdc9c3eSSadaf Ebrahimi if (prog_->anchor_start() && context.begin() != text.begin())
451*ccdc9c3eSSadaf Ebrahimi return false;
452*ccdc9c3eSSadaf Ebrahimi if (prog_->anchor_end() && context.end() != text.end())
453*ccdc9c3eSSadaf Ebrahimi return false;
454*ccdc9c3eSSadaf Ebrahimi anchored |= prog_->anchor_start();
455*ccdc9c3eSSadaf Ebrahimi if (prog_->anchor_end()) {
456*ccdc9c3eSSadaf Ebrahimi longest = true;
457*ccdc9c3eSSadaf Ebrahimi endmatch_ = true;
458*ccdc9c3eSSadaf Ebrahimi etext_ = text.end();
459*ccdc9c3eSSadaf Ebrahimi }
460*ccdc9c3eSSadaf Ebrahimi
461*ccdc9c3eSSadaf Ebrahimi if (nsubmatch < 0) {
462*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
463*ccdc9c3eSSadaf Ebrahimi return false;
464*ccdc9c3eSSadaf Ebrahimi }
465*ccdc9c3eSSadaf Ebrahimi
466*ccdc9c3eSSadaf Ebrahimi // Save search parameters.
467*ccdc9c3eSSadaf Ebrahimi ncapture_ = 2*nsubmatch;
468*ccdc9c3eSSadaf Ebrahimi longest_ = longest;
469*ccdc9c3eSSadaf Ebrahimi
470*ccdc9c3eSSadaf Ebrahimi if (nsubmatch == 0) {
471*ccdc9c3eSSadaf Ebrahimi // We need to maintain match[0], both to distinguish the
472*ccdc9c3eSSadaf Ebrahimi // longest match (if longest is true) and also to tell
473*ccdc9c3eSSadaf Ebrahimi // whether we've seen any matches at all.
474*ccdc9c3eSSadaf Ebrahimi ncapture_ = 2;
475*ccdc9c3eSSadaf Ebrahimi }
476*ccdc9c3eSSadaf Ebrahimi
477*ccdc9c3eSSadaf Ebrahimi match_ = new const char*[ncapture_];
478*ccdc9c3eSSadaf Ebrahimi matched_ = false;
479*ccdc9c3eSSadaf Ebrahimi
480*ccdc9c3eSSadaf Ebrahimi // For debugging prints.
481*ccdc9c3eSSadaf Ebrahimi btext_ = context.begin();
482*ccdc9c3eSSadaf Ebrahimi
483*ccdc9c3eSSadaf Ebrahimi if (ExtraDebug)
484*ccdc9c3eSSadaf Ebrahimi fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
485*ccdc9c3eSSadaf Ebrahimi string(text).c_str(), string(context).c_str(), anchored, longest);
486*ccdc9c3eSSadaf Ebrahimi
487*ccdc9c3eSSadaf Ebrahimi // Set up search.
488*ccdc9c3eSSadaf Ebrahimi Threadq* runq = &q0_;
489*ccdc9c3eSSadaf Ebrahimi Threadq* nextq = &q1_;
490*ccdc9c3eSSadaf Ebrahimi runq->clear();
491*ccdc9c3eSSadaf Ebrahimi nextq->clear();
492*ccdc9c3eSSadaf Ebrahimi memset(&match_[0], 0, ncapture_*sizeof match_[0]);
493*ccdc9c3eSSadaf Ebrahimi
494*ccdc9c3eSSadaf Ebrahimi // Loop over the text, stepping the machine.
495*ccdc9c3eSSadaf Ebrahimi for (const char* p = text.begin();; p++) {
496*ccdc9c3eSSadaf Ebrahimi if (ExtraDebug) {
497*ccdc9c3eSSadaf Ebrahimi int c = 0;
498*ccdc9c3eSSadaf Ebrahimi if (p == context.begin())
499*ccdc9c3eSSadaf Ebrahimi c = '^';
500*ccdc9c3eSSadaf Ebrahimi else if (p > text.end())
501*ccdc9c3eSSadaf Ebrahimi c = '$';
502*ccdc9c3eSSadaf Ebrahimi else if (p < text.end())
503*ccdc9c3eSSadaf Ebrahimi c = p[0] & 0xFF;
504*ccdc9c3eSSadaf Ebrahimi
505*ccdc9c3eSSadaf Ebrahimi fprintf(stderr, "%c:", c);
506*ccdc9c3eSSadaf Ebrahimi for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
507*ccdc9c3eSSadaf Ebrahimi Thread* t = i->value();
508*ccdc9c3eSSadaf Ebrahimi if (t == NULL)
509*ccdc9c3eSSadaf Ebrahimi continue;
510*ccdc9c3eSSadaf Ebrahimi fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
511*ccdc9c3eSSadaf Ebrahimi }
512*ccdc9c3eSSadaf Ebrahimi fprintf(stderr, "\n");
513*ccdc9c3eSSadaf Ebrahimi }
514*ccdc9c3eSSadaf Ebrahimi
515*ccdc9c3eSSadaf Ebrahimi // This is a no-op the first time around the loop because runq is empty.
516*ccdc9c3eSSadaf Ebrahimi int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p);
517*ccdc9c3eSSadaf Ebrahimi DCHECK_EQ(runq->size(), 0);
518*ccdc9c3eSSadaf Ebrahimi using std::swap;
519*ccdc9c3eSSadaf Ebrahimi swap(nextq, runq);
520*ccdc9c3eSSadaf Ebrahimi nextq->clear();
521*ccdc9c3eSSadaf Ebrahimi if (id != 0) {
522*ccdc9c3eSSadaf Ebrahimi // We're done: full match ahead.
523*ccdc9c3eSSadaf Ebrahimi p = text.end();
524*ccdc9c3eSSadaf Ebrahimi for (;;) {
525*ccdc9c3eSSadaf Ebrahimi Prog::Inst* ip = prog_->inst(id);
526*ccdc9c3eSSadaf Ebrahimi switch (ip->opcode()) {
527*ccdc9c3eSSadaf Ebrahimi default:
528*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
529*ccdc9c3eSSadaf Ebrahimi break;
530*ccdc9c3eSSadaf Ebrahimi
531*ccdc9c3eSSadaf Ebrahimi case kInstCapture:
532*ccdc9c3eSSadaf Ebrahimi if (ip->cap() < ncapture_)
533*ccdc9c3eSSadaf Ebrahimi match_[ip->cap()] = p;
534*ccdc9c3eSSadaf Ebrahimi id = ip->out();
535*ccdc9c3eSSadaf Ebrahimi continue;
536*ccdc9c3eSSadaf Ebrahimi
537*ccdc9c3eSSadaf Ebrahimi case kInstNop:
538*ccdc9c3eSSadaf Ebrahimi id = ip->out();
539*ccdc9c3eSSadaf Ebrahimi continue;
540*ccdc9c3eSSadaf Ebrahimi
541*ccdc9c3eSSadaf Ebrahimi case kInstMatch:
542*ccdc9c3eSSadaf Ebrahimi match_[1] = p;
543*ccdc9c3eSSadaf Ebrahimi matched_ = true;
544*ccdc9c3eSSadaf Ebrahimi break;
545*ccdc9c3eSSadaf Ebrahimi }
546*ccdc9c3eSSadaf Ebrahimi break;
547*ccdc9c3eSSadaf Ebrahimi }
548*ccdc9c3eSSadaf Ebrahimi break;
549*ccdc9c3eSSadaf Ebrahimi }
550*ccdc9c3eSSadaf Ebrahimi
551*ccdc9c3eSSadaf Ebrahimi if (p > text.end())
552*ccdc9c3eSSadaf Ebrahimi break;
553*ccdc9c3eSSadaf Ebrahimi
554*ccdc9c3eSSadaf Ebrahimi // Start a new thread if there have not been any matches.
555*ccdc9c3eSSadaf Ebrahimi // (No point in starting a new thread if there have been
556*ccdc9c3eSSadaf Ebrahimi // matches, since it would be to the right of the match
557*ccdc9c3eSSadaf Ebrahimi // we already found.)
558*ccdc9c3eSSadaf Ebrahimi if (!matched_ && (!anchored || p == text.begin())) {
559*ccdc9c3eSSadaf Ebrahimi // If there's a required first byte for an unanchored search
560*ccdc9c3eSSadaf Ebrahimi // and we're not in the middle of any possible matches,
561*ccdc9c3eSSadaf Ebrahimi // use memchr to search for the byte quickly.
562*ccdc9c3eSSadaf Ebrahimi int fb = prog_->first_byte();
563*ccdc9c3eSSadaf Ebrahimi if (!anchored && runq->size() == 0 &&
564*ccdc9c3eSSadaf Ebrahimi fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
565*ccdc9c3eSSadaf Ebrahimi p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
566*ccdc9c3eSSadaf Ebrahimi if (p == NULL) {
567*ccdc9c3eSSadaf Ebrahimi p = text.end();
568*ccdc9c3eSSadaf Ebrahimi }
569*ccdc9c3eSSadaf Ebrahimi }
570*ccdc9c3eSSadaf Ebrahimi
571*ccdc9c3eSSadaf Ebrahimi Thread* t = AllocThread();
572*ccdc9c3eSSadaf Ebrahimi CopyCapture(t->capture, match_);
573*ccdc9c3eSSadaf Ebrahimi t->capture[0] = p;
574*ccdc9c3eSSadaf Ebrahimi AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p,
575*ccdc9c3eSSadaf Ebrahimi t);
576*ccdc9c3eSSadaf Ebrahimi Decref(t);
577*ccdc9c3eSSadaf Ebrahimi }
578*ccdc9c3eSSadaf Ebrahimi
579*ccdc9c3eSSadaf Ebrahimi // If all the threads have died, stop early.
580*ccdc9c3eSSadaf Ebrahimi if (runq->size() == 0) {
581*ccdc9c3eSSadaf Ebrahimi if (ExtraDebug)
582*ccdc9c3eSSadaf Ebrahimi fprintf(stderr, "dead\n");
583*ccdc9c3eSSadaf Ebrahimi break;
584*ccdc9c3eSSadaf Ebrahimi }
585*ccdc9c3eSSadaf Ebrahimi }
586*ccdc9c3eSSadaf Ebrahimi
587*ccdc9c3eSSadaf Ebrahimi for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
588*ccdc9c3eSSadaf Ebrahimi Decref(i->value());
589*ccdc9c3eSSadaf Ebrahimi
590*ccdc9c3eSSadaf Ebrahimi if (matched_) {
591*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < nsubmatch; i++)
592*ccdc9c3eSSadaf Ebrahimi submatch[i] =
593*ccdc9c3eSSadaf Ebrahimi StringPiece(match_[2 * i],
594*ccdc9c3eSSadaf Ebrahimi static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
595*ccdc9c3eSSadaf Ebrahimi if (ExtraDebug)
596*ccdc9c3eSSadaf Ebrahimi fprintf(stderr, "match (%td,%td)\n",
597*ccdc9c3eSSadaf Ebrahimi match_[0] - btext_, match_[1] - btext_);
598*ccdc9c3eSSadaf Ebrahimi return true;
599*ccdc9c3eSSadaf Ebrahimi }
600*ccdc9c3eSSadaf Ebrahimi return false;
601*ccdc9c3eSSadaf Ebrahimi }
602*ccdc9c3eSSadaf Ebrahimi
603*ccdc9c3eSSadaf Ebrahimi // Computes whether all successful matches have a common first byte,
604*ccdc9c3eSSadaf Ebrahimi // and if so, returns that byte. If not, returns -1.
ComputeFirstByte()605*ccdc9c3eSSadaf Ebrahimi int Prog::ComputeFirstByte() {
606*ccdc9c3eSSadaf Ebrahimi int b = -1;
607*ccdc9c3eSSadaf Ebrahimi SparseSet q(size());
608*ccdc9c3eSSadaf Ebrahimi q.insert(start());
609*ccdc9c3eSSadaf Ebrahimi for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
610*ccdc9c3eSSadaf Ebrahimi int id = *it;
611*ccdc9c3eSSadaf Ebrahimi Prog::Inst* ip = inst(id);
612*ccdc9c3eSSadaf Ebrahimi switch (ip->opcode()) {
613*ccdc9c3eSSadaf Ebrahimi default:
614*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
615*ccdc9c3eSSadaf Ebrahimi break;
616*ccdc9c3eSSadaf Ebrahimi
617*ccdc9c3eSSadaf Ebrahimi case kInstMatch:
618*ccdc9c3eSSadaf Ebrahimi // The empty string matches: no first byte.
619*ccdc9c3eSSadaf Ebrahimi return -1;
620*ccdc9c3eSSadaf Ebrahimi
621*ccdc9c3eSSadaf Ebrahimi case kInstByteRange:
622*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
623*ccdc9c3eSSadaf Ebrahimi q.insert(id+1);
624*ccdc9c3eSSadaf Ebrahimi
625*ccdc9c3eSSadaf Ebrahimi // Must match only a single byte
626*ccdc9c3eSSadaf Ebrahimi if (ip->lo() != ip->hi())
627*ccdc9c3eSSadaf Ebrahimi return -1;
628*ccdc9c3eSSadaf Ebrahimi if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
629*ccdc9c3eSSadaf Ebrahimi return -1;
630*ccdc9c3eSSadaf Ebrahimi // If we haven't seen any bytes yet, record it;
631*ccdc9c3eSSadaf Ebrahimi // otherwise must match the one we saw before.
632*ccdc9c3eSSadaf Ebrahimi if (b == -1)
633*ccdc9c3eSSadaf Ebrahimi b = ip->lo();
634*ccdc9c3eSSadaf Ebrahimi else if (b != ip->lo())
635*ccdc9c3eSSadaf Ebrahimi return -1;
636*ccdc9c3eSSadaf Ebrahimi break;
637*ccdc9c3eSSadaf Ebrahimi
638*ccdc9c3eSSadaf Ebrahimi case kInstNop:
639*ccdc9c3eSSadaf Ebrahimi case kInstCapture:
640*ccdc9c3eSSadaf Ebrahimi case kInstEmptyWidth:
641*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
642*ccdc9c3eSSadaf Ebrahimi q.insert(id+1);
643*ccdc9c3eSSadaf Ebrahimi
644*ccdc9c3eSSadaf Ebrahimi // Continue on.
645*ccdc9c3eSSadaf Ebrahimi // Ignore ip->empty() flags for kInstEmptyWidth
646*ccdc9c3eSSadaf Ebrahimi // in order to be as conservative as possible
647*ccdc9c3eSSadaf Ebrahimi // (assume all possible empty-width flags are true).
648*ccdc9c3eSSadaf Ebrahimi if (ip->out())
649*ccdc9c3eSSadaf Ebrahimi q.insert(ip->out());
650*ccdc9c3eSSadaf Ebrahimi break;
651*ccdc9c3eSSadaf Ebrahimi
652*ccdc9c3eSSadaf Ebrahimi case kInstAltMatch:
653*ccdc9c3eSSadaf Ebrahimi DCHECK(!ip->last());
654*ccdc9c3eSSadaf Ebrahimi q.insert(id+1);
655*ccdc9c3eSSadaf Ebrahimi break;
656*ccdc9c3eSSadaf Ebrahimi
657*ccdc9c3eSSadaf Ebrahimi case kInstFail:
658*ccdc9c3eSSadaf Ebrahimi break;
659*ccdc9c3eSSadaf Ebrahimi }
660*ccdc9c3eSSadaf Ebrahimi }
661*ccdc9c3eSSadaf Ebrahimi return b;
662*ccdc9c3eSSadaf Ebrahimi }
663*ccdc9c3eSSadaf Ebrahimi
664*ccdc9c3eSSadaf Ebrahimi bool
SearchNFA(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)665*ccdc9c3eSSadaf Ebrahimi Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
666*ccdc9c3eSSadaf Ebrahimi Anchor anchor, MatchKind kind,
667*ccdc9c3eSSadaf Ebrahimi StringPiece* match, int nmatch) {
668*ccdc9c3eSSadaf Ebrahimi if (ExtraDebug)
669*ccdc9c3eSSadaf Ebrahimi Dump();
670*ccdc9c3eSSadaf Ebrahimi
671*ccdc9c3eSSadaf Ebrahimi NFA nfa(this);
672*ccdc9c3eSSadaf Ebrahimi StringPiece sp;
673*ccdc9c3eSSadaf Ebrahimi if (kind == kFullMatch) {
674*ccdc9c3eSSadaf Ebrahimi anchor = kAnchored;
675*ccdc9c3eSSadaf Ebrahimi if (nmatch == 0) {
676*ccdc9c3eSSadaf Ebrahimi match = &sp;
677*ccdc9c3eSSadaf Ebrahimi nmatch = 1;
678*ccdc9c3eSSadaf Ebrahimi }
679*ccdc9c3eSSadaf Ebrahimi }
680*ccdc9c3eSSadaf Ebrahimi if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
681*ccdc9c3eSSadaf Ebrahimi return false;
682*ccdc9c3eSSadaf Ebrahimi if (kind == kFullMatch && match[0].end() != text.end())
683*ccdc9c3eSSadaf Ebrahimi return false;
684*ccdc9c3eSSadaf Ebrahimi return true;
685*ccdc9c3eSSadaf Ebrahimi }
686*ccdc9c3eSSadaf Ebrahimi
687*ccdc9c3eSSadaf Ebrahimi // For each instruction i in the program reachable from the start, compute the
688*ccdc9c3eSSadaf Ebrahimi // number of instructions reachable from i by following only empty transitions
689*ccdc9c3eSSadaf Ebrahimi // and record that count as fanout[i].
690*ccdc9c3eSSadaf Ebrahimi //
691*ccdc9c3eSSadaf Ebrahimi // fanout holds the results and is also the work queue for the outer iteration.
692*ccdc9c3eSSadaf Ebrahimi // reachable holds the reached nodes for the inner iteration.
Fanout(SparseArray<int> * fanout)693*ccdc9c3eSSadaf Ebrahimi void Prog::Fanout(SparseArray<int>* fanout) {
694*ccdc9c3eSSadaf Ebrahimi DCHECK_EQ(fanout->max_size(), size());
695*ccdc9c3eSSadaf Ebrahimi SparseSet reachable(size());
696*ccdc9c3eSSadaf Ebrahimi fanout->clear();
697*ccdc9c3eSSadaf Ebrahimi fanout->set_new(start(), 0);
698*ccdc9c3eSSadaf Ebrahimi for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
699*ccdc9c3eSSadaf Ebrahimi int* count = &i->value();
700*ccdc9c3eSSadaf Ebrahimi reachable.clear();
701*ccdc9c3eSSadaf Ebrahimi reachable.insert(i->index());
702*ccdc9c3eSSadaf Ebrahimi for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
703*ccdc9c3eSSadaf Ebrahimi int id = *j;
704*ccdc9c3eSSadaf Ebrahimi Prog::Inst* ip = inst(id);
705*ccdc9c3eSSadaf Ebrahimi switch (ip->opcode()) {
706*ccdc9c3eSSadaf Ebrahimi default:
707*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
708*ccdc9c3eSSadaf Ebrahimi break;
709*ccdc9c3eSSadaf Ebrahimi
710*ccdc9c3eSSadaf Ebrahimi case kInstByteRange:
711*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
712*ccdc9c3eSSadaf Ebrahimi reachable.insert(id+1);
713*ccdc9c3eSSadaf Ebrahimi
714*ccdc9c3eSSadaf Ebrahimi (*count)++;
715*ccdc9c3eSSadaf Ebrahimi if (!fanout->has_index(ip->out())) {
716*ccdc9c3eSSadaf Ebrahimi fanout->set_new(ip->out(), 0);
717*ccdc9c3eSSadaf Ebrahimi }
718*ccdc9c3eSSadaf Ebrahimi break;
719*ccdc9c3eSSadaf Ebrahimi
720*ccdc9c3eSSadaf Ebrahimi case kInstAltMatch:
721*ccdc9c3eSSadaf Ebrahimi DCHECK(!ip->last());
722*ccdc9c3eSSadaf Ebrahimi reachable.insert(id+1);
723*ccdc9c3eSSadaf Ebrahimi break;
724*ccdc9c3eSSadaf Ebrahimi
725*ccdc9c3eSSadaf Ebrahimi case kInstCapture:
726*ccdc9c3eSSadaf Ebrahimi case kInstEmptyWidth:
727*ccdc9c3eSSadaf Ebrahimi case kInstNop:
728*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
729*ccdc9c3eSSadaf Ebrahimi reachable.insert(id+1);
730*ccdc9c3eSSadaf Ebrahimi
731*ccdc9c3eSSadaf Ebrahimi reachable.insert(ip->out());
732*ccdc9c3eSSadaf Ebrahimi break;
733*ccdc9c3eSSadaf Ebrahimi
734*ccdc9c3eSSadaf Ebrahimi case kInstMatch:
735*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
736*ccdc9c3eSSadaf Ebrahimi reachable.insert(id+1);
737*ccdc9c3eSSadaf Ebrahimi break;
738*ccdc9c3eSSadaf Ebrahimi
739*ccdc9c3eSSadaf Ebrahimi case kInstFail:
740*ccdc9c3eSSadaf Ebrahimi break;
741*ccdc9c3eSSadaf Ebrahimi }
742*ccdc9c3eSSadaf Ebrahimi }
743*ccdc9c3eSSadaf Ebrahimi }
744*ccdc9c3eSSadaf Ebrahimi }
745*ccdc9c3eSSadaf Ebrahimi
746*ccdc9c3eSSadaf Ebrahimi } // namespace re2
747