// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Tested by search_test.cc, exhaustive_test.cc, tester.cc
6*ccdc9c3eSSadaf Ebrahimi
// Prog::SearchBitState is a regular expression search with submatch
// tracking for small regular expressions and texts. Like
// testing/backtrack.cc, it allocates a bit vector with (length of
// text) * (length of prog) bits, to make sure it never explores the
// same (character position, instruction) state multiple times. This
// limits the search to run in time linear in the length of the text.
//
// Unlike testing/backtrack.cc, SearchBitState is not recursive
// on the text.
//
// SearchBitState is a fast replacement for the NFA code on small
// regexps and texts when SearchOnePass cannot be used.
19*ccdc9c3eSSadaf Ebrahimi
20*ccdc9c3eSSadaf Ebrahimi #include <stddef.h>
21*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
22*ccdc9c3eSSadaf Ebrahimi #include <string.h>
23*ccdc9c3eSSadaf Ebrahimi #include <utility>
24*ccdc9c3eSSadaf Ebrahimi
25*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
26*ccdc9c3eSSadaf Ebrahimi #include "util/pod_array.h"
27*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
28*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
29*ccdc9c3eSSadaf Ebrahimi
30*ccdc9c3eSSadaf Ebrahimi namespace re2 {
31*ccdc9c3eSSadaf Ebrahimi
// One entry on BitState's explicit backtracking stack.
struct Job {
  int id;         // index of the instruction to run
  int arg;        // 0 for a fresh visit; nonzero when resuming an earlier one
  const char* p;  // text position, or a saved capture pointer to restore
};
37*ccdc9c3eSSadaf Ebrahimi
38*ccdc9c3eSSadaf Ebrahimi class BitState {
39*ccdc9c3eSSadaf Ebrahimi public:
40*ccdc9c3eSSadaf Ebrahimi explicit BitState(Prog* prog);
41*ccdc9c3eSSadaf Ebrahimi
42*ccdc9c3eSSadaf Ebrahimi // The usual Search prototype.
43*ccdc9c3eSSadaf Ebrahimi // Can only call Search once per BitState.
44*ccdc9c3eSSadaf Ebrahimi bool Search(const StringPiece& text, const StringPiece& context,
45*ccdc9c3eSSadaf Ebrahimi bool anchored, bool longest,
46*ccdc9c3eSSadaf Ebrahimi StringPiece* submatch, int nsubmatch);
47*ccdc9c3eSSadaf Ebrahimi
48*ccdc9c3eSSadaf Ebrahimi private:
49*ccdc9c3eSSadaf Ebrahimi inline bool ShouldVisit(int id, const char* p);
50*ccdc9c3eSSadaf Ebrahimi void Push(int id, const char* p, int arg);
51*ccdc9c3eSSadaf Ebrahimi void GrowStack();
52*ccdc9c3eSSadaf Ebrahimi bool TrySearch(int id, const char* p);
53*ccdc9c3eSSadaf Ebrahimi
54*ccdc9c3eSSadaf Ebrahimi // Search parameters
55*ccdc9c3eSSadaf Ebrahimi Prog* prog_; // program being run
56*ccdc9c3eSSadaf Ebrahimi StringPiece text_; // text being searched
57*ccdc9c3eSSadaf Ebrahimi StringPiece context_; // greater context of text being searched
58*ccdc9c3eSSadaf Ebrahimi bool anchored_; // whether search is anchored at text.begin()
59*ccdc9c3eSSadaf Ebrahimi bool longest_; // whether search wants leftmost-longest match
60*ccdc9c3eSSadaf Ebrahimi bool endmatch_; // whether match must end at text.end()
61*ccdc9c3eSSadaf Ebrahimi StringPiece* submatch_; // submatches to fill in
62*ccdc9c3eSSadaf Ebrahimi int nsubmatch_; // # of submatches to fill in
63*ccdc9c3eSSadaf Ebrahimi
64*ccdc9c3eSSadaf Ebrahimi // Search state
65*ccdc9c3eSSadaf Ebrahimi static const int VisitedBits = 32;
66*ccdc9c3eSSadaf Ebrahimi PODArray<uint32_t> visited_; // bitmap: (Inst*, char*) pairs visited
67*ccdc9c3eSSadaf Ebrahimi PODArray<const char*> cap_; // capture registers
68*ccdc9c3eSSadaf Ebrahimi PODArray<Job> job_; // stack of text positions to explore
69*ccdc9c3eSSadaf Ebrahimi int njob_; // stack size
70*ccdc9c3eSSadaf Ebrahimi };
71*ccdc9c3eSSadaf Ebrahimi
BitState(Prog * prog)72*ccdc9c3eSSadaf Ebrahimi BitState::BitState(Prog* prog)
73*ccdc9c3eSSadaf Ebrahimi : prog_(prog),
74*ccdc9c3eSSadaf Ebrahimi anchored_(false),
75*ccdc9c3eSSadaf Ebrahimi longest_(false),
76*ccdc9c3eSSadaf Ebrahimi endmatch_(false),
77*ccdc9c3eSSadaf Ebrahimi submatch_(NULL),
78*ccdc9c3eSSadaf Ebrahimi nsubmatch_(0),
79*ccdc9c3eSSadaf Ebrahimi njob_(0) {
80*ccdc9c3eSSadaf Ebrahimi }
81*ccdc9c3eSSadaf Ebrahimi
82*ccdc9c3eSSadaf Ebrahimi // Should the search visit the pair ip, p?
83*ccdc9c3eSSadaf Ebrahimi // If so, remember that it was visited so that the next time,
84*ccdc9c3eSSadaf Ebrahimi // we don't repeat the visit.
ShouldVisit(int id,const char * p)85*ccdc9c3eSSadaf Ebrahimi bool BitState::ShouldVisit(int id, const char* p) {
86*ccdc9c3eSSadaf Ebrahimi int n = id * static_cast<int>(text_.size()+1) +
87*ccdc9c3eSSadaf Ebrahimi static_cast<int>(p-text_.begin());
88*ccdc9c3eSSadaf Ebrahimi if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
89*ccdc9c3eSSadaf Ebrahimi return false;
90*ccdc9c3eSSadaf Ebrahimi visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
91*ccdc9c3eSSadaf Ebrahimi return true;
92*ccdc9c3eSSadaf Ebrahimi }
93*ccdc9c3eSSadaf Ebrahimi
94*ccdc9c3eSSadaf Ebrahimi // Grow the stack.
GrowStack()95*ccdc9c3eSSadaf Ebrahimi void BitState::GrowStack() {
96*ccdc9c3eSSadaf Ebrahimi PODArray<Job> tmp(2*job_.size());
97*ccdc9c3eSSadaf Ebrahimi memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
98*ccdc9c3eSSadaf Ebrahimi job_ = std::move(tmp);
99*ccdc9c3eSSadaf Ebrahimi }
100*ccdc9c3eSSadaf Ebrahimi
101*ccdc9c3eSSadaf Ebrahimi // Push the triple (id, p, arg) onto the stack, growing it if necessary.
Push(int id,const char * p,int arg)102*ccdc9c3eSSadaf Ebrahimi void BitState::Push(int id, const char* p, int arg) {
103*ccdc9c3eSSadaf Ebrahimi if (njob_ >= job_.size()) {
104*ccdc9c3eSSadaf Ebrahimi GrowStack();
105*ccdc9c3eSSadaf Ebrahimi if (njob_ >= job_.size()) {
106*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "GrowStack() failed: "
107*ccdc9c3eSSadaf Ebrahimi << "njob_ = " << njob_ << ", "
108*ccdc9c3eSSadaf Ebrahimi << "job_.size() = " << job_.size();
109*ccdc9c3eSSadaf Ebrahimi return;
110*ccdc9c3eSSadaf Ebrahimi }
111*ccdc9c3eSSadaf Ebrahimi }
112*ccdc9c3eSSadaf Ebrahimi int op = prog_->inst(id)->opcode();
113*ccdc9c3eSSadaf Ebrahimi if (op == kInstFail)
114*ccdc9c3eSSadaf Ebrahimi return;
115*ccdc9c3eSSadaf Ebrahimi
116*ccdc9c3eSSadaf Ebrahimi // Only check ShouldVisit when arg == 0.
117*ccdc9c3eSSadaf Ebrahimi // When arg > 0, we are continuing a previous visit.
118*ccdc9c3eSSadaf Ebrahimi if (arg == 0 && !ShouldVisit(id, p))
119*ccdc9c3eSSadaf Ebrahimi return;
120*ccdc9c3eSSadaf Ebrahimi
121*ccdc9c3eSSadaf Ebrahimi Job* j = &job_[njob_++];
122*ccdc9c3eSSadaf Ebrahimi j->id = id;
123*ccdc9c3eSSadaf Ebrahimi j->p = p;
124*ccdc9c3eSSadaf Ebrahimi j->arg = arg;
125*ccdc9c3eSSadaf Ebrahimi }
126*ccdc9c3eSSadaf Ebrahimi
127*ccdc9c3eSSadaf Ebrahimi // Try a search from instruction id0 in state p0.
128*ccdc9c3eSSadaf Ebrahimi // Return whether it succeeded.
TrySearch(int id0,const char * p0)129*ccdc9c3eSSadaf Ebrahimi bool BitState::TrySearch(int id0, const char* p0) {
130*ccdc9c3eSSadaf Ebrahimi bool matched = false;
131*ccdc9c3eSSadaf Ebrahimi bool inaltmatch = false;
132*ccdc9c3eSSadaf Ebrahimi const char* end = text_.end();
133*ccdc9c3eSSadaf Ebrahimi njob_ = 0;
134*ccdc9c3eSSadaf Ebrahimi Push(id0, p0, 0);
135*ccdc9c3eSSadaf Ebrahimi while (njob_ > 0) {
136*ccdc9c3eSSadaf Ebrahimi // Pop job off stack.
137*ccdc9c3eSSadaf Ebrahimi --njob_;
138*ccdc9c3eSSadaf Ebrahimi int id = job_[njob_].id;
139*ccdc9c3eSSadaf Ebrahimi const char* p = job_[njob_].p;
140*ccdc9c3eSSadaf Ebrahimi int arg = job_[njob_].arg;
141*ccdc9c3eSSadaf Ebrahimi
142*ccdc9c3eSSadaf Ebrahimi // Optimization: rather than push and pop,
143*ccdc9c3eSSadaf Ebrahimi // code that is going to Push and continue
144*ccdc9c3eSSadaf Ebrahimi // the loop simply updates ip, p, and arg
145*ccdc9c3eSSadaf Ebrahimi // and jumps to CheckAndLoop. We have to
146*ccdc9c3eSSadaf Ebrahimi // do the ShouldVisit check that Push
147*ccdc9c3eSSadaf Ebrahimi // would have, but we avoid the stack
148*ccdc9c3eSSadaf Ebrahimi // manipulation.
149*ccdc9c3eSSadaf Ebrahimi if (0) {
150*ccdc9c3eSSadaf Ebrahimi Next:
151*ccdc9c3eSSadaf Ebrahimi // If the Match of a non-greedy AltMatch failed,
152*ccdc9c3eSSadaf Ebrahimi // we stop ourselves from trying the ByteRange,
153*ccdc9c3eSSadaf Ebrahimi // which would steer us off the short circuit.
154*ccdc9c3eSSadaf Ebrahimi if (prog_->inst(id)->last() || inaltmatch)
155*ccdc9c3eSSadaf Ebrahimi continue;
156*ccdc9c3eSSadaf Ebrahimi id++;
157*ccdc9c3eSSadaf Ebrahimi
158*ccdc9c3eSSadaf Ebrahimi CheckAndLoop:
159*ccdc9c3eSSadaf Ebrahimi if (!ShouldVisit(id, p))
160*ccdc9c3eSSadaf Ebrahimi continue;
161*ccdc9c3eSSadaf Ebrahimi }
162*ccdc9c3eSSadaf Ebrahimi
163*ccdc9c3eSSadaf Ebrahimi // Visit ip, p.
164*ccdc9c3eSSadaf Ebrahimi Prog::Inst* ip = prog_->inst(id);
165*ccdc9c3eSSadaf Ebrahimi switch (ip->opcode()) {
166*ccdc9c3eSSadaf Ebrahimi default:
167*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
168*ccdc9c3eSSadaf Ebrahimi return false;
169*ccdc9c3eSSadaf Ebrahimi
170*ccdc9c3eSSadaf Ebrahimi case kInstFail:
171*ccdc9c3eSSadaf Ebrahimi continue;
172*ccdc9c3eSSadaf Ebrahimi
173*ccdc9c3eSSadaf Ebrahimi case kInstAltMatch:
174*ccdc9c3eSSadaf Ebrahimi switch (arg) {
175*ccdc9c3eSSadaf Ebrahimi case 0:
176*ccdc9c3eSSadaf Ebrahimi inaltmatch = true;
177*ccdc9c3eSSadaf Ebrahimi Push(id, p, 1); // come back when we're done
178*ccdc9c3eSSadaf Ebrahimi
179*ccdc9c3eSSadaf Ebrahimi // One opcode is ByteRange; the other leads to Match
180*ccdc9c3eSSadaf Ebrahimi // (possibly via Nop or Capture).
181*ccdc9c3eSSadaf Ebrahimi if (ip->greedy(prog_)) {
182*ccdc9c3eSSadaf Ebrahimi // out1 is the match
183*ccdc9c3eSSadaf Ebrahimi Push(ip->out1(), p, 0);
184*ccdc9c3eSSadaf Ebrahimi id = ip->out1();
185*ccdc9c3eSSadaf Ebrahimi p = end;
186*ccdc9c3eSSadaf Ebrahimi goto CheckAndLoop;
187*ccdc9c3eSSadaf Ebrahimi }
188*ccdc9c3eSSadaf Ebrahimi // out is the match - non-greedy
189*ccdc9c3eSSadaf Ebrahimi Push(ip->out(), end, 0);
190*ccdc9c3eSSadaf Ebrahimi id = ip->out();
191*ccdc9c3eSSadaf Ebrahimi goto CheckAndLoop;
192*ccdc9c3eSSadaf Ebrahimi
193*ccdc9c3eSSadaf Ebrahimi case 1:
194*ccdc9c3eSSadaf Ebrahimi inaltmatch = false;
195*ccdc9c3eSSadaf Ebrahimi continue;
196*ccdc9c3eSSadaf Ebrahimi }
197*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg;
198*ccdc9c3eSSadaf Ebrahimi continue;
199*ccdc9c3eSSadaf Ebrahimi
200*ccdc9c3eSSadaf Ebrahimi case kInstByteRange: {
201*ccdc9c3eSSadaf Ebrahimi int c = -1;
202*ccdc9c3eSSadaf Ebrahimi if (p < end)
203*ccdc9c3eSSadaf Ebrahimi c = *p & 0xFF;
204*ccdc9c3eSSadaf Ebrahimi if (!ip->Matches(c))
205*ccdc9c3eSSadaf Ebrahimi goto Next;
206*ccdc9c3eSSadaf Ebrahimi
207*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
208*ccdc9c3eSSadaf Ebrahimi Push(id+1, p, 0); // try the next when we're done
209*ccdc9c3eSSadaf Ebrahimi id = ip->out();
210*ccdc9c3eSSadaf Ebrahimi p++;
211*ccdc9c3eSSadaf Ebrahimi goto CheckAndLoop;
212*ccdc9c3eSSadaf Ebrahimi }
213*ccdc9c3eSSadaf Ebrahimi
214*ccdc9c3eSSadaf Ebrahimi case kInstCapture:
215*ccdc9c3eSSadaf Ebrahimi switch (arg) {
216*ccdc9c3eSSadaf Ebrahimi case 0:
217*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
218*ccdc9c3eSSadaf Ebrahimi Push(id+1, p, 0); // try the next when we're done
219*ccdc9c3eSSadaf Ebrahimi
220*ccdc9c3eSSadaf Ebrahimi if (0 <= ip->cap() && ip->cap() < cap_.size()) {
221*ccdc9c3eSSadaf Ebrahimi // Capture p to register, but save old value.
222*ccdc9c3eSSadaf Ebrahimi Push(id, cap_[ip->cap()], 1); // come back when we're done
223*ccdc9c3eSSadaf Ebrahimi cap_[ip->cap()] = p;
224*ccdc9c3eSSadaf Ebrahimi }
225*ccdc9c3eSSadaf Ebrahimi
226*ccdc9c3eSSadaf Ebrahimi // Continue on.
227*ccdc9c3eSSadaf Ebrahimi id = ip->out();
228*ccdc9c3eSSadaf Ebrahimi goto CheckAndLoop;
229*ccdc9c3eSSadaf Ebrahimi
230*ccdc9c3eSSadaf Ebrahimi case 1:
231*ccdc9c3eSSadaf Ebrahimi // Finished ip->out(); restore the old value.
232*ccdc9c3eSSadaf Ebrahimi cap_[ip->cap()] = p;
233*ccdc9c3eSSadaf Ebrahimi continue;
234*ccdc9c3eSSadaf Ebrahimi }
235*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
236*ccdc9c3eSSadaf Ebrahimi continue;
237*ccdc9c3eSSadaf Ebrahimi
238*ccdc9c3eSSadaf Ebrahimi case kInstEmptyWidth:
239*ccdc9c3eSSadaf Ebrahimi if (ip->empty() & ~Prog::EmptyFlags(context_, p))
240*ccdc9c3eSSadaf Ebrahimi goto Next;
241*ccdc9c3eSSadaf Ebrahimi
242*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
243*ccdc9c3eSSadaf Ebrahimi Push(id+1, p, 0); // try the next when we're done
244*ccdc9c3eSSadaf Ebrahimi id = ip->out();
245*ccdc9c3eSSadaf Ebrahimi goto CheckAndLoop;
246*ccdc9c3eSSadaf Ebrahimi
247*ccdc9c3eSSadaf Ebrahimi case kInstNop:
248*ccdc9c3eSSadaf Ebrahimi if (!ip->last())
249*ccdc9c3eSSadaf Ebrahimi Push(id+1, p, 0); // try the next when we're done
250*ccdc9c3eSSadaf Ebrahimi id = ip->out();
251*ccdc9c3eSSadaf Ebrahimi goto CheckAndLoop;
252*ccdc9c3eSSadaf Ebrahimi
253*ccdc9c3eSSadaf Ebrahimi case kInstMatch: {
254*ccdc9c3eSSadaf Ebrahimi if (endmatch_ && p != text_.end())
255*ccdc9c3eSSadaf Ebrahimi goto Next;
256*ccdc9c3eSSadaf Ebrahimi
257*ccdc9c3eSSadaf Ebrahimi // We found a match. If the caller doesn't care
258*ccdc9c3eSSadaf Ebrahimi // where the match is, no point going further.
259*ccdc9c3eSSadaf Ebrahimi if (nsubmatch_ == 0)
260*ccdc9c3eSSadaf Ebrahimi return true;
261*ccdc9c3eSSadaf Ebrahimi
262*ccdc9c3eSSadaf Ebrahimi // Record best match so far.
263*ccdc9c3eSSadaf Ebrahimi // Only need to check end point, because this entire
264*ccdc9c3eSSadaf Ebrahimi // call is only considering one start position.
265*ccdc9c3eSSadaf Ebrahimi matched = true;
266*ccdc9c3eSSadaf Ebrahimi cap_[1] = p;
267*ccdc9c3eSSadaf Ebrahimi if (submatch_[0].data() == NULL ||
268*ccdc9c3eSSadaf Ebrahimi (longest_ && p > submatch_[0].end())) {
269*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < nsubmatch_; i++)
270*ccdc9c3eSSadaf Ebrahimi submatch_[i] =
271*ccdc9c3eSSadaf Ebrahimi StringPiece(cap_[2 * i],
272*ccdc9c3eSSadaf Ebrahimi static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
273*ccdc9c3eSSadaf Ebrahimi }
274*ccdc9c3eSSadaf Ebrahimi
275*ccdc9c3eSSadaf Ebrahimi // If going for first match, we're done.
276*ccdc9c3eSSadaf Ebrahimi if (!longest_)
277*ccdc9c3eSSadaf Ebrahimi return true;
278*ccdc9c3eSSadaf Ebrahimi
279*ccdc9c3eSSadaf Ebrahimi // If we used the entire text, no longer match is possible.
280*ccdc9c3eSSadaf Ebrahimi if (p == text_.end())
281*ccdc9c3eSSadaf Ebrahimi return true;
282*ccdc9c3eSSadaf Ebrahimi
283*ccdc9c3eSSadaf Ebrahimi // Otherwise, continue on in hope of a longer match.
284*ccdc9c3eSSadaf Ebrahimi goto Next;
285*ccdc9c3eSSadaf Ebrahimi }
286*ccdc9c3eSSadaf Ebrahimi }
287*ccdc9c3eSSadaf Ebrahimi }
288*ccdc9c3eSSadaf Ebrahimi return matched;
289*ccdc9c3eSSadaf Ebrahimi }
290*ccdc9c3eSSadaf Ebrahimi
291*ccdc9c3eSSadaf Ebrahimi // Search text (within context) for prog_.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)292*ccdc9c3eSSadaf Ebrahimi bool BitState::Search(const StringPiece& text, const StringPiece& context,
293*ccdc9c3eSSadaf Ebrahimi bool anchored, bool longest,
294*ccdc9c3eSSadaf Ebrahimi StringPiece* submatch, int nsubmatch) {
295*ccdc9c3eSSadaf Ebrahimi // Search parameters.
296*ccdc9c3eSSadaf Ebrahimi text_ = text;
297*ccdc9c3eSSadaf Ebrahimi context_ = context;
298*ccdc9c3eSSadaf Ebrahimi if (context_.begin() == NULL)
299*ccdc9c3eSSadaf Ebrahimi context_ = text;
300*ccdc9c3eSSadaf Ebrahimi if (prog_->anchor_start() && context_.begin() != text.begin())
301*ccdc9c3eSSadaf Ebrahimi return false;
302*ccdc9c3eSSadaf Ebrahimi if (prog_->anchor_end() && context_.end() != text.end())
303*ccdc9c3eSSadaf Ebrahimi return false;
304*ccdc9c3eSSadaf Ebrahimi anchored_ = anchored || prog_->anchor_start();
305*ccdc9c3eSSadaf Ebrahimi longest_ = longest || prog_->anchor_end();
306*ccdc9c3eSSadaf Ebrahimi endmatch_ = prog_->anchor_end();
307*ccdc9c3eSSadaf Ebrahimi submatch_ = submatch;
308*ccdc9c3eSSadaf Ebrahimi nsubmatch_ = nsubmatch;
309*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < nsubmatch_; i++)
310*ccdc9c3eSSadaf Ebrahimi submatch_[i] = StringPiece();
311*ccdc9c3eSSadaf Ebrahimi
312*ccdc9c3eSSadaf Ebrahimi // Allocate scratch space.
313*ccdc9c3eSSadaf Ebrahimi int nvisited = prog_->size() * static_cast<int>(text.size()+1);
314*ccdc9c3eSSadaf Ebrahimi nvisited = (nvisited + VisitedBits-1) / VisitedBits;
315*ccdc9c3eSSadaf Ebrahimi visited_ = PODArray<uint32_t>(nvisited);
316*ccdc9c3eSSadaf Ebrahimi memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
317*ccdc9c3eSSadaf Ebrahimi
318*ccdc9c3eSSadaf Ebrahimi int ncap = 2*nsubmatch;
319*ccdc9c3eSSadaf Ebrahimi if (ncap < 2)
320*ccdc9c3eSSadaf Ebrahimi ncap = 2;
321*ccdc9c3eSSadaf Ebrahimi cap_ = PODArray<const char*>(ncap);
322*ccdc9c3eSSadaf Ebrahimi memset(cap_.data(), 0, ncap*sizeof cap_[0]);
323*ccdc9c3eSSadaf Ebrahimi
324*ccdc9c3eSSadaf Ebrahimi // When sizeof(Job) == 16, we start with a nice round 4KiB. :)
325*ccdc9c3eSSadaf Ebrahimi job_ = PODArray<Job>(256);
326*ccdc9c3eSSadaf Ebrahimi
327*ccdc9c3eSSadaf Ebrahimi // Anchored search must start at text.begin().
328*ccdc9c3eSSadaf Ebrahimi if (anchored_) {
329*ccdc9c3eSSadaf Ebrahimi cap_[0] = text.begin();
330*ccdc9c3eSSadaf Ebrahimi return TrySearch(prog_->start(), text.begin());
331*ccdc9c3eSSadaf Ebrahimi }
332*ccdc9c3eSSadaf Ebrahimi
333*ccdc9c3eSSadaf Ebrahimi // Unanchored search, starting from each possible text position.
334*ccdc9c3eSSadaf Ebrahimi // Notice that we have to try the empty string at the end of
335*ccdc9c3eSSadaf Ebrahimi // the text, so the loop condition is p <= text.end(), not p < text.end().
336*ccdc9c3eSSadaf Ebrahimi // This looks like it's quadratic in the size of the text,
337*ccdc9c3eSSadaf Ebrahimi // but we are not clearing visited_ between calls to TrySearch,
338*ccdc9c3eSSadaf Ebrahimi // so no work is duplicated and it ends up still being linear.
339*ccdc9c3eSSadaf Ebrahimi for (const char* p = text.begin(); p <= text.end(); p++) {
340*ccdc9c3eSSadaf Ebrahimi // Try to use memchr to find the first byte quickly.
341*ccdc9c3eSSadaf Ebrahimi int fb = prog_->first_byte();
342*ccdc9c3eSSadaf Ebrahimi if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
343*ccdc9c3eSSadaf Ebrahimi p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
344*ccdc9c3eSSadaf Ebrahimi if (p == NULL)
345*ccdc9c3eSSadaf Ebrahimi p = text.end();
346*ccdc9c3eSSadaf Ebrahimi }
347*ccdc9c3eSSadaf Ebrahimi
348*ccdc9c3eSSadaf Ebrahimi cap_[0] = p;
349*ccdc9c3eSSadaf Ebrahimi if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
350*ccdc9c3eSSadaf Ebrahimi return true;
351*ccdc9c3eSSadaf Ebrahimi }
352*ccdc9c3eSSadaf Ebrahimi return false;
353*ccdc9c3eSSadaf Ebrahimi }
354*ccdc9c3eSSadaf Ebrahimi
355*ccdc9c3eSSadaf Ebrahimi // Bit-state search.
SearchBitState(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)356*ccdc9c3eSSadaf Ebrahimi bool Prog::SearchBitState(const StringPiece& text,
357*ccdc9c3eSSadaf Ebrahimi const StringPiece& context,
358*ccdc9c3eSSadaf Ebrahimi Anchor anchor,
359*ccdc9c3eSSadaf Ebrahimi MatchKind kind,
360*ccdc9c3eSSadaf Ebrahimi StringPiece* match,
361*ccdc9c3eSSadaf Ebrahimi int nmatch) {
362*ccdc9c3eSSadaf Ebrahimi // If full match, we ask for an anchored longest match
363*ccdc9c3eSSadaf Ebrahimi // and then check that match[0] == text.
364*ccdc9c3eSSadaf Ebrahimi // So make sure match[0] exists.
365*ccdc9c3eSSadaf Ebrahimi StringPiece sp0;
366*ccdc9c3eSSadaf Ebrahimi if (kind == kFullMatch) {
367*ccdc9c3eSSadaf Ebrahimi anchor = kAnchored;
368*ccdc9c3eSSadaf Ebrahimi if (nmatch < 1) {
369*ccdc9c3eSSadaf Ebrahimi match = &sp0;
370*ccdc9c3eSSadaf Ebrahimi nmatch = 1;
371*ccdc9c3eSSadaf Ebrahimi }
372*ccdc9c3eSSadaf Ebrahimi }
373*ccdc9c3eSSadaf Ebrahimi
374*ccdc9c3eSSadaf Ebrahimi // Run the search.
375*ccdc9c3eSSadaf Ebrahimi BitState b(this);
376*ccdc9c3eSSadaf Ebrahimi bool anchored = anchor == kAnchored;
377*ccdc9c3eSSadaf Ebrahimi bool longest = kind != kFirstMatch;
378*ccdc9c3eSSadaf Ebrahimi if (!b.Search(text, context, anchored, longest, match, nmatch))
379*ccdc9c3eSSadaf Ebrahimi return false;
380*ccdc9c3eSSadaf Ebrahimi if (kind == kFullMatch && match[0].end() != text.end())
381*ccdc9c3eSSadaf Ebrahimi return false;
382*ccdc9c3eSSadaf Ebrahimi return true;
383*ccdc9c3eSSadaf Ebrahimi }
384*ccdc9c3eSSadaf Ebrahimi
385*ccdc9c3eSSadaf Ebrahimi } // namespace re2
386