1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6
7 // Prog::SearchBitState is a regular expression search with submatch
8 // tracking for small regular expressions and texts. Like
9 // testing/backtrack.cc, it allocates a bit vector with (length of
10 // text) * (length of prog) bits, to make sure it never explores the
11 // same (character position, instruction) state multiple times. This
12 // limits the search to run in time linear in the length of the text.
13 //
14 // Unlike testing/backtrack.cc, SearchBitState is not recursive
15 // on the text.
16 //
17 // SearchBitState is a fast replacement for the NFA code on small
18 // regexps and texts when SearchOnePass cannot be used.
19
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <string.h>
23 #include <utility>
24
25 #include "util/logging.h"
26 #include "util/pod_array.h"
27 #include "re2/prog.h"
28 #include "re2/regexp.h"
29
30 namespace re2 {
31
32 struct Job {
33 int id;
34 int arg;
35 const char* p;
36 };
37
38 class BitState {
39 public:
40 explicit BitState(Prog* prog);
41
42 // The usual Search prototype.
43 // Can only call Search once per BitState.
44 bool Search(const StringPiece& text, const StringPiece& context,
45 bool anchored, bool longest,
46 StringPiece* submatch, int nsubmatch);
47
48 private:
49 inline bool ShouldVisit(int id, const char* p);
50 void Push(int id, const char* p, int arg);
51 void GrowStack();
52 bool TrySearch(int id, const char* p);
53
54 // Search parameters
55 Prog* prog_; // program being run
56 StringPiece text_; // text being searched
57 StringPiece context_; // greater context of text being searched
58 bool anchored_; // whether search is anchored at text.begin()
59 bool longest_; // whether search wants leftmost-longest match
60 bool endmatch_; // whether match must end at text.end()
61 StringPiece* submatch_; // submatches to fill in
62 int nsubmatch_; // # of submatches to fill in
63
64 // Search state
65 static const int VisitedBits = 32;
66 PODArray<uint32_t> visited_; // bitmap: (Inst*, char*) pairs visited
67 PODArray<const char*> cap_; // capture registers
68 PODArray<Job> job_; // stack of text positions to explore
69 int njob_; // stack size
70 };
71
BitState(Prog * prog)72 BitState::BitState(Prog* prog)
73 : prog_(prog),
74 anchored_(false),
75 longest_(false),
76 endmatch_(false),
77 submatch_(NULL),
78 nsubmatch_(0),
79 njob_(0) {
80 }
81
82 // Should the search visit the pair ip, p?
83 // If so, remember that it was visited so that the next time,
84 // we don't repeat the visit.
ShouldVisit(int id,const char * p)85 bool BitState::ShouldVisit(int id, const char* p) {
86 int n = id * static_cast<int>(text_.size()+1) +
87 static_cast<int>(p-text_.begin());
88 if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
89 return false;
90 visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
91 return true;
92 }
93
94 // Grow the stack.
GrowStack()95 void BitState::GrowStack() {
96 PODArray<Job> tmp(2*job_.size());
97 memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
98 job_ = std::move(tmp);
99 }
100
101 // Push the triple (id, p, arg) onto the stack, growing it if necessary.
Push(int id,const char * p,int arg)102 void BitState::Push(int id, const char* p, int arg) {
103 if (njob_ >= job_.size()) {
104 GrowStack();
105 if (njob_ >= job_.size()) {
106 LOG(DFATAL) << "GrowStack() failed: "
107 << "njob_ = " << njob_ << ", "
108 << "job_.size() = " << job_.size();
109 return;
110 }
111 }
112 int op = prog_->inst(id)->opcode();
113 if (op == kInstFail)
114 return;
115
116 // Only check ShouldVisit when arg == 0.
117 // When arg > 0, we are continuing a previous visit.
118 if (arg == 0 && !ShouldVisit(id, p))
119 return;
120
121 Job* j = &job_[njob_++];
122 j->id = id;
123 j->p = p;
124 j->arg = arg;
125 }
126
127 // Try a search from instruction id0 in state p0.
128 // Return whether it succeeded.
TrySearch(int id0,const char * p0)129 bool BitState::TrySearch(int id0, const char* p0) {
130 bool matched = false;
131 bool inaltmatch = false;
132 const char* end = text_.end();
133 njob_ = 0;
134 Push(id0, p0, 0);
135 while (njob_ > 0) {
136 // Pop job off stack.
137 --njob_;
138 int id = job_[njob_].id;
139 const char* p = job_[njob_].p;
140 int arg = job_[njob_].arg;
141
142 // Optimization: rather than push and pop,
143 // code that is going to Push and continue
144 // the loop simply updates ip, p, and arg
145 // and jumps to CheckAndLoop. We have to
146 // do the ShouldVisit check that Push
147 // would have, but we avoid the stack
148 // manipulation.
149 if (0) {
150 Next:
151 // If the Match of a non-greedy AltMatch failed,
152 // we stop ourselves from trying the ByteRange,
153 // which would steer us off the short circuit.
154 if (prog_->inst(id)->last() || inaltmatch)
155 continue;
156 id++;
157
158 CheckAndLoop:
159 if (!ShouldVisit(id, p))
160 continue;
161 }
162
163 // Visit ip, p.
164 Prog::Inst* ip = prog_->inst(id);
165 switch (ip->opcode()) {
166 default:
167 LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
168 return false;
169
170 case kInstFail:
171 continue;
172
173 case kInstAltMatch:
174 switch (arg) {
175 case 0:
176 inaltmatch = true;
177 Push(id, p, 1); // come back when we're done
178
179 // One opcode is ByteRange; the other leads to Match
180 // (possibly via Nop or Capture).
181 if (ip->greedy(prog_)) {
182 // out1 is the match
183 Push(ip->out1(), p, 0);
184 id = ip->out1();
185 p = end;
186 goto CheckAndLoop;
187 }
188 // out is the match - non-greedy
189 Push(ip->out(), end, 0);
190 id = ip->out();
191 goto CheckAndLoop;
192
193 case 1:
194 inaltmatch = false;
195 continue;
196 }
197 LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg;
198 continue;
199
200 case kInstByteRange: {
201 int c = -1;
202 if (p < end)
203 c = *p & 0xFF;
204 if (!ip->Matches(c))
205 goto Next;
206
207 if (!ip->last())
208 Push(id+1, p, 0); // try the next when we're done
209 id = ip->out();
210 p++;
211 goto CheckAndLoop;
212 }
213
214 case kInstCapture:
215 switch (arg) {
216 case 0:
217 if (!ip->last())
218 Push(id+1, p, 0); // try the next when we're done
219
220 if (0 <= ip->cap() && ip->cap() < cap_.size()) {
221 // Capture p to register, but save old value.
222 Push(id, cap_[ip->cap()], 1); // come back when we're done
223 cap_[ip->cap()] = p;
224 }
225
226 // Continue on.
227 id = ip->out();
228 goto CheckAndLoop;
229
230 case 1:
231 // Finished ip->out(); restore the old value.
232 cap_[ip->cap()] = p;
233 continue;
234 }
235 LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
236 continue;
237
238 case kInstEmptyWidth:
239 if (ip->empty() & ~Prog::EmptyFlags(context_, p))
240 goto Next;
241
242 if (!ip->last())
243 Push(id+1, p, 0); // try the next when we're done
244 id = ip->out();
245 goto CheckAndLoop;
246
247 case kInstNop:
248 if (!ip->last())
249 Push(id+1, p, 0); // try the next when we're done
250 id = ip->out();
251 goto CheckAndLoop;
252
253 case kInstMatch: {
254 if (endmatch_ && p != text_.end())
255 goto Next;
256
257 // We found a match. If the caller doesn't care
258 // where the match is, no point going further.
259 if (nsubmatch_ == 0)
260 return true;
261
262 // Record best match so far.
263 // Only need to check end point, because this entire
264 // call is only considering one start position.
265 matched = true;
266 cap_[1] = p;
267 if (submatch_[0].data() == NULL ||
268 (longest_ && p > submatch_[0].end())) {
269 for (int i = 0; i < nsubmatch_; i++)
270 submatch_[i] =
271 StringPiece(cap_[2 * i],
272 static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
273 }
274
275 // If going for first match, we're done.
276 if (!longest_)
277 return true;
278
279 // If we used the entire text, no longer match is possible.
280 if (p == text_.end())
281 return true;
282
283 // Otherwise, continue on in hope of a longer match.
284 goto Next;
285 }
286 }
287 }
288 return matched;
289 }
290
291 // Search text (within context) for prog_.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)292 bool BitState::Search(const StringPiece& text, const StringPiece& context,
293 bool anchored, bool longest,
294 StringPiece* submatch, int nsubmatch) {
295 // Search parameters.
296 text_ = text;
297 context_ = context;
298 if (context_.begin() == NULL)
299 context_ = text;
300 if (prog_->anchor_start() && context_.begin() != text.begin())
301 return false;
302 if (prog_->anchor_end() && context_.end() != text.end())
303 return false;
304 anchored_ = anchored || prog_->anchor_start();
305 longest_ = longest || prog_->anchor_end();
306 endmatch_ = prog_->anchor_end();
307 submatch_ = submatch;
308 nsubmatch_ = nsubmatch;
309 for (int i = 0; i < nsubmatch_; i++)
310 submatch_[i] = StringPiece();
311
312 // Allocate scratch space.
313 int nvisited = prog_->size() * static_cast<int>(text.size()+1);
314 nvisited = (nvisited + VisitedBits-1) / VisitedBits;
315 visited_ = PODArray<uint32_t>(nvisited);
316 memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
317
318 int ncap = 2*nsubmatch;
319 if (ncap < 2)
320 ncap = 2;
321 cap_ = PODArray<const char*>(ncap);
322 memset(cap_.data(), 0, ncap*sizeof cap_[0]);
323
324 // When sizeof(Job) == 16, we start with a nice round 4KiB. :)
325 job_ = PODArray<Job>(256);
326
327 // Anchored search must start at text.begin().
328 if (anchored_) {
329 cap_[0] = text.begin();
330 return TrySearch(prog_->start(), text.begin());
331 }
332
333 // Unanchored search, starting from each possible text position.
334 // Notice that we have to try the empty string at the end of
335 // the text, so the loop condition is p <= text.end(), not p < text.end().
336 // This looks like it's quadratic in the size of the text,
337 // but we are not clearing visited_ between calls to TrySearch,
338 // so no work is duplicated and it ends up still being linear.
339 for (const char* p = text.begin(); p <= text.end(); p++) {
340 // Try to use memchr to find the first byte quickly.
341 int fb = prog_->first_byte();
342 if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
343 p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
344 if (p == NULL)
345 p = text.end();
346 }
347
348 cap_[0] = p;
349 if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
350 return true;
351 }
352 return false;
353 }
354
355 // Bit-state search.
SearchBitState(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)356 bool Prog::SearchBitState(const StringPiece& text,
357 const StringPiece& context,
358 Anchor anchor,
359 MatchKind kind,
360 StringPiece* match,
361 int nmatch) {
362 // If full match, we ask for an anchored longest match
363 // and then check that match[0] == text.
364 // So make sure match[0] exists.
365 StringPiece sp0;
366 if (kind == kFullMatch) {
367 anchor = kAnchored;
368 if (nmatch < 1) {
369 match = &sp0;
370 nmatch = 1;
371 }
372 }
373
374 // Run the search.
375 BitState b(this);
376 bool anchored = anchor == kAnchored;
377 bool longest = kind != kFirstMatch;
378 if (!b.Search(text, context, anchored, longest, match, nmatch))
379 return false;
380 if (kind == kFullMatch && match[0].end() != text.end())
381 return false;
382 return true;
383 }
384
385 } // namespace re2
386