xref: /aosp_15_r20/external/regex-re2/re2/bitstate.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6 
7 // Prog::SearchBitState is a regular expression search with submatch
8 // tracking for small regular expressions and texts.  Like
9 // testing/backtrack.cc, it allocates a bit vector with (length of
10 // text) * (length of prog) bits, to make sure it never explores the
11 // same (character position, instruction) state multiple times.  This
12 // limits the search to run in time linear in the length of the text.
13 //
14 // Unlike testing/backtrack.cc, SearchBitState is not recursive
15 // on the text.
16 //
17 // SearchBitState is a fast replacement for the NFA code on small
18 // regexps and texts when SearchOnePass cannot be used.
19 
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <string.h>
23 #include <utility>
24 
25 #include "util/logging.h"
26 #include "util/pod_array.h"
27 #include "re2/prog.h"
28 #include "re2/regexp.h"
29 
30 namespace re2 {
31 
32 struct Job {
33   int id;
34   int arg;
35   const char* p;
36 };
37 
38 class BitState {
39  public:
40   explicit BitState(Prog* prog);
41 
42   // The usual Search prototype.
43   // Can only call Search once per BitState.
44   bool Search(const StringPiece& text, const StringPiece& context,
45               bool anchored, bool longest,
46               StringPiece* submatch, int nsubmatch);
47 
48  private:
49   inline bool ShouldVisit(int id, const char* p);
50   void Push(int id, const char* p, int arg);
51   void GrowStack();
52   bool TrySearch(int id, const char* p);
53 
54   // Search parameters
55   Prog* prog_;              // program being run
56   StringPiece text_;        // text being searched
57   StringPiece context_;     // greater context of text being searched
58   bool anchored_;           // whether search is anchored at text.begin()
59   bool longest_;            // whether search wants leftmost-longest match
60   bool endmatch_;           // whether match must end at text.end()
61   StringPiece* submatch_;   // submatches to fill in
62   int nsubmatch_;           //   # of submatches to fill in
63 
64   // Search state
65   static const int VisitedBits = 32;
66   PODArray<uint32_t> visited_;  // bitmap: (Inst*, char*) pairs visited
67   PODArray<const char*> cap_;   // capture registers
68   PODArray<Job> job_;           // stack of text positions to explore
69   int njob_;                    // stack size
70 };
71 
BitState(Prog * prog)72 BitState::BitState(Prog* prog)
73   : prog_(prog),
74     anchored_(false),
75     longest_(false),
76     endmatch_(false),
77     submatch_(NULL),
78     nsubmatch_(0),
79     njob_(0) {
80 }
81 
82 // Should the search visit the pair ip, p?
83 // If so, remember that it was visited so that the next time,
84 // we don't repeat the visit.
ShouldVisit(int id,const char * p)85 bool BitState::ShouldVisit(int id, const char* p) {
86   int n = id * static_cast<int>(text_.size()+1) +
87           static_cast<int>(p-text_.begin());
88   if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
89     return false;
90   visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
91   return true;
92 }
93 
94 // Grow the stack.
GrowStack()95 void BitState::GrowStack() {
96   PODArray<Job> tmp(2*job_.size());
97   memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
98   job_ = std::move(tmp);
99 }
100 
101 // Push the triple (id, p, arg) onto the stack, growing it if necessary.
Push(int id,const char * p,int arg)102 void BitState::Push(int id, const char* p, int arg) {
103   if (njob_ >= job_.size()) {
104     GrowStack();
105     if (njob_ >= job_.size()) {
106       LOG(DFATAL) << "GrowStack() failed: "
107                   << "njob_ = " << njob_ << ", "
108                   << "job_.size() = " << job_.size();
109       return;
110     }
111   }
112   int op = prog_->inst(id)->opcode();
113   if (op == kInstFail)
114     return;
115 
116   // Only check ShouldVisit when arg == 0.
117   // When arg > 0, we are continuing a previous visit.
118   if (arg == 0 && !ShouldVisit(id, p))
119     return;
120 
121   Job* j = &job_[njob_++];
122   j->id = id;
123   j->p = p;
124   j->arg = arg;
125 }
126 
127 // Try a search from instruction id0 in state p0.
128 // Return whether it succeeded.
TrySearch(int id0,const char * p0)129 bool BitState::TrySearch(int id0, const char* p0) {
130   bool matched = false;
131   bool inaltmatch = false;
132   const char* end = text_.end();
133   njob_ = 0;
134   Push(id0, p0, 0);
135   while (njob_ > 0) {
136     // Pop job off stack.
137     --njob_;
138     int id = job_[njob_].id;
139     const char* p = job_[njob_].p;
140     int arg = job_[njob_].arg;
141 
142     // Optimization: rather than push and pop,
143     // code that is going to Push and continue
144     // the loop simply updates ip, p, and arg
145     // and jumps to CheckAndLoop.  We have to
146     // do the ShouldVisit check that Push
147     // would have, but we avoid the stack
148     // manipulation.
149     if (0) {
150     Next:
151       // If the Match of a non-greedy AltMatch failed,
152       // we stop ourselves from trying the ByteRange,
153       // which would steer us off the short circuit.
154       if (prog_->inst(id)->last() || inaltmatch)
155         continue;
156       id++;
157 
158     CheckAndLoop:
159       if (!ShouldVisit(id, p))
160         continue;
161     }
162 
163     // Visit ip, p.
164     Prog::Inst* ip = prog_->inst(id);
165     switch (ip->opcode()) {
166       default:
167         LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
168         return false;
169 
170       case kInstFail:
171         continue;
172 
173       case kInstAltMatch:
174         switch (arg) {
175           case 0:
176             inaltmatch = true;
177             Push(id, p, 1);  // come back when we're done
178 
179             // One opcode is ByteRange; the other leads to Match
180             // (possibly via Nop or Capture).
181             if (ip->greedy(prog_)) {
182               // out1 is the match
183               Push(ip->out1(), p, 0);
184               id = ip->out1();
185               p = end;
186               goto CheckAndLoop;
187             }
188             // out is the match - non-greedy
189             Push(ip->out(), end, 0);
190             id = ip->out();
191             goto CheckAndLoop;
192 
193           case 1:
194             inaltmatch = false;
195             continue;
196         }
197         LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg;
198         continue;
199 
200       case kInstByteRange: {
201         int c = -1;
202         if (p < end)
203           c = *p & 0xFF;
204         if (!ip->Matches(c))
205           goto Next;
206 
207         if (!ip->last())
208           Push(id+1, p, 0);  // try the next when we're done
209         id = ip->out();
210         p++;
211         goto CheckAndLoop;
212       }
213 
214       case kInstCapture:
215         switch (arg) {
216           case 0:
217             if (!ip->last())
218               Push(id+1, p, 0);  // try the next when we're done
219 
220             if (0 <= ip->cap() && ip->cap() < cap_.size()) {
221               // Capture p to register, but save old value.
222               Push(id, cap_[ip->cap()], 1);  // come back when we're done
223               cap_[ip->cap()] = p;
224             }
225 
226             // Continue on.
227             id = ip->out();
228             goto CheckAndLoop;
229 
230           case 1:
231             // Finished ip->out(); restore the old value.
232             cap_[ip->cap()] = p;
233             continue;
234         }
235         LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
236         continue;
237 
238       case kInstEmptyWidth:
239         if (ip->empty() & ~Prog::EmptyFlags(context_, p))
240           goto Next;
241 
242         if (!ip->last())
243           Push(id+1, p, 0);  // try the next when we're done
244         id = ip->out();
245         goto CheckAndLoop;
246 
247       case kInstNop:
248         if (!ip->last())
249           Push(id+1, p, 0);  // try the next when we're done
250         id = ip->out();
251         goto CheckAndLoop;
252 
253       case kInstMatch: {
254         if (endmatch_ && p != text_.end())
255           goto Next;
256 
257         // We found a match.  If the caller doesn't care
258         // where the match is, no point going further.
259         if (nsubmatch_ == 0)
260           return true;
261 
262         // Record best match so far.
263         // Only need to check end point, because this entire
264         // call is only considering one start position.
265         matched = true;
266         cap_[1] = p;
267         if (submatch_[0].data() == NULL ||
268             (longest_ && p > submatch_[0].end())) {
269           for (int i = 0; i < nsubmatch_; i++)
270             submatch_[i] =
271                 StringPiece(cap_[2 * i],
272                             static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
273         }
274 
275         // If going for first match, we're done.
276         if (!longest_)
277           return true;
278 
279         // If we used the entire text, no longer match is possible.
280         if (p == text_.end())
281           return true;
282 
283         // Otherwise, continue on in hope of a longer match.
284         goto Next;
285       }
286     }
287   }
288   return matched;
289 }
290 
291 // Search text (within context) for prog_.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)292 bool BitState::Search(const StringPiece& text, const StringPiece& context,
293                       bool anchored, bool longest,
294                       StringPiece* submatch, int nsubmatch) {
295   // Search parameters.
296   text_ = text;
297   context_ = context;
298   if (context_.begin() == NULL)
299     context_ = text;
300   if (prog_->anchor_start() && context_.begin() != text.begin())
301     return false;
302   if (prog_->anchor_end() && context_.end() != text.end())
303     return false;
304   anchored_ = anchored || prog_->anchor_start();
305   longest_ = longest || prog_->anchor_end();
306   endmatch_ = prog_->anchor_end();
307   submatch_ = submatch;
308   nsubmatch_ = nsubmatch;
309   for (int i = 0; i < nsubmatch_; i++)
310     submatch_[i] = StringPiece();
311 
312   // Allocate scratch space.
313   int nvisited = prog_->size() * static_cast<int>(text.size()+1);
314   nvisited = (nvisited + VisitedBits-1) / VisitedBits;
315   visited_ = PODArray<uint32_t>(nvisited);
316   memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
317 
318   int ncap = 2*nsubmatch;
319   if (ncap < 2)
320     ncap = 2;
321   cap_ = PODArray<const char*>(ncap);
322   memset(cap_.data(), 0, ncap*sizeof cap_[0]);
323 
324   // When sizeof(Job) == 16, we start with a nice round 4KiB. :)
325   job_ = PODArray<Job>(256);
326 
327   // Anchored search must start at text.begin().
328   if (anchored_) {
329     cap_[0] = text.begin();
330     return TrySearch(prog_->start(), text.begin());
331   }
332 
333   // Unanchored search, starting from each possible text position.
334   // Notice that we have to try the empty string at the end of
335   // the text, so the loop condition is p <= text.end(), not p < text.end().
336   // This looks like it's quadratic in the size of the text,
337   // but we are not clearing visited_ between calls to TrySearch,
338   // so no work is duplicated and it ends up still being linear.
339   for (const char* p = text.begin(); p <= text.end(); p++) {
340     // Try to use memchr to find the first byte quickly.
341     int fb = prog_->first_byte();
342     if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
343       p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
344       if (p == NULL)
345         p = text.end();
346     }
347 
348     cap_[0] = p;
349     if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
350       return true;
351   }
352   return false;
353 }
354 
355 // Bit-state search.
SearchBitState(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)356 bool Prog::SearchBitState(const StringPiece& text,
357                           const StringPiece& context,
358                           Anchor anchor,
359                           MatchKind kind,
360                           StringPiece* match,
361                           int nmatch) {
362   // If full match, we ask for an anchored longest match
363   // and then check that match[0] == text.
364   // So make sure match[0] exists.
365   StringPiece sp0;
366   if (kind == kFullMatch) {
367     anchor = kAnchored;
368     if (nmatch < 1) {
369       match = &sp0;
370       nmatch = 1;
371     }
372   }
373 
374   // Run the search.
375   BitState b(this);
376   bool anchored = anchor == kAnchored;
377   bool longest = kind != kFirstMatch;
378   if (!b.Search(text, context, anchored, longest, match, nmatch))
379     return false;
380   if (kind == kFullMatch && match[0].end() != text.end())
381     return false;
382   return true;
383 }
384 
385 }  // namespace re2
386