xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/bitstate.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6 
7 // Prog::SearchBitState is a regular expression search with submatch
8 // tracking for small regular expressions and texts.  Similarly to
9 // testing/backtrack.cc, it allocates a bitmap with (count of
10 // lists) * (length of text) bits to make sure it never explores the
11 // same (instruction list, character position) multiple times.  This
12 // limits the search to run in time linear in the length of the text.
13 //
14 // Unlike testing/backtrack.cc, SearchBitState is not recursive
15 // on the text.
16 //
17 // SearchBitState is a fast replacement for the NFA code on small
18 // regexps and texts when SearchOnePass cannot be used.
19 
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <string.h>
23 #include <limits>
24 #include <utility>
25 
26 #include "util/logging.h"
27 #include "re2/pod_array.h"
28 #include "re2/prog.h"
29 #include "re2/regexp.h"
30 
31 namespace re2 {
32 
33 struct Job {
34   int id;
35   int rle;  // run length encoding
36   const char* p;
37 };
38 
39 class BitState {
40  public:
41   explicit BitState(Prog* prog);
42 
43   // The usual Search prototype.
44   // Can only call Search once per BitState.
45   bool Search(absl::string_view text, absl::string_view context, bool anchored,
46               bool longest, absl::string_view* submatch, int nsubmatch);
47 
48  private:
49   inline bool ShouldVisit(int id, const char* p);
50   void Push(int id, const char* p);
51   void GrowStack();
52   bool TrySearch(int id, const char* p);
53 
54   // Search parameters
55   Prog* prog_;                   // program being run
56   absl::string_view text_;       // text being searched
57   absl::string_view context_;    // greater context of text being searched
58   bool anchored_;                // whether search is anchored at text.begin()
59   bool longest_;                 // whether search wants leftmost-longest match
60   bool endmatch_;                // whether match must end at text.end()
61   absl::string_view* submatch_;  // submatches to fill in
62   int nsubmatch_;                //   # of submatches to fill in
63 
64   // Search state
65   static constexpr int kVisitedBits = 64;
66   PODArray<uint64_t> visited_;  // bitmap: (list ID, char*) pairs visited
67   PODArray<const char*> cap_;   // capture registers
68   PODArray<Job> job_;           // stack of text positions to explore
69   int njob_;                    // stack size
70 
71   BitState(const BitState&) = delete;
72   BitState& operator=(const BitState&) = delete;
73 };
74 
BitState(Prog * prog)75 BitState::BitState(Prog* prog)
76   : prog_(prog),
77     anchored_(false),
78     longest_(false),
79     endmatch_(false),
80     submatch_(NULL),
81     nsubmatch_(0),
82     njob_(0) {
83 }
84 
85 // Given id, which *must* be a list head, we can look up its list ID.
86 // Then the question is: Should the search visit the (list ID, p) pair?
87 // If so, remember that it was visited so that the next time,
88 // we don't repeat the visit.
ShouldVisit(int id,const char * p)89 bool BitState::ShouldVisit(int id, const char* p) {
90   int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
91           static_cast<int>(p-text_.data());
92   if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
93     return false;
94   visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
95   return true;
96 }
97 
98 // Grow the stack.
GrowStack()99 void BitState::GrowStack() {
100   PODArray<Job> tmp(2*job_.size());
101   memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
102   job_ = std::move(tmp);
103 }
104 
105 // Push (id, p) onto the stack, growing it if necessary.
Push(int id,const char * p)106 void BitState::Push(int id, const char* p) {
107   if (njob_ >= job_.size()) {
108     GrowStack();
109     if (njob_ >= job_.size()) {
110       LOG(DFATAL) << "GrowStack() failed: "
111                   << "njob_ = " << njob_ << ", "
112                   << "job_.size() = " << job_.size();
113       return;
114     }
115   }
116 
117   // If id < 0, it's undoing a Capture,
118   // so we mustn't interfere with that.
119   if (id >= 0 && njob_ > 0) {
120     Job* top = &job_[njob_-1];
121     if (id == top->id &&
122         p == top->p + top->rle + 1 &&
123         top->rle < std::numeric_limits<int>::max()) {
124       ++top->rle;
125       return;
126     }
127   }
128 
129   Job* top = &job_[njob_++];
130   top->id = id;
131   top->rle = 0;
132   top->p = p;
133 }
134 
135 // Try a search from instruction id0 in state p0.
136 // Return whether it succeeded.
TrySearch(int id0,const char * p0)137 bool BitState::TrySearch(int id0, const char* p0) {
138   bool matched = false;
139   const char* end = text_.data() + text_.size();
140   njob_ = 0;
141   // Push() no longer checks ShouldVisit(),
142   // so we must perform the check ourselves.
143   if (ShouldVisit(id0, p0))
144     Push(id0, p0);
145   while (njob_ > 0) {
146     // Pop job off stack.
147     --njob_;
148     int id = job_[njob_].id;
149     int& rle = job_[njob_].rle;
150     const char* p = job_[njob_].p;
151 
152     if (id < 0) {
153       // Undo the Capture.
154       cap_[prog_->inst(-id)->cap()] = p;
155       continue;
156     }
157 
158     if (rle > 0) {
159       p += rle;
160       // Revivify job on stack.
161       --rle;
162       ++njob_;
163     }
164 
165   Loop:
166     // Visit id, p.
167     Prog::Inst* ip = prog_->inst(id);
168     switch (ip->opcode()) {
169       default:
170         LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
171         return false;
172 
173       case kInstFail:
174         break;
175 
176       case kInstAltMatch:
177         if (ip->greedy(prog_)) {
178           // out1 is the Match instruction.
179           id = ip->out1();
180           p = end;
181           goto Loop;
182         }
183         if (longest_) {
184           // ip must be non-greedy...
185           // out is the Match instruction.
186           id = ip->out();
187           p = end;
188           goto Loop;
189         }
190         goto Next;
191 
192       case kInstByteRange: {
193         int c = -1;
194         if (p < end)
195           c = *p & 0xFF;
196         if (!ip->Matches(c))
197           goto Next;
198 
199         if (ip->hint() != 0)
200           Push(id+ip->hint(), p);  // try the next when we're done
201         id = ip->out();
202         p++;
203         goto CheckAndLoop;
204       }
205 
206       case kInstCapture:
207         if (!ip->last())
208           Push(id+1, p);  // try the next when we're done
209 
210         if (0 <= ip->cap() && ip->cap() < cap_.size()) {
211           // Capture p to register, but save old value first.
212           Push(-id, cap_[ip->cap()]);  // undo when we're done
213           cap_[ip->cap()] = p;
214         }
215 
216         id = ip->out();
217         goto CheckAndLoop;
218 
219       case kInstEmptyWidth:
220         if (ip->empty() & ~Prog::EmptyFlags(context_, p))
221           goto Next;
222 
223         if (!ip->last())
224           Push(id+1, p);  // try the next when we're done
225         id = ip->out();
226         goto CheckAndLoop;
227 
228       case kInstNop:
229         if (!ip->last())
230           Push(id+1, p);  // try the next when we're done
231         id = ip->out();
232 
233       CheckAndLoop:
234         // Sanity check: id is the head of its list, which must
235         // be the case if id-1 is the last of *its* list. :)
236         DCHECK(id == 0 || prog_->inst(id-1)->last());
237         if (ShouldVisit(id, p))
238           goto Loop;
239         break;
240 
241       case kInstMatch: {
242         if (endmatch_ && p != end)
243           goto Next;
244 
245         // We found a match.  If the caller doesn't care
246         // where the match is, no point going further.
247         if (nsubmatch_ == 0)
248           return true;
249 
250         // Record best match so far.
251         // Only need to check end point, because this entire
252         // call is only considering one start position.
253         matched = true;
254         cap_[1] = p;
255         if (submatch_[0].data() == NULL ||
256             (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
257           for (int i = 0; i < nsubmatch_; i++)
258             submatch_[i] = absl::string_view(
259                 cap_[2 * i],
260                 static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
261         }
262 
263         // If going for first match, we're done.
264         if (!longest_)
265           return true;
266 
267         // If we used the entire text, no longer match is possible.
268         if (p == end)
269           return true;
270 
271         // Otherwise, continue on in hope of a longer match.
272         // Note the absence of the ShouldVisit() check here
273         // due to execution remaining in the same list.
274       Next:
275         if (!ip->last()) {
276           id++;
277           goto Loop;
278         }
279         break;
280       }
281     }
282   }
283   return matched;
284 }
285 
286 // Search text (within context) for prog_.
Search(absl::string_view text,absl::string_view context,bool anchored,bool longest,absl::string_view * submatch,int nsubmatch)287 bool BitState::Search(absl::string_view text, absl::string_view context,
288                       bool anchored, bool longest, absl::string_view* submatch,
289                       int nsubmatch) {
290   // Search parameters.
291   text_ = text;
292   context_ = context;
293   if (context_.data() == NULL)
294     context_ = text;
295   if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
296     return false;
297   if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
298     return false;
299   anchored_ = anchored || prog_->anchor_start();
300   longest_ = longest || prog_->anchor_end();
301   endmatch_ = prog_->anchor_end();
302   submatch_ = submatch;
303   nsubmatch_ = nsubmatch;
304   for (int i = 0; i < nsubmatch_; i++)
305     submatch_[i] = absl::string_view();
306 
307   // Allocate scratch space.
308   int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
309   nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
310   visited_ = PODArray<uint64_t>(nvisited);
311   memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
312 
313   int ncap = 2*nsubmatch;
314   if (ncap < 2)
315     ncap = 2;
316   cap_ = PODArray<const char*>(ncap);
317   memset(cap_.data(), 0, ncap*sizeof cap_[0]);
318 
319   // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
320   job_ = PODArray<Job>(64);
321 
322   // Anchored search must start at text.begin().
323   if (anchored_) {
324     cap_[0] = text.data();
325     return TrySearch(prog_->start(), text.data());
326   }
327 
328   // Unanchored search, starting from each possible text position.
329   // Notice that we have to try the empty string at the end of
330   // the text, so the loop condition is p <= text.end(), not p < text.end().
331   // This looks like it's quadratic in the size of the text,
332   // but we are not clearing visited_ between calls to TrySearch,
333   // so no work is duplicated and it ends up still being linear.
334   const char* etext = text.data() + text.size();
335   for (const char* p = text.data(); p <= etext; p++) {
336     // Try to use prefix accel (e.g. memchr) to skip ahead.
337     if (p < etext && prog_->can_prefix_accel()) {
338       p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
339       if (p == NULL)
340         p = etext;
341     }
342 
343     cap_[0] = p;
344     if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
345       return true;
346     // Avoid invoking undefined behavior (arithmetic on a null pointer)
347     // by simply not continuing the loop.
348     if (p == NULL)
349       break;
350   }
351   return false;
352 }
353 
354 // Bit-state search.
SearchBitState(absl::string_view text,absl::string_view context,Anchor anchor,MatchKind kind,absl::string_view * match,int nmatch)355 bool Prog::SearchBitState(absl::string_view text, absl::string_view context,
356                           Anchor anchor, MatchKind kind,
357                           absl::string_view* match, int nmatch) {
358   // If full match, we ask for an anchored longest match
359   // and then check that match[0] == text.
360   // So make sure match[0] exists.
361   absl::string_view sp0;
362   if (kind == kFullMatch) {
363     anchor = kAnchored;
364     if (nmatch < 1) {
365       match = &sp0;
366       nmatch = 1;
367     }
368   }
369 
370   // Run the search.
371   BitState b(this);
372   bool anchored = anchor == kAnchored;
373   bool longest = kind != kFirstMatch;
374   if (!b.Search(text, context, anchored, longest, match, nmatch))
375     return false;
376   if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
377     return false;
378   return true;
379 }
380 
381 }  // namespace re2
382