xref: /aosp_15_r20/external/regex-re2/re2/testing/backtrack.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6*ccdc9c3eSSadaf Ebrahimi //
7*ccdc9c3eSSadaf Ebrahimi // Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
8*ccdc9c3eSSadaf Ebrahimi // except that it remembers where it has been, trading a lot of
9*ccdc9c3eSSadaf Ebrahimi // memory for a lot of time. It exists only for testing purposes.
10*ccdc9c3eSSadaf Ebrahimi //
11*ccdc9c3eSSadaf Ebrahimi // Let me repeat that.
12*ccdc9c3eSSadaf Ebrahimi //
13*ccdc9c3eSSadaf Ebrahimi // THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
14*ccdc9c3eSSadaf Ebrahimi //   - It uses a ton of memory.
15*ccdc9c3eSSadaf Ebrahimi //   - It uses a ton of stack.
16*ccdc9c3eSSadaf Ebrahimi //   - It uses CHECK and LOG(FATAL).
17*ccdc9c3eSSadaf Ebrahimi //   - It implements unanchored search by repeated anchored search.
18*ccdc9c3eSSadaf Ebrahimi //
19*ccdc9c3eSSadaf Ebrahimi // On the other hand, it is very simple and a good reference
20*ccdc9c3eSSadaf Ebrahimi // implementation for the more complicated regexp packages.
21*ccdc9c3eSSadaf Ebrahimi //
22*ccdc9c3eSSadaf Ebrahimi // In BUILD, this file is linked into the ":testing" library,
23*ccdc9c3eSSadaf Ebrahimi // not the main library, in order to make it harder to pick up
24*ccdc9c3eSSadaf Ebrahimi // accidentally.
25*ccdc9c3eSSadaf Ebrahimi 
26*ccdc9c3eSSadaf Ebrahimi #include <stddef.h>
27*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
28*ccdc9c3eSSadaf Ebrahimi #include <string.h>
29*ccdc9c3eSSadaf Ebrahimi 
30*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
31*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
32*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
33*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
34*ccdc9c3eSSadaf Ebrahimi 
35*ccdc9c3eSSadaf Ebrahimi namespace re2 {
36*ccdc9c3eSSadaf Ebrahimi 
37*ccdc9c3eSSadaf Ebrahimi // Backtracker holds the state for a backtracking search.
38*ccdc9c3eSSadaf Ebrahimi //
39*ccdc9c3eSSadaf Ebrahimi // Excluding the search parameters, the main search state
40*ccdc9c3eSSadaf Ebrahimi // is just the "capture registers", which record, for the
41*ccdc9c3eSSadaf Ebrahimi // current execution, the string position at which each
42*ccdc9c3eSSadaf Ebrahimi // parenthesis was passed.  cap_[0] and cap_[1] are the
43*ccdc9c3eSSadaf Ebrahimi // left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
44*ccdc9c3eSSadaf Ebrahimi //
45*ccdc9c3eSSadaf Ebrahimi // To avoid infinite loops during backtracking on expressions
46*ccdc9c3eSSadaf Ebrahimi // like (a*)*, the visited_[] bitmap marks the (state, string-position)
47*ccdc9c3eSSadaf Ebrahimi // pairs that have already been explored and are thus not worth
48*ccdc9c3eSSadaf Ebrahimi // re-exploring if we get there via another path.  Modern backtracking
49*ccdc9c3eSSadaf Ebrahimi // libraries engineer their program representation differently, to make
50*ccdc9c3eSSadaf Ebrahimi // such infinite loops possible to avoid without keeping a giant visited_
51*ccdc9c3eSSadaf Ebrahimi // bitmap, but visited_ works fine for a reference implementation
52*ccdc9c3eSSadaf Ebrahimi // and it has the nice benefit of making the search run in linear time.
53*ccdc9c3eSSadaf Ebrahimi class Backtracker {
54*ccdc9c3eSSadaf Ebrahimi  public:
55*ccdc9c3eSSadaf Ebrahimi   explicit Backtracker(Prog* prog);
56*ccdc9c3eSSadaf Ebrahimi   ~Backtracker();
57*ccdc9c3eSSadaf Ebrahimi 
58*ccdc9c3eSSadaf Ebrahimi   bool Search(const StringPiece& text, const StringPiece& context,
59*ccdc9c3eSSadaf Ebrahimi               bool anchored, bool longest,
60*ccdc9c3eSSadaf Ebrahimi               StringPiece* submatch, int nsubmatch);
61*ccdc9c3eSSadaf Ebrahimi 
62*ccdc9c3eSSadaf Ebrahimi  private:
63*ccdc9c3eSSadaf Ebrahimi   // Explores from instruction id at string position p looking for a match.
64*ccdc9c3eSSadaf Ebrahimi   // Returns true if found (so that caller can stop trying other possibilities).
65*ccdc9c3eSSadaf Ebrahimi   bool Visit(int id, const char* p);
66*ccdc9c3eSSadaf Ebrahimi 
67*ccdc9c3eSSadaf Ebrahimi   // Tries instruction id at string position p.
68*ccdc9c3eSSadaf Ebrahimi   // Returns true if a match is found.
69*ccdc9c3eSSadaf Ebrahimi   bool Try(int id, const char* p);
70*ccdc9c3eSSadaf Ebrahimi 
71*ccdc9c3eSSadaf Ebrahimi   // Search parameters
72*ccdc9c3eSSadaf Ebrahimi   Prog* prog_;              // program being run
73*ccdc9c3eSSadaf Ebrahimi   StringPiece text_;        // text being searched
74*ccdc9c3eSSadaf Ebrahimi   StringPiece context_;     // greater context of text being searched
75*ccdc9c3eSSadaf Ebrahimi   bool anchored_;           // whether search is anchored at text.begin()
76*ccdc9c3eSSadaf Ebrahimi   bool longest_;            // whether search wants leftmost-longest match
77*ccdc9c3eSSadaf Ebrahimi   bool endmatch_;           // whether search must end at text.end()
78*ccdc9c3eSSadaf Ebrahimi   StringPiece *submatch_;   // submatches to fill in
79*ccdc9c3eSSadaf Ebrahimi   int nsubmatch_;           //   # of submatches to fill in
80*ccdc9c3eSSadaf Ebrahimi 
81*ccdc9c3eSSadaf Ebrahimi   // Search state
82*ccdc9c3eSSadaf Ebrahimi   const char* cap_[64];     // capture registers
83*ccdc9c3eSSadaf Ebrahimi   uint32_t *visited_;       // bitmap: (Inst*, char*) pairs already backtracked
84*ccdc9c3eSSadaf Ebrahimi   size_t nvisited_;         //   # of words in bitmap
85*ccdc9c3eSSadaf Ebrahimi };
86*ccdc9c3eSSadaf Ebrahimi 
Backtracker(Prog * prog)87*ccdc9c3eSSadaf Ebrahimi Backtracker::Backtracker(Prog* prog)
88*ccdc9c3eSSadaf Ebrahimi   : prog_(prog),
89*ccdc9c3eSSadaf Ebrahimi     anchored_(false),
90*ccdc9c3eSSadaf Ebrahimi     longest_(false),
91*ccdc9c3eSSadaf Ebrahimi     endmatch_(false),
92*ccdc9c3eSSadaf Ebrahimi     submatch_(NULL),
93*ccdc9c3eSSadaf Ebrahimi     nsubmatch_(0),
94*ccdc9c3eSSadaf Ebrahimi     visited_(NULL),
95*ccdc9c3eSSadaf Ebrahimi     nvisited_(0) {
96*ccdc9c3eSSadaf Ebrahimi }
97*ccdc9c3eSSadaf Ebrahimi 
~Backtracker()98*ccdc9c3eSSadaf Ebrahimi Backtracker::~Backtracker() {
99*ccdc9c3eSSadaf Ebrahimi   delete[] visited_;
100*ccdc9c3eSSadaf Ebrahimi }
101*ccdc9c3eSSadaf Ebrahimi 
102*ccdc9c3eSSadaf Ebrahimi // Runs a backtracking search.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)103*ccdc9c3eSSadaf Ebrahimi bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
104*ccdc9c3eSSadaf Ebrahimi                          bool anchored, bool longest,
105*ccdc9c3eSSadaf Ebrahimi                          StringPiece* submatch, int nsubmatch) {
106*ccdc9c3eSSadaf Ebrahimi   text_ = text;
107*ccdc9c3eSSadaf Ebrahimi   context_ = context;
108*ccdc9c3eSSadaf Ebrahimi   if (context_.begin() == NULL)
109*ccdc9c3eSSadaf Ebrahimi     context_ = text;
110*ccdc9c3eSSadaf Ebrahimi   if (prog_->anchor_start() && text.begin() > context_.begin())
111*ccdc9c3eSSadaf Ebrahimi     return false;
112*ccdc9c3eSSadaf Ebrahimi   if (prog_->anchor_end() && text.end() < context_.end())
113*ccdc9c3eSSadaf Ebrahimi     return false;
114*ccdc9c3eSSadaf Ebrahimi   anchored_ = anchored | prog_->anchor_start();
115*ccdc9c3eSSadaf Ebrahimi   longest_ = longest | prog_->anchor_end();
116*ccdc9c3eSSadaf Ebrahimi   endmatch_ = prog_->anchor_end();
117*ccdc9c3eSSadaf Ebrahimi   submatch_ = submatch;
118*ccdc9c3eSSadaf Ebrahimi   nsubmatch_ = nsubmatch;
119*ccdc9c3eSSadaf Ebrahimi   CHECK(2*nsubmatch_ < arraysize(cap_));
120*ccdc9c3eSSadaf Ebrahimi   memset(cap_, 0, sizeof cap_);
121*ccdc9c3eSSadaf Ebrahimi 
122*ccdc9c3eSSadaf Ebrahimi   // We use submatch_[0] for our own bookkeeping,
123*ccdc9c3eSSadaf Ebrahimi   // so it had better exist.
124*ccdc9c3eSSadaf Ebrahimi   StringPiece sp0;
125*ccdc9c3eSSadaf Ebrahimi   if (nsubmatch < 1) {
126*ccdc9c3eSSadaf Ebrahimi     submatch_ = &sp0;
127*ccdc9c3eSSadaf Ebrahimi     nsubmatch_ = 1;
128*ccdc9c3eSSadaf Ebrahimi   }
129*ccdc9c3eSSadaf Ebrahimi   submatch_[0] = StringPiece();
130*ccdc9c3eSSadaf Ebrahimi 
131*ccdc9c3eSSadaf Ebrahimi   // Allocate new visited_ bitmap -- size is proportional
132*ccdc9c3eSSadaf Ebrahimi   // to text, so have to reallocate on each call to Search.
133*ccdc9c3eSSadaf Ebrahimi   delete[] visited_;
134*ccdc9c3eSSadaf Ebrahimi   nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
135*ccdc9c3eSSadaf Ebrahimi   visited_ = new uint32_t[nvisited_];
136*ccdc9c3eSSadaf Ebrahimi   memset(visited_, 0, nvisited_*sizeof visited_[0]);
137*ccdc9c3eSSadaf Ebrahimi 
138*ccdc9c3eSSadaf Ebrahimi   // Anchored search must start at text.begin().
139*ccdc9c3eSSadaf Ebrahimi   if (anchored_) {
140*ccdc9c3eSSadaf Ebrahimi     cap_[0] = text.begin();
141*ccdc9c3eSSadaf Ebrahimi     return Visit(prog_->start(), text.begin());
142*ccdc9c3eSSadaf Ebrahimi   }
143*ccdc9c3eSSadaf Ebrahimi 
144*ccdc9c3eSSadaf Ebrahimi   // Unanchored search, starting from each possible text position.
145*ccdc9c3eSSadaf Ebrahimi   // Notice that we have to try the empty string at the end of
146*ccdc9c3eSSadaf Ebrahimi   // the text, so the loop condition is p <= text.end(), not p < text.end().
147*ccdc9c3eSSadaf Ebrahimi   for (const char* p = text.begin(); p <= text.end(); p++) {
148*ccdc9c3eSSadaf Ebrahimi     cap_[0] = p;
149*ccdc9c3eSSadaf Ebrahimi     if (Visit(prog_->start(), p))  // Match must be leftmost; done.
150*ccdc9c3eSSadaf Ebrahimi       return true;
151*ccdc9c3eSSadaf Ebrahimi   }
152*ccdc9c3eSSadaf Ebrahimi   return false;
153*ccdc9c3eSSadaf Ebrahimi }
154*ccdc9c3eSSadaf Ebrahimi 
155*ccdc9c3eSSadaf Ebrahimi // Explores from instruction id at string position p looking for a match.
156*ccdc9c3eSSadaf Ebrahimi // Return true if found (so that caller can stop trying other possibilities).
Visit(int id,const char * p)157*ccdc9c3eSSadaf Ebrahimi bool Backtracker::Visit(int id, const char* p) {
158*ccdc9c3eSSadaf Ebrahimi   // Check bitmap.  If we've already explored from here,
159*ccdc9c3eSSadaf Ebrahimi   // either it didn't match or it did but we're hoping for a better match.
160*ccdc9c3eSSadaf Ebrahimi   // Either way, don't go down that road again.
161*ccdc9c3eSSadaf Ebrahimi   CHECK(p <= text_.end());
162*ccdc9c3eSSadaf Ebrahimi   size_t n = id*(text_.size()+1) + (p - text_.begin());
163*ccdc9c3eSSadaf Ebrahimi   CHECK_LT(n/32, nvisited_);
164*ccdc9c3eSSadaf Ebrahimi   if (visited_[n/32] & (1 << (n&31)))
165*ccdc9c3eSSadaf Ebrahimi     return false;
166*ccdc9c3eSSadaf Ebrahimi   visited_[n/32] |= 1 << (n&31);
167*ccdc9c3eSSadaf Ebrahimi 
168*ccdc9c3eSSadaf Ebrahimi   Prog::Inst* ip = prog_->inst(id);
169*ccdc9c3eSSadaf Ebrahimi   if (Try(id, p)) {
170*ccdc9c3eSSadaf Ebrahimi     if (longest_ && !ip->last())
171*ccdc9c3eSSadaf Ebrahimi       Visit(id+1, p);
172*ccdc9c3eSSadaf Ebrahimi     return true;
173*ccdc9c3eSSadaf Ebrahimi   }
174*ccdc9c3eSSadaf Ebrahimi   if (!ip->last())
175*ccdc9c3eSSadaf Ebrahimi     return Visit(id+1, p);
176*ccdc9c3eSSadaf Ebrahimi   return false;
177*ccdc9c3eSSadaf Ebrahimi }
178*ccdc9c3eSSadaf Ebrahimi 
179*ccdc9c3eSSadaf Ebrahimi // Tries instruction id at string position p.
180*ccdc9c3eSSadaf Ebrahimi // Returns true if a match is found.
Try(int id,const char * p)181*ccdc9c3eSSadaf Ebrahimi bool Backtracker::Try(int id, const char* p) {
182*ccdc9c3eSSadaf Ebrahimi   // Pick out byte at current position.  If at end of string,
183*ccdc9c3eSSadaf Ebrahimi   // have to explore in hope of finishing a match.  Use impossible byte -1.
184*ccdc9c3eSSadaf Ebrahimi   int c = -1;
185*ccdc9c3eSSadaf Ebrahimi   if (p < text_.end())
186*ccdc9c3eSSadaf Ebrahimi     c = *p & 0xFF;
187*ccdc9c3eSSadaf Ebrahimi 
188*ccdc9c3eSSadaf Ebrahimi   Prog::Inst* ip = prog_->inst(id);
189*ccdc9c3eSSadaf Ebrahimi   switch (ip->opcode()) {
190*ccdc9c3eSSadaf Ebrahimi     default:
191*ccdc9c3eSSadaf Ebrahimi       LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
192*ccdc9c3eSSadaf Ebrahimi       return false;  // not reached
193*ccdc9c3eSSadaf Ebrahimi 
194*ccdc9c3eSSadaf Ebrahimi     case kInstAltMatch:
195*ccdc9c3eSSadaf Ebrahimi       // Ignored.
196*ccdc9c3eSSadaf Ebrahimi       return false;
197*ccdc9c3eSSadaf Ebrahimi 
198*ccdc9c3eSSadaf Ebrahimi     case kInstByteRange:
199*ccdc9c3eSSadaf Ebrahimi       if (ip->Matches(c))
200*ccdc9c3eSSadaf Ebrahimi         return Visit(ip->out(), p+1);
201*ccdc9c3eSSadaf Ebrahimi       return false;
202*ccdc9c3eSSadaf Ebrahimi 
203*ccdc9c3eSSadaf Ebrahimi     case kInstCapture:
204*ccdc9c3eSSadaf Ebrahimi       if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
205*ccdc9c3eSSadaf Ebrahimi         // Capture p to register, but save old value.
206*ccdc9c3eSSadaf Ebrahimi         const char* q = cap_[ip->cap()];
207*ccdc9c3eSSadaf Ebrahimi         cap_[ip->cap()] = p;
208*ccdc9c3eSSadaf Ebrahimi         bool ret = Visit(ip->out(), p);
209*ccdc9c3eSSadaf Ebrahimi         // Restore old value as we backtrack.
210*ccdc9c3eSSadaf Ebrahimi         cap_[ip->cap()] = q;
211*ccdc9c3eSSadaf Ebrahimi         return ret;
212*ccdc9c3eSSadaf Ebrahimi       }
213*ccdc9c3eSSadaf Ebrahimi       return Visit(ip->out(), p);
214*ccdc9c3eSSadaf Ebrahimi 
215*ccdc9c3eSSadaf Ebrahimi     case kInstEmptyWidth:
216*ccdc9c3eSSadaf Ebrahimi       if (ip->empty() & ~Prog::EmptyFlags(context_, p))
217*ccdc9c3eSSadaf Ebrahimi         return false;
218*ccdc9c3eSSadaf Ebrahimi       return Visit(ip->out(), p);
219*ccdc9c3eSSadaf Ebrahimi 
220*ccdc9c3eSSadaf Ebrahimi     case kInstNop:
221*ccdc9c3eSSadaf Ebrahimi       return Visit(ip->out(), p);
222*ccdc9c3eSSadaf Ebrahimi 
223*ccdc9c3eSSadaf Ebrahimi     case kInstMatch:
224*ccdc9c3eSSadaf Ebrahimi       // We found a match.  If it's the best so far, record the
225*ccdc9c3eSSadaf Ebrahimi       // parameters in the caller's submatch_ array.
226*ccdc9c3eSSadaf Ebrahimi       if (endmatch_ && p != context_.end())
227*ccdc9c3eSSadaf Ebrahimi         return false;
228*ccdc9c3eSSadaf Ebrahimi       cap_[1] = p;
229*ccdc9c3eSSadaf Ebrahimi       if (submatch_[0].data() == NULL ||           // First match so far ...
230*ccdc9c3eSSadaf Ebrahimi           (longest_ && p > submatch_[0].end())) {  // ... or better match
231*ccdc9c3eSSadaf Ebrahimi         for (int i = 0; i < nsubmatch_; i++)
232*ccdc9c3eSSadaf Ebrahimi           submatch_[i] = StringPiece(
233*ccdc9c3eSSadaf Ebrahimi               cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
234*ccdc9c3eSSadaf Ebrahimi       }
235*ccdc9c3eSSadaf Ebrahimi       return true;
236*ccdc9c3eSSadaf Ebrahimi 
237*ccdc9c3eSSadaf Ebrahimi     case kInstFail:
238*ccdc9c3eSSadaf Ebrahimi       return false;
239*ccdc9c3eSSadaf Ebrahimi   }
240*ccdc9c3eSSadaf Ebrahimi }
241*ccdc9c3eSSadaf Ebrahimi 
242*ccdc9c3eSSadaf Ebrahimi // Runs a backtracking search.
UnsafeSearchBacktrack(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)243*ccdc9c3eSSadaf Ebrahimi bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
244*ccdc9c3eSSadaf Ebrahimi                                  const StringPiece& context,
245*ccdc9c3eSSadaf Ebrahimi                                  Anchor anchor,
246*ccdc9c3eSSadaf Ebrahimi                                  MatchKind kind,
247*ccdc9c3eSSadaf Ebrahimi                                  StringPiece* match,
248*ccdc9c3eSSadaf Ebrahimi                                  int nmatch) {
249*ccdc9c3eSSadaf Ebrahimi   // If full match, we ask for an anchored longest match
250*ccdc9c3eSSadaf Ebrahimi   // and then check that match[0] == text.
251*ccdc9c3eSSadaf Ebrahimi   // So make sure match[0] exists.
252*ccdc9c3eSSadaf Ebrahimi   StringPiece sp0;
253*ccdc9c3eSSadaf Ebrahimi   if (kind == kFullMatch) {
254*ccdc9c3eSSadaf Ebrahimi     anchor = kAnchored;
255*ccdc9c3eSSadaf Ebrahimi     if (nmatch < 1) {
256*ccdc9c3eSSadaf Ebrahimi       match = &sp0;
257*ccdc9c3eSSadaf Ebrahimi       nmatch = 1;
258*ccdc9c3eSSadaf Ebrahimi     }
259*ccdc9c3eSSadaf Ebrahimi   }
260*ccdc9c3eSSadaf Ebrahimi 
261*ccdc9c3eSSadaf Ebrahimi   // Run the search.
262*ccdc9c3eSSadaf Ebrahimi   Backtracker b(this);
263*ccdc9c3eSSadaf Ebrahimi   bool anchored = anchor == kAnchored;
264*ccdc9c3eSSadaf Ebrahimi   bool longest = kind != kFirstMatch;
265*ccdc9c3eSSadaf Ebrahimi   if (!b.Search(text, context, anchored, longest, match, nmatch))
266*ccdc9c3eSSadaf Ebrahimi     return false;
267*ccdc9c3eSSadaf Ebrahimi   if (kind == kFullMatch && match[0].end() != text.end())
268*ccdc9c3eSSadaf Ebrahimi     return false;
269*ccdc9c3eSSadaf Ebrahimi   return true;
270*ccdc9c3eSSadaf Ebrahimi }
271*ccdc9c3eSSadaf Ebrahimi 
272*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
273