xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/nfa.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc.
6 //
7 // Prog::SearchNFA, an NFA search.
8 // This is an actual NFA like the theorists talk about,
9 // not the pseudo-NFA found in backtracking regexp implementations.
10 //
11 // IMPLEMENTATION
12 //
13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14 // which is a variant of the one described in Thompson's 1968 CACM paper.
15 // See http://swtch.com/~rsc/regexp/ for various history.  The main feature
16 // over the DFA implementation is that it tracks submatch boundaries.
17 //
18 // When the choice of submatch boundaries is ambiguous, this particular
19 // implementation makes the same choices that traditional backtracking
20 // implementations (in particular, Perl and PCRE) do.
21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22 // time in the length of the input.
23 //
24 // Like Thompson's original machine and like the DFA implementation, this
25 // implementation notices a match only once it is one byte past it.
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include <algorithm>
30 #include <deque>
31 #include <string>
32 #include <utility>
33 #include <vector>
34 
35 #include "absl/strings/str_format.h"
36 #include "util/logging.h"
37 #include "re2/pod_array.h"
38 #include "re2/prog.h"
39 #include "re2/regexp.h"
40 #include "re2/sparse_array.h"
41 #include "re2/sparse_set.h"
42 
43 namespace re2 {
44 
45 static const bool ExtraDebug = false;
46 
47 class NFA {
48  public:
49   NFA(Prog* prog);
50   ~NFA();
51 
52   // Searches for a matching string.
53   //   * If anchored is true, only considers matches starting at offset.
54   //     Otherwise finds lefmost match at or after offset.
55   //   * If longest is true, returns the longest match starting
56   //     at the chosen start point.  Otherwise returns the so-called
57   //     left-biased match, the one traditional backtracking engines
58   //     (like Perl and PCRE) find.
59   // Records submatch boundaries in submatch[1..nsubmatch-1].
60   // Submatch[0] is the entire match.  When there is a choice in
61   // which text matches each subexpression, the submatch boundaries
62   // are chosen to match what a backtracking implementation would choose.
63   bool Search(absl::string_view text, absl::string_view context, bool anchored,
64               bool longest, absl::string_view* submatch, int nsubmatch);
65 
66  private:
67   struct Thread {
68     union {
69       int ref;
70       Thread* next;  // when on free list
71     };
72     const char** capture;
73   };
74 
75   // State for explicit stack in AddToThreadq.
76   struct AddState {
77     int id;     // Inst to process
78     Thread* t;  // if not null, set t0 = t before processing id
79   };
80 
81   // Threadq is a list of threads.  The list is sorted by the order
82   // in which Perl would explore that particular state -- the earlier
83   // choices appear earlier in the list.
84   typedef SparseArray<Thread*> Threadq;
85 
86   inline Thread* AllocThread();
87   inline Thread* Incref(Thread* t);
88   inline void Decref(Thread* t);
89 
90   // Follows all empty arrows from id0 and enqueues all the states reached.
91   // Enqueues only the ByteRange instructions that match byte c.
92   // context is used (with p) for evaluating empty-width specials.
93   // p is the current input position, and t0 is the current thread.
94   void AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
95                     const char* p, Thread* t0);
96 
97   // Run runq on byte c, appending new states to nextq.
98   // Updates matched_ and match_ as new, better matches are found.
99   // context is used (with p) for evaluating empty-width specials.
100   // p is the position of byte c in the input string for AddToThreadq;
101   // p-1 will be used when processing Match instructions.
102   // Frees all the threads on runq.
103   // If there is a shortcut to the end, returns that shortcut.
104   int Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
105            const char* p);
106 
107   // Returns text version of capture information, for debugging.
108   std::string FormatCapture(const char** capture);
109 
CopyCapture(const char ** dst,const char ** src)110   void CopyCapture(const char** dst, const char** src) {
111     memmove(dst, src, ncapture_*sizeof src[0]);
112   }
113 
114   Prog* prog_;                // underlying program
115   int start_;                 // start instruction in program
116   int ncapture_;              // number of submatches to track
117   bool longest_;              // whether searching for longest match
118   bool endmatch_;             // whether match must end at text.end()
119   const char* btext_;         // beginning of text (for FormatSubmatch)
120   const char* etext_;         // end of text (for endmatch_)
121   Threadq q0_, q1_;           // pre-allocated for Search.
122   PODArray<AddState> stack_;  // pre-allocated for AddToThreadq
123   std::deque<Thread> arena_;  // thread arena
124   Thread* freelist_;          // thread freelist
125   const char** match_;        // best match so far
126   bool matched_;              // any match so far?
127 
128   NFA(const NFA&) = delete;
129   NFA& operator=(const NFA&) = delete;
130 };
131 
NFA(Prog * prog)132 NFA::NFA(Prog* prog) {
133   prog_ = prog;
134   start_ = prog_->start();
135   ncapture_ = 0;
136   longest_ = false;
137   endmatch_ = false;
138   btext_ = NULL;
139   etext_ = NULL;
140   q0_.resize(prog_->size());
141   q1_.resize(prog_->size());
142   // See NFA::AddToThreadq() for why this is so.
143   int nstack = 2*prog_->inst_count(kInstCapture) +
144                prog_->inst_count(kInstEmptyWidth) +
145                prog_->inst_count(kInstNop) + 1;  // + 1 for start inst
146   stack_ = PODArray<AddState>(nstack);
147   freelist_ = NULL;
148   match_ = NULL;
149   matched_ = false;
150 }
151 
~NFA()152 NFA::~NFA() {
153   delete[] match_;
154   for (const Thread& t : arena_)
155     delete[] t.capture;
156 }
157 
AllocThread()158 NFA::Thread* NFA::AllocThread() {
159   Thread* t = freelist_;
160   if (t != NULL) {
161     freelist_ = t->next;
162     t->ref = 1;
163     // We don't need to touch t->capture because
164     // the caller will immediately overwrite it.
165     return t;
166   }
167   arena_.emplace_back();
168   t = &arena_.back();
169   t->ref = 1;
170   t->capture = new const char*[ncapture_];
171   return t;
172 }
173 
Incref(Thread * t)174 NFA::Thread* NFA::Incref(Thread* t) {
175   DCHECK(t != NULL);
176   t->ref++;
177   return t;
178 }
179 
Decref(Thread * t)180 void NFA::Decref(Thread* t) {
181   DCHECK(t != NULL);
182   t->ref--;
183   if (t->ref > 0)
184     return;
185   DCHECK_EQ(t->ref, 0);
186   t->next = freelist_;
187   freelist_ = t;
188 }
189 
190 // Follows all empty arrows from id0 and enqueues all the states reached.
191 // Enqueues only the ByteRange instructions that match byte c.
192 // context is used (with p) for evaluating empty-width specials.
193 // p is the current input position, and t0 is the current thread.
AddToThreadq(Threadq * q,int id0,int c,absl::string_view context,const char * p,Thread * t0)194 void NFA::AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
195                        const char* p, Thread* t0) {
196   if (id0 == 0)
197     return;
198 
199   // Use stack_ to hold our stack of instructions yet to process.
200   // It was preallocated as follows:
201   //   two entries per Capture;
202   //   one entry per EmptyWidth; and
203   //   one entry per Nop.
204   // This reflects the maximum number of stack pushes that each can
205   // perform. (Each instruction can be processed at most once.)
206   AddState* stk = stack_.data();
207   int nstk = 0;
208 
209   stk[nstk++] = {id0, NULL};
210   while (nstk > 0) {
211     DCHECK_LE(nstk, stack_.size());
212     AddState a = stk[--nstk];
213 
214   Loop:
215     if (a.t != NULL) {
216       // t0 was a thread that we allocated and copied in order to
217       // record the capture, so we must now decref it.
218       Decref(t0);
219       t0 = a.t;
220     }
221 
222     int id = a.id;
223     if (id == 0)
224       continue;
225     if (q->has_index(id)) {
226       if (ExtraDebug)
227         absl::FPrintF(stderr, "  [%d%s]\n", id, FormatCapture(t0->capture));
228       continue;
229     }
230 
231     // Create entry in q no matter what.  We might fill it in below,
232     // or we might not.  Even if not, it is necessary to have it,
233     // so that we don't revisit id0 during the recursion.
234     q->set_new(id, NULL);
235     Thread** tp = &q->get_existing(id);
236     int j;
237     Thread* t;
238     Prog::Inst* ip = prog_->inst(id);
239     switch (ip->opcode()) {
240     default:
241       LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
242       break;
243 
244     case kInstFail:
245       break;
246 
247     case kInstAltMatch:
248       // Save state; will pick up at next byte.
249       t = Incref(t0);
250       *tp = t;
251 
252       DCHECK(!ip->last());
253       a = {id+1, NULL};
254       goto Loop;
255 
256     case kInstNop:
257       if (!ip->last())
258         stk[nstk++] = {id+1, NULL};
259 
260       // Continue on.
261       a = {ip->out(), NULL};
262       goto Loop;
263 
264     case kInstCapture:
265       if (!ip->last())
266         stk[nstk++] = {id+1, NULL};
267 
268       if ((j=ip->cap()) < ncapture_) {
269         // Push a dummy whose only job is to restore t0
270         // once we finish exploring this possibility.
271         stk[nstk++] = {0, t0};
272 
273         // Record capture.
274         t = AllocThread();
275         CopyCapture(t->capture, t0->capture);
276         t->capture[j] = p;
277         t0 = t;
278       }
279       a = {ip->out(), NULL};
280       goto Loop;
281 
282     case kInstByteRange:
283       if (!ip->Matches(c))
284         goto Next;
285 
286       // Save state; will pick up at next byte.
287       t = Incref(t0);
288       *tp = t;
289       if (ExtraDebug)
290         absl::FPrintF(stderr, " + %d%s\n", id, FormatCapture(t0->capture));
291 
292       if (ip->hint() == 0)
293         break;
294       a = {id+ip->hint(), NULL};
295       goto Loop;
296 
297     case kInstMatch:
298       // Save state; will pick up at next byte.
299       t = Incref(t0);
300       *tp = t;
301       if (ExtraDebug)
302         absl::FPrintF(stderr, " ! %d%s\n", id, FormatCapture(t0->capture));
303 
304     Next:
305       if (ip->last())
306         break;
307       a = {id+1, NULL};
308       goto Loop;
309 
310     case kInstEmptyWidth:
311       if (!ip->last())
312         stk[nstk++] = {id+1, NULL};
313 
314       // Continue on if we have all the right flag bits.
315       if (ip->empty() & ~Prog::EmptyFlags(context, p))
316         break;
317       a = {ip->out(), NULL};
318       goto Loop;
319     }
320   }
321 }
322 
323 // Run runq on byte c, appending new states to nextq.
324 // Updates matched_ and match_ as new, better matches are found.
325 // context is used (with p) for evaluating empty-width specials.
326 // p is the position of byte c in the input string for AddToThreadq;
327 // p-1 will be used when processing Match instructions.
328 // Frees all the threads on runq.
329 // If there is a shortcut to the end, returns that shortcut.
Step(Threadq * runq,Threadq * nextq,int c,absl::string_view context,const char * p)330 int NFA::Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
331               const char* p) {
332   nextq->clear();
333 
334   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
335     Thread* t = i->value();
336     if (t == NULL)
337       continue;
338 
339     if (longest_) {
340       // Can skip any threads started after our current best match.
341       if (matched_ && match_[0] < t->capture[0]) {
342         Decref(t);
343         continue;
344       }
345     }
346 
347     int id = i->index();
348     Prog::Inst* ip = prog_->inst(id);
349 
350     switch (ip->opcode()) {
351       default:
352         // Should only see the values handled below.
353         LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
354         break;
355 
356       case kInstByteRange:
357         AddToThreadq(nextq, ip->out(), c, context, p, t);
358         break;
359 
360       case kInstAltMatch:
361         if (i != runq->begin())
362           break;
363         // The match is ours if we want it.
364         if (ip->greedy(prog_) || longest_) {
365           CopyCapture(match_, t->capture);
366           matched_ = true;
367 
368           Decref(t);
369           for (++i; i != runq->end(); ++i) {
370             if (i->value() != NULL)
371               Decref(i->value());
372           }
373           runq->clear();
374           if (ip->greedy(prog_))
375             return ip->out1();
376           return ip->out();
377         }
378         break;
379 
380       case kInstMatch: {
381         // Avoid invoking undefined behavior (arithmetic on a null pointer)
382         // by storing p instead of p-1. (What would the latter even mean?!)
383         // This complements the special case in NFA::Search().
384         if (p == NULL) {
385           CopyCapture(match_, t->capture);
386           match_[1] = p;
387           matched_ = true;
388           break;
389         }
390 
391         if (endmatch_ && p-1 != etext_)
392           break;
393 
394         if (longest_) {
395           // Leftmost-longest mode: save this match only if
396           // it is either farther to the left or at the same
397           // point but longer than an existing match.
398           if (!matched_ || t->capture[0] < match_[0] ||
399               (t->capture[0] == match_[0] && p-1 > match_[1])) {
400             CopyCapture(match_, t->capture);
401             match_[1] = p-1;
402             matched_ = true;
403           }
404         } else {
405           // Leftmost-biased mode: this match is by definition
406           // better than what we've already found (see next line).
407           CopyCapture(match_, t->capture);
408           match_[1] = p-1;
409           matched_ = true;
410 
411           // Cut off the threads that can only find matches
412           // worse than the one we just found: don't run the
413           // rest of the current Threadq.
414           Decref(t);
415           for (++i; i != runq->end(); ++i) {
416             if (i->value() != NULL)
417               Decref(i->value());
418           }
419           runq->clear();
420           return 0;
421         }
422         break;
423       }
424     }
425     Decref(t);
426   }
427   runq->clear();
428   return 0;
429 }
430 
FormatCapture(const char ** capture)431 std::string NFA::FormatCapture(const char** capture) {
432   std::string s;
433   for (int i = 0; i < ncapture_; i+=2) {
434     if (capture[i] == NULL)
435       s += "(?,?)";
436     else if (capture[i+1] == NULL)
437       s += absl::StrFormat("(%d,?)",
438                            capture[i] - btext_);
439     else
440       s += absl::StrFormat("(%d,%d)",
441                            capture[i] - btext_,
442                            capture[i+1] - btext_);
443   }
444   return s;
445 }
446 
Search(absl::string_view text,absl::string_view context,bool anchored,bool longest,absl::string_view * submatch,int nsubmatch)447 bool NFA::Search(absl::string_view text, absl::string_view context,
448                  bool anchored, bool longest, absl::string_view* submatch,
449                  int nsubmatch) {
450   if (start_ == 0)
451     return false;
452 
453   if (context.data() == NULL)
454     context = text;
455 
456   // Sanity check: make sure that text lies within context.
457   if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
458     LOG(DFATAL) << "context does not contain text";
459     return false;
460   }
461 
462   if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text))
463     return false;
464   if (prog_->anchor_end() && EndPtr(context) != EndPtr(text))
465     return false;
466   anchored |= prog_->anchor_start();
467   if (prog_->anchor_end()) {
468     longest = true;
469     endmatch_ = true;
470   }
471 
472   if (nsubmatch < 0) {
473     LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
474     return false;
475   }
476 
477   // Save search parameters.
478   ncapture_ = 2*nsubmatch;
479   longest_ = longest;
480 
481   if (nsubmatch == 0) {
482     // We need to maintain match[0], both to distinguish the
483     // longest match (if longest is true) and also to tell
484     // whether we've seen any matches at all.
485     ncapture_ = 2;
486   }
487 
488   match_ = new const char*[ncapture_];
489   memset(match_, 0, ncapture_*sizeof match_[0]);
490   matched_ = false;
491 
492   // For debugging prints.
493   btext_ = context.data();
494   // For convenience.
495   etext_ = text.data() + text.size();
496 
497   if (ExtraDebug)
498     absl::FPrintF(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
499                   text, context, anchored, longest);
500 
501   // Set up search.
502   Threadq* runq = &q0_;
503   Threadq* nextq = &q1_;
504   runq->clear();
505   nextq->clear();
506 
507   // Loop over the text, stepping the machine.
508   for (const char* p = text.data();; p++) {
509     if (ExtraDebug) {
510       int c = 0;
511       if (p == btext_)
512         c = '^';
513       else if (p > etext_)
514         c = '$';
515       else if (p < etext_)
516         c = p[0] & 0xFF;
517 
518       absl::FPrintF(stderr, "%c:", c);
519       for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
520         Thread* t = i->value();
521         if (t == NULL)
522           continue;
523         absl::FPrintF(stderr, " %d%s", i->index(), FormatCapture(t->capture));
524       }
525       absl::FPrintF(stderr, "\n");
526     }
527 
528     // This is a no-op the first time around the loop because runq is empty.
529     int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
530     DCHECK_EQ(runq->size(), 0);
531     using std::swap;
532     swap(nextq, runq);
533     nextq->clear();
534     if (id != 0) {
535       // We're done: full match ahead.
536       p = etext_;
537       for (;;) {
538         Prog::Inst* ip = prog_->inst(id);
539         switch (ip->opcode()) {
540           default:
541             LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
542             break;
543 
544           case kInstCapture:
545             if (ip->cap() < ncapture_)
546               match_[ip->cap()] = p;
547             id = ip->out();
548             continue;
549 
550           case kInstNop:
551             id = ip->out();
552             continue;
553 
554           case kInstMatch:
555             match_[1] = p;
556             matched_ = true;
557             break;
558         }
559         break;
560       }
561       break;
562     }
563 
564     if (p > etext_)
565       break;
566 
567     // Start a new thread if there have not been any matches.
568     // (No point in starting a new thread if there have been
569     // matches, since it would be to the right of the match
570     // we already found.)
571     if (!matched_ && (!anchored || p == text.data())) {
572       // Try to use prefix accel (e.g. memchr) to skip ahead.
573       // The search must be unanchored and there must be zero
574       // possible matches already.
575       if (!anchored && runq->size() == 0 &&
576           p < etext_ && prog_->can_prefix_accel()) {
577         p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
578         if (p == NULL)
579           p = etext_;
580       }
581 
582       Thread* t = AllocThread();
583       CopyCapture(t->capture, match_);
584       t->capture[0] = p;
585       AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
586                    t);
587       Decref(t);
588     }
589 
590     // If all the threads have died, stop early.
591     if (runq->size() == 0) {
592       if (ExtraDebug)
593         absl::FPrintF(stderr, "dead\n");
594       break;
595     }
596 
597     // Avoid invoking undefined behavior (arithmetic on a null pointer)
598     // by simply not continuing the loop.
599     // This complements the special case in NFA::Step().
600     if (p == NULL) {
601       (void) Step(runq, nextq, -1, context, p);
602       DCHECK_EQ(runq->size(), 0);
603       using std::swap;
604       swap(nextq, runq);
605       nextq->clear();
606       break;
607     }
608   }
609 
610   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
611     if (i->value() != NULL)
612       Decref(i->value());
613   }
614 
615   if (matched_) {
616     for (int i = 0; i < nsubmatch; i++)
617       submatch[i] = absl::string_view(
618           match_[2 * i],
619           static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
620     if (ExtraDebug)
621       absl::FPrintF(stderr, "match (%d,%d)\n",
622                     match_[0] - btext_,
623                     match_[1] - btext_);
624     return true;
625   }
626   return false;
627 }
628 
SearchNFA(absl::string_view text,absl::string_view context,Anchor anchor,MatchKind kind,absl::string_view * match,int nmatch)629 bool Prog::SearchNFA(absl::string_view text, absl::string_view context,
630                      Anchor anchor, MatchKind kind, absl::string_view* match,
631                      int nmatch) {
632   if (ExtraDebug)
633     Dump();
634 
635   NFA nfa(this);
636   absl::string_view sp;
637   if (kind == kFullMatch) {
638     anchor = kAnchored;
639     if (nmatch == 0) {
640       match = &sp;
641       nmatch = 1;
642     }
643   }
644   if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
645     return false;
646   if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
647     return false;
648   return true;
649 }
650 
651 // For each instruction i in the program reachable from the start, compute the
652 // number of instructions reachable from i by following only empty transitions
653 // and record that count as fanout[i].
654 //
655 // fanout holds the results and is also the work queue for the outer iteration.
656 // reachable holds the reached nodes for the inner iteration.
Fanout(SparseArray<int> * fanout)657 void Prog::Fanout(SparseArray<int>* fanout) {
658   DCHECK_EQ(fanout->max_size(), size());
659   SparseSet reachable(size());
660   fanout->clear();
661   fanout->set_new(start(), 0);
662   for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
663     int* count = &i->value();
664     reachable.clear();
665     reachable.insert(i->index());
666     for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
667       int id = *j;
668       Prog::Inst* ip = inst(id);
669       switch (ip->opcode()) {
670         default:
671           LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
672           break;
673 
674         case kInstByteRange:
675           if (!ip->last())
676             reachable.insert(id+1);
677 
678           (*count)++;
679           if (!fanout->has_index(ip->out())) {
680             fanout->set_new(ip->out(), 0);
681           }
682           break;
683 
684         case kInstAltMatch:
685           DCHECK(!ip->last());
686           reachable.insert(id+1);
687           break;
688 
689         case kInstCapture:
690         case kInstEmptyWidth:
691         case kInstNop:
692           if (!ip->last())
693             reachable.insert(id+1);
694 
695           reachable.insert(ip->out());
696           break;
697 
698         case kInstMatch:
699           if (!ip->last())
700             reachable.insert(id+1);
701           break;
702 
703         case kInstFail:
704           break;
705       }
706     }
707   }
708 }
709 
710 }  // namespace re2
711