xref: /aosp_15_r20/external/regex-re2/re2/walker-inl.h (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi #ifndef RE2_WALKER_INL_H_
6*ccdc9c3eSSadaf Ebrahimi #define RE2_WALKER_INL_H_
7*ccdc9c3eSSadaf Ebrahimi 
8*ccdc9c3eSSadaf Ebrahimi // Helper class for traversing Regexps without recursion.
9*ccdc9c3eSSadaf Ebrahimi // Clients should declare their own subclasses that override
10*ccdc9c3eSSadaf Ebrahimi // the PreVisit and PostVisit methods, which are called before
11*ccdc9c3eSSadaf Ebrahimi // and after visiting the subexpressions.
12*ccdc9c3eSSadaf Ebrahimi 
13*ccdc9c3eSSadaf Ebrahimi // Not quite the Visitor pattern, because (among other things)
14*ccdc9c3eSSadaf Ebrahimi // the Visitor pattern is recursive.
15*ccdc9c3eSSadaf Ebrahimi 
16*ccdc9c3eSSadaf Ebrahimi #include <stack>
17*ccdc9c3eSSadaf Ebrahimi 
18*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
19*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
20*ccdc9c3eSSadaf Ebrahimi 
21*ccdc9c3eSSadaf Ebrahimi namespace re2 {
22*ccdc9c3eSSadaf Ebrahimi 
23*ccdc9c3eSSadaf Ebrahimi template<typename T> struct WalkState;
24*ccdc9c3eSSadaf Ebrahimi 
25*ccdc9c3eSSadaf Ebrahimi template<typename T> class Regexp::Walker {
26*ccdc9c3eSSadaf Ebrahimi  public:
27*ccdc9c3eSSadaf Ebrahimi   Walker();
28*ccdc9c3eSSadaf Ebrahimi   virtual ~Walker();
29*ccdc9c3eSSadaf Ebrahimi 
30*ccdc9c3eSSadaf Ebrahimi   // Virtual method called before visiting re's children.
31*ccdc9c3eSSadaf Ebrahimi   // PreVisit passes ownership of its return value to its caller.
32*ccdc9c3eSSadaf Ebrahimi   // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
33*ccdc9c3eSSadaf Ebrahimi   // and passed to the child PreVisits and PostVisits as parent_arg.
34*ccdc9c3eSSadaf Ebrahimi   // At the top-most Regexp, parent_arg is arg passed to walk.
35*ccdc9c3eSSadaf Ebrahimi   // If PreVisit sets *stop to true, the walk does not recurse
36*ccdc9c3eSSadaf Ebrahimi   // into the children.  Instead it behaves as though the return
37*ccdc9c3eSSadaf Ebrahimi   // value from PreVisit is the return value from PostVisit.
38*ccdc9c3eSSadaf Ebrahimi   // The default PreVisit returns parent_arg.
39*ccdc9c3eSSadaf Ebrahimi   virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
40*ccdc9c3eSSadaf Ebrahimi 
41*ccdc9c3eSSadaf Ebrahimi   // Virtual method called after visiting re's children.
42*ccdc9c3eSSadaf Ebrahimi   // The pre_arg is the T that PreVisit returned.
43*ccdc9c3eSSadaf Ebrahimi   // The child_args is a vector of the T that the child PostVisits returned.
44*ccdc9c3eSSadaf Ebrahimi   // PostVisit takes ownership of pre_arg.
45*ccdc9c3eSSadaf Ebrahimi   // PostVisit takes ownership of the Ts
46*ccdc9c3eSSadaf Ebrahimi   // in *child_args, but not the vector itself.
47*ccdc9c3eSSadaf Ebrahimi   // PostVisit passes ownership of its return value
48*ccdc9c3eSSadaf Ebrahimi   // to its caller.
49*ccdc9c3eSSadaf Ebrahimi   // The default PostVisit simply returns pre_arg.
50*ccdc9c3eSSadaf Ebrahimi   virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
51*ccdc9c3eSSadaf Ebrahimi                       T* child_args, int nchild_args);
52*ccdc9c3eSSadaf Ebrahimi 
53*ccdc9c3eSSadaf Ebrahimi   // Virtual method called to copy a T,
54*ccdc9c3eSSadaf Ebrahimi   // when Walk notices that more than one child is the same re.
55*ccdc9c3eSSadaf Ebrahimi   virtual T Copy(T arg);
56*ccdc9c3eSSadaf Ebrahimi 
57*ccdc9c3eSSadaf Ebrahimi   // Virtual method called to do a "quick visit" of the re,
58*ccdc9c3eSSadaf Ebrahimi   // but not its children.  Only called once the visit budget
59*ccdc9c3eSSadaf Ebrahimi   // has been used up and we're trying to abort the walk
60*ccdc9c3eSSadaf Ebrahimi   // as quickly as possible.  Should return a value that
61*ccdc9c3eSSadaf Ebrahimi   // makes sense for the parent PostVisits still to be run.
62*ccdc9c3eSSadaf Ebrahimi   // This function is (hopefully) only called by
63*ccdc9c3eSSadaf Ebrahimi   // WalkExponential, but must be implemented by all clients,
64*ccdc9c3eSSadaf Ebrahimi   // just in case.
65*ccdc9c3eSSadaf Ebrahimi   virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
66*ccdc9c3eSSadaf Ebrahimi 
67*ccdc9c3eSSadaf Ebrahimi   // Walks over a regular expression.
68*ccdc9c3eSSadaf Ebrahimi   // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
69*ccdc9c3eSSadaf Ebrahimi   // Returns the T returned by PostVisit on re.
70*ccdc9c3eSSadaf Ebrahimi   T Walk(Regexp* re, T top_arg);
71*ccdc9c3eSSadaf Ebrahimi 
72*ccdc9c3eSSadaf Ebrahimi   // Like Walk, but doesn't use Copy.  This can lead to
73*ccdc9c3eSSadaf Ebrahimi   // exponential runtimes on cross-linked Regexps like the
74*ccdc9c3eSSadaf Ebrahimi   // ones generated by Simplify.  To help limit this,
75*ccdc9c3eSSadaf Ebrahimi   // at most max_visits nodes will be visited and then
76*ccdc9c3eSSadaf Ebrahimi   // the walk will be cut off early.
77*ccdc9c3eSSadaf Ebrahimi   // If the walk *is* cut off early, ShortVisit(re)
78*ccdc9c3eSSadaf Ebrahimi   // will be called on regexps that cannot be fully
79*ccdc9c3eSSadaf Ebrahimi   // visited rather than calling PreVisit/PostVisit.
80*ccdc9c3eSSadaf Ebrahimi   T WalkExponential(Regexp* re, T top_arg, int max_visits);
81*ccdc9c3eSSadaf Ebrahimi 
82*ccdc9c3eSSadaf Ebrahimi   // Clears the stack.  Should never be necessary, since
83*ccdc9c3eSSadaf Ebrahimi   // Walk always enters and exits with an empty stack.
84*ccdc9c3eSSadaf Ebrahimi   // Logs DFATAL if stack is not already clear.
85*ccdc9c3eSSadaf Ebrahimi   void Reset();
86*ccdc9c3eSSadaf Ebrahimi 
87*ccdc9c3eSSadaf Ebrahimi   // Returns whether walk was cut off.
stopped_early()88*ccdc9c3eSSadaf Ebrahimi   bool stopped_early() { return stopped_early_; }
89*ccdc9c3eSSadaf Ebrahimi 
90*ccdc9c3eSSadaf Ebrahimi  private:
91*ccdc9c3eSSadaf Ebrahimi   // Walk state for the entire traversal.
92*ccdc9c3eSSadaf Ebrahimi   std::stack<WalkState<T> >* stack_;
93*ccdc9c3eSSadaf Ebrahimi   bool stopped_early_;
94*ccdc9c3eSSadaf Ebrahimi   int max_visits_;
95*ccdc9c3eSSadaf Ebrahimi 
96*ccdc9c3eSSadaf Ebrahimi   T WalkInternal(Regexp* re, T top_arg, bool use_copy);
97*ccdc9c3eSSadaf Ebrahimi 
98*ccdc9c3eSSadaf Ebrahimi   Walker(const Walker&) = delete;
99*ccdc9c3eSSadaf Ebrahimi   Walker& operator=(const Walker&) = delete;
100*ccdc9c3eSSadaf Ebrahimi };
101*ccdc9c3eSSadaf Ebrahimi 
PreVisit(Regexp * re,T parent_arg,bool * stop)102*ccdc9c3eSSadaf Ebrahimi template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
103*ccdc9c3eSSadaf Ebrahimi                                                    T parent_arg,
104*ccdc9c3eSSadaf Ebrahimi                                                    bool* stop) {
105*ccdc9c3eSSadaf Ebrahimi   return parent_arg;
106*ccdc9c3eSSadaf Ebrahimi }
107*ccdc9c3eSSadaf Ebrahimi 
PostVisit(Regexp * re,T parent_arg,T pre_arg,T * child_args,int nchild_args)108*ccdc9c3eSSadaf Ebrahimi template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
109*ccdc9c3eSSadaf Ebrahimi                                                     T parent_arg,
110*ccdc9c3eSSadaf Ebrahimi                                                     T pre_arg,
111*ccdc9c3eSSadaf Ebrahimi                                                     T* child_args,
112*ccdc9c3eSSadaf Ebrahimi                                                     int nchild_args) {
113*ccdc9c3eSSadaf Ebrahimi   return pre_arg;
114*ccdc9c3eSSadaf Ebrahimi }
115*ccdc9c3eSSadaf Ebrahimi 
Copy(T arg)116*ccdc9c3eSSadaf Ebrahimi template<typename T> T Regexp::Walker<T>::Copy(T arg) {
117*ccdc9c3eSSadaf Ebrahimi   return arg;
118*ccdc9c3eSSadaf Ebrahimi }
119*ccdc9c3eSSadaf Ebrahimi 
120*ccdc9c3eSSadaf Ebrahimi // State about a single level in the traversal.
121*ccdc9c3eSSadaf Ebrahimi template<typename T> struct WalkState {
WalkStateWalkState122*ccdc9c3eSSadaf Ebrahimi   WalkState<T>(Regexp* re, T parent)
123*ccdc9c3eSSadaf Ebrahimi     : re(re),
124*ccdc9c3eSSadaf Ebrahimi       n(-1),
125*ccdc9c3eSSadaf Ebrahimi       parent_arg(parent),
126*ccdc9c3eSSadaf Ebrahimi       child_args(NULL) { }
127*ccdc9c3eSSadaf Ebrahimi 
128*ccdc9c3eSSadaf Ebrahimi   Regexp* re;  // The regexp
129*ccdc9c3eSSadaf Ebrahimi   int n;  // The index of the next child to process; -1 means need to PreVisit
130*ccdc9c3eSSadaf Ebrahimi   T parent_arg;  // Accumulated arguments.
131*ccdc9c3eSSadaf Ebrahimi   T pre_arg;
132*ccdc9c3eSSadaf Ebrahimi   T child_arg;  // One-element buffer for child_args.
133*ccdc9c3eSSadaf Ebrahimi   T* child_args;
134*ccdc9c3eSSadaf Ebrahimi };
135*ccdc9c3eSSadaf Ebrahimi 
Walker()136*ccdc9c3eSSadaf Ebrahimi template<typename T> Regexp::Walker<T>::Walker() {
137*ccdc9c3eSSadaf Ebrahimi   stack_ = new std::stack<WalkState<T> >;
138*ccdc9c3eSSadaf Ebrahimi   stopped_early_ = false;
139*ccdc9c3eSSadaf Ebrahimi }
140*ccdc9c3eSSadaf Ebrahimi 
~Walker()141*ccdc9c3eSSadaf Ebrahimi template<typename T> Regexp::Walker<T>::~Walker() {
142*ccdc9c3eSSadaf Ebrahimi   Reset();
143*ccdc9c3eSSadaf Ebrahimi   delete stack_;
144*ccdc9c3eSSadaf Ebrahimi }
145*ccdc9c3eSSadaf Ebrahimi 
146*ccdc9c3eSSadaf Ebrahimi // Clears the stack.  Should never be necessary, since
147*ccdc9c3eSSadaf Ebrahimi // Walk always enters and exits with an empty stack.
148*ccdc9c3eSSadaf Ebrahimi // Logs DFATAL if stack is not already clear.
Reset()149*ccdc9c3eSSadaf Ebrahimi template<typename T> void Regexp::Walker<T>::Reset() {
150*ccdc9c3eSSadaf Ebrahimi   if (stack_ && stack_->size() > 0) {
151*ccdc9c3eSSadaf Ebrahimi     LOG(DFATAL) << "Stack not empty.";
152*ccdc9c3eSSadaf Ebrahimi     while (stack_->size() > 0) {
153*ccdc9c3eSSadaf Ebrahimi       delete stack_->top().child_args;
154*ccdc9c3eSSadaf Ebrahimi       stack_->pop();
155*ccdc9c3eSSadaf Ebrahimi     }
156*ccdc9c3eSSadaf Ebrahimi   }
157*ccdc9c3eSSadaf Ebrahimi }
158*ccdc9c3eSSadaf Ebrahimi 
WalkInternal(Regexp * re,T top_arg,bool use_copy)159*ccdc9c3eSSadaf Ebrahimi template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
160*ccdc9c3eSSadaf Ebrahimi                                                        bool use_copy) {
161*ccdc9c3eSSadaf Ebrahimi   Reset();
162*ccdc9c3eSSadaf Ebrahimi 
163*ccdc9c3eSSadaf Ebrahimi   if (re == NULL) {
164*ccdc9c3eSSadaf Ebrahimi     LOG(DFATAL) << "Walk NULL";
165*ccdc9c3eSSadaf Ebrahimi     return top_arg;
166*ccdc9c3eSSadaf Ebrahimi   }
167*ccdc9c3eSSadaf Ebrahimi 
168*ccdc9c3eSSadaf Ebrahimi   stack_->push(WalkState<T>(re, top_arg));
169*ccdc9c3eSSadaf Ebrahimi 
170*ccdc9c3eSSadaf Ebrahimi   WalkState<T>* s;
171*ccdc9c3eSSadaf Ebrahimi   for (;;) {
172*ccdc9c3eSSadaf Ebrahimi     T t;
173*ccdc9c3eSSadaf Ebrahimi     s = &stack_->top();
174*ccdc9c3eSSadaf Ebrahimi     Regexp* re = s->re;
175*ccdc9c3eSSadaf Ebrahimi     switch (s->n) {
176*ccdc9c3eSSadaf Ebrahimi       case -1: {
177*ccdc9c3eSSadaf Ebrahimi         if (--max_visits_ < 0) {
178*ccdc9c3eSSadaf Ebrahimi           stopped_early_ = true;
179*ccdc9c3eSSadaf Ebrahimi           t = ShortVisit(re, s->parent_arg);
180*ccdc9c3eSSadaf Ebrahimi           break;
181*ccdc9c3eSSadaf Ebrahimi         }
182*ccdc9c3eSSadaf Ebrahimi         bool stop = false;
183*ccdc9c3eSSadaf Ebrahimi         s->pre_arg = PreVisit(re, s->parent_arg, &stop);
184*ccdc9c3eSSadaf Ebrahimi         if (stop) {
185*ccdc9c3eSSadaf Ebrahimi           t = s->pre_arg;
186*ccdc9c3eSSadaf Ebrahimi           break;
187*ccdc9c3eSSadaf Ebrahimi         }
188*ccdc9c3eSSadaf Ebrahimi         s->n = 0;
189*ccdc9c3eSSadaf Ebrahimi         s->child_args = NULL;
190*ccdc9c3eSSadaf Ebrahimi         if (re->nsub_ == 1)
191*ccdc9c3eSSadaf Ebrahimi           s->child_args = &s->child_arg;
192*ccdc9c3eSSadaf Ebrahimi         else if (re->nsub_ > 1)
193*ccdc9c3eSSadaf Ebrahimi           s->child_args = new T[re->nsub_];
194*ccdc9c3eSSadaf Ebrahimi         FALLTHROUGH_INTENDED;
195*ccdc9c3eSSadaf Ebrahimi       }
196*ccdc9c3eSSadaf Ebrahimi       default: {
197*ccdc9c3eSSadaf Ebrahimi         if (re->nsub_ > 0) {
198*ccdc9c3eSSadaf Ebrahimi           Regexp** sub = re->sub();
199*ccdc9c3eSSadaf Ebrahimi           if (s->n < re->nsub_) {
200*ccdc9c3eSSadaf Ebrahimi             if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
201*ccdc9c3eSSadaf Ebrahimi               s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
202*ccdc9c3eSSadaf Ebrahimi               s->n++;
203*ccdc9c3eSSadaf Ebrahimi             } else {
204*ccdc9c3eSSadaf Ebrahimi               stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
205*ccdc9c3eSSadaf Ebrahimi             }
206*ccdc9c3eSSadaf Ebrahimi             continue;
207*ccdc9c3eSSadaf Ebrahimi           }
208*ccdc9c3eSSadaf Ebrahimi         }
209*ccdc9c3eSSadaf Ebrahimi 
210*ccdc9c3eSSadaf Ebrahimi         t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
211*ccdc9c3eSSadaf Ebrahimi         if (re->nsub_ > 1)
212*ccdc9c3eSSadaf Ebrahimi           delete[] s->child_args;
213*ccdc9c3eSSadaf Ebrahimi         break;
214*ccdc9c3eSSadaf Ebrahimi       }
215*ccdc9c3eSSadaf Ebrahimi     }
216*ccdc9c3eSSadaf Ebrahimi 
217*ccdc9c3eSSadaf Ebrahimi     // We've finished stack_->top().
218*ccdc9c3eSSadaf Ebrahimi     // Update next guy down.
219*ccdc9c3eSSadaf Ebrahimi     stack_->pop();
220*ccdc9c3eSSadaf Ebrahimi     if (stack_->size() == 0)
221*ccdc9c3eSSadaf Ebrahimi       return t;
222*ccdc9c3eSSadaf Ebrahimi     s = &stack_->top();
223*ccdc9c3eSSadaf Ebrahimi     if (s->child_args != NULL)
224*ccdc9c3eSSadaf Ebrahimi       s->child_args[s->n] = t;
225*ccdc9c3eSSadaf Ebrahimi     else
226*ccdc9c3eSSadaf Ebrahimi       s->child_arg = t;
227*ccdc9c3eSSadaf Ebrahimi     s->n++;
228*ccdc9c3eSSadaf Ebrahimi   }
229*ccdc9c3eSSadaf Ebrahimi }
230*ccdc9c3eSSadaf Ebrahimi 
Walk(Regexp * re,T top_arg)231*ccdc9c3eSSadaf Ebrahimi template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
232*ccdc9c3eSSadaf Ebrahimi   // Without the exponential walking behavior,
233*ccdc9c3eSSadaf Ebrahimi   // this budget should be more than enough for any
234*ccdc9c3eSSadaf Ebrahimi   // regexp, and yet not enough to get us in trouble
235*ccdc9c3eSSadaf Ebrahimi   // as far as CPU time.
236*ccdc9c3eSSadaf Ebrahimi   max_visits_ = 1000000;
237*ccdc9c3eSSadaf Ebrahimi   return WalkInternal(re, top_arg, true);
238*ccdc9c3eSSadaf Ebrahimi }
239*ccdc9c3eSSadaf Ebrahimi 
WalkExponential(Regexp * re,T top_arg,int max_visits)240*ccdc9c3eSSadaf Ebrahimi template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
241*ccdc9c3eSSadaf Ebrahimi                                                           int max_visits) {
242*ccdc9c3eSSadaf Ebrahimi   max_visits_ = max_visits;
243*ccdc9c3eSSadaf Ebrahimi   return WalkInternal(re, top_arg, false);
244*ccdc9c3eSSadaf Ebrahimi }
245*ccdc9c3eSSadaf Ebrahimi 
246*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
247*ccdc9c3eSSadaf Ebrahimi 
248*ccdc9c3eSSadaf Ebrahimi #endif  // RE2_WALKER_INL_H_
249