xref: /aosp_15_r20/external/regex-re2/re2/mimics_pcre.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi // Determine whether this library should match PCRE exactly
6*ccdc9c3eSSadaf Ebrahimi // for a particular Regexp.  (If so, the testing framework can
7*ccdc9c3eSSadaf Ebrahimi // check that it does.)
8*ccdc9c3eSSadaf Ebrahimi //
9*ccdc9c3eSSadaf Ebrahimi // This library matches PCRE except in these cases:
10*ccdc9c3eSSadaf Ebrahimi //   * the regexp contains a repetition of an empty string,
11*ccdc9c3eSSadaf Ebrahimi //     like (a*)* or (a*)+.  In this case, PCRE will treat
12*ccdc9c3eSSadaf Ebrahimi //     the repetition sequence as ending with an empty string,
13*ccdc9c3eSSadaf Ebrahimi //     while this library does not.
14*ccdc9c3eSSadaf Ebrahimi //   * Perl and PCRE differ on whether \v matches \n.
15*ccdc9c3eSSadaf Ebrahimi //     For historical reasons, this library implements the Perl behavior.
16*ccdc9c3eSSadaf Ebrahimi //   * Perl and PCRE allow $ in one-line mode to match either the very
17*ccdc9c3eSSadaf Ebrahimi //     end of the text or just before a \n at the end of the text.
18*ccdc9c3eSSadaf Ebrahimi //     This library requires it to match only the end of the text.
19*ccdc9c3eSSadaf Ebrahimi //   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20*ccdc9c3eSSadaf Ebrahimi //     match the end of the text if the last character is a \n.
21*ccdc9c3eSSadaf Ebrahimi //     This library does allow it.
22*ccdc9c3eSSadaf Ebrahimi //
23*ccdc9c3eSSadaf Ebrahimi // Regexp::MimicsPCRE checks for any of these conditions.
24*ccdc9c3eSSadaf Ebrahimi 
25*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
26*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
27*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
28*ccdc9c3eSSadaf Ebrahimi #include "re2/walker-inl.h"
29*ccdc9c3eSSadaf Ebrahimi 
30*ccdc9c3eSSadaf Ebrahimi namespace re2 {
31*ccdc9c3eSSadaf Ebrahimi 
32*ccdc9c3eSSadaf Ebrahimi // Returns whether re might match an empty string.
33*ccdc9c3eSSadaf Ebrahimi static bool CanBeEmptyString(Regexp *re);
34*ccdc9c3eSSadaf Ebrahimi 
35*ccdc9c3eSSadaf Ebrahimi // Walker class to compute whether library handles a regexp
36*ccdc9c3eSSadaf Ebrahimi // exactly as PCRE would.  See comment at top for conditions.
37*ccdc9c3eSSadaf Ebrahimi 
38*ccdc9c3eSSadaf Ebrahimi class PCREWalker : public Regexp::Walker<bool> {
39*ccdc9c3eSSadaf Ebrahimi  public:
PCREWalker()40*ccdc9c3eSSadaf Ebrahimi   PCREWalker() {}
41*ccdc9c3eSSadaf Ebrahimi   bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
42*ccdc9c3eSSadaf Ebrahimi                  int nchild_args);
43*ccdc9c3eSSadaf Ebrahimi 
ShortVisit(Regexp * re,bool a)44*ccdc9c3eSSadaf Ebrahimi   bool ShortVisit(Regexp* re, bool a) {
45*ccdc9c3eSSadaf Ebrahimi     // Should never be called: we use Walk not WalkExponential.
46*ccdc9c3eSSadaf Ebrahimi     LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
47*ccdc9c3eSSadaf Ebrahimi     return a;
48*ccdc9c3eSSadaf Ebrahimi   }
49*ccdc9c3eSSadaf Ebrahimi };
50*ccdc9c3eSSadaf Ebrahimi 
51*ccdc9c3eSSadaf Ebrahimi // Called after visiting each of re's children and accumulating
52*ccdc9c3eSSadaf Ebrahimi // the return values in child_args.  So child_args contains whether
53*ccdc9c3eSSadaf Ebrahimi // this library mimics PCRE for those subexpressions.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)54*ccdc9c3eSSadaf Ebrahimi bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
55*ccdc9c3eSSadaf Ebrahimi                            bool* child_args, int nchild_args) {
56*ccdc9c3eSSadaf Ebrahimi   // If children failed, so do we.
57*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < nchild_args; i++)
58*ccdc9c3eSSadaf Ebrahimi     if (!child_args[i])
59*ccdc9c3eSSadaf Ebrahimi       return false;
60*ccdc9c3eSSadaf Ebrahimi 
61*ccdc9c3eSSadaf Ebrahimi   // Otherwise look for other reasons to fail.
62*ccdc9c3eSSadaf Ebrahimi   switch (re->op()) {
63*ccdc9c3eSSadaf Ebrahimi     // Look for repeated empty string.
64*ccdc9c3eSSadaf Ebrahimi     case kRegexpStar:
65*ccdc9c3eSSadaf Ebrahimi     case kRegexpPlus:
66*ccdc9c3eSSadaf Ebrahimi     case kRegexpQuest:
67*ccdc9c3eSSadaf Ebrahimi       if (CanBeEmptyString(re->sub()[0]))
68*ccdc9c3eSSadaf Ebrahimi         return false;
69*ccdc9c3eSSadaf Ebrahimi       break;
70*ccdc9c3eSSadaf Ebrahimi     case kRegexpRepeat:
71*ccdc9c3eSSadaf Ebrahimi       if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
72*ccdc9c3eSSadaf Ebrahimi         return false;
73*ccdc9c3eSSadaf Ebrahimi       break;
74*ccdc9c3eSSadaf Ebrahimi 
75*ccdc9c3eSSadaf Ebrahimi     // Look for \v
76*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteral:
77*ccdc9c3eSSadaf Ebrahimi       if (re->rune() == '\v')
78*ccdc9c3eSSadaf Ebrahimi         return false;
79*ccdc9c3eSSadaf Ebrahimi       break;
80*ccdc9c3eSSadaf Ebrahimi 
81*ccdc9c3eSSadaf Ebrahimi     // Look for $ in single-line mode.
82*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndText:
83*ccdc9c3eSSadaf Ebrahimi     case kRegexpEmptyMatch:
84*ccdc9c3eSSadaf Ebrahimi       if (re->parse_flags() & Regexp::WasDollar)
85*ccdc9c3eSSadaf Ebrahimi         return false;
86*ccdc9c3eSSadaf Ebrahimi       break;
87*ccdc9c3eSSadaf Ebrahimi 
88*ccdc9c3eSSadaf Ebrahimi     // Look for ^ in multi-line mode.
89*ccdc9c3eSSadaf Ebrahimi     case kRegexpBeginLine:
90*ccdc9c3eSSadaf Ebrahimi       // No condition: in single-line mode ^ becomes kRegexpBeginText.
91*ccdc9c3eSSadaf Ebrahimi       return false;
92*ccdc9c3eSSadaf Ebrahimi 
93*ccdc9c3eSSadaf Ebrahimi     default:
94*ccdc9c3eSSadaf Ebrahimi       break;
95*ccdc9c3eSSadaf Ebrahimi   }
96*ccdc9c3eSSadaf Ebrahimi 
97*ccdc9c3eSSadaf Ebrahimi   // Not proven guilty.
98*ccdc9c3eSSadaf Ebrahimi   return true;
99*ccdc9c3eSSadaf Ebrahimi }
100*ccdc9c3eSSadaf Ebrahimi 
101*ccdc9c3eSSadaf Ebrahimi // Returns whether this regexp's behavior will mimic PCRE's exactly.
MimicsPCRE()102*ccdc9c3eSSadaf Ebrahimi bool Regexp::MimicsPCRE() {
103*ccdc9c3eSSadaf Ebrahimi   PCREWalker w;
104*ccdc9c3eSSadaf Ebrahimi   return w.Walk(this, true);
105*ccdc9c3eSSadaf Ebrahimi }
106*ccdc9c3eSSadaf Ebrahimi 
107*ccdc9c3eSSadaf Ebrahimi 
108*ccdc9c3eSSadaf Ebrahimi // Walker class to compute whether a Regexp can match an empty string.
109*ccdc9c3eSSadaf Ebrahimi // It is okay to overestimate.  For example, \b\B cannot match an empty
110*ccdc9c3eSSadaf Ebrahimi // string, because \b and \B are mutually exclusive, but this isn't
111*ccdc9c3eSSadaf Ebrahimi // that smart and will say it can.  Spurious empty strings
112*ccdc9c3eSSadaf Ebrahimi // will reduce the number of regexps we sanity check against PCRE,
113*ccdc9c3eSSadaf Ebrahimi // but they won't break anything.
114*ccdc9c3eSSadaf Ebrahimi 
115*ccdc9c3eSSadaf Ebrahimi class EmptyStringWalker : public Regexp::Walker<bool> {
116*ccdc9c3eSSadaf Ebrahimi  public:
EmptyStringWalker()117*ccdc9c3eSSadaf Ebrahimi   EmptyStringWalker() { }
118*ccdc9c3eSSadaf Ebrahimi   bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
119*ccdc9c3eSSadaf Ebrahimi                  bool* child_args, int nchild_args);
120*ccdc9c3eSSadaf Ebrahimi 
ShortVisit(Regexp * re,bool a)121*ccdc9c3eSSadaf Ebrahimi   bool ShortVisit(Regexp* re, bool a) {
122*ccdc9c3eSSadaf Ebrahimi     // Should never be called: we use Walk not WalkExponential.
123*ccdc9c3eSSadaf Ebrahimi     LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
124*ccdc9c3eSSadaf Ebrahimi     return a;
125*ccdc9c3eSSadaf Ebrahimi   }
126*ccdc9c3eSSadaf Ebrahimi 
127*ccdc9c3eSSadaf Ebrahimi  private:
128*ccdc9c3eSSadaf Ebrahimi   EmptyStringWalker(const EmptyStringWalker&) = delete;
129*ccdc9c3eSSadaf Ebrahimi   EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
130*ccdc9c3eSSadaf Ebrahimi };
131*ccdc9c3eSSadaf Ebrahimi 
132*ccdc9c3eSSadaf Ebrahimi // Called after visiting re's children.  child_args contains the return
133*ccdc9c3eSSadaf Ebrahimi // value from each of the children's PostVisits (i.e., whether each child
134*ccdc9c3eSSadaf Ebrahimi // can match an empty string).  Returns whether this clause can match an
135*ccdc9c3eSSadaf Ebrahimi // empty string.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)136*ccdc9c3eSSadaf Ebrahimi bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
137*ccdc9c3eSSadaf Ebrahimi                                   bool* child_args, int nchild_args) {
138*ccdc9c3eSSadaf Ebrahimi   switch (re->op()) {
139*ccdc9c3eSSadaf Ebrahimi     case kRegexpNoMatch:               // never empty
140*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteral:
141*ccdc9c3eSSadaf Ebrahimi     case kRegexpAnyChar:
142*ccdc9c3eSSadaf Ebrahimi     case kRegexpAnyByte:
143*ccdc9c3eSSadaf Ebrahimi     case kRegexpCharClass:
144*ccdc9c3eSSadaf Ebrahimi     case kRegexpLiteralString:
145*ccdc9c3eSSadaf Ebrahimi       return false;
146*ccdc9c3eSSadaf Ebrahimi 
147*ccdc9c3eSSadaf Ebrahimi     case kRegexpEmptyMatch:            // always empty
148*ccdc9c3eSSadaf Ebrahimi     case kRegexpBeginLine:             // always empty, when they match
149*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndLine:
150*ccdc9c3eSSadaf Ebrahimi     case kRegexpNoWordBoundary:
151*ccdc9c3eSSadaf Ebrahimi     case kRegexpWordBoundary:
152*ccdc9c3eSSadaf Ebrahimi     case kRegexpBeginText:
153*ccdc9c3eSSadaf Ebrahimi     case kRegexpEndText:
154*ccdc9c3eSSadaf Ebrahimi     case kRegexpStar:                  // can always be empty
155*ccdc9c3eSSadaf Ebrahimi     case kRegexpQuest:
156*ccdc9c3eSSadaf Ebrahimi     case kRegexpHaveMatch:
157*ccdc9c3eSSadaf Ebrahimi       return true;
158*ccdc9c3eSSadaf Ebrahimi 
159*ccdc9c3eSSadaf Ebrahimi     case kRegexpConcat:                // can be empty if all children can
160*ccdc9c3eSSadaf Ebrahimi       for (int i = 0; i < nchild_args; i++)
161*ccdc9c3eSSadaf Ebrahimi         if (!child_args[i])
162*ccdc9c3eSSadaf Ebrahimi           return false;
163*ccdc9c3eSSadaf Ebrahimi       return true;
164*ccdc9c3eSSadaf Ebrahimi 
165*ccdc9c3eSSadaf Ebrahimi     case kRegexpAlternate:             // can be empty if any child can
166*ccdc9c3eSSadaf Ebrahimi       for (int i = 0; i < nchild_args; i++)
167*ccdc9c3eSSadaf Ebrahimi         if (child_args[i])
168*ccdc9c3eSSadaf Ebrahimi           return true;
169*ccdc9c3eSSadaf Ebrahimi       return false;
170*ccdc9c3eSSadaf Ebrahimi 
171*ccdc9c3eSSadaf Ebrahimi     case kRegexpPlus:                  // can be empty if the child can
172*ccdc9c3eSSadaf Ebrahimi     case kRegexpCapture:
173*ccdc9c3eSSadaf Ebrahimi       return child_args[0];
174*ccdc9c3eSSadaf Ebrahimi 
175*ccdc9c3eSSadaf Ebrahimi     case kRegexpRepeat:                // can be empty if child can or is x{0}
176*ccdc9c3eSSadaf Ebrahimi       return child_args[0] || re->min() == 0;
177*ccdc9c3eSSadaf Ebrahimi   }
178*ccdc9c3eSSadaf Ebrahimi   return false;
179*ccdc9c3eSSadaf Ebrahimi }
180*ccdc9c3eSSadaf Ebrahimi 
181*ccdc9c3eSSadaf Ebrahimi // Returns whether re can match an empty string.
CanBeEmptyString(Regexp * re)182*ccdc9c3eSSadaf Ebrahimi static bool CanBeEmptyString(Regexp* re) {
183*ccdc9c3eSSadaf Ebrahimi   EmptyStringWalker w;
184*ccdc9c3eSSadaf Ebrahimi   return w.Walk(re, true);
185*ccdc9c3eSSadaf Ebrahimi }
186*ccdc9c3eSSadaf Ebrahimi 
187*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
188