xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/prefilter.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "re2/prefilter.h"
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <string>
10 #include <utility>
11 #include <vector>
12 
13 #include "absl/strings/str_format.h"
14 #include "util/logging.h"
15 #include "util/utf.h"
16 #include "re2/re2.h"
17 #include "re2/unicode_casefold.h"
18 #include "re2/walker-inl.h"
19 
20 namespace re2 {
21 
22 static const bool ExtraDebug = false;
23 
24 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)25 Prefilter::Prefilter(Op op) {
26   op_ = op;
27   subs_ = NULL;
28   if (op_ == AND || op_ == OR)
29     subs_ = new std::vector<Prefilter*>;
30 }
31 
32 // Destroys a Prefilter.
~Prefilter()33 Prefilter::~Prefilter() {
34   if (subs_) {
35     for (size_t i = 0; i < subs_->size(); i++)
36       delete (*subs_)[i];
37     delete subs_;
38     subs_ = NULL;
39   }
40 }
41 
42 // Simplify if the node is an empty Or or And.
Simplify()43 Prefilter* Prefilter::Simplify() {
44   if (op_ != AND && op_ != OR) {
45     return this;
46   }
47 
48   // Nothing left in the AND/OR.
49   if (subs_->empty()) {
50     if (op_ == AND)
51       op_ = ALL;  // AND of nothing is true
52     else
53       op_ = NONE;  // OR of nothing is false
54 
55     return this;
56   }
57 
58   // Just one subnode: throw away wrapper.
59   if (subs_->size() == 1) {
60     Prefilter* a = (*subs_)[0];
61     subs_->clear();
62     delete this;
63     return a->Simplify();
64   }
65 
66   return this;
67 }
68 
69 // Combines two Prefilters together to create an "op" (AND or OR).
70 // The passed Prefilters will be part of the returned Prefilter or deleted.
71 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)72 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
73   // If a, b can be rewritten as op, do so.
74   a = a->Simplify();
75   b = b->Simplify();
76 
77   // Canonicalize: a->op <= b->op.
78   if (a->op() > b->op()) {
79     Prefilter* t = a;
80     a = b;
81     b = t;
82   }
83 
84   // Trivial cases.
85   //    ALL AND b = b
86   //    NONE OR b = b
87   //    ALL OR b   = ALL
88   //    NONE AND b = NONE
89   // Don't need to look at b, because of canonicalization above.
90   // ALL and NONE are smallest opcodes.
91   if (a->op() == ALL || a->op() == NONE) {
92     if ((a->op() == ALL && op == AND) ||
93         (a->op() == NONE && op == OR)) {
94       delete a;
95       return b;
96     } else {
97       delete b;
98       return a;
99     }
100   }
101 
102   // If a and b match op, merge their contents.
103   if (a->op() == op && b->op() == op) {
104     for (size_t i = 0; i < b->subs()->size(); i++) {
105       Prefilter* bb = (*b->subs())[i];
106       a->subs()->push_back(bb);
107     }
108     b->subs()->clear();
109     delete b;
110     return a;
111   }
112 
113   // If a already has the same op as the op that is under construction
114   // add in b (similarly if b already has the same op, add in a).
115   if (b->op() == op) {
116     Prefilter* t = a;
117     a = b;
118     b = t;
119   }
120   if (a->op() == op) {
121     a->subs()->push_back(b);
122     return a;
123   }
124 
125   // Otherwise just return the op.
126   Prefilter* c = new Prefilter(op);
127   c->subs()->push_back(a);
128   c->subs()->push_back(b);
129   return c;
130 }
131 
And(Prefilter * a,Prefilter * b)132 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
133   return AndOr(AND, a, b);
134 }
135 
Or(Prefilter * a,Prefilter * b)136 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
137   return AndOr(OR, a, b);
138 }
139 
SimplifyStringSet(SSet * ss)140 void Prefilter::SimplifyStringSet(SSet* ss) {
141   // Now make sure that the strings aren't redundant.  For example, if
142   // we know "ab" is a required string, then it doesn't help at all to
143   // know that "abc" is also a required string, so delete "abc". This
144   // is because, when we are performing a string search to filter
145   // regexps, matching "ab" will already allow this regexp to be a
146   // candidate for match, so further matching "abc" is redundant.
147   // Note that we must ignore "" because find() would find it at the
148   // start of everything and thus we would end up erasing everything.
149   //
150   // The SSet sorts strings by length, then lexicographically. Note that
151   // smaller strings appear first and all strings must be unique. These
152   // observations let us skip string comparisons when possible.
153   SSIter i = ss->begin();
154   if (i != ss->end() && i->empty()) {
155     ++i;
156   }
157   for (; i != ss->end(); ++i) {
158     SSIter j = i;
159     ++j;
160     while (j != ss->end()) {
161       if (j->size() > i->size() && j->find(*i) != std::string::npos) {
162         j = ss->erase(j);
163         continue;
164       }
165       ++j;
166     }
167   }
168 }
169 
OrStrings(SSet * ss)170 Prefilter* Prefilter::OrStrings(SSet* ss) {
171   Prefilter* or_prefilter = new Prefilter(NONE);
172   SimplifyStringSet(ss);
173   for (SSIter i = ss->begin(); i != ss->end(); ++i)
174     or_prefilter = Or(or_prefilter, FromString(*i));
175   return or_prefilter;
176 }
177 
ToLowerRune(Rune r)178 static Rune ToLowerRune(Rune r) {
179   if (r < Runeself) {
180     if ('A' <= r && r <= 'Z')
181       r += 'a' - 'A';
182     return r;
183   }
184 
185   const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
186   if (f == NULL || r < f->lo)
187     return r;
188   return ApplyFold(f, r);
189 }
190 
ToLowerRuneLatin1(Rune r)191 static Rune ToLowerRuneLatin1(Rune r) {
192   if ('A' <= r && r <= 'Z')
193     r += 'a' - 'A';
194   return r;
195 }
196 
FromString(const std::string & str)197 Prefilter* Prefilter::FromString(const std::string& str) {
198   Prefilter* m = new Prefilter(Prefilter::ATOM);
199   m->atom_ = str;
200   return m;
201 }
202 
203 // Information about a regexp used during computation of Prefilter.
204 // Can be thought of as information about the set of strings matching
205 // the given regular expression.
206 class Prefilter::Info {
207  public:
208   Info();
209   ~Info();
210 
211   // More constructors.  They delete their Info* arguments.
212   static Info* Alt(Info* a, Info* b);
213   static Info* Concat(Info* a, Info* b);
214   static Info* And(Info* a, Info* b);
215   static Info* Star(Info* a);
216   static Info* Plus(Info* a);
217   static Info* Quest(Info* a);
218   static Info* EmptyString();
219   static Info* NoMatch();
220   static Info* AnyCharOrAnyByte();
221   static Info* CClass(CharClass* cc, bool latin1);
222   static Info* Literal(Rune r);
223   static Info* LiteralLatin1(Rune r);
224   static Info* AnyMatch();
225 
226   // Format Info as a string.
227   std::string ToString();
228 
229   // Caller takes ownership of the Prefilter.
230   Prefilter* TakeMatch();
231 
exact()232   SSet& exact() { return exact_; }
233 
is_exact() const234   bool is_exact() const { return is_exact_; }
235 
236   class Walker;
237 
238  private:
239   SSet exact_;
240 
241   // When is_exact_ is true, the strings that match
242   // are placed in exact_. When it is no longer an exact
243   // set of strings that match this RE, then is_exact_
244   // is false and the match_ contains the required match
245   // criteria.
246   bool is_exact_;
247 
248   // Accumulated Prefilter query that any
249   // match for this regexp is guaranteed to match.
250   Prefilter* match_;
251 };
252 
253 
Info()254 Prefilter::Info::Info()
255   : is_exact_(false),
256     match_(NULL) {
257 }
258 
~Info()259 Prefilter::Info::~Info() {
260   delete match_;
261 }
262 
TakeMatch()263 Prefilter* Prefilter::Info::TakeMatch() {
264   if (is_exact_) {
265     match_ = Prefilter::OrStrings(&exact_);
266     is_exact_ = false;
267   }
268   Prefilter* m = match_;
269   match_ = NULL;
270   return m;
271 }
272 
273 // Format a Info in string form.
ToString()274 std::string Prefilter::Info::ToString() {
275   if (is_exact_) {
276     int n = 0;
277     std::string s;
278     for (SSIter i = exact_.begin(); i != exact_.end(); ++i) {
279       if (n++ > 0)
280         s += ",";
281       s += *i;
282     }
283     return s;
284   }
285 
286   if (match_)
287     return match_->DebugString();
288 
289   return "";
290 }
291 
CrossProduct(const SSet & a,const SSet & b,SSet * dst)292 void Prefilter::CrossProduct(const SSet& a, const SSet& b, SSet* dst) {
293   for (ConstSSIter i = a.begin(); i != a.end(); ++i)
294     for (ConstSSIter j = b.begin(); j != b.end(); ++j)
295       dst->insert(*i + *j);
296 }
297 
298 // Concats a and b. Requires that both are exact sets.
299 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)300 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
301   if (a == NULL)
302     return b;
303   DCHECK(a->is_exact_);
304   DCHECK(b && b->is_exact_);
305   Info *ab = new Info();
306 
307   CrossProduct(a->exact_, b->exact_, &ab->exact_);
308   ab->is_exact_ = true;
309 
310   delete a;
311   delete b;
312   return ab;
313 }
314 
315 // Constructs an inexact Info for ab given a and b.
316 // Used only when a or b is not exact or when the
317 // exact cross product is likely to be too big.
And(Info * a,Info * b)318 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
319   if (a == NULL)
320     return b;
321   if (b == NULL)
322     return a;
323 
324   Info *ab = new Info();
325 
326   ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
327   ab->is_exact_ = false;
328   delete a;
329   delete b;
330   return ab;
331 }
332 
333 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)334 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
335   Info *ab = new Info();
336 
337   if (a->is_exact_ && b->is_exact_) {
338     // Avoid string copies by moving the larger exact_ set into
339     // ab directly, then merge in the smaller set.
340     if (a->exact_.size() < b->exact_.size()) {
341       using std::swap;
342       swap(a, b);
343     }
344     ab->exact_ = std::move(a->exact_);
345     ab->exact_.insert(b->exact_.begin(), b->exact_.end());
346     ab->is_exact_ = true;
347   } else {
348     // Either a or b has is_exact_ = false. If the other
349     // one has is_exact_ = true, we move it to match_ and
350     // then create a OR of a,b. The resulting Info has
351     // is_exact_ = false.
352     ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
353     ab->is_exact_ = false;
354   }
355 
356   delete a;
357   delete b;
358   return ab;
359 }
360 
361 // Constructs Info for a? given a.
Quest(Info * a)362 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
363   Info *ab = new Info();
364 
365   ab->is_exact_ = false;
366   ab->match_ = new Prefilter(ALL);
367   delete a;
368   return ab;
369 }
370 
371 // Constructs Info for a* given a.
372 // Same as a? -- not much to do.
Star(Info * a)373 Prefilter::Info* Prefilter::Info::Star(Info *a) {
374   return Quest(a);
375 }
376 
377 // Constructs Info for a+ given a. If a was exact set, it isn't
378 // anymore.
Plus(Info * a)379 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
380   Info *ab = new Info();
381 
382   ab->match_ = a->TakeMatch();
383   ab->is_exact_ = false;
384 
385   delete a;
386   return ab;
387 }
388 
RuneToString(Rune r)389 static std::string RuneToString(Rune r) {
390   char buf[UTFmax];
391   int n = runetochar(buf, &r);
392   return std::string(buf, n);
393 }
394 
RuneToStringLatin1(Rune r)395 static std::string RuneToStringLatin1(Rune r) {
396   char c = r & 0xff;
397   return std::string(&c, 1);
398 }
399 
400 // Constructs Info for literal rune.
Literal(Rune r)401 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
402   Info* info = new Info();
403   info->exact_.insert(RuneToString(ToLowerRune(r)));
404   info->is_exact_ = true;
405   return info;
406 }
407 
408 // Constructs Info for literal rune for Latin1 encoded string.
LiteralLatin1(Rune r)409 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
410   Info* info = new Info();
411   info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
412   info->is_exact_ = true;
413   return info;
414 }
415 
416 // Constructs Info for dot (any character) or \C (any byte).
AnyCharOrAnyByte()417 Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
418   Prefilter::Info* info = new Prefilter::Info();
419   info->match_ = new Prefilter(ALL);
420   return info;
421 }
422 
423 // Constructs Prefilter::Info for no possible match.
NoMatch()424 Prefilter::Info* Prefilter::Info::NoMatch() {
425   Prefilter::Info* info = new Prefilter::Info();
426   info->match_ = new Prefilter(NONE);
427   return info;
428 }
429 
430 // Constructs Prefilter::Info for any possible match.
431 // This Prefilter::Info is valid for any regular expression,
432 // since it makes no assertions whatsoever about the
433 // strings being matched.
AnyMatch()434 Prefilter::Info* Prefilter::Info::AnyMatch() {
435   Prefilter::Info *info = new Prefilter::Info();
436   info->match_ = new Prefilter(ALL);
437   return info;
438 }
439 
440 // Constructs Prefilter::Info for just the empty string.
EmptyString()441 Prefilter::Info* Prefilter::Info::EmptyString() {
442   Prefilter::Info* info = new Prefilter::Info();
443   info->is_exact_ = true;
444   info->exact_.insert("");
445   return info;
446 }
447 
448 // Constructs Prefilter::Info for a character class.
449 typedef CharClass::iterator CCIter;
CClass(CharClass * cc,bool latin1)450 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
451                                          bool latin1) {
452   if (ExtraDebug) {
453     LOG(ERROR) << "CharClassInfo:";
454     for (CCIter i = cc->begin(); i != cc->end(); ++i)
455       LOG(ERROR) << "  " << i->lo << "-" << i->hi;
456   }
457 
458   // If the class is too large, it's okay to overestimate.
459   if (cc->size() > 10)
460     return AnyCharOrAnyByte();
461 
462   Prefilter::Info *a = new Prefilter::Info();
463   for (CCIter i = cc->begin(); i != cc->end(); ++i)
464     for (Rune r = i->lo; r <= i->hi; r++) {
465       if (latin1) {
466         a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
467       } else {
468         a->exact_.insert(RuneToString(ToLowerRune(r)));
469       }
470     }
471 
472 
473   a->is_exact_ = true;
474 
475   if (ExtraDebug)
476     LOG(ERROR) << " = " << a->ToString();
477 
478   return a;
479 }
480 
481 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
482  public:
Walker(bool latin1)483   Walker(bool latin1) : latin1_(latin1) {}
484 
485   virtual Info* PostVisit(
486       Regexp* re, Info* parent_arg,
487       Info* pre_arg,
488       Info** child_args, int nchild_args);
489 
490   virtual Info* ShortVisit(
491       Regexp* re,
492       Info* parent_arg);
493 
latin1()494   bool latin1() { return latin1_; }
495  private:
496   bool latin1_;
497 
498   Walker(const Walker&) = delete;
499   Walker& operator=(const Walker&) = delete;
500 };
501 
BuildInfo(Regexp * re)502 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
503   if (ExtraDebug)
504     LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
505 
506   bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
507   Prefilter::Info::Walker w(latin1);
508   Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
509 
510   if (w.stopped_early()) {
511     delete info;
512     return NULL;
513   }
514 
515   return info;
516 }
517 
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)518 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
519     Regexp* re, Prefilter::Info* parent_arg) {
520   return AnyMatch();
521 }
522 
523 // Constructs the Prefilter::Info for the given regular expression.
524 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)525 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
526     Regexp* re, Prefilter::Info* parent_arg,
527     Prefilter::Info* pre_arg, Prefilter::Info** child_args,
528     int nchild_args) {
529   Prefilter::Info *info;
530   switch (re->op()) {
531     default:
532     case kRegexpRepeat:
533       info = EmptyString();
534       LOG(DFATAL) << "Bad regexp op " << re->op();
535       break;
536 
537     case kRegexpNoMatch:
538       info = NoMatch();
539       break;
540 
541     // These ops match the empty string:
542     case kRegexpEmptyMatch:      // anywhere
543     case kRegexpBeginLine:       // at beginning of line
544     case kRegexpEndLine:         // at end of line
545     case kRegexpBeginText:       // at beginning of text
546     case kRegexpEndText:         // at end of text
547     case kRegexpWordBoundary:    // at word boundary
548     case kRegexpNoWordBoundary:  // not at word boundary
549       info = EmptyString();
550       break;
551 
552     case kRegexpLiteral:
553       if (latin1()) {
554         info = LiteralLatin1(re->rune());
555       }
556       else {
557         info = Literal(re->rune());
558       }
559       break;
560 
561     case kRegexpLiteralString:
562       if (re->nrunes() == 0) {
563         info = NoMatch();
564         break;
565       }
566       if (latin1()) {
567         info = LiteralLatin1(re->runes()[0]);
568         for (int i = 1; i < re->nrunes(); i++) {
569           info = Concat(info, LiteralLatin1(re->runes()[i]));
570         }
571       } else {
572         info = Literal(re->runes()[0]);
573         for (int i = 1; i < re->nrunes(); i++) {
574           info = Concat(info, Literal(re->runes()[i]));
575         }
576       }
577       break;
578 
579     case kRegexpConcat: {
580       // Accumulate in info.
581       // Exact is concat of recent contiguous exact nodes.
582       info = NULL;
583       Info* exact = NULL;
584       for (int i = 0; i < nchild_args; i++) {
585         Info* ci = child_args[i];  // child info
586         if (!ci->is_exact() ||
587             (exact && ci->exact().size() * exact->exact().size() > 16)) {
588           // Exact run is over.
589           info = And(info, exact);
590           exact = NULL;
591           // Add this child's info.
592           info = And(info, ci);
593         } else {
594           // Append to exact run.
595           exact = Concat(exact, ci);
596         }
597       }
598       info = And(info, exact);
599     }
600       break;
601 
602     case kRegexpAlternate:
603       info = child_args[0];
604       for (int i = 1; i < nchild_args; i++)
605         info = Alt(info, child_args[i]);
606       break;
607 
608     case kRegexpStar:
609       info = Star(child_args[0]);
610       break;
611 
612     case kRegexpQuest:
613       info = Quest(child_args[0]);
614       break;
615 
616     case kRegexpPlus:
617       info = Plus(child_args[0]);
618       break;
619 
620     case kRegexpAnyChar:
621     case kRegexpAnyByte:
622       // Claim nothing, except that it's not empty.
623       info = AnyCharOrAnyByte();
624       break;
625 
626     case kRegexpCharClass:
627       info = CClass(re->cc(), latin1());
628       break;
629 
630     case kRegexpCapture:
631       // These don't affect the set of matching strings.
632       info = child_args[0];
633       break;
634   }
635 
636   if (ExtraDebug)
637     LOG(ERROR) << "BuildInfo " << re->ToString()
638                << ": " << (info ? info->ToString() : "");
639 
640   return info;
641 }
642 
643 
FromRegexp(Regexp * re)644 Prefilter* Prefilter::FromRegexp(Regexp* re) {
645   if (re == NULL)
646     return NULL;
647 
648   Regexp* simple = re->Simplify();
649   if (simple == NULL)
650     return NULL;
651 
652   Prefilter::Info* info = BuildInfo(simple);
653   simple->Decref();
654   if (info == NULL)
655     return NULL;
656 
657   Prefilter* m = info->TakeMatch();
658   delete info;
659   return m;
660 }
661 
DebugString() const662 std::string Prefilter::DebugString() const {
663   switch (op_) {
664     default:
665       LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
666       return absl::StrFormat("op%d", op_);
667     case NONE:
668       return "*no-matches*";
669     case ATOM:
670       return atom_;
671     case ALL:
672       return "";
673     case AND: {
674       std::string s = "";
675       for (size_t i = 0; i < subs_->size(); i++) {
676         if (i > 0)
677           s += " ";
678         Prefilter* sub = (*subs_)[i];
679         s += sub ? sub->DebugString() : "<nil>";
680       }
681       return s;
682     }
683     case OR: {
684       std::string s = "(";
685       for (size_t i = 0; i < subs_->size(); i++) {
686         if (i > 0)
687           s += "|";
688         Prefilter* sub = (*subs_)[i];
689         s += sub ? sub->DebugString() : "<nil>";
690       }
691       s += ")";
692       return s;
693     }
694   }
695 }
696 
FromRE2(const RE2 * re2)697 Prefilter* Prefilter::FromRE2(const RE2* re2) {
698   if (re2 == NULL)
699     return NULL;
700 
701   Regexp* regexp = re2->Regexp();
702   if (regexp == NULL)
703     return NULL;
704 
705   return FromRegexp(regexp);
706 }
707 
708 
709 }  // namespace re2
710