1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "re2/prefilter.h"
6
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <string>
10 #include <utility>
11 #include <vector>
12
13 #include "absl/strings/str_format.h"
14 #include "util/logging.h"
15 #include "util/utf.h"
16 #include "re2/re2.h"
17 #include "re2/unicode_casefold.h"
18 #include "re2/walker-inl.h"
19
20 namespace re2 {
21
22 static const bool ExtraDebug = false;
23
24 // Initializes a Prefilter, allocating subs_ as necessary.
Prefilter(Op op)25 Prefilter::Prefilter(Op op) {
26 op_ = op;
27 subs_ = NULL;
28 if (op_ == AND || op_ == OR)
29 subs_ = new std::vector<Prefilter*>;
30 }
31
32 // Destroys a Prefilter.
~Prefilter()33 Prefilter::~Prefilter() {
34 if (subs_) {
35 for (size_t i = 0; i < subs_->size(); i++)
36 delete (*subs_)[i];
37 delete subs_;
38 subs_ = NULL;
39 }
40 }
41
42 // Simplify if the node is an empty Or or And.
Simplify()43 Prefilter* Prefilter::Simplify() {
44 if (op_ != AND && op_ != OR) {
45 return this;
46 }
47
48 // Nothing left in the AND/OR.
49 if (subs_->empty()) {
50 if (op_ == AND)
51 op_ = ALL; // AND of nothing is true
52 else
53 op_ = NONE; // OR of nothing is false
54
55 return this;
56 }
57
58 // Just one subnode: throw away wrapper.
59 if (subs_->size() == 1) {
60 Prefilter* a = (*subs_)[0];
61 subs_->clear();
62 delete this;
63 return a->Simplify();
64 }
65
66 return this;
67 }
68
69 // Combines two Prefilters together to create an "op" (AND or OR).
70 // The passed Prefilters will be part of the returned Prefilter or deleted.
71 // Does lots of work to avoid creating unnecessarily complicated structures.
AndOr(Op op,Prefilter * a,Prefilter * b)72 Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
73 // If a, b can be rewritten as op, do so.
74 a = a->Simplify();
75 b = b->Simplify();
76
77 // Canonicalize: a->op <= b->op.
78 if (a->op() > b->op()) {
79 Prefilter* t = a;
80 a = b;
81 b = t;
82 }
83
84 // Trivial cases.
85 // ALL AND b = b
86 // NONE OR b = b
87 // ALL OR b = ALL
88 // NONE AND b = NONE
89 // Don't need to look at b, because of canonicalization above.
90 // ALL and NONE are smallest opcodes.
91 if (a->op() == ALL || a->op() == NONE) {
92 if ((a->op() == ALL && op == AND) ||
93 (a->op() == NONE && op == OR)) {
94 delete a;
95 return b;
96 } else {
97 delete b;
98 return a;
99 }
100 }
101
102 // If a and b match op, merge their contents.
103 if (a->op() == op && b->op() == op) {
104 for (size_t i = 0; i < b->subs()->size(); i++) {
105 Prefilter* bb = (*b->subs())[i];
106 a->subs()->push_back(bb);
107 }
108 b->subs()->clear();
109 delete b;
110 return a;
111 }
112
113 // If a already has the same op as the op that is under construction
114 // add in b (similarly if b already has the same op, add in a).
115 if (b->op() == op) {
116 Prefilter* t = a;
117 a = b;
118 b = t;
119 }
120 if (a->op() == op) {
121 a->subs()->push_back(b);
122 return a;
123 }
124
125 // Otherwise just return the op.
126 Prefilter* c = new Prefilter(op);
127 c->subs()->push_back(a);
128 c->subs()->push_back(b);
129 return c;
130 }
131
And(Prefilter * a,Prefilter * b)132 Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
133 return AndOr(AND, a, b);
134 }
135
Or(Prefilter * a,Prefilter * b)136 Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
137 return AndOr(OR, a, b);
138 }
139
SimplifyStringSet(SSet * ss)140 void Prefilter::SimplifyStringSet(SSet* ss) {
141 // Now make sure that the strings aren't redundant. For example, if
142 // we know "ab" is a required string, then it doesn't help at all to
143 // know that "abc" is also a required string, so delete "abc". This
144 // is because, when we are performing a string search to filter
145 // regexps, matching "ab" will already allow this regexp to be a
146 // candidate for match, so further matching "abc" is redundant.
147 // Note that we must ignore "" because find() would find it at the
148 // start of everything and thus we would end up erasing everything.
149 //
150 // The SSet sorts strings by length, then lexicographically. Note that
151 // smaller strings appear first and all strings must be unique. These
152 // observations let us skip string comparisons when possible.
153 SSIter i = ss->begin();
154 if (i != ss->end() && i->empty()) {
155 ++i;
156 }
157 for (; i != ss->end(); ++i) {
158 SSIter j = i;
159 ++j;
160 while (j != ss->end()) {
161 if (j->size() > i->size() && j->find(*i) != std::string::npos) {
162 j = ss->erase(j);
163 continue;
164 }
165 ++j;
166 }
167 }
168 }
169
OrStrings(SSet * ss)170 Prefilter* Prefilter::OrStrings(SSet* ss) {
171 Prefilter* or_prefilter = new Prefilter(NONE);
172 SimplifyStringSet(ss);
173 for (SSIter i = ss->begin(); i != ss->end(); ++i)
174 or_prefilter = Or(or_prefilter, FromString(*i));
175 return or_prefilter;
176 }
177
ToLowerRune(Rune r)178 static Rune ToLowerRune(Rune r) {
179 if (r < Runeself) {
180 if ('A' <= r && r <= 'Z')
181 r += 'a' - 'A';
182 return r;
183 }
184
185 const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
186 if (f == NULL || r < f->lo)
187 return r;
188 return ApplyFold(f, r);
189 }
190
ToLowerRuneLatin1(Rune r)191 static Rune ToLowerRuneLatin1(Rune r) {
192 if ('A' <= r && r <= 'Z')
193 r += 'a' - 'A';
194 return r;
195 }
196
FromString(const std::string & str)197 Prefilter* Prefilter::FromString(const std::string& str) {
198 Prefilter* m = new Prefilter(Prefilter::ATOM);
199 m->atom_ = str;
200 return m;
201 }
202
203 // Information about a regexp used during computation of Prefilter.
204 // Can be thought of as information about the set of strings matching
205 // the given regular expression.
206 class Prefilter::Info {
207 public:
208 Info();
209 ~Info();
210
211 // More constructors. They delete their Info* arguments.
212 static Info* Alt(Info* a, Info* b);
213 static Info* Concat(Info* a, Info* b);
214 static Info* And(Info* a, Info* b);
215 static Info* Star(Info* a);
216 static Info* Plus(Info* a);
217 static Info* Quest(Info* a);
218 static Info* EmptyString();
219 static Info* NoMatch();
220 static Info* AnyCharOrAnyByte();
221 static Info* CClass(CharClass* cc, bool latin1);
222 static Info* Literal(Rune r);
223 static Info* LiteralLatin1(Rune r);
224 static Info* AnyMatch();
225
226 // Format Info as a string.
227 std::string ToString();
228
229 // Caller takes ownership of the Prefilter.
230 Prefilter* TakeMatch();
231
exact()232 SSet& exact() { return exact_; }
233
is_exact() const234 bool is_exact() const { return is_exact_; }
235
236 class Walker;
237
238 private:
239 SSet exact_;
240
241 // When is_exact_ is true, the strings that match
242 // are placed in exact_. When it is no longer an exact
243 // set of strings that match this RE, then is_exact_
244 // is false and the match_ contains the required match
245 // criteria.
246 bool is_exact_;
247
248 // Accumulated Prefilter query that any
249 // match for this regexp is guaranteed to match.
250 Prefilter* match_;
251 };
252
253
Info()254 Prefilter::Info::Info()
255 : is_exact_(false),
256 match_(NULL) {
257 }
258
~Info()259 Prefilter::Info::~Info() {
260 delete match_;
261 }
262
TakeMatch()263 Prefilter* Prefilter::Info::TakeMatch() {
264 if (is_exact_) {
265 match_ = Prefilter::OrStrings(&exact_);
266 is_exact_ = false;
267 }
268 Prefilter* m = match_;
269 match_ = NULL;
270 return m;
271 }
272
273 // Format a Info in string form.
ToString()274 std::string Prefilter::Info::ToString() {
275 if (is_exact_) {
276 int n = 0;
277 std::string s;
278 for (SSIter i = exact_.begin(); i != exact_.end(); ++i) {
279 if (n++ > 0)
280 s += ",";
281 s += *i;
282 }
283 return s;
284 }
285
286 if (match_)
287 return match_->DebugString();
288
289 return "";
290 }
291
CrossProduct(const SSet & a,const SSet & b,SSet * dst)292 void Prefilter::CrossProduct(const SSet& a, const SSet& b, SSet* dst) {
293 for (ConstSSIter i = a.begin(); i != a.end(); ++i)
294 for (ConstSSIter j = b.begin(); j != b.end(); ++j)
295 dst->insert(*i + *j);
296 }
297
298 // Concats a and b. Requires that both are exact sets.
299 // Forms an exact set that is a crossproduct of a and b.
Concat(Info * a,Info * b)300 Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
301 if (a == NULL)
302 return b;
303 DCHECK(a->is_exact_);
304 DCHECK(b && b->is_exact_);
305 Info *ab = new Info();
306
307 CrossProduct(a->exact_, b->exact_, &ab->exact_);
308 ab->is_exact_ = true;
309
310 delete a;
311 delete b;
312 return ab;
313 }
314
315 // Constructs an inexact Info for ab given a and b.
316 // Used only when a or b is not exact or when the
317 // exact cross product is likely to be too big.
And(Info * a,Info * b)318 Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
319 if (a == NULL)
320 return b;
321 if (b == NULL)
322 return a;
323
324 Info *ab = new Info();
325
326 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
327 ab->is_exact_ = false;
328 delete a;
329 delete b;
330 return ab;
331 }
332
333 // Constructs Info for a|b given a and b.
Alt(Info * a,Info * b)334 Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
335 Info *ab = new Info();
336
337 if (a->is_exact_ && b->is_exact_) {
338 // Avoid string copies by moving the larger exact_ set into
339 // ab directly, then merge in the smaller set.
340 if (a->exact_.size() < b->exact_.size()) {
341 using std::swap;
342 swap(a, b);
343 }
344 ab->exact_ = std::move(a->exact_);
345 ab->exact_.insert(b->exact_.begin(), b->exact_.end());
346 ab->is_exact_ = true;
347 } else {
348 // Either a or b has is_exact_ = false. If the other
349 // one has is_exact_ = true, we move it to match_ and
350 // then create a OR of a,b. The resulting Info has
351 // is_exact_ = false.
352 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
353 ab->is_exact_ = false;
354 }
355
356 delete a;
357 delete b;
358 return ab;
359 }
360
361 // Constructs Info for a? given a.
Quest(Info * a)362 Prefilter::Info* Prefilter::Info::Quest(Info *a) {
363 Info *ab = new Info();
364
365 ab->is_exact_ = false;
366 ab->match_ = new Prefilter(ALL);
367 delete a;
368 return ab;
369 }
370
371 // Constructs Info for a* given a.
372 // Same as a? -- not much to do.
Star(Info * a)373 Prefilter::Info* Prefilter::Info::Star(Info *a) {
374 return Quest(a);
375 }
376
377 // Constructs Info for a+ given a. If a was exact set, it isn't
378 // anymore.
Plus(Info * a)379 Prefilter::Info* Prefilter::Info::Plus(Info *a) {
380 Info *ab = new Info();
381
382 ab->match_ = a->TakeMatch();
383 ab->is_exact_ = false;
384
385 delete a;
386 return ab;
387 }
388
RuneToString(Rune r)389 static std::string RuneToString(Rune r) {
390 char buf[UTFmax];
391 int n = runetochar(buf, &r);
392 return std::string(buf, n);
393 }
394
RuneToStringLatin1(Rune r)395 static std::string RuneToStringLatin1(Rune r) {
396 char c = r & 0xff;
397 return std::string(&c, 1);
398 }
399
400 // Constructs Info for literal rune.
Literal(Rune r)401 Prefilter::Info* Prefilter::Info::Literal(Rune r) {
402 Info* info = new Info();
403 info->exact_.insert(RuneToString(ToLowerRune(r)));
404 info->is_exact_ = true;
405 return info;
406 }
407
408 // Constructs Info for literal rune for Latin1 encoded string.
LiteralLatin1(Rune r)409 Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
410 Info* info = new Info();
411 info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
412 info->is_exact_ = true;
413 return info;
414 }
415
416 // Constructs Info for dot (any character) or \C (any byte).
AnyCharOrAnyByte()417 Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
418 Prefilter::Info* info = new Prefilter::Info();
419 info->match_ = new Prefilter(ALL);
420 return info;
421 }
422
423 // Constructs Prefilter::Info for no possible match.
NoMatch()424 Prefilter::Info* Prefilter::Info::NoMatch() {
425 Prefilter::Info* info = new Prefilter::Info();
426 info->match_ = new Prefilter(NONE);
427 return info;
428 }
429
430 // Constructs Prefilter::Info for any possible match.
431 // This Prefilter::Info is valid for any regular expression,
432 // since it makes no assertions whatsoever about the
433 // strings being matched.
AnyMatch()434 Prefilter::Info* Prefilter::Info::AnyMatch() {
435 Prefilter::Info *info = new Prefilter::Info();
436 info->match_ = new Prefilter(ALL);
437 return info;
438 }
439
440 // Constructs Prefilter::Info for just the empty string.
EmptyString()441 Prefilter::Info* Prefilter::Info::EmptyString() {
442 Prefilter::Info* info = new Prefilter::Info();
443 info->is_exact_ = true;
444 info->exact_.insert("");
445 return info;
446 }
447
448 // Constructs Prefilter::Info for a character class.
449 typedef CharClass::iterator CCIter;
CClass(CharClass * cc,bool latin1)450 Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
451 bool latin1) {
452 if (ExtraDebug) {
453 LOG(ERROR) << "CharClassInfo:";
454 for (CCIter i = cc->begin(); i != cc->end(); ++i)
455 LOG(ERROR) << " " << i->lo << "-" << i->hi;
456 }
457
458 // If the class is too large, it's okay to overestimate.
459 if (cc->size() > 10)
460 return AnyCharOrAnyByte();
461
462 Prefilter::Info *a = new Prefilter::Info();
463 for (CCIter i = cc->begin(); i != cc->end(); ++i)
464 for (Rune r = i->lo; r <= i->hi; r++) {
465 if (latin1) {
466 a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
467 } else {
468 a->exact_.insert(RuneToString(ToLowerRune(r)));
469 }
470 }
471
472
473 a->is_exact_ = true;
474
475 if (ExtraDebug)
476 LOG(ERROR) << " = " << a->ToString();
477
478 return a;
479 }
480
481 class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
482 public:
Walker(bool latin1)483 Walker(bool latin1) : latin1_(latin1) {}
484
485 virtual Info* PostVisit(
486 Regexp* re, Info* parent_arg,
487 Info* pre_arg,
488 Info** child_args, int nchild_args);
489
490 virtual Info* ShortVisit(
491 Regexp* re,
492 Info* parent_arg);
493
latin1()494 bool latin1() { return latin1_; }
495 private:
496 bool latin1_;
497
498 Walker(const Walker&) = delete;
499 Walker& operator=(const Walker&) = delete;
500 };
501
BuildInfo(Regexp * re)502 Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
503 if (ExtraDebug)
504 LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
505
506 bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
507 Prefilter::Info::Walker w(latin1);
508 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
509
510 if (w.stopped_early()) {
511 delete info;
512 return NULL;
513 }
514
515 return info;
516 }
517
ShortVisit(Regexp * re,Prefilter::Info * parent_arg)518 Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
519 Regexp* re, Prefilter::Info* parent_arg) {
520 return AnyMatch();
521 }
522
523 // Constructs the Prefilter::Info for the given regular expression.
524 // Assumes re is simplified.
PostVisit(Regexp * re,Prefilter::Info * parent_arg,Prefilter::Info * pre_arg,Prefilter::Info ** child_args,int nchild_args)525 Prefilter::Info* Prefilter::Info::Walker::PostVisit(
526 Regexp* re, Prefilter::Info* parent_arg,
527 Prefilter::Info* pre_arg, Prefilter::Info** child_args,
528 int nchild_args) {
529 Prefilter::Info *info;
530 switch (re->op()) {
531 default:
532 case kRegexpRepeat:
533 info = EmptyString();
534 LOG(DFATAL) << "Bad regexp op " << re->op();
535 break;
536
537 case kRegexpNoMatch:
538 info = NoMatch();
539 break;
540
541 // These ops match the empty string:
542 case kRegexpEmptyMatch: // anywhere
543 case kRegexpBeginLine: // at beginning of line
544 case kRegexpEndLine: // at end of line
545 case kRegexpBeginText: // at beginning of text
546 case kRegexpEndText: // at end of text
547 case kRegexpWordBoundary: // at word boundary
548 case kRegexpNoWordBoundary: // not at word boundary
549 info = EmptyString();
550 break;
551
552 case kRegexpLiteral:
553 if (latin1()) {
554 info = LiteralLatin1(re->rune());
555 }
556 else {
557 info = Literal(re->rune());
558 }
559 break;
560
561 case kRegexpLiteralString:
562 if (re->nrunes() == 0) {
563 info = NoMatch();
564 break;
565 }
566 if (latin1()) {
567 info = LiteralLatin1(re->runes()[0]);
568 for (int i = 1; i < re->nrunes(); i++) {
569 info = Concat(info, LiteralLatin1(re->runes()[i]));
570 }
571 } else {
572 info = Literal(re->runes()[0]);
573 for (int i = 1; i < re->nrunes(); i++) {
574 info = Concat(info, Literal(re->runes()[i]));
575 }
576 }
577 break;
578
579 case kRegexpConcat: {
580 // Accumulate in info.
581 // Exact is concat of recent contiguous exact nodes.
582 info = NULL;
583 Info* exact = NULL;
584 for (int i = 0; i < nchild_args; i++) {
585 Info* ci = child_args[i]; // child info
586 if (!ci->is_exact() ||
587 (exact && ci->exact().size() * exact->exact().size() > 16)) {
588 // Exact run is over.
589 info = And(info, exact);
590 exact = NULL;
591 // Add this child's info.
592 info = And(info, ci);
593 } else {
594 // Append to exact run.
595 exact = Concat(exact, ci);
596 }
597 }
598 info = And(info, exact);
599 }
600 break;
601
602 case kRegexpAlternate:
603 info = child_args[0];
604 for (int i = 1; i < nchild_args; i++)
605 info = Alt(info, child_args[i]);
606 break;
607
608 case kRegexpStar:
609 info = Star(child_args[0]);
610 break;
611
612 case kRegexpQuest:
613 info = Quest(child_args[0]);
614 break;
615
616 case kRegexpPlus:
617 info = Plus(child_args[0]);
618 break;
619
620 case kRegexpAnyChar:
621 case kRegexpAnyByte:
622 // Claim nothing, except that it's not empty.
623 info = AnyCharOrAnyByte();
624 break;
625
626 case kRegexpCharClass:
627 info = CClass(re->cc(), latin1());
628 break;
629
630 case kRegexpCapture:
631 // These don't affect the set of matching strings.
632 info = child_args[0];
633 break;
634 }
635
636 if (ExtraDebug)
637 LOG(ERROR) << "BuildInfo " << re->ToString()
638 << ": " << (info ? info->ToString() : "");
639
640 return info;
641 }
642
643
FromRegexp(Regexp * re)644 Prefilter* Prefilter::FromRegexp(Regexp* re) {
645 if (re == NULL)
646 return NULL;
647
648 Regexp* simple = re->Simplify();
649 if (simple == NULL)
650 return NULL;
651
652 Prefilter::Info* info = BuildInfo(simple);
653 simple->Decref();
654 if (info == NULL)
655 return NULL;
656
657 Prefilter* m = info->TakeMatch();
658 delete info;
659 return m;
660 }
661
DebugString() const662 std::string Prefilter::DebugString() const {
663 switch (op_) {
664 default:
665 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
666 return absl::StrFormat("op%d", op_);
667 case NONE:
668 return "*no-matches*";
669 case ATOM:
670 return atom_;
671 case ALL:
672 return "";
673 case AND: {
674 std::string s = "";
675 for (size_t i = 0; i < subs_->size(); i++) {
676 if (i > 0)
677 s += " ";
678 Prefilter* sub = (*subs_)[i];
679 s += sub ? sub->DebugString() : "<nil>";
680 }
681 return s;
682 }
683 case OR: {
684 std::string s = "(";
685 for (size_t i = 0; i < subs_->size(); i++) {
686 if (i > 0)
687 s += "|";
688 Prefilter* sub = (*subs_)[i];
689 s += sub ? sub->DebugString() : "<nil>";
690 }
691 s += ")";
692 return s;
693 }
694 }
695 }
696
FromRE2(const RE2 * re2)697 Prefilter* Prefilter::FromRE2(const RE2* re2) {
698 if (re2 == NULL)
699 return NULL;
700
701 Regexp* regexp = re2->Regexp();
702 if (regexp == NULL)
703 return NULL;
704
705 return FromRegexp(regexp);
706 }
707
708
709 } // namespace re2
710