// Copyright 2006 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Rewrite POSIX and other features in re // to use simple extended regular expression features. // Also sort and simplify character classes. #include #include "util/util.h" #include "util/logging.h" #include "util/pod_array.h" #include "util/utf.h" #include "re2/regexp.h" #include "re2/walker-inl.h" namespace re2 { // Parses the regexp src and then simplifies it and sets *dst to the // string representation of the simplified form. Returns true on success. // Returns false and sets *error (if error != NULL) on error. bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, string* dst, RegexpStatus* status) { Regexp* re = Parse(src, flags, status); if (re == NULL) return false; Regexp* sre = re->Simplify(); re->Decref(); if (sre == NULL) { // Should not happen, since Simplify never fails. LOG(ERROR) << "Simplify failed on " << src; if (status) { status->set_code(kRegexpInternalError); status->set_error_arg(src); } return false; } *dst = sre->ToString(); sre->Decref(); return true; } // Assuming the simple_ flags on the children are accurate, // is this Regexp* simple? bool Regexp::ComputeSimple() { Regexp** subs; switch (op_) { case kRegexpNoMatch: case kRegexpEmptyMatch: case kRegexpLiteral: case kRegexpLiteralString: case kRegexpBeginLine: case kRegexpEndLine: case kRegexpBeginText: case kRegexpWordBoundary: case kRegexpNoWordBoundary: case kRegexpEndText: case kRegexpAnyChar: case kRegexpAnyByte: case kRegexpHaveMatch: return true; case kRegexpConcat: case kRegexpAlternate: // These are simple as long as the subpieces are simple. subs = sub(); for (int i = 0; i < nsub_; i++) if (!subs[i]->simple()) return false; return true; case kRegexpCharClass: // Simple as long as the char class is not empty, not full. if (ccb_ != NULL) return !ccb_->empty() && !ccb_->full(); return !cc_->empty() && !cc_->full(); case kRegexpCapture: subs = sub(); return subs[0]->simple(); case kRegexpStar: case kRegexpPlus: case kRegexpQuest: subs = sub(); if (!subs[0]->simple()) return false; switch (subs[0]->op_) { case kRegexpStar: case kRegexpPlus: case kRegexpQuest: case kRegexpEmptyMatch: case kRegexpNoMatch: return false; default: break; } return true; case kRegexpRepeat: return false; } LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; return false; } // Walker subclass used by Simplify. // Coalesces runs of star/plus/quest/repeat of the same literal along with any // occurrences of that literal into repeats of that literal. It also works for // char classes, any char and any byte. // PostVisit creates the coalesced result, which should then be simplified. class CoalesceWalker : public Regexp::Walker { public: CoalesceWalker() {} virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args); virtual Regexp* Copy(Regexp* re); virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); private: // These functions are declared inside CoalesceWalker so that // they can edit the private fields of the Regexps they construct. // Returns true if r1 and r2 can be coalesced. In particular, ensures that // the parse flags are consistent. (They will not be checked again later.) static bool CanCoalesce(Regexp* r1, Regexp* r2); // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards // will be empty match and the coalesced op. In other cases, where part of a // literal string was removed to be coalesced, the array elements afterwards // will be the coalesced op and the remainder of the literal string. static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr); CoalesceWalker(const CoalesceWalker&) = delete; CoalesceWalker& operator=(const CoalesceWalker&) = delete; }; // Walker subclass used by Simplify. // The simplify walk is purely post-recursive: given the simplified children, // PostVisit creates the simplified result. // The child_args are simplified Regexp*s. class SimplifyWalker : public Regexp::Walker { public: SimplifyWalker() {} virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args); virtual Regexp* Copy(Regexp* re); virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); private: // These functions are declared inside SimplifyWalker so that // they can edit the private fields of the Regexps they construct. // Creates a concatenation of two Regexp, consuming refs to re1 and re2. // Caller must Decref return value when done with it. static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); // Simplifies the expression re{min,max} in terms of *, +, and ?. // Returns a new regexp. Does not edit re. Does not consume reference to re. // Caller must Decref return value when done with it. static Regexp* SimplifyRepeat(Regexp* re, int min, int max, Regexp::ParseFlags parse_flags); // Simplifies a character class by expanding any named classes // into rune ranges. Does not edit re. Does not consume ref to re. // Caller must Decref return value when done with it. static Regexp* SimplifyCharClass(Regexp* re); SimplifyWalker(const SimplifyWalker&) = delete; SimplifyWalker& operator=(const SimplifyWalker&) = delete; }; // Simplifies a regular expression, returning a new regexp. // The new regexp uses traditional Unix egrep features only, // plus the Perl (?:) non-capturing parentheses. // Otherwise, no POSIX or Perl additions. The new regexp // captures exactly the same subexpressions (with the same indices) // as the original. // Does not edit current object. // Caller must Decref() return value when done with it. Regexp* Regexp::Simplify() { CoalesceWalker cw; Regexp* cre = cw.Walk(this, NULL); if (cre == NULL) return cre; SimplifyWalker sw; Regexp* sre = sw.Walk(cre, NULL); cre->Decref(); return sre; } #define Simplify DontCallSimplify // Avoid accidental recursion // Utility function for PostVisit implementations that compares re->sub() with // child_args to determine whether any child_args changed. In the common case, // where nothing changed, calls Decref() for all child_args and returns false, // so PostVisit must return re->Incref(). Otherwise, returns true. static bool ChildArgsChanged(Regexp* re, Regexp** child_args) { for (int i = 0; i < re->nsub(); i++) { Regexp* sub = re->sub()[i]; Regexp* newsub = child_args[i]; if (newsub != sub) return true; } for (int i = 0; i < re->nsub(); i++) { Regexp* newsub = child_args[i]; newsub->Decref(); } return false; } Regexp* CoalesceWalker::Copy(Regexp* re) { return re->Incref(); } Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { // This should never be called, since we use Walk and not // WalkExponential. LOG(DFATAL) << "CoalesceWalker::ShortVisit called"; return re->Incref(); } Regexp* CoalesceWalker::PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args) { if (re->nsub() == 0) return re->Incref(); if (re->op() != kRegexpConcat) { if (!ChildArgsChanged(re, child_args)) return re->Incref(); // Something changed. Build a new op. Regexp* nre = new Regexp(re->op(), re->parse_flags()); nre->AllocSub(re->nsub()); Regexp** nre_subs = nre->sub(); for (int i = 0; i < re->nsub(); i++) nre_subs[i] = child_args[i]; // Repeats and Captures have additional data that must be copied. if (re->op() == kRegexpRepeat) { nre->min_ = re->min(); nre->max_ = re->max(); } else if (re->op() == kRegexpCapture) { nre->cap_ = re->cap(); } return nre; } bool can_coalesce = false; for (int i = 0; i < re->nsub(); i++) { if (i+1 < re->nsub() && CanCoalesce(child_args[i], child_args[i+1])) { can_coalesce = true; break; } } if (!can_coalesce) { if (!ChildArgsChanged(re, child_args)) return re->Incref(); // Something changed. Build a new op. Regexp* nre = new Regexp(re->op(), re->parse_flags()); nre->AllocSub(re->nsub()); Regexp** nre_subs = nre->sub(); for (int i = 0; i < re->nsub(); i++) nre_subs[i] = child_args[i]; return nre; } for (int i = 0; i < re->nsub(); i++) { if (i+1 < re->nsub() && CanCoalesce(child_args[i], child_args[i+1])) DoCoalesce(&child_args[i], &child_args[i+1]); } // Determine how many empty matches were left by DoCoalesce. int n = 0; for (int i = n; i < re->nsub(); i++) { if (child_args[i]->op() == kRegexpEmptyMatch) n++; } // Build a new op. Regexp* nre = new Regexp(re->op(), re->parse_flags()); nre->AllocSub(re->nsub() - n); Regexp** nre_subs = nre->sub(); for (int i = 0, j = 0; i < re->nsub(); i++) { if (child_args[i]->op() == kRegexpEmptyMatch) { child_args[i]->Decref(); continue; } nre_subs[j] = child_args[i]; j++; } return nre; } bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) { // r1 must be a star/plus/quest/repeat of a literal, char class, any char or // any byte. if ((r1->op() == kRegexpStar || r1->op() == kRegexpPlus || r1->op() == kRegexpQuest || r1->op() == kRegexpRepeat) && (r1->sub()[0]->op() == kRegexpLiteral || r1->sub()[0]->op() == kRegexpCharClass || r1->sub()[0]->op() == kRegexpAnyChar || r1->sub()[0]->op() == kRegexpAnyByte)) { // r2 must be a star/plus/quest/repeat of the same literal, char class, // any char or any byte. if ((r2->op() == kRegexpStar || r2->op() == kRegexpPlus || r2->op() == kRegexpQuest || r2->op() == kRegexpRepeat) && Regexp::Equal(r1->sub()[0], r2->sub()[0]) && // The parse flags must be consistent. ((r1->parse_flags() & Regexp::NonGreedy) == (r2->parse_flags() & Regexp::NonGreedy))) { return true; } // ... OR an occurrence of that literal, char class, any char or any byte if (Regexp::Equal(r1->sub()[0], r2)) { return true; } // ... OR a literal string that begins with that literal. if (r1->sub()[0]->op() == kRegexpLiteral && r2->op() == kRegexpLiteralString && r2->runes()[0] == r1->sub()[0]->rune() && // The parse flags must be consistent. ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) == (r2->parse_flags() & Regexp::FoldCase))) { return true; } } return false; } void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { Regexp* r1 = *r1ptr; Regexp* r2 = *r2ptr; Regexp* nre = Regexp::Repeat( r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0); switch (r1->op()) { case kRegexpStar: nre->min_ = 0; nre->max_ = -1; break; case kRegexpPlus: nre->min_ = 1; nre->max_ = -1; break; case kRegexpQuest: nre->min_ = 0; nre->max_ = 1; break; case kRegexpRepeat: nre->min_ = r1->min(); nre->max_ = r1->max(); break; default: LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); nre->Decref(); return; } switch (r2->op()) { case kRegexpStar: nre->max_ = -1; goto LeaveEmpty; case kRegexpPlus: nre->min_++; nre->max_ = -1; goto LeaveEmpty; case kRegexpQuest: if (nre->max() != -1) nre->max_++; goto LeaveEmpty; case kRegexpRepeat: nre->min_ += r2->min(); if (r2->max() == -1) nre->max_ = -1; else if (nre->max() != -1) nre->max_ += r2->max(); goto LeaveEmpty; case kRegexpLiteral: case kRegexpCharClass: case kRegexpAnyChar: case kRegexpAnyByte: nre->min_++; if (nre->max() != -1) nre->max_++; goto LeaveEmpty; LeaveEmpty: *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags); *r2ptr = nre; break; case kRegexpLiteralString: { Rune r = r1->sub()[0]->rune(); // Determine how much of the literal string is removed. // We know that we have at least one rune. :) int n = 1; while (n < r2->nrunes() && r2->runes()[n] == r) n++; nre->min_ += n; if (nre->max() != -1) nre->max_ += n; if (n == r2->nrunes()) goto LeaveEmpty; *r1ptr = nre; *r2ptr = Regexp::LiteralString( &r2->runes()[n], r2->nrunes() - n, r2->parse_flags()); break; } default: LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); nre->Decref(); return; } r1->Decref(); r2->Decref(); } Regexp* SimplifyWalker::Copy(Regexp* re) { return re->Incref(); } Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { // This should never be called, since we use Walk and not // WalkExponential. LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; return re->Incref(); } Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { if (re->simple()) { *stop = true; return re->Incref(); } return NULL; } Regexp* SimplifyWalker::PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args) { switch (re->op()) { case kRegexpNoMatch: case kRegexpEmptyMatch: case kRegexpLiteral: case kRegexpLiteralString: case kRegexpBeginLine: case kRegexpEndLine: case kRegexpBeginText: case kRegexpWordBoundary: case kRegexpNoWordBoundary: case kRegexpEndText: case kRegexpAnyChar: case kRegexpAnyByte: case kRegexpHaveMatch: // All these are always simple. re->simple_ = true; return re->Incref(); case kRegexpConcat: case kRegexpAlternate: { // These are simple as long as the subpieces are simple. if (!ChildArgsChanged(re, child_args)) { re->simple_ = true; return re->Incref(); } Regexp* nre = new Regexp(re->op(), re->parse_flags()); nre->AllocSub(re->nsub()); Regexp** nre_subs = nre->sub(); for (int i = 0; i < re->nsub(); i++) nre_subs[i] = child_args[i]; nre->simple_ = true; return nre; } case kRegexpCapture: { Regexp* newsub = child_args[0]; if (newsub == re->sub()[0]) { newsub->Decref(); re->simple_ = true; return re->Incref(); } Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); nre->AllocSub(1); nre->sub()[0] = newsub; nre->cap_ = re->cap(); nre->simple_ = true; return nre; } case kRegexpStar: case kRegexpPlus: case kRegexpQuest: { Regexp* newsub = child_args[0]; // Special case: repeat the empty string as much as // you want, but it's still the empty string. if (newsub->op() == kRegexpEmptyMatch) return newsub; // These are simple as long as the subpiece is simple. if (newsub == re->sub()[0]) { newsub->Decref(); re->simple_ = true; return re->Incref(); } // These are also idempotent if flags are constant. if (re->op() == newsub->op() && re->parse_flags() == newsub->parse_flags()) return newsub; Regexp* nre = new Regexp(re->op(), re->parse_flags()); nre->AllocSub(1); nre->sub()[0] = newsub; nre->simple_ = true; return nre; } case kRegexpRepeat: { Regexp* newsub = child_args[0]; // Special case: repeat the empty string as much as // you want, but it's still the empty string. if (newsub->op() == kRegexpEmptyMatch) return newsub; Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, re->parse_flags()); newsub->Decref(); nre->simple_ = true; return nre; } case kRegexpCharClass: { Regexp* nre = SimplifyCharClass(re); nre->simple_ = true; return nre; } } LOG(ERROR) << "Simplify case not handled: " << re->op(); return re->Incref(); } // Creates a concatenation of two Regexp, consuming refs to re1 and re2. // Returns a new Regexp, handing the ref to the caller. Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags parse_flags) { Regexp* re = new Regexp(kRegexpConcat, parse_flags); re->AllocSub(2); Regexp** subs = re->sub(); subs[0] = re1; subs[1] = re2; return re; } // Simplifies the expression re{min,max} in terms of *, +, and ?. // Returns a new regexp. Does not edit re. Does not consume reference to re. // Caller must Decref return value when done with it. // The result will *not* necessarily have the right capturing parens // if you call ToString() and re-parse it: (x){2} becomes (x)(x), // but in the Regexp* representation, both (x) are marked as $1. Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, Regexp::ParseFlags f) { // x{n,} means at least n matches of x. if (max == -1) { // Special case: x{0,} is x* if (min == 0) return Regexp::Star(re->Incref(), f); // Special case: x{1,} is x+ if (min == 1) return Regexp::Plus(re->Incref(), f); // General case: x{4,} is xxxx+ PODArray nre_subs(min); for (int i = 0; i < min-1; i++) nre_subs[i] = re->Incref(); nre_subs[min-1] = Regexp::Plus(re->Incref(), f); return Regexp::Concat(nre_subs.data(), min, f); } // Special case: (x){0} matches only empty string. if (min == 0 && max == 0) return new Regexp(kRegexpEmptyMatch, f); // Special case: x{1} is just x. if (min == 1 && max == 1) return re->Incref(); // General case: x{n,m} means n copies of x and m copies of x?. // The machine will do less work if we nest the final m copies, // so that x{2,5} = xx(x(x(x)?)?)? // Build leading prefix: xx. Capturing only on the last one. Regexp* nre = NULL; if (min > 0) { PODArray nre_subs(min); for (int i = 0; i < min; i++) nre_subs[i] = re->Incref(); nre = Regexp::Concat(nre_subs.data(), min, f); } // Build and attach suffix: (x(x(x)?)?)? if (max > min) { Regexp* suf = Regexp::Quest(re->Incref(), f); for (int i = min+1; i < max; i++) suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); if (nre == NULL) nre = suf; else nre = Concat2(nre, suf, f); } if (nre == NULL) { // Some degenerate case, like min > max, or min < max < 0. // This shouldn't happen, because the parser rejects such regexps. LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max; return new Regexp(kRegexpNoMatch, f); } return nre; } // Simplifies a character class. // Caller must Decref return value when done with it. Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { CharClass* cc = re->cc(); // Special cases if (cc->empty()) return new Regexp(kRegexpNoMatch, re->parse_flags()); if (cc->full()) return new Regexp(kRegexpAnyChar, re->parse_flags()); return re->Incref(); } } // namespace re2