1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Rewrite POSIX and other features in re
6 // to use simple extended regular expression features.
7 // Also sort and simplify character classes.
8
9 #include <string>
10
11 #include "util/util.h"
12 #include "util/logging.h"
13 #include "util/pod_array.h"
14 #include "util/utf.h"
15 #include "re2/regexp.h"
16 #include "re2/walker-inl.h"
17
18 namespace re2 {
19
20 // Parses the regexp src and then simplifies it and sets *dst to the
21 // string representation of the simplified form. Returns true on success.
22 // Returns false and sets *error (if error != NULL) on error.
SimplifyRegexp(const StringPiece & src,ParseFlags flags,string * dst,RegexpStatus * status)23 bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
24 string* dst,
25 RegexpStatus* status) {
26 Regexp* re = Parse(src, flags, status);
27 if (re == NULL)
28 return false;
29 Regexp* sre = re->Simplify();
30 re->Decref();
31 if (sre == NULL) {
32 // Should not happen, since Simplify never fails.
33 LOG(ERROR) << "Simplify failed on " << src;
34 if (status) {
35 status->set_code(kRegexpInternalError);
36 status->set_error_arg(src);
37 }
38 return false;
39 }
40 *dst = sre->ToString();
41 sre->Decref();
42 return true;
43 }
44
45 // Assuming the simple_ flags on the children are accurate,
46 // is this Regexp* simple?
ComputeSimple()47 bool Regexp::ComputeSimple() {
48 Regexp** subs;
49 switch (op_) {
50 case kRegexpNoMatch:
51 case kRegexpEmptyMatch:
52 case kRegexpLiteral:
53 case kRegexpLiteralString:
54 case kRegexpBeginLine:
55 case kRegexpEndLine:
56 case kRegexpBeginText:
57 case kRegexpWordBoundary:
58 case kRegexpNoWordBoundary:
59 case kRegexpEndText:
60 case kRegexpAnyChar:
61 case kRegexpAnyByte:
62 case kRegexpHaveMatch:
63 return true;
64 case kRegexpConcat:
65 case kRegexpAlternate:
66 // These are simple as long as the subpieces are simple.
67 subs = sub();
68 for (int i = 0; i < nsub_; i++)
69 if (!subs[i]->simple())
70 return false;
71 return true;
72 case kRegexpCharClass:
73 // Simple as long as the char class is not empty, not full.
74 if (ccb_ != NULL)
75 return !ccb_->empty() && !ccb_->full();
76 return !cc_->empty() && !cc_->full();
77 case kRegexpCapture:
78 subs = sub();
79 return subs[0]->simple();
80 case kRegexpStar:
81 case kRegexpPlus:
82 case kRegexpQuest:
83 subs = sub();
84 if (!subs[0]->simple())
85 return false;
86 switch (subs[0]->op_) {
87 case kRegexpStar:
88 case kRegexpPlus:
89 case kRegexpQuest:
90 case kRegexpEmptyMatch:
91 case kRegexpNoMatch:
92 return false;
93 default:
94 break;
95 }
96 return true;
97 case kRegexpRepeat:
98 return false;
99 }
100 LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
101 return false;
102 }
103
104 // Walker subclass used by Simplify.
105 // Coalesces runs of star/plus/quest/repeat of the same literal along with any
106 // occurrences of that literal into repeats of that literal. It also works for
107 // char classes, any char and any byte.
108 // PostVisit creates the coalesced result, which should then be simplified.
109 class CoalesceWalker : public Regexp::Walker<Regexp*> {
110 public:
CoalesceWalker()111 CoalesceWalker() {}
112 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
113 Regexp** child_args, int nchild_args);
114 virtual Regexp* Copy(Regexp* re);
115 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
116
117 private:
118 // These functions are declared inside CoalesceWalker so that
119 // they can edit the private fields of the Regexps they construct.
120
121 // Returns true if r1 and r2 can be coalesced. In particular, ensures that
122 // the parse flags are consistent. (They will not be checked again later.)
123 static bool CanCoalesce(Regexp* r1, Regexp* r2);
124
125 // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
126 // will be empty match and the coalesced op. In other cases, where part of a
127 // literal string was removed to be coalesced, the array elements afterwards
128 // will be the coalesced op and the remainder of the literal string.
129 static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
130
131 CoalesceWalker(const CoalesceWalker&) = delete;
132 CoalesceWalker& operator=(const CoalesceWalker&) = delete;
133 };
134
135 // Walker subclass used by Simplify.
136 // The simplify walk is purely post-recursive: given the simplified children,
137 // PostVisit creates the simplified result.
138 // The child_args are simplified Regexp*s.
139 class SimplifyWalker : public Regexp::Walker<Regexp*> {
140 public:
SimplifyWalker()141 SimplifyWalker() {}
142 virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
143 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
144 Regexp** child_args, int nchild_args);
145 virtual Regexp* Copy(Regexp* re);
146 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
147
148 private:
149 // These functions are declared inside SimplifyWalker so that
150 // they can edit the private fields of the Regexps they construct.
151
152 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
153 // Caller must Decref return value when done with it.
154 static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
155
156 // Simplifies the expression re{min,max} in terms of *, +, and ?.
157 // Returns a new regexp. Does not edit re. Does not consume reference to re.
158 // Caller must Decref return value when done with it.
159 static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
160 Regexp::ParseFlags parse_flags);
161
162 // Simplifies a character class by expanding any named classes
163 // into rune ranges. Does not edit re. Does not consume ref to re.
164 // Caller must Decref return value when done with it.
165 static Regexp* SimplifyCharClass(Regexp* re);
166
167 SimplifyWalker(const SimplifyWalker&) = delete;
168 SimplifyWalker& operator=(const SimplifyWalker&) = delete;
169 };
170
171 // Simplifies a regular expression, returning a new regexp.
172 // The new regexp uses traditional Unix egrep features only,
173 // plus the Perl (?:) non-capturing parentheses.
174 // Otherwise, no POSIX or Perl additions. The new regexp
175 // captures exactly the same subexpressions (with the same indices)
176 // as the original.
177 // Does not edit current object.
178 // Caller must Decref() return value when done with it.
179
Simplify()180 Regexp* Regexp::Simplify() {
181 CoalesceWalker cw;
182 Regexp* cre = cw.Walk(this, NULL);
183 if (cre == NULL)
184 return cre;
185 SimplifyWalker sw;
186 Regexp* sre = sw.Walk(cre, NULL);
187 cre->Decref();
188 return sre;
189 }
190
191 #define Simplify DontCallSimplify // Avoid accidental recursion
192
193 // Utility function for PostVisit implementations that compares re->sub() with
194 // child_args to determine whether any child_args changed. In the common case,
195 // where nothing changed, calls Decref() for all child_args and returns false,
196 // so PostVisit must return re->Incref(). Otherwise, returns true.
ChildArgsChanged(Regexp * re,Regexp ** child_args)197 static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
198 for (int i = 0; i < re->nsub(); i++) {
199 Regexp* sub = re->sub()[i];
200 Regexp* newsub = child_args[i];
201 if (newsub != sub)
202 return true;
203 }
204 for (int i = 0; i < re->nsub(); i++) {
205 Regexp* newsub = child_args[i];
206 newsub->Decref();
207 }
208 return false;
209 }
210
Copy(Regexp * re)211 Regexp* CoalesceWalker::Copy(Regexp* re) {
212 return re->Incref();
213 }
214
ShortVisit(Regexp * re,Regexp * parent_arg)215 Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
216 // This should never be called, since we use Walk and not
217 // WalkExponential.
218 LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
219 return re->Incref();
220 }
221
PostVisit(Regexp * re,Regexp * parent_arg,Regexp * pre_arg,Regexp ** child_args,int nchild_args)222 Regexp* CoalesceWalker::PostVisit(Regexp* re,
223 Regexp* parent_arg,
224 Regexp* pre_arg,
225 Regexp** child_args,
226 int nchild_args) {
227 if (re->nsub() == 0)
228 return re->Incref();
229
230 if (re->op() != kRegexpConcat) {
231 if (!ChildArgsChanged(re, child_args))
232 return re->Incref();
233
234 // Something changed. Build a new op.
235 Regexp* nre = new Regexp(re->op(), re->parse_flags());
236 nre->AllocSub(re->nsub());
237 Regexp** nre_subs = nre->sub();
238 for (int i = 0; i < re->nsub(); i++)
239 nre_subs[i] = child_args[i];
240 // Repeats and Captures have additional data that must be copied.
241 if (re->op() == kRegexpRepeat) {
242 nre->min_ = re->min();
243 nre->max_ = re->max();
244 } else if (re->op() == kRegexpCapture) {
245 nre->cap_ = re->cap();
246 }
247 return nre;
248 }
249
250 bool can_coalesce = false;
251 for (int i = 0; i < re->nsub(); i++) {
252 if (i+1 < re->nsub() &&
253 CanCoalesce(child_args[i], child_args[i+1])) {
254 can_coalesce = true;
255 break;
256 }
257 }
258 if (!can_coalesce) {
259 if (!ChildArgsChanged(re, child_args))
260 return re->Incref();
261
262 // Something changed. Build a new op.
263 Regexp* nre = new Regexp(re->op(), re->parse_flags());
264 nre->AllocSub(re->nsub());
265 Regexp** nre_subs = nre->sub();
266 for (int i = 0; i < re->nsub(); i++)
267 nre_subs[i] = child_args[i];
268 return nre;
269 }
270
271 for (int i = 0; i < re->nsub(); i++) {
272 if (i+1 < re->nsub() &&
273 CanCoalesce(child_args[i], child_args[i+1]))
274 DoCoalesce(&child_args[i], &child_args[i+1]);
275 }
276 // Determine how many empty matches were left by DoCoalesce.
277 int n = 0;
278 for (int i = n; i < re->nsub(); i++) {
279 if (child_args[i]->op() == kRegexpEmptyMatch)
280 n++;
281 }
282 // Build a new op.
283 Regexp* nre = new Regexp(re->op(), re->parse_flags());
284 nre->AllocSub(re->nsub() - n);
285 Regexp** nre_subs = nre->sub();
286 for (int i = 0, j = 0; i < re->nsub(); i++) {
287 if (child_args[i]->op() == kRegexpEmptyMatch) {
288 child_args[i]->Decref();
289 continue;
290 }
291 nre_subs[j] = child_args[i];
292 j++;
293 }
294 return nre;
295 }
296
CanCoalesce(Regexp * r1,Regexp * r2)297 bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
298 // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
299 // any byte.
300 if ((r1->op() == kRegexpStar ||
301 r1->op() == kRegexpPlus ||
302 r1->op() == kRegexpQuest ||
303 r1->op() == kRegexpRepeat) &&
304 (r1->sub()[0]->op() == kRegexpLiteral ||
305 r1->sub()[0]->op() == kRegexpCharClass ||
306 r1->sub()[0]->op() == kRegexpAnyChar ||
307 r1->sub()[0]->op() == kRegexpAnyByte)) {
308 // r2 must be a star/plus/quest/repeat of the same literal, char class,
309 // any char or any byte.
310 if ((r2->op() == kRegexpStar ||
311 r2->op() == kRegexpPlus ||
312 r2->op() == kRegexpQuest ||
313 r2->op() == kRegexpRepeat) &&
314 Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
315 // The parse flags must be consistent.
316 ((r1->parse_flags() & Regexp::NonGreedy) ==
317 (r2->parse_flags() & Regexp::NonGreedy))) {
318 return true;
319 }
320 // ... OR an occurrence of that literal, char class, any char or any byte
321 if (Regexp::Equal(r1->sub()[0], r2)) {
322 return true;
323 }
324 // ... OR a literal string that begins with that literal.
325 if (r1->sub()[0]->op() == kRegexpLiteral &&
326 r2->op() == kRegexpLiteralString &&
327 r2->runes()[0] == r1->sub()[0]->rune() &&
328 // The parse flags must be consistent.
329 ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
330 (r2->parse_flags() & Regexp::FoldCase))) {
331 return true;
332 }
333 }
334 return false;
335 }
336
DoCoalesce(Regexp ** r1ptr,Regexp ** r2ptr)337 void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
338 Regexp* r1 = *r1ptr;
339 Regexp* r2 = *r2ptr;
340
341 Regexp* nre = Regexp::Repeat(
342 r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
343
344 switch (r1->op()) {
345 case kRegexpStar:
346 nre->min_ = 0;
347 nre->max_ = -1;
348 break;
349
350 case kRegexpPlus:
351 nre->min_ = 1;
352 nre->max_ = -1;
353 break;
354
355 case kRegexpQuest:
356 nre->min_ = 0;
357 nre->max_ = 1;
358 break;
359
360 case kRegexpRepeat:
361 nre->min_ = r1->min();
362 nre->max_ = r1->max();
363 break;
364
365 default:
366 LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
367 nre->Decref();
368 return;
369 }
370
371 switch (r2->op()) {
372 case kRegexpStar:
373 nre->max_ = -1;
374 goto LeaveEmpty;
375
376 case kRegexpPlus:
377 nre->min_++;
378 nre->max_ = -1;
379 goto LeaveEmpty;
380
381 case kRegexpQuest:
382 if (nre->max() != -1)
383 nre->max_++;
384 goto LeaveEmpty;
385
386 case kRegexpRepeat:
387 nre->min_ += r2->min();
388 if (r2->max() == -1)
389 nre->max_ = -1;
390 else if (nre->max() != -1)
391 nre->max_ += r2->max();
392 goto LeaveEmpty;
393
394 case kRegexpLiteral:
395 case kRegexpCharClass:
396 case kRegexpAnyChar:
397 case kRegexpAnyByte:
398 nre->min_++;
399 if (nre->max() != -1)
400 nre->max_++;
401 goto LeaveEmpty;
402
403 LeaveEmpty:
404 *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
405 *r2ptr = nre;
406 break;
407
408 case kRegexpLiteralString: {
409 Rune r = r1->sub()[0]->rune();
410 // Determine how much of the literal string is removed.
411 // We know that we have at least one rune. :)
412 int n = 1;
413 while (n < r2->nrunes() && r2->runes()[n] == r)
414 n++;
415 nre->min_ += n;
416 if (nre->max() != -1)
417 nre->max_ += n;
418 if (n == r2->nrunes())
419 goto LeaveEmpty;
420 *r1ptr = nre;
421 *r2ptr = Regexp::LiteralString(
422 &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
423 break;
424 }
425
426 default:
427 LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
428 nre->Decref();
429 return;
430 }
431
432 r1->Decref();
433 r2->Decref();
434 }
435
Copy(Regexp * re)436 Regexp* SimplifyWalker::Copy(Regexp* re) {
437 return re->Incref();
438 }
439
ShortVisit(Regexp * re,Regexp * parent_arg)440 Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
441 // This should never be called, since we use Walk and not
442 // WalkExponential.
443 LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
444 return re->Incref();
445 }
446
PreVisit(Regexp * re,Regexp * parent_arg,bool * stop)447 Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
448 if (re->simple()) {
449 *stop = true;
450 return re->Incref();
451 }
452 return NULL;
453 }
454
PostVisit(Regexp * re,Regexp * parent_arg,Regexp * pre_arg,Regexp ** child_args,int nchild_args)455 Regexp* SimplifyWalker::PostVisit(Regexp* re,
456 Regexp* parent_arg,
457 Regexp* pre_arg,
458 Regexp** child_args,
459 int nchild_args) {
460 switch (re->op()) {
461 case kRegexpNoMatch:
462 case kRegexpEmptyMatch:
463 case kRegexpLiteral:
464 case kRegexpLiteralString:
465 case kRegexpBeginLine:
466 case kRegexpEndLine:
467 case kRegexpBeginText:
468 case kRegexpWordBoundary:
469 case kRegexpNoWordBoundary:
470 case kRegexpEndText:
471 case kRegexpAnyChar:
472 case kRegexpAnyByte:
473 case kRegexpHaveMatch:
474 // All these are always simple.
475 re->simple_ = true;
476 return re->Incref();
477
478 case kRegexpConcat:
479 case kRegexpAlternate: {
480 // These are simple as long as the subpieces are simple.
481 if (!ChildArgsChanged(re, child_args)) {
482 re->simple_ = true;
483 return re->Incref();
484 }
485 Regexp* nre = new Regexp(re->op(), re->parse_flags());
486 nre->AllocSub(re->nsub());
487 Regexp** nre_subs = nre->sub();
488 for (int i = 0; i < re->nsub(); i++)
489 nre_subs[i] = child_args[i];
490 nre->simple_ = true;
491 return nre;
492 }
493
494 case kRegexpCapture: {
495 Regexp* newsub = child_args[0];
496 if (newsub == re->sub()[0]) {
497 newsub->Decref();
498 re->simple_ = true;
499 return re->Incref();
500 }
501 Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
502 nre->AllocSub(1);
503 nre->sub()[0] = newsub;
504 nre->cap_ = re->cap();
505 nre->simple_ = true;
506 return nre;
507 }
508
509 case kRegexpStar:
510 case kRegexpPlus:
511 case kRegexpQuest: {
512 Regexp* newsub = child_args[0];
513 // Special case: repeat the empty string as much as
514 // you want, but it's still the empty string.
515 if (newsub->op() == kRegexpEmptyMatch)
516 return newsub;
517
518 // These are simple as long as the subpiece is simple.
519 if (newsub == re->sub()[0]) {
520 newsub->Decref();
521 re->simple_ = true;
522 return re->Incref();
523 }
524
525 // These are also idempotent if flags are constant.
526 if (re->op() == newsub->op() &&
527 re->parse_flags() == newsub->parse_flags())
528 return newsub;
529
530 Regexp* nre = new Regexp(re->op(), re->parse_flags());
531 nre->AllocSub(1);
532 nre->sub()[0] = newsub;
533 nre->simple_ = true;
534 return nre;
535 }
536
537 case kRegexpRepeat: {
538 Regexp* newsub = child_args[0];
539 // Special case: repeat the empty string as much as
540 // you want, but it's still the empty string.
541 if (newsub->op() == kRegexpEmptyMatch)
542 return newsub;
543
544 Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
545 re->parse_flags());
546 newsub->Decref();
547 nre->simple_ = true;
548 return nre;
549 }
550
551 case kRegexpCharClass: {
552 Regexp* nre = SimplifyCharClass(re);
553 nre->simple_ = true;
554 return nre;
555 }
556 }
557
558 LOG(ERROR) << "Simplify case not handled: " << re->op();
559 return re->Incref();
560 }
561
562 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
563 // Returns a new Regexp, handing the ref to the caller.
Concat2(Regexp * re1,Regexp * re2,Regexp::ParseFlags parse_flags)564 Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
565 Regexp::ParseFlags parse_flags) {
566 Regexp* re = new Regexp(kRegexpConcat, parse_flags);
567 re->AllocSub(2);
568 Regexp** subs = re->sub();
569 subs[0] = re1;
570 subs[1] = re2;
571 return re;
572 }
573
574 // Simplifies the expression re{min,max} in terms of *, +, and ?.
575 // Returns a new regexp. Does not edit re. Does not consume reference to re.
576 // Caller must Decref return value when done with it.
577 // The result will *not* necessarily have the right capturing parens
578 // if you call ToString() and re-parse it: (x){2} becomes (x)(x),
579 // but in the Regexp* representation, both (x) are marked as $1.
SimplifyRepeat(Regexp * re,int min,int max,Regexp::ParseFlags f)580 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
581 Regexp::ParseFlags f) {
582 // x{n,} means at least n matches of x.
583 if (max == -1) {
584 // Special case: x{0,} is x*
585 if (min == 0)
586 return Regexp::Star(re->Incref(), f);
587
588 // Special case: x{1,} is x+
589 if (min == 1)
590 return Regexp::Plus(re->Incref(), f);
591
592 // General case: x{4,} is xxxx+
593 PODArray<Regexp*> nre_subs(min);
594 for (int i = 0; i < min-1; i++)
595 nre_subs[i] = re->Incref();
596 nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
597 return Regexp::Concat(nre_subs.data(), min, f);
598 }
599
600 // Special case: (x){0} matches only empty string.
601 if (min == 0 && max == 0)
602 return new Regexp(kRegexpEmptyMatch, f);
603
604 // Special case: x{1} is just x.
605 if (min == 1 && max == 1)
606 return re->Incref();
607
608 // General case: x{n,m} means n copies of x and m copies of x?.
609 // The machine will do less work if we nest the final m copies,
610 // so that x{2,5} = xx(x(x(x)?)?)?
611
612 // Build leading prefix: xx. Capturing only on the last one.
613 Regexp* nre = NULL;
614 if (min > 0) {
615 PODArray<Regexp*> nre_subs(min);
616 for (int i = 0; i < min; i++)
617 nre_subs[i] = re->Incref();
618 nre = Regexp::Concat(nre_subs.data(), min, f);
619 }
620
621 // Build and attach suffix: (x(x(x)?)?)?
622 if (max > min) {
623 Regexp* suf = Regexp::Quest(re->Incref(), f);
624 for (int i = min+1; i < max; i++)
625 suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
626 if (nre == NULL)
627 nre = suf;
628 else
629 nre = Concat2(nre, suf, f);
630 }
631
632 if (nre == NULL) {
633 // Some degenerate case, like min > max, or min < max < 0.
634 // This shouldn't happen, because the parser rejects such regexps.
635 LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
636 return new Regexp(kRegexpNoMatch, f);
637 }
638
639 return nre;
640 }
641
642 // Simplifies a character class.
643 // Caller must Decref return value when done with it.
SimplifyCharClass(Regexp * re)644 Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
645 CharClass* cc = re->cc();
646
647 // Special cases
648 if (cc->empty())
649 return new Regexp(kRegexpNoMatch, re->parse_flags());
650 if (cc->full())
651 return new Regexp(kRegexpAnyChar, re->parse_flags());
652
653 return re->Incref();
654 }
655
656 } // namespace re2
657