1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression engine tester -- test all the implementations against each other.
6
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <string.h>
10 #include <string>
11
12 #include "util/util.h"
13 #include "util/flags.h"
14 #include "util/logging.h"
15 #include "util/strutil.h"
16 #include "re2/testing/tester.h"
17 #include "re2/prog.h"
18 #include "re2/re2.h"
19 #include "re2/regexp.h"
20
21 DEFINE_bool(dump_prog, false, "dump regexp program");
22 DEFINE_bool(log_okay, false, "log successful runs");
23 DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
24
25 DEFINE_int32(max_regexp_failures, 100,
26 "maximum number of regexp test failures (-1 = unlimited)");
27
28 DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
29
30 namespace re2 {
31
32 enum {
33 kMaxSubmatch = 1+16, // $0...$16
34 };
35
36 const char* engine_names[kEngineMax] = {
37 "Backtrack",
38 "NFA",
39 "DFA",
40 "DFA1",
41 "OnePass",
42 "BitState",
43 "RE2",
44 "RE2a",
45 "RE2b",
46 "PCRE",
47 };
48
49 // Returns the name of the engine.
EngineName(Engine e)50 static const char* EngineName(Engine e) {
51 CHECK_GE(e, 0);
52 CHECK_LT(e, arraysize(engine_names));
53 CHECK(engine_names[e] != NULL);
54 return engine_names[e];
55 }
56
57 // Returns bit mask of engines to use.
Engines()58 static uint32_t Engines() {
59 static bool did_parse = false;
60 static uint32_t cached_engines = 0;
61
62 if (did_parse)
63 return cached_engines;
64
65 if (FLAGS_regexp_engines.empty()) {
66 cached_engines = ~0;
67 } else {
68 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
69 if (FLAGS_regexp_engines.find(EngineName(i)) != string::npos)
70 cached_engines |= 1<<i;
71 }
72
73 if (cached_engines == 0)
74 LOG(INFO) << "Warning: no engines enabled.";
75 if (!UsingPCRE)
76 cached_engines &= ~(1<<kEnginePCRE);
77 for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
78 if (cached_engines & (1<<i))
79 LOG(INFO) << EngineName(i) << " enabled";
80 }
81
82 did_parse = true;
83 return cached_engines;
84 }
85
86 // The result of running a match.
87 struct TestInstance::Result {
88 bool skipped; // test skipped: wasn't applicable
89 bool matched; // found a match
90 bool untrusted; // don't really trust the answer
91 bool have_submatch; // computed all submatch info
92 bool have_submatch0; // computed just submatch[0]
93 StringPiece submatch[kMaxSubmatch];
94 };
95
96 typedef TestInstance::Result Result;
97
98 // Formats a single capture range s in text in the form (a,b)
99 // where a and b are the starting and ending offsets of s in text.
FormatCapture(const StringPiece & text,const StringPiece & s)100 static string FormatCapture(const StringPiece& text, const StringPiece& s) {
101 if (s.begin() == NULL)
102 return "(?,?)";
103 return StringPrintf("(%td,%td)",
104 s.begin() - text.begin(), s.end() - text.begin());
105 }
106
107 // Returns whether text contains non-ASCII (>= 0x80) bytes.
NonASCII(const StringPiece & text)108 static bool NonASCII(const StringPiece& text) {
109 for (size_t i = 0; i < text.size(); i++)
110 if ((uint8_t)text[i] >= 0x80)
111 return true;
112 return false;
113 }
114
115 // Returns string representation of match kind.
FormatKind(Prog::MatchKind kind)116 static string FormatKind(Prog::MatchKind kind) {
117 switch (kind) {
118 case Prog::kFullMatch:
119 return "full match";
120 case Prog::kLongestMatch:
121 return "longest match";
122 case Prog::kFirstMatch:
123 return "first match";
124 case Prog::kManyMatch:
125 return "many match";
126 }
127 return "???";
128 }
129
130 // Returns string representation of anchor kind.
FormatAnchor(Prog::Anchor anchor)131 static string FormatAnchor(Prog::Anchor anchor) {
132 switch (anchor) {
133 case Prog::kAnchored:
134 return "anchored";
135 case Prog::kUnanchored:
136 return "unanchored";
137 }
138 return "???";
139 }
140
141 struct ParseMode {
142 Regexp::ParseFlags parse_flags;
143 string desc;
144 };
145
146 static const Regexp::ParseFlags single_line =
147 Regexp::LikePerl;
148 static const Regexp::ParseFlags multi_line =
149 static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
150
151 static ParseMode parse_modes[] = {
152 { single_line, "single-line" },
153 { single_line|Regexp::Latin1, "single-line, latin1" },
154 { multi_line, "multiline" },
155 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
156 { multi_line|Regexp::Latin1, "multiline, latin1" },
157 };
158
FormatMode(Regexp::ParseFlags flags)159 static string FormatMode(Regexp::ParseFlags flags) {
160 for (int i = 0; i < arraysize(parse_modes); i++)
161 if (parse_modes[i].parse_flags == flags)
162 return parse_modes[i].desc;
163 return StringPrintf("%#x", static_cast<uint32_t>(flags));
164 }
165
166 // Constructs and saves all the matching engines that
167 // will be required for the given tests.
TestInstance(const StringPiece & regexp_str,Prog::MatchKind kind,Regexp::ParseFlags flags)168 TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
169 Regexp::ParseFlags flags)
170 : regexp_str_(regexp_str),
171 kind_(kind),
172 flags_(flags),
173 error_(false),
174 regexp_(NULL),
175 num_captures_(0),
176 prog_(NULL),
177 rprog_(NULL),
178 re_(NULL),
179 re2_(NULL) {
180
181 VLOG(1) << CEscape(regexp_str);
182
183 // Compile regexp to prog.
184 // Always required - needed for backtracking (reference implementation).
185 RegexpStatus status;
186 regexp_ = Regexp::Parse(regexp_str, flags, &status);
187 if (regexp_ == NULL) {
188 LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
189 << " mode: " << FormatMode(flags);
190 error_ = true;
191 return;
192 }
193 num_captures_ = regexp_->NumCaptures();
194 prog_ = regexp_->CompileToProg(0);
195 if (prog_ == NULL) {
196 LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
197 error_ = true;
198 return;
199 }
200 if (FLAGS_dump_prog) {
201 LOG(INFO) << "Prog for "
202 << " regexp "
203 << CEscape(regexp_str_)
204 << " (" << FormatKind(kind_)
205 << ", " << FormatMode(flags_)
206 << ")\n"
207 << prog_->Dump();
208 }
209
210 // Compile regexp to reversed prog. Only needed for DFA engines.
211 if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
212 rprog_ = regexp_->CompileToReverseProg(0);
213 if (rprog_ == NULL) {
214 LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
215 error_ = true;
216 return;
217 }
218 if (FLAGS_dump_rprog)
219 LOG(INFO) << rprog_->Dump();
220 }
221
222 // Create re string that will be used for RE and RE2.
223 string re = string(regexp_str);
224 // Accomodate flags.
225 // Regexp::Latin1 will be accomodated below.
226 if (!(flags & Regexp::OneLine))
227 re = "(?m)" + re;
228 if (flags & Regexp::NonGreedy)
229 re = "(?U)" + re;
230 if (flags & Regexp::DotNL)
231 re = "(?s)" + re;
232
233 // Compile regexp to RE2.
234 if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
235 RE2::Options options;
236 if (flags & Regexp::Latin1)
237 options.set_encoding(RE2::Options::EncodingLatin1);
238 if (kind_ == Prog::kLongestMatch)
239 options.set_longest_match(true);
240 re2_ = new RE2(re, options);
241 if (!re2_->error().empty()) {
242 LOG(INFO) << "Cannot RE2: " << CEscape(re);
243 error_ = true;
244 return;
245 }
246 }
247
248 // Compile regexp to RE.
249 // PCRE as exposed by the RE interface isn't always usable.
250 // 1. It disagrees about handling of empty-string reptitions
251 // like matching (a*)* against "b". PCRE treats the (a*) as
252 // occurring once, while we treat it as occurring not at all.
253 // 2. It treats $ as this weird thing meaning end of string
254 // or before the \n at the end of the string.
255 // 3. It doesn't implement POSIX leftmost-longest matching.
256 // 4. It lets \s match vertical tab.
257 // MimicsPCRE() detects 1 and 2.
258 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
259 kind_ != Prog::kLongestMatch) {
260 PCRE_Options o;
261 o.set_option(PCRE::UTF8);
262 if (flags & Regexp::Latin1)
263 o.set_option(PCRE::None);
264 // PCRE has interface bug keeping us from finding $0, so
265 // add one more layer of parens.
266 re_ = new PCRE("("+re+")", o);
267 if (!re_->error().empty()) {
268 LOG(INFO) << "Cannot PCRE: " << CEscape(re);
269 error_ = true;
270 return;
271 }
272 }
273 }
274
~TestInstance()275 TestInstance::~TestInstance() {
276 if (regexp_)
277 regexp_->Decref();
278 delete prog_;
279 delete rprog_;
280 delete re_;
281 delete re2_;
282 }
283
284 // Runs a single search using the named engine type.
285 // This interface hides all the irregularities of the various
286 // engine interfaces from the rest of this file.
RunSearch(Engine type,const StringPiece & orig_text,const StringPiece & orig_context,Prog::Anchor anchor,Result * result)287 void TestInstance::RunSearch(Engine type,
288 const StringPiece& orig_text,
289 const StringPiece& orig_context,
290 Prog::Anchor anchor,
291 Result* result) {
292 // Result is not trivial, so we cannot freely clear it with memset(3),
293 // but zeroing objects like so is safe and expedient for our purposes.
294 memset(reinterpret_cast<void*>(result), 0, sizeof *result);
295 if (regexp_ == NULL) {
296 result->skipped = true;
297 return;
298 }
299 int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
300 if (nsubmatch > kMaxSubmatch)
301 nsubmatch = kMaxSubmatch;
302
303 StringPiece text = orig_text;
304 StringPiece context = orig_context;
305
306 switch (type) {
307 default:
308 LOG(FATAL) << "Bad RunSearch type: " << (int)type;
309
310 case kEngineBacktrack:
311 if (prog_ == NULL) {
312 result->skipped = true;
313 break;
314 }
315 result->matched =
316 prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
317 result->submatch, nsubmatch);
318 result->have_submatch = true;
319 break;
320
321 case kEngineNFA:
322 if (prog_ == NULL) {
323 result->skipped = true;
324 break;
325 }
326 result->matched =
327 prog_->SearchNFA(text, context, anchor, kind_,
328 result->submatch, nsubmatch);
329 result->have_submatch = true;
330 break;
331
332 case kEngineDFA:
333 if (prog_ == NULL) {
334 result->skipped = true;
335 break;
336 }
337 result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
338 &result->skipped, NULL);
339 break;
340
341 case kEngineDFA1:
342 if (prog_ == NULL || rprog_ == NULL) {
343 result->skipped = true;
344 break;
345 }
346 result->matched =
347 prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
348 &result->skipped, NULL);
349 // If anchored, no need for second run,
350 // but do it anyway to find more bugs.
351 if (result->matched) {
352 if (!rprog_->SearchDFA(result->submatch[0], context,
353 Prog::kAnchored, Prog::kLongestMatch,
354 result->submatch,
355 &result->skipped, NULL)) {
356 LOG(ERROR) << "Reverse DFA inconsistency: "
357 << CEscape(regexp_str_)
358 << " on " << CEscape(text);
359 result->matched = false;
360 }
361 }
362 result->have_submatch0 = true;
363 break;
364
365 case kEngineOnePass:
366 if (prog_ == NULL ||
367 anchor == Prog::kUnanchored ||
368 !prog_->IsOnePass() ||
369 nsubmatch > Prog::kMaxOnePassCapture) {
370 result->skipped = true;
371 break;
372 }
373 result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
374 result->submatch, nsubmatch);
375 result->have_submatch = true;
376 break;
377
378 case kEngineBitState:
379 if (prog_ == NULL) {
380 result->skipped = true;
381 break;
382 }
383 result->matched = prog_->SearchBitState(text, context, anchor, kind_,
384 result->submatch, nsubmatch);
385 result->have_submatch = true;
386 break;
387
388 case kEngineRE2:
389 case kEngineRE2a:
390 case kEngineRE2b: {
391 if (!re2_ || text.end() != context.end()) {
392 result->skipped = true;
393 break;
394 }
395
396 RE2::Anchor re_anchor;
397 if (anchor == Prog::kAnchored)
398 re_anchor = RE2::ANCHOR_START;
399 else
400 re_anchor = RE2::UNANCHORED;
401 if (kind_ == Prog::kFullMatch)
402 re_anchor = RE2::ANCHOR_BOTH;
403
404 result->matched = re2_->Match(
405 context,
406 static_cast<size_t>(text.begin() - context.begin()),
407 static_cast<size_t>(text.end() - context.begin()),
408 re_anchor,
409 result->submatch,
410 nsubmatch);
411 result->have_submatch = nsubmatch > 0;
412 break;
413 }
414
415 case kEnginePCRE: {
416 if (!re_ || text.begin() != context.begin() ||
417 text.end() != context.end()) {
418 result->skipped = true;
419 break;
420 }
421
422 // In Perl/PCRE, \v matches any character considered vertical
423 // whitespace, not just vertical tab. Regexp::MimicsPCRE() is
424 // unable to handle all cases of this, unfortunately, so just
425 // catch them here. :(
426 if (regexp_str_.find("\\v") != StringPiece::npos &&
427 (text.find('\n') != StringPiece::npos ||
428 text.find('\f') != StringPiece::npos ||
429 text.find('\r') != StringPiece::npos)) {
430 result->skipped = true;
431 break;
432 }
433
434 // PCRE 8.34 or so started allowing vertical tab to match \s,
435 // following a change made in Perl 5.18. RE2 does not.
436 if ((regexp_str_.find("\\s") != StringPiece::npos ||
437 regexp_str_.find("\\S") != StringPiece::npos) &&
438 text.find('\v') != StringPiece::npos) {
439 result->skipped = true;
440 break;
441 }
442
443 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
444 PCRE::Arg *a = new PCRE::Arg[nsubmatch];
445 for (int i = 0; i < nsubmatch; i++) {
446 a[i] = PCRE::Arg(&result->submatch[i]);
447 argptr[i] = &a[i];
448 }
449 size_t consumed;
450 PCRE::Anchor pcre_anchor;
451 if (anchor == Prog::kAnchored)
452 pcre_anchor = PCRE::ANCHOR_START;
453 else
454 pcre_anchor = PCRE::UNANCHORED;
455 if (kind_ == Prog::kFullMatch)
456 pcre_anchor = PCRE::ANCHOR_BOTH;
457 re_->ClearHitLimit();
458 result->matched =
459 re_->DoMatch(text,
460 pcre_anchor,
461 &consumed,
462 argptr, nsubmatch);
463 if (re_->HitLimit()) {
464 result->untrusted = true;
465 delete[] argptr;
466 delete[] a;
467 break;
468 }
469 result->have_submatch = true;
470 delete[] argptr;
471 delete[] a;
472 break;
473 }
474 }
475
476 if (!result->matched)
477 memset(result->submatch, 0, sizeof result->submatch);
478 }
479
480 // Checks whether r is okay given that correct is the right answer.
481 // Specifically, r's answers have to match (but it doesn't have to
482 // claim to have all the answers).
ResultOkay(const Result & r,const Result & correct)483 static bool ResultOkay(const Result& r, const Result& correct) {
484 if (r.skipped)
485 return true;
486 if (r.matched != correct.matched)
487 return false;
488 if (r.have_submatch || r.have_submatch0) {
489 for (int i = 0; i < kMaxSubmatch; i++) {
490 if (correct.submatch[i].begin() != r.submatch[i].begin() ||
491 correct.submatch[i].size() != r.submatch[i].size())
492 return false;
493 if (!r.have_submatch)
494 break;
495 }
496 }
497 return true;
498 }
499
500 // Runs a single test.
RunCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)501 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
502 Prog::Anchor anchor) {
503 // Backtracking is the gold standard.
504 Result correct;
505 RunSearch(kEngineBacktrack, text, context, anchor, &correct);
506 if (correct.skipped) {
507 if (regexp_ == NULL)
508 return true;
509 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
510 << " " << FormatMode(flags_);
511 return false;
512 }
513 VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
514 << " text " << CEscape(text)
515 << " (" << FormatKind(kind_)
516 << ", " << FormatAnchor(anchor)
517 << ", " << FormatMode(flags_)
518 << ")";
519
520 // Compare the others.
521 bool all_okay = true;
522 for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
523 if (!(Engines() & (1<<i)))
524 continue;
525
526 Result r;
527 RunSearch(i, text, context, anchor, &r);
528 if (ResultOkay(r, correct)) {
529 if (FLAGS_log_okay)
530 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
531 continue;
532 }
533
534 // We disagree with PCRE on the meaning of some Unicode matches.
535 // In particular, we treat non-ASCII UTF-8 as non-word characters.
536 // We also treat "empty" character sets like [^\w\W] as being
537 // impossible to match, while PCRE apparently excludes some code
538 // points (e.g., 0x0080) from both \w and \W.
539 if (i == kEnginePCRE && NonASCII(text))
540 continue;
541
542 if (!r.untrusted)
543 all_okay = false;
544
545 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
546 context, anchor);
547 if (r.matched != correct.matched) {
548 if (r.matched) {
549 LOG(INFO) << " Should not match (but does).";
550 } else {
551 LOG(INFO) << " Should match (but does not).";
552 continue;
553 }
554 }
555 for (int i = 0; i < 1+num_captures_; i++) {
556 if (r.submatch[i].begin() != correct.submatch[i].begin() ||
557 r.submatch[i].end() != correct.submatch[i].end()) {
558 LOG(INFO) <<
559 StringPrintf(" $%d: should be %s is %s",
560 i,
561 FormatCapture(text, correct.submatch[i]).c_str(),
562 FormatCapture(text, r.submatch[i]).c_str());
563 } else {
564 LOG(INFO) <<
565 StringPrintf(" $%d: %s ok", i,
566 FormatCapture(text, r.submatch[i]).c_str());
567 }
568 }
569 }
570
571 if (!all_okay) {
572 if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
573 LOG(QFATAL) << "Too many regexp failures.";
574 }
575
576 return all_okay;
577 }
578
LogMatch(const char * prefix,Engine e,const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)579 void TestInstance::LogMatch(const char* prefix, Engine e,
580 const StringPiece& text, const StringPiece& context,
581 Prog::Anchor anchor) {
582 LOG(INFO) << prefix
583 << EngineName(e)
584 << " regexp "
585 << CEscape(regexp_str_)
586 << " "
587 << CEscape(regexp_->ToString())
588 << " text "
589 << CEscape(text)
590 << " ("
591 << text.begin() - context.begin()
592 << ","
593 << text.end() - context.begin()
594 << ") of context "
595 << CEscape(context)
596 << " (" << FormatKind(kind_)
597 << ", " << FormatAnchor(anchor)
598 << ", " << FormatMode(flags_)
599 << ")";
600 }
601
602 static Prog::MatchKind kinds[] = {
603 Prog::kFirstMatch,
604 Prog::kLongestMatch,
605 Prog::kFullMatch,
606 };
607
608 // Test all possible match kinds and parse modes.
Tester(const StringPiece & regexp)609 Tester::Tester(const StringPiece& regexp) {
610 error_ = false;
611 for (int i = 0; i < arraysize(kinds); i++) {
612 for (int j = 0; j < arraysize(parse_modes); j++) {
613 TestInstance* t = new TestInstance(regexp, kinds[i],
614 parse_modes[j].parse_flags);
615 error_ |= t->error();
616 v_.push_back(t);
617 }
618 }
619 }
620
~Tester()621 Tester::~Tester() {
622 for (size_t i = 0; i < v_.size(); i++)
623 delete v_[i];
624 }
625
TestCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)626 bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
627 Prog::Anchor anchor) {
628 bool okay = true;
629 for (size_t i = 0; i < v_.size(); i++)
630 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
631 return okay;
632 }
633
634 static Prog::Anchor anchors[] = {
635 Prog::kAnchored,
636 Prog::kUnanchored
637 };
638
TestInput(const StringPiece & text)639 bool Tester::TestInput(const StringPiece& text) {
640 bool okay = TestInputInContext(text, text);
641 if (text.size() > 0) {
642 StringPiece sp;
643 sp = text;
644 sp.remove_prefix(1);
645 okay &= TestInputInContext(sp, text);
646 sp = text;
647 sp.remove_suffix(1);
648 okay &= TestInputInContext(sp, text);
649 }
650 return okay;
651 }
652
TestInputInContext(const StringPiece & text,const StringPiece & context)653 bool Tester::TestInputInContext(const StringPiece& text,
654 const StringPiece& context) {
655 bool okay = true;
656 for (int i = 0; i < arraysize(anchors); i++)
657 okay &= TestCase(text, context, anchors[i]);
658 return okay;
659 }
660
TestRegexpOnText(const StringPiece & regexp,const StringPiece & text)661 bool TestRegexpOnText(const StringPiece& regexp,
662 const StringPiece& text) {
663 Tester t(regexp);
664 return t.TestInput(text);
665 }
666
667 } // namespace re2
668