xref: /aosp_15_r20/external/regex-re2/re2/testing/tester.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression engine tester -- test all the implementations against each other.
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <string.h>
10 #include <string>
11 
12 #include "util/util.h"
13 #include "util/flags.h"
14 #include "util/logging.h"
15 #include "util/strutil.h"
16 #include "re2/testing/tester.h"
17 #include "re2/prog.h"
18 #include "re2/re2.h"
19 #include "re2/regexp.h"
20 
21 DEFINE_bool(dump_prog, false, "dump regexp program");
22 DEFINE_bool(log_okay, false, "log successful runs");
23 DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
24 
25 DEFINE_int32(max_regexp_failures, 100,
26              "maximum number of regexp test failures (-1 = unlimited)");
27 
28 DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
29 
30 namespace re2 {
31 
32 enum {
33   kMaxSubmatch = 1+16,  // $0...$16
34 };
35 
36 const char* engine_names[kEngineMax] = {
37   "Backtrack",
38   "NFA",
39   "DFA",
40   "DFA1",
41   "OnePass",
42   "BitState",
43   "RE2",
44   "RE2a",
45   "RE2b",
46   "PCRE",
47 };
48 
49 // Returns the name of the engine.
EngineName(Engine e)50 static const char* EngineName(Engine e) {
51   CHECK_GE(e, 0);
52   CHECK_LT(e, arraysize(engine_names));
53   CHECK(engine_names[e] != NULL);
54   return engine_names[e];
55 }
56 
57 // Returns bit mask of engines to use.
Engines()58 static uint32_t Engines() {
59   static bool did_parse = false;
60   static uint32_t cached_engines = 0;
61 
62   if (did_parse)
63     return cached_engines;
64 
65   if (FLAGS_regexp_engines.empty()) {
66     cached_engines = ~0;
67   } else {
68     for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
69       if (FLAGS_regexp_engines.find(EngineName(i)) != string::npos)
70         cached_engines |= 1<<i;
71   }
72 
73   if (cached_engines == 0)
74     LOG(INFO) << "Warning: no engines enabled.";
75   if (!UsingPCRE)
76     cached_engines &= ~(1<<kEnginePCRE);
77   for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
78     if (cached_engines & (1<<i))
79       LOG(INFO) << EngineName(i) << " enabled";
80   }
81 
82   did_parse = true;
83   return cached_engines;
84 }
85 
86 // The result of running a match.
87 struct TestInstance::Result {
88   bool skipped;         // test skipped: wasn't applicable
89   bool matched;         // found a match
90   bool untrusted;       // don't really trust the answer
91   bool have_submatch;   // computed all submatch info
92   bool have_submatch0;  // computed just submatch[0]
93   StringPiece submatch[kMaxSubmatch];
94 };
95 
96 typedef TestInstance::Result Result;
97 
98 // Formats a single capture range s in text in the form (a,b)
99 // where a and b are the starting and ending offsets of s in text.
FormatCapture(const StringPiece & text,const StringPiece & s)100 static string FormatCapture(const StringPiece& text, const StringPiece& s) {
101   if (s.begin() == NULL)
102     return "(?,?)";
103   return StringPrintf("(%td,%td)",
104                       s.begin() - text.begin(), s.end() - text.begin());
105 }
106 
107 // Returns whether text contains non-ASCII (>= 0x80) bytes.
NonASCII(const StringPiece & text)108 static bool NonASCII(const StringPiece& text) {
109   for (size_t i = 0; i < text.size(); i++)
110     if ((uint8_t)text[i] >= 0x80)
111       return true;
112   return false;
113 }
114 
115 // Returns string representation of match kind.
FormatKind(Prog::MatchKind kind)116 static string FormatKind(Prog::MatchKind kind) {
117   switch (kind) {
118     case Prog::kFullMatch:
119       return "full match";
120     case Prog::kLongestMatch:
121       return "longest match";
122     case Prog::kFirstMatch:
123       return "first match";
124     case Prog::kManyMatch:
125       return "many match";
126   }
127   return "???";
128 }
129 
130 // Returns string representation of anchor kind.
FormatAnchor(Prog::Anchor anchor)131 static string FormatAnchor(Prog::Anchor anchor) {
132   switch (anchor) {
133     case Prog::kAnchored:
134       return "anchored";
135     case Prog::kUnanchored:
136       return "unanchored";
137   }
138   return "???";
139 }
140 
141 struct ParseMode {
142   Regexp::ParseFlags parse_flags;
143   string desc;
144 };
145 
146 static const Regexp::ParseFlags single_line =
147   Regexp::LikePerl;
148 static const Regexp::ParseFlags multi_line =
149   static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
150 
151 static ParseMode parse_modes[] = {
152   { single_line,                   "single-line"          },
153   { single_line|Regexp::Latin1,    "single-line, latin1"  },
154   { multi_line,                    "multiline"            },
155   { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
156   { multi_line|Regexp::Latin1,     "multiline, latin1"    },
157 };
158 
FormatMode(Regexp::ParseFlags flags)159 static string FormatMode(Regexp::ParseFlags flags) {
160   for (int i = 0; i < arraysize(parse_modes); i++)
161     if (parse_modes[i].parse_flags == flags)
162       return parse_modes[i].desc;
163   return StringPrintf("%#x", static_cast<uint32_t>(flags));
164 }
165 
166 // Constructs and saves all the matching engines that
167 // will be required for the given tests.
TestInstance(const StringPiece & regexp_str,Prog::MatchKind kind,Regexp::ParseFlags flags)168 TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
169                            Regexp::ParseFlags flags)
170   : regexp_str_(regexp_str),
171     kind_(kind),
172     flags_(flags),
173     error_(false),
174     regexp_(NULL),
175     num_captures_(0),
176     prog_(NULL),
177     rprog_(NULL),
178     re_(NULL),
179     re2_(NULL) {
180 
181   VLOG(1) << CEscape(regexp_str);
182 
183   // Compile regexp to prog.
184   // Always required - needed for backtracking (reference implementation).
185   RegexpStatus status;
186   regexp_ = Regexp::Parse(regexp_str, flags, &status);
187   if (regexp_ == NULL) {
188     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
189               << " mode: " << FormatMode(flags);
190     error_ = true;
191     return;
192   }
193   num_captures_ = regexp_->NumCaptures();
194   prog_ = regexp_->CompileToProg(0);
195   if (prog_ == NULL) {
196     LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
197     error_ = true;
198     return;
199   }
200   if (FLAGS_dump_prog) {
201     LOG(INFO) << "Prog for "
202               << " regexp "
203               << CEscape(regexp_str_)
204               << " (" << FormatKind(kind_)
205               << ", " << FormatMode(flags_)
206               << ")\n"
207               << prog_->Dump();
208   }
209 
210   // Compile regexp to reversed prog.  Only needed for DFA engines.
211   if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
212     rprog_ = regexp_->CompileToReverseProg(0);
213     if (rprog_ == NULL) {
214       LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
215       error_ = true;
216       return;
217     }
218     if (FLAGS_dump_rprog)
219       LOG(INFO) << rprog_->Dump();
220   }
221 
222   // Create re string that will be used for RE and RE2.
223   string re = string(regexp_str);
224   // Accomodate flags.
225   // Regexp::Latin1 will be accomodated below.
226   if (!(flags & Regexp::OneLine))
227     re = "(?m)" + re;
228   if (flags & Regexp::NonGreedy)
229     re = "(?U)" + re;
230   if (flags & Regexp::DotNL)
231     re = "(?s)" + re;
232 
233   // Compile regexp to RE2.
234   if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
235     RE2::Options options;
236     if (flags & Regexp::Latin1)
237       options.set_encoding(RE2::Options::EncodingLatin1);
238     if (kind_ == Prog::kLongestMatch)
239       options.set_longest_match(true);
240     re2_ = new RE2(re, options);
241     if (!re2_->error().empty()) {
242       LOG(INFO) << "Cannot RE2: " << CEscape(re);
243       error_ = true;
244       return;
245     }
246   }
247 
248   // Compile regexp to RE.
249   // PCRE as exposed by the RE interface isn't always usable.
250   // 1. It disagrees about handling of empty-string reptitions
251   //    like matching (a*)* against "b".  PCRE treats the (a*) as
252   //    occurring once, while we treat it as occurring not at all.
253   // 2. It treats $ as this weird thing meaning end of string
254   //    or before the \n at the end of the string.
255   // 3. It doesn't implement POSIX leftmost-longest matching.
256   // 4. It lets \s match vertical tab.
257   // MimicsPCRE() detects 1 and 2.
258   if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
259       kind_ != Prog::kLongestMatch) {
260     PCRE_Options o;
261     o.set_option(PCRE::UTF8);
262     if (flags & Regexp::Latin1)
263       o.set_option(PCRE::None);
264     // PCRE has interface bug keeping us from finding $0, so
265     // add one more layer of parens.
266     re_ = new PCRE("("+re+")", o);
267     if (!re_->error().empty()) {
268       LOG(INFO) << "Cannot PCRE: " << CEscape(re);
269       error_ = true;
270       return;
271     }
272   }
273 }
274 
~TestInstance()275 TestInstance::~TestInstance() {
276   if (regexp_)
277     regexp_->Decref();
278   delete prog_;
279   delete rprog_;
280   delete re_;
281   delete re2_;
282 }
283 
284 // Runs a single search using the named engine type.
285 // This interface hides all the irregularities of the various
286 // engine interfaces from the rest of this file.
RunSearch(Engine type,const StringPiece & orig_text,const StringPiece & orig_context,Prog::Anchor anchor,Result * result)287 void TestInstance::RunSearch(Engine type,
288                              const StringPiece& orig_text,
289                              const StringPiece& orig_context,
290                              Prog::Anchor anchor,
291                              Result* result) {
292   // Result is not trivial, so we cannot freely clear it with memset(3),
293   // but zeroing objects like so is safe and expedient for our purposes.
294   memset(reinterpret_cast<void*>(result), 0, sizeof *result);
295   if (regexp_ == NULL) {
296     result->skipped = true;
297     return;
298   }
299   int nsubmatch = 1 + num_captures_;  // NumCaptures doesn't count $0
300   if (nsubmatch > kMaxSubmatch)
301     nsubmatch = kMaxSubmatch;
302 
303   StringPiece text = orig_text;
304   StringPiece context = orig_context;
305 
306   switch (type) {
307     default:
308       LOG(FATAL) << "Bad RunSearch type: " << (int)type;
309 
310     case kEngineBacktrack:
311       if (prog_ == NULL) {
312         result->skipped = true;
313         break;
314       }
315       result->matched =
316         prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
317                                      result->submatch, nsubmatch);
318       result->have_submatch = true;
319       break;
320 
321     case kEngineNFA:
322       if (prog_ == NULL) {
323         result->skipped = true;
324         break;
325       }
326       result->matched =
327         prog_->SearchNFA(text, context, anchor, kind_,
328                         result->submatch, nsubmatch);
329       result->have_submatch = true;
330       break;
331 
332     case kEngineDFA:
333       if (prog_ == NULL) {
334         result->skipped = true;
335         break;
336       }
337       result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
338                                          &result->skipped, NULL);
339       break;
340 
341     case kEngineDFA1:
342       if (prog_ == NULL || rprog_ == NULL) {
343         result->skipped = true;
344         break;
345       }
346       result->matched =
347         prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
348                          &result->skipped, NULL);
349       // If anchored, no need for second run,
350       // but do it anyway to find more bugs.
351       if (result->matched) {
352         if (!rprog_->SearchDFA(result->submatch[0], context,
353                                Prog::kAnchored, Prog::kLongestMatch,
354                                result->submatch,
355                                &result->skipped, NULL)) {
356           LOG(ERROR) << "Reverse DFA inconsistency: "
357                      << CEscape(regexp_str_)
358                      << " on " << CEscape(text);
359           result->matched = false;
360         }
361       }
362       result->have_submatch0 = true;
363       break;
364 
365     case kEngineOnePass:
366       if (prog_ == NULL ||
367           anchor == Prog::kUnanchored ||
368           !prog_->IsOnePass() ||
369           nsubmatch > Prog::kMaxOnePassCapture) {
370         result->skipped = true;
371         break;
372       }
373       result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
374                                       result->submatch, nsubmatch);
375       result->have_submatch = true;
376       break;
377 
378     case kEngineBitState:
379       if (prog_ == NULL) {
380         result->skipped = true;
381         break;
382       }
383       result->matched = prog_->SearchBitState(text, context, anchor, kind_,
384                                               result->submatch, nsubmatch);
385       result->have_submatch = true;
386       break;
387 
388     case kEngineRE2:
389     case kEngineRE2a:
390     case kEngineRE2b: {
391       if (!re2_ || text.end() != context.end()) {
392         result->skipped = true;
393         break;
394       }
395 
396       RE2::Anchor re_anchor;
397       if (anchor == Prog::kAnchored)
398         re_anchor = RE2::ANCHOR_START;
399       else
400         re_anchor = RE2::UNANCHORED;
401       if (kind_ == Prog::kFullMatch)
402         re_anchor = RE2::ANCHOR_BOTH;
403 
404       result->matched = re2_->Match(
405           context,
406           static_cast<size_t>(text.begin() - context.begin()),
407           static_cast<size_t>(text.end() - context.begin()),
408           re_anchor,
409           result->submatch,
410           nsubmatch);
411       result->have_submatch = nsubmatch > 0;
412       break;
413     }
414 
415     case kEnginePCRE: {
416       if (!re_ || text.begin() != context.begin() ||
417           text.end() != context.end()) {
418         result->skipped = true;
419         break;
420       }
421 
422       // In Perl/PCRE, \v matches any character considered vertical
423       // whitespace, not just vertical tab. Regexp::MimicsPCRE() is
424       // unable to handle all cases of this, unfortunately, so just
425       // catch them here. :(
426       if (regexp_str_.find("\\v") != StringPiece::npos &&
427           (text.find('\n') != StringPiece::npos ||
428            text.find('\f') != StringPiece::npos ||
429            text.find('\r') != StringPiece::npos)) {
430         result->skipped = true;
431         break;
432       }
433 
434       // PCRE 8.34 or so started allowing vertical tab to match \s,
435       // following a change made in Perl 5.18. RE2 does not.
436       if ((regexp_str_.find("\\s") != StringPiece::npos ||
437            regexp_str_.find("\\S") != StringPiece::npos) &&
438           text.find('\v') != StringPiece::npos) {
439         result->skipped = true;
440         break;
441       }
442 
443       const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
444       PCRE::Arg *a = new PCRE::Arg[nsubmatch];
445       for (int i = 0; i < nsubmatch; i++) {
446         a[i] = PCRE::Arg(&result->submatch[i]);
447         argptr[i] = &a[i];
448       }
449       size_t consumed;
450       PCRE::Anchor pcre_anchor;
451       if (anchor == Prog::kAnchored)
452         pcre_anchor = PCRE::ANCHOR_START;
453       else
454         pcre_anchor = PCRE::UNANCHORED;
455       if (kind_ == Prog::kFullMatch)
456         pcre_anchor = PCRE::ANCHOR_BOTH;
457       re_->ClearHitLimit();
458       result->matched =
459         re_->DoMatch(text,
460                      pcre_anchor,
461                      &consumed,
462                      argptr, nsubmatch);
463       if (re_->HitLimit()) {
464         result->untrusted = true;
465         delete[] argptr;
466         delete[] a;
467         break;
468       }
469       result->have_submatch = true;
470       delete[] argptr;
471       delete[] a;
472       break;
473     }
474   }
475 
476   if (!result->matched)
477     memset(result->submatch, 0, sizeof result->submatch);
478 }
479 
480 // Checks whether r is okay given that correct is the right answer.
481 // Specifically, r's answers have to match (but it doesn't have to
482 // claim to have all the answers).
ResultOkay(const Result & r,const Result & correct)483 static bool ResultOkay(const Result& r, const Result& correct) {
484   if (r.skipped)
485     return true;
486   if (r.matched != correct.matched)
487     return false;
488   if (r.have_submatch || r.have_submatch0) {
489     for (int i = 0; i < kMaxSubmatch; i++) {
490       if (correct.submatch[i].begin() != r.submatch[i].begin() ||
491           correct.submatch[i].size() != r.submatch[i].size())
492         return false;
493       if (!r.have_submatch)
494         break;
495     }
496   }
497   return true;
498 }
499 
500 // Runs a single test.
RunCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)501 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
502                            Prog::Anchor anchor) {
503   // Backtracking is the gold standard.
504   Result correct;
505   RunSearch(kEngineBacktrack, text, context, anchor, &correct);
506   if (correct.skipped) {
507     if (regexp_ == NULL)
508       return true;
509     LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
510                << " " << FormatMode(flags_);
511     return false;
512   }
513   VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
514           << " text " << CEscape(text)
515           << " (" << FormatKind(kind_)
516           << ", " << FormatAnchor(anchor)
517           << ", " << FormatMode(flags_)
518           << ")";
519 
520   // Compare the others.
521   bool all_okay = true;
522   for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
523     if (!(Engines() & (1<<i)))
524       continue;
525 
526     Result r;
527     RunSearch(i, text, context, anchor, &r);
528     if (ResultOkay(r, correct)) {
529       if (FLAGS_log_okay)
530         LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
531       continue;
532     }
533 
534     // We disagree with PCRE on the meaning of some Unicode matches.
535     // In particular, we treat non-ASCII UTF-8 as non-word characters.
536     // We also treat "empty" character sets like [^\w\W] as being
537     // impossible to match, while PCRE apparently excludes some code
538     // points (e.g., 0x0080) from both \w and \W.
539     if (i == kEnginePCRE && NonASCII(text))
540       continue;
541 
542     if (!r.untrusted)
543       all_okay = false;
544 
545     LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
546              context, anchor);
547     if (r.matched != correct.matched) {
548       if (r.matched) {
549         LOG(INFO) << "   Should not match (but does).";
550       } else {
551         LOG(INFO) << "   Should match (but does not).";
552         continue;
553       }
554     }
555     for (int i = 0; i < 1+num_captures_; i++) {
556       if (r.submatch[i].begin() != correct.submatch[i].begin() ||
557           r.submatch[i].end() != correct.submatch[i].end()) {
558         LOG(INFO) <<
559           StringPrintf("   $%d: should be %s is %s",
560                        i,
561                        FormatCapture(text, correct.submatch[i]).c_str(),
562                        FormatCapture(text, r.submatch[i]).c_str());
563       } else {
564         LOG(INFO) <<
565           StringPrintf("   $%d: %s ok", i,
566                        FormatCapture(text, r.submatch[i]).c_str());
567       }
568     }
569   }
570 
571   if (!all_okay) {
572     if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
573       LOG(QFATAL) << "Too many regexp failures.";
574   }
575 
576   return all_okay;
577 }
578 
LogMatch(const char * prefix,Engine e,const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)579 void TestInstance::LogMatch(const char* prefix, Engine e,
580                             const StringPiece& text, const StringPiece& context,
581                             Prog::Anchor anchor) {
582   LOG(INFO) << prefix
583     << EngineName(e)
584     << " regexp "
585     << CEscape(regexp_str_)
586     << " "
587     << CEscape(regexp_->ToString())
588     << " text "
589     << CEscape(text)
590     << " ("
591     << text.begin() - context.begin()
592     << ","
593     << text.end() - context.begin()
594     << ") of context "
595     << CEscape(context)
596     << " (" << FormatKind(kind_)
597     << ", " << FormatAnchor(anchor)
598     << ", " << FormatMode(flags_)
599     << ")";
600 }
601 
602 static Prog::MatchKind kinds[] = {
603   Prog::kFirstMatch,
604   Prog::kLongestMatch,
605   Prog::kFullMatch,
606 };
607 
608 // Test all possible match kinds and parse modes.
Tester(const StringPiece & regexp)609 Tester::Tester(const StringPiece& regexp) {
610   error_ = false;
611   for (int i = 0; i < arraysize(kinds); i++) {
612     for (int j = 0; j < arraysize(parse_modes); j++) {
613       TestInstance* t = new TestInstance(regexp, kinds[i],
614                                          parse_modes[j].parse_flags);
615       error_ |= t->error();
616       v_.push_back(t);
617     }
618   }
619 }
620 
~Tester()621 Tester::~Tester() {
622   for (size_t i = 0; i < v_.size(); i++)
623     delete v_[i];
624 }
625 
TestCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)626 bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
627                          Prog::Anchor anchor) {
628   bool okay = true;
629   for (size_t i = 0; i < v_.size(); i++)
630     okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
631   return okay;
632 }
633 
634 static Prog::Anchor anchors[] = {
635   Prog::kAnchored,
636   Prog::kUnanchored
637 };
638 
TestInput(const StringPiece & text)639 bool Tester::TestInput(const StringPiece& text) {
640   bool okay = TestInputInContext(text, text);
641   if (text.size() > 0) {
642     StringPiece sp;
643     sp = text;
644     sp.remove_prefix(1);
645     okay &= TestInputInContext(sp, text);
646     sp = text;
647     sp.remove_suffix(1);
648     okay &= TestInputInContext(sp, text);
649   }
650   return okay;
651 }
652 
TestInputInContext(const StringPiece & text,const StringPiece & context)653 bool Tester::TestInputInContext(const StringPiece& text,
654                                 const StringPiece& context) {
655   bool okay = true;
656   for (int i = 0; i < arraysize(anchors); i++)
657     okay &= TestCase(text, context, anchors[i]);
658   return okay;
659 }
660 
TestRegexpOnText(const StringPiece & regexp,const StringPiece & text)661 bool TestRegexpOnText(const StringPiece& regexp,
662                       const StringPiece& text) {
663   Tester t(regexp);
664   return t.TestInput(text);
665 }
666 
667 }  // namespace re2
668