1*ccdc9c3eSSadaf Ebrahimi // Copyright 2008 The RE2 Authors. All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi
5*ccdc9c3eSSadaf Ebrahimi // Exhaustive testing of regular expression matching.
6*ccdc9c3eSSadaf Ebrahimi
7*ccdc9c3eSSadaf Ebrahimi // Each test picks an alphabet (e.g., "abc"), a maximum string length,
8*ccdc9c3eSSadaf Ebrahimi // a maximum regular expression length, and a maximum number of letters
9*ccdc9c3eSSadaf Ebrahimi // that can appear in the regular expression. Given these parameters,
10*ccdc9c3eSSadaf Ebrahimi // it tries every possible regular expression and string, verifying that
11*ccdc9c3eSSadaf Ebrahimi // the NFA, DFA, and a trivial backtracking implementation agree about
12*ccdc9c3eSSadaf Ebrahimi // the location of the match.
13*ccdc9c3eSSadaf Ebrahimi
14*ccdc9c3eSSadaf Ebrahimi #include <stdio.h>
15*ccdc9c3eSSadaf Ebrahimi
16*ccdc9c3eSSadaf Ebrahimi #include "util/test.h"
17*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
18*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
19*ccdc9c3eSSadaf Ebrahimi #include "re2/testing/exhaustive_tester.h"
20*ccdc9c3eSSadaf Ebrahimi #include "re2/testing/tester.h"
21*ccdc9c3eSSadaf Ebrahimi
22*ccdc9c3eSSadaf Ebrahimi // For target `log' in the Makefile.
23*ccdc9c3eSSadaf Ebrahimi #ifndef LOGGING
24*ccdc9c3eSSadaf Ebrahimi #define LOGGING 0
25*ccdc9c3eSSadaf Ebrahimi #endif
26*ccdc9c3eSSadaf Ebrahimi
27*ccdc9c3eSSadaf Ebrahimi DEFINE_bool(show_regexps, false, "show regexps during testing");
28*ccdc9c3eSSadaf Ebrahimi
29*ccdc9c3eSSadaf Ebrahimi DEFINE_int32(max_bad_regexp_inputs, 1,
30*ccdc9c3eSSadaf Ebrahimi "Stop testing a regular expression after finding this many "
31*ccdc9c3eSSadaf Ebrahimi "strings that break it.");
32*ccdc9c3eSSadaf Ebrahimi
33*ccdc9c3eSSadaf Ebrahimi namespace re2 {
34*ccdc9c3eSSadaf Ebrahimi
escape(const StringPiece & sp)35*ccdc9c3eSSadaf Ebrahimi static char* escape(const StringPiece& sp) {
36*ccdc9c3eSSadaf Ebrahimi static char buf[512];
37*ccdc9c3eSSadaf Ebrahimi char* p = buf;
38*ccdc9c3eSSadaf Ebrahimi *p++ = '\"';
39*ccdc9c3eSSadaf Ebrahimi for (size_t i = 0; i < sp.size(); i++) {
40*ccdc9c3eSSadaf Ebrahimi if(p+5 >= buf+sizeof buf)
41*ccdc9c3eSSadaf Ebrahimi LOG(FATAL) << "ExhaustiveTester escape: too long";
42*ccdc9c3eSSadaf Ebrahimi if(sp[i] == '\\' || sp[i] == '\"') {
43*ccdc9c3eSSadaf Ebrahimi *p++ = '\\';
44*ccdc9c3eSSadaf Ebrahimi *p++ = sp[i];
45*ccdc9c3eSSadaf Ebrahimi } else if(sp[i] == '\n') {
46*ccdc9c3eSSadaf Ebrahimi *p++ = '\\';
47*ccdc9c3eSSadaf Ebrahimi *p++ = 'n';
48*ccdc9c3eSSadaf Ebrahimi } else {
49*ccdc9c3eSSadaf Ebrahimi *p++ = sp[i];
50*ccdc9c3eSSadaf Ebrahimi }
51*ccdc9c3eSSadaf Ebrahimi }
52*ccdc9c3eSSadaf Ebrahimi *p++ = '\"';
53*ccdc9c3eSSadaf Ebrahimi *p = '\0';
54*ccdc9c3eSSadaf Ebrahimi return buf;
55*ccdc9c3eSSadaf Ebrahimi }
56*ccdc9c3eSSadaf Ebrahimi
PrintResult(const RE2 & re,const StringPiece & input,RE2::Anchor anchor,StringPiece * m,int n)57*ccdc9c3eSSadaf Ebrahimi static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) {
58*ccdc9c3eSSadaf Ebrahimi if (!re.Match(input, 0, input.size(), anchor, m, n)) {
59*ccdc9c3eSSadaf Ebrahimi printf("-");
60*ccdc9c3eSSadaf Ebrahimi return;
61*ccdc9c3eSSadaf Ebrahimi }
62*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < n; i++) {
63*ccdc9c3eSSadaf Ebrahimi if (i > 0)
64*ccdc9c3eSSadaf Ebrahimi printf(" ");
65*ccdc9c3eSSadaf Ebrahimi if (m[i].begin() == NULL)
66*ccdc9c3eSSadaf Ebrahimi printf("-");
67*ccdc9c3eSSadaf Ebrahimi else
68*ccdc9c3eSSadaf Ebrahimi printf("%td-%td",
69*ccdc9c3eSSadaf Ebrahimi m[i].begin() - input.begin(), m[i].end() - input.begin());
70*ccdc9c3eSSadaf Ebrahimi }
71*ccdc9c3eSSadaf Ebrahimi }
72*ccdc9c3eSSadaf Ebrahimi
73*ccdc9c3eSSadaf Ebrahimi // Processes a single generated regexp.
74*ccdc9c3eSSadaf Ebrahimi // Compiles it using Regexp interface and PCRE, and then
75*ccdc9c3eSSadaf Ebrahimi // checks that NFA, DFA, and PCRE all return the same results.
HandleRegexp(const string & const_regexp)76*ccdc9c3eSSadaf Ebrahimi void ExhaustiveTester::HandleRegexp(const string& const_regexp) {
77*ccdc9c3eSSadaf Ebrahimi regexps_++;
78*ccdc9c3eSSadaf Ebrahimi string regexp = const_regexp;
79*ccdc9c3eSSadaf Ebrahimi if (!topwrapper_.empty())
80*ccdc9c3eSSadaf Ebrahimi regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str());
81*ccdc9c3eSSadaf Ebrahimi
82*ccdc9c3eSSadaf Ebrahimi if (FLAGS_show_regexps) {
83*ccdc9c3eSSadaf Ebrahimi printf("\r%s", regexp.c_str());
84*ccdc9c3eSSadaf Ebrahimi fflush(stdout);
85*ccdc9c3eSSadaf Ebrahimi }
86*ccdc9c3eSSadaf Ebrahimi
87*ccdc9c3eSSadaf Ebrahimi if (LOGGING) {
88*ccdc9c3eSSadaf Ebrahimi // Write out test cases and answers for use in testing
89*ccdc9c3eSSadaf Ebrahimi // other implementations, such as Go's regexp package.
90*ccdc9c3eSSadaf Ebrahimi if (randomstrings_)
91*ccdc9c3eSSadaf Ebrahimi LOG(ERROR) << "Cannot log with random strings.";
92*ccdc9c3eSSadaf Ebrahimi if (regexps_ == 1) { // first
93*ccdc9c3eSSadaf Ebrahimi printf("strings\n");
94*ccdc9c3eSSadaf Ebrahimi strgen_.Reset();
95*ccdc9c3eSSadaf Ebrahimi while (strgen_.HasNext())
96*ccdc9c3eSSadaf Ebrahimi printf("%s\n", escape(strgen_.Next()));
97*ccdc9c3eSSadaf Ebrahimi printf("regexps\n");
98*ccdc9c3eSSadaf Ebrahimi }
99*ccdc9c3eSSadaf Ebrahimi printf("%s\n", escape(regexp));
100*ccdc9c3eSSadaf Ebrahimi
101*ccdc9c3eSSadaf Ebrahimi RE2 re(regexp);
102*ccdc9c3eSSadaf Ebrahimi RE2::Options longest;
103*ccdc9c3eSSadaf Ebrahimi longest.set_longest_match(true);
104*ccdc9c3eSSadaf Ebrahimi RE2 relongest(regexp, longest);
105*ccdc9c3eSSadaf Ebrahimi int ngroup = re.NumberOfCapturingGroups()+1;
106*ccdc9c3eSSadaf Ebrahimi StringPiece* group = new StringPiece[ngroup];
107*ccdc9c3eSSadaf Ebrahimi
108*ccdc9c3eSSadaf Ebrahimi strgen_.Reset();
109*ccdc9c3eSSadaf Ebrahimi while (strgen_.HasNext()) {
110*ccdc9c3eSSadaf Ebrahimi StringPiece input = strgen_.Next();
111*ccdc9c3eSSadaf Ebrahimi PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup);
112*ccdc9c3eSSadaf Ebrahimi printf(";");
113*ccdc9c3eSSadaf Ebrahimi PrintResult(re, input, RE2::UNANCHORED, group, ngroup);
114*ccdc9c3eSSadaf Ebrahimi printf(";");
115*ccdc9c3eSSadaf Ebrahimi PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup);
116*ccdc9c3eSSadaf Ebrahimi printf(";");
117*ccdc9c3eSSadaf Ebrahimi PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup);
118*ccdc9c3eSSadaf Ebrahimi printf("\n");
119*ccdc9c3eSSadaf Ebrahimi }
120*ccdc9c3eSSadaf Ebrahimi delete[] group;
121*ccdc9c3eSSadaf Ebrahimi return;
122*ccdc9c3eSSadaf Ebrahimi }
123*ccdc9c3eSSadaf Ebrahimi
124*ccdc9c3eSSadaf Ebrahimi Tester tester(regexp);
125*ccdc9c3eSSadaf Ebrahimi if (tester.error())
126*ccdc9c3eSSadaf Ebrahimi return;
127*ccdc9c3eSSadaf Ebrahimi
128*ccdc9c3eSSadaf Ebrahimi strgen_.Reset();
129*ccdc9c3eSSadaf Ebrahimi strgen_.GenerateNULL();
130*ccdc9c3eSSadaf Ebrahimi if (randomstrings_)
131*ccdc9c3eSSadaf Ebrahimi strgen_.Random(stringseed_, stringcount_);
132*ccdc9c3eSSadaf Ebrahimi int bad_inputs = 0;
133*ccdc9c3eSSadaf Ebrahimi while (strgen_.HasNext()) {
134*ccdc9c3eSSadaf Ebrahimi tests_++;
135*ccdc9c3eSSadaf Ebrahimi if (!tester.TestInput(strgen_.Next())) {
136*ccdc9c3eSSadaf Ebrahimi failures_++;
137*ccdc9c3eSSadaf Ebrahimi if (++bad_inputs >= FLAGS_max_bad_regexp_inputs)
138*ccdc9c3eSSadaf Ebrahimi break;
139*ccdc9c3eSSadaf Ebrahimi }
140*ccdc9c3eSSadaf Ebrahimi }
141*ccdc9c3eSSadaf Ebrahimi }
142*ccdc9c3eSSadaf Ebrahimi
143*ccdc9c3eSSadaf Ebrahimi // Runs an exhaustive test on the given parameters.
ExhaustiveTest(int maxatoms,int maxops,const std::vector<string> & alphabet,const std::vector<string> & ops,int maxstrlen,const std::vector<string> & stralphabet,const string & wrapper,const string & topwrapper)144*ccdc9c3eSSadaf Ebrahimi void ExhaustiveTest(int maxatoms, int maxops,
145*ccdc9c3eSSadaf Ebrahimi const std::vector<string>& alphabet,
146*ccdc9c3eSSadaf Ebrahimi const std::vector<string>& ops,
147*ccdc9c3eSSadaf Ebrahimi int maxstrlen,
148*ccdc9c3eSSadaf Ebrahimi const std::vector<string>& stralphabet,
149*ccdc9c3eSSadaf Ebrahimi const string& wrapper,
150*ccdc9c3eSSadaf Ebrahimi const string& topwrapper) {
151*ccdc9c3eSSadaf Ebrahimi if (RE2_DEBUG_MODE) {
152*ccdc9c3eSSadaf Ebrahimi if (maxatoms > 1)
153*ccdc9c3eSSadaf Ebrahimi maxatoms--;
154*ccdc9c3eSSadaf Ebrahimi if (maxops > 1)
155*ccdc9c3eSSadaf Ebrahimi maxops--;
156*ccdc9c3eSSadaf Ebrahimi if (maxstrlen > 1)
157*ccdc9c3eSSadaf Ebrahimi maxstrlen--;
158*ccdc9c3eSSadaf Ebrahimi }
159*ccdc9c3eSSadaf Ebrahimi ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
160*ccdc9c3eSSadaf Ebrahimi maxstrlen, stralphabet, wrapper,
161*ccdc9c3eSSadaf Ebrahimi topwrapper);
162*ccdc9c3eSSadaf Ebrahimi t.Generate();
163*ccdc9c3eSSadaf Ebrahimi if (!LOGGING) {
164*ccdc9c3eSSadaf Ebrahimi printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
165*ccdc9c3eSSadaf Ebrahimi t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
166*ccdc9c3eSSadaf Ebrahimi }
167*ccdc9c3eSSadaf Ebrahimi EXPECT_EQ(0, t.failures());
168*ccdc9c3eSSadaf Ebrahimi }
169*ccdc9c3eSSadaf Ebrahimi
170*ccdc9c3eSSadaf Ebrahimi // Runs an exhaustive test using the given parameters and
171*ccdc9c3eSSadaf Ebrahimi // the basic egrep operators.
EgrepTest(int maxatoms,int maxops,const string & alphabet,int maxstrlen,const string & stralphabet,const string & wrapper)172*ccdc9c3eSSadaf Ebrahimi void EgrepTest(int maxatoms, int maxops, const string& alphabet,
173*ccdc9c3eSSadaf Ebrahimi int maxstrlen, const string& stralphabet,
174*ccdc9c3eSSadaf Ebrahimi const string& wrapper) {
175*ccdc9c3eSSadaf Ebrahimi const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" };
176*ccdc9c3eSSadaf Ebrahimi
177*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < arraysize(tops); i++) {
178*ccdc9c3eSSadaf Ebrahimi ExhaustiveTest(maxatoms, maxops,
179*ccdc9c3eSSadaf Ebrahimi Split("", alphabet),
180*ccdc9c3eSSadaf Ebrahimi RegexpGenerator::EgrepOps(),
181*ccdc9c3eSSadaf Ebrahimi maxstrlen,
182*ccdc9c3eSSadaf Ebrahimi Split("", stralphabet),
183*ccdc9c3eSSadaf Ebrahimi wrapper,
184*ccdc9c3eSSadaf Ebrahimi tops[i]);
185*ccdc9c3eSSadaf Ebrahimi }
186*ccdc9c3eSSadaf Ebrahimi }
187*ccdc9c3eSSadaf Ebrahimi
188*ccdc9c3eSSadaf Ebrahimi } // namespace re2
189