xref: /aosp_15_r20/external/regex-re2/re2/testing/dfa_test.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
6*ccdc9c3eSSadaf Ebrahimi #include <string>
7*ccdc9c3eSSadaf Ebrahimi #include <thread>
8*ccdc9c3eSSadaf Ebrahimi #include <vector>
9*ccdc9c3eSSadaf Ebrahimi 
10*ccdc9c3eSSadaf Ebrahimi #include "util/test.h"
11*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
12*ccdc9c3eSSadaf Ebrahimi #include "util/strutil.h"
13*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
14*ccdc9c3eSSadaf Ebrahimi #include "re2/re2.h"
15*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
16*ccdc9c3eSSadaf Ebrahimi #include "re2/testing/regexp_generator.h"
17*ccdc9c3eSSadaf Ebrahimi #include "re2/testing/string_generator.h"
18*ccdc9c3eSSadaf Ebrahimi 
19*ccdc9c3eSSadaf Ebrahimi static const bool UsingMallocCounter = false;
20*ccdc9c3eSSadaf Ebrahimi 
21*ccdc9c3eSSadaf Ebrahimi DEFINE_int32(size, 8, "log2(number of DFA nodes)");
22*ccdc9c3eSSadaf Ebrahimi DEFINE_int32(repeat, 2, "Repetition count.");
23*ccdc9c3eSSadaf Ebrahimi DEFINE_int32(threads, 4, "number of threads");
24*ccdc9c3eSSadaf Ebrahimi 
25*ccdc9c3eSSadaf Ebrahimi namespace re2 {
26*ccdc9c3eSSadaf Ebrahimi 
27*ccdc9c3eSSadaf Ebrahimi // Check that multithreaded access to DFA class works.
28*ccdc9c3eSSadaf Ebrahimi 
29*ccdc9c3eSSadaf Ebrahimi // Helper function: builds entire DFA for prog.
DoBuild(Prog * prog)30*ccdc9c3eSSadaf Ebrahimi static void DoBuild(Prog* prog) {
31*ccdc9c3eSSadaf Ebrahimi   ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
32*ccdc9c3eSSadaf Ebrahimi }
33*ccdc9c3eSSadaf Ebrahimi 
TEST(Multithreaded,BuildEntireDFA)34*ccdc9c3eSSadaf Ebrahimi TEST(Multithreaded, BuildEntireDFA) {
35*ccdc9c3eSSadaf Ebrahimi   // Create regexp with 2^FLAGS_size states in DFA.
36*ccdc9c3eSSadaf Ebrahimi   string s = "a";
37*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < FLAGS_size; i++)
38*ccdc9c3eSSadaf Ebrahimi     s += "[ab]";
39*ccdc9c3eSSadaf Ebrahimi   s += "b";
40*ccdc9c3eSSadaf Ebrahimi   Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
41*ccdc9c3eSSadaf Ebrahimi   ASSERT_TRUE(re != NULL);
42*ccdc9c3eSSadaf Ebrahimi 
43*ccdc9c3eSSadaf Ebrahimi   // Check that single-threaded code works.
44*ccdc9c3eSSadaf Ebrahimi   {
45*ccdc9c3eSSadaf Ebrahimi     Prog* prog = re->CompileToProg(0);
46*ccdc9c3eSSadaf Ebrahimi     ASSERT_TRUE(prog != NULL);
47*ccdc9c3eSSadaf Ebrahimi 
48*ccdc9c3eSSadaf Ebrahimi     std::thread t(DoBuild, prog);
49*ccdc9c3eSSadaf Ebrahimi     t.join();
50*ccdc9c3eSSadaf Ebrahimi 
51*ccdc9c3eSSadaf Ebrahimi     delete prog;
52*ccdc9c3eSSadaf Ebrahimi   }
53*ccdc9c3eSSadaf Ebrahimi 
54*ccdc9c3eSSadaf Ebrahimi   // Build the DFA simultaneously in a bunch of threads.
55*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < FLAGS_repeat; i++) {
56*ccdc9c3eSSadaf Ebrahimi     Prog* prog = re->CompileToProg(0);
57*ccdc9c3eSSadaf Ebrahimi     ASSERT_TRUE(prog != NULL);
58*ccdc9c3eSSadaf Ebrahimi 
59*ccdc9c3eSSadaf Ebrahimi     std::vector<std::thread> threads;
60*ccdc9c3eSSadaf Ebrahimi     for (int j = 0; j < FLAGS_threads; j++)
61*ccdc9c3eSSadaf Ebrahimi       threads.emplace_back(DoBuild, prog);
62*ccdc9c3eSSadaf Ebrahimi     for (int j = 0; j < FLAGS_threads; j++)
63*ccdc9c3eSSadaf Ebrahimi       threads[j].join();
64*ccdc9c3eSSadaf Ebrahimi 
65*ccdc9c3eSSadaf Ebrahimi     // One more compile, to make sure everything is okay.
66*ccdc9c3eSSadaf Ebrahimi     prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
67*ccdc9c3eSSadaf Ebrahimi     delete prog;
68*ccdc9c3eSSadaf Ebrahimi   }
69*ccdc9c3eSSadaf Ebrahimi 
70*ccdc9c3eSSadaf Ebrahimi   re->Decref();
71*ccdc9c3eSSadaf Ebrahimi }
72*ccdc9c3eSSadaf Ebrahimi 
73*ccdc9c3eSSadaf Ebrahimi // Check that DFA size requirements are followed.
74*ccdc9c3eSSadaf Ebrahimi // BuildEntireDFA will, like SearchDFA, stop building out
75*ccdc9c3eSSadaf Ebrahimi // the DFA once the memory limits are reached.
TEST(SingleThreaded,BuildEntireDFA)76*ccdc9c3eSSadaf Ebrahimi TEST(SingleThreaded, BuildEntireDFA) {
77*ccdc9c3eSSadaf Ebrahimi   // Create regexp with 2^30 states in DFA.
78*ccdc9c3eSSadaf Ebrahimi   Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
79*ccdc9c3eSSadaf Ebrahimi   ASSERT_TRUE(re != NULL);
80*ccdc9c3eSSadaf Ebrahimi 
81*ccdc9c3eSSadaf Ebrahimi   for (int i = 17; i < 24; i++) {
82*ccdc9c3eSSadaf Ebrahimi     int64_t limit = int64_t{1}<<i;
83*ccdc9c3eSSadaf Ebrahimi     int64_t usage;
84*ccdc9c3eSSadaf Ebrahimi     //int64_t progusage, dfamem;
85*ccdc9c3eSSadaf Ebrahimi     {
86*ccdc9c3eSSadaf Ebrahimi       testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
87*ccdc9c3eSSadaf Ebrahimi       Prog* prog = re->CompileToProg(limit);
88*ccdc9c3eSSadaf Ebrahimi       ASSERT_TRUE(prog != NULL);
89*ccdc9c3eSSadaf Ebrahimi       //progusage = m.HeapGrowth();
90*ccdc9c3eSSadaf Ebrahimi       //dfamem = prog->dfa_mem();
91*ccdc9c3eSSadaf Ebrahimi       prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
92*ccdc9c3eSSadaf Ebrahimi       prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
93*ccdc9c3eSSadaf Ebrahimi       usage = m.HeapGrowth();
94*ccdc9c3eSSadaf Ebrahimi       delete prog;
95*ccdc9c3eSSadaf Ebrahimi     }
96*ccdc9c3eSSadaf Ebrahimi     if (UsingMallocCounter) {
97*ccdc9c3eSSadaf Ebrahimi       //LOG(INFO) << "limit " << limit << ", "
98*ccdc9c3eSSadaf Ebrahimi       //          << "prog usage " << progusage << ", "
99*ccdc9c3eSSadaf Ebrahimi       //          << "DFA budget " << dfamem << ", "
100*ccdc9c3eSSadaf Ebrahimi       //          << "total " << usage;
101*ccdc9c3eSSadaf Ebrahimi       // Tolerate +/- 10%.
102*ccdc9c3eSSadaf Ebrahimi       ASSERT_GT(usage, limit*9/10);
103*ccdc9c3eSSadaf Ebrahimi       ASSERT_LT(usage, limit*11/10);
104*ccdc9c3eSSadaf Ebrahimi     }
105*ccdc9c3eSSadaf Ebrahimi   }
106*ccdc9c3eSSadaf Ebrahimi   re->Decref();
107*ccdc9c3eSSadaf Ebrahimi }
108*ccdc9c3eSSadaf Ebrahimi 
109*ccdc9c3eSSadaf Ebrahimi // Generates and returns a string over binary alphabet {0,1} that contains
110*ccdc9c3eSSadaf Ebrahimi // all possible binary sequences of length n as subsequences.  The obvious
111*ccdc9c3eSSadaf Ebrahimi // brute force method would generate a string of length n * 2^n, but this
112*ccdc9c3eSSadaf Ebrahimi // generates a string of length n + 2^n - 1 called a De Bruijn cycle.
113*ccdc9c3eSSadaf Ebrahimi // See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
114*ccdc9c3eSSadaf Ebrahimi // Such a string is useful for testing a DFA.  If you have a DFA
115*ccdc9c3eSSadaf Ebrahimi // where distinct last n bytes implies distinct states, then running on a
116*ccdc9c3eSSadaf Ebrahimi // DeBruijn string causes the DFA to need to create a new state at every
117*ccdc9c3eSSadaf Ebrahimi // position in the input, never reusing any states until it gets to the
118*ccdc9c3eSSadaf Ebrahimi // end of the string.  This is the worst possible case for DFA execution.
DeBruijnString(int n)119*ccdc9c3eSSadaf Ebrahimi static string DeBruijnString(int n) {
120*ccdc9c3eSSadaf Ebrahimi   CHECK_LT(n, static_cast<int>(8*sizeof(int)));
121*ccdc9c3eSSadaf Ebrahimi   CHECK_GT(n, 0);
122*ccdc9c3eSSadaf Ebrahimi 
123*ccdc9c3eSSadaf Ebrahimi   std::vector<bool> did(size_t{1}<<n);
124*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < 1<<n; i++)
125*ccdc9c3eSSadaf Ebrahimi     did[i] = false;
126*ccdc9c3eSSadaf Ebrahimi 
127*ccdc9c3eSSadaf Ebrahimi   string s;
128*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < n-1; i++)
129*ccdc9c3eSSadaf Ebrahimi     s.append("0");
130*ccdc9c3eSSadaf Ebrahimi   int bits = 0;
131*ccdc9c3eSSadaf Ebrahimi   int mask = (1<<n) - 1;
132*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < (1<<n); i++) {
133*ccdc9c3eSSadaf Ebrahimi     bits <<= 1;
134*ccdc9c3eSSadaf Ebrahimi     bits &= mask;
135*ccdc9c3eSSadaf Ebrahimi     if (!did[bits|1]) {
136*ccdc9c3eSSadaf Ebrahimi       bits |= 1;
137*ccdc9c3eSSadaf Ebrahimi       s.append("1");
138*ccdc9c3eSSadaf Ebrahimi     } else {
139*ccdc9c3eSSadaf Ebrahimi       s.append("0");
140*ccdc9c3eSSadaf Ebrahimi     }
141*ccdc9c3eSSadaf Ebrahimi     CHECK(!did[bits]);
142*ccdc9c3eSSadaf Ebrahimi     did[bits] = true;
143*ccdc9c3eSSadaf Ebrahimi   }
144*ccdc9c3eSSadaf Ebrahimi   return s;
145*ccdc9c3eSSadaf Ebrahimi }
146*ccdc9c3eSSadaf Ebrahimi 
147*ccdc9c3eSSadaf Ebrahimi // Test that the DFA gets the right result even if it runs
148*ccdc9c3eSSadaf Ebrahimi // out of memory during a search.  The regular expression
149*ccdc9c3eSSadaf Ebrahimi // 0[01]{n}$ matches a binary string of 0s and 1s only if
150*ccdc9c3eSSadaf Ebrahimi // the (n+1)th-to-last character is a 0.  Matching this in
151*ccdc9c3eSSadaf Ebrahimi // a single forward pass (as done by the DFA) requires
152*ccdc9c3eSSadaf Ebrahimi // keeping one bit for each of the last n+1 characters
153*ccdc9c3eSSadaf Ebrahimi // (whether each was a 0), or 2^(n+1) possible states.
154*ccdc9c3eSSadaf Ebrahimi // If we run this regexp to search in a string that contains
155*ccdc9c3eSSadaf Ebrahimi // every possible n-character binary string as a substring,
156*ccdc9c3eSSadaf Ebrahimi // then it will have to run through at least 2^n states.
157*ccdc9c3eSSadaf Ebrahimi // States are big data structures -- certainly more than 1 byte --
158*ccdc9c3eSSadaf Ebrahimi // so if the DFA can search correctly while staying within a
159*ccdc9c3eSSadaf Ebrahimi // 2^n byte limit, it must be handling out-of-memory conditions
160*ccdc9c3eSSadaf Ebrahimi // gracefully.
TEST(SingleThreaded,SearchDFA)161*ccdc9c3eSSadaf Ebrahimi TEST(SingleThreaded, SearchDFA) {
162*ccdc9c3eSSadaf Ebrahimi   // The De Bruijn string is the worst case input for this regexp.
163*ccdc9c3eSSadaf Ebrahimi   // By default, the DFA will notice that it is flushing its cache
164*ccdc9c3eSSadaf Ebrahimi   // too frequently and will bail out early, so that RE2 can use the
165*ccdc9c3eSSadaf Ebrahimi   // NFA implementation instead.  (The DFA loses its speed advantage
166*ccdc9c3eSSadaf Ebrahimi   // if it can't get a good cache hit rate.)
167*ccdc9c3eSSadaf Ebrahimi   // Tell the DFA to trudge along instead.
168*ccdc9c3eSSadaf Ebrahimi   Prog::TEST_dfa_should_bail_when_slow(false);
169*ccdc9c3eSSadaf Ebrahimi 
170*ccdc9c3eSSadaf Ebrahimi   // Choice of n is mostly arbitrary, except that:
171*ccdc9c3eSSadaf Ebrahimi   //   * making n too big makes the test run for too long.
172*ccdc9c3eSSadaf Ebrahimi   //   * making n too small makes the DFA refuse to run,
173*ccdc9c3eSSadaf Ebrahimi   //     because it has so little memory compared to the program size.
174*ccdc9c3eSSadaf Ebrahimi   // Empirically, n = 18 is a good compromise between the two.
175*ccdc9c3eSSadaf Ebrahimi   const int n = 18;
176*ccdc9c3eSSadaf Ebrahimi 
177*ccdc9c3eSSadaf Ebrahimi   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
178*ccdc9c3eSSadaf Ebrahimi                              Regexp::LikePerl, NULL);
179*ccdc9c3eSSadaf Ebrahimi   ASSERT_TRUE(re != NULL);
180*ccdc9c3eSSadaf Ebrahimi 
181*ccdc9c3eSSadaf Ebrahimi   // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
182*ccdc9c3eSSadaf Ebrahimi   // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
183*ccdc9c3eSSadaf Ebrahimi   string no_match = DeBruijnString(n);
184*ccdc9c3eSSadaf Ebrahimi   string match = no_match + "0";
185*ccdc9c3eSSadaf Ebrahimi 
186*ccdc9c3eSSadaf Ebrahimi   int64_t usage;
187*ccdc9c3eSSadaf Ebrahimi   int64_t peak_usage;
188*ccdc9c3eSSadaf Ebrahimi   {
189*ccdc9c3eSSadaf Ebrahimi     testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
190*ccdc9c3eSSadaf Ebrahimi     Prog* prog = re->CompileToProg(1<<n);
191*ccdc9c3eSSadaf Ebrahimi     ASSERT_TRUE(prog != NULL);
192*ccdc9c3eSSadaf Ebrahimi     for (int i = 0; i < 10; i++) {
193*ccdc9c3eSSadaf Ebrahimi       bool matched = false;
194*ccdc9c3eSSadaf Ebrahimi       bool failed = false;
195*ccdc9c3eSSadaf Ebrahimi       matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
196*ccdc9c3eSSadaf Ebrahimi                                 Prog::kFirstMatch, NULL, &failed, NULL);
197*ccdc9c3eSSadaf Ebrahimi       ASSERT_FALSE(failed);
198*ccdc9c3eSSadaf Ebrahimi       ASSERT_TRUE(matched);
199*ccdc9c3eSSadaf Ebrahimi       matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
200*ccdc9c3eSSadaf Ebrahimi                                 Prog::kFirstMatch, NULL, &failed, NULL);
201*ccdc9c3eSSadaf Ebrahimi       ASSERT_FALSE(failed);
202*ccdc9c3eSSadaf Ebrahimi       ASSERT_FALSE(matched);
203*ccdc9c3eSSadaf Ebrahimi     }
204*ccdc9c3eSSadaf Ebrahimi     usage = m.HeapGrowth();
205*ccdc9c3eSSadaf Ebrahimi     peak_usage = m.PeakHeapGrowth();
206*ccdc9c3eSSadaf Ebrahimi     delete prog;
207*ccdc9c3eSSadaf Ebrahimi   }
208*ccdc9c3eSSadaf Ebrahimi   if (UsingMallocCounter) {
209*ccdc9c3eSSadaf Ebrahimi     //LOG(INFO) << "usage " << usage << ", "
210*ccdc9c3eSSadaf Ebrahimi     //          << "peak usage " << peak_usage;
211*ccdc9c3eSSadaf Ebrahimi     ASSERT_LT(usage, 1<<n);
212*ccdc9c3eSSadaf Ebrahimi     ASSERT_LT(peak_usage, 1<<n);
213*ccdc9c3eSSadaf Ebrahimi   }
214*ccdc9c3eSSadaf Ebrahimi   re->Decref();
215*ccdc9c3eSSadaf Ebrahimi 
216*ccdc9c3eSSadaf Ebrahimi   // Reset to original behaviour.
217*ccdc9c3eSSadaf Ebrahimi   Prog::TEST_dfa_should_bail_when_slow(true);
218*ccdc9c3eSSadaf Ebrahimi }
219*ccdc9c3eSSadaf Ebrahimi 
220*ccdc9c3eSSadaf Ebrahimi // Helper function: searches for match, which should match,
221*ccdc9c3eSSadaf Ebrahimi // and no_match, which should not.
DoSearch(Prog * prog,const StringPiece & match,const StringPiece & no_match)222*ccdc9c3eSSadaf Ebrahimi static void DoSearch(Prog* prog, const StringPiece& match,
223*ccdc9c3eSSadaf Ebrahimi                      const StringPiece& no_match) {
224*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < 2; i++) {
225*ccdc9c3eSSadaf Ebrahimi     bool matched = false;
226*ccdc9c3eSSadaf Ebrahimi     bool failed = false;
227*ccdc9c3eSSadaf Ebrahimi     matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
228*ccdc9c3eSSadaf Ebrahimi                               Prog::kFirstMatch, NULL, &failed, NULL);
229*ccdc9c3eSSadaf Ebrahimi     ASSERT_FALSE(failed);
230*ccdc9c3eSSadaf Ebrahimi     ASSERT_TRUE(matched);
231*ccdc9c3eSSadaf Ebrahimi     matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
232*ccdc9c3eSSadaf Ebrahimi                               Prog::kFirstMatch, NULL, &failed, NULL);
233*ccdc9c3eSSadaf Ebrahimi     ASSERT_FALSE(failed);
234*ccdc9c3eSSadaf Ebrahimi     ASSERT_FALSE(matched);
235*ccdc9c3eSSadaf Ebrahimi   }
236*ccdc9c3eSSadaf Ebrahimi }
237*ccdc9c3eSSadaf Ebrahimi 
TEST(Multithreaded,SearchDFA)238*ccdc9c3eSSadaf Ebrahimi TEST(Multithreaded, SearchDFA) {
239*ccdc9c3eSSadaf Ebrahimi   Prog::TEST_dfa_should_bail_when_slow(false);
240*ccdc9c3eSSadaf Ebrahimi 
241*ccdc9c3eSSadaf Ebrahimi   // Same as single-threaded test above.
242*ccdc9c3eSSadaf Ebrahimi   const int n = 18;
243*ccdc9c3eSSadaf Ebrahimi   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
244*ccdc9c3eSSadaf Ebrahimi                              Regexp::LikePerl, NULL);
245*ccdc9c3eSSadaf Ebrahimi   ASSERT_TRUE(re != NULL);
246*ccdc9c3eSSadaf Ebrahimi   string no_match = DeBruijnString(n);
247*ccdc9c3eSSadaf Ebrahimi   string match = no_match + "0";
248*ccdc9c3eSSadaf Ebrahimi 
249*ccdc9c3eSSadaf Ebrahimi   // Check that single-threaded code works.
250*ccdc9c3eSSadaf Ebrahimi   {
251*ccdc9c3eSSadaf Ebrahimi     Prog* prog = re->CompileToProg(1<<n);
252*ccdc9c3eSSadaf Ebrahimi     ASSERT_TRUE(prog != NULL);
253*ccdc9c3eSSadaf Ebrahimi 
254*ccdc9c3eSSadaf Ebrahimi     std::thread t(DoSearch, prog, match, no_match);
255*ccdc9c3eSSadaf Ebrahimi     t.join();
256*ccdc9c3eSSadaf Ebrahimi 
257*ccdc9c3eSSadaf Ebrahimi     delete prog;
258*ccdc9c3eSSadaf Ebrahimi   }
259*ccdc9c3eSSadaf Ebrahimi 
260*ccdc9c3eSSadaf Ebrahimi   // Run the search simultaneously in a bunch of threads.
261*ccdc9c3eSSadaf Ebrahimi   // Reuse same flags for Multithreaded.BuildDFA above.
262*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < FLAGS_repeat; i++) {
263*ccdc9c3eSSadaf Ebrahimi     Prog* prog = re->CompileToProg(1<<n);
264*ccdc9c3eSSadaf Ebrahimi     ASSERT_TRUE(prog != NULL);
265*ccdc9c3eSSadaf Ebrahimi 
266*ccdc9c3eSSadaf Ebrahimi     std::vector<std::thread> threads;
267*ccdc9c3eSSadaf Ebrahimi     for (int j = 0; j < FLAGS_threads; j++)
268*ccdc9c3eSSadaf Ebrahimi       threads.emplace_back(DoSearch, prog, match, no_match);
269*ccdc9c3eSSadaf Ebrahimi     for (int j = 0; j < FLAGS_threads; j++)
270*ccdc9c3eSSadaf Ebrahimi       threads[j].join();
271*ccdc9c3eSSadaf Ebrahimi 
272*ccdc9c3eSSadaf Ebrahimi     delete prog;
273*ccdc9c3eSSadaf Ebrahimi   }
274*ccdc9c3eSSadaf Ebrahimi 
275*ccdc9c3eSSadaf Ebrahimi   re->Decref();
276*ccdc9c3eSSadaf Ebrahimi 
277*ccdc9c3eSSadaf Ebrahimi   // Reset to original behaviour.
278*ccdc9c3eSSadaf Ebrahimi   Prog::TEST_dfa_should_bail_when_slow(true);
279*ccdc9c3eSSadaf Ebrahimi }
280*ccdc9c3eSSadaf Ebrahimi 
281*ccdc9c3eSSadaf Ebrahimi struct ReverseTest {
282*ccdc9c3eSSadaf Ebrahimi   const char* regexp;
283*ccdc9c3eSSadaf Ebrahimi   const char* text;
284*ccdc9c3eSSadaf Ebrahimi   bool match;
285*ccdc9c3eSSadaf Ebrahimi };
286*ccdc9c3eSSadaf Ebrahimi 
287*ccdc9c3eSSadaf Ebrahimi // Test that reverse DFA handles anchored/unanchored correctly.
288*ccdc9c3eSSadaf Ebrahimi // It's in the DFA interface but not used by RE2.
289*ccdc9c3eSSadaf Ebrahimi ReverseTest reverse_tests[] = {
290*ccdc9c3eSSadaf Ebrahimi   { "\\A(a|b)", "abc", true },
291*ccdc9c3eSSadaf Ebrahimi   { "(a|b)\\z", "cba", true },
292*ccdc9c3eSSadaf Ebrahimi   { "\\A(a|b)", "cba", false },
293*ccdc9c3eSSadaf Ebrahimi   { "(a|b)\\z", "abc", false },
294*ccdc9c3eSSadaf Ebrahimi };
295*ccdc9c3eSSadaf Ebrahimi 
TEST(DFA,ReverseMatch)296*ccdc9c3eSSadaf Ebrahimi TEST(DFA, ReverseMatch) {
297*ccdc9c3eSSadaf Ebrahimi   int nfail = 0;
298*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < arraysize(reverse_tests); i++) {
299*ccdc9c3eSSadaf Ebrahimi     const ReverseTest& t = reverse_tests[i];
300*ccdc9c3eSSadaf Ebrahimi     Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
301*ccdc9c3eSSadaf Ebrahimi     ASSERT_TRUE(re != NULL);
302*ccdc9c3eSSadaf Ebrahimi     Prog* prog = re->CompileToReverseProg(0);
303*ccdc9c3eSSadaf Ebrahimi     ASSERT_TRUE(prog != NULL);
304*ccdc9c3eSSadaf Ebrahimi     bool failed = false;
305*ccdc9c3eSSadaf Ebrahimi     bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored,
306*ccdc9c3eSSadaf Ebrahimi                                    Prog::kFirstMatch, NULL, &failed, NULL);
307*ccdc9c3eSSadaf Ebrahimi     if (matched != t.match) {
308*ccdc9c3eSSadaf Ebrahimi       LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
309*ccdc9c3eSSadaf Ebrahimi       nfail++;
310*ccdc9c3eSSadaf Ebrahimi     }
311*ccdc9c3eSSadaf Ebrahimi     delete prog;
312*ccdc9c3eSSadaf Ebrahimi     re->Decref();
313*ccdc9c3eSSadaf Ebrahimi   }
314*ccdc9c3eSSadaf Ebrahimi   EXPECT_EQ(nfail, 0);
315*ccdc9c3eSSadaf Ebrahimi }
316*ccdc9c3eSSadaf Ebrahimi 
317*ccdc9c3eSSadaf Ebrahimi struct CallbackTest {
318*ccdc9c3eSSadaf Ebrahimi   const char* regexp;
319*ccdc9c3eSSadaf Ebrahimi   const char* dump;
320*ccdc9c3eSSadaf Ebrahimi };
321*ccdc9c3eSSadaf Ebrahimi 
322*ccdc9c3eSSadaf Ebrahimi // Test that DFA::BuildAllStates() builds the expected DFA states
323*ccdc9c3eSSadaf Ebrahimi // and issues the expected callbacks. These test cases reflect the
324*ccdc9c3eSSadaf Ebrahimi // very compact encoding of the callbacks, but that also makes them
325*ccdc9c3eSSadaf Ebrahimi // very difficult to understand, so let's work through "\\Aa\\z".
326*ccdc9c3eSSadaf Ebrahimi // There are three slots per DFA state because the bytemap has two
327*ccdc9c3eSSadaf Ebrahimi // equivalence classes and there is a third slot for kByteEndText:
328*ccdc9c3eSSadaf Ebrahimi //   0: all bytes that are not 'a'
329*ccdc9c3eSSadaf Ebrahimi //   1: the byte 'a'
330*ccdc9c3eSSadaf Ebrahimi //   2: kByteEndText
331*ccdc9c3eSSadaf Ebrahimi // -1 means that there is no transition from that DFA state to any
332*ccdc9c3eSSadaf Ebrahimi // other DFA state for that slot. The valid transitions are thus:
333*ccdc9c3eSSadaf Ebrahimi //   state 0 --slot 1--> state 1
334*ccdc9c3eSSadaf Ebrahimi //   state 1 --slot 2--> state 2
335*ccdc9c3eSSadaf Ebrahimi // The double brackets indicate that state 2 is a matching state.
336*ccdc9c3eSSadaf Ebrahimi // Putting it together, this means that the DFA must consume the
337*ccdc9c3eSSadaf Ebrahimi // byte 'a' and then hit end of text. Q.E.D.
338*ccdc9c3eSSadaf Ebrahimi CallbackTest callback_tests[] = {
339*ccdc9c3eSSadaf Ebrahimi   { "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
340*ccdc9c3eSSadaf Ebrahimi   { "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
341*ccdc9c3eSSadaf Ebrahimi   { "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
342*ccdc9c3eSSadaf Ebrahimi   { "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
343*ccdc9c3eSSadaf Ebrahimi   { "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
344*ccdc9c3eSSadaf Ebrahimi   { "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
345*ccdc9c3eSSadaf Ebrahimi   { "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
346*ccdc9c3eSSadaf Ebrahimi   { "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
347*ccdc9c3eSSadaf Ebrahimi   { "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
348*ccdc9c3eSSadaf Ebrahimi   { "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]"} ,
349*ccdc9c3eSSadaf Ebrahimi };
350*ccdc9c3eSSadaf Ebrahimi 
TEST(DFA,Callback)351*ccdc9c3eSSadaf Ebrahimi TEST(DFA, Callback) {
352*ccdc9c3eSSadaf Ebrahimi   int nfail = 0;
353*ccdc9c3eSSadaf Ebrahimi   for (int i = 0; i < arraysize(callback_tests); i++) {
354*ccdc9c3eSSadaf Ebrahimi     const CallbackTest& t = callback_tests[i];
355*ccdc9c3eSSadaf Ebrahimi     Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
356*ccdc9c3eSSadaf Ebrahimi     ASSERT_TRUE(re != NULL);
357*ccdc9c3eSSadaf Ebrahimi     Prog* prog = re->CompileToProg(0);
358*ccdc9c3eSSadaf Ebrahimi     ASSERT_TRUE(prog != NULL);
359*ccdc9c3eSSadaf Ebrahimi     string dump;
360*ccdc9c3eSSadaf Ebrahimi     prog->BuildEntireDFA(Prog::kLongestMatch, [&](const int* next, bool match) {
361*ccdc9c3eSSadaf Ebrahimi       ASSERT_TRUE(next != NULL);
362*ccdc9c3eSSadaf Ebrahimi       if (!dump.empty())
363*ccdc9c3eSSadaf Ebrahimi         StringAppendF(&dump, " ");
364*ccdc9c3eSSadaf Ebrahimi       StringAppendF(&dump, match ? "[[" : "[");
365*ccdc9c3eSSadaf Ebrahimi       for (int b = 0; b < prog->bytemap_range() + 1; b++)
366*ccdc9c3eSSadaf Ebrahimi         StringAppendF(&dump, "%d,", next[b]);
367*ccdc9c3eSSadaf Ebrahimi       dump.pop_back();
368*ccdc9c3eSSadaf Ebrahimi       StringAppendF(&dump, match ? "]]" : "]");
369*ccdc9c3eSSadaf Ebrahimi     });
370*ccdc9c3eSSadaf Ebrahimi     if (dump != t.dump) {
371*ccdc9c3eSSadaf Ebrahimi       LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
372*ccdc9c3eSSadaf Ebrahimi       LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump;
373*ccdc9c3eSSadaf Ebrahimi       nfail++;
374*ccdc9c3eSSadaf Ebrahimi     }
375*ccdc9c3eSSadaf Ebrahimi     delete prog;
376*ccdc9c3eSSadaf Ebrahimi     re->Decref();
377*ccdc9c3eSSadaf Ebrahimi   }
378*ccdc9c3eSSadaf Ebrahimi   EXPECT_EQ(nfail, 0);
379*ccdc9c3eSSadaf Ebrahimi }
380*ccdc9c3eSSadaf Ebrahimi 
381*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
382