xref: /aosp_15_r20/external/regex-re2/re2/testing/dfa_test.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <stdint.h>
6 #include <string>
7 #include <thread>
8 #include <vector>
9 
10 #include "util/test.h"
11 #include "util/logging.h"
12 #include "util/strutil.h"
13 #include "re2/prog.h"
14 #include "re2/re2.h"
15 #include "re2/regexp.h"
16 #include "re2/testing/regexp_generator.h"
17 #include "re2/testing/string_generator.h"
18 
19 static const bool UsingMallocCounter = false;
20 
21 DEFINE_int32(size, 8, "log2(number of DFA nodes)");
22 DEFINE_int32(repeat, 2, "Repetition count.");
23 DEFINE_int32(threads, 4, "number of threads");
24 
25 namespace re2 {
26 
27 // Check that multithreaded access to DFA class works.
28 
29 // Helper function: builds entire DFA for prog.
DoBuild(Prog * prog)30 static void DoBuild(Prog* prog) {
31   ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
32 }
33 
TEST(Multithreaded,BuildEntireDFA)34 TEST(Multithreaded, BuildEntireDFA) {
35   // Create regexp with 2^FLAGS_size states in DFA.
36   string s = "a";
37   for (int i = 0; i < FLAGS_size; i++)
38     s += "[ab]";
39   s += "b";
40   Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
41   ASSERT_TRUE(re != NULL);
42 
43   // Check that single-threaded code works.
44   {
45     Prog* prog = re->CompileToProg(0);
46     ASSERT_TRUE(prog != NULL);
47 
48     std::thread t(DoBuild, prog);
49     t.join();
50 
51     delete prog;
52   }
53 
54   // Build the DFA simultaneously in a bunch of threads.
55   for (int i = 0; i < FLAGS_repeat; i++) {
56     Prog* prog = re->CompileToProg(0);
57     ASSERT_TRUE(prog != NULL);
58 
59     std::vector<std::thread> threads;
60     for (int j = 0; j < FLAGS_threads; j++)
61       threads.emplace_back(DoBuild, prog);
62     for (int j = 0; j < FLAGS_threads; j++)
63       threads[j].join();
64 
65     // One more compile, to make sure everything is okay.
66     prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
67     delete prog;
68   }
69 
70   re->Decref();
71 }
72 
73 // Check that DFA size requirements are followed.
74 // BuildEntireDFA will, like SearchDFA, stop building out
75 // the DFA once the memory limits are reached.
TEST(SingleThreaded,BuildEntireDFA)76 TEST(SingleThreaded, BuildEntireDFA) {
77   // Create regexp with 2^30 states in DFA.
78   Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
79   ASSERT_TRUE(re != NULL);
80 
81   for (int i = 17; i < 24; i++) {
82     int64_t limit = int64_t{1}<<i;
83     int64_t usage;
84     //int64_t progusage, dfamem;
85     {
86       testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
87       Prog* prog = re->CompileToProg(limit);
88       ASSERT_TRUE(prog != NULL);
89       //progusage = m.HeapGrowth();
90       //dfamem = prog->dfa_mem();
91       prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
92       prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
93       usage = m.HeapGrowth();
94       delete prog;
95     }
96     if (UsingMallocCounter) {
97       //LOG(INFO) << "limit " << limit << ", "
98       //          << "prog usage " << progusage << ", "
99       //          << "DFA budget " << dfamem << ", "
100       //          << "total " << usage;
101       // Tolerate +/- 10%.
102       ASSERT_GT(usage, limit*9/10);
103       ASSERT_LT(usage, limit*11/10);
104     }
105   }
106   re->Decref();
107 }
108 
109 // Generates and returns a string over binary alphabet {0,1} that contains
110 // all possible binary sequences of length n as subsequences.  The obvious
111 // brute force method would generate a string of length n * 2^n, but this
112 // generates a string of length n + 2^n - 1 called a De Bruijn cycle.
113 // See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
114 // Such a string is useful for testing a DFA.  If you have a DFA
115 // where distinct last n bytes implies distinct states, then running on a
116 // DeBruijn string causes the DFA to need to create a new state at every
117 // position in the input, never reusing any states until it gets to the
118 // end of the string.  This is the worst possible case for DFA execution.
DeBruijnString(int n)119 static string DeBruijnString(int n) {
120   CHECK_LT(n, static_cast<int>(8*sizeof(int)));
121   CHECK_GT(n, 0);
122 
123   std::vector<bool> did(size_t{1}<<n);
124   for (int i = 0; i < 1<<n; i++)
125     did[i] = false;
126 
127   string s;
128   for (int i = 0; i < n-1; i++)
129     s.append("0");
130   int bits = 0;
131   int mask = (1<<n) - 1;
132   for (int i = 0; i < (1<<n); i++) {
133     bits <<= 1;
134     bits &= mask;
135     if (!did[bits|1]) {
136       bits |= 1;
137       s.append("1");
138     } else {
139       s.append("0");
140     }
141     CHECK(!did[bits]);
142     did[bits] = true;
143   }
144   return s;
145 }
146 
147 // Test that the DFA gets the right result even if it runs
148 // out of memory during a search.  The regular expression
149 // 0[01]{n}$ matches a binary string of 0s and 1s only if
150 // the (n+1)th-to-last character is a 0.  Matching this in
151 // a single forward pass (as done by the DFA) requires
152 // keeping one bit for each of the last n+1 characters
153 // (whether each was a 0), or 2^(n+1) possible states.
154 // If we run this regexp to search in a string that contains
155 // every possible n-character binary string as a substring,
156 // then it will have to run through at least 2^n states.
157 // States are big data structures -- certainly more than 1 byte --
158 // so if the DFA can search correctly while staying within a
159 // 2^n byte limit, it must be handling out-of-memory conditions
160 // gracefully.
TEST(SingleThreaded,SearchDFA)161 TEST(SingleThreaded, SearchDFA) {
162   // The De Bruijn string is the worst case input for this regexp.
163   // By default, the DFA will notice that it is flushing its cache
164   // too frequently and will bail out early, so that RE2 can use the
165   // NFA implementation instead.  (The DFA loses its speed advantage
166   // if it can't get a good cache hit rate.)
167   // Tell the DFA to trudge along instead.
168   Prog::TEST_dfa_should_bail_when_slow(false);
169 
170   // Choice of n is mostly arbitrary, except that:
171   //   * making n too big makes the test run for too long.
172   //   * making n too small makes the DFA refuse to run,
173   //     because it has so little memory compared to the program size.
174   // Empirically, n = 18 is a good compromise between the two.
175   const int n = 18;
176 
177   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
178                              Regexp::LikePerl, NULL);
179   ASSERT_TRUE(re != NULL);
180 
181   // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
182   // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
183   string no_match = DeBruijnString(n);
184   string match = no_match + "0";
185 
186   int64_t usage;
187   int64_t peak_usage;
188   {
189     testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
190     Prog* prog = re->CompileToProg(1<<n);
191     ASSERT_TRUE(prog != NULL);
192     for (int i = 0; i < 10; i++) {
193       bool matched = false;
194       bool failed = false;
195       matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
196                                 Prog::kFirstMatch, NULL, &failed, NULL);
197       ASSERT_FALSE(failed);
198       ASSERT_TRUE(matched);
199       matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
200                                 Prog::kFirstMatch, NULL, &failed, NULL);
201       ASSERT_FALSE(failed);
202       ASSERT_FALSE(matched);
203     }
204     usage = m.HeapGrowth();
205     peak_usage = m.PeakHeapGrowth();
206     delete prog;
207   }
208   if (UsingMallocCounter) {
209     //LOG(INFO) << "usage " << usage << ", "
210     //          << "peak usage " << peak_usage;
211     ASSERT_LT(usage, 1<<n);
212     ASSERT_LT(peak_usage, 1<<n);
213   }
214   re->Decref();
215 
216   // Reset to original behaviour.
217   Prog::TEST_dfa_should_bail_when_slow(true);
218 }
219 
220 // Helper function: searches for match, which should match,
221 // and no_match, which should not.
DoSearch(Prog * prog,const StringPiece & match,const StringPiece & no_match)222 static void DoSearch(Prog* prog, const StringPiece& match,
223                      const StringPiece& no_match) {
224   for (int i = 0; i < 2; i++) {
225     bool matched = false;
226     bool failed = false;
227     matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
228                               Prog::kFirstMatch, NULL, &failed, NULL);
229     ASSERT_FALSE(failed);
230     ASSERT_TRUE(matched);
231     matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
232                               Prog::kFirstMatch, NULL, &failed, NULL);
233     ASSERT_FALSE(failed);
234     ASSERT_FALSE(matched);
235   }
236 }
237 
TEST(Multithreaded,SearchDFA)238 TEST(Multithreaded, SearchDFA) {
239   Prog::TEST_dfa_should_bail_when_slow(false);
240 
241   // Same as single-threaded test above.
242   const int n = 18;
243   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
244                              Regexp::LikePerl, NULL);
245   ASSERT_TRUE(re != NULL);
246   string no_match = DeBruijnString(n);
247   string match = no_match + "0";
248 
249   // Check that single-threaded code works.
250   {
251     Prog* prog = re->CompileToProg(1<<n);
252     ASSERT_TRUE(prog != NULL);
253 
254     std::thread t(DoSearch, prog, match, no_match);
255     t.join();
256 
257     delete prog;
258   }
259 
260   // Run the search simultaneously in a bunch of threads.
261   // Reuse same flags for Multithreaded.BuildDFA above.
262   for (int i = 0; i < FLAGS_repeat; i++) {
263     Prog* prog = re->CompileToProg(1<<n);
264     ASSERT_TRUE(prog != NULL);
265 
266     std::vector<std::thread> threads;
267     for (int j = 0; j < FLAGS_threads; j++)
268       threads.emplace_back(DoSearch, prog, match, no_match);
269     for (int j = 0; j < FLAGS_threads; j++)
270       threads[j].join();
271 
272     delete prog;
273   }
274 
275   re->Decref();
276 
277   // Reset to original behaviour.
278   Prog::TEST_dfa_should_bail_when_slow(true);
279 }
280 
281 struct ReverseTest {
282   const char* regexp;
283   const char* text;
284   bool match;
285 };
286 
287 // Test that reverse DFA handles anchored/unanchored correctly.
288 // It's in the DFA interface but not used by RE2.
289 ReverseTest reverse_tests[] = {
290   { "\\A(a|b)", "abc", true },
291   { "(a|b)\\z", "cba", true },
292   { "\\A(a|b)", "cba", false },
293   { "(a|b)\\z", "abc", false },
294 };
295 
TEST(DFA,ReverseMatch)296 TEST(DFA, ReverseMatch) {
297   int nfail = 0;
298   for (int i = 0; i < arraysize(reverse_tests); i++) {
299     const ReverseTest& t = reverse_tests[i];
300     Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
301     ASSERT_TRUE(re != NULL);
302     Prog* prog = re->CompileToReverseProg(0);
303     ASSERT_TRUE(prog != NULL);
304     bool failed = false;
305     bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored,
306                                    Prog::kFirstMatch, NULL, &failed, NULL);
307     if (matched != t.match) {
308       LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
309       nfail++;
310     }
311     delete prog;
312     re->Decref();
313   }
314   EXPECT_EQ(nfail, 0);
315 }
316 
317 struct CallbackTest {
318   const char* regexp;
319   const char* dump;
320 };
321 
322 // Test that DFA::BuildAllStates() builds the expected DFA states
323 // and issues the expected callbacks. These test cases reflect the
324 // very compact encoding of the callbacks, but that also makes them
325 // very difficult to understand, so let's work through "\\Aa\\z".
326 // There are three slots per DFA state because the bytemap has two
327 // equivalence classes and there is a third slot for kByteEndText:
328 //   0: all bytes that are not 'a'
329 //   1: the byte 'a'
330 //   2: kByteEndText
331 // -1 means that there is no transition from that DFA state to any
332 // other DFA state for that slot. The valid transitions are thus:
333 //   state 0 --slot 1--> state 1
334 //   state 1 --slot 2--> state 2
335 // The double brackets indicate that state 2 is a matching state.
336 // Putting it together, this means that the DFA must consume the
337 // byte 'a' and then hit end of text. Q.E.D.
338 CallbackTest callback_tests[] = {
339   { "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
340   { "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
341   { "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
342   { "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
343   { "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
344   { "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
345   { "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
346   { "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
347   { "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
348   { "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]"} ,
349 };
350 
TEST(DFA,Callback)351 TEST(DFA, Callback) {
352   int nfail = 0;
353   for (int i = 0; i < arraysize(callback_tests); i++) {
354     const CallbackTest& t = callback_tests[i];
355     Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
356     ASSERT_TRUE(re != NULL);
357     Prog* prog = re->CompileToProg(0);
358     ASSERT_TRUE(prog != NULL);
359     string dump;
360     prog->BuildEntireDFA(Prog::kLongestMatch, [&](const int* next, bool match) {
361       ASSERT_TRUE(next != NULL);
362       if (!dump.empty())
363         StringAppendF(&dump, " ");
364       StringAppendF(&dump, match ? "[[" : "[");
365       for (int b = 0; b < prog->bytemap_range() + 1; b++)
366         StringAppendF(&dump, "%d,", next[b]);
367       dump.pop_back();
368       StringAppendF(&dump, match ? "]]" : "]");
369     });
370     if (dump != t.dump) {
371       LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
372       LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump;
373       nfail++;
374     }
375     delete prog;
376     re->Decref();
377   }
378   EXPECT_EQ(nfail, 0);
379 }
380 
381 }  // namespace re2
382