1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Test prog.cc, compile.cc
6 
7 #include <string>
8 
9 #include "util/test.h"
10 #include "util/logging.h"
11 #include "re2/regexp.h"
12 #include "re2/prog.h"
13 
14 namespace re2 {
15 
16 // Simple input/output tests checking that
17 // the regexp compiles to the expected code.
18 // These are just to sanity check the basic implementation.
19 // The real confidence tests happen by testing the NFA/DFA
20 // that run the compiled code.
21 
22 struct Test {
23   const char* regexp;
24   const char* code;
25 };
26 
27 static Test tests[] = {
28   { "a",
29     "3. byte [61-61] 0 -> 4\n"
30     "4. match! 0\n" },
31   { "ab",
32     "3. byte [61-61] 0 -> 4\n"
33     "4. byte [62-62] 0 -> 5\n"
34     "5. match! 0\n" },
35   { "a|c",
36     "3+ byte [61-61] 0 -> 5\n"
37     "4. byte [63-63] 0 -> 5\n"
38     "5. match! 0\n" },
39   { "a|b",
40     "3. byte [61-62] 0 -> 4\n"
41     "4. match! 0\n" },
42   { "[ab]",
43     "3. byte [61-62] 0 -> 4\n"
44     "4. match! 0\n" },
45   { "a+",
46     "3. byte [61-61] 0 -> 4\n"
47     "4+ nop -> 3\n"
48     "5. match! 0\n" },
49   { "a+?",
50     "3. byte [61-61] 0 -> 4\n"
51     "4+ match! 0\n"
52     "5. nop -> 3\n" },
53   { "a*",
54     "3+ byte [61-61] 1 -> 3\n"
55     "4. match! 0\n" },
56   { "a*?",
57     "3+ match! 0\n"
58     "4. byte [61-61] 0 -> 3\n" },
59   { "a?",
60     "3+ byte [61-61] 1 -> 5\n"
61     "4. nop -> 5\n"
62     "5. match! 0\n" },
63   { "a??",
64     "3+ nop -> 5\n"
65     "4. byte [61-61] 0 -> 5\n"
66     "5. match! 0\n" },
67   { "a{4}",
68     "3. byte [61-61] 0 -> 4\n"
69     "4. byte [61-61] 0 -> 5\n"
70     "5. byte [61-61] 0 -> 6\n"
71     "6. byte [61-61] 0 -> 7\n"
72     "7. match! 0\n" },
73   { "(a)",
74     "3. capture 2 -> 4\n"
75     "4. byte [61-61] 0 -> 5\n"
76     "5. capture 3 -> 6\n"
77     "6. match! 0\n" },
78   { "(?:a)",
79     "3. byte [61-61] 0 -> 4\n"
80     "4. match! 0\n" },
81   { "",
82     "3. match! 0\n" },
83   { ".",
84     "3+ byte [00-09] 0 -> 5\n"
85     "4. byte [0b-ff] 0 -> 5\n"
86     "5. match! 0\n" },
87   { "[^ab]",
88     "3+ byte [00-09] 0 -> 6\n"
89     "4+ byte [0b-60] 0 -> 6\n"
90     "5. byte [63-ff] 0 -> 6\n"
91     "6. match! 0\n" },
92   { "[Aa]",
93     "3. byte/i [61-61] 0 -> 4\n"
94     "4. match! 0\n" },
95   { "\\C+",
96     "3. byte [00-ff] 0 -> 4\n"
97     "4+ altmatch -> 5 | 6\n"
98     "5+ nop -> 3\n"
99     "6. match! 0\n" },
100   { "\\C*",
101     "3+ altmatch -> 4 | 5\n"
102     "4+ byte [00-ff] 1 -> 3\n"
103     "5. match! 0\n" },
104   { "\\C?",
105     "3+ byte [00-ff] 1 -> 5\n"
106     "4. nop -> 5\n"
107     "5. match! 0\n" },
108   // Issue 20992936
109   { "[[-`]",
110     "3. byte [5b-60] 0 -> 4\n"
111     "4. match! 0\n" },
112   // Issue 310
113   { "(?:|a)*",
114     "3+ nop -> 7\n"
115     "4. nop -> 9\n"
116     "5+ nop -> 7\n"
117     "6. nop -> 9\n"
118     "7+ nop -> 5\n"
119     "8. byte [61-61] 0 -> 5\n"
120     "9. match! 0\n" },
121   { "(?:|a)+",
122     "3+ nop -> 5\n"
123     "4. byte [61-61] 0 -> 5\n"
124     "5+ nop -> 3\n"
125     "6. match! 0\n" },
126 };
127 
TEST(TestRegexpCompileToProg,Simple)128 TEST(TestRegexpCompileToProg, Simple) {
129   int failed = 0;
130   for (size_t i = 0; i < arraysize(tests); i++) {
131     const re2::Test& t = tests[i];
132     Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
133     if (re == NULL) {
134       LOG(ERROR) << "Cannot parse: " << t.regexp;
135       failed++;
136       continue;
137     }
138     Prog* prog = re->CompileToProg(0);
139     if (prog == NULL) {
140       LOG(ERROR) << "Cannot compile: " << t.regexp;
141       re->Decref();
142       failed++;
143       continue;
144     }
145     ASSERT_TRUE(re->CompileToProg(1) == NULL);
146     std::string s = prog->Dump();
147     if (s != t.code) {
148       LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
149       LOG(ERROR) << "Want:\n" << t.code;
150       LOG(ERROR) << "Got:\n" << s;
151       failed++;
152     }
153     delete prog;
154     re->Decref();
155   }
156   EXPECT_EQ(failed, 0);
157 }
158 
DumpByteMap(StringPiece pattern,Regexp::ParseFlags flags,std::string * bytemap)159 static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
160                         std::string* bytemap) {
161   Regexp* re = Regexp::Parse(pattern, flags, NULL);
162   EXPECT_TRUE(re != NULL);
163 
164   {
165     Prog* prog = re->CompileToProg(0);
166     EXPECT_TRUE(prog != NULL);
167     *bytemap = prog->DumpByteMap();
168     delete prog;
169   }
170 
171   {
172     Prog* prog = re->CompileToReverseProg(0);
173     EXPECT_TRUE(prog != NULL);
174     EXPECT_EQ(*bytemap, prog->DumpByteMap());
175     delete prog;
176   }
177 
178   re->Decref();
179 }
180 
TEST(TestCompile,Latin1Ranges)181 TEST(TestCompile, Latin1Ranges) {
182   // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
183 
184   std::string bytemap;
185 
186   DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
187   EXPECT_EQ("[00-09] -> 0\n"
188             "[0a-0a] -> 1\n"
189             "[0b-ff] -> 0\n",
190             bytemap);
191 }
192 
TEST(TestCompile,OtherByteMapTests)193 TEST(TestCompile, OtherByteMapTests) {
194   std::string bytemap;
195 
196   // Test that "absent" ranges are mapped to the same byte class.
197   DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
198   EXPECT_EQ("[00-2f] -> 0\n"
199             "[30-39] -> 1\n"
200             "[3a-40] -> 0\n"
201             "[41-46] -> 1\n"
202             "[47-60] -> 0\n"
203             "[61-66] -> 1\n"
204             "[67-ff] -> 0\n",
205             bytemap);
206 
207   // Test the byte classes for \b.
208   DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
209   EXPECT_EQ("[00-2f] -> 0\n"
210             "[30-39] -> 1\n"
211             "[3a-40] -> 0\n"
212             "[41-5a] -> 1\n"
213             "[5b-5e] -> 0\n"
214             "[5f-5f] -> 1\n"
215             "[60-60] -> 0\n"
216             "[61-7a] -> 1\n"
217             "[7b-ff] -> 0\n",
218             bytemap);
219 
220   // Bug in the ASCII case-folding optimization created too many byte classes.
221   DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
222   EXPECT_EQ("[00-5e] -> 0\n"
223             "[5f-5f] -> 1\n"
224             "[60-ff] -> 0\n",
225             bytemap);
226 }
227 
TEST(TestCompile,UTF8Ranges)228 TEST(TestCompile, UTF8Ranges) {
229   // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
230   // Once, erroneously split between 0x3f and 0x40 because it is
231   // a 6-bit boundary.
232 
233   std::string bytemap;
234 
235   DumpByteMap(".", Regexp::PerlX, &bytemap);
236   EXPECT_EQ("[00-09] -> 0\n"
237             "[0a-0a] -> 1\n"
238             "[0b-7f] -> 0\n"
239             "[80-bf] -> 2\n"
240             "[c0-c1] -> 1\n"
241             "[c2-df] -> 3\n"
242             "[e0-ef] -> 4\n"
243             "[f0-f4] -> 5\n"
244             "[f5-ff] -> 1\n",
245             bytemap);
246 }
247 
TEST(TestCompile,InsufficientMemory)248 TEST(TestCompile, InsufficientMemory) {
249   Regexp* re = Regexp::Parse(
250       "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
251       Regexp::LikePerl, NULL);
252   EXPECT_TRUE(re != NULL);
253   Prog* prog = re->CompileToProg(850);
254   // If the memory budget has been exhausted, compilation should fail
255   // and return NULL instead of trying to do anything with NoMatch().
256   EXPECT_TRUE(prog == NULL);
257   re->Decref();
258 }
259 
Dump(StringPiece pattern,Regexp::ParseFlags flags,std::string * forward,std::string * reverse)260 static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
261                  std::string* forward, std::string* reverse) {
262   Regexp* re = Regexp::Parse(pattern, flags, NULL);
263   EXPECT_TRUE(re != NULL);
264 
265   if (forward != NULL) {
266     Prog* prog = re->CompileToProg(0);
267     EXPECT_TRUE(prog != NULL);
268     *forward = prog->Dump();
269     delete prog;
270   }
271 
272   if (reverse != NULL) {
273     Prog* prog = re->CompileToReverseProg(0);
274     EXPECT_TRUE(prog != NULL);
275     *reverse = prog->Dump();
276     delete prog;
277   }
278 
279   re->Decref();
280 }
281 
TEST(TestCompile,Bug26705922)282 TEST(TestCompile, Bug26705922) {
283   // Bug in the compiler caused inefficient bytecode to be generated for Unicode
284   // groups: common suffixes were cached, but common prefixes were not factored.
285 
286   std::string forward, reverse;
287 
288   Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
289   EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
290             "4. byte [90-90] 0 -> 5\n"
291             "5. byte [80-80] 0 -> 6\n"
292             "6+ byte [80-80] 0 -> 8\n"
293             "7. byte [90-90] 0 -> 8\n"
294             "8. match! 0\n",
295             forward);
296   EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
297             "4. byte [90-90] 0 -> 5\n"
298             "5. byte [80-80] 0 -> 6\n"
299             "6. byte [90-90] 0 -> 7\n"
300             "7. byte [f0-f0] 0 -> 8\n"
301             "8. match! 0\n",
302             reverse);
303 
304   Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
305   EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
306             "4. byte [f0-f0] 0 -> 8\n"
307             "5. byte [80-bf] 0 -> 6\n"
308             "6. byte [80-bf] 0 -> 7\n"
309             "7. match! 0\n"
310             "8. byte [90-90] 0 -> 5\n",
311             forward);
312   EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
313             "4. byte [80-bf] 0 -> 5\n"
314             "5+ byte [e8-ef] 0 -> 7\n"
315             "6. byte [90-90] 0 -> 8\n"
316             "7. match! 0\n"
317             "8. byte [f0-f0] 0 -> 7\n",
318             reverse);
319 
320   Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
321   EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
322             "4+ byte [e0-ef] 0 -> 8\n"
323             "5. byte [f0-f4] 0 -> 9\n"
324             "6. byte [80-bf] 0 -> 7\n"
325             "7. match! 0\n"
326             "8. byte [80-bf] 0 -> 6\n"
327             "9. byte [80-bf] 0 -> 8\n",
328             forward);
329   EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
330             "4+ byte [c2-df] 0 -> 6\n"
331             "5. byte [80-bf] 0 -> 7\n"
332             "6. match! 0\n"
333             "7+ byte [e0-ef] 0 -> 6\n"
334             "8. byte [80-bf] 0 -> 9\n"
335             "9. byte [f0-f4] 0 -> 6\n",
336             reverse);
337 }
338 
TEST(TestCompile,Bug35237384)339 TEST(TestCompile, Bug35237384) {
340   // Bug in the compiler caused inefficient bytecode to be generated for
341   // nested nullable subexpressions.
342 
343   std::string forward;
344 
345   Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
346   EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
347             "4. nop -> 5\n"
348             "5+ byte [61-61] 1 -> 5\n"
349             "6. nop -> 7\n"
350             "7+ byte [61-61] 1 -> 7\n"
351             "8. match! 0\n",
352             forward);
353 
354   Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
355   EXPECT_EQ("3+ nop -> 28\n"
356             "4. nop -> 30\n"
357             "5+ byte [61-61] 1 -> 5\n"
358             "6. nop -> 32\n"
359             "7+ byte [61-61] 1 -> 7\n"
360             "8. nop -> 26\n"
361             "9+ byte [61-61] 1 -> 9\n"
362             "10. nop -> 20\n"
363             "11+ byte [62-62] 1 -> 11\n"
364             "12. nop -> 20\n"
365             "13+ byte [62-62] 1 -> 13\n"
366             "14. nop -> 26\n"
367             "15+ byte [62-62] 1 -> 15\n"
368             "16. nop -> 32\n"
369             "17+ nop -> 9\n"
370             "18. nop -> 11\n"
371             "19. match! 0\n"
372             "20+ nop -> 17\n"
373             "21. nop -> 19\n"
374             "22+ nop -> 7\n"
375             "23. nop -> 13\n"
376             "24+ nop -> 17\n"
377             "25. nop -> 19\n"
378             "26+ nop -> 22\n"
379             "27. nop -> 24\n"
380             "28+ nop -> 5\n"
381             "29. nop -> 15\n"
382             "30+ nop -> 22\n"
383             "31. nop -> 24\n"
384             "32+ nop -> 28\n"
385             "33. nop -> 30\n",
386       forward);
387 
388   Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
389   EXPECT_EQ("3+ nop -> 36\n"
390             "4+ nop -> 31\n"
391             "5. nop -> 33\n"
392             "6+ byte [00-09] 0 -> 8\n"
393             "7. byte [0b-ff] 0 -> 8\n"
394             "8+ nop -> 6\n"
395             "9+ nop -> 29\n"
396             "10. nop -> 28\n"
397             "11+ byte [00-09] 0 -> 13\n"
398             "12. byte [0b-ff] 0 -> 13\n"
399             "13+ nop -> 11\n"
400             "14+ nop -> 26\n"
401             "15. nop -> 28\n"
402             "16+ byte [00-09] 0 -> 18\n"
403             "17. byte [0b-ff] 0 -> 18\n"
404             "18+ nop -> 16\n"
405             "19+ nop -> 36\n"
406             "20. nop -> 33\n"
407             "21+ byte [00-09] 0 -> 23\n"
408             "22. byte [0b-ff] 0 -> 23\n"
409             "23+ nop -> 21\n"
410             "24+ nop -> 31\n"
411             "25. nop -> 33\n"
412             "26+ nop -> 28\n"
413             "27. byte [53-53] 0 -> 11\n"
414             "28. match! 0\n"
415             "29+ nop -> 28\n"
416             "30. byte [53-53] 0 -> 6\n"
417             "31+ nop -> 33\n"
418             "32. byte [53-53] 0 -> 21\n"
419             "33+ nop -> 29\n"
420             "34+ nop -> 26\n"
421             "35. nop -> 28\n"
422             "36+ nop -> 33\n"
423             "37. byte [53-53] 0 -> 16\n",
424       forward);
425 }
426 
427 }  // namespace re2
428