1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Test prog.cc, compile.cc
6
7 #include <string>
8
9 #include "util/test.h"
10 #include "util/logging.h"
11 #include "re2/regexp.h"
12 #include "re2/prog.h"
13
14 namespace re2 {
15
16 // Simple input/output tests checking that
17 // the regexp compiles to the expected code.
18 // These are just to sanity check the basic implementation.
19 // The real confidence tests happen by testing the NFA/DFA
20 // that run the compiled code.
21
22 struct Test {
23 const char* regexp;
24 const char* code;
25 };
26
27 static Test tests[] = {
28 { "a",
29 "3. byte [61-61] 0 -> 4\n"
30 "4. match! 0\n" },
31 { "ab",
32 "3. byte [61-61] 0 -> 4\n"
33 "4. byte [62-62] 0 -> 5\n"
34 "5. match! 0\n" },
35 { "a|c",
36 "3+ byte [61-61] 0 -> 5\n"
37 "4. byte [63-63] 0 -> 5\n"
38 "5. match! 0\n" },
39 { "a|b",
40 "3. byte [61-62] 0 -> 4\n"
41 "4. match! 0\n" },
42 { "[ab]",
43 "3. byte [61-62] 0 -> 4\n"
44 "4. match! 0\n" },
45 { "a+",
46 "3. byte [61-61] 0 -> 4\n"
47 "4+ nop -> 3\n"
48 "5. match! 0\n" },
49 { "a+?",
50 "3. byte [61-61] 0 -> 4\n"
51 "4+ match! 0\n"
52 "5. nop -> 3\n" },
53 { "a*",
54 "3+ byte [61-61] 1 -> 3\n"
55 "4. match! 0\n" },
56 { "a*?",
57 "3+ match! 0\n"
58 "4. byte [61-61] 0 -> 3\n" },
59 { "a?",
60 "3+ byte [61-61] 1 -> 5\n"
61 "4. nop -> 5\n"
62 "5. match! 0\n" },
63 { "a??",
64 "3+ nop -> 5\n"
65 "4. byte [61-61] 0 -> 5\n"
66 "5. match! 0\n" },
67 { "a{4}",
68 "3. byte [61-61] 0 -> 4\n"
69 "4. byte [61-61] 0 -> 5\n"
70 "5. byte [61-61] 0 -> 6\n"
71 "6. byte [61-61] 0 -> 7\n"
72 "7. match! 0\n" },
73 { "(a)",
74 "3. capture 2 -> 4\n"
75 "4. byte [61-61] 0 -> 5\n"
76 "5. capture 3 -> 6\n"
77 "6. match! 0\n" },
78 { "(?:a)",
79 "3. byte [61-61] 0 -> 4\n"
80 "4. match! 0\n" },
81 { "",
82 "3. match! 0\n" },
83 { ".",
84 "3+ byte [00-09] 0 -> 5\n"
85 "4. byte [0b-ff] 0 -> 5\n"
86 "5. match! 0\n" },
87 { "[^ab]",
88 "3+ byte [00-09] 0 -> 6\n"
89 "4+ byte [0b-60] 0 -> 6\n"
90 "5. byte [63-ff] 0 -> 6\n"
91 "6. match! 0\n" },
92 { "[Aa]",
93 "3. byte/i [61-61] 0 -> 4\n"
94 "4. match! 0\n" },
95 { "\\C+",
96 "3. byte [00-ff] 0 -> 4\n"
97 "4+ altmatch -> 5 | 6\n"
98 "5+ nop -> 3\n"
99 "6. match! 0\n" },
100 { "\\C*",
101 "3+ altmatch -> 4 | 5\n"
102 "4+ byte [00-ff] 1 -> 3\n"
103 "5. match! 0\n" },
104 { "\\C?",
105 "3+ byte [00-ff] 1 -> 5\n"
106 "4. nop -> 5\n"
107 "5. match! 0\n" },
108 // Issue 20992936
109 { "[[-`]",
110 "3. byte [5b-60] 0 -> 4\n"
111 "4. match! 0\n" },
112 // Issue 310
113 { "(?:|a)*",
114 "3+ nop -> 7\n"
115 "4. nop -> 9\n"
116 "5+ nop -> 7\n"
117 "6. nop -> 9\n"
118 "7+ nop -> 5\n"
119 "8. byte [61-61] 0 -> 5\n"
120 "9. match! 0\n" },
121 { "(?:|a)+",
122 "3+ nop -> 5\n"
123 "4. byte [61-61] 0 -> 5\n"
124 "5+ nop -> 3\n"
125 "6. match! 0\n" },
126 };
127
TEST(TestRegexpCompileToProg,Simple)128 TEST(TestRegexpCompileToProg, Simple) {
129 int failed = 0;
130 for (size_t i = 0; i < arraysize(tests); i++) {
131 const re2::Test& t = tests[i];
132 Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
133 if (re == NULL) {
134 LOG(ERROR) << "Cannot parse: " << t.regexp;
135 failed++;
136 continue;
137 }
138 Prog* prog = re->CompileToProg(0);
139 if (prog == NULL) {
140 LOG(ERROR) << "Cannot compile: " << t.regexp;
141 re->Decref();
142 failed++;
143 continue;
144 }
145 ASSERT_TRUE(re->CompileToProg(1) == NULL);
146 std::string s = prog->Dump();
147 if (s != t.code) {
148 LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
149 LOG(ERROR) << "Want:\n" << t.code;
150 LOG(ERROR) << "Got:\n" << s;
151 failed++;
152 }
153 delete prog;
154 re->Decref();
155 }
156 EXPECT_EQ(failed, 0);
157 }
158
DumpByteMap(StringPiece pattern,Regexp::ParseFlags flags,std::string * bytemap)159 static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
160 std::string* bytemap) {
161 Regexp* re = Regexp::Parse(pattern, flags, NULL);
162 EXPECT_TRUE(re != NULL);
163
164 {
165 Prog* prog = re->CompileToProg(0);
166 EXPECT_TRUE(prog != NULL);
167 *bytemap = prog->DumpByteMap();
168 delete prog;
169 }
170
171 {
172 Prog* prog = re->CompileToReverseProg(0);
173 EXPECT_TRUE(prog != NULL);
174 EXPECT_EQ(*bytemap, prog->DumpByteMap());
175 delete prog;
176 }
177
178 re->Decref();
179 }
180
TEST(TestCompile,Latin1Ranges)181 TEST(TestCompile, Latin1Ranges) {
182 // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
183
184 std::string bytemap;
185
186 DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
187 EXPECT_EQ("[00-09] -> 0\n"
188 "[0a-0a] -> 1\n"
189 "[0b-ff] -> 0\n",
190 bytemap);
191 }
192
TEST(TestCompile,OtherByteMapTests)193 TEST(TestCompile, OtherByteMapTests) {
194 std::string bytemap;
195
196 // Test that "absent" ranges are mapped to the same byte class.
197 DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
198 EXPECT_EQ("[00-2f] -> 0\n"
199 "[30-39] -> 1\n"
200 "[3a-40] -> 0\n"
201 "[41-46] -> 1\n"
202 "[47-60] -> 0\n"
203 "[61-66] -> 1\n"
204 "[67-ff] -> 0\n",
205 bytemap);
206
207 // Test the byte classes for \b.
208 DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
209 EXPECT_EQ("[00-2f] -> 0\n"
210 "[30-39] -> 1\n"
211 "[3a-40] -> 0\n"
212 "[41-5a] -> 1\n"
213 "[5b-5e] -> 0\n"
214 "[5f-5f] -> 1\n"
215 "[60-60] -> 0\n"
216 "[61-7a] -> 1\n"
217 "[7b-ff] -> 0\n",
218 bytemap);
219
220 // Bug in the ASCII case-folding optimization created too many byte classes.
221 DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
222 EXPECT_EQ("[00-5e] -> 0\n"
223 "[5f-5f] -> 1\n"
224 "[60-ff] -> 0\n",
225 bytemap);
226 }
227
TEST(TestCompile,UTF8Ranges)228 TEST(TestCompile, UTF8Ranges) {
229 // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
230 // Once, erroneously split between 0x3f and 0x40 because it is
231 // a 6-bit boundary.
232
233 std::string bytemap;
234
235 DumpByteMap(".", Regexp::PerlX, &bytemap);
236 EXPECT_EQ("[00-09] -> 0\n"
237 "[0a-0a] -> 1\n"
238 "[0b-7f] -> 0\n"
239 "[80-bf] -> 2\n"
240 "[c0-c1] -> 1\n"
241 "[c2-df] -> 3\n"
242 "[e0-ef] -> 4\n"
243 "[f0-f4] -> 5\n"
244 "[f5-ff] -> 1\n",
245 bytemap);
246 }
247
TEST(TestCompile,InsufficientMemory)248 TEST(TestCompile, InsufficientMemory) {
249 Regexp* re = Regexp::Parse(
250 "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
251 Regexp::LikePerl, NULL);
252 EXPECT_TRUE(re != NULL);
253 Prog* prog = re->CompileToProg(850);
254 // If the memory budget has been exhausted, compilation should fail
255 // and return NULL instead of trying to do anything with NoMatch().
256 EXPECT_TRUE(prog == NULL);
257 re->Decref();
258 }
259
Dump(StringPiece pattern,Regexp::ParseFlags flags,std::string * forward,std::string * reverse)260 static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
261 std::string* forward, std::string* reverse) {
262 Regexp* re = Regexp::Parse(pattern, flags, NULL);
263 EXPECT_TRUE(re != NULL);
264
265 if (forward != NULL) {
266 Prog* prog = re->CompileToProg(0);
267 EXPECT_TRUE(prog != NULL);
268 *forward = prog->Dump();
269 delete prog;
270 }
271
272 if (reverse != NULL) {
273 Prog* prog = re->CompileToReverseProg(0);
274 EXPECT_TRUE(prog != NULL);
275 *reverse = prog->Dump();
276 delete prog;
277 }
278
279 re->Decref();
280 }
281
TEST(TestCompile,Bug26705922)282 TEST(TestCompile, Bug26705922) {
283 // Bug in the compiler caused inefficient bytecode to be generated for Unicode
284 // groups: common suffixes were cached, but common prefixes were not factored.
285
286 std::string forward, reverse;
287
288 Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
289 EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
290 "4. byte [90-90] 0 -> 5\n"
291 "5. byte [80-80] 0 -> 6\n"
292 "6+ byte [80-80] 0 -> 8\n"
293 "7. byte [90-90] 0 -> 8\n"
294 "8. match! 0\n",
295 forward);
296 EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
297 "4. byte [90-90] 0 -> 5\n"
298 "5. byte [80-80] 0 -> 6\n"
299 "6. byte [90-90] 0 -> 7\n"
300 "7. byte [f0-f0] 0 -> 8\n"
301 "8. match! 0\n",
302 reverse);
303
304 Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
305 EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
306 "4. byte [f0-f0] 0 -> 8\n"
307 "5. byte [80-bf] 0 -> 6\n"
308 "6. byte [80-bf] 0 -> 7\n"
309 "7. match! 0\n"
310 "8. byte [90-90] 0 -> 5\n",
311 forward);
312 EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
313 "4. byte [80-bf] 0 -> 5\n"
314 "5+ byte [e8-ef] 0 -> 7\n"
315 "6. byte [90-90] 0 -> 8\n"
316 "7. match! 0\n"
317 "8. byte [f0-f0] 0 -> 7\n",
318 reverse);
319
320 Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
321 EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
322 "4+ byte [e0-ef] 0 -> 8\n"
323 "5. byte [f0-f4] 0 -> 9\n"
324 "6. byte [80-bf] 0 -> 7\n"
325 "7. match! 0\n"
326 "8. byte [80-bf] 0 -> 6\n"
327 "9. byte [80-bf] 0 -> 8\n",
328 forward);
329 EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
330 "4+ byte [c2-df] 0 -> 6\n"
331 "5. byte [80-bf] 0 -> 7\n"
332 "6. match! 0\n"
333 "7+ byte [e0-ef] 0 -> 6\n"
334 "8. byte [80-bf] 0 -> 9\n"
335 "9. byte [f0-f4] 0 -> 6\n",
336 reverse);
337 }
338
TEST(TestCompile,Bug35237384)339 TEST(TestCompile, Bug35237384) {
340 // Bug in the compiler caused inefficient bytecode to be generated for
341 // nested nullable subexpressions.
342
343 std::string forward;
344
345 Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
346 EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
347 "4. nop -> 5\n"
348 "5+ byte [61-61] 1 -> 5\n"
349 "6. nop -> 7\n"
350 "7+ byte [61-61] 1 -> 7\n"
351 "8. match! 0\n",
352 forward);
353
354 Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
355 EXPECT_EQ("3+ nop -> 28\n"
356 "4. nop -> 30\n"
357 "5+ byte [61-61] 1 -> 5\n"
358 "6. nop -> 32\n"
359 "7+ byte [61-61] 1 -> 7\n"
360 "8. nop -> 26\n"
361 "9+ byte [61-61] 1 -> 9\n"
362 "10. nop -> 20\n"
363 "11+ byte [62-62] 1 -> 11\n"
364 "12. nop -> 20\n"
365 "13+ byte [62-62] 1 -> 13\n"
366 "14. nop -> 26\n"
367 "15+ byte [62-62] 1 -> 15\n"
368 "16. nop -> 32\n"
369 "17+ nop -> 9\n"
370 "18. nop -> 11\n"
371 "19. match! 0\n"
372 "20+ nop -> 17\n"
373 "21. nop -> 19\n"
374 "22+ nop -> 7\n"
375 "23. nop -> 13\n"
376 "24+ nop -> 17\n"
377 "25. nop -> 19\n"
378 "26+ nop -> 22\n"
379 "27. nop -> 24\n"
380 "28+ nop -> 5\n"
381 "29. nop -> 15\n"
382 "30+ nop -> 22\n"
383 "31. nop -> 24\n"
384 "32+ nop -> 28\n"
385 "33. nop -> 30\n",
386 forward);
387
388 Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
389 EXPECT_EQ("3+ nop -> 36\n"
390 "4+ nop -> 31\n"
391 "5. nop -> 33\n"
392 "6+ byte [00-09] 0 -> 8\n"
393 "7. byte [0b-ff] 0 -> 8\n"
394 "8+ nop -> 6\n"
395 "9+ nop -> 29\n"
396 "10. nop -> 28\n"
397 "11+ byte [00-09] 0 -> 13\n"
398 "12. byte [0b-ff] 0 -> 13\n"
399 "13+ nop -> 11\n"
400 "14+ nop -> 26\n"
401 "15. nop -> 28\n"
402 "16+ byte [00-09] 0 -> 18\n"
403 "17. byte [0b-ff] 0 -> 18\n"
404 "18+ nop -> 16\n"
405 "19+ nop -> 36\n"
406 "20. nop -> 33\n"
407 "21+ byte [00-09] 0 -> 23\n"
408 "22. byte [0b-ff] 0 -> 23\n"
409 "23+ nop -> 21\n"
410 "24+ nop -> 31\n"
411 "25. nop -> 33\n"
412 "26+ nop -> 28\n"
413 "27. byte [53-53] 0 -> 11\n"
414 "28. match! 0\n"
415 "29+ nop -> 28\n"
416 "30. byte [53-53] 0 -> 6\n"
417 "31+ nop -> 33\n"
418 "32. byte [53-53] 0 -> 21\n"
419 "33+ nop -> 29\n"
420 "34+ nop -> 26\n"
421 "35. nop -> 28\n"
422 "36+ nop -> 33\n"
423 "37. byte [53-53] 0 -> 16\n",
424 forward);
425 }
426
427 } // namespace re2
428