1*ccdc9c3eSSadaf Ebrahimi // Copyright 2007 The RE2 Authors. All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi
5*ccdc9c3eSSadaf Ebrahimi // Test prog.cc, compile.cc
6*ccdc9c3eSSadaf Ebrahimi
7*ccdc9c3eSSadaf Ebrahimi #include <string>
8*ccdc9c3eSSadaf Ebrahimi
9*ccdc9c3eSSadaf Ebrahimi #include "util/test.h"
10*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
11*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
12*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
13*ccdc9c3eSSadaf Ebrahimi
14*ccdc9c3eSSadaf Ebrahimi namespace re2 {
15*ccdc9c3eSSadaf Ebrahimi
16*ccdc9c3eSSadaf Ebrahimi // Simple input/output tests checking that
17*ccdc9c3eSSadaf Ebrahimi // the regexp compiles to the expected code.
18*ccdc9c3eSSadaf Ebrahimi // These are just to sanity check the basic implementation.
19*ccdc9c3eSSadaf Ebrahimi // The real confidence tests happen by testing the NFA/DFA
20*ccdc9c3eSSadaf Ebrahimi // that run the compiled code.
21*ccdc9c3eSSadaf Ebrahimi
// One table-driven case for the Simple test below: a pattern together with
// the program listing that compiling the pattern is expected to produce.
struct Test {
  // Regular expression source text handed to Regexp::Parse().
  const char* regexp;
  // Expected output of Prog::Dump() for the compiled pattern.
  const char* code;
};
26*ccdc9c3eSSadaf Ebrahimi
27*ccdc9c3eSSadaf Ebrahimi static Test tests[] = {
28*ccdc9c3eSSadaf Ebrahimi { "a",
29*ccdc9c3eSSadaf Ebrahimi "3. byte [61-61] -> 4\n"
30*ccdc9c3eSSadaf Ebrahimi "4. match! 0\n" },
31*ccdc9c3eSSadaf Ebrahimi { "ab",
32*ccdc9c3eSSadaf Ebrahimi "3. byte [61-61] -> 4\n"
33*ccdc9c3eSSadaf Ebrahimi "4. byte [62-62] -> 5\n"
34*ccdc9c3eSSadaf Ebrahimi "5. match! 0\n" },
35*ccdc9c3eSSadaf Ebrahimi { "a|c",
36*ccdc9c3eSSadaf Ebrahimi "3+ byte [61-61] -> 5\n"
37*ccdc9c3eSSadaf Ebrahimi "4. byte [63-63] -> 5\n"
38*ccdc9c3eSSadaf Ebrahimi "5. match! 0\n" },
39*ccdc9c3eSSadaf Ebrahimi { "a|b",
40*ccdc9c3eSSadaf Ebrahimi "3. byte [61-62] -> 4\n"
41*ccdc9c3eSSadaf Ebrahimi "4. match! 0\n" },
42*ccdc9c3eSSadaf Ebrahimi { "[ab]",
43*ccdc9c3eSSadaf Ebrahimi "3. byte [61-62] -> 4\n"
44*ccdc9c3eSSadaf Ebrahimi "4. match! 0\n" },
45*ccdc9c3eSSadaf Ebrahimi { "a+",
46*ccdc9c3eSSadaf Ebrahimi "3. byte [61-61] -> 4\n"
47*ccdc9c3eSSadaf Ebrahimi "4+ nop -> 3\n"
48*ccdc9c3eSSadaf Ebrahimi "5. match! 0\n" },
49*ccdc9c3eSSadaf Ebrahimi { "a+?",
50*ccdc9c3eSSadaf Ebrahimi "3. byte [61-61] -> 4\n"
51*ccdc9c3eSSadaf Ebrahimi "4+ match! 0\n"
52*ccdc9c3eSSadaf Ebrahimi "5. nop -> 3\n" },
53*ccdc9c3eSSadaf Ebrahimi { "a*",
54*ccdc9c3eSSadaf Ebrahimi "3+ byte [61-61] -> 3\n"
55*ccdc9c3eSSadaf Ebrahimi "4. match! 0\n" },
56*ccdc9c3eSSadaf Ebrahimi { "a*?",
57*ccdc9c3eSSadaf Ebrahimi "3+ match! 0\n"
58*ccdc9c3eSSadaf Ebrahimi "4. byte [61-61] -> 3\n" },
59*ccdc9c3eSSadaf Ebrahimi { "a?",
60*ccdc9c3eSSadaf Ebrahimi "3+ byte [61-61] -> 5\n"
61*ccdc9c3eSSadaf Ebrahimi "4. nop -> 5\n"
62*ccdc9c3eSSadaf Ebrahimi "5. match! 0\n" },
63*ccdc9c3eSSadaf Ebrahimi { "a??",
64*ccdc9c3eSSadaf Ebrahimi "3+ nop -> 5\n"
65*ccdc9c3eSSadaf Ebrahimi "4. byte [61-61] -> 5\n"
66*ccdc9c3eSSadaf Ebrahimi "5. match! 0\n" },
67*ccdc9c3eSSadaf Ebrahimi { "a{4}",
68*ccdc9c3eSSadaf Ebrahimi "3. byte [61-61] -> 4\n"
69*ccdc9c3eSSadaf Ebrahimi "4. byte [61-61] -> 5\n"
70*ccdc9c3eSSadaf Ebrahimi "5. byte [61-61] -> 6\n"
71*ccdc9c3eSSadaf Ebrahimi "6. byte [61-61] -> 7\n"
72*ccdc9c3eSSadaf Ebrahimi "7. match! 0\n" },
73*ccdc9c3eSSadaf Ebrahimi { "(a)",
74*ccdc9c3eSSadaf Ebrahimi "3. capture 2 -> 4\n"
75*ccdc9c3eSSadaf Ebrahimi "4. byte [61-61] -> 5\n"
76*ccdc9c3eSSadaf Ebrahimi "5. capture 3 -> 6\n"
77*ccdc9c3eSSadaf Ebrahimi "6. match! 0\n" },
78*ccdc9c3eSSadaf Ebrahimi { "(?:a)",
79*ccdc9c3eSSadaf Ebrahimi "3. byte [61-61] -> 4\n"
80*ccdc9c3eSSadaf Ebrahimi "4. match! 0\n" },
81*ccdc9c3eSSadaf Ebrahimi { "",
82*ccdc9c3eSSadaf Ebrahimi "3. match! 0\n" },
83*ccdc9c3eSSadaf Ebrahimi { ".",
84*ccdc9c3eSSadaf Ebrahimi "3+ byte [00-09] -> 5\n"
85*ccdc9c3eSSadaf Ebrahimi "4. byte [0b-ff] -> 5\n"
86*ccdc9c3eSSadaf Ebrahimi "5. match! 0\n" },
87*ccdc9c3eSSadaf Ebrahimi { "[^ab]",
88*ccdc9c3eSSadaf Ebrahimi "3+ byte [00-09] -> 6\n"
89*ccdc9c3eSSadaf Ebrahimi "4+ byte [0b-60] -> 6\n"
90*ccdc9c3eSSadaf Ebrahimi "5. byte [63-ff] -> 6\n"
91*ccdc9c3eSSadaf Ebrahimi "6. match! 0\n" },
92*ccdc9c3eSSadaf Ebrahimi { "[Aa]",
93*ccdc9c3eSSadaf Ebrahimi "3. byte/i [61-61] -> 4\n"
94*ccdc9c3eSSadaf Ebrahimi "4. match! 0\n" },
95*ccdc9c3eSSadaf Ebrahimi { "\\C+",
96*ccdc9c3eSSadaf Ebrahimi "3. byte [00-ff] -> 4\n"
97*ccdc9c3eSSadaf Ebrahimi "4+ altmatch -> 5 | 6\n"
98*ccdc9c3eSSadaf Ebrahimi "5+ nop -> 3\n"
99*ccdc9c3eSSadaf Ebrahimi "6. match! 0\n" },
100*ccdc9c3eSSadaf Ebrahimi { "\\C*",
101*ccdc9c3eSSadaf Ebrahimi "3+ altmatch -> 4 | 5\n"
102*ccdc9c3eSSadaf Ebrahimi "4+ byte [00-ff] -> 3\n"
103*ccdc9c3eSSadaf Ebrahimi "5. match! 0\n" },
104*ccdc9c3eSSadaf Ebrahimi { "\\C?",
105*ccdc9c3eSSadaf Ebrahimi "3+ byte [00-ff] -> 5\n"
106*ccdc9c3eSSadaf Ebrahimi "4. nop -> 5\n"
107*ccdc9c3eSSadaf Ebrahimi "5. match! 0\n" },
108*ccdc9c3eSSadaf Ebrahimi // Issue 20992936
109*ccdc9c3eSSadaf Ebrahimi { "[[-`]",
110*ccdc9c3eSSadaf Ebrahimi "3. byte [5b-60] -> 4\n"
111*ccdc9c3eSSadaf Ebrahimi "4. match! 0\n" },
112*ccdc9c3eSSadaf Ebrahimi };
113*ccdc9c3eSSadaf Ebrahimi
// Walks the tests[] table: parses each pattern in Latin-1 mode, compiles it
// and compares the textual dump of the resulting program against the
// expected listing.  Every problem is logged and counted so that a single
// run reports all broken entries; the test fails if the count is non-zero.
TEST(TestRegexpCompileToProg, Simple) {
  int failed = 0;
  for (int i = 0; i < arraysize(tests); i++) {
    const re2::Test& t = tests[i];
    Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    if (re == NULL) {
      LOG(ERROR) << "Cannot parse: " << t.regexp;
      failed++;
      continue;
    }
    Prog* prog = re->CompileToProg(0);
    if (prog == NULL) {
      LOG(ERROR) << "Cannot compile: " << t.regexp;
      re->Decref();
      failed++;
      continue;
    }

    // Compiling with an argument of 1 is expected to fail for every entry.
    // The result is captured rather than asserted on directly: an
    // unexpectedly successful compile hands us a Prog that must be freed,
    // and a returning fatal assertion here would also skip the cleanup of
    // |prog| and |re| below.
    Prog* tiny = re->CompileToProg(1);
    if (tiny != NULL) {
      LOG(ERROR) << "Compiled despite a budget of 1: " << t.regexp;
      delete tiny;
      failed++;
    }

    string s = prog->Dump();
    if (s != t.code) {
      LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
      LOG(ERROR) << "Want:\n" << t.code;
      LOG(ERROR) << "Got:\n" << s;
      failed++;
    }
    delete prog;
    re->Decref();
  }
  EXPECT_EQ(failed, 0);
}
144*ccdc9c3eSSadaf Ebrahimi
DumpByteMap(StringPiece pattern,Regexp::ParseFlags flags,string * bytemap)145*ccdc9c3eSSadaf Ebrahimi static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
146*ccdc9c3eSSadaf Ebrahimi string* bytemap) {
147*ccdc9c3eSSadaf Ebrahimi Regexp* re = Regexp::Parse(pattern, flags, NULL);
148*ccdc9c3eSSadaf Ebrahimi EXPECT_TRUE(re != NULL);
149*ccdc9c3eSSadaf Ebrahimi
150*ccdc9c3eSSadaf Ebrahimi Prog* prog = re->CompileToProg(0);
151*ccdc9c3eSSadaf Ebrahimi EXPECT_TRUE(prog != NULL);
152*ccdc9c3eSSadaf Ebrahimi *bytemap = prog->DumpByteMap();
153*ccdc9c3eSSadaf Ebrahimi delete prog;
154*ccdc9c3eSSadaf Ebrahimi
155*ccdc9c3eSSadaf Ebrahimi re->Decref();
156*ccdc9c3eSSadaf Ebrahimi }
157*ccdc9c3eSSadaf Ebrahimi
// Checks the byte classes computed for the Latin-1 dot ([^\n]): the
// expectation is that 0x0a is assigned its own class and that the ranges on
// either side of it share a class.
TEST(TestCompile, Latin1Ranges) {
  const char expected[] =
      "[00-09] -> 0\n"
      "[0a-0a] -> 1\n"
      "[0b-ff] -> 0\n";

  string dumped;
  DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &dumped);

  EXPECT_EQ(expected, dumped);
}
169*ccdc9c3eSSadaf Ebrahimi
// Assorted byte-map expectations, each exercised in its own scope so that
// the pattern, the expected dump and the actual dump sit side by side.
TEST(TestCompile, OtherByteMapTests) {
  // Ranges that are "absent" from the class must all land in the same
  // byte class (class 0 here), with the three hex-digit ranges in class 1.
  {
    const char expected[] =
        "[00-2f] -> 0\n"
        "[30-39] -> 1\n"
        "[3a-40] -> 0\n"
        "[41-46] -> 1\n"
        "[47-60] -> 0\n"
        "[61-66] -> 1\n"
        "[67-ff] -> 0\n";
    string dumped;
    DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &dumped);
    EXPECT_EQ(expected, dumped);
  }

  // Byte classes produced for the word-boundary assertion \b.
  {
    const char expected[] =
        "[00-2f] -> 0\n"
        "[30-39] -> 1\n"
        "[3a-40] -> 0\n"
        "[41-5a] -> 1\n"
        "[5b-5e] -> 0\n"
        "[5f-5f] -> 1\n"
        "[60-60] -> 0\n"
        "[61-7a] -> 1\n"
        "[7b-ff] -> 0\n";
    string dumped;
    DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &dumped);
    EXPECT_EQ(expected, dumped);
  }

  // Regression: a bug in the ASCII case-folding optimization used to
  // create too many byte classes for this pattern.
  {
    const char expected[] =
        "[00-5e] -> 0\n"
        "[5f-5f] -> 1\n"
        "[60-ff] -> 0\n";
    string dumped;
    DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &dumped);
    EXPECT_EQ(expected, dumped);
  }
}
204*ccdc9c3eSSadaf Ebrahimi
// Checks the byte classes computed for the UTF-8 dot ([^\n]).
// Regression note: the map was once erroneously split between 0x3f and
// 0x40 because that is a 6-bit boundary; the expected dump below keeps
// [0b-7f] as a single range.
TEST(TestCompile, UTF8Ranges) {
  const char expected[] =
      "[00-09] -> 0\n"
      "[0a-0a] -> 1\n"
      "[0b-7f] -> 0\n"
      "[80-8f] -> 2\n"
      "[90-9f] -> 3\n"
      "[a0-bf] -> 4\n"
      "[c0-c1] -> 1\n"
      "[c2-df] -> 5\n"
      "[e0-e0] -> 6\n"
      "[e1-ef] -> 7\n"
      "[f0-f0] -> 8\n"
      "[f1-f3] -> 9\n"
      "[f4-f4] -> 10\n"
      "[f5-ff] -> 1\n";

  string dumped;
  DumpByteMap(".", Regexp::PerlX, &dumped);

  EXPECT_EQ(expected, dumped);
}
229*ccdc9c3eSSadaf Ebrahimi
// Compiles a pattern with three named groups under a deliberately small
// budget (920) and expects the compilation to be refused.
TEST(TestCompile, InsufficientMemory) {
  Regexp* re = Regexp::Parse(
      "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
      Regexp::LikePerl, NULL);
  EXPECT_TRUE(re != NULL);
  // EXPECT_* may be non-fatal depending on the test framework, so bail out
  // explicitly instead of dereferencing a failed parse.
  if (re == NULL)
    return;

  Prog* prog = re->CompileToProg(920);
  // If the memory budget has been exhausted, compilation should fail
  // and return NULL instead of trying to do anything with NoMatch().
  EXPECT_TRUE(prog == NULL);

  // A no-op when the expectation holds; otherwise it frees the program that
  // was unexpectedly produced so a failing run does not leak it too.
  delete prog;
  re->Decref();
}
241*ccdc9c3eSSadaf Ebrahimi
Dump(StringPiece pattern,Regexp::ParseFlags flags,string * forward,string * reverse)242*ccdc9c3eSSadaf Ebrahimi static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
243*ccdc9c3eSSadaf Ebrahimi string* forward, string* reverse) {
244*ccdc9c3eSSadaf Ebrahimi Regexp* re = Regexp::Parse(pattern, flags, NULL);
245*ccdc9c3eSSadaf Ebrahimi EXPECT_TRUE(re != NULL);
246*ccdc9c3eSSadaf Ebrahimi
247*ccdc9c3eSSadaf Ebrahimi if (forward != NULL) {
248*ccdc9c3eSSadaf Ebrahimi Prog* prog = re->CompileToProg(0);
249*ccdc9c3eSSadaf Ebrahimi EXPECT_TRUE(prog != NULL);
250*ccdc9c3eSSadaf Ebrahimi *forward = prog->Dump();
251*ccdc9c3eSSadaf Ebrahimi delete prog;
252*ccdc9c3eSSadaf Ebrahimi }
253*ccdc9c3eSSadaf Ebrahimi
254*ccdc9c3eSSadaf Ebrahimi if (reverse != NULL) {
255*ccdc9c3eSSadaf Ebrahimi Prog* prog = re->CompileToReverseProg(0);
256*ccdc9c3eSSadaf Ebrahimi EXPECT_TRUE(prog != NULL);
257*ccdc9c3eSSadaf Ebrahimi *reverse = prog->Dump();
258*ccdc9c3eSSadaf Ebrahimi delete prog;
259*ccdc9c3eSSadaf Ebrahimi }
260*ccdc9c3eSSadaf Ebrahimi
261*ccdc9c3eSSadaf Ebrahimi re->Decref();
262*ccdc9c3eSSadaf Ebrahimi }
263*ccdc9c3eSSadaf Ebrahimi
// Regression test: a compiler bug caused inefficient bytecode to be
// generated for Unicode groups, because common suffixes were cached but
// common prefixes were not factored.  Each scope below pins the exact
// program listing for one character class.
TEST(TestCompile, Bug26705922) {
  // Two code points: both directions are checked.
  {
    const char expected_forward[] =
        "3. byte [f0-f0] -> 4\n"
        "4. byte [90-90] -> 5\n"
        "5. byte [80-80] -> 6\n"
        "6+ byte [80-80] -> 8\n"
        "7. byte [90-90] -> 8\n"
        "8. match! 0\n";
    const char expected_reverse[] =
        "3+ byte [80-80] -> 5\n"
        "4. byte [90-90] -> 5\n"
        "5. byte [80-80] -> 6\n"
        "6. byte [90-90] -> 7\n"
        "7. byte [f0-f0] -> 8\n"
        "8. match! 0\n";
    string fwd, rev;
    Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &fwd, &rev);
    EXPECT_EQ(expected_forward, fwd);
    EXPECT_EQ(expected_reverse, rev);
  }

  // A range of code points: both directions are checked.
  {
    const char expected_forward[] =
        "3+ byte [e8-ef] -> 5\n"
        "4. byte [f0-f0] -> 8\n"
        "5. byte [80-bf] -> 6\n"
        "6. byte [80-bf] -> 7\n"
        "7. match! 0\n"
        "8. byte [90-90] -> 5\n";
    const char expected_reverse[] =
        "3. byte [80-bf] -> 4\n"
        "4. byte [80-bf] -> 5\n"
        "5+ byte [e8-ef] -> 7\n"
        "6. byte [90-90] -> 8\n"
        "7. match! 0\n"
        "8. byte [f0-f0] -> 7\n";
    string fwd, rev;
    Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &fwd, &rev);
    EXPECT_EQ(expected_forward, fwd);
    EXPECT_EQ(expected_reverse, rev);
  }

  // A much wider range: only the reverse program is requested and checked.
  {
    const char expected_reverse[] =
        "3. byte [80-bf] -> 4\n"
        "4+ byte [c2-df] -> 7\n"
        "5+ byte [a0-bf] -> 8\n"
        "6. byte [80-bf] -> 9\n"
        "7. match! 0\n"
        "8. byte [e0-e0] -> 7\n"
        "9+ byte [e1-ef] -> 7\n"
        "10+ byte [90-bf] -> 13\n"
        "11+ byte [80-bf] -> 14\n"
        "12. byte [80-8f] -> 15\n"
        "13. byte [f0-f0] -> 7\n"
        "14. byte [f1-f3] -> 7\n"
        "15. byte [f4-f4] -> 7\n";
    string rev;
    Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &rev);
    EXPECT_EQ(expected_reverse, rev);
  }
}
318*ccdc9c3eSSadaf Ebrahimi
// Regression test: a compiler bug caused inefficient bytecode to be
// generated for nested nullable subexpressions.  Only the forward program
// is requested; each scope pins its exact listing for one pattern.
TEST(TestCompile, Bug35237384) {
  const Regexp::ParseFlags flags = Regexp::Latin1|Regexp::NeverCapture;

  {
    const char expected[] =
        "3+ byte [61-61] -> 3\n"
        "4. nop -> 5\n"
        "5+ byte [61-61] -> 5\n"
        "6. nop -> 7\n"
        "7+ byte [61-61] -> 7\n"
        "8. match! 0\n";
    string fwd;
    Dump("a**{3,}", flags, &fwd, NULL);
    EXPECT_EQ(expected, fwd);
  }

  {
    const char expected[] =
        "3+ nop -> 6\n"
        "4+ nop -> 8\n"
        "5. nop -> 21\n"
        "6+ byte [61-61] -> 6\n"
        "7. nop -> 3\n"
        "8+ byte [62-62] -> 8\n"
        "9. nop -> 3\n"
        "10+ byte [61-61] -> 10\n"
        "11. nop -> 21\n"
        "12+ byte [62-62] -> 12\n"
        "13. nop -> 21\n"
        "14+ byte [61-61] -> 14\n"
        "15. nop -> 18\n"
        "16+ byte [62-62] -> 16\n"
        "17. nop -> 18\n"
        "18+ nop -> 14\n"
        "19+ nop -> 16\n"
        "20. match! 0\n"
        "21+ nop -> 10\n"
        "22+ nop -> 12\n"
        "23. nop -> 18\n";
    string fwd;
    Dump("(a*|b*)*{3,}", flags, &fwd, NULL);
    EXPECT_EQ(expected, fwd);
  }

  {
    const char expected[] =
        "3+ nop -> 36\n"
        "4+ nop -> 31\n"
        "5. nop -> 33\n"
        "6+ byte [00-09] -> 8\n"
        "7. byte [0b-ff] -> 8\n"
        "8+ nop -> 6\n"
        "9+ nop -> 29\n"
        "10. nop -> 28\n"
        "11+ byte [00-09] -> 13\n"
        "12. byte [0b-ff] -> 13\n"
        "13+ nop -> 11\n"
        "14+ nop -> 26\n"
        "15. nop -> 28\n"
        "16+ byte [00-09] -> 18\n"
        "17. byte [0b-ff] -> 18\n"
        "18+ nop -> 16\n"
        "19+ nop -> 36\n"
        "20. nop -> 33\n"
        "21+ byte [00-09] -> 23\n"
        "22. byte [0b-ff] -> 23\n"
        "23+ nop -> 21\n"
        "24+ nop -> 31\n"
        "25. nop -> 33\n"
        "26+ nop -> 28\n"
        "27. byte [53-53] -> 11\n"
        "28. match! 0\n"
        "29+ nop -> 28\n"
        "30. byte [53-53] -> 6\n"
        "31+ nop -> 33\n"
        "32. byte [53-53] -> 21\n"
        "33+ nop -> 29\n"
        "34+ nop -> 26\n"
        "35. nop -> 28\n"
        "36+ nop -> 33\n"
        "37. byte [53-53] -> 16\n";
    string fwd;
    Dump("((|S.+)+|(|S.+)+|){2}", flags, &fwd, NULL);
    EXPECT_EQ(expected, fwd);
  }
}
396*ccdc9c3eSSadaf Ebrahimi
397*ccdc9c3eSSadaf Ebrahimi } // namespace re2
398