xref: /aosp_15_r20/external/regex-re2/re2/testing/compile_test.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
// Copyright 2007 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
// Test prog.cc, compile.cc
6*ccdc9c3eSSadaf Ebrahimi 
7*ccdc9c3eSSadaf Ebrahimi #include <string>
8*ccdc9c3eSSadaf Ebrahimi 
9*ccdc9c3eSSadaf Ebrahimi #include "util/test.h"
10*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
11*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
12*ccdc9c3eSSadaf Ebrahimi #include "re2/prog.h"
13*ccdc9c3eSSadaf Ebrahimi 
14*ccdc9c3eSSadaf Ebrahimi namespace re2 {
15*ccdc9c3eSSadaf Ebrahimi 
// Simple input/output tests checking that
// the regexp compiles to the expected code.
// These are just to sanity-check the basic implementation.
// The real confidence tests happen by testing the NFA/DFA
// that run the compiled code.
21*ccdc9c3eSSadaf Ebrahimi 
// One compilation test case: a pattern and the program listing it must
// compile to.
struct Test {
  const char* regexp;  // Pattern text handed to Regexp::Parse().
  const char* code;    // Expected output of Prog::Dump() for that pattern.
};

// Expected forward programs for a set of small patterns. The table is
// read-only reference data, so it is declared const to keep a test from
// modifying it by accident.
static const Test tests[] = {
  { "a",
    "3. byte [61-61] -> 4\n"
    "4. match! 0\n" },
  { "ab",
    "3. byte [61-61] -> 4\n"
    "4. byte [62-62] -> 5\n"
    "5. match! 0\n" },
  { "a|c",
    "3+ byte [61-61] -> 5\n"
    "4. byte [63-63] -> 5\n"
    "5. match! 0\n" },
  { "a|b",
    "3. byte [61-62] -> 4\n"
    "4. match! 0\n" },
  { "[ab]",
    "3. byte [61-62] -> 4\n"
    "4. match! 0\n" },
  { "a+",
    "3. byte [61-61] -> 4\n"
    "4+ nop -> 3\n"
    "5. match! 0\n" },
  { "a+?",
    "3. byte [61-61] -> 4\n"
    "4+ match! 0\n"
    "5. nop -> 3\n" },
  { "a*",
    "3+ byte [61-61] -> 3\n"
    "4. match! 0\n" },
  { "a*?",
    "3+ match! 0\n"
    "4. byte [61-61] -> 3\n" },
  { "a?",
    "3+ byte [61-61] -> 5\n"
    "4. nop -> 5\n"
    "5. match! 0\n" },
  { "a??",
    "3+ nop -> 5\n"
    "4. byte [61-61] -> 5\n"
    "5. match! 0\n" },
  { "a{4}",
    "3. byte [61-61] -> 4\n"
    "4. byte [61-61] -> 5\n"
    "5. byte [61-61] -> 6\n"
    "6. byte [61-61] -> 7\n"
    "7. match! 0\n" },
  { "(a)",
    "3. capture 2 -> 4\n"
    "4. byte [61-61] -> 5\n"
    "5. capture 3 -> 6\n"
    "6. match! 0\n" },
  { "(?:a)",
    "3. byte [61-61] -> 4\n"
    "4. match! 0\n" },
  { "",
    "3. match! 0\n" },
  { ".",
    "3+ byte [00-09] -> 5\n"
    "4. byte [0b-ff] -> 5\n"
    "5. match! 0\n" },
  { "[^ab]",
    "3+ byte [00-09] -> 6\n"
    "4+ byte [0b-60] -> 6\n"
    "5. byte [63-ff] -> 6\n"
    "6. match! 0\n" },
  { "[Aa]",
    "3. byte/i [61-61] -> 4\n"
    "4. match! 0\n" },
  { "\\C+",
    "3. byte [00-ff] -> 4\n"
    "4+ altmatch -> 5 | 6\n"
    "5+ nop -> 3\n"
    "6. match! 0\n" },
  { "\\C*",
    "3+ altmatch -> 4 | 5\n"
    "4+ byte [00-ff] -> 3\n"
    "5. match! 0\n" },
  { "\\C?",
    "3+ byte [00-ff] -> 5\n"
    "4. nop -> 5\n"
    "5. match! 0\n" },
  // Issue 20992936
  { "[[-`]",
    "3. byte [5b-60] -> 4\n"
    "4. match! 0\n" },
};
113*ccdc9c3eSSadaf Ebrahimi 
// Compiles every pattern in tests[] (parsed as PerlX|Latin1) and compares
// the dumped program against the expected listing. Problems are logged and
// counted instead of stopping at the first one, so a single run reports
// every failing pattern; the count is checked once at the end.
TEST(TestRegexpCompileToProg, Simple) {
  int failed = 0;
  // Unsigned index: avoids a signed/unsigned comparison against the array
  // size (assumes arraysize() is sizeof-based and yields a size_t).
  for (size_t i = 0; i < arraysize(tests); i++) {
    const re2::Test& t = tests[i];
    Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    if (re == NULL) {
      LOG(ERROR) << "Cannot parse: " << t.regexp;
      failed++;
      continue;
    }
    Prog* prog = re->CompileToProg(0);
    if (prog == NULL) {
      LOG(ERROR) << "Cannot compile: " << t.regexp;
      re->Decref();
      failed++;
      continue;
    }
    // Compiling with a budget of 1 is expected to fail and yield NULL.
    // Handle an unexpected success like every other failure in this loop:
    // log it, count it and release the program, so nothing leaks.
    Prog* tiny = re->CompileToProg(1);
    if (tiny != NULL) {
      LOG(ERROR) << "Compiled despite a memory budget of 1: " << t.regexp;
      delete tiny;
      failed++;
    }
    string s = prog->Dump();
    if (s != t.code) {
      LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
      LOG(ERROR) << "Want:\n" << t.code;
      LOG(ERROR) << "Got:\n" << s;
      failed++;
    }
    delete prog;
    re->Decref();
  }
  EXPECT_EQ(failed, 0);
}
144*ccdc9c3eSSadaf Ebrahimi 
DumpByteMap(StringPiece pattern,Regexp::ParseFlags flags,string * bytemap)145*ccdc9c3eSSadaf Ebrahimi static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
146*ccdc9c3eSSadaf Ebrahimi                         string* bytemap) {
147*ccdc9c3eSSadaf Ebrahimi   Regexp* re = Regexp::Parse(pattern, flags, NULL);
148*ccdc9c3eSSadaf Ebrahimi   EXPECT_TRUE(re != NULL);
149*ccdc9c3eSSadaf Ebrahimi 
150*ccdc9c3eSSadaf Ebrahimi   Prog* prog = re->CompileToProg(0);
151*ccdc9c3eSSadaf Ebrahimi   EXPECT_TRUE(prog != NULL);
152*ccdc9c3eSSadaf Ebrahimi   *bytemap = prog->DumpByteMap();
153*ccdc9c3eSSadaf Ebrahimi   delete prog;
154*ccdc9c3eSSadaf Ebrahimi 
155*ccdc9c3eSSadaf Ebrahimi   re->Decref();
156*ccdc9c3eSSadaf Ebrahimi }
157*ccdc9c3eSSadaf Ebrahimi 
// Verifies the distinct byte ranges involved in the Latin-1 dot ([^\n]).
TEST(TestCompile, Latin1Ranges) {
  const Regexp::ParseFlags latin1_flags = Regexp::PerlX|Regexp::Latin1;
  const string want =
      "[00-09] -> 0\n"
      "[0a-0a] -> 1\n"
      "[0b-ff] -> 0\n";

  string got;
  DumpByteMap(".", latin1_flags, &got);
  EXPECT_EQ(want, got);
}
169*ccdc9c3eSSadaf Ebrahimi 
// Further byte map checks; each case below dumps the byte map of one
// Latin-1 pattern and compares it with the expected listing.
TEST(TestCompile, OtherByteMapTests) {
  // "Absent" ranges must all be mapped to the same byte class.
  {
    const string want =
        "[00-2f] -> 0\n"
        "[30-39] -> 1\n"
        "[3a-40] -> 0\n"
        "[41-46] -> 1\n"
        "[47-60] -> 0\n"
        "[61-66] -> 1\n"
        "[67-ff] -> 0\n";
    string got;
    DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &got);
    EXPECT_EQ(want, got);
  }

  // The byte classes for \b.
  {
    const string want =
        "[00-2f] -> 0\n"
        "[30-39] -> 1\n"
        "[3a-40] -> 0\n"
        "[41-5a] -> 1\n"
        "[5b-5e] -> 0\n"
        "[5f-5f] -> 1\n"
        "[60-60] -> 0\n"
        "[61-7a] -> 1\n"
        "[7b-ff] -> 0\n";
    string got;
    DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &got);
    EXPECT_EQ(want, got);
  }

  // A bug in the ASCII case-folding optimization once created too many
  // byte classes for this pattern.
  {
    const string want =
        "[00-5e] -> 0\n"
        "[5f-5f] -> 1\n"
        "[60-ff] -> 0\n";
    string got;
    DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &got);
    EXPECT_EQ(want, got);
  }
}
204*ccdc9c3eSSadaf Ebrahimi 
// Verifies the distinct byte ranges involved in the UTF-8 dot ([^\n]).
// These were once erroneously split between 0x3f and 0x40 because that is
// a 6-bit boundary.
TEST(TestCompile, UTF8Ranges) {
  const string want =
      "[00-09] -> 0\n"
      "[0a-0a] -> 1\n"
      "[0b-7f] -> 0\n"
      "[80-8f] -> 2\n"
      "[90-9f] -> 3\n"
      "[a0-bf] -> 4\n"
      "[c0-c1] -> 1\n"
      "[c2-df] -> 5\n"
      "[e0-e0] -> 6\n"
      "[e1-ef] -> 7\n"
      "[f0-f0] -> 8\n"
      "[f1-f3] -> 9\n"
      "[f4-f4] -> 10\n"
      "[f5-ff] -> 1\n";

  string got;
  DumpByteMap(".", Regexp::PerlX, &got);
  EXPECT_EQ(want, got);
}
229*ccdc9c3eSSadaf Ebrahimi 
// Checks that compilation reports failure when its memory budget is too
// small for the pattern.
TEST(TestCompile, InsufficientMemory) {
  Regexp* re = Regexp::Parse(
      "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
      Regexp::LikePerl, NULL);
  EXPECT_TRUE(re != NULL);
  // EXPECT_TRUE may be non-fatal, so do not dereference a NULL result.
  if (re == NULL)
    return;

  Prog* prog = re->CompileToProg(920);
  // If the memory budget has been exhausted, compilation should fail
  // and return NULL instead of trying to do anything with NoMatch().
  EXPECT_TRUE(prog == NULL);
  // Deleting NULL is a no-op; this only frees a program that was produced
  // unexpectedly, so a failing run does not also leak it.
  delete prog;
  re->Decref();
}
241*ccdc9c3eSSadaf Ebrahimi 
Dump(StringPiece pattern,Regexp::ParseFlags flags,string * forward,string * reverse)242*ccdc9c3eSSadaf Ebrahimi static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
243*ccdc9c3eSSadaf Ebrahimi                  string* forward, string* reverse) {
244*ccdc9c3eSSadaf Ebrahimi   Regexp* re = Regexp::Parse(pattern, flags, NULL);
245*ccdc9c3eSSadaf Ebrahimi   EXPECT_TRUE(re != NULL);
246*ccdc9c3eSSadaf Ebrahimi 
247*ccdc9c3eSSadaf Ebrahimi   if (forward != NULL) {
248*ccdc9c3eSSadaf Ebrahimi     Prog* prog = re->CompileToProg(0);
249*ccdc9c3eSSadaf Ebrahimi     EXPECT_TRUE(prog != NULL);
250*ccdc9c3eSSadaf Ebrahimi     *forward = prog->Dump();
251*ccdc9c3eSSadaf Ebrahimi     delete prog;
252*ccdc9c3eSSadaf Ebrahimi   }
253*ccdc9c3eSSadaf Ebrahimi 
254*ccdc9c3eSSadaf Ebrahimi   if (reverse != NULL) {
255*ccdc9c3eSSadaf Ebrahimi     Prog* prog = re->CompileToReverseProg(0);
256*ccdc9c3eSSadaf Ebrahimi     EXPECT_TRUE(prog != NULL);
257*ccdc9c3eSSadaf Ebrahimi     *reverse = prog->Dump();
258*ccdc9c3eSSadaf Ebrahimi     delete prog;
259*ccdc9c3eSSadaf Ebrahimi   }
260*ccdc9c3eSSadaf Ebrahimi 
261*ccdc9c3eSSadaf Ebrahimi   re->Decref();
262*ccdc9c3eSSadaf Ebrahimi }
263*ccdc9c3eSSadaf Ebrahimi 
TEST(TestCompile,Bug26705922)264*ccdc9c3eSSadaf Ebrahimi TEST(TestCompile, Bug26705922) {
265*ccdc9c3eSSadaf Ebrahimi   // Bug in the compiler caused inefficient bytecode to be generated for Unicode
266*ccdc9c3eSSadaf Ebrahimi   // groups: common suffixes were cached, but common prefixes were not factored.
267*ccdc9c3eSSadaf Ebrahimi 
268*ccdc9c3eSSadaf Ebrahimi   string forward, reverse;
269*ccdc9c3eSSadaf Ebrahimi 
270*ccdc9c3eSSadaf Ebrahimi   Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
271*ccdc9c3eSSadaf Ebrahimi   EXPECT_EQ("3. byte [f0-f0] -> 4\n"
272*ccdc9c3eSSadaf Ebrahimi             "4. byte [90-90] -> 5\n"
273*ccdc9c3eSSadaf Ebrahimi             "5. byte [80-80] -> 6\n"
274*ccdc9c3eSSadaf Ebrahimi             "6+ byte [80-80] -> 8\n"
275*ccdc9c3eSSadaf Ebrahimi             "7. byte [90-90] -> 8\n"
276*ccdc9c3eSSadaf Ebrahimi             "8. match! 0\n",
277*ccdc9c3eSSadaf Ebrahimi             forward);
278*ccdc9c3eSSadaf Ebrahimi   EXPECT_EQ("3+ byte [80-80] -> 5\n"
279*ccdc9c3eSSadaf Ebrahimi             "4. byte [90-90] -> 5\n"
280*ccdc9c3eSSadaf Ebrahimi             "5. byte [80-80] -> 6\n"
281*ccdc9c3eSSadaf Ebrahimi             "6. byte [90-90] -> 7\n"
282*ccdc9c3eSSadaf Ebrahimi             "7. byte [f0-f0] -> 8\n"
283*ccdc9c3eSSadaf Ebrahimi             "8. match! 0\n",
284*ccdc9c3eSSadaf Ebrahimi             reverse);
285*ccdc9c3eSSadaf Ebrahimi 
286*ccdc9c3eSSadaf Ebrahimi   Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
287*ccdc9c3eSSadaf Ebrahimi   EXPECT_EQ("3+ byte [e8-ef] -> 5\n"
288*ccdc9c3eSSadaf Ebrahimi             "4. byte [f0-f0] -> 8\n"
289*ccdc9c3eSSadaf Ebrahimi             "5. byte [80-bf] -> 6\n"
290*ccdc9c3eSSadaf Ebrahimi             "6. byte [80-bf] -> 7\n"
291*ccdc9c3eSSadaf Ebrahimi             "7. match! 0\n"
292*ccdc9c3eSSadaf Ebrahimi             "8. byte [90-90] -> 5\n",
293*ccdc9c3eSSadaf Ebrahimi             forward);
294*ccdc9c3eSSadaf Ebrahimi   EXPECT_EQ("3. byte [80-bf] -> 4\n"
295*ccdc9c3eSSadaf Ebrahimi             "4. byte [80-bf] -> 5\n"
296*ccdc9c3eSSadaf Ebrahimi             "5+ byte [e8-ef] -> 7\n"
297*ccdc9c3eSSadaf Ebrahimi             "6. byte [90-90] -> 8\n"
298*ccdc9c3eSSadaf Ebrahimi             "7. match! 0\n"
299*ccdc9c3eSSadaf Ebrahimi             "8. byte [f0-f0] -> 7\n",
300*ccdc9c3eSSadaf Ebrahimi             reverse);
301*ccdc9c3eSSadaf Ebrahimi 
302*ccdc9c3eSSadaf Ebrahimi   Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
303*ccdc9c3eSSadaf Ebrahimi   EXPECT_EQ("3. byte [80-bf] -> 4\n"
304*ccdc9c3eSSadaf Ebrahimi             "4+ byte [c2-df] -> 7\n"
305*ccdc9c3eSSadaf Ebrahimi             "5+ byte [a0-bf] -> 8\n"
306*ccdc9c3eSSadaf Ebrahimi             "6. byte [80-bf] -> 9\n"
307*ccdc9c3eSSadaf Ebrahimi             "7. match! 0\n"
308*ccdc9c3eSSadaf Ebrahimi             "8. byte [e0-e0] -> 7\n"
309*ccdc9c3eSSadaf Ebrahimi             "9+ byte [e1-ef] -> 7\n"
310*ccdc9c3eSSadaf Ebrahimi             "10+ byte [90-bf] -> 13\n"
311*ccdc9c3eSSadaf Ebrahimi             "11+ byte [80-bf] -> 14\n"
312*ccdc9c3eSSadaf Ebrahimi             "12. byte [80-8f] -> 15\n"
313*ccdc9c3eSSadaf Ebrahimi             "13. byte [f0-f0] -> 7\n"
314*ccdc9c3eSSadaf Ebrahimi             "14. byte [f1-f3] -> 7\n"
315*ccdc9c3eSSadaf Ebrahimi             "15. byte [f4-f4] -> 7\n",
316*ccdc9c3eSSadaf Ebrahimi             reverse);
317*ccdc9c3eSSadaf Ebrahimi }
318*ccdc9c3eSSadaf Ebrahimi 
TEST(TestCompile,Bug35237384)319*ccdc9c3eSSadaf Ebrahimi TEST(TestCompile, Bug35237384) {
320*ccdc9c3eSSadaf Ebrahimi   // Bug in the compiler caused inefficient bytecode to be generated for
321*ccdc9c3eSSadaf Ebrahimi   // nested nullable subexpressions.
322*ccdc9c3eSSadaf Ebrahimi 
323*ccdc9c3eSSadaf Ebrahimi   string forward;
324*ccdc9c3eSSadaf Ebrahimi 
325*ccdc9c3eSSadaf Ebrahimi   Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
326*ccdc9c3eSSadaf Ebrahimi   EXPECT_EQ("3+ byte [61-61] -> 3\n"
327*ccdc9c3eSSadaf Ebrahimi             "4. nop -> 5\n"
328*ccdc9c3eSSadaf Ebrahimi             "5+ byte [61-61] -> 5\n"
329*ccdc9c3eSSadaf Ebrahimi             "6. nop -> 7\n"
330*ccdc9c3eSSadaf Ebrahimi             "7+ byte [61-61] -> 7\n"
331*ccdc9c3eSSadaf Ebrahimi             "8. match! 0\n",
332*ccdc9c3eSSadaf Ebrahimi             forward);
333*ccdc9c3eSSadaf Ebrahimi 
334*ccdc9c3eSSadaf Ebrahimi   Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
335*ccdc9c3eSSadaf Ebrahimi   EXPECT_EQ("3+ nop -> 6\n"
336*ccdc9c3eSSadaf Ebrahimi             "4+ nop -> 8\n"
337*ccdc9c3eSSadaf Ebrahimi             "5. nop -> 21\n"
338*ccdc9c3eSSadaf Ebrahimi             "6+ byte [61-61] -> 6\n"
339*ccdc9c3eSSadaf Ebrahimi             "7. nop -> 3\n"
340*ccdc9c3eSSadaf Ebrahimi             "8+ byte [62-62] -> 8\n"
341*ccdc9c3eSSadaf Ebrahimi             "9. nop -> 3\n"
342*ccdc9c3eSSadaf Ebrahimi             "10+ byte [61-61] -> 10\n"
343*ccdc9c3eSSadaf Ebrahimi             "11. nop -> 21\n"
344*ccdc9c3eSSadaf Ebrahimi             "12+ byte [62-62] -> 12\n"
345*ccdc9c3eSSadaf Ebrahimi             "13. nop -> 21\n"
346*ccdc9c3eSSadaf Ebrahimi             "14+ byte [61-61] -> 14\n"
347*ccdc9c3eSSadaf Ebrahimi             "15. nop -> 18\n"
348*ccdc9c3eSSadaf Ebrahimi             "16+ byte [62-62] -> 16\n"
349*ccdc9c3eSSadaf Ebrahimi             "17. nop -> 18\n"
350*ccdc9c3eSSadaf Ebrahimi             "18+ nop -> 14\n"
351*ccdc9c3eSSadaf Ebrahimi             "19+ nop -> 16\n"
352*ccdc9c3eSSadaf Ebrahimi             "20. match! 0\n"
353*ccdc9c3eSSadaf Ebrahimi             "21+ nop -> 10\n"
354*ccdc9c3eSSadaf Ebrahimi             "22+ nop -> 12\n"
355*ccdc9c3eSSadaf Ebrahimi             "23. nop -> 18\n",
356*ccdc9c3eSSadaf Ebrahimi       forward);
357*ccdc9c3eSSadaf Ebrahimi 
358*ccdc9c3eSSadaf Ebrahimi   Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
359*ccdc9c3eSSadaf Ebrahimi   EXPECT_EQ("3+ nop -> 36\n"
360*ccdc9c3eSSadaf Ebrahimi             "4+ nop -> 31\n"
361*ccdc9c3eSSadaf Ebrahimi             "5. nop -> 33\n"
362*ccdc9c3eSSadaf Ebrahimi             "6+ byte [00-09] -> 8\n"
363*ccdc9c3eSSadaf Ebrahimi             "7. byte [0b-ff] -> 8\n"
364*ccdc9c3eSSadaf Ebrahimi             "8+ nop -> 6\n"
365*ccdc9c3eSSadaf Ebrahimi             "9+ nop -> 29\n"
366*ccdc9c3eSSadaf Ebrahimi             "10. nop -> 28\n"
367*ccdc9c3eSSadaf Ebrahimi             "11+ byte [00-09] -> 13\n"
368*ccdc9c3eSSadaf Ebrahimi             "12. byte [0b-ff] -> 13\n"
369*ccdc9c3eSSadaf Ebrahimi             "13+ nop -> 11\n"
370*ccdc9c3eSSadaf Ebrahimi             "14+ nop -> 26\n"
371*ccdc9c3eSSadaf Ebrahimi             "15. nop -> 28\n"
372*ccdc9c3eSSadaf Ebrahimi             "16+ byte [00-09] -> 18\n"
373*ccdc9c3eSSadaf Ebrahimi             "17. byte [0b-ff] -> 18\n"
374*ccdc9c3eSSadaf Ebrahimi             "18+ nop -> 16\n"
375*ccdc9c3eSSadaf Ebrahimi             "19+ nop -> 36\n"
376*ccdc9c3eSSadaf Ebrahimi             "20. nop -> 33\n"
377*ccdc9c3eSSadaf Ebrahimi             "21+ byte [00-09] -> 23\n"
378*ccdc9c3eSSadaf Ebrahimi             "22. byte [0b-ff] -> 23\n"
379*ccdc9c3eSSadaf Ebrahimi             "23+ nop -> 21\n"
380*ccdc9c3eSSadaf Ebrahimi             "24+ nop -> 31\n"
381*ccdc9c3eSSadaf Ebrahimi             "25. nop -> 33\n"
382*ccdc9c3eSSadaf Ebrahimi             "26+ nop -> 28\n"
383*ccdc9c3eSSadaf Ebrahimi             "27. byte [53-53] -> 11\n"
384*ccdc9c3eSSadaf Ebrahimi             "28. match! 0\n"
385*ccdc9c3eSSadaf Ebrahimi             "29+ nop -> 28\n"
386*ccdc9c3eSSadaf Ebrahimi             "30. byte [53-53] -> 6\n"
387*ccdc9c3eSSadaf Ebrahimi             "31+ nop -> 33\n"
388*ccdc9c3eSSadaf Ebrahimi             "32. byte [53-53] -> 21\n"
389*ccdc9c3eSSadaf Ebrahimi             "33+ nop -> 29\n"
390*ccdc9c3eSSadaf Ebrahimi             "34+ nop -> 26\n"
391*ccdc9c3eSSadaf Ebrahimi             "35. nop -> 28\n"
392*ccdc9c3eSSadaf Ebrahimi             "36+ nop -> 33\n"
393*ccdc9c3eSSadaf Ebrahimi             "37. byte [53-53] -> 16\n",
394*ccdc9c3eSSadaf Ebrahimi       forward);
395*ccdc9c3eSSadaf Ebrahimi }
396*ccdc9c3eSSadaf Ebrahimi 
397*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
398