1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Test parse.cc, dump.cc, and tostring.cc.
6
#include <string>
#include <vector>

#include "util/test.h"
#include "util/logging.h"
#include "re2/regexp.h"
12
13 namespace re2 {
14
15 // In the past, we used 1<<30 here and zeroed the bit later, but that
16 // has undefined behaviour, so now we use an internal-only flag because
17 // otherwise we would have to introduce a new flag value just for this.
18 static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;
19
20 struct Test {
21 const char* regexp;
22 const char* parse;
23 Regexp::ParseFlags flags;
24 };
25
26 static Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
27 Regexp::PerlX |
28 Regexp::PerlClasses |
29 Regexp::UnicodeGroups;
30
31 static Test tests[] = {
32 // Base cases
33 { "a", "lit{a}" },
34 { "a.", "cat{lit{a}dot{}}" },
35 { "a.b", "cat{lit{a}dot{}lit{b}}" },
36 { "ab", "str{ab}" },
37 { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
38 { "abc", "str{abc}" },
39 { "a|^", "alt{lit{a}bol{}}" },
40 { "a|b", "cc{0x61-0x62}" },
41 { "(a)", "cap{lit{a}}" },
42 { "(a)|b", "alt{cap{lit{a}}lit{b}}" },
43 { "a*", "star{lit{a}}" },
44 { "a+", "plus{lit{a}}" },
45 { "a?", "que{lit{a}}" },
46 { "a{2}", "rep{2,2 lit{a}}" },
47 { "a{2,3}", "rep{2,3 lit{a}}" },
48 { "a{2,}", "rep{2,-1 lit{a}}" },
49 { "a*?", "nstar{lit{a}}" },
50 { "a+?", "nplus{lit{a}}" },
51 { "a??", "nque{lit{a}}" },
52 { "a{2}?", "nrep{2,2 lit{a}}" },
53 { "a{2,3}?", "nrep{2,3 lit{a}}" },
54 { "a{2,}?", "nrep{2,-1 lit{a}}" },
55 { "", "emp{}" },
56 { "|", "alt{emp{}emp{}}" },
57 { "|x|", "alt{emp{}lit{x}emp{}}" },
58 { ".", "dot{}" },
59 { "^", "bol{}" },
60 { "$", "eol{}" },
61 { "\\|", "lit{|}" },
62 { "\\(", "lit{(}" },
63 { "\\)", "lit{)}" },
64 { "\\*", "lit{*}" },
65 { "\\+", "lit{+}" },
66 { "\\?", "lit{?}" },
67 { "{", "lit{{}" },
68 { "}", "lit{}}" },
69 { "\\.", "lit{.}" },
70 { "\\^", "lit{^}" },
71 { "\\$", "lit{$}" },
72 { "\\\\", "lit{\\}" },
73 { "[ace]", "cc{0x61 0x63 0x65}" },
74 { "[abc]", "cc{0x61-0x63}" },
75 { "[a-z]", "cc{0x61-0x7a}" },
76 { "[a]", "lit{a}" },
77 { "\\-", "lit{-}" },
78 { "-", "lit{-}" },
79 { "\\_", "lit{_}" },
80
81 // Posix and Perl extensions
82 { "[[:lower:]]", "cc{0x61-0x7a}" },
83 { "[a-z]", "cc{0x61-0x7a}" },
84 { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
85 { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
86 { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
87 { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
88 { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
89 { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
90 { "\\d", "cc{0x30-0x39}" },
91 { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
92 { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
93 { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
94 { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
95 { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
96 { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
97 { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
98 { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
99 { "\\C", "byte{}" },
100
101 // Unicode, negatives, and a double negative.
102 { "\\p{Braille}", "cc{0x2800-0x28ff}" },
103 { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
104 { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
105 { "\\P{^Braille}", "cc{0x2800-0x28ff}" },
106
107 // More interesting regular expressions.
108 { "a{,2}", "str{a{,2}}" },
109 { "\\.\\^\\$\\\\", "str{.^$\\}" },
110 { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
111 { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
112 { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8
113 { "a*{", "cat{star{lit{a}}lit{{}}" },
114
115 // Test precedences
116 { "(?:ab)*", "star{str{ab}}" },
117 { "(ab)*", "star{cap{str{ab}}}" },
118 { "ab|cd", "alt{str{ab}str{cd}}" },
119 { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
120
121 // Test squashing of **, ++, ?? et cetera.
122 { "(?:(?:a)*)*", "star{lit{a}}" },
123 { "(?:(?:a)+)+", "plus{lit{a}}" },
124 { "(?:(?:a)?)?", "que{lit{a}}" },
125 { "(?:(?:a)*)+", "star{lit{a}}" },
126 { "(?:(?:a)*)?", "star{lit{a}}" },
127 { "(?:(?:a)+)*", "star{lit{a}}" },
128 { "(?:(?:a)+)?", "star{lit{a}}" },
129 { "(?:(?:a)?)*", "star{lit{a}}" },
130 { "(?:(?:a)?)+", "star{lit{a}}" },
131
132 // Test flattening.
133 { "(?:a)", "lit{a}" },
134 { "(?:ab)(?:cd)", "str{abcd}" },
135 { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
136 { "a|c", "cc{0x61 0x63}" },
137 { "a|[cd]", "cc{0x61 0x63-0x64}" },
138 { "a|.", "dot{}" },
139 { "[ab]|c", "cc{0x61-0x63}" },
140 { "[ab]|[cd]", "cc{0x61-0x64}" },
141 { "[ab]|.", "dot{}" },
142 { ".|c", "dot{}" },
143 { ".|[cd]", "dot{}" },
144 { ".|.", "dot{}" },
145
146 // Test Perl quoted literals
147 { "\\Q+|*?{[\\E", "str{+|*?{[}" },
148 { "\\Q+\\E+", "plus{lit{+}}" },
149 { "\\Q\\\\E", "lit{\\}" },
150 { "\\Q\\\\\\E", "str{\\\\}" },
151 { "\\Qa\\E*", "star{lit{a}}" },
152 { "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" },
153 { "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" },
154
155 // Test Perl \A and \z
156 { "(?m)^", "bol{}" },
157 { "(?m)$", "eol{}" },
158 { "(?-m)^", "bot{}" },
159 { "(?-m)$", "eot{}" },
160 { "(?m)\\A", "bot{}" },
161 { "(?m)\\z", "eot{\\z}" },
162 { "(?-m)\\A", "bot{}" },
163 { "(?-m)\\z", "eot{\\z}" },
164
165 // Test named captures
166 { "(?P<name>a)", "cap{name:lit{a}}" },
167 { "(?P<中文>a)", "cap{中文:lit{a}}" },
168
169 // Case-folded literals
170 { "[Aa]", "litfold{a}" },
171
172 // Strings
173 { "abcde", "str{abcde}" },
174 { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
175
176 // Reported bug involving \n leaking in despite use of NeverNL.
177 { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
178 { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
179 { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
180 { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
181 { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
182 { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
183 { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
184 { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
185 { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
186 { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
187 { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
188 { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
189 { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
190 { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
191 { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
192 { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
193 { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
194 { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
195 { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
196 { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
197 { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
198 { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
199 { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
200 { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
201 { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
202 { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
203 { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
204 { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
205 { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
206 Regexp::PerlClasses },
207 { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
208 Regexp::PerlClasses | Regexp::FoldCase },
209 { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
210 Regexp::PerlClasses | Regexp::NeverNL },
211 { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
212 Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
213 { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
214 Regexp::PerlClasses },
215 { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
216 Regexp::PerlClasses | Regexp::FoldCase },
217 { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
218 Regexp::PerlClasses | Regexp::NeverNL },
219 { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
220 Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
221
222 // Bug in Regexp::ToString() that emitted [^], which
223 // would (obviously) fail to parse when fed back in.
224 { "[\\s\\S]", "cc{0-0x10ffff}" },
225 };
226
RegexpEqualTestingOnly(Regexp * a,Regexp * b)227 bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
228 return Regexp::Equal(a, b);
229 }
230
TestParse(const Test * tests,int ntests,Regexp::ParseFlags flags,const std::string & title)231 void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
232 const std::string& title) {
233 Regexp** re = new Regexp*[ntests];
234 for (int i = 0; i < ntests; i++) {
235 RegexpStatus status;
236 Regexp::ParseFlags f = flags;
237 if (tests[i].flags != 0) {
238 f = tests[i].flags & ~TestZeroFlags;
239 }
240 re[i] = Regexp::Parse(tests[i].regexp, f, &status);
241 ASSERT_TRUE(re[i] != NULL)
242 << " " << tests[i].regexp << " " << status.Text();
243 std::string s = re[i]->Dump();
244 EXPECT_EQ(std::string(tests[i].parse), s)
245 << "Regexp: " << tests[i].regexp
246 << "\nparse: " << std::string(tests[i].parse)
247 << " s: " << s << " flag=" << f;
248 }
249
250 for (int i = 0; i < ntests; i++) {
251 for (int j = 0; j < ntests; j++) {
252 EXPECT_EQ(std::string(tests[i].parse) == std::string(tests[j].parse),
253 RegexpEqualTestingOnly(re[i], re[j]))
254 << "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
255 }
256 }
257
258 for (int i = 0; i < ntests; i++)
259 re[i]->Decref();
260 delete[] re;
261 }
262
263 // Test that regexps parse to expected structures.
TEST(TestParse,SimpleRegexps)264 TEST(TestParse, SimpleRegexps) {
265 TestParse(tests, arraysize(tests), kTestFlags, "simple");
266 }
267
268 Test foldcase_tests[] = {
269 { "AbCdE", "strfold{abcde}" },
270 { "[Aa]", "litfold{a}" },
271 { "a", "litfold{a}" },
272
273 // 0x17F is an old English long s (looks like an f) and folds to s.
274 // 0x212A is the Kelvin symbol and folds to k.
275 { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...]
276 { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
277 { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
278 };
279
280 // Test that parsing with FoldCase works.
TEST(TestParse,FoldCase)281 TEST(TestParse, FoldCase) {
282 TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase");
283 }
284
285 Test literal_tests[] = {
286 { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
287 };
288
289 // Test that parsing with Literal works.
TEST(TestParse,Literal)290 TEST(TestParse, Literal) {
291 TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal");
292 }
293
294 Test matchnl_tests[] = {
295 { ".", "dot{}" },
296 { "\n", "lit{\n}" },
297 { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
298 { "[a\\n]", "cc{0xa 0x61}" },
299 };
300
301 // Test that parsing with MatchNL works.
302 // (Also tested above during simple cases.)
TEST(TestParse,MatchNL)303 TEST(TestParse, MatchNL) {
304 TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL");
305 }
306
307 Test nomatchnl_tests[] = {
308 { ".", "cc{0-0x9 0xb-0x10ffff}" },
309 { "\n", "lit{\n}" },
310 { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
311 { "[a\\n]", "cc{0xa 0x61}" },
312 };
313
314 // Test that parsing without MatchNL works.
TEST(TestParse,NoMatchNL)315 TEST(TestParse, NoMatchNL) {
316 TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
317 }
318
319 Test prefix_tests[] = {
320 { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
321 { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
322 { "abc|abd|aef|bcx|bcy",
323 "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
324 "cat{str{bc}cc{0x78-0x79}}}" },
325 { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
326 { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
327 { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
328 { ".c|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" },
329 { "\\Cc|\\Cd", "cat{byte{}cc{0x63-0x64}}" },
330 { "x{2}|x{2}[0-9]",
331 "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
332 { "x{2}y|x{2}[0-9]y",
333 "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
334 { "n|r|rs",
335 "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
336 { "n|rs|r",
337 "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
338 { "r|rs|n",
339 "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
340 { "rs|r|n",
341 "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
342 { "a\\C*?c|a\\C*?b",
343 "cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" },
344 { "^/a/bc|^/a/de",
345 "cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" },
346 // In the past, factoring was limited to kFactorAlternationMaxDepth (8).
347 { "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa",
348 "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
349 "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
350 "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
351 "lit{a}}}}}}}}}}}}}}}}}}}" },
352 { "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones",
353 "cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}"
354 "cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}"
355 "str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" },
356 };
357
358 // Test that prefix factoring works.
TEST(TestParse,Prefix)359 TEST(TestParse, Prefix) {
360 TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix");
361 }
362
363 Test nested_tests[] = {
364 { "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))",
365 "cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" },
366 { "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
367 "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" },
368 { "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
369 "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" },
370 { "((((((x{2}){2}){2}){5}){5}){5})",
371 "cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" },
372 };
373
374 // Test that nested repetition works.
TEST(TestParse,Nested)375 TEST(TestParse, Nested) {
376 TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested");
377 }
378
// Regular expressions that must be rejected both with Regexp::PerlX and
// with Regexp::NoParseFlags.
const char* badtests[] = {
  // Unbalanced parentheses and brackets.
  "(",
  ")",
  "(a",
  "(a|b|",
  "(a|b",
  "[a-z",
  "([a-z)",

  "x{1001}",

  // Invalid UTF-8
  "\xff",
  "[\xff]",
  "[\\\xff]",
  "\\\xff",

  // Malformed named captures.
  "(?P<name>a",
  "(?P<name>",
  "(?P<name",
  "(?P<x y>a)",
  "(?P<>a)",

  // Character class ranges running from 'a' down to 'Z'.
  "[a-Z]",
  "(?i)[a-Z]",

  // Repetitions.
  // NOTE(review): presumably beyond the parser's repeat limits, directly
  // or once the nested counts are multiplied out -- confirm in parse.cc.
  "a{100000}",
  "a{100000,}",
  "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
  "(((x{7}){11}){13})",

  "\\Q\\E*",
};
406
// Valid in Perl, bad in POSIX: these must parse with Regexp::PerlX and be
// rejected with Regexp::NoParseFlags.
const char* only_perl[] = {
  "[a-b-c]",

  // \Q...\E quoting, with two to five backslashes between \Q and the E.
  "\\Qabc\\E",
  "\\Q*+?{[\\E",
  "\\Q\\\\E",
  "\\Q\\\\\\E",
  "\\Q\\\\\\\\E",
  "\\Q\\\\\\\\\\E",

  // Group syntax.
  "(?:a)",
  "(?P<name>a)",
};
419
// Valid in POSIX, bad in Perl: these must parse with Regexp::NoParseFlags
// and be rejected with Regexp::PerlX.  All of them apply a repetition
// operator directly to another repetition.
const char* only_posix[] = {
  "a++",
  "a**",
  "a?*",
  "a+*",
  "a{1}*",
};
428
429 // Test that parser rejects bad regexps.
TEST(TestParse,InvalidRegexps)430 TEST(TestParse, InvalidRegexps) {
431 for (size_t i = 0; i < arraysize(badtests); i++) {
432 ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
433 << " " << badtests[i];
434 ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
435 << " " << badtests[i];
436 }
437 for (size_t i = 0; i < arraysize(only_posix); i++) {
438 ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
439 << " " << only_posix[i];
440 Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
441 ASSERT_TRUE(re != NULL) << " " << only_posix[i];
442 re->Decref();
443 }
444 for (size_t i = 0; i < arraysize(only_perl); i++) {
445 ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
446 << " " << only_perl[i];
447 Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
448 ASSERT_TRUE(re != NULL) << " " << only_perl[i];
449 re->Decref();
450 }
451 }
452
453 // Test that ToString produces original regexp or equivalent one.
TEST(TestToString,EquivalentParse)454 TEST(TestToString, EquivalentParse) {
455 for (size_t i = 0; i < arraysize(tests); i++) {
456 RegexpStatus status;
457 Regexp::ParseFlags f = kTestFlags;
458 if (tests[i].flags != 0) {
459 f = tests[i].flags & ~TestZeroFlags;
460 }
461 Regexp* re = Regexp::Parse(tests[i].regexp, f, &status);
462 ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
463 std::string s = re->Dump();
464 EXPECT_EQ(std::string(tests[i].parse), s)
465 << "Regexp: " << tests[i].regexp
466 << "\nparse: " << std::string(tests[i].parse)
467 << " s: " << s << " flag=" << f;
468 std::string t = re->ToString();
469 if (t != tests[i].regexp) {
470 // If ToString didn't return the original regexp,
471 // it must have found one with fewer parens.
472 // Unfortunately we can't check the length here, because
473 // ToString produces "\\{" for a literal brace,
474 // but "{" is a shorter equivalent.
475 // ASSERT_LT(t.size(), strlen(tests[i].regexp))
476 // << " t=" << t << " regexp=" << tests[i].regexp;
477
478 // Test that if we parse the new regexp we get the same structure.
479 Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
480 ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
481 std::string ss = nre->Dump();
482 std::string tt = nre->ToString();
483 if (s != ss || t != tt)
484 LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
485 EXPECT_EQ(s, ss);
486 EXPECT_EQ(t, tt);
487 nre->Decref();
488 }
489 re->Decref();
490 }
491 }
492
493 // Test that capture error args are correct.
TEST(NamedCaptures,ErrorArgs)494 TEST(NamedCaptures, ErrorArgs) {
495 RegexpStatus status;
496 Regexp* re;
497
498 re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
499 EXPECT_TRUE(re == NULL);
500 EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
501 EXPECT_EQ(status.error_arg(), "(?P<name");
502
503 re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
504 EXPECT_TRUE(re == NULL);
505 EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
506 EXPECT_EQ(status.error_arg(), "(?P<space bar>");
507 }
508
509 } // namespace re2
510