1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include <string>
6
7 #include "util/test.h"
8 #include "util/logging.h"
9 #include "re2/prog.h"
10 #include "re2/regexp.h"
11
12 namespace re2 {
13
14 struct PrefixTest {
15 const char* regexp;
16 bool return_value;
17 const char* prefix;
18 bool foldcase;
19 const char* suffix;
20 };
21
22 static PrefixTest tests[] = {
23 // Empty cases.
24 { "", false },
25 { "(?m)^", false },
26 { "(?-m)^", false },
27
28 // If the regexp has no ^, there's no required prefix.
29 { "abc", false },
30
31 // If the regexp immediately goes into
32 // something not a literal match, there's no required prefix.
33 { "^a*", false },
34 { "^(abc)", false },
35
36 // Otherwise, it should work.
37 { "^abc$", true, "abc", false, "(?-m:$)" },
38 { "^abc", true, "abc", false, "" },
39 { "^(?i)abc", true, "abc", true, "" },
40 { "^abcd*", true, "abc", false, "d*" },
41 { "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
42 { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
43 { "^☺abc", true, "☺abc", false, "" },
44 };
45
TEST(RequiredPrefix,SimpleTests)46 TEST(RequiredPrefix, SimpleTests) {
47 for (size_t i = 0; i < arraysize(tests); i++) {
48 const PrefixTest& t = tests[i];
49 for (size_t j = 0; j < 2; j++) {
50 Regexp::ParseFlags flags = Regexp::LikePerl;
51 if (j == 0)
52 flags = flags | Regexp::Latin1;
53 Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
54 ASSERT_TRUE(re != NULL) << " " << t.regexp;
55
56 std::string p;
57 bool f;
58 Regexp* s;
59 ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
60 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
61 << " " << re->Dump();
62 if (t.return_value) {
63 ASSERT_EQ(p, std::string(t.prefix))
64 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
65 ASSERT_EQ(f, t.foldcase)
66 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
67 ASSERT_EQ(s->ToString(), std::string(t.suffix))
68 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
69 s->Decref();
70 }
71 re->Decref();
72 }
73 }
74 }
75
76 static PrefixTest for_accel_tests[] = {
77 // Empty cases.
78 { "", false },
79 { "(?m)^", false },
80 { "(?-m)^", false },
81
82 // If the regexp has a ^, there's no required prefix.
83 { "^abc", false },
84
85 // If the regexp immediately goes into
86 // something not a literal match, there's no required prefix.
87 { "a*", false },
88
89 // Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through"
90 // capturing groups, but doesn't try to glue prefix fragments together.
91 { "(a?)def", false },
92 { "(ab?)def", true, "a", false },
93 { "(abc?)def", true, "ab", false },
94 { "(()a)def", false },
95 { "((a)b)def", true, "a", false },
96 { "((ab)c)def", true, "ab", false },
97
98 // Otherwise, it should work.
99 { "abc$", true, "abc", false },
100 { "abc", true, "abc", false },
101 { "(?i)abc", true, "abc", true },
102 { "abcd*", true, "abc", false },
103 { "[Aa][Bb]cd*", true, "ab", true },
104 { "ab[Cc]d*", true, "ab", false },
105 { "☺abc", true, "☺abc", false },
106 };
107
TEST(RequiredPrefixForAccel,SimpleTests)108 TEST(RequiredPrefixForAccel, SimpleTests) {
109 for (size_t i = 0; i < arraysize(for_accel_tests); i++) {
110 const PrefixTest& t = for_accel_tests[i];
111 for (size_t j = 0; j < 2; j++) {
112 Regexp::ParseFlags flags = Regexp::LikePerl;
113 if (j == 0)
114 flags = flags | Regexp::Latin1;
115 Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
116 ASSERT_TRUE(re != NULL) << " " << t.regexp;
117
118 std::string p;
119 bool f;
120 ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f))
121 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
122 << " " << re->Dump();
123 if (t.return_value) {
124 ASSERT_EQ(p, std::string(t.prefix))
125 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
126 ASSERT_EQ(f, t.foldcase)
127 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
128 }
129 re->Decref();
130 }
131 }
132 }
133
TEST(RequiredPrefixForAccel,CaseFoldingForKAndS)134 TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) {
135 Regexp* re;
136 std::string p;
137 bool f;
138
139 // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'.
140 re = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL);
141 ASSERT_TRUE(re != NULL);
142 ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
143 ASSERT_EQ(p, "klm");
144 ASSERT_EQ(f, true);
145 re->Decref();
146
147 re = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL);
148 ASSERT_TRUE(re != NULL);
149 ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
150 ASSERT_EQ(p, "stu");
151 ASSERT_EQ(f, true);
152 re->Decref();
153
154 // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'.
155 // This is because they match U+212A and U+017F, respectively, and
156 // so the parser ends up emitting character classes, not literals.
157 re = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL);
158 ASSERT_TRUE(re != NULL);
159 ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
160 re->Decref();
161
162 re = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL);
163 ASSERT_TRUE(re != NULL);
164 ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
165 re->Decref();
166 }
167
168 static const char* prefix_accel_tests[] = {
169 "aababc\\d+",
170 "(?i)AABABC\\d+",
171 };
172
TEST(PrefixAccel,SimpleTests)173 TEST(PrefixAccel, SimpleTests) {
174 for (size_t i = 0; i < arraysize(prefix_accel_tests); i++) {
175 const char* pattern = prefix_accel_tests[i];
176 Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
177 ASSERT_TRUE(re != NULL);
178 Prog* prog = re->CompileToProg(0);
179 ASSERT_TRUE(prog != NULL);
180 ASSERT_TRUE(prog->can_prefix_accel());
181 for (int j = 0; j < 100; j++) {
182 std::string text(j, 'a');
183 const char* p = reinterpret_cast<const char*>(
184 prog->PrefixAccel(text.data(), text.size()));
185 EXPECT_TRUE(p == NULL);
186 text.append("aababc");
187 for (int k = 0; k < 100; k++) {
188 text.append(k, 'a');
189 p = reinterpret_cast<const char*>(
190 prog->PrefixAccel(text.data(), text.size()));
191 EXPECT_EQ(j, p - text.data());
192 }
193 }
194 delete prog;
195 re->Decref();
196 }
197 }
198
199 } // namespace re2
200