1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <string>
6 
7 #include "util/test.h"
8 #include "util/logging.h"
9 #include "re2/prog.h"
10 #include "re2/regexp.h"
11 
12 namespace re2 {
13 
14 struct PrefixTest {
15   const char* regexp;
16   bool return_value;
17   const char* prefix;
18   bool foldcase;
19   const char* suffix;
20 };
21 
22 static PrefixTest tests[] = {
23   // Empty cases.
24   { "", false },
25   { "(?m)^", false },
26   { "(?-m)^", false },
27 
28   // If the regexp has no ^, there's no required prefix.
29   { "abc", false },
30 
31   // If the regexp immediately goes into
32   // something not a literal match, there's no required prefix.
33   { "^a*",  false },
34   { "^(abc)", false },
35 
36   // Otherwise, it should work.
37   { "^abc$", true, "abc", false, "(?-m:$)" },
38   { "^abc", true, "abc", false, "" },
39   { "^(?i)abc", true, "abc", true, "" },
40   { "^abcd*", true, "abc", false, "d*" },
41   { "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
42   { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
43   { "^☺abc", true, "☺abc", false, "" },
44 };
45 
TEST(RequiredPrefix,SimpleTests)46 TEST(RequiredPrefix, SimpleTests) {
47   for (size_t i = 0; i < arraysize(tests); i++) {
48     const PrefixTest& t = tests[i];
49     for (size_t j = 0; j < 2; j++) {
50       Regexp::ParseFlags flags = Regexp::LikePerl;
51       if (j == 0)
52         flags = flags | Regexp::Latin1;
53       Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
54       ASSERT_TRUE(re != NULL) << " " << t.regexp;
55 
56       std::string p;
57       bool f;
58       Regexp* s;
59       ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
60         << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
61         << " " << re->Dump();
62       if (t.return_value) {
63         ASSERT_EQ(p, std::string(t.prefix))
64           << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
65         ASSERT_EQ(f, t.foldcase)
66           << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
67         ASSERT_EQ(s->ToString(), std::string(t.suffix))
68           << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
69         s->Decref();
70       }
71       re->Decref();
72     }
73   }
74 }
75 
76 static PrefixTest for_accel_tests[] = {
77   // Empty cases.
78   { "", false },
79   { "(?m)^", false },
80   { "(?-m)^", false },
81 
82   // If the regexp has a ^, there's no required prefix.
83   { "^abc", false },
84 
85   // If the regexp immediately goes into
86   // something not a literal match, there's no required prefix.
87   { "a*",  false },
88 
89   // Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through"
90   // capturing groups, but doesn't try to glue prefix fragments together.
91   { "(a?)def", false },
92   { "(ab?)def", true, "a", false },
93   { "(abc?)def", true, "ab", false },
94   { "(()a)def", false },
95   { "((a)b)def", true, "a", false },
96   { "((ab)c)def", true, "ab", false },
97 
98   // Otherwise, it should work.
99   { "abc$", true, "abc", false },
100   { "abc", true, "abc", false },
101   { "(?i)abc", true, "abc", true },
102   { "abcd*", true, "abc", false },
103   { "[Aa][Bb]cd*", true, "ab", true },
104   { "ab[Cc]d*", true, "ab", false },
105   { "☺abc", true, "☺abc", false },
106 };
107 
TEST(RequiredPrefixForAccel,SimpleTests)108 TEST(RequiredPrefixForAccel, SimpleTests) {
109   for (size_t i = 0; i < arraysize(for_accel_tests); i++) {
110     const PrefixTest& t = for_accel_tests[i];
111     for (size_t j = 0; j < 2; j++) {
112       Regexp::ParseFlags flags = Regexp::LikePerl;
113       if (j == 0)
114         flags = flags | Regexp::Latin1;
115       Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
116       ASSERT_TRUE(re != NULL) << " " << t.regexp;
117 
118       std::string p;
119       bool f;
120       ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f))
121         << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
122         << " " << re->Dump();
123       if (t.return_value) {
124         ASSERT_EQ(p, std::string(t.prefix))
125           << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
126         ASSERT_EQ(f, t.foldcase)
127           << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
128       }
129       re->Decref();
130     }
131   }
132 }
133 
TEST(RequiredPrefixForAccel,CaseFoldingForKAndS)134 TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) {
135   Regexp* re;
136   std::string p;
137   bool f;
138 
139   // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'.
140   re = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL);
141   ASSERT_TRUE(re != NULL);
142   ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
143   ASSERT_EQ(p, "klm");
144   ASSERT_EQ(f, true);
145   re->Decref();
146 
147   re = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL);
148   ASSERT_TRUE(re != NULL);
149   ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
150   ASSERT_EQ(p, "stu");
151   ASSERT_EQ(f, true);
152   re->Decref();
153 
154   // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'.
155   // This is because they match U+212A and U+017F, respectively, and
156   // so the parser ends up emitting character classes, not literals.
157   re = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL);
158   ASSERT_TRUE(re != NULL);
159   ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
160   re->Decref();
161 
162   re = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL);
163   ASSERT_TRUE(re != NULL);
164   ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
165   re->Decref();
166 }
167 
168 static const char* prefix_accel_tests[] = {
169     "aababc\\d+",
170     "(?i)AABABC\\d+",
171 };
172 
TEST(PrefixAccel,SimpleTests)173 TEST(PrefixAccel, SimpleTests) {
174   for (size_t i = 0; i < arraysize(prefix_accel_tests); i++) {
175     const char* pattern = prefix_accel_tests[i];
176     Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
177     ASSERT_TRUE(re != NULL);
178     Prog* prog = re->CompileToProg(0);
179     ASSERT_TRUE(prog != NULL);
180     ASSERT_TRUE(prog->can_prefix_accel());
181     for (int j = 0; j < 100; j++) {
182       std::string text(j, 'a');
183       const char* p = reinterpret_cast<const char*>(
184           prog->PrefixAccel(text.data(), text.size()));
185       EXPECT_TRUE(p == NULL);
186       text.append("aababc");
187       for (int k = 0; k < 100; k++) {
188         text.append(k, 'a');
189         p = reinterpret_cast<const char*>(
190             prog->PrefixAccel(text.data(), text.size()));
191         EXPECT_EQ(j, p - text.data());
192       }
193     }
194     delete prog;
195     re->Decref();
196   }
197 }
198 
199 }  // namespace re2
200