// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Exhaustive testing of regular expression matching.
6*ccdc9c3eSSadaf Ebrahimi
7*ccdc9c3eSSadaf Ebrahimi #include <stddef.h>
8*ccdc9c3eSSadaf Ebrahimi #include <memory>
9*ccdc9c3eSSadaf Ebrahimi #include <string>
10*ccdc9c3eSSadaf Ebrahimi #include <vector>
11*ccdc9c3eSSadaf Ebrahimi
12*ccdc9c3eSSadaf Ebrahimi #include "util/test.h"
13*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
14*ccdc9c3eSSadaf Ebrahimi #include "re2/testing/exhaustive_tester.h"
15*ccdc9c3eSSadaf Ebrahimi
16*ccdc9c3eSSadaf Ebrahimi namespace re2 {
17*ccdc9c3eSSadaf Ebrahimi
18*ccdc9c3eSSadaf Ebrahimi // Test simple character classes by themselves.
TEST(CharacterClasses,Exhaustive)19*ccdc9c3eSSadaf Ebrahimi TEST(CharacterClasses, Exhaustive) {
20*ccdc9c3eSSadaf Ebrahimi std::vector<string> atoms = Split(" ",
21*ccdc9c3eSSadaf Ebrahimi "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
22*ccdc9c3eSSadaf Ebrahimi ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
23*ccdc9c3eSSadaf Ebrahimi 5, Explode("ab"), "", "");
24*ccdc9c3eSSadaf Ebrahimi }
25*ccdc9c3eSSadaf Ebrahimi
26*ccdc9c3eSSadaf Ebrahimi // Test simple character classes inside a___b (for example, a[a]b).
TEST(CharacterClasses,ExhaustiveAB)27*ccdc9c3eSSadaf Ebrahimi TEST(CharacterClasses, ExhaustiveAB) {
28*ccdc9c3eSSadaf Ebrahimi std::vector<string> atoms = Split(" ",
29*ccdc9c3eSSadaf Ebrahimi "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
30*ccdc9c3eSSadaf Ebrahimi ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
31*ccdc9c3eSSadaf Ebrahimi 5, Explode("ab"), "a%sb", "");
32*ccdc9c3eSSadaf Ebrahimi }
33*ccdc9c3eSSadaf Ebrahimi
34*ccdc9c3eSSadaf Ebrahimi // Returns UTF8 for Rune r
UTF8(Rune r)35*ccdc9c3eSSadaf Ebrahimi static string UTF8(Rune r) {
36*ccdc9c3eSSadaf Ebrahimi char buf[UTFmax+1];
37*ccdc9c3eSSadaf Ebrahimi buf[runetochar(buf, &r)] = 0;
38*ccdc9c3eSSadaf Ebrahimi return string(buf);
39*ccdc9c3eSSadaf Ebrahimi }
40*ccdc9c3eSSadaf Ebrahimi
41*ccdc9c3eSSadaf Ebrahimi // Returns a vector of "interesting" UTF8 characters.
42*ccdc9c3eSSadaf Ebrahimi // Unicode is now too big to just return all of them,
43*ccdc9c3eSSadaf Ebrahimi // so UTF8Characters return a set likely to be good test cases.
InterestingUTF8()44*ccdc9c3eSSadaf Ebrahimi static const std::vector<string>& InterestingUTF8() {
45*ccdc9c3eSSadaf Ebrahimi static bool init;
46*ccdc9c3eSSadaf Ebrahimi static std::vector<string> v;
47*ccdc9c3eSSadaf Ebrahimi
48*ccdc9c3eSSadaf Ebrahimi if (init)
49*ccdc9c3eSSadaf Ebrahimi return v;
50*ccdc9c3eSSadaf Ebrahimi
51*ccdc9c3eSSadaf Ebrahimi init = true;
52*ccdc9c3eSSadaf Ebrahimi // All the Latin1 equivalents are interesting.
53*ccdc9c3eSSadaf Ebrahimi for (int i = 1; i < 256; i++)
54*ccdc9c3eSSadaf Ebrahimi v.push_back(UTF8(i));
55*ccdc9c3eSSadaf Ebrahimi
56*ccdc9c3eSSadaf Ebrahimi // After that, the codes near bit boundaries are
57*ccdc9c3eSSadaf Ebrahimi // interesting, because they span byte sequence lengths.
58*ccdc9c3eSSadaf Ebrahimi for (int j = 0; j < 8; j++)
59*ccdc9c3eSSadaf Ebrahimi v.push_back(UTF8(256 + j));
60*ccdc9c3eSSadaf Ebrahimi for (int i = 512; i < Runemax; i <<= 1)
61*ccdc9c3eSSadaf Ebrahimi for (int j = -8; j < 8; j++)
62*ccdc9c3eSSadaf Ebrahimi v.push_back(UTF8(i + j));
63*ccdc9c3eSSadaf Ebrahimi
64*ccdc9c3eSSadaf Ebrahimi // The codes near Runemax, including Runemax itself, are interesting.
65*ccdc9c3eSSadaf Ebrahimi for (int j = -8; j <= 0; j++)
66*ccdc9c3eSSadaf Ebrahimi v.push_back(UTF8(Runemax + j));
67*ccdc9c3eSSadaf Ebrahimi
68*ccdc9c3eSSadaf Ebrahimi return v;
69*ccdc9c3eSSadaf Ebrahimi }
70*ccdc9c3eSSadaf Ebrahimi
71*ccdc9c3eSSadaf Ebrahimi // Test interesting UTF-8 characters against character classes.
TEST(InterestingUTF8,SingleOps)72*ccdc9c3eSSadaf Ebrahimi TEST(InterestingUTF8, SingleOps) {
73*ccdc9c3eSSadaf Ebrahimi std::vector<string> atoms = Split(" ",
74*ccdc9c3eSSadaf Ebrahimi ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
75*ccdc9c3eSSadaf Ebrahimi "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
76*ccdc9c3eSSadaf Ebrahimi "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
77*ccdc9c3eSSadaf Ebrahimi "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
78*ccdc9c3eSSadaf Ebrahimi std::vector<string> ops; // no ops
79*ccdc9c3eSSadaf Ebrahimi ExhaustiveTest(1, 0, atoms, ops,
80*ccdc9c3eSSadaf Ebrahimi 1, InterestingUTF8(), "", "");
81*ccdc9c3eSSadaf Ebrahimi }
82*ccdc9c3eSSadaf Ebrahimi
83*ccdc9c3eSSadaf Ebrahimi // Test interesting UTF-8 characters against character classes,
84*ccdc9c3eSSadaf Ebrahimi // but wrap everything inside AB.
TEST(InterestingUTF8,AB)85*ccdc9c3eSSadaf Ebrahimi TEST(InterestingUTF8, AB) {
86*ccdc9c3eSSadaf Ebrahimi std::vector<string> atoms = Split(" ",
87*ccdc9c3eSSadaf Ebrahimi ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
88*ccdc9c3eSSadaf Ebrahimi "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
89*ccdc9c3eSSadaf Ebrahimi "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
90*ccdc9c3eSSadaf Ebrahimi "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
91*ccdc9c3eSSadaf Ebrahimi std::vector<string> ops; // no ops
92*ccdc9c3eSSadaf Ebrahimi std::vector<string> alpha = InterestingUTF8();
93*ccdc9c3eSSadaf Ebrahimi for (size_t i = 0; i < alpha.size(); i++)
94*ccdc9c3eSSadaf Ebrahimi alpha[i] = "a" + alpha[i] + "b";
95*ccdc9c3eSSadaf Ebrahimi ExhaustiveTest(1, 0, atoms, ops,
96*ccdc9c3eSSadaf Ebrahimi 1, alpha, "a%sb", "");
97*ccdc9c3eSSadaf Ebrahimi }
98*ccdc9c3eSSadaf Ebrahimi
99*ccdc9c3eSSadaf Ebrahimi } // namespace re2
100*ccdc9c3eSSadaf Ebrahimi
101