xref: /aosp_15_r20/external/regex-re2/re2/testing/exhaustive2_test.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi // Exhaustive testing of regular expression matching.
6*ccdc9c3eSSadaf Ebrahimi 
7*ccdc9c3eSSadaf Ebrahimi #include <stddef.h>
8*ccdc9c3eSSadaf Ebrahimi #include <memory>
9*ccdc9c3eSSadaf Ebrahimi #include <string>
10*ccdc9c3eSSadaf Ebrahimi #include <vector>
11*ccdc9c3eSSadaf Ebrahimi 
12*ccdc9c3eSSadaf Ebrahimi #include "util/test.h"
13*ccdc9c3eSSadaf Ebrahimi #include "re2/re2.h"
14*ccdc9c3eSSadaf Ebrahimi #include "re2/testing/exhaustive_tester.h"
15*ccdc9c3eSSadaf Ebrahimi 
16*ccdc9c3eSSadaf Ebrahimi namespace re2 {
17*ccdc9c3eSSadaf Ebrahimi 
// Test empty string matches (aka "(?:)")
TEST(EmptyString, Exhaustive) {
  // Atoms handed to the tester are the empty group "(?:)" and the literal
  // "a"; the second Split() call supplies "ab" for the test strings.
  // NOTE(review): the meaning of the numeric limits (2, 2, 5) and of the two
  // trailing "" arguments is defined by ExhaustiveTest in
  // re2/testing/exhaustive_tester.h, which is not visible here -- confirm
  // there before changing them.
  ExhaustiveTest(2, 2, Split(" ", "(?:) a"),
                 RegexpGenerator::EgrepOps(),
                 5, Split("", "ab"), "", "");
}
24*ccdc9c3eSSadaf Ebrahimi 
// Test escaped versions of regexp syntax.
TEST(Punctuation, Literals) {
  // The punctuation characters under test. Presumably Explode() yields one
  // single-character string per input character -- confirm against its
  // declaration.
  std::vector<string> alphabet = Explode("()*+?{}[]\\^$.");

  // Build the regexp atoms: every alphabet entry prefixed with a backslash,
  // so each one is the escaped form of its punctuation character.
  std::vector<string> escaped = alphabet;
  for (size_t i = 0; i < escaped.size(); i++)
    escaped[i] = "\\" + escaped[i];

  // The escaped forms are the regexp atoms; the unescaped alphabet is passed
  // as the second vector argument (used for the test strings).
  ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
                 2, alphabet, "", "");
}
34*ccdc9c3eSSadaf Ebrahimi 
// Test ^ $ . \A \z in presence of line endings.
// Have to wrap the empty-width ones in (?:) so that
// they can be repeated -- PCRE rejects ^* but allows (?:^)*
TEST(LineEnds, Exhaustive) {
  // Atoms: the zero-width assertions ^, $, \A and \z (each wrapped in a
  // non-capturing group so a repetition operator can be applied to them),
  // plus ".", the literal "a" and an escaped newline. The test-string
  // alphabet passed to Explode() includes a real newline character.
  ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"),
                 RegexpGenerator::EgrepOps(),
                 4, Explode("ab\n"), "", "");
}
43*ccdc9c3eSSadaf Ebrahimi 
// Test what does and does not match \n.
// This would be a good test, except that PCRE seems to have a bug:
// in single-byte character set mode (the default),
// [^a] matches \n, but in UTF-8 mode it does not.
// So when we run the test, the tester complains that
// we don't agree with PCRE, but it's PCRE that is at fault.
// For what it's worth, Perl gets this right (matches
// regardless of whether UTF-8 input is selected):
//
//     #!/usr/bin/perl
//     use POSIX qw(locale_h);
//     print "matches in latin1\n" if "\n" =~ /[^a]/;
//     setlocale("en_US.utf8");
//     print "matches in utf8\n" if "\n" =~ /[^a]/;
//
// The rule chosen for RE2 is that by default, like Perl,
// dot does not match \n but negated character classes [^a] do.
// (?s) will allow dot to match \n; there is no way in RE2
// to stop [^a] from matching \n, though the underlying library
// provides a mechanism, and RE2 could add new syntax if needed.
//
// TEST(Newlines, Exhaustive) {
//   std::vector<string> empty_vector;
//   ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
//                  RegexpGenerator::EgrepOps(),
//                  4, Explode("a\n"), "");
// }
71*ccdc9c3eSSadaf Ebrahimi 
72*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
73*ccdc9c3eSSadaf Ebrahimi 
74