xref: /aosp_15_r20/external/regex-re2/re2/fuzzing/re2_fuzzer.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // Copyright 2016 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <stddef.h>
6 #include <stdint.h>
7 #include <map>
8 #include <memory>
9 #include <queue>
10 #include <string>
11 
12 #include "re2/prefilter.h"
13 #include "re2/re2.h"
14 
15 using re2::StringPiece;
16 using std::string;
17 
18 // NOT static, NOT signed.
19 uint8_t dummy = 0;
20 
Test(StringPiece pattern,const RE2::Options & options,StringPiece text)21 void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
22   RE2 re(pattern, options);
23   if (!re.ok())
24     return;
25 
26   // Don't waste time fuzzing high-size programs.
27   // They can cause bug reports due to fuzzer timeouts.
28   int size = re.ProgramSize();
29   if (size > 9999)
30     return;
31   int rsize = re.ReverseProgramSize();
32   if (rsize > 9999)
33     return;
34 
35   // Don't waste time fuzzing high-fanout programs.
36   // They can cause bug reports due to fuzzer timeouts.
37   std::map<int, int> histogram;
38   int fanout = re.ProgramFanout(&histogram);
39   if (fanout > 9)
40     return;
41   int rfanout = re.ReverseProgramFanout(&histogram);
42   if (rfanout > 9)
43     return;
44 
45   // Don't waste time fuzzing programs with large substrings.
46   // They can cause bug reports due to fuzzer timeouts when they
47   // are repetitions (e.g. hundreds of NUL bytes) and matching is
48   // unanchored. And they aren't interesting for fuzzing purposes.
49   std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
50   if (prefilter == nullptr)
51     return;
52   std::queue<re2::Prefilter*> nodes;
53   nodes.push(prefilter.get());
54   while (!nodes.empty()) {
55     re2::Prefilter* node = nodes.front();
56     nodes.pop();
57     if (node->op() == re2::Prefilter::ATOM) {
58       if (node->atom().size() > 9)
59         return;
60     } else if (node->op() == re2::Prefilter::AND ||
61                node->op() == re2::Prefilter::OR) {
62       for (re2::Prefilter* sub : *node->subs())
63         nodes.push(sub);
64     }
65   }
66 
67   if (re.NumberOfCapturingGroups() == 0) {
68     // Avoid early return due to too many arguments.
69     StringPiece sp = text;
70     RE2::FullMatch(sp, re);
71     RE2::PartialMatch(sp, re);
72     RE2::Consume(&sp, re);
73     sp = text;  // Reset.
74     RE2::FindAndConsume(&sp, re);
75   } else {
76     // Okay, we have at least one capturing group...
77     // Try conversion for variously typed arguments.
78     StringPiece sp = text;
79     short s;
80     RE2::FullMatch(sp, re, &s);
81     long l;
82     RE2::PartialMatch(sp, re, &l);
83     float f;
84     RE2::Consume(&sp, re, &f);
85     sp = text;  // Reset.
86     double d;
87     RE2::FindAndConsume(&sp, re, &d);
88   }
89 
90   string s = string(text);
91   RE2::Replace(&s, re, "");
92   s = string(text);  // Reset.
93   RE2::GlobalReplace(&s, re, "");
94 
95   string min, max;
96   re.PossibleMatchRange(&min, &max, /*maxlen=*/9);
97 
98   // Exercise some other API functionality.
99   dummy += re.NamedCapturingGroups().size();
100   dummy += re.CapturingGroupNames().size();
101   dummy += RE2::QuoteMeta(pattern).size();
102 }
103 
104 // Entry point for libFuzzer.
LLVMFuzzerTestOneInput(const uint8_t * data,size_t size)105 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
106   if (size == 0 || size > 999)
107     return 0;
108 
109   // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
110   // Otherwise, we will waste time on inputs that have long runs of various
111   // character classes. The fuzzer has shown itself to be easily capable of
112   // generating such patterns that fall within the other limits, but result
113   // in timeouts nonetheless. The marginal cost is high - even more so when
114   // counted repetition is involved - whereas the marginal benefit is zero.
115   // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
116   int cc = 0;
117   for (size_t i = 0; i < size; i++) {
118     if (data[i] == '.')
119       cc++;
120     if (data[i] != '\\')
121       continue;
122     i++;
123     if (i >= size)
124       break;
125     if (data[i] == 'p' || data[i] == 'P' ||
126         data[i] == 'd' || data[i] == 'D' ||
127         data[i] == 's' || data[i] == 'S' ||
128         data[i] == 'w' || data[i] == 'W')
129       cc++;
130   }
131   if (cc > 9)
132     return 0;
133 
134   // The one-at-a-time hash by Bob Jenkins.
135   uint32_t hash = 0;
136   for (size_t i = 0; i < size; i++) {
137     hash += data[i];
138     hash += (hash << 10);
139     hash ^= (hash >> 6);
140   }
141   hash += (hash << 3);
142   hash ^= (hash >> 11);
143   hash += (hash << 15);
144 
145   RE2::Options options;
146   options.set_log_errors(false);
147   options.set_max_mem(64 << 20);
148   options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1
149                                 : RE2::Options::EncodingUTF8);
150   options.set_posix_syntax(hash & 2);
151   options.set_longest_match(hash & 4);
152   options.set_literal(hash & 8);
153   options.set_never_nl(hash & 16);
154   options.set_dot_nl(hash & 32);
155   options.set_never_capture(hash & 64);
156   options.set_case_sensitive(hash & 128);
157   options.set_perl_classes(hash & 256);
158   options.set_word_boundary(hash & 512);
159   options.set_one_line(hash & 1024);
160 
161   const char* ptr = reinterpret_cast<const char*>(data);
162   int len = static_cast<int>(size);
163 
164   StringPiece pattern(ptr, len);
165   StringPiece text(ptr, len);
166   Test(pattern, options, text);
167 
168   return 0;
169 }
170