1 // Copyright 2016 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include <fuzzer/FuzzedDataProvider.h>
6 #include <stddef.h>
7 #include <stdint.h>
8 #include <algorithm>
9 #include <string>
10 #include <vector>
11
12 #include "re2/re2.h"
13 #include "re2/regexp.h"
14 #include "re2/walker-inl.h"
15
16 using re2::StringPiece;
17
18 // NOT static, NOT signed.
19 uint8_t dummy = 0;
20
21 // Walks kRegexpConcat and kRegexpAlternate subexpressions
22 // to determine their maximum length.
23 class SubexpressionWalker : public re2::Regexp::Walker<int> {
24 public:
25 SubexpressionWalker() = default;
26 ~SubexpressionWalker() override = default;
27
PostVisit(re2::Regexp * re,int parent_arg,int pre_arg,int * child_args,int nchild_args)28 int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
29 int* child_args, int nchild_args) override {
30 switch (re->op()) {
31 case re2::kRegexpConcat:
32 case re2::kRegexpAlternate: {
33 int max = nchild_args;
34 for (int i = 0; i < nchild_args; i++)
35 max = std::max(max, child_args[i]);
36 return max;
37 }
38
39 default:
40 break;
41 }
42 return -1;
43 }
44
45 // Should never be called: we use Walk(), not WalkExponential().
ShortVisit(re2::Regexp * re,int parent_arg)46 int ShortVisit(re2::Regexp* re, int parent_arg) override {
47 return parent_arg;
48 }
49
50 private:
51 SubexpressionWalker(const SubexpressionWalker&) = delete;
52 SubexpressionWalker& operator=(const SubexpressionWalker&) = delete;
53 };
54
55 // Walks substrings (i.e. kRegexpLiteralString subexpressions)
56 // to determine their maximum length... in runes, but avoiding
57 // overheads due to UTF-8 encoding is worthwhile when fuzzing.
58 class SubstringWalker : public re2::Regexp::Walker<int> {
59 public:
60 SubstringWalker() = default;
61 ~SubstringWalker() override = default;
62
PostVisit(re2::Regexp * re,int parent_arg,int pre_arg,int * child_args,int nchild_args)63 int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
64 int* child_args, int nchild_args) override {
65 switch (re->op()) {
66 case re2::kRegexpConcat:
67 case re2::kRegexpAlternate:
68 case re2::kRegexpStar:
69 case re2::kRegexpPlus:
70 case re2::kRegexpQuest:
71 case re2::kRegexpRepeat:
72 case re2::kRegexpCapture: {
73 int max = -1;
74 for (int i = 0; i < nchild_args; i++)
75 max = std::max(max, child_args[i]);
76 return max;
77 }
78
79 case re2::kRegexpLiteralString:
80 return re->nrunes();
81
82 default:
83 break;
84 }
85 return -1;
86 }
87
88 // Should never be called: we use Walk(), not WalkExponential().
ShortVisit(re2::Regexp * re,int parent_arg)89 int ShortVisit(re2::Regexp* re, int parent_arg) override {
90 return parent_arg;
91 }
92
93 private:
94 SubstringWalker(const SubstringWalker&) = delete;
95 SubstringWalker& operator=(const SubstringWalker&) = delete;
96 };
97
TestOneInput(StringPiece pattern,const RE2::Options & options,StringPiece text)98 void TestOneInput(StringPiece pattern, const RE2::Options& options,
99 StringPiece text) {
100 // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
101 // Otherwise, we will waste time on inputs that have long runs of various
102 // character classes. The fuzzer has shown itself to be easily capable of
103 // generating such patterns that fall within the other limits, but result
104 // in timeouts nonetheless. The marginal cost is high - even more so when
105 // counted repetition is involved - whereas the marginal benefit is zero.
106 // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become
107 // three-element character classes when case-insensitive and using UTF-8.
108 // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
109 int char_class = 0;
110 int backslash_p = 0; // very expensive, so handle specially
111 for (size_t i = 0; i < pattern.size(); i++) {
112 if (pattern[i] == '.' ||
113 pattern[i] == 'k' || pattern[i] == 'K' ||
114 pattern[i] == 's' || pattern[i] == 'S')
115 char_class++;
116 if (pattern[i] != '\\')
117 continue;
118 i++;
119 if (i >= pattern.size())
120 break;
121 if (pattern[i] == 'p' || pattern[i] == 'P' ||
122 pattern[i] == 'd' || pattern[i] == 'D' ||
123 pattern[i] == 's' || pattern[i] == 'S' ||
124 pattern[i] == 'w' || pattern[i] == 'W')
125 char_class++;
126 if (pattern[i] == 'p' || pattern[i] == 'P')
127 backslash_p++;
128 }
129 if (char_class > 9)
130 return;
131 if (backslash_p > 1)
132 return;
133
134 // The default is 1000. Even 100 turned out to be too generous
135 // for fuzzing, empirically speaking, so let's try 10 instead.
136 re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);
137
138 RE2 re(pattern, options);
139 if (!re.ok())
140 return;
141
142 // Don't waste time fuzzing programs with large subexpressions.
143 // They can cause bug reports due to fuzzer timeouts. And they
144 // aren't interesting for fuzzing purposes.
145 if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9)
146 return;
147
148 // Don't waste time fuzzing programs with large substrings.
149 // They can cause bug reports due to fuzzer timeouts when they
150 // are repetitions (e.g. hundreds of NUL bytes) and matching is
151 // unanchored. And they aren't interesting for fuzzing purposes.
152 if (SubstringWalker().Walk(re.Regexp(), -1) > 9)
153 return;
154
155 // Don't waste time fuzzing high-size programs.
156 // They can cause bug reports due to fuzzer timeouts.
157 int size = re.ProgramSize();
158 if (size > 9999)
159 return;
160 int rsize = re.ReverseProgramSize();
161 if (rsize > 9999)
162 return;
163
164 // Don't waste time fuzzing high-fanout programs.
165 // They can cause bug reports due to fuzzer timeouts.
166 std::vector<int> histogram;
167 int fanout = re.ProgramFanout(&histogram);
168 if (fanout > 9)
169 return;
170 int rfanout = re.ReverseProgramFanout(&histogram);
171 if (rfanout > 9)
172 return;
173
174 if (re.NumberOfCapturingGroups() == 0) {
175 // Avoid early return due to too many arguments.
176 StringPiece sp = text;
177 RE2::FullMatch(sp, re);
178 RE2::PartialMatch(sp, re);
179 RE2::Consume(&sp, re);
180 sp = text; // Reset.
181 RE2::FindAndConsume(&sp, re);
182 } else {
183 // Okay, we have at least one capturing group...
184 // Try conversion for variously typed arguments.
185 StringPiece sp = text;
186 short s;
187 RE2::FullMatch(sp, re, &s);
188 long l;
189 RE2::PartialMatch(sp, re, &l);
190 float f;
191 RE2::Consume(&sp, re, &f);
192 sp = text; // Reset.
193 double d;
194 RE2::FindAndConsume(&sp, re, &d);
195 }
196
197 std::string s = std::string(text);
198 RE2::Replace(&s, re, "");
199 s = std::string(text); // Reset.
200 RE2::GlobalReplace(&s, re, "");
201
202 std::string min, max;
203 re.PossibleMatchRange(&min, &max, /*maxlen=*/9);
204
205 // Exercise some other API functionality.
206 dummy += re.NamedCapturingGroups().size();
207 dummy += re.CapturingGroupNames().size();
208 dummy += RE2::QuoteMeta(pattern).size();
209 }
210
211 // Entry point for libFuzzer.
LLVMFuzzerTestOneInput(const uint8_t * data,size_t size)212 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
213 // An input larger than 4 KiB probably isn't interesting. (This limit
214 // allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.)
215 if (size == 0 || size > 4096)
216 return 0;
217
218 FuzzedDataProvider fdp(data, size);
219
220 // The convention here is that fdp.ConsumeBool() returning false sets
221 // the default value whereas returning true sets the alternate value:
222 // most options default to false and so can be set directly; encoding
223 // defaults to UTF-8; case_sensitive defaults to true. We do NOT want
224 // to log errors. max_mem is 64 MiB because we can afford to use more
225 // RAM in exchange for (hopefully) faster fuzzing.
226 RE2::Options options;
227 options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1
228 : RE2::Options::EncodingUTF8);
229 options.set_posix_syntax(fdp.ConsumeBool());
230 options.set_longest_match(fdp.ConsumeBool());
231 options.set_log_errors(false);
232 options.set_max_mem(64 << 20);
233 options.set_literal(fdp.ConsumeBool());
234 options.set_never_nl(fdp.ConsumeBool());
235 options.set_dot_nl(fdp.ConsumeBool());
236 options.set_never_capture(fdp.ConsumeBool());
237 options.set_case_sensitive(!fdp.ConsumeBool());
238 options.set_perl_classes(fdp.ConsumeBool());
239 options.set_word_boundary(fdp.ConsumeBool());
240 options.set_one_line(fdp.ConsumeBool());
241
242 std::string pattern = fdp.ConsumeRandomLengthString(999);
243 std::string text = fdp.ConsumeRandomLengthString(999);
244
245 TestOneInput(pattern, options, text);
246 return 0;
247 }
248