1*ccdc9c3eSSadaf Ebrahimi // Copyright 2016 The RE2 Authors. All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi
5*ccdc9c3eSSadaf Ebrahimi #include <stddef.h>
6*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
7*ccdc9c3eSSadaf Ebrahimi #include <map>
8*ccdc9c3eSSadaf Ebrahimi #include <memory>
9*ccdc9c3eSSadaf Ebrahimi #include <queue>
10*ccdc9c3eSSadaf Ebrahimi #include <string>
11*ccdc9c3eSSadaf Ebrahimi
12*ccdc9c3eSSadaf Ebrahimi #include "re2/prefilter.h"
13*ccdc9c3eSSadaf Ebrahimi #include "re2/re2.h"
14*ccdc9c3eSSadaf Ebrahimi
15*ccdc9c3eSSadaf Ebrahimi using re2::StringPiece;
16*ccdc9c3eSSadaf Ebrahimi using std::string;
17*ccdc9c3eSSadaf Ebrahimi
18*ccdc9c3eSSadaf Ebrahimi // NOT static, NOT signed.
19*ccdc9c3eSSadaf Ebrahimi uint8_t dummy = 0;
20*ccdc9c3eSSadaf Ebrahimi
Test(StringPiece pattern,const RE2::Options & options,StringPiece text)21*ccdc9c3eSSadaf Ebrahimi void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
22*ccdc9c3eSSadaf Ebrahimi RE2 re(pattern, options);
23*ccdc9c3eSSadaf Ebrahimi if (!re.ok())
24*ccdc9c3eSSadaf Ebrahimi return;
25*ccdc9c3eSSadaf Ebrahimi
26*ccdc9c3eSSadaf Ebrahimi // Don't waste time fuzzing high-size programs.
27*ccdc9c3eSSadaf Ebrahimi // They can cause bug reports due to fuzzer timeouts.
28*ccdc9c3eSSadaf Ebrahimi int size = re.ProgramSize();
29*ccdc9c3eSSadaf Ebrahimi if (size > 9999)
30*ccdc9c3eSSadaf Ebrahimi return;
31*ccdc9c3eSSadaf Ebrahimi int rsize = re.ReverseProgramSize();
32*ccdc9c3eSSadaf Ebrahimi if (rsize > 9999)
33*ccdc9c3eSSadaf Ebrahimi return;
34*ccdc9c3eSSadaf Ebrahimi
35*ccdc9c3eSSadaf Ebrahimi // Don't waste time fuzzing high-fanout programs.
36*ccdc9c3eSSadaf Ebrahimi // They can cause bug reports due to fuzzer timeouts.
37*ccdc9c3eSSadaf Ebrahimi std::map<int, int> histogram;
38*ccdc9c3eSSadaf Ebrahimi int fanout = re.ProgramFanout(&histogram);
39*ccdc9c3eSSadaf Ebrahimi if (fanout > 9)
40*ccdc9c3eSSadaf Ebrahimi return;
41*ccdc9c3eSSadaf Ebrahimi int rfanout = re.ReverseProgramFanout(&histogram);
42*ccdc9c3eSSadaf Ebrahimi if (rfanout > 9)
43*ccdc9c3eSSadaf Ebrahimi return;
44*ccdc9c3eSSadaf Ebrahimi
45*ccdc9c3eSSadaf Ebrahimi // Don't waste time fuzzing programs with large substrings.
46*ccdc9c3eSSadaf Ebrahimi // They can cause bug reports due to fuzzer timeouts when they
47*ccdc9c3eSSadaf Ebrahimi // are repetitions (e.g. hundreds of NUL bytes) and matching is
48*ccdc9c3eSSadaf Ebrahimi // unanchored. And they aren't interesting for fuzzing purposes.
49*ccdc9c3eSSadaf Ebrahimi std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
50*ccdc9c3eSSadaf Ebrahimi if (prefilter == nullptr)
51*ccdc9c3eSSadaf Ebrahimi return;
52*ccdc9c3eSSadaf Ebrahimi std::queue<re2::Prefilter*> nodes;
53*ccdc9c3eSSadaf Ebrahimi nodes.push(prefilter.get());
54*ccdc9c3eSSadaf Ebrahimi while (!nodes.empty()) {
55*ccdc9c3eSSadaf Ebrahimi re2::Prefilter* node = nodes.front();
56*ccdc9c3eSSadaf Ebrahimi nodes.pop();
57*ccdc9c3eSSadaf Ebrahimi if (node->op() == re2::Prefilter::ATOM) {
58*ccdc9c3eSSadaf Ebrahimi if (node->atom().size() > 9)
59*ccdc9c3eSSadaf Ebrahimi return;
60*ccdc9c3eSSadaf Ebrahimi } else if (node->op() == re2::Prefilter::AND ||
61*ccdc9c3eSSadaf Ebrahimi node->op() == re2::Prefilter::OR) {
62*ccdc9c3eSSadaf Ebrahimi for (re2::Prefilter* sub : *node->subs())
63*ccdc9c3eSSadaf Ebrahimi nodes.push(sub);
64*ccdc9c3eSSadaf Ebrahimi }
65*ccdc9c3eSSadaf Ebrahimi }
66*ccdc9c3eSSadaf Ebrahimi
67*ccdc9c3eSSadaf Ebrahimi if (re.NumberOfCapturingGroups() == 0) {
68*ccdc9c3eSSadaf Ebrahimi // Avoid early return due to too many arguments.
69*ccdc9c3eSSadaf Ebrahimi StringPiece sp = text;
70*ccdc9c3eSSadaf Ebrahimi RE2::FullMatch(sp, re);
71*ccdc9c3eSSadaf Ebrahimi RE2::PartialMatch(sp, re);
72*ccdc9c3eSSadaf Ebrahimi RE2::Consume(&sp, re);
73*ccdc9c3eSSadaf Ebrahimi sp = text; // Reset.
74*ccdc9c3eSSadaf Ebrahimi RE2::FindAndConsume(&sp, re);
75*ccdc9c3eSSadaf Ebrahimi } else {
76*ccdc9c3eSSadaf Ebrahimi // Okay, we have at least one capturing group...
77*ccdc9c3eSSadaf Ebrahimi // Try conversion for variously typed arguments.
78*ccdc9c3eSSadaf Ebrahimi StringPiece sp = text;
79*ccdc9c3eSSadaf Ebrahimi short s;
80*ccdc9c3eSSadaf Ebrahimi RE2::FullMatch(sp, re, &s);
81*ccdc9c3eSSadaf Ebrahimi long l;
82*ccdc9c3eSSadaf Ebrahimi RE2::PartialMatch(sp, re, &l);
83*ccdc9c3eSSadaf Ebrahimi float f;
84*ccdc9c3eSSadaf Ebrahimi RE2::Consume(&sp, re, &f);
85*ccdc9c3eSSadaf Ebrahimi sp = text; // Reset.
86*ccdc9c3eSSadaf Ebrahimi double d;
87*ccdc9c3eSSadaf Ebrahimi RE2::FindAndConsume(&sp, re, &d);
88*ccdc9c3eSSadaf Ebrahimi }
89*ccdc9c3eSSadaf Ebrahimi
90*ccdc9c3eSSadaf Ebrahimi string s = string(text);
91*ccdc9c3eSSadaf Ebrahimi RE2::Replace(&s, re, "");
92*ccdc9c3eSSadaf Ebrahimi s = string(text); // Reset.
93*ccdc9c3eSSadaf Ebrahimi RE2::GlobalReplace(&s, re, "");
94*ccdc9c3eSSadaf Ebrahimi
95*ccdc9c3eSSadaf Ebrahimi string min, max;
96*ccdc9c3eSSadaf Ebrahimi re.PossibleMatchRange(&min, &max, /*maxlen=*/9);
97*ccdc9c3eSSadaf Ebrahimi
98*ccdc9c3eSSadaf Ebrahimi // Exercise some other API functionality.
99*ccdc9c3eSSadaf Ebrahimi dummy += re.NamedCapturingGroups().size();
100*ccdc9c3eSSadaf Ebrahimi dummy += re.CapturingGroupNames().size();
101*ccdc9c3eSSadaf Ebrahimi dummy += RE2::QuoteMeta(pattern).size();
102*ccdc9c3eSSadaf Ebrahimi }
103*ccdc9c3eSSadaf Ebrahimi
104*ccdc9c3eSSadaf Ebrahimi // Entry point for libFuzzer.
LLVMFuzzerTestOneInput(const uint8_t * data,size_t size)105*ccdc9c3eSSadaf Ebrahimi extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
106*ccdc9c3eSSadaf Ebrahimi if (size == 0 || size > 999)
107*ccdc9c3eSSadaf Ebrahimi return 0;
108*ccdc9c3eSSadaf Ebrahimi
109*ccdc9c3eSSadaf Ebrahimi // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
110*ccdc9c3eSSadaf Ebrahimi // Otherwise, we will waste time on inputs that have long runs of various
111*ccdc9c3eSSadaf Ebrahimi // character classes. The fuzzer has shown itself to be easily capable of
112*ccdc9c3eSSadaf Ebrahimi // generating such patterns that fall within the other limits, but result
113*ccdc9c3eSSadaf Ebrahimi // in timeouts nonetheless. The marginal cost is high - even more so when
114*ccdc9c3eSSadaf Ebrahimi // counted repetition is involved - whereas the marginal benefit is zero.
115*ccdc9c3eSSadaf Ebrahimi // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
116*ccdc9c3eSSadaf Ebrahimi int cc = 0;
117*ccdc9c3eSSadaf Ebrahimi for (size_t i = 0; i < size; i++) {
118*ccdc9c3eSSadaf Ebrahimi if (data[i] == '.')
119*ccdc9c3eSSadaf Ebrahimi cc++;
120*ccdc9c3eSSadaf Ebrahimi if (data[i] != '\\')
121*ccdc9c3eSSadaf Ebrahimi continue;
122*ccdc9c3eSSadaf Ebrahimi i++;
123*ccdc9c3eSSadaf Ebrahimi if (i >= size)
124*ccdc9c3eSSadaf Ebrahimi break;
125*ccdc9c3eSSadaf Ebrahimi if (data[i] == 'p' || data[i] == 'P' ||
126*ccdc9c3eSSadaf Ebrahimi data[i] == 'd' || data[i] == 'D' ||
127*ccdc9c3eSSadaf Ebrahimi data[i] == 's' || data[i] == 'S' ||
128*ccdc9c3eSSadaf Ebrahimi data[i] == 'w' || data[i] == 'W')
129*ccdc9c3eSSadaf Ebrahimi cc++;
130*ccdc9c3eSSadaf Ebrahimi }
131*ccdc9c3eSSadaf Ebrahimi if (cc > 9)
132*ccdc9c3eSSadaf Ebrahimi return 0;
133*ccdc9c3eSSadaf Ebrahimi
134*ccdc9c3eSSadaf Ebrahimi // The one-at-a-time hash by Bob Jenkins.
135*ccdc9c3eSSadaf Ebrahimi uint32_t hash = 0;
136*ccdc9c3eSSadaf Ebrahimi for (size_t i = 0; i < size; i++) {
137*ccdc9c3eSSadaf Ebrahimi hash += data[i];
138*ccdc9c3eSSadaf Ebrahimi hash += (hash << 10);
139*ccdc9c3eSSadaf Ebrahimi hash ^= (hash >> 6);
140*ccdc9c3eSSadaf Ebrahimi }
141*ccdc9c3eSSadaf Ebrahimi hash += (hash << 3);
142*ccdc9c3eSSadaf Ebrahimi hash ^= (hash >> 11);
143*ccdc9c3eSSadaf Ebrahimi hash += (hash << 15);
144*ccdc9c3eSSadaf Ebrahimi
145*ccdc9c3eSSadaf Ebrahimi RE2::Options options;
146*ccdc9c3eSSadaf Ebrahimi options.set_log_errors(false);
147*ccdc9c3eSSadaf Ebrahimi options.set_max_mem(64 << 20);
148*ccdc9c3eSSadaf Ebrahimi options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1
149*ccdc9c3eSSadaf Ebrahimi : RE2::Options::EncodingUTF8);
150*ccdc9c3eSSadaf Ebrahimi options.set_posix_syntax(hash & 2);
151*ccdc9c3eSSadaf Ebrahimi options.set_longest_match(hash & 4);
152*ccdc9c3eSSadaf Ebrahimi options.set_literal(hash & 8);
153*ccdc9c3eSSadaf Ebrahimi options.set_never_nl(hash & 16);
154*ccdc9c3eSSadaf Ebrahimi options.set_dot_nl(hash & 32);
155*ccdc9c3eSSadaf Ebrahimi options.set_never_capture(hash & 64);
156*ccdc9c3eSSadaf Ebrahimi options.set_case_sensitive(hash & 128);
157*ccdc9c3eSSadaf Ebrahimi options.set_perl_classes(hash & 256);
158*ccdc9c3eSSadaf Ebrahimi options.set_word_boundary(hash & 512);
159*ccdc9c3eSSadaf Ebrahimi options.set_one_line(hash & 1024);
160*ccdc9c3eSSadaf Ebrahimi
161*ccdc9c3eSSadaf Ebrahimi const char* ptr = reinterpret_cast<const char*>(data);
162*ccdc9c3eSSadaf Ebrahimi int len = static_cast<int>(size);
163*ccdc9c3eSSadaf Ebrahimi
164*ccdc9c3eSSadaf Ebrahimi StringPiece pattern(ptr, len);
165*ccdc9c3eSSadaf Ebrahimi StringPiece text(ptr, len);
166*ccdc9c3eSSadaf Ebrahimi Test(pattern, options, text);
167*ccdc9c3eSSadaf Ebrahimi
168*ccdc9c3eSSadaf Ebrahimi return 0;
169*ccdc9c3eSSadaf Ebrahimi }
170