1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9
10 #include "re2/re2.h"
11
12 #include <assert.h>
13 #include <ctype.h>
14 #include <errno.h>
15 #include <stdint.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <algorithm>
19 #include <iterator>
20 #include <mutex>
21 #include <string>
22 #include <utility>
23 #include <vector>
24
25 #include "util/util.h"
26 #include "util/logging.h"
27 #include "util/sparse_array.h"
28 #include "util/strutil.h"
29 #include "util/utf.h"
30 #include "re2/prog.h"
31 #include "re2/regexp.h"
32
33 namespace re2 {
34
35 // Maximum number of args we can set
36 static const int kMaxArgs = 16;
37 static const int kVecSize = 1+kMaxArgs;
38
39 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
40
Options(RE2::CannedOptions opt)41 RE2::Options::Options(RE2::CannedOptions opt)
42 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
43 posix_syntax_(opt == RE2::POSIX),
44 longest_match_(opt == RE2::POSIX),
45 log_errors_(opt != RE2::Quiet),
46 max_mem_(kDefaultMaxMem),
47 literal_(false),
48 never_nl_(false),
49 dot_nl_(false),
50 never_capture_(false),
51 case_sensitive_(true),
52 perl_classes_(false),
53 word_boundary_(false),
54 one_line_(false) {
55 }
56
57 // static empty objects for use as const references.
58 // To avoid global constructors, allocated in RE2::Init().
59 static const string* empty_string;
60 static const std::map<string, int>* empty_named_groups;
61 static const std::map<int, string>* empty_group_names;
62
63 // Converts from Regexp error code to RE2 error code.
64 // Maybe some day they will diverge. In any event, this
65 // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)66 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
67 switch (code) {
68 case re2::kRegexpSuccess:
69 return RE2::NoError;
70 case re2::kRegexpInternalError:
71 return RE2::ErrorInternal;
72 case re2::kRegexpBadEscape:
73 return RE2::ErrorBadEscape;
74 case re2::kRegexpBadCharClass:
75 return RE2::ErrorBadCharClass;
76 case re2::kRegexpBadCharRange:
77 return RE2::ErrorBadCharRange;
78 case re2::kRegexpMissingBracket:
79 return RE2::ErrorMissingBracket;
80 case re2::kRegexpMissingParen:
81 return RE2::ErrorMissingParen;
82 case re2::kRegexpTrailingBackslash:
83 return RE2::ErrorTrailingBackslash;
84 case re2::kRegexpRepeatArgument:
85 return RE2::ErrorRepeatArgument;
86 case re2::kRegexpRepeatSize:
87 return RE2::ErrorRepeatSize;
88 case re2::kRegexpRepeatOp:
89 return RE2::ErrorRepeatOp;
90 case re2::kRegexpBadPerlOp:
91 return RE2::ErrorBadPerlOp;
92 case re2::kRegexpBadUTF8:
93 return RE2::ErrorBadUTF8;
94 case re2::kRegexpBadNamedCapture:
95 return RE2::ErrorBadNamedCapture;
96 }
97 return RE2::ErrorInternal;
98 }
99
trunc(const StringPiece & pattern)100 static string trunc(const StringPiece& pattern) {
101 if (pattern.size() < 100)
102 return string(pattern);
103 return string(pattern.substr(0, 100)) + "...";
104 }
105
106
RE2(const char * pattern)107 RE2::RE2(const char* pattern) {
108 Init(pattern, DefaultOptions);
109 }
110
RE2(const string & pattern)111 RE2::RE2(const string& pattern) {
112 Init(pattern, DefaultOptions);
113 }
114
RE2(const StringPiece & pattern)115 RE2::RE2(const StringPiece& pattern) {
116 Init(pattern, DefaultOptions);
117 }
118
RE2(const StringPiece & pattern,const Options & options)119 RE2::RE2(const StringPiece& pattern, const Options& options) {
120 Init(pattern, options);
121 }
122
ParseFlags() const123 int RE2::Options::ParseFlags() const {
124 int flags = Regexp::ClassNL;
125 switch (encoding()) {
126 default:
127 if (log_errors())
128 LOG(ERROR) << "Unknown encoding " << encoding();
129 break;
130 case RE2::Options::EncodingUTF8:
131 break;
132 case RE2::Options::EncodingLatin1:
133 flags |= Regexp::Latin1;
134 break;
135 }
136
137 if (!posix_syntax())
138 flags |= Regexp::LikePerl;
139
140 if (literal())
141 flags |= Regexp::Literal;
142
143 if (never_nl())
144 flags |= Regexp::NeverNL;
145
146 if (dot_nl())
147 flags |= Regexp::DotNL;
148
149 if (never_capture())
150 flags |= Regexp::NeverCapture;
151
152 if (!case_sensitive())
153 flags |= Regexp::FoldCase;
154
155 if (perl_classes())
156 flags |= Regexp::PerlClasses;
157
158 if (word_boundary())
159 flags |= Regexp::PerlB;
160
161 if (one_line())
162 flags |= Regexp::OneLine;
163
164 return flags;
165 }
166
Init(const StringPiece & pattern,const Options & options)167 void RE2::Init(const StringPiece& pattern, const Options& options) {
168 static std::once_flag empty_once;
169 std::call_once(empty_once, []() {
170 empty_string = new string;
171 empty_named_groups = new std::map<string, int>;
172 empty_group_names = new std::map<int, string>;
173 });
174
175 pattern_ = string(pattern);
176 options_.Copy(options);
177 entire_regexp_ = NULL;
178 suffix_regexp_ = NULL;
179 prog_ = NULL;
180 num_captures_ = -1;
181 rprog_ = NULL;
182 error_ = empty_string;
183 error_code_ = NoError;
184 named_groups_ = NULL;
185 group_names_ = NULL;
186
187 RegexpStatus status;
188 entire_regexp_ = Regexp::Parse(
189 pattern_,
190 static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
191 &status);
192 if (entire_regexp_ == NULL) {
193 if (options_.log_errors()) {
194 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
195 << status.Text();
196 }
197 error_ = new string(status.Text());
198 error_code_ = RegexpErrorToRE2(status.code());
199 error_arg_ = string(status.error_arg());
200 return;
201 }
202
203 re2::Regexp* suffix;
204 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
205 suffix_regexp_ = suffix;
206 else
207 suffix_regexp_ = entire_regexp_->Incref();
208
209 // Two thirds of the memory goes to the forward Prog,
210 // one third to the reverse prog, because the forward
211 // Prog has two DFAs but the reverse prog has one.
212 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
213 if (prog_ == NULL) {
214 if (options_.log_errors())
215 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
216 error_ = new string("pattern too large - compile failed");
217 error_code_ = RE2::ErrorPatternTooLarge;
218 return;
219 }
220
221 // We used to compute this lazily, but it's used during the
222 // typical control flow for a match call, so we now compute
223 // it eagerly, which avoids the overhead of std::once_flag.
224 num_captures_ = suffix_regexp_->NumCaptures();
225
226 // Could delay this until the first match call that
227 // cares about submatch information, but the one-pass
228 // machine's memory gets cut from the DFA memory budget,
229 // and that is harder to do if the DFA has already
230 // been built.
231 is_one_pass_ = prog_->IsOnePass();
232 }
233
234 // Returns rprog_, computing it if needed.
ReverseProg() const235 re2::Prog* RE2::ReverseProg() const {
236 std::call_once(rprog_once_, [](const RE2* re) {
237 re->rprog_ =
238 re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
239 if (re->rprog_ == NULL) {
240 if (re->options_.log_errors())
241 LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
242 re->error_ = new string("pattern too large - reverse compile failed");
243 re->error_code_ = RE2::ErrorPatternTooLarge;
244 }
245 }, this);
246 return rprog_;
247 }
248
~RE2()249 RE2::~RE2() {
250 if (suffix_regexp_)
251 suffix_regexp_->Decref();
252 if (entire_regexp_)
253 entire_regexp_->Decref();
254 delete prog_;
255 delete rprog_;
256 if (error_ != empty_string)
257 delete error_;
258 if (named_groups_ != NULL && named_groups_ != empty_named_groups)
259 delete named_groups_;
260 if (group_names_ != NULL && group_names_ != empty_group_names)
261 delete group_names_;
262 }
263
ProgramSize() const264 int RE2::ProgramSize() const {
265 if (prog_ == NULL)
266 return -1;
267 return prog_->size();
268 }
269
ReverseProgramSize() const270 int RE2::ReverseProgramSize() const {
271 if (prog_ == NULL)
272 return -1;
273 Prog* prog = ReverseProg();
274 if (prog == NULL)
275 return -1;
276 return prog->size();
277 }
278
Fanout(Prog * prog,std::map<int,int> * histogram)279 static int Fanout(Prog* prog, std::map<int, int>* histogram) {
280 SparseArray<int> fanout(prog->size());
281 prog->Fanout(&fanout);
282 histogram->clear();
283 for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
284 // TODO(junyer): Optimise this?
285 int bucket = 0;
286 while (1 << bucket < i->value()) {
287 bucket++;
288 }
289 (*histogram)[bucket]++;
290 }
291 return histogram->rbegin()->first;
292 }
293
ProgramFanout(std::map<int,int> * histogram) const294 int RE2::ProgramFanout(std::map<int, int>* histogram) const {
295 if (prog_ == NULL)
296 return -1;
297 return Fanout(prog_, histogram);
298 }
299
ReverseProgramFanout(std::map<int,int> * histogram) const300 int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
301 if (prog_ == NULL)
302 return -1;
303 Prog* prog = ReverseProg();
304 if (prog == NULL)
305 return -1;
306 return Fanout(prog, histogram);
307 }
308
309 // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const310 const std::map<string, int>& RE2::NamedCapturingGroups() const {
311 std::call_once(named_groups_once_, [](const RE2* re) {
312 if (re->suffix_regexp_ != NULL)
313 re->named_groups_ = re->suffix_regexp_->NamedCaptures();
314 if (re->named_groups_ == NULL)
315 re->named_groups_ = empty_named_groups;
316 }, this);
317 return *named_groups_;
318 }
319
320 // Returns group_names_, computing it if needed.
CapturingGroupNames() const321 const std::map<int, string>& RE2::CapturingGroupNames() const {
322 std::call_once(group_names_once_, [](const RE2* re) {
323 if (re->suffix_regexp_ != NULL)
324 re->group_names_ = re->suffix_regexp_->CaptureNames();
325 if (re->group_names_ == NULL)
326 re->group_names_ = empty_group_names;
327 }, this);
328 return *group_names_;
329 }
330
331 /***** Convenience interfaces *****/
332
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)333 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
334 const Arg* const args[], int n) {
335 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
336 }
337
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)338 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
339 const Arg* const args[], int n) {
340 return re.DoMatch(text, UNANCHORED, NULL, args, n);
341 }
342
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)343 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
344 const Arg* const args[], int n) {
345 size_t consumed;
346 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
347 input->remove_prefix(consumed);
348 return true;
349 } else {
350 return false;
351 }
352 }
353
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)354 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
355 const Arg* const args[], int n) {
356 size_t consumed;
357 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
358 input->remove_prefix(consumed);
359 return true;
360 } else {
361 return false;
362 }
363 }
364
Replace(string * str,const RE2 & re,const StringPiece & rewrite)365 bool RE2::Replace(string* str,
366 const RE2& re,
367 const StringPiece& rewrite) {
368 StringPiece vec[kVecSize];
369 int nvec = 1 + MaxSubmatch(rewrite);
370 if (nvec > arraysize(vec))
371 return false;
372 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
373 return false;
374
375 string s;
376 if (!re.Rewrite(&s, rewrite, vec, nvec))
377 return false;
378
379 assert(vec[0].begin() >= str->data());
380 assert(vec[0].end() <= str->data()+str->size());
381 str->replace(vec[0].data() - str->data(), vec[0].size(), s);
382 return true;
383 }
384
GlobalReplace(string * str,const RE2 & re,const StringPiece & rewrite)385 int RE2::GlobalReplace(string* str,
386 const RE2& re,
387 const StringPiece& rewrite) {
388 StringPiece vec[kVecSize];
389 int nvec = 1 + MaxSubmatch(rewrite);
390 if (nvec > arraysize(vec))
391 return false;
392
393 const char* p = str->data();
394 const char* ep = p + str->size();
395 const char* lastend = NULL;
396 string out;
397 int count = 0;
398 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
399 // Iterate just once when fuzzing. Otherwise, we easily get bogged down
400 // and coverage is unlikely to improve despite significant expense.
401 while (p == str->data()) {
402 #else
403 while (p <= ep) {
404 #endif
405 if (!re.Match(*str, static_cast<size_t>(p - str->data()),
406 str->size(), UNANCHORED, vec, nvec))
407 break;
408 if (p < vec[0].begin())
409 out.append(p, vec[0].begin() - p);
410 if (vec[0].begin() == lastend && vec[0].size() == 0) {
411 // Disallow empty match at end of last match: skip ahead.
412 //
413 // fullrune() takes int, not size_t. However, it just looks
414 // at the leading byte and treats any length >= 4 the same.
415 if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
416 fullrune(p, static_cast<int>(std::min(static_cast<ptrdiff_t>(4),
417 ep - p)))) {
418 // re is in UTF-8 mode and there is enough left of str
419 // to allow us to advance by up to UTFmax bytes.
420 Rune r;
421 int n = chartorune(&r, p);
422 // Some copies of chartorune have a bug that accepts
423 // encodings of values in (10FFFF, 1FFFFF] as valid.
424 if (r > Runemax) {
425 n = 1;
426 r = Runeerror;
427 }
428 if (!(n == 1 && r == Runeerror)) { // no decoding error
429 out.append(p, n);
430 p += n;
431 continue;
432 }
433 }
434 // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
435 // we fell through from above and the GIGO principle applies.
436 if (p < ep)
437 out.append(p, 1);
438 p++;
439 continue;
440 }
441 re.Rewrite(&out, rewrite, vec, nvec);
442 p = vec[0].end();
443 lastend = p;
444 count++;
445 }
446
447 if (count == 0)
448 return 0;
449
450 if (p < ep)
451 out.append(p, ep - p);
452 using std::swap;
453 swap(out, *str);
454 return count;
455 }
456
457 bool RE2::Extract(const StringPiece& text,
458 const RE2& re,
459 const StringPiece& rewrite,
460 string* out) {
461 StringPiece vec[kVecSize];
462 int nvec = 1 + MaxSubmatch(rewrite);
463 if (nvec > arraysize(vec))
464 return false;
465
466 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
467 return false;
468
469 out->clear();
470 return re.Rewrite(out, rewrite, vec, nvec);
471 }
472
473 string RE2::QuoteMeta(const StringPiece& unquoted) {
474 string result;
475 result.reserve(unquoted.size() << 1);
476
477 // Escape any ascii character not in [A-Za-z_0-9].
478 //
479 // Note that it's legal to escape a character even if it has no
480 // special meaning in a regular expression -- so this function does
481 // that. (This also makes it identical to the perl function of the
482 // same name except for the null-character special case;
483 // see `perldoc -f quotemeta`.)
484 for (size_t ii = 0; ii < unquoted.size(); ++ii) {
485 // Note that using 'isalnum' here raises the benchmark time from
486 // 32ns to 58ns:
487 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
488 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
489 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
490 unquoted[ii] != '_' &&
491 // If this is the part of a UTF8 or Latin1 character, we need
492 // to copy this byte without escaping. Experimentally this is
493 // what works correctly with the regexp library.
494 !(unquoted[ii] & 128)) {
495 if (unquoted[ii] == '\0') { // Special handling for null chars.
496 // Note that this special handling is not strictly required for RE2,
497 // but this quoting is required for other regexp libraries such as
498 // PCRE.
499 // Can't use "\\0" since the next character might be a digit.
500 result += "\\x00";
501 continue;
502 }
503 result += '\\';
504 }
505 result += unquoted[ii];
506 }
507
508 return result;
509 }
510
511 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
512 if (prog_ == NULL)
513 return false;
514
515 int n = static_cast<int>(prefix_.size());
516 if (n > maxlen)
517 n = maxlen;
518
519 // Determine initial min max from prefix_ literal.
520 *min = prefix_.substr(0, n);
521 *max = prefix_.substr(0, n);
522 if (prefix_foldcase_) {
523 // prefix is ASCII lowercase; change *min to uppercase.
524 for (int i = 0; i < n; i++) {
525 char& c = (*min)[i];
526 if ('a' <= c && c <= 'z')
527 c += 'A' - 'a';
528 }
529 }
530
531 // Add to prefix min max using PossibleMatchRange on regexp.
532 string dmin, dmax;
533 maxlen -= n;
534 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
535 min->append(dmin);
536 max->append(dmax);
537 } else if (!max->empty()) {
538 // prog_->PossibleMatchRange has failed us,
539 // but we still have useful information from prefix_.
540 // Round up *max to allow any possible suffix.
541 PrefixSuccessor(max);
542 } else {
543 // Nothing useful.
544 *min = "";
545 *max = "";
546 return false;
547 }
548
549 return true;
550 }
551
552 // Avoid possible locale nonsense in standard strcasecmp.
553 // The string a is known to be all lowercase.
554 static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
555 const char* ae = a + len;
556
557 for (; a < ae; a++, b++) {
558 uint8_t x = *a;
559 uint8_t y = *b;
560 if ('A' <= y && y <= 'Z')
561 y += 'a' - 'A';
562 if (x != y)
563 return x - y;
564 }
565 return 0;
566 }
567
568
569 /***** Actual matching and rewriting code *****/
570
571 bool RE2::Match(const StringPiece& text,
572 size_t startpos,
573 size_t endpos,
574 Anchor re_anchor,
575 StringPiece* submatch,
576 int nsubmatch) const {
577 if (!ok()) {
578 if (options_.log_errors())
579 LOG(ERROR) << "Invalid RE2: " << *error_;
580 return false;
581 }
582
583 if (startpos > endpos || endpos > text.size()) {
584 if (options_.log_errors())
585 LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
586 << "startpos: " << startpos << ", "
587 << "endpos: " << endpos << ", "
588 << "text size: " << text.size() << "]";
589 return false;
590 }
591
592 StringPiece subtext = text;
593 subtext.remove_prefix(startpos);
594 subtext.remove_suffix(text.size() - endpos);
595
596 // Use DFAs to find exact location of match, filter out non-matches.
597
598 // Don't ask for the location if we won't use it.
599 // SearchDFA can do extra optimizations in that case.
600 StringPiece match;
601 StringPiece* matchp = &match;
602 if (nsubmatch == 0)
603 matchp = NULL;
604
605 int ncap = 1 + NumberOfCapturingGroups();
606 if (ncap > nsubmatch)
607 ncap = nsubmatch;
608
609 // If the regexp is anchored explicitly, must not be in middle of text.
610 if (prog_->anchor_start() && startpos != 0)
611 return false;
612
613 // If the regexp is anchored explicitly, update re_anchor
614 // so that we can potentially fall into a faster case below.
615 if (prog_->anchor_start() && prog_->anchor_end())
616 re_anchor = ANCHOR_BOTH;
617 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
618 re_anchor = ANCHOR_START;
619
620 // Check for the required prefix, if any.
621 size_t prefixlen = 0;
622 if (!prefix_.empty()) {
623 if (startpos != 0)
624 return false;
625 prefixlen = prefix_.size();
626 if (prefixlen > subtext.size())
627 return false;
628 if (prefix_foldcase_) {
629 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
630 return false;
631 } else {
632 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
633 return false;
634 }
635 subtext.remove_prefix(prefixlen);
636 // If there is a required prefix, the anchor must be at least ANCHOR_START.
637 if (re_anchor != ANCHOR_BOTH)
638 re_anchor = ANCHOR_START;
639 }
640
641 Prog::Anchor anchor = Prog::kUnanchored;
642 Prog::MatchKind kind = Prog::kFirstMatch;
643 if (options_.longest_match())
644 kind = Prog::kLongestMatch;
645 bool skipped_test = false;
646
647 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
648
649 // SearchBitState allocates a bit vector of size prog_->size() * text.size().
650 // It also allocates a stack of 3-word structures which could potentially
651 // grow as large as prog_->size() * text.size() but in practice is much
652 // smaller.
653 // Conditions for using SearchBitState:
654 const int MaxBitStateProg = 500; // prog_->size() <= Max.
655 const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)
656 bool can_bit_state = prog_->size() <= MaxBitStateProg;
657 size_t bit_state_text_max = MaxBitStateVector / prog_->size();
658
659 bool dfa_failed = false;
660 switch (re_anchor) {
661 default:
662 case UNANCHORED: {
663 if (!prog_->SearchDFA(subtext, text, anchor, kind,
664 matchp, &dfa_failed, NULL)) {
665 if (dfa_failed) {
666 if (options_.log_errors())
667 LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
668 << "bytemap range " << prog_->bytemap_range() << ", "
669 << "list count " << prog_->list_count();
670 // Fall back to NFA below.
671 skipped_test = true;
672 break;
673 }
674 return false;
675 }
676 if (matchp == NULL) // Matched. Don't care where
677 return true;
678 // SearchDFA set match[0].end() but didn't know where the
679 // match started. Run the regexp backward from match[0].end()
680 // to find the longest possible match -- that's where it started.
681 Prog* prog = ReverseProg();
682 if (prog == NULL)
683 return false;
684 if (!prog->SearchDFA(match, text, Prog::kAnchored,
685 Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
686 if (dfa_failed) {
687 if (options_.log_errors())
688 LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", "
689 << "bytemap range " << prog->bytemap_range() << ", "
690 << "list count " << prog->list_count();
691 // Fall back to NFA below.
692 skipped_test = true;
693 break;
694 }
695 if (options_.log_errors())
696 LOG(ERROR) << "SearchDFA inconsistency";
697 return false;
698 }
699 break;
700 }
701
702 case ANCHOR_BOTH:
703 case ANCHOR_START:
704 if (re_anchor == ANCHOR_BOTH)
705 kind = Prog::kFullMatch;
706 anchor = Prog::kAnchored;
707
708 // If only a small amount of text and need submatch
709 // information anyway and we're going to use OnePass or BitState
710 // to get it, we might as well not even bother with the DFA:
711 // OnePass or BitState will be fast enough.
712 // On tiny texts, OnePass outruns even the DFA, and
713 // it doesn't have the shared state and occasional mutex that
714 // the DFA does.
715 if (can_one_pass && text.size() <= 4096 &&
716 (ncap > 1 || text.size() <= 8)) {
717 skipped_test = true;
718 break;
719 }
720 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
721 skipped_test = true;
722 break;
723 }
724 if (!prog_->SearchDFA(subtext, text, anchor, kind,
725 &match, &dfa_failed, NULL)) {
726 if (dfa_failed) {
727 if (options_.log_errors())
728 LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
729 << "bytemap range " << prog_->bytemap_range() << ", "
730 << "list count " << prog_->list_count();
731 // Fall back to NFA below.
732 skipped_test = true;
733 break;
734 }
735 return false;
736 }
737 break;
738 }
739
740 if (!skipped_test && ncap <= 1) {
741 // We know exactly where it matches. That's enough.
742 if (ncap == 1)
743 submatch[0] = match;
744 } else {
745 StringPiece subtext1;
746 if (skipped_test) {
747 // DFA ran out of memory or was skipped:
748 // need to search in entire original text.
749 subtext1 = subtext;
750 } else {
751 // DFA found the exact match location:
752 // let NFA run an anchored, full match search
753 // to find submatch locations.
754 subtext1 = match;
755 anchor = Prog::kAnchored;
756 kind = Prog::kFullMatch;
757 }
758
759 if (can_one_pass && anchor != Prog::kUnanchored) {
760 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
761 if (!skipped_test && options_.log_errors())
762 LOG(ERROR) << "SearchOnePass inconsistency";
763 return false;
764 }
765 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
766 if (!prog_->SearchBitState(subtext1, text, anchor,
767 kind, submatch, ncap)) {
768 if (!skipped_test && options_.log_errors())
769 LOG(ERROR) << "SearchBitState inconsistency";
770 return false;
771 }
772 } else {
773 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
774 if (!skipped_test && options_.log_errors())
775 LOG(ERROR) << "SearchNFA inconsistency";
776 return false;
777 }
778 }
779 }
780
781 // Adjust overall match for required prefix that we stripped off.
782 if (prefixlen > 0 && nsubmatch > 0)
783 submatch[0] = StringPiece(submatch[0].data() - prefixlen,
784 submatch[0].size() + prefixlen);
785
786 // Zero submatches that don't exist in the regexp.
787 for (int i = ncap; i < nsubmatch; i++)
788 submatch[i] = StringPiece();
789 return true;
790 }
791
792 // Internal matcher - like Match() but takes Args not StringPieces.
793 bool RE2::DoMatch(const StringPiece& text,
794 Anchor re_anchor,
795 size_t* consumed,
796 const Arg* const* args,
797 int n) const {
798 if (!ok()) {
799 if (options_.log_errors())
800 LOG(ERROR) << "Invalid RE2: " << *error_;
801 return false;
802 }
803
804 if (NumberOfCapturingGroups() < n) {
805 // RE has fewer capturing groups than number of Arg pointers passed in.
806 return false;
807 }
808
809 // Count number of capture groups needed.
810 int nvec;
811 if (n == 0 && consumed == NULL)
812 nvec = 0;
813 else
814 nvec = n+1;
815
816 StringPiece* vec;
817 StringPiece stkvec[kVecSize];
818 StringPiece* heapvec = NULL;
819
820 if (nvec <= arraysize(stkvec)) {
821 vec = stkvec;
822 } else {
823 vec = new StringPiece[nvec];
824 heapvec = vec;
825 }
826
827 if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
828 delete[] heapvec;
829 return false;
830 }
831
832 if (consumed != NULL)
833 *consumed = static_cast<size_t>(vec[0].end() - text.begin());
834
835 if (n == 0 || args == NULL) {
836 // We are not interested in results
837 delete[] heapvec;
838 return true;
839 }
840
841 // If we got here, we must have matched the whole pattern.
842 for (int i = 0; i < n; i++) {
843 const StringPiece& s = vec[i+1];
844 if (!args[i]->Parse(s.data(), s.size())) {
845 // TODO: Should we indicate what the error was?
846 delete[] heapvec;
847 return false;
848 }
849 }
850
851 delete[] heapvec;
852 return true;
853 }
854
855 // Checks that the rewrite string is well-formed with respect to this
856 // regular expression.
857 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
858 int max_token = -1;
859 for (const char *s = rewrite.data(), *end = s + rewrite.size();
860 s < end; s++) {
861 int c = *s;
862 if (c != '\\') {
863 continue;
864 }
865 if (++s == end) {
866 *error = "Rewrite schema error: '\\' not allowed at end.";
867 return false;
868 }
869 c = *s;
870 if (c == '\\') {
871 continue;
872 }
873 if (!isdigit(c)) {
874 *error = "Rewrite schema error: "
875 "'\\' must be followed by a digit or '\\'.";
876 return false;
877 }
878 int n = (c - '0');
879 if (max_token < n) {
880 max_token = n;
881 }
882 }
883
884 if (max_token > NumberOfCapturingGroups()) {
885 SStringPrintf(error, "Rewrite schema requests %d matches, "
886 "but the regexp only has %d parenthesized subexpressions.",
887 max_token, NumberOfCapturingGroups());
888 return false;
889 }
890 return true;
891 }
892
893 // Returns the maximum submatch needed for the rewrite to be done by Replace().
894 // E.g. if rewrite == "foo \\2,\\1", returns 2.
895 int RE2::MaxSubmatch(const StringPiece& rewrite) {
896 int max = 0;
897 for (const char *s = rewrite.data(), *end = s + rewrite.size();
898 s < end; s++) {
899 if (*s == '\\') {
900 s++;
901 int c = (s < end) ? *s : -1;
902 if (isdigit(c)) {
903 int n = (c - '0');
904 if (n > max)
905 max = n;
906 }
907 }
908 }
909 return max;
910 }
911
912 // Append the "rewrite" string, with backslash subsitutions from "vec",
913 // to string "out".
914 bool RE2::Rewrite(string* out,
915 const StringPiece& rewrite,
916 const StringPiece* vec,
917 int veclen) const {
918 for (const char *s = rewrite.data(), *end = s + rewrite.size();
919 s < end; s++) {
920 if (*s != '\\') {
921 out->push_back(*s);
922 continue;
923 }
924 s++;
925 int c = (s < end) ? *s : -1;
926 if (isdigit(c)) {
927 int n = (c - '0');
928 if (n >= veclen) {
929 if (options_.log_errors()) {
930 LOG(ERROR) << "requested group " << n
931 << " in regexp " << rewrite.data();
932 }
933 return false;
934 }
935 StringPiece snip = vec[n];
936 if (snip.size() > 0)
937 out->append(snip.data(), snip.size());
938 } else if (c == '\\') {
939 out->push_back('\\');
940 } else {
941 if (options_.log_errors())
942 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
943 return false;
944 }
945 }
946 return true;
947 }
948
949 /***** Parsers for various types *****/
950
951 bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) {
952 // We fail if somebody asked us to store into a non-NULL void* pointer
953 return (dest == NULL);
954 }
955
956 bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) {
957 if (dest == NULL) return true;
958 reinterpret_cast<string*>(dest)->assign(str, n);
959 return true;
960 }
961
962 bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
963 if (dest == NULL) return true;
964 *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
965 return true;
966 }
967
968 bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) {
969 if (n != 1) return false;
970 if (dest == NULL) return true;
971 *(reinterpret_cast<char*>(dest)) = str[0];
972 return true;
973 }
974
975 bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) {
976 if (n != 1) return false;
977 if (dest == NULL) return true;
978 *(reinterpret_cast<signed char*>(dest)) = str[0];
979 return true;
980 }
981
982 bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) {
983 if (n != 1) return false;
984 if (dest == NULL) return true;
985 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
986 return true;
987 }
988
989 // Largest number spec that we are willing to parse
990 static const int kMaxNumberLength = 32;
991
992 // REQUIRES "buf" must have length at least nbuf.
993 // Copies "str" into "buf" and null-terminates.
994 // Overwrites *np with the new length.
995 static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
996 size_t* np, bool accept_spaces) {
997 size_t n = *np;
998 if (n == 0) return "";
999 if (n > 0 && isspace(*str)) {
1000 // We are less forgiving than the strtoxxx() routines and do not
1001 // allow leading spaces. We do allow leading spaces for floats.
1002 if (!accept_spaces) {
1003 return "";
1004 }
1005 while (n > 0 && isspace(*str)) {
1006 n--;
1007 str++;
1008 }
1009 }
1010
1011 // Although buf has a fixed maximum size, we can still handle
1012 // arbitrarily large integers correctly by omitting leading zeros.
1013 // (Numbers that are still too long will be out of range.)
1014 // Before deciding whether str is too long,
1015 // remove leading zeros with s/000+/00/.
1016 // Leaving the leading two zeros in place means that
1017 // we don't change 0000x123 (invalid) into 0x123 (valid).
1018 // Skip over leading - before replacing.
1019 bool neg = false;
1020 if (n >= 1 && str[0] == '-') {
1021 neg = true;
1022 n--;
1023 str++;
1024 }
1025
1026 if (n >= 3 && str[0] == '0' && str[1] == '0') {
1027 while (n >= 3 && str[2] == '0') {
1028 n--;
1029 str++;
1030 }
1031 }
1032
1033 if (neg) { // make room in buf for -
1034 n++;
1035 str--;
1036 }
1037
1038 if (n > nbuf-1) return "";
1039
1040 memmove(buf, str, n);
1041 if (neg) {
1042 buf[0] = '-';
1043 }
1044 buf[n] = '\0';
1045 *np = n;
1046 return buf;
1047 }
1048
1049 bool RE2::Arg::parse_long_radix(const char* str,
1050 size_t n,
1051 void* dest,
1052 int radix) {
1053 if (n == 0) return false;
1054 char buf[kMaxNumberLength+1];
1055 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1056 char* end;
1057 errno = 0;
1058 long r = strtol(str, &end, radix);
1059 if (end != str + n) return false; // Leftover junk
1060 if (errno) return false;
1061 if (dest == NULL) return true;
1062 *(reinterpret_cast<long*>(dest)) = r;
1063 return true;
1064 }
1065
1066 bool RE2::Arg::parse_ulong_radix(const char* str,
1067 size_t n,
1068 void* dest,
1069 int radix) {
1070 if (n == 0) return false;
1071 char buf[kMaxNumberLength+1];
1072 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1073 if (str[0] == '-') {
1074 // strtoul() will silently accept negative numbers and parse
1075 // them. This module is more strict and treats them as errors.
1076 return false;
1077 }
1078
1079 char* end;
1080 errno = 0;
1081 unsigned long r = strtoul(str, &end, radix);
1082 if (end != str + n) return false; // Leftover junk
1083 if (errno) return false;
1084 if (dest == NULL) return true;
1085 *(reinterpret_cast<unsigned long*>(dest)) = r;
1086 return true;
1087 }
1088
1089 bool RE2::Arg::parse_short_radix(const char* str,
1090 size_t n,
1091 void* dest,
1092 int radix) {
1093 long r;
1094 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1095 if ((short)r != r) return false; // Out of range
1096 if (dest == NULL) return true;
1097 *(reinterpret_cast<short*>(dest)) = (short)r;
1098 return true;
1099 }
1100
1101 bool RE2::Arg::parse_ushort_radix(const char* str,
1102 size_t n,
1103 void* dest,
1104 int radix) {
1105 unsigned long r;
1106 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1107 if ((unsigned short)r != r) return false; // Out of range
1108 if (dest == NULL) return true;
1109 *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
1110 return true;
1111 }
1112
1113 bool RE2::Arg::parse_int_radix(const char* str,
1114 size_t n,
1115 void* dest,
1116 int radix) {
1117 long r;
1118 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1119 if ((int)r != r) return false; // Out of range
1120 if (dest == NULL) return true;
1121 *(reinterpret_cast<int*>(dest)) = (int)r;
1122 return true;
1123 }
1124
1125 bool RE2::Arg::parse_uint_radix(const char* str,
1126 size_t n,
1127 void* dest,
1128 int radix) {
1129 unsigned long r;
1130 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1131 if ((unsigned int)r != r) return false; // Out of range
1132 if (dest == NULL) return true;
1133 *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
1134 return true;
1135 }
1136
1137 bool RE2::Arg::parse_longlong_radix(const char* str,
1138 size_t n,
1139 void* dest,
1140 int radix) {
1141 if (n == 0) return false;
1142 char buf[kMaxNumberLength+1];
1143 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1144 char* end;
1145 errno = 0;
1146 long long r = strtoll(str, &end, radix);
1147 if (end != str + n) return false; // Leftover junk
1148 if (errno) return false;
1149 if (dest == NULL) return true;
1150 *(reinterpret_cast<long long*>(dest)) = r;
1151 return true;
1152 }
1153
1154 bool RE2::Arg::parse_ulonglong_radix(const char* str,
1155 size_t n,
1156 void* dest,
1157 int radix) {
1158 if (n == 0) return false;
1159 char buf[kMaxNumberLength+1];
1160 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1161 if (str[0] == '-') {
1162 // strtoull() will silently accept negative numbers and parse
1163 // them. This module is more strict and treats them as errors.
1164 return false;
1165 }
1166 char* end;
1167 errno = 0;
1168 unsigned long long r = strtoull(str, &end, radix);
1169 if (end != str + n) return false; // Leftover junk
1170 if (errno) return false;
1171 if (dest == NULL) return true;
1172 *(reinterpret_cast<unsigned long long*>(dest)) = r;
1173 return true;
1174 }
1175
1176 static bool parse_double_float(const char* str, size_t n, bool isfloat,
1177 void* dest) {
1178 if (n == 0) return false;
1179 static const int kMaxLength = 200;
1180 char buf[kMaxLength+1];
1181 str = TerminateNumber(buf, sizeof buf, str, &n, true);
1182 char* end;
1183 errno = 0;
1184 double r;
1185 if (isfloat) {
1186 r = strtof(str, &end);
1187 } else {
1188 r = strtod(str, &end);
1189 }
1190 if (end != str + n) return false; // Leftover junk
1191 if (errno) return false;
1192 if (dest == NULL) return true;
1193 if (isfloat) {
1194 *(reinterpret_cast<float*>(dest)) = (float)r;
1195 } else {
1196 *(reinterpret_cast<double*>(dest)) = r;
1197 }
1198 return true;
1199 }
1200
1201 bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) {
1202 return parse_double_float(str, n, false, dest);
1203 }
1204
1205 bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) {
1206 return parse_double_float(str, n, true, dest);
1207 }
1208
1209 #define DEFINE_INTEGER_PARSER(name) \
1210 bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \
1211 return parse_##name##_radix(str, n, dest, 10); \
1212 } \
1213 bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
1214 return parse_##name##_radix(str, n, dest, 16); \
1215 } \
1216 bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \
1217 return parse_##name##_radix(str, n, dest, 8); \
1218 } \
1219 bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \
1220 void* dest) { \
1221 return parse_##name##_radix(str, n, dest, 0); \
1222 }
1223
1224 DEFINE_INTEGER_PARSER(short);
1225 DEFINE_INTEGER_PARSER(ushort);
1226 DEFINE_INTEGER_PARSER(int);
1227 DEFINE_INTEGER_PARSER(uint);
1228 DEFINE_INTEGER_PARSER(long);
1229 DEFINE_INTEGER_PARSER(ulong);
1230 DEFINE_INTEGER_PARSER(longlong);
1231 DEFINE_INTEGER_PARSER(ulonglong);
1232
1233 #undef DEFINE_INTEGER_PARSER
1234
1235 } // namespace re2
1236