xref: /aosp_15_r20/external/regex-re2/re2/re2.cc (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1 // Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9 
10 #include "re2/re2.h"
11 
12 #include <assert.h>
13 #include <ctype.h>
14 #include <errno.h>
15 #include <stdint.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <algorithm>
19 #include <iterator>
20 #include <mutex>
21 #include <string>
22 #include <utility>
23 #include <vector>
24 
25 #include "util/util.h"
26 #include "util/logging.h"
27 #include "util/sparse_array.h"
28 #include "util/strutil.h"
29 #include "util/utf.h"
30 #include "re2/prog.h"
31 #include "re2/regexp.h"
32 
33 namespace re2 {
34 
35 // Maximum number of args we can set
36 static const int kMaxArgs = 16;
37 static const int kVecSize = 1+kMaxArgs;
38 
39 const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
40 
Options(RE2::CannedOptions opt)41 RE2::Options::Options(RE2::CannedOptions opt)
42   : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
43     posix_syntax_(opt == RE2::POSIX),
44     longest_match_(opt == RE2::POSIX),
45     log_errors_(opt != RE2::Quiet),
46     max_mem_(kDefaultMaxMem),
47     literal_(false),
48     never_nl_(false),
49     dot_nl_(false),
50     never_capture_(false),
51     case_sensitive_(true),
52     perl_classes_(false),
53     word_boundary_(false),
54     one_line_(false) {
55 }
56 
57 // static empty objects for use as const references.
58 // To avoid global constructors, allocated in RE2::Init().
59 static const string* empty_string;
60 static const std::map<string, int>* empty_named_groups;
61 static const std::map<int, string>* empty_group_names;
62 
63 // Converts from Regexp error code to RE2 error code.
64 // Maybe some day they will diverge.  In any event, this
65 // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)66 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
67   switch (code) {
68     case re2::kRegexpSuccess:
69       return RE2::NoError;
70     case re2::kRegexpInternalError:
71       return RE2::ErrorInternal;
72     case re2::kRegexpBadEscape:
73       return RE2::ErrorBadEscape;
74     case re2::kRegexpBadCharClass:
75       return RE2::ErrorBadCharClass;
76     case re2::kRegexpBadCharRange:
77       return RE2::ErrorBadCharRange;
78     case re2::kRegexpMissingBracket:
79       return RE2::ErrorMissingBracket;
80     case re2::kRegexpMissingParen:
81       return RE2::ErrorMissingParen;
82     case re2::kRegexpTrailingBackslash:
83       return RE2::ErrorTrailingBackslash;
84     case re2::kRegexpRepeatArgument:
85       return RE2::ErrorRepeatArgument;
86     case re2::kRegexpRepeatSize:
87       return RE2::ErrorRepeatSize;
88     case re2::kRegexpRepeatOp:
89       return RE2::ErrorRepeatOp;
90     case re2::kRegexpBadPerlOp:
91       return RE2::ErrorBadPerlOp;
92     case re2::kRegexpBadUTF8:
93       return RE2::ErrorBadUTF8;
94     case re2::kRegexpBadNamedCapture:
95       return RE2::ErrorBadNamedCapture;
96   }
97   return RE2::ErrorInternal;
98 }
99 
trunc(const StringPiece & pattern)100 static string trunc(const StringPiece& pattern) {
101   if (pattern.size() < 100)
102     return string(pattern);
103   return string(pattern.substr(0, 100)) + "...";
104 }
105 
106 
RE2(const char * pattern)107 RE2::RE2(const char* pattern) {
108   Init(pattern, DefaultOptions);
109 }
110 
RE2(const string & pattern)111 RE2::RE2(const string& pattern) {
112   Init(pattern, DefaultOptions);
113 }
114 
RE2(const StringPiece & pattern)115 RE2::RE2(const StringPiece& pattern) {
116   Init(pattern, DefaultOptions);
117 }
118 
RE2(const StringPiece & pattern,const Options & options)119 RE2::RE2(const StringPiece& pattern, const Options& options) {
120   Init(pattern, options);
121 }
122 
ParseFlags() const123 int RE2::Options::ParseFlags() const {
124   int flags = Regexp::ClassNL;
125   switch (encoding()) {
126     default:
127       if (log_errors())
128         LOG(ERROR) << "Unknown encoding " << encoding();
129       break;
130     case RE2::Options::EncodingUTF8:
131       break;
132     case RE2::Options::EncodingLatin1:
133       flags |= Regexp::Latin1;
134       break;
135   }
136 
137   if (!posix_syntax())
138     flags |= Regexp::LikePerl;
139 
140   if (literal())
141     flags |= Regexp::Literal;
142 
143   if (never_nl())
144     flags |= Regexp::NeverNL;
145 
146   if (dot_nl())
147     flags |= Regexp::DotNL;
148 
149   if (never_capture())
150     flags |= Regexp::NeverCapture;
151 
152   if (!case_sensitive())
153     flags |= Regexp::FoldCase;
154 
155   if (perl_classes())
156     flags |= Regexp::PerlClasses;
157 
158   if (word_boundary())
159     flags |= Regexp::PerlB;
160 
161   if (one_line())
162     flags |= Regexp::OneLine;
163 
164   return flags;
165 }
166 
Init(const StringPiece & pattern,const Options & options)167 void RE2::Init(const StringPiece& pattern, const Options& options) {
168   static std::once_flag empty_once;
169   std::call_once(empty_once, []() {
170     empty_string = new string;
171     empty_named_groups = new std::map<string, int>;
172     empty_group_names = new std::map<int, string>;
173   });
174 
175   pattern_ = string(pattern);
176   options_.Copy(options);
177   entire_regexp_ = NULL;
178   suffix_regexp_ = NULL;
179   prog_ = NULL;
180   num_captures_ = -1;
181   rprog_ = NULL;
182   error_ = empty_string;
183   error_code_ = NoError;
184   named_groups_ = NULL;
185   group_names_ = NULL;
186 
187   RegexpStatus status;
188   entire_regexp_ = Regexp::Parse(
189     pattern_,
190     static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
191     &status);
192   if (entire_regexp_ == NULL) {
193     if (options_.log_errors()) {
194       LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
195                  << status.Text();
196     }
197     error_ = new string(status.Text());
198     error_code_ = RegexpErrorToRE2(status.code());
199     error_arg_ = string(status.error_arg());
200     return;
201   }
202 
203   re2::Regexp* suffix;
204   if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
205     suffix_regexp_ = suffix;
206   else
207     suffix_regexp_ = entire_regexp_->Incref();
208 
209   // Two thirds of the memory goes to the forward Prog,
210   // one third to the reverse prog, because the forward
211   // Prog has two DFAs but the reverse prog has one.
212   prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
213   if (prog_ == NULL) {
214     if (options_.log_errors())
215       LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
216     error_ = new string("pattern too large - compile failed");
217     error_code_ = RE2::ErrorPatternTooLarge;
218     return;
219   }
220 
221   // We used to compute this lazily, but it's used during the
222   // typical control flow for a match call, so we now compute
223   // it eagerly, which avoids the overhead of std::once_flag.
224   num_captures_ = suffix_regexp_->NumCaptures();
225 
226   // Could delay this until the first match call that
227   // cares about submatch information, but the one-pass
228   // machine's memory gets cut from the DFA memory budget,
229   // and that is harder to do if the DFA has already
230   // been built.
231   is_one_pass_ = prog_->IsOnePass();
232 }
233 
234 // Returns rprog_, computing it if needed.
ReverseProg() const235 re2::Prog* RE2::ReverseProg() const {
236   std::call_once(rprog_once_, [](const RE2* re) {
237     re->rprog_ =
238         re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
239     if (re->rprog_ == NULL) {
240       if (re->options_.log_errors())
241         LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
242       re->error_ = new string("pattern too large - reverse compile failed");
243       re->error_code_ = RE2::ErrorPatternTooLarge;
244     }
245   }, this);
246   return rprog_;
247 }
248 
~RE2()249 RE2::~RE2() {
250   if (suffix_regexp_)
251     suffix_regexp_->Decref();
252   if (entire_regexp_)
253     entire_regexp_->Decref();
254   delete prog_;
255   delete rprog_;
256   if (error_ != empty_string)
257     delete error_;
258   if (named_groups_ != NULL && named_groups_ != empty_named_groups)
259     delete named_groups_;
260   if (group_names_ != NULL &&  group_names_ != empty_group_names)
261     delete group_names_;
262 }
263 
ProgramSize() const264 int RE2::ProgramSize() const {
265   if (prog_ == NULL)
266     return -1;
267   return prog_->size();
268 }
269 
ReverseProgramSize() const270 int RE2::ReverseProgramSize() const {
271   if (prog_ == NULL)
272     return -1;
273   Prog* prog = ReverseProg();
274   if (prog == NULL)
275     return -1;
276   return prog->size();
277 }
278 
Fanout(Prog * prog,std::map<int,int> * histogram)279 static int Fanout(Prog* prog, std::map<int, int>* histogram) {
280   SparseArray<int> fanout(prog->size());
281   prog->Fanout(&fanout);
282   histogram->clear();
283   for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
284     // TODO(junyer): Optimise this?
285     int bucket = 0;
286     while (1 << bucket < i->value()) {
287       bucket++;
288     }
289     (*histogram)[bucket]++;
290   }
291   return histogram->rbegin()->first;
292 }
293 
ProgramFanout(std::map<int,int> * histogram) const294 int RE2::ProgramFanout(std::map<int, int>* histogram) const {
295   if (prog_ == NULL)
296     return -1;
297   return Fanout(prog_, histogram);
298 }
299 
ReverseProgramFanout(std::map<int,int> * histogram) const300 int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
301   if (prog_ == NULL)
302     return -1;
303   Prog* prog = ReverseProg();
304   if (prog == NULL)
305     return -1;
306   return Fanout(prog, histogram);
307 }
308 
309 // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const310 const std::map<string, int>& RE2::NamedCapturingGroups() const {
311   std::call_once(named_groups_once_, [](const RE2* re) {
312     if (re->suffix_regexp_ != NULL)
313       re->named_groups_ = re->suffix_regexp_->NamedCaptures();
314     if (re->named_groups_ == NULL)
315       re->named_groups_ = empty_named_groups;
316   }, this);
317   return *named_groups_;
318 }
319 
320 // Returns group_names_, computing it if needed.
CapturingGroupNames() const321 const std::map<int, string>& RE2::CapturingGroupNames() const {
322   std::call_once(group_names_once_, [](const RE2* re) {
323     if (re->suffix_regexp_ != NULL)
324       re->group_names_ = re->suffix_regexp_->CaptureNames();
325     if (re->group_names_ == NULL)
326       re->group_names_ = empty_group_names;
327   }, this);
328   return *group_names_;
329 }
330 
331 /***** Convenience interfaces *****/
332 
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)333 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
334                      const Arg* const args[], int n) {
335   return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
336 }
337 
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)338 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
339                         const Arg* const args[], int n) {
340   return re.DoMatch(text, UNANCHORED, NULL, args, n);
341 }
342 
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)343 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
344                    const Arg* const args[], int n) {
345   size_t consumed;
346   if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
347     input->remove_prefix(consumed);
348     return true;
349   } else {
350     return false;
351   }
352 }
353 
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)354 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
355                           const Arg* const args[], int n) {
356   size_t consumed;
357   if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
358     input->remove_prefix(consumed);
359     return true;
360   } else {
361     return false;
362   }
363 }
364 
Replace(string * str,const RE2 & re,const StringPiece & rewrite)365 bool RE2::Replace(string* str,
366                   const RE2& re,
367                   const StringPiece& rewrite) {
368   StringPiece vec[kVecSize];
369   int nvec = 1 + MaxSubmatch(rewrite);
370   if (nvec > arraysize(vec))
371     return false;
372   if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
373     return false;
374 
375   string s;
376   if (!re.Rewrite(&s, rewrite, vec, nvec))
377     return false;
378 
379   assert(vec[0].begin() >= str->data());
380   assert(vec[0].end() <= str->data()+str->size());
381   str->replace(vec[0].data() - str->data(), vec[0].size(), s);
382   return true;
383 }
384 
GlobalReplace(string * str,const RE2 & re,const StringPiece & rewrite)385 int RE2::GlobalReplace(string* str,
386                        const RE2& re,
387                        const StringPiece& rewrite) {
388   StringPiece vec[kVecSize];
389   int nvec = 1 + MaxSubmatch(rewrite);
390   if (nvec > arraysize(vec))
391     return false;
392 
393   const char* p = str->data();
394   const char* ep = p + str->size();
395   const char* lastend = NULL;
396   string out;
397   int count = 0;
398 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
399   // Iterate just once when fuzzing. Otherwise, we easily get bogged down
400   // and coverage is unlikely to improve despite significant expense.
401   while (p == str->data()) {
402 #else
403   while (p <= ep) {
404 #endif
405     if (!re.Match(*str, static_cast<size_t>(p - str->data()),
406                   str->size(), UNANCHORED, vec, nvec))
407       break;
408     if (p < vec[0].begin())
409       out.append(p, vec[0].begin() - p);
410     if (vec[0].begin() == lastend && vec[0].size() == 0) {
411       // Disallow empty match at end of last match: skip ahead.
412       //
413       // fullrune() takes int, not size_t. However, it just looks
414       // at the leading byte and treats any length >= 4 the same.
415       if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
416           fullrune(p, static_cast<int>(std::min(static_cast<ptrdiff_t>(4),
417                                                 ep - p)))) {
418         // re is in UTF-8 mode and there is enough left of str
419         // to allow us to advance by up to UTFmax bytes.
420         Rune r;
421         int n = chartorune(&r, p);
422         // Some copies of chartorune have a bug that accepts
423         // encodings of values in (10FFFF, 1FFFFF] as valid.
424         if (r > Runemax) {
425           n = 1;
426           r = Runeerror;
427         }
428         if (!(n == 1 && r == Runeerror)) {  // no decoding error
429           out.append(p, n);
430           p += n;
431           continue;
432         }
433       }
434       // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
435       // we fell through from above and the GIGO principle applies.
436       if (p < ep)
437         out.append(p, 1);
438       p++;
439       continue;
440     }
441     re.Rewrite(&out, rewrite, vec, nvec);
442     p = vec[0].end();
443     lastend = p;
444     count++;
445   }
446 
447   if (count == 0)
448     return 0;
449 
450   if (p < ep)
451     out.append(p, ep - p);
452   using std::swap;
453   swap(out, *str);
454   return count;
455 }
456 
457 bool RE2::Extract(const StringPiece& text,
458                   const RE2& re,
459                   const StringPiece& rewrite,
460                   string* out) {
461   StringPiece vec[kVecSize];
462   int nvec = 1 + MaxSubmatch(rewrite);
463   if (nvec > arraysize(vec))
464     return false;
465 
466   if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
467     return false;
468 
469   out->clear();
470   return re.Rewrite(out, rewrite, vec, nvec);
471 }
472 
473 string RE2::QuoteMeta(const StringPiece& unquoted) {
474   string result;
475   result.reserve(unquoted.size() << 1);
476 
477   // Escape any ascii character not in [A-Za-z_0-9].
478   //
479   // Note that it's legal to escape a character even if it has no
480   // special meaning in a regular expression -- so this function does
481   // that.  (This also makes it identical to the perl function of the
482   // same name except for the null-character special case;
483   // see `perldoc -f quotemeta`.)
484   for (size_t ii = 0; ii < unquoted.size(); ++ii) {
485     // Note that using 'isalnum' here raises the benchmark time from
486     // 32ns to 58ns:
487     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
488         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
489         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
490         unquoted[ii] != '_' &&
491         // If this is the part of a UTF8 or Latin1 character, we need
492         // to copy this byte without escaping.  Experimentally this is
493         // what works correctly with the regexp library.
494         !(unquoted[ii] & 128)) {
495       if (unquoted[ii] == '\0') {  // Special handling for null chars.
496         // Note that this special handling is not strictly required for RE2,
497         // but this quoting is required for other regexp libraries such as
498         // PCRE.
499         // Can't use "\\0" since the next character might be a digit.
500         result += "\\x00";
501         continue;
502       }
503       result += '\\';
504     }
505     result += unquoted[ii];
506   }
507 
508   return result;
509 }
510 
511 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
512   if (prog_ == NULL)
513     return false;
514 
515   int n = static_cast<int>(prefix_.size());
516   if (n > maxlen)
517     n = maxlen;
518 
519   // Determine initial min max from prefix_ literal.
520   *min = prefix_.substr(0, n);
521   *max = prefix_.substr(0, n);
522   if (prefix_foldcase_) {
523     // prefix is ASCII lowercase; change *min to uppercase.
524     for (int i = 0; i < n; i++) {
525       char& c = (*min)[i];
526       if ('a' <= c && c <= 'z')
527         c += 'A' - 'a';
528     }
529   }
530 
531   // Add to prefix min max using PossibleMatchRange on regexp.
532   string dmin, dmax;
533   maxlen -= n;
534   if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
535     min->append(dmin);
536     max->append(dmax);
537   } else if (!max->empty()) {
538     // prog_->PossibleMatchRange has failed us,
539     // but we still have useful information from prefix_.
540     // Round up *max to allow any possible suffix.
541     PrefixSuccessor(max);
542   } else {
543     // Nothing useful.
544     *min = "";
545     *max = "";
546     return false;
547   }
548 
549   return true;
550 }
551 
552 // Avoid possible locale nonsense in standard strcasecmp.
553 // The string a is known to be all lowercase.
554 static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
555   const char* ae = a + len;
556 
557   for (; a < ae; a++, b++) {
558     uint8_t x = *a;
559     uint8_t y = *b;
560     if ('A' <= y && y <= 'Z')
561       y += 'a' - 'A';
562     if (x != y)
563       return x - y;
564   }
565   return 0;
566 }
567 
568 
569 /***** Actual matching and rewriting code *****/
570 
571 bool RE2::Match(const StringPiece& text,
572                 size_t startpos,
573                 size_t endpos,
574                 Anchor re_anchor,
575                 StringPiece* submatch,
576                 int nsubmatch) const {
577   if (!ok()) {
578     if (options_.log_errors())
579       LOG(ERROR) << "Invalid RE2: " << *error_;
580     return false;
581   }
582 
583   if (startpos > endpos || endpos > text.size()) {
584     if (options_.log_errors())
585       LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
586                  << "startpos: " << startpos << ", "
587                  << "endpos: " << endpos << ", "
588                  << "text size: " << text.size() << "]";
589     return false;
590   }
591 
592   StringPiece subtext = text;
593   subtext.remove_prefix(startpos);
594   subtext.remove_suffix(text.size() - endpos);
595 
596   // Use DFAs to find exact location of match, filter out non-matches.
597 
598   // Don't ask for the location if we won't use it.
599   // SearchDFA can do extra optimizations in that case.
600   StringPiece match;
601   StringPiece* matchp = &match;
602   if (nsubmatch == 0)
603     matchp = NULL;
604 
605   int ncap = 1 + NumberOfCapturingGroups();
606   if (ncap > nsubmatch)
607     ncap = nsubmatch;
608 
609   // If the regexp is anchored explicitly, must not be in middle of text.
610   if (prog_->anchor_start() && startpos != 0)
611     return false;
612 
613   // If the regexp is anchored explicitly, update re_anchor
614   // so that we can potentially fall into a faster case below.
615   if (prog_->anchor_start() && prog_->anchor_end())
616     re_anchor = ANCHOR_BOTH;
617   else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
618     re_anchor = ANCHOR_START;
619 
620   // Check for the required prefix, if any.
621   size_t prefixlen = 0;
622   if (!prefix_.empty()) {
623     if (startpos != 0)
624       return false;
625     prefixlen = prefix_.size();
626     if (prefixlen > subtext.size())
627       return false;
628     if (prefix_foldcase_) {
629       if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
630         return false;
631     } else {
632       if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
633         return false;
634     }
635     subtext.remove_prefix(prefixlen);
636     // If there is a required prefix, the anchor must be at least ANCHOR_START.
637     if (re_anchor != ANCHOR_BOTH)
638       re_anchor = ANCHOR_START;
639   }
640 
641   Prog::Anchor anchor = Prog::kUnanchored;
642   Prog::MatchKind kind = Prog::kFirstMatch;
643   if (options_.longest_match())
644     kind = Prog::kLongestMatch;
645   bool skipped_test = false;
646 
647   bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
648 
649   // SearchBitState allocates a bit vector of size prog_->size() * text.size().
650   // It also allocates a stack of 3-word structures which could potentially
651   // grow as large as prog_->size() * text.size() but in practice is much
652   // smaller.
653   // Conditions for using SearchBitState:
654   const int MaxBitStateProg = 500;   // prog_->size() <= Max.
655   const int MaxBitStateVector = 256*1024;  // bit vector size <= Max (bits)
656   bool can_bit_state = prog_->size() <= MaxBitStateProg;
657   size_t bit_state_text_max = MaxBitStateVector / prog_->size();
658 
659   bool dfa_failed = false;
660   switch (re_anchor) {
661     default:
662     case UNANCHORED: {
663       if (!prog_->SearchDFA(subtext, text, anchor, kind,
664                             matchp, &dfa_failed, NULL)) {
665         if (dfa_failed) {
666           if (options_.log_errors())
667             LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
668                        << "bytemap range " << prog_->bytemap_range() << ", "
669                        << "list count " << prog_->list_count();
670           // Fall back to NFA below.
671           skipped_test = true;
672           break;
673         }
674         return false;
675       }
676       if (matchp == NULL)  // Matched.  Don't care where
677         return true;
678       // SearchDFA set match[0].end() but didn't know where the
679       // match started.  Run the regexp backward from match[0].end()
680       // to find the longest possible match -- that's where it started.
681       Prog* prog = ReverseProg();
682       if (prog == NULL)
683         return false;
684       if (!prog->SearchDFA(match, text, Prog::kAnchored,
685                            Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
686         if (dfa_failed) {
687           if (options_.log_errors())
688             LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", "
689                        << "bytemap range " << prog->bytemap_range() << ", "
690                        << "list count " << prog->list_count();
691           // Fall back to NFA below.
692           skipped_test = true;
693           break;
694         }
695         if (options_.log_errors())
696           LOG(ERROR) << "SearchDFA inconsistency";
697         return false;
698       }
699       break;
700     }
701 
702     case ANCHOR_BOTH:
703     case ANCHOR_START:
704       if (re_anchor == ANCHOR_BOTH)
705         kind = Prog::kFullMatch;
706       anchor = Prog::kAnchored;
707 
708       // If only a small amount of text and need submatch
709       // information anyway and we're going to use OnePass or BitState
710       // to get it, we might as well not even bother with the DFA:
711       // OnePass or BitState will be fast enough.
712       // On tiny texts, OnePass outruns even the DFA, and
713       // it doesn't have the shared state and occasional mutex that
714       // the DFA does.
715       if (can_one_pass && text.size() <= 4096 &&
716           (ncap > 1 || text.size() <= 8)) {
717         skipped_test = true;
718         break;
719       }
720       if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
721         skipped_test = true;
722         break;
723       }
724       if (!prog_->SearchDFA(subtext, text, anchor, kind,
725                             &match, &dfa_failed, NULL)) {
726         if (dfa_failed) {
727           if (options_.log_errors())
728             LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
729                        << "bytemap range " << prog_->bytemap_range() << ", "
730                        << "list count " << prog_->list_count();
731           // Fall back to NFA below.
732           skipped_test = true;
733           break;
734         }
735         return false;
736       }
737       break;
738   }
739 
740   if (!skipped_test && ncap <= 1) {
741     // We know exactly where it matches.  That's enough.
742     if (ncap == 1)
743       submatch[0] = match;
744   } else {
745     StringPiece subtext1;
746     if (skipped_test) {
747       // DFA ran out of memory or was skipped:
748       // need to search in entire original text.
749       subtext1 = subtext;
750     } else {
751       // DFA found the exact match location:
752       // let NFA run an anchored, full match search
753       // to find submatch locations.
754       subtext1 = match;
755       anchor = Prog::kAnchored;
756       kind = Prog::kFullMatch;
757     }
758 
759     if (can_one_pass && anchor != Prog::kUnanchored) {
760       if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
761         if (!skipped_test && options_.log_errors())
762           LOG(ERROR) << "SearchOnePass inconsistency";
763         return false;
764       }
765     } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
766       if (!prog_->SearchBitState(subtext1, text, anchor,
767                                  kind, submatch, ncap)) {
768         if (!skipped_test && options_.log_errors())
769           LOG(ERROR) << "SearchBitState inconsistency";
770         return false;
771       }
772     } else {
773       if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
774         if (!skipped_test && options_.log_errors())
775           LOG(ERROR) << "SearchNFA inconsistency";
776         return false;
777       }
778     }
779   }
780 
781   // Adjust overall match for required prefix that we stripped off.
782   if (prefixlen > 0 && nsubmatch > 0)
783     submatch[0] = StringPiece(submatch[0].data() - prefixlen,
784                               submatch[0].size() + prefixlen);
785 
786   // Zero submatches that don't exist in the regexp.
787   for (int i = ncap; i < nsubmatch; i++)
788     submatch[i] = StringPiece();
789   return true;
790 }
791 
792 // Internal matcher - like Match() but takes Args not StringPieces.
793 bool RE2::DoMatch(const StringPiece& text,
794                   Anchor re_anchor,
795                   size_t* consumed,
796                   const Arg* const* args,
797                   int n) const {
798   if (!ok()) {
799     if (options_.log_errors())
800       LOG(ERROR) << "Invalid RE2: " << *error_;
801     return false;
802   }
803 
804   if (NumberOfCapturingGroups() < n) {
805     // RE has fewer capturing groups than number of Arg pointers passed in.
806     return false;
807   }
808 
809   // Count number of capture groups needed.
810   int nvec;
811   if (n == 0 && consumed == NULL)
812     nvec = 0;
813   else
814     nvec = n+1;
815 
816   StringPiece* vec;
817   StringPiece stkvec[kVecSize];
818   StringPiece* heapvec = NULL;
819 
820   if (nvec <= arraysize(stkvec)) {
821     vec = stkvec;
822   } else {
823     vec = new StringPiece[nvec];
824     heapvec = vec;
825   }
826 
827   if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
828     delete[] heapvec;
829     return false;
830   }
831 
832   if (consumed != NULL)
833     *consumed = static_cast<size_t>(vec[0].end() - text.begin());
834 
835   if (n == 0 || args == NULL) {
836     // We are not interested in results
837     delete[] heapvec;
838     return true;
839   }
840 
841   // If we got here, we must have matched the whole pattern.
842   for (int i = 0; i < n; i++) {
843     const StringPiece& s = vec[i+1];
844     if (!args[i]->Parse(s.data(), s.size())) {
845       // TODO: Should we indicate what the error was?
846       delete[] heapvec;
847       return false;
848     }
849   }
850 
851   delete[] heapvec;
852   return true;
853 }
854 
855 // Checks that the rewrite string is well-formed with respect to this
856 // regular expression.
857 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
858   int max_token = -1;
859   for (const char *s = rewrite.data(), *end = s + rewrite.size();
860        s < end; s++) {
861     int c = *s;
862     if (c != '\\') {
863       continue;
864     }
865     if (++s == end) {
866       *error = "Rewrite schema error: '\\' not allowed at end.";
867       return false;
868     }
869     c = *s;
870     if (c == '\\') {
871       continue;
872     }
873     if (!isdigit(c)) {
874       *error = "Rewrite schema error: "
875                "'\\' must be followed by a digit or '\\'.";
876       return false;
877     }
878     int n = (c - '0');
879     if (max_token < n) {
880       max_token = n;
881     }
882   }
883 
884   if (max_token > NumberOfCapturingGroups()) {
885     SStringPrintf(error, "Rewrite schema requests %d matches, "
886                   "but the regexp only has %d parenthesized subexpressions.",
887                   max_token, NumberOfCapturingGroups());
888     return false;
889   }
890   return true;
891 }
892 
893 // Returns the maximum submatch needed for the rewrite to be done by Replace().
894 // E.g. if rewrite == "foo \\2,\\1", returns 2.
895 int RE2::MaxSubmatch(const StringPiece& rewrite) {
896   int max = 0;
897   for (const char *s = rewrite.data(), *end = s + rewrite.size();
898        s < end; s++) {
899     if (*s == '\\') {
900       s++;
901       int c = (s < end) ? *s : -1;
902       if (isdigit(c)) {
903         int n = (c - '0');
904         if (n > max)
905           max = n;
906       }
907     }
908   }
909   return max;
910 }
911 
912 // Append the "rewrite" string, with backslash subsitutions from "vec",
913 // to string "out".
914 bool RE2::Rewrite(string* out,
915                   const StringPiece& rewrite,
916                   const StringPiece* vec,
917                   int veclen) const {
918   for (const char *s = rewrite.data(), *end = s + rewrite.size();
919        s < end; s++) {
920     if (*s != '\\') {
921       out->push_back(*s);
922       continue;
923     }
924     s++;
925     int c = (s < end) ? *s : -1;
926     if (isdigit(c)) {
927       int n = (c - '0');
928       if (n >= veclen) {
929         if (options_.log_errors()) {
930           LOG(ERROR) << "requested group " << n
931                      << " in regexp " << rewrite.data();
932         }
933         return false;
934       }
935       StringPiece snip = vec[n];
936       if (snip.size() > 0)
937         out->append(snip.data(), snip.size());
938     } else if (c == '\\') {
939       out->push_back('\\');
940     } else {
941       if (options_.log_errors())
942         LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
943       return false;
944     }
945   }
946   return true;
947 }
948 
949 /***** Parsers for various types *****/
950 
951 bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) {
952   // We fail if somebody asked us to store into a non-NULL void* pointer
953   return (dest == NULL);
954 }
955 
956 bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) {
957   if (dest == NULL) return true;
958   reinterpret_cast<string*>(dest)->assign(str, n);
959   return true;
960 }
961 
962 bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
963   if (dest == NULL) return true;
964   *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
965   return true;
966 }
967 
968 bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) {
969   if (n != 1) return false;
970   if (dest == NULL) return true;
971   *(reinterpret_cast<char*>(dest)) = str[0];
972   return true;
973 }
974 
975 bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) {
976   if (n != 1) return false;
977   if (dest == NULL) return true;
978   *(reinterpret_cast<signed char*>(dest)) = str[0];
979   return true;
980 }
981 
982 bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) {
983   if (n != 1) return false;
984   if (dest == NULL) return true;
985   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
986   return true;
987 }
988 
989 // Largest number spec that we are willing to parse
990 static const int kMaxNumberLength = 32;
991 
992 // REQUIRES "buf" must have length at least nbuf.
993 // Copies "str" into "buf" and null-terminates.
994 // Overwrites *np with the new length.
995 static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
996                                    size_t* np, bool accept_spaces) {
997   size_t n = *np;
998   if (n == 0) return "";
999   if (n > 0 && isspace(*str)) {
1000     // We are less forgiving than the strtoxxx() routines and do not
1001     // allow leading spaces. We do allow leading spaces for floats.
1002     if (!accept_spaces) {
1003       return "";
1004     }
1005     while (n > 0 && isspace(*str)) {
1006       n--;
1007       str++;
1008     }
1009   }
1010 
1011   // Although buf has a fixed maximum size, we can still handle
1012   // arbitrarily large integers correctly by omitting leading zeros.
1013   // (Numbers that are still too long will be out of range.)
1014   // Before deciding whether str is too long,
1015   // remove leading zeros with s/000+/00/.
1016   // Leaving the leading two zeros in place means that
1017   // we don't change 0000x123 (invalid) into 0x123 (valid).
1018   // Skip over leading - before replacing.
1019   bool neg = false;
1020   if (n >= 1 && str[0] == '-') {
1021     neg = true;
1022     n--;
1023     str++;
1024   }
1025 
1026   if (n >= 3 && str[0] == '0' && str[1] == '0') {
1027     while (n >= 3 && str[2] == '0') {
1028       n--;
1029       str++;
1030     }
1031   }
1032 
1033   if (neg) {  // make room in buf for -
1034     n++;
1035     str--;
1036   }
1037 
1038   if (n > nbuf-1) return "";
1039 
1040   memmove(buf, str, n);
1041   if (neg) {
1042     buf[0] = '-';
1043   }
1044   buf[n] = '\0';
1045   *np = n;
1046   return buf;
1047 }
1048 
1049 bool RE2::Arg::parse_long_radix(const char* str,
1050                                 size_t n,
1051                                 void* dest,
1052                                 int radix) {
1053   if (n == 0) return false;
1054   char buf[kMaxNumberLength+1];
1055   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1056   char* end;
1057   errno = 0;
1058   long r = strtol(str, &end, radix);
1059   if (end != str + n) return false;   // Leftover junk
1060   if (errno) return false;
1061   if (dest == NULL) return true;
1062   *(reinterpret_cast<long*>(dest)) = r;
1063   return true;
1064 }
1065 
1066 bool RE2::Arg::parse_ulong_radix(const char* str,
1067                                  size_t n,
1068                                  void* dest,
1069                                  int radix) {
1070   if (n == 0) return false;
1071   char buf[kMaxNumberLength+1];
1072   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1073   if (str[0] == '-') {
1074     // strtoul() will silently accept negative numbers and parse
1075     // them.  This module is more strict and treats them as errors.
1076     return false;
1077   }
1078 
1079   char* end;
1080   errno = 0;
1081   unsigned long r = strtoul(str, &end, radix);
1082   if (end != str + n) return false;   // Leftover junk
1083   if (errno) return false;
1084   if (dest == NULL) return true;
1085   *(reinterpret_cast<unsigned long*>(dest)) = r;
1086   return true;
1087 }
1088 
1089 bool RE2::Arg::parse_short_radix(const char* str,
1090                                  size_t n,
1091                                  void* dest,
1092                                  int radix) {
1093   long r;
1094   if (!parse_long_radix(str, n, &r, radix)) return false;  // Could not parse
1095   if ((short)r != r) return false;                         // Out of range
1096   if (dest == NULL) return true;
1097   *(reinterpret_cast<short*>(dest)) = (short)r;
1098   return true;
1099 }
1100 
1101 bool RE2::Arg::parse_ushort_radix(const char* str,
1102                                   size_t n,
1103                                   void* dest,
1104                                   int radix) {
1105   unsigned long r;
1106   if (!parse_ulong_radix(str, n, &r, radix)) return false;  // Could not parse
1107   if ((unsigned short)r != r) return false;                 // Out of range
1108   if (dest == NULL) return true;
1109   *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
1110   return true;
1111 }
1112 
1113 bool RE2::Arg::parse_int_radix(const char* str,
1114                                size_t n,
1115                                void* dest,
1116                                int radix) {
1117   long r;
1118   if (!parse_long_radix(str, n, &r, radix)) return false;  // Could not parse
1119   if ((int)r != r) return false;                           // Out of range
1120   if (dest == NULL) return true;
1121   *(reinterpret_cast<int*>(dest)) = (int)r;
1122   return true;
1123 }
1124 
1125 bool RE2::Arg::parse_uint_radix(const char* str,
1126                                 size_t n,
1127                                 void* dest,
1128                                 int radix) {
1129   unsigned long r;
1130   if (!parse_ulong_radix(str, n, &r, radix)) return false;  // Could not parse
1131   if ((unsigned int)r != r) return false;                   // Out of range
1132   if (dest == NULL) return true;
1133   *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
1134   return true;
1135 }
1136 
1137 bool RE2::Arg::parse_longlong_radix(const char* str,
1138                                     size_t n,
1139                                     void* dest,
1140                                     int radix) {
1141   if (n == 0) return false;
1142   char buf[kMaxNumberLength+1];
1143   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1144   char* end;
1145   errno = 0;
1146   long long r = strtoll(str, &end, radix);
1147   if (end != str + n) return false;   // Leftover junk
1148   if (errno) return false;
1149   if (dest == NULL) return true;
1150   *(reinterpret_cast<long long*>(dest)) = r;
1151   return true;
1152 }
1153 
1154 bool RE2::Arg::parse_ulonglong_radix(const char* str,
1155                                      size_t n,
1156                                      void* dest,
1157                                      int radix) {
1158   if (n == 0) return false;
1159   char buf[kMaxNumberLength+1];
1160   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1161   if (str[0] == '-') {
1162     // strtoull() will silently accept negative numbers and parse
1163     // them.  This module is more strict and treats them as errors.
1164     return false;
1165   }
1166   char* end;
1167   errno = 0;
1168   unsigned long long r = strtoull(str, &end, radix);
1169   if (end != str + n) return false;   // Leftover junk
1170   if (errno) return false;
1171   if (dest == NULL) return true;
1172   *(reinterpret_cast<unsigned long long*>(dest)) = r;
1173   return true;
1174 }
1175 
1176 static bool parse_double_float(const char* str, size_t n, bool isfloat,
1177                                void* dest) {
1178   if (n == 0) return false;
1179   static const int kMaxLength = 200;
1180   char buf[kMaxLength+1];
1181   str = TerminateNumber(buf, sizeof buf, str, &n, true);
1182   char* end;
1183   errno = 0;
1184   double r;
1185   if (isfloat) {
1186     r = strtof(str, &end);
1187   } else {
1188     r = strtod(str, &end);
1189   }
1190   if (end != str + n) return false;   // Leftover junk
1191   if (errno) return false;
1192   if (dest == NULL) return true;
1193   if (isfloat) {
1194     *(reinterpret_cast<float*>(dest)) = (float)r;
1195   } else {
1196     *(reinterpret_cast<double*>(dest)) = r;
1197   }
1198   return true;
1199 }
1200 
1201 bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) {
1202   return parse_double_float(str, n, false, dest);
1203 }
1204 
1205 bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) {
1206   return parse_double_float(str, n, true, dest);
1207 }
1208 
1209 #define DEFINE_INTEGER_PARSER(name)                                            \
1210   bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) {         \
1211     return parse_##name##_radix(str, n, dest, 10);                             \
1212   }                                                                            \
1213   bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) {   \
1214     return parse_##name##_radix(str, n, dest, 16);                             \
1215   }                                                                            \
1216   bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \
1217     return parse_##name##_radix(str, n, dest, 8);                              \
1218   }                                                                            \
1219   bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n,              \
1220                                        void* dest) {                           \
1221     return parse_##name##_radix(str, n, dest, 0);                              \
1222   }
1223 
1224 DEFINE_INTEGER_PARSER(short);
1225 DEFINE_INTEGER_PARSER(ushort);
1226 DEFINE_INTEGER_PARSER(int);
1227 DEFINE_INTEGER_PARSER(uint);
1228 DEFINE_INTEGER_PARSER(long);
1229 DEFINE_INTEGER_PARSER(ulong);
1230 DEFINE_INTEGER_PARSER(longlong);
1231 DEFINE_INTEGER_PARSER(ulonglong);
1232 
1233 #undef DEFINE_INTEGER_PARSER
1234 
1235 }  // namespace re2
1236