1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression representation.
6 // Tested by parse_test.cc
7
8 #include "re2/regexp.h"
9
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13 #include <algorithm>
14 #include <map>
15 #include <mutex>
16 #include <string>
17 #include <vector>
18
19 #include "util/util.h"
20 #include "util/logging.h"
21 #include "util/mutex.h"
22 #include "util/utf.h"
23 #include "re2/pod_array.h"
24 #include "re2/stringpiece.h"
25 #include "re2/walker-inl.h"
26
27 namespace re2 {
28
29 // Constructor. Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)30 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
31 : op_(static_cast<uint8_t>(op)),
32 simple_(false),
33 parse_flags_(static_cast<uint16_t>(parse_flags)),
34 ref_(1),
35 nsub_(0),
36 down_(NULL) {
37 subone_ = NULL;
38 memset(the_union_, 0, sizeof the_union_);
39 }
40
41 // Destructor. Assumes already cleaned up children.
42 // Private: use Decref() instead of delete to destroy Regexps.
43 // Can't call Decref on the sub-Regexps here because
44 // that could cause arbitrarily deep recursion, so
45 // required Decref() to have handled them for us.
~Regexp()46 Regexp::~Regexp() {
47 if (nsub_ > 0)
48 LOG(DFATAL) << "Regexp not destroyed.";
49
50 switch (op_) {
51 default:
52 break;
53 case kRegexpCapture:
54 delete name_;
55 break;
56 case kRegexpLiteralString:
57 delete[] runes_;
58 break;
59 case kRegexpCharClass:
60 if (cc_)
61 cc_->Delete();
62 delete ccb_;
63 break;
64 }
65 }
66
67 // If it's possible to destroy this regexp without recurring,
68 // do so and return true. Else return false.
QuickDestroy()69 bool Regexp::QuickDestroy() {
70 if (nsub_ == 0) {
71 delete this;
72 return true;
73 }
74 return false;
75 }
76
77 // Lazily allocated.
78 static Mutex* ref_mutex;
79 static std::map<Regexp*, int>* ref_map;
80
Ref()81 int Regexp::Ref() {
82 if (ref_ < kMaxRef)
83 return ref_;
84
85 MutexLock l(ref_mutex);
86 return (*ref_map)[this];
87 }
88
89 // Increments reference count, returns object as convenience.
Incref()90 Regexp* Regexp::Incref() {
91 if (ref_ >= kMaxRef-1) {
92 static std::once_flag ref_once;
93 std::call_once(ref_once, []() {
94 ref_mutex = new Mutex;
95 ref_map = new std::map<Regexp*, int>;
96 });
97
98 // Store ref count in overflow map.
99 MutexLock l(ref_mutex);
100 if (ref_ == kMaxRef) {
101 // already overflowed
102 (*ref_map)[this]++;
103 } else {
104 // overflowing now
105 (*ref_map)[this] = kMaxRef;
106 ref_ = kMaxRef;
107 }
108 return this;
109 }
110
111 ref_++;
112 return this;
113 }
114
115 // Decrements reference count and deletes this object if count reaches 0.
Decref()116 void Regexp::Decref() {
117 if (ref_ == kMaxRef) {
118 // Ref count is stored in overflow map.
119 MutexLock l(ref_mutex);
120 int r = (*ref_map)[this] - 1;
121 if (r < kMaxRef) {
122 ref_ = static_cast<uint16_t>(r);
123 ref_map->erase(this);
124 } else {
125 (*ref_map)[this] = r;
126 }
127 return;
128 }
129 ref_--;
130 if (ref_ == 0)
131 Destroy();
132 }
133
134 // Deletes this object; ref count has count reached 0.
Destroy()135 void Regexp::Destroy() {
136 if (QuickDestroy())
137 return;
138
139 // Handle recursive Destroy with explicit stack
140 // to avoid arbitrarily deep recursion on process stack [sigh].
141 down_ = NULL;
142 Regexp* stack = this;
143 while (stack != NULL) {
144 Regexp* re = stack;
145 stack = re->down_;
146 if (re->ref_ != 0)
147 LOG(DFATAL) << "Bad reference count " << re->ref_;
148 if (re->nsub_ > 0) {
149 Regexp** subs = re->sub();
150 for (int i = 0; i < re->nsub_; i++) {
151 Regexp* sub = subs[i];
152 if (sub == NULL)
153 continue;
154 if (sub->ref_ == kMaxRef)
155 sub->Decref();
156 else
157 --sub->ref_;
158 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
159 sub->down_ = stack;
160 stack = sub;
161 }
162 }
163 if (re->nsub_ > 1)
164 delete[] subs;
165 re->nsub_ = 0;
166 }
167 delete re;
168 }
169 }
170
AddRuneToString(Rune r)171 void Regexp::AddRuneToString(Rune r) {
172 DCHECK(op_ == kRegexpLiteralString);
173 if (nrunes_ == 0) {
174 // start with 8
175 runes_ = new Rune[8];
176 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
177 // double on powers of two
178 Rune *old = runes_;
179 runes_ = new Rune[nrunes_ * 2];
180 for (int i = 0; i < nrunes_; i++)
181 runes_[i] = old[i];
182 delete[] old;
183 }
184
185 runes_[nrunes_++] = r;
186 }
187
HaveMatch(int match_id,ParseFlags flags)188 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
189 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
190 re->match_id_ = match_id;
191 return re;
192 }
193
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)194 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
195 // Squash **, ++ and ??.
196 if (op == sub->op() && flags == sub->parse_flags())
197 return sub;
198
199 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
200 // op is Star/Plus/Quest, we just have to check that sub->op() is too.
201 if ((sub->op() == kRegexpStar ||
202 sub->op() == kRegexpPlus ||
203 sub->op() == kRegexpQuest) &&
204 flags == sub->parse_flags()) {
205 // If sub is Star, no need to rewrite it.
206 if (sub->op() == kRegexpStar)
207 return sub;
208
209 // Rewrite sub to Star.
210 Regexp* re = new Regexp(kRegexpStar, flags);
211 re->AllocSub(1);
212 re->sub()[0] = sub->sub()[0]->Incref();
213 sub->Decref(); // We didn't consume the reference after all.
214 return re;
215 }
216
217 Regexp* re = new Regexp(op, flags);
218 re->AllocSub(1);
219 re->sub()[0] = sub;
220 return re;
221 }
222
Plus(Regexp * sub,ParseFlags flags)223 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
224 return StarPlusOrQuest(kRegexpPlus, sub, flags);
225 }
226
Star(Regexp * sub,ParseFlags flags)227 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
228 return StarPlusOrQuest(kRegexpStar, sub, flags);
229 }
230
Quest(Regexp * sub,ParseFlags flags)231 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
232 return StarPlusOrQuest(kRegexpQuest, sub, flags);
233 }
234
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)235 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
236 ParseFlags flags, bool can_factor) {
237 if (nsub == 1)
238 return sub[0];
239
240 if (nsub == 0) {
241 if (op == kRegexpAlternate)
242 return new Regexp(kRegexpNoMatch, flags);
243 else
244 return new Regexp(kRegexpEmptyMatch, flags);
245 }
246
247 PODArray<Regexp*> subcopy;
248 if (op == kRegexpAlternate && can_factor) {
249 // Going to edit sub; make a copy so we don't step on caller.
250 subcopy = PODArray<Regexp*>(nsub);
251 memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
252 sub = subcopy.data();
253 nsub = FactorAlternation(sub, nsub, flags);
254 if (nsub == 1) {
255 Regexp* re = sub[0];
256 return re;
257 }
258 }
259
260 if (nsub > kMaxNsub) {
261 // Too many subexpressions to fit in a single Regexp.
262 // Make a two-level tree. Two levels gets us to 65535^2.
263 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
264 Regexp* re = new Regexp(op, flags);
265 re->AllocSub(nbigsub);
266 Regexp** subs = re->sub();
267 for (int i = 0; i < nbigsub - 1; i++)
268 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
269 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
270 nsub - (nbigsub-1)*kMaxNsub, flags,
271 false);
272 return re;
273 }
274
275 Regexp* re = new Regexp(op, flags);
276 re->AllocSub(nsub);
277 Regexp** subs = re->sub();
278 for (int i = 0; i < nsub; i++)
279 subs[i] = sub[i];
280 return re;
281 }
282
Concat(Regexp ** sub,int nsub,ParseFlags flags)283 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
284 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
285 }
286
Alternate(Regexp ** sub,int nsub,ParseFlags flags)287 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
288 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
289 }
290
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)291 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
292 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
293 }
294
Capture(Regexp * sub,ParseFlags flags,int cap)295 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
296 Regexp* re = new Regexp(kRegexpCapture, flags);
297 re->AllocSub(1);
298 re->sub()[0] = sub;
299 re->cap_ = cap;
300 return re;
301 }
302
Repeat(Regexp * sub,ParseFlags flags,int min,int max)303 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
304 Regexp* re = new Regexp(kRegexpRepeat, flags);
305 re->AllocSub(1);
306 re->sub()[0] = sub;
307 re->min_ = min;
308 re->max_ = max;
309 return re;
310 }
311
NewLiteral(Rune rune,ParseFlags flags)312 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
313 Regexp* re = new Regexp(kRegexpLiteral, flags);
314 re->rune_ = rune;
315 return re;
316 }
317
LiteralString(Rune * runes,int nrunes,ParseFlags flags)318 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
319 if (nrunes <= 0)
320 return new Regexp(kRegexpEmptyMatch, flags);
321 if (nrunes == 1)
322 return NewLiteral(runes[0], flags);
323 Regexp* re = new Regexp(kRegexpLiteralString, flags);
324 for (int i = 0; i < nrunes; i++)
325 re->AddRuneToString(runes[i]);
326 return re;
327 }
328
NewCharClass(CharClass * cc,ParseFlags flags)329 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
330 Regexp* re = new Regexp(kRegexpCharClass, flags);
331 re->cc_ = cc;
332 return re;
333 }
334
Swap(Regexp * that)335 void Regexp::Swap(Regexp* that) {
336 // Regexp is not trivially copyable, so we cannot freely copy it with
337 // memmove(3), but swapping objects like so is safe for our purposes.
338 char tmp[sizeof *this];
339 void* vthis = reinterpret_cast<void*>(this);
340 void* vthat = reinterpret_cast<void*>(that);
341 memmove(tmp, vthis, sizeof *this);
342 memmove(vthis, vthat, sizeof *this);
343 memmove(vthat, tmp, sizeof *this);
344 }
345
346 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)347 static bool TopEqual(Regexp* a, Regexp* b) {
348 if (a->op() != b->op())
349 return false;
350
351 switch (a->op()) {
352 case kRegexpNoMatch:
353 case kRegexpEmptyMatch:
354 case kRegexpAnyChar:
355 case kRegexpAnyByte:
356 case kRegexpBeginLine:
357 case kRegexpEndLine:
358 case kRegexpWordBoundary:
359 case kRegexpNoWordBoundary:
360 case kRegexpBeginText:
361 return true;
362
363 case kRegexpEndText:
364 // The parse flags remember whether it's \z or (?-m:$),
365 // which matters when testing against PCRE.
366 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
367
368 case kRegexpLiteral:
369 return a->rune() == b->rune() &&
370 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
371
372 case kRegexpLiteralString:
373 return a->nrunes() == b->nrunes() &&
374 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
375 memcmp(a->runes(), b->runes(),
376 a->nrunes() * sizeof a->runes()[0]) == 0;
377
378 case kRegexpAlternate:
379 case kRegexpConcat:
380 return a->nsub() == b->nsub();
381
382 case kRegexpStar:
383 case kRegexpPlus:
384 case kRegexpQuest:
385 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
386
387 case kRegexpRepeat:
388 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
389 a->min() == b->min() &&
390 a->max() == b->max();
391
392 case kRegexpCapture:
393 return a->cap() == b->cap() && a->name() == b->name();
394
395 case kRegexpHaveMatch:
396 return a->match_id() == b->match_id();
397
398 case kRegexpCharClass: {
399 CharClass* acc = a->cc();
400 CharClass* bcc = b->cc();
401 return acc->size() == bcc->size() &&
402 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
403 memcmp(acc->begin(), bcc->begin(),
404 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
405 }
406 }
407
408 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
409 return 0;
410 }
411
Equal(Regexp * a,Regexp * b)412 bool Regexp::Equal(Regexp* a, Regexp* b) {
413 if (a == NULL || b == NULL)
414 return a == b;
415
416 if (!TopEqual(a, b))
417 return false;
418
419 // Fast path:
420 // return without allocating vector if there are no subregexps.
421 switch (a->op()) {
422 case kRegexpAlternate:
423 case kRegexpConcat:
424 case kRegexpStar:
425 case kRegexpPlus:
426 case kRegexpQuest:
427 case kRegexpRepeat:
428 case kRegexpCapture:
429 break;
430
431 default:
432 return true;
433 }
434
435 // Committed to doing real work.
436 // The stack (vector) has pairs of regexps waiting to
437 // be compared. The regexps are only equal if
438 // all the pairs end up being equal.
439 std::vector<Regexp*> stk;
440
441 for (;;) {
442 // Invariant: TopEqual(a, b) == true.
443 Regexp* a2;
444 Regexp* b2;
445 switch (a->op()) {
446 default:
447 break;
448 case kRegexpAlternate:
449 case kRegexpConcat:
450 for (int i = 0; i < a->nsub(); i++) {
451 a2 = a->sub()[i];
452 b2 = b->sub()[i];
453 if (!TopEqual(a2, b2))
454 return false;
455 stk.push_back(a2);
456 stk.push_back(b2);
457 }
458 break;
459
460 case kRegexpStar:
461 case kRegexpPlus:
462 case kRegexpQuest:
463 case kRegexpRepeat:
464 case kRegexpCapture:
465 a2 = a->sub()[0];
466 b2 = b->sub()[0];
467 if (!TopEqual(a2, b2))
468 return false;
469 // Really:
470 // stk.push_back(a2);
471 // stk.push_back(b2);
472 // break;
473 // but faster to assign directly and loop.
474 a = a2;
475 b = b2;
476 continue;
477 }
478
479 size_t n = stk.size();
480 if (n == 0)
481 break;
482
483 DCHECK_GE(n, 2);
484 a = stk[n-2];
485 b = stk[n-1];
486 stk.resize(n-2);
487 }
488
489 return true;
490 }
491
492 // Keep in sync with enum RegexpStatusCode in regexp.h
493 static const char *kErrorStrings[] = {
494 "no error",
495 "unexpected error",
496 "invalid escape sequence",
497 "invalid character class",
498 "invalid character class range",
499 "missing ]",
500 "missing )",
501 "unexpected )",
502 "trailing \\",
503 "no argument for repetition operator",
504 "invalid repetition size",
505 "bad repetition operator",
506 "invalid perl operator",
507 "invalid UTF-8",
508 "invalid named capture group",
509 };
510
CodeText(enum RegexpStatusCode code)511 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
512 if (code < 0 || code >= arraysize(kErrorStrings))
513 code = kRegexpInternalError;
514 return kErrorStrings[code];
515 }
516
Text() const517 std::string RegexpStatus::Text() const {
518 if (error_arg_.empty())
519 return CodeText(code_);
520 std::string s;
521 s.append(CodeText(code_));
522 s.append(": ");
523 s.append(error_arg_.data(), error_arg_.size());
524 return s;
525 }
526
Copy(const RegexpStatus & status)527 void RegexpStatus::Copy(const RegexpStatus& status) {
528 code_ = status.code_;
529 error_arg_ = status.error_arg_;
530 }
531
532 typedef int Ignored; // Walker<void> doesn't exist
533
534 // Walker subclass to count capturing parens in regexp.
535 class NumCapturesWalker : public Regexp::Walker<Ignored> {
536 public:
NumCapturesWalker()537 NumCapturesWalker() : ncapture_(0) {}
ncapture()538 int ncapture() { return ncapture_; }
539
PreVisit(Regexp * re,Ignored ignored,bool * stop)540 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
541 if (re->op() == kRegexpCapture)
542 ncapture_++;
543 return ignored;
544 }
545
ShortVisit(Regexp * re,Ignored ignored)546 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
547 // Should never be called: we use Walk(), not WalkExponential().
548 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
549 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
550 #endif
551 return ignored;
552 }
553
554 private:
555 int ncapture_;
556
557 NumCapturesWalker(const NumCapturesWalker&) = delete;
558 NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
559 };
560
NumCaptures()561 int Regexp::NumCaptures() {
562 NumCapturesWalker w;
563 w.Walk(this, 0);
564 return w.ncapture();
565 }
566
567 // Walker class to build map of named capture groups and their indices.
568 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
569 public:
NamedCapturesWalker()570 NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()571 ~NamedCapturesWalker() { delete map_; }
572
TakeMap()573 std::map<std::string, int>* TakeMap() {
574 std::map<std::string, int>* m = map_;
575 map_ = NULL;
576 return m;
577 }
578
PreVisit(Regexp * re,Ignored ignored,bool * stop)579 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
580 if (re->op() == kRegexpCapture && re->name() != NULL) {
581 // Allocate map once we find a name.
582 if (map_ == NULL)
583 map_ = new std::map<std::string, int>;
584
585 // Record first occurrence of each name.
586 // (The rule is that if you have the same name
587 // multiple times, only the leftmost one counts.)
588 map_->insert({*re->name(), re->cap()});
589 }
590 return ignored;
591 }
592
ShortVisit(Regexp * re,Ignored ignored)593 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
594 // Should never be called: we use Walk(), not WalkExponential().
595 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
596 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
597 #endif
598 return ignored;
599 }
600
601 private:
602 std::map<std::string, int>* map_;
603
604 NamedCapturesWalker(const NamedCapturesWalker&) = delete;
605 NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
606 };
607
NamedCaptures()608 std::map<std::string, int>* Regexp::NamedCaptures() {
609 NamedCapturesWalker w;
610 w.Walk(this, 0);
611 return w.TakeMap();
612 }
613
614 // Walker class to build map from capture group indices to their names.
615 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
616 public:
CaptureNamesWalker()617 CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()618 ~CaptureNamesWalker() { delete map_; }
619
TakeMap()620 std::map<int, std::string>* TakeMap() {
621 std::map<int, std::string>* m = map_;
622 map_ = NULL;
623 return m;
624 }
625
PreVisit(Regexp * re,Ignored ignored,bool * stop)626 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
627 if (re->op() == kRegexpCapture && re->name() != NULL) {
628 // Allocate map once we find a name.
629 if (map_ == NULL)
630 map_ = new std::map<int, std::string>;
631
632 (*map_)[re->cap()] = *re->name();
633 }
634 return ignored;
635 }
636
ShortVisit(Regexp * re,Ignored ignored)637 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
638 // Should never be called: we use Walk(), not WalkExponential().
639 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
640 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
641 #endif
642 return ignored;
643 }
644
645 private:
646 std::map<int, std::string>* map_;
647
648 CaptureNamesWalker(const CaptureNamesWalker&) = delete;
649 CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
650 };
651
CaptureNames()652 std::map<int, std::string>* Regexp::CaptureNames() {
653 CaptureNamesWalker w;
654 w.Walk(this, 0);
655 return w.TakeMap();
656 }
657
ConvertRunesToBytes(bool latin1,Rune * runes,int nrunes,std::string * bytes)658 void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
659 std::string* bytes) {
660 if (latin1) {
661 bytes->resize(nrunes);
662 for (int i = 0; i < nrunes; i++)
663 (*bytes)[i] = static_cast<char>(runes[i]);
664 } else {
665 bytes->resize(nrunes * UTFmax); // worst case
666 char* p = &(*bytes)[0];
667 for (int i = 0; i < nrunes; i++)
668 p += runetochar(p, &runes[i]);
669 bytes->resize(p - &(*bytes)[0]);
670 bytes->shrink_to_fit();
671 }
672 }
673
674 // Determines whether regexp matches must be anchored
675 // with a fixed string prefix. If so, returns the prefix and
676 // the regexp that remains after the prefix. The prefix might
677 // be ASCII case-insensitive.
RequiredPrefix(std::string * prefix,bool * foldcase,Regexp ** suffix)678 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
679 Regexp** suffix) {
680 prefix->clear();
681 *foldcase = false;
682 *suffix = NULL;
683
684 // No need for a walker: the regexp must be of the form
685 // 1. some number of ^ anchors
686 // 2. a literal char or string
687 // 3. the rest
688 if (op_ != kRegexpConcat)
689 return false;
690 int i = 0;
691 while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
692 i++;
693 if (i == 0 || i >= nsub_)
694 return false;
695 Regexp* re = sub()[i];
696 if (re->op_ != kRegexpLiteral &&
697 re->op_ != kRegexpLiteralString)
698 return false;
699 i++;
700 if (i < nsub_) {
701 for (int j = i; j < nsub_; j++)
702 sub()[j]->Incref();
703 *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
704 } else {
705 *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
706 }
707
708 bool latin1 = (re->parse_flags() & Latin1) != 0;
709 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
710 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
711 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
712 *foldcase = (re->parse_flags() & FoldCase) != 0;
713 return true;
714 }
715
716 // Determines whether regexp matches must be unanchored
717 // with a fixed string prefix. If so, returns the prefix.
718 // The prefix might be ASCII case-insensitive.
RequiredPrefixForAccel(std::string * prefix,bool * foldcase)719 bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
720 prefix->clear();
721 *foldcase = false;
722
723 // No need for a walker: the regexp must either begin with or be
724 // a literal char or string. We "see through" capturing groups,
725 // but make no effort to glue multiple prefix fragments together.
726 Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
727 while (re->op_ == kRegexpCapture) {
728 re = re->sub()[0];
729 if (re->op_ == kRegexpConcat && re->nsub_ > 0)
730 re = re->sub()[0];
731 }
732 if (re->op_ != kRegexpLiteral &&
733 re->op_ != kRegexpLiteralString)
734 return false;
735
736 bool latin1 = (re->parse_flags() & Latin1) != 0;
737 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
738 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
739 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
740 *foldcase = (re->parse_flags() & FoldCase) != 0;
741 return true;
742 }
743
744 // Character class builder is a balanced binary tree (STL set)
745 // containing non-overlapping, non-abutting RuneRanges.
746 // The less-than operator used in the tree treats two
747 // ranges as equal if they overlap at all, so that
748 // lookups for a particular Rune are possible.
749
CharClassBuilder()750 CharClassBuilder::CharClassBuilder() {
751 nrunes_ = 0;
752 upper_ = 0;
753 lower_ = 0;
754 }
755
756 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)757 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
758 if (hi < lo)
759 return false;
760
761 if (lo <= 'z' && hi >= 'A') {
762 // Overlaps some alpha, maybe not all.
763 // Update bitmaps telling which ASCII letters are in the set.
764 Rune lo1 = std::max<Rune>(lo, 'A');
765 Rune hi1 = std::min<Rune>(hi, 'Z');
766 if (lo1 <= hi1)
767 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
768
769 lo1 = std::max<Rune>(lo, 'a');
770 hi1 = std::min<Rune>(hi, 'z');
771 if (lo1 <= hi1)
772 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
773 }
774
775 { // Check whether lo, hi is already in the class.
776 iterator it = ranges_.find(RuneRange(lo, lo));
777 if (it != end() && it->lo <= lo && hi <= it->hi)
778 return false;
779 }
780
781 // Look for a range abutting lo on the left.
782 // If it exists, take it out and increase our range.
783 if (lo > 0) {
784 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
785 if (it != end()) {
786 lo = it->lo;
787 if (it->hi > hi)
788 hi = it->hi;
789 nrunes_ -= it->hi - it->lo + 1;
790 ranges_.erase(it);
791 }
792 }
793
794 // Look for a range abutting hi on the right.
795 // If it exists, take it out and increase our range.
796 if (hi < Runemax) {
797 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
798 if (it != end()) {
799 hi = it->hi;
800 nrunes_ -= it->hi - it->lo + 1;
801 ranges_.erase(it);
802 }
803 }
804
805 // Look for ranges between lo and hi. Take them out.
806 // This is only safe because the set has no overlapping ranges.
807 // We've already removed any ranges abutting lo and hi, so
808 // any that overlap [lo, hi] must be contained within it.
809 for (;;) {
810 iterator it = ranges_.find(RuneRange(lo, hi));
811 if (it == end())
812 break;
813 nrunes_ -= it->hi - it->lo + 1;
814 ranges_.erase(it);
815 }
816
817 // Finally, add [lo, hi].
818 nrunes_ += hi - lo + 1;
819 ranges_.insert(RuneRange(lo, hi));
820 return true;
821 }
822
AddCharClass(CharClassBuilder * cc)823 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
824 for (iterator it = cc->begin(); it != cc->end(); ++it)
825 AddRange(it->lo, it->hi);
826 }
827
Contains(Rune r)828 bool CharClassBuilder::Contains(Rune r) {
829 return ranges_.find(RuneRange(r, r)) != end();
830 }
831
832 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()833 bool CharClassBuilder::FoldsASCII() {
834 return ((upper_ ^ lower_) & AlphaMask) == 0;
835 }
836
Copy()837 CharClassBuilder* CharClassBuilder::Copy() {
838 CharClassBuilder* cc = new CharClassBuilder;
839 for (iterator it = begin(); it != end(); ++it)
840 cc->ranges_.insert(RuneRange(it->lo, it->hi));
841 cc->upper_ = upper_;
842 cc->lower_ = lower_;
843 cc->nrunes_ = nrunes_;
844 return cc;
845 }
846
847
848
RemoveAbove(Rune r)849 void CharClassBuilder::RemoveAbove(Rune r) {
850 if (r >= Runemax)
851 return;
852
853 if (r < 'z') {
854 if (r < 'a')
855 lower_ = 0;
856 else
857 lower_ &= AlphaMask >> ('z' - r);
858 }
859
860 if (r < 'Z') {
861 if (r < 'A')
862 upper_ = 0;
863 else
864 upper_ &= AlphaMask >> ('Z' - r);
865 }
866
867 for (;;) {
868
869 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
870 if (it == end())
871 break;
872 RuneRange rr = *it;
873 ranges_.erase(it);
874 nrunes_ -= rr.hi - rr.lo + 1;
875 if (rr.lo <= r) {
876 rr.hi = r;
877 ranges_.insert(rr);
878 nrunes_ += rr.hi - rr.lo + 1;
879 }
880 }
881 }
882
Negate()883 void CharClassBuilder::Negate() {
884 // Build up negation and then copy in.
885 // Could edit ranges in place, but C++ won't let me.
886 std::vector<RuneRange> v;
887 v.reserve(ranges_.size() + 1);
888
889 // In negation, first range begins at 0, unless
890 // the current class begins at 0.
891 iterator it = begin();
892 if (it == end()) {
893 v.push_back(RuneRange(0, Runemax));
894 } else {
895 int nextlo = 0;
896 if (it->lo == 0) {
897 nextlo = it->hi + 1;
898 ++it;
899 }
900 for (; it != end(); ++it) {
901 v.push_back(RuneRange(nextlo, it->lo - 1));
902 nextlo = it->hi + 1;
903 }
904 if (nextlo <= Runemax)
905 v.push_back(RuneRange(nextlo, Runemax));
906 }
907
908 ranges_.clear();
909 for (size_t i = 0; i < v.size(); i++)
910 ranges_.insert(v[i]);
911
912 upper_ = AlphaMask & ~upper_;
913 lower_ = AlphaMask & ~lower_;
914 nrunes_ = Runemax+1 - nrunes_;
915 }
916
917 // Character class is a sorted list of ranges.
918 // The ranges are allocated in the same block as the header,
919 // necessitating a special allocator and Delete method.
920
New(size_t maxranges)921 CharClass* CharClass::New(size_t maxranges) {
922 CharClass* cc;
923 uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
924 cc = reinterpret_cast<CharClass*>(data);
925 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
926 cc->nranges_ = 0;
927 cc->folds_ascii_ = false;
928 cc->nrunes_ = 0;
929 return cc;
930 }
931
Delete()932 void CharClass::Delete() {
933 uint8_t* data = reinterpret_cast<uint8_t*>(this);
934 delete[] data;
935 }
936
Negate()937 CharClass* CharClass::Negate() {
938 CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
939 cc->folds_ascii_ = folds_ascii_;
940 cc->nrunes_ = Runemax + 1 - nrunes_;
941 int n = 0;
942 int nextlo = 0;
943 for (CharClass::iterator it = begin(); it != end(); ++it) {
944 if (it->lo == nextlo) {
945 nextlo = it->hi + 1;
946 } else {
947 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
948 nextlo = it->hi + 1;
949 }
950 }
951 if (nextlo <= Runemax)
952 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
953 cc->nranges_ = n;
954 return cc;
955 }
956
Contains(Rune r) const957 bool CharClass::Contains(Rune r) const {
958 RuneRange* rr = ranges_;
959 int n = nranges_;
960 while (n > 0) {
961 int m = n/2;
962 if (rr[m].hi < r) {
963 rr += m+1;
964 n -= m+1;
965 } else if (r < rr[m].lo) {
966 n = m;
967 } else { // rr[m].lo <= r && r <= rr[m].hi
968 return true;
969 }
970 }
971 return false;
972 }
973
GetCharClass()974 CharClass* CharClassBuilder::GetCharClass() {
975 CharClass* cc = CharClass::New(ranges_.size());
976 int n = 0;
977 for (iterator it = begin(); it != end(); ++it)
978 cc->ranges_[n++] = *it;
979 cc->nranges_ = n;
980 DCHECK_LE(n, static_cast<int>(ranges_.size()));
981 cc->nrunes_ = nrunes_;
982 cc->folds_ascii_ = FoldsASCII();
983 return cc;
984 }
985
986 } // namespace re2
987