1*ccdc9c3eSSadaf Ebrahimi // Copyright 2006 The RE2 Authors. All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi
5*ccdc9c3eSSadaf Ebrahimi // Regular expression representation.
6*ccdc9c3eSSadaf Ebrahimi // Tested by parse_test.cc
7*ccdc9c3eSSadaf Ebrahimi
8*ccdc9c3eSSadaf Ebrahimi #include "re2/regexp.h"
9*ccdc9c3eSSadaf Ebrahimi
10*ccdc9c3eSSadaf Ebrahimi #include <stddef.h>
11*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
12*ccdc9c3eSSadaf Ebrahimi #include <string.h>
13*ccdc9c3eSSadaf Ebrahimi #include <algorithm>
14*ccdc9c3eSSadaf Ebrahimi #include <map>
15*ccdc9c3eSSadaf Ebrahimi #include <mutex>
16*ccdc9c3eSSadaf Ebrahimi #include <string>
17*ccdc9c3eSSadaf Ebrahimi #include <vector>
18*ccdc9c3eSSadaf Ebrahimi
19*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
20*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h"
21*ccdc9c3eSSadaf Ebrahimi #include "util/mutex.h"
22*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
23*ccdc9c3eSSadaf Ebrahimi #include "re2/stringpiece.h"
24*ccdc9c3eSSadaf Ebrahimi #include "re2/walker-inl.h"
25*ccdc9c3eSSadaf Ebrahimi
26*ccdc9c3eSSadaf Ebrahimi namespace re2 {
27*ccdc9c3eSSadaf Ebrahimi
28*ccdc9c3eSSadaf Ebrahimi // Constructor. Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)29*ccdc9c3eSSadaf Ebrahimi Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
30*ccdc9c3eSSadaf Ebrahimi : op_(static_cast<uint8_t>(op)),
31*ccdc9c3eSSadaf Ebrahimi simple_(false),
32*ccdc9c3eSSadaf Ebrahimi parse_flags_(static_cast<uint16_t>(parse_flags)),
33*ccdc9c3eSSadaf Ebrahimi ref_(1),
34*ccdc9c3eSSadaf Ebrahimi nsub_(0),
35*ccdc9c3eSSadaf Ebrahimi down_(NULL) {
36*ccdc9c3eSSadaf Ebrahimi subone_ = NULL;
37*ccdc9c3eSSadaf Ebrahimi memset(the_union_, 0, sizeof the_union_);
38*ccdc9c3eSSadaf Ebrahimi }
39*ccdc9c3eSSadaf Ebrahimi
40*ccdc9c3eSSadaf Ebrahimi // Destructor. Assumes already cleaned up children.
41*ccdc9c3eSSadaf Ebrahimi // Private: use Decref() instead of delete to destroy Regexps.
42*ccdc9c3eSSadaf Ebrahimi // Can't call Decref on the sub-Regexps here because
43*ccdc9c3eSSadaf Ebrahimi // that could cause arbitrarily deep recursion, so
44*ccdc9c3eSSadaf Ebrahimi // required Decref() to have handled them for us.
~Regexp()45*ccdc9c3eSSadaf Ebrahimi Regexp::~Regexp() {
46*ccdc9c3eSSadaf Ebrahimi if (nsub_ > 0)
47*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "Regexp not destroyed.";
48*ccdc9c3eSSadaf Ebrahimi
49*ccdc9c3eSSadaf Ebrahimi switch (op_) {
50*ccdc9c3eSSadaf Ebrahimi default:
51*ccdc9c3eSSadaf Ebrahimi break;
52*ccdc9c3eSSadaf Ebrahimi case kRegexpCapture:
53*ccdc9c3eSSadaf Ebrahimi delete name_;
54*ccdc9c3eSSadaf Ebrahimi break;
55*ccdc9c3eSSadaf Ebrahimi case kRegexpLiteralString:
56*ccdc9c3eSSadaf Ebrahimi delete[] runes_;
57*ccdc9c3eSSadaf Ebrahimi break;
58*ccdc9c3eSSadaf Ebrahimi case kRegexpCharClass:
59*ccdc9c3eSSadaf Ebrahimi if (cc_)
60*ccdc9c3eSSadaf Ebrahimi cc_->Delete();
61*ccdc9c3eSSadaf Ebrahimi delete ccb_;
62*ccdc9c3eSSadaf Ebrahimi break;
63*ccdc9c3eSSadaf Ebrahimi }
64*ccdc9c3eSSadaf Ebrahimi }
65*ccdc9c3eSSadaf Ebrahimi
66*ccdc9c3eSSadaf Ebrahimi // If it's possible to destroy this regexp without recurring,
67*ccdc9c3eSSadaf Ebrahimi // do so and return true. Else return false.
QuickDestroy()68*ccdc9c3eSSadaf Ebrahimi bool Regexp::QuickDestroy() {
69*ccdc9c3eSSadaf Ebrahimi if (nsub_ == 0) {
70*ccdc9c3eSSadaf Ebrahimi delete this;
71*ccdc9c3eSSadaf Ebrahimi return true;
72*ccdc9c3eSSadaf Ebrahimi }
73*ccdc9c3eSSadaf Ebrahimi return false;
74*ccdc9c3eSSadaf Ebrahimi }
75*ccdc9c3eSSadaf Ebrahimi
76*ccdc9c3eSSadaf Ebrahimi // Lazily allocated.
77*ccdc9c3eSSadaf Ebrahimi static Mutex* ref_mutex;
78*ccdc9c3eSSadaf Ebrahimi static std::map<Regexp*, int>* ref_map;
79*ccdc9c3eSSadaf Ebrahimi
Ref()80*ccdc9c3eSSadaf Ebrahimi int Regexp::Ref() {
81*ccdc9c3eSSadaf Ebrahimi if (ref_ < kMaxRef)
82*ccdc9c3eSSadaf Ebrahimi return ref_;
83*ccdc9c3eSSadaf Ebrahimi
84*ccdc9c3eSSadaf Ebrahimi MutexLock l(ref_mutex);
85*ccdc9c3eSSadaf Ebrahimi return (*ref_map)[this];
86*ccdc9c3eSSadaf Ebrahimi }
87*ccdc9c3eSSadaf Ebrahimi
88*ccdc9c3eSSadaf Ebrahimi // Increments reference count, returns object as convenience.
Incref()89*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::Incref() {
90*ccdc9c3eSSadaf Ebrahimi if (ref_ >= kMaxRef-1) {
91*ccdc9c3eSSadaf Ebrahimi static std::once_flag ref_once;
92*ccdc9c3eSSadaf Ebrahimi std::call_once(ref_once, []() {
93*ccdc9c3eSSadaf Ebrahimi ref_mutex = new Mutex;
94*ccdc9c3eSSadaf Ebrahimi ref_map = new std::map<Regexp*, int>;
95*ccdc9c3eSSadaf Ebrahimi });
96*ccdc9c3eSSadaf Ebrahimi
97*ccdc9c3eSSadaf Ebrahimi // Store ref count in overflow map.
98*ccdc9c3eSSadaf Ebrahimi MutexLock l(ref_mutex);
99*ccdc9c3eSSadaf Ebrahimi if (ref_ == kMaxRef) {
100*ccdc9c3eSSadaf Ebrahimi // already overflowed
101*ccdc9c3eSSadaf Ebrahimi (*ref_map)[this]++;
102*ccdc9c3eSSadaf Ebrahimi } else {
103*ccdc9c3eSSadaf Ebrahimi // overflowing now
104*ccdc9c3eSSadaf Ebrahimi (*ref_map)[this] = kMaxRef;
105*ccdc9c3eSSadaf Ebrahimi ref_ = kMaxRef;
106*ccdc9c3eSSadaf Ebrahimi }
107*ccdc9c3eSSadaf Ebrahimi return this;
108*ccdc9c3eSSadaf Ebrahimi }
109*ccdc9c3eSSadaf Ebrahimi
110*ccdc9c3eSSadaf Ebrahimi ref_++;
111*ccdc9c3eSSadaf Ebrahimi return this;
112*ccdc9c3eSSadaf Ebrahimi }
113*ccdc9c3eSSadaf Ebrahimi
114*ccdc9c3eSSadaf Ebrahimi // Decrements reference count and deletes this object if count reaches 0.
Decref()115*ccdc9c3eSSadaf Ebrahimi void Regexp::Decref() {
116*ccdc9c3eSSadaf Ebrahimi if (ref_ == kMaxRef) {
117*ccdc9c3eSSadaf Ebrahimi // Ref count is stored in overflow map.
118*ccdc9c3eSSadaf Ebrahimi MutexLock l(ref_mutex);
119*ccdc9c3eSSadaf Ebrahimi int r = (*ref_map)[this] - 1;
120*ccdc9c3eSSadaf Ebrahimi if (r < kMaxRef) {
121*ccdc9c3eSSadaf Ebrahimi ref_ = static_cast<uint16_t>(r);
122*ccdc9c3eSSadaf Ebrahimi ref_map->erase(this);
123*ccdc9c3eSSadaf Ebrahimi } else {
124*ccdc9c3eSSadaf Ebrahimi (*ref_map)[this] = r;
125*ccdc9c3eSSadaf Ebrahimi }
126*ccdc9c3eSSadaf Ebrahimi return;
127*ccdc9c3eSSadaf Ebrahimi }
128*ccdc9c3eSSadaf Ebrahimi ref_--;
129*ccdc9c3eSSadaf Ebrahimi if (ref_ == 0)
130*ccdc9c3eSSadaf Ebrahimi Destroy();
131*ccdc9c3eSSadaf Ebrahimi }
132*ccdc9c3eSSadaf Ebrahimi
133*ccdc9c3eSSadaf Ebrahimi // Deletes this object; ref count has count reached 0.
Destroy()134*ccdc9c3eSSadaf Ebrahimi void Regexp::Destroy() {
135*ccdc9c3eSSadaf Ebrahimi if (QuickDestroy())
136*ccdc9c3eSSadaf Ebrahimi return;
137*ccdc9c3eSSadaf Ebrahimi
138*ccdc9c3eSSadaf Ebrahimi // Handle recursive Destroy with explicit stack
139*ccdc9c3eSSadaf Ebrahimi // to avoid arbitrarily deep recursion on process stack [sigh].
140*ccdc9c3eSSadaf Ebrahimi down_ = NULL;
141*ccdc9c3eSSadaf Ebrahimi Regexp* stack = this;
142*ccdc9c3eSSadaf Ebrahimi while (stack != NULL) {
143*ccdc9c3eSSadaf Ebrahimi Regexp* re = stack;
144*ccdc9c3eSSadaf Ebrahimi stack = re->down_;
145*ccdc9c3eSSadaf Ebrahimi if (re->ref_ != 0)
146*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "Bad reference count " << re->ref_;
147*ccdc9c3eSSadaf Ebrahimi if (re->nsub_ > 0) {
148*ccdc9c3eSSadaf Ebrahimi Regexp** subs = re->sub();
149*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < re->nsub_; i++) {
150*ccdc9c3eSSadaf Ebrahimi Regexp* sub = subs[i];
151*ccdc9c3eSSadaf Ebrahimi if (sub == NULL)
152*ccdc9c3eSSadaf Ebrahimi continue;
153*ccdc9c3eSSadaf Ebrahimi if (sub->ref_ == kMaxRef)
154*ccdc9c3eSSadaf Ebrahimi sub->Decref();
155*ccdc9c3eSSadaf Ebrahimi else
156*ccdc9c3eSSadaf Ebrahimi --sub->ref_;
157*ccdc9c3eSSadaf Ebrahimi if (sub->ref_ == 0 && !sub->QuickDestroy()) {
158*ccdc9c3eSSadaf Ebrahimi sub->down_ = stack;
159*ccdc9c3eSSadaf Ebrahimi stack = sub;
160*ccdc9c3eSSadaf Ebrahimi }
161*ccdc9c3eSSadaf Ebrahimi }
162*ccdc9c3eSSadaf Ebrahimi if (re->nsub_ > 1)
163*ccdc9c3eSSadaf Ebrahimi delete[] subs;
164*ccdc9c3eSSadaf Ebrahimi re->nsub_ = 0;
165*ccdc9c3eSSadaf Ebrahimi }
166*ccdc9c3eSSadaf Ebrahimi delete re;
167*ccdc9c3eSSadaf Ebrahimi }
168*ccdc9c3eSSadaf Ebrahimi }
169*ccdc9c3eSSadaf Ebrahimi
AddRuneToString(Rune r)170*ccdc9c3eSSadaf Ebrahimi void Regexp::AddRuneToString(Rune r) {
171*ccdc9c3eSSadaf Ebrahimi DCHECK(op_ == kRegexpLiteralString);
172*ccdc9c3eSSadaf Ebrahimi if (nrunes_ == 0) {
173*ccdc9c3eSSadaf Ebrahimi // start with 8
174*ccdc9c3eSSadaf Ebrahimi runes_ = new Rune[8];
175*ccdc9c3eSSadaf Ebrahimi } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
176*ccdc9c3eSSadaf Ebrahimi // double on powers of two
177*ccdc9c3eSSadaf Ebrahimi Rune *old = runes_;
178*ccdc9c3eSSadaf Ebrahimi runes_ = new Rune[nrunes_ * 2];
179*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < nrunes_; i++)
180*ccdc9c3eSSadaf Ebrahimi runes_[i] = old[i];
181*ccdc9c3eSSadaf Ebrahimi delete[] old;
182*ccdc9c3eSSadaf Ebrahimi }
183*ccdc9c3eSSadaf Ebrahimi
184*ccdc9c3eSSadaf Ebrahimi runes_[nrunes_++] = r;
185*ccdc9c3eSSadaf Ebrahimi }
186*ccdc9c3eSSadaf Ebrahimi
HaveMatch(int match_id,ParseFlags flags)187*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
188*ccdc9c3eSSadaf Ebrahimi Regexp* re = new Regexp(kRegexpHaveMatch, flags);
189*ccdc9c3eSSadaf Ebrahimi re->match_id_ = match_id;
190*ccdc9c3eSSadaf Ebrahimi return re;
191*ccdc9c3eSSadaf Ebrahimi }
192*ccdc9c3eSSadaf Ebrahimi
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)193*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
194*ccdc9c3eSSadaf Ebrahimi // Squash **, ++ and ??.
195*ccdc9c3eSSadaf Ebrahimi if (op == sub->op() && flags == sub->parse_flags())
196*ccdc9c3eSSadaf Ebrahimi return sub;
197*ccdc9c3eSSadaf Ebrahimi
198*ccdc9c3eSSadaf Ebrahimi // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
199*ccdc9c3eSSadaf Ebrahimi // op is Star/Plus/Quest, we just have to check that sub->op() is too.
200*ccdc9c3eSSadaf Ebrahimi if ((sub->op() == kRegexpStar ||
201*ccdc9c3eSSadaf Ebrahimi sub->op() == kRegexpPlus ||
202*ccdc9c3eSSadaf Ebrahimi sub->op() == kRegexpQuest) &&
203*ccdc9c3eSSadaf Ebrahimi flags == sub->parse_flags()) {
204*ccdc9c3eSSadaf Ebrahimi // If sub is Star, no need to rewrite it.
205*ccdc9c3eSSadaf Ebrahimi if (sub->op() == kRegexpStar)
206*ccdc9c3eSSadaf Ebrahimi return sub;
207*ccdc9c3eSSadaf Ebrahimi
208*ccdc9c3eSSadaf Ebrahimi // Rewrite sub to Star.
209*ccdc9c3eSSadaf Ebrahimi Regexp* re = new Regexp(kRegexpStar, flags);
210*ccdc9c3eSSadaf Ebrahimi re->AllocSub(1);
211*ccdc9c3eSSadaf Ebrahimi re->sub()[0] = sub->sub()[0]->Incref();
212*ccdc9c3eSSadaf Ebrahimi sub->Decref(); // We didn't consume the reference after all.
213*ccdc9c3eSSadaf Ebrahimi return re;
214*ccdc9c3eSSadaf Ebrahimi }
215*ccdc9c3eSSadaf Ebrahimi
216*ccdc9c3eSSadaf Ebrahimi Regexp* re = new Regexp(op, flags);
217*ccdc9c3eSSadaf Ebrahimi re->AllocSub(1);
218*ccdc9c3eSSadaf Ebrahimi re->sub()[0] = sub;
219*ccdc9c3eSSadaf Ebrahimi return re;
220*ccdc9c3eSSadaf Ebrahimi }
221*ccdc9c3eSSadaf Ebrahimi
Plus(Regexp * sub,ParseFlags flags)222*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
223*ccdc9c3eSSadaf Ebrahimi return StarPlusOrQuest(kRegexpPlus, sub, flags);
224*ccdc9c3eSSadaf Ebrahimi }
225*ccdc9c3eSSadaf Ebrahimi
Star(Regexp * sub,ParseFlags flags)226*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
227*ccdc9c3eSSadaf Ebrahimi return StarPlusOrQuest(kRegexpStar, sub, flags);
228*ccdc9c3eSSadaf Ebrahimi }
229*ccdc9c3eSSadaf Ebrahimi
Quest(Regexp * sub,ParseFlags flags)230*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
231*ccdc9c3eSSadaf Ebrahimi return StarPlusOrQuest(kRegexpQuest, sub, flags);
232*ccdc9c3eSSadaf Ebrahimi }
233*ccdc9c3eSSadaf Ebrahimi
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)234*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
235*ccdc9c3eSSadaf Ebrahimi ParseFlags flags, bool can_factor) {
236*ccdc9c3eSSadaf Ebrahimi if (nsub == 1)
237*ccdc9c3eSSadaf Ebrahimi return sub[0];
238*ccdc9c3eSSadaf Ebrahimi
239*ccdc9c3eSSadaf Ebrahimi if (nsub == 0) {
240*ccdc9c3eSSadaf Ebrahimi if (op == kRegexpAlternate)
241*ccdc9c3eSSadaf Ebrahimi return new Regexp(kRegexpNoMatch, flags);
242*ccdc9c3eSSadaf Ebrahimi else
243*ccdc9c3eSSadaf Ebrahimi return new Regexp(kRegexpEmptyMatch, flags);
244*ccdc9c3eSSadaf Ebrahimi }
245*ccdc9c3eSSadaf Ebrahimi
246*ccdc9c3eSSadaf Ebrahimi Regexp** subcopy = NULL;
247*ccdc9c3eSSadaf Ebrahimi if (op == kRegexpAlternate && can_factor) {
248*ccdc9c3eSSadaf Ebrahimi // Going to edit sub; make a copy so we don't step on caller.
249*ccdc9c3eSSadaf Ebrahimi subcopy = new Regexp*[nsub];
250*ccdc9c3eSSadaf Ebrahimi memmove(subcopy, sub, nsub * sizeof sub[0]);
251*ccdc9c3eSSadaf Ebrahimi sub = subcopy;
252*ccdc9c3eSSadaf Ebrahimi nsub = FactorAlternation(sub, nsub, flags);
253*ccdc9c3eSSadaf Ebrahimi if (nsub == 1) {
254*ccdc9c3eSSadaf Ebrahimi Regexp* re = sub[0];
255*ccdc9c3eSSadaf Ebrahimi delete[] subcopy;
256*ccdc9c3eSSadaf Ebrahimi return re;
257*ccdc9c3eSSadaf Ebrahimi }
258*ccdc9c3eSSadaf Ebrahimi }
259*ccdc9c3eSSadaf Ebrahimi
260*ccdc9c3eSSadaf Ebrahimi if (nsub > kMaxNsub) {
261*ccdc9c3eSSadaf Ebrahimi // Too many subexpressions to fit in a single Regexp.
262*ccdc9c3eSSadaf Ebrahimi // Make a two-level tree. Two levels gets us to 65535^2.
263*ccdc9c3eSSadaf Ebrahimi int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
264*ccdc9c3eSSadaf Ebrahimi Regexp* re = new Regexp(op, flags);
265*ccdc9c3eSSadaf Ebrahimi re->AllocSub(nbigsub);
266*ccdc9c3eSSadaf Ebrahimi Regexp** subs = re->sub();
267*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < nbigsub - 1; i++)
268*ccdc9c3eSSadaf Ebrahimi subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
269*ccdc9c3eSSadaf Ebrahimi subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
270*ccdc9c3eSSadaf Ebrahimi nsub - (nbigsub-1)*kMaxNsub, flags,
271*ccdc9c3eSSadaf Ebrahimi false);
272*ccdc9c3eSSadaf Ebrahimi delete[] subcopy;
273*ccdc9c3eSSadaf Ebrahimi return re;
274*ccdc9c3eSSadaf Ebrahimi }
275*ccdc9c3eSSadaf Ebrahimi
276*ccdc9c3eSSadaf Ebrahimi Regexp* re = new Regexp(op, flags);
277*ccdc9c3eSSadaf Ebrahimi re->AllocSub(nsub);
278*ccdc9c3eSSadaf Ebrahimi Regexp** subs = re->sub();
279*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < nsub; i++)
280*ccdc9c3eSSadaf Ebrahimi subs[i] = sub[i];
281*ccdc9c3eSSadaf Ebrahimi
282*ccdc9c3eSSadaf Ebrahimi delete[] subcopy;
283*ccdc9c3eSSadaf Ebrahimi return re;
284*ccdc9c3eSSadaf Ebrahimi }
285*ccdc9c3eSSadaf Ebrahimi
Concat(Regexp ** sub,int nsub,ParseFlags flags)286*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
287*ccdc9c3eSSadaf Ebrahimi return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
288*ccdc9c3eSSadaf Ebrahimi }
289*ccdc9c3eSSadaf Ebrahimi
Alternate(Regexp ** sub,int nsub,ParseFlags flags)290*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
291*ccdc9c3eSSadaf Ebrahimi return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
292*ccdc9c3eSSadaf Ebrahimi }
293*ccdc9c3eSSadaf Ebrahimi
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)294*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
295*ccdc9c3eSSadaf Ebrahimi return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
296*ccdc9c3eSSadaf Ebrahimi }
297*ccdc9c3eSSadaf Ebrahimi
Capture(Regexp * sub,ParseFlags flags,int cap)298*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
299*ccdc9c3eSSadaf Ebrahimi Regexp* re = new Regexp(kRegexpCapture, flags);
300*ccdc9c3eSSadaf Ebrahimi re->AllocSub(1);
301*ccdc9c3eSSadaf Ebrahimi re->sub()[0] = sub;
302*ccdc9c3eSSadaf Ebrahimi re->cap_ = cap;
303*ccdc9c3eSSadaf Ebrahimi return re;
304*ccdc9c3eSSadaf Ebrahimi }
305*ccdc9c3eSSadaf Ebrahimi
Repeat(Regexp * sub,ParseFlags flags,int min,int max)306*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
307*ccdc9c3eSSadaf Ebrahimi Regexp* re = new Regexp(kRegexpRepeat, flags);
308*ccdc9c3eSSadaf Ebrahimi re->AllocSub(1);
309*ccdc9c3eSSadaf Ebrahimi re->sub()[0] = sub;
310*ccdc9c3eSSadaf Ebrahimi re->min_ = min;
311*ccdc9c3eSSadaf Ebrahimi re->max_ = max;
312*ccdc9c3eSSadaf Ebrahimi return re;
313*ccdc9c3eSSadaf Ebrahimi }
314*ccdc9c3eSSadaf Ebrahimi
NewLiteral(Rune rune,ParseFlags flags)315*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
316*ccdc9c3eSSadaf Ebrahimi Regexp* re = new Regexp(kRegexpLiteral, flags);
317*ccdc9c3eSSadaf Ebrahimi re->rune_ = rune;
318*ccdc9c3eSSadaf Ebrahimi return re;
319*ccdc9c3eSSadaf Ebrahimi }
320*ccdc9c3eSSadaf Ebrahimi
LiteralString(Rune * runes,int nrunes,ParseFlags flags)321*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
322*ccdc9c3eSSadaf Ebrahimi if (nrunes <= 0)
323*ccdc9c3eSSadaf Ebrahimi return new Regexp(kRegexpEmptyMatch, flags);
324*ccdc9c3eSSadaf Ebrahimi if (nrunes == 1)
325*ccdc9c3eSSadaf Ebrahimi return NewLiteral(runes[0], flags);
326*ccdc9c3eSSadaf Ebrahimi Regexp* re = new Regexp(kRegexpLiteralString, flags);
327*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < nrunes; i++)
328*ccdc9c3eSSadaf Ebrahimi re->AddRuneToString(runes[i]);
329*ccdc9c3eSSadaf Ebrahimi return re;
330*ccdc9c3eSSadaf Ebrahimi }
331*ccdc9c3eSSadaf Ebrahimi
NewCharClass(CharClass * cc,ParseFlags flags)332*ccdc9c3eSSadaf Ebrahimi Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
333*ccdc9c3eSSadaf Ebrahimi Regexp* re = new Regexp(kRegexpCharClass, flags);
334*ccdc9c3eSSadaf Ebrahimi re->cc_ = cc;
335*ccdc9c3eSSadaf Ebrahimi return re;
336*ccdc9c3eSSadaf Ebrahimi }
337*ccdc9c3eSSadaf Ebrahimi
Swap(Regexp * that)338*ccdc9c3eSSadaf Ebrahimi void Regexp::Swap(Regexp* that) {
339*ccdc9c3eSSadaf Ebrahimi // Regexp is not trivially copyable, so we cannot freely copy it with
340*ccdc9c3eSSadaf Ebrahimi // memmove(3), but swapping objects like so is safe for our purposes.
341*ccdc9c3eSSadaf Ebrahimi char tmp[sizeof *this];
342*ccdc9c3eSSadaf Ebrahimi void* vthis = reinterpret_cast<void*>(this);
343*ccdc9c3eSSadaf Ebrahimi void* vthat = reinterpret_cast<void*>(that);
344*ccdc9c3eSSadaf Ebrahimi memmove(tmp, vthis, sizeof *this);
345*ccdc9c3eSSadaf Ebrahimi memmove(vthis, vthat, sizeof *this);
346*ccdc9c3eSSadaf Ebrahimi memmove(vthat, tmp, sizeof *this);
347*ccdc9c3eSSadaf Ebrahimi }
348*ccdc9c3eSSadaf Ebrahimi
349*ccdc9c3eSSadaf Ebrahimi // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)350*ccdc9c3eSSadaf Ebrahimi static bool TopEqual(Regexp* a, Regexp* b) {
351*ccdc9c3eSSadaf Ebrahimi if (a->op() != b->op())
352*ccdc9c3eSSadaf Ebrahimi return false;
353*ccdc9c3eSSadaf Ebrahimi
354*ccdc9c3eSSadaf Ebrahimi switch (a->op()) {
355*ccdc9c3eSSadaf Ebrahimi case kRegexpNoMatch:
356*ccdc9c3eSSadaf Ebrahimi case kRegexpEmptyMatch:
357*ccdc9c3eSSadaf Ebrahimi case kRegexpAnyChar:
358*ccdc9c3eSSadaf Ebrahimi case kRegexpAnyByte:
359*ccdc9c3eSSadaf Ebrahimi case kRegexpBeginLine:
360*ccdc9c3eSSadaf Ebrahimi case kRegexpEndLine:
361*ccdc9c3eSSadaf Ebrahimi case kRegexpWordBoundary:
362*ccdc9c3eSSadaf Ebrahimi case kRegexpNoWordBoundary:
363*ccdc9c3eSSadaf Ebrahimi case kRegexpBeginText:
364*ccdc9c3eSSadaf Ebrahimi return true;
365*ccdc9c3eSSadaf Ebrahimi
366*ccdc9c3eSSadaf Ebrahimi case kRegexpEndText:
367*ccdc9c3eSSadaf Ebrahimi // The parse flags remember whether it's \z or (?-m:$),
368*ccdc9c3eSSadaf Ebrahimi // which matters when testing against PCRE.
369*ccdc9c3eSSadaf Ebrahimi return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
370*ccdc9c3eSSadaf Ebrahimi
371*ccdc9c3eSSadaf Ebrahimi case kRegexpLiteral:
372*ccdc9c3eSSadaf Ebrahimi return a->rune() == b->rune() &&
373*ccdc9c3eSSadaf Ebrahimi ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
374*ccdc9c3eSSadaf Ebrahimi
375*ccdc9c3eSSadaf Ebrahimi case kRegexpLiteralString:
376*ccdc9c3eSSadaf Ebrahimi return a->nrunes() == b->nrunes() &&
377*ccdc9c3eSSadaf Ebrahimi ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
378*ccdc9c3eSSadaf Ebrahimi memcmp(a->runes(), b->runes(),
379*ccdc9c3eSSadaf Ebrahimi a->nrunes() * sizeof a->runes()[0]) == 0;
380*ccdc9c3eSSadaf Ebrahimi
381*ccdc9c3eSSadaf Ebrahimi case kRegexpAlternate:
382*ccdc9c3eSSadaf Ebrahimi case kRegexpConcat:
383*ccdc9c3eSSadaf Ebrahimi return a->nsub() == b->nsub();
384*ccdc9c3eSSadaf Ebrahimi
385*ccdc9c3eSSadaf Ebrahimi case kRegexpStar:
386*ccdc9c3eSSadaf Ebrahimi case kRegexpPlus:
387*ccdc9c3eSSadaf Ebrahimi case kRegexpQuest:
388*ccdc9c3eSSadaf Ebrahimi return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
389*ccdc9c3eSSadaf Ebrahimi
390*ccdc9c3eSSadaf Ebrahimi case kRegexpRepeat:
391*ccdc9c3eSSadaf Ebrahimi return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
392*ccdc9c3eSSadaf Ebrahimi a->min() == b->min() &&
393*ccdc9c3eSSadaf Ebrahimi a->max() == b->max();
394*ccdc9c3eSSadaf Ebrahimi
395*ccdc9c3eSSadaf Ebrahimi case kRegexpCapture:
396*ccdc9c3eSSadaf Ebrahimi return a->cap() == b->cap() && a->name() == b->name();
397*ccdc9c3eSSadaf Ebrahimi
398*ccdc9c3eSSadaf Ebrahimi case kRegexpHaveMatch:
399*ccdc9c3eSSadaf Ebrahimi return a->match_id() == b->match_id();
400*ccdc9c3eSSadaf Ebrahimi
401*ccdc9c3eSSadaf Ebrahimi case kRegexpCharClass: {
402*ccdc9c3eSSadaf Ebrahimi CharClass* acc = a->cc();
403*ccdc9c3eSSadaf Ebrahimi CharClass* bcc = b->cc();
404*ccdc9c3eSSadaf Ebrahimi return acc->size() == bcc->size() &&
405*ccdc9c3eSSadaf Ebrahimi acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
406*ccdc9c3eSSadaf Ebrahimi memcmp(acc->begin(), bcc->begin(),
407*ccdc9c3eSSadaf Ebrahimi (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
408*ccdc9c3eSSadaf Ebrahimi }
409*ccdc9c3eSSadaf Ebrahimi }
410*ccdc9c3eSSadaf Ebrahimi
411*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
412*ccdc9c3eSSadaf Ebrahimi return 0;
413*ccdc9c3eSSadaf Ebrahimi }
414*ccdc9c3eSSadaf Ebrahimi
Equal(Regexp * a,Regexp * b)415*ccdc9c3eSSadaf Ebrahimi bool Regexp::Equal(Regexp* a, Regexp* b) {
416*ccdc9c3eSSadaf Ebrahimi if (a == NULL || b == NULL)
417*ccdc9c3eSSadaf Ebrahimi return a == b;
418*ccdc9c3eSSadaf Ebrahimi
419*ccdc9c3eSSadaf Ebrahimi if (!TopEqual(a, b))
420*ccdc9c3eSSadaf Ebrahimi return false;
421*ccdc9c3eSSadaf Ebrahimi
422*ccdc9c3eSSadaf Ebrahimi // Fast path:
423*ccdc9c3eSSadaf Ebrahimi // return without allocating vector if there are no subregexps.
424*ccdc9c3eSSadaf Ebrahimi switch (a->op()) {
425*ccdc9c3eSSadaf Ebrahimi case kRegexpAlternate:
426*ccdc9c3eSSadaf Ebrahimi case kRegexpConcat:
427*ccdc9c3eSSadaf Ebrahimi case kRegexpStar:
428*ccdc9c3eSSadaf Ebrahimi case kRegexpPlus:
429*ccdc9c3eSSadaf Ebrahimi case kRegexpQuest:
430*ccdc9c3eSSadaf Ebrahimi case kRegexpRepeat:
431*ccdc9c3eSSadaf Ebrahimi case kRegexpCapture:
432*ccdc9c3eSSadaf Ebrahimi break;
433*ccdc9c3eSSadaf Ebrahimi
434*ccdc9c3eSSadaf Ebrahimi default:
435*ccdc9c3eSSadaf Ebrahimi return true;
436*ccdc9c3eSSadaf Ebrahimi }
437*ccdc9c3eSSadaf Ebrahimi
438*ccdc9c3eSSadaf Ebrahimi // Committed to doing real work.
439*ccdc9c3eSSadaf Ebrahimi // The stack (vector) has pairs of regexps waiting to
440*ccdc9c3eSSadaf Ebrahimi // be compared. The regexps are only equal if
441*ccdc9c3eSSadaf Ebrahimi // all the pairs end up being equal.
442*ccdc9c3eSSadaf Ebrahimi std::vector<Regexp*> stk;
443*ccdc9c3eSSadaf Ebrahimi
444*ccdc9c3eSSadaf Ebrahimi for (;;) {
445*ccdc9c3eSSadaf Ebrahimi // Invariant: TopEqual(a, b) == true.
446*ccdc9c3eSSadaf Ebrahimi Regexp* a2;
447*ccdc9c3eSSadaf Ebrahimi Regexp* b2;
448*ccdc9c3eSSadaf Ebrahimi switch (a->op()) {
449*ccdc9c3eSSadaf Ebrahimi default:
450*ccdc9c3eSSadaf Ebrahimi break;
451*ccdc9c3eSSadaf Ebrahimi case kRegexpAlternate:
452*ccdc9c3eSSadaf Ebrahimi case kRegexpConcat:
453*ccdc9c3eSSadaf Ebrahimi for (int i = 0; i < a->nsub(); i++) {
454*ccdc9c3eSSadaf Ebrahimi a2 = a->sub()[i];
455*ccdc9c3eSSadaf Ebrahimi b2 = b->sub()[i];
456*ccdc9c3eSSadaf Ebrahimi if (!TopEqual(a2, b2))
457*ccdc9c3eSSadaf Ebrahimi return false;
458*ccdc9c3eSSadaf Ebrahimi stk.push_back(a2);
459*ccdc9c3eSSadaf Ebrahimi stk.push_back(b2);
460*ccdc9c3eSSadaf Ebrahimi }
461*ccdc9c3eSSadaf Ebrahimi break;
462*ccdc9c3eSSadaf Ebrahimi
463*ccdc9c3eSSadaf Ebrahimi case kRegexpStar:
464*ccdc9c3eSSadaf Ebrahimi case kRegexpPlus:
465*ccdc9c3eSSadaf Ebrahimi case kRegexpQuest:
466*ccdc9c3eSSadaf Ebrahimi case kRegexpRepeat:
467*ccdc9c3eSSadaf Ebrahimi case kRegexpCapture:
468*ccdc9c3eSSadaf Ebrahimi a2 = a->sub()[0];
469*ccdc9c3eSSadaf Ebrahimi b2 = b->sub()[0];
470*ccdc9c3eSSadaf Ebrahimi if (!TopEqual(a2, b2))
471*ccdc9c3eSSadaf Ebrahimi return false;
472*ccdc9c3eSSadaf Ebrahimi // Really:
473*ccdc9c3eSSadaf Ebrahimi // stk.push_back(a2);
474*ccdc9c3eSSadaf Ebrahimi // stk.push_back(b2);
475*ccdc9c3eSSadaf Ebrahimi // break;
476*ccdc9c3eSSadaf Ebrahimi // but faster to assign directly and loop.
477*ccdc9c3eSSadaf Ebrahimi a = a2;
478*ccdc9c3eSSadaf Ebrahimi b = b2;
479*ccdc9c3eSSadaf Ebrahimi continue;
480*ccdc9c3eSSadaf Ebrahimi }
481*ccdc9c3eSSadaf Ebrahimi
482*ccdc9c3eSSadaf Ebrahimi size_t n = stk.size();
483*ccdc9c3eSSadaf Ebrahimi if (n == 0)
484*ccdc9c3eSSadaf Ebrahimi break;
485*ccdc9c3eSSadaf Ebrahimi
486*ccdc9c3eSSadaf Ebrahimi DCHECK_GE(n, 2);
487*ccdc9c3eSSadaf Ebrahimi a = stk[n-2];
488*ccdc9c3eSSadaf Ebrahimi b = stk[n-1];
489*ccdc9c3eSSadaf Ebrahimi stk.resize(n-2);
490*ccdc9c3eSSadaf Ebrahimi }
491*ccdc9c3eSSadaf Ebrahimi
492*ccdc9c3eSSadaf Ebrahimi return true;
493*ccdc9c3eSSadaf Ebrahimi }
494*ccdc9c3eSSadaf Ebrahimi
495*ccdc9c3eSSadaf Ebrahimi // Keep in sync with enum RegexpStatusCode in regexp.h
496*ccdc9c3eSSadaf Ebrahimi static const char *kErrorStrings[] = {
497*ccdc9c3eSSadaf Ebrahimi "no error",
498*ccdc9c3eSSadaf Ebrahimi "unexpected error",
499*ccdc9c3eSSadaf Ebrahimi "invalid escape sequence",
500*ccdc9c3eSSadaf Ebrahimi "invalid character class",
501*ccdc9c3eSSadaf Ebrahimi "invalid character class range",
502*ccdc9c3eSSadaf Ebrahimi "missing ]",
503*ccdc9c3eSSadaf Ebrahimi "missing )",
504*ccdc9c3eSSadaf Ebrahimi "trailing \\",
505*ccdc9c3eSSadaf Ebrahimi "no argument for repetition operator",
506*ccdc9c3eSSadaf Ebrahimi "invalid repetition size",
507*ccdc9c3eSSadaf Ebrahimi "bad repetition operator",
508*ccdc9c3eSSadaf Ebrahimi "invalid perl operator",
509*ccdc9c3eSSadaf Ebrahimi "invalid UTF-8",
510*ccdc9c3eSSadaf Ebrahimi "invalid named capture group",
511*ccdc9c3eSSadaf Ebrahimi };
512*ccdc9c3eSSadaf Ebrahimi
CodeText(enum RegexpStatusCode code)513*ccdc9c3eSSadaf Ebrahimi string RegexpStatus::CodeText(enum RegexpStatusCode code) {
514*ccdc9c3eSSadaf Ebrahimi if (code < 0 || code >= arraysize(kErrorStrings))
515*ccdc9c3eSSadaf Ebrahimi code = kRegexpInternalError;
516*ccdc9c3eSSadaf Ebrahimi return kErrorStrings[code];
517*ccdc9c3eSSadaf Ebrahimi }
518*ccdc9c3eSSadaf Ebrahimi
Text() const519*ccdc9c3eSSadaf Ebrahimi string RegexpStatus::Text() const {
520*ccdc9c3eSSadaf Ebrahimi if (error_arg_.empty())
521*ccdc9c3eSSadaf Ebrahimi return CodeText(code_);
522*ccdc9c3eSSadaf Ebrahimi string s;
523*ccdc9c3eSSadaf Ebrahimi s.append(CodeText(code_));
524*ccdc9c3eSSadaf Ebrahimi s.append(": ");
525*ccdc9c3eSSadaf Ebrahimi s.append(error_arg_.data(), error_arg_.size());
526*ccdc9c3eSSadaf Ebrahimi return s;
527*ccdc9c3eSSadaf Ebrahimi }
528*ccdc9c3eSSadaf Ebrahimi
Copy(const RegexpStatus & status)529*ccdc9c3eSSadaf Ebrahimi void RegexpStatus::Copy(const RegexpStatus& status) {
530*ccdc9c3eSSadaf Ebrahimi code_ = status.code_;
531*ccdc9c3eSSadaf Ebrahimi error_arg_ = status.error_arg_;
532*ccdc9c3eSSadaf Ebrahimi }
533*ccdc9c3eSSadaf Ebrahimi
534*ccdc9c3eSSadaf Ebrahimi typedef int Ignored; // Walker<void> doesn't exist
535*ccdc9c3eSSadaf Ebrahimi
536*ccdc9c3eSSadaf Ebrahimi // Walker subclass to count capturing parens in regexp.
537*ccdc9c3eSSadaf Ebrahimi class NumCapturesWalker : public Regexp::Walker<Ignored> {
538*ccdc9c3eSSadaf Ebrahimi public:
NumCapturesWalker()539*ccdc9c3eSSadaf Ebrahimi NumCapturesWalker() : ncapture_(0) {}
ncapture()540*ccdc9c3eSSadaf Ebrahimi int ncapture() { return ncapture_; }
541*ccdc9c3eSSadaf Ebrahimi
PreVisit(Regexp * re,Ignored ignored,bool * stop)542*ccdc9c3eSSadaf Ebrahimi virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
543*ccdc9c3eSSadaf Ebrahimi if (re->op() == kRegexpCapture)
544*ccdc9c3eSSadaf Ebrahimi ncapture_++;
545*ccdc9c3eSSadaf Ebrahimi return ignored;
546*ccdc9c3eSSadaf Ebrahimi }
ShortVisit(Regexp * re,Ignored ignored)547*ccdc9c3eSSadaf Ebrahimi virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
548*ccdc9c3eSSadaf Ebrahimi // Should never be called: we use Walk not WalkExponential.
549*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
550*ccdc9c3eSSadaf Ebrahimi return ignored;
551*ccdc9c3eSSadaf Ebrahimi }
552*ccdc9c3eSSadaf Ebrahimi
553*ccdc9c3eSSadaf Ebrahimi private:
554*ccdc9c3eSSadaf Ebrahimi int ncapture_;
555*ccdc9c3eSSadaf Ebrahimi
556*ccdc9c3eSSadaf Ebrahimi NumCapturesWalker(const NumCapturesWalker&) = delete;
557*ccdc9c3eSSadaf Ebrahimi NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
558*ccdc9c3eSSadaf Ebrahimi };
559*ccdc9c3eSSadaf Ebrahimi
NumCaptures()560*ccdc9c3eSSadaf Ebrahimi int Regexp::NumCaptures() {
561*ccdc9c3eSSadaf Ebrahimi NumCapturesWalker w;
562*ccdc9c3eSSadaf Ebrahimi w.Walk(this, 0);
563*ccdc9c3eSSadaf Ebrahimi return w.ncapture();
564*ccdc9c3eSSadaf Ebrahimi }
565*ccdc9c3eSSadaf Ebrahimi
566*ccdc9c3eSSadaf Ebrahimi // Walker class to build map of named capture groups and their indices.
567*ccdc9c3eSSadaf Ebrahimi class NamedCapturesWalker : public Regexp::Walker<Ignored> {
568*ccdc9c3eSSadaf Ebrahimi public:
NamedCapturesWalker()569*ccdc9c3eSSadaf Ebrahimi NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()570*ccdc9c3eSSadaf Ebrahimi ~NamedCapturesWalker() { delete map_; }
571*ccdc9c3eSSadaf Ebrahimi
TakeMap()572*ccdc9c3eSSadaf Ebrahimi std::map<string, int>* TakeMap() {
573*ccdc9c3eSSadaf Ebrahimi std::map<string, int>* m = map_;
574*ccdc9c3eSSadaf Ebrahimi map_ = NULL;
575*ccdc9c3eSSadaf Ebrahimi return m;
576*ccdc9c3eSSadaf Ebrahimi }
577*ccdc9c3eSSadaf Ebrahimi
PreVisit(Regexp * re,Ignored ignored,bool * stop)578*ccdc9c3eSSadaf Ebrahimi Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
579*ccdc9c3eSSadaf Ebrahimi if (re->op() == kRegexpCapture && re->name() != NULL) {
580*ccdc9c3eSSadaf Ebrahimi // Allocate map once we find a name.
581*ccdc9c3eSSadaf Ebrahimi if (map_ == NULL)
582*ccdc9c3eSSadaf Ebrahimi map_ = new std::map<string, int>;
583*ccdc9c3eSSadaf Ebrahimi
584*ccdc9c3eSSadaf Ebrahimi // Record first occurrence of each name.
585*ccdc9c3eSSadaf Ebrahimi // (The rule is that if you have the same name
586*ccdc9c3eSSadaf Ebrahimi // multiple times, only the leftmost one counts.)
587*ccdc9c3eSSadaf Ebrahimi if (map_->find(*re->name()) == map_->end())
588*ccdc9c3eSSadaf Ebrahimi (*map_)[*re->name()] = re->cap();
589*ccdc9c3eSSadaf Ebrahimi }
590*ccdc9c3eSSadaf Ebrahimi return ignored;
591*ccdc9c3eSSadaf Ebrahimi }
592*ccdc9c3eSSadaf Ebrahimi
ShortVisit(Regexp * re,Ignored ignored)593*ccdc9c3eSSadaf Ebrahimi virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
594*ccdc9c3eSSadaf Ebrahimi // Should never be called: we use Walk not WalkExponential.
595*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
596*ccdc9c3eSSadaf Ebrahimi return ignored;
597*ccdc9c3eSSadaf Ebrahimi }
598*ccdc9c3eSSadaf Ebrahimi
599*ccdc9c3eSSadaf Ebrahimi private:
600*ccdc9c3eSSadaf Ebrahimi std::map<string, int>* map_;
601*ccdc9c3eSSadaf Ebrahimi
602*ccdc9c3eSSadaf Ebrahimi NamedCapturesWalker(const NamedCapturesWalker&) = delete;
603*ccdc9c3eSSadaf Ebrahimi NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
604*ccdc9c3eSSadaf Ebrahimi };
605*ccdc9c3eSSadaf Ebrahimi
NamedCaptures()606*ccdc9c3eSSadaf Ebrahimi std::map<string, int>* Regexp::NamedCaptures() {
607*ccdc9c3eSSadaf Ebrahimi NamedCapturesWalker w;
608*ccdc9c3eSSadaf Ebrahimi w.Walk(this, 0);
609*ccdc9c3eSSadaf Ebrahimi return w.TakeMap();
610*ccdc9c3eSSadaf Ebrahimi }
611*ccdc9c3eSSadaf Ebrahimi
612*ccdc9c3eSSadaf Ebrahimi // Walker class to build map from capture group indices to their names.
613*ccdc9c3eSSadaf Ebrahimi class CaptureNamesWalker : public Regexp::Walker<Ignored> {
614*ccdc9c3eSSadaf Ebrahimi public:
CaptureNamesWalker()615*ccdc9c3eSSadaf Ebrahimi CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()616*ccdc9c3eSSadaf Ebrahimi ~CaptureNamesWalker() { delete map_; }
617*ccdc9c3eSSadaf Ebrahimi
TakeMap()618*ccdc9c3eSSadaf Ebrahimi std::map<int, string>* TakeMap() {
619*ccdc9c3eSSadaf Ebrahimi std::map<int, string>* m = map_;
620*ccdc9c3eSSadaf Ebrahimi map_ = NULL;
621*ccdc9c3eSSadaf Ebrahimi return m;
622*ccdc9c3eSSadaf Ebrahimi }
623*ccdc9c3eSSadaf Ebrahimi
PreVisit(Regexp * re,Ignored ignored,bool * stop)624*ccdc9c3eSSadaf Ebrahimi Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
625*ccdc9c3eSSadaf Ebrahimi if (re->op() == kRegexpCapture && re->name() != NULL) {
626*ccdc9c3eSSadaf Ebrahimi // Allocate map once we find a name.
627*ccdc9c3eSSadaf Ebrahimi if (map_ == NULL)
628*ccdc9c3eSSadaf Ebrahimi map_ = new std::map<int, string>;
629*ccdc9c3eSSadaf Ebrahimi
630*ccdc9c3eSSadaf Ebrahimi (*map_)[re->cap()] = *re->name();
631*ccdc9c3eSSadaf Ebrahimi }
632*ccdc9c3eSSadaf Ebrahimi return ignored;
633*ccdc9c3eSSadaf Ebrahimi }
634*ccdc9c3eSSadaf Ebrahimi
ShortVisit(Regexp * re,Ignored ignored)635*ccdc9c3eSSadaf Ebrahimi virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
636*ccdc9c3eSSadaf Ebrahimi // Should never be called: we use Walk not WalkExponential.
637*ccdc9c3eSSadaf Ebrahimi LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
638*ccdc9c3eSSadaf Ebrahimi return ignored;
639*ccdc9c3eSSadaf Ebrahimi }
640*ccdc9c3eSSadaf Ebrahimi
641*ccdc9c3eSSadaf Ebrahimi private:
642*ccdc9c3eSSadaf Ebrahimi std::map<int, string>* map_;
643*ccdc9c3eSSadaf Ebrahimi
644*ccdc9c3eSSadaf Ebrahimi CaptureNamesWalker(const CaptureNamesWalker&) = delete;
645*ccdc9c3eSSadaf Ebrahimi CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
646*ccdc9c3eSSadaf Ebrahimi };
647*ccdc9c3eSSadaf Ebrahimi
CaptureNames()648*ccdc9c3eSSadaf Ebrahimi std::map<int, string>* Regexp::CaptureNames() {
649*ccdc9c3eSSadaf Ebrahimi CaptureNamesWalker w;
650*ccdc9c3eSSadaf Ebrahimi w.Walk(this, 0);
651*ccdc9c3eSSadaf Ebrahimi return w.TakeMap();
652*ccdc9c3eSSadaf Ebrahimi }
653*ccdc9c3eSSadaf Ebrahimi
654*ccdc9c3eSSadaf Ebrahimi // Determines whether regexp matches must be anchored
655*ccdc9c3eSSadaf Ebrahimi // with a fixed string prefix. If so, returns the prefix and
656*ccdc9c3eSSadaf Ebrahimi // the regexp that remains after the prefix. The prefix might
657*ccdc9c3eSSadaf Ebrahimi // be ASCII case-insensitive.
RequiredPrefix(string * prefix,bool * foldcase,Regexp ** suffix)658*ccdc9c3eSSadaf Ebrahimi bool Regexp::RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix) {
659*ccdc9c3eSSadaf Ebrahimi // No need for a walker: the regexp must be of the form
660*ccdc9c3eSSadaf Ebrahimi // 1. some number of ^ anchors
661*ccdc9c3eSSadaf Ebrahimi // 2. a literal char or string
662*ccdc9c3eSSadaf Ebrahimi // 3. the rest
663*ccdc9c3eSSadaf Ebrahimi prefix->clear();
664*ccdc9c3eSSadaf Ebrahimi *foldcase = false;
665*ccdc9c3eSSadaf Ebrahimi *suffix = NULL;
666*ccdc9c3eSSadaf Ebrahimi if (op_ != kRegexpConcat)
667*ccdc9c3eSSadaf Ebrahimi return false;
668*ccdc9c3eSSadaf Ebrahimi
669*ccdc9c3eSSadaf Ebrahimi // Some number of anchors, then a literal or concatenation.
670*ccdc9c3eSSadaf Ebrahimi int i = 0;
671*ccdc9c3eSSadaf Ebrahimi Regexp** sub = this->sub();
672*ccdc9c3eSSadaf Ebrahimi while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
673*ccdc9c3eSSadaf Ebrahimi i++;
674*ccdc9c3eSSadaf Ebrahimi if (i == 0 || i >= nsub_)
675*ccdc9c3eSSadaf Ebrahimi return false;
676*ccdc9c3eSSadaf Ebrahimi
677*ccdc9c3eSSadaf Ebrahimi Regexp* re = sub[i];
678*ccdc9c3eSSadaf Ebrahimi switch (re->op_) {
679*ccdc9c3eSSadaf Ebrahimi default:
680*ccdc9c3eSSadaf Ebrahimi return false;
681*ccdc9c3eSSadaf Ebrahimi
682*ccdc9c3eSSadaf Ebrahimi case kRegexpLiteralString:
683*ccdc9c3eSSadaf Ebrahimi // Convert to string in proper encoding.
684*ccdc9c3eSSadaf Ebrahimi if (re->parse_flags() & Latin1) {
685*ccdc9c3eSSadaf Ebrahimi prefix->resize(re->nrunes_);
686*ccdc9c3eSSadaf Ebrahimi for (int j = 0; j < re->nrunes_; j++)
687*ccdc9c3eSSadaf Ebrahimi (*prefix)[j] = static_cast<char>(re->runes_[j]);
688*ccdc9c3eSSadaf Ebrahimi } else {
689*ccdc9c3eSSadaf Ebrahimi // Convert to UTF-8 in place.
690*ccdc9c3eSSadaf Ebrahimi // Assume worst-case space and then trim.
691*ccdc9c3eSSadaf Ebrahimi prefix->resize(re->nrunes_ * UTFmax);
692*ccdc9c3eSSadaf Ebrahimi char *p = &(*prefix)[0];
693*ccdc9c3eSSadaf Ebrahimi for (int j = 0; j < re->nrunes_; j++) {
694*ccdc9c3eSSadaf Ebrahimi Rune r = re->runes_[j];
695*ccdc9c3eSSadaf Ebrahimi if (r < Runeself)
696*ccdc9c3eSSadaf Ebrahimi *p++ = static_cast<char>(r);
697*ccdc9c3eSSadaf Ebrahimi else
698*ccdc9c3eSSadaf Ebrahimi p += runetochar(p, &r);
699*ccdc9c3eSSadaf Ebrahimi }
700*ccdc9c3eSSadaf Ebrahimi prefix->resize(p - &(*prefix)[0]);
701*ccdc9c3eSSadaf Ebrahimi }
702*ccdc9c3eSSadaf Ebrahimi break;
703*ccdc9c3eSSadaf Ebrahimi
704*ccdc9c3eSSadaf Ebrahimi case kRegexpLiteral:
705*ccdc9c3eSSadaf Ebrahimi if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
706*ccdc9c3eSSadaf Ebrahimi prefix->append(1, static_cast<char>(re->rune_));
707*ccdc9c3eSSadaf Ebrahimi } else {
708*ccdc9c3eSSadaf Ebrahimi char buf[UTFmax];
709*ccdc9c3eSSadaf Ebrahimi prefix->append(buf, runetochar(buf, &re->rune_));
710*ccdc9c3eSSadaf Ebrahimi }
711*ccdc9c3eSSadaf Ebrahimi break;
712*ccdc9c3eSSadaf Ebrahimi }
713*ccdc9c3eSSadaf Ebrahimi *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
714*ccdc9c3eSSadaf Ebrahimi i++;
715*ccdc9c3eSSadaf Ebrahimi
716*ccdc9c3eSSadaf Ebrahimi // The rest.
717*ccdc9c3eSSadaf Ebrahimi if (i < nsub_) {
718*ccdc9c3eSSadaf Ebrahimi for (int j = i; j < nsub_; j++)
719*ccdc9c3eSSadaf Ebrahimi sub[j]->Incref();
720*ccdc9c3eSSadaf Ebrahimi re = Concat(sub + i, nsub_ - i, parse_flags());
721*ccdc9c3eSSadaf Ebrahimi } else {
722*ccdc9c3eSSadaf Ebrahimi re = new Regexp(kRegexpEmptyMatch, parse_flags());
723*ccdc9c3eSSadaf Ebrahimi }
724*ccdc9c3eSSadaf Ebrahimi *suffix = re;
725*ccdc9c3eSSadaf Ebrahimi return true;
726*ccdc9c3eSSadaf Ebrahimi }
727*ccdc9c3eSSadaf Ebrahimi
728*ccdc9c3eSSadaf Ebrahimi // Character class builder is a balanced binary tree (STL set)
729*ccdc9c3eSSadaf Ebrahimi // containing non-overlapping, non-abutting RuneRanges.
730*ccdc9c3eSSadaf Ebrahimi // The less-than operator used in the tree treats two
731*ccdc9c3eSSadaf Ebrahimi // ranges as equal if they overlap at all, so that
732*ccdc9c3eSSadaf Ebrahimi // lookups for a particular Rune are possible.
733*ccdc9c3eSSadaf Ebrahimi
CharClassBuilder()734*ccdc9c3eSSadaf Ebrahimi CharClassBuilder::CharClassBuilder() {
735*ccdc9c3eSSadaf Ebrahimi nrunes_ = 0;
736*ccdc9c3eSSadaf Ebrahimi upper_ = 0;
737*ccdc9c3eSSadaf Ebrahimi lower_ = 0;
738*ccdc9c3eSSadaf Ebrahimi }
739*ccdc9c3eSSadaf Ebrahimi
740*ccdc9c3eSSadaf Ebrahimi // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)741*ccdc9c3eSSadaf Ebrahimi bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
742*ccdc9c3eSSadaf Ebrahimi if (hi < lo)
743*ccdc9c3eSSadaf Ebrahimi return false;
744*ccdc9c3eSSadaf Ebrahimi
745*ccdc9c3eSSadaf Ebrahimi if (lo <= 'z' && hi >= 'A') {
746*ccdc9c3eSSadaf Ebrahimi // Overlaps some alpha, maybe not all.
747*ccdc9c3eSSadaf Ebrahimi // Update bitmaps telling which ASCII letters are in the set.
748*ccdc9c3eSSadaf Ebrahimi Rune lo1 = std::max<Rune>(lo, 'A');
749*ccdc9c3eSSadaf Ebrahimi Rune hi1 = std::min<Rune>(hi, 'Z');
750*ccdc9c3eSSadaf Ebrahimi if (lo1 <= hi1)
751*ccdc9c3eSSadaf Ebrahimi upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
752*ccdc9c3eSSadaf Ebrahimi
753*ccdc9c3eSSadaf Ebrahimi lo1 = std::max<Rune>(lo, 'a');
754*ccdc9c3eSSadaf Ebrahimi hi1 = std::min<Rune>(hi, 'z');
755*ccdc9c3eSSadaf Ebrahimi if (lo1 <= hi1)
756*ccdc9c3eSSadaf Ebrahimi lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
757*ccdc9c3eSSadaf Ebrahimi }
758*ccdc9c3eSSadaf Ebrahimi
759*ccdc9c3eSSadaf Ebrahimi { // Check whether lo, hi is already in the class.
760*ccdc9c3eSSadaf Ebrahimi iterator it = ranges_.find(RuneRange(lo, lo));
761*ccdc9c3eSSadaf Ebrahimi if (it != end() && it->lo <= lo && hi <= it->hi)
762*ccdc9c3eSSadaf Ebrahimi return false;
763*ccdc9c3eSSadaf Ebrahimi }
764*ccdc9c3eSSadaf Ebrahimi
765*ccdc9c3eSSadaf Ebrahimi // Look for a range abutting lo on the left.
766*ccdc9c3eSSadaf Ebrahimi // If it exists, take it out and increase our range.
767*ccdc9c3eSSadaf Ebrahimi if (lo > 0) {
768*ccdc9c3eSSadaf Ebrahimi iterator it = ranges_.find(RuneRange(lo-1, lo-1));
769*ccdc9c3eSSadaf Ebrahimi if (it != end()) {
770*ccdc9c3eSSadaf Ebrahimi lo = it->lo;
771*ccdc9c3eSSadaf Ebrahimi if (it->hi > hi)
772*ccdc9c3eSSadaf Ebrahimi hi = it->hi;
773*ccdc9c3eSSadaf Ebrahimi nrunes_ -= it->hi - it->lo + 1;
774*ccdc9c3eSSadaf Ebrahimi ranges_.erase(it);
775*ccdc9c3eSSadaf Ebrahimi }
776*ccdc9c3eSSadaf Ebrahimi }
777*ccdc9c3eSSadaf Ebrahimi
778*ccdc9c3eSSadaf Ebrahimi // Look for a range abutting hi on the right.
779*ccdc9c3eSSadaf Ebrahimi // If it exists, take it out and increase our range.
780*ccdc9c3eSSadaf Ebrahimi if (hi < Runemax) {
781*ccdc9c3eSSadaf Ebrahimi iterator it = ranges_.find(RuneRange(hi+1, hi+1));
782*ccdc9c3eSSadaf Ebrahimi if (it != end()) {
783*ccdc9c3eSSadaf Ebrahimi hi = it->hi;
784*ccdc9c3eSSadaf Ebrahimi nrunes_ -= it->hi - it->lo + 1;
785*ccdc9c3eSSadaf Ebrahimi ranges_.erase(it);
786*ccdc9c3eSSadaf Ebrahimi }
787*ccdc9c3eSSadaf Ebrahimi }
788*ccdc9c3eSSadaf Ebrahimi
789*ccdc9c3eSSadaf Ebrahimi // Look for ranges between lo and hi. Take them out.
790*ccdc9c3eSSadaf Ebrahimi // This is only safe because the set has no overlapping ranges.
791*ccdc9c3eSSadaf Ebrahimi // We've already removed any ranges abutting lo and hi, so
792*ccdc9c3eSSadaf Ebrahimi // any that overlap [lo, hi] must be contained within it.
793*ccdc9c3eSSadaf Ebrahimi for (;;) {
794*ccdc9c3eSSadaf Ebrahimi iterator it = ranges_.find(RuneRange(lo, hi));
795*ccdc9c3eSSadaf Ebrahimi if (it == end())
796*ccdc9c3eSSadaf Ebrahimi break;
797*ccdc9c3eSSadaf Ebrahimi nrunes_ -= it->hi - it->lo + 1;
798*ccdc9c3eSSadaf Ebrahimi ranges_.erase(it);
799*ccdc9c3eSSadaf Ebrahimi }
800*ccdc9c3eSSadaf Ebrahimi
801*ccdc9c3eSSadaf Ebrahimi // Finally, add [lo, hi].
802*ccdc9c3eSSadaf Ebrahimi nrunes_ += hi - lo + 1;
803*ccdc9c3eSSadaf Ebrahimi ranges_.insert(RuneRange(lo, hi));
804*ccdc9c3eSSadaf Ebrahimi return true;
805*ccdc9c3eSSadaf Ebrahimi }
806*ccdc9c3eSSadaf Ebrahimi
AddCharClass(CharClassBuilder * cc)807*ccdc9c3eSSadaf Ebrahimi void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
808*ccdc9c3eSSadaf Ebrahimi for (iterator it = cc->begin(); it != cc->end(); ++it)
809*ccdc9c3eSSadaf Ebrahimi AddRange(it->lo, it->hi);
810*ccdc9c3eSSadaf Ebrahimi }
811*ccdc9c3eSSadaf Ebrahimi
Contains(Rune r)812*ccdc9c3eSSadaf Ebrahimi bool CharClassBuilder::Contains(Rune r) {
813*ccdc9c3eSSadaf Ebrahimi return ranges_.find(RuneRange(r, r)) != end();
814*ccdc9c3eSSadaf Ebrahimi }
815*ccdc9c3eSSadaf Ebrahimi
816*ccdc9c3eSSadaf Ebrahimi // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()817*ccdc9c3eSSadaf Ebrahimi bool CharClassBuilder::FoldsASCII() {
818*ccdc9c3eSSadaf Ebrahimi return ((upper_ ^ lower_) & AlphaMask) == 0;
819*ccdc9c3eSSadaf Ebrahimi }
820*ccdc9c3eSSadaf Ebrahimi
Copy()821*ccdc9c3eSSadaf Ebrahimi CharClassBuilder* CharClassBuilder::Copy() {
822*ccdc9c3eSSadaf Ebrahimi CharClassBuilder* cc = new CharClassBuilder;
823*ccdc9c3eSSadaf Ebrahimi for (iterator it = begin(); it != end(); ++it)
824*ccdc9c3eSSadaf Ebrahimi cc->ranges_.insert(RuneRange(it->lo, it->hi));
825*ccdc9c3eSSadaf Ebrahimi cc->upper_ = upper_;
826*ccdc9c3eSSadaf Ebrahimi cc->lower_ = lower_;
827*ccdc9c3eSSadaf Ebrahimi cc->nrunes_ = nrunes_;
828*ccdc9c3eSSadaf Ebrahimi return cc;
829*ccdc9c3eSSadaf Ebrahimi }
830*ccdc9c3eSSadaf Ebrahimi
831*ccdc9c3eSSadaf Ebrahimi
832*ccdc9c3eSSadaf Ebrahimi
RemoveAbove(Rune r)833*ccdc9c3eSSadaf Ebrahimi void CharClassBuilder::RemoveAbove(Rune r) {
834*ccdc9c3eSSadaf Ebrahimi if (r >= Runemax)
835*ccdc9c3eSSadaf Ebrahimi return;
836*ccdc9c3eSSadaf Ebrahimi
837*ccdc9c3eSSadaf Ebrahimi if (r < 'z') {
838*ccdc9c3eSSadaf Ebrahimi if (r < 'a')
839*ccdc9c3eSSadaf Ebrahimi lower_ = 0;
840*ccdc9c3eSSadaf Ebrahimi else
841*ccdc9c3eSSadaf Ebrahimi lower_ &= AlphaMask >> ('z' - r);
842*ccdc9c3eSSadaf Ebrahimi }
843*ccdc9c3eSSadaf Ebrahimi
844*ccdc9c3eSSadaf Ebrahimi if (r < 'Z') {
845*ccdc9c3eSSadaf Ebrahimi if (r < 'A')
846*ccdc9c3eSSadaf Ebrahimi upper_ = 0;
847*ccdc9c3eSSadaf Ebrahimi else
848*ccdc9c3eSSadaf Ebrahimi upper_ &= AlphaMask >> ('Z' - r);
849*ccdc9c3eSSadaf Ebrahimi }
850*ccdc9c3eSSadaf Ebrahimi
851*ccdc9c3eSSadaf Ebrahimi for (;;) {
852*ccdc9c3eSSadaf Ebrahimi
853*ccdc9c3eSSadaf Ebrahimi iterator it = ranges_.find(RuneRange(r + 1, Runemax));
854*ccdc9c3eSSadaf Ebrahimi if (it == end())
855*ccdc9c3eSSadaf Ebrahimi break;
856*ccdc9c3eSSadaf Ebrahimi RuneRange rr = *it;
857*ccdc9c3eSSadaf Ebrahimi ranges_.erase(it);
858*ccdc9c3eSSadaf Ebrahimi nrunes_ -= rr.hi - rr.lo + 1;
859*ccdc9c3eSSadaf Ebrahimi if (rr.lo <= r) {
860*ccdc9c3eSSadaf Ebrahimi rr.hi = r;
861*ccdc9c3eSSadaf Ebrahimi ranges_.insert(rr);
862*ccdc9c3eSSadaf Ebrahimi nrunes_ += rr.hi - rr.lo + 1;
863*ccdc9c3eSSadaf Ebrahimi }
864*ccdc9c3eSSadaf Ebrahimi }
865*ccdc9c3eSSadaf Ebrahimi }
866*ccdc9c3eSSadaf Ebrahimi
Negate()867*ccdc9c3eSSadaf Ebrahimi void CharClassBuilder::Negate() {
868*ccdc9c3eSSadaf Ebrahimi // Build up negation and then copy in.
869*ccdc9c3eSSadaf Ebrahimi // Could edit ranges in place, but C++ won't let me.
870*ccdc9c3eSSadaf Ebrahimi std::vector<RuneRange> v;
871*ccdc9c3eSSadaf Ebrahimi v.reserve(ranges_.size() + 1);
872*ccdc9c3eSSadaf Ebrahimi
873*ccdc9c3eSSadaf Ebrahimi // In negation, first range begins at 0, unless
874*ccdc9c3eSSadaf Ebrahimi // the current class begins at 0.
875*ccdc9c3eSSadaf Ebrahimi iterator it = begin();
876*ccdc9c3eSSadaf Ebrahimi if (it == end()) {
877*ccdc9c3eSSadaf Ebrahimi v.push_back(RuneRange(0, Runemax));
878*ccdc9c3eSSadaf Ebrahimi } else {
879*ccdc9c3eSSadaf Ebrahimi int nextlo = 0;
880*ccdc9c3eSSadaf Ebrahimi if (it->lo == 0) {
881*ccdc9c3eSSadaf Ebrahimi nextlo = it->hi + 1;
882*ccdc9c3eSSadaf Ebrahimi ++it;
883*ccdc9c3eSSadaf Ebrahimi }
884*ccdc9c3eSSadaf Ebrahimi for (; it != end(); ++it) {
885*ccdc9c3eSSadaf Ebrahimi v.push_back(RuneRange(nextlo, it->lo - 1));
886*ccdc9c3eSSadaf Ebrahimi nextlo = it->hi + 1;
887*ccdc9c3eSSadaf Ebrahimi }
888*ccdc9c3eSSadaf Ebrahimi if (nextlo <= Runemax)
889*ccdc9c3eSSadaf Ebrahimi v.push_back(RuneRange(nextlo, Runemax));
890*ccdc9c3eSSadaf Ebrahimi }
891*ccdc9c3eSSadaf Ebrahimi
892*ccdc9c3eSSadaf Ebrahimi ranges_.clear();
893*ccdc9c3eSSadaf Ebrahimi for (size_t i = 0; i < v.size(); i++)
894*ccdc9c3eSSadaf Ebrahimi ranges_.insert(v[i]);
895*ccdc9c3eSSadaf Ebrahimi
896*ccdc9c3eSSadaf Ebrahimi upper_ = AlphaMask & ~upper_;
897*ccdc9c3eSSadaf Ebrahimi lower_ = AlphaMask & ~lower_;
898*ccdc9c3eSSadaf Ebrahimi nrunes_ = Runemax+1 - nrunes_;
899*ccdc9c3eSSadaf Ebrahimi }
900*ccdc9c3eSSadaf Ebrahimi
901*ccdc9c3eSSadaf Ebrahimi // Character class is a sorted list of ranges.
902*ccdc9c3eSSadaf Ebrahimi // The ranges are allocated in the same block as the header,
903*ccdc9c3eSSadaf Ebrahimi // necessitating a special allocator and Delete method.
904*ccdc9c3eSSadaf Ebrahimi
New(int maxranges)905*ccdc9c3eSSadaf Ebrahimi CharClass* CharClass::New(int maxranges) {
906*ccdc9c3eSSadaf Ebrahimi CharClass* cc;
907*ccdc9c3eSSadaf Ebrahimi uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
908*ccdc9c3eSSadaf Ebrahimi cc = reinterpret_cast<CharClass*>(data);
909*ccdc9c3eSSadaf Ebrahimi cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
910*ccdc9c3eSSadaf Ebrahimi cc->nranges_ = 0;
911*ccdc9c3eSSadaf Ebrahimi cc->folds_ascii_ = false;
912*ccdc9c3eSSadaf Ebrahimi cc->nrunes_ = 0;
913*ccdc9c3eSSadaf Ebrahimi return cc;
914*ccdc9c3eSSadaf Ebrahimi }
915*ccdc9c3eSSadaf Ebrahimi
Delete()916*ccdc9c3eSSadaf Ebrahimi void CharClass::Delete() {
917*ccdc9c3eSSadaf Ebrahimi uint8_t* data = reinterpret_cast<uint8_t*>(this);
918*ccdc9c3eSSadaf Ebrahimi delete[] data;
919*ccdc9c3eSSadaf Ebrahimi }
920*ccdc9c3eSSadaf Ebrahimi
Negate()921*ccdc9c3eSSadaf Ebrahimi CharClass* CharClass::Negate() {
922*ccdc9c3eSSadaf Ebrahimi CharClass* cc = CharClass::New(nranges_+1);
923*ccdc9c3eSSadaf Ebrahimi cc->folds_ascii_ = folds_ascii_;
924*ccdc9c3eSSadaf Ebrahimi cc->nrunes_ = Runemax + 1 - nrunes_;
925*ccdc9c3eSSadaf Ebrahimi int n = 0;
926*ccdc9c3eSSadaf Ebrahimi int nextlo = 0;
927*ccdc9c3eSSadaf Ebrahimi for (CharClass::iterator it = begin(); it != end(); ++it) {
928*ccdc9c3eSSadaf Ebrahimi if (it->lo == nextlo) {
929*ccdc9c3eSSadaf Ebrahimi nextlo = it->hi + 1;
930*ccdc9c3eSSadaf Ebrahimi } else {
931*ccdc9c3eSSadaf Ebrahimi cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
932*ccdc9c3eSSadaf Ebrahimi nextlo = it->hi + 1;
933*ccdc9c3eSSadaf Ebrahimi }
934*ccdc9c3eSSadaf Ebrahimi }
935*ccdc9c3eSSadaf Ebrahimi if (nextlo <= Runemax)
936*ccdc9c3eSSadaf Ebrahimi cc->ranges_[n++] = RuneRange(nextlo, Runemax);
937*ccdc9c3eSSadaf Ebrahimi cc->nranges_ = n;
938*ccdc9c3eSSadaf Ebrahimi return cc;
939*ccdc9c3eSSadaf Ebrahimi }
940*ccdc9c3eSSadaf Ebrahimi
Contains(Rune r)941*ccdc9c3eSSadaf Ebrahimi bool CharClass::Contains(Rune r) {
942*ccdc9c3eSSadaf Ebrahimi RuneRange* rr = ranges_;
943*ccdc9c3eSSadaf Ebrahimi int n = nranges_;
944*ccdc9c3eSSadaf Ebrahimi while (n > 0) {
945*ccdc9c3eSSadaf Ebrahimi int m = n/2;
946*ccdc9c3eSSadaf Ebrahimi if (rr[m].hi < r) {
947*ccdc9c3eSSadaf Ebrahimi rr += m+1;
948*ccdc9c3eSSadaf Ebrahimi n -= m+1;
949*ccdc9c3eSSadaf Ebrahimi } else if (r < rr[m].lo) {
950*ccdc9c3eSSadaf Ebrahimi n = m;
951*ccdc9c3eSSadaf Ebrahimi } else { // rr[m].lo <= r && r <= rr[m].hi
952*ccdc9c3eSSadaf Ebrahimi return true;
953*ccdc9c3eSSadaf Ebrahimi }
954*ccdc9c3eSSadaf Ebrahimi }
955*ccdc9c3eSSadaf Ebrahimi return false;
956*ccdc9c3eSSadaf Ebrahimi }
957*ccdc9c3eSSadaf Ebrahimi
GetCharClass()958*ccdc9c3eSSadaf Ebrahimi CharClass* CharClassBuilder::GetCharClass() {
959*ccdc9c3eSSadaf Ebrahimi CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
960*ccdc9c3eSSadaf Ebrahimi int n = 0;
961*ccdc9c3eSSadaf Ebrahimi for (iterator it = begin(); it != end(); ++it)
962*ccdc9c3eSSadaf Ebrahimi cc->ranges_[n++] = *it;
963*ccdc9c3eSSadaf Ebrahimi cc->nranges_ = n;
964*ccdc9c3eSSadaf Ebrahimi DCHECK_LE(n, static_cast<int>(ranges_.size()));
965*ccdc9c3eSSadaf Ebrahimi cc->nrunes_ = nrunes_;
966*ccdc9c3eSSadaf Ebrahimi cc->folds_ascii_ = FoldsASCII();
967*ccdc9c3eSSadaf Ebrahimi return cc;
968*ccdc9c3eSSadaf Ebrahimi }
969*ccdc9c3eSSadaf Ebrahimi
970*ccdc9c3eSSadaf Ebrahimi } // namespace re2
971