1*ccdc9c3eSSadaf Ebrahimi // Copyright 2007 The RE2 Authors. All Rights Reserved. 2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style 3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file. 4*ccdc9c3eSSadaf Ebrahimi 5*ccdc9c3eSSadaf Ebrahimi #ifndef RE2_PROG_H_ 6*ccdc9c3eSSadaf Ebrahimi #define RE2_PROG_H_ 7*ccdc9c3eSSadaf Ebrahimi 8*ccdc9c3eSSadaf Ebrahimi // Compiled representation of regular expressions. 9*ccdc9c3eSSadaf Ebrahimi // See regexp.h for the Regexp class, which represents a regular 10*ccdc9c3eSSadaf Ebrahimi // expression symbolically. 11*ccdc9c3eSSadaf Ebrahimi 12*ccdc9c3eSSadaf Ebrahimi #include <stdint.h> 13*ccdc9c3eSSadaf Ebrahimi #include <functional> 14*ccdc9c3eSSadaf Ebrahimi #include <mutex> 15*ccdc9c3eSSadaf Ebrahimi #include <string> 16*ccdc9c3eSSadaf Ebrahimi #include <vector> 17*ccdc9c3eSSadaf Ebrahimi #include <type_traits> 18*ccdc9c3eSSadaf Ebrahimi 19*ccdc9c3eSSadaf Ebrahimi #include "util/util.h" 20*ccdc9c3eSSadaf Ebrahimi #include "util/logging.h" 21*ccdc9c3eSSadaf Ebrahimi #include "util/pod_array.h" 22*ccdc9c3eSSadaf Ebrahimi #include "util/sparse_array.h" 23*ccdc9c3eSSadaf Ebrahimi #include "util/sparse_set.h" 24*ccdc9c3eSSadaf Ebrahimi #include "re2/re2.h" 25*ccdc9c3eSSadaf Ebrahimi 26*ccdc9c3eSSadaf Ebrahimi namespace re2 { 27*ccdc9c3eSSadaf Ebrahimi 28*ccdc9c3eSSadaf Ebrahimi // Opcodes for Inst 29*ccdc9c3eSSadaf Ebrahimi enum InstOp { 30*ccdc9c3eSSadaf Ebrahimi kInstAlt = 0, // choose between out_ and out1_ 31*ccdc9c3eSSadaf Ebrahimi kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa. 32*ccdc9c3eSSadaf Ebrahimi kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_] 33*ccdc9c3eSSadaf Ebrahimi kInstCapture, // capturing parenthesis number cap_ 34*ccdc9c3eSSadaf Ebrahimi kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ 35*ccdc9c3eSSadaf Ebrahimi kInstMatch, // found a match! 36*ccdc9c3eSSadaf Ebrahimi kInstNop, // no-op; occasionally unavoidable 37*ccdc9c3eSSadaf Ebrahimi kInstFail, // never match; occasionally unavoidable 38*ccdc9c3eSSadaf Ebrahimi kNumInst, 39*ccdc9c3eSSadaf Ebrahimi }; 40*ccdc9c3eSSadaf Ebrahimi 41*ccdc9c3eSSadaf Ebrahimi // Bit flags for empty-width specials 42*ccdc9c3eSSadaf Ebrahimi enum EmptyOp { 43*ccdc9c3eSSadaf Ebrahimi kEmptyBeginLine = 1<<0, // ^ - beginning of line 44*ccdc9c3eSSadaf Ebrahimi kEmptyEndLine = 1<<1, // $ - end of line 45*ccdc9c3eSSadaf Ebrahimi kEmptyBeginText = 1<<2, // \A - beginning of text 46*ccdc9c3eSSadaf Ebrahimi kEmptyEndText = 1<<3, // \z - end of text 47*ccdc9c3eSSadaf Ebrahimi kEmptyWordBoundary = 1<<4, // \b - word boundary 48*ccdc9c3eSSadaf Ebrahimi kEmptyNonWordBoundary = 1<<5, // \B - not \b 49*ccdc9c3eSSadaf Ebrahimi kEmptyAllFlags = (1<<6)-1, 50*ccdc9c3eSSadaf Ebrahimi }; 51*ccdc9c3eSSadaf Ebrahimi 52*ccdc9c3eSSadaf Ebrahimi class DFA; 53*ccdc9c3eSSadaf Ebrahimi class Regexp; 54*ccdc9c3eSSadaf Ebrahimi 55*ccdc9c3eSSadaf Ebrahimi // Compiled form of regexp program. 56*ccdc9c3eSSadaf Ebrahimi class Prog { 57*ccdc9c3eSSadaf Ebrahimi public: 58*ccdc9c3eSSadaf Ebrahimi Prog(); 59*ccdc9c3eSSadaf Ebrahimi ~Prog(); 60*ccdc9c3eSSadaf Ebrahimi 61*ccdc9c3eSSadaf Ebrahimi // Single instruction in regexp program. 62*ccdc9c3eSSadaf Ebrahimi class Inst { 63*ccdc9c3eSSadaf Ebrahimi public: 64*ccdc9c3eSSadaf Ebrahimi // See the assertion below for why this is so. 65*ccdc9c3eSSadaf Ebrahimi Inst() = default; 66*ccdc9c3eSSadaf Ebrahimi 67*ccdc9c3eSSadaf Ebrahimi // Copyable. 68*ccdc9c3eSSadaf Ebrahimi Inst(const Inst&) = default; 69*ccdc9c3eSSadaf Ebrahimi Inst& operator=(const Inst&) = default; 70*ccdc9c3eSSadaf Ebrahimi 71*ccdc9c3eSSadaf Ebrahimi // Constructors per opcode 72*ccdc9c3eSSadaf Ebrahimi void InitAlt(uint32_t out, uint32_t out1); 73*ccdc9c3eSSadaf Ebrahimi void InitByteRange(int lo, int hi, int foldcase, uint32_t out); 74*ccdc9c3eSSadaf Ebrahimi void InitCapture(int cap, uint32_t out); 75*ccdc9c3eSSadaf Ebrahimi void InitEmptyWidth(EmptyOp empty, uint32_t out); 76*ccdc9c3eSSadaf Ebrahimi void InitMatch(int id); 77*ccdc9c3eSSadaf Ebrahimi void InitNop(uint32_t out); 78*ccdc9c3eSSadaf Ebrahimi void InitFail(); 79*ccdc9c3eSSadaf Ebrahimi 80*ccdc9c3eSSadaf Ebrahimi // Getters id(Prog * p)81*ccdc9c3eSSadaf Ebrahimi int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); } opcode()82*ccdc9c3eSSadaf Ebrahimi InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } last()83*ccdc9c3eSSadaf Ebrahimi int last() { return (out_opcode_>>3)&1; } out()84*ccdc9c3eSSadaf Ebrahimi int out() { return out_opcode_>>4; } out1()85*ccdc9c3eSSadaf Ebrahimi int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } cap()86*ccdc9c3eSSadaf Ebrahimi int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } lo()87*ccdc9c3eSSadaf Ebrahimi int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } hi()88*ccdc9c3eSSadaf Ebrahimi int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } foldcase()89*ccdc9c3eSSadaf Ebrahimi int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; } match_id()90*ccdc9c3eSSadaf Ebrahimi int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } empty()91*ccdc9c3eSSadaf Ebrahimi EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } 92*ccdc9c3eSSadaf Ebrahimi greedy(Prog * p)93*ccdc9c3eSSadaf Ebrahimi bool greedy(Prog* p) { 94*ccdc9c3eSSadaf Ebrahimi DCHECK_EQ(opcode(), kInstAltMatch); 95*ccdc9c3eSSadaf Ebrahimi return p->inst(out())->opcode() == kInstByteRange || 96*ccdc9c3eSSadaf Ebrahimi (p->inst(out())->opcode() == kInstNop && 97*ccdc9c3eSSadaf Ebrahimi p->inst(p->inst(out())->out())->opcode() == kInstByteRange); 98*ccdc9c3eSSadaf Ebrahimi } 99*ccdc9c3eSSadaf Ebrahimi 100*ccdc9c3eSSadaf Ebrahimi // Does this inst (an kInstByteRange) match c? Matches(int c)101*ccdc9c3eSSadaf Ebrahimi inline bool Matches(int c) { 102*ccdc9c3eSSadaf Ebrahimi DCHECK_EQ(opcode(), kInstByteRange); 103*ccdc9c3eSSadaf Ebrahimi if (foldcase_ && 'A' <= c && c <= 'Z') 104*ccdc9c3eSSadaf Ebrahimi c += 'a' - 'A'; 105*ccdc9c3eSSadaf Ebrahimi return lo_ <= c && c <= hi_; 106*ccdc9c3eSSadaf Ebrahimi } 107*ccdc9c3eSSadaf Ebrahimi 108*ccdc9c3eSSadaf Ebrahimi // Returns string representation for debugging. 109*ccdc9c3eSSadaf Ebrahimi string Dump(); 110*ccdc9c3eSSadaf Ebrahimi 111*ccdc9c3eSSadaf Ebrahimi // Maximum instruction id. 112*ccdc9c3eSSadaf Ebrahimi // (Must fit in out_opcode_. PatchList/last steal another bit.) 113*ccdc9c3eSSadaf Ebrahimi static const int kMaxInst = (1<<28) - 1; 114*ccdc9c3eSSadaf Ebrahimi 115*ccdc9c3eSSadaf Ebrahimi private: set_opcode(InstOp opcode)116*ccdc9c3eSSadaf Ebrahimi void set_opcode(InstOp opcode) { 117*ccdc9c3eSSadaf Ebrahimi out_opcode_ = (out()<<4) | (last()<<3) | opcode; 118*ccdc9c3eSSadaf Ebrahimi } 119*ccdc9c3eSSadaf Ebrahimi set_last()120*ccdc9c3eSSadaf Ebrahimi void set_last() { 121*ccdc9c3eSSadaf Ebrahimi out_opcode_ = (out()<<4) | (1<<3) | opcode(); 122*ccdc9c3eSSadaf Ebrahimi } 123*ccdc9c3eSSadaf Ebrahimi set_out(int out)124*ccdc9c3eSSadaf Ebrahimi void set_out(int out) { 125*ccdc9c3eSSadaf Ebrahimi out_opcode_ = (out<<4) | (last()<<3) | opcode(); 126*ccdc9c3eSSadaf Ebrahimi } 127*ccdc9c3eSSadaf Ebrahimi set_out_opcode(int out,InstOp opcode)128*ccdc9c3eSSadaf Ebrahimi void set_out_opcode(int out, InstOp opcode) { 129*ccdc9c3eSSadaf Ebrahimi out_opcode_ = (out<<4) | (last()<<3) | opcode; 130*ccdc9c3eSSadaf Ebrahimi } 131*ccdc9c3eSSadaf Ebrahimi 132*ccdc9c3eSSadaf Ebrahimi uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode 133*ccdc9c3eSSadaf Ebrahimi union { // additional instruction arguments: 134*ccdc9c3eSSadaf Ebrahimi uint32_t out1_; // opcode == kInstAlt 135*ccdc9c3eSSadaf Ebrahimi // alternate next instruction 136*ccdc9c3eSSadaf Ebrahimi 137*ccdc9c3eSSadaf Ebrahimi int32_t cap_; // opcode == kInstCapture 138*ccdc9c3eSSadaf Ebrahimi // Index of capture register (holds text 139*ccdc9c3eSSadaf Ebrahimi // position recorded by capturing parentheses). 140*ccdc9c3eSSadaf Ebrahimi // For \n (the submatch for the nth parentheses), 141*ccdc9c3eSSadaf Ebrahimi // the left parenthesis captures into register 2*n 142*ccdc9c3eSSadaf Ebrahimi // and the right one captures into register 2*n+1. 143*ccdc9c3eSSadaf Ebrahimi 144*ccdc9c3eSSadaf Ebrahimi int32_t match_id_; // opcode == kInstMatch 145*ccdc9c3eSSadaf Ebrahimi // Match ID to identify this match (for re2::Set). 146*ccdc9c3eSSadaf Ebrahimi 147*ccdc9c3eSSadaf Ebrahimi struct { // opcode == kInstByteRange 148*ccdc9c3eSSadaf Ebrahimi uint8_t lo_; // byte range is lo_-hi_ inclusive 149*ccdc9c3eSSadaf Ebrahimi uint8_t hi_; // 150*ccdc9c3eSSadaf Ebrahimi uint8_t foldcase_; // convert A-Z to a-z before checking range. 151*ccdc9c3eSSadaf Ebrahimi }; 152*ccdc9c3eSSadaf Ebrahimi 153*ccdc9c3eSSadaf Ebrahimi EmptyOp empty_; // opcode == kInstEmptyWidth 154*ccdc9c3eSSadaf Ebrahimi // empty_ is bitwise OR of kEmpty* flags above. 155*ccdc9c3eSSadaf Ebrahimi }; 156*ccdc9c3eSSadaf Ebrahimi 157*ccdc9c3eSSadaf Ebrahimi friend class Compiler; 158*ccdc9c3eSSadaf Ebrahimi friend struct PatchList; 159*ccdc9c3eSSadaf Ebrahimi friend class Prog; 160*ccdc9c3eSSadaf Ebrahimi }; 161*ccdc9c3eSSadaf Ebrahimi 162*ccdc9c3eSSadaf Ebrahimi // Inst must be trivial so that we can freely clear it with memset(3). 163*ccdc9c3eSSadaf Ebrahimi // Arrays of Inst are initialised by copying the initial elements with 164*ccdc9c3eSSadaf Ebrahimi // memmove(3) and then clearing any remaining elements with memset(3). 165*ccdc9c3eSSadaf Ebrahimi static_assert(std::is_trivial<Inst>::value, "Inst must be trivial"); 166*ccdc9c3eSSadaf Ebrahimi 167*ccdc9c3eSSadaf Ebrahimi // Whether to anchor the search. 168*ccdc9c3eSSadaf Ebrahimi enum Anchor { 169*ccdc9c3eSSadaf Ebrahimi kUnanchored, // match anywhere 170*ccdc9c3eSSadaf Ebrahimi kAnchored, // match only starting at beginning of text 171*ccdc9c3eSSadaf Ebrahimi }; 172*ccdc9c3eSSadaf Ebrahimi 173*ccdc9c3eSSadaf Ebrahimi // Kind of match to look for (for anchor != kFullMatch) 174*ccdc9c3eSSadaf Ebrahimi // 175*ccdc9c3eSSadaf Ebrahimi // kLongestMatch mode finds the overall longest 176*ccdc9c3eSSadaf Ebrahimi // match but still makes its submatch choices the way 177*ccdc9c3eSSadaf Ebrahimi // Perl would, not in the way prescribed by POSIX. 178*ccdc9c3eSSadaf Ebrahimi // The POSIX rules are much more expensive to implement, 179*ccdc9c3eSSadaf Ebrahimi // and no one has needed them. 180*ccdc9c3eSSadaf Ebrahimi // 181*ccdc9c3eSSadaf Ebrahimi // kFullMatch is not strictly necessary -- we could use 182*ccdc9c3eSSadaf Ebrahimi // kLongestMatch and then check the length of the match -- but 183*ccdc9c3eSSadaf Ebrahimi // the matching code can run faster if it knows to consider only 184*ccdc9c3eSSadaf Ebrahimi // full matches. 185*ccdc9c3eSSadaf Ebrahimi enum MatchKind { 186*ccdc9c3eSSadaf Ebrahimi kFirstMatch, // like Perl, PCRE 187*ccdc9c3eSSadaf Ebrahimi kLongestMatch, // like egrep or POSIX 188*ccdc9c3eSSadaf Ebrahimi kFullMatch, // match only entire text; implies anchor==kAnchored 189*ccdc9c3eSSadaf Ebrahimi kManyMatch // for SearchDFA, records set of matches 190*ccdc9c3eSSadaf Ebrahimi }; 191*ccdc9c3eSSadaf Ebrahimi inst(int id)192*ccdc9c3eSSadaf Ebrahimi Inst *inst(int id) { return &inst_[id]; } start()193*ccdc9c3eSSadaf Ebrahimi int start() { return start_; } start_unanchored()194*ccdc9c3eSSadaf Ebrahimi int start_unanchored() { return start_unanchored_; } set_start(int start)195*ccdc9c3eSSadaf Ebrahimi void set_start(int start) { start_ = start; } set_start_unanchored(int start)196*ccdc9c3eSSadaf Ebrahimi void set_start_unanchored(int start) { start_unanchored_ = start; } size()197*ccdc9c3eSSadaf Ebrahimi int size() { return size_; } reversed()198*ccdc9c3eSSadaf Ebrahimi bool reversed() { return reversed_; } set_reversed(bool reversed)199*ccdc9c3eSSadaf Ebrahimi void set_reversed(bool reversed) { reversed_ = reversed; } list_count()200*ccdc9c3eSSadaf Ebrahimi int list_count() { return list_count_; } inst_count(InstOp op)201*ccdc9c3eSSadaf Ebrahimi int inst_count(InstOp op) { return inst_count_[op]; } set_dfa_mem(int64_t dfa_mem)202*ccdc9c3eSSadaf Ebrahimi void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } dfa_mem()203*ccdc9c3eSSadaf Ebrahimi int64_t dfa_mem() { return dfa_mem_; } flags()204*ccdc9c3eSSadaf Ebrahimi int flags() { return flags_; } set_flags(int flags)205*ccdc9c3eSSadaf Ebrahimi void set_flags(int flags) { flags_ = flags; } anchor_start()206*ccdc9c3eSSadaf Ebrahimi bool anchor_start() { return anchor_start_; } set_anchor_start(bool b)207*ccdc9c3eSSadaf Ebrahimi void set_anchor_start(bool b) { anchor_start_ = b; } anchor_end()208*ccdc9c3eSSadaf Ebrahimi bool anchor_end() { return anchor_end_; } set_anchor_end(bool b)209*ccdc9c3eSSadaf Ebrahimi void set_anchor_end(bool b) { anchor_end_ = b; } bytemap_range()210*ccdc9c3eSSadaf Ebrahimi int bytemap_range() { return bytemap_range_; } bytemap()211*ccdc9c3eSSadaf Ebrahimi const uint8_t* bytemap() { return bytemap_; } 212*ccdc9c3eSSadaf Ebrahimi 213*ccdc9c3eSSadaf Ebrahimi // Lazily computed. 214*ccdc9c3eSSadaf Ebrahimi int first_byte(); 215*ccdc9c3eSSadaf Ebrahimi 216*ccdc9c3eSSadaf Ebrahimi // Returns string representation of program for debugging. 217*ccdc9c3eSSadaf Ebrahimi string Dump(); 218*ccdc9c3eSSadaf Ebrahimi string DumpUnanchored(); 219*ccdc9c3eSSadaf Ebrahimi string DumpByteMap(); 220*ccdc9c3eSSadaf Ebrahimi 221*ccdc9c3eSSadaf Ebrahimi // Returns the set of kEmpty flags that are in effect at 222*ccdc9c3eSSadaf Ebrahimi // position p within context. 223*ccdc9c3eSSadaf Ebrahimi static uint32_t EmptyFlags(const StringPiece& context, const char* p); 224*ccdc9c3eSSadaf Ebrahimi 225*ccdc9c3eSSadaf Ebrahimi // Returns whether byte c is a word character: ASCII only. 226*ccdc9c3eSSadaf Ebrahimi // Used by the implementation of \b and \B. 227*ccdc9c3eSSadaf Ebrahimi // This is not right for Unicode, but: 228*ccdc9c3eSSadaf Ebrahimi // - it's hard to get right in a byte-at-a-time matching world 229*ccdc9c3eSSadaf Ebrahimi // (the DFA has only one-byte lookahead). 230*ccdc9c3eSSadaf Ebrahimi // - even if the lookahead were possible, the Progs would be huge. 231*ccdc9c3eSSadaf Ebrahimi // This crude approximation is the same one PCRE uses. IsWordChar(uint8_t c)232*ccdc9c3eSSadaf Ebrahimi static bool IsWordChar(uint8_t c) { 233*ccdc9c3eSSadaf Ebrahimi return ('A' <= c && c <= 'Z') || 234*ccdc9c3eSSadaf Ebrahimi ('a' <= c && c <= 'z') || 235*ccdc9c3eSSadaf Ebrahimi ('0' <= c && c <= '9') || 236*ccdc9c3eSSadaf Ebrahimi c == '_'; 237*ccdc9c3eSSadaf Ebrahimi } 238*ccdc9c3eSSadaf Ebrahimi 239*ccdc9c3eSSadaf Ebrahimi // Execution engines. They all search for the regexp (run the prog) 240*ccdc9c3eSSadaf Ebrahimi // in text, which is in the larger context (used for ^ $ \b etc). 241*ccdc9c3eSSadaf Ebrahimi // Anchor and kind control the kind of search. 242*ccdc9c3eSSadaf Ebrahimi // Returns true if match found, false if not. 243*ccdc9c3eSSadaf Ebrahimi // If match found, fills match[0..nmatch-1] with submatch info. 244*ccdc9c3eSSadaf Ebrahimi // match[0] is overall match, match[1] is first set of parens, etc. 245*ccdc9c3eSSadaf Ebrahimi // If a particular submatch is not matched during the regexp match, 246*ccdc9c3eSSadaf Ebrahimi // it is set to NULL. 247*ccdc9c3eSSadaf Ebrahimi // 248*ccdc9c3eSSadaf Ebrahimi // Matching text == StringPiece(NULL, 0) is treated as any other empty 249*ccdc9c3eSSadaf Ebrahimi // string, but note that on return, it will not be possible to distinguish 250*ccdc9c3eSSadaf Ebrahimi // submatches that matched that empty string from submatches that didn't 251*ccdc9c3eSSadaf Ebrahimi // match anything. Either way, match[i] == NULL. 252*ccdc9c3eSSadaf Ebrahimi 253*ccdc9c3eSSadaf Ebrahimi // Search using NFA: can find submatches but kind of slow. 254*ccdc9c3eSSadaf Ebrahimi bool SearchNFA(const StringPiece& text, const StringPiece& context, 255*ccdc9c3eSSadaf Ebrahimi Anchor anchor, MatchKind kind, 256*ccdc9c3eSSadaf Ebrahimi StringPiece* match, int nmatch); 257*ccdc9c3eSSadaf Ebrahimi 258*ccdc9c3eSSadaf Ebrahimi // Search using DFA: much faster than NFA but only finds 259*ccdc9c3eSSadaf Ebrahimi // end of match and can use a lot more memory. 260*ccdc9c3eSSadaf Ebrahimi // Returns whether a match was found. 261*ccdc9c3eSSadaf Ebrahimi // If the DFA runs out of memory, sets *failed to true and returns false. 262*ccdc9c3eSSadaf Ebrahimi // If matches != NULL and kind == kManyMatch and there is a match, 263*ccdc9c3eSSadaf Ebrahimi // SearchDFA fills matches with the match IDs of the final matching state. 264*ccdc9c3eSSadaf Ebrahimi bool SearchDFA(const StringPiece& text, const StringPiece& context, 265*ccdc9c3eSSadaf Ebrahimi Anchor anchor, MatchKind kind, StringPiece* match0, 266*ccdc9c3eSSadaf Ebrahimi bool* failed, SparseSet* matches); 267*ccdc9c3eSSadaf Ebrahimi 268*ccdc9c3eSSadaf Ebrahimi // The callback issued after building each DFA state with BuildEntireDFA(). 269*ccdc9c3eSSadaf Ebrahimi // If next is null, then the memory budget has been exhausted and building 270*ccdc9c3eSSadaf Ebrahimi // will halt. Otherwise, the state has been built and next points to an array 271*ccdc9c3eSSadaf Ebrahimi // of bytemap_range()+1 slots holding the next states as per the bytemap and 272*ccdc9c3eSSadaf Ebrahimi // kByteEndText. The number of the state is implied by the callback sequence: 273*ccdc9c3eSSadaf Ebrahimi // the first callback is for state 0, the second callback is for state 1, ... 274*ccdc9c3eSSadaf Ebrahimi // match indicates whether the state is a matching state. 275*ccdc9c3eSSadaf Ebrahimi using DFAStateCallback = std::function<void(const int* next, bool match)>; 276*ccdc9c3eSSadaf Ebrahimi 277*ccdc9c3eSSadaf Ebrahimi // Build the entire DFA for the given match kind. 278*ccdc9c3eSSadaf Ebrahimi // Usually the DFA is built out incrementally, as needed, which 279*ccdc9c3eSSadaf Ebrahimi // avoids lots of unnecessary work. 280*ccdc9c3eSSadaf Ebrahimi // If cb is not empty, it receives one callback per state built. 281*ccdc9c3eSSadaf Ebrahimi // Returns the number of states built. 282*ccdc9c3eSSadaf Ebrahimi // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. 283*ccdc9c3eSSadaf Ebrahimi int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb); 284*ccdc9c3eSSadaf Ebrahimi 285*ccdc9c3eSSadaf Ebrahimi // Controls whether the DFA should bail out early if the NFA would be faster. 286*ccdc9c3eSSadaf Ebrahimi // FOR TESTING ONLY. 287*ccdc9c3eSSadaf Ebrahimi static void TEST_dfa_should_bail_when_slow(bool b); 288*ccdc9c3eSSadaf Ebrahimi 289*ccdc9c3eSSadaf Ebrahimi // Compute bytemap. 290*ccdc9c3eSSadaf Ebrahimi void ComputeByteMap(); 291*ccdc9c3eSSadaf Ebrahimi 292*ccdc9c3eSSadaf Ebrahimi // Computes whether all matches must begin with the same first 293*ccdc9c3eSSadaf Ebrahimi // byte, and if so, returns that byte. If not, returns -1. 294*ccdc9c3eSSadaf Ebrahimi int ComputeFirstByte(); 295*ccdc9c3eSSadaf Ebrahimi 296*ccdc9c3eSSadaf Ebrahimi // Run peep-hole optimizer on program. 297*ccdc9c3eSSadaf Ebrahimi void Optimize(); 298*ccdc9c3eSSadaf Ebrahimi 299*ccdc9c3eSSadaf Ebrahimi // One-pass NFA: only correct if IsOnePass() is true, 300*ccdc9c3eSSadaf Ebrahimi // but much faster than NFA (competitive with PCRE) 301*ccdc9c3eSSadaf Ebrahimi // for those expressions. 302*ccdc9c3eSSadaf Ebrahimi bool IsOnePass(); 303*ccdc9c3eSSadaf Ebrahimi bool SearchOnePass(const StringPiece& text, const StringPiece& context, 304*ccdc9c3eSSadaf Ebrahimi Anchor anchor, MatchKind kind, 305*ccdc9c3eSSadaf Ebrahimi StringPiece* match, int nmatch); 306*ccdc9c3eSSadaf Ebrahimi 307*ccdc9c3eSSadaf Ebrahimi // Bit-state backtracking. Fast on small cases but uses memory 308*ccdc9c3eSSadaf Ebrahimi // proportional to the product of the program size and the text size. 309*ccdc9c3eSSadaf Ebrahimi bool SearchBitState(const StringPiece& text, const StringPiece& context, 310*ccdc9c3eSSadaf Ebrahimi Anchor anchor, MatchKind kind, 311*ccdc9c3eSSadaf Ebrahimi StringPiece* match, int nmatch); 312*ccdc9c3eSSadaf Ebrahimi 313*ccdc9c3eSSadaf Ebrahimi static const int kMaxOnePassCapture = 5; // $0 through $4 314*ccdc9c3eSSadaf Ebrahimi 315*ccdc9c3eSSadaf Ebrahimi // Backtracking search: the gold standard against which the other 316*ccdc9c3eSSadaf Ebrahimi // implementations are checked. FOR TESTING ONLY. 317*ccdc9c3eSSadaf Ebrahimi // It allocates a ton of memory to avoid running forever. 318*ccdc9c3eSSadaf Ebrahimi // It is also recursive, so can't use in production (will overflow stacks). 319*ccdc9c3eSSadaf Ebrahimi // The name "Unsafe" here is supposed to be a flag that 320*ccdc9c3eSSadaf Ebrahimi // you should not be using this function. 321*ccdc9c3eSSadaf Ebrahimi bool UnsafeSearchBacktrack(const StringPiece& text, 322*ccdc9c3eSSadaf Ebrahimi const StringPiece& context, 323*ccdc9c3eSSadaf Ebrahimi Anchor anchor, MatchKind kind, 324*ccdc9c3eSSadaf Ebrahimi StringPiece* match, int nmatch); 325*ccdc9c3eSSadaf Ebrahimi 326*ccdc9c3eSSadaf Ebrahimi // Computes range for any strings matching regexp. The min and max can in 327*ccdc9c3eSSadaf Ebrahimi // some cases be arbitrarily precise, so the caller gets to specify the 328*ccdc9c3eSSadaf Ebrahimi // maximum desired length of string returned. 329*ccdc9c3eSSadaf Ebrahimi // 330*ccdc9c3eSSadaf Ebrahimi // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any 331*ccdc9c3eSSadaf Ebrahimi // string s that is an anchored match for this regexp satisfies 332*ccdc9c3eSSadaf Ebrahimi // min <= s && s <= max. 333*ccdc9c3eSSadaf Ebrahimi // 334*ccdc9c3eSSadaf Ebrahimi // Note that PossibleMatchRange() will only consider the first copy of an 335*ccdc9c3eSSadaf Ebrahimi // infinitely repeated element (i.e., any regexp element followed by a '*' or 336*ccdc9c3eSSadaf Ebrahimi // '+' operator). Regexps with "{N}" constructions are not affected, as those 337*ccdc9c3eSSadaf Ebrahimi // do not compile down to infinite repetitions. 338*ccdc9c3eSSadaf Ebrahimi // 339*ccdc9c3eSSadaf Ebrahimi // Returns true on success, false on error. 340*ccdc9c3eSSadaf Ebrahimi bool PossibleMatchRange(string* min, string* max, int maxlen); 341*ccdc9c3eSSadaf Ebrahimi 342*ccdc9c3eSSadaf Ebrahimi // EXPERIMENTAL! SUBJECT TO CHANGE! 343*ccdc9c3eSSadaf Ebrahimi // Outputs the program fanout into the given sparse array. 344*ccdc9c3eSSadaf Ebrahimi void Fanout(SparseArray<int>* fanout); 345*ccdc9c3eSSadaf Ebrahimi 346*ccdc9c3eSSadaf Ebrahimi // Compiles a collection of regexps to Prog. Each regexp will have 347*ccdc9c3eSSadaf Ebrahimi // its own Match instruction recording the index in the output vector. 348*ccdc9c3eSSadaf Ebrahimi static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); 349*ccdc9c3eSSadaf Ebrahimi 350*ccdc9c3eSSadaf Ebrahimi // Flattens the Prog from "tree" form to "list" form. This is an in-place 351*ccdc9c3eSSadaf Ebrahimi // operation in the sense that the old instructions are lost. 352*ccdc9c3eSSadaf Ebrahimi void Flatten(); 353*ccdc9c3eSSadaf Ebrahimi 354*ccdc9c3eSSadaf Ebrahimi // Walks the Prog; the "successor roots" or predecessors of the reachable 355*ccdc9c3eSSadaf Ebrahimi // instructions are marked in rootmap or predmap/predvec, respectively. 356*ccdc9c3eSSadaf Ebrahimi // reachable and stk are preallocated scratch structures. 357*ccdc9c3eSSadaf Ebrahimi void MarkSuccessors(SparseArray<int>* rootmap, 358*ccdc9c3eSSadaf Ebrahimi SparseArray<int>* predmap, 359*ccdc9c3eSSadaf Ebrahimi std::vector<std::vector<int>>* predvec, 360*ccdc9c3eSSadaf Ebrahimi SparseSet* reachable, std::vector<int>* stk); 361*ccdc9c3eSSadaf Ebrahimi 362*ccdc9c3eSSadaf Ebrahimi // Walks the Prog from the given "root" instruction; the "dominator root" 363*ccdc9c3eSSadaf Ebrahimi // of the reachable instructions (if such exists) is marked in rootmap. 364*ccdc9c3eSSadaf Ebrahimi // reachable and stk are preallocated scratch structures. 365*ccdc9c3eSSadaf Ebrahimi void MarkDominator(int root, SparseArray<int>* rootmap, 366*ccdc9c3eSSadaf Ebrahimi SparseArray<int>* predmap, 367*ccdc9c3eSSadaf Ebrahimi std::vector<std::vector<int>>* predvec, 368*ccdc9c3eSSadaf Ebrahimi SparseSet* reachable, std::vector<int>* stk); 369*ccdc9c3eSSadaf Ebrahimi 370*ccdc9c3eSSadaf Ebrahimi // Walks the Prog from the given "root" instruction; the reachable 371*ccdc9c3eSSadaf Ebrahimi // instructions are emitted in "list" form and appended to flat. 372*ccdc9c3eSSadaf Ebrahimi // reachable and stk are preallocated scratch structures. 373*ccdc9c3eSSadaf Ebrahimi void EmitList(int root, SparseArray<int>* rootmap, 374*ccdc9c3eSSadaf Ebrahimi std::vector<Inst>* flat, 375*ccdc9c3eSSadaf Ebrahimi SparseSet* reachable, std::vector<int>* stk); 376*ccdc9c3eSSadaf Ebrahimi 377*ccdc9c3eSSadaf Ebrahimi private: 378*ccdc9c3eSSadaf Ebrahimi friend class Compiler; 379*ccdc9c3eSSadaf Ebrahimi 380*ccdc9c3eSSadaf Ebrahimi DFA* GetDFA(MatchKind kind); 381*ccdc9c3eSSadaf Ebrahimi void DeleteDFA(DFA* dfa); 382*ccdc9c3eSSadaf Ebrahimi 383*ccdc9c3eSSadaf Ebrahimi bool anchor_start_; // regexp has explicit start anchor 384*ccdc9c3eSSadaf Ebrahimi bool anchor_end_; // regexp has explicit end anchor 385*ccdc9c3eSSadaf Ebrahimi bool reversed_; // whether program runs backward over input 386*ccdc9c3eSSadaf Ebrahimi bool did_flatten_; // has Flatten been called? 387*ccdc9c3eSSadaf Ebrahimi bool did_onepass_; // has IsOnePass been called? 388*ccdc9c3eSSadaf Ebrahimi 389*ccdc9c3eSSadaf Ebrahimi int start_; // entry point for program 390*ccdc9c3eSSadaf Ebrahimi int start_unanchored_; // unanchored entry point for program 391*ccdc9c3eSSadaf Ebrahimi int size_; // number of instructions 392*ccdc9c3eSSadaf Ebrahimi int bytemap_range_; // bytemap_[x] < bytemap_range_ 393*ccdc9c3eSSadaf Ebrahimi int first_byte_; // required first byte for match, or -1 if none 394*ccdc9c3eSSadaf Ebrahimi int flags_; // regexp parse flags 395*ccdc9c3eSSadaf Ebrahimi 396*ccdc9c3eSSadaf Ebrahimi int list_count_; // count of lists (see above) 397*ccdc9c3eSSadaf Ebrahimi int inst_count_[kNumInst]; // count of instructions by opcode 398*ccdc9c3eSSadaf Ebrahimi 399*ccdc9c3eSSadaf Ebrahimi PODArray<Inst> inst_; // pointer to instruction array 400*ccdc9c3eSSadaf Ebrahimi uint8_t* onepass_nodes_; // data for OnePass nodes 401*ccdc9c3eSSadaf Ebrahimi 402*ccdc9c3eSSadaf Ebrahimi int64_t dfa_mem_; // Maximum memory for DFAs. 403*ccdc9c3eSSadaf Ebrahimi DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch 404*ccdc9c3eSSadaf Ebrahimi DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch 405*ccdc9c3eSSadaf Ebrahimi 406*ccdc9c3eSSadaf Ebrahimi uint8_t bytemap_[256]; // map from input bytes to byte classes 407*ccdc9c3eSSadaf Ebrahimi 408*ccdc9c3eSSadaf Ebrahimi std::once_flag first_byte_once_; 409*ccdc9c3eSSadaf Ebrahimi std::once_flag dfa_first_once_; 410*ccdc9c3eSSadaf Ebrahimi std::once_flag dfa_longest_once_; 411*ccdc9c3eSSadaf Ebrahimi 412*ccdc9c3eSSadaf Ebrahimi Prog(const Prog&) = delete; 413*ccdc9c3eSSadaf Ebrahimi Prog& operator=(const Prog&) = delete; 414*ccdc9c3eSSadaf Ebrahimi }; 415*ccdc9c3eSSadaf Ebrahimi 416*ccdc9c3eSSadaf Ebrahimi } // namespace re2 417*ccdc9c3eSSadaf Ebrahimi 418*ccdc9c3eSSadaf Ebrahimi #endif // RE2_PROG_H_ 419