1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker // 4*0e209d39SAndroid Build Coastguard Worker // rbbirb.h 5*0e209d39SAndroid Build Coastguard Worker // 6*0e209d39SAndroid Build Coastguard Worker // Copyright (C) 2002-2008, International Business Machines Corporation and others. 7*0e209d39SAndroid Build Coastguard Worker // All Rights Reserved. 8*0e209d39SAndroid Build Coastguard Worker // 9*0e209d39SAndroid Build Coastguard Worker // This file contains declarations for several classes from the 10*0e209d39SAndroid Build Coastguard Worker // Rule Based Break Iterator rule builder. 11*0e209d39SAndroid Build Coastguard Worker // 12*0e209d39SAndroid Build Coastguard Worker 13*0e209d39SAndroid Build Coastguard Worker 14*0e209d39SAndroid Build Coastguard Worker #ifndef RBBIRB_H 15*0e209d39SAndroid Build Coastguard Worker #define RBBIRB_H 16*0e209d39SAndroid Build Coastguard Worker 17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 18*0e209d39SAndroid Build Coastguard Worker 19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_BREAK_ITERATION 20*0e209d39SAndroid Build Coastguard Worker 21*0e209d39SAndroid Build Coastguard Worker #include <utility> 22*0e209d39SAndroid Build Coastguard Worker 23*0e209d39SAndroid Build Coastguard Worker #include "unicode/uobject.h" 24*0e209d39SAndroid Build Coastguard Worker #include "unicode/rbbi.h" 25*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h" 26*0e209d39SAndroid Build Coastguard Worker #include "unicode/parseerr.h" 27*0e209d39SAndroid Build Coastguard Worker #include "uhash.h" 28*0e209d39SAndroid Build Coastguard Worker #include "uvector.h" 29*0e209d39SAndroid Build Coastguard Worker #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 30*0e209d39SAndroid Build Coastguard Worker // looks up references to $variables within a set. 31*0e209d39SAndroid Build Coastguard Worker 32*0e209d39SAndroid Build Coastguard Worker 33*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 34*0e209d39SAndroid Build Coastguard Worker 35*0e209d39SAndroid Build Coastguard Worker class RBBIRuleScanner; 36*0e209d39SAndroid Build Coastguard Worker struct RBBIRuleTableEl; 37*0e209d39SAndroid Build Coastguard Worker class RBBISetBuilder; 38*0e209d39SAndroid Build Coastguard Worker class RBBINode; 39*0e209d39SAndroid Build Coastguard Worker class RBBITableBuilder; 40*0e209d39SAndroid Build Coastguard Worker 41*0e209d39SAndroid Build Coastguard Worker 42*0e209d39SAndroid Build Coastguard Worker 43*0e209d39SAndroid Build Coastguard Worker //-------------------------------------------------------------------------------- 44*0e209d39SAndroid Build Coastguard Worker // 45*0e209d39SAndroid Build Coastguard Worker // RBBISymbolTable. Implements SymbolTable interface that is used by the 46*0e209d39SAndroid Build Coastguard Worker // UnicodeSet parser to resolve references to $variables. 47*0e209d39SAndroid Build Coastguard Worker // 48*0e209d39SAndroid Build Coastguard Worker //-------------------------------------------------------------------------------- 49*0e209d39SAndroid Build Coastguard Worker class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one 50*0e209d39SAndroid Build Coastguard Worker public: // of these structs for each entry. 51*0e209d39SAndroid Build Coastguard Worker RBBISymbolTableEntry(); 52*0e209d39SAndroid Build Coastguard Worker UnicodeString key; 53*0e209d39SAndroid Build Coastguard Worker RBBINode *val; 54*0e209d39SAndroid Build Coastguard Worker ~RBBISymbolTableEntry(); 55*0e209d39SAndroid Build Coastguard Worker 56*0e209d39SAndroid Build Coastguard Worker private: 57*0e209d39SAndroid Build Coastguard Worker RBBISymbolTableEntry(const RBBISymbolTableEntry &other) = delete; // forbid copying of this class 58*0e209d39SAndroid Build Coastguard Worker RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other) = delete; // forbid copying of this class 59*0e209d39SAndroid Build Coastguard Worker }; 60*0e209d39SAndroid Build Coastguard Worker 61*0e209d39SAndroid Build Coastguard Worker 62*0e209d39SAndroid Build Coastguard Worker class RBBISymbolTable : public UMemory, public SymbolTable { 63*0e209d39SAndroid Build Coastguard Worker private: 64*0e209d39SAndroid Build Coastguard Worker const UnicodeString &fRules; 65*0e209d39SAndroid Build Coastguard Worker UHashtable *fHashTable; 66*0e209d39SAndroid Build Coastguard Worker RBBIRuleScanner *fRuleScanner; 67*0e209d39SAndroid Build Coastguard Worker 68*0e209d39SAndroid Build Coastguard Worker // These next two fields are part of the mechanism for passing references to 69*0e209d39SAndroid Build Coastguard Worker // already-constructed UnicodeSets back to the UnicodeSet constructor 70*0e209d39SAndroid Build Coastguard Worker // when the pattern includes $variable references. 71*0e209d39SAndroid Build Coastguard Worker const UnicodeString ffffString; // = "/uffff" 72*0e209d39SAndroid Build Coastguard Worker UnicodeSet *fCachedSetLookup; 73*0e209d39SAndroid Build Coastguard Worker 74*0e209d39SAndroid Build Coastguard Worker public: 75*0e209d39SAndroid Build Coastguard Worker // API inherited from class SymbolTable 76*0e209d39SAndroid Build Coastguard Worker virtual const UnicodeString* lookup(const UnicodeString& s) const override; 77*0e209d39SAndroid Build Coastguard Worker virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const override; 78*0e209d39SAndroid Build Coastguard Worker virtual UnicodeString parseReference(const UnicodeString& text, 79*0e209d39SAndroid Build Coastguard Worker ParsePosition& pos, int32_t limit) const override; 80*0e209d39SAndroid Build Coastguard Worker 81*0e209d39SAndroid Build Coastguard Worker // Additional Functions 82*0e209d39SAndroid Build Coastguard Worker RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); 83*0e209d39SAndroid Build Coastguard Worker virtual ~RBBISymbolTable(); 84*0e209d39SAndroid Build Coastguard Worker 85*0e209d39SAndroid Build Coastguard Worker virtual RBBINode *lookupNode(const UnicodeString &key) const; 86*0e209d39SAndroid Build Coastguard Worker virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); 87*0e209d39SAndroid Build Coastguard Worker 88*0e209d39SAndroid Build Coastguard Worker #ifdef RBBI_DEBUG 89*0e209d39SAndroid Build Coastguard Worker virtual void rbbiSymtablePrint() const; 90*0e209d39SAndroid Build Coastguard Worker #else 91*0e209d39SAndroid Build Coastguard Worker // A do-nothing inline function for non-debug builds. Member funcs can't be empty 92*0e209d39SAndroid Build Coastguard Worker // or the call sites won't compile. 93*0e209d39SAndroid Build Coastguard Worker int32_t fFakeField; 94*0e209d39SAndroid Build Coastguard Worker #define rbbiSymtablePrint() fFakeField=0; 95*0e209d39SAndroid Build Coastguard Worker #endif 96*0e209d39SAndroid Build Coastguard Worker 97*0e209d39SAndroid Build Coastguard Worker private: 98*0e209d39SAndroid Build Coastguard Worker RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class 99*0e209d39SAndroid Build Coastguard Worker RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class 100*0e209d39SAndroid Build Coastguard Worker }; 101*0e209d39SAndroid Build Coastguard Worker 102*0e209d39SAndroid Build Coastguard Worker 103*0e209d39SAndroid Build Coastguard Worker //-------------------------------------------------------------------------------- 104*0e209d39SAndroid Build Coastguard Worker // 105*0e209d39SAndroid Build Coastguard Worker // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. 106*0e209d39SAndroid Build Coastguard Worker // 107*0e209d39SAndroid Build Coastguard Worker //-------------------------------------------------------------------------------- 108*0e209d39SAndroid Build Coastguard Worker class RBBIRuleBuilder : public UMemory { 109*0e209d39SAndroid Build Coastguard Worker public: 110*0e209d39SAndroid Build Coastguard Worker 111*0e209d39SAndroid Build Coastguard Worker // Create a rule based break iterator from a set of rules. 112*0e209d39SAndroid Build Coastguard Worker // This function is the main entry point into the rule builder. The 113*0e209d39SAndroid Build Coastguard Worker // public ICU API for creating RBBIs uses this function to do the actual work. 114*0e209d39SAndroid Build Coastguard Worker // 115*0e209d39SAndroid Build Coastguard Worker static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, 116*0e209d39SAndroid Build Coastguard Worker UParseError *parseError, 117*0e209d39SAndroid Build Coastguard Worker UErrorCode &status); 118*0e209d39SAndroid Build Coastguard Worker 119*0e209d39SAndroid Build Coastguard Worker public: 120*0e209d39SAndroid Build Coastguard Worker // The "public" functions and data members that appear below are accessed 121*0e209d39SAndroid Build Coastguard Worker // (and shared) by the various parts that make up the rule builder. They 122*0e209d39SAndroid Build Coastguard Worker // are NOT intended to be accessed by anything outside of the 123*0e209d39SAndroid Build Coastguard Worker // rule builder implementation. 124*0e209d39SAndroid Build Coastguard Worker RBBIRuleBuilder(const UnicodeString &rules, 125*0e209d39SAndroid Build Coastguard Worker UParseError *parseErr, 126*0e209d39SAndroid Build Coastguard Worker UErrorCode &status 127*0e209d39SAndroid Build Coastguard Worker ); 128*0e209d39SAndroid Build Coastguard Worker 129*0e209d39SAndroid Build Coastguard Worker virtual ~RBBIRuleBuilder(); 130*0e209d39SAndroid Build Coastguard Worker 131*0e209d39SAndroid Build Coastguard Worker /** 132*0e209d39SAndroid Build Coastguard Worker * Build the state tables and char class Trie from the source rules. 133*0e209d39SAndroid Build Coastguard Worker */ 134*0e209d39SAndroid Build Coastguard Worker RBBIDataHeader *build(UErrorCode &status); 135*0e209d39SAndroid Build Coastguard Worker 136*0e209d39SAndroid Build Coastguard Worker 137*0e209d39SAndroid Build Coastguard Worker /** 138*0e209d39SAndroid Build Coastguard Worker * Fold together redundant character classes (table columns) and 139*0e209d39SAndroid Build Coastguard Worker * redundant states (table rows). Done after initial table generation, 140*0e209d39SAndroid Build Coastguard Worker * before serializing the result. 141*0e209d39SAndroid Build Coastguard Worker */ 142*0e209d39SAndroid Build Coastguard Worker void optimizeTables(); 143*0e209d39SAndroid Build Coastguard Worker 144*0e209d39SAndroid Build Coastguard Worker char *fDebugEnv; // controls debug trace output 145*0e209d39SAndroid Build Coastguard Worker UErrorCode *fStatus; // Error reporting. Keeping status 146*0e209d39SAndroid Build Coastguard Worker UParseError *fParseError; // here avoids passing it everywhere. 147*0e209d39SAndroid Build Coastguard Worker const UnicodeString &fRules; // The rule string that we are compiling 148*0e209d39SAndroid Build Coastguard Worker UnicodeString fStrippedRules; // The rule string, with comments stripped. 149*0e209d39SAndroid Build Coastguard Worker 150*0e209d39SAndroid Build Coastguard Worker RBBIRuleScanner *fScanner; // The scanner. 151*0e209d39SAndroid Build Coastguard Worker RBBINode *fForwardTree; // The parse trees, generated by the scanner, 152*0e209d39SAndroid Build Coastguard Worker RBBINode *fReverseTree; // then manipulated by subsequent steps. 153*0e209d39SAndroid Build Coastguard Worker RBBINode *fSafeFwdTree; 154*0e209d39SAndroid Build Coastguard Worker RBBINode *fSafeRevTree; 155*0e209d39SAndroid Build Coastguard Worker 156*0e209d39SAndroid Build Coastguard Worker RBBINode **fDefaultTree; // For rules not qualified with a ! 157*0e209d39SAndroid Build Coastguard Worker // the tree to which they belong to. 158*0e209d39SAndroid Build Coastguard Worker 159*0e209d39SAndroid Build Coastguard Worker UBool fChainRules; // True for chained Unicode TR style rules. 160*0e209d39SAndroid Build Coastguard Worker // False for traditional regexp rules. 161*0e209d39SAndroid Build Coastguard Worker 162*0e209d39SAndroid Build Coastguard Worker UBool fLookAheadHardBreak; // True: Look ahead matches cause an 163*0e209d39SAndroid Build Coastguard Worker // immediate break, no continuing for the 164*0e209d39SAndroid Build Coastguard Worker // longest match. 165*0e209d39SAndroid Build Coastguard Worker 166*0e209d39SAndroid Build Coastguard Worker RBBISetBuilder *fSetBuilder; // Set and Character Category builder. 167*0e209d39SAndroid Build Coastguard Worker UVector *fUSetNodes; // Vector of all uset nodes. 168*0e209d39SAndroid Build Coastguard Worker 169*0e209d39SAndroid Build Coastguard Worker RBBITableBuilder *fForwardTable; // State transition table, build time form. 170*0e209d39SAndroid Build Coastguard Worker 171*0e209d39SAndroid Build Coastguard Worker UVector *fRuleStatusVals; // The values that can be returned 172*0e209d39SAndroid Build Coastguard Worker // from getRuleStatus(). 173*0e209d39SAndroid Build Coastguard Worker 174*0e209d39SAndroid Build Coastguard Worker RBBIDataHeader *flattenData(); // Create the flattened (runtime format) 175*0e209d39SAndroid Build Coastguard Worker // data tables.. 176*0e209d39SAndroid Build Coastguard Worker private: 177*0e209d39SAndroid Build Coastguard Worker RBBIRuleBuilder(const RBBIRuleBuilder &other) = delete; // forbid copying of this class 178*0e209d39SAndroid Build Coastguard Worker RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other) = delete; // forbid copying of this class 179*0e209d39SAndroid Build Coastguard Worker }; 180*0e209d39SAndroid Build Coastguard Worker 181*0e209d39SAndroid Build Coastguard Worker 182*0e209d39SAndroid Build Coastguard Worker 183*0e209d39SAndroid Build Coastguard Worker 184*0e209d39SAndroid Build Coastguard Worker //---------------------------------------------------------------------------- 185*0e209d39SAndroid Build Coastguard Worker // 186*0e209d39SAndroid Build Coastguard Worker // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 187*0e209d39SAndroid Build Coastguard Worker // been encountered. The val Node will be of nodetype uset 188*0e209d39SAndroid Build Coastguard Worker // and contain pointers to the actual UnicodeSets. 189*0e209d39SAndroid Build Coastguard Worker // The Key is the source string for initializing the set. 190*0e209d39SAndroid Build Coastguard Worker // 191*0e209d39SAndroid Build Coastguard Worker // The hash table is used to avoid creating duplicate 192*0e209d39SAndroid Build Coastguard Worker // unnamed (not $var references) UnicodeSets. 193*0e209d39SAndroid Build Coastguard Worker // 194*0e209d39SAndroid Build Coastguard Worker // Memory Management: 195*0e209d39SAndroid Build Coastguard Worker // The Hash Table owns these RBBISetTableEl structs and 196*0e209d39SAndroid Build Coastguard Worker // the key strings. It does NOT own the val nodes. 197*0e209d39SAndroid Build Coastguard Worker // 198*0e209d39SAndroid Build Coastguard Worker //---------------------------------------------------------------------------- 199*0e209d39SAndroid Build Coastguard Worker struct RBBISetTableEl { 200*0e209d39SAndroid Build Coastguard Worker UnicodeString *key; 201*0e209d39SAndroid Build Coastguard Worker RBBINode *val; 202*0e209d39SAndroid Build Coastguard Worker }; 203*0e209d39SAndroid Build Coastguard Worker 204*0e209d39SAndroid Build Coastguard Worker /** 205*0e209d39SAndroid Build Coastguard Worker * A pair of ints, used to bundle pairs of states or pairs of character classes. 206*0e209d39SAndroid Build Coastguard Worker */ 207*0e209d39SAndroid Build Coastguard Worker typedef std::pair<int32_t, int32_t> IntPair; 208*0e209d39SAndroid Build Coastguard Worker 209*0e209d39SAndroid Build Coastguard Worker 210*0e209d39SAndroid Build Coastguard Worker //---------------------------------------------------------------------------- 211*0e209d39SAndroid Build Coastguard Worker // 212*0e209d39SAndroid Build Coastguard Worker // RBBIDebugPrintf Printf equivalent, for debugging output. 213*0e209d39SAndroid Build Coastguard Worker // Conditional compilation of the implementation lets us 214*0e209d39SAndroid Build Coastguard Worker // get rid of the stdio dependency in environments where it 215*0e209d39SAndroid Build Coastguard Worker // is unavailable. 216*0e209d39SAndroid Build Coastguard Worker // 217*0e209d39SAndroid Build Coastguard Worker //---------------------------------------------------------------------------- 218*0e209d39SAndroid Build Coastguard Worker #ifdef RBBI_DEBUG 219*0e209d39SAndroid Build Coastguard Worker #include <stdio.h> 220*0e209d39SAndroid Build Coastguard Worker #define RBBIDebugPrintf printf 221*0e209d39SAndroid Build Coastguard Worker #define RBBIDebugPuts puts 222*0e209d39SAndroid Build Coastguard Worker #else 223*0e209d39SAndroid Build Coastguard Worker #undef RBBIDebugPrintf 224*0e209d39SAndroid Build Coastguard Worker #define RBBIDebugPuts(arg) 225*0e209d39SAndroid Build Coastguard Worker #endif 226*0e209d39SAndroid Build Coastguard Worker 227*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 228*0e209d39SAndroid Build Coastguard Worker 229*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 230*0e209d39SAndroid Build Coastguard Worker 231*0e209d39SAndroid Build Coastguard Worker #endif 232*0e209d39SAndroid Build Coastguard Worker 233*0e209d39SAndroid Build Coastguard Worker 234*0e209d39SAndroid Build Coastguard Worker 235