1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // rbbirb.h 5 // 6 // Copyright (C) 2002-2008, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains declarations for several classes from the 10 // Rule Based Break Iterator rule builder. 11 // 12 13 14 #ifndef RBBIRB_H 15 #define RBBIRB_H 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_BREAK_ITERATION 20 21 #include <utility> 22 23 #include "unicode/uobject.h" 24 #include "unicode/rbbi.h" 25 #include "unicode/uniset.h" 26 #include "unicode/parseerr.h" 27 #include "uhash.h" 28 #include "uvector.h" 29 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 30 // looks up references to $variables within a set. 31 32 33 U_NAMESPACE_BEGIN 34 35 class RBBIRuleScanner; 36 struct RBBIRuleTableEl; 37 class RBBISetBuilder; 38 class RBBINode; 39 class RBBITableBuilder; 40 41 42 43 //-------------------------------------------------------------------------------- 44 // 45 // RBBISymbolTable. Implements SymbolTable interface that is used by the 46 // UnicodeSet parser to resolve references to $variables. 47 // 48 //-------------------------------------------------------------------------------- 49 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one 50 public: // of these structs for each entry. 51 RBBISymbolTableEntry(); 52 UnicodeString key; 53 RBBINode *val; 54 ~RBBISymbolTableEntry(); 55 56 private: 57 RBBISymbolTableEntry(const RBBISymbolTableEntry &other) = delete; // forbid copying of this class 58 RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other) = delete; // forbid copying of this class 59 }; 60 61 62 class RBBISymbolTable : public UMemory, public SymbolTable { 63 private: 64 const UnicodeString &fRules; 65 UHashtable *fHashTable; 66 RBBIRuleScanner *fRuleScanner; 67 68 // These next two fields are part of the mechanism for passing references to 69 // already-constructed UnicodeSets back to the UnicodeSet constructor 70 // when the pattern includes $variable references. 71 const UnicodeString ffffString; // = "/uffff" 72 UnicodeSet *fCachedSetLookup; 73 74 public: 75 // API inherited from class SymbolTable 76 virtual const UnicodeString* lookup(const UnicodeString& s) const override; 77 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const override; 78 virtual UnicodeString parseReference(const UnicodeString& text, 79 ParsePosition& pos, int32_t limit) const override; 80 81 // Additional Functions 82 RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); 83 virtual ~RBBISymbolTable(); 84 85 virtual RBBINode *lookupNode(const UnicodeString &key) const; 86 virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); 87 88 #ifdef RBBI_DEBUG 89 virtual void rbbiSymtablePrint() const; 90 #else 91 // A do-nothing inline function for non-debug builds. Member funcs can't be empty 92 // or the call sites won't compile. 93 int32_t fFakeField; 94 #define rbbiSymtablePrint() fFakeField=0; 95 #endif 96 97 private: 98 RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class 99 RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class 100 }; 101 102 103 //-------------------------------------------------------------------------------- 104 // 105 // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. 106 // 107 //-------------------------------------------------------------------------------- 108 class RBBIRuleBuilder : public UMemory { 109 public: 110 111 // Create a rule based break iterator from a set of rules. 112 // This function is the main entry point into the rule builder. The 113 // public ICU API for creating RBBIs uses this function to do the actual work. 114 // 115 static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, 116 UParseError *parseError, 117 UErrorCode &status); 118 119 public: 120 // The "public" functions and data members that appear below are accessed 121 // (and shared) by the various parts that make up the rule builder. They 122 // are NOT intended to be accessed by anything outside of the 123 // rule builder implementation. 124 RBBIRuleBuilder(const UnicodeString &rules, 125 UParseError *parseErr, 126 UErrorCode &status 127 ); 128 129 virtual ~RBBIRuleBuilder(); 130 131 /** 132 * Build the state tables and char class Trie from the source rules. 133 */ 134 RBBIDataHeader *build(UErrorCode &status); 135 136 137 /** 138 * Fold together redundant character classes (table columns) and 139 * redundant states (table rows). Done after initial table generation, 140 * before serializing the result. 141 */ 142 void optimizeTables(); 143 144 char *fDebugEnv; // controls debug trace output 145 UErrorCode *fStatus; // Error reporting. Keeping status 146 UParseError *fParseError; // here avoids passing it everywhere. 147 const UnicodeString &fRules; // The rule string that we are compiling 148 UnicodeString fStrippedRules; // The rule string, with comments stripped. 149 150 RBBIRuleScanner *fScanner; // The scanner. 151 RBBINode *fForwardTree; // The parse trees, generated by the scanner, 152 RBBINode *fReverseTree; // then manipulated by subsequent steps. 153 RBBINode *fSafeFwdTree; 154 RBBINode *fSafeRevTree; 155 156 RBBINode **fDefaultTree; // For rules not qualified with a ! 157 // the tree to which they belong to. 158 159 UBool fChainRules; // True for chained Unicode TR style rules. 160 // False for traditional regexp rules. 161 162 UBool fLookAheadHardBreak; // True: Look ahead matches cause an 163 // immediate break, no continuing for the 164 // longest match. 165 166 RBBISetBuilder *fSetBuilder; // Set and Character Category builder. 167 UVector *fUSetNodes; // Vector of all uset nodes. 168 169 RBBITableBuilder *fForwardTable; // State transition table, build time form. 170 171 UVector *fRuleStatusVals; // The values that can be returned 172 // from getRuleStatus(). 173 174 RBBIDataHeader *flattenData(); // Create the flattened (runtime format) 175 // data tables.. 176 private: 177 RBBIRuleBuilder(const RBBIRuleBuilder &other) = delete; // forbid copying of this class 178 RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other) = delete; // forbid copying of this class 179 }; 180 181 182 183 184 //---------------------------------------------------------------------------- 185 // 186 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 187 // been encountered. The val Node will be of nodetype uset 188 // and contain pointers to the actual UnicodeSets. 189 // The Key is the source string for initializing the set. 190 // 191 // The hash table is used to avoid creating duplicate 192 // unnamed (not $var references) UnicodeSets. 193 // 194 // Memory Management: 195 // The Hash Table owns these RBBISetTableEl structs and 196 // the key strings. It does NOT own the val nodes. 197 // 198 //---------------------------------------------------------------------------- 199 struct RBBISetTableEl { 200 UnicodeString *key; 201 RBBINode *val; 202 }; 203 204 /** 205 * A pair of ints, used to bundle pairs of states or pairs of character classes. 206 */ 207 typedef std::pair<int32_t, int32_t> IntPair; 208 209 210 //---------------------------------------------------------------------------- 211 // 212 // RBBIDebugPrintf Printf equivalent, for debugging output. 213 // Conditional compilation of the implementation lets us 214 // get rid of the stdio dependency in environments where it 215 // is unavailable. 216 // 217 //---------------------------------------------------------------------------- 218 #ifdef RBBI_DEBUG 219 #include <stdio.h> 220 #define RBBIDebugPrintf printf 221 #define RBBIDebugPuts puts 222 #else 223 #undef RBBIDebugPrintf 224 #define RBBIDebugPuts(arg) 225 #endif 226 227 U_NAMESPACE_END 228 229 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 230 231 #endif 232 233 234 235