// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//  rbbisetb.h
/*
**********************************************************************
*   Copyright (c) 2001-2005, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#ifndef RBBISETB_H
#define RBBISETB_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/ucptrie.h"
#include "unicode/umutablecptrie.h"
#include "unicode/uobject.h"
#include "rbbirb.h"
#include "uvector.h"

U_NAMESPACE_BEGIN

//
//  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
//                   from the
Unicode Sets appearing in the source RBBI rules, and 30*0e209d39SAndroid Build Coastguard Worker // creates the TRIE table used to map from Unicode to the 31*0e209d39SAndroid Build Coastguard Worker // character categories. 32*0e209d39SAndroid Build Coastguard Worker // 33*0e209d39SAndroid Build Coastguard Worker 34*0e209d39SAndroid Build Coastguard Worker 35*0e209d39SAndroid Build Coastguard Worker // 36*0e209d39SAndroid Build Coastguard Worker // RangeDescriptor 37*0e209d39SAndroid Build Coastguard Worker // 38*0e209d39SAndroid Build Coastguard Worker // Each of the non-overlapping character ranges gets one of these descriptors. 39*0e209d39SAndroid Build Coastguard Worker // All of them are strung together in a linked list, which is kept in order 40*0e209d39SAndroid Build Coastguard Worker // (by character) 41*0e209d39SAndroid Build Coastguard Worker // 42*0e209d39SAndroid Build Coastguard Worker class RangeDescriptor : public UMemory { 43*0e209d39SAndroid Build Coastguard Worker public: 44*0e209d39SAndroid Build Coastguard Worker UChar32 fStartChar {}; // Start of range, unicode 32 bit value. 45*0e209d39SAndroid Build Coastguard Worker UChar32 fEndChar {}; // End of range, unicode 32 bit value. 46*0e209d39SAndroid Build Coastguard Worker int32_t fNum {0}; // runtime-mapped input value for this range. 47*0e209d39SAndroid Build Coastguard Worker bool fIncludesDict {false}; // True if the range includes $dictionary. 48*0e209d39SAndroid Build Coastguard Worker bool fFirstInGroup {false}; // True if first range in a group with the same fNum. 49*0e209d39SAndroid Build Coastguard Worker UVector *fIncludesSets {nullptr}; // vector of the the original 50*0e209d39SAndroid Build Coastguard Worker // Unicode sets that include this range. 51*0e209d39SAndroid Build Coastguard Worker // (Contains ptrs to uset nodes) 52*0e209d39SAndroid Build Coastguard Worker RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list. 
53*0e209d39SAndroid Build Coastguard Worker 54*0e209d39SAndroid Build Coastguard Worker RangeDescriptor(UErrorCode &status); 55*0e209d39SAndroid Build Coastguard Worker RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); 56*0e209d39SAndroid Build Coastguard Worker ~RangeDescriptor(); 57*0e209d39SAndroid Build Coastguard Worker void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with 58*0e209d39SAndroid Build Coastguard Worker // where appearing in the second (higher) part. 59*0e209d39SAndroid Build Coastguard Worker bool isDictionaryRange(); // Check whether this range appears as part of 60*0e209d39SAndroid Build Coastguard Worker // the Unicode set named "dictionary" 61*0e209d39SAndroid Build Coastguard Worker 62*0e209d39SAndroid Build Coastguard Worker RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class 63*0e209d39SAndroid Build Coastguard Worker RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class 64*0e209d39SAndroid Build Coastguard Worker }; 65*0e209d39SAndroid Build Coastguard Worker 66*0e209d39SAndroid Build Coastguard Worker 67*0e209d39SAndroid Build Coastguard Worker // 68*0e209d39SAndroid Build Coastguard Worker // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. 69*0e209d39SAndroid Build Coastguard Worker // 70*0e209d39SAndroid Build Coastguard Worker // Starting with the rules parse tree from the scanner, 71*0e209d39SAndroid Build Coastguard Worker // 72*0e209d39SAndroid Build Coastguard Worker // - Enumerate the set of UnicodeSets that are referenced 73*0e209d39SAndroid Build Coastguard Worker // by the RBBI rules. 74*0e209d39SAndroid Build Coastguard Worker // - compute a derived set of non-overlapping UnicodeSets 75*0e209d39SAndroid Build Coastguard Worker // that will correspond to columns in the state table for 76*0e209d39SAndroid Build Coastguard Worker // the RBBI execution engine. 
77*0e209d39SAndroid Build Coastguard Worker // - construct the trie table that maps input characters 78*0e209d39SAndroid Build Coastguard Worker // to set numbers in the non-overlapping set of sets. 79*0e209d39SAndroid Build Coastguard Worker // 80*0e209d39SAndroid Build Coastguard Worker 81*0e209d39SAndroid Build Coastguard Worker 82*0e209d39SAndroid Build Coastguard Worker class RBBISetBuilder : public UMemory { 83*0e209d39SAndroid Build Coastguard Worker public: 84*0e209d39SAndroid Build Coastguard Worker RBBISetBuilder(RBBIRuleBuilder *rb); 85*0e209d39SAndroid Build Coastguard Worker ~RBBISetBuilder(); 86*0e209d39SAndroid Build Coastguard Worker 87*0e209d39SAndroid Build Coastguard Worker void buildRanges(); 88*0e209d39SAndroid Build Coastguard Worker void buildTrie(); 89*0e209d39SAndroid Build Coastguard Worker void addValToSets(UVector *sets, uint32_t val); 90*0e209d39SAndroid Build Coastguard Worker void addValToSet (RBBINode *usetNode, uint32_t val); 91*0e209d39SAndroid Build Coastguard Worker int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the 92*0e209d39SAndroid Build Coastguard Worker // runtime state machine, which are the same as 93*0e209d39SAndroid Build Coastguard Worker // columns in the DFA state table 94*0e209d39SAndroid Build Coastguard Worker int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or 95*0e209d39SAndroid Build Coastguard Worker // last category + 1 if there are no dictionary categories. 96*0e209d39SAndroid Build Coastguard Worker int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. 97*0e209d39SAndroid Build Coastguard Worker void serializeTrie(uint8_t *where); // write out the serialized Trie. 
98*0e209d39SAndroid Build Coastguard Worker UChar32 getFirstChar(int32_t val) const; 99*0e209d39SAndroid Build Coastguard Worker UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo 100*0e209d39SAndroid Build Coastguard Worker // character were encountered. 101*0e209d39SAndroid Build Coastguard Worker /** 102*0e209d39SAndroid Build Coastguard Worker * Merge two character categories that have been identified as having equivalent behavior. 103*0e209d39SAndroid Build Coastguard Worker * The ranges belonging to the second category (table column) will be added to the first. 104*0e209d39SAndroid Build Coastguard Worker * @param categories the pair of categories to be merged. 105*0e209d39SAndroid Build Coastguard Worker */ 106*0e209d39SAndroid Build Coastguard Worker void mergeCategories(IntPair categories); 107*0e209d39SAndroid Build Coastguard Worker 108*0e209d39SAndroid Build Coastguard Worker #ifdef RBBI_DEBUG 109*0e209d39SAndroid Build Coastguard Worker void printSets(); 110*0e209d39SAndroid Build Coastguard Worker void printRanges(); 111*0e209d39SAndroid Build Coastguard Worker void printRangeGroups(); 112*0e209d39SAndroid Build Coastguard Worker #else 113*0e209d39SAndroid Build Coastguard Worker #define printSets() 114*0e209d39SAndroid Build Coastguard Worker #define printRanges() 115*0e209d39SAndroid Build Coastguard Worker #define printRangeGroups() 116*0e209d39SAndroid Build Coastguard Worker #endif 117*0e209d39SAndroid Build Coastguard Worker 118*0e209d39SAndroid Build Coastguard Worker private: 119*0e209d39SAndroid Build Coastguard Worker RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. 
120*0e209d39SAndroid Build Coastguard Worker UErrorCode *fStatus; 121*0e209d39SAndroid Build Coastguard Worker 122*0e209d39SAndroid Build Coastguard Worker RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors 123*0e209d39SAndroid Build Coastguard Worker 124*0e209d39SAndroid Build Coastguard Worker UMutableCPTrie *fMutableTrie; // The mapping TRIE that is the end result of processing 125*0e209d39SAndroid Build Coastguard Worker UCPTrie *fTrie; // the Unicode Sets. 126*0e209d39SAndroid Build Coastguard Worker uint32_t fTrieSize; 127*0e209d39SAndroid Build Coastguard Worker 128*0e209d39SAndroid Build Coastguard Worker // Number of range groups, which are groups of ranges that are in the same original UnicodeSets. 129*0e209d39SAndroid Build Coastguard Worker int32_t fGroupCount; 130*0e209d39SAndroid Build Coastguard Worker 131*0e209d39SAndroid Build Coastguard Worker // The number of the first dictionary char category. 132*0e209d39SAndroid Build Coastguard Worker // If there are no Dictionary categories, set to the last category + 1. 
133*0e209d39SAndroid Build Coastguard Worker int32_t fDictCategoriesStart; 134*0e209d39SAndroid Build Coastguard Worker 135*0e209d39SAndroid Build Coastguard Worker UBool fSawBOF; 136*0e209d39SAndroid Build Coastguard Worker 137*0e209d39SAndroid Build Coastguard Worker RBBISetBuilder(const RBBISetBuilder &other) = delete; // forbid copying of this class 138*0e209d39SAndroid Build Coastguard Worker RBBISetBuilder &operator=(const RBBISetBuilder &other) = delete; // forbid copying of this class 139*0e209d39SAndroid Build Coastguard Worker }; 140*0e209d39SAndroid Build Coastguard Worker 141*0e209d39SAndroid Build Coastguard Worker 142*0e209d39SAndroid Build Coastguard Worker 143*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 144*0e209d39SAndroid Build Coastguard Worker 145*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 146*0e209d39SAndroid Build Coastguard Worker 147*0e209d39SAndroid Build Coastguard Worker #endif 148