1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 5*0e209d39SAndroid Build Coastguard Worker * 6*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 1999-2014 International Business Machines 7*0e209d39SAndroid Build Coastguard Worker * Corporation and others. All Rights Reserved. 8*0e209d39SAndroid Build Coastguard Worker * 9*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 10*0e209d39SAndroid Build Coastguard Worker * file name: rbbidata.h 11*0e209d39SAndroid Build Coastguard Worker * encoding: UTF-8 12*0e209d39SAndroid Build Coastguard Worker * tab size: 8 (not used) 13*0e209d39SAndroid Build Coastguard Worker * indentation:4 14*0e209d39SAndroid Build Coastguard Worker * 15*0e209d39SAndroid Build Coastguard Worker * RBBI data formats Includes 16*0e209d39SAndroid Build Coastguard Worker * 17*0e209d39SAndroid Build Coastguard Worker * Structs that describes the format of the Binary RBBI data, 18*0e209d39SAndroid Build Coastguard Worker * as it is stored in ICU's data file. 19*0e209d39SAndroid Build Coastguard Worker * 20*0e209d39SAndroid Build Coastguard Worker * RBBIDataWrapper - Instances of this class sit between the 21*0e209d39SAndroid Build Coastguard Worker * raw data structs and the RulesBasedBreakIterator objects 22*0e209d39SAndroid Build Coastguard Worker * that are created by applications. The wrapper class 23*0e209d39SAndroid Build Coastguard Worker * provides reference counting for the underlying data, 24*0e209d39SAndroid Build Coastguard Worker * and direct pointers to data that would not otherwise 25*0e209d39SAndroid Build Coastguard Worker * be accessible without ugly pointer arithmetic. The 26*0e209d39SAndroid Build Coastguard Worker * wrapper does not attempt to provide any higher level 27*0e209d39SAndroid Build Coastguard Worker * abstractions for the data itself. 28*0e209d39SAndroid Build Coastguard Worker * 29*0e209d39SAndroid Build Coastguard Worker * There will be only one instance of RBBIDataWrapper for any 30*0e209d39SAndroid Build Coastguard Worker * set of RBBI run time data being shared by instances 31*0e209d39SAndroid Build Coastguard Worker * (clones) of RulesBasedBreakIterator. 32*0e209d39SAndroid Build Coastguard Worker */ 33*0e209d39SAndroid Build Coastguard Worker 34*0e209d39SAndroid Build Coastguard Worker #ifndef __RBBIDATA_H__ 35*0e209d39SAndroid Build Coastguard Worker #define __RBBIDATA_H__ 36*0e209d39SAndroid Build Coastguard Worker 37*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 38*0e209d39SAndroid Build Coastguard Worker #include "unicode/udata.h" 39*0e209d39SAndroid Build Coastguard Worker #include "udataswp.h" 40*0e209d39SAndroid Build Coastguard Worker 41*0e209d39SAndroid Build Coastguard Worker /** 42*0e209d39SAndroid Build Coastguard Worker * Swap RBBI data. See udataswp.h. 43*0e209d39SAndroid Build Coastguard Worker * @internal 44*0e209d39SAndroid Build Coastguard Worker */ 45*0e209d39SAndroid Build Coastguard Worker U_CAPI int32_t U_EXPORT2 46*0e209d39SAndroid Build Coastguard Worker ubrk_swap(const UDataSwapper *ds, 47*0e209d39SAndroid Build Coastguard Worker const void *inData, int32_t length, void *outData, 48*0e209d39SAndroid Build Coastguard Worker UErrorCode *pErrorCode); 49*0e209d39SAndroid Build Coastguard Worker 50*0e209d39SAndroid Build Coastguard Worker #ifdef __cplusplus 51*0e209d39SAndroid Build Coastguard Worker 52*0e209d39SAndroid Build Coastguard Worker #include "unicode/ucptrie.h" 53*0e209d39SAndroid Build Coastguard Worker #include "unicode/uobject.h" 54*0e209d39SAndroid Build Coastguard Worker #include "unicode/unistr.h" 55*0e209d39SAndroid Build Coastguard Worker #include "unicode/uversion.h" 56*0e209d39SAndroid Build Coastguard Worker #include "umutex.h" 57*0e209d39SAndroid Build Coastguard Worker 58*0e209d39SAndroid Build Coastguard Worker 59*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 60*0e209d39SAndroid Build Coastguard Worker 61*0e209d39SAndroid Build Coastguard Worker // The current RBBI data format version. 62*0e209d39SAndroid Build Coastguard Worker static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0}; 63*0e209d39SAndroid Build Coastguard Worker 64*0e209d39SAndroid Build Coastguard Worker /* 65*0e209d39SAndroid Build Coastguard Worker * The following structs map exactly onto the raw data from ICU common data file. 66*0e209d39SAndroid Build Coastguard Worker */ 67*0e209d39SAndroid Build Coastguard Worker struct RBBIDataHeader { 68*0e209d39SAndroid Build Coastguard Worker uint32_t fMagic; /* == 0xbla0 */ 69*0e209d39SAndroid Build Coastguard Worker UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */ 70*0e209d39SAndroid Build Coastguard Worker /* if there is one associated with this data. */ 71*0e209d39SAndroid Build Coastguard Worker /* (version originates in rbbi, is copied to UDataInfo) */ 72*0e209d39SAndroid Build Coastguard Worker uint32_t fLength; /* Total length in bytes of this RBBI Data, */ 73*0e209d39SAndroid Build Coastguard Worker /* including all sections, not just the header. */ 74*0e209d39SAndroid Build Coastguard Worker uint32_t fCatCount; /* Number of character categories. */ 75*0e209d39SAndroid Build Coastguard Worker 76*0e209d39SAndroid Build Coastguard Worker /* */ 77*0e209d39SAndroid Build Coastguard Worker /* Offsets and sizes of each of the subsections within the RBBI data. */ 78*0e209d39SAndroid Build Coastguard Worker /* All offsets are bytes from the start of the RBBIDataHeader. */ 79*0e209d39SAndroid Build Coastguard Worker /* All sizes are in bytes. */ 80*0e209d39SAndroid Build Coastguard Worker /* */ 81*0e209d39SAndroid Build Coastguard Worker uint32_t fFTable; /* forward state transition table. */ 82*0e209d39SAndroid Build Coastguard Worker uint32_t fFTableLen; 83*0e209d39SAndroid Build Coastguard Worker uint32_t fRTable; /* Offset to the reverse state transition table. */ 84*0e209d39SAndroid Build Coastguard Worker uint32_t fRTableLen; 85*0e209d39SAndroid Build Coastguard Worker uint32_t fTrie; /* Offset to Trie data for character categories */ 86*0e209d39SAndroid Build Coastguard Worker uint32_t fTrieLen; 87*0e209d39SAndroid Build Coastguard Worker uint32_t fRuleSource; /* Offset to the source for for the break */ 88*0e209d39SAndroid Build Coastguard Worker uint32_t fRuleSourceLen; /* rules. Stored char16_t *. */ 89*0e209d39SAndroid Build Coastguard Worker uint32_t fStatusTable; /* Offset to the table of rule status values */ 90*0e209d39SAndroid Build Coastguard Worker uint32_t fStatusTableLen; 91*0e209d39SAndroid Build Coastguard Worker 92*0e209d39SAndroid Build Coastguard Worker uint32_t fReserved[6]; /* Reserved for expansion */ 93*0e209d39SAndroid Build Coastguard Worker 94*0e209d39SAndroid Build Coastguard Worker }; 95*0e209d39SAndroid Build Coastguard Worker 96*0e209d39SAndroid Build Coastguard Worker 97*0e209d39SAndroid Build Coastguard Worker 98*0e209d39SAndroid Build Coastguard Worker template <typename T> 99*0e209d39SAndroid Build Coastguard Worker struct RBBIStateTableRowT { 100*0e209d39SAndroid Build Coastguard Worker T fAccepting; // Non-zero if this row is for an accepting state. 101*0e209d39SAndroid Build Coastguard Worker // Value 0: not an accepting state. 102*0e209d39SAndroid Build Coastguard Worker // 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state. 103*0e209d39SAndroid Build Coastguard Worker // >1: Look-ahead match has completed. 104*0e209d39SAndroid Build Coastguard Worker // Actual boundary position happened earlier. 105*0e209d39SAndroid Build Coastguard Worker // Value here == fLookAhead in earlier 106*0e209d39SAndroid Build Coastguard Worker // state, at actual boundary pos. 107*0e209d39SAndroid Build Coastguard Worker T fLookAhead; // Non-zero if this row is for a state that 108*0e209d39SAndroid Build Coastguard Worker // corresponds to a '/' in the rule source. 109*0e209d39SAndroid Build Coastguard Worker // Value is the same as the fAccepting 110*0e209d39SAndroid Build Coastguard Worker // value for the rule (which will appear 111*0e209d39SAndroid Build Coastguard Worker // in a different state. 112*0e209d39SAndroid Build Coastguard Worker T fTagsIdx; // Non-zero if this row covers a {tagged} position 113*0e209d39SAndroid Build Coastguard Worker // from a rule. Value is the index in the 114*0e209d39SAndroid Build Coastguard Worker // StatusTable of the set of matching 115*0e209d39SAndroid Build Coastguard Worker // tags (rule status values) 116*0e209d39SAndroid Build Coastguard Worker T fNextState[1]; // Next State, indexed by char category. 117*0e209d39SAndroid Build Coastguard Worker // Variable-length array declared with length 1 118*0e209d39SAndroid Build Coastguard Worker // to disable bounds checkers. 119*0e209d39SAndroid Build Coastguard Worker // Array Size is actually fData->fHeader->fCatCount 120*0e209d39SAndroid Build Coastguard Worker // CAUTION: see RBBITableBuilder::getTableSize() 121*0e209d39SAndroid Build Coastguard Worker // before changing anything here. 122*0e209d39SAndroid Build Coastguard Worker }; 123*0e209d39SAndroid Build Coastguard Worker 124*0e209d39SAndroid Build Coastguard Worker typedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8; 125*0e209d39SAndroid Build Coastguard Worker typedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16; 126*0e209d39SAndroid Build Coastguard Worker 127*0e209d39SAndroid Build Coastguard Worker constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1; // Value constant for RBBIStateTableRow::fAccepting 128*0e209d39SAndroid Build Coastguard Worker 129*0e209d39SAndroid Build Coastguard Worker union RBBIStateTableRow { 130*0e209d39SAndroid Build Coastguard Worker RBBIStateTableRow16 r16; 131*0e209d39SAndroid Build Coastguard Worker RBBIStateTableRow8 r8; 132*0e209d39SAndroid Build Coastguard Worker }; 133*0e209d39SAndroid Build Coastguard Worker 134*0e209d39SAndroid Build Coastguard Worker struct RBBIStateTable { 135*0e209d39SAndroid Build Coastguard Worker uint32_t fNumStates; // Number of states. 136*0e209d39SAndroid Build Coastguard Worker uint32_t fRowLen; // Length of a state table row, in bytes. 137*0e209d39SAndroid Build Coastguard Worker uint32_t fDictCategoriesStart; // Char category number of the first dictionary 138*0e209d39SAndroid Build Coastguard Worker // char class, or the the largest category number + 1 139*0e209d39SAndroid Build Coastguard Worker // if there are no dictionary categories. 140*0e209d39SAndroid Build Coastguard Worker uint32_t fLookAheadResultsSize; // Size of run-time array required for holding 141*0e209d39SAndroid Build Coastguard Worker // look-ahead results. Indexed by row.fLookAhead. 142*0e209d39SAndroid Build Coastguard Worker uint32_t fFlags; // Option Flags for this state table. 143*0e209d39SAndroid Build Coastguard Worker char fTableData[1]; // First RBBIStateTableRow begins here. 144*0e209d39SAndroid Build Coastguard Worker // Variable-length array declared with length 1 145*0e209d39SAndroid Build Coastguard Worker // to disable bounds checkers. 146*0e209d39SAndroid Build Coastguard Worker // (making it char[] simplifies ugly address 147*0e209d39SAndroid Build Coastguard Worker // arithmetic for indexing variable length rows.) 148*0e209d39SAndroid Build Coastguard Worker }; 149*0e209d39SAndroid Build Coastguard Worker 150*0e209d39SAndroid Build Coastguard Worker constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1; 151*0e209d39SAndroid Build Coastguard Worker constexpr uint32_t RBBI_BOF_REQUIRED = 2; 152*0e209d39SAndroid Build Coastguard Worker constexpr uint32_t RBBI_8BITS_ROWS = 4; 153*0e209d39SAndroid Build Coastguard Worker 154*0e209d39SAndroid Build Coastguard Worker 155*0e209d39SAndroid Build Coastguard Worker /* */ 156*0e209d39SAndroid Build Coastguard Worker /* The reference counting wrapper class */ 157*0e209d39SAndroid Build Coastguard Worker /* */ 158*0e209d39SAndroid Build Coastguard Worker class RBBIDataWrapper : public UMemory { 159*0e209d39SAndroid Build Coastguard Worker public: 160*0e209d39SAndroid Build Coastguard Worker enum EDontAdopt { 161*0e209d39SAndroid Build Coastguard Worker kDontAdopt 162*0e209d39SAndroid Build Coastguard Worker }; 163*0e209d39SAndroid Build Coastguard Worker RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status); 164*0e209d39SAndroid Build Coastguard Worker RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status); 165*0e209d39SAndroid Build Coastguard Worker RBBIDataWrapper(UDataMemory* udm, UErrorCode &status); 166*0e209d39SAndroid Build Coastguard Worker ~RBBIDataWrapper(); 167*0e209d39SAndroid Build Coastguard Worker 168*0e209d39SAndroid Build Coastguard Worker static UBool isDataVersionAcceptable(const UVersionInfo version); 169*0e209d39SAndroid Build Coastguard Worker 170*0e209d39SAndroid Build Coastguard Worker void init0(); 171*0e209d39SAndroid Build Coastguard Worker void init(const RBBIDataHeader *data, UErrorCode &status); 172*0e209d39SAndroid Build Coastguard Worker RBBIDataWrapper *addReference(); 173*0e209d39SAndroid Build Coastguard Worker void removeReference(); 174*0e209d39SAndroid Build Coastguard Worker bool operator ==(const RBBIDataWrapper &other) const; 175*0e209d39SAndroid Build Coastguard Worker int32_t hashCode(); 176*0e209d39SAndroid Build Coastguard Worker const UnicodeString &getRuleSourceString() const; 177*0e209d39SAndroid Build Coastguard Worker void printData(); 178*0e209d39SAndroid Build Coastguard Worker void printTable(const char *heading, const RBBIStateTable *table); 179*0e209d39SAndroid Build Coastguard Worker 180*0e209d39SAndroid Build Coastguard Worker /* */ 181*0e209d39SAndroid Build Coastguard Worker /* Pointers to items within the data */ 182*0e209d39SAndroid Build Coastguard Worker /* */ 183*0e209d39SAndroid Build Coastguard Worker const RBBIDataHeader *fHeader; 184*0e209d39SAndroid Build Coastguard Worker const RBBIStateTable *fForwardTable; 185*0e209d39SAndroid Build Coastguard Worker const RBBIStateTable *fReverseTable; 186*0e209d39SAndroid Build Coastguard Worker const char *fRuleSource; 187*0e209d39SAndroid Build Coastguard Worker const int32_t *fRuleStatusTable; 188*0e209d39SAndroid Build Coastguard Worker 189*0e209d39SAndroid Build Coastguard Worker /* number of int32_t values in the rule status table. Used to sanity check indexing */ 190*0e209d39SAndroid Build Coastguard Worker int32_t fStatusMaxIdx; 191*0e209d39SAndroid Build Coastguard Worker 192*0e209d39SAndroid Build Coastguard Worker UCPTrie *fTrie; 193*0e209d39SAndroid Build Coastguard Worker 194*0e209d39SAndroid Build Coastguard Worker private: 195*0e209d39SAndroid Build Coastguard Worker u_atomic_int32_t fRefCount; 196*0e209d39SAndroid Build Coastguard Worker UDataMemory *fUDataMem; 197*0e209d39SAndroid Build Coastguard Worker UnicodeString fRuleString; 198*0e209d39SAndroid Build Coastguard Worker UBool fDontFreeData; 199*0e209d39SAndroid Build Coastguard Worker 200*0e209d39SAndroid Build Coastguard Worker RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ 201*0e209d39SAndroid Build Coastguard Worker RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ 202*0e209d39SAndroid Build Coastguard Worker }; 203*0e209d39SAndroid Build Coastguard Worker 204*0e209d39SAndroid Build Coastguard Worker 205*0e209d39SAndroid Build Coastguard Worker 206*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 207*0e209d39SAndroid Build Coastguard Worker 208*0e209d39SAndroid Build Coastguard Worker U_CFUNC UBool rbbi_cleanup(); 209*0e209d39SAndroid Build Coastguard Worker 210*0e209d39SAndroid Build Coastguard Worker #endif /* C++ */ 211*0e209d39SAndroid Build Coastguard Worker 212*0e209d39SAndroid Build Coastguard Worker #endif 213