1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2014, International Business Machines 6*0e209d39SAndroid Build Coastguard Worker * Corporation and others. All Rights Reserved. 7*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 8*0e209d39SAndroid Build Coastguard Worker * dictionarydata.h 9*0e209d39SAndroid Build Coastguard Worker * 10*0e209d39SAndroid Build Coastguard Worker * created on: 2012may31 11*0e209d39SAndroid Build Coastguard Worker * created by: Markus W. Scherer & Maxime Serrano 12*0e209d39SAndroid Build Coastguard Worker */ 13*0e209d39SAndroid Build Coastguard Worker 14*0e209d39SAndroid Build Coastguard Worker #ifndef __DICTIONARYDATA_H__ 15*0e209d39SAndroid Build Coastguard Worker #define __DICTIONARYDATA_H__ 16*0e209d39SAndroid Build Coastguard Worker 17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 18*0e209d39SAndroid Build Coastguard Worker 19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_BREAK_ITERATION 20*0e209d39SAndroid Build Coastguard Worker 21*0e209d39SAndroid Build Coastguard Worker #include "unicode/utext.h" 22*0e209d39SAndroid Build Coastguard Worker #include "unicode/udata.h" 23*0e209d39SAndroid Build Coastguard Worker #include "udataswp.h" 24*0e209d39SAndroid Build Coastguard Worker #include "unicode/uobject.h" 25*0e209d39SAndroid Build Coastguard Worker #include "unicode/ustringtrie.h" 26*0e209d39SAndroid Build Coastguard Worker 27*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 28*0e209d39SAndroid Build Coastguard Worker 29*0e209d39SAndroid Build Coastguard Worker class UCharsTrie; 30*0e209d39SAndroid Build Coastguard Worker class BytesTrie; 31*0e209d39SAndroid Build Coastguard Worker 32*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API DictionaryData : public UMemory { 33*0e209d39SAndroid Build Coastguard Worker public: 34*0e209d39SAndroid Build Coastguard Worker static const int32_t TRIE_TYPE_BYTES; // = 0; 35*0e209d39SAndroid Build Coastguard Worker static const int32_t TRIE_TYPE_UCHARS; // = 1; 36*0e209d39SAndroid Build Coastguard Worker static const int32_t TRIE_TYPE_MASK; // = 7; 37*0e209d39SAndroid Build Coastguard Worker static const int32_t TRIE_HAS_VALUES; // = 8; 38*0e209d39SAndroid Build Coastguard Worker 39*0e209d39SAndroid Build Coastguard Worker static const int32_t TRANSFORM_NONE; // = 0; 40*0e209d39SAndroid Build Coastguard Worker static const int32_t TRANSFORM_TYPE_OFFSET; // = 0x1000000; 41*0e209d39SAndroid Build Coastguard Worker static const int32_t TRANSFORM_TYPE_MASK; // = 0x7f000000; 42*0e209d39SAndroid Build Coastguard Worker static const int32_t TRANSFORM_OFFSET_MASK; // = 0x1fffff; 43*0e209d39SAndroid Build Coastguard Worker 44*0e209d39SAndroid Build Coastguard Worker enum { 45*0e209d39SAndroid Build Coastguard Worker // Byte offsets from the start of the data, after the generic header. 46*0e209d39SAndroid Build Coastguard Worker IX_STRING_TRIE_OFFSET, 47*0e209d39SAndroid Build Coastguard Worker IX_RESERVED1_OFFSET, 48*0e209d39SAndroid Build Coastguard Worker IX_RESERVED2_OFFSET, 49*0e209d39SAndroid Build Coastguard Worker IX_TOTAL_SIZE, 50*0e209d39SAndroid Build Coastguard Worker 51*0e209d39SAndroid Build Coastguard Worker // Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc. 52*0e209d39SAndroid Build Coastguard Worker IX_TRIE_TYPE, 53*0e209d39SAndroid Build Coastguard Worker // Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc. 54*0e209d39SAndroid Build Coastguard Worker IX_TRANSFORM, 55*0e209d39SAndroid Build Coastguard Worker 56*0e209d39SAndroid Build Coastguard Worker IX_RESERVED6, 57*0e209d39SAndroid Build Coastguard Worker IX_RESERVED7, 58*0e209d39SAndroid Build Coastguard Worker IX_COUNT 59*0e209d39SAndroid Build Coastguard Worker }; 60*0e209d39SAndroid Build Coastguard Worker }; 61*0e209d39SAndroid Build Coastguard Worker 62*0e209d39SAndroid Build Coastguard Worker /** 63*0e209d39SAndroid Build Coastguard Worker * Wrapper class around generic dictionaries, implementing matches(). 64*0e209d39SAndroid Build Coastguard Worker * getType() should return a TRIE_TYPE_??? constant from DictionaryData. 65*0e209d39SAndroid Build Coastguard Worker * 66*0e209d39SAndroid Build Coastguard Worker * All implementations of this interface must be thread-safe if they are to be used inside of the 67*0e209d39SAndroid Build Coastguard Worker * dictionary-based break iteration code. 68*0e209d39SAndroid Build Coastguard Worker */ 69*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API DictionaryMatcher : public UMemory { 70*0e209d39SAndroid Build Coastguard Worker public: DictionaryMatcher()71*0e209d39SAndroid Build Coastguard Worker DictionaryMatcher() {} 72*0e209d39SAndroid Build Coastguard Worker virtual ~DictionaryMatcher(); 73*0e209d39SAndroid Build Coastguard Worker // this should emulate CompactTrieDictionary::matches() 74*0e209d39SAndroid Build Coastguard Worker /* @param text The text in which to look for matching words. Matching begins 75*0e209d39SAndroid Build Coastguard Worker * at the current position of the UText. 76*0e209d39SAndroid Build Coastguard Worker * @param maxLength The max length of match to consider. Units are the native indexing 77*0e209d39SAndroid Build Coastguard Worker * units of the UText. 78*0e209d39SAndroid Build Coastguard Worker * @param limit Capacity of output arrays, which is also the maximum number of 79*0e209d39SAndroid Build Coastguard Worker * matching words to be found. 80*0e209d39SAndroid Build Coastguard Worker * @param lengths output array, filled with the lengths of the matches, in order, 81*0e209d39SAndroid Build Coastguard Worker * from shortest to longest. Lengths are in native indexing units 82*0e209d39SAndroid Build Coastguard Worker * of the UText. May be nullptr. 83*0e209d39SAndroid Build Coastguard Worker * @param cpLengths output array, filled with the lengths of the matches, in order, 84*0e209d39SAndroid Build Coastguard Worker * from shortest to longest. Lengths are the number of Unicode code points. 85*0e209d39SAndroid Build Coastguard Worker * May be nullptr. 86*0e209d39SAndroid Build Coastguard Worker * @param values Output array, filled with the values associated with the words found. 87*0e209d39SAndroid Build Coastguard Worker * May be nullptr. 88*0e209d39SAndroid Build Coastguard Worker * @param prefix Output parameter, the code point length of the prefix match, even if that 89*0e209d39SAndroid Build Coastguard Worker * prefix didn't lead to a complete word. Will always be >= the cpLength 90*0e209d39SAndroid Build Coastguard Worker * of the longest complete word matched. May be nullptr. 91*0e209d39SAndroid Build Coastguard Worker * @return Number of matching words found. 92*0e209d39SAndroid Build Coastguard Worker */ 93*0e209d39SAndroid Build Coastguard Worker virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, 94*0e209d39SAndroid Build Coastguard Worker int32_t *lengths, int32_t *cpLengths, int32_t *values, 95*0e209d39SAndroid Build Coastguard Worker int32_t *prefix) const = 0; 96*0e209d39SAndroid Build Coastguard Worker 97*0e209d39SAndroid Build Coastguard Worker /** @return DictionaryData::TRIE_TYPE_XYZ */ 98*0e209d39SAndroid Build Coastguard Worker virtual int32_t getType() const = 0; 99*0e209d39SAndroid Build Coastguard Worker }; 100*0e209d39SAndroid Build Coastguard Worker 101*0e209d39SAndroid Build Coastguard Worker // Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary 102*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher { 103*0e209d39SAndroid Build Coastguard Worker public: 104*0e209d39SAndroid Build Coastguard Worker // constructs a new UCharsDictionaryMatcher. 105*0e209d39SAndroid Build Coastguard Worker // The UDataMemory * will be closed on this object's destruction. UCharsDictionaryMatcher(const char16_t * c,UDataMemory * f)106*0e209d39SAndroid Build Coastguard Worker UCharsDictionaryMatcher(const char16_t *c, UDataMemory *f) : characters(c), file(f) { } 107*0e209d39SAndroid Build Coastguard Worker virtual ~UCharsDictionaryMatcher(); 108*0e209d39SAndroid Build Coastguard Worker virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, 109*0e209d39SAndroid Build Coastguard Worker int32_t *lengths, int32_t *cpLengths, int32_t *values, 110*0e209d39SAndroid Build Coastguard Worker int32_t *prefix) const override; 111*0e209d39SAndroid Build Coastguard Worker virtual int32_t getType() const override; 112*0e209d39SAndroid Build Coastguard Worker private: 113*0e209d39SAndroid Build Coastguard Worker const char16_t *characters; 114*0e209d39SAndroid Build Coastguard Worker UDataMemory *file; 115*0e209d39SAndroid Build Coastguard Worker }; 116*0e209d39SAndroid Build Coastguard Worker 117*0e209d39SAndroid Build Coastguard Worker // Implementation of the DictionaryMatcher interface for a BytesTrie dictionary 118*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher { 119*0e209d39SAndroid Build Coastguard Worker public: 120*0e209d39SAndroid Build Coastguard Worker // constructs a new BytesTrieDictionaryMatcher 121*0e209d39SAndroid Build Coastguard Worker // the transform constant should be the constant read from the file, not a masked version! 122*0e209d39SAndroid Build Coastguard Worker // the UDataMemory * fed in here will be closed on this object's destruction BytesDictionaryMatcher(const char * c,int32_t t,UDataMemory * f)123*0e209d39SAndroid Build Coastguard Worker BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f) 124*0e209d39SAndroid Build Coastguard Worker : characters(c), transformConstant(t), file(f) { } 125*0e209d39SAndroid Build Coastguard Worker virtual ~BytesDictionaryMatcher(); 126*0e209d39SAndroid Build Coastguard Worker virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, 127*0e209d39SAndroid Build Coastguard Worker int32_t *lengths, int32_t *cpLengths, int32_t *values, 128*0e209d39SAndroid Build Coastguard Worker int32_t *prefix) const override; 129*0e209d39SAndroid Build Coastguard Worker virtual int32_t getType() const override; 130*0e209d39SAndroid Build Coastguard Worker private: 131*0e209d39SAndroid Build Coastguard Worker UChar32 transform(UChar32 c) const; 132*0e209d39SAndroid Build Coastguard Worker 133*0e209d39SAndroid Build Coastguard Worker const char *characters; 134*0e209d39SAndroid Build Coastguard Worker int32_t transformConstant; 135*0e209d39SAndroid Build Coastguard Worker UDataMemory *file; 136*0e209d39SAndroid Build Coastguard Worker }; 137*0e209d39SAndroid Build Coastguard Worker 138*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 139*0e209d39SAndroid Build Coastguard Worker 140*0e209d39SAndroid Build Coastguard Worker U_CAPI int32_t U_EXPORT2 141*0e209d39SAndroid Build Coastguard Worker udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode); 142*0e209d39SAndroid Build Coastguard Worker 143*0e209d39SAndroid Build Coastguard Worker /** 144*0e209d39SAndroid Build Coastguard Worker * Format of dictionary .dict data files. 145*0e209d39SAndroid Build Coastguard Worker * Format version 1.0. 146*0e209d39SAndroid Build Coastguard Worker * 147*0e209d39SAndroid Build Coastguard Worker * A dictionary .dict data file contains a byte-serialized BytesTrie or 148*0e209d39SAndroid Build Coastguard Worker * a UChars-serialized UCharsTrie. 149*0e209d39SAndroid Build Coastguard Worker * Such files are used in dictionary-based break iteration (DBBI). 150*0e209d39SAndroid Build Coastguard Worker * 151*0e209d39SAndroid Build Coastguard Worker * For a BytesTrie, a transformation type is specified for 152*0e209d39SAndroid Build Coastguard Worker * transforming Unicode strings into byte sequences. 153*0e209d39SAndroid Build Coastguard Worker * 154*0e209d39SAndroid Build Coastguard Worker * A .dict file begins with a standard ICU data file header 155*0e209d39SAndroid Build Coastguard Worker * (DataHeader, see ucmndata.h and unicode/udata.h). 156*0e209d39SAndroid Build Coastguard Worker * The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0). 157*0e209d39SAndroid Build Coastguard Worker * 158*0e209d39SAndroid Build Coastguard Worker * After the header, the file contains the following parts. 159*0e209d39SAndroid Build Coastguard Worker * Constants are defined in the DictionaryData class. 160*0e209d39SAndroid Build Coastguard Worker * 161*0e209d39SAndroid Build Coastguard Worker * For the data structure of BytesTrie & UCharsTrie see 162*0e209d39SAndroid Build Coastguard Worker * https://icu.unicode.org/design/struct/tries 163*0e209d39SAndroid Build Coastguard Worker * and the bytestrie.h and ucharstrie.h header files. 164*0e209d39SAndroid Build Coastguard Worker * 165*0e209d39SAndroid Build Coastguard Worker * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4; 166*0e209d39SAndroid Build Coastguard Worker * 167*0e209d39SAndroid Build Coastguard Worker * The first four indexes are byte offsets in ascending order. 168*0e209d39SAndroid Build Coastguard Worker * Each byte offset marks the start of the next part in the data file, 169*0e209d39SAndroid Build Coastguard Worker * and the end of the previous one. 170*0e209d39SAndroid Build Coastguard Worker * When two consecutive byte offsets are the same, then the corresponding part is empty. 171*0e209d39SAndroid Build Coastguard Worker * Byte offsets are offsets from after the header, 172*0e209d39SAndroid Build Coastguard Worker * that is, from the beginning of the indexes[]. 173*0e209d39SAndroid Build Coastguard Worker * Each part starts at an offset with proper alignment for its data. 174*0e209d39SAndroid Build Coastguard Worker * If necessary, the previous part may include padding bytes to achieve this alignment. 175*0e209d39SAndroid Build Coastguard Worker * 176*0e209d39SAndroid Build Coastguard Worker * trieType=indexes[IX_TRIE_TYPE] defines the trie type. 177*0e209d39SAndroid Build Coastguard Worker * transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation. 178*0e209d39SAndroid Build Coastguard Worker * If the transformation type is TRANSFORM_TYPE_OFFSET, 179*0e209d39SAndroid Build Coastguard Worker * then the lower 21 bits contain the offset code point. 180*0e209d39SAndroid Build Coastguard Worker * Each code point c is mapped to byte b = (c - offset). 181*0e209d39SAndroid Build Coastguard Worker * Code points outside the range offset..(offset+0xff) cannot be mapped 182*0e209d39SAndroid Build Coastguard Worker * and do not occur in the dictionary. 183*0e209d39SAndroid Build Coastguard Worker * 184*0e209d39SAndroid Build Coastguard Worker * stringTrie; -- a serialized BytesTrie or UCharsTrie 185*0e209d39SAndroid Build Coastguard Worker * 186*0e209d39SAndroid Build Coastguard Worker * The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType), 187*0e209d39SAndroid Build Coastguard Worker * or it maps all strings to 0 (TRIE_HAS_VALUES bit not set). 188*0e209d39SAndroid Build Coastguard Worker */ 189*0e209d39SAndroid Build Coastguard Worker 190*0e209d39SAndroid Build Coastguard Worker #endif /* !UCONFIG_NO_BREAK_ITERATION */ 191*0e209d39SAndroid Build Coastguard Worker #endif /* __DICTIONARYDATA_H__ */ 192