1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationdata.h 9 * 10 * created on: 2010oct27 11 * created by: Markus W. Scherer 12 */ 13 14 #ifndef __COLLATIONDATA_H__ 15 #define __COLLATIONDATA_H__ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_COLLATION 20 21 #include "unicode/ucol.h" 22 #include "unicode/uniset.h" 23 #include "collation.h" 24 #include "normalizer2impl.h" 25 #include "utrie2.h" 26 27 struct UDataMemory; 28 29 U_NAMESPACE_BEGIN 30 31 class UVector32; 32 33 /** 34 * Collation data container. 35 * Immutable data created by a CollationDataBuilder, or loaded from a file, 36 * or deserialized from API-provided binary data. 37 * 38 * Includes data for the collation base (root/default), aliased if this is not the base. 39 */ 40 struct U_I18N_API CollationData : public UMemory { 41 // Note: The ucadata.icu loader could discover the reserved ranges by setting an array 42 // parallel with the ranges, and resetting ranges that are indexed. 43 // The reordering builder code could clone the resulting template array. 44 static constexpr int32_t REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14; 45 static constexpr int32_t REORDER_RESERVED_AFTER_LATIN = REORDER_RESERVED_BEFORE_LATIN + 1; 46 47 static constexpr int32_t MAX_NUM_SPECIAL_REORDER_CODES = 8; 48 /** C++ only, data reader check scriptStartsLength. */ 49 static constexpr int32_t MAX_NUM_SCRIPT_RANGES = 256; 50 CollationDataCollationData51 CollationData(const Normalizer2Impl &nfc) 52 : trie(nullptr), 53 ce32s(nullptr), ces(nullptr), contexts(nullptr), base(nullptr), 54 jamoCE32s(nullptr), 55 nfcImpl(nfc), 56 numericPrimary(0x12000000), 57 ce32sLength(0), cesLength(0), contextsLength(0), 58 compressibleBytes(nullptr), 59 unsafeBackwardSet(nullptr), 60 fastLatinTable(nullptr), fastLatinTableLength(0), 61 numScripts(0), scriptsIndex(nullptr), scriptStarts(nullptr), scriptStartsLength(0), 62 rootElements(nullptr), rootElementsLength(0) {} 63 getCE32CollationData64 uint32_t getCE32(UChar32 c) const { 65 return UTRIE2_GET32(trie, c); 66 } 67 getCE32FromSupplementaryCollationData68 uint32_t getCE32FromSupplementary(UChar32 c) const { 69 return UTRIE2_GET32_FROM_SUPP(trie, c); 70 } 71 isDigitCollationData72 UBool isDigit(UChar32 c) const { 73 return c < 0x660 ? c <= 0x39 && 0x30 <= c : 74 Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG); 75 } 76 isUnsafeBackwardCollationData77 UBool isUnsafeBackward(UChar32 c, UBool numeric) const { 78 return unsafeBackwardSet->contains(c) || (numeric && isDigit(c)); 79 } 80 isCompressibleLeadByteCollationData81 UBool isCompressibleLeadByte(uint32_t b) const { 82 return compressibleBytes[b]; 83 } 84 isCompressiblePrimaryCollationData85 inline UBool isCompressiblePrimary(uint32_t p) const { 86 return isCompressibleLeadByte(p >> 24); 87 } 88 89 /** 90 * Returns the CE32 from two contexts words. 91 * Access to the defaultCE32 for contraction and prefix matching. 92 */ readCE32CollationData93 static uint32_t readCE32(const char16_t *p) { 94 return ((uint32_t)p[0] << 16) | p[1]; 95 } 96 97 /** 98 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG). 99 * Requires that ce32 is special. 100 */ 101 uint32_t getIndirectCE32(uint32_t ce32) const; 102 /** 103 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG), 104 * if ce32 is special. 105 */ 106 uint32_t getFinalCE32(uint32_t ce32) const; 107 108 /** 109 * Computes a CE from c's ce32 which has the OFFSET_TAG. 110 */ getCEFromOffsetCE32CollationData111 int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const { 112 int64_t dataCE = ces[Collation::indexFromCE32(ce32)]; 113 return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE)); 114 } 115 116 /** 117 * Returns the single CE that c maps to. 118 * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE. 119 */ 120 int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const; 121 122 /** 123 * Returns the FCD16 value for code point c. c must be >= 0. 124 */ getFCD16CollationData125 uint16_t getFCD16(UChar32 c) const { 126 return nfcImpl.getFCD16(c); 127 } 128 129 /** 130 * Returns the first primary for the script's reordering group. 131 * @return the primary with only the first primary lead byte of the group 132 * (not necessarily an actual root collator primary weight), 133 * or 0 if the script is unknown 134 */ 135 uint32_t getFirstPrimaryForGroup(int32_t script) const; 136 137 /** 138 * Returns the last primary for the script's reordering group. 139 * @return the last primary of the group 140 * (not an actual root collator primary weight), 141 * or 0 if the script is unknown 142 */ 143 uint32_t getLastPrimaryForGroup(int32_t script) const; 144 145 /** 146 * Finds the reordering group which contains the primary weight. 147 * @return the first script of the group, or -1 if the weight is beyond the last group 148 */ 149 int32_t getGroupForPrimary(uint32_t p) const; 150 151 int32_t getEquivalentScripts(int32_t script, 152 int32_t dest[], int32_t capacity, UErrorCode &errorCode) const; 153 154 /** 155 * Writes the permutation of primary-weight ranges 156 * for the given reordering of scripts and groups. 157 * The caller checks for illegal arguments and 158 * takes care of [DEFAULT] and memory allocation. 159 * 160 * Each list element will be a (limit, offset) pair as described 161 * for the CollationSettings::reorderRanges. 162 * The list will be empty if no ranges are reordered. 163 */ 164 void makeReorderRanges(const int32_t *reorder, int32_t length, 165 UVector32 &ranges, UErrorCode &errorCode) const; 166 167 /** @see jamoCE32s */ 168 static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27; 169 170 /** Main lookup trie. */ 171 const UTrie2 *trie; 172 /** 173 * Array of CE32 values. 174 * At index 0 there must be CE32(U+0000) 175 * to support U+0000's special-tag for NUL-termination handling. 176 */ 177 const uint32_t *ce32s; 178 /** Array of CE values for expansions and OFFSET_TAG. */ 179 const int64_t *ces; 180 /** Array of prefix and contraction-suffix matching data. */ 181 const char16_t *contexts; 182 /** Base collation data, or nullptr if this data itself is a base. */ 183 const CollationData *base; 184 /** 185 * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T. 186 * They are normally simple CE32s, rarely expansions. 187 * For fast handling of HANGUL_TAG. 188 */ 189 const uint32_t *jamoCE32s; 190 const Normalizer2Impl &nfcImpl; 191 /** The single-byte primary weight (xx000000) for numeric collation. */ 192 uint32_t numericPrimary; 193 194 int32_t ce32sLength; 195 int32_t cesLength; 196 int32_t contextsLength; 197 198 /** 256 flags for which primary-weight lead bytes are compressible. */ 199 const UBool *compressibleBytes; 200 /** 201 * Set of code points that are unsafe for starting string comparison after an identical prefix, 202 * or in backwards CE iteration. 203 */ 204 const UnicodeSet *unsafeBackwardSet; 205 206 /** 207 * Fast Latin table for common-Latin-text string comparisons. 208 * Data structure see class CollationFastLatin. 209 */ 210 const uint16_t *fastLatinTable; 211 int32_t fastLatinTableLength; 212 213 /** 214 * Data for scripts and reordering groups. 215 * Uses include building a reordering permutation table and 216 * providing script boundaries to AlphabeticIndex. 217 */ 218 int32_t numScripts; 219 /** 220 * The length of scriptsIndex is numScripts+16. 221 * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts. 222 * 16 special reorder codes (not all used) are mapped starting at numScripts. 223 * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit. 224 * There are special codes at the end for reorder-reserved primary ranges. 225 * 226 * Multiple scripts may share a range and index, for example Hira & Kana. 227 */ 228 const uint16_t *scriptsIndex; 229 /** 230 * Start primary weight (top 16 bits only) for a group/script/reserved range 231 * indexed by scriptsIndex. 232 * The first range (separators & terminators) and the last range (trailing weights) 233 * are not reorderable, and no scriptsIndex entry points to them. 234 */ 235 const uint16_t *scriptStarts; 236 int32_t scriptStartsLength; 237 238 /** 239 * Collation elements in the root collator. 240 * Used by the CollationRootElements class. The data structure is described there. 241 * nullptr in a tailoring. 242 */ 243 const uint32_t *rootElements; 244 int32_t rootElementsLength; 245 246 private: 247 int32_t getScriptIndex(int32_t script) const; 248 void makeReorderRanges(const int32_t *reorder, int32_t length, 249 UBool latinMustMove, 250 UVector32 &ranges, UErrorCode &errorCode) const; 251 int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const; 252 int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const; 253 }; 254 255 U_NAMESPACE_END 256 257 #endif // !UCONFIG_NO_COLLATION 258 #endif // __COLLATIONDATA_H__ 259