1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2012-2014, International Business Machines 6*0e209d39SAndroid Build Coastguard Worker * Corporation and others. All Rights Reserved. 7*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 8*0e209d39SAndroid Build Coastguard Worker * collationfcd.h 9*0e209d39SAndroid Build Coastguard Worker * 10*0e209d39SAndroid Build Coastguard Worker * created on: 2012aug18 11*0e209d39SAndroid Build Coastguard Worker * created by: Markus W. Scherer 12*0e209d39SAndroid Build Coastguard Worker */ 13*0e209d39SAndroid Build Coastguard Worker 14*0e209d39SAndroid Build Coastguard Worker #ifndef __COLLATIONFCD_H__ 15*0e209d39SAndroid Build Coastguard Worker #define __COLLATIONFCD_H__ 16*0e209d39SAndroid Build Coastguard Worker 17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 18*0e209d39SAndroid Build Coastguard Worker 19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_COLLATION 20*0e209d39SAndroid Build Coastguard Worker 21*0e209d39SAndroid Build Coastguard Worker #include "unicode/utf16.h" 22*0e209d39SAndroid Build Coastguard Worker 23*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 24*0e209d39SAndroid Build Coastguard Worker 25*0e209d39SAndroid Build Coastguard Worker /** 26*0e209d39SAndroid Build Coastguard Worker * Data and functions for the FCD check fast path. 27*0e209d39SAndroid Build Coastguard Worker * 28*0e209d39SAndroid Build Coastguard Worker * The fast path looks at a pair of 16-bit code units and checks 29*0e209d39SAndroid Build Coastguard Worker * whether there is an FCD boundary between them; 30*0e209d39SAndroid Build Coastguard Worker * there is if the first unit has a trailing ccc=0 (!hasTccc(first)) 31*0e209d39SAndroid Build Coastguard Worker * or the second unit has a leading ccc=0 (!hasLccc(second)), 32*0e209d39SAndroid Build Coastguard Worker * or both. 33*0e209d39SAndroid Build Coastguard Worker * When the fast path finds a possible non-boundary, 34*0e209d39SAndroid Build Coastguard Worker * then the FCD check slow path looks at the actual sequence of FCD values. 35*0e209d39SAndroid Build Coastguard Worker * 36*0e209d39SAndroid Build Coastguard Worker * This is a pure optimization. 37*0e209d39SAndroid Build Coastguard Worker * The fast path must at least find all possible non-boundaries. 38*0e209d39SAndroid Build Coastguard Worker * If the fast path is too pessimistic, it costs performance. 39*0e209d39SAndroid Build Coastguard Worker * 40*0e209d39SAndroid Build Coastguard Worker * For a pair of BMP characters, the fast path tests are precise (1 bit per character). 41*0e209d39SAndroid Build Coastguard Worker * 42*0e209d39SAndroid Build Coastguard Worker * For a supplementary code point, the two units are its lead and trail surrogates. 43*0e209d39SAndroid Build Coastguard Worker * We set hasTccc(lead)=true if any of its 1024 associated supplementary code points 44*0e209d39SAndroid Build Coastguard Worker * has lccc!=0 or tccc!=0. 45*0e209d39SAndroid Build Coastguard Worker * We set hasLccc(trail)=true for all trail surrogates. 46*0e209d39SAndroid Build Coastguard Worker * As a result, we leave the fast path if the lead surrogate might start a 47*0e209d39SAndroid Build Coastguard Worker * supplementary code point that is not FCD-inert. 48*0e209d39SAndroid Build Coastguard Worker * (So the fast path need not detect that there is a surrogate pair, 49*0e209d39SAndroid Build Coastguard Worker * nor look ahead to the next full code point.) 50*0e209d39SAndroid Build Coastguard Worker * 51*0e209d39SAndroid Build Coastguard Worker * hasLccc(lead)=true if any of its 1024 associated supplementary code points 52*0e209d39SAndroid Build Coastguard Worker * has lccc!=0, for fast boundary checking between BMP & supplementary. 53*0e209d39SAndroid Build Coastguard Worker * 54*0e209d39SAndroid Build Coastguard Worker * hasTccc(trail)=false: 55*0e209d39SAndroid Build Coastguard Worker * It should only be tested for unpaired trail surrogates which are FCD-inert. 56*0e209d39SAndroid Build Coastguard Worker */ 57*0e209d39SAndroid Build Coastguard Worker class U_I18N_API CollationFCD { 58*0e209d39SAndroid Build Coastguard Worker public: hasLccc(UChar32 c)59*0e209d39SAndroid Build Coastguard Worker static inline UBool hasLccc(UChar32 c) { 60*0e209d39SAndroid Build Coastguard Worker // assert c <= 0xffff 61*0e209d39SAndroid Build Coastguard Worker // c can be negative, e.g., U_SENTINEL from UCharIterator; 62*0e209d39SAndroid Build Coastguard Worker // that is handled in the first test. 63*0e209d39SAndroid Build Coastguard Worker int32_t i; 64*0e209d39SAndroid Build Coastguard Worker return 65*0e209d39SAndroid Build Coastguard Worker // U+0300 is the first character with lccc!=0. 66*0e209d39SAndroid Build Coastguard Worker c >= 0x300 && 67*0e209d39SAndroid Build Coastguard Worker (i = lcccIndex[c >> 5]) != 0 && 68*0e209d39SAndroid Build Coastguard Worker (lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0; 69*0e209d39SAndroid Build Coastguard Worker } 70*0e209d39SAndroid Build Coastguard Worker hasTccc(UChar32 c)71*0e209d39SAndroid Build Coastguard Worker static inline UBool hasTccc(UChar32 c) { 72*0e209d39SAndroid Build Coastguard Worker // assert c <= 0xffff 73*0e209d39SAndroid Build Coastguard Worker // c can be negative, e.g., U_SENTINEL from UCharIterator; 74*0e209d39SAndroid Build Coastguard Worker // that is handled in the first test. 75*0e209d39SAndroid Build Coastguard Worker int32_t i; 76*0e209d39SAndroid Build Coastguard Worker return 77*0e209d39SAndroid Build Coastguard Worker // U+00C0 is the first character with tccc!=0. 78*0e209d39SAndroid Build Coastguard Worker c >= 0xc0 && 79*0e209d39SAndroid Build Coastguard Worker (i = tcccIndex[c >> 5]) != 0 && 80*0e209d39SAndroid Build Coastguard Worker (tcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0; 81*0e209d39SAndroid Build Coastguard Worker } 82*0e209d39SAndroid Build Coastguard Worker mayHaveLccc(UChar32 c)83*0e209d39SAndroid Build Coastguard Worker static inline UBool mayHaveLccc(UChar32 c) { 84*0e209d39SAndroid Build Coastguard Worker // Handles all of Unicode 0..10FFFF. 85*0e209d39SAndroid Build Coastguard Worker // c can be negative, e.g., U_SENTINEL. 86*0e209d39SAndroid Build Coastguard Worker // U+0300 is the first character with lccc!=0. 87*0e209d39SAndroid Build Coastguard Worker if(c < 0x300) { return false; } 88*0e209d39SAndroid Build Coastguard Worker if(c > 0xffff) { c = U16_LEAD(c); } 89*0e209d39SAndroid Build Coastguard Worker int32_t i; 90*0e209d39SAndroid Build Coastguard Worker return 91*0e209d39SAndroid Build Coastguard Worker (i = lcccIndex[c >> 5]) != 0 && 92*0e209d39SAndroid Build Coastguard Worker (lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0; 93*0e209d39SAndroid Build Coastguard Worker } 94*0e209d39SAndroid Build Coastguard Worker 95*0e209d39SAndroid Build Coastguard Worker /** 96*0e209d39SAndroid Build Coastguard Worker * Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81) 97*0e209d39SAndroid Build Coastguard Worker * must be decomposed before reaching the core collation code, 98*0e209d39SAndroid Build Coastguard Worker * or else some sequences including them, even ones passing the FCD check, 99*0e209d39SAndroid Build Coastguard Worker * do not yield canonically equivalent results. 100*0e209d39SAndroid Build Coastguard Worker * 101*0e209d39SAndroid Build Coastguard Worker * This is a fast and imprecise test. 102*0e209d39SAndroid Build Coastguard Worker * 103*0e209d39SAndroid Build Coastguard Worker * @param c a code point 104*0e209d39SAndroid Build Coastguard Worker * @return true if c is U+0F73, U+0F75 or U+0F81 or one of several other Tibetan characters 105*0e209d39SAndroid Build Coastguard Worker */ maybeTibetanCompositeVowel(UChar32 c)106*0e209d39SAndroid Build Coastguard Worker static inline UBool maybeTibetanCompositeVowel(UChar32 c) { 107*0e209d39SAndroid Build Coastguard Worker return (c & 0x1fff01) == 0xf01; 108*0e209d39SAndroid Build Coastguard Worker } 109*0e209d39SAndroid Build Coastguard Worker 110*0e209d39SAndroid Build Coastguard Worker /** 111*0e209d39SAndroid Build Coastguard Worker * Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81) 112*0e209d39SAndroid Build Coastguard Worker * must be decomposed before reaching the core collation code, 113*0e209d39SAndroid Build Coastguard Worker * or else some sequences including them, even ones passing the FCD check, 114*0e209d39SAndroid Build Coastguard Worker * do not yield canonically equivalent results. 115*0e209d39SAndroid Build Coastguard Worker * 116*0e209d39SAndroid Build Coastguard Worker * They have distinct lccc/tccc combinations: 129/130 or 129/132. 117*0e209d39SAndroid Build Coastguard Worker * 118*0e209d39SAndroid Build Coastguard Worker * @param fcd16 the FCD value (lccc/tccc combination) of a code point 119*0e209d39SAndroid Build Coastguard Worker * @return true if fcd16 is from U+0F73, U+0F75 or U+0F81 120*0e209d39SAndroid Build Coastguard Worker */ isFCD16OfTibetanCompositeVowel(uint16_t fcd16)121*0e209d39SAndroid Build Coastguard Worker static inline UBool isFCD16OfTibetanCompositeVowel(uint16_t fcd16) { 122*0e209d39SAndroid Build Coastguard Worker return fcd16 == 0x8182 || fcd16 == 0x8184; 123*0e209d39SAndroid Build Coastguard Worker } 124*0e209d39SAndroid Build Coastguard Worker 125*0e209d39SAndroid Build Coastguard Worker private: 126*0e209d39SAndroid Build Coastguard Worker CollationFCD() = delete; // No instantiation. 127*0e209d39SAndroid Build Coastguard Worker 128*0e209d39SAndroid Build Coastguard Worker static const uint8_t lcccIndex[2048]; 129*0e209d39SAndroid Build Coastguard Worker static const uint8_t tcccIndex[2048]; 130*0e209d39SAndroid Build Coastguard Worker static const uint32_t lcccBits[]; 131*0e209d39SAndroid Build Coastguard Worker static const uint32_t tcccBits[]; 132*0e209d39SAndroid Build Coastguard Worker }; 133*0e209d39SAndroid Build Coastguard Worker 134*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 135*0e209d39SAndroid Build Coastguard Worker 136*0e209d39SAndroid Build Coastguard Worker #endif // !UCONFIG_NO_COLLATION 137*0e209d39SAndroid Build Coastguard Worker #endif // __COLLATIONFCD_H__ 138