1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2001-2015 IBM and others. All rights reserved. 6*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 7*0e209d39SAndroid Build Coastguard Worker * Date Name Description 8*0e209d39SAndroid Build Coastguard Worker * 08/13/2001 synwee Creation. 9*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 10*0e209d39SAndroid Build Coastguard Worker */ 11*0e209d39SAndroid Build Coastguard Worker #ifndef USRCHIMP_H 12*0e209d39SAndroid Build Coastguard Worker #define USRCHIMP_H 13*0e209d39SAndroid Build Coastguard Worker 14*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 15*0e209d39SAndroid Build Coastguard Worker 16*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_COLLATION 17*0e209d39SAndroid Build Coastguard Worker 18*0e209d39SAndroid Build Coastguard Worker #include "unicode/normalizer2.h" 19*0e209d39SAndroid Build Coastguard Worker #include "unicode/ucol.h" 20*0e209d39SAndroid Build Coastguard Worker #include "unicode/ucoleitr.h" 21*0e209d39SAndroid Build Coastguard Worker #include "unicode/ubrk.h" 22*0e209d39SAndroid Build Coastguard Worker 23*0e209d39SAndroid Build Coastguard Worker /* mask off anything but primary order */ 24*0e209d39SAndroid Build Coastguard Worker #define UCOL_PRIMARYORDERMASK 0xffff0000 25*0e209d39SAndroid Build Coastguard Worker /* mask off anything but secondary order */ 26*0e209d39SAndroid Build Coastguard Worker #define UCOL_SECONDARYORDERMASK 0x0000ff00 27*0e209d39SAndroid Build Coastguard Worker /* mask off anything but tertiary order */ 28*0e209d39SAndroid Build Coastguard Worker #define UCOL_TERTIARYORDERMASK 0x000000ff 29*0e209d39SAndroid Build Coastguard Worker /* primary order shift */ 30*0e209d39SAndroid Build Coastguard Worker #define UCOL_PRIMARYORDERSHIFT 16 31*0e209d39SAndroid Build Coastguard Worker /* secondary order shift */ 32*0e209d39SAndroid Build Coastguard Worker #define UCOL_SECONDARYORDERSHIFT 8 33*0e209d39SAndroid Build Coastguard Worker 34*0e209d39SAndroid Build Coastguard Worker #define UCOL_IGNORABLE 0 35*0e209d39SAndroid Build Coastguard Worker 36*0e209d39SAndroid Build Coastguard Worker /* get weights from a CE */ 37*0e209d39SAndroid Build Coastguard Worker #define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff) 38*0e209d39SAndroid Build Coastguard Worker #define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT) 39*0e209d39SAndroid Build Coastguard Worker #define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK) 40*0e209d39SAndroid Build Coastguard Worker 41*0e209d39SAndroid Build Coastguard Worker #define UCOL_CONTINUATION_MARKER 0xC0 42*0e209d39SAndroid Build Coastguard Worker 43*0e209d39SAndroid Build Coastguard Worker #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER) 44*0e209d39SAndroid Build Coastguard Worker 45*0e209d39SAndroid Build Coastguard Worker /** 46*0e209d39SAndroid Build Coastguard Worker * This indicates an error has occurred during processing or there are no more CEs 47*0e209d39SAndroid Build Coastguard Worker * to be returned. 48*0e209d39SAndroid Build Coastguard Worker */ 49*0e209d39SAndroid Build Coastguard Worker #define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX) 50*0e209d39SAndroid Build Coastguard Worker 51*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 52*0e209d39SAndroid Build Coastguard Worker 53*0e209d39SAndroid Build Coastguard Worker class CollationElementIterator; 54*0e209d39SAndroid Build Coastguard Worker class Collator; 55*0e209d39SAndroid Build Coastguard Worker 56*0e209d39SAndroid Build Coastguard Worker struct PCEI 57*0e209d39SAndroid Build Coastguard Worker { 58*0e209d39SAndroid Build Coastguard Worker uint64_t ce; 59*0e209d39SAndroid Build Coastguard Worker int32_t low; 60*0e209d39SAndroid Build Coastguard Worker int32_t high; 61*0e209d39SAndroid Build Coastguard Worker }; 62*0e209d39SAndroid Build Coastguard Worker 63*0e209d39SAndroid Build Coastguard Worker struct PCEBuffer 64*0e209d39SAndroid Build Coastguard Worker { 65*0e209d39SAndroid Build Coastguard Worker PCEI defaultBuffer[16]; 66*0e209d39SAndroid Build Coastguard Worker PCEI *buffer; 67*0e209d39SAndroid Build Coastguard Worker int32_t bufferIndex; 68*0e209d39SAndroid Build Coastguard Worker int32_t bufferSize; 69*0e209d39SAndroid Build Coastguard Worker 70*0e209d39SAndroid Build Coastguard Worker PCEBuffer(); 71*0e209d39SAndroid Build Coastguard Worker ~PCEBuffer(); 72*0e209d39SAndroid Build Coastguard Worker 73*0e209d39SAndroid Build Coastguard Worker void reset(); 74*0e209d39SAndroid Build Coastguard Worker UBool isEmpty() const; 75*0e209d39SAndroid Build Coastguard Worker void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); 76*0e209d39SAndroid Build Coastguard Worker const PCEI *get(); 77*0e209d39SAndroid Build Coastguard Worker }; 78*0e209d39SAndroid Build Coastguard Worker 79*0e209d39SAndroid Build Coastguard Worker class UCollationPCE : public UMemory { 80*0e209d39SAndroid Build Coastguard Worker private: 81*0e209d39SAndroid Build Coastguard Worker PCEBuffer pceBuffer; 82*0e209d39SAndroid Build Coastguard Worker CollationElementIterator *cei; 83*0e209d39SAndroid Build Coastguard Worker UCollationStrength strength; 84*0e209d39SAndroid Build Coastguard Worker UBool toShift; 85*0e209d39SAndroid Build Coastguard Worker UBool isShifted; 86*0e209d39SAndroid Build Coastguard Worker uint32_t variableTop; 87*0e209d39SAndroid Build Coastguard Worker 88*0e209d39SAndroid Build Coastguard Worker public: 89*0e209d39SAndroid Build Coastguard Worker UCollationPCE(UCollationElements *elems); 90*0e209d39SAndroid Build Coastguard Worker UCollationPCE(CollationElementIterator *iter); 91*0e209d39SAndroid Build Coastguard Worker ~UCollationPCE(); 92*0e209d39SAndroid Build Coastguard Worker 93*0e209d39SAndroid Build Coastguard Worker void init(UCollationElements *elems); 94*0e209d39SAndroid Build Coastguard Worker void init(CollationElementIterator *iter); 95*0e209d39SAndroid Build Coastguard Worker 96*0e209d39SAndroid Build Coastguard Worker /** 97*0e209d39SAndroid Build Coastguard Worker * Get the processed ordering priority of the next collation element in the text. 98*0e209d39SAndroid Build Coastguard Worker * A single character may contain more than one collation element. 99*0e209d39SAndroid Build Coastguard Worker * 100*0e209d39SAndroid Build Coastguard Worker * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE. 101*0e209d39SAndroid Build Coastguard Worker * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE. 102*0e209d39SAndroid Build Coastguard Worker * @param status A pointer to an UErrorCode to receive any errors. 103*0e209d39SAndroid Build Coastguard Worker * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER 104*0e209d39SAndroid Build Coastguard Worker * if an error has occurred or if the end of string has been reached 105*0e209d39SAndroid Build Coastguard Worker */ 106*0e209d39SAndroid Build Coastguard Worker int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 107*0e209d39SAndroid Build Coastguard Worker /** 108*0e209d39SAndroid Build Coastguard Worker * Get the processed ordering priority of the previous collation element in the text. 109*0e209d39SAndroid Build Coastguard Worker * A single character may contain more than one collation element. 110*0e209d39SAndroid Build Coastguard Worker * 111*0e209d39SAndroid Build Coastguard Worker * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE 112*0e209d39SAndroid Build Coastguard Worker * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE 113*0e209d39SAndroid Build Coastguard Worker * @param status A pointer to an UErrorCode to receive any errors. Notably 114*0e209d39SAndroid Build Coastguard Worker * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack 115*0e209d39SAndroid Build Coastguard Worker * buffer has been exhausted. 116*0e209d39SAndroid Build Coastguard Worker * @return The previous collation elements ordering, otherwise returns 117*0e209d39SAndroid Build Coastguard Worker * UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of 118*0e209d39SAndroid Build Coastguard Worker * string has been reached. 119*0e209d39SAndroid Build Coastguard Worker */ 120*0e209d39SAndroid Build Coastguard Worker int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 121*0e209d39SAndroid Build Coastguard Worker 122*0e209d39SAndroid Build Coastguard Worker private: 123*0e209d39SAndroid Build Coastguard Worker void init(const Collator &coll); 124*0e209d39SAndroid Build Coastguard Worker uint64_t processCE(uint32_t ce); 125*0e209d39SAndroid Build Coastguard Worker }; 126*0e209d39SAndroid Build Coastguard Worker 127*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 128*0e209d39SAndroid Build Coastguard Worker 129*0e209d39SAndroid Build Coastguard Worker #define INITIAL_ARRAY_SIZE_ 256 130*0e209d39SAndroid Build Coastguard Worker 131*0e209d39SAndroid Build Coastguard Worker struct USearch { 132*0e209d39SAndroid Build Coastguard Worker // required since collation element iterator does not have a getText API 133*0e209d39SAndroid Build Coastguard Worker const UChar *text; 134*0e209d39SAndroid Build Coastguard Worker int32_t textLength; // exact length 135*0e209d39SAndroid Build Coastguard Worker UBool isOverlap; 136*0e209d39SAndroid Build Coastguard Worker UBool isCanonicalMatch; 137*0e209d39SAndroid Build Coastguard Worker int16_t elementComparisonType; 138*0e209d39SAndroid Build Coastguard Worker UBreakIterator *internalBreakIter; // internal character breakiterator, lazily created. 139*0e209d39SAndroid Build Coastguard Worker UBreakIterator *breakIter; // caller provided character breakiterator 140*0e209d39SAndroid Build Coastguard Worker // value USEARCH_DONE is the default value 141*0e209d39SAndroid Build Coastguard Worker // if we are not at the start of the text or the end of the text, 142*0e209d39SAndroid Build Coastguard Worker // depending on the iteration direction and matchedIndex is USEARCH_DONE 143*0e209d39SAndroid Build Coastguard Worker // it means that we can't find any more matches in that particular direction 144*0e209d39SAndroid Build Coastguard Worker int32_t matchedIndex; 145*0e209d39SAndroid Build Coastguard Worker int32_t matchedLength; 146*0e209d39SAndroid Build Coastguard Worker UBool isForwardSearching; 147*0e209d39SAndroid Build Coastguard Worker UBool reset; 148*0e209d39SAndroid Build Coastguard Worker }; 149*0e209d39SAndroid Build Coastguard Worker 150*0e209d39SAndroid Build Coastguard Worker struct UPattern { 151*0e209d39SAndroid Build Coastguard Worker const UChar *text; 152*0e209d39SAndroid Build Coastguard Worker int32_t textLength; // exact length 153*0e209d39SAndroid Build Coastguard Worker // length required for backwards ce comparison 154*0e209d39SAndroid Build Coastguard Worker int32_t cesLength; 155*0e209d39SAndroid Build Coastguard Worker int32_t *ces; 156*0e209d39SAndroid Build Coastguard Worker int32_t cesBuffer[INITIAL_ARRAY_SIZE_]; 157*0e209d39SAndroid Build Coastguard Worker int32_t pcesLength; 158*0e209d39SAndroid Build Coastguard Worker int64_t *pces; 159*0e209d39SAndroid Build Coastguard Worker int64_t pcesBuffer[INITIAL_ARRAY_SIZE_]; 160*0e209d39SAndroid Build Coastguard Worker UBool hasPrefixAccents; 161*0e209d39SAndroid Build Coastguard Worker UBool hasSuffixAccents; 162*0e209d39SAndroid Build Coastguard Worker }; 163*0e209d39SAndroid Build Coastguard Worker 164*0e209d39SAndroid Build Coastguard Worker struct UStringSearch { 165*0e209d39SAndroid Build Coastguard Worker struct USearch *search; 166*0e209d39SAndroid Build Coastguard Worker struct UPattern pattern; 167*0e209d39SAndroid Build Coastguard Worker const UCollator *collator; 168*0e209d39SAndroid Build Coastguard Worker const icu::Normalizer2 *nfd; 169*0e209d39SAndroid Build Coastguard Worker // positions within the collation element iterator is used to determine 170*0e209d39SAndroid Build Coastguard Worker // if we are at the start of the text. 171*0e209d39SAndroid Build Coastguard Worker UCollationElements *textIter; 172*0e209d39SAndroid Build Coastguard Worker icu::UCollationPCE *textProcessedIter; 173*0e209d39SAndroid Build Coastguard Worker // utility collation element, used throughout program for temporary 174*0e209d39SAndroid Build Coastguard Worker // iteration. 175*0e209d39SAndroid Build Coastguard Worker UCollationElements *utilIter; 176*0e209d39SAndroid Build Coastguard Worker UBool ownCollator; 177*0e209d39SAndroid Build Coastguard Worker UCollationStrength strength; 178*0e209d39SAndroid Build Coastguard Worker uint32_t ceMask; 179*0e209d39SAndroid Build Coastguard Worker uint32_t variableTop; 180*0e209d39SAndroid Build Coastguard Worker UBool toShift; 181*0e209d39SAndroid Build Coastguard Worker }; 182*0e209d39SAndroid Build Coastguard Worker 183*0e209d39SAndroid Build Coastguard Worker /** 184*0e209d39SAndroid Build Coastguard Worker * Exact matches without checking for the ends for extra accents. 185*0e209d39SAndroid Build Coastguard Worker * The match after the position within the collation element iterator is to be 186*0e209d39SAndroid Build Coastguard Worker * found. 187*0e209d39SAndroid Build Coastguard Worker * After a match is found the offset in the collation element iterator will be 188*0e209d39SAndroid Build Coastguard Worker * shifted to the start of the match. 189*0e209d39SAndroid Build Coastguard Worker * Implementation note: 190*0e209d39SAndroid Build Coastguard Worker * For tertiary we can't use the collator->tertiaryMask, that is a 191*0e209d39SAndroid Build Coastguard Worker * preprocessed mask that takes into account case options. since we are only 192*0e209d39SAndroid Build Coastguard Worker * concerned with exact matches, we don't need that. 193*0e209d39SAndroid Build Coastguard Worker * Alternate handling - since only the 16 most significant digits is only used, 194*0e209d39SAndroid Build Coastguard Worker * we can safely do a compare without masking if the ce is a variable, we mask 195*0e209d39SAndroid Build Coastguard Worker * and get only the primary values no shifting to quartenary is required since 196*0e209d39SAndroid Build Coastguard Worker * all primary values less than variabletop will need to be masked off anyway. 197*0e209d39SAndroid Build Coastguard Worker * If the end character is composite and the pattern ce does not match the text 198*0e209d39SAndroid Build Coastguard Worker * ce, we skip it until we find a match in the end composite character or when 199*0e209d39SAndroid Build Coastguard Worker * it has passed the character. This is so that we can match pattern "a" with 200*0e209d39SAndroid Build Coastguard Worker * the text "\u00e6" 201*0e209d39SAndroid Build Coastguard Worker * @param strsrch string search data 202*0e209d39SAndroid Build Coastguard Worker * @param status error status if any 203*0e209d39SAndroid Build Coastguard Worker * @return true if an exact match is found, false otherwise 204*0e209d39SAndroid Build Coastguard Worker */ 205*0e209d39SAndroid Build Coastguard Worker U_CFUNC 206*0e209d39SAndroid Build Coastguard Worker UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status); 207*0e209d39SAndroid Build Coastguard Worker 208*0e209d39SAndroid Build Coastguard Worker /** 209*0e209d39SAndroid Build Coastguard Worker * Canonical matches. 210*0e209d39SAndroid Build Coastguard Worker * According to the definition, matches found here will include the whole span 211*0e209d39SAndroid Build Coastguard Worker * of beginning and ending accents if it overlaps that region. 212*0e209d39SAndroid Build Coastguard Worker * @param strsrch string search data 213*0e209d39SAndroid Build Coastguard Worker * @param status error status if any 214*0e209d39SAndroid Build Coastguard Worker * @return true if a canonical match is found, false otherwise 215*0e209d39SAndroid Build Coastguard Worker */ 216*0e209d39SAndroid Build Coastguard Worker U_CFUNC 217*0e209d39SAndroid Build Coastguard Worker UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status); 218*0e209d39SAndroid Build Coastguard Worker 219*0e209d39SAndroid Build Coastguard Worker /** 220*0e209d39SAndroid Build Coastguard Worker * Gets the previous match. 221*0e209d39SAndroid Build Coastguard Worker * Comments follows from handleNextExact 222*0e209d39SAndroid Build Coastguard Worker * @param strsrch string search data 223*0e209d39SAndroid Build Coastguard Worker * @param status error status if any 224*0e209d39SAndroid Build Coastguard Worker * @return True if a exact math is found, false otherwise. 225*0e209d39SAndroid Build Coastguard Worker */ 226*0e209d39SAndroid Build Coastguard Worker U_CFUNC 227*0e209d39SAndroid Build Coastguard Worker UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status); 228*0e209d39SAndroid Build Coastguard Worker 229*0e209d39SAndroid Build Coastguard Worker /** 230*0e209d39SAndroid Build Coastguard Worker * Canonical matches. 231*0e209d39SAndroid Build Coastguard Worker * According to the definition, matches found here will include the whole span 232*0e209d39SAndroid Build Coastguard Worker * of beginning and ending accents if it overlaps that region. 233*0e209d39SAndroid Build Coastguard Worker * @param strsrch string search data 234*0e209d39SAndroid Build Coastguard Worker * @param status error status if any 235*0e209d39SAndroid Build Coastguard Worker * @return true if a canonical match is found, false otherwise 236*0e209d39SAndroid Build Coastguard Worker */ 237*0e209d39SAndroid Build Coastguard Worker U_CFUNC 238*0e209d39SAndroid Build Coastguard Worker UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 239*0e209d39SAndroid Build Coastguard Worker UErrorCode *status); 240*0e209d39SAndroid Build Coastguard Worker 241*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_COLLATION */ 242*0e209d39SAndroid Build Coastguard Worker 243*0e209d39SAndroid Build Coastguard Worker #endif 244