1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2012-2016, International Business Machines 6*0e209d39SAndroid Build Coastguard Worker * Corporation and others. All Rights Reserved. 7*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 8*0e209d39SAndroid Build Coastguard Worker * utf8collationiterator.h 9*0e209d39SAndroid Build Coastguard Worker * 10*0e209d39SAndroid Build Coastguard Worker * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h) 11*0e209d39SAndroid Build Coastguard Worker * created by: Markus W. Scherer 12*0e209d39SAndroid Build Coastguard Worker */ 13*0e209d39SAndroid Build Coastguard Worker 14*0e209d39SAndroid Build Coastguard Worker #ifndef __UTF8COLLATIONITERATOR_H__ 15*0e209d39SAndroid Build Coastguard Worker #define __UTF8COLLATIONITERATOR_H__ 16*0e209d39SAndroid Build Coastguard Worker 17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 18*0e209d39SAndroid Build Coastguard Worker 19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_COLLATION 20*0e209d39SAndroid Build Coastguard Worker 21*0e209d39SAndroid Build Coastguard Worker #include "cmemory.h" 22*0e209d39SAndroid Build Coastguard Worker #include "collation.h" 23*0e209d39SAndroid Build Coastguard Worker #include "collationdata.h" 24*0e209d39SAndroid Build Coastguard Worker #include "collationiterator.h" 25*0e209d39SAndroid Build Coastguard Worker #include "normalizer2impl.h" 26*0e209d39SAndroid Build Coastguard Worker 27*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 28*0e209d39SAndroid Build Coastguard Worker 29*0e209d39SAndroid Build Coastguard Worker /** 30*0e209d39SAndroid Build Coastguard Worker * UTF-8 collation element and character iterator. 31*0e209d39SAndroid Build Coastguard Worker * Handles normalized UTF-8 text inline, with length or NUL-terminated. 32*0e209d39SAndroid Build Coastguard Worker * Unnormalized text is handled by a subclass. 33*0e209d39SAndroid Build Coastguard Worker */ 34*0e209d39SAndroid Build Coastguard Worker class U_I18N_API UTF8CollationIterator : public CollationIterator { 35*0e209d39SAndroid Build Coastguard Worker public: UTF8CollationIterator(const CollationData * d,UBool numeric,const uint8_t * s,int32_t p,int32_t len)36*0e209d39SAndroid Build Coastguard Worker UTF8CollationIterator(const CollationData *d, UBool numeric, 37*0e209d39SAndroid Build Coastguard Worker const uint8_t *s, int32_t p, int32_t len) 38*0e209d39SAndroid Build Coastguard Worker : CollationIterator(d, numeric), 39*0e209d39SAndroid Build Coastguard Worker u8(s), pos(p), length(len) {} 40*0e209d39SAndroid Build Coastguard Worker 41*0e209d39SAndroid Build Coastguard Worker virtual ~UTF8CollationIterator(); 42*0e209d39SAndroid Build Coastguard Worker 43*0e209d39SAndroid Build Coastguard Worker virtual void resetToOffset(int32_t newOffset) override; 44*0e209d39SAndroid Build Coastguard Worker 45*0e209d39SAndroid Build Coastguard Worker virtual int32_t getOffset() const override; 46*0e209d39SAndroid Build Coastguard Worker 47*0e209d39SAndroid Build Coastguard Worker virtual UChar32 nextCodePoint(UErrorCode &errorCode) override; 48*0e209d39SAndroid Build Coastguard Worker 49*0e209d39SAndroid Build Coastguard Worker virtual UChar32 previousCodePoint(UErrorCode &errorCode) override; 50*0e209d39SAndroid Build Coastguard Worker 51*0e209d39SAndroid Build Coastguard Worker protected: 52*0e209d39SAndroid Build Coastguard Worker /** 53*0e209d39SAndroid Build Coastguard Worker * For byte sequences that are illegal in UTF-8, an error value may be returned 54*0e209d39SAndroid Build Coastguard Worker * together with a bogus code point. The caller will ignore that code point. 55*0e209d39SAndroid Build Coastguard Worker * 56*0e209d39SAndroid Build Coastguard Worker * Special values may be returned for surrogate code points, which are also illegal in UTF-8, 57*0e209d39SAndroid Build Coastguard Worker * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns true. 58*0e209d39SAndroid Build Coastguard Worker * 59*0e209d39SAndroid Build Coastguard Worker * Valid lead surrogates are returned from inside a normalized text segment, 60*0e209d39SAndroid Build Coastguard Worker * where handleGetTrailSurrogate() will return the matching trail surrogate. 61*0e209d39SAndroid Build Coastguard Worker */ 62*0e209d39SAndroid Build Coastguard Worker virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override; 63*0e209d39SAndroid Build Coastguard Worker 64*0e209d39SAndroid Build Coastguard Worker virtual UBool foundNULTerminator() override; 65*0e209d39SAndroid Build Coastguard Worker 66*0e209d39SAndroid Build Coastguard Worker virtual UBool forbidSurrogateCodePoints() const override; 67*0e209d39SAndroid Build Coastguard Worker 68*0e209d39SAndroid Build Coastguard Worker virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; 69*0e209d39SAndroid Build Coastguard Worker 70*0e209d39SAndroid Build Coastguard Worker virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; 71*0e209d39SAndroid Build Coastguard Worker 72*0e209d39SAndroid Build Coastguard Worker const uint8_t *u8; 73*0e209d39SAndroid Build Coastguard Worker int32_t pos; 74*0e209d39SAndroid Build Coastguard Worker int32_t length; // <0 for NUL-terminated strings 75*0e209d39SAndroid Build Coastguard Worker }; 76*0e209d39SAndroid Build Coastguard Worker 77*0e209d39SAndroid Build Coastguard Worker /** 78*0e209d39SAndroid Build Coastguard Worker * Incrementally checks the input text for FCD and normalizes where necessary. 79*0e209d39SAndroid Build Coastguard Worker */ 80*0e209d39SAndroid Build Coastguard Worker class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator { 81*0e209d39SAndroid Build Coastguard Worker public: FCDUTF8CollationIterator(const CollationData * data,UBool numeric,const uint8_t * s,int32_t p,int32_t len)82*0e209d39SAndroid Build Coastguard Worker FCDUTF8CollationIterator(const CollationData *data, UBool numeric, 83*0e209d39SAndroid Build Coastguard Worker const uint8_t *s, int32_t p, int32_t len) 84*0e209d39SAndroid Build Coastguard Worker : UTF8CollationIterator(data, numeric, s, p, len), 85*0e209d39SAndroid Build Coastguard Worker state(CHECK_FWD), start(p), 86*0e209d39SAndroid Build Coastguard Worker nfcImpl(data->nfcImpl) {} 87*0e209d39SAndroid Build Coastguard Worker 88*0e209d39SAndroid Build Coastguard Worker virtual ~FCDUTF8CollationIterator(); 89*0e209d39SAndroid Build Coastguard Worker 90*0e209d39SAndroid Build Coastguard Worker virtual void resetToOffset(int32_t newOffset) override; 91*0e209d39SAndroid Build Coastguard Worker 92*0e209d39SAndroid Build Coastguard Worker virtual int32_t getOffset() const override; 93*0e209d39SAndroid Build Coastguard Worker 94*0e209d39SAndroid Build Coastguard Worker virtual UChar32 nextCodePoint(UErrorCode &errorCode) override; 95*0e209d39SAndroid Build Coastguard Worker 96*0e209d39SAndroid Build Coastguard Worker virtual UChar32 previousCodePoint(UErrorCode &errorCode) override; 97*0e209d39SAndroid Build Coastguard Worker 98*0e209d39SAndroid Build Coastguard Worker protected: 99*0e209d39SAndroid Build Coastguard Worker virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override; 100*0e209d39SAndroid Build Coastguard Worker 101*0e209d39SAndroid Build Coastguard Worker virtual char16_t handleGetTrailSurrogate() override; 102*0e209d39SAndroid Build Coastguard Worker 103*0e209d39SAndroid Build Coastguard Worker virtual UBool foundNULTerminator() override; 104*0e209d39SAndroid Build Coastguard Worker 105*0e209d39SAndroid Build Coastguard Worker virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; 106*0e209d39SAndroid Build Coastguard Worker 107*0e209d39SAndroid Build Coastguard Worker virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; 108*0e209d39SAndroid Build Coastguard Worker 109*0e209d39SAndroid Build Coastguard Worker private: 110*0e209d39SAndroid Build Coastguard Worker UBool nextHasLccc() const; 111*0e209d39SAndroid Build Coastguard Worker UBool previousHasTccc() const; 112*0e209d39SAndroid Build Coastguard Worker 113*0e209d39SAndroid Build Coastguard Worker /** 114*0e209d39SAndroid Build Coastguard Worker * Switches to forward checking if possible. 115*0e209d39SAndroid Build Coastguard Worker */ 116*0e209d39SAndroid Build Coastguard Worker void switchToForward(); 117*0e209d39SAndroid Build Coastguard Worker 118*0e209d39SAndroid Build Coastguard Worker /** 119*0e209d39SAndroid Build Coastguard Worker * Extends the FCD text segment forward or normalizes around pos. 120*0e209d39SAndroid Build Coastguard Worker * @return true if success 121*0e209d39SAndroid Build Coastguard Worker */ 122*0e209d39SAndroid Build Coastguard Worker UBool nextSegment(UErrorCode &errorCode); 123*0e209d39SAndroid Build Coastguard Worker 124*0e209d39SAndroid Build Coastguard Worker /** 125*0e209d39SAndroid Build Coastguard Worker * Switches to backward checking. 126*0e209d39SAndroid Build Coastguard Worker */ 127*0e209d39SAndroid Build Coastguard Worker void switchToBackward(); 128*0e209d39SAndroid Build Coastguard Worker 129*0e209d39SAndroid Build Coastguard Worker /** 130*0e209d39SAndroid Build Coastguard Worker * Extends the FCD text segment backward or normalizes around pos. 131*0e209d39SAndroid Build Coastguard Worker * @return true if success 132*0e209d39SAndroid Build Coastguard Worker */ 133*0e209d39SAndroid Build Coastguard Worker UBool previousSegment(UErrorCode &errorCode); 134*0e209d39SAndroid Build Coastguard Worker 135*0e209d39SAndroid Build Coastguard Worker UBool normalize(const UnicodeString &s, UErrorCode &errorCode); 136*0e209d39SAndroid Build Coastguard Worker 137*0e209d39SAndroid Build Coastguard Worker enum State { 138*0e209d39SAndroid Build Coastguard Worker /** 139*0e209d39SAndroid Build Coastguard Worker * The input text [start..pos[ passes the FCD check. 140*0e209d39SAndroid Build Coastguard Worker * Moving forward checks incrementally. 141*0e209d39SAndroid Build Coastguard Worker * limit is undefined. 142*0e209d39SAndroid Build Coastguard Worker */ 143*0e209d39SAndroid Build Coastguard Worker CHECK_FWD, 144*0e209d39SAndroid Build Coastguard Worker /** 145*0e209d39SAndroid Build Coastguard Worker * The input text [pos..limit[ passes the FCD check. 146*0e209d39SAndroid Build Coastguard Worker * Moving backward checks incrementally. 147*0e209d39SAndroid Build Coastguard Worker * start is undefined. 148*0e209d39SAndroid Build Coastguard Worker */ 149*0e209d39SAndroid Build Coastguard Worker CHECK_BWD, 150*0e209d39SAndroid Build Coastguard Worker /** 151*0e209d39SAndroid Build Coastguard Worker * The input text [start..limit[ passes the FCD check. 152*0e209d39SAndroid Build Coastguard Worker * pos tracks the current text index. 153*0e209d39SAndroid Build Coastguard Worker */ 154*0e209d39SAndroid Build Coastguard Worker IN_FCD_SEGMENT, 155*0e209d39SAndroid Build Coastguard Worker /** 156*0e209d39SAndroid Build Coastguard Worker * The input text [start..limit[ failed the FCD check and was normalized. 157*0e209d39SAndroid Build Coastguard Worker * pos tracks the current index in the normalized string. 158*0e209d39SAndroid Build Coastguard Worker */ 159*0e209d39SAndroid Build Coastguard Worker IN_NORMALIZED 160*0e209d39SAndroid Build Coastguard Worker }; 161*0e209d39SAndroid Build Coastguard Worker 162*0e209d39SAndroid Build Coastguard Worker State state; 163*0e209d39SAndroid Build Coastguard Worker 164*0e209d39SAndroid Build Coastguard Worker int32_t start; 165*0e209d39SAndroid Build Coastguard Worker int32_t limit; 166*0e209d39SAndroid Build Coastguard Worker 167*0e209d39SAndroid Build Coastguard Worker const Normalizer2Impl &nfcImpl; 168*0e209d39SAndroid Build Coastguard Worker UnicodeString normalized; 169*0e209d39SAndroid Build Coastguard Worker }; 170*0e209d39SAndroid Build Coastguard Worker 171*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 172*0e209d39SAndroid Build Coastguard Worker 173*0e209d39SAndroid Build Coastguard Worker #endif // !UCONFIG_NO_COLLATION 174*0e209d39SAndroid Build Coastguard Worker #endif // __UTF8COLLATIONITERATOR_H__ 175