1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker *************************************************************************** 5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 1999-2016 International Business Machines Corporation * 6*0e209d39SAndroid Build Coastguard Worker * and others. All rights reserved. * 7*0e209d39SAndroid Build Coastguard Worker *************************************************************************** 8*0e209d39SAndroid Build Coastguard Worker 9*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 10*0e209d39SAndroid Build Coastguard Worker * Date Name Description 11*0e209d39SAndroid Build Coastguard Worker * 10/22/99 alan Creation. 12*0e209d39SAndroid Build Coastguard Worker * 11/11/99 rgillam Complete port from Java. 
13*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 14*0e209d39SAndroid Build Coastguard Worker */ 15*0e209d39SAndroid Build Coastguard Worker 16*0e209d39SAndroid Build Coastguard Worker #ifndef RBBI_H 17*0e209d39SAndroid Build Coastguard Worker #define RBBI_H 18*0e209d39SAndroid Build Coastguard Worker 19*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 20*0e209d39SAndroid Build Coastguard Worker 21*0e209d39SAndroid Build Coastguard Worker #if U_SHOW_CPLUSPLUS_API 22*0e209d39SAndroid Build Coastguard Worker 23*0e209d39SAndroid Build Coastguard Worker /** 24*0e209d39SAndroid Build Coastguard Worker * \file 25*0e209d39SAndroid Build Coastguard Worker * \brief C++ API: Rule Based Break Iterator 26*0e209d39SAndroid Build Coastguard Worker */ 27*0e209d39SAndroid Build Coastguard Worker 28*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_BREAK_ITERATION 29*0e209d39SAndroid Build Coastguard Worker 30*0e209d39SAndroid Build Coastguard Worker #include "unicode/brkiter.h" 31*0e209d39SAndroid Build Coastguard Worker #include "unicode/udata.h" 32*0e209d39SAndroid Build Coastguard Worker #include "unicode/parseerr.h" 33*0e209d39SAndroid Build Coastguard Worker #include "unicode/schriter.h" 34*0e209d39SAndroid Build Coastguard Worker 35*0e209d39SAndroid Build Coastguard Worker struct UCPTrie; 36*0e209d39SAndroid Build Coastguard Worker 37*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 38*0e209d39SAndroid Build Coastguard Worker 39*0e209d39SAndroid Build Coastguard Worker /** @internal */ 40*0e209d39SAndroid Build Coastguard Worker class LanguageBreakEngine; 41*0e209d39SAndroid Build Coastguard Worker struct RBBIDataHeader; 42*0e209d39SAndroid Build Coastguard Worker class RBBIDataWrapper; 43*0e209d39SAndroid Build Coastguard Worker class UnhandledEngine; 44*0e209d39SAndroid Build Coastguard Worker class UStack; 45*0e209d39SAndroid Build Coastguard Worker 46*0e209d39SAndroid 
Build Coastguard Worker 47*0e209d39SAndroid Build Coastguard Worker #ifndef U_HIDE_INTERNAL_API 48*0e209d39SAndroid Build Coastguard Worker /** 49*0e209d39SAndroid Build Coastguard Worker * The ExternalBreakEngine class defines an abstract interface for the host environment 50*0e209d39SAndroid Build Coastguard Worker * to provide a low level facility to break text for unicode text in scripts whose text boundaries 51*0e209d39SAndroid Build Coastguard Worker * cannot be handled by upper level rule based logic, for example, for Chinese and Japanese 52*0e209d39SAndroid Build Coastguard Worker * word breaking, Thai, Khmer, Burmese, Lao and other Southeast Asian scripts. 53*0e209d39SAndroid Build Coastguard Worker * The host environment implements one or more subclasses of ExternalBreakEngine and 54*0e209d39SAndroid Build Coastguard Worker * registers them at initialization time by calling 55*0e209d39SAndroid Build Coastguard Worker * RuleBasedBreakIterator::registerExternalBreakEngine(). ICU adopts and owns the engine and will 56*0e209d39SAndroid Build Coastguard Worker * delete the registered external engine at the proper time during the clean up 57*0e209d39SAndroid Build Coastguard Worker * event. 
58*0e209d39SAndroid Build Coastguard Worker * @internal ICU 74 technology preview 59*0e209d39SAndroid Build Coastguard Worker */ 60*0e209d39SAndroid Build Coastguard Worker class ExternalBreakEngine : public UObject { 61*0e209d39SAndroid Build Coastguard Worker public: 62*0e209d39SAndroid Build Coastguard Worker /** 63*0e209d39SAndroid Build Coastguard Worker * Virtual destructor, so that an engine can be deleted through a base-class pointer. 64*0e209d39SAndroid Build Coastguard Worker * @internal ICU 74 technology preview 65*0e209d39SAndroid Build Coastguard Worker */ ~ExternalBreakEngine()66*0e209d39SAndroid Build Coastguard Worker virtual ~ExternalBreakEngine() {} 67*0e209d39SAndroid Build Coastguard Worker 68*0e209d39SAndroid Build Coastguard Worker /** 69*0e209d39SAndroid Build Coastguard Worker * <p>Indicate whether this engine handles a particular character when 70*0e209d39SAndroid Build Coastguard Worker * the RuleBasedBreakIterator is used for a particular locale. This method is used 71*0e209d39SAndroid Build Coastguard Worker * by the RuleBasedBreakIterator to find a break engine.</p> 72*0e209d39SAndroid Build Coastguard Worker * @param c A character which begins a run that the engine might handle. 73*0e209d39SAndroid Build Coastguard Worker * @param locale The locale. 74*0e209d39SAndroid Build Coastguard Worker * @return true if this engine handles the particular character for that locale. 
75*0e209d39SAndroid Build Coastguard Worker * @internal ICU 74 technology preview 76*0e209d39SAndroid Build Coastguard Worker */ 77*0e209d39SAndroid Build Coastguard Worker virtual bool isFor(UChar32 c, const char* locale) const = 0; 78*0e209d39SAndroid Build Coastguard Worker 79*0e209d39SAndroid Build Coastguard Worker /** 80*0e209d39SAndroid Build Coastguard Worker * <p>Indicate whether this engine handles a particular character. This method is 81*0e209d39SAndroid Build Coastguard Worker * used by the RuleBasedBreakIterator after it has already found a break engine to see which 82*0e209d39SAndroid Build Coastguard Worker * characters after the first one can be handled by this break engine.</p> 83*0e209d39SAndroid Build Coastguard Worker * @param c A character that the engine might handle. 84*0e209d39SAndroid Build Coastguard Worker * @return true if this engine handles the particular character. 85*0e209d39SAndroid Build Coastguard Worker * @internal ICU 74 technology preview 86*0e209d39SAndroid Build Coastguard Worker */ 87*0e209d39SAndroid Build Coastguard Worker virtual bool handles(UChar32 c) const = 0; 88*0e209d39SAndroid Build Coastguard Worker 89*0e209d39SAndroid Build Coastguard Worker /** 90*0e209d39SAndroid Build Coastguard Worker * <p>Divide up a range of text handled by this break engine.</p> 91*0e209d39SAndroid Build Coastguard Worker * 92*0e209d39SAndroid Build Coastguard Worker * @param text A UText representing the text 93*0e209d39SAndroid Build Coastguard Worker * @param start The start of the range of known characters 94*0e209d39SAndroid Build Coastguard Worker * @param end The end of the range of known characters 95*0e209d39SAndroid Build Coastguard Worker * @param foundBreaks Output of C array of int32_t break positions, or 96*0e209d39SAndroid Build Coastguard Worker * nullptr 97*0e209d39SAndroid Build Coastguard Worker * @param foundBreaksCapacity The capacity of foundBreaks 98*0e209d39SAndroid Build Coastguard Worker * @param status Information on 
any errors encountered. 99*0e209d39SAndroid Build Coastguard Worker * @return The number of breaks found 100*0e209d39SAndroid Build Coastguard Worker * @internal ICU 74 technology preview 101*0e209d39SAndroid Build Coastguard Worker */ 102*0e209d39SAndroid Build Coastguard Worker virtual int32_t fillBreaks(UText* text, int32_t start, int32_t end, 103*0e209d39SAndroid Build Coastguard Worker int32_t* foundBreaks, int32_t foundBreaksCapacity, 104*0e209d39SAndroid Build Coastguard Worker UErrorCode& status) const = 0; 105*0e209d39SAndroid Build Coastguard Worker }; 106*0e209d39SAndroid Build Coastguard Worker #endif /* U_HIDE_INTERNAL_API */ 107*0e209d39SAndroid Build Coastguard Worker 108*0e209d39SAndroid Build Coastguard Worker 109*0e209d39SAndroid Build Coastguard Worker /** 110*0e209d39SAndroid Build Coastguard Worker * 111*0e209d39SAndroid Build Coastguard Worker * A subclass of BreakIterator whose behavior is specified using a list of rules. 112*0e209d39SAndroid Build Coastguard Worker * <p>Instances of this class are most commonly created by the factory methods of 113*0e209d39SAndroid Build Coastguard Worker * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc., 114*0e209d39SAndroid Build Coastguard Worker * and then used via the abstract API in class BreakIterator</p> 115*0e209d39SAndroid Build Coastguard Worker * 116*0e209d39SAndroid Build Coastguard Worker * <p>See the ICU User Guide for information on Break Iterator Rules.</p> 117*0e209d39SAndroid Build Coastguard Worker * 118*0e209d39SAndroid Build Coastguard Worker * <p>This class is not intended to be subclassed.</p> 119*0e209d39SAndroid Build Coastguard Worker */ 120*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator { 121*0e209d39SAndroid Build Coastguard Worker 122*0e209d39SAndroid Build Coastguard Worker private: 123*0e209d39SAndroid Build Coastguard Worker /** 124*0e209d39SAndroid Build Coastguard Worker * 
The UText through which this BreakIterator accesses the text 125*0e209d39SAndroid Build Coastguard Worker * @internal (private) 126*0e209d39SAndroid Build Coastguard Worker */ 127*0e209d39SAndroid Build Coastguard Worker UText fText = UTEXT_INITIALIZER; 128*0e209d39SAndroid Build Coastguard Worker 129*0e209d39SAndroid Build Coastguard Worker #ifndef U_HIDE_INTERNAL_API 130*0e209d39SAndroid Build Coastguard Worker public: 131*0e209d39SAndroid Build Coastguard Worker #endif /* U_HIDE_INTERNAL_API */ 132*0e209d39SAndroid Build Coastguard Worker /** 133*0e209d39SAndroid Build Coastguard Worker * The rule data for this BreakIterator instance. 134*0e209d39SAndroid Build Coastguard Worker * Not for general use; Public only for testing purposes. 135*0e209d39SAndroid Build Coastguard Worker * @internal 136*0e209d39SAndroid Build Coastguard Worker */ 137*0e209d39SAndroid Build Coastguard Worker RBBIDataWrapper *fData = nullptr; 138*0e209d39SAndroid Build Coastguard Worker 139*0e209d39SAndroid Build Coastguard Worker private: 140*0e209d39SAndroid Build Coastguard Worker /** 141*0e209d39SAndroid Build Coastguard Worker * The saved error code associated with this break iterator. 142*0e209d39SAndroid Build Coastguard Worker * This is the value to be returned by copyErrorTo(). 143*0e209d39SAndroid Build Coastguard Worker */ 144*0e209d39SAndroid Build Coastguard Worker UErrorCode fErrorCode = U_ZERO_ERROR; 145*0e209d39SAndroid Build Coastguard Worker 146*0e209d39SAndroid Build Coastguard Worker /** 147*0e209d39SAndroid Build Coastguard Worker * The current position of the iterator. Pinned, 0 <= fPosition <= text.length. 148*0e209d39SAndroid Build Coastguard Worker * Never has the value UBRK_DONE (-1). 
149*0e209d39SAndroid Build Coastguard Worker */ 150*0e209d39SAndroid Build Coastguard Worker int32_t fPosition = 0; 151*0e209d39SAndroid Build Coastguard Worker 152*0e209d39SAndroid Build Coastguard Worker /** 153*0e209d39SAndroid Build Coastguard Worker * TODO: 154*0e209d39SAndroid Build Coastguard Worker */ 155*0e209d39SAndroid Build Coastguard Worker int32_t fRuleStatusIndex = 0; 156*0e209d39SAndroid Build Coastguard Worker 157*0e209d39SAndroid Build Coastguard Worker /** 158*0e209d39SAndroid Build Coastguard Worker * Cache of previously determined boundary positions. 159*0e209d39SAndroid Build Coastguard Worker */ 160*0e209d39SAndroid Build Coastguard Worker class BreakCache; 161*0e209d39SAndroid Build Coastguard Worker BreakCache *fBreakCache = nullptr; 162*0e209d39SAndroid Build Coastguard Worker 163*0e209d39SAndroid Build Coastguard Worker /** 164*0e209d39SAndroid Build Coastguard Worker * Cache of boundary positions within a region of text that has been 165*0e209d39SAndroid Build Coastguard Worker * sub-divided by dictionary based breaking. 166*0e209d39SAndroid Build Coastguard Worker */ 167*0e209d39SAndroid Build Coastguard Worker class DictionaryCache; 168*0e209d39SAndroid Build Coastguard Worker DictionaryCache *fDictionaryCache = nullptr; 169*0e209d39SAndroid Build Coastguard Worker 170*0e209d39SAndroid Build Coastguard Worker /** 171*0e209d39SAndroid Build Coastguard Worker * 172*0e209d39SAndroid Build Coastguard Worker * If present, UStack of LanguageBreakEngine objects that might handle 173*0e209d39SAndroid Build Coastguard Worker * dictionary characters. Searched from top to bottom to find an object to 174*0e209d39SAndroid Build Coastguard Worker * handle a given character. 
175*0e209d39SAndroid Build Coastguard Worker * @internal (private) 176*0e209d39SAndroid Build Coastguard Worker */ 177*0e209d39SAndroid Build Coastguard Worker UStack *fLanguageBreakEngines = nullptr; 178*0e209d39SAndroid Build Coastguard Worker 179*0e209d39SAndroid Build Coastguard Worker /** 180*0e209d39SAndroid Build Coastguard Worker * 181*0e209d39SAndroid Build Coastguard Worker * If present, the special LanguageBreakEngine used for handling 182*0e209d39SAndroid Build Coastguard Worker * characters that are in the dictionary set, but not handled by any 183*0e209d39SAndroid Build Coastguard Worker * LanguageBreakEngine. 184*0e209d39SAndroid Build Coastguard Worker * @internal (private) 185*0e209d39SAndroid Build Coastguard Worker */ 186*0e209d39SAndroid Build Coastguard Worker UnhandledEngine *fUnhandledBreakEngine = nullptr; 187*0e209d39SAndroid Build Coastguard Worker 188*0e209d39SAndroid Build Coastguard Worker /** 189*0e209d39SAndroid Build Coastguard Worker * Counter for the number of characters encountered with the "dictionary" 190*0e209d39SAndroid Build Coastguard Worker * flag set. 191*0e209d39SAndroid Build Coastguard Worker * @internal (private) 192*0e209d39SAndroid Build Coastguard Worker */ 193*0e209d39SAndroid Build Coastguard Worker uint32_t fDictionaryCharCount = 0; 194*0e209d39SAndroid Build Coastguard Worker 195*0e209d39SAndroid Build Coastguard Worker /** 196*0e209d39SAndroid Build Coastguard Worker * A character iterator that refers to the same text as the UText, above. 197*0e209d39SAndroid Build Coastguard Worker * Only included for compatibility with old API, which was based on CharacterIterators. 198*0e209d39SAndroid Build Coastguard Worker * Value may be adopted from outside, or one of fSCharIter or fDCharIter, below. 
199*0e209d39SAndroid Build Coastguard Worker */ 200*0e209d39SAndroid Build Coastguard Worker CharacterIterator *fCharIter = &fSCharIter; 201*0e209d39SAndroid Build Coastguard Worker 202*0e209d39SAndroid Build Coastguard Worker /** 203*0e209d39SAndroid Build Coastguard Worker * When the input text is provided by a UnicodeString, this will point to 204*0e209d39SAndroid Build Coastguard Worker * a characterIterator that wraps that data. Needed only for the 205*0e209d39SAndroid Build Coastguard Worker * implementation of getText(), a backwards compatibility issue. 206*0e209d39SAndroid Build Coastguard Worker */ 207*0e209d39SAndroid Build Coastguard Worker UCharCharacterIterator fSCharIter {u"", 0}; 208*0e209d39SAndroid Build Coastguard Worker 209*0e209d39SAndroid Build Coastguard Worker /** 210*0e209d39SAndroid Build Coastguard Worker * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 211*0e209d39SAndroid Build Coastguard Worker */ 212*0e209d39SAndroid Build Coastguard Worker bool fDone = false; 213*0e209d39SAndroid Build Coastguard Worker 214*0e209d39SAndroid Build Coastguard Worker /** 215*0e209d39SAndroid Build Coastguard Worker * Array of look-ahead tentative results. 216*0e209d39SAndroid Build Coastguard Worker */ 217*0e209d39SAndroid Build Coastguard Worker int32_t *fLookAheadMatches = nullptr; 218*0e209d39SAndroid Build Coastguard Worker 219*0e209d39SAndroid Build Coastguard Worker /** 220*0e209d39SAndroid Build Coastguard Worker * A flag to indicate if phrase based breaking is enabled. 
221*0e209d39SAndroid Build Coastguard Worker */ 222*0e209d39SAndroid Build Coastguard Worker UBool fIsPhraseBreaking = false; 223*0e209d39SAndroid Build Coastguard Worker 224*0e209d39SAndroid Build Coastguard Worker //======================================================================= 225*0e209d39SAndroid Build Coastguard Worker // constructors 226*0e209d39SAndroid Build Coastguard Worker //======================================================================= 227*0e209d39SAndroid Build Coastguard Worker 228*0e209d39SAndroid Build Coastguard Worker /** 229*0e209d39SAndroid Build Coastguard Worker * Constructor from a flattened set of RBBI data in malloced memory. 230*0e209d39SAndroid Build Coastguard Worker * RuleBasedBreakIterators built from a custom set of rules 231*0e209d39SAndroid Build Coastguard Worker * are created via this constructor; the rules are compiled 232*0e209d39SAndroid Build Coastguard Worker * into memory, then the break iterator is constructed here. 233*0e209d39SAndroid Build Coastguard Worker * 234*0e209d39SAndroid Build Coastguard Worker * The break iterator adopts the memory, and will 235*0e209d39SAndroid Build Coastguard Worker * free it when done. 236*0e209d39SAndroid Build Coastguard Worker * @internal (private) 237*0e209d39SAndroid Build Coastguard Worker */ 238*0e209d39SAndroid Build Coastguard Worker RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); 239*0e209d39SAndroid Build Coastguard Worker 240*0e209d39SAndroid Build Coastguard Worker /** 241*0e209d39SAndroid Build Coastguard Worker * This constructor uses the udata interface to create a BreakIterator 242*0e209d39SAndroid Build Coastguard Worker * whose internal tables live in a memory-mapped file. "image" is an 243*0e209d39SAndroid Build Coastguard Worker * ICU UDataMemory handle for the pre-compiled break iterator tables. 244*0e209d39SAndroid Build Coastguard Worker * @param image handle to the memory image for the break iterator data. 
245*0e209d39SAndroid Build Coastguard Worker * Ownership of the UDataMemory handle passes to the Break Iterator, 246*0e209d39SAndroid Build Coastguard Worker * which will be responsible for closing it when it is no longer needed. 247*0e209d39SAndroid Build Coastguard Worker * @param status Information on any errors encountered. 248*0e209d39SAndroid Build Coastguard Worker * @param isPhraseBreaking true if phrase based breaking is required, otherwise false. 249*0e209d39SAndroid Build Coastguard Worker * @see udata_open 250*0e209d39SAndroid Build Coastguard Worker * @see #getBinaryRules 251*0e209d39SAndroid Build Coastguard Worker * @internal (private) 252*0e209d39SAndroid Build Coastguard Worker */ 253*0e209d39SAndroid Build Coastguard Worker RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status); 254*0e209d39SAndroid Build Coastguard Worker 255*0e209d39SAndroid Build Coastguard Worker /** @internal */ 256*0e209d39SAndroid Build Coastguard Worker friend class RBBIRuleBuilder; 257*0e209d39SAndroid Build Coastguard Worker /** @internal */ 258*0e209d39SAndroid Build Coastguard Worker friend class BreakIterator; 259*0e209d39SAndroid Build Coastguard Worker 260*0e209d39SAndroid Build Coastguard Worker /** 261*0e209d39SAndroid Build Coastguard Worker * Default constructor with an error code parameter. 262*0e209d39SAndroid Build Coastguard Worker * Aside from error handling, otherwise identical to the default constructor. 263*0e209d39SAndroid Build Coastguard Worker * Internally, handles common initialization for other constructors. 264*0e209d39SAndroid Build Coastguard Worker * @internal (private) 265*0e209d39SAndroid Build Coastguard Worker */ 266*0e209d39SAndroid Build Coastguard Worker RuleBasedBreakIterator(UErrorCode *status); 267*0e209d39SAndroid Build Coastguard Worker 268*0e209d39SAndroid Build Coastguard Worker public: 269*0e209d39SAndroid Build Coastguard Worker 270*0e209d39SAndroid Build Coastguard Worker /** Default constructor. 
Creates an empty shell of an iterator, with no 271*0e209d39SAndroid Build Coastguard Worker * rules or text to iterate over. Object can subsequently be assigned to, 272*0e209d39SAndroid Build Coastguard Worker * but is otherwise unusable. 273*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.2 274*0e209d39SAndroid Build Coastguard Worker */ 275*0e209d39SAndroid Build Coastguard Worker RuleBasedBreakIterator(); 276*0e209d39SAndroid Build Coastguard Worker 277*0e209d39SAndroid Build Coastguard Worker /** 278*0e209d39SAndroid Build Coastguard Worker * Copy constructor. Will produce a break iterator with the same behavior, 279*0e209d39SAndroid Build Coastguard Worker * and which iterates over the same text, as the one passed in. 280*0e209d39SAndroid Build Coastguard Worker * @param that The RuleBasedBreakIterator passed to be copied 281*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 282*0e209d39SAndroid Build Coastguard Worker */ 283*0e209d39SAndroid Build Coastguard Worker RuleBasedBreakIterator(const RuleBasedBreakIterator& that); 284*0e209d39SAndroid Build Coastguard Worker 285*0e209d39SAndroid Build Coastguard Worker /** 286*0e209d39SAndroid Build Coastguard Worker * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 287*0e209d39SAndroid Build Coastguard Worker * @param rules The break rules to be used. 288*0e209d39SAndroid Build Coastguard Worker * @param parseError In the event of a syntax error in the rules, provides the location 289*0e209d39SAndroid Build Coastguard Worker * within the rules of the problem. 290*0e209d39SAndroid Build Coastguard Worker * @param status Information on any errors encountered. 
291*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.2 292*0e209d39SAndroid Build Coastguard Worker */ 293*0e209d39SAndroid Build Coastguard Worker RuleBasedBreakIterator( const UnicodeString &rules, 294*0e209d39SAndroid Build Coastguard Worker UParseError &parseError, 295*0e209d39SAndroid Build Coastguard Worker UErrorCode &status); 296*0e209d39SAndroid Build Coastguard Worker 297*0e209d39SAndroid Build Coastguard Worker /** 298*0e209d39SAndroid Build Coastguard Worker * Construct a RuleBasedBreakIterator from a set of precompiled binary rules. 299*0e209d39SAndroid Build Coastguard Worker * Binary rules are obtained from RuleBasedBreakIterator::getBinaryRules(). 300*0e209d39SAndroid Build Coastguard Worker * Construction of a break iterator in this way is substantially faster than 301*0e209d39SAndroid Build Coastguard Worker * construction from source rules. 302*0e209d39SAndroid Build Coastguard Worker * 303*0e209d39SAndroid Build Coastguard Worker * Ownership of the storage containing the compiled rules remains with the 304*0e209d39SAndroid Build Coastguard Worker * caller of this function. The compiled rules must not be modified or 305*0e209d39SAndroid Build Coastguard Worker * deleted during the life of the break iterator. 306*0e209d39SAndroid Build Coastguard Worker * 307*0e209d39SAndroid Build Coastguard Worker * The compiled rules are not compatible across different major versions of ICU. 308*0e209d39SAndroid Build Coastguard Worker * The compiled rules are compatible only between machines with the same 309*0e209d39SAndroid Build Coastguard Worker * byte ordering (little or big endian) and the same base character set family 310*0e209d39SAndroid Build Coastguard Worker * (ASCII or EBCDIC). 311*0e209d39SAndroid Build Coastguard Worker * 312*0e209d39SAndroid Build Coastguard Worker * @see #getBinaryRules 313*0e209d39SAndroid Build Coastguard Worker * @param compiledRules A pointer to the compiled break rules to be used. 
314*0e209d39SAndroid Build Coastguard Worker * @param ruleLength The length of the compiled break rules, in bytes. This 315*0e209d39SAndroid Build Coastguard Worker * corresponds to the length value produced by getBinaryRules(). 316*0e209d39SAndroid Build Coastguard Worker * @param status Information on any errors encountered, including invalid 317*0e209d39SAndroid Build Coastguard Worker * binary rules. 318*0e209d39SAndroid Build Coastguard Worker * @stable ICU 4.8 319*0e209d39SAndroid Build Coastguard Worker */ 320*0e209d39SAndroid Build Coastguard Worker RuleBasedBreakIterator(const uint8_t *compiledRules, 321*0e209d39SAndroid Build Coastguard Worker uint32_t ruleLength, 322*0e209d39SAndroid Build Coastguard Worker UErrorCode &status); 323*0e209d39SAndroid Build Coastguard Worker 324*0e209d39SAndroid Build Coastguard Worker /** 325*0e209d39SAndroid Build Coastguard Worker * This constructor uses the udata interface to create a BreakIterator 326*0e209d39SAndroid Build Coastguard Worker * whose internal tables live in a memory-mapped file. "image" is an 327*0e209d39SAndroid Build Coastguard Worker * ICU UDataMemory handle for the pre-compiled break iterator tables. 328*0e209d39SAndroid Build Coastguard Worker * @param image handle to the memory image for the break iterator data. 329*0e209d39SAndroid Build Coastguard Worker * Ownership of the UDataMemory handle passes to the Break Iterator, 330*0e209d39SAndroid Build Coastguard Worker * which will be responsible for closing it when it is no longer needed. 331*0e209d39SAndroid Build Coastguard Worker * @param status Information on any errors encountered. 
332*0e209d39SAndroid Build Coastguard Worker * @see udata_open 333*0e209d39SAndroid Build Coastguard Worker * @see #getBinaryRules 334*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.8 335*0e209d39SAndroid Build Coastguard Worker */ 336*0e209d39SAndroid Build Coastguard Worker RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status); 337*0e209d39SAndroid Build Coastguard Worker 338*0e209d39SAndroid Build Coastguard Worker /** 339*0e209d39SAndroid Build Coastguard Worker * Destructor 340*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 341*0e209d39SAndroid Build Coastguard Worker */ 342*0e209d39SAndroid Build Coastguard Worker virtual ~RuleBasedBreakIterator(); 343*0e209d39SAndroid Build Coastguard Worker 344*0e209d39SAndroid Build Coastguard Worker /** 345*0e209d39SAndroid Build Coastguard Worker * Assignment operator. Sets this iterator to have the same behavior, 346*0e209d39SAndroid Build Coastguard Worker * and iterate over the same text, as the one passed in. 347*0e209d39SAndroid Build Coastguard Worker * @param that The RuleBasedBreakIterator passed in 348*0e209d39SAndroid Build Coastguard Worker * @return the newly created RuleBasedBreakIterator 349*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 350*0e209d39SAndroid Build Coastguard Worker */ 351*0e209d39SAndroid Build Coastguard Worker RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that); 352*0e209d39SAndroid Build Coastguard Worker 353*0e209d39SAndroid Build Coastguard Worker /** 354*0e209d39SAndroid Build Coastguard Worker * Equality operator. Returns true if both BreakIterators are of the 355*0e209d39SAndroid Build Coastguard Worker * same class, have the same behavior, and iterate over the same text. 
356*0e209d39SAndroid Build Coastguard Worker * @param that The BreakIterator to be compared for equality 357*0e209d39SAndroid Build Coastguard Worker * @return true if both BreakIterators are of the 358*0e209d39SAndroid Build Coastguard Worker * same class, have the same behavior, and iterate over the same text. 359*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 360*0e209d39SAndroid Build Coastguard Worker */ 361*0e209d39SAndroid Build Coastguard Worker virtual bool operator==(const BreakIterator& that) const override; 362*0e209d39SAndroid Build Coastguard Worker 363*0e209d39SAndroid Build Coastguard Worker /** 364*0e209d39SAndroid Build Coastguard Worker * Not-equal operator. If operator== returns true, this returns false, 365*0e209d39SAndroid Build Coastguard Worker * and vice versa. 366*0e209d39SAndroid Build Coastguard Worker * @param that The BreakIterator to be compared for inequality 367*0e209d39SAndroid Build Coastguard Worker * @return true if both BreakIterators are not same. 368*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 369*0e209d39SAndroid Build Coastguard Worker */ 370*0e209d39SAndroid Build Coastguard Worker inline bool operator!=(const BreakIterator& that) const { 371*0e209d39SAndroid Build Coastguard Worker return !operator==(that); 372*0e209d39SAndroid Build Coastguard Worker } 373*0e209d39SAndroid Build Coastguard Worker 374*0e209d39SAndroid Build Coastguard Worker /** 375*0e209d39SAndroid Build Coastguard Worker * Returns a newly-constructed RuleBasedBreakIterator with the same 376*0e209d39SAndroid Build Coastguard Worker * behavior, and iterating over the same text, as this one. 377*0e209d39SAndroid Build Coastguard Worker * Differs from the copy constructor in that it is polymorphic, and 378*0e209d39SAndroid Build Coastguard Worker * will correctly clone (copy) a derived class. 379*0e209d39SAndroid Build Coastguard Worker * clone() is thread safe. 
Multiple threads may simultaneously 380*0e209d39SAndroid Build Coastguard Worker * clone the same source break iterator. 381*0e209d39SAndroid Build Coastguard Worker * @return a newly-constructed RuleBasedBreakIterator 382*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 383*0e209d39SAndroid Build Coastguard Worker */ 384*0e209d39SAndroid Build Coastguard Worker virtual RuleBasedBreakIterator* clone() const override; 385*0e209d39SAndroid Build Coastguard Worker 386*0e209d39SAndroid Build Coastguard Worker /** 387*0e209d39SAndroid Build Coastguard Worker * Compute a hash code for this BreakIterator 388*0e209d39SAndroid Build Coastguard Worker * @return A hash code 389*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 390*0e209d39SAndroid Build Coastguard Worker */ 391*0e209d39SAndroid Build Coastguard Worker virtual int32_t hashCode() const; 392*0e209d39SAndroid Build Coastguard Worker 393*0e209d39SAndroid Build Coastguard Worker /** 394*0e209d39SAndroid Build Coastguard Worker * Returns the description used to create this iterator 395*0e209d39SAndroid Build Coastguard Worker * @return the description used to create this iterator 396*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 397*0e209d39SAndroid Build Coastguard Worker */ 398*0e209d39SAndroid Build Coastguard Worker virtual const UnicodeString& getRules() const; 399*0e209d39SAndroid Build Coastguard Worker 400*0e209d39SAndroid Build Coastguard Worker //======================================================================= 401*0e209d39SAndroid Build Coastguard Worker // BreakIterator overrides 402*0e209d39SAndroid Build Coastguard Worker //======================================================================= 403*0e209d39SAndroid Build Coastguard Worker 404*0e209d39SAndroid Build Coastguard Worker /** 405*0e209d39SAndroid Build Coastguard Worker * <p> 406*0e209d39SAndroid Build Coastguard Worker * Return a CharacterIterator over the text being analyzed. 
407*0e209d39SAndroid Build Coastguard Worker * The returned character iterator is owned by the break iterator, and must 408*0e209d39SAndroid Build Coastguard Worker * not be deleted by the caller. Repeated calls to this function may 409*0e209d39SAndroid Build Coastguard Worker * return the same CharacterIterator. 410*0e209d39SAndroid Build Coastguard Worker * </p> 411*0e209d39SAndroid Build Coastguard Worker * <p> 412*0e209d39SAndroid Build Coastguard Worker * The returned character iterator must not be used concurrently with 413*0e209d39SAndroid Build Coastguard Worker * the break iterator. If concurrent operation is needed, clone the 414*0e209d39SAndroid Build Coastguard Worker * returned character iterator first and operate on the clone. 415*0e209d39SAndroid Build Coastguard Worker * </p> 416*0e209d39SAndroid Build Coastguard Worker * <p> 417*0e209d39SAndroid Build Coastguard Worker * When the break iterator is operating on text supplied via a UText, 418*0e209d39SAndroid Build Coastguard Worker * this function will fail, returning a CharacterIterator containing no text. 419*0e209d39SAndroid Build Coastguard Worker * The function getUText() provides similar functionality, 420*0e209d39SAndroid Build Coastguard Worker * is reliable, and is more efficient. 421*0e209d39SAndroid Build Coastguard Worker * </p> 422*0e209d39SAndroid Build Coastguard Worker * 423*0e209d39SAndroid Build Coastguard Worker * TODO: deprecate this function? 424*0e209d39SAndroid Build Coastguard Worker * 425*0e209d39SAndroid Build Coastguard Worker * @return An iterator over the text being analyzed. 426*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 427*0e209d39SAndroid Build Coastguard Worker */ 428*0e209d39SAndroid Build Coastguard Worker virtual CharacterIterator& getText() const override; 429*0e209d39SAndroid Build Coastguard Worker 430*0e209d39SAndroid Build Coastguard Worker /** 431*0e209d39SAndroid Build Coastguard Worker * Get a UText for the text being analyzed. 
432*0e209d39SAndroid Build Coastguard Worker * The returned UText is a shallow clone of the UText used internally 433*0e209d39SAndroid Build Coastguard Worker * by the break iterator implementation. It can safely be used to 434*0e209d39SAndroid Build Coastguard Worker * access the text without impacting any break iterator operations, 435*0e209d39SAndroid Build Coastguard Worker * but the underlying text itself must not be altered. 436*0e209d39SAndroid Build Coastguard Worker * 437*0e209d39SAndroid Build Coastguard Worker * @param fillIn A UText to be filled in. If nullptr, a new UText will be 438*0e209d39SAndroid Build Coastguard Worker * allocated to hold the result. 439*0e209d39SAndroid Build Coastguard Worker * @param status receives any error codes. 440*0e209d39SAndroid Build Coastguard Worker * @return The current UText for this break iterator. If an input 441*0e209d39SAndroid Build Coastguard Worker * UText was provided, it will always be returned. 442*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.4 443*0e209d39SAndroid Build Coastguard Worker */ 444*0e209d39SAndroid Build Coastguard Worker virtual UText *getUText(UText *fillIn, UErrorCode &status) const override; 445*0e209d39SAndroid Build Coastguard Worker 446*0e209d39SAndroid Build Coastguard Worker /** 447*0e209d39SAndroid Build Coastguard Worker * Set the iterator to analyze a new piece of text. This function resets 448*0e209d39SAndroid Build Coastguard Worker * the current iteration position to the beginning of the text. 449*0e209d39SAndroid Build Coastguard Worker * @param newText An iterator over the text to analyze. The BreakIterator 450*0e209d39SAndroid Build Coastguard Worker * takes ownership of the character iterator. The caller MUST NOT delete it! 
451*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 452*0e209d39SAndroid Build Coastguard Worker */ 453*0e209d39SAndroid Build Coastguard Worker virtual void adoptText(CharacterIterator* newText) override; 454*0e209d39SAndroid Build Coastguard Worker 455*0e209d39SAndroid Build Coastguard Worker /** 456*0e209d39SAndroid Build Coastguard Worker * Set the iterator to analyze a new piece of text. This function resets 457*0e209d39SAndroid Build Coastguard Worker * the current iteration position to the beginning of the text. 458*0e209d39SAndroid Build Coastguard Worker * 459*0e209d39SAndroid Build Coastguard Worker * The BreakIterator will retain a reference to the supplied string. 460*0e209d39SAndroid Build Coastguard Worker * The caller must not modify or delete the text while the BreakIterator 461*0e209d39SAndroid Build Coastguard Worker * retains the reference. 462*0e209d39SAndroid Build Coastguard Worker * 463*0e209d39SAndroid Build Coastguard Worker * @param newText The text to analyze. 464*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 465*0e209d39SAndroid Build Coastguard Worker */ 466*0e209d39SAndroid Build Coastguard Worker virtual void setText(const UnicodeString& newText) override; 467*0e209d39SAndroid Build Coastguard Worker 468*0e209d39SAndroid Build Coastguard Worker /** 469*0e209d39SAndroid Build Coastguard Worker * Reset the break iterator to operate over the text represented by 470*0e209d39SAndroid Build Coastguard Worker * the UText. The iterator position is reset to the start. 471*0e209d39SAndroid Build Coastguard Worker * 472*0e209d39SAndroid Build Coastguard Worker * This function makes a shallow clone of the supplied UText.
This means 473*0e209d39SAndroid Build Coastguard Worker * that the caller is free to immediately close or otherwise reuse the 474*0e209d39SAndroid Build Coastguard Worker * UText that was passed as a parameter, but that the underlying text itself 475*0e209d39SAndroid Build Coastguard Worker * must not be altered while being referenced by the break iterator. 476*0e209d39SAndroid Build Coastguard Worker * 477*0e209d39SAndroid Build Coastguard Worker * @param text The UText used to change the text. 478*0e209d39SAndroid Build Coastguard Worker * @param status Receives any error codes. 479*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.4 480*0e209d39SAndroid Build Coastguard Worker */ 481*0e209d39SAndroid Build Coastguard Worker virtual void setText(UText *text, UErrorCode &status) override; 482*0e209d39SAndroid Build Coastguard Worker 483*0e209d39SAndroid Build Coastguard Worker /** 484*0e209d39SAndroid Build Coastguard Worker * Sets the current iteration position to the beginning of the text, position zero. 485*0e209d39SAndroid Build Coastguard Worker * @return The offset of the beginning of the text, zero. 486*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 487*0e209d39SAndroid Build Coastguard Worker */ 488*0e209d39SAndroid Build Coastguard Worker virtual int32_t first() override; 489*0e209d39SAndroid Build Coastguard Worker 490*0e209d39SAndroid Build Coastguard Worker /** 491*0e209d39SAndroid Build Coastguard Worker * Sets the current iteration position to the end of the text. 492*0e209d39SAndroid Build Coastguard Worker * @return The text's past-the-end offset.
493*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 494*0e209d39SAndroid Build Coastguard Worker */ 495*0e209d39SAndroid Build Coastguard Worker virtual int32_t last() override; 496*0e209d39SAndroid Build Coastguard Worker 497*0e209d39SAndroid Build Coastguard Worker /** 498*0e209d39SAndroid Build Coastguard Worker * Advances the iterator either forward or backward the specified number of steps. 499*0e209d39SAndroid Build Coastguard Worker * Negative values move backward, and positive values move forward. This is 500*0e209d39SAndroid Build Coastguard Worker * equivalent to repeatedly calling next() or previous(). 501*0e209d39SAndroid Build Coastguard Worker * @param n The number of steps to move. The sign indicates the direction 502*0e209d39SAndroid Build Coastguard Worker * (negative is backwards, and positive is forwards). 503*0e209d39SAndroid Build Coastguard Worker * @return The character offset of the boundary position n boundaries away from 504*0e209d39SAndroid Build Coastguard Worker * the current one. 505*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 506*0e209d39SAndroid Build Coastguard Worker */ 507*0e209d39SAndroid Build Coastguard Worker virtual int32_t next(int32_t n) override; 508*0e209d39SAndroid Build Coastguard Worker 509*0e209d39SAndroid Build Coastguard Worker /** 510*0e209d39SAndroid Build Coastguard Worker * Advances the iterator to the next boundary position. 511*0e209d39SAndroid Build Coastguard Worker * @return The position of the first boundary after this one. 512*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 513*0e209d39SAndroid Build Coastguard Worker */ 514*0e209d39SAndroid Build Coastguard Worker virtual int32_t next() override; 515*0e209d39SAndroid Build Coastguard Worker 516*0e209d39SAndroid Build Coastguard Worker /** 517*0e209d39SAndroid Build Coastguard Worker * Moves the iterator backwards, to the last boundary preceding this one. 
518*0e209d39SAndroid Build Coastguard Worker * @return The position of the last boundary position preceding this one. 519*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 520*0e209d39SAndroid Build Coastguard Worker */ 521*0e209d39SAndroid Build Coastguard Worker virtual int32_t previous() override; 522*0e209d39SAndroid Build Coastguard Worker 523*0e209d39SAndroid Build Coastguard Worker /** 524*0e209d39SAndroid Build Coastguard Worker * Sets the iterator to refer to the first boundary position following 525*0e209d39SAndroid Build Coastguard Worker * the specified position. 526*0e209d39SAndroid Build Coastguard Worker * @param offset The position from which to begin searching for a break position. 527*0e209d39SAndroid Build Coastguard Worker * @return The position of the first boundary after the specified offset. 528*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 529*0e209d39SAndroid Build Coastguard Worker */ 530*0e209d39SAndroid Build Coastguard Worker virtual int32_t following(int32_t offset) override; 531*0e209d39SAndroid Build Coastguard Worker 532*0e209d39SAndroid Build Coastguard Worker /** 533*0e209d39SAndroid Build Coastguard Worker * Sets the iterator to refer to the last boundary position before the 534*0e209d39SAndroid Build Coastguard Worker * specified position. 535*0e209d39SAndroid Build Coastguard Worker * @param offset The position to begin searching for a break from. 536*0e209d39SAndroid Build Coastguard Worker * @return The position of the last boundary before the starting position. 537*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 538*0e209d39SAndroid Build Coastguard Worker */ 539*0e209d39SAndroid Build Coastguard Worker virtual int32_t preceding(int32_t offset) override; 540*0e209d39SAndroid Build Coastguard Worker 541*0e209d39SAndroid Build Coastguard Worker /** 542*0e209d39SAndroid Build Coastguard Worker * Returns true if the specified position is a boundary position.
As a side 543*0e209d39SAndroid Build Coastguard Worker * effect, leaves the iterator pointing to the first boundary position at 544*0e209d39SAndroid Build Coastguard Worker * or after "offset". 545*0e209d39SAndroid Build Coastguard Worker * @param offset the offset to check. 546*0e209d39SAndroid Build Coastguard Worker * @return True if "offset" is a boundary position. 547*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 548*0e209d39SAndroid Build Coastguard Worker */ 549*0e209d39SAndroid Build Coastguard Worker virtual UBool isBoundary(int32_t offset) override; 550*0e209d39SAndroid Build Coastguard Worker 551*0e209d39SAndroid Build Coastguard Worker /** 552*0e209d39SAndroid Build Coastguard Worker * Returns the current iteration position. Note that UBRK_DONE is never 553*0e209d39SAndroid Build Coastguard Worker * returned from this function; if iteration has run to the end of a 554*0e209d39SAndroid Build Coastguard Worker * string, current() will return the length of the string while 555*0e209d39SAndroid Build Coastguard Worker * next() will return UBRK_DONE. 556*0e209d39SAndroid Build Coastguard Worker * @return The current iteration position. 557*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 558*0e209d39SAndroid Build Coastguard Worker */ 559*0e209d39SAndroid Build Coastguard Worker virtual int32_t current() const override; 560*0e209d39SAndroid Build Coastguard Worker 561*0e209d39SAndroid Build Coastguard Worker /** 562*0e209d39SAndroid Build Coastguard Worker * Return the status tag from the break rule that determined the boundary at 563*0e209d39SAndroid Build Coastguard Worker * the current iteration position. For break rules that do not specify a 564*0e209d39SAndroid Build Coastguard Worker * status, a default value of 0 is returned.
If more than one break rule 565*0e209d39SAndroid Build Coastguard Worker * would cause a boundary to be located at some position in the text, 566*0e209d39SAndroid Build Coastguard Worker * the numerically largest of the applicable status values is returned. 567*0e209d39SAndroid Build Coastguard Worker * <p> 568*0e209d39SAndroid Build Coastguard Worker * Of the standard types of ICU break iterators, only word break and 569*0e209d39SAndroid Build Coastguard Worker * line break provide status values. The values are defined in 570*0e209d39SAndroid Build Coastguard Worker * the header file ubrk.h. For Word breaks, the status allows distinguishing between words 571*0e209d39SAndroid Build Coastguard Worker * that contain alphabetic letters, "words" that appear to be numbers, 572*0e209d39SAndroid Build Coastguard Worker * punctuation and spaces, words containing ideographic characters, and 573*0e209d39SAndroid Build Coastguard Worker * more. For Line Break, the status distinguishes between hard (mandatory) breaks 574*0e209d39SAndroid Build Coastguard Worker * and soft (potential) break positions. 575*0e209d39SAndroid Build Coastguard Worker * <p> 576*0e209d39SAndroid Build Coastguard Worker * <code>getRuleStatus()</code> can be called after obtaining a boundary 577*0e209d39SAndroid Build Coastguard Worker * position from <code>next()</code>, <code>previous()</code>, or 578*0e209d39SAndroid Build Coastguard Worker * any other break iterator function that returns a boundary position. 579*0e209d39SAndroid Build Coastguard Worker * <p> 580*0e209d39SAndroid Build Coastguard Worker * Note that <code>getRuleStatus()</code> returns the value corresponding to 581*0e209d39SAndroid Build Coastguard Worker * <code>current()</code> index even after <code>next()</code> has returned DONE.
582*0e209d39SAndroid Build Coastguard Worker * <p> 583*0e209d39SAndroid Build Coastguard Worker * When creating custom break rules, one is free to define whatever 584*0e209d39SAndroid Build Coastguard Worker * status values may be convenient for the application. 585*0e209d39SAndroid Build Coastguard Worker * <p> 586*0e209d39SAndroid Build Coastguard Worker * @return the status from the break rule that determined the boundary 587*0e209d39SAndroid Build Coastguard Worker * at the current iteration position. 588*0e209d39SAndroid Build Coastguard Worker * 589*0e209d39SAndroid Build Coastguard Worker * @see UWordBreak 590*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.2 591*0e209d39SAndroid Build Coastguard Worker */ 592*0e209d39SAndroid Build Coastguard Worker virtual int32_t getRuleStatus() const override; 593*0e209d39SAndroid Build Coastguard Worker 594*0e209d39SAndroid Build Coastguard Worker /** 595*0e209d39SAndroid Build Coastguard Worker * Get the status (tag) values from the break rule(s) that determined the boundary 596*0e209d39SAndroid Build Coastguard Worker * at the current iteration position. 597*0e209d39SAndroid Build Coastguard Worker * <p> 598*0e209d39SAndroid Build Coastguard Worker * The returned status value(s) are stored into an array provided by the caller. 599*0e209d39SAndroid Build Coastguard Worker * The values are stored in sorted (ascending) order. 600*0e209d39SAndroid Build Coastguard Worker * If the capacity of the output array is insufficient to hold the data, 601*0e209d39SAndroid Build Coastguard Worker * the output will be truncated to the available length, and a 602*0e209d39SAndroid Build Coastguard Worker * U_BUFFER_OVERFLOW_ERROR will be signaled. 603*0e209d39SAndroid Build Coastguard Worker * 604*0e209d39SAndroid Build Coastguard Worker * @param fillInVec an array to be filled in with the status values. 605*0e209d39SAndroid Build Coastguard Worker * @param capacity the length of the supplied vector.
A length of zero causes 606*0e209d39SAndroid Build Coastguard Worker * the function to return the number of status values, in the 607*0e209d39SAndroid Build Coastguard Worker * normal way, without attempting to store any values. 608*0e209d39SAndroid Build Coastguard Worker * @param status receives error codes. 609*0e209d39SAndroid Build Coastguard Worker * @return The number of rule status values from the rules that determined 610*0e209d39SAndroid Build Coastguard Worker * the boundary at the current iteration position. 611*0e209d39SAndroid Build Coastguard Worker * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value 612*0e209d39SAndroid Build Coastguard Worker * is the total number of status values that were available, 613*0e209d39SAndroid Build Coastguard Worker * not the reduced number that were actually returned. 614*0e209d39SAndroid Build Coastguard Worker * @see getRuleStatus 615*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.0 616*0e209d39SAndroid Build Coastguard Worker */ 617*0e209d39SAndroid Build Coastguard Worker virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) override; 618*0e209d39SAndroid Build Coastguard Worker 619*0e209d39SAndroid Build Coastguard Worker /** 620*0e209d39SAndroid Build Coastguard Worker * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. 621*0e209d39SAndroid Build Coastguard Worker * This method is to implement a simple version of RTTI, since not all 622*0e209d39SAndroid Build Coastguard Worker * C++ compilers support genuine RTTI. Polymorphic operator==() and 623*0e209d39SAndroid Build Coastguard Worker * clone() methods call this method. 624*0e209d39SAndroid Build Coastguard Worker * 625*0e209d39SAndroid Build Coastguard Worker * @return The class ID for this object. All objects of a 626*0e209d39SAndroid Build Coastguard Worker * given class have the same class ID. 
Objects of 627*0e209d39SAndroid Build Coastguard Worker * other classes have different class IDs. 628*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 629*0e209d39SAndroid Build Coastguard Worker */ 630*0e209d39SAndroid Build Coastguard Worker virtual UClassID getDynamicClassID() const override; 631*0e209d39SAndroid Build Coastguard Worker 632*0e209d39SAndroid Build Coastguard Worker /** 633*0e209d39SAndroid Build Coastguard Worker * Returns the class ID for this class. This is useful only for 634*0e209d39SAndroid Build Coastguard Worker * comparing to a return value from getDynamicClassID(). For example: 635*0e209d39SAndroid Build Coastguard Worker * 636*0e209d39SAndroid Build Coastguard Worker * Base* polymorphic_pointer = createPolymorphicObject(); 637*0e209d39SAndroid Build Coastguard Worker * if (polymorphic_pointer->getDynamicClassID() == 638*0e209d39SAndroid Build Coastguard Worker * Derived::getStaticClassID()) ... 639*0e209d39SAndroid Build Coastguard Worker * 640*0e209d39SAndroid Build Coastguard Worker * @return The class ID for all objects of this class. 641*0e209d39SAndroid Build Coastguard Worker * @stable ICU 2.0 642*0e209d39SAndroid Build Coastguard Worker */ 643*0e209d39SAndroid Build Coastguard Worker static UClassID U_EXPORT2 getStaticClassID(); 644*0e209d39SAndroid Build Coastguard Worker 645*0e209d39SAndroid Build Coastguard Worker #ifndef U_FORCE_HIDE_DEPRECATED_API 646*0e209d39SAndroid Build Coastguard Worker /** 647*0e209d39SAndroid Build Coastguard Worker * Deprecated functionality. Use clone() instead. 648*0e209d39SAndroid Build Coastguard Worker * 649*0e209d39SAndroid Build Coastguard Worker * Create a clone (copy) of this break iterator in memory provided 650*0e209d39SAndroid Build Coastguard Worker * by the caller. The idea is to increase performance by avoiding 651*0e209d39SAndroid Build Coastguard Worker * a storage allocation. Use of this function is NOT RECOMMENDED. 
652*0e209d39SAndroid Build Coastguard Worker * Performance gains are minimal, and correct buffer management is 653*0e209d39SAndroid Build Coastguard Worker * tricky. Use clone() instead. 654*0e209d39SAndroid Build Coastguard Worker * 655*0e209d39SAndroid Build Coastguard Worker * @param stackBuffer The pointer to the memory into which the cloned object 656*0e209d39SAndroid Build Coastguard Worker * should be placed. If nullptr, allocate heap memory 657*0e209d39SAndroid Build Coastguard Worker * for the cloned object. 658*0e209d39SAndroid Build Coastguard Worker * @param BufferSize The size of the buffer. If zero, return the required 659*0e209d39SAndroid Build Coastguard Worker * buffer size, but do not clone the object. If the 660*0e209d39SAndroid Build Coastguard Worker * size was too small (but not zero), allocate heap 661*0e209d39SAndroid Build Coastguard Worker * storage for the cloned object. 662*0e209d39SAndroid Build Coastguard Worker * 663*0e209d39SAndroid Build Coastguard Worker * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be 664*0e209d39SAndroid Build Coastguard Worker * returned if the provided buffer was too small, and 665*0e209d39SAndroid Build Coastguard Worker * the clone was therefore put on the heap. 666*0e209d39SAndroid Build Coastguard Worker * 667*0e209d39SAndroid Build Coastguard Worker * @return Pointer to the clone object. This may differ from the stackBuffer 668*0e209d39SAndroid Build Coastguard Worker * address if the byte alignment of the stack buffer was not suitable 669*0e209d39SAndroid Build Coastguard Worker * or if the stackBuffer was too small to hold the clone. 670*0e209d39SAndroid Build Coastguard Worker * @deprecated ICU 52. Use clone() instead. 
671*0e209d39SAndroid Build Coastguard Worker */ 672*0e209d39SAndroid Build Coastguard Worker virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer, 673*0e209d39SAndroid Build Coastguard Worker int32_t &BufferSize, 674*0e209d39SAndroid Build Coastguard Worker UErrorCode &status) override; 675*0e209d39SAndroid Build Coastguard Worker #endif // U_FORCE_HIDE_DEPRECATED_API 676*0e209d39SAndroid Build Coastguard Worker 677*0e209d39SAndroid Build Coastguard Worker /** 678*0e209d39SAndroid Build Coastguard Worker * Return the binary form of compiled break rules, 679*0e209d39SAndroid Build Coastguard Worker * which can then be used to create a new break iterator at some 680*0e209d39SAndroid Build Coastguard Worker * time in the future. Creating a break iterator from pre-compiled rules 681*0e209d39SAndroid Build Coastguard Worker * is much faster than building one from the source form of the 682*0e209d39SAndroid Build Coastguard Worker * break rules. 683*0e209d39SAndroid Build Coastguard Worker * 684*0e209d39SAndroid Build Coastguard Worker * The binary data can only be used with the same version of ICU 685*0e209d39SAndroid Build Coastguard Worker * and on the same platform type (processor endian-ness). 686*0e209d39SAndroid Build Coastguard Worker * 687*0e209d39SAndroid Build Coastguard Worker * @param length Returns the length of the binary data. (Out parameter.) 688*0e209d39SAndroid Build Coastguard Worker * 689*0e209d39SAndroid Build Coastguard Worker * @return A pointer to the binary (compiled) rule data. The storage 690*0e209d39SAndroid Build Coastguard Worker * belongs to the RuleBasedBreakIterator object, not the 691*0e209d39SAndroid Build Coastguard Worker * caller, and must not be modified or deleted.
692*0e209d39SAndroid Build Coastguard Worker * @stable ICU 4.8 693*0e209d39SAndroid Build Coastguard Worker */ 694*0e209d39SAndroid Build Coastguard Worker virtual const uint8_t *getBinaryRules(uint32_t &length); 695*0e209d39SAndroid Build Coastguard Worker 696*0e209d39SAndroid Build Coastguard Worker /** 697*0e209d39SAndroid Build Coastguard Worker * Set the subject text string upon which the break iterator is operating 698*0e209d39SAndroid Build Coastguard Worker * without changing any other aspect of the matching state. 699*0e209d39SAndroid Build Coastguard Worker * The new and previous text strings must have the same content. 700*0e209d39SAndroid Build Coastguard Worker * 701*0e209d39SAndroid Build Coastguard Worker * This function is intended for use in environments where ICU is operating on 702*0e209d39SAndroid Build Coastguard Worker * strings that may move around in memory. It provides a mechanism for notifying 703*0e209d39SAndroid Build Coastguard Worker * ICU that the string has been relocated, and providing a new UText to access the 704*0e209d39SAndroid Build Coastguard Worker * string in its new position. 705*0e209d39SAndroid Build Coastguard Worker * 706*0e209d39SAndroid Build Coastguard Worker * Note that the break iterator implementation never copies the underlying text 707*0e209d39SAndroid Build Coastguard Worker * of a string being processed, but always operates directly on the original text 708*0e209d39SAndroid Build Coastguard Worker * provided by the user. Refreshing simply drops the references to the old text 709*0e209d39SAndroid Build Coastguard Worker * and replaces them with references to the new. 710*0e209d39SAndroid Build Coastguard Worker * 711*0e209d39SAndroid Build Coastguard Worker * Caution: this function is normally used only by very specialized, 712*0e209d39SAndroid Build Coastguard Worker * system-level code.
One example use case is with garbage collection that moves 713*0e209d39SAndroid Build Coastguard Worker * the text in memory. 714*0e209d39SAndroid Build Coastguard Worker * 715*0e209d39SAndroid Build Coastguard Worker * @param input The new (moved) text string. 716*0e209d39SAndroid Build Coastguard Worker * @param status Receives errors detected by this function. 717*0e209d39SAndroid Build Coastguard Worker * @return *this 718*0e209d39SAndroid Build Coastguard Worker * 719*0e209d39SAndroid Build Coastguard Worker * @stable ICU 49 720*0e209d39SAndroid Build Coastguard Worker */ 721*0e209d39SAndroid Build Coastguard Worker virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status) override; 722*0e209d39SAndroid Build Coastguard Worker 723*0e209d39SAndroid Build Coastguard Worker 724*0e209d39SAndroid Build Coastguard Worker private: 725*0e209d39SAndroid Build Coastguard Worker //======================================================================= 726*0e209d39SAndroid Build Coastguard Worker // implementation 727*0e209d39SAndroid Build Coastguard Worker //======================================================================= 728*0e209d39SAndroid Build Coastguard Worker /** 729*0e209d39SAndroid Build Coastguard Worker * Iterate backwards from an arbitrary position in the input text using the 730*0e209d39SAndroid Build Coastguard Worker * synthesized Safe Reverse rules. 731*0e209d39SAndroid Build Coastguard Worker * This locates a "Safe Position" from which the forward break rules 732*0e209d39SAndroid Build Coastguard Worker * will operate correctly. A Safe Position is not necessarily a boundary itself. 733*0e209d39SAndroid Build Coastguard Worker * 734*0e209d39SAndroid Build Coastguard Worker * @param fromPosition the position in the input text to begin the iteration. 
735*0e209d39SAndroid Build Coastguard Worker * @internal (private) 736*0e209d39SAndroid Build Coastguard Worker */ 737*0e209d39SAndroid Build Coastguard Worker int32_t handleSafePrevious(int32_t fromPosition); 738*0e209d39SAndroid Build Coastguard Worker 739*0e209d39SAndroid Build Coastguard Worker /** 740*0e209d39SAndroid Build Coastguard Worker * Find a rule-based boundary by running the state machine. 741*0e209d39SAndroid Build Coastguard Worker * Input 742*0e209d39SAndroid Build Coastguard Worker * fPosition, the position in the text to begin from. 743*0e209d39SAndroid Build Coastguard Worker * Output 744*0e209d39SAndroid Build Coastguard Worker * fPosition: the boundary following the starting position. 745*0e209d39SAndroid Build Coastguard Worker * fDictionaryCharCount the number of dictionary characters encountered. 746*0e209d39SAndroid Build Coastguard Worker * If > 0, the segment will be further subdivided 747*0e209d39SAndroid Build Coastguard Worker * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 748*0e209d39SAndroid Build Coastguard Worker * 749*0e209d39SAndroid Build Coastguard Worker * @internal (private) 750*0e209d39SAndroid Build Coastguard Worker */ 751*0e209d39SAndroid Build Coastguard Worker int32_t handleNext(); 752*0e209d39SAndroid Build Coastguard Worker 753*0e209d39SAndroid Build Coastguard Worker /* 754*0e209d39SAndroid Build Coastguard Worker * Templatized version of handleNext() and handleSafePrevious(). 755*0e209d39SAndroid Build Coastguard Worker * 756*0e209d39SAndroid Build Coastguard Worker * There will be exactly four instantiations, two each for 8 and 16 bit tables, 757*0e209d39SAndroid Build Coastguard Worker * two each for 8 and 16 bit trie. 758*0e209d39SAndroid Build Coastguard Worker * Having separate instantiations for the table types keeps conditional tests of 759*0e209d39SAndroid Build Coastguard Worker * the table type out of the inner loops, at the expense of replicated code. 
760*0e209d39SAndroid Build Coastguard Worker * 761*0e209d39SAndroid Build Coastguard Worker * The template parameter for the Trie access function is a value, not a type. 762*0e209d39SAndroid Build Coastguard Worker * Doing it this way, the compiler will inline the Trie function in the 763*0e209d39SAndroid Build Coastguard Worker * expanded functions. (Both the 8 and 16 bit access functions have the same type 764*0e209d39SAndroid Build Coastguard Worker * signature) 765*0e209d39SAndroid Build Coastguard Worker */ 766*0e209d39SAndroid Build Coastguard Worker 767*0e209d39SAndroid Build Coastguard Worker typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32); 768*0e209d39SAndroid Build Coastguard Worker 769*0e209d39SAndroid Build Coastguard Worker template<typename RowType, PTrieFunc trieFunc> 770*0e209d39SAndroid Build Coastguard Worker int32_t handleSafePrevious(int32_t fromPosition); 771*0e209d39SAndroid Build Coastguard Worker 772*0e209d39SAndroid Build Coastguard Worker template<typename RowType, PTrieFunc trieFunc> 773*0e209d39SAndroid Build Coastguard Worker int32_t handleNext(); 774*0e209d39SAndroid Build Coastguard Worker 775*0e209d39SAndroid Build Coastguard Worker 776*0e209d39SAndroid Build Coastguard Worker /** 777*0e209d39SAndroid Build Coastguard Worker * This function returns the appropriate LanguageBreakEngine for a 778*0e209d39SAndroid Build Coastguard Worker * given character c. 779*0e209d39SAndroid Build Coastguard Worker * @param c A character in the dictionary set 780*0e209d39SAndroid Build Coastguard Worker * @param locale The locale. 
781*0e209d39SAndroid Build Coastguard Worker * @internal (private) 782*0e209d39SAndroid Build Coastguard Worker */ 783*0e209d39SAndroid Build Coastguard Worker const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale); 784*0e209d39SAndroid Build Coastguard Worker 785*0e209d39SAndroid Build Coastguard Worker public: 786*0e209d39SAndroid Build Coastguard Worker #ifndef U_HIDE_INTERNAL_API 787*0e209d39SAndroid Build Coastguard Worker /** 788*0e209d39SAndroid Build Coastguard Worker * Debugging function only. 789*0e209d39SAndroid Build Coastguard Worker * @internal 790*0e209d39SAndroid Build Coastguard Worker */ 791*0e209d39SAndroid Build Coastguard Worker void dumpCache(); 792*0e209d39SAndroid Build Coastguard Worker 793*0e209d39SAndroid Build Coastguard Worker /** 794*0e209d39SAndroid Build Coastguard Worker * Debugging function only. 795*0e209d39SAndroid Build Coastguard Worker * @internal 796*0e209d39SAndroid Build Coastguard Worker */ 797*0e209d39SAndroid Build Coastguard Worker void dumpTables(); 798*0e209d39SAndroid Build Coastguard Worker #endif /* U_HIDE_INTERNAL_API */ 799*0e209d39SAndroid Build Coastguard Worker 800*0e209d39SAndroid Build Coastguard Worker #ifndef U_HIDE_INTERNAL_API 801*0e209d39SAndroid Build Coastguard Worker /** 802*0e209d39SAndroid Build Coastguard Worker * Register a new external break engine. The external break engine will be adopted. 803*0e209d39SAndroid Build Coastguard Worker * Because ICU may choose to cache break engines internally, this must 804*0e209d39SAndroid Build Coastguard Worker * be called at application startup, prior to any calls to 805*0e209d39SAndroid Build Coastguard Worker * object methods of RuleBasedBreakIterator to avoid undefined behavior.
806*0e209d39SAndroid Build Coastguard Worker * @param toAdopt the ExternalBreakEngine instance to be adopted 807*0e209d39SAndroid Build Coastguard Worker * @param status the in/out status code, no special meanings are assigned 808*0e209d39SAndroid Build Coastguard Worker * @internal ICU 74 technology preview 809*0e209d39SAndroid Build Coastguard Worker */ 810*0e209d39SAndroid Build Coastguard Worker static void U_EXPORT2 registerExternalBreakEngine( 811*0e209d39SAndroid Build Coastguard Worker ExternalBreakEngine* toAdopt, UErrorCode& status); 812*0e209d39SAndroid Build Coastguard Worker #endif /* U_HIDE_INTERNAL_API */ 813*0e209d39SAndroid Build Coastguard Worker 814*0e209d39SAndroid Build Coastguard Worker }; 815*0e209d39SAndroid Build Coastguard Worker 816*0e209d39SAndroid Build Coastguard Worker 817*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 818*0e209d39SAndroid Build Coastguard Worker 819*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 820*0e209d39SAndroid Build Coastguard Worker 821*0e209d39SAndroid Build Coastguard Worker #endif /* U_SHOW_CPLUSPLUS_API */ 822*0e209d39SAndroid Build Coastguard Worker 823*0e209d39SAndroid Build Coastguard Worker #endif 824