1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ************************************************************************************ 5 * Copyright (C) 2006-2012, International Business Machines Corporation and others. * 6 * All Rights Reserved. * 7 ************************************************************************************ 8 */ 9 10 #ifndef BRKENG_H 11 #define BRKENG_H 12 13 #include "unicode/umisc.h" 14 #include "unicode/utypes.h" 15 #include "unicode/uobject.h" 16 #include "unicode/utext.h" 17 #include "unicode/uscript.h" 18 19 U_NAMESPACE_BEGIN 20 21 class UnicodeSet; 22 class UStack; 23 class UVector32; 24 class DictionaryMatcher; 25 class ExternalBreakEngine; 26 27 /******************************************************************* 28 * LanguageBreakEngine 29 */ 30 31 /** 32 * <p>LanguageBreakEngines implement language-specific knowledge for 33 * finding text boundaries within a run of characters belonging to a 34 * specific set. The boundaries will be of a specific kind, e.g. word, 35 * line, etc.</p> 36 * 37 * <p>LanguageBreakEngines should normally be implemented so as to 38 * be shared between threads without locking.</p> 39 */ 40 class LanguageBreakEngine : public UObject { 41 public: 42 43 /** 44 * <p>Default constructor.</p> 45 * 46 */ 47 LanguageBreakEngine(); 48 49 /** 50 * <p>Virtual destructor.</p> 51 */ 52 virtual ~LanguageBreakEngine(); 53 54 /** 55 * <p>Indicate whether this engine handles a particular character for 56 * a particular kind of break.</p> 57 * 58 * @param c A character which begins a run that the engine might handle 59 * @param locale The locale. 60 * @return true if this engine handles the particular character and break 61 * type. 62 */ 63 virtual UBool handles(UChar32 c, const char* locale) const = 0; 64 65 /** 66 * <p>Find any breaks within a run in the supplied text.</p> 67 * 68 * @param text A UText representing the text. The 69 * iterator is left at the end of the run of characters which the engine 70 * is capable of handling. 71 * @param startPos The start of the run within the supplied text. 72 * @param endPos The end of the run within the supplied text. 73 * @param foundBreaks A Vector of int32_t to receive the breaks. 74 * @param status Information on any errors encountered. 75 * @return The number of breaks found. 76 */ 77 virtual int32_t findBreaks( UText *text, 78 int32_t startPos, 79 int32_t endPos, 80 UVector32 &foundBreaks, 81 UBool isPhraseBreaking, 82 UErrorCode &status) const = 0; 83 84 }; 85 86 /******************************************************************* 87 * BreakEngineWrapper 88 */ 89 90 /** 91 * <p>BreakEngineWrapper implement LanguageBreakEngine by 92 * a thin wrapper that delegate the task to ExternalBreakEngine 93 * </p> 94 */ 95 class BreakEngineWrapper : public LanguageBreakEngine { 96 public: 97 98 BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status); 99 100 virtual ~BreakEngineWrapper(); 101 102 virtual UBool handles(UChar32 c, const char* locale) const override; 103 104 virtual int32_t findBreaks( UText *text, 105 int32_t startPos, 106 int32_t endPos, 107 UVector32 &foundBreaks, 108 UBool isPhraseBreaking, 109 UErrorCode &status) const override; 110 111 private: 112 LocalPointer<ExternalBreakEngine> delegate; 113 }; 114 115 /******************************************************************* 116 * LanguageBreakFactory 117 */ 118 119 /** 120 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine 121 * that can determine breaks for characters in a specific set, if 122 * such an object can be found.</p> 123 * 124 * <p>If a LanguageBreakFactory is to be shared between threads, 125 * appropriate synchronization must be used; there is none internal 126 * to the factory.</p> 127 * 128 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can 129 * normally be shared between threads without synchronization, unless 130 * the specific subclass of LanguageBreakFactory indicates otherwise.</p> 131 * 132 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine 133 * it returns when it itself is deleted, unless the specific subclass of 134 * LanguageBreakFactory indicates otherwise. Naturally, the factory should 135 * not be deleted until the LanguageBreakEngines it has returned are no 136 * longer needed.</p> 137 */ 138 class LanguageBreakFactory : public UMemory { 139 public: 140 141 /** 142 * <p>Default constructor.</p> 143 * 144 */ 145 LanguageBreakFactory(); 146 147 /** 148 * <p>Virtual destructor.</p> 149 */ 150 virtual ~LanguageBreakFactory(); 151 152 /** 153 * <p>Find and return a LanguageBreakEngine that can find the desired 154 * kind of break for the set of characters to which the supplied 155 * character belongs. It is up to the set of available engines to 156 * determine what the sets of characters are.</p> 157 * 158 * @param c A character that begins a run for which a LanguageBreakEngine is 159 * sought. 160 * @param locale The locale. 161 * @return A LanguageBreakEngine with the desired characteristics, or 0. 162 */ 163 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0; 164 165 }; 166 167 /******************************************************************* 168 * UnhandledEngine 169 */ 170 171 /** 172 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that 173 * handles characters that no other LanguageBreakEngine is available to 174 * handle. It is told the character and the type of break; at its 175 * discretion it may handle more than the specified character (e.g., 176 * the entire script to which that character belongs.</p> 177 * 178 * <p>UnhandledEngines may not be shared between threads without 179 * external synchronization.</p> 180 */ 181 182 class UnhandledEngine : public LanguageBreakEngine { 183 private: 184 185 /** 186 * The sets of characters handled. 187 * @internal 188 */ 189 190 UnicodeSet *fHandled; 191 192 public: 193 194 /** 195 * <p>Default constructor.</p> 196 * 197 */ 198 UnhandledEngine(UErrorCode &status); 199 200 /** 201 * <p>Virtual destructor.</p> 202 */ 203 virtual ~UnhandledEngine(); 204 205 /** 206 * <p>Indicate whether this engine handles a particular character for 207 * a particular kind of break.</p> 208 * 209 * @param c A character which begins a run that the engine might handle 210 * @param locale The locale. 211 * @return true if this engine handles the particular character and break 212 * type. 213 */ 214 virtual UBool handles(UChar32 c, const char* locale) const override; 215 216 /** 217 * <p>Find any breaks within a run in the supplied text.</p> 218 * 219 * @param text A UText representing the text (TODO: UText). The 220 * iterator is left at the end of the run of characters which the engine 221 * is capable of handling. 222 * @param startPos The start of the run within the supplied text. 223 * @param endPos The end of the run within the supplied text. 224 * @param foundBreaks An allocated C array of the breaks found, if any 225 * @param status Information on any errors encountered. 226 * @return The number of breaks found. 227 */ 228 virtual int32_t findBreaks( UText *text, 229 int32_t startPos, 230 int32_t endPos, 231 UVector32 &foundBreaks, 232 UBool isPhraseBreaking, 233 UErrorCode &status) const override; 234 235 /** 236 * <p>Tell the engine to handle a particular character and break type.</p> 237 * 238 * @param c A character which the engine should handle 239 */ 240 virtual void handleCharacter(UChar32 c); 241 242 }; 243 244 /******************************************************************* 245 * ICULanguageBreakFactory 246 */ 247 248 /** 249 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for 250 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary 251 * data in the ICU data file.</p> 252 */ 253 class ICULanguageBreakFactory : public LanguageBreakFactory { 254 private: 255 256 /** 257 * The stack of break engines created by this factory 258 * @internal 259 */ 260 261 UStack *fEngines; 262 263 public: 264 265 /** 266 * <p>Standard constructor.</p> 267 * 268 */ 269 ICULanguageBreakFactory(UErrorCode &status); 270 271 /** 272 * <p>Virtual destructor.</p> 273 */ 274 virtual ~ICULanguageBreakFactory(); 275 276 /** 277 * <p>Find and return a LanguageBreakEngine that can find the desired 278 * kind of break for the set of characters to which the supplied 279 * character belongs. It is up to the set of available engines to 280 * determine what the sets of characters are.</p> 281 * 282 * @param c A character that begins a run for which a LanguageBreakEngine is 283 * sought. 284 * @param locale The locale. 285 * @return A LanguageBreakEngine with the desired characteristics, or 0. 286 */ 287 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override; 288 289 /** 290 * Add and adopt the engine and return an URegistryKey. 291 * @param engine The ExternalBreakEngine to be added and adopt. The caller 292 * pass the ownership and should not release the memory after this. 293 * @param status the error code. 294 */ 295 virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status); 296 297 protected: 298 /** 299 * <p>Create a LanguageBreakEngine for the set of characters to which 300 * the supplied character belongs, for the specified break type.</p> 301 * 302 * @param c A character that begins a run for which a LanguageBreakEngine is 303 * sought. 304 * @param locale The locale. 305 * @return A LanguageBreakEngine with the desired characteristics, or 0. 306 */ 307 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale); 308 309 /** 310 * <p>Create a DictionaryMatcher for the specified script and break type.</p> 311 * @param script An ISO 15924 script code that identifies the dictionary to be 312 * created. 313 * @return A DictionaryMatcher with the desired characteristics, or nullptr. 314 */ 315 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script); 316 317 private: 318 void ensureEngines(UErrorCode& status); 319 }; 320 321 U_NAMESPACE_END 322 323 /* BRKENG_H */ 324 #endif 325