1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2006-2014, International Business Machines Corporation * 6 * and others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 10 #ifndef DICTBE_H 11 #define DICTBE_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uniset.h" 15 #include "unicode/utext.h" 16 17 #include "brkeng.h" 18 #include "hash.h" 19 #include "mlbe.h" 20 #include "uvectr32.h" 21 22 U_NAMESPACE_BEGIN 23 24 class DictionaryMatcher; 25 class MlBreakEngine; 26 class Normalizer2; 27 28 /******************************************************************* 29 * DictionaryBreakEngine 30 */ 31 32 /** 33 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 34 * dictionary to determine language-specific breaks.</p> 35 * 36 * <p>After it is constructed a DictionaryBreakEngine may be shared between 37 * threads without synchronization.</p> 38 */ 39 class DictionaryBreakEngine : public LanguageBreakEngine { 40 private: 41 /** 42 * The set of characters handled by this engine 43 * @internal 44 */ 45 46 UnicodeSet fSet; 47 48 public: 49 50 /** 51 * <p>Constructor </p> 52 */ 53 DictionaryBreakEngine(); 54 55 /** 56 * <p>Virtual destructor.</p> 57 */ 58 virtual ~DictionaryBreakEngine(); 59 60 /** 61 * <p>Indicate whether this engine handles a particular character for 62 * a particular kind of break.</p> 63 * 64 * @param c A character which begins a run that the engine might handle 65 * @param locale The locale. 66 * @return true if this engine handles the particular character and break 67 * type. 68 */ 69 virtual UBool handles(UChar32 c, const char* locale) const override; 70 71 /** 72 * <p>Find any breaks within a run in the supplied text.</p> 73 * 74 * @param text A UText representing the text. The iterator is left at 75 * the end of the run of characters which the engine is capable of handling 76 * that starts from the first character in the range. 77 * @param startPos The start of the run within the supplied text. 78 * @param endPos The end of the run within the supplied text. 79 * @param foundBreaks vector of int32_t to receive the break positions 80 * @param status Information on any errors encountered. 81 * @return The number of breaks found. 82 */ 83 virtual int32_t findBreaks( UText *text, 84 int32_t startPos, 85 int32_t endPos, 86 UVector32 &foundBreaks, 87 UBool isPhraseBreaking, 88 UErrorCode& status ) const override; 89 90 protected: 91 92 /** 93 * <p>Set the character set handled by this engine.</p> 94 * 95 * @param set A UnicodeSet of the set of characters handled by the engine 96 */ 97 virtual void setCharacters( const UnicodeSet &set ); 98 99 /** 100 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 101 * 102 * @param text A UText representing the text 103 * @param rangeStart The start of the range of dictionary characters 104 * @param rangeEnd The end of the range of dictionary characters 105 * @param foundBreaks Output of C array of int32_t break positions, or 0 106 * @param status Information on any errors encountered. 107 * @return The number of breaks found 108 */ 109 virtual int32_t divideUpDictionaryRange( UText *text, 110 int32_t rangeStart, 111 int32_t rangeEnd, 112 UVector32 &foundBreaks, 113 UBool isPhraseBreaking, 114 UErrorCode& status) const = 0; 115 116 }; 117 118 /******************************************************************* 119 * ThaiBreakEngine 120 */ 121 122 /** 123 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 124 * dictionary and heuristics to determine Thai-specific breaks.</p> 125 * 126 * <p>After it is constructed a ThaiBreakEngine may be shared between 127 * threads without synchronization.</p> 128 */ 129 class ThaiBreakEngine : public DictionaryBreakEngine { 130 private: 131 /** 132 * The set of characters handled by this engine 133 * @internal 134 */ 135 136 UnicodeSet fEndWordSet; 137 UnicodeSet fBeginWordSet; 138 UnicodeSet fSuffixSet; 139 UnicodeSet fMarkSet; 140 DictionaryMatcher *fDictionary; 141 142 public: 143 144 /** 145 * <p>Default constructor.</p> 146 * 147 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 148 * engine is deleted. 149 */ 150 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 151 152 /** 153 * <p>Virtual destructor.</p> 154 */ 155 virtual ~ThaiBreakEngine(); 156 157 protected: 158 /** 159 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 160 * 161 * @param text A UText representing the text 162 * @param rangeStart The start of the range of dictionary characters 163 * @param rangeEnd The end of the range of dictionary characters 164 * @param foundBreaks Output of C array of int32_t break positions, or 0 165 * @param status Information on any errors encountered. 166 * @return The number of breaks found 167 */ 168 virtual int32_t divideUpDictionaryRange( UText *text, 169 int32_t rangeStart, 170 int32_t rangeEnd, 171 UVector32 &foundBreaks, 172 UBool isPhraseBreaking, 173 UErrorCode& status) const override; 174 175 }; 176 177 /******************************************************************* 178 * LaoBreakEngine 179 */ 180 181 /** 182 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 183 * dictionary and heuristics to determine Lao-specific breaks.</p> 184 * 185 * <p>After it is constructed a LaoBreakEngine may be shared between 186 * threads without synchronization.</p> 187 */ 188 class LaoBreakEngine : public DictionaryBreakEngine { 189 private: 190 /** 191 * The set of characters handled by this engine 192 * @internal 193 */ 194 195 UnicodeSet fEndWordSet; 196 UnicodeSet fBeginWordSet; 197 UnicodeSet fMarkSet; 198 DictionaryMatcher *fDictionary; 199 200 public: 201 202 /** 203 * <p>Default constructor.</p> 204 * 205 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 206 * engine is deleted. 207 */ 208 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 209 210 /** 211 * <p>Virtual destructor.</p> 212 */ 213 virtual ~LaoBreakEngine(); 214 215 protected: 216 /** 217 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 218 * 219 * @param text A UText representing the text 220 * @param rangeStart The start of the range of dictionary characters 221 * @param rangeEnd The end of the range of dictionary characters 222 * @param foundBreaks Output of C array of int32_t break positions, or 0 223 * @param status Information on any errors encountered. 224 * @return The number of breaks found 225 */ 226 virtual int32_t divideUpDictionaryRange( UText *text, 227 int32_t rangeStart, 228 int32_t rangeEnd, 229 UVector32 &foundBreaks, 230 UBool isPhraseBreaking, 231 UErrorCode& status) const override; 232 233 }; 234 235 /******************************************************************* 236 * BurmeseBreakEngine 237 */ 238 239 /** 240 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a 241 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> 242 * 243 * <p>After it is constructed a BurmeseBreakEngine may be shared between 244 * threads without synchronization.</p> 245 */ 246 class BurmeseBreakEngine : public DictionaryBreakEngine { 247 private: 248 /** 249 * The set of characters handled by this engine 250 * @internal 251 */ 252 253 UnicodeSet fEndWordSet; 254 UnicodeSet fBeginWordSet; 255 UnicodeSet fMarkSet; 256 DictionaryMatcher *fDictionary; 257 258 public: 259 260 /** 261 * <p>Default constructor.</p> 262 * 263 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 264 * engine is deleted. 265 */ 266 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 267 268 /** 269 * <p>Virtual destructor.</p> 270 */ 271 virtual ~BurmeseBreakEngine(); 272 273 protected: 274 /** 275 * <p>Divide up a range of known dictionary characters.</p> 276 * 277 * @param text A UText representing the text 278 * @param rangeStart The start of the range of dictionary characters 279 * @param rangeEnd The end of the range of dictionary characters 280 * @param foundBreaks Output of C array of int32_t break positions, or 0 281 * @param status Information on any errors encountered. 282 * @return The number of breaks found 283 */ 284 virtual int32_t divideUpDictionaryRange( UText *text, 285 int32_t rangeStart, 286 int32_t rangeEnd, 287 UVector32 &foundBreaks, 288 UBool isPhraseBreaking, 289 UErrorCode& status) const override; 290 291 }; 292 293 /******************************************************************* 294 * KhmerBreakEngine 295 */ 296 297 /** 298 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 299 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 300 * 301 * <p>After it is constructed a KhmerBreakEngine may be shared between 302 * threads without synchronization.</p> 303 */ 304 class KhmerBreakEngine : public DictionaryBreakEngine { 305 private: 306 /** 307 * The set of characters handled by this engine 308 * @internal 309 */ 310 311 UnicodeSet fEndWordSet; 312 UnicodeSet fBeginWordSet; 313 UnicodeSet fMarkSet; 314 DictionaryMatcher *fDictionary; 315 316 public: 317 318 /** 319 * <p>Default constructor.</p> 320 * 321 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 322 * engine is deleted. 323 */ 324 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 325 326 /** 327 * <p>Virtual destructor.</p> 328 */ 329 virtual ~KhmerBreakEngine(); 330 331 protected: 332 /** 333 * <p>Divide up a range of known dictionary characters.</p> 334 * 335 * @param text A UText representing the text 336 * @param rangeStart The start of the range of dictionary characters 337 * @param rangeEnd The end of the range of dictionary characters 338 * @param foundBreaks Output of C array of int32_t break positions, or 0 339 * @param status Information on any errors encountered. 340 * @return The number of breaks found 341 */ 342 virtual int32_t divideUpDictionaryRange( UText *text, 343 int32_t rangeStart, 344 int32_t rangeEnd, 345 UVector32 &foundBreaks, 346 UBool isPhraseBreaking, 347 UErrorCode& status) const override; 348 349 }; 350 351 #if !UCONFIG_NO_NORMALIZATION 352 353 /******************************************************************* 354 * CjkBreakEngine 355 */ 356 357 //indicates language/script that the CjkBreakEngine will handle 358 enum LanguageType { 359 kKorean, 360 kChineseJapanese 361 }; 362 363 /** 364 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 365 * dictionary with costs associated with each word and 366 * Viterbi decoding to determine CJK-specific breaks.</p> 367 */ 368 class CjkBreakEngine : public DictionaryBreakEngine { 369 protected: 370 /** 371 * The set of characters handled by this engine 372 * @internal 373 */ 374 UnicodeSet fHangulWordSet; 375 UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; 376 UnicodeSet fClosePunctuationSet; 377 378 DictionaryMatcher *fDictionary; 379 const Normalizer2 *nfkcNorm2; 380 MlBreakEngine *fMlBreakEngine; 381 bool isCj; 382 383 private: 384 // Load Japanese extensions. 385 void loadJapaneseExtensions(UErrorCode& error); 386 // Load Japanese Hiragana. 387 void loadHiragana(UErrorCode& error); 388 // Initialize fSkipSet by loading Japanese Hiragana and extensions. 389 void initJapanesePhraseParameter(UErrorCode& error); 390 391 Hashtable fSkipSet; 392 393 public: 394 395 /** 396 * <p>Default constructor.</p> 397 * 398 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 399 * engine is deleted. The DictionaryMatcher must contain costs for each word 400 * in order for the dictionary to work properly. 401 */ 402 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 403 404 /** 405 * <p>Virtual destructor.</p> 406 */ 407 virtual ~CjkBreakEngine(); 408 409 protected: 410 /** 411 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 412 * 413 * @param text A UText representing the text 414 * @param rangeStart The start of the range of dictionary characters 415 * @param rangeEnd The end of the range of dictionary characters 416 * @param foundBreaks Output of C array of int32_t break positions, or 0 417 * @param status Information on any errors encountered. 418 * @return The number of breaks found 419 */ 420 virtual int32_t divideUpDictionaryRange( UText *text, 421 int32_t rangeStart, 422 int32_t rangeEnd, 423 UVector32 &foundBreaks, 424 UBool isPhraseBreaking, 425 UErrorCode& status) const override; 426 427 }; 428 429 #endif 430 431 U_NAMESPACE_END 432 433 /* DICTBE_H */ 434 #endif 435