xref: /aosp_15_r20/external/icu/libicu/cts_headers/dictbe.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2014, International Business Machines Corporation   *
6  * and others. All Rights Reserved.                                            *
7  *******************************************************************************
8  */
9 
10 #ifndef DICTBE_H
11 #define DICTBE_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utext.h"
16 
17 #include "brkeng.h"
18 #include "hash.h"
19 #include "mlbe.h"
20 #include "uvectr32.h"
21 
22 U_NAMESPACE_BEGIN
23 
24 class DictionaryMatcher;
25 class MlBreakEngine;
26 class Normalizer2;
27 
28 /*******************************************************************
29  * DictionaryBreakEngine
30  */
31 
32 /**
33  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
34  * dictionary to determine language-specific breaks.</p>
35  *
36  * <p>After it is constructed a DictionaryBreakEngine may be shared between
37  * threads without synchronization.</p>
38  */
39 class DictionaryBreakEngine : public LanguageBreakEngine {
40  private:
41     /**
42      * The set of characters handled by this engine
43      * @internal
44      */
45 
46   UnicodeSet    fSet;
47 
48  public:
49 
50   /**
51    * <p>Constructor </p>
52    */
53   DictionaryBreakEngine();
54 
55   /**
56    * <p>Virtual destructor.</p>
57    */
58   virtual ~DictionaryBreakEngine();
59 
60   /**
61    * <p>Indicate whether this engine handles a particular character for
62    * a particular kind of break.</p>
63    *
64    * @param c A character which begins a run that the engine might handle
65    * @param locale The locale.
66    * @return true if this engine handles the particular character and break
67    * type.
68    */
69   virtual UBool handles(UChar32 c, const char* locale) const override;
70 
71   /**
72    * <p>Find any breaks within a run in the supplied text.</p>
73    *
74    * @param text A UText representing the text. The iterator is left at
75    * the end of the run of characters which the engine is capable of handling
76    * that starts from the first character in the range.
77    * @param startPos The start of the run within the supplied text.
78    * @param endPos The end of the run within the supplied text.
79    * @param foundBreaks vector of int32_t to receive the break positions
80    * @param status Information on any errors encountered.
81    * @return The number of breaks found.
82    */
83   virtual int32_t findBreaks( UText *text,
84                               int32_t startPos,
85                               int32_t endPos,
86                               UVector32 &foundBreaks,
87                               UBool isPhraseBreaking,
88                               UErrorCode& status ) const override;
89 
90  protected:
91 
92  /**
93   * <p>Set the character set handled by this engine.</p>
94   *
95   * @param set A UnicodeSet of the set of characters handled by the engine
96   */
97   virtual void setCharacters( const UnicodeSet &set );
98 
99  /**
100   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
101   *
102   * @param text A UText representing the text
103   * @param rangeStart The start of the range of dictionary characters
104   * @param rangeEnd The end of the range of dictionary characters
105   * @param foundBreaks Output of C array of int32_t break positions, or 0
106   * @param status Information on any errors encountered.
107   * @return The number of breaks found
108   */
109   virtual int32_t divideUpDictionaryRange( UText *text,
110                                            int32_t rangeStart,
111                                            int32_t rangeEnd,
112                                            UVector32 &foundBreaks,
113                                            UBool isPhraseBreaking,
114                                            UErrorCode& status) const = 0;
115 
116 };
117 
118 /*******************************************************************
119  * ThaiBreakEngine
120  */
121 
122 /**
123  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
124  * dictionary and heuristics to determine Thai-specific breaks.</p>
125  *
126  * <p>After it is constructed a ThaiBreakEngine may be shared between
127  * threads without synchronization.</p>
128  */
129 class ThaiBreakEngine : public DictionaryBreakEngine {
130  private:
131     /**
132      * The set of characters handled by this engine
133      * @internal
134      */
135 
136   UnicodeSet                fEndWordSet;
137   UnicodeSet                fBeginWordSet;
138   UnicodeSet                fSuffixSet;
139   UnicodeSet                fMarkSet;
140   DictionaryMatcher  *fDictionary;
141 
142  public:
143 
144   /**
145    * <p>Default constructor.</p>
146    *
147    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
148    * engine is deleted.
149    */
150   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
151 
152   /**
153    * <p>Virtual destructor.</p>
154    */
155   virtual ~ThaiBreakEngine();
156 
157  protected:
158  /**
159   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
160   *
161   * @param text A UText representing the text
162   * @param rangeStart The start of the range of dictionary characters
163   * @param rangeEnd The end of the range of dictionary characters
164   * @param foundBreaks Output of C array of int32_t break positions, or 0
165   * @param status Information on any errors encountered.
166   * @return The number of breaks found
167   */
168   virtual int32_t divideUpDictionaryRange( UText *text,
169                                            int32_t rangeStart,
170                                            int32_t rangeEnd,
171                                            UVector32 &foundBreaks,
172                                            UBool isPhraseBreaking,
173                                            UErrorCode& status) const override;
174 
175 };
176 
177 /*******************************************************************
178  * LaoBreakEngine
179  */
180 
181 /**
182  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
183  * dictionary and heuristics to determine Lao-specific breaks.</p>
184  *
185  * <p>After it is constructed a LaoBreakEngine may be shared between
186  * threads without synchronization.</p>
187  */
188 class LaoBreakEngine : public DictionaryBreakEngine {
189  private:
190     /**
191      * The set of characters handled by this engine
192      * @internal
193      */
194 
195   UnicodeSet                fEndWordSet;
196   UnicodeSet                fBeginWordSet;
197   UnicodeSet                fMarkSet;
198   DictionaryMatcher  *fDictionary;
199 
200  public:
201 
202   /**
203    * <p>Default constructor.</p>
204    *
205    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
206    * engine is deleted.
207    */
208   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
209 
210   /**
211    * <p>Virtual destructor.</p>
212    */
213   virtual ~LaoBreakEngine();
214 
215  protected:
216  /**
217   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
218   *
219   * @param text A UText representing the text
220   * @param rangeStart The start of the range of dictionary characters
221   * @param rangeEnd The end of the range of dictionary characters
222   * @param foundBreaks Output of C array of int32_t break positions, or 0
223   * @param status Information on any errors encountered.
224   * @return The number of breaks found
225   */
226   virtual int32_t divideUpDictionaryRange( UText *text,
227                                            int32_t rangeStart,
228                                            int32_t rangeEnd,
229                                            UVector32 &foundBreaks,
230                                            UBool isPhraseBreaking,
231                                            UErrorCode& status) const override;
232 
233 };
234 
235 /*******************************************************************
236  * BurmeseBreakEngine
237  */
238 
239 /**
240  * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
241  * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
242  *
243  * <p>After it is constructed a BurmeseBreakEngine may be shared between
244  * threads without synchronization.</p>
245  */
246 class BurmeseBreakEngine : public DictionaryBreakEngine {
247  private:
248     /**
249      * The set of characters handled by this engine
250      * @internal
251      */
252 
253   UnicodeSet                fEndWordSet;
254   UnicodeSet                fBeginWordSet;
255   UnicodeSet                fMarkSet;
256   DictionaryMatcher  *fDictionary;
257 
258  public:
259 
260   /**
261    * <p>Default constructor.</p>
262    *
263    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
264    * engine is deleted.
265    */
266   BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
267 
268   /**
269    * <p>Virtual destructor.</p>
270    */
271   virtual ~BurmeseBreakEngine();
272 
273  protected:
274  /**
275   * <p>Divide up a range of known dictionary characters.</p>
276   *
277   * @param text A UText representing the text
278   * @param rangeStart The start of the range of dictionary characters
279   * @param rangeEnd The end of the range of dictionary characters
280   * @param foundBreaks Output of C array of int32_t break positions, or 0
281   * @param status Information on any errors encountered.
282   * @return The number of breaks found
283   */
284   virtual int32_t divideUpDictionaryRange( UText *text,
285                                            int32_t rangeStart,
286                                            int32_t rangeEnd,
287                                            UVector32 &foundBreaks,
288                                            UBool isPhraseBreaking,
289                                            UErrorCode& status) const override;
290 
291 };
292 
293 /*******************************************************************
294  * KhmerBreakEngine
295  */
296 
297 /**
298  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
299  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
300  *
301  * <p>After it is constructed a KhmerBreakEngine may be shared between
302  * threads without synchronization.</p>
303  */
304 class KhmerBreakEngine : public DictionaryBreakEngine {
305  private:
306     /**
307      * The set of characters handled by this engine
308      * @internal
309      */
310 
311   UnicodeSet                fEndWordSet;
312   UnicodeSet                fBeginWordSet;
313   UnicodeSet                fMarkSet;
314   DictionaryMatcher  *fDictionary;
315 
316  public:
317 
318   /**
319    * <p>Default constructor.</p>
320    *
321    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
322    * engine is deleted.
323    */
324   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
325 
326   /**
327    * <p>Virtual destructor.</p>
328    */
329   virtual ~KhmerBreakEngine();
330 
331  protected:
332  /**
333   * <p>Divide up a range of known dictionary characters.</p>
334   *
335   * @param text A UText representing the text
336   * @param rangeStart The start of the range of dictionary characters
337   * @param rangeEnd The end of the range of dictionary characters
338   * @param foundBreaks Output of C array of int32_t break positions, or 0
339   * @param status Information on any errors encountered.
340   * @return The number of breaks found
341   */
342   virtual int32_t divideUpDictionaryRange( UText *text,
343                                            int32_t rangeStart,
344                                            int32_t rangeEnd,
345                                            UVector32 &foundBreaks,
346                                            UBool isPhraseBreaking,
347                                            UErrorCode& status) const override;
348 
349 };
350 
351 #if !UCONFIG_NO_NORMALIZATION
352 
353 /*******************************************************************
354  * CjkBreakEngine
355  */
356 
/** Selects the language/script that a CjkBreakEngine will handle. */
enum LanguageType {
    kKorean = 0,           // Korean
    kChineseJapanese = 1   // Chinese and Japanese
};
362 
363 /**
364  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
365  * dictionary with costs associated with each word and
366  * Viterbi decoding to determine CJK-specific breaks.</p>
367  */
368 class CjkBreakEngine : public DictionaryBreakEngine {
369  protected:
370     /**
371      * The set of characters handled by this engine
372      * @internal
373      */
374   UnicodeSet                fHangulWordSet;
375   UnicodeSet                fDigitOrOpenPunctuationOrAlphabetSet;
376   UnicodeSet                fClosePunctuationSet;
377 
378   DictionaryMatcher        *fDictionary;
379   const Normalizer2        *nfkcNorm2;
380   MlBreakEngine            *fMlBreakEngine;
381   bool                      isCj;
382 
383  private:
384   // Load Japanese extensions.
385   void loadJapaneseExtensions(UErrorCode& error);
386   // Load Japanese Hiragana.
387   void loadHiragana(UErrorCode& error);
388   // Initialize fSkipSet by loading Japanese Hiragana and extensions.
389   void initJapanesePhraseParameter(UErrorCode& error);
390 
391   Hashtable fSkipSet;
392 
393  public:
394 
395     /**
396      * <p>Default constructor.</p>
397      *
398      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
399      * engine is deleted. The DictionaryMatcher must contain costs for each word
400      * in order for the dictionary to work properly.
401      */
402   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
403 
404     /**
405      * <p>Virtual destructor.</p>
406      */
407   virtual ~CjkBreakEngine();
408 
409  protected:
410     /**
411      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
412      *
413      * @param text A UText representing the text
414      * @param rangeStart The start of the range of dictionary characters
415      * @param rangeEnd The end of the range of dictionary characters
416      * @param foundBreaks Output of C array of int32_t break positions, or 0
417      * @param status Information on any errors encountered.
418      * @return The number of breaks found
419      */
420   virtual int32_t divideUpDictionaryRange( UText *text,
421           int32_t rangeStart,
422           int32_t rangeEnd,
423           UVector32 &foundBreaks,
424           UBool isPhraseBreaking,
425           UErrorCode& status) const override;
426 
427 };
428 
429 #endif
430 
431 U_NAMESPACE_END
432 
433     /* DICTBE_H */
434 #endif
435