xref: /aosp_15_r20/external/icu/libicu/cts_headers/brkeng.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  ************************************************************************************
5  * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
6  * All Rights Reserved.                                                             *
7  ************************************************************************************
8  */
9 
10 #ifndef BRKENG_H
11 #define BRKENG_H
12 
13 #include "unicode/umisc.h"
14 #include "unicode/utypes.h"
15 #include "unicode/uobject.h"
16 #include "unicode/utext.h"
17 #include "unicode/uscript.h"
18 
19 U_NAMESPACE_BEGIN
20 
21 class UnicodeSet;
22 class UStack;
23 class UVector32;
24 class DictionaryMatcher;
25 class ExternalBreakEngine;
26 
27 /*******************************************************************
28  * LanguageBreakEngine
29  */
30 
31 /**
32  * <p>LanguageBreakEngines implement language-specific knowledge for
33  * finding text boundaries within a run of characters belonging to a
34  * specific set. The boundaries will be of a specific kind, e.g. word,
35  * line, etc.</p>
36  *
37  * <p>LanguageBreakEngines should normally be implemented so as to
38  * be shared between threads without locking.</p>
39  */
40 class LanguageBreakEngine : public UObject {
41  public:
42   // Interface only: handles() and findBreaks() below are pure virtual.
43   /**
44    * <p>Default constructor.</p>
45    *
46    */
47   LanguageBreakEngine();
48 
49   /**
50    * <p>Virtual destructor.</p>
51    */
52   virtual ~LanguageBreakEngine();
53 
54  /**
55   * <p>Indicate whether this engine handles a particular character for
56   * a particular kind of break.</p>
57   *
58   * @param c A character which begins a run that the engine might handle
59   * @param locale The locale.
60   * @return true if this engine handles the particular character and break
61   * type.
62   */
63   virtual UBool handles(UChar32 c, const char* locale) const = 0;
64 
65  /**
66   * <p>Find any breaks within a run in the supplied text.</p>
67   *
68   * @param text A UText representing the text. The iterator is left at the
69   * end of the run of characters which the engine is capable of handling.
70   * @param startPos The start of the run within the supplied text.
71   * @param endPos The end of the run within the supplied text.
72   * @param foundBreaks A Vector of int32_t to receive the breaks.
73   * @param isPhraseBreaking Phrase-breaking flag; exact semantics are not
74   * visible in this header (TODO confirm against the implementations).
75   * @param status Information on any errors encountered.
76   * @return The number of breaks found.
77   */
77   virtual int32_t findBreaks( UText *text,
78                               int32_t startPos,
79                               int32_t endPos,
80                               UVector32 &foundBreaks,
81                               UBool isPhraseBreaking,
82                               UErrorCode &status) const = 0;
83 
84 };
85 
86 /*******************************************************************
87  * BreakEngineWrapper
88  */
89 
90 /**
91  * <p>BreakEngineWrapper implements LanguageBreakEngine as
92  * a thin wrapper that delegates the task to an ExternalBreakEngine.
93  * </p>
94  */
95 class BreakEngineWrapper : public  LanguageBreakEngine {
96  public:
97   /** Wraps `engine`. NOTE(review): presumably adopted into `delegate` -- confirm ownership in the .cpp. */
98   BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status);
99   /** Virtual destructor. */
100   virtual ~BreakEngineWrapper();
101   /** Overrides LanguageBreakEngine::handles(). */
102   virtual UBool handles(UChar32 c, const char* locale) const override;
103   /** Overrides LanguageBreakEngine::findBreaks(). */
104   virtual int32_t findBreaks( UText *text,
105                               int32_t startPos,
106                               int32_t endPos,
107                               UVector32 &foundBreaks,
108                               UBool isPhraseBreaking,
109                               UErrorCode &status) const override;
110 
111  private:
112   LocalPointer<ExternalBreakEngine> delegate;  // The wrapped ExternalBreakEngine.
113 };
114 
115 /*******************************************************************
116  * LanguageBreakFactory
117  */
118 
119 /**
120  * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
121  * that can determine breaks for characters in a specific set, if
122  * such an object can be found.</p>
123  *
124  * <p>If a LanguageBreakFactory is to be shared between threads,
125  * appropriate synchronization must be used; there is none internal
126  * to the factory.</p>
127  *
128  * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
129  * normally be shared between threads without synchronization, unless
130  * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
131  *
132  * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
133  * it returns when it itself is deleted, unless the specific subclass of
134  * LanguageBreakFactory indicates otherwise. Naturally, the factory should
135  * not be deleted until the LanguageBreakEngines it has returned are no
136  * longer needed.</p>
137  */
138 class LanguageBreakFactory : public UMemory {
139  public:
140   // Interface only: getEngineFor() below is pure virtual.
141   /**
142    * <p>Default constructor.</p>
143    *
144    */
145   LanguageBreakFactory();
146 
147   /**
148    * <p>Virtual destructor.</p>
149    */
150   virtual ~LanguageBreakFactory();
151 
152  /**
153   * <p>Find and return a LanguageBreakEngine that can find the desired
154   * kind of break for the set of characters to which the supplied
155   * character belongs. It is up to the set of available engines to
156   * determine what the sets of characters are.</p>
157   *
158   * @param c A character that begins a run for which a LanguageBreakEngine is
159   * sought.
160   * @param locale The locale.
161   * @return A LanguageBreakEngine with the desired characteristics, or nullptr.
162   */
163   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0;
164 
165 };
166 
167 /*******************************************************************
168  * UnhandledEngine
169  */
170 
171 /**
172  * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
173  * handles characters that no other LanguageBreakEngine is available to
174  * handle. It is told the character and the type of break; at its
175  * discretion it may handle more than the specified character (e.g.,
176  * the entire script to which that character belongs).</p>
177  *
178  * <p>UnhandledEngines may not be shared between threads without
179  * external synchronization.</p>
180  */
181 
182 class UnhandledEngine : public LanguageBreakEngine {
183  private:
184 
185     /**
186      * The set of characters handled, held as a UnicodeSet.
187      * @internal
188      */
189 
190   UnicodeSet    *fHandled;
191 
192  public:
193 
194   /**
195    * <p>Constructor.</p>
196    * @param status The error code.
197    */
198   UnhandledEngine(UErrorCode &status);
199 
200   /**
201    * <p>Virtual destructor.</p>
202    */
203   virtual ~UnhandledEngine();
204 
205  /**
206   * <p>Indicate whether this engine handles a particular character for
207   * a particular kind of break.</p>
208   *
209   * @param c A character which begins a run that the engine might handle
210   * @param locale The locale.
211   * @return true if this engine handles the particular character and break
212   * type.
213   */
214   virtual UBool handles(UChar32 c, const char* locale) const override;
215 
216  /**
217   * <p>Find any breaks within a run in the supplied text.</p>
218   *
219   * @param text A UText representing the text. The iterator is left at the
220   * end of the run of characters which the engine is capable of handling.
221   * @param startPos The start of the run within the supplied text.
222   * @param endPos The end of the run within the supplied text.
223   * @param foundBreaks A UVector32 to receive the breaks found, if any.
224   * @param isPhraseBreaking Phrase-breaking flag (semantics: TODO confirm).
225   * @param status Information on any errors encountered.
226   * @return The number of breaks found.
227   */
228   virtual int32_t findBreaks( UText *text,
229                               int32_t startPos,
230                               int32_t endPos,
231                               UVector32 &foundBreaks,
232                               UBool isPhraseBreaking,
233                               UErrorCode &status) const override;
234 
235  /**
236   * <p>Tell the engine to handle a particular character.</p>
237   *
238   * @param c A character which the engine should handle
239   */
240   virtual void handleCharacter(UChar32 c);
241 
242 };
243 
244 /*******************************************************************
245  * ICULanguageBreakFactory
246  */
247 
248 /**
249  * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
250  * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
251  * data in the ICU data file.</p>
252  */
253 class ICULanguageBreakFactory : public LanguageBreakFactory {
254  private:
255 
256     /**
257      * The stack of break engines created by this factory.
258      * @internal
259      */
260 
261   UStack    *fEngines;
262 
263  public:
264 
265   /**
266    * <p>Standard constructor.</p>
267    * @param status The error code.
268    */
269   ICULanguageBreakFactory(UErrorCode &status);
270 
271   /**
272    * <p>Virtual destructor.</p>
273    */
274   virtual ~ICULanguageBreakFactory();
275 
276  /**
277   * <p>Find and return a LanguageBreakEngine that can find the desired
278   * kind of break for the set of characters to which the supplied
279   * character belongs. It is up to the set of available engines to
280   * determine what the sets of characters are.</p>
281   *
282   * @param c A character that begins a run for which a LanguageBreakEngine is
283   * sought.
284   * @param locale The locale.
285   * @return A LanguageBreakEngine with the desired characteristics, or nullptr.
286   */
287   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override;
288 
289   /**
290    * Add and adopt the engine. Nothing is returned; the method is void.
291    * @param engine The ExternalBreakEngine to be added and adopted. The caller
292    *     passes ownership and should not release the memory after this.
293    * @param status the error code.
294    */
295   virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status);
296 
297 protected:
298  /**
299   * <p>Create a LanguageBreakEngine for the set of characters to which
300   * the supplied character belongs.</p>
301   *
302   * @param c A character that begins a run for which a LanguageBreakEngine is
303   * sought.
304   * @param locale The locale.
305   * @return A LanguageBreakEngine with the desired characteristics, or nullptr.
306   */
307   virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale);
308 
309   /**
310    * <p>Create a DictionaryMatcher for the specified script.</p>
311    * @param script An ISO 15924 script code that identifies the dictionary to be
312    * created.
313    * @return A DictionaryMatcher with the desired characteristics, or nullptr.
314    */
315   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
316 
317  private:
318   void ensureEngines(UErrorCode& status);  // NOTE(review): presumably sets up fEngines on demand -- confirm in the .cpp
319 };
320 
321 U_NAMESPACE_END
322 
323     /* BRKENG_H */
324 #endif
325