xref: /aosp_15_r20/external/icu/libicu/cts_headers/lstmbe.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2021 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #ifndef LSTMBE_H
5 #define LSTMBE_H
6 
7 #include "unicode/utypes.h"
8 
9 #if !UCONFIG_NO_BREAK_ITERATION
10 
11 #include "unicode/uniset.h"
12 #include "unicode/ures.h"
13 #include "unicode/utext.h"
14 #include "unicode/utypes.h"
15 
16 #include "brkeng.h"
17 #include "dictbe.h"
18 #include "uvectr32.h"
19 
20 U_NAMESPACE_BEGIN
21 
22 class Vectorizer;
23 struct LSTMData;
24 
25 /*******************************************************************
26  * LSTMBreakEngine
27  */
28 
29 /**
30  * <p>LSTMBreakEngine is a kind of DictionaryBreakEngine that uses an
31  * LSTM to determine language-specific breaks.</p>
32  *
33  * <p>After it is constructed an LSTMBreakEngine may be shared between
34  * threads without synchronization.</p>
35  */
36 class LSTMBreakEngine : public DictionaryBreakEngine {
37 public:
38     /**
39      * <p>Constructor.</p>
     *
     * @param data Pointer to the LSTM model data for this engine. Ownership is
     *             not stated in this header; presumably kept in fData --
     *             confirm against the implementation.
     * @param set A UnicodeSet; presumably the set of characters this engine
     *            handles -- confirm against the implementation.
     * @param status Information on any errors encountered.
40      */
41     LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status);
42 
43     /**
44      * <p>Virtual destructor.</p>
45      */
46     virtual ~LSTMBreakEngine();
47 
    /**
     * <p>Name accessor.</p>
     *
     * @return A pointer to a char16_t string. NOTE(review): presumably the name
     *         of the underlying LSTM model, and not marked override here --
     *         confirm both against the implementation and the base class.
     */
48     virtual const char16_t* name() const;
49 
50 protected:
51     /**
52      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
53      *
54      * @param text A UText representing the text
55      * @param rangeStart The start of the range of dictionary characters
56      * @param rangeEnd The end of the range of dictionary characters
57      * @param foundBreaks Output vector (UVector32, passed by reference) for the break positions
     * @param isPhraseBreaking UBool flag; presumably selects phrase-based
     *                         breaking -- its effect is not visible in this header
58      * @param status Information on any errors encountered.
59      * @return The number of breaks found
60      */
61      virtual int32_t divideUpDictionaryRange(UText *text,
62                                              int32_t rangeStart,
63                                              int32_t rangeEnd,
64                                              UVector32 &foundBreaks,
65                                              UBool isPhraseBreaking,
66                                              UErrorCode& status) const override;
67 private:
    // Model data and vectorizer for this engine. Both are held as pointers to
    // const; who owns and frees them is not visible in this header.
68     const LSTMData* fData;
69     const Vectorizer* fVectorizer;
70 };
71 
/**
 * Factory function, exported with C linkage conventions (U_CAPI), that returns
 * a break engine as a pointer to const LanguageBreakEngine.
 *
 * @param script Script code; presumably selects the script the engine serves.
 * @param data LSTM model data for the engine. NOTE(review): ownership of
 *             data and of the returned engine is not stated here -- confirm
 *             against the implementation.
 * @param status Information on any errors encountered.
 * @return The created engine; the value returned on failure is not visible
 *         in this header.
 */
72 U_CAPI const LanguageBreakEngine* U_EXPORT2 CreateLSTMBreakEngine(
73     UScriptCode script, const LSTMData* data, UErrorCode& status);
74 
/**
 * Returns LSTM model data built from a resource bundle.
 *
 * @param rb Resource bundle handle; presumably holds the serialized model.
 *           NOTE(review): whether ownership of rb is transferred is not
 *           visible here -- confirm against the implementation.
 * @param status Information on any errors encountered.
 * @return Pointer to const LSTMData; presumably released with DeleteLSTMData.
 */
75 U_CAPI const LSTMData* U_EXPORT2 CreateLSTMData(
76     UResourceBundle* rb, UErrorCode& status);
77 
/**
 * Returns LSTM model data selected by script code.
 *
 * @param script Script code for which model data is requested.
 * @param status Information on any errors encountered.
 * @return Pointer to const LSTMData. NOTE(review): the result when no model
 *         exists for the script is not visible in this header -- confirm
 *         against the implementation.
 */
78 U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript(
79     UScriptCode script, UErrorCode& status);
80 
/**
 * Disposes of LSTM model data (judging by the name; the implementation is not
 * visible here).
 *
 * @param data The model data to dispose of; presumably a pointer obtained
 *             from CreateLSTMData or CreateLSTMDataForScript.
 */
81 U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data);
/**
 * Returns a char16_t string associated with the given model data; presumably
 * the model's name.
 *
 * @param data The model data to query.
 * @return Pointer to a char16_t string. NOTE(review): its lifetime relative
 *         to data is not stated here -- confirm against the implementation.
 */
82 U_CAPI const char16_t* U_EXPORT2 LSTMDataName(const LSTMData* data);
83 
84 U_NAMESPACE_END
85 
86 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
87 
88 #endif  /* LSTMBE_H */
89