xref: /aosp_15_r20/external/icu/libicu/cts_headers/mlbe.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2022 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker 
4*0e209d39SAndroid Build Coastguard Worker #ifndef MLBREAKENGINE_H
5*0e209d39SAndroid Build Coastguard Worker #define MLBREAKENGINE_H
6*0e209d39SAndroid Build Coastguard Worker 
7*0e209d39SAndroid Build Coastguard Worker #include "hash.h"
8*0e209d39SAndroid Build Coastguard Worker #include "unicode/resbund.h"
9*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h"
10*0e209d39SAndroid Build Coastguard Worker #include "unicode/utext.h"
11*0e209d39SAndroid Build Coastguard Worker #include "uvectr32.h"
12*0e209d39SAndroid Build Coastguard Worker 
13*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
14*0e209d39SAndroid Build Coastguard Worker 
15*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_BREAK_ITERATION
16*0e209d39SAndroid Build Coastguard Worker 
17*0e209d39SAndroid Build Coastguard Worker /**
18*0e209d39SAndroid Build Coastguard Worker  * A machine learning break engine for the phrase breaking in Japanese.
19*0e209d39SAndroid Build Coastguard Worker  */
20*0e209d39SAndroid Build Coastguard Worker class MlBreakEngine : public UMemory {
21*0e209d39SAndroid Build Coastguard Worker    public:
22*0e209d39SAndroid Build Coastguard Worker     /**
23*0e209d39SAndroid Build Coastguard Worker      * Constructor.
24*0e209d39SAndroid Build Coastguard Worker      *
25*0e209d39SAndroid Build Coastguard Worker      * @param digitOrOpenPunctuationOrAlphabetSet An UnicodeSet with the digit, open punctuation and
26*0e209d39SAndroid Build Coastguard Worker      * alphabet.
27*0e209d39SAndroid Build Coastguard Worker      * @param closePunctuationSet An UnicodeSet with close punctuation.
28*0e209d39SAndroid Build Coastguard Worker      * @param status Information on any errors encountered.
29*0e209d39SAndroid Build Coastguard Worker      */
30*0e209d39SAndroid Build Coastguard Worker     MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
31*0e209d39SAndroid Build Coastguard Worker                   const UnicodeSet &closePunctuationSet, UErrorCode &status);
32*0e209d39SAndroid Build Coastguard Worker 
33*0e209d39SAndroid Build Coastguard Worker     /**
34*0e209d39SAndroid Build Coastguard Worker      * Virtual destructor.
35*0e209d39SAndroid Build Coastguard Worker      */
36*0e209d39SAndroid Build Coastguard Worker     virtual ~MlBreakEngine();
37*0e209d39SAndroid Build Coastguard Worker 
38*0e209d39SAndroid Build Coastguard Worker    public:
39*0e209d39SAndroid Build Coastguard Worker     /**
40*0e209d39SAndroid Build Coastguard Worker      * Divide up a range of characters handled by this break engine.
41*0e209d39SAndroid Build Coastguard Worker      *
42*0e209d39SAndroid Build Coastguard Worker      * @param inText A UText representing the text
43*0e209d39SAndroid Build Coastguard Worker      * @param rangeStart The start of the range of the characters
44*0e209d39SAndroid Build Coastguard Worker      * @param rangeEnd The end of the range of the characters
45*0e209d39SAndroid Build Coastguard Worker      * @param foundBreaks Output of C array of int32_t break positions, or 0
46*0e209d39SAndroid Build Coastguard Worker      * @param inString The normalized string of text ranging from rangeStart to rangeEnd
47*0e209d39SAndroid Build Coastguard Worker      * @param inputMap The vector storing the native index of inText
48*0e209d39SAndroid Build Coastguard Worker      * @param status Information on any errors encountered.
49*0e209d39SAndroid Build Coastguard Worker      * @return The number of breaks found
50*0e209d39SAndroid Build Coastguard Worker      */
51*0e209d39SAndroid Build Coastguard Worker     int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
52*0e209d39SAndroid Build Coastguard Worker                           UVector32 &foundBreaks, const UnicodeString &inString,
53*0e209d39SAndroid Build Coastguard Worker                           const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
54*0e209d39SAndroid Build Coastguard Worker 
55*0e209d39SAndroid Build Coastguard Worker    private:
56*0e209d39SAndroid Build Coastguard Worker     /**
57*0e209d39SAndroid Build Coastguard Worker      * Load the machine learning's model file.
58*0e209d39SAndroid Build Coastguard Worker      *
59*0e209d39SAndroid Build Coastguard Worker      * @param error Information on any errors encountered.
60*0e209d39SAndroid Build Coastguard Worker      */
61*0e209d39SAndroid Build Coastguard Worker     void loadMLModel(UErrorCode &error);
62*0e209d39SAndroid Build Coastguard Worker 
63*0e209d39SAndroid Build Coastguard Worker     /**
64*0e209d39SAndroid Build Coastguard Worker      * In the machine learning's model file, specify the name of the key and value to load the
65*0e209d39SAndroid Build Coastguard Worker      * corresponding feature and its score.
66*0e209d39SAndroid Build Coastguard Worker      *
67*0e209d39SAndroid Build Coastguard Worker      * @param rb A ResouceBundle corresponding to the model file.
68*0e209d39SAndroid Build Coastguard Worker      * @param keyName The kay name in the model file.
69*0e209d39SAndroid Build Coastguard Worker      * @param valueName The value name in the model file.
70*0e209d39SAndroid Build Coastguard Worker      * @param model A hashtable to store the pairs of the feature and its score.
71*0e209d39SAndroid Build Coastguard Worker      * @param error Information on any errors encountered.
72*0e209d39SAndroid Build Coastguard Worker      */
73*0e209d39SAndroid Build Coastguard Worker     void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
74*0e209d39SAndroid Build Coastguard Worker                       Hashtable &model, UErrorCode &error);
75*0e209d39SAndroid Build Coastguard Worker 
76*0e209d39SAndroid Build Coastguard Worker     /**
77*0e209d39SAndroid Build Coastguard Worker      * Initialize the index list from the input string.
78*0e209d39SAndroid Build Coastguard Worker      *
79*0e209d39SAndroid Build Coastguard Worker      * @param inString A input string to be segmented.
80*0e209d39SAndroid Build Coastguard Worker      * @param indexList A code unit index list of inString.
81*0e209d39SAndroid Build Coastguard Worker      * @param status Information on any errors encountered.
82*0e209d39SAndroid Build Coastguard Worker      * @return The number of code units of the first four characters in inString.
83*0e209d39SAndroid Build Coastguard Worker      */
84*0e209d39SAndroid Build Coastguard Worker     int32_t initIndexList(const UnicodeString &inString, int32_t *indexList,
85*0e209d39SAndroid Build Coastguard Worker                           UErrorCode &status) const;
86*0e209d39SAndroid Build Coastguard Worker 
87*0e209d39SAndroid Build Coastguard Worker     /**
88*0e209d39SAndroid Build Coastguard Worker      * Evaluate whether the index is a potential breakpoint.
89*0e209d39SAndroid Build Coastguard Worker      *
90*0e209d39SAndroid Build Coastguard Worker      * @param inString A input string to be segmented.
91*0e209d39SAndroid Build Coastguard Worker      * @param indexList A code unit index list of the inString.
92*0e209d39SAndroid Build Coastguard Worker      * @param startIdx The start index of the indexList.
93*0e209d39SAndroid Build Coastguard Worker      * @param numCodeUnits  The current code unit boundary of the indexList.
94*0e209d39SAndroid Build Coastguard Worker      * @param numBreaks The accumulated number of breakpoints.
95*0e209d39SAndroid Build Coastguard Worker      * @param boundary A vector including the index of the breakpoint.
96*0e209d39SAndroid Build Coastguard Worker      * @param status Information on any errors encountered.
97*0e209d39SAndroid Build Coastguard Worker      * @return The number of breakpoints
98*0e209d39SAndroid Build Coastguard Worker      */
99*0e209d39SAndroid Build Coastguard Worker     int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx,
100*0e209d39SAndroid Build Coastguard Worker                                int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary,
101*0e209d39SAndroid Build Coastguard Worker                                UErrorCode &status) const;
102*0e209d39SAndroid Build Coastguard Worker 
103*0e209d39SAndroid Build Coastguard Worker     void printUnicodeString(const UnicodeString &s) const;
104*0e209d39SAndroid Build Coastguard Worker 
105*0e209d39SAndroid Build Coastguard Worker     UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
106*0e209d39SAndroid Build Coastguard Worker     UnicodeSet fClosePunctuationSet;
107*0e209d39SAndroid Build Coastguard Worker     Hashtable fModel[13];  // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13
108*0e209d39SAndroid Build Coastguard Worker     int32_t fNegativeSum;
109*0e209d39SAndroid Build Coastguard Worker };
110*0e209d39SAndroid Build Coastguard Worker 
111*0e209d39SAndroid Build Coastguard Worker #endif
112*0e209d39SAndroid Build Coastguard Worker 
113*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
114*0e209d39SAndroid Build Coastguard Worker 
115*0e209d39SAndroid Build Coastguard Worker /* MLBREAKENGINE_H */
116*0e209d39SAndroid Build Coastguard Worker #endif
117