xref: /aosp_15_r20/external/icu/libicu/cts_headers/strmatch.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 * Copyright (C) 2001-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/23/01    aliu        Creation.
 **********************************************************************
 */
11*0e209d39SAndroid Build Coastguard Worker #ifndef STRMATCH_H
12*0e209d39SAndroid Build Coastguard Worker #define STRMATCH_H
13*0e209d39SAndroid Build Coastguard Worker 
14*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
15*0e209d39SAndroid Build Coastguard Worker 
16*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_TRANSLITERATION
17*0e209d39SAndroid Build Coastguard Worker 
18*0e209d39SAndroid Build Coastguard Worker #include "unicode/unistr.h"
19*0e209d39SAndroid Build Coastguard Worker #include "unicode/unifunct.h"
20*0e209d39SAndroid Build Coastguard Worker #include "unicode/unimatch.h"
21*0e209d39SAndroid Build Coastguard Worker #include "unicode/unirepl.h"
22*0e209d39SAndroid Build Coastguard Worker 
23*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
24*0e209d39SAndroid Build Coastguard Worker 
25*0e209d39SAndroid Build Coastguard Worker class TransliterationRuleData;
26*0e209d39SAndroid Build Coastguard Worker 
27*0e209d39SAndroid Build Coastguard Worker /**
28*0e209d39SAndroid Build Coastguard Worker  * An object that matches a fixed input string, implementing the
29*0e209d39SAndroid Build Coastguard Worker  * UnicodeMatcher API.  This object also implements the
30*0e209d39SAndroid Build Coastguard Worker  * UnicodeReplacer API, allowing it to emit the matched text as
31*0e209d39SAndroid Build Coastguard Worker  * output.  Since the match text may contain flexible match elements,
32*0e209d39SAndroid Build Coastguard Worker  * such as UnicodeSets, the emitted text is not the match pattern, but
33*0e209d39SAndroid Build Coastguard Worker  * instead a substring of the actual matched text.  Following
34*0e209d39SAndroid Build Coastguard Worker  * convention, the output text is the leftmost match seen up to this
35*0e209d39SAndroid Build Coastguard Worker  * point.
36*0e209d39SAndroid Build Coastguard Worker  *
37*0e209d39SAndroid Build Coastguard Worker  * A StringMatcher may represent a segment, in which case it has a
38*0e209d39SAndroid Build Coastguard Worker  * positive segment number.  This affects how the matcher converts
39*0e209d39SAndroid Build Coastguard Worker  * itself to a pattern but does not otherwise affect its function.
40*0e209d39SAndroid Build Coastguard Worker  *
41*0e209d39SAndroid Build Coastguard Worker  * A StringMatcher that is not a segment should not be used as a
42*0e209d39SAndroid Build Coastguard Worker  * UnicodeReplacer.
43*0e209d39SAndroid Build Coastguard Worker  */
44*0e209d39SAndroid Build Coastguard Worker class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
45*0e209d39SAndroid Build Coastguard Worker 
46*0e209d39SAndroid Build Coastguard Worker  public:
47*0e209d39SAndroid Build Coastguard Worker 
48*0e209d39SAndroid Build Coastguard Worker     /**
49*0e209d39SAndroid Build Coastguard Worker      * Construct a matcher that matches the given pattern string.
50*0e209d39SAndroid Build Coastguard Worker      * @param string the pattern to be matched, possibly containing
51*0e209d39SAndroid Build Coastguard Worker      * stand-ins that represent nested UnicodeMatcher objects.
52*0e209d39SAndroid Build Coastguard Worker      * @param start inclusive start index of text to be replaced
53*0e209d39SAndroid Build Coastguard Worker      * @param limit exclusive end index of text to be replaced;
54*0e209d39SAndroid Build Coastguard Worker      * must be greater than or equal to start
55*0e209d39SAndroid Build Coastguard Worker      * @param segmentNum the segment number from 1..n, or 0 if this is
56*0e209d39SAndroid Build Coastguard Worker      * not a segment.
57*0e209d39SAndroid Build Coastguard Worker      * @param data context object mapping stand-ins to
58*0e209d39SAndroid Build Coastguard Worker      * UnicodeMatcher objects.
59*0e209d39SAndroid Build Coastguard Worker      */
60*0e209d39SAndroid Build Coastguard Worker     StringMatcher(const UnicodeString& string,
61*0e209d39SAndroid Build Coastguard Worker                   int32_t start,
62*0e209d39SAndroid Build Coastguard Worker                   int32_t limit,
63*0e209d39SAndroid Build Coastguard Worker                   int32_t segmentNum,
64*0e209d39SAndroid Build Coastguard Worker                   const TransliterationRuleData& data);
65*0e209d39SAndroid Build Coastguard Worker 
66*0e209d39SAndroid Build Coastguard Worker     /**
67*0e209d39SAndroid Build Coastguard Worker      * Copy constructor
68*0e209d39SAndroid Build Coastguard Worker      * @param o  the object to be copied.
69*0e209d39SAndroid Build Coastguard Worker      */
70*0e209d39SAndroid Build Coastguard Worker     StringMatcher(const StringMatcher& o);
71*0e209d39SAndroid Build Coastguard Worker 
72*0e209d39SAndroid Build Coastguard Worker     /**
73*0e209d39SAndroid Build Coastguard Worker      * Destructor
74*0e209d39SAndroid Build Coastguard Worker      */
75*0e209d39SAndroid Build Coastguard Worker     virtual ~StringMatcher();
76*0e209d39SAndroid Build Coastguard Worker 
77*0e209d39SAndroid Build Coastguard Worker     /**
78*0e209d39SAndroid Build Coastguard Worker      * Implement UnicodeFunctor
79*0e209d39SAndroid Build Coastguard Worker      * @return a copy of the object.
80*0e209d39SAndroid Build Coastguard Worker      */
81*0e209d39SAndroid Build Coastguard Worker     virtual StringMatcher* clone() const override;
82*0e209d39SAndroid Build Coastguard Worker 
83*0e209d39SAndroid Build Coastguard Worker     /**
84*0e209d39SAndroid Build Coastguard Worker      * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
85*0e209d39SAndroid Build Coastguard Worker      * and return the pointer.
86*0e209d39SAndroid Build Coastguard Worker      * @return the UnicodeMatcher point.
87*0e209d39SAndroid Build Coastguard Worker      */
88*0e209d39SAndroid Build Coastguard Worker     virtual UnicodeMatcher* toMatcher() const override;
89*0e209d39SAndroid Build Coastguard Worker 
90*0e209d39SAndroid Build Coastguard Worker     /**
91*0e209d39SAndroid Build Coastguard Worker      * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
92*0e209d39SAndroid Build Coastguard Worker      * and return the pointer.
93*0e209d39SAndroid Build Coastguard Worker      * @return the UnicodeReplacer pointer.
94*0e209d39SAndroid Build Coastguard Worker      */
95*0e209d39SAndroid Build Coastguard Worker     virtual UnicodeReplacer* toReplacer() const override;
96*0e209d39SAndroid Build Coastguard Worker 
97*0e209d39SAndroid Build Coastguard Worker     /**
98*0e209d39SAndroid Build Coastguard Worker      * Implement UnicodeMatcher
99*0e209d39SAndroid Build Coastguard Worker      * @param text the text to be matched
100*0e209d39SAndroid Build Coastguard Worker      * @param offset on input, the index into text at which to begin
101*0e209d39SAndroid Build Coastguard Worker      * matching.  On output, the limit of the matched text.  The
102*0e209d39SAndroid Build Coastguard Worker      * number of matched characters is the output value of offset
103*0e209d39SAndroid Build Coastguard Worker      * minus the input value.  Offset should always point to the
104*0e209d39SAndroid Build Coastguard Worker      * HIGH SURROGATE (leading code unit) of a pair of surrogates,
105*0e209d39SAndroid Build Coastguard Worker      * both on entry and upon return.
106*0e209d39SAndroid Build Coastguard Worker      * @param limit the limit index of text to be matched.  Greater
107*0e209d39SAndroid Build Coastguard Worker      * than offset for a forward direction match, less than offset for
108*0e209d39SAndroid Build Coastguard Worker      * a backward direction match.  The last character to be
109*0e209d39SAndroid Build Coastguard Worker      * considered for matching will be text.charAt(limit-1) in the
110*0e209d39SAndroid Build Coastguard Worker      * forward direction or text.charAt(limit+1) in the backward
111*0e209d39SAndroid Build Coastguard Worker      * direction.
112*0e209d39SAndroid Build Coastguard Worker      * @param incremental  if true, then assume further characters may
113*0e209d39SAndroid Build Coastguard Worker      * be inserted at limit and check for partial matching.  Otherwise
114*0e209d39SAndroid Build Coastguard Worker      * assume the text as given is complete.
115*0e209d39SAndroid Build Coastguard Worker      * @return a match degree value indicating a full match, a partial
116*0e209d39SAndroid Build Coastguard Worker      * match, or a mismatch.  If incremental is false then
117*0e209d39SAndroid Build Coastguard Worker      * U_PARTIAL_MATCH should never be returned.
118*0e209d39SAndroid Build Coastguard Worker      */
119*0e209d39SAndroid Build Coastguard Worker     virtual UMatchDegree matches(const Replaceable& text,
120*0e209d39SAndroid Build Coastguard Worker                                  int32_t& offset,
121*0e209d39SAndroid Build Coastguard Worker                                  int32_t limit,
122*0e209d39SAndroid Build Coastguard Worker                                  UBool incremental) override;
123*0e209d39SAndroid Build Coastguard Worker 
124*0e209d39SAndroid Build Coastguard Worker     /**
125*0e209d39SAndroid Build Coastguard Worker      * Implement UnicodeMatcher
126*0e209d39SAndroid Build Coastguard Worker      * @param result            Output param to receive the pattern.
127*0e209d39SAndroid Build Coastguard Worker      * @param escapeUnprintable if True then escape the unprintable characters.
128*0e209d39SAndroid Build Coastguard Worker      * @return                  A reference to 'result'.
129*0e209d39SAndroid Build Coastguard Worker      */
130*0e209d39SAndroid Build Coastguard Worker     virtual UnicodeString& toPattern(UnicodeString& result,
131*0e209d39SAndroid Build Coastguard Worker                                      UBool escapeUnprintable = false) const override;
132*0e209d39SAndroid Build Coastguard Worker 
133*0e209d39SAndroid Build Coastguard Worker     /**
134*0e209d39SAndroid Build Coastguard Worker      * Implement UnicodeMatcher
135*0e209d39SAndroid Build Coastguard Worker      * Returns true if this matcher will match a character c, where c
136*0e209d39SAndroid Build Coastguard Worker      * & 0xFF == v, at offset, in the forward direction (with limit >
137*0e209d39SAndroid Build Coastguard Worker      * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
138*0e209d39SAndroid Build Coastguard Worker      * indexing.
139*0e209d39SAndroid Build Coastguard Worker      * @param v    the given value
140*0e209d39SAndroid Build Coastguard Worker      * @return     true if this matcher will match a character c,
141*0e209d39SAndroid Build Coastguard Worker      *             where c & 0xFF == v
142*0e209d39SAndroid Build Coastguard Worker      */
143*0e209d39SAndroid Build Coastguard Worker     virtual UBool matchesIndexValue(uint8_t v) const override;
144*0e209d39SAndroid Build Coastguard Worker 
145*0e209d39SAndroid Build Coastguard Worker     /**
146*0e209d39SAndroid Build Coastguard Worker      * Implement UnicodeMatcher
147*0e209d39SAndroid Build Coastguard Worker      */
148*0e209d39SAndroid Build Coastguard Worker     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
149*0e209d39SAndroid Build Coastguard Worker 
150*0e209d39SAndroid Build Coastguard Worker     /**
151*0e209d39SAndroid Build Coastguard Worker      * Implement UnicodeFunctor
152*0e209d39SAndroid Build Coastguard Worker      */
153*0e209d39SAndroid Build Coastguard Worker     virtual void setData(const TransliterationRuleData*) override;
154*0e209d39SAndroid Build Coastguard Worker 
155*0e209d39SAndroid Build Coastguard Worker     /**
156*0e209d39SAndroid Build Coastguard Worker      * Replace characters in 'text' from 'start' to 'limit' with the
157*0e209d39SAndroid Build Coastguard Worker      * output text of this object.  Update the 'cursor' parameter to
158*0e209d39SAndroid Build Coastguard Worker      * give the cursor position and return the length of the
159*0e209d39SAndroid Build Coastguard Worker      * replacement text.
160*0e209d39SAndroid Build Coastguard Worker      *
161*0e209d39SAndroid Build Coastguard Worker      * @param text the text to be matched
162*0e209d39SAndroid Build Coastguard Worker      * @param start inclusive start index of text to be replaced
163*0e209d39SAndroid Build Coastguard Worker      * @param limit exclusive end index of text to be replaced;
164*0e209d39SAndroid Build Coastguard Worker      * must be greater than or equal to start
165*0e209d39SAndroid Build Coastguard Worker      * @param cursor output parameter for the cursor position.
166*0e209d39SAndroid Build Coastguard Worker      * Not all replacer objects will update this, but in a complete
167*0e209d39SAndroid Build Coastguard Worker      * tree of replacer objects, representing the entire output side
168*0e209d39SAndroid Build Coastguard Worker      * of a transliteration rule, at least one must update it.
169*0e209d39SAndroid Build Coastguard Worker      * @return the number of 16-bit code units in the text replacing
170*0e209d39SAndroid Build Coastguard Worker      * the characters at offsets start..(limit-1) in text
171*0e209d39SAndroid Build Coastguard Worker      */
172*0e209d39SAndroid Build Coastguard Worker     virtual int32_t replace(Replaceable& text,
173*0e209d39SAndroid Build Coastguard Worker                             int32_t start,
174*0e209d39SAndroid Build Coastguard Worker                             int32_t limit,
175*0e209d39SAndroid Build Coastguard Worker                             int32_t& cursor) override;
176*0e209d39SAndroid Build Coastguard Worker 
177*0e209d39SAndroid Build Coastguard Worker     /**
178*0e209d39SAndroid Build Coastguard Worker      * Returns a string representation of this replacer.  If the
179*0e209d39SAndroid Build Coastguard Worker      * result of calling this function is passed to the appropriate
180*0e209d39SAndroid Build Coastguard Worker      * parser, typically TransliteratorParser, it will produce another
181*0e209d39SAndroid Build Coastguard Worker      * replacer that is equal to this one.
182*0e209d39SAndroid Build Coastguard Worker      * @param result the string to receive the pattern.  Previous
183*0e209d39SAndroid Build Coastguard Worker      * contents will be deleted.
184*0e209d39SAndroid Build Coastguard Worker      * @param escapeUnprintable if true then convert unprintable
185*0e209d39SAndroid Build Coastguard Worker      * character to their hex escape representations, \\uxxxx or
186*0e209d39SAndroid Build Coastguard Worker      * \\Uxxxxxxxx.  Unprintable characters are defined by
187*0e209d39SAndroid Build Coastguard Worker      * Utility.isUnprintable().
188*0e209d39SAndroid Build Coastguard Worker      * @return a reference to 'result'.
189*0e209d39SAndroid Build Coastguard Worker      */
190*0e209d39SAndroid Build Coastguard Worker     virtual UnicodeString& toReplacerPattern(UnicodeString& result,
191*0e209d39SAndroid Build Coastguard Worker                                              UBool escapeUnprintable) const override;
192*0e209d39SAndroid Build Coastguard Worker 
193*0e209d39SAndroid Build Coastguard Worker     /**
194*0e209d39SAndroid Build Coastguard Worker      * Remove any match data.  This must be called before performing a
195*0e209d39SAndroid Build Coastguard Worker      * set of matches with this segment.
196*0e209d39SAndroid Build Coastguard Worker      */
197*0e209d39SAndroid Build Coastguard Worker     void resetMatch();
198*0e209d39SAndroid Build Coastguard Worker 
199*0e209d39SAndroid Build Coastguard Worker     /**
200*0e209d39SAndroid Build Coastguard Worker      * ICU "poor man's RTTI", returns a UClassID for the actual class.
201*0e209d39SAndroid Build Coastguard Worker      */
202*0e209d39SAndroid Build Coastguard Worker     virtual UClassID getDynamicClassID() const override;
203*0e209d39SAndroid Build Coastguard Worker 
204*0e209d39SAndroid Build Coastguard Worker     /**
205*0e209d39SAndroid Build Coastguard Worker      * ICU "poor man's RTTI", returns a UClassID for this class.
206*0e209d39SAndroid Build Coastguard Worker      */
207*0e209d39SAndroid Build Coastguard Worker     static UClassID U_EXPORT2 getStaticClassID();
208*0e209d39SAndroid Build Coastguard Worker 
209*0e209d39SAndroid Build Coastguard Worker     /**
210*0e209d39SAndroid Build Coastguard Worker      * Union the set of all characters that may output by this object
211*0e209d39SAndroid Build Coastguard Worker      * into the given set.
212*0e209d39SAndroid Build Coastguard Worker      * @param toUnionTo the set into which to union the output characters
213*0e209d39SAndroid Build Coastguard Worker      */
214*0e209d39SAndroid Build Coastguard Worker     virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const override;
215*0e209d39SAndroid Build Coastguard Worker 
216*0e209d39SAndroid Build Coastguard Worker  private:
217*0e209d39SAndroid Build Coastguard Worker 
218*0e209d39SAndroid Build Coastguard Worker     /**
219*0e209d39SAndroid Build Coastguard Worker      * The text to be matched.
220*0e209d39SAndroid Build Coastguard Worker      */
221*0e209d39SAndroid Build Coastguard Worker     UnicodeString pattern;
222*0e209d39SAndroid Build Coastguard Worker 
223*0e209d39SAndroid Build Coastguard Worker     /**
224*0e209d39SAndroid Build Coastguard Worker      * Context object that maps stand-ins to matcher and replacer
225*0e209d39SAndroid Build Coastguard Worker      * objects.
226*0e209d39SAndroid Build Coastguard Worker      */
227*0e209d39SAndroid Build Coastguard Worker     const TransliterationRuleData* data;
228*0e209d39SAndroid Build Coastguard Worker 
229*0e209d39SAndroid Build Coastguard Worker     /**
230*0e209d39SAndroid Build Coastguard Worker      * The segment number, 1-based, or 0 if not a segment.
231*0e209d39SAndroid Build Coastguard Worker      */
232*0e209d39SAndroid Build Coastguard Worker     int32_t segmentNumber;
233*0e209d39SAndroid Build Coastguard Worker 
234*0e209d39SAndroid Build Coastguard Worker     /**
235*0e209d39SAndroid Build Coastguard Worker      * Start offset, in the match text, of the <em>rightmost</em>
236*0e209d39SAndroid Build Coastguard Worker      * match.
237*0e209d39SAndroid Build Coastguard Worker      */
238*0e209d39SAndroid Build Coastguard Worker     int32_t matchStart;
239*0e209d39SAndroid Build Coastguard Worker 
240*0e209d39SAndroid Build Coastguard Worker     /**
241*0e209d39SAndroid Build Coastguard Worker      * Limit offset, in the match text, of the <em>rightmost</em>
242*0e209d39SAndroid Build Coastguard Worker      * match.
243*0e209d39SAndroid Build Coastguard Worker      */
244*0e209d39SAndroid Build Coastguard Worker     int32_t matchLimit;
245*0e209d39SAndroid Build Coastguard Worker 
246*0e209d39SAndroid Build Coastguard Worker };
247*0e209d39SAndroid Build Coastguard Worker 
248*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
249*0e209d39SAndroid Build Coastguard Worker 
250*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_TRANSLITERATION */
251*0e209d39SAndroid Build Coastguard Worker 
252*0e209d39SAndroid Build Coastguard Worker #endif
253