1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker **********************************************************************
5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 1999-2011, International Business Machines
6*0e209d39SAndroid Build Coastguard Worker * Corporation and others. All Rights Reserved.
7*0e209d39SAndroid Build Coastguard Worker **********************************************************************
8*0e209d39SAndroid Build Coastguard Worker * Date Name Description
9*0e209d39SAndroid Build Coastguard Worker * 11/17/99 aliu Creation.
10*0e209d39SAndroid Build Coastguard Worker **********************************************************************
11*0e209d39SAndroid Build Coastguard Worker */
12*0e209d39SAndroid Build Coastguard Worker
13*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
14*0e209d39SAndroid Build Coastguard Worker
15*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_TRANSLITERATION
16*0e209d39SAndroid Build Coastguard Worker
17*0e209d39SAndroid Build Coastguard Worker #include "unicode/rep.h"
18*0e209d39SAndroid Build Coastguard Worker #include "unicode/unifilt.h"
19*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h"
20*0e209d39SAndroid Build Coastguard Worker #include "unicode/utf16.h"
21*0e209d39SAndroid Build Coastguard Worker #include "rbt_rule.h"
22*0e209d39SAndroid Build Coastguard Worker #include "rbt_data.h"
23*0e209d39SAndroid Build Coastguard Worker #include "cmemory.h"
24*0e209d39SAndroid Build Coastguard Worker #include "strmatch.h"
25*0e209d39SAndroid Build Coastguard Worker #include "strrepl.h"
26*0e209d39SAndroid Build Coastguard Worker #include "util.h"
27*0e209d39SAndroid Build Coastguard Worker #include "putilimp.h"
28*0e209d39SAndroid Build Coastguard Worker
29*0e209d39SAndroid Build Coastguard Worker static const char16_t FORWARD_OP[] = {32,62,32,0}; // " > "
30*0e209d39SAndroid Build Coastguard Worker
31*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
32*0e209d39SAndroid Build Coastguard Worker
33*0e209d39SAndroid Build Coastguard Worker /**
34*0e209d39SAndroid Build Coastguard Worker * Construct a new rule with the given input, output text, and other
35*0e209d39SAndroid Build Coastguard Worker * attributes. A cursor position may be specified for the output text.
36*0e209d39SAndroid Build Coastguard Worker * @param input input string, including key and optional ante and
37*0e209d39SAndroid Build Coastguard Worker * post context
38*0e209d39SAndroid Build Coastguard Worker * @param anteContextPos offset into input to end of ante context, or -1 if
39*0e209d39SAndroid Build Coastguard Worker * none. Must be <= input.length() if not -1.
40*0e209d39SAndroid Build Coastguard Worker * @param postContextPos offset into input to start of post context, or -1
41*0e209d39SAndroid Build Coastguard Worker * if none. Must be <= input.length() if not -1, and must be >=
42*0e209d39SAndroid Build Coastguard Worker * anteContextPos.
43*0e209d39SAndroid Build Coastguard Worker * @param output output string
44*0e209d39SAndroid Build Coastguard Worker * @param cursorPosition offset into output at which cursor is located, or -1 if
45*0e209d39SAndroid Build Coastguard Worker * none. If less than zero, then the cursor is placed after the
46*0e209d39SAndroid Build Coastguard Worker * <code>output</code>; that is, -1 is equivalent to
47*0e209d39SAndroid Build Coastguard Worker * <code>output.length()</code>. If greater than
48*0e209d39SAndroid Build Coastguard Worker * <code>output.length()</code> then an exception is thrown.
49*0e209d39SAndroid Build Coastguard Worker * @param segs array of UnicodeFunctors corresponding to input pattern
50*0e209d39SAndroid Build Coastguard Worker * segments, or null if there are none. The array itself is adopted,
51*0e209d39SAndroid Build Coastguard Worker * but the pointers within it are not.
52*0e209d39SAndroid Build Coastguard Worker * @param segsCount number of elements in segs[]
53*0e209d39SAndroid Build Coastguard Worker * @param anchorStart true if the the rule is anchored on the left to
54*0e209d39SAndroid Build Coastguard Worker * the context start
55*0e209d39SAndroid Build Coastguard Worker * @param anchorEnd true if the rule is anchored on the right to the
56*0e209d39SAndroid Build Coastguard Worker * context limit
57*0e209d39SAndroid Build Coastguard Worker */
TransliterationRule(const UnicodeString & input,int32_t anteContextPos,int32_t postContextPos,const UnicodeString & outputStr,int32_t cursorPosition,int32_t cursorOffset,UnicodeFunctor ** segs,int32_t segsCount,UBool anchorStart,UBool anchorEnd,const TransliterationRuleData * theData,UErrorCode & status)58*0e209d39SAndroid Build Coastguard Worker TransliterationRule::TransliterationRule(const UnicodeString& input,
59*0e209d39SAndroid Build Coastguard Worker int32_t anteContextPos, int32_t postContextPos,
60*0e209d39SAndroid Build Coastguard Worker const UnicodeString& outputStr,
61*0e209d39SAndroid Build Coastguard Worker int32_t cursorPosition, int32_t cursorOffset,
62*0e209d39SAndroid Build Coastguard Worker UnicodeFunctor** segs,
63*0e209d39SAndroid Build Coastguard Worker int32_t segsCount,
64*0e209d39SAndroid Build Coastguard Worker UBool anchorStart, UBool anchorEnd,
65*0e209d39SAndroid Build Coastguard Worker const TransliterationRuleData* theData,
66*0e209d39SAndroid Build Coastguard Worker UErrorCode& status) :
67*0e209d39SAndroid Build Coastguard Worker UMemory(),
68*0e209d39SAndroid Build Coastguard Worker segments(nullptr),
69*0e209d39SAndroid Build Coastguard Worker data(theData) {
70*0e209d39SAndroid Build Coastguard Worker
71*0e209d39SAndroid Build Coastguard Worker if (U_FAILURE(status)) {
72*0e209d39SAndroid Build Coastguard Worker return;
73*0e209d39SAndroid Build Coastguard Worker }
74*0e209d39SAndroid Build Coastguard Worker // Do range checks only when warranted to save time
75*0e209d39SAndroid Build Coastguard Worker if (anteContextPos < 0) {
76*0e209d39SAndroid Build Coastguard Worker anteContextLength = 0;
77*0e209d39SAndroid Build Coastguard Worker } else {
78*0e209d39SAndroid Build Coastguard Worker if (anteContextPos > input.length()) {
79*0e209d39SAndroid Build Coastguard Worker // throw new IllegalArgumentException("Invalid ante context");
80*0e209d39SAndroid Build Coastguard Worker status = U_ILLEGAL_ARGUMENT_ERROR;
81*0e209d39SAndroid Build Coastguard Worker return;
82*0e209d39SAndroid Build Coastguard Worker }
83*0e209d39SAndroid Build Coastguard Worker anteContextLength = anteContextPos;
84*0e209d39SAndroid Build Coastguard Worker }
85*0e209d39SAndroid Build Coastguard Worker if (postContextPos < 0) {
86*0e209d39SAndroid Build Coastguard Worker keyLength = input.length() - anteContextLength;
87*0e209d39SAndroid Build Coastguard Worker } else {
88*0e209d39SAndroid Build Coastguard Worker if (postContextPos < anteContextLength ||
89*0e209d39SAndroid Build Coastguard Worker postContextPos > input.length()) {
90*0e209d39SAndroid Build Coastguard Worker // throw new IllegalArgumentException("Invalid post context");
91*0e209d39SAndroid Build Coastguard Worker status = U_ILLEGAL_ARGUMENT_ERROR;
92*0e209d39SAndroid Build Coastguard Worker return;
93*0e209d39SAndroid Build Coastguard Worker }
94*0e209d39SAndroid Build Coastguard Worker keyLength = postContextPos - anteContextLength;
95*0e209d39SAndroid Build Coastguard Worker }
96*0e209d39SAndroid Build Coastguard Worker if (cursorPosition < 0) {
97*0e209d39SAndroid Build Coastguard Worker cursorPosition = outputStr.length();
98*0e209d39SAndroid Build Coastguard Worker } else if (cursorPosition > outputStr.length()) {
99*0e209d39SAndroid Build Coastguard Worker // throw new IllegalArgumentException("Invalid cursor position");
100*0e209d39SAndroid Build Coastguard Worker status = U_ILLEGAL_ARGUMENT_ERROR;
101*0e209d39SAndroid Build Coastguard Worker return;
102*0e209d39SAndroid Build Coastguard Worker }
103*0e209d39SAndroid Build Coastguard Worker // We don't validate the segments array. The caller must
104*0e209d39SAndroid Build Coastguard Worker // guarantee that the segments are well-formed (that is, that
105*0e209d39SAndroid Build Coastguard Worker // all $n references in the output refer to indices of this
106*0e209d39SAndroid Build Coastguard Worker // array, and that no array elements are null).
107*0e209d39SAndroid Build Coastguard Worker this->segments = segs;
108*0e209d39SAndroid Build Coastguard Worker this->segmentsCount = segsCount;
109*0e209d39SAndroid Build Coastguard Worker
110*0e209d39SAndroid Build Coastguard Worker pattern = input;
111*0e209d39SAndroid Build Coastguard Worker flags = 0;
112*0e209d39SAndroid Build Coastguard Worker if (anchorStart) {
113*0e209d39SAndroid Build Coastguard Worker flags |= ANCHOR_START;
114*0e209d39SAndroid Build Coastguard Worker }
115*0e209d39SAndroid Build Coastguard Worker if (anchorEnd) {
116*0e209d39SAndroid Build Coastguard Worker flags |= ANCHOR_END;
117*0e209d39SAndroid Build Coastguard Worker }
118*0e209d39SAndroid Build Coastguard Worker
119*0e209d39SAndroid Build Coastguard Worker anteContext = nullptr;
120*0e209d39SAndroid Build Coastguard Worker if (anteContextLength > 0) {
121*0e209d39SAndroid Build Coastguard Worker anteContext = new StringMatcher(pattern, 0, anteContextLength,
122*0e209d39SAndroid Build Coastguard Worker false, *data);
123*0e209d39SAndroid Build Coastguard Worker /* test for nullptr */
124*0e209d39SAndroid Build Coastguard Worker if (anteContext == nullptr) {
125*0e209d39SAndroid Build Coastguard Worker status = U_MEMORY_ALLOCATION_ERROR;
126*0e209d39SAndroid Build Coastguard Worker return;
127*0e209d39SAndroid Build Coastguard Worker }
128*0e209d39SAndroid Build Coastguard Worker }
129*0e209d39SAndroid Build Coastguard Worker
130*0e209d39SAndroid Build Coastguard Worker key = nullptr;
131*0e209d39SAndroid Build Coastguard Worker if (keyLength > 0) {
132*0e209d39SAndroid Build Coastguard Worker key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength,
133*0e209d39SAndroid Build Coastguard Worker false, *data);
134*0e209d39SAndroid Build Coastguard Worker /* test for nullptr */
135*0e209d39SAndroid Build Coastguard Worker if (key == nullptr) {
136*0e209d39SAndroid Build Coastguard Worker status = U_MEMORY_ALLOCATION_ERROR;
137*0e209d39SAndroid Build Coastguard Worker return;
138*0e209d39SAndroid Build Coastguard Worker }
139*0e209d39SAndroid Build Coastguard Worker }
140*0e209d39SAndroid Build Coastguard Worker
141*0e209d39SAndroid Build Coastguard Worker int32_t postContextLength = pattern.length() - keyLength - anteContextLength;
142*0e209d39SAndroid Build Coastguard Worker postContext = nullptr;
143*0e209d39SAndroid Build Coastguard Worker if (postContextLength > 0) {
144*0e209d39SAndroid Build Coastguard Worker postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(),
145*0e209d39SAndroid Build Coastguard Worker false, *data);
146*0e209d39SAndroid Build Coastguard Worker /* test for nullptr */
147*0e209d39SAndroid Build Coastguard Worker if (postContext == nullptr) {
148*0e209d39SAndroid Build Coastguard Worker status = U_MEMORY_ALLOCATION_ERROR;
149*0e209d39SAndroid Build Coastguard Worker return;
150*0e209d39SAndroid Build Coastguard Worker }
151*0e209d39SAndroid Build Coastguard Worker }
152*0e209d39SAndroid Build Coastguard Worker
153*0e209d39SAndroid Build Coastguard Worker this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data);
154*0e209d39SAndroid Build Coastguard Worker /* test for nullptr */
155*0e209d39SAndroid Build Coastguard Worker if (this->output == nullptr) {
156*0e209d39SAndroid Build Coastguard Worker status = U_MEMORY_ALLOCATION_ERROR;
157*0e209d39SAndroid Build Coastguard Worker return;
158*0e209d39SAndroid Build Coastguard Worker }
159*0e209d39SAndroid Build Coastguard Worker }
160*0e209d39SAndroid Build Coastguard Worker
161*0e209d39SAndroid Build Coastguard Worker /**
162*0e209d39SAndroid Build Coastguard Worker * Copy constructor.
163*0e209d39SAndroid Build Coastguard Worker */
TransliterationRule(TransliterationRule & other)164*0e209d39SAndroid Build Coastguard Worker TransliterationRule::TransliterationRule(TransliterationRule& other) :
165*0e209d39SAndroid Build Coastguard Worker UMemory(other),
166*0e209d39SAndroid Build Coastguard Worker anteContext(nullptr),
167*0e209d39SAndroid Build Coastguard Worker key(nullptr),
168*0e209d39SAndroid Build Coastguard Worker postContext(nullptr),
169*0e209d39SAndroid Build Coastguard Worker pattern(other.pattern),
170*0e209d39SAndroid Build Coastguard Worker anteContextLength(other.anteContextLength),
171*0e209d39SAndroid Build Coastguard Worker keyLength(other.keyLength),
172*0e209d39SAndroid Build Coastguard Worker flags(other.flags),
173*0e209d39SAndroid Build Coastguard Worker data(other.data) {
174*0e209d39SAndroid Build Coastguard Worker
175*0e209d39SAndroid Build Coastguard Worker segments = nullptr;
176*0e209d39SAndroid Build Coastguard Worker segmentsCount = 0;
177*0e209d39SAndroid Build Coastguard Worker if (other.segmentsCount > 0) {
178*0e209d39SAndroid Build Coastguard Worker segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *));
179*0e209d39SAndroid Build Coastguard Worker uprv_memcpy(segments, other.segments, (size_t)other.segmentsCount*sizeof(segments[0]));
180*0e209d39SAndroid Build Coastguard Worker }
181*0e209d39SAndroid Build Coastguard Worker
182*0e209d39SAndroid Build Coastguard Worker if (other.anteContext != nullptr) {
183*0e209d39SAndroid Build Coastguard Worker anteContext = other.anteContext->clone();
184*0e209d39SAndroid Build Coastguard Worker }
185*0e209d39SAndroid Build Coastguard Worker if (other.key != nullptr) {
186*0e209d39SAndroid Build Coastguard Worker key = other.key->clone();
187*0e209d39SAndroid Build Coastguard Worker }
188*0e209d39SAndroid Build Coastguard Worker if (other.postContext != nullptr) {
189*0e209d39SAndroid Build Coastguard Worker postContext = other.postContext->clone();
190*0e209d39SAndroid Build Coastguard Worker }
191*0e209d39SAndroid Build Coastguard Worker output = other.output->clone();
192*0e209d39SAndroid Build Coastguard Worker }
193*0e209d39SAndroid Build Coastguard Worker
~TransliterationRule()194*0e209d39SAndroid Build Coastguard Worker TransliterationRule::~TransliterationRule() {
195*0e209d39SAndroid Build Coastguard Worker uprv_free(segments);
196*0e209d39SAndroid Build Coastguard Worker delete anteContext;
197*0e209d39SAndroid Build Coastguard Worker delete key;
198*0e209d39SAndroid Build Coastguard Worker delete postContext;
199*0e209d39SAndroid Build Coastguard Worker delete output;
200*0e209d39SAndroid Build Coastguard Worker }
201*0e209d39SAndroid Build Coastguard Worker
202*0e209d39SAndroid Build Coastguard Worker /**
203*0e209d39SAndroid Build Coastguard Worker * Return the preceding context length. This method is needed to
204*0e209d39SAndroid Build Coastguard Worker * support the <code>Transliterator</code> method
205*0e209d39SAndroid Build Coastguard Worker * <code>getMaximumContextLength()</code>. Internally, this is
206*0e209d39SAndroid Build Coastguard Worker * implemented as the anteContextLength, optionally plus one if
207*0e209d39SAndroid Build Coastguard Worker * there is a start anchor. The one character anchor gap is
208*0e209d39SAndroid Build Coastguard Worker * needed to make repeated incremental transliteration with
209*0e209d39SAndroid Build Coastguard Worker * anchors work.
210*0e209d39SAndroid Build Coastguard Worker */
getContextLength() const211*0e209d39SAndroid Build Coastguard Worker int32_t TransliterationRule::getContextLength() const {
212*0e209d39SAndroid Build Coastguard Worker return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);
213*0e209d39SAndroid Build Coastguard Worker }
214*0e209d39SAndroid Build Coastguard Worker
215*0e209d39SAndroid Build Coastguard Worker /**
216*0e209d39SAndroid Build Coastguard Worker * Internal method. Returns 8-bit index value for this rule.
217*0e209d39SAndroid Build Coastguard Worker * This is the low byte of the first character of the key,
218*0e209d39SAndroid Build Coastguard Worker * unless the first character of the key is a set. If it's a
219*0e209d39SAndroid Build Coastguard Worker * set, or otherwise can match multiple keys, the index value is -1.
220*0e209d39SAndroid Build Coastguard Worker */
getIndexValue() const221*0e209d39SAndroid Build Coastguard Worker int16_t TransliterationRule::getIndexValue() const {
222*0e209d39SAndroid Build Coastguard Worker if (anteContextLength == pattern.length()) {
223*0e209d39SAndroid Build Coastguard Worker // A pattern with just ante context {such as foo)>bar} can
224*0e209d39SAndroid Build Coastguard Worker // match any key.
225*0e209d39SAndroid Build Coastguard Worker return -1;
226*0e209d39SAndroid Build Coastguard Worker }
227*0e209d39SAndroid Build Coastguard Worker UChar32 c = pattern.char32At(anteContextLength);
228*0e209d39SAndroid Build Coastguard Worker return (int16_t)(data->lookupMatcher(c) == nullptr ? (c & 0xFF) : -1);
229*0e209d39SAndroid Build Coastguard Worker }
230*0e209d39SAndroid Build Coastguard Worker
231*0e209d39SAndroid Build Coastguard Worker /**
232*0e209d39SAndroid Build Coastguard Worker * Internal method. Returns true if this rule matches the given
233*0e209d39SAndroid Build Coastguard Worker * index value. The index value is an 8-bit integer, 0..255,
234*0e209d39SAndroid Build Coastguard Worker * representing the low byte of the first character of the key.
235*0e209d39SAndroid Build Coastguard Worker * It matches this rule if it matches the first character of the
236*0e209d39SAndroid Build Coastguard Worker * key, or if the first character of the key is a set, and the set
237*0e209d39SAndroid Build Coastguard Worker * contains any character with a low byte equal to the index
238*0e209d39SAndroid Build Coastguard Worker * value. If the rule contains only ante context, as in foo)>bar,
239*0e209d39SAndroid Build Coastguard Worker * then it will match any key.
240*0e209d39SAndroid Build Coastguard Worker */
matchesIndexValue(uint8_t v) const241*0e209d39SAndroid Build Coastguard Worker UBool TransliterationRule::matchesIndexValue(uint8_t v) const {
242*0e209d39SAndroid Build Coastguard Worker // Delegate to the key, or if there is none, to the postContext.
243*0e209d39SAndroid Build Coastguard Worker // If there is neither then we match any key; return true.
244*0e209d39SAndroid Build Coastguard Worker UnicodeMatcher *m = (key != nullptr) ? key : postContext;
245*0e209d39SAndroid Build Coastguard Worker return (m != nullptr) ? m->matchesIndexValue(v) : true;
246*0e209d39SAndroid Build Coastguard Worker }
247*0e209d39SAndroid Build Coastguard Worker
248*0e209d39SAndroid Build Coastguard Worker /**
249*0e209d39SAndroid Build Coastguard Worker * Return true if this rule masks another rule. If r1 masks r2 then
250*0e209d39SAndroid Build Coastguard Worker * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
251*0e209d39SAndroid Build Coastguard Worker * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
252*0e209d39SAndroid Build Coastguard Worker * "[c]a>x" masks "[dc]a>y".
253*0e209d39SAndroid Build Coastguard Worker */
masks(const TransliterationRule & r2) const254*0e209d39SAndroid Build Coastguard Worker UBool TransliterationRule::masks(const TransliterationRule& r2) const {
255*0e209d39SAndroid Build Coastguard Worker /* Rule r1 masks rule r2 if the string formed of the
256*0e209d39SAndroid Build Coastguard Worker * antecontext, key, and postcontext overlaps in the following
257*0e209d39SAndroid Build Coastguard Worker * way:
258*0e209d39SAndroid Build Coastguard Worker *
259*0e209d39SAndroid Build Coastguard Worker * r1: aakkkpppp
260*0e209d39SAndroid Build Coastguard Worker * r2: aaakkkkkpppp
261*0e209d39SAndroid Build Coastguard Worker * ^
262*0e209d39SAndroid Build Coastguard Worker *
263*0e209d39SAndroid Build Coastguard Worker * The strings must be aligned at the first character of the
264*0e209d39SAndroid Build Coastguard Worker * key. The length of r1 to the left of the alignment point
265*0e209d39SAndroid Build Coastguard Worker * must be <= the length of r2 to the left; ditto for the
266*0e209d39SAndroid Build Coastguard Worker * right. The characters of r1 must equal (or be a superset
267*0e209d39SAndroid Build Coastguard Worker * of) the corresponding characters of r2. The superset
268*0e209d39SAndroid Build Coastguard Worker * operation should be performed to check for UnicodeSet
269*0e209d39SAndroid Build Coastguard Worker * masking.
270*0e209d39SAndroid Build Coastguard Worker *
271*0e209d39SAndroid Build Coastguard Worker * Anchors: Two patterns that differ only in anchors only
272*0e209d39SAndroid Build Coastguard Worker * mask one another if they are exactly equal, and r2 has
273*0e209d39SAndroid Build Coastguard Worker * all the anchors r1 has (optionally, plus some). Here Y
274*0e209d39SAndroid Build Coastguard Worker * means the row masks the column, N means it doesn't.
275*0e209d39SAndroid Build Coastguard Worker *
276*0e209d39SAndroid Build Coastguard Worker * ab ^ab ab$ ^ab$
277*0e209d39SAndroid Build Coastguard Worker * ab Y Y Y Y
278*0e209d39SAndroid Build Coastguard Worker * ^ab N Y N Y
279*0e209d39SAndroid Build Coastguard Worker * ab$ N N Y Y
280*0e209d39SAndroid Build Coastguard Worker * ^ab$ N N N Y
281*0e209d39SAndroid Build Coastguard Worker *
282*0e209d39SAndroid Build Coastguard Worker * Post context: {a}b masks ab, but not vice versa, since {a}b
283*0e209d39SAndroid Build Coastguard Worker * matches everything ab matches, and {a}b matches {|a|}b but ab
284*0e209d39SAndroid Build Coastguard Worker * does not. Pre context is different (a{b} does not align with
285*0e209d39SAndroid Build Coastguard Worker * ab).
286*0e209d39SAndroid Build Coastguard Worker */
287*0e209d39SAndroid Build Coastguard Worker
288*0e209d39SAndroid Build Coastguard Worker /* LIMITATION of the current mask algorithm: Some rule
289*0e209d39SAndroid Build Coastguard Worker * maskings are currently not detected. For example,
290*0e209d39SAndroid Build Coastguard Worker * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO
291*0e209d39SAndroid Build Coastguard Worker */
292*0e209d39SAndroid Build Coastguard Worker
293*0e209d39SAndroid Build Coastguard Worker int32_t len = pattern.length();
294*0e209d39SAndroid Build Coastguard Worker int32_t left = anteContextLength;
295*0e209d39SAndroid Build Coastguard Worker int32_t left2 = r2.anteContextLength;
296*0e209d39SAndroid Build Coastguard Worker int32_t right = len - left;
297*0e209d39SAndroid Build Coastguard Worker int32_t right2 = r2.pattern.length() - left2;
298*0e209d39SAndroid Build Coastguard Worker int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern);
299*0e209d39SAndroid Build Coastguard Worker
300*0e209d39SAndroid Build Coastguard Worker // TODO Clean this up -- some logic might be combinable with the
301*0e209d39SAndroid Build Coastguard Worker // next statement.
302*0e209d39SAndroid Build Coastguard Worker
303*0e209d39SAndroid Build Coastguard Worker // Test for anchor masking
304*0e209d39SAndroid Build Coastguard Worker if (left == left2 && right == right2 &&
305*0e209d39SAndroid Build Coastguard Worker keyLength <= r2.keyLength &&
306*0e209d39SAndroid Build Coastguard Worker 0 == cachedCompare) {
307*0e209d39SAndroid Build Coastguard Worker // The following boolean logic implements the table above
308*0e209d39SAndroid Build Coastguard Worker return (flags == r2.flags) ||
309*0e209d39SAndroid Build Coastguard Worker (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) ||
310*0e209d39SAndroid Build Coastguard Worker ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END));
311*0e209d39SAndroid Build Coastguard Worker }
312*0e209d39SAndroid Build Coastguard Worker
313*0e209d39SAndroid Build Coastguard Worker return left <= left2 &&
314*0e209d39SAndroid Build Coastguard Worker (right < right2 ||
315*0e209d39SAndroid Build Coastguard Worker (right == right2 && keyLength <= r2.keyLength)) &&
316*0e209d39SAndroid Build Coastguard Worker (0 == cachedCompare);
317*0e209d39SAndroid Build Coastguard Worker }
318*0e209d39SAndroid Build Coastguard Worker
posBefore(const Replaceable & str,int32_t pos)319*0e209d39SAndroid Build Coastguard Worker static inline int32_t posBefore(const Replaceable& str, int32_t pos) {
320*0e209d39SAndroid Build Coastguard Worker return (pos > 0) ?
321*0e209d39SAndroid Build Coastguard Worker pos - U16_LENGTH(str.char32At(pos-1)) :
322*0e209d39SAndroid Build Coastguard Worker pos - 1;
323*0e209d39SAndroid Build Coastguard Worker }
324*0e209d39SAndroid Build Coastguard Worker
posAfter(const Replaceable & str,int32_t pos)325*0e209d39SAndroid Build Coastguard Worker static inline int32_t posAfter(const Replaceable& str, int32_t pos) {
326*0e209d39SAndroid Build Coastguard Worker return (pos >= 0 && pos < str.length()) ?
327*0e209d39SAndroid Build Coastguard Worker pos + U16_LENGTH(str.char32At(pos)) :
328*0e209d39SAndroid Build Coastguard Worker pos + 1;
329*0e209d39SAndroid Build Coastguard Worker }
330*0e209d39SAndroid Build Coastguard Worker
331*0e209d39SAndroid Build Coastguard Worker /**
332*0e209d39SAndroid Build Coastguard Worker * Attempt a match and replacement at the given position. Return
333*0e209d39SAndroid Build Coastguard Worker * the degree of match between this rule and the given text. The
334*0e209d39SAndroid Build Coastguard Worker * degree of match may be mismatch, a partial match, or a full
335*0e209d39SAndroid Build Coastguard Worker * match. A mismatch means at least one character of the text
336*0e209d39SAndroid Build Coastguard Worker * does not match the context or key. A partial match means some
337*0e209d39SAndroid Build Coastguard Worker * context and key characters match, but the text is not long
338*0e209d39SAndroid Build Coastguard Worker * enough to match all of them. A full match means all context
339*0e209d39SAndroid Build Coastguard Worker * and key characters match.
340*0e209d39SAndroid Build Coastguard Worker *
341*0e209d39SAndroid Build Coastguard Worker * If a full match is obtained, perform a replacement, update pos,
342*0e209d39SAndroid Build Coastguard Worker * and return U_MATCH. Otherwise both text and pos are unchanged.
343*0e209d39SAndroid Build Coastguard Worker *
344*0e209d39SAndroid Build Coastguard Worker * @param text the text
345*0e209d39SAndroid Build Coastguard Worker * @param pos the position indices
346*0e209d39SAndroid Build Coastguard Worker * @param incremental if true, test for partial matches that may
347*0e209d39SAndroid Build Coastguard Worker * be completed by additional text inserted at pos.limit.
348*0e209d39SAndroid Build Coastguard Worker * @return one of <code>U_MISMATCH</code>,
349*0e209d39SAndroid Build Coastguard Worker * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
350*0e209d39SAndroid Build Coastguard Worker * incremental is false then U_PARTIAL_MATCH will not be returned.
351*0e209d39SAndroid Build Coastguard Worker */
matchAndReplace(Replaceable & text,UTransPosition & pos,UBool incremental) const352*0e209d39SAndroid Build Coastguard Worker UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
353*0e209d39SAndroid Build Coastguard Worker UTransPosition& pos,
354*0e209d39SAndroid Build Coastguard Worker UBool incremental) const {
355*0e209d39SAndroid Build Coastguard Worker // Matching and replacing are done in one method because the
356*0e209d39SAndroid Build Coastguard Worker // replacement operation needs information obtained during the
357*0e209d39SAndroid Build Coastguard Worker // match. Another way to do this is to have the match method
358*0e209d39SAndroid Build Coastguard Worker // create a match result struct with relevant offsets, and to pass
359*0e209d39SAndroid Build Coastguard Worker // this into the replace method.
360*0e209d39SAndroid Build Coastguard Worker
361*0e209d39SAndroid Build Coastguard Worker // ============================ MATCH ===========================
362*0e209d39SAndroid Build Coastguard Worker
363*0e209d39SAndroid Build Coastguard Worker // Reset segment match data
364*0e209d39SAndroid Build Coastguard Worker if (segments != nullptr) {
365*0e209d39SAndroid Build Coastguard Worker for (int32_t i=0; i<segmentsCount; ++i) {
366*0e209d39SAndroid Build Coastguard Worker ((StringMatcher*) segments[i])->resetMatch();
367*0e209d39SAndroid Build Coastguard Worker }
368*0e209d39SAndroid Build Coastguard Worker }
369*0e209d39SAndroid Build Coastguard Worker
370*0e209d39SAndroid Build Coastguard Worker // int32_t lenDelta, keyLimit;
371*0e209d39SAndroid Build Coastguard Worker int32_t keyLimit;
372*0e209d39SAndroid Build Coastguard Worker
373*0e209d39SAndroid Build Coastguard Worker // ------------------------ Ante Context ------------------------
374*0e209d39SAndroid Build Coastguard Worker
375*0e209d39SAndroid Build Coastguard Worker // A mismatch in the ante context, or with the start anchor,
376*0e209d39SAndroid Build Coastguard Worker // is an outright U_MISMATCH regardless of whether we are
377*0e209d39SAndroid Build Coastguard Worker // incremental or not.
378*0e209d39SAndroid Build Coastguard Worker int32_t oText; // offset into 'text'
379*0e209d39SAndroid Build Coastguard Worker // int32_t newStart = 0;
380*0e209d39SAndroid Build Coastguard Worker int32_t minOText;
381*0e209d39SAndroid Build Coastguard Worker
382*0e209d39SAndroid Build Coastguard Worker // Note (1): We process text in 16-bit code units, rather than
383*0e209d39SAndroid Build Coastguard Worker // 32-bit code points. This works because stand-ins are
384*0e209d39SAndroid Build Coastguard Worker // always in the BMP and because we are doing a literal match
385*0e209d39SAndroid Build Coastguard Worker // operation, which can be done 16-bits at a time.
386*0e209d39SAndroid Build Coastguard Worker
387*0e209d39SAndroid Build Coastguard Worker int32_t anteLimit = posBefore(text, pos.contextStart);
388*0e209d39SAndroid Build Coastguard Worker
389*0e209d39SAndroid Build Coastguard Worker UMatchDegree match;
390*0e209d39SAndroid Build Coastguard Worker
391*0e209d39SAndroid Build Coastguard Worker // Start reverse match at char before pos.start
392*0e209d39SAndroid Build Coastguard Worker oText = posBefore(text, pos.start);
393*0e209d39SAndroid Build Coastguard Worker
394*0e209d39SAndroid Build Coastguard Worker if (anteContext != nullptr) {
395*0e209d39SAndroid Build Coastguard Worker match = anteContext->matches(text, oText, anteLimit, false);
396*0e209d39SAndroid Build Coastguard Worker if (match != U_MATCH) {
397*0e209d39SAndroid Build Coastguard Worker return U_MISMATCH;
398*0e209d39SAndroid Build Coastguard Worker }
399*0e209d39SAndroid Build Coastguard Worker }
400*0e209d39SAndroid Build Coastguard Worker
401*0e209d39SAndroid Build Coastguard Worker minOText = posAfter(text, oText);
402*0e209d39SAndroid Build Coastguard Worker
403*0e209d39SAndroid Build Coastguard Worker // ------------------------ Start Anchor ------------------------
404*0e209d39SAndroid Build Coastguard Worker
405*0e209d39SAndroid Build Coastguard Worker if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {
406*0e209d39SAndroid Build Coastguard Worker return U_MISMATCH;
407*0e209d39SAndroid Build Coastguard Worker }
408*0e209d39SAndroid Build Coastguard Worker
409*0e209d39SAndroid Build Coastguard Worker // -------------------- Key and Post Context --------------------
410*0e209d39SAndroid Build Coastguard Worker
411*0e209d39SAndroid Build Coastguard Worker oText = pos.start;
412*0e209d39SAndroid Build Coastguard Worker
413*0e209d39SAndroid Build Coastguard Worker if (key != nullptr) {
414*0e209d39SAndroid Build Coastguard Worker match = key->matches(text, oText, pos.limit, incremental);
415*0e209d39SAndroid Build Coastguard Worker if (match != U_MATCH) {
416*0e209d39SAndroid Build Coastguard Worker return match;
417*0e209d39SAndroid Build Coastguard Worker }
418*0e209d39SAndroid Build Coastguard Worker }
419*0e209d39SAndroid Build Coastguard Worker
420*0e209d39SAndroid Build Coastguard Worker keyLimit = oText;
421*0e209d39SAndroid Build Coastguard Worker
422*0e209d39SAndroid Build Coastguard Worker if (postContext != nullptr) {
423*0e209d39SAndroid Build Coastguard Worker if (incremental && keyLimit == pos.limit) {
424*0e209d39SAndroid Build Coastguard Worker // The key matches just before pos.limit, and there is
425*0e209d39SAndroid Build Coastguard Worker // a postContext. Since we are in incremental mode,
426*0e209d39SAndroid Build Coastguard Worker // we must assume more characters may be inserted at
427*0e209d39SAndroid Build Coastguard Worker // pos.limit -- this is a partial match.
428*0e209d39SAndroid Build Coastguard Worker return U_PARTIAL_MATCH;
429*0e209d39SAndroid Build Coastguard Worker }
430*0e209d39SAndroid Build Coastguard Worker
431*0e209d39SAndroid Build Coastguard Worker match = postContext->matches(text, oText, pos.contextLimit, incremental);
432*0e209d39SAndroid Build Coastguard Worker if (match != U_MATCH) {
433*0e209d39SAndroid Build Coastguard Worker return match;
434*0e209d39SAndroid Build Coastguard Worker }
435*0e209d39SAndroid Build Coastguard Worker }
436*0e209d39SAndroid Build Coastguard Worker
437*0e209d39SAndroid Build Coastguard Worker // ------------------------- Stop Anchor ------------------------
438*0e209d39SAndroid Build Coastguard Worker
439*0e209d39SAndroid Build Coastguard Worker if (((flags & ANCHOR_END)) != 0) {
440*0e209d39SAndroid Build Coastguard Worker if (oText != pos.contextLimit) {
441*0e209d39SAndroid Build Coastguard Worker return U_MISMATCH;
442*0e209d39SAndroid Build Coastguard Worker }
443*0e209d39SAndroid Build Coastguard Worker if (incremental) {
444*0e209d39SAndroid Build Coastguard Worker return U_PARTIAL_MATCH;
445*0e209d39SAndroid Build Coastguard Worker }
446*0e209d39SAndroid Build Coastguard Worker }
447*0e209d39SAndroid Build Coastguard Worker
448*0e209d39SAndroid Build Coastguard Worker // =========================== REPLACE ==========================
449*0e209d39SAndroid Build Coastguard Worker
450*0e209d39SAndroid Build Coastguard Worker // We have a full match. The key is between pos.start and
451*0e209d39SAndroid Build Coastguard Worker // keyLimit.
452*0e209d39SAndroid Build Coastguard Worker
453*0e209d39SAndroid Build Coastguard Worker int32_t newStart;
454*0e209d39SAndroid Build Coastguard Worker int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart);
455*0e209d39SAndroid Build Coastguard Worker int32_t lenDelta = newLength - (keyLimit - pos.start);
456*0e209d39SAndroid Build Coastguard Worker
457*0e209d39SAndroid Build Coastguard Worker oText += lenDelta;
458*0e209d39SAndroid Build Coastguard Worker pos.limit += lenDelta;
459*0e209d39SAndroid Build Coastguard Worker pos.contextLimit += lenDelta;
460*0e209d39SAndroid Build Coastguard Worker // Restrict new value of start to [minOText, min(oText, pos.limit)].
461*0e209d39SAndroid Build Coastguard Worker pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
462*0e209d39SAndroid Build Coastguard Worker return U_MATCH;
463*0e209d39SAndroid Build Coastguard Worker }
464*0e209d39SAndroid Build Coastguard Worker
465*0e209d39SAndroid Build Coastguard Worker /**
466*0e209d39SAndroid Build Coastguard Worker * Create a source string that represents this rule. Append it to the
467*0e209d39SAndroid Build Coastguard Worker * given string.
468*0e209d39SAndroid Build Coastguard Worker */
toRule(UnicodeString & rule,UBool escapeUnprintable) const469*0e209d39SAndroid Build Coastguard Worker UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
470*0e209d39SAndroid Build Coastguard Worker UBool escapeUnprintable) const {
471*0e209d39SAndroid Build Coastguard Worker
472*0e209d39SAndroid Build Coastguard Worker // Accumulate special characters (and non-specials following them)
473*0e209d39SAndroid Build Coastguard Worker // into quoteBuf. Append quoteBuf, within single quotes, when
474*0e209d39SAndroid Build Coastguard Worker // a non-quoted element must be inserted.
475*0e209d39SAndroid Build Coastguard Worker UnicodeString str, quoteBuf;
476*0e209d39SAndroid Build Coastguard Worker
477*0e209d39SAndroid Build Coastguard Worker // Do not emit the braces '{' '}' around the pattern if there
478*0e209d39SAndroid Build Coastguard Worker // is neither anteContext nor postContext.
479*0e209d39SAndroid Build Coastguard Worker UBool emitBraces =
480*0e209d39SAndroid Build Coastguard Worker (anteContext != nullptr) || (postContext != nullptr);
481*0e209d39SAndroid Build Coastguard Worker
482*0e209d39SAndroid Build Coastguard Worker // Emit start anchor
483*0e209d39SAndroid Build Coastguard Worker if ((flags & ANCHOR_START) != 0) {
484*0e209d39SAndroid Build Coastguard Worker rule.append((char16_t)94/*^*/);
485*0e209d39SAndroid Build Coastguard Worker }
486*0e209d39SAndroid Build Coastguard Worker
487*0e209d39SAndroid Build Coastguard Worker // Emit the input pattern
488*0e209d39SAndroid Build Coastguard Worker ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf);
489*0e209d39SAndroid Build Coastguard Worker
490*0e209d39SAndroid Build Coastguard Worker if (emitBraces) {
491*0e209d39SAndroid Build Coastguard Worker ICU_Utility::appendToRule(rule, (char16_t) 0x007B /*{*/, true, escapeUnprintable, quoteBuf);
492*0e209d39SAndroid Build Coastguard Worker }
493*0e209d39SAndroid Build Coastguard Worker
494*0e209d39SAndroid Build Coastguard Worker ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf);
495*0e209d39SAndroid Build Coastguard Worker
496*0e209d39SAndroid Build Coastguard Worker if (emitBraces) {
497*0e209d39SAndroid Build Coastguard Worker ICU_Utility::appendToRule(rule, (char16_t) 0x007D /*}*/, true, escapeUnprintable, quoteBuf);
498*0e209d39SAndroid Build Coastguard Worker }
499*0e209d39SAndroid Build Coastguard Worker
500*0e209d39SAndroid Build Coastguard Worker ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf);
501*0e209d39SAndroid Build Coastguard Worker
502*0e209d39SAndroid Build Coastguard Worker // Emit end anchor
503*0e209d39SAndroid Build Coastguard Worker if ((flags & ANCHOR_END) != 0) {
504*0e209d39SAndroid Build Coastguard Worker rule.append((char16_t)36/*$*/);
505*0e209d39SAndroid Build Coastguard Worker }
506*0e209d39SAndroid Build Coastguard Worker
507*0e209d39SAndroid Build Coastguard Worker ICU_Utility::appendToRule(rule, UnicodeString(true, FORWARD_OP, 3), true, escapeUnprintable, quoteBuf);
508*0e209d39SAndroid Build Coastguard Worker
509*0e209d39SAndroid Build Coastguard Worker // Emit the output pattern
510*0e209d39SAndroid Build Coastguard Worker
511*0e209d39SAndroid Build Coastguard Worker ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable),
512*0e209d39SAndroid Build Coastguard Worker true, escapeUnprintable, quoteBuf);
513*0e209d39SAndroid Build Coastguard Worker
514*0e209d39SAndroid Build Coastguard Worker ICU_Utility::appendToRule(rule, (char16_t) 0x003B /*;*/, true, escapeUnprintable, quoteBuf);
515*0e209d39SAndroid Build Coastguard Worker
516*0e209d39SAndroid Build Coastguard Worker return rule;
517*0e209d39SAndroid Build Coastguard Worker }
518*0e209d39SAndroid Build Coastguard Worker
setData(const TransliterationRuleData * d)519*0e209d39SAndroid Build Coastguard Worker void TransliterationRule::setData(const TransliterationRuleData* d) {
520*0e209d39SAndroid Build Coastguard Worker data = d;
521*0e209d39SAndroid Build Coastguard Worker if (anteContext != nullptr) anteContext->setData(d);
522*0e209d39SAndroid Build Coastguard Worker if (postContext != nullptr) postContext->setData(d);
523*0e209d39SAndroid Build Coastguard Worker if (key != nullptr) key->setData(d);
524*0e209d39SAndroid Build Coastguard Worker // assert(output != nullptr);
525*0e209d39SAndroid Build Coastguard Worker output->setData(d);
526*0e209d39SAndroid Build Coastguard Worker // Don't have to do segments since they are in the context or key
527*0e209d39SAndroid Build Coastguard Worker }
528*0e209d39SAndroid Build Coastguard Worker
529*0e209d39SAndroid Build Coastguard Worker /**
530*0e209d39SAndroid Build Coastguard Worker * Union the set of all characters that may be modified by this rule
531*0e209d39SAndroid Build Coastguard Worker * into the given set.
532*0e209d39SAndroid Build Coastguard Worker */
addSourceSetTo(UnicodeSet & toUnionTo) const533*0e209d39SAndroid Build Coastguard Worker void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {
534*0e209d39SAndroid Build Coastguard Worker int32_t limit = anteContextLength + keyLength;
535*0e209d39SAndroid Build Coastguard Worker for (int32_t i=anteContextLength; i<limit; ) {
536*0e209d39SAndroid Build Coastguard Worker UChar32 ch = pattern.char32At(i);
537*0e209d39SAndroid Build Coastguard Worker i += U16_LENGTH(ch);
538*0e209d39SAndroid Build Coastguard Worker const UnicodeMatcher* matcher = data->lookupMatcher(ch);
539*0e209d39SAndroid Build Coastguard Worker if (matcher == nullptr) {
540*0e209d39SAndroid Build Coastguard Worker toUnionTo.add(ch);
541*0e209d39SAndroid Build Coastguard Worker } else {
542*0e209d39SAndroid Build Coastguard Worker matcher->addMatchSetTo(toUnionTo);
543*0e209d39SAndroid Build Coastguard Worker }
544*0e209d39SAndroid Build Coastguard Worker }
545*0e209d39SAndroid Build Coastguard Worker }
546*0e209d39SAndroid Build Coastguard Worker
547*0e209d39SAndroid Build Coastguard Worker /**
548*0e209d39SAndroid Build Coastguard Worker * Union the set of all characters that may be emitted by this rule
549*0e209d39SAndroid Build Coastguard Worker * into the given set.
550*0e209d39SAndroid Build Coastguard Worker */
addTargetSetTo(UnicodeSet & toUnionTo) const551*0e209d39SAndroid Build Coastguard Worker void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const {
552*0e209d39SAndroid Build Coastguard Worker output->toReplacer()->addReplacementSetTo(toUnionTo);
553*0e209d39SAndroid Build Coastguard Worker }
554*0e209d39SAndroid Build Coastguard Worker
555*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
556*0e209d39SAndroid Build Coastguard Worker
557*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_TRANSLITERATION */
558*0e209d39SAndroid Build Coastguard Worker
559*0e209d39SAndroid Build Coastguard Worker //eof
560