xref: /aosp_15_r20/external/icu/libicu/cts_headers/unicode/normalizer2.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2013, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  normalizer2.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009nov22
16 *   created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
22 /**
23  * \file
24  * \brief C++ API: New API for Unicode Normalization.
25  */
26 
27 #include "unicode/utypes.h"
28 
29 #if U_SHOW_CPLUSPLUS_API
30 
31 #if !UCONFIG_NO_NORMALIZATION
32 
33 #include "unicode/stringpiece.h"
34 #include "unicode/uniset.h"
35 #include "unicode/unistr.h"
36 #include "unicode/unorm2.h"
37 
38 U_NAMESPACE_BEGIN
39 
40 class ByteSink;
41 
42 /**
43  * Unicode normalization functionality for standard Unicode normalization or
44  * for using custom mapping tables.
45  * All instances of this class are unmodifiable/immutable.
46  * Instances returned by getInstance() are singletons that must not be deleted by the caller.
47  * The Normalizer2 class is not intended for public subclassing.
48  *
49  * The primary functions are to produce a normalized string and to detect whether
50  * a string is already normalized.
51  * The most commonly used normalization forms are those defined in
52  * http://www.unicode.org/unicode/reports/tr15/
53  * However, this API supports additional normalization forms for specialized purposes.
54  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
55  * and can be used in implementations of UTS #46.
56  *
57  * Not only are the standard compose and decompose modes supplied,
58  * but additional modes are provided as documented in the Mode enum.
59  *
60  * Some of the functions in this class identify normalization boundaries.
61  * At a normalization boundary, the portions of the string
62  * before it and starting from it do not interact and can be handled independently.
63  *
64  * The spanQuickCheckYes() stops at a normalization boundary.
65  * When the goal is a normalized string, then the text before the boundary
66  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
67  *
68  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
69  * a character is guaranteed to be at a normalization boundary,
70  * regardless of context.
71  * This is used for moving from one normalization boundary to the next
72  * or preceding boundary, and for performing iterative normalization.
73  *
74  * Iterative normalization is useful when only a small portion of a
75  * longer string needs to be processed.
76  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
77  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
78  * (to process only the substring for which sort key bytes are computed).
79  *
80  * The set of normalization boundaries returned by these functions may not be
81  * complete: There may be more boundaries that could be returned.
82  * Different functions may return different boundaries.
83  * @stable ICU 4.4
84  */
85 class U_COMMON_API Normalizer2 : public UObject {
86 public:
87     /**
88      * Destructor.
89      * @stable ICU 4.4
90      */
91     ~Normalizer2();
92 
93     /**
94      * Returns a Normalizer2 instance for Unicode NFC normalization.
95      * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
96      * Returns an unmodifiable singleton instance. Do not delete it.
97      * @param errorCode Standard ICU error code. Its input value must
98      *                  pass the U_SUCCESS() test, or else the function returns
99      *                  immediately. Check for U_FAILURE() on output or use with
100      *                  function chaining. (See User Guide for details.)
101      * @return the requested Normalizer2, if successful
102      * @stable ICU 49
103      */
104     static const Normalizer2 *
105     getNFCInstance(UErrorCode &errorCode);
106 
107     /**
108      * Returns a Normalizer2 instance for Unicode NFD normalization.
109      * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode).
110      * Returns an unmodifiable singleton instance. Do not delete it.
111      * @param errorCode Standard ICU error code. Its input value must
112      *                  pass the U_SUCCESS() test, or else the function returns
113      *                  immediately. Check for U_FAILURE() on output or use with
114      *                  function chaining. (See User Guide for details.)
115      * @return the requested Normalizer2, if successful
116      * @stable ICU 49
117      */
118     static const Normalizer2 *
119     getNFDInstance(UErrorCode &errorCode);
120 
121     /**
122      * Returns a Normalizer2 instance for Unicode NFKC normalization.
123      * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
124      * Returns an unmodifiable singleton instance. Do not delete it.
125      * @param errorCode Standard ICU error code. Its input value must
126      *                  pass the U_SUCCESS() test, or else the function returns
127      *                  immediately. Check for U_FAILURE() on output or use with
128      *                  function chaining. (See User Guide for details.)
129      * @return the requested Normalizer2, if successful
130      * @stable ICU 49
131      */
132     static const Normalizer2 *
133     getNFKCInstance(UErrorCode &errorCode);
134 
135     /**
136      * Returns a Normalizer2 instance for Unicode NFKD normalization.
137      * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode).
138      * Returns an unmodifiable singleton instance. Do not delete it.
139      * @param errorCode Standard ICU error code. Its input value must
140      *                  pass the U_SUCCESS() test, or else the function returns
141      *                  immediately. Check for U_FAILURE() on output or use with
142      *                  function chaining. (See User Guide for details.)
143      * @return the requested Normalizer2, if successful
144      * @stable ICU 49
145      */
146     static const Normalizer2 *
147     getNFKDInstance(UErrorCode &errorCode);
148 
149     /**
150      * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
151      * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
152      * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
153      *
154      * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
155      * Returns an unmodifiable singleton instance. Do not delete it.
156      * @param errorCode Standard ICU error code. Its input value must
157      *                  pass the U_SUCCESS() test, or else the function returns
158      *                  immediately. Check for U_FAILURE() on output or use with
159      *                  function chaining. (See User Guide for details.)
160      * @return the requested Normalizer2, if successful
161      * @stable ICU 49
162      */
163     static const Normalizer2 *
164     getNFKCCasefoldInstance(UErrorCode &errorCode);
165 
166 #ifndef U_HIDE_DRAFT_API
167     /**
168      * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
169      * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
170      * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
171      *
172      * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
173      * Returns an unmodifiable singleton instance. Do not delete it.
174      * @param errorCode Standard ICU error code. Its input value must
175      *                  pass the U_SUCCESS() test, or else the function returns
176      *                  immediately. Check for U_FAILURE() on output or use with
177      *                  function chaining. (See User Guide for details.)
178      * @return the requested Normalizer2, if successful
179      * @draft ICU 74
180      */
181     static const Normalizer2 *
182     getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
183 #endif  // U_HIDE_DRAFT_API
184 
185     /**
186      * Returns a Normalizer2 instance which uses the specified data file
187      * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
188      * and which composes or decomposes text according to the specified mode.
189      * Returns an unmodifiable singleton instance. Do not delete it.
190      *
191      * Use packageName=nullptr for data files that are part of ICU's own data.
192      * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
193      * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
194      * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
195      *
196      * @param packageName nullptr for ICU built-in data, otherwise application data package name
197      * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
198      * @param mode normalization mode (compose or decompose etc.)
199      * @param errorCode Standard ICU error code. Its input value must
200      *                  pass the U_SUCCESS() test, or else the function returns
201      *                  immediately. Check for U_FAILURE() on output or use with
202      *                  function chaining. (See User Guide for details.)
203      * @return the requested Normalizer2, if successful
204      * @stable ICU 4.4
205      */
206     static const Normalizer2 *
207     getInstance(const char *packageName,
208                 const char *name,
209                 UNormalization2Mode mode,
210                 UErrorCode &errorCode);
211 
212     /**
213      * Returns the normalized form of the source string.
214      * @param src source string
215      * @param errorCode Standard ICU error code. Its input value must
216      *                  pass the U_SUCCESS() test, or else the function returns
217      *                  immediately. Check for U_FAILURE() on output or use with
218      *                  function chaining. (See User Guide for details.)
219      * @return normalized src
220      * @stable ICU 4.4
221      */
222     UnicodeString
normalize(const UnicodeString & src,UErrorCode & errorCode)223     normalize(const UnicodeString &src, UErrorCode &errorCode) const {
224         UnicodeString result;
225         normalize(src, result, errorCode);
226         return result;
227     }
228     /**
229      * Writes the normalized form of the source string to the destination string
230      * (replacing its contents) and returns the destination string.
231      * The source and destination strings must be different objects.
232      * @param src source string
233      * @param dest destination string; its contents is replaced with normalized src
234      * @param errorCode Standard ICU error code. Its input value must
235      *                  pass the U_SUCCESS() test, or else the function returns
236      *                  immediately. Check for U_FAILURE() on output or use with
237      *                  function chaining. (See User Guide for details.)
238      * @return dest
239      * @stable ICU 4.4
240      */
241     virtual UnicodeString &
242     normalize(const UnicodeString &src,
243               UnicodeString &dest,
244               UErrorCode &errorCode) const = 0;
245 
246     /**
247      * Normalizes a UTF-8 string and optionally records how source substrings
248      * relate to changed and unchanged result substrings.
249      *
250      * Implemented completely for all built-in modes except for FCD.
251      * The base class implementation converts to & from UTF-16 and does not support edits.
252      *
253      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
254      * @param src       Source UTF-8 string.
255      * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
256      *                  sink.Flush() is called at the end.
257      * @param edits     Records edits for index mapping, working with styled text,
258      *                  and getting only changes (if any).
259      *                  The Edits contents is undefined if any error occurs.
260      *                  This function calls edits->reset() first unless
261      *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
262      * @param errorCode Standard ICU error code. Its input value must
263      *                  pass the U_SUCCESS() test, or else the function returns
264      *                  immediately. Check for U_FAILURE() on output or use with
265      *                  function chaining. (See User Guide for details.)
266      * @stable ICU 60
267      */
268     virtual void
269     normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
270                   Edits *edits, UErrorCode &errorCode) const;
271 
272     /**
273      * Appends the normalized form of the second string to the first string
274      * (merging them at the boundary) and returns the first string.
275      * The result is normalized if the first string was normalized.
276      * The first and second strings must be different objects.
277      * @param first string, should be normalized
278      * @param second string, will be normalized
279      * @param errorCode Standard ICU error code. Its input value must
280      *                  pass the U_SUCCESS() test, or else the function returns
281      *                  immediately. Check for U_FAILURE() on output or use with
282      *                  function chaining. (See User Guide for details.)
283      * @return first
284      * @stable ICU 4.4
285      */
286     virtual UnicodeString &
287     normalizeSecondAndAppend(UnicodeString &first,
288                              const UnicodeString &second,
289                              UErrorCode &errorCode) const = 0;
290     /**
291      * Appends the second string to the first string
292      * (merging them at the boundary) and returns the first string.
293      * The result is normalized if both the strings were normalized.
294      * The first and second strings must be different objects.
295      * @param first string, should be normalized
296      * @param second string, should be normalized
297      * @param errorCode Standard ICU error code. Its input value must
298      *                  pass the U_SUCCESS() test, or else the function returns
299      *                  immediately. Check for U_FAILURE() on output or use with
300      *                  function chaining. (See User Guide for details.)
301      * @return first
302      * @stable ICU 4.4
303      */
304     virtual UnicodeString &
305     append(UnicodeString &first,
306            const UnicodeString &second,
307            UErrorCode &errorCode) const = 0;
308 
309     /**
310      * Gets the decomposition mapping of c.
311      * Roughly equivalent to normalizing the String form of c
312      * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
313      * returns false and does not write a string
314      * if c does not have a decomposition mapping in this instance's data.
315      * This function is independent of the mode of the Normalizer2.
316      * @param c code point
317      * @param decomposition String object which will be set to c's
318      *                      decomposition mapping, if there is one.
319      * @return true if c has a decomposition, otherwise false
320      * @stable ICU 4.6
321      */
322     virtual UBool
323     getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
324 
325     /**
326      * Gets the raw decomposition mapping of c.
327      *
328      * This is similar to the getDecomposition() method but returns the
329      * raw decomposition mapping as specified in UnicodeData.txt or
330      * (for custom data) in the mapping files processed by the gennorm2 tool.
331      * By contrast, getDecomposition() returns the processed,
332      * recursively-decomposed version of this mapping.
333      *
334      * When used on a standard NFKC Normalizer2 instance,
335      * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
336      *
337      * When used on a standard NFC Normalizer2 instance,
338      * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
339      * in this case, the result contains either one or two code points (=1..4 char16_ts).
340      *
341      * This function is independent of the mode of the Normalizer2.
342      * The default implementation returns false.
343      * @param c code point
344      * @param decomposition String object which will be set to c's
345      *                      raw decomposition mapping, if there is one.
346      * @return true if c has a decomposition, otherwise false
347      * @stable ICU 49
348      */
349     virtual UBool
350     getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
351 
352     /**
353      * Performs pairwise composition of a & b and returns the composite if there is one.
354      *
355      * Returns a composite code point c only if c has a two-way mapping to a+b.
356      * In standard Unicode normalization, this means that
357      * c has a canonical decomposition to a+b
358      * and c does not have the Full_Composition_Exclusion property.
359      *
360      * This function is independent of the mode of the Normalizer2.
361      * The default implementation returns a negative value.
362      * @param a A (normalization starter) code point.
363      * @param b Another code point.
364      * @return The non-negative composite code point if there is one; otherwise a negative value.
365      * @stable ICU 49
366      */
367     virtual UChar32
368     composePair(UChar32 a, UChar32 b) const;
369 
370     /**
371      * Gets the combining class of c.
372      * The default implementation returns 0
373      * but all standard implementations return the Unicode Canonical_Combining_Class value.
374      * @param c code point
375      * @return c's combining class
376      * @stable ICU 49
377      */
378     virtual uint8_t
379     getCombiningClass(UChar32 c) const;
380 
381     /**
382      * Tests if the string is normalized.
383      * Internally, in cases where the quickCheck() method would return "maybe"
384      * (which is only possible for the two COMPOSE modes) this method
385      * resolves to "yes" or "no" to provide a definitive result,
386      * at the cost of doing more work in those cases.
387      * @param s input string
388      * @param errorCode Standard ICU error code. Its input value must
389      *                  pass the U_SUCCESS() test, or else the function returns
390      *                  immediately. Check for U_FAILURE() on output or use with
391      *                  function chaining. (See User Guide for details.)
392      * @return true if s is normalized
393      * @stable ICU 4.4
394      */
395     virtual UBool
396     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
397     /**
398      * Tests if the UTF-8 string is normalized.
399      * Internally, in cases where the quickCheck() method would return "maybe"
400      * (which is only possible for the two COMPOSE modes) this method
401      * resolves to "yes" or "no" to provide a definitive result,
402      * at the cost of doing more work in those cases.
403      *
404      * This works for all normalization modes.
405      * It is optimized for UTF-8 for all built-in modes except for FCD.
406      * The base class implementation converts to UTF-16 and calls isNormalized().
407      *
408      * @param s UTF-8 input string
409      * @param errorCode Standard ICU error code. Its input value must
410      *                  pass the U_SUCCESS() test, or else the function returns
411      *                  immediately. Check for U_FAILURE() on output or use with
412      *                  function chaining. (See User Guide for details.)
413      * @return true if s is normalized
414      * @stable ICU 60
415      */
416     virtual UBool
417     isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
418 
419 
420     /**
421      * Tests if the string is normalized.
422      * For the two COMPOSE modes, the result could be "maybe" in cases that
423      * would take a little more work to resolve definitively.
424      * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
425      * combination of quick check + normalization, to avoid
426      * re-checking the "yes" prefix.
427      * @param s input string
428      * @param errorCode Standard ICU error code. Its input value must
429      *                  pass the U_SUCCESS() test, or else the function returns
430      *                  immediately. Check for U_FAILURE() on output or use with
431      *                  function chaining. (See User Guide for details.)
432      * @return UNormalizationCheckResult
433      * @stable ICU 4.4
434      */
435     virtual UNormalizationCheckResult
436     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
437 
438     /**
439      * Returns the end of the normalized substring of the input string.
440      * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
441      * the substring <code>UnicodeString(s, 0, end)</code>
442      * will pass the quick check with a "yes" result.
443      *
444      * The returned end index is usually one or more characters before the
445      * "no" or "maybe" character: The end index is at a normalization boundary.
446      * (See the class documentation for more about normalization boundaries.)
447      *
448      * When the goal is a normalized string and most input strings are expected
449      * to be normalized already, then call this method,
450      * and if it returns a prefix shorter than the input string,
451      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
452      * @param s input string
453      * @param errorCode Standard ICU error code. Its input value must
454      *                  pass the U_SUCCESS() test, or else the function returns
455      *                  immediately. Check for U_FAILURE() on output or use with
456      *                  function chaining. (See User Guide for details.)
457      * @return "yes" span end index
458      * @stable ICU 4.4
459      */
460     virtual int32_t
461     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
462 
463     /**
464      * Tests if the character always has a normalization boundary before it,
465      * regardless of context.
466      * If true, then the character does not normalization-interact with
467      * preceding characters.
468      * In other words, a string containing this character can be normalized
469      * by processing portions before this character and starting from this
470      * character independently.
471      * This is used for iterative normalization. See the class documentation for details.
472      * @param c character to test
473      * @return true if c has a normalization boundary before it
474      * @stable ICU 4.4
475      */
476     virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
477 
478     /**
479      * Tests if the character always has a normalization boundary after it,
480      * regardless of context.
481      * If true, then the character does not normalization-interact with
482      * following characters.
483      * In other words, a string containing this character can be normalized
484      * by processing portions up to this character and after this
485      * character independently.
486      * This is used for iterative normalization. See the class documentation for details.
487      * Note that this operation may be significantly slower than hasBoundaryBefore().
488      * @param c character to test
489      * @return true if c has a normalization boundary after it
490      * @stable ICU 4.4
491      */
492     virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
493 
494     /**
495      * Tests if the character is normalization-inert.
496      * If true, then the character does not change, nor normalization-interact with
497      * preceding or following characters.
498      * In other words, a string containing this character can be normalized
499      * by processing portions before this character and after this
500      * character independently.
501      * This is used for iterative normalization. See the class documentation for details.
502      * Note that this operation may be significantly slower than hasBoundaryBefore().
503      * @param c character to test
504      * @return true if c is normalization-inert
505      * @stable ICU 4.4
506      */
507     virtual UBool isInert(UChar32 c) const = 0;
508 };
509 
510 /**
511  * Normalization filtered by a UnicodeSet.
512  * Normalizes portions of the text contained in the filter set and leaves
513  * portions not contained in the filter set unchanged.
514  * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
515  * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
516  * This class implements all of (and only) the Normalizer2 API.
517  * An instance of this class is unmodifiable/immutable but is constructed and
518  * must be destructed by the owner.
519  * @stable ICU 4.4
520  */
521 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
522 public:
523     /**
524      * Constructs a filtered normalizer wrapping any Normalizer2 instance
525      * and a filter set; they initialize the norm2 and set members respectively.
526      * Both are aliased and must not be modified or deleted while this object
527      * is used.
528      * The filter set should be frozen; otherwise the performance will suffer greatly.
529      * @param n2 wrapped Normalizer2 instance
530      * @param filterSet UnicodeSet which determines the characters to be normalized
531      * @stable ICU 4.4
532      */
533     FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
534             norm2(n2), set(filterSet) {}
535 
536     /**
537      * Destructor; declared here without an inline body.
538      * @stable ICU 4.4
539      */
540     ~FilteredNormalizer2();
541 
542     /**
543      * Overrides Normalizer2::normalize(src, dest, errorCode): writes the normalized form of
544      * the source string to the destination string (replacing its contents) and returns it.
545      * The source and destination strings must be different objects.
546      * @param src source string
547      * @param dest destination string; its contents are replaced with normalized src
548      * @param errorCode Standard ICU error code. Its input value must
549      *                  pass the U_SUCCESS() test, or else the function returns
550      *                  immediately. Check for U_FAILURE() on output or use with
551      *                  function chaining. (See User Guide for details.)
552      * @return dest
553      * @stable ICU 4.4
554      */
555     virtual UnicodeString &
556     normalize(const UnicodeString &src,
557               UnicodeString &dest,
558               UErrorCode &errorCode) const override;
559 
560     /**
561      * Normalizes a UTF-8 string and optionally records how source substrings
562      * relate to changed and unchanged result substrings.
563      *
564      * Implemented completely for most built-in modes except for FCD.
565      * Overrides the base class implementation, which converts to & from UTF-16 and does not support edits.
566      *
567      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
568      * @param src       Source UTF-8 string.
569      * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
570      *                  sink.Flush() is called at the end.
571      * @param edits     Records edits for index mapping, working with styled text,
572      *                  and getting only changes (if any).
573      *                  The Edits contents are undefined if any error occurs.
574      *                  This function calls edits->reset() first unless
575      *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
576      * @param errorCode Standard ICU error code. Its input value must
577      *                  pass the U_SUCCESS() test, or else the function returns
578      *                  immediately. Check for U_FAILURE() on output or use with
579      *                  function chaining. (See User Guide for details.)
580      * @stable ICU 60
581      */
582     virtual void
583     normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
584                   Edits *edits, UErrorCode &errorCode) const override;
585 
586     /**
587      * Overrides Normalizer2::normalizeSecondAndAppend(): appends the normalized form of the second
588      * string to the first string (merging them at the boundary) and returns the first string.
589      * The result is normalized if the first string was normalized.
590      * The first and second strings must be different objects.
591      * @param first string, should be normalized
592      * @param second string, will be normalized
593      * @param errorCode Standard ICU error code. Its input value must
594      *                  pass the U_SUCCESS() test, or else the function returns
595      *                  immediately. Check for U_FAILURE() on output or use with
596      *                  function chaining. (See User Guide for details.)
597      * @return first
598      * @stable ICU 4.4
599      */
600     virtual UnicodeString &
601     normalizeSecondAndAppend(UnicodeString &first,
602                              const UnicodeString &second,
603                              UErrorCode &errorCode) const override;
604     /**
605      * Overrides Normalizer2::append(): appends the second string to the first string
606      * (merging them at the boundary) and returns the first string.
607      * The result is normalized if both strings were normalized.
608      * The first and second strings must be different objects.
609      * @param first string, should be normalized
610      * @param second string, should be normalized
611      * @param errorCode Standard ICU error code. Its input value must
612      *                  pass the U_SUCCESS() test, or else the function returns
613      *                  immediately. Check for U_FAILURE() on output or use with
614      *                  function chaining. (See User Guide for details.)
615      * @return first
616      * @stable ICU 4.4
617      */
618     virtual UnicodeString &
619     append(UnicodeString &first,
620            const UnicodeString &second,
621            UErrorCode &errorCode) const override;
622 
623     /**
624      * Gets the decomposition mapping of c.
625      * For details see Normalizer2::getDecomposition(), which this member overrides.
626      *
627      * This function is independent of the mode of the Normalizer2.
628      * @param c code point
629      * @param decomposition String object which will be set to c's
630      *                      decomposition mapping, if there is one.
631      * @return true if c has a decomposition, otherwise false
632      * @stable ICU 4.6
633      */
634     virtual UBool
635     getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
636 
637     /**
638      * Gets the raw decomposition mapping of c.
639      * For details see the Normalizer2 base class documentation.
640      *
641      * This function is independent of the mode of the Normalizer2.
642      * @param c code point
643      * @param decomposition String object which will be set to c's
644      *                      raw decomposition mapping, if there is one.
645      * @return true if c has a raw decomposition, otherwise false
646      * @stable ICU 49
647      */
648     virtual UBool
649     getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
650 
651     /**
652      * Performs pairwise composition of a and b and returns the composite if there is one.
653      * For details see the Normalizer2 base class documentation.
654      *
655      * This function is independent of the mode of the Normalizer2.
656      * @param a A (normalization starter) code point.
657      * @param b Another code point.
658      * @return The non-negative composite code point if there is one; otherwise a negative value.
659      * @stable ICU 49
660      */
661     virtual UChar32
662     composePair(UChar32 a, UChar32 b) const override;
663 
664     /**
665      * Gets the combining class of c.
666      * The default implementation returns 0 (NOTE(review): presumably the Normalizer2 base class default; confirm what this override returns)
667      * but all standard implementations return the Unicode Canonical_Combining_Class value.
668      * @param c code point
669      * @return c's combining class
670      * @stable ICU 49
671      */
672     virtual uint8_t
673     getCombiningClass(UChar32 c) const override;
674 
675     /**
676      * Tests if the string is normalized.
677      * For details see the Normalizer2 base class documentation.
678      * @param s input string
679      * @param errorCode Standard ICU error code. Its input value must
680      *                  pass the U_SUCCESS() test, or else the function returns
681      *                  immediately. Check for U_FAILURE() on output or use with
682      *                  function chaining. (See User Guide for details.)
683      * @return true if s is normalized, otherwise false
684      * @stable ICU 4.4
685      */
686     virtual UBool
687     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
688     /**
689      * Tests if the UTF-8 string is normalized.
690      * Internally, in cases where the quickCheck() method would return "maybe"
691      * (which is only possible for the two COMPOSE modes) this method
692      * resolves to "yes" or "no" to provide a definitive result,
693      * at the cost of doing more work in those cases.
694      *
695      * This works for all normalization modes.
696      * It is optimized for UTF-8 for all built-in modes except for FCD.
697      * The base class implementation converts to UTF-16 and calls isNormalized().
698      *
699      * @param s UTF-8 input string
700      * @param errorCode Standard ICU error code. Its input value must
701      *                  pass the U_SUCCESS() test, or else the function returns
702      *                  immediately. Check for U_FAILURE() on output or use with
703      *                  function chaining. (See User Guide for details.)
704      * @return true if s is normalized, otherwise false
705      * @stable ICU 60
706      */
707     virtual UBool
708     isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
709     /**
710      * Quick-checks whether the string is normalized; unlike isNormalized(), the answer is not a plain boolean.
711      * For details see the Normalizer2 base class documentation.
712      * @param s input string
713      * @param errorCode Standard ICU error code. Its input value must
714      *                  pass the U_SUCCESS() test, or else the function returns
715      *                  immediately. Check for U_FAILURE() on output or use with
716      *                  function chaining. (See User Guide for details.)
717      * @return UNormalizationCheckResult: "yes", "no", or possibly "maybe" (an inconclusive result)
718      * @stable ICU 4.4
719      */
720     virtual UNormalizationCheckResult
721     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
722     /**
723      * Returns the end of the normalized substring of the input string.
724      * For details see the Normalizer2 base class documentation.
725      * @param s input string
726      * @param errorCode Standard ICU error code. Its input value must
727      *                  pass the U_SUCCESS() test, or else the function returns
728      *                  immediately. Check for U_FAILURE() on output or use with
729      *                  function chaining. (See User Guide for details.)
730      * @return "yes" span end index (an index into s)
731      * @stable ICU 4.4
732      */
733     virtual int32_t
734     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
735 
736     /**
737      * Tests if the character always has a normalization boundary before it,
738      * regardless of context.
739      * For details see the Normalizer2 base class documentation.
740      * @param c character (code point) to test
741      * @return true if c has a normalization boundary before it, otherwise false
742      * @stable ICU 4.4
743      */
744     virtual UBool hasBoundaryBefore(UChar32 c) const override;
745 
746     /**
747      * Tests if the character always has a normalization boundary after it,
748      * regardless of context.
749      * For details see the Normalizer2 base class documentation.
750      * @param c character (code point) to test
751      * @return true if c has a normalization boundary after it, otherwise false
752      * @stable ICU 4.4
753      */
754     virtual UBool hasBoundaryAfter(UChar32 c) const override;
755 
756     /**
757      * Tests if the character is normalization-inert.
758      * For details see the Normalizer2 base class documentation.
759      * @param c character (code point) to test
760      * @return true if c is normalization-inert, otherwise false
761      * @stable ICU 4.4
762      */
763     virtual UBool isInert(UChar32 c) const override;
764 private:
765     UnicodeString &  // private, non-virtual helper; not part of the public API
766     normalize(const UnicodeString &src,  // read-only input
767               UnicodeString &dest,  // non-const reference -- NOTE(review): presumably receives the result and is what gets returned; confirm
768               USetSpanCondition spanCondition,  // NOTE(review): presumably says how the start of src relates to the filter set; confirm in the implementation
769               UErrorCode &errorCode) const;  // ICU in/out error code
770 
771     void  // private, non-virtual helper; returns nothing, so output can only go through sink/edits/errorCode
772     normalizeUTF8(uint32_t options, const char *src, int32_t length,  // src + length: input bytes -- NOTE(review): presumably UTF-8, going by the name
773                   ByteSink &sink, Edits *edits,  // edits is a pointer -- NOTE(review): presumably optional (may be null); confirm
774                   USetSpanCondition spanCondition,  // NOTE(review): presumably says how the start of src relates to the filter set; confirm
775                   UErrorCode &errorCode) const;  // ICU in/out error code
776 
777     UnicodeString &  // private overload: same parameters as the public normalizeSecondAndAppend() plus doNormalize
778     normalizeSecondAndAppend(UnicodeString &first,
779                              const UnicodeString &second,
780                              UBool doNormalize,  // NOTE(review): presumably true for normalizeSecondAndAppend() and false for append(); confirm
781                              UErrorCode &errorCode) const;  // ICU in/out error code
782 
783     const Normalizer2 &norm2;  // reference member, so it is bound at construction -- NOTE(review): presumably the normalizer being filtered; confirm
784     const UnicodeSet &set;  // reference member, so it is bound at construction -- NOTE(review): presumably the filter set; confirm
785 };
786 
787 U_NAMESPACE_END
788 
789 #endif  // !UCONFIG_NO_NORMALIZATION
790 
791 #endif /* U_SHOW_CPLUSPLUS_API */
792 
793 #endif  // __NORMALIZER2_H__
794