xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/SimpleUnicodeSetFormatter.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.util;
2 
3 import com.google.common.base.Splitter;
4 import com.ibm.icu.lang.CharSequences;
5 import com.ibm.icu.text.Collator;
6 import com.ibm.icu.text.Normalizer2;
7 import com.ibm.icu.text.UTF16;
8 import com.ibm.icu.text.UnicodeSet;
9 import com.ibm.icu.util.ULocale;
10 import java.util.ArrayList;
11 import java.util.Collection;
12 import java.util.Comparator;
13 import java.util.TreeSet;
14 import java.util.function.Function;
15 
16 /**
17  * Goal is a very simple format for UnicodeSet, that keeps vetters from having to know about \ for
18  * quoting or {...} for strings, or $ for FFFF. We do this by using spaces to always separate
19  * different characters, and special syntax for ranges, escaped hex, and named entities. There are 2
20  * special characters:
21  *
22  * <ul>
23  *   <li>➖ a range, but only if between two code points
24  *   <li>❰ start of hex or named escape, but only if followed by [A-Fa-f0-9]+ ❱
25  * </ul>
26  *
27  * <b>EBNF</b><br>
28  * result = item (" " item)*<br>
29  * item = string | range | codePoint<br>
30  * string = codePoint+<br>
31  * range = codePoint "➖" codePoint<br>
32  * codePoint = literal // excludes " ", "❰", "❱"<br>
33  * codePoint = "❰" (namedEscape | hex) "❱"<br>
34  * namedEscape = [A-Fa-f0-9]+ // as per CodePointEscaper<br>
35  * hex = [A-Fa-f0-9]{2,6} // must be valid code point 0x0..0x10FFFF<br>
36  * ❰ was chosen to avoid special use of \\u or \x<br>
37  *
38  * @author markdavis
39  */
40 public class SimpleUnicodeSetFormatter implements FormatterParser<UnicodeSet> {
41     public static Normalizer2 nfc = Normalizer2.getNFCInstance();
42 
43     public static final Comparator<String> BASIC_COLLATOR =
44             (Comparator) ComparatorUtilities.getIcuCollator(ULocale.ROOT, Collator.IDENTICAL);
45 
46     private static final int DEFAULT_MAX_DISALLOW_RANGES = 199;
47 
48     private final Comparator<String> comparator;
49     private final UnicodeSet forceHex;
50     private final int maxDisallowRanges;
51     private final UTF16.StringComparator codepointComparator =
52             new UTF16.StringComparator(true, false, 0);
53 
54     /**
55      * Create a simple formatter, with a comparator for the ordering and a UnicodeSet of characters
56      * that are to use hex. Immutable (if the collator is).
57      *
58      * @param col — collator. The default is BASIC_COLLATOR, which is the root collator.
59      * @param forceHex - UnicodeSet to force to be hex. It will be frozen if not already. Warning:
60      *     may not round-trip unless it includes all of CodePointEscaper.getNamedEscapes()
61      * @param maxDisallowRanges — under this number, there will be no ranges; at or above there may
62      *     be ranges, and the collator will be disregarded.
63      */
SimpleUnicodeSetFormatter( Comparator<String> col, UnicodeSet forceHex, int maxDisallowRanges)64     public SimpleUnicodeSetFormatter(
65             Comparator<String> col, UnicodeSet forceHex, int maxDisallowRanges) {
66         // collate, but preserve non-equivalents
67         this.comparator = ComparatorUtilities.wrapForCodePoints(col);
68         this.forceHex = forceHex == null ? CodePointEscaper.FORCE_ESCAPE : forceHex.freeze();
69         this.maxDisallowRanges = maxDisallowRanges;
70     }
71 
72     static final int DEFAULT_MAX = 1024;
73 
fromIcuLocale(String localeId)74     public static SimpleUnicodeSetFormatter fromIcuLocale(String localeId) {
75         return new SimpleUnicodeSetFormatter(
76                 (Comparator) ComparatorUtilities.getIcuCollator(localeId, Collator.IDENTICAL),
77                 null,
78                 DEFAULT_MAX);
79     }
80 
SimpleUnicodeSetFormatter(Comparator<String> col, UnicodeSet forceHex)81     public SimpleUnicodeSetFormatter(Comparator<String> col, UnicodeSet forceHex) {
82         this(col, forceHex, DEFAULT_MAX_DISALLOW_RANGES);
83     }
84 
SimpleUnicodeSetFormatter(Comparator<String> col)85     public SimpleUnicodeSetFormatter(Comparator<String> col) {
86         this(col, null, DEFAULT_MAX);
87     }
88 
SimpleUnicodeSetFormatter()89     public SimpleUnicodeSetFormatter() {
90         this(
91                 (Comparator) ComparatorUtilities.getIcuCollator(ULocale.ROOT, Collator.IDENTICAL),
92                 null,
93                 DEFAULT_MAX);
94     }
95 
96     static class Lazy {
97         static SimpleUnicodeSetFormatter SINGLETON = new SimpleUnicodeSetFormatter();
98 
getSingleton()99         static SimpleUnicodeSetFormatter getSingleton() {
100             return SINGLETON;
101         }
102     }
103 
getDefault()104     public static SimpleUnicodeSetFormatter getDefault() {
105         return Lazy.getSingleton();
106     }
107 
108     /** Parse as UnicodeSet if of the form […], else parse with default SimpleUnicodeSetFormatter */
parseLenient(String source)109     public static UnicodeSet parseLenient(String source) {
110         if (source.startsWith("[") && source.endsWith("]")) {
111             return new UnicodeSet(source);
112         } else {
113             return getDefault().parse(source);
114         }
115     }
116 
117     @Override
format(UnicodeSet input)118     public String format(UnicodeSet input) {
119         final boolean allowRanges = input.size() > maxDisallowRanges;
120         StringBuilder result = new StringBuilder();
121         Collection<String> sorted =
122                 input.addAllTo(allowRanges ? new ArrayList<>() : new TreeSet<>(comparator));
123         //                : transformAndAddAllTo(
124         //                        input, null, new TreeSet<>(comparator)); // x -> nfc.normalize(x)
125         int firstOfRange = -2;
126         int lastOfRange = -2;
127         for (String item : sorted) {
128             int cp = CharSequences.getSingleCodePoint(item);
129             if (cp == Integer.MAX_VALUE) { // string
130                 if (lastOfRange >= 0) {
131                     if (firstOfRange != lastOfRange) {
132                         result.append(
133                                 firstOfRange + 1 != lastOfRange
134                                         ? CodePointEscaper.RANGE_SYNTAX
135                                         : ' ');
136                         appendWithHex(result, lastOfRange, forceHex);
137                     }
138                     firstOfRange = lastOfRange = -2;
139                 }
140                 if (result.length() > 0) {
141                     result.append(' ');
142                 }
143                 appendWithHex(result, item, forceHex);
144             } else if (allowRanges && lastOfRange == cp - 1) {
145                 ++lastOfRange;
146             } else {
147                 if (firstOfRange != lastOfRange) {
148                     result.append(
149                             firstOfRange + 1 != lastOfRange ? CodePointEscaper.RANGE_SYNTAX : ' ');
150                     appendWithHex(result, lastOfRange, forceHex);
151                 }
152                 if (result.length() > 0) {
153                     result.append(' ');
154                 }
155                 appendWithHex(result, cp, forceHex);
156                 firstOfRange = lastOfRange = cp;
157             }
158         }
159         if (firstOfRange != lastOfRange) {
160             result.append(firstOfRange + 1 != lastOfRange ? CodePointEscaper.RANGE_SYNTAX : ' ');
161             appendWithHex(result, lastOfRange, forceHex);
162         }
163         return result.toString();
164     }
165 
appendWithHex( StringBuilder ap, CharSequence s, UnicodeSet forceHex)166     public static final StringBuilder appendWithHex(
167             StringBuilder ap, CharSequence s, UnicodeSet forceHex) {
168         for (int cp : With.codePointArray(s)) {
169             appendWithHex(ap, cp, forceHex);
170         }
171         return ap;
172     }
173 
appendWithHex(StringBuilder ap, int cp, UnicodeSet forceHex)174     public static StringBuilder appendWithHex(StringBuilder ap, int cp, UnicodeSet forceHex) {
175         if (!forceHex.contains(cp)) {
176             ap.appendCodePoint(cp);
177         } else {
178             ap.append(CodePointEscaper.codePointToEscaped(cp));
179         }
180         return ap;
181     }
182 
183     static final Splitter SPACE_SPLITTER = Splitter.on(' ').omitEmptyStrings();
184 
185     @Override
parse(String input)186     public UnicodeSet parse(String input) {
187         UnicodeSet result = new UnicodeSet();
188         // Note: could be optimized but probably not worth the effort
189 
190         for (String word : SPACE_SPLITTER.split(input)) {
191             // parts between spaces can be single code points, or strings, or ranges of single code
192             // points
193             // points
194             int rangePos = word.indexOf(CodePointEscaper.RANGE_SYNTAX);
195             if (rangePos < 0) {
196                 result.add(unescape(word));
197             } else {
198                 int range2Pos = word.indexOf(CodePointEscaper.RANGE_SYNTAX, rangePos + 1);
199                 final String before = word.substring(0, rangePos);
200                 final String after = word.substring(rangePos + 1);
201                 if (rangePos == 0) {
202                     throw new IllegalArgumentException(
203                             "Must have exactly one character before '➖': " + before + "❌➖" + after);
204                 } else if (rangePos == word.length() - 1) {
205                     throw new IllegalArgumentException(
206                             "Must have exactly one character after '➖': " + before + "➖❌" + after);
207                 } else if (range2Pos >= 0) {
208                     throw new IllegalArgumentException(
209                             "Must not have two '➖' characters: " + before + "➖❌" + after);
210                 }
211                 // get the code points on either side
212                 int first = CharSequences.getSingleCodePoint(unescape(before));
213                 int second = CharSequences.getSingleCodePoint(unescape(after));
214                 if (first == Integer.MAX_VALUE) {
215                     throw new IllegalArgumentException(
216                             "Must have exactly one character before '➖': " + before + "❌➖" + after);
217                 } else if (second == Integer.MAX_VALUE) {
218                     throw new IllegalArgumentException(
219                             "Must have exactly one character after '➖': " + before + "➖❌" + after);
220                 }
221                 result.add(first, second);
222             }
223         }
224         return result;
225     }
226 
227     /** Unescape a whole string. */
unescape(String word)228     public static CharSequence unescape(String word) {
229         StringBuilder result = new StringBuilder();
230         for (int i = 0; i < word.length(); ) {
231             int escapeStart = word.indexOf(CodePointEscaper.ESCAPE_START, i);
232             if (escapeStart < 0) {
233                 final String toAppend = i == 0 ? word : word.substring(i);
234                 final int endStart = toAppend.indexOf(CodePointEscaper.ESCAPE_END);
235                 if (endStart >= 0) {
236                     throw new IllegalArgumentException(
237                             "Missing start escape "
238                                     + CodePointEscaper.ESCAPE_START
239                                     + ": "
240                                     + word.substring(0, endStart)
241                                     + "❌"
242                                     + word.substring(endStart));
243                 }
244                 // Otherwise we are done, the rest is unescaped.
245                 result.append(toAppend);
246                 break;
247             }
248             // we have an escape start, so we append what is before that.
249             final String toAppend = word.substring(i, escapeStart);
250             // if we don't find an escape end
251             final int endStart = toAppend.indexOf(CodePointEscaper.ESCAPE_END);
252             if (endStart >= 0) {
253                 throw new IllegalArgumentException(
254                         "Missing start escape "
255                                 + CodePointEscaper.ESCAPE_START
256                                 + ": "
257                                 + toAppend.substring(0, endStart)
258                                 + "❌"
259                                 + toAppend.substring(endStart));
260             }
261             result.append(toAppend);
262             int interiorStart = escapeStart + 1;
263             int escapeEnd = word.indexOf(CodePointEscaper.ESCAPE_END, interiorStart);
264             if (escapeEnd < 0) {
265                 throw new IllegalArgumentException(
266                         "Missing end escape " + CodePointEscaper.ESCAPE_END + ": " + word + "❌");
267             }
268             result.appendCodePoint(
269                     CodePointEscaper.rawEscapedToCodePoint(
270                             word.substring(interiorStart, escapeEnd)));
271             i = escapeEnd + 1;
272         }
273         return result;
274     }
275 
transform(UnicodeSet expected, Function<String, String> function)276     public static UnicodeSet transform(UnicodeSet expected, Function<String, String> function) {
277         UnicodeSet result = new UnicodeSet();
278         for (String s : expected) {
279             String t = function.apply(s);
280             result.add(t);
281         }
282         return result;
283     }
284 
transformAndAddAllTo( UnicodeSet expected, Function<String, String> function, T target)285     public static <T extends Collection<String>> T transformAndAddAllTo(
286             UnicodeSet expected, Function<String, String> function, T target) {
287         for (String s : expected) {
288             String t = function == null ? s : function.apply(s);
289             target.add(t);
290         }
291         return target;
292     }
293 }
294