1 package org.unicode.cldr.util; 2 3 import com.google.common.base.Splitter; 4 import com.ibm.icu.lang.CharSequences; 5 import com.ibm.icu.text.Collator; 6 import com.ibm.icu.text.Normalizer2; 7 import com.ibm.icu.text.UTF16; 8 import com.ibm.icu.text.UnicodeSet; 9 import com.ibm.icu.util.ULocale; 10 import java.util.ArrayList; 11 import java.util.Collection; 12 import java.util.Comparator; 13 import java.util.TreeSet; 14 import java.util.function.Function; 15 16 /** 17 * Goal is a very simple format for UnicodeSet, that keeps vetters from having to know about \ for 18 * quoting or {...} for strings, or $ for FFFF. We do this by using spaces to always separate 19 * different characters, and special syntax for ranges, escaped hex, and named entities. There are 2 20 * special characters: 21 * 22 * <ul> 23 * <li>➖ a range, but if between two code points 24 * <li>❰ start of hex or named escape, but only if followed by [A-Fa-f0-9]+ ❱ 25 * </ul> 26 * 27 * <b>EBNF</b><br> 28 * result = item (" " item)*<br> 29 * item = string | range | codePoint<br> 30 * string = codePoint+<br> 31 * range = codePoint "➖" codePoint<br> 32 * codepoint = literal // excludes " ", "❰", "❱"<br> 33 * codepoint = "❰" (namedEscape | hex) "❱"<br> 34 * namedEscape = [A-Fa-f0-9]+ // as per CodePointEscape<br> 35 * hex = [A-Fa-f0-9]{2,6} // must be valid code point 0x0..0x10FFFF<br> 36 * ❰ was chosen to be avoid special use of \\u or \x<br> 37 * 38 * @author markdavis 39 */ 40 public class SimpleUnicodeSetFormatter implements FormatterParser<UnicodeSet> { 41 public static Normalizer2 nfc = Normalizer2.getNFCInstance(); 42 43 public static final Comparator<String> BASIC_COLLATOR = 44 (Comparator) ComparatorUtilities.getIcuCollator(ULocale.ROOT, Collator.IDENTICAL); 45 46 private static final int DEFAULT_MAX_DISALLOW_RANGES = 199; 47 48 private final Comparator<String> comparator; 49 private final UnicodeSet forceHex; 50 private final int maxDisallowRanges; 51 private final UTF16.StringComparator codepointComparator = 52 new UTF16.StringComparator(true, false, 0); 53 54 /** 55 * Create a simple formatter, with a comparator for the ordering and a UnicodeSet of characters 56 * that are to use hex. Immutable (if the collator is). 57 * 58 * @param col — collator. The default is BASIC_COLLATOR, which is the root collator. 59 * @param forceHex - UnicodeSet to force to be hex. It will be frozen if not already. Warning: 60 * may not round-trip unless it includes all of CodePointEscaper.getNamedEscapes() 61 * @param maxDisallowRanges — under this number, there will be no ranges; at or above there may 62 * be ranges, and the collator will be disregarded. 63 */ SimpleUnicodeSetFormatter( Comparator<String> col, UnicodeSet forceHex, int maxDisallowRanges)64 public SimpleUnicodeSetFormatter( 65 Comparator<String> col, UnicodeSet forceHex, int maxDisallowRanges) { 66 // collate, but preserve non-equivalents 67 this.comparator = ComparatorUtilities.wrapForCodePoints(col); 68 this.forceHex = forceHex == null ? CodePointEscaper.FORCE_ESCAPE : forceHex.freeze(); 69 this.maxDisallowRanges = maxDisallowRanges; 70 } 71 72 static final int DEFAULT_MAX = 1024; 73 fromIcuLocale(String localeId)74 public static SimpleUnicodeSetFormatter fromIcuLocale(String localeId) { 75 return new SimpleUnicodeSetFormatter( 76 (Comparator) ComparatorUtilities.getIcuCollator(localeId, Collator.IDENTICAL), 77 null, 78 DEFAULT_MAX); 79 } 80 SimpleUnicodeSetFormatter(Comparator<String> col, UnicodeSet forceHex)81 public SimpleUnicodeSetFormatter(Comparator<String> col, UnicodeSet forceHex) { 82 this(col, forceHex, DEFAULT_MAX_DISALLOW_RANGES); 83 } 84 SimpleUnicodeSetFormatter(Comparator<String> col)85 public SimpleUnicodeSetFormatter(Comparator<String> col) { 86 this(col, null, DEFAULT_MAX); 87 } 88 SimpleUnicodeSetFormatter()89 public SimpleUnicodeSetFormatter() { 90 this( 91 (Comparator) ComparatorUtilities.getIcuCollator(ULocale.ROOT, Collator.IDENTICAL), 92 null, 93 DEFAULT_MAX); 94 } 95 96 static class Lazy { 97 static SimpleUnicodeSetFormatter SINGLETON = new SimpleUnicodeSetFormatter(); 98 getSingleton()99 static SimpleUnicodeSetFormatter getSingleton() { 100 return SINGLETON; 101 } 102 } 103 getDefault()104 public static SimpleUnicodeSetFormatter getDefault() { 105 return Lazy.getSingleton(); 106 } 107 108 /** Parse as UnicodeSet if of the form […], else parse with default SimpleUnicodeSetFormatter */ parseLenient(String source)109 public static UnicodeSet parseLenient(String source) { 110 if (source.startsWith("[") && source.endsWith("]")) { 111 return new UnicodeSet(source); 112 } else { 113 return getDefault().parse(source); 114 } 115 } 116 117 @Override format(UnicodeSet input)118 public String format(UnicodeSet input) { 119 final boolean allowRanges = input.size() > maxDisallowRanges; 120 StringBuilder result = new StringBuilder(); 121 Collection<String> sorted = 122 input.addAllTo(allowRanges ? new ArrayList<>() : new TreeSet<>(comparator)); 123 // : transformAndAddAllTo( 124 // input, null, new TreeSet<>(comparator)); // x -> nfc.normalize(x) 125 int firstOfRange = -2; 126 int lastOfRange = -2; 127 for (String item : sorted) { 128 int cp = CharSequences.getSingleCodePoint(item); 129 if (cp == Integer.MAX_VALUE) { // string 130 if (lastOfRange >= 0) { 131 if (firstOfRange != lastOfRange) { 132 result.append( 133 firstOfRange + 1 != lastOfRange 134 ? CodePointEscaper.RANGE_SYNTAX 135 : ' '); 136 appendWithHex(result, lastOfRange, forceHex); 137 } 138 firstOfRange = lastOfRange = -2; 139 } 140 if (result.length() > 0) { 141 result.append(' '); 142 } 143 appendWithHex(result, item, forceHex); 144 } else if (allowRanges && lastOfRange == cp - 1) { 145 ++lastOfRange; 146 } else { 147 if (firstOfRange != lastOfRange) { 148 result.append( 149 firstOfRange + 1 != lastOfRange ? CodePointEscaper.RANGE_SYNTAX : ' '); 150 appendWithHex(result, lastOfRange, forceHex); 151 } 152 if (result.length() > 0) { 153 result.append(' '); 154 } 155 appendWithHex(result, cp, forceHex); 156 firstOfRange = lastOfRange = cp; 157 } 158 } 159 if (firstOfRange != lastOfRange) { 160 result.append(firstOfRange + 1 != lastOfRange ? CodePointEscaper.RANGE_SYNTAX : ' '); 161 appendWithHex(result, lastOfRange, forceHex); 162 } 163 return result.toString(); 164 } 165 appendWithHex( StringBuilder ap, CharSequence s, UnicodeSet forceHex)166 public static final StringBuilder appendWithHex( 167 StringBuilder ap, CharSequence s, UnicodeSet forceHex) { 168 for (int cp : With.codePointArray(s)) { 169 appendWithHex(ap, cp, forceHex); 170 } 171 return ap; 172 } 173 appendWithHex(StringBuilder ap, int cp, UnicodeSet forceHex)174 public static StringBuilder appendWithHex(StringBuilder ap, int cp, UnicodeSet forceHex) { 175 if (!forceHex.contains(cp)) { 176 ap.appendCodePoint(cp); 177 } else { 178 ap.append(CodePointEscaper.codePointToEscaped(cp)); 179 } 180 return ap; 181 } 182 183 static final Splitter SPACE_SPLITTER = Splitter.on(' ').omitEmptyStrings(); 184 185 @Override parse(String input)186 public UnicodeSet parse(String input) { 187 UnicodeSet result = new UnicodeSet(); 188 // Note: could be optimized but probably not worth the effort 189 190 for (String word : SPACE_SPLITTER.split(input)) { 191 // parts between spaces can be single code points, or strings, or ranges of single code 192 // points 193 // points 194 int rangePos = word.indexOf(CodePointEscaper.RANGE_SYNTAX); 195 if (rangePos < 0) { 196 result.add(unescape(word)); 197 } else { 198 int range2Pos = word.indexOf(CodePointEscaper.RANGE_SYNTAX, rangePos + 1); 199 final String before = word.substring(0, rangePos); 200 final String after = word.substring(rangePos + 1); 201 if (rangePos == 0) { 202 throw new IllegalArgumentException( 203 "Must have exactly one character before '➖': " + before + "❌➖" + after); 204 } else if (rangePos == word.length() - 1) { 205 throw new IllegalArgumentException( 206 "Must have exactly one character after '➖': " + before + "➖❌" + after); 207 } else if (range2Pos >= 0) { 208 throw new IllegalArgumentException( 209 "Must not have two '➖' characters: " + before + "➖❌" + after); 210 } 211 // get the code points on either side 212 int first = CharSequences.getSingleCodePoint(unescape(before)); 213 int second = CharSequences.getSingleCodePoint(unescape(after)); 214 if (first == Integer.MAX_VALUE) { 215 throw new IllegalArgumentException( 216 "Must have exactly one character before '➖': " + before + "❌➖" + after); 217 } else if (second == Integer.MAX_VALUE) { 218 throw new IllegalArgumentException( 219 "Must have exactly one character after '➖': " + before + "➖❌" + after); 220 } 221 result.add(first, second); 222 } 223 } 224 return result; 225 } 226 227 /** Unescape a whole string. */ unescape(String word)228 public static CharSequence unescape(String word) { 229 StringBuilder result = new StringBuilder(); 230 for (int i = 0; i < word.length(); ) { 231 int escapeStart = word.indexOf(CodePointEscaper.ESCAPE_START, i); 232 if (escapeStart < 0) { 233 final String toAppend = i == 0 ? word : word.substring(i); 234 final int endStart = toAppend.indexOf(CodePointEscaper.ESCAPE_END); 235 if (endStart >= 0) { 236 throw new IllegalArgumentException( 237 "Missing start escape " 238 + CodePointEscaper.ESCAPE_START 239 + ": " 240 + word.substring(0, endStart) 241 + "❌" 242 + word.substring(endStart)); 243 } 244 // Otherwise we are done, the rest is unescaped. 245 result.append(toAppend); 246 break; 247 } 248 // we have an escape start, so we append what is before that. 249 final String toAppend = word.substring(i, escapeStart); 250 // if we don't find an escape end 251 final int endStart = toAppend.indexOf(CodePointEscaper.ESCAPE_END); 252 if (endStart >= 0) { 253 throw new IllegalArgumentException( 254 "Missing start escape " 255 + CodePointEscaper.ESCAPE_START 256 + ": " 257 + toAppend.substring(0, endStart) 258 + "❌" 259 + toAppend.substring(endStart)); 260 } 261 result.append(toAppend); 262 int interiorStart = escapeStart + 1; 263 int escapeEnd = word.indexOf(CodePointEscaper.ESCAPE_END, interiorStart); 264 if (escapeEnd < 0) { 265 throw new IllegalArgumentException( 266 "Missing end escape " + CodePointEscaper.ESCAPE_END + ": " + word + "❌"); 267 } 268 result.appendCodePoint( 269 CodePointEscaper.rawEscapedToCodePoint( 270 word.substring(interiorStart, escapeEnd))); 271 i = escapeEnd + 1; 272 } 273 return result; 274 } 275 transform(UnicodeSet expected, Function<String, String> function)276 public static UnicodeSet transform(UnicodeSet expected, Function<String, String> function) { 277 UnicodeSet result = new UnicodeSet(); 278 for (String s : expected) { 279 String t = function.apply(s); 280 result.add(t); 281 } 282 return result; 283 } 284 transformAndAddAllTo( UnicodeSet expected, Function<String, String> function, T target)285 public static <T extends Collection<String>> T transformAndAddAllTo( 286 UnicodeSet expected, Function<String, String> function, T target) { 287 for (String s : expected) { 288 String t = function == null ? s : function.apply(s); 289 target.add(t); 290 } 291 return target; 292 } 293 } 294