package org.unicode.cldr.util;

import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
import java.util.Locale;

/**
 * Provides a set of code point abbreviations, with conversions between code points and their
 * escaped forms.
 *
 * <p>An escaped code point is written as {@link #ESCAPE_START}, followed by either the name of
 * one of the constants of this enum (for example {@code NBSP}) or the hexadecimal value of the
 * code point, followed by {@link #ESCAPE_END}.
 *
 * <ul>
 *   <li>{@link #toEscaped(String)} and {@link #toUnescaped(String)} convert whole strings.
 *   <li>{@link #codePointToEscaped(int)} and {@link #escapedToCodePoint(String)} convert a single
 *       code point, with the escape brackets.
 *   <li>{@link #rawCodePointToEscaped(int)} and {@link #rawEscapedToCodePoint(CharSequence)}
 *       convert a single code point, without the escape brackets.
 * </ul>
 */
public enum CodePointEscaper {
    // These are characters found in CLDR data fields
    // The long names don't necessarily match the formal Unicode names
    TAB(9, "tab"),
    LF(0xA, "line feed"),
    CR(0xD, "carriage return"),
    SP(0x20, "space", "ASCII space"),
    NSP(0x2009, "narrow/thin space", "Also known as ‘thin space’"),
    NBSP(0xA0, "no-break space", "Same as space, but doesn’t line wrap."),

    NNBSP(0x202F, "narrow/thin no-break space", "Same as narrow space, but doesn’t line wrap."),

    WNJ(
            0x200B,
            "allow line wrap after, aka ZWSP",
            "Invisible character allowing a line-wrap afterwards. Also known as ‘ZWSP’."),
    WJ(
            0x2060,
            "prevent line wrap",
            "Keeps adjacent characters from line-wrapping. Also known as ‘word-joiner’."),
    SHY(
            0x00AD,
            "soft hyphen",
            "Invisible character allowing a line-wrap afterwards, but appears like a hyphen in most languages."),

    ZWNJ(0x200C, "cursive non-joiner", "Breaks cursive connections, where possible."),
    ZWJ(0x200D, "cursive joiner", "Forces cursive connections, if possible."),

    ALM(
            0x061C,
            "Arabic letter mark",
            "For BIDI, invisible character that behaves like Arabic letter."),
    // LRM is a strong left-to-right character, RLM a strong right-to-left one.
    LRM(
            0x200E,
            "left-right mark",
            "For BIDI, invisible character that behaves like Latin letter."),
    RLM(0x200F, "right-left mark", "For BIDI, invisible character that behaves like Hebrew letter."),

    LRO(0x202D, "left-right override"),
    RLO(0x202E, "right-left override"),
    PDF(0x202C, "end override"),

    BOM(0xFEFF, "byte-order mark"),

    ANS(0x0600, "Arabic number sign"),
    ASNS(0x0601, "Arabic sanah sign"),
    AFM(0x602, "Arabic footnote marker"),
    ASFS(0x603, "Arabic safha sign"),
    SAM(0x70F, "Syriac abbreviation mark"),
    KIAQ(0x17B4, "Khmer inherent aq"),
    KIAA(0x17B5, "Khmer inherent aa"),

    RANGE('➖', "range syntax mark", "heavy minus sign"),
    ESCS('❰', "escape start", "heavy open angle bracket"),
    ESCE('❱', "escape end", "heavy close angle bracket");

    /** Character used as the range syntax mark; the code point of {@link #RANGE}. */
    public static final char RANGE_SYNTAX = (char) RANGE.getCodePoint();

    /** Character that opens an escape; the code point of {@link #ESCS}. */
    public static final char ESCAPE_START = (char) ESCS.getCodePoint();

    /** Character that closes an escape; the code point of {@link #ESCE}. */
    public static final char ESCAPE_END = (char) ESCE.getCodePoint();

    /** Reverse mapping, from code point to the enum constant that abbreviates it. */
    private static final UnicodeMap<CodePointEscaper> _fromCodePoint = new UnicodeMap<>();

    static {
        // Assemble the reverse mapping, refusing two abbreviations for the same code point.
        for (CodePointEscaper abbr : CodePointEscaper.values()) {
            CodePointEscaper oldValue = _fromCodePoint.get(abbr.codePoint);
            if (oldValue != null) {
                throw new IllegalArgumentException(
                        "Abbreviation code points collide: "
                                + oldValue.name()
                                + ", "
                                + abbr.name());
            }
            _fromCodePoint.put(abbr.codePoint, abbr);
        }
        _fromCodePoint.freeze();
    }

    /** Emoji-related invisible characters that are excluded from {@link #FORCE_ESCAPE}. */
    public static final UnicodeSet EMOJI_INVISIBLES =
            new UnicodeSet("[\\uFE0F\\U000E0020-\\U000E007F]").freeze();

    /** Characters that need escaping; the default set used by {@link #toEscaped(String)}. */
    public static final UnicodeSet FORCE_ESCAPE =
            new UnicodeSet("[[:DI:][:Pat_WS:][:WSpace:][:C:][:Z:]]")
                    .addAll(getNamedEscapes())
                    .removeAll(EMOJI_INVISIBLES)
                    .freeze();

    /** Characters in the general categories Mn and Me. */
    public static final UnicodeSet NON_SPACING = new UnicodeSet("[[:Mn:][:Me:]]").freeze();

    /** The union of {@link #FORCE_ESCAPE} and {@link #NON_SPACING}. */
    public static final UnicodeSet FORCE_ESCAPE_WITH_NONSPACING =
            new UnicodeSet(FORCE_ESCAPE).addAll(NON_SPACING).freeze();

    private final int codePoint;
    private final String shortName;
    private final String description;

    /** Creates an abbreviation with an empty description. */
    CodePointEscaper(int codePoint, String shortName) {
        this(codePoint, shortName, "");
    }

    /** Creates an abbreviation with a short name and a longer description. */
    CodePointEscaper(int codePoint, String shortName, String description) {
        this.codePoint = codePoint;
        this.shortName = shortName;
        this.description = description;
    }

    /** Returns the frozen set of all code points that have a named abbreviation in this enum. */
    public static final UnicodeSet getNamedEscapes() {
        return _fromCodePoint.keySet().freeze();
    }

    /** Returns the short, user-friendly name for this character. */
    public String getShortName() {
        return shortName;
    }

    /**
     * Returns a longer description of this character.
     *
     * @return the description, or "" if none is available
     */
    public String getDescription() {
        return description;
    }

    /** Returns the code point for this character. */
    public int getCodePoint() {
        return codePoint;
    }

    /** Returns the escaped form of the code point for this enum constant. */
    public String codePointToEscaped() {
        return codePointToEscaped(codePoint);
    }

    /**
     * Returns a code point from the escaped form <b>of a single code point</b>.
     *
     * @param value an abbreviation or hex value wrapped in {@link #ESCAPE_START} and {@link
     *     #ESCAPE_END}
     * @return the code point
     * @throws IllegalArgumentException if the brackets are missing (including for an empty
     *     string), or the content is neither a known abbreviation nor a valid hex code point
     */
    public static int escapedToCodePoint(String value) {
        final int length = value.length();
        // The length guard makes empty input fail with the same exception as other bad input,
        // rather than with an index error. Both brackets are BMP characters, so comparing
        // chars is sufficient.
        if (length < 2
                || value.charAt(0) != ESCAPE_START
                || value.charAt(length - 1) != ESCAPE_END) {
            throw new IllegalArgumentException(
                    "Must be of the form "
                            + CodePointEscaper.ESCAPE_START
                            + "…"
                            + CodePointEscaper.ESCAPE_END);
        }
        return rawEscapedToCodePoint(value.substring(1, length - 1));
    }

    /** Returns the escaped form of a code point, including the escape brackets. */
    public static String codePointToEscaped(int codePoint) {
        return ESCAPE_START + rawCodePointToEscaped(codePoint) + ESCAPE_END;
    }

    /** Returns the escaped form of a string, escaping every code point in {@link #FORCE_ESCAPE}. */
    public static String toEscaped(String unescaped) {
        return toEscaped(unescaped, FORCE_ESCAPE);
    }

    /**
     * Returns the escaped form of a string.
     *
     * @param unescaped the string to escape
     * @param toEscape the code points to replace by their escaped form; all others are copied
     * @return the escaped string
     */
    public static String toEscaped(String unescaped, UnicodeSet toEscape) {
        final StringBuilder result = new StringBuilder(unescaped.length());
        int index = 0;
        while (index < unescaped.length()) {
            final int cp = unescaped.codePointAt(index);
            index += Character.charCount(cp);
            if (toEscape.contains(cp)) {
                result.append(codePointToEscaped(cp));
            } else {
                result.appendCodePoint(cp);
            }
        }
        return result.toString();
    }

    /**
     * Returns the unescaped form of a string: each escape is replaced by the code point it
     * denotes, and all other text is copied unchanged.
     *
     * @param value the possibly-escaped string
     * @return the unescaped string; {@code value} itself if it contains no {@link #ESCAPE_START}
     * @throws IllegalArgumentException if an escape is not terminated or cannot be interpreted
     */
    public static String toUnescaped(String value) {
        int escapeStart = value.indexOf(ESCAPE_START);
        if (escapeStart < 0) {
            return value; // nothing to unescape
        }
        final StringBuilder result = new StringBuilder(value.length());
        int copiedUpTo = 0;
        while (escapeStart >= 0) {
            final int escapeEnd = value.indexOf(ESCAPE_END, escapeStart);
            if (escapeEnd < 0) {
                throw new IllegalArgumentException(
                        "Malformed escaped string, missing: " + ESCAPE_END);
            }
            // Copy the literal text before the escape, then the code point the escape denotes.
            result.append(value, copiedUpTo, escapeStart);
            copiedUpTo = escapeEnd + 1;
            result.appendCodePoint(escapedToCodePoint(value.substring(escapeStart, copiedUpTo)));
            escapeStart = value.indexOf(ESCAPE_START, copiedUpTo);
        }
        return result.append(value, copiedUpTo, value.length()).toString();
    }

    private static final String HAS_NAME = " ≡ ";

    /**
     * Returns an example string for a code point: its escaped form, followed by {@code " ≡ "}
     * and a name. The name is the short name of the abbreviation if there is one, otherwise the
     * lowercased Unicode character name. If the code point has neither, only the escaped form
     * is returned.
     */
    public static String toExample(int codePoint) {
        final CodePointEscaper cpe = _fromCodePoint.get(codePoint);
        if (cpe != null) {
            return CodePointEscaper.codePointToEscaped(cpe.codePoint)
                    + HAS_NAME
                    + cpe.shortName; // TODO show hover with cpe.description
        }
        // hex
        final String escaped = codePointToEscaped(codePoint);
        // getName returns null when the code point has no name.
        final String unicodeName = UCharacter.getName(codePoint);
        if (unicodeName == null) {
            return escaped;
        }
        // Locale.ROOT keeps the lowercasing independent of the default locale.
        return escaped + HAS_NAME + unicodeName.toLowerCase(Locale.ROOT);
    }

    /**
     * Returns a code point from an abbreviation string or hex string <b>without the escape
     * brackets</b>. Abbreviations are matched case-insensitively and take precedence over hex.
     *
     * @throws IllegalArgumentException if the value is neither an abbreviation nor a hex number,
     *     or if the hex number is outside the range 0..0x10FFFF
     */
    public static int rawEscapedToCodePoint(CharSequence value) {
        final String raw = value.toString();
        try {
            return valueOf(raw.toUpperCase(Locale.ROOT)).codePoint;
        } catch (IllegalArgumentException ignored) {
            // Not the name of an abbreviation; fall through and try to read it as hex.
        }
        final int codePoint;
        try {
            codePoint = Integer.parseInt(raw, 16);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Not a named or hex escape: ❰" + value + "❌❱");
        }
        if (codePoint < 0 || codePoint > 0x10FFFF) {
            throw new IllegalArgumentException("Illegal code point: ❰" + value + "❌❱");
        }
        return codePoint;
    }

    /**
     * Returns an abbreviation string or hex string <b>without the escape brackets</b> from a code
     * point. The abbreviation is used if this enum has one for the code point; otherwise the
     * result is the uppercase hexadecimal value.
     */
    public static String rawCodePointToEscaped(int codePoint) {
        final CodePointEscaper result = CodePointEscaper._fromCodePoint.get(codePoint);
        return result == null
                ? Integer.toString(codePoint, 16).toUpperCase(Locale.ROOT)
                : result.toString();
    }
}