xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/CodePointEscaper.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.util;
2 
3 import com.ibm.icu.dev.util.UnicodeMap;
4 import com.ibm.icu.lang.UCharacter;
5 import com.ibm.icu.text.UnicodeSet;
6 import java.util.Locale;
7 
/**
 * Provides a set of code point abbreviations, with conversions to and from code points, including
 * hex. Typically, to test whether a string could have escapes, check whether it contains {@link
 * #ESCAPE_START}, for example with {@code value.indexOf(CodePointEscaper.ESCAPE_START) >= 0}.
 */
15 public enum CodePointEscaper {
16     // These are characters found in CLDR data fields
17     // The long names don't necessarily match the formal Unicode names
18     TAB(9, "tab"),
19     LF(0xA, "line feed"),
20     CR(0xD, "carriage return"),
21     SP(0x20, "space", "ASCII space"),
22     NSP(0x2009, "narrow/thin space", "Also known as ‘thin space’"),
23     NBSP(0xA0, "no-break space", "Same as space, but doesn’t line wrap."),
24 
25     NNBSP(0x202F, "narrow/thin no-break space", "Same as narrow space, but doesn’t line wrap."),
26 
27     WNJ(
28             0x200B,
29             "allow line wrap after, aka ZWSP",
30             "Invisible character allowing a line-wrap afterwards. Also known as ‘ZWSP’."),
31     WJ(
32             0x2060,
33             "prevent line wrap",
34             "Keeps adjacent characters from line-wrapping. Also known as ‘word-joiner’."),
35     SHY(
36             0x00AD,
37             "soft hyphen",
38             "Invisible character allowing a line-wrap afterwards, but appears like a hyphen in most languages."),
39 
40     ZWNJ(0x200C, "cursive non-joiner", "Breaks cursive connections, where possible."),
41     ZWJ(0x200D, "cursive joiner", "Forces cursive connections, if possible."),
42 
43     ALM(
44             0x061C,
45             "Arabic letter mark",
46             "For BIDI, invisible character that behaves like Arabic letter."),
47     LRM(
48             0x200E,
49             "left-right mark",
50             "For BIDI, invisible character that behaves like Hebrew letter."),
51     RLM(0x200F, "right-left mark", "For BIDI, invisible character that behaves like Latin letter."),
52 
53     LRO(0x202D, "left-right override"),
54     RLO(0x202E, "right-left override"),
55     PDF(0x202C, "end override"),
56 
57     BOM(0xFEFF, "byte-order mark"),
58 
59     ANS(0x0600, "Arabic number sign"),
60     ASNS(0x0601, "Arabic sanah sign"),
61     AFM(0x602, "Arabic footnote marker"),
62     ASFS(0x603, "Arabic safha sign"),
63     SAM(0x70F, "Syriac abbreviation mark"),
64     KIAQ(0x17B4, "Khmer inherent aq"),
65     KIAA(0x17B5, "Khmer inherent aa"),
66 
67     RANGE('➖', "range syntax mark", "heavy minus sign"),
68     ESCS('❰', "escape start", "heavy open angle bracket"),
69     ESCE('❱', "escape end", "heavy close angle bracket");
70 
71     public static final char RANGE_SYNTAX = (char) RANGE.getCodePoint();
72     public static final char ESCAPE_START = (char) ESCS.getCodePoint();
73     public static final char ESCAPE_END = (char) ESCE.getCodePoint();
74 
75     /** Assemble the reverse mapping */
76     private static final UnicodeMap<CodePointEscaper> _fromCodePoint = new UnicodeMap<>();
77 
78     static {
79         for (CodePointEscaper abbr : CodePointEscaper.values()) {
80             CodePointEscaper oldValue = _fromCodePoint.get(abbr.codePoint);
81             if (oldValue != null) {
82                 throw new IllegalArgumentException(
83                         "Abbreviation code points collide: "
84                                 + oldValue.name()
85                                 + ", "
86                                 + abbr.name());
87             }
_fromCodePoint.put(abbr.codePoint, abbr)88             _fromCodePoint.put(abbr.codePoint, abbr);
89         }
_fromCodePoint.freeze()90         _fromCodePoint.freeze();
91     }
92 
93     /** Characters that need escaping */
94     public static final UnicodeSet EMOJI_INVISIBLES =
95             new UnicodeSet("[\\uFE0F\\U000E0020-\\U000E007F]").freeze();
96 
97     public static final UnicodeSet FORCE_ESCAPE =
98             new UnicodeSet("[[:DI:][:Pat_WS:][:WSpace:][:C:][:Z:]]")
99                     .addAll(getNamedEscapes())
100                     .removeAll(EMOJI_INVISIBLES)
101                     .freeze();
102 
103     public static final UnicodeSet NON_SPACING = new UnicodeSet("[[:Mn:][:Me:]]").freeze();
104 
105     public static final UnicodeSet FORCE_ESCAPE_WITH_NONSPACING =
106             new UnicodeSet(FORCE_ESCAPE).addAll(NON_SPACING).freeze();
107 
108     private final int codePoint;
109     private final String shortName;
110     private final String description;
111 
CodePointEscaper(int codePoint, String shortName)112     private CodePointEscaper(int codePoint, String shortName) {
113         this.codePoint = codePoint;
114         this.shortName = shortName;
115         this.description = "";
116     }
117 
CodePointEscaper(int codePoint, String shortName, String description)118     private CodePointEscaper(int codePoint, String shortName, String description) {
119         this.codePoint = codePoint;
120         this.shortName = shortName;
121         this.description = description;
122     }
123 
getNamedEscapes()124     public static final UnicodeSet getNamedEscapes() {
125         return _fromCodePoint.keySet().freeze();
126     }
127 
128     /**
129      * Return long names for this character. The set is immutable and ordered, with the first name
130      * being the most user-friendly.
131      */
getShortName()132     public String getShortName() {
133         return shortName;
134     }
135 
136     /**
137      * Return a longer description, if available; otherwise ""
138      *
139      * @return
140      */
getDescription()141     public String getDescription() {
142         return description;
143     }
144 
145     /** Return the code point for this character. */
getCodePoint()146     public int getCodePoint() {
147         return codePoint;
148     }
149 
150     /** Returns the escaped form from the code point for this enum */
codePointToEscaped()151     public String codePointToEscaped() {
152         return ESCAPE_START + rawCodePointToEscaped(codePoint) + ESCAPE_END;
153     }
154 
155     /** Returns a code point from the escaped form <b>of a single code point</b> */
escapedToCodePoint(String value)156     public static int escapedToCodePoint(String value) {
157         if (value.codePointAt(0) != CodePointEscaper.ESCAPE_START
158                 || value.codePointAt(value.length() - 1) != CodePointEscaper.ESCAPE_END) {
159             throw new IllegalArgumentException(
160                     "Must be of the form "
161                             + CodePointEscaper.ESCAPE_START
162                             + "…"
163                             + CodePointEscaper.ESCAPE_END);
164         }
165         return rawEscapedToCodePoint(value.substring(1, value.length() - 1));
166     }
167 
168     /** Returns the escaped form from a code point */
codePointToEscaped(int codePoint)169     public static String codePointToEscaped(int codePoint) {
170         return ESCAPE_START + rawCodePointToEscaped(codePoint) + ESCAPE_END;
171     }
172 
173     /** Returns the escaped form from a string */
toEscaped(String unescaped)174     public static String toEscaped(String unescaped) {
175         return toEscaped(unescaped, FORCE_ESCAPE);
176     }
177 
178     /** Returns the escaped form from a string */
toEscaped(String unescaped, UnicodeSet toEscape)179     public static String toEscaped(String unescaped, UnicodeSet toEscape) {
180         StringBuilder result = new StringBuilder();
181         unescaped
182                 .codePoints()
183                 .forEach(
184                         cp -> {
185                             if (!toEscape.contains(cp)) {
186                                 result.appendCodePoint(cp);
187                             } else {
188                                 result.append(codePointToEscaped(cp));
189                             }
190                         });
191         return result.toString();
192     }
193     /** Return unescaped string */
toUnescaped(String value)194     public static String toUnescaped(String value) {
195         StringBuilder result = null;
196         int donePart = 0;
197         int found = value.indexOf(ESCAPE_START);
198         while (found >= 0) {
199             int foundEnd = value.indexOf(ESCAPE_END, found);
200             if (foundEnd < 0) {
201                 throw new IllegalArgumentException(
202                         "Malformed escaped string, missing: " + ESCAPE_END);
203             }
204             if (result == null) {
205                 result = new StringBuilder();
206             }
207             result.append(value, donePart, found);
208             donePart = ++foundEnd;
209             result.appendCodePoint(escapedToCodePoint(value.substring(found, foundEnd)));
210             found = value.indexOf(ESCAPE_START, foundEnd);
211         }
212         return donePart == 0 ? value : result.append(value, donePart, value.length()).toString();
213     }
214 
215     private static final String HAS_NAME = " ≡ ";
216 
toExample(int codePoint)217     public static String toExample(int codePoint) {
218         CodePointEscaper cpe = _fromCodePoint.get(codePoint);
219         if (cpe == null) { // hex
220             return codePointToEscaped(codePoint)
221                     + HAS_NAME
222                     + UCharacter.getName(codePoint).toLowerCase();
223         } else {
224             return CodePointEscaper.codePointToEscaped(cpe.codePoint)
225                     + HAS_NAME
226                     + cpe.shortName; // TODO show hover with cpe.description
227         }
228     }
229 
230     /**
231      * Returns a code point from an abbreviation string or hex string <b>without the escape
232      * brackets</b>
233      */
rawEscapedToCodePoint(CharSequence value)234     public static int rawEscapedToCodePoint(CharSequence value) {
235         try {
236             return valueOf(value.toString().toUpperCase(Locale.ROOT)).codePoint;
237         } catch (Exception e) {
238         }
239         int codePoint;
240         try {
241             codePoint = Integer.parseInt(value.toString(), 16);
242         } catch (NumberFormatException e) {
243             throw new IllegalArgumentException("Not a named or hex escape: ❰" + value + "❌❱");
244         }
245         if (codePoint < 0 || codePoint > 0x10FFFF) {
246             throw new IllegalArgumentException("Illegal code point: ❰" + value + "❌❱");
247         }
248         return codePoint;
249     }
250 
251     /**
252      * Returns an abbreviation string or hex string <b>without the escape brackets</b> from a code
253      * point.
254      */
rawCodePointToEscaped(int codePoint)255     public static String rawCodePointToEscaped(int codePoint) {
256         CodePointEscaper result = CodePointEscaper._fromCodePoint.get(codePoint);
257         return result == null
258                 ? Integer.toString(codePoint, 16).toUpperCase(Locale.ROOT)
259                 : result.toString();
260     }
261 }
262