xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/test/TestMisc.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.test;
2 
3 import com.ibm.icu.lang.UCharacter;
4 import com.ibm.icu.lang.UScript;
5 import com.ibm.icu.text.Collator;
6 import com.ibm.icu.text.DecimalFormat;
7 import com.ibm.icu.text.NumberFormat;
8 import com.ibm.icu.text.Transliterator;
9 import com.ibm.icu.text.UTF16;
10 import com.ibm.icu.text.UnicodeSet;
11 import com.ibm.icu.text.UnicodeSetIterator;
12 import com.ibm.icu.util.Currency;
13 import com.ibm.icu.util.ULocale;
14 import java.io.PrintWriter;
15 import java.math.BigDecimal;
16 import java.text.ParsePosition;
17 import java.util.ArrayList;
18 import java.util.Arrays;
19 import java.util.BitSet;
20 import java.util.Collection;
21 import java.util.Collections;
22 import java.util.EnumSet;
23 import java.util.HashMap;
24 import java.util.HashSet;
25 import java.util.Iterator;
26 import java.util.List;
27 import java.util.Locale;
28 import java.util.Map;
29 import java.util.Set;
30 import java.util.TreeSet;
31 import java.util.regex.Matcher;
32 import org.unicode.cldr.util.CLDRFile;
33 import org.unicode.cldr.util.CLDRFile.Status;
34 import org.unicode.cldr.util.CLDRPaths;
35 import org.unicode.cldr.util.CldrUtility;
36 import org.unicode.cldr.util.DtdType;
37 import org.unicode.cldr.util.Factory;
38 import org.unicode.cldr.util.Iso639Data;
39 import org.unicode.cldr.util.Iso639Data.Scope;
40 import org.unicode.cldr.util.Level;
41 import org.unicode.cldr.util.Pair;
42 import org.unicode.cldr.util.PatternCache;
43 import org.unicode.cldr.util.SimpleFactory;
44 import org.unicode.cldr.util.StandardCodes;
45 import org.unicode.cldr.util.VariantFolder;
46 import org.unicode.cldr.util.VariantFolder.CanonicalFolder;
47 import org.unicode.cldr.util.VariantFolder.CaseVariantFolder;
48 import org.unicode.cldr.util.VariantFolder.CompatibilityFolder;
49 import org.unicode.cldr.util.XPathParts;
50 import org.unicode.cldr.util.props.BagFormatter;
51 
52 public class TestMisc {
53 
    /** ICU currency for ISO code "CHF"; main() applies it to the de-CH currency formatter. */
    static Currency SWISS_FRANC = Currency.getInstance("CHF");
55 
56     static class Lists {
sortedCopy(Collection<E> iterable)57         public static <E extends Comparable> List<E> sortedCopy(Collection<E> iterable) {
58             List<E> list = new ArrayList<>();
59             list.addAll(iterable);
60             Collections.sort(list);
61             return list;
62         }
63     }
64 
65     enum Foo {
66         A,
67         M,
68         Z
69     }
70 
main(String[] args)71     public static void main(String[] args) {
72 
73         checkAliases();
74         if (true) return;
75 
76         Transliterator en_ru = Transliterator.getInstance("en-ru");
77         System.out.println("Mark + " + en_ru.transform("Mark"));
78 
79         Transliterator latn_cyrl = Transliterator.getInstance("Latn-Cyrl");
80         System.out.println("Mark + " + latn_cyrl.transform("Mark"));
81 
82         Transliterator ulatn_ucyrl = Transliterator.getInstance("und_Latn-und_Cyrl");
83         System.out.println("Mark + " + latn_cyrl.transform("Mark"));
84 
85         Locale locale =
86                 new Locale("abc-d αγζθ ?ef_g%hi", "abc-d αγζθ ?ef_g%hi", "abc-d αγζθ ?ef_g%hi");
87 
88         System.out.println(
89                 "Locale locale = new Locale(\"abc-d αγζθ ?ef_g%hi\",\"abc-d αγζθ ?ef_g%hi\",\"abc-d αγζθ ?ef_g%hi\");");
90         System.out.println("locale.toString() == \"" + locale + "\"");
91 
92         MyXSymbolTable sym = new MyXSymbolTable();
93         BagFormatter bf = new BagFormatter();
94         for (String test :
95                 new String[] {
96                     "[:reduceCase=[[Åå{fi}]]:]",
97                     "[:reduceCanonical=[[Åå{fi}]]:]",
98                     "[[,٫.]]",
99                     "[[,٫.][:close=compatibility:]]",
100                     "[[\\ ,٬.']]",
101                     "[[\\ ,٬.'][:close=compatibility:]]",
102                     "[[\u002E\u2024\uFE52\uFF0E\u3002][:close=compatibility:]]",
103                     "[[[\u002C \u002E \u066B \u2024 \u3002 \uFE52 \uFF0E、، \u002E \u2024 \uFE52 \uFF0E \u3002]-[\u002E\u2024\uFE52\uFF0E\u3002]][:close=compatibility:]]",
104                     "[["
105                             + "\\u0020"
106                             + "[, ٬ ..․﹒ '' \u2018 \u2019 ]"
107                             + "-[.\u2024\u3002\uFE12\uFE52\uFF0E\uFF61]"
108                             + "-[,\u060C\u066B\u3001\uFE10\uFE11\uFE50\uFE51\uFF0C\uFF64]]"
109                             + "[:close=compatibility:]]",
110 
111                     /*
112                      * "[[Åå{fi}][:close=canonical:]]",
113                      * "[[Åå{fi}][:close=compatibility:]]",
114                      * "[[Åå{fi}][:reduce=case:]]",
115                      * "[[Åå{fi}][:reduce=canonical:]]",
116                      * "[[Åå{fi}][:reduce=compatibility:]]",
117                      */
118                 }) {
119             ParsePosition p = new ParsePosition(0);
120             UnicodeSet set = new UnicodeSet(test, p, sym);
121             UnicodeSet codes = set.complement().complement();
122             System.out.println(
123                     test
124                             + CldrUtility.LINE_SEPARATOR
125                             + codes.toPattern(true)
126                             + CldrUtility.LINE_SEPARATOR
127                             + bf.showSetNames(set.complement().complement())
128                             + CldrUtility.LINE_SEPARATOR);
129         }
130         if (true) return;
131 
132         StandardCodes sc = StandardCodes.make();
133         for (String s : new String[] {"language", "script", "territory"}) {
134             System.out.println(s + ":\t" + sc.getGoodAvailableCodes(s).size());
135         }
136         if (true) return;
137 
138         Set<Foo> inFileOrder = EnumSet.allOf(Foo.class);
139         List<Foo> inAlphaOrder = Lists.sortedCopy(inFileOrder);
140         System.out.println(inFileOrder);
141         System.out.println(inAlphaOrder);
142 
143         DecimalFormat currencyFormat =
144                 (DecimalFormat) NumberFormat.getCurrencyInstance(new ULocale("de-CH"));
145         currencyFormat.setCurrency(SWISS_FRANC);
146         // sometime later...
147         // we want the financial format of the currency, not the retail format
148         System.out.println("Retail:\t" + currencyFormat.format(123.53));
149         BigDecimal increment = currencyFormat.getRoundingIncrement();
150         System.out.println("Rounding Increment:\t" + increment);
151         double double_increment = increment.doubleValue();
152         System.out.println("Double rounding Increment:\t" + double_increment);
153         double log = Math.log10(double_increment);
154         System.out.println("Double log:\t" + log);
155         double new_increment = Math.pow(10, Math.floor(log));
156         System.out.println("Floored Increment:\t" + new_increment);
157         currencyFormat.setRoundingIncrement(new_increment);
158         System.out.println("Financial:\t" + currencyFormat.format(123.53));
159 
160         if (true) return;
161 
162         testWeights();
163         if (true) return;
164 
165         testScripts();
166         testToRegex();
167         // checkEastAsianWidth();
168         if (true) return;
169         // import ICU
170         UnicodeSet RTL =
171                 new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]");
172 
173         checkCollections();
174 
175         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
176         CLDRFile englishFile = cldrFactory.make("en", true);
177         ExampleGenerator eg = new ExampleGenerator(englishFile, englishFile);
178         System.out.println(
179                 eg.getHelpHtml(
180                         "//ldml/numbers/currencyFormats/currencyFormatLength/currencyFormat[@type=\"standard\"]/pattern[@type=\"standard\"][@draft=\"provisional\"]",
181                         ""));
182         System.out.println(eg.getHelpHtml("/exemplarCharacters", ""));
183         System.out.println(eg.getHelpHtml("/calendar/pattern", ""));
184 
185         if (true) return;
186         Set<String> s = new HashSet<>(Arrays.asList("a", "A", "c"));
187         Collator caselessCompare = Collator.getInstance(Locale.ENGLISH);
188         caselessCompare.setStrength(Collator.PRIMARY);
189         Set<String> t = new TreeSet<>(caselessCompare);
190         t.addAll(Arrays.asList("a", "b", "c"));
191         System.out.println("s equals t: " + s.equals(t));
192         System.out.println("t equals s: " + t.equals(s));
193 
194         Set<String> u = Collections.unmodifiableSet(t);
195         System.out.println("s==t " + (s.equals(t)));
196         System.out.println("s==u " + (s.equals(u)));
197         UnicodeSet x = new UnicodeSet("[a-z]");
198         UnicodeSet y = new UnicodeSet("[a-z]").freeze();
199         System.out.println("x==y " + (x.equals(y)));
200         // showEnglish();
201         // checkPrivateUse();
202         // testPopulous();
203         // checkDistinguishing();
204         // checkEastAsianWidth();
205         // checkEnglishPaths();
206         System.out.println("Done");
207     }
208 
checkAliases()209     private static void checkAliases() {
210         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
211         CLDRFile en = cldrFactory.make("root", true);
212         Status status = new Status();
213         Matcher m = PatternCache.get("gregorian.*dayPeriods").matcher("");
214         for (Iterator<String> it = en.iterator(null, en.getComparator()); it.hasNext(); ) {
215             String path = it.next();
216             if (!m.reset(path).find()) {
217                 continue;
218             }
219             // String locale = en.getSourceLocaleID(path, status);
220             String value = en.getStringValue(path);
221             String fullPath = en.getFullXPath(path);
222             System.out.println("value:\t" + value + "\tpath:\t" + fullPath);
223             if (!path.equals(status.pathWhereFound)) {
224                 System.out.println("\torigin:\t" + status);
225             }
226             // System.out.println("locale:\t" + locale);
227             System.out.println();
228         }
229     }
230 
testWeights()231     private static void testWeights() {
232         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
233         CLDRFile english = cldrFactory.make("en", true);
234         Set<Pair<Integer, String>> rel = new TreeSet<>();
235         for (String desiredLocale : cldrFactory.getAvailable()) {
236             int vote = Level.getDefaultWeight("google", desiredLocale);
237             rel.add(new Pair<>(vote, desiredLocale));
238         }
239         for (Pair<Integer, String> p : rel) {
240             System.out.println(p + "\t" + english.getName(p.getSecond()));
241         }
242     }
243 
testScripts()244     private static void testScripts() {
245         BagFormatter bf = new BagFormatter();
246 
247         UnicodeSet caseFolded = new UnicodeSet();
248         UnicodeSet simpleCaseFolded = new UnicodeSet();
249         for (int i = 0; i < 0x10FFFF; ++i) {
250             String form = UTF16.valueOf(i);
251             if (UCharacter.foldCase(form, true).equals(form)) {
252                 caseFolded.add(i);
253             }
254             if (UCharacter.foldCase(i, true) == i) {
255                 simpleCaseFolded.add(i);
256             }
257         }
258         caseFolded.freeze();
259         simpleCaseFolded.freeze();
260 
261         UnicodeSet functionalExceptCase =
262                 new UnicodeSet(
263                                 "["
264                                         + "[:L:][:Mc:][:Mn:][:Nd:]"
265                                         + "&[:^NFKC_QuickCheck=No:]"
266                                         + "&[:^default_ignorable_code_point:]]")
267                         .freeze();
268 
269         UnicodeSet asciiIdn = new UnicodeSet("[-A-Z0-9]").freeze();
270 
271         UnicodeSet archaic =
272                 new UnicodeSet(
273                                 "["
274                                         + "[:script=Bugi:]"
275                                         + "[:script=Copt:]"
276                                         + "[:script=Cprt:]"
277                                         + "[:script=Dsrt:]"
278                                         + "[:script=Glag:]"
279                                         + "[:script=Goth:]"
280                                         + "[:script=Hano:]"
281                                         + "[:script=Ital:]"
282                                         + "[:script=Khar:]"
283                                         + "[:script=Linb:]"
284                                         + "[:script=Ogam:]"
285                                         + "[:script=Osma:]"
286                                         + "[:script=Phag:]"
287                                         + "[:script=Phnx:]"
288                                         + "[:script=Runr:]"
289                                         + "[:script=Shaw:]"
290                                         + "[:script=Sylo:]"
291                                         + "[:script=Syrc:]"
292                                         + "[:script=Tagb:]"
293                                         + "[:script=Tglg:]"
294                                         + "[:script=Ugar:]"
295                                         + "[:script=Xpeo:]"
296                                         + "[:script=Xsux:]"
297                                         +
298                                         // "[:script=Arab:]" +
299                                         // "[:script=Armn:]" +
300                                         // "[:script=Beng:]" +
301                                         // "[:script=Bopo:]" +
302                                         "[:block=Combining_Diacritical_Marks _for_Symbols:]"
303                                         + "[:block=Musical_Symbols:]"
304                                         + "[:block=Ancient_Greek_Musical_Notation:]]")
305                         .freeze();
306 
307         System.out.println("functionalExceptCase: " + functionalExceptCase);
308         System.out.println("archaic: " + archaic);
309 
310         System.out.println(
311                 "SimpleCaseFolded & !CaseFolded & Functional & !Archaic:"
312                         + CldrUtility.LINE_SEPARATOR
313                         + bf.showSetNames(
314                                 new UnicodeSet(simpleCaseFolded)
315                                         .removeAll(caseFolded)
316                                         .retainAll(functionalExceptCase)
317                                         .removeAll(archaic)
318                                         .removeAll(asciiIdn)));
319 
320         UnicodeSet functional = new UnicodeSet(functionalExceptCase).retainAll(caseFolded).freeze();
321         System.out.println("functional: " + functional.size());
322         UnicodeSet functionalAndNotArchaic = new UnicodeSet(functional).removeAll(archaic).freeze();
323         System.out.println("archaic: " + archaic.size());
324         System.out.println("functionalAndNotArchaic: " + functionalAndNotArchaic.size());
325 
326         // System.out.println(bf.showSetNames("Case Folded", caseFolded,"Simple Case Folded",
327         // simpleCaseFolded));
328 
329         UnicodeSet functionalCommon =
330                 new UnicodeSet("[:script=common:]")
331                         .retainAll(functional)
332                         .removeAll(archaic)
333                         .removeAll(asciiIdn);
334         System.out.println(
335                 "Common & Functional & !Archaic:"
336                         + CldrUtility.LINE_SEPARATOR
337                         + bf.showSetNames(functionalCommon));
338 
339         UnicodeSet functionalInherited =
340                 new UnicodeSet("[:script=inherited:]")
341                         .retainAll(functional)
342                         .removeAll(archaic)
343                         .removeAll(asciiIdn);
344         System.out.println(
345                 "Inherited & Functional & !Archaic:"
346                         + CldrUtility.LINE_SEPARATOR
347                         + bf.showSetNames(functionalInherited));
348 
349         UnicodeSet nl = new UnicodeSet("[:Nl:]").retainAll(functional).removeAll(archaic);
350         System.out.println(
351                 "Nl:" + CldrUtility.LINE_SEPARATOR + bf.showSetNames(new UnicodeSet("[:Nl:]")));
352         System.out.println(
353                 "Nl & Functional & !Archaic:" + CldrUtility.LINE_SEPARATOR + bf.showSetNames(nl));
354 
355         UnicodeSet restrictedXidContinue =
356                 new UnicodeSet(
357                                 "[[:xid_continue:]"
358                                         + "&[:^NFKC_QuickCheck=No:]"
359                                         + "&[:^default_ignorable_code_point:]"
360                                         + "&[:^Pc:]]")
361                         .retainAll(caseFolded);
362 
363         System.out.println(
364                 bf.showSetDifferences(
365                         "IDNA Functional",
366                         functional,
367                         "Unicode XID & NFKC &!DefaultIgnorable &! Pc",
368                         restrictedXidContinue));
369 
370         Transliterator t = Transliterator.getInstance("lower");
371         System.out.println("ABC " + t.transliterate("ABC"));
372         /*
373          * generalCategory(cp) is {Ll, Lu, Lo, Lm, Mn, Mc, Nd}, AND
374          * NFKC(cp) == cp, AND
375          * casefold(cp) == cp, AND
376          * !defaultIgnorableCodePoint(cp)
377          */
378         BitSet scripts = new BitSet();
379         for (int cp = 0; cp < 0x10FFFF; ++cp) {
380             int script = UScript.getScript(cp);
381             if (script == UScript.COMMON
382                     || script == UScript.UNKNOWN
383                     || script == UScript.INHERITED) {
384                 continue;
385             }
386             scripts.set(script);
387         }
388         Set<String> toPrint = new TreeSet<>();
389         for (int script = 0; script < scripts.size(); ++script) {
390             if (!scripts.get(script)) continue;
391             String code = UScript.getShortName(script);
392             String name = UScript.getName(script);
393             if (StandardCodes.isScriptModern(code)) {
394                 toPrint.add("modern\t" + code + "\t" + name);
395             } else {
396                 toPrint.add("archaic\t" + code + "\t" + name);
397             }
398         }
399         for (String line : toPrint) {
400             System.out.println(line);
401         }
402     }
403 
checkCollections()404     private static void checkCollections() {
405         System.out.println("Collections");
406         new org.unicode.cldr.util.CldrUtility.Apply<String>() {
407             @Override
408             public void apply(String item) {
409                 if (Iso639Data.getScope(item.toString()) != Scope.Collection) return;
410                 System.out.println(item + "\t" + CldrUtility.join(Iso639Data.getNames(item), ", "));
411             }
412         }.applyTo(Iso639Data.getAvailable());
413         System.out.println(CldrUtility.LINE_SEPARATOR + "Macrolanguages");
414         new org.unicode.cldr.util.CldrUtility.Apply<String>() {
415             @Override
416             public void apply(String item) {
417                 if (Iso639Data.getScope(item.toString()) != Scope.Macrolanguage) return;
418                 System.out.println(item + "\t" + CldrUtility.join(Iso639Data.getNames(item), ", "));
419             }
420         }.applyTo(Iso639Data.getAvailable());
421     }
422 
testToRegex()423     static void testToRegex() {
424         String[] tests = {
425             "\\-",
426             "a",
427             "d-f",
428             "\\u2000",
429             "\\uAC00-\\uAC12",
430             "{AB}",
431             "{CDE}",
432             "\\uFFF0-\\U0010000F",
433             "\\U0010100F-\\U0010300F"
434         }; // }; //
435         for (int i = (1 << tests.length) - 1; i >= 0; --i) {
436             String test = "[";
437             for (int j = 0; j < tests.length; ++j) {
438                 if ((i & (1 << j)) != 0) {
439                     test += tests[j];
440                 }
441             }
442             test += "]";
443             testToRegex(new UnicodeSet(test));
444         }
445     }
446 
testToRegex(UnicodeSet test)447     private static void testToRegex(UnicodeSet test) {
448         String formatted = CldrUtility.toRegex(test);
449         System.out.println(test + "\t->\t" + formatted);
450         Matcher newTest = PatternCache.get(formatted).matcher("");
451         UnicodeSet failures = new UnicodeSet();
452         for (UnicodeSetIterator it = new UnicodeSetIterator(test); it.next(); ) {
453             if (!newTest.reset(it.getString()).matches()) {
454                 failures.add(it.getString());
455             }
456         }
457         if (failures.size() != 0) {
458             System.out.println("\tFailed on: " + failures);
459         }
460         System.out.flush();
461     }
462 
checkEastAsianWidth()463     static void checkEastAsianWidth() {
464         UnicodeSet dontCares = new UnicodeSet("[[:surrogate:][:unassigned:][:control:]]").freeze();
465         UnicodeSet dontCares2 = new UnicodeSet("[:^letter:]").freeze();
466 
467         // UnicodeSet wide = new
468         // UnicodeSet("[[:East_Asian_Width=wide:][:East_Asian_Width=fullwidth:][:Co:]]"); //
469         // remove supplementaries
470         // System.out.format("Wide %s" + Utility.LINE_SEPARATOR + "" + Utility.LINE_SEPARATOR,
471         // wide);
472         // System.out.format("Wide(spanned) %s" + Utility.LINE_SEPARATOR + "" +
473         // Utility.LINE_SEPARATOR,
474         // Utility.addDontCareSpans(wide, dontCares));
475         // UnicodeSet zeroWidth = new
476         // UnicodeSet("[[:default_ignorable_code_point:][:Mn:][:Me:]-[:Noncharacter_Code_Point:]-[:Cc:]]"); // remove
477         // supplementaries
478         // System.out.format("ZeroWidth %s" + Utility.LINE_SEPARATOR + "" + Utility.LINE_SEPARATOR,
479         // zeroWidth);
480         // System.out.format("ZeroWidth(spanned) %s" + Utility.LINE_SEPARATOR + "" +
481         // Utility.LINE_SEPARATOR,
482         // Utility.addDontCareSpans(zeroWidth, dontCares));
483 
484         // P2. In each paragraph, find the first character of type L, AL, or R.
485         UnicodeSet strongL = new UnicodeSet("[[:BidiClass=L:]-[:unassigned:]]").freeze(); //
486         showSpans("Bidi L", strongL, dontCares);
487         showSpans("Bidi L*", strongL, dontCares2);
488 
489         UnicodeSet strongRAL =
490                 new UnicodeSet("[[:BidiClass=R:][:BidiClass=AL:]-[:unassigned:]]").freeze();
491         showSpans("Bidi R,AL", strongRAL, dontCares);
492         showSpans("Bidi R,AL*", strongRAL, dontCares2);
493 
494         UnicodeSet strong =
495                 new UnicodeSet("[[:BidiClass=L:][:BidiClass=R:][:BidiClass=AL:]-[:unassigned:]]")
496                         .freeze();
497         showSpans("Strong", strong, dontCares);
498         showSpans("Strong*", strong, dontCares2);
499     }
500 
showSpans(String title, UnicodeSet sourceSet, UnicodeSet dontCares)501     private static void showSpans(String title, UnicodeSet sourceSet, UnicodeSet dontCares) {
502         System.out.println(title);
503         System.out.format("\tSource Set: %s" + CldrUtility.LINE_SEPARATOR, sourceSet);
504         System.out.format("\tDon't Cares: %s" + CldrUtility.LINE_SEPARATOR, dontCares);
505         UnicodeSet spanned = new UnicodeSet(sourceSet).addBridges(dontCares);
506         spanned = spanned.complement().complement();
507         String spannedString = spanned.toString();
508         String unescapedString = spanned.toPattern(false);
509         System.out.format("\tRanges: %d" + CldrUtility.LINE_SEPARATOR, spanned.getRangeCount());
510         System.out.format("\tStrlen(\\u): %d" + CldrUtility.LINE_SEPARATOR, spannedString.length());
511         System.out.format(
512                 "\tStrlen(!\\u): %d" + CldrUtility.LINE_SEPARATOR, unescapedString.length());
513         String title2 = "Result";
514         String sample = spannedString;
515         if (false) {
516             if (sample.length() > 60) {
517                 title2 = "Sample";
518                 sample = sample.substring(0, 60) + " ...";
519             }
520         }
521         System.out.format("\t%s: %s" + CldrUtility.LINE_SEPARATOR, title2, sample);
522         System.out.println();
523     }
524 
    /**
     * Code points for assorted CJK marks (ideographic and Hangul tone marks, kana repeat,
     * voicing and prolonged-sound marks). Each entry's trailing comment gives the character
     * name and its general category.
     */
    static int[] extraCJK = {
        0x3006, // IDEOGRAPHIC CLOSING MARK;Lo
        0x302A, // IDEOGRAPHIC LEVEL TONE MARK;Mn
        0x302B, // IDEOGRAPHIC RISING TONE MARK;Mn
        0x302C, // IDEOGRAPHIC DEPARTING TONE MARK;Mn
        0x302D, // IDEOGRAPHIC ENTERING TONE MARK;Mn
        0x302E, // HANGUL SINGLE DOT TONE MARK;Mn
        0x302F, // HANGUL DOUBLE DOT TONE MARK;Mn
        0x3031, // VERTICAL KANA REPEAT MARK;Lm
        0x3032, // VERTICAL KANA REPEAT WITH VOICED SOUND MARK;Lm
        0x3033, // VERTICAL KANA REPEAT MARK UPPER HALF;Lm
        0x3034, // VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF;Lm
        0x3035, // VERTICAL KANA REPEAT MARK LOWER HALF;Lm
        0x303C, // MASU MARK;Lo
        0x3099, // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn
        0x309A, // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn
        0x309B, // KATAKANA-HIRAGANA VOICED SOUND MARK;Sk
        0x309C, // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Sk
        0x30A0, // KATAKANA-HIRAGANA DOUBLE HYPHEN;Pd
        0x30FC, // KATAKANA-HIRAGANA PROLONGED SOUND MARK;Lm
        0xFF70, // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK;Lm
        0xFF9E, // HALFWIDTH KATAKANA VOICED SOUND MARK;Lm
        0xFF9F, // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm
    };
549 
    /**
     * Empty placeholder: the body contains only a note and performs no work.
     * NOTE(review): presumably intended to check the scripts named in the note; confirm
     * whether it is still needed.
     */
    void checkCFK() {
        // UnicodeSet Han, Hangul, Hiragana, Katakana, or Bopomofo
    }
553 
checkDistinguishing()554     private static void checkDistinguishing() {
555         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
556         Set<String> cldrFiles = cldrFactory.getAvailableLanguages();
557         Set<String> distinguishing = new TreeSet<>();
558         Set<String> nondistinguishing = new TreeSet<>();
559         for (Iterator<String> it = cldrFiles.iterator(); it.hasNext(); ) {
560             CLDRFile cldrFile = cldrFactory.make(it.next(), false);
561             DtdType dtdType = null;
562             if (cldrFile.isNonInheriting()) {
563                 continue;
564             }
565             for (Iterator<String> it2 = cldrFile.iterator(); it2.hasNext(); ) {
566                 String path = it2.next();
567                 if (dtdType == null) {
568                     dtdType = DtdType.fromPath(path);
569                 }
570                 String fullPath = cldrFile.getFullXPath(path);
571                 if (path.equals(fullPath)) {
572                     continue;
573                 }
574                 XPathParts parts = XPathParts.getFrozenInstance(fullPath);
575                 for (int i = 0; i < parts.size(); ++i) {
576                     Map<String, String> m = parts.getAttributes(i);
577                     if (m.size() == 0) {
578                         continue;
579                     }
580                     String element = parts.getElement(i);
581                     for (Iterator<String> mit = m.keySet().iterator(); mit.hasNext(); ) {
582                         String attribute = mit.next();
583                         if (CLDRFile.isDistinguishing(dtdType, element, attribute)) {
584                             distinguishing.add(attribute + "\tD\t" + element);
585                         } else {
586                             nondistinguishing.add(attribute + "\tN\t" + element);
587                         }
588                     }
589                 }
590             }
591         }
592         System.out.println("Distinguishing");
593         for (Iterator<String> it = distinguishing.iterator(); it.hasNext(); ) {
594             System.out.println(it.next());
595         }
596         System.out.println();
597         System.out.println("Non-Distinguishing");
598         for (Iterator<String> it = nondistinguishing.iterator(); it.hasNext(); ) {
599             System.out.println(it.next());
600         }
601     }
602 
showEnglish()603     private static void showEnglish() {
604         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
605         String requestedLocale = "en";
606         CLDRFile cldrFile = cldrFactory.make(requestedLocale, true);
607         CLDRFile.Status status = new CLDRFile.Status();
608         for (Iterator<String> it = cldrFile.iterator(); it.hasNext(); ) {
609             String requestedPath = it.next();
610             String localeWhereFound = cldrFile.getSourceLocaleID(requestedPath, status);
611             if (!localeWhereFound.equals(requestedLocale)
612                     || !status.pathWhereFound.equals(requestedPath)) {
613                 System.out.println(
614                         "requested path:\t"
615                                 + requestedPath
616                                 + "\tfound locale:\t"
617                                 + localeWhereFound
618                                 + "\tsame?\t"
619                                 + localeWhereFound.equals(requestedLocale)
620                                 + "\tfound path:\t"
621                                 + status.pathWhereFound
622                                 + "\tsame?\t"
623                                 + status.pathWhereFound.equals(requestedPath));
624             }
625         }
626     }
627 
checkPrivateUse()628     private static void checkPrivateUse() {
629         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
630         String requestedLocale = "en";
631         CLDRFile cldrFile = cldrFactory.make(requestedLocale, true);
632         StandardCodes sc = StandardCodes.make();
633         Set<String> careAbout =
634                 new HashSet<>(
635                         Arrays.asList(new String[] {"language", "script", "territory", "variant"}));
636         HashMap<String, Set<String>> foundItems = new HashMap<>();
637         TreeSet<String> problems = new TreeSet<>();
638         for (Iterator<String> it =
639                         cldrFile.iterator("", new UTF16.StringComparator(true, false, 0));
640                 it.hasNext(); ) {
641             String requestedPath = it.next();
642             XPathParts parts = XPathParts.getFrozenInstance(requestedPath);
643             String element = parts.getElement(-1);
644             if (!careAbout.contains(element)) {
645                 continue;
646             }
647             String type = parts.getAttributeValue(-1, "type");
648             if (type == null) {
649                 continue;
650             }
651             Set<String> foundSet = foundItems.get(element);
652             if (foundSet == null) {
653                 foundItems.put(element, foundSet = new TreeSet<>());
654             }
655             foundSet.add(type);
656 
657             List<String> data = sc.getFullData(element, type);
658             if (data == null) {
659                 problems.add(
660                         "No RFC3066bis data for: "
661                                 + element
662                                 + "\t"
663                                 + type
664                                 + "\t"
665                                 + cldrFile.getStringValue(requestedPath));
666                 continue;
667             }
668             if (isPrivateOrDeprecated(data)) {
669                 problems.add(
670                         "Private/Deprecated Data for: "
671                                 + element
672                                 + "\t"
673                                 + type
674                                 + "\t"
675                                 + cldrFile.getStringValue(requestedPath)
676                                 + "\t"
677                                 + data);
678             }
679             // String canonical_value = (String)data.get(2);
680         }
681         for (Iterator<String> it = problems.iterator(); it.hasNext(); ) {
682             System.out.println(it.next());
683         }
684         for (Iterator<String> it = careAbout.iterator(); it.hasNext(); ) {
685             String element = it.next();
686             Set<String> real = sc.getAvailableCodes(element);
687             Set<String> notFound = new TreeSet<>(real);
688             notFound.removeAll(foundItems.get(element));
689             for (Iterator<String> it2 = notFound.iterator(); it2.hasNext(); ) {
690                 String type = it2.next();
691                 List<String> data = sc.getFullData(element, type);
692                 if (isPrivateOrDeprecated(data)) continue;
693                 System.out.println(
694                         "Missing Translation for: " + element + "\t" + type + "\t" + "\t" + data);
695             }
696         }
697     }
698 
isPrivateOrDeprecated(List<String> data)699     static boolean isPrivateOrDeprecated(List<String> data) {
700         if (data.toString().indexOf("PRIVATE") >= 0) {
701             return true;
702         }
703         if ("PRIVATE USE".equals(data.get(0))) return true;
704         if (data.size() < 3) return false;
705         if (data.get(2) == null) return false;
706         if (data.get(2).toString().length() != 0) return true;
707         return false;
708     }
709 
testPopulous()710     static void testPopulous() {
711         Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
712         CLDRFile supp = cldrFactory.make("supplementalData", false);
713         CLDRFile temp = SimpleFactory.makeFile("supplemental");
714         temp.setNonInheriting(true);
715         for (Iterator<String> it = supp.iterator(null, supp.getComparator()); it.hasNext(); ) {
716             String path = it.next();
717             String value = supp.getStringValue(path);
718             String fullPath = supp.getFullXPath(path);
719             XPathParts parts = XPathParts.getFrozenInstance(fullPath);
720             String type = parts.getAttributeValue(-1, "type");
721             String pop = language_territory_hack_map.get(type);
722             if (pop != null) {
723                 parts = parts.cloneAsThawed();
724                 parts.putAttributeValue(-1, "mostPopulousTerritory", pop);
725                 fullPath = parts.toString();
726             }
727             temp.add(fullPath, value);
728         }
729         PrintWriter pw = new PrintWriter(System.out);
730         temp.write(pw);
731         pw.close();
732     }
733 
734     private static final Map<String, String> language_territory_hack_map = new HashMap<>();
735     private static final String[][] language_territory_hack = {
736         {"af", "ZA"},
737         {"am", "ET"},
738         {"ar", "SA"},
739         {"as", "IN"},
740         {"ay", "PE"},
741         {"az", "AZ"},
742         {"bal", "PK"},
743         {"be", "BY"},
744         {"bg", "BG"},
745         {"bn", "IN"},
746         {"bs", "BA"},
747         {"ca", "ES"},
748         {"ch", "MP"},
749         {"cpe", "SL"},
750         {"cs", "CZ"},
751         {"cy", "GB"},
752         {"da", "DK"},
753         {"de", "DE"},
754         {"dv", "MV"},
755         {"dz", "BT"},
756         {"el", "GR"},
757         {"en", "US"},
758         {"es", "ES"},
759         {"et", "EE"},
760         {"eu", "ES"},
761         {"fa", "IR"},
762         {"fi", "FI"},
763         {"fil", "PH"},
764         {"fj", "FJ"},
765         {"fo", "FO"},
766         {"fr", "FR"},
767         {"ga", "IE"},
768         {"gd", "GB"},
769         {"gl", "ES"},
770         {"gn", "PY"},
771         {"gu", "IN"},
772         {"gv", "GB"},
773         {"ha", "NG"},
774         {"he", "IL"},
775         {"hi", "IN"},
776         {"ho", "PG"},
777         {"hr", "HR"},
778         {"ht", "HT"},
779         {"hu", "HU"},
780         {"hy", "AM"},
781         {"id", "ID"},
782         {"is", "IS"},
783         {"it", "IT"},
784         {"ja", "JP"},
785         {"ka", "GE"},
786         {"kk", "KZ"},
787         {"kl", "GL"},
788         {"km", "KH"},
789         {"kn", "IN"},
790         {"ko", "KR"},
791         {"kok", "IN"},
792         {"ks", "IN"},
793         {"ku", "TR"},
794         {"ky", "KG"},
795         {"la", "VA"},
796         {"lb", "LU"},
797         {"ln", "CG"},
798         {"lo", "LA"},
799         {"lt", "LT"},
800         {"lv", "LV"},
801         {"mai", "IN"},
802         {"men", "GN"},
803         {"mg", "MG"},
804         {"mh", "MH"},
805         {"mk", "MK"},
806         {"ml", "IN"},
807         {"mn", "MN"},
808         {"mni", "IN"},
809         {"mo", "MD"},
810         {"mr", "IN"},
811         {"ms", "MY"},
812         {"mt", "MT"},
813         {"my", "MM"},
814         {"na", "NR"},
815         {"nb", "NO"},
816         {"nd", "ZA"},
817         {"ne", "NP"},
818         {"niu", "NU"},
819         {"nl", "NL"},
820         {"nn", "NO"},
821         {"no", "NO"},
822         {"nr", "ZA"},
823         {"nso", "ZA"},
824         {"ny", "MW"},
825         {"om", "KE"},
826         {"or", "IN"},
827         {"pa", "IN"},
828         {"pau", "PW"},
829         {"pl", "PL"},
830         {"ps", "PK"},
831         {"pt", "BR"},
832         {"qu", "PE"},
833         {"rn", "BI"},
834         {"ro", "RO"},
835         {"ru", "RU"},
836         {"rw", "RW"},
837         {"sd", "IN"},
838         {"sg", "CF"},
839         {"si", "LK"},
840         {"sk", "SK"},
841         {"sl", "SI"},
842         {"sm", "WS"},
843         {"so", "DJ"},
844         {"sq", "CS"},
845         {"sr", "CS"},
846         {"ss", "ZA"},
847         {"st", "ZA"},
848         {"sv", "SE"},
849         {"sw", "KE"},
850         {"ta", "IN"},
851         {"te", "IN"},
852         {"tem", "SL"},
853         {"tet", "TL"},
854         {"th", "TH"},
855         {"ti", "ET"},
856         {"tg", "TJ"},
857         {"tk", "TM"},
858         {"tkl", "TK"},
859         {"tvl", "TV"},
860         {"tl", "PH"},
861         {"tn", "ZA"},
862         {"to", "TO"},
863         {"tpi", "PG"},
864         {"tr", "TR"},
865         {"ts", "ZA"},
866         {"uk", "UA"},
867         {"ur", "IN"},
868         {"uz", "UZ"},
869         {"ve", "ZA"},
870         {"vi", "VN"},
871         {"wo", "SN"},
872         {"xh", "ZA"},
873         {"zh", "CN"},
874         {"zh_Hant", "TW"},
875         {"zu", "ZA"},
876         {"aa", "ET"},
877         {"byn", "ER"},
878         {"eo", "DE"},
879         {"gez", "ET"},
880         {"haw", "US"},
881         {"iu", "CA"},
882         {"kw", "GB"},
883         {"sa", "IN"},
884         {"sh", "HR"},
885         {"sid", "ET"},
886         {"syr", "SY"},
887         {"tig", "ER"},
888         {"tt", "RU"},
889         {"wal", "ET"},
890     };
891 
892     static {
893         for (int i = 0; i < language_territory_hack.length; ++i) {
language_territory_hack_map.put( language_territory_hack[i][0], language_territory_hack[i][1])894             language_territory_hack_map.put(
895                     language_territory_hack[i][0], language_territory_hack[i][1]);
896         }
897     }
898 
899     static class MyXSymbolTable extends UnicodeSet.XSymbolTable {
900         static VariantFolder caseFolder = new VariantFolder(new CaseVariantFolder());
901         static VariantFolder canonicalFolder = new VariantFolder(new CanonicalFolder());
902         static VariantFolder compatibilityFolder = new VariantFolder(new CompatibilityFolder());
903 
904         @Override
applyPropertyAlias( String propertyName, String propertyValue, UnicodeSet result)905         public boolean applyPropertyAlias(
906                 String propertyName, String propertyValue, UnicodeSet result) {
907             if (propertyName.equalsIgnoreCase("close")) {
908                 if (propertyValue.equalsIgnoreCase("case")) {
909                     result.addAll(caseFolder.getClosure(result));
910                 } else if (propertyValue.equalsIgnoreCase("canonical")) {
911                     result.addAll(canonicalFolder.getClosure(result));
912                 } else if (propertyValue.equalsIgnoreCase("compatibility")) {
913                     result.addAll(compatibilityFolder.getClosure(result));
914                 }
915                 return true;
916             } else if (propertyName.equalsIgnoreCase("reduce")) {
917                 if (propertyValue.equalsIgnoreCase("case")) {
918                     UnicodeSet temp = caseFolder.reduce(result);
919                     result.clear().addAll(temp);
920                 } else if (propertyValue.equalsIgnoreCase("canonical")) {
921                     UnicodeSet temp = canonicalFolder.reduce(result);
922                     result.clear().addAll(temp);
923                 } else if (propertyValue.equalsIgnoreCase("compatibility")) {
924                     UnicodeSet temp = compatibilityFolder.reduce(result);
925                     result.clear().addAll(temp);
926                 }
927                 return true;
928             } else if (propertyName.equalsIgnoreCase("reduceCase")) {
929                 UnicodeSet temp =
930                         caseFolder.reduce(new UnicodeSet(propertyValue.replace("·]", ":]")));
931                 result.clear().addAll(temp);
932                 return true;
933             } else if (propertyName.equalsIgnoreCase("reduceCanonical")) {
934                 UnicodeSet temp =
935                         canonicalFolder.reduce(new UnicodeSet(propertyValue.replace("·]", ":]")));
936                 result.clear().addAll(temp);
937                 return true;
938             } else if (propertyName.equalsIgnoreCase("reduceCase")) {
939                 UnicodeSet temp =
940                         caseFolder.reduce(new UnicodeSet(propertyValue.replace("·]", ":]")));
941                 result.clear().addAll(temp);
942                 return true;
943             }
944             return false;
945         }
946     }
947 }
948