1 package org.unicode.cldr.test; 2 3 import com.ibm.icu.lang.UCharacter; 4 import com.ibm.icu.lang.UScript; 5 import com.ibm.icu.text.Collator; 6 import com.ibm.icu.text.DecimalFormat; 7 import com.ibm.icu.text.NumberFormat; 8 import com.ibm.icu.text.Transliterator; 9 import com.ibm.icu.text.UTF16; 10 import com.ibm.icu.text.UnicodeSet; 11 import com.ibm.icu.text.UnicodeSetIterator; 12 import com.ibm.icu.util.Currency; 13 import com.ibm.icu.util.ULocale; 14 import java.io.PrintWriter; 15 import java.math.BigDecimal; 16 import java.text.ParsePosition; 17 import java.util.ArrayList; 18 import java.util.Arrays; 19 import java.util.BitSet; 20 import java.util.Collection; 21 import java.util.Collections; 22 import java.util.EnumSet; 23 import java.util.HashMap; 24 import java.util.HashSet; 25 import java.util.Iterator; 26 import java.util.List; 27 import java.util.Locale; 28 import java.util.Map; 29 import java.util.Set; 30 import java.util.TreeSet; 31 import java.util.regex.Matcher; 32 import org.unicode.cldr.util.CLDRFile; 33 import org.unicode.cldr.util.CLDRFile.Status; 34 import org.unicode.cldr.util.CLDRPaths; 35 import org.unicode.cldr.util.CldrUtility; 36 import org.unicode.cldr.util.DtdType; 37 import org.unicode.cldr.util.Factory; 38 import org.unicode.cldr.util.Iso639Data; 39 import org.unicode.cldr.util.Iso639Data.Scope; 40 import org.unicode.cldr.util.Level; 41 import org.unicode.cldr.util.Pair; 42 import org.unicode.cldr.util.PatternCache; 43 import org.unicode.cldr.util.SimpleFactory; 44 import org.unicode.cldr.util.StandardCodes; 45 import org.unicode.cldr.util.VariantFolder; 46 import org.unicode.cldr.util.VariantFolder.CanonicalFolder; 47 import org.unicode.cldr.util.VariantFolder.CaseVariantFolder; 48 import org.unicode.cldr.util.VariantFolder.CompatibilityFolder; 49 import org.unicode.cldr.util.XPathParts; 50 import org.unicode.cldr.util.props.BagFormatter; 51 52 public class TestMisc { 53 54 static Currency SWISS_FRANC = Currency.getInstance("CHF"); 55 56 static class Lists { sortedCopy(Collection<E> iterable)57 public static <E extends Comparable> List<E> sortedCopy(Collection<E> iterable) { 58 List<E> list = new ArrayList<>(); 59 list.addAll(iterable); 60 Collections.sort(list); 61 return list; 62 } 63 } 64 65 enum Foo { 66 A, 67 M, 68 Z 69 } 70 main(String[] args)71 public static void main(String[] args) { 72 73 checkAliases(); 74 if (true) return; 75 76 Transliterator en_ru = Transliterator.getInstance("en-ru"); 77 System.out.println("Mark + " + en_ru.transform("Mark")); 78 79 Transliterator latn_cyrl = Transliterator.getInstance("Latn-Cyrl"); 80 System.out.println("Mark + " + latn_cyrl.transform("Mark")); 81 82 Transliterator ulatn_ucyrl = Transliterator.getInstance("und_Latn-und_Cyrl"); 83 System.out.println("Mark + " + latn_cyrl.transform("Mark")); 84 85 Locale locale = 86 new Locale("abc-d αγζθ ?ef_g%hi", "abc-d αγζθ ?ef_g%hi", "abc-d αγζθ ?ef_g%hi"); 87 88 System.out.println( 89 "Locale locale = new Locale(\"abc-d αγζθ ?ef_g%hi\",\"abc-d αγζθ ?ef_g%hi\",\"abc-d αγζθ ?ef_g%hi\");"); 90 System.out.println("locale.toString() == \"" + locale + "\""); 91 92 MyXSymbolTable sym = new MyXSymbolTable(); 93 BagFormatter bf = new BagFormatter(); 94 for (String test : 95 new String[] { 96 "[:reduceCase=[[Åå{fi}]]:]", 97 "[:reduceCanonical=[[Åå{fi}]]:]", 98 "[[,٫.]]", 99 "[[,٫.][:close=compatibility:]]", 100 "[[\\ ,٬.']]", 101 "[[\\ ,٬.'][:close=compatibility:]]", 102 "[[\u002E\u2024\uFE52\uFF0E\u3002][:close=compatibility:]]", 103 "[[[\u002C \u002E \u066B \u2024 \u3002 \uFE52 \uFF0E、، \u002E \u2024 \uFE52 \uFF0E \u3002]-[\u002E\u2024\uFE52\uFF0E\u3002]][:close=compatibility:]]", 104 "[[" 105 + "\\u0020" 106 + "[, ٬ ..․﹒ '' \u2018 \u2019 ]" 107 + "-[.\u2024\u3002\uFE12\uFE52\uFF0E\uFF61]" 108 + "-[,\u060C\u066B\u3001\uFE10\uFE11\uFE50\uFE51\uFF0C\uFF64]]" 109 + "[:close=compatibility:]]", 110 111 /* 112 * "[[Åå{fi}][:close=canonical:]]", 113 * "[[Åå{fi}][:close=compatibility:]]", 114 * "[[Åå{fi}][:reduce=case:]]", 115 * "[[Åå{fi}][:reduce=canonical:]]", 116 * "[[Åå{fi}][:reduce=compatibility:]]", 117 */ 118 }) { 119 ParsePosition p = new ParsePosition(0); 120 UnicodeSet set = new UnicodeSet(test, p, sym); 121 UnicodeSet codes = set.complement().complement(); 122 System.out.println( 123 test 124 + CldrUtility.LINE_SEPARATOR 125 + codes.toPattern(true) 126 + CldrUtility.LINE_SEPARATOR 127 + bf.showSetNames(set.complement().complement()) 128 + CldrUtility.LINE_SEPARATOR); 129 } 130 if (true) return; 131 132 StandardCodes sc = StandardCodes.make(); 133 for (String s : new String[] {"language", "script", "territory"}) { 134 System.out.println(s + ":\t" + sc.getGoodAvailableCodes(s).size()); 135 } 136 if (true) return; 137 138 Set<Foo> inFileOrder = EnumSet.allOf(Foo.class); 139 List<Foo> inAlphaOrder = Lists.sortedCopy(inFileOrder); 140 System.out.println(inFileOrder); 141 System.out.println(inAlphaOrder); 142 143 DecimalFormat currencyFormat = 144 (DecimalFormat) NumberFormat.getCurrencyInstance(new ULocale("de-CH")); 145 currencyFormat.setCurrency(SWISS_FRANC); 146 // sometime later... 147 // we want the financial format of the currency, not the retail format 148 System.out.println("Retail:\t" + currencyFormat.format(123.53)); 149 BigDecimal increment = currencyFormat.getRoundingIncrement(); 150 System.out.println("Rounding Increment:\t" + increment); 151 double double_increment = increment.doubleValue(); 152 System.out.println("Double rounding Increment:\t" + double_increment); 153 double log = Math.log10(double_increment); 154 System.out.println("Double log:\t" + log); 155 double new_increment = Math.pow(10, Math.floor(log)); 156 System.out.println("Floored Increment:\t" + new_increment); 157 currencyFormat.setRoundingIncrement(new_increment); 158 System.out.println("Financial:\t" + currencyFormat.format(123.53)); 159 160 if (true) return; 161 162 testWeights(); 163 if (true) return; 164 165 testScripts(); 166 testToRegex(); 167 // checkEastAsianWidth(); 168 if (true) return; 169 // import ICU 170 UnicodeSet RTL = 171 new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]"); 172 173 checkCollections(); 174 175 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 176 CLDRFile englishFile = cldrFactory.make("en", true); 177 ExampleGenerator eg = new ExampleGenerator(englishFile, englishFile); 178 System.out.println( 179 eg.getHelpHtml( 180 "//ldml/numbers/currencyFormats/currencyFormatLength/currencyFormat[@type=\"standard\"]/pattern[@type=\"standard\"][@draft=\"provisional\"]", 181 "")); 182 System.out.println(eg.getHelpHtml("/exemplarCharacters", "")); 183 System.out.println(eg.getHelpHtml("/calendar/pattern", "")); 184 185 if (true) return; 186 Set<String> s = new HashSet<>(Arrays.asList("a", "A", "c")); 187 Collator caselessCompare = Collator.getInstance(Locale.ENGLISH); 188 caselessCompare.setStrength(Collator.PRIMARY); 189 Set<String> t = new TreeSet<>(caselessCompare); 190 t.addAll(Arrays.asList("a", "b", "c")); 191 System.out.println("s equals t: " + s.equals(t)); 192 System.out.println("t equals s: " + t.equals(s)); 193 194 Set<String> u = Collections.unmodifiableSet(t); 195 System.out.println("s==t " + (s.equals(t))); 196 System.out.println("s==u " + (s.equals(u))); 197 UnicodeSet x = new UnicodeSet("[a-z]"); 198 UnicodeSet y = new UnicodeSet("[a-z]").freeze(); 199 System.out.println("x==y " + (x.equals(y))); 200 // showEnglish(); 201 // checkPrivateUse(); 202 // testPopulous(); 203 // checkDistinguishing(); 204 // checkEastAsianWidth(); 205 // checkEnglishPaths(); 206 System.out.println("Done"); 207 } 208 checkAliases()209 private static void checkAliases() { 210 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 211 CLDRFile en = cldrFactory.make("root", true); 212 Status status = new Status(); 213 Matcher m = PatternCache.get("gregorian.*dayPeriods").matcher(""); 214 for (Iterator<String> it = en.iterator(null, en.getComparator()); it.hasNext(); ) { 215 String path = it.next(); 216 if (!m.reset(path).find()) { 217 continue; 218 } 219 // String locale = en.getSourceLocaleID(path, status); 220 String value = en.getStringValue(path); 221 String fullPath = en.getFullXPath(path); 222 System.out.println("value:\t" + value + "\tpath:\t" + fullPath); 223 if (!path.equals(status.pathWhereFound)) { 224 System.out.println("\torigin:\t" + status); 225 } 226 // System.out.println("locale:\t" + locale); 227 System.out.println(); 228 } 229 } 230 testWeights()231 private static void testWeights() { 232 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 233 CLDRFile english = cldrFactory.make("en", true); 234 Set<Pair<Integer, String>> rel = new TreeSet<>(); 235 for (String desiredLocale : cldrFactory.getAvailable()) { 236 int vote = Level.getDefaultWeight("google", desiredLocale); 237 rel.add(new Pair<>(vote, desiredLocale)); 238 } 239 for (Pair<Integer, String> p : rel) { 240 System.out.println(p + "\t" + english.getName(p.getSecond())); 241 } 242 } 243 testScripts()244 private static void testScripts() { 245 BagFormatter bf = new BagFormatter(); 246 247 UnicodeSet caseFolded = new UnicodeSet(); 248 UnicodeSet simpleCaseFolded = new UnicodeSet(); 249 for (int i = 0; i < 0x10FFFF; ++i) { 250 String form = UTF16.valueOf(i); 251 if (UCharacter.foldCase(form, true).equals(form)) { 252 caseFolded.add(i); 253 } 254 if (UCharacter.foldCase(i, true) == i) { 255 simpleCaseFolded.add(i); 256 } 257 } 258 caseFolded.freeze(); 259 simpleCaseFolded.freeze(); 260 261 UnicodeSet functionalExceptCase = 262 new UnicodeSet( 263 "[" 264 + "[:L:][:Mc:][:Mn:][:Nd:]" 265 + "&[:^NFKC_QuickCheck=No:]" 266 + "&[:^default_ignorable_code_point:]]") 267 .freeze(); 268 269 UnicodeSet asciiIdn = new UnicodeSet("[-A-Z0-9]").freeze(); 270 271 UnicodeSet archaic = 272 new UnicodeSet( 273 "[" 274 + "[:script=Bugi:]" 275 + "[:script=Copt:]" 276 + "[:script=Cprt:]" 277 + "[:script=Dsrt:]" 278 + "[:script=Glag:]" 279 + "[:script=Goth:]" 280 + "[:script=Hano:]" 281 + "[:script=Ital:]" 282 + "[:script=Khar:]" 283 + "[:script=Linb:]" 284 + "[:script=Ogam:]" 285 + "[:script=Osma:]" 286 + "[:script=Phag:]" 287 + "[:script=Phnx:]" 288 + "[:script=Runr:]" 289 + "[:script=Shaw:]" 290 + "[:script=Sylo:]" 291 + "[:script=Syrc:]" 292 + "[:script=Tagb:]" 293 + "[:script=Tglg:]" 294 + "[:script=Ugar:]" 295 + "[:script=Xpeo:]" 296 + "[:script=Xsux:]" 297 + 298 // "[:script=Arab:]" + 299 // "[:script=Armn:]" + 300 // "[:script=Beng:]" + 301 // "[:script=Bopo:]" + 302 "[:block=Combining_Diacritical_Marks _for_Symbols:]" 303 + "[:block=Musical_Symbols:]" 304 + "[:block=Ancient_Greek_Musical_Notation:]]") 305 .freeze(); 306 307 System.out.println("functionalExceptCase: " + functionalExceptCase); 308 System.out.println("archaic: " + archaic); 309 310 System.out.println( 311 "SimpleCaseFolded & !CaseFolded & Functional & !Archaic:" 312 + CldrUtility.LINE_SEPARATOR 313 + bf.showSetNames( 314 new UnicodeSet(simpleCaseFolded) 315 .removeAll(caseFolded) 316 .retainAll(functionalExceptCase) 317 .removeAll(archaic) 318 .removeAll(asciiIdn))); 319 320 UnicodeSet functional = new UnicodeSet(functionalExceptCase).retainAll(caseFolded).freeze(); 321 System.out.println("functional: " + functional.size()); 322 UnicodeSet functionalAndNotArchaic = new UnicodeSet(functional).removeAll(archaic).freeze(); 323 System.out.println("archaic: " + archaic.size()); 324 System.out.println("functionalAndNotArchaic: " + functionalAndNotArchaic.size()); 325 326 // System.out.println(bf.showSetNames("Case Folded", caseFolded,"Simple Case Folded", 327 // simpleCaseFolded)); 328 329 UnicodeSet functionalCommon = 330 new UnicodeSet("[:script=common:]") 331 .retainAll(functional) 332 .removeAll(archaic) 333 .removeAll(asciiIdn); 334 System.out.println( 335 "Common & Functional & !Archaic:" 336 + CldrUtility.LINE_SEPARATOR 337 + bf.showSetNames(functionalCommon)); 338 339 UnicodeSet functionalInherited = 340 new UnicodeSet("[:script=inherited:]") 341 .retainAll(functional) 342 .removeAll(archaic) 343 .removeAll(asciiIdn); 344 System.out.println( 345 "Inherited & Functional & !Archaic:" 346 + CldrUtility.LINE_SEPARATOR 347 + bf.showSetNames(functionalInherited)); 348 349 UnicodeSet nl = new UnicodeSet("[:Nl:]").retainAll(functional).removeAll(archaic); 350 System.out.println( 351 "Nl:" + CldrUtility.LINE_SEPARATOR + bf.showSetNames(new UnicodeSet("[:Nl:]"))); 352 System.out.println( 353 "Nl & Functional & !Archaic:" + CldrUtility.LINE_SEPARATOR + bf.showSetNames(nl)); 354 355 UnicodeSet restrictedXidContinue = 356 new UnicodeSet( 357 "[[:xid_continue:]" 358 + "&[:^NFKC_QuickCheck=No:]" 359 + "&[:^default_ignorable_code_point:]" 360 + "&[:^Pc:]]") 361 .retainAll(caseFolded); 362 363 System.out.println( 364 bf.showSetDifferences( 365 "IDNA Functional", 366 functional, 367 "Unicode XID & NFKC &!DefaultIgnorable &! Pc", 368 restrictedXidContinue)); 369 370 Transliterator t = Transliterator.getInstance("lower"); 371 System.out.println("ABC " + t.transliterate("ABC")); 372 /* 373 * generalCategory(cp) is {Ll, Lu, Lo, Lm, Mn, Mc, Nd}, AND 374 * NFKC(cp) == cp, AND 375 * casefold(cp) == cp, AND 376 * !defaultIgnorableCodePoint(cp) 377 */ 378 BitSet scripts = new BitSet(); 379 for (int cp = 0; cp < 0x10FFFF; ++cp) { 380 int script = UScript.getScript(cp); 381 if (script == UScript.COMMON 382 || script == UScript.UNKNOWN 383 || script == UScript.INHERITED) { 384 continue; 385 } 386 scripts.set(script); 387 } 388 Set<String> toPrint = new TreeSet<>(); 389 for (int script = 0; script < scripts.size(); ++script) { 390 if (!scripts.get(script)) continue; 391 String code = UScript.getShortName(script); 392 String name = UScript.getName(script); 393 if (StandardCodes.isScriptModern(code)) { 394 toPrint.add("modern\t" + code + "\t" + name); 395 } else { 396 toPrint.add("archaic\t" + code + "\t" + name); 397 } 398 } 399 for (String line : toPrint) { 400 System.out.println(line); 401 } 402 } 403 checkCollections()404 private static void checkCollections() { 405 System.out.println("Collections"); 406 new org.unicode.cldr.util.CldrUtility.Apply<String>() { 407 @Override 408 public void apply(String item) { 409 if (Iso639Data.getScope(item.toString()) != Scope.Collection) return; 410 System.out.println(item + "\t" + CldrUtility.join(Iso639Data.getNames(item), ", ")); 411 } 412 }.applyTo(Iso639Data.getAvailable()); 413 System.out.println(CldrUtility.LINE_SEPARATOR + "Macrolanguages"); 414 new org.unicode.cldr.util.CldrUtility.Apply<String>() { 415 @Override 416 public void apply(String item) { 417 if (Iso639Data.getScope(item.toString()) != Scope.Macrolanguage) return; 418 System.out.println(item + "\t" + CldrUtility.join(Iso639Data.getNames(item), ", ")); 419 } 420 }.applyTo(Iso639Data.getAvailable()); 421 } 422 testToRegex()423 static void testToRegex() { 424 String[] tests = { 425 "\\-", 426 "a", 427 "d-f", 428 "\\u2000", 429 "\\uAC00-\\uAC12", 430 "{AB}", 431 "{CDE}", 432 "\\uFFF0-\\U0010000F", 433 "\\U0010100F-\\U0010300F" 434 }; // }; // 435 for (int i = (1 << tests.length) - 1; i >= 0; --i) { 436 String test = "["; 437 for (int j = 0; j < tests.length; ++j) { 438 if ((i & (1 << j)) != 0) { 439 test += tests[j]; 440 } 441 } 442 test += "]"; 443 testToRegex(new UnicodeSet(test)); 444 } 445 } 446 testToRegex(UnicodeSet test)447 private static void testToRegex(UnicodeSet test) { 448 String formatted = CldrUtility.toRegex(test); 449 System.out.println(test + "\t->\t" + formatted); 450 Matcher newTest = PatternCache.get(formatted).matcher(""); 451 UnicodeSet failures = new UnicodeSet(); 452 for (UnicodeSetIterator it = new UnicodeSetIterator(test); it.next(); ) { 453 if (!newTest.reset(it.getString()).matches()) { 454 failures.add(it.getString()); 455 } 456 } 457 if (failures.size() != 0) { 458 System.out.println("\tFailed on: " + failures); 459 } 460 System.out.flush(); 461 } 462 checkEastAsianWidth()463 static void checkEastAsianWidth() { 464 UnicodeSet dontCares = new UnicodeSet("[[:surrogate:][:unassigned:][:control:]]").freeze(); 465 UnicodeSet dontCares2 = new UnicodeSet("[:^letter:]").freeze(); 466 467 // UnicodeSet wide = new 468 // UnicodeSet("[[:East_Asian_Width=wide:][:East_Asian_Width=fullwidth:][:Co:]]"); // 469 // remove supplementaries 470 // System.out.format("Wide %s" + Utility.LINE_SEPARATOR + "" + Utility.LINE_SEPARATOR, 471 // wide); 472 // System.out.format("Wide(spanned) %s" + Utility.LINE_SEPARATOR + "" + 473 // Utility.LINE_SEPARATOR, 474 // Utility.addDontCareSpans(wide, dontCares)); 475 // UnicodeSet zeroWidth = new 476 // UnicodeSet("[[:default_ignorable_code_point:][:Mn:][:Me:]-[:Noncharacter_Code_Point:]-[:Cc:]]"); // remove 477 // supplementaries 478 // System.out.format("ZeroWidth %s" + Utility.LINE_SEPARATOR + "" + Utility.LINE_SEPARATOR, 479 // zeroWidth); 480 // System.out.format("ZeroWidth(spanned) %s" + Utility.LINE_SEPARATOR + "" + 481 // Utility.LINE_SEPARATOR, 482 // Utility.addDontCareSpans(zeroWidth, dontCares)); 483 484 // P2. In each paragraph, find the first character of type L, AL, or R. 485 UnicodeSet strongL = new UnicodeSet("[[:BidiClass=L:]-[:unassigned:]]").freeze(); // 486 showSpans("Bidi L", strongL, dontCares); 487 showSpans("Bidi L*", strongL, dontCares2); 488 489 UnicodeSet strongRAL = 490 new UnicodeSet("[[:BidiClass=R:][:BidiClass=AL:]-[:unassigned:]]").freeze(); 491 showSpans("Bidi R,AL", strongRAL, dontCares); 492 showSpans("Bidi R,AL*", strongRAL, dontCares2); 493 494 UnicodeSet strong = 495 new UnicodeSet("[[:BidiClass=L:][:BidiClass=R:][:BidiClass=AL:]-[:unassigned:]]") 496 .freeze(); 497 showSpans("Strong", strong, dontCares); 498 showSpans("Strong*", strong, dontCares2); 499 } 500 showSpans(String title, UnicodeSet sourceSet, UnicodeSet dontCares)501 private static void showSpans(String title, UnicodeSet sourceSet, UnicodeSet dontCares) { 502 System.out.println(title); 503 System.out.format("\tSource Set: %s" + CldrUtility.LINE_SEPARATOR, sourceSet); 504 System.out.format("\tDon't Cares: %s" + CldrUtility.LINE_SEPARATOR, dontCares); 505 UnicodeSet spanned = new UnicodeSet(sourceSet).addBridges(dontCares); 506 spanned = spanned.complement().complement(); 507 String spannedString = spanned.toString(); 508 String unescapedString = spanned.toPattern(false); 509 System.out.format("\tRanges: %d" + CldrUtility.LINE_SEPARATOR, spanned.getRangeCount()); 510 System.out.format("\tStrlen(\\u): %d" + CldrUtility.LINE_SEPARATOR, spannedString.length()); 511 System.out.format( 512 "\tStrlen(!\\u): %d" + CldrUtility.LINE_SEPARATOR, unescapedString.length()); 513 String title2 = "Result"; 514 String sample = spannedString; 515 if (false) { 516 if (sample.length() > 60) { 517 title2 = "Sample"; 518 sample = sample.substring(0, 60) + " ..."; 519 } 520 } 521 System.out.format("\t%s: %s" + CldrUtility.LINE_SEPARATOR, title2, sample); 522 System.out.println(); 523 } 524 525 static int[] extraCJK = { 526 0x3006, // IDEOGRAPHIC CLOSING MARK;Lo 527 0x302A, // IDEOGRAPHIC LEVEL TONE MARK;Mn 528 0x302B, // IDEOGRAPHIC RISING TONE MARK;Mn 529 0x302C, // IDEOGRAPHIC DEPARTING TONE MARK;Mn 530 0x302D, // IDEOGRAPHIC ENTERING TONE MARK;Mn 531 0x302E, // HANGUL SINGLE DOT TONE MARK;Mn 532 0x302F, // HANGUL DOUBLE DOT TONE MARK;Mn 533 0x3031, // VERTICAL KANA REPEAT MARK;Lm 534 0x3032, // VERTICAL KANA REPEAT WITH VOICED SOUND MARK;Lm 535 0x3033, // VERTICAL KANA REPEAT MARK UPPER HALF;Lm 536 0x3034, // VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF;Lm 537 0x3035, // VERTICAL KANA REPEAT MARK LOWER HALF;Lm 538 0x303C, // MASU MARK;Lo 539 0x3099, // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn 540 0x309A, // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn 541 0x309B, // KATAKANA-HIRAGANA VOICED SOUND MARK;Sk 542 0x309C, // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Sk 543 0x30A0, // KATAKANA-HIRAGANA DOUBLE HYPHEN;Pd 544 0x30FC, // KATAKANA-HIRAGANA PROLONGED SOUND MARK;Lm 545 0xFF70, // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK;Lm 546 0xFF9E, // HALFWIDTH KATAKANA VOICED SOUND MARK;Lm 547 0xFF9F, // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm 548 }; 549 checkCFK()550 void checkCFK() { 551 // UnicodeSet Han, Hangul, Hiragana, Katakana, or Bopomofo 552 } 553 checkDistinguishing()554 private static void checkDistinguishing() { 555 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 556 Set<String> cldrFiles = cldrFactory.getAvailableLanguages(); 557 Set<String> distinguishing = new TreeSet<>(); 558 Set<String> nondistinguishing = new TreeSet<>(); 559 for (Iterator<String> it = cldrFiles.iterator(); it.hasNext(); ) { 560 CLDRFile cldrFile = cldrFactory.make(it.next(), false); 561 DtdType dtdType = null; 562 if (cldrFile.isNonInheriting()) { 563 continue; 564 } 565 for (Iterator<String> it2 = cldrFile.iterator(); it2.hasNext(); ) { 566 String path = it2.next(); 567 if (dtdType == null) { 568 dtdType = DtdType.fromPath(path); 569 } 570 String fullPath = cldrFile.getFullXPath(path); 571 if (path.equals(fullPath)) { 572 continue; 573 } 574 XPathParts parts = XPathParts.getFrozenInstance(fullPath); 575 for (int i = 0; i < parts.size(); ++i) { 576 Map<String, String> m = parts.getAttributes(i); 577 if (m.size() == 0) { 578 continue; 579 } 580 String element = parts.getElement(i); 581 for (Iterator<String> mit = m.keySet().iterator(); mit.hasNext(); ) { 582 String attribute = mit.next(); 583 if (CLDRFile.isDistinguishing(dtdType, element, attribute)) { 584 distinguishing.add(attribute + "\tD\t" + element); 585 } else { 586 nondistinguishing.add(attribute + "\tN\t" + element); 587 } 588 } 589 } 590 } 591 } 592 System.out.println("Distinguishing"); 593 for (Iterator<String> it = distinguishing.iterator(); it.hasNext(); ) { 594 System.out.println(it.next()); 595 } 596 System.out.println(); 597 System.out.println("Non-Distinguishing"); 598 for (Iterator<String> it = nondistinguishing.iterator(); it.hasNext(); ) { 599 System.out.println(it.next()); 600 } 601 } 602 showEnglish()603 private static void showEnglish() { 604 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 605 String requestedLocale = "en"; 606 CLDRFile cldrFile = cldrFactory.make(requestedLocale, true); 607 CLDRFile.Status status = new CLDRFile.Status(); 608 for (Iterator<String> it = cldrFile.iterator(); it.hasNext(); ) { 609 String requestedPath = it.next(); 610 String localeWhereFound = cldrFile.getSourceLocaleID(requestedPath, status); 611 if (!localeWhereFound.equals(requestedLocale) 612 || !status.pathWhereFound.equals(requestedPath)) { 613 System.out.println( 614 "requested path:\t" 615 + requestedPath 616 + "\tfound locale:\t" 617 + localeWhereFound 618 + "\tsame?\t" 619 + localeWhereFound.equals(requestedLocale) 620 + "\tfound path:\t" 621 + status.pathWhereFound 622 + "\tsame?\t" 623 + status.pathWhereFound.equals(requestedPath)); 624 } 625 } 626 } 627 checkPrivateUse()628 private static void checkPrivateUse() { 629 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 630 String requestedLocale = "en"; 631 CLDRFile cldrFile = cldrFactory.make(requestedLocale, true); 632 StandardCodes sc = StandardCodes.make(); 633 Set<String> careAbout = 634 new HashSet<>( 635 Arrays.asList(new String[] {"language", "script", "territory", "variant"})); 636 HashMap<String, Set<String>> foundItems = new HashMap<>(); 637 TreeSet<String> problems = new TreeSet<>(); 638 for (Iterator<String> it = 639 cldrFile.iterator("", new UTF16.StringComparator(true, false, 0)); 640 it.hasNext(); ) { 641 String requestedPath = it.next(); 642 XPathParts parts = XPathParts.getFrozenInstance(requestedPath); 643 String element = parts.getElement(-1); 644 if (!careAbout.contains(element)) { 645 continue; 646 } 647 String type = parts.getAttributeValue(-1, "type"); 648 if (type == null) { 649 continue; 650 } 651 Set<String> foundSet = foundItems.get(element); 652 if (foundSet == null) { 653 foundItems.put(element, foundSet = new TreeSet<>()); 654 } 655 foundSet.add(type); 656 657 List<String> data = sc.getFullData(element, type); 658 if (data == null) { 659 problems.add( 660 "No RFC3066bis data for: " 661 + element 662 + "\t" 663 + type 664 + "\t" 665 + cldrFile.getStringValue(requestedPath)); 666 continue; 667 } 668 if (isPrivateOrDeprecated(data)) { 669 problems.add( 670 "Private/Deprecated Data for: " 671 + element 672 + "\t" 673 + type 674 + "\t" 675 + cldrFile.getStringValue(requestedPath) 676 + "\t" 677 + data); 678 } 679 // String canonical_value = (String)data.get(2); 680 } 681 for (Iterator<String> it = problems.iterator(); it.hasNext(); ) { 682 System.out.println(it.next()); 683 } 684 for (Iterator<String> it = careAbout.iterator(); it.hasNext(); ) { 685 String element = it.next(); 686 Set<String> real = sc.getAvailableCodes(element); 687 Set<String> notFound = new TreeSet<>(real); 688 notFound.removeAll(foundItems.get(element)); 689 for (Iterator<String> it2 = notFound.iterator(); it2.hasNext(); ) { 690 String type = it2.next(); 691 List<String> data = sc.getFullData(element, type); 692 if (isPrivateOrDeprecated(data)) continue; 693 System.out.println( 694 "Missing Translation for: " + element + "\t" + type + "\t" + "\t" + data); 695 } 696 } 697 } 698 isPrivateOrDeprecated(List<String> data)699 static boolean isPrivateOrDeprecated(List<String> data) { 700 if (data.toString().indexOf("PRIVATE") >= 0) { 701 return true; 702 } 703 if ("PRIVATE USE".equals(data.get(0))) return true; 704 if (data.size() < 3) return false; 705 if (data.get(2) == null) return false; 706 if (data.get(2).toString().length() != 0) return true; 707 return false; 708 } 709 testPopulous()710 static void testPopulous() { 711 Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 712 CLDRFile supp = cldrFactory.make("supplementalData", false); 713 CLDRFile temp = SimpleFactory.makeFile("supplemental"); 714 temp.setNonInheriting(true); 715 for (Iterator<String> it = supp.iterator(null, supp.getComparator()); it.hasNext(); ) { 716 String path = it.next(); 717 String value = supp.getStringValue(path); 718 String fullPath = supp.getFullXPath(path); 719 XPathParts parts = XPathParts.getFrozenInstance(fullPath); 720 String type = parts.getAttributeValue(-1, "type"); 721 String pop = language_territory_hack_map.get(type); 722 if (pop != null) { 723 parts = parts.cloneAsThawed(); 724 parts.putAttributeValue(-1, "mostPopulousTerritory", pop); 725 fullPath = parts.toString(); 726 } 727 temp.add(fullPath, value); 728 } 729 PrintWriter pw = new PrintWriter(System.out); 730 temp.write(pw); 731 pw.close(); 732 } 733 734 private static final Map<String, String> language_territory_hack_map = new HashMap<>(); 735 private static final String[][] language_territory_hack = { 736 {"af", "ZA"}, 737 {"am", "ET"}, 738 {"ar", "SA"}, 739 {"as", "IN"}, 740 {"ay", "PE"}, 741 {"az", "AZ"}, 742 {"bal", "PK"}, 743 {"be", "BY"}, 744 {"bg", "BG"}, 745 {"bn", "IN"}, 746 {"bs", "BA"}, 747 {"ca", "ES"}, 748 {"ch", "MP"}, 749 {"cpe", "SL"}, 750 {"cs", "CZ"}, 751 {"cy", "GB"}, 752 {"da", "DK"}, 753 {"de", "DE"}, 754 {"dv", "MV"}, 755 {"dz", "BT"}, 756 {"el", "GR"}, 757 {"en", "US"}, 758 {"es", "ES"}, 759 {"et", "EE"}, 760 {"eu", "ES"}, 761 {"fa", "IR"}, 762 {"fi", "FI"}, 763 {"fil", "PH"}, 764 {"fj", "FJ"}, 765 {"fo", "FO"}, 766 {"fr", "FR"}, 767 {"ga", "IE"}, 768 {"gd", "GB"}, 769 {"gl", "ES"}, 770 {"gn", "PY"}, 771 {"gu", "IN"}, 772 {"gv", "GB"}, 773 {"ha", "NG"}, 774 {"he", "IL"}, 775 {"hi", "IN"}, 776 {"ho", "PG"}, 777 {"hr", "HR"}, 778 {"ht", "HT"}, 779 {"hu", "HU"}, 780 {"hy", "AM"}, 781 {"id", "ID"}, 782 {"is", "IS"}, 783 {"it", "IT"}, 784 {"ja", "JP"}, 785 {"ka", "GE"}, 786 {"kk", "KZ"}, 787 {"kl", "GL"}, 788 {"km", "KH"}, 789 {"kn", "IN"}, 790 {"ko", "KR"}, 791 {"kok", "IN"}, 792 {"ks", "IN"}, 793 {"ku", "TR"}, 794 {"ky", "KG"}, 795 {"la", "VA"}, 796 {"lb", "LU"}, 797 {"ln", "CG"}, 798 {"lo", "LA"}, 799 {"lt", "LT"}, 800 {"lv", "LV"}, 801 {"mai", "IN"}, 802 {"men", "GN"}, 803 {"mg", "MG"}, 804 {"mh", "MH"}, 805 {"mk", "MK"}, 806 {"ml", "IN"}, 807 {"mn", "MN"}, 808 {"mni", "IN"}, 809 {"mo", "MD"}, 810 {"mr", "IN"}, 811 {"ms", "MY"}, 812 {"mt", "MT"}, 813 {"my", "MM"}, 814 {"na", "NR"}, 815 {"nb", "NO"}, 816 {"nd", "ZA"}, 817 {"ne", "NP"}, 818 {"niu", "NU"}, 819 {"nl", "NL"}, 820 {"nn", "NO"}, 821 {"no", "NO"}, 822 {"nr", "ZA"}, 823 {"nso", "ZA"}, 824 {"ny", "MW"}, 825 {"om", "KE"}, 826 {"or", "IN"}, 827 {"pa", "IN"}, 828 {"pau", "PW"}, 829 {"pl", "PL"}, 830 {"ps", "PK"}, 831 {"pt", "BR"}, 832 {"qu", "PE"}, 833 {"rn", "BI"}, 834 {"ro", "RO"}, 835 {"ru", "RU"}, 836 {"rw", "RW"}, 837 {"sd", "IN"}, 838 {"sg", "CF"}, 839 {"si", "LK"}, 840 {"sk", "SK"}, 841 {"sl", "SI"}, 842 {"sm", "WS"}, 843 {"so", "DJ"}, 844 {"sq", "CS"}, 845 {"sr", "CS"}, 846 {"ss", "ZA"}, 847 {"st", "ZA"}, 848 {"sv", "SE"}, 849 {"sw", "KE"}, 850 {"ta", "IN"}, 851 {"te", "IN"}, 852 {"tem", "SL"}, 853 {"tet", "TL"}, 854 {"th", "TH"}, 855 {"ti", "ET"}, 856 {"tg", "TJ"}, 857 {"tk", "TM"}, 858 {"tkl", "TK"}, 859 {"tvl", "TV"}, 860 {"tl", "PH"}, 861 {"tn", "ZA"}, 862 {"to", "TO"}, 863 {"tpi", "PG"}, 864 {"tr", "TR"}, 865 {"ts", "ZA"}, 866 {"uk", "UA"}, 867 {"ur", "IN"}, 868 {"uz", "UZ"}, 869 {"ve", "ZA"}, 870 {"vi", "VN"}, 871 {"wo", "SN"}, 872 {"xh", "ZA"}, 873 {"zh", "CN"}, 874 {"zh_Hant", "TW"}, 875 {"zu", "ZA"}, 876 {"aa", "ET"}, 877 {"byn", "ER"}, 878 {"eo", "DE"}, 879 {"gez", "ET"}, 880 {"haw", "US"}, 881 {"iu", "CA"}, 882 {"kw", "GB"}, 883 {"sa", "IN"}, 884 {"sh", "HR"}, 885 {"sid", "ET"}, 886 {"syr", "SY"}, 887 {"tig", "ER"}, 888 {"tt", "RU"}, 889 {"wal", "ET"}, 890 }; 891 892 static { 893 for (int i = 0; i < language_territory_hack.length; ++i) { language_territory_hack_map.put( language_territory_hack[i][0], language_territory_hack[i][1])894 language_territory_hack_map.put( 895 language_territory_hack[i][0], language_territory_hack[i][1]); 896 } 897 } 898 899 static class MyXSymbolTable extends UnicodeSet.XSymbolTable { 900 static VariantFolder caseFolder = new VariantFolder(new CaseVariantFolder()); 901 static VariantFolder canonicalFolder = new VariantFolder(new CanonicalFolder()); 902 static VariantFolder compatibilityFolder = new VariantFolder(new CompatibilityFolder()); 903 904 @Override applyPropertyAlias( String propertyName, String propertyValue, UnicodeSet result)905 public boolean applyPropertyAlias( 906 String propertyName, String propertyValue, UnicodeSet result) { 907 if (propertyName.equalsIgnoreCase("close")) { 908 if (propertyValue.equalsIgnoreCase("case")) { 909 result.addAll(caseFolder.getClosure(result)); 910 } else if (propertyValue.equalsIgnoreCase("canonical")) { 911 result.addAll(canonicalFolder.getClosure(result)); 912 } else if (propertyValue.equalsIgnoreCase("compatibility")) { 913 result.addAll(compatibilityFolder.getClosure(result)); 914 } 915 return true; 916 } else if (propertyName.equalsIgnoreCase("reduce")) { 917 if (propertyValue.equalsIgnoreCase("case")) { 918 UnicodeSet temp = caseFolder.reduce(result); 919 result.clear().addAll(temp); 920 } else if (propertyValue.equalsIgnoreCase("canonical")) { 921 UnicodeSet temp = canonicalFolder.reduce(result); 922 result.clear().addAll(temp); 923 } else if (propertyValue.equalsIgnoreCase("compatibility")) { 924 UnicodeSet temp = compatibilityFolder.reduce(result); 925 result.clear().addAll(temp); 926 } 927 return true; 928 } else if (propertyName.equalsIgnoreCase("reduceCase")) { 929 UnicodeSet temp = 930 caseFolder.reduce(new UnicodeSet(propertyValue.replace("·]", ":]"))); 931 result.clear().addAll(temp); 932 return true; 933 } else if (propertyName.equalsIgnoreCase("reduceCanonical")) { 934 UnicodeSet temp = 935 canonicalFolder.reduce(new UnicodeSet(propertyValue.replace("·]", ":]"))); 936 result.clear().addAll(temp); 937 return true; 938 } else if (propertyName.equalsIgnoreCase("reduceCase")) { 939 UnicodeSet temp = 940 caseFolder.reduce(new UnicodeSet(propertyValue.replace("·]", ":]"))); 941 result.clear().addAll(temp); 942 return true; 943 } 944 return false; 945 } 946 } 947 } 948