1 package org.unicode.cldr.tool; 2 3 import com.google.common.base.Splitter; 4 import com.google.common.collect.ImmutableMap; 5 import com.google.common.collect.ImmutableSet; 6 import com.google.common.collect.LinkedHashMultimap; 7 import com.google.common.collect.Multimap; 8 import com.google.common.collect.TreeMultimap; 9 import com.ibm.icu.impl.Row; 10 import com.ibm.icu.lang.UScript; 11 import com.ibm.icu.text.UnicodeSet; 12 import com.ibm.icu.util.Output; 13 import java.io.IOException; 14 import java.io.UncheckedIOException; 15 import java.nio.file.Files; 16 import java.nio.file.Path; 17 import java.nio.file.Paths; 18 import java.util.Collection; 19 import java.util.List; 20 import java.util.Map; 21 import java.util.Map.Entry; 22 import java.util.Set; 23 import java.util.TreeMap; 24 import java.util.TreeSet; 25 import java.util.regex.Matcher; 26 import java.util.regex.Pattern; 27 import org.unicode.cldr.util.CLDRConfig; 28 import org.unicode.cldr.util.CLDRFile; 29 import org.unicode.cldr.util.CLDRFile.ExemplarType; 30 import org.unicode.cldr.util.CLDRPaths; 31 import org.unicode.cldr.util.CLDRTool; 32 import org.unicode.cldr.util.Factory; 33 import org.unicode.cldr.util.Iso639Data; 34 import org.unicode.cldr.util.Iso639Data.Type; 35 import org.unicode.cldr.util.LanguageTagParser; 36 import org.unicode.cldr.util.StandardCodes.LstrType; 37 import org.unicode.cldr.util.Validity; 38 import org.unicode.cldr.util.Validity.Status; 39 40 /** TODO: Merge into GenerateMaximalLocales, see CLDR-16380 */ 41 @CLDRTool( 42 description = "Generate additional likely subtag data, see CLDR-16380", 43 url = "https://unicode-org.atlassian.net/browse/CLDR-16380", 44 alias = "generate-additional-likely") 45 public class GenerateAdditionalLikely { 46 47 private static final String SIL = "sil1"; 48 private static final boolean ADD_SEED_EXEMPLARS = false; 49 50 private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); 51 private static final Splitter UNDERBAR = Splitter.on('_'); 52 private static final Splitter TAB_SPLITTER = Splitter.on('\t'); 53 54 private static final Factory factory = CLDR_CONFIG.getExemplarsFactory(); 55 private static final CLDRFile english = CLDR_CONFIG.getEnglish(); 56 private static final LanguageTagParser ltpFull = new LanguageTagParser(); 57 private static final LanguageTagParser ltpTag = new LanguageTagParser(); 58 private static final Validity validity = Validity.getInstance(); 59 60 private static final Set<String> LANGUAGE_REGULAR = 61 validity.getStatusToCodes(LstrType.language).get(Status.regular); 62 private static final Set<String> SCRIPT_REGULAR = 63 validity.getStatusToCodes(LstrType.script).get(Status.regular); 64 private static final Set<String> REGION_REGULAR = 65 validity.getStatusToCodes(LstrType.region).get(Status.regular); 66 67 private static final Set<String> LIKELY_SPECIALS = 68 ImmutableSet.of("in", "iw", "ji", "jw", "mo"); 69 private static final Set<String> FIX_VALIDITY = ImmutableSet.of("Zanb"); 70 private static final Set<String> FIX_COUNTRY = ImmutableSet.of("yi"); 71 72 static class LSRSource implements Comparable<LSRSource> { 73 final Row.R4<String, String, String, String> data; 74 LSRSource(String lang, String script, String region, String source)75 LSRSource(String lang, String script, String region, String source) { 76 if (script.contains("Soyo") || region.contains("Soyo")) { 77 int debug = 0; 78 } 79 data = Row.of(lang, script, region, source); 80 data.freeze(); 81 } 82 83 @Override toString()84 public String toString() { 85 return combineLSR(data.get0(), data.get1(), data.get2()) + " // " + data.get3(); 86 } 87 88 @Override compareTo(LSRSource o)89 public int compareTo(LSRSource o) { 90 return data.compareTo(o.data); 91 } 92 93 @Override hashCode()94 public int hashCode() { 95 return data.hashCode(); 96 } 97 98 @Override equals(Object obj)99 public boolean equals(Object obj) { 100 return data.equals(obj); 101 } 102 line(String source)103 public String line(String source) { 104 // TODO Auto-generated method stub 105 // <likelySubtag from="aa" to="aa_Latn_ET"/> 106 // <!--{ Afar; ?; ? } => { Afar; Latin; Ethiopia }--> 107 final String target = combineLSR(data.get0(), data.get1(), data.get2()); 108 final String origin = data.get3(); 109 final String result = 110 "<likelySubtag from=\"" 111 + source 112 + "\" to=\"" 113 + target 114 + (origin.isBlank() ? "" : "\" origin=\"" + origin) 115 + "\"/>" 116 + "\t<!-- " 117 + english.getName(source) 118 + " ➡︎ " 119 + english.getName(target) 120 + " -->"; 121 return result; 122 } 123 combineLSR(String lang, String script, String region)124 public static String combineLSR(String lang, String script, String region) { 125 return lang 126 + (script.isEmpty() ? "" : "_" + script) 127 + (region.isEmpty() ? "" : "_" + region); 128 } 129 } 130 isOk( String lang, String script, String region, Map<LstrType, Status> errors)131 private static boolean isOk( 132 String lang, String script, String region, Map<LstrType, Status> errors) { 133 errors.clear(); 134 if (!LIKELY_SPECIALS.contains(lang)) { 135 check(LstrType.language, lang, errors); 136 } 137 if (!FIX_VALIDITY.contains(script)) { 138 check(LstrType.script, script, errors); 139 } 140 if (region.equals("001") && Iso639Data.getType(lang) == Type.Constructed) { 141 // ok 142 } else { 143 check(LstrType.region, region, errors); 144 } 145 return errors.isEmpty(); 146 } 147 check(LstrType lstrType, String lang, Map<LstrType, Status> errors)148 private static void check(LstrType lstrType, String lang, Map<LstrType, Status> errors) { 149 final Status status = validity.getCodeToStatus(lstrType).get(lang); 150 if (status != Status.regular) { 151 errors.put(lstrType, status); 152 } 153 } 154 155 private static class LikelySources { 156 private static LikelySources SINGLETON = new LikelySources(); 157 getSources()158 public static Set<String> getSources() { 159 return SINGLETON.alreadyLangs; 160 } 161 162 final ImmutableSet<String> alreadyLangs; 163 LikelySources()164 private LikelySources() { 165 Map<LstrType, Status> errors = new TreeMap<>(); 166 Map<String, String> likely = CLDR_CONFIG.getSupplementalDataInfo().getLikelySubtags(); 167 Set<String> _alreadyLangs = new TreeSet<>(); 168 _alreadyLangs.add("und"); 169 likely.forEach( 170 (key, value) -> { 171 String lang = ltpFull.set(value).getLanguage(); 172 String script = ltpFull.set(value).getScript(); 173 String region = ltpFull.set(value).getRegion(); 174 _alreadyLangs.add(lang); 175 if (!isOk(lang, script, region, errors)) { 176 showSkip("Skipping scope, CLDR", key, value, errors); 177 } 178 }); 179 System.out.println(); 180 alreadyLangs = ImmutableSet.copyOf(_alreadyLangs); 181 } 182 } 183 184 static Multimap<String, String> langToRegion; 185 main(String[] args)186 public static void main(String[] args) { 187 188 Map<String, LSRSource> result = new TreeMap<>(); 189 Map<LstrType, Status> errors = new TreeMap<>(); 190 191 Errors processErrors = new Errors(); 192 193 langToRegion = readWikidata(LikelySources.getSources()); 194 readJson(LikelySources.getSources(), result, processErrors); 195 196 processErrors.printAll(); 197 198 if (ADD_SEED_EXEMPLARS) { 199 200 for (String locale : factory.getAvailable()) { 201 CLDRFile file = factory.make(locale, false); 202 UnicodeSet exemplars = file.getExemplarSet(ExemplarType.main, null); 203 String lang = ltpFull.set(locale).getLanguage(); 204 if (!LikelySources.getSources().contains(lang)) { 205 String script = getScript(exemplars); 206 Collection<String> regions = langToRegion.get(lang); 207 for (String region : regions) { 208 addIfOk(result, lang, lang, script, region, "wiki+exemplars", errors); 209 } 210 } 211 } 212 } 213 System.out.println(); 214 215 Multimap<String, String> defects = LinkedHashMultimap.create(); 216 217 for (Entry<String, LSRSource> entry : result.entrySet()) { 218 String source = entry.getKey(); 219 LSRSource lsrs = entry.getValue(); 220 String tagLang = ltpTag.set(source).getLanguage(); 221 if (!result.containsKey(tagLang)) { 222 defects.put(source, tagLang); 223 showError("Missing lang record", source, lsrs.toString(), "Needs\t" + tagLang); 224 } 225 } 226 227 System.out.println("\nData to add: " + (result.entrySet().size() - defects.size()) + "\n"); 228 229 for (Entry<String, LSRSource> entry : result.entrySet()) { 230 String source = entry.getKey(); 231 if (defects.containsKey(source)) { 232 continue; 233 } 234 LSRSource lsrs = entry.getValue(); 235 System.out.println("\t\t" + lsrs.line(source)); 236 } 237 238 // Multimap<String, String> likelyAdditions = TreeMultimap.create(); 239 // System.out.println("\nAdd"); 240 // likelyAdditions.asMap().entrySet().forEach(x -> { 241 // String key = x.getKey(); 242 // if (x.getValue().size() == 1) { 243 // for (String value : x.getValue()) { 244 // System.out.println(key + "\t" + value + "\t" + infoFields(value)); 245 // } 246 // } 247 // } 248 // ); 249 // 250 // System.out.println("\nFix & Add"); 251 // 252 // likelyAdditions.asMap().entrySet().forEach(x -> { 253 // String key = x.getKey(); 254 // if (x.getValue().size() != 1) { 255 // for (String value : x.getValue()) { 256 // System.out.println(key + "\t" + value + "\t" + infoFields(value)); 257 // } 258 // System.out.println(); 259 // } 260 // } 261 // ); 262 263 } 264 265 static ImmutableMap<String, String> remap = ImmutableMap.of("iw", "he", "jw", "jv"); 266 list(String string)267 private static void list(String string) { 268 for (String code : string.split(" ")) { 269 ltpFull.set(code.replace("-", "_")); 270 String lang = ltpFull.getLanguage(); 271 String cldrLang = remap.get(lang); 272 if (cldrLang != null) { 273 lang = cldrLang; 274 } 275 276 System.out.println( 277 code 278 + "\t" 279 + english.getName(code) 280 + "\t" 281 + Iso639Data.getType(lang) 282 + "\t" 283 + Iso639Data.getScope(lang)); 284 } 285 System.out.println(); 286 } 287 showSkip( String message, String source, String target, Map<LstrType, Status> errors)288 public static void showSkip( 289 String message, String source, String target, Map<LstrType, Status> errors) { 290 showError(message, source, target, infoFields(target) + "\t" + errors); 291 } 292 showError(String message, String source, String target, String errors)293 public static void showError(String message, String source, String target, String errors) { 294 System.out.println( 295 message + "\t" + source + " ➡ " + target + (errors.isEmpty() ? "" : "\t" + errors)); 296 } 297 infoFields(String value)298 private static String infoFields(String value) { 299 int under = value.indexOf('_'); 300 String lang = under < 0 ? value : value.substring(0, under); 301 return english.getName(value) 302 + "\t" 303 + Iso639Data.getScope(lang) 304 + "\t" 305 + Iso639Data.getType(lang); 306 } 307 308 // add <likelySubtag from="aa" to="aa_Latn_ET"/>, status 309 310 // private static void handle(Entry<String, LSRSource> original, Multimap<String, String> 311 // likelyAdditions) { 312 // String source = original.getKey(); 313 // LSRSource lsr = original.getValue(); 314 // if (source.contains("_")) { 315 // int debug = 0; 316 // } 317 // // it is ok if there is a single LSR, eg 318 // // eg aaa Ghotuo {Latn={NG=[sil]}} 319 // // eg aak Ankave {Latn={PG=[sil, wiki+exemplars]}} 320 // 321 // for (Entry<R3<String, String, String>, String> entry : lsr.data) { 322 // addKeys(source, entry.getKey(), entry.getValue(), likelyAdditions); 323 // } 324 // } 325 326 // private static void addKeys(String source, R3<String, String, String> r3, String comment, 327 // Multimap<String, String> likelyAdditions) { 328 // likelyAdditions.put(source, r3.get0() + "_" + r3.get1() + "_" + r3.get2() + comment); 329 // } 330 331 static final Pattern fullTagMatch = Pattern.compile("\\s*\"(full|tag)\": \"([^\"]+)\","); 332 333 private static class Errors { 334 public enum Type { 335 ill_formed_tags("Ill-formed tags"), 336 already_CLDR("Language already in CLDR"), 337 tag_not_in_full("tag ⊄ full"), 338 exception("exception"); 339 private final String printable; 340 341 private Type(String printable) { 342 this.printable = printable; 343 } 344 } 345 346 public Multimap<Type, String> data = TreeMultimap.create(); 347 348 public void put( 349 Type illFormedTags, String tagValue, String fullValue, String errorMessage) { 350 data.put( 351 illFormedTags, 352 tagValue 353 + " ➡ " 354 + fullValue 355 + (errorMessage == null || errorMessage.isEmpty() 356 ? "" 357 : "\t—\t" + errorMessage)); 358 } 359 360 public void printAll() { 361 for (Entry<Type, Collection<String>> entry : data.asMap().entrySet()) { 362 Type type = entry.getKey(); 363 System.out.println(); 364 for (String message : entry.getValue()) { 365 System.out.println(type + "\t" + message); 366 } 367 } 368 } 369 } 370 371 private static Map<String, LSRSource> readJson( 372 Set<String> alreadyLangs, Map<String, LSRSource> result, Errors processErrors) { 373 Path path = Paths.get(CLDRPaths.BIRTH_DATA_DIR, "/../external/langtags.json"); 374 Matcher full = fullTagMatch.matcher(""); 375 Map<LstrType, Status> errors = new TreeMap<>(); 376 377 Output<String> lastFull = new Output<>(); 378 try { 379 Files.lines(path) 380 .forEach( 381 x -> { 382 if (full.reset(x).matches()) { 383 final String key = full.group(1); 384 final String value = full.group(2).replace("-", "_"); 385 if (value.startsWith("aai")) { 386 int debug = 0; 387 } 388 switch (key) { 389 case "full": 390 lastFull.value = value; 391 break; 392 case "tag": 393 try { 394 String fullLang = 395 ltpFull.set(lastFull.value).getLanguage(); 396 if (alreadyLangs.contains(fullLang)) { 397 processErrors.put( 398 Errors.Type.already_CLDR, 399 value, 400 lastFull.value, 401 ""); 402 break; 403 } else if (isIllFormed(lastFull.value, ltpFull) 404 || isIllFormed(value, ltpTag.set(value))) { 405 processErrors.put( 406 Errors.Type.ill_formed_tags, 407 value, 408 lastFull.value, 409 ""); 410 } else { 411 String reference = SIL; 412 final String fullScript = ltpFull.getScript(); 413 String fullRegion = ltpFull.getRegion(); 414 if (fullRegion.equals("ZZ") 415 || fullRegion.equals("001")) { 416 Collection<String> tempRegions = 417 langToRegion.get( 418 fullLang); // synthesize 419 if (!tempRegions.isEmpty()) { 420 fullRegion = 421 tempRegions.iterator().next(); 422 reference += " wikidata"; 423 } 424 } 425 426 String tagLang = ltpTag.getLanguage(); 427 String tagScript = ltpTag.getScript(); 428 String tagRegion = ltpTag.getRegion(); 429 430 if (!tagLang.equals(fullLang) 431 || (!tagScript.isEmpty() 432 && !tagScript.equals( 433 fullScript)) 434 || (!tagRegion.isEmpty() 435 && !tagRegion.equals( 436 fullRegion))) { 437 processErrors.put( 438 Errors.Type.tag_not_in_full, 439 value, 440 lastFull.value, 441 ""); 442 } else { 443 addIfOk( 444 result, 445 value, 446 fullLang, 447 fullScript, 448 fullRegion, 449 reference, 450 errors); 451 } 452 } 453 } catch (Exception e) { 454 processErrors.put( 455 Errors.Type.exception, 456 value, 457 lastFull.value, 458 e.getMessage()); 459 } 460 break; 461 default: 462 throw new IllegalArgumentException(); // never happens 463 } 464 } 465 }); 466 return result; 467 } catch (IOException ex) { 468 throw new UncheckedIOException(ex); 469 } 470 } 471 isIllFormed(String source, LanguageTagParser languageTagParser)472 private static boolean isIllFormed(String source, LanguageTagParser languageTagParser) { 473 return languageTagParser.getLanguage().isEmpty() 474 || !languageTagParser.getVariants().isEmpty() 475 || !languageTagParser.getExtensions().isEmpty() 476 || !languageTagParser.getLocaleExtensions().isEmpty() 477 || source.contains("@"); 478 } 479 addIfOk( Map<String, LSRSource> result, String source, String lang, final String script, final String region, String reference, Map<LstrType, Status> errors)480 private static void addIfOk( 481 Map<String, LSRSource> result, 482 String source, 483 String lang, 484 final String script, 485 final String region, 486 String reference, 487 Map<LstrType, Status> errors) { 488 if (isOk(lang, script, region, errors)) { 489 add(result, source, lang, script, region, reference); 490 } else { 491 showSkip("Skipping scope, SIL", source, ltpFull.toString(), errors); 492 } 493 } 494 readWikidata(Set<String> alreadyLangs)495 private static Multimap<String, String> readWikidata(Set<String> alreadyLangs) { 496 Multimap<String, String> result = TreeMultimap.create(); 497 Path path = Paths.get(CLDRPaths.BIRTH_DATA_DIR, "/../external/wididata_lang_region.tsv"); 498 try { 499 Files.lines(path) 500 .forEach( 501 x -> { 502 if (!x.startsWith("#")) { 503 List<String> list = TAB_SPLITTER.splitToList(x); 504 String lang = list.get(1); 505 String region = list.get(3); 506 result.put(lang, region); 507 } 508 }); 509 } catch (IOException ex) { 510 throw new UncheckedIOException(ex); 511 } 512 return result; 513 } 514 add( Map<String, LSRSource> result, String source, String lang, final String script, final String region, String reference)515 private static void add( 516 Map<String, LSRSource> result, 517 String source, 518 String lang, 519 final String script, 520 final String region, 521 String reference) { 522 LSRSource old = result.get(source); 523 LSRSource newVersion = new LSRSource(lang, script, region, reference); 524 if (old != null && !old.equals(newVersion)) { 525 throw new IllegalArgumentException( 526 "Data already exists for " + source + ": old=" + old + ", new: " + newVersion); 527 } 528 result.put(source, newVersion); 529 } 530 getScript(UnicodeSet exemplars)531 private static String getScript(UnicodeSet exemplars) { 532 for (String s : exemplars) { 533 int scriptNum = UScript.getScript(s.codePointAt(0)); 534 if (scriptNum != UScript.COMMON && scriptNum != UScript.INHERITED) { 535 return UScript.getShortName(scriptNum); 536 } 537 } 538 return "Zxxx"; 539 } 540 } 541