1 package org.unicode.cldr.util; 2 3 import com.google.common.base.Splitter; 4 import com.ibm.icu.impl.Relation; 5 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; 6 import com.ibm.icu.util.ICUUncheckedIOException; 7 import java.io.BufferedReader; 8 import java.io.IOException; 9 import java.util.Arrays; 10 import java.util.Collections; 11 import java.util.HashMap; 12 import java.util.LinkedHashSet; 13 import java.util.List; 14 import java.util.Map; 15 import java.util.Set; 16 import java.util.TreeMap; 17 import java.util.regex.Pattern; 18 19 public class Iso639Data { 20 21 static Map<String, String> toAlpha3; 22 23 static Map<String, String> fromAlpha3; 24 25 static Map<String, String> toBiblio3; 26 27 static Map<String, String> fromBiblio3; 28 29 static Relation<String, String> toNames; 30 31 static Relation<String, String> toRetirements; 32 33 static Map<String, String> toChangeTo; 34 35 static Map<String, Scope> toScope; 36 37 static Map<String, List<String>> toHeirarchy; 38 39 static Map<String, Type> toType; 40 41 static Map<String, String> encompassed_macro; 42 43 static Relation<String, String> macro_encompassed; 44 45 static Map<String, Source> toSource; 46 47 private static String version; 48 49 /** 50 * 51 * 52 * <h3><a NAME="I">Individual</a> languages</h3> 53 * 54 * <p>Judgments regarding when two varieties are considered to be the same or different 55 * languages are based on a number of factors, including linguistic similarity, intelligibility, 56 * a common literature, the views of speakers concerning the relationship between language and 57 * identity, and other factors. 58 * 59 * <h3><a NAME="M">Macrolanguages</a></h3> 60 * 61 * <p>In various parts of the world, there are clusters of closely-related language varieties 62 * that, based on the criteria discussed above, can be considered distinct individual languages, 63 * yet in certain usage contexts a single language identity for all is needed. 64 * 65 * <p>Macrolanguages are distinguished from language collections in that the individual 66 * languages that correspond to a macrolanguage must be very closely related, and there must be 67 * some domain in which only a single language identity is recognized. 68 * 69 * <h3><a NAME="C">Collections</a> of languages</h3> 70 * 71 * <p>A collective language code element is an identifier that represents a group of individual 72 * languages that are not deemed to be one language in any usage context. 73 * 74 * <h3><a NAME="R">Private Use</a></h3> 75 * 76 * <p>Identifiers <tt>qaa</tt> through <tt>qtz</tt> are reserved for local use, to be used in 77 * cases in which there is no suitable existing code in ISO 639. There are no constraints as to 78 * scope of denotation. These identifiers may only be used locally, and may not be used in 79 * interchange without a private agreement. 80 * 81 * <h3><a NAME="S">Special situations</a></h3> 82 * 83 * <p>A few code elements are defined for other special situations. For more information, see 84 * http://www.sil.org/iso639-3/scope.asp 85 * 86 * <p>Note that the casing on these enum values is chosen to match standard usage. 87 */ 88 public enum Scope { 89 Individual, 90 Macrolanguage, 91 Special, 92 Collection, 93 PrivateUse, 94 Unknown; 95 fromString(String input)96 public static Scope fromString(String input) { 97 input = input.replace("-", ""); 98 for (Scope item : Scope.values()) { 99 if (item.toString().equalsIgnoreCase(input)) { 100 return item; 101 } 102 } 103 return Scope.valueOf(input); // to get exception 104 } 105 } 106 107 /** 108 * 109 * 110 * <h3><a NAME="L"></a>Living languages</h3> 111 * 112 * <p>A language is listed as <i>living</i> when there are people still living who learned it as 113 * a first language. 114 * 115 * <h3><a NAME="E"></a>Extinct languages</h3> 116 * 117 * <p>A language is listed as <i>extinct</i> if it has gone extinct in recent times. (e.g. in 118 * the last few centuries). 119 * 120 * <h3><a NAME="A"></a>Ancient languages</h3> 121 * 122 * <p>A language is listed as <i>ancient</i> if it went extinct in ancient times (e.g. more than 123 * a millennium ago). 124 * 125 * <h3><a NAME="H"></a>Historic languages</h3> 126 * 127 * <p>A language is listed as <i>historic</i> when it is considered to be distinct from any 128 * modern languages that are descended from it; for instance, Old English and Middle English. 129 * 130 * <h3><a NAME="C"></a>Constructed languages</h3> 131 * 132 * <p>Artificial languages are those like Esperanto: it excludes programming languages. 133 * 134 * <p>Note that the casing on these enum values is chosen to match standard usage. <i>For more 135 * information, see http://www.sil.org/iso639-3/scope.asp</i> 136 */ 137 public enum Type { 138 Ancient, 139 Constructed, 140 Extinct, 141 Historical, 142 Living, 143 Special, 144 Collection, 145 Unknown 146 } 147 148 /** 149 * This indicates the source of the language subtag. 150 * 151 * @author markdavis 152 */ 153 public enum Source { 154 ISO_639_1, 155 ISO_639_2, 156 ISO_639_3, 157 BCP47, 158 CLDR 159 } 160 getVersion()161 public static String getVersion() { 162 return version; 163 } 164 getSource(String languageSubtag)165 public static Source getSource(String languageSubtag) { 166 if (toAlpha3 == null) { 167 getData(); 168 } 169 if (!isValid(languageSubtag)) { 170 return null; 171 } 172 Source result = toSource.get(languageSubtag); 173 if (result == null) return Source.ISO_639_3; 174 return result; 175 } 176 toAlpha3(String languageSubtag)177 public static String toAlpha3(String languageSubtag) { 178 if (toAlpha3 == null) { 179 getData(); 180 } 181 if (!isValid(languageSubtag)) { 182 return null; 183 } 184 return toAlpha3.get(languageSubtag); 185 } 186 fromAlpha3(String alpha3)187 public static String fromAlpha3(String alpha3) { 188 if (fromAlpha3 == null) { 189 getData(); 190 } 191 String alpha2 = fromAlpha3.get(alpha3); 192 if (alpha2 != null) { 193 return alpha2; 194 } 195 // it only exists if it has a name 196 if (isValid(alpha3)) { 197 return alpha3; 198 } 199 return null; 200 } 201 isValid(String alpha3)202 private static boolean isValid(String alpha3) { 203 return toNames.containsKey(alpha3); 204 } 205 fromBiblio3(String biblio3)206 public static String fromBiblio3(String biblio3) { 207 if (toNames == null) { 208 getData(); 209 } 210 String result = fromBiblio3.get(biblio3); 211 if (result != null) { 212 return result; 213 } 214 return fromAlpha3(biblio3); 215 } 216 toBiblio3(String languageTag)217 public static String toBiblio3(String languageTag) { 218 if (toNames == null) { 219 getData(); 220 } 221 String result = toBiblio3.get(languageTag); 222 if (result != null) { 223 return result; 224 } 225 return toAlpha3(languageTag); 226 } 227 hasBiblio3()228 public static Set<String> hasBiblio3() { 229 return toBiblio3.keySet(); 230 } 231 getNames(String languageSubtag)232 public static Set<String> getNames(String languageSubtag) { 233 if (toNames == null) { 234 getData(); 235 } 236 return toNames.getAll(languageSubtag); 237 } 238 getScope(String languageSubtag)239 public static Scope getScope(String languageSubtag) { 240 if (toScope == null) { 241 getData(); 242 } 243 if (!isValid(languageSubtag)) return Scope.Unknown; 244 Scope result = toScope.get(languageSubtag); 245 if (result != null) return result; 246 return Scope.Individual; 247 } 248 249 /** Returns the ISO 639-5 heirarchy if available, otherwise null. */ getHeirarchy(String languageSubtag)250 public static List<String> getHeirarchy(String languageSubtag) { 251 if (toHeirarchy == null) { 252 getData(); 253 } 254 return toHeirarchy.get(languageSubtag); 255 } 256 getType(String languageSubtag)257 public static Type getType(String languageSubtag) { 258 if (toAlpha3 == null) { 259 getData(); 260 } 261 if (!isValid(languageSubtag)) return Type.Unknown; 262 Type result = toType.get(languageSubtag); 263 if (result != null) return result; 264 return Type.Living; 265 } 266 267 /** 268 * Id char(3) NOT NULL, -- The three-letter 639-3 identifier Part2B char(3) NULL, -- Equivalent 269 * 639-2 identifier of the bibliographic applications code set, if there is one Part2T char(3) 270 * NULL, -- Equivalent 639-2 identifier of the terminology applications code set, if there is 271 * one Part1 char(2) NULL, -- Equivalent 639-1 identifier, if there is one Scope char(1) NOT 272 * NULL, -- I(ndividual), M(acrolanguage), S(pecial) Type char(1) NOT NULL, -- A(ncient), 273 * C(onstructed), -- E(xtinct), H(istorical), L(iving), S(pecial) Ref_Name varchar(150) NOT 274 * NULL) -- Reference language name 275 * 276 * @throws IOException 277 */ 278 enum IsoColumn { 279 Id, 280 Part2B, 281 Part2T, 282 Part1, 283 Scope, 284 Type, 285 Ref_Name 286 } 287 288 /** 289 * Id char(3) NOT NULL, -- The three-letter 639-3 identifier Print_Name varchar(75) NOT NULL, -- 290 * One of the names associated with this identifier Inverted_Name varchar(75) NOT NULL) -- The 291 * inverted form of this Print_Name form 292 */ 293 enum IsoNamesColumn { 294 Id, 295 Print_Name, 296 Inverted_Name 297 } 298 getData()299 private static void getData() { 300 try { 301 BufferedReader in = CldrUtility.getUTF8Data("iso-639-3-version.tab"); 302 version = in.readLine().trim(); 303 in.close(); 304 305 in = CldrUtility.getUTF8Data("iso-639-3.tab"); 306 SplitToArray tabs = new SplitToArray(Splitter.on('\t').trimResults()); 307 toAlpha3 = new HashMap<>(); 308 fromAlpha3 = new HashMap<>(); 309 toBiblio3 = new HashMap<>(); 310 fromBiblio3 = new HashMap<>(); 311 toScope = new HashMap<>(); 312 toType = new HashMap<>(); 313 toNames = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class); 314 toRetirements = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class); 315 toChangeTo = new TreeMap<>(); 316 macro_encompassed = 317 Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class); 318 encompassed_macro = new HashMap<>(); 319 toSource = new HashMap<>(); 320 toSource.put("sh", Source.ISO_639_1); // add deprecated language 321 322 int count = 0; // line count for debugging 323 while (true) { 324 ++count; 325 String line = in.readLine(); 326 if (line == null) { 327 break; 328 } 329 if (line.startsWith("\uFEFF")) { 330 line = line.substring(1); 331 } 332 line = line.trim(); 333 if (line.isEmpty()) { 334 continue; 335 } 336 String[] parts = tabs.split(line); 337 String alpha3 = parts[IsoColumn.Id.ordinal()]; 338 if (alpha3.equals("Id")) continue; 339 String languageSubtag = alpha3; 340 if (parts[IsoColumn.Part1.ordinal()].length() != 0) { // parts.length > 341 // IsoColumn.Part1.ordinal() 342 // && 343 languageSubtag = parts[IsoColumn.Part1.ordinal()]; 344 toAlpha3.put(languageSubtag, alpha3); 345 fromAlpha3.put(alpha3, languageSubtag); 346 } 347 348 if (parts[IsoColumn.Part2B.ordinal()].length() != 0) { // parts.length > 349 // IsoColumn.Part1.ordinal() 350 // && 351 String biblio = parts[IsoColumn.Part2B.ordinal()]; 352 if (!biblio.equals(alpha3)) { 353 toBiblio3.put(languageSubtag, biblio); 354 fromBiblio3.put(biblio, languageSubtag); 355 } 356 } 357 358 toNames.put(languageSubtag, parts[IsoColumn.Ref_Name.ordinal()]); 359 Scope scope = findMatchToPrefix(parts[IsoColumn.Scope.ordinal()], Scope.values()); 360 if (scope != Scope.Individual) toScope.put(languageSubtag, scope); 361 Type type = findMatchToPrefix(parts[IsoColumn.Type.ordinal()], Type.values()); 362 if (type != Type.Living) toType.put(languageSubtag, type); 363 } 364 // System.out.println("Size:\t" + toNames.size()); 365 in.close(); 366 367 // Id Ref_Name Ret_Reason Change_To Ret_Remedy Effective 368 in = CldrUtility.getUTF8Data("iso-639-3_Retirements.tab"); 369 while (true) { 370 String line = in.readLine(); 371 if (line == null) break; 372 if (line.startsWith("\uFEFF")) line = line.substring(1); 373 String[] parts = tabs.split(line); 374 String alpha3 = parts[0]; 375 if (alpha3.equals("Id")) continue; 376 // Id Ref_Name Ret_Reason Change_To Ret_Remedy Effective 377 // fri Western Frisian C fry 2007-02-01 378 379 toNames.put(alpha3, parts[1]); 380 if (!parts[3].isEmpty()) { 381 toChangeTo.put(alpha3, parts[3]); 382 } 383 toRetirements.put(alpha3, line); 384 // skip inverted name for now 385 } 386 // System.out.println("Size:\t" + toNames.size()); 387 in.close(); 388 389 // Id Print_Name Inverted_Name 390 in = CldrUtility.getUTF8Data("iso-639-3-macrolanguages.tab"); 391 while (true) { 392 String line = in.readLine(); 393 if (line == null) break; 394 if (line.startsWith("\uFEFF")) line = line.substring(1); 395 String[] parts = tabs.split(line); 396 String prefix = parts[0]; 397 if (prefix.equals("M_Id")) continue; 398 prefix = fromAlpha3(prefix); 399 String suffix = fromAlpha3(parts[1]); 400 if (suffix == null || prefix == null) { 401 throw new IllegalArgumentException(); 402 } 403 encompassed_macro.put(suffix, prefix); 404 macro_encompassed.put(prefix, suffix); 405 // skip inverted name for now 406 } 407 // System.out.println("Size:\t" + toNames.size()); 408 in.close(); 409 410 // Id Print_Name Inverted_Name 411 in = CldrUtility.getUTF8Data("iso-639-3_Name_Index.tab"); 412 while (true) { 413 String line = in.readLine(); 414 if (line == null) break; 415 if (line.startsWith("\uFEFF")) line = line.substring(1); 416 String[] parts = tabs.split(line); 417 String alpha3 = parts[IsoColumn.Id.ordinal()]; 418 if (alpha3.equals("Id")) continue; 419 String languageSubTag = fromAlpha3(alpha3); 420 toNames.put(languageSubTag, parts[IsoNamesColumn.Print_Name.ordinal()]); 421 // skip inverted name for now 422 } 423 // System.out.println("Size:\t" + toNames.size()); 424 in.close(); 425 426 in = CldrUtility.getUTF8Data("ISO-639-2_values_8bits.txt"); 427 // An alpha-3 (bibliographic) code, 428 // an alpha-3 (terminologic) code (when given), 429 // an alpha-2 code (when given), 430 // an English name, 431 // and a French name of a language are all separated by pipe (|) 432 // characters. 433 while (true) { 434 String line = in.readLine(); 435 if (line == null) break; 436 if (line.startsWith("\uFEFF")) line = line.substring(1); 437 String[] parts = line.split("\\s*\\|\\s*"); 438 String alpha3 = parts[0]; 439 if (alpha3.equals("qaa-qtz")) { 440 for (char second = 'a'; second <= 't'; ++second) { 441 for (char third = 'a'; third <= 'z'; ++third) { 442 String languageSubtag = (("q" + second) + third); 443 toScope.put(languageSubtag, Scope.PrivateUse); 444 toType.put(languageSubtag, Type.Special); 445 toNames.put(languageSubtag, "private-use"); 446 toSource.put(languageSubtag, Source.ISO_639_2); 447 } 448 } 449 continue; 450 } 451 if (parts[1].length() != 0) alpha3 = parts[1]; 452 String languageSubtag = parts[2]; 453 if (languageSubtag.length() == 0) { 454 languageSubtag = alpha3; 455 } 456 String[] english = parts[3].split(";"); 457 toSource.put( 458 languageSubtag, 459 languageSubtag.length() == 2 ? Source.ISO_639_1 : Source.ISO_639_2); 460 if (!isValid(languageSubtag)) { 461 // we don't have it already, 462 // System.out.println("Adding2: " + alpha3 + "\t" + languageSubtag + "\t" + 463 // Arrays.asList(english)); 464 if (languageSubtag.length() == 2) { 465 toAlpha3.put(languageSubtag, alpha3); 466 fromAlpha3.put(alpha3, languageSubtag); 467 } 468 toScope.put(languageSubtag, Scope.Collection); 469 toType.put(languageSubtag, Type.Special); 470 toNames.putAll(languageSubtag, Arrays.asList(english)); 471 } 472 // skip inverted name for now 473 } 474 in.close(); 475 476 Map<String, String> toHeirarchyTemp = new TreeMap<>(); 477 in = CldrUtility.getUTF8Data("external/Iso639-5.html"); 478 String lastCode = null; 479 int column = 0; 480 boolean lastAttributeIsScope = false; 481 boolean lastElementIsTD = false; 482 boolean hadPop = true; 483 // if the table level is 1 (we are in the main table), then we look for 484 // <td>...</td><td>...</td>. That means 485 // that we have column 1 and column 2. 486 487 SimpleHtmlParser simple = new SimpleHtmlParser().setReader(in); 488 StringBuilder result = new StringBuilder(); 489 490 main: 491 while (true) { 492 SimpleHtmlParser.Type x = simple.next(result); 493 // System.out.println(column + "\t" + x + "\t" + result); 494 switch (x) { 495 case ELEMENT_START: 496 hadPop = false; 497 lastElementIsTD = false; 498 break; 499 case ELEMENT: 500 if (SimpleHtmlParser.equals("tr", result)) { 501 column = 0; 502 } else if (SimpleHtmlParser.equals("td", result)) { 503 lastElementIsTD = true; 504 } 505 break; 506 case ELEMENT_POP: 507 hadPop = true; 508 break; 509 case ELEMENT_END: 510 // if we get a POP and a TD, and we have column > 0, we increment 511 if (lastElementIsTD && hadPop && column > 0) { 512 ++column; 513 } 514 break; 515 case ELEMENT_CONTENT: 516 /* 517 * <th scope="col">Identifier<br />Indicatif</th> 518 * <th scope="col">English name<br />Nom anglais</th> 519 * <th scope="col">French name<br />Nom français</th> 520 * <th scope="col">639-2</th> 521 * <th scope="col">Hierarchy<br />Hiérarchie</th> 522 * <th scope="col">Notes<br />Notes</th> 523 * 524 * <td scope="row">apa</td> 525 * <td>Apache languages</td> 526 * <td>apaches, langues</td> 527 * <td>language group<br />groupe de langues</td> 528 * <td>nai : xnd : ath : apa</td> 529 * <td> 530 * <br /> 531 * </td> 532 */ 533 switch (column) { 534 case 1: 535 lastCode = result.toString(); 536 break; 537 case 5: 538 String old = toHeirarchyTemp.get(lastCode); 539 toHeirarchyTemp.put( 540 lastCode, 541 old == null || old.length() == 0 542 ? result.toString().trim() 543 : old + " " + result.toString().trim()); 544 break; 545 case 2: 546 break; 547 case 3: 548 break; 549 case 4: 550 break; 551 case 0: 552 break; 553 default: 554 break; 555 } 556 break; 557 case ATTRIBUTE: 558 lastAttributeIsScope = SimpleHtmlParser.equals("scope", result); 559 break; 560 case ATTRIBUTE_CONTENT: 561 if (lastAttributeIsScope && SimpleHtmlParser.equals("row", result)) { 562 column = 1; 563 } 564 break; 565 case QUOTE: 566 break; 567 case DONE: 568 break main; 569 } 570 } 571 572 in.close(); 573 574 Pattern SPLIT_HEIRARCHY = PatternCache.get("\\s*:\\s*"); 575 toHeirarchy = new TreeMap<>(); 576 // for (String code : toHeirarchyTemp.keySet()) { 577 // System.out.println(code + " => " + toHeirarchyTemp.get(code)); 578 // } 579 for (String code : toHeirarchyTemp.keySet()) { 580 String valueString = toHeirarchyTemp.get(code); 581 String[] values = SPLIT_HEIRARCHY.split(valueString); 582 for (String value : values) { 583 if (toScope.get(value) == null && toHeirarchyTemp.get(value) == null) { 584 throw new IllegalArgumentException( 585 "Unexpected value in heirarchy:\t" 586 + value 587 + "\t" 588 + code 589 + "\t" 590 + valueString); 591 } 592 } 593 toHeirarchy.put(code, Arrays.asList(values)); 594 } 595 // System.out.println("Size:\t" + toNames.size()); 596 597 // make data unmodifiable, just to prevent mistakes 598 599 toAlpha3 = Collections.unmodifiableMap(toAlpha3); 600 fromAlpha3 = Collections.unmodifiableMap(fromAlpha3); 601 toBiblio3 = Collections.unmodifiableMap(toBiblio3); 602 fromBiblio3 = Collections.unmodifiableMap(fromBiblio3); 603 toScope = Collections.unmodifiableMap(toScope); 604 toType = Collections.unmodifiableMap(toType); 605 toHeirarchy = Collections.unmodifiableMap(toHeirarchy); 606 607 toNames.freeze(); 608 toRetirements.freeze(); 609 macro_encompassed.freeze(); 610 toChangeTo = ImmutableMap.copyOf(toChangeTo); 611 612 } catch (IOException e) { 613 throw new ICUUncheckedIOException("Cannot parse file", e); 614 } 615 } 616 findMatchToPrefix(String prefix, T[] values)617 public static <T> T findMatchToPrefix(String prefix, T[] values) { 618 for (T x : values) { 619 if (x.toString().startsWith(prefix)) { 620 return x; 621 } 622 } 623 throw new IllegalArgumentException( 624 "Prefix <" + prefix + "> not found in " + Arrays.asList(values)); 625 } 626 getAvailable()627 public static Set<String> getAvailable() { 628 if (toAlpha3 == null) { 629 getData(); 630 } 631 return toNames.keySet(); 632 } 633 getMacroForEncompassed(String suffix)634 public static String getMacroForEncompassed(String suffix) { 635 String prefix = encompassed_macro.get(suffix); 636 if (prefix != null) return prefix; 637 if (suffix.equals("sgn")) return null; 638 Set<String> names = toNames.getAll(suffix); 639 if (names == null) return null; 640 for (String name : names) { 641 if (name.contains("Sign Language")) return "sgn"; 642 } 643 return null; 644 } 645 getEncompassedForMacro(String prefix)646 public static Set<String> getEncompassedForMacro(String prefix) { 647 return macro_encompassed.getAll(prefix); 648 } 649 getMacros()650 public static Set<String> getMacros() { 651 return macro_encompassed.keySet(); 652 } 653 getEncompassed()654 public static Set<String> getEncompassed() { 655 return encompassed_macro.keySet(); 656 } 657 getChangeTo(String subtag)658 public static String getChangeTo(String subtag) { 659 return getChangeToMap().get(subtag); 660 } 661 getChangeToMap()662 public static Map<String, String> getChangeToMap() { 663 if (toChangeTo == null) { 664 getData(); 665 } 666 return toChangeTo; 667 } 668 } 669