xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/Iso639Data.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.util;
2 
3 import com.google.common.base.Splitter;
4 import com.ibm.icu.impl.Relation;
5 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap;
6 import com.ibm.icu.util.ICUUncheckedIOException;
7 import java.io.BufferedReader;
8 import java.io.IOException;
9 import java.util.Arrays;
10 import java.util.Collections;
11 import java.util.HashMap;
12 import java.util.LinkedHashSet;
13 import java.util.List;
14 import java.util.Map;
15 import java.util.Set;
16 import java.util.TreeMap;
17 import java.util.regex.Pattern;
18 
19 public class Iso639Data {
20 
    // All tables below start out null and are populated together by getData() on first use.

    /** Language subtag (an ISO 639-1 two-letter code) to its three-letter identifier. */
    static Map<String, String> toAlpha3;

    /** Three-letter identifier to the shorter language subtag, when one exists. */
    static Map<String, String> fromAlpha3;

    /** Language subtag to its ISO 639-2 bibliographic code, stored only when it differs. */
    static Map<String, String> toBiblio3;

    /** ISO 639-2 bibliographic code to language subtag (inverse of {@link #toBiblio3}). */
    static Map<String, String> fromBiblio3;

    /** Language subtag to every name collected for it; its key set defines "valid" subtags. */
    static Relation<String, String> toNames;

    /** Retired three-letter identifier to the raw line(s) of the retirements file. */
    static Relation<String, String> toRetirements;

    /** Retired three-letter identifier to the non-empty Change_To column of the retirements file. */
    static Map<String, String> toChangeTo;

    /** Language subtag to scope; entries whose scope is Individual are not stored. */
    static Map<String, Scope> toScope;

    /** ISO 639-5 code to its hierarchy path, split on ':' separators. */
    static Map<String, List<String>> toHeirarchy;

    /** Language subtag to type; entries whose type is Living are not stored. */
    static Map<String, Type> toType;

    /** Encompassed language subtag to the macrolanguage subtag that contains it. */
    static Map<String, String> encompassed_macro;

    /** Macrolanguage subtag to the subtags it encompasses (inverse of {@link #encompassed_macro}). */
    static Relation<String, String> macro_encompassed;

    /** Language subtag to the standard it was taken from, where explicitly recorded. */
    static Map<String, Source> toSource;

    /** First line of iso-639-3-version.tab, trimmed. */
    private static String version;
48 
49     /**
50      *
51      *
52      * <h3><a NAME="I">Individual</a> languages</h3>
53      *
54      * <p>Judgments regarding when two varieties are considered to be the same or different
55      * languages are based on a number of factors, including linguistic similarity, intelligibility,
56      * a common literature, the views of speakers concerning the relationship between language and
57      * identity, and other factors.
58      *
59      * <h3><a NAME="M">Macrolanguages</a></h3>
60      *
61      * <p>In various parts of the world, there are clusters of closely-related language varieties
62      * that, based on the criteria discussed above, can be considered distinct individual languages,
63      * yet in certain usage contexts a single language identity for all is needed.
64      *
65      * <p>Macrolanguages are distinguished from language collections in that the individual
66      * languages that correspond to a macrolanguage must be very closely related, and there must be
67      * some domain in which only a single language identity is recognized.
68      *
69      * <h3><a NAME="C">Collections</a> of languages</h3>
70      *
71      * <p>A collective language code element is an identifier that represents a group of individual
72      * languages that are not deemed to be one language in any usage context.
73      *
74      * <h3><a NAME="R">Private Use</a></h3>
75      *
76      * <p>Identifiers <tt>qaa</tt> through <tt>qtz</tt> are reserved for local use, to be used in
77      * cases in which there is no suitable existing code in ISO 639. There are no constraints as to
78      * scope of denotation. These identifiers may only be used locally, and may not be used in
79      * interchange without a private agreement.
80      *
81      * <h3><a NAME="S">Special situations</a></h3>
82      *
83      * <p>A few code elements are defined for other special situations. For more information, see
84      * http://www.sil.org/iso639-3/scope.asp
85      *
86      * <p>Note that the casing on these enum values is chosen to match standard usage.
87      */
88     public enum Scope {
89         Individual,
90         Macrolanguage,
91         Special,
92         Collection,
93         PrivateUse,
94         Unknown;
95 
fromString(String input)96         public static Scope fromString(String input) {
97             input = input.replace("-", "");
98             for (Scope item : Scope.values()) {
99                 if (item.toString().equalsIgnoreCase(input)) {
100                     return item;
101                 }
102             }
103             return Scope.valueOf(input); // to get exception
104         }
105     }
106 
    /**
     * Type (vitality) of a language code element.
     *
     * <h3>Living languages</h3>
     *
     * <p>A language is listed as <i>living</i> when there are people still living who learned it as
     * a first language.
     *
     * <h3>Extinct languages</h3>
     *
     * <p>A language is listed as <i>extinct</i> if it has gone extinct in recent times (e.g. in
     * the last few centuries).
     *
     * <h3>Ancient languages</h3>
     *
     * <p>A language is listed as <i>ancient</i> if it went extinct in ancient times (e.g. more than
     * a millennium ago).
     *
     * <h3>Historical languages</h3>
     *
     * <p>A language is listed as <i>historical</i> when it is considered to be distinct from any
     * modern languages that are descended from it; for instance, Old English and Middle English.
     *
     * <h3>Constructed languages</h3>
     *
     * <p>Artificial languages are those like Esperanto: it excludes programming languages.
     *
     * <p>Note that the casing on these enum values is chosen to match standard usage. The
     * declaration order is significant: {@code findMatchToPrefix} returns the first constant whose
     * name starts with the data file's one-letter code, so {@code C} resolves to {@link
     * #Constructed} and {@code S} to {@link #Special}. <i>For more information, see
     * http://www.sil.org/iso639-3/scope.asp</i>
     */
    public enum Type {
        Ancient,
        Constructed,
        Extinct,
        Historical,
        Living,
        Special,
        Collection,
        Unknown
    }
147 
    /**
     * This indicates the source of the language subtag, i.e. the standard or registry the subtag
     * was taken from.
     *
     * @author markdavis
     */
    public enum Source {
        ISO_639_1,
        ISO_639_2,
        ISO_639_3,
        BCP47,
        CLDR
    }
160 
getVersion()161     public static String getVersion() {
162         return version;
163     }
164 
getSource(String languageSubtag)165     public static Source getSource(String languageSubtag) {
166         if (toAlpha3 == null) {
167             getData();
168         }
169         if (!isValid(languageSubtag)) {
170             return null;
171         }
172         Source result = toSource.get(languageSubtag);
173         if (result == null) return Source.ISO_639_3;
174         return result;
175     }
176 
toAlpha3(String languageSubtag)177     public static String toAlpha3(String languageSubtag) {
178         if (toAlpha3 == null) {
179             getData();
180         }
181         if (!isValid(languageSubtag)) {
182             return null;
183         }
184         return toAlpha3.get(languageSubtag);
185     }
186 
fromAlpha3(String alpha3)187     public static String fromAlpha3(String alpha3) {
188         if (fromAlpha3 == null) {
189             getData();
190         }
191         String alpha2 = fromAlpha3.get(alpha3);
192         if (alpha2 != null) {
193             return alpha2;
194         }
195         // it only exists if it has a name
196         if (isValid(alpha3)) {
197             return alpha3;
198         }
199         return null;
200     }
201 
isValid(String alpha3)202     private static boolean isValid(String alpha3) {
203         return toNames.containsKey(alpha3);
204     }
205 
fromBiblio3(String biblio3)206     public static String fromBiblio3(String biblio3) {
207         if (toNames == null) {
208             getData();
209         }
210         String result = fromBiblio3.get(biblio3);
211         if (result != null) {
212             return result;
213         }
214         return fromAlpha3(biblio3);
215     }
216 
toBiblio3(String languageTag)217     public static String toBiblio3(String languageTag) {
218         if (toNames == null) {
219             getData();
220         }
221         String result = toBiblio3.get(languageTag);
222         if (result != null) {
223             return result;
224         }
225         return toAlpha3(languageTag);
226     }
227 
hasBiblio3()228     public static Set<String> hasBiblio3() {
229         return toBiblio3.keySet();
230     }
231 
getNames(String languageSubtag)232     public static Set<String> getNames(String languageSubtag) {
233         if (toNames == null) {
234             getData();
235         }
236         return toNames.getAll(languageSubtag);
237     }
238 
getScope(String languageSubtag)239     public static Scope getScope(String languageSubtag) {
240         if (toScope == null) {
241             getData();
242         }
243         if (!isValid(languageSubtag)) return Scope.Unknown;
244         Scope result = toScope.get(languageSubtag);
245         if (result != null) return result;
246         return Scope.Individual;
247     }
248 
249     /** Returns the ISO 639-5 heirarchy if available, otherwise null. */
getHeirarchy(String languageSubtag)250     public static List<String> getHeirarchy(String languageSubtag) {
251         if (toHeirarchy == null) {
252             getData();
253         }
254         return toHeirarchy.get(languageSubtag);
255     }
256 
getType(String languageSubtag)257     public static Type getType(String languageSubtag) {
258         if (toAlpha3 == null) {
259             getData();
260         }
261         if (!isValid(languageSubtag)) return Type.Unknown;
262         Type result = toType.get(languageSubtag);
263         if (result != null) return result;
264         return Type.Living;
265     }
266 
    /**
     * Columns of the main ISO 639-3 table. The constants are declared in file order because each
     * constant's ordinal is used as the index into a split line.
     *
     * <pre>
     * Id       char(3) NOT NULL,      -- The three-letter 639-3 identifier
     * Part2B   char(3) NULL,          -- Equivalent 639-2 identifier of the bibliographic
     *                                 -- applications code set, if there is one
     * Part2T   char(3) NULL,          -- Equivalent 639-2 identifier of the terminology
     *                                 -- applications code set, if there is one
     * Part1    char(2) NULL,          -- Equivalent 639-1 identifier, if there is one
     * Scope    char(1) NOT NULL,      -- I(ndividual), M(acrolanguage), S(pecial)
     * Type     char(1) NOT NULL,      -- A(ncient), C(onstructed), E(xtinct), H(istorical),
     *                                 -- L(iving), S(pecial)
     * Ref_Name varchar(150) NOT NULL) -- Reference language name
     * </pre>
     */
    enum IsoColumn {
        Id,
        Part2B,
        Part2T,
        Part1,
        Scope,
        Type,
        Ref_Name
    }
287 
    /**
     * Columns of the ISO 639-3 name index table, declared in file order.
     *
     * <pre>
     * Id            char(3) NOT NULL,     -- The three-letter 639-3 identifier
     * Print_Name    varchar(75) NOT NULL, -- One of the names associated with this identifier
     * Inverted_Name varchar(75) NOT NULL) -- The inverted form of this Print_Name form
     * </pre>
     */
    enum IsoNamesColumn {
        Id,
        Print_Name,
        Inverted_Name
    }
298 
getData()299     private static void getData() {
300         try {
301             BufferedReader in = CldrUtility.getUTF8Data("iso-639-3-version.tab");
302             version = in.readLine().trim();
303             in.close();
304 
305             in = CldrUtility.getUTF8Data("iso-639-3.tab");
306             SplitToArray tabs = new SplitToArray(Splitter.on('\t').trimResults());
307             toAlpha3 = new HashMap<>();
308             fromAlpha3 = new HashMap<>();
309             toBiblio3 = new HashMap<>();
310             fromBiblio3 = new HashMap<>();
311             toScope = new HashMap<>();
312             toType = new HashMap<>();
313             toNames = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
314             toRetirements = Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
315             toChangeTo = new TreeMap<>();
316             macro_encompassed =
317                     Relation.of(new TreeMap<String, Set<String>>(), LinkedHashSet.class);
318             encompassed_macro = new HashMap<>();
319             toSource = new HashMap<>();
320             toSource.put("sh", Source.ISO_639_1); // add deprecated language
321 
322             int count = 0; // line count for debugging
323             while (true) {
324                 ++count;
325                 String line = in.readLine();
326                 if (line == null) {
327                     break;
328                 }
329                 if (line.startsWith("\uFEFF")) {
330                     line = line.substring(1);
331                 }
332                 line = line.trim();
333                 if (line.isEmpty()) {
334                     continue;
335                 }
336                 String[] parts = tabs.split(line);
337                 String alpha3 = parts[IsoColumn.Id.ordinal()];
338                 if (alpha3.equals("Id")) continue;
339                 String languageSubtag = alpha3;
340                 if (parts[IsoColumn.Part1.ordinal()].length() != 0) { // parts.length >
341                     // IsoColumn.Part1.ordinal()
342                     // &&
343                     languageSubtag = parts[IsoColumn.Part1.ordinal()];
344                     toAlpha3.put(languageSubtag, alpha3);
345                     fromAlpha3.put(alpha3, languageSubtag);
346                 }
347 
348                 if (parts[IsoColumn.Part2B.ordinal()].length() != 0) { // parts.length >
349                     // IsoColumn.Part1.ordinal()
350                     // &&
351                     String biblio = parts[IsoColumn.Part2B.ordinal()];
352                     if (!biblio.equals(alpha3)) {
353                         toBiblio3.put(languageSubtag, biblio);
354                         fromBiblio3.put(biblio, languageSubtag);
355                     }
356                 }
357 
358                 toNames.put(languageSubtag, parts[IsoColumn.Ref_Name.ordinal()]);
359                 Scope scope = findMatchToPrefix(parts[IsoColumn.Scope.ordinal()], Scope.values());
360                 if (scope != Scope.Individual) toScope.put(languageSubtag, scope);
361                 Type type = findMatchToPrefix(parts[IsoColumn.Type.ordinal()], Type.values());
362                 if (type != Type.Living) toType.put(languageSubtag, type);
363             }
364             // System.out.println("Size:\t" + toNames.size());
365             in.close();
366 
367             // Id Ref_Name Ret_Reason Change_To Ret_Remedy Effective
368             in = CldrUtility.getUTF8Data("iso-639-3_Retirements.tab");
369             while (true) {
370                 String line = in.readLine();
371                 if (line == null) break;
372                 if (line.startsWith("\uFEFF")) line = line.substring(1);
373                 String[] parts = tabs.split(line);
374                 String alpha3 = parts[0];
375                 if (alpha3.equals("Id")) continue;
376                 // Id   Ref_Name    Ret_Reason  Change_To   Ret_Remedy  Effective
377                 // fri  Western Frisian C   fry     2007-02-01
378 
379                 toNames.put(alpha3, parts[1]);
380                 if (!parts[3].isEmpty()) {
381                     toChangeTo.put(alpha3, parts[3]);
382                 }
383                 toRetirements.put(alpha3, line);
384                 // skip inverted name for now
385             }
386             // System.out.println("Size:\t" + toNames.size());
387             in.close();
388 
389             // Id Print_Name Inverted_Name
390             in = CldrUtility.getUTF8Data("iso-639-3-macrolanguages.tab");
391             while (true) {
392                 String line = in.readLine();
393                 if (line == null) break;
394                 if (line.startsWith("\uFEFF")) line = line.substring(1);
395                 String[] parts = tabs.split(line);
396                 String prefix = parts[0];
397                 if (prefix.equals("M_Id")) continue;
398                 prefix = fromAlpha3(prefix);
399                 String suffix = fromAlpha3(parts[1]);
400                 if (suffix == null || prefix == null) {
401                     throw new IllegalArgumentException();
402                 }
403                 encompassed_macro.put(suffix, prefix);
404                 macro_encompassed.put(prefix, suffix);
405                 // skip inverted name for now
406             }
407             // System.out.println("Size:\t" + toNames.size());
408             in.close();
409 
410             // Id Print_Name Inverted_Name
411             in = CldrUtility.getUTF8Data("iso-639-3_Name_Index.tab");
412             while (true) {
413                 String line = in.readLine();
414                 if (line == null) break;
415                 if (line.startsWith("\uFEFF")) line = line.substring(1);
416                 String[] parts = tabs.split(line);
417                 String alpha3 = parts[IsoColumn.Id.ordinal()];
418                 if (alpha3.equals("Id")) continue;
419                 String languageSubTag = fromAlpha3(alpha3);
420                 toNames.put(languageSubTag, parts[IsoNamesColumn.Print_Name.ordinal()]);
421                 // skip inverted name for now
422             }
423             // System.out.println("Size:\t" + toNames.size());
424             in.close();
425 
426             in = CldrUtility.getUTF8Data("ISO-639-2_values_8bits.txt");
427             // An alpha-3 (bibliographic) code,
428             // an alpha-3 (terminologic) code (when given),
429             // an alpha-2 code (when given),
430             // an English name,
431             // and a French name of a language are all separated by pipe (|)
432             // characters.
433             while (true) {
434                 String line = in.readLine();
435                 if (line == null) break;
436                 if (line.startsWith("\uFEFF")) line = line.substring(1);
437                 String[] parts = line.split("\\s*\\|\\s*");
438                 String alpha3 = parts[0];
439                 if (alpha3.equals("qaa-qtz")) {
440                     for (char second = 'a'; second <= 't'; ++second) {
441                         for (char third = 'a'; third <= 'z'; ++third) {
442                             String languageSubtag = (("q" + second) + third);
443                             toScope.put(languageSubtag, Scope.PrivateUse);
444                             toType.put(languageSubtag, Type.Special);
445                             toNames.put(languageSubtag, "private-use");
446                             toSource.put(languageSubtag, Source.ISO_639_2);
447                         }
448                     }
449                     continue;
450                 }
451                 if (parts[1].length() != 0) alpha3 = parts[1];
452                 String languageSubtag = parts[2];
453                 if (languageSubtag.length() == 0) {
454                     languageSubtag = alpha3;
455                 }
456                 String[] english = parts[3].split(";");
457                 toSource.put(
458                         languageSubtag,
459                         languageSubtag.length() == 2 ? Source.ISO_639_1 : Source.ISO_639_2);
460                 if (!isValid(languageSubtag)) {
461                     // we don't have it already,
462                     // System.out.println("Adding2: " + alpha3 + "\t" + languageSubtag + "\t" +
463                     // Arrays.asList(english));
464                     if (languageSubtag.length() == 2) {
465                         toAlpha3.put(languageSubtag, alpha3);
466                         fromAlpha3.put(alpha3, languageSubtag);
467                     }
468                     toScope.put(languageSubtag, Scope.Collection);
469                     toType.put(languageSubtag, Type.Special);
470                     toNames.putAll(languageSubtag, Arrays.asList(english));
471                 }
472                 // skip inverted name for now
473             }
474             in.close();
475 
476             Map<String, String> toHeirarchyTemp = new TreeMap<>();
477             in = CldrUtility.getUTF8Data("external/Iso639-5.html");
478             String lastCode = null;
479             int column = 0;
480             boolean lastAttributeIsScope = false;
481             boolean lastElementIsTD = false;
482             boolean hadPop = true;
483             // if the table level is 1 (we are in the main table), then we look for
484             // <td>...</td><td>...</td>. That means
485             // that we have column 1 and column 2.
486 
487             SimpleHtmlParser simple = new SimpleHtmlParser().setReader(in);
488             StringBuilder result = new StringBuilder();
489 
490             main:
491             while (true) {
492                 SimpleHtmlParser.Type x = simple.next(result);
493                 // System.out.println(column + "\t" + x + "\t" + result);
494                 switch (x) {
495                     case ELEMENT_START:
496                         hadPop = false;
497                         lastElementIsTD = false;
498                         break;
499                     case ELEMENT:
500                         if (SimpleHtmlParser.equals("tr", result)) {
501                             column = 0;
502                         } else if (SimpleHtmlParser.equals("td", result)) {
503                             lastElementIsTD = true;
504                         }
505                         break;
506                     case ELEMENT_POP:
507                         hadPop = true;
508                         break;
509                     case ELEMENT_END:
510                         // if we get a POP and a TD, and we have column > 0, we increment
511                         if (lastElementIsTD && hadPop && column > 0) {
512                             ++column;
513                         }
514                         break;
515                     case ELEMENT_CONTENT:
516                         /*
517                          * <th scope="col">Identifier<br />Indicatif</th>
518                          * <th scope="col">English name<br />Nom anglais</th>
519                          * <th scope="col">French name<br />Nom français</th>
520                          * <th scope="col">639-2</th>
521                          * <th scope="col">Hierarchy<br />Hiérarchie</th>
522                          * <th scope="col">Notes<br />Notes</th>
523                          *
524                          * <td scope="row">apa</td>
525                          * <td>Apache languages</td>
526                          * <td>apaches, langues</td>
527                          * <td>language group<br />groupe de langues</td>
528                          * <td>nai : xnd : ath : apa</td>
529                          * <td>
530                          * <br />
531                          * </td>
532                          */
533                         switch (column) {
534                             case 1:
535                                 lastCode = result.toString();
536                                 break;
537                             case 5:
538                                 String old = toHeirarchyTemp.get(lastCode);
539                                 toHeirarchyTemp.put(
540                                         lastCode,
541                                         old == null || old.length() == 0
542                                                 ? result.toString().trim()
543                                                 : old + " " + result.toString().trim());
544                                 break;
545                             case 2:
546                                 break;
547                             case 3:
548                                 break;
549                             case 4:
550                                 break;
551                             case 0:
552                                 break;
553                             default:
554                                 break;
555                         }
556                         break;
557                     case ATTRIBUTE:
558                         lastAttributeIsScope = SimpleHtmlParser.equals("scope", result);
559                         break;
560                     case ATTRIBUTE_CONTENT:
561                         if (lastAttributeIsScope && SimpleHtmlParser.equals("row", result)) {
562                             column = 1;
563                         }
564                         break;
565                     case QUOTE:
566                         break;
567                     case DONE:
568                         break main;
569                 }
570             }
571 
572             in.close();
573 
574             Pattern SPLIT_HEIRARCHY = PatternCache.get("\\s*:\\s*");
575             toHeirarchy = new TreeMap<>();
576             // for (String code : toHeirarchyTemp.keySet()) {
577             // System.out.println(code + " => " + toHeirarchyTemp.get(code));
578             // }
579             for (String code : toHeirarchyTemp.keySet()) {
580                 String valueString = toHeirarchyTemp.get(code);
581                 String[] values = SPLIT_HEIRARCHY.split(valueString);
582                 for (String value : values) {
583                     if (toScope.get(value) == null && toHeirarchyTemp.get(value) == null) {
584                         throw new IllegalArgumentException(
585                                 "Unexpected value in heirarchy:\t"
586                                         + value
587                                         + "\t"
588                                         + code
589                                         + "\t"
590                                         + valueString);
591                     }
592                 }
593                 toHeirarchy.put(code, Arrays.asList(values));
594             }
595             // System.out.println("Size:\t" + toNames.size());
596 
597             // make data unmodifiable, just to prevent mistakes
598 
599             toAlpha3 = Collections.unmodifiableMap(toAlpha3);
600             fromAlpha3 = Collections.unmodifiableMap(fromAlpha3);
601             toBiblio3 = Collections.unmodifiableMap(toBiblio3);
602             fromBiblio3 = Collections.unmodifiableMap(fromBiblio3);
603             toScope = Collections.unmodifiableMap(toScope);
604             toType = Collections.unmodifiableMap(toType);
605             toHeirarchy = Collections.unmodifiableMap(toHeirarchy);
606 
607             toNames.freeze();
608             toRetirements.freeze();
609             macro_encompassed.freeze();
610             toChangeTo = ImmutableMap.copyOf(toChangeTo);
611 
612         } catch (IOException e) {
613             throw new ICUUncheckedIOException("Cannot parse file", e);
614         }
615     }
616 
findMatchToPrefix(String prefix, T[] values)617     public static <T> T findMatchToPrefix(String prefix, T[] values) {
618         for (T x : values) {
619             if (x.toString().startsWith(prefix)) {
620                 return x;
621             }
622         }
623         throw new IllegalArgumentException(
624                 "Prefix <" + prefix + "> not found in " + Arrays.asList(values));
625     }
626 
getAvailable()627     public static Set<String> getAvailable() {
628         if (toAlpha3 == null) {
629             getData();
630         }
631         return toNames.keySet();
632     }
633 
getMacroForEncompassed(String suffix)634     public static String getMacroForEncompassed(String suffix) {
635         String prefix = encompassed_macro.get(suffix);
636         if (prefix != null) return prefix;
637         if (suffix.equals("sgn")) return null;
638         Set<String> names = toNames.getAll(suffix);
639         if (names == null) return null;
640         for (String name : names) {
641             if (name.contains("Sign Language")) return "sgn";
642         }
643         return null;
644     }
645 
getEncompassedForMacro(String prefix)646     public static Set<String> getEncompassedForMacro(String prefix) {
647         return macro_encompassed.getAll(prefix);
648     }
649 
getMacros()650     public static Set<String> getMacros() {
651         return macro_encompassed.keySet();
652     }
653 
getEncompassed()654     public static Set<String> getEncompassed() {
655         return encompassed_macro.keySet();
656     }
657 
getChangeTo(String subtag)658     public static String getChangeTo(String subtag) {
659         return getChangeToMap().get(subtag);
660     }
661 
getChangeToMap()662     public static Map<String, String> getChangeToMap() {
663         if (toChangeTo == null) {
664             getData();
665         }
666         return toChangeTo;
667     }
668 }
669