xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateAdditionalLikely.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.google.common.base.Splitter;
4 import com.google.common.collect.ImmutableMap;
5 import com.google.common.collect.ImmutableSet;
6 import com.google.common.collect.LinkedHashMultimap;
7 import com.google.common.collect.Multimap;
8 import com.google.common.collect.TreeMultimap;
9 import com.ibm.icu.impl.Row;
10 import com.ibm.icu.lang.UScript;
11 import com.ibm.icu.text.UnicodeSet;
12 import com.ibm.icu.util.Output;
13 import java.io.IOException;
14 import java.io.UncheckedIOException;
15 import java.nio.file.Files;
16 import java.nio.file.Path;
17 import java.nio.file.Paths;
18 import java.util.Collection;
19 import java.util.List;
20 import java.util.Map;
21 import java.util.Map.Entry;
22 import java.util.Set;
23 import java.util.TreeMap;
24 import java.util.TreeSet;
25 import java.util.regex.Matcher;
26 import java.util.regex.Pattern;
27 import org.unicode.cldr.util.CLDRConfig;
28 import org.unicode.cldr.util.CLDRFile;
29 import org.unicode.cldr.util.CLDRFile.ExemplarType;
30 import org.unicode.cldr.util.CLDRPaths;
31 import org.unicode.cldr.util.CLDRTool;
32 import org.unicode.cldr.util.Factory;
33 import org.unicode.cldr.util.Iso639Data;
34 import org.unicode.cldr.util.Iso639Data.Type;
35 import org.unicode.cldr.util.LanguageTagParser;
36 import org.unicode.cldr.util.StandardCodes.LstrType;
37 import org.unicode.cldr.util.Validity;
38 import org.unicode.cldr.util.Validity.Status;
39 
40 /** TODO: Merge into GenerateMaximalLocales, see CLDR-16380 */
41 @CLDRTool(
42         description = "Generate additional likely subtag data, see CLDR-16380",
43         url = "https://unicode-org.atlassian.net/browse/CLDR-16380",
44         alias = "generate-additional-likely")
45 public class GenerateAdditionalLikely {
46 
    // Source label attached to entries built from langtags.json (see readJson).
    private static final String SIL = "sil1";
    // When true, main() additionally derives entries from exemplar sets combined with the
    // wikidata language-to-region data.
    private static final boolean ADD_SEED_EXEMPLARS = false;

    private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance();
    // NOTE(review): UNDERBAR is not referenced in the visible part of this file.
    private static final Splitter UNDERBAR = Splitter.on('_');
    // Splits the tab-separated rows read by readWikidata.
    private static final Splitter TAB_SPLITTER = Splitter.on('\t');

    // Factory used by main() to load locale files when ADD_SEED_EXEMPLARS is enabled.
    private static final Factory factory = CLDR_CONFIG.getExemplarsFactory();
    // English file used to produce display names in the printed output.
    private static final CLDRFile english = CLDR_CONFIG.getEnglish();
    // Shared, mutable parsers: ltpFull holds the most recently parsed "full" locale and ltpTag
    // the most recently parsed "tag"; several methods read their state after a set(...) call.
    private static final LanguageTagParser ltpFull = new LanguageTagParser();
    private static final LanguageTagParser ltpTag = new LanguageTagParser();
    private static final Validity validity = Validity.getInstance();

    // Codes whose validity status is "regular", per subtag type.
    // NOTE(review): these three sets are not referenced in the visible part of this file.
    private static final Set<String> LANGUAGE_REGULAR =
            validity.getStatusToCodes(LstrType.language).get(Status.regular);
    private static final Set<String> SCRIPT_REGULAR =
            validity.getStatusToCodes(LstrType.script).get(Status.regular);
    private static final Set<String> REGION_REGULAR =
            validity.getStatusToCodes(LstrType.region).get(Status.regular);

    // Language codes that isOk() accepts without a validity check.
    private static final Set<String> LIKELY_SPECIALS =
            ImmutableSet.of("in", "iw", "ji", "jw", "mo");
    // Script codes that isOk() accepts without a validity check.
    private static final Set<String> FIX_VALIDITY = ImmutableSet.of("Zanb");
    // NOTE(review): FIX_COUNTRY is not referenced in the visible part of this file.
    private static final Set<String> FIX_COUNTRY = ImmutableSet.of("yi");
71 
72     static class LSRSource implements Comparable<LSRSource> {
73         final Row.R4<String, String, String, String> data;
74 
LSRSource(String lang, String script, String region, String source)75         LSRSource(String lang, String script, String region, String source) {
76             if (script.contains("Soyo") || region.contains("Soyo")) {
77                 int debug = 0;
78             }
79             data = Row.of(lang, script, region, source);
80             data.freeze();
81         }
82 
83         @Override
toString()84         public String toString() {
85             return combineLSR(data.get0(), data.get1(), data.get2()) + " // " + data.get3();
86         }
87 
88         @Override
compareTo(LSRSource o)89         public int compareTo(LSRSource o) {
90             return data.compareTo(o.data);
91         }
92 
93         @Override
hashCode()94         public int hashCode() {
95             return data.hashCode();
96         }
97 
98         @Override
equals(Object obj)99         public boolean equals(Object obj) {
100             return data.equals(obj);
101         }
102 
line(String source)103         public String line(String source) {
104             // TODO Auto-generated method stub
105             //      <likelySubtag from="aa" to="aa_Latn_ET"/>
106             // <!--{ Afar; ?; ? } => { Afar; Latin; Ethiopia }-->
107             final String target = combineLSR(data.get0(), data.get1(), data.get2());
108             final String origin = data.get3();
109             final String result =
110                     "<likelySubtag from=\""
111                             + source
112                             + "\" to=\""
113                             + target
114                             + (origin.isBlank() ? "" : "\" origin=\"" + origin)
115                             + "\"/>"
116                             + "\t<!-- "
117                             + english.getName(source)
118                             + " ➡︎ "
119                             + english.getName(target)
120                             + " -->";
121             return result;
122         }
123 
combineLSR(String lang, String script, String region)124         public static String combineLSR(String lang, String script, String region) {
125             return lang
126                     + (script.isEmpty() ? "" : "_" + script)
127                     + (region.isEmpty() ? "" : "_" + region);
128         }
129     }
130 
isOk( String lang, String script, String region, Map<LstrType, Status> errors)131     private static boolean isOk(
132             String lang, String script, String region, Map<LstrType, Status> errors) {
133         errors.clear();
134         if (!LIKELY_SPECIALS.contains(lang)) {
135             check(LstrType.language, lang, errors);
136         }
137         if (!FIX_VALIDITY.contains(script)) {
138             check(LstrType.script, script, errors);
139         }
140         if (region.equals("001") && Iso639Data.getType(lang) == Type.Constructed) {
141             // ok
142         } else {
143             check(LstrType.region, region, errors);
144         }
145         return errors.isEmpty();
146     }
147 
check(LstrType lstrType, String lang, Map<LstrType, Status> errors)148     private static void check(LstrType lstrType, String lang, Map<LstrType, Status> errors) {
149         final Status status = validity.getCodeToStatus(lstrType).get(lang);
150         if (status != Status.regular) {
151             errors.put(lstrType, status);
152         }
153     }
154 
155     private static class LikelySources {
156         private static LikelySources SINGLETON = new LikelySources();
157 
getSources()158         public static Set<String> getSources() {
159             return SINGLETON.alreadyLangs;
160         }
161 
162         final ImmutableSet<String> alreadyLangs;
163 
LikelySources()164         private LikelySources() {
165             Map<LstrType, Status> errors = new TreeMap<>();
166             Map<String, String> likely = CLDR_CONFIG.getSupplementalDataInfo().getLikelySubtags();
167             Set<String> _alreadyLangs = new TreeSet<>();
168             _alreadyLangs.add("und");
169             likely.forEach(
170                     (key, value) -> {
171                         String lang = ltpFull.set(value).getLanguage();
172                         String script = ltpFull.set(value).getScript();
173                         String region = ltpFull.set(value).getRegion();
174                         _alreadyLangs.add(lang);
175                         if (!isOk(lang, script, region, errors)) {
176                             showSkip("Skipping scope, CLDR", key, value, errors);
177                         }
178                     });
179             System.out.println();
180             alreadyLangs = ImmutableSet.copyOf(_alreadyLangs);
181         }
182     }
183 
184     static Multimap<String, String> langToRegion;
185 
main(String[] args)186     public static void main(String[] args) {
187 
188         Map<String, LSRSource> result = new TreeMap<>();
189         Map<LstrType, Status> errors = new TreeMap<>();
190 
191         Errors processErrors = new Errors();
192 
193         langToRegion = readWikidata(LikelySources.getSources());
194         readJson(LikelySources.getSources(), result, processErrors);
195 
196         processErrors.printAll();
197 
198         if (ADD_SEED_EXEMPLARS) {
199 
200             for (String locale : factory.getAvailable()) {
201                 CLDRFile file = factory.make(locale, false);
202                 UnicodeSet exemplars = file.getExemplarSet(ExemplarType.main, null);
203                 String lang = ltpFull.set(locale).getLanguage();
204                 if (!LikelySources.getSources().contains(lang)) {
205                     String script = getScript(exemplars);
206                     Collection<String> regions = langToRegion.get(lang);
207                     for (String region : regions) {
208                         addIfOk(result, lang, lang, script, region, "wiki+exemplars", errors);
209                     }
210                 }
211             }
212         }
213         System.out.println();
214 
215         Multimap<String, String> defects = LinkedHashMultimap.create();
216 
217         for (Entry<String, LSRSource> entry : result.entrySet()) {
218             String source = entry.getKey();
219             LSRSource lsrs = entry.getValue();
220             String tagLang = ltpTag.set(source).getLanguage();
221             if (!result.containsKey(tagLang)) {
222                 defects.put(source, tagLang);
223                 showError("Missing lang record", source, lsrs.toString(), "Needs\t" + tagLang);
224             }
225         }
226 
227         System.out.println("\nData to add: " + (result.entrySet().size() - defects.size()) + "\n");
228 
229         for (Entry<String, LSRSource> entry : result.entrySet()) {
230             String source = entry.getKey();
231             if (defects.containsKey(source)) {
232                 continue;
233             }
234             LSRSource lsrs = entry.getValue();
235             System.out.println("\t\t" + lsrs.line(source));
236         }
237 
238         //        Multimap<String, String> likelyAdditions = TreeMultimap.create();
239         //        System.out.println("\nAdd");
240         //        likelyAdditions.asMap().entrySet().forEach(x -> {
241         //            String key = x.getKey();
242         //            if (x.getValue().size() == 1) {
243         //                for (String value : x.getValue()) {
244         //                    System.out.println(key + "\t" + value + "\t" + infoFields(value));
245         //                }
246         //            }
247         //        }
248         //            );
249         //
250         //        System.out.println("\nFix & Add");
251         //
252         //        likelyAdditions.asMap().entrySet().forEach(x -> {
253         //            String key = x.getKey();
254         //            if (x.getValue().size() != 1) {
255         //                for (String value : x.getValue()) {
256         //                    System.out.println(key + "\t" + value + "\t" + infoFields(value));
257         //                }
258         //                System.out.println();
259         //            }
260         //        }
261         //            );
262 
263     }
264 
265     static ImmutableMap<String, String> remap = ImmutableMap.of("iw", "he", "jw", "jv");
266 
list(String string)267     private static void list(String string) {
268         for (String code : string.split(" ")) {
269             ltpFull.set(code.replace("-", "_"));
270             String lang = ltpFull.getLanguage();
271             String cldrLang = remap.get(lang);
272             if (cldrLang != null) {
273                 lang = cldrLang;
274             }
275 
276             System.out.println(
277                     code
278                             + "\t"
279                             + english.getName(code)
280                             + "\t"
281                             + Iso639Data.getType(lang)
282                             + "\t"
283                             + Iso639Data.getScope(lang));
284         }
285         System.out.println();
286     }
287 
showSkip( String message, String source, String target, Map<LstrType, Status> errors)288     public static void showSkip(
289             String message, String source, String target, Map<LstrType, Status> errors) {
290         showError(message, source, target, infoFields(target) + "\t" + errors);
291     }
292 
showError(String message, String source, String target, String errors)293     public static void showError(String message, String source, String target, String errors) {
294         System.out.println(
295                 message + "\t" + source + " ➡ " + target + (errors.isEmpty() ? "" : "\t" + errors));
296     }
297 
infoFields(String value)298     private static String infoFields(String value) {
299         int under = value.indexOf('_');
300         String lang = under < 0 ? value : value.substring(0, under);
301         return english.getName(value)
302                 + "\t"
303                 + Iso639Data.getScope(lang)
304                 + "\t"
305                 + Iso639Data.getType(lang);
306     }
307 
308     // add  <likelySubtag from="aa" to="aa_Latn_ET"/>, status
309 
310     //    private static void handle(Entry<String, LSRSource> original, Multimap<String, String>
311     // likelyAdditions) {
312     //        String source = original.getKey();
313     //        LSRSource lsr = original.getValue();
314     //        if (source.contains("_")) {
315     //            int debug = 0;
316     //        }
317     //        // it is ok if there is a single LSR, eg
318     //        // eg aaa   Ghotuo  {Latn={NG=[sil]}}
319     //        // eg aak   Ankave  {Latn={PG=[sil, wiki+exemplars]}}
320     //
321     //        for (Entry<R3<String, String, String>, String> entry : lsr.data) {
322     //            addKeys(source, entry.getKey(), entry.getValue(),  likelyAdditions);
323     //        }
324     //    }
325 
326     //    private static void addKeys(String source, R3<String, String, String> r3, String comment,
327     // Multimap<String, String> likelyAdditions) {
328     //        likelyAdditions.put(source, r3.get0() + "_" + r3.get1() + "_" + r3.get2() + comment);
329     //    }
330 
    // Matches a whole line of the form  "full": "...",  or  "tag": "...",  (leading whitespace
    // allowed). Group 1 is the key ("full" or "tag"), group 2 the quoted value.
    static final Pattern fullTagMatch = Pattern.compile("\\s*\"(full|tag)\": \"([^\"]+)\",");
332 
333     private static class Errors {
334         public enum Type {
335             ill_formed_tags("Ill-formed tags"),
336             already_CLDR("Language already in CLDR"),
337             tag_not_in_full("tag ⊄ full"),
338             exception("exception");
339             private final String printable;
340 
341             private Type(String printable) {
342                 this.printable = printable;
343             }
344         }
345 
346         public Multimap<Type, String> data = TreeMultimap.create();
347 
348         public void put(
349                 Type illFormedTags, String tagValue, String fullValue, String errorMessage) {
350             data.put(
351                     illFormedTags,
352                     tagValue
353                             + " ➡ "
354                             + fullValue
355                             + (errorMessage == null || errorMessage.isEmpty()
356                                     ? ""
357                                     : "\t—\t" + errorMessage));
358         }
359 
360         public void printAll() {
361             for (Entry<Type, Collection<String>> entry : data.asMap().entrySet()) {
362                 Type type = entry.getKey();
363                 System.out.println();
364                 for (String message : entry.getValue()) {
365                     System.out.println(type + "\t" + message);
366                 }
367             }
368         }
369     }
370 
371     private static Map<String, LSRSource> readJson(
372             Set<String> alreadyLangs, Map<String, LSRSource> result, Errors processErrors) {
373         Path path = Paths.get(CLDRPaths.BIRTH_DATA_DIR, "/../external/langtags.json");
374         Matcher full = fullTagMatch.matcher("");
375         Map<LstrType, Status> errors = new TreeMap<>();
376 
377         Output<String> lastFull = new Output<>();
378         try {
379             Files.lines(path)
380                     .forEach(
381                             x -> {
382                                 if (full.reset(x).matches()) {
383                                     final String key = full.group(1);
384                                     final String value = full.group(2).replace("-", "_");
385                                     if (value.startsWith("aai")) {
386                                         int debug = 0;
387                                     }
388                                     switch (key) {
389                                         case "full":
390                                             lastFull.value = value;
391                                             break;
392                                         case "tag":
393                                             try {
394                                                 String fullLang =
395                                                         ltpFull.set(lastFull.value).getLanguage();
396                                                 if (alreadyLangs.contains(fullLang)) {
397                                                     processErrors.put(
398                                                             Errors.Type.already_CLDR,
399                                                             value,
400                                                             lastFull.value,
401                                                             "");
402                                                     break;
403                                                 } else if (isIllFormed(lastFull.value, ltpFull)
404                                                         || isIllFormed(value, ltpTag.set(value))) {
405                                                     processErrors.put(
406                                                             Errors.Type.ill_formed_tags,
407                                                             value,
408                                                             lastFull.value,
409                                                             "");
410                                                 } else {
411                                                     String reference = SIL;
412                                                     final String fullScript = ltpFull.getScript();
413                                                     String fullRegion = ltpFull.getRegion();
414                                                     if (fullRegion.equals("ZZ")
415                                                             || fullRegion.equals("001")) {
416                                                         Collection<String> tempRegions =
417                                                                 langToRegion.get(
418                                                                         fullLang); // synthesize
419                                                         if (!tempRegions.isEmpty()) {
420                                                             fullRegion =
421                                                                     tempRegions.iterator().next();
422                                                             reference += " wikidata";
423                                                         }
424                                                     }
425 
426                                                     String tagLang = ltpTag.getLanguage();
427                                                     String tagScript = ltpTag.getScript();
428                                                     String tagRegion = ltpTag.getRegion();
429 
430                                                     if (!tagLang.equals(fullLang)
431                                                             || (!tagScript.isEmpty()
432                                                                     && !tagScript.equals(
433                                                                             fullScript))
434                                                             || (!tagRegion.isEmpty()
435                                                                     && !tagRegion.equals(
436                                                                             fullRegion))) {
437                                                         processErrors.put(
438                                                                 Errors.Type.tag_not_in_full,
439                                                                 value,
440                                                                 lastFull.value,
441                                                                 "");
442                                                     } else {
443                                                         addIfOk(
444                                                                 result,
445                                                                 value,
446                                                                 fullLang,
447                                                                 fullScript,
448                                                                 fullRegion,
449                                                                 reference,
450                                                                 errors);
451                                                     }
452                                                 }
453                                             } catch (Exception e) {
454                                                 processErrors.put(
455                                                         Errors.Type.exception,
456                                                         value,
457                                                         lastFull.value,
458                                                         e.getMessage());
459                                             }
460                                             break;
461                                         default:
462                                             throw new IllegalArgumentException(); // never happens
463                                     }
464                                 }
465                             });
466             return result;
467         } catch (IOException ex) {
468             throw new UncheckedIOException(ex);
469         }
470     }
471 
isIllFormed(String source, LanguageTagParser languageTagParser)472     private static boolean isIllFormed(String source, LanguageTagParser languageTagParser) {
473         return languageTagParser.getLanguage().isEmpty()
474                 || !languageTagParser.getVariants().isEmpty()
475                 || !languageTagParser.getExtensions().isEmpty()
476                 || !languageTagParser.getLocaleExtensions().isEmpty()
477                 || source.contains("@");
478     }
479 
addIfOk( Map<String, LSRSource> result, String source, String lang, final String script, final String region, String reference, Map<LstrType, Status> errors)480     private static void addIfOk(
481             Map<String, LSRSource> result,
482             String source,
483             String lang,
484             final String script,
485             final String region,
486             String reference,
487             Map<LstrType, Status> errors) {
488         if (isOk(lang, script, region, errors)) {
489             add(result, source, lang, script, region, reference);
490         } else {
491             showSkip("Skipping scope, SIL", source, ltpFull.toString(), errors);
492         }
493     }
494 
readWikidata(Set<String> alreadyLangs)495     private static Multimap<String, String> readWikidata(Set<String> alreadyLangs) {
496         Multimap<String, String> result = TreeMultimap.create();
497         Path path = Paths.get(CLDRPaths.BIRTH_DATA_DIR, "/../external/wididata_lang_region.tsv");
498         try {
499             Files.lines(path)
500                     .forEach(
501                             x -> {
502                                 if (!x.startsWith("#")) {
503                                     List<String> list = TAB_SPLITTER.splitToList(x);
504                                     String lang = list.get(1);
505                                     String region = list.get(3);
506                                     result.put(lang, region);
507                                 }
508                             });
509         } catch (IOException ex) {
510             throw new UncheckedIOException(ex);
511         }
512         return result;
513     }
514 
add( Map<String, LSRSource> result, String source, String lang, final String script, final String region, String reference)515     private static void add(
516             Map<String, LSRSource> result,
517             String source,
518             String lang,
519             final String script,
520             final String region,
521             String reference) {
522         LSRSource old = result.get(source);
523         LSRSource newVersion = new LSRSource(lang, script, region, reference);
524         if (old != null && !old.equals(newVersion)) {
525             throw new IllegalArgumentException(
526                     "Data already exists for " + source + ": old=" + old + ", new: " + newVersion);
527         }
528         result.put(source, newVersion);
529     }
530 
getScript(UnicodeSet exemplars)531     private static String getScript(UnicodeSet exemplars) {
532         for (String s : exemplars) {
533             int scriptNum = UScript.getScript(s.codePointAt(0));
534             if (scriptNum != UScript.COMMON && scriptNum != UScript.INHERITED) {
535                 return UScript.getShortName(scriptNum);
536             }
537         }
538         return "Zxxx";
539     }
540 }
541