xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/test/BestMinimalPairSamples.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.test;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.collect.ImmutableMultimap;
5 import com.google.common.collect.ImmutableSet;
6 import com.google.common.collect.Maps;
7 import com.google.common.collect.Multimap;
8 import com.google.common.collect.Multimaps;
9 import com.google.common.collect.TreeMultimap;
10 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap;
11 import com.ibm.icu.impl.number.DecimalQuantity;
12 import com.ibm.icu.text.DecimalFormat;
13 import com.ibm.icu.text.PluralRules;
14 import com.ibm.icu.text.PluralRules.DecimalQuantitySamples;
15 import com.ibm.icu.text.PluralRules.DecimalQuantitySamplesRange;
16 import com.ibm.icu.text.PluralRules.Operand;
17 import com.ibm.icu.text.PluralRules.SampleType;
18 import com.ibm.icu.util.Output;
19 import java.util.Collection;
20 import java.util.Collections;
21 import java.util.Map;
22 import java.util.Map.Entry;
23 import java.util.NavigableSet;
24 import java.util.Set;
25 import java.util.TreeMap;
26 import org.unicode.cldr.util.CLDRConfig;
27 import org.unicode.cldr.util.CLDRFile;
28 import org.unicode.cldr.util.GrammarInfo;
29 import org.unicode.cldr.util.GrammarInfo.GrammaticalFeature;
30 import org.unicode.cldr.util.GrammarInfo.GrammaticalScope;
31 import org.unicode.cldr.util.GrammarInfo.GrammaticalTarget;
32 import org.unicode.cldr.util.ICUServiceBuilder;
33 import org.unicode.cldr.util.Pair;
34 import org.unicode.cldr.util.SupplementalDataInfo;
35 import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo.Count;
36 import org.unicode.cldr.util.SupplementalDataInfo.PluralType;
37 import org.unicode.cldr.util.UnitConverter.UnitSystem;
38 import org.unicode.cldr.util.UnitPathType;
39 
40 /**
41  * Return the best samples for illustrating minimal pairs
42  *
43  * @author markdavis
44  */
45 public class BestMinimalPairSamples {
46     private static final boolean DEBUG_WEIGHTS = false;
47 
48     public static final String EQUALS_NOMINATIVE = "=nominative";
49     private static final Joiner PLUS_JOINER = Joiner.on("+");
50     private static final CLDRConfig CONFIG = CLDRConfig.getInstance();
51     private static final SupplementalDataInfo supplementalDataInfo =
52             SupplementalDataInfo.getInstance();
53 
54     private final CLDRFile cldrFile;
55     private final GrammarInfo grammarInfo;
56     private final PluralRules pluralInfo;
57     private final PluralRules ordinalInfo;
58     private final ICUServiceBuilder icuServiceBuilder;
59     private CaseAndGenderSamples caseAndGenderSamples = null; // lazy evaluated
60     private Multimap<String, String> genderToUnits;
61     private Multimap<Integer, String> uniqueCaseAndCountToUnits;
62     private Multimap<String, String> distinctNominativeCaseToUnit;
63     private final boolean gatherStats;
64 
BestMinimalPairSamples( CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats)65     public BestMinimalPairSamples(
66             CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats) {
67         this.cldrFile = cldrFile;
68         grammarInfo = supplementalDataInfo.getGrammarInfo(cldrFile.getLocaleID());
69         pluralInfo =
70                 supplementalDataInfo
71                         .getPlurals(PluralType.cardinal, cldrFile.getLocaleID())
72                         .getPluralRules();
73         ordinalInfo =
74                 supplementalDataInfo
75                         .getPlurals(PluralType.ordinal, cldrFile.getLocaleID())
76                         .getPluralRules();
77         this.icuServiceBuilder = icuServiceBuilder;
78         genderToUnits = TreeMultimap.create();
79         uniqueCaseAndCountToUnits = TreeMultimap.create();
80         this.gatherStats = gatherStats;
81     }
82 
83     static final class CaseAndGenderSamples {
84         private final Map<String, Pair<String, String>> genderCache;
85         private final Map<String, String> caseCache;
86         private final String caseUnitId;
87 
CaseAndGenderSamples( Map<String, String> caseCache2, String bestCaseUnitId, Map<String, Pair<String, String>> genderCache2)88         public CaseAndGenderSamples(
89                 Map<String, String> caseCache2,
90                 String bestCaseUnitId,
91                 Map<String, Pair<String, String>> genderCache2) {
92             genderCache = genderCache2;
93             caseCache = caseCache2;
94             caseUnitId = bestCaseUnitId;
95         }
96 
getGender(String gender, Output<String> shortUnitId)97         public String getGender(String gender, Output<String> shortUnitId) {
98             Pair<String, String> result = genderCache.get(gender);
99             if (result == null) {
100                 return null;
101             }
102             shortUnitId.value = result.getFirst();
103             return result.getSecond();
104         }
105 
getCase(String unitCase, Output<String> shortUnitId)106         public String getCase(String unitCase, Output<String> shortUnitId) {
107             shortUnitId.value = caseUnitId;
108             return caseCache.get(unitCase);
109         }
110     }
111 
112     /**
113      * Returns a "good" value for a unit. Favors metric units, and simple units
114      *
115      * @param shortUnitId
116      */
getBestUnitWithGender(String gender, Output<String> shortUnitId)117     public synchronized String getBestUnitWithGender(String gender, Output<String> shortUnitId) {
118         if (gender == null || grammarInfo == null) {
119             return null;
120         }
121         if (caseAndGenderSamples == null) {
122             caseAndGenderSamples = loadCaches();
123         }
124         return caseAndGenderSamples.getGender(gender, shortUnitId);
125     }
126 
127     /**
128      * Returns a "good" value for a unit. Favors metric units, and simple units
129      *
130      * @param shortUnitId
131      */
getBestUnitWithCase(String unitCase, Output<String> shortUnitId)132     public synchronized String getBestUnitWithCase(String unitCase, Output<String> shortUnitId) {
133         if (unitCase == null || grammarInfo == null) {
134             return null;
135         }
136         if (caseAndGenderSamples == null) {
137             caseAndGenderSamples = loadCaches();
138         }
139         return caseAndGenderSamples.getCase(unitCase, shortUnitId);
140     }
141 
142     static final Set<String> SKIP_CASE =
143             ImmutableSet.of(
144                     "concentr-ofglucose",
145                     "concentr-portion",
146                     "length-100-kilometer",
147                     "pressure-ofhg");
148 
loadCaches()149     public CaseAndGenderSamples loadCaches() {
150         Collection<String> unitCases =
151                 grammarInfo.get(
152                         GrammaticalTarget.nominal,
153                         GrammaticalFeature.grammaticalCase,
154                         GrammaticalScope.units);
155         Map<String, String> genderResults = Maps.newHashMap();
156         Multimap<String, Pair<String, String>> unitPatternToCaseAndCounts = TreeMultimap.create();
157         distinctNominativeCaseToUnit = TreeMultimap.create();
158 
159         int bestCaseFormCount = 0;
160         String bestCaseUnitId = null;
161         Multimap<String, Pair<String, String>> bestUnitPatternToCases = null;
162         Multimap<String, String> unitToDistinctNominativeCase = TreeMultimap.create();
163 
164         int i = 0;
165         for (String longUnitId : GrammarInfo.getUnitsToAddGrammar()) {
166             String possibleGender =
167                     cldrFile.getStringValue(
168                             "//ldml/units/unitLength[@type=\"long\"]/unit[@type=\""
169                                     + longUnitId
170                                     + "\"]/gender");
171             String shortUnitId = ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId);
172             if (shortUnitId.equals("hour") && cldrFile.getLocaleID().equals("ta")) {
173                 int debug = 0;
174             }
175             if (possibleGender != null) {
176                 if (gatherStats) {
177                     genderToUnits.put(possibleGender, shortUnitId);
178                 }
179                 String formerLongUnitId = genderResults.get(possibleGender);
180                 if (formerLongUnitId == null || isBetterUnit(longUnitId, formerLongUnitId)) {
181                     genderResults.put(possibleGender, longUnitId);
182                     if (DEBUG_WEIGHTS) {
183                         final int sw = systemWeight(longUnitId);
184                         final int cs = categoryWeight(longUnitId);
185                         System.out.println(
186                                 i++ + ") gender " + longUnitId + "; sw: " + sw + " cw: " + cs);
187                     }
188                 }
189             }
190             if (!unitCases.isEmpty()) {
191                 unitPatternToCaseAndCounts.clear();
192                 for (String count : pluralInfo.getKeywords()) {
193                     for (String unitCase : unitCases) {
194                         String grammarAttributes =
195                                 GrammarInfo.getGrammaticalInfoAttributes(
196                                         grammarInfo, UnitPathType.unit, count, null, unitCase);
197                         String unitPattern =
198                                 cldrFile.getStringValue(
199                                         "//ldml/units/unitLength[@type=\"long\"]/unit[@type=\""
200                                                 + longUnitId
201                                                 + "\"]/unitPattern"
202                                                 + grammarAttributes);
203                         if (unitPattern == null) {
204                             continue;
205                         }
206                         unitPattern = unitPattern.replace("\u00A0", "").trim();
207                         final Pair<String, String> caseAndCount = Pair.of(unitCase, count);
208                         unitPatternToCaseAndCounts.put(unitPattern, caseAndCount);
209                     }
210                 }
211                 int caseFormCount = unitPatternToCaseAndCounts.keySet().size();
212 
213                 boolean alwaysSameAsNominative = true;
214                 TreeMultimap<Pair<String, String>, String> caseAndCountToPattern =
215                         Multimaps.invertFrom(unitPatternToCaseAndCounts, TreeMultimap.create());
216                 for (Entry<Pair<String, String>, String> entry : caseAndCountToPattern.entries()) {
217                     Pair<String, String> caseAndCount = entry.getKey();
218                     String pattern = entry.getValue();
219                     String gCase = caseAndCount.getFirst();
220                     if (!gCase.equals("nominative")) {
221                         Pair<String, String> nomPair =
222                                 Pair.of("nominative", caseAndCount.getSecond());
223                         NavigableSet<String> nomPatterns = caseAndCountToPattern.get(nomPair);
224                         if (!nomPatterns.contains(pattern)) {
225                             unitToDistinctNominativeCase.put(shortUnitId, gCase);
226                             alwaysSameAsNominative = false;
227                         }
228                     }
229                 }
230                 for (Entry<String, Collection<String>> entry :
231                         unitToDistinctNominativeCase.asMap().entrySet()) {
232                     distinctNominativeCaseToUnit.put(
233                             PLUS_JOINER.join(entry.getValue()), entry.getKey());
234                 }
235                 if (alwaysSameAsNominative) {
236                     distinctNominativeCaseToUnit.put(EQUALS_NOMINATIVE, shortUnitId);
237                 }
238 
239                 if (gatherStats && !SKIP_CASE.contains(longUnitId)) {
240                     uniqueCaseAndCountToUnits.put(caseFormCount, shortUnitId);
241                 }
242 
243                 // For case, we should do something fancier, but for now we pick the units with the
244                 // largest number of distinct forms.
245                 int diff = caseFormCount - bestCaseFormCount;
246                 if (diff > 0 || diff == 0 && isBetterUnit(longUnitId, bestCaseUnitId)) {
247                     //                    System.out.println(cldrFile.getLocaleID() + "\t" +
248                     // longUnitId + " better than " + bestCaseUnitId);
249                     //                 if (WORSE.contains(longUnitId)) {
250                     //                        isBetterUnit(longUnitId, bestCaseUnitId);
251                     //                    }
252                     bestCaseFormCount = caseFormCount;
253                     bestCaseUnitId = longUnitId;
254                     bestUnitPatternToCases = TreeMultimap.create(unitPatternToCaseAndCounts);
255                 }
256             }
257         }
258         // Fill the case cache with the most distinctive forms.
259         Map<String, String> caseCache = getBestCasePatterns(bestUnitPatternToCases);
260 
261         // Make the gender cache be translated units as well as unit IDs
262         Count count = pluralInfo.getKeywords().contains("one") ? Count.one : Count.other;
263         Map<String, Pair<String, String>> result2 = Maps.newHashMap();
264 
265         for (Entry<String, String> entry : genderResults.entrySet()) {
266             String longUnitId = entry.getValue();
267             String unitPattern =
268                     cldrFile.getStringValue(
269                             "//ldml/units/unitLength[@type=\"long\"]/unit[@type=\""
270                                     + longUnitId
271                                     + "\"]/unitPattern[@count=\""
272                                     + count
273                                     + "\"]");
274             unitPattern = unitPattern.replace("{0}", "").replace("\u00A0", "").trim();
275             result2.put(
276                     entry.getKey(),
277                     Pair.of(ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId), unitPattern));
278         }
279         // it doesn't matter if we reset this due to multiple threads
280         Map<String, Pair<String, String>> genderCache = ImmutableMap.copyOf(result2);
281         CaseAndGenderSamples result =
282                 new CaseAndGenderSamples(
283                         caseCache,
284                         ExampleGenerator.UNIT_CONVERTER.getShortId(bestCaseUnitId),
285                         genderCache);
286 
287         genderToUnits = ImmutableMultimap.copyOf(genderToUnits);
288         uniqueCaseAndCountToUnits = ImmutableMultimap.copyOf(uniqueCaseAndCountToUnits);
289         distinctNominativeCaseToUnit = ImmutableMultimap.copyOf(distinctNominativeCaseToUnit);
290         return result;
291     }
292 
293     /**
294      * Get the a pattern that is most unique for each case.
295      *
296      * @param bestUnitPatternToCases
297      * @return
298      */
getBestCasePatterns( Multimap<String, Pair<String, String>> bestUnitPatternToCases)299     private Map<String, String> getBestCasePatterns(
300             Multimap<String, Pair<String, String>> bestUnitPatternToCases) {
301         if (bestUnitPatternToCases == null || bestUnitPatternToCases.isEmpty()) {
302             return Collections.emptyMap();
303         }
304         Map<String, String> result = new TreeMap<>();
305         while (true) {
306             String bestPattern = getBestPattern(bestUnitPatternToCases);
307             Pair<String, String> bestCaseCount =
308                     bestUnitPatternToCases.get(bestPattern).iterator().next();
309             String bestCase = bestCaseCount.getFirst();
310             String bestCount = bestCaseCount.getSecond();
311             String sample = getPluralOrOrdinalSample(PluralType.cardinal, bestCount);
312             if (sample == null) { // debugging
313                 getPluralOrOrdinalSample(PluralType.cardinal, bestCount);
314             }
315             result.put(bestCaseCount.getFirst(), bestPattern.replace("{0}", sample));
316             TreeMultimap<Pair<String, String>, String> caseToPatterns =
317                     Multimaps.invertFrom(bestUnitPatternToCases, TreeMultimap.create());
318             for (String count : pluralInfo.getKeywords()) {
319                 caseToPatterns.removeAll(Pair.of(bestCase, count));
320             }
321             if (caseToPatterns.keySet().isEmpty()) {
322                 return result;
323             }
324             bestUnitPatternToCases = Multimaps.invertFrom(caseToPatterns, TreeMultimap.create());
325         }
326     }
327 
getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases)328     private String getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases) {
329         int bestCaseSize = 1000;
330         String bestPattern = null;
331         Collection<Pair<String, String>> bestCase = null;
332         for (Entry<String, Collection<Pair<String, String>>> entry :
333                 bestUnitPatternToCases.asMap().entrySet()) {
334             final Collection<Pair<String, String>> setOfCases = entry.getValue();
335             if (setOfCases.size() < bestCaseSize) {
336                 bestCaseSize = setOfCases.size();
337                 bestPattern = entry.getKey();
338                 bestCase = setOfCases;
339             }
340         }
341         return bestPattern;
342     }
343 
isBetterUnit(String longUnitId, String formerLongUnitId)344     public boolean isBetterUnit(String longUnitId, String formerLongUnitId) {
345         // replace if as good or better (where better is smaller). Metric is better. If both metric,
346         // choose alphabetical
347         boolean isBetter = false;
348         int diff = systemWeight(longUnitId) - systemWeight(formerLongUnitId);
349         if (diff < 0) {
350             isBetter = true;
351         } else if (diff == 0) {
352             diff = categoryWeight(longUnitId) - categoryWeight(formerLongUnitId);
353             if (diff < 0) {
354                 isBetter = true;
355             } else if (diff == 0 && longUnitId.compareTo(formerLongUnitId) < 0) {
356                 isBetter = true;
357             }
358         }
359         return isBetter;
360     }
361 
362     static final Set<String> WORSE =
363             ImmutableSet.of("length-100-kilometer", "length-mile-scandinavian");
364     static final Set<String> BEST =
365             ImmutableSet.of(
366                     "duration-year",
367                     "duration-month",
368                     "duration-week",
369                     "duration-day",
370                     "duration-hour",
371                     "duration-minute");
372     /**
373      * better result is smaller
374      *
375      * @param longUnitId
376      * @return
377      */
systemWeight(String longUnitId)378     public int systemWeight(String longUnitId) {
379         if (WORSE.contains(longUnitId)) {
380             return 99;
381         }
382         if (GrammarInfo.getUnitsToAddGrammar().contains(longUnitId)) {
383             if (BEST.contains(longUnitId)) {
384                 return 0; // better
385             }
386             final String shortId = ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId);
387             Set<UnitSystem> systems = ExampleGenerator.UNIT_CONVERTER.getSystemsEnum(shortId);
388             if (systems.contains(UnitSystem.metric)) {
389                 return 1; // better
390             } else {
391                 return systems.iterator().next().ordinal() + 2;
392             }
393         }
394         return 99;
395     }
396 
categoryWeight(String longUnitId)397     private int categoryWeight(String longUnitId) {
398         if (longUnitId.startsWith("length")) {
399             return 0;
400         } else if (longUnitId.startsWith("weight")) {
401             return 1;
402         } else if (longUnitId.startsWith("duration")) {
403             return 2;
404         } else if (longUnitId.startsWith("area")) {
405             return 2;
406         } else if (longUnitId.startsWith("volume")) {
407             return 2;
408         }
409         return 999;
410     }
411 
getPluralOrOrdinalSample(PluralType pluralType, String code)412     public String getPluralOrOrdinalSample(PluralType pluralType, String code) {
413         PluralRules rules = pluralType == PluralType.cardinal ? pluralInfo : ordinalInfo;
414         DecimalQuantitySamples samples = rules.getDecimalSamples(code, SampleType.INTEGER);
415         if (samples == null) {
416             samples = rules.getDecimalSamples(code, SampleType.DECIMAL);
417         }
418         if (samples == null) {
419             return null;
420         }
421 
422         // get good sample. Avoid zero if possible
423         DecimalQuantity sample = null;
424         for (DecimalQuantitySamplesRange sampleRange : samples.getSamples()) {
425             sample = sampleRange.start;
426             if (sample.toDouble() != 0d) {
427                 break;
428             }
429         }
430 
431         if (icuServiceBuilder != null) {
432             int visibleDigits = (int) sample.getPluralOperand(Operand.v);
433             DecimalFormat nf;
434             if (visibleDigits == 0) {
435                 nf = icuServiceBuilder.getNumberFormat(0); // 0 is integer, 1 is decimal
436             } else {
437                 nf = icuServiceBuilder.getNumberFormat(1); // 0 is integer, 1 is decimal
438                 int minFracDigits = nf.getMinimumFractionDigits();
439                 int maxFracDigits = nf.getMaximumFractionDigits();
440                 if (minFracDigits != visibleDigits || maxFracDigits != visibleDigits) {
441                     nf = (DecimalFormat) nf.clone();
442                     nf.setMaximumFractionDigits(visibleDigits);
443                     nf.setMinimumFractionDigits(visibleDigits);
444                 }
445             }
446             return nf.format(sample.toBigDecimal());
447         }
448         return sample.toString();
449     }
450 
451     /** Get the best value to show, plus the shortUnitId if relevant (case/gender) */
getBestValue(String header, String code, Output<String> shortUnitId)452     public String getBestValue(String header, String code, Output<String> shortUnitId) {
453         String result = null;
454         switch (header) {
455             case "Case":
456                 result = getBestUnitWithCase(code, shortUnitId);
457                 break;
458             case "Gender":
459                 result = getBestUnitWithGender(code, shortUnitId);
460                 break;
461             case "Ordinal":
462                 result = getPluralOrOrdinalSample(PluralType.ordinal, code);
463                 shortUnitId.value = "n/a";
464                 break;
465             case "Plural":
466                 result = getPluralOrOrdinalSample(PluralType.cardinal, code);
467                 shortUnitId.value = "n/a";
468                 break;
469         }
470         return result == null ? "X" : result;
471     }
472 
getGenderToUnits()473     public Multimap<String, String> getGenderToUnits() {
474         return genderToUnits;
475     }
476 
getUniqueCaseAndCountToUnits()477     public Multimap<Integer, String> getUniqueCaseAndCountToUnits() {
478         return uniqueCaseAndCountToUnits;
479     }
480 
getDistinctNominativeCaseToUnit()481     public Multimap<String, String> getDistinctNominativeCaseToUnit() {
482         return distinctNominativeCaseToUnit;
483     }
484 }
485