1 package org.unicode.cldr.test; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.collect.ImmutableMultimap; 5 import com.google.common.collect.ImmutableSet; 6 import com.google.common.collect.Maps; 7 import com.google.common.collect.Multimap; 8 import com.google.common.collect.Multimaps; 9 import com.google.common.collect.TreeMultimap; 10 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; 11 import com.ibm.icu.impl.number.DecimalQuantity; 12 import com.ibm.icu.text.DecimalFormat; 13 import com.ibm.icu.text.PluralRules; 14 import com.ibm.icu.text.PluralRules.DecimalQuantitySamples; 15 import com.ibm.icu.text.PluralRules.DecimalQuantitySamplesRange; 16 import com.ibm.icu.text.PluralRules.Operand; 17 import com.ibm.icu.text.PluralRules.SampleType; 18 import com.ibm.icu.util.Output; 19 import java.util.Collection; 20 import java.util.Collections; 21 import java.util.Map; 22 import java.util.Map.Entry; 23 import java.util.NavigableSet; 24 import java.util.Set; 25 import java.util.TreeMap; 26 import org.unicode.cldr.util.CLDRConfig; 27 import org.unicode.cldr.util.CLDRFile; 28 import org.unicode.cldr.util.GrammarInfo; 29 import org.unicode.cldr.util.GrammarInfo.GrammaticalFeature; 30 import org.unicode.cldr.util.GrammarInfo.GrammaticalScope; 31 import org.unicode.cldr.util.GrammarInfo.GrammaticalTarget; 32 import org.unicode.cldr.util.ICUServiceBuilder; 33 import org.unicode.cldr.util.Pair; 34 import org.unicode.cldr.util.SupplementalDataInfo; 35 import org.unicode.cldr.util.SupplementalDataInfo.PluralInfo.Count; 36 import org.unicode.cldr.util.SupplementalDataInfo.PluralType; 37 import org.unicode.cldr.util.UnitConverter.UnitSystem; 38 import org.unicode.cldr.util.UnitPathType; 39 40 /** 41 * Return the best samples for illustrating minimal pairs 42 * 43 * @author markdavis 44 */ 45 public class BestMinimalPairSamples { 46 private static final boolean DEBUG_WEIGHTS = false; 47 48 public static final String EQUALS_NOMINATIVE = "=nominative"; 49 private static final Joiner PLUS_JOINER = Joiner.on("+"); 50 private static final CLDRConfig CONFIG = CLDRConfig.getInstance(); 51 private static final SupplementalDataInfo supplementalDataInfo = 52 SupplementalDataInfo.getInstance(); 53 54 private final CLDRFile cldrFile; 55 private final GrammarInfo grammarInfo; 56 private final PluralRules pluralInfo; 57 private final PluralRules ordinalInfo; 58 private final ICUServiceBuilder icuServiceBuilder; 59 private CaseAndGenderSamples caseAndGenderSamples = null; // lazy evaluated 60 private Multimap<String, String> genderToUnits; 61 private Multimap<Integer, String> uniqueCaseAndCountToUnits; 62 private Multimap<String, String> distinctNominativeCaseToUnit; 63 private final boolean gatherStats; 64 BestMinimalPairSamples( CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats)65 public BestMinimalPairSamples( 66 CLDRFile cldrFile, ICUServiceBuilder icuServiceBuilder, boolean gatherStats) { 67 this.cldrFile = cldrFile; 68 grammarInfo = supplementalDataInfo.getGrammarInfo(cldrFile.getLocaleID()); 69 pluralInfo = 70 supplementalDataInfo 71 .getPlurals(PluralType.cardinal, cldrFile.getLocaleID()) 72 .getPluralRules(); 73 ordinalInfo = 74 supplementalDataInfo 75 .getPlurals(PluralType.ordinal, cldrFile.getLocaleID()) 76 .getPluralRules(); 77 this.icuServiceBuilder = icuServiceBuilder; 78 genderToUnits = TreeMultimap.create(); 79 uniqueCaseAndCountToUnits = TreeMultimap.create(); 80 this.gatherStats = gatherStats; 81 } 82 83 static final class CaseAndGenderSamples { 84 private final Map<String, Pair<String, String>> genderCache; 85 private final Map<String, String> caseCache; 86 private final String caseUnitId; 87 CaseAndGenderSamples( Map<String, String> caseCache2, String bestCaseUnitId, Map<String, Pair<String, String>> genderCache2)88 public CaseAndGenderSamples( 89 Map<String, String> caseCache2, 90 String bestCaseUnitId, 91 Map<String, Pair<String, String>> genderCache2) { 92 genderCache = genderCache2; 93 caseCache = caseCache2; 94 caseUnitId = bestCaseUnitId; 95 } 96 getGender(String gender, Output<String> shortUnitId)97 public String getGender(String gender, Output<String> shortUnitId) { 98 Pair<String, String> result = genderCache.get(gender); 99 if (result == null) { 100 return null; 101 } 102 shortUnitId.value = result.getFirst(); 103 return result.getSecond(); 104 } 105 getCase(String unitCase, Output<String> shortUnitId)106 public String getCase(String unitCase, Output<String> shortUnitId) { 107 shortUnitId.value = caseUnitId; 108 return caseCache.get(unitCase); 109 } 110 } 111 112 /** 113 * Returns a "good" value for a unit. Favors metric units, and simple units 114 * 115 * @param shortUnitId 116 */ getBestUnitWithGender(String gender, Output<String> shortUnitId)117 public synchronized String getBestUnitWithGender(String gender, Output<String> shortUnitId) { 118 if (gender == null || grammarInfo == null) { 119 return null; 120 } 121 if (caseAndGenderSamples == null) { 122 caseAndGenderSamples = loadCaches(); 123 } 124 return caseAndGenderSamples.getGender(gender, shortUnitId); 125 } 126 127 /** 128 * Returns a "good" value for a unit. Favors metric units, and simple units 129 * 130 * @param shortUnitId 131 */ getBestUnitWithCase(String unitCase, Output<String> shortUnitId)132 public synchronized String getBestUnitWithCase(String unitCase, Output<String> shortUnitId) { 133 if (unitCase == null || grammarInfo == null) { 134 return null; 135 } 136 if (caseAndGenderSamples == null) { 137 caseAndGenderSamples = loadCaches(); 138 } 139 return caseAndGenderSamples.getCase(unitCase, shortUnitId); 140 } 141 142 static final Set<String> SKIP_CASE = 143 ImmutableSet.of( 144 "concentr-ofglucose", 145 "concentr-portion", 146 "length-100-kilometer", 147 "pressure-ofhg"); 148 loadCaches()149 public CaseAndGenderSamples loadCaches() { 150 Collection<String> unitCases = 151 grammarInfo.get( 152 GrammaticalTarget.nominal, 153 GrammaticalFeature.grammaticalCase, 154 GrammaticalScope.units); 155 Map<String, String> genderResults = Maps.newHashMap(); 156 Multimap<String, Pair<String, String>> unitPatternToCaseAndCounts = TreeMultimap.create(); 157 distinctNominativeCaseToUnit = TreeMultimap.create(); 158 159 int bestCaseFormCount = 0; 160 String bestCaseUnitId = null; 161 Multimap<String, Pair<String, String>> bestUnitPatternToCases = null; 162 Multimap<String, String> unitToDistinctNominativeCase = TreeMultimap.create(); 163 164 int i = 0; 165 for (String longUnitId : GrammarInfo.getUnitsToAddGrammar()) { 166 String possibleGender = 167 cldrFile.getStringValue( 168 "//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" 169 + longUnitId 170 + "\"]/gender"); 171 String shortUnitId = ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId); 172 if (shortUnitId.equals("hour") && cldrFile.getLocaleID().equals("ta")) { 173 int debug = 0; 174 } 175 if (possibleGender != null) { 176 if (gatherStats) { 177 genderToUnits.put(possibleGender, shortUnitId); 178 } 179 String formerLongUnitId = genderResults.get(possibleGender); 180 if (formerLongUnitId == null || isBetterUnit(longUnitId, formerLongUnitId)) { 181 genderResults.put(possibleGender, longUnitId); 182 if (DEBUG_WEIGHTS) { 183 final int sw = systemWeight(longUnitId); 184 final int cs = categoryWeight(longUnitId); 185 System.out.println( 186 i++ + ") gender " + longUnitId + "; sw: " + sw + " cw: " + cs); 187 } 188 } 189 } 190 if (!unitCases.isEmpty()) { 191 unitPatternToCaseAndCounts.clear(); 192 for (String count : pluralInfo.getKeywords()) { 193 for (String unitCase : unitCases) { 194 String grammarAttributes = 195 GrammarInfo.getGrammaticalInfoAttributes( 196 grammarInfo, UnitPathType.unit, count, null, unitCase); 197 String unitPattern = 198 cldrFile.getStringValue( 199 "//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" 200 + longUnitId 201 + "\"]/unitPattern" 202 + grammarAttributes); 203 if (unitPattern == null) { 204 continue; 205 } 206 unitPattern = unitPattern.replace("\u00A0", "").trim(); 207 final Pair<String, String> caseAndCount = Pair.of(unitCase, count); 208 unitPatternToCaseAndCounts.put(unitPattern, caseAndCount); 209 } 210 } 211 int caseFormCount = unitPatternToCaseAndCounts.keySet().size(); 212 213 boolean alwaysSameAsNominative = true; 214 TreeMultimap<Pair<String, String>, String> caseAndCountToPattern = 215 Multimaps.invertFrom(unitPatternToCaseAndCounts, TreeMultimap.create()); 216 for (Entry<Pair<String, String>, String> entry : caseAndCountToPattern.entries()) { 217 Pair<String, String> caseAndCount = entry.getKey(); 218 String pattern = entry.getValue(); 219 String gCase = caseAndCount.getFirst(); 220 if (!gCase.equals("nominative")) { 221 Pair<String, String> nomPair = 222 Pair.of("nominative", caseAndCount.getSecond()); 223 NavigableSet<String> nomPatterns = caseAndCountToPattern.get(nomPair); 224 if (!nomPatterns.contains(pattern)) { 225 unitToDistinctNominativeCase.put(shortUnitId, gCase); 226 alwaysSameAsNominative = false; 227 } 228 } 229 } 230 for (Entry<String, Collection<String>> entry : 231 unitToDistinctNominativeCase.asMap().entrySet()) { 232 distinctNominativeCaseToUnit.put( 233 PLUS_JOINER.join(entry.getValue()), entry.getKey()); 234 } 235 if (alwaysSameAsNominative) { 236 distinctNominativeCaseToUnit.put(EQUALS_NOMINATIVE, shortUnitId); 237 } 238 239 if (gatherStats && !SKIP_CASE.contains(longUnitId)) { 240 uniqueCaseAndCountToUnits.put(caseFormCount, shortUnitId); 241 } 242 243 // For case, we should do something fancier, but for now we pick the units with the 244 // largest number of distinct forms. 245 int diff = caseFormCount - bestCaseFormCount; 246 if (diff > 0 || diff == 0 && isBetterUnit(longUnitId, bestCaseUnitId)) { 247 // System.out.println(cldrFile.getLocaleID() + "\t" + 248 // longUnitId + " better than " + bestCaseUnitId); 249 // if (WORSE.contains(longUnitId)) { 250 // isBetterUnit(longUnitId, bestCaseUnitId); 251 // } 252 bestCaseFormCount = caseFormCount; 253 bestCaseUnitId = longUnitId; 254 bestUnitPatternToCases = TreeMultimap.create(unitPatternToCaseAndCounts); 255 } 256 } 257 } 258 // Fill the case cache with the most distinctive forms. 259 Map<String, String> caseCache = getBestCasePatterns(bestUnitPatternToCases); 260 261 // Make the gender cache be translated units as well as unit IDs 262 Count count = pluralInfo.getKeywords().contains("one") ? Count.one : Count.other; 263 Map<String, Pair<String, String>> result2 = Maps.newHashMap(); 264 265 for (Entry<String, String> entry : genderResults.entrySet()) { 266 String longUnitId = entry.getValue(); 267 String unitPattern = 268 cldrFile.getStringValue( 269 "//ldml/units/unitLength[@type=\"long\"]/unit[@type=\"" 270 + longUnitId 271 + "\"]/unitPattern[@count=\"" 272 + count 273 + "\"]"); 274 unitPattern = unitPattern.replace("{0}", "").replace("\u00A0", "").trim(); 275 result2.put( 276 entry.getKey(), 277 Pair.of(ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId), unitPattern)); 278 } 279 // it doesn't matter if we reset this due to multiple threads 280 Map<String, Pair<String, String>> genderCache = ImmutableMap.copyOf(result2); 281 CaseAndGenderSamples result = 282 new CaseAndGenderSamples( 283 caseCache, 284 ExampleGenerator.UNIT_CONVERTER.getShortId(bestCaseUnitId), 285 genderCache); 286 287 genderToUnits = ImmutableMultimap.copyOf(genderToUnits); 288 uniqueCaseAndCountToUnits = ImmutableMultimap.copyOf(uniqueCaseAndCountToUnits); 289 distinctNominativeCaseToUnit = ImmutableMultimap.copyOf(distinctNominativeCaseToUnit); 290 return result; 291 } 292 293 /** 294 * Get the a pattern that is most unique for each case. 295 * 296 * @param bestUnitPatternToCases 297 * @return 298 */ getBestCasePatterns( Multimap<String, Pair<String, String>> bestUnitPatternToCases)299 private Map<String, String> getBestCasePatterns( 300 Multimap<String, Pair<String, String>> bestUnitPatternToCases) { 301 if (bestUnitPatternToCases == null || bestUnitPatternToCases.isEmpty()) { 302 return Collections.emptyMap(); 303 } 304 Map<String, String> result = new TreeMap<>(); 305 while (true) { 306 String bestPattern = getBestPattern(bestUnitPatternToCases); 307 Pair<String, String> bestCaseCount = 308 bestUnitPatternToCases.get(bestPattern).iterator().next(); 309 String bestCase = bestCaseCount.getFirst(); 310 String bestCount = bestCaseCount.getSecond(); 311 String sample = getPluralOrOrdinalSample(PluralType.cardinal, bestCount); 312 if (sample == null) { // debugging 313 getPluralOrOrdinalSample(PluralType.cardinal, bestCount); 314 } 315 result.put(bestCaseCount.getFirst(), bestPattern.replace("{0}", sample)); 316 TreeMultimap<Pair<String, String>, String> caseToPatterns = 317 Multimaps.invertFrom(bestUnitPatternToCases, TreeMultimap.create()); 318 for (String count : pluralInfo.getKeywords()) { 319 caseToPatterns.removeAll(Pair.of(bestCase, count)); 320 } 321 if (caseToPatterns.keySet().isEmpty()) { 322 return result; 323 } 324 bestUnitPatternToCases = Multimaps.invertFrom(caseToPatterns, TreeMultimap.create()); 325 } 326 } 327 getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases)328 private String getBestPattern(Multimap<String, Pair<String, String>> bestUnitPatternToCases) { 329 int bestCaseSize = 1000; 330 String bestPattern = null; 331 Collection<Pair<String, String>> bestCase = null; 332 for (Entry<String, Collection<Pair<String, String>>> entry : 333 bestUnitPatternToCases.asMap().entrySet()) { 334 final Collection<Pair<String, String>> setOfCases = entry.getValue(); 335 if (setOfCases.size() < bestCaseSize) { 336 bestCaseSize = setOfCases.size(); 337 bestPattern = entry.getKey(); 338 bestCase = setOfCases; 339 } 340 } 341 return bestPattern; 342 } 343 isBetterUnit(String longUnitId, String formerLongUnitId)344 public boolean isBetterUnit(String longUnitId, String formerLongUnitId) { 345 // replace if as good or better (where better is smaller). Metric is better. If both metric, 346 // choose alphabetical 347 boolean isBetter = false; 348 int diff = systemWeight(longUnitId) - systemWeight(formerLongUnitId); 349 if (diff < 0) { 350 isBetter = true; 351 } else if (diff == 0) { 352 diff = categoryWeight(longUnitId) - categoryWeight(formerLongUnitId); 353 if (diff < 0) { 354 isBetter = true; 355 } else if (diff == 0 && longUnitId.compareTo(formerLongUnitId) < 0) { 356 isBetter = true; 357 } 358 } 359 return isBetter; 360 } 361 362 static final Set<String> WORSE = 363 ImmutableSet.of("length-100-kilometer", "length-mile-scandinavian"); 364 static final Set<String> BEST = 365 ImmutableSet.of( 366 "duration-year", 367 "duration-month", 368 "duration-week", 369 "duration-day", 370 "duration-hour", 371 "duration-minute"); 372 /** 373 * better result is smaller 374 * 375 * @param longUnitId 376 * @return 377 */ systemWeight(String longUnitId)378 public int systemWeight(String longUnitId) { 379 if (WORSE.contains(longUnitId)) { 380 return 99; 381 } 382 if (GrammarInfo.getUnitsToAddGrammar().contains(longUnitId)) { 383 if (BEST.contains(longUnitId)) { 384 return 0; // better 385 } 386 final String shortId = ExampleGenerator.UNIT_CONVERTER.getShortId(longUnitId); 387 Set<UnitSystem> systems = ExampleGenerator.UNIT_CONVERTER.getSystemsEnum(shortId); 388 if (systems.contains(UnitSystem.metric)) { 389 return 1; // better 390 } else { 391 return systems.iterator().next().ordinal() + 2; 392 } 393 } 394 return 99; 395 } 396 categoryWeight(String longUnitId)397 private int categoryWeight(String longUnitId) { 398 if (longUnitId.startsWith("length")) { 399 return 0; 400 } else if (longUnitId.startsWith("weight")) { 401 return 1; 402 } else if (longUnitId.startsWith("duration")) { 403 return 2; 404 } else if (longUnitId.startsWith("area")) { 405 return 2; 406 } else if (longUnitId.startsWith("volume")) { 407 return 2; 408 } 409 return 999; 410 } 411 getPluralOrOrdinalSample(PluralType pluralType, String code)412 public String getPluralOrOrdinalSample(PluralType pluralType, String code) { 413 PluralRules rules = pluralType == PluralType.cardinal ? pluralInfo : ordinalInfo; 414 DecimalQuantitySamples samples = rules.getDecimalSamples(code, SampleType.INTEGER); 415 if (samples == null) { 416 samples = rules.getDecimalSamples(code, SampleType.DECIMAL); 417 } 418 if (samples == null) { 419 return null; 420 } 421 422 // get good sample. Avoid zero if possible 423 DecimalQuantity sample = null; 424 for (DecimalQuantitySamplesRange sampleRange : samples.getSamples()) { 425 sample = sampleRange.start; 426 if (sample.toDouble() != 0d) { 427 break; 428 } 429 } 430 431 if (icuServiceBuilder != null) { 432 int visibleDigits = (int) sample.getPluralOperand(Operand.v); 433 DecimalFormat nf; 434 if (visibleDigits == 0) { 435 nf = icuServiceBuilder.getNumberFormat(0); // 0 is integer, 1 is decimal 436 } else { 437 nf = icuServiceBuilder.getNumberFormat(1); // 0 is integer, 1 is decimal 438 int minFracDigits = nf.getMinimumFractionDigits(); 439 int maxFracDigits = nf.getMaximumFractionDigits(); 440 if (minFracDigits != visibleDigits || maxFracDigits != visibleDigits) { 441 nf = (DecimalFormat) nf.clone(); 442 nf.setMaximumFractionDigits(visibleDigits); 443 nf.setMinimumFractionDigits(visibleDigits); 444 } 445 } 446 return nf.format(sample.toBigDecimal()); 447 } 448 return sample.toString(); 449 } 450 451 /** Get the best value to show, plus the shortUnitId if relevant (case/gender) */ getBestValue(String header, String code, Output<String> shortUnitId)452 public String getBestValue(String header, String code, Output<String> shortUnitId) { 453 String result = null; 454 switch (header) { 455 case "Case": 456 result = getBestUnitWithCase(code, shortUnitId); 457 break; 458 case "Gender": 459 result = getBestUnitWithGender(code, shortUnitId); 460 break; 461 case "Ordinal": 462 result = getPluralOrOrdinalSample(PluralType.ordinal, code); 463 shortUnitId.value = "n/a"; 464 break; 465 case "Plural": 466 result = getPluralOrOrdinalSample(PluralType.cardinal, code); 467 shortUnitId.value = "n/a"; 468 break; 469 } 470 return result == null ? "X" : result; 471 } 472 getGenderToUnits()473 public Multimap<String, String> getGenderToUnits() { 474 return genderToUnits; 475 } 476 getUniqueCaseAndCountToUnits()477 public Multimap<Integer, String> getUniqueCaseAndCountToUnits() { 478 return uniqueCaseAndCountToUnits; 479 } 480 getDistinctNominativeCaseToUnit()481 public Multimap<String, String> getDistinctNominativeCaseToUnit() { 482 return distinctNominativeCaseToUnit; 483 } 484 } 485