1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu.localedistance; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.base.Preconditions.checkNotNull; 7 import static com.google.common.base.Preconditions.checkState; 8 import static java.util.Arrays.asList; 9 import static org.unicode.cldr.api.CldrData.PathOrder.DTD; 10 import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL; 11 12 import java.io.IOException; 13 import java.util.ArrayList; 14 import java.util.Arrays; 15 import java.util.Collection; 16 import java.util.LinkedHashSet; 17 import java.util.List; 18 import java.util.Map; 19 import java.util.Optional; 20 import java.util.Set; 21 import java.util.logging.Logger; 22 import java.util.stream.Collectors; 23 import java.util.stream.IntStream; 24 import java.util.stream.Stream; 25 26 import org.unicode.cldr.api.AttributeKey; 27 import org.unicode.cldr.api.CldrData; 28 import org.unicode.cldr.api.CldrDataSupplier; 29 import org.unicode.cldr.api.CldrPath; 30 import org.unicode.cldr.api.CldrValue; 31 import org.unicode.cldr.api.PathMatcher; 32 import org.unicode.icu.tool.cldrtoicu.DebugWriter; 33 import org.unicode.icu.tool.cldrtoicu.IcuData; 34 import org.unicode.icu.tool.cldrtoicu.RbPath; 35 import org.unicode.icu.tool.cldrtoicu.RbValue; 36 37 import com.google.common.annotations.VisibleForTesting; 38 import com.google.common.base.Splitter; 39 import com.google.common.collect.ImmutableList; 40 import com.google.common.collect.ImmutableSet; 41 import com.google.common.collect.Iterables; 42 import com.google.common.primitives.Bytes; 43 import com.ibm.icu.impl.locale.LSR; 44 import com.ibm.icu.impl.locale.LikelySubtags; 45 import com.ibm.icu.impl.locale.LocaleDistance; 46 import com.ibm.icu.lang.UScript; 47 48 import com.ibm.icu.util.ULocale; 49 50 /** 51 * Mapper for generating locale distance tables from CLDR language data. 52 * 53 * <p>Note that this is an atypical mapper which does a lot more processing than other 54 * ICU mapper classes and relies on several auxilliary classes (which is why it's in a 55 * different package). Conceptually it's still a "mapper" though, just not a simple one. 56 * 57 * <p>This mapper was converted from the LocaleDistanceBuilder code in the ICU4J project. 58 */ 59 public final class LocaleDistanceMapper { 60 private static final Logger logger = Logger.getLogger(LocaleDistanceMapper.class.getName()); 61 62 // All the language matching data comes from the "written_new" language data in 63 // "common/supplemental/languageInfo.xml". 64 private static final PathMatcher WRITTEN_LANGUAGE_PREFIX = 65 PathMatcher.of("//supplementalData/languageMatching/languageMatches[@type=\"written_new\"]"); 66 67 // Definitions of region containment variables used when expressing match distances. E.g.: 68 // <matchVariable id="$maghreb" value="MA+DZ+TN+LY+MR+EH"/> 69 private static final PathMatcher VARIABLE_PATH = 70 WRITTEN_LANGUAGE_PREFIX.withSuffix("matchVariable[@id=*]"); 71 private static final AttributeKey VARIABLE_ID = AttributeKey.keyOf("matchVariable", "id"); 72 private static final AttributeKey VARIABLE_VALUE = AttributeKey.keyOf("matchVariable", "value"); 73 74 // Language distance data, including wildcards and variable references (possibly negated). E.g.: 75 // <languageMatch desired="ja_Latn" supported="ja_Jpan" distance="5" oneway="true"/> 76 // <languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/> 77 // <languageMatch desired="en_*_$!enUS" supported="en_*_GB" distance="3"/> 78 private static final PathMatcher LANGUAGE_MATCH_PATH = 79 WRITTEN_LANGUAGE_PREFIX.withSuffix("languageMatch[@desired=*][@supported=*]"); 80 private static final AttributeKey MATCH_DESIRED = 81 AttributeKey.keyOf("languageMatch", "desired"); 82 private static final AttributeKey MATCH_SUPPORTED = 83 AttributeKey.keyOf("languageMatch", "supported"); 84 private static final AttributeKey MATCH_DISTANCE = 85 AttributeKey.keyOf("languageMatch", "distance"); 86 // Optional, assume false if not present. 87 private static final AttributeKey MATCH_ONEWAY = 88 AttributeKey.keyOf("languageMatch", "oneway"); 89 90 // Singleton element containing the list of special case "paradigm" locales, which should 91 // always be preferred if there is a tie. E.g.: 92 // <paradigmLocales locales="en en_GB es es_419 pt_BR pt_PT"/> 93 // 94 // Since there are no distinguishing attributes for this path, there can only be one 95 // instance which we can just lookup directly. 96 private static final CldrPath PARADIGM_LOCALES_PATH = CldrPath.parseDistinguishingPath( 97 "//supplementalData/languageMatching/languageMatches[@type=\"written_new\"]/paradigmLocales"); 98 private static final AttributeKey PARADIGM_LOCALES = 99 AttributeKey.keyOf("paradigmLocales", "locales"); 100 101 // NOTE: You must omit empty strings, since otherwise " foo " becomes ("", "foo", ""). 102 private static final Splitter LIST_SPLITTER = 103 Splitter.on(' ').trimResults().omitEmptyStrings(); 104 105 // Output resource bundle paths, split into two basic groups for likely locale mappings 106 // and match data. 107 private static final RbPath LIKELY_LANGUAGES = RbPath.of("likely", "languageAliases"); 108 private static final RbPath LIKELY_M49 = RbPath.of("likely", "m49"); 109 private static final RbPath LIKELY_REGIONS = RbPath.of("likely", "regionAliases"); 110 private static final RbPath LIKELY_TRIE = RbPath.of("likely", "trie:bin"); 111 private static final RbPath LIKELY_LSRNUM = RbPath.of("likely", "lsrnum:intvector"); 112 113 private static final RbPath MATCH_TRIE = RbPath.of("match", "trie:bin"); 114 private static final RbPath MATCH_REGION_TO_PARTITIONS = RbPath.of("match", "regionToPartitions:bin"); 115 private static final RbPath MATCH_PARTITIONS = RbPath.of("match", "partitions"); 116 private static final RbPath MATCH_PARADIGMNUM = RbPath.of("match", "paradigmnum:intvector"); 117 private static final RbPath MATCH_DISTANCES = RbPath.of("match", "distances:intvector"); 118 119 // To split locale specifications (e.g. "ja_Latn" or "en_*_$!enUS"). 120 private static final Splitter UNDERSCORE = Splitter.on('_'); 121 122 // The encoding scheme allow us to only encode up to 27 M.49 code below. 123 // The size is later check while reading the M49 List. 124 private static final List<String> M49 = Arrays.asList("001", "143", "419"); 125 126 /** 127 * Processes data from the given supplier to generate locale matcher ICU data. 128 * 129 * @param src the CLDR data supplier to process. 130 * @return the IcuData instance to be written to a file. 131 */ process(CldrDataSupplier src)132 public static IcuData process(CldrDataSupplier src) { 133 return process(src.getDataForType(SUPPLEMENTAL)); 134 } 135 136 @VisibleForTesting // It's easier to supply a fake data instance than a fake supplier. process(CldrData data)137 static IcuData process(CldrData data) { 138 IcuData icuData = new IcuData("langInfo", false); 139 140 if (M49.size() > 27) { 141 throw new IllegalStateException( 142 "The M49 list is too long. We can only encode up to 27 M49 codes."); 143 } 144 LikelySubtags.Data likelyData = LikelySubtagsBuilder.build(data); 145 icuData.add(LIKELY_LANGUAGES, ofMapEntries(likelyData.languageAliases)); 146 icuData.add(LIKELY_M49, RbValue.of(M49)); 147 icuData.add(LIKELY_REGIONS, ofMapEntries(likelyData.regionAliases)); 148 icuData.add(LIKELY_TRIE, ofBytes(likelyData.trie)); 149 icuData.add(LIKELY_LSRNUM, ofLsrNum(asList(likelyData.lsrs))); 150 151 LocaleDistance.Data distanceData = buildDistanceData(data); 152 icuData.add(MATCH_TRIE, ofBytes(distanceData.trie)); 153 icuData.add(MATCH_REGION_TO_PARTITIONS, ofBytes(distanceData.regionToPartitionsIndex)); 154 icuData.add(MATCH_PARTITIONS, RbValue.of(distanceData.partitionArrays)); 155 icuData.add(MATCH_PARADIGMNUM, ofLsrNum(distanceData.paradigmLSRs)); 156 icuData.add(MATCH_DISTANCES, RbValue.of(Arrays.stream(distanceData.distances).mapToObj(Integer::toString))); 157 return icuData; 158 } 159 160 /** 161 * A simple holder for language, script and region which allows for wildcards (i.e. "*") 162 * and variables to represent partitions of regions (e.g. "$enUS"). Minimal additional 163 * validation is done on incoming fields as data is assumed to be correct. 164 */ 165 private static final class LsrSpec { 166 /** 167 * Parse a raw specification string (e.g. "en", "ja_Latn", "*_*_*", "ar_*_$maghreb" 168 * or "en_*_GB") into a structured spec. Note that if the specification string 169 * contains a "bare" region (e.g. "en_*_GB") then it is registered as a variable in 170 * the given RegionMapper builder, so the returned {@code LsrSpec} will be 171 * {@code "en_*_$GB"}. 172 */ parse(String rawSpec, PartitionInfo.Builder rmb)173 public static LsrSpec parse(String rawSpec, PartitionInfo.Builder rmb) { 174 List<String> parts = UNDERSCORE.splitToList(rawSpec); 175 checkArgument(parts.size() <= 3, "invalid raw LSR specification: %s", rawSpec); 176 String language = parts.get(0); 177 Optional<String> script = parts.size() > 1 ? Optional.of(parts.get(1)) : Optional.empty(); 178 // While parsing the region part, ensure any "bare" region subtags are converted 179 // to variables (e.g. "GB" -> "$GB") and registered with the parition map. 180 Optional<String> region = 181 parts.size() > 2 ? Optional.of(rmb.ensureVariable(parts.get(2))) : Optional.empty(); 182 return new LsrSpec(language, script, region); 183 } 184 185 // A language subtag (e.g. "en") or "*". 186 private final String language; 187 // If present, a script subtag (e.g. "Latn") or "*". 188 private final Optional<String> script; 189 // If present, a registered variable with '$' prefix (e.g. "$foo" or "$GB") or "*". 190 private final Optional<String> regionVariable; 191 LsrSpec(String language, Optional<String> script, Optional<String> regionVariable)192 private LsrSpec(String language, Optional<String> script, Optional<String> regionVariable) { 193 this.language = language; 194 this.script = script; 195 this.regionVariable = regionVariable; 196 // Implementation shortcuts assume: 197 // - If the language subtags are '*', the other-level subtags must also be '*' (if present). 198 // If there are rules that do not fit these constraints, we need to revise the implementation. 199 if (isAny(language)) { 200 script.ifPresent( 201 s -> checkArgument(isAny(s), "expected wildcard script, got: %s", script)); 202 regionVariable.ifPresent( 203 r -> checkArgument(isAny(r), "expected wildcard region, got: %s", regionVariable)); 204 } 205 } 206 getLanguage()207 public String getLanguage() { 208 return language; 209 } 210 getScript()211 public String getScript() { 212 return script.orElseThrow(() -> new IllegalArgumentException("no script available: " + this)); 213 } 214 getRegionVariable()215 public String getRegionVariable() { 216 return regionVariable.orElseThrow(() -> new IllegalArgumentException("no region available: " + this)); 217 } 218 size()219 public int size() { 220 return regionVariable.isPresent() ? 3 : script.isPresent() ? 2 : 1; 221 } 222 223 @Override toString()224 public String toString() { 225 return language + script.map(s -> "_" + s).orElse("") + regionVariable.map(r -> "_" + r).orElse(""); 226 } 227 } 228 229 /** 230 * Represents a {@code <languageMatch>} rule derived from supplemental data, such as: 231 * <pre>{@code 232 * <languageMatch desired="zh_Hans" supported="zh_Hant" distance="15" oneway="true"/> 233 * }</pre> 234 * or: 235 * <pre>{@code 236 * <languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/> 237 * }</pre> 238 * 239 * <p>The job of a {@code Rule} is to provide a mechanism for capturing the data in 240 * {@code <languageMatch>} elements and subsequently adding that information to a 241 * {@link DistanceTable.Builder} in a structured way. 242 */ 243 private static final class LanguageMatchRule { 244 private final LsrSpec desired; 245 private final LsrSpec supported; 246 private final int distance; 247 private final boolean oneway; 248 LanguageMatchRule(LsrSpec desired, LsrSpec supported, int distance, boolean oneway)249 public LanguageMatchRule(LsrSpec desired, LsrSpec supported, int distance, boolean oneway) { 250 this.desired = checkNotNull(desired); 251 this.supported = checkNotNull(supported); 252 this.distance = distance; 253 this.oneway = oneway; 254 // Implementation shortcuts assume: 255 // - At any level, either both or neither spec subtags are *. 256 // If there are rules that do not fit these constraints, we need to revise the implementation. 257 checkArgument(desired.size() == supported.size(), 258 "mismatched rule specifications in: %s, %s", desired, supported); 259 checkArgument(isAny(desired.language) == isAny(supported.language), 260 "wildcard mismatch for languages in: %s, %s", desired, supported); 261 checkArgument(isAny(desired.script) == isAny(supported.script), 262 "wildcard mismatch for scripts in: %s, %s", desired, supported); 263 checkArgument(isAny(desired.regionVariable) == isAny(supported.regionVariable), 264 "wildcard mismatch for languages in: %s, %s", desired, supported); 265 } 266 size()267 int size() { 268 return desired.size(); 269 } 270 isDefaultRule()271 boolean isDefaultRule() { 272 // We already know that in LsrSpec, if the language is "*" then all subtags are too. 273 return isAny(desired.language); 274 } 275 276 /** 277 * Adds this rule to the given distance table, using the given partition map to 278 * resolve any region variables present in the desired or supported specs. 279 */ addTo(DistanceTable.Builder distanceTable, PartitionInfo partitions)280 void addTo(DistanceTable.Builder distanceTable, PartitionInfo partitions) { 281 // Note that rather than using the rule's "size" to mediate the different 282 // cases, we could have had 3 distinct sub-types of a common rule API (e.g. 283 // "LanguageRule", "ScriptRule" and "RegionRule"), each with a different 284 // addTo() callback. However this would have been quite a lot more code 285 // for not much real gain. 286 switch (size()) { 287 case 1: // Language only. 288 distanceTable.addDistance(distance, oneway, 289 desired.getLanguage(), supported.getLanguage()); 290 break; 291 292 case 2: // Language and script present. 293 distanceTable.addDistance(distance, oneway, 294 desired.getLanguage(), supported.getLanguage(), 295 desired.getScript(), supported.getScript()); 296 break; 297 298 case 3: // Language, script and region variable present. 299 // Add the rule distance for every combination of desired/supported 300 // partition IDs for the region variables. This is important for 301 // variables like "$americas" which overlap with multiple paritions. 302 // 303 // Note that in this case (because region variables map to sets of 304 // partition IDs) we can get situations where "shouldReverse" is true, 305 // but the desired/supported pairs being passed in are identical (e.g. 306 // different region variables map to distinct partition groups which 307 // share some common elements). 308 // 309 // This is fine, providing that the distance table is going to ignore 310 // identical mappings (which it does). Alternatively we could just 311 // re-calculate "shouldReverse" inside this loop to account for partition 312 // IDs rather than region variables. 313 ImmutableSet<String> desiredPartitionIds = 314 partitions.getPartitionIds(desired.getRegionVariable()); 315 ImmutableSet<String> supportedPartitionIds = 316 partitions.getPartitionIds(supported.getRegionVariable()); 317 for (String desiredPartitionId : desiredPartitionIds) { 318 for (String supportedPartitionId : supportedPartitionIds) { 319 distanceTable.addDistance(distance, oneway, 320 desired.getLanguage(), supported.getLanguage(), 321 desired.getScript(), supported.getScript(), 322 desiredPartitionId, supportedPartitionId); 323 } 324 } 325 break; 326 327 default: 328 throw new IllegalStateException("invalid size for LsrSpec: " + this); 329 } 330 } 331 332 @Override toString()333 public String toString() { 334 return String.format( 335 "Rule{ desired=%s, supported=%s, distance=%d, oneway=%b }", 336 desired, supported, distance, oneway); 337 } 338 } 339 buildDistanceData(CldrData supplementalData)340 private static LocaleDistance.Data buildDistanceData(CldrData supplementalData) { 341 // Resolve any explicitly declared region variables into the partition map. 342 // Territory containment information is used to recursively resolve region 343 // variables (e.g. "$enUS") into a collection of non-macro regions. 344 PartitionInfo.Builder partitionBuilder = 345 PartitionInfo.builder(TerritoryContainment.getContainment(supplementalData)); 346 supplementalData.accept(DTD, v -> { 347 CldrPath path = v.getPath(); 348 if (VARIABLE_PATH.matches(path)) { 349 partitionBuilder.addVariableExpression(v.get(VARIABLE_ID), v.get(VARIABLE_VALUE)); 350 } 351 }); 352 353 // Parse the rules from <languageMatch> elements. Note that the <languageMatch> 354 // element is marked as "ORDERED" in the DTD, which means the elements always 355 // appear in the same order is in the CLDR XML file (even when using DTD order). 356 // 357 // This is one of the relatively rare situations in which using DTD order will 358 // not isolate the ICU data from reordering of the CLDR data. In particular this 359 // matters when specifying language matcher preferences (such as "en_*_GB" vs 360 // "en_*_!enUS"). 361 // 362 // We could almost process the rules while reading them from the source data, but 363 // rules may contain region codes rather than variables, and we need to create a 364 // variable for each such region code before the RegionMapper is built, and 365 // before processing the rules (this happens when the LsrSpec is parsed). 366 List<LanguageMatchRule> rules = new ArrayList<>(); 367 supplementalData.accept(DTD, v -> { 368 CldrPath path = v.getPath(); 369 if (LANGUAGE_MATCH_PATH.matches(path)) { 370 int distance = Integer.parseInt(v.get(MATCH_DISTANCE)); 371 // Lenient against there being no "oneway" attribute. 372 boolean oneway = "true".equalsIgnoreCase(v.get(MATCH_ONEWAY)); 373 LsrSpec desired = LsrSpec.parse(v.get(MATCH_DESIRED), partitionBuilder); 374 LsrSpec supported = LsrSpec.parse(v.get(MATCH_SUPPORTED), partitionBuilder); 375 LanguageMatchRule rule = new LanguageMatchRule(desired, supported, distance, oneway); 376 logger.fine(() -> String.format("rule: %s", rule)); 377 rules.add(rule); 378 } 379 }); 380 // Check that the rules are in the expected order. Rule order is important in ensuring 381 // data correctness and incorrect order may violate business logic assumptions later. 382 // TODO: Consider what other ordering/sanity checks make sense here. 383 for (int n = 0, prevSize = 1; n < rules.size(); n++) { 384 LanguageMatchRule rule = rules.get(n); 385 checkArgument(rule.size() >= prevSize, "<languageMatch> elements out of order at: %s", rule); 386 checkArgument(rule.size() == prevSize || (n > 0 && rules.get(n - 1).isDefaultRule()), 387 "missing default rule before: %s", rule); 388 prevSize = rule.size(); 389 } 390 checkState(rules.stream().distinct().count() == rules.size(), "duplicated rule in: %s", rules); 391 392 // Build region partition data after all the variables have been accounted for 393 // (including the implicit variables found while processing LsrSpecs). 394 PartitionInfo partitions = partitionBuilder.build(); 395 396 // Add all the rules (in order) to the distance table. 397 DistanceTable.Builder distanceTableBuilder = DistanceTable.builder(); 398 rules.forEach(r -> r.addTo(distanceTableBuilder, partitions)); 399 DistanceTable distanceTable = distanceTableBuilder.build(); 400 401 // Note: Using LocaleDistance.Data as a fairly "dumb" container for the return values 402 // requires us to do slightly awkward things, like passing mutable arrays and LSR 403 // instances around, but the advantage it has is that this data structure is also what's 404 // used in client code, so if the likely subtags data changes, it will be a forcing 405 // function to change this code. 406 return new LocaleDistance.Data( 407 distanceTable.getTrie().toByteArray(), 408 partitions.getPartitionLookupArray(), 409 partitions.getPartitionStrings(), 410 getParadigmLsrs(supplementalData), 411 distanceTable.getDefaultDistances()); 412 } 413 getParadigmLsrs(CldrData supplementalData)414 private static Set<LSR> getParadigmLsrs(CldrData supplementalData) { 415 // LinkedHashSet for stable order; otherwise a unit test is flaky. 416 CldrValue cldrValue = supplementalData.get(PARADIGM_LOCALES_PATH); 417 checkState(cldrValue != null, 418 "<paradigmLocales> element was missing: %s", PARADIGM_LOCALES_PATH); 419 String localesList = cldrValue.get(PARADIGM_LOCALES); 420 checkState(localesList != null, 421 "<paradigmLocales> 'locales' attribute was missing: %s", cldrValue); 422 423 Set<LSR> paradigmLSRs = new LinkedHashSet<>(); 424 for (String paradigm : LIST_SPLITTER.split(localesList)) { 425 LSR max = LikelySubtags.INSTANCE.makeMaximizedLsrFrom(new ULocale(paradigm), false); 426 // Clear the LSR flags to make the data equality test in LocaleDistanceTest happy. 427 paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS)); 428 } 429 checkArgument(paradigmLSRs.size() % 2 == 0, "unpaired paradigm locales: %s", paradigmLSRs); 430 return paradigmLSRs; 431 } 432 433 // Returns an RbValue serialized from a map as a sequence of alternating (key, value) 434 // pairs (formatted as one pair per line in the IcuData file). 435 // 436 // E.g. 437 // foo{ 438 // key1, value1, 439 // ... 440 // keyN, valueN, 441 // } ofMapEntries(Map<String, String> map)442 private static RbValue ofMapEntries(Map<String, String> map) { 443 return RbValue.of( 444 map.entrySet().stream() 445 .flatMap(e -> Stream.of(e.getKey(), e.getValue())) 446 .collect(Collectors.toList())) 447 .elementsPerLine(2); 448 } 449 450 // Returns an RbValue serialized from a sequence of LSR instance as a sequence of number 451 // represent (language, region, script) tuples (formatted as one number per line in the IcuData file). ofLsrNum(Collection<LSR> lsrs)452 private static RbValue ofLsrNum(Collection<LSR> lsrs) { 453 return RbValue.of( 454 lsrs.stream() 455 .flatMapToInt(lsr -> IntStream.of(LSRToNum(lsr))) 456 .mapToObj(Integer::toString)); 457 } 458 459 // This method is added only to support encodeToIntForResource() 460 // It only support [a-z]{2,3} and will not work for other cases. 461 // TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool. encodeLanguageToInt(String language)462 static private int encodeLanguageToInt(String language) { 463 assert language.length() >= 2; 464 assert language.length() <= 3; 465 assert language.charAt(0) >= 'a'; 466 assert language.charAt(0) <= 'z'; 467 assert language.charAt(1) >= 'a'; 468 assert language.charAt(1) <= 'z'; 469 assert language.length() == 2 || language.charAt(2) >= 'a'; 470 assert language.length() == 2 || language.charAt(2) <= 'z'; 471 return language.charAt(0) - 'a' + 1 + 472 27 * (language.charAt(1) - 'a' + 1) + 473 ((language.length() == 2) ? 0 : 27 * 27 * (language.charAt(2) - 'a' + 1)); 474 } 475 // This method is added only to support encodeToIntForResource() 476 // It only support [A-Z][a-z]{3} which defined in UScript and does not work for other cases. 477 // TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool. encodeScriptToInt(String script)478 static private int encodeScriptToInt(String script) { 479 int ret = UScript.getCodeFromName(script); 480 assert ret != UScript.INVALID_CODE; 481 return ret; 482 } 483 // This method is added only to support encodeToIntForResource() 484 // It only support [A-Z]{2}|001|143|419 and does not work for other cases. 485 // TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool. encodeRegionToInt(String region, List<String> m49)486 static private int encodeRegionToInt(String region, List<String> m49) { 487 assert region.length() >= 2; 488 assert region.length() <= 3; 489 // Do not have enough bits to store the all 1000 possible combination of \d{3} 490 // Only support what is in M49. 491 if (region.length() == 3) { 492 int index = m49.indexOf(region); 493 assert index >= 0; 494 if (index < 0) { 495 throw new IllegalStateException( 496 "Please add '" + region + "' to M49 in LocaleDistanceMapper.java"); 497 } 498 return index; 499 } 500 assert region.charAt(0) >= 'A'; 501 assert region.charAt(0) <= 'Z'; 502 assert region.charAt(1) >= 'A'; 503 assert region.charAt(1) <= 'Z'; 504 // 'AA' => 1+27*1 = 28 505 // ... 506 // 'AZ' => 1+27*26 = 703 507 // 'BA' => 2+27*1 = 29 508 // ... 509 // 'IN' => 9+27*14 = 387 510 // 'ZZ' => 26+27*26 = 728 511 return (region.charAt(0) - 'A' + 1) + 27 * (region.charAt(1) - 'A' + 1); 512 } 513 // This is designed to only support encoding some LSR into resources but not for other cases. 514 // TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool. encodeToIntForResource(LSR lsr)515 static int encodeToIntForResource(LSR lsr) { 516 return (encodeLanguageToInt(lsr.language) + (27*27*27) * encodeRegionToInt(lsr.region, M49)) | 517 (encodeScriptToInt(lsr.script) << 24); 518 } 519 LSRToNum(LSR lsr)520 private static int LSRToNum(LSR lsr) { 521 // Special number for "", "", "" return 0 522 if (lsr.language.isEmpty() && lsr.script.isEmpty() && lsr.region.isEmpty()) { 523 return 0; 524 } 525 // Special number for "skip", "script", "" return 1 526 if (lsr.language.equals("skip") && lsr.script.equals("script") && lsr.region.isEmpty()) { 527 return 1; 528 } 529 // TODO(ftang) Change to the following line after LSR.encodeToIntForResource is available to the tool. 530 // return lsr.encodeToIntForResource(); 531 return encodeToIntForResource(lsr); 532 } 533 534 // Returns an RbValue serialized from a byte array, as a concatenated sequence of rows of 535 // hex values. This is intended only for RbPaths using the ":bin" suffix. 536 // 537 // E.g. 538 // foo{ 539 // 0123456789abcdef0123456789abcdef 540 // ... 541 // 1c0de4c0ffee 542 // } 543 // 544 // Note that typically no indentation is used when writting this binary "blob". ofBytes(byte[] data)545 private static RbValue ofBytes(byte[] data) { 546 ImmutableList.Builder<String> hexValues = ImmutableList.builder(); 547 List<Byte> bytes = Bytes.asList(data); 548 for (List<Byte> line : Iterables.partition(bytes, 16)) { 549 hexValues.add(line.stream().map(b -> String.format("%02x", b)).collect(Collectors.joining())); 550 } 551 return RbValue.of(hexValues.build()); 552 } 553 554 // Returns if the subtag is the '*' wildcard. This is not to be confused with the 555 // "ANY" character used in DistanceTable. isAny(String subtag)556 private static boolean isAny(String subtag) { 557 return subtag.equals("*"); 558 } 559 560 // Returns if the subtag exists and is the '*' wildcard. isAny(Optional<String> subtag)561 private static boolean isAny(Optional<String> subtag) { 562 return subtag.map(LocaleDistanceMapper::isAny).orElse(false); 563 } 564 565 // Main method for running this mapper directly with logging enabled. 566 // CLDR_DIR is picked up from system properties or envirnment variables. 567 // Arguments: <output-file> [<log-level>] main(String[] args)568 public static void main(String[] args) throws IOException { 569 DebugWriter.writeForDebugging(args, LocaleDistanceMapper::process); 570 } 571 } 572