1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu.localedistance; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.base.Preconditions.checkState; 7 import static com.ibm.icu.impl.locale.LocaleDistance.DISTANCE_SKIP_SCRIPT; 8 import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_LANG_DISTANCE; 9 import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_REGION_DISTANCE; 10 import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_SCRIPT_DISTANCE; 11 import static com.ibm.icu.impl.locale.LocaleDistance.IX_LIMIT; 12 import static com.ibm.icu.impl.locale.LocaleDistance.IX_MIN_REGION_DISTANCE; 13 import static java.util.Arrays.asList; 14 15 import java.util.Arrays; 16 import java.util.Map; 17 import java.util.logging.Logger; 18 19 import com.google.common.collect.ImmutableList; 20 import com.google.common.collect.Table; 21 import com.google.common.collect.TreeBasedTable; 22 23 /** 24 * Represents the conceptual distance between pairs of language specifications. 25 * 26 * <p>Mappings for {@code (desired, supported)} pairs are added at one of three 27 * levels in the table; language, script and region. Distances can be resolved at 28 * any level in the table (e.g. {@code ("en","fr")}, {@code ("en_Latn","ru_Cyrl")} 29 * or {@code ("en_Latn_GB", "en_Latn_AU")}). 30 * 31 * <p>However in reality the "regions" in the table are actually "partition IDs" 32 * representing groups of regions with the same language characteristics. For more 33 * information on partitions and how they are generated, see {@link PartitionInfo}. 34 * 35 * <p>This is mentioned here because anyone debugging this code might be surprised 36 * to see values like {@code "5"} for a "region" in the code. Using the term 37 * "region" matches the conceptual level of the data and is more familiar to most 38 * people, whereas "partition ID" would probably be jarring. 39 * 40 * <p>The builder class is not resuable, and once a table is built, the builder is 41 * invalid. Furthermore, since the table data itself is mutable, care must be taken 42 * to avoid modifying either the Trie or the returned distance array. 43 * 44 * <p>Note that internally the {@code '*'} character used as a wildcard for subtags 45 * is replaced by the {@code '�'} character (a.k.a ANY), whenever a subtag is 46 * passed into the API. This is because the underlying Trie structure generated by 47 * the distance table reserves {@code '*'} for a different purpose. This difference 48 * is encapsulated within this class and the {@link Trie} class only. 49 */ 50 final class DistanceTable { 51 private static final Logger logger = Logger.getLogger(DistanceTable.class.getName()); 52 53 // Represents a wildcard match in the data table (the equivalent of '*' in 54 // <languageMatch> locale subtag). Any incoming subtags are normalized to 55 // convert '*' to this character by the builder. 56 private static final String ANY = "�"; 57 58 // Distances must be in the range [0-127] because bit 7 of the distance value 59 // is used for a special flag (DISTANCE_SKIP_SCRIPT). Setting the explicit max 60 // to 100 is just a more human readable maximum that satisfies that constraint. 61 private static final int MAX_REGION_DISTANCE = 100; 62 63 static final class Builder { 64 private final Node rootNode = new Node(-1); 65 private int minRegionDistance = MAX_REGION_DISTANCE; 66 Builder()67 private Builder() {} 68 69 /** 70 * Adds a distance to the table between the specified and desired tuples. 71 * This method takes 1, 2 or 3 sequential {@code (desired, supported)} pairs 72 * of values corresponding to language subtags, script subtags and regions 73 * (partition IDs). All values can be the wildcard '*'. 74 */ addDistance(int distance, boolean oneway, String... args)75 public void addDistance(int distance, boolean oneway, String... args) { 76 MappingKey key = MappingKey.fromSubtags(args, distance); 77 logger.fine(key::toString); 78 // Minimum region distance needs to be tracked specially. 79 if (key.getDepth() == 3 && distance < minRegionDistance) { 80 minRegionDistance = distance; 81 } 82 addMapping(key); 83 if (!oneway && !key.isSymmetrical()) { 84 addMapping(key.reverse()); 85 } 86 } 87 addMapping(MappingKey key)88 private void addMapping(MappingKey key) { 89 rootNode.addExplicitMapping(key); 90 if (key.hasWildcardMappings()) { 91 rootNode.addWildcardMappings(key); 92 } 93 } 94 95 /** Returns the final minimized distance table information. */ build()96 public DistanceTable build() { 97 Node defLangNode = rootNode.getAnyNode(); 98 checkState(defLangNode != null, "missing default language mapping: %s", rootNode); 99 Node defScriptNode = defLangNode.getAnyNode(); 100 checkState(defScriptNode != null, "missing default script mapping: %s", rootNode); 101 Node defRegionNode = defScriptNode.getAnyNode(); 102 checkState(defRegionNode != null, "missing default region mapping: %s", rootNode); 103 104 // Because we prune the data table, it's important to store the default 105 // distance values separately. 106 int[] distances = new int[IX_LIMIT]; 107 distances[IX_DEF_LANG_DISTANCE] = defLangNode.distance; 108 distances[IX_DEF_SCRIPT_DISTANCE] = defScriptNode.distance; 109 distances[IX_DEF_REGION_DISTANCE] = defRegionNode.distance; 110 distances[IX_MIN_REGION_DISTANCE] = minRegionDistance; 111 112 // Having determined the distances, prune the Trie to remove any sub-tables 113 // where distances could only be determined to be the default value (i.e. 114 // where the existence of that sub-table has no effect). 115 pruneDefaultDistances(defScriptNode.distance, defRegionNode.distance); 116 return new DistanceTable(rootNode, distances); 117 } 118 119 @Override toString()120 public String toString() { 121 return String.format("minimum region distance: %d\n%s\n", minRegionDistance, rootNode); 122 } 123 pruneDefaultDistances(int defScriptDistance, int defRegionDistance)124 private void pruneDefaultDistances(int defScriptDistance, int defRegionDistance) { 125 logger.fine("==== pruning subtables ===="); 126 rootNode.subtables.values().forEach(langNode -> { 127 langNode.subtables.values().forEach(scriptNode -> { 128 if (scriptNode.subtables.size() == 1) { 129 // If a script node *only* contains region data with the default 130 // region distance, that region data can be removed. Since region 131 // is the lowest level, there's no need to worry about "skipping" 132 // anything during lookup (unlike the case below). 133 Node defRegionNode = scriptNode.getAnyNode(); 134 checkState(defRegionNode != null, 135 "missing default region node for script: %s", scriptNode); 136 if (defRegionNode.distance == defRegionDistance) { 137 scriptNode.subtables.clear(); 138 } 139 } 140 }); 141 // Do the pruning in the "upwards" phase of visitation (after recursion) so 142 // if script subtables are pruned, it's visible here. 143 if (langNode.subtables.size() == 1) { 144 // If a language node *only* contains script data with the default 145 // script distance, we can't just remove it (because it might contain 146 // region data). 147 Node defScriptNode = langNode.getAnyNode(); 148 if (defScriptNode.distance == defScriptDistance) { 149 checkState(defScriptNode != null, 150 "missing default script node for language: %s", langNode); 151 if (defScriptNode.subtables.isEmpty()) { 152 // If the default script node has no region data, remove it. 153 langNode.subtables.clear(); 154 } else { 155 // Otherwise mark script data as "skippable", which indicates 156 // it should be written in a compact form in the Trie (while 157 // retaining any region data as normal). 158 langNode.distance |= DISTANCE_SKIP_SCRIPT; 159 } 160 } 161 } 162 }); 163 // After pruning we don't expect any data in the top-level default table. 164 checkState(rootNode.getAnyNode().subtables.isEmpty(), 165 "invalid table state: %s", rootNode.getAnyNode()); 166 rootNode.subtables.rowMap().remove(ANY); 167 } 168 } 169 builder()170 public static Builder builder() { 171 return new Builder(); 172 } 173 174 private final Node rootNode; 175 private final int[] distances; 176 DistanceTable(Node rootNode, int[] distances)177 private DistanceTable(Node rootNode, int[] distances) { 178 this.rootNode = rootNode; 179 this.distances = distances; 180 } 181 getTrie()182 public Trie getTrie() { 183 Trie trie = new Trie(); 184 rootNode.writeTo(trie.root()); 185 return trie; 186 } 187 getDefaultDistances()188 public int[] getDefaultDistances() { 189 return distances; 190 } 191 192 @Override toString()193 public String toString() { 194 return String.format("default distances: %s\n%s\n", Arrays.toString(distances), rootNode); 195 } 196 197 private static final class Node { 198 private final Table<String, String, Node> subtables = TreeBasedTable.create(); 199 // Distance for the lookup so far (-1 for top level nodes). 200 private int distance; 201 Node(int distance)202 Node(int distance) { 203 checkArgument(distance >= -1, "invalid distance: %s", distance); 204 this.distance = distance; 205 } 206 207 /** Returns the subtable node for the top-level mapping of a key. */ getNode(MappingKey key)208 private Node getNode(MappingKey key) { 209 return subtables.get(key.getDesired(), key.getSupported()); 210 } 211 212 /** Returns the subtable node for the {@code <ANY,ANY>} mapping. */ getAnyNode()213 Node getAnyNode() { 214 return subtables.get(ANY, ANY); 215 } 216 addExplicitMapping(MappingKey key)217 void addExplicitMapping(MappingKey key) { 218 if (key.isLeaf()) { 219 if (!putIfAbsent(key)) { 220 logger.fine(() -> String.format("Ignore existing mapping: %s", key)); 221 } 222 } else { 223 getIntermediateNode(key).addExplicitMapping(key.getSuffix()); 224 } 225 } 226 addWildcardMappings(MappingKey key)227 void addWildcardMappings(MappingKey key) { 228 if (key.isLeaf()) { 229 putIfAbsent(key); 230 } else if (key.isWildcard()) { 231 // An intermediate wildcard mapping is applied to all existing sub-nodes. 232 // NOTE: This will need to change if we want to support "mixed" wildcard mappings. 233 for (Node node : subtables.values()) { 234 node.addWildcardMappings(key.getSuffix()); 235 } 236 } else { 237 // An explicit intermediate mapping only affects an existing exact match. 238 Node node = getNode(key); 239 if (node != null) { 240 node.addWildcardMappings(key.getSuffix()); 241 } 242 } 243 } 244 245 /** 246 * Adds a new mapping to this node with the specified distance if it didn't already 247 * exist. 248 * 249 * <p>Note: If a mapping already exists, then this method has no effect (even if the 250 * existing distance differs from the given distance). This is necessary to for two 251 * reasons: 252 * <ol> 253 * <li>An earlier match rule may have set an explicit value for the mapping, 254 * and we subsequently try to set a default value (via a wildcard mapping). 255 * This should be ignored, since we want the non-default value to win. 256 * This means it's important to always have explicit {@code <languageMatch>} 257 * rules before any related wildcard rules in the CLDR data. 258 * 259 * <li>A preferential {@code <languageMatch>} rule appears earlier in CLDR data. 260 * This occurs because of the way partitions are defined and allows for two 261 * distinct {@code <languageMatch>} rules to generate the same mapping (with 262 * different distances). This is because region variables reference sets of 263 * partition IDs and these are not always disjoint (e.g. "en_*_$!enUS" and 264 * "en_*_GB" both contain the partition ID for "GB"). 265 * </ol> 266 * 267 * @return true if a new mapping was added, or if the distances were equal (i.e. 268 * the operation was idempotent). 269 */ putIfAbsent(MappingKey key)270 private boolean putIfAbsent(MappingKey key) { 271 Node node = getNode(key); 272 if (node == null) { 273 logger.fine(() -> String.format("add: %s", key)); 274 subtables.put(key.getDesired(), key.getSupported(), new Node(key.getDistance())); 275 return true; 276 } 277 return (key.getDistance() == node.distance); 278 } 279 280 /** 281 * Returns a sub-node corresponding to the given {@code (desired, supported)} mapping. 282 * If the node already exists, it is simply returned, otherwise a new node is created 283 * and any existing wildcard mappings are copied into it. 284 */ getIntermediateNode(MappingKey key)285 private Node getIntermediateNode(MappingKey key) { 286 Node node = getNode(key); 287 if (node == null) { 288 // This is expected to succeed because match rules are given in length 289 // order (i.e. language only before language+script etc.) and we always 290 // expect each group to end with an <ANY,ANY> mapping for the default 291 // distance. Thus, for any longer match rule, we should find (at least) 292 // the <ANY,ANY> node when looking for intermediate nodes. 293 // 294 // NOTE: Currently (desired==ANY) if-and-only-if (supported=ANY), so the 295 // only non-exact match we can get here is the <ANY,ANY> node. If we ever 296 // allow a mix of wildcard/non-wildcard keys, replace the getAnyNode() call 297 // with something like the line below: 298 // ---- 299 // Node wildcardMatch = Iterables.find( 300 // asList(getNode(desired, ANY), getNode(ANY, supported), getNode(ANY,ANY)), 301 // Objects::nonNull); 302 // ---- 303 Node wildcardMatch = getAnyNode(); 304 checkState(wildcardMatch != null, "missing <ANY,ANY> mapping: %s", this); 305 // Default distances are the distance between any two *different* unknown 306 // subtags (so if the subtags are the same, the distance is zero). 307 int distance = key.getDesired().equals(key.getSupported()) ? 0 : wildcardMatch.distance; 308 node = new Node(distance); 309 node.copySubtablesFrom(wildcardMatch); 310 subtables.put(key.getDesired(), key.getSupported(), node); 311 } 312 return node; 313 } 314 315 /** Copies all subtable mappings from the given node into this one. */ copySubtablesFrom(Node src)316 private void copySubtablesFrom(Node src) { 317 checkState(subtables.isEmpty()); 318 src.subtables.cellSet().forEach( 319 c -> subtables.put(c.getRowKey(), c.getColumnKey(), new Node(c.getValue().distance))); 320 } 321 322 /** 323 * Writes all the mappings in the distance table sequentially to given Trie in sorted 324 * table order. 325 * 326 * <p>Mappings are written in a top-down recursive visitation with sub-tables inheriting 327 * the current prefix from parent tables via the given Trie span. At each level any 328 * mapped distances are written before recursing into the sub-tables. 329 */ writeTo(Trie.Span trieSpan)330 private void writeTo(Trie.Span trieSpan) { 331 if (distance >= 0 && (distance & DISTANCE_SKIP_SCRIPT) != 0) { 332 // If a node has a distance set and has been explicitly marked as "skippable", 333 // then write the "default" subtable using the current Trie prefix (effectively 334 // having an "empty" prefix for this case). 335 getAnyNode().writeTo(trieSpan); 336 } else { 337 // In the normal case, just write the mappings explicitly. 338 subtables.rowMap().forEach( 339 (desired, supportedNodes) -> writeSupported(trieSpan, desired, supportedNodes)); 340 } 341 } 342 writeSupported(Trie.Span trieSpan, String desired, Map<String, Node> supportedNodes)343 private void writeSupported(Trie.Span trieSpan, String desired, Map<String, Node> supportedNodes) { 344 // Collapse any (desired=ANY, supported=ANY) mappings into a single '*' in the trie. 345 if (desired.equals(ANY)) { 346 // If desired is ANY, the only supported subtag must also be ANY. 347 Node node = supportedNodes.get(ANY); 348 checkState(node != null && supportedNodes.size() == 1, 349 "invalid supported subtags for desired='ANY': %s", supportedNodes); 350 // Remember that ANY != "*", even though it corresponds to "*" in the original 351 // language match rules. Putting "*" in a Trie means something different (but 352 // similar enough to be a bit confusing). 353 trieSpan.with("*", node::writeDistancePlusSubtables); 354 } else { 355 // In the general case, just write the <desired,supported> distance mapping. 356 trieSpan.with(desired, withDesiredSpan -> 357 supportedNodes.forEach((supported, node) -> { 358 checkState(!supported.equals(ANY), 359 "unexpected supported='ANY' subtag: %s", supported); 360 withDesiredSpan.with(supported, node::writeDistancePlusSubtables); 361 }) 362 ); 363 } 364 } 365 366 // Writes the distance of this node to the given trie, then recursively writes any 367 // subtable information. writeDistancePlusSubtables(Trie.Span trieSpan)368 private void writeDistancePlusSubtables(Trie.Span trieSpan) { 369 trieSpan.putPrefixAndValue(distance); 370 writeTo(trieSpan); 371 } 372 373 @Override toString()374 public String toString() { 375 StringBuilder buffer = new StringBuilder("distance: ").append(distance).append('\n'); 376 return appendToString("", buffer).toString(); 377 } 378 appendToString(String indent, StringBuilder buffer)379 private StringBuilder appendToString(String indent, StringBuilder buffer) { 380 // Top level values are not padded with tabs. 381 String rowIndent = indent.isEmpty() ? "" : "\t"; 382 for (Map.Entry<String, Map<String, Node>> row : subtables.rowMap().entrySet()) { 383 buffer.append(rowIndent).append(row.getKey()); 384 // First column extends the current row, so single tab indent. 385 String colIndent = "\t"; 386 for (Map.Entry<String, Node> col : row.getValue().entrySet()) { 387 buffer.append(colIndent).append(col.getKey()); 388 Node subnode = col.getValue(); 389 buffer.append('\t').append(subnode.distance); 390 // Append any sub-nodes (starting on the same line). 391 subnode.appendToString(indent + "\t\t\t", buffer).append('\n'); 392 // Later columns need full indent (including skipping row key). 393 colIndent = indent + '\t'; 394 } 395 // Later rows need full indent. 396 rowIndent = indent; 397 } 398 return buffer; 399 } 400 } 401 402 /** 403 * Excapsulates a sequence of {@code <desired,supported>} pairwise mappings over 404 * language, script and region, with an associated distance. This is an alternate 405 * way to represent a mapping of desired and supported language match rules. 406 * 407 * <p>For example: 408 * <pre>{@code 409 * <languageMatch desired="en_*_$!enUS", supported="en_*_$GB", distance="3"/> 410 * }</pre> 411 * results in a set of keys of the form: 412 * <pre>{@code 413 * <en,en> -> <ANY,ANY> -> <X,Y> = 3 414 * }</pre> 415 * where the "region" part {@code <X,Y>} is constructed from all the possible 416 * combinations of partition IDs associated with the original region variables. 417 * 418 * <p>Mapping keys have several useful properties: 419 * <ul> 420 * <li>They can be reversed (e.g. {@code <A,B> -> <C,D> = N} becomes 421 * {@code <B,A> -> <D,C> = N}). 422 * <li>They can be symmetrical (e.g. {@code <X,X> -> <Y,Y> = N}), in which 423 * case the reversed key is the same as the original. 424 * <li>They can have wildcard mappings (i.e. {@code <ANY,ANY>}). 425 * <li>They can produce "suffix" keys (e.g. the suffix of 426 * {@code <A,B> -> <C,D> = N} is {@code <C,D> = N}). 427 * </ul> 428 */ 429 private static final class MappingKey { 430 /** 431 * Returns a new key from the specified subtag pairs, converting {@code '*'} 432 * subtags to the special {@code ANY} string and performing consistency checks. 433 * 434 * @param subtagPairs a sequence of {@code <desired,suported>} pairs. 435 * @param distance the distance associated with the subtag mapping. 436 */ fromSubtags(String[] subtagPairs, int distance)437 static MappingKey fromSubtags(String[] subtagPairs, int distance) { 438 int pairCount = subtagPairs.length; 439 checkArgument(pairCount == 2 || pairCount == 4 || pairCount == 6, 440 "invalid number of arguments (expected 1, 2 or 3 pairs): %s", asList(subtagPairs)); 441 ImmutableList.Builder<String> keyPairs = ImmutableList.builder(); 442 for (String subtag : subtagPairs) { 443 keyPairs.add(fixAny(subtag)); 444 } 445 return new MappingKey(keyPairs.build(), distance, false); 446 } 447 448 // Converts a '*' (from a subtag) into the wildcard match character used by the Trie. 449 // The Trie uses '*' to mean something else, so we convert it at the boundary. fixAny(String subtag)450 private static String fixAny(String subtag) { 451 return subtag.equals("*") ? ANY : subtag; 452 } 453 454 private final ImmutableList<String> pairs; 455 private final int distance; 456 private final boolean isReversed; 457 private final boolean isSymmetrical; 458 private final boolean hasWildcardMappings; 459 MappingKey(ImmutableList<String> pairs, int distance, boolean isReversed)460 private MappingKey(ImmutableList<String> pairs, int distance, boolean isReversed) { 461 this.pairs = pairs; 462 this.distance = distance; 463 this.isReversed = isReversed; 464 checkArgument(distance >= 0 && distance <= MAX_REGION_DISTANCE, 465 "invalid mapping key distance: %s", distance); 466 // Check that if a key has "ANY" mappings, it is consistent. We expect to only 467 // get <ANY,ANY> pairs (e.g. not <X,ANY> or <ANY,X>). 468 boolean isSymmetrical = true; 469 boolean hasWildcardMappings = false; 470 for (int i = 0; i < pairs.size(); i += 2) { 471 String desired = pairs.get(i); 472 String supported = pairs.get(i + 1); 473 checkArgument(desired.equals(ANY) == supported.equals(ANY), 474 "invalid mapping key pairs: %s", pairs); 475 hasWildcardMappings |= desired.equals(ANY); 476 isSymmetrical &= desired.equals(supported); 477 } 478 this.isSymmetrical = isSymmetrical; 479 this.hasWildcardMappings = hasWildcardMappings; 480 } 481 482 /** Returns the "desired" value of the current (top-level) mapping. */ getDesired()483 String getDesired() { 484 return pairs.get(isReversed ? 1 : 0); 485 } 486 487 /** Returns the "supported" value of the current (top-level) mapping. */ getSupported()488 String getSupported() { 489 return pairs.get(isReversed ? 0 : 1); 490 } 491 492 /** Returns the non-negative distance mapped to by this key. */ getDistance()493 int getDistance() { 494 return distance; 495 } 496 497 /** 498 * Returns the number of {@code <desired,supported>} mappings in this key; this is 499 * either 1 (language-only), 2 (language & script) or 3 (language, script & region). 500 */ getDepth()501 int getDepth() { 502 return pairs.size() / 2; 503 } 504 505 /** Returns true if this key does not have a suffix. */ isLeaf()506 boolean isLeaf() { 507 return getDepth() == 1; 508 } 509 510 /** 511 * Returns if any of the {@code <desired,supported>} mappings are {@code <ANY,ANY>}. 512 */ hasWildcardMappings()513 boolean hasWildcardMappings() { 514 return hasWildcardMappings; 515 } 516 517 /** 518 * Returns if the top-level {@code <desired,supported>} mapping is {@code <ANY,ANY>}. 519 */ isWildcard()520 boolean isWildcard() { 521 return getDesired().equals(ANY); 522 } 523 524 /** 525 * Returns if this key is pair-wise symmetrical (e.g. {@code "<X,X> -> <Y,Y> = N"}). 526 * Symmetrical mappings don't need to be added in reverse. 527 */ isSymmetrical()528 boolean isSymmetrical() { 529 return isSymmetrical; 530 } 531 532 /** Returns a new key where each {@code <desired,supported>} mapping is reversed. */ reverse()533 MappingKey reverse() { 534 checkState(!isReversed, "cannot revese a reversed key"); 535 return new MappingKey(pairs, distance, true); 536 } 537 538 /** 539 * Returns the suffix of this non-leaf key with the top-level mapping removed. For 540 * example, the suffix of {@code "<A,B> -> <C,D> = N"} is {@code "<C,D> = N"}). 541 */ getSuffix()542 MappingKey getSuffix() { 543 checkState(!isLeaf(), "cannot get 'next' for an empty key"); 544 return new MappingKey(pairs.subList(2, pairs.size()), distance, isReversed); 545 } 546 547 @Override toString()548 public String toString() { 549 return isLeaf() 550 ? String.format("<%s, %s> = %d", getDesired(), getSupported(), getDistance()) 551 : String.format("<%s, %s> -> %s", getDesired(), getSupported(), getSuffix()); 552 } 553 } 554 } 555