1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu.localedistance;
4 
5 import static com.google.common.base.Preconditions.checkArgument;
6 import static com.google.common.base.Preconditions.checkState;
7 import static com.ibm.icu.impl.locale.LocaleDistance.DISTANCE_SKIP_SCRIPT;
8 import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_LANG_DISTANCE;
9 import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_REGION_DISTANCE;
10 import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_SCRIPT_DISTANCE;
11 import static com.ibm.icu.impl.locale.LocaleDistance.IX_LIMIT;
12 import static com.ibm.icu.impl.locale.LocaleDistance.IX_MIN_REGION_DISTANCE;
13 import static java.util.Arrays.asList;
14 
15 import java.util.Arrays;
16 import java.util.Map;
17 import java.util.logging.Logger;
18 
19 import com.google.common.collect.ImmutableList;
20 import com.google.common.collect.Table;
21 import com.google.common.collect.TreeBasedTable;
22 
23 /**
24  * Represents the conceptual distance between pairs of language specifications.
25  *
26  * <p>Mappings for {@code (desired, supported)} pairs are added at one of three
27  * levels in the table; language, script and region. Distances can be resolved at
28  * any level in the table (e.g. {@code ("en","fr")}, {@code ("en_Latn","ru_Cyrl")}
29  * or {@code ("en_Latn_GB", "en_Latn_AU")}).
30  *
31  * <p>However in reality the "regions" in the table are actually "partition IDs"
32  * representing groups of regions with the same language characteristics. For more
33  * information on partitions and how they are generated, see {@link PartitionInfo}.
34  *
35  * <p>This is mentioned here because anyone debugging this code might be surprised
36  * to see values like {@code "5"} for a "region" in the code. Using the term
37  * "region" matches the conceptual level of the data and is more familiar to most
38  * people, whereas "partition ID" would probably be jarring.
39  *
40  * <p>The builder class is not resuable, and once a table is built, the builder is
41  * invalid. Furthermore, since the table data itself is mutable, care must be taken
42  * to avoid modifying either the Trie or the returned distance array.
43  *
44  * <p>Note that internally the {@code '*'} character used as a wildcard for subtags
45  * is replaced by the {@code '�'} character (a.k.a ANY), whenever a subtag is
46  * passed into the API. This is because the underlying Trie structure generated by
47  * the distance table reserves {@code '*'} for a different purpose. This difference
48  * is encapsulated within this class and the {@link Trie} class only.
49  */
50 final class DistanceTable {
51     private static final Logger logger = Logger.getLogger(DistanceTable.class.getName());
52 
53     // Represents a wildcard match in the data table (the equivalent of '*' in
54     // <languageMatch> locale subtag). Any incoming subtags are normalized to
55     // convert '*' to this character by the builder.
56     private static final String ANY = "�";
57 
58     // Distances must be in the range [0-127] because bit 7 of the distance value
59     // is used for a special flag (DISTANCE_SKIP_SCRIPT). Setting the explicit max
60     // to 100 is just a more human readable maximum that satisfies that constraint.
61     private static final int MAX_REGION_DISTANCE = 100;
62 
63     static final class Builder {
64         private final Node rootNode = new Node(-1);
65         private int minRegionDistance = MAX_REGION_DISTANCE;
66 
Builder()67         private Builder() {}
68 
69         /**
70          * Adds a distance to the table between the specified and desired tuples.
71          * This method takes 1, 2 or 3 sequential {@code (desired, supported)} pairs
72          * of values corresponding to language subtags, script subtags and regions
73          * (partition IDs). All values can be the wildcard '*'.
74          */
addDistance(int distance, boolean oneway, String... args)75         public void addDistance(int distance, boolean oneway, String... args) {
76             MappingKey key = MappingKey.fromSubtags(args, distance);
77             logger.fine(key::toString);
78             // Minimum region distance needs to be tracked specially.
79             if (key.getDepth() == 3 && distance < minRegionDistance) {
80                 minRegionDistance = distance;
81             }
82             addMapping(key);
83             if (!oneway && !key.isSymmetrical()) {
84                 addMapping(key.reverse());
85             }
86         }
87 
addMapping(MappingKey key)88         private void addMapping(MappingKey key) {
89             rootNode.addExplicitMapping(key);
90             if (key.hasWildcardMappings()) {
91                 rootNode.addWildcardMappings(key);
92             }
93         }
94 
95         /** Returns the final minimized distance table information. */
build()96         public DistanceTable build() {
97             Node defLangNode = rootNode.getAnyNode();
98             checkState(defLangNode != null, "missing default language mapping: %s", rootNode);
99             Node defScriptNode = defLangNode.getAnyNode();
100             checkState(defScriptNode != null, "missing default script mapping: %s", rootNode);
101             Node defRegionNode = defScriptNode.getAnyNode();
102             checkState(defRegionNode != null, "missing default region mapping: %s", rootNode);
103 
104             // Because we prune the data table, it's important to store the default
105             // distance values separately.
106             int[] distances = new int[IX_LIMIT];
107             distances[IX_DEF_LANG_DISTANCE] = defLangNode.distance;
108             distances[IX_DEF_SCRIPT_DISTANCE] = defScriptNode.distance;
109             distances[IX_DEF_REGION_DISTANCE] = defRegionNode.distance;
110             distances[IX_MIN_REGION_DISTANCE] = minRegionDistance;
111 
112             // Having determined the distances, prune the Trie to remove any sub-tables
113             // where distances could only be determined to be the default value (i.e.
114             // where the existence of that sub-table has no effect).
115             pruneDefaultDistances(defScriptNode.distance, defRegionNode.distance);
116             return new DistanceTable(rootNode, distances);
117         }
118 
119         @Override
toString()120         public String toString() {
121             return String.format("minimum region distance: %d\n%s\n", minRegionDistance, rootNode);
122         }
123 
pruneDefaultDistances(int defScriptDistance, int defRegionDistance)124         private void pruneDefaultDistances(int defScriptDistance, int defRegionDistance) {
125             logger.fine("==== pruning subtables ====");
126             rootNode.subtables.values().forEach(langNode -> {
127                 langNode.subtables.values().forEach(scriptNode -> {
128                     if (scriptNode.subtables.size() == 1) {
129                         // If a script node *only* contains region data with the default
130                         // region distance, that region data can be removed. Since region
131                         // is the lowest level, there's no need to worry about "skipping"
132                         // anything during lookup (unlike the case below).
133                         Node defRegionNode = scriptNode.getAnyNode();
134                         checkState(defRegionNode != null,
135                                 "missing default region node for script: %s", scriptNode);
136                         if (defRegionNode.distance == defRegionDistance) {
137                             scriptNode.subtables.clear();
138                         }
139                     }
140                 });
141                 // Do the pruning in the "upwards" phase of visitation (after recursion) so
142                 // if script subtables are pruned, it's visible here.
143                 if (langNode.subtables.size() == 1) {
144                     // If a language node *only* contains script data with the default
145                     // script distance, we can't just remove it (because it might contain
146                     // region data).
147                     Node defScriptNode = langNode.getAnyNode();
148                     if (defScriptNode.distance == defScriptDistance) {
149                         checkState(defScriptNode != null,
150                                 "missing default script node for language: %s", langNode);
151                         if (defScriptNode.subtables.isEmpty()) {
152                             // If the default script node has no region data, remove it.
153                             langNode.subtables.clear();
154                         } else {
155                             // Otherwise mark script data as "skippable", which indicates
156                             // it should be written in a compact form in the Trie (while
157                             // retaining any region data as normal).
158                             langNode.distance |= DISTANCE_SKIP_SCRIPT;
159                         }
160                     }
161                 }
162             });
163             // After pruning we don't expect any data in the top-level default table.
164             checkState(rootNode.getAnyNode().subtables.isEmpty(),
165                     "invalid table state: %s", rootNode.getAnyNode());
166             rootNode.subtables.rowMap().remove(ANY);
167         }
168     }
169 
builder()170     public static Builder builder() {
171         return new Builder();
172     }
173 
174     private final Node rootNode;
175     private final int[] distances;
176 
DistanceTable(Node rootNode, int[] distances)177     private DistanceTable(Node rootNode, int[] distances) {
178         this.rootNode = rootNode;
179         this.distances = distances;
180     }
181 
getTrie()182     public Trie getTrie() {
183         Trie trie = new Trie();
184         rootNode.writeTo(trie.root());
185         return trie;
186     }
187 
getDefaultDistances()188     public int[] getDefaultDistances() {
189         return distances;
190     }
191 
192     @Override
toString()193     public String toString() {
194         return String.format("default distances: %s\n%s\n", Arrays.toString(distances), rootNode);
195     }
196 
197     private static final class Node {
198         private final Table<String, String, Node> subtables = TreeBasedTable.create();
199         // Distance for the lookup so far (-1 for top level nodes).
200         private int distance;
201 
Node(int distance)202         Node(int distance) {
203             checkArgument(distance >= -1, "invalid distance: %s", distance);
204             this.distance = distance;
205         }
206 
207         /** Returns the subtable node for the top-level mapping of a key. */
getNode(MappingKey key)208         private Node getNode(MappingKey key) {
209             return subtables.get(key.getDesired(), key.getSupported());
210         }
211 
212         /** Returns the subtable node for the {@code <ANY,ANY>} mapping. */
getAnyNode()213         Node getAnyNode() {
214             return subtables.get(ANY, ANY);
215         }
216 
addExplicitMapping(MappingKey key)217         void addExplicitMapping(MappingKey key) {
218             if (key.isLeaf()) {
219                 if (!putIfAbsent(key)) {
220                     logger.fine(() -> String.format("Ignore existing mapping: %s", key));
221                 }
222             } else {
223                 getIntermediateNode(key).addExplicitMapping(key.getSuffix());
224             }
225         }
226 
addWildcardMappings(MappingKey key)227         void addWildcardMappings(MappingKey key) {
228             if (key.isLeaf()) {
229                 putIfAbsent(key);
230             } else if (key.isWildcard()) {
231                 // An intermediate wildcard mapping is applied to all existing sub-nodes.
232                 // NOTE: This will need to change if we want to support "mixed" wildcard mappings.
233                 for (Node node : subtables.values()) {
234                     node.addWildcardMappings(key.getSuffix());
235                 }
236             } else {
237                 // An explicit intermediate mapping only affects an existing exact match.
238                 Node node = getNode(key);
239                 if (node != null) {
240                     node.addWildcardMappings(key.getSuffix());
241                 }
242             }
243         }
244 
245         /**
246          * Adds a new mapping to this node with the specified distance if it didn't already
247          * exist.
248          *
249          * <p>Note: If a mapping already exists, then this method has no effect (even if the
250          * existing distance differs from the given distance). This is necessary to for two
251          * reasons:
252          * <ol>
253          *     <li>An earlier match rule may have set an explicit value for the mapping,
254          *         and we subsequently try to set a default value (via a wildcard mapping).
255          *         This should be ignored, since we want the non-default value to win.
256          *         This means it's important to always have explicit {@code <languageMatch>}
257          *         rules before any related wildcard rules in the CLDR data.
258          *
259          *     <li>A preferential {@code <languageMatch>} rule appears earlier in CLDR data.
260          *         This occurs because of the way partitions are defined and allows for two
261          *         distinct {@code <languageMatch>} rules to generate the same mapping (with
262          *         different distances). This is because region variables reference sets of
263          *         partition IDs and these are not always disjoint (e.g. "en_*_$!enUS" and
264          *         "en_*_GB" both contain the partition ID for "GB").
265          * </ol>
266          *
267          * @return true if a new mapping was added, or if the distances were equal (i.e.
268          *         the operation was idempotent).
269          */
putIfAbsent(MappingKey key)270         private boolean putIfAbsent(MappingKey key) {
271             Node node = getNode(key);
272             if (node == null) {
273                 logger.fine(() -> String.format("add: %s", key));
274                 subtables.put(key.getDesired(), key.getSupported(), new Node(key.getDistance()));
275                 return true;
276             }
277             return (key.getDistance() == node.distance);
278         }
279 
280         /**
281          * Returns a sub-node corresponding to the given {@code (desired, supported)} mapping.
282          * If the node already exists, it is simply returned, otherwise a new node is created
283          * and any existing wildcard mappings are copied into it.
284          */
getIntermediateNode(MappingKey key)285         private Node getIntermediateNode(MappingKey key) {
286             Node node = getNode(key);
287             if (node == null) {
288                 // This is expected to succeed because match rules are given in length
289                 // order (i.e. language only before language+script etc.) and we always
290                 // expect each group to end with an <ANY,ANY> mapping for the default
291                 // distance. Thus, for any longer match rule, we should find (at least)
292                 // the <ANY,ANY> node when looking for intermediate nodes.
293                 //
294                 // NOTE: Currently (desired==ANY) if-and-only-if (supported=ANY), so the
295                 // only non-exact match we can get here is the <ANY,ANY> node. If we ever
296                 // allow a mix of wildcard/non-wildcard keys, replace the getAnyNode() call
297                 // with something like the line below:
298                 // ----
299                 // Node wildcardMatch = Iterables.find(
300                 //      asList(getNode(desired, ANY), getNode(ANY, supported), getNode(ANY,ANY)),
301                 //      Objects::nonNull);
302                 // ----
303                 Node wildcardMatch = getAnyNode();
304                 checkState(wildcardMatch != null, "missing <ANY,ANY> mapping: %s", this);
305                 // Default distances are the distance between any two *different* unknown
306                 // subtags (so if the subtags are the same, the distance is zero).
307                 int distance = key.getDesired().equals(key.getSupported()) ? 0 : wildcardMatch.distance;
308                 node = new Node(distance);
309                 node.copySubtablesFrom(wildcardMatch);
310                 subtables.put(key.getDesired(), key.getSupported(), node);
311             }
312             return node;
313         }
314 
315         /** Copies all subtable mappings from the given node into this one. */
copySubtablesFrom(Node src)316         private void copySubtablesFrom(Node src) {
317             checkState(subtables.isEmpty());
318             src.subtables.cellSet().forEach(
319                     c -> subtables.put(c.getRowKey(), c.getColumnKey(), new Node(c.getValue().distance)));
320         }
321 
322         /**
323          * Writes all the mappings in the distance table sequentially to given Trie in sorted
324          * table order.
325          *
326          * <p>Mappings are written in a top-down recursive visitation with sub-tables inheriting
327          * the current prefix from parent tables via the given Trie span. At each level any
328          * mapped distances are written before recursing into the sub-tables.
329          */
writeTo(Trie.Span trieSpan)330         private void writeTo(Trie.Span trieSpan) {
331             if (distance >= 0 && (distance & DISTANCE_SKIP_SCRIPT) != 0) {
332                 // If a node has a distance set and has been explicitly marked as "skippable",
333                 // then write the "default" subtable using the current Trie prefix (effectively
334                 // having an "empty" prefix for this case).
335                 getAnyNode().writeTo(trieSpan);
336             } else {
337                 // In the normal case, just write the mappings explicitly.
338                 subtables.rowMap().forEach(
339                         (desired, supportedNodes) -> writeSupported(trieSpan, desired, supportedNodes));
340             }
341         }
342 
writeSupported(Trie.Span trieSpan, String desired, Map<String, Node> supportedNodes)343         private void writeSupported(Trie.Span trieSpan, String desired, Map<String, Node> supportedNodes) {
344             // Collapse any (desired=ANY, supported=ANY) mappings into a single '*' in the trie.
345             if (desired.equals(ANY)) {
346                 // If desired is ANY, the only supported subtag must also be ANY.
347                 Node node = supportedNodes.get(ANY);
348                 checkState(node != null && supportedNodes.size() == 1,
349                         "invalid supported subtags for desired='ANY': %s", supportedNodes);
350                 // Remember that ANY != "*", even though it corresponds to "*" in the original
351                 // language match rules. Putting "*" in a Trie means something different (but
352                 // similar enough to be a bit confusing).
353                 trieSpan.with("*", node::writeDistancePlusSubtables);
354             } else {
355                 // In the general case, just write the <desired,supported> distance mapping.
356                 trieSpan.with(desired, withDesiredSpan ->
357                         supportedNodes.forEach((supported, node) -> {
358                             checkState(!supported.equals(ANY),
359                                     "unexpected supported='ANY' subtag: %s", supported);
360                             withDesiredSpan.with(supported, node::writeDistancePlusSubtables);
361                         })
362                 );
363             }
364         }
365 
366         // Writes the distance of this node to the given trie, then recursively writes any
367         // subtable information.
writeDistancePlusSubtables(Trie.Span trieSpan)368         private void writeDistancePlusSubtables(Trie.Span trieSpan) {
369             trieSpan.putPrefixAndValue(distance);
370             writeTo(trieSpan);
371         }
372 
373         @Override
toString()374         public String toString() {
375             StringBuilder buffer = new StringBuilder("distance: ").append(distance).append('\n');
376             return appendToString("", buffer).toString();
377         }
378 
appendToString(String indent, StringBuilder buffer)379         private StringBuilder appendToString(String indent, StringBuilder buffer) {
380             // Top level values are not padded with tabs.
381             String rowIndent = indent.isEmpty() ? "" : "\t";
382             for (Map.Entry<String, Map<String, Node>> row : subtables.rowMap().entrySet()) {
383                 buffer.append(rowIndent).append(row.getKey());
384                 // First column extends the current row, so single tab indent.
385                 String colIndent = "\t";
386                 for (Map.Entry<String, Node> col : row.getValue().entrySet()) {
387                     buffer.append(colIndent).append(col.getKey());
388                     Node subnode = col.getValue();
389                     buffer.append('\t').append(subnode.distance);
390                     // Append any sub-nodes (starting on the same line).
391                     subnode.appendToString(indent + "\t\t\t", buffer).append('\n');
392                     // Later columns need full indent (including skipping row key).
393                     colIndent = indent + '\t';
394                 }
395                 // Later rows need full indent.
396                 rowIndent = indent;
397             }
398             return buffer;
399         }
400     }
401 
402     /**
403      * Excapsulates a sequence of {@code <desired,supported>} pairwise mappings over
404      * language, script and region, with an associated distance. This is an alternate
405      * way to represent a mapping of desired and supported language match rules.
406      *
407      * <p>For example:
408      * <pre>{@code
409      *   <languageMatch desired="en_*_$!enUS", supported="en_*_$GB", distance="3"/>
410      * }</pre>
411      * results in a set of keys of the form:
412      * <pre>{@code
413      *   <en,en> -> <ANY,ANY> -> <X,Y> = 3
414      * }</pre>
415      * where the "region" part {@code <X,Y>} is constructed from all the possible
416      * combinations of partition IDs associated with the original region variables.
417      *
418      * <p>Mapping keys have several useful properties:
419      * <ul>
420      *     <li>They can be reversed (e.g. {@code <A,B> -> <C,D> = N} becomes
421      *     {@code <B,A> -> <D,C> = N}).
422      *     <li>They can be symmetrical (e.g. {@code <X,X> -> <Y,Y> = N}), in which
423      *     case the reversed key is the same as the original.
424      *     <li>They can have wildcard mappings (i.e. {@code <ANY,ANY>}).
425      *     <li>They can produce "suffix" keys (e.g. the suffix of
426      *     {@code <A,B> -> <C,D> = N} is {@code <C,D> = N}).
427      * </ul>
428      */
429     private static final class MappingKey {
430         /**
431          * Returns a new key from the specified subtag pairs, converting {@code '*'}
432          * subtags to the special {@code ANY} string and performing consistency checks.
433          *
434          * @param subtagPairs a sequence of {@code <desired,suported>} pairs.
435          * @param distance the distance associated with the subtag mapping.
436          */
fromSubtags(String[] subtagPairs, int distance)437         static MappingKey fromSubtags(String[] subtagPairs, int distance) {
438             int pairCount = subtagPairs.length;
439             checkArgument(pairCount == 2 || pairCount == 4 || pairCount == 6,
440                     "invalid number of arguments (expected 1, 2 or 3 pairs): %s", asList(subtagPairs));
441             ImmutableList.Builder<String> keyPairs = ImmutableList.builder();
442             for (String subtag : subtagPairs) {
443                 keyPairs.add(fixAny(subtag));
444             }
445             return new MappingKey(keyPairs.build(), distance, false);
446         }
447 
448         // Converts a '*' (from a subtag) into the wildcard match character used by the Trie.
449         // The Trie uses '*' to mean something else, so we convert it at the boundary.
fixAny(String subtag)450         private static String fixAny(String subtag) {
451             return subtag.equals("*") ? ANY : subtag;
452         }
453 
454         private final ImmutableList<String> pairs;
455         private final int distance;
456         private final boolean isReversed;
457         private final boolean isSymmetrical;
458         private final boolean hasWildcardMappings;
459 
MappingKey(ImmutableList<String> pairs, int distance, boolean isReversed)460         private MappingKey(ImmutableList<String> pairs, int distance, boolean isReversed) {
461             this.pairs = pairs;
462             this.distance = distance;
463             this.isReversed = isReversed;
464             checkArgument(distance >= 0 && distance <= MAX_REGION_DISTANCE,
465                     "invalid mapping key distance: %s", distance);
466             // Check that if a key has "ANY" mappings, it is consistent. We expect to only
467             // get <ANY,ANY> pairs (e.g. not <X,ANY> or <ANY,X>).
468             boolean isSymmetrical = true;
469             boolean hasWildcardMappings = false;
470             for (int i = 0; i < pairs.size(); i += 2) {
471                 String desired = pairs.get(i);
472                 String supported = pairs.get(i + 1);
473                 checkArgument(desired.equals(ANY) == supported.equals(ANY),
474                         "invalid mapping key pairs: %s", pairs);
475                 hasWildcardMappings |= desired.equals(ANY);
476                 isSymmetrical &= desired.equals(supported);
477             }
478             this.isSymmetrical = isSymmetrical;
479             this.hasWildcardMappings = hasWildcardMappings;
480         }
481 
482         /** Returns the "desired" value of the current (top-level) mapping. */
getDesired()483         String getDesired() {
484             return pairs.get(isReversed ? 1 : 0);
485         }
486 
487         /** Returns the "supported" value of the current (top-level) mapping. */
getSupported()488         String getSupported() {
489             return pairs.get(isReversed ? 0 : 1);
490         }
491 
492         /** Returns the non-negative distance mapped to by this key. */
getDistance()493         int getDistance() {
494             return distance;
495         }
496 
497         /**
498          * Returns the number of {@code <desired,supported>} mappings in this key; this is
499          * either 1 (language-only), 2 (language & script) or 3 (language, script & region).
500          */
getDepth()501         int getDepth() {
502             return pairs.size() / 2;
503         }
504 
505         /** Returns true if this key does not have a suffix. */
isLeaf()506         boolean isLeaf() {
507             return getDepth() == 1;
508         }
509 
510         /**
511          * Returns if any of the {@code <desired,supported>} mappings are {@code <ANY,ANY>}.
512          */
hasWildcardMappings()513         boolean hasWildcardMappings() {
514             return hasWildcardMappings;
515         }
516 
517         /**
518          * Returns if the top-level {@code <desired,supported>} mapping is {@code <ANY,ANY>}.
519          */
isWildcard()520         boolean isWildcard() {
521             return getDesired().equals(ANY);
522         }
523 
524         /**
525          * Returns if this key is pair-wise symmetrical (e.g. {@code "<X,X> -> <Y,Y> = N"}).
526          * Symmetrical mappings don't need to be added in reverse.
527          */
isSymmetrical()528         boolean isSymmetrical() {
529             return isSymmetrical;
530         }
531 
532         /** Returns a new key where each {@code <desired,supported>} mapping is reversed. */
reverse()533         MappingKey reverse() {
534             checkState(!isReversed, "cannot revese a reversed key");
535             return new MappingKey(pairs, distance, true);
536         }
537 
538         /**
539          * Returns the suffix of this non-leaf key with the top-level mapping removed. For
540          * example, the suffix of {@code "<A,B> -> <C,D> = N"} is {@code "<C,D> = N"}).
541          */
getSuffix()542         MappingKey getSuffix() {
543             checkState(!isLeaf(), "cannot get 'next' for an empty key");
544             return new MappingKey(pairs.subList(2, pairs.size()), distance, isReversed);
545         }
546 
547         @Override
toString()548         public String toString() {
549             return isLeaf()
550                     ? String.format("<%s, %s> = %d", getDesired(), getSupported(), getDistance())
551                     : String.format("<%s, %s> -> %s", getDesired(), getSupported(), getSuffix());
552         }
553     }
554 }
555