1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu.localedistance;
4 
5 import static com.google.common.base.Preconditions.checkArgument;
6 import static com.google.common.base.Preconditions.checkState;
7 
8 import java.util.Collection;
9 import java.util.Collections;
10 import java.util.HashSet;
11 import java.util.Map;
12 import java.util.Set;
13 import java.util.TreeSet;
14 import java.util.function.BiConsumer;
15 import java.util.function.Consumer;
16 import java.util.function.Function;
17 import java.util.logging.Logger;
18 
19 import com.google.common.base.CharMatcher;
20 import com.google.common.collect.ImmutableMap;
21 import com.google.common.collect.ImmutableSet;
22 import com.google.common.collect.ImmutableSetMultimap;
23 import com.google.common.collect.SetMultimap;
24 import com.google.common.collect.Sets;
25 import com.google.common.collect.SortedSetMultimap;
26 import com.google.common.collect.TreeMultimap;
27 import com.ibm.icu.impl.locale.LSR;
28 
29 /**
30  * Provides mapping arrays to quickly lookup partition information for any region
31  * code in client libraries.
32  *
33  * <p>A region's partition is defined by the set of region variables (e.g. "$enUS")
34  * in the CLDR data. Each unique combination of variables forms a partition, and
35  * groups of partitions uniquely define language distance groupings. In slightly
36  * mathematical terms, partition groups form an "equivalence class" for regions
37  * with respect to language distance.
38  *
39  * <p>So by determining the minimum set of partitions and partition groups, and
40  * assigning short IDs to them, it's possibe to create data structures which
41  * support all region pairings while being small and fast to access in client code.
42  */
43 final class PartitionInfo {
44     private static final Logger logger = Logger.getLogger(PartitionInfo.class.getName());
45 
46     /**
47      * A builder, to which region variables are added in order to define partitions
48      * and partition groups based on territory containment.
49      */
50     static final class Builder {
51         // Possible operations to parse from a region expression (e.g. "US+005-BR").
52         private static final CharMatcher REGION_OPS = CharMatcher.anyOf("+-");
53 
54         private final TerritoryContainment territories;
55         private final Set<String> variables = new HashSet<>();
56         private final SortedSetMultimap<String, String> regionToVariables = TreeMultimap.create();
57 
Builder(TerritoryContainment territories)58         private Builder(TerritoryContainment territories) {
59             this.territories = territories;
60         }
61 
62         // Returns whether the given string is a known variable or the wildcard token.
63         // Non variable strings (e.g. plain region codes) can be passed in and simply
64         // return false.
isKnownVariableOrWildcard(String variable)65         private boolean isKnownVariableOrWildcard(String variable) {
66             return variables.contains(variable) || variable.equals("*");
67         }
68 
69         /**
70          * Adds a variable expression (e.g. "$foo = "US+005-BR") from CLDR data and
71          * fully resolves all macro regions to their contained leaf regions.
72          *
73          * <p>The syntax is simple for now:
74          * <pre>
75          *     regionSet := region ([-+] region)*
76          * </pre>
77          * There is no precedence, so "x+y-y+z" is "(((x+y)-y)+z)", and <em>not</em>
78          * "(x+y)-(y+z)".
79          */
addVariableExpression(String variable, String expr)80         public void addVariableExpression(String variable, String expr) {
81             checkState(variable.startsWith("$") && !variable.startsWith("$!"),
82                     "invalid variable: %s", variable);
83             checkState(!isKnownVariableOrWildcard(variable),
84                     "duplicate variable: %s", variable);
85             // Parsing also flattens the list to the corresponding leaf regions,
86             // so there should be no macro regions here.
87             Set<String> regions = parseAndFlattenRegionExpression(expr, territories);
88             // Add the mappings ("$foo" -> X) and the inverse ("$!foo" -> not(X)).
89             //
90             // The reason that the inverse mapping is needed is because some rules use
91             // the negated form of a variable (e.g. "$!enUS") and we must be able to
92             // resolve the set of associated partition IDs for it.
93             //
94             // If we only wanted the set of regions for the negated variable, that
95             // would be trivial (and there would be no need to store the negated values)
96             // but because the set of partition IDs for a negated variable is NOT always
97             // the negated set of parition IDs for the original variable (due to the way
98             // partitions overlap) it's not straightforward.
99             //
100             // In other words:
101             //     regions-for("$!foo") == !regions-for("$foo))
102             // but:
103             //     partition-ids-for("$!foo") != !partition-ids-for("$foo")
104             addVariable(variable, regions);
105             addVariable(
106                     "$!" + variable.substring(1),
107                     Sets.difference(territories.getLeafRegions(), regions));
108         }
109 
addVariable(String variable, Iterable<String> regions)110         private void addVariable(String variable, Iterable<String> regions) {
111             checkArgument(variables.add(variable),
112                     "variable '%s' already present in: %s", variable, regions);
113             for (String region : regions) {
114                 checkArgument(!region.isEmpty(), "%s", regions);
115                 regionToVariables.put(region, variable);
116             }
117         }
118 
119         // Parses a region expression (e.g. "US+005-BR") to a set of resolved "leaf"
120         // regions.
parseAndFlattenRegionExpression( String expr, TerritoryContainment territories)121         private static Set<String> parseAndFlattenRegionExpression(
122                 String expr, TerritoryContainment territories) {
123             Set<String> regions = new TreeSet<>();
124             Consumer<String> operation = regions::add;
125             int last = 0;
126             for (int i = REGION_OPS.indexIn(expr); i != -1; i = REGION_OPS.indexIn(expr, last)) {
127                 applyOperation(operation, expr.substring(last, i), territories);
128                 // Set up the next operation based on the separator char ('+' or '-').
129                 operation = (expr.charAt(i) == '+') ? regions::add : regions::remove;
130                 last = i + 1;
131             }
132             applyOperation(operation, expr.substring(last), territories);
133             return regions;
134         }
135 
applyOperation( Consumer<String> operation, String region, TerritoryContainment territories)136         private static void applyOperation(
137                 Consumer<String> operation, String region, TerritoryContainment territories) {
138             checkArgument(!region.isEmpty(), "invalid region expresson (missing region)");
139             ImmutableSet<String> contained = territories.getLeafRegionsOf(region);
140             if (!contained.isEmpty()) {
141                 // For macro regions, add all their contained leaf regions (direct or indirect).
142                 contained.forEach(operation);
143             } else {
144                 // Leaf regions are just added directly.
145                 operation.accept(region);
146             }
147         }
148 
149         /**
150          * Registers an implicit variable defined by a region code, and returns the new variable
151          * name.
152          *
153          * <p>This method exists because the {@code <languageMatch>} syntax supports referencing
154          * regions directly, rather than just as pre-defined variables (e.g. "en_*_GB"). We still
155          * want to track these variables however since they may interact with macro-regions.
156          *
157          * @param regionOrVariable a region or an existing variable reference.
158          * @return the name of the registered variable (including '$' prefix).
159          */
ensureVariable(String regionOrVariable)160         public String ensureVariable(String regionOrVariable) {
161             if (isKnownVariableOrWildcard(regionOrVariable)) {
162                 return regionOrVariable;
163             }
164             // Here we either have a "raw" region (e.g. "GB") or an unknown variable (e.g. "$foo").
165             // However all explicit variables should have already been registered, so if this does
166             // start with '$', then it's an error.
167             checkArgument(!regionOrVariable.startsWith("$"), "unregistered variable: %s", regionOrVariable);
168 
169             // This is an implicit variable, referenced by its region code, so we know that it
170             // can never be referenced in the negated form (i.e. "$!GB"), so we don't need to add
171             // the inverse mapping in the same way we do for explicitly defined variables.
172             //
173             // We also allow implicit variables to appear more than once in the list of match
174             // rules, so don't call addVariable() here, since that prohibits repeated addition.
175             // Since 'regionToVariables' is a _set_ multimap, adding implicit variables is an
176             // idempotent operation, so it's okay if it's done more than once.
177             String variable = "$" + regionOrVariable;
178             variables.add(variable);
179             regionToVariables.put(regionOrVariable, variable);
180             return variable;
181         }
182 
build()183         public PartitionInfo build() {
184             // Step 1: Map regions to a unique "partition" ID.
185             //
186             // A region's partition is the set of variables which include it, and
187             // variables can be explicit (e.g. "$enUS"), implicit (e.g. "$GB") or
188             // negated (e.g. "$!enUS).
189             //
190             // For example, region "US" is included in the variables "$americas" and
191             // "$enUS", but is also referenced in the "negated" variables "$!cnsar"
192             // and "$!maghreb", so the "partition" of "US" is:
193             //     { $americas, $enUS, $!cnsar, $!maghreb }
194             //
195             // A partition ID is a token associated with each unique variable partition.
196             //
197             // Since other regions, such as "PR" (Puerto Rico) and "VI" (U.S. Virgin
198             // Islands), are also "in" the same partition as "US", they will share the
199             // same partition ID.
200             //
201             // However, while "CA" is also included in "$americas", it's NOT defined as
202             // an "$enUS" (American English) region, so its partition is:
203             //     { $americas, $!enUS, $!cnsar, $!maghreb }
204             // and it will have a different partition ID.
205 
206             // Check that the region-to-partition map covers every leaf region (this
207             // is important to ensure partitions form a disjoint covering).
208             checkArgument(regionToVariables.keySet().equals(territories.getLeafRegions()),
209                     "unexpected variable grouping (should cover all leaf regions): %s",
210                     regionToVariables);
211             ImmutableMap<String, String> regionToPartitionId =
212                     mapLeafRegionsToPartitionIds(regionToVariables);
213             logger.fine(() -> String.format("region to partition ID: %s", regionToPartitionId));
214 
215             // Step 2: Construct mappings to and from partition IDs, to group regions
216             // by the variables that define them.
217 
218             // A sorted mapping from every variable ("$foo" or "$!foo") to the IDs of
219             // the partitions it exists in.
220             //
221             // For example, "$americas" exists in partitions for both "$enUS" (American
222             // English) and "$!enUS" (non-American English) regions, so will be mapped
223             // to (at least) two unique parition IDs (e.g. X & Y).
224             //   "$americas" -> { X, Y }
225             ImmutableSetMultimap<String, String> variableToPartitionIds =
226                     mapVariablesToPartitionIds(regionToPartitionId, regionToVariables);
227             logger.fine(() -> String.format("variable to partition IDs: %s", variableToPartitionIds));
228 
229             // A sorted mapping of each macro region to the partitions it intersects
230             // with. Unlike leaf regions, macro regions can map to groups of partitions
231             // rather than just a single one.
232             //
233             // For example, the macro region "419" (Latin America) intersects with
234             // both partitions:
235             //     X = {$americas, $enUS, ...}  (i.e. "Americas + American English")
236             // and:
237             //     Y = {$americas, $!enUS, ...} (i.e. "Americas + non-American English")
238             // so this map would contain:
239             //     "419" -> { X, Y }
240             ImmutableSetMultimap<String, String> macroRegionToPartitionIds =
241                     mapMacroRegionsToPartitionIds(regionToPartitionId, territories);
242 
243             // Step 3: Write the sparse "region index to partition group index" lookup
244             // array. This is the fast lookup array used to go from LSR region index to
245             // the partition group IDs for that region.
246             //
247             // Note that most entries in the array are zero, since the array maps from
248             // all possible regions, not just ones which exist. This is a space/time
249             // trade-off (and the array is compressed in the ICU data files anyway).
250             byte[] partitionLookupArray = new byte[LSR.REGION_INDEX_LIMIT];
251             String[] partitionStrings = writePartitionLookupTable(
252                     partitionLookupArray, regionToPartitionId, macroRegionToPartitionIds);
253 
254             return new PartitionInfo(variableToPartitionIds, partitionLookupArray, partitionStrings);
255         }
256 
mapLeafRegionsToPartitionIds( SetMultimap<String, String> regionToVariables)257         private static ImmutableMap<String, String> mapLeafRegionsToPartitionIds(
258                 SetMultimap<String, String> regionToVariables) {
259             // A generator for partition IDs which returns a single ASCII character for
260             // each unique partition.
261             //
262             // Partition IDs are emitted into the ICU data, so it's important they are
263             // small and compatible with the ICU data file format.
264             Function<Collection<String>, String> partitionToId =
265                     Indexer.create(i -> {
266                         // Must be a single 7-bit ASCII value and not '*'. This is NOT
267                         // used as a numeric value anywhere and could end up being a non
268                         // digit character if the number of unique partitions is > 10.
269                         // As of June 2020, there are only 7 unique paritions.
270                         char partitionChar = (char) ('0' + i);
271                         checkState(partitionChar < 0x7f, "too many partitions: %s", i);
272                         return String.valueOf(partitionChar);
273                     });
274 
275             // For each region, find its partition ID (based on the unique combination
276             // of variables that define it).
277             ImmutableMap.Builder<String, String> regionToId = ImmutableMap.builder();
278             regionToVariables.asMap().forEach(
279                     (region, variables) -> regionToId.put(region, partitionToId.apply(variables)));
280             return regionToId.build();
281         }
282 
mapVariablesToPartitionIds( ImmutableMap<String, String> regionToPartitionId, SortedSetMultimap<String, String> regionToVariables)283         private static ImmutableSetMultimap<String, String> mapVariablesToPartitionIds(
284                 ImmutableMap<String, String> regionToPartitionId,
285                 SortedSetMultimap<String, String> regionToVariables) {
286 
287             // It's vital that this is a sorted multimap (of values as well as keys)
288             // since the values are later indexed and turned into partition strings
289             // (so stability of ID order in values is necessary).
290             SortedSetMultimap<String, String> variableToPartitionIds = TreeMultimap.create();
291             regionToVariables.asMap().forEach((region, variables) -> {
292                 String partitionId = regionToPartitionId.get(region);
293                 for (String variable : variables) {
294                     variableToPartitionIds.put(variable, partitionId);
295                 }
296             });
297             return ImmutableSetMultimap.copyOf(variableToPartitionIds);
298         }
299 
mapMacroRegionsToPartitionIds( ImmutableMap<String, String> regionToPartitionId, TerritoryContainment territories)300         private static ImmutableSetMultimap<String, String> mapMacroRegionsToPartitionIds(
301                 ImmutableMap<String, String> regionToPartitionId,
302                 TerritoryContainment territories) {
303 
304             // A mapping from each unique partition ID to the regions it contains.
305             // This mapping forms a disjoint covering of all (non-macro) regions and
306             // is just the "inverse" of the initial "region to partition ID" map.
307             //
308             // For example, following the examples above where:
309             //     X = {$americas, $enUS, ...}
310             // and:
311             //     Y = {$americas, $!enUS, ...}
312             //
313             // We would get something like:
314             //     X -> {"PR", "US", "VI", ...}
315             //     Y -> {"CA", ...}
316             Map<String, Collection<String>> partitionToRegions =
317                     regionToPartitionId.asMultimap().inverse().asMap();
318 
319             // Each macro region can then be decomposed to a mapping to the unique set
320             // of partitions it overlaps with based on its leaf regions and the regions
321             // of all known partitions.
322             SortedSetMultimap<String, String> macroToPartitions = TreeMultimap.create();
323             for (String macro : territories.getMacroRegions()) {
324                 ImmutableSet<String> leaves = territories.getLeafRegionsOf(macro);
325                 partitionToRegions.forEach((partition, regions) -> {
326                     if (!Collections.disjoint(leaves, regions)) {
327                         macroToPartitions.put(macro, partition);
328                     }
329                 });
330             }
331             return ImmutableSetMultimap.copyOf(macroToPartitions);
332         }
333 
writePartitionLookupTable( byte[] partitionLookupArray, ImmutableMap<String, String> regionToPartitionId, ImmutableSetMultimap<String, String> macroRegionToPartitionIds)334         private static String[] writePartitionLookupTable(
335                 byte[] partitionLookupArray,
336                 ImmutableMap<String, String> regionToPartitionId,
337                 ImmutableSetMultimap<String, String> macroRegionToPartitionIds) {
338 
339             // A generator for indices of partition groups, based on partition IDs.
340             //
341             // For leaf regions this generates a one-to-one mapping with the single
342             // partition ID, but macro regions can overlap multiple partitions.
343             Indexer<Collection<String>, Byte> partitionGroupIndexer =
344                     Indexer.create(i -> {
345                         // The partition group index must fit in a byte.
346                         // For Java code simplicity, we want it to also be non-negative.
347                         // As of June 2020, there are 15 partition groups.
348                         checkState(i <= 0x7f, "too many partition groups: %s", i);
349                         return (byte) i.intValue();
350                     });
351 
352             // The default value in the partition lookup array (index 0) is mapped to by
353             // any unsupported region (since "LSR.indexForRegion(<invalid region>)" is 0).
354             // We must therefore reserve a special parition group index for these cases
355             // before adding the rest of the partitions.
356             partitionGroupIndexer.apply(ImmutableSet.of("."));
357 
358             // Populate the radix-based sparse index array, where each region is converted
359             // to the LSR region index (which must correspond to how regions are indexed in
360             // the client side code).
361             BiConsumer<String, Collection<String>> writePartitionIndex =
362                     (region, ids) -> partitionLookupArray[LSR.indexForRegion(region)] =
363                             partitionGroupIndexer.apply(ids);
364 
365             // Write leaf regions first (mostly to match the original code behaviour)
366             // and then macro regions.
367             //
368             // Convert the Map<String, String> to a Map<String, Collection<String>>
369             // to match the macro regions (even though each collection is a singleton).
370             regionToPartitionId.asMultimap().asMap().forEach(writePartitionIndex);
371             macroRegionToPartitionIds.asMap().forEach(writePartitionIndex);
372 
373             // Check invalid reigons will map to the special "missing partition" value.
374             checkState(partitionLookupArray[0] == 0);
375 
376             // Return the unique partition groups (sets of partition IDs) as strings
377             // (as a sequence of single letter partition IDs). Leaf regions will always
378             // have a single partition ID, but macro regions can overlap with multiple
379             // partitions.
380             return partitionGroupIndexer.getValues().stream()
381                     .map(ids -> String.join("", ids)).toArray(String[]::new);
382         }
383     }
384 
385     /**
386      * Returns a builder to which variable mappings are added, from which partition
387      * information is derived.
388      */
builder(TerritoryContainment territories)389     public static Builder builder(TerritoryContainment territories) {
390         return new Builder(territories);
391     }
392 
393     private final ImmutableSetMultimap<String, String> variableToPartitionIds;
394     private final byte[] partitionLookupArray;
395     private final String[] partitionStrings;
396 
PartitionInfo( ImmutableSetMultimap<String, String> variableToPartitionIds, byte[] partitionLookupArray, String[] partitionStrings)397     private PartitionInfo(
398             ImmutableSetMultimap<String, String> variableToPartitionIds,
399             byte[] partitionLookupArray,
400             String[] partitionStrings) {
401         this.variableToPartitionIds = ImmutableSetMultimap.copyOf(variableToPartitionIds);
402         this.partitionLookupArray = partitionLookupArray;
403         this.partitionStrings = partitionStrings;
404     }
405 
406     /**
407      * Returns the set of partition IDs for the given variable, or {@code {"*"}} if the
408      * speical '*' variable was given. The returned set must be non-empty because every
409      * variable includes at least one region, and all regions map to a partition ID.
410      */
getPartitionIds(String variable)411     public ImmutableSet<String> getPartitionIds(String variable) {
412         if (variable.equals("*")) {
413             return ImmutableSet.of("*");
414         }
415         ImmutableSet<String> result = variableToPartitionIds.get(variable);
416         checkArgument(!result.isEmpty(), "variable not defined: %s", variable);
417         return result;
418     }
419 
420     /** Returns the sparse lookup array from LSR region index to partition group index. */
getPartitionLookupArray()421     public byte[] getPartitionLookupArray() {
422         return partitionLookupArray;
423     }
424 
425     /**
426      * Returns the partition group lookup array from partition group index to partition
427      * ID string.
428      */
getPartitionStrings()429     public String[] getPartitionStrings() {
430         return partitionStrings;
431     }
432 }
433