1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu.localedistance; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.base.Preconditions.checkState; 7 8 import java.util.Collection; 9 import java.util.Collections; 10 import java.util.HashSet; 11 import java.util.Map; 12 import java.util.Set; 13 import java.util.TreeSet; 14 import java.util.function.BiConsumer; 15 import java.util.function.Consumer; 16 import java.util.function.Function; 17 import java.util.logging.Logger; 18 19 import com.google.common.base.CharMatcher; 20 import com.google.common.collect.ImmutableMap; 21 import com.google.common.collect.ImmutableSet; 22 import com.google.common.collect.ImmutableSetMultimap; 23 import com.google.common.collect.SetMultimap; 24 import com.google.common.collect.Sets; 25 import com.google.common.collect.SortedSetMultimap; 26 import com.google.common.collect.TreeMultimap; 27 import com.ibm.icu.impl.locale.LSR; 28 29 /** 30 * Provides mapping arrays to quickly lookup partition information for any region 31 * code in client libraries. 32 * 33 * <p>A region's partition is defined by the set of region variables (e.g. "$enUS") 34 * in the CLDR data. Each unique combination of variables forms a partition, and 35 * groups of partitions uniquely define language distance groupings. In slightly 36 * mathematical terms, partition groups form an "equivalence class" for regions 37 * with respect to language distance. 38 * 39 * <p>So by determining the minimum set of partitions and partition groups, and 40 * assigning short IDs to them, it's possibe to create data structures which 41 * support all region pairings while being small and fast to access in client code. 42 */ 43 final class PartitionInfo { 44 private static final Logger logger = Logger.getLogger(PartitionInfo.class.getName()); 45 46 /** 47 * A builder, to which region variables are added in order to define partitions 48 * and partition groups based on territory containment. 49 */ 50 static final class Builder { 51 // Possible operations to parse from a region expression (e.g. "US+005-BR"). 52 private static final CharMatcher REGION_OPS = CharMatcher.anyOf("+-"); 53 54 private final TerritoryContainment territories; 55 private final Set<String> variables = new HashSet<>(); 56 private final SortedSetMultimap<String, String> regionToVariables = TreeMultimap.create(); 57 Builder(TerritoryContainment territories)58 private Builder(TerritoryContainment territories) { 59 this.territories = territories; 60 } 61 62 // Returns whether the given string is a known variable or the wildcard token. 63 // Non variable strings (e.g. plain region codes) can be passed in and simply 64 // return false. isKnownVariableOrWildcard(String variable)65 private boolean isKnownVariableOrWildcard(String variable) { 66 return variables.contains(variable) || variable.equals("*"); 67 } 68 69 /** 70 * Adds a variable expression (e.g. "$foo = "US+005-BR") from CLDR data and 71 * fully resolves all macro regions to their contained leaf regions. 72 * 73 * <p>The syntax is simple for now: 74 * <pre> 75 * regionSet := region ([-+] region)* 76 * </pre> 77 * There is no precedence, so "x+y-y+z" is "(((x+y)-y)+z)", and <em>not</em> 78 * "(x+y)-(y+z)". 79 */ addVariableExpression(String variable, String expr)80 public void addVariableExpression(String variable, String expr) { 81 checkState(variable.startsWith("$") && !variable.startsWith("$!"), 82 "invalid variable: %s", variable); 83 checkState(!isKnownVariableOrWildcard(variable), 84 "duplicate variable: %s", variable); 85 // Parsing also flattens the list to the corresponding leaf regions, 86 // so there should be no macro regions here. 87 Set<String> regions = parseAndFlattenRegionExpression(expr, territories); 88 // Add the mappings ("$foo" -> X) and the inverse ("$!foo" -> not(X)). 89 // 90 // The reason that the inverse mapping is needed is because some rules use 91 // the negated form of a variable (e.g. "$!enUS") and we must be able to 92 // resolve the set of associated partition IDs for it. 93 // 94 // If we only wanted the set of regions for the negated variable, that 95 // would be trivial (and there would be no need to store the negated values) 96 // but because the set of partition IDs for a negated variable is NOT always 97 // the negated set of parition IDs for the original variable (due to the way 98 // partitions overlap) it's not straightforward. 99 // 100 // In other words: 101 // regions-for("$!foo") == !regions-for("$foo)) 102 // but: 103 // partition-ids-for("$!foo") != !partition-ids-for("$foo") 104 addVariable(variable, regions); 105 addVariable( 106 "$!" + variable.substring(1), 107 Sets.difference(territories.getLeafRegions(), regions)); 108 } 109 addVariable(String variable, Iterable<String> regions)110 private void addVariable(String variable, Iterable<String> regions) { 111 checkArgument(variables.add(variable), 112 "variable '%s' already present in: %s", variable, regions); 113 for (String region : regions) { 114 checkArgument(!region.isEmpty(), "%s", regions); 115 regionToVariables.put(region, variable); 116 } 117 } 118 119 // Parses a region expression (e.g. "US+005-BR") to a set of resolved "leaf" 120 // regions. parseAndFlattenRegionExpression( String expr, TerritoryContainment territories)121 private static Set<String> parseAndFlattenRegionExpression( 122 String expr, TerritoryContainment territories) { 123 Set<String> regions = new TreeSet<>(); 124 Consumer<String> operation = regions::add; 125 int last = 0; 126 for (int i = REGION_OPS.indexIn(expr); i != -1; i = REGION_OPS.indexIn(expr, last)) { 127 applyOperation(operation, expr.substring(last, i), territories); 128 // Set up the next operation based on the separator char ('+' or '-'). 129 operation = (expr.charAt(i) == '+') ? regions::add : regions::remove; 130 last = i + 1; 131 } 132 applyOperation(operation, expr.substring(last), territories); 133 return regions; 134 } 135 applyOperation( Consumer<String> operation, String region, TerritoryContainment territories)136 private static void applyOperation( 137 Consumer<String> operation, String region, TerritoryContainment territories) { 138 checkArgument(!region.isEmpty(), "invalid region expresson (missing region)"); 139 ImmutableSet<String> contained = territories.getLeafRegionsOf(region); 140 if (!contained.isEmpty()) { 141 // For macro regions, add all their contained leaf regions (direct or indirect). 142 contained.forEach(operation); 143 } else { 144 // Leaf regions are just added directly. 145 operation.accept(region); 146 } 147 } 148 149 /** 150 * Registers an implicit variable defined by a region code, and returns the new variable 151 * name. 152 * 153 * <p>This method exists because the {@code <languageMatch>} syntax supports referencing 154 * regions directly, rather than just as pre-defined variables (e.g. "en_*_GB"). We still 155 * want to track these variables however since they may interact with macro-regions. 156 * 157 * @param regionOrVariable a region or an existing variable reference. 158 * @return the name of the registered variable (including '$' prefix). 159 */ ensureVariable(String regionOrVariable)160 public String ensureVariable(String regionOrVariable) { 161 if (isKnownVariableOrWildcard(regionOrVariable)) { 162 return regionOrVariable; 163 } 164 // Here we either have a "raw" region (e.g. "GB") or an unknown variable (e.g. "$foo"). 165 // However all explicit variables should have already been registered, so if this does 166 // start with '$', then it's an error. 167 checkArgument(!regionOrVariable.startsWith("$"), "unregistered variable: %s", regionOrVariable); 168 169 // This is an implicit variable, referenced by its region code, so we know that it 170 // can never be referenced in the negated form (i.e. "$!GB"), so we don't need to add 171 // the inverse mapping in the same way we do for explicitly defined variables. 172 // 173 // We also allow implicit variables to appear more than once in the list of match 174 // rules, so don't call addVariable() here, since that prohibits repeated addition. 175 // Since 'regionToVariables' is a _set_ multimap, adding implicit variables is an 176 // idempotent operation, so it's okay if it's done more than once. 177 String variable = "$" + regionOrVariable; 178 variables.add(variable); 179 regionToVariables.put(regionOrVariable, variable); 180 return variable; 181 } 182 build()183 public PartitionInfo build() { 184 // Step 1: Map regions to a unique "partition" ID. 185 // 186 // A region's partition is the set of variables which include it, and 187 // variables can be explicit (e.g. "$enUS"), implicit (e.g. "$GB") or 188 // negated (e.g. "$!enUS). 189 // 190 // For example, region "US" is included in the variables "$americas" and 191 // "$enUS", but is also referenced in the "negated" variables "$!cnsar" 192 // and "$!maghreb", so the "partition" of "US" is: 193 // { $americas, $enUS, $!cnsar, $!maghreb } 194 // 195 // A partition ID is a token associated with each unique variable partition. 196 // 197 // Since other regions, such as "PR" (Puerto Rico) and "VI" (U.S. Virgin 198 // Islands), are also "in" the same partition as "US", they will share the 199 // same partition ID. 200 // 201 // However, while "CA" is also included in "$americas", it's NOT defined as 202 // an "$enUS" (American English) region, so its partition is: 203 // { $americas, $!enUS, $!cnsar, $!maghreb } 204 // and it will have a different partition ID. 205 206 // Check that the region-to-partition map covers every leaf region (this 207 // is important to ensure partitions form a disjoint covering). 208 checkArgument(regionToVariables.keySet().equals(territories.getLeafRegions()), 209 "unexpected variable grouping (should cover all leaf regions): %s", 210 regionToVariables); 211 ImmutableMap<String, String> regionToPartitionId = 212 mapLeafRegionsToPartitionIds(regionToVariables); 213 logger.fine(() -> String.format("region to partition ID: %s", regionToPartitionId)); 214 215 // Step 2: Construct mappings to and from partition IDs, to group regions 216 // by the variables that define them. 217 218 // A sorted mapping from every variable ("$foo" or "$!foo") to the IDs of 219 // the partitions it exists in. 220 // 221 // For example, "$americas" exists in partitions for both "$enUS" (American 222 // English) and "$!enUS" (non-American English) regions, so will be mapped 223 // to (at least) two unique parition IDs (e.g. X & Y). 224 // "$americas" -> { X, Y } 225 ImmutableSetMultimap<String, String> variableToPartitionIds = 226 mapVariablesToPartitionIds(regionToPartitionId, regionToVariables); 227 logger.fine(() -> String.format("variable to partition IDs: %s", variableToPartitionIds)); 228 229 // A sorted mapping of each macro region to the partitions it intersects 230 // with. Unlike leaf regions, macro regions can map to groups of partitions 231 // rather than just a single one. 232 // 233 // For example, the macro region "419" (Latin America) intersects with 234 // both partitions: 235 // X = {$americas, $enUS, ...} (i.e. "Americas + American English") 236 // and: 237 // Y = {$americas, $!enUS, ...} (i.e. "Americas + non-American English") 238 // so this map would contain: 239 // "419" -> { X, Y } 240 ImmutableSetMultimap<String, String> macroRegionToPartitionIds = 241 mapMacroRegionsToPartitionIds(regionToPartitionId, territories); 242 243 // Step 3: Write the sparse "region index to partition group index" lookup 244 // array. This is the fast lookup array used to go from LSR region index to 245 // the partition group IDs for that region. 246 // 247 // Note that most entries in the array are zero, since the array maps from 248 // all possible regions, not just ones which exist. This is a space/time 249 // trade-off (and the array is compressed in the ICU data files anyway). 250 byte[] partitionLookupArray = new byte[LSR.REGION_INDEX_LIMIT]; 251 String[] partitionStrings = writePartitionLookupTable( 252 partitionLookupArray, regionToPartitionId, macroRegionToPartitionIds); 253 254 return new PartitionInfo(variableToPartitionIds, partitionLookupArray, partitionStrings); 255 } 256 mapLeafRegionsToPartitionIds( SetMultimap<String, String> regionToVariables)257 private static ImmutableMap<String, String> mapLeafRegionsToPartitionIds( 258 SetMultimap<String, String> regionToVariables) { 259 // A generator for partition IDs which returns a single ASCII character for 260 // each unique partition. 261 // 262 // Partition IDs are emitted into the ICU data, so it's important they are 263 // small and compatible with the ICU data file format. 264 Function<Collection<String>, String> partitionToId = 265 Indexer.create(i -> { 266 // Must be a single 7-bit ASCII value and not '*'. This is NOT 267 // used as a numeric value anywhere and could end up being a non 268 // digit character if the number of unique partitions is > 10. 269 // As of June 2020, there are only 7 unique paritions. 270 char partitionChar = (char) ('0' + i); 271 checkState(partitionChar < 0x7f, "too many partitions: %s", i); 272 return String.valueOf(partitionChar); 273 }); 274 275 // For each region, find its partition ID (based on the unique combination 276 // of variables that define it). 277 ImmutableMap.Builder<String, String> regionToId = ImmutableMap.builder(); 278 regionToVariables.asMap().forEach( 279 (region, variables) -> regionToId.put(region, partitionToId.apply(variables))); 280 return regionToId.build(); 281 } 282 mapVariablesToPartitionIds( ImmutableMap<String, String> regionToPartitionId, SortedSetMultimap<String, String> regionToVariables)283 private static ImmutableSetMultimap<String, String> mapVariablesToPartitionIds( 284 ImmutableMap<String, String> regionToPartitionId, 285 SortedSetMultimap<String, String> regionToVariables) { 286 287 // It's vital that this is a sorted multimap (of values as well as keys) 288 // since the values are later indexed and turned into partition strings 289 // (so stability of ID order in values is necessary). 290 SortedSetMultimap<String, String> variableToPartitionIds = TreeMultimap.create(); 291 regionToVariables.asMap().forEach((region, variables) -> { 292 String partitionId = regionToPartitionId.get(region); 293 for (String variable : variables) { 294 variableToPartitionIds.put(variable, partitionId); 295 } 296 }); 297 return ImmutableSetMultimap.copyOf(variableToPartitionIds); 298 } 299 mapMacroRegionsToPartitionIds( ImmutableMap<String, String> regionToPartitionId, TerritoryContainment territories)300 private static ImmutableSetMultimap<String, String> mapMacroRegionsToPartitionIds( 301 ImmutableMap<String, String> regionToPartitionId, 302 TerritoryContainment territories) { 303 304 // A mapping from each unique partition ID to the regions it contains. 305 // This mapping forms a disjoint covering of all (non-macro) regions and 306 // is just the "inverse" of the initial "region to partition ID" map. 307 // 308 // For example, following the examples above where: 309 // X = {$americas, $enUS, ...} 310 // and: 311 // Y = {$americas, $!enUS, ...} 312 // 313 // We would get something like: 314 // X -> {"PR", "US", "VI", ...} 315 // Y -> {"CA", ...} 316 Map<String, Collection<String>> partitionToRegions = 317 regionToPartitionId.asMultimap().inverse().asMap(); 318 319 // Each macro region can then be decomposed to a mapping to the unique set 320 // of partitions it overlaps with based on its leaf regions and the regions 321 // of all known partitions. 322 SortedSetMultimap<String, String> macroToPartitions = TreeMultimap.create(); 323 for (String macro : territories.getMacroRegions()) { 324 ImmutableSet<String> leaves = territories.getLeafRegionsOf(macro); 325 partitionToRegions.forEach((partition, regions) -> { 326 if (!Collections.disjoint(leaves, regions)) { 327 macroToPartitions.put(macro, partition); 328 } 329 }); 330 } 331 return ImmutableSetMultimap.copyOf(macroToPartitions); 332 } 333 writePartitionLookupTable( byte[] partitionLookupArray, ImmutableMap<String, String> regionToPartitionId, ImmutableSetMultimap<String, String> macroRegionToPartitionIds)334 private static String[] writePartitionLookupTable( 335 byte[] partitionLookupArray, 336 ImmutableMap<String, String> regionToPartitionId, 337 ImmutableSetMultimap<String, String> macroRegionToPartitionIds) { 338 339 // A generator for indices of partition groups, based on partition IDs. 340 // 341 // For leaf regions this generates a one-to-one mapping with the single 342 // partition ID, but macro regions can overlap multiple partitions. 343 Indexer<Collection<String>, Byte> partitionGroupIndexer = 344 Indexer.create(i -> { 345 // The partition group index must fit in a byte. 346 // For Java code simplicity, we want it to also be non-negative. 347 // As of June 2020, there are 15 partition groups. 348 checkState(i <= 0x7f, "too many partition groups: %s", i); 349 return (byte) i.intValue(); 350 }); 351 352 // The default value in the partition lookup array (index 0) is mapped to by 353 // any unsupported region (since "LSR.indexForRegion(<invalid region>)" is 0). 354 // We must therefore reserve a special parition group index for these cases 355 // before adding the rest of the partitions. 356 partitionGroupIndexer.apply(ImmutableSet.of(".")); 357 358 // Populate the radix-based sparse index array, where each region is converted 359 // to the LSR region index (which must correspond to how regions are indexed in 360 // the client side code). 361 BiConsumer<String, Collection<String>> writePartitionIndex = 362 (region, ids) -> partitionLookupArray[LSR.indexForRegion(region)] = 363 partitionGroupIndexer.apply(ids); 364 365 // Write leaf regions first (mostly to match the original code behaviour) 366 // and then macro regions. 367 // 368 // Convert the Map<String, String> to a Map<String, Collection<String>> 369 // to match the macro regions (even though each collection is a singleton). 370 regionToPartitionId.asMultimap().asMap().forEach(writePartitionIndex); 371 macroRegionToPartitionIds.asMap().forEach(writePartitionIndex); 372 373 // Check invalid reigons will map to the special "missing partition" value. 374 checkState(partitionLookupArray[0] == 0); 375 376 // Return the unique partition groups (sets of partition IDs) as strings 377 // (as a sequence of single letter partition IDs). Leaf regions will always 378 // have a single partition ID, but macro regions can overlap with multiple 379 // partitions. 380 return partitionGroupIndexer.getValues().stream() 381 .map(ids -> String.join("", ids)).toArray(String[]::new); 382 } 383 } 384 385 /** 386 * Returns a builder to which variable mappings are added, from which partition 387 * information is derived. 388 */ builder(TerritoryContainment territories)389 public static Builder builder(TerritoryContainment territories) { 390 return new Builder(territories); 391 } 392 393 private final ImmutableSetMultimap<String, String> variableToPartitionIds; 394 private final byte[] partitionLookupArray; 395 private final String[] partitionStrings; 396 PartitionInfo( ImmutableSetMultimap<String, String> variableToPartitionIds, byte[] partitionLookupArray, String[] partitionStrings)397 private PartitionInfo( 398 ImmutableSetMultimap<String, String> variableToPartitionIds, 399 byte[] partitionLookupArray, 400 String[] partitionStrings) { 401 this.variableToPartitionIds = ImmutableSetMultimap.copyOf(variableToPartitionIds); 402 this.partitionLookupArray = partitionLookupArray; 403 this.partitionStrings = partitionStrings; 404 } 405 406 /** 407 * Returns the set of partition IDs for the given variable, or {@code {"*"}} if the 408 * speical '*' variable was given. The returned set must be non-empty because every 409 * variable includes at least one region, and all regions map to a partition ID. 410 */ getPartitionIds(String variable)411 public ImmutableSet<String> getPartitionIds(String variable) { 412 if (variable.equals("*")) { 413 return ImmutableSet.of("*"); 414 } 415 ImmutableSet<String> result = variableToPartitionIds.get(variable); 416 checkArgument(!result.isEmpty(), "variable not defined: %s", variable); 417 return result; 418 } 419 420 /** Returns the sparse lookup array from LSR region index to partition group index. */ getPartitionLookupArray()421 public byte[] getPartitionLookupArray() { 422 return partitionLookupArray; 423 } 424 425 /** 426 * Returns the partition group lookup array from partition group index to partition 427 * ID string. 428 */ getPartitionStrings()429 public String[] getPartitionStrings() { 430 return partitionStrings; 431 } 432 } 433