1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu.localedistance;
4 
5 import static com.google.common.base.Preconditions.checkArgument;
6 import static com.google.common.base.Preconditions.checkNotNull;
7 import static com.google.common.base.Preconditions.checkState;
8 import static java.util.Arrays.asList;
9 import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
10 import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
11 
12 import java.io.IOException;
13 import java.util.ArrayList;
14 import java.util.Arrays;
15 import java.util.Collection;
16 import java.util.LinkedHashSet;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.Optional;
20 import java.util.Set;
21 import java.util.logging.Logger;
22 import java.util.stream.Collectors;
23 import java.util.stream.IntStream;
24 import java.util.stream.Stream;
25 
26 import org.unicode.cldr.api.AttributeKey;
27 import org.unicode.cldr.api.CldrData;
28 import org.unicode.cldr.api.CldrDataSupplier;
29 import org.unicode.cldr.api.CldrPath;
30 import org.unicode.cldr.api.CldrValue;
31 import org.unicode.cldr.api.PathMatcher;
32 import org.unicode.icu.tool.cldrtoicu.DebugWriter;
33 import org.unicode.icu.tool.cldrtoicu.IcuData;
34 import org.unicode.icu.tool.cldrtoicu.RbPath;
35 import org.unicode.icu.tool.cldrtoicu.RbValue;
36 
37 import com.google.common.annotations.VisibleForTesting;
38 import com.google.common.base.Splitter;
39 import com.google.common.collect.ImmutableList;
40 import com.google.common.collect.ImmutableSet;
41 import com.google.common.collect.Iterables;
42 import com.google.common.primitives.Bytes;
43 import com.ibm.icu.impl.locale.LSR;
44 import com.ibm.icu.impl.locale.LikelySubtags;
45 import com.ibm.icu.impl.locale.LocaleDistance;
46 import com.ibm.icu.lang.UScript;
47 
48 import com.ibm.icu.util.ULocale;
49 
50 /**
51  * Mapper for generating locale distance tables from CLDR language data.
52  *
53  * <p>Note that this is an atypical mapper which does a lot more processing than other
 * ICU mapper classes and relies on several auxiliary classes (which is why it's in a
55  * different package). Conceptually it's still a "mapper" though, just not a simple one.
56  *
57  * <p>This mapper was converted from the LocaleDistanceBuilder code in the ICU4J project.
58  */
59 public final class LocaleDistanceMapper {
    // Class logger; used below to trace each parsed <languageMatch> rule at FINE level.
    private static final Logger logger = Logger.getLogger(LocaleDistanceMapper.class.getName());

    // All the language matching data comes from the "written_new" language data in
    // "common/supplemental/languageInfo.xml".
    private static final PathMatcher WRITTEN_LANGUAGE_PREFIX =
        PathMatcher.of("//supplementalData/languageMatching/languageMatches[@type=\"written_new\"]");

    // Definitions of region containment variables used when expressing match distances. E.g.:
    // <matchVariable id="$maghreb" value="MA+DZ+TN+LY+MR+EH"/>
    private static final PathMatcher VARIABLE_PATH =
        WRITTEN_LANGUAGE_PREFIX.withSuffix("matchVariable[@id=*]");
    private static final AttributeKey VARIABLE_ID = AttributeKey.keyOf("matchVariable", "id");
    private static final AttributeKey VARIABLE_VALUE = AttributeKey.keyOf("matchVariable", "value");

    // Language distance data, including wildcards and variable references (possibly negated). E.g.:
    // <languageMatch desired="ja_Latn"       supported="ja_Jpan"       distance="5" oneway="true"/>
    // <languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/>
    // <languageMatch desired="en_*_$!enUS"   supported="en_*_GB"       distance="3"/>
    private static final PathMatcher LANGUAGE_MATCH_PATH =
        WRITTEN_LANGUAGE_PREFIX.withSuffix("languageMatch[@desired=*][@supported=*]");
    private static final AttributeKey MATCH_DESIRED =
        AttributeKey.keyOf("languageMatch", "desired");
    private static final AttributeKey MATCH_SUPPORTED =
        AttributeKey.keyOf("languageMatch", "supported");
    private static final AttributeKey MATCH_DISTANCE =
        AttributeKey.keyOf("languageMatch", "distance");
    // Optional, assume false if not present.
    private static final AttributeKey MATCH_ONEWAY =
        AttributeKey.keyOf("languageMatch", "oneway");

    // Singleton element containing the list of special case "paradigm" locales, which should
    // always be preferred if there is a tie. E.g.:
    // <paradigmLocales locales="en en_GB es es_419 pt_BR pt_PT"/>
    //
    // Since there are no distinguishing attributes for this path, there can only be one
    // instance which we can just lookup directly.
    private static final CldrPath PARADIGM_LOCALES_PATH = CldrPath.parseDistinguishingPath(
        "//supplementalData/languageMatching/languageMatches[@type=\"written_new\"]/paradigmLocales");
    private static final AttributeKey PARADIGM_LOCALES =
        AttributeKey.keyOf("paradigmLocales", "locales");

    // NOTE: You must omit empty strings, since otherwise " foo " becomes ("", "foo", "").
    private static final Splitter LIST_SPLITTER =
            Splitter.on(' ').trimResults().omitEmptyStrings();

    // Output resource bundle paths, split into two basic groups for likely locale mappings
    // and match data.
    private static final RbPath LIKELY_LANGUAGES = RbPath.of("likely", "languageAliases");
    private static final RbPath LIKELY_M49 = RbPath.of("likely", "m49");
    private static final RbPath LIKELY_REGIONS = RbPath.of("likely", "regionAliases");
    private static final RbPath LIKELY_TRIE = RbPath.of("likely", "trie:bin");
    private static final RbPath LIKELY_LSRNUM = RbPath.of("likely", "lsrnum:intvector");

    private static final RbPath MATCH_TRIE = RbPath.of("match", "trie:bin");
    private static final RbPath MATCH_REGION_TO_PARTITIONS = RbPath.of("match", "regionToPartitions:bin");
    private static final RbPath MATCH_PARTITIONS = RbPath.of("match", "partitions");
    private static final RbPath MATCH_PARADIGMNUM = RbPath.of("match", "paradigmnum:intvector");
    private static final RbPath MATCH_DISTANCES = RbPath.of("match", "distances:intvector");

    // To split locale specifications (e.g. "ja_Latn" or "en_*_$!enUS").
    private static final Splitter UNDERSCORE = Splitter.on('_');

    // The encoding scheme only allows up to 27 M.49 codes to be encoded, so this list must
    // not grow beyond 27 entries. The size is checked in process() before the list is used.
    // A three character region is encoded as its index in this list (see encodeRegionToInt()).
    private static final List<String> M49 = Arrays.asList("001", "143", "419");
125 
126     /**
127      * Processes data from the given supplier to generate locale matcher ICU data.
128      *
129      * @param src the CLDR data supplier to process.
130      * @return the IcuData instance to be written to a file.
131      */
process(CldrDataSupplier src)132     public static IcuData process(CldrDataSupplier src) {
133         return process(src.getDataForType(SUPPLEMENTAL));
134     }
135 
136     @VisibleForTesting // It's easier to supply a fake data instance than a fake supplier.
process(CldrData data)137     static IcuData process(CldrData data) {
138         IcuData icuData = new IcuData("langInfo", false);
139 
140         if (M49.size() > 27) {
141             throw new IllegalStateException(
142                 "The M49 list is too long. We can only encode up to 27 M49 codes.");
143         }
144         LikelySubtags.Data likelyData = LikelySubtagsBuilder.build(data);
145         icuData.add(LIKELY_LANGUAGES, ofMapEntries(likelyData.languageAliases));
146         icuData.add(LIKELY_M49, RbValue.of(M49));
147         icuData.add(LIKELY_REGIONS, ofMapEntries(likelyData.regionAliases));
148         icuData.add(LIKELY_TRIE, ofBytes(likelyData.trie));
149         icuData.add(LIKELY_LSRNUM, ofLsrNum(asList(likelyData.lsrs)));
150 
151         LocaleDistance.Data distanceData = buildDistanceData(data);
152         icuData.add(MATCH_TRIE, ofBytes(distanceData.trie));
153         icuData.add(MATCH_REGION_TO_PARTITIONS, ofBytes(distanceData.regionToPartitionsIndex));
154         icuData.add(MATCH_PARTITIONS, RbValue.of(distanceData.partitionArrays));
155         icuData.add(MATCH_PARADIGMNUM, ofLsrNum(distanceData.paradigmLSRs));
156         icuData.add(MATCH_DISTANCES, RbValue.of(Arrays.stream(distanceData.distances).mapToObj(Integer::toString)));
157         return icuData;
158     }
159 
160     /**
161      * A simple holder for language, script and region which allows for wildcards (i.e. "*")
162      * and variables to represent partitions of regions (e.g. "$enUS"). Minimal additional
163      * validation is done on incoming fields as data is assumed to be correct.
164      */
165     private static final class LsrSpec {
166         /**
167          * Parse a raw specification string (e.g. "en", "ja_Latn", "*_*_*", "ar_*_$maghreb"
168          * or "en_*_GB") into a structured spec. Note that if the specification string
169          * contains a "bare" region (e.g. "en_*_GB") then it is registered as a variable in
170          * the given RegionMapper builder, so the returned {@code LsrSpec} will be
171          * {@code "en_*_$GB"}.
172          */
parse(String rawSpec, PartitionInfo.Builder rmb)173         public static LsrSpec parse(String rawSpec, PartitionInfo.Builder rmb) {
174             List<String> parts = UNDERSCORE.splitToList(rawSpec);
175             checkArgument(parts.size() <= 3, "invalid raw LSR specification: %s", rawSpec);
176             String language = parts.get(0);
177             Optional<String> script = parts.size() > 1 ? Optional.of(parts.get(1)) : Optional.empty();
178             // While parsing the region part, ensure any "bare" region subtags are converted
179             // to variables (e.g. "GB" -> "$GB") and registered with the parition map.
180             Optional<String> region =
181                     parts.size() > 2 ? Optional.of(rmb.ensureVariable(parts.get(2))) : Optional.empty();
182             return new LsrSpec(language, script, region);
183         }
184 
185         // A language subtag (e.g. "en") or "*".
186         private final String language;
187         // If present, a script subtag (e.g. "Latn") or "*".
188         private final Optional<String> script;
189         // If present, a registered variable with '$' prefix (e.g. "$foo" or "$GB") or "*".
190         private final Optional<String> regionVariable;
191 
LsrSpec(String language, Optional<String> script, Optional<String> regionVariable)192         private LsrSpec(String language, Optional<String> script, Optional<String> regionVariable) {
193             this.language = language;
194             this.script = script;
195             this.regionVariable = regionVariable;
196             // Implementation shortcuts assume:
197             // - If the language subtags are '*', the other-level subtags must also be '*' (if present).
198             // If there are rules that do not fit these constraints, we need to revise the implementation.
199             if (isAny(language)) {
200                 script.ifPresent(
201                         s -> checkArgument(isAny(s), "expected wildcard script, got: %s", script));
202                 regionVariable.ifPresent(
203                         r -> checkArgument(isAny(r), "expected wildcard region, got: %s", regionVariable));
204             }
205         }
206 
getLanguage()207         public String getLanguage() {
208             return language;
209         }
210 
getScript()211         public String getScript() {
212             return script.orElseThrow(() -> new IllegalArgumentException("no script available: " + this));
213         }
214 
getRegionVariable()215         public String getRegionVariable() {
216             return regionVariable.orElseThrow(() -> new IllegalArgumentException("no region available: " + this));
217         }
218 
size()219         public int size() {
220             return regionVariable.isPresent() ? 3 : script.isPresent() ? 2 : 1;
221         }
222 
223         @Override
toString()224         public String toString() {
225             return language + script.map(s -> "_" + s).orElse("") + regionVariable.map(r -> "_" + r).orElse("");
226         }
227     }
228 
229     /**
230      * Represents a {@code <languageMatch>} rule derived from supplemental data, such as:
231      * <pre>{@code
232      *   <languageMatch desired="zh_Hans" supported="zh_Hant" distance="15" oneway="true"/>
233      * }</pre>
234      * or:
235      * <pre>{@code
236      *   <languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/>
237      * }</pre>
238      *
239      * <p>The job of a {@code Rule} is to provide a mechanism for capturing the data in
240      * {@code <languageMatch>} elements and subsequently adding that information to a
241      * {@link DistanceTable.Builder} in a structured way.
242      */
243     private static final class LanguageMatchRule {
244         private final LsrSpec desired;
245         private final LsrSpec supported;
246         private final int distance;
247         private final boolean oneway;
248 
LanguageMatchRule(LsrSpec desired, LsrSpec supported, int distance, boolean oneway)249         public LanguageMatchRule(LsrSpec desired, LsrSpec supported, int distance, boolean oneway) {
250             this.desired = checkNotNull(desired);
251             this.supported = checkNotNull(supported);
252             this.distance = distance;
253             this.oneway = oneway;
254             // Implementation shortcuts assume:
255             // - At any level, either both or neither spec subtags are *.
256             // If there are rules that do not fit these constraints, we need to revise the implementation.
257             checkArgument(desired.size() == supported.size(),
258                     "mismatched rule specifications in: %s, %s", desired, supported);
259             checkArgument(isAny(desired.language) == isAny(supported.language),
260                     "wildcard mismatch for languages in: %s, %s", desired, supported);
261             checkArgument(isAny(desired.script) == isAny(supported.script),
262                     "wildcard mismatch for scripts in: %s, %s", desired, supported);
263             checkArgument(isAny(desired.regionVariable) == isAny(supported.regionVariable),
264                     "wildcard mismatch for languages in: %s, %s", desired, supported);
265         }
266 
size()267         int size() {
268             return desired.size();
269         }
270 
isDefaultRule()271         boolean isDefaultRule() {
272             // We already know that in LsrSpec, if the language is "*" then all subtags are too.
273             return isAny(desired.language);
274         }
275 
276         /**
277          * Adds this rule to the given distance table, using the given partition map to
278          * resolve any region variables present in the desired or supported specs.
279          */
addTo(DistanceTable.Builder distanceTable, PartitionInfo partitions)280         void addTo(DistanceTable.Builder distanceTable, PartitionInfo partitions) {
281             // Note that rather than using the rule's "size" to mediate the different
282             // cases, we could have had 3 distinct sub-types of a common rule API (e.g.
283             // "LanguageRule", "ScriptRule" and "RegionRule"), each with a different
284             // addTo() callback. However this would have been quite a lot more code
285             // for not much real gain.
286             switch (size()) {
287             case 1:  // Language only.
288                 distanceTable.addDistance(distance, oneway,
289                         desired.getLanguage(), supported.getLanguage());
290                 break;
291 
292             case 2:  // Language and script present.
293                 distanceTable.addDistance(distance, oneway,
294                         desired.getLanguage(), supported.getLanguage(),
295                         desired.getScript(), supported.getScript());
296                 break;
297 
298             case 3:  // Language, script and region variable present.
299                 // Add the rule distance for every combination of desired/supported
300                 // partition IDs for the region variables. This is important for
301                 // variables like "$americas" which overlap with multiple paritions.
302                 //
303                 // Note that in this case (because region variables map to sets of
304                 // partition IDs) we can get situations where "shouldReverse" is true,
305                 // but the desired/supported pairs being passed in are identical (e.g.
306                 // different region variables map to distinct partition groups which
307                 // share some common elements).
308                 //
309                 // This is fine, providing that the distance table is going to ignore
310                 // identical mappings (which it does). Alternatively we could just
311                 // re-calculate "shouldReverse" inside this loop to account for partition
312                 // IDs rather than region variables.
313                 ImmutableSet<String> desiredPartitionIds =
314                         partitions.getPartitionIds(desired.getRegionVariable());
315                 ImmutableSet<String> supportedPartitionIds =
316                         partitions.getPartitionIds(supported.getRegionVariable());
317                 for (String desiredPartitionId : desiredPartitionIds) {
318                     for (String supportedPartitionId : supportedPartitionIds) {
319                         distanceTable.addDistance(distance, oneway,
320                                 desired.getLanguage(), supported.getLanguage(),
321                                 desired.getScript(), supported.getScript(),
322                                 desiredPartitionId, supportedPartitionId);
323                     }
324                 }
325                 break;
326 
327             default:
328                 throw new IllegalStateException("invalid size for LsrSpec: " + this);
329             }
330         }
331 
332         @Override
toString()333         public String toString() {
334             return String.format(
335                     "Rule{ desired=%s, supported=%s, distance=%d, oneway=%b }",
336                     desired, supported, distance, oneway);
337         }
338     }
339 
buildDistanceData(CldrData supplementalData)340     private static LocaleDistance.Data buildDistanceData(CldrData supplementalData) {
341         // Resolve any explicitly declared region variables into the partition map.
342         // Territory containment information is used to recursively resolve region
343         // variables (e.g. "$enUS") into a collection of non-macro regions.
344         PartitionInfo.Builder partitionBuilder =
345                 PartitionInfo.builder(TerritoryContainment.getContainment(supplementalData));
346         supplementalData.accept(DTD, v -> {
347             CldrPath path = v.getPath();
348             if (VARIABLE_PATH.matches(path)) {
349                 partitionBuilder.addVariableExpression(v.get(VARIABLE_ID), v.get(VARIABLE_VALUE));
350             }
351         });
352 
353         // Parse the rules from <languageMatch> elements. Note that the <languageMatch>
354         // element is marked as "ORDERED" in the DTD, which means the elements always
355         // appear in the same order is in the CLDR XML file (even when using DTD order).
356         //
357         // This is one of the relatively rare situations in which using DTD order will
358         // not isolate the ICU data from reordering of the CLDR data. In particular this
359         // matters when specifying language matcher preferences (such as "en_*_GB" vs
360         // "en_*_!enUS").
361         //
362         // We could almost process the rules while reading them from the source data, but
363         // rules may contain region codes rather than variables, and we need to create a
364         // variable for each such region code before the RegionMapper is built, and
365         // before processing the rules (this happens when the LsrSpec is parsed).
366         List<LanguageMatchRule> rules = new ArrayList<>();
367         supplementalData.accept(DTD, v -> {
368             CldrPath path = v.getPath();
369             if (LANGUAGE_MATCH_PATH.matches(path)) {
370                 int distance = Integer.parseInt(v.get(MATCH_DISTANCE));
371                 // Lenient against there being no "oneway" attribute.
372                 boolean oneway = "true".equalsIgnoreCase(v.get(MATCH_ONEWAY));
373                 LsrSpec desired = LsrSpec.parse(v.get(MATCH_DESIRED), partitionBuilder);
374                 LsrSpec supported = LsrSpec.parse(v.get(MATCH_SUPPORTED), partitionBuilder);
375                 LanguageMatchRule rule = new LanguageMatchRule(desired, supported, distance, oneway);
376                 logger.fine(() -> String.format("rule: %s", rule));
377                 rules.add(rule);
378             }
379         });
380         // Check that the rules are in the expected order. Rule order is important in ensuring
381         // data correctness and incorrect order may violate business logic assumptions later.
382         // TODO: Consider what other ordering/sanity checks make sense here.
383         for (int n = 0, prevSize = 1; n < rules.size(); n++) {
384             LanguageMatchRule rule = rules.get(n);
385             checkArgument(rule.size() >= prevSize, "<languageMatch> elements out of order at: %s", rule);
386             checkArgument(rule.size() == prevSize || (n > 0 && rules.get(n - 1).isDefaultRule()),
387                "missing default rule before: %s", rule);
388             prevSize = rule.size();
389         }
390         checkState(rules.stream().distinct().count() == rules.size(), "duplicated rule in: %s", rules);
391 
392         // Build region partition data after all the variables have been accounted for
393         // (including the implicit variables found while processing LsrSpecs).
394         PartitionInfo partitions = partitionBuilder.build();
395 
396         // Add all the rules (in order) to the distance table.
397         DistanceTable.Builder distanceTableBuilder = DistanceTable.builder();
398         rules.forEach(r -> r.addTo(distanceTableBuilder, partitions));
399         DistanceTable distanceTable = distanceTableBuilder.build();
400 
401         // Note: Using LocaleDistance.Data as a fairly "dumb" container for the return values
402         // requires us to do slightly awkward things, like passing mutable arrays and LSR
403         // instances around, but the advantage it has is that this data structure is also what's
404         // used in client code, so if the likely subtags data changes, it will be a forcing
405         // function to change this code.
406         return new LocaleDistance.Data(
407                 distanceTable.getTrie().toByteArray(),
408                 partitions.getPartitionLookupArray(),
409                 partitions.getPartitionStrings(),
410                 getParadigmLsrs(supplementalData),
411                 distanceTable.getDefaultDistances());
412     }
413 
getParadigmLsrs(CldrData supplementalData)414     private static Set<LSR> getParadigmLsrs(CldrData supplementalData) {
415         // LinkedHashSet for stable order; otherwise a unit test is flaky.
416         CldrValue cldrValue = supplementalData.get(PARADIGM_LOCALES_PATH);
417         checkState(cldrValue != null,
418                 "<paradigmLocales> element was missing: %s", PARADIGM_LOCALES_PATH);
419         String localesList = cldrValue.get(PARADIGM_LOCALES);
420         checkState(localesList != null,
421                 "<paradigmLocales> 'locales' attribute was missing: %s", cldrValue);
422 
423         Set<LSR> paradigmLSRs = new LinkedHashSet<>();
424         for (String paradigm : LIST_SPLITTER.split(localesList)) {
425             LSR max = LikelySubtags.INSTANCE.makeMaximizedLsrFrom(new ULocale(paradigm), false);
426             // Clear the LSR flags to make the data equality test in LocaleDistanceTest happy.
427             paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS));
428         }
429         checkArgument(paradigmLSRs.size() % 2 == 0, "unpaired paradigm locales: %s", paradigmLSRs);
430         return paradigmLSRs;
431     }
432 
433     // Returns an RbValue serialized from a map as a sequence of alternating (key, value)
434     // pairs (formatted as one pair per line in the IcuData file).
435     //
436     // E.g.
437     // foo{
438     //     key1, value1,
439     //     ...
440     //     keyN, valueN,
441     // }
ofMapEntries(Map<String, String> map)442     private static RbValue ofMapEntries(Map<String, String> map) {
443         return RbValue.of(
444                 map.entrySet().stream()
445                         .flatMap(e -> Stream.of(e.getKey(), e.getValue()))
446                         .collect(Collectors.toList()))
447                 .elementsPerLine(2);
448     }
449 
450     // Returns an RbValue serialized from a sequence of LSR instance as a sequence of number
451     // represent (language, region, script) tuples (formatted as one number per line in the IcuData file).
ofLsrNum(Collection<LSR> lsrs)452     private static RbValue ofLsrNum(Collection<LSR> lsrs) {
453         return RbValue.of(
454                 lsrs.stream()
455                         .flatMapToInt(lsr -> IntStream.of(LSRToNum(lsr)))
456                         .mapToObj(Integer::toString));
457     }
458 
459     // This method is added only to support encodeToIntForResource()
460     // It only support [a-z]{2,3} and will not work for other cases.
461     // TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
encodeLanguageToInt(String language)462     static private int encodeLanguageToInt(String language) {
463         assert language.length() >= 2;
464         assert language.length() <= 3;
465         assert language.charAt(0) >= 'a';
466         assert language.charAt(0) <= 'z';
467         assert language.charAt(1) >= 'a';
468         assert language.charAt(1) <= 'z';
469         assert language.length() == 2 || language.charAt(2) >= 'a';
470         assert language.length() == 2 || language.charAt(2) <= 'z';
471         return language.charAt(0) - 'a' + 1 +
472                27 * (language.charAt(1) - 'a' + 1) +
473                ((language.length() == 2) ? 0 : 27 * 27 * (language.charAt(2) - 'a' + 1));
474     }
475     // This method is added only to support encodeToIntForResource()
476     // It only support [A-Z][a-z]{3} which defined in UScript and does not work for other cases.
477     // TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
encodeScriptToInt(String script)478     static private int encodeScriptToInt(String script) {
479         int ret = UScript.getCodeFromName(script);
480         assert ret != UScript.INVALID_CODE;
481         return ret;
482     }
483     // This method is added only to support encodeToIntForResource()
484     // It only support [A-Z]{2}|001|143|419 and does not work for other cases.
485     // TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
encodeRegionToInt(String region, List<String> m49)486     static private int encodeRegionToInt(String region, List<String> m49) {
487         assert region.length() >= 2;
488         assert region.length() <= 3;
489         // Do not have enough bits to store the all 1000 possible combination of \d{3}
490         // Only support what is in M49.
491         if (region.length() == 3) {
492             int index = m49.indexOf(region);
493             assert index >= 0;
494             if (index < 0) {
495                 throw new IllegalStateException(
496                     "Please add '" + region + "' to M49 in LocaleDistanceMapper.java");
497             }
498             return index;
499         }
500         assert region.charAt(0) >= 'A';
501         assert region.charAt(0) <= 'Z';
502         assert region.charAt(1) >= 'A';
503         assert region.charAt(1) <= 'Z';
504         // 'AA' => 1+27*1  = 28
505         // ...
506         // 'AZ' => 1+27*26 = 703
507         // 'BA' => 2+27*1  = 29
508         // ...
509         // 'IN' => 9+27*14 = 387
510         // 'ZZ' => 26+27*26 = 728
511         return (region.charAt(0) - 'A' + 1) + 27 * (region.charAt(1) - 'A' + 1);
512     }
513     // This is designed to only support encoding some LSR into resources but not for other cases.
514     // TODO(ftang) Remove after LSR.encodeToIntForResource is available to the tool.
encodeToIntForResource(LSR lsr)515     static int encodeToIntForResource(LSR lsr) {
516         return (encodeLanguageToInt(lsr.language) + (27*27*27) * encodeRegionToInt(lsr.region, M49)) |
517             (encodeScriptToInt(lsr.script) << 24);
518     }
519 
LSRToNum(LSR lsr)520     private static int LSRToNum(LSR lsr) {
521         // Special number for "", "", "" return 0
522         if (lsr.language.isEmpty() && lsr.script.isEmpty() && lsr.region.isEmpty()) {
523             return 0;
524         }
525         // Special number for "skip", "script", "" return 1
526         if (lsr.language.equals("skip") && lsr.script.equals("script") && lsr.region.isEmpty()) {
527             return 1;
528         }
529         // TODO(ftang) Change to the following line after LSR.encodeToIntForResource is available to the tool.
530         // return lsr.encodeToIntForResource();
531         return encodeToIntForResource(lsr);
532     }
533 
534     // Returns an RbValue serialized from a byte array, as a concatenated sequence of rows of
535     // hex values. This is intended only for RbPaths using the ":bin" suffix.
536     //
537     // E.g.
538     // foo{
539     // 0123456789abcdef0123456789abcdef
540     //     ...
541     // 1c0de4c0ffee
542     // }
543     //
544     // Note that typically no indentation is used when writting this binary "blob".
ofBytes(byte[] data)545     private static RbValue ofBytes(byte[] data) {
546         ImmutableList.Builder<String> hexValues = ImmutableList.builder();
547         List<Byte> bytes = Bytes.asList(data);
548         for (List<Byte> line : Iterables.partition(bytes, 16)) {
549             hexValues.add(line.stream().map(b -> String.format("%02x", b)).collect(Collectors.joining()));
550         }
551         return RbValue.of(hexValues.build());
552     }
553 
554     // Returns if the subtag is the '*' wildcard. This is not to be confused with the
555     // "ANY" character used in DistanceTable.
isAny(String subtag)556     private static boolean isAny(String subtag) {
557         return subtag.equals("*");
558     }
559 
560     // Returns if the subtag exists and is the '*' wildcard.
isAny(Optional<String> subtag)561     private static boolean isAny(Optional<String> subtag) {
562         return subtag.map(LocaleDistanceMapper::isAny).orElse(false);
563     }
564 
    // Main method for running this mapper directly with logging enabled.
    // CLDR_DIR is picked up from system properties or environment variables.
    // Arguments: <output-file> [<log-level>]
    //
    // All the work is delegated to DebugWriter.writeForDebugging(), which is given the
    // command line arguments and a reference to this class's process() method.
    public static void main(String[] args) throws IOException {
        DebugWriter.writeForDebugging(args, LocaleDistanceMapper::process);
    }
571 }
572