xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/api/CldrPaths.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.api;
2 
3 import static com.google.common.base.Preconditions.checkArgument;
4 import static com.google.common.base.Preconditions.checkState;
5 import static java.lang.Integer.signum;
6 import static org.unicode.cldr.api.CldrDataType.LDML;
7 
8 import com.google.common.collect.ImmutableMap;
9 import com.google.common.collect.ImmutableSetMultimap;
10 import java.util.ArrayList;
11 import java.util.Comparator;
12 import java.util.List;
13 import java.util.Map;
14 import java.util.Map.Entry;
15 import java.util.function.BiConsumer;
16 import java.util.function.Consumer;
17 import java.util.function.Predicate;
18 import java.util.stream.Stream;
19 import org.unicode.cldr.util.DtdData;
20 import org.unicode.cldr.util.XPathParts;
21 
22 /**
23  * Utilities related to CLDR paths. It's possible that one day we might wish to expose the path
24  * comparator from this class, but if so we should key it off something other than {@code DtdType}.
25  */
26 final class CldrPaths {
27     // Constants to make comparator logic a bit more readable (i.e. LHS < RHS ==> return -1).
28     private static final int LHS_FIRST = -1;
29     private static final int RHS_FIRST = 1;
30 
31     // A synthetic attribute used by CLDR code to apply an explicit ordering to otherwise identical
32     // paths (this only happens for "ORDERED" elements). We handle this differently and have the
33     // sort index as an explicit field in CldrPath rather than treating it as an attribute.
34     private static final String HIDDEN_SORT_INDEX_ATTRIBUTE = "_q";
35 
36     // When calculating things about elements we need to ignore deprecated ones, especially when
37     // looking for "leaf" elements, since CLDR data used to allow mixed content and the DTD still
38     // has elements with both data and (deprecated) child elements present.
39     private static final Predicate<DtdData.Element> IS_NOT_DEPRECATED = e -> !e.isDeprecated();
40 
41     // A map of the leaf element names for each supported DTD.
42     private static final ImmutableSetMultimap<CldrDataType, String> LEAF_ELEMENTS_MAP;
43     // A map of the ordered element names for each supported DTD.
44     private static final ImmutableSetMultimap<CldrDataType, String> ORDERED_ELEMENTS_MAP;
45 
46     static {
47         ImmutableSetMultimap.Builder<CldrDataType, String> leafElementsMap =
48                 ImmutableSetMultimap.builder();
49         ImmutableSetMultimap.Builder<CldrDataType, String> orderedElementsMap =
50                 ImmutableSetMultimap.builder();
51         for (CldrDataType type : CldrDataType.values()) {
52             // While at happened to be true (at the time of writing) that the getElements() method
53             // returns a new, mutable set, this is completely undocumented so we cannot rely on it
54             // (otherwise we could just do "removeIf(...)").
55             //
56             // Note that while the "specials" element has no children in the DTD, it is not
57             // considered a "leaf" element as it is expected to contain any additional elements
58             // from unknown namespaces (e.g. "icu:").
59             //
60             // We know that Guava's collection classes have a policy of never iterating over a
61             // collection more than once, so it's safe to use the ::iterator trick to convert a
62             // stream into a one-shot iterable (saves have to make temporary collections).
leafElementsMap.putAll( type, type.getElements() .filter(IS_NOT_DEPRECATED) .filter( e -> e.getChildren().keySet().stream() .noneMatch(IS_NOT_DEPRECATED)) .map(DtdData.Element::getName) .filter(e -> !e.equals("special")) ::iterator)63             leafElementsMap.putAll(
64                     type,
65                     type.getElements()
66                                     .filter(IS_NOT_DEPRECATED)
67                                     // NOTE: Some leaf elements still have deprecated children (from
68                                     // when mixed
69                                     // data was permitted).
70                                     .filter(
71                                             e ->
72                                                     e.getChildren().keySet().stream()
73                                                             .noneMatch(IS_NOT_DEPRECATED))
74                                     .map(DtdData.Element::getName)
75                                     .filter(e -> !e.equals("special"))
76                             ::iterator);
orderedElementsMap.putAll( type, type.getElements() .filter(IS_NOT_DEPRECATED) .filter(DtdData.Element::isOrdered) .map(DtdData.Element::getName) ::iterator)77             orderedElementsMap.putAll(
78                     type,
79                     type.getElements()
80                                     .filter(IS_NOT_DEPRECATED)
81                                     .filter(DtdData.Element::isOrdered)
82                                     .map(DtdData.Element::getName)
83                             ::iterator);
84         }
85         // Special case "alias" is an alternate leaf element for a lot of LDML elements.
leafElementsMap.put(LDML, "alias")86         leafElementsMap.put(LDML, "alias");
87         LEAF_ELEMENTS_MAP = leafElementsMap.build();
88         ORDERED_ELEMENTS_MAP = orderedElementsMap.build();
89     }
90 
91     // A map of path comparators for each supported DTD.
92     private static final ImmutableMap<CldrDataType, DtdPathComparator> PATH_COMPARATOR_MAP;
93 
94     // This static block must come after the attribute map is created since it uses it.
95     static {
96         ImmutableMap.Builder<CldrDataType, DtdPathComparator> pathMap = ImmutableMap.builder();
97         for (CldrDataType type : CldrDataType.values()) {
pathMap.put(type, new DtdPathComparator(type))98             pathMap.put(type, new DtdPathComparator(type));
99         }
100         PATH_COMPARATOR_MAP = pathMap.build();
101     }
102 
103     /**
104      * Returns a comparator for {@link CldrPath}s which is compatible with the canonical path
105      * ordering for the given DTD instance (e.g. {@link DtdData#getDtdComparator}()).
106      */
107     // TODO: Add tests to ensure it continues to agree to DTD ordering.
getPathComparator(CldrDataType type)108     static Comparator<CldrPath> getPathComparator(CldrDataType type) {
109         return PATH_COMPARATOR_MAP.get(type);
110     }
111 
112     private static final class DtdPathComparator implements Comparator<CldrPath> {
113         private final Comparator<String> elementNameComparator;
114         private final Comparator<String> attributeNameComparator;
115 
DtdPathComparator(CldrDataType dataType)116         private DtdPathComparator(CldrDataType dataType) {
117             this.elementNameComparator = dataType.getElementComparator();
118             this.attributeNameComparator = dataType.getAttributeComparator();
119         }
120 
121         // This code should only return "signum" values for ordering (i.e. {-1, 0, 1}).
122         @Override
compare(CldrPath lhs, CldrPath rhs)123         public int compare(CldrPath lhs, CldrPath rhs) {
124             int length = lhs.getLength();
125             if (length == rhs.getLength()) {
126                 if (length == 1) {
127                     // Root nodes are special as they define the DTD type and must always be equal.
128                     // Paths with different types can be compared, but not via this comparator.
129                     checkState(
130                             lhs.getName().equals(rhs.getName()),
131                             "cannot compare paths with different DTD type: %s / %s",
132                             lhs,
133                             rhs);
134                     return 0;
135                 }
136                 // Compare parent paths first and return if they give an ordering.
137                 int signum = compare(lhs.getParent(), rhs.getParent());
138                 return (signum != 0) ? signum : compareCurrentElement(lhs, rhs);
139             } else if (length < rhs.getLength()) {
140                 // Recursively shorten the RHS path until it's the same length.
141                 int signum = compare(lhs, rhs.getParent());
142                 // If the LHS is equal to the RHS parent, then the (shorter) LHS path comes first.
143                 // Note this this can only happen if we are comparing non-leaf node paths.
144                 return signum != 0 ? signum : LHS_FIRST;
145             } else {
146                 // Flip the comparison if LHS was longer (we do this at most once per comparison).
147                 return -compare(rhs, lhs);
148             }
149         }
150 
compareCurrentElement(CldrPath lhs, CldrPath rhs)151         private int compareCurrentElement(CldrPath lhs, CldrPath rhs) {
152             String elementName = lhs.getName();
153             int signum = signum(elementNameComparator.compare(elementName, rhs.getName()));
154             if (signum != 0) {
155                 return signum;
156             }
157             // Primary order within a path element is defined by the element index (this is a
158             // bit of a special value used only for "ORDERED" elements in the DTD. This always
159             // trumps any attribute ordering but is almost always -1.
160             signum = Integer.compare(lhs.getSortIndex(), rhs.getSortIndex());
161             if (signum != 0) {
162                 return signum;
163             }
164             // Element name is the same, so test attributes. Attributes are already known to be
165             // ordered by the element's DTD order, so we only need to find and compare the first
166             // difference.
167             int minAttributeCount = Math.min(lhs.getAttributeCount(), rhs.getAttributeCount());
168             for (int n = 0; n < minAttributeCount && signum == 0; n++) {
169                 String attributeName = lhs.getLocalAttributeName(n);
170                 // Important: We negate the comparison result here because we want elements with
171                 // "missing" attributes to sort earlier.
172                 //
173                 // E.g. for two elements LHS="foo[a=x][b=y]" and RHS="foo[b=y]" we want to say
174                 // that "RHS < LHS" because RHS is "missing" the attribute "[a=x]". But when we
175                 // compare the first attributes we find "a" (LHS) < "b" (RHS), which is the
176                 // opposite of what we want. This is because while LHS has a lower ordered
177                 // attribute, that indicates that RHS is missing that attribute in the same
178                 // position, which should make RHS sort first.
179                 signum =
180                         -signum(
181                                 attributeNameComparator.compare(
182                                         attributeName, rhs.getLocalAttributeName(n)));
183                 if (signum == 0) {
184                     // Attribute names equal, so test attribute value.
185                     signum =
186                             signum(
187                                     DtdData.getAttributeValueComparator(elementName, attributeName)
188                                             .compare(
189                                                     lhs.getLocalAttributeValue(n),
190                                                     rhs.getLocalAttributeValue(n)));
191                 }
192             }
193             if (signum == 0) {
194                 // Attributes match up to the minimum length, but one element might have more.
195                 // Elements with more attributes sort _after_ those without.
196                 if (lhs.getAttributeCount() > minAttributeCount) {
197                     signum = RHS_FIRST;
198                 } else if (rhs.getAttributeCount() > minAttributeCount) {
199                     signum = LHS_FIRST;
200                 }
201             }
202             return signum;
203         }
204     }
205 
206     /** Returns whether this path is a full path ending in a "leaf" element. */
isLeafPath(CldrPath path)207     static boolean isLeafPath(CldrPath path) {
208         String lastElementName = path.getName();
209         return lastElementName.indexOf(':') != -1
210                 || LEAF_ELEMENTS_MAP.get(path.getDataType()).contains(lastElementName);
211     }
212 
213     /**
214      * Returns whether an element is "ORDERED" in the specified DTD (and should have an explicit
215      * sort index).
216      */
isOrdered(CldrDataType dtdType, String elementName)217     static boolean isOrdered(CldrDataType dtdType, String elementName) {
218         // Elements with namespaces unknown to the DTD are never ordered
219         // (but it knows about icu: elements).
220         if (elementName.indexOf(':') != -1 && !elementName.startsWith("icu:")) {
221             return false;
222         }
223         // Returns empty set if DTD unknown, but it might also be the case that a valid DTD has no
224         // ordered elements, so we can't reasonable check for anything here.
225         return ORDERED_ELEMENTS_MAP.get(dtdType).contains(elementName);
226     }
227 
228     // This can't go further up due to static initialization ordering issues.
229     // TODO: Move all reading of DTDs and setup for paths into a lazy holder.
230     private static final CldrPath LDML_VERSION =
231             CldrPath.parseDistinguishingPath("//ldml/identity/version");
232 
233     /**
234      * Returns whether this path should be emitted by a data supplier. New cases can be added as
235      * needed.
236      */
shouldEmit(CldrPath path)237     static boolean shouldEmit(CldrPath path) {
238         // CLDRFile seems to do some interesting things with the version based on the DTD in
239         // which we see:
240         //
241         // <!ATTLIST version number CDATA #REQUIRED >
242         //    <!--@MATCH:regex/\$Revision.*\$-->
243         //    <!--@METADATA-->
244         //
245         // <!ATTLIST version cldrVersion CDATA #FIXED "36" >
246         //    <!--@MATCH:any-->
247         //    <!--@VALUE-->
248         //
249         // This results in conflict between values obtained via CLDRFile and those obtained
250         // directly by parsing an LDML XML file (which happens for "special" XML files in ICU).
251         //
252         // So for now I've decided to just ignore the version (since it's hardly used in the
253         // ICU converter code and always available via CldrDataSupplier#getCldrVersionString().
254 
255         // Hacky way to detect the version declaration in LDML files (which must always be
256         // skipped since they are duplicate paths and reveal XML file boundaries). The path is
257         // always "//ldmlXxxx/version" for some DTD type.
258         if (path.getLength() == 2 && path.getName().equals("version")) {
259             return false;
260         }
261 
262         // Note that there is a need for some kind of versioning for some bits of data (since
263         // changes to things like collation can invalidate database search indexes) but this should
264         // be handled directly in the logic which builds the ICU data and isn't strictly the same
265         // as the LDML version anyway.
266         //
267         // TODO: Remove this code if and when LDML version strings are removed.
268         return !path.equals(LDML_VERSION);
269     }
270 
271     /**
272      * Processes a full path to extract a distinguishing CldrPath and handle any value attributes
273      * present. This is designed for iterating over successive paths from CLDRFile, but can be used
274      * to create paths in isolation if necessary.
275      *
276      * @param fullPath a parsed full path.
277      * @param previousElements an optional list of previous CldrPath elements which will be used as
278      *     the prefix to the new path wherever possible (e.g. if previousElements="(a,b,c,d)" and
279      *     "fullPath=a/b/x/y/z", then the new path will share the path prefix up to and including
280      *     'b'). When processing sorted paths, this will greatly reduce allocations.
281      * @param valueAttributeCollector a collector into which value attributes will be added (in DTD
282      *     order).
283      * @return a new CldrPath corresponding to the distinguishing path of {@code fullPath}.
284      * @throws IllegalArgumentException if the path string is invalid.
285      */
processXPath( String fullPath, List<CldrPath> previousElements, BiConsumer<AttributeKey, String> valueAttributeCollector)286     static CldrPath processXPath(
287             String fullPath,
288             List<CldrPath> previousElements,
289             BiConsumer<AttributeKey, String> valueAttributeCollector) {
290         checkArgument(!fullPath.isEmpty(), "path must not be empty");
291         // This fails if attribute names are invalid, but not if element names are invalid, and we
292         // want to control/stabalize the error messages in this API.
293         XPathParts pathParts;
294         try {
295             pathParts = XPathParts.getFrozenInstance(fullPath);
296         } catch (IllegalArgumentException e) {
297             throw new IllegalArgumentException("invalid path: " + fullPath);
298         }
299         int length = pathParts.size();
300         checkArgument(length > 0, "cldr path must not be empty: %s", pathParts);
301         DtdData dtd = pathParts.getDtdData();
302         checkArgument(dtd != null, "unknown DTD type: %s", pathParts);
303         CldrDataType dtdType = CldrDataType.forRawType(dtd.dtdType);
304 
305         // The path we're returning, constructed from top-to-bottom.
306         CldrPath path = null;
307         // Reusable key/value attributes list.
308         List<String> keyValuePairs = new ArrayList<>();
309         Consumer<Entry<String, String>> collectElementAttribute =
310                 e -> {
311                     keyValuePairs.add(e.getKey());
312                     keyValuePairs.add(e.getValue());
313                 };
314         // True once we've started to create a new path rather than reusing previous elements.
315         boolean diverged = false;
316         for (int n = 0; n < length; n++) {
317             String elementName = pathParts.getElement(n);
318 
319             // If this path is from CldrPath.toString() then the sort index is encoded in the
320             // element name suffix (e.g. foo#42[@bar="x"]), otherwise it's in the synthetic "_q"
321             // attribute. Most often there is no sort index however, so this code should optimize
322             // to the null case.
323             int sortIndex = -1;
324             Map<String, String> attributes = pathParts.getAttributes(n);
325             int nameEnd = elementName.indexOf('#');
326             if (nameEnd != -1) {
327                 sortIndex = Integer.parseUnsignedInt(elementName.substring(nameEnd + 1));
328                 elementName = elementName.substring(0, nameEnd);
329             } else {
330                 String sortIndexStr = attributes.get(HIDDEN_SORT_INDEX_ATTRIBUTE);
331                 if (sortIndexStr != null) {
332                     sortIndex = Integer.parseUnsignedInt(sortIndexStr);
333                 }
334             }
335 
336             // Note that element names need not be known to the DTD. If an element's name is
337             // prefixed with an unknown namespace (e.g. "icu:") then it is always permitted.
338             // Similarly, we never filter out attributes in unknown namespaces. For now we assume
339             // that any explicit namespace is unknown.
340             boolean hasNamespace = elementName.indexOf(':') != -1;
341             checkArgument(
342                     hasNamespace || dtd.getElementFromName().containsKey(elementName),
343                     "invalid path: %s",
344                     fullPath);
345 
346             // The keyValuePairs list is used by the collectElementAttribute callback but we don't
347             // want/need to make a new callback each time, so just clear the underlying list.
348             keyValuePairs.clear();
349             processPathAttributes(
350                     elementName,
351                     attributes,
352                     dtdType,
353                     collectElementAttribute,
354                     valueAttributeCollector);
355 
356             // WARNING: We cannot just get the draft attribute value from the attributes map (as
357             // would be expected) because it throws an exception. This is because you can only
358             // "get()" values from the attributes map if they are potentially valid attributes for
359             // that element. Unfortunately this is due to a deliberate choice in the implementation
360             // of MapComparator, which explicitly throws an exception if an unknown attribute is
361             // encountered. Thus we have to check that "draft" is a possible attribute before
362             // asking for its value.
363             //
364             // TODO(dbeaumont): Fix this properly, ideally by fixing the comparator to not throw.
365             CldrDraftStatus draftStatus =
366                     dtd.getAttribute(elementName, "draft") != null
367                             ? CldrDraftStatus.forString(attributes.get("draft"))
368                             : null;
369 
370             if (!diverged && n < previousElements.size()) {
371                 CldrPath p = previousElements.get(n);
372                 if (p.matchesContent(elementName, sortIndex, keyValuePairs, draftStatus)) {
373                     // The previous path we processed was the same at least down to this depth, so
374                     // we can just reuse that element instead of creating a new one.
375                     //
376                     // In tests over the resolved paths for "en_GB" in DTD order, there are ~128k
377                     // path elements processed, with ~103k being reused here and ~25k being newly
378                     // allocated (a > 80% reuse rate). However this works best when the incoming
379                     // paths are sorted, since otherwise the "previous" path is random.
380                     //
381                     // For unsorted paths, the reuse rate is reduced to approximately 25%. This is
382                     // still saving ~32k allocations for the "en_GB" example, and it still saved
383                     // about 35% in terms of measured time to process the paths.
384                     path = p;
385                     continue;
386                 }
387             }
388             // This path has diverged from the previous path, so we must start making new elements.
389             path = new CldrPath(path, elementName, keyValuePairs, dtdType, draftStatus, sortIndex);
390             diverged = true;
391         }
392         return path;
393     }
394 
395     // Returns the element's sort index (or -1 if not present).
processPathAttributes( String elementName, Map<String, String> attributeMap, CldrDataType dtdType, Consumer<Entry<String, String>> collectElementAttribute, BiConsumer<AttributeKey, String> collectValueAttribute)396     static void processPathAttributes(
397             String elementName,
398             /* @Nullable */ Map<String, String> attributeMap,
399             CldrDataType dtdType,
400             Consumer<Entry<String, String>> collectElementAttribute,
401             BiConsumer<AttributeKey, String> collectValueAttribute) {
402 
403         // Why not just a Map? Maps don't efficiently handle "get the Nth element" which is
404         // something that's used a lot in the ICU conversion code. Distinguishing attributes in
405         // paths are used more as a "list of pairs" than a map with key/value lookup. Even
406         // extensions like "NavigableMap" don't give you what you want.
407         //
408         // This does mean that lookup-by-name is a linear search (unless you want to pay the cost
409         // of also having a map here) but the average number of attributes is very small (<3).
410         if (attributeMap != null && !attributeMap.isEmpty()) {
411             processAttributes(
412                             attributeMap.entrySet().stream(),
413                             elementName,
414                             collectValueAttribute,
415                             dtdType)
416                     .forEach(collectElementAttribute);
417         }
418     }
419 
processAttributes( Stream<Entry<String, String>> in, String elementName, BiConsumer<AttributeKey, String> collectValueAttribute, CldrDataType dtdType)420     static Stream<Entry<String, String>> processAttributes(
421             Stream<Entry<String, String>> in,
422             String elementName,
423             BiConsumer<AttributeKey, String> collectValueAttribute,
424             CldrDataType dtdType) {
425         Consumer<Entry<String, String>> collectValueAttributes =
426                 e -> {
427                     if (dtdType.isValueAttribute(elementName, e.getKey())) {
428                         collectValueAttribute.accept(
429                                 AttributeKey.keyOf(elementName, e.getKey()), e.getValue());
430                     }
431                 };
432         return in
433                 // Special case of a synthetic distinguishing attribute that's _not_ in the DTD, and
434                 // should be ignored.
435                 .filter(e -> !e.getKey().equals(HIDDEN_SORT_INDEX_ATTRIBUTE))
436                 .peek(collectValueAttributes)
437                 .filter(e -> dtdType.isDistinguishingAttribute(elementName, e.getKey()));
438     }
439 
CldrPaths()440     private CldrPaths() {}
441 }
442