xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/VariantFolder.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.util;
2 
3 import com.ibm.icu.lang.UCharacter;
4 import com.ibm.icu.text.Normalizer;
5 import com.ibm.icu.text.UTF16;
6 import com.ibm.icu.text.UnicodeSet;
7 import com.ibm.icu.text.UnicodeSetIterator;
8 import com.ibm.icu.util.ULocale;
9 import java.util.Collection;
10 import java.util.Comparator;
11 import java.util.HashSet;
12 import java.util.Set;
13 import java.util.TreeSet;
14 import org.unicode.cldr.util.XEquivalenceClass.SetMaker;
15 
16 public class VariantFolder {
17     private AlternateFetcher alternateFetcher;
18     public static SetMaker mySetMaker =
19             new SetMaker() {
20                 Comparator c = new UTF16.StringComparator(true, false, 0);
21                 Comparator bestIsLowest =
22                         new Comparator() {
23                             @Override
24                             public int compare(Object o1, Object o2) {
25                                 String s1 = o1.toString();
26                                 String s2 = o2.toString();
27                                 final boolean casefold1 = UCharacter.foldCase(s1, true).equals(s1);
28                                 final boolean casefold2 = UCharacter.foldCase(s2, true).equals(s2);
29                                 if (casefold1 != casefold2) {
30                                     return casefold1 ? -1 : 1;
31                                 }
32                                 final boolean canonical1 =
33                                         Normalizer.isNormalized(s1, Normalizer.COMPOSE, 0);
34                                 final boolean canonical2 =
35                                         Normalizer.isNormalized(s2, Normalizer.COMPOSE, 0);
36                                 if (canonical1 != canonical2) {
37                                     return canonical1 ? -1 : 1;
38                                 }
39                                 int len1 = s1.codePointCount(0, s1.length());
40                                 int len2 = s2.codePointCount(0, s2.length());
41                                 if (len1 != len2) {
42                                     return len1 - len2;
43                                 }
44                                 return c.compare(s1, s2);
45                             }
46                         };
47 
48                 @Override
49                 public Set make() {
50                     return new TreeSet(bestIsLowest);
51                 }
52             };
53     public static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]");
54 
55     // private String source;
56 
57     // private Set<String> result;
58 
59     public interface AlternateFetcher {
60         /**
61          * The input string MUST be in the output set. Note that the results must be valid even if
62          * input string might not be on even code point boundaries. For example, if the input is
63          * "XabY" where X and Y are have surrogates, and the alternates are by case, then the
64          * results have to be {"XabY", "XAbY", "XaBY", "XABY"}.
65          *
66          * <p>The caller must never modify the set.
67          *
68          * @param item
69          * @return
70          */
getAlternates(String item, Set<String> output)71         Set<String> getAlternates(String item, Set<String> output);
72     }
73 
74     public static class CompatibilityFolder implements VariantFolder.AlternateFetcher {
75         private static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]");
76         static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker);
77 
78         static {
79             for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next(); ) {
80                 String item = it.getString();
equivalents.add(item, Normalizer.decompose(item, true))81                 equivalents.add(item, Normalizer.decompose(item, true));
equivalents.add(item, Normalizer.compose(item, true))82                 equivalents.add(item, Normalizer.compose(item, true));
83             }
84         }
85 
86         @Override
getAlternates(String item, Set<String> output)87         public Set<String> getAlternates(String item, Set<String> output) {
88             output.add(item);
89             return equivalents.getEquivalences(item);
90         }
91     }
92 
93     public static class CanonicalFolder implements VariantFolder.AlternateFetcher {
94         static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker);
95 
96         static {
97             for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next(); ) {
98                 String item = it.getString();
equivalents.add(item, Normalizer.decompose(item, false))99                 equivalents.add(item, Normalizer.decompose(item, false));
equivalents.add(item, Normalizer.compose(item, false))100                 equivalents.add(item, Normalizer.compose(item, false));
101             }
102         }
103 
104         @Override
getAlternates(String item, Set<String> output)105         public Set<String> getAlternates(String item, Set<String> output) {
106             output.add(item);
107             return equivalents.getEquivalences(item);
108         }
109     }
110 
111     public static class CaseVariantFolder implements VariantFolder.AlternateFetcher {
112         private static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]");
113         static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker);
114 
115         static {
116             for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next(); ) {
117                 String item = it.getString();
equivalents.add(item, UCharacter.toLowerCase(item))118                 equivalents.add(item, UCharacter.toLowerCase(item));
equivalents.add(item, UCharacter.toUpperCase(item))119                 equivalents.add(item, UCharacter.toUpperCase(item));
equivalents.add(item, UCharacter.foldCase(item, true))120                 equivalents.add(item, UCharacter.foldCase(item, true));
equivalents.add(item, UCharacter.toTitleCase(ULocale.ROOT, item, null))121                 equivalents.add(item, UCharacter.toTitleCase(ULocale.ROOT, item, null));
122             }
123         }
124 
125         @Override
getAlternates(String item, Set<String> output)126         public Set<String> getAlternates(String item, Set<String> output) {
127             output.add(item);
128             return equivalents.getEquivalences(item);
129         }
130     }
131 
132     /**
133      * The class is designed to be immutable, at least as far as Java allows. That is, if the
134      * alternateFetcher is, then it will be.
135      *
136      * @param alternateFetcher
137      */
VariantFolder(AlternateFetcher alternateFetcher)138     public VariantFolder(AlternateFetcher alternateFetcher) {
139         this.alternateFetcher = alternateFetcher;
140     }
141 
142     // We keep track of the alternates for each combination of start,len
143     // so with a length of 3 we have the following structure
144     // {{0,1}, {1,2}, {2,3} -- length of 1
145     // {0,2}, {1,3}
146     // {0,3}}
147 
148     @SuppressWarnings("unchecked")
getClosure(String source)149     public Set<String> getClosure(String source) {
150         int stringLength = source.length();
151         if (stringLength == 0) {
152             Set<String> result = new HashSet<>();
153             result.add(source);
154             return result;
155         }
156         Set<String>[][] combos = new Set[stringLength][];
157         for (int i = 0; i < stringLength; ++i) {
158             combos[i] = new Set[stringLength - i];
159         }
160         for (int i = 0; i < stringLength; ++i) {
161             combos[0][i] =
162                     alternateFetcher.getAlternates(
163                             source.substring(i, i + 1), new HashSet<String>());
164         }
165         for (int level = 1; level < stringLength; ++level) {
166             // at each level, we add strings of that length (plus 1)
167             for (int start = 0; start < stringLength - level; ++start) {
168                 int limit = start + level + 1;
169                 // System.out.println(start + ", " + limit);
170                 // we first add any longer alternates
171                 Collection<String> current = combos[level][start] = new HashSet<>();
172                 current.addAll(
173                         alternateFetcher.getAlternates(
174                                 source.substring(start, limit), new HashSet<String>()));
175                 // then we add the cross product of shorter strings
176                 for (int breakPoint = start + 1; breakPoint < limit; ++breakPoint) {
177                     addCrossProduct(
178                             combos[breakPoint - start - 1][start],
179                             combos[limit - breakPoint - 1][breakPoint],
180                             current);
181                 }
182             }
183         }
184         return combos[combos.length - 1][0];
185     }
186 
addCrossProduct( Collection<String> source1, Collection<String> source2, Collection<String> output)187     private void addCrossProduct(
188             Collection<String> source1, Collection<String> source2, Collection<String> output) {
189         for (String x : source1) {
190             for (String y : source2) {
191                 output.add(x + y);
192             }
193         }
194     }
195 
getClosure(UnicodeSet input)196     public UnicodeSet getClosure(UnicodeSet input) {
197         UnicodeSet result = new UnicodeSet();
198         for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next(); ) {
199             Set<String> temp = getClosure(it.getString());
200             for (String s : temp) {
201                 result.add(s);
202             }
203         }
204         return result;
205     }
206 
reduce(String s)207     public String reduce(String s) {
208         Set<String> temp = getClosure(s);
209         return temp.iterator().next();
210     }
211 
reduce(UnicodeSet input)212     public UnicodeSet reduce(UnicodeSet input) {
213         UnicodeSet result = new UnicodeSet();
214         for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next(); ) {
215             final String reduce = reduce(it.getString());
216             result.add(reduce);
217         }
218         return result;
219     }
220 }
221