1 package org.unicode.cldr.util; 2 3 import com.ibm.icu.lang.UCharacter; 4 import com.ibm.icu.text.Normalizer; 5 import com.ibm.icu.text.UTF16; 6 import com.ibm.icu.text.UnicodeSet; 7 import com.ibm.icu.text.UnicodeSetIterator; 8 import com.ibm.icu.util.ULocale; 9 import java.util.Collection; 10 import java.util.Comparator; 11 import java.util.HashSet; 12 import java.util.Set; 13 import java.util.TreeSet; 14 import org.unicode.cldr.util.XEquivalenceClass.SetMaker; 15 16 public class VariantFolder { 17 private AlternateFetcher alternateFetcher; 18 public static SetMaker mySetMaker = 19 new SetMaker() { 20 Comparator c = new UTF16.StringComparator(true, false, 0); 21 Comparator bestIsLowest = 22 new Comparator() { 23 @Override 24 public int compare(Object o1, Object o2) { 25 String s1 = o1.toString(); 26 String s2 = o2.toString(); 27 final boolean casefold1 = UCharacter.foldCase(s1, true).equals(s1); 28 final boolean casefold2 = UCharacter.foldCase(s2, true).equals(s2); 29 if (casefold1 != casefold2) { 30 return casefold1 ? -1 : 1; 31 } 32 final boolean canonical1 = 33 Normalizer.isNormalized(s1, Normalizer.COMPOSE, 0); 34 final boolean canonical2 = 35 Normalizer.isNormalized(s2, Normalizer.COMPOSE, 0); 36 if (canonical1 != canonical2) { 37 return canonical1 ? -1 : 1; 38 } 39 int len1 = s1.codePointCount(0, s1.length()); 40 int len2 = s2.codePointCount(0, s2.length()); 41 if (len1 != len2) { 42 return len1 - len2; 43 } 44 return c.compare(s1, s2); 45 } 46 }; 47 48 @Override 49 public Set make() { 50 return new TreeSet(bestIsLowest); 51 } 52 }; 53 public static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]"); 54 55 // private String source; 56 57 // private Set<String> result; 58 59 public interface AlternateFetcher { 60 /** 61 * The input string MUST be in the output set. Note that the results must be valid even if 62 * input string might not be on even code point boundaries. For example, if the input is 63 * "XabY" where X and Y are have surrogates, and the alternates are by case, then the 64 * results have to be {"XabY", "XAbY", "XaBY", "XABY"}. 65 * 66 * <p>The caller must never modify the set. 67 * 68 * @param item 69 * @return 70 */ getAlternates(String item, Set<String> output)71 Set<String> getAlternates(String item, Set<String> output); 72 } 73 74 public static class CompatibilityFolder implements VariantFolder.AlternateFetcher { 75 private static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]"); 76 static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker); 77 78 static { 79 for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next(); ) { 80 String item = it.getString(); equivalents.add(item, Normalizer.decompose(item, true))81 equivalents.add(item, Normalizer.decompose(item, true)); equivalents.add(item, Normalizer.compose(item, true))82 equivalents.add(item, Normalizer.compose(item, true)); 83 } 84 } 85 86 @Override getAlternates(String item, Set<String> output)87 public Set<String> getAlternates(String item, Set<String> output) { 88 output.add(item); 89 return equivalents.getEquivalences(item); 90 } 91 } 92 93 public static class CanonicalFolder implements VariantFolder.AlternateFetcher { 94 static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker); 95 96 static { 97 for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next(); ) { 98 String item = it.getString(); equivalents.add(item, Normalizer.decompose(item, false))99 equivalents.add(item, Normalizer.decompose(item, false)); equivalents.add(item, Normalizer.compose(item, false))100 equivalents.add(item, Normalizer.compose(item, false)); 101 } 102 } 103 104 @Override getAlternates(String item, Set<String> output)105 public Set<String> getAlternates(String item, Set<String> output) { 106 output.add(item); 107 return equivalents.getEquivalences(item); 108 } 109 } 110 111 public static class CaseVariantFolder implements VariantFolder.AlternateFetcher { 112 private static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]"); 113 static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker); 114 115 static { 116 for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next(); ) { 117 String item = it.getString(); equivalents.add(item, UCharacter.toLowerCase(item))118 equivalents.add(item, UCharacter.toLowerCase(item)); equivalents.add(item, UCharacter.toUpperCase(item))119 equivalents.add(item, UCharacter.toUpperCase(item)); equivalents.add(item, UCharacter.foldCase(item, true))120 equivalents.add(item, UCharacter.foldCase(item, true)); equivalents.add(item, UCharacter.toTitleCase(ULocale.ROOT, item, null))121 equivalents.add(item, UCharacter.toTitleCase(ULocale.ROOT, item, null)); 122 } 123 } 124 125 @Override getAlternates(String item, Set<String> output)126 public Set<String> getAlternates(String item, Set<String> output) { 127 output.add(item); 128 return equivalents.getEquivalences(item); 129 } 130 } 131 132 /** 133 * The class is designed to be immutable, at least as far as Java allows. That is, if the 134 * alternateFetcher is, then it will be. 135 * 136 * @param alternateFetcher 137 */ VariantFolder(AlternateFetcher alternateFetcher)138 public VariantFolder(AlternateFetcher alternateFetcher) { 139 this.alternateFetcher = alternateFetcher; 140 } 141 142 // We keep track of the alternates for each combination of start,len 143 // so with a length of 3 we have the following structure 144 // {{0,1}, {1,2}, {2,3} -- length of 1 145 // {0,2}, {1,3} 146 // {0,3}} 147 148 @SuppressWarnings("unchecked") getClosure(String source)149 public Set<String> getClosure(String source) { 150 int stringLength = source.length(); 151 if (stringLength == 0) { 152 Set<String> result = new HashSet<>(); 153 result.add(source); 154 return result; 155 } 156 Set<String>[][] combos = new Set[stringLength][]; 157 for (int i = 0; i < stringLength; ++i) { 158 combos[i] = new Set[stringLength - i]; 159 } 160 for (int i = 0; i < stringLength; ++i) { 161 combos[0][i] = 162 alternateFetcher.getAlternates( 163 source.substring(i, i + 1), new HashSet<String>()); 164 } 165 for (int level = 1; level < stringLength; ++level) { 166 // at each level, we add strings of that length (plus 1) 167 for (int start = 0; start < stringLength - level; ++start) { 168 int limit = start + level + 1; 169 // System.out.println(start + ", " + limit); 170 // we first add any longer alternates 171 Collection<String> current = combos[level][start] = new HashSet<>(); 172 current.addAll( 173 alternateFetcher.getAlternates( 174 source.substring(start, limit), new HashSet<String>())); 175 // then we add the cross product of shorter strings 176 for (int breakPoint = start + 1; breakPoint < limit; ++breakPoint) { 177 addCrossProduct( 178 combos[breakPoint - start - 1][start], 179 combos[limit - breakPoint - 1][breakPoint], 180 current); 181 } 182 } 183 } 184 return combos[combos.length - 1][0]; 185 } 186 addCrossProduct( Collection<String> source1, Collection<String> source2, Collection<String> output)187 private void addCrossProduct( 188 Collection<String> source1, Collection<String> source2, Collection<String> output) { 189 for (String x : source1) { 190 for (String y : source2) { 191 output.add(x + y); 192 } 193 } 194 } 195 getClosure(UnicodeSet input)196 public UnicodeSet getClosure(UnicodeSet input) { 197 UnicodeSet result = new UnicodeSet(); 198 for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next(); ) { 199 Set<String> temp = getClosure(it.getString()); 200 for (String s : temp) { 201 result.add(s); 202 } 203 } 204 return result; 205 } 206 reduce(String s)207 public String reduce(String s) { 208 Set<String> temp = getClosure(s); 209 return temp.iterator().next(); 210 } 211 reduce(UnicodeSet input)212 public UnicodeSet reduce(UnicodeSet input) { 213 UnicodeSet result = new UnicodeSet(); 214 for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next(); ) { 215 final String reduce = reduce(it.getString()); 216 result.add(reduce); 217 } 218 return result; 219 } 220 } 221