// VariantFolder.java (revision 912701f9769bb47905792267661f0baf2b85bed5) - OpenGrok cross reference for /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/VariantFolder.java

package org.unicode.cldr.util;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;
import org.unicode.cldr.util.XEquivalenceClass.SetMaker;

public class VariantFolder {
    /** Supplies the alternates for each substring; assigned exactly once, in the constructor. */
    private final AlternateFetcher alternateFetcher;
    /**
     * Factory for the sets used by the equivalence classes built in this file. Every set it makes
     * is a {@link TreeSet} ordered so that the preferred string comes first: strings unchanged by
     * {@code UCharacter.foldCase} precede those that change, then strings already normalized under
     * {@code Normalizer.COMPOSE} precede those that are not, then strings with fewer code points
     * precede longer ones, and remaining ties are broken by the {@code UTF16.StringComparator}.
     */
    public static SetMaker mySetMaker =
            new SetMaker() {
                // Final tie-breaker between strings that rank equally on every other criterion.
                Comparator c = new UTF16.StringComparator(true, false, 0);
                Comparator bestIsLowest =
                        new Comparator() {
                            @Override
                            public int compare(Object o1, Object o2) {
                                final String left = o1.toString();
                                final String right = o2.toString();

                                // Criterion 1: a string that is its own case folding wins.
                                final boolean leftFolded =
                                        left.equals(UCharacter.foldCase(left, true));
                                final boolean rightFolded =
                                        right.equals(UCharacter.foldCase(right, true));
                                // Arguments are swapped so that "true" sorts first (-1).
                                final int byFolding = Boolean.compare(rightFolded, leftFolded);
                                if (byFolding != 0) {
                                    return byFolding;
                                }

                                // Criterion 2: a string already in composed normal form wins.
                                final boolean leftComposed =
                                        Normalizer.isNormalized(left, Normalizer.COMPOSE, 0);
                                final boolean rightComposed =
                                        Normalizer.isNormalized(right, Normalizer.COMPOSE, 0);
                                final int byNormalization =
                                        Boolean.compare(rightComposed, leftComposed);
                                if (byNormalization != 0) {
                                    return byNormalization;
                                }

                                // Criterion 3: fewer code points wins. Both counts are
                                // non-negative, so the subtraction cannot overflow.
                                final int leftCount = left.codePointCount(0, left.length());
                                final int rightCount = right.codePointCount(0, right.length());
                                if (leftCount == rightCount) {
                                    return c.compare(left, right);
                                }
                                return leftCount - rightCount;
                            }
                        };

                /** Returns a new, empty sorted set using the ordering described above. */
                @Override
                public Set make() {
                    return new TreeSet(bestIsLowest);
                }
            };
    /**
     * The complement of the {@code [:c:]} UnicodeSet. {@link CanonicalFolder} iterates over this
     * set when it builds its equivalence table.
     */
    public static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]");

    // private String source;

    // private Set<String> result;

    /** Strategy that supplies the set of alternate spellings for a given (sub)string. */
    public interface AlternateFetcher {
        /**
         * Returns the alternates of {@code item}.
         *
         * <p>The input string MUST be in the output set. Note that the results must be valid even
         * if the input string is not on code point boundaries. For example, if the input is
         * "XabY" where X and Y are lone surrogate halves, and the alternates are by case, then the
         * results have to be {"XabY", "XAbY", "XaBY", "XABY"}.
         *
         * <p>The caller must never modify the returned set.
         *
         * @param item the string whose alternates are requested
         * @param output a caller-supplied set; the implementations in this file add {@code item}
         *     to it. NOTE(review): {@code getClosure(String)} passes a throwaway set here and
         *     uses only the return value.
         * @return the set of alternates, which includes {@code item}
         */
        Set<String> getAlternates(String item, Set<String> output);
    }

    /**
     * {@link AlternateFetcher} backed by a table that links each character of its {@code
     * NORMAL_CHARS} set to the results of {@code Normalizer.decompose} and {@code
     * Normalizer.compose}, both called with {@code true} as the second argument.
     */
    public static class CompatibilityFolder implements VariantFolder.AlternateFetcher {
        // NOTE(review): same pattern as VariantFolder.NORMAL_CHARS; this private copy shadows it.
        private static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]");
        static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker);

        // Populate the table once, when the class is initialized.
        static {
            final UnicodeSetIterator chars = new UnicodeSetIterator(NORMAL_CHARS);
            while (chars.next()) {
                final String ch = chars.getString();
                final String decomposed = Normalizer.decompose(ch, true);
                equivalents.add(ch, decomposed);
                final String composed = Normalizer.compose(ch, true);
                equivalents.add(ch, composed);
            }
        }

        /**
         * Adds {@code item} to {@code output} and returns the equivalence set recorded for {@code
         * item} in the table.
         */
        @Override
        public Set<String> getAlternates(String item, Set<String> output) {
            output.add(item);
            final Set<String> alternates = equivalents.getEquivalences(item);
            return alternates;
        }
    }

    /**
     * {@link AlternateFetcher} backed by a table that links each character of {@code
     * NORMAL_CHARS} to the results of {@code Normalizer.decompose} and {@code Normalizer.compose},
     * both called with {@code false} as the second argument.
     */
    public static class CanonicalFolder implements VariantFolder.AlternateFetcher {
        static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker);

        // Populate the table once, when the class is initialized.
        static {
            final UnicodeSetIterator chars = new UnicodeSetIterator(NORMAL_CHARS);
            while (chars.next()) {
                final String ch = chars.getString();
                final String decomposed = Normalizer.decompose(ch, false);
                equivalents.add(ch, decomposed);
                final String composed = Normalizer.compose(ch, false);
                equivalents.add(ch, composed);
            }
        }

        /**
         * Adds {@code item} to {@code output} and returns the equivalence set recorded for {@code
         * item} in the table.
         */
        @Override
        public Set<String> getAlternates(String item, Set<String> output) {
            output.add(item);
            final Set<String> alternates = equivalents.getEquivalences(item);
            return alternates;
        }
    }

    /**
     * {@link AlternateFetcher} backed by a table that links each character of its {@code
     * NORMAL_CHARS} set to its lowercase, uppercase, case-folded and (root locale) titlecase
     * forms.
     */
    public static class CaseVariantFolder implements VariantFolder.AlternateFetcher {
        // NOTE(review): same pattern as VariantFolder.NORMAL_CHARS; this private copy shadows it.
        private static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]");
        static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker);

        // Populate the table once, when the class is initialized.
        static {
            final UnicodeSetIterator chars = new UnicodeSetIterator(NORMAL_CHARS);
            while (chars.next()) {
                final String ch = chars.getString();
                final String lower = UCharacter.toLowerCase(ch);
                equivalents.add(ch, lower);
                final String upper = UCharacter.toUpperCase(ch);
                equivalents.add(ch, upper);
                final String folded = UCharacter.foldCase(ch, true);
                equivalents.add(ch, folded);
                final String title = UCharacter.toTitleCase(ULocale.ROOT, ch, null);
                equivalents.add(ch, title);
            }
        }

        /**
         * Adds {@code item} to {@code output} and returns the equivalence set recorded for {@code
         * item} in the table.
         */
        @Override
        public Set<String> getAlternates(String item, Set<String> output) {
            output.add(item);
            final Set<String> alternates = equivalents.getEquivalences(item);
            return alternates;
        }
    }

    /**
     * Creates a folder that computes closures with the given fetcher.
     *
     * <p>The class is designed to be immutable, at least as far as Java allows. That is, if the
     * alternateFetcher is, then it will be.
     *
     * @param alternateFetcher supplies the alternates for every substring examined by {@link
     *     #getClosure(String)}
     */
    public VariantFolder(AlternateFetcher alternateFetcher) {
        this.alternateFetcher = alternateFetcher;
    }

    /**
     * Computes every string obtainable from {@code source} by replacing any of its substrings with
     * alternates supplied by the fetcher.
     *
     * <p>The work is organized as a triangular table indexed by (length - 1, start). For a source
     * of length 3 the cells cover these [start, limit) ranges:
     *
     * <pre>
     * row 0: [0,1) [1,2) [2,3)   -- length 1
     * row 1: [0,2) [1,3)         -- length 2
     * row 2: [0,3)               -- length 3
     * </pre>
     *
     * Each cell above row 0 holds the fetcher's alternates for that whole substring plus the
     * concatenations of the cells for every way of splitting the substring in two. Lengths and
     * offsets are in UTF-16 code units.
     *
     * @param source the string to expand
     * @return the closure of {@code source}. For an empty source this is a new set containing
     *     only the empty string; for a source of length 1 it is the set returned by the fetcher
     *     itself, which callers should treat as read-only.
     */
    @SuppressWarnings("unchecked")
    public Set<String> getClosure(String source) {
        final int n = source.length();
        if (n == 0) {
            Set<String> onlyEmpty = new HashSet<>();
            onlyEmpty.add(source);
            return onlyEmpty;
        }

        // Allocate the triangular table: row r has one cell per substring of length r + 1.
        final Set<String>[][] table = new Set[n][];
        for (int row = 0; row < n; ++row) {
            table[row] = new Set[n - row];
        }

        // Row 0: alternates of each single code unit, straight from the fetcher.
        for (int pos = 0; pos < n; ++pos) {
            final String unit = source.substring(pos, pos + 1);
            table[0][pos] = alternateFetcher.getAlternates(unit, new HashSet<String>());
        }

        // Higher rows are built from shorter substrings, so process them in increasing length.
        for (int row = 1; row < n; ++row) {
            final int span = row + 1;
            for (int from = 0; from + span <= n; ++from) {
                final int to = from + span;
                final Set<String> cell = new HashSet<>();
                table[row][from] = cell;

                // Alternates for the substring taken as a whole.
                final String piece = source.substring(from, to);
                cell.addAll(alternateFetcher.getAlternates(piece, new HashSet<String>()));

                // Alternates formed by splitting at every interior position.
                for (int split = from + 1; split < to; ++split) {
                    final Set<String> heads = table[split - from - 1][from];
                    final Set<String> tails = table[to - split - 1][split];
                    addCrossProduct(heads, tails, cell);
                }
            }
        }
        return table[n - 1][0];
    }

    /**
     * Adds to {@code output} the concatenation of every element of {@code source1} with every
     * element of {@code source2}, in that order.
     *
     * @param source1 the left-hand strings
     * @param source2 the right-hand strings
     * @param output the collection that receives each concatenation
     */
    private void addCrossProduct(
            Collection<String> source1, Collection<String> source2, Collection<String> output) {
        for (String head : source1) {
            for (String tail : source2) {
                final String joined = head + tail;
                output.add(joined);
            }
        }
    }

    /**
     * Computes the union of the closures of every string produced by iterating {@code input}.
     *
     * @param input the set whose entries are expanded
     * @return a new set containing every member of every entry's closure
     */
    public UnicodeSet getClosure(UnicodeSet input) {
        final UnicodeSet closure = new UnicodeSet();
        final UnicodeSetIterator entries = new UnicodeSetIterator(input);
        while (entries.next()) {
            final Set<String> variants = getClosure(entries.getString());
            for (String variant : variants) {
                closure.add(variant);
            }
        }
        return closure;
    }

    /**
     * Returns the preferred representative of the closure of {@code s}: the first element of the
     * closure once it has been placed in a set produced by {@code mySetMaker}.
     *
     * @param s the string to reduce
     * @return the first member of the closure under the {@code mySetMaker} ordering
     */
    public String reduce(String s) {
        // For sources longer than one code unit getClosure returns a HashSet, whose iteration
        // order is unspecified; taking its first element would yield an arbitrary variant.
        // Re-sort through the shared set maker so the preferred variant is picked.
        @SuppressWarnings("unchecked") // mySetMaker is declared with a raw type
        final Set<String> ordered = mySetMaker.make();
        ordered.addAll(getClosure(s));
        return ordered.iterator().next();
    }

    /**
     * Reduces every string produced by iterating {@code input} and collects the results.
     *
     * @param input the set whose entries are reduced
     * @return a new set containing the result of {@link #reduce(String)} for each entry
     */
    public UnicodeSet reduce(UnicodeSet input) {
        final UnicodeSet reduced = new UnicodeSet();
        final UnicodeSetIterator entries = new UnicodeSetIterator(input);
        while (entries.next()) {
            final String representative = reduce(entries.getString());
            reduced.add(representative);
        }
        return reduced;
    }
}