xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/test/CasingInfo.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.test;
2 
3 import com.ibm.icu.text.MessageFormat;
4 import com.ibm.icu.text.UnicodeSet;
5 import java.io.File;
6 import java.io.IOException;
7 import java.io.PrintWriter;
8 import java.util.ArrayList;
9 import java.util.EnumMap;
10 import java.util.HashMap;
11 import java.util.LinkedHashSet;
12 import java.util.List;
13 import java.util.Map;
14 import java.util.Set;
15 import java.util.regex.Matcher;
16 import java.util.regex.Pattern;
17 import org.unicode.cldr.test.CheckConsistentCasing.CasingType;
18 import org.unicode.cldr.test.CheckConsistentCasing.CasingTypeAndErrFlag;
19 import org.unicode.cldr.test.CheckConsistentCasing.Category;
20 import org.unicode.cldr.tool.Option.Options;
21 import org.unicode.cldr.util.CLDRFile;
22 import org.unicode.cldr.util.CLDRFile.WinningChoice;
23 import org.unicode.cldr.util.CLDRPaths;
24 import org.unicode.cldr.util.CldrUtility;
25 import org.unicode.cldr.util.Factory;
26 import org.unicode.cldr.util.LocaleIDParser;
27 import org.unicode.cldr.util.PatternCache;
28 import org.unicode.cldr.util.SimpleXMLSource;
29 import org.unicode.cldr.util.SupplementalDataInfo;
30 import org.unicode.cldr.util.XMLFileReader;
31 import org.unicode.cldr.util.XMLSource;
32 import org.unicode.cldr.util.XPathParts;
33 
34 /**
35  * Calculates, reads, writes and returns casing information about locales for CheckConsistentCasing.
36  * Run main() to generate the casing information files which will be stored in common/casing.
37  *
38  * @author jchye
39  */
40 public class CasingInfo {
41     private static final Options options =
42             new Options("This program is used to generate casing files for locales.")
43                     .add(
44                             "locales",
45                             ".*",
46                             ".*",
47                             "A regex of the locales to generate casing information for")
48                     .add(
49                             "summary",
50                             null,
51                             "generates a summary of the casing for all locales that had casing generated for this run");
52     private Map<String, Map<Category, CasingTypeAndErrFlag>> casing;
53     private List<File> casingDirs;
54 
CasingInfo(Factory factory)55     public CasingInfo(Factory factory) {
56         casingDirs = new ArrayList<>();
57         for (File f : factory.getSourceDirectories()) {
58             this.casingDirs.add(new File(f.getAbsolutePath() + "/../casing"));
59         }
60         casing = CldrUtility.newConcurrentHashMap();
61     }
62 
63     /** ONLY usable in command line tests. */
CasingInfo()64     public CasingInfo() {
65         casingDirs = new ArrayList<>();
66         this.casingDirs.add(new File(CLDRPaths.CASING_DIRECTORY));
67         casing = CldrUtility.newConcurrentHashMap();
68     }
69 
70     /**
71      * Returns casing information to be used for a specified locale.
72      *
73      * @param localeID
74      * @return
75      */
getLocaleCasing(String localeID)76     public Map<Category, CasingTypeAndErrFlag> getLocaleCasing(String localeID) {
77         // Check if the localeID contains casing first.
78         // If there isn't a casing file available for the locale,
79         // recurse over the locale's parents until something is found.
80         if (!casing.containsKey(localeID)) {
81             // Synchronize writes to casing map in an attempt to avoid NPEs (cldrbug 5051).
82             synchronized (casing) {
83                 CasingHandler handler = loadFromXml(localeID);
84                 if (handler != null) {
85                     handler.addParsedResult(casing);
86                 }
87                 if (!casing.containsKey(localeID)) {
88                     String parentID = LocaleIDParser.getSimpleParent(localeID);
89                     if (!parentID.equals("root")) {
90                         casing.put(localeID, getLocaleCasing(parentID));
91                     }
92                 }
93             }
94         }
95 
96         return casing.get(localeID);
97     }
98 
99     /**
100      * Loads casing information about a specified locale from the casing XML, if it exists.
101      *
102      * @param localeID
103      */
loadFromXml(String localeID)104     private CasingHandler loadFromXml(String localeID) {
105         for (File casingDir : casingDirs) {
106             File casingFile = new File(casingDir, localeID + ".xml");
107             if (casingFile.isFile()) {
108                 CasingHandler handler = new CasingHandler();
109                 XMLFileReader xfr = new XMLFileReader().setHandler(handler);
110                 xfr.read(casingFile.toString(), -1, true);
111                 return handler;
112             }
113         } // Fail silently if file not found.
114         return null;
115     }
116 
117     /** Calculates casing information about all languages from the locale data. */
generateCasingInformation(String localePattern)118     private Map<String, Boolean> generateCasingInformation(String localePattern) {
119         SupplementalDataInfo supplementalDataInfo = SupplementalDataInfo.getInstance();
120         Set<String> defaultContentLocales = supplementalDataInfo.getDefaultContentLocales();
121         String sourceDirectory = CldrUtility.checkValidDirectory(CLDRPaths.MAIN_DIRECTORY);
122         Factory cldrFactory = Factory.make(sourceDirectory, localePattern);
123         Set<String> locales = new LinkedHashSet<>(cldrFactory.getAvailable());
124         locales.removeAll(defaultContentLocales); // Skip all default content locales
125         UnicodeSet allCaps = new UnicodeSet("[:Lu:]");
126         Map<String, Boolean> localeUsesCasing = new HashMap<>();
127         LocaleIDParser parser = new LocaleIDParser();
128 
129         for (String localeID : locales) {
130             if (CLDRFile.isSupplementalName(localeID)) continue;
131 
132             // We want country/script differences but not region differences
133             // (unless it's pt_PT, which we do want).
134             // Keep regional locales only if there isn't already a locale for its script,
135             // e.g. keep zh_Hans_HK because zh_Hans is a default locale.
136             parser.set(localeID);
137             if (parser.getRegion().length() > 0 && !localeID.equals("pt_PT")) {
138                 System.out.println("Skipping regional locale " + localeID);
139                 continue;
140             }
141 
142             // Save casing information about the locale.
143             CLDRFile file = cldrFactory.make(localeID, true);
144             UnicodeSet examplars = file.getExemplarSet("", WinningChoice.NORMAL);
145             localeUsesCasing.put(localeID, examplars.containsSome(allCaps));
146             createCasingXml(localeID, CheckConsistentCasing.getSamples(file));
147         }
148         return localeUsesCasing;
149     }
150 
151     /**
152      * Creates a CSV summary of casing information over all locales for verification.
153      *
154      * @param outputFile
155      */
createCasingSummary(String outputFile, Map<String, Boolean> localeUsesCasing)156     private void createCasingSummary(String outputFile, Map<String, Boolean> localeUsesCasing) {
157         PrintWriter out;
158         try {
159             out = new PrintWriter(outputFile);
160         } catch (IOException e) {
161             e.printStackTrace();
162             return;
163         }
164 
165         // Header
166         out.print(",");
167         for (Category category : Category.values()) {
168             out.print("," + category.toString().replace('_', '-'));
169         }
170         out.println();
171         out.print("Locale ID,Case");
172         for (int i = 0; i < Category.values().length; i++) {
173             out.print("," + i);
174         }
175         out.println();
176 
177         Set<String> locales = casing.keySet();
178         for (String localeID : locales) {
179             // Write casing information about the locale to file.
180             out.print(localeID);
181             out.print(",");
182             out.print(localeUsesCasing.get(localeID) ? "Y" : "N");
183             Map<Category, CasingTypeAndErrFlag> types = casing.get(localeID);
184             for (Category category : Category.values()) {
185                 CasingTypeAndErrFlag value = types.get(category);
186                 out.print("," + value == null ? null : value.type().toString().charAt(0));
187             }
188             out.println();
189             out.flush();
190         }
191         out.close();
192     }
193 
194     /** Writes casing information for the specified locale to XML format. */
createCasingXml(String localeID, Map<Category, CasingType> localeCasing)195     private void createCasingXml(String localeID, Map<Category, CasingType> localeCasing) {
196         // Load any existing overrides over casing info.
197         CasingHandler handler = loadFromXml(localeID);
198         Map<Category, CasingType> overrides =
199                 handler == null ? new EnumMap<>(Category.class) : handler.getOverrides();
200         localeCasing.putAll(overrides);
201 
202         XMLSource source = new SimpleXMLSource(localeID);
203         for (Category category : Category.values()) {
204             if (category == Category.NOT_USED) continue;
205             CasingType type = localeCasing.get(category);
206             if (overrides.containsKey(category)) {
207                 String path =
208                         MessageFormat.format(
209                                 "//ldml/metadata/casingData/casingItem[@type=\"{0}\"][@override=\"true\"]",
210                                 category);
211                 source.putValueAtPath(path, type.toString());
212             } else if (type != CasingType.other) {
213                 String path = "//ldml/metadata/casingData/casingItem[@type=\"" + category + "\"]";
214                 source.putValueAtPath(path, type.toString());
215             }
216         }
217         CLDRFile cldrFile = new CLDRFile(source);
218         File casingFile = new File(CLDRPaths.GEN_DIRECTORY + "/casing", localeID + ".xml");
219 
220         try {
221             PrintWriter out = new PrintWriter(casingFile);
222             cldrFile.write(out);
223             out.close();
224         } catch (IOException e) {
225             e.printStackTrace();
226         }
227     }
228 
229     /**
230      * Generates all the casing information and writes it to XML. A CSV summary of casing
231      * information is written to file if a filename argument is provided.
232      *
233      * @param args
234      */
main(String[] args)235     public static void main(String[] args) {
236         CasingInfo casingInfo = new CasingInfo();
237         options.parse(args, true);
238         Map<String, Boolean> localeUsesCasing =
239                 casingInfo.generateCasingInformation(options.get("locales").getValue());
240         if (options.get("summary").doesOccur()) {
241             casingInfo.createCasingSummary(args[0], localeUsesCasing);
242         }
243     }
244 
245     /** XML handler for parsing casing files. */
246     private class CasingHandler extends XMLFileReader.SimpleHandler {
247         private Pattern localePattern =
248                 PatternCache.get("//ldml/identity/language\\[@type=\"(\\w+)\"\\]");
249         private String localeID;
250         private Map<Category, CasingTypeAndErrFlag> caseMap = new EnumMap<>(Category.class);
251         private Map<Category, CasingType> overrideMap = new EnumMap<>(Category.class);
252 
253         @Override
handlePathValue(String path, String value)254         public void handlePathValue(String path, String value) {
255             // Parse casing info.
256             if (path.contains("casingItem")) {
257                 XPathParts parts = XPathParts.getFrozenInstance(path);
258                 Category category =
259                         Category.valueOf(parts.getAttributeValue(-1, "type").replace('-', '_'));
260                 CasingType casingType = CasingType.valueOf(value);
261                 boolean errFlag = Boolean.parseBoolean(parts.getAttributeValue(-1, "forceError"));
262                 for (CasingTypeAndErrFlag typeAndFlag : CasingTypeAndErrFlag.values()) {
263                     if (casingType == typeAndFlag.type() && errFlag == typeAndFlag.flag()) {
264                         caseMap.put(category, typeAndFlag);
265                         break;
266                     }
267                 }
268                 if (Boolean.valueOf(parts.getAttributeValue(-1, "override"))) {
269                     overrideMap.put(category, casingType);
270                 }
271             } else {
272                 // Parse the locale that the casing is for.
273                 Matcher matcher = localePattern.matcher(path);
274                 if (matcher.matches()) {
275                     localeID = matcher.group(1);
276                 }
277             }
278         }
279 
addParsedResult(Map<String, Map<Category, CasingTypeAndErrFlag>> map)280         public void addParsedResult(Map<String, Map<Category, CasingTypeAndErrFlag>> map) {
281             map.put(localeID, caseMap);
282         }
283 
getOverrides()284         public Map<Category, CasingType> getOverrides() {
285             return overrideMap;
286         }
287     }
288 }
289