xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/test/CheckConsistentCasing.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.test;
2 
3 import com.google.common.base.Joiner;
4 import com.ibm.icu.lang.UCharacter;
5 import com.ibm.icu.text.BreakIterator;
6 import com.ibm.icu.util.ULocale;
7 import java.util.Collections;
8 import java.util.EnumMap;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.Set;
12 import java.util.TreeSet;
13 import java.util.regex.Matcher;
14 import org.unicode.cldr.draft.ScriptMetadata;
15 import org.unicode.cldr.draft.ScriptMetadata.Info;
16 import org.unicode.cldr.draft.ScriptMetadata.Trinary;
17 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
18 import org.unicode.cldr.tool.LikelySubtags;
19 import org.unicode.cldr.util.CLDRFile;
20 import org.unicode.cldr.util.CLDRURLS;
21 import org.unicode.cldr.util.CldrUtility;
22 import org.unicode.cldr.util.Counter;
23 import org.unicode.cldr.util.Factory;
24 import org.unicode.cldr.util.PathStarrer;
25 import org.unicode.cldr.util.PatternCache;
26 import org.unicode.cldr.util.RegexLookup;
27 import org.unicode.cldr.util.SpecialLocales;
28 
29 public class CheckConsistentCasing extends FactoryCheckCLDR {
30 
31     private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false);
32 
33     private static final double MIN_FACTOR = 2.5;
34     // remember to add this class to the list in CheckCLDR.getCheckAll
35     // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.*
36     // -t.*Currencies.*
37 
38     ULocale uLocale = null;
39     BreakIterator breaker = null;
40     private String locale;
41     CasingInfo casingInfo;
42     private boolean hasCasingInfo;
43 
CheckConsistentCasing(Factory factory)44     public CheckConsistentCasing(Factory factory) {
45         super(factory);
46         casingInfo = new CasingInfo(factory);
47     }
48 
49     @Override
handleSetCldrFileToCheck( CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)50     public CheckCLDR handleSetCldrFileToCheck(
51             CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors) {
52         if (cldrFileToCheck == null) return this;
53         super.handleSetCldrFileToCheck(cldrFileToCheck, options, possibleErrors);
54         locale = cldrFileToCheck.getLocaleID();
55         // get info about casing; note that this is done in two steps since
56         // ScriptMetadata.getInfo() returns null, in some instances.
57         // OLD: Info localeInfo = ScriptMetadata.getInfo(locale);
58         String script = new LikelySubtags().getLikelyScript(locale);
59         Info localeInfo = ScriptMetadata.getInfo(script);
60 
61         if (localeInfo != null && localeInfo.hasCase == Trinary.YES) {
62             // this script has casing info, so we can request it here
63             try {
64                 types = casingInfo.getLocaleCasing(locale);
65             } catch (Exception e) {
66                 types = Collections.emptyMap();
67             }
68         } else {
69             // no casing info - since the types Map is global, and null checks aren't done,
70             // we are better off  with an empty map here
71             types = Collections.emptyMap();
72         }
73         if ((types == null || types.isEmpty()) && !SpecialLocales.isScratchLocale(locale)) {
74             possibleErrors.add(
75                     new CheckStatus()
76                             .setCause(this)
77                             .setMainType(CheckStatus.warningType)
78                             .setSubtype(Subtype.incorrectCasing)
79                             .setMessage("Could not load casing info for {0}", locale));
80         }
81         // types may be null, avoid NPE
82         hasCasingInfo = (types == null) ? false : types.size() > 0;
83         return this;
84     }
85 
86     // If you don't need any file initialization or postprocessing, you only need this one routine
87     @Override
handleCheck( String path, String fullPath, String value, Options options, List<CheckStatus> result)88     public CheckCLDR handleCheck(
89             String path, String fullPath, String value, Options options, List<CheckStatus> result) {
90         // it helps performance to have a quick reject of most paths
91         if (fullPath == null) return this; // skip paths that we don't have
92         if (!accept(result)) return this; // causes hasCasingInfo to be calculated
93         if (!hasCasingInfo) return this;
94 
95         String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null);
96         if (locale2.equals(locale) && value != null && value.length() > 0) {
97             Category category = getCategory(path);
98             if (category != null) {
99                 checkConsistentCasing(category, path, fullPath, value, options, result);
100             }
101         }
102         return this;
103     }
104 
105     static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher("");
106 
107     /** The casing type of a given string. */
108     public enum CasingType {
109         titlecase,
110         lowercase,
111         other;
112 
from(String s)113         public static CasingType from(String s) {
114             if (s == null || s.length() == 0) {
115                 return other;
116             }
117             int cp;
118             // Look for the first meaningful character in the string to determine case.
119             for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
120                 cp = s.codePointAt(i);
121                 // used to skip the placeholders, but works better to have them be 'other'
122                 // if (cp == '{') {
123                 // if (placeholder.reset(s).region(i,s.length()).lookingAt()) {
124                 // i = placeholder.end() - 1; // skip
125                 // continue;
126                 // }
127                 // }
128                 int type = UCharacter.getType(cp);
129                 switch (type) {
130                     case UCharacter.LOWERCASE_LETTER:
131                         return lowercase;
132 
133                     case UCharacter.UPPERCASE_LETTER:
134                     case UCharacter.TITLECASE_LETTER:
135                         return titlecase;
136 
137                         // for other letters / numbers / symbols, return other
138                     case UCharacter.OTHER_LETTER:
139                     case UCharacter.DECIMAL_DIGIT_NUMBER:
140                     case UCharacter.LETTER_NUMBER:
141                     case UCharacter.OTHER_NUMBER:
142                     case UCharacter.MATH_SYMBOL:
143                     case UCharacter.CURRENCY_SYMBOL:
144                     case UCharacter.MODIFIER_SYMBOL:
145                     case UCharacter.OTHER_SYMBOL:
146                         return other;
147                         // ignore everything else (whitespace, punctuation, etc) and keep going
148                 }
149             }
150             return other;
151         }
152 
153         /** Return true if either is other, or they are identical. */
worksWith(CasingType otherType)154         public boolean worksWith(CasingType otherType) {
155             return otherType == null
156                     || this == otherType
157                     || this == CasingType.other
158                     || otherType == CasingType.other;
159         }
160     }
161 
162     public enum CasingTypeAndErrFlag {
163         titlecase_mismatchWarn(CasingType.titlecase, false),
164         titlecase_mismatchErr(CasingType.titlecase, true),
165         lowercase_mismatchWarn(CasingType.lowercase, false),
166         lowercase_mismatchErr(CasingType.lowercase, true),
167         other_mismatchWarn(CasingType.other, false),
168         other_mismatchErr(CasingType.other, true);
169 
170         private final CasingType type;
171         private final boolean flag; // force error instead of warning for mismatch
172 
CasingTypeAndErrFlag(CasingType type, boolean flag)173         private CasingTypeAndErrFlag(CasingType type, boolean flag) {
174             this.type = type;
175             this.flag = flag;
176         }
177 
type()178         public CasingType type() {
179             return type;
180         }
181 
flag()182         public boolean flag() {
183             return flag;
184         }
185     }
186 
187     static final RegexLookup<Category> pathToBucket =
188             new RegexLookup<Category>()
189                     .add("//ldml/localeDisplayNames/languages/language", Category.language)
190                     .add("//ldml/localeDisplayNames/scripts/script", Category.script)
191                     .add("//ldml/localeDisplayNames/territories/territory", Category.territory)
192                     .add("//ldml/localeDisplayNames/variants/variant", Category.variant)
193                     .add("//ldml/localeDisplayNames/keys/key", Category.key)
194                     .add("//ldml/localeDisplayNames/types/type", Category.keyValue)
195                     .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow)
196                     .add(
197                             "//ldml/dates/calendars/calendar.*/months.*format",
198                             Category.month_format_except_narrow)
199                     .add(
200                             "//ldml/dates/calendars/calendar.*/months",
201                             Category.month_standalone_except_narrow)
202                     .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow)
203                     .add(
204                             "//ldml/dates/calendars/calendar.*/days.*format",
205                             Category.day_format_except_narrow)
206                     .add(
207                             "//ldml/dates/calendars/calendar.*/days",
208                             Category.day_standalone_except_narrow)
209                     .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow)
210                     .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr)
211                     .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name)
212                     .add(
213                             "//ldml/dates/calendars/calendar.*/quarters.*narrow",
214                             Category.quarter_narrow)
215                     .add(
216                             "//ldml/dates/calendars/calendar.*/quarters.*abbreviated",
217                             Category.quarter_abbreviated)
218                     .add(
219                             "//ldml/dates/calendars/calendar.*/quarters.*format",
220                             Category.quarter_format_wide)
221                     .add(
222                             "//ldml/dates/calendars/calendar.*/quarters",
223                             Category.quarter_standalone_wide)
224                     .add("//ldml/.*/relative", Category.relative)
225                     .add("//ldml/dates/fields", Category.calendar_field)
226                     .add(
227                             "//ldml/dates/timeZoneNames/zone.*/exemplarCity",
228                             Category.zone_exemplarCity)
229                     .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short)
230                     .add("//ldml/dates/timeZoneNames/zone", Category.zone_long)
231                     .add(
232                             "//ldml/dates/timeZoneNames/metazone.*/commonlyUsed",
233                             Category.NOT_USED) // just to remove them from the other cases
234                     .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_long)
235                     .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long)
236                     .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol)
237                     .add(
238                             "//ldml/numbers/currencies/currency.*/displayName.*@count",
239                             Category.currencyName_count)
240                     .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName)
241                     .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative)
242                     .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern)
243             // ldml/localeDisplayNames/keys/key[@type=".*"]
244             // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"]
245             // ldml/localeDisplayNames/transformNames/transformName[@type=".*"]
246             ;
247 
248     Map<Category, CasingTypeAndErrFlag> types = new EnumMap<>(Category.class);
249 
250     public enum Category {
251         language,
252         script,
253         territory,
254         variant,
255         keyValue,
256         month_narrow,
257         month_format_except_narrow,
258         month_standalone_except_narrow,
259         day_narrow,
260         day_format_except_narrow,
261         day_standalone_except_narrow,
262         era_narrow,
263         era_abbr,
264         era_name,
265         quarter_narrow,
266         quarter_abbreviated,
267         quarter_format_wide,
268         quarter_standalone_wide,
269         calendar_field,
270         zone_exemplarCity,
271         zone_short,
272         zone_long,
273         NOT_USED,
274         metazone_short,
275         metazone_long,
276         symbol,
277         currencyName_count,
278         currencyName,
279         relative,
280         unit_pattern,
281         key;
282     }
283 
284     // //ldml/numbers/currencies/currency[@type="ADP"]/displayName
285     // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"]
286     // //ldml/numbers/currencies/currency[@type="BYB"]/symbol
287 
getCategory(String path)288     static Category getCategory(String path) {
289         return pathToBucket.get(path);
290     }
291 
292     /**
293      * Calculates casing information using data from the specified CLDRFile.
294      *
295      * @param resolved the resolved CLDRFile to calculate casing information from
296      * @return
297      */
getSamples(CLDRFile resolved)298     public static Map<Category, CasingType> getSamples(CLDRFile resolved) {
299         // Use EnumMap instead of an array for type safety.
300         Map<Category, Counter<CasingType>> counters = new EnumMap<>(Category.class);
301 
302         for (Category category : Category.values()) {
303             counters.put(category, new Counter<CasingType>());
304         }
305         PathStarrer starrer = new PathStarrer();
306         boolean isRoot = "root".equals(resolved.getLocaleID());
307         Set<String> missing = !DEBUG ? null : new TreeSet<>();
308 
309         for (String path : resolved) {
310             if (!isRoot) {
311                 String locale2 = resolved.getSourceLocaleID(path, null);
312                 if (locale2.equals("root") || locale2.equals("code-fallback")) {
313                     continue;
314                 }
315             }
316             String winningPath = resolved.getWinningPath(path);
317             if (!winningPath.equals(path)) {
318                 continue;
319             }
320             Category category = getCategory(path);
321             if (category != null) {
322                 String value = resolved.getStringValue(path);
323                 if (value == null || value.length() == 0) continue;
324                 CasingType ft = CasingType.from(value);
325                 counters.get(category).add(ft, 1);
326             } else if (DEBUG) {
327                 String starred = starrer.set(path);
328                 missing.add(starred);
329             }
330         }
331 
332         Map<Category, CasingType> info = new EnumMap<>(Category.class);
333         for (Category category : Category.values()) {
334             if (category == Category.NOT_USED) continue;
335             Counter<CasingType> counter = counters.get(category);
336             long countLower = counter.getCount(CasingType.lowercase);
337             long countUpper = counter.getCount(CasingType.titlecase);
338             long countOther = counter.getCount(CasingType.other);
339             CasingType type;
340             if (countLower + countUpper == 0) {
341                 type = CasingType.other;
342             } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) {
343                 type = CasingType.lowercase;
344             } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) {
345                 type = CasingType.titlecase;
346             } else {
347                 type = CasingType.other;
348             }
349             info.put(category, type);
350         }
351         if (DEBUG && missing.size() != 0) {
352             System.out.println("Paths skipped:\n" + Joiner.on("\n").join(missing));
353         }
354         return info;
355     }
356 
357     private static final String CASE_WARNING =
358             "The first letter of 〈{0}〉 is {1}, which differs from what is expected "
359                     + "for the {2} category: that almost all values be {3}.\n\n"
360                     + "For guidance, see "
361                     + CLDRURLS.CAPITALIZATION_URL
362                     + ". "
363                     + "If this warning is wrong, please file a ticket at "
364                     + CLDRURLS.CLDR_NEWTICKET_URL
365                     + ".";
366 
checkConsistentCasing( Category category, String path, String fullPath, String value, Options options, List<CheckStatus> result)367     private void checkConsistentCasing(
368             Category category,
369             String path,
370             String fullPath,
371             String value,
372             Options options,
373             List<CheckStatus> result) {
374         // Avoid NPE
375         if (types != null) {
376             CasingType ft = CasingType.from(value);
377             CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category);
378             if (typeAndFlagFromCat == null) {
379                 typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn;
380             }
381             if (!ft.worksWith(typeAndFlagFromCat.type())) {
382                 result.add(
383                         new CheckStatus()
384                                 .setCause(this)
385                                 .setMainType(
386                                         typeAndFlagFromCat.flag()
387                                                 ? CheckStatus.errorType
388                                                 : CheckStatus.warningType)
389                                 .setSubtype(Subtype.incorrectCasing) // typically warningType or
390                                 // errorType
391                                 .setMessage(
392                                         CASE_WARNING,
393                                         value,
394                                         ft,
395                                         category,
396                                         typeAndFlagFromCat
397                                                 .type())); // the message; can be MessageFormat with
398                 // arguments
399             }
400         }
401     }
402 }
403