1 package org.unicode.cldr.test; 2 3 import com.google.common.base.Joiner; 4 import com.ibm.icu.lang.UCharacter; 5 import com.ibm.icu.text.BreakIterator; 6 import com.ibm.icu.util.ULocale; 7 import java.util.Collections; 8 import java.util.EnumMap; 9 import java.util.List; 10 import java.util.Map; 11 import java.util.Set; 12 import java.util.TreeSet; 13 import java.util.regex.Matcher; 14 import org.unicode.cldr.draft.ScriptMetadata; 15 import org.unicode.cldr.draft.ScriptMetadata.Info; 16 import org.unicode.cldr.draft.ScriptMetadata.Trinary; 17 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype; 18 import org.unicode.cldr.tool.LikelySubtags; 19 import org.unicode.cldr.util.CLDRFile; 20 import org.unicode.cldr.util.CLDRURLS; 21 import org.unicode.cldr.util.CldrUtility; 22 import org.unicode.cldr.util.Counter; 23 import org.unicode.cldr.util.Factory; 24 import org.unicode.cldr.util.PathStarrer; 25 import org.unicode.cldr.util.PatternCache; 26 import org.unicode.cldr.util.RegexLookup; 27 import org.unicode.cldr.util.SpecialLocales; 28 29 public class CheckConsistentCasing extends FactoryCheckCLDR { 30 31 private static final boolean DEBUG = CldrUtility.getProperty("DEBUG", false); 32 33 private static final double MIN_FACTOR = 2.5; 34 // remember to add this class to the list in CheckCLDR.getCheckAll 35 // to run just this test, on just locales starting with 'nl', use CheckCLDR with -fnl.* 36 // -t.*Currencies.* 37 38 ULocale uLocale = null; 39 BreakIterator breaker = null; 40 private String locale; 41 CasingInfo casingInfo; 42 private boolean hasCasingInfo; 43 CheckConsistentCasing(Factory factory)44 public CheckConsistentCasing(Factory factory) { 45 super(factory); 46 casingInfo = new CasingInfo(factory); 47 } 48 49 @Override handleSetCldrFileToCheck( CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors)50 public CheckCLDR handleSetCldrFileToCheck( 51 CLDRFile cldrFileToCheck, Options options, List<CheckStatus> possibleErrors) { 52 if (cldrFileToCheck == null) return this; 53 super.handleSetCldrFileToCheck(cldrFileToCheck, options, possibleErrors); 54 locale = cldrFileToCheck.getLocaleID(); 55 // get info about casing; note that this is done in two steps since 56 // ScriptMetadata.getInfo() returns null, in some instances. 57 // OLD: Info localeInfo = ScriptMetadata.getInfo(locale); 58 String script = new LikelySubtags().getLikelyScript(locale); 59 Info localeInfo = ScriptMetadata.getInfo(script); 60 61 if (localeInfo != null && localeInfo.hasCase == Trinary.YES) { 62 // this script has casing info, so we can request it here 63 try { 64 types = casingInfo.getLocaleCasing(locale); 65 } catch (Exception e) { 66 types = Collections.emptyMap(); 67 } 68 } else { 69 // no casing info - since the types Map is global, and null checks aren't done, 70 // we are better off with an empty map here 71 types = Collections.emptyMap(); 72 } 73 if ((types == null || types.isEmpty()) && !SpecialLocales.isScratchLocale(locale)) { 74 possibleErrors.add( 75 new CheckStatus() 76 .setCause(this) 77 .setMainType(CheckStatus.warningType) 78 .setSubtype(Subtype.incorrectCasing) 79 .setMessage("Could not load casing info for {0}", locale)); 80 } 81 // types may be null, avoid NPE 82 hasCasingInfo = (types == null) ? false : types.size() > 0; 83 return this; 84 } 85 86 // If you don't need any file initialization or postprocessing, you only need this one routine 87 @Override handleCheck( String path, String fullPath, String value, Options options, List<CheckStatus> result)88 public CheckCLDR handleCheck( 89 String path, String fullPath, String value, Options options, List<CheckStatus> result) { 90 // it helps performance to have a quick reject of most paths 91 if (fullPath == null) return this; // skip paths that we don't have 92 if (!accept(result)) return this; // causes hasCasingInfo to be calculated 93 if (!hasCasingInfo) return this; 94 95 String locale2 = getCldrFileToCheck().getSourceLocaleID(path, null); 96 if (locale2.equals(locale) && value != null && value.length() > 0) { 97 Category category = getCategory(path); 98 if (category != null) { 99 checkConsistentCasing(category, path, fullPath, value, options, result); 100 } 101 } 102 return this; 103 } 104 105 static final Matcher placeholder = PatternCache.get("\\{\\d+\\}").matcher(""); 106 107 /** The casing type of a given string. */ 108 public enum CasingType { 109 titlecase, 110 lowercase, 111 other; 112 from(String s)113 public static CasingType from(String s) { 114 if (s == null || s.length() == 0) { 115 return other; 116 } 117 int cp; 118 // Look for the first meaningful character in the string to determine case. 119 for (int i = 0; i < s.length(); i += Character.charCount(cp)) { 120 cp = s.codePointAt(i); 121 // used to skip the placeholders, but works better to have them be 'other' 122 // if (cp == '{') { 123 // if (placeholder.reset(s).region(i,s.length()).lookingAt()) { 124 // i = placeholder.end() - 1; // skip 125 // continue; 126 // } 127 // } 128 int type = UCharacter.getType(cp); 129 switch (type) { 130 case UCharacter.LOWERCASE_LETTER: 131 return lowercase; 132 133 case UCharacter.UPPERCASE_LETTER: 134 case UCharacter.TITLECASE_LETTER: 135 return titlecase; 136 137 // for other letters / numbers / symbols, return other 138 case UCharacter.OTHER_LETTER: 139 case UCharacter.DECIMAL_DIGIT_NUMBER: 140 case UCharacter.LETTER_NUMBER: 141 case UCharacter.OTHER_NUMBER: 142 case UCharacter.MATH_SYMBOL: 143 case UCharacter.CURRENCY_SYMBOL: 144 case UCharacter.MODIFIER_SYMBOL: 145 case UCharacter.OTHER_SYMBOL: 146 return other; 147 // ignore everything else (whitespace, punctuation, etc) and keep going 148 } 149 } 150 return other; 151 } 152 153 /** Return true if either is other, or they are identical. */ worksWith(CasingType otherType)154 public boolean worksWith(CasingType otherType) { 155 return otherType == null 156 || this == otherType 157 || this == CasingType.other 158 || otherType == CasingType.other; 159 } 160 } 161 162 public enum CasingTypeAndErrFlag { 163 titlecase_mismatchWarn(CasingType.titlecase, false), 164 titlecase_mismatchErr(CasingType.titlecase, true), 165 lowercase_mismatchWarn(CasingType.lowercase, false), 166 lowercase_mismatchErr(CasingType.lowercase, true), 167 other_mismatchWarn(CasingType.other, false), 168 other_mismatchErr(CasingType.other, true); 169 170 private final CasingType type; 171 private final boolean flag; // force error instead of warning for mismatch 172 CasingTypeAndErrFlag(CasingType type, boolean flag)173 private CasingTypeAndErrFlag(CasingType type, boolean flag) { 174 this.type = type; 175 this.flag = flag; 176 } 177 type()178 public CasingType type() { 179 return type; 180 } 181 flag()182 public boolean flag() { 183 return flag; 184 } 185 } 186 187 static final RegexLookup<Category> pathToBucket = 188 new RegexLookup<Category>() 189 .add("//ldml/localeDisplayNames/languages/language", Category.language) 190 .add("//ldml/localeDisplayNames/scripts/script", Category.script) 191 .add("//ldml/localeDisplayNames/territories/territory", Category.territory) 192 .add("//ldml/localeDisplayNames/variants/variant", Category.variant) 193 .add("//ldml/localeDisplayNames/keys/key", Category.key) 194 .add("//ldml/localeDisplayNames/types/type", Category.keyValue) 195 .add("//ldml/dates/calendars/calendar.*/months.*narrow", Category.month_narrow) 196 .add( 197 "//ldml/dates/calendars/calendar.*/months.*format", 198 Category.month_format_except_narrow) 199 .add( 200 "//ldml/dates/calendars/calendar.*/months", 201 Category.month_standalone_except_narrow) 202 .add("//ldml/dates/calendars/calendar.*/days.*narrow", Category.day_narrow) 203 .add( 204 "//ldml/dates/calendars/calendar.*/days.*format", 205 Category.day_format_except_narrow) 206 .add( 207 "//ldml/dates/calendars/calendar.*/days", 208 Category.day_standalone_except_narrow) 209 .add("//ldml/dates/calendars/calendar.*/eras/eraNarrow", Category.era_narrow) 210 .add("//ldml/dates/calendars/calendar.*/eras/eraAbbr", Category.era_abbr) 211 .add("//ldml/dates/calendars/calendar.*/eras/", Category.era_name) 212 .add( 213 "//ldml/dates/calendars/calendar.*/quarters.*narrow", 214 Category.quarter_narrow) 215 .add( 216 "//ldml/dates/calendars/calendar.*/quarters.*abbreviated", 217 Category.quarter_abbreviated) 218 .add( 219 "//ldml/dates/calendars/calendar.*/quarters.*format", 220 Category.quarter_format_wide) 221 .add( 222 "//ldml/dates/calendars/calendar.*/quarters", 223 Category.quarter_standalone_wide) 224 .add("//ldml/.*/relative", Category.relative) 225 .add("//ldml/dates/fields", Category.calendar_field) 226 .add( 227 "//ldml/dates/timeZoneNames/zone.*/exemplarCity", 228 Category.zone_exemplarCity) 229 .add("//ldml/dates/timeZoneNames/zone.*/short", Category.zone_short) 230 .add("//ldml/dates/timeZoneNames/zone", Category.zone_long) 231 .add( 232 "//ldml/dates/timeZoneNames/metazone.*/commonlyUsed", 233 Category.NOT_USED) // just to remove them from the other cases 234 .add("//ldml/dates/timeZoneNames/metazone.*/short", Category.metazone_long) 235 .add("//ldml/dates/timeZoneNames/metazone", Category.metazone_long) 236 .add("//ldml/numbers/currencies/currency.*/symbol", Category.symbol) 237 .add( 238 "//ldml/numbers/currencies/currency.*/displayName.*@count", 239 Category.currencyName_count) 240 .add("//ldml/numbers/currencies/currency.*/displayName", Category.currencyName) 241 .add("//ldml/units/unit.*/unitPattern.*(past|future)", Category.relative) 242 .add("//ldml/units/unit.*/unitPattern", Category.unit_pattern) 243 // ldml/localeDisplayNames/keys/key[@type=".*"] 244 // ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type=".*"] 245 // ldml/localeDisplayNames/transformNames/transformName[@type=".*"] 246 ; 247 248 Map<Category, CasingTypeAndErrFlag> types = new EnumMap<>(Category.class); 249 250 public enum Category { 251 language, 252 script, 253 territory, 254 variant, 255 keyValue, 256 month_narrow, 257 month_format_except_narrow, 258 month_standalone_except_narrow, 259 day_narrow, 260 day_format_except_narrow, 261 day_standalone_except_narrow, 262 era_narrow, 263 era_abbr, 264 era_name, 265 quarter_narrow, 266 quarter_abbreviated, 267 quarter_format_wide, 268 quarter_standalone_wide, 269 calendar_field, 270 zone_exemplarCity, 271 zone_short, 272 zone_long, 273 NOT_USED, 274 metazone_short, 275 metazone_long, 276 symbol, 277 currencyName_count, 278 currencyName, 279 relative, 280 unit_pattern, 281 key; 282 } 283 284 // //ldml/numbers/currencies/currency[@type="ADP"]/displayName 285 // //ldml/numbers/currencies/currency[@type="RON"]/displayName[@count="other"] 286 // //ldml/numbers/currencies/currency[@type="BYB"]/symbol 287 getCategory(String path)288 static Category getCategory(String path) { 289 return pathToBucket.get(path); 290 } 291 292 /** 293 * Calculates casing information using data from the specified CLDRFile. 294 * 295 * @param resolved the resolved CLDRFile to calculate casing information from 296 * @return 297 */ getSamples(CLDRFile resolved)298 public static Map<Category, CasingType> getSamples(CLDRFile resolved) { 299 // Use EnumMap instead of an array for type safety. 300 Map<Category, Counter<CasingType>> counters = new EnumMap<>(Category.class); 301 302 for (Category category : Category.values()) { 303 counters.put(category, new Counter<CasingType>()); 304 } 305 PathStarrer starrer = new PathStarrer(); 306 boolean isRoot = "root".equals(resolved.getLocaleID()); 307 Set<String> missing = !DEBUG ? null : new TreeSet<>(); 308 309 for (String path : resolved) { 310 if (!isRoot) { 311 String locale2 = resolved.getSourceLocaleID(path, null); 312 if (locale2.equals("root") || locale2.equals("code-fallback")) { 313 continue; 314 } 315 } 316 String winningPath = resolved.getWinningPath(path); 317 if (!winningPath.equals(path)) { 318 continue; 319 } 320 Category category = getCategory(path); 321 if (category != null) { 322 String value = resolved.getStringValue(path); 323 if (value == null || value.length() == 0) continue; 324 CasingType ft = CasingType.from(value); 325 counters.get(category).add(ft, 1); 326 } else if (DEBUG) { 327 String starred = starrer.set(path); 328 missing.add(starred); 329 } 330 } 331 332 Map<Category, CasingType> info = new EnumMap<>(Category.class); 333 for (Category category : Category.values()) { 334 if (category == Category.NOT_USED) continue; 335 Counter<CasingType> counter = counters.get(category); 336 long countLower = counter.getCount(CasingType.lowercase); 337 long countUpper = counter.getCount(CasingType.titlecase); 338 long countOther = counter.getCount(CasingType.other); 339 CasingType type; 340 if (countLower + countUpper == 0) { 341 type = CasingType.other; 342 } else if (countLower >= countUpper * MIN_FACTOR && countLower >= countOther) { 343 type = CasingType.lowercase; 344 } else if (countUpper >= countLower * MIN_FACTOR && countUpper >= countOther) { 345 type = CasingType.titlecase; 346 } else { 347 type = CasingType.other; 348 } 349 info.put(category, type); 350 } 351 if (DEBUG && missing.size() != 0) { 352 System.out.println("Paths skipped:\n" + Joiner.on("\n").join(missing)); 353 } 354 return info; 355 } 356 357 private static final String CASE_WARNING = 358 "The first letter of 〈{0}〉 is {1}, which differs from what is expected " 359 + "for the {2} category: that almost all values be {3}.\n\n" 360 + "For guidance, see " 361 + CLDRURLS.CAPITALIZATION_URL 362 + ". " 363 + "If this warning is wrong, please file a ticket at " 364 + CLDRURLS.CLDR_NEWTICKET_URL 365 + "."; 366 checkConsistentCasing( Category category, String path, String fullPath, String value, Options options, List<CheckStatus> result)367 private void checkConsistentCasing( 368 Category category, 369 String path, 370 String fullPath, 371 String value, 372 Options options, 373 List<CheckStatus> result) { 374 // Avoid NPE 375 if (types != null) { 376 CasingType ft = CasingType.from(value); 377 CasingTypeAndErrFlag typeAndFlagFromCat = types.get(category); 378 if (typeAndFlagFromCat == null) { 379 typeAndFlagFromCat = CasingTypeAndErrFlag.other_mismatchWarn; 380 } 381 if (!ft.worksWith(typeAndFlagFromCat.type())) { 382 result.add( 383 new CheckStatus() 384 .setCause(this) 385 .setMainType( 386 typeAndFlagFromCat.flag() 387 ? CheckStatus.errorType 388 : CheckStatus.warningType) 389 .setSubtype(Subtype.incorrectCasing) // typically warningType or 390 // errorType 391 .setMessage( 392 CASE_WARNING, 393 value, 394 ft, 395 category, 396 typeAndFlagFromCat 397 .type())); // the message; can be MessageFormat with 398 // arguments 399 } 400 } 401 } 402 } 403