package org.unicode.cldr.tool;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.StandardCodes;

/**
 * Command-line tool that walks the "Description" values returned by {@link
 * StandardCodes#getLangData} and writes four tab-separated report files into {@link #OUTPUT_DIR}:
 *
 * <ul>
 *   <li>{@code ltru-all.txt}: one numbered line per (type, code) with its raw data map;
 *   <li>{@code ltru-commas.txt}: descriptions containing a comma or parenthesis, with the
 *       re-ordered ("postComma preComma") form and {@code @...} diagnostic markers;
 *   <li>{@code ltru-reverse.txt}: name to type to code, flagging names shared by several codes;
 *   <li>{@code ltru-inversions.txt}: comma-inverted name to re-ordered name, including guessed
 *       inversions marked {@code @MISSING}.
 * </ul>
 *
 * All state is held in static fields and the regex matchers are shared, so the class is meant to
 * be run once, single-threaded, via {@link #main}.
 */
public class GetDescriptions {

    /** Directory that receives every report file. */
    private static final String OUTPUT_DIR = "c:\\data\\gen\\ltru\\";

    /**
     * Splits one description into: group 1 = text before the first comma or parenthesis, group 3 =
     * text between that comma and the first parenthesis, group 4 = everything from the parenthesis
     * on.
     */
    static Matcher matcher = PatternCache.get("([^,(]+)(,([^(]+))?(.*)").matcher("");

    /** Names already seen for the code currently being processed; cleared for every code. */
    static Map<String, String> items = new TreeMap<>();

    /** Running line number for {@code ltru-all.txt}. */
    static int allCount = 1;

    /** Running line number for {@code ltru-commas.txt}. */
    static int commaCount = 1;

    /** Re-ordered name to type to codes carrying that name (non-deprecated entries only). */
    private static final Map<String, Map<String, Set<String>>> name_type_codes = new TreeMap<>();

    /** Every non-empty "before the comma" part encountered. */
    private static final Set<String> preCommas = new TreeSet<>();

    /** Every non-empty "after the comma" part encountered. */
    private static final Set<String> postCommas = new TreeSet<>();

    /** Description as written (minus parenthetical) to its re-ordered form. */
    private static final Map<String, String> descriptionWithoutComments = new TreeMap<>();

    /** Every "before the comma" part, including empty ones, prior to any re-ordering. */
    private static final Set<String> uninvertedNames = new HashSet<>();

    /**
     * Generates all four reports.
     *
     * @param args ignored
     * @throws IOException if an output file cannot be opened
     */
    public static void main(String[] args) throws IOException {
        StandardCodes sc = StandardCodes.make();
        // try-with-resources guarantees both files are flushed and closed even when processing
        // fails part-way; resources are closed in reverse order ("all" first, then "commas").
        try (PrintWriter commas = FileUtilities.openUTF8Writer(OUTPUT_DIR, "ltru-commas.txt");
                PrintWriter all = FileUtilities.openUTF8Writer(OUTPUT_DIR, "ltru-all.txt")) {
            // Both files start with U+FEFF (byte order mark).
            commas.write('\uFEFF');
            all.write('\uFEFF');

            for (String type : sc.getAvailableTypes()) {
                if (type.equals("tzid") || type.equals("currency")) {
                    continue;
                }
                for (String code : sc.getAvailableCodes(type)) {
                    Map<String, String> data = sc.getLangData(type, code);
                    if (data == null) {
                        continue;
                    }
                    boolean isDeprecated = data.get("Deprecated") != null;

                    all.println(allCount++ + "\t" + type + "\t" + code + "\t" + data);

                    String descriptionField = data.get("Description");
                    if (descriptionField == null) {
                        // Nothing to analyse; record it instead of failing on a null dereference.
                        reportComma(commas, type, code, "\t@NO_DESCRIPTION");
                        continue;
                    }
                    // Multiple descriptions for one code are separated by U+25AA.
                    String[] descriptions = descriptionField.split("\u25AA");
                    items.clear();

                    for (String description : descriptions) {
                        processDescription(
                                commas, type, code, descriptionField, description, isDeprecated);
                    }
                    checkInversion(commas, type, code, descriptions);
                }
            }
        }
        showReverse();
        System.out.println("DONE");
    }

    /**
     * Writes one numbered line to the commas report: {@code count, type, code, rest}, separated by
     * tabs.
     *
     * @param rest everything that follows the tab after {@code code}
     */
    private static void reportComma(PrintWriter commas, String type, String code, String rest) {
        commas.println(commaCount++ + "\t" + type + "\t" + code + "\t" + rest);
    }

    /**
     * Analyses a single description of a code: records its parts in the static collections and,
     * when the code's description field contains a comma or parenthesis, reports it in the commas
     * file.
     *
     * @param descriptionField the complete, unsplit "Description" value of the code
     * @param description one element of {@code descriptionField} after splitting
     * @param isDeprecated whether the code carries a "Deprecated" entry; deprecated codes are not
     *     added to the reverse and inversion tables
     */
    private static void processDescription(
            PrintWriter commas,
            String type,
            String code,
            String descriptionField,
            String description,
            boolean isDeprecated) {
        if (!matcher.reset(description).matches()) {
            reportComma(commas, type, code, description + "\t@NO_MATCH");
            return;
        }
        String preComma = matcher.group(1).trim();
        String postComma = matcher.group(3);
        postComma = postComma == null ? "" : postComma.trim();
        String parens = matcher.group(4);
        parens = parens == null ? "" : parens.trim();

        if (!preComma.isEmpty()) {
            preCommas.add(preComma);
        }
        if (!postComma.isEmpty()) {
            postCommas.add(postComma);
        }

        // "pre, post (parens)" is re-ordered to "post pre (parens)".
        String newDescription = preComma;
        String descriptionWithoutComment = preComma;
        String newDescriptionWithoutComment = preComma;
        uninvertedNames.add(preComma);

        if (!postComma.isEmpty()) {
            descriptionWithoutComment += ", " + postComma;
            newDescription = postComma + " " + newDescription;
            newDescriptionWithoutComment = newDescription;
        }
        if (!parens.isEmpty()) {
            newDescription += " " + parens;
        }

        if (!isDeprecated) {
            if (!descriptionWithoutComment.isEmpty()) {
                descriptionWithoutComments.put(
                        descriptionWithoutComment, newDescriptionWithoutComment);
            }
            addTypeNameCode(name_type_codes, type, code, newDescriptionWithoutComment);
        }

        // The test looks at the whole description field, not just this element: once any element
        // of a code has a comma or parenthesis, all of its elements are reported.
        if (!descriptionField.contains(",") && !descriptionField.contains("(")) {
            return;
        }

        checkDuplicates(commas, type, code, descriptionWithoutComment, description);
        if (!newDescriptionWithoutComment.equals(descriptionWithoutComment)) {
            checkDuplicates(commas, type, code, newDescriptionWithoutComment, description);
        }

        if (postComma.contains(",")) {
            reportComma(commas, type, code, description + "\t@DOUBLE_COMMA");
        } else if (postComma.isEmpty()) {
            reportComma(commas, type, code, description);
        } else {
            reportComma(commas, type, code, description + "\t=>\t" + newDescription);
        }
    }

    /**
     * Writes {@code ltru-reverse.txt} (name to type to code, with {@code @DUPLICATE_IN} on every
     * code after the first one sharing a name within a type) and then {@code ltru-inversions.txt}.
     * While writing the first file, guessed inversions are added to {@link
     * #descriptionWithoutComments} so that they appear in the second.
     *
     * @throws IOException if an output file cannot be opened
     */
    private static void showReverse() throws IOException {
        int reverseCount = 1;
        try (PrintWriter reverse = FileUtilities.openUTF8Writer(OUTPUT_DIR, "ltru-reverse.txt")) {
            reverse.write('\uFEFF');
            for (Map.Entry<String, Map<String, Set<String>>> nameEntry :
                    name_type_codes.entrySet()) {
                String name = nameEntry.getKey();
                Map<String, Set<String>> type_codes = nameEntry.getValue();
                // Codes named "PRIVATE USE" are never reported as duplicates of one another.
                boolean privateUse = name.equals("PRIVATE USE");
                for (Map.Entry<String, Set<String>> typeEntry : type_codes.entrySet()) {
                    String type = typeEntry.getKey();
                    String baseCode = null;
                    for (String code : typeEntry.getValue()) {
                        String line = reverseCount++ + "\t" + name + "\t" + type + "\t" + code;
                        if (baseCode == null || privateUse) {
                            baseCode = code;
                        } else {
                            // The doubled tab after the marker is part of the existing format.
                            line += "\t@DUPLICATE_IN\t" + "\t" + baseCode;
                        }
                        reverse.println(line);
                    }
                }
                reverseIfPossible(name, type_codes.keySet());
            }
        }

        reverseCount = 1;
        // NOTE(review): unlike the other three reports, this file is written without a leading
        // U+FEFF; kept as is in case consumers depend on it.
        try (PrintWriter inversions =
                FileUtilities.openUTF8Writer(OUTPUT_DIR, "ltru-inversions.txt")) {
            for (Map.Entry<String, String> entry : descriptionWithoutComments.entrySet()) {
                String invertedName = entry.getKey();
                String name = entry.getValue();
                if (name.equals(invertedName)) {
                    continue;
                }
                inversions.println(reverseCount++ + "\t" + invertedName + "\t" + name);
            }
        }
    }

    /**
     * Tries every known name fragment as a suffix or prefix of {@code name} and records the
     * corresponding comma-inverted spelling as {@code @MISSING} when it is not already known.
     *
     * @param name a re-ordered name
     * @param types the types under which {@code name} occurs; only used in the report text
     */
    static void reverseIfPossible(String name, Set<String> types) {
        for (String uninvert : uninvertedNames) {
            if (name.endsWith(uninvert)) {
                addEnd(name, uninvert, types);
            }
            if (name.startsWith(uninvert)) {
                addStart(name, uninvert, types);
            }
        }
        for (String preComma : preCommas) {
            if (name.endsWith(preComma)) {
                addEnd(name, preComma, types);
            }
        }
        for (String postComma : postCommas) {
            if (name.startsWith(postComma)) {
                addStart(name, postComma, types);
            }
        }
    }

    /**
     * If {@code name} is "{@code postComma} rest", records "rest, {@code postComma}" as a missing
     * inversion unless that spelling is already a key of {@link #descriptionWithoutComments}.
     */
    private static void addStart(String name, String postComma, Set<String> types) {
        // Require a word boundary; this also rules out name.equals(postComma).
        if (name.equals(postComma) || !name.startsWith(postComma + " ")) {
            return;
        }
        String trial = name.substring(postComma.length()).trim() + ", " + postComma;
        if (!descriptionWithoutComments.containsKey(trial)) {
            descriptionWithoutComments.put(trial, name + "\t@MISSING\t" + types);
        }
    }

    /**
     * If {@code name} is "rest {@code preComma}", records "{@code preComma}, rest" as a missing
     * inversion unless that spelling is already a key of {@link #descriptionWithoutComments}.
     */
    private static void addEnd(String name, String preComma, Set<String> types) {
        // Require a word boundary; this also rules out name.equals(preComma).
        if (name.equals(preComma) || !name.endsWith(" " + preComma)) {
            return;
        }
        String rest = name.substring(0, name.length() - preComma.length()).trim();
        String trial = preComma + ", " + rest;
        if (!descriptionWithoutComments.containsKey(trial)) {
            descriptionWithoutComments.put(trial, name + "\t@MISSING\t" + types);
        }
    }

    /**
     * Adds {@code code} to the set stored under {@code newDescriptionWithoutComment} and {@code
     * type}, creating the intermediate map and set on first use.
     */
    private static void addTypeNameCode(
            Map<String, Map<String, Set<String>>> name_type_codes,
            String type,
            String code,
            String newDescriptionWithoutComment) {
        Map<String, Set<String>> type_codes = name_type_codes.get(newDescriptionWithoutComment);
        if (type_codes == null) {
            type_codes = new TreeMap<>();
            name_type_codes.put(newDescriptionWithoutComment, type_codes);
        }
        Set<String> codes = type_codes.get(type);
        if (codes == null) {
            codes = new TreeSet<>();
            type_codes.put(type, codes);
        }
        codes.add(code);
    }

    /** Matches "QUALIFIER rest", where QUALIFIER is one of a fixed list of leading words. */
    static Matcher directional =
            Pattern.compile(
                            "(West Central|Northern|Southern|Western|Eastern|North|South|East|West|Central|Ancient|Classical|Coastal"
                                    + "|Highland|Isthmus|Low|Lower|Lowland|Middle|Northeastern|Northwestern|Old|Plains|Southeastern|Southwestern|Straits|Upper|Valley"
                                    + "|Written)\\s+(.+)")
                    .matcher("");

    /**
     * For every description of the form "QUALIFIER rest" (see {@link #directional}), checks that
     * the inverted spelling "rest, QUALIFIER" is also among the descriptions of the same code and
     * reports {@code @MISSING} in the commas file when it is not.
     *
     * @param parts all descriptions of one code
     */
    private static void checkInversion(
            PrintWriter commas, String type, String code, String[] parts) {
        Set<String> uniqueParts = new TreeSet<>(Arrays.asList(parts));
        for (String part : uniqueParts) {
            if (!directional.reset(part).matches()) {
                continue;
            }
            String qualifier = directional.group(1);
            String rest = directional.group(2);
            // When the remainder already has a comma, the qualifier is appended with a space only.
            String trial = rest + (rest.contains(",") ? " " : ", ") + qualifier;
            if (!uniqueParts.contains(trial)) {
                reportComma(commas, type, code, "\t@MISSING\t" + trial);
            }
        }
    }

    /**
     * Remembers {@code newPartNoComment} for the current code, or reports {@code @DUPLICATES} in
     * the commas file when the same name was already recorded in {@link #items}.
     *
     * @param newPartNoComment name without its parenthetical part, used as the lookup key
     * @param part the original description text, used in the report
     */
    private static void checkDuplicates(
            PrintWriter commas, String type, String code, String newPartNoComment, String part) {
        String old = items.get(newPartNoComment);
        if (old == null) {
            items.put(newPartNoComment, part);
        } else {
            reportComma(commas, type, code, part + "\t@DUPLICATES\t" + old);
        }
    }
}