1 package org.unicode.cldr.tool; 2 3 import com.ibm.icu.text.ListFormat; 4 import com.ibm.icu.text.NumberFormat; 5 import com.ibm.icu.text.UnicodeSet; 6 import com.ibm.icu.util.Output; 7 import com.ibm.icu.util.ULocale; 8 import java.io.IOException; 9 import java.text.ParseException; 10 import java.util.ArrayList; 11 import java.util.HashMap; 12 import java.util.Iterator; 13 import java.util.LinkedList; 14 import java.util.List; 15 import java.util.Locale; 16 import java.util.Map; 17 import java.util.Set; 18 import java.util.TreeSet; 19 import java.util.regex.Matcher; 20 import java.util.regex.Pattern; 21 import org.unicode.cldr.util.CldrUtility; 22 import org.unicode.cldr.util.CldrUtility.LineHandler; 23 import org.unicode.cldr.util.Counter2; 24 import org.unicode.cldr.util.Pair; 25 import org.unicode.cldr.util.StandardCodes; 26 27 public class AddPopulationData { 28 static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false); 29 static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false); 30 31 enum WBLine { 32 // "Afghanistan","AFG","GNI, PPP (current international 33 // $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..", 34 35 // Country Name,Country Code,Series Name,Series Code,2000 [YR2000],2001 [YR2001],2002 36 // [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 37 // [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 38 // [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 39 // [YR2020] 40 Country_Name, 41 Country_Code, 42 Series_Name, 43 Series_Code, 44 Year("(\\d+)\\s*\\[YR(\\d+)\\]"); 45 46 final Pattern pattern; 47 WBLine()48 WBLine() { 49 this.pattern = Pattern.compile(name().replaceAll("_", " ")); 50 } 51 WBLine(final String regex)52 WBLine(final String regex) { 53 this.pattern = Pattern.compile(regex); 54 } 55 match(String str)56 Matcher match(String str) { 57 // Skip BOM 58 if (str.startsWith("\uFEFF")) { 59 str = str.substring("\uFEFF".length()); 60 } 61 return this.pattern.matcher(str); 62 } 63 find(final String str)64 static Pair<WBLine, Integer> find(final String str) { 65 for (WBLine i : values()) { 66 final Matcher m = i.match(str); 67 if (m.matches()) { 68 Integer val = 0; 69 if (m.groupCount() > 0) { 70 val = Integer.parseInt(m.group(1)); 71 } 72 return Pair.of(i, val); 73 } 74 } 75 return null; 76 } 77 parseHeader(final String[] pieces)78 static ArrayList<Pair<WBLine, Integer>> parseHeader(final String[] pieces) { 79 ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null; 80 columnToTypeAndValue = new ArrayList<>(); 81 for (int i = 0; i < pieces.length; i++) { 82 columnToTypeAndValue.add(i, WBLine.find(pieces[i])); 83 } 84 return columnToTypeAndValue; 85 } 86 } 87 88 enum FBLine { 89 Rank, 90 Country, 91 Value, 92 Year; 93 get(String[] pieces)94 String get(String[] pieces) { 95 return pieces[ordinal()]; 96 } 97 } 98 99 enum FBLiteracy { 100 Rank, 101 Country, 102 Percent; 103 get(String[] pieces)104 String get(String[] pieces) { 105 return pieces[ordinal()]; 106 } 107 } 108 109 private static final String GCP = "NY.GNP.MKTP.PP.CD"; 110 private static final String POP = "SP.POP.TOTL"; 111 private static final String EMPTY = ".."; 112 private static Counter2<String> worldbank_gdp = new Counter2<>(); 113 private static Counter2<String> worldbank_population = new Counter2<>(); 114 private static Counter2<String> un_literacy = new Counter2<>(); 115 116 private static Counter2<String> factbook_gdp = new Counter2<>(); 117 private static Counter2<String> factbook_population = new Counter2<>(); 118 private static Counter2<String> factbook_literacy = new Counter2<>(); 119 120 private static CountryData other = new CountryData(); 121 122 static class CountryData { 123 private static Counter2<String> population = new Counter2<>(); 124 private static Counter2<String> gdp = new Counter2<>(); 125 private static Counter2<String> literacy = new Counter2<>(); 126 } 127 128 static final Set<String> missing = new TreeSet<String>(); 129 main(String[] args)130 public static void main(String[] args) throws IOException { 131 132 System.out.println( 133 "Code" + "\t" + "Name" + "\t" + "Pop" + "\t" + "GDP-PPP" + "\t" + "UN Literacy"); 134 135 for (String country : StandardCodes.make().getGoodCountries()) { 136 showCountryData(country); 137 } 138 Set<String> outliers = new TreeSet<>(); 139 outliers.addAll(factbook_population.keySet()); 140 outliers.addAll(worldbank_population.keySet()); 141 outliers.addAll(factbook_gdp.keySet()); 142 outliers.addAll(worldbank_gdp.keySet()); 143 outliers.addAll(un_literacy.keySet()); 144 for (Iterator<String> it = outliers.iterator(); it.hasNext(); ) { 145 if (StandardCodes.isCountry(it.next())) { 146 it.remove(); 147 } 148 } 149 // outliers.remove("AN"); 150 if (outliers.size() != 0) { 151 System.out.println("Mistakes: data for non-UN codes"); 152 for (String country : outliers) { 153 showCountryData(country); 154 } 155 throw new IllegalArgumentException("Mistakes: data for non-country codes"); 156 } 157 Set<String> altNames = new TreeSet<>(); 158 String oldCode = ""; 159 for (String display : CountryCodeConverter.names()) { 160 String code = CountryCodeConverter.getCodeFromName(display, true, missing); 161 String icu = ULocale.getDisplayCountry("und-" + code, "en"); 162 if (!display.equalsIgnoreCase(icu)) { 163 altNames.add(code + "\t" + display + "\t" + icu); 164 } 165 } 166 oldCode = ""; 167 if (SHOW_ALTERNATE_NAMES) { 168 for (String altName : altNames) { 169 String[] pieces = altName.split("\t"); 170 String code = pieces[0]; 171 if (code.equals("ZZ")) continue; 172 if (!code.equals(oldCode)) { 173 oldCode = code; 174 System.out.println(); 175 } 176 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]); 177 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + 178 // pieces[1] + 179 // "</territory> <!-- " + pieces[2] + " -->"); 180 } 181 } 182 if (!missing.isEmpty()) { 183 throw new RuntimeException( 184 "Could not load codes for: " 185 + ListFormat.getInstance(Locale.getDefault()).format(missing)); 186 } 187 } 188 showCountryData(String country)189 private static void showCountryData(String country) { 190 number.setMaximumFractionDigits(0); 191 System.out.println( 192 country 193 + "\t" 194 + ULocale.getDisplayCountry("und-" + country, "en") 195 + "\t" 196 + number.format(getPopulation(country)) 197 + "\t" 198 + number.format(getGdp(country)) 199 + "\t" 200 + percent.format(getLiteracy(country) / 100)); 201 } 202 getLiteracy(String country)203 public static Double getLiteracy(String country) { 204 return firstNonZero( 205 factbook_literacy.getCount(country), 206 un_literacy.getCount(country), 207 CountryData.literacy.getCount(country)); 208 } 209 getGdp(String country)210 public static Double getGdp(String country) { 211 return firstNonZero( 212 factbook_gdp.getCount(country), 213 worldbank_gdp.getCount(country), 214 CountryData.gdp.getCount(country)); 215 } 216 getPopulation(String country)217 public static Double getPopulation(String country) { 218 return firstNonZero( 219 factbook_population.getCount(country), 220 worldbank_population.getCount(country), 221 CountryData.population.getCount(country)); 222 } 223 firstNonZero(Double... items)224 private static Double firstNonZero(Double... items) { 225 for (Double item : items) { 226 if (item.doubleValue() != 0) { 227 return item; 228 } 229 } 230 return 0.0; 231 } 232 splitCommaSeparated(String line)233 static String[] splitCommaSeparated(String line) { 234 // items are separated by ',' 235 // each item is of the form abc... 236 // or "..." (required if a comma or quote is contained) 237 // " in a field is represented by "" 238 List<String> result = new ArrayList<>(); 239 StringBuilder item = new StringBuilder(); 240 boolean inQuote = false; 241 for (int i = 0; i < line.length(); ++i) { 242 char ch = line.charAt(i); // don't worry about supplementaries 243 switch (ch) { 244 case '"': 245 inQuote = !inQuote; 246 // at start or end, that's enough 247 // if get a quote when we are not in a quote, and not at start, then add it and 248 // return to inQuote 249 if (inQuote && item.length() != 0) { 250 item.append('"'); 251 inQuote = true; 252 } 253 break; 254 case ',': 255 if (!inQuote) { 256 result.add(item.toString()); 257 item.setLength(0); 258 } else { 259 item.append(ch); 260 } 261 break; 262 default: 263 item.append(ch); 264 break; 265 } 266 } 267 result.add(item.toString()); 268 return result.toArray(new String[result.size()]); 269 } 270 loadFactbookInfo(String filename, final Counter2<String> factbookGdp)271 private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) 272 throws IOException { 273 CldrUtility.handleFile( 274 filename, 275 new LineHandler() { 276 @Override 277 public boolean handle(String line) { 278 if (line.length() == 0 279 || line.startsWith("This tab") 280 || line.startsWith("Rank") 281 || line.startsWith(" This file")) { 282 return false; 283 } 284 String[] pieces = line.split("\\s{2,}"); 285 String code = 286 CountryCodeConverter.getCodeFromName( 287 FBLine.Country.get(pieces), true, missing); 288 if (code == null) { 289 return false; 290 } 291 if (!StandardCodes.isCountry(code)) { 292 if (ADD_POP) { 293 System.out.println("Skipping factbook info for: " + code); 294 } 295 return false; 296 } 297 code = code.toUpperCase(Locale.ENGLISH); 298 String valueString = FBLine.Value.get(pieces).trim(); 299 if (valueString.startsWith("$")) { 300 valueString = valueString.substring(1); 301 } 302 valueString = valueString.replace(",", ""); 303 double value = Double.parseDouble(valueString.trim()); 304 factbookGdp.add(code, value); 305 if (ADD_POP) { 306 System.out.println("Factbook gdp:\t" + code + "\t" + value); 307 } 308 return true; 309 } 310 }); 311 } 312 313 static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US); 314 static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US); 315 static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US); 316 317 static class MyLineHandler implements LineHandler { 318 CountryData countryData; 319 MyLineHandler(CountryData countryData)320 public MyLineHandler(CountryData countryData) { 321 super(); 322 this.countryData = countryData; 323 } 324 325 @Override handle(String line)326 public boolean handle(String line) throws ParseException { 327 if (line.startsWith("#")) return true; 328 if (line.length() == 0) { 329 return true; 330 } 331 String[] pieces = line.split(";"); 332 final String code = pieces[0].trim(); 333 if (code.equals("Code")) { 334 return false; 335 } 336 // Code;Name;Type;Data;Source 337 final String typeString = pieces[2].trim(); 338 final String data = pieces[3].trim(); 339 if (typeString.equals("gdp-ppp")) { 340 if (StandardCodes.isCountry(data)) { 341 Double otherPop = getPopulation(data); 342 Double otherGdp = getGdp(data); 343 Double myPop = getPopulation(code); 344 if (myPop.doubleValue() == 0 345 || otherPop.doubleValue() == 0 346 || otherGdp.doubleValue() == 0) { 347 otherPop = getPopulation(data); 348 otherGdp = getPopulation(data); 349 myPop = getPopulation(code); 350 throw new IllegalArgumentException("Zero population"); 351 } 352 CountryData.gdp.add(code, otherGdp * myPop / otherPop); 353 } else { 354 CountryData.gdp.add(code, dollars.parse(data).doubleValue()); 355 } 356 } else if (typeString.equals("population")) { 357 if (StandardCodes.isCountry(data)) { 358 throw new IllegalArgumentException("Population can't use other country's"); 359 } 360 CountryData.population.add(code, number.parse(data).doubleValue()); 361 } else if (typeString.equals("literacy")) { 362 if (StandardCodes.isCountry(data)) { 363 Double otherPop = getLiteracy(data); 364 CountryData.literacy.add(code, otherPop); 365 } else { 366 CountryData.literacy.add(code, number.parse(data).doubleValue()); 367 } 368 } else { 369 throw new IllegalArgumentException("Illegal type"); 370 } 371 return true; 372 } 373 } 374 375 static final UnicodeSet DIGITS = new UnicodeSet("[:Nd:]").freeze(); 376 loadFactbookLiteracy()377 private static void loadFactbookLiteracy() throws IOException { 378 final String filename = "external/factbook_literacy.txt"; 379 CldrUtility.handleFile( 380 filename, 381 new LineHandler() { 382 @Override 383 public boolean handle(String line) { 384 String[] pieces = line.split("\\t"); 385 String code = 386 CountryCodeConverter.getCodeFromName( 387 FBLiteracy.Country.get(pieces), true, missing); 388 if (code == null) { 389 return false; 390 } 391 if (!StandardCodes.isCountry(code)) { 392 if (ADD_POP) { 393 System.out.println("Skipping factbook literacy for: " + code); 394 } 395 return false; 396 } 397 code = code.toUpperCase(Locale.ENGLISH); 398 String valueString = FBLiteracy.Percent.get(pieces).trim(); 399 double percent = Double.parseDouble(valueString); 400 factbook_literacy.put(code, percent); 401 if (ADD_POP) { 402 System.out.println("Factbook literacy:\t" + code + "\t" + percent); 403 } 404 code = null; 405 return true; 406 } 407 }); 408 } 409 loadWorldBankInfo()410 private static void loadWorldBankInfo() throws IOException { 411 final String filename = "external/world_bank_data.csv"; 412 413 // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename)); 414 415 CldrUtility.handleFile( 416 filename, 417 new LineHandler() { 418 ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null; 419 420 @Override 421 public boolean handle(String line) { 422 String[] pieces = splitCommaSeparated(line); 423 if (columnToTypeAndValue == null) { 424 columnToTypeAndValue = WBLine.parseHeader(pieces); 425 return false; 426 } 427 428 final HashMap<Pair<WBLine, Integer>, String> lineAsHash = new HashMap<>(); 429 for (int i = 0; i < pieces.length; i++) { 430 lineAsHash.put(columnToTypeAndValue.get(i), pieces[i]); 431 } 432 // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\""); 433 final String seriesCode = lineAsHash.get(Pair.of(WBLine.Series_Code, 0)); 434 435 // find the last year 436 String last = null; 437 438 for (int n = 0; n < columnToTypeAndValue.size(); n++) { 439 // assume the years are in ascending order 440 Pair<WBLine, Integer> i = columnToTypeAndValue.get(n); 441 if (i.getFirst() == WBLine.Year) { 442 String current = pieces[n]; 443 if (current.length() != 0 && !current.equals(EMPTY)) { 444 last = current; 445 } 446 } 447 } 448 if (last == null) { 449 return false; 450 } 451 final String countryName = lineAsHash.get(Pair.of(WBLine.Country_Name, 0)); 452 String country = 453 CountryCodeConverter.getCodeFromName(countryName, true, missing); 454 if (country == null) { 455 return false; 456 } 457 if (!StandardCodes.isCountry(country)) { 458 if (ADD_POP) { 459 System.out.println("Skipping worldbank info for: " + country); 460 } 461 return false; 462 } 463 double value; 464 try { 465 value = Double.parseDouble(last); 466 } catch (NumberFormatException e) { 467 throw new IllegalArgumentException( 468 "File changed format: need to modify code"); 469 } 470 if (seriesCode.equals(GCP)) { 471 worldbank_gdp.add(country, value); 472 } else if (seriesCode.equals(POP)) { 473 worldbank_population.add(country, value); 474 } else { 475 throw new IllegalArgumentException(); 476 } 477 return true; 478 } 479 }); 480 } 481 loadUnLiteracy()482 static void loadUnLiteracy() throws IOException { 483 for (final Pair<String, Double> p : getUnLiteracy(null)) { 484 un_literacy.add(p.getFirst(), p.getSecond()); 485 } 486 } 487 488 /** 489 * @param hadErr on return, true if there were errs 490 * @return list of code,percent values 491 * @throws IOException 492 */ getUnLiteracy(Output<Boolean> hadErr)493 static List<Pair<String, Double>> getUnLiteracy(Output<Boolean> hadErr) throws IOException { 494 List<Pair<String, Double>> result = new LinkedList<>(); 495 UnLiteracyParser ulp; 496 try { 497 ulp = new UnLiteracyParser().read(); 498 } catch (Throwable t) { 499 throw new IOException("Could not read UN data " + UnLiteracyParser.UN_LITERACY, t); 500 } 501 502 for (final Map.Entry<String, UnLiteracyParser.PerCountry> e : ulp.perCountry.entrySet()) { 503 final String country = e.getKey(); 504 final String latest = e.getValue().latest(); 505 final UnLiteracyParser.PerYear py = e.getValue().perYear.get(latest); 506 507 Long literate = py.total(UnLiteracyParser.LITERATE); 508 Long illiterate = py.total(UnLiteracyParser.ILLITERATE); 509 510 String code = CountryCodeConverter.getCodeFromName(country, true, missing); 511 if (code == null) { 512 if (hadErr != null) { 513 hadErr.value = true; 514 } 515 continue; 516 } 517 if (!StandardCodes.isCountry(code)) { 518 if (ADD_POP) { 519 System.out.println("Skipping UN info for: " + code); 520 } 521 continue; 522 } 523 double total = literate + illiterate; 524 double percent = ((double) literate) / total; 525 result.add(Pair.of(code, percent)); 526 } 527 if (result.isEmpty()) { 528 hadErr.value = true; 529 } 530 return result; 531 } 532 533 static { 534 try { loadFactbookLiteracy()535 loadFactbookLiteracy(); loadUnLiteracy()536 loadUnLiteracy(); 537 538 loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp); 539 loadFactbookInfo("external/factbook_population.txt", factbook_population); 540 CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other)); 541 loadWorldBankInfo()542 loadWorldBankInfo(); 543 StandardCodes sc = StandardCodes.make(); 544 StringBuilder myErrors = new StringBuilder(); 545 for (String territory : sc.getGoodAvailableCodes("territory")) { 546 if (!StandardCodes.isCountry(territory)) { 547 continue; 548 } 549 double gdp = getGdp(territory); 550 double literacy = getLiteracy(territory); 551 double population = getPopulation(territory); 552 if (population == 0) { 553 // AX;Aland Islands;population;26,200;www.aland.ax 554 myErrors.append( 555 "\n" 556 + territory 557 + ";" 558 + sc.getData("territory", territory) 559 + ";population;0;reason"); 560 } 561 if (gdp == 0) { 562 myErrors.append( 563 "\n" 564 + territory 565 + ";" 566 + sc.getData("territory", territory) 567 + ";gdp-ppp;0;reason"); 568 } 569 if (literacy == 0) { 570 myErrors.append( 571 "\n" 572 + territory 573 + ";" 574 + sc.getData("territory", territory) 575 + ";literacy;0;reason"); 576 } 577 } 578 if (myErrors.length() != 0) { 579 throw new IllegalArgumentException( 580 "Missing Country values, the following and add to external/other_country_data to fix, chaning the 0 to the real value:" 581 + myErrors); 582 } 583 } catch (IOException e) { 584 } 585 } 586 } 587