xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/AddPopulationData.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.ibm.icu.text.ListFormat;
4 import com.ibm.icu.text.NumberFormat;
5 import com.ibm.icu.text.UnicodeSet;
6 import com.ibm.icu.util.Output;
7 import com.ibm.icu.util.ULocale;
8 import java.io.IOException;
9 import java.text.ParseException;
10 import java.util.ArrayList;
11 import java.util.HashMap;
12 import java.util.Iterator;
13 import java.util.LinkedList;
14 import java.util.List;
15 import java.util.Locale;
16 import java.util.Map;
17 import java.util.Set;
18 import java.util.TreeSet;
19 import java.util.regex.Matcher;
20 import java.util.regex.Pattern;
21 import org.unicode.cldr.util.CldrUtility;
22 import org.unicode.cldr.util.CldrUtility.LineHandler;
23 import org.unicode.cldr.util.Counter2;
24 import org.unicode.cldr.util.Pair;
25 import org.unicode.cldr.util.StandardCodes;
26 
27 public class AddPopulationData {
    // When the "ADD_POP" property is set, the loaders below print each value they add and each
    // code they skip. Defaults to false.
    static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false);
    // When the "SHOW_ALTERNATE_NAMES" property is set, main() prints the country names whose
    // spelling differs from ICU's English display name. Defaults to false.
    static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false);
30 
31     enum WBLine {
32         // "Afghanistan","AFG","GNI, PPP (current international
33         // $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..",
34 
35         // Country Name,Country Code,Series Name,Series Code,2000 [YR2000],2001 [YR2001],2002
36         // [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008
37         // [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014
38         // [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020
39         // [YR2020]
40         Country_Name,
41         Country_Code,
42         Series_Name,
43         Series_Code,
44         Year("(\\d+)\\s*\\[YR(\\d+)\\]");
45 
46         final Pattern pattern;
47 
WBLine()48         WBLine() {
49             this.pattern = Pattern.compile(name().replaceAll("_", " "));
50         }
51 
WBLine(final String regex)52         WBLine(final String regex) {
53             this.pattern = Pattern.compile(regex);
54         }
55 
match(String str)56         Matcher match(String str) {
57             // Skip BOM
58             if (str.startsWith("\uFEFF")) {
59                 str = str.substring("\uFEFF".length());
60             }
61             return this.pattern.matcher(str);
62         }
63 
find(final String str)64         static Pair<WBLine, Integer> find(final String str) {
65             for (WBLine i : values()) {
66                 final Matcher m = i.match(str);
67                 if (m.matches()) {
68                     Integer val = 0;
69                     if (m.groupCount() > 0) {
70                         val = Integer.parseInt(m.group(1));
71                     }
72                     return Pair.of(i, val);
73                 }
74             }
75             return null;
76         }
77 
parseHeader(final String[] pieces)78         static ArrayList<Pair<WBLine, Integer>> parseHeader(final String[] pieces) {
79             ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null;
80             columnToTypeAndValue = new ArrayList<>();
81             for (int i = 0; i < pieces.length; i++) {
82                 columnToTypeAndValue.add(i, WBLine.find(pieces[i]));
83             }
84             return columnToTypeAndValue;
85         }
86     }
87 
88     enum FBLine {
89         Rank,
90         Country,
91         Value,
92         Year;
93 
get(String[] pieces)94         String get(String[] pieces) {
95             return pieces[ordinal()];
96         }
97     }
98 
99     enum FBLiteracy {
100         Rank,
101         Country,
102         Percent;
103 
get(String[] pieces)104         String get(String[] pieces) {
105             return pieces[ordinal()];
106         }
107     }
108 
    // World Bank series code whose rows are added to worldbank_gdp (the sample row in WBLine
    // labels this series "GNI, PPP (current international $)").
    private static final String GCP = "NY.GNP.MKTP.PP.CD";
    // World Bank series code whose rows are added to worldbank_population.
    private static final String POP = "SP.POP.TOTL";
    // Cell text the World Bank file uses for a year without a value.
    private static final String EMPTY = "..";
    // Per-country values keyed by country code; filled by loadWorldBankInfo() and
    // loadUnLiteracy().
    private static Counter2<String> worldbank_gdp = new Counter2<>();
    private static Counter2<String> worldbank_population = new Counter2<>();
    private static Counter2<String> un_literacy = new Counter2<>();

    // Per-country values keyed by country code; filled by loadFactbookInfo() and
    // loadFactbookLiteracy().
    private static Counter2<String> factbook_gdp = new Counter2<>();
    private static Counter2<String> factbook_population = new Counter2<>();
    private static Counter2<String> factbook_literacy = new Counter2<>();

    // Handed to MyLineHandler when external/other_country_data.txt is read. Note that the
    // counters inside CountryData are static, so this instance carries no state of its own.
    private static CountryData other = new CountryData();
121 
    /**
     * Fallback per-country values, keyed by country code, that MyLineHandler fills in. The
     * getters consult these only after the Factbook and World Bank / UN counters. All three
     * counters are static, so they are shared by every instance of this class.
     */
    static class CountryData {
        private static Counter2<String> population = new Counter2<>();
        private static Counter2<String> gdp = new Counter2<>();
        private static Counter2<String> literacy = new Counter2<>();
    }
127 
    // Passed to every CountryCodeConverter.getCodeFromName call in this class; presumably it
    // collects the names that could not be mapped to a code (confirm against
    // CountryCodeConverter). main() fails if it is non-empty at the end.
    static final Set<String> missing = new TreeSet<String>();
129 
main(String[] args)130     public static void main(String[] args) throws IOException {
131 
132         System.out.println(
133                 "Code" + "\t" + "Name" + "\t" + "Pop" + "\t" + "GDP-PPP" + "\t" + "UN Literacy");
134 
135         for (String country : StandardCodes.make().getGoodCountries()) {
136             showCountryData(country);
137         }
138         Set<String> outliers = new TreeSet<>();
139         outliers.addAll(factbook_population.keySet());
140         outliers.addAll(worldbank_population.keySet());
141         outliers.addAll(factbook_gdp.keySet());
142         outliers.addAll(worldbank_gdp.keySet());
143         outliers.addAll(un_literacy.keySet());
144         for (Iterator<String> it = outliers.iterator(); it.hasNext(); ) {
145             if (StandardCodes.isCountry(it.next())) {
146                 it.remove();
147             }
148         }
149         // outliers.remove("AN");
150         if (outliers.size() != 0) {
151             System.out.println("Mistakes: data for non-UN codes");
152             for (String country : outliers) {
153                 showCountryData(country);
154             }
155             throw new IllegalArgumentException("Mistakes: data for non-country codes");
156         }
157         Set<String> altNames = new TreeSet<>();
158         String oldCode = "";
159         for (String display : CountryCodeConverter.names()) {
160             String code = CountryCodeConverter.getCodeFromName(display, true, missing);
161             String icu = ULocale.getDisplayCountry("und-" + code, "en");
162             if (!display.equalsIgnoreCase(icu)) {
163                 altNames.add(code + "\t" + display + "\t" + icu);
164             }
165         }
166         oldCode = "";
167         if (SHOW_ALTERNATE_NAMES) {
168             for (String altName : altNames) {
169                 String[] pieces = altName.split("\t");
170                 String code = pieces[0];
171                 if (code.equals("ZZ")) continue;
172                 if (!code.equals(oldCode)) {
173                     oldCode = code;
174                     System.out.println();
175                 }
176                 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]);
177                 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" +
178                 // pieces[1] +
179                 // "</territory> <!-- " + pieces[2] + " -->");
180             }
181         }
182         if (!missing.isEmpty()) {
183             throw new RuntimeException(
184                     "Could not load codes for: "
185                             + ListFormat.getInstance(Locale.getDefault()).format(missing));
186         }
187     }
188 
showCountryData(String country)189     private static void showCountryData(String country) {
190         number.setMaximumFractionDigits(0);
191         System.out.println(
192                 country
193                         + "\t"
194                         + ULocale.getDisplayCountry("und-" + country, "en")
195                         + "\t"
196                         + number.format(getPopulation(country))
197                         + "\t"
198                         + number.format(getGdp(country))
199                         + "\t"
200                         + percent.format(getLiteracy(country) / 100));
201     }
202 
getLiteracy(String country)203     public static Double getLiteracy(String country) {
204         return firstNonZero(
205                 factbook_literacy.getCount(country),
206                 un_literacy.getCount(country),
207                 CountryData.literacy.getCount(country));
208     }
209 
getGdp(String country)210     public static Double getGdp(String country) {
211         return firstNonZero(
212                 factbook_gdp.getCount(country),
213                 worldbank_gdp.getCount(country),
214                 CountryData.gdp.getCount(country));
215     }
216 
getPopulation(String country)217     public static Double getPopulation(String country) {
218         return firstNonZero(
219                 factbook_population.getCount(country),
220                 worldbank_population.getCount(country),
221                 CountryData.population.getCount(country));
222     }
223 
firstNonZero(Double... items)224     private static Double firstNonZero(Double... items) {
225         for (Double item : items) {
226             if (item.doubleValue() != 0) {
227                 return item;
228             }
229         }
230         return 0.0;
231     }
232 
splitCommaSeparated(String line)233     static String[] splitCommaSeparated(String line) {
234         // items are separated by ','
235         // each item is of the form abc...
236         // or "..." (required if a comma or quote is contained)
237         // " in a field is represented by ""
238         List<String> result = new ArrayList<>();
239         StringBuilder item = new StringBuilder();
240         boolean inQuote = false;
241         for (int i = 0; i < line.length(); ++i) {
242             char ch = line.charAt(i); // don't worry about supplementaries
243             switch (ch) {
244                 case '"':
245                     inQuote = !inQuote;
246                     // at start or end, that's enough
247                     // if get a quote when we are not in a quote, and not at start, then add it and
248                     // return to inQuote
249                     if (inQuote && item.length() != 0) {
250                         item.append('"');
251                         inQuote = true;
252                     }
253                     break;
254                 case ',':
255                     if (!inQuote) {
256                         result.add(item.toString());
257                         item.setLength(0);
258                     } else {
259                         item.append(ch);
260                     }
261                     break;
262                 default:
263                     item.append(ch);
264                     break;
265             }
266         }
267         result.add(item.toString());
268         return result.toArray(new String[result.size()]);
269     }
270 
loadFactbookInfo(String filename, final Counter2<String> factbookGdp)271     private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp)
272             throws IOException {
273         CldrUtility.handleFile(
274                 filename,
275                 new LineHandler() {
276                     @Override
277                     public boolean handle(String line) {
278                         if (line.length() == 0
279                                 || line.startsWith("This tab")
280                                 || line.startsWith("Rank")
281                                 || line.startsWith(" This file")) {
282                             return false;
283                         }
284                         String[] pieces = line.split("\\s{2,}");
285                         String code =
286                                 CountryCodeConverter.getCodeFromName(
287                                         FBLine.Country.get(pieces), true, missing);
288                         if (code == null) {
289                             return false;
290                         }
291                         if (!StandardCodes.isCountry(code)) {
292                             if (ADD_POP) {
293                                 System.out.println("Skipping factbook info for: " + code);
294                             }
295                             return false;
296                         }
297                         code = code.toUpperCase(Locale.ENGLISH);
298                         String valueString = FBLine.Value.get(pieces).trim();
299                         if (valueString.startsWith("$")) {
300                             valueString = valueString.substring(1);
301                         }
302                         valueString = valueString.replace(",", "");
303                         double value = Double.parseDouble(valueString.trim());
304                         factbookGdp.add(code, value);
305                         if (ADD_POP) {
306                             System.out.println("Factbook gdp:\t" + code + "\t" + value);
307                         }
308                         return true;
309                     }
310                 });
311     }
312 
    // US-locale formats shared by this class. `dollars` and `number` parse values in
    // MyLineHandler; `number` and `percent` format the table printed by showCountryData(),
    // which also sets number's maximum fraction digits to 0.
    static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US);
    static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US);
    static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US);
316 
317     static class MyLineHandler implements LineHandler {
318         CountryData countryData;
319 
MyLineHandler(CountryData countryData)320         public MyLineHandler(CountryData countryData) {
321             super();
322             this.countryData = countryData;
323         }
324 
325         @Override
handle(String line)326         public boolean handle(String line) throws ParseException {
327             if (line.startsWith("#")) return true;
328             if (line.length() == 0) {
329                 return true;
330             }
331             String[] pieces = line.split(";");
332             final String code = pieces[0].trim();
333             if (code.equals("Code")) {
334                 return false;
335             }
336             // Code;Name;Type;Data;Source
337             final String typeString = pieces[2].trim();
338             final String data = pieces[3].trim();
339             if (typeString.equals("gdp-ppp")) {
340                 if (StandardCodes.isCountry(data)) {
341                     Double otherPop = getPopulation(data);
342                     Double otherGdp = getGdp(data);
343                     Double myPop = getPopulation(code);
344                     if (myPop.doubleValue() == 0
345                             || otherPop.doubleValue() == 0
346                             || otherGdp.doubleValue() == 0) {
347                         otherPop = getPopulation(data);
348                         otherGdp = getPopulation(data);
349                         myPop = getPopulation(code);
350                         throw new IllegalArgumentException("Zero population");
351                     }
352                     CountryData.gdp.add(code, otherGdp * myPop / otherPop);
353                 } else {
354                     CountryData.gdp.add(code, dollars.parse(data).doubleValue());
355                 }
356             } else if (typeString.equals("population")) {
357                 if (StandardCodes.isCountry(data)) {
358                     throw new IllegalArgumentException("Population can't use other country's");
359                 }
360                 CountryData.population.add(code, number.parse(data).doubleValue());
361             } else if (typeString.equals("literacy")) {
362                 if (StandardCodes.isCountry(data)) {
363                     Double otherPop = getLiteracy(data);
364                     CountryData.literacy.add(code, otherPop);
365                 } else {
366                     CountryData.literacy.add(code, number.parse(data).doubleValue());
367                 }
368             } else {
369                 throw new IllegalArgumentException("Illegal type");
370             }
371             return true;
372         }
373     }
374 
    // Frozen set of the characters with General_Category Nd. Not referenced elsewhere in this
    // class.
    static final UnicodeSet DIGITS = new UnicodeSet("[:Nd:]").freeze();
376 
loadFactbookLiteracy()377     private static void loadFactbookLiteracy() throws IOException {
378         final String filename = "external/factbook_literacy.txt";
379         CldrUtility.handleFile(
380                 filename,
381                 new LineHandler() {
382                     @Override
383                     public boolean handle(String line) {
384                         String[] pieces = line.split("\\t");
385                         String code =
386                                 CountryCodeConverter.getCodeFromName(
387                                         FBLiteracy.Country.get(pieces), true, missing);
388                         if (code == null) {
389                             return false;
390                         }
391                         if (!StandardCodes.isCountry(code)) {
392                             if (ADD_POP) {
393                                 System.out.println("Skipping factbook literacy for: " + code);
394                             }
395                             return false;
396                         }
397                         code = code.toUpperCase(Locale.ENGLISH);
398                         String valueString = FBLiteracy.Percent.get(pieces).trim();
399                         double percent = Double.parseDouble(valueString);
400                         factbook_literacy.put(code, percent);
401                         if (ADD_POP) {
402                             System.out.println("Factbook literacy:\t" + code + "\t" + percent);
403                         }
404                         code = null;
405                         return true;
406                     }
407                 });
408     }
409 
loadWorldBankInfo()410     private static void loadWorldBankInfo() throws IOException {
411         final String filename = "external/world_bank_data.csv";
412 
413         // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename));
414 
415         CldrUtility.handleFile(
416                 filename,
417                 new LineHandler() {
418                     ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null;
419 
420                     @Override
421                     public boolean handle(String line) {
422                         String[] pieces = splitCommaSeparated(line);
423                         if (columnToTypeAndValue == null) {
424                             columnToTypeAndValue = WBLine.parseHeader(pieces);
425                             return false;
426                         }
427 
428                         final HashMap<Pair<WBLine, Integer>, String> lineAsHash = new HashMap<>();
429                         for (int i = 0; i < pieces.length; i++) {
430                             lineAsHash.put(columnToTypeAndValue.get(i), pieces[i]);
431                         }
432                         // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\"");
433                         final String seriesCode = lineAsHash.get(Pair.of(WBLine.Series_Code, 0));
434 
435                         // find the last year
436                         String last = null;
437 
438                         for (int n = 0; n < columnToTypeAndValue.size(); n++) {
439                             // assume the years are in ascending order
440                             Pair<WBLine, Integer> i = columnToTypeAndValue.get(n);
441                             if (i.getFirst() == WBLine.Year) {
442                                 String current = pieces[n];
443                                 if (current.length() != 0 && !current.equals(EMPTY)) {
444                                     last = current;
445                                 }
446                             }
447                         }
448                         if (last == null) {
449                             return false;
450                         }
451                         final String countryName = lineAsHash.get(Pair.of(WBLine.Country_Name, 0));
452                         String country =
453                                 CountryCodeConverter.getCodeFromName(countryName, true, missing);
454                         if (country == null) {
455                             return false;
456                         }
457                         if (!StandardCodes.isCountry(country)) {
458                             if (ADD_POP) {
459                                 System.out.println("Skipping worldbank info for: " + country);
460                             }
461                             return false;
462                         }
463                         double value;
464                         try {
465                             value = Double.parseDouble(last);
466                         } catch (NumberFormatException e) {
467                             throw new IllegalArgumentException(
468                                     "File changed format: need to modify code");
469                         }
470                         if (seriesCode.equals(GCP)) {
471                             worldbank_gdp.add(country, value);
472                         } else if (seriesCode.equals(POP)) {
473                             worldbank_population.add(country, value);
474                         } else {
475                             throw new IllegalArgumentException();
476                         }
477                         return true;
478                     }
479                 });
480     }
481 
loadUnLiteracy()482     static void loadUnLiteracy() throws IOException {
483         for (final Pair<String, Double> p : getUnLiteracy(null)) {
484             un_literacy.add(p.getFirst(), p.getSecond());
485         }
486     }
487 
488     /**
489      * @param hadErr on return, true if there were errs
490      * @return list of code,percent values
491      * @throws IOException
492      */
getUnLiteracy(Output<Boolean> hadErr)493     static List<Pair<String, Double>> getUnLiteracy(Output<Boolean> hadErr) throws IOException {
494         List<Pair<String, Double>> result = new LinkedList<>();
495         UnLiteracyParser ulp;
496         try {
497             ulp = new UnLiteracyParser().read();
498         } catch (Throwable t) {
499             throw new IOException("Could not read UN data " + UnLiteracyParser.UN_LITERACY, t);
500         }
501 
502         for (final Map.Entry<String, UnLiteracyParser.PerCountry> e : ulp.perCountry.entrySet()) {
503             final String country = e.getKey();
504             final String latest = e.getValue().latest();
505             final UnLiteracyParser.PerYear py = e.getValue().perYear.get(latest);
506 
507             Long literate = py.total(UnLiteracyParser.LITERATE);
508             Long illiterate = py.total(UnLiteracyParser.ILLITERATE);
509 
510             String code = CountryCodeConverter.getCodeFromName(country, true, missing);
511             if (code == null) {
512                 if (hadErr != null) {
513                     hadErr.value = true;
514                 }
515                 continue;
516             }
517             if (!StandardCodes.isCountry(code)) {
518                 if (ADD_POP) {
519                     System.out.println("Skipping UN info for: " + code);
520                 }
521                 continue;
522             }
523             double total = literate + illiterate;
524             double percent = ((double) literate) / total;
525             result.add(Pair.of(code, percent));
526         }
527         if (result.isEmpty()) {
528             hadErr.value = true;
529         }
530         return result;
531     }
532 
533     static {
534         try {
loadFactbookLiteracy()535             loadFactbookLiteracy();
loadUnLiteracy()536             loadUnLiteracy();
537 
538             loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp);
539             loadFactbookInfo("external/factbook_population.txt", factbook_population);
540             CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other));
541 
loadWorldBankInfo()542             loadWorldBankInfo();
543             StandardCodes sc = StandardCodes.make();
544             StringBuilder myErrors = new StringBuilder();
545             for (String territory : sc.getGoodAvailableCodes("territory")) {
546                 if (!StandardCodes.isCountry(territory)) {
547                     continue;
548                 }
549                 double gdp = getGdp(territory);
550                 double literacy = getLiteracy(territory);
551                 double population = getPopulation(territory);
552                 if (population == 0) {
553                     // AX;Aland Islands;population;26,200;www.aland.ax
554                     myErrors.append(
555                             "\n"
556                                     + territory
557                                     + ";"
558                                     + sc.getData("territory", territory)
559                                     + ";population;0;reason");
560                 }
561                 if (gdp == 0) {
562                     myErrors.append(
563                             "\n"
564                                     + territory
565                                     + ";"
566                                     + sc.getData("territory", territory)
567                                     + ";gdp-ppp;0;reason");
568                 }
569                 if (literacy == 0) {
570                     myErrors.append(
571                             "\n"
572                                     + territory
573                                     + ";"
574                                     + sc.getData("territory", territory)
575                                     + ";literacy;0;reason");
576                 }
577             }
578             if (myErrors.length() != 0) {
579                 throw new IllegalArgumentException(
580                         "Missing Country values, the following and add to external/other_country_data to fix, chaning the 0 to the real value:"
581                                 + myErrors);
582             }
583         } catch (IOException e) {
584         }
585     }
586 }
587